Use JSON instead of struct to store metadata

This commit is contained in:
Daoud Clarke 2022-02-18 22:22:47 +00:00
parent e6273c7f76
commit 326f7e3d7f
2 changed files with 13 additions and 22 deletions

View file

@@ -1,15 +1,12 @@
import json
import os
from dataclasses import astuple, dataclass
from dataclasses import astuple, dataclass, asdict
from io import UnsupportedOperation
from mmap import mmap, PROT_READ, PROT_WRITE
from pathlib import Path
from struct import pack, unpack, calcsize
from typing import TypeVar, Generic, Callable, List
import mmh3
from zstandard import ZstdDecompressor, ZstdCompressor, ZstdError
from zstandard import ZstdDecompressor, ZstdCompressor
VERSION = 1
METADATA_CONSTANT = 'mwmbl-tiny-search'.encode('utf8')
@@ -43,11 +40,9 @@ class TinyIndexMetadata:
item_factory: str
def to_bytes(self) -> bytes:
result = METADATA_CONSTANT + pack(
METADATA_FORMAT, self.version, self.page_size, self.num_pages, self.item_factory.encode('utf8')
)
assert len(result) <= METADATA_SIZE
return result
metadata_bytes = METADATA_CONSTANT + json.dumps(asdict(self)).encode('utf8')
assert len(metadata_bytes) <= METADATA_SIZE
return metadata_bytes
@staticmethod
def from_bytes(data: bytes):
@@ -56,9 +51,8 @@ class TinyIndexMetadata:
if metadata_constant != METADATA_CONSTANT:
raise ValueError("This doesn't seem to be an index file")
actual_metadata_size = calcsize(METADATA_FORMAT)
values = unpack(METADATA_FORMAT, data[constant_length:constant_length+actual_metadata_size])
return TinyIndexMetadata(values[0], values[1], values[2], values[3].decode('utf8'))
values = json.loads(data[constant_length:].decode('utf8'))
return TinyIndexMetadata(**values)
def _get_page_data(compressor, page_size, data):
@@ -84,7 +78,8 @@ class TinyIndex(Generic[T]):
with open(index_path, 'rb') as index_file:
metadata_page = index_file.read(METADATA_SIZE)
metadata = TinyIndexMetadata.from_bytes(metadata_page)
metadata_bytes = metadata_page.rstrip(b'\x00')
metadata = TinyIndexMetadata.from_bytes(metadata_bytes)
if metadata.item_factory != item_factory.__name__:
raise ValueError(f"Metadata item factory '{metadata.item_factory}' in the index "
f"does not match the passed item factory: '{item_factory.__name__}'")
@@ -129,7 +124,6 @@ class TinyIndex(Generic[T]):
return converted
def index(self, key: str, value: T):
# print("Index", value)
assert type(value) == self.item_factory, f"Can only index the specified type" \
f" ({self.item_factory.__name__})"
page_index = self._get_key_page_index(key)
@@ -137,10 +131,8 @@ class TinyIndex(Generic[T]):
if current_page is None:
current_page = []
value_tuple = astuple(value)
# print("Value tuple", value_tuple)
current_page.append(value_tuple)
try:
# print("Page", current_page)
self._write_page(current_page, page_index)
except ValueError:
pass

View file

@@ -10,8 +10,7 @@ def test_create_index():
with TemporaryDirectory() as temp_dir:
index_path = Path(temp_dir) / 'temp-index.tinysearch'
indexer = TinyIndex.create(Document, str(index_path), num_pages=num_pages, page_size=page_size)
for i in range(num_pages):
page = indexer.get_page(i)
assert page == []
with TinyIndex.create(Document, str(index_path), num_pages=num_pages, page_size=page_size) as indexer:
for i in range(num_pages):
page = indexer.get_page(i)
assert page == []