mwmbl/analyse/add_term_info.py
Daoud Clarke a2b872008f Add a script to evaluate how much it costs to add the term to the index
Old sizes mean 33.3673 0.08148019988498635
New sizes mean 32.1322 0.07700185221489449
2023-11-16 17:42:18 +00:00

61 lines
1.9 KiB
Python

"""
Investigate adding term information to the database.
How much extra space will it take?
"""
import os
from pathlib import Path
from random import Random
import numpy as np
from scipy.stats import sem
from mwmbl.indexer.index import tokenize_document
from mwmbl.tinysearchengine.indexer import TinyIndex, Document, _trim_items_to_page, astuple
from zstandard import ZstdCompressor
random = Random(1)
INDEX_PATH = Path(os.environ["HOME"]) / "Downloads" / "index-v2.tinysearch"
# INDEX_PATH = Path(__file__).parent.parent / "devdata" / "index-v2.tinysearch"
def add_term_info(document: Document, index: TinyIndex, page_index: int):
tokenized = tokenize_document(document.url, document.title, document.extract, document.score)
for token in tokenized.tokens:
token_page_index = index.get_key_page_index(token)
if token_page_index == page_index:
return Document(document.title, document.url, document.extract, document.score, token)
raise ValueError("Could not find token in page index")
def run():
compressor = ZstdCompressor()
with TinyIndex(Document, INDEX_PATH) as index:
# Get some random integers between 0 and index.num_pages:
pages = random.sample(range(index.num_pages), 10000)
old_sizes = []
new_sizes = []
for i in pages:
page = index.get_page(i)
term_documents = []
for document in page:
term_document = add_term_info(document, index, i)
term_documents.append(term_document)
value_tuples = [astuple(value) for value in term_documents]
num_fitting, compressed = _trim_items_to_page(compressor, index.page_size, value_tuples)
new_sizes.append(num_fitting)
old_sizes.append(len(page))
print("Old sizes mean", np.mean(old_sizes), sem(old_sizes))
print("New sizes mean", np.mean(new_sizes), sem(new_sizes))
if __name__ == '__main__':
run()