Add a script to evaluate how much it costs to add the term to the index

Old sizes mean 33.3673 (standard error 0.08148019988498635)
New sizes mean 32.1322 (standard error 0.07700185221489449)
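
In other words, attaching the matching term to each stored document reduces the mean number of documents that fit on a page from about 33.37 to about 32.13, i.e. roughly 3.7% fewer documents per page.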
Daoud Clarke 2023-11-16 17:42:18 +00:00
parent 8790d758a3
commit a2b872008f
4 changed files with 66 additions and 3 deletions

analyse/add_term_info.py (new file, 60 lines)

@@ -0,0 +1,60 @@
"""
Investigate adding term information to the database.
How much extra space will it take?
"""
import os
from pathlib import Path
from random import Random
import numpy as np
from scipy.stats import sem
from mwmbl.indexer.index import tokenize_document
from mwmbl.tinysearchengine.indexer import TinyIndex, Document, _trim_items_to_page, astuple
from zstandard import ZstdCompressor
random = Random(1)
INDEX_PATH = Path(os.environ["HOME"]) / "Downloads" / "index-v2.tinysearch"
# INDEX_PATH = Path(__file__).parent.parent / "devdata" / "index-v2.tinysearch"
def add_term_info(document: Document, index: TinyIndex, page_index: int):
tokenized = tokenize_document(document.url, document.title, document.extract, document.score)
for token in tokenized.tokens:
token_page_index = index.get_key_page_index(token)
if token_page_index == page_index:
return Document(document.title, document.url, document.extract, document.score, token)
raise ValueError("Could not find token in page index")
def run():
compressor = ZstdCompressor()
with TinyIndex(Document, INDEX_PATH) as index:
# Get some random integers between 0 and index.num_pages:
pages = random.sample(range(index.num_pages), 10000)
old_sizes = []
new_sizes = []
for i in pages:
page = index.get_page(i)
term_documents = []
for document in page:
term_document = add_term_info(document, index, i)
term_documents.append(term_document)
value_tuples = [astuple(value) for value in term_documents]
num_fitting, compressed = _trim_items_to_page(compressor, index.page_size, value_tuples)
new_sizes.append(num_fitting)
old_sizes.append(len(page))
print("Old sizes mean", np.mean(old_sizes), sem(old_sizes))
print("New sizes mean", np.mean(new_sizes), sem(new_sizes))
if __name__ == '__main__':
run()
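
Running the script directly (python analyse/add_term_info.py, with the mwmbl package importable and an index downloaded to ~/Downloads/index-v2.tinysearch, or the devdata index if the alternative INDEX_PATH is uncommented) prints the two lines quoted in the commit message: the mean number of documents per page before and after attaching the term, each followed by its standard error.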

mwmbl/indexer/index.py

@@ -49,7 +49,7 @@ def prepare_url_for_tokenizing(url: str):
 def get_pages(nlp, titles_urls_and_extracts, link_counts) -> Iterable[TokenizedDocument]:
     for i, (title_cleaned, url, extract) in enumerate(titles_urls_and_extracts):
         score = link_counts.get(url, DEFAULT_SCORE)
-        yield tokenize_document(url, title_cleaned, extract, score, nlp)
+        yield tokenize_document(url, title_cleaned, extract, score)

         if i % 1000 == 0:
             print("Processed", i)

@@ -61,7 +61,7 @@ def get_index_tokens(tokens):
     return set(first_tokens + bigrams)


-def tokenize_document(url, title_cleaned, extract, score, nlp):
+def tokenize_document(url, title_cleaned, extract, score):
     title_tokens = tokenize(title_cleaned)
     prepared_url = prepare_url_for_tokenizing(unquote(url))
     url_tokens = tokenize(prepared_url)


@@ -71,7 +71,7 @@ def preprocess_documents(documents, index_path, nlp):
     page_documents = defaultdict(list)
     with TinyIndex(Document, index_path, 'w') as indexer:
         for document in documents:
-            tokenized = tokenize_document(document.url, document.title, document.extract, document.score, nlp)
+            tokenized = tokenize_document(document.url, document.title, document.extract, document.score)
             # logger.debug(f"Tokenized: {tokenized}")
             page_indexes = [indexer.get_key_page_index(token) for token in tokenized.tokens]
             for page in page_indexes:

mwmbl/tinysearchengine/indexer.py

@@ -79,6 +79,7 @@ class TinyIndexMetadata:
         values = json.loads(data[constant_length:].decode('utf8'))
         return TinyIndexMetadata(**values)


 # Find the optimal amount of data that fits onto a page
 # We do this by leveraging binary search to quickly find the index where:
 # - index+1 cannot fit onto a page

@@ -106,10 +107,12 @@ def _binary_search_fitting_size(compressor: ZstdCompressor, page_size: int, item
         # No better match, use our index
         return mid, compressed_data


 def _trim_items_to_page(compressor: ZstdCompressor, page_size: int, items:list[T]):
     # Find max number of items that fit on a page
     return _binary_search_fitting_size(compressor, page_size, items, 0, len(items))


 def _get_page_data(compressor: ZstdCompressor, page_size: int, items: list[T]):
     num_fitting, serialised_data = _trim_items_to_page(compressor, page_size, items)
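
The comment above summarises the binary-search approach used to pack as many items as possible into a fixed-size compressed page. A minimal standalone sketch of the same idea, assuming JSON serialisation and a hypothetical max_items_fitting helper (illustrative only, not the project's exact implementation):

import json

from zstandard import ZstdCompressor


def max_items_fitting(compressor: ZstdCompressor, page_size: int, items: list) -> tuple[int, bytes]:
    # Binary search for the largest n such that the compressed serialisation
    # of items[:n] fits in page_size bytes. Assumes compressed size grows
    # roughly monotonically with the number of items.
    low, high = 0, len(items)
    best_n = 0
    best_data = compressor.compress(json.dumps([]).encode("utf8"))
    while low <= high:
        mid = (low + high) // 2
        data = compressor.compress(json.dumps(items[:mid]).encode("utf8"))
        if len(data) <= page_size:
            best_n, best_data = mid, data  # this prefix fits; try a larger one
            low = mid + 1
        else:
            high = mid - 1  # too big; shrink the prefix
    return best_n, best_data


if __name__ == "__main__":
    items = [[f"https://example.com/{i}", f"title {i}", 1.0] for i in range(1000)]
    n, data = max_items_fitting(ZstdCompressor(), 4096, items)
    print(n, "items fit in", len(data), "compressed bytes")

The search converges on the index described in the comment: n items fit on the page but n + 1 do not (unless the whole list already fits).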