mwmbl/mwmbl/indexer/index_batches.py

"""
Index batches that are stored locally.
"""
from collections import defaultdict
from logging import getLogger
from typing import Collection, Iterable

import spacy
from spacy import Language

from mwmbl.crawler.batch import HashedBatch, Item
from mwmbl.crawler.urls import URLDatabase, URLStatus
from mwmbl.database import Database
from mwmbl.indexer import process_batch
from mwmbl.indexer.batch_cache import BatchCache
from mwmbl.indexer.index import tokenize_document
from mwmbl.indexer.indexdb import BatchStatus
from mwmbl.tinysearchengine.indexer import Document, TinyIndex
from mwmbl.utils import add_term_info, add_term_infos

logger = getLogger(__name__)


def get_documents_from_batches(batches: Collection[HashedBatch]) -> Iterable[tuple[str, str, str]]:
    """Yield (title, url, extract) tuples for every crawled item that has content."""
    for batch in batches:
        for item in batch.items:
            if item.content is not None:
                yield item.content.title, item.url, item.content.extract


def run(batch_cache: BatchCache, index_path: str):
    """Index all locally cached batches whose URLs have been updated, then mark them as indexed."""

    def process(batches: Collection[HashedBatch]):
        with Database() as db:
            url_db = URLDatabase(db.connection)
            index_batches(batches, index_path, url_db)
        logger.info("Indexed pages")

    process_batch.run(batch_cache, BatchStatus.URLS_UPDATED, BatchStatus.INDEXED, process)
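

# Note (an inference from the call above, not taken from process_batch itself):
# process_batch.run is expected to load cached batches currently in the
# URLS_UPDATED state, hand them to `process`, and then mark them as INDEXED so
# they are not picked up again on the next pass.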


def index_batches(batch_data: Collection[HashedBatch], index_path: str, url_db: URLDatabase):
    """Convert the batches to scored documents and add them to the index."""
    document_tuples = list(get_documents_from_batches(batch_data))
    urls = [url for title, url, extract in document_tuples]
    url_scores = url_db.get_url_scores(urls)
    logger.info(f"Indexing {len(urls)} document tuples and {len(url_scores)} URL scores")
    documents = [Document(title, url, extract, url_scores.get(url, 1.0)) for title, url, extract in document_tuples]
    page_documents = preprocess_documents(documents, index_path)
    index_pages(index_path, page_documents)


def index_pages(index_path, page_documents):
    """Merge the new documents into each index page, deduplicating by URL and title."""
    with TinyIndex(Document, index_path, 'w') as indexer:
        for page, documents in page_documents.items():
            new_documents = []
            existing_documents = indexer.get_page(page)
            seen_urls = set()
            seen_titles = set()
            sorted_documents = sorted(documents + existing_documents, key=lambda x: x.score, reverse=True)
            # TODO: for now we add the term here, until all the documents in the index have terms
            sorted_documents_with_terms = add_term_infos(sorted_documents, indexer, page)
            for document in sorted_documents_with_terms:
                if document.title in seen_titles or document.url in seen_urls:
                    continue
                new_documents.append(document)
                seen_urls.add(document.url)
                seen_titles.add(document.title)
            logger.info(f"Storing {len(new_documents)} documents for page {page}, originally {len(existing_documents)}")
            indexer.store_in_page(page, new_documents)


def preprocess_documents(documents, index_path):
    """Group the documents by the index page of each of their terms."""
    page_documents = defaultdict(list)
    with TinyIndex(Document, index_path, 'w') as indexer:
        for document in documents:
            tokenized = tokenize_document(document.url, document.title, document.extract, document.score)
            for token in tokenized.tokens:
                page = indexer.get_key_page_index(token)
                term_document = Document(document.title, document.url, document.extract, document.score, token)
                page_documents[page].append(term_document)
    logger.info(f"Preprocessed for {len(page_documents)} pages")
    return page_documents


def get_url_error_status(item: Item):
    """Map a failed crawl item to the corresponding URLStatus error value."""
    if item.status == 404:
        return URLStatus.ERROR_404
    if item.error is not None:
        if item.error.name == 'AbortError':
            return URLStatus.ERROR_TIMEOUT
        elif item.error.name == 'RobotsDenied':
            return URLStatus.ERROR_ROBOTS_DENIED
    return URLStatus.ERROR_OTHER
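

# --- Illustrative driver (added sketch, not part of the upstream module) ---
# A minimal way to exercise `run` by hand. It assumes BatchCache can be
# constructed from a local cache directory and that a database and TinyIndex
# file are already set up; both paths below are hypothetical placeholders.
if __name__ == "__main__":
    batch_cache = BatchCache("/path/to/batch-cache")  # hypothetical path
    run(batch_cache, "/path/to/index.tinysearch")  # hypothetical path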