2022-06-27 22:44:25 +00:00
|
|
|
"""
|
|
|
|
Script that updates data in a background process.
|
|
|
|
"""
|
2023-11-18 18:49:41 +00:00
|
|
|
import logging
|
|
|
|
import sys
|
|
|
|
from logging import getLogger, basicConfig
|
2022-07-19 20:18:43 +00:00
|
|
|
from pathlib import Path
|
2022-07-03 08:44:51 +00:00
|
|
|
from time import sleep
|
2022-06-30 19:00:38 +00:00
|
|
|
|
2022-12-31 13:32:15 +00:00
|
|
|
from mwmbl.crawler.urls import URLDatabase
|
|
|
|
from mwmbl.database import Database
|
2023-01-22 20:28:18 +00:00
|
|
|
from mwmbl.indexer import index_batches, historical
|
2022-07-20 21:21:35 +00:00
|
|
|
from mwmbl.indexer.batch_cache import BatchCache
|
2022-07-30 10:08:15 +00:00
|
|
|
from mwmbl.indexer.paths import BATCH_DIR_NAME, INDEX_NAME
|
2022-06-27 22:44:25 +00:00
|
|
|
|
2023-11-18 18:49:41 +00:00
|
|
|
|
|
|
|
# Send log records of INFO level and above to standard output.
basicConfig(level=logging.INFO, stream=sys.stdout)

# Module-level logger used by the background process.
logger = getLogger(__name__)
|
|
|
|
|
2022-06-27 22:44:25 +00:00
|
|
|
|
2023-01-22 20:28:18 +00:00
|
|
|
def run(data_path: str):
    """
    Run the background update loop. This function does not return.

    Creates the URL tables, runs the historical step once, and then
    repeatedly retrieves batches into the batch cache and indexes them,
    sleeping ten seconds after each pass.

    :param data_path: base path under which INDEX_NAME and BATCH_DIR_NAME
        are resolved.
    """
    logger.info("Started background process")

    # The database connection is only used to create the URL tables.
    with Database() as database:
        URLDatabase(database.connection).create_tables()

    historical.run()

    data_dir = Path(data_path)
    index_file = data_dir / INDEX_NAME
    cache = BatchCache(data_dir / BATCH_DIR_NAME)

    while True:
        # Each stage logs its own failure instead of raising, so an error in
        # one stage neither skips the other stage nor stops the loop.
        try:
            cache.retrieve_batches(num_batches=10000)
        except Exception:
            logger.exception("Error retrieving batches")

        try:
            index_batches.run(cache, index_file)
        except Exception:
            logger.exception("Error indexing batches")

        sleep(10)
|