Start background processes

Daoud Clarke 2023-10-08 21:20:32 +01:00
parent b6fd27352b
commit a1d6fd8bb1
3 changed files with 24 additions and 99 deletions

app/apps.py (new file, +22)

@@ -0,0 +1,22 @@
import os
from multiprocessing import Process, Queue

from django.apps import AppConfig

from app import settings
from app.api import queued_batches
from mwmbl import background
from mwmbl.indexer.update_urls import update_urls_continuously
from mwmbl.url_queue import update_queue_continuously


class MwmblConfig(AppConfig):
    name = "app"
    verbose_name = "Mwmbl Application"

    def ready(self):
        if os.environ.get('RUN_MAIN') and settings.RUN_BACKGROUND_PROCESSES:
            new_item_queue = Queue()
            Process(target=background.run, args=(settings.DATA_PATH,)).start()
            Process(target=update_queue_continuously, args=(new_item_queue, queued_batches,)).start()
            Process(target=update_urls_continuously, args=(settings.DATA_PATH, new_item_queue)).start()
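
An aside, not part of the diff: Django's runserver autoreloader runs a watcher process plus a child process that actually serves requests, and only the child has RUN_MAIN set to "true" in its environment. The guard in ready() therefore spawns the workers in the serving process only, not in the watcher; when the autoreloader is off (runserver --noreload, or a production WSGI/ASGI server), RUN_MAIN is absent and this check evaluates false. A minimal sketch of the same guard:

import os

def should_start_workers(run_background_processes: bool) -> bool:
    # RUN_MAIN is set (to "true") only by Django's autoreloader in the
    # serving child process; the watcher process and non-runserver
    # deployments do not set it.
    return bool(os.environ.get("RUN_MAIN")) and run_background_processes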


@@ -37,6 +37,7 @@ INSTALLED_APPS = [
    'django.contrib.sessions',
    'django.contrib.messages',
    'django.contrib.staticfiles',
    'app',
]

MIDDLEWARE = [
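
An aside, not part of the diff: with 'app' listed in INSTALLED_APPS, Django (3.2+) automatically picks up the single AppConfig subclass defined in app/apps.py, so MwmblConfig.ready() runs when the app registry is populated at startup. A quick check of which config got registered, assuming DJANGO_SETTINGS_MODULE points at these settings:

import django
from django.apps import apps

django.setup()  # requires DJANGO_SETTINGS_MODULE to be set
print(type(apps.get_app_config("app")).__name__)  # expected: MwmblConfig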
@@ -125,5 +126,4 @@ DEFAULT_AUTO_FIELD = 'django.db.models.BigAutoField'
# ===================== Custom Settings =========================
DATA_PATH = "./devdata"
RUN_BACKGROUND_PROCESSES = True
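
These are development defaults. A sketch, not part of the commit, of how they could be overridden per environment; the MWMBL_* environment variable names here are hypothetical:

import os

# Hypothetical environment overrides for the settings above.
DATA_PATH = os.environ.get("MWMBL_DATA_PATH", "./devdata")
RUN_BACKGROUND_PROCESSES = os.environ.get("MWMBL_RUN_BACKGROUND_PROCESSES", "true").lower() in ("1", "true", "yes")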


@@ -1,97 +0,0 @@
import argparse
import logging
import sys
from multiprocessing import Process, Queue
from pathlib import Path

import uvicorn
from fastapi import FastAPI
from starlette.middleware.cors import CORSMiddleware

from mwmbl import background
from mwmbl.crawler import app as crawler
from mwmbl.indexer.batch_cache import BatchCache
from mwmbl.indexer.paths import INDEX_NAME, BATCH_DIR_NAME
from mwmbl.platform import user
from mwmbl.indexer.update_urls import update_urls_continuously
from mwmbl.tinysearchengine import search
from mwmbl.tinysearchengine.completer import Completer
from mwmbl.tinysearchengine.indexer import TinyIndex, Document, PAGE_SIZE
from mwmbl.tinysearchengine.rank import HeuristicRanker
from mwmbl.url_queue import update_queue_continuously

FORMAT = '%(levelname)s %(name)s %(asctime)s %(message)s'
logging.basicConfig(stream=sys.stdout, level=logging.INFO, format=FORMAT)

MODEL_PATH = Path(__file__).parent / 'resources' / 'model.pickle'


def setup_args():
    parser = argparse.ArgumentParser(description="Mwmbl API server and background task processor")
    parser.add_argument("--num-pages", type=int, help="Number of pages of memory (4096 bytes) to use for the index", default=2560)
    parser.add_argument("--data", help="Path to the data folder for storing index and cached batches", default="./devdata")
    parser.add_argument("--port", type=int, help="Port for the server to listen at", default=5000)
    parser.add_argument("--background", help="Enable running the background tasks to process batches",
                        action='store_true')
    args = parser.parse_args()
    return args


def run():
    args = setup_args()

    index_path = Path(args.data) / INDEX_NAME
    try:
        existing_index = TinyIndex(item_factory=Document, index_path=index_path)
        if existing_index.page_size != PAGE_SIZE or existing_index.num_pages != args.num_pages:
            raise ValueError(f"Existing index page sizes ({existing_index.page_size}) or number of pages "
                             f"({existing_index.num_pages}) do not match")
    except FileNotFoundError:
        print("Creating a new index")
        TinyIndex.create(item_factory=Document, index_path=index_path, num_pages=args.num_pages, page_size=PAGE_SIZE)

    new_item_queue = Queue()
    queued_batches = Queue()
    # curation_queue = Queue()

    if args.background:
        Process(target=background.run, args=(args.data,)).start()
        Process(target=update_queue_continuously, args=(new_item_queue, queued_batches,)).start()
        Process(target=update_urls_continuously, args=(args.data, new_item_queue)).start()

    completer = Completer()

    with TinyIndex(item_factory=Document, index_path=index_path) as tiny_index:
        ranker = HeuristicRanker(tiny_index, completer)
        # model = pickle.load(open(MODEL_PATH, 'rb'))
        # ranker = LTRRanker(model, tiny_index, completer)

        # Initialize FastApi instance
        app = FastAPI()

        # Try disabling since this is handled by nginx
        # app.add_middleware(
        #     CORSMiddleware,
        #     allow_origins=["*"],
        #     allow_credentials=True,
        #     allow_methods=["*"],
        #     allow_headers=["*"],
        # )

        search_router = search.create_router(ranker)
        app.include_router(search_router)

        batch_cache = BatchCache(Path(args.data) / BATCH_DIR_NAME)
        crawler_router = crawler.create_router(batch_cache, queued_batches)
        app.include_router(crawler_router)

        user_router = user.create_router(index_path)
        app.include_router(user_router)

        # Initialize uvicorn server using global app instance and server config params
        uvicorn.run(app, host="0.0.0.0", port=args.port)


if __name__ == "__main__":
    run()
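
With this standalone FastAPI entry point deleted, its CLI flags map onto the new settings: --data becomes DATA_PATH and --background becomes RUN_BACKGROUND_PROCESSES, while index creation and the search/crawler/user routers are no longer wired up here. A sketch of a conventional Django launcher that would exercise the new AppConfig; the settings module name app.settings is inferred from the imports above, not stated in the diff:

#!/usr/bin/env python
# Generic manage.py-style launcher (a sketch, not part of this commit).
import os
import sys

from django.core.management import execute_from_command_line


def main():
    # "app.settings" is an assumption based on `from app import settings`.
    os.environ.setdefault("DJANGO_SETTINGS_MODULE", "app.settings")
    execute_from_command_line(sys.argv)


if __name__ == "__main__":
    main()

Saved as manage.py, running "python manage.py runserver 0.0.0.0:5000" would serve on the port the old --port flag defaulted to, with MwmblConfig.ready() starting the background workers.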