Merge pull request #115 from mwmbl/django-rewrite

Django rewrite
2023-10-10 16:25:36 +01:00 · 2023-10-10 16:25:36 +01:00 · 213bdaa365
parent d716cb347f 918eaa8709
commit 213bdaa365
35 changed files with 346 additions and 167 deletions
--- a/7
+++ b/7
@ -46,5 +46,8 @@ VOLUME ["/data"]

 EXPOSE 5000

-# Using the mwmbl-tinysearchengine binary/entrypoint which comes packaged with mwmbl
-CMD ["/venv/bin/mwmbl-tinysearchengine", "--num-pages", "10240000", "--background", "--data", "/app/storage"]
+ENV DJANGO_SETTINGS_MODULE=mwmbl.settings_dev
+
+# WORKDIR "/venv/lib/python3.10/site-packages/mwmbl/"
+# CMD ["/venv/bin/python", "-m", "uvicorn", "app.asgi:application"]
+CMD ["/venv/bin/mwmbl-tinysearchengine"]
--- a/analyse/analyse_crawled_domains.py
+++ b/analyse/analyse_crawled_domains.py
@ -7,8 +7,8 @@ import json
 from collections import defaultdict, Counter
 from urllib.parse import urlparse

-from mwmbl.crawler.batch import HashedBatch
-from mwmbl.indexer.paths import CRAWL_GLOB, MWMBL_DATA_DIR
+from mwmbl.crawler import HashedBatch
+from mwmbl.indexer import CRAWL_GLOB, MWMBL_DATA_DIR


 # TODO: remove this line - temporary override
--- a/analyse/export_top_domains.py
+++ b/analyse/export_top_domains.py
@ -1,6 +1,6 @@
 import json

-from mwmbl.indexer.paths import TOP_DOMAINS_JSON_PATH
+from mwmbl.indexer import TOP_DOMAINS_JSON_PATH
 from mwmbl.hn_top_domains_filtered import DOMAINS


--- a/analyse/export_urls.py
+++ b/analyse/export_urls.py
@ -3,7 +3,7 @@ Export the list of unique URLs to a SQLite file for analysis/evaluation.
 """
 import sqlite3

-from mwmbl.indexer.paths import URLS_PATH
+from mwmbl.indexer import URLS_PATH
 from mwmbl.app import get_config_and_index


--- a/analyse/index_local.py
+++ b/analyse/index_local.py
@ -7,16 +7,15 @@ import json
 import logging
 import os
 import sys
-from pathlib import Path
 from datetime import datetime

 import spacy

-from mwmbl.crawler.batch import HashedBatch
+from mwmbl.crawler import HashedBatch
 from mwmbl.crawler.urls import URLDatabase
 from mwmbl.database import Database
-from mwmbl.indexer.index_batches import index_batches
-from mwmbl.tinysearchengine.indexer import TinyIndex, Document
+from mwmbl.indexer import index_batches
+from mwmbl.tinysearchengine import TinyIndex, Document

 LOCAL_BATCHES_PATH = f'{os.environ["HOME"]}/data/mwmbl/file/**/*.json.gz'
 NUM_BATCHES = 10000
--- a/analyse/index_url_count.py
+++ b/analyse/index_url_count.py
@ -1,7 +1,7 @@
 """
 Count unique URLs in the index.
 """
-from mwmbl.tinysearchengine.indexer import TinyIndex, Document
+from mwmbl.tinysearchengine import TinyIndex, Document


 def run():
--- a/analyse/inspect_index.py
+++ b/analyse/inspect_index.py
@ -5,9 +5,9 @@ import numpy as np
 import spacy

 from analyse.index_local import EVALUATE_INDEX_PATH
-from mwmbl.indexer.index import tokenize_document
-from mwmbl.indexer.paths import INDEX_PATH
-from mwmbl.tinysearchengine.indexer import TinyIndex, Document
+from mwmbl.indexer import tokenize_document
+from mwmbl.indexer import INDEX_PATH
+from mwmbl.tinysearchengine import TinyIndex, Document


 logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
--- a/analyse/record_historical_batches.py
+++ b/analyse/record_historical_batches.py
@ -4,12 +4,10 @@ See how many unique URLs and root domains we have crawled.
 import glob
 import gzip
 import json
-from collections import defaultdict, Counter
-from urllib.parse import urlparse

 import requests

-from mwmbl.indexer.paths import CRAWL_GLOB
+from mwmbl.indexer import CRAWL_GLOB


 API_ENDPOINT = "http://95.216.215.29/batches/historical"
--- a/analyse/search.py
+++ b/analyse/search.py
@ -2,9 +2,9 @@ import logging
 import sys
 from itertools import islice

-from mwmbl.indexer.paths import INDEX_PATH
+from mwmbl.indexer import INDEX_PATH
 from mwmbl.tinysearchengine.completer import Completer
-from mwmbl.tinysearchengine.indexer import TinyIndex, Document
+from mwmbl.tinysearchengine import TinyIndex, Document
 from mwmbl.tinysearchengine.rank import HeuristicRanker

 logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
--- a/analyse/send_batch.py
+++ b/analyse/send_batch.py
@ -3,7 +3,7 @@ Send a batch to a running instance.
 """
 import requests

-from mwmbl.crawler.batch import Batch, Item, ItemContent
+from mwmbl.crawler import Batch, Item, ItemContent


 URL = 'http://localhost:5000/crawler/batches/'
--- a/analyse/update_urls.py
+++ b/analyse/update_urls.py
@ -4,7 +4,7 @@ from datetime import datetime
 from pathlib import Path
 from queue import Queue

-from mwmbl.indexer.update_urls import record_urls_in_database
+from mwmbl.indexer import record_urls_in_database


 def run_update_urls_on_fixed_batches():
--- a/manage.py
+++ b/manage.py
@ -0,0 +1,22 @@
+#!/usr/bin/env python
+"""Django's command-line utility for administrative tasks."""
+import os
+import sys
+
+
+def main():
+    """Run Django administrative tasks from the command line.
+
+    Defaults DJANGO_SETTINGS_MODULE to the development settings when the
+    variable is not already set in the environment, then hands sys.argv to
+    Django's command-line dispatcher.
+
+    Raises:
+        ImportError: if Django cannot be imported.
+    """
+    # The project package is `mwmbl` (see mwmbl/asgi.py and mwmbl/wsgi.py,
+    # which use the same default); there is no `app.settings` module.
+    os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'mwmbl.settings_dev')
+    try:
+        # Imported here so a missing Django install produces the friendlier
+        # message below instead of failing at module import time.
+        from django.core.management import execute_from_command_line
+    except ImportError as exc:
+        raise ImportError(
+            "Couldn't import Django. Are you sure it's installed and "
+            "available on your PYTHONPATH environment variable? Did you "
+            "forget to activate a virtual environment?"
+        ) from exc
+    execute_from_command_line(sys.argv)
+
+
+if __name__ == '__main__':
+    main()
--- a/mwmbl/api.py
+++ b/mwmbl/api.py
@ -0,0 +1,31 @@
+from multiprocessing import Queue
+from pathlib import Path
+
+from django.conf import settings
+from ninja import NinjaAPI
+
+import mwmbl.crawler.app as crawler
+from mwmbl.indexer.batch_cache import BatchCache
+from mwmbl.indexer.paths import INDEX_NAME, BATCH_DIR_NAME
+from mwmbl.tinysearchengine import search
+from mwmbl.tinysearchengine.completer import Completer
+from mwmbl.tinysearchengine.indexer import TinyIndex, Document
+from mwmbl.tinysearchengine.rank import HeuristicRanker
+
+# Module-level wiring of the HTTP API. Everything below runs at import time;
+# mwmbl/urls.py imports `api` and mounts `api.urls` at the site root.
+api = NinjaAPI(version="1.0.0")
+
+# The index file lives under the configured data directory.
+index_path = Path(settings.DATA_PATH) / INDEX_NAME
+tiny_index = TinyIndex(item_factory=Document, index_path=index_path)
+# NOTE(review): the context manager is entered by hand and this module never
+# calls __exit__, so the index is presumably meant to stay open for the
+# lifetime of the process - confirm that no cleanup is required on shutdown.
+tiny_index.__enter__()
+
+completer = Completer()
+ranker = HeuristicRanker(tiny_index, completer)
+
+# Search endpoints are served under /search/.
+search_router = search.create_router(ranker)
+api.add_router("/search/", search_router)
+
+batch_cache = BatchCache(Path(settings.DATA_PATH) / BATCH_DIR_NAME)
+
+# This queue is also imported by mwmbl/apps.py, which passes it to the
+# update_queue_continuously background process; the crawler router reads
+# from it when a client requests a new batch.
+queued_batches = Queue()
+# Crawler endpoints are served under /crawler/.
+crawler_router = crawler.create_router(batch_cache=batch_cache, queued_batches=queued_batches)
+api.add_router("/crawler/", crawler_router)
--- a/mwmbl/apps.py
+++ b/mwmbl/apps.py
@ -0,0 +1,35 @@
+from multiprocessing import Process, Queue
+from pathlib import Path
+
+from django.apps import AppConfig
+from django.conf import settings
+
+from mwmbl.api import queued_batches
+from mwmbl import background
+from mwmbl.indexer.paths import INDEX_NAME
+from mwmbl.indexer.update_urls import update_urls_continuously
+from mwmbl.tinysearchengine.indexer import TinyIndex, Document, PAGE_SIZE
+from mwmbl.url_queue import update_queue_continuously
+
+
+class MwmblConfig(AppConfig):
+    """Django app configuration for mwmbl.
+
+    On startup it checks (or creates) the search index and, when enabled in
+    settings, launches the background worker processes.
+    """
+    name = "mwmbl"
+    verbose_name = "Mwmbl Application"
+
+    def ready(self):
+        """Validate or create the index, then optionally start background processes.
+
+        Raises:
+            ValueError: if an index already exists at the configured path but
+                its page size differs from PAGE_SIZE or its number of pages
+                differs from settings.NUM_PAGES.
+        """
+        index_path = Path(settings.DATA_PATH) / INDEX_NAME
+        try:
+            # Opening the index is only used to read its metadata for the check below.
+            existing_index = TinyIndex(item_factory=Document, index_path=index_path)
+            if existing_index.page_size != PAGE_SIZE or existing_index.num_pages != settings.NUM_PAGES:
+                raise ValueError(f"Existing index page sizes ({existing_index.page_size}) or number of pages "
+                                 f"({existing_index.num_pages}) do not match")
+        except FileNotFoundError:
+            # FileNotFoundError is taken to mean that no index exists yet at
+            # index_path, so a fresh one is created with the configured size.
+            print("Creating a new index")
+            TinyIndex.create(item_factory=Document, index_path=index_path, num_pages=settings.NUM_PAGES,
+                             page_size=PAGE_SIZE)
+
+        if settings.RUN_BACKGROUND_PROCESSES:
+            # new_item_queue links the URL updater to the queue updater;
+            # queued_batches is the queue object created in mwmbl/api.py and
+            # handed to the crawler router there.
+            new_item_queue = Queue()
+            Process(target=background.run, args=(settings.DATA_PATH,)).start()
+            Process(target=update_queue_continuously, args=(new_item_queue, queued_batches,)).start()
+            Process(target=update_urls_continuously, args=(settings.DATA_PATH, new_item_queue)).start()
--- a/mwmbl/asgi.py
+++ b/mwmbl/asgi.py
@ -0,0 +1,16 @@
+"""
+ASGI config for the mwmbl project.
+
+It exposes the ASGI callable as a module-level variable named ``application``.
+
+For more information on this file, see
+https://docs.djangoproject.com/en/4.2/howto/deployment/asgi/
+"""
+
+import os
+
+from django.core.asgi import get_asgi_application
+
+os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'mwmbl.settings_dev')
+
+application = get_asgi_application()
--- a/mwmbl/crawler/app.py
+++ b/mwmbl/crawler/app.py
@ -10,10 +10,11 @@ from uuid import uuid4
 import boto3
 import justext
 import requests
-from fastapi import HTTPException, APIRouter
+from fastapi import HTTPException
 from justext.core import html_to_dom, ParagraphMaker, classify_paragraphs, revise_paragraph_classification, \
    LENGTH_LOW_DEFAULT, STOPWORDS_LOW_DEFAULT, MAX_LINK_DENSITY_DEFAULT, NO_HEADINGS_DEFAULT, LENGTH_HIGH_DEFAULT, \
    STOPWORDS_HIGH_DEFAULT, MAX_HEADING_DISTANCE_DEFAULT, DEFAULT_ENCODING, DEFAULT_ENC_ERRORS, preprocessor
+from ninja import Router
 from redis import Redis

 from mwmbl.crawler.batch import Batch, NewBatchRequest, HashedBatch
@ -82,17 +83,15 @@ def justext_with_dom(html_text, stoplist, length_low=LENGTH_LOW_DEFAULT,
    return paragraphs, title


-def get_router(batch_cache: BatchCache, queued_batches: Queue):
-    router = APIRouter(prefix="/crawler", tags=["crawler"])
+def create_router(batch_cache: BatchCache, queued_batches: Queue) -> Router:
+    router = Router(tags=["crawler"])

-    @router.on_event("startup")
-    async def on_startup():
-        with Database() as db:
-            url_db = URLDatabase(db.connection)
-            return url_db.create_tables()
+    # TODO: ensure the URL database tables are created before the crawler
    #       code is used; the removed startup hook did this by calling
    #       url_db.create_tables()

    @router.get('/fetch')
-    def fetch_url(url: str, query: str):
+    def fetch_url(request, url: str, query: str):
        response = requests.get(url)
        paragraphs, title = justext_with_dom(response.content, justext.get_stoplist("English"))
        good_paragraphs = [p for p in paragraphs if p.class_type == 'good']
@ -105,7 +104,7 @@ def get_router(batch_cache: BatchCache, queued_batches: Queue):
        return format_result(result, query)

    @router.post('/batches/')
-    def post_batch(batch: Batch):
+    def post_batch(request, batch: Batch):
        if len(batch.items) > MAX_BATCH_SIZE:
            raise HTTPException(400, f"Batch size too large (maximum {MAX_BATCH_SIZE}), got {len(batch.items)}")

@ -159,7 +158,7 @@ def get_router(batch_cache: BatchCache, queued_batches: Queue):
        }

    @router.post('/batches/new')
-    def request_new_batch(batch_request: NewBatchRequest) -> list[str]:
+    def request_new_batch(request, batch_request: NewBatchRequest) -> list[str]:
        user_id_hash = _get_user_id_hash(batch_request)
        try:
            urls = queued_batches.get(block=False)
@ -174,14 +173,14 @@ def get_router(batch_cache: BatchCache, queued_batches: Queue):
        return urls

    @router.get('/batches/{date_str}/users/{public_user_id}')
-    def get_batches_for_date_and_user(date_str, public_user_id):
+    def get_batches_for_date_and_user(request, date_str, public_user_id):
        check_date_str(date_str)
        check_public_user_id(public_user_id)
        prefix = f'1/{VERSION}/{date_str}/1/{public_user_id}/'
        return get_batch_ids_for_prefix(prefix)

    @router.get('/batches/{date_str}/users/{public_user_id}/batch/{batch_id}')
-    def get_batch_from_id(date_str, public_user_id, batch_id):
+    def get_batch_from_id(request, date_str, public_user_id, batch_id):
        url = get_batch_url(batch_id, date_str, public_user_id)
        data = json.loads(gzip.decompress(requests.get(url).content))
        return {
@ -189,22 +188,22 @@ def get_router(batch_cache: BatchCache, queued_batches: Queue):
            'batch': data,
        }

-    @router.get('/latest-batch', response_model=list[HashedBatch])
-    def get_latest_batch():
+    @router.get('/latest-batch')
+    def get_latest_batch(request) -> list[HashedBatch]:
        return [] if last_batch is None else [last_batch]

    @router.get('/batches/{date_str}/users')
-    def get_user_id_hashes_for_date(date_str: str):
+    def get_user_id_hashes_for_date(request, date_str: str):
        check_date_str(date_str)
        prefix = f'1/{VERSION}/{date_str}/1/'
        return get_subfolders(prefix)

    @router.get('/stats')
-    def get_stats() -> MwmblStats:
+    def get_stats(request) -> MwmblStats:
        return stats_manager.get_stats()

    @router.get('/')
-    def status():
+    def status(request):
        return {
            'status': 'ok'
        }
--- a/mwmbl/crawler/batch.py
+++ b/mwmbl/crawler/batch.py
@ -1,21 +1,21 @@
 from typing import Optional

-from pydantic import BaseModel
+from ninja import Schema


-class ItemContent(BaseModel):
+class ItemContent(Schema):
    title: str
    extract: str
    links: list[str]
    extra_links: Optional[list[str]]


-class ItemError(BaseModel):
+class ItemError(Schema):
    name: str
    message: Optional[str]


-class Item(BaseModel):
+class Item(Schema):
    url: str
    status: Optional[int]
    timestamp: int
@ -23,16 +23,16 @@ class Item(BaseModel):
    error: Optional[ItemError]


-class Batch(BaseModel):
+class Batch(Schema):
    user_id: str
    items: list[Item]


-class NewBatchRequest(BaseModel):
+class NewBatchRequest(Schema):
    user_id: str


-class HashedBatch(BaseModel):
+class HashedBatch(Schema):
    user_id_hash: str
    timestamp: int
    items: list[Item]
--- a/mwmbl/crawler/urls.py
+++ b/mwmbl/crawler/urls.py
@ -1,16 +1,13 @@
 """
 Database storing info on URLs
 """
-import random
 from dataclasses import dataclass
-from datetime import datetime, timedelta
+from datetime import datetime
 from enum import Enum
 from logging import getLogger

 from psycopg2.extras import execute_values

-from mwmbl.hn_top_domains_filtered import DOMAINS
-from mwmbl.settings import CORE_DOMAINS
 # Client has one hour to crawl a URL that has been assigned to them, or it will be reassigned
 from mwmbl.utils import batch

--- a/mwmbl/indexer/batch_cache.py
+++ b/mwmbl/indexer/batch_cache.py
@ -9,7 +9,6 @@ import os
 from logging import getLogger
 from multiprocessing.pool import ThreadPool
 from pathlib import Path
-from tempfile import NamedTemporaryFile
 from urllib.parse import urlparse

 from pydantic import ValidationError
--- a/mwmbl/indexer/index.py
+++ b/mwmbl/indexer/index.py
@ -1,13 +1,10 @@
 """
 Create a search index
 """
-from collections import Counter
 from typing import Iterable
 from urllib.parse import unquote

-import pandas as pd
-
-from mwmbl.tinysearchengine.indexer import Document, TokenizedDocument, TinyIndex
+from mwmbl.tinysearchengine.indexer import TokenizedDocument
 from mwmbl.tokenizer import tokenize, get_bigrams

 DEFAULT_SCORE = 0
--- a/mwmbl/indexer/update_urls.py
+++ b/mwmbl/indexer/update_urls.py
@ -1,13 +1,10 @@
-import os
-import pickle
-import re
 from collections import defaultdict
 from datetime import datetime, timezone, timedelta
 from logging import getLogger
 from multiprocessing import Queue
 from pathlib import Path
 from time import sleep
-from typing import Iterable, Collection
+from typing import Collection
 from urllib.parse import urlparse

 from requests_cache import CachedSession
--- a/mwmbl/main.py
+++ b/mwmbl/main.py
@ -1,96 +1,8 @@
-import argparse
-import logging
-import sys
-from multiprocessing import Process, Queue
-from pathlib import Path
-
 import uvicorn
-from fastapi import FastAPI
-from starlette.middleware.cors import CORSMiddleware
-
-from mwmbl import background
-from mwmbl.crawler import app as crawler
-from mwmbl.indexer.batch_cache import BatchCache
-from mwmbl.indexer.paths import INDEX_NAME, BATCH_DIR_NAME
-from mwmbl.platform import user
-from mwmbl.indexer.update_urls import update_urls_continuously
-from mwmbl.tinysearchengine import search
-from mwmbl.tinysearchengine.completer import Completer
-from mwmbl.tinysearchengine.indexer import TinyIndex, Document, PAGE_SIZE
-from mwmbl.tinysearchengine.rank import HeuristicRanker
-from mwmbl.url_queue import update_queue_continuously
-
-FORMAT = '%(levelname)s %(name)s %(asctime)s %(message)s'
-logging.basicConfig(stream=sys.stdout, level=logging.INFO, format=FORMAT)
-
-
-MODEL_PATH = Path(__file__).parent / 'resources' / 'model.pickle'
-
-
-def setup_args():
-    parser = argparse.ArgumentParser(description="Mwmbl API server and background task processor")
-    parser.add_argument("--num-pages", type=int, help="Number of pages of memory (4096 bytes) to use for the index", default=2560)
-    parser.add_argument("--data", help="Path to the data folder for storing index and cached batches", default="./devdata")
-    parser.add_argument("--port", type=int, help="Port for the server to listen at", default=5000)
-    parser.add_argument("--background", help="Enable running the background tasks to process batches",
-                        action='store_true')
-    args = parser.parse_args()
-    return args


 def run():
-    args = setup_args()
-
-    index_path = Path(args.data) / INDEX_NAME
-    try:
-        existing_index = TinyIndex(item_factory=Document, index_path=index_path)
-        if existing_index.page_size != PAGE_SIZE or existing_index.num_pages != args.num_pages:
-            raise ValueError(f"Existing index page sizes ({existing_index.page_size}) or number of pages "
-                             f"({existing_index.num_pages}) do not match")
-    except FileNotFoundError:
-        print("Creating a new index")
-        TinyIndex.create(item_factory=Document, index_path=index_path, num_pages=args.num_pages, page_size=PAGE_SIZE)
-
-    new_item_queue = Queue()
-    queued_batches = Queue()
-    # curation_queue = Queue()
-
-    if args.background:
-        Process(target=background.run, args=(args.data,)).start()
-        Process(target=update_queue_continuously, args=(new_item_queue, queued_batches,)).start()
-        Process(target=update_urls_continuously, args=(args.data, new_item_queue)).start()
-
-    completer = Completer()
-
-    with TinyIndex(item_factory=Document, index_path=index_path) as tiny_index:
-        ranker = HeuristicRanker(tiny_index, completer)
-        # model = pickle.load(open(MODEL_PATH, 'rb'))
-        # ranker = LTRRanker(model, tiny_index, completer)
-
-        # Initialize FastApi instance
-        app = FastAPI()
-
-        # Try disabling since this is handled by nginx
-        # app.add_middleware(
-        #     CORSMiddleware,
-        #     allow_origins=["*"],
-        #     allow_credentials=True,
-        #     allow_methods=["*"],
-        #     allow_headers=["*"],
-        # )
-
-        search_router = search.create_router(ranker)
-        app.include_router(search_router)
-
-        batch_cache = BatchCache(Path(args.data) / BATCH_DIR_NAME)
-        crawler_router = crawler.get_router(batch_cache, queued_batches)
-        app.include_router(crawler_router)
-
-        user_router = user.create_router(index_path)
-        app.include_router(user_router)
-
-        # Initialize uvicorn server using global app instance and server config params
-        uvicorn.run(app, host="0.0.0.0", port=args.port)
+    uvicorn.run("mwmbl.asgi:application", host="0.0.0.0", port=8000)


 if __name__ == "__main__":
--- a/mwmbl/platform/user.py
+++ b/mwmbl/platform/user.py
@ -7,7 +7,7 @@ import requests
 from fastapi import APIRouter, Response
 from pydantic import BaseModel

-from mwmbl.tinysearchengine.indexer import TinyIndex, Document, DocumentState
+from mwmbl.tinysearchengine.indexer import TinyIndex, Document
 from mwmbl.tokenizer import tokenize


--- a/mwmbl/settings_common.py
+++ b/mwmbl/settings_common.py
@ -0,0 +1,125 @@
+"""
+Django settings for mwmbl project.
+
+Generated by 'django-admin startproject' using Django 4.2.4.
+
+For more information on this file, see
+https://docs.djangoproject.com/en/4.2/topics/settings/
+
+For the full list of settings and their values, see
+https://docs.djangoproject.com/en/4.2/ref/settings/
+"""
+
+from pathlib import Path
+
+# Build paths inside the project like this: BASE_DIR / 'subdir'.
+BASE_DIR = Path(__file__).resolve().parent.parent
+
+
+# Quick-start development settings - unsuitable for production
+# See https://docs.djangoproject.com/en/4.2/howto/deployment/checklist/
+
+# SECURITY WARNING: keep the secret key used in production secret!
+SECRET_KEY = 'django-insecure-qqr#f(i3uf%m8%8u35vn=ov-uk(*8!a&1t-hxa%ev2^t1%j&sm'
+
+# SECURITY WARNING: don't run with debug turned on in production!
+DEBUG = True
+
+ALLOWED_HOSTS = []
+
+
+# Application definition
+
+INSTALLED_APPS = [
+    'django.contrib.admin',
+    'django.contrib.auth',
+    'django.contrib.contenttypes',
+    'django.contrib.sessions',
+    'django.contrib.messages',
+    'django.contrib.staticfiles',
+    'mwmbl',
+]
+
+MIDDLEWARE = [
+    'django.middleware.security.SecurityMiddleware',
+    'django.contrib.sessions.middleware.SessionMiddleware',
+    'django.middleware.common.CommonMiddleware',
+    'django.middleware.csrf.CsrfViewMiddleware',
+    'django.contrib.auth.middleware.AuthenticationMiddleware',
+    'django.contrib.messages.middleware.MessageMiddleware',
+    'django.middleware.clickjacking.XFrameOptionsMiddleware',
+]
+
+ROOT_URLCONF = 'mwmbl.urls'
+
+TEMPLATES = [
+    {
+        'BACKEND': 'django.template.backends.django.DjangoTemplates',
+        'DIRS': [],
+        'APP_DIRS': True,
+        'OPTIONS': {
+            'context_processors': [
+                'django.template.context_processors.debug',
+                'django.template.context_processors.request',
+                'django.contrib.auth.context_processors.auth',
+                'django.contrib.messages.context_processors.messages',
+            ],
+        },
+    },
+]
+
+WSGI_APPLICATION = 'mwmbl.wsgi.application'
+
+
+# Database
+# https://docs.djangoproject.com/en/4.2/ref/settings/#databases
+
+DATABASES = {
+    'default': {
+        'ENGINE': 'django.db.backends.sqlite3',
+        'NAME': BASE_DIR / 'db.sqlite3',
+    }
+}
+
+
+# Password validation
+# https://docs.djangoproject.com/en/4.2/ref/settings/#auth-password-validators
+
+AUTH_PASSWORD_VALIDATORS = [
+    {
+        'NAME': 'django.contrib.auth.password_validation.UserAttributeSimilarityValidator',
+    },
+    {
+        'NAME': 'django.contrib.auth.password_validation.MinimumLengthValidator',
+    },
+    {
+        'NAME': 'django.contrib.auth.password_validation.CommonPasswordValidator',
+    },
+    {
+        'NAME': 'django.contrib.auth.password_validation.NumericPasswordValidator',
+    },
+]
+
+
+# Internationalization
+# https://docs.djangoproject.com/en/4.2/topics/i18n/
+
+LANGUAGE_CODE = 'en-us'
+
+TIME_ZONE = 'UTC'
+
+USE_I18N = True
+
+USE_TZ = True
+
+
+# Static files (CSS, JavaScript, Images)
+# https://docs.djangoproject.com/en/4.2/howto/static-files/
+
+STATIC_URL = 'static/'
+
+# Default primary key field type
+# https://docs.djangoproject.com/en/4.2/ref/settings/#default-auto-field
+
+DEFAULT_AUTO_FIELD = 'django.db.models.BigAutoField'
+
--- a/mwmbl/settings_dev.py
+++ b/mwmbl/settings_dev.py
@ -0,0 +1,5 @@
+from mwmbl.settings_common import *
+
+DATA_PATH = "./devdata"
+RUN_BACKGROUND_PROCESSES = False
+NUM_PAGES = 2560
--- a/mwmbl/settings_prod.py
+++ b/mwmbl/settings_prod.py
@ -0,0 +1,5 @@
+# Production settings: everything from settings_common plus the
+# deployment-specific values below.
+# NOTE(review): DEBUG = True and the hard-coded SECRET_KEY defined in
+# settings_common are inherited unchanged here - confirm they are overridden
+# elsewhere before this module is used for a real deployment.
+from mwmbl.settings_common import *
+
+# Directory holding the index and the cached batches (read by mwmbl/api.py
+# and mwmbl/apps.py).
+DATA_PATH = "/app/storage"
+# When true, MwmblConfig.ready() starts the background worker processes.
+RUN_BACKGROUND_PROCESSES = True
+# Expected number of pages in the index; an existing index with a different
+# value makes MwmblConfig.ready() raise ValueError.
+NUM_PAGES = 10240000
--- a/mwmbl/tinysearchengine/rank.py
+++ b/mwmbl/tinysearchengine/rank.py
@ -6,7 +6,6 @@ from operator import itemgetter
 from urllib.parse import urlparse

 from mwmbl.format import format_result_with_pattern, get_query_regex
-from mwmbl.platform.user import MAX_CURATED_SCORE
 from mwmbl.tokenizer import tokenize, get_bigrams
 from mwmbl.tinysearchengine.completer import Completer
 from mwmbl.hn_top_domains_filtered import DOMAINS
--- a/mwmbl/tinysearchengine/search.py
+++ b/mwmbl/tinysearchengine/search.py
@ -1,6 +1,6 @@
 from logging import getLogger

-from fastapi import APIRouter
+from ninja import Router

 from mwmbl.tinysearchengine.rank import HeuristicRanker

@ -10,15 +10,15 @@ logger = getLogger(__name__)
 SCORE_THRESHOLD = 0.25


-def create_router(ranker: HeuristicRanker) -> APIRouter:
-    router = APIRouter(prefix="/search", tags=["search"])
+def create_router(ranker: HeuristicRanker) -> Router:
+    router = Router(tags=["search"])

    @router.get("")
-    def search(s: str):
+    def search(request, s: str):
        return ranker.search(s)

    @router.get("/complete")
-    def complete(q: str):
+    def complete(request, q: str):
        return ranker.complete(q)

    return router
--- a/mwmbl/url_queue.py
+++ b/mwmbl/url_queue.py
@ -1,6 +1,5 @@
 import time
 from collections import defaultdict
-from dataclasses import dataclass
 from datetime import datetime, timedelta
 from logging import getLogger
 from multiprocessing import Queue
--- a/mwmbl/urls.py
+++ b/mwmbl/urls.py
@ -0,0 +1,25 @@
+"""
+URL configuration for the mwmbl project.
+
+The `urlpatterns` list routes URLs to views. For more information please see:
+    https://docs.djangoproject.com/en/4.2/topics/http/urls/
+Examples:
+Function views
+    1. Add an import:  from my_app import views
+    2. Add a URL to urlpatterns:  path('', views.home, name='home')
+Class-based views
+    1. Add an import:  from other_app.views import Home
+    2. Add a URL to urlpatterns:  path('', Home.as_view(), name='home')
+Including another URLconf
+    1. Import the include() function: from django.urls import include, path
+    2. Add a URL to urlpatterns:  path('blog/', include('blog.urls'))
+"""
+from django.contrib import admin
+from django.urls import path
+
+from mwmbl.api import api
+
+urlpatterns = [
+    path('admin/', admin.site.urls),
+    path('', api.urls)
+]
--- a/mwmbl/wsgi.py
+++ b/mwmbl/wsgi.py
@ -0,0 +1,16 @@
+"""
+WSGI config for the mwmbl project.
+
+It exposes the WSGI callable as a module-level variable named ``application``.
+
+For more information on this file, see
+https://docs.djangoproject.com/en/4.2/howto/deployment/wsgi/
+"""
+
+import os
+
+from django.core.wsgi import get_wsgi_application
+
+os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'mwmbl.settings_dev')
+
+application = get_wsgi_application()
--- a/pyproject.toml
+++ b/pyproject.toml
@ -33,6 +33,8 @@ langdetect = {version= "==1.0.9", optional = true}
 pyarrow = {version= "==6.0.0", optional = true}
 pyspark = {version= "==3.2.0", optional = true}
 Levenshtein = {version= "==0.16.0", optional = true}
+django = "^4.2.4"
+django-ninja = "^0.22.2"
 requests-cache = "^1.1.0"
 redis = {extras = ["hiredis"], version = "^5.0.1"}

--- a/test/test_completer.py
+++ b/test/test_completer.py
@ -1,5 +1,3 @@
-import mwmbl.tinysearchengine.completer
-import pytest
 import pandas as pd

 def mockCompleterData(mocker, data):
@ -16,7 +14,7 @@ def test_correctCompletions(mocker):
        [3, 'buildings', 1]]
    mockCompleterData(mocker, testdata)
    
-    completer = mwmbl.tinysearchengine.completer.Completer()
+    completer = app.tinysearchengine.completer.Completer()
    completion = completer.complete('build')
    assert ['build', 'builder', 'buildings'] == completion

@ -29,7 +27,7 @@ def test_correctSortOrder(mocker):
        [3, 'buildings', 3]]
    mockCompleterData(mocker, testdata)
    
-    completer = mwmbl.tinysearchengine.completer.Completer()
+    completer = app.tinysearchengine.completer.Completer()
    completion = completer.complete('build')
    assert ['build', 'buildings', 'builder'] == completion
    
@ -42,7 +40,7 @@ def test_noCompletions(mocker):
        [3, 'buildings', 1]]
    mockCompleterData(mocker, testdata)
    
-    completer = mwmbl.tinysearchengine.completer.Completer()
+    completer = app.tinysearchengine.completer.Completer()
    completion = completer.complete('test')
    assert [] == completion
    
@ -55,7 +53,7 @@ def test_singleCompletions(mocker):
        [3, 'buildings', 1]]
    mockCompleterData(mocker, testdata)
    
-    completer = mwmbl.tinysearchengine.completer.Completer()
+    completer = app.tinysearchengine.completer.Completer()
    completion = completer.complete('announce')
    assert ['announce'] == completion
    
@ -68,7 +66,7 @@ def test_idempotencyWithSameScoreCompletions(mocker):
        [3, 'buildings', 1]]
    mockCompleterData(mocker, testdata)
    
-    completer = mwmbl.tinysearchengine.completer.Completer()
+    completer = app.tinysearchengine.completer.Completer()
    for i in range(3):
        print(f"iteration: {i}")
        completion = completer.complete('build')
--- a/test/test_indexer.py
+++ b/test/test_indexer.py
@ -1,9 +1,9 @@
 from pathlib import Path
 from tempfile import TemporaryDirectory

-from mwmbl.tinysearchengine.indexer import Document, TinyIndex, _binary_search_fitting_size, astuple, _trim_items_to_page, _get_page_data, _pad_to_page_size
-from zstandard import ZstdDecompressor, ZstdCompressor, ZstdError
-import json
+from mwmbl.tinysearchengine import Document, TinyIndex, _binary_search_fitting_size, astuple, _trim_items_to_page, _get_page_data, _pad_to_page_size
+from zstandard import ZstdCompressor
+

 def test_create_index():
    num_pages = 10
--- a/test/test_update_urls.py
+++ b/test/test_update_urls.py
@ -1,4 +1,4 @@
-from mwmbl.indexer.update_urls import process_link
+from mwmbl.indexer import process_link


 def test_process_link_normal():