Merge pull request #115 from mwmbl/django-rewrite

Django rewrite
Commit 213bdaa365 by Daoud Clarke, 2023-10-10 16:25:36 +01:00 (committed via GitHub)
35 changed files with 346 additions and 167 deletions

View file

@@ -46,5 +46,8 @@ VOLUME ["/data"]
 EXPOSE 5000

-# Using the mwmbl-tinysearchengine binary/entrypoint which comes packaged with mwmbl
-CMD ["/venv/bin/mwmbl-tinysearchengine", "--num-pages", "10240000", "--background", "--data", "/app/storage"]
+ENV DJANGO_SETTINGS_MODULE=mwmbl.settings_dev
+# WORKDIR "/venv/lib/python3.10/site-packages/mwmbl/"
+# CMD ["/venv/bin/python", "-m", "uvicorn", "app.asgi:application"]
+CMD ["/venv/bin/mwmbl-tinysearchengine"]

View file

@@ -7,8 +7,8 @@ import json
 from collections import defaultdict, Counter
 from urllib.parse import urlparse

-from mwmbl.crawler.batch import HashedBatch
-from mwmbl.indexer.paths import CRAWL_GLOB, MWMBL_DATA_DIR
+from mwmbl.crawler import HashedBatch
+from mwmbl.indexer import CRAWL_GLOB, MWMBL_DATA_DIR

 # TODO: remove this line - temporary override

View file

@ -1,6 +1,6 @@
import json import json
from mwmbl.indexer.paths import TOP_DOMAINS_JSON_PATH from mwmbl.indexer import TOP_DOMAINS_JSON_PATH
from mwmbl.hn_top_domains_filtered import DOMAINS from mwmbl.hn_top_domains_filtered import DOMAINS

View file

@@ -3,7 +3,7 @@ Export the list of unique URLs to a SQLite file for analysis/evaluation.
 """
 import sqlite3

-from mwmbl.indexer.paths import URLS_PATH
+from mwmbl.indexer import URLS_PATH
 from mwmbl.app import get_config_and_index

View file

@@ -7,16 +7,15 @@ import json
 import logging
 import os
 import sys
-from pathlib import Path
 from datetime import datetime

 import spacy

-from mwmbl.crawler.batch import HashedBatch
+from mwmbl.crawler import HashedBatch
 from mwmbl.crawler.urls import URLDatabase
 from mwmbl.database import Database
-from mwmbl.indexer.index_batches import index_batches
-from mwmbl.tinysearchengine.indexer import TinyIndex, Document
+from mwmbl.indexer import index_batches
+from mwmbl.tinysearchengine import TinyIndex, Document

 LOCAL_BATCHES_PATH = f'{os.environ["HOME"]}/data/mwmbl/file/**/*.json.gz'
 NUM_BATCHES = 10000

View file

@@ -1,7 +1,7 @@
 """
 Count unique URLs in the index.
 """
-from mwmbl.tinysearchengine.indexer import TinyIndex, Document
+from mwmbl.tinysearchengine import TinyIndex, Document


 def run():

View file

@@ -5,9 +5,9 @@ import numpy as np
 import spacy

 from analyse.index_local import EVALUATE_INDEX_PATH
-from mwmbl.indexer.index import tokenize_document
-from mwmbl.indexer.paths import INDEX_PATH
-from mwmbl.tinysearchengine.indexer import TinyIndex, Document
+from mwmbl.indexer import tokenize_document
+from mwmbl.indexer import INDEX_PATH
+from mwmbl.tinysearchengine import TinyIndex, Document

 logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)

View file

@@ -4,12 +4,10 @@ See how many unique URLs and root domains we have crawled.
 import glob
 import gzip
 import json
-from collections import defaultdict, Counter
-from urllib.parse import urlparse

 import requests

-from mwmbl.indexer.paths import CRAWL_GLOB
+from mwmbl.indexer import CRAWL_GLOB

 API_ENDPOINT = "http://95.216.215.29/batches/historical"

View file

@@ -2,9 +2,9 @@ import logging
 import sys
 from itertools import islice

-from mwmbl.indexer.paths import INDEX_PATH
+from mwmbl.indexer import INDEX_PATH
 from mwmbl.tinysearchengine.completer import Completer
-from mwmbl.tinysearchengine.indexer import TinyIndex, Document
+from mwmbl.tinysearchengine import TinyIndex, Document
 from mwmbl.tinysearchengine.rank import HeuristicRanker

 logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)

View file

@@ -3,7 +3,7 @@ Send a batch to a running instance.
 """
 import requests

-from mwmbl.crawler.batch import Batch, Item, ItemContent
+from mwmbl.crawler import Batch, Item, ItemContent

 URL = 'http://localhost:5000/crawler/batches/'

View file

@@ -4,7 +4,7 @@ from datetime import datetime
 from pathlib import Path
 from queue import Queue

-from mwmbl.indexer.update_urls import record_urls_in_database
+from mwmbl.indexer import record_urls_in_database


 def run_update_urls_on_fixed_batches():

manage.py (new executable file, 22 lines)

@@ -0,0 +1,22 @@
#!/usr/bin/env python
"""Django's command-line utility for administrative tasks."""
import os
import sys


def main():
    """Run administrative tasks."""
    os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'app.settings')
    try:
        from django.core.management import execute_from_command_line
    except ImportError as exc:
        raise ImportError(
            "Couldn't import Django. Are you sure it's installed and "
            "available on your PYTHONPATH environment variable? Did you "
            "forget to activate a virtual environment?"
        ) from exc
    execute_from_command_line(sys.argv)


if __name__ == '__main__':
    main()
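Note that this generated `manage.py` still points at `app.settings`, while the rest of the PR introduces `mwmbl.settings_dev` and `mwmbl.settings_prod`. Commands can also be invoked programmatically through the same entry point Django uses here; a minimal sketch, assuming the dev settings module is the intended default:

```python
import os
import sys

from django.core.management import execute_from_command_line

# Equivalent to running `python manage.py migrate` from the shell
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'mwmbl.settings_dev')
execute_from_command_line([sys.argv[0], 'migrate'])
```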

mwmbl/api.py (new file, 31 lines)

@@ -0,0 +1,31 @@
from multiprocessing import Queue
from pathlib import Path

from django.conf import settings
from ninja import NinjaAPI

import mwmbl.crawler.app as crawler
from mwmbl.indexer.batch_cache import BatchCache
from mwmbl.indexer.paths import INDEX_NAME, BATCH_DIR_NAME
from mwmbl.tinysearchengine import search
from mwmbl.tinysearchengine.completer import Completer
from mwmbl.tinysearchengine.indexer import TinyIndex, Document
from mwmbl.tinysearchengine.rank import HeuristicRanker

api = NinjaAPI(version="1.0.0")

index_path = Path(settings.DATA_PATH) / INDEX_NAME
tiny_index = TinyIndex(item_factory=Document, index_path=index_path)
tiny_index.__enter__()

completer = Completer()
ranker = HeuristicRanker(tiny_index, completer)

search_router = search.create_router(ranker)
api.add_router("/search/", search_router)

batch_cache = BatchCache(Path(settings.DATA_PATH) / BATCH_DIR_NAME)
queued_batches = Queue()
crawler_router = crawler.create_router(batch_cache=batch_cache, queued_batches=queued_batches)
api.add_router("/crawler/", crawler_router)

mwmbl/apps.py (new file, 35 lines)

@@ -0,0 +1,35 @@
from multiprocessing import Process, Queue
from pathlib import Path

from django.apps import AppConfig
from django.conf import settings

from mwmbl.api import queued_batches
from mwmbl import background
from mwmbl.indexer.paths import INDEX_NAME
from mwmbl.indexer.update_urls import update_urls_continuously
from mwmbl.tinysearchengine.indexer import TinyIndex, Document, PAGE_SIZE
from mwmbl.url_queue import update_queue_continuously


class MwmblConfig(AppConfig):
    name = "mwmbl"
    verbose_name = "Mwmbl Application"

    def ready(self):
        index_path = Path(settings.DATA_PATH) / INDEX_NAME
        try:
            existing_index = TinyIndex(item_factory=Document, index_path=index_path)
            if existing_index.page_size != PAGE_SIZE or existing_index.num_pages != settings.NUM_PAGES:
                raise ValueError(f"Existing index page sizes ({existing_index.page_size}) or number of pages "
                                 f"({existing_index.num_pages}) do not match")
        except FileNotFoundError:
            print("Creating a new index")
            TinyIndex.create(item_factory=Document, index_path=index_path, num_pages=settings.NUM_PAGES,
                             page_size=PAGE_SIZE)

        if settings.RUN_BACKGROUND_PROCESSES:
            new_item_queue = Queue()
            Process(target=background.run, args=(settings.DATA_PATH,)).start()
            Process(target=update_queue_continuously, args=(new_item_queue, queued_batches,)).start()
            Process(target=update_urls_continuously, args=(settings.DATA_PATH, new_item_queue)).start()
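One caveat with spawning workers from `AppConfig.ready()`: under `manage.py runserver`, Django's autoreloader imports the project in both the watcher process and the serving child, so `ready()` can run twice. A common guard, sketched below and not part of this PR (`RUN_MAIN` is the variable Django's autoreloader sets in the serving child):

```python
import os
import sys


def ready(self):
    # Skip the autoreloader's watcher process; only the child that
    # actually serves requests has RUN_MAIN == "true".
    if "runserver" in sys.argv and os.environ.get("RUN_MAIN") != "true":
        return
    # ... create the index and start the background Processes as above ...
```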

mwmbl/asgi.py (new file, 16 lines)

@@ -0,0 +1,16 @@
"""
ASGI config for app project.

It exposes the ASGI callable as a module-level variable named ``application``.

For more information on this file, see
https://docs.djangoproject.com/en/4.2/howto/deployment/asgi/
"""
import os

from django.core.asgi import get_asgi_application

os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'mwmbl.settings_dev')

application = get_asgi_application()

View file

@@ -10,10 +10,11 @@ from uuid import uuid4
 import boto3
 import justext
 import requests
-from fastapi import HTTPException, APIRouter
+from fastapi import HTTPException
 from justext.core import html_to_dom, ParagraphMaker, classify_paragraphs, revise_paragraph_classification, \
     LENGTH_LOW_DEFAULT, STOPWORDS_LOW_DEFAULT, MAX_LINK_DENSITY_DEFAULT, NO_HEADINGS_DEFAULT, LENGTH_HIGH_DEFAULT, \
     STOPWORDS_HIGH_DEFAULT, MAX_HEADING_DISTANCE_DEFAULT, DEFAULT_ENCODING, DEFAULT_ENC_ERRORS, preprocessor
+from ninja import Router
 from redis import Redis

 from mwmbl.crawler.batch import Batch, NewBatchRequest, HashedBatch

@@ -82,17 +83,15 @@ def justext_with_dom(html_text, stoplist, length_low=LENGTH_LOW_DEFAULT,
     return paragraphs, title


-def get_router(batch_cache: BatchCache, queued_batches: Queue):
-    router = APIRouter(prefix="/crawler", tags=["crawler"])
+def create_router(batch_cache: BatchCache, queued_batches: Queue) -> Router:
+    router = Router(tags=["crawler"])

-    @router.on_event("startup")
-    async def on_startup():
-        with Database() as db:
-            url_db = URLDatabase(db.connection)
-            return url_db.create_tables()
+    # TODO: # ensure tables are created before crawler code is used:
+    # #
+    # #     url_db.create_tables()

     @router.get('/fetch')
-    def fetch_url(url: str, query: str):
+    def fetch_url(request, url: str, query: str):
         response = requests.get(url)
         paragraphs, title = justext_with_dom(response.content, justext.get_stoplist("English"))
         good_paragraphs = [p for p in paragraphs if p.class_type == 'good']

@@ -105,7 +104,7 @@ def get_router(batch_cache: BatchCache, queued_batches: Queue):
         return format_result(result, query)

     @router.post('/batches/')
-    def post_batch(batch: Batch):
+    def post_batch(request, batch: Batch):
         if len(batch.items) > MAX_BATCH_SIZE:
             raise HTTPException(400, f"Batch size too large (maximum {MAX_BATCH_SIZE}), got {len(batch.items)}")

@@ -159,7 +158,7 @@ def get_router(batch_cache: BatchCache, queued_batches: Queue):
         }

     @router.post('/batches/new')
-    def request_new_batch(batch_request: NewBatchRequest) -> list[str]:
+    def request_new_batch(request, batch_request: NewBatchRequest) -> list[str]:
         user_id_hash = _get_user_id_hash(batch_request)
         try:
             urls = queued_batches.get(block=False)

@@ -174,14 +173,14 @@ def get_router(batch_cache: BatchCache, queued_batches: Queue):
         return urls

     @router.get('/batches/{date_str}/users/{public_user_id}')
-    def get_batches_for_date_and_user(date_str, public_user_id):
+    def get_batches_for_date_and_user(request, date_str, public_user_id):
         check_date_str(date_str)
         check_public_user_id(public_user_id)
         prefix = f'1/{VERSION}/{date_str}/1/{public_user_id}/'
         return get_batch_ids_for_prefix(prefix)

     @router.get('/batches/{date_str}/users/{public_user_id}/batch/{batch_id}')
-    def get_batch_from_id(date_str, public_user_id, batch_id):
+    def get_batch_from_id(request, date_str, public_user_id, batch_id):
         url = get_batch_url(batch_id, date_str, public_user_id)
         data = json.loads(gzip.decompress(requests.get(url).content))
         return {

@@ -189,22 +188,22 @@ def get_router(batch_cache: BatchCache, queued_batches: Queue):
             'batch': data,
         }

-    @router.get('/latest-batch', response_model=list[HashedBatch])
-    def get_latest_batch():
+    @router.get('/latest-batch')
+    def get_latest_batch(request) -> list[HashedBatch]:
         return [] if last_batch is None else [last_batch]

     @router.get('/batches/{date_str}/users')
-    def get_user_id_hashes_for_date(date_str: str):
+    def get_user_id_hashes_for_date(request, date_str: str):
         check_date_str(date_str)
         prefix = f'1/{VERSION}/{date_str}/1/'
         return get_subfolders(prefix)

     @router.get('/stats')
-    def get_stats() -> MwmblStats:
+    def get_stats(request) -> MwmblStats:
         return stats_manager.get_stats()

     @router.get('/')
-    def status():
+    def status(request):
         return {
             'status': 'ok'
         }
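The recurring change in this file: django-ninja passes Django's `HttpRequest` as the first positional argument to every operation, where FastAPI injected only the declared parameters, and the `/crawler` prefix moves off the router onto the `api.add_router("/crawler/", ...)` call in `mwmbl/api.py`. A minimal standalone sketch of the pattern (names are illustrative):

```python
from ninja import NinjaAPI, Router

api = NinjaAPI()
router = Router(tags=["example"])


@router.get("/ping")
def ping(request):  # ninja requires the request parameter even if unused
    return {"status": "ok"}


# The URL prefix is supplied at mount time, not on the Router itself
api.add_router("/crawler/", router)
```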

View file

@@ -1,21 +1,21 @@
 from typing import Optional

-from pydantic import BaseModel
+from ninja import Schema


-class ItemContent(BaseModel):
+class ItemContent(Schema):
     title: str
     extract: str
     links: list[str]
     extra_links: Optional[list[str]]


-class ItemError(BaseModel):
+class ItemError(Schema):
     name: str
     message: Optional[str]


-class Item(BaseModel):
+class Item(Schema):
     url: str
     status: Optional[int]
     timestamp: int

@@ -23,16 +23,16 @@ class Item(BaseModel):
     error: Optional[ItemError]


-class Batch(BaseModel):
+class Batch(Schema):
     user_id: str
     items: list[Item]


-class NewBatchRequest(BaseModel):
+class NewBatchRequest(Schema):
     user_id: str


-class HashedBatch(BaseModel):
+class HashedBatch(Schema):
     user_id_hash: str
     timestamp: int
     items: list[Item]
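`ninja.Schema` is a thin wrapper over the pydantic `BaseModel` these classes previously extended, so request-body validation behaves the same. A sketch of parsing a payload by hand, assuming the pydantic v1 API that django-ninja 0.22 builds on (field values are made up):

```python
payload = {
    "user_id": "abc123",
    "items": [{
        "url": "https://example.com/",
        "status": 200,
        "timestamp": 1696950000,
        # Optional fields such as error may be omitted
    }],
}

batch = Batch.parse_obj(payload)  # raises ValidationError on bad input
print(batch.items[0].url)
```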

View file

@@ -1,16 +1,13 @@
 """
 Database storing info on URLs
 """
-import random
 from dataclasses import dataclass
-from datetime import datetime, timedelta
+from datetime import datetime
 from enum import Enum
 from logging import getLogger

 from psycopg2.extras import execute_values

-from mwmbl.hn_top_domains_filtered import DOMAINS
-from mwmbl.settings import CORE_DOMAINS
 # Client has one hour to crawl a URL that has been assigned to them, or it will be reassigned
 from mwmbl.utils import batch

View file

@@ -9,7 +9,6 @@ import os
 from logging import getLogger
 from multiprocessing.pool import ThreadPool
 from pathlib import Path
-from tempfile import NamedTemporaryFile
 from urllib.parse import urlparse

 from pydantic import ValidationError

View file

@@ -1,13 +1,10 @@
 """
 Create a search index
 """
-from collections import Counter
 from typing import Iterable
 from urllib.parse import unquote

-import pandas as pd
-
-from mwmbl.tinysearchengine.indexer import Document, TokenizedDocument, TinyIndex
+from mwmbl.tinysearchengine.indexer import TokenizedDocument
 from mwmbl.tokenizer import tokenize, get_bigrams

 DEFAULT_SCORE = 0

View file

@ -1,13 +1,10 @@
import os
import pickle
import re
from collections import defaultdict from collections import defaultdict
from datetime import datetime, timezone, timedelta from datetime import datetime, timezone, timedelta
from logging import getLogger from logging import getLogger
from multiprocessing import Queue from multiprocessing import Queue
from pathlib import Path from pathlib import Path
from time import sleep from time import sleep
from typing import Iterable, Collection from typing import Collection
from urllib.parse import urlparse from urllib.parse import urlparse
from requests_cache import CachedSession from requests_cache import CachedSession

View file

@@ -1,96 +1,8 @@
-import argparse
-import logging
-import sys
-from multiprocessing import Process, Queue
-from pathlib import Path
-
 import uvicorn
-from fastapi import FastAPI
-from starlette.middleware.cors import CORSMiddleware
-
-from mwmbl import background
-from mwmbl.crawler import app as crawler
-from mwmbl.indexer.batch_cache import BatchCache
-from mwmbl.indexer.paths import INDEX_NAME, BATCH_DIR_NAME
-from mwmbl.platform import user
-from mwmbl.indexer.update_urls import update_urls_continuously
-from mwmbl.tinysearchengine import search
-from mwmbl.tinysearchengine.completer import Completer
-from mwmbl.tinysearchengine.indexer import TinyIndex, Document, PAGE_SIZE
-from mwmbl.tinysearchengine.rank import HeuristicRanker
-from mwmbl.url_queue import update_queue_continuously
-
-FORMAT = '%(levelname)s %(name)s %(asctime)s %(message)s'
-logging.basicConfig(stream=sys.stdout, level=logging.INFO, format=FORMAT)
-
-MODEL_PATH = Path(__file__).parent / 'resources' / 'model.pickle'
-
-
-def setup_args():
-    parser = argparse.ArgumentParser(description="Mwmbl API server and background task processor")
-    parser.add_argument("--num-pages", type=int, help="Number of pages of memory (4096 bytes) to use for the index", default=2560)
-    parser.add_argument("--data", help="Path to the data folder for storing index and cached batches", default="./devdata")
-    parser.add_argument("--port", type=int, help="Port for the server to listen at", default=5000)
-    parser.add_argument("--background", help="Enable running the background tasks to process batches",
-                        action='store_true')
-    args = parser.parse_args()
-    return args


 def run():
-    args = setup_args()
-
-    index_path = Path(args.data) / INDEX_NAME
-    try:
-        existing_index = TinyIndex(item_factory=Document, index_path=index_path)
-        if existing_index.page_size != PAGE_SIZE or existing_index.num_pages != args.num_pages:
-            raise ValueError(f"Existing index page sizes ({existing_index.page_size}) or number of pages "
-                             f"({existing_index.num_pages}) do not match")
-    except FileNotFoundError:
-        print("Creating a new index")
-        TinyIndex.create(item_factory=Document, index_path=index_path, num_pages=args.num_pages, page_size=PAGE_SIZE)
-
-    new_item_queue = Queue()
-    queued_batches = Queue()
-    # curation_queue = Queue()
-
-    if args.background:
-        Process(target=background.run, args=(args.data,)).start()
-        Process(target=update_queue_continuously, args=(new_item_queue, queued_batches,)).start()
-        Process(target=update_urls_continuously, args=(args.data, new_item_queue)).start()
-
-    completer = Completer()
-
-    with TinyIndex(item_factory=Document, index_path=index_path) as tiny_index:
-        ranker = HeuristicRanker(tiny_index, completer)
-        # model = pickle.load(open(MODEL_PATH, 'rb'))
-        # ranker = LTRRanker(model, tiny_index, completer)
-
-        # Initialize FastApi instance
-        app = FastAPI()
-
-        # Try disabling since this is handled by nginx
-        # app.add_middleware(
-        #     CORSMiddleware,
-        #     allow_origins=["*"],
-        #     allow_credentials=True,
-        #     allow_methods=["*"],
-        #     allow_headers=["*"],
-        # )
-
-        search_router = search.create_router(ranker)
-        app.include_router(search_router)
-
-        batch_cache = BatchCache(Path(args.data) / BATCH_DIR_NAME)
-        crawler_router = crawler.get_router(batch_cache, queued_batches)
-        app.include_router(crawler_router)
-
-        user_router = user.create_router(index_path)
-        app.include_router(user_router)
-
-        # Initialize uvicorn server using global app instance and server config params
-        uvicorn.run(app, host="0.0.0.0", port=args.port)
+    uvicorn.run("mwmbl.asgi:application", host="0.0.0.0", port=8000)


 if __name__ == "__main__":

View file

@@ -7,7 +7,7 @@ import requests
 from fastapi import APIRouter, Response
 from pydantic import BaseModel

-from mwmbl.tinysearchengine.indexer import TinyIndex, Document, DocumentState
+from mwmbl.tinysearchengine.indexer import TinyIndex, Document
 from mwmbl.tokenizer import tokenize

mwmbl/settings_common.py (new file, 125 lines)

@@ -0,0 +1,125 @@
"""
Django settings for mwmbl project.

Generated by 'django-admin startproject' using Django 4.2.4.

For more information on this file, see
https://docs.djangoproject.com/en/4.2/topics/settings/

For the full list of settings and their values, see
https://docs.djangoproject.com/en/4.2/ref/settings/
"""
from pathlib import Path

# Build paths inside the project like this: BASE_DIR / 'subdir'.
BASE_DIR = Path(__file__).resolve().parent.parent

# Quick-start development settings - unsuitable for production
# See https://docs.djangoproject.com/en/4.2/howto/deployment/checklist/

# SECURITY WARNING: keep the secret key used in production secret!
SECRET_KEY = 'django-insecure-qqr#f(i3uf%m8%8u35vn=ov-uk(*8!a&1t-hxa%ev2^t1%j&sm'

# SECURITY WARNING: don't run with debug turned on in production!
DEBUG = True

ALLOWED_HOSTS = []

# Application definition

INSTALLED_APPS = [
    'django.contrib.admin',
    'django.contrib.auth',
    'django.contrib.contenttypes',
    'django.contrib.sessions',
    'django.contrib.messages',
    'django.contrib.staticfiles',
    'mwmbl',
]

MIDDLEWARE = [
    'django.middleware.security.SecurityMiddleware',
    'django.contrib.sessions.middleware.SessionMiddleware',
    'django.middleware.common.CommonMiddleware',
    'django.middleware.csrf.CsrfViewMiddleware',
    'django.contrib.auth.middleware.AuthenticationMiddleware',
    'django.contrib.messages.middleware.MessageMiddleware',
    'django.middleware.clickjacking.XFrameOptionsMiddleware',
]

ROOT_URLCONF = 'mwmbl.urls'

TEMPLATES = [
    {
        'BACKEND': 'django.template.backends.django.DjangoTemplates',
        'DIRS': [],
        'APP_DIRS': True,
        'OPTIONS': {
            'context_processors': [
                'django.template.context_processors.debug',
                'django.template.context_processors.request',
                'django.contrib.auth.context_processors.auth',
                'django.contrib.messages.context_processors.messages',
            ],
        },
    },
]

WSGI_APPLICATION = 'mwmbl.wsgi.application'

# Database
# https://docs.djangoproject.com/en/4.2/ref/settings/#databases

DATABASES = {
    'default': {
        'ENGINE': 'django.db.backends.sqlite3',
        'NAME': BASE_DIR / 'db.sqlite3',
    }
}

# Password validation
# https://docs.djangoproject.com/en/4.2/ref/settings/#auth-password-validators

AUTH_PASSWORD_VALIDATORS = [
    {
        'NAME': 'django.contrib.auth.password_validation.UserAttributeSimilarityValidator',
    },
    {
        'NAME': 'django.contrib.auth.password_validation.MinimumLengthValidator',
    },
    {
        'NAME': 'django.contrib.auth.password_validation.CommonPasswordValidator',
    },
    {
        'NAME': 'django.contrib.auth.password_validation.NumericPasswordValidator',
    },
]

# Internationalization
# https://docs.djangoproject.com/en/4.2/topics/i18n/

LANGUAGE_CODE = 'en-us'

TIME_ZONE = 'UTC'

USE_I18N = True

USE_TZ = True

# Static files (CSS, JavaScript, Images)
# https://docs.djangoproject.com/en/4.2/howto/static-files/

STATIC_URL = 'static/'

# Default primary key field type
# https://docs.djangoproject.com/en/4.2/ref/settings/#default-auto-field

DEFAULT_AUTO_FIELD = 'django.db.models.BigAutoField'
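Since this module is shared by the dev and prod settings that follow, the checked-in `SECRET_KEY` flagged by the warning above would normally be swapped out in production. One conventional pattern, a sketch rather than part of this PR:

```python
import os

# Fall back to an insecure key only for local development
SECRET_KEY = os.environ.get("DJANGO_SECRET_KEY", "django-insecure-dev-only-key")
```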

mwmbl/settings_dev.py (new file, 5 lines)

@@ -0,0 +1,5 @@
from mwmbl.settings_common import *

DATA_PATH = "./devdata"
RUN_BACKGROUND_PROCESSES = False
NUM_PAGES = 2560

mwmbl/settings_prod.py (new file, 5 lines)

@@ -0,0 +1,5 @@
from mwmbl.settings_common import *

DATA_PATH = "/app/storage"
RUN_BACKGROUND_PROCESSES = True
NUM_PAGES = 10240000
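The split-settings layout means a deployment only chooses a module via `DJANGO_SETTINGS_MODULE`; the Dockerfile above defaults it to `mwmbl.settings_dev`, and production would override it to `mwmbl.settings_prod`. A sketch of how the selection plays out in a standalone script:

```python
import os

# asgi.py/wsgi.py and the Dockerfile default to the dev module;
# a production container would set DJANGO_SETTINGS_MODULE=mwmbl.settings_prod
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "mwmbl.settings_dev")

import django

django.setup()  # configures Django from whichever module was selected

from django.conf import settings

print(settings.DATA_PATH, settings.NUM_PAGES)  # ./devdata 2560 under dev
```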

View file

@@ -6,7 +6,6 @@ from operator import itemgetter
 from urllib.parse import urlparse

 from mwmbl.format import format_result_with_pattern, get_query_regex
-from mwmbl.platform.user import MAX_CURATED_SCORE
 from mwmbl.tokenizer import tokenize, get_bigrams
 from mwmbl.tinysearchengine.completer import Completer
 from mwmbl.hn_top_domains_filtered import DOMAINS

View file

@@ -1,6 +1,6 @@
 from logging import getLogger

-from fastapi import APIRouter
+from ninja import Router

 from mwmbl.tinysearchengine.rank import HeuristicRanker

@@ -10,15 +10,15 @@ logger = getLogger(__name__)
 SCORE_THRESHOLD = 0.25


-def create_router(ranker: HeuristicRanker) -> APIRouter:
-    router = APIRouter(prefix="/search", tags=["search"])
+def create_router(ranker: HeuristicRanker) -> Router:
+    router = Router(tags=["search"])

     @router.get("")
-    def search(s: str):
+    def search(request, s: str):
         return ranker.search(s)

     @router.get("/complete")
-    def complete(q: str):
+    def complete(request, q: str):
         return ranker.complete(q)

     return router

View file

@@ -1,6 +1,5 @@
 import time
 from collections import defaultdict
-from dataclasses import dataclass
 from datetime import datetime, timedelta
 from logging import getLogger
 from multiprocessing import Queue

mwmbl/urls.py (new file, 25 lines)

@@ -0,0 +1,25 @@
"""
URL configuration for app project.

The `urlpatterns` list routes URLs to views. For more information please see:
    https://docs.djangoproject.com/en/4.2/topics/http/urls/
Examples:
Function views
    1. Add an import:  from my_app import views
    2. Add a URL to urlpatterns:  path('', views.home, name='home')
Class-based views
    1. Add an import:  from other_app.views import Home
    2. Add a URL to urlpatterns:  path('', Home.as_view(), name='home')
Including another URLconf
    1. Import the include() function: from django.urls import include, path
    2. Add a URL to urlpatterns:  path('blog/', include('blog.urls'))
"""
from django.contrib import admin
from django.urls import path

from mwmbl.api import api

urlpatterns = [
    path('admin/', admin.site.urls),
    path('', api.urls)
]

mwmbl/wsgi.py (new file, 16 lines)

@@ -0,0 +1,16 @@
"""
WSGI config for app project.

It exposes the WSGI callable as a module-level variable named ``application``.

For more information on this file, see
https://docs.djangoproject.com/en/4.2/howto/deployment/wsgi/
"""
import os

from django.core.wsgi import get_wsgi_application

os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'mwmbl.settings_dev')

application = get_wsgi_application()

View file

@@ -33,6 +33,8 @@ langdetect = {version= "==1.0.9", optional = true}
 pyarrow = {version= "==6.0.0", optional = true}
 pyspark = {version= "==3.2.0", optional = true}
 Levenshtein = {version= "==0.16.0", optional = true}
+django = "^4.2.4"
+django-ninja = "^0.22.2"
 requests-cache = "^1.1.0"
 redis = {extras = ["hiredis"], version = "^5.0.1"}

View file

@@ -1,5 +1,3 @@
-import mwmbl.tinysearchengine.completer
-import pytest
 import pandas as pd


 def mockCompleterData(mocker, data):

@@ -16,7 +14,7 @@ def test_correctCompletions(mocker):
                 [3, 'buildings', 1]]
     mockCompleterData(mocker, testdata)

-    completer = mwmbl.tinysearchengine.completer.Completer()
+    completer = app.tinysearchengine.completer.Completer()
     completion = completer.complete('build')
     assert ['build', 'builder', 'buildings'] == completion

@@ -29,7 +27,7 @@ def test_correctSortOrder(mocker):
                 [3, 'buildings', 3]]
     mockCompleterData(mocker, testdata)

-    completer = mwmbl.tinysearchengine.completer.Completer()
+    completer = app.tinysearchengine.completer.Completer()
     completion = completer.complete('build')
     assert ['build', 'buildings', 'builder'] == completion

@@ -42,7 +40,7 @@ def test_noCompletions(mocker):
                 [3, 'buildings', 1]]
     mockCompleterData(mocker, testdata)

-    completer = mwmbl.tinysearchengine.completer.Completer()
+    completer = app.tinysearchengine.completer.Completer()
     completion = completer.complete('test')
     assert [] == completion

@@ -55,7 +53,7 @@ def test_singleCompletions(mocker):
                 [3, 'buildings', 1]]
     mockCompleterData(mocker, testdata)

-    completer = mwmbl.tinysearchengine.completer.Completer()
+    completer = app.tinysearchengine.completer.Completer()
     completion = completer.complete('announce')
     assert ['announce'] == completion

@@ -68,7 +66,7 @@ def test_idempotencyWithSameScoreCompletions(mocker):
                 [3, 'buildings', 1]]
     mockCompleterData(mocker, testdata)

-    completer = mwmbl.tinysearchengine.completer.Completer()
+    completer = app.tinysearchengine.completer.Completer()
     for i in range(3):
         print(f"iteration: {i}")
         completion = completer.complete('build')

View file

@@ -1,9 +1,9 @@
 from pathlib import Path
 from tempfile import TemporaryDirectory

-from mwmbl.tinysearchengine.indexer import Document, TinyIndex, _binary_search_fitting_size, astuple, _trim_items_to_page, _get_page_data, _pad_to_page_size
-from zstandard import ZstdDecompressor, ZstdCompressor, ZstdError
-import json
+from mwmbl.tinysearchengine import Document, TinyIndex, _binary_search_fitting_size, astuple, _trim_items_to_page, _get_page_data, _pad_to_page_size
+from zstandard import ZstdCompressor


 def test_create_index():
     num_pages = 10
View file

@@ -1,4 +1,4 @@
-from mwmbl.indexer.update_urls import process_link
+from mwmbl.indexer import process_link


 def test_process_link_normal():