From 918eaa8709c0f38739474d15392d60a0482ec5a6 Mon Sep 17 00:00:00 2001 From: Daoud Clarke Date: Tue, 10 Oct 2023 13:51:06 +0100 Subject: [PATCH] Rename django app to mwmbl --- Dockerfile | 7 +++++-- analyse/analyse_crawled_domains.py | 4 ++-- analyse/export_top_domains.py | 2 +- analyse/export_urls.py | 2 +- analyse/index_local.py | 7 +++---- analyse/index_url_count.py | 2 +- analyse/inspect_index.py | 6 +++--- analyse/record_historical_batches.py | 4 +--- analyse/search.py | 4 ++-- analyse/send_batch.py | 2 +- analyse/update_urls.py | 2 +- app/__init__.py | 0 {app => mwmbl}/api.py | 0 {app => mwmbl}/apps.py | 4 ++-- {app => mwmbl}/asgi.py | 2 +- mwmbl/crawler/app.py | 2 +- mwmbl/crawler/urls.py | 5 +---- mwmbl/indexer/batch_cache.py | 1 - mwmbl/indexer/index.py | 5 +---- mwmbl/indexer/update_urls.py | 5 +---- mwmbl/main.py | 9 +++++++++ mwmbl/platform/user.py | 2 +- {app => mwmbl}/settings_common.py | 8 ++++---- {app => mwmbl}/settings_dev.py | 2 +- {app => mwmbl}/settings_prod.py | 2 +- mwmbl/tinysearchengine/rank.py | 1 - mwmbl/url_queue.py | 1 - {app => mwmbl}/urls.py | 2 +- {app => mwmbl}/wsgi.py | 2 +- test/test_completer.py | 12 +++++------- test/test_indexer.py | 6 +++--- test/test_update_urls.py | 2 +- 32 files changed, 55 insertions(+), 60 deletions(-) delete mode 100644 app/__init__.py rename {app => mwmbl}/api.py (100%) rename {app => mwmbl}/apps.py (96%) rename {app => mwmbl}/asgi.py (82%) create mode 100644 mwmbl/main.py rename {app => mwmbl}/settings_common.py (96%) rename {app => mwmbl}/settings_dev.py (67%) rename {app => mwmbl}/settings_prod.py (69%) rename {app => mwmbl}/urls.py (96%) rename {app => mwmbl}/wsgi.py (82%) diff --git a/Dockerfile b/Dockerfile index f57bce7..4283a14 100644 --- a/Dockerfile +++ b/Dockerfile @@ -46,5 +46,8 @@ VOLUME ["/data"] EXPOSE 5000 -# Using the mwmbl-tinysearchengine binary/entrypoint which comes packaged with mwmbl -CMD ["/venv/bin/mwmbl-tinysearchengine", "--num-pages", "10240000", "--background", "--data", "/app/storage"] +ENV DJANGO_SETTINGS_MODULE=mwmbl.settings_dev + +# WORKDIR "/venv/lib/python3.10/site-packages/mwmbl/" +# CMD ["/venv/bin/python", "-m", "uvicorn", "app.asgi:application"] +CMD ["/venv/bin/mwmbl-tinysearchengine"] diff --git a/analyse/analyse_crawled_domains.py b/analyse/analyse_crawled_domains.py index 371cbb6..5e87abb 100644 --- a/analyse/analyse_crawled_domains.py +++ b/analyse/analyse_crawled_domains.py @@ -7,8 +7,8 @@ import json from collections import defaultdict, Counter from urllib.parse import urlparse -from mwmbl.crawler.batch import HashedBatch -from mwmbl.indexer.paths import CRAWL_GLOB, MWMBL_DATA_DIR +from mwmbl.crawler import HashedBatch +from mwmbl.indexer import CRAWL_GLOB, MWMBL_DATA_DIR # TODO: remove this line - temporary override diff --git a/analyse/export_top_domains.py b/analyse/export_top_domains.py index 9f4d495..b9b4479 100644 --- a/analyse/export_top_domains.py +++ b/analyse/export_top_domains.py @@ -1,6 +1,6 @@ import json -from mwmbl.indexer.paths import TOP_DOMAINS_JSON_PATH +from mwmbl.indexer import TOP_DOMAINS_JSON_PATH from mwmbl.hn_top_domains_filtered import DOMAINS diff --git a/analyse/export_urls.py b/analyse/export_urls.py index a042260..39ba98e 100644 --- a/analyse/export_urls.py +++ b/analyse/export_urls.py @@ -3,7 +3,7 @@ Export the list of unique URLs to a SQLite file for analysis/evaluation. """ import sqlite3 -from mwmbl.indexer.paths import URLS_PATH +from mwmbl.indexer import URLS_PATH from mwmbl.app import get_config_and_index diff --git a/analyse/index_local.py b/analyse/index_local.py index 24628a5..334868d 100644 --- a/analyse/index_local.py +++ b/analyse/index_local.py @@ -7,16 +7,15 @@ import json import logging import os import sys -from pathlib import Path from datetime import datetime import spacy -from mwmbl.crawler.batch import HashedBatch +from mwmbl.crawler import HashedBatch from mwmbl.crawler.urls import URLDatabase from mwmbl.database import Database -from mwmbl.indexer.index_batches import index_batches -from mwmbl.tinysearchengine.indexer import TinyIndex, Document +from mwmbl.indexer import index_batches +from mwmbl.tinysearchengine import TinyIndex, Document LOCAL_BATCHES_PATH = f'{os.environ["HOME"]}/data/mwmbl/file/**/*.json.gz' NUM_BATCHES = 10000 diff --git a/analyse/index_url_count.py b/analyse/index_url_count.py index f0c7ac2..dcb7245 100644 --- a/analyse/index_url_count.py +++ b/analyse/index_url_count.py @@ -1,7 +1,7 @@ """ Count unique URLs in the index. """ -from mwmbl.tinysearchengine.indexer import TinyIndex, Document +from mwmbl.tinysearchengine import TinyIndex, Document def run(): diff --git a/analyse/inspect_index.py b/analyse/inspect_index.py index 20b0619..c48ad22 100644 --- a/analyse/inspect_index.py +++ b/analyse/inspect_index.py @@ -5,9 +5,9 @@ import numpy as np import spacy from analyse.index_local import EVALUATE_INDEX_PATH -from mwmbl.indexer.index import tokenize_document -from mwmbl.indexer.paths import INDEX_PATH -from mwmbl.tinysearchengine.indexer import TinyIndex, Document +from mwmbl.indexer import tokenize_document +from mwmbl.indexer import INDEX_PATH +from mwmbl.tinysearchengine import TinyIndex, Document logging.basicConfig(stream=sys.stdout, level=logging.DEBUG) diff --git a/analyse/record_historical_batches.py b/analyse/record_historical_batches.py index 4d8ccd3..c482e49 100644 --- a/analyse/record_historical_batches.py +++ b/analyse/record_historical_batches.py @@ -4,12 +4,10 @@ See how many unique URLs and root domains we have crawled. import glob import gzip import json -from collections import defaultdict, Counter -from urllib.parse import urlparse import requests -from mwmbl.indexer.paths import CRAWL_GLOB +from mwmbl.indexer import CRAWL_GLOB API_ENDPOINT = "http://95.216.215.29/batches/historical" diff --git a/analyse/search.py b/analyse/search.py index 4ffbd54..4bc3b72 100644 --- a/analyse/search.py +++ b/analyse/search.py @@ -2,9 +2,9 @@ import logging import sys from itertools import islice -from mwmbl.indexer.paths import INDEX_PATH +from mwmbl.indexer import INDEX_PATH from mwmbl.tinysearchengine.completer import Completer -from mwmbl.tinysearchengine.indexer import TinyIndex, Document +from mwmbl.tinysearchengine import TinyIndex, Document from mwmbl.tinysearchengine.rank import HeuristicRanker logging.basicConfig(stream=sys.stdout, level=logging.DEBUG) diff --git a/analyse/send_batch.py b/analyse/send_batch.py index 9191834..6d52d41 100644 --- a/analyse/send_batch.py +++ b/analyse/send_batch.py @@ -3,7 +3,7 @@ Send a batch to a running instance. """ import requests -from mwmbl.crawler.batch import Batch, Item, ItemContent +from mwmbl.crawler import Batch, Item, ItemContent URL = 'http://localhost:5000/crawler/batches/' diff --git a/analyse/update_urls.py b/analyse/update_urls.py index 0655df7..f26c804 100644 --- a/analyse/update_urls.py +++ b/analyse/update_urls.py @@ -4,7 +4,7 @@ from datetime import datetime from pathlib import Path from queue import Queue -from mwmbl.indexer.update_urls import record_urls_in_database +from mwmbl.indexer import record_urls_in_database def run_update_urls_on_fixed_batches(): diff --git a/app/__init__.py b/app/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/app/api.py b/mwmbl/api.py similarity index 100% rename from app/api.py rename to mwmbl/api.py diff --git a/app/apps.py b/mwmbl/apps.py similarity index 96% rename from app/apps.py rename to mwmbl/apps.py index 166aaff..bfc21a5 100644 --- a/app/apps.py +++ b/mwmbl/apps.py @@ -4,7 +4,7 @@ from pathlib import Path from django.apps import AppConfig from django.conf import settings -from app.api import queued_batches +from mwmbl.api import queued_batches from mwmbl import background from mwmbl.indexer.paths import INDEX_NAME from mwmbl.indexer.update_urls import update_urls_continuously @@ -13,7 +13,7 @@ from mwmbl.url_queue import update_queue_continuously class MwmblConfig(AppConfig): - name = "app" + name = "mwmbl" verbose_name = "Mwmbl Application" def ready(self): diff --git a/app/asgi.py b/mwmbl/asgi.py similarity index 82% rename from app/asgi.py rename to mwmbl/asgi.py index c8d5aaa..73088a9 100644 --- a/app/asgi.py +++ b/mwmbl/asgi.py @@ -11,6 +11,6 @@ import os from django.core.asgi import get_asgi_application -os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'app.settings') +os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'mwmbl.settings_dev') application = get_asgi_application() diff --git a/mwmbl/crawler/app.py b/mwmbl/crawler/app.py index bda2dc8..a4f0524 100644 --- a/mwmbl/crawler/app.py +++ b/mwmbl/crawler/app.py @@ -10,7 +10,7 @@ from uuid import uuid4 import boto3 import justext import requests -from fastapi import HTTPException, APIRouter +from fastapi import HTTPException from justext.core import html_to_dom, ParagraphMaker, classify_paragraphs, revise_paragraph_classification, \ LENGTH_LOW_DEFAULT, STOPWORDS_LOW_DEFAULT, MAX_LINK_DENSITY_DEFAULT, NO_HEADINGS_DEFAULT, LENGTH_HIGH_DEFAULT, \ STOPWORDS_HIGH_DEFAULT, MAX_HEADING_DISTANCE_DEFAULT, DEFAULT_ENCODING, DEFAULT_ENC_ERRORS, preprocessor diff --git a/mwmbl/crawler/urls.py b/mwmbl/crawler/urls.py index cefe19e..7c83edf 100644 --- a/mwmbl/crawler/urls.py +++ b/mwmbl/crawler/urls.py @@ -1,16 +1,13 @@ """ Database storing info on URLs """ -import random from dataclasses import dataclass -from datetime import datetime, timedelta +from datetime import datetime from enum import Enum from logging import getLogger from psycopg2.extras import execute_values -from mwmbl.hn_top_domains_filtered import DOMAINS -from mwmbl.settings import CORE_DOMAINS # Client has one hour to crawl a URL that has been assigned to them, or it will be reassigned from mwmbl.utils import batch diff --git a/mwmbl/indexer/batch_cache.py b/mwmbl/indexer/batch_cache.py index e7af6db..01d8cc9 100644 --- a/mwmbl/indexer/batch_cache.py +++ b/mwmbl/indexer/batch_cache.py @@ -9,7 +9,6 @@ import os from logging import getLogger from multiprocessing.pool import ThreadPool from pathlib import Path -from tempfile import NamedTemporaryFile from urllib.parse import urlparse from pydantic import ValidationError diff --git a/mwmbl/indexer/index.py b/mwmbl/indexer/index.py index 4edcb8a..fb61405 100644 --- a/mwmbl/indexer/index.py +++ b/mwmbl/indexer/index.py @@ -1,13 +1,10 @@ """ Create a search index """ -from collections import Counter from typing import Iterable from urllib.parse import unquote -import pandas as pd - -from mwmbl.tinysearchengine.indexer import Document, TokenizedDocument, TinyIndex +from mwmbl.tinysearchengine.indexer import TokenizedDocument from mwmbl.tokenizer import tokenize, get_bigrams DEFAULT_SCORE = 0 diff --git a/mwmbl/indexer/update_urls.py b/mwmbl/indexer/update_urls.py index 3819777..8a1b973 100644 --- a/mwmbl/indexer/update_urls.py +++ b/mwmbl/indexer/update_urls.py @@ -1,13 +1,10 @@ -import os -import pickle -import re from collections import defaultdict from datetime import datetime, timezone, timedelta from logging import getLogger from multiprocessing import Queue from pathlib import Path from time import sleep -from typing import Iterable, Collection +from typing import Collection from urllib.parse import urlparse from requests_cache import CachedSession diff --git a/mwmbl/main.py b/mwmbl/main.py new file mode 100644 index 0000000..0281edc --- /dev/null +++ b/mwmbl/main.py @@ -0,0 +1,9 @@ +import uvicorn + + +def run(): + uvicorn.run("mwmbl.asgi:application", host="0.0.0.0", port=8000) + + +if __name__ == "__main__": + run() diff --git a/mwmbl/platform/user.py b/mwmbl/platform/user.py index a3006c4..bbdcb0e 100644 --- a/mwmbl/platform/user.py +++ b/mwmbl/platform/user.py @@ -7,7 +7,7 @@ import requests from fastapi import APIRouter, Response from pydantic import BaseModel -from mwmbl.tinysearchengine.indexer import TinyIndex, Document, DocumentState +from mwmbl.tinysearchengine.indexer import TinyIndex, Document from mwmbl.tokenizer import tokenize diff --git a/app/settings_common.py b/mwmbl/settings_common.py similarity index 96% rename from app/settings_common.py rename to mwmbl/settings_common.py index 2753dc6..b08b62c 100644 --- a/app/settings_common.py +++ b/mwmbl/settings_common.py @@ -1,5 +1,5 @@ """ -Django settings for app project. +Django settings for mwmbl project. Generated by 'django-admin startproject' using Django 4.2.4. @@ -37,7 +37,7 @@ INSTALLED_APPS = [ 'django.contrib.sessions', 'django.contrib.messages', 'django.contrib.staticfiles', - 'app', + 'mwmbl', ] MIDDLEWARE = [ @@ -50,7 +50,7 @@ MIDDLEWARE = [ 'django.middleware.clickjacking.XFrameOptionsMiddleware', ] -ROOT_URLCONF = 'app.urls' +ROOT_URLCONF = 'mwmbl.urls' TEMPLATES = [ { @@ -68,7 +68,7 @@ TEMPLATES = [ }, ] -WSGI_APPLICATION = 'app.wsgi.application' +WSGI_APPLICATION = 'mwmbl.wsgi.application' # Database diff --git a/app/settings_dev.py b/mwmbl/settings_dev.py similarity index 67% rename from app/settings_dev.py rename to mwmbl/settings_dev.py index bb8e33f..fe07890 100644 --- a/app/settings_dev.py +++ b/mwmbl/settings_dev.py @@ -1,4 +1,4 @@ -from app.settings_common import * +from mwmbl.settings_common import * DATA_PATH = "./devdata" RUN_BACKGROUND_PROCESSES = False diff --git a/app/settings_prod.py b/mwmbl/settings_prod.py similarity index 69% rename from app/settings_prod.py rename to mwmbl/settings_prod.py index 37c9cf3..f7c50ee 100644 --- a/app/settings_prod.py +++ b/mwmbl/settings_prod.py @@ -1,4 +1,4 @@ -from app.settings_common import * +from mwmbl.settings_common import * DATA_PATH = "/app/storage" RUN_BACKGROUND_PROCESSES = True diff --git a/mwmbl/tinysearchengine/rank.py b/mwmbl/tinysearchengine/rank.py index 81109fd..7f331b8 100644 --- a/mwmbl/tinysearchengine/rank.py +++ b/mwmbl/tinysearchengine/rank.py @@ -6,7 +6,6 @@ from operator import itemgetter from urllib.parse import urlparse from mwmbl.format import format_result_with_pattern, get_query_regex -from mwmbl.platform.user import MAX_CURATED_SCORE from mwmbl.tokenizer import tokenize, get_bigrams from mwmbl.tinysearchengine.completer import Completer from mwmbl.hn_top_domains_filtered import DOMAINS diff --git a/mwmbl/url_queue.py b/mwmbl/url_queue.py index ab0f1bc..8151550 100644 --- a/mwmbl/url_queue.py +++ b/mwmbl/url_queue.py @@ -1,6 +1,5 @@ import time from collections import defaultdict -from dataclasses import dataclass from datetime import datetime, timedelta from logging import getLogger from multiprocessing import Queue diff --git a/app/urls.py b/mwmbl/urls.py similarity index 96% rename from app/urls.py rename to mwmbl/urls.py index 440a2f4..ff67f2d 100644 --- a/app/urls.py +++ b/mwmbl/urls.py @@ -17,7 +17,7 @@ Including another URLconf from django.contrib import admin from django.urls import path -from app.api import api +from mwmbl.api import api urlpatterns = [ path('admin/', admin.site.urls), diff --git a/app/wsgi.py b/mwmbl/wsgi.py similarity index 82% rename from app/wsgi.py rename to mwmbl/wsgi.py index ef30895..ebdf0ff 100644 --- a/app/wsgi.py +++ b/mwmbl/wsgi.py @@ -11,6 +11,6 @@ import os from django.core.wsgi import get_wsgi_application -os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'app.settings') +os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'mwmbl.settings_dev') application = get_wsgi_application() diff --git a/test/test_completer.py b/test/test_completer.py index b1fb49e..8867f26 100644 --- a/test/test_completer.py +++ b/test/test_completer.py @@ -1,5 +1,3 @@ -import mwmbl.tinysearchengine.completer -import pytest import pandas as pd def mockCompleterData(mocker, data): @@ -16,7 +14,7 @@ def test_correctCompletions(mocker): [3, 'buildings', 1]] mockCompleterData(mocker, testdata) - completer = mwmbl.tinysearchengine.completer.Completer() + completer = app.tinysearchengine.completer.Completer() completion = completer.complete('build') assert ['build', 'builder', 'buildings'] == completion @@ -29,7 +27,7 @@ def test_correctSortOrder(mocker): [3, 'buildings', 3]] mockCompleterData(mocker, testdata) - completer = mwmbl.tinysearchengine.completer.Completer() + completer = app.tinysearchengine.completer.Completer() completion = completer.complete('build') assert ['build', 'buildings', 'builder'] == completion @@ -42,7 +40,7 @@ def test_noCompletions(mocker): [3, 'buildings', 1]] mockCompleterData(mocker, testdata) - completer = mwmbl.tinysearchengine.completer.Completer() + completer = app.tinysearchengine.completer.Completer() completion = completer.complete('test') assert [] == completion @@ -55,7 +53,7 @@ def test_singleCompletions(mocker): [3, 'buildings', 1]] mockCompleterData(mocker, testdata) - completer = mwmbl.tinysearchengine.completer.Completer() + completer = app.tinysearchengine.completer.Completer() completion = completer.complete('announce') assert ['announce'] == completion @@ -68,7 +66,7 @@ def test_idempotencyWithSameScoreCompletions(mocker): [3, 'buildings', 1]] mockCompleterData(mocker, testdata) - completer = mwmbl.tinysearchengine.completer.Completer() + completer = app.tinysearchengine.completer.Completer() for i in range(3): print(f"iteration: {i}") completion = completer.complete('build') diff --git a/test/test_indexer.py b/test/test_indexer.py index dd25b18..cf714c0 100644 --- a/test/test_indexer.py +++ b/test/test_indexer.py @@ -1,9 +1,9 @@ from pathlib import Path from tempfile import TemporaryDirectory -from mwmbl.tinysearchengine.indexer import Document, TinyIndex, _binary_search_fitting_size, astuple, _trim_items_to_page, _get_page_data, _pad_to_page_size -from zstandard import ZstdDecompressor, ZstdCompressor, ZstdError -import json +from mwmbl.tinysearchengine import Document, TinyIndex, _binary_search_fitting_size, astuple, _trim_items_to_page, _get_page_data, _pad_to_page_size +from zstandard import ZstdCompressor + def test_create_index(): num_pages = 10 diff --git a/test/test_update_urls.py b/test/test_update_urls.py index 8f205f8..089caea 100644 --- a/test/test_update_urls.py +++ b/test/test_update_urls.py @@ -1,4 +1,4 @@ -from mwmbl.indexer.update_urls import process_link +from mwmbl.indexer import process_link def test_process_link_normal():