Rename django app to mwmbl

Daoud Clarke 2023-10-10 13:51:06 +01:00
parent fab5e5c782
commit 918eaa8709
32 changed files with 55 additions and 60 deletions


@@ -46,5 +46,8 @@ VOLUME ["/data"]
 EXPOSE 5000
-# Using the mwmbl-tinysearchengine binary/entrypoint which comes packaged with mwmbl
-CMD ["/venv/bin/mwmbl-tinysearchengine", "--num-pages", "10240000", "--background", "--data", "/app/storage"]
+ENV DJANGO_SETTINGS_MODULE=mwmbl.settings_dev
+# WORKDIR "/venv/lib/python3.10/site-packages/mwmbl/"
+# CMD ["/venv/bin/python", "-m", "uvicorn", "app.asgi:application"]
+CMD ["/venv/bin/mwmbl-tinysearchengine"]
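The new CMD drops the old command-line flags and relies on the packaged mwmbl-tinysearchengine entry point together with the DJANGO_SETTINGS_MODULE set above. A sketch of what that entry point is assumed to resolve to (the console-script wiring itself is not part of this diff; see the new mwmbl/main.py added later in this commit):

    # Assumed behaviour of /venv/bin/mwmbl-tinysearchengine inside the container
    # (assumption: the console script points at mwmbl.main:run)
    from mwmbl.main import run

    run()  # starts uvicorn serving mwmbl.asgi:application on 0.0.0.0:8000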


@@ -7,8 +7,8 @@ import json
 from collections import defaultdict, Counter
 from urllib.parse import urlparse
-from mwmbl.crawler.batch import HashedBatch
-from mwmbl.indexer.paths import CRAWL_GLOB, MWMBL_DATA_DIR
+from mwmbl.crawler import HashedBatch
+from mwmbl.indexer import CRAWL_GLOB, MWMBL_DATA_DIR
 # TODO: remove this line - temporary override
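Several hunks in this commit appear to flatten imports from a submodule to its parent package (for example mwmbl.crawler.batch becomes mwmbl.crawler, and mwmbl.indexer.paths becomes mwmbl.indexer). For such imports to resolve, the package __init__ modules would have to re-export the names; a hypothetical sketch of what mwmbl/indexer/__init__.py would then contain (the file itself is not shown in this commit):

    # Hypothetical re-exports in mwmbl/indexer/__init__.py (assumption, not part of this diff)
    from mwmbl.indexer.paths import CRAWL_GLOB, INDEX_PATH, MWMBL_DATA_DIR, TOP_DOMAINS_JSON_PATH, URLS_PATH
    from mwmbl.indexer.index import tokenize_document
    from mwmbl.indexer.index_batches import index_batches
    from mwmbl.indexer.update_urls import process_link, record_urls_in_database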


@@ -1,6 +1,6 @@
 import json
-from mwmbl.indexer.paths import TOP_DOMAINS_JSON_PATH
+from mwmbl.indexer import TOP_DOMAINS_JSON_PATH
 from mwmbl.hn_top_domains_filtered import DOMAINS


@@ -3,7 +3,7 @@ Export the list of unique URLs to a SQLite file for analysis/evaluation.
 """
 import sqlite3
-from mwmbl.indexer.paths import URLS_PATH
+from mwmbl.indexer import URLS_PATH
 from mwmbl.app import get_config_and_index


@@ -7,16 +7,15 @@ import json
 import logging
 import os
 import sys
-from pathlib import Path
 from datetime import datetime
 import spacy
-from mwmbl.crawler.batch import HashedBatch
+from mwmbl.crawler import HashedBatch
 from mwmbl.crawler.urls import URLDatabase
 from mwmbl.database import Database
-from mwmbl.indexer.index_batches import index_batches
-from mwmbl.tinysearchengine.indexer import TinyIndex, Document
+from mwmbl.indexer import index_batches
+from mwmbl.tinysearchengine import TinyIndex, Document
 LOCAL_BATCHES_PATH = f'{os.environ["HOME"]}/data/mwmbl/file/**/*.json.gz'
 NUM_BATCHES = 10000


@@ -1,7 +1,7 @@
 """
 Count unique URLs in the index.
 """
-from mwmbl.tinysearchengine.indexer import TinyIndex, Document
+from mwmbl.tinysearchengine import TinyIndex, Document
 def run():


@@ -5,9 +5,9 @@ import numpy as np
 import spacy
 from analyse.index_local import EVALUATE_INDEX_PATH
-from mwmbl.indexer.index import tokenize_document
-from mwmbl.indexer.paths import INDEX_PATH
-from mwmbl.tinysearchengine.indexer import TinyIndex, Document
+from mwmbl.indexer import tokenize_document
+from mwmbl.indexer import INDEX_PATH
+from mwmbl.tinysearchengine import TinyIndex, Document
 logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)


@@ -4,12 +4,10 @@ See how many unique URLs and root domains we have crawled.
 import glob
 import gzip
 import json
-from collections import defaultdict, Counter
-from urllib.parse import urlparse
 import requests
-from mwmbl.indexer.paths import CRAWL_GLOB
+from mwmbl.indexer import CRAWL_GLOB
 API_ENDPOINT = "http://95.216.215.29/batches/historical"


@@ -2,9 +2,9 @@ import logging
 import sys
 from itertools import islice
-from mwmbl.indexer.paths import INDEX_PATH
+from mwmbl.indexer import INDEX_PATH
 from mwmbl.tinysearchengine.completer import Completer
-from mwmbl.tinysearchengine.indexer import TinyIndex, Document
+from mwmbl.tinysearchengine import TinyIndex, Document
 from mwmbl.tinysearchengine.rank import HeuristicRanker
 logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)


@@ -3,7 +3,7 @@ Send a batch to a running instance.
 """
 import requests
-from mwmbl.crawler.batch import Batch, Item, ItemContent
+from mwmbl.crawler import Batch, Item, ItemContent
 URL = 'http://localhost:5000/crawler/batches/'


@@ -4,7 +4,7 @@ from datetime import datetime
 from pathlib import Path
 from queue import Queue
-from mwmbl.indexer.update_urls import record_urls_in_database
+from mwmbl.indexer import record_urls_in_database
 def run_update_urls_on_fixed_batches():


@@ -4,7 +4,7 @@ from pathlib import Path
 from django.apps import AppConfig
 from django.conf import settings
-from app.api import queued_batches
+from mwmbl.api import queued_batches
 from mwmbl import background
 from mwmbl.indexer.paths import INDEX_NAME
 from mwmbl.indexer.update_urls import update_urls_continuously
@@ -13,7 +13,7 @@ from mwmbl.url_queue import update_queue_continuously
 class MwmblConfig(AppConfig):
-    name = "app"
+    name = "mwmbl"
     verbose_name = "Mwmbl Application"
     def ready(self):
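For context, a minimal sketch of the renamed AppConfig after this hunk (assumption: ready() starts the queue and URL-update workers imported above only when settings.RUN_BACKGROUND_PROCESSES is enabled; the body of ready() is not part of this diff):

    from django.apps import AppConfig
    from django.conf import settings


    class MwmblConfig(AppConfig):
        # "name" must match the Python package, which is now "mwmbl" rather than "app";
        # Django resolves the 'mwmbl' entry in INSTALLED_APPS against this config.
        name = "mwmbl"
        verbose_name = "Mwmbl Application"

        def ready(self):
            if not settings.RUN_BACKGROUND_PROCESSES:
                return
            # start update_queue_continuously / update_urls_continuously here (not shown in this hunk)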


@@ -11,6 +11,6 @@ import os
 from django.core.asgi import get_asgi_application
-os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'app.settings')
+os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'mwmbl.settings_dev')
 application = get_asgi_application()


@@ -10,7 +10,7 @@ from uuid import uuid4
 import boto3
 import justext
 import requests
-from fastapi import HTTPException, APIRouter
+from fastapi import HTTPException
 from justext.core import html_to_dom, ParagraphMaker, classify_paragraphs, revise_paragraph_classification, \
     LENGTH_LOW_DEFAULT, STOPWORDS_LOW_DEFAULT, MAX_LINK_DENSITY_DEFAULT, NO_HEADINGS_DEFAULT, LENGTH_HIGH_DEFAULT, \
     STOPWORDS_HIGH_DEFAULT, MAX_HEADING_DISTANCE_DEFAULT, DEFAULT_ENCODING, DEFAULT_ENC_ERRORS, preprocessor


@@ -1,16 +1,13 @@
 """
 Database storing info on URLs
 """
-import random
 from dataclasses import dataclass
-from datetime import datetime, timedelta
+from datetime import datetime
 from enum import Enum
 from logging import getLogger
 from psycopg2.extras import execute_values
-from mwmbl.hn_top_domains_filtered import DOMAINS
-from mwmbl.settings import CORE_DOMAINS
 # Client has one hour to crawl a URL that has been assigned to them, or it will be reassigned
 from mwmbl.utils import batch


@@ -9,7 +9,6 @@ import os
 from logging import getLogger
 from multiprocessing.pool import ThreadPool
 from pathlib import Path
-from tempfile import NamedTemporaryFile
 from urllib.parse import urlparse
 from pydantic import ValidationError


@@ -1,13 +1,10 @@
 """
 Create a search index
 """
-from collections import Counter
 from typing import Iterable
 from urllib.parse import unquote
-import pandas as pd
-from mwmbl.tinysearchengine.indexer import Document, TokenizedDocument, TinyIndex
+from mwmbl.tinysearchengine.indexer import TokenizedDocument
 from mwmbl.tokenizer import tokenize, get_bigrams
 DEFAULT_SCORE = 0


@@ -1,13 +1,10 @@
-import os
-import pickle
-import re
 from collections import defaultdict
 from datetime import datetime, timezone, timedelta
 from logging import getLogger
 from multiprocessing import Queue
 from pathlib import Path
 from time import sleep
-from typing import Iterable, Collection
+from typing import Collection
 from urllib.parse import urlparse
 from requests_cache import CachedSession

mwmbl/main.py (new file)

@@ -0,0 +1,9 @@
+import uvicorn
+
+
+def run():
+    uvicorn.run("mwmbl.asgi:application", host="0.0.0.0", port=8000)
+
+
+if __name__ == "__main__":
+    run()
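The new module gives the renamed project a single programmatic entry point. For reference, an equivalent startup using uvicorn's Config/Server API (a sketch only; the file above simply calls uvicorn.run), which can be handy when embedding the server in another process:

    import uvicorn


    def run_with_server() -> None:
        # Same target as mwmbl/main.py: the ASGI application exposed by mwmbl/asgi.py
        config = uvicorn.Config("mwmbl.asgi:application", host="0.0.0.0", port=8000)
        uvicorn.Server(config).run()


    if __name__ == "__main__":
        run_with_server()

Because mwmbl/main.py guards on __name__, it can also be launched directly with python -m mwmbl.main.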


@@ -7,7 +7,7 @@ import requests
 from fastapi import APIRouter, Response
 from pydantic import BaseModel
-from mwmbl.tinysearchengine.indexer import TinyIndex, Document, DocumentState
+from mwmbl.tinysearchengine.indexer import TinyIndex, Document
 from mwmbl.tokenizer import tokenize


@@ -1,5 +1,5 @@
 """
-Django settings for app project.
+Django settings for mwmbl project.
 Generated by 'django-admin startproject' using Django 4.2.4.
@@ -37,7 +37,7 @@ INSTALLED_APPS = [
     'django.contrib.sessions',
     'django.contrib.messages',
     'django.contrib.staticfiles',
-    'app',
+    'mwmbl',
 ]
 MIDDLEWARE = [
@@ -50,7 +50,7 @@ MIDDLEWARE = [
     'django.middleware.clickjacking.XFrameOptionsMiddleware',
 ]
-ROOT_URLCONF = 'app.urls'
+ROOT_URLCONF = 'mwmbl.urls'
 TEMPLATES = [
     {
@@ -68,7 +68,7 @@ TEMPLATES = [
     },
 ]
-WSGI_APPLICATION = 'app.wsgi.application'
+WSGI_APPLICATION = 'mwmbl.wsgi.application'
 # Database


@@ -1,4 +1,4 @@
-from app.settings_common import *
+from mwmbl.settings_common import *
 DATA_PATH = "./devdata"
 RUN_BACKGROUND_PROCESSES = False


@@ -1,4 +1,4 @@
-from app.settings_common import *
+from mwmbl.settings_common import *
 DATA_PATH = "/app/storage"
 RUN_BACKGROUND_PROCESSES = True
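Both settings modules now layer on top of mwmbl.settings_common, and the active one is selected via DJANGO_SETTINGS_MODULE (the Dockerfile above sets mwmbl.settings_dev, as do asgi.py and wsgi.py by default). A small sketch for verifying the active configuration, assuming Django and the mwmbl package are importable:

    import os

    import django
    from django.conf import settings

    os.environ.setdefault("DJANGO_SETTINGS_MODULE", "mwmbl.settings_dev")
    django.setup()

    print(settings.DATA_PATH)                 # "./devdata" in settings_dev, "/app/storage" in settings_prod
    print(settings.RUN_BACKGROUND_PROCESSES)  # False in dev, True in prod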


@@ -6,7 +6,6 @@ from operator import itemgetter
 from urllib.parse import urlparse
 from mwmbl.format import format_result_with_pattern, get_query_regex
-from mwmbl.platform.user import MAX_CURATED_SCORE
 from mwmbl.tokenizer import tokenize, get_bigrams
 from mwmbl.tinysearchengine.completer import Completer
 from mwmbl.hn_top_domains_filtered import DOMAINS


@@ -1,6 +1,5 @@
 import time
 from collections import defaultdict
-from dataclasses import dataclass
 from datetime import datetime, timedelta
 from logging import getLogger
 from multiprocessing import Queue


@@ -17,7 +17,7 @@ Including another URLconf
 from django.contrib import admin
 from django.urls import path
-from app.api import api
+from mwmbl.api import api
 urlpatterns = [
     path('admin/', admin.site.urls),


@@ -11,6 +11,6 @@ import os
 from django.core.wsgi import get_wsgi_application
-os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'app.settings')
+os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'mwmbl.settings_dev')
 application = get_wsgi_application()


@@ -1,5 +1,3 @@
-import mwmbl.tinysearchengine.completer
-import pytest
 import pandas as pd
 def mockCompleterData(mocker, data):
@@ -16,7 +14,7 @@ def test_correctCompletions(mocker):
                 [3, 'buildings', 1]]
     mockCompleterData(mocker, testdata)
-    completer = mwmbl.tinysearchengine.completer.Completer()
+    completer = app.tinysearchengine.completer.Completer()
     completion = completer.complete('build')
     assert ['build', 'builder', 'buildings'] == completion
@@ -29,7 +27,7 @@ def test_correctSortOrder(mocker):
                 [3, 'buildings', 3]]
     mockCompleterData(mocker, testdata)
-    completer = mwmbl.tinysearchengine.completer.Completer()
+    completer = app.tinysearchengine.completer.Completer()
     completion = completer.complete('build')
     assert ['build', 'buildings', 'builder'] == completion
@@ -42,7 +40,7 @@ def test_noCompletions(mocker):
                 [3, 'buildings', 1]]
     mockCompleterData(mocker, testdata)
-    completer = mwmbl.tinysearchengine.completer.Completer()
+    completer = app.tinysearchengine.completer.Completer()
     completion = completer.complete('test')
     assert [] == completion
@@ -55,7 +53,7 @@ def test_singleCompletions(mocker):
                 [3, 'buildings', 1]]
     mockCompleterData(mocker, testdata)
-    completer = mwmbl.tinysearchengine.completer.Completer()
+    completer = app.tinysearchengine.completer.Completer()
     completion = completer.complete('announce')
     assert ['announce'] == completion
@@ -68,7 +66,7 @@ def test_idempotencyWithSameScoreCompletions(mocker):
                 [3, 'buildings', 1]]
     mockCompleterData(mocker, testdata)
-    completer = mwmbl.tinysearchengine.completer.Completer()
+    completer = app.tinysearchengine.completer.Completer()
     for i in range(3):
         print(f"iteration: {i}")
         completion = completer.complete('build')


@@ -1,9 +1,9 @@
 from pathlib import Path
 from tempfile import TemporaryDirectory
-from mwmbl.tinysearchengine.indexer import Document, TinyIndex, _binary_search_fitting_size, astuple, _trim_items_to_page, _get_page_data, _pad_to_page_size
-from zstandard import ZstdDecompressor, ZstdCompressor, ZstdError
-import json
+from mwmbl.tinysearchengine import Document, TinyIndex, _binary_search_fitting_size, astuple, _trim_items_to_page, _get_page_data, _pad_to_page_size
+from zstandard import ZstdCompressor
 def test_create_index():
     num_pages = 10


@@ -1,4 +1,4 @@
-from mwmbl.indexer.update_urls import process_link
+from mwmbl.indexer import process_link
 def test_process_link_normal():