From 11eedcde8428f39e5da3becf4f8a5fa3e62a4d71 Mon Sep 17 00:00:00 2001 From: nitred Date: Tue, 28 Dec 2021 12:02:48 +0100 Subject: [PATCH] renamed package to mwmbl - renamed package to mwmbl in pyproject.toml - tinysearchengine and indexer modules have been moved into mwmbl package folder - analyse module has been left as is in the root of the repo - import statements in tinysearchengine now use mwmbl.tinysearchengine - import statements in indexer now use mwmbl.indexer or mwmbl.tinysearchengine or relative imports like .paths - import statements in analyse now use mwmbl.indexer or mwmbl.tinysearchengine - final CMD in Dockerfile now uses updated path mwmbl.tinysearchengine.app - fixed a couple of import statement errors in tinysearchengine/indexer.py --- Dockerfile | 2 +- analyse/inspect_index.py | 4 ++-- analyse/make_curl.py | 4 ++-- analyse/performance.py | 10 +++++----- {indexer => mwmbl}/__init__.py | 0 {indexer/domains => mwmbl/indexer}/__init__.py | 0 {indexer => mwmbl/indexer}/bootstrap.sh | 0 {indexer => mwmbl/indexer}/crawl.py | 2 +- {indexer => mwmbl/indexer}/deploy.sh | 0 {indexer => mwmbl/indexer}/domains.py | 0 .../indexer/domains}/__init__.py | 0 .../indexer}/domains/domain_titles.py | 4 ++-- .../indexer}/domains/queue_domains.py | 4 ++-- {indexer => mwmbl/indexer}/extract.py | 0 {indexer => mwmbl/indexer}/extract_local.py | 6 +++--- {indexer => mwmbl/indexer}/extract_process.py | 0 {indexer => mwmbl/indexer}/fsqueue.py | 0 .../indexer}/hn-top-domains-filtered.py | 0 {indexer => mwmbl/indexer}/index.py | 2 +- {indexer => mwmbl/indexer}/index_glob.py | 7 ++++--- {indexer => mwmbl/indexer}/index_queue.py | 8 ++++---- {indexer => mwmbl/indexer}/indexcc.py | 8 ++++---- {indexer => mwmbl/indexer}/paths.py | 0 {indexer => mwmbl/indexer}/wiki.py | 6 +++--- mwmbl/tinysearchengine/__init__.py | 0 mwmbl/tinysearchengine/app.py | 17 +++++++++++++++++ .../tinysearchengine}/create_app.py | 2 +- .../tinysearchengine}/indexer.py | 2 +- .../tinysearchengine}/static/index.css | 0 .../tinysearchengine}/static/index.html | 0 .../tinysearchengine}/static/index.js | 0 .../tinysearchengine}/static/landing.html | 0 .../tinysearchengine}/static/plugin.xml | 0 .../tinysearchengine}/static/search.html | 0 .../tinysearchengine}/static/typeahead.css | 0 .../tinysearchengine}/static/typeahead.js | 0 pyproject.toml | 2 +- tinysearchengine/app.py | 17 ----------------- 38 files changed, 54 insertions(+), 53 deletions(-) rename {indexer => mwmbl}/__init__.py (100%) rename {indexer/domains => mwmbl/indexer}/__init__.py (100%) rename {indexer => mwmbl/indexer}/bootstrap.sh (100%) rename {indexer => mwmbl/indexer}/crawl.py (94%) rename {indexer => mwmbl/indexer}/deploy.sh (100%) rename {indexer => mwmbl/indexer}/domains.py (100%) rename {tinysearchengine => mwmbl/indexer/domains}/__init__.py (100%) rename {indexer => mwmbl/indexer}/domains/domain_titles.py (94%) rename {indexer => mwmbl/indexer}/domains/queue_domains.py (82%) rename {indexer => mwmbl/indexer}/extract.py (100%) rename {indexer => mwmbl/indexer}/extract_local.py (92%) rename {indexer => mwmbl/indexer}/extract_process.py (100%) rename {indexer => mwmbl/indexer}/fsqueue.py (100%) rename {indexer => mwmbl/indexer}/hn-top-domains-filtered.py (100%) rename {indexer => mwmbl/indexer}/index.py (96%) rename {indexer => mwmbl/indexer}/index_glob.py (83%) rename {indexer => mwmbl/indexer}/index_queue.py (74%) rename {indexer => mwmbl/indexer}/indexcc.py (82%) rename {indexer => mwmbl/indexer}/paths.py (100%) rename {indexer => mwmbl/indexer}/wiki.py (85%) create mode 100644 mwmbl/tinysearchengine/__init__.py create mode 100644 mwmbl/tinysearchengine/app.py rename {tinysearchengine => mwmbl/tinysearchengine}/create_app.py (98%) rename {tinysearchengine => mwmbl/tinysearchengine}/indexer.py (98%) rename {tinysearchengine => mwmbl/tinysearchengine}/static/index.css (100%) rename {tinysearchengine => mwmbl/tinysearchengine}/static/index.html (100%) rename {tinysearchengine => mwmbl/tinysearchengine}/static/index.js (100%) rename {tinysearchengine => mwmbl/tinysearchengine}/static/landing.html (100%) rename {tinysearchengine => mwmbl/tinysearchengine}/static/plugin.xml (100%) rename {tinysearchengine => mwmbl/tinysearchengine}/static/search.html (100%) rename {tinysearchengine => mwmbl/tinysearchengine}/static/typeahead.css (100%) rename {tinysearchengine => mwmbl/tinysearchengine}/static/typeahead.js (100%) delete mode 100644 tinysearchengine/app.py diff --git a/Dockerfile b/Dockerfile index 784381a..3f525b8 100644 --- a/Dockerfile +++ b/Dockerfile @@ -31,4 +31,4 @@ COPY data /data #COPY docker-entrypoint.sh wsgi.py ./ #CMD ["./docker-entrypoint.sh"] -CMD ["/venv/bin/python", "-m", "tinysearchengine.app", "/data/index.tinysearch"] +CMD ["/venv/bin/python", "-m", "mwmbl.tinysearchengine.app", "/data/index.tinysearch"] diff --git a/analyse/inspect_index.py b/analyse/inspect_index.py index f73064a..18a5a96 100644 --- a/analyse/inspect_index.py +++ b/analyse/inspect_index.py @@ -1,5 +1,5 @@ -from tinysearchengine.indexer import TinyIndex, NUM_PAGES, PAGE_SIZE, Document -from indexer.paths import INDEX_PATH +from mwmbl.tinysearchengine.indexer import TinyIndex, NUM_PAGES, PAGE_SIZE, Document +from mwmbl.indexer.paths import INDEX_PATH def get_items(): diff --git a/analyse/make_curl.py b/analyse/make_curl.py index c411806..465f990 100644 --- a/analyse/make_curl.py +++ b/analyse/make_curl.py @@ -5,8 +5,8 @@ import os from itertools import islice from urllib.parse import quote -from indexer.paths import DATA_DIR -from indexer.wiki import get_wiki_titles_and_urls +from mwmbl.indexer.paths import DATA_DIR +from mwmbl.indexer.wiki import get_wiki_titles_and_urls URL_TEMPLATE = "http://localhost:8000/complete?q={}" CURL_FILE = os.path.join(DATA_DIR, "urls.curl") diff --git a/analyse/performance.py b/analyse/performance.py index 53fdcae..4a675d4 100644 --- a/analyse/performance.py +++ b/analyse/performance.py @@ -8,11 +8,11 @@ import numpy as np from spacy.lang.en import English from starlette.testclient import TestClient -from tinysearchengine import create_app -from indexer.fsqueue import ZstdJsonSerializer -from indexer.index import index_titles_urls_and_extracts -from tinysearchengine.indexer import TinyIndex, TinyIndexer, Document -from indexer.paths import TEST_INDEX_PATH, DATA_DIR, TEST_TERMS_PATH +from mwmbl.tinysearchengine import create_app +from mwmbl.indexer.fsqueue import ZstdJsonSerializer +from mwmbl.indexer.index import index_titles_urls_and_extracts +from mwmbl.tinysearchengine.indexer import TinyIndex, TinyIndexer, Document +from mwmbl.indexer.paths import TEST_INDEX_PATH, DATA_DIR, TEST_TERMS_PATH NUM_DOCUMENTS = 30000 NUM_PAGES_FOR_STATS = 10 diff --git a/indexer/__init__.py b/mwmbl/__init__.py similarity index 100% rename from indexer/__init__.py rename to mwmbl/__init__.py diff --git a/indexer/domains/__init__.py b/mwmbl/indexer/__init__.py similarity index 100% rename from indexer/domains/__init__.py rename to mwmbl/indexer/__init__.py diff --git a/indexer/bootstrap.sh b/mwmbl/indexer/bootstrap.sh similarity index 100% rename from indexer/bootstrap.sh rename to mwmbl/indexer/bootstrap.sh diff --git a/indexer/crawl.py b/mwmbl/indexer/crawl.py similarity index 94% rename from indexer/crawl.py rename to mwmbl/indexer/crawl.py index 609deb6..11405d0 100644 --- a/indexer/crawl.py +++ b/mwmbl/indexer/crawl.py @@ -10,7 +10,7 @@ from traceback import print_tb, print_exc import pandas as pd import requests -from paths import DATA_DIR, HN_TOP_PATH, CRAWL_PREFIX +from .paths import DATA_DIR, HN_TOP_PATH, CRAWL_PREFIX def crawl(): diff --git a/indexer/deploy.sh b/mwmbl/indexer/deploy.sh similarity index 100% rename from indexer/deploy.sh rename to mwmbl/indexer/deploy.sh diff --git a/indexer/domains.py b/mwmbl/indexer/domains.py similarity index 100% rename from indexer/domains.py rename to mwmbl/indexer/domains.py diff --git a/tinysearchengine/__init__.py b/mwmbl/indexer/domains/__init__.py similarity index 100% rename from tinysearchengine/__init__.py rename to mwmbl/indexer/domains/__init__.py diff --git a/indexer/domains/domain_titles.py b/mwmbl/indexer/domains/domain_titles.py similarity index 94% rename from indexer/domains/domain_titles.py rename to mwmbl/indexer/domains/domain_titles.py index be6203d..907367e 100644 --- a/indexer/domains/domain_titles.py +++ b/mwmbl/indexer/domains/domain_titles.py @@ -8,8 +8,8 @@ from urllib.parse import urlsplit, urlunsplit import bs4 import requests -from indexer.fsqueue import FSQueue, ZstdJsonSerializer -from indexer.paths import DATA_DIR, DOMAINS_QUEUE_NAME, DOMAINS_TITLES_QUEUE_NAME +from mwmbl.indexer.fsqueue import FSQueue, ZstdJsonSerializer +from mwmbl.indexer.paths import DATA_DIR, DOMAINS_QUEUE_NAME, DOMAINS_TITLES_QUEUE_NAME NUM_PROCESSES = 10 diff --git a/indexer/domains/queue_domains.py b/mwmbl/indexer/domains/queue_domains.py similarity index 82% rename from indexer/domains/queue_domains.py rename to mwmbl/indexer/domains/queue_domains.py index 8136de2..3eb7ac6 100644 --- a/indexer/domains/queue_domains.py +++ b/mwmbl/indexer/domains/queue_domains.py @@ -4,8 +4,8 @@ Add domains to the queue to be retrieved import csv import gzip -from indexer.fsqueue import FSQueue, ZstdJsonSerializer -from indexer.paths import DOMAINS_PATH, DOMAINS_QUEUE_NAME, DATA_DIR +from mwmbl.indexer.fsqueue import FSQueue, ZstdJsonSerializer +from mwmbl.indexer.paths import DOMAINS_PATH, DOMAINS_QUEUE_NAME, DATA_DIR BATCH_SIZE = 250 diff --git a/indexer/extract.py b/mwmbl/indexer/extract.py similarity index 100% rename from indexer/extract.py rename to mwmbl/indexer/extract.py diff --git a/indexer/extract_local.py b/mwmbl/indexer/extract_local.py similarity index 92% rename from indexer/extract_local.py rename to mwmbl/indexer/extract_local.py index 040883f..b293f08 100644 --- a/indexer/extract_local.py +++ b/mwmbl/indexer/extract_local.py @@ -4,9 +4,9 @@ import os from glob import glob from multiprocessing import Process, Lock -from extract_process import fetch_process_warc_records -from fsqueue import FSQueue, GzipJsonRowSerializer -from paths import DATA_DIR +from .extract_process import fetch_process_warc_records +from .fsqueue import FSQueue, GzipJsonRowSerializer +from .paths import DATA_DIR ARCHIVE_INFO_GLOB = 'outputs/records/*.gz' diff --git a/indexer/extract_process.py b/mwmbl/indexer/extract_process.py similarity index 100% rename from indexer/extract_process.py rename to mwmbl/indexer/extract_process.py diff --git a/indexer/fsqueue.py b/mwmbl/indexer/fsqueue.py similarity index 100% rename from indexer/fsqueue.py rename to mwmbl/indexer/fsqueue.py diff --git a/indexer/hn-top-domains-filtered.py b/mwmbl/indexer/hn-top-domains-filtered.py similarity index 100% rename from indexer/hn-top-domains-filtered.py rename to mwmbl/indexer/hn-top-domains-filtered.py diff --git a/indexer/index.py b/mwmbl/indexer/index.py similarity index 96% rename from indexer/index.py rename to mwmbl/indexer/index.py index 3350560..d0f0efe 100644 --- a/indexer/index.py +++ b/mwmbl/indexer/index.py @@ -10,7 +10,7 @@ import pandas as pd # NUM_PAGES = 8192 # PAGE_SIZE = 512 -from tinysearchengine.indexer import TinyIndexer, Document, TokenizedDocument +from mwmbl.tinysearchengine.indexer import TinyIndexer, Document, TokenizedDocument NUM_INITIAL_TOKENS = 50 diff --git a/indexer/index_glob.py b/mwmbl/indexer/index_glob.py similarity index 83% rename from indexer/index_glob.py rename to mwmbl/indexer/index_glob.py index 31decc3..e9102c2 100644 --- a/indexer/index_glob.py +++ b/mwmbl/indexer/index_glob.py @@ -4,12 +4,13 @@ from glob import glob import bs4 from spacy.lang.en import English -from index import tokenize -from tinysearchengine.indexer import TinyIndexer, NUM_PAGES, PAGE_SIZE -from paths import INDEX_PATH, CRAWL_GLOB +from .index import tokenize +from mwmbl.tinysearchengine.indexer import TinyIndexer, NUM_PAGES, PAGE_SIZE +from .paths import INDEX_PATH, CRAWL_GLOB def run(): + # TODO: item_factory argument is unfilled. indexer = TinyIndexer(INDEX_PATH, NUM_PAGES, PAGE_SIZE) indexer.create_if_not_exists() nlp = English() diff --git a/indexer/index_queue.py b/mwmbl/indexer/index_queue.py similarity index 74% rename from indexer/index_queue.py rename to mwmbl/indexer/index_queue.py index eadfd75..f048e28 100644 --- a/indexer/index_queue.py +++ b/mwmbl/indexer/index_queue.py @@ -3,10 +3,10 @@ Index items in the file-system queue """ from spacy.lang.en import English -from fsqueue import FSQueue, ZstdJsonSerializer -from index import index_titles_urls_and_extracts -from tinysearchengine.indexer import TinyIndexer, NUM_PAGES, PAGE_SIZE -from paths import DATA_DIR, DOMAINS_TITLES_QUEUE_NAME, INDEX_PATH +from .fsqueue import FSQueue, ZstdJsonSerializer +from .index import index_titles_urls_and_extracts +from mwmbl.tinysearchengine.indexer import TinyIndexer, NUM_PAGES, PAGE_SIZE +from .paths import DATA_DIR, DOMAINS_TITLES_QUEUE_NAME, INDEX_PATH def get_queue_items(): diff --git a/indexer/indexcc.py b/mwmbl/indexer/indexcc.py similarity index 82% rename from indexer/indexcc.py rename to mwmbl/indexer/indexcc.py index 549bb0e..4f68025 100644 --- a/indexer/indexcc.py +++ b/mwmbl/indexer/indexcc.py @@ -7,10 +7,10 @@ from logging import getLogger import spacy -from fsqueue import FSQueue, GzipJsonRowSerializer, FSQueueError -from index import index_titles_urls_and_extracts -from tinysearchengine.indexer import TinyIndexer, NUM_PAGES, PAGE_SIZE, Document -from paths import INDEX_PATH, DATA_DIR, COMMON_CRAWL_TERMS_PATH +from .fsqueue import FSQueue, GzipJsonRowSerializer, FSQueueError +from .index import index_titles_urls_and_extracts +from mwmbl.tinysearchengine.indexer import TinyIndexer, NUM_PAGES, PAGE_SIZE, Document +from .paths import INDEX_PATH, DATA_DIR, COMMON_CRAWL_TERMS_PATH logging.basicConfig(level=logging.DEBUG, stream=sys.stdout) diff --git a/indexer/paths.py b/mwmbl/indexer/paths.py similarity index 100% rename from indexer/paths.py rename to mwmbl/indexer/paths.py diff --git a/indexer/wiki.py b/mwmbl/indexer/wiki.py similarity index 85% rename from indexer/wiki.py rename to mwmbl/indexer/wiki.py index a3a66ff..93ac1c7 100644 --- a/indexer/wiki.py +++ b/mwmbl/indexer/wiki.py @@ -7,9 +7,9 @@ from urllib.parse import quote from spacy.lang.en import English -from indexer.index import index_titles_urls_and_extracts -from tinysearchengine.indexer import TinyIndexer, NUM_PAGES, PAGE_SIZE -from indexer.paths import WIKI_TITLES_PATH, INDEX_PATH +from .index import index_titles_urls_and_extracts +from mwmbl.tinysearchengine.indexer import TinyIndexer, NUM_PAGES, PAGE_SIZE +from .paths import WIKI_TITLES_PATH, INDEX_PATH TEXT_TAGS = ['mediawiki', 'page', 'revision', 'text'] TITLE_START = 'Wikipedia: ' diff --git a/mwmbl/tinysearchengine/__init__.py b/mwmbl/tinysearchengine/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/mwmbl/tinysearchengine/app.py b/mwmbl/tinysearchengine/app.py new file mode 100644 index 0000000..daaca87 --- /dev/null +++ b/mwmbl/tinysearchengine/app.py @@ -0,0 +1,17 @@ +import logging +import sys + +import uvicorn + +from mwmbl.tinysearchengine import create_app +from mwmbl.tinysearchengine.indexer import TinyIndex, NUM_PAGES, PAGE_SIZE, Document + +logging.basicConfig() + + +index_path = sys.argv[1] +tiny_index = TinyIndex(Document, index_path, NUM_PAGES, PAGE_SIZE) +app = create_app.create(tiny_index) + +if __name__ == "__main__": + uvicorn.run("mwmbl.tinysearchengine.app:app", host="0.0.0.0", port=8080, log_level="info") diff --git a/tinysearchengine/create_app.py b/mwmbl/tinysearchengine/create_app.py similarity index 98% rename from tinysearchengine/create_app.py rename to mwmbl/tinysearchengine/create_app.py index 19c4a3b..7af8e9f 100644 --- a/tinysearchengine/create_app.py +++ b/mwmbl/tinysearchengine/create_app.py @@ -7,7 +7,7 @@ from fastapi import FastAPI from starlette.responses import FileResponse from starlette.staticfiles import StaticFiles -from tinysearchengine.indexer import TinyIndex, Document +from mwmbl.tinysearchengine.indexer import TinyIndex, Document logger = getLogger(__name__) diff --git a/tinysearchengine/indexer.py b/mwmbl/tinysearchengine/indexer.py similarity index 98% rename from tinysearchengine/indexer.py rename to mwmbl/tinysearchengine/indexer.py index 7e17dc1..d5fe684 100644 --- a/tinysearchengine/indexer.py +++ b/mwmbl/tinysearchengine/indexer.py @@ -6,7 +6,7 @@ from pathlib import Path from typing import TypeVar, Generic, Callable, List import mmh3 -from zstandard import ZstdDecompressor +from zstandard import ZstdDecompressor, ZstdCompressor, ZstdError NUM_PAGES = 25600 diff --git a/tinysearchengine/static/index.css b/mwmbl/tinysearchengine/static/index.css similarity index 100% rename from tinysearchengine/static/index.css rename to mwmbl/tinysearchengine/static/index.css diff --git a/tinysearchengine/static/index.html b/mwmbl/tinysearchengine/static/index.html similarity index 100% rename from tinysearchengine/static/index.html rename to mwmbl/tinysearchengine/static/index.html diff --git a/tinysearchengine/static/index.js b/mwmbl/tinysearchengine/static/index.js similarity index 100% rename from tinysearchengine/static/index.js rename to mwmbl/tinysearchengine/static/index.js diff --git a/tinysearchengine/static/landing.html b/mwmbl/tinysearchengine/static/landing.html similarity index 100% rename from tinysearchengine/static/landing.html rename to mwmbl/tinysearchengine/static/landing.html diff --git a/tinysearchengine/static/plugin.xml b/mwmbl/tinysearchengine/static/plugin.xml similarity index 100% rename from tinysearchengine/static/plugin.xml rename to mwmbl/tinysearchengine/static/plugin.xml diff --git a/tinysearchengine/static/search.html b/mwmbl/tinysearchengine/static/search.html similarity index 100% rename from tinysearchengine/static/search.html rename to mwmbl/tinysearchengine/static/search.html diff --git a/tinysearchengine/static/typeahead.css b/mwmbl/tinysearchengine/static/typeahead.css similarity index 100% rename from tinysearchengine/static/typeahead.css rename to mwmbl/tinysearchengine/static/typeahead.css diff --git a/tinysearchengine/static/typeahead.js b/mwmbl/tinysearchengine/static/typeahead.js similarity index 100% rename from tinysearchengine/static/typeahead.js rename to mwmbl/tinysearchengine/static/typeahead.js diff --git a/pyproject.toml b/pyproject.toml index c634fc9..c51ea16 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,5 +1,5 @@ [tool.poetry] -name = "tinysearchengine" +name = "mwmbl" version = "0.1.0" description = "" authors = ["Daoud Clarke <daoud.clarke@gmail.com>"] diff --git a/tinysearchengine/app.py b/tinysearchengine/app.py deleted file mode 100644 index 27e8946..0000000 --- a/tinysearchengine/app.py +++ /dev/null @@ -1,17 +0,0 @@ -import logging -import sys - -import uvicorn - -from tinysearchengine import create_app -from tinysearchengine.indexer import TinyIndex, NUM_PAGES, PAGE_SIZE, Document - -logging.basicConfig() - - -index_path = sys.argv[1] -tiny_index = TinyIndex(Document, index_path, NUM_PAGES, PAGE_SIZE) -app = create_app.create(tiny_index) - -if __name__ == "__main__": - uvicorn.run("tinysearchengine.app:app", host="0.0.0.0", port=8080, log_level="info")