Get Dockerfile working

This commit is contained in:
Daoud Clarke 2021-12-23 21:30:51 +00:00
parent 9c65bf3c8f
commit 7e520fb32f
8 changed files with 7885 additions and 11 deletions

7
.dockerignore Normal file
View File

@ -0,0 +1,7 @@
Dockerfile
README.md
*.pyc
*.pyo
*.pyd
__pycache__
.pytest_cache

1
.gitignore vendored
View File

@@ -1,2 +1,3 @@
+./data
 .idea
 *~

View File

@@ -27,7 +27,8 @@ FROM base as final
 #RUN apk add --no-cache libffi libpq
 COPY --from=builder /venv /venv
+COPY data /data
 #COPY docker-entrypoint.sh wsgi.py ./
 #CMD ["./docker-entrypoint.sh"]
-CMD ["/venv/bin/python", "-m", "tinysearchengine.app"]
+CMD ["/venv/bin/python", "-m", "tinysearchengine.app", "/data/index.tinysearch"]

7861
hn-top-domains-filtered.py Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -9,13 +9,13 @@ COMMON_CRAWL_TERMS_PATH = DATA_DIR / 'common-craw-terms.csv'
 HN_TOP_PATH = os.path.join(DATA_DIR, 'hn-top.csv')
 CRAWL_PREFIX = 'crawl_'
 CRAWL_GLOB = os.path.join(DATA_DIR, f"{CRAWL_PREFIX}*")
-INDEX_PATH = os.path.join(DATA_DIR, 'index.tinysearch')
 TEST_INDEX_PATH = os.path.join(DATA_DIR, 'index-test.tinysearch')
 TEST_TERMS_PATH = os.path.join(DATA_DIR, 'index-terms.csv')
 WIKI_DATA_PATH = os.path.join(DATA_DIR, 'enwiki-20210301-pages-articles1.xml-p1p41242.bz2')
 WIKI_TITLES_PATH = os.path.join(DATA_DIR, 'abstract-titles-sorted.txt.gz')
 DOMAINS_QUEUE_NAME = 'domains-queue-fs'
 DOMAINS_TITLES_QUEUE_NAME = 'domains-title-queue-fs'
 DOMAINS_PATH = os.path.join(DATA_DIR, 'top10milliondomains.csv.gz')
+INDEX_PATH = Path(__file__).parent / 'data' / 'index.tinysearch'

View File

@@ -1,17 +1,17 @@
 import logging
+import sys
 import uvicorn
 from tinysearchengine import create_app
 from tinysearchengine.indexer import TinyIndex, NUM_PAGES, PAGE_SIZE, Document
-from paths import INDEX_PATH
-tiny_index = TinyIndex(Document, INDEX_PATH, NUM_PAGES, PAGE_SIZE)
-app = create_app.create(tiny_index)
 logging.basicConfig()
+index_path = sys.argv[1]
+tiny_index = TinyIndex(Document, index_path, NUM_PAGES, PAGE_SIZE)
+app = create_app.create(tiny_index)
 if __name__ == "__main__":
-    uvicorn.run("tinysearchengine.app:app", host="127.0.0.1", port=8080, log_level="info", reload=True)
+    uvicorn.run("tinysearchengine.app:app", host="127.0.0.1", port=8080, log_level="info")

View File

@@ -1,6 +1,7 @@
 import re
 from logging import getLogger
 from operator import itemgetter
+from pathlib import Path
 from fastapi import FastAPI
 from starlette.responses import FileResponse
@@ -11,6 +12,7 @@ from tinysearchengine.indexer import TinyIndex, Document
 logger = getLogger(__name__)
+STATIC_FILES_PATH = Path(__file__).parent / 'static'
 SCORE_THRESHOLD = 0.25
@@ -107,7 +109,7 @@ def create(tiny_index: TinyIndex):
     @app.get('/')
     def index():
-        return FileResponse('tinysearchengine/static/index.html')
+        return FileResponse(STATIC_FILES_PATH / 'index.html')
-    app.mount('/', StaticFiles(directory="tinysearchengine/static"), name="static")
+    app.mount('/', StaticFiles(directory=STATIC_FILES_PATH), name="static")
     return app

View File

@ -2,11 +2,13 @@ import json
import os import os
from dataclasses import astuple, dataclass from dataclasses import astuple, dataclass
from mmap import mmap, PROT_READ from mmap import mmap, PROT_READ
from pathlib import Path
from typing import TypeVar, Generic, Callable, List from typing import TypeVar, Generic, Callable, List
import mmh3 import mmh3
from zstandard import ZstdDecompressor from zstandard import ZstdDecompressor
NUM_PAGES = 25600 NUM_PAGES = 25600
PAGE_SIZE = 4096 PAGE_SIZE = 4096