Investigate duplication of URLs in batches

Daoud Clarke 2022-06-26 21:11:51 +01:00
parent eb571fc5fe
commit e27d749e18
10 changed files with 75 additions and 63 deletions

View File

@@ -7,15 +7,23 @@ import json
 from collections import defaultdict, Counter
 from urllib.parse import urlparse
-from mwmbl.indexer.paths import CRAWL_GLOB
+from mwmbl.crawler.batch import HashedBatch
+from mwmbl.indexer.paths import CRAWL_GLOB, MWMBL_DATA_DIR
+# TODO: remove this line - temporary override
+CRAWL_GLOB = str(MWMBL_DATA_DIR / "b2") + "/*/*/2022-06-23/*/*/*.json.gz"
 def get_urls():
     for path in glob.glob(CRAWL_GLOB):
         data = json.load(gzip.open(path))
-        user = data['user_id_hash']
-        for item in data['items']:
-            yield user, item['url']
+        batch = HashedBatch.parse_obj(data)
+        user = batch.user_id_hash
+        for item in batch.items:
+            if item.content is not None:
+                for url in item.content.links:
+                    yield user, url
 def analyse_urls(urls):
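
The hunk above stops at the signature of analyse_urls, so the duplicate check itself is not visible here. As a rough illustration only (count_duplicate_urls is a hypothetical helper, not code from this commit), counting the (user, url) pairs that get_urls yields could look like this:

from collections import Counter, defaultdict

def count_duplicate_urls(url_pairs):
    # Hypothetical helper for illustration; url_pairs is an iterable of
    # (user_id_hash, url) tuples such as the one produced by get_urls().
    url_counts = Counter()
    url_users = defaultdict(set)
    for user, url in url_pairs:
        url_counts[url] += 1
        url_users[url].add(user)
    duplicates = {url: count for url, count in url_counts.items() if count > 1}
    print(f"{len(duplicates)} URLs appear more than once")
    multi_user = sum(1 for users in url_users.values() if len(users) > 1)
    print(f"{multi_user} URLs were sent by more than one user")
    return duplicates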

View File

@@ -0,0 +1,4 @@
+"""
+Analyse recent batches looking for duplicates.
+"""

View File

@@ -4,14 +4,14 @@ import json
 import os
 import re
 from datetime import datetime, timezone, timedelta
-from typing import Optional, Union
+from typing import Union
 from uuid import uuid4
 import boto3
 import requests
 from fastapi import HTTPException, APIRouter
-from pydantic import BaseModel
+from mwmbl.crawler.batch import Batch, NewBatchRequest, HashedBatch
 from mwmbl.crawler.urls import URLDatabase
 from mwmbl.database import Database
@@ -43,40 +43,6 @@ def upload(data: bytes, name: str):
     return result
-class ItemContent(BaseModel):
-    title: str
-    extract: str
-    links: list[str]
-class ItemError(BaseModel):
-    name: str
-    message: Optional[str]
-class Item(BaseModel):
-    url: str
-    status: Optional[int]
-    timestamp: int
-    content: Optional[ItemContent]
-    error: Optional[ItemError]
-class Batch(BaseModel):
-    user_id: str
-    items: list[Item]
-class NewBatchRequest(BaseModel):
-    user_id: str
-class HashedBatch(BaseModel):
-    user_id_hash: str
-    timestamp: int
-    items: list[Item]
 last_batch = None
@@ -232,7 +198,6 @@ def get_subfolders(prefix):
     items = client.list_objects(Bucket=BUCKET_NAME,
                                 Prefix=prefix,
                                 Delimiter='/')
-    print("Got items", items)
     item_keys = [item['Prefix'][len(prefix):].strip('/') for item in items['CommonPrefixes']]
     return item_keys
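
For context on the unchanged get_subfolders code above: listing with Delimiter='/' makes the S3-compatible store group keys by their next path segment, returned under CommonPrefixes. A self-contained sketch of that call (the bucket name and prefix here are made up, and the client is assumed to be configured elsewhere):

import boto3

# Hypothetical bucket and prefix, for illustration only.
client = boto3.client("s3")
response = client.list_objects(Bucket="example-bucket", Prefix="1/", Delimiter="/")

# Each CommonPrefixes entry looks like {'Prefix': '1/<subfolder>/'}.
subfolders = [item["Prefix"][len("1/"):].strip("/")
              for item in response.get("CommonPrefixes", [])]
print(subfolders)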

mwmbl/crawler/batch.py Normal file
View File

@@ -0,0 +1,37 @@
+from typing import Optional
+
+from pydantic import BaseModel
+
+
+class ItemContent(BaseModel):
+    title: str
+    extract: str
+    links: list[str]
+
+
+class ItemError(BaseModel):
+    name: str
+    message: Optional[str]
+
+
+class Item(BaseModel):
+    url: str
+    status: Optional[int]
+    timestamp: int
+    content: Optional[ItemContent]
+    error: Optional[ItemError]
+
+
+class Batch(BaseModel):
+    user_id: str
+    items: list[Item]
+
+
+class NewBatchRequest(BaseModel):
+    user_id: str
+
+
+class HashedBatch(BaseModel):
+    user_id_hash: str
+    timestamp: int
+    items: list[Item]
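
These models previously lived inside the crawler app module (removed above); moving them into mwmbl/crawler/batch.py lets scripts deserialise stored batches without importing the FastAPI app. A small usage sketch, assuming a locally downloaded gzipped batch file (the path is hypothetical):

import gzip
import json

from mwmbl.crawler.batch import HashedBatch

# Hypothetical local copy of a batch file.
with gzip.open("batch.json.gz") as f:
    batch = HashedBatch.parse_obj(json.load(f))

print(batch.user_id_hash, batch.timestamp, len(batch.items))
for item in batch.items:
    if item.content is not None:
        print(item.url, len(item.content.links))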

View File

@@ -3,7 +3,8 @@ from datetime import date, datetime, timedelta
 import spacy
 from mwmbl.crawler.app import get_user_id_hashes_for_date, get_batches_for_date_and_user, get_batch_from_id, \
-    create_historical_batch, HashedBatch, get_batch_url
+    create_historical_batch, get_batch_url
+from mwmbl.crawler.batch import HashedBatch
 from mwmbl.database import Database
 from mwmbl.indexer.indexdb import BatchInfo, BatchStatus, IndexDatabase
 from mwmbl.indexer.index import tokenize_document
@@ -17,13 +18,13 @@ def run():
     for day in range(DAYS):
         date_str = str(date.today() - timedelta(days=day))
         users = get_user_id_hashes_for_date(date_str)
-        print("Users", users)
+        print(f"Got {len(users)} for day {date_str}")
         with Database() as db:
             index_db = IndexDatabase(db.connection)
             index_db.create_tables()
             for user in users:
                 batches = get_batches_for_date_and_user(date_str, user)
-                print("Batches", batches)
+                print("Historical batches for user", user, len(batches))
                 batch_urls = [get_batch_url(batch_id, date_str, user) for batch_id in batches["batch_ids"]]
                 infos = [BatchInfo(url, user, BatchStatus.REMOTE) for url in batch_urls]
                 index_db.record_batches(infos)

View File

@@ -12,7 +12,6 @@ from mwmbl.tinysearchengine.indexer import Document
 class BatchStatus(Enum):
     REMOTE = 0  # The batch only exists in long term storage
     LOCAL = 1  # We have a copy of the batch locally in Postgresql
-    INDEXED = 2  # The batch has been indexed and the local data has been deleted
 class DocumentStatus(Enum):
@@ -108,6 +107,7 @@ class IndexDatabase:
         data = [(document.url, document.title, document.extract, document.score, DocumentStatus.NEW.value)
                 for document in sorted_documents]
+        print("Queueing documents", len(data))
         with self.connection.cursor() as cursor:
             execute_values(cursor, sql, data)
@@ -126,7 +126,6 @@
         with self.connection.cursor() as cursor:
             cursor.execute(sql)
             results = cursor.fetchall()
-        print("Results", results)
         return [Document(title, url, extract, score) for url, title, extract, score in results]
     def queue_documents_for_page(self, urls_and_page_indexes: list[tuple[str, int]]):
@@ -134,7 +133,7 @@
         INSERT INTO document_pages (url, page) values %s
         """
-        print("Queuing", urls_and_page_indexes)
+        print(f"Queuing {len(urls_and_page_indexes)} urls and page indexes")
         with self.connection.cursor() as cursor:
             execute_values(cursor, sql, urls_and_page_indexes)
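
Both queueing methods above rely on psycopg2's execute_values, which expands the single "values %s" placeholder into one multi-row INSERT rather than issuing a statement per row. A standalone sketch of the same call (the connection string and example rows are made up; the document_pages table is the one from the diff):

import psycopg2
from psycopg2.extras import execute_values

connection = psycopg2.connect("dbname=mwmbl")  # hypothetical connection string
sql = """
    INSERT INTO document_pages (url, page) VALUES %s
"""
rows = [("https://example.com/a", 3), ("https://example.com/b", 17)]  # (url, page_index) pairs
with connection.cursor() as cursor:
    execute_values(cursor, sql, rows)
connection.commit()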

View File

@@ -27,7 +27,7 @@ def run_preprocessing(index_path):
     with Database() as db:
         index_db = IndexDatabase(db.connection)
         documents = index_db.get_documents_for_preprocessing()
-        print(f"Got {len(documents)} documents")
+        print(f"Got {len(documents)} documents for preprocessing")
         if len(documents) == 0:
             sleep(10)
     with TinyIndex(Document, index_path, 'w') as indexer:

View File

@@ -9,7 +9,7 @@ from time import sleep
 import requests
-from mwmbl.crawler.app import HashedBatch
+from mwmbl.crawler.batch import HashedBatch
 from mwmbl.database import Database
 from mwmbl.indexer.indexdb import IndexDatabase, BatchStatus
 from mwmbl.retry import retry_requests
@@ -26,21 +26,19 @@ def retrieve_batches():
     with Database() as db:
         index_db = IndexDatabase(db.connection)
         batches = index_db.get_batches_by_status(BatchStatus.REMOTE)
-        print("Batches", batches)
+        print(f"Found {len(batches)} remote batches")
         urls = [batch.url for batch in batches]
     pool = ThreadPool(NUM_THREADS)
     results = pool.imap_unordered(retrieve_batch, urls)
     for result in results:
         print("Processed batch with items:", result)
     with Database() as db:
         index_db = IndexDatabase(db.connection)
-        index_db.update_batch_status(urls, BatchStatus.LOCAL)
+        index_db.update_batch_status(urls, BatchStatus.LOCAL)
 def retrieve_batch(url):
     data = json.loads(gzip.decompress(retry_requests.get(url).content))
     batch = HashedBatch.parse_obj(data)
+    print(f"Retrieved batch with {len(batch.items)} items")
     queue_batch(batch)
     return len(batch.items)
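
retrieve_batches hands the batch URLs to a ThreadPool, and imap_unordered yields each retrieve_batch return value (the item count) as soon as that download finishes, so one slow batch does not hold up the loop. The pattern in isolation (fetch_item_count is a stand-in for retrieve_batch and the URLs are hypothetical):

import gzip
import json
from multiprocessing.pool import ThreadPool

import requests

def fetch_item_count(url):
    # Download a gzipped JSON batch and report how many items it contains.
    data = json.loads(gzip.decompress(requests.get(url).content))
    return len(data["items"])

# Hypothetical batch URLs, for illustration only.
urls = ["https://example.com/batches/1.json.gz", "https://example.com/batches/2.json.gz"]
with ThreadPool(10) as pool:
    for count in pool.imap_unordered(fetch_item_count, urls):
        print("Processed batch with items:", count)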

View File

@@ -18,21 +18,20 @@ def run_update(index_path):
         for i in range(indexer.num_pages):
             with Database() as db:
                 index_db = IndexDatabase(db.connection)
-                pages = index_db.get_queued_documents_for_page(i)
-                if len(pages) > 0:
-                    print("Pages", len(pages))
-                else:
+                documents = index_db.get_queued_documents_for_page(i)
+                print(f"Documents queued for page {i}: {len(documents)}")
+                if len(documents) == 0:
                     continue
                 for j in range(3):
                     try:
-                        indexer.add_to_page(i, pages)
+                        indexer.add_to_page(i, documents)
                         break
                     except ValueError:
-                        pages = pages[:len(pages)//2]
-                        if len(pages) == 0:
+                        documents = documents[:len(documents)//2]
+                        if len(documents) == 0:
                             break
-                        print(f"Not enough space, adding {len(pages)}")
+                        print(f"Not enough space, adding {len(documents)}")
                 index_db.clear_queued_documents_for_page(i)
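
The retry loop above implements a simple back-off: when add_to_page raises ValueError because the page is full, the queued documents are halved and the insert retried, up to three times. The same idea as a standalone helper (add_with_backoff is a hypothetical name; the only behaviour assumed is the ValueError-on-overflow shown in the diff):

def add_with_backoff(indexer, page_index, documents, max_attempts=3):
    # Halve the batch each time the page overflows, keeping the front of the list.
    for _ in range(max_attempts):
        if not documents:
            break
        try:
            indexer.add_to_page(page_index, documents)
            return len(documents)
        except ValueError:
            documents = documents[:len(documents) // 2]
    return 0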

View File

@@ -6,7 +6,7 @@ from multiprocessing import Process
 import uvicorn
 from fastapi import FastAPI
-from mwmbl.indexer import historical, retrieve, preprocess
+from mwmbl.indexer import historical, retrieve, preprocess, update_pages
 from mwmbl.crawler.app import router as crawler_router
 from mwmbl.tinysearchengine import search
 from mwmbl.tinysearchengine.completer import Completer
@@ -43,6 +43,7 @@ def run():
     Process(target=historical.run).start()
     Process(target=retrieve.run).start()
     Process(target=preprocess.run, args=(args.index,)).start()
+    Process(target=update_pages.run, args=(args.index,)).start()
     completer = Completer()