from collections import defaultdict
from datetime import datetime, timezone, timedelta
from logging import getLogger
from multiprocessing import Queue
from pathlib import Path
from time import sleep
from typing import Collection
from urllib.parse import urlparse

from mwmbl.crawler.batch import HashedBatch
from mwmbl.crawler.urls import URLDatabase, URLStatus, FoundURL
from mwmbl.database import Database
from mwmbl.hn_top_domains_filtered import DOMAINS
from mwmbl.indexer import process_batch
from mwmbl.indexer.batch_cache import BatchCache
from mwmbl.indexer.blacklist import get_blacklist_domains, is_domain_blacklisted
from mwmbl.indexer.index_batches import get_url_error_status
from mwmbl.indexer.indexdb import BatchStatus
from mwmbl.indexer.paths import BATCH_DIR_NAME
from mwmbl.settings import UNKNOWN_DOMAIN_MULTIPLIER, SCORE_FOR_SAME_DOMAIN, \
    SCORE_FOR_DIFFERENT_DOMAIN, SCORE_FOR_ROOT_PATH, EXTRA_LINK_MULTIPLIER
from mwmbl.utils import get_domain

logger = getLogger(__name__)


def update_urls_continuously(data_path: str, new_item_queue: Queue):
    """Run the URL update step in a loop, sleeping between runs and logging any errors."""
    batch_cache = BatchCache(Path(data_path) / BATCH_DIR_NAME)
    while True:
        try:
            run(batch_cache, new_item_queue)
        except Exception:
            logger.exception("Error updating URLs")
        sleep(10)


def run(batch_cache: BatchCache, new_item_queue: Queue):
    """Process locally cached batches through record_urls_in_database, moving them from LOCAL to URLS_UPDATED."""
    process_batch.run(
        batch_cache, BatchStatus.LOCAL, BatchStatus.URLS_UPDATED, record_urls_in_database, new_item_queue)


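# Note on the batch shape, inferred from how record_urls_in_database uses it rather
# than restated from the HashedBatch model: each batch carries a user_id_hash and a
# list of items; each item has a url, a timestamp in milliseconds, and an optional
# content object whose links and extra_links are scored below.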
def record_urls_in_database(batches: Collection[HashedBatch], new_item_queue: Queue):
    """Score the URLs and outgoing links found in the given batches and record them in the URL database."""
    start = datetime.now()
    blacklist_domains = get_blacklist_domains()
    blacklist_retrieval_time = datetime.now() - start
    logger.info(f"Recording URLs in database for {len(batches)} batches, with {len(blacklist_domains)} blacklist "
                f"domains, retrieved in {blacklist_retrieval_time.total_seconds()} seconds")
    with Database() as db:
        url_db = URLDatabase(db.connection)
        url_scores = defaultdict(float)
        url_users = {}
        url_timestamps = {}
        url_statuses = defaultdict(lambda: URLStatus.NEW)
        for batch in batches:
            for item in batch.items:
                # Item timestamps are in milliseconds; convert to a UTC datetime.
                timestamp = get_datetime_from_timestamp(item.timestamp / 1000.0)
                url_timestamps[item.url] = timestamp
                url_users[item.url] = batch.user_id_hash
                if item.content is None:
                    url_statuses[item.url] = get_url_error_status(item)
                else:
                    url_statuses[item.url] = URLStatus.CRAWLED
                    try:
                        crawled_page_domain = get_domain(item.url)
                    except ValueError:
                        logger.info(f"Couldn't parse URL {item.url}")
                        continue
                    # Links found on pages from domains outside the known DOMAINS set are weighted down.
                    score_multiplier = 1 if crawled_page_domain in DOMAINS else UNKNOWN_DOMAIN_MULTIPLIER
                    for link in item.content.links:
                        process_link(batch.user_id_hash, crawled_page_domain, link, score_multiplier, timestamp,
                                     url_scores, url_timestamps, url_users, False, blacklist_domains)

                    if item.content.extra_links:
                        for link in item.content.extra_links:
                            process_link(batch.user_id_hash, crawled_page_domain, link, score_multiplier, timestamp,
                                         url_scores, url_timestamps, url_users, True, blacklist_domains)

        # Union of linked URLs (which have scores) and crawled URLs (which have statuses).
        found_urls = [FoundURL(url, url_users[url], url_scores[url], url_statuses[url], url_timestamps[url])
                      for url in url_scores.keys() | url_statuses.keys()]

        logger.info(f"Found {len(found_urls)} URLs")
        urls = url_db.update_found_urls(found_urls)
        new_item_queue.put(urls)
        logger.info(f"Put {len(urls)} new items in the URL queue")


def process_link(user_id_hash, crawled_page_domain, link, unknown_domain_multiplier, timestamp, url_scores,
                 url_timestamps, url_users, is_extra: bool, blacklist_domains):
    """Score a single outgoing link and the root URL of its domain, skipping blacklisted domains."""
    parsed_link = urlparse(link)
    if is_domain_blacklisted(parsed_link.netloc, blacklist_domains):
        logger.debug(f"Excluding link for blacklisted domain: {parsed_link}")
        return

    extra_multiplier = EXTRA_LINK_MULTIPLIER if is_extra else 1.0
    score = SCORE_FOR_SAME_DOMAIN if parsed_link.netloc == crawled_page_domain else SCORE_FOR_DIFFERENT_DOMAIN
    url_scores[link] += score * unknown_domain_multiplier * extra_multiplier
    url_users[link] = user_id_hash
    url_timestamps[link] = timestamp
    # The root URL of the linked domain also accumulates a score.
    domain = f'{parsed_link.scheme}://{parsed_link.netloc}/'
    url_scores[domain] += SCORE_FOR_ROOT_PATH * unknown_domain_multiplier
    url_users[domain] = user_id_hash
    url_timestamps[domain] = timestamp


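# Worked example (hypothetical constant values, for illustration only): if
# SCORE_FOR_SAME_DOMAIN were 1.0, UNKNOWN_DOMAIN_MULTIPLIER 0.1 and
# EXTRA_LINK_MULTIPLIER 0.5, then an extra link pointing at the same domain as a
# page crawled on an unknown domain would accumulate 1.0 * 0.1 * 0.5 = 0.05, and
# the root URL of its domain would additionally gain SCORE_FOR_ROOT_PATH * 0.1.

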
def get_datetime_from_timestamp(timestamp: float) -> datetime:
    """Convert a Unix timestamp in seconds to a timezone-aware UTC datetime."""
    batch_datetime = datetime(1970, 1, 1, tzinfo=timezone.utc) + timedelta(seconds=timestamp)
    return batch_datetime
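
# Usage sketch (an assumption about the calling application, not shown in this
# module): update_urls_continuously is expected to run in its own process, sharing
# the queue with whichever component consumes the newly found URLs, e.g.:
#
#     from multiprocessing import Process, Queue
#
#     new_item_queue = Queue()
#     update_process = Process(target=update_urls_continuously,
#                              args=("/path/to/data", new_item_queue))
#     update_process.start()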