Merge branch 'main' into django-rewrite

Daoud Clarke 2023-09-23 12:56:22 +01:00
commit 177324353f
4 changed files with 690 additions and 520 deletions

mwmbl/indexer/update_urls.py

@@ -9,6 +9,8 @@ from time import sleep
 from typing import Iterable, Collection
 from urllib.parse import urlparse
 
+from requests_cache import CachedSession
+
 from mwmbl.crawler.batch import HashedBatch
 from mwmbl.crawler.urls import URLDatabase, URLStatus, FoundURL
 from mwmbl.database import Database
@@ -19,7 +21,7 @@ from mwmbl.indexer.index_batches import get_url_error_status
 from mwmbl.indexer.indexdb import BatchStatus
 from mwmbl.indexer.paths import BATCH_DIR_NAME
 from mwmbl.settings import UNKNOWN_DOMAIN_MULTIPLIER, EXCLUDED_DOMAINS, SCORE_FOR_SAME_DOMAIN, \
-    SCORE_FOR_DIFFERENT_DOMAIN, SCORE_FOR_ROOT_PATH, EXTRA_LINK_MULTIPLIER
+    SCORE_FOR_DIFFERENT_DOMAIN, SCORE_FOR_ROOT_PATH, EXTRA_LINK_MULTIPLIER, BLACKLIST_DOMAINS_URL
 from mwmbl.utils import get_domain
 
 logger = getLogger(__name__)
@@ -40,7 +42,11 @@ def run(batch_cache: BatchCache, new_item_queue: Queue):
 
 def record_urls_in_database(batches: Collection[HashedBatch], new_item_queue: Queue):
-    logger.info(f"Recording URLs in database for {len(batches)} batches")
+    start = datetime.now()
+    blacklist_domains = get_blacklist_domains()
+    blacklist_retrieval_time = datetime.now() - start
+    logger.info(f"Recording URLs in database for {len(batches)} batches, with {len(blacklist_domains)} blacklist "
+                f"domains, retrieved in {blacklist_retrieval_time.total_seconds()} seconds")
     with Database() as db:
         url_db = URLDatabase(db.connection)
         url_scores = defaultdict(float)
@@ -64,12 +70,12 @@ def record_urls_in_database(batches: Collection[HashedBatch], new_item_queue: Qu
             score_multiplier = 1 if crawled_page_domain in DOMAINS else UNKNOWN_DOMAIN_MULTIPLIER
             for link in item.content.links:
                 process_link(batch, crawled_page_domain, link, score_multiplier, timestamp, url_scores,
-                             url_timestamps, url_users, False)
+                             url_timestamps, url_users, False, blacklist_domains)
 
             if item.content.extra_links:
                 for link in item.content.extra_links:
                     process_link(batch, crawled_page_domain, link, score_multiplier, timestamp, url_scores,
-                                 url_timestamps, url_users, True)
+                                 url_timestamps, url_users, True, blacklist_domains)
 
     found_urls = [FoundURL(url, url_users[url], url_scores[url], url_statuses[url], url_timestamps[url])
                   for url in url_scores.keys() | url_statuses.keys()]
@@ -80,9 +86,16 @@ def record_urls_in_database(batches: Collection[HashedBatch], new_item_queue: Qu
     logger.info(f"Put {len(urls)} new items in the URL queue")
 
 
-def process_link(batch, crawled_page_domain, link, unknown_domain_multiplier, timestamp, url_scores, url_timestamps, url_users, is_extra: bool):
+def get_blacklist_domains():
+    with CachedSession(expire_after=timedelta(days=1)) as session:
+        response = session.get(BLACKLIST_DOMAINS_URL)
+        return set(response.text.split())
+
+
+def process_link(batch, crawled_page_domain, link, unknown_domain_multiplier, timestamp, url_scores, url_timestamps, url_users, is_extra: bool, blacklist_domains):
     parsed_link = urlparse(link)
-    if parsed_link.netloc in EXCLUDED_DOMAINS:
+    if parsed_link.netloc in EXCLUDED_DOMAINS or parsed_link.netloc in blacklist_domains:
+        logger.info(f"Excluding link for blacklisted domain: {parsed_link}")
         return
 
     extra_multiplier = EXTRA_LINK_MULTIPLIER if is_extra else 1.0
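The new get_blacklist_domains() above uses requests-cache so that the blacklist is fetched over the network at most once a day; within the 24-hour expiry window, repeat calls are served from the local HTTP cache. A minimal standalone sketch of the same pattern (the function body mirrors the diff; the __main__ harness is illustrative only):

from datetime import timedelta

from requests_cache import CachedSession

BLACKLIST_DOMAINS_URL = "https://get.domainsblacklists.com/blacklist.txt"


def get_blacklist_domains() -> set[str]:
    # Responses are cached on disk (http_cache.sqlite by default), so only
    # the first call in any 24-hour window performs a network request.
    with CachedSession(expire_after=timedelta(days=1)) as session:
        response = session.get(BLACKLIST_DOMAINS_URL)
        return set(response.text.split())


if __name__ == "__main__":
    domains = get_blacklist_domains()
    print(f"Loaded {len(domains)} blacklisted domains")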

mwmbl/settings.py

@@ -31,7 +31,7 @@ SCORE_FOR_DIFFERENT_DOMAIN = 1.0
 SCORE_FOR_SAME_DOMAIN = 0.01
 EXTRA_LINK_MULTIPLIER = 0.001
 UNKNOWN_DOMAIN_MULTIPLIER = 0.001
-EXCLUDED_DOMAINS = {'web.archive.org', 'forums.giantitp.com', 'www.crutchfield.com', 'plus.google.com'}
+EXCLUDED_DOMAINS = {'web.archive.org', 'forums.giantitp.com', 'www.crutchfield.com', 'plus.google.com', 'changeporn.com'}
 CORE_DOMAINS = {
     'github.com',
     'en.wikipedia.org',
@@ -42,3 +42,5 @@ CORE_DOMAINS = {
     'arxiv.org',
     'www.python.org',
 }
+
+BLACKLIST_DOMAINS_URL = "https://get.domainsblacklists.com/blacklist.txt"
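These constants weight how much each discovered link contributes to a URL's score. The hunks in the indexer diff above show the individual multipliers being chosen but not how they combine, so the multiplicative combination below is an assumption for illustration only (link_score is a hypothetical helper, not part of this commit):

SCORE_FOR_DIFFERENT_DOMAIN = 1.0
SCORE_FOR_SAME_DOMAIN = 0.01
EXTRA_LINK_MULTIPLIER = 0.001
UNKNOWN_DOMAIN_MULTIPLIER = 0.001


def link_score(same_domain: bool, known_source_domain: bool, is_extra: bool) -> float:
    # Assumed multiplicative combination of the constants above (hypothetical).
    base = SCORE_FOR_SAME_DOMAIN if same_domain else SCORE_FOR_DIFFERENT_DOMAIN
    domain_multiplier = 1.0 if known_source_domain else UNKNOWN_DOMAIN_MULTIPLIER
    extra_multiplier = EXTRA_LINK_MULTIPLIER if is_extra else 1.0
    return base * domain_multiplier * extra_multiplier


# Under this assumption, a cross-domain link found on a known domain scores
# 1.0, while an "extra" link found on an unknown domain scores
# 1.0 * 0.001 * 0.001 = 1e-06.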

poetry.lock (generated)

File diff suppressed because it is too large.

pyproject.toml

@@ -35,6 +35,7 @@ pyspark = {version= "==3.2.0", optional = true}
 Levenshtein = {version= "==0.16.0", optional = true}
 django = "^4.2.4"
 django-ninja = "^0.22.2"
+requests-cache = "^1.1.0"
 
 [tool.poetry.extras]
 indexer = [
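The requests-cache dependency added here is what backs the daily blacklist cache. A quick way to observe the caching behaviour (standalone illustrative snippet, not part of the commit; requests-cache sets a from_cache flag on the responses it returns):

from datetime import timedelta

from requests_cache import CachedSession

URL = "https://get.domainsblacklists.com/blacklist.txt"

with CachedSession("blacklist_cache", expire_after=timedelta(days=1)) as session:
    first = session.get(URL)
    second = session.get(URL)
    # The second response is served from the local cache, not the network.
    print(first.from_cache, second.from_cache)  # expected: False True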