mwmbl/analyse/analyse_crawled_domains.py

"""
See how many unique URLs and root domains we have crawled.
"""
import glob
import gzip
import json
from collections import defaultdict, Counter
from urllib.parse import urlparse

from mwmbl.crawler import HashedBatch
from mwmbl.indexer import CRAWL_GLOB, MWMBL_DATA_DIR

# TODO: remove this line - temporary override
CRAWL_GLOB = str(MWMBL_DATA_DIR / "b2") + "/*/*/2022-06-23/*/*/*.json.gz"


def get_urls():
    """Yield (user_id_hash, url) pairs for every link found in the crawled batches."""
    for path in glob.glob(CRAWL_GLOB):
        data = json.load(gzip.open(path))
        batch = HashedBatch.parse_obj(data)
        user = batch.user_id_hash
        for item in batch.items:
            if item.content is not None:
                for url in item.content.links:
                    yield user, url
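
# For reference only: a crawl batch file is assumed (not verified here) to decode
# to roughly this shape before HashedBatch.parse_obj validates it:
#
#     {
#         "user_id_hash": "...",
#         "items": [{"content": {"links": ["https://example.com/", "..."]}}]
#     }
#
# Only user_id_hash, items[].content and items[].content.links are read above;
# any other fields in the batch are ignored by this script.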


def analyse_urls(urls):
    """Collect which users crawled each URL and which root domains were reached."""
    url_set = defaultdict(list)
    domains = set()
    for user, url in urls:
        url_set[url].append(user)

        # A URL with an empty path is a root page; record its domain.
        parsed_url = urlparse(url)
        path = parsed_url.path.strip('/')
        if path == '':
            domains.add(parsed_url.netloc)

    count = sum(len(x) for x in url_set.values())

    print("Root pages crawled", sorted(domains))
    find_worst_pages(url_set)

    print(f"Got {len(url_set)} URLs and {len(domains)} root pages from {count} items")

    url_list_size = len(json.dumps(list(url_set.keys())))
    print("Length of all URLs", url_list_size)


def find_worst_pages(url_set):
    """Print the 50 URLs that were submitted most often, with per-user counts."""
    worst = sorted(((len(users), url) for url, users in url_set.items()), reverse=True)[:50]
    for count, url in worst:
        print("Worst", count, url, Counter(url_set[url]))


def run():
    urls = get_urls()
    analyse_urls(urls)


if __name__ == '__main__':
    run()
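
# Usage sketch (assumptions: the mwmbl package is importable and MWMBL_DATA_DIR
# contains batch files matching the CRAWL_GLOB override above):
#
#     python -m mwmbl.analyse.analyse_crawled_domains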