mwmbl/analyse/analyse_crawled_domains.py

"""
See how many unique URLs and root domains we have crawled.
"""
import glob
import gzip
import json
from collections import defaultdict, Counter
from urllib.parse import urlparse

from mwmbl.crawler import HashedBatch
from mwmbl.indexer import CRAWL_GLOB, MWMBL_DATA_DIR

# TODO: remove this line - temporary override
CRAWL_GLOB = str(MWMBL_DATA_DIR / "b2") + "/*/*/2022-06-23/*/*/*.json.gz"


def get_urls():
    """Yield (user_id_hash, url) pairs for every link found in the crawled batches."""
    for path in glob.glob(CRAWL_GLOB):
        data = json.load(gzip.open(path))
        batch = HashedBatch.parse_obj(data)
        user = batch.user_id_hash
        for item in batch.items:
            if item.content is not None:
                for url in item.content.links:
                    yield user, url
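
# For reference only: a crawl batch file is assumed (not verified here) to decode
# to roughly this shape before HashedBatch.parse_obj validates it:
#
#     {
#         "user_id_hash": "...",
#         "items": [{"content": {"links": ["https://example.com/", "..."]}}]
#     }
#
# Only user_id_hash, items[].content and items[].content.links are read above;
# any other fields in the batch are ignored by this script.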


def analyse_urls(urls):
    """Collect which users crawled each URL and which root domains were reached."""
    url_set = defaultdict(list)
    domains = set()
    for user, url in urls:
        url_set[url].append(user)

        # A URL with an empty path is a root page; record its domain.
        parsed_url = urlparse(url)
        path = parsed_url.path.strip('/')
        if path == '':
            domains.add(parsed_url.netloc)

    count = sum(len(x) for x in url_set.values())

    print("Root pages crawled", sorted(domains))
    find_worst_pages(url_set)

    print(f"Got {len(url_set)} URLs and {len(domains)} root pages from {count} items")

    url_list_size = len(json.dumps(list(url_set.keys())))
    print("Length of all URLs", url_list_size)


def find_worst_pages(url_set):
    """Print the 50 URLs that were submitted most often, with per-user counts."""
    worst = sorted(((len(users), url) for url, users in url_set.items()), reverse=True)[:50]
    for count, url in worst:
        print("Worst", count, url, Counter(url_set[url]))


def run():
    urls = get_urls()
    analyse_urls(urls)


if __name__ == '__main__':
    run()
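
# Usage sketch (assumptions: the mwmbl package is importable and MWMBL_DATA_DIR
# contains batch files matching the CRAWL_GLOB override above):
#
#     python -m mwmbl.analyse.analyse_crawled_domains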