Analyse crawled URLs and domains

Daoud Clarke 2022-01-26 18:51:58 +00:00
parent 171fa645d2
commit 70254ae160
2 changed files with 47 additions and 0 deletions


@@ -0,0 +1,42 @@
"""
See how many unique URLs and root domains we have crawled.
"""
import glob
import gzip
import json
from urllib.parse import urlparse
CRAWL_GLOB = "../../data/mwmbl/b2/*/*/*/*/*/*.json.gz"
def get_urls():
for path in glob.glob(CRAWL_GLOB):
data = json.load(gzip.open(path))
for item in data['items']:
yield item['url']
def analyse_urls(urls):
url_set = set()
domains = set()
count = 0
for url in urls:
count += 1
url_set.add(url)
parsed_url = urlparse(url)
path = parsed_url.path.strip('/')
if path == '':
domains.add(parsed_url.netloc)
print("Root pages crawled", sorted(domains))
print(f"Got {len(url_set)} URLs and {len(domains)} root pages from {count} items")
def run():
urls = get_urls()
analyse_urls(urls)
if __name__ == '__main__':
run()

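Note: get_urls assumes each gzipped batch file is a JSON document with an 'items' list whose entries each carry a 'url' field. The batch schema itself is not part of this commit, so the snippet below is only a sketch under that assumption, showing how a fabricated batch could be written out and what the analysis would report for it.

# Sketch only: the batch structure ({"items": [{"url": ...}, ...]}) and the
# output file name are assumptions for illustration, not part of this commit.
import gzip
import json

fake_batch = {
    "items": [
        {"url": "https://example.com/"},         # root page (empty path)
        {"url": "https://example.com/about"},    # inner page
        {"url": "https://another-example.org/"}, # root page
    ]
}

with gzip.open("example-batch.json.gz", "wt") as batch_file:
    json.dump(fake_batch, batch_file)

# With CRAWL_GLOB pointed at this file, run() would report 3 URLs and
# 2 root pages from 3 items.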

@@ -0,0 +1,5 @@
"""
Index data crawled through the Mwmbl crawler.
"""