Analyse crawled URLs and domains

Daoud Clarke 2022-01-26 18:51:58 +00:00
parent 171fa645d2
commit 70254ae160
2 changed files with 47 additions and 0 deletions


@@ -0,0 +1,42 @@
"""
See how many unique URLs and root domains we have crawled.
"""
import glob
import gzip
import json
from urllib.parse import urlparse
CRAWL_GLOB = "../../data/mwmbl/b2/*/*/*/*/*/*.json.gz"
def get_urls():
for path in glob.glob(CRAWL_GLOB):
data = json.load(gzip.open(path))
for item in data['items']:
yield item['url']
def analyse_urls(urls):
url_set = set()
domains = set()
count = 0
for url in urls:
count += 1
url_set.add(url)
parsed_url = urlparse(url)
path = parsed_url.path.strip('/')
if path == '':
domains.add(parsed_url.netloc)
print("Root pages crawled", sorted(domains))
print(f"Got {len(url_set)} URLs and {len(domains)} root pages from {count} items")
def run():
urls = get_urls()
analyse_urls(urls)
if __name__ == '__main__':
run()

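Note: get_urls assumes each gzipped batch file is a JSON document with an 'items' list whose entries each carry a 'url' field. The batch schema itself is not part of this commit, so the snippet below is only a sketch under that assumption, showing how a fabricated batch could be written out and what the analysis would report for it.

# Sketch only: the batch structure ({"items": [{"url": ...}, ...]}) and the
# output file name are assumptions for illustration, not part of this commit.
import gzip
import json

fake_batch = {
    "items": [
        {"url": "https://example.com/"},         # root page (empty path)
        {"url": "https://example.com/about"},    # inner page
        {"url": "https://another-example.org/"}, # root page
    ]
}

with gzip.open("example-batch.json.gz", "wt") as batch_file:
    json.dump(fake_batch, batch_file)

# With CRAWL_GLOB pointed at this file, run() would report 3 URLs and
# 2 root pages from 3 items.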

@@ -0,0 +1,5 @@
"""
Index data crawled through the Mwmbl crawler.
"""