Initial commit

Daoud Clarke 2021-03-13 20:54:15 +00:00
commit b1bfe1cdd4
3 changed files with 87 additions and 0 deletions

36 crawl.py Normal file

@@ -0,0 +1,36 @@
"""
Crawl the web
"""
import gzip
import hashlib
import os
import pandas as pd
import requests
import justext
from paths import DATA_DIR, HN_TOP_PATH, CRAWL_PREFIX
def crawl():
data = pd.read_csv(HN_TOP_PATH)
for url in data['url']:
print("Fetching", url)
html = fetch(url)
filename = hashlib.md5(url.encode('utf8')).hexdigest()
path = os.path.join(DATA_DIR, f"{CRAWL_PREFIX}{filename}.html.gz")
if os.path.isfile(path):
print("Path already exists, skipping")
with gzip.open(path, 'w') as output:
output.write(html.encode('utf8'))
def fetch(url):
page_data = requests.get(url)
return page_data.text
if __name__ == '__main__':
crawl()
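
crawl.py assumes HN_TOP_PATH points at a CSV with a 'url' column, since crawl() reads data['url']. A minimal sketch of producing such a file, using placeholder URLs (not real Hacker News data):

# Hypothetical helper, not part of this commit: write a sample hn-top.csv
# so crawl.py has input. Only the 'url' column is required.
import pandas as pd

from paths import HN_TOP_PATH

sample = pd.DataFrame({'url': [
    'https://example.com/',        # placeholder URLs for illustration
    'https://example.org/page',
]})
sample.to_csv(HN_TOP_PATH, index=False)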

44 index.py Normal file

@@ -0,0 +1,44 @@
"""
Create a search index
"""
import gzip
from glob import glob
import justext
from spacy.lang.en import English
from paths import CRAWL_GLOB
def is_content_token(nlp, token):
lexeme = nlp.vocab[token.orth]
return lexeme.is_alpha and not token.is_stop
def tokenize(nlp, cleaned_text):
tokens = nlp.tokenizer(cleaned_text)
content_tokens = [token for token in tokens if is_content_token(nlp, token)]
lowered = {nlp.vocab[token.orth].text.lower() for token in content_tokens}
return lowered
def clean(content):
text = justext.justext(content, justext.get_stoplist("English"))
pars = [par.text for par in text if not par.is_boilerplate]
cleaned_text = ' '.join(pars)
return cleaned_text
def run():
nlp = English()
for path in glob(CRAWL_GLOB):
with gzip.open(path) as html_file:
content = html_file.read().decode("utf8")
cleaned_text = clean(content)
tokens = tokenize(nlp, cleaned_text)
print("Tokens", tokens)
break
if __name__ == '__main__':
run()
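
run() stops after printing the tokens of the first crawled page. A minimal sketch of where this is heading, assuming a simple in-memory inverted index; build_index is a hypothetical name, not part of this commit:

# Hypothetical extension: map each token to the set of crawled files that
# contain it, reusing the clean() and tokenize() helpers defined above.
from collections import defaultdict

def build_index():
    nlp = English()
    index = defaultdict(set)
    for path in glob(CRAWL_GLOB):
        with gzip.open(path) as html_file:
            content = html_file.read().decode("utf8")
        for token in tokenize(nlp, clean(content)):
            index[token].add(path)
    return index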

7 paths.py Normal file

@@ -0,0 +1,7 @@
import os

# Shared locations for crawl data, used by crawl.py and index.py.
HOME = os.getenv('HOME')
DATA_DIR = os.path.join(HOME, 'data', 'tinysearch')
HN_TOP_PATH = os.path.join(DATA_DIR, 'hn-top.csv')
CRAWL_PREFIX = 'crawl_'
CRAWL_GLOB = os.path.join(DATA_DIR, f"{CRAWL_PREFIX}*")
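
Nothing in these three files creates DATA_DIR, and both pd.read_csv and gzip.open fail if it is missing. A one-time setup sketch, assuming the directory does not exist yet:

# Create the data directory before placing hn-top.csv inside it and
# running crawl.py.
import os

from paths import DATA_DIR

os.makedirs(DATA_DIR, exist_ok=True)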