Create index

Daoud Clarke 2021-03-13 22:21:50 +00:00
parent b1bfe1cdd4
commit 9815372297
3 changed files with 90 additions and 11 deletions

View file

@@ -4,10 +4,11 @@ Crawl the web
import gzip
import hashlib
import os
import sys
from traceback import print_tb, print_exc
import pandas as pd
import requests
import justext
from paths import DATA_DIR, HN_TOP_PATH, CRAWL_PREFIX
@@ -16,15 +17,23 @@ def crawl():
    data = pd.read_csv(HN_TOP_PATH)
    for url in data['url']:
        print("Fetching", url)
        html = fetch(url)
        filename = hashlib.md5(url.encode('utf8')).hexdigest()
        path = os.path.join(DATA_DIR, f"{CRAWL_PREFIX}{filename}.html.gz")
        if os.path.isfile(path):
            print("Path already exists, skipping")
            print("Path already exists, skipping", url)
            continue
        with gzip.open(path, 'w') as output:
            output.write(html.encode('utf8'))
        print("Fetching", url)
        try:
            html = fetch(url)
        except Exception:
            print_exc(file=sys.stderr)
            print("Unable to fetch", url)
            continue
        with gzip.open(path, 'wt') as output:
            output.write(url + '\n')
            output.write(html)

def fetch(url):
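With this change, each crawl file stores the page URL on its first line followed by the raw HTML, all gzip-compressed as text. A minimal sketch of reading such a file back, assuming a hypothetical example path (real files are named with the md5 of the URL under DATA_DIR, as in the loop above); this is illustrative only and mirrors what the indexer below expects:

import gzip

# Hypothetical path for illustration; real crawl files live under DATA_DIR
# and are named f"{CRAWL_PREFIX}{md5_of_url}.html.gz".
example_path = "crawl_0123abcd.html.gz"

with gzip.open(example_path, 'rt') as crawl_file:
    url = crawl_file.readline().strip()  # first line: the source URL
    html = crawl_file.read()             # remainder: the page HTML
print(url, len(html))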

View file

@@ -2,12 +2,16 @@
Create a search index
"""
import gzip
import sqlite3
from glob import glob
import bs4
import justext
from spacy.lang.en import English
from paths import CRAWL_GLOB
from paths import CRAWL_GLOB, INDEX_PATH
NUM_INITIAL_TOKENS = 50
def is_content_token(nlp, token):
@@ -17,7 +21,8 @@ def is_content_token(nlp, token):
def tokenize(nlp, cleaned_text):
    tokens = nlp.tokenizer(cleaned_text)
    content_tokens = [token for token in tokens if is_content_token(nlp, token)]
    content_tokens = [token for token in tokens[:NUM_INITIAL_TOKENS]
                      if is_content_token(nlp, token)]
    lowered = {nlp.vocab[token.orth].text.lower() for token in content_tokens}
    return lowered
@@ -29,15 +34,79 @@ def clean(content):
    return cleaned_text


def index(tokens, url, title):
    with sqlite3.connect(INDEX_PATH) as con:
        con.execute("""
            INSERT INTO pages (url, title)
            VALUES (?, ?)
        """, (url, title))
        result = con.execute("""
            SELECT last_insert_rowid()
        """)
        page_id = result.fetchone()[0]
        print("Created page with id", page_id)
        con.executemany("""
            INSERT INTO terms (term, page_id)
            VALUES (?, ?)
        """, [(term, page_id) for term in tokens])


def create_if_not_exists():
    con = sqlite3.connect(INDEX_PATH)
    con.execute("""
        CREATE TABLE IF NOT EXISTS pages (
            id INTEGER PRIMARY KEY,
            url TEXT UNIQUE,
            title TEXT
        )
    """)
    con.execute("""
        CREATE TABLE IF NOT EXISTS terms (
            term TEXT,
            page_id INTEGER
        )
    """)
    con.execute("""
        CREATE INDEX IF NOT EXISTS term_index ON terms (term)
    """)


def page_indexed(url):
    con = sqlite3.connect(INDEX_PATH)
    result = con.execute("""
        SELECT EXISTS(SELECT 1 FROM pages WHERE url=?)
    """, (url,))
    value = result.fetchone()[0]
    return value == 1


def run():
    create_if_not_exists()
    nlp = English()
    for path in glob(CRAWL_GLOB):
        with gzip.open(path) as html_file:
            content = html_file.read().decode("utf8")
        print("Path", path)
        with gzip.open(path, 'rt') as html_file:
            url = html_file.readline().strip()
            content = html_file.read()
        if page_indexed(url):
            print("Page exists, skipping", url)
            continue
        cleaned_text = clean(content)
        try:
            title = bs4.BeautifulSoup(content, features="lxml").find('title').string
        except AttributeError:
            title = cleaned_text[:80]
        tokens = tokenize(nlp, cleaned_text)
        print("URL", url)
        print("Tokens", tokens)
        break
        print("Title", title)
        index(tokens, url, title)


if __name__ == '__main__':
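For context, the pages and terms tables created above, together with term_index on terms.term, are enough to answer simple one-term lookups. A minimal sketch of how the index might be queried; the search() helper is hypothetical and not part of this commit:

import sqlite3

from paths import INDEX_PATH


def search(term):
    # Find pages whose indexed terms include the given token,
    # matching the lower-cased terms stored by index().
    con = sqlite3.connect(INDEX_PATH)
    result = con.execute("""
        SELECT pages.url, pages.title
        FROM terms JOIN pages ON pages.id = terms.page_id
        WHERE terms.term = ?
    """, (term.lower(),))
    return result.fetchall()


print(search("python"))  # example usage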

View file

@@ -5,3 +5,4 @@ DATA_DIR = os.path.join(HOME, 'data', 'tinysearch')
HN_TOP_PATH = os.path.join(DATA_DIR, 'hn-top.csv')
CRAWL_PREFIX = 'crawl_'
CRAWL_GLOB = os.path.join(DATA_DIR, f"{CRAWL_PREFIX}*")
INDEX_PATH = os.path.join(DATA_DIR, 'index.sqlite3')