131 lines
4.5 KiB
Python
from dataclasses import dataclass
|
|
from datetime import datetime
|
|
from itertools import groupby
|
|
from urllib.parse import urlparse, parse_qs, urlencode, urlunparse, ParseResult
|
|
|
|
import justext
|
|
import requests
|
|
from django.contrib.auth.decorators import login_required
|
|
from django.shortcuts import render
|
|
from django_htmx.http import push_url
|
|
|
|
from mwmbl.format import format_result
|
|
from mwmbl.models import UserCuration, MwmblUser
|
|
from mwmbl.search_setup import ranker
|
|
|
|
from justext.core import html_to_dom, ParagraphMaker, classify_paragraphs, revise_paragraph_classification, \
|
|
LENGTH_LOW_DEFAULT, STOPWORDS_LOW_DEFAULT, MAX_LINK_DENSITY_DEFAULT, NO_HEADINGS_DEFAULT, LENGTH_HIGH_DEFAULT, \
|
|
STOPWORDS_HIGH_DEFAULT, MAX_HEADING_DISTANCE_DEFAULT, DEFAULT_ENCODING, DEFAULT_ENC_ERRORS, preprocessor
|
|
|
|
from mwmbl.settings import NUM_EXTRACT_CHARS
|
|
from mwmbl.tinysearchengine.indexer import Document
|
|
from django.conf import settings
|
|
|
|
|
|
def justext_with_dom(html_text, stoplist, length_low=LENGTH_LOW_DEFAULT,
                     length_high=LENGTH_HIGH_DEFAULT, stopwords_low=STOPWORDS_LOW_DEFAULT,
                     stopwords_high=STOPWORDS_HIGH_DEFAULT, max_link_density=MAX_LINK_DENSITY_DEFAULT,
                     max_heading_distance=MAX_HEADING_DISTANCE_DEFAULT, no_headings=NO_HEADINGS_DEFAULT,
                     encoding=None, default_encoding=DEFAULT_ENCODING,
                     enc_errors=DEFAULT_ENC_ERRORS):
    """
    Convert an HTML page into a list of classified paragraphs.

    Each paragraph is an instance of ``justext.paragraph.Paragraph``.
    This mirrors ``justext.justext`` but additionally extracts the page
    <title> from the DOM before it is run through the preprocessor.

    Returns:
        A ``(paragraphs, title)`` tuple; ``title`` is ``None`` when the
        document contains no <title> element.
    """
    dom = html_to_dom(html_text, default_encoding, encoding, enc_errors)

    # Read the title before the DOM is transformed by the preprocessor.
    title_elements = dom.xpath("//title")
    title = title_elements[0].text if title_elements else None

    dom = preprocessor(dom)
    paragraphs = ParagraphMaker.make_paragraphs(dom)

    classify_paragraphs(paragraphs, stoplist, length_low, length_high,
                        stopwords_low, stopwords_high, max_link_density, no_headings)
    revise_paragraph_classification(paragraphs, max_heading_distance)

    return paragraphs, title
|
|
|
|
|
|
def index(request):
    """Render the full search page (results or activity feed, plus chrome)."""
    activity, query, results = _get_results_and_activity(request)
    context = {
        "results": results,
        "query": query,
        "user": request.user,
        "activity": activity,
        "footer_links": settings.FOOTER_LINKS,
    }
    return render(request, "index.html", context)
|
|
|
|
|
|
def home_fragment(request):
    """Render only the home fragment, for htmx partial-page updates.

    Also sets the ``HX-Replace-Url`` response header so that htmx updates
    the browser address bar to reflect the current query.
    """
    activity, query, results = _get_results_and_activity(request)
    response = render(request, "home.html", {
        "results": results,
        "query": query,
        "activity": activity,
    })

    # Mirror the query in the address bar via htmx URL replacement.
    new_url = "/?" + urlencode({"q": query}, doseq=True) if query else "/"
    response["HX-Replace-Url"] = new_url
    return response
|
|
|
|
|
|
@dataclass
class Activity:
    """A summary of one user's recent curation activity for a single URL."""

    # Username of the curating user. The caller (_get_results_and_activity)
    # groups by ``x.user.username`` and passes that key here, so this is the
    # username string, not a MwmblUser instance (the previous annotation of
    # MwmblUser was incorrect).
    user: str
    # Number of curation events in this (user, url) group.
    num_curations: int
    # Timestamp of the most recent curation in the group.
    timestamp: datetime
    # Search query extracted from the ``q`` parameter of the curated URL.
    query: str
    # URL the curations were made against.
    url: str
|
|
|
|
|
|
def _get_results_and_activity(request):
    """Run the search for ``?q=...`` or build the recent-activity feed.

    Returns:
        An ``(activity, query, results)`` tuple. When a query is present,
        ``results`` comes from the ranker and ``activity`` is ``None``;
        otherwise ``results`` is ``None`` and ``activity`` summarises the
        100 most recent user curations, newest first.
    """
    query = request.GET.get("q")
    if query:
        results = ranker.search(query)
        activity = None
    else:
        results = None
        curations = UserCuration.objects.order_by("-timestamp")[:100]
        # itertools.groupby only merges *adjacent* items, so the sort key
        # must match the grouping key exactly. Sorting by username alone
        # (as this previously did) could split a single (user, url) pair
        # into several groups when one user's curations over different
        # URLs were interleaved in timestamp order.
        sorted_curations = sorted(curations, key=lambda x: (x.user.username, x.url))
        groups = groupby(sorted_curations, key=lambda x: (x.user.username, x.url))
        unsorted_activity = []
        for (user, url), group in groups:
            # The curated URL carries the original search in its ``q`` param.
            parsed_url_query = parse_qs(urlparse(url).query)
            activity_query = parsed_url_query.get("q", [""])[0]
            group = list(group)
            unsorted_activity.append(Activity(
                user=user,
                num_curations=len(group),
                timestamp=max(i.timestamp for i in group),
                query=activity_query,
                url=url,
            ))

        activity = sorted(unsorted_activity, key=lambda a: a.timestamp, reverse=True)
    return activity, query, results
|
|
|
|
|
|
def fetch_url(request):
    """Fetch a URL, extract its readable text, and render it as a result card.

    Query params:
        url: the page to fetch.
        query: the original search query, passed through for highlighting.
    """
    url = request.GET["url"]
    query = request.GET["query"]
    # requests applies no default timeout; without one a slow or
    # unresponsive remote host would hang this request handler forever.
    response = requests.get(url, timeout=10)

    paragraphs, title = justext_with_dom(response.content, justext.get_stoplist("English"))
    good_paragraphs = [p for p in paragraphs if p.class_type == 'good']

    extract = ' '.join(p.text for p in good_paragraphs)
    if len(extract) > NUM_EXTRACT_CHARS:
        # Truncate to the configured length, ending with an ellipsis.
        extract = extract[:NUM_EXTRACT_CHARS - 1] + '…'

    result = Document(title=title, url=url, extract=extract, score=0.0)
    return render(request, "result.html", {
        "result": format_result(result, query),
    })
|