42 lines
1.4 KiB
Python
42 lines
1.4 KiB
Python
import re
|
|
|
|
from mwmbl.tokenizer import tokenize, clean_unicode
|
|
|
|
|
|
def format_result_with_pattern(pattern, result):
    """Split a result's title and extract into plain and highlighted segments.

    Args:
        pattern: Regular expression source, matched case-insensitively
            against each piece of text. An empty pattern highlights nothing.
        result: Object exposing ``title``, ``extract`` and ``url`` attributes.

    Returns:
        A dict with ``'title'`` and ``'extract'`` keys, each holding a list
        of ``{'value': str, 'is_bold': bool}`` segments in text order, plus
        a ``'url'`` key copied from ``result.url``. Segments alternate
        between non-bold and bold, starting with a (possibly empty)
        non-bold one.
    """
    formatted_result = {}
    for content_type, content_raw in (('title', result.title), ('extract', result.extract)):
        # The text is passed through clean_unicode before matching, so the
        # spans below index into the cleaned text, not the raw attribute.
        content = clean_unicode(content_raw)

        # Flat list of cut points: 0, then (start, end) for every match,
        # then len(content). Consecutive pairs delimit the segments, and
        # because matches contribute exactly two points each, odd-numbered
        # segments are the matched (bold) ones.
        boundaries = [0]
        for match in re.finditer(pattern, content, re.IGNORECASE):
            start, end = match.span()
            # A zero-width match (empty pattern or empty alternative)
            # highlights nothing; keeping it would shred the text into
            # per-character segments separated by empty bold ones.
            if start == end:
                continue
            boundaries.extend((start, end))
        boundaries.append(len(content))

        formatted_result[content_type] = [
            {'value': content[start:end], 'is_bold': index % 2 == 1}
            for index, (start, end) in enumerate(zip(boundaries, boundaries[1:]))
        ]

    formatted_result['url'] = result.url
    return formatted_result
|
|
|
|
|
|
def get_query_regex(terms, is_complete, is_url):
    """Build a regex alternation that matches any of the given terms.

    Args:
        terms: Sequence of literal search terms; each one is regex-escaped.
        is_complete: When false, the last term gets no trailing word
            separator, so it can match as a prefix of a longer word.
        is_url: When true, terms are wrapped in ``\\b`` word boundaries;
            otherwise no separator is added around them.

    Returns:
        The terms joined with ``|``, or an empty string when ``terms`` is
        empty.
    """
    if not terms:
        return ''

    boundary = r'\b' if is_url else ''
    escaped_terms = [re.escape(term) for term in terms]
    alternatives = [rf'{boundary}{escaped}{boundary}' for escaped in escaped_terms]

    if not is_complete:
        # The final term may still be being typed: leave its right side open.
        alternatives[-1] = rf'{boundary}{escaped_terms[-1]}'

    return '|'.join(alternatives)
|
|
|
|
|
|
def format_result(result, query):
    """Format a result with the terms of ``query`` marked as bold segments.

    The query is tokenized with ``tokenize``, turned into a pattern by
    ``get_query_regex`` (treated as a complete, non-URL query), and the
    formatting itself is delegated to ``format_result_with_pattern``.
    """
    query_pattern = get_query_regex(tokenize(query), True, False)
    return format_result_with_pattern(query_pattern, result)
|
|
|