Clean unicode when formatting result

This commit is contained in:
Daoud Clarke 2023-05-20 22:11:51 +01:00
parent dec7c4853d
commit b5b37629ce
4 changed files with 9 additions and 9 deletions

View File

@ -1,11 +1,12 @@
import re
from mwmbl.tokenizer import tokenize
from mwmbl.tokenizer import tokenize, clean_unicode
def format_result_with_pattern(pattern, result):
formatted_result = {}
for content_type, content in [('title', result.title), ('extract', result.extract)]:
for content_type, content_raw in [('title', result.title), ('extract', result.extract)]:
content = clean_unicode(content_raw)
matches = re.finditer(pattern, content, re.IGNORECASE)
all_spans = [0] + sum((list(m.span()) for m in matches), []) + [len(content)]
content_result = []

View File

@ -89,8 +89,3 @@ def get_url_error_status(item: Item):
elif item.error.name == 'RobotsDenied':
return URLStatus.ERROR_ROBOTS_DENIED
return URLStatus.ERROR_OTHER
# TODO: clean unicode at some point
def clean_unicode(s: str) -> str:
    """Return ``s`` with any characters that cannot be UTF-8 encoded removed.

    The text is round-tripped through UTF-8 using the ``ignore`` error
    handler, so code points the codec rejects (for example lone surrogates)
    are silently dropped; everything else is returned unchanged.
    """
    encodable_bytes = s.encode('utf-8', 'ignore')
    return encodable_bytes.decode('utf-8')

View File

@ -1,5 +1,5 @@
def tokenize(input_text):
cleaned_text = input_text.encode('utf8', 'replace').decode('utf8')
cleaned_text = clean_unicode(input_text)
tokens = cleaned_text.lower().split()
if input_text.endswith(''):
# Discard the last two tokens since there will likely be a word cut in two
@ -11,3 +11,7 @@ def get_bigrams(num_bigrams, tokens):
num_bigrams = min(num_bigrams, len(tokens) - 1)
bigrams = [f'{tokens[i]} {tokens[i + 1]}' for i in range(num_bigrams)]
return bigrams
def clean_unicode(s: str) -> str:
    """Strip characters from ``s`` that have no valid UTF-8 encoding.

    Encoding with ``errors='ignore'`` discards anything the UTF-8 codec
    cannot represent (such as unpaired surrogates); decoding the resulting
    bytes yields a string that is safe to encode again later.
    """
    # Encode first so the codec can drop un-encodable code points.
    utf8_payload = s.encode('utf-8', errors='ignore')
    cleaned = utf8_payload.decode('utf-8')
    return cleaned

View File

@ -1,4 +1,4 @@
from mwmbl.indexer.index_batches import clean_unicode
from mwmbl.tokenizer import clean_unicode
def test_clean_unicode():