diff --git a/mwmbl/format.py b/mwmbl/format.py index cb30c30..ad8f2ef 100644 --- a/mwmbl/format.py +++ b/mwmbl/format.py @@ -1,11 +1,12 @@ import re -from mwmbl.tokenizer import tokenize +from mwmbl.tokenizer import tokenize, clean_unicode def format_result_with_pattern(pattern, result): formatted_result = {} - for content_type, content in [('title', result.title), ('extract', result.extract)]: + for content_type, content_raw in [('title', result.title), ('extract', result.extract)]: + content = clean_unicode(content_raw) matches = re.finditer(pattern, content, re.IGNORECASE) all_spans = [0] + sum((list(m.span()) for m in matches), []) + [len(content)] content_result = [] diff --git a/mwmbl/indexer/index_batches.py b/mwmbl/indexer/index_batches.py index 24880bb..a6e0488 100644 --- a/mwmbl/indexer/index_batches.py +++ b/mwmbl/indexer/index_batches.py @@ -89,8 +89,3 @@ def get_url_error_status(item: Item): elif item.error.name == 'RobotsDenied': return URLStatus.ERROR_ROBOTS_DENIED return URLStatus.ERROR_OTHER - - -# TODO: clean unicode at some point -def clean_unicode(s: str) -> str: - return s.encode('utf-8', 'ignore').decode('utf-8') \ No newline at end of file diff --git a/mwmbl/tokenizer.py b/mwmbl/tokenizer.py index c695a0d..0789de3 100644 --- a/mwmbl/tokenizer.py +++ b/mwmbl/tokenizer.py @@ -1,5 +1,5 @@ def tokenize(input_text): - cleaned_text = input_text.encode('utf8', 'replace').decode('utf8') + cleaned_text = clean_unicode(input_text) tokens = cleaned_text.lower().split() if input_text.endswith('…'): # Discard the last two tokens since there will likely be a word cut in two @@ -11,3 +11,7 @@ def get_bigrams(num_bigrams, tokens): num_bigrams = min(num_bigrams, len(tokens) - 1) bigrams = [f'{tokens[i]} {tokens[i + 1]}' for i in range(num_bigrams)] return bigrams + + +def clean_unicode(s: str) -> str: + return s.encode('utf-8', errors='ignore').decode('utf-8') diff --git a/test/test_indexdb.py b/test/test_indexdb.py index 5409749..14b1a3c 100644 --- a/test/test_indexdb.py +++ b/test/test_indexdb.py @@ -1,4 +1,4 @@ -from mwmbl.indexer.index_batches import clean_unicode +from mwmbl.tokenizer import clean_unicode def test_clean_unicode():