Clean unicode when formatting result

2023-05-20 22:11:51 +01:00 · 2023-05-20 22:11:51 +01:00 · b5b37629ce
parent dec7c4853d
commit b5b37629ce
4 changed files with 9 additions and 9 deletions
--- a/mwmbl/format.py
+++ b/mwmbl/format.py
@ -1,11 +1,12 @@
 import re

-from mwmbl.tokenizer import tokenize
+from mwmbl.tokenizer import tokenize, clean_unicode


 def format_result_with_pattern(pattern, result):
    formatted_result = {}
-    for content_type, content in [('title', result.title), ('extract', result.extract)]:
+    for content_type, content_raw in [('title', result.title), ('extract', result.extract)]:
+        content = clean_unicode(content_raw)
        matches = re.finditer(pattern, content, re.IGNORECASE)
        all_spans = [0] + sum((list(m.span()) for m in matches), []) + [len(content)]
        content_result = []
--- a/mwmbl/indexer/index_batches.py
+++ b/mwmbl/indexer/index_batches.py
@ -89,8 +89,3 @@ def get_url_error_status(item: Item):
        elif item.error.name == 'RobotsDenied':
            return URLStatus.ERROR_ROBOTS_DENIED
    return URLStatus.ERROR_OTHER
-
-
-# TODO: clean unicode at some point
-def clean_unicode(s: str) -> str:
-    return s.encode('utf-8', 'ignore').decode('utf-8')
--- a/mwmbl/tokenizer.py
+++ b/mwmbl/tokenizer.py
@ -1,5 +1,5 @@
 def tokenize(input_text):
-    cleaned_text = input_text.encode('utf8', 'replace').decode('utf8')
+    cleaned_text = clean_unicode(input_text)
    tokens = cleaned_text.lower().split()
    if input_text.endswith('…'):
        # Discard the last two tokens since there will likely be a word cut in two
@ -11,3 +11,7 @@ def get_bigrams(num_bigrams, tokens):
    num_bigrams = min(num_bigrams, len(tokens) - 1)
    bigrams = [f'{tokens[i]} {tokens[i + 1]}' for i in range(num_bigrams)]
    return bigrams
+
+
+def clean_unicode(s: str) -> str:
+    return s.encode('utf-8', errors='ignore').decode('utf-8')
--- a/test/test_indexdb.py
+++ b/test/test_indexdb.py
@ -1,4 +1,4 @@
-from mwmbl.indexer.index_batches import clean_unicode
+from mwmbl.tokenizer import clean_unicode


 def test_clean_unicode():