diff --git a/mwmbl/format.py b/mwmbl/format.py
index cb30c30..ad8f2ef 100644
--- a/mwmbl/format.py
+++ b/mwmbl/format.py
@@ -1,11 +1,12 @@
 import re
 
-from mwmbl.tokenizer import tokenize
+from mwmbl.tokenizer import tokenize, clean_unicode
 
 
 def format_result_with_pattern(pattern, result):
     formatted_result = {}
-    for content_type, content in [('title', result.title), ('extract', result.extract)]:
+    for content_type, content_raw in [('title', result.title), ('extract', result.extract)]:
+        content = clean_unicode(content_raw)
         matches = re.finditer(pattern, content, re.IGNORECASE)
         all_spans = [0] + sum((list(m.span()) for m in matches), []) + [len(content)]
         content_result = []
diff --git a/mwmbl/indexer/index_batches.py b/mwmbl/indexer/index_batches.py
index 24880bb..a6e0488 100644
--- a/mwmbl/indexer/index_batches.py
+++ b/mwmbl/indexer/index_batches.py
@@ -89,8 +89,3 @@ def get_url_error_status(item: Item):
         elif item.error.name == 'RobotsDenied':
             return URLStatus.ERROR_ROBOTS_DENIED
     return URLStatus.ERROR_OTHER
-
-
-# TODO: clean unicode at some point
-def clean_unicode(s: str) -> str:
-    return s.encode('utf-8', 'ignore').decode('utf-8')
\ No newline at end of file
diff --git a/mwmbl/tokenizer.py b/mwmbl/tokenizer.py
index c695a0d..0789de3 100644
--- a/mwmbl/tokenizer.py
+++ b/mwmbl/tokenizer.py
@@ -1,5 +1,5 @@
 def tokenize(input_text):
-    cleaned_text = input_text.encode('utf8', 'replace').decode('utf8')
+    cleaned_text = clean_unicode(input_text)
     tokens = cleaned_text.lower().split()
     if input_text.endswith('…'):
         # Discard the last two tokens since there will likely be a word cut in two
@@ -11,3 +11,7 @@ def get_bigrams(num_bigrams, tokens):
     num_bigrams = min(num_bigrams, len(tokens) - 1)
     bigrams = [f'{tokens[i]} {tokens[i + 1]}' for i in range(num_bigrams)]
     return bigrams
+
+
+def clean_unicode(s: str) -> str:  # Return s with any characters that cannot be encoded as UTF-8 removed.
+    return s.encode('utf-8', errors='ignore').decode('utf-8')  # errors='ignore' drops unencodable characters instead of raising
diff --git a/test/test_indexdb.py b/test/test_indexdb.py
index 5409749..14b1a3c 100644
--- a/test/test_indexdb.py
+++ b/test/test_indexdb.py
@@ -1,4 +1,4 @@
-from mwmbl.indexer.index_batches import clean_unicode
+from mwmbl.tokenizer import clean_unicode
 
 
 def test_clean_unicode():