Don't replace full stops and commas

This commit is contained in:
Daoud Clarke 2022-08-23 22:06:43 +01:00
parent 4779371cf3
commit 578b705609

View file

@@ -37,7 +37,7 @@ STOPWORDS = set("0,1,2,3,4,5,6,7,8,9,a,A,about,above,across,after,again,against,
def tokenize(input_text):
cleaned_text = input_text.encode('utf8', 'replace').decode('utf8')
-tokens = cleaned_text.lower().replace('.', ' ').replace(',', ' ').split()
+tokens = cleaned_text.lower().split()
# tokens = nlp.tokenizer(cleaned_text)
if input_text.endswith(''):
# Discard the last two tokens since there will likely be a word cut in two