From 3307baac4d2e530ab1be7e447fd682ab94dda10f Mon Sep 17 00:00:00 2001 From: Brian Huisman Date: Thu, 22 Jun 2023 15:35:40 -0400 Subject: [PATCH] Update crawler.php Run mb_convert_encoding in ALL cases (even when the charset is already UTF-8) to remove potentially invalid UTF-8 byte sequences. Add the Unicode replacement character (U+FFFD) to the whitespace array to ensure it's removed. --- orcinus/crawler.php | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/orcinus/crawler.php b/orcinus/crawler.php index 27771d4..45366ba 100644 --- a/orcinus/crawler.php +++ b/orcinus/crawler.php @@ -45,8 +45,7 @@ function OS_cleanTextUTF8(&$_, $charset, $entity = false) { if (!trim($charset)) $charset = 'ISO-8859-1'; - if (strtoupper($charset) != 'UTF-8') - $_ = mb_convert_encoding($_, 'UTF-8', $charset); + $_ = mb_convert_encoding($_, 'UTF-8', $charset); if ($entity) $_ = html_entity_decode($_, $entity | ENT_SUBSTITUTE, 'UTF-8'); @@ -673,7 +672,7 @@ $_RDATA['sp_whitespace'] = array( "\u{2008}" => ' ', "\u{2009}" => ' ', "\u{200A}" => ' ', "\u{200B}" => ' ', "\u{200C}" => ' ', "\u{200D}" => '', "\u{2028}" => "\n", "\u{2029}" => "\n", "\u{202F}" => ' ', "\u{205F}" => ' ', "\u{2060}" => '', "\u{3000}" => ' ', - "\u{FEFF}" => ' ' + "\u{FEFF}" => ' ', "\u{FFFD}" => '' );