Update crawler.php
Run mb_convert_encoding in ALL cases, even when the charset is already UTF-8, to remove potentially invalid UTF-8 byte sequences. Add the Unicode replacement character (U+FFFD) to the whitespace array to ensure it's removed.
This commit is contained in:
parent
b12e7991e0
commit
3307baac4d
|
@@ -45,7 +45,6 @@ function OS_cleanTextUTF8(&$_, $charset, $entity = false) {
|
|||
|
||||
if (!trim($charset)) $charset = 'ISO-8859-1';
|
||||
|
||||
-if (strtoupper($charset) != 'UTF-8')
|
||||
$_ = mb_convert_encoding($_, 'UTF-8', $charset);
|
||||
|
||||
if ($entity)
|
||||
|
@@ -673,7 +672,7 @@ $_RDATA['sp_whitespace'] = array(
|
|||
"\u{2008}" => ' ', "\u{2009}" => ' ', "\u{200A}" => ' ', "\u{200B}" => ' ',
|
||||
"\u{200C}" => ' ', "\u{200D}" => '', "\u{2028}" => "\n", "\u{2029}" => "\n",
|
||||
"\u{202F}" => ' ', "\u{205F}" => ' ', "\u{2060}" => '', "\u{3000}" => ' ',
|
||||
-"\u{FEFF}" => ' '
|
||||
+"\u{FEFF}" => ' ', "\u{FFFD}" => ''
|
||||
);
|
||||
|
||||
|
||||
|
|
Loading…
Reference in a new issue