Update crawler.php
Don't assume that other data from a PDF (title, keywords, description) uses the same character encoding as the content; detect the encoding of each field separately instead. This bypasses some still-unfixed PDFParser encoding issues. Also exit the crawler script if we are in debug mode and a crawl is already running.
This commit is contained in:
parent
eda57224d9
commit
042339d3ef
|
@ -43,6 +43,8 @@ function OS_crawlLog($text, $level = 0) {
|
||||||
function OS_cleanTextUTF8(&$_, $charset, $entity = false) {
|
function OS_cleanTextUTF8(&$_, $charset, $entity = false) {
|
||||||
global $_RDATA;
|
global $_RDATA;
|
||||||
|
|
||||||
|
if (!trim($charset)) $charset = 'ISO-8859-1';
|
||||||
|
|
||||||
if (strtoupper($charset) != 'UTF-8')
|
if (strtoupper($charset) != 'UTF-8')
|
||||||
$_ = mb_convert_encoding($_, 'UTF-8', $charset);
|
$_ = mb_convert_encoding($_, 'UTF-8', $charset);
|
||||||
|
|
||||||
|
@ -612,6 +614,10 @@ switch ($_SERVER['REQUEST_METHOD']) {
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// If we are in debug mode, but the crawler is already running, exit
|
||||||
|
if ($_RDATA['debug'] && $_ODATA['sp_crawling'])
|
||||||
|
die('Crawler is already running; exiting...');
|
||||||
|
|
||||||
|
|
||||||
/* ***** Begin Crawl Execution ************************************* */
|
/* ***** Begin Crawl Execution ************************************* */
|
||||||
register_shutdown_function('OS_crawlCleanUp');
|
register_shutdown_function('OS_crawlCleanUp');
|
||||||
|
@ -1407,9 +1413,9 @@ while ($_cURL && count($_RDATA['sp_queue'])) {
|
||||||
strpos($data['content'], "\u{2}") === false &&
|
strpos($data['content'], "\u{2}") === false &&
|
||||||
strpos($data['content'], "\u{1}") === false) {
|
strpos($data['content'], "\u{1}") === false) {
|
||||||
|
|
||||||
OS_cleanTextUTF8($data['title'], $data['info']['charset']);
|
OS_cleanTextUTF8($data['title'], mb_detect_encoding($data['title']));
|
||||||
OS_cleanTextUTF8($data['keywords'], $data['info']['charset']);
|
OS_cleanTextUTF8($data['keywords'], mb_detect_encoding($data['keywords']));
|
||||||
OS_cleanTextUTF8($data['description'], $data['info']['charset']);
|
OS_cleanTextUTF8($data['description'], mb_detect_encoding($data['description']));
|
||||||
|
|
||||||
} else {
|
} else {
|
||||||
$data['errno'] = 703;
|
$data['errno'] = 703;
|
||||||
|
|
Loading…
Reference in a new issue