orcinus-search/orcinus/crawler.php
Brian Huisman 4bbe1d967b Misc fixes
Save the process id of the crawler in the sp_crawling DB value instead of just a flag; we can use it to compare and further prevent race conditions which still seem to happen occasionally.
2023-10-17 10:36:34 -04:00

1825 lines
70 KiB
PHP

<?php /* ***** Orcinus Site Search - Web Crawling Engine *********** */
require __DIR__.'/config.php';
// Setting the $_RDATA['debug'] value to true will allow you to start
// the crawler just by visiting this file's URL using your web browser.
// It will output the log lines as well as any PHP errors that may
// occur. It will also report how much memory the script is using. Use
// this mode if your crawls are failing but the logs alone aren't
// enough to tell you why. DO NOT leave the crawler in debug mode in a
// production environment, or anyone can just run your crawler whenever
// they want!
// The flag is read by OS_crawlLog() (echo every log line), by the GET
// request handler (start a crawl from the browser), by the crawl loop
// (memory usage reporting) and by OS_crawlCleanUp() (skip the JSON
// Content-type header).
$_RDATA['debug'] = false;
/**
 * Append one line to the crawl log and optionally echo it
 *
 * The line is always written to the open log handle stored in
 * $_RDATA['sp_log']. It is additionally echoed when debug mode is on,
 * or when running from the CLI and $level is at least the configured
 * CLI log level.
 *
 * $text  - The message to log
 * $level - 0: notice (prefixed ' -> '), 1: message (no prefix),
 *          2: error (prefixed '[ERROR] ')
 */
function OS_crawlLog($text, $level = 0) {
  global $_RDATA;

  // Loose comparisons on purpose, mirroring a switch statement
  if ($level == 1) {
    $prefix = '';
  } else if ($level == 2) {
    $prefix = '[ERROR] ';
  } else {
    $prefix = ' -> ';
  }

  $line = $prefix.$text."\n";
  fwrite($_RDATA['sp_log'], $line);

  // Outside of debug mode only CLI runs get console output, and only
  // for lines at or above the requested level
  if (!$_RDATA['debug']) {
    if ($_SERVER['REQUEST_METHOD'] != 'CLI') return;
    if (!($level >= $_RDATA['sp_log_clilevel'])) return;
  }
  echo $line;
}
/**
 * Final prep to store content in UTF-8 format in the database
 *
 * Works in place on $_ (passed by reference):
 *  1. Converts the text from $charset to UTF-8
 *  2. Optionally decodes HTML entities
 *  3. Applies the $_RDATA['sp_punct'] and $_RDATA['sp_whitespace']
 *     replacement tables
 *  4. Trims the text and collapses every run of whitespace to a single
 *     space
 *
 * $_       - Text to clean; replaced with the cleaned text
 * $charset - Source character set; ISO-8859-1 is assumed when empty
 * $entity  - false to skip entity decoding, otherwise the ENT_* flags
 *            to pass to html_entity_decode()
 */
function OS_cleanTextUTF8(&$_, $charset, $entity = false) {
  global $_RDATA;

  // Use the trimmed name for the conversion too, not only for the test
  $charset = trim((string)$charset);
  if (!$charset) $charset = 'ISO-8859-1';

  // A page can declare a charset mbstring doesn't know. PHP 8 throws a
  // ValueError in that case and PHP 7 returns false; neither should be
  // allowed to abort the crawl or blank out the content.
  try {
    $converted = mb_convert_encoding($_, 'UTF-8', $charset);
  } catch (\ValueError $e) {
    $converted = false;
  }
  if ($converted === false) {
    // Keep the bytes as-is if they are already valid UTF-8, otherwise
    // fall back on the same default used for a missing charset
    $fallback = mb_check_encoding($_, 'UTF-8') ? 'UTF-8' : 'ISO-8859-1';
    $converted = mb_convert_encoding($_, 'UTF-8', $fallback);
  }
  $_ = $converted;

  if ($entity)
    $_ = html_entity_decode($_, $entity | ENT_SUBSTITUTE, 'UTF-8');

  $_ = strtr($_, $_RDATA['sp_punct']);
  $_ = strtr($_, $_RDATA['sp_whitespace']);
  $_ = preg_replace(array('/\s/', '/ {2,}/'), ' ', trim($_));
}
/**
 * Resolve a full or partial URL into a full URL relative to a base URL
 *
 * $_    - The link as found in a document; may be absolute, protocol
 *         relative ('//'), root relative ('/'), directory relative
 *         ('./', '../' or bare) or a query string only ('?')
 * $base - The full URL of the document the link was found in
 *
 * Returns the resolved URL with any #fragment removed
 */
function OS_formatURL($_, $base) {
  // Sanitize the link while protecting spaces from being stripped,
  // then drop the fragment
  $link = filter_var(str_replace(' ', '%20', $_), FILTER_SANITIZE_URL);
  $link = preg_replace('/#.*$/', '', $link);
  $link = str_replace('%20', ' ', $link);

  // The 'directory' of the base URL: everything up to and including
  // the final slash (the lookbehind protects the '://' of the scheme)
  $directory = preg_replace('/(?<!:\/)\/[^\/]*$/', '', $base).'/';
  $parts = parse_url($directory);
  $port = empty($parts['port']) ? '' : ':'.$parts['port'];

  // One leading '../' is resolved here against the parent of the base
  // directory; further '/dir/../' pairs are collapsed below
  if (strpos($link, '../') === 0) {
    $parent = preg_replace('/\/[^\/]*\/$/', '/', $parts['path']);
    $link = $parts['scheme'].'://'.$parts['host'].$port.$parent.substr($link, 3);
  }

  if (strpos($link, './') === 0) {
    // Relative to the current directory
    $link = $directory.substr($link, 2);

  } else if (strpos($link, '//') === 0) {
    // Protocol relative
    $link = $parts['scheme'].':'.$link;

  } else if (strpos($link, '/') === 0) {
    // Relative to the web root
    $link = $parts['scheme'].'://'.$parts['host'].$port.$link;

  } else if (strpos($link, '?') === 0) {
    // Query string only; replaces the query string of the base URL
    $link = preg_replace('/\?.*$/', '', $base).$link;

  } else if (!preg_match('/^https?:\/\//', $link)) {
    // Anything else that isn't already absolute is directory relative
    $link = $directory.$link;
  }

  // Collapse '/dir/../' and '/./' path segments
  $link = preg_replace('/\/[^\/]*\/\.\.\//', '/', $link);
  $link = preg_replace('/\/\.\//', '/', $link);

  // A bare scheme + host (with or without port) gets a trailing slash
  if ($link == $parts['scheme'].'://'.$parts['host'] ||
      $link == $parts['scheme'].'://'.$parts['host'].$port) {
    $link .= '/';
  }

  return trim($link);
}
/**
 * Filter a URL by the crawling rules provided by the user
 *  - Returns the REASON why the URL was rejected, NOT a 'filtered'
 *    URL. An empty string means the URL passed every rule.
 *  - The verdict is also stored in $_RDATA['sp_filter'], keyed by the
 *    full URL. Only rejections are reused from there; a URL that
 *    passed before is evaluated again on every call.
 *
 * $_    - The URL to test; resolved against $base when not absolute
 * $base - The full URL of the document the link was found in
 */
function OS_filterURL($_, $base) {
  global $_RDATA;

  $url = preg_match('/^https?:\/\//', $_) ? $_ : OS_formatURL($_, $base);

  if (!empty($_RDATA['sp_filter'][$url]))
    return $_RDATA['sp_filter'][$url];
  $_RDATA['sp_filter'][$url] = '';

  // A rule starting with '*' is a regular expression (with forward
  // slashes escaped for it); any other rule is a plain substring
  $ruleMatches = function ($rule) use ($url) {
    if ($rule[0] == '*') {
      $pattern = '/'.str_replace('/', '\/', substr($rule, 1)).'/';
      return (bool)preg_match($pattern, $url);
    }
    return strpos($url, $rule) !== false;
  };

  // Accepted hostnames
  $parsed = parse_url($url);
  if (!in_array($parsed['host'], $_RDATA['sp_hostnames'], true))
    return $_RDATA['sp_filter'][$url] = 'disallowed-host';

  // Require URL matches: at least one rule must match
  if (count($_RDATA['sp_require_url'])) {
    $satisfied = array_filter($_RDATA['sp_require_url'], $ruleMatches);
    if (!$satisfied)
      return $_RDATA['sp_filter'][$url] = 'require-url';
  }

  // Ignore URL matches: the first matching rule rejects the URL
  foreach ($_RDATA['sp_ignore_url'] as $ignoreRule) {
    if ($ruleMatches($ignoreRule))
      return $_RDATA['sp_filter'][$url] = 'ignore-url';
  }

  // Ignore extensions
  $extensionPattern = '/\.('.$_RDATA['sp_ignore_ext_regexp'].')$/i';
  if (preg_match($extensionPattern, $url))
    return $_RDATA['sp_filter'][$url] = 'ignore-extension';

  // robots.txt rules: stored as full URL prefixes per hostname
  if (!empty($_RDATA['sp_robots'][$parsed['host']])) {
    foreach ($_RDATA['sp_robots'][$parsed['host']] as $disallowed) {
      if (strpos($url, $disallowed) === 0)
        return $_RDATA['sp_filter'][$url] = 'robots-txt';
    }
  }

  return $_RDATA['sp_filter'][$url];
}
/**
 * Fetch a URL using the shared cURL handle
 *
 * $url     - The full URL to request
 * $referer - Value for the Referer request header
 *
 * Returns an array with these keys:
 *   'url'    - parse_url() of $url
 *   'body'   - Return value of curl_exec()
 *   'base'   - $url
 *   'info'   - curl_getinfo() plus 'url', 'noindex' (short reason
 *              code, '' when the fetch is usable) and 'nofollow'
 *   'error'  - Human readable error, '' on success
 *   'errno'  - 0 on success, otherwise a cURL error number or one of
 *              the crawler's own codes (22, 300, 777, 888, 999)
 *   'links', 'title', 'content', 'keywords', 'weighted',
 *   'description' - Empty placeholders
 *
 * Throws an Exception for any cURL error number not handled here
 */
function OS_fetchURL($url, $referer = '') {
  global $_cURL, $_RDATA;

  // Reset the per-request flags; they are raised again by the cURL
  // header callback while the transfer runs
  $_RDATA['sp_robots_header'] = 0;
  $_RDATA['sp_self_reference'] = 0;

  curl_setopt($_cURL, CURLOPT_URL, str_replace(' ', '%20', $url));
  curl_setopt($_cURL, CURLOPT_REFERER, $referer);

  // Element order matters: the transfer has to run before its info
  // and error state are read
  $fetched = array(
    'url' => parse_url($url),
    'body' => curl_exec($_cURL),
    'base' => $url,
    'info' => curl_getinfo($_cURL),
    'error' => curl_error($_cURL),
    'errno' => curl_errno($_cURL),
    'links' => array(),
    'title' => '',
    'content' => '',
    'keywords' => '',
    'weighted' => '',
    'description' => ''
  );
  $fetched['info']['url'] = $url;
  $fetched['info']['noindex'] = '';
  $fetched['info']['nofollow'] = false;

  // Process any cURL errors
  $errno = $fetched['errno'];

  if ($errno == 0 || $errno == 42) {
    // Success, or aborted by one of our own callbacks; work out which
    // condition applies, first match wins
    $info = $fetched['info'];
    $verdict = false;
    if ($info['http_code'] >= 400) {
      $verdict = array(22, $info['http_code'].' error', '400');
    } else if ($info['redirect_url']) {
      $verdict = array(300, 'Redirected by HTTP header to: '.$info['redirect_url'], 'redirect-location');
    } else if ($_RDATA['sp_robots_header']) {
      $verdict = array(777, 'Blocked by \'X-Robots-Tag\' HTTP header', 'robots-http');
    } else if ($_RDATA['sp_self_reference']) {
      $verdict = array(888, 'Refused to index myself', 'self-reference');
    } else if ($errno == 42) {
      // Aborted, but for none of the reasons above
      $verdict = array(999, 'Max filesize exceeded', 'too-large');
    }
    if ($verdict)
      list($fetched['errno'], $fetched['error'], $fetched['info']['noindex']) = $verdict;

  } else if ($errno == 28) {
    // Timeout
    $fetched['error'] = 'Timed out waiting for data';
    $fetched['info']['noindex'] = 'timeout';

  } else if ($errno == 55 || $errno == 56) {
    // Network send / receive error
    $fetched['error'] = 'Network error retrieving data';
    $fetched['info']['noindex'] = 'network-error';

  } else if ($errno == 6 || $errno == 7) {
    // Could not resolve / connect to host
    $fetched['error'] = 'Couldn\'t connect to host: '.$fetched['url']['host'];
    $fetched['info']['noindex'] = 'couldnt-connect';

  } else {
    // Uncaught cURL error
    OS_crawlLog('Uncaught cURL error: '.$url, 2);
    OS_crawlLog($fetched['errno'], 1);
    OS_crawlLog($fetched['error'], 1);
    OS_crawlLog(print_r($fetched['info'], true), 1);
    throw new Exception('Uncaught cURL error');
  }

  return $fetched;
}
/**
 * Shutdown function to provide cleanup before exit
 *
 * Registered with register_shutdown_function() once a crawl begins.
 * Does nothing if the 'sp_crawling' value has already been cleared.
 * Otherwise it:
 *  - logs a fatal PHP error, if one ended the script
 *  - closes the cURL handle, if one was created
 *  - on a completed crawl, replaces the contents of the crawldata
 *    table with the crawltemp table and logs the crawl statistics
 *  - emails the admin(s) according to the notification settings
 *  - drops the crawltemp table, stores the log text in the config
 *    database and clears the 'sp_crawling' value
 *  - ends the script with a JSON message (web) or plain text (CLI)
 */
function OS_crawlCleanUp() {
  global $_DDATA, $_ODATA, $_RDATA, $_cURL, $_MAIL;

  // If the crawl has already been canceled, don't bother
  if (!OS_getValue('sp_crawling')) return;

  $error = error_get_last();
  if (!is_null($error) && $error['type'] == E_ERROR) {
    OS_crawlLog($error['message'], 2);
    OS_crawlLog('File: \''.$error['file'].'\' at line number: '.$error['line'], 0);
    $_RDATA['sp_complete'] = false;
  }

  // The handle is false when cURL is unavailable and not yet assigned
  // if the script died early. Passing that to curl_close() throws a
  // TypeError on PHP 8, which would abort this cleanup and leave the
  // 'sp_crawling' value set.
  if ($_cURL) curl_close($_cURL);

  // If crawl completed successfully
  if (!empty($_RDATA['sp_complete'])) {
    OS_crawlLog('Cleaning up database tables...', 1);

    // Add a natural sort order value to each entry
    natcasesort($_RDATA['sp_store']);
    $_RDATA['sp_store'] = array_values($_RDATA['sp_store']);
    $url_sort = $_DDATA['pdo']->prepare(
      'UPDATE `'.$_DDATA['tbprefix'].'crawltemp`
SET `url_sort`=:url_sort WHERE `url`=:url;'
    );
    foreach ($_RDATA['sp_store'] as $key => $stored_url) {
      $url_sort->execute(array(
        'url_sort' => $key,
        'url' => $stored_url
      ));
      $err = $url_sort->errorInfo();
      if ($err[0] != '00000') {
        OS_crawlLog('Error sorting the search database', 1);
        OS_crawlLog($err[2], 0);
        break;
      }
    }

    // Truncate the existing search database
    $truncate = $_DDATA['pdo']->query(
      'TRUNCATE `'.$_DDATA['tbprefix'].'crawldata`;'
    );
    $err = $truncate->errorInfo();
    if ($err[0] != '00000') {
      OS_crawlLog('Could not truncate the search database', 1);
      OS_crawlLog($err[2], 0);
      // Last chance to bail out before we make actual changes
      $_RDATA['sp_complete'] = false;
    }
  }

  // If crawl completed successfully AND we truncated the old table
  if (!empty($_RDATA['sp_complete'])) {
    OS_setValue('sp_time_end', time());
    OS_setValue('sp_time_last', $_ODATA['sp_time_end'] - $_ODATA['sp_time_start']);

    // Select all rows from the temp table into the existing search table
    $insert = $_DDATA['pdo']->query(
      'INSERT INTO `'.$_DDATA['tbprefix'].'crawldata`
SELECT * FROM `'.$_DDATA['tbprefix'].'crawltemp`;'
    );
    $err = $insert->errorInfo();
    if ($err[0] == '00000') {
      $tableinfo = $_DDATA['pdo']->query(
        'SHOW TABLE STATUS LIKE \''.$_DDATA['tbprefix'].'crawldata\';'
      );
      $err = $tableinfo->errorInfo();
      if ($err[0] == '00000') {
        $tableinfo = $tableinfo->fetchAll();
        OS_setValue('sp_data_stored', $tableinfo[0]['Data_length']);
      } else OS_crawlLog('Could not read crawl table status', 1);

      // Purge the search result cache
      if ($_ODATA['s_limit_cache']) {
        $purge = $_DDATA['pdo']->query(
          'UPDATE `'.$_DDATA['tbprefix'].'query` SET `cache`=\'\';'
        );
        $err = $purge->errorInfo();
        if ($err[0] != '00000')
          OS_crawlLog('Could not purge search result cache', 1);
      }

      // Optimize the query log table
      $optimize = $_DDATA['pdo']->query(
        'OPTIMIZE TABLE `'.$_DDATA['tbprefix'].'query`;'
      );

      OS_setValue('sp_pages_stored', count($_RDATA['sp_store']));
      OS_setValue('sp_domains', $_RDATA['sp_domains']);
      OS_setValue('sp_time_end_success', $_ODATA['sp_time_end']);

      // A crawl that finishes within the same second has a runtime of
      // 0; dividing by it throws a DivisionByZeroError on PHP 8, so
      // average over at least one second
      $elapsed = max(1, (int)$_ODATA['sp_time_last']);

      OS_crawlLog('***** Crawl completed in '.$_ODATA['sp_time_last'].'s *****', 1);
      OS_crawlLog('Total data transferred: '.OS_readSize($_ODATA['sp_data_transferred']), 1);
      OS_crawlLog('Average transfer speed: '.OS_readSize(round($_ODATA['sp_data_transferred'] / $elapsed)).'/s', 1);
      if ($_RDATA['sp_sleep'])
        OS_crawlLog('Time spent sleeping: '.(round($_RDATA['sp_sleep'] / 10) / 100).'s', 1);
      OS_crawlLog('Time taken by cURL: '.(round($_RDATA['sp_time_curl'] * 100) / 100).'s', 1);
      OS_crawlLog($_ODATA['sp_progress'][0].' page'.(($_ODATA['sp_progress'][0] == 1) ? '' : 's').' crawled', 1);
      OS_crawlLog($_ODATA['sp_pages_stored'].' page'.(($_ODATA['sp_pages_stored'] == 1) ? '' : 's').' stored', 1);
      if ($_RDATA['sp_status']['New'])
        OS_crawlLog($_RDATA['sp_status']['New'].' new '.(($_RDATA['sp_status']['New'] == 1) ? 'page' : 'pages').' found', 0);
      if ($_RDATA['sp_status']['Updated'])
        OS_crawlLog($_RDATA['sp_status']['Updated'].' '.(($_RDATA['sp_status']['Updated'] == 1) ? 'page' : 'pages').' updated', 0);
      if ($_RDATA['sp_status']['Blocked'])
        OS_crawlLog($_RDATA['sp_status']['Blocked'].' '.(($_RDATA['sp_status']['Blocked'] == 1) ? 'page' : 'pages').' blocked', 0);
      if ($_RDATA['sp_status']['Not Found'])
        OS_crawlLog($_RDATA['sp_status']['Not Found'].' '.(($_RDATA['sp_status']['Not Found'] == 1) ? 'page' : 'pages').' not found', 0);
      if ($_RDATA['sp_status']['Orphan'])
        OS_crawlLog($_RDATA['sp_status']['Orphan'].' orphaned '.(($_RDATA['sp_status']['Orphan'] == 1) ? 'page' : 'pages'), 0);
      if ($_ODATA['sp_autodelete'])
        OS_crawlLog('Orphaned pages were auto-deleted', 1);

      // Send success email to the admin(s); the body is limited to
      // the log lines starting with '[' or '*'
      if ($_MAIL && count($_MAIL->getAllRecipientAddresses()) && $_ODATA['sp_email_success']) {
        $_MAIL->Subject = 'Orcinus Site Search Crawler: Crawl succeeded';
        $_MAIL->Body = implode(" \r\n", preg_grep('/^[\[\*]/', explode("\n", file_get_contents($_ODATA['sp_log']))));
        if (!$_MAIL->Send()) OS_crawlLog('Could not send notification email', 2);
      }

      $cliMessage = 'Crawl completed successfully';
      $jsonMessage = json_encode(array(
        'status' => 'Success',
        'message' => $cliMessage
      ), JSON_INVALID_UTF8_IGNORE);

    // We truncated the search table but FAILED to populate it!
    // This is a serious error that disables searching until the
    // crawler is run again!
    } else {
      OS_crawlLog('Could not populate the search table', 2);
      OS_crawlLog($err[2], 0);
      OS_crawlLog('***** Crawl failed; runtime '.$_ODATA['sp_time_last'].'s *****', 1);
      OS_crawlLog('Search table was cleared, but could not be repopulated!', 1);
      OS_crawlLog('The crawler MUST be run again to fix this issue!', 1);

      // Send failure email to the admin(s)
      if ($_MAIL && count($_MAIL->getAllRecipientAddresses()) && $_ODATA['sp_email_failure']) {
        $_MAIL->Subject = 'Orcinus Site Search Crawler: Catastrophic failure!';
        $_MAIL->Body = implode(" \r\n", preg_grep('/^[\[\*\w\d]/', explode("\n", file_get_contents($_ODATA['sp_log']))));
        if (!$_MAIL->Send()) OS_crawlLog('Could not send notification email', 2);
      }

      $cliMessage = 'Could not populate search table; search table is currently empty!';
      $jsonMessage = json_encode(array(
        'status' => 'Error',
        'message' => $cliMessage
      ), JSON_INVALID_UTF8_IGNORE);
    }

  // Else the crawl failed
  } else {
    OS_setValue('sp_time_last', $_ODATA['sp_time_end'] - $_ODATA['sp_time_start']);
    OS_crawlLog('***** Crawl failed; runtime '.$_ODATA['sp_time_last'].'s *****', 1);
    OS_crawlLog('Total data transferred: '.OS_readSize($_ODATA['sp_data_transferred']), 1);
    OS_crawlLog('Search table was NOT updated', 1);
    if ($_ODATA['sp_sitemap_file'])
      OS_crawlLog('Sitemap was NOT updated', 1);

    // Send failure email to the admin(s), unless the crawl was
    // canceled on purpose
    if ($_MAIL && count($_MAIL->getAllRecipientAddresses()) && $_ODATA['sp_email_failure'] && !$_ODATA['sp_cancel']) {
      $_MAIL->Subject = 'Orcinus Site Search Crawler: Crawl failed';
      $_MAIL->Body = implode(" \r\n", preg_grep('/^[\[\*\w\d]/', explode("\n", file_get_contents($_ODATA['sp_log']))));
      if (!$_MAIL->Send()) OS_crawlLog('Could not send notification email', 2);
    }

    $cliMessage = 'Crawl failed; see the log for details';
    $jsonMessage = json_encode(array(
      'status' => 'Error',
      'message' => $cliMessage
    ), JSON_INVALID_UTF8_IGNORE);
  }

  // Delete the temp search table
  $drop = $_DDATA['pdo']->query(
    'DROP TABLE IF EXISTS `'.$_DDATA['tbprefix'].'crawltemp`;'
  );
  $err = $drop->errorInfo();
  if ($err[0] != '00000') {
    OS_crawlLog('Could not delete the temporary search table', 1);
    OS_crawlLog($err[2], 0);
  }

  // Store the log file to the config database; from here on the
  // 'sp_log' value holds the log text instead of the log file path
  OS_setValue('sp_log', file_get_contents($_ODATA['sp_log']));
  fclose($_RDATA['sp_log']);

  // Unset the crawling flag
  OS_setValue('sp_crawling', 0);

  if ($_SERVER['REQUEST_METHOD'] != 'CLI') {
    if (!$_RDATA['debug'])
      header('Content-type: application/json; charset='.strtolower($_ODATA['s_charset']));
    die($jsonMessage);

  } else die($cliMessage."\n");
}
// ***** Accept incoming commands by REQUEST_METHOD
// Every path through this switch either answers the request and exits,
// or writes this process' id to the 'sp_crawling' value and falls out
// of the switch so the crawl can begin.
switch ($_SERVER['REQUEST_METHOD'] ?? '') {

  /* ***** Handle POST Requests ************************************ */
  case 'POST':

    // JSON POST request
    // These are usually sent by javascript fetch()
    if (strpos(trim($_SERVER['CONTENT_TYPE'] ?? ''), 'application/json') === 0) {
      $postBody = file_get_contents('php://input');
      $_POST = json_decode($postBody, false);

      // A malformed body decodes to null and valid JSON may be a
      // scalar or a list; assigning a property to either is a fatal
      // error on PHP 8. Treat them all as a request with no fields.
      if (!is_object($_POST)) $_POST = new stdClass();

      $response = array();

      if (empty($_POST->action)) $_POST->action = '';
      switch ($_POST->action) {

        case 'crawl':
          // The key must be a string (or integer) that is identical
          // to the stored key. A loose '==' comparison would let a
          // JSON 'true' match any stored key.
          if (!empty($_POST->sp_key) &&
              (is_string($_POST->sp_key) || is_int($_POST->sp_key)) &&
              OS_getValue('sp_key') &&
              hash_equals((string)$_ODATA['sp_key'], (string)$_POST->sp_key)) {
            if (OS_getValue('sp_crawling')) {
              $response = array(
                'status' => 'Error',
                'message' => 'Crawler is already running; current progress: '.$_ODATA['sp_progress'][0].'/'.$_ODATA['sp_progress'][1]
              );

            // Go crawl!
            } else OS_setValue('sp_crawling', getmypid());

          } else {
            $response = array(
              'status' => 'Error',
              'message' => 'Incorrect key to initiate crawler'
            );
          }

          // The key is cleared after every attempt, right or wrong
          OS_setValue('sp_key', '');
          break;

        case 'progress':
          $lines = array();
          if (!empty($_POST->log)) {
            if (OS_getValue('sp_crawling')) {
              // While crawling, 'sp_log' holds the path of the log file
              if (strpos($_ODATA['sp_log'], "\n") === false && file_exists($_ODATA['sp_log']))
                $lines = file($_ODATA['sp_log'], FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);

            // Otherwise it holds the text of the last log
            } else $lines = explode("\n", $_ODATA['sp_log']);

            if (empty($_POST->grep)) $_POST->grep = '';
            switch ($_POST->grep) {
              case 'all': break;
              case 'errors': $lines = preg_grep('/^[\[\*]/', $lines); break;
              default: $lines = preg_grep('/^[\[\*\w\d]/', $lines);
            }
          }

          // If crawl is in progress, return just the last 15 lines
          if ($_ODATA['sp_crawling']) $lines = array_slice($lines, -15);

          $response = array(
            'status' => ($_ODATA['sp_crawling']) ? 'Crawling' : 'Complete',
            'progress' => $_ODATA['sp_progress'],
            'data_transferred' => $_ODATA['sp_data_transferred'],
            'time_crawl' => time() - $_ODATA['sp_time_start'],
            'time_start' => $_ODATA['sp_time_start'],
            'time_end' => $_ODATA['sp_time_end'],
            'timeout_crawl' => $_ODATA['sp_timeout_crawl'],
            'tail' => trim(implode("\n", $lines))
          );
          break;

        case 'cancel':
          if (OS_getValue('sp_crawling')) {

            // IF the crawler 'time_start' is more than 'timeout_crawl'
            // seconds ago, or the 'force' token is set, the crawler is
            // probably stuck. Unstick it.
            if (empty($_POST->force)) $_POST->force = '';
            if ($_POST->force || time() - $_ODATA['sp_time_start'] > $_ODATA['sp_timeout_crawl']) {
              OS_setValue('sp_crawling', 0);

              // The reason ends up in the log text; anything but a
              // string (e.g. a JSON object) cannot be concatenated
              if (empty($_POST->reason) || !is_string($_POST->reason))
                $_POST->reason = 'The crawler halted unexpectedly';

              if (strpos($_ODATA['sp_log'], "\n") === false && file_exists($_ODATA['sp_log'])) {
                $log = file_get_contents($_ODATA['sp_log']);
                OS_setValue('sp_log', $log."\n".'[ERROR] '.$_POST->reason);
              } else OS_setValue('sp_log', '[ERROR] '.$_POST->reason);
              OS_setValue('sp_time_last', $_ODATA['sp_time_end'] - $_ODATA['sp_time_start']);

              // Send failure email to the admin(s)
              if ($_MAIL && count($_MAIL->getAllRecipientAddresses()) && $_ODATA['sp_email_failure']) {
                $_MAIL->Subject = 'Orcinus Site Search Crawler: Crawler halted unexpectedly';
                $_MAIL->Body = implode(" \r\n", preg_grep('/^[\[\*\w\d]/', explode("\n", $_ODATA['sp_log'])));
                if (!$_MAIL->Send()) OS_setValue('sp_log', $_ODATA['sp_log']."\n".'[ERROR] Could not send notification email');
              }
            }

            OS_setValue('sp_cancel', 1);

            $response = array(
              'status' => 'Success',
              'message' => 'Cancel flag was set',
              'crawl_time' => time() - $_ODATA['sp_time_start']
            );

          } else {
            $response = array(
              'status' => 'Error',
              'message' => 'Crawler is not currently running'
            );
          }
          break;

        default:
          $response = array(
            'status' => 'Error',
            'message' => 'Unrecognized command'
          );

      }

      // If we have a response to give, display it and exit
      // (an accepted 'crawl' command leaves the response empty)
      if ($response) {
        header('Content-type: application/json; charset='.strtolower($_ODATA['s_charset']));
        die(json_encode($response, JSON_INVALID_UTF8_IGNORE));
      }

    // Don't do anything for normal POST request
    // These are usually sent by <form> HTML elements
    } else {
      header('Content-type: text/plain; charset='.strtolower($_ODATA['s_charset']));
      die($_ODATA['sp_useragent']);
    }
    break;

  // Allow CLI requests through
  case '':
    if (!empty($_SERVER['argv'][0]) && $_SERVER['argv'][0] == $_SERVER['PHP_SELF']) {
      $_SERVER['REQUEST_METHOD'] = 'CLI';

      if (!OS_getValue('sp_crawling')) {

        // Set the logging level, if specified
        if (!empty($_SERVER['argv'][1]) && preg_match('/^-log=([012])$/', $_SERVER['argv'][1], $match)) {
          $_RDATA['sp_log_clilevel'] = (int)$match[1];
        } else $_RDATA['sp_log_clilevel'] = 2;

        // Start a crawl
        OS_setValue('sp_crawling', getmypid());

      } else die('Crawler is already running; exiting...');
    } else die($_ODATA['sp_useragent']);
    break;

  // Don't do anything for GET requests, unless in debug mode
  case 'GET':
    header('Content-type: text/plain; charset='.strtolower($_ODATA['s_charset']));

    if ($_RDATA['debug']) {

      // If we are in debug mode, but the crawler is already running, exit
      if (OS_getValue('sp_crawling'))
        die('Crawler is already running; exiting...');

      // Start a crawl
      OS_setValue('sp_crawling', getmypid());

    } else die($_ODATA['sp_useragent']);
    break;

  // Exit for all other request types
  default:
    header('Content-type: text/plain; charset='.strtolower($_ODATA['s_charset']));
    die($_ODATA['sp_useragent']);

}
// One last check for a race condition
// Every entry path above wrote this process' id to 'sp_crawling'. Wait
// a moment and read the value back; if it no longer holds our id then
// another process wrote to it after we did, so exit.
sleep(1);
if (OS_getValue('sp_crawling') != getmypid())
die('Crawler is already running; exiting...');
/* ***** Begin Crawl Execution ************************************* */
// From here on OS_crawlCleanUp() runs whenever the script ends
register_shutdown_function('OS_crawlCleanUp');
// Keep crawling even if the client that started us disconnects
ignore_user_abort(true);
// Allow the crawl timeout plus 10% before PHP stops the script
@set_time_limit($_ODATA['sp_timeout_crawl'] * 1.1);
libxml_use_internal_errors(true);
if (function_exists('apache_setenv'))
apache_setenv('no-gzip', '1');
// Reset the stored crawl state
OS_setValue('sp_cancel', 0);
OS_setValue('sp_time_start', time());
// Progress is stored as array(crawled count, total count, resumed flag)
OS_setValue('sp_progress', array(0, 1, false));
OS_setValue('sp_pages_stored', 0);
OS_setValue('sp_data_transferred', 0);
OS_setValue('sp_data_stored', 0);
OS_setValue('sp_time_last', 0);
// Log to a temporary file; $_RDATA['sp_log'] is the open handle while
// the 'sp_log' value holds the path of that file, which is what the
// 'progress' and 'cancel' commands read from while a crawl is running
$_RDATA['sp_log'] = tmpfile();
OS_setValue('sp_log', stream_get_meta_data($_RDATA['sp_log'])['uri']);
OS_crawlLog('***** Crawl started: '.date('r').' *****', 1);
// ***** Prepare runtime data
// User settings stored one per line become trimmed lists without
// empty entries; settings stored space separated are split on spaces
$_RDATA['sp_starting'] = array_filter(array_map('trim', explode("\n", $_ODATA['sp_starting'])));
// Filled in from the starting URLs when the queue is built
$_RDATA['sp_hostnames'] = array();
$_RDATA['sp_ignore_url'] = array_filter(array_map('trim', explode("\n", $_ODATA['sp_ignore_url'])));
$_RDATA['sp_ignore_css'] = array_filter(explode(' ', $_ODATA['sp_ignore_css']));
$_RDATA['sp_title_strip'] = array_filter(array_map('trim', explode("\n", $_ODATA['sp_title_strip'])));
$_RDATA['s_weight_css'] = array_filter(explode(' ', $_ODATA['s_weight_css']));
$_RDATA['sp_require_url'] = array_filter(array_map('trim', explode("\n", $_ODATA['sp_require_url'])));
// Ignored extensions become a regexp alternation: 'ext1|ext2|...'
$_RDATA['sp_ignore_ext_regexp'] = implode('|', array_map('preg_quote', array_filter(explode(' ', $_ODATA['sp_ignore_ext']))));
// Raised by the cURL header callback, reset by OS_fetchURL()
$_RDATA['sp_robots_header'] = 0;
// OS_crawlCleanUp() only updates the search table when this is true
$_RDATA['sp_complete'] = false;
// URLs taken from the queue so far
$_RDATA['sp_links'] = array();
// URLs stored in the crawltemp table
$_RDATA['sp_store'] = array();
// Number of stored pages per 'scheme://host'
$_RDATA['sp_domains'] = array();
$_RDATA['sp_sitemap'] = array();
// robots.txt disallow rules per hostname
$_RDATA['sp_robots'] = array();
// Page counts per status, reported in the log by OS_crawlCleanUp()
$_RDATA['sp_status'] = array('Orphan' => 0, 'Blocked' => 0, 'Not Found' => 0, 'Updated' => 0, 'New' => 0);
// Rejection reason per URL, maintained by OS_filterURL()
$_RDATA['sp_filter'] = array();
// Last expected download size seen by the cURL progress callback
$_RDATA['sp_prev_dls'] = 0;
$_RDATA['sp_time_curl'] = 0;
$_RDATA['sp_sleep'] = 0;
// URL per content checksum seen during this crawl
$_RDATA['sp_sha1'] = array();
// Becomes true when an interrupted crawl is being resumed
$_RDATA['sp_resumed'] = false;
// Replacement table applied by OS_cleanTextUTF8(): each Unicode
// whitespace / invisible character is mapped to a space, a newline
// or removed entirely
$_RDATA['sp_whitespace'] = array(
"\u{0009}" => ' ', "\u{000A}" => "\n", "\u{000B}" => "\n", "\u{000C}" => "\n",
"\u{000D}" => "\n", "\u{0085}" => "\n", "\u{00A0}" => ' ', "\u{1680}" => ' ',
"\u{2000}" => ' ', "\u{2001}" => ' ', "\u{2002}" => ' ', "\u{2003}" => ' ',
"\u{2004}" => ' ', "\u{2005}" => ' ', "\u{2006}" => ' ', "\u{2007}" => ' ',
"\u{2008}" => ' ', "\u{2009}" => ' ', "\u{200A}" => ' ', "\u{200B}" => ' ',
"\u{200C}" => ' ', "\u{200D}" => '', "\u{2028}" => "\n", "\u{2029}" => "\n",
"\u{202F}" => ' ', "\u{205F}" => ' ', "\u{2060}" => '', "\u{3000}" => ' ',
"\u{FEFF}" => ' ', "\u{FFFD}" => ''
);
// ***** Load PDF parser
// Fall back on the bundled autoloader when the parser class isn't
// already available
if (!class_exists('\Smalot\PdfParser\Parser') &&
    file_exists(__DIR__.'/pdfparser/alt_autoload.php-dist')) {
  include __DIR__.'/pdfparser/alt_autoload.php-dist';
}

if (!class_exists('\Smalot\PdfParser\Parser')) {
  // $_PDF is false when PDFs cannot be parsed
  OS_crawlLog('Could not include \'PDFParser\'; PDFs will not be indexed', 1);
  $_PDF = false;

} else {
  $config = new \Smalot\PdfParser\Config();
  $config->setRetainImageContent(false);
  $config->setDecodeMemoryLimit(16777216);
  $_PDF = new \Smalot\PdfParser\Parser([], $config);
}

// ***** Check for PHPMailer
// Only log why no mail will be sent; the crawl goes ahead regardless
if ($_MAIL) {
  if (!count($_MAIL->getAllRecipientAddresses()))
    OS_crawlLog('No admin emails specified; Crawler will not send mail', 1);

} else {
  OS_crawlLog('Could not include \'PHPMailer\'; Crawler cannot send mail', 1);
}
// ***** Initialize the cURL connection
// $_cURL is falsy when no connection could be created; the crawl loop
// is skipped in that case
$_cURL = OS_getConnection();
if ($_cURL) {
// Customize this cURL connection
// An empty cookie file name turns on cookie handling without reading
// cookies from disk
if ($_ODATA['sp_cookies'])
curl_setopt($_cURL, CURLOPT_COOKIEFILE, '');
// Called once per response header line; raises the flags that
// OS_fetchURL() resets before each request and checks afterwards
curl_setopt($_cURL, CURLOPT_HEADERFUNCTION, function($_cURL, $line) {
global $_RDATA;
if (preg_match('/^X-Robots-Tag:\s*(noindex|none)/i', $line))
$_RDATA['sp_robots_header'] = 1;
// A response carrying our own 'generated by' header line is one of
// our own pages
if (trim($line) == $_RDATA['x_generated_by'])
$_RDATA['sp_self_reference'] = 1;
// Report the whole line as handled
return strlen($line);
});
// Called repeatedly during a transfer; returning a non-zero value
// aborts it, which OS_fetchURL() sees as cURL error 42
curl_setopt($_cURL, CURLOPT_NOPROGRESS, false);
curl_setopt($_cURL, CURLOPT_PROGRESSFUNCTION,
function($_cURL, $dls, $dl, $uls, $ul) {
global $_ODATA, $_RDATA;
// Abort as soon as a header callback flag has been raised
if ($_RDATA['sp_robots_header']) return 1;
if ($_RDATA['sp_self_reference']) return 1;
// Prevent comparing this value until a Content-length header has
// been received by the cURL connection
// ($dls is the expected download size; the limit is in kilobytes)
if ($dls != $_RDATA['sp_prev_dls']) {
$_RDATA['sp_prev_dls'] = $dls;
if ($dls > $_ODATA['sp_limit_filesize'] * 1024) return 1;
}
// Abort when the bytes downloaded so far exceed the limit
if ($dl > $_ODATA['sp_limit_filesize'] * 1024) return 1;
// No need to download the body of a redirect or an error page
$i = curl_getinfo($_cURL);
if ($i['redirect_url']) return 1;
if ($i['http_code'] && $i['http_code'] >= 400) return 1;
return $_RDATA['sp_robots_header'];
}
);
} else OS_crawlLog('cURL functions are not enabled; cannot perform crawl', 2);
// ***** Pre-fill queue with starting URL(s) at depth 0, blank referer
// Each queue entry is array(url, depth, referer)
$_RDATA['sp_queue'] = array();
foreach ($_RDATA['sp_starting'] as $starting) {
// Starting URLs may be partial; resolve them against the install domain
$starting = OS_formatURL($starting, $_ODATA['admin_install_domain'].'/');
$_RDATA['sp_queue'][] = array($starting, 0, '');
// Add starting URLs to required URLs so the crawler cannot travel
// into parent directories
// (the starting URL is cut back to its directory first)
$_RDATA['sp_require_url'][] = preg_replace('/\/[^\/]*$/', '/', $starting);
// Only the hostnames of the starting URLs are accepted by OS_filterURL()
$host = parse_url($starting)['host'];
if (!in_array($host, $_RDATA['sp_hostnames'], true))
$_RDATA['sp_hostnames'][] = $host;
}
// ***** List of previously crawled links from the database
// 'sp_exist' maps content checksum => URL and 'sp_lastmod' maps
// URL => last modified value, both taken from the current search table
$_RDATA['sp_exist'] = array();
$_RDATA['sp_lastmod'] = array();
$crawldata = $_DDATA['pdo']->query(
'SELECT `url`, `content_checksum`, `last_modified`
FROM `'.$_DDATA['tbprefix'].'crawldata`'
);
$err = $crawldata->errorInfo();
if ($err[0] == '00000') {
foreach ($crawldata as $value) {
$_RDATA['sp_exist'][$value['content_checksum']] = $value['url'];
$_RDATA['sp_lastmod'][$value['url']] = $value['last_modified'];
}
// Not fatal: the crawl continues without the previous crawl's data
} else OS_crawlLog('Error getting list of previous URLs from crawldata table', 2);
// If the crawltemp table exists here, that means a crawl was
// interrupted without completing the shutdown function.
// Use the data from this partially completed crawl to resume it.
// ($_DDATA['tables'] is presumably the list of existing table names,
// loaded elsewhere - TODO confirm)
if (in_array($_DDATA['tbprefix'].'crawltemp', $_DDATA['tables'], true)) {
$select = $_DDATA['pdo']->query(
'SELECT `url`, `links`, `content_checksum` FROM `'.$_DDATA['tbprefix'].'crawltemp`;'
);
$err = $select->errorInfo();
if ($err[0] == '00000') {
OS_crawlLog('Previous crawl data exists; using it to resume crawling...', 1);
$select = $select->fetchAll();
OS_crawlLog('Found '.count($select).' previously crawled URLs', 1);
$_RDATA['sp_resumed'] = true;
// Run through every entry in the crawltemp table
foreach ($select as $row) {
// If an entry matches an existing URL in the queue (just
// starting URLs right now) then delete that queue entry
foreach ($_RDATA['sp_queue'] as $key => $queue)
if ($row['url'] == $queue[0])
unset($_RDATA['sp_queue'][$key]);
// Add it to the 'stored' and 'crawled links' lists
$_RDATA['sp_store'][] = $row['url'];
$_RDATA['sp_links'][] = $row['url'];
// Add the content hash to the tally
$_RDATA['sp_sha1'][$row['content_checksum']] = $row['url'];
// Rebuild the domains list
$prurl = parse_url($row['url']);
$domain = $prurl['scheme'].'://'.$prurl['host'];
if (!isset($_RDATA['sp_domains'][$domain])) {
$_RDATA['sp_domains'][$domain] = 1;
} else $_RDATA['sp_domains'][$domain]++;
// Add links from the entry to the queue
// (the links column holds a JSON encoded list of links)
$row['links'] = json_decode($row['links'], true);
foreach ($row['links'] as $link) {
$link = OS_formatURL($link, $row['url']);
// ***** If this link hasn't been crawled yet
if (!in_array($link, $_RDATA['sp_links'], true)) {
// ... and if link hasn't been queued yet
// ('continue 2' moves on to the next link of this entry)
foreach ($_RDATA['sp_queue'] as $queue)
if ($link == $queue[0]) continue 2;
// ... and if link passes our user filters, add the link to
// the queue
// (OS_filterURL() returns an empty string for an accepted URL;
// the link is queued at depth 0 with this entry as its referer)
if (!OS_filterURL($link, $row['url']))
$_RDATA['sp_queue'][] = array($link, 0, $row['url']);
}
}
}
// We couldn't select any data from the crawltemp table so delete it
} else {
$drop = $_DDATA['pdo']->query(
'DROP TABLE IF EXISTS `'.$_DDATA['tbprefix'].'crawltemp`;'
);
$err = $drop->errorInfo();
if ($err[0] != '00000') {
// If we couldn't delete the interrupted crawldata table, this is
// a fatal error
OS_crawlLog('Could not delete previously interrupted crawl data; unable to crawl.', 2);
throw new Exception('Could not delete previously interrupted crawl data; unable to crawl.');
}
}
}
// Create a temp MySQL storage table using schema of the existing table
// (a crawltemp table kept from a resumed crawl is left as it is)
$create = $_DDATA['pdo']->query(
'CREATE TABLE IF NOT EXISTS`'.$_DDATA['tbprefix'].'crawltemp`
LIKE `'.$_DDATA['tbprefix'].'crawldata`;'
);
$err = $create->errorInfo();
if ($err[0] != '00000') {
// If we could not create the crawldata table, or an interrupted
// crawldata table doesn't exist, then this is a fatal error
// (the exception is not caught here, so the script ends)
OS_crawlLog('Unable to create or reuse existing crawl data table; unable to crawl.', 2);
throw new Exception('Unable to create or reuse existing crawl data table; unable to crawl.');
}
// Prepare SQL statements
// Each statement is prepared once here; they are presumably executed
// from the crawl loop further down - TODO confirm
// Look up the previous crawl's record for a URL in the search table
$selectData = $_DDATA['pdo']->prepare(
'SELECT `url`, `category`, `links`, `content_checksum`, `last_modified`,
`flag_updated`, `flag_unlisted`, `priority`
FROM `'.$_DDATA['tbprefix'].'crawldata` WHERE `url`=:url;'
);
// Change the URL of the temp table entry with a given content checksum
$updateURL = $_DDATA['pdo']->prepare(
'UPDATE `'.$_DDATA['tbprefix'].'crawltemp` SET
`url`=:url WHERE `content_checksum`=:content_checksum;'
);
// Store a page in the temp table; `url_sort` starts at 0 and is
// filled in by OS_crawlCleanUp() once the crawl completes
$insertTemp = $_DDATA['pdo']->prepare(
'INSERT INTO `'.$_DDATA['tbprefix'].'crawltemp` SET
`url`=:url,
`url_sort`=0,
`title`=:title,
`description`=:description,
`keywords`=:keywords,
`category`=:category,
`weighted`=:weighted,
`links`=:links,
`content`=:content,
`content_mime`=:content_mime,
`content_charset`=:content_charset,
`content_checksum`=:content_checksum,
`status`=:status,
`flag_unlisted`=:flag_unlisted,
`flag_updated`=:flag_updated,
`last_modified`=:last_modified,
`priority`=:priority
;'
);
// Copy a URL's existing record from the search table into the temp
// table, replacing any temp entry for it, with a new `status` and
// with `url_sort` and `flag_updated` set to 0
$insertNotModified = $_DDATA['pdo']->prepare(
'REPLACE INTO `'.$_DDATA['tbprefix'].'crawltemp` (
`url`, `url_sort`, `title`, `description`, `keywords`, `category`,
`weighted`, `links`, `content`, `content_mime`, `content_charset`,
`content_checksum`, `status`, `flag_unlisted`, `flag_updated`,
`last_modified`, `priority`
) SELECT
`url`, 0, `title`, `description`, `keywords`, `category`,
`weighted`, `links`, `content`, `content_mime`, `content_charset`,
`content_checksum`, :status, `flag_unlisted`, 0,
`last_modified`, `priority`
FROM `'.$_DDATA['tbprefix'].'crawldata` WHERE `url`=:url;'
);
// ***** Begin crawling URLs from the queue
while ($_cURL && count($_RDATA['sp_queue'])) {
// Check if we have run out of execution time
if ($_ODATA['sp_time_start'] + $_ODATA['sp_timeout_crawl'] <= time()) {
OS_crawlLog('Maximum script runtime ('.$_ODATA['sp_timeout_crawl'].'s) reached', 2);
break;
}
// Check if user has canceled the crawl
if (OS_getValue('sp_cancel')) {
OS_crawlLog('Crawl canceled manually by user', 2);
break;
}
// Check if we have exceeded the maximum number of crawled links
if (count($_RDATA['sp_links']) > $_ODATA['sp_limit_crawl']) {
OS_crawlLog('Maximum number of crawled pages exceeded', 2);
break;
}
// Retrieve next link to crawl from the queue
list($url, $depth, $referer) = array_shift($_RDATA['sp_queue']);
$_RDATA['sp_links'][] = $url;
// Check if URL is beyond the depth limit
if ($depth > $_ODATA['sp_limit_depth']) {
OS_crawlLog('Maximum link depth ('.$_ODATA['sp_limit_depth'].') exceeded; URL at depth '.$depth.' was not stored: '.$url, 2);
continue;
}
// Check robots.txt for newly encountered hostnames. Each hostname is
// fetched at most once per crawl; the resulting list of disallowed URLs is
// cached in $_RDATA['sp_robots'][<host>]
$purl = parse_url($url);
$port = (!empty($purl['port'])) ? ':'.$purl['port'] : '';
if (!isset($_RDATA['sp_robots'][$purl['host']])) {
  $_RDATA['sp_robots'][$purl['host']] = array();
  OS_crawlLog('Fetching robots.txt for domain: '.$purl['host'], 1);

  // robots.txt is always fetched unconditionally (no If-Modified-Since)
  curl_setopt($_cURL, CURLOPT_TIMECONDITION, CURL_TIMECOND_NONE);
  $robotstxt = OS_fetchURL($purl['scheme'].'://'.$purl['host'].$port.'/robots.txt', '');
  if (!$robotstxt['errno']) {

    // NOTE(review): the rest of the crawl loop reads the raw response text
    // from 'body' and only fills 'content' after MIME-specific parsing, so
    // 'content' may be empty here; fall back to 'body' when it is. Confirm
    // against OS_fetchURL()
    $robotsText = '';
    if (!empty($robotstxt['content'])) {
      $robotsText = $robotstxt['content'];
    } else if (!empty($robotstxt['body'])) {
      $robotsText = $robotstxt['body'];
    }

    $robots = array();
    $robot = '';
    // Split on any line ending so CRLF files don't leave a stray "\r" on
    // the end of each value
    foreach (preg_split('/\r\n|\r|\n/', $robotsText) as $line) {

      // '#' starts a comment; drop it, then ignore blank lines
      $line = trim(preg_replace('/#.*$/', '', $line));
      if ($line === '') continue;

      if (preg_match('/^user-agent\s*:\s*(.*)$/i', $line, $r)) {
        $robot = $r[1];
        if (empty($robots[$robot]))
          $robots[$robot] = array('disallow' => array(), 'allow' => array());

      // Anchored so only real Allow/Disallow directives match
      } else if (preg_match('/^((?:dis)?allow)\s*:\s*(.*)$/i', $line, $r)) {

        // An empty value places no restriction, so it is not a rule
        if ($r[2] === '') continue;

        if (!isset($robots[$robot]))
          $robots[$robot] = array('disallow' => array(), 'allow' => array());
        $robots[$robot][strtolower($r[1])][] = OS_formatURL($r[2], $url);
      }
    }

    // Apply the rule groups addressed to this crawler or to all crawlers
    foreach ($robots as $agent => $rules) {
      // Array keys that look like integers come back as int; compare as text
      $agent = (string)$agent;
      if ($agent === '*' || preg_match('/^orc(a|inus)(-?php)?-?crawler$/i', $agent)) {
        foreach ($rules['disallow'] as $disrule)
          if (!in_array($disrule, $_RDATA['sp_robots'][$purl['host']], true))
            $_RDATA['sp_robots'][$purl['host']][] = $disrule;

        // An 'allow' rule cancels an identical 'disallow' rule
        foreach ($rules['allow'] as $rule) {
          $key = array_search($rule, $_RDATA['sp_robots'][$purl['host']], true);
          if ($key !== false) unset($_RDATA['sp_robots'][$purl['host']][$key]);
        }
      }
    }
  }
}
// In debug mode, report memory usage before every fetch
if ($_RDATA['debug']) {
  OS_crawlLog('Memory used: '.OS_readSize(memory_get_usage(true)), 1);
}
OS_crawlLog('Crawling: '.$url.' (Depth: '.$depth.')', 1);

// Publish progress as [links crawled, links crawled + links queued,
// resumed flag] and refresh the crawl end time
$linksCrawled = count($_RDATA['sp_links']);
$linksKnown = $linksCrawled + count($_RDATA['sp_queue']);
OS_setValue('sp_progress', array($linksCrawled, $linksKnown, $_RDATA['sp_resumed']));
OS_setValue('sp_time_end', time());

// Send If-Modified-Since only when the option is enabled and a previous
// modification time is known for this URL
$sendIfModifiedSince = $_ODATA['sp_ifmodifiedsince'] && isset($_RDATA['sp_lastmod'][$url]);
if ($sendIfModifiedSince) {
  curl_setopt($_cURL, CURLOPT_TIMEVALUE, $_RDATA['sp_lastmod'][$url]);
}
curl_setopt(
  $_cURL,
  CURLOPT_TIMECONDITION,
  ($sendIfModifiedSince) ? CURL_TIMECOND_IFMODSINCE : CURL_TIMECOND_NONE
);

// Fetch the URL
$data = OS_fetchURL($url, $referer);

// Add this fetch to the running totals of bytes downloaded and cURL time
$bytesTransferred = $_ODATA['sp_data_transferred'] + $data['info']['size_download'];
OS_setValue('sp_data_transferred', $bytesTransferred);
$_RDATA['sp_time_curl'] += $data['info']['total_time'];
// If there were cURL errors while fetching this URL
if ($data['errno']) {
// Else if the page hasn't been modified since the last crawl
} else if ($data['info']['http_code'] == 304) {
$data['info']['noindex'] = 'not-modified-304';
// Else if we received any content at all
} else if (trim($data['body'])) {
// Get a 20-byte binary hash of the raw content
$data['info']['sha1'] = sha1($data['body'], true);
// If this content does not duplicate previously stored content
if (empty($_RDATA['sp_sha1'][$data['info']['sha1']])) {
// Add the content hash to the tally
$_RDATA['sp_sha1'][$data['info']['sha1']] = $url;
// If this is a new page, or an existing page but the content
// hash has changed
if (!isset($_RDATA['sp_exist'][$data['info']['sha1']]) ||
$_RDATA['sp_exist'][$data['info']['sha1']] != $url) {
// With no Content-Type value, treat the response as plain text
// (TODO from the original author: detect the MIME-type from the extension?)
if (empty($data['info']['content_type']))
  $data['info']['content_type'] = 'text/plain';

// Parse MIME-type. MIME types are case-insensitive but the parser switch
// that follows compares exact lower-case strings, so normalise here
$data['info']['mime_type'] = '';
if (preg_match('/\w+\/[\w.+-]+/', $data['info']['content_type'], $m))
  $data['info']['mime_type'] = strtolower($m[0]);

// Parse Character Encoding; the value may be quoted (charset="utf-8")
$data['info']['charset'] = '';
if (preg_match('/charset=["\']?([\w.:-]+)/i', $data['info']['content_type'], $m))
  $data['info']['charset'] = $m[1];
if (!$data['info']['charset'])
  $data['info']['charset'] = 'ISO-8859-1';

// GZ-Unzip the content if necessary; repeats for multiply-compressed bodies
while (strpos($data['body'], "\x1f\x8b") === 0) {

  // gzdecode() understands the optional gzip header fields; the raw
  // inflate of everything after a fixed 10-byte header is kept as a
  // fallback for streams that gzdecode() rejects
  $inflated = gzdecode($data['body']);
  if ($inflated === false)
    $inflated = gzinflate(substr($data['body'], 10));

  // Never replace the body with a failed (false) result
  if ($inflated === false) {
    OS_crawlLog('Could not decompress gzip-encoded content; parsing it as received: '.$url, 0);
    break;
  }
  $data['body'] = $inflated;
}

// Title defaults to filename
$data['title'] = basename($data['info']['url']);
// Determine how to parse the content by MIME-type
switch ($data['info']['mime_type']) {
/* ***** PLAIN TEXT ************************************** */
case 'text/plain':
  // Plain text is stored as received, converted to clean UTF-8
  $data['content'] = $data['body'];
  OS_cleanTextUTF8($data['content'], $data['info']['charset']);
  break;

/* ***** XML DOCUMENT ************************************ */
case 'text/xml':
case 'application/xml':
  // Turn <br> tags into spaces so adjoining words don't run together
  $data['body'] = preg_replace('/<br(\s?\/)?>/', ' ', $data['body']);

  $document = new DOMDocument();
  if ($document->loadXML($data['body'], LIBXML_PARSEHUGE | LIBXML_BIGLINES | LIBXML_COMPACT)) {

    // Remove <script> elements. getElementsByTagName() returns a live
    // list, so removing nodes while iterating it directly skips elements;
    // iterate over a snapshot instead
    foreach (iterator_to_array($document->getElementsByTagName('script')) as $script)
      $script->parentNode->removeChild($script);

    // Remove <!-- comments -->
    $xpath = new DOMXpath($document);
    foreach ($xpath->query('//comment()') as $comment)
      $comment->parentNode->removeChild($comment);

    // Check XML document charset
    $xmlEncoding = (string)$document->xmlEncoding;
    if (strtolower($data['info']['charset']) != strtolower($xmlEncoding)) {
      OS_crawlLog('Charset in Content-type header ('.(($data['info']['charset']) ? $data['info']['charset'] : '<none>').') differs from document charset ('.(($xmlEncoding) ? $xmlEncoding : '<none>').') at: '.$data['info']['url'], 1);
      // XML with no encoding declaration defaults to UTF-8
      $data['info']['charset'] = ($xmlEncoding) ? $xmlEncoding : 'UTF-8';
    }

    // The DOM extension returns all text as UTF-8 whatever the source
    // encoding was, so it must not be converted from the document charset
    // a second time
    $data['content'] = $document->textContent;
    $textCharset = 'UTF-8';

  // Could not parse XML; try to store content anyway
  } else {
    $data['error'] = 'Invalid XML - could not parse content; storing as-is';
    $data['info']['nofollow'] = true;

    // Remove <script> elements and <!-- comments -->
    $data['content'] = preg_replace(array('/<!--.*?-->/s', '/<script.*?\/script>/is'), '', $data['body']);
    $data['content'] = strip_tags($data['content']);

    // The raw body is still in the document's own encoding
    $textCharset = $data['info']['charset'];
  }
  OS_cleanTextUTF8($data['content'], $textCharset, ENT_XML1);
  break;
/* ***** HTML DOCUMENT *********************************** */
case 'text/html':
case 'application/xhtml+xml':
$data['body'] = preg_replace('/<br(\s?\/)?>/', ' ', $data['body']);
$document = new DOMDocument();
if ($document->loadHTML($data['body'], LIBXML_PARSEHUGE | LIBXML_BIGLINES | LIBXML_COMPACT | LIBXML_NOCDATA)) {
// Remove <script> elements. getElementsByTagName() returns a live list, so
// removing nodes while iterating it directly skips every other element;
// iterate over a snapshot instead
$scripts = iterator_to_array($document->getElementsByTagName('script'));
foreach ($scripts as $script)
  $script->parentNode->removeChild($script);

// Remove <!-- comments -->
$xpath = new DOMXpath($document);
$comments = $xpath->query('//comment()');
foreach ($comments as $comment)
  $comment->parentNode->removeChild($comment);
// ***** Process <head> elements
$head = $document->getElementsByTagName('head');
if (!empty($head[0])) {

  // A non-empty <base href> replaces the base URL held in $data['base']
  $base = $head[0]->getElementsByTagName('base');
  if (!empty($base[0]))
    for ($x = 0; $x < count($base[0]->attributes); $x++)
      if (strtolower($base[0]->attributes[$x]->name) == 'href')
        if (!empty($base[0]->attributes[$x]->value))
          $data['base'] = filter_var($base[0]->attributes[$x]->value, FILTER_SANITIZE_URL);

  // <meta> elements: charset, http-equiv (refresh / content-type) and
  // name (keywords / description / robots directives)
  $metas = $head[0]->getElementsByTagName('meta');
  foreach ($metas as $meta) {
    for ($x = 0; $x < count($meta->attributes); $x++) {

      // <meta charset="...">
      if (strtolower($meta->attributes[$x]->name) == 'charset') {
        if (strtolower($data['info']['charset']) != strtolower($meta->attributes[$x]->value)) {
          OS_crawlLog('Charset in Content-type header ('.(($data['info']['charset']) ? $data['info']['charset'] : '<none>').') differs from document charset ('.(($meta->attributes[$x]->value) ? $meta->attributes[$x]->value : '<none>').') at: '.$data['info']['url'], 1);
          $data['info']['charset'] = $meta->attributes[$x]->value;
        }

      // <meta http-equiv="..." content="...">
      } else if (strtolower($meta->attributes[$x]->name) == 'http-equiv') {
        switch (strtolower($meta->attributes[$x]->value)) {
          case 'refresh':
            for ($y = 0; $y < count($meta->attributes); $y++) {
              if (strtolower($meta->attributes[$y]->name) == 'content') {
                // The quote around the URL is optional: the usual form is
                // content="0; url=http://example.com/" with no inner quotes
                if (preg_match('/(\d+)\s?;\s?url\s?=\s?([\'"]?)(.+?)\2?\s?$/i', $meta->attributes[$y]->value, $m)) {
                  // A refresh that fires within the URL timeout is
                  // treated as a redirect; a slower one is just a link
                  if ((int)$m[1] <= $_ODATA['sp_timeout_url']) {
                    $data['errno'] = 300;
                    $data['error'] = 'Redirected by <meta> element to: '.$m[3];
                    $data['info']['redirect_url'] = $m[3];
                    $data['info']['noindex'] = 'redirect-meta';
                    $data['info']['nofollow'] = true;
                    // Leave the attribute loops, the switch and the
                    // <meta> loop: no further <meta> elements are read
                    break 4;
                  } else $data['links'][] = $m[3];
                }
              }
            }
            break;

          case 'content-type':
            for ($y = 0; $y < count($meta->attributes); $y++) {
              if (strtolower($meta->attributes[$y]->name) == 'content' && preg_match('/charset=([\w\d.:-]+)/i', $meta->attributes[$y]->value, $m)) {
                if (strtolower($data['info']['charset']) != strtolower($m[1])) {
                  OS_crawlLog('Charset in Content-type header ('.(($data['info']['charset']) ? $data['info']['charset'] : '<none>').') differs from document charset ('.(($m[1]) ? $m[1] : '<none>').') at: '.$data['info']['url'], 1);
                  $data['info']['charset'] = $m[1];
                }
              }
            }
        }

      // <meta name="..." content="...">
      } else if (strtolower($meta->attributes[$x]->name) == 'name') {
        switch (strtolower($meta->attributes[$x]->value)) {
          case 'keywords':
            for ($y = 0; $y < count($meta->attributes); $y++)
              if (strtolower($meta->attributes[$y]->name) == 'content')
                $data['keywords'] = $meta->attributes[$y]->value;
            break;

          case 'description':
            for ($y = 0; $y < count($meta->attributes); $y++)
              if (strtolower($meta->attributes[$y]->name) == 'content')
                $data['description'] = $meta->attributes[$y]->value;
            break;

          // Generic robots directives and those addressed to this crawler
          case 'robots':
          case 'orcacrawler':
          case 'orcaphpcrawler':
          case 'orca-crawler':
          case 'orcaphp-crawler':
          case 'orca-phpcrawler':
          case 'orca-php-crawler':
          case 'orcinuscrawler':
          case 'orcinus-crawler':
            for ($y = 0; $y < count($meta->attributes); $y++) {
              if (strtolower($meta->attributes[$y]->name) == 'content') {
                $content = explode(',', $meta->attributes[$y]->value);
                foreach ($content as $con) {
                  switch (trim(strtolower($con))) {
                    case 'nofollow':
                      $data['info']['nofollow'] = true;
                      break;
                    case 'noindex':
                      $data['error'] = 'Not indexed due to robots <meta> element';
                      $data['info']['noindex'] = 'robots-meta';
                  }
                }
              }
            }
        }
      }
    }
  }

  // Keep the filename default title when the page has no <title> element
  $title = $head[0]->getElementsByTagName('title');
  if (!empty($title[0]))
    $data['title'] = $title[0]->textContent;

  // <link rel="..." href="..."> elements
  $links = $head[0]->getElementsByTagName('link');
  foreach ($links as $link) {
    for ($x = 0; $x < count($link->attributes); $x++) {
      if (strtolower($link->attributes[$x]->name) == 'rel') {
        for ($y = 0; $y < count($link->attributes); $y++) {
          if (strtolower($link->attributes[$y]->name) == 'href') {
            $linkurl = filter_var($link->attributes[$y]->value, FILTER_SANITIZE_URL);
            switch (strtolower($link->attributes[$x]->value)) {
              case 'canonical':
                // A canonical URL other than this page's own URL means
                // this page should not be indexed
                if (OS_formatURL($linkurl, $data['base']) != $data['info']['url']) {
                  $data['info']['noindex'] = 'non-canonical';
                  $data['info']['canonical'] = $linkurl;
                }
                // No break: the canonical URL is also added as a link

              case 'alternate':
              case 'author':
              case 'help':
              case 'license':
              case 'me':
              case 'next':
              case 'prev':
              case 'search':
                $data['links'][] = $linkurl;
            }
            // Only the first href attribute is used
            break;
          }
        }
      }
    }
  }
}
// ***** Process <body> elements
$body = $document->getElementsByTagName('body');
if (!empty($body[0])) {

  // Replace <img> tags with their alt text. getElementsByTagName() returns
  // a live list, so replacing nodes while iterating it directly would skip
  // the image after each replaced one; iterate over a snapshot instead
  $imgs = iterator_to_array($body[0]->getElementsByTagName('img'));
  foreach ($imgs as $img) {
    for ($x = 0; $x < count($img->attributes); $x++) {
      if (strtolower($img->attributes[$x]->name) == 'alt') {
        $img->parentNode->replaceChild(
          $document->createTextNode(' '.$img->attributes[$x]->value.' '),
          $img
        );
        break;
      }
    }
  }

  // Collect <a href> targets. rel holds a space-separated token list, so
  // 'nofollow' is honoured wherever it appears (e.g. "nofollow noopener")
  $as = $body[0]->getElementsByTagName('a');
  foreach ($as as $a) {
    for ($x = 0; $x < count($a->attributes); $x++) {
      if (strtolower($a->attributes[$x]->name) == 'href') {
        for ($y = 0; $y < count($a->attributes); $y++)
          if (strtolower($a->attributes[$y]->name) == 'rel' && preg_match('/(^|\s)nofollow(\s|$)/i', $a->attributes[$y]->value)) continue 3;
        $data['links'][] = $a->attributes[$x]->value;
      }
    }
  }

  // Collect <area href> targets under the same rel="nofollow" rule
  $areas = $body[0]->getElementsByTagName('area');
  foreach ($areas as $area) {
    for ($x = 0; $x < count($area->attributes); $x++) {
      if (strtolower($area->attributes[$x]->name) == 'href') {
        for ($y = 0; $y < count($area->attributes); $y++)
          if (strtolower($area->attributes[$y]->name) == 'rel' && preg_match('/(^|\s)nofollow(\s|$)/i', $area->attributes[$y]->value)) continue 3;
        $data['links'][] = $area->attributes[$x]->value;
      }
    }
  }

  // Collect <frame src> and <iframe src> targets
  $frames = $body[0]->getElementsByTagName('frame');
  foreach ($frames as $frame)
    for ($x = 0; $x < count($frame->attributes); $x++)
      if (strtolower($frame->attributes[$x]->name) == 'src')
        $data['links'][] = $frame->attributes[$x]->value;

  $iframes = $body[0]->getElementsByTagName('iframe');
  foreach ($iframes as $iframe)
    for ($x = 0; $x < count($iframe->attributes); $x++)
      if (strtolower($iframe->attributes[$x]->name) == 'src')
        $data['links'][] = $iframe->attributes[$x]->value;
}

// Blank out links with a scheme the crawler cannot fetch and strip #fragments
$data['links'] = array_map(function($l) {
  if (preg_match('/^(tel|telnet|mailto|ftp|sftp|ssh|gopher|news|ldap|urn|onion|magnet):/i', $l)) return '';
  return preg_replace('/#.*$/', '', $l);
}, $data['links']);

// Drop duplicates and the blanked-out entries
$data['links'] = array_filter(array_unique($data['links']));
// Remove tags matched by the 'ignore' selectors. Each selector is an
// #id, a .class or a bare tag name
foreach ($_RDATA['sp_ignore_css'] as $ignoreCSS) {
  // An empty selector matches nothing and has no first character to test
  if ($ignoreCSS === '') continue;

  switch ($ignoreCSS[0]) {
    case '#': // Remove by ID
      $id = $document->getElementById(substr($ignoreCSS, 1));
      if (!is_null($id)) $id->parentNode->removeChild($id);
      break;

    case '.': // Remove by class
      foreach ($xpath->evaluate('//*[contains(concat(" ", normalize-space(@class), " "), " '.substr($ignoreCSS, 1).' ")]') as $cls)
        $cls->parentNode->removeChild($cls);
      break;

    default: // Remove by tag name
      // getElementsByTagName() returns a live list, so removing nodes
      // while iterating it directly skips every other element; iterate
      // over a snapshot instead
      $tags = iterator_to_array($document->getElementsByTagName($ignoreCSS));
      foreach ($tags as $tag)
        $tag->parentNode->removeChild($tag);
  }
}

// Weighted elements: append the text of every element matched by the
// 'weight' selectors to $data['weighted']
foreach ($_RDATA['s_weight_css'] as $weightCSS) {
  if ($weightCSS === '') continue;

  switch ($weightCSS[0]) {
    case '#': // Get content by ID
      $id = $document->getElementById(substr($weightCSS, 1));
      if (!is_null($id)) $data['weighted'] .= $id->textContent.' ';
      break;

    case '.': // Get content by class
      foreach ($xpath->evaluate('//*[contains(concat(" ", normalize-space(@class), " "), " '.substr($weightCSS, 1).' ")]') as $cls)
        $data['weighted'] .= $cls->textContent.' ';
      break;

    default: // Get content by tag name
      $tags = $document->getElementsByTagName($weightCSS);
      foreach ($tags as $tag)
        $data['weighted'] .= $tag->textContent.' ';
  }
}
$data['content'] = $document->textContent;
// Could not parse HTML; try to store content anyway
} else {
$data['error'] = 'Invalid HTML - could not parse content; storing as-is';
$data['info']['nofollow'] = true;
// Remove <script> elements and <!-- comments -->
$data['content'] = preg_replace(array('/<!--.*?-->/s', '/<script.*?\/script>/is'), '', $data['body']);
$data['content'] = strip_tags($data['content']);
}
// Pick the entity table handed to OS_cleanTextUTF8(): XHTML for the
// application/xhtml+xml MIME type or an XHTML doctype, HTML 4.01 for an
// HTML doctype, XML for any other public doctype, and HTML5 when the
// document has no public doctype at all
$ent = ENT_HTML5;
if ($data['info']['mime_type'] == 'application/xhtml+xml') {
  $ent = ENT_XHTML;
} else if (!empty($document->doctype->publicId)) {
  $publicId = strtoupper($document->doctype->publicId);
  $ent = ENT_XML1;
  if (strpos($publicId, 'DTD XHTML') !== false) {
    $ent = ENT_XHTML;
  } else if (strpos($publicId, 'DTD HTML') !== false) {
    $ent = ENT_HTML401;
  }
}

// Convert every extracted text field to clean UTF-8
foreach (array('title', 'keywords', 'description', 'weighted', 'content') as $textField)
  OS_cleanTextUTF8($data[$textField], $data['info']['charset'], $ent);
break;
/* ***** PDF ********************************************* */
case 'application/pdf':
  // $_PDF is falsy when no PDF parser is available
  if ($_PDF) {
    try {
      $pdf = $_PDF->parseContent($data['body']);
      $metadata = $pdf->getDetails();

      // Prefer regular PDF metadata first, then try XMP
      $getItems = array(
        'title' => array('Title', 'dc:title', 'pdf:title'),
        'description' => array('Subject', 'dc:description', 'pdf:subject'),
        'keywords' => array('Keywords', 'dc:subject', 'pdf:keywords'),
        'modified' => array('SourceModified', 'pdfx:sourcemodified', 'CreationDate', 'xmp:createdate')
      );
      foreach ($getItems as $key => $item) {
        foreach ($item as $opt) {
          if (!empty($metadata[$opt])) {

            // Check if this is an array of list-items and if
            // so, convert it to a comma-separated string
            if (is_array($metadata[$opt]) && isset($metadata[$opt][0]) && is_string($metadata[$opt][0]))
              $metadata[$opt] = implode(', ', $metadata[$opt]);

            // Use the first valid string value we find as
            // the appropriate property value
            if (is_string($metadata[$opt]) && trim($metadata[$opt])) {
              $data[$key] = $metadata[$opt];
              break;
            }
          }
        }
      }

      // Extract the text and convert it to clean UTF-8, falling back to
      // CP1252 when the encoding cannot be detected
      $data['content'] = $pdf->getText();
      $data['info']['charset'] = mb_detect_encoding($data['content']);
      if (!$data['info']['charset']) $data['info']['charset'] = 'CP1252';
      OS_cleanTextUTF8($data['content'], $data['info']['charset']);

      // Use the PDF's own modification / creation date when it parses
      if (!empty($data['modified']))
        if ($stamp = strtotime($data['modified']))
          $data['info']['filetime'] = $stamp;

      if ($data['content']) {

        // Discard the PDF text if it contains Unicode control
        // characters; some of these might be simple PDF ligatures
        // but PDFParser doesn't support them; any content that
        // contains these is usually mostly gobbledegook
        if (strpos($data['content'], "\u{3}") === false &&
            strpos($data['content'], "\u{2}") === false &&
            strpos($data['content'], "\u{1}") === false) {
          OS_cleanTextUTF8($data['title'], mb_detect_encoding($data['title']));
          OS_cleanTextUTF8($data['keywords'], mb_detect_encoding($data['keywords']));
          OS_cleanTextUTF8($data['description'], mb_detect_encoding($data['description']));

        } else {
          $data['errno'] = 703;
          $data['error'] = 'Failed to decode PDF text';
          $data['content'] = '';
          $data['info']['noindex'] = 'couldnt-decode-pdf';
        }

      } else {
        $data['errno'] = 702;
        $data['error'] = 'PDF is empty of extractable text';
        $data['info']['noindex'] = 'empty-pdf';
      }

    // Catch Throwable rather than Exception: an Error (e.g. a TypeError)
    // raised while parsing a malformed PDF would otherwise go uncaught and
    // abort the whole crawl instead of just skipping this one file
    } catch (Throwable $e) {
      $data['errno'] = 701;
      $data['error'] = 'PDF is secured/encrypted; text extraction failed';
      $data['content'] = '';
      $data['info']['noindex'] = 'secured-pdf';
    }

  } else $data['info']['noindex'] = 'missing-pdfparser';
  break;

/* ***** Unknown MIME-type ******************************* */
default:
  $data['error'] = 'Not indexed due to unknown MIME type ('.$data['info']['mime_type'].')';
  $data['info']['noindex'] = 'unknown-mime';
}
// Else content is identical to the old entry so don't parse
} else {
$data['info']['noindex'] = 'not-modified-sha1';
}
// Else content is a duplicate of a previously stored page
} else {
// Update the stored URL to the shortest version
if (strlen($url) < strlen($_RDATA['sp_sha1'][$data['info']['sha1']])) {
$updateURL->execute(array(
'url' => $url,
'content_checksum' => $data['info']['sha1']
));
}
$data['info']['noindex'] = 'duplicate';
}
// Else the 'body' of the response was empty
} else {
$data['error'] = 'Server returned no content';
$data['info']['noindex'] = 'empty';
}
// Decide whether or not to 'index' / store this page
switch ($data['info']['noindex']) {
// ***** There is no 'noindex' reason, so store the page
case '':
case 'not-modified-304':
case 'not-modified-sha1':
// Pages queued by the orphan check are stored with status 'Orphan'; all
// others are stored as 'OK'
if ($referer == '<orphan>') {
  $data['info']['status'] = 'Orphan';
  $_RDATA['sp_status']['Orphan']++;
} else $data['info']['status'] = 'OK';

// ***** If we got new or updated content for this URL
if (!$data['info']['noindex']) {

  // Values used when there is no previous row to inherit from
  $rowDefaults = array(
    'category' => $_ODATA['sp_category_default'],
    'flag_unlisted' => 0,
    'priority' => 0.5
  );

  // If this URL exists (or existed) in the live table...
  if (in_array($url, $_RDATA['sp_exist'], true) || $referer == '<orphan>') {
    $_RDATA['sp_status']['Updated']++;

    $selectData->execute(array('url' => $url));
    $err = $selectData->errorInfo();
    if ($err[0] != '00000') {
      OS_crawlLog('Database select error: '.$url, 2);
      OS_crawlLog($err[2], 0);
      // A failed select ends the crawl: leave the switch and the loop
      break 2;
    }

    // The row can be missing even without a database error (e.g. it was
    // deleted after the list of existing URLs was loaded); use the
    // defaults rather than reading index 0 of an empty result
    $rows = $selectData->fetchAll();
    $row = (!empty($rows[0])) ? $rows[0] : $rowDefaults;

  // Else provide default values for a new URL
  } else {
    $_RDATA['sp_status']['New']++;
    $row = $rowDefaults;
  }

  // Without a usable modification time, use the current time
  if ($data['info']['filetime'] <= 0)
    $data['info']['filetime'] = time();

  // Remove text from titles; entries starting with '*' are regular
  // expressions, all others are literal strings
  foreach ($_RDATA['sp_title_strip'] as $titleStrip) {
    if ($titleStrip[0] == '*') {
      $data['title'] = preg_replace('/'.str_replace('/', '\/', substr($titleStrip, 1)).'/', '', $data['title']);
    } else $data['title'] = str_replace($titleStrip, '', $data['title']);
  }

  // NOTE(review): this value is not read again before $port is reassigned
  // at the top of the crawl loop
  $port = (!empty($data['url']['port'])) ? ':'.$data['url']['port'] : '';

  $insertTemp->execute(array(
    'url' => $url,
    'title' => trim($data['title']),
    'description' => $data['description'],
    'keywords' => $data['keywords'],
    'category' => $row['category'],
    'weighted' => $data['weighted'],
    'links' => json_encode($data['links'], JSON_INVALID_UTF8_IGNORE),
    'content' => $data['content'],
    'content_mime' => $data['info']['mime_type'],
    'content_charset' => $data['info']['charset'],
    'content_checksum' => $data['info']['sha1'],
    'status' => $data['info']['status'],
    'flag_unlisted' => $row['flag_unlisted'],
    'flag_updated' => 1,
    'last_modified' => $data['info']['filetime'],
    'priority' => $row['priority']
  ));
  if (!$insertTemp->rowCount()) {
    OS_crawlLog('Database primary insert error: '.$url, 2);
    $err = $insertTemp->errorInfo();
    if ($err[0] != '00000') OS_crawlLog($err[2], 0);
  } else $_RDATA['sp_store'][] = $url;

// ***** URL hasn't been modified since the last successful crawl
} else {
  OS_crawlLog('Page hasn\'t been modified since the last successful crawl', 0);

  // Preset the 'last_modified' time and 'priority' until we can
  // find out the actual values from the previous database record
  $data['info']['filetime'] = time();
  $row = array('priority' => 0.5);

  // Get previous entry from existing search database
  $insertNotModified->execute(array(
    'url' => $url,
    'status' => $data['info']['status']
  ));
  if ($insertNotModified->rowCount()) {

    // Mark as 'stored'
    $_RDATA['sp_store'][] = $url;

    // Get 'priority' & 'last_modified' values for the sitemap
    // Load the previously saved link list to add to the queue
    $selectData->execute(array('url' => $url));
    $err = $selectData->errorInfo();
    $rows = ($err[0] == '00000') ? $selectData->fetchAll() : array();
    if (!empty($rows[0])) {
      $row = $rows[0];

      // The stored link list may be empty or undecodable; the link
      // queueing step later in the loop requires an array
      $storedLinks = json_decode($row['links'], true);
      $data['links'] = (is_array($storedLinks)) ? $storedLinks : array();
      $data['info']['filetime'] = $row['last_modified'];

    } else OS_crawlLog('Database existing table row read error: '.$url, 2);

  // Could not insert previously stored row into temp table
  } else {
    OS_crawlLog('Database \'not-modified\' insert error: '.$url, 2);
    $err = $insertNotModified->errorInfo();
    if ($err[0] != '00000') OS_crawlLog($err[2], 0);
  }
}
// Count this page against its scheme://host
$domain = $data['url']['scheme'].'://'.$data['url']['host'];
$_RDATA['sp_domains'][$domain] =
  (isset($_RDATA['sp_domains'][$domain])) ? $_RDATA['sp_domains'][$domain] + 1 : 1;

// Store data for use in the sitemap, but only for pages on the sitemap's
// own hostname
$onSitemapHost = $data['url']['host'] == $_ODATA['sp_sitemap_hostname'];
if ($_ODATA['sp_sitemap_file'] && $onSitemapHost) {

  // Derive the change frequency from the page's age in seconds
  $delta = time() - $data['info']['filetime'];
  if ($delta > 62400000) {
    $cf = 'never';
  } else if ($delta > 21600000) {
    $cf = 'yearly';
  } else if ($delta > 2160000) {
    $cf = 'monthly';
  } else if ($delta > 432000) {
    $cf = 'weekly';
  } else if ($delta > 64800) {
    $cf = 'daily';
  } else if ($delta > 2700) {
    $cf = 'hourly';
  } else {
    $cf = 'always';
  }

  $sitemapLoc = str_replace(' ', '%20', htmlentities($url));
  $sitemapLastmod = date('Y-m-d', $data['info']['filetime']);
  $_RDATA['sp_sitemap'][] = array(
    'loc' => $sitemapLoc,
    'lastmod' => $sitemapLastmod,
    'changefreq' => $cf,
    'priority' => $row['priority']
  );
}
break;
// ***** Otherwise, log the reason why this page was not stored
// In several cases below, a URL that was queued by the orphan check
// (referer '<orphan>') also increments an 'sp_status' counter
// Content hash matches a page already seen during this crawl
case 'duplicate':
OS_crawlLog('Content is a duplicate of already indexed page: '.$_RDATA['sp_sha1'][$data['info']['sha1']].' (Referrer was: '.$referer.')', 2);
break;
// Connection-level failures; logged as errors together with the URL
case 'timeout':
case 'network-error':
case 'couldnt-connect':
OS_crawlLog($data['error'].': '.$url, 2);
if ($referer == '<orphan>') $_RDATA['sp_status']['Blocked']++;
break;
// Content-level reasons; only the prepared $data['error'] text is logged,
// as a message rather than an error
case 'empty':
case 'too-large':
case 'robots-meta':
case 'robots-http':
case 'unknown-mime':
case 'self-reference':
case 'empty-pdf':
case 'secured-pdf':
case 'couldnt-decode-pdf':
OS_crawlLog($data['error'], 1);
if ($referer == '<orphan>') $_RDATA['sp_status']['Blocked']++;
break;
// Orphans in this case are counted as 'Not Found' instead of 'Blocked'
case '400':
OS_crawlLog($data['error'].': '.$url.' (Referrer was: '.$referer.')', 2);
if ($referer == '<orphan>') $_RDATA['sp_status']['Not Found']++;
break;
// Redirects: the target URL is appended to this page's link list
// NOTE(review): the <meta> refresh handler sets $data['info']['nofollow']
// to true for 'redirect-meta', and links are only queued when 'nofollow'
// is falsy, so the URL appended here may never be queued in that case -
// confirm this is intended
case 'redirect-meta':
case 'redirect-location':
OS_crawlLog($data['error'].': '.$url.' (Referrer was: '.$referer.')', 2);
OS_crawlLog('Page was removed in favour of redirected URL', 0);
$data['links'][] = $data['info']['redirect_url'];
break;
// A canonical <link> element pointed at a different URL
case 'non-canonical':
OS_crawlLog('Not indexed due to canonical <link> element: '.$data['info']['canonical'], 1);
OS_crawlLog('Referrer was: '.$referer, 0);
break;
// Any other noindex reason is logged by name
default:
OS_crawlLog('Not indexed due to noindex rule \''.$data['info']['noindex'].'\': '.$url.' (Referrer was: '.$referer.')', 2);
if ($referer == '<orphan>') $_RDATA['sp_status']['Blocked']++;
break;
}
// Finish the crawl once the number of stored pages hits the configured cap
$storedCount = count($_RDATA['sp_store']);
if ($storedCount >= $_ODATA['sp_limit_store']) {
  OS_crawlLog('Maximum number of crawled pages reached ('.$_ODATA['sp_limit_store'].')', 1);
  $_RDATA['sp_complete'] = true;
  break;
}

// Unless this page is marked 'nofollow', resolve each of its links and
// add the ones we haven't seen yet to the queue
if (!$data['info']['nofollow']) {
  foreach ($data['links'] as $link) {
    $link = OS_formatURL($link, $data['base']);

    // Skip links that have already been crawled...
    if (in_array($link, $_RDATA['sp_links'], true)) continue;

    // ... or that are already waiting in the queue
    $isQueued = false;
    foreach ($_RDATA['sp_queue'] as $queue) {
      if ($link == $queue[0]) {
        $isQueued = true;
        break;
      }
    }
    if ($isQueued) continue;

    // OS_filterURL() returns the name of the rule that rejects the link,
    // or a falsy value when the link passes the user filters
    $nx = OS_filterURL($link, $data['base']);
    if ($nx) {
      OS_crawlLog('Link ignored due to noindex rule \''.$nx.'\': '.$link, 0);
    } else {
      $_RDATA['sp_queue'][] = array($link, $depth + 1, $url);
    }
  }
}

// If we've completed the queue, check for orphans
if (count($_RDATA['sp_queue']) == 0) {

  // Whatever is left of the previous URL list after removing every link
  // scanned so far is an orphan
  $_RDATA['sp_exist'] = array_diff($_RDATA['sp_exist'], $_RDATA['sp_links']);
  $orphanCount = count($_RDATA['sp_exist']);

  // If we have leftover links, and we aren't autodeleting them
  if ($orphanCount && !$_ODATA['sp_autodelete']) {
    OS_crawlLog('Adding '.$orphanCount.' orphan(s) to queue...', 1);
    foreach ($_RDATA['sp_exist'] as $key => $link) {
      $nx = OS_filterURL($link, $data['base']);

      // Orphans that pass the user filters are queued at depth 0
      if (!$nx) {
        $_RDATA['sp_queue'][] = array($link, 0, '<orphan>');
        continue;
      }

      // Filtered orphans are counted as blocked and dropped from the list
      OS_crawlLog('Orphan URL ignored due to noindex rule \''.$nx.'\': '.$link, 0);
      $_RDATA['sp_status']['Blocked']++;
      unset($_RDATA['sp_exist'][$key]);
    }

  // Else if we stored some pages, we're done
  } else if (count($_RDATA['sp_store'])) {
    $_RDATA['sp_complete'] = true;

  // No pages were stored
  } else {
    OS_crawlLog('No pages could be indexed; check your starting URL(s)', 2);
  }
}

// Free memory, then pause between requests; 'sp_sleep' is multiplied by
// 1000 for usleep() and the total time slept is accumulated
gc_collect_cycles();
usleep($_ODATA['sp_sleep'] * 1000);
$_RDATA['sp_sleep'] += $_ODATA['sp_sleep'];
}
// ***** Write sitemap
// Only written after a completed crawl with a sitemap file configured.
// $_RDATA['sp_sitemap_file'] holds either the path to write to or one of
// the status strings 'does not exist' / 'not writable'
if ($_RDATA['sp_complete'] && $_ODATA['sp_sitemap_file']) {
  if ($_RDATA['sp_sitemap_file'] != 'does not exist') {
    if ($_RDATA['sp_sitemap_file'] != 'not writable') {

      // Build the sitemap XML one line at a time
      $sm = array('<?xml version="1.0" encoding="UTF-8"?>');
      $sm[] = '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">';
      foreach ($_RDATA['sp_sitemap'] as $sitemap) {
        $sm[] = ' <url>';
        foreach ($sitemap as $key => $value)
          // A priority of 0.5 is left out of the file
          if ($key != 'priority' || $value != 0.5)
            $sm[] = ' <'.$key.'>'.$value.'</'.$key.'>';
        $sm[] = ' </url>';
      }
      $sm[] = '</urlset>';
      $smXML = implode("\n", $sm);

      // Gzip-compressed sitemap
      if (preg_match('/\.xml\.gz$/', $_RDATA['sp_sitemap_file'])) {
        if (function_exists('gzopen')) {
          // Only report success when the file could be opened and every
          // byte was written; gzwrite() returns the uncompressed count
          $smf = gzopen($_RDATA['sp_sitemap_file'], 'w');
          $written = ($smf) ? gzwrite($smf, $smXML) : false;
          if ($smf) gzclose($smf);
          if ($written === strlen($smXML)) {
            OS_crawlLog('Sitemap written successfully: '.$_ODATA['sp_sitemap_file'], 1);
          } else OS_crawlLog('Could not write sitemap file \''.$_ODATA['sp_sitemap_file'].'\'', 2);
        } else OS_crawlLog('Could not write sitemap; PHP gzip functions are not enabled', 2);

      // Plain XML sitemap
      } else if (preg_match('/\.xml$/', $_RDATA['sp_sitemap_file'])) {
        $smf = fopen($_RDATA['sp_sitemap_file'], 'w');
        $written = ($smf) ? fwrite($smf, $smXML) : false;
        if ($smf) fclose($smf);
        if ($written === strlen($smXML)) {
          OS_crawlLog('Sitemap written successfully: '.$_ODATA['sp_sitemap_file'], 1);
        } else OS_crawlLog('Could not write sitemap file \''.$_ODATA['sp_sitemap_file'].'\'', 2);

      } else OS_crawlLog('Sitemap filename ('.$_ODATA['sp_sitemap_file'].') must have extension \'.xml\' or \'.xml.gz\'', 2);
    } else OS_crawlLog('Sitemap file \''.$_ODATA['sp_sitemap_file'].'\' is not writable', 2);
  } else OS_crawlLog('Sitemap file \''.$_ODATA['sp_sitemap_file'].'\' does not exist', 2);
} ?>