orcinus-search/orcinus/crawler.php
<?php /* ***** Orcinus Site Search - Web Crawling Engine *********** */
require __DIR__.'/config.php';
// Setting the $_RDATA['debug'] value to true will allow you to start
// the crawler just by visiting this file's URL using your web browser.
// It will output the log lines as well as any PHP errors that may
// occur. It will also report how much memory the script is using. Use
// this mode if your crawls are failing but the logs alone aren't
// enough to tell you why. DO NOT leave the crawler in debug mode in a
// production environment, or anyone can just run your crawler whenever
// they want!
$_RDATA['debug'] = false;
/**
* Log a notice (0), message (1) or error (2)
*
*/
function OS_crawlLog($text, $level = 0) {
global $_RDATA;
switch ($level) {
case 1: $prefix = ''; break;
case 2: $prefix = '[ERROR] '; break;
default: $prefix = ' -> ';
}
fwrite($_RDATA['sp_log'], $prefix.$text."\n");
if ($_RDATA['debug'] ||
($_SERVER['REQUEST_METHOD'] == 'CLI' &&
$level >= $_RDATA['sp_log_clilevel'])) {
echo $prefix.$text."\n";
}
}
/**
* Final prep to store content in UTF-8 format in the database
*
*/
function OS_cleanTextUTF8(&$_, $charset, $entity = false) {
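// Converts $_ to UTF-8, optionally decodes HTML entities using the
// given entity table, normalizes punctuation and exotic Unicode
// whitespace via the config maps, then collapses all whitespace runs
// into single spaces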
global $_RDATA;
if (!trim($charset)) $charset = 'ISO-8859-1';
$_ = mb_convert_encoding($_, 'UTF-8', $charset);
if ($entity)
$_ = html_entity_decode($_, $entity | ENT_SUBSTITUTE, 'UTF-8');
$_ = strtr($_, $_RDATA['sp_punct']);
$_ = strtr($_, $_RDATA['sp_whitespace']);
$_ = preg_replace(array('/\s/', '/ {2,}/'), ' ', trim($_));
}
/**
* Format a full or partial URL into a full URL according to a base URL
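* e.g. OS_formatURL('../a.html', 'https://example.com/x/y/z.html')
* returns 'https://example.com/x/a.html'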
*
*/
function OS_formatURL($_, $base) {
$_ = str_replace(' ', '%20', $_);
$_ = preg_replace('/#.*$/', '', filter_var($_, FILTER_SANITIZE_URL));
$_ = str_replace('%20', ' ', $_);
$dirbase = preg_replace('/(?<!:\/)\/[^\/]*$/', '', $base).'/';
$pdb = parse_url($dirbase);
$port = (!empty($pdb['port'])) ? ':'.$pdb['port'] : '';
if (substr($_, 0, 3) == '../') {
$p = preg_replace('/\/[^\/]*\/$/', '/', $pdb['path']);
$_ = $pdb['scheme'].'://'.$pdb['host'].$port.$p.substr($_, 3);
}
if (substr($_, 0, 2) == './') {
$_ = $dirbase.substr($_, 2);
} else if (substr($_, 0, 2) == '//') {
$_ = $pdb['scheme'].':'.$_;
} else if (substr($_, 0, 1) == '/') {
$_ = $pdb['scheme'].'://'.$pdb['host'].$port.$_;
} else if (substr($_, 0, 1) == '?') {
$_ = preg_replace('/\?.*$/', '', $base).$_;
} else if (!preg_match('/^https?:\/\//', $_)) $_ = $dirbase.$_;
$_ = preg_replace(array('/\/[^\/]*\/\.\.\//', '/\/\.\//'), '/', $_);
if ($_ == $pdb['scheme'].'://'.$pdb['host'] ||
$_ == $pdb['scheme'].'://'.$pdb['host'].$port) $_ .= '/';
return trim($_);
}
/**
* Filter a URL by the crawling rules provided by the user
* - Sets an $_RDATA['sp_filter'] array key + value and returns the
* REASON why the URL was rejected, NOT a 'filtered' URL
*
*/
function OS_filterURL($_, $base) {
global $_RDATA;
if (!preg_match('/^https?:\/\//', $_))
$_ = OS_formatURL($_, $base);
if (!empty($_RDATA['sp_filter'][$_]))
return $_RDATA['sp_filter'][$_];
$_RDATA['sp_filter'][$_] = '';
// Accepted hostnames
$plink = parse_url($_);
if (!in_array($plink['host'], $_RDATA['sp_hostnames'], true))
return $_RDATA['sp_filter'][$_] = 'disallowed-host';
// Require URL matches
if (count($_RDATA['sp_require_url'])) {
$foundRequired = false;
foreach ($_RDATA['sp_require_url'] as $requireURL) {
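// A leading '*' marks the rest of the rule as a regular expression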
if ($requireURL[0] == '*') {
if (preg_match('/'.str_replace('/', '\/', substr($requireURL, 1)).'/', $_))
$foundRequired = true;
} else if (strpos($_, $requireURL) !== false)
$foundRequired = true;
}
if (!$foundRequired)
return $_RDATA['sp_filter'][$_] = 'require-url';
}
// Ignore URL matches
foreach ($_RDATA['sp_ignore_url'] as $ignoreURL) {
if ($ignoreURL[0] == '*') {
if (preg_match('/'.str_replace('/', '\/', substr($ignoreURL, 1)).'/', $_))
return $_RDATA['sp_filter'][$_] = 'ignore-url';
} else if (strpos($_, $ignoreURL) !== false)
return $_RDATA['sp_filter'][$_] = 'ignore-url';
}
// Ignore extensions
if (preg_match('/\.('.$_RDATA['sp_ignore_ext_regexp'].')$/i', $_))
return $_RDATA['sp_filter'][$_] = 'ignore-extension';
// robots.txt rules
if (!empty($_RDATA['sp_robots'][$plink['host']]))
foreach ($_RDATA['sp_robots'][$plink['host']] as $disallowURL)
if (strpos($_, $disallowURL) === 0)
return $_RDATA['sp_filter'][$_] = 'robots-txt';
return $_RDATA['sp_filter'][$_];
}
/**
* Fetch a URL using cURL, return an array of useful information
*
*/
function OS_fetchURL($url, $referer = '') {
global $_cURL, $_RDATA;
$_RDATA['sp_robots_header'] = 0;
$_RDATA['sp_self_reference'] = 0;
curl_setopt($_cURL, CURLOPT_URL, str_replace(' ', '%20', $url));
curl_setopt($_cURL, CURLOPT_REFERER, $referer);
$_ = array(
'url' => parse_url($url),
'body' => curl_exec($_cURL),
'base' => $url,
'info' => curl_getinfo($_cURL),
'error' => curl_error($_cURL),
'errno' => curl_errno($_cURL),
'links' => array(),
'title' => '',
'content' => '',
'keywords' => '',
'weighted' => '',
'description' => ''
);
$_['info']['url'] = $url;
$_['info']['noindex'] = '';
$_['info']['nofollow'] = false;
// Process any cURL errors
switch ($_['errno']) {
case 0: // Success
case 42: // Aborted by callback
if ($_['info']['http_code'] >= 400) {
$_['errno'] = 22;
$_['error'] = $_['info']['http_code'].' error';
$_['info']['noindex'] = '400';
} else if ($_['info']['redirect_url']) {
$_['errno'] = 300;
$_['error'] = 'Redirected by HTTP header to: '.$_['info']['redirect_url'];
$_['info']['noindex'] = 'redirect-location';
} else if ($_RDATA['sp_robots_header']) {
$_['errno'] = 777;
$_['error'] = 'Blocked by \'X-Robots-Tag\' HTTP header';
$_['info']['noindex'] = 'robots-http';
} else if ($_RDATA['sp_self_reference']) {
$_['errno'] = 888;
$_['error'] = 'Refused to index myself';
$_['info']['noindex'] = 'self-reference';
} else if ($_['errno'] == 42) {
$_['errno'] = 999;
$_['error'] = 'Max filesize exceeded';
$_['info']['noindex'] = 'too-large';
}
break;
case 28: // Timeout
$_['error'] = 'Timed out waiting for data';
$_['info']['noindex'] = 'timeout';
break;
case 55: // Network send error
case 56: // Network receive error
$_['error'] = 'Network error retrieving data';
$_['info']['noindex'] = 'network-error';
break;
case 6: // Could not resolve host
case 7: // Could not connect to host
$_['error'] = 'Couldn\'t connect to host: '.$_['url']['host'];
$_['info']['noindex'] = 'couldnt-connect';
break;
default: // Uncaught cURL error
OS_crawlLog('Uncaught cURL error: '.$url, 2);
OS_crawlLog($_['errno'], 1);
OS_crawlLog($_['error'], 1);
OS_crawlLog(print_r($_['info'], true), 1);
throw new Exception('Uncaught cURL error');
}
return $_;
}
/**
* Shutdown function to provide cleanup before exit
*
*/
function OS_crawlCleanUp() {
global $_DDATA, $_ODATA, $_RDATA, $_cURL, $_MAIL;
// If the crawl has already been canceled, don't bother
if (!OS_getValue('sp_crawling')) return;
$error = error_get_last();
if (!is_null($error) && $error['type'] == E_ERROR) {
OS_crawlLog($error['message'], 2);
OS_crawlLog('File: \''.$error['file'].'\' at line number: '.$error['line'], 0);
$_RDATA['sp_complete'] = false;
}
// Save or display cookies?
$cookies = curl_getinfo($_cURL, CURLINFO_COOKIELIST);
// var_dump($cookies);
curl_close($_cURL);
// If crawl completed successfully
if ($_RDATA['sp_complete']) {
OS_crawlLog('Cleaning up database tables...', 1);
// Add a natural sort order value to each entry
natcasesort($_RDATA['sp_store']);
$_RDATA['sp_store'] = array_values($_RDATA['sp_store']);
$url_sort = $_DDATA['pdo']->prepare(
'UPDATE `'.$_DDATA['tbprefix'].'crawltemp`
SET `url_sort`=:url_sort WHERE `url`=:url;'
);
foreach ($_RDATA['sp_store'] as $key => $stored_url) {
$url_sort->execute(array(
'url_sort' => $key,
'url' => $stored_url
));
$err = $url_sort->errorInfo();
if ($err[0] != '00000') {
OS_crawlLog('Error sorting the search database', 1);
OS_crawlLog($err[2], 0);
break;
}
}
// Truncate the existing search database
$truncate = $_DDATA['pdo']->query(
'TRUNCATE `'.$_DDATA['tbprefix'].'crawldata`;'
);
$err = $truncate->errorInfo();
if ($err[0] != '00000') {
OS_crawlLog('Could not truncate the search database', 1);
OS_crawlLog($err[2], 0);
// Last chance to bail out before we make actual changes
$_RDATA['sp_complete'] = false;
}
}
// If crawl completed successfully AND we truncated the old table
if ($_RDATA['sp_complete']) {
OS_setValue('sp_time_end', time());
OS_setValue('sp_time_last', $_ODATA['sp_time_end'] - $_ODATA['sp_time_start']);
// Select all rows from the temp table into the existing search table
$insert = $_DDATA['pdo']->query(
'INSERT INTO `'.$_DDATA['tbprefix'].'crawldata`
SELECT * FROM `'.$_DDATA['tbprefix'].'crawltemp`;'
);
$err = $insert->errorInfo();
if ($err[0] == '00000') {
$tableinfo = $_DDATA['pdo']->query(
'SHOW TABLE STATUS LIKE \''.$_DDATA['tbprefix'].'crawldata\';'
);
$err = $tableinfo->errorInfo();
if ($err[0] == '00000') {
$tableinfo = $tableinfo->fetchAll();
OS_setValue('sp_data_stored', $tableinfo[0]['Data_length']);
} else OS_crawlLog('Could not read crawl table status', 1);
// Purge the search result cache
if ($_ODATA['s_limit_cache']) {
$purge = $_DDATA['pdo']->query(
'UPDATE `'.$_DDATA['tbprefix'].'query` SET `cache`=\'\';'
);
$err = $purge->errorInfo();
if ($err[0] != '00000')
OS_crawlLog('Could not purge search result cache', 1);
}
// Optimize the query log table
$optimize = $_DDATA['pdo']->query(
'OPTIMIZE TABLE `'.$_DDATA['tbprefix'].'query`;'
);
OS_setValue('sp_pages_stored', count($_RDATA['sp_store']));
OS_setValue('sp_domains', $_RDATA['sp_domains']);
OS_setValue('sp_time_end_success', $_ODATA['sp_time_end']);
OS_crawlLog('***** Crawl completed in '.$_ODATA['sp_time_last'].'s *****', 1);
OS_crawlLog('Total data transferred: '.OS_readSize($_ODATA['sp_data_transferred']), 1);
OS_crawlLog('Average transfer speed: '.OS_readSize(round($_ODATA['sp_data_transferred'] / $_ODATA['sp_time_last'])).'/s', 1);
if ($_RDATA['sp_sleep'])
OS_crawlLog('Time spent sleeping: '.(round($_RDATA['sp_sleep'] / 10) / 100).'s', 1);
OS_crawlLog('Time taken by cURL: '.(round($_RDATA['sp_time_curl'] * 100) / 100).'s', 1);
OS_crawlLog($_ODATA['sp_progress'][0].' page'.(($_ODATA['sp_progress'][0] == 1) ? '' : 's').' crawled', 1);
OS_crawlLog($_ODATA['sp_pages_stored'].' page'.(($_ODATA['sp_pages_stored'] == 1) ? '' : 's').' stored', 1);
if ($_RDATA['sp_status']['New'])
OS_crawlLog($_RDATA['sp_status']['New'].' new '.(($_RDATA['sp_status']['New'] == 1) ? 'page' : 'pages').' found', 0);
if ($_RDATA['sp_status']['Updated'])
OS_crawlLog($_RDATA['sp_status']['Updated'].' '.(($_RDATA['sp_status']['Updated'] == 1) ? 'page' : 'pages').' updated', 0);
if ($_RDATA['sp_status']['Blocked'])
OS_crawlLog($_RDATA['sp_status']['Blocked'].' '.(($_RDATA['sp_status']['Blocked'] == 1) ? 'page' : 'pages').' blocked', 0);
if ($_RDATA['sp_status']['Not Found'])
OS_crawlLog($_RDATA['sp_status']['Not Found'].' '.(($_RDATA['sp_status']['Not Found'] == 1) ? 'page' : 'pages').' not found', 0);
if ($_RDATA['sp_status']['Orphan'])
OS_crawlLog($_RDATA['sp_status']['Orphan'].' orphaned '.(($_RDATA['sp_status']['Orphan'] == 1) ? 'page' : 'pages'), 0);
if ($_ODATA['sp_autodelete'])
OS_crawlLog('Orphaned pages were auto-deleted', 1);
// Send success email to the admin(s)
if ($_MAIL && count($_MAIL->getAllRecipientAddresses()) && $_ODATA['sp_email_success']) {
$_MAIL->Subject = 'Orcinus Site Search Crawler: Crawl succeeded';
$_MAIL->Body = implode(" \r\n", preg_grep('/^[\[\*]/', explode("\n", file_get_contents($_ODATA['sp_log']))));
if (!$_MAIL->Send()) OS_crawlLog('Could not send notification email', 2);
}
$cliMessage = 'Crawl completed successfully';
$jsonMessage = json_encode(array(
'status' => 'Success',
'message' => $cliMessage
), JSON_INVALID_UTF8_IGNORE);
// We truncated the search table but FAILED to populate it!
// This is a serious error that disables searching until the
// crawler is run again!
} else {
OS_crawlLog('Could not populate the search table', 2);
OS_crawlLog($err[2], 0);
OS_crawlLog('***** Crawl failed; runtime '.$_ODATA['sp_time_last'].'s *****', 1);
OS_crawlLog('Search table was cleared, but could not be repopulated!', 1);
OS_crawlLog('The crawler MUST be run again to fix this issue!', 1);
// Send failure email to the admin(s)
if ($_MAIL && count($_MAIL->getAllRecipientAddresses()) && $_ODATA['sp_email_failure']) {
$_MAIL->Subject = 'Orcinus Site Search Crawler: Catastrophic failure!';
$_MAIL->Body = implode(" \r\n", preg_grep('/^[\[\*\w\d]/', explode("\n", file_get_contents($_ODATA['sp_log']))));
if (!$_MAIL->Send()) OS_crawlLog('Could not send notification email', 2);
}
$cliMessage = 'Could not populate search table; search table is currently empty!';
$jsonMessage = json_encode(array(
'status' => 'Error',
'message' => $cliMessage
), JSON_INVALID_UTF8_IGNORE);
}
// Else the crawl failed
} else {
OS_setValue('sp_time_last', $_ODATA['sp_time_end'] - $_ODATA['sp_time_start']);
OS_crawlLog('***** Crawl failed; runtime '.$_ODATA['sp_time_last'].'s *****', 1);
OS_crawlLog('Total data transferred: '.OS_readSize($_ODATA['sp_data_transferred']), 1);
OS_crawlLog('Search table was NOT updated', 1);
if ($_ODATA['sp_sitemap_file'])
OS_crawlLog('Sitemap was NOT updated', 1);
// Send failure email to the admin(s)
if ($_MAIL && count($_MAIL->getAllRecipientAddresses()) && $_ODATA['sp_email_failure'] && !$_ODATA['sp_cancel']) {
$_MAIL->Subject = 'Orcinus Site Search Crawler: Crawl failed';
$_MAIL->Body = implode(" \r\n", preg_grep('/^[\[\*\w\d]/', explode("\n", file_get_contents($_ODATA['sp_log']))));
if (!$_MAIL->Send()) OS_crawlLog('Could not send notification email', 2);
}
$cliMessage = 'Crawl failed; see the log for details';
$jsonMessage = json_encode(array(
'status' => 'Error',
'message' => $cliMessage
), JSON_INVALID_UTF8_IGNORE);
}
// Delete the temp search table
$drop = $_DDATA['pdo']->query(
'DROP TABLE IF EXISTS `'.$_DDATA['tbprefix'].'crawltemp`;'
);
$err = $drop->errorInfo();
if ($err[0] != '00000') {
OS_crawlLog('Could not delete the temporary search table', 1);
OS_crawlLog($err[2], 0);
}
// Store the log file to the config database
OS_setValue('sp_log', file_get_contents($_ODATA['sp_log']));
fclose($_RDATA['sp_log']);
// Unset the crawling flag
OS_setValue('sp_crawling', 0);
if ($_SERVER['REQUEST_METHOD'] != 'CLI') {
if (!$_RDATA['debug'])
header('Content-type: application/json; charset='.strtolower($_ODATA['s_charset']));
die($jsonMessage);
} else die($cliMessage."\n");
}
// ***** Accept incoming commands by REQUEST_METHOD
switch ($_SERVER['REQUEST_METHOD']) {
/* ***** Handle POST Requests ************************************ */
case 'POST':
// JSON POST request
// These are usually sent by javascript fetch()
if (strpos(trim($_SERVER['CONTENT_TYPE']), 'application/json') === 0) {
$postBody = file_get_contents('php://input');
$_POST = json_decode($postBody, false);
$response = array();
if (empty($_POST->action)) $_POST->action = '';
switch ($_POST->action) {
case 'crawl':
if (!empty($_POST->sp_key) &&
$_ODATA['sp_key'] &&
$_POST->sp_key == $_ODATA['sp_key']) {
if (OS_getValue('sp_crawling')) {
$response = array(
'status' => 'Error',
'message' => 'Crawler is already running; current progress: '.$_ODATA['sp_progress'][0].'/'.$_ODATA['sp_progress'][1]
);
} else {
// Go crawl!
OS_setValue('sp_crawling', 1);
OS_setValue('sp_key', '');
}
} else {
$response = array(
'status' => 'Error',
'message' => 'Incorrect key to initiate crawler'
);
}
break;
case 'progress':
$lines = array();
if (!empty($_POST->log)) {
if (OS_getValue('sp_crawling')) {
if (strpos($_ODATA['sp_log'], "\n") === false && file_exists($_ODATA['sp_log']))
$lines = file($_ODATA['sp_log'], FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
} else $lines = explode("\n", $_ODATA['sp_log']);
if (empty($_POST->grep)) $_POST->grep = '';
switch ($_POST->grep) {
case 'all': break;
case 'errors': $lines = preg_grep('/^[\[\*]/', $lines); break;
default: $lines = preg_grep('/^[\[\*\w\d]/', $lines);
}
}
if ($_ODATA['sp_crawling']) $lines = array_slice($lines, -15);
$response = array(
'status' => ($_ODATA['sp_crawling']) ? 'Crawling' : 'Complete',
'progress' => $_ODATA['sp_progress'],
'data_transferred' => $_ODATA['sp_data_transferred'],
'time_crawl' => time() - $_ODATA['sp_time_start'],
'time_start' => $_ODATA['sp_time_start'],
'time_end' => $_ODATA['sp_time_end'],
'timeout_crawl' => $_ODATA['sp_timeout_crawl'],
'tail' => trim(implode("\n", $lines))
);
break;
case 'cancel':
if (OS_getValue('sp_crawling')) {
// IF the crawler 'time_start' is more than 'timeout_crawl'
// seconds ago, or the 'force' token is set, the crawler is
// probably stuck. Unstick it.
if (empty($_POST->force)) $_POST->force = '';
if ($_POST->force || time() - $_ODATA['sp_time_start'] > $_ODATA['sp_timeout_crawl']) {
OS_setValue('sp_crawling', 0);
if (empty($_POST->reason))
$_POST->reason = 'The crawler halted unexpectedly';
if (strpos($_ODATA['sp_log'], "\n") === false && file_exists($_ODATA['sp_log'])) {
$log = file_get_contents($_ODATA['sp_log']);
OS_setValue('sp_log', $log."\n".'[ERROR] '.$_POST->reason);
} else OS_setValue('sp_log', '[ERROR] '.$_POST->reason);
OS_setValue('sp_time_last', $_ODATA['sp_time_end'] - $_ODATA['sp_time_start']);
// Send failure email to the admin(s)
if ($_MAIL && count($_MAIL->getAllRecipientAddresses()) && $_ODATA['sp_email_failure']) {
$_MAIL->Subject = 'Orcinus Site Search Crawler: Crawler halted unexpectedly';
$_MAIL->Body = implode(" \r\n", preg_grep('/^[\[\*\w\d]/', explode("\n", $_ODATA['sp_log'])));
if (!$_MAIL->Send()) OS_setValue('sp_log', $_ODATA['sp_log']."\n".'[ERROR] Could not send notification email');
}
}
OS_setValue('sp_cancel', 1);
$response = array(
'status' => 'Success',
'message' => 'Cancel flag was set',
'crawl_time' => time() - $_ODATA['sp_time_start']
);
} else {
$response = array(
'status' => 'Error',
'message' => 'Crawler is not currently running'
);
}
break;
default:
$response = array(
'status' => 'Error',
'message' => 'Unrecognized command'
);
}
// If we have a response to give, display it and exit
if ($response) {
header('Content-type: application/json; charset='.strtolower($_ODATA['s_charset']));
die(json_encode($response, JSON_INVALID_UTF8_IGNORE));
}
// Don't do anything for normal POST request
// These are usually sent by <form> HTML elements
} else {
header('Content-type: text/plain; charset='.strtolower($_ODATA['s_charset']));
die($_ODATA['sp_useragent']);
}
break;
// Allow CLI requests through
case '':
if (!empty($_SERVER['argv'][0]) && $_SERVER['argv'][0] == $_SERVER['PHP_SELF']) {
$_SERVER['REQUEST_METHOD'] = 'CLI';
if (!OS_getValue('sp_crawling')) {
// Set the logging level, if specified
if (!empty($_SERVER['argv'][1]) && preg_match('/^-log=([012])$/', $_SERVER['argv'][1], $match)) {
$_RDATA['sp_log_clilevel'] = (int)$match[1];
} else $_RDATA['sp_log_clilevel'] = 2;
// Start a crawl
OS_setValue('sp_crawling', 1);
} else die('Crawler is already running; exiting...');
} else die($_ODATA['sp_useragent']);
break;
// Don't do anything for GET requests, unless in debug mode
case 'GET':
header('Content-type: text/plain; charset='.strtolower($_ODATA['s_charset']));
if ($_RDATA['debug']) {
// If we are in debug mode, but the crawler is already running, exit
if (OS_getValue('sp_crawling'))
die('Crawler is already running; exiting...');
// Start a crawl
OS_setValue('sp_crawling', 1);
} else die($_ODATA['sp_useragent']);
break;
// Exit for all other request types
default:
header('Content-type: text/plain; charset='.strtolower($_ODATA['s_charset']));
die($_ODATA['sp_useragent']);
}
/* ***** Begin Crawl Execution ************************************* */
register_shutdown_function('OS_crawlCleanUp');
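// Keep crawling even if the requesting client disconnects, and allow
// ~10% headroom on PHP's own time limit beyond the crawl timeout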
ignore_user_abort(true);
@set_time_limit((int)ceil($_ODATA['sp_timeout_crawl'] * 1.1));
libxml_use_internal_errors(true);
if (function_exists('apache_setenv'))
apache_setenv('no-gzip', '1');
OS_setValue('sp_cancel', 0);
OS_setValue('sp_time_start', time());
OS_setValue('sp_progress', array(0, 1, false));
OS_setValue('sp_pages_stored', 0);
OS_setValue('sp_data_transferred', 0);
OS_setValue('sp_data_stored', 0);
OS_setValue('sp_time_last', 0);
$_RDATA['sp_log'] = tmpfile();
OS_setValue('sp_log', stream_get_meta_data($_RDATA['sp_log'])['uri']);
OS_crawlLog('***** Crawl started: '.date('r').' *****', 1);
// ***** Prepare runtime data
$_RDATA['sp_starting'] = array_filter(array_map('trim', explode("\n", $_ODATA['sp_starting'])));
$_RDATA['sp_hostnames'] = array();
$_RDATA['sp_ignore_url'] = array_filter(array_map('trim', explode("\n", $_ODATA['sp_ignore_url'])));
$_RDATA['sp_ignore_css'] = array_filter(explode(' ', $_ODATA['sp_ignore_css']));
$_RDATA['sp_title_strip'] = array_filter(array_map('trim', explode("\n", $_ODATA['sp_title_strip'])));
$_RDATA['s_weight_css'] = array_filter(explode(' ', $_ODATA['s_weight_css']));
$_RDATA['sp_require_url'] = array_filter(array_map('trim', explode("\n", $_ODATA['sp_require_url'])));
$_RDATA['sp_ignore_ext_regexp'] = implode('|', array_map('preg_quote', array_filter(explode(' ', $_ODATA['sp_ignore_ext']))));
$_RDATA['sp_robots_header'] = 0;
$_RDATA['sp_complete'] = false;
$_RDATA['sp_links'] = array();
$_RDATA['sp_store'] = array();
$_RDATA['sp_domains'] = array();
$_RDATA['sp_sitemap'] = array();
$_RDATA['sp_robots'] = array();
$_RDATA['sp_status'] = array('Orphan' => 0, 'Blocked' => 0, 'Not Found' => 0, 'Updated' => 0, 'New' => 0);
$_RDATA['sp_filter'] = array();
$_RDATA['sp_prev_dls'] = 0;
$_RDATA['sp_time_curl'] = 0;
$_RDATA['sp_sleep'] = 0;
$_RDATA['sp_sha1'] = array();
$_RDATA['sp_resumed'] = false;
$_RDATA['sp_whitespace'] = array(
"\u{0009}" => ' ', "\u{000A}" => "\n", "\u{000B}" => "\n", "\u{000C}" => "\n",
"\u{000D}" => "\n", "\u{0085}" => "\n", "\u{00A0}" => ' ', "\u{1680}" => ' ',
"\u{2000}" => ' ', "\u{2001}" => ' ', "\u{2002}" => ' ', "\u{2003}" => ' ',
"\u{2004}" => ' ', "\u{2005}" => ' ', "\u{2006}" => ' ', "\u{2007}" => ' ',
"\u{2008}" => ' ', "\u{2009}" => ' ', "\u{200A}" => ' ', "\u{200B}" => ' ',
"\u{200C}" => ' ', "\u{200D}" => '', "\u{2028}" => "\n", "\u{2029}" => "\n",
"\u{202F}" => ' ', "\u{205F}" => ' ', "\u{2060}" => '', "\u{3000}" => ' ',
"\u{FEFF}" => ' ', "\u{FFFD}" => ''
);
// ***** Load PDF parser
if (!class_exists('\Smalot\PdfParser\Parser'))
if (file_exists(__DIR__.'/pdfparser/alt_autoload.php-dist'))
include __DIR__.'/pdfparser/alt_autoload.php-dist';
if (class_exists('\Smalot\PdfParser\Parser')) {
$config = new \Smalot\PdfParser\Config();
$config->setRetainImageContent(false);
$config->setDecodeMemoryLimit(16777216);
$_PDF = new \Smalot\PdfParser\Parser([], $config);
} else {
OS_crawlLog('Could not include \'PDFParser\'; PDFs will not be indexed', 1);
$_PDF = false;
}
// ***** Check for PHPMailer
if (!$_MAIL) {
OS_crawlLog('Could not include \'PHPMailer\'; Crawler cannot send mail', 1);
} else if (!count($_MAIL->getAllRecipientAddresses()))
OS_crawlLog('No admin emails specified; Crawler will not send mail', 1);
// ***** Initialize the cURL connection
$_cURL = OS_getConnection();
if ($_cURL) {
// Customize this cURL connection
if ($_ODATA['sp_cookies'])
curl_setopt($_cURL, CURLOPT_COOKIEFILE, '');
curl_setopt($_cURL, CURLOPT_HEADERFUNCTION, function($_cURL, $line) {
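// Watch response headers for an 'X-Robots-Tag: noindex' directive and
// for this installation's own signature header; the callback must
// return the number of bytes handled or cURL aborts the transfer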
global $_RDATA;
if (preg_match('/^X-Robots-Tag:\s*(noindex|none)/i', $line))
$_RDATA['sp_robots_header'] = 1;
if (trim($line) == $_RDATA['x_generated_by'])
$_RDATA['sp_self_reference'] = 1;
return strlen($line);
});
curl_setopt($_cURL, CURLOPT_NOPROGRESS, false);
curl_setopt($_cURL, CURLOPT_PROGRESSFUNCTION,
function($_cURL, $dls, $dl, $uls, $ul) {
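// cURL calls this repeatedly during a transfer; returning any non-zero
// value aborts it with CURLE_ABORTED_BY_CALLBACK (errno 42), which
// OS_fetchURL() then maps to a specific 'noindex' reason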
global $_ODATA, $_RDATA;
if ($_RDATA['sp_robots_header']) return 1;
if ($_RDATA['sp_self_reference']) return 1;
// Prevent comparing this value until a Content-length header has
// been received by the cURL connection
if ($dls != $_RDATA['sp_prev_dls']) {
$_RDATA['sp_prev_dls'] = $dls;
if ($dls > $_ODATA['sp_limit_filesize'] * 1024) return 1;
}
if ($dl > $_ODATA['sp_limit_filesize'] * 1024) return 1;
$i = curl_getinfo($_cURL);
if ($i['redirect_url']) return 1;
if ($i['http_code'] && $i['http_code'] >= 400) return 1;
return $_RDATA['sp_robots_header'];
}
);
} else OS_crawlLog('cURL functions are not enabled; cannot perform crawl', 2);
// ***** Pre-fill queue with starting URL(s) at depth 0, blank referer
$_RDATA['sp_queue'] = array();
foreach ($_RDATA['sp_starting'] as $starting) {
$starting = OS_formatURL($starting, $_ODATA['admin_install_domain'].'/');
$_RDATA['sp_queue'][] = array($starting, 0, '');
// Add starting URLs to required URLs so the crawler cannot travel
// into parent directories
$_RDATA['sp_require_url'][] = preg_replace('/\/[^\/]*$/', '/', $starting);
$host = parse_url($starting)['host'];
if (!in_array($host, $_RDATA['sp_hostnames'], true))
$_RDATA['sp_hostnames'][] = $host;
}
// ***** List of previously crawled links from the database
$_RDATA['sp_exist'] = array();
$_RDATA['sp_lastmod'] = array();
$crawldata = $_DDATA['pdo']->query(
'SELECT `url`, `content_checksum`, `last_modified`
FROM `'.$_DDATA['tbprefix'].'crawldata`'
);
$err = $crawldata->errorInfo();
if ($err[0] == '00000') {
foreach ($crawldata as $value) {
$_RDATA['sp_exist'][$value['content_checksum']] = $value['url'];
$_RDATA['sp_lastmod'][$value['url']] = $value['last_modified'];
}
} else OS_crawlLog('Error getting list of previous URLs from crawldata table', 2);
// If the crawltemp table exists here, that means a crawl was
// interrupted without completing the shutdown function.
// Use the data from this partially completed crawl to resume it.
if (in_array($_DDATA['tbprefix'].'crawltemp', $_DDATA['tables'], true)) {
$select = $_DDATA['pdo']->query(
'SELECT `url`, `links`, `content_checksum` FROM `'.$_DDATA['tbprefix'].'crawltemp`;'
);
$err = $select->errorInfo();
if ($err[0] == '00000') {
OS_crawlLog('Previous crawl data exists; using it to resume crawling...', 1);
$select = $select->fetchAll();
OS_crawlLog('Found '.count($select).' previously crawled URLs', 1);
$_RDATA['sp_resumed'] = true;
// Run through every entry in the crawltemp table
foreach ($select as $row) {
// If an entry matches an existing URL in the queue (just
// starting URLs right now) then delete that queue entry
foreach ($_RDATA['sp_queue'] as $key => $queue)
if ($row['url'] == $queue[0])
unset($_RDATA['sp_queue'][$key]);
// Add it to the 'stored' and 'crawled links' lists
$_RDATA['sp_store'][] = $row['url'];
$_RDATA['sp_links'][] = $row['url'];
// Add the content hash to the tally
$_RDATA['sp_sha1'][$row['content_checksum']] = $row['url'];
// Rebuild the domains list
$prurl = parse_url($row['url']);
$domain = $prurl['scheme'].'://'.$prurl['host'];
if (!isset($_RDATA['sp_domains'][$domain])) {
$_RDATA['sp_domains'][$domain] = 1;
} else $_RDATA['sp_domains'][$domain]++;
// Add links from the entry to the queue
$row['links'] = json_decode($row['links'], true);
foreach ($row['links'] as $link) {
$link = OS_formatURL($link, $row['url']);
// ***** If this link hasn't been crawled yet
if (!in_array($link, $_RDATA['sp_links'], true)) {
// ... and if link hasn't been queued yet
foreach ($_RDATA['sp_queue'] as $queue)
if ($link == $queue[0]) continue 2;
// ... and if link passes our user filters, add the link to
// the queue
if (!OS_filterURL($link, $row['url']))
$_RDATA['sp_queue'][] = array($link, 0, $row['url']);
}
}
}
// We couldn't select any data from the crawltemp table so delete it
} else {
$drop = $_DDATA['pdo']->query(
'DROP TABLE IF EXISTS `'.$_DDATA['tbprefix'].'crawltemp`;'
);
$err = $drop->errorInfo();
if ($err[0] != '00000') {
// If we couldn't delete the interrupted crawldata table, this is
// a fatal error
OS_crawlLog('Could not delete previously interrupted crawl data; unable to crawl.', 2);
throw new Exception('Could not delete previously interrupted crawl data; unable to crawl.');
}
}
}
// Create a temp MySQL storage table using schema of the existing table
$create = $_DDATA['pdo']->query(
'CREATE TABLE IF NOT EXISTS `'.$_DDATA['tbprefix'].'crawltemp`
LIKE `'.$_DDATA['tbprefix'].'crawldata`;'
);
$err = $create->errorInfo();
if ($err[0] != '00000') {
// If we could not create the crawldata table, or an interrupted
// crawldata table doesn't exist, then this is a fatal error
OS_crawlLog('Unable to create or reuse existing crawl data table; unable to crawl.', 2);
throw new Exception('Unable to create or reuse existing crawl data table; unable to crawl.');
}
// Prepare SQL statements
$selectData = $_DDATA['pdo']->prepare(
'SELECT `url`, `category`, `links`, `content_checksum`, `last_modified`,
`flag_updated`, `flag_unlisted`, `priority`
FROM `'.$_DDATA['tbprefix'].'crawldata` WHERE `url`=:url;'
);
$updateURL = $_DDATA['pdo']->prepare(
'UPDATE `'.$_DDATA['tbprefix'].'crawltemp` SET
`url`=:url WHERE `content_checksum`=:content_checksum;'
);
$insertTemp = $_DDATA['pdo']->prepare(
'INSERT INTO `'.$_DDATA['tbprefix'].'crawltemp` SET
`url`=:url,
`url_sort`=0,
`title`=:title,
`description`=:description,
`keywords`=:keywords,
`category`=:category,
`weighted`=:weighted,
`links`=:links,
`content`=:content,
`content_mime`=:content_mime,
`content_charset`=:content_charset,
`content_checksum`=:content_checksum,
`status`=:status,
`flag_unlisted`=:flag_unlisted,
`flag_updated`=:flag_updated,
`last_modified`=:last_modified,
`priority`=:priority
;'
);
$insertNotModified = $_DDATA['pdo']->prepare(
'REPLACE INTO `'.$_DDATA['tbprefix'].'crawltemp` (
`url`, `url_sort`, `title`, `description`, `keywords`, `category`,
`weighted`, `links`, `content`, `content_mime`, `content_charset`,
`content_checksum`, `status`, `flag_unlisted`, `flag_updated`,
`last_modified`, `priority`
) SELECT
`url`, 0, `title`, `description`, `keywords`, `category`,
`weighted`, `links`, `content`, `content_mime`, `content_charset`,
`content_checksum`, :status, `flag_unlisted`, 0,
`last_modified`, `priority`
FROM `'.$_DDATA['tbprefix'].'crawldata` WHERE `url`=:url;'
);
// ***** Begin crawling URLs from the queue
while ($_cURL && count($_RDATA['sp_queue'])) {
// Check if we have run out of execution time
if ($_ODATA['sp_time_start'] + $_ODATA['sp_timeout_crawl'] <= time()) {
OS_crawlLog('Maximum script runtime ('.$_ODATA['sp_timeout_crawl'].'s) reached', 2);
break;
}
// Check if user has canceled the crawl
if (OS_getValue('sp_cancel')) {
OS_crawlLog('Crawl canceled manually by user', 2);
break;
}
// Check if we have exceeded the maximum number of crawled links
if (count($_RDATA['sp_links']) > $_ODATA['sp_limit_crawl']) {
OS_crawlLog('Maximum number of crawled pages exceeded', 2);
break;
}
// Retrieve next link to crawl from the queue
list($url, $depth, $referer) = array_shift($_RDATA['sp_queue']);
$_RDATA['sp_links'][] = $url;
// Check if URL is beyond the depth limit
if ($depth > $_ODATA['sp_limit_depth']) {
OS_crawlLog('Maximum link depth ('.$_ODATA['sp_limit_depth'].') exceeded; URL at depth '.$depth.' was not stored: '.$url, 2);
continue;
}
// Check robots.txt for newly encountered hostnames
$purl = parse_url($url);
$port = (!empty($purl['port'])) ? ':'.$purl['port'] : '';
if (!isset($_RDATA['sp_robots'][$purl['host']])) {
$_RDATA['sp_robots'][$purl['host']] = array();
OS_crawlLog('Fetching robots.txt for domain: '.$purl['host'], 1);
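// Reset any If-Modified-Since condition left on the handle so
// robots.txt is always fetched in full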
curl_setopt($_cURL, CURLOPT_TIMECONDITION, CURL_TIMECOND_NONE);
$robotstxt = OS_fetchURL($purl['scheme'].'://'.$purl['host'].$port.'/robots.txt', '');
if (!$robotstxt['errno']) {
$robots = array();
$robot = '';
// The raw robots.txt response is in 'body'; 'content' is only filled
// in later by the page parsers
$robolines = explode("\n", $robotstxt['body']);
foreach ($robolines as $line) {
if (preg_match('/^user-agent\s*:\s*(.*)\s*$/i', $line, $r)) {
if (empty($robots[$robot = $r[1]]))
$robots[$robot] = array('disallow' => array(), 'allow' => array());
} else if (preg_match('/((dis)?allow)\s*:\s*(.*)\s*$/i', $line, $r))
$robots[$robot][strtolower($r[1])][] = OS_formatURL($r[3], $url);
}
foreach ($robots as $agent => $rules) {
if (preg_match('/^orc(a|inus)(-?php)?-?crawler$/i', $agent) || $agent == '*') {
foreach ($rules['disallow'] as $disrule)
if (!in_array($disrule, $_RDATA['sp_robots'][$purl['host']], true))
$_RDATA['sp_robots'][$purl['host']][] = $disrule;
foreach ($rules['allow'] as $rule) {
$key = array_search($rule, $_RDATA['sp_robots'][$purl['host']]);
if ($key !== false) unset($_RDATA['sp_robots'][$purl['host']][$key]);
}
}
}
}
}
if ($_RDATA['debug'])
OS_crawlLog('Memory used: '.OS_readSize(memory_get_usage(true)), 1);
OS_crawlLog('Crawling: '.$url.' (Depth: '.$depth.')', 1);
OS_setValue('sp_progress', array(
count($_RDATA['sp_links']),
count($_RDATA['sp_links']) + count($_RDATA['sp_queue']),
$_RDATA['sp_resumed']
));
OS_setValue('sp_time_end', time());
// Set the correct If-Modified-Since request header
if ($_ODATA['sp_ifmodifiedsince'] && isset($_RDATA['sp_lastmod'][$url])) {
curl_setopt($_cURL, CURLOPT_TIMEVALUE, $_RDATA['sp_lastmod'][$url]);
curl_setopt($_cURL, CURLOPT_TIMECONDITION, CURL_TIMECOND_IFMODSINCE);
} else curl_setopt($_cURL, CURLOPT_TIMECONDITION, CURL_TIMECOND_NONE);
// Fetch the URL
$data = OS_fetchURL($url, $referer);
// Record cURL timing and data info for this fetch
OS_setValue('sp_data_transferred', $_ODATA['sp_data_transferred'] + $data['info']['size_download']);
$_RDATA['sp_time_curl'] += $data['info']['total_time'];
// If there were cURL errors while fetching this URL
if ($data['errno']) {
// Else if the page hasn't been modified since the last crawl
} else if ($data['info']['http_code'] == 304) {
$data['info']['noindex'] = 'not-modified-304';
// Else if we received any content at all
} else if (trim($data['body'])) {
// Get a 20-byte binary hash of the raw content
$data['info']['sha1'] = sha1($data['body'], true);
// If this content does not duplicate previously stored content
if (empty($_RDATA['sp_sha1'][$data['info']['sha1']])) {
// Add the content hash to the tally
$_RDATA['sp_sha1'][$data['info']['sha1']] = $url;
// If this is a new page, or an existing page but the content
// hash has changed
if (!isset($_RDATA['sp_exist'][$data['info']['sha1']]) ||
$_RDATA['sp_exist'][$data['info']['sha1']] != $url) {
// Detect MIME-type using extension?
if (empty($data['info']['content_type']))
$data['info']['content_type'] = 'text/plain';
// Parse MIME-type
$data['info']['mime_type'] = '';
if (preg_match('/\w+\/[\w.+-]+/', $data['info']['content_type'], $m))
$data['info']['mime_type'] = $m[0];
// Parse Character Encoding
$data['info']['charset'] = '';
if (preg_match('/charset=([\w\d.:-]+)/i', $data['info']['content_type'], $m))
$data['info']['charset'] = $m[1];
if (!$data['info']['charset'])
$data['info']['charset'] = 'ISO-8859-1';
// GZ-Unzip the content if necessary
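// (assumes a bare 10-byte gzip header with no optional fields; the
// loop handles doubly-encoded bodies)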
while (strpos($data['body'], "\x1f\x8b") === 0)
$data['body'] = gzinflate(substr($data['body'], 10));
// Title defaults to filename
$data['title'] = basename($data['info']['url']);
// Determine how to parse the content by MIME-type
switch ($data['info']['mime_type']) {
/* ***** PLAIN TEXT ************************************** */
case 'text/plain':
$data['content'] = $data['body'];
OS_cleanTextUTF8($data['content'], $data['info']['charset']);
break;
/* ***** XML DOCUMENT ************************************ */
case 'text/xml':
case 'application/xml':
$data['body'] = preg_replace('/<br(\s?\/)?>/', ' ', $data['body']);
$document = new DOMDocument();
if ($document->loadXML($data['body'], LIBXML_PARSEHUGE | LIBXML_BIGLINES | LIBXML_COMPACT)) {
// Remove <script> elements
$scripts = iterator_to_array($document->getElementsByTagName('script')); // snapshot: the node list is live
foreach ($scripts as $script)
$script->parentNode->removeChild($script);
// Remove <!-- comments -->
$xpath = new DOMXpath($document);
$comments = $xpath->query('//comment()');
foreach ($comments as $comment)
$comment->parentNode->removeChild($comment);
// Check XML document charset
if (strtolower($data['info']['charset']) != strtolower($document->xmlEncoding)) {
OS_crawlLog('Charset in Content-type header ('.(($data['info']['charset']) ? $data['info']['charset'] : '<none>').') differs from document charset ('.(($document->xmlEncoding) ? $document->xmlEncoding : '<none>').') at: '.$data['info']['url'], 1);
$data['info']['charset'] = $document->xmlEncoding;
}
$data['content'] = $document->textContent;
// Could not parse XML; try to store content anyway
} else {
$data['error'] = 'Invalid XML - could not parse content; storing as-is';
$data['info']['nofollow'] = true;
// Remove <script> elements and <!-- comments -->
$data['content'] = preg_replace(array('/<!--.*?-->/s', '/<script.*?\/script>/is'), '', $data['body']);
$data['content'] = strip_tags($data['content']);
}
OS_cleanTextUTF8($data['content'], $data['info']['charset'], ENT_XML1);
break;
/* ***** HTML DOCUMENT *********************************** */
case 'text/html':
case 'application/xhtml+xml':
$data['body'] = preg_replace('/<br(\s?\/)?>/', ' ', $data['body']);
$document = new DOMDocument();
if ($document->loadHTML($data['body'], LIBXML_PARSEHUGE | LIBXML_BIGLINES | LIBXML_COMPACT | LIBXML_NOCDATA)) {
// Remove <script> elements
$scripts = iterator_to_array($document->getElementsByTagName('script')); // snapshot: the node list is live
foreach ($scripts as $script)
$script->parentNode->removeChild($script);
// Remove <!-- comments -->
$xpath = new DOMXpath($document);
$comments = $xpath->query('//comment()');
foreach ($comments as $comment)
$comment->parentNode->removeChild($comment);
// ***** Process <head> elements
$head = $document->getElementsByTagName('head');
if (!empty($head[0])) {
$base = $head[0]->getElementsByTagName('base');
if (!empty($base[0]))
for ($x = 0; $x < count($base[0]->attributes); $x++)
if (strtolower($base[0]->attributes[$x]->name) == 'href')
if (!empty($base[0]->attributes[$x]->value))
$data['base'] = filter_var($base[0]->attributes[$x]->value, FILTER_SANITIZE_URL);
$metas = $head[0]->getElementsByTagName('meta');
foreach ($metas as $meta) {
for ($x = 0; $x < count($meta->attributes); $x++) {
if (strtolower($meta->attributes[$x]->name) == 'charset') {
if (strtolower($data['info']['charset']) != strtolower($meta->attributes[$x]->value)) {
OS_crawlLog('Charset in Content-type header ('.(($data['info']['charset']) ? $data['info']['charset'] : '<none>').') differs from document charset ('.(($meta->attributes[$x]->value) ? $meta->attributes[$x]->value : '<none>').') at: '.$data['info']['url'], 1);
$data['info']['charset'] = $meta->attributes[$x]->value;
}
} else if (strtolower($meta->attributes[$x]->name) == 'http-equiv') {
switch (strtolower($meta->attributes[$x]->value)) {
case 'refresh':
for ($y = 0; $y < count($meta->attributes); $y++) {
if (strtolower($meta->attributes[$y]->name) == 'content') {
if (preg_match('/(\d+)\s?;\s?url\s?=\s?([\'"])(.+?)\2?\s?$/i', $meta->attributes[$y]->value, $m)) {
if ((int)$m[1] <= $_ODATA['sp_timeout_url']) {
$data['errno'] = 300;
$data['error'] = 'Redirected by <meta> element to: '.$m[3];
$data['info']['redirect_url'] = $m[3];
$data['info']['noindex'] = 'redirect-meta';
$data['info']['nofollow'] = true;
break 4;
} else $data['links'][] = $m[3];
}
}
}
break;
case 'content-type':
for ($y = 0; $y < count($meta->attributes); $y++) {
if (strtolower($meta->attributes[$y]->name) == 'content' && preg_match('/charset=([\w\d.:-]+)/i', $meta->attributes[$y]->value, $m)) {
if (strtolower($data['info']['charset']) != strtolower($m[1])) {
OS_crawlLog('Charset in Content-type header ('.(($data['info']['charset']) ? $data['info']['charset'] : '<none>').') differs from document charset ('.(($m[1]) ? $m[1] : '<none>').') at: '.$data['info']['url'], 1);
$data['info']['charset'] = $m[1];
}
}
}
}
} else if (strtolower($meta->attributes[$x]->name) == 'name') {
switch (strtolower($meta->attributes[$x]->value)) {
case 'keywords':
for ($y = 0; $y < count($meta->attributes); $y++)
if (strtolower($meta->attributes[$y]->name) == 'content')
$data['keywords'] = $meta->attributes[$y]->value;
break;
case 'description':
for ($y = 0; $y < count($meta->attributes); $y++)
if (strtolower($meta->attributes[$y]->name) == 'content')
$data['description'] = $meta->attributes[$y]->value;
break;
case 'robots':
case 'orcacrawler':
case 'orcaphpcrawler':
case 'orca-crawler':
case 'orcaphp-crawler':
case 'orca-phpcrawler':
case 'orca-php-crawler':
case 'orcinuscrawler':
case 'orcinus-crawler':
for ($y = 0; $y < count($meta->attributes); $y++) {
if (strtolower($meta->attributes[$y]->name) == 'content') {
$content = explode(',', $meta->attributes[$y]->value);
foreach ($content as $con) {
switch (trim(strtolower($con))) {
case 'nofollow':
$data['info']['nofollow'] = true;
break;
case 'noindex':
$data['error'] = 'Not indexed due to robots <meta> element';
$data['info']['noindex'] = 'robots-meta';
}
}
}
}
}
}
}
}
$title = $head[0]->getElementsByTagName('title');
// Keep the filename-based default if the document has no <title>
if (!empty($title[0])) $data['title'] = $title[0]->textContent;
$links = $head[0]->getElementsByTagName('link');
foreach ($links as $link) {
for ($x = 0; $x < count($link->attributes); $x++) {
if (strtolower($link->attributes[$x]->name) == 'rel') {
for ($y = 0; $y < count($link->attributes); $y++) {
if (strtolower($link->attributes[$y]->name) == 'href') {
$linkurl = filter_var($link->attributes[$y]->value, FILTER_SANITIZE_URL);
switch (strtolower($link->attributes[$x]->value)) {
case 'canonical':
if (OS_formatURL($linkurl, $data['base']) != $data['info']['url']) {
$data['info']['noindex'] = 'non-canonical';
$data['info']['canonical'] = $linkurl;
}
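// Fall through: the canonical target is still queued as a link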
case 'alternate':
case 'author':
case 'help':
case 'license':
case 'me':
case 'next':
case 'prev':
case 'search':
$data['links'][] = $linkurl;
}
break;
}
}
}
}
}
}
// ***** Process <body> elements
$body = $document->getElementsByTagName('body');
if (!empty($body[0])) {
// Replace <img> tags with their alt text
$imgs = iterator_to_array($body[0]->getElementsByTagName('img')); // snapshot: the node list is live
foreach ($imgs as $img) {
for ($x = 0; $x < count($img->attributes); $x++) {
if (strtolower($img->attributes[$x]->name) == 'alt') {
$img->parentNode->replaceChild(
$document->createTextNode(' '.$img->attributes[$x]->value.' '),
$img
);
break;
}
}
}
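// Collect <a href> links, skipping any marked rel="nofollow"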
$as = $body[0]->getElementsByTagName('a');
foreach ($as as $a) {
for ($x = 0; $x < count($a->attributes); $x++) {
if (strtolower($a->attributes[$x]->name) == 'href') {
for ($y = 0; $y < count($a->attributes); $y++)
if (strtolower($a->attributes[$y]->name) == 'rel' && strtolower($a->attributes[$y]->value) == 'nofollow') continue 3;
$data['links'][] = $a->attributes[$x]->value;
}
}
}
$areas = $body[0]->getElementsByTagName('area');
foreach ($areas as $area) {
for ($x = 0; $x < count($area->attributes); $x++) {
if (strtolower($area->attributes[$x]->name) == 'href') {
for ($y = 0; $y < count($area->attributes); $y++)
if (strtolower($area->attributes[$y]->name) == 'rel' && strtolower($area->attributes[$y]->value) == 'nofollow') continue 3;
$data['links'][] = $area->attributes[$x]->value;
}
}
}
$frames = $body[0]->getElementsByTagName('frame');
foreach ($frames as $frame)
for ($x = 0; $x < count($frame->attributes); $x++)
if (strtolower($frame->attributes[$x]->name) == 'src')
$data['links'][] = $frame->attributes[$x]->value;
$iframes = $body[0]->getElementsByTagName('iframe');
foreach ($iframes as $iframe)
for ($x = 0; $x < count($iframe->attributes); $x++)
if (strtolower($iframe->attributes[$x]->name) == 'src')
$data['links'][] = $iframe->attributes[$x]->value;
}
$data['links'] = array_map(function($l) {
if (preg_match('/^(tel|telnet|mailto|ftp|sftp|ssh|gopher|news|ldap|urn|onion|magnet):/i', $l)) return '';
return preg_replace('/#.*$/', '', $l);
}, $data['links']);
$data['links'] = array_filter(array_unique($data['links']));
// Remove tags
foreach ($_RDATA['sp_ignore_css'] as $ignoreCSS) {
switch ($ignoreCSS[0]) {
case '#': // Remove by ID
$id = $document->getElementById(substr($ignoreCSS, 1));
if (!is_null($id)) $id->parentNode->removeChild($id);
break;
case '.': // Remove by class
foreach ($xpath->evaluate('//*[contains(concat(" ", normalize-space(@class), " "), " '.substr($ignoreCSS, 1).' ")]') as $cls)
$cls->parentNode->removeChild($cls);
break;
default: // Remove by tag name
$tags = iterator_to_array($document->getElementsByTagName($ignoreCSS)); // snapshot: the node list is live
foreach ($tags as $tag)
$tag->parentNode->removeChild($tag);
}
}
// Weighted elements
foreach ($_RDATA['s_weight_css'] as $weightCSS) {
switch ($weightCSS[0]) {
case '#': // Get content by ID
$id = $document->getElementById(substr($weightCSS, 1));
if (!is_null($id)) $data['weighted'] .= $id->textContent.' ';
break;
case '.': // Get content by class
foreach ($xpath->evaluate('//*[contains(concat(" ", normalize-space(@class), " "), " '.substr($weightCSS, 1).' ")]') as $cls)
$data['weighted'] .= $cls->textContent.' ';
break;
default: // Get content by tag name
$tags = $document->getElementsByTagName($weightCSS);
foreach ($tags as $tag)
$data['weighted'] .= $tag->textContent.' ';
}
}
$data['content'] = $document->textContent;
// Could not parse HTML; try to store content anyway
} else {
$data['error'] = 'Invalid HTML - could not parse content; storing as-is';
$data['info']['nofollow'] = true;
// Remove <script> elements and <!-- comments -->
$data['content'] = preg_replace(array('/<!--.*?-->/s', '/<script.*?\/script>/is'), '', $data['body']);
$data['content'] = strip_tags($data['content']);
}
// Not sure I need to do this, but hey... I could, so...
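// Choose the entity table passed to html_entity_decode() based on the
// detected document type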
if ($data['info']['mime_type'] == 'application/xhtml+xml') {
$ent = ENT_XHTML;
} else if (!empty($document->doctype->publicId)) {
$publicId = strtoupper($document->doctype->publicId);
if (strpos($publicId, 'DTD XHTML') !== false) {
$ent = ENT_XHTML;
} else if (strpos($publicId, 'DTD HTML') !== false) {
$ent = ENT_HTML401;
} else $ent = ENT_XML1;
} else $ent = ENT_HTML5;
OS_cleanTextUTF8($data['title'], $data['info']['charset'], $ent);
OS_cleanTextUTF8($data['keywords'], $data['info']['charset'], $ent);
OS_cleanTextUTF8($data['description'], $data['info']['charset'], $ent);
OS_cleanTextUTF8($data['weighted'], $data['info']['charset'], $ent);
OS_cleanTextUTF8($data['content'], $data['info']['charset'], $ent);
break;
/* ***** PDF ********************************************* */
case 'application/pdf':
if ($_PDF) {
try {
$pdf = $_PDF->parseContent($data['body']);
$metadata = $pdf->getDetails();
// Prefer regular PDF metadata first, then try XMP
$getItems = array(
'title' => array('Title', 'dc:title', 'pdf:title'),
'description' => array('Subject', 'dc:description', 'pdf:subject'),
'keywords' => array('Keywords', 'dc:subject', 'pdf:keywords'),
'modified' => array('SourceModified', 'pdfx:sourcemodified', 'CreationDate', 'xmp:createdate')
);
foreach ($getItems as $key => $item) {
foreach ($item as $opt) {
if (!empty($metadata[$opt])) {
// Check if this is an array of list-items and if
// so, convert it to a comma-separated string
if (is_array($metadata[$opt]) && isset($metadata[$opt][0]) && is_string($metadata[$opt][0]))
$metadata[$opt] = implode(', ', $metadata[$opt]);
// Use the first valid string value we find as
// the appropriate property value
if (is_string($metadata[$opt]) && trim($metadata[$opt])) {
$data[$key] = $metadata[$opt];
break;
}
}
}
}
$data['content'] = $pdf->getText();
$data['info']['charset'] = mb_detect_encoding($data['content']);
if (!$data['info']['charset']) $data['info']['charset'] = 'CP1252';
OS_cleanTextUTF8($data['content'], $data['info']['charset']);
if (!empty($data['modified']))
if ($stamp = strtotime($data['modified']))
$data['info']['filetime'] = $stamp;
if ($data['content']) {
// Discard the PDF text if it contains Unicode control
// characters; some of these might be simple PDF ligatures
// but PDFParser doesn't support them; any content that
// contains these is usually mostly gobbledegook
if (strpos($data['content'], "\u{3}") === false &&
strpos($data['content'], "\u{2}") === false &&
strpos($data['content'], "\u{1}") === false) {
OS_cleanTextUTF8($data['title'], mb_detect_encoding($data['title']));
OS_cleanTextUTF8($data['keywords'], mb_detect_encoding($data['keywords']));
OS_cleanTextUTF8($data['description'], mb_detect_encoding($data['description']));
} else {
$data['errno'] = 703;
$data['error'] = 'Failed to decode PDF text';
$data['content'] = '';
$data['info']['noindex'] = 'couldnt-decode-pdf';
}
} else {
$data['errno'] = 702;
$data['error'] = 'PDF is empty of extractable text';
$data['info']['noindex'] = 'empty-pdf';
}
} catch (Exception $e) {
$data['errno'] = 701;
$data['error'] = 'PDF is secured/encrypted; text extraction failed';
$data['content'] = '';
$data['info']['noindex'] = 'secured-pdf';
}
} else $data['info']['noindex'] = 'missing-pdfparser';
break;
/* ***** Unknown MIME-type ******************************* */
default:
$data['error'] = 'Not indexed due to unknown MIME type ('.$data['info']['mime_type'].')';
$data['info']['noindex'] = 'unknown-mime';
}
// Else content is identical to the old entry so don't parse
} else {
$data['info']['noindex'] = 'not-modified-sha1';
}
// Else content is a duplicate of a previously stored page
} else {
// Update the stored URL to the shortest version
if (strlen($url) < strlen($_RDATA['sp_sha1'][$data['info']['sha1']])) {
$updateURL->execute(array(
'url' => $url,
'content_checksum' => $data['info']['sha1']
));
}
$data['info']['noindex'] = 'duplicate';
}
// Else the 'body' of the response was empty
} else {
$data['error'] = 'Server returned no content';
$data['info']['noindex'] = 'empty';
}
// Decide whether or not to 'index' / store this page
switch ($data['info']['noindex']) {
// ***** There is no 'noindex' reason, so store the page
case '':
case 'not-modified-304':
case 'not-modified-sha1':
if ($referer == '<orphan>') {
$data['info']['status'] = 'Orphan';
$_RDATA['sp_status']['Orphan']++;
} else $data['info']['status'] = 'OK';
// ***** If we got new or updated content for this URL
if (!$data['info']['noindex']) {
// If this URL exists (or existed) in the live table...
if (in_array($url, $_RDATA['sp_exist'], true) || $referer == '<orphan>') {
$_RDATA['sp_status']['Updated']++;
$selectData->execute(array('url' => $url));
$err = $selectData->errorInfo();
if ($err[0] != '00000') {
OS_crawlLog('Database select error: '.$url, 2);
OS_crawlLog($err[2], 0);
break 2;
}
$row = $selectData->fetchAll()[0];
// Else provide default values for a new URL
} else {
$_RDATA['sp_status']['New']++;
$row = array(
'category' => $_ODATA['sp_category_default'],
'flag_unlisted' => 0,
'priority' => 0.5
);
}
if ($data['info']['filetime'] <= 0)
$data['info']['filetime'] = time();
// Remove text from titles
foreach ($_RDATA['sp_title_strip'] as $titleStrip) {
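// As with the URL rules, a leading '*' marks the rest as a regex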
if ($titleStrip[0] == '*') {
$data['title'] = preg_replace('/'.str_replace('/', '\/', substr($titleStrip, 1)).'/', '', $data['title']);
} else $data['title'] = str_replace($titleStrip, '', $data['title']);
}
$port = (!empty($data['url']['port'])) ? ':'.$data['url']['port'] : '';
$insertTemp->execute(array(
'url' => $url,
'title' => trim($data['title']),
'description' => $data['description'],
'keywords' => $data['keywords'],
'category' => $row['category'],
'weighted' => $data['weighted'],
'links' => json_encode($data['links'], JSON_INVALID_UTF8_IGNORE),
'content' => $data['content'],
'content_mime' => $data['info']['mime_type'],
'content_charset' => $data['info']['charset'],
'content_checksum' => $data['info']['sha1'],
'status' => $data['info']['status'],
'flag_unlisted' => $row['flag_unlisted'],
'flag_updated' => 1,
'last_modified' => $data['info']['filetime'],
'priority' => $row['priority']
));
if (!$insertTemp->rowCount()) {
OS_crawlLog('Database primary insert error: '.$url, 2);
$err = $insertTemp->errorInfo();
if ($err[0] != '00000') OS_crawlLog($err[2], 0);
} else $_RDATA['sp_store'][] = $url;
// ***** URL hasn't been modified since the last successful crawl
} else {
OS_crawlLog('Page hasn\'t been modified since the last successful crawl', 0);
// Preset the 'last_modified' time and 'priority' until we can
// find out the actual values from the previous database record
$data['info']['filetime'] = time();
$row = array('priority' => 0.5);
// Get previous entry from existing search database
$insertNotModified->execute(array(
'url' => $url,
'status' => $data['info']['status']
));
if ($insertNotModified->rowCount()) {
// Mark as 'stored'
$_RDATA['sp_store'][] = $url;
// Get 'priority' & 'last_modified' values for the sitemap
// Load the previously saved link list to add to the queue
$selectData->execute(array('url' => $url));
$err = $selectData->errorInfo();
if ($err[0] == '00000') {
$row = $selectData->fetchAll()[0];
$data['links'] = json_decode($row['links'], true);
$data['info']['filetime'] = $row['last_modified'];
} else OS_crawlLog('Database existing table row read error: '.$url, 2);
// Could not insert previously stored row into temp table
} else {
OS_crawlLog('Database \'not-modified\' insert error: '.$url, 2);
$err = $insertNotModified->errorInfo();
if ($err[0] != '00000') OS_crawlLog($err[2], 0);
}
}
$domain = $data['url']['scheme'].'://'.$data['url']['host'];
if (!isset($_RDATA['sp_domains'][$domain])) {
$_RDATA['sp_domains'][$domain] = 1;
} else $_RDATA['sp_domains'][$domain]++;
// Store data for use in the sitemap
if ($_ODATA['sp_sitemap_file'] &&
$data['url']['host'] == $_ODATA['sp_sitemap_hostname']) {
$delta = time() - $data['info']['filetime'];
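// Bucket thresholds: 45 min, 18 h, 5 days, 25 days, 250 days, ~2 years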
$cf = 'always';
if ($delta > 2700 && $delta <= 64800) $cf = 'hourly';
if ($delta > 64800 && $delta <= 432000) $cf = 'daily';
if ($delta > 432000 && $delta <= 2160000) $cf = 'weekly';
if ($delta > 2160000 && $delta <= 21600000) $cf = 'monthly';
if ($delta > 21600000 && $delta <= 62400000) $cf = 'yearly';
if ($delta > 62400000) $cf = 'never';
$_RDATA['sp_sitemap'][] = array(
'loc' => str_replace(' ', '%20', htmlentities($url)),
'lastmod' => date('Y-m-d', $data['info']['filetime']),
'changefreq' => $cf,
'priority' => $row['priority']
);
}
break;
// ***** Otherwise, log the reason why this page was not stored
case 'duplicate':
OS_crawlLog('Content is a duplicate of already indexed page: '.$_RDATA['sp_sha1'][$data['info']['sha1']].' (Referrer was: '.$referer.')', 2);
break;
case 'timeout':
case 'network-error':
case 'couldnt-connect':
OS_crawlLog($data['error'].': '.$url, 2);
if ($referer == '<orphan>') $_RDATA['sp_status']['Blocked']++;
break;
case 'empty':
case 'too-large':
case 'robots-meta':
case 'robots-http':
case 'unknown-mime':
case 'self-reference':
case 'empty-pdf':
case 'secured-pdf':
case 'couldnt-decode-pdf':
OS_crawlLog($data['error'], 1);
if ($referer == '<orphan>') $_RDATA['sp_status']['Blocked']++;
break;
case '400':
OS_crawlLog($data['error'].': '.$url.' (Referrer was: '.$referer.')', 2);
if ($referer == '<orphan>') $_RDATA['sp_status']['Not Found']++;
break;
case 'redirect-meta':
case 'redirect-location':
OS_crawlLog($data['error'].': '.$url.' (Referrer was: '.$referer.')', 2);
OS_crawlLog('Page was removed in favour of redirected URL', 0);
$data['links'][] = $data['info']['redirect_url'];
break;
case 'non-canonical':
OS_crawlLog('Not indexed due to canonical <link> element: '.$data['info']['canonical'], 1);
OS_crawlLog('Referrer was: '.$referer, 0);
break;
default:
OS_crawlLog('Not indexed due to noindex rule \''.$data['info']['noindex'].'\': '.$url.' (Referrer was: '.$referer.')', 2);
if ($referer == '<orphan>') $_RDATA['sp_status']['Blocked']++;
break;
}
// Check if we have stored the maximum allowed number of pages
if (count($_RDATA['sp_store']) >= $_ODATA['sp_limit_store']) {
OS_crawlLog('Maximum number of crawled pages reached ('.$_ODATA['sp_limit_store'].')', 1);
$_RDATA['sp_complete'] = true;
break;
}
// If we fetched more links from the content above, parse and add
// them to the queue
if (!$data['info']['nofollow']) {
foreach ($data['links'] as $link) {
$link = OS_formatURL($link, $data['base']);
// ***** If this link hasn't been crawled yet
if (!in_array($link, $_RDATA['sp_links'], true)) {
// ... and if link hasn't been queued yet
foreach ($_RDATA['sp_queue'] as $queue)
if ($link == $queue[0]) continue 2;
// ... and if link passes our user filters
if ($nx = OS_filterURL($link, $data['base'])) {
OS_crawlLog('Link ignored due to noindex rule \''.$nx.'\': '.$link, 0);
// ... then add the link to the queue
} else $_RDATA['sp_queue'][] = array($link, $depth + 1, $url);
}
}
}
// If we've completed the queue, check for orphans
if (!count($_RDATA['sp_queue'])) {
// Diff the previous URL list with the links we've already scanned
$_RDATA['sp_exist'] = array_diff($_RDATA['sp_exist'], $_RDATA['sp_links']);
// If we have leftover links, and we aren't autodeleting them
if (count($_RDATA['sp_exist']) && !$_ODATA['sp_autodelete']) {
OS_crawlLog('Adding '.count($_RDATA['sp_exist']).' orphan(s) to queue...', 1);
foreach ($_RDATA['sp_exist'] as $key => $link) {
// Check if orphan URL passes our user filters
if ($nx = OS_filterURL($link, $data['base'])) {
// If not, remove it from the sp_exist list
OS_crawlLog('Orphan URL ignored due to noindex rule \''.$nx.'\': '.$link, 0);
$_RDATA['sp_status']['Blocked']++;
unset($_RDATA['sp_exist'][$key]);
// If so, then add the orphan to the queue
} else $_RDATA['sp_queue'][] = array($link, 0, '<orphan>');
}
// Else if we stored some pages, we're done
} else if (count($_RDATA['sp_store'])) {
$_RDATA['sp_complete'] = true;
// No pages were stored
} else OS_crawlLog('No pages could be indexed; check your starting URL(s)', 2);
}
gc_collect_cycles();
usleep($_ODATA['sp_sleep'] * 1000);
$_RDATA['sp_sleep'] += $_ODATA['sp_sleep'];
}
// ***** Write sitemap
if ($_RDATA['sp_complete'] && $_ODATA['sp_sitemap_file']) {
if ($_RDATA['sp_sitemap_file'] != 'does not exist') {
if ($_RDATA['sp_sitemap_file'] != 'not writable') {
$sm = array('<?xml version="1.0" encoding="UTF-8"?>');
$sm[] = '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">';
foreach ($_RDATA['sp_sitemap'] as $sitemap) {
$sm[] = ' <url>';
foreach ($sitemap as $key => $value)
if ($key != 'priority' || $value != 0.5)
$sm[] = ' <'.$key.'>'.$value.'</'.$key.'>';
$sm[] = ' </url>';
}
$sm[] = '</urlset>';
if (preg_match('/\.xml\.gz$/', $_RDATA['sp_sitemap_file'])) {
if (function_exists('gzopen')) {
$smf = gzopen($_RDATA['sp_sitemap_file'], 'w');
gzwrite($smf, implode("\n", $sm));
gzclose($smf);
OS_crawlLog('Sitemap written successfully: '.$_ODATA['sp_sitemap_file'], 1);
} else OS_crawlLog('Could not write sitemap; PHP gzip functions are not enabled', 2);
} else if (preg_match('/\.xml$/', $_RDATA['sp_sitemap_file'])) {
$smf = fopen($_RDATA['sp_sitemap_file'], 'w');
fwrite($smf, implode("\n", $sm));
fclose($smf);
OS_crawlLog('Sitemap written successfully: '.$_ODATA['sp_sitemap_file'], 1);
} else OS_crawlLog('Sitemap filename ('.$_ODATA['sp_sitemap_file'].') must have extension \'.xml\' or \'.xml.gz\'', 2);
} else OS_crawlLog('Sitemap file \''.$_ODATA['sp_sitemap_file'].'\' is not writable', 2);
} else OS_crawlLog('Sitemap file \''.$_ODATA['sp_sitemap_file'].'\' does not exist', 2);
} ?>