'; } fwrite($_RDATA['sp_log'], $prefix.$text."\n"); if ($_RDATA['debug'] || ($_SERVER['REQUEST_METHOD'] == 'CLI' && $level >= $_RDATA['sp_log_clilevel'])) { echo $prefix.$text."\n"; } }


/**
 * Final prep to store content in UTF-8 format in the database
 *
 * Works in place on $_ (passed by reference):
 *  1. converts the text from $charset to UTF-8,
 *  2. optionally decodes HTML entities,
 *  3. applies the $_RDATA['sp_punct'] and $_RDATA['sp_whitespace']
 *     replacement tables,
 *  4. trims the text and collapses every run of whitespace to one space.
 *
 * @param string    $_       Text to clean; overwritten with the result
 * @param string    $charset Source encoding name. Blank values, and names
 *                           mbstring does not recognise, fall back to
 *                           ISO-8859-1
 * @param int|false $entity  Flags for html_entity_decode() (ENT_SUBSTITUTE
 *                           is always added), or false to skip decoding
 * @return void
 *
 */
function OS_cleanTextUTF8(&$_, $charset, $entity = false) {
  global $_RDATA;

  $charset = trim((string)$charset);
  if (!$charset) $charset = 'ISO-8859-1';

  // The charset name is parsed from remote Content-Type data, so it may
  // not be an encoding mbstring knows. An unknown name makes
  // mb_convert_encoding() throw a ValueError (PHP 8) or emit a warning
  // and return false (PHP 7); either would lose this page's text, so
  // retry with the default encoding instead.
  try {
    $converted = @mb_convert_encoding($_, 'UTF-8', $charset);
  } catch (\ValueError $e) {
    $converted = false;
  }
  if ($converted === false)
    $converted = mb_convert_encoding($_, 'UTF-8', 'ISO-8859-1');
  $_ = $converted;

  if ($entity)
    $_ = html_entity_decode($_, $entity | ENT_SUBSTITUTE, 'UTF-8');

  $_ = strtr($_, $_RDATA['sp_punct']);
  $_ = strtr($_, $_RDATA['sp_whitespace']);

  // First turn every whitespace character into a space, then squeeze
  // runs of spaces down to a single space
  $_ = preg_replace(array('/\s/', '/ {2,}/'), ' ', trim($_));
}


/**
 * Format a full or partial URL into a full URL according to a base URL
 *
 */
function OS_formatURL($_, $base) {
  $_ = str_replace(' ', '%20', $_);
  $_ = preg_replace('/#.*$/', '', filter_var($_, FILTER_SANITIZE_URL));
  $_ = str_replace('%20', ' ', $_);

  // NOTE(review): this copy of the file is missing the text between the
  // start of the regex below and the result array built by OS_fetchURL;
  // the damaged statement is left exactly as found.
  $dirbase = preg_replace('/(? parse_url($url), 'body' => curl_exec($_cURL),
    'base' => $url,
    'info' => curl_getinfo($_cURL),
    'error' => curl_error($_cURL),
    'errno' => curl_errno($_cURL),
    'links' => array(),
    'title' => '',
    'content' => '',
    'keywords' => '',
    'weighted' => '',
    'description' => ''
  );
  $_['info']['url'] = $url;
  $_['info']['noindex'] = '';
  $_['info']['nofollow'] = false;

  // Process any cURL errors
  switch ($_['errno']) {
    case 0: // Success
    case 42: // Aborted by callback
      if ($_['info']['http_code'] >= 400) {
        $_['errno'] = 22;
        $_['error'] = $_['info']['http_code'].' 
error';
        $_['info']['noindex'] = '400';

      } else if ($_['info']['redirect_url']) {
        $_['errno'] = 300;
        $_['error'] = 'Redirected by HTTP header to: '.$_['info']['redirect_url'];
        $_['info']['noindex'] = 'redirect-location';

      } else if ($_RDATA['sp_robots_header']) {
        $_['errno'] = 777;
        $_['error'] = 'Blocked by \'X-Robots-Tag\' HTTP header';
        $_['info']['noindex'] = 'robots-http';

      } else if ($_RDATA['sp_self_reference']) {
        $_['errno'] = 888;
        $_['error'] = 'Refused to index myself';
        $_['info']['noindex'] = 'self-reference';

      } else if ($_['errno'] == 42) {
        $_['errno'] = 999;
        $_['error'] = 'Max filesize exceeded';
        $_['info']['noindex'] = 'too-large';
      }
      break;

    case 28: // Timeout
      $_['error'] = 'Timed out waiting for data';
      $_['info']['noindex'] = 'timeout';
      break;

    case 55: // Network send error
    case 56: // Network receive error
      $_['error'] = 'Network error retrieving data';
      $_['info']['noindex'] = 'network-error';
      break;

    case 6: // Could not resolve host
    case 7: // Could not connect to host
      $_['error'] = 'Couldn\'t connect to host: '.$_['url']['host'];
      $_['info']['noindex'] = 'couldnt-connect';
      break;

    default: // Uncaught cURL error
      OS_crawlLog('Uncaught cURL error: '.$url, 2);
      OS_crawlLog($_['errno'], 1);
      OS_crawlLog($_['error'], 1);
      OS_crawlLog(print_r($_['info'], true), 1);
      throw new Exception('Uncaught cURL error');

  }

  return $_;
}


/**
 * Shutdown function to provide cleanup before exit
 *
 * Registered with register_shutdown_function() once a crawl begins. It:
 *  - does nothing if the 'sp_crawling' flag is already cleared,
 *  - marks the crawl as failed if the script died from an E_ERROR,
 *  - on success: writes the natural sort order, truncates `crawldata`,
 *    copies `crawltemp` into it, records statistics, writes the summary
 *    to the log and optionally emails it,
 *  - on failure: logs the failure and optionally emails it,
 *  - always: drops `crawltemp`, stores the log text in the config
 *    database, clears the 'sp_crawling' flag and ends the request with
 *    a JSON message (web) or plain text message (CLI).
 *
 * @return void  This function never returns normally; it ends with die()
 *
 */
function OS_crawlCleanUp() {
  global $_DDATA, $_ODATA, $_RDATA, $_cURL, $_MAIL;

  // If the crawl has already been canceled, don't bother
  if (!OS_getValue('sp_crawling')) return;

  $error = error_get_last();
  if (!is_null($error) && $error['type'] == E_ERROR) {
    OS_crawlLog($error['message'], 2);
    OS_crawlLog('File: \''.$error['file'].'\' at line number: '.$error['line'], 0);
    $_RDATA['sp_complete'] = false;
  }

  // The handle is false when cURL is unavailable and unset when the
  // script stopped before it was created; calling cURL functions on it
  // then would abort this cleanup and leave the crawling flag set
  if ($_cURL) curl_close($_cURL);

  // PDO::query() returns false rather than a statement when a query
  // fails outside of exception mode, so take the error details from the
  // statement when there is one and from the connection otherwise
  $queryError = function ($result) use ($_DDATA) {
    return ($result) ? $result->errorInfo() : $_DDATA['pdo']->errorInfo();
  };

  // If crawl completed successfully
  if ($_RDATA['sp_complete']) {
    OS_crawlLog('Cleaning up database tables...', 1);

    // Add a natural sort order value to each entry
    natcasesort($_RDATA['sp_store']);
    $_RDATA['sp_store'] = array_values($_RDATA['sp_store']);
    $url_sort = $_DDATA['pdo']->prepare(
      'UPDATE `'.$_DDATA['tbprefix'].'crawltemp` SET `url_sort`=:url_sort WHERE `url`=:url;'
    );
    foreach ($_RDATA['sp_store'] as $key => $stored_url) {
      $url_sort->execute(array(
        'url_sort' => $key,
        'url' => $stored_url
      ));
      $err = $url_sort->errorInfo();
      if ($err[0] != '00000') {
        OS_crawlLog('Error sorting the search database', 1);
        OS_crawlLog($err[2], 0);
        break;
      }
    }

    // Truncate the existing search database
    $truncate = $_DDATA['pdo']->query(
      'TRUNCATE `'.$_DDATA['tbprefix'].'crawldata`;'
    );
    $err = $queryError($truncate);
    if ($err[0] != '00000') {
      OS_crawlLog('Could not truncate the search database', 1);
      OS_crawlLog($err[2], 0);

      // Last chance to bail out before we make actual changes
      $_RDATA['sp_complete'] = false;
    }
  }

  // If crawl completed successfully AND we truncated the old table
  if ($_RDATA['sp_complete']) {
    OS_setValue('sp_time_end', time());
    OS_setValue('sp_time_last', $_ODATA['sp_time_end'] - $_ODATA['sp_time_start']);

    // Select all rows from the temp table into the existing search table
    $insert = $_DDATA['pdo']->query(
      'INSERT INTO `'.$_DDATA['tbprefix'].'crawldata` SELECT * FROM `'.$_DDATA['tbprefix'].'crawltemp`;'
    );
    $err = $queryError($insert);
    if ($err[0] == '00000') {

      $tableinfo = $_DDATA['pdo']->query(
        'SHOW TABLE STATUS LIKE \''.$_DDATA['tbprefix'].'crawldata\';'
      );
      $statusErr = $queryError($tableinfo);
      if ($statusErr[0] == '00000') {
        $tableinfo = $tableinfo->fetchAll();
        OS_setValue('sp_data_stored', $tableinfo[0]['Data_length']);
      } else OS_crawlLog('Could not read crawl table status', 1);

      // Purge the search result cache
      if ($_ODATA['s_limit_cache']) {
        $purge = $_DDATA['pdo']->query(
          'UPDATE `'.$_DDATA['tbprefix'].'query` SET `cache`=\'\';'
        );
        $purgeErr = $queryError($purge);
        if ($purgeErr[0] != '00000')
          OS_crawlLog('Could not purge search result cache', 1);
      }

      // Optimize the query log table
      $optimize = $_DDATA['pdo']->query(
        'OPTIMIZE TABLE `'.$_DDATA['tbprefix'].'query`;'
      );

      OS_setValue('sp_pages_stored', count($_RDATA['sp_store']));
      OS_setValue('sp_domains', $_RDATA['sp_domains']);
      OS_setValue('sp_time_end_success', $_ODATA['sp_time_end']);

      // A crawl that finishes within the same second has a runtime of 0;
      // treat it as one second so the average below cannot divide by zero
      $elapsed = max(1, $_ODATA['sp_time_last']);

      OS_crawlLog('***** Crawl completed in '.$_ODATA['sp_time_last'].'s *****', 1);
      OS_crawlLog('Total data transferred: '.OS_readSize($_ODATA['sp_data_transferred']), 1);
      OS_crawlLog('Average transfer speed: '.OS_readSize(round($_ODATA['sp_data_transferred'] / $elapsed)).'/s', 1);
      if ($_RDATA['sp_sleep'])
        OS_crawlLog('Time spent sleeping: '.(round($_RDATA['sp_sleep'] / 10) / 100).'s', 1);
      OS_crawlLog('Time taken by cURL: '.(round($_RDATA['sp_time_curl'] * 100) / 100).'s', 1);
      OS_crawlLog($_ODATA['sp_progress'][0].' page'.(($_ODATA['sp_progress'][0] == 1) ? '' : 's').' crawled', 1);
      OS_crawlLog($_ODATA['sp_pages_stored'].' page'.(($_ODATA['sp_pages_stored'] == 1) ? '' : 's').' stored', 1);
      if ($_RDATA['sp_status']['New'])
        OS_crawlLog($_RDATA['sp_status']['New'].' new '.(($_RDATA['sp_status']['New'] == 1) ? 'page' : 'pages').' found', 0);
      if ($_RDATA['sp_status']['Updated'])
        OS_crawlLog($_RDATA['sp_status']['Updated'].' '.(($_RDATA['sp_status']['Updated'] == 1) ? 'page' : 'pages').' updated', 0);
      if ($_RDATA['sp_status']['Blocked'])
        OS_crawlLog($_RDATA['sp_status']['Blocked'].' '.(($_RDATA['sp_status']['Blocked'] == 1) ? 'page' : 'pages').' blocked', 0);
      if ($_RDATA['sp_status']['Not Found'])
        OS_crawlLog($_RDATA['sp_status']['Not Found'].' '.(($_RDATA['sp_status']['Not Found'] == 1) ? 'page' : 'pages').' not found', 0);
      if ($_RDATA['sp_status']['Orphan'])
        OS_crawlLog($_RDATA['sp_status']['Orphan'].' orphaned '.(($_RDATA['sp_status']['Orphan'] == 1) ? 'page' : 'pages'), 0);
      if ($_ODATA['sp_autodelete'])
        OS_crawlLog('Orphaned pages were auto-deleted', 1);

      // Send success email to the admin(s)
      if ($_MAIL && count($_MAIL->getAllRecipientAddresses()) && $_ODATA['sp_email_success']) {
        $_MAIL->Subject = 'Orcinus Site Search Crawler: Crawl succeeded';
        $_MAIL->Body = implode(" \r\n", preg_grep('/^[\[\*]/', explode("\n", file_get_contents($_ODATA['sp_log']))));
        if (!$_MAIL->Send()) OS_crawlLog('Could not send notification email', 2);
      }

      $cliMessage = 'Crawl completed successfully';
      $jsonMessage = json_encode(array(
        'status' => 'Success',
        'message' => $cliMessage
      ), JSON_INVALID_UTF8_IGNORE);

    // We truncated the search table but FAILED to populate it!
    // This is a serious error that disables searching until the
    // crawler is run again!
    } else {
      OS_crawlLog('Could not populate the search table', 2);
      OS_crawlLog($err[2], 0);
      OS_crawlLog('***** Crawl failed; runtime '.$_ODATA['sp_time_last'].'s *****', 1);
      OS_crawlLog('Search table was cleared, but could not be repopulated!', 1);
      OS_crawlLog('The crawler MUST be run again to fix this issue!', 1);

      // Send failure email to the admin(s)
      if ($_MAIL && count($_MAIL->getAllRecipientAddresses()) && $_ODATA['sp_email_failure']) {
        $_MAIL->Subject = 'Orcinus Site Search Crawler: Catastrophic failure!';
        $_MAIL->Body = implode(" \r\n", preg_grep('/^[\[\*\w\d]/', explode("\n", file_get_contents($_ODATA['sp_log']))));
        if (!$_MAIL->Send()) OS_crawlLog('Could not send notification email', 2);
      }

      $cliMessage = 'Could not populate search table; search table is currently empty!';
      $jsonMessage = json_encode(array(
        'status' => 'Error',
        'message' => $cliMessage
      ), JSON_INVALID_UTF8_IGNORE);
    }

  // Else the crawl failed
  } else {
    OS_setValue('sp_time_last', $_ODATA['sp_time_end'] - $_ODATA['sp_time_start']);

    OS_crawlLog('***** Crawl failed; runtime '.$_ODATA['sp_time_last'].'s *****', 1);
    OS_crawlLog('Total data transferred: '.OS_readSize($_ODATA['sp_data_transferred']), 1);
    OS_crawlLog('Search table was NOT updated', 1);
    if ($_ODATA['sp_sitemap_file'])
      OS_crawlLog('Sitemap was NOT updated', 1);

    // Send failure email to the admin(s)
    if ($_MAIL && count($_MAIL->getAllRecipientAddresses()) && $_ODATA['sp_email_failure'] && !$_ODATA['sp_cancel']) {
      $_MAIL->Subject = 'Orcinus Site Search Crawler: Crawl failed';
      $_MAIL->Body = implode(" \r\n", preg_grep('/^[\[\*\w\d]/', explode("\n", file_get_contents($_ODATA['sp_log']))));
      if (!$_MAIL->Send()) OS_crawlLog('Could not send notification email', 2);
    }

    $cliMessage = 'Crawl failed; see the log for details';
    $jsonMessage = json_encode(array(
      'status' => 'Error',
      'message' => $cliMessage
    ), JSON_INVALID_UTF8_IGNORE);
  }

  // Delete the temp search table
  $drop = $_DDATA['pdo']->query(
    'DROP TABLE IF EXISTS `'.$_DDATA['tbprefix'].'crawltemp`;'
  );
  $err = $queryError($drop);
  if ($err[0] != '00000') {
    OS_crawlLog('Could not delete the temporary search table', 1);
    OS_crawlLog($err[2], 0);
  }

  // Store the log file to the config database
  OS_setValue('sp_log', file_get_contents($_ODATA['sp_log']));
  fclose($_RDATA['sp_log']);

  // Unset the crawling flag
  OS_setValue('sp_crawling', 0);

  if ($_SERVER['REQUEST_METHOD'] != 'CLI') {
    if (!$_RDATA['debug'])
      header('Content-type: application/json; charset='.strtolower($_ODATA['s_charset']));
    die($jsonMessage);

  } else die($cliMessage."\n");
}


// ***** Accept incoming commands by REQUEST_METHOD
switch ($_SERVER['REQUEST_METHOD']) {

  /* ***** Handle POST Requests ************************************ */
  case 'POST':

    // JSON POST request
    // These are usually sent by javascript fetch()
    if (strpos(trim($_SERVER['CONTENT_TYPE'] ?? ''), 'application/json') === 0) {
      $postBody = file_get_contents('php://input');
      $_POST = json_decode($postBody, false);

      // A malformed body decodes to null, and valid JSON may decode to a
      // scalar or array; use an empty object so the property reads and
      // writes below are always safe
      if (!is_object($_POST)) $_POST = new stdClass();

      $response = array();

      if (empty($_POST->action)) $_POST->action = '';
      switch ($_POST->action) {
        case 'crawl':

          // The submitted key must be a plain string or integer and is
          // compared as a string in constant time. A loose comparison
          // would accept a JSON `true` as matching any stored key.
          if (!empty($_POST->sp_key) &&
              (is_string($_POST->sp_key) || is_int($_POST->sp_key)) &&
              OS_getValue('sp_key') &&
              hash_equals((string)$_ODATA['sp_key'], (string)$_POST->sp_key)) {

            if (OS_getValue('sp_crawling')) {
              $response = array(
                'status' => 'Error',
                'message' => 'Crawler is already running; current progress: '.$_ODATA['sp_progress'][0].'/'.$_ODATA['sp_progress'][1]
              );

            // Go crawl!
            } else OS_setValue('sp_crawling', getmypid());

          } else {
            $response = array(
              'status' => 'Error',
              'message' => 'Incorrect key to initiate crawler'
            );
          }

          // The key is cleared after every attempt, successful or not
          OS_setValue('sp_key', '');
          break;

        case 'progress':
          $lines = array();

          if (!empty($_POST->log)) {

            // While crawling, 'sp_log' holds the path of the log file;
            // afterwards it holds the log text itself
            if (OS_getValue('sp_crawling')) {
              if (strpos($_ODATA['sp_log'], "\n") === false && file_exists($_ODATA['sp_log']))
                $lines = file($_ODATA['sp_log'], FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
            } else $lines = explode("\n", $_ODATA['sp_log']);

            if (empty($_POST->grep)) $_POST->grep = '';
            switch ($_POST->grep) {
              case 'all':
                break;
              case 'errors':
                $lines = preg_grep('/^[\[\*]/', $lines);
                break;
              default:
                $lines = preg_grep('/^[\[\*\w\d]/', $lines);
            }
          }

          // If crawl is in progress, return just the last 15 lines
          if ($_ODATA['sp_crawling'])
            $lines = array_slice($lines, -15);

          $response = array(
            'status' => ($_ODATA['sp_crawling']) ? 'Crawling' : 'Complete',
            'progress' => $_ODATA['sp_progress'],
            'data_transferred' => $_ODATA['sp_data_transferred'],
            'time_crawl' => time() - $_ODATA['sp_time_start'],
            'time_start' => $_ODATA['sp_time_start'],
            'time_end' => $_ODATA['sp_time_end'],
            'timeout_crawl' => $_ODATA['sp_timeout_crawl'],
            'tail' => trim(implode("\n", $lines))
          );
          break;

        case 'cancel':
          if (OS_getValue('sp_crawling')) {

            // IF the crawler 'time_start' is more than 'timeout_crawl'
            // seconds ago, or the 'force' token is set, the crawler is
            // probably stuck. Unstick it.
            if (empty($_POST->force)) $_POST->force = '';
            if ($_POST->force || time() - $_ODATA['sp_time_start'] > $_ODATA['sp_timeout_crawl']) {
              OS_setValue('sp_crawling', 0);

              // The reason is concatenated into the log below, so it has
              // to be a string; anything else gets the default text
              if (empty($_POST->reason) || !is_string($_POST->reason))
                $_POST->reason = 'The crawler halted unexpectedly';

              if (strpos($_ODATA['sp_log'], "\n") === false && file_exists($_ODATA['sp_log'])) {
                $log = file_get_contents($_ODATA['sp_log']);
                OS_setValue('sp_log', $log."\n".'[ERROR] '.$_POST->reason);
              } else OS_setValue('sp_log', '[ERROR] '.$_POST->reason);
              OS_setValue('sp_time_last', $_ODATA['sp_time_end'] - $_ODATA['sp_time_start']);

              // Send failure email to the admin(s)
              if ($_MAIL && count($_MAIL->getAllRecipientAddresses()) && $_ODATA['sp_email_failure']) {
                $_MAIL->Subject = 'Orcinus Site Search Crawler: Crawler halted unexpectedly';
                $_MAIL->Body = implode(" \r\n", preg_grep('/^[\[\*\w\d]/', explode("\n", $_ODATA['sp_log'])));
                if (!$_MAIL->Send())
                  OS_setValue('sp_log', $_ODATA['sp_log']."\n".'[ERROR] Could not send notification email');
              }
            }

            OS_setValue('sp_cancel', 1);

            $response = array(
              'status' => 'Success',
              'message' => 'Cancel flag was set',
              'crawl_time' => time() - $_ODATA['sp_time_start']
            );

          } else {
            $response = array(
              'status' => 'Error',
              'message' => 'Crawler is not currently running'
            );
          }
          break;

        default:
          $response = array(
            'status' => 'Error',
            'message' => 'Unrecognized command'
          );

      }

      // If we have a response to give, display it and exit
      if ($response) {
        header('Content-type: application/json; charset='.strtolower($_ODATA['s_charset']));
        die(json_encode($response, JSON_INVALID_UTF8_IGNORE));
      }

    // Don't do anything for normal POST request
    // These are usually sent by HTML form elements
    } else {
      header('Content-type: text/plain; charset='.strtolower($_ODATA['s_charset']));
      die($_ODATA['sp_useragent']);
    }
    break;

  // Allow CLI requests through
  case '':
    if (!empty($_SERVER['argv'][0]) && $_SERVER['argv'][0] == $_SERVER['PHP_SELF']) {
      $_SERVER['REQUEST_METHOD'] = 'CLI';

      if (!OS_getValue('sp_crawling')) {

        // Set the logging level, if specified
        if (!empty($_SERVER['argv'][1]) && preg_match('/^-log=([012])$/', $_SERVER['argv'][1], $match)) {
          $_RDATA['sp_log_clilevel'] = (int)$match[1];
        } else $_RDATA['sp_log_clilevel'] = 2;

        // Start a crawl
        OS_setValue('sp_crawling', getmypid());

      } else die('Crawler is already running; exiting...');

    } else die($_ODATA['sp_useragent']);
    break;

  // Don't do anything for GET requests, unless in debug mode
  case 'GET':
    header('Content-type: text/plain; charset='.strtolower($_ODATA['s_charset']));
    if ($_RDATA['debug']) {

      // If we are in debug mode, but the crawler is already running, exit
      if (OS_getValue('sp_crawling'))
        die('Crawler is already running; exiting...');

      // Start a crawl
      OS_setValue('sp_crawling', getmypid());

    } else die($_ODATA['sp_useragent']);
    break;

  // Exit for all other request types
  default:
    header('Content-type: text/plain; charset='.strtolower($_ODATA['s_charset']));
    die($_ODATA['sp_useragent']);

}


// One last check for a race condition
// Every start path above stored this process's ID in 'sp_crawling'; if a
// different value is there a second later, another process won the race
sleep(1);
if (OS_getValue('sp_crawling') != getmypid())
  die('Crawler is already running; exiting...');


/* ***** Begin Crawl Execution ************************************* */
register_shutdown_function('OS_crawlCleanUp');
ignore_user_abort(true);

// Allow 10% more than the crawl timeout; the explicit cast avoids the
// implicit float-to-int conversion notice on newer PHP versions
@set_time_limit((int)($_ODATA['sp_timeout_crawl'] * 1.1));
libxml_use_internal_errors(true);
if (function_exists('apache_setenv'))
  apache_setenv('no-gzip', '1');

OS_setValue('sp_cancel', 0);
OS_setValue('sp_time_start', time());
OS_setValue('sp_progress', array(0, 1, false));
OS_setValue('sp_pages_stored', 0);
OS_setValue('sp_data_transferred', 0);
OS_setValue('sp_data_stored', 0);
OS_setValue('sp_time_last', 0);
// The crawl log is written to a temporary file; its path is stored in
// the 'sp_log' config value while the crawl runs
$_RDATA['sp_log'] = tmpfile();
OS_setValue('sp_log', stream_get_meta_data($_RDATA['sp_log'])['uri']);
OS_crawlLog('***** Crawl started: '.date('r').' *****', 1);


// ***** Prepare runtime data
// Newline-separated settings become trimmed lists with blanks removed;
// space-separated settings are split on spaces
$_RDATA['sp_starting'] = array_filter(array_map('trim', explode("\n", $_ODATA['sp_starting'])));
$_RDATA['sp_hostnames'] = array();
$_RDATA['sp_ignore_url'] = array_filter(array_map('trim', explode("\n", $_ODATA['sp_ignore_url'])));
$_RDATA['sp_ignore_css'] = array_filter(explode(' ', $_ODATA['sp_ignore_css']));
$_RDATA['sp_title_strip'] = array_filter(array_map('trim', explode("\n", $_ODATA['sp_title_strip'])));
$_RDATA['s_weight_css'] = array_filter(explode(' ', $_ODATA['s_weight_css']));
$_RDATA['sp_require_url'] = array_filter(array_map('trim', explode("\n", $_ODATA['sp_require_url'])));
$_RDATA['sp_ignore_ext_regexp'] = implode('|', array_map('preg_quote', array_filter(explode(' ', $_ODATA['sp_ignore_ext']))));
$_RDATA['sp_robots_header'] = 0;

// Read by the cURL progress callback below on every call, so it must
// exist before the first request is made.
// NOTE(review): the start of OS_fetchURL is not visible here and may
// also reset this flag per request - confirm.
$_RDATA['sp_self_reference'] = 0;
$_RDATA['sp_complete'] = false;
$_RDATA['sp_links'] = array();
$_RDATA['sp_store'] = array();
$_RDATA['sp_domains'] = array();
$_RDATA['sp_sitemap'] = array();
$_RDATA['sp_robots'] = array();
$_RDATA['sp_status'] = array('Orphan' => 0, 'Blocked' => 0, 'Not Found' => 0, 'Updated' => 0, 'New' => 0);
$_RDATA['sp_filter'] = array();
$_RDATA['sp_prev_dls'] = 0;
$_RDATA['sp_time_curl'] = 0;
$_RDATA['sp_sleep'] = 0;
$_RDATA['sp_sha1'] = array();
$_RDATA['sp_resumed'] = false;

// Replacement table applied by OS_cleanTextUTF8(): Unicode space and
// line-break characters become a plain space or newline, and some
// zero-width characters are removed
$_RDATA['sp_whitespace'] = array(
  "\u{0009}" => ' ',
  "\u{000A}" => "\n",
  "\u{000B}" => "\n",
  "\u{000C}" => "\n",
  "\u{000D}" => "\n",
  "\u{0085}" => "\n",
  "\u{00A0}" => ' ',
  "\u{1680}" => ' ',
  "\u{2000}" => ' ',
  "\u{2001}" => ' ',
  "\u{2002}" => ' ',
  "\u{2003}" => ' ',
  "\u{2004}" => ' ',
  "\u{2005}" => ' ',
  "\u{2006}" => ' ',
  "\u{2007}" => ' ',
  "\u{2008}" => ' ',
  "\u{2009}" => ' ',
  "\u{200A}" => ' ',
  "\u{200B}" => ' ',
  "\u{200C}" => ' ',
  "\u{200D}" => '',
  "\u{2028}" => "\n",
  "\u{2029}" => "\n",
  "\u{202F}" => ' ',
  "\u{205F}" => ' ',
  "\u{2060}" => '',
  "\u{3000}" => ' ',
  "\u{FEFF}" => ' ',
  "\u{FFFD}" => ''
);


// ***** Load PDF parser
if (!class_exists('\Smalot\PdfParser\Parser'))
  if (file_exists(__DIR__.'/pdfparser/alt_autoload.php-dist'))
    include __DIR__.'/pdfparser/alt_autoload.php-dist';
if (class_exists('\Smalot\PdfParser\Parser')) {
  $config = new \Smalot\PdfParser\Config();
  $config->setRetainImageContent(false);
  $config->setDecodeMemoryLimit(16777216);
  $_PDF = new \Smalot\PdfParser\Parser([], $config);
} else {
  OS_crawlLog('Could not include \'PDFParser\'; PDFs will not be indexed', 1);
  $_PDF = false;
}


// ***** Check for PHPMailer
if (!$_MAIL) {
  OS_crawlLog('Could not include \'PHPMailer\'; Crawler cannot send mail', 1);
} else if (!count($_MAIL->getAllRecipientAddresses()))
  OS_crawlLog('No admin emails specified; Crawler will not send mail', 1);


// ***** Initialize the cURL connection
$_cURL = OS_getConnection();
if ($_cURL) {

  // Customize this cURL connection
  if ($_ODATA['sp_cookies'])
    curl_setopt($_cURL, CURLOPT_COOKIEFILE, '');

  // Called once per response header line; must return the number of
  // bytes handled
  curl_setopt($_cURL, CURLOPT_HEADERFUNCTION, function($_cURL, $line) {
    global $_RDATA;

    // X-Robots-Tag carries a comma-separated list of directives in any
    // order, so look for noindex/none after any number of other plain
    // directives. A value starting with a "user-agent:" prefix is still
    // not matched, as before.
    if (preg_match('/^X-Robots-Tag:\s*(?:[a-z-]+\s*,\s*)*(noindex|none)\b/i', $line))
      $_RDATA['sp_robots_header'] = 1;

    if (trim($line) == $_RDATA['x_generated_by'])
      $_RDATA['sp_self_reference'] = 1;

    return strlen($line);
  });

  // Returning a non-zero value from the progress callback aborts the
  // transfer; OS_fetchURL then sees cURL error 42 (aborted by callback)
  curl_setopt($_cURL, CURLOPT_NOPROGRESS, false);
  curl_setopt($_cURL, CURLOPT_PROGRESSFUNCTION,
    function($_cURL, $dls, $dl, $uls, $ul) {
      global $_ODATA, $_RDATA;

      if ($_RDATA['sp_robots_header']) return 1;
      if ($_RDATA['sp_self_reference']) return 1;

      // Prevent comparing this value until a Content-length header has
      // been received by the cURL connection
      if ($dls != $_RDATA['sp_prev_dls']) {
        $_RDATA['sp_prev_dls'] = $dls;
        if ($dls > $_ODATA['sp_limit_filesize'] * 1024) return 1;
      }
      if ($dl > $_ODATA['sp_limit_filesize'] * 1024) return 1;

      $i = curl_getinfo($_cURL);
      if ($i['redirect_url']) return 1;
      if ($i['http_code'] && $i['http_code'] >= 400) return 1;

      return $_RDATA['sp_robots_header'];
    }
  );

} else OS_crawlLog('cURL functions are not enabled; cannot perform crawl', 2);


// ***** Pre-fill queue with starting URL(s) at depth 0, blank referer
$_RDATA['sp_queue'] = array();
foreach ($_RDATA['sp_starting'] as $starting) {
  $starting = OS_formatURL($starting, $_ODATA['admin_install_domain'].'/');
  $_RDATA['sp_queue'][] = array($starting, 0, '');

  // Add starting URLs to required URLs so the crawler cannot travel
  // into parent directories
  $_RDATA['sp_require_url'][] = preg_replace('/\/[^\/]*$/', '/', $starting);

  $host = parse_url($starting)['host'];
  if (!in_array($host, $_RDATA['sp_hostnames'], true))
    $_RDATA['sp_hostnames'][] = $host;
}


// ***** List of previously crawled links from the database
// sp_exist maps content checksum => URL; sp_lastmod maps URL => last
// modified value
$_RDATA['sp_exist'] = array();
$_RDATA['sp_lastmod'] = array();
$crawldata = $_DDATA['pdo']->query(
  'SELECT `url`, `content_checksum`, `last_modified` FROM `'.$_DDATA['tbprefix'].'crawldata`'
);
$err = $crawldata->errorInfo();
if ($err[0] == '00000') {
  foreach ($crawldata as $value) {
    $_RDATA['sp_exist'][$value['content_checksum']] = $value['url'];
    $_RDATA['sp_lastmod'][$value['url']] = $value['last_modified'];
  }
} else OS_crawlLog('Error getting list of previous URLs from crawldata table', 2);


// If the crawltemp table exists here, that means a crawl was
// interrupted without completing the shutdown function.
// Use the data from this partially completed crawl to resume it.
if (in_array($_DDATA['tbprefix'].'crawltemp', $_DDATA['tables'], true)) {
  $select = $_DDATA['pdo']->query(
    'SELECT `url`, `links`, `content_checksum` FROM `'.$_DDATA['tbprefix'].'crawltemp`;'
  );
  $err = $select->errorInfo();
  if ($err[0] == '00000') {
    OS_crawlLog('Previous crawl data exists; using it to resume crawling...', 1);

    $select = $select->fetchAll();
    OS_crawlLog('Found '.count($select).' previously crawled URLs', 1);
    $_RDATA['sp_resumed'] = true;

    // Run through every entry in the crawltemp table
    foreach ($select as $row) {

      // If an entry matches an existing URL in the queue (just
      // starting URLs right now) then delete that queue entry
      foreach ($_RDATA['sp_queue'] as $key => $queue)
        if ($row['url'] == $queue[0])
          unset($_RDATA['sp_queue'][$key]);

      // Add it to the 'stored' and 'crawled links' lists
      $_RDATA['sp_store'][] = $row['url'];
      $_RDATA['sp_links'][] = $row['url'];

      // Add the content hash to the tally
      $_RDATA['sp_sha1'][$row['content_checksum']] = $row['url'];

      // Rebuild the domains list
      $prurl = parse_url($row['url']);
      $domain = $prurl['scheme'].'://'.$prurl['host'];
      if (!isset($_RDATA['sp_domains'][$domain])) {
        $_RDATA['sp_domains'][$domain] = 1;
      } else $_RDATA['sp_domains'][$domain]++;

      // Add links from the entry to the queue
      // An empty or corrupt `links` column does not decode to an array;
      // treat that as "no links" instead of iterating a non-array
      $row['links'] = json_decode($row['links'], true);
      if (!is_array($row['links'])) $row['links'] = array();
      foreach ($row['links'] as $link) {
        $link = OS_formatURL($link, $row['url']);

        // ***** If this link hasn't been crawled yet
        if (!in_array($link, $_RDATA['sp_links'], true)) {

          // ... and if link hasn't been queued yet
          foreach ($_RDATA['sp_queue'] as $queue)
            if ($link == $queue[0]) continue 2;

          // ... and if link passes our user filters, add the link to
          // the queue
          if (!OS_filterURL($link, $row['url']))
            $_RDATA['sp_queue'][] = array($link, 0, $row['url']);
        }
      }
    }

  // We couldn't select any data from the crawltemp table so delete it
  } else {
    $drop = $_DDATA['pdo']->query(
      'DROP TABLE IF EXISTS `'.$_DDATA['tbprefix'].'crawltemp`;'
    );
    $err = $drop->errorInfo();
    if ($err[0] != '00000') {

      // If we couldn't delete the interrupted crawldata table, this is
      // a fatal error
      OS_crawlLog('Could not delete previously interrupted crawl data; unable to crawl.', 2);
      throw new Exception('Could not delete previously interrupted crawl data; unable to crawl.');
    }
  }
}


// Create a temp MySQL storage table using schema of the existing table
$create = $_DDATA['pdo']->query(
  'CREATE TABLE IF NOT EXISTS`'.$_DDATA['tbprefix'].'crawltemp` LIKE `'.$_DDATA['tbprefix'].'crawldata`;'
);
$err = $create->errorInfo();
if ($err[0] != '00000') {

  // If we could not create the crawldata table, or an interrupted
  // crawldata table doesn't exist, then this is a fatal error
  OS_crawlLog('Unable to create or reuse existing crawl data table; unable to crawl.', 2);
  throw new Exception('Unable to create or reuse existing crawl data table; unable to crawl.');
}


// Prepare SQL statements
$selectData = $_DDATA['pdo']->prepare(
  'SELECT `url`, `category`, `links`, `content_checksum`, `last_modified`, `flag_updated`, `flag_unlisted`, `priority` FROM `'.$_DDATA['tbprefix'].'crawldata` WHERE `url`=:url;'
);
$updateURL = $_DDATA['pdo']->prepare(
  'UPDATE `'.$_DDATA['tbprefix'].'crawltemp` SET `url`=:url WHERE `content_checksum`=:content_checksum;'
);
$insertTemp = $_DDATA['pdo']->prepare(
  'INSERT INTO `'.$_DDATA['tbprefix'].'crawltemp` SET `url`=:url, `url_sort`=0, `title`=:title, `description`=:description, `keywords`=:keywords, `category`=:category, `weighted`=:weighted, `links`=:links, `content`=:content, `content_mime`=:content_mime, `content_charset`=:content_charset, `content_checksum`=:content_checksum, `status`=:status, `flag_unlisted`=:flag_unlisted, `flag_updated`=:flag_updated, `last_modified`=:last_modified, `priority`=:priority ;'
);

// Copies an unchanged page's row from crawldata into crawltemp with a
// new status and `flag_updated` reset to 0
$insertNotModified = $_DDATA['pdo']->prepare(
  'REPLACE INTO `'.$_DDATA['tbprefix'].'crawltemp` ( `url`, `url_sort`, `title`, `description`, `keywords`, `category`, `weighted`, `links`, `content`, `content_mime`, `content_charset`, `content_checksum`, `status`, `flag_unlisted`, `flag_updated`, `last_modified`, `priority` ) SELECT `url`, 0, `title`, `description`, `keywords`, `category`, `weighted`, `links`, `content`, `content_mime`, `content_charset`, `content_checksum`, :status, `flag_unlisted`, 0, `last_modified`, `priority` FROM `'.$_DDATA['tbprefix'].'crawldata` WHERE `url`=:url;'
);


// ***** Begin crawling URLs from the queue
while ($_cURL && count($_RDATA['sp_queue'])) {

  // Check if we have run out of execution time
  if ($_ODATA['sp_time_start'] + $_ODATA['sp_timeout_crawl'] <= time()) {
    OS_crawlLog('Maximum script runtime ('.$_ODATA['sp_timeout_crawl'].'s) reached', 2);
    break;
  }

  // Check if user has canceled the crawl
  if (OS_getValue('sp_cancel')) {
    OS_crawlLog('Crawl canceled manually by user', 2);
    break;
  }

  // Check if we have exceeded the maximum number of crawled links
  if (count($_RDATA['sp_links']) > $_ODATA['sp_limit_crawl']) {
    OS_crawlLog('Maximum number of crawled pages exceeded', 2);
    break;
  }

  // Retrieve next link to crawl from the queue
  list($url, $depth, $referer) = array_shift($_RDATA['sp_queue']);
  $_RDATA['sp_links'][] = $url;

  // Check if URL is beyond the depth limit
  if ($depth > $_ODATA['sp_limit_depth']) {
    OS_crawlLog('Maximum link depth ('.$_ODATA['sp_limit_depth'].') exceeded; URL at depth '.$depth.' was not stored: '.$url, 2);
    continue;
  }

  // Check robots.txt for newly encountered hostnames
  $purl = parse_url($url);
  $port = (!empty($purl['port'])) ? 
':'.$purl['port'] : ''; if (!isset($_RDATA['sp_robots'][$purl['host']])) { $_RDATA['sp_robots'][$purl['host']] = array(); OS_crawlLog('Fetching robots.txt for domain: '.$purl['host'], 1); curl_setopt($_cURL, CURLOPT_TIMECONDITION, CURL_TIMECOND_NONE); $robotstxt = OS_fetchURL($purl['scheme'].'://'.$purl['host'].$port.'/robots.txt', ''); if (!$robotstxt['errno']) { $robots = array(); $robot = ''; $robolines = explode("\n", $robotstxt['content']); foreach ($robolines as $line) { if (preg_match('/^user-agent\s*:\s*(.*)\s*$/i', $line, $r)) { if (empty($robots[$robot = $r[1]])) $robots[$robot] = array('disallow' => array(), 'allow' => array()); } else if (preg_match('/((dis)?allow)\s*:\s*(.*)\s*$/i', $line, $r)) $robots[$robot][strtolower($r[1])][] = OS_formatURL($r[3], $url); } foreach ($robots as $agent => $rules) { if (preg_match('/^orc(a|inus)(-?php)?-?crawler$/i', $agent) || $agent == '*') { foreach ($rules['disallow'] as $disrule) if (!in_array($disrule, $_RDATA['sp_robots'][$purl['host']], true)) $_RDATA['sp_robots'][$purl['host']][] = $disrule; foreach ($rules['allow'] as $rule) { $key = array_search($rule, $_RDATA['sp_robots'][$purl['host']]); if ($key !== false) unset($_RDATA['sp_robots'][$purl['host']][$key]); } } } } } if ($_RDATA['debug']) OS_crawlLog('Memory used: '.OS_readSize(memory_get_usage(true)), 1); OS_crawlLog('Crawling: '.$url.' 
(Depth: '.$depth.')', 1); OS_setValue('sp_progress', array( count($_RDATA['sp_links']), count($_RDATA['sp_links']) + count($_RDATA['sp_queue']), $_RDATA['sp_resumed'] )); OS_setValue('sp_time_end', time()); // Set the correct If-Modified-Since request header if ($_ODATA['sp_ifmodifiedsince'] && isset($_RDATA['sp_lastmod'][$url])) { curl_setopt($_cURL, CURLOPT_TIMEVALUE, $_RDATA['sp_lastmod'][$url]); curl_setopt($_cURL, CURLOPT_TIMECONDITION, CURL_TIMECOND_IFMODSINCE); } else curl_setopt($_cURL, CURLOPT_TIMECONDITION, CURL_TIMECOND_NONE); // Fetch the URL $data = OS_fetchURL($url, $referer); // Record cURL timing and data info for this fetch OS_setValue('sp_data_transferred', $_ODATA['sp_data_transferred'] + $data['info']['size_download']); $_RDATA['sp_time_curl'] += $data['info']['total_time']; // If there were cURL errors while fetching this URL if ($data['errno']) { // Else if the page hasn't been modified since the last crawl } else if ($data['info']['http_code'] == 304) { $data['info']['noindex'] = 'not-modified-304'; // Else if we received any content at all } else if (trim($data['body'])) { // Get a 20-byte binary hash of the raw content $data['info']['sha1'] = sha1($data['body'], true); // If this content does not duplicate previously stored content if (empty($_RDATA['sp_sha1'][$data['info']['sha1']])) { // Add the content hash to the tally $_RDATA['sp_sha1'][$data['info']['sha1']] = $url; // If this is a new page, or an existing page but the content // hash has changed if (!isset($_RDATA['sp_exist'][$data['info']['sha1']]) || $_RDATA['sp_exist'][$data['info']['sha1']] != $url) { // Detect MIME-type using extension? 
if (empty($data['info']['content_type'])) $data['info']['content_type'] = 'text/plain'; // Parse MIME-type $data['info']['mime_type'] = ''; if (preg_match('/\w+\/[\w.+-]+/', $data['info']['content_type'], $m)) $data['info']['mime_type'] = $m[0]; // Parse Character Encoding $data['info']['charset'] = ''; if (preg_match('/charset=([\w\d.:-]+)/i', $data['info']['content_type'], $m)) $data['info']['charset'] = $m[1]; if (!$data['info']['charset']) $data['info']['charset'] = 'ISO-8859-1'; // GZ-Unzip the content if necessary while (strpos($data['body'], "\x1f\x8b") === 0) $data['body'] = gzinflate(substr($data['body'], 10)); // Title defaults to filename $data['title'] = basename($data['info']['url']); // Determine how to parse the content by MIME-type switch ($data['info']['mime_type']) { /* ***** PLAIN TEXT ************************************** */ case 'text/plain': $data['content'] = $data['body']; OS_cleanTextUTF8($data['content'], $data['info']['charset']); break; /* ***** XML DOCUMENT ************************************ */ case 'text/xml': case 'application/xml': $data['body'] = preg_replace('//', ' ', $data['body']); $document = new DOMDocument(); if ($document->loadXML($data['body'], LIBXML_PARSEHUGE | LIBXML_BIGLINES | LIBXML_COMPACT)) { // Remove