/**
 * Final prep to store content in UTF-8 format in the database
 *
 * Converts the referenced text to UTF-8, optionally decodes HTML
 * entities, applies the 'sp_punct' and 'sp_whitespace' translation maps
 * from $_RDATA, then trims the text and collapses every whitespace run
 * into a single space.
 *
 * @param string   $_       Text to clean; modified in place
 * @param string   $charset Character set $_ is currently encoded in. A
 *                          blank or unrecognised value falls back to
 *                          ISO-8859-1
 * @param int|bool $entity  html_entity_decode() flags, or false to skip
 *                          entity decoding
 * @return void
 */
function OS_cleanTextUTF8(&$_, $charset, $entity = false) {
  global $_RDATA;

  // Use the trimmed value for the conversion itself, not just for the
  // emptiness test; a padded name is otherwise rejected by mbstring
  $charset = trim((string)$charset);
  if ($charset === '') $charset = 'ISO-8859-1';

  // Charset names come from remote pages and may be bogus. mbstring
  // throws ValueError for an unknown encoding on PHP 8 and returns false
  // on older versions; either way fall back to the default charset
  // rather than aborting or blanking the text
  try {
    $converted = mb_convert_encoding($_, 'UTF-8', $charset);
  } catch (\ValueError $e) {
    $converted = false;
  }
  if ($converted === false)
    $converted = mb_convert_encoding($_, 'UTF-8', 'ISO-8859-1');
  $_ = $converted;

  if ($entity)
    $_ = html_entity_decode($_, $entity | ENT_SUBSTITUTE, 'UTF-8');

  // Apply the translation maps
  $_ = strtr($_, $_RDATA['sp_punct']);
  $_ = strtr($_, $_RDATA['sp_whitespace']);

  // Turn each whitespace character into a space, then squeeze runs of
  // spaces down to one
  $_ = preg_replace(array('/\s/', '/ {2,}/'), ' ', trim($_));
}
/**
 * Shutdown function to provide cleanup before exit
 *
 * Does nothing if the 'sp_crawling' flag is already cleared. Otherwise:
 *  - marks the crawl as incomplete if the last PHP error was E_ERROR
 *  - on a complete crawl, writes a natural sort order into the temp
 *    table, truncates the live search table and repopulates it from the
 *    temp table, then records and logs crawl statistics
 *  - emails the admin(s) according to the success / failure settings
 *  - drops the temp table, stores the log text under 'sp_log', clears the
 *    'sp_crawling' flag and exits with a JSON or CLI message
 *
 * A failed PDO::query() or PDO::prepare() may return false instead of a
 * statement, so error details are read from the PDO handle in that case
 * rather than dereferencing the result.
 *
 * This function never returns normally once past the initial guard; it
 * always ends in die().
 */
function OS_crawlCleanUp() {
  global $_DDATA, $_ODATA, $_RDATA, $_cURL, $_MAIL;

  // If the crawl has already been canceled, don't bother
  if (!OS_getValue('sp_crawling')) return;

  $pdo = $_DDATA['pdo'];
  $prefix = $_DDATA['tbprefix'];

  // Read error details from the statement when there is one, or from the
  // connection handle when the call failed and returned false
  $errorInfoOf = function ($result) use ($pdo) {
    return ($result === false) ? $pdo->errorInfo() : $result->errorInfo();
  };

  $error = error_get_last();
  if (!is_null($error) && $error['type'] == E_ERROR) {
    OS_crawlLog($error['message'], 2);
    OS_crawlLog('File: \''.$error['file'].'\' at line number: '.$error['line'], 0);
    $_RDATA['sp_complete'] = false;
  }

  curl_close($_cURL);

  // If crawl completed successfully
  if ($_RDATA['sp_complete']) {
    OS_crawlLog('Cleaning up database tables...', 1);

    // Add a natural sort order value to each entry
    natcasesort($_RDATA['sp_store']);
    $_RDATA['sp_store'] = array_values($_RDATA['sp_store']);
    $url_sort = $pdo->prepare(
      'UPDATE `'.$prefix.'crawltemp` SET `url_sort`=:url_sort WHERE `url`=:url;'
    );
    if ($url_sort === false) {
      // Sorting is cosmetic; log it and carry on, as for an execute error
      $err = $pdo->errorInfo();
      OS_crawlLog('Error sorting the search database', 1);
      OS_crawlLog($err[2], 0);
    } else {
      foreach ($_RDATA['sp_store'] as $key => $stored_url) {
        $url_sort->execute(array(
          'url_sort' => $key,
          'url' => $stored_url
        ));
        $err = $url_sort->errorInfo();
        if ($err[0] != '00000') {
          OS_crawlLog('Error sorting the search database', 1);
          OS_crawlLog($err[2], 0);
          break;
        }
      }
    }

    // Truncate the existing search database
    $truncate = $pdo->query('TRUNCATE `'.$prefix.'crawldata`;');
    $err = $errorInfoOf($truncate);
    if ($err[0] != '00000') {
      OS_crawlLog('Could not truncate the search database', 1);
      OS_crawlLog($err[2], 0);

      // Last chance to bail out before we make actual changes
      $_RDATA['sp_complete'] = false;
    }
  }

  // If crawl completed successfully AND we truncated the old table
  if ($_RDATA['sp_complete']) {
    OS_setValue('sp_time_end', time());
    OS_setValue('sp_time_last', $_ODATA['sp_time_end'] - $_ODATA['sp_time_start']);

    // Select all rows from the temp table into the existing search table
    $insert = $pdo->query(
      'INSERT INTO `'.$prefix.'crawldata` SELECT * FROM `'.$prefix.'crawltemp`;'
    );
    $err = $errorInfoOf($insert);
    if ($err[0] == '00000') {

      $tableinfo = $pdo->query('SHOW TABLE STATUS LIKE \''.$prefix.'crawldata\';');
      $err = $errorInfoOf($tableinfo);
      $rows = ($err[0] == '00000') ? $tableinfo->fetchAll() : array();
      if (isset($rows[0]['Data_length'])) {
        OS_setValue('sp_data_stored', $rows[0]['Data_length']);
      } else OS_crawlLog('Could not read crawl table status', 1);

      // Purge the search result cache
      if ($_ODATA['s_limit_cache']) {
        $purge = $pdo->query('UPDATE `'.$prefix.'query` SET `cache`=\'\';');
        $err = $errorInfoOf($purge);
        if ($err[0] != '00000')
          OS_crawlLog('Could not purge search result cache', 1);
      }

      // Optimize the query log table (best effort; result is not checked)
      $pdo->query('OPTIMIZE TABLE `'.$prefix.'query`;');

      OS_setValue('sp_pages_stored', count($_RDATA['sp_store']));
      OS_setValue('sp_domains', $_RDATA['sp_domains']);
      OS_setValue('sp_time_end_success', $_ODATA['sp_time_end']);

      // A crawl that finishes within the same second has a runtime of 0;
      // clamp the divisor so the average-speed line cannot divide by zero
      $elapsed = max(1, (int)$_ODATA['sp_time_last']);

      OS_crawlLog('***** Crawl completed in '.$_ODATA['sp_time_last'].'s *****', 1);
      OS_crawlLog('Total data transferred: '.OS_readSize($_ODATA['sp_data_transferred']), 1);
      OS_crawlLog('Average transfer speed: '.OS_readSize(round($_ODATA['sp_data_transferred'] / $elapsed)).'/s', 1);
      if ($_RDATA['sp_sleep'])
        OS_crawlLog('Time spent sleeping: '.(round($_RDATA['sp_sleep'] / 10) / 100).'s', 1);
      OS_crawlLog('Time taken by cURL: '.(round($_RDATA['sp_time_curl'] * 100) / 100).'s', 1);
      OS_crawlLog($_ODATA['sp_progress'][0].' page'.(($_ODATA['sp_progress'][0] == 1) ? '' : 's').' crawled', 1);
      OS_crawlLog($_ODATA['sp_pages_stored'].' page'.(($_ODATA['sp_pages_stored'] == 1) ? '' : 's').' stored', 1);
      if ($_RDATA['sp_status']['New'])
        OS_crawlLog($_RDATA['sp_status']['New'].' new '.(($_RDATA['sp_status']['New'] == 1) ? 'page' : 'pages').' found', 0);
      if ($_RDATA['sp_status']['Updated'])
        OS_crawlLog($_RDATA['sp_status']['Updated'].' '.(($_RDATA['sp_status']['Updated'] == 1) ? 'page' : 'pages').' updated', 0);
      if ($_RDATA['sp_status']['Blocked'])
        OS_crawlLog($_RDATA['sp_status']['Blocked'].' '.(($_RDATA['sp_status']['Blocked'] == 1) ? 'page' : 'pages').' blocked', 0);
      if ($_RDATA['sp_status']['Not Found'])
        OS_crawlLog($_RDATA['sp_status']['Not Found'].' '.(($_RDATA['sp_status']['Not Found'] == 1) ? 'page' : 'pages').' not found', 0);
      if ($_RDATA['sp_status']['Orphan'])
        OS_crawlLog($_RDATA['sp_status']['Orphan'].' orphaned '.(($_RDATA['sp_status']['Orphan'] == 1) ? 'page' : 'pages'), 0);
      if ($_ODATA['sp_autodelete'])
        OS_crawlLog('Orphaned pages were auto-deleted', 1);

      // Send success email to the admin(s)
      if ($_MAIL && count($_MAIL->getAllRecipientAddresses()) && $_ODATA['sp_email_success']) {
        $_MAIL->Subject = 'Orcinus Site Search Crawler: Crawl succeeded';
        $_MAIL->Body = implode(" \r\n", preg_grep('/^[\[\*]/', explode("\n", file_get_contents($_ODATA['sp_log']))));
        if (!$_MAIL->Send()) OS_crawlLog('Could not send notification email', 2);
      }

      $cliMessage = 'Crawl completed successfully';
      $jsonMessage = json_encode(array(
        'status' => 'Success',
        'message' => $cliMessage
      ), JSON_INVALID_UTF8_IGNORE);

    // We truncated the search table but FAILED to populate it!
    // This is a serious error that disables searching until the
    // crawler is run again!
    } else {
      OS_crawlLog('Could not populate the search table', 2);
      OS_crawlLog($err[2], 0);
      OS_crawlLog('***** Crawl failed; runtime '.$_ODATA['sp_time_last'].'s *****', 1);
      OS_crawlLog('Search table was cleared, but could not be repopulated!', 1);
      OS_crawlLog('The crawler MUST be run again to fix this issue!', 1);

      // Send failure email to the admin(s)
      if ($_MAIL && count($_MAIL->getAllRecipientAddresses()) && $_ODATA['sp_email_failure']) {
        $_MAIL->Subject = 'Orcinus Site Search Crawler: Catastrophic failure!';
        $_MAIL->Body = implode(" \r\n", preg_grep('/^[\[\*\w\d]/', explode("\n", file_get_contents($_ODATA['sp_log']))));
        if (!$_MAIL->Send()) OS_crawlLog('Could not send notification email', 2);
      }

      $cliMessage = 'Could not populate search table; search table is currently empty!';
      $jsonMessage = json_encode(array(
        'status' => 'Error',
        'message' => $cliMessage
      ), JSON_INVALID_UTF8_IGNORE);
    }

  // Else the crawl failed
  } else {
    // NOTE(review): 'sp_time_end' is not refreshed by this function on
    // the failure path, so this runtime depends on a value written
    // elsewhere -- confirm it is current when this runs
    OS_setValue('sp_time_last', $_ODATA['sp_time_end'] - $_ODATA['sp_time_start']);

    OS_crawlLog('***** Crawl failed; runtime '.$_ODATA['sp_time_last'].'s *****', 1);
    OS_crawlLog('Total data transferred: '.OS_readSize($_ODATA['sp_data_transferred']), 1);
    OS_crawlLog('Search table was NOT updated', 1);
    if ($_ODATA['sp_sitemap_file'])
      OS_crawlLog('Sitemap was NOT updated', 1);

    // Send failure email to the admin(s), unless the crawl was canceled
    if ($_MAIL && count($_MAIL->getAllRecipientAddresses()) && $_ODATA['sp_email_failure'] && !$_ODATA['sp_cancel']) {
      $_MAIL->Subject = 'Orcinus Site Search Crawler: Crawl failed';
      $_MAIL->Body = implode(" \r\n", preg_grep('/^[\[\*\w\d]/', explode("\n", file_get_contents($_ODATA['sp_log']))));
      if (!$_MAIL->Send()) OS_crawlLog('Could not send notification email', 2);
    }

    $cliMessage = 'Crawl failed; see the log for details';
    $jsonMessage = json_encode(array(
      'status' => 'Error',
      'message' => $cliMessage
    ), JSON_INVALID_UTF8_IGNORE);
  }

  // Delete the temp search table
  $drop = $pdo->query('DROP TABLE IF EXISTS `'.$prefix.'crawltemp`;');
  $err = $errorInfoOf($drop);
  if ($err[0] != '00000') {
    OS_crawlLog('Could not delete the temporary search table', 1);
    OS_crawlLog($err[2], 0);
  }

  // Store the log file to the config database
  OS_setValue('sp_log', file_get_contents($_ODATA['sp_log']));
  fclose($_RDATA['sp_log']);

  // Unset the crawling flag
  OS_setValue('sp_crawling', 0);

  if ($_SERVER['REQUEST_METHOD'] != 'CLI') {
    if (!$_RDATA['debug'])
      header('Content-type: application/json; charset='.strtolower($_ODATA['s_charset']));
    die($jsonMessage);

  } else die($cliMessage."\n");
}
'Error', 'message' => 'Crawler is already running; current progress: '.$_ODATA['sp_progress'][0].'/'.$_ODATA['sp_progress'][1] ); // Go crawl! } else OS_setValue('sp_crawling', getmypid()); } else { $response = array( 'status' => 'Error', 'message' => 'Incorrect key to initiate crawler' ); } OS_setValue('sp_key', ''); break; case 'progress': $lines = array(); if (!empty($_POST->log)) { if (OS_getValue('sp_crawling')) { if (strpos($_ODATA['sp_log'], "\n") === false && file_exists($_ODATA['sp_log'])) $lines = file($_ODATA['sp_log'], FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES); } else $lines = explode("\n", $_ODATA['sp_log']); if (empty($_POST->grep)) $_POST->grep = ''; switch ($_POST->grep) { case 'all': break; case 'errors': $lines = preg_grep('/^[\[\*]/', $lines); break; default: $lines = preg_grep('/^[\[\*\w\d]/', $lines); } } // If crawl is in progress, return just the last 15 lines if ($_ODATA['sp_crawling']) $lines = array_slice($lines, -15); $response = array( 'status' => ($_ODATA['sp_crawling']) ? 'Crawling' : 'Complete', 'progress' => $_ODATA['sp_progress'], 'data_transferred' => $_ODATA['sp_data_transferred'], 'time_crawl' => time() - $_ODATA['sp_time_start'], 'time_start' => $_ODATA['sp_time_start'], 'time_end' => $_ODATA['sp_time_end'], 'timeout_crawl' => $_ODATA['sp_timeout_crawl'], 'tail' => trim(implode("\n", $lines)) ); break; case 'cancel': if (OS_getValue('sp_crawling')) { // IF the crawler 'time_start' is more than 'timeout_crawl' // seconds ago, or the 'force' token is set, the crawler is // probably stuck. Unstick it. 
if (empty($_POST->force)) $_POST->force = ''; if ($_POST->force || time() - $_ODATA['sp_time_start'] > $_ODATA['sp_timeout_crawl']) { OS_setValue('sp_crawling', 0); if (empty($_POST->reason)) $_POST->reason = 'The crawler halted unexpectedly'; if (strpos($_ODATA['sp_log'], "\n") === false && file_exists($_ODATA['sp_log'])) { $log = file_get_contents($_ODATA['sp_log']); OS_setValue('sp_log', $log."\n".'[ERROR] '.$_POST->reason); } else OS_setValue('sp_log', '[ERROR] '.$_POST->reason); OS_setValue('sp_time_last', $_ODATA['sp_time_end'] - $_ODATA['sp_time_start']); // Send failure email to the admin(s) if ($_MAIL && count($_MAIL->getAllRecipientAddresses()) && $_ODATA['sp_email_failure']) { $_MAIL->Subject = 'Orcinus Site Search Crawler: Crawler halted unexpectedly'; $_MAIL->Body = implode(" \r\n", preg_grep('/^[\[\*\w\d]/', explode("\n", $_ODATA['sp_log']))); if (!$_MAIL->Send()) OS_setValue('sp_log', $_ODATA['sp_log']."\n".'[ERROR] Could not send notification email'); } } OS_setValue('sp_cancel', 1); $response = array( 'status' => 'Success', 'message' => 'Cancel flag was set', 'crawl_time' => time() - $_ODATA['sp_time_start'] ); } else { $response = array( 'status' => 'Error', 'message' => 'Crawler is not currently running' ); } break; default: $response = array( 'status' => 'Error', 'message' => 'Unrecognized command' ); } // If we have a response to give, display it and exit if ($response) { header('Content-type: application/json; charset='.strtolower($_ODATA['s_charset'])); die(json_encode($response, JSON_INVALID_UTF8_IGNORE)); } // Don't do anything for normal POST request // These are usually sent by