diff --git a/orcinus/admin.php b/orcinus/admin.php index 0f4bf17..7bafe92 100644 --- a/orcinus/admin.php +++ b/orcinus/admin.php @@ -976,22 +976,23 @@ function os_return_all() { } // {{{{{ Create the Mustache template -let os_TEMPLATE = {} -os_TEMPLATE.addError = function(text) { - if (!this.errors) { - this.errors = {}; - this.errors.error_list = []; +let os_TEMPLATE = { + version: '', + searchable: false, + addError: function(text) { + if (!this.errors) { + this.errors = {}; + this.errors.error_list = []; + } + this.errors.error_list.push(text); } - this.errors.error_list.push(text); }; -os_TEMPLATE.version = ''; -os_TEMPLATE.limit_term_length = ; - // Check if there are rows in the search database if (os_crawldata.length) { os_TEMPLATE.searchable = {}; os_TEMPLATE.searchable.form_action = window.location.pathname; + os_TEMPLATE.searchable.limit_term_length = ; os_request.c = os_params.get('c'); if (!os_request.c || !os_rdata.s_category_list[os_request.c]) diff --git a/orcinus/config.php b/orcinus/config.php index 67861ce..972c66c 100644 --- a/orcinus/config.php +++ b/orcinus/config.php @@ -336,6 +336,7 @@ $_ODATA = $_DDATA['pdo']->query( 'SELECT * FROM `'.$_DDATA['tbprefix'].'config`;' )->fetchAll()[0]; +ini_set('display_errors', 1); error_reporting(E_ALL); date_default_timezone_set($_ODATA['sp_timezone']); ini_set('mbstring.substitute_character', 'none'); @@ -573,8 +574,14 @@ if (!$_ODATA['s_result_template']) { // {{{{{ Initialize the Mustache templating engine class OS_Mustache { public $errors; + public $version; + public $searchable; - function __construct() {} + function __construct() { + global $_ODATA; + + $this->version = $_ODATA['version']; + } function addError($text) { if (!$this->errors) { diff --git a/orcinus/crawler.php b/orcinus/crawler.php index 82b9eec..a48f934 100644 --- a/orcinus/crawler.php +++ b/orcinus/crawler.php @@ -325,6 +325,11 @@ function OS_crawlCleanUp() { OS_crawlLog('Could not purge search result cache', 1); } + // Optimize the query log table + $optimize = $_DDATA['pdo']->query( + 'OPTIMIZE TABLE `'.$_DDATA['tbprefix'].'query`;' + ); + OS_setValue('sp_links_crawled', count($_RDATA['sp_links'])); OS_setValue('sp_pages_stored', count($_RDATA['sp_store'])); OS_setValue('sp_time_end_success', $_ODATA['sp_time_end']); @@ -721,10 +726,10 @@ if ($_cURL) { // ***** Pre-fill queue with starting URL(s) at depth 0, blank referer -$_QUEUE = array(); +$_RDATA['sp_queue'] = array(); foreach ($_RDATA['sp_starting'] as $starting) { $starting = OS_formatURL($starting, $_ODATA['admin_install_domain'].'/'); - $_QUEUE[] = array($starting, 0, ''); + $_RDATA['sp_queue'][] = array($starting, 0, ''); // Add starting URLs to required URLs so the crawler cannot travel // into parent directories @@ -736,14 +741,14 @@ foreach ($_RDATA['sp_starting'] as $starting) { } // ***** List of previously crawled links from the database -$_EXIST = array(); +$_RDATA['sp_exist'] = array(); $crawldata = $_DDATA['pdo']->query( 'SELECT `url`, `content_checksum` FROM `'.$_DDATA['tbprefix'].'crawldata`' ); $err = $crawldata->errorInfo(); if ($err[0] == '00000') { foreach ($crawldata as $value) - $_EXIST[$value['content_checksum']] = $value['url']; + $_RDATA['sp_exist'][$value['content_checksum']] = $value['url']; } else OS_crawlLog('Error getting list of previous URLs from crawldata table', 2); @@ -803,7 +808,7 @@ $updateNotModified = $_DDATA['pdo']->prepare( // ***** Begin crawling URLs from the queue -while ($_cURL && count($_QUEUE)) { +while ($_cURL && count($_RDATA['sp_queue'])) { // Check if we have run out of execution time if ($_ODATA['sp_time_start'] + $_ODATA['sp_timeout_crawl'] <= time()) { @@ -824,7 +829,7 @@ while ($_cURL && count($_QUEUE)) { } // Retrieve next link to crawl from the queue - list($url, $depth, $referer) = array_shift($_QUEUE); + list($url, $depth, $referer) = array_shift($_RDATA['sp_queue']); $_RDATA['sp_links'][] = $url; // Check if URL is beyond the depth limit @@ -872,10 +877,10 @@ while ($_cURL && count($_QUEUE)) { OS_crawlLog('Memory used: '.OS_readSize(memory_get_usage(true)), 1); OS_crawlLog('Crawling: '.$url.' (Depth: '.$depth.')', 1); - OS_setValue('sp_progress', count($_RDATA['sp_links']).'/'.(count($_RDATA['sp_links']) + count($_QUEUE))); + OS_setValue('sp_progress', count($_RDATA['sp_links']).'/'.(count($_RDATA['sp_links']) + count($_RDATA['sp_queue']))); // Set the correct If-Modified-Since request header - if ($_ODATA['sp_ifmodifiedsince'] && (!count($_EXIST) || in_array($url, $_EXIST))) { + if ($_ODATA['sp_ifmodifiedsince'] && (!count($_RDATA['sp_exist']) || in_array($url, $_RDATA['sp_exist']))) { curl_setopt($_cURL, CURLOPT_TIMECONDITION, CURL_TIMECOND_IFMODSINCE); } else curl_setopt($_cURL, CURLOPT_TIMECONDITION, CURL_TIMECOND_NONE); @@ -910,7 +915,8 @@ while ($_cURL && count($_QUEUE)) { // If this is a new page, or an existing page but the content // hash has changed - if (!isset($_EXIST[$data['info']['sha1']]) || $_EXIST[$data['info']['sha1']] != $url) { + if (!isset($_RDATA['sp_exist'][$data['info']['sha1']]) || + $_RDATA['sp_exist'][$data['info']['sha1']] != $url) { // Detect MIME-type using extension? if (empty($data['info']['content_type'])) @@ -1008,131 +1014,133 @@ while ($_cURL && count($_QUEUE)) { // ***** Process elements $head = $document->getElementsByTagName('head'); + if (!empty($head[0])) { - $base = $head[0]->getElementsByTagName('base'); - if (!empty($base[0])) - for ($x = 0; $x < count($base[0]->attributes); $x++) - if (strtolower($base[0]->attributes[$x]->name) == 'href') - if (!empty($base[0]->attributes[$x]->value)) - $data['base'] = filter_var($base[0]->attributes[$x]->value, FILTER_SANITIZE_URL); + $base = $head[0]->getElementsByTagName('base'); + if (!empty($base[0])) + for ($x = 0; $x < count($base[0]->attributes); $x++) + if (strtolower($base[0]->attributes[$x]->name) == 'href') + if (!empty($base[0]->attributes[$x]->value)) + $data['base'] = filter_var($base[0]->attributes[$x]->value, FILTER_SANITIZE_URL); - $metas = $head[0]->getElementsByTagName('meta'); - foreach ($metas as $meta) { - for ($x = 0; $x < count($meta->attributes); $x++) { - if (strtolower($meta->attributes[$x]->name) == 'charset') { - if (strtolower($data['info']['charset']) != strtolower($meta->attributes[$x]->value)) { - OS_crawlLog('Charset in Content-type header ('.(($data['info']['charset']) ? $data['info']['charset'] : '').') differs from document charset ('.(($meta->attributes[$x]->value) ? $meta->attributes[$x]->value : '').') at: '.$data['info']['url'], 1); - $data['info']['charset'] = $meta->attributes[$x]->value; - } + $metas = $head[0]->getElementsByTagName('meta'); + foreach ($metas as $meta) { + for ($x = 0; $x < count($meta->attributes); $x++) { + if (strtolower($meta->attributes[$x]->name) == 'charset') { + if (strtolower($data['info']['charset']) != strtolower($meta->attributes[$x]->value)) { + OS_crawlLog('Charset in Content-type header ('.(($data['info']['charset']) ? $data['info']['charset'] : '').') differs from document charset ('.(($meta->attributes[$x]->value) ? $meta->attributes[$x]->value : '').') at: '.$data['info']['url'], 1); + $data['info']['charset'] = $meta->attributes[$x]->value; + } - } else if (strtolower($meta->attributes[$x]->name) == 'http-equiv') { - switch (strtolower($meta->attributes[$x]->value)) { - case 'refresh': - for ($y = 0; $y < count($meta->attributes); $y++) { - if (strtolower($meta->attributes[$y]->name) == 'content') { - if (preg_match('/(\d+)\s?;\s?url\s?=\s?([\'"])(.+?)\2?\s?$/i', $meta->attributes[$y]->value, $m)) { - if ((int)$m[1] <= $_ODATA['sp_timeout_url']) { - $data['errno'] = 300; - $data['error'] = 'Redirected by element to: '.$m[3]; - $data['info']['redirect_url'] = $m[3]; - $data['info']['noindex'] = 'redirect-meta'; - $data['info']['nofollow'] = true; - break 4; - } else $data['links'][] = $m[3]; - } - } - } - break; - - case 'content-type': - for ($y = 0; $y < count($meta->attributes); $y++) { - if (strtolower($meta->attributes[$y]->name) == 'content' && preg_match('/charset=([\w\d.:-]+)/i', $meta->attributes[$y]->value, $m)) { - if (strtolower($data['info']['charset']) != strtolower($m[1])) { - OS_crawlLog('Charset in Content-type header ('.(($data['info']['charset']) ? $data['info']['charset'] : '').') differs from document charset ('.(($m[1]) ? $m[1] : '').') at: '.$data['info']['url'], 1); - $data['info']['charset'] = $m[1]; - } - } - } - - } - - } else if (strtolower($meta->attributes[$x]->name) == 'name') { - switch (strtolower($meta->attributes[$x]->value)) { - case 'keywords': - for ($y = 0; $y < count($meta->attributes); $y++) - if (strtolower($meta->attributes[$y]->name) == 'content') - $data['keywords'] = $meta->attributes[$y]->value; - break; - - case 'description': - for ($y = 0; $y < count($meta->attributes); $y++) - if (strtolower($meta->attributes[$y]->name) == 'content') - $data['description'] = $meta->attributes[$y]->value; - break; - - case 'robots': - case 'orcacrawler': - case 'orcaphpcrawler': - case 'orca-crawler': - case 'orcaphp-crawler': - case 'orca-phpcrawler': - case 'orca-php-crawler': - case 'orcinuscrawler': - case 'orcinus-crawler': - for ($y = 0; $y < count($meta->attributes); $y++) { - if (strtolower($meta->attributes[$y]->name) == 'content') { - $content = explode(',', $meta->attributes[$y]->value); - foreach ($content as $con) { - switch (trim(strtolower($con))) { - case 'nofollow': + } else if (strtolower($meta->attributes[$x]->name) == 'http-equiv') { + switch (strtolower($meta->attributes[$x]->value)) { + case 'refresh': + for ($y = 0; $y < count($meta->attributes); $y++) { + if (strtolower($meta->attributes[$y]->name) == 'content') { + if (preg_match('/(\d+)\s?;\s?url\s?=\s?([\'"])(.+?)\2?\s?$/i', $meta->attributes[$y]->value, $m)) { + if ((int)$m[1] <= $_ODATA['sp_timeout_url']) { + $data['errno'] = 300; + $data['error'] = 'Redirected by element to: '.$m[3]; + $data['info']['redirect_url'] = $m[3]; + $data['info']['noindex'] = 'redirect-meta'; $data['info']['nofollow'] = true; - break; - - case 'noindex': - $data['error'] = 'Not indexed due to robots element'; - $data['info']['noindex'] = 'robots-meta'; - + break 4; + } else $data['links'][] = $m[3]; } } } - } + break; + case 'content-type': + for ($y = 0; $y < count($meta->attributes); $y++) { + if (strtolower($meta->attributes[$y]->name) == 'content' && preg_match('/charset=([\w\d.:-]+)/i', $meta->attributes[$y]->value, $m)) { + if (strtolower($data['info']['charset']) != strtolower($m[1])) { + OS_crawlLog('Charset in Content-type header ('.(($data['info']['charset']) ? $data['info']['charset'] : '').') differs from document charset ('.(($m[1]) ? $m[1] : '').') at: '.$data['info']['url'], 1); + $data['info']['charset'] = $m[1]; + } + } + } + + } + + } else if (strtolower($meta->attributes[$x]->name) == 'name') { + switch (strtolower($meta->attributes[$x]->value)) { + case 'keywords': + for ($y = 0; $y < count($meta->attributes); $y++) + if (strtolower($meta->attributes[$y]->name) == 'content') + $data['keywords'] = $meta->attributes[$y]->value; + break; + + case 'description': + for ($y = 0; $y < count($meta->attributes); $y++) + if (strtolower($meta->attributes[$y]->name) == 'content') + $data['description'] = $meta->attributes[$y]->value; + break; + + case 'robots': + case 'orcacrawler': + case 'orcaphpcrawler': + case 'orca-crawler': + case 'orcaphp-crawler': + case 'orca-phpcrawler': + case 'orca-php-crawler': + case 'orcinuscrawler': + case 'orcinus-crawler': + for ($y = 0; $y < count($meta->attributes); $y++) { + if (strtolower($meta->attributes[$y]->name) == 'content') { + $content = explode(',', $meta->attributes[$y]->value); + foreach ($content as $con) { + switch (trim(strtolower($con))) { + case 'nofollow': + $data['info']['nofollow'] = true; + break; + + case 'noindex': + $data['error'] = 'Not indexed due to robots element'; + $data['info']['noindex'] = 'robots-meta'; + + } + } + } + } + + } } } } - } - $title = $head[0]->getElementsByTagName('title'); - $data['title'] = $title[0]->textContent; + $title = $head[0]->getElementsByTagName('title'); + $data['title'] = $title[0]->textContent; - $links = $head[0]->getElementsByTagName('link'); - foreach ($links as $link) { - for ($x = 0; $x < count($link->attributes); $x++) { - if (strtolower($link->attributes[$x]->name) == 'rel') { - for ($y = 0; $y < count($link->attributes); $y++) { - if (strtolower($link->attributes[$y]->name) == 'href') { - $linkurl = filter_var($link->attributes[$y]->value, FILTER_SANITIZE_URL); + $links = $head[0]->getElementsByTagName('link'); + foreach ($links as $link) { + for ($x = 0; $x < count($link->attributes); $x++) { + if (strtolower($link->attributes[$x]->name) == 'rel') { + for ($y = 0; $y < count($link->attributes); $y++) { + if (strtolower($link->attributes[$y]->name) == 'href') { + $linkurl = filter_var($link->attributes[$y]->value, FILTER_SANITIZE_URL); - switch (strtolower($link->attributes[$x]->value)) { - case 'canonical': - if (OS_formatURL($linkurl, $data['base']) != $data['info']['url']) { - $data['info']['noindex'] = 'non-canonical'; - $data['info']['canonical'] = $linkurl; - } + switch (strtolower($link->attributes[$x]->value)) { + case 'canonical': + if (OS_formatURL($linkurl, $data['base']) != $data['info']['url']) { + $data['info']['noindex'] = 'non-canonical'; + $data['info']['canonical'] = $linkurl; + } - case 'alternate': - case 'author': - case 'help': - case 'license': - case 'me': - case 'next': - case 'prev': - case 'search': - case 'alternate': - $data['links'][] = $linkurl; + case 'alternate': + case 'author': + case 'help': + case 'license': + case 'me': + case 'next': + case 'prev': + case 'search': + case 'alternate': + $data['links'][] = $linkurl; + } + break; } - break; } } } @@ -1142,54 +1150,58 @@ while ($_cURL && count($_QUEUE)) { // ***** Process elements $body = $document->getElementsByTagName('body'); + if (!empty($body[0])) { - // Replace tags with their alt text - $imgs = $body[0]->getElementsByTagName('img'); - foreach ($imgs as $img) { - for ($x = 0; $x < count($img->attributes); $x++) { - if (strtolower($img->attributes[$x]->name) == 'alt') { - $img->parentNode->replaceChild( - $document->createTextNode(' '.$img->attributes[$x]->value.' '), - $img - ); - break; + // Replace tags with their alt text + $imgs = $body[0]->getElementsByTagName('img'); + foreach ($imgs as $img) { + for ($x = 0; $x < count($img->attributes); $x++) { + if (strtolower($img->attributes[$x]->name) == 'alt') { + $img->parentNode->replaceChild( + $document->createTextNode(' '.$img->attributes[$x]->value.' '), + $img + ); + break; + } } } - } - $as = $body[0]->getElementsByTagName('a'); - foreach ($as as $a) { - for ($x = 0; $x < count($a->attributes); $x++) { - if (strtolower($a->attributes[$x]->name) == 'href') { - for ($y = 0; $y < count($a->attributes); $y++) - if (strtolower($a->attributes[$y]->name) == 'rel' && strtolower($a->attributes[$y]->value) == 'nofollow') continue 3; - $data['links'][] = $a->attributes[$x]->value; + $as = $body[0]->getElementsByTagName('a'); + foreach ($as as $a) { + for ($x = 0; $x < count($a->attributes); $x++) { + if (strtolower($a->attributes[$x]->name) == 'href') { + for ($y = 0; $y < count($a->attributes); $y++) + if (strtolower($a->attributes[$y]->name) == 'rel' && strtolower($a->attributes[$y]->value) == 'nofollow') continue 3; + $data['links'][] = $a->attributes[$x]->value; + } } } - } - $areas = $body[0]->getElementsByTagName('area'); - foreach ($areas as $area) { - for ($x = 0; $x < count($area->attributes); $x++) { - if (strtolower($area->attributes[$x]->name) == 'href') { - for ($y = 0; $y < count($area->attributes); $y++) - if (strtolower($area->attributes[$y]->name) == 'rel' && strtolower($area->attributes[$y]->value) == 'nofollow') continue 3; - $data['links'][] = $area->attributes[$x]->value; + $areas = $body[0]->getElementsByTagName('area'); + foreach ($areas as $area) { + for ($x = 0; $x < count($area->attributes); $x++) { + if (strtolower($area->attributes[$x]->name) == 'href') { + for ($y = 0; $y < count($area->attributes); $y++) + if (strtolower($area->attributes[$y]->name) == 'rel' && strtolower($area->attributes[$y]->value) == 'nofollow') continue 3; + $data['links'][] = $area->attributes[$x]->value; + } } } + + $frames = $body[0]->getElementsByTagName('frame'); + foreach ($frames as $frame) + for ($x = 0; $x < count($frame->attributes); $x++) + if (strtolower($frame->attributes[$x]->name) == 'src') + $data['links'][] = $frame->attributes[$x]->value; + + $iframes = $body[0]->getElementsByTagName('iframe'); + foreach ($iframes as $iframe) + for ($x = 0; $x < count($iframe->attributes); $x++) + if (strtolower($iframe->attributes[$x]->name) == 'src') + $data['links'][] = $iframe->attributes[$x]->value; + } - $frames = $body[0]->getElementsByTagName('frame'); - foreach ($frames as $frame) - for ($x = 0; $x < count($frame->attributes); $x++) - if (strtolower($frame->attributes[$x]->name) == 'src') - $data['links'][] = $frame->attributes[$x]->value; - - $iframes = $body[0]->getElementsByTagName('iframe'); - foreach ($iframes as $iframe) - for ($x = 0; $x < count($iframe->attributes); $x++) - if (strtolower($iframe->attributes[$x]->name) == 'src') - $data['links'][] = $iframe->attributes[$x]->value; $data['links'] = array_map(function($l) { if (preg_match('/^(tel|telnet|mailto|ftp|sftp|ssh|gopher|news|ldap|urn|onion|magnet):/i', $l)) return ''; @@ -1377,7 +1389,7 @@ while ($_cURL && count($_QUEUE)) { if (!$data['info']['noindex']) { // If this URL exists (or existed) in the live table... - if (in_array($url, $_EXIST) || $referer == '') { + if (in_array($url, $_RDATA['sp_exist']) || $referer == '') { $_RDATA['sp_status']['Updated']++; $selectData->execute(array('url' => $url)); @@ -1567,7 +1579,7 @@ while ($_cURL && count($_QUEUE)) { if (!in_array($link, $_RDATA['sp_links'])) { // ... and if link hasn't been queued yet - foreach ($_QUEUE as $queue) + foreach ($_RDATA['sp_queue'] as $queue) if ($link == $queue[0]) continue 2; // ... and if link passes our user filters @@ -1577,22 +1589,22 @@ while ($_cURL && count($_QUEUE)) { } // ... then add the link to the queue - $_QUEUE[] = array($link, $depth + 1, $url); + $_RDATA['sp_queue'][] = array($link, $depth + 1, $url); } } } // If we've completed the queue, check for orphans - if (!count($_QUEUE)) { + if (!count($_RDATA['sp_queue'])) { // Diff the previous URL list with the links we've already scanned - $_EXIST = array_diff($_EXIST, $_RDATA['sp_links']); + $_RDATA['sp_exist'] = array_diff($_RDATA['sp_exist'], $_RDATA['sp_links']); // If we have leftover links, and we aren't autodeleting them - if (count($_EXIST) && !$_ODATA['sp_autodelete']) { - OS_crawlLog('Adding '.count($_EXIST).' orphan(s) to queue...', 1); + if (count($_RDATA['sp_exist']) && !$_ODATA['sp_autodelete']) { + OS_crawlLog('Adding '.count($_RDATA['sp_exist']).' orphan(s) to queue...', 1); - foreach ($_EXIST as $key => $link) { + foreach ($_RDATA['sp_exist'] as $key => $link) { // If orphan URL passes our user filters if ($nx = OS_filterURL($link, $data['base'])) { @@ -1602,7 +1614,7 @@ while ($_cURL && count($_QUEUE)) { } // ... then add the orphan to the queue - $_QUEUE[] = array($link, 0, ''); + $_RDATA['sp_queue'][] = array($link, 0, ''); } // Else if we stored some pages, we're done diff --git a/orcinus/search.php b/orcinus/search.php index 504c4ca..a486f60 100644 --- a/orcinus/search.php +++ b/orcinus/search.php @@ -26,14 +26,13 @@ foreach ($_RDATA['s_weights'] as $key => $weight) // {{{{{ Create the Mustache template $_TEMPLATE = new OS_Mustache(); -$_TEMPLATE->version = $_ODATA['version']; -$_TEMPLATE->limit_term_length = $_ODATA['s_limit_term_length']; // Check if there are rows in the search database if ($_RDATA['s_searchable_pages']) { $_TEMPLATE->searchable = new stdClass(); $_TEMPLATE->searchable->form_action = $_SERVER['REQUEST_URI']; + $_TEMPLATE->searchable->limit_term_length = $_ODATA['s_limit_term_length']; if (empty($_REQUEST['c']) || empty($_RDATA['s_category_list'][$_REQUEST['c']])) $_REQUEST['c'] = '';