orcinus-search/orcinus/search.php

885 lines
35 KiB
PHP
Raw Normal View History

2023-04-13 12:27:41 +00:00
<?php /* ***** Orcinus Site Search - Searching Engine ************** */
2023-04-12 02:02:16 +00:00
require __DIR__.'/config.php';
$_SDATA = array(
'terms' => array(),
'formatted' => array(),
'cache' => array(
'data' => '',
'ip' => '',
'stamp' => 0,
'raw' => ''
),
'results' => array(),
'tag' => 'mark',
2023-04-12 02:02:16 +00:00
'json' => array(),
'pages' => 1,
'time' => microtime(true)
);
/* ***** Handle POST Requests ************************************** */
unset($_REQUEST['log']);
if ($_SERVER['REQUEST_METHOD'] == 'POST') {
// JSON POST request
// These are usually sent by javascript fetch()
if ($_SERVER['CONTENT_TYPE'] == 'application/json') {
$postBody = file_get_contents('php://input');
$_REQUEST = json_decode($postBody, true);
}
}
2023-05-17 13:22:00 +00:00
foreach ($_ODATA['s_weights'] as $key => $weight)
$_ODATA['s_weights'][$key] = (float)$weight;
2023-04-12 02:02:16 +00:00
// Prepare regexp translation array for accented / ligature characters
$_RDATA['s_latin_pcre'] = array();
$_RDATA['s_latin_pcre_multi'] = array();
foreach ($_RDATA['s_latin'] as $char => $latin) {
if (strlen($char) > 1) {
$pcre = '('.$char.'|'.implode('|', $latin).')';
} else $pcre = '['.$char.implode('', $latin).']';
$_RDATA['s_latin_pcre'][$char] = $pcre;
foreach ($latin as $lchar)
$_RDATA['s_latin_pcre'][$lchar] = $pcre;
if (strlen($char) > 1) {
$_RDATA['s_latin_pcre_multi'][$char] = $pcre;
foreach ($latin as $lchar)
$_RDATA['s_latin_pcre_multi'][$lchar] = $pcre;
}
}
2023-04-12 02:02:16 +00:00
2023-04-20 14:47:11 +00:00
// {{{{{ Initialize the Mustache templating engine
class OS_Mustache {
public $errors;
public $online = true;
2023-04-20 14:47:11 +00:00
public $version;
public $searchable;
function __construct() {
global $_ODATA;
$this->version = $_ODATA['version'];
}
function addError($text) {
if (!$this->errors) {
$this->errors = new stdClass();
$this->errors->error_list = array();
}
$this->errors->error_list[] = $text;
}
// We'll only autoload the Mustache engine if we need it
function render() {
global $_ODATA;
require_once __DIR__.'/mustache/src/Mustache/Autoloader.php';
Mustache_Autoloader::register();
$output = new Mustache_Engine(array('entity_flags' => ENT_QUOTES));
echo $output->render($_ODATA['s_result_template'], $this);
}
}
2023-04-20 15:03:08 +00:00
$_ORCINUS = new OS_Mustache();
2023-04-12 02:02:16 +00:00
// Check if there are rows in the search database
if ($_RDATA['s_searchable_pages']) {
2023-04-20 15:03:08 +00:00
$_ORCINUS->searchable = new stdClass();
$_ORCINUS->searchable->form_action = $_SERVER['REQUEST_URI'];
2023-04-23 01:48:43 +00:00
$_ORCINUS->searchable->limit_query = $_ODATA['s_limit_query'];
2023-04-20 15:03:08 +00:00
$_ORCINUS->searchable->limit_term_length = $_ODATA['s_limit_term_length'];
2023-04-12 02:02:16 +00:00
2023-04-17 21:47:22 +00:00
if (empty($_REQUEST['c']) || empty($_RDATA['s_category_list'][$_REQUEST['c']]))
2023-04-12 02:02:16 +00:00
$_REQUEST['c'] = '<none>';
if (count($_RDATA['s_category_list']) > 2) {
2023-04-20 15:03:08 +00:00
$_ORCINUS->searchable->categories = new stdClass();
$_ORCINUS->searchable->categories->category_list = array();
2023-04-12 02:02:16 +00:00
foreach ($_RDATA['s_category_list'] as $category => $count) {
$cat = new stdClass();
$cat->name = ($category = '<none>') ? 'All Categories' : $category;
$cat->value = $category;
$cat->selected = ($_REQUEST['c'] == $category);
2023-04-20 15:03:08 +00:00
$_ORCINUS->searchable->categories->category_list[] = $cat;
2023-04-12 02:02:16 +00:00
}
}
2023-04-17 21:47:22 +00:00
if (empty($_REQUEST['q'])) $_REQUEST['q'] = '';
2023-04-12 02:02:16 +00:00
$_REQUEST['q'] = preg_replace(array('/\s/', '/ {2,}/'), ' ', trim($_REQUEST['q']));
// If there is a text request
if ($_REQUEST['q']) {
// Convert to UTF-8 from specified encoding
$_REQUEST['q'] = mb_convert_encoding($_REQUEST['q'], 'UTF-8', $_ODATA['s_charset']);
if (mb_strlen($_REQUEST['q'], 'UTF-8') > $_ODATA['s_limit_query']) {
$_REQUEST['q'] = mb_substr($_REQUEST['q'], 0, $_ODATA['s_limit_query'], 'UTF-8');
2023-04-23 01:48:43 +00:00
$_ORCINUS->addError('Search query truncated to maximum '.$_ODATA['s_limit_query'].' characters');
2023-04-12 02:02:16 +00:00
}
2023-04-20 15:03:08 +00:00
$_ORCINUS->searchable->request_q = $_REQUEST['q'];
2023-04-12 02:02:16 +00:00
// Split request string on quotation marks (")
$request = explode('"', ' '.$_REQUEST['q'].' ');
for ($x = 0; $x < count($request) && count($_SDATA['terms']) < $_ODATA['s_limit_terms']; $x++) {
// Every second + 1 group of terms just a list of terms
if (!($x % 2)) {
// Split this list of terms on spaces
$request[$x] = explode(' ', $request[$x]);
foreach ($request[$x] as $t) {
if (!$t) continue;
// Leading + means important, a MUST match
if ($t[0] == '+' && mb_strlen($t, 'UTF-8') > 1) {
2023-04-12 02:02:16 +00:00
// Just count it as a 'phrase' of one word, functionally equivalent
$_SDATA['terms'][] = array('phrase', substr($t, 1), false);
2023-04-17 21:47:22 +00:00
// Leading - means negative, a MUST exclude
} else if ($t[0] == '-' && mb_strlen($t, 'UTF-8') > 1) {
2023-04-12 02:02:16 +00:00
$_SDATA['terms'][] = array('exclude', substr($t, 1), false);
// Restrict to a specific filetype
2023-04-24 20:31:27 +00:00
// Really, we'd only allow HTML, XML and PDF here
} else if (strpos(strtolower($t), 'filetype:') === 0) {
2023-04-12 02:02:16 +00:00
$t = trim(substr($t, 9));
if ($t && isset($_RDATA['s_filetypes'][strtoupper($t)]))
$_SDATA['terms'][] = array('filetype', $t, false);
// Else if the term is greater than the term length limit, add it
} else if (mb_strlen($t, 'UTF-8') >= $_ODATA['s_limit_term_length'])
2023-04-12 02:02:16 +00:00
$_SDATA['terms'][] = array('term', $t, false);
}
// Every second group of terms is a phrase, a MUST match
} else $_SDATA['terms'][] = array('phrase', $request[$x], false);
}
// If we successfully procured some terms
if (count($_SDATA['terms'])) {
2023-04-20 15:03:08 +00:00
$_ORCINUS->searchable->searched = new stdClass();
2023-04-12 02:02:16 +00:00
if ($_REQUEST['c'] != '<none>') {
2023-04-20 15:03:08 +00:00
$_ORCINUS->searchable->searched->category = new stdClass();
$_ORCINUS->searchable->searched->category->request_c = $_REQUEST['c'];
2023-04-12 02:02:16 +00:00
}
// Prepare PCRE match text for each phrase and term
foreach ($_SDATA['terms'] as $key => list($type, $term, $pcre)) {
// Normalize punctuation
$term = strtr($term, $_RDATA['sp_punct']);
$_SDATA['terms'][$key][1] = $term;
2023-04-12 02:02:16 +00:00
switch ($type) {
case 'filetype':
$_SDATA['formatted'][] = $type.':'.$term;
break;
case 'exclude':
$_SDATA['formatted'][] = '-'.$term;
break;
case 'phrase':
$_SDATA['formatted'][] = '"'.$term.'"';
case 'term':
if ($type == 'term')
$_SDATA['formatted'][] = $term;
// Regexp for later use pattern matching results
$_SDATA['terms'][$key][2] = '/('.strtr(
preg_quote(strtolower($term), '/'),
$_RDATA['s_latin_pcre']
).')/iu';
2023-04-12 02:02:16 +00:00
}
}
// Without this, category searches are merged, maybe okay?
// if ($_REQUEST['c'] != '<none>')
// $_SDATA['formatted'][] = '('.
// Check if this search is already cached
$_SDATA['formatted'] = implode(' ', $_SDATA['formatted']);
$checkCache = $_DDATA['pdo']->prepare(
'SELECT `stamp`, INET_NTOA(`ip`) AS `ip`, `cache`
FROM `'.$_DDATA['tbprefix'].'query`
WHERE `query`=:query AND `cache`<>\'\'
ORDER BY `stamp` DESC LIMIT 1;'
);
$checkCache->execute(array('query' => $_SDATA['formatted']));
$err = $checkCache->errorInfo();
if ($err[0] == '00000') {
$checkCache = $checkCache->fetchAll();
// If we retrieved a matching row from the query log
if (count($checkCache)) {
$_SDATA['cache']['ip'] = $checkCache[0]['ip'];
$_SDATA['cache']['stamp'] = $checkCache[0]['stamp'];
$_SDATA['cache']['raw'] = $checkCache[0]['cache'];
$_SDATA['cache']['data'] = $checkCache[0]['cache'];
// Try to gzunzip the cache data
if (function_exists('gzuncompress')) {
$checkGZ = gzuncompress($_SDATA['cache']['data']);
if ($checkGZ) $_SDATA['cache']['data'] = $checkGZ;
}
// Try to json_decode the cache data
// If this step fails, assume there is no cache data
$checkJS = json_decode($_SDATA['cache']['data'], true);
$_SDATA['cache']['data'] = (!is_null($checkJS)) ? $checkJS : '';
2023-04-12 02:02:16 +00:00
}
// Database error accessing the query log
2023-04-20 15:03:08 +00:00
} else $_ORCINUS->addError('Error reading the search result cache');
2023-04-12 02:02:16 +00:00
// ***** Nothing in the cache, so do an actual search
if (!is_array($_SDATA['cache']['data'])) {
// Begin building the basic query
$searchSQL = '
SELECT `url`, `category`, `content`, `content_mime`, `title`,
2023-09-08 19:11:27 +00:00
`description`, `keywords`, `weighted`, `last_modified`, `priority`
2023-04-12 02:02:16 +00:00
FROM `'.$_DDATA['tbprefix'].'crawldata`
2023-04-24 20:31:27 +00:00
WHERE `flag_unlisted`=0 AND `priority`>0';
2023-04-12 02:02:16 +00:00
// Restrict by category
if ($_REQUEST['c'] != '<none>')
2023-04-24 20:31:27 +00:00
$searchSQL .= ' AND `category`=\''.addslashes($_REQUEST['c']).'\'';
2023-04-12 02:02:16 +00:00
// Show or do not show Orphans
$searchSQL .= $_RDATA['s_show_orphans'];
2023-04-12 02:02:16 +00:00
$ands = array();
$ors = array();
$negs = array();
2023-04-24 20:31:27 +00:00
$filetypes = array();
2023-04-12 02:02:16 +00:00
foreach ($_SDATA['terms'] as list($type, $term, $pcre)) {
$slashTerm = addslashes($term);
2023-04-12 02:02:16 +00:00
switch ($type) {
2023-04-24 20:31:27 +00:00
case 'filetype':
if (isset($_RDATA['s_filetypes'][strtoupper($term)]))
foreach ($_RDATA['s_filetypes'][strtoupper($term)] as $mime)
$filetypes[] = '`content_mime`=\''.$mime.'\'';
2023-04-12 02:02:16 +00:00
break;
case 'exclude':
$negs[] = 'INSTR(`content`, \''.$slashTerm.'\')=0';
$negs[] = '`content` NOT LIKE \'%'.$slashTerm.'%\'';
$negs[] = 'INSTR(`url`, \''.$slashTerm.'\')=0';
$negs[] = '`url` NOT LIKE \'%'.$slashTerm.'%\'';
$negs[] = 'INSTR(`title`, \''.$slashTerm.'\')=0';
$negs[] = '`title` NOT LIKE \'%'.$slashTerm.'%\'';
$negs[] = 'INSTR(`description`, \''.$slashTerm.'\')=0';
$negs[] = '`description` NOT LIKE \'%'.$slashTerm.'%\'';
$negs[] = 'INSTR(`keywords`, \''.$slashTerm.'\')=0';
$negs[] = '`keywords` NOT LIKE \'%'.$slashTerm.'%\'';
$negs[] = 'INSTR(`weighted`, \''.$slashTerm.'\')=0';
$negs[] = '`weighted` NOT LIKE \'%'.$slashTerm.'%\'';
2023-04-12 02:02:16 +00:00
break;
case 'phrase':
$ands[] = '('.implode(' OR ', array(
'INSTR(`content`, \''.$slashTerm.'\')',
'`content` LIKE \'%'.$slashTerm.'%\'',
'INSTR(`url`, \''.$slashTerm.'\')',
'`url` LIKE \'%'.$slashTerm.'%\'',
'INSTR(`title`, \''.$slashTerm.'\')',
'`title` LIKE \'%'.$slashTerm.'%\'',
'INSTR(`description`, \''.$slashTerm.'\')',
'`description` LIKE \'%'.$slashTerm.'%\'',
'INSTR(`keywords`, \''.$slashTerm.'\')',
'`keywords` LIKE \'%'.$slashTerm.'%\'',
'INSTR(`weighted`, \''.$slashTerm.'\')',
'`weighted` LIKE \'%'.$slashTerm.'%\''
2023-04-12 02:02:16 +00:00
)).')';
break;
case 'term':
$ors[] = 'INSTR(`content`, \''.$slashTerm.'\')';
$ors[] = '`content` LIKE \'%'.$slashTerm.'%\'';
$ors[] = 'INSTR(`url`, \''.$slashTerm.'\')';
$ors[] = '`url` LIKE \'%'.$slashTerm.'%\'';
$ors[] = 'INSTR(`title`, \''.$slashTerm.'\')';
$ors[] = '`title` LIKE \'%'.$slashTerm.'%\'';
$ors[] = 'INSTR(`description`, \''.$slashTerm.'\')';
$ors[] = '`description` LIKE \'%'.$slashTerm.'%\'';
$ors[] = 'INSTR(`keywords`, \''.$slashTerm.'\')';
$ors[] = '`keywords` LIKE \'%'.$slashTerm.'%\'';
$ors[] = 'INSTR(`weighted`, \''.$slashTerm.'\')';
$ors[] = '`weighted` LIKE \'%'.$slashTerm.'%\'';
2023-04-12 02:02:16 +00:00
}
}
2023-04-24 20:31:27 +00:00
if (count($filetypes))
$searchSQL .= ' AND ('.implode(' OR ', $filetypes).') ';
2023-04-12 02:02:16 +00:00
if (count($ands)) {
2023-04-24 20:31:27 +00:00
$searchSQL .= ' AND '.implode(' AND ', $ands).' ';
} else $searchSQL .= ' AND ('.implode(' OR ', $ors).') ';
2023-04-12 02:02:16 +00:00
if (count($negs))
$searchSQL .= ' AND '.implode(' AND ', $negs);
// Execute the query
$searchQuery = $_DDATA['pdo']->query($searchSQL.';');
$err = $searchQuery->errorInfo();
if ($err[0] == '00000') {
$searchQuery = $searchQuery->fetchAll();
2023-09-08 19:11:27 +00:00
$pdfList = array();
2023-04-12 02:02:16 +00:00
// Apply relevance to each listing and then sort
foreach ($searchQuery as $key => $row) {
$searchQuery[$key]['relevance'] = 0;
$searchQuery[$key]['multi'] = -1;
$searchQuery[$key]['phrase'] = 0;
2023-09-08 19:11:27 +00:00
if ($row['content_mime'] == 'application/pdf')
$pdfList[] = array($key, $row['last_modified']);
2023-04-12 02:02:16 +00:00
// Lowercase values for easy compare
$row['lc_content'] = strtolower($row['content']);
$row['lc_url'] = strtolower($row['url']);
$row['lc_title'] = strtolower($row['title']);
$row['lc_description'] = strtolower($row['description']);
$row['lc_keywords'] = strtolower($row['keywords']);
$row['lc_weighted'] = strtolower($row['weighted']);
// Remove latin character accents
foreach ($_RDATA['s_latin'] as $char => $latin) {
$row['lc_content'] = str_replace($latin, $char, $row['lc_content']);
$row['lc_url'] = str_replace($latin, $char, $row['lc_url']);
$row['lc_title'] = str_replace($latin, $char, $row['lc_title']);
$row['lc_description'] = str_replace($latin, $char, $row['lc_description']);
$row['lc_keywords'] = str_replace($latin, $char, $row['lc_keywords']);
$row['lc_weighted'] = str_replace($latin, $char, $row['lc_weighted']);
}
// Run through each term and check content for matches
foreach ($_SDATA['terms'] as list($type, $term, $pcre)) {
switch ($type) {
case 'filetype': break;
case 'exclude': break;
case 'phrase':
$searchQuery[$key]['phrase']++;
case 'term':
$term = strtolower($term);
foreach ($_RDATA['s_latin'] as $char => $latin)
$term = str_replace($latin, $char, $term);
$pcreterm = '/\b'.preg_quote($term, '/').'/i';
// Give full points for every instance of a term
// that's at the beginning of a word or phrase
$i = preg_match_all($pcreterm, $row['lc_content']);
$j = preg_match_all($pcreterm, $row['lc_url']);
$k = preg_match_all($pcreterm, $row['lc_title']);
$l = preg_match_all($pcreterm, $row['lc_description']);
$m = preg_match_all($pcreterm, $row['lc_keywords']);
$n = preg_match_all($pcreterm, $row['lc_weighted']);
if ($i || $j || $k || $l || $m || $n)
$searchQuery[$key]['multi']++;
// Limit generic matches to a maximum of three
$a = $i + min(substr_count($row['lc_content'], $term), 3);
$b = $j + min(substr_count($row['lc_url'], $term), 3);
$c = $k + min(substr_count($row['lc_title'], $term), 3);
$d = $l + min(substr_count($row['lc_description'], $term), 3);
$e = $m + min(substr_count($row['lc_keywords'], $term), 3);
$f = $n + min(substr_count($row['lc_weighted'], $term), 3);
2023-05-17 13:22:00 +00:00
$searchQuery[$key]['relevance'] += $a * $_ODATA['s_weights']['body'];
$searchQuery[$key]['relevance'] += $b * $_ODATA['s_weights']['url'];
$searchQuery[$key]['relevance'] += $c * $_ODATA['s_weights']['title'];
$searchQuery[$key]['relevance'] += $d * $_ODATA['s_weights']['description'];
$searchQuery[$key]['relevance'] += $e * $_ODATA['s_weights']['keywords'];
$searchQuery[$key]['relevance'] += $f * $_ODATA['s_weights']['css_value'];
2023-04-12 02:02:16 +00:00
}
}
// Calculate multipliers
2023-05-17 13:22:00 +00:00
$searchQuery[$key]['relevance'] *= $_ODATA['s_weights']['multi'] ** $searchQuery[$key]['multi'];
$searchQuery[$key]['relevance'] *= $_ODATA['s_weights']['important'] ** $searchQuery[$key]['phrase'];
2023-04-12 02:02:16 +00:00
$searchQuery[$key]['relevance'] *= $row['priority'];
}
2023-09-08 19:11:27 +00:00
// Apply the PDF Last Modified multiplier
if (count($pdfList) > 1) {
foreach ($pdfList as $value) {
$diff = (time() - $value[1]) / (60 * 60 * 24 * 365);
$searchQuery[$value[0]]['relevance'] *= $_ODATA['s_weights']['pdflastmod'] ** $diff;
}
}
2023-04-12 02:02:16 +00:00
// Sort the list by relevance value
usort($searchQuery, function($a, $b) {
if ($b['relevance'] == $a['relevance']) return 0;
return ($b['relevance'] > $a['relevance']) ? 1 : -1;
});
// Normalize results from 0 - 100 and delete results with
// relevance values < 5% of the top result
for ($x = count($searchQuery) - 1; $x >= 0; $x--) {
2023-10-18 18:23:00 +00:00
if ($searchQuery[0]['relevance'] > 0 && $searchQuery[0]['relevance'] * 0.05 <= $searchQuery[$x]['relevance']) {
2023-04-12 02:02:16 +00:00
$searchQuery[$x]['relevance'] /= $searchQuery[0]['relevance'] * 0.01;
} else unset($searchQuery[$x]);
}
// The final results list is the top slice of this data
// limited by the 's_limit_results' value
$_SDATA['results'] = array_slice($searchQuery, 0, $_ODATA['s_limit_results']);
// Now loop through the remaining results to generate the
// proper match text for each
foreach ($_SDATA['results'] as $key => $row) {
$_SDATA['results'][$key]['matchtext'] = array();
$_SDATA['results'][$key]['fragment'] = array();
2023-04-12 02:02:16 +00:00
// Add the page description to use as a default match text
if (trim($row['description'])) {
if (mb_strlen($row['description'], 'UTF-8') > $_ODATA['s_limit_matchtext']) {
$match = mb_substr($row['description'], 0, $_ODATA['s_limit_matchtext'], 'UTF-8')."\u{2026}";
2023-04-12 23:08:00 +00:00
} else $match = $row['description'];
2023-04-12 02:02:16 +00:00
$_SDATA['results'][$key]['matchtext'][] = array(
'rank' => 0,
2023-04-12 23:08:00 +00:00
'text' => $match
2023-04-12 02:02:16 +00:00
);
}
// Loop through each term to capture matchtexts
foreach ($_SDATA['terms'] as list($type, $term, $pcre)) {
switch ($type) {
case 'filetype': break;
case 'exclude': break;
case 'phrase':
case 'term':
// Split the content on the current term
$splitter = preg_split($pcre, $row['content'], 0,
PREG_SPLIT_NO_EMPTY | PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_OFFSET_CAPTURE);
// For each match, gather the appropriate amount of match
// text from either side of it
foreach ($splitter as $split) {
if (preg_match($pcre, $split[0]) || count($splitter) == 1) {
if (count($splitter) == 1) {
// Grab some random content if there were no
// matches in the content
$offset = mt_rand(0, max(0, mb_strlen($row['content'], 'UTF-8') - $_ODATA['s_limit_matchtext']));
} else {
$_SDATA['results'][$key]['fragment'][] = $split[0];
$offset = floor(max(0, $split[1] - (mb_strlen($term, 'UTF-8') + $_ODATA['s_limit_matchtext']) / 2));
}
$match = trim(mb_substr($row['content'], $offset, $_ODATA['s_limit_matchtext'], 'UTF-8'));
2023-04-12 02:02:16 +00:00
// Add appropriate ellipses
if ($offset + ((mb_strlen($term, 'UTF-8') + $_ODATA['s_limit_matchtext']) / 2) < mb_strlen($row['content'], 'UTF-8'))
2023-04-12 02:02:16 +00:00
$match .= "\u{2026}";
if ($offset) $match = "\u{2026}".$match;
$_SDATA['results'][$key]['matchtext'][] = array(
'rank' => 0,
'text' => $match
);
}
}
}
}
// For each found match text, add a point for every time a
// term is found in the match text; triple points for phrase
// or important (+) matches
foreach ($_SDATA['results'][$key]['matchtext'] as $mkey => $matchtext) {
foreach ($_SDATA['terms'] as $tkey => list($type, $term, $pcre)) {
switch ($type) {
case 'filetype': break;
case 'exclude': break;
case 'phrase':
case 'term':
$points = preg_match_all($pcre, $matchtext['text']); // / ($tkey + 1);
if ($type == 'phrase') $points *= 3;
$_SDATA['results'][$key]['matchtext'][$mkey]['rank'] += $points;
}
}
}
// Sort the match texts by score
usort($_SDATA['results'][$key]['matchtext'], function($a , $b) {
if ($b['rank'] == $a['rank']) return 0;
return ($b['rank'] > $a['rank']) ? 1 : -1;
});
// Use the top-ranked match text as the official match text
// Run an mb_convert_encoding() in case we chopped a UTF-8
// character in the middle of its bytes
$_SDATA['results'][$key]['matchtext'] = mb_convert_encoding(
$_SDATA['results'][$key]['matchtext'][0]['text'],
'UTF-8',
'UTF-8'
);
// Unset result values we no longer need so they don't
// bloat the cache unnecessarily
unset($_SDATA['results'][$key]['content']);
unset($_SDATA['results'][$key]['keywords']);
unset($_SDATA['results'][$key]['weighted']);
2023-09-08 19:11:27 +00:00
unset($_SDATA['results'][$key]['last_modified']);
2023-04-12 02:02:16 +00:00
unset($_SDATA['results'][$key]['multi']);
unset($_SDATA['results'][$key]['phrase']);
}
2023-04-20 15:03:08 +00:00
} else $_ORCINUS->addError('Database error reading results: '.$err[2]);
2023-04-12 02:02:16 +00:00
// ***** Else this is a cached set of results
} else $_SDATA['results'] = $_SDATA['cache']['data'];
// Limit $_REQUEST['page'] to within boundaries
2023-04-24 17:37:44 +00:00
$_REQUEST['page'] = (!empty($_REQUEST['page'])) ? max(1, (int)$_REQUEST['page']) : 1;
$_SDATA['pages'] = max(1, ceil(count($_SDATA['results']) / $_ODATA['s_results_pagination']));
2023-04-12 02:02:16 +00:00
$_REQUEST['page'] = min($_SDATA['pages'], $_REQUEST['page']);
// Database log (and potentially cache) this page only if:
// - This is not a JSON output request
// - The user is visiting page 1 of results
// - Their IP does not match the IP of the previous request for
// this same query
// - ... but if their IP *does* match, check that their last
// request for this same query was more than ten seconds ago
2023-04-12 23:08:00 +00:00
if (!isset($_REQUEST['json']) && $_REQUEST['page'] == 1 &&
2023-04-12 02:02:16 +00:00
($_SDATA['cache']['ip'] != $_SERVER['REMOTE_ADDR'] ||
$_SDATA['cache']['stamp'] + 10 < time())) {
// Delete the cache from all other searches for this query
$clear = $_DDATA['pdo']->prepare(
'UPDATE `'.$_DDATA['tbprefix'].'query` SET `cache`=\'\'
WHERE `query`=:query;'
);
$clear->execute(array('query' => $_SDATA['formatted']));
$err = $clear->errorInfo();
if ($err[0] != '00000')
2023-04-20 15:03:08 +00:00
$_ORCINUS->addError('Could not clear previous search cache: '.$err[2]);
2023-04-12 02:02:16 +00:00
// If we are caching search results
if ($_ODATA['s_limit_cache']) {
// If this search query hasn't been cached yet
if (!is_array($_SDATA['cache']['data'])) {
// JSON encode and potentially gzip the results for storage
$searchCache = json_encode($_SDATA['results'], JSON_INVALID_UTF8_IGNORE);
if (function_exists('gzcompress'))
$searchCache = gzcompress($searchCache);
// else use the cache we retrieved from the database
} else $searchCache = $_SDATA['cache']['raw'];
} else $searchCache = '';
$insertQuery = $_DDATA['pdo']->prepare(
'INSERT INTO `'.$_DDATA['tbprefix'].'query` SET
`query`=:query,
`results`=:results,
`stamp`=UNIX_TIMESTAMP(),
`ip`=INET_ATON(:ipaddr),
`cache`=:cache
;'
);
$insertQuery->execute(array(
'query' => $_SDATA['formatted'],
'results' => count($_SDATA['results']),
'ipaddr' => $_SERVER['REMOTE_ADDR'],
'cache' => $searchCache
));
if (!$insertQuery->rowCount()) {
2023-04-20 15:03:08 +00:00
$_ORCINUS->addError('Could not cache search results');
2023-04-12 02:02:16 +00:00
$err = $insertQuery->errorInfo();
if ($err[0] != '00000')
2023-04-20 15:03:08 +00:00
$_ORCINUS->addError('MySQL error: '.$err[2]);
2023-04-12 02:02:16 +00:00
}
}
// ***** We have completed searching and caching! *****
// Now it's time to focus on how we will format the data we
// obtained for display to the viewer
// Get a slice of the results that corresponds to the current
// search results pagination page we are on
$resultsPage = array_slice(
$_SDATA['results'],
($_REQUEST['page'] - 1) * $_ODATA['s_results_pagination'],
$_ODATA['s_results_pagination']
);
// If we have more than zero results...
if (count($resultsPage)) {
2023-04-20 15:03:08 +00:00
$_ORCINUS->searchable->searched->results = new stdClass();
$_ORCINUS->searchable->searched->results->result_list = array();
2023-04-12 02:02:16 +00:00
// Prepare PCRE for removing base domains
if (count($_ODATA['sp_domains']) == 1)
$repStr = '/^'.preg_quote(key($_ODATA['sp_domains']), '/').'/';
2023-04-12 02:02:16 +00:00
// Do a last once-over of the results
foreach ($resultsPage as $key => $result) {
$_RESULT = new stdClass();
$_RESULT->filetype = '';
foreach ($_RDATA['s_filetypes'] as $type => $mimes)
foreach ($mimes as $mime)
if ($result['content_mime'] == $mime)
$_RESULT->filetype = $type;
// Don't display filetype of HTML pages
if (!$_ODATA['s_show_filetype_html'])
if ($_RESULT->filetype == 'HTML')
$_RESULT->filetype = '';
if ($_RESULT->filetype)
$_RESULT->filetype = '['.$_RESULT->filetype.']';
// Don't display category if there's only one
if (count($_RDATA['s_category_list']) > 2) {
$_RESULT->category = $result['category'];
} else $_RESULT->category = '';
// Format relevance
$_RESULT->relevance = number_format($result['relevance'], 2, '.', '');
// Remove base domain from URL if they are all the same
if (count($_ODATA['sp_domains']) == 1)
2023-04-12 02:02:16 +00:00
$result['url'] = preg_replace($repStr, '', $result['url']);
// Highlight the terms in the title, url and matchtext
$_RESULT->query = $_REQUEST['q'];
2023-04-20 20:18:51 +00:00
$_RESULT->title = $result['title'];
$_RESULT->url = $result['url'];
$_RESULT->matchtext = $result['matchtext'];
$_RESULT->description = $result['description'];
$_RESULT->title_highlight = htmlspecialchars($result['title']);
$_RESULT->url_highlight = htmlspecialchars($result['url']);
$_RESULT->matchtext_highlight = htmlspecialchars($result['matchtext']);
$_RESULT->description_highlight = htmlspecialchars($result['description']);
2023-04-12 02:02:16 +00:00
foreach ($_SDATA['terms'] as list($type, $term, $pcre)) {
switch ($type) {
case 'filetype': break;
case 'exclude': break;
case 'phrase':
case 'term':
$_RESULT->title_highlight = preg_replace(
$pcre,
'<'.$_SDATA['tag'].'>$1</'.$_SDATA['tag'].'>',
$_RESULT->title_highlight
);
$_RESULT->url_highlight = preg_replace(
$pcre,
'<'.$_SDATA['tag'].'>$1</'.$_SDATA['tag'].'>',
$_RESULT->url_highlight
);
$_RESULT->matchtext_highlight = preg_replace(
$pcre,
'<'.$_SDATA['tag'].'>$1</'.$_SDATA['tag'].'>',
$_RESULT->matchtext_highlight
);
$_RESULT->description_highlight = preg_replace(
$pcre,
'<'.$_SDATA['tag'].'>$1</'.$_SDATA['tag'].'>',
$_RESULT->description_highlight
);
2023-04-12 02:02:16 +00:00
}
}
// Convert output back to $_ODATA['s_charset'] before storing
if (strtoupper($_ODATA['s_charset']) != 'UTF-8') {
$_RESULT = json_decode(mb_convert_encoding(
json_encode($_RESULT, JSON_INVALID_UTF8_IGNORE),
$_ODATA['s_charset'],
'UTF-8'
), true);
2023-04-12 02:02:16 +00:00
}
// Append text fragment(s) to the URL if applicable
if ($_ODATA['s_text_fragments'] && count($result['fragment'])) {
$result['fragment'] = array_map(function($a) {
return str_replace(array(',', '-'), array('%2C', '%2D'), urlencode($a));
}, array_values(array_unique($result['fragment'])));
$_RESULT->url .= '#:~:text='.implode('&text=', $result['fragment']);
}
2023-04-20 15:03:08 +00:00
$_ORCINUS->searchable->searched->results->result_list[] = $_RESULT;
2023-04-12 02:02:16 +00:00
}
// If there are more than just one page of results, prepare all
// the pagination variables for the template
if ($_SDATA['pages'] > 1) {
$pagination = new stdClass();
$pagination->page_gt1 = ($_REQUEST['page'] > 1);
$pagination->page_minus1 = $_REQUEST['page'] - 1;
$pagination->page_list = array();
for ($x = 1; $x <= $_SDATA['pages']; $x++) {
$page = new stdClass();
$page->index = $x;
$page->current = ($x == $_REQUEST['page']);
$pagination->page_list[] = $page;
}
$pagination->page_ltpages = ($_REQUEST['page'] < $_SDATA['pages']);
$pagination->page_plus1 = $_REQUEST['page'] + 1;
2023-04-20 15:03:08 +00:00
$_ORCINUS->searchable->searched->results->pagination = $pagination;
2023-04-12 02:02:16 +00:00
}
// Final numerical and stopwatch time values
2023-04-20 15:03:08 +00:00
$_ORCINUS->searchable->searched->results->from = min(count($_SDATA['results']), ($_REQUEST['page'] - 1) * $_ODATA['s_results_pagination'] + 1);
$_ORCINUS->searchable->searched->results->to = min(count($_SDATA['results']), $_REQUEST['page'] * $_ODATA['s_results_pagination']);
$_ORCINUS->searchable->searched->results->of = count($_SDATA['results']);
$_ORCINUS->searchable->searched->results->in = number_format(microtime(true) - $_SDATA['time'], 2, '.', '');
2023-04-12 02:02:16 +00:00
2023-04-20 15:03:08 +00:00
$_SDATA['json'] = array_slice($_ORCINUS->searchable->searched->results->result_list, 0, 5);
2023-04-12 02:02:16 +00:00
} // No results
} // No valid terms
// Convert request query back to $_ODATA['s_charset'] before exiting
if (strtoupper($_ODATA['s_charset']) != 'UTF-8')
$_REQUEST['q'] = mb_convert_encoding($_REQUEST['q'], $_ODATA['s_charset'], 'UTF-8');
} // No request data
} // No searchable pages in search database
// If the crawler 'time_start' is more than 'timeout_crawl'
// seconds ago, the crawler is probably stuck. Unstick it.
if (OS_getValue('sp_crawling') && time() - $_ODATA['sp_time_start'] > $_ODATA['sp_timeout_crawl']) {
OS_setValue('sp_crawling', 0);
$reason = 'Crawl timeout ('.$_ODATA['sp_timeout_crawl'].'s) exceeded';
if (strpos(OS_getValue('sp_log'), "\n") === false && file_exists($_ODATA['sp_log'])) {
$log = file_get_contents($_ODATA['sp_log']);
OS_setValue('sp_log', $log."\n".'[ERROR] '.$reason);
} else OS_setValue('sp_log', '[ERROR] '.$reason);
OS_setValue('sp_time_last', $_ODATA['sp_time_end'] - $_ODATA['sp_time_start']);
// Send failure email to the admin(s)
if ($_MAIL && count($_MAIL->getAllRecipientAddresses()) && $_ODATA['sp_email_failure']) {
$_MAIL->Subject = 'Orcinus Site Search Crawler: Crawl timeout ('.$_ODATA['sp_timeout_crawl'].'s) exceeded';
$_MAIL->Body = implode(" \r\n", preg_grep('/^[\[\*\w\d]/', explode("\n", $_ODATA['sp_log'])));
if (!$_MAIL->Send()) OS_setValue('sp_log', $_ODATA['sp_log']."\n".'[ERROR] Could not send notification email');
}
}
2023-04-12 02:02:16 +00:00
// ***** Trigger another crawl
if ($_ODATA['sp_interval'] && !$_ODATA['sp_crawling'] &&
time() - $_ODATA['sp_time_end_success'] > $_ODATA['sp_interval'] * 3600) {
2023-04-12 02:02:16 +00:00
// If we can only trigger the crawl during certain time period
if ($_ODATA['sp_interval_start'] != $_ODATA['sp_interval_stop']) {
$timeNow = new DateTime();
$timeStart = DateTime::createFromFormat('H:i:s', $_ODATA['sp_interval_start']);
$timeStop = DateTime::createFromFormat('H:i:s', $_ODATA['sp_interval_stop']);
// Move PM start times back one day
if ($timeStart > $timeStop) $timeStart->modify('-1 day');
// Make sure at least the stop time is in the future
if ($timeNow > $timeStop) {
$timeStart->modify('+1 day');
$timeStop->modify('+1 day');
}
$allowCrawl = ($timeStart < $timeNow && $timeNow < $timeStop);
// Otherwise always allow the crawl
} else $allowCrawl = true;
if ($allowCrawl) {
// Set the key for initiating the crawler
$md5 = md5(hrtime(true));
OS_setValue('sp_key', $md5);
// ***** Initialize the cURL connection
$_cURL = OS_getConnection();
if ($_cURL) {
// Customize this cURL connection
curl_setopt($_cURL, CURLOPT_POST, true);
curl_setopt($_cURL, CURLOPT_POSTFIELDS, json_encode(array(
'action' => 'crawl',
'sp_key' => $_ODATA['sp_key']
)));
curl_setopt($_cURL, CURLOPT_HTTPHEADER, array(
'Content-type: application/json; charset='.$_ODATA['s_charset']
));
curl_setopt($_cURL, CURLOPT_CONNECTTIMEOUT, 1);
curl_setopt($_cURL, CURLOPT_TIMEOUT, 1);
2023-04-17 21:47:22 +00:00
$crawlerDir = str_replace($_ODATA['admin_install_root'], '', __DIR__);
2023-04-12 02:02:16 +00:00
$crawlerURL = $_ODATA['admin_install_domain'].$crawlerDir.'/crawler.php';
curl_setopt($_cURL, CURLOPT_URL, str_replace(' ', '%20', $crawlerURL));
curl_exec($_cURL);
// Error code 28 (timeout) is okay
$errno = curl_errno($_cURL);
if ($errno && $errno != 28) {
$error = curl_error($_cURL);
2023-04-20 15:03:08 +00:00
if ($error) $_ORCINUS->addError($error); // Hide this?
2023-04-12 02:02:16 +00:00
}
curl_close($_cURL);
} // Could not create a connection, but don't let the user know
}
}
// Output JSON and exit if requested
2023-04-12 23:08:00 +00:00
if (isset($_REQUEST['json'])) {
2023-04-12 02:02:16 +00:00
header('Content-type: application/json; charset='.$_ODATA['s_charset']);
die(json_encode($_SDATA['json'], JSON_INVALID_UTF8_IGNORE));
// Just log this query and do not post results
} else if (isset($_REQUEST['log'])) die(); ?>