Daily update

This commit is contained in:
Brian Huisman 2023-04-17 17:47:22 -04:00
parent 3b5a22794c
commit 553fc019fe
5 changed files with 227 additions and 147 deletions

View file

@ -51,7 +51,7 @@ function requireFilesOfFolder($dir)
if (!$fileInfo->isDot()) {
if ($fileInfo->isDir()) {
requireFilesOfFolder($fileInfo->getPathname());
} else {
} else if ($fileInfo->getExtension() == 'php') {
require_once $fileInfo->getPathname();
}
}

View file

@ -137,7 +137,7 @@ $_RDATA['sp_starting'] = array_filter(array_map('trim', explode("\n", $_ODATA['s
$_RDATA['s_starting_domains'] = array();
foreach ($_RDATA['sp_starting'] as $starting) {
$starting = parse_url($starting);
if (isset($starting['host']) && $starting['host'])
if (!empty($starting['host']))
$_RDATA['s_starting_domains'][] = $starting['host'];
}
$_RDATA['s_starting_domains'] = array_unique($_RDATA['s_starting_domains']);
@ -164,22 +164,22 @@ $_RDATA['index_status_list'] = array(
// ***** Set session defaults
if (!isset($_SESSION['admin_page']) || !isset($_RDATA['admin_pages'][$_SESSION['admin_page']]))
if (empty($_SESSION['admin_page']) || empty($_RDATA['admin_pages'][$_SESSION['admin_page']]))
$_SESSION['admin_page'] = 'crawler';
if (!isset($_SESSION['index_page'])) $_SESSION['index_page'] = 1;
if (!isset($_SESSION['index_filter_category'])) $_SESSION['index_filter_category'] = '<none>';
if (!isset($_SESSION['index_filter_status'])) $_SESSION['index_filter_status'] = '<none>';
if (!isset($_SESSION['index_filter_text'])) $_SESSION['index_filter_text'] = '';
if (!isset($_SESSION['error'])) $_SESSION['error'] = array();
if (!isset($_SESSION['message'])) $_SESSION['message'] = array();
if (!isset($_SESSION['admin_username'])) $_SESSION['admin_username'] = '';
if (empty($_SESSION['index_page'])) $_SESSION['index_page'] = 1;
if (empty($_SESSION['index_filter_category'])) $_SESSION['index_filter_category'] = '<none>';
if (empty($_SESSION['index_filter_status'])) $_SESSION['index_filter_status'] = '<none>';
if (empty($_SESSION['index_filter_text'])) $_SESSION['index_filter_text'] = '';
if (empty($_SESSION['error'])) $_SESSION['error'] = array();
if (empty($_SESSION['message'])) $_SESSION['message'] = array();
if (empty($_SESSION['admin_username'])) $_SESSION['admin_username'] = '';
if (!$_SESSION['admin_username']) {
if ($_SERVER['REQUEST_METHOD'] == 'POST') {
if (isset($_POST['os_submit']) && $_POST['os_submit'] == 'os_admin_login') {
if (!isset($_POST['os_admin_username'])) $_POST['os_admin_username'] = '';
if (!isset($_POST['os_admin_password'])) $_POST['os_admin_password'] = '';
if (!empty($_POST['os_submit']) && $_POST['os_submit'] == 'os_admin_login') {
if (empty($_POST['os_admin_username'])) $_POST['os_admin_username'] = '';
if (empty($_POST['os_admin_password'])) $_POST['os_admin_password'] = '';
if ($_POST['os_admin_username'] == $_RDATA['admin_username'] &&
$_POST['os_admin_password'] == $_RDATA['admin_password']) {
@ -206,7 +206,7 @@ if (!$_SESSION['admin_username']) {
$response = array();
if (!isset($_POST->action)) $_POST->action = '';
if (empty($_POST->action)) $_POST->action = '';
switch ($_POST->action) {
// Set the key for initiating the crawler
@ -232,14 +232,14 @@ if (!$_SESSION['admin_username']) {
// Download a text file log of the most recent crawl
case 'download':
if (!isset($_POST->content)) $_POST->content = '';
if (empty($_POST->content)) $_POST->content = '';
switch ($_POST->content) {
case 'crawl_log':
if (!$_ODATA['sp_crawling']) {
if ($_ODATA['sp_time_end']) {
$lines = explode("\n", $_ODATA['sp_log']);
if (!isset($_POST->grep)) $_POST->grep = 'all';
if (empty($_POST->grep)) $_POST->grep = '';
switch ($_POST->grep) {
case 'all': break;
case 'errors': $lines = preg_grep('/^[\[\*]/', $lines); break;
@ -278,8 +278,8 @@ if (!$_SESSION['admin_username']) {
// Not used?
case 'fetch':
if (!isset($_POST->value)) $_POST->value = '';
if (isset($_ODATA[$_POST->value])) {
if (empty($_POST->value)) $_POST->value = '';
if (!empty($_ODATA[$_POST->value])) {
$response = array(
'status' => 'Success',
'message' => trim($_ODATA[$_POST->value])
@ -299,7 +299,7 @@ if (!$_SESSION['admin_username']) {
// Normal POST request
} else if (isset($_POST['os_submit'])) {
} else if (!empty($_POST['os_submit'])) {
switch ($_POST['os_submit']) {
@ -484,7 +484,7 @@ if (!$_SESSION['admin_username']) {
$_POST['os_admin_email'][$key] = $email[0]['name'].' <'.$email[0]['address'].'>';
} else $_POST['os_admin_email'][$key] = $email[0]['address'];
} else {
$_SESSION['error'][] = 'Invalid email address \''.$admin_email.'\'.';
$_SESSION['error'][] = 'Invalid To: email address \''.$admin_email.'\'.';
unset($_POST['os_admin_email'][$key]);
}
}
@ -520,7 +520,7 @@ if (!$_SESSION['admin_username']) {
// ***** Page Index >> With Selected...
case 'os_index_with_selected':
if (!isset($_POST['os_index_pages'])) $_POST['os_index_pages'] = array();
if (empty($_POST['os_index_pages'])) $_POST['os_index_pages'] = array();
if (is_array($_POST['os_index_pages'])) {
$checksums_good = true;
@ -532,7 +532,7 @@ if (!$_SESSION['admin_username']) {
}
if ($checksums_good) {
if (!isset($_POST['os_index_select_action'])) $_POST['os_index_select_action'] = '';
if (empty($_POST['os_index_select_action'])) $_POST['os_index_select_action'] = '';
switch ($_POST['os_index_select_action']) {
case 'delete':
$delete = $_DDATA['pdo']->prepare(
@ -550,33 +550,35 @@ if (!$_SESSION['admin_username']) {
break;
case 'category':
if (isset($_POST['os_apply_new_category'])) {
if (!empty($_POST['os_apply_new_category'])) {
$_POST['os_apply_new_category'] = preg_replace(array('/\s/', '/ {2,}/'), ' ', trim($_POST['os_apply_new_category']));
$_POST['os_apply_new_category'] = preg_replace('/[^\w \d-]/', '', $_POST['os_apply_new_category']);
$_POST['os_apply_new_category'] = substr($_POST['os_apply_new_category'], 0, 30);
$update = $_DDATA['pdo']->prepare(
'UPDATE `'.$_DDATA['tbprefix'].'crawldata` SET `category`=:category WHERE `content_checksum`=:content_checksum;'
);
if ($_POST['os_apply_new_category']) {
$update = $_DDATA['pdo']->prepare(
'UPDATE `'.$_DDATA['tbprefix'].'crawldata` SET `category`=:category WHERE `content_checksum`=:content_checksum;'
);
foreach ($_POST['os_index_pages'] as $content_checksum) {
$update->execute(array(
'category' => $_POST['os_apply_new_category'],
'content_checksum' => $content_checksum
));
$err = $update->errorInfo();
if ($err[0] != '00000') {
$_SESSION['error'][] = 'Database error on attempt to update category: '.$err[2];
break;
foreach ($_POST['os_index_pages'] as $content_checksum) {
$update->execute(array(
'category' => $_POST['os_apply_new_category'],
'content_checksum' => $content_checksum
));
$err = $update->errorInfo();
if ($err[0] != '00000') {
$_SESSION['error'][] = 'Database error on attempt to update category: '.$err[2];
break;
}
}
}
$_SESSION['index_filter_category'] = '<none>';
$_SESSION['index_filter_category'] = '<none>';
} else $_SESSION['error'][] = 'Category names may only contain letters, numbers, spaces or dashes.';
} else $_SESSION['error'][] = 'Please supply a category name.';
break;
case 'priority':
if (isset($_POST['os_apply_new_priority'])) {
if (!empty($_POST['os_apply_new_priority'])) {
$_POST['os_apply_new_priority'] = (float)$_POST['os_apply_new_priority'];
$_POST['os_apply_new_priority'] = max(0, min(1, $_POST['os_apply_new_priority']));
$_POST['os_apply_new_priority'] = round($_POST['os_apply_new_priority'], 5);
@ -625,7 +627,7 @@ if (!$_SESSION['admin_username']) {
// ***** Page Index >> Text Match filter
case 'os_index_filter_text':
if (!isset($_POST['os_index_filter_text'])) $_POST['os_index_filter_text'] = '';
if (empty($_POST['os_index_filter_text'])) $_POST['os_index_filter_text'] = '';
$_POST['os_index_filter_text'] = filter_var($_POST['os_index_filter_text'], FILTER_SANITIZE_URL);
$_SESSION['index_filter_text'] = $_POST['os_index_filter_text'];
$_SESSION['index_page'] = 1;
@ -806,7 +808,7 @@ if (!$_SESSION['admin_username']) {
foreach ($select[$key]['words'] as $index => $word) {
if (!$word) continue;
if (!isset($words[$word])) {
if (empty($words[$word])) {
$words[$word] = 1;
} else $words[$word]++;
}
@ -1007,8 +1009,7 @@ if (os_crawldata.length) {
}
os_request.q = os_params.get('q');
if (!os_request.q)
os_request.q = '';
if (!os_request.q) os_request.q = '';
os_request.q = os_request.q.trim().replace(/\s/, ' ').replace(/ {2,}/, ' ');
@ -1046,8 +1047,8 @@ if (os_crawldata.length) {
// Just count it as a 'phrase' of one word, functionally equivalent
os_sdata.terms.push(['phrase', t.substring(1), false]);
// Leading - or ! means negative, a MUST exclude
} else if (t[0] == '-' || t[0] == '!') {
// Leading - means negative, a MUST exclude
} else if (t[0] == '-') {
os_sdata.terms.push(['exclude', t.substring(1), false]);
// Restrict to a specific filetype (not yet implemented)
@ -1387,7 +1388,7 @@ document.write(mustache.render(
));<?php
// Dodgy character check on output
// Dodgy character check on javascript output
// [^\w\s()\[\]{};:.‖‘’‟„…/@©~®§⇔⇕⇒⇨⇩↪&\\^<>›×™*·,±_²°|≥!#$¢£+≤=•«%½»?"'-]
@ -1408,7 +1409,7 @@ document.write(mustache.render(
default:
header('Content-type: text/plain; charset='.strtolower($_ODATA['s_charset']));
var_dump($_POST);
die();
exit();
}
@ -1421,7 +1422,7 @@ document.write(mustache.render(
} else {
// Set new Page Index pagination value
if (isset($_POST['os_index_hidden_pagination']) && $_POST['os_index_hidden_pagination']) {
if (!empty($_POST['os_index_hidden_pagination'])) {
$_POST['os_index_hidden_pagination'] = (int)$_POST['os_index_hidden_pagination'];
if (in_array($_POST['os_index_hidden_pagination'], $_RDATA['admin_pagination_options'])) {
OS_setValue('admin_index_pagination', $_POST['os_index_hidden_pagination']);
@ -1433,8 +1434,8 @@ document.write(mustache.render(
}
// Select a Page Index Category filter
if (isset($_POST['os_index_new_filter_category']) && $_POST['os_index_new_filter_category']) {
if (isset($_RDATA['s_category_list'][$_POST['os_index_new_filter_category']])) {
if (!empty($_POST['os_index_new_filter_category'])) {
if (!empty($_RDATA['s_category_list'][$_POST['os_index_new_filter_category']])) {
$_SESSION['index_filter_category'] = $_POST['os_index_new_filter_category'];
$_SESSION['index_page'] = 1;
}
@ -1444,7 +1445,7 @@ document.write(mustache.render(
}
// Select a Page Index Status filter
if (isset($_POST['os_index_new_filter_status']) && $_POST['os_index_new_filter_status']) {
if (!empty($_POST['os_index_new_filter_status'])) {
if (in_array($_POST['os_index_new_filter_status'], $_RDATA['index_status_list'])) {
$_SESSION['index_filter_status'] = $_POST['os_index_new_filter_status'];
$_SESSION['index_page'] = 1;
@ -1457,17 +1458,17 @@ document.write(mustache.render(
// Unknown POST command
header('Content-type: text/plain; charset='.strtolower($_ODATA['s_charset']));
var_dump($_POST);
die();
exit();
}
// Select a new Administration UI page
} else if (isset($_GET['page'])) {
if (isset($_RDATA['admin_pages'][$_GET['page']]))
} else if (!empty($_GET['page'])) {
if (!empty($_RDATA['admin_pages'][$_GET['page']]))
$_SESSION['admin_page'] = $_GET['page'];
// Select a new page within the Page Index list
} else if (isset($_GET['ipage'])) {
} else if (!empty($_GET['ipage'])) {
$_GET['ipage'] = (int)$_GET['ipage'];
$_SESSION['index_page'] = $_GET['ipage'];
@ -2474,11 +2475,14 @@ document.write(mustache.render(
</li>
<li class="list-group-item">
<label class="d-flex w-100">
<strong class="pe-2">Current Cache Size</strong>
<strong class="pe-2">Current Cache Size
<img src="img/help.svg" alt="Information" class="align-middle svg-icon mb-1"
data-bs-toggle="tooltip" data-bs-placement="top" title="The Search Result Cache is cleared after each successful crawl, or you can purge the cache manually below.">
</strong>
<var class="text-end flex-grow-1 text-nowrap"><?php
if (!function_exists('gzcompress')) { ?>
<img src="img/warning.svg" alt="Notice" class="align-middle svg-icon mb-1 me-1"
data-bs-toggle="tooltip" data-bs-placement="top" title="PHP's GZip functions are not enabled. This means your Search Cache won't be able to store as many results. You may want to consider increasing the Search Result Cache limit to compensate for this."><?php
data-bs-toggle="tooltip" data-bs-placement="top" title="PHP's GZip functions are not enabled. This means your Search Result Cache won't be able to store as many results. You may want to consider increasing the Search Result Cache limit to compensate for this."><?php
}
echo OS_readSize($_RDATA['s_cache_size'], true);
?></var>

View file

@ -6,6 +6,11 @@ $_RDATA = array();
require __DIR__.'/config.ini.php';
// Check version compatibility
if (PHP_VERSION_ID < 70200)
throw new Exception('Orcinus Site Search requires a PHP version ">= 7.2.0". You are running '.PHP_VERSION.'.');
// ***** Connect to the database
$_DDATA['pdo'] = new PDO(
'mysql:host='.$_DDATA['hostname'].';dbname='.$_DDATA['database'].';charset=UTF8',
@ -13,7 +18,9 @@ $_DDATA['pdo'] = new PDO(
$_DDATA['password']
);
$err = $_DDATA['pdo']->errorInfo();
if ($err[0]) die('Fatal database connection error: '.$err[0]);
if ($err[0])
throw new Exception('Database connection error: '.$err[2]);
$_DDATA['pdo']->setAttribute(PDO::ATTR_EMULATE_PREPARES, false);
$_DDATA['pdo']->setAttribute(PDO::ATTR_DEFAULT_FETCH_MODE, PDO::FETCH_ASSOC);
@ -21,11 +28,12 @@ $_DDATA['tables'] = $_DDATA['pdo']->query(
'SHOW TABLES FROM `'.$_DDATA['database'].'` LIKE \''.$_DDATA['tbprefix'].'%\';'
);
$err = $_DDATA['tables']->errorInfo();
if ($err[0] == '00000') {
$_DDATA['tables'] = $_DDATA['tables']->fetchAll(PDO::FETCH_NUM);
foreach($_DDATA['tables'] as $key => $value)
$_DDATA['tables'][$key] = $value[0];
} else die('Fatal database read error: '.$err[2]);
if ($err[0] != '00000')
throw new Exception('Database table read error: '.$err[2]);
$_DDATA['tables'] = $_DDATA['tables']->fetchAll(PDO::FETCH_NUM);
foreach($_DDATA['tables'] as $key => $value)
$_DDATA['tables'][$key] = $value[0];
// ***** Create the configuration table if it doesn't exist
@ -33,7 +41,9 @@ if (!in_array($_DDATA['tbprefix'].'config', $_DDATA['tables'])) {
$create = $_DDATA['pdo']->query(
'CREATE TABLE `'.$_DDATA['tbprefix'].'config` (
`version` VARCHAR(8) NOT NULL,
`admin_from` TINYTEXT NOT NULL,
`admin_email` TEXT NOT NULL,
`admin_install_root` TINYTEXT NOT NULL,
`admin_install_domain` TINYTEXT NOT NULL,
`admin_index_pagination` SMALLINT UNSIGNED NOT NULL,
`sp_key` TINYTEXT NOT NULL,
@ -92,22 +102,26 @@ if (!in_array($_DDATA['tbprefix'].'config', $_DDATA['tables'])) {
PRIMARY KEY (`version`)
) ENGINE = MyISAM, COLLATE = utf8_general_ci;'
);
$err = $create->errorInfo();
if ($err[0] != '00000')
throw new Exception('Could not create configuration database table: '.$err[2]);
}
$testConf = $_DDATA['pdo']->query(
'SELECT `version` FROM `'.$_DDATA['tbprefix'].'config`;'
);
$err = $testConf->errorInfo();
if ($err[0] == '00000') {
$testConf = $testConf->fetchAll();
} else die('Fatal configuration table read error: '.$err[2]);
if ($err[0] != '00000')
throw new Exception('Configuration table read error: '.$err[2]);
// ***** Set default configuration table values
if (!count($testConf)) {
if (!count($testConf->fetchAll())) {
$insert = $_DDATA['pdo']->query(
'INSERT INTO `'.$_DDATA['tbprefix'].'config` SET
`version`=\'3.0\',
`admin_from`=\'\',
`admin_email`=\'\',
`admin_install_root`=\'\',
`admin_install_domain`=\'\',
`admin_index_pagination`=100,
`sp_key`=\'\',
@ -140,8 +154,8 @@ if (!count($testConf)) {
`sp_ifmodifiedsince`=1,
`sp_cookies`=1,
`sp_sitemap_file`=\'\',
`sp_sitemap_hostname`=\''.$_SERVER['HTTP_HOST'].'\',
`sp_useragent`=\'OrcinusSearch/3.0 (https://greywyvern.com/orcinus/)\',
`sp_sitemap_hostname`=\'\',
`sp_useragent`=\'OrcinusCrawler/3.0 (https://greywyvern.com/orcinus/)\',
`sp_crawling`=0,
`sp_cancel`=0,
`sp_progress`=\'\',
@ -165,8 +179,12 @@ if (!count($testConf)) {
`jw_compression`=25
;'
);
$err = $insert->errorInfo();
if ($err[0] != '00000' || !$insert->rowCount())
throw new Exception('Could not fill configuration database table: '.$err[2]);
}
// ***** Create the crawldata table if it doesn't exist
if (!in_array($_DDATA['tbprefix'].'crawldata', $_DDATA['tables'])) {
$create = $_DDATA['pdo']->query(
@ -193,6 +211,9 @@ if (!in_array($_DDATA['tbprefix'].'crawldata', $_DDATA['tables'])) {
UNIQUE `content_checksum` (`content_checksum`)
) ENGINE = MyISAM, COLLATE = utf8_general_ci;'
);
$err = $create->errorInfo();
if ($err[0] != '00000')
throw new Exception('Could not create crawldata database table: '.$err[2]);
}
// ***** Create the query log table if it doesn't exist
@ -206,6 +227,9 @@ if (!in_array($_DDATA['tbprefix'].'query', $_DDATA['tables'])) {
`cache` MEDIUMBLOB NOT NULL
) ENGINE = MyISAM, COLLATE = utf8_general_ci;'
);
$err = $create->errorInfo();
if ($err[0] != '00000')
throw new Exception('Could not create query log database table: '.$err[2]);
}
@ -279,7 +303,7 @@ function OS_getValue($columnName) {
/**
* Initialize a generic cURL connection
* - If creating a cURL connection fails, we should try some fallbacks
* - If creating a cURL connection fails, we could try some fallbacks
*
*/
function OS_getConnection() {
@ -317,16 +341,28 @@ date_default_timezone_set($_ODATA['sp_timezone']);
ini_set('mbstring.substitute_character', 'none');
// Determine the correct HTTP scheme by which we are accessing the page
if (!isset($_SERVER['REQUEST_SCHEME'])) {
if (!empty($_SERVER['HTTP_X_FORWARDED_PROTO'])) {
$_SERVER['REQUEST_SCHEME'] = $_SERVER['HTTP_X_FORWARDED_PROTO'];
} else if (!empty($_SERVER['HTTPS'])) {
$_SERVER['REQUEST_SCHEME'] = ($_SERVER['HTTPS'] == 'on') ? 'https' : 'http';
} else if (!empty($_SERVER['SERVER_PORT'])) {
if ($_SERVER['SERVER_PORT'] == 443) {
$_SERVER['REQUEST_SCHEME'] = 'https';
} else $_SERVER['REQUEST_SCHEME'] = 'http';
} else $_SERVER['REQUEST_SCHEME'] = '';
}
// ***** Determine the install domain from run location
if (!$_ODATA['admin_install_domain']) {
if (isset($_SERVER['REQUEST_SCHEME']) && $_SERVER['REQUEST_SCHEME'] &&
isset($_SERVER['HTTP_HOST']) && $_SERVER['HTTP_HOST']) {
if ($_SERVER['REQUEST_SCHEME'] && !empty($_SERVER['HTTP_HOST'])) {
$base = $_SERVER['REQUEST_SCHEME'].'://'.$_SERVER['HTTP_HOST'];
if (isset($_SERVER['SCRIPT_URI']) && $_SERVER['SCRIPT_URI']) {
if (!empty($_SERVER['SCRIPT_URI'])) {
$psuri = parse_url($_SERVER['SCRIPT_URI']);
if ($psuri && isset($psuri['port']) && !is_null($psuri['port']))
if ($psuri && !empty($psuri['port']))
$base .= ':'.$psuri['port'];
} else if (isset($_SERVER['SERVER_PORT'])) {
} else if (!empty($_SERVER['SERVER_PORT'])) {
if ($_SERVER['SERVER_PORT'] == '80') {
if ($_SERVER['REQUEST_SCHEME'] != 'http')
$base .= ':'.$_SERVER['SERVER_PORT'];
@ -340,11 +376,22 @@ if (!$_ODATA['admin_install_domain']) {
}
if (!$_ODATA['sp_starting']) {
if (!$_ODATA['admin_install_domain']) {
die('Fatal error, could not determine install domain. Please run this script from a web browser.');
throw new Exception('Could not determine install domain. Please run this script from a web browser.');
} else OS_setValue('sp_starting', $_ODATA['admin_install_domain'].'/');
}
// ***** Set the admin From: email value
if (!$_ODATA['admin_from']) {
if (!empty($_SERVER['SERVER_ADMIN'])) {
OS_setValue('admin_from', $_SERVER['SERVER_ADMIN']);
} else if (!empty($_SERVER['MAILTO'])) {
OS_setValue('admin_from', $_SERVER['MAILTO']);
} else if (isset($_SESSION['error']))
$_SESSION['error'][] = 'Could not determine the admin email for this server. Please set your server\'s \'SERVER_ADMIN\' value.';
}
// ***** Load and Initialize PHPMailer
if (!class_exists('PHPMailer\PHPMailer\PHPMailer')) {
if (file_exists(__DIR__.'/PHPMailer/PHPMailer.php')) {
@ -355,8 +402,10 @@ if (!class_exists('PHPMailer\PHPMailer\PHPMailer')) {
}
if (class_exists('PHPMailer\PHPMailer\PHPMailer')) {
$_MAIL = new PHPMailer\PHPMailer\PHPMailer();
$_MAIL->From = $_SERVER['SERVER_ADMIN'];
$_MAIL->FromName = "Orcinus Site Search Crawler";
if ($_ODATA['admin_from']) {
$_MAIL->From = $_ODATA['admin_from'];
$_MAIL->FromName = "Orcinus Crawler";
}
$_MAIL->CharSet = $_ODATA['s_charset'];
if (count($ad = $_MAIL->parseAddresses($_ODATA['admin_email'])))
foreach ($ad as $a) $_MAIL->AddAddress($a['address'], $a['name']);
@ -365,8 +414,7 @@ if (class_exists('PHPMailer\PHPMailer\PHPMailer')) {
// ***** Load the default Search Result Template
if (!$_ODATA['s_result_template']) {
OS_setValue('s_result_template', <<<ORCINUS
<section id="os_results">
OS_setValue('s_result_template', '<section id="os_results">
<!-- Orcinus Site Search {{version}} - HTML Template -->
{{#errors}}
@ -473,7 +521,7 @@ if (!$_ODATA['s_result_template']) {
<li>Search terms with fewer than {{limit_term_length}} characters are ignored</li>
<li>Enclose groups of terms in quotes ("") to search for phrases</li>
<li>Prefix terms with a plus-sign (+) to make them important</li>
<li>Prefix terms with a minus-sign (-) or exclamation point (!) to exclude terms</li>
<li>Prefix terms with a minus-sign (-) to exclude terms</li>
</ul>
</div>
{{/searched}}
@ -521,17 +569,16 @@ if (!$_ODATA['s_result_template']) {
</small>
</p>
</footer>
</section>
ORCINUS);
</section>');
}
// {{{{{ Initialize the Mustache templating engine
class OS_Mustache {
public $errors;
public function __construct() {}
function __construct() {}
public function addError($text) {
function addError($text) {
if (!$this->errors) {
$this->errors = new stdClass();
$this->errors->error_list = array();
@ -540,7 +587,7 @@ class OS_Mustache {
}
// We'll only autoload the Mustache engine if we need it
public function render() {
function render() {
global $_ODATA;
require_once __DIR__.'/Mustache/Autoloader.php';
@ -719,6 +766,7 @@ $_RDATA['s_latin'] = array(
'x' => array('×'),
'y' => array('ý', 'Ý', 'ÿ', 'Ÿ'),
'z' => array('ź', 'Ź', 'ž', 'Ž', 'ż', 'Ż'),
'!' => array('¡'),
'?' => array('¿')
);
$_RDATA['s_filetypes'] = array(
@ -729,16 +777,31 @@ $_RDATA['s_filetypes'] = array(
'TXT' => array('text/plain')
);
// Store the DOCUMENT_ROOT while we have access to it
if (!empty($_SERVER['DOCUMENT_ROOT'])) {
if ($_SERVER['DOCUMENT_ROOT'] != $_ODATA['admin_install_root'])
OS_setValue('admin_install_root', $_SERVER['DOCUMENT_ROOT']);
} else $_SERVER['DOCUMENT_ROOT'] = $_ODATA['admin_install_root'];
// Adjust the REQUEST_URI to remove query strings
if (isset($_SERVER['REQUEST_URI']))
$_SERVER['REQUEST_URI'] = preg_replace('/\?.*$/', '', $_SERVER['REQUEST_URI']);
// Locate the sitemap file if given
if (!$_ODATA['sp_sitemap_hostname'] && !empty($_SERVER['HTTP_HOST']))
OS_setValue('sp_sitemap_hostname', $_SERVER['HTTP_HOST']);
if ($_ODATA['sp_sitemap_file']) {
$sitemapPath = ($_ODATA['sp_sitemap_file'][0] == '/') ? $_SERVER['DOCUMENT_ROOT'] : __DIR__.'/';
$sitemapPath = ($_ODATA['sp_sitemap_file'][0] == '/') ? $_ODATA['admin_install_root'] : __DIR__.'/';
$sitemapPath .= $_ODATA['sp_sitemap_file'];
$sitemapPath = preg_replace(array('/\/[^\/]+\/\.\.\//', '/\/\.\//'), '/', $sitemapPath);
// If we did not try going beyond the document_root
if (strpos($sitemapPath, $_SERVER['DOCUMENT_ROOT']) === 0) {
if (strpos($sitemapPath, $_ODATA['admin_install_root']) === 0) {
if (file_exists($sitemapPath)) {
$sitemapNewFile = str_replace($_SERVER['DOCUMENT_ROOT'], '', $sitemapPath);
$sitemapNewFile = str_replace($_ODATA['admin_install_root'], '', $sitemapPath);
if ($sitemapNewFile != $_ODATA['sp_sitemap_file'])
OS_setValue('sp_sitemap_file', $sitemapNewFile);
if (is_writable($sitemapPath)) {
@ -754,9 +817,6 @@ if ($_ODATA['sp_sitemap_file']) {
} else $_RDATA['sp_sitemap_file'] = '';
$_SERVER['REQUEST_URI'] = preg_replace('/\?.*$/', '', $_SERVER['REQUEST_URI']);
$_RDATA['x_generated_by'] = 'X-Generated-By: Orcinus Site Search/'.$_ODATA['version'];
header($_RDATA['x_generated_by']);

View file

@ -1,8 +1,8 @@
<?php /* ***** Orcinus Site Search - Web Crawling Engine *********** */
$_DEBUGMODE = ($_SERVER['REQUEST_METHOD'] == 'GET') ? false : false;
require __DIR__.'/config.php';
$_RDATA['debug'] = false;
/**
@ -10,16 +10,20 @@ require __DIR__.'/config.php';
*
*/
function OS_crawlLog($text, $level = 0) {
global $_RDATA, $_DEBUGMODE;
global $_RDATA;
switch ($level) {
case 1: $level = ''; break;
case 2: $level = '[ERROR] '; break;
default: $level = ' -> ';
case 1: $prefix = ''; break;
case 2: $prefix = '[ERROR] '; break;
default: $prefix = ' -> ';
}
fwrite($_RDATA['sp_log'], $level.$text."\n");
if ($_DEBUGMODE) echo $level.$text."\n";
fwrite($_RDATA['sp_log'], $prefix.$text."\n");
if ($_RDATA['debug'] ||
($_SERVER['REQUEST_METHOD'] == 'CLI' &&
$level >= $_RDATA['sp_log_clilevel'])) {
echo $prefix.$text."\n";
}
}
@ -33,7 +37,7 @@ function OS_formatURL($_, $base) {
$_ = str_replace('%20', ' ', $_);
$dirbase = preg_replace('/(?<!:\/)\/[^\/]*$/', '', $base).'/';
$pdb = parse_url($dirbase);
$port = (isset($pdb['port']) && !is_null($pdb['port'])) ? ':'.$pdb['port'] : '';
$port = (!empty($pdb['port'])) ? ':'.$pdb['port'] : '';
if (substr($_, 0, 3) == '../') {
$p = preg_replace('/\/[^\/]*\/$/', '/', $pdb['path']);
@ -69,7 +73,7 @@ function OS_filterURL($_, $base) {
if (!preg_match('/^https?:\/\//', $_))
$_ = OS_formatURL($_, $base);
if (isset($_RDATA['sp_filter'][$_]))
if (!empty($_RDATA['sp_filter'][$_]))
return $_RDATA['sp_filter'][$_];
$_RDATA['sp_filter'][$_] = '';
@ -107,7 +111,7 @@ function OS_filterURL($_, $base) {
return $_RDATA['sp_filter'][$_] = 'ignore-extension';
// robots.txt rules
if (isset($_RDATA['sp_robots'][$plink['host']]))
if (!empty($_RDATA['sp_robots'][$plink['host']]))
foreach ($_RDATA['sp_robots'][$plink['host']] as $disallowURL)
if (strpos($_, $disallowURL) === 0)
return $_RDATA['sp_filter'][$_] = 'robots-txt';
@ -216,7 +220,7 @@ function OS_fetchURL($url, $referer = '') {
OS_crawlLog($_['errno'], 1);
OS_crawlLog($_['error'], 1);
OS_crawlLog(print_r($_['info'], true), 1);
die('Uncaught cURL error');
throw new Exception('Uncaught cURL error');
}
@ -241,7 +245,7 @@ function OS_parseURLContent($_) {
// Detect MIME-type using extension?
if (!isset($_['info']['content_type']))
if (empty($_['info']['content_type']))
$_['info']['content_type'] = 'text/plain';
// Parse MIME-type
@ -256,11 +260,12 @@ function OS_parseURLContent($_) {
$_['info']['charset'] = 'ISO-8859-1';
$_['info']['sha1'] = sha1($_['body'], true);
while (strpos($_['body'], "\x1f\x8b") === 0)
$_['body'] = gzinflate(substr($_['body'], 10));
$_['info']['sha1'] = sha1($_['body']);
switch ($_['info']['mime_type']) {
case 'text/plain':
@ -349,7 +354,7 @@ function OS_parseURLContent($_) {
$head = $document->getElementsByTagName('head');
$base = $head[0]->getElementsByTagName('base');
if (isset($base[0]))
if (!empty($base[0]))
for ($x = 0; $x < count($base[0]->attributes); $x++)
if (strtolower($base[0]->attributes[$x]->name) == 'href')
$_['base'] = filter_var($base[0]->attributes[$x]->value, FILTER_SANITIZE_URL);
@ -604,9 +609,9 @@ function OS_parseURLContent($_) {
$_['title'] = basename($_['info']['url']);
$meta = $pdf->getDetails();
if (isset($meta['Title'])) $_['title'] = strtr($meta['Title'], $_RDATA['sp_utf_replace']);
if (isset($meta['Subject'])) $_['description'] = strtr($meta['Subject'], $_RDATA['sp_utf_replace']);
if (isset($meta['Keywords'])) $_['keywords'] = strtr($meta['Keywords'], $_RDATA['sp_utf_replace']);
if (!empty($meta['Title'])) $_['title'] = strtr($meta['Title'], $_RDATA['sp_utf_replace']);
if (!empty($meta['Subject'])) $_['description'] = strtr($meta['Subject'], $_RDATA['sp_utf_replace']);
if (!empty($meta['Keywords'])) $_['keywords'] = strtr($meta['Keywords'], $_RDATA['sp_utf_replace']);
// remove escaped whitespace
$_['title'] = str_replace(array("\\\n\r", "\\\n"), '', $_['title']);
@ -678,7 +683,7 @@ function OS_parseURLContent($_) {
*
*/
function OS_crawlCleanUp() {
global $_DDATA, $_ODATA, $_RDATA, $_cURL, $_MAIL, $_DEBUGMODE;
global $_DDATA, $_ODATA, $_RDATA, $_cURL, $_MAIL;
// If the crawl has already been canceled, don't bother
if (!$_ODATA['sp_crawling']) return;
@ -796,10 +801,10 @@ function OS_crawlCleanUp() {
if (!$_MAIL->Send()) OS_crawlLog('Could not send notification email', 2);
}
$cronMessage = 'Crawl completed successfully';
$cliMessage = 'Crawl completed successfully';
$jsonMessage = json_encode(array(
'status' => 'Success',
'message' => $cronMessage
'message' => $cliMessage
), JSON_INVALID_UTF8_IGNORE);
// We truncated the search table but FAILED to populate it!
@ -820,10 +825,10 @@ function OS_crawlCleanUp() {
if (!$_MAIL->Send()) OS_crawlLog('Could not send notification email', 2);
}
$cronMessage = 'Could not populate search table; search table is currently empty!';
$cliMessage = 'Could not populate search table; search table is currently empty!';
$jsonMessage = json_encode(array(
'status' => 'Error',
'message' => $cronMessage
'message' => $cliMessage
), JSON_INVALID_UTF8_IGNORE);
}
@ -842,10 +847,10 @@ function OS_crawlCleanUp() {
if (!$_MAIL->Send()) OS_crawlLog('Could not send notification email', 2);
}
$cronMessage = 'Crawl failed; see the log for details';
$cliMessage = 'Crawl failed; see the log for details';
$jsonMessage = json_encode(array(
'status' => 'Error',
'message' => $cronMessage
'message' => $cliMessage
), JSON_INVALID_UTF8_IGNORE);
}
@ -866,25 +871,22 @@ function OS_crawlCleanUp() {
OS_setValue('sp_crawling', 0);
if ($_SERVER['REQUEST_METHOD'] != 'CRON') {
if (!$_DEBUGMODE)
if ($_SERVER['REQUEST_METHOD'] != 'CLI') {
if (!$_RDATA['debug'])
header('Content-type: application/json; charset='.strtolower($_ODATA['s_charset']));
die($jsonMessage);
} else die($cronMessage);
} else die($cliMessage."\n");
}
// This is most likely a crontab request
if (!isset($_SERVER['REQUEST_METHOD'])) {
$_SERVER['REQUEST_METHOD'] = 'CRON';
chdir(dirname($_SERVER['argv'][0]));
if (empty($_SERVER['REQUEST_METHOD'])) $_SERVER['REQUEST_METHOD'] = '';
} else {
switch ($_SERVER['REQUEST_METHOD']) {
/* ***** Handle POST Requests ************************************** */
if ($_SERVER['REQUEST_METHOD'] == 'POST') {
/* ***** Handle POST Requests ************************************ */
case 'POST':
// JSON POST request
// These are usually sent by javascript fetch()
@ -894,10 +896,10 @@ if (!isset($_SERVER['REQUEST_METHOD'])) {
$response = array();
if (!isset($_POST->action)) $_POST->action = '';
if (empty($_POST->action)) $_POST->action = '';
switch ($_POST->action) {
case 'crawl':
if (isset($_POST->sp_key) &&
if (!empty($_POST->sp_key) &&
$_ODATA['sp_key'] &&
$_POST->sp_key == $_ODATA['sp_key']) {
if ($_ODATA['sp_crawling']) {
@ -925,7 +927,7 @@ if (!isset($_SERVER['REQUEST_METHOD'])) {
$lines = file($_ODATA['sp_log'], FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
} else $lines = explode("\n", $_ODATA['sp_log']);
if (!isset($_POST->grep)) $_POST->grep = 'all';
if (empty($_POST->grep)) $_POST->grep = '';
switch ($_POST->grep) {
case 'all': break;
case 'errors': $lines = preg_grep('/^[\[\*]/', $lines); break;
@ -949,11 +951,11 @@ if (!isset($_SERVER['REQUEST_METHOD'])) {
// IF the crawler 'time_start' is more than 'timeout_crawl'
// seconds ago, or the 'force' token is set, the crawler is
// probably stuck. Unstick it.
if (!isset($_POST->force)) $_POST->force = '';
if (empty($_POST->force)) $_POST->force = '';
if ($_POST->force || time() - $_ODATA['sp_time_start'] > $_ODATA['sp_timeout_crawl']) {
OS_setValue('sp_crawling', 0);
if (!isset($_POST->reason) || !$_POST->reason)
if (empty($_POST->reason))
$_POST->reason = 'The crawler halted unexpectedly';
if (strpos($_ODATA['sp_log'], "\n") === false && file_exists($_ODATA['sp_log'])) {
@ -1007,11 +1009,26 @@ if (!isset($_SERVER['REQUEST_METHOD'])) {
die($_ODATA['sp_useragent']);
}
// Don't do anything for GET requests
} else if ($_SERVER['REQUEST_METHOD'] == 'GET') {
// Allow CLI requests through
case '':
// Treated as a CLI run only when argv[0] is set and equals PHP_SELF;
// anything else reaching this case outputs the user-agent string and exits
if (!empty($_SERVER['argv'][0]) && $_SERVER['argv'][0] == $_SERVER['PHP_SELF']) {
$_SERVER['REQUEST_METHOD'] = 'CLI';
// An optional first argument of -log=0, -log=1 or -log=2 sets the CLI
// log level; when it is absent or malformed the level is 2
if (!empty($_SERVER['argv'][1]) && preg_match('/^-log=([012])$/', $_SERVER['argv'][1], $match)) {
$_RDATA['sp_log_clilevel'] = (int)$match[1];
} else $_RDATA['sp_log_clilevel'] = 2;
} else die($_ODATA['sp_useragent']);
break;
// Don't do anything for GET requests, unless in debug mode
// NOTE(review): no 'break' is visible between this case and 'default', so
// in debug mode execution appears to fall through into 'default' and exit
// anyway -- confirm against the full file
case 'GET':
header('Content-type: text/plain; charset='.strtolower($_ODATA['s_charset']));
if (!$_DEBUGMODE) die($_ODATA['sp_useragent']);
}
if (!$_RDATA['debug']) die($_ODATA['sp_useragent']);
// Exit for all other request types
default:
header('Content-type: text/plain; charset='.strtolower($_ODATA['s_charset']));
die($_ODATA['sp_useragent']);
}
@ -1251,7 +1268,7 @@ while ($_cURL && count($_QUEUE)) {
// Check robots.txt for newly encountered hostnames
// NOTE(review): parse_url() can return false or omit 'host' for malformed
// URLs; $purl['host'] is read below without a guard -- confirm $url is
// always absolute at this point
$purl = parse_url($url);
// Build a ':port' suffix only when the URL carries a port, else ''
$port = (isset($purl['port']) && !is_null($purl['port'])) ? ':'.$purl['port'] : '';
$port = (!empty($purl['port'])) ? ':'.$purl['port'] : '';
// First time this host is seen: create its (initially empty) robots entry
// and log the fetch
if (!isset($_RDATA['sp_robots'][$purl['host']])) {
$_RDATA['sp_robots'][$purl['host']] = array();
OS_crawlLog('Fetching robots.txt for domain: '.$purl['host'], 1);
@ -1265,7 +1282,7 @@ while ($_cURL && count($_QUEUE)) {
// Split the robots.txt body on "\n" and collect rules per user-agent
$robolines = explode("\n", $robotstxt['content']);
foreach ($robolines as $line) {
// 'User-agent:' line: remember the agent name in $robot and make sure it
// has 'disallow' and 'allow' lists.
// NOTE(review): (.*) is greedy, so the trailing \s* never strips anything;
// a "\r" left by CRLF line endings would stay in the captured name unless
// it is trimmed elsewhere -- confirm
if (preg_match('/^user-agent\s*:\s*(.*)\s*$/i', $line, $r)) {
if (!isset($robots[$robot = $r[1]]))
if (empty($robots[$robot = $r[1]]))
$robots[$robot] = array('disallow' => array(), 'allow' => array());
// 'Allow:' / 'Disallow:' line: append the OS_formatURL() result to the
// matching list of the most recently seen agent.
// NOTE(review): this pattern is not anchored at the line start, and $robot
// is only assigned by a preceding User-agent line -- confirm that rules
// appearing before any User-agent line cannot reach this branch
} else if (preg_match('/((dis)?allow)\s*:\s*(.*)\s*$/i', $line, $r))
$robots[$robot][strtolower($r[1])][] = OS_formatURL($r[3], $url);
@ -1284,7 +1301,7 @@ while ($_cURL && count($_QUEUE)) {
}
}
// When the debug flag is set, log the memory currently allocated to PHP
// (memory_get_usage(true)) before crawling the next URL
if ($_DEBUGMODE)
if ($_RDATA['debug'])
OS_crawlLog('Memory used: '.OS_readSize(memory_get_usage(true)), 1);
// Always log the URL about to be crawled together with its depth
OS_crawlLog('Crawling: '.$url.' (Depth: '.$depth.')', 1);
@ -1319,7 +1336,7 @@ while ($_cURL && count($_QUEUE)) {
// Only pages not flagged 'noindex' are considered here
if (!$data['info']['noindex']) {
// Prevent duplicate content
// 'sp_sha1' is looked up by this page's SHA-1; the value stored there is
// reported in the log as the already indexed page
if (isset($_RDATA['sp_sha1'][$data['info']['sha1']])) {
if (!empty($_RDATA['sp_sha1'][$data['info']['sha1']])) {
OS_crawlLog('Content is a duplicate of already indexed page: '.$_RDATA['sp_sha1'][$data['info']['sha1']], 2);
OS_crawlLog('Consider editing faulty redirects, or setting a \'canonical\' <link> element to avoid this', 0);
@ -1361,7 +1378,7 @@ while ($_cURL && count($_QUEUE)) {
// A zero or negative file time falls back to the current time
if ($data['info']['filetime'] <= 0)
$data['info']['filetime'] = time();
// Count the page as 'Updated' when $row carries a url, otherwise as 'New'
// ($row is presumably the previously stored record for this URL -- its
// origin is outside this excerpt)
if (isset($row['url'])) {
if (!empty($row['url'])) {
$_RDATA['sp_status']['Updated']++;
} else $_RDATA['sp_status']['New']++;
@ -1371,7 +1388,7 @@ while ($_cURL && count($_QUEUE)) {
// Reuse the 'last_modified' value from $row as the file time
$data['info']['filetime'] = $row['last_modified'];
}
// ':port' suffix for the base URL, or '' when the parsed URL has no port
$port = (isset($data['url']['port']) && !is_null($data['url']['port'])) ? ':'.$data['url']['port'] : '';
$port = (!empty($data['url']['port'])) ? ':'.$data['url']['port'] : '';
// Execute the prepared $insertTemp statement with this page's values
$insertTemp->execute(array(
'url' => $url,
// 'url_base' is scheme://host followed by the optional ':port' suffix
'url_base' => $data['url']['scheme'].'://'.$data['url']['host'].$port,

View file

@ -35,7 +35,7 @@ if ($_RDATA['s_searchable_pages']) {
// Template data for the search form
$_TEMPLATE->searchable = new stdClass();
// NOTE(review): REQUEST_URI is client-controlled; presumably the template
// escapes it on output -- confirm
$_TEMPLATE->searchable->form_action = $_SERVER['REQUEST_URI'];
// Fall back to the '<none>' category when 'c' is missing from the request
// or does not match an entry in 's_category_list'
if (!isset($_REQUEST['c']) || !isset($_RDATA['s_category_list'][$_REQUEST['c']]))
if (empty($_REQUEST['c']) || empty($_RDATA['s_category_list'][$_REQUEST['c']]))
$_REQUEST['c'] = '<none>';
// Remaining category handling applies only when the list has more than two entries
if (count($_RDATA['s_category_list']) > 2) {
@ -50,8 +50,7 @@ if ($_RDATA['s_searchable_pages']) {
}
}
// Make sure the query 'q' exists before it is normalised.
// NOTE(review): the isset()/is_string() variant also resets non-string
// input (e.g. ?q[]=x); the empty() variant does not, so an array would
// reach trim() below. empty() also treats the string "0" as empty, which
// turns a search for "0" into an empty query -- confirm this is intended
if (!isset($_REQUEST['q']) || !is_string($_REQUEST['q']))
$_REQUEST['q'] = '';
if (empty($_REQUEST['q'])) $_REQUEST['q'] = '';
// Trim, turn every whitespace character into a plain space, then collapse
// runs of two or more spaces into one (the patterns are applied in order)
$_REQUEST['q'] = preg_replace(array('/\s/', '/ {2,}/'), ' ', trim($_REQUEST['q']));
@ -87,8 +86,8 @@ if ($_RDATA['s_searchable_pages']) {
// Just count it as a 'phrase' of one word, functionally equivalent
// substr($t, 1) drops the leading marker character from the term
$_SDATA['terms'][] = array('phrase', substr($t, 1), false);
// Leading - or ! means negative, a MUST exclude
} else if ($t[0] == '-' || $t[0] == '!') {
// Leading - means negative, a MUST exclude
} else if ($t[0] == '-') {
// Stored as an 'exclude' term without its leading marker character
$_SDATA['terms'][] = array('exclude', substr($t, 1), false);
// Restrict to a specific filetype (not yet implemented)
@ -701,7 +700,7 @@ if ($_ODATA['sp_interval'] &&
// One-second connect timeout and one-second overall timeout for this request
curl_setopt($_cURL, CURLOPT_CONNECTTIMEOUT, 1);
curl_setopt($_cURL, CURLOPT_TIMEOUT, 1);
// Derive the crawler's URL path by removing the root path from this
// file's directory.
// NOTE(review): str_replace() removes every occurrence of the root string,
// not only a leading prefix -- confirm the root cannot recur inside __DIR__
$crawlerDir = str_replace($_SERVER['DOCUMENT_ROOT'], '', __DIR__);
$crawlerDir = str_replace($_ODATA['admin_install_root'], '', __DIR__);
$crawlerURL = $_ODATA['admin_install_domain'].$crawlerDir.'/crawler.php';
// Only spaces are percent-encoded before the URL is handed to cURL
curl_setopt($_cURL, CURLOPT_URL, str_replace(' ', '%20', $crawlerURL));