Daily update
This commit is contained in:
parent
3b5a22794c
commit
553fc019fe
|
@ -51,7 +51,7 @@ function requireFilesOfFolder($dir)
|
|||
if (!$fileInfo->isDot()) {
|
||||
if ($fileInfo->isDir()) {
|
||||
requireFilesOfFolder($fileInfo->getPathname());
|
||||
} else {
|
||||
} else if ($fileInfo->getExtension() == 'php') {
|
||||
require_once $fileInfo->getPathname();
|
||||
}
|
||||
}
|
||||
|
|
|
@ -137,7 +137,7 @@ $_RDATA['sp_starting'] = array_filter(array_map('trim', explode("\n", $_ODATA['s
|
|||
$_RDATA['s_starting_domains'] = array();
|
||||
foreach ($_RDATA['sp_starting'] as $starting) {
|
||||
$starting = parse_url($starting);
|
||||
if (isset($starting['host']) && $starting['host'])
|
||||
if (!empty($starting['host']))
|
||||
$_RDATA['s_starting_domains'][] = $starting['host'];
|
||||
}
|
||||
$_RDATA['s_starting_domains'] = array_unique($_RDATA['s_starting_domains']);
|
||||
|
@ -164,22 +164,22 @@ $_RDATA['index_status_list'] = array(
|
|||
|
||||
|
||||
// ***** Set session defaults
|
||||
if (!isset($_SESSION['admin_page']) || !isset($_RDATA['admin_pages'][$_SESSION['admin_page']]))
|
||||
if (empty($_SESSION['admin_page']) || empty($_RDATA['admin_pages'][$_SESSION['admin_page']]))
|
||||
$_SESSION['admin_page'] = 'crawler';
|
||||
|
||||
if (!isset($_SESSION['index_page'])) $_SESSION['index_page'] = 1;
|
||||
if (!isset($_SESSION['index_filter_category'])) $_SESSION['index_filter_category'] = '<none>';
|
||||
if (!isset($_SESSION['index_filter_status'])) $_SESSION['index_filter_status'] = '<none>';
|
||||
if (!isset($_SESSION['index_filter_text'])) $_SESSION['index_filter_text'] = '';
|
||||
if (!isset($_SESSION['error'])) $_SESSION['error'] = array();
|
||||
if (!isset($_SESSION['message'])) $_SESSION['message'] = array();
|
||||
if (!isset($_SESSION['admin_username'])) $_SESSION['admin_username'] = '';
|
||||
if (empty($_SESSION['index_page'])) $_SESSION['index_page'] = 1;
|
||||
if (empty($_SESSION['index_filter_category'])) $_SESSION['index_filter_category'] = '<none>';
|
||||
if (empty($_SESSION['index_filter_status'])) $_SESSION['index_filter_status'] = '<none>';
|
||||
if (empty($_SESSION['index_filter_text'])) $_SESSION['index_filter_text'] = '';
|
||||
if (empty($_SESSION['error'])) $_SESSION['error'] = array();
|
||||
if (empty($_SESSION['message'])) $_SESSION['message'] = array();
|
||||
if (empty($_SESSION['admin_username'])) $_SESSION['admin_username'] = '';
|
||||
|
||||
if (!$_SESSION['admin_username']) {
|
||||
if ($_SERVER['REQUEST_METHOD'] == 'POST') {
|
||||
if (isset($_POST['os_submit']) && $_POST['os_submit'] == 'os_admin_login') {
|
||||
if (!isset($_POST['os_admin_username'])) $_POST['os_admin_username'] = '';
|
||||
if (!isset($_POST['os_admin_password'])) $_POST['os_admin_password'] = '';
|
||||
if (!empty($_POST['os_submit']) && $_POST['os_submit'] == 'os_admin_login') {
|
||||
if (empty($_POST['os_admin_username'])) $_POST['os_admin_username'] = '';
|
||||
if (empty($_POST['os_admin_password'])) $_POST['os_admin_password'] = '';
|
||||
|
||||
if ($_POST['os_admin_username'] == $_RDATA['admin_username'] &&
|
||||
$_POST['os_admin_password'] == $_RDATA['admin_password']) {
|
||||
|
@ -206,7 +206,7 @@ if (!$_SESSION['admin_username']) {
|
|||
|
||||
$response = array();
|
||||
|
||||
if (!isset($_POST->action)) $_POST->action = '';
|
||||
if (empty($_POST->action)) $_POST->action = '';
|
||||
switch ($_POST->action) {
|
||||
|
||||
// Set the key for initiating the crawler
|
||||
|
@ -232,14 +232,14 @@ if (!$_SESSION['admin_username']) {
|
|||
|
||||
// Download a text file log of the most recent crawl
|
||||
case 'download':
|
||||
if (!isset($_POST->content)) $_POST->content = '';
|
||||
if (empty($_POST->content)) $_POST->content = '';
|
||||
switch ($_POST->content) {
|
||||
case 'crawl_log':
|
||||
if (!$_ODATA['sp_crawling']) {
|
||||
if ($_ODATA['sp_time_end']) {
|
||||
$lines = explode("\n", $_ODATA['sp_log']);
|
||||
|
||||
if (!isset($_POST->grep)) $_POST->grep = 'all';
|
||||
if (empty($_POST->grep)) $_POST->grep = '';
|
||||
switch ($_POST->grep) {
|
||||
case 'all': break;
|
||||
case 'errors': $lines = preg_grep('/^[\[\*]/', $lines); break;
|
||||
|
@ -278,8 +278,8 @@ if (!$_SESSION['admin_username']) {
|
|||
|
||||
// Not used?
|
||||
case 'fetch':
|
||||
if (!isset($_POST->value)) $_POST->value = '';
|
||||
if (isset($_ODATA[$_POST->value])) {
|
||||
if (empty($_POST->value)) $_POST->value = '';
|
||||
if (!empty($_ODATA[$_POST->value])) {
|
||||
$response = array(
|
||||
'status' => 'Success',
|
||||
'message' => trim($_ODATA[$_POST->value])
|
||||
|
@ -299,7 +299,7 @@ if (!$_SESSION['admin_username']) {
|
|||
|
||||
|
||||
// Normal POST request
|
||||
} else if (isset($_POST['os_submit'])) {
|
||||
} else if (!empty($_POST['os_submit'])) {
|
||||
|
||||
switch ($_POST['os_submit']) {
|
||||
|
||||
|
@ -484,7 +484,7 @@ if (!$_SESSION['admin_username']) {
|
|||
$_POST['os_admin_email'][$key] = $email[0]['name'].' <'.$email[0]['address'].'>';
|
||||
} else $_POST['os_admin_email'][$key] = $email[0]['address'];
|
||||
} else {
|
||||
$_SESSION['error'][] = 'Invalid email address \''.$admin_email.'\'.';
|
||||
$_SESSION['error'][] = 'Invalid To: email address \''.$admin_email.'\'.';
|
||||
unset($_POST['os_admin_email'][$key]);
|
||||
}
|
||||
}
|
||||
|
@ -520,7 +520,7 @@ if (!$_SESSION['admin_username']) {
|
|||
|
||||
// ***** Page Index >> With Selected...
|
||||
case 'os_index_with_selected':
|
||||
if (!isset($_POST['os_index_pages'])) $_POST['os_index_pages'] = array();
|
||||
if (empty($_POST['os_index_pages'])) $_POST['os_index_pages'] = array();
|
||||
if (is_array($_POST['os_index_pages'])) {
|
||||
|
||||
$checksums_good = true;
|
||||
|
@ -532,7 +532,7 @@ if (!$_SESSION['admin_username']) {
|
|||
}
|
||||
|
||||
if ($checksums_good) {
|
||||
if (!isset($_POST['os_index_select_action'])) $_POST['os_index_select_action'] = '';
|
||||
if (empty($_POST['os_index_select_action'])) $_POST['os_index_select_action'] = '';
|
||||
switch ($_POST['os_index_select_action']) {
|
||||
case 'delete':
|
||||
$delete = $_DDATA['pdo']->prepare(
|
||||
|
@ -550,33 +550,35 @@ if (!$_SESSION['admin_username']) {
|
|||
break;
|
||||
|
||||
case 'category':
|
||||
if (isset($_POST['os_apply_new_category'])) {
|
||||
if (!empty($_POST['os_apply_new_category'])) {
|
||||
$_POST['os_apply_new_category'] = preg_replace(array('/\s/', '/ {2,}/'), ' ', trim($_POST['os_apply_new_category']));
|
||||
$_POST['os_apply_new_category'] = preg_replace('/[^\w \d-]/', '', $_POST['os_apply_new_category']);
|
||||
$_POST['os_apply_new_category'] = substr($_POST['os_apply_new_category'], 0, 30);
|
||||
|
||||
$update = $_DDATA['pdo']->prepare(
|
||||
'UPDATE `'.$_DDATA['tbprefix'].'crawldata` SET `category`=:category WHERE `content_checksum`=:content_checksum;'
|
||||
);
|
||||
if ($_POST['os_apply_new_category']) {
|
||||
$update = $_DDATA['pdo']->prepare(
|
||||
'UPDATE `'.$_DDATA['tbprefix'].'crawldata` SET `category`=:category WHERE `content_checksum`=:content_checksum;'
|
||||
);
|
||||
|
||||
foreach ($_POST['os_index_pages'] as $content_checksum) {
|
||||
$update->execute(array(
|
||||
'category' => $_POST['os_apply_new_category'],
|
||||
'content_checksum' => $content_checksum
|
||||
));
|
||||
$err = $update->errorInfo();
|
||||
if ($err[0] != '00000') {
|
||||
$_SESSION['error'][] = 'Database error on attempt to update category: '.$err[2];
|
||||
break;
|
||||
foreach ($_POST['os_index_pages'] as $content_checksum) {
|
||||
$update->execute(array(
|
||||
'category' => $_POST['os_apply_new_category'],
|
||||
'content_checksum' => $content_checksum
|
||||
));
|
||||
$err = $update->errorInfo();
|
||||
if ($err[0] != '00000') {
|
||||
$_SESSION['error'][] = 'Database error on attempt to update category: '.$err[2];
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
$_SESSION['index_filter_category'] = '<none>';
|
||||
$_SESSION['index_filter_category'] = '<none>';
|
||||
} else $_SESSION['error'][] = 'Category names may only contain letters, numbers, spaces or dashes.';
|
||||
} else $_SESSION['error'][] = 'Please supply a category name.';
|
||||
break;
|
||||
|
||||
case 'priority':
|
||||
if (isset($_POST['os_apply_new_priority'])) {
|
||||
if (!empty($_POST['os_apply_new_priority'])) {
|
||||
$_POST['os_apply_new_priority'] = (float)$_POST['os_apply_new_priority'];
|
||||
$_POST['os_apply_new_priority'] = max(0, min(1, $_POST['os_apply_new_priority']));
|
||||
$_POST['os_apply_new_priority'] = round($_POST['os_apply_new_priority'], 5);
|
||||
|
@ -625,7 +627,7 @@ if (!$_SESSION['admin_username']) {
|
|||
|
||||
// ***** Page Index >> Text Match filter
|
||||
case 'os_index_filter_text':
|
||||
if (!isset($_POST['os_index_filter_text'])) $_POST['os_index_filter_text'] = '';
|
||||
if (empty($_POST['os_index_filter_text'])) $_POST['os_index_filter_text'] = '';
|
||||
$_POST['os_index_filter_text'] = filter_var($_POST['os_index_filter_text'], FILTER_SANITIZE_URL);
|
||||
$_SESSION['index_filter_text'] = $_POST['os_index_filter_text'];
|
||||
$_SESSION['index_page'] = 1;
|
||||
|
@ -806,7 +808,7 @@ if (!$_SESSION['admin_username']) {
|
|||
|
||||
foreach ($select[$key]['words'] as $index => $word) {
|
||||
if (!$word) continue;
|
||||
if (!isset($words[$word])) {
|
||||
if (empty($words[$word])) {
|
||||
$words[$word] = 1;
|
||||
} else $words[$word]++;
|
||||
}
|
||||
|
@ -1007,8 +1009,7 @@ if (os_crawldata.length) {
|
|||
}
|
||||
|
||||
os_request.q = os_params.get('q');
|
||||
if (!os_request.q)
|
||||
os_request.q = '';
|
||||
if (!os_request.q) os_request.q = '';
|
||||
|
||||
os_request.q = os_request.q.trim().replace(/\s/, ' ').replace(/ {2,}/, ' ');
|
||||
|
||||
|
@ -1046,8 +1047,8 @@ if (os_crawldata.length) {
|
|||
// Just count it as a 'phrase' of one word, functionally equivalent
|
||||
os_sdata.terms.push(['phrase', t.substring(1), false]);
|
||||
|
||||
// Leading - or ! means negative, a MUST exclude
|
||||
} else if (t[0] == '-' || t[0] == '!') {
|
||||
// Leading - means negative, a MUST exclude
|
||||
} else if (t[0] == '-') {
|
||||
os_sdata.terms.push(['exclude', t.substring(1), false]);
|
||||
|
||||
// Restrict to a specific filetype (not yet implemented)
|
||||
|
@ -1387,7 +1388,7 @@ document.write(mustache.render(
|
|||
));<?php
|
||||
|
||||
|
||||
// Dodgy character check on output
|
||||
// Dodgy character check on javascript output
|
||||
// [^\w\s()\[\]{};:.‖‘’‟„…/@©~®§⇔⇕⇒⇨⇩↪&\\^<>›×™*·,±_²°|≥!#$¢£+≤=•«%½»?"'-]
|
||||
|
||||
|
||||
|
@ -1408,7 +1409,7 @@ document.write(mustache.render(
|
|||
default:
|
||||
header('Content-type: text/plain; charset='.strtolower($_ODATA['s_charset']));
|
||||
var_dump($_POST);
|
||||
die();
|
||||
exit();
|
||||
|
||||
}
|
||||
|
||||
|
@ -1421,7 +1422,7 @@ document.write(mustache.render(
|
|||
} else {
|
||||
|
||||
// Set new Page Index pagination value
|
||||
if (isset($_POST['os_index_hidden_pagination']) && $_POST['os_index_hidden_pagination']) {
|
||||
if (!empty($_POST['os_index_hidden_pagination'])) {
|
||||
$_POST['os_index_hidden_pagination'] = (int)$_POST['os_index_hidden_pagination'];
|
||||
if (in_array($_POST['os_index_hidden_pagination'], $_RDATA['admin_pagination_options'])) {
|
||||
OS_setValue('admin_index_pagination', $_POST['os_index_hidden_pagination']);
|
||||
|
@ -1433,8 +1434,8 @@ document.write(mustache.render(
|
|||
}
|
||||
|
||||
// Select a Page Index Category filter
|
||||
if (isset($_POST['os_index_new_filter_category']) && $_POST['os_index_new_filter_category']) {
|
||||
if (isset($_RDATA['s_category_list'][$_POST['os_index_new_filter_category']])) {
|
||||
if (!empty($_POST['os_index_new_filter_category'])) {
|
||||
if (!empty($_RDATA['s_category_list'][$_POST['os_index_new_filter_category']])) {
|
||||
$_SESSION['index_filter_category'] = $_POST['os_index_new_filter_category'];
|
||||
$_SESSION['index_page'] = 1;
|
||||
}
|
||||
|
@ -1444,7 +1445,7 @@ document.write(mustache.render(
|
|||
}
|
||||
|
||||
// Select a Page Index Status filter
|
||||
if (isset($_POST['os_index_new_filter_status']) && $_POST['os_index_new_filter_status']) {
|
||||
if (!empty($_POST['os_index_new_filter_status'])) {
|
||||
if (in_array($_POST['os_index_new_filter_status'], $_RDATA['index_status_list'])) {
|
||||
$_SESSION['index_filter_status'] = $_POST['os_index_new_filter_status'];
|
||||
$_SESSION['index_page'] = 1;
|
||||
|
@ -1457,17 +1458,17 @@ document.write(mustache.render(
|
|||
// Unknown POST command
|
||||
header('Content-type: text/plain; charset='.strtolower($_ODATA['s_charset']));
|
||||
var_dump($_POST);
|
||||
die();
|
||||
exit();
|
||||
}
|
||||
|
||||
|
||||
// Select a new Administration UI page
|
||||
} else if (isset($_GET['page'])) {
|
||||
if (isset($_RDATA['admin_pages'][$_GET['page']]))
|
||||
} else if (!empty($_GET['page'])) {
|
||||
if (!empty($_RDATA['admin_pages'][$_GET['page']]))
|
||||
$_SESSION['admin_page'] = $_GET['page'];
|
||||
|
||||
// Select a new page within the Page Index list
|
||||
} else if (isset($_GET['ipage'])) {
|
||||
} else if (!empty($_GET['ipage'])) {
|
||||
$_GET['ipage'] = (int)$_GET['ipage'];
|
||||
$_SESSION['index_page'] = $_GET['ipage'];
|
||||
|
||||
|
@ -2474,11 +2475,14 @@ document.write(mustache.render(
|
|||
</li>
|
||||
<li class="list-group-item">
|
||||
<label class="d-flex w-100">
|
||||
<strong class="pe-2">Current Cache Size</strong>
|
||||
<strong class="pe-2">Current Cache Size
|
||||
<img src="img/help.svg" alt="Information" class="align-middle svg-icon mb-1"
|
||||
data-bs-toggle="tooltip" data-bs-placement="top" title="The Search Result Cache is cleared after each successful crawl, or you can purge the cache manually below.">
|
||||
</strong>
|
||||
<var class="text-end flex-grow-1 text-nowrap"><?php
|
||||
if (!function_exists('gzcompress')) { ?>
|
||||
<img src="img/warning.svg" alt="Notice" class="align-middle svg-icon mb-1 me-1"
|
||||
data-bs-toggle="tooltip" data-bs-placement="top" title="PHP's GZip functions are not enabled. This means your Search Cache won't be able to store as many results. You may want to consider increasing the Search Result Cache limit to compensate for this."><?php
|
||||
data-bs-toggle="tooltip" data-bs-placement="top" title="PHP's GZip functions are not enabled. This means your Search Result Cache won't be able to store as many results. You may want to consider increasing the Search Result Cache limit to compensate for this."><?php
|
||||
}
|
||||
echo OS_readSize($_RDATA['s_cache_size'], true);
|
||||
?></var>
|
||||
|
|
|
@ -6,6 +6,11 @@ $_RDATA = array();
|
|||
require __DIR__.'/config.ini.php';
|
||||
|
||||
|
||||
// Check version compatibility
|
||||
if (PHP_VERSION_ID < 70200)
|
||||
throw new Exception('Orcinus Site Search requires a PHP version ">= 7.2.0". You are running '.PHP_VERSION.'.');
|
||||
|
||||
|
||||
// ***** Connect to the database
|
||||
$_DDATA['pdo'] = new PDO(
|
||||
'mysql:host='.$_DDATA['hostname'].';dbname='.$_DDATA['database'].';charset=UTF8',
|
||||
|
@ -13,7 +18,9 @@ $_DDATA['pdo'] = new PDO(
|
|||
$_DDATA['password']
|
||||
);
|
||||
$err = $_DDATA['pdo']->errorInfo();
|
||||
if ($err[0]) die('Fatal database connection error: '.$err[0]);
|
||||
if ($err[0])
|
||||
throw new Exception('Database connection error: '.$err[2]);
|
||||
|
||||
$_DDATA['pdo']->setAttribute(PDO::ATTR_EMULATE_PREPARES, false);
|
||||
$_DDATA['pdo']->setAttribute(PDO::ATTR_DEFAULT_FETCH_MODE, PDO::FETCH_ASSOC);
|
||||
|
||||
|
@ -21,11 +28,12 @@ $_DDATA['tables'] = $_DDATA['pdo']->query(
|
|||
'SHOW TABLES FROM `'.$_DDATA['database'].'` LIKE \''.$_DDATA['tbprefix'].'%\';'
|
||||
);
|
||||
$err = $_DDATA['tables']->errorInfo();
|
||||
if ($err[0] == '00000') {
|
||||
$_DDATA['tables'] = $_DDATA['tables']->fetchAll(PDO::FETCH_NUM);
|
||||
foreach($_DDATA['tables'] as $key => $value)
|
||||
$_DDATA['tables'][$key] = $value[0];
|
||||
} else die('Fatal database read error: '.$err[2]);
|
||||
if ($err[0] != '00000')
|
||||
throw new Exception('Database table read error: '.$err[2]);
|
||||
|
||||
$_DDATA['tables'] = $_DDATA['tables']->fetchAll(PDO::FETCH_NUM);
|
||||
foreach($_DDATA['tables'] as $key => $value)
|
||||
$_DDATA['tables'][$key] = $value[0];
|
||||
|
||||
|
||||
// ***** Create the configuration table if it doesn't exist
|
||||
|
@ -33,7 +41,9 @@ if (!in_array($_DDATA['tbprefix'].'config', $_DDATA['tables'])) {
|
|||
$create = $_DDATA['pdo']->query(
|
||||
'CREATE TABLE `'.$_DDATA['tbprefix'].'config` (
|
||||
`version` VARCHAR(8) NOT NULL,
|
||||
`admin_from` TINYTEXT NOT NULL,
|
||||
`admin_email` TEXT NOT NULL,
|
||||
`admin_install_root` TINYTEXT NOT NULL,
|
||||
`admin_install_domain` TINYTEXT NOT NULL,
|
||||
`admin_index_pagination` SMALLINT UNSIGNED NOT NULL,
|
||||
`sp_key` TINYTEXT NOT NULL,
|
||||
|
@ -92,22 +102,26 @@ if (!in_array($_DDATA['tbprefix'].'config', $_DDATA['tables'])) {
|
|||
PRIMARY KEY (`version`)
|
||||
) ENGINE = MyISAM, COLLATE = utf8_general_ci;'
|
||||
);
|
||||
$err = $create->errorInfo();
|
||||
if ($err[0] != '00000')
|
||||
throw new Exception('Could not create configuration database table: '.$err[2]);
|
||||
}
|
||||
|
||||
$testConf = $_DDATA['pdo']->query(
|
||||
'SELECT `version` FROM `'.$_DDATA['tbprefix'].'config`;'
|
||||
);
|
||||
$err = $testConf->errorInfo();
|
||||
if ($err[0] == '00000') {
|
||||
$testConf = $testConf->fetchAll();
|
||||
} else die('Fatal configuration table read error: '.$err[2]);
|
||||
if ($err[0] != '00000')
|
||||
throw new Exception('Configuration table read error: '.$err[2]);
|
||||
|
||||
// ***** Set default configuration table values
|
||||
if (!count($testConf)) {
|
||||
if (!count($testConf->fetchAll())) {
|
||||
$insert = $_DDATA['pdo']->query(
|
||||
'INSERT INTO `'.$_DDATA['tbprefix'].'config` SET
|
||||
`version`=\'3.0\',
|
||||
`admin_from`=\'\',
|
||||
`admin_email`=\'\',
|
||||
`admin_install_root`=\'\',
|
||||
`admin_install_domain`=\'\',
|
||||
`admin_index_pagination`=100,
|
||||
`sp_key`=\'\',
|
||||
|
@ -140,8 +154,8 @@ if (!count($testConf)) {
|
|||
`sp_ifmodifiedsince`=1,
|
||||
`sp_cookies`=1,
|
||||
`sp_sitemap_file`=\'\',
|
||||
`sp_sitemap_hostname`=\''.$_SERVER['HTTP_HOST'].'\',
|
||||
`sp_useragent`=\'OrcinusSearch/3.0 (https://greywyvern.com/orcinus/)\',
|
||||
`sp_sitemap_hostname`=\'\',
|
||||
`sp_useragent`=\'OrcinusCrawler/3.0 (https://greywyvern.com/orcinus/)\',
|
||||
`sp_crawling`=0,
|
||||
`sp_cancel`=0,
|
||||
`sp_progress`=\'\',
|
||||
|
@ -165,8 +179,12 @@ if (!count($testConf)) {
|
|||
`jw_compression`=25
|
||||
;'
|
||||
);
|
||||
$err = $insert->errorInfo();
|
||||
if ($err[0] != '00000' || !$insert->rowCount())
|
||||
throw new Exception('Could not fill configuration database table: '.$err[2]);
|
||||
}
|
||||
|
||||
|
||||
// ***** Create the crawldata table if it doesn't exist
|
||||
if (!in_array($_DDATA['tbprefix'].'crawldata', $_DDATA['tables'])) {
|
||||
$create = $_DDATA['pdo']->query(
|
||||
|
@ -193,6 +211,9 @@ if (!in_array($_DDATA['tbprefix'].'crawldata', $_DDATA['tables'])) {
|
|||
UNIQUE `content_checksum` (`content_checksum`)
|
||||
) ENGINE = MyISAM, COLLATE = utf8_general_ci;'
|
||||
);
|
||||
$err = $create->errorInfo();
|
||||
if ($err[0] != '00000')
|
||||
throw new Exception('Could not create crawldata database table: '.$err[2]);
|
||||
}
|
||||
|
||||
// ***** Create the query log table if it doesn't exist
|
||||
|
@ -206,6 +227,9 @@ if (!in_array($_DDATA['tbprefix'].'query', $_DDATA['tables'])) {
|
|||
`cache` MEDIUMBLOB NOT NULL
|
||||
) ENGINE = MyISAM, COLLATE = utf8_general_ci;'
|
||||
);
|
||||
$err = $create->errorInfo();
|
||||
if ($err[0] != '00000')
|
||||
throw new Exception('Could not create query log database table: '.$err[2]);
|
||||
}
|
||||
|
||||
|
||||
|
@ -279,7 +303,7 @@ function OS_getValue($columnName) {
|
|||
|
||||
/**
|
||||
* Initialize a generic cURL connection
|
||||
* - If creating a cURL connection fails, we should try some fallbacks
|
||||
* - If creating a cURL connection fails, we could try some fallbacks
|
||||
*
|
||||
*/
|
||||
function OS_getConnection() {
|
||||
|
@ -317,16 +341,28 @@ date_default_timezone_set($_ODATA['sp_timezone']);
|
|||
ini_set('mbstring.substitute_character', 'none');
|
||||
|
||||
|
||||
// Determine the correct HTTP scheme by which we are accessing the page
|
||||
if (!isset($_SERVER['REQUEST_SCHEME'])) {
|
||||
if (!empty($_SERVER['HTTP_X_FORWARDED_PROTO'])) {
|
||||
$_SERVER['REQUEST_SCHEME'] = $_SERVER['HTTP_X_FORWARDED_PROTO'];
|
||||
} else if (!empty($_SERVER['HTTPS'])) {
|
||||
$_SERVER['REQUEST_SCHEME'] = ($_SERVER['HTTPS'] == 'on') ? 'https' : 'http';
|
||||
} else if (!empty($_SERVER['SERVER_PORT'])) {
|
||||
if ($_SERVER['SERVER_PORT'] == 443) {
|
||||
$_SERVER['REQUEST_SCHEME'] = 'https';
|
||||
} else $_SERVER['REQUEST_SCHEME'] = 'http';
|
||||
} else $_SERVER['REQUEST_SCHEME'] = '';
|
||||
}
|
||||
|
||||
// ***** Determine the install domain from run location
|
||||
if (!$_ODATA['admin_install_domain']) {
|
||||
if (isset($_SERVER['REQUEST_SCHEME']) && $_SERVER['REQUEST_SCHEME'] &&
|
||||
isset($_SERVER['HTTP_HOST']) && $_SERVER['HTTP_HOST']) {
|
||||
if ($_SERVER['REQUEST_SCHEME'] && !empty($_SERVER['HTTP_HOST'])) {
|
||||
$base = $_SERVER['REQUEST_SCHEME'].'://'.$_SERVER['HTTP_HOST'];
|
||||
if (isset($_SERVER['SCRIPT_URI']) && $_SERVER['SCRIPT_URI']) {
|
||||
if (!empty($_SERVER['SCRIPT_URI'])) {
|
||||
$psuri = parse_url($_SERVER['SCRIPT_URI']);
|
||||
if ($psuri && isset($psuri['port']) && !is_null($psuri['port']))
|
||||
if ($psuri && !empty($psuri['port']))
|
||||
$base .= ':'.$psuri['port'];
|
||||
} else if (isset($_SERVER['SERVER_PORT'])) {
|
||||
} else if (!empty($_SERVER['SERVER_PORT'])) {
|
||||
if ($_SERVER['SERVER_PORT'] == '80') {
|
||||
if ($_SERVER['REQUEST_SCHEME'] != 'http')
|
||||
$base .= ':'.$_SERVER['SERVER_PORT'];
|
||||
|
@ -340,11 +376,22 @@ if (!$_ODATA['admin_install_domain']) {
|
|||
}
|
||||
if (!$_ODATA['sp_starting']) {
|
||||
if (!$_ODATA['admin_install_domain']) {
|
||||
die('Fatal error, could not determine install domain. Please run this script from a web browser.');
|
||||
throw new Exception('Could not determine install domain. Please run this script from a web browser.');
|
||||
} else OS_setValue('sp_starting', $_ODATA['admin_install_domain'].'/');
|
||||
}
|
||||
|
||||
|
||||
// ***** Set the admin From: email value
|
||||
if (!$_ODATA['admin_from']) {
|
||||
if (!empty($_SERVER['SERVER_ADMIN'])) {
|
||||
OS_setValue('admin_from', $_SERVER['SERVER_ADMIN']);
|
||||
} else if (!empty($_SERVER['MAILTO'])) {
|
||||
OS_setValue('admin_from', $_SERVER['MAILTO']);
|
||||
} else if (isset($_SESSION['error']))
|
||||
$_SESSION['error'][] = 'Could not determine the admin email for this server. Please set your server\'s \'SERVER_ADMIN\' value.';
|
||||
}
|
||||
|
||||
|
||||
// ***** Load and Initialize PHPMailer
|
||||
if (!class_exists('PHPMailer\PHPMailer\PHPMailer')) {
|
||||
if (file_exists(__DIR__.'/PHPMailer/PHPMailer.php')) {
|
||||
|
@ -355,8 +402,10 @@ if (!class_exists('PHPMailer\PHPMailer\PHPMailer')) {
|
|||
}
|
||||
if (class_exists('PHPMailer\PHPMailer\PHPMailer')) {
|
||||
$_MAIL = new PHPMailer\PHPMailer\PHPMailer();
|
||||
$_MAIL->From = $_SERVER['SERVER_ADMIN'];
|
||||
$_MAIL->FromName = "Orcinus Site Search Crawler";
|
||||
if ($_ODATA['admin_from']) {
|
||||
$_MAIL->From = $_ODATA['admin_from'];
|
||||
$_MAIL->FromName = "Orcinus Crawler";
|
||||
}
|
||||
$_MAIL->CharSet = $_ODATA['s_charset'];
|
||||
if (count($ad = $_MAIL->parseAddresses($_ODATA['admin_email'])))
|
||||
foreach ($ad as $a) $_MAIL->AddAddress($a['address'], $a['name']);
|
||||
|
@ -365,8 +414,7 @@ if (class_exists('PHPMailer\PHPMailer\PHPMailer')) {
|
|||
|
||||
// ***** Load the default Search Result Template
|
||||
if (!$_ODATA['s_result_template']) {
|
||||
OS_setValue('s_result_template', <<<ORCINUS
|
||||
<section id="os_results">
|
||||
OS_setValue('s_result_template', '<section id="os_results">
|
||||
<!-- Orcinus Site Search {{version}} - HTML Template -->
|
||||
|
||||
{{#errors}}
|
||||
|
@ -473,7 +521,7 @@ if (!$_ODATA['s_result_template']) {
|
|||
<li>Search terms with fewer than {{limit_term_length}} characters are ignored</li>
|
||||
<li>Enclose groups of terms in quotes ("") to search for phrases</li>
|
||||
<li>Prefix terms with a plus-sign (+) to make them important</li>
|
||||
<li>Prefix terms with a minus-sign (-) or exclamation point (!) to exclude terms</li>
|
||||
<li>Prefix terms with a minus-sign (-) to exclude terms</li>
|
||||
</ul>
|
||||
</div>
|
||||
{{/searched}}
|
||||
|
@ -521,17 +569,16 @@ if (!$_ODATA['s_result_template']) {
|
|||
</small>
|
||||
</p>
|
||||
</footer>
|
||||
</section>
|
||||
ORCINUS);
|
||||
</section>');
|
||||
}
|
||||
|
||||
// {{{{{ Initialize the Mustache templating engine
|
||||
class OS_Mustache {
|
||||
public $errors;
|
||||
|
||||
public function __construct() {}
|
||||
function __construct() {}
|
||||
|
||||
public function addError($text) {
|
||||
function addError($text) {
|
||||
if (!$this->errors) {
|
||||
$this->errors = new stdClass();
|
||||
$this->errors->error_list = array();
|
||||
|
@ -540,7 +587,7 @@ class OS_Mustache {
|
|||
}
|
||||
|
||||
// We'll only autoload the Mustache engine if we need it
|
||||
public function render() {
|
||||
function render() {
|
||||
global $_ODATA;
|
||||
|
||||
require_once __DIR__.'/Mustache/Autoloader.php';
|
||||
|
@ -719,6 +766,7 @@ $_RDATA['s_latin'] = array(
|
|||
'x' => array('×'),
|
||||
'y' => array('ý', 'Ý', 'ÿ', 'Ÿ'),
|
||||
'z' => array('ź', 'Ź', 'ž', 'Ž', 'ż', 'Ż'),
|
||||
'!' => array('¡'),
|
||||
'?' => array('¿')
|
||||
);
|
||||
$_RDATA['s_filetypes'] = array(
|
||||
|
@ -729,16 +777,31 @@ $_RDATA['s_filetypes'] = array(
|
|||
'TXT' => array('text/plain')
|
||||
);
|
||||
|
||||
|
||||
// Store the DOCUMENT_ROOT while we have access to it
|
||||
if (!empty($_SERVER['DOCUMENT_ROOT'])) {
|
||||
if ($_SERVER['DOCUMENT_ROOT'] != $_ODATA['admin_install_root'])
|
||||
OS_setValue('admin_install_root', $_SERVER['DOCUMENT_ROOT']);
|
||||
} else $_SERVER['DOCUMENT_ROOT'] = $_ODATA['admin_install_root'];
|
||||
|
||||
// Adjust the REQUEST_URI to remove query strings
|
||||
if (isset($_SERVER['REQUEST_URI']))
|
||||
$_SERVER['REQUEST_URI'] = preg_replace('/\?.*$/', '', $_SERVER['REQUEST_URI']);
|
||||
|
||||
|
||||
// Locate the sitemap file if given
|
||||
if (!$_ODATA['sp_sitemap_hostname'] && !empty($_SERVER['HTTP_HOST']))
|
||||
OS_setValue('sp_sitemap_hostname', $_SERVER['HTTP_HOST']);
|
||||
|
||||
if ($_ODATA['sp_sitemap_file']) {
|
||||
$sitemapPath = ($_ODATA['sp_sitemap_file'][0] == '/') ? $_SERVER['DOCUMENT_ROOT'] : __DIR__.'/';
|
||||
$sitemapPath = ($_ODATA['sp_sitemap_file'][0] == '/') ? $_ODATA['admin_install_root'] : __DIR__.'/';
|
||||
$sitemapPath .= $_ODATA['sp_sitemap_file'];
|
||||
$sitemapPath = preg_replace(array('/\/[^\/]+\/\.\.\//', '/\/\.\//'), '/', $sitemapPath);
|
||||
|
||||
// If we did not try going beyond the document_root
|
||||
if (strpos($sitemapPath, $_SERVER['DOCUMENT_ROOT']) === 0) {
|
||||
if (strpos($sitemapPath, $_ODATA['admin_install_root']) === 0) {
|
||||
if (file_exists($sitemapPath)) {
|
||||
$sitemapNewFile = str_replace($_SERVER['DOCUMENT_ROOT'], '', $sitemapPath);
|
||||
$sitemapNewFile = str_replace($_ODATA['admin_install_root'], '', $sitemapPath);
|
||||
if ($sitemapNewFile != $_ODATA['sp_sitemap_file'])
|
||||
OS_setValue('sp_sitemap_file', $sitemapNewFile);
|
||||
if (is_writable($sitemapPath)) {
|
||||
|
@ -754,9 +817,6 @@ if ($_ODATA['sp_sitemap_file']) {
|
|||
} else $_RDATA['sp_sitemap_file'] = '';
|
||||
|
||||
|
||||
$_SERVER['REQUEST_URI'] = preg_replace('/\?.*$/', '', $_SERVER['REQUEST_URI']);
|
||||
|
||||
|
||||
$_RDATA['x_generated_by'] = 'X-Generated-By: Orcinus Site Search/'.$_ODATA['version'];
|
||||
header($_RDATA['x_generated_by']);
|
||||
|
||||
|
|
|
@ -1,8 +1,8 @@
|
|||
<?php /* ***** Orcinus Site Search - Web Crawling Engine *********** */
|
||||
|
||||
|
||||
$_DEBUGMODE = ($_SERVER['REQUEST_METHOD'] == 'GET') ? false : false;
|
||||
require __DIR__.'/config.php';
|
||||
$_RDATA['debug'] = false;
|
||||
|
||||
|
||||
/**
|
||||
|
@ -10,16 +10,20 @@ require __DIR__.'/config.php';
|
|||
*
|
||||
*/
|
||||
function OS_crawlLog($text, $level = 0) {
|
||||
global $_RDATA, $_DEBUGMODE;
|
||||
global $_RDATA;
|
||||
|
||||
switch ($level) {
|
||||
case 1: $level = ''; break;
|
||||
case 2: $level = '[ERROR] '; break;
|
||||
default: $level = ' -> ';
|
||||
case 1: $prefix = ''; break;
|
||||
case 2: $prefix = '[ERROR] '; break;
|
||||
default: $prefix = ' -> ';
|
||||
}
|
||||
|
||||
fwrite($_RDATA['sp_log'], $level.$text."\n");
|
||||
if ($_DEBUGMODE) echo $level.$text."\n";
|
||||
fwrite($_RDATA['sp_log'], $prefix.$text."\n");
|
||||
if ($_RDATA['debug'] ||
|
||||
($_SERVER['REQUEST_METHOD'] == 'CLI' &&
|
||||
$level >= $_RDATA['sp_log_clilevel'])) {
|
||||
echo $prefix.$text."\n";
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
@ -33,7 +37,7 @@ function OS_formatURL($_, $base) {
|
|||
$_ = str_replace('%20', ' ', $_);
|
||||
$dirbase = preg_replace('/(?<!:\/)\/[^\/]*$/', '', $base).'/';
|
||||
$pdb = parse_url($dirbase);
|
||||
$port = (isset($pdb['port']) && !is_null($pdb['port'])) ? ':'.$pdb['port'] : '';
|
||||
$port = (!empty($pdb['port'])) ? ':'.$pdb['port'] : '';
|
||||
|
||||
if (substr($_, 0, 3) == '../') {
|
||||
$p = preg_replace('/\/[^\/]*\/$/', '/', $pdb['path']);
|
||||
|
@ -69,7 +73,7 @@ function OS_filterURL($_, $base) {
|
|||
if (!preg_match('/^https?:\/\//', $_))
|
||||
$_ = OS_formatURL($_, $base);
|
||||
|
||||
if (isset($_RDATA['sp_filter'][$_]))
|
||||
if (!empty($_RDATA['sp_filter'][$_]))
|
||||
return $_RDATA['sp_filter'][$_];
|
||||
|
||||
$_RDATA['sp_filter'][$_] = '';
|
||||
|
@ -107,7 +111,7 @@ function OS_filterURL($_, $base) {
|
|||
return $_RDATA['sp_filter'][$_] = 'ignore-extension';
|
||||
|
||||
// robots.txt rules
|
||||
if (isset($_RDATA['sp_robots'][$plink['host']]))
|
||||
if (!empty($_RDATA['sp_robots'][$plink['host']]))
|
||||
foreach ($_RDATA['sp_robots'][$plink['host']] as $disallowURL)
|
||||
if (strpos($_, $disallowURL) === 0)
|
||||
return $_RDATA['sp_filter'][$_] = 'robots-txt';
|
||||
|
@ -216,7 +220,7 @@ function OS_fetchURL($url, $referer = '') {
|
|||
OS_crawlLog($_['errno'], 1);
|
||||
OS_crawlLog($_['error'], 1);
|
||||
OS_crawlLog(print_r($_['info'], true), 1);
|
||||
die('Uncaught cURL error');
|
||||
throw new Exception('Uncaught cURL error');
|
||||
|
||||
}
|
||||
|
||||
|
@ -241,7 +245,7 @@ function OS_parseURLContent($_) {
|
|||
|
||||
|
||||
// Detect MIME-type using extension?
|
||||
if (!isset($_['info']['content_type']))
|
||||
if (empty($_['info']['content_type']))
|
||||
$_['info']['content_type'] = 'text/plain';
|
||||
|
||||
// Parse MIME-type
|
||||
|
@ -256,11 +260,12 @@ function OS_parseURLContent($_) {
|
|||
$_['info']['charset'] = 'ISO-8859-1';
|
||||
|
||||
|
||||
$_['info']['sha1'] = sha1($_['body'], true);
|
||||
|
||||
|
||||
while (strpos($_['body'], "\x1f\x8b") === 0)
|
||||
$_['body'] = gzinflate(substr($_['body'], 10));
|
||||
|
||||
$_['info']['sha1'] = sha1($_['body']);
|
||||
|
||||
|
||||
switch ($_['info']['mime_type']) {
|
||||
case 'text/plain':
|
||||
|
@ -349,7 +354,7 @@ function OS_parseURLContent($_) {
|
|||
$head = $document->getElementsByTagName('head');
|
||||
|
||||
$base = $head[0]->getElementsByTagName('base');
|
||||
if (isset($base[0]))
|
||||
if (!empty($base[0]))
|
||||
for ($x = 0; $x < count($base[0]->attributes); $x++)
|
||||
if (strtolower($base[0]->attributes[$x]->name) == 'href')
|
||||
$_['base'] = filter_var($base[0]->attributes[$x]->value, FILTER_SANITIZE_URL);
|
||||
|
@ -604,9 +609,9 @@ function OS_parseURLContent($_) {
|
|||
$_['title'] = basename($_['info']['url']);
|
||||
|
||||
$meta = $pdf->getDetails();
|
||||
if (isset($meta['Title'])) $_['title'] = strtr($meta['Title'], $_RDATA['sp_utf_replace']);
|
||||
if (isset($meta['Subject'])) $_['description'] = strtr($meta['Subject'], $_RDATA['sp_utf_replace']);
|
||||
if (isset($meta['Keywords'])) $_['keywords'] = strtr($meta['Keywords'], $_RDATA['sp_utf_replace']);
|
||||
if (!empty($meta['Title'])) $_['title'] = strtr($meta['Title'], $_RDATA['sp_utf_replace']);
|
||||
if (!empty($meta['Subject'])) $_['description'] = strtr($meta['Subject'], $_RDATA['sp_utf_replace']);
|
||||
if (!empty($meta['Keywords'])) $_['keywords'] = strtr($meta['Keywords'], $_RDATA['sp_utf_replace']);
|
||||
|
||||
// remove escaped whitespace
|
||||
$_['title'] = str_replace(array("\\\n\r", "\\\n"), '', $_['title']);
|
||||
|
@ -678,7 +683,7 @@ function OS_parseURLContent($_) {
|
|||
*
|
||||
*/
|
||||
function OS_crawlCleanUp() {
|
||||
global $_DDATA, $_ODATA, $_RDATA, $_cURL, $_MAIL, $_DEBUGMODE;
|
||||
global $_DDATA, $_ODATA, $_RDATA, $_cURL, $_MAIL;
|
||||
|
||||
// If the crawl has already been canceled, don't bother
|
||||
if (!$_ODATA['sp_crawling']) return;
|
||||
|
@ -796,10 +801,10 @@ function OS_crawlCleanUp() {
|
|||
if (!$_MAIL->Send()) OS_crawlLog('Could not send notification email', 2);
|
||||
}
|
||||
|
||||
$cronMessage = 'Crawl completed successfully';
|
||||
$cliMessage = 'Crawl completed successfully';
|
||||
$jsonMessage = json_encode(array(
|
||||
'status' => 'Success',
|
||||
'message' => $cronMessage
|
||||
'message' => $cliMessage
|
||||
), JSON_INVALID_UTF8_IGNORE);
|
||||
|
||||
// We truncated the search table but FAILED to populate it!
|
||||
|
@ -820,10 +825,10 @@ function OS_crawlCleanUp() {
|
|||
if (!$_MAIL->Send()) OS_crawlLog('Could not send notification email', 2);
|
||||
}
|
||||
|
||||
$cronMessage = 'Could not populate search table; search table is currently empty!';
|
||||
$cliMessage = 'Could not populate search table; search table is currently empty!';
|
||||
$jsonMessage = json_encode(array(
|
||||
'status' => 'Error',
|
||||
'message' => $cronMessage
|
||||
'message' => $cliMessage
|
||||
), JSON_INVALID_UTF8_IGNORE);
|
||||
}
|
||||
|
||||
|
@ -842,10 +847,10 @@ function OS_crawlCleanUp() {
|
|||
if (!$_MAIL->Send()) OS_crawlLog('Could not send notification email', 2);
|
||||
}
|
||||
|
||||
$cronMessage = 'Crawl failed; see the log for details';
|
||||
$cliMessage = 'Crawl failed; see the log for details';
|
||||
$jsonMessage = json_encode(array(
|
||||
'status' => 'Error',
|
||||
'message' => $cronMessage
|
||||
'message' => $cliMessage
|
||||
), JSON_INVALID_UTF8_IGNORE);
|
||||
}
|
||||
|
||||
|
@ -866,25 +871,22 @@ function OS_crawlCleanUp() {
|
|||
|
||||
OS_setValue('sp_crawling', 0);
|
||||
|
||||
if ($_SERVER['REQUEST_METHOD'] != 'CRON') {
|
||||
if (!$_DEBUGMODE)
|
||||
if ($_SERVER['REQUEST_METHOD'] != 'CLI') {
|
||||
if (!$_RDATA['debug'])
|
||||
header('Content-type: application/json; charset='.strtolower($_ODATA['s_charset']));
|
||||
die($jsonMessage);
|
||||
} else die($cronMessage);
|
||||
} else die($cliMessage."\n");
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
// This is most likely a crontab request
|
||||
if (!isset($_SERVER['REQUEST_METHOD'])) {
|
||||
$_SERVER['REQUEST_METHOD'] = 'CRON';
|
||||
chdir(dirname($_SERVER['argv'][0]));
|
||||
if (empty($_SERVER['REQUEST_METHOD'])) $_SERVER['REQUEST_METHOD'] = '';
|
||||
|
||||
} else {
|
||||
switch ($_SERVER['REQUEST_METHOD']) {
|
||||
|
||||
/* ***** Handle POST Requests ************************************** */
|
||||
if ($_SERVER['REQUEST_METHOD'] == 'POST') {
|
||||
/* ***** Handle POST Requests ************************************ */
|
||||
case 'POST':
|
||||
|
||||
// JSON POST request
|
||||
// These are usually sent by javascript fetch()
|
||||
|
@ -894,10 +896,10 @@ if (!isset($_SERVER['REQUEST_METHOD'])) {
|
|||
|
||||
$response = array();
|
||||
|
||||
if (!isset($_POST->action)) $_POST->action = '';
|
||||
if (empty($_POST->action)) $_POST->action = '';
|
||||
switch ($_POST->action) {
|
||||
case 'crawl':
|
||||
if (isset($_POST->sp_key) &&
|
||||
if (!empty($_POST->sp_key) &&
|
||||
$_ODATA['sp_key'] &&
|
||||
$_POST->sp_key == $_ODATA['sp_key']) {
|
||||
if ($_ODATA['sp_crawling']) {
|
||||
|
@ -925,7 +927,7 @@ if (!isset($_SERVER['REQUEST_METHOD'])) {
|
|||
$lines = file($_ODATA['sp_log'], FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
|
||||
} else $lines = explode("\n", $_ODATA['sp_log']);
|
||||
|
||||
if (!isset($_POST->grep)) $_POST->grep = 'all';
|
||||
if (empty($_POST->grep)) $_POST->grep = '';
|
||||
switch ($_POST->grep) {
|
||||
case 'all': break;
|
||||
case 'errors': $lines = preg_grep('/^[\[\*]/', $lines); break;
|
||||
|
@ -949,11 +951,11 @@ if (!isset($_SERVER['REQUEST_METHOD'])) {
|
|||
// IF the crawler 'time_start' is more than 'timeout_crawl'
|
||||
// seconds ago, or the 'force' token is set, the crawler is
|
||||
// probably stuck. Unstick it.
|
||||
if (!isset($_POST->force)) $_POST->force = '';
|
||||
if (empty($_POST->force)) $_POST->force = '';
|
||||
if ($_POST->force || time() - $_ODATA['sp_time_start'] > $_ODATA['sp_timeout_crawl']) {
|
||||
OS_setValue('sp_crawling', 0);
|
||||
|
||||
if (!isset($_POST->reason) || !$_POST->reason)
|
||||
if (empty($_POST->reason))
|
||||
$_POST->reason = 'The crawler halted unexpectedly';
|
||||
|
||||
if (strpos($_ODATA['sp_log'], "\n") === false && file_exists($_ODATA['sp_log'])) {
|
||||
|
@ -1007,11 +1009,26 @@ if (!isset($_SERVER['REQUEST_METHOD'])) {
|
|||
die($_ODATA['sp_useragent']);
|
||||
}
|
||||
|
||||
// Don't do anything for GET requests
|
||||
} else if ($_SERVER['REQUEST_METHOD'] == 'GET') {
|
||||
// Allow CLI requests through
|
||||
case '':
|
||||
if (!empty($_SERVER['argv'][0]) && $_SERVER['argv'][0] == $_SERVER['PHP_SELF']) {
|
||||
$_SERVER['REQUEST_METHOD'] = 'CLI';
|
||||
if (!empty($_SERVER['argv'][1]) && preg_match('/^-log=([012])$/', $_SERVER['argv'][1], $match)) {
|
||||
$_RDATA['sp_log_clilevel'] = (int)$match[1];
|
||||
} else $_RDATA['sp_log_clilevel'] = 2;
|
||||
} else die($_ODATA['sp_useragent']);
|
||||
break;
|
||||
|
||||
// Don't do anything for GET requests, unless in debug mode
|
||||
case 'GET':
|
||||
header('Content-type: text/plain; charset='.strtolower($_ODATA['s_charset']));
|
||||
if (!$_DEBUGMODE) die($_ODATA['sp_useragent']);
|
||||
}
|
||||
if (!$_RDATA['debug']) die($_ODATA['sp_useragent']);
|
||||
|
||||
// Exit for all other request types
|
||||
default:
|
||||
header('Content-type: text/plain; charset='.strtolower($_ODATA['s_charset']));
|
||||
die($_ODATA['sp_useragent']);
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
@ -1251,7 +1268,7 @@ while ($_cURL && count($_QUEUE)) {
|
|||
|
||||
// Check robots.txt for newly encountered hostnames
|
||||
$purl = parse_url($url);
|
||||
$port = (isset($purl['port']) && !is_null($purl['port'])) ? ':'.$purl['port'] : '';
|
||||
$port = (!empty($purl['port'])) ? ':'.$purl['port'] : '';
|
||||
if (!isset($_RDATA['sp_robots'][$purl['host']])) {
|
||||
$_RDATA['sp_robots'][$purl['host']] = array();
|
||||
OS_crawlLog('Fetching robots.txt for domain: '.$purl['host'], 1);
|
||||
|
@ -1265,7 +1282,7 @@ while ($_cURL && count($_QUEUE)) {
|
|||
$robolines = explode("\n", $robotstxt['content']);
|
||||
foreach ($robolines as $line) {
|
||||
if (preg_match('/^user-agent\s*:\s*(.*)\s*$/i', $line, $r)) {
|
||||
if (!isset($robots[$robot = $r[1]]))
|
||||
if (empty($robots[$robot = $r[1]]))
|
||||
$robots[$robot] = array('disallow' => array(), 'allow' => array());
|
||||
} else if (preg_match('/((dis)?allow)\s*:\s*(.*)\s*$/i', $line, $r))
|
||||
$robots[$robot][strtolower($r[1])][] = OS_formatURL($r[3], $url);
|
||||
|
@ -1284,7 +1301,7 @@ while ($_cURL && count($_QUEUE)) {
|
|||
}
|
||||
}
|
||||
|
||||
if ($_DEBUGMODE)
|
||||
if ($_RDATA['debug'])
|
||||
OS_crawlLog('Memory used: '.OS_readSize(memory_get_usage(true)), 1);
|
||||
|
||||
OS_crawlLog('Crawling: '.$url.' (Depth: '.$depth.')', 1);
|
||||
|
@ -1319,7 +1336,7 @@ while ($_cURL && count($_QUEUE)) {
|
|||
if (!$data['info']['noindex']) {
|
||||
|
||||
// Prevent duplicate content
|
||||
if (isset($_RDATA['sp_sha1'][$data['info']['sha1']])) {
|
||||
if (!empty($_RDATA['sp_sha1'][$data['info']['sha1']])) {
|
||||
OS_crawlLog('Content is a duplicate of already indexed page: '.$_RDATA['sp_sha1'][$data['info']['sha1']], 2);
|
||||
OS_crawlLog('Consider editing faulty redirects, or setting a \'canonical\' <link> element to avoid this', 0);
|
||||
|
||||
|
@ -1361,7 +1378,7 @@ while ($_cURL && count($_QUEUE)) {
|
|||
if ($data['info']['filetime'] <= 0)
|
||||
$data['info']['filetime'] = time();
|
||||
|
||||
if (isset($row['url'])) {
|
||||
if (!empty($row['url'])) {
|
||||
$_RDATA['sp_status']['Updated']++;
|
||||
} else $_RDATA['sp_status']['New']++;
|
||||
|
||||
|
@ -1371,7 +1388,7 @@ while ($_cURL && count($_QUEUE)) {
|
|||
$data['info']['filetime'] = $row['last_modified'];
|
||||
}
|
||||
|
||||
$port = (isset($data['url']['port']) && !is_null($data['url']['port'])) ? ':'.$data['url']['port'] : '';
|
||||
$port = (!empty($data['url']['port'])) ? ':'.$data['url']['port'] : '';
|
||||
$insertTemp->execute(array(
|
||||
'url' => $url,
|
||||
'url_base' => $data['url']['scheme'].'://'.$data['url']['host'].$port,
|
||||
|
|
|
@ -35,7 +35,7 @@ if ($_RDATA['s_searchable_pages']) {
|
|||
$_TEMPLATE->searchable = new stdClass();
|
||||
$_TEMPLATE->searchable->form_action = $_SERVER['REQUEST_URI'];
|
||||
|
||||
if (!isset($_REQUEST['c']) || !isset($_RDATA['s_category_list'][$_REQUEST['c']]))
|
||||
if (empty($_REQUEST['c']) || empty($_RDATA['s_category_list'][$_REQUEST['c']]))
|
||||
$_REQUEST['c'] = '<none>';
|
||||
|
||||
if (count($_RDATA['s_category_list']) > 2) {
|
||||
|
@ -50,8 +50,7 @@ if ($_RDATA['s_searchable_pages']) {
|
|||
}
|
||||
}
|
||||
|
||||
if (!isset($_REQUEST['q']) || !is_string($_REQUEST['q']))
|
||||
$_REQUEST['q'] = '';
|
||||
if (empty($_REQUEST['q'])) $_REQUEST['q'] = '';
|
||||
|
||||
$_REQUEST['q'] = preg_replace(array('/\s/', '/ {2,}/'), ' ', trim($_REQUEST['q']));
|
||||
|
||||
|
@ -87,8 +86,8 @@ if ($_RDATA['s_searchable_pages']) {
|
|||
// Just count it as a 'phrase' of one word, functionally equivalent
|
||||
$_SDATA['terms'][] = array('phrase', substr($t, 1), false);
|
||||
|
||||
// Leading - or ! means negative, a MUST exclude
|
||||
} else if ($t[0] == '-' || $t[0] == '!') {
|
||||
// Leading - means negative, a MUST exclude
|
||||
} else if ($t[0] == '-') {
|
||||
$_SDATA['terms'][] = array('exclude', substr($t, 1), false);
|
||||
|
||||
// Restrict to a specific filetype (not yet implemented)
|
||||
|
@ -701,7 +700,7 @@ if ($_ODATA['sp_interval'] &&
|
|||
curl_setopt($_cURL, CURLOPT_CONNECTTIMEOUT, 1);
|
||||
curl_setopt($_cURL, CURLOPT_TIMEOUT, 1);
|
||||
|
||||
$crawlerDir = str_replace($_SERVER['DOCUMENT_ROOT'], '', __DIR__);
|
||||
$crawlerDir = str_replace($_ODATA['admin_install_root'], '', __DIR__);
|
||||
$crawlerURL = $_ODATA['admin_install_domain'].$crawlerDir.'/crawler.php';
|
||||
curl_setopt($_cURL, CURLOPT_URL, str_replace(' ', '%20', $crawlerURL));
|
||||
|
||||
|
|
Loading…
Reference in a new issue