Daily updates
Keep the Page Index pagination page within limits; add a UTF-8 BOM to CSV and TXT download output; use the utf8mb4_unicode_520_ci collation to remove the need for SQL REGEXP; add more Latin accent-equivalent characters.
This commit is contained in:
parent
761491c21a
commit
ba04173c29
|
@ -9,7 +9,7 @@ require __DIR__.'/config.php';
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Display a time since' HTML/Javascript counter
|
* Display a 'time since' HTML/Javascript counter
|
||||||
*
|
*
|
||||||
*/
|
*/
|
||||||
function OS_countUp($time) {
|
function OS_countUp($time) {
|
||||||
|
@ -49,11 +49,9 @@ function OS_countUp($time) {
|
||||||
|
|
||||||
|
|
||||||
// ***** Load Maxmind GeoIP2
|
// ***** Load Maxmind GeoIP2
|
||||||
if (!class_exists('GeoIp2\Database\Reader')) {
|
if (!class_exists('GeoIp2\Database\Reader'))
|
||||||
if (file_exists(__DIR__.'/geoip2/geoip2.phar')) {
|
if (file_exists(__DIR__.'/geoip2/geoip2.phar'))
|
||||||
include __DIR__.'/geoip2/geoip2.phar';
|
include __DIR__.'/geoip2/geoip2.phar';
|
||||||
}
|
|
||||||
}
|
|
||||||
if (class_exists('GeoIp2\Database\Reader')) {
|
if (class_exists('GeoIp2\Database\Reader')) {
|
||||||
if (file_exists(__DIR__.'/geoip2/GeoLite2-Country.mmdb'))
|
if (file_exists(__DIR__.'/geoip2/GeoLite2-Country.mmdb'))
|
||||||
$_GEOIP2 = new GeoIp2\Database\Reader(__DIR__.'/geoip2/GeoLite2-Country.mmdb');
|
$_GEOIP2 = new GeoIp2\Database\Reader(__DIR__.'/geoip2/GeoLite2-Country.mmdb');
|
||||||
|
@ -120,13 +118,15 @@ $_RDATA['index_status_list'] = array(
|
||||||
if (empty($_SESSION['admin_page']) || empty($_RDATA['admin_pages'][$_SESSION['admin_page']]))
|
if (empty($_SESSION['admin_page']) || empty($_RDATA['admin_pages'][$_SESSION['admin_page']]))
|
||||||
$_SESSION['admin_page'] = 'crawler';
|
$_SESSION['admin_page'] = 'crawler';
|
||||||
|
|
||||||
if (empty($_SESSION['index_page'])) $_SESSION['index_page'] = 1;
|
if (!isset($_SESSION['index_page'])) $_SESSION['index_page'] = 1;
|
||||||
if (empty($_SESSION['index_filter_category'])) $_SESSION['index_filter_category'] = '<none>';
|
if (empty($_SESSION['index_filter_category'])) $_SESSION['index_filter_category'] = '<none>';
|
||||||
if (empty($_SESSION['index_filter_status'])) $_SESSION['index_filter_status'] = '<none>';
|
if (empty($_SESSION['index_filter_status'])) $_SESSION['index_filter_status'] = '<none>';
|
||||||
if (empty($_SESSION['index_filter_text'])) $_SESSION['index_filter_text'] = '';
|
if (empty($_SESSION['index_filter_text'])) $_SESSION['index_filter_text'] = '';
|
||||||
if (empty($_SESSION['admin_username'])) $_SESSION['admin_username'] = '';
|
if (empty($_SESSION['admin_username'])) $_SESSION['admin_username'] = '';
|
||||||
|
|
||||||
if (!$_SESSION['admin_username']) {
|
if (!$_SESSION['admin_username']) {
|
||||||
|
|
||||||
|
// If we are logging in
|
||||||
if ($_SERVER['REQUEST_METHOD'] == 'POST') {
|
if ($_SERVER['REQUEST_METHOD'] == 'POST') {
|
||||||
if (!empty($_POST['os_submit']) && $_POST['os_submit'] == 'os_admin_login') {
|
if (!empty($_POST['os_submit']) && $_POST['os_submit'] == 'os_admin_login') {
|
||||||
if (empty($_POST['os_admin_username'])) $_POST['os_admin_username'] = '';
|
if (empty($_POST['os_admin_username'])) $_POST['os_admin_username'] = '';
|
||||||
|
@ -203,6 +203,10 @@ if (!$_SESSION['admin_username']) {
|
||||||
header('Content-disposition: attachment; filename="'.
|
header('Content-disposition: attachment; filename="'.
|
||||||
'crawl-log'.$_POST->grep.'_'.date('Y-m-d', $_ODATA['sp_time_end']).'.txt"');
|
'crawl-log'.$_POST->grep.'_'.date('Y-m-d', $_ODATA['sp_time_end']).'.txt"');
|
||||||
|
|
||||||
|
// UTF-8 byte order mark
|
||||||
|
if (strtolower($_ODATA['s_charset']) == 'utf-8')
|
||||||
|
echo "\xEF\xBB\xBF";
|
||||||
|
|
||||||
die(implode("\n", $lines));
|
die(implode("\n", $lines));
|
||||||
|
|
||||||
} else {
|
} else {
|
||||||
|
@ -236,6 +240,10 @@ if (!$_SESSION['admin_username']) {
|
||||||
|
|
||||||
$output = fopen('php://output', 'w');
|
$output = fopen('php://output', 'w');
|
||||||
|
|
||||||
|
// UTF-8 byte order mark
|
||||||
|
if (strtolower($_ODATA['s_charset']) == 'utf-8')
|
||||||
|
fwrite($output, "\xEF\xBB\xBF");
|
||||||
|
|
||||||
$headings = array('Query', 'Results', 'Time Stamp', 'IP');
|
$headings = array('Query', 'Results', 'Time Stamp', 'IP');
|
||||||
if ($_GEOIP2) $headings[] = 'Country';
|
if ($_GEOIP2) $headings[] = 'Country';
|
||||||
|
|
||||||
|
@ -247,8 +255,8 @@ if (!$_SESSION['admin_username']) {
|
||||||
try {
|
try {
|
||||||
$geo = $_GEOIP2->country($line['ipaddr']);
|
$geo = $_GEOIP2->country($line['ipaddr']);
|
||||||
} catch(Exception $e) { $geo = false; }
|
} catch(Exception $e) { $geo = false; }
|
||||||
} else $geo = false;
|
$line['country'] = ($geo) ? $geo->raw['country']['names']['en'] : '';
|
||||||
if ($geo) $line['country'] = $geo->raw['country']['names']['en'];
|
}
|
||||||
|
|
||||||
fputcsv($output, $line);
|
fputcsv($output, $line);
|
||||||
}
|
}
|
||||||
|
@ -1491,7 +1499,7 @@ document.write(mustache.render(
|
||||||
$_SESSION['admin_page'] = $_GET['page'];
|
$_SESSION['admin_page'] = $_GET['page'];
|
||||||
|
|
||||||
// Select a new page within the Page Index list
|
// Select a new page within the Page Index list
|
||||||
} else if (!empty($_GET['ipage'])) {
|
} else if (isset($_GET['ipage'])) {
|
||||||
$_GET['ipage'] = (int)$_GET['ipage'];
|
$_GET['ipage'] = (int)$_GET['ipage'];
|
||||||
$_SESSION['index_page'] = $_GET['ipage'];
|
$_SESSION['index_page'] = $_GET['ipage'];
|
||||||
|
|
||||||
|
@ -1586,8 +1594,16 @@ document.write(mustache.render(
|
||||||
$_RDATA['page_index_found_rows'] = $foundRows[0][0];
|
$_RDATA['page_index_found_rows'] = $foundRows[0][0];
|
||||||
|
|
||||||
$_RDATA['index_pages'] = ceil($_RDATA['page_index_found_rows'] / $_ODATA['admin_index_pagination']);
|
$_RDATA['index_pages'] = ceil($_RDATA['page_index_found_rows'] / $_ODATA['admin_index_pagination']);
|
||||||
|
|
||||||
|
// If the requested page is outside page limit
|
||||||
|
if ($_SESSION['index_page'] != 1 && ($_SESSION['index_page'] > $_RDATA['index_pages'] || $_SESSION['index_page'] < 1)) {
|
||||||
$_SESSION['index_page'] = max(1, min($_RDATA['index_pages'], (int)$_SESSION['index_page']));
|
$_SESSION['index_page'] = max(1, min($_RDATA['index_pages'], (int)$_SESSION['index_page']));
|
||||||
|
|
||||||
|
// Redirect to a page within the limits
|
||||||
|
header('Location: '.$_SERVER['REQUEST_URI'].'?ipage='.$_SESSION['index_page']);
|
||||||
|
exit();
|
||||||
|
}
|
||||||
|
|
||||||
} else $_SESSION['error'][] = 'Database did not return a search table row count.';
|
} else $_SESSION['error'][] = 'Database did not return a search table row count.';
|
||||||
} else $_SESSION['error'][] = 'Database error reading search table row count: '.$err[2];
|
} else $_SESSION['error'][] = 'Database error reading search table row count: '.$err[2];
|
||||||
} else $_SESSION['error'][] = 'Database error reading search table: '.$err[2];
|
} else $_SESSION['error'][] = 'Database error reading search table: '.$err[2];
|
||||||
|
@ -2787,10 +2803,14 @@ document.write(mustache.render(
|
||||||
<label class="d-flex lh-lg w-100 mb-2">
|
<label class="d-flex lh-lg w-100 mb-2">
|
||||||
<strong class="pe-2">Output Encoding:</strong>
|
<strong class="pe-2">Output Encoding:</strong>
|
||||||
<span class="flex-grow-1 text-end">
|
<span class="flex-grow-1 text-end">
|
||||||
<input type="text" name="os_s_charset" value="<?php echo $_ODATA['s_charset']; ?>" pattern="[\w\d-]*" class="form-control d-inline-block w-auto mw-10em"
|
<input type="text" name="os_s_charset" value="<?php echo $_ODATA['s_charset']; ?>" pattern="[\w\d-]*" class="form-control d-inline-block w-auto mw-10em" aria-labelledby="os_s_charset_text">
|
||||||
data-bs-toggle="tooltip" data-bs-placement="bottom" title="This value should match the encoding of your search results page, and ideally match the character encoding of most of your crawled pages.">
|
|
||||||
</span>
|
</span>
|
||||||
</label>
|
</label>
|
||||||
|
<p id="os_s_charset_text" class="form-text">
|
||||||
|
The <em>Output Encoding</em> value should match the encoding of your search results page, and
|
||||||
|
ideally match the character encoding of most of your crawled pages. UTF-8 is strongly
|
||||||
|
recommended.
|
||||||
|
</p>
|
||||||
<label class="d-flex lh-lg w-100 mb-2">
|
<label class="d-flex lh-lg w-100 mb-2">
|
||||||
<strong class="pe-2">Maximum Returned Results:</strong>
|
<strong class="pe-2">Maximum Returned Results:</strong>
|
||||||
<span class="flex-grow-1 text-end text-nowrap">
|
<span class="flex-grow-1 text-end text-nowrap">
|
||||||
|
@ -2809,7 +2829,7 @@ document.write(mustache.render(
|
||||||
<strong class="pe-2">Maximum Matched Text (characters):</strong>
|
<strong class="pe-2">Maximum Matched Text (characters):</strong>
|
||||||
<span class="flex-grow-1 text-end text-nowrap">
|
<span class="flex-grow-1 text-end text-nowrap">
|
||||||
<input type="number" name="os_s_limit_matchtext" value="<?php echo $_ODATA['s_limit_matchtext']; ?>" min="0" max="65535" step="1" class="form-control d-inline-block"
|
<input type="number" name="os_s_limit_matchtext" value="<?php echo $_ODATA['s_limit_matchtext']; ?>" min="0" max="65535" step="1" class="form-control d-inline-block"
|
||||||
data-bs-toggle="tooltip" data-bs-placement="top" title="Maximum amount of matched 'description' text to display beneath the title / URL of each search result.">
|
data-bs-toggle="tooltip" data-bs-placement="top" title="Maximum amount of matched 'description' text to display beneath the heading of each search result.">
|
||||||
</span>
|
</span>
|
||||||
</label>
|
</label>
|
||||||
<div class="row">
|
<div class="row">
|
||||||
|
@ -3099,9 +3119,10 @@ document.write(mustache.render(
|
||||||
* Not logged in; Show login page ****************************** */
|
* Not logged in; Show login page ****************************** */
|
||||||
} else { ?>
|
} else { ?>
|
||||||
<section class="row justify-content-center">
|
<section class="row justify-content-center">
|
||||||
<header class="col-12 mb-2">
|
<header class="col-10 col-sm-8 col-md-6 col-lg-5 col-xl-4 mb-2">
|
||||||
<h2>Welcome</h2>
|
<h2>Welcome</h2>
|
||||||
</header>
|
</header>
|
||||||
|
<div class="w-100"></div>
|
||||||
|
|
||||||
<div class="col-10 col-sm-8 col-md-6 col-lg-5 col-xl-4">
|
<div class="col-10 col-sm-8 col-md-6 col-lg-5 col-xl-4">
|
||||||
<form action="<?php echo $_SERVER['REQUEST_URI']; ?>" method="post"
|
<form action="<?php echo $_SERVER['REQUEST_URI']; ?>" method="post"
|
||||||
|
|
|
@ -101,7 +101,7 @@ if (!in_array($_DDATA['tbprefix'].'config', $_DDATA['tables'])) {
|
||||||
`jw_hostname` TINYTEXT NOT NULL,
|
`jw_hostname` TINYTEXT NOT NULL,
|
||||||
`jw_compression` TINYINT UNSIGNED NOT NULL,
|
`jw_compression` TINYINT UNSIGNED NOT NULL,
|
||||||
PRIMARY KEY (`version`)
|
PRIMARY KEY (`version`)
|
||||||
) ENGINE = MyISAM, CHARACTER SET = utf8mb4, COLLATE = utf8mb4_unicode_ci;'
|
) ENGINE = MyISAM, CHARACTER SET = utf8mb4, COLLATE = utf8mb4_unicode_520_ci;'
|
||||||
);
|
);
|
||||||
$err = $create->errorInfo();
|
$err = $create->errorInfo();
|
||||||
if ($err[0] != '00000')
|
if ($err[0] != '00000')
|
||||||
|
@ -211,7 +211,7 @@ if (!in_array($_DDATA['tbprefix'].'crawldata', $_DDATA['tables'])) {
|
||||||
`last_modified` INT NOT NULL,
|
`last_modified` INT NOT NULL,
|
||||||
`priority` DECIMAL(2,1) NOT NULL,
|
`priority` DECIMAL(2,1) NOT NULL,
|
||||||
UNIQUE `content_checksum` (`content_checksum`)
|
UNIQUE `content_checksum` (`content_checksum`)
|
||||||
) ENGINE = MyISAM, CHARACTER SET = utf8mb4, COLLATE = utf8mb4_unicode_ci;'
|
) ENGINE = MyISAM, CHARACTER SET = utf8mb4, COLLATE = utf8mb4_unicode_520_ci;'
|
||||||
);
|
);
|
||||||
$err = $create->errorInfo();
|
$err = $create->errorInfo();
|
||||||
if ($err[0] != '00000')
|
if ($err[0] != '00000')
|
||||||
|
@ -227,7 +227,7 @@ if (!in_array($_DDATA['tbprefix'].'query', $_DDATA['tables'])) {
|
||||||
`stamp` INT UNSIGNED NOT NULL,
|
`stamp` INT UNSIGNED NOT NULL,
|
||||||
`ip` INT UNSIGNED NOT NULL,
|
`ip` INT UNSIGNED NOT NULL,
|
||||||
`cache` MEDIUMBLOB NOT NULL
|
`cache` MEDIUMBLOB NOT NULL
|
||||||
) ENGINE = MyISAM, CHARACTER SET = utf8mb4, COLLATE = utf8mb4_unicode_ci;'
|
) ENGINE = MyISAM, CHARACTER SET = utf8mb4, COLLATE = utf8mb4_unicode_520_ci;'
|
||||||
);
|
);
|
||||||
$err = $create->errorInfo();
|
$err = $create->errorInfo();
|
||||||
if ($err[0] != '00000')
|
if ($err[0] != '00000')
|
||||||
|
@ -725,10 +725,11 @@ $_RDATA['s_latin'] = array(
|
||||||
'color' => array('colour'),
|
'color' => array('colour'),
|
||||||
'fiber' => array('fibre'),
|
'fiber' => array('fibre'),
|
||||||
|
|
||||||
'ae' => array('æ', 'Æ'),
|
'ae' => array('æ', 'Æ', 'ä', 'Ä'),
|
||||||
'oe' => array('œ', 'Œ'),
|
'oe' => array('œ', 'Œ', 'ö', 'Ö', 'ø', 'Ø'),
|
||||||
'ss' => array('ß'),
|
'ss' => array('ß'),
|
||||||
'th' => array('þ', 'Þ'),
|
'th' => array('þ', 'Þ'),
|
||||||
|
'ue' => array('ü', 'Ü'),
|
||||||
'a' => array('á', 'Á', 'à', 'À', 'â', 'Â', 'ä', 'Ä', 'ã', 'Ã', 'å', 'Å', 'ą', 'Ą', 'ă', 'Ă'),
|
'a' => array('á', 'Á', 'à', 'À', 'â', 'Â', 'ä', 'Ä', 'ã', 'Ã', 'å', 'Å', 'ą', 'Ą', 'ă', 'Ă'),
|
||||||
'c' => array('ç', 'Ç', 'ć', 'Ć', 'č', 'Č'),
|
'c' => array('ç', 'Ç', 'ć', 'Ć', 'č', 'Č'),
|
||||||
'd' => array('ð', 'Ð', 'ď', 'Ď', 'đ', 'Đ'),
|
'd' => array('ð', 'Ð', 'ď', 'Ď', 'đ', 'Đ'),
|
||||||
|
|
|
@ -1335,7 +1335,6 @@ while ($_cURL && count($_RDATA['sp_queue'])) {
|
||||||
} else {
|
} else {
|
||||||
$data['errno'] = 702;
|
$data['errno'] = 702;
|
||||||
$data['error'] = 'PDF is empty of extractable text';
|
$data['error'] = 'PDF is empty of extractable text';
|
||||||
$data['content'] = '';
|
|
||||||
$data['info']['noindex'] = 'empty-pdf';
|
$data['info']['noindex'] = 'empty-pdf';
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -255,9 +255,7 @@ if ($_RDATA['s_searchable_pages']) {
|
||||||
$filetypes = array();
|
$filetypes = array();
|
||||||
foreach ($_SDATA['terms'] as list($type, $term, $pcre)) {
|
foreach ($_SDATA['terms'] as list($type, $term, $pcre)) {
|
||||||
|
|
||||||
// Regexp only for SQL use
|
$slashTerm = addslashes($term);
|
||||||
$pterm = preg_quote(strtolower($term), '\'');
|
|
||||||
$pterm = strtr($pterm, $_RDATA['s_latin_pcre']);
|
|
||||||
|
|
||||||
switch ($type) {
|
switch ($type) {
|
||||||
case 'filetype':
|
case 'filetype':
|
||||||
|
@ -267,32 +265,32 @@ if ($_RDATA['s_searchable_pages']) {
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case 'exclude':
|
case 'exclude':
|
||||||
$negs[] = '`content` NOT REGEXP \''.$pterm.'\'';
|
$negs[] = 'INSTR(`content`, \''.$slashTerm.'\')=0';
|
||||||
$negs[] = '`url` NOT REGEXP \''.$pterm.'\'';
|
$negs[] = 'INSTR(`url`, \''.$slashTerm.'\')=0';
|
||||||
$negs[] = '`title` NOT REGEXP \''.$pterm.'\'';
|
$negs[] = 'INSTR(`title`, \''.$slashTerm.'\')=0';
|
||||||
$negs[] = '`description` NOT REGEXP \''.$pterm.'\'';
|
$negs[] = 'INSTR(`description`, \''.$slashTerm.'\')=0';
|
||||||
$negs[] = '`keywords` NOT REGEXP \''.$pterm.'\'';
|
$negs[] = 'INSTR(`keywords`, \''.$slashTerm.'\')=0';
|
||||||
$negs[] = '`weighted` NOT REGEXP \''.$pterm.'\'';
|
$negs[] = 'INSTR(`weighted`, \''.$slashTerm.'\')=0';
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case 'phrase':
|
case 'phrase':
|
||||||
$ands[] = '('.implode(' OR ', array(
|
$ands[] = '('.implode(' OR ', array(
|
||||||
'`content` REGEXP \''.$pterm.'\'',
|
'INSTR(`content`, \''.$slashTerm.'\')',
|
||||||
'`url` REGEXP \''.$pterm.'\'',
|
'INSTR(`url`, \''.$slashTerm.'\')',
|
||||||
'`title` REGEXP \''.$pterm.'\'',
|
'INSTR(`title`, \''.$slashTerm.'\')',
|
||||||
'`description` REGEXP \''.$pterm.'\'',
|
'INSTR(`description`, \''.$slashTerm.'\')',
|
||||||
'`keywords` REGEXP \''.$pterm.'\'',
|
'INSTR(`keywords`, \''.$slashTerm.'\')',
|
||||||
'`weighted` REGEXP \''.$pterm.'\''
|
'INSTR(`weighted`, \''.$slashTerm.'\')'
|
||||||
)).')';
|
)).')';
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case 'term':
|
case 'term':
|
||||||
$ors[] = '`content` REGEXP \''.$pterm.'\'';
|
$ors[] = 'INSTR(`content`, \''.$slashTerm.'\')';
|
||||||
$ors[] = '`url` REGEXP \''.$pterm.'\'';
|
$ors[] = 'INSTR(`url`, \''.$slashTerm.'\')';
|
||||||
$ors[] = '`title` REGEXP \''.$pterm.'\'';
|
$ors[] = 'INSTR(`title`, \''.$slashTerm.'\')';
|
||||||
$ors[] = '`description` REGEXP \''.$pterm.'\'';
|
$ors[] = 'INSTR(`description`, \''.$slashTerm.'\')';
|
||||||
$ors[] = '`keywords` REGEXP \''.$pterm.'\'';
|
$ors[] = 'INSTR(`keywords`, \''.$slashTerm.'\')';
|
||||||
$ors[] = '`weighted` REGEXP \''.$pterm.'\'';
|
$ors[] = 'INSTR(`weighted`, \''.$slashTerm.'\')';
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in a new issue