Daily updates

Keep the Page Index pagination page within limits; add a UTF-8 BOM to CSV and TXT download output; use the utf8mb4_unicode_520_ci collation to remove the need for SQL REGEXP; add more Latin accent-equivalent characters.
This commit is contained in:
Brian Huisman 2023-04-26 15:16:13 -04:00
parent 761491c21a
commit ba04173c29
4 changed files with 60 additions and 41 deletions

View file

@ -9,7 +9,7 @@ require __DIR__.'/config.php';
/** /**
* Display a time since' HTML/Javascript counter * Display a 'time since' HTML/Javascript counter
* *
*/ */
function OS_countUp($time) { function OS_countUp($time) {
@ -49,11 +49,9 @@ function OS_countUp($time) {
// ***** Load Maxmind GeoIP2 // ***** Load Maxmind GeoIP2
if (!class_exists('GeoIp2\Database\Reader')) { if (!class_exists('GeoIp2\Database\Reader'))
if (file_exists(__DIR__.'/geoip2/geoip2.phar')) { if (file_exists(__DIR__.'/geoip2/geoip2.phar'))
include __DIR__.'/geoip2/geoip2.phar'; include __DIR__.'/geoip2/geoip2.phar';
}
}
if (class_exists('GeoIp2\Database\Reader')) { if (class_exists('GeoIp2\Database\Reader')) {
if (file_exists(__DIR__.'/geoip2/GeoLite2-Country.mmdb')) if (file_exists(__DIR__.'/geoip2/GeoLite2-Country.mmdb'))
$_GEOIP2 = new GeoIp2\Database\Reader(__DIR__.'/geoip2/GeoLite2-Country.mmdb'); $_GEOIP2 = new GeoIp2\Database\Reader(__DIR__.'/geoip2/GeoLite2-Country.mmdb');
@ -120,13 +118,15 @@ $_RDATA['index_status_list'] = array(
if (empty($_SESSION['admin_page']) || empty($_RDATA['admin_pages'][$_SESSION['admin_page']])) if (empty($_SESSION['admin_page']) || empty($_RDATA['admin_pages'][$_SESSION['admin_page']]))
$_SESSION['admin_page'] = 'crawler'; $_SESSION['admin_page'] = 'crawler';
if (empty($_SESSION['index_page'])) $_SESSION['index_page'] = 1; if (!isset($_SESSION['index_page'])) $_SESSION['index_page'] = 1;
if (empty($_SESSION['index_filter_category'])) $_SESSION['index_filter_category'] = '<none>'; if (empty($_SESSION['index_filter_category'])) $_SESSION['index_filter_category'] = '<none>';
if (empty($_SESSION['index_filter_status'])) $_SESSION['index_filter_status'] = '<none>'; if (empty($_SESSION['index_filter_status'])) $_SESSION['index_filter_status'] = '<none>';
if (empty($_SESSION['index_filter_text'])) $_SESSION['index_filter_text'] = ''; if (empty($_SESSION['index_filter_text'])) $_SESSION['index_filter_text'] = '';
if (empty($_SESSION['admin_username'])) $_SESSION['admin_username'] = ''; if (empty($_SESSION['admin_username'])) $_SESSION['admin_username'] = '';
if (!$_SESSION['admin_username']) { if (!$_SESSION['admin_username']) {
// If we are logging in
if ($_SERVER['REQUEST_METHOD'] == 'POST') { if ($_SERVER['REQUEST_METHOD'] == 'POST') {
if (!empty($_POST['os_submit']) && $_POST['os_submit'] == 'os_admin_login') { if (!empty($_POST['os_submit']) && $_POST['os_submit'] == 'os_admin_login') {
if (empty($_POST['os_admin_username'])) $_POST['os_admin_username'] = ''; if (empty($_POST['os_admin_username'])) $_POST['os_admin_username'] = '';
@ -203,6 +203,10 @@ if (!$_SESSION['admin_username']) {
header('Content-disposition: attachment; filename="'. header('Content-disposition: attachment; filename="'.
'crawl-log'.$_POST->grep.'_'.date('Y-m-d', $_ODATA['sp_time_end']).'.txt"'); 'crawl-log'.$_POST->grep.'_'.date('Y-m-d', $_ODATA['sp_time_end']).'.txt"');
// UTF-8 byte order mark
if (strtolower($_ODATA['s_charset']) == 'utf-8')
echo "\xEF\xBB\xBF";
die(implode("\n", $lines)); die(implode("\n", $lines));
} else { } else {
@ -236,6 +240,10 @@ if (!$_SESSION['admin_username']) {
$output = fopen('php://output', 'w'); $output = fopen('php://output', 'w');
// UTF-8 byte order mark
if (strtolower($_ODATA['s_charset']) == 'utf-8')
fwrite($output, "\xEF\xBB\xBF");
$headings = array('Query', 'Results', 'Time Stamp', 'IP'); $headings = array('Query', 'Results', 'Time Stamp', 'IP');
if ($_GEOIP2) $headings[] = 'Country'; if ($_GEOIP2) $headings[] = 'Country';
@ -247,8 +255,8 @@ if (!$_SESSION['admin_username']) {
try { try {
$geo = $_GEOIP2->country($line['ipaddr']); $geo = $_GEOIP2->country($line['ipaddr']);
} catch(Exception $e) { $geo = false; } } catch(Exception $e) { $geo = false; }
} else $geo = false; $line['country'] = ($geo) ? $geo->raw['country']['names']['en'] : '';
if ($geo) $line['country'] = $geo->raw['country']['names']['en']; }
fputcsv($output, $line); fputcsv($output, $line);
} }
@ -1491,7 +1499,7 @@ document.write(mustache.render(
$_SESSION['admin_page'] = $_GET['page']; $_SESSION['admin_page'] = $_GET['page'];
// Select a new page within the Page Index list // Select a new page within the Page Index list
} else if (!empty($_GET['ipage'])) { } else if (isset($_GET['ipage'])) {
$_GET['ipage'] = (int)$_GET['ipage']; $_GET['ipage'] = (int)$_GET['ipage'];
$_SESSION['index_page'] = $_GET['ipage']; $_SESSION['index_page'] = $_GET['ipage'];
@ -1586,7 +1594,15 @@ document.write(mustache.render(
$_RDATA['page_index_found_rows'] = $foundRows[0][0]; $_RDATA['page_index_found_rows'] = $foundRows[0][0];
$_RDATA['index_pages'] = ceil($_RDATA['page_index_found_rows'] / $_ODATA['admin_index_pagination']); $_RDATA['index_pages'] = ceil($_RDATA['page_index_found_rows'] / $_ODATA['admin_index_pagination']);
$_SESSION['index_page'] = max(1, min($_RDATA['index_pages'], (int)$_SESSION['index_page']));
// If the requested page is outside page limit
if ($_SESSION['index_page'] != 1 && ($_SESSION['index_page'] > $_RDATA['index_pages'] || $_SESSION['index_page'] < 1)) {
$_SESSION['index_page'] = max(1, min($_RDATA['index_pages'], (int)$_SESSION['index_page']));
// Redirect to a page within the limits
header('Location: '.$_SERVER['REQUEST_URI'].'?ipage='.$_SESSION['index_page']);
exit();
}
} else $_SESSION['error'][] = 'Database did not return a search table row count.'; } else $_SESSION['error'][] = 'Database did not return a search table row count.';
} else $_SESSION['error'][] = 'Database error reading search table row count: '.$err[2]; } else $_SESSION['error'][] = 'Database error reading search table row count: '.$err[2];
@ -2787,10 +2803,14 @@ document.write(mustache.render(
<label class="d-flex lh-lg w-100 mb-2"> <label class="d-flex lh-lg w-100 mb-2">
<strong class="pe-2">Output Encoding:</strong> <strong class="pe-2">Output Encoding:</strong>
<span class="flex-grow-1 text-end"> <span class="flex-grow-1 text-end">
<input type="text" name="os_s_charset" value="<?php echo $_ODATA['s_charset']; ?>" pattern="[\w\d-]*" class="form-control d-inline-block w-auto mw-10em" <input type="text" name="os_s_charset" value="<?php echo $_ODATA['s_charset']; ?>" pattern="[\w\d-]*" class="form-control d-inline-block w-auto mw-10em" aria-labelledby="os_s_charset_text">
data-bs-toggle="tooltip" data-bs-placement="bottom" title="This value should match the encoding of your search results page, and ideally match the character encoding of most of your crawled pages.">
</span> </span>
</label> </label>
<p id="os_s_charset_text" class="form-text">
The <em>Output Encoding</em> value should match the encoding of your search results page, and
ideally match the character encoding of most of your crawled pages. UTF-8 is strongly
recommended.
</p>
<label class="d-flex lh-lg w-100 mb-2"> <label class="d-flex lh-lg w-100 mb-2">
<strong class="pe-2">Maximum Returned Results:</strong> <strong class="pe-2">Maximum Returned Results:</strong>
<span class="flex-grow-1 text-end text-nowrap"> <span class="flex-grow-1 text-end text-nowrap">
@ -2809,7 +2829,7 @@ document.write(mustache.render(
<strong class="pe-2">Maximum Matched Text (characters):</strong> <strong class="pe-2">Maximum Matched Text (characters):</strong>
<span class="flex-grow-1 text-end text-nowrap"> <span class="flex-grow-1 text-end text-nowrap">
<input type="number" name="os_s_limit_matchtext" value="<?php echo $_ODATA['s_limit_matchtext']; ?>" min="0" max="65535" step="1" class="form-control d-inline-block" <input type="number" name="os_s_limit_matchtext" value="<?php echo $_ODATA['s_limit_matchtext']; ?>" min="0" max="65535" step="1" class="form-control d-inline-block"
data-bs-toggle="tooltip" data-bs-placement="top" title="Maximum amount of matched 'description' text to display beneath the title / URL of each search result."> data-bs-toggle="tooltip" data-bs-placement="top" title="Maximum amount of matched 'description' text to display beneath the heading of each search result.">
</span> </span>
</label> </label>
<div class="row"> <div class="row">
@ -3099,9 +3119,10 @@ document.write(mustache.render(
* Not logged in; Show login page ****************************** */ * Not logged in; Show login page ****************************** */
} else { ?> } else { ?>
<section class="row justify-content-center"> <section class="row justify-content-center">
<header class="col-12 mb-2"> <header class="col-10 col-sm-8 col-md-6 col-lg-5 col-xl-4 mb-2">
<h2>Welcome</h2> <h2>Welcome</h2>
</header> </header>
<div class="w-100"></div>
<div class="col-10 col-sm-8 col-md-6 col-lg-5 col-xl-4"> <div class="col-10 col-sm-8 col-md-6 col-lg-5 col-xl-4">
<form action="<?php echo $_SERVER['REQUEST_URI']; ?>" method="post" <form action="<?php echo $_SERVER['REQUEST_URI']; ?>" method="post"

View file

@ -101,7 +101,7 @@ if (!in_array($_DDATA['tbprefix'].'config', $_DDATA['tables'])) {
`jw_hostname` TINYTEXT NOT NULL, `jw_hostname` TINYTEXT NOT NULL,
`jw_compression` TINYINT UNSIGNED NOT NULL, `jw_compression` TINYINT UNSIGNED NOT NULL,
PRIMARY KEY (`version`) PRIMARY KEY (`version`)
) ENGINE = MyISAM, CHARACTER SET = utf8mb4, COLLATE = utf8mb4_unicode_ci;' ) ENGINE = MyISAM, CHARACTER SET = utf8mb4, COLLATE = utf8mb4_unicode_520_ci;'
); );
$err = $create->errorInfo(); $err = $create->errorInfo();
if ($err[0] != '00000') if ($err[0] != '00000')
@ -211,7 +211,7 @@ if (!in_array($_DDATA['tbprefix'].'crawldata', $_DDATA['tables'])) {
`last_modified` INT NOT NULL, `last_modified` INT NOT NULL,
`priority` DECIMAL(2,1) NOT NULL, `priority` DECIMAL(2,1) NOT NULL,
UNIQUE `content_checksum` (`content_checksum`) UNIQUE `content_checksum` (`content_checksum`)
) ENGINE = MyISAM, CHARACTER SET = utf8mb4, COLLATE = utf8mb4_unicode_ci;' ) ENGINE = MyISAM, CHARACTER SET = utf8mb4, COLLATE = utf8mb4_unicode_520_ci;'
); );
$err = $create->errorInfo(); $err = $create->errorInfo();
if ($err[0] != '00000') if ($err[0] != '00000')
@ -227,7 +227,7 @@ if (!in_array($_DDATA['tbprefix'].'query', $_DDATA['tables'])) {
`stamp` INT UNSIGNED NOT NULL, `stamp` INT UNSIGNED NOT NULL,
`ip` INT UNSIGNED NOT NULL, `ip` INT UNSIGNED NOT NULL,
`cache` MEDIUMBLOB NOT NULL `cache` MEDIUMBLOB NOT NULL
) ENGINE = MyISAM, CHARACTER SET = utf8mb4, COLLATE = utf8mb4_unicode_ci;' ) ENGINE = MyISAM, CHARACTER SET = utf8mb4, COLLATE = utf8mb4_unicode_520_ci;'
); );
$err = $create->errorInfo(); $err = $create->errorInfo();
if ($err[0] != '00000') if ($err[0] != '00000')
@ -725,10 +725,11 @@ $_RDATA['s_latin'] = array(
'color' => array('colour'), 'color' => array('colour'),
'fiber' => array('fibre'), 'fiber' => array('fibre'),
'ae' => array('æ', 'Æ'), 'ae' => array('æ', 'Æ', 'ä', 'Ä'),
'oe' => array('œ', 'Œ'), 'oe' => array('œ', 'Œ', 'ö', 'Ö', 'ø', 'Ø'),
'ss' => array('ß'), 'ss' => array('ß'),
'th' => array('þ', 'Þ'), 'th' => array('þ', 'Þ'),
'ue' => array('ü', 'Ü'),
'a' => array('á', 'Á', 'à', 'À', 'â', 'Â', 'ä', 'Ä', 'ã', 'Ã', 'å', 'Å', 'ą', 'Ą', 'ă', 'Ă'), 'a' => array('á', 'Á', 'à', 'À', 'â', 'Â', 'ä', 'Ä', 'ã', 'Ã', 'å', 'Å', 'ą', 'Ą', 'ă', 'Ă'),
'c' => array('ç', 'Ç', 'ć', 'Ć', 'č', 'Č'), 'c' => array('ç', 'Ç', 'ć', 'Ć', 'č', 'Č'),
'd' => array('ð', 'Ð', 'ď', 'Ď', 'đ', 'Đ'), 'd' => array('ð', 'Ð', 'ď', 'Ď', 'đ', 'Đ'),

View file

@ -1335,7 +1335,6 @@ while ($_cURL && count($_RDATA['sp_queue'])) {
} else { } else {
$data['errno'] = 702; $data['errno'] = 702;
$data['error'] = 'PDF is empty of extractable text'; $data['error'] = 'PDF is empty of extractable text';
$data['content'] = '';
$data['info']['noindex'] = 'empty-pdf'; $data['info']['noindex'] = 'empty-pdf';
} }

View file

@ -255,9 +255,7 @@ if ($_RDATA['s_searchable_pages']) {
$filetypes = array(); $filetypes = array();
foreach ($_SDATA['terms'] as list($type, $term, $pcre)) { foreach ($_SDATA['terms'] as list($type, $term, $pcre)) {
// Regexp only for SQL use $slashTerm = addslashes($term);
$pterm = preg_quote(strtolower($term), '\'');
$pterm = strtr($pterm, $_RDATA['s_latin_pcre']);
switch ($type) { switch ($type) {
case 'filetype': case 'filetype':
@ -267,32 +265,32 @@ if ($_RDATA['s_searchable_pages']) {
break; break;
case 'exclude': case 'exclude':
$negs[] = '`content` NOT REGEXP \''.$pterm.'\''; $negs[] = 'INSTR(`content`, \''.$slashTerm.'\')=0';
$negs[] = '`url` NOT REGEXP \''.$pterm.'\''; $negs[] = 'INSTR(`url`, \''.$slashTerm.'\')=0';
$negs[] = '`title` NOT REGEXP \''.$pterm.'\''; $negs[] = 'INSTR(`title`, \''.$slashTerm.'\')=0';
$negs[] = '`description` NOT REGEXP \''.$pterm.'\''; $negs[] = 'INSTR(`description`, \''.$slashTerm.'\')=0';
$negs[] = '`keywords` NOT REGEXP \''.$pterm.'\''; $negs[] = 'INSTR(`keywords`, \''.$slashTerm.'\')=0';
$negs[] = '`weighted` NOT REGEXP \''.$pterm.'\''; $negs[] = 'INSTR(`weighted`, \''.$slashTerm.'\')=0';
break; break;
case 'phrase': case 'phrase':
$ands[] = '('.implode(' OR ', array( $ands[] = '('.implode(' OR ', array(
'`content` REGEXP \''.$pterm.'\'', 'INSTR(`content`, \''.$slashTerm.'\')',
'`url` REGEXP \''.$pterm.'\'', 'INSTR(`url`, \''.$slashTerm.'\')',
'`title` REGEXP \''.$pterm.'\'', 'INSTR(`title`, \''.$slashTerm.'\')',
'`description` REGEXP \''.$pterm.'\'', 'INSTR(`description`, \''.$slashTerm.'\')',
'`keywords` REGEXP \''.$pterm.'\'', 'INSTR(`keywords`, \''.$slashTerm.'\')',
'`weighted` REGEXP \''.$pterm.'\'' 'INSTR(`weighted`, \''.$slashTerm.'\')'
)).')'; )).')';
break; break;
case 'term': case 'term':
$ors[] = '`content` REGEXP \''.$pterm.'\''; $ors[] = 'INSTR(`content`, \''.$slashTerm.'\')';
$ors[] = '`url` REGEXP \''.$pterm.'\''; $ors[] = 'INSTR(`url`, \''.$slashTerm.'\')';
$ors[] = '`title` REGEXP \''.$pterm.'\''; $ors[] = 'INSTR(`title`, \''.$slashTerm.'\')';
$ors[] = '`description` REGEXP \''.$pterm.'\''; $ors[] = 'INSTR(`description`, \''.$slashTerm.'\')';
$ors[] = '`keywords` REGEXP \''.$pterm.'\''; $ors[] = 'INSTR(`keywords`, \''.$slashTerm.'\')';
$ors[] = '`weighted` REGEXP \''.$pterm.'\''; $ors[] = 'INSTR(`weighted`, \''.$slashTerm.'\')';
} }
} }