Daily updates

Keep the Page Index pagination page within limits; add a UTF-8 BOM to CSV and TXT download output; use the utf8mb4_unicode_520_ci collation to remove the need for SQL REGEXP; add more Latin accent-equivalent characters.
This commit is contained in:
Brian Huisman 2023-04-26 15:16:13 -04:00
parent 761491c21a
commit ba04173c29
4 changed files with 60 additions and 41 deletions

View file

@ -9,7 +9,7 @@ require __DIR__.'/config.php';
/**
* Display a time since' HTML/Javascript counter
* Display a 'time since' HTML/Javascript counter
*
*/
function OS_countUp($time) {
@ -49,11 +49,9 @@ function OS_countUp($time) {
// ***** Load Maxmind GeoIP2
if (!class_exists('GeoIp2\Database\Reader')) {
if (file_exists(__DIR__.'/geoip2/geoip2.phar')) {
if (!class_exists('GeoIp2\Database\Reader'))
if (file_exists(__DIR__.'/geoip2/geoip2.phar'))
include __DIR__.'/geoip2/geoip2.phar';
}
}
if (class_exists('GeoIp2\Database\Reader')) {
if (file_exists(__DIR__.'/geoip2/GeoLite2-Country.mmdb'))
$_GEOIP2 = new GeoIp2\Database\Reader(__DIR__.'/geoip2/GeoLite2-Country.mmdb');
@ -120,13 +118,15 @@ $_RDATA['index_status_list'] = array(
if (empty($_SESSION['admin_page']) || empty($_RDATA['admin_pages'][$_SESSION['admin_page']]))
$_SESSION['admin_page'] = 'crawler';
if (empty($_SESSION['index_page'])) $_SESSION['index_page'] = 1;
if (!isset($_SESSION['index_page'])) $_SESSION['index_page'] = 1;
if (empty($_SESSION['index_filter_category'])) $_SESSION['index_filter_category'] = '<none>';
if (empty($_SESSION['index_filter_status'])) $_SESSION['index_filter_status'] = '<none>';
if (empty($_SESSION['index_filter_text'])) $_SESSION['index_filter_text'] = '';
if (empty($_SESSION['admin_username'])) $_SESSION['admin_username'] = '';
if (!$_SESSION['admin_username']) {
// If we are logging in
if ($_SERVER['REQUEST_METHOD'] == 'POST') {
if (!empty($_POST['os_submit']) && $_POST['os_submit'] == 'os_admin_login') {
if (empty($_POST['os_admin_username'])) $_POST['os_admin_username'] = '';
@ -203,6 +203,10 @@ if (!$_SESSION['admin_username']) {
header('Content-disposition: attachment; filename="'.
'crawl-log'.$_POST->grep.'_'.date('Y-m-d', $_ODATA['sp_time_end']).'.txt"');
// UTF-8 byte order mark
if (strtolower($_ODATA['s_charset']) == 'utf-8')
echo "\xEF\xBB\xBF";
die(implode("\n", $lines));
} else {
@ -236,6 +240,10 @@ if (!$_SESSION['admin_username']) {
$output = fopen('php://output', 'w');
// UTF-8 byte order mark
if (strtolower($_ODATA['s_charset']) == 'utf-8')
fwrite($output, "\xEF\xBB\xBF");
$headings = array('Query', 'Results', 'Time Stamp', 'IP');
if ($_GEOIP2) $headings[] = 'Country';
@ -247,8 +255,8 @@ if (!$_SESSION['admin_username']) {
try {
$geo = $_GEOIP2->country($line['ipaddr']);
} catch(Exception $e) { $geo = false; }
} else $geo = false;
if ($geo) $line['country'] = $geo->raw['country']['names']['en'];
$line['country'] = ($geo) ? $geo->raw['country']['names']['en'] : '';
}
fputcsv($output, $line);
}
@ -1491,7 +1499,7 @@ document.write(mustache.render(
$_SESSION['admin_page'] = $_GET['page'];
// Select a new page within the Page Index list
} else if (!empty($_GET['ipage'])) {
} else if (isset($_GET['ipage'])) {
$_GET['ipage'] = (int)$_GET['ipage'];
$_SESSION['index_page'] = $_GET['ipage'];
@ -1586,7 +1594,15 @@ document.write(mustache.render(
$_RDATA['page_index_found_rows'] = $foundRows[0][0];
$_RDATA['index_pages'] = ceil($_RDATA['page_index_found_rows'] / $_ODATA['admin_index_pagination']);
$_SESSION['index_page'] = max(1, min($_RDATA['index_pages'], (int)$_SESSION['index_page']));
// If the requested page is outside page limit
if ($_SESSION['index_page'] != 1 && ($_SESSION['index_page'] > $_RDATA['index_pages'] || $_SESSION['index_page'] < 1)) {
  // Clamp the stored page number into the valid range [1, index_pages]
  $_SESSION['index_page'] = max(1, min($_RDATA['index_pages'], (int)$_SESSION['index_page']));

  // Redirect to a page within the limits.
  // REQUEST_URI may already carry a query string (typically the very
  // out-of-range "?ipage=N" that brought us here). Appending a second
  // "?ipage=" to it would produce "...?ipage=999?ipage=5", which PHP parses
  // as ipage="999?ipage=5" => (int)999, triggering this redirect again in an
  // endless loop. Rebuild the query string instead, replacing only 'ipage'
  // and keeping any other parameters intact.
  $uriParts = explode('?', $_SERVER['REQUEST_URI'], 2);
  $query = array();
  if (isset($uriParts[1])) parse_str($uriParts[1], $query);
  $query['ipage'] = $_SESSION['index_page'];
  header('Location: '.$uriParts[0].'?'.http_build_query($query));
  exit();
}
} else $_SESSION['error'][] = 'Database did not return a search table row count.';
} else $_SESSION['error'][] = 'Database error reading search table row count: '.$err[2];
@ -2787,10 +2803,14 @@ document.write(mustache.render(
<label class="d-flex lh-lg w-100 mb-2">
<strong class="pe-2">Output Encoding:</strong>
<span class="flex-grow-1 text-end">
<input type="text" name="os_s_charset" value="<?php echo $_ODATA['s_charset']; ?>" pattern="[\w\d-]*" class="form-control d-inline-block w-auto mw-10em"
data-bs-toggle="tooltip" data-bs-placement="bottom" title="This value should match the encoding of your search results page, and ideally match the character encoding of most of your crawled pages.">
<input type="text" name="os_s_charset" value="<?php echo $_ODATA['s_charset']; ?>" pattern="[\w\d-]*" class="form-control d-inline-block w-auto mw-10em" aria-labelledby="os_s_charset_text">
</span>
</label>
<p id="os_s_charset_text" class="form-text">
The <em>Output Encoding</em> value should match the encoding of your search results page, and
ideally match the character encoding of most of your crawled pages. UTF-8 is strongly
recommended.
</p>
<label class="d-flex lh-lg w-100 mb-2">
<strong class="pe-2">Maximum Returned Results:</strong>
<span class="flex-grow-1 text-end text-nowrap">
@ -2809,7 +2829,7 @@ document.write(mustache.render(
<strong class="pe-2">Maximum Matched Text (characters):</strong>
<span class="flex-grow-1 text-end text-nowrap">
<input type="number" name="os_s_limit_matchtext" value="<?php echo $_ODATA['s_limit_matchtext']; ?>" min="0" max="65535" step="1" class="form-control d-inline-block"
data-bs-toggle="tooltip" data-bs-placement="top" title="Maximum amount of matched 'description' text to display beneath the title / URL of each search result.">
data-bs-toggle="tooltip" data-bs-placement="top" title="Maximum amount of matched 'description' text to display beneath the heading of each search result.">
</span>
</label>
<div class="row">
@ -3099,9 +3119,10 @@ document.write(mustache.render(
* Not logged in; Show login page ****************************** */
} else { ?>
<section class="row justify-content-center">
<header class="col-12 mb-2">
<header class="col-10 col-sm-8 col-md-6 col-lg-5 col-xl-4 mb-2">
<h2>Welcome</h2>
</header>
<div class="w-100"></div>
<div class="col-10 col-sm-8 col-md-6 col-lg-5 col-xl-4">
<form action="<?php echo $_SERVER['REQUEST_URI']; ?>" method="post"

View file

@ -101,7 +101,7 @@ if (!in_array($_DDATA['tbprefix'].'config', $_DDATA['tables'])) {
`jw_hostname` TINYTEXT NOT NULL,
`jw_compression` TINYINT UNSIGNED NOT NULL,
PRIMARY KEY (`version`)
) ENGINE = MyISAM, CHARACTER SET = utf8mb4, COLLATE = utf8mb4_unicode_ci;'
) ENGINE = MyISAM, CHARACTER SET = utf8mb4, COLLATE = utf8mb4_unicode_520_ci;'
);
$err = $create->errorInfo();
if ($err[0] != '00000')
@ -211,7 +211,7 @@ if (!in_array($_DDATA['tbprefix'].'crawldata', $_DDATA['tables'])) {
`last_modified` INT NOT NULL,
`priority` DECIMAL(2,1) NOT NULL,
UNIQUE `content_checksum` (`content_checksum`)
) ENGINE = MyISAM, CHARACTER SET = utf8mb4, COLLATE = utf8mb4_unicode_ci;'
) ENGINE = MyISAM, CHARACTER SET = utf8mb4, COLLATE = utf8mb4_unicode_520_ci;'
);
$err = $create->errorInfo();
if ($err[0] != '00000')
@ -227,7 +227,7 @@ if (!in_array($_DDATA['tbprefix'].'query', $_DDATA['tables'])) {
`stamp` INT UNSIGNED NOT NULL,
`ip` INT UNSIGNED NOT NULL,
`cache` MEDIUMBLOB NOT NULL
) ENGINE = MyISAM, CHARACTER SET = utf8mb4, COLLATE = utf8mb4_unicode_ci;'
) ENGINE = MyISAM, CHARACTER SET = utf8mb4, COLLATE = utf8mb4_unicode_520_ci;'
);
$err = $create->errorInfo();
if ($err[0] != '00000')
@ -725,10 +725,11 @@ $_RDATA['s_latin'] = array(
'color' => array('colour'),
'fiber' => array('fibre'),
'ae' => array('æ', 'Æ'),
'oe' => array('œ', 'Œ'),
'ae' => array('æ', 'Æ', 'ä', 'Ä'),
'oe' => array('œ', 'Œ', 'ö', 'Ö', 'ø', 'Ø'),
'ss' => array('ß'),
'th' => array('þ', 'Þ'),
'ue' => array('ü', 'Ü'),
'a' => array('á', 'Á', 'à', 'À', 'â', 'Â', 'ä', 'Ä', 'ã', 'Ã', 'å', 'Å', 'ą', 'Ą', 'ă', 'Ă'),
'c' => array('ç', 'Ç', 'ć', 'Ć', 'č', 'Č'),
'd' => array('ð', 'Ð', 'ď', 'Ď', 'đ', 'Đ'),

View file

@ -1335,7 +1335,6 @@ while ($_cURL && count($_RDATA['sp_queue'])) {
} else {
$data['errno'] = 702;
$data['error'] = 'PDF is empty of extractable text';
$data['content'] = '';
$data['info']['noindex'] = 'empty-pdf';
}

View file

@ -255,9 +255,7 @@ if ($_RDATA['s_searchable_pages']) {
$filetypes = array();
foreach ($_SDATA['terms'] as list($type, $term, $pcre)) {
// Regexp only for SQL use
$pterm = preg_quote(strtolower($term), '\'');
$pterm = strtr($pterm, $_RDATA['s_latin_pcre']);
$slashTerm = addslashes($term);
switch ($type) {
case 'filetype':
@ -267,32 +265,32 @@ if ($_RDATA['s_searchable_pages']) {
break;
case 'exclude':
$negs[] = '`content` NOT REGEXP \''.$pterm.'\'';
$negs[] = '`url` NOT REGEXP \''.$pterm.'\'';
$negs[] = '`title` NOT REGEXP \''.$pterm.'\'';
$negs[] = '`description` NOT REGEXP \''.$pterm.'\'';
$negs[] = '`keywords` NOT REGEXP \''.$pterm.'\'';
$negs[] = '`weighted` NOT REGEXP \''.$pterm.'\'';
$negs[] = 'INSTR(`content`, \''.$slashTerm.'\')=0';
$negs[] = 'INSTR(`url`, \''.$slashTerm.'\')=0';
$negs[] = 'INSTR(`title`, \''.$slashTerm.'\')=0';
$negs[] = 'INSTR(`description`, \''.$slashTerm.'\')=0';
$negs[] = 'INSTR(`keywords`, \''.$slashTerm.'\')=0';
$negs[] = 'INSTR(`weighted`, \''.$slashTerm.'\')=0';
break;
case 'phrase':
$ands[] = '('.implode(' OR ', array(
'`content` REGEXP \''.$pterm.'\'',
'`url` REGEXP \''.$pterm.'\'',
'`title` REGEXP \''.$pterm.'\'',
'`description` REGEXP \''.$pterm.'\'',
'`keywords` REGEXP \''.$pterm.'\'',
'`weighted` REGEXP \''.$pterm.'\''
'INSTR(`content`, \''.$slashTerm.'\')',
'INSTR(`url`, \''.$slashTerm.'\')',
'INSTR(`title`, \''.$slashTerm.'\')',
'INSTR(`description`, \''.$slashTerm.'\')',
'INSTR(`keywords`, \''.$slashTerm.'\')',
'INSTR(`weighted`, \''.$slashTerm.'\')'
)).')';
break;
case 'term':
$ors[] = '`content` REGEXP \''.$pterm.'\'';
$ors[] = '`url` REGEXP \''.$pterm.'\'';
$ors[] = '`title` REGEXP \''.$pterm.'\'';
$ors[] = '`description` REGEXP \''.$pterm.'\'';
$ors[] = '`keywords` REGEXP \''.$pterm.'\'';
$ors[] = '`weighted` REGEXP \''.$pterm.'\'';
$ors[] = 'INSTR(`content`, \''.$slashTerm.'\')';
$ors[] = 'INSTR(`url`, \''.$slashTerm.'\')';
$ors[] = 'INSTR(`title`, \''.$slashTerm.'\')';
$ors[] = 'INSTR(`description`, \''.$slashTerm.'\')';
$ors[] = 'INSTR(`keywords`, \''.$slashTerm.'\')';
$ors[] = 'INSTR(`weighted`, \''.$slashTerm.'\')';
}
}