From ba04173c29dfc75a2154878b034639c5c7923410 Mon Sep 17 00:00:00 2001 From: Brian Huisman Date: Wed, 26 Apr 2023 15:16:13 -0400 Subject: [PATCH] Daily updates Keep Page Index pagination page within limits; add UTF-8 BOM to CSV and TXT download output; use utf8mb4_unicode_520_ci collation to remove need for SQL REGEXP; add more latin accent equivalent characters. --- orcinus/admin.php | 49 ++++++++++++++++++++++++++++++++------------- orcinus/config.php | 11 +++++----- orcinus/crawler.php | 1 - orcinus/search.php | 40 ++++++++++++++++++------------------ 4 files changed, 60 insertions(+), 41 deletions(-) diff --git a/orcinus/admin.php b/orcinus/admin.php index cbd052e..7315666 100644 --- a/orcinus/admin.php +++ b/orcinus/admin.php @@ -9,7 +9,7 @@ require __DIR__.'/config.php'; /** - * Display a time since' HTML/Javascript counter + * Display a 'time since' HTML/Javascript counter * */ function OS_countUp($time) { @@ -49,11 +49,9 @@ function OS_countUp($time) { // ***** Load Maxmind GeoIP2 -if (!class_exists('GeoIp2\Database\Reader')) { - if (file_exists(__DIR__.'/geoip2/geoip2.phar')) { +if (!class_exists('GeoIp2\Database\Reader')) + if (file_exists(__DIR__.'/geoip2/geoip2.phar')) include __DIR__.'/geoip2/geoip2.phar'; - } -} if (class_exists('GeoIp2\Database\Reader')) { if (file_exists(__DIR__.'/geoip2/GeoLite2-Country.mmdb')) $_GEOIP2 = new GeoIp2\Database\Reader(__DIR__.'/geoip2/GeoLite2-Country.mmdb'); @@ -120,13 +118,15 @@ $_RDATA['index_status_list'] = array( if (empty($_SESSION['admin_page']) || empty($_RDATA['admin_pages'][$_SESSION['admin_page']])) $_SESSION['admin_page'] = 'crawler'; -if (empty($_SESSION['index_page'])) $_SESSION['index_page'] = 1; +if (!isset($_SESSION['index_page'])) $_SESSION['index_page'] = 1; if (empty($_SESSION['index_filter_category'])) $_SESSION['index_filter_category'] = ''; if (empty($_SESSION['index_filter_status'])) $_SESSION['index_filter_status'] = ''; if (empty($_SESSION['index_filter_text'])) $_SESSION['index_filter_text'] = ''; if (empty($_SESSION['admin_username'])) $_SESSION['admin_username'] = ''; if (!$_SESSION['admin_username']) { + + // If we are logging in if ($_SERVER['REQUEST_METHOD'] == 'POST') { if (!empty($_POST['os_submit']) && $_POST['os_submit'] == 'os_admin_login') { if (empty($_POST['os_admin_username'])) $_POST['os_admin_username'] = ''; @@ -203,6 +203,10 @@ if (!$_SESSION['admin_username']) { header('Content-disposition: attachment; filename="'. 'crawl-log'.$_POST->grep.'_'.date('Y-m-d', $_ODATA['sp_time_end']).'.txt"'); + // UTF-8 byte order mark + if (strtolower($_ODATA['s_charset']) == 'utf-8') + echo "\xEF\xBB\xBF"; + die(implode("\n", $lines)); } else { @@ -236,6 +240,10 @@ if (!$_SESSION['admin_username']) { $output = fopen('php://output', 'w'); + // UTF-8 byte order mark + if (strtolower($_ODATA['s_charset']) == 'utf-8') + fwrite($output, "\xEF\xBB\xBF"); + $headings = array('Query', 'Results', 'Time Stamp', 'IP'); if ($_GEOIP2) $headings[] = 'Country'; @@ -247,8 +255,8 @@ if (!$_SESSION['admin_username']) { try { $geo = $_GEOIP2->country($line['ipaddr']); } catch(Exception $e) { $geo = false; } - } else $geo = false; - if ($geo) $line['country'] = $geo->raw['country']['names']['en']; + $line['country'] = ($geo) ? $geo->raw['country']['names']['en'] : ''; + } fputcsv($output, $line); } @@ -1491,7 +1499,7 @@ document.write(mustache.render( $_SESSION['admin_page'] = $_GET['page']; // Select a new page within the Page Index list - } else if (!empty($_GET['ipage'])) { + } else if (isset($_GET['ipage'])) { $_GET['ipage'] = (int)$_GET['ipage']; $_SESSION['index_page'] = $_GET['ipage']; @@ -1586,7 +1594,15 @@ document.write(mustache.render( $_RDATA['page_index_found_rows'] = $foundRows[0][0]; $_RDATA['index_pages'] = ceil($_RDATA['page_index_found_rows'] / $_ODATA['admin_index_pagination']); - $_SESSION['index_page'] = max(1, min($_RDATA['index_pages'], (int)$_SESSION['index_page'])); + + // If the requested page is outside page limit + if ($_SESSION['index_page'] != 1 && ($_SESSION['index_page'] > $_RDATA['index_pages'] || $_SESSION['index_page'] < 1)) { + $_SESSION['index_page'] = max(1, min($_RDATA['index_pages'], (int)$_SESSION['index_page'])); + + // Redirect to a page within the limits + header('Location: '.$_SERVER['REQUEST_URI'].'?ipage='.$_SESSION['index_page']); + exit(); + } } else $_SESSION['error'][] = 'Database did not return a search table row count.'; } else $_SESSION['error'][] = 'Database error reading search table row count: '.$err[2]; @@ -2787,10 +2803,14 @@ document.write(mustache.render( +

+ The Output Encoding value should match the encoding of your search results page, and + ideally match the character encoding of most of your crawled pages. UTF-8 is strongly + recommended. +

@@ -3099,9 +3119,10 @@ document.write(mustache.render( * Not logged in; Show login page ****************************** */ } else { ?>
-
+

Welcome

+
errorInfo(); if ($err[0] != '00000') @@ -211,7 +211,7 @@ if (!in_array($_DDATA['tbprefix'].'crawldata', $_DDATA['tables'])) { `last_modified` INT NOT NULL, `priority` DECIMAL(2,1) NOT NULL, UNIQUE `content_checksum` (`content_checksum`) - ) ENGINE = MyISAM, CHARACTER SET = utf8mb4, COLLATE = utf8mb4_unicode_ci;' + ) ENGINE = MyISAM, CHARACTER SET = utf8mb4, COLLATE = utf8mb4_unicode_520_ci;' ); $err = $create->errorInfo(); if ($err[0] != '00000') @@ -227,7 +227,7 @@ if (!in_array($_DDATA['tbprefix'].'query', $_DDATA['tables'])) { `stamp` INT UNSIGNED NOT NULL, `ip` INT UNSIGNED NOT NULL, `cache` MEDIUMBLOB NOT NULL - ) ENGINE = MyISAM, CHARACTER SET = utf8mb4, COLLATE = utf8mb4_unicode_ci;' + ) ENGINE = MyISAM, CHARACTER SET = utf8mb4, COLLATE = utf8mb4_unicode_520_ci;' ); $err = $create->errorInfo(); if ($err[0] != '00000') @@ -725,10 +725,11 @@ $_RDATA['s_latin'] = array( 'color' => array('colour'), 'fiber' => array('fibre'), - 'ae' => array('æ', 'Æ'), - 'oe' => array('œ', 'Œ'), + 'ae' => array('æ', 'Æ', 'ä', 'Ä'), + 'oe' => array('œ', 'Œ', 'ö', 'Ö', 'ø', 'Ø'), 'ss' => array('ß'), 'th' => array('þ', 'Þ'), + 'ue' => array('ü', 'Ü'), 'a' => array('á', 'Á', 'à', 'À', 'â', 'Â', 'ä', 'Ä', 'ã', 'Ã', 'å', 'Å', 'ą', 'Ą', 'ă', 'Ă'), 'c' => array('ç', 'Ç', 'ć', 'Ć', 'č', 'Č'), 'd' => array('ð', 'Ð', 'ď', 'Ď', 'đ', 'Đ'), diff --git a/orcinus/crawler.php b/orcinus/crawler.php index 37b2511..623b285 100644 --- a/orcinus/crawler.php +++ b/orcinus/crawler.php @@ -1335,7 +1335,6 @@ while ($_cURL && count($_RDATA['sp_queue'])) { } else { $data['errno'] = 702; $data['error'] = 'PDF is empty of extractable text'; - $data['content'] = ''; $data['info']['noindex'] = 'empty-pdf'; } diff --git a/orcinus/search.php b/orcinus/search.php index 9304368..61947b5 100644 --- a/orcinus/search.php +++ b/orcinus/search.php @@ -255,9 +255,7 @@ if ($_RDATA['s_searchable_pages']) { $filetypes = array(); foreach ($_SDATA['terms'] as list($type, $term, $pcre)) { - // Regexp only for SQL use - $pterm = preg_quote(strtolower($term), '\''); - $pterm = strtr($pterm, $_RDATA['s_latin_pcre']); + $slashTerm = addslashes($term); switch ($type) { case 'filetype': @@ -267,32 +265,32 @@ if ($_RDATA['s_searchable_pages']) { break; case 'exclude': - $negs[] = '`content` NOT REGEXP \''.$pterm.'\''; - $negs[] = '`url` NOT REGEXP \''.$pterm.'\''; - $negs[] = '`title` NOT REGEXP \''.$pterm.'\''; - $negs[] = '`description` NOT REGEXP \''.$pterm.'\''; - $negs[] = '`keywords` NOT REGEXP \''.$pterm.'\''; - $negs[] = '`weighted` NOT REGEXP \''.$pterm.'\''; + $negs[] = 'INSTR(`content`, \''.$slashTerm.'\')=0'; + $negs[] = 'INSTR(`url`, \''.$slashTerm.'\')=0'; + $negs[] = 'INSTR(`title`, \''.$slashTerm.'\')=0'; + $negs[] = 'INSTR(`description`, \''.$slashTerm.'\')=0'; + $negs[] = 'INSTR(`keywords`, \''.$slashTerm.'\')=0'; + $negs[] = 'INSTR(`weighted`, \''.$slashTerm.'\')=0'; break; case 'phrase': $ands[] = '('.implode(' OR ', array( - '`content` REGEXP \''.$pterm.'\'', - '`url` REGEXP \''.$pterm.'\'', - '`title` REGEXP \''.$pterm.'\'', - '`description` REGEXP \''.$pterm.'\'', - '`keywords` REGEXP \''.$pterm.'\'', - '`weighted` REGEXP \''.$pterm.'\'' + 'INSTR(`content`, \''.$slashTerm.'\')', + 'INSTR(`url`, \''.$slashTerm.'\')', + 'INSTR(`title`, \''.$slashTerm.'\')', + 'INSTR(`description`, \''.$slashTerm.'\')', + 'INSTR(`keywords`, \''.$slashTerm.'\')', + 'INSTR(`weighted`, \''.$slashTerm.'\')' )).')'; break; case 'term': - $ors[] = '`content` REGEXP \''.$pterm.'\''; - $ors[] = '`url` REGEXP \''.$pterm.'\''; - $ors[] = '`title` REGEXP \''.$pterm.'\''; - $ors[] = '`description` REGEXP \''.$pterm.'\''; - $ors[] = '`keywords` REGEXP \''.$pterm.'\''; - $ors[] = '`weighted` REGEXP \''.$pterm.'\''; + $ors[] = 'INSTR(`content`, \''.$slashTerm.'\')'; + $ors[] = 'INSTR(`url`, \''.$slashTerm.'\')'; + $ors[] = 'INSTR(`title`, \''.$slashTerm.'\')'; + $ors[] = 'INSTR(`description`, \''.$slashTerm.'\')'; + $ors[] = 'INSTR(`keywords`, \''.$slashTerm.'\')'; + $ors[] = 'INSTR(`weighted`, \''.$slashTerm.'\')'; } }