Daily updates
Keep Page Index pagination page within limits; add UTF-8 BOM to CSV and TXT download output; use utf8mb4_unicode_520_ci collation to remove the need for SQL REGEXP; add more Latin accent-equivalent characters.
This commit is contained in:
parent
761491c21a
commit
ba04173c29
|
@ -9,7 +9,7 @@ require __DIR__.'/config.php';
|
|||
|
||||
|
||||
/**
|
||||
* Display a time since' HTML/Javascript counter
|
||||
* Display a 'time since' HTML/Javascript counter
|
||||
*
|
||||
*/
|
||||
function OS_countUp($time) {
|
||||
|
@ -49,11 +49,9 @@ function OS_countUp($time) {
|
|||
|
||||
|
||||
// ***** Load Maxmind GeoIP2
|
||||
if (!class_exists('GeoIp2\Database\Reader')) {
|
||||
if (file_exists(__DIR__.'/geoip2/geoip2.phar')) {
|
||||
if (!class_exists('GeoIp2\Database\Reader'))
|
||||
if (file_exists(__DIR__.'/geoip2/geoip2.phar'))
|
||||
include __DIR__.'/geoip2/geoip2.phar';
|
||||
}
|
||||
}
|
||||
if (class_exists('GeoIp2\Database\Reader')) {
|
||||
if (file_exists(__DIR__.'/geoip2/GeoLite2-Country.mmdb'))
|
||||
$_GEOIP2 = new GeoIp2\Database\Reader(__DIR__.'/geoip2/GeoLite2-Country.mmdb');
|
||||
|
@ -120,13 +118,15 @@ $_RDATA['index_status_list'] = array(
|
|||
if (empty($_SESSION['admin_page']) || empty($_RDATA['admin_pages'][$_SESSION['admin_page']]))
|
||||
$_SESSION['admin_page'] = 'crawler';
|
||||
|
||||
if (empty($_SESSION['index_page'])) $_SESSION['index_page'] = 1;
|
||||
if (!isset($_SESSION['index_page'])) $_SESSION['index_page'] = 1;
|
||||
if (empty($_SESSION['index_filter_category'])) $_SESSION['index_filter_category'] = '<none>';
|
||||
if (empty($_SESSION['index_filter_status'])) $_SESSION['index_filter_status'] = '<none>';
|
||||
if (empty($_SESSION['index_filter_text'])) $_SESSION['index_filter_text'] = '';
|
||||
if (empty($_SESSION['admin_username'])) $_SESSION['admin_username'] = '';
|
||||
|
||||
if (!$_SESSION['admin_username']) {
|
||||
|
||||
// If we are logging in
|
||||
if ($_SERVER['REQUEST_METHOD'] == 'POST') {
|
||||
if (!empty($_POST['os_submit']) && $_POST['os_submit'] == 'os_admin_login') {
|
||||
if (empty($_POST['os_admin_username'])) $_POST['os_admin_username'] = '';
|
||||
|
@ -203,6 +203,10 @@ if (!$_SESSION['admin_username']) {
|
|||
header('Content-disposition: attachment; filename="'.
|
||||
'crawl-log'.$_POST->grep.'_'.date('Y-m-d', $_ODATA['sp_time_end']).'.txt"');
|
||||
|
||||
// UTF-8 byte order mark
|
||||
if (strtolower($_ODATA['s_charset']) == 'utf-8')
|
||||
echo "\xEF\xBB\xBF";
|
||||
|
||||
die(implode("\n", $lines));
|
||||
|
||||
} else {
|
||||
|
@ -236,6 +240,10 @@ if (!$_SESSION['admin_username']) {
|
|||
|
||||
$output = fopen('php://output', 'w');
|
||||
|
||||
// UTF-8 byte order mark
|
||||
if (strtolower($_ODATA['s_charset']) == 'utf-8')
|
||||
fwrite($output, "\xEF\xBB\xBF");
|
||||
|
||||
$headings = array('Query', 'Results', 'Time Stamp', 'IP');
|
||||
if ($_GEOIP2) $headings[] = 'Country';
|
||||
|
||||
|
@ -247,8 +255,8 @@ if (!$_SESSION['admin_username']) {
|
|||
try {
|
||||
$geo = $_GEOIP2->country($line['ipaddr']);
|
||||
} catch(Exception $e) { $geo = false; }
|
||||
} else $geo = false;
|
||||
if ($geo) $line['country'] = $geo->raw['country']['names']['en'];
|
||||
$line['country'] = ($geo) ? $geo->raw['country']['names']['en'] : '';
|
||||
}
|
||||
|
||||
fputcsv($output, $line);
|
||||
}
|
||||
|
@ -1491,7 +1499,7 @@ document.write(mustache.render(
|
|||
$_SESSION['admin_page'] = $_GET['page'];
|
||||
|
||||
// Select a new page within the Page Index list
|
||||
} else if (!empty($_GET['ipage'])) {
|
||||
} else if (isset($_GET['ipage'])) {
|
||||
$_GET['ipage'] = (int)$_GET['ipage'];
|
||||
$_SESSION['index_page'] = $_GET['ipage'];
|
||||
|
||||
|
@ -1586,8 +1594,16 @@ document.write(mustache.render(
|
|||
$_RDATA['page_index_found_rows'] = $foundRows[0][0];
|
||||
|
||||
$_RDATA['index_pages'] = ceil($_RDATA['page_index_found_rows'] / $_ODATA['admin_index_pagination']);
|
||||
|
||||
// If the requested page is outside page limit
|
||||
if ($_SESSION['index_page'] != 1 && ($_SESSION['index_page'] > $_RDATA['index_pages'] || $_SESSION['index_page'] < 1)) {
|
||||
$_SESSION['index_page'] = max(1, min($_RDATA['index_pages'], (int)$_SESSION['index_page']));
|
||||
|
||||
// Redirect to a page within the limits
|
||||
header('Location: '.$_SERVER['REQUEST_URI'].'?ipage='.$_SESSION['index_page']);
|
||||
exit();
|
||||
}
|
||||
|
||||
} else $_SESSION['error'][] = 'Database did not return a search table row count.';
|
||||
} else $_SESSION['error'][] = 'Database error reading search table row count: '.$err[2];
|
||||
} else $_SESSION['error'][] = 'Database error reading search table: '.$err[2];
|
||||
|
@ -2787,10 +2803,14 @@ document.write(mustache.render(
|
|||
<label class="d-flex lh-lg w-100 mb-2">
|
||||
<strong class="pe-2">Output Encoding:</strong>
|
||||
<span class="flex-grow-1 text-end">
|
||||
<input type="text" name="os_s_charset" value="<?php echo $_ODATA['s_charset']; ?>" pattern="[\w\d-]*" class="form-control d-inline-block w-auto mw-10em"
|
||||
data-bs-toggle="tooltip" data-bs-placement="bottom" title="This value should match the encoding of your search results page, and ideally match the character encoding of most of your crawled pages.">
|
||||
<input type="text" name="os_s_charset" value="<?php echo $_ODATA['s_charset']; ?>" pattern="[\w\d-]*" class="form-control d-inline-block w-auto mw-10em" aria-labelledby="os_s_charset_text">
|
||||
</span>
|
||||
</label>
|
||||
<p id="os_s_charset_text" class="form-text">
|
||||
The <em>Output Encoding</em> value should match the encoding of your search results page, and
|
||||
ideally match the character encoding of most of your crawled pages. UTF-8 is strongly
|
||||
recommended.
|
||||
</p>
|
||||
<label class="d-flex lh-lg w-100 mb-2">
|
||||
<strong class="pe-2">Maximum Returned Results:</strong>
|
||||
<span class="flex-grow-1 text-end text-nowrap">
|
||||
|
@ -2809,7 +2829,7 @@ document.write(mustache.render(
|
|||
<strong class="pe-2">Maximum Matched Text (characters):</strong>
|
||||
<span class="flex-grow-1 text-end text-nowrap">
|
||||
<input type="number" name="os_s_limit_matchtext" value="<?php echo $_ODATA['s_limit_matchtext']; ?>" min="0" max="65535" step="1" class="form-control d-inline-block"
|
||||
data-bs-toggle="tooltip" data-bs-placement="top" title="Maximum amount of matched 'description' text to display beneath the title / URL of each search result.">
|
||||
data-bs-toggle="tooltip" data-bs-placement="top" title="Maximum amount of matched 'description' text to display beneath the heading of each search result.">
|
||||
</span>
|
||||
</label>
|
||||
<div class="row">
|
||||
|
@ -3099,9 +3119,10 @@ document.write(mustache.render(
|
|||
* Not logged in; Show login page ****************************** */
|
||||
} else { ?>
|
||||
<section class="row justify-content-center">
|
||||
<header class="col-12 mb-2">
|
||||
<header class="col-10 col-sm-8 col-md-6 col-lg-5 col-xl-4 mb-2">
|
||||
<h2>Welcome</h2>
|
||||
</header>
|
||||
<div class="w-100"></div>
|
||||
|
||||
<div class="col-10 col-sm-8 col-md-6 col-lg-5 col-xl-4">
|
||||
<form action="<?php echo $_SERVER['REQUEST_URI']; ?>" method="post"
|
||||
|
|
|
@ -101,7 +101,7 @@ if (!in_array($_DDATA['tbprefix'].'config', $_DDATA['tables'])) {
|
|||
`jw_hostname` TINYTEXT NOT NULL,
|
||||
`jw_compression` TINYINT UNSIGNED NOT NULL,
|
||||
PRIMARY KEY (`version`)
|
||||
) ENGINE = MyISAM, CHARACTER SET = utf8mb4, COLLATE = utf8mb4_unicode_ci;'
|
||||
) ENGINE = MyISAM, CHARACTER SET = utf8mb4, COLLATE = utf8mb4_unicode_520_ci;'
|
||||
);
|
||||
$err = $create->errorInfo();
|
||||
if ($err[0] != '00000')
|
||||
|
@ -211,7 +211,7 @@ if (!in_array($_DDATA['tbprefix'].'crawldata', $_DDATA['tables'])) {
|
|||
`last_modified` INT NOT NULL,
|
||||
`priority` DECIMAL(2,1) NOT NULL,
|
||||
UNIQUE `content_checksum` (`content_checksum`)
|
||||
) ENGINE = MyISAM, CHARACTER SET = utf8mb4, COLLATE = utf8mb4_unicode_ci;'
|
||||
) ENGINE = MyISAM, CHARACTER SET = utf8mb4, COLLATE = utf8mb4_unicode_520_ci;'
|
||||
);
|
||||
$err = $create->errorInfo();
|
||||
if ($err[0] != '00000')
|
||||
|
@ -227,7 +227,7 @@ if (!in_array($_DDATA['tbprefix'].'query', $_DDATA['tables'])) {
|
|||
`stamp` INT UNSIGNED NOT NULL,
|
||||
`ip` INT UNSIGNED NOT NULL,
|
||||
`cache` MEDIUMBLOB NOT NULL
|
||||
) ENGINE = MyISAM, CHARACTER SET = utf8mb4, COLLATE = utf8mb4_unicode_ci;'
|
||||
) ENGINE = MyISAM, CHARACTER SET = utf8mb4, COLLATE = utf8mb4_unicode_520_ci;'
|
||||
);
|
||||
$err = $create->errorInfo();
|
||||
if ($err[0] != '00000')
|
||||
|
@ -725,10 +725,11 @@ $_RDATA['s_latin'] = array(
|
|||
'color' => array('colour'),
|
||||
'fiber' => array('fibre'),
|
||||
|
||||
'ae' => array('æ', 'Æ'),
|
||||
'oe' => array('œ', 'Œ'),
|
||||
'ae' => array('æ', 'Æ', 'ä', 'Ä'),
|
||||
'oe' => array('œ', 'Œ', 'ö', 'Ö', 'ø', 'Ø'),
|
||||
'ss' => array('ß'),
|
||||
'th' => array('þ', 'Þ'),
|
||||
'ue' => array('ü', 'Ü'),
|
||||
'a' => array('á', 'Á', 'à', 'À', 'â', 'Â', 'ä', 'Ä', 'ã', 'Ã', 'å', 'Å', 'ą', 'Ą', 'ă', 'Ă'),
|
||||
'c' => array('ç', 'Ç', 'ć', 'Ć', 'č', 'Č'),
|
||||
'd' => array('ð', 'Ð', 'ď', 'Ď', 'đ', 'Đ'),
|
||||
|
|
|
@ -1335,7 +1335,6 @@ while ($_cURL && count($_RDATA['sp_queue'])) {
|
|||
} else {
|
||||
$data['errno'] = 702;
|
||||
$data['error'] = 'PDF is empty of extractable text';
|
||||
$data['content'] = '';
|
||||
$data['info']['noindex'] = 'empty-pdf';
|
||||
}
|
||||
|
||||
|
|
|
@ -255,9 +255,7 @@ if ($_RDATA['s_searchable_pages']) {
|
|||
$filetypes = array();
|
||||
foreach ($_SDATA['terms'] as list($type, $term, $pcre)) {
|
||||
|
||||
// Regexp only for SQL use
|
||||
$pterm = preg_quote(strtolower($term), '\'');
|
||||
$pterm = strtr($pterm, $_RDATA['s_latin_pcre']);
|
||||
$slashTerm = addslashes($term);
|
||||
|
||||
switch ($type) {
|
||||
case 'filetype':
|
||||
|
@ -267,32 +265,32 @@ if ($_RDATA['s_searchable_pages']) {
|
|||
break;
|
||||
|
||||
case 'exclude':
|
||||
$negs[] = '`content` NOT REGEXP \''.$pterm.'\'';
|
||||
$negs[] = '`url` NOT REGEXP \''.$pterm.'\'';
|
||||
$negs[] = '`title` NOT REGEXP \''.$pterm.'\'';
|
||||
$negs[] = '`description` NOT REGEXP \''.$pterm.'\'';
|
||||
$negs[] = '`keywords` NOT REGEXP \''.$pterm.'\'';
|
||||
$negs[] = '`weighted` NOT REGEXP \''.$pterm.'\'';
|
||||
$negs[] = 'INSTR(`content`, \''.$slashTerm.'\')=0';
|
||||
$negs[] = 'INSTR(`url`, \''.$slashTerm.'\')=0';
|
||||
$negs[] = 'INSTR(`title`, \''.$slashTerm.'\')=0';
|
||||
$negs[] = 'INSTR(`description`, \''.$slashTerm.'\')=0';
|
||||
$negs[] = 'INSTR(`keywords`, \''.$slashTerm.'\')=0';
|
||||
$negs[] = 'INSTR(`weighted`, \''.$slashTerm.'\')=0';
|
||||
break;
|
||||
|
||||
case 'phrase':
|
||||
$ands[] = '('.implode(' OR ', array(
|
||||
'`content` REGEXP \''.$pterm.'\'',
|
||||
'`url` REGEXP \''.$pterm.'\'',
|
||||
'`title` REGEXP \''.$pterm.'\'',
|
||||
'`description` REGEXP \''.$pterm.'\'',
|
||||
'`keywords` REGEXP \''.$pterm.'\'',
|
||||
'`weighted` REGEXP \''.$pterm.'\''
|
||||
'INSTR(`content`, \''.$slashTerm.'\')',
|
||||
'INSTR(`url`, \''.$slashTerm.'\')',
|
||||
'INSTR(`title`, \''.$slashTerm.'\')',
|
||||
'INSTR(`description`, \''.$slashTerm.'\')',
|
||||
'INSTR(`keywords`, \''.$slashTerm.'\')',
|
||||
'INSTR(`weighted`, \''.$slashTerm.'\')'
|
||||
)).')';
|
||||
break;
|
||||
|
||||
case 'term':
|
||||
$ors[] = '`content` REGEXP \''.$pterm.'\'';
|
||||
$ors[] = '`url` REGEXP \''.$pterm.'\'';
|
||||
$ors[] = '`title` REGEXP \''.$pterm.'\'';
|
||||
$ors[] = '`description` REGEXP \''.$pterm.'\'';
|
||||
$ors[] = '`keywords` REGEXP \''.$pterm.'\'';
|
||||
$ors[] = '`weighted` REGEXP \''.$pterm.'\'';
|
||||
$ors[] = 'INSTR(`content`, \''.$slashTerm.'\')';
|
||||
$ors[] = 'INSTR(`url`, \''.$slashTerm.'\')';
|
||||
$ors[] = 'INSTR(`title`, \''.$slashTerm.'\')';
|
||||
$ors[] = 'INSTR(`description`, \''.$slashTerm.'\')';
|
||||
$ors[] = 'INSTR(`keywords`, \''.$slashTerm.'\')';
|
||||
$ors[] = 'INSTR(`weighted`, \''.$slashTerm.'\')';
|
||||
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in a new issue