Punctuation normalization and MIME-type display

Disable Query log download button if query log is empty.
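Improve resilience to database errors: verify that the configuration table can be read, and append driver error details to database error messages.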
Add many more punctuation normalization characters; normalize on search as well as storage.
Add count of MIME-types in Search Management UI.
This commit is contained in:
Brian Huisman 2023-05-05 11:17:39 -04:00
parent e6777287d7
commit 635422b1d6
3 changed files with 120 additions and 49 deletions

View file

@ -81,21 +81,6 @@ if ($err[0] == '00000') {
}
} else $_SESSION['error'][] = 'Could not read search database status.';
// Search Database Charsets
// Count crawldata rows per `content_charset`, ordered by descending
// count, and record each count in
// $_RDATA['s_crawldata_info']['Charsets'] keyed by charset name
$charsets = $_DDATA['pdo']->query(
'SELECT `content_charset`, COUNT(*) as `num`
FROM `'.$_DDATA['tbprefix'].'crawldata`
GROUP BY `content_charset` ORDER BY `num` DESC;'
);
$err = $charsets->errorInfo();
// SQLSTATE '00000' means the statement completed without error
if ($err[0] == '00000') {
$charsets = $charsets->fetchAll();
foreach ($charsets as $row) {
// Rows with an empty charset value are listed under '<none>'
if (!$row['content_charset']) $row['content_charset'] = '<none>';
$_RDATA['s_crawldata_info']['Charsets'][$row['content_charset']] = $row['num'];
}
// On failure, queue a message in the session error list
} else $_SESSION['error'][] = 'Could not read charset counts from search database.';
// ***** Other runtime data
$_RDATA['admin_pagination_options'] = array(25, 50, 100, 250, 500, 1000);
@ -894,6 +879,12 @@ function os_preg_quote(str, delimiter) {
// ***** Variable Migration
let os_rdata = {
sp_smart: <?php
echo json_encode(
$_RDATA['sp_smart'],
JSON_INVALID_UTF8_IGNORE
);
?>,
s_latin: <?php
echo json_encode(
$_RDATA['s_latin'],
@ -1108,26 +1099,28 @@ if (os_crawldata.length) {
// Prepare PCRE match text for each phrase and term
let filetypes = [];
for (let x = 0; x < os_sdata.terms.length; x++) {
for (let x = 0, term; x < os_sdata.terms.length; x++) {
// Normalize punctuation
// String.prototype.replace() with a string pattern only swaps the FIRST
// match, so split/join is used to convert every occurrence of each
// mapped character
Object.keys(os_rdata.sp_smart).forEach(key => {
// An empty key would split the term into single characters; skip it
if (!key.length) return;
os_sdata.terms[x][1] = os_sdata.terms[x][1].split(key).join(os_rdata.sp_smart[key]);
});
// `term` is declared in the loop header but is not assigned anywhere
// else in the visible loop body; bind it to the normalized text so that
// code reading it (e.g. term.toUpperCase()) receives a string
term = os_sdata.terms[x][1];
switch (os_sdata.terms[x][0]) {
case 'filetype':
os_sdata.formatted.push(os_sdata.terms[x][0] + ':' + os_sdata.terms[x][1]);
if (os_rdata.s_filetypes[os_sdata.terms[x][1].toUpperCase()])
for (let z = 0; z < os_rdata.s_filetypes[os_sdata.terms[x][1].toUpperCase()].length; z++)
filetypes.push(os_rdata.s_filetypes[os_sdata.terms[x][1].toUpperCase()][z]);
if (os_rdata.s_filetypes[term.toUpperCase()])
for (let z = 0; z < os_rdata.s_filetypes[term.toUpperCase()].length; z++)
filetypes.push(os_rdata.s_filetypes[term.toUpperCase()][z]);
break;
case 'exclude':
os_sdata.formatted.push('-' + os_sdata.terms[x][1]);
break;
case 'phrase':
os_sdata.formatted.push('"' + os_sdata.terms[x][1] + '"');
case 'term':
if (os_sdata.terms[x][0] == 'term')
os_sdata.formatted.push(os_sdata.terms[x][1]);
// Regexp for later use pattern matching results
os_sdata.terms[x][2] = os_preg_quote(os_sdata.terms[x][1].toLowerCase(), '/');
Object.keys(os_rdata.s_latin).forEach(key => {
for (let y = 0; y < os_rdata.s_latin[key].length; y++)
@ -1622,6 +1615,36 @@ document.write(mustache.render(
break;
case 'search':
// Search Database Charsets
// Count crawldata rows per `content_charset`, ordered by descending
// count, and record each count in
// $_RDATA['s_crawldata_info']['Charsets'] keyed by charset name
$charsets = $_DDATA['pdo']->query(
'SELECT `content_charset`, COUNT(*) as `num`
FROM `'.$_DDATA['tbprefix'].'crawldata`
GROUP BY `content_charset` ORDER BY `num` DESC;'
);
$err = $charsets->errorInfo();
// SQLSTATE '00000' means the statement completed without error
if ($err[0] == '00000') {
$charsets = $charsets->fetchAll();
foreach ($charsets as $row) {
// Rows with an empty charset value are listed under '<none>'
if (!$row['content_charset']) $row['content_charset'] = '<none>';
$_RDATA['s_crawldata_info']['Charsets'][$row['content_charset']] = $row['num'];
}
// On failure, queue a message in the session error list
} else $_SESSION['error'][] = 'Could not read charset counts from search database.';
// Search Database MIME-types
// Count crawldata rows per `content_mime`, ordered by descending count,
// and record each count in $_RDATA['s_crawldata_info']['MIME-types']
// keyed by MIME-type
$mimetypes = $_DDATA['pdo']->query(
'SELECT `content_mime`, COUNT(*) as `num`
FROM `'.$_DDATA['tbprefix'].'crawldata`
GROUP BY `content_mime` ORDER BY `num` DESC;'
);
$err = $mimetypes->errorInfo();
// SQLSTATE '00000' means the statement completed without error
if ($err[0] == '00000') {
$mimetypes = $mimetypes->fetchAll();
foreach ($mimetypes as $row) {
// Rows with an empty MIME-type value are listed under '<none>'
if (!$row['content_mime']) $row['content_mime'] = '<none>';
$_RDATA['s_crawldata_info']['MIME-types'][$row['content_mime']] = $row['num'];
}
// The message names MIME-types so that this failure can be told apart
// from a failure of the charset count query
} else $_SESSION['error'][] = 'Could not read MIME-type counts from search database.';
// Average hits per hour: First find the oldest `stamp` in the
// database, then base all averages on the difference between that
// time and now; also get average number of results
@ -1662,7 +1685,7 @@ document.write(mustache.render(
break;
case 'queries':
$_RDATA['query_log_rows'] = false;
$_RDATA['query_log_rows'] = array();
$queries = $_DDATA['pdo']->query(
'SELECT *, INET_NTOA(`ip`) AS `ipaddr`
FROM `'.$_DDATA['tbprefix'].'query` AS `t`
@ -2512,8 +2535,23 @@ document.write(mustache.render(
</li>
<li class="list-group-item">
<label class="d-flex w-100">
<strong class="pe-2">Page Encodings</strong>
<ol class="list-group list-group-flush flex-grow-1" id="os_crawl_info_charsets"><?php
<strong class="pe-2">MIME-types</strong>
<ol class="list-group list-group-flush flex-grow-1"><?php
foreach ($_RDATA['s_crawldata_info']['MIME-types'] as $mimetype => $value) { ?>
<li class="list-group-item text-end p-0 border-0">
<strong><?php echo htmlspecialchars($mimetype); ?>:</strong>
<var title="<?php echo $value; ?> pages"><?php
echo round(($value / $_RDATA['s_crawldata_info']['Rows']) * 100, 1);
?>%</var>
</li><?php
} ?>
</ol>
</label>
</li>
<li class="list-group-item">
<label class="d-flex w-100">
<strong class="pe-2">Encodings</strong>
<ol class="list-group list-group-flush flex-grow-1"><?php
foreach ($_RDATA['s_crawldata_info']['Charsets'] as $encoding => $value) { ?>
<li class="list-group-item text-end p-0 border-0">
<strong><?php echo htmlspecialchars($encoding); ?>:</strong>
@ -2925,10 +2963,10 @@ document.write(mustache.render(
</header>
<div class="col-6 col-xl-5 col-xxl-4 mb-2 text-end text-nowrap">
<button type="button" class="btn btn-primary" id="os_query_log_download" title="Download Query Log"<?php
if ($_ODATA['sp_crawling']) echo ' disabled="disabled"'; ?>>Download</button>
if (!count($_RDATA['query_log_rows'])) echo ' disabled="disabled"'; ?>>Download</button>
</div><?php
if (is_array($_RDATA['query_log_rows']) && count($_RDATA['query_log_rows'])) { ?>
if (count($_RDATA['query_log_rows'])) { ?>
<div class="col-xl-10 col-xxl-8">
<div class="rounded-3 border border-1 border-secondary-subtle shadow border-bottom-0 mb-3 overflow-hidden">
<table class="table table-striped w-100 mb-0">

View file

@ -307,8 +307,6 @@ function OS_getValue($columnName) {
/**
* Initialize a generic cURL connection
* - If creating a cURL connection fails, try using libcurlemu as a
* backup option
*
*/
function OS_getConnection() {
@ -336,9 +334,17 @@ function OS_getConnection() {
// ***** Pull the configuration data from the database
$_ODATA = $_DDATA['pdo']->query(
$odata = $_DDATA['pdo']->query(
'SELECT * FROM `'.$_DDATA['tbprefix'].'config`;'
)->fetchAll()[0];
);
$err = $odata->errorInfo();
if ($err[0] == '00000') {
$odata = $odata->fetchAll();
if (count($odata)) {
$_ODATA = $odata[0];
} else throw new Exception('No data in configuration table');
} else throw new Exception('Could not read from configuration table: '.$err[2]);
ini_set('display_errors', 1);
error_reporting(E_ALL);
@ -585,7 +591,7 @@ $deleteold->execute(array('cutoff' => time() - $_ODATA['s_limit_query_log'] * 86
$err = $deleteold->errorInfo();
if ($err[0] != '00000')
if (isset($_SESSION['error']))
$_SESSION['error'][] = 'Database error purging old records from the query log.';
$_SESSION['error'][] = 'Database error purging old records from the query log: '.$err[2];
// Reduce search result cache size to within limits
@ -636,13 +642,14 @@ if ($err[0] == '00000') {
}
} else if (isset($_SESSION['error']))
$_SESSION['error'][] = 'Could not read from search result cache.';
$_SESSION['error'][] = 'Could not read from search result cache: '.$err[2];
}
$_RDATA['s_cache_size'] = $cachesize[0]['size'];
} else if (isset($_SESSION['error']))
$_SESSION['error'][] = 'Could not read search result cache size.';
$_SESSION['error'][] = 'Could not read search result cache size: '.$err[2];
// Get a list of all categories in the search database
$_RDATA['s_category_list'] = array('<none>' => 0);
@ -662,6 +669,7 @@ if ($err[0] == '00000') {
} else if (isset($_SESSION['error']))
$_SESSION['error'][] = 'Could not read categories from the search database.';
// Count base URLs / domains from the crawldata: if there is only one
// in the search database then we don't have to show it in a number of
// places
@ -677,10 +685,11 @@ if ($err[0] == '00000') {
foreach ($domains as $domain)
$_RDATA['s_crawldata_domains'][$domain['url_base']] = $domain['count'];
} else if (isset($_SESSION['error']))
$_SESSION['error'][] = 'Could not read domain count data from search database.';
$_SESSION['error'][] = 'Could not read domain count data from search database: '.$err[2];
if (count($_RDATA['s_crawldata_domains']) == 1)
OS_setValue('jw_hostname', key($_RDATA['s_crawldata_domains']));
// Count searchable pages
$_RDATA['s_searchable_pages'] = 0;
$query_status = ($_ODATA['s_show_orphans']) ? '(`status`=\'OK\' || `status`=\'Orphan\')' : '`status`=\'OK\'';
@ -694,7 +703,7 @@ if ($err[0] == '00000') {
$searchable = $searchable->fetchAll();
$_RDATA['s_searchable_pages'] = $searchable[0]['count'];
} else if (isset($_SESSION['error']))
$_SESSION['error'][] = 'Could not read status data from search database.';
$_SESSION['error'][] = 'Could not read status data from search database: '.$err[2];
// Match Weighting Values
@ -711,17 +720,36 @@ $_RDATA['s_weights'] = array(
);
$_RDATA['sp_smart'] = array(
'' => '\'',
'' => '\'',
'“' => '"',
'”' => '"',
'‟' => '"',
'„' => '"',
'…' => '...',
'·' => '•',
'' => '>',
'‖' => '|'
"\u{00AB}" => '"', "\u{00AD}" => '-', "\u{00B4}" => '\'', "\u{00B7}" => '•',
"\u{00BB}" => '"', "\u{00F7}" => '/', "\u{01C0}" => '|', "\u{01C3}" => '!',
"\u{02B9}" => '\'', "\u{02BA}" => '"', "\u{02BC}" => '\'', "\u{02C4}" => '^',
"\u{02C6}" => '^', "\u{02C8}" => '\'', "\u{02CB}" => '`', "\u{02CD}" => '_',
"\u{02DC}" => '~', "\u{0300}" => '`', "\u{0301}" => '\'', "\u{0302}" => '^',
"\u{0303}" => '~', "\u{030B}" => '"', "\u{030E}" => '"', "\u{0331}" => '_',
"\u{0332}" => '_', "\u{0338}" => '/', "\u{0589}" => ':', "\u{05C0}" => '|',
"\u{05C3}" => ':', "\u{066A}" => '%', "\u{066D}" => '*', "\u{200B}" => ' ',
"\u{2010}" => '-', "\u{2011}" => '-', "\u{2012}" => '-', "\u{2013}" => '-',
"\u{2014}" => '-', "\u{2015}" => '-', "\u{2016}" => '|', "\u{2017}" => '_',
"\u{2018}" => '\'', "\u{2019}" => '\'', "\u{201A}" => ',', "\u{201B}" => '\'',
"\u{201C}" => '"', "\u{201D}" => '"', "\u{201E}" => '"', "\u{201F}" => '"',
"\u{2024}" => '•', "\u{2025}" => '••', "\u{2026}" => '...', "\u{2027}" => '•',
"\u{2032}" => '\'', "\u{2033}" => '"', "\u{2034}" => '\'', "\u{2035}" => '`',
"\u{2036}" => '"', "\u{2037}" => '\'', "\u{2038}" => '^', "\u{2039}" => '<',
"\u{203A}" => '>', "\u{203D}" => '?', "\u{2042}" => '*', "\u{2043}" => '•',
"\u{2044}" => '/', "\u{2045}" => '[', "\u{2046}" => ']', "\u{2047}" => '??',
"\u{2048}" => '?!', "\u{2049}" => '!?', "\u{204E}" => '*', "\u{204F}" => ';',
"\u{2051}" => '**', "\u{2052}" => '%', "\u{2053}" => '~', "\u{2055}" => '*',
"\u{2060}" => ' ', "\u{20E5}" => '\\', "\u{2212}" => '-', "\u{2215}" => '/',
"\u{2216}" => '\\', "\u{2217}" => '*', "\u{2223}" => '|', "\u{2236}" => ':',
"\u{223C}" => '~', "\u{2264}" => '<', "\u{2265}" => '>', "\u{2266}" => '<',
"\u{2267}" => '>', "\u{2303}" => '^', "\u{2329}" => '<', "\u{232A}" => '>',
"\u{266F}" => '#', "\u{2731}" => '*', "\u{2758}" => '|', "\u{2762}" => '!',
"\u{27E6}" => '[', "\u{27E8}" => '<', "\u{27E9}" => '>', "\u{2983}" => '{',
"\u{2984}" => '}', "\u{3003}" => '"', "\u{3008}" => '<', "\u{3009}" => '>',
"\u{301B}" => ']', "\u{301C}" => '~', "\u{301D}" => '"', "\u{301E}" => '"',
"\u{FEFF}" => ' '
);
$_RDATA['s_latin'] = array(
'center' => array('centre'),
'color' => array('colour'),

View file

@ -165,6 +165,11 @@ if ($_RDATA['s_searchable_pages']) {
// Prepare PCRE match text for each phrase and term
foreach ($_SDATA['terms'] as $key => list($type, $term, $pcre)) {
// Normalize punctuation
$term = strtr($term, $_RDATA['sp_smart']);
$_SDATA['terms'][$key][1] = $term;
switch ($type) {
case 'filetype':
$_SDATA['formatted'][] = $type.':'.$term;
@ -182,7 +187,7 @@ if ($_RDATA['s_searchable_pages']) {
$_SDATA['formatted'][] = $term;
// Regexp for later use pattern matching results
$_SDATA['terms'][$key][2] = preg_quote(strtolower($term), '/');
$_SDATA['terms'][$key][2] = preg_quote(strtolower($_SDATA['terms'][$key][1]), '/');
$_SDATA['terms'][$key][2] = strtr($_SDATA['terms'][$key][2], $_RDATA['s_latin_pcre']);
$_SDATA['terms'][$key][2] = '/('.$_SDATA['terms'][$key][2].')/iu';