Punctuation normalization and MIME-type display
Disable the query log download button if the query log is empty. Improve database error resiliency further. Add many more punctuation normalization characters; normalize on search as well as on storage. Add a count of MIME-types to the Search Management UI.
This commit is contained in:
parent
e6777287d7
commit
635422b1d6
|
@ -81,21 +81,6 @@ if ($err[0] == '00000') {
|
|||
}
|
||||
} else $_SESSION['error'][] = 'Could not read search database status.';
|
||||
|
||||
// Search Database Charsets
|
||||
$charsets = $_DDATA['pdo']->query(
|
||||
'SELECT `content_charset`, COUNT(*) as `num`
|
||||
FROM `'.$_DDATA['tbprefix'].'crawldata`
|
||||
GROUP BY `content_charset` ORDER BY `num` DESC;'
|
||||
);
|
||||
$err = $charsets->errorInfo();
|
||||
if ($err[0] == '00000') {
|
||||
$charsets = $charsets->fetchAll();
|
||||
foreach ($charsets as $row) {
|
||||
if (!$row['content_charset']) $row['content_charset'] = '<none>';
|
||||
$_RDATA['s_crawldata_info']['Charsets'][$row['content_charset']] = $row['num'];
|
||||
}
|
||||
} else $_SESSION['error'][] = 'Could not read charset counts from search database.';
|
||||
|
||||
|
||||
// ***** Other runtime data
|
||||
$_RDATA['admin_pagination_options'] = array(25, 50, 100, 250, 500, 1000);
|
||||
|
@ -894,6 +879,12 @@ function os_preg_quote(str, delimiter) {
|
|||
|
||||
// ***** Variable Migration
|
||||
let os_rdata = {
|
||||
sp_smart: <?php
|
||||
echo json_encode(
|
||||
$_RDATA['sp_smart'],
|
||||
JSON_INVALID_UTF8_IGNORE
|
||||
);
|
||||
?>,
|
||||
s_latin: <?php
|
||||
echo json_encode(
|
||||
$_RDATA['s_latin'],
|
||||
|
@ -1108,26 +1099,28 @@ if (os_crawldata.length) {
|
|||
|
||||
// Prepare PCRE match text for each phrase and term
|
||||
let filetypes = [];
|
||||
for (let x = 0; x < os_sdata.terms.length; x++) {
|
||||
for (let x = 0, term; x < os_sdata.terms.length; x++) {
|
||||
|
||||
// Normalize punctuation: swap every typographic character listed in
// os_rdata.sp_smart for its plain replacement. split/join replaces ALL
// occurrences of the key; String.prototype.replace() with a string
// pattern would only replace the first one, leaving terms such as
// “it’s John’s” partially normalized.
Object.keys(os_rdata.sp_smart).forEach(key => {
  os_sdata.terms[x][1] = os_sdata.terms[x][1].split(key).join(os_rdata.sp_smart[key]);
});
|
||||
|
||||
switch (os_sdata.terms[x][0]) {
|
||||
case 'filetype':
|
||||
os_sdata.formatted.push(os_sdata.terms[x][0] + ':' + os_sdata.terms[x][1]);
|
||||
if (os_rdata.s_filetypes[os_sdata.terms[x][1].toUpperCase()])
|
||||
for (let z = 0; z < os_rdata.s_filetypes[os_sdata.terms[x][1].toUpperCase()].length; z++)
|
||||
filetypes.push(os_rdata.s_filetypes[os_sdata.terms[x][1].toUpperCase()][z]);
|
||||
if (os_rdata.s_filetypes[term.toUpperCase()])
|
||||
for (let z = 0; z < os_rdata.s_filetypes[term.toUpperCase()].length; z++)
|
||||
filetypes.push(os_rdata.s_filetypes[term.toUpperCase()][z]);
|
||||
break;
|
||||
|
||||
case 'exclude':
|
||||
os_sdata.formatted.push('-' + os_sdata.terms[x][1]);
|
||||
break;
|
||||
|
||||
case 'phrase':
|
||||
os_sdata.formatted.push('"' + os_sdata.terms[x][1] + '"');
|
||||
|
||||
case 'term':
|
||||
if (os_sdata.terms[x][0] == 'term')
|
||||
os_sdata.formatted.push(os_sdata.terms[x][1]);
|
||||
|
||||
// Regexp for later use pattern matching results
|
||||
os_sdata.terms[x][2] = os_preg_quote(os_sdata.terms[x][1].toLowerCase(), '/');
|
||||
Object.keys(os_rdata.s_latin).forEach(key => {
|
||||
for (let y = 0; y < os_rdata.s_latin[key].length; y++)
|
||||
|
@ -1622,6 +1615,36 @@ document.write(mustache.render(
|
|||
break;
|
||||
|
||||
case 'search':
|
||||
// Search Database Charsets
|
||||
$charsets = $_DDATA['pdo']->query(
|
||||
'SELECT `content_charset`, COUNT(*) as `num`
|
||||
FROM `'.$_DDATA['tbprefix'].'crawldata`
|
||||
GROUP BY `content_charset` ORDER BY `num` DESC;'
|
||||
);
|
||||
$err = $charsets->errorInfo();
|
||||
if ($err[0] == '00000') {
|
||||
$charsets = $charsets->fetchAll();
|
||||
foreach ($charsets as $row) {
|
||||
if (!$row['content_charset']) $row['content_charset'] = '<none>';
|
||||
$_RDATA['s_crawldata_info']['Charsets'][$row['content_charset']] = $row['num'];
|
||||
}
|
||||
} else $_SESSION['error'][] = 'Could not read charset counts from search database.';
|
||||
|
||||
// Search Database MIME-types
// Count the rows in the crawldata table for each `content_mime` value,
// most common first, and store the counts under
// $_RDATA['s_crawldata_info']['MIME-types']
$mimetypes = $_DDATA['pdo']->query(
  'SELECT `content_mime`, COUNT(*) as `num`
     FROM `'.$_DDATA['tbprefix'].'crawldata`
       GROUP BY `content_mime` ORDER BY `num` DESC;'
);
$err = $mimetypes->errorInfo();
if ($err[0] == '00000') {
  $mimetypes = $mimetypes->fetchAll();
  foreach ($mimetypes as $row) {
    // Rows with an empty MIME-type are grouped under a placeholder key
    if (!$row['content_mime']) $row['content_mime'] = '<none>';
    $_RDATA['s_crawldata_info']['MIME-types'][$row['content_mime']] = $row['num'];
  }

// Report the failure of THIS query; the message previously referred to
// charset counts, which belongs to the preceding charset query
} else $_SESSION['error'][] = 'Could not read MIME-type counts from search database.';
|
||||
|
||||
// Average hits per hour: First find the oldest `stamp` in the
|
||||
// database, then base all averages on the difference between that
|
||||
// time and now; also get average number of results
|
||||
|
@ -1662,7 +1685,7 @@ document.write(mustache.render(
|
|||
break;
|
||||
|
||||
case 'queries':
|
||||
$_RDATA['query_log_rows'] = false;
|
||||
$_RDATA['query_log_rows'] = array();
|
||||
$queries = $_DDATA['pdo']->query(
|
||||
'SELECT *, INET_NTOA(`ip`) AS `ipaddr`
|
||||
FROM `'.$_DDATA['tbprefix'].'query` AS `t`
|
||||
|
@ -2512,8 +2535,23 @@ document.write(mustache.render(
|
|||
</li>
|
||||
<li class="list-group-item">
|
||||
<label class="d-flex w-100">
|
||||
<strong class="pe-2">Page Encodings</strong>
|
||||
<ol class="list-group list-group-flush flex-grow-1" id="os_crawl_info_charsets"><?php
|
||||
<strong class="pe-2">MIME-types</strong>
|
||||
<ol class="list-group list-group-flush flex-grow-1"><?php
|
||||
foreach ($_RDATA['s_crawldata_info']['MIME-types'] as $mimetype => $value) { ?>
|
||||
<li class="list-group-item text-end p-0 border-0">
|
||||
<strong><?php echo htmlspecialchars($mimetype); ?>:</strong>
|
||||
<var title="<?php echo $value; ?> pages"><?php
|
||||
echo round(($value / $_RDATA['s_crawldata_info']['Rows']) * 100, 1);
|
||||
?>%</var>
|
||||
</li><?php
|
||||
} ?>
|
||||
</ol>
|
||||
</label>
|
||||
</li>
|
||||
<li class="list-group-item">
|
||||
<label class="d-flex w-100">
|
||||
<strong class="pe-2">Encodings</strong>
|
||||
<ol class="list-group list-group-flush flex-grow-1"><?php
|
||||
foreach ($_RDATA['s_crawldata_info']['Charsets'] as $encoding => $value) { ?>
|
||||
<li class="list-group-item text-end p-0 border-0">
|
||||
<strong><?php echo htmlspecialchars($encoding); ?>:</strong>
|
||||
|
@ -2925,10 +2963,10 @@ document.write(mustache.render(
|
|||
</header>
|
||||
<div class="col-6 col-xl-5 col-xxl-4 mb-2 text-end text-nowrap">
|
||||
<button type="button" class="btn btn-primary" id="os_query_log_download" title="Download Query Log"<?php
|
||||
if ($_ODATA['sp_crawling']) echo ' disabled="disabled"'; ?>>Download</button>
|
||||
if (!count($_RDATA['query_log_rows'])) echo ' disabled="disabled"'; ?>>Download</button>
|
||||
</div><?php
|
||||
|
||||
if (is_array($_RDATA['query_log_rows']) && count($_RDATA['query_log_rows'])) { ?>
|
||||
if (count($_RDATA['query_log_rows'])) { ?>
|
||||
<div class="col-xl-10 col-xxl-8">
|
||||
<div class="rounded-3 border border-1 border-secondary-subtle shadow border-bottom-0 mb-3 overflow-hidden">
|
||||
<table class="table table-striped w-100 mb-0">
|
||||
|
|
|
@ -307,8 +307,6 @@ function OS_getValue($columnName) {
|
|||
|
||||
/**
|
||||
* Initialize a generic cURL connection
|
||||
* - If creating a cURL connection fails, try using libcurlemu as a
|
||||
* backup option
|
||||
*
|
||||
*/
|
||||
function OS_getConnection() {
|
||||
|
@ -336,9 +334,17 @@ function OS_getConnection() {
|
|||
|
||||
|
||||
// ***** Pull the configuration data from the database
|
||||
$_ODATA = $_DDATA['pdo']->query(
|
||||
$odata = $_DDATA['pdo']->query(
|
||||
'SELECT * FROM `'.$_DDATA['tbprefix'].'config`;'
|
||||
)->fetchAll()[0];
|
||||
);
|
||||
$err = $odata->errorInfo();
|
||||
if ($err[0] == '00000') {
|
||||
$odata = $odata->fetchAll();
|
||||
if (count($odata)) {
|
||||
$_ODATA = $odata[0];
|
||||
} else throw new Exception('No data in configuration table');
|
||||
} else throw new Exception('Could not read from configuration table: '.$err[2]);
|
||||
|
||||
|
||||
ini_set('display_errors', 1);
|
||||
error_reporting(E_ALL);
|
||||
|
@ -585,7 +591,7 @@ $deleteold->execute(array('cutoff' => time() - $_ODATA['s_limit_query_log'] * 86
|
|||
$err = $deleteold->errorInfo();
|
||||
if ($err[0] != '00000')
|
||||
if (isset($_SESSION['error']))
|
||||
$_SESSION['error'][] = 'Database error purging old records from the query log.';
|
||||
$_SESSION['error'][] = 'Database error purging old records from the query log: '.$err[2];
|
||||
|
||||
|
||||
// Reduce search result cache size to within limits
|
||||
|
@ -636,13 +642,14 @@ if ($err[0] == '00000') {
|
|||
}
|
||||
|
||||
} else if (isset($_SESSION['error']))
|
||||
$_SESSION['error'][] = 'Could not read from search result cache.';
|
||||
$_SESSION['error'][] = 'Could not read from search result cache: '.$err[2];
|
||||
|
||||
}
|
||||
$_RDATA['s_cache_size'] = $cachesize[0]['size'];
|
||||
|
||||
} else if (isset($_SESSION['error']))
|
||||
$_SESSION['error'][] = 'Could not read search result cache size.';
|
||||
$_SESSION['error'][] = 'Could not read search result cache size: '.$err[2];
|
||||
|
||||
|
||||
// Get a list of all categories in the search database
|
||||
$_RDATA['s_category_list'] = array('<none>' => 0);
|
||||
|
@ -662,6 +669,7 @@ if ($err[0] == '00000') {
|
|||
} else if (isset($_SESSION['error']))
|
||||
$_SESSION['error'][] = 'Could not read categories from the search database.';
|
||||
|
||||
|
||||
// Count base URLs / domains from the crawldata: if there is only one
|
||||
// in the search database then we don't have to show it in a number of
|
||||
// places
|
||||
|
@ -677,10 +685,11 @@ if ($err[0] == '00000') {
|
|||
foreach ($domains as $domain)
|
||||
$_RDATA['s_crawldata_domains'][$domain['url_base']] = $domain['count'];
|
||||
} else if (isset($_SESSION['error']))
|
||||
$_SESSION['error'][] = 'Could not read domain count data from search database.';
|
||||
$_SESSION['error'][] = 'Could not read domain count data from search database: '.$err[2];
|
||||
if (count($_RDATA['s_crawldata_domains']) == 1)
|
||||
OS_setValue('jw_hostname', key($_RDATA['s_crawldata_domains']));
|
||||
|
||||
|
||||
// Count searchable pages
|
||||
$_RDATA['s_searchable_pages'] = 0;
|
||||
$query_status = ($_ODATA['s_show_orphans']) ? '(`status`=\'OK\' || `status`=\'Orphan\')' : '`status`=\'OK\'';
|
||||
|
@ -694,7 +703,7 @@ if ($err[0] == '00000') {
|
|||
$searchable = $searchable->fetchAll();
|
||||
$_RDATA['s_searchable_pages'] = $searchable[0]['count'];
|
||||
} else if (isset($_SESSION['error']))
|
||||
$_SESSION['error'][] = 'Could not read status data from search database.';
|
||||
$_SESSION['error'][] = 'Could not read status data from search database: '.$err[2];
|
||||
|
||||
|
||||
// Match Weighting Values
|
||||
|
@ -711,17 +720,36 @@ $_RDATA['s_weights'] = array(
|
|||
);
|
||||
|
||||
$_RDATA['sp_smart'] = array(
|
||||
'’' => '\'',
|
||||
'‘' => '\'',
|
||||
'“' => '"',
|
||||
'”' => '"',
|
||||
'‟' => '"',
|
||||
'„' => '"',
|
||||
'…' => '...',
|
||||
'·' => '•',
|
||||
'›' => '>',
|
||||
'‖' => '|'
|
||||
"\u{00AB}" => '"', "\u{00AD}" => '-', "\u{00B4}" => '\'', "\u{00B7}" => '•',
|
||||
"\u{00BB}" => '"', "\u{00F7}" => '/', "\u{01C0}" => '|', "\u{01C3}" => '!',
|
||||
"\u{02B9}" => '\'', "\u{02BA}" => '"', "\u{02BC}" => '\'', "\u{02C4}" => '^',
|
||||
"\u{02C6}" => '^', "\u{02C8}" => '\'', "\u{02CB}" => '`', "\u{02CD}" => '_',
|
||||
"\u{02DC}" => '~', "\u{0300}" => '`', "\u{0301}" => '\'', "\u{0302}" => '^',
|
||||
"\u{0303}" => '~', "\u{030B}" => '"', "\u{030E}" => '"', "\u{0331}" => '_',
|
||||
"\u{0332}" => '_', "\u{0338}" => '/', "\u{0589}" => ':', "\u{05C0}" => '|',
|
||||
"\u{05C3}" => ':', "\u{066A}" => '%', "\u{066D}" => '*', "\u{200B}" => ' ',
|
||||
"\u{2010}" => '-', "\u{2011}" => '-', "\u{2012}" => '-', "\u{2013}" => '-',
|
||||
"\u{2014}" => '-', "\u{2015}" => '-', "\u{2016}" => '|', "\u{2017}" => '_',
|
||||
"\u{2018}" => '\'', "\u{2019}" => '\'', "\u{201A}" => ',', "\u{201B}" => '\'',
|
||||
"\u{201C}" => '"', "\u{201D}" => '"', "\u{201E}" => '"', "\u{201F}" => '"',
|
||||
"\u{2024}" => '•', "\u{2025}" => '••', "\u{2026}" => '...', "\u{2027}" => '•',
|
||||
"\u{2032}" => '\'', "\u{2033}" => '"', "\u{2034}" => '\'', "\u{2035}" => '`',
|
||||
"\u{2036}" => '"', "\u{2037}" => '\'', "\u{2038}" => '^', "\u{2039}" => '<',
|
||||
"\u{203A}" => '>', "\u{203D}" => '?', "\u{2042}" => '*', "\u{2043}" => '•',
|
||||
"\u{2044}" => '/', "\u{2045}" => '[', "\u{2046}" => ']', "\u{2047}" => '??',
|
||||
"\u{2048}" => '?!', "\u{2049}" => '!?', "\u{204E}" => '*', "\u{204F}" => ';',
|
||||
"\u{2051}" => '**', "\u{2052}" => '%', "\u{2053}" => '~', "\u{2055}" => '*',
|
||||
"\u{2060}" => ' ', "\u{20E5}" => '\\', "\u{2212}" => '-', "\u{2215}" => '/',
|
||||
"\u{2216}" => '\\', "\u{2217}" => '*', "\u{2223}" => '|', "\u{2236}" => ':',
|
||||
"\u{223C}" => '~', "\u{2264}" => '<', "\u{2265}" => '>', "\u{2266}" => '<',
|
||||
"\u{2267}" => '>', "\u{2303}" => '^', "\u{2329}" => '<', "\u{232A}" => '>',
|
||||
"\u{266F}" => '#', "\u{2731}" => '*', "\u{2758}" => '|', "\u{2762}" => '!',
|
||||
"\u{27E6}" => '[', "\u{27E8}" => '<', "\u{27E9}" => '>', "\u{2983}" => '{',
|
||||
"\u{2984}" => '}', "\u{3003}" => '"', "\u{3008}" => '<', "\u{3009}" => '>',
|
||||
"\u{301B}" => ']', "\u{301C}" => '~', "\u{301D}" => '"', "\u{301E}" => '"',
|
||||
"\u{FEFF}" => ' '
|
||||
);
|
||||
|
||||
$_RDATA['s_latin'] = array(
|
||||
'center' => array('centre'),
|
||||
'color' => array('colour'),
|
||||
|
|
|
@ -165,6 +165,11 @@ if ($_RDATA['s_searchable_pages']) {
|
|||
|
||||
// Prepare PCRE match text for each phrase and term
|
||||
foreach ($_SDATA['terms'] as $key => list($type, $term, $pcre)) {
|
||||
|
||||
// Normalize punctuation
|
||||
$term = strtr($term, $_RDATA['sp_smart']);
|
||||
$_SDATA['terms'][$key][1] = $term;
|
||||
|
||||
switch ($type) {
|
||||
case 'filetype':
|
||||
$_SDATA['formatted'][] = $type.':'.$term;
|
||||
|
@ -182,7 +187,7 @@ if ($_RDATA['s_searchable_pages']) {
|
|||
$_SDATA['formatted'][] = $term;
|
||||
|
||||
// Regexp for later use pattern matching results
|
||||
$_SDATA['terms'][$key][2] = preg_quote(strtolower($term), '/');
|
||||
$_SDATA['terms'][$key][2] = preg_quote(strtolower($_SDATA['terms'][$key][1]), '/');
|
||||
$_SDATA['terms'][$key][2] = strtr($_SDATA['terms'][$key][2], $_RDATA['s_latin_pcre']);
|
||||
$_SDATA['terms'][$key][2] = '/('.$_SDATA['terms'][$key][2].')/iu';
|
||||
|
||||
|
|
Loading…
Reference in a new issue