Enable downloading Page Index

Allow the page index to be downloaded as a CSV.
Remove the unnecessary database columns url_base and status_noindex.
Store the list of domains at crawl time so they don't need to be queried on every page load; a fresh reinstall is required because of this change (see the sketch of the new domain caching below).
Brian Huisman 2023-05-12 10:06:57 -04:00
parent bab4a7e2c5
commit 4bb28031b6
5 changed files with 130 additions and 48 deletions
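
The domain caching works by reducing every stored URL to its scheme://host prefix, counting the occurrences, and saving the counts as JSON in the new sp_domains config value, which later page loads can json_decode() instead of re-querying the crawldata table. A minimal standalone sketch of that logic follows; the DSN, credentials and os_ table prefix are placeholder assumptions standing in for the $_DDATA array used throughout the diff.

<?php
// Minimal sketch of the sp_domains caching added by this commit.
// The DSN, credentials and 'os_' prefix are placeholder assumptions;
// the real code reads its connection from $_DDATA and stores the
// result with OS_setValue('sp_domains', ...).
$pdo      = new PDO('mysql:host=localhost;dbname=orcinus;charset=utf8mb4', 'user', 'pass');
$tbprefix = 'os_';

$domains = array();
$urls = $pdo->query('SELECT `url` FROM `'.$tbprefix.'crawldata`;');
foreach ($urls->fetchAll(PDO::FETCH_ASSOC) as $row) {
  // Reduce each page URL to its scheme://host prefix
  $parts = parse_url($row['url']);
  if (is_array($parts) && isset($parts['scheme'], $parts['host'])) {
    $domain = $parts['scheme'].'://'.$parts['host'];
    if (!isset($domains[$domain])) {
      $domains[$domain] = 1;
    } else $domains[$domain]++;
  }
}

// Cache the per-domain page counts as JSON for later page loads
$sp_domains = json_encode($domains);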


@ -146,7 +146,7 @@ if (!$_SESSION['admin_username']) {
if (empty($_POST->action)) $_POST->action = '';
switch ($_POST->action) {
// Set the key for initiating the crawler
// ***** Set the key for initiating the crawler
case 'setkey':
if (!$_ODATA['sp_crawling']) {
$md5 = md5(hrtime(true));
@ -167,10 +167,12 @@ if (!$_SESSION['admin_username']) {
}
break;
// Download a text file of the most recent crawl or query log
// ***** Download a text or csv file
case 'download':
if (empty($_POST->content)) $_POST->content = '';
switch ($_POST->content) {
// Download a text file of the latest crawl log
case 'crawl_log':
if (!$_ODATA['sp_crawling']) {
if ($_ODATA['sp_time_end']) {
@ -209,16 +211,72 @@ if (!$_SESSION['admin_username']) {
}
break;
// Download a csv of the unfiltered page index
case 'page_index':
$pageIndex = $_DDATA['pdo']->query(
'SELECT `url`, `category`, `content_mime`, `content_charset`,
`status`, `flag_unlisted`, `last_modified`, `priority`
FROM `'.$_DDATA['tbprefix'].'crawldata` ORDER BY `url_sort`;');
$err = $pageIndex->errorInfo();
if ($err[0] == '00000') {
$pageIndex = $pageIndex->fetchAll();
if (count($pageIndex)) {
header('Content-type: text/csv; charset='.strtolower($_ODATA['s_charset']));
header('Content-disposition: attachment; filename="'.
'page-index_'.date('Y-m-d').'.csv"');
$output = fopen('php://output', 'w');
// UTF-8 byte order mark
if (strtolower($_ODATA['s_charset']) == 'utf-8')
fwrite($output, "\xEF\xBB\xBF");
$headings = array(
'URL', 'Category', 'MIME Type', 'Character Encoding',
'Status', 'Last Modified', 'Priority'
);
fputcsv($output, $headings);
foreach ($pageIndex as $line) {
if ($line['flag_unlisted'])
$line['status'] .= ' (Unlisted)';
unset($line['flag_unlisted']);
$line['last_modified'] = date('c', $line['last_modified']);
fputcsv($output, $line);
}
fclose($output);
die();
} else {
$response = array(
'status' => 'Error',
'message' => 'The page index is empty; nothing to download'
);
}
} else {
$response = array(
'status' => 'Error',
'message' => 'Could not read the page index database'
);
}
break;
// Download a csv of the complete query log
case 'query_log':
$querylog = $_DDATA['pdo']->query(
$queryLog = $_DDATA['pdo']->query(
'SELECT `query`, `results`, `stamp`, INET_NTOA(`ip`) AS `ipaddr`
FROM `'.$_DDATA['tbprefix'].'query` ORDER BY `stamp` DESC;'
);
$err = $querylog->errorInfo();
$err = $queryLog->errorInfo();
if ($err[0] == '00000') {
$querylog = $querylog->fetchAll();
if (count($querylog)) {
$queryLog = $queryLog->fetchAll();
if (count($queryLog)) {
header('Content-type: text/csv; charset='.strtolower($_ODATA['s_charset']));
header('Content-disposition: attachment; filename="'.
@ -232,9 +290,9 @@ if (!$_SESSION['admin_username']) {
$headings = array('Query', 'Results', 'Time Stamp', 'IP');
if ($_GEOIP2) $headings[] = 'Country';
fputcsv($output, $headings);
foreach ($querylog as $line) {
foreach ($queryLog as $line) {
$line['stamp'] = date('c', $line['stamp']);
if ($_GEOIP2) {
@ -274,7 +332,7 @@ if (!$_SESSION['admin_username']) {
break;
// Not used?
// ***** Not used?
case 'fetch':
if (empty($_POST->value)) $_POST->value = '';
if (!empty($_ODATA[$_POST->value])) {
@ -553,6 +611,26 @@ if (!$_SESSION['admin_username']) {
break;
}
}
// Refresh the sp_domains data since we deleted some rows
$_RDATA['sp_domains'] = array();
$urls = $_DDATA['pdo']->query(
'SELECT `url` FROM `'.$_DDATA['tbprefix'].'crawldata`;'
);
$err = $urls->errorInfo();
if ($err[0] == '00000') {
$urls = $urls->fetchAll();
foreach ($urls as $url) {
$url = parse_url($url['url']);
if (is_array($url)) {
$domain = $url['scheme'].'://'.$url['host'];
if (!isset($_RDATA['sp_domains'][$domain])) {
$_RDATA['sp_domains'][$domain] = 1;
} else $_RDATA['sp_domains'][$domain]++;
}
}
OS_setValue('sp_domains', json_encode($_RDATA['sp_domains']));
} else $_SESSION['error'][] = 'Could not read domain count data from search database: '.$err[2];
break;
case 'category':
@ -803,7 +881,7 @@ if (!$_SESSION['admin_username']) {
`content_mime`, `weighted`, `content`, `priority`
FROM `'.$_DDATA['tbprefix'].'crawldata`
WHERE `flag_unlisted`<>1 AND '.$query_status.' AND
`url_base` LIKE \'%'.addslashes($_ODATA['jw_hostname']).'\';'
`url` LIKE \'%://'.addslashes($_ODATA['jw_hostname']).'/%\';'
);
$err = $select->errorInfo();
if ($err[0] == '00000') {
@ -1544,8 +1622,8 @@ document.write(mustache.render(
// ***** Select rows to populate the Page Index table
$indexRows = $_DDATA['pdo']->prepare(
'SELECT SQL_CALC_FOUND_ROWS
`url`, `url_base`, `title`, `category`, `content_checksum`, `status`,
`status_noindex`, `flag_unlisted`, `flag_updated`, `priority`
`url`, `title`, `category`, `content_checksum`,
`status`, `flag_unlisted`, `flag_updated`, `priority`
FROM `'.$_DDATA['tbprefix'].'crawldata`
WHERE (:text1=\'\' OR `url` LIKE :text2) AND
(:category1=\'\' OR `category`=:category2) AND
@ -2232,9 +2310,13 @@ document.write(mustache.render(
* Page Index ********************************************** */
case 'index': ?>
<section class="row justify-content-center">
<header class="col-xl-10 col-xxl-8 mb-2">
<header class="col-6 col-xl-5 col-xxl-4 mb-2">
<h2>Page Index</h2>
</header><?php
</header>
<div class="col-6 col-xl-5 col-xxl-4 mb-2 text-end text-nowrap">
<button type="button" class="btn btn-primary" id="os_page_index_download" title="Download Page Index"<?php
if (!$_RDATA['s_crawldata_info']['Rows']) echo ' disabled="disabled"'; ?>>Download</button>
</div><?php
// If there are *any* rows in the database
if ($_RDATA['s_crawldata_info']['Rows']) {
@ -2388,8 +2470,8 @@ document.write(mustache.render(
<tr><?php echo $_RDATA['index_action_row']; ?></tr>
</tfoot>
<tbody class="table-group-divider"><?php
if (count($_RDATA['s_crawldata_domains']) == 1)
$repStr = '/^'.preg_quote(key($_RDATA['s_crawldata_domains']), '/').'/';
if (count($_RDATA['sp_domains']) == 1)
$repStr = '/^'.preg_quote(key($_RDATA['sp_domains']), '/').'/';
foreach ($_RDATA['page_index_rows'] as $key => $row) { ?>
<tr class="lh-sm">
@ -2403,7 +2485,7 @@ document.write(mustache.render(
<a href="<?php echo htmlspecialchars($row['url']); ?>" title="<?php
echo htmlspecialchars($row['url']); ?>" target="_blank" class="align-middle<?php
if ($row['flag_updated']) echo ' fw-bold'; ?>"><?php
if (count($_RDATA['s_crawldata_domains']) == 1) {
if (count($_RDATA['sp_domains']) == 1) {
echo htmlspecialchars(preg_replace($repStr, '', $row['url']));
} else echo htmlspecialchars($row['url']);
?></a><?php
@ -2655,13 +2737,13 @@ document.write(mustache.render(
</legend>
<div class="p-2 border border-1 border-secondary-subtle rounded-bottom-3">
<ul class="list-group mb-2"><?php
if (count($_RDATA['s_crawldata_domains']) > 1) { ?>
if (count($_RDATA['sp_domains']) > 1) { ?>
<li class="list-group-item">
<label class="d-flex lh-lg w-100">
<strong class="pe-2">Domain:</strong>
<span class="text-end flex-grow-1 text-nowrap">
<select name="os_jw_hostname" class="form-select d-inline-block"><?php
foreach ($_RDATA['s_crawldata_domains'] as $domain => $count) { ?>
foreach ($_RDATA['sp_domains'] as $domain => $count) { ?>
<option value="<?php echo $domain; ?>"<?php
if ($_ODATA['jw_hostname'] == $domain) echo ' selected="selected"'; ?>><?php
echo $domain, ' (', $count, ')';
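
The page-index download added above follows the standard PHP pattern for streaming a CSV to the browser: send the Content-Type and Content-Disposition headers, open php://output, optionally write a UTF-8 byte order mark for spreadsheet compatibility, then fputcsv() one row at a time. A stripped-down sketch of that pattern, with a hard-coded sample row standing in for the crawldata query:

<?php
// Stripped-down sketch of the CSV streaming pattern used by the new
// 'page_index' download branch. The data row is a made-up placeholder;
// the real code fetches its rows from the crawldata table.
header('Content-type: text/csv; charset=utf-8');
header('Content-disposition: attachment; filename="page-index_'.date('Y-m-d').'.csv"');

$output = fopen('php://output', 'w');
fwrite($output, "\xEF\xBB\xBF"); // UTF-8 byte order mark

fputcsv($output, array('URL', 'Category', 'MIME Type', 'Character Encoding',
                       'Status', 'Last Modified', 'Priority'));
fputcsv($output, array('https://www.example.com/', 'General', 'text/html', 'utf-8',
                       'OK', date('c'), '0.5'));
fclose($output);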


@ -93,6 +93,7 @@ if (!in_array($_DDATA['tbprefix'].'config', $_DDATA['tables'])) {
`sp_data_stored` INT UNSIGNED NOT NULL,
`sp_links_crawled` SMALLINT UNSIGNED NOT NULL,
`sp_pages_stored` SMALLINT UNSIGNED NOT NULL,
`sp_domains` TEXT NOT NULL,
`sp_autodelete` BOOLEAN NOT NULL,
`sp_ifmodifiedsince` BOOLEAN NOT NULL,
`sp_cookies` BOOLEAN NOT NULL,
@ -173,6 +174,7 @@ if (!count($testConf->fetchAll())) {
`sp_data_stored`=0,
`sp_links_crawled`=0,
`sp_pages_stored`=0,
`sp_domains`=\'\',
`sp_autodelete`=0,
`sp_ifmodifiedsince`=1,
`sp_cookies`=1,
@ -214,7 +216,6 @@ if (!in_array($_DDATA['tbprefix'].'crawldata', $_DDATA['tables'])) {
$create = $_DDATA['pdo']->query(
'CREATE TABLE `'.$_DDATA['tbprefix'].'crawldata` (
`url` TEXT NOT NULL,
`url_base` TINYTEXT NOT NULL,
`url_sort` SMALLINT UNSIGNED NOT NULL,
`title` TEXT NOT NULL,
`description` TEXT NOT NULL,
@ -227,7 +228,6 @@ if (!in_array($_DDATA['tbprefix'].'crawldata', $_DDATA['tables'])) {
`content_charset` TINYTEXT NOT NULL,
`content_checksum` BINARY(20) NOT NULL,
`status` TINYTEXT NOT NULL,
`status_noindex` TINYTEXT NOT NULL,
`flag_unlisted` BOOLEAN NOT NULL,
`flag_updated` BOOLEAN NOT NULL,
`last_modified` INT NOT NULL,
@ -690,26 +690,6 @@ if ($err[0] == '00000') {
$_SESSION['error'][] = 'Could not read categories from the search database.';
// Count base URLs / domains from the crawldata: if there is only one
// in the search database then we don't have to show it in a number of
// places
$_RDATA['s_crawldata_domains'] = array();
$domains = $_DDATA['pdo']->query(
'SELECT `url_base`, COUNT(`url_base`) as `count`
FROM `'.$_DDATA['tbprefix'].'crawldata`
GROUP BY `url_base` ORDER BY `count` DESC;'
);
$err = $domains->errorInfo();
if ($err[0] == '00000') {
$domains = $domains->fetchAll();
foreach ($domains as $domain)
$_RDATA['s_crawldata_domains'][$domain['url_base']] = $domain['count'];
} else if (isset($_SESSION['error']))
$_SESSION['error'][] = 'Could not read domain count data from search database: '.$err[2];
if (count($_RDATA['s_crawldata_domains']) == 1)
OS_setValue('jw_hostname', key($_RDATA['s_crawldata_domains']));
// Count searchable pages
$_RDATA['s_searchable_pages'] = 0;
$query_status = ($_ODATA['s_show_orphans']) ? '(`status`=\'OK\' || `status`=\'Orphan\')' : '`status`=\'OK\'';
@ -726,6 +706,11 @@ if ($err[0] == '00000') {
$_SESSION['error'][] = 'Could not read status data from search database: '.$err[2];
$_RDATA['sp_domains'] = json_decode($_ODATA['sp_domains'], true);
if (count($_RDATA['sp_domains']) == 1 && $_ODATA['jw_hostname'] != key($_RDATA['sp_domains']))
OS_setValue('jw_hostname', key($_RDATA['sp_domains']));
// Match Weighting Values
$weights = explode('%', $_ODATA['s_weights']);
$_RDATA['s_weights'] = array(
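
The reinstall requirement in the commit message comes down to the schema changes shown above: the config table gains an sp_domains TEXT field (initialised to an empty string), and the crawldata table loses url_base and status_noindex; crawltemp appears to share crawldata's layout, judging by the INSERT in the crawler diff below. For anyone who would rather alter an existing install than reinstall, an untested sketch of the equivalent migration follows (placeholder connection details and os_ prefix; back up the database first):

<?php
// Untested sketch inferred from the schema changes above; not part of
// the commit. DSN, credentials and 'os_' prefix are placeholders.
$pdo      = new PDO('mysql:host=localhost;dbname=orcinus;charset=utf8mb4', 'user', 'pass');
$tbprefix = 'os_';

// New config field that caches the crawl's per-domain page counts as JSON
$pdo->exec('ALTER TABLE `'.$tbprefix.'config`
            ADD COLUMN `sp_domains` TEXT NOT NULL AFTER `sp_pages_stored`');
$pdo->exec('UPDATE `'.$tbprefix.'config` SET `sp_domains`=\'\'');

// Columns this commit no longer reads or writes
foreach (array('crawldata', 'crawltemp') as $table) {
  $pdo->exec('ALTER TABLE `'.$tbprefix.$table.'`
              DROP COLUMN `url_base`, DROP COLUMN `status_noindex`');
}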


@ -332,6 +332,7 @@ function OS_crawlCleanUp() {
OS_setValue('sp_links_crawled', count($_RDATA['sp_links']));
OS_setValue('sp_pages_stored', count($_RDATA['sp_store']));
OS_setValue('sp_domains', json_encode($_RDATA['sp_domains']));
OS_setValue('sp_time_end_success', $_ODATA['sp_time_end']);
OS_crawlLog('***** Crawl completed in '.$_ODATA['sp_time_last'].'s *****', 1);
@ -644,6 +645,7 @@ $_RDATA['sp_robots_header'] = 0;
$_RDATA['sp_complete'] = false;
$_RDATA['sp_links'] = array();
$_RDATA['sp_store'] = array();
$_RDATA['sp_domains'] = array();
$_RDATA['sp_sitemap'] = array();
$_RDATA['sp_robots'] = array();
$_RDATA['sp_status'] = array('Orphan' => 0, 'Blocked' => 0, 'Not Found' => 0, 'Updated' => 0, 'New' => 0);
@ -789,7 +791,6 @@ $updateURL = $_DDATA['pdo']->prepare(
$insertTemp = $_DDATA['pdo']->prepare(
'INSERT INTO `'.$_DDATA['tbprefix'].'crawltemp` SET
`url`=:url,
`url_base`=:url_base,
`url_sort`=0,
`title`=:title,
`description`=:description,
@ -802,7 +803,6 @@ $insertTemp = $_DDATA['pdo']->prepare(
`content_charset`=:content_charset,
`content_checksum`=:content_checksum,
`status`=:status,
`status_noindex`=:status_noindex,
`flag_unlisted`=:flag_unlisted,
`flag_updated`=:flag_updated,
`last_modified`=:last_modified,
@ -1439,7 +1439,6 @@ while ($_cURL && count($_RDATA['sp_queue'])) {
$port = (!empty($data['url']['port'])) ? ':'.$data['url']['port'] : '';
$insertTemp->execute(array(
'url' => $url,
'url_base' => $data['url']['scheme'].'://'.$data['url']['host'].$port,
'title' => $data['title'],
'description' => $data['description'],
'keywords' => $data['keywords'],
@ -1451,7 +1450,6 @@ while ($_cURL && count($_RDATA['sp_queue'])) {
'content_charset' => $data['info']['charset'],
'content_checksum' => $data['info']['sha1'],
'status' => $data['info']['status'],
'status_noindex' => $data['info']['noindex'],
'flag_unlisted' => $row['flag_unlisted'],
'flag_updated' => 1,
'last_modified' => $data['info']['filetime'],
@ -1512,6 +1510,13 @@ while ($_cURL && count($_RDATA['sp_queue'])) {
}
}
$domain = $data['url']['scheme'].'://'.$data['url']['host'];
if (!isset($_RDATA['sp_domains'][$domain])) {
$_RDATA['sp_domains'][$domain] = 1;
} else $_RDATA['sp_domains'][$domain]++;
// Store data for use in the sitemap
if ($_ODATA['sp_sitemap_file'] &&
$data['url']['host'] == $_ODATA['sp_sitemap_hostname']) {


@ -146,6 +146,16 @@ for (let x = 0; x < countUpTimers.length; x++) {
/* ***** Page >> Page Index **************************************** */
let os_page_index_download = document.getElementById('os_page_index_download');
if (os_page_index_download) {
os_page_index_download.addEventListener('click', function() {
os_download('page-index.txt', {
action: 'download',
content: 'page_index'
});
}, false);
}
let select_pagination = document.querySelectorAll('select[name="os_index_select_pagination"]');
for (let x = 0; x < select_pagination.length; x++) {
select_pagination[x].addEventListener('change', function() {


@ -603,8 +603,8 @@ if ($_RDATA['s_searchable_pages']) {
$_ORCINUS->searchable->searched->results->result_list = array();
// Prepare PCRE for removing base domains
if (count($_RDATA['s_crawldata_domains']) == 1)
$repStr = '/^'.preg_quote(key($_RDATA['s_crawldata_domains']), '/').'/';
if (count($_RDATA['sp_domains']) == 1)
$repStr = '/^'.preg_quote(key($_RDATA['sp_domains']), '/').'/';
// Do a last once-over of the results
foreach ($resultsPage as $key => $result) {
@ -633,7 +633,7 @@ if ($_RDATA['s_searchable_pages']) {
$_RESULT->relevance = number_format($result['relevance'], 2, '.', '');
// Remove base domain from URL if they are all the same
if (count($_RDATA['s_crawldata_domains']) == 1)
if (count($_RDATA['sp_domains']) == 1)
$result['url'] = preg_replace($repStr, '', $result['url']);
// Highlight the terms in the title, url and matchtext