Enable downloading Page Index
Allow downloading of the page index as a CSV file. Remove the unnecessary database columns `url_base` and `status_noindex`. Store the list of domains at crawl time so we don't need to request them on every page load; because of this schema change, you will need to do a fresh reinstall.
This commit is contained in:
parent
bab4a7e2c5
commit
4bb28031b6
|
@ -146,7 +146,7 @@ if (!$_SESSION['admin_username']) {
|
|||
if (empty($_POST->action)) $_POST->action = '';
|
||||
switch ($_POST->action) {
|
||||
|
||||
// Set the key for initiating the crawler
|
||||
// ***** Set the key for initiating the crawler
|
||||
case 'setkey':
|
||||
if (!$_ODATA['sp_crawling']) {
|
||||
$md5 = md5(hrtime(true));
|
||||
|
@ -167,10 +167,12 @@ if (!$_SESSION['admin_username']) {
|
|||
}
|
||||
break;
|
||||
|
||||
// Download a text file of the most recent crawl or query log
|
||||
// ***** Download a text or csv file
|
||||
case 'download':
|
||||
if (empty($_POST->content)) $_POST->content = '';
|
||||
switch ($_POST->content) {
|
||||
|
||||
// Download a text file of the latest crawl log
|
||||
case 'crawl_log':
|
||||
if (!$_ODATA['sp_crawling']) {
|
||||
if ($_ODATA['sp_time_end']) {
|
||||
|
@ -209,16 +211,72 @@ if (!$_SESSION['admin_username']) {
|
|||
}
|
||||
break;
|
||||
|
||||
// Download a csv of the unfiltered page index
|
||||
case 'page_index':
|
||||
$pageIndex = $_DDATA['pdo']->query(
|
||||
'SELECT `url`, `category`, `content_mime`, `content_charset`,
|
||||
`status`, `flag_unlisted`, `last_modified`, `priority`
|
||||
FROM `'.$_DDATA['tbprefix'].'crawldata` ORDER BY `url_sort`;');
|
||||
$err = $pageIndex->errorInfo();
|
||||
if ($err[0] == '00000') {
|
||||
|
||||
$pageIndex = $pageIndex->fetchAll();
|
||||
if (count($pageIndex)) {
|
||||
|
||||
header('Content-type: text/csv; charset='.strtolower($_ODATA['s_charset']));
|
||||
header('Content-disposition: attachment; filename="'.
|
||||
'page-index_'.date('Y-m-d').'.csv"');
|
||||
|
||||
$output = fopen('php://output', 'w');
|
||||
|
||||
// UTF-8 byte order mark
|
||||
if (strtolower($_ODATA['s_charset']) == 'utf-8')
|
||||
fwrite($output, "\xEF\xBB\xBF");
|
||||
|
||||
$headings = array(
|
||||
'URL', 'Category', 'MIME Type', 'Character Encoding',
|
||||
'Status', 'Last Modified', 'Priority'
|
||||
);
|
||||
fputcsv($output, $headings);
|
||||
|
||||
foreach ($pageIndex as $line) {
|
||||
if ($line['flag_unlisted'])
|
||||
$line['status'] .= ' (Unlisted)';
|
||||
unset($line['flag_unlisted']);
|
||||
|
||||
$line['last_modified'] = date('c', $line['last_modified']);
|
||||
|
||||
fputcsv($output, $line);
|
||||
}
|
||||
|
||||
fclose($output);
|
||||
die();
|
||||
|
||||
} else {
|
||||
$response = array(
|
||||
'status' => 'Error',
|
||||
'message' => 'The page index is empty; nothing to download'
|
||||
);
|
||||
}
|
||||
} else {
|
||||
$response = array(
|
||||
'status' => 'Error',
|
||||
'message' => 'Could not read the page index database'
|
||||
);
|
||||
}
|
||||
break;
|
||||
|
||||
// Download a csv of the complete query log
|
||||
case 'query_log':
|
||||
$querylog = $_DDATA['pdo']->query(
|
||||
$queryLog = $_DDATA['pdo']->query(
|
||||
'SELECT `query`, `results`, `stamp`, INET_NTOA(`ip`) AS `ipaddr`
|
||||
FROM `'.$_DDATA['tbprefix'].'query` ORDER BY `stamp` DESC;'
|
||||
);
|
||||
$err = $querylog->errorInfo();
|
||||
$err = $queryLog->errorInfo();
|
||||
if ($err[0] == '00000') {
|
||||
|
||||
$querylog = $querylog->fetchAll();
|
||||
if (count($querylog)) {
|
||||
$queryLog = $queryLog->fetchAll();
|
||||
if (count($queryLog)) {
|
||||
|
||||
header('Content-type: text/csv; charset='.strtolower($_ODATA['s_charset']));
|
||||
header('Content-disposition: attachment; filename="'.
|
||||
|
@ -232,9 +290,9 @@ if (!$_SESSION['admin_username']) {
|
|||
|
||||
$headings = array('Query', 'Results', 'Time Stamp', 'IP');
|
||||
if ($_GEOIP2) $headings[] = 'Country';
|
||||
|
||||
fputcsv($output, $headings);
|
||||
foreach ($querylog as $line) {
|
||||
|
||||
foreach ($queryLog as $line) {
|
||||
$line['stamp'] = date('c', $line['stamp']);
|
||||
|
||||
if ($_GEOIP2) {
|
||||
|
@ -274,7 +332,7 @@ if (!$_SESSION['admin_username']) {
|
|||
break;
|
||||
|
||||
|
||||
// Not used?
|
||||
// ***** Not used?
|
||||
case 'fetch':
|
||||
if (empty($_POST->value)) $_POST->value = '';
|
||||
if (!empty($_ODATA[$_POST->value])) {
|
||||
|
@ -553,6 +611,26 @@ if (!$_SESSION['admin_username']) {
|
|||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// Refresh the sp_domains data since we deleted some rows
|
||||
$_RDATA['sp_domains'] = array();
|
||||
$urls = $_DDATA['pdo']->query(
|
||||
'SELECT `url` FROM `'.$_DDATA['tbprefix'].'crawldata`;'
|
||||
);
|
||||
$err = $urls->errorInfo();
|
||||
if ($err[0] == '00000') {
|
||||
$urls = $urls->fetchAll();
|
||||
foreach ($urls as $url) {
|
||||
$url = parse_url($url['url']);
|
||||
if (is_array($url)) {
|
||||
$domain = $url['scheme'].'://'.$url['host'];
|
||||
if (!isset($_RDATA['sp_domains'][$domain])) {
|
||||
$_RDATA['sp_domains'][$domain] = 1;
|
||||
} else $_RDATA['sp_domains'][$domain]++;
|
||||
}
|
||||
}
|
||||
OS_setValue('sp_domains', json_encode($_RDATA['sp_domains']));
|
||||
} else $_SESSION['error'][] = 'Could not read domain count data from search database: '.$err[2];
|
||||
break;
|
||||
|
||||
case 'category':
|
||||
|
@ -803,7 +881,7 @@ if (!$_SESSION['admin_username']) {
|
|||
`content_mime`, `weighted`, `content`, `priority`
|
||||
FROM `'.$_DDATA['tbprefix'].'crawldata`
|
||||
WHERE `flag_unlisted`<>1 AND '.$query_status.' AND
|
||||
`url_base` LIKE \'%'.addslashes($_ODATA['jw_hostname']).'\';'
|
||||
`url` LIKE \'%://'.addslashes($_ODATA['jw_hostname']).'/%\';'
|
||||
);
|
||||
$err = $select->errorInfo();
|
||||
if ($err[0] == '00000') {
|
||||
|
@ -1544,8 +1622,8 @@ document.write(mustache.render(
|
|||
// ***** Select rows to populate the Page Index table
|
||||
$indexRows = $_DDATA['pdo']->prepare(
|
||||
'SELECT SQL_CALC_FOUND_ROWS
|
||||
`url`, `url_base`, `title`, `category`, `content_checksum`, `status`,
|
||||
`status_noindex`, `flag_unlisted`, `flag_updated`, `priority`
|
||||
`url`, `title`, `category`, `content_checksum`,
|
||||
`status`, `flag_unlisted`, `flag_updated`, `priority`
|
||||
FROM `'.$_DDATA['tbprefix'].'crawldata`
|
||||
WHERE (:text1=\'\' OR `url` LIKE :text2) AND
|
||||
(:category1=\'\' OR `category`=:category2) AND
|
||||
|
@ -2232,9 +2310,13 @@ document.write(mustache.render(
|
|||
* Page Index ********************************************** */
|
||||
case 'index': ?>
|
||||
<section class="row justify-content-center">
|
||||
<header class="col-xl-10 col-xxl-8 mb-2">
|
||||
<header class="col-6 col-xl-5 col-xxl-4 mb-2">
|
||||
<h2>Page Index</h2>
|
||||
</header><?php
|
||||
</header>
|
||||
<div class="col-6 col-xl-5 col-xxl-4 mb-2 text-end text-nowrap">
|
||||
<button type="button" class="btn btn-primary" id="os_page_index_download" title="Download Page Index"<?php
|
||||
if (!$_RDATA['s_crawldata_info']['Rows']) echo ' disabled="disabled"'; ?>>Download</button>
|
||||
</div><?php
|
||||
|
||||
// If there are *any* rows in the database
|
||||
if ($_RDATA['s_crawldata_info']['Rows']) {
|
||||
|
@ -2388,8 +2470,8 @@ document.write(mustache.render(
|
|||
<tr><?php echo $_RDATA['index_action_row']; ?></tr>
|
||||
</tfoot>
|
||||
<tbody class="table-group-divider"><?php
|
||||
if (count($_RDATA['s_crawldata_domains']) == 1)
|
||||
$repStr = '/^'.preg_quote(key($_RDATA['s_crawldata_domains']), '/').'/';
|
||||
if (count($_RDATA['sp_domains']) == 1)
|
||||
$repStr = '/^'.preg_quote(key($_RDATA['sp_domains']), '/').'/';
|
||||
|
||||
foreach ($_RDATA['page_index_rows'] as $key => $row) { ?>
|
||||
<tr class="lh-sm">
|
||||
|
@ -2403,7 +2485,7 @@ document.write(mustache.render(
|
|||
<a href="<?php echo htmlspecialchars($row['url']); ?>" title="<?php
|
||||
echo htmlspecialchars($row['url']); ?>" target="_blank" class="align-middle<?php
|
||||
if ($row['flag_updated']) echo ' fw-bold'; ?>"><?php
|
||||
if (count($_RDATA['s_crawldata_domains']) == 1) {
|
||||
if (count($_RDATA['sp_domains']) == 1) {
|
||||
echo htmlspecialchars(preg_replace($repStr, '', $row['url']));
|
||||
} else echo htmlspecialchars($row['url']);
|
||||
?></a><?php
|
||||
|
@ -2655,13 +2737,13 @@ document.write(mustache.render(
|
|||
</legend>
|
||||
<div class="p-2 border border-1 border-secondary-subtle rounded-bottom-3">
|
||||
<ul class="list-group mb-2"><?php
|
||||
if (count($_RDATA['s_crawldata_domains']) > 1) { ?>
|
||||
if (count($_RDATA['sp_domains']) > 1) { ?>
|
||||
<li class="list-group-item">
|
||||
<label class="d-flex lh-lg w-100">
|
||||
<strong class="pe-2">Domain:</strong>
|
||||
<span class="text-end flex-grow-1 text-nowrap">
|
||||
<select name="os_jw_hostname" class="form-select d-inline-block"><?php
|
||||
foreach ($_RDATA['s_crawldata_domains'] as $domain => $count) { ?>
|
||||
foreach ($_RDATA['sp_domains'] as $domain => $count) { ?>
|
||||
<option value="<?php echo $domain; ?>"<?php
|
||||
if ($_ODATA['jw_hostname'] == $domain) echo ' selected="selected"'; ?>><?php
|
||||
echo $domain, ' (', $count, ')';
|
||||
|
|
|
@ -93,6 +93,7 @@ if (!in_array($_DDATA['tbprefix'].'config', $_DDATA['tables'])) {
|
|||
`sp_data_stored` INT UNSIGNED NOT NULL,
|
||||
`sp_links_crawled` SMALLINT UNSIGNED NOT NULL,
|
||||
`sp_pages_stored` SMALLINT UNSIGNED NOT NULL,
|
||||
`sp_domains` TEXT NOT NULL,
|
||||
`sp_autodelete` BOOLEAN NOT NULL,
|
||||
`sp_ifmodifiedsince` BOOLEAN NOT NULL,
|
||||
`sp_cookies` BOOLEAN NOT NULL,
|
||||
|
@ -173,6 +174,7 @@ if (!count($testConf->fetchAll())) {
|
|||
`sp_data_stored`=0,
|
||||
`sp_links_crawled`=0,
|
||||
`sp_pages_stored`=0,
|
||||
`sp_domains`=\'\',
|
||||
`sp_autodelete`=0,
|
||||
`sp_ifmodifiedsince`=1,
|
||||
`sp_cookies`=1,
|
||||
|
@ -214,7 +216,6 @@ if (!in_array($_DDATA['tbprefix'].'crawldata', $_DDATA['tables'])) {
|
|||
$create = $_DDATA['pdo']->query(
|
||||
'CREATE TABLE `'.$_DDATA['tbprefix'].'crawldata` (
|
||||
`url` TEXT NOT NULL,
|
||||
`url_base` TINYTEXT NOT NULL,
|
||||
`url_sort` SMALLINT UNSIGNED NOT NULL,
|
||||
`title` TEXT NOT NULL,
|
||||
`description` TEXT NOT NULL,
|
||||
|
@ -227,7 +228,6 @@ if (!in_array($_DDATA['tbprefix'].'crawldata', $_DDATA['tables'])) {
|
|||
`content_charset` TINYTEXT NOT NULL,
|
||||
`content_checksum` BINARY(20) NOT NULL,
|
||||
`status` TINYTEXT NOT NULL,
|
||||
`status_noindex` TINYTEXT NOT NULL,
|
||||
`flag_unlisted` BOOLEAN NOT NULL,
|
||||
`flag_updated` BOOLEAN NOT NULL,
|
||||
`last_modified` INT NOT NULL,
|
||||
|
@ -690,26 +690,6 @@ if ($err[0] == '00000') {
|
|||
$_SESSION['error'][] = 'Could not read categories from the search database.';
|
||||
|
||||
|
||||
// Count base URLs / domains from the crawldata: if there is only one
|
||||
// in the search database then we don't have to show it in a number of
|
||||
// places
|
||||
$_RDATA['s_crawldata_domains'] = array();
|
||||
$domains = $_DDATA['pdo']->query(
|
||||
'SELECT `url_base`, COUNT(`url_base`) as `count`
|
||||
FROM `'.$_DDATA['tbprefix'].'crawldata`
|
||||
GROUP BY `url_base` ORDER BY `count` DESC;'
|
||||
);
|
||||
$err = $domains->errorInfo();
|
||||
if ($err[0] == '00000') {
|
||||
$domains = $domains->fetchAll();
|
||||
foreach ($domains as $domain)
|
||||
$_RDATA['s_crawldata_domains'][$domain['url_base']] = $domain['count'];
|
||||
} else if (isset($_SESSION['error']))
|
||||
$_SESSION['error'][] = 'Could not read domain count data from search database: '.$err[2];
|
||||
if (count($_RDATA['s_crawldata_domains']) == 1)
|
||||
OS_setValue('jw_hostname', key($_RDATA['s_crawldata_domains']));
|
||||
|
||||
|
||||
// Count searchable pages
|
||||
$_RDATA['s_searchable_pages'] = 0;
|
||||
$query_status = ($_ODATA['s_show_orphans']) ? '(`status`=\'OK\' || `status`=\'Orphan\')' : '`status`=\'OK\'';
|
||||
|
@ -726,6 +706,11 @@ if ($err[0] == '00000') {
|
|||
$_SESSION['error'][] = 'Could not read status data from search database: '.$err[2];
|
||||
|
||||
|
||||
// Load the per-domain page counts that the crawler saves as JSON in the
// `sp_domains` config value (domain => number of pages). json_decode()
// returns null for an empty or malformed string, and a fresh install
// stores an empty string here until the first crawl completes, so fall
// back to an empty array to keep the count() calls on this value valid.
$_RDATA['sp_domains'] = json_decode($_ODATA['sp_domains'], true);
if (!is_array($_RDATA['sp_domains'])) $_RDATA['sp_domains'] = array();

// With exactly one domain there is nothing to choose between, so keep
// `jw_hostname` pinned to it; only write when the stored value differs.
if (count($_RDATA['sp_domains']) == 1 && $_ODATA['jw_hostname'] != key($_RDATA['sp_domains']))
  OS_setValue('jw_hostname', key($_RDATA['sp_domains']));
|
||||
|
||||
|
||||
// Match Weighting Values
|
||||
$weights = explode('%', $_ODATA['s_weights']);
|
||||
$_RDATA['s_weights'] = array(
|
||||
|
|
|
@ -332,6 +332,7 @@ function OS_crawlCleanUp() {
|
|||
|
||||
OS_setValue('sp_links_crawled', count($_RDATA['sp_links']));
|
||||
OS_setValue('sp_pages_stored', count($_RDATA['sp_store']));
|
||||
OS_setValue('sp_domains', json_encode($_RDATA['sp_domains']));
|
||||
OS_setValue('sp_time_end_success', $_ODATA['sp_time_end']);
|
||||
|
||||
OS_crawlLog('***** Crawl completed in '.$_ODATA['sp_time_last'].'s *****', 1);
|
||||
|
@ -644,6 +645,7 @@ $_RDATA['sp_robots_header'] = 0;
|
|||
$_RDATA['sp_complete'] = false;
|
||||
$_RDATA['sp_links'] = array();
|
||||
$_RDATA['sp_store'] = array();
|
||||
$_RDATA['sp_domains'] = array();
|
||||
$_RDATA['sp_sitemap'] = array();
|
||||
$_RDATA['sp_robots'] = array();
|
||||
$_RDATA['sp_status'] = array('Orphan' => 0, 'Blocked' => 0, 'Not Found' => 0, 'Updated' => 0, 'New' => 0);
|
||||
|
@ -789,7 +791,6 @@ $updateURL = $_DDATA['pdo']->prepare(
|
|||
$insertTemp = $_DDATA['pdo']->prepare(
|
||||
'INSERT INTO `'.$_DDATA['tbprefix'].'crawltemp` SET
|
||||
`url`=:url,
|
||||
`url_base`=:url_base,
|
||||
`url_sort`=0,
|
||||
`title`=:title,
|
||||
`description`=:description,
|
||||
|
@ -802,7 +803,6 @@ $insertTemp = $_DDATA['pdo']->prepare(
|
|||
`content_charset`=:content_charset,
|
||||
`content_checksum`=:content_checksum,
|
||||
`status`=:status,
|
||||
`status_noindex`=:status_noindex,
|
||||
`flag_unlisted`=:flag_unlisted,
|
||||
`flag_updated`=:flag_updated,
|
||||
`last_modified`=:last_modified,
|
||||
|
@ -1439,7 +1439,6 @@ while ($_cURL && count($_RDATA['sp_queue'])) {
|
|||
$port = (!empty($data['url']['port'])) ? ':'.$data['url']['port'] : '';
|
||||
$insertTemp->execute(array(
|
||||
'url' => $url,
|
||||
'url_base' => $data['url']['scheme'].'://'.$data['url']['host'].$port,
|
||||
'title' => $data['title'],
|
||||
'description' => $data['description'],
|
||||
'keywords' => $data['keywords'],
|
||||
|
@ -1451,7 +1450,6 @@ while ($_cURL && count($_RDATA['sp_queue'])) {
|
|||
'content_charset' => $data['info']['charset'],
|
||||
'content_checksum' => $data['info']['sha1'],
|
||||
'status' => $data['info']['status'],
|
||||
'status_noindex' => $data['info']['noindex'],
|
||||
'flag_unlisted' => $row['flag_unlisted'],
|
||||
'flag_updated' => 1,
|
||||
'last_modified' => $data['info']['filetime'],
|
||||
|
@ -1512,6 +1510,13 @@ while ($_cURL && count($_RDATA['sp_queue'])) {
|
|||
}
|
||||
}
|
||||
|
||||
|
||||
$domain = $data['url']['scheme'].'://'.$data['url']['host'];
|
||||
if (!isset($_RDATA['sp_domains'][$domain])) {
|
||||
$_RDATA['sp_domains'][$domain] = 1;
|
||||
} else $_RDATA['sp_domains'][$domain]++;
|
||||
|
||||
|
||||
// Store data for use in the sitemap
|
||||
if ($_ODATA['sp_sitemap_file'] &&
|
||||
$data['url']['host'] == $_ODATA['sp_sitemap_hostname']) {
|
||||
|
|
|
@ -146,6 +146,16 @@ for (let x = 0; x < countUpTimers.length; x++) {
|
|||
|
||||
|
||||
/* ***** Page >> Page Index **************************************** */
|
||||
// Page Index "Download" button: request the page index export from the
// server when clicked. The element is looked up by id and the handler is
// only attached when it is present on the current page.
let os_page_index_download = document.getElementById('os_page_index_download');
if (os_page_index_download) {
  os_page_index_download.addEventListener('click', function() {
    // The 'page_index' download is served as text/csv, so pass a .csv
    // file name rather than .txt.
    // NOTE(review): assumes the first argument of os_download() is the
    // file name used for the saved download -- confirm against its
    // definition.
    os_download('page-index.csv', {
      action: 'download',
      content: 'page_index'
    });
  }, false);
}
|
||||
|
||||
let select_pagination = document.querySelectorAll('select[name="os_index_select_pagination"]');
|
||||
for (let x = 0; x < select_pagination.length; x++) {
|
||||
select_pagination[x].addEventListener('change', function() {
|
||||
|
|
|
@ -603,8 +603,8 @@ if ($_RDATA['s_searchable_pages']) {
|
|||
$_ORCINUS->searchable->searched->results->result_list = array();
|
||||
|
||||
// Prepare PCRE for removing base domains
|
||||
if (count($_RDATA['s_crawldata_domains']) == 1)
|
||||
$repStr = '/^'.preg_quote(key($_RDATA['s_crawldata_domains']), '/').'/';
|
||||
if (count($_RDATA['sp_domains']) == 1)
|
||||
$repStr = '/^'.preg_quote(key($_RDATA['sp_domains']), '/').'/';
|
||||
|
||||
// Do a last once-over of the results
|
||||
foreach ($resultsPage as $key => $result) {
|
||||
|
@ -633,7 +633,7 @@ if ($_RDATA['s_searchable_pages']) {
|
|||
$_RESULT->relevance = number_format($result['relevance'], 2, '.', '');
|
||||
|
||||
// Remove base domain from URL if they are all the same
|
||||
if (count($_RDATA['s_crawldata_domains']) == 1)
|
||||
if (count($_RDATA['sp_domains']) == 1)
|
||||
$result['url'] = preg_replace($repStr, '', $result['url']);
|
||||
|
||||
// Highlight the terms in the title, url and matchtext
|
||||
|
|
Loading…
Reference in a new issue