Admin UI edits for when crawl is in progress

Automatically encode/decode JSON when saving/reading ODATA config values.
Remove 'sp_links_crawled' config table value, now stored in 'sp_progress'.
Update the Crawl Information window in real time while the crawler is running. Reload the page more aggressively to get the latest data once a crawl has finished.
Set certain config values at more sensible points in time while crawling.
This commit is contained in:
Brian Huisman 2023-05-16 12:00:28 -04:00
parent f16c4f9e0a
commit d8e9d5dc91
5 changed files with 151 additions and 135 deletions

View file

@ -12,7 +12,7 @@ require __DIR__.'/config.php';
* Display a 'time since' HTML/Javascript counter * Display a 'time since' HTML/Javascript counter
* *
*/ */
function OS_countUp($time, $id = '') { function OS_countUp($time, $active, $id = '') {
$since = time() - $time; $since = time() - $time;
$periods = array( $periods = array(
array('d', 'day', 'days'), array('d', 'day', 'days'),
@ -24,7 +24,8 @@ function OS_countUp($time, $id = '') {
$hours = floor($since / 3600); $since %= 3600; $hours = floor($since / 3600); $since %= 3600;
$minutes = floor($since / 60); $minutes = floor($since / 60);
$seconds = $since % 60; ?> $seconds = $since % 60; ?>
<span class="countup_timer" data-start="<?php echo $time; ?>" title="<?php echo date('r', $time); ?>"<?php <span class="countup_timer<?php if ($active) echo ' active'; ?>" data-start="<?php
echo $time; ?>" title="<?php echo date('r', $time); ?>"<?php
if (!empty($id)) echo ' id="'.htmlspecialchars($id).'"'; ?>> if (!empty($id)) echo ' id="'.htmlspecialchars($id).'"'; ?>>
<span data-period="days"<?php <span data-period="days"<?php
if (!$days) echo ' class="d-none"'; ?>> if (!$days) echo ' class="d-none"'; ?>>
@ -43,7 +44,7 @@ function OS_countUp($time, $id = '') {
</span> </span>
<span data-period="seconds"> <span data-period="seconds">
<var><?php echo $seconds; ?></var> <var><?php echo $seconds; ?></var>
<?php echo ($seconds == 1) ? $periods[3][1] : $periods[3][2]; ?> ago <?php echo ($seconds == 1) ? $periods[3][1] : $periods[3][2]; ?>
</span> </span>
</span><?php </span><?php
} }
@ -613,7 +614,7 @@ if (!$_SESSION['admin_username']) {
} }
// Refresh the sp_domains data since we deleted some rows // Refresh the sp_domains data since we deleted some rows
$_RDATA['sp_domains'] = array(); $domainList = array();
$urls = $_DDATA['pdo']->query( $urls = $_DDATA['pdo']->query(
'SELECT `url` FROM `'.$_DDATA['tbprefix'].'crawldata`;' 'SELECT `url` FROM `'.$_DDATA['tbprefix'].'crawldata`;'
); );
@ -624,12 +625,12 @@ if (!$_SESSION['admin_username']) {
$url = parse_url($url['url']); $url = parse_url($url['url']);
if (is_array($url)) { if (is_array($url)) {
$domain = $url['scheme'].'://'.$url['host']; $domain = $url['scheme'].'://'.$url['host'];
if (!isset($_RDATA['sp_domains'][$domain])) { if (!isset($domainList[$domain])) {
$_RDATA['sp_domains'][$domain] = 1; $domainList[$domain] = 1;
} else $_RDATA['sp_domains'][$domain]++; } else $domainList[$domain]++;
} }
} }
OS_setValue('sp_domains', json_encode($_RDATA['sp_domains'])); OS_setValue('sp_domains', $domainList);
} else $_SESSION['error'][] = 'Could not read domain count data from search database: '.$err[2]; } else $_SESSION['error'][] = 'Could not read domain count data from search database: '.$err[2];
break; break;
@ -1903,9 +1904,14 @@ document.write(mustache.render(
</span> </span>
</label> </label>
<div><?php <div><?php
OS_countUp(($_ODATA['sp_time_end']) ? $_ODATA['sp_time_end'] : time(), 'os_countup_time_end'); if (!$_ODATA['sp_crawling']) {
OS_countUp(($_ODATA['sp_time_end']) ? $_ODATA['sp_time_end'] : time(), true, 'os_countup_time_end');
?> ago<?php
} else { ?>
<em>Currently crawling...</em><?php
}
?></div><?php ?></div><?php
if ($_ODATA['sp_time_end'] != $_ODATA['sp_time_end_success']) { ?> if (!$_ODATA['sp_crawling'] && $_ODATA['sp_time_end'] != $_ODATA['sp_time_end_success']) { ?>
<p class="data-text text-danger"> <p class="data-text text-danger">
<strong>Warning:</strong> The previous crawl did not complete successfully. <strong>Warning:</strong> The previous crawl did not complete successfully.
Please check the crawl log for more details. Please check the crawl log for more details.
@ -1916,8 +1922,12 @@ document.write(mustache.render(
<label class="d-flex w-100"> <label class="d-flex w-100">
<strong class="pe-2">Crawl Time</strong> <strong class="pe-2">Crawl Time</strong>
<var class="flex-grow-1 text-end" id="os_crawl_time_last"><?php <var class="flex-grow-1 text-end" id="os_crawl_time_last"><?php
echo $_ODATA['sp_time_last']; if ($_ODATA['sp_crawling']) {
?> <abbr title="seconds">s</abbr></var> OS_countUp($_ODATA['sp_time_start'], true, 'os_countup_time_crawl');
} else {
OS_countUp(time() - $_ODATA['sp_time_last'], false, 'os_countup_time_crawl');
}
?></var>
</label> </label>
</li> </li>
<li class="list-group-item"> <li class="list-group-item">
@ -1932,12 +1942,14 @@ document.write(mustache.render(
<label class="d-flex w-100"> <label class="d-flex w-100">
<strong class="pe-2">Data Stored</strong> <strong class="pe-2">Data Stored</strong>
<var class="flex-grow-1 text-end" id="os_crawl_data_stored"><?php <var class="flex-grow-1 text-end" id="os_crawl_data_stored"><?php
if ($_ODATA['sp_data_transferred']) { ?> if (!$_ODATA['sp_crawling']) {
<small data-bs-toggle="tooltip" data-bs-placement="bottom" title="Efficiency percentage of data stored vs. data downloaded"><?php if ($_ODATA['sp_data_transferred']) { ?>
echo '('.round(($_ODATA['sp_data_stored'] / $_ODATA['sp_data_transferred']) * 100, 1).'%)'; <small data-bs-toggle="tooltip" data-bs-placement="bottom" title="Efficiency percentage of data stored vs. data downloaded"><?php
?></small> <?php echo '('.round(($_ODATA['sp_data_stored'] / $_ODATA['sp_data_transferred']) * 100, 1).'%)';
} ?></small> <?php
echo OS_readSize($_ODATA['sp_data_stored'], true); }
echo OS_readSize($_ODATA['sp_data_stored'], true);
} else echo '0';
?></var> ?></var>
</label> </label>
</li> </li>
@ -1945,7 +1957,9 @@ document.write(mustache.render(
<label class="d-flex w-100"> <label class="d-flex w-100">
<strong class="pe-2">Links Crawled</strong> <strong class="pe-2">Links Crawled</strong>
<var class="flex-grow-1 text-end" id="os_crawl_links_crawled"><?php <var class="flex-grow-1 text-end" id="os_crawl_links_crawled"><?php
echo $_ODATA['sp_links_crawled']; if ($_ODATA['sp_crawling']) {
echo $_ODATA['sp_progress'][0].' / '.$_ODATA['sp_progress'][1];
} else echo $_ODATA['sp_progress'][0];
?></var> ?></var>
</label> </label>
</li> </li>
@ -1953,12 +1967,14 @@ document.write(mustache.render(
<label class="d-flex w-100"> <label class="d-flex w-100">
<strong class="pe-2">Pages Stored</strong> <strong class="pe-2">Pages Stored</strong>
<var class="flex-grow-1 text-end" id="os_crawl_pages_stored"><?php <var class="flex-grow-1 text-end" id="os_crawl_pages_stored"><?php
if ($_ODATA['sp_links_crawled']) { ?> if (!$_ODATA['sp_crawling']) {
<small data-bs-toggle="tooltip" data-bs-placement="bottom" title="Efficiency percentage of pages stored vs. links crawled"><?php if ($_ODATA['sp_progress'][0]) { ?>
echo '('.round(($_ODATA['sp_pages_stored'] / $_ODATA['sp_links_crawled']) * 100, 1).'%)'; <small data-bs-toggle="tooltip" data-bs-placement="bottom" title="Efficiency percentage of pages stored vs. links crawled"><?php
?></small> <?php echo '('.round(($_ODATA['sp_pages_stored'] / $_ODATA['sp_progress'][0]) * 100, 1).'%)';
} ?></small> <?php
echo $_ODATA['sp_pages_stored']; }
echo $_ODATA['sp_pages_stored'];
} else echo '0';
?></var> ?></var>
</label> </label>
</li><?php </li><?php
@ -2470,8 +2486,8 @@ document.write(mustache.render(
<tr><?php echo $_RDATA['index_action_row']; ?></tr> <tr><?php echo $_RDATA['index_action_row']; ?></tr>
</tfoot> </tfoot>
<tbody class="table-group-divider"><?php <tbody class="table-group-divider"><?php
if (count($_RDATA['sp_domains']) == 1) if (count($_ODATA['sp_domains']) == 1)
$repStr = '/^'.preg_quote(key($_RDATA['sp_domains']), '/').'/'; $repStr = '/^'.preg_quote(key($_ODATA['sp_domains']), '/').'/';
foreach ($_RDATA['page_index_rows'] as $key => $row) { ?> foreach ($_RDATA['page_index_rows'] as $key => $row) { ?>
<tr class="lh-sm"> <tr class="lh-sm">
@ -2485,7 +2501,7 @@ document.write(mustache.render(
<a href="<?php echo htmlspecialchars($row['url']); ?>" title="<?php <a href="<?php echo htmlspecialchars($row['url']); ?>" title="<?php
echo htmlspecialchars($row['url']); ?>" target="_blank" class="align-middle<?php echo htmlspecialchars($row['url']); ?>" target="_blank" class="align-middle<?php
if ($row['flag_updated']) echo ' fw-bold'; ?>"><?php if ($row['flag_updated']) echo ' fw-bold'; ?>"><?php
if (count($_RDATA['sp_domains']) == 1) { if (count($_ODATA['sp_domains']) == 1) {
echo htmlspecialchars(preg_replace($repStr, '', $row['url'])); echo htmlspecialchars(preg_replace($repStr, '', $row['url']));
} else echo htmlspecialchars($row['url']); } else echo htmlspecialchars($row['url']);
?></a><?php ?></a><?php
@ -2737,13 +2753,13 @@ document.write(mustache.render(
</legend> </legend>
<div class="p-2 border border-1 border-secondary-subtle rounded-bottom-3"> <div class="p-2 border border-1 border-secondary-subtle rounded-bottom-3">
<ul class="list-group mb-2"><?php <ul class="list-group mb-2"><?php
if (count($_RDATA['sp_domains']) > 1) { ?> if (count($_ODATA['sp_domains']) > 1) { ?>
<li class="list-group-item"> <li class="list-group-item">
<label class="d-flex lh-lg w-100"> <label class="d-flex lh-lg w-100">
<strong class="pe-2">Domain:</strong> <strong class="pe-2">Domain:</strong>
<span class="text-end flex-grow-1 text-nowrap"> <span class="text-end flex-grow-1 text-nowrap">
<select name="os_jw_hostname" class="form-select d-inline-block"><?php <select name="os_jw_hostname" class="form-select d-inline-block"><?php
foreach ($_RDATA['sp_domains'] as $domain => $count) { ?> foreach ($_ODATA['sp_domains'] as $domain => $count) { ?>
<option value="<?php echo $domain; ?>"<?php <option value="<?php echo $domain; ?>"<?php
if ($_ODATA['jw_hostname'] == $domain) echo ' selected="selected"'; ?>><?php if ($_ODATA['jw_hostname'] == $domain) echo ' selected="selected"'; ?>><?php
echo $domain, ' (', $count, ')'; echo $domain, ' (', $count, ')';

View file

@ -91,7 +91,6 @@ if (!in_array($_DDATA['tbprefix'].'config', $_DDATA['tables'])) {
`sp_time_last` SMALLINT UNSIGNED NOT NULL, `sp_time_last` SMALLINT UNSIGNED NOT NULL,
`sp_data_transferred` INT UNSIGNED NOT NULL, `sp_data_transferred` INT UNSIGNED NOT NULL,
`sp_data_stored` INT UNSIGNED NOT NULL, `sp_data_stored` INT UNSIGNED NOT NULL,
`sp_links_crawled` SMALLINT UNSIGNED NOT NULL,
`sp_pages_stored` SMALLINT UNSIGNED NOT NULL, `sp_pages_stored` SMALLINT UNSIGNED NOT NULL,
`sp_domains` TEXT NOT NULL, `sp_domains` TEXT NOT NULL,
`sp_autodelete` BOOLEAN NOT NULL, `sp_autodelete` BOOLEAN NOT NULL,
@ -172,7 +171,6 @@ if (!count($testConf->fetchAll())) {
`sp_time_last`=0, `sp_time_last`=0,
`sp_data_transferred`=0, `sp_data_transferred`=0,
`sp_data_stored`=0, `sp_data_stored`=0,
`sp_links_crawled`=0,
`sp_pages_stored`=0, `sp_pages_stored`=0,
`sp_domains`=\'\', `sp_domains`=\'\',
`sp_autodelete`=0, `sp_autodelete`=0,
@ -280,10 +278,14 @@ function OS_setValue($columnName, $value) {
if (!isset($_ODATA[$columnName])) return 0; if (!isset($_ODATA[$columnName])) return 0;
$encValue = $value;
if (is_array($encValue) || is_object($encValue))
$encValue = json_encode($encValue);
$update = $_DDATA['pdo']->prepare( $update = $_DDATA['pdo']->prepare(
'UPDATE `'.$_DDATA['tbprefix'].'config` SET `'.$columnName.'`=:value;' 'UPDATE `'.$_DDATA['tbprefix'].'config` SET `'.$columnName.'`=:value;'
); );
$update->execute(array('value' => $value)); $update->execute(array('value' => $encValue));
$err = $update->errorInfo(); $err = $update->errorInfo();
if ($err[0] != '00000') { if ($err[0] != '00000') {
@ -313,8 +315,10 @@ function OS_getValue($columnName) {
$err = $select->errorInfo(); $err = $select->errorInfo();
if ($err[0] == '00000') { if ($err[0] == '00000') {
$select = $select->fetchAll(); $select = $select->fetchAll();
if (count($select)) if (count($select)) {
$_ODATA[$columnName] = $select[0][$columnName]; $json = json_decode($select[0][$columnName], true);
$_ODATA[$columnName] = (!is_null($json)) ? $json : $select[0][$columnName];
}
} else if (isset($_SESSION['error'])) } else if (isset($_SESSION['error']))
$_SESSION['error'][] = 'Could not get live value of \''.$columnName.'\' from config database.'; $_SESSION['error'][] = 'Could not get live value of \''.$columnName.'\' from config database.';
@ -361,7 +365,11 @@ $err = $odata->errorInfo();
if ($err[0] == '00000') { if ($err[0] == '00000') {
$odata = $odata->fetchAll(); $odata = $odata->fetchAll();
if (count($odata)) { if (count($odata)) {
$_ODATA = $odata[0]; $_ODATA = array();
foreach ($odata[0] as $key => $value) {
$json = json_decode($value, true);
$_ODATA[$key] = (!is_null($json)) ? $json : $value;
}
} else throw new Exception('No data in configuration table'); } else throw new Exception('No data in configuration table');
} else throw new Exception('Could not read from configuration table: '.$err[2]); } else throw new Exception('Could not read from configuration table: '.$err[2]);
@ -706,10 +714,9 @@ if ($err[0] == '00000') {
$_SESSION['error'][] = 'Could not read status data from search database: '.$err[2]; $_SESSION['error'][] = 'Could not read status data from search database: '.$err[2];
$_RDATA['sp_domains'] = json_decode($_ODATA['sp_domains'], true); if (!is_array($_ODATA['sp_domains'])) $_ODATA['sp_domains'] = array();
if (!is_array($_RDATA['sp_domains'])) $_RDATA['sp_domains'] = array(); if (count($_ODATA['sp_domains']) == 1 && $_ODATA['jw_hostname'] != key($_ODATA['sp_domains']))
if (count($_RDATA['sp_domains']) == 1 && $_ODATA['jw_hostname'] != key($_RDATA['sp_domains'])) OS_setValue('jw_hostname', key($_ODATA['sp_domains']));
OS_setValue('jw_hostname', key($_RDATA['sp_domains']));
// Match Weighting Values // Match Weighting Values

View file

@ -254,10 +254,6 @@ function OS_crawlCleanUp() {
// var_dump($cookies); // var_dump($cookies);
curl_close($_cURL); curl_close($_cURL);
OS_setValue('sp_time_end', time());
OS_setValue('sp_time_last', $_ODATA['sp_time_end'] - $_ODATA['sp_time_start']);
OS_setValue('sp_data_transferred', $_RDATA['sp_data_transferred']);
// If crawl completed successfully // If crawl completed successfully
if ($_RDATA['sp_complete']) { if ($_RDATA['sp_complete']) {
OS_crawlLog('Cleaning up database tables...', 1); OS_crawlLog('Cleaning up database tables...', 1);
@ -299,6 +295,9 @@ function OS_crawlCleanUp() {
// If crawl completed successfully AND we truncated the old table // If crawl completed successfully AND we truncated the old table
if ($_RDATA['sp_complete']) { if ($_RDATA['sp_complete']) {
OS_setValue('sp_time_end', time());
OS_setValue('sp_time_last', $_ODATA['sp_time_end'] - $_ODATA['sp_time_start']);
// Select all rows from the temp table into the existing search table // Select all rows from the temp table into the existing search table
$insert = $_DDATA['pdo']->query( $insert = $_DDATA['pdo']->query(
'INSERT INTO `'.$_DDATA['tbprefix'].'crawldata` 'INSERT INTO `'.$_DDATA['tbprefix'].'crawldata`
@ -330,9 +329,8 @@ function OS_crawlCleanUp() {
'OPTIMIZE TABLE `'.$_DDATA['tbprefix'].'query`;' 'OPTIMIZE TABLE `'.$_DDATA['tbprefix'].'query`;'
); );
OS_setValue('sp_links_crawled', count($_RDATA['sp_links']));
OS_setValue('sp_pages_stored', count($_RDATA['sp_store'])); OS_setValue('sp_pages_stored', count($_RDATA['sp_store']));
OS_setValue('sp_domains', json_encode($_RDATA['sp_domains'])); OS_setValue('sp_domains', $_RDATA['sp_domains']);
OS_setValue('sp_time_end_success', $_ODATA['sp_time_end']); OS_setValue('sp_time_end_success', $_ODATA['sp_time_end']);
OS_crawlLog('***** Crawl completed in '.$_ODATA['sp_time_last'].'s *****', 1); OS_crawlLog('***** Crawl completed in '.$_ODATA['sp_time_last'].'s *****', 1);
@ -341,7 +339,7 @@ function OS_crawlCleanUp() {
if ($_RDATA['sp_sleep']) if ($_RDATA['sp_sleep'])
OS_crawlLog('Time spent sleeping: '.(round($_RDATA['sp_sleep'] / 10) / 100).'s', 1); OS_crawlLog('Time spent sleeping: '.(round($_RDATA['sp_sleep'] / 10) / 100).'s', 1);
OS_crawlLog('Time taken by cURL: '.(round($_RDATA['sp_time_curl'] * 100) / 100).'s', 1); OS_crawlLog('Time taken by cURL: '.(round($_RDATA['sp_time_curl'] * 100) / 100).'s', 1);
OS_crawlLog($_ODATA['sp_links_crawled'].' page'.(($_ODATA['sp_links_crawled'] == 1) ? '' : 's').' crawled', 1); OS_crawlLog($_ODATA['sp_progress'][0].' page'.(($_ODATA['sp_progress'][0] == 1) ? '' : 's').' crawled', 1);
OS_crawlLog($_ODATA['sp_pages_stored'].' page'.(($_ODATA['sp_pages_stored'] == 1) ? '' : 's').' stored', 1); OS_crawlLog($_ODATA['sp_pages_stored'].' page'.(($_ODATA['sp_pages_stored'] == 1) ? '' : 's').' stored', 1);
if ($_RDATA['sp_status']['New']) if ($_RDATA['sp_status']['New'])
@ -398,7 +396,10 @@ function OS_crawlCleanUp() {
// Else the crawl failed // Else the crawl failed
} else { } else {
OS_setValue('sp_time_last', $_ODATA['sp_time_end'] - $_ODATA['sp_time_start']);
OS_crawlLog('***** Crawl failed; runtime '.$_ODATA['sp_time_last'].'s *****', 1); OS_crawlLog('***** Crawl failed; runtime '.$_ODATA['sp_time_last'].'s *****', 1);
OS_crawlLog('Total data transferred: '.OS_readSize($_RDATA['sp_data_transferred']), 1);
OS_crawlLog('Search table was NOT updated', 1); OS_crawlLog('Search table was NOT updated', 1);
if ($_ODATA['sp_sitemap_file']) if ($_ODATA['sp_sitemap_file'])
@ -469,7 +470,7 @@ switch ($_SERVER['REQUEST_METHOD']) {
if ($_ODATA['sp_crawling']) { if ($_ODATA['sp_crawling']) {
$response = array( $response = array(
'status' => 'Error', 'status' => 'Error',
'message' => 'Crawler is already running; current progress: '.$_ODATA['sp_progress'] 'message' => 'Crawler is already running; current progress: '.$_ODATA['sp_progress'][0].'/'.$_ODATA['sp_progress'][1]
); );
} }
@ -506,15 +507,11 @@ switch ($_SERVER['REQUEST_METHOD']) {
$response = array( $response = array(
'status' => ($_ODATA['sp_crawling']) ? 'Crawling' : 'Complete', 'status' => ($_ODATA['sp_crawling']) ? 'Crawling' : 'Complete',
'progress' => $_ODATA['sp_progress'], 'progress' => $_ODATA['sp_progress'],
'time_crawl' => time() - $_ODATA['sp_time_start'],
'time_end' => $_ODATA['sp_time_end'],
'time_end_success' => $_ODATA['sp_time_end_success'],
'time_last' => $_ODATA['sp_time_last'],
'timeout_crawl' => $_ODATA['sp_timeout_crawl'],
'data_transferred' => $_ODATA['sp_data_transferred'], 'data_transferred' => $_ODATA['sp_data_transferred'],
'data_stored' => $_ODATA['sp_data_stored'], 'time_crawl' => time() - $_ODATA['sp_time_start'],
'links_crawled' => $_ODATA['sp_links_crawled'], 'time_start' => $_ODATA['sp_time_start'],
'pages_stored' => $_ODATA['sp_pages_stored'], 'time_end' => $_ODATA['sp_time_end'],
'timeout_crawl' => $_ODATA['sp_timeout_crawl'],
'tail' => trim(implode("\n", $lines)) 'tail' => trim(implode("\n", $lines))
); );
break; break;
@ -536,10 +533,7 @@ switch ($_SERVER['REQUEST_METHOD']) {
$log = file_get_contents($_ODATA['sp_log']); $log = file_get_contents($_ODATA['sp_log']);
OS_setValue('sp_log', $log."\n".'[ERROR] '.$_POST->reason); OS_setValue('sp_log', $log."\n".'[ERROR] '.$_POST->reason);
} else OS_setValue('sp_log', '[ERROR] '.$_POST->reason); } else OS_setValue('sp_log', '[ERROR] '.$_POST->reason);
OS_setValue('sp_time_end', time()); OS_setValue('sp_time_last', $_ODATA['sp_time_end'] - $_ODATA['sp_time_start']);
OS_setValue('sp_time_last', time() - $_ODATA['sp_time_start']);
OS_setValue('sp_data_transferred', 0);
OS_setValue('sp_data_stored', 0);
// Send failure email to the admin(s) // Send failure email to the admin(s)
if ($_MAIL && count($_MAIL->getAllRecipientAddresses()) && $_ODATA['sp_email_failure']) { if ($_MAIL && count($_MAIL->getAllRecipientAddresses()) && $_ODATA['sp_email_failure']) {
@ -621,10 +615,11 @@ if (function_exists('apache_setenv'))
OS_setValue('sp_crawling', 1); OS_setValue('sp_crawling', 1);
OS_setValue('sp_cancel', 0); OS_setValue('sp_cancel', 0);
OS_setValue('sp_time_start', time()); OS_setValue('sp_time_start', time());
OS_setValue('sp_links_crawled', 0);
OS_setValue('sp_progress', array(0, 0));
OS_setValue('sp_pages_stored', 0); OS_setValue('sp_pages_stored', 0);
OS_setValue('sp_data_stored', 0);
OS_setValue('sp_data_transferred', 0); OS_setValue('sp_data_transferred', 0);
OS_setValue('sp_data_stored', 0);
OS_setValue('sp_time_last', 0); OS_setValue('sp_time_last', 0);
@ -651,7 +646,6 @@ $_RDATA['sp_robots'] = array();
$_RDATA['sp_status'] = array('Orphan' => 0, 'Blocked' => 0, 'Not Found' => 0, 'Updated' => 0, 'New' => 0); $_RDATA['sp_status'] = array('Orphan' => 0, 'Blocked' => 0, 'Not Found' => 0, 'Updated' => 0, 'New' => 0);
$_RDATA['sp_filter'] = array(); $_RDATA['sp_filter'] = array();
$_RDATA['sp_prev_dls'] = 0; $_RDATA['sp_prev_dls'] = 0;
$_RDATA['sp_data_transferred'] = 0;
$_RDATA['sp_time_curl'] = 0; $_RDATA['sp_time_curl'] = 0;
$_RDATA['sp_sleep'] = 0; $_RDATA['sp_sleep'] = 0;
$_RDATA['sp_sha1'] = array(); $_RDATA['sp_sha1'] = array();
@ -888,7 +882,11 @@ while ($_cURL && count($_RDATA['sp_queue'])) {
OS_crawlLog('Memory used: '.OS_readSize(memory_get_usage(true)), 1); OS_crawlLog('Memory used: '.OS_readSize(memory_get_usage(true)), 1);
OS_crawlLog('Crawling: '.$url.' (Depth: '.$depth.')', 1); OS_crawlLog('Crawling: '.$url.' (Depth: '.$depth.')', 1);
OS_setValue('sp_progress', count($_RDATA['sp_links']).'/'.(count($_RDATA['sp_links']) + count($_RDATA['sp_queue']))); OS_setValue('sp_progress', array(
count($_RDATA['sp_links']),
count($_RDATA['sp_links']) + count($_RDATA['sp_queue'])
));
OS_setValue('sp_time_end', time());
// Set the correct If-Modified-Since request header // Set the correct If-Modified-Since request header
if ($_ODATA['sp_ifmodifiedsince'] && isset($_RDATA['sp_lastmod'][$url])) { if ($_ODATA['sp_ifmodifiedsince'] && isset($_RDATA['sp_lastmod'][$url])) {
@ -900,7 +898,7 @@ while ($_cURL && count($_RDATA['sp_queue'])) {
$data = OS_fetchURL($url, $referer); $data = OS_fetchURL($url, $referer);
// Record cURL timing and data info for this fetch // Record cURL timing and data info for this fetch
$_RDATA['sp_data_transferred'] += $data['info']['size_download']; OS_setValue('sp_data_transferred', $_ODATA['sp_data_transferred'] + $data['info']['size_download']);
$_RDATA['sp_time_curl'] += $data['info']['total_time']; $_RDATA['sp_time_curl'] += $data['info']['total_time'];

View file

@ -136,12 +136,14 @@ for (let x = 0; x < countUpTimers.length; x++) {
} else this.spans[2].classList.remove('d-none'); } else this.spans[2].classList.remove('d-none');
let secPlural = (parseInt(this.spans[3].tVar.firstChild.nodeValue) == 1) ? 0 : 1; let secPlural = (parseInt(this.spans[3].tVar.firstChild.nodeValue) == 1) ? 0 : 1;
this.spans[3].tVar.nextSibling.nodeValue = ' ' + countUpPeriods[3][secPlural] + ' ago'; this.spans[3].tVar.nextSibling.nodeValue = ' ' + countUpPeriods[3][secPlural];
}; };
setInterval(function() { if (countUpTimers[x].classList.contains('active')) {
countUpTimers[x].incrementTime(); countUpTimers[x].interval = setInterval(function() {
}, 1000); countUpTimers[x].incrementTime();
}, 1000);
}
} }
@ -420,76 +422,38 @@ let os_get_crawl_progress = function(getLog) {
os_crawl_start.innerHTML = 'Crawling...'; os_crawl_start.innerHTML = 'Crawling...';
os_crawl_navbar.innerHTML = 'Crawling...'; os_crawl_navbar.innerHTML = 'Crawling...';
let os_countup_time_end = document.getElementById('os_countup_time_end');
if (os_countup_time_end) {
clearInterval(os_countup_time_end.interval);
os_countup_time_end.parentNode.innerHTML = '<em>Currently crawling...</em>';
}
let os_countup_time_crawl = document.getElementById('os_countup_time_crawl');
if (os_countup_time_crawl) {
os_countup_time_crawl.classList.add('active');
os_countup_time_crawl.setAttribute('data-start', data.time_start);
os_countup_time_crawl.interval = setInterval(function() {
os_countup_time_crawl.incrementTime();
}, 1000);
}
// Start an interval progress check // Start an interval progress check
clearInterval(os_crawl_interval); clearInterval(os_crawl_interval);
os_crawl_interval = setInterval(os_get_crawl_progress, 1000); os_crawl_interval = setInterval(os_get_crawl_progress, 1000);
// Else check if the given time_end is later than the time this // Else check if the given time_end is later than the time this
// page was loaded; if so, a crawl has finished after this page // page was loaded; if so, a crawl has finished after this page
// was loaded; if we are on the Crawler Management page, update // was loaded; if so, reload the page
// all the info there
} else if (os_crawl_loaded < data.time_end) { } else if (os_crawl_loaded < data.time_end) {
os_crawl_loaded = parseInt((new Date()).getTime() / 1000);
os_crawl_start.disabled = ''; // Check if the crawler modal window is open
os_crawl_start.innerHTML = 'Start Crawl'; if (crawlerModal && crawlerModal.classList.contains('show')) {
let os_countup_time_end = document.getElementById('os_countup_time_end');
if (os_countup_time_end) {
os_countup_time_end.setAttribute('data-start', data.time_end);
// Try to locate the warning <p> element
let pDanger = os_countup_time_end.parentNode.parentNode.querySelector('p.data-text.text-danger');
// If the time_end does not match the time_end_success, then
// the last crawl did not succeed; show the error message
if (data.time_end != data.time_end_success) {
if (!pDanger) {
let pDanger = document.createElement('p');
pDanger.classList.add('data-text', 'text-danger');
let strong = document.createElement('strong');
strong.appendChild(document.createTextNode('Warning:'));
pDanger.appendChild(strong);
pDanger.appendChild(document.createTextNode(' The previous crawl did not complete successfully. Please check the crawl log for more details.'));
os_countup_time_end.parentNode.parentNode.appendChild(pDanger);
}
// Else if it matches, it was successful, remove any warning
} else if (pDanger) pDanger.parentNode.removeChild(pDanger);
// Update the Crawl information items
let os_crawl_time_last = document.getElementById('os_crawl_time_last');
os_crawl_time_last.innerHTML = data.time_last + ' <abbr title="seconds">s</abbr>';
let os_crawl_data_transferred = document.getElementById('os_crawl_data_transferred');
os_crawl_data_transferred.innerHTML = os_readSize(data.data_transferred, true);
let os_crawl_data_stored = document.getElementById('os_crawl_data_stored');
let text = '';
if (data.data_transferred) {
text += '<small data-bs-toggle="tooltip" data-bs-placement="bottom" title="Efficiency percentage of data stored vs. data downloaded">';
text += '(' + (Math.round(data.data_stored * 1000 / data.data_transferred) / 10) + '%)';
text += '</small> ';
}
os_crawl_data_stored.innerHTML = text + os_readSize(data.data_stored, true);
let os_crawl_links_crawled = document.getElementById('os_crawl_links_crawled');
os_crawl_links_crawled.innerHTML = data.links_crawled;
let os_crawl_pages_stored = document.getElementById('os_crawl_pages_stored');
text = '';
if (data.links_crawled) {
text += '<small data-bs-toggle="tooltip" data-bs-placement="bottom" title="Efficiency percentage of pages stored vs. links crawled">';
text += '(' + (Math.round(data.pages_stored * 1000 / data.links_crawled) / 10) + '%)';
text += '</small> ';
}
os_crawl_pages_stored.innerHTML = text + data.pages_stored;
// If we are not on the Crawler Management page, let the user
// know there is new data, and ask to reload the page
} else if (window.confirm('A crawl has finished. Reload the page to view new data?'))
window.location.reload();
// Don't refresh the page until the user closes the modal
crawlerModal.addEventListener('hide.bs.modal', function() {
window.location.reload();
}, false);
} else window.location.reload();
} }
} }
@ -509,12 +473,23 @@ let os_get_crawl_progress = function(getLog) {
os_crawl_log.value = data.tail; os_crawl_log.value = data.tail;
if (os_crawl_interval) { if (os_crawl_interval) {
data.progress = data.progress.split('/');
os_crawl_progress.value = data.progress[0]; os_crawl_progress.value = data.progress[0];
os_crawl_progress.max = data.progress[1]; os_crawl_progress.max = data.progress[1];
os_crawl_progress.setAttribute('data-progress', data.progress[0] + ' / ' + data.progress[1]); os_crawl_progress.setAttribute('data-progress', data.progress[0] + ' / ' + data.progress[1]);
os_crawl_progress.innerHTML = Math.ceil(data.progress[0] / data.progress[1]) + '%'; os_crawl_progress.innerHTML = Math.ceil(data.progress[0] / data.progress[1]) + '%';
os_crawl_log.scrollTop = os_crawl_log.scrollHeight; os_crawl_log.scrollTop = os_crawl_log.scrollHeight;
if (os_crawl_data_transferred)
os_crawl_data_transferred.innerHTML = os_readSize(data.data_transferred, true);
if (os_crawl_data_stored)
os_crawl_data_stored.innerHTML = 0;
if (os_crawl_links_crawled)
os_crawl_links_crawled.innerHTML = data.progress[0] + ' / ' + data.progress[1];
if (os_crawl_pages_stored)
os_crawl_pages_stored.innerHTML = 0;
} }
if (!os_crawl_start.complete && data.status == 'Complete') { if (!os_crawl_start.complete && data.status == 'Complete') {
@ -542,6 +517,11 @@ let os_crawl_progress = document.getElementById('os_crawl_progress');
let os_crawl_log = document.getElementById('os_crawl_log'); let os_crawl_log = document.getElementById('os_crawl_log');
let os_crawl_log_download = document.getElementById('os_crawl_log_download'); let os_crawl_log_download = document.getElementById('os_crawl_log_download');
let os_crawl_data_transferred = document.getElementById('os_crawl_data_transferred');
let os_crawl_data_stored = document.getElementById('os_crawl_data_stored');
let os_crawl_links_crawled = document.getElementById('os_crawl_links_crawled');
let os_crawl_pages_stored = document.getElementById('os_crawl_pages_stored');
os_crawl_cancel.force = false; os_crawl_cancel.force = false;
os_crawl_cancel.reason = ''; os_crawl_cancel.reason = '';
os_crawl_start.allow_grep = false; os_crawl_start.allow_grep = false;
@ -587,6 +567,21 @@ os_crawl_start.addEventListener('click', function(e) {
os_crawl_start.innerHTML = 'Crawling...'; os_crawl_start.innerHTML = 'Crawling...';
os_crawl_navbar.innerHTML = 'Crawling...'; os_crawl_navbar.innerHTML = 'Crawling...';
let os_countup_time_end = document.getElementById('os_countup_time_end');
if (os_countup_time_end) {
clearInterval(os_countup_time_end.interval);
os_countup_time_end.parentNode.innerHTML = '<em>Currently crawling...</em>';
}
let os_countup_time_crawl = document.getElementById('os_countup_time_crawl');
if (os_countup_time_crawl) {
os_countup_time_crawl.classList.add('active');
os_countup_time_crawl.setAttribute('data-start', parseInt((new Date()).getTime() / 1000));
os_countup_time_crawl.interval = setInterval(function() {
os_countup_time_crawl.incrementTime();
}, 1000);
}
fetch(new Request('./crawler.php'), { fetch(new Request('./crawler.php'), {
method: 'POST', method: 'POST',
headers: { 'Content-type': 'application/json' }, headers: { 'Content-type': 'application/json' },

View file

@ -229,7 +229,7 @@ if ($_RDATA['s_searchable_pages']) {
// Try to json_decode the cache data // Try to json_decode the cache data
// If this step fails, assume there is no cache data // If this step fails, assume there is no cache data
$checkJS = json_decode($_SDATA['cache']['data'], true); $checkJS = json_decode($_SDATA['cache']['data'], true);
$_SDATA['cache']['data'] = ($checkJS) ? $checkJS : ''; $_SDATA['cache']['data'] = (!is_null($checkJS)) ? $checkJS : '';
} }
// Database error accessing the query log // Database error accessing the query log
@ -603,8 +603,8 @@ if ($_RDATA['s_searchable_pages']) {
$_ORCINUS->searchable->searched->results->result_list = array(); $_ORCINUS->searchable->searched->results->result_list = array();
// Prepare PCRE for removing base domains // Prepare PCRE for removing base domains
if (count($_RDATA['sp_domains']) == 1) if (count($_ODATA['sp_domains']) == 1)
$repStr = '/^'.preg_quote(key($_RDATA['sp_domains']), '/').'/'; $repStr = '/^'.preg_quote(key($_ODATA['sp_domains']), '/').'/';
// Do a last once-over of the results // Do a last once-over of the results
foreach ($resultsPage as $key => $result) { foreach ($resultsPage as $key => $result) {
@ -633,7 +633,7 @@ if ($_RDATA['s_searchable_pages']) {
$_RESULT->relevance = number_format($result['relevance'], 2, '.', ''); $_RESULT->relevance = number_format($result['relevance'], 2, '.', '');
// Remove base domain from URL if they are all the same // Remove base domain from URL if they are all the same
if (count($_RDATA['sp_domains']) == 1) if (count($_ODATA['sp_domains']) == 1)
$result['url'] = preg_replace($repStr, '', $result['url']); $result['url'] = preg_replace($repStr, '', $result['url']);
// Highlight the terms in the title, url and matchtext // Highlight the terms in the title, url and matchtext