Remove some unnecessary `continue` statements

Also add documentation for the crawler debug mode.
Scope fixes for JS output; still need to work on this.
This commit is contained in:
Brian Huisman 2023-06-02 14:05:52 -04:00
parent 56c84a89cb
commit 8b024c438c
2 changed files with 33 additions and 26 deletions

View file

@ -906,10 +906,11 @@ if (!$_SESSION['admin_username']) {
$select[$key]['words'] = array_unique(explode(' ', $row['content']));
foreach ($select[$key]['words'] as $index => $word) {
if (!$word) continue;
if (empty($words[$word])) {
$words[$word] = 1;
} else $words[$word]++;
if ($word) {
if (empty($words[$word])) {
$words[$word] = 1;
} else $words[$word]++;
}
}
}
@ -1187,7 +1188,7 @@ if (os_crawldata.length) {
// Prepare PCRE match text for each phrase and term
let filetypes = [];
for (let x = 0, term; x < os_sdata.terms.length; x++) {
for (let x = 0; x < os_sdata.terms.length; x++) {
// Normalize punctuation
Object.keys(os_rdata.sp_smart).forEach(key => {
@ -1196,9 +1197,9 @@ if (os_crawldata.length) {
switch (os_sdata.terms[x][0]) {
case 'filetype':
if (os_rdata.s_filetypes[term.toUpperCase()])
for (let z = 0; z < os_rdata.s_filetypes[term.toUpperCase()].length; z++)
filetypes.push(os_rdata.s_filetypes[term.toUpperCase()][z]);
if (os_rdata.s_filetypes[os_sdata.terms[x][1].toUpperCase()])
for (let z = 0; z < os_rdata.s_filetypes[os_sdata.terms[x][1].toUpperCase()].length; z++)
filetypes.push(os_rdata.s_filetypes[os_sdata.terms[x][1].toUpperCase()][z]);
break;
case 'exclude':
@ -1227,7 +1228,8 @@ if (os_crawldata.length) {
// ***** There is never any cache, so do an actual search
for (let y = os_crawldata.length - 1; y >= 0; y--) {
if (filetypes.length) {
for (let x = 0, allowMime = false; x < filetypes.length; x++)
let allowMime = false;
for (let x = 0; x < filetypes.length; x++)
if (os_crawldata[y].content_mime == filetypes[x]) allowMime = true;
if (!allowMime) {
os_crawldata.splice(y, 1);

View file

@ -2,6 +2,15 @@
require __DIR__.'/config.php';
// Setting the $_RDATA['debug'] value to true will allow you to start
// the crawler just by visiting this file's URL using your web browser.
// It will output the log lines as well as any PHP errors that may
// occur. It will also report how much memory the script is using. Use
// this mode if your crawls are failing but the logs alone aren't
// enough to tell you why. DO NOT leave the crawler in debug mode in a
// production environment, or anyone can just run your crawler whenever
// they want!
$_RDATA['debug'] = false;
@ -800,11 +809,10 @@ if (in_array($_DDATA['tbprefix'].'crawltemp', $_DDATA['tables'], true)) {
foreach ($_RDATA['sp_queue'] as $queue)
if ($link == $queue[0]) continue 2;
// ... and if link passes our user filters
if ($nx = OS_filterURL($link, $row['url'])) continue;
// ... then add the link to the queue
$_RDATA['sp_queue'][] = array($link, 0, $row['url']);
// ... and if link passes our user filters, add the link to
// the queue
if (!OS_filterURL($link, $row['url']))
$_RDATA['sp_queue'][] = array($link, 0, $row['url']);
}
}
}
@ -1662,11 +1670,9 @@ while ($_cURL && count($_RDATA['sp_queue'])) {
// ... and if link passes our user filters
if ($nx = OS_filterURL($link, $data['base'])) {
OS_crawlLog('Link ignored due to noindex rule \''.$nx.'\': '.$link, 0);
continue;
}
// ... then add the link to the queue
$_RDATA['sp_queue'][] = array($link, $depth + 1, $url);
} else $_RDATA['sp_queue'][] = array($link, $depth + 1, $url);
}
}
}
@ -1683,16 +1689,16 @@ while ($_cURL && count($_RDATA['sp_queue'])) {
foreach ($_RDATA['sp_exist'] as $key => $link) {
// If orphan URL passes our user filters
// Check if orphan URL passes our user filters
if ($nx = OS_filterURL($link, $data['base'])) {
// If not, remove it from the sp_exist list
OS_crawlLog('Orphan URL ignored due to noindex rule \''.$nx.'\': '.$link, 0);
$_RDATA['sp_status']['Blocked']++;
unset($_RDATA['sp_exist'][$key]);
continue;
}
// ... then add the orphan to the queue
$_RDATA['sp_queue'][] = array($link, 0, '<orphan>');
// If so, then add the orphan to the queue
} else $_RDATA['sp_queue'][] = array($link, 0, '<orphan>');
}
// Else if we stored some pages, we're done
@ -1717,10 +1723,9 @@ if ($_RDATA['sp_complete'] && $_ODATA['sp_sitemap_file']) {
$sm[] = '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">';
foreach ($_RDATA['sp_sitemap'] as $sitemap) {
$sm[] = ' <url>';
foreach ($sitemap as $key => $value) {
if ($key == 'priority' && $value == 0.5) continue;
$sm[] = ' <'.$key.'>'.$value.'</'.$key.'>';
}
foreach ($sitemap as $key => $value)
if ($key != 'priority' || $value != 0.5)
$sm[] = ' <'.$key.'>'.$value.'</'.$key.'>';
$sm[] = ' </url>';
}
$sm[] = '</urlset>';