Remove some unnecessary `continue` statements
Also add documentation for the crawler debug mode. Fix variable scoping in the JS output; this still needs more work.
This commit is contained in:
parent
56c84a89cb
commit
8b024c438c
|
@ -906,12 +906,13 @@ if (!$_SESSION['admin_username']) {
|
|||
$select[$key]['words'] = array_unique(explode(' ', $row['content']));
|
||||
|
||||
foreach ($select[$key]['words'] as $index => $word) {
|
||||
if (!$word) continue;
|
||||
if ($word) {
|
||||
if (empty($words[$word])) {
|
||||
$words[$word] = 1;
|
||||
} else $words[$word]++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Use the word frequency report to create a filter of
|
||||
// words that are more common than the compression
|
||||
|
@ -1187,7 +1188,7 @@ if (os_crawldata.length) {
|
|||
|
||||
// Prepare PCRE match text for each phrase and term
|
||||
let filetypes = [];
|
||||
for (let x = 0, term; x < os_sdata.terms.length; x++) {
|
||||
for (let x = 0; x < os_sdata.terms.length; x++) {
|
||||
|
||||
// Normalize punctuation
|
||||
Object.keys(os_rdata.sp_smart).forEach(key => {
|
||||
|
@ -1196,9 +1197,9 @@ if (os_crawldata.length) {
|
|||
|
||||
switch (os_sdata.terms[x][0]) {
|
||||
case 'filetype':
|
||||
if (os_rdata.s_filetypes[term.toUpperCase()])
|
||||
for (let z = 0; z < os_rdata.s_filetypes[term.toUpperCase()].length; z++)
|
||||
filetypes.push(os_rdata.s_filetypes[term.toUpperCase()][z]);
|
||||
if (os_rdata.s_filetypes[os_sdata.terms[x][1].toUpperCase()])
|
||||
for (let z = 0; z < os_rdata.s_filetypes[os_sdata.terms[x][1].toUpperCase()].length; z++)
|
||||
filetypes.push(os_rdata.s_filetypes[os_sdata.terms[x][1].toUpperCase()][z]);
|
||||
break;
|
||||
|
||||
case 'exclude':
|
||||
|
@ -1227,7 +1228,8 @@ if (os_crawldata.length) {
|
|||
// ***** There is never any cache, so do an actual search
|
||||
for (let y = os_crawldata.length - 1; y >= 0; y--) {
|
||||
if (filetypes.length) {
|
||||
for (let x = 0, allowMime = false; x < filetypes.length; x++)
|
||||
let allowMime = false;
|
||||
for (let x = 0; x < filetypes.length; x++)
|
||||
if (os_crawldata[y].content_mime == filetypes[x]) allowMime = true;
|
||||
if (!allowMime) {
|
||||
os_crawldata.splice(y, 1);
|
||||
|
|
|
@ -2,6 +2,15 @@
|
|||
|
||||
|
||||
require __DIR__.'/config.php';
|
||||
|
||||
// Setting the $_RDATA['debug'] value to true will allow you to start
|
||||
// the crawler just by visiting this file's URL using your web browser.
|
||||
// It will output the log lines as well as any PHP errors that may
|
||||
// occur. It will also report how much memory the script is using. Use
|
||||
// this mode if your crawls are failing but the logs alone aren't
|
||||
// enough to tell you why. DO NOT leave the crawler in debug mode in a
|
||||
// production environment, or anyone can just run your crawler whenever
|
||||
// they want!
|
||||
$_RDATA['debug'] = false;
|
||||
|
||||
|
||||
|
@ -800,10 +809,9 @@ if (in_array($_DDATA['tbprefix'].'crawltemp', $_DDATA['tables'], true)) {
|
|||
foreach ($_RDATA['sp_queue'] as $queue)
|
||||
if ($link == $queue[0]) continue 2;
|
||||
|
||||
// ... and if link passes our user filters
|
||||
if ($nx = OS_filterURL($link, $row['url'])) continue;
|
||||
|
||||
// ... then add the link to the queue
|
||||
// ... and if link passes our user filters, add the link to
|
||||
// the queue
|
||||
if (!OS_filterURL($link, $row['url']))
|
||||
$_RDATA['sp_queue'][] = array($link, 0, $row['url']);
|
||||
}
|
||||
}
|
||||
|
@ -1662,11 +1670,9 @@ while ($_cURL && count($_RDATA['sp_queue'])) {
|
|||
// ... and if link passes our user filters
|
||||
if ($nx = OS_filterURL($link, $data['base'])) {
|
||||
OS_crawlLog('Link ignored due to noindex rule \''.$nx.'\': '.$link, 0);
|
||||
continue;
|
||||
}
|
||||
|
||||
// ... then add the link to the queue
|
||||
$_RDATA['sp_queue'][] = array($link, $depth + 1, $url);
|
||||
} else $_RDATA['sp_queue'][] = array($link, $depth + 1, $url);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -1683,16 +1689,16 @@ while ($_cURL && count($_RDATA['sp_queue'])) {
|
|||
|
||||
foreach ($_RDATA['sp_exist'] as $key => $link) {
|
||||
|
||||
// If orphan URL passes our user filters
|
||||
// Check if orphan URL passes our user filters
|
||||
if ($nx = OS_filterURL($link, $data['base'])) {
|
||||
|
||||
// If not, remove it from the sp_exist list
|
||||
OS_crawlLog('Orphan URL ignored due to noindex rule \''.$nx.'\': '.$link, 0);
|
||||
$_RDATA['sp_status']['Blocked']++;
|
||||
unset($_RDATA['sp_exist'][$key]);
|
||||
continue;
|
||||
}
|
||||
|
||||
// ... then add the orphan to the queue
|
||||
$_RDATA['sp_queue'][] = array($link, 0, '<orphan>');
|
||||
// If so, then add the orphan to the queue
|
||||
} else $_RDATA['sp_queue'][] = array($link, 0, '<orphan>');
|
||||
}
|
||||
|
||||
// Else if we stored some pages, we're done
|
||||
|
@ -1717,10 +1723,9 @@ if ($_RDATA['sp_complete'] && $_ODATA['sp_sitemap_file']) {
|
|||
$sm[] = '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">';
|
||||
foreach ($_RDATA['sp_sitemap'] as $sitemap) {
|
||||
$sm[] = ' <url>';
|
||||
foreach ($sitemap as $key => $value) {
|
||||
if ($key == 'priority' && $value == 0.5) continue;
|
||||
foreach ($sitemap as $key => $value)
|
||||
if ($key != 'priority' || $value != 0.5)
|
||||
$sm[] = ' <'.$key.'>'.$value.'</'.$key.'>';
|
||||
}
|
||||
$sm[] = ' </url>';
|
||||
}
|
||||
$sm[] = '</urlset>';
|
||||
|
|
Loading…
Reference in a new issue