Try to resume a failed crawl.
Attempt to resume a crawl if it exited without going through the shutdown function
This commit is contained in:
parent
c7c4960e1e
commit
89f6fc2393
|
@ -759,14 +759,69 @@ if ($err[0] == '00000') {
|
||||||
} else OS_crawlLog('Error getting list of previous URLs from crawldata table', 2);
|
} else OS_crawlLog('Error getting list of previous URLs from crawldata table', 2);
|
||||||
|
|
||||||
|
|
||||||
// Drop previous temp table if it exists
|
// If the crawltemp table exists here, that means a crawl was
|
||||||
$drop = $_DDATA['pdo']->query(
|
// interrupted without going through the shutdown function.
|
||||||
'DROP TABLE IF EXISTS `'.$_DDATA['tbprefix'].'crawltemp`;'
|
// Use the data from this partially completed crawl to resume it.
|
||||||
);
|
if (in_array($_DDATA['tbprefix'].'crawltemp', $_DDATA['tables'], true)) {
|
||||||
|
$select = $_DDATA['pdo']->query(
|
||||||
|
'SELECT `url`, `links` FROM `'.$_DDATA['tbprefix'].'crawltemp`;'
|
||||||
|
);
|
||||||
|
$err = $select->errorInfo();
|
||||||
|
if ($err[0] == '00000') {
|
||||||
|
OS_crawlLog('Previous crawl data exists; using it to resume crawling...', 1);
|
||||||
|
|
||||||
|
$select = $select->fetchAll();
|
||||||
|
|
||||||
|
// Run through every entry in the crawltemp table
|
||||||
|
foreach ($select as $row) {
|
||||||
|
|
||||||
|
// In an entry matches an existing URL in the queue (just
|
||||||
|
// starting URLs right now) then delete that queue entry
|
||||||
|
foreach ($_RDATA['sp_queue'] as $key => $queue)
|
||||||
|
if ($row['url'] == $queue[0])
|
||||||
|
unset($_RDATA['sp_queue'][$key]);
|
||||||
|
|
||||||
|
// Add it to the 'stored' and 'crawled links' list
|
||||||
|
$_RDATA['sp_store'][] = $row['url'];
|
||||||
|
$_RDATA['sp_links'][] = $row['url'];
|
||||||
|
OS_crawlLog('Crawl data found for: '.$row['url'], 0);
|
||||||
|
|
||||||
|
// Add links from the entry to the queue
|
||||||
|
$row['links'] = json_decode($row['links'], true);
|
||||||
|
foreach ($row['links'] as $link) {
|
||||||
|
|
||||||
|
$link = OS_formatURL($link, $url);
|
||||||
|
|
||||||
|
// ***** If this link hasn't been crawled yet
|
||||||
|
if (!in_array($link, $_RDATA['sp_links'], true)) {
|
||||||
|
|
||||||
|
// ... and if link hasn't been queued yet
|
||||||
|
foreach ($_RDATA['sp_queue'] as $queue)
|
||||||
|
if ($link == $queue[0]) continue 2;
|
||||||
|
|
||||||
|
// ... and if link passes our user filters
|
||||||
|
if ($nx = OS_filterURL($link, $url)) {
|
||||||
|
OS_crawlLog('Link ignored due to noindex rule \''.$nx.'\': '.$link, 0);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
// ... then add the link to the queue
|
||||||
|
$_RDATA['sp_queue'][] = array($link, 0, $url);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// We couldn't select any data from the crawltemp table so delete it
|
||||||
|
} else {
|
||||||
|
$drop = $_DDATA['pdo']->query(
|
||||||
|
'DROP TABLE IF EXISTS `'.$_DDATA['tbprefix'].'crawltemp`;'
|
||||||
|
);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// Create a temp MySQL storage table using schema of the existing table
|
// Create a temp MySQL storage table using schema of the existing table
|
||||||
$create = $_DDATA['pdo']->query(
|
$create = $_DDATA['pdo']->query(
|
||||||
'CREATE TABLE `'.$_DDATA['tbprefix'].'crawltemp`
|
'CREATE TABLE IF NOT EXISTS`'.$_DDATA['tbprefix'].'crawltemp`
|
||||||
LIKE `'.$_DDATA['tbprefix'].'crawldata`;'
|
LIKE `'.$_DDATA['tbprefix'].'crawldata`;'
|
||||||
);
|
);
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue