From 89f6fc239327afaa379be807e585da0f89019253 Mon Sep 17 00:00:00 2001 From: Brian Huisman Date: Tue, 30 May 2023 15:53:41 -0400 Subject: [PATCH] Try to resume a failed crawl. Attempt to resume a crawl if it exited without going through the shutdown function --- orcinus/crawler.php | 65 +++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 60 insertions(+), 5 deletions(-) diff --git a/orcinus/crawler.php b/orcinus/crawler.php index c34e120..3850751 100644 --- a/orcinus/crawler.php +++ b/orcinus/crawler.php @@ -759,14 +759,69 @@ if ($err[0] == '00000') { } else OS_crawlLog('Error getting list of previous URLs from crawldata table', 2); -// Drop previous temp table if it exists -$drop = $_DDATA['pdo']->query( - 'DROP TABLE IF EXISTS `'.$_DDATA['tbprefix'].'crawltemp`;' -); +// If the crawltemp table exists here, that means a crawl was +// interrupted without going through the shutdown function. +// Use the data from this partially completed crawl to resume it. +if (in_array($_DDATA['tbprefix'].'crawltemp', $_DDATA['tables'], true)) { + $select = $_DDATA['pdo']->query( + 'SELECT `url`, `links` FROM `'.$_DDATA['tbprefix'].'crawltemp`;' + ); + $err = $select->errorInfo(); + if ($err[0] == '00000') { + OS_crawlLog('Previous crawl data exists; using it to resume crawling...', 1); + + $select = $select->fetchAll(); + + // Run through every entry in the crawltemp table + foreach ($select as $row) { + + // If an entry matches an existing URL in the queue (just + // starting URLs right now) then delete that queue entry + foreach ($_RDATA['sp_queue'] as $key => $queue) + if ($row['url'] == $queue[0]) + unset($_RDATA['sp_queue'][$key]); + + // Add it to the 'stored' and 'crawled links' list + $_RDATA['sp_store'][] = $row['url']; + $_RDATA['sp_links'][] = $row['url']; + OS_crawlLog('Crawl data found for: '.$row['url'], 0); + + // Add links from the entry to the queue + $row['links'] = json_decode($row['links'], true); + foreach ($row['links'] as $link) { + + $link = 
OS_formatURL($link, $row['url']); + + // ***** If this link hasn't been crawled yet + if (!in_array($link, $_RDATA['sp_links'], true)) { + + // ... and if link hasn't been queued yet + foreach ($_RDATA['sp_queue'] as $queue) + if ($link == $queue[0]) continue 2; + + // ... and if link passes our user filters + if ($nx = OS_filterURL($link, $row['url'])) { + OS_crawlLog('Link ignored due to noindex rule \''.$nx.'\': '.$link, 0); + continue; + } + + // ... then add the link to the queue + $_RDATA['sp_queue'][] = array($link, 0, $row['url']); + } + } + } + + // We couldn't select any data from the crawltemp table so delete it + } else { + $drop = $_DDATA['pdo']->query( + 'DROP TABLE IF EXISTS `'.$_DDATA['tbprefix'].'crawltemp`;' + ); + } +} // Create a temp MySQL storage table using schema of the existing table $create = $_DDATA['pdo']->query( - 'CREATE TABLE `'.$_DDATA['tbprefix'].'crawltemp` + 'CREATE TABLE IF NOT EXISTS `'.$_DDATA['tbprefix'].'crawltemp` LIKE `'.$_DDATA['tbprefix'].'crawldata`;' );