Remove some unnecessary continues

Also add documentation for the crawler debug mode. Fix some variable scoping issues in the JS output; this area still needs more work.
2023-06-02 14:05:52 -04:00 · 2023-06-02 14:05:52 -04:00 · 8b024c438c
parent 56c84a89cb
commit 8b024c438c
2 changed files with 33 additions and 26 deletions
--- a/orcinus/admin.php
+++ b/orcinus/admin.php
@ -906,12 +906,13 @@ if (!$_SESSION['admin_username']) {
                $select[$key]['words'] = array_unique(explode(' ', $row['content']));

                foreach ($select[$key]['words'] as $index => $word) {
-                  if (!$word) continue;
+                  if ($word) {
                    if (empty($words[$word])) {
                      $words[$word] = 1;
                    } else $words[$word]++;
                  }
                }
+              }

              // Use the word frequency report to create a filter of
              // words that are more common than the compression
@ -1187,7 +1188,7 @@ if (os_crawldata.length) {

      // Prepare PCRE match text for each phrase and term
      let filetypes = [];
-      for (let x = 0, term; x < os_sdata.terms.length; x++) {
+      for (let x = 0; x < os_sdata.terms.length; x++) {

        // Normalize punctuation
        Object.keys(os_rdata.sp_smart).forEach(key => {
@ -1196,9 +1197,9 @@ if (os_crawldata.length) {

        switch (os_sdata.terms[x][0]) {
          case 'filetype':
-            if (os_rdata.s_filetypes[term.toUpperCase()])
-              for (let z = 0; z < os_rdata.s_filetypes[term.toUpperCase()].length; z++)
-                filetypes.push(os_rdata.s_filetypes[term.toUpperCase()][z]);
+            if (os_rdata.s_filetypes[os_sdata.terms[x][1].toUpperCase()])
+              for (let z = 0; z < os_rdata.s_filetypes[os_sdata.terms[x][1].toUpperCase()].length; z++)
+                filetypes.push(os_rdata.s_filetypes[os_sdata.terms[x][1].toUpperCase()][z]);
            break;

          case 'exclude':
@ -1227,7 +1228,8 @@ if (os_crawldata.length) {
      // ***** There is never any cache, so do an actual search
      for (let y = os_crawldata.length - 1; y >= 0; y--) {
        if (filetypes.length) {
-          for (let x = 0, allowMime = false; x < filetypes.length; x++)
+          let allowMime = false;
+          for (let x = 0; x < filetypes.length; x++)
            if (os_crawldata[y].content_mime == filetypes[x]) allowMime = true;
          if (!allowMime) {
            os_crawldata.splice(y, 1);
--- a/orcinus/crawler.php
+++ b/orcinus/crawler.php
@ -2,6 +2,15 @@


 require __DIR__.'/config.php';
+
+// Setting the $_RDATA['debug'] value to true will allow you to start
+// the crawler just by visiting this file's URL using your web browser.
+// It will output the log lines as well as any PHP errors that may
+// occur. It will also report how much memory the script is using. Use
+// this mode if your crawls are failing but the logs alone aren't
+// enough to tell you why. DO NOT leave the crawler in debug mode in a
+// production environment, or anyone can just run your crawler whenever
+// they want!
 $_RDATA['debug'] = false;


@ -800,10 +809,9 @@ if (in_array($_DDATA['tbprefix'].'crawltemp', $_DDATA['tables'], true)) {
          foreach ($_RDATA['sp_queue'] as $queue)
            if ($link == $queue[0]) continue 2;

-          // ... and if link passes our user filters
-          if ($nx = OS_filterURL($link, $row['url'])) continue;
-
-          // ... then add the link to the queue
+          // ... and if link passes our user filters, add the link to
+          // the queue
+          if (!OS_filterURL($link, $row['url']))
            $_RDATA['sp_queue'][] = array($link, 0, $row['url']);
        }
      }
@ -1662,11 +1670,9 @@ while ($_cURL && count($_RDATA['sp_queue'])) {
        // ... and if link passes our user filters
        if ($nx = OS_filterURL($link, $data['base'])) {
          OS_crawlLog('Link ignored due to noindex rule \''.$nx.'\': '.$link, 0);
-          continue;
-        }

        // ... then add the link to the queue
-        $_RDATA['sp_queue'][] = array($link, $depth + 1, $url);
+        } else $_RDATA['sp_queue'][] = array($link, $depth + 1, $url);
      }
    }
  }
@ -1683,16 +1689,16 @@ while ($_cURL && count($_RDATA['sp_queue'])) {

      foreach ($_RDATA['sp_exist'] as $key => $link) {

-        // If orphan URL passes our user filters
+        // Check if orphan URL passes our user filters
        if ($nx = OS_filterURL($link, $data['base'])) {
+
+          // If not, remove it from the sp_exist list
          OS_crawlLog('Orphan URL ignored due to noindex rule \''.$nx.'\': '.$link, 0);
          $_RDATA['sp_status']['Blocked']++;
          unset($_RDATA['sp_exist'][$key]);
-          continue;
-        }

-        // ... then add the orphan to the queue
-        $_RDATA['sp_queue'][] = array($link, 0, '<orphan>');
+        // If so, then add the orphan to the queue
+        } else $_RDATA['sp_queue'][] = array($link, 0, '<orphan>');
      }

    // Else if we stored some pages, we're done
@ -1717,10 +1723,9 @@ if ($_RDATA['sp_complete'] && $_ODATA['sp_sitemap_file']) {
      $sm[] = '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">';
      foreach ($_RDATA['sp_sitemap'] as $sitemap) {
        $sm[] = '  <url>';
-        foreach ($sitemap as $key => $value) {
-          if ($key == 'priority' && $value == 0.5) continue;
+        foreach ($sitemap as $key => $value)
+          if ($key != 'priority' || $value != 0.5)
            $sm[] = '    <'.$key.'>'.$value.'</'.$key.'>';
-        }
        $sm[] = '  </url>';
      }
      $sm[] = '</urlset>';