orcinus-search/orcinus/crawler.php
<?php /* ***** Orcinus Site Search - Web Crawling Engine *********** */
require __DIR__.'/config.php';
// Setting the $_RDATA['debug'] value to true will allow you to start
// the crawler just by visiting this file's URL using your web browser.
// It will output the log lines as well as any PHP errors that may
// occur. It will also report how much memory the script is using. Use
// this mode if your crawls are failing but the logs alone aren't
// enough to tell you why. DO NOT leave the crawler in debug mode in a
// production environment, or anyone can just run your crawler whenever
// they want!
$_RDATA['debug'] = false;
/**
* Log a notice (0), message (1) or error (2)
*
*/
function OS_crawlLog($text, $level = 0) {
global $_RDATA;
switch ($level) {
case 1: $prefix = ''; break;
case 2: $prefix = '[ERROR] '; break;
default: $prefix = ' -> ';
}
fwrite($_RDATA['sp_log'], $prefix.$text."\n");
if ($_RDATA['debug'] ||
($_SERVER['REQUEST_METHOD'] == 'CLI' &&
$level >= $_RDATA['sp_log_clilevel'])) {
echo $prefix.$text."\n";
}
}
/**
* Final prep to store content in UTF-8 format in the database
*
*/
function OS_cleanTextUTF8(&$_, $charset, $entity = false) {
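// Converts $_ to UTF-8, optionally decodes HTML entities using the
// given entity table, normalizes punctuation and exotic Unicode
// whitespace via the config maps, then collapses all whitespace runs
// into single spaces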
global $_RDATA;
if (!trim($charset)) $charset = 'ISO-8859-1';
$_ = mb_convert_encoding($_, 'UTF-8', $charset);
if ($entity)
$_ = html_entity_decode($_, $entity | ENT_SUBSTITUTE, 'UTF-8');
$_ = strtr($_, $_RDATA['sp_punct']);
$_ = strtr($_, $_RDATA['sp_whitespace']);
$_ = preg_replace(array('/\s/', '/ {2,}/'), ' ', trim($_));
}
/**
* Format a full or partial URL into a full URL according to a base URL
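* e.g. OS_formatURL('../a.html', 'https://example.com/x/y/z.html')
* returns 'https://example.com/x/a.html'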
*
*/
function OS_formatURL($_, $base) {
$_ = str_replace(' ', '%20', $_);
$_ = preg_replace('/#.*$/', '', filter_var($_, FILTER_SANITIZE_URL));
$_ = str_replace('%20', ' ', $_);
$dirbase = preg_replace('/(?<!:\/)\/[^\/]*$/', '', $base).'/';
$pdb = parse_url($dirbase);
$port = (!empty($pdb['port'])) ? ':'.$pdb['port'] : '';
if (substr($_, 0, 3) == '../') {
$p = preg_replace('/\/[^\/]*\/$/', '/', $pdb['path']);
$_ = $pdb['scheme'].'://'.$pdb['host'].$port.$p.substr($_, 3);
}
if (substr($_, 0, 2) == './') {
$_ = $dirbase.substr($_, 2);
} else if (substr($_, 0, 2) == '//') {
$_ = $pdb['scheme'].':'.$_;
} else if (substr($_, 0, 1) == '/') {
$_ = $pdb['scheme'].'://'.$pdb['host'].$port.$_;
} else if (substr($_, 0, 1) == '?') {
$_ = preg_replace('/\?.*$/', '', $base).$_;
} else if (!preg_match('/^https?:\/\//', $_)) $_ = $dirbase.$_;
$_ = preg_replace(array('/\/[^\/]*\/\.\.\//', '/\/\.\//'), '/', $_);
if ($_ == $pdb['scheme'].'://'.$pdb['host'] ||
$_ == $pdb['scheme'].'://'.$pdb['host'].$port) $_ .= '/';
return trim($_);
}
/**
* Filter a URL by the crawling rules provided by the user
* - Sets an $_RDATA['sp_filter'] array key + value and returns the
* REASON why the URL was rejected, NOT a 'filtered' URL
*
*/
function OS_filterURL($_, $base) {
global $_RDATA;
if (!preg_match('/^https?:\/\//', $_))
$_ = OS_formatURL($_, $base);
if (!empty($_RDATA['sp_filter'][$_]))
return $_RDATA['sp_filter'][$_];
$_RDATA['sp_filter'][$_] = '';
// Accepted hostnames
$plink = parse_url($_);
if (!in_array($plink['host'], $_RDATA['sp_hostnames'], true))
return $_RDATA['sp_filter'][$_] = 'disallowed-host';
// Require URL matches
if (count($_RDATA['sp_require_url'])) {
$foundRequired = false;
foreach ($_RDATA['sp_require_url'] as $requireURL) {
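// A leading '*' marks the rest of the rule as a regular expression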
if ($requireURL[0] == '*') {
if (preg_match('/'.str_replace('/', '\/', substr($requireURL, 1)).'/', $_))
$foundRequired = true;
} else if (strpos($_, $requireURL) !== false)
$foundRequired = true;
}
if (!$foundRequired)
return $_RDATA['sp_filter'][$_] = 'require-url';
}
// Ignore URL matches
foreach ($_RDATA['sp_ignore_url'] as $ignoreURL) {
if ($ignoreURL[0] == '*') {
if (preg_match('/'.str_replace('/', '\/', substr($ignoreURL, 1)).'/', $_))
return $_RDATA['sp_filter'][$_] = 'ignore-url';
} else if (strpos($_, $ignoreURL) !== false)
return $_RDATA['sp_filter'][$_] = 'ignore-url';
}
// Ignore extensions
if (preg_match('/\.('.$_RDATA['sp_ignore_ext_regexp'].')$/i', $_))
return $_RDATA['sp_filter'][$_] = 'ignore-extension';
// robots.txt rules
if (!empty($_RDATA['sp_robots'][$plink['host']]))
foreach ($_RDATA['sp_robots'][$plink['host']] as $disallowURL)
if (strpos($_, $disallowURL) === 0)
return $_RDATA['sp_filter'][$_] = 'robots-txt';
return $_RDATA['sp_filter'][$_];
}
/**
* Fetch a URL using cURL, return an array of useful information
*
*/
function OS_fetchURL($url, $referer = '') {
global $_cURL, $_RDATA;
$_RDATA['sp_robots_header'] = 0;
$_RDATA['sp_self_reference'] = 0;
curl_setopt($_cURL, CURLOPT_URL, str_replace(' ', '%20', $url));
curl_setopt($_cURL, CURLOPT_REFERER, $referer);
$_ = array(
'url' => parse_url($url),
'body' => curl_exec($_cURL),
'base' => $url,
'info' => curl_getinfo($_cURL),
'error' => curl_error($_cURL),
'errno' => curl_errno($_cURL),
'links' => array(),
'title' => '',
'content' => '',
'keywords' => '',
'weighted' => '',
'description' => ''
);
$_['info']['url'] = $url;
$_['info']['noindex'] = '';
$_['info']['nofollow'] = false;
// Process any cURL errors
switch ($_['errno']) {
case 0: // Success
case 42: // Aborted by callback
if ($_['info']['http_code'] >= 400) {
$_['errno'] = 22;
$_['error'] = $_['info']['http_code'].' error';
$_['info']['noindex'] = '400';
} else if ($_['info']['redirect_url']) {
$_['errno'] = 300;
$_['error'] = 'Redirected by HTTP header to: '.$_['info']['redirect_url'];
$_['info']['noindex'] = 'redirect-location';
} else if ($_RDATA['sp_robots_header']) {
$_['errno'] = 777;
$_['error'] = 'Blocked by \'X-Robots-Tag\' HTTP header';
$_['info']['noindex'] = 'robots-http';
} else if ($_RDATA['sp_self_reference']) {
$_['errno'] = 888;
$_['error'] = 'Refused to index myself';
$_['info']['noindex'] = 'self-reference';
} else if ($_['errno'] == 42) {
$_['errno'] = 999;
$_['error'] = 'Max filesize exceeded';
$_['info']['noindex'] = 'too-large';
}
break;
case 28: // Timeout
$_['error'] = 'Timed out waiting for data';
$_['info']['noindex'] = 'timeout';
break;
case 55: // Network send error
case 56: // Network receive error
$_['error'] = 'Network error retrieving data';
$_['info']['noindex'] = 'network-error';
break;
case 6: // Could not resolve host
case 7: // Could not connect to host
$_['error'] = 'Couldn\'t connect to host: '.$_['url']['host'];
$_['info']['noindex'] = 'couldnt-connect';
break;
default: // Uncaught cURL error
OS_crawlLog('Uncaught cURL error: '.$url, 2);
OS_crawlLog($_['errno'], 1);
OS_crawlLog($_['error'], 1);
OS_crawlLog(print_r($_['info'], true), 1);
throw new Exception('Uncaught cURL error');
}
return $_;
}
/**
* Shutdown function to provide cleanup before exit
*
*/
function OS_crawlCleanUp() {
global $_DDATA, $_ODATA, $_RDATA, $_cURL, $_MAIL;
// If the crawl has already been canceled, don't bother
if (!OS_getValue('sp_crawling')) return;
$error = error_get_last();
if (!is_null($error) && $error['type'] == E_ERROR) {
OS_crawlLog($error['message'], 2);
OS_crawlLog('File: \''.$error['file'].'\' at line number: '.$error['line'], 0);
$_RDATA['sp_complete'] = false;
}
// Save or display cookies?
$cookies = curl_getinfo($_cURL, CURLINFO_COOKIELIST);
// var_dump($cookies);
curl_close($_cURL);
// If crawl completed successfully
if ($_RDATA['sp_complete']) {
OS_crawlLog('Cleaning up database tables...', 1);
// Add a natural sort order value to each entry
natcasesort($_RDATA['sp_store']);
$_RDATA['sp_store'] = array_values($_RDATA['sp_store']);
$url_sort = $_DDATA['pdo']->prepare(
'UPDATE `'.$_DDATA['tbprefix'].'crawltemp`
SET `url_sort`=:url_sort WHERE `url`=:url;'
);
foreach ($_RDATA['sp_store'] as $key => $stored_url) {
$url_sort->execute(array(
'url_sort' => $key,
'url' => $stored_url
));
$err = $url_sort->errorInfo();
if ($err[0] != '00000') {
OS_crawlLog('Error sorting the search database', 1);
OS_crawlLog($err[2], 0);
break;
}
}
// Truncate the existing search database
$truncate = $_DDATA['pdo']->query(
'TRUNCATE `'.$_DDATA['tbprefix'].'crawldata`;'
);
$err = $truncate->errorInfo();
if ($err[0] != '00000') {
OS_crawlLog('Could not truncate the search database', 1);
OS_crawlLog($err[2], 0);
// Last chance to bail out before we make actual changes
$_RDATA['sp_complete'] = false;
}
}
// If crawl completed successfully AND we truncated the old table
if ($_RDATA['sp_complete']) {
OS_setValue('sp_time_end', time());
OS_setValue('sp_time_last', $_ODATA['sp_time_end'] - $_ODATA['sp_time_start']);
// Select all rows from the temp table into the existing search table
$insert = $_DDATA['pdo']->query(
'INSERT INTO `'.$_DDATA['tbprefix'].'crawldata`
SELECT * FROM `'.$_DDATA['tbprefix'].'crawltemp`;'
);
$err = $insert->errorInfo();
if ($err[0] == '00000') {
$tableinfo = $_DDATA['pdo']->query(
'SHOW TABLE STATUS LIKE \''.$_DDATA['tbprefix'].'crawldata\';'
);
$err = $tableinfo->errorInfo();
if ($err[0] == '00000') {
$tableinfo = $tableinfo->fetchAll();
OS_setValue('sp_data_stored', $tableinfo[0]['Data_length']);
} else OS_crawlLog('Could not read crawl table status', 1);
// Purge the search result cache
if ($_ODATA['s_limit_cache']) {
$purge = $_DDATA['pdo']->query(
'UPDATE `'.$_DDATA['tbprefix'].'query` SET `cache`=\'\';'
);
$err = $purge->errorInfo();
if ($err[0] != '00000')
OS_crawlLog('Could not purge search result cache', 1);
}
// Optimize the query log table
$optimize = $_DDATA['pdo']->query(
'OPTIMIZE TABLE `'.$_DDATA['tbprefix'].'query`;'
);
OS_setValue('sp_pages_stored', count($_RDATA['sp_store']));
OS_setValue('sp_domains', $_RDATA['sp_domains']);
OS_setValue('sp_time_end_success', $_ODATA['sp_time_end']);
OS_crawlLog('***** Crawl completed in '.$_ODATA['sp_time_last'].'s *****', 1);
OS_crawlLog('Total data transferred: '.OS_readSize($_ODATA['sp_data_transferred']), 1);
OS_crawlLog('Average transfer speed: '.OS_readSize(round($_ODATA['sp_data_transferred'] / $_ODATA['sp_time_last'])).'/s', 1);
if ($_RDATA['sp_sleep'])
OS_crawlLog('Time spent sleeping: '.(round($_RDATA['sp_sleep'] / 10) / 100).'s', 1);
OS_crawlLog('Time taken by cURL: '.(round($_RDATA['sp_time_curl'] * 100) / 100).'s', 1);
OS_crawlLog($_ODATA['sp_progress'][0].' page'.(($_ODATA['sp_progress'][0] == 1) ? '' : 's').' crawled', 1);
OS_crawlLog($_ODATA['sp_pages_stored'].' page'.(($_ODATA['sp_pages_stored'] == 1) ? '' : 's').' stored', 1);
if ($_RDATA['sp_status']['New'])
OS_crawlLog($_RDATA['sp_status']['New'].' new '.(($_RDATA['sp_status']['New'] == 1) ? 'page' : 'pages').' found', 0);
if ($_RDATA['sp_status']['Updated'])
OS_crawlLog($_RDATA['sp_status']['Updated'].' '.(($_RDATA['sp_status']['Updated'] == 1) ? 'page' : 'pages').' updated', 0);
if ($_RDATA['sp_status']['Blocked'])
OS_crawlLog($_RDATA['sp_status']['Blocked'].' '.(($_RDATA['sp_status']['Blocked'] == 1) ? 'page' : 'pages').' blocked', 0);
if ($_RDATA['sp_status']['Not Found'])
OS_crawlLog($_RDATA['sp_status']['Not Found'].' '.(($_RDATA['sp_status']['Not Found'] == 1) ? 'page' : 'pages').' not found', 0);
if ($_RDATA['sp_status']['Orphan'])
OS_crawlLog($_RDATA['sp_status']['Orphan'].' orphaned '.(($_RDATA['sp_status']['Orphan'] == 1) ? 'page' : 'pages'), 0);
if ($_ODATA['sp_autodelete'])
OS_crawlLog('Orphaned pages were auto-deleted', 1);
// Send success email to the admin(s)
if ($_MAIL && count($_MAIL->getAllRecipientAddresses()) && $_ODATA['sp_email_success']) {
$_MAIL->Subject = 'Orcinus Site Search Crawler: Crawl succeeded';
$_MAIL->Body = implode(" \r\n", preg_grep('/^[\[\*]/', explode("\n", file_get_contents($_ODATA['sp_log']))));
if (!$_MAIL->Send()) OS_crawlLog('Could not send notification email', 2);
}
$cliMessage = 'Crawl completed successfully';
$jsonMessage = json_encode(array(
'status' => 'Success',
'message' => $cliMessage
), JSON_INVALID_UTF8_IGNORE);
// We truncated the search table but FAILED to populate it!
// This is a serious error that disables searching until the
// crawler is run again!
} else {
OS_crawlLog('Could not populate the search table', 2);
OS_crawlLog($err[2], 0);
OS_crawlLog('***** Crawl failed; runtime '.$_ODATA['sp_time_last'].'s *****', 1);
OS_crawlLog('Search table was cleared, but could not be repopulated!', 1);
OS_crawlLog('The crawler MUST be run again to fix this issue!', 1);
// Send failure email to the admin(s)
if ($_MAIL && count($_MAIL->getAllRecipientAddresses()) && $_ODATA['sp_email_failure']) {
$_MAIL->Subject = 'Orcinus Site Search Crawler: Catastrophic failure!';
$_MAIL->Body = implode(" \r\n", preg_grep('/^[\[\*\w\d]/', explode("\n", file_get_contents($_ODATA['sp_log']))));
if (!$_MAIL->Send()) OS_crawlLog('Could not send notification email', 2);
}
$cliMessage = 'Could not populate search table; search table is currently empty!';
$jsonMessage = json_encode(array(
'status' => 'Error',
'message' => $cliMessage
), JSON_INVALID_UTF8_IGNORE);
}
// Else the crawl failed
} else {
OS_setValue('sp_time_last', $_ODATA['sp_time_end'] - $_ODATA['sp_time_start']);
OS_crawlLog('***** Crawl failed; runtime '.$_ODATA['sp_time_last'].'s *****', 1);
OS_crawlLog('Total data transferred: '.OS_readSize($_ODATA['sp_data_transferred']), 1);
OS_crawlLog('Search table was NOT updated', 1);
if ($_ODATA['sp_sitemap_file'])
OS_crawlLog('Sitemap was NOT updated', 1);
// Send failure email to the admin(s)
if ($_MAIL && count($_MAIL->getAllRecipientAddresses()) && $_ODATA['sp_email_failure'] && !$_ODATA['sp_cancel']) {
$_MAIL->Subject = 'Orcinus Site Search Crawler: Crawl failed';
$_MAIL->Body = implode(" \r\n", preg_grep('/^[\[\*\w\d]/', explode("\n", file_get_contents($_ODATA['sp_log']))));
if (!$_MAIL->Send()) OS_crawlLog('Could not send notification email', 2);
}
$cliMessage = 'Crawl failed; see the log for details';
$jsonMessage = json_encode(array(
'status' => 'Error',
'message' => $cliMessage
), JSON_INVALID_UTF8_IGNORE);
}
// Delete the temp search table
$drop = $_DDATA['pdo']->query(
'DROP TABLE IF EXISTS `'.$_DDATA['tbprefix'].'crawltemp`;'
);
$err = $drop->errorInfo();
if ($err[0] != '00000') {
OS_crawlLog('Could not delete the temporary search table', 1);
OS_crawlLog($err[2], 0);
}
// Store the log file to the config database
OS_setValue('sp_log', file_get_contents($_ODATA['sp_log']));
fclose($_RDATA['sp_log']);
// Unset the crawling flag
OS_setValue('sp_crawling', 0);
if ($_SERVER['REQUEST_METHOD'] != 'CLI') {
if (!$_RDATA['debug'])
header('Content-type: application/json; charset='.strtolower($_ODATA['s_charset']));
die($jsonMessage);
} else die($cliMessage."\n");
}
// ***** Accept incoming commands by REQUEST_METHOD
switch ($_SERVER['REQUEST_METHOD']) {
/* ***** Handle POST Requests ************************************ */
case 'POST':
// JSON POST request
// These are usually sent by javascript fetch()
if (strpos(trim($_SERVER['CONTENT_TYPE']), 'application/json') === 0) {
$postBody = file_get_contents('php://input');
$_POST = json_decode($postBody, false);
$response = array();
if (empty($_POST->action)) $_POST->action = '';
switch ($_POST->action) {
case 'crawl':
if (!empty($_POST->sp_key) &&
$_ODATA['sp_key'] &&
$_POST->sp_key == $_ODATA['sp_key']) {
if (OS_getValue('sp_crawling')) {
$response = array(
'status' => 'Error',
'message' => 'Crawler is already running; current progress: '.$_ODATA['sp_progress'][0].'/'.$_ODATA['sp_progress'][1]
);
} else {
// Go crawl!
OS_setValue('sp_crawling', 1);
OS_setValue('sp_key', '');
}
} else {
$response = array(
'status' => 'Error',
'message' => 'Incorrect key to initiate crawler'
);
}
break;
case 'progress':
$lines = array();
if (!empty($_POST->log)) {
if (OS_getValue('sp_crawling')) {
if (strpos($_ODATA['sp_log'], "\n") === false && file_exists($_ODATA['sp_log']))
$lines = file($_ODATA['sp_log'], FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
} else $lines = explode("\n", $_ODATA['sp_log']);
if (empty($_POST->grep)) $_POST->grep = '';
switch ($_POST->grep) {
case 'all': break;
case 'errors': $lines = preg_grep('/^[\[\*]/', $lines); break;
default: $lines = preg_grep('/^[\[\*\w\d]/', $lines);
}
}
if ($_ODATA['sp_crawling']) $lines = array_slice($lines, -15);
$response = array(
'status' => ($_ODATA['sp_crawling']) ? 'Crawling' : 'Complete',
'progress' => $_ODATA['sp_progress'],
'data_transferred' => $_ODATA['sp_data_transferred'],
'time_crawl' => time() - $_ODATA['sp_time_start'],
'time_start' => $_ODATA['sp_time_start'],
'time_end' => $_ODATA['sp_time_end'],
'timeout_crawl' => $_ODATA['sp_timeout_crawl'],
'tail' => trim(implode("\n", $lines))
);
break;
case 'cancel':
if (OS_getValue('sp_crawling')) {
// IF the crawler 'time_start' is more than 'timeout_crawl'
// seconds ago, or the 'force' token is set, the crawler is
// probably stuck. Unstick it.
if (empty($_POST->force)) $_POST->force = '';
if ($_POST->force || time() - $_ODATA['sp_time_start'] > $_ODATA['sp_timeout_crawl']) {
OS_setValue('sp_crawling', 0);
if (empty($_POST->reason))
$_POST->reason = 'The crawler halted unexpectedly';
if (strpos($_ODATA['sp_log'], "\n") === false && file_exists($_ODATA['sp_log'])) {
$log = file_get_contents($_ODATA['sp_log']);
OS_setValue('sp_log', $log."\n".'[ERROR] '.$_POST->reason);
} else OS_setValue('sp_log', '[ERROR] '.$_POST->reason);
OS_setValue('sp_time_last', $_ODATA['sp_time_end'] - $_ODATA['sp_time_start']);
// Send failure email to the admin(s)
if ($_MAIL && count($_MAIL->getAllRecipientAddresses()) && $_ODATA['sp_email_failure']) {
$_MAIL->Subject = 'Orcinus Site Search Crawler: Crawler halted unexpectedly';
$_MAIL->Body = implode(" \r\n", preg_grep('/^[\[\*\w\d]/', explode("\n", $_ODATA['sp_log'])));
if (!$_MAIL->Send()) OS_setValue('sp_log', $_ODATA['sp_log']."\n".'[ERROR] Could not send notification email');
}
}
OS_setValue('sp_cancel', 1);
$response = array(
'status' => 'Success',
'message' => 'Cancel flag was set',
'crawl_time' => time() - $_ODATA['sp_time_start']
);
} else {
$response = array(
'status' => 'Error',
'message' => 'Crawler is not currently running'
);
}
break;
default:
$response = array(
'status' => 'Error',
'message' => 'Unrecognized command'
);
}
// If we have a response to give, display it and exit
if ($response) {
header('Content-type: application/json; charset='.strtolower($_ODATA['s_charset']));
die(json_encode($response, JSON_INVALID_UTF8_IGNORE));
}
// Don't do anything for normal POST request
// These are usually sent by <form> HTML elements
} else {
header('Content-type: text/plain; charset='.strtolower($_ODATA['s_charset']));
die($_ODATA['sp_useragent']);
}
break;
// Allow CLI requests through
case '':
if (!empty($_SERVER['argv'][0]) && $_SERVER['argv'][0] == $_SERVER['PHP_SELF']) {
$_SERVER['REQUEST_METHOD'] = 'CLI';
if (!OS_getValue('sp_crawling')) {
// Set the logging level, if specified
if (!empty($_SERVER['argv'][1]) && preg_match('/^-log=([012])$/', $_SERVER['argv'][1], $match)) {
$_RDATA['sp_log_clilevel'] = (int)$match[1];
} else $_RDATA['sp_log_clilevel'] = 2;
// Start a crawl
OS_setValue('sp_crawling', 1);
} else die('Crawler is already running; exiting...');
} else die($_ODATA['sp_useragent']);
break;
// Don't do anything for GET requests, unless in debug mode
case 'GET':
header('Content-type: text/plain; charset='.strtolower($_ODATA['s_charset']));
if ($_RDATA['debug']) {
// If we are in debug mode, but the crawler is already running, exit
if (OS_getValue('sp_crawling'))
die('Crawler is already running; exiting...');
// Start a crawl
OS_setValue('sp_crawling', 1);
} else die($_ODATA['sp_useragent']);
break;
// Exit for all other request types
default:
header('Content-type: text/plain; charset='.strtolower($_ODATA['s_charset']));
die($_ODATA['sp_useragent']);
}
/* ***** Begin Crawl Execution ************************************* */
register_shutdown_function('OS_crawlCleanUp');
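// Keep crawling even if the requesting client disconnects, and allow
// ~10% headroom on PHP's own time limit beyond the crawl timeout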
ignore_user_abort(true);
@set_time_limit((int)ceil($_ODATA['sp_timeout_crawl'] * 1.1));
libxml_use_internal_errors(true);
if (function_exists('apache_setenv'))
apache_setenv('no-gzip', '1');
OS_setValue('sp_cancel', 0);
OS_setValue('sp_time_start', time());
OS_setValue('sp_progress', array(0, 1, false));
OS_setValue('sp_pages_stored', 0);
OS_setValue('sp_data_transferred', 0);
OS_setValue('sp_data_stored', 0);
OS_setValue('sp_time_last', 0);
$_RDATA['sp_log'] = tmpfile();
OS_setValue('sp_log', stream_get_meta_data($_RDATA['sp_log'])['uri']);
OS_crawlLog('***** Crawl started: '.date('r').' *****', 1);
// ***** Prepare runtime data
$_RDATA['sp_starting'] = array_filter(array_map('trim', explode("\n", $_ODATA['sp_starting'])));
$_RDATA['sp_hostnames'] = array();
$_RDATA['sp_ignore_url'] = array_filter(array_map('trim', explode("\n", $_ODATA['sp_ignore_url'])));
$_RDATA['sp_ignore_css'] = array_filter(explode(' ', $_ODATA['sp_ignore_css']));
$_RDATA['sp_title_strip'] = array_filter(array_map('trim', explode("\n", $_ODATA['sp_title_strip'])));
$_RDATA['s_weight_css'] = array_filter(explode(' ', $_ODATA['s_weight_css']));
$_RDATA['sp_require_url'] = array_filter(array_map('trim', explode("\n", $_ODATA['sp_require_url'])));
$_RDATA['sp_ignore_ext_regexp'] = implode('|', array_map('preg_quote', array_filter(explode(' ', $_ODATA['sp_ignore_ext']))));
$_RDATA['sp_robots_header'] = 0;
$_RDATA['sp_complete'] = false;
$_RDATA['sp_links'] = array();
$_RDATA['sp_store'] = array();
$_RDATA['sp_domains'] = array();
$_RDATA['sp_sitemap'] = array();
$_RDATA['sp_robots'] = array();
$_RDATA['sp_status'] = array('Orphan' => 0, 'Blocked' => 0, 'Not Found' => 0, 'Updated' => 0, 'New' => 0);
$_RDATA['sp_filter'] = array();
$_RDATA['sp_prev_dls'] = 0;
$_RDATA['sp_time_curl'] = 0;
$_RDATA['sp_sleep'] = 0;
$_RDATA['sp_sha1'] = array();
$_RDATA['sp_resumed'] = false;
$_RDATA['sp_whitespace'] = array(
"\u{0009}" => ' ', "\u{000A}" => "\n", "\u{000B}" => "\n", "\u{000C}" => "\n",
"\u{000D}" => "\n", "\u{0085}" => "\n", "\u{00A0}" => ' ', "\u{1680}" => ' ',
"\u{2000}" => ' ', "\u{2001}" => ' ', "\u{2002}" => ' ', "\u{2003}" => ' ',
"\u{2004}" => ' ', "\u{2005}" => ' ', "\u{2006}" => ' ', "\u{2007}" => ' ',
"\u{2008}" => ' ', "\u{2009}" => ' ', "\u{200A}" => ' ', "\u{200B}" => ' ',
"\u{200C}" => ' ', "\u{200D}" => '', "\u{2028}" => "\n", "\u{2029}" => "\n",
"\u{202F}" => ' ', "\u{205F}" => ' ', "\u{2060}" => '', "\u{3000}" => ' ',
"\u{FEFF}" => ' ', "\u{FFFD}" => ''
);
// ***** Load PDF parser
if (!class_exists('\Smalot\PdfParser\Parser'))
if (file_exists(__DIR__.'/pdfparser/alt_autoload.php-dist'))
include __DIR__.'/pdfparser/alt_autoload.php-dist';
if (class_exists('\Smalot\PdfParser\Parser')) {
$config = new \Smalot\PdfParser\Config();
$config->setRetainImageContent(false);
$config->setDecodeMemoryLimit(16777216);
$_PDF = new \Smalot\PdfParser\Parser([], $config);
} else {
OS_crawlLog('Could not include \'PDFParser\'; PDFs will not be indexed', 1);
$_PDF = false;
}
// ***** Check for PHPMailer
if (!$_MAIL) {
OS_crawlLog('Could not include \'PHPMailer\'; Crawler cannot send mail', 1);
} else if (!count($_MAIL->getAllRecipientAddresses()))
OS_crawlLog('No admin emails specified; Crawler will not send mail', 1);
// ***** Initialize the cURL connection
$_cURL = OS_getConnection();
if ($_cURL) {
// Customize this cURL connection
if ($_ODATA['sp_cookies'])
curl_setopt($_cURL, CURLOPT_COOKIEFILE, '');
curl_setopt($_cURL, CURLOPT_HEADERFUNCTION, function($_cURL, $line) {
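// Watch response headers for an 'X-Robots-Tag: noindex' directive and
// for this installation's own signature header; the callback must
// return the number of bytes handled or cURL aborts the transfer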
global $_RDATA;
if (preg_match('/^X-Robots-Tag:\s*(noindex|none)/i', $line))
$_RDATA['sp_robots_header'] = 1;
if (trim($line) == $_RDATA['x_generated_by'])
$_RDATA['sp_self_reference'] = 1;
return strlen($line);
});
curl_setopt($_cURL, CURLOPT_NOPROGRESS, false);
curl_setopt($_cURL, CURLOPT_PROGRESSFUNCTION,
function($_cURL, $dls, $dl, $uls, $ul) {
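// cURL calls this repeatedly during a transfer; returning any non-zero
// value aborts it with CURLE_ABORTED_BY_CALLBACK (errno 42), which
// OS_fetchURL() then maps to a specific 'noindex' reason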
global $_ODATA, $_RDATA;
if ($_RDATA['sp_robots_header']) return 1;
if ($_RDATA['sp_self_reference']) return 1;
// Prevent comparing this value until a Content-length header has
// been received by the cURL connection
if ($dls != $_RDATA['sp_prev_dls']) {
$_RDATA['sp_prev_dls'] = $dls;
if ($dls > $_ODATA['sp_limit_filesize'] * 1024) return 1;
}
if ($dl > $_ODATA['sp_limit_filesize'] * 1024) return 1;
$i = curl_getinfo($_cURL);
if ($i['redirect_url']) return 1;
if ($i['http_code'] && $i['http_code'] >= 400) return 1;
return $_RDATA['sp_robots_header'];
}
);
} else OS_crawlLog('cURL functions are not enabled; cannot perform crawl', 2);
// ***** Pre-fill queue with starting URL(s) at depth 0, blank referer
$_RDATA['sp_queue'] = array();
foreach ($_RDATA['sp_starting'] as $starting) {
$starting = OS_formatURL($starting, $_ODATA['admin_install_domain'].'/');
$_RDATA['sp_queue'][] = array($starting, 0, '');
// Add starting URLs to required URLs so the crawler cannot travel
// into parent directories
$_RDATA['sp_require_url'][] = preg_replace('/\/[^\/]*$/', '/', $starting);
$host = parse_url($starting)['host'];
if (!in_array($host, $_RDATA['sp_hostnames'], true))
$_RDATA['sp_hostnames'][] = $host;
}
// ***** List of previously crawled links from the database
$_RDATA['sp_exist'] = array();
$_RDATA['sp_lastmod'] = array();
$crawldata = $_DDATA['pdo']->query(
'SELECT `url`, `content_checksum`, `last_modified`
FROM `'.$_DDATA['tbprefix'].'crawldata`'
);
$err = $crawldata->errorInfo();
if ($err[0] == '00000') {
foreach ($crawldata as $value) {
$_RDATA['sp_exist'][$value['content_checksum']] = $value['url'];
$_RDATA['sp_lastmod'][$value['url']] = $value['last_modified'];
}
} else OS_crawlLog('Error getting list of previous URLs from crawldata table', 2);
// If the crawltemp table exists here, that means a crawl was
// interrupted without completing the shutdown function.
// Use the data from this partially completed crawl to resume it.
if (in_array($_DDATA['tbprefix'].'crawltemp', $_DDATA['tables'], true)) {
$select = $_DDATA['pdo']->query(
'SELECT `url`, `links`, `content_checksum` FROM `'.$_DDATA['tbprefix'].'crawltemp`;'
);
$err = $select->errorInfo();
if ($err[0] == '00000') {
OS_crawlLog('Previous crawl data exists; using it to resume crawling...', 1);
$select = $select->fetchAll();
OS_crawlLog('Found '.count($select).' previously crawled URLs', 1);
$_RDATA['sp_resumed'] = true;
// Run through every entry in the crawltemp table
foreach ($select as $row) {
// If an entry matches an existing URL in the queue (just
// starting URLs right now) then delete that queue entry
foreach ($_RDATA['sp_queue'] as $key => $queue)
if ($row['url'] == $queue[0])
unset($_RDATA['sp_queue'][$key]);
// Add it to the 'stored' and 'crawled links' lists
$_RDATA['sp_store'][] = $row['url'];
$_RDATA['sp_links'][] = $row['url'];
// Add the content hash to the tally
$_RDATA['sp_sha1'][$row['content_checksum']] = $row['url'];
// Rebuild the domains list
$prurl = parse_url($row['url']);
$domain = $prurl['scheme'].'://'.$prurl['host'];
if (!isset($_RDATA['sp_domains'][$domain])) {
$_RDATA['sp_domains'][$domain] = 1;
} else $_RDATA['sp_domains'][$domain]++;
// Add links from the entry to the queue
$row['links'] = json_decode($row['links'], true);
foreach ($row['links'] as $link) {
$link = OS_formatURL($link, $row['url']);
// ***** If this link hasn't been crawled yet
if (!in_array($link, $_RDATA['sp_links'], true)) {
// ... and if link hasn't been queued yet
foreach ($_RDATA['sp_queue'] as $queue)
if ($link == $queue[0]) continue 2;
// ... and if link passes our user filters, add the link to
// the queue
if (!OS_filterURL($link, $row['url']))
$_RDATA['sp_queue'][] = array($link, 0, $row['url']);
}
}
}
// We couldn't select any data from the crawltemp table so delete it
} else {
$drop = $_DDATA['pdo']->query(
'DROP TABLE IF EXISTS `'.$_DDATA['tbprefix'].'crawltemp`;'
);
$err = $drop->errorInfo();
if ($err[0] != '00000') {
// If we couldn't delete the interrupted crawldata table, this is
// a fatal error
OS_crawlLog('Could not delete previously interrupted crawl data; unable to crawl.', 2);
throw new Exception('Could not delete previously interrupted crawl data; unable to crawl.');
}
}
}
// Create a temp MySQL storage table using schema of the existing table
$create = $_DDATA['pdo']->query(
'CREATE TABLE IF NOT EXISTS `'.$_DDATA['tbprefix'].'crawltemp`
LIKE `'.$_DDATA['tbprefix'].'crawldata`;'
);
$err = $create->errorInfo();
if ($err[0] != '00000') {
// If we could not create the crawldata table, or an interrupted
// crawldata table doesn't exist, then this is a fatal error
OS_crawlLog('Unable to create or reuse existing crawl data table; unable to crawl.', 2);
throw new Exception('Unable to create or reuse existing crawl data table; unable to crawl.');
}
// Prepare SQL statements
$selectData = $_DDATA['pdo']->prepare(
'SELECT `url`, `category`, `links`, `content_checksum`, `last_modified`,
`flag_updated`, `flag_unlisted`, `priority`
FROM `'.$_DDATA['tbprefix'].'crawldata` WHERE `url`=:url;'
);
$updateURL = $_DDATA['pdo']->prepare(
'UPDATE `'.$_DDATA['tbprefix'].'crawltemp` SET
`url`=:url WHERE `content_checksum`=:content_checksum;'
);
$insertTemp = $_DDATA['pdo']->prepare(
'INSERT INTO `'.$_DDATA['tbprefix'].'crawltemp` SET
`url`=:url,
`url_sort`=0,
`title`=:title,
`description`=:description,
`keywords`=:keywords,
`category`=:category,
`weighted`=:weighted,
`links`=:links,
`content`=:content,
`content_mime`=:content_mime,
`content_charset`=:content_charset,
`content_checksum`=:content_checksum,
`status`=:status,
`flag_unlisted`=:flag_unlisted,
`flag_updated`=:flag_updated,
`last_modified`=:last_modified,
`priority`=:priority
;'
);
$insertNotModified = $_DDATA['pdo']->prepare(
'REPLACE INTO `'.$_DDATA['tbprefix'].'crawltemp` (
`url`, `url_sort`, `title`, `description`, `keywords`, `category`,
`weighted`, `links`, `content`, `content_mime`, `content_charset`,
`content_checksum`, `status`, `flag_unlisted`, `flag_updated`,
`last_modified`, `priority`
) SELECT
`url`, 0, `title`, `description`, `keywords`, `category`,
`weighted`, `links`, `content`, `content_mime`, `content_charset`,
`content_checksum`, :status, `flag_unlisted`, 0,
`last_modified`, `priority`
FROM `'.$_DDATA['tbprefix'].'crawldata` WHERE `url`=:url;'
);
// ***** Begin crawling URLs from the queue
while ($_cURL && count($_RDATA['sp_queue'])) {
// Check if we have run out of execution time
if ($_ODATA['sp_time_start'] + $_ODATA['sp_timeout_crawl'] <= time()) {
OS_crawlLog('Maximum script runtime ('.$_ODATA['sp_timeout_crawl'].'s) reached', 2);
break;
}
// Check if user has canceled the crawl
if (OS_getValue('sp_cancel')) {
OS_crawlLog('Crawl canceled manually by user', 2);
break;
}
// Check if we have exceeded the maximum number of crawled links
if (count($_RDATA['sp_links']) > $_ODATA['sp_limit_crawl']) {
OS_crawlLog('Maximum number of crawled pages exceeded', 2);
break;
}
// Retrieve next link to crawl from the queue
list($url, $depth, $referer) = array_shift($_RDATA['sp_queue']);
$_RDATA['sp_links'][] = $url;
// Check if URL is beyond the depth limit
if ($depth > $_ODATA['sp_limit_depth']) {
OS_crawlLog('Maximum link depth ('.$_ODATA['sp_limit_depth'].') exceeded; URL at depth '.$depth.' was not stored: '.$url, 2);
continue;
}
// Check robots.txt for newly encountered hostnames
$purl = parse_url($url);
$port = (!empty($purl['port'])) ? ':'.$purl['port'] : '';
if (!isset($_RDATA['sp_robots'][$purl['host']])) {
$_RDATA['sp_robots'][$purl['host']] = array();
OS_crawlLog('Fetching robots.txt for domain: '.$purl['host'], 1);
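// Reset any If-Modified-Since condition left on the handle so
// robots.txt is always fetched in full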
curl_setopt($_cURL, CURLOPT_TIMECONDITION, CURL_TIMECOND_NONE);
$robotstxt = OS_fetchURL($purl['scheme'].'://'.$purl['host'].$port.'/robots.txt', '');
if (!$robotstxt['errno']) {
$robots = array();
$robot = '';
// The raw robots.txt response is in 'body'; 'content' is only filled
// in later by the page parsers
$robolines = explode("\n", $robotstxt['body']);
foreach ($robolines as $line) {
if (preg_match('/^user-agent\s*:\s*(.*)\s*$/i', $line, $r)) {
if (empty($robots[$robot = $r[1]]))
$robots[$robot] = array('disallow' => array(), 'allow' => array());
} else if (preg_match('/((dis)?allow)\s*:\s*(.*)\s*$/i', $line, $r))
$robots[$robot][strtolower($r[1])][] = OS_formatURL($r[3], $url);
}
foreach ($robots as $agent => $rules) {
if (preg_match('/^orc(a|inus)(-?php)?-?crawler$/i', $agent) || $agent == '*') {
foreach ($rules['disallow'] as $disrule)
if (!in_array($disrule, $_RDATA['sp_robots'][$purl['host']], true))
$_RDATA['sp_robots'][$purl['host']][] = $disrule;
foreach ($rules['allow'] as $rule) {
$key = array_search($rule, $_RDATA['sp_robots'][$purl['host']]);
if ($key !== false) unset($_RDATA['sp_robots'][$purl['host']][$key]);
}
}
}
}
}
if ($_RDATA['debug'])
OS_crawlLog('Memory used: '.OS_readSize(memory_get_usage(true)), 1);
OS_crawlLog('Crawling: '.$url.' (Depth: '.$depth.')', 1);
OS_setValue('sp_progress', array(
count($_RDATA['sp_links']),
count($_RDATA['sp_links']) + count($_RDATA['sp_queue']),
$_RDATA['sp_resumed']
));
OS_setValue('sp_time_end', time());
// Set the correct If-Modified-Since request header
if ($_ODATA['sp_ifmodifiedsince'] && isset($_RDATA['sp_lastmod'][$url])) {
curl_setopt($_cURL, CURLOPT_TIMEVALUE, $_RDATA['sp_lastmod'][$url]);
curl_setopt($_cURL, CURLOPT_TIMECONDITION, CURL_TIMECOND_IFMODSINCE);
} else curl_setopt($_cURL, CURLOPT_TIMECONDITION, CURL_TIMECOND_NONE);
// Fetch the URL
$data = OS_fetchURL($url, $referer);
// Record cURL timing and data info for this fetch
OS_setValue('sp_data_transferred', $_ODATA['sp_data_transferred'] + $data['info']['size_download']);
$_RDATA['sp_time_curl'] += $data['info']['total_time'];
// If there were cURL errors while fetching this URL
if ($data['errno']) {
// Else if the page hasn't been modified since the last crawl
} else if ($data['info']['http_code'] == 304) {
$data['info']['noindex'] = 'not-modified-304';
// Else if we received any content at all
} else if (trim($data['body'])) {
// Get a 20-byte binary hash of the raw content
$data['info']['sha1'] = sha1($data['body'], true);
// If this content does not duplicate previously stored content
if (empty($_RDATA['sp_sha1'][$data['info']['sha1']])) {
// Add the content hash to the tally
$_RDATA['sp_sha1'][$data['info']['sha1']] = $url;
// If this is a new page, or an existing page but the content
// hash has changed
if (!isset($_RDATA['sp_exist'][$data['info']['sha1']]) ||
$_RDATA['sp_exist'][$data['info']['sha1']] != $url) {
// Detect MIME-type using extension?
if (empty($data['info']['content_type']))
$data['info']['content_type'] = 'text/plain';
// Parse MIME-type
$data['info']['mime_type'] = '';
if (preg_match('/\w+\/[\w.+-]+/', $data['info']['content_type'], $m))
$data['info']['mime_type'] = $m[0];
// Parse Character Encoding
$data['info']['charset'] = '';
if (preg_match('/charset=([\w\d.:-]+)/i', $data['info']['content_type'], $m))
$data['info']['charset'] = $m[1];
if (!$data['info']['charset'])
$data['info']['charset'] = 'ISO-8859-1';
// GZ-Unzip the content if necessary
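// (assumes a bare 10-byte gzip header with no optional fields; the
// loop handles doubly-encoded bodies)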
while (strpos($data['body'], "\x1f\x8b") === 0)
$data['body'] = gzinflate(substr($data['body'], 10));
// Title defaults to filename
$data['title'] = basename($data['info']['url']);
// Determine how to parse the content by MIME-type
switch ($data['info']['mime_type']) {
/* ***** PLAIN TEXT ************************************** */
case 'text/plain':
$data['content'] = $data['body'];
OS_cleanTextUTF8($data['content'], $data['info']['charset']);
break;
/* ***** XML DOCUMENT ************************************ */
case 'text/xml':
case 'application/xml':
$data['body'] = preg_replace('/<br(\s?\/)?>/', ' ', $data['body']);
$document = new DOMDocument();
if ($document->loadXML($data['body'], LIBXML_PARSEHUGE | LIBXML_BIGLINES | LIBXML_COMPACT)) {
// Remove <script> elements
$scripts = iterator_to_array($document->getElementsByTagName('script')); // snapshot: the node list is live
foreach ($scripts as $script)
$script->parentNode->removeChild($script);
// Remove <!-- comments -->
$xpath = new DOMXpath($document);
$comments = $xpath->query('//comment()');
foreach ($comments as $comment)
$comment->parentNode->removeChild($comment);
// Check XML document charset
if (strtolower($data['info']['charset']) != strtolower($document->xmlEncoding)) {
OS_crawlLog('Charset in Content-type header ('.(($data['info']['charset']) ? $data['info']['charset'] : '<none>').') differs from document charset ('.(($document->xmlEncoding) ? $document->xmlEncoding : '<none>').') at: '.$data['info']['url'], 1);
$data['info']['charset'] = $document->xmlEncoding;
}
$data['content'] = $document->textContent;
// Could not parse XML; try to store content anyway
} else {
$data['error'] = 'Invalid XML - could not parse content; storing as-is';
$data['info']['nofollow'] = true;
// Remove <script> elements and <!-- comments -->
$data['content'] = preg_replace(array('/<!--.*?-->/s', '/<script.*?\/script>/is'), '', $data['body']);
$data['content'] = strip_tags($data['content']);
}
OS_cleanTextUTF8($data['content'], $data['info']['charset'], ENT_XML1);
break;
/* ***** HTML DOCUMENT *********************************** */
case 'text/html':
case 'application/xhtml+xml':
$data['body'] = preg_replace('/<br(\s?\/)?>/', ' ', $data['body']);
$document = new DOMDocument();
if ($document->loadHTML($data['body'], LIBXML_PARSEHUGE | LIBXML_BIGLINES | LIBXML_COMPACT | LIBXML_NOCDATA)) {
// Remove <script> elements
$scripts = iterator_to_array($document->getElementsByTagName('script')); // snapshot: the node list is live
foreach ($scripts as $script)
$script->parentNode->removeChild($script);
// Remove <!-- comments -->
$xpath = new DOMXpath($document);
$comments = $xpath->query('//comment()');
foreach ($comments as $comment)
$comment->parentNode->removeChild($comment);
// ***** Process <head> elements
$head = $document->getElementsByTagName('head');
if (!empty($head[0])) {
$base = $head[0]->getElementsByTagName('base');
if (!empty($base[0]))
for ($x = 0; $x < count($base[0]->attributes); $x++)
if (strtolower($base[0]->attributes[$x]->name) == 'href')
if (!empty($base[0]->attributes[$x]->value))
$data['base'] = filter_var($base[0]->attributes[$x]->value, FILTER_SANITIZE_URL);
$metas = $head[0]->getElementsByTagName('meta');
foreach ($metas as $meta) {
for ($x = 0; $x < count($meta->attributes); $x++) {
if (strtolower($meta->attributes[$x]->name) == 'charset') {
if (strtolower($data['info']['charset']) != strtolower($meta->attributes[$x]->value)) {
OS_crawlLog('Charset in Content-type header ('.(($data['info']['charset']) ? $data['info']['charset'] : '<none>').') differs from document charset ('.(($meta->attributes[$x]->value) ? $meta->attributes[$x]->value : '<none>').') at: '.$data['info']['url'], 1);
$data['info']['charset'] = $meta->attributes[$x]->value;
}
} else if (strtolower($meta->attributes[$x]->name) == 'http-equiv') {
switch (strtolower($meta->attributes[$x]->value)) {
case 'refresh':
for ($y = 0; $y < count($meta->attributes); $y++) {
if (strtolower($meta->attributes[$y]->name) == 'content') {
if (preg_match('/(\d+)\s?;\s?url\s?=\s?([\'"])(.+?)\2?\s?$/i', $meta->attributes[$y]->value, $m)) {
if ((int)$m[1] <= $_ODATA['sp_timeout_url']) {
$data['errno'] = 300;
$data['error'] = 'Redirected by <meta> element to: '.$m[3];
$data['info']['redirect_url'] = $m[3];
$data['info']['noindex'] = 'redirect-meta';
$data['info']['nofollow'] = true;
break 4;
} else $data['links'][] = $m[3];
}
}
}
break;
case 'content-type':
for ($y = 0; $y < count($meta->attributes); $y++) {
if (strtolower($meta->attributes[$y]->name) == 'content' && preg_match('/charset=([\w\d.:-]+)/i', $meta->attributes[$y]->value, $m)) {
if (strtolower($data['info']['charset']) != strtolower($m[1])) {
OS_crawlLog('Charset in Content-type header ('.(($data['info']['charset']) ? $data['info']['charset'] : '<none>').') differs from document charset ('.(($m[1]) ? $m[1] : '<none>').') at: '.$data['info']['url'], 1);
$data['info']['charset'] = $m[1];
}
}
}
}
} else if (strtolower($meta->attributes[$x]->name) == 'name') {
switch (strtolower($meta->attributes[$x]->value)) {
case 'keywords':
for ($y = 0; $y < count($meta->attributes); $y++)
if (strtolower($meta->attributes[$y]->name) == 'content')
$data['keywords'] = $meta->attributes[$y]->value;
break;
case 'description':
for ($y = 0; $y < count($meta->attributes); $y++)
if (strtolower($meta->attributes[$y]->name) == 'content')
$data['description'] = $meta->attributes[$y]->value;
break;
case 'robots':
case 'orcacrawler':
case 'orcaphpcrawler':
case 'orca-crawler':
case 'orcaphp-crawler':
case 'orca-phpcrawler':
case 'orca-php-crawler':
case 'orcinuscrawler':
case 'orcinus-crawler':
for ($y = 0; $y < count($meta->attributes); $y++) {
if (strtolower($meta->attributes[$y]->name) == 'content') {
$content = explode(',', $meta->attributes[$y]->value);
foreach ($content as $con) {
switch (trim(strtolower($con))) {
case 'nofollow':
$data['info']['nofollow'] = true;
break;
case 'noindex':
$data['error'] = 'Not indexed due to robots <meta> element';
$data['info']['noindex'] = 'robots-meta';
}
}
}
}
}
}
}
}
$title = $head[0]->getElementsByTagName('title');
// Keep the filename-based default if the document has no <title>
if (!empty($title[0])) $data['title'] = $title[0]->textContent;
$links = $head[0]->getElementsByTagName('link');
foreach ($links as $link) {
for ($x = 0; $x < count($link->attributes); $x++) {
if (strtolower($link->attributes[$x]->name) == 'rel') {
for ($y = 0; $y < count($link->attributes); $y++) {
if (strtolower($link->attributes[$y]->name) == 'href') {
$linkurl = filter_var($link->attributes[$y]->value, FILTER_SANITIZE_URL);
switch (strtolower($link->attributes[$x]->value)) {
case 'canonical':
if (OS_formatURL($linkurl, $data['base']) != $data['info']['url']) {
$data['info']['noindex'] = 'non-canonical';
$data['info']['canonical'] = $linkurl;
}
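// Fall through: the canonical target is still queued as a link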
case 'alternate':
case 'author':
case 'help':
case 'license':
case 'me':
case 'next':
case 'prev':
case 'search':
$data['links'][] = $linkurl;
}
break;
}
}
}
}
}
}
// ***** Process <body> elements
$body = $document->getElementsByTagName('body');
if (!empty($body[0])) {
// Replace <img> tags with their alt text
$imgs = iterator_to_array($body[0]->getElementsByTagName('img')); // snapshot: the node list is live
foreach ($imgs as $img) {
for ($x = 0; $x < count($img->attributes); $x++) {
if (strtolower($img->attributes[$x]->name) == 'alt') {
$img->parentNode->replaceChild(
$document->createTextNode(' '.$img->attributes[$x]->value.' '),
$img
);
break;
}
}
}
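// Collect <a href> links, skipping any marked rel="nofollow"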
$as = $body[0]->getElementsByTagName('a');
foreach ($as as $a) {
for ($x = 0; $x < count($a->attributes); $x++) {
if (strtolower($a->attributes[$x]->name) == 'href') {
for ($y = 0; $y < count($a->attributes); $y++)
if (strtolower($a->attributes[$y]->name) == 'rel' && strtolower($a->attributes[$y]->value) == 'nofollow') continue 3;
$data['links'][] = $a->attributes[$x]->value;
}
}
}
$areas = $body[0]->getElementsByTagName('area');
foreach ($areas as $area) {
for ($x = 0; $x < count($area->attributes); $x++) {
if (strtolower($area->attributes[$x]->name) == 'href') {
for ($y = 0; $y < count($area->attributes); $y++)
if (strtolower($area->attributes[$y]->name) == 'rel' && strtolower($area->attributes[$y]->value) == 'nofollow') continue 3;
$data['links'][] = $area->attributes[$x]->value;
}
}
}
$frames = $body[0]->getElementsByTagName('frame');
foreach ($frames as $frame)
for ($x = 0; $x < count($frame->attributes); $x++)
if (strtolower($frame->attributes[$x]->name) == 'src')
$data['links'][] = $frame->attributes[$x]->value;
$iframes = $body[0]->getElementsByTagName('iframe');
foreach ($iframes as $iframe)
for ($x = 0; $x < count($iframe->attributes); $x++)
if (strtolower($iframe->attributes[$x]->name) == 'src')
$data['links'][] = $iframe->attributes[$x]->value;
}
$data['links'] = array_map(function($l) {
if (preg_match('/^(tel|telnet|mailto|ftp|sftp|ssh|gopher|news|ldap|urn|onion|magnet):/i', $l)) return '';
return preg_replace('/#.*$/', '', $l);
}, $data['links']);
$data['links'] = array_filter(array_unique($data['links']));
// Remove tags
foreach ($_RDATA['sp_ignore_css'] as $ignoreCSS) {
switch ($ignoreCSS[0]) {
case '#': // Remove by ID
$id = $document->getElementById(substr($ignoreCSS, 1));
if (!is_null($id)) $id->parentNode->removeChild($id);
break;
case '.': // Remove by class
foreach ($xpath->evaluate('//*[contains(concat(" ", normalize-space(@class), " "), " '.substr($ignoreCSS, 1).' ")]') as $cls)
$cls->parentNode->removeChild($cls);
break;
default: // Remove by tag name
$tags = iterator_to_array($document->getElementsByTagName($ignoreCSS)); // snapshot: the node list is live
foreach ($tags as $tag)
$tag->parentNode->removeChild($tag);
}
}
// Weighted elements
foreach ($_RDATA['s_weight_css'] as $weightCSS) {
switch ($weightCSS[0]) {
case '#': // Get content by ID
$id = $document->getElementById(substr($weightCSS, 1));
if (!is_null($id)) $data['weighted'] .= $id->textContent.' ';
break;
case '.': // Get content by class
foreach ($xpath->evaluate('//*[contains(concat(" ", normalize-space(@class), " "), " '.substr($weightCSS, 1).' ")]') as $cls)
$data['weighted'] .= $cls->textContent.' ';
break;
default: // Get content by tag name
$tags = $document->getElementsByTagName($weightCSS);
foreach ($tags as $tag)
$data['weighted'] .= $tag->textContent.' ';
}
}
$data['content'] = $document->textContent;
// Could not parse HTML; try to store content anyway
} else {
$data['error'] = 'Invalid HTML - could not parse content; storing as-is';
$data['info']['nofollow'] = true;
// Remove <script> elements and <!-- comments -->
$data['content'] = preg_replace(array('/<!--.*?-->/s', '/<script.*?\/script>/is'), '', $data['body']);
$data['content'] = strip_tags($data['content']);
}
// Not sure I need to do this, but hey... I could, so...
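// Choose the entity table passed to html_entity_decode() based on the
// detected document type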
if ($data['info']['mime_type'] == 'application/xhtml+xml') {
$ent = ENT_XHTML;
} else if (!empty($document->doctype->publicId)) {
$publicId = strtoupper($document->doctype->publicId);
if (strpos($publicId, 'DTD XHTML') !== false) {
$ent = ENT_XHTML;
} else if (strpos($publicId, 'DTD HTML') !== false) {
$ent = ENT_HTML401;
} else $ent = ENT_XML1;
} else $ent = ENT_HTML5;
OS_cleanTextUTF8($data['title'], $data['info']['charset'], $ent);
OS_cleanTextUTF8($data['keywords'], $data['info']['charset'], $ent);
OS_cleanTextUTF8($data['description'], $data['info']['charset'], $ent);
OS_cleanTextUTF8($data['weighted'], $data['info']['charset'], $ent);
OS_cleanTextUTF8($data['content'], $data['info']['charset'], $ent);
break;
/* ***** PDF ********************************************* */
case 'application/pdf':
if ($_PDF) {
try {
$pdf = $_PDF->parseContent($data['body']);
$metadata = $pdf->getDetails();
// Prefer regular PDF metadata first, then try XMP
$getItems = array(
'title' => array('Title', 'dc:title', 'pdf:title'),
'description' => array('Subject', 'dc:description', 'pdf:subject'),
'keywords' => array('Keywords', 'dc:subject', 'pdf:keywords'),
'modified' => array('SourceModified', 'pdfx:sourcemodified', 'CreationDate', 'xmp:createdate')
);
foreach ($getItems as $key => $item) {
foreach ($item as $opt) {
if (!empty($metadata[$opt])) {
// Check if this is an array of list-items and if
// so, convert it to a comma-separated string
if (is_array($metadata[$opt]) && isset($metadata[$opt][0]) && is_string($metadata[$opt][0]))
$metadata[$opt] = implode(', ', $metadata[$opt]);
// Use the first valid string value we find as
// the appropriate property value
if (is_string($metadata[$opt]) && trim($metadata[$opt])) {
$data[$key] = $metadata[$opt];
break;
}
}
}
}
$data['content'] = $pdf->getText();
$data['info']['charset'] = mb_detect_encoding($data['content']);
if (!$data['info']['charset']) $data['info']['charset'] = 'CP1252';
OS_cleanTextUTF8($data['content'], $data['info']['charset']);
if (!empty($data['modified']))
if ($stamp = strtotime($data['modified']))
$data['info']['filetime'] = $stamp;
if ($data['content']) {
// Discard the PDF text if it contains Unicode control
// characters; some of these might be simple PDF ligatures
// but PDFParser doesn't support them; any content that
// contains these is usually mostly gobbledegook
if (strpos($data['content'], "\u{3}") === false &&
strpos($data['content'], "\u{2}") === false &&
strpos($data['content'], "\u{1}") === false) {
OS_cleanTextUTF8($data['title'], mb_detect_encoding($data['title']));
OS_cleanTextUTF8($data['keywords'], mb_detect_encoding($data['keywords']));
OS_cleanTextUTF8($data['description'], mb_detect_encoding($data['description']));
} else {
$data['errno'] = 703;
$data['error'] = 'Failed to decode PDF text';
$data['content'] = '';
$data['info']['noindex'] = 'couldnt-decode-pdf';
}
} else {
$data['errno'] = 702;
$data['error'] = 'PDF is empty of extractable text';
$data['info']['noindex'] = 'empty-pdf';
}
} catch (Exception $e) {
$data['errno'] = 701;
$data['error'] = 'PDF is secured/encrypted; text extraction failed';
$data['content'] = '';
$data['info']['noindex'] = 'secured-pdf';
}
} else $data['info']['noindex'] = 'missing-pdfparser';
break;
/* ***** Unknown MIME-type ******************************* */
default:
$data['error'] = 'Not indexed due to unknown MIME type ('.$data['info']['mime_type'].')';
$data['info']['noindex'] = 'unknown-mime';
}
// Else content is identical to the old entry so don't parse
} else {
$data['info']['noindex'] = 'not-modified-sha1';
}
// Else content is a duplicate of a previously stored page
} else {
// Update the stored URL to the shortest version
if (strlen($url) < strlen($_RDATA['sp_sha1'][$data['info']['sha1']])) {
$updateURL->execute(array(
'url' => $url,
'content_checksum' => $data['info']['sha1']
));
}
$data['info']['noindex'] = 'duplicate';
}
// Else the 'body' of the response was empty
} else {
$data['error'] = 'Server returned no content';
$data['info']['noindex'] = 'empty';
}
// Decide whether or not to 'index' / store this page
switch ($data['info']['noindex']) {
// ***** There is no 'noindex' reason, so store the page
case '':
case 'not-modified-304':
case 'not-modified-sha1':
if ($referer == '<orphan>') {
$data['info']['status'] = 'Orphan';
$_RDATA['sp_status']['Orphan']++;
} else $data['info']['status'] = 'OK';
// ***** If we got new or updated content for this URL
if (!$data['info']['noindex']) {
// If this URL exists (or existed) in the live table...
if (in_array($url, $_RDATA['sp_exist'], true) || $referer == '<orphan>') {
$_RDATA['sp_status']['Updated']++;
$selectData->execute(array('url' => $url));
$err = $selectData->errorInfo();
if ($err[0] != '00000') {
OS_crawlLog('Database select error: '.$url, 2);
OS_crawlLog($err[2], 0);
break 2;
}
$row = $selectData->fetchAll()[0];
// Else provide default values for a new URL
} else {
$_RDATA['sp_status']['New']++;
$row = array(
'category' => $_ODATA['sp_category_default'],
'flag_unlisted' => 0,
'priority' => 0.5
);
}
if ($data['info']['filetime'] <= 0)
$data['info']['filetime'] = time();
// Remove text from titles
foreach ($_RDATA['sp_title_strip'] as $titleStrip) {
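// As with the URL rules, a leading '*' marks the rest as a regex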
if ($titleStrip[0] == '*') {
$data['title'] = preg_replace('/'.str_replace('/', '\/', substr($titleStrip, 1)).'/', '', $data['title']);
} else $data['title'] = str_replace($titleStrip, '', $data['title']);
}
$port = (!empty($data['url']['port'])) ? ':'.$data['url']['port'] : '';
$insertTemp->execute(array(
'url' => $url,
'title' => trim($data['title']),
'description' => $data['description'],
'keywords' => $data['keywords'],
'category' => $row['category'],
'weighted' => $data['weighted'],
'links' => json_encode($data['links'], JSON_INVALID_UTF8_IGNORE),
'content' => $data['content'],
'content_mime' => $data['info']['mime_type'],
'content_charset' => $data['info']['charset'],
'content_checksum' => $data['info']['sha1'],
'status' => $data['info']['status'],
'flag_unlisted' => $row['flag_unlisted'],
'flag_updated' => 1,
'last_modified' => $data['info']['filetime'],
'priority' => $row['priority']
));
if (!$insertTemp->rowCount()) {
OS_crawlLog('Database primary insert error: '.$url, 2);
$err = $insertTemp->errorInfo();
if ($err[0] != '00000') OS_crawlLog($err[2], 0);
} else $_RDATA['sp_store'][] = $url;
// ***** URL hasn't been modified since the last successful crawl
} else {
OS_crawlLog('Page hasn\'t been modified since the last successful crawl', 0);
// Preset the 'last_modified' time and 'priority' until we can
// find out the actual values from the previous database record
$data['info']['filetime'] = time();
$row = array('priority' => 0.5);
// Get previous entry from existing search database
$insertNotModified->execute(array(
'url' => $url,
'status' => $data['info']['status']
));
if ($insertNotModified->rowCount()) {
// Mark as 'stored'
$_RDATA['sp_store'][] = $url;
// Get 'priority' & 'last_modified' values for the sitemap
// Load the previously saved link list to add to the queue
$selectData->execute(array('url' => $url));
$err = $selectData->errorInfo();
if ($err[0] == '00000') {
$row = $selectData->fetchAll()[0];
$data['links'] = json_decode($row['links'], true);
$data['info']['filetime'] = $row['last_modified'];
} else OS_crawlLog('Database existing table row read error: '.$url, 2);
// Could not insert previously stored row into temp table
} else {
OS_crawlLog('Database \'not-modified\' insert error: '.$url, 2);
$err = $insertNotModified->errorInfo();
if ($err[0] != '00000') OS_crawlLog($err[2], 0);
}
}
$domain = $data['url']['scheme'].'://'.$data['url']['host'];
if (!isset($_RDATA['sp_domains'][$domain])) {
$_RDATA['sp_domains'][$domain] = 1;
} else $_RDATA['sp_domains'][$domain]++;
// Store data for use in the sitemap
if ($_ODATA['sp_sitemap_file'] &&
$data['url']['host'] == $_ODATA['sp_sitemap_hostname']) {
$delta = time() - $data['info']['filetime'];
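// Bucket thresholds: 45 min, 18 h, 5 days, 25 days, 250 days, ~2 years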
$cf = 'always';
if ($delta > 2700 && $delta <= 64800) $cf = 'hourly';
if ($delta > 64800 && $delta <= 432000) $cf = 'daily';
if ($delta > 432000 && $delta <= 2160000) $cf = 'weekly';
if ($delta > 2160000 && $delta <= 21600000) $cf = 'monthly';
if ($delta > 21600000 && $delta <= 62400000) $cf = 'yearly';
if ($delta > 62400000) $cf = 'never';
$_RDATA['sp_sitemap'][] = array(
'loc' => str_replace(' ', '%20', htmlentities($url)),
'lastmod' => date('Y-m-d', $data['info']['filetime']),
'changefreq' => $cf,
'priority' => $row['priority']
);
}
break;
// ***** Otherwise, log the reason why this page was not stored
case 'duplicate':
OS_crawlLog('Content is a duplicate of already indexed page: '.$_RDATA['sp_sha1'][$data['info']['sha1']].' (Referrer was: '.$referer.')', 2);
break;
case 'timeout':
case 'network-error':
case 'couldnt-connect':
OS_crawlLog($data['error'].': '.$url, 2);
if ($referer == '<orphan>') $_RDATA['sp_status']['Blocked']++;
break;
case 'empty':
case 'too-large':
case 'robots-meta':
case 'robots-http':
case 'unknown-mime':
case 'self-reference':
case 'empty-pdf':
case 'secured-pdf':
case 'couldnt-decode-pdf':
OS_crawlLog($data['error'], 1);
if ($referer == '<orphan>') $_RDATA['sp_status']['Blocked']++;
break;
case '400':
OS_crawlLog($data['error'].': '.$url.' (Referrer was: '.$referer.')', 2);
if ($referer == '<orphan>') $_RDATA['sp_status']['Not Found']++;
break;
case 'redirect-meta':
case 'redirect-location':
OS_crawlLog($data['error'].': '.$url.' (Referrer was: '.$referer.')', 2);
OS_crawlLog('Page was removed in favour of redirected URL', 0);
$data['links'][] = $data['info']['redirect_url'];
break;
case 'non-canonical':
OS_crawlLog('Not indexed due to canonical <link> element: '.$data['info']['canonical'], 1);
OS_crawlLog('Referrer was: '.$referer, 0);
break;
default:
OS_crawlLog('Not indexed due to noindex rule \''.$data['info']['noindex'].'\': '.$url.' (Referrer was: '.$referer.')', 2);
if ($referer == '<orphan>') $_RDATA['sp_status']['Blocked']++;
break;
}
// Check if we have stored the maximum allowed number of pages
if (count($_RDATA['sp_store']) >= $_ODATA['sp_limit_store']) {
OS_crawlLog('Maximum number of crawled pages reached ('.$_ODATA['sp_limit_store'].')', 1);
$_RDATA['sp_complete'] = true;
break;
}
// If we fetched more links from the content above, parse and add
// them to the queue
if (!$data['info']['nofollow']) {
foreach ($data['links'] as $link) {
$link = OS_formatURL($link, $data['base']);
// ***** If this link hasn't been crawled yet
if (!in_array($link, $_RDATA['sp_links'], true)) {
// ... and if link hasn't been queued yet
foreach ($_RDATA['sp_queue'] as $queue)
if ($link == $queue[0]) continue 2;
// ... and if link passes our user filters
if ($nx = OS_filterURL($link, $data['base'])) {
OS_crawlLog('Link ignored due to noindex rule \''.$nx.'\': '.$link, 0);
// ... then add the link to the queue
} else $_RDATA['sp_queue'][] = array($link, $depth + 1, $url);
}
}
}
// If we've completed the queue, check for orphans
if (!count($_RDATA['sp_queue'])) {
// Diff the previous URL list with the links we've already scanned
$_RDATA['sp_exist'] = array_diff($_RDATA['sp_exist'], $_RDATA['sp_links']);
// If we have leftover links, and we aren't autodeleting them
if (count($_RDATA['sp_exist']) && !$_ODATA['sp_autodelete']) {
OS_crawlLog('Adding '.count($_RDATA['sp_exist']).' orphan(s) to queue...', 1);
foreach ($_RDATA['sp_exist'] as $key => $link) {
// Check if orphan URL passes our user filters
if ($nx = OS_filterURL($link, $data['base'])) {
// If not, remove it from the sp_exist list
OS_crawlLog('Orphan URL ignored due to noindex rule \''.$nx.'\': '.$link, 0);
$_RDATA['sp_status']['Blocked']++;
unset($_RDATA['sp_exist'][$key]);
// If so, then add the orphan to the queue
} else $_RDATA['sp_queue'][] = array($link, 0, '<orphan>');
}
// Else if we stored some pages, we're done
} else if (count($_RDATA['sp_store'])) {
$_RDATA['sp_complete'] = true;
// No pages were stored
} else OS_crawlLog('No pages could be indexed; check your starting URL(s)', 2);
}
gc_collect_cycles();
usleep($_ODATA['sp_sleep'] * 1000);
$_RDATA['sp_sleep'] += $_ODATA['sp_sleep'];
}
// ***** Write sitemap
if ($_RDATA['sp_complete'] && $_ODATA['sp_sitemap_file']) {
if ($_RDATA['sp_sitemap_file'] != 'does not exist') {
if ($_RDATA['sp_sitemap_file'] != 'not writable') {
$sm = array('<?xml version="1.0" encoding="UTF-8"?>');
$sm[] = '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">';
foreach ($_RDATA['sp_sitemap'] as $sitemap) {
$sm[] = ' <url>';
foreach ($sitemap as $key => $value)
if ($key != 'priority' || $value != 0.5)
$sm[] = ' <'.$key.'>'.$value.'</'.$key.'>';
$sm[] = ' </url>';
}
$sm[] = '</urlset>';
if (preg_match('/\.xml\.gz$/', $_RDATA['sp_sitemap_file'])) {
if (function_exists('gzopen')) {
$smf = gzopen($_RDATA['sp_sitemap_file'], 'w');
gzwrite($smf, implode("\n", $sm));
gzclose($smf);
OS_crawlLog('Sitemap written successfully: '.$_ODATA['sp_sitemap_file'], 1);
} else OS_crawlLog('Could not write sitemap; PHP gzip functions are not enabled', 2);
} else if (preg_match('/\.xml$/', $_RDATA['sp_sitemap_file'])) {
$smf = fopen($_RDATA['sp_sitemap_file'], 'w');
fwrite($smf, implode("\n", $sm));
fclose($smf);
OS_crawlLog('Sitemap written successfully: '.$_ODATA['sp_sitemap_file'], 1);
} else OS_crawlLog('Sitemap filename ('.$_ODATA['sp_sitemap_file'].') must have extension \'.xml\' or \'.xml.gz\'', 2);
} else OS_crawlLog('Sitemap file \''.$_ODATA['sp_sitemap_file'].'\' is not writable', 2);
} else OS_crawlLog('Sitemap file \''.$_ODATA['sp_sitemap_file'].'\' does not exist', 2);
} ?>