Enable downloading Query Log CSV

This commit is contained in:
Brian Huisman 2023-04-21 11:27:46 -04:00
parent b93c75e132
commit 8d99e4fd41
4 changed files with 114 additions and 41 deletions

View file

@ -6,15 +6,15 @@
![banner](https://user-images.githubusercontent.com/137631/233513460-93269bbb-f218-4b00-a7bf-7fc4575d15d4.png)
The **Orcinus Site Search** PHP script is an all-in-one website crawler and search engine that extracts searchable content from XML, HTML and PDF files at a single, or multiple websites. It replaces 3rd party, remote search solutions such as Google etc.
The **Orcinus Site Search** PHP script is an all-in-one website crawler and search engine that extracts searchable content from plain text, XML, HTML and PDF files at a single, or multiple websites. It replaces 3rd party, remote search solutions such as Google.
**Orcinus** will crawl your website content on a schedule, or at your command via the admin UI or even by CLI/crontab. Crawler log output conveniently informs you of missing pages, links that redirect, and other errors that you, as a webmaster can fix to keep your user experience tight. A full-featured, responsive administration GUI allows you to adjust crawl settings, view and edit all crawled pages, customize search results, and view a log of all searched queries. You also have complete control over the appearance of your search results with a [convenient templating system](https://mustache.github.io/).
Optionally, **Orcinus** can generate a [sitemap XML or XML.GZ](https://www.sitemaps.org) file of your pages after every crawl, suitable for uploading to Google analytics. It can also export a JavaScript version of the entire search engine that works with offline mirrors, such as those generated by [HTTrack](https://www.httrack.com).
Optionally, **Orcinus** can generate a [sitemap .xml or .xml.gz](https://www.sitemaps.org) file of your pages after every crawl, suitable for uploading to the [Google Search Console](https://search.google.com/search-console/sitemaps). It can also export a JavaScript version of the entire search engine that works with offline mirrors, such as those generated by [HTTrack](https://www.httrack.com).
### Requirements:
- PHP >= 7.2.x
- MySQL / MariaDB
- MySQL / MariaDB >= 10.0.5
### 3rd Party Libraries:
Included:
@ -27,7 +27,6 @@ Optional:
- [Maxmind GeoIP2](https://github.com/maxmind/GeoIP2-php)
## Getting Started
1. Copy the `orcinus` directory to your root web directory.
2. Fill out your SQL and desired credential details in the `orcinus/config.ini.php` file.
3. Visit `yourdomain.com/orcinus/admin.php` in your favourite web browser and log in.

View file

@ -181,7 +181,7 @@ if (!$_SESSION['admin_username']) {
}
break;
// Download a text file log of the most recent crawl
// Download a text file of the most recent crawl or query log
case 'download':
if (empty($_POST->content)) $_POST->content = '';
switch ($_POST->content) {
@ -202,6 +202,7 @@ if (!$_SESSION['admin_username']) {
header('Content-type: text/plain; charset='.strtolower($_ODATA['s_charset']));
header('Content-disposition: attachment; filename="'.
'crawl-log'.$_POST->grep.'_'.date('Y-m-d', $_ODATA['sp_time_end']).'.txt"');
die(implode("\n", $lines));
} else {
@ -218,6 +219,55 @@ if (!$_SESSION['admin_username']) {
}
break;
// Stream the entire query log to the client as a CSV attachment
case 'query_log':
  $querylog = $_DDATA['pdo']->query(
    'SELECT `query`, `results`, `stamp`, INET_NTOA(`ip`) AS `ipaddr`
       FROM `'.$_DDATA['tbprefix'].'query` ORDER BY `stamp` DESC;'
  );
  $err = $querylog->errorInfo();
  if ($err[0] == '00000') {

    // Explicit FETCH_ASSOC: the default FETCH_BOTH mode would duplicate
    // every value as both a numeric and a string key, doubling CSV columns
    $querylog = $querylog->fetchAll(PDO::FETCH_ASSOC);
    if (count($querylog)) {
      header('Content-type: text/csv; charset='.strtolower($_ODATA['s_charset']));
      header('Content-disposition: attachment; filename="'.
             'query-log_'.date('Y-m-d').'.csv"');

      $output = fopen('php://output', 'w');

      $headings = array('Query', 'Results', 'Time Stamp', 'IP');
      if ($_GEOIP2) $headings[] = 'Country';
      fputcsv($output, $headings);

      foreach ($querylog as $line) {
        $line['stamp'] = date('c', $line['stamp']);
        if ($_GEOIP2) {
          try {
            $geo = $_GEOIP2->country($line['ipaddr']);
          } catch (Exception $e) { $geo = false; }

          // Always append the column (empty on a failed lookup) so every
          // row has the same number of fields as the heading row
          $line['country'] = $geo ? ($geo->raw['country']['names']['en'] ?? '') : '';
        }
        fputcsv($output, $line);
      }
      fclose($output);
      die();

    } else {
      $response = array(
        'status' => 'Error',
        'message' => 'The query log is empty; nothing to download'
      );
    }
  } else {
    $response = array(
      'status' => 'Error',
      'message' => 'Could not read the query log database'
    );
  }
  break;
default:
$response = array(
'status' => 'Error',
@ -2804,9 +2854,13 @@ document.write(mustache.render(
* Query Log *********************************************** */
case 'queries': ?>
<section class="row justify-content-center">
<header class="col-12 mb-2">
<header class="col-5 mb-2">
<h2>Query Log</h2>
</header><?php
</header>
<div class="col-7 mb-2 text-end text-nowrap">
<button type="button" class="btn btn-primary" id="os_query_log_download"<?php
if ($_ODATA['sp_crawling']) echo ' disabled="disabled"'; ?>>Download Query Log</button>
</div><?php
if (is_array($_RDATA['query_log_rows']) && count($_RDATA['query_log_rows'])) { ?>
<div class="col-xl-10 col-xxl-8">
@ -3004,7 +3058,7 @@ document.write(mustache.render(
<strong>Note:</strong> You may close this popup and/or leave the page while the crawler is running.
</p>
<button type="button" class="btn btn-primary" id="os_crawl_log_download"<?php
if ($_ODATA['sp_crawling']) echo ' disabled="disabled"'; ?>>Download Log</button>
if ($_ODATA['sp_crawling']) echo ' disabled="disabled"'; ?>>Download Crawl Log</button>
</div>
</div>
</div>

View file

@ -744,7 +744,6 @@ $_RDATA['s_latin'] = array(
);
$_RDATA['s_filetypes'] = array(
'PDF' => array('application/pdf'),
'JPG' => array('image/jpeg'),
'HTML' => array('text/html', 'application/xhtml+xml'),
'XML' => array('text/xml', 'application/xml'),
'TXT' => array('text/plain')

View file

@ -1,6 +1,45 @@
/* ***** Orcinus Site Search - Administration UI Javascript ******** */
/**
* Request a file from the server and trigger a download prompt
*
*/
/**
 * Request a file from the server and trigger a download prompt
 *
 * @param {string} defaultFilename - Fallback name used when the server
 *   sends no (or an unparsable) Content-Disposition header
 * @param {Object} postValues - JSON-serializable body for the POST request
 */
let os_download = function(defaultFilename, postValues) {
  fetch(new Request('./admin.php'), {
    method: 'POST',
    headers: { 'Content-type': 'application/json' },
    body: JSON.stringify(postValues)
  })
  .then((response) => {
    // Surface HTTP failures instead of silently doing nothing
    if (response.status !== 200) {
      alert('Download failed (HTTP ' + response.status + ')');
      return;
    }

    // Header may be absent; guard before .trim() to avoid a TypeError
    let ct = (response.headers.get('content-type') || '').trim();
    if (ct.indexOf('application/json') === 0) {
      // Server reported an error as JSON instead of sending a file
      response.json().then((data) => {
        if (data.status == 'Error')
          alert(data.message);
      });
    } else {
      let cd = response.headers.get('content-disposition');
      if (cd) {
        // match() returns null when no filename="" is present; fall back
        // to the caller-supplied default rather than throwing
        let match = cd.match(/filename="([^"]+)"/);
        let filename = (match && match.length > 1) ? match[1] : defaultFilename;
        response.blob().then((blob) => {
          let file = window.URL.createObjectURL(blob);
          let a = document.createElement('a');
          a.href = file;
          a.download = filename;
          document.body.appendChild(a);
          a.click();
          a.remove();
          // Release the blob reference once the click has been dispatched
          window.URL.revokeObjectURL(file);
        });
      } else alert('Something went wrong!');
    }
  })
  // Network-level failure (server unreachable, CORS, etc.)
  .catch(() => alert('Something went wrong!'));
}
// Enable Popper.js tooltips
let toolTipElems = document.querySelectorAll('[data-bs-toggle="tooltip"]');
let toolTipList = [...toolTipElems].map(elem => new bootstrap.Tooltip(elem));
@ -300,6 +339,16 @@ if (queriesModal) {
}, false);
}
// Wire up the Query Log download button (only present on the Queries tab)
let os_query_log_download = document.getElementById('os_query_log_download');
if (os_query_log_download) {
  os_query_log_download.addEventListener('click', () => {
    os_download('query-log.txt', { action: 'download', content: 'query_log' });
  }, false);
}
/* ***** Crawler Modal ********************************************* */
let os_get_crawl_progress = function() {
@ -512,38 +561,10 @@ os_crawl_cancel.addEventListener('click', function() {
}, false);
os_crawl_log_download.addEventListener('click', function() {
fetch(new Request('./admin.php'), {
method: 'POST',
headers: { 'Content-type': 'application/json' },
body: JSON.stringify({
action: 'download',
content: 'crawl_log',
grep: document.querySelector('input[name="os_crawl_grep"]:checked').value
})
})
.then((response) => {
if (response.status === 200) {
let ct = response.headers.get('content-type').trim();
if (ct == 'application/json') {
response.json().then((data) => {
if (data.status == 'Error')
alert(data.message);
});
} else {
let cd = response.headers.get('content-disposition');
let filename = cd.match(/filename="([^"]+)"/);
filename = (filename.length > 1) ? filename[1] : 'log.txt';
response.blob().then((blob) => {
let file = window.URL.createObjectURL(blob);
let a = document.createElement('a');
a.href = file;
a.download = filename;
document.body.appendChild(a);
a.click();
a.remove();
});
}
}
os_download('crawl-log.txt', {
action: 'download',
content: 'crawl_log',
grep: document.querySelector('input[name="os_crawl_grep"]:checked').value
});
}, false);