From 8d99e4fd419346111e837da4944beaf3cef50da0 Mon Sep 17 00:00:00 2001 From: Brian Huisman Date: Fri, 21 Apr 2023 11:27:46 -0400 Subject: [PATCH] Enable downloading Query Log CSV --- README.md | 7 ++-- orcinus/admin.php | 62 ++++++++++++++++++++++++++++++--- orcinus/config.php | 1 - orcinus/js/admin.js | 85 ++++++++++++++++++++++++++++----------------- 4 files changed, 114 insertions(+), 41 deletions(-) diff --git a/README.md b/README.md index 8766cee..7d55d5c 100644 --- a/README.md +++ b/README.md @@ -6,15 +6,15 @@ ![banner](https://user-images.githubusercontent.com/137631/233513460-93269bbb-f218-4b00-a7bf-7fc4575d15d4.png) -The **Orcinus Site Search** PHP script is an all-in-one website crawler and search engine that extracts searchable content from XML, HTML and PDF files at a single, or multiple websites. It replaces 3rd party, remote search solutions such as Google etc. +The **Orcinus Site Search** PHP script is an all-in-one website crawler and search engine that extracts searchable content from plain text, XML, HTML and PDF files at a single, or multiple websites. It replaces 3rd party, remote search solutions such as Google etc. **Orcinus** will crawl your website content on a schedule, or at your command via the admin UI or even by CLI/crontab. Crawler log output conveniently informs you of missing pages, links that redirect, and other errors that you, as a webmaster can fix to keep your user experience tight. A full-featured, responsive administration GUI allows you to adjust crawl settings, view and edit all crawled pages, customize search results, and view a log of all searched queries. You also have complete control over the appearance of your search results with a [convenient templating system](https://mustache.github.io/). -Optionally, **Orcinus** can generate a [sitemap XML or XML.GZ](https://www.sitemaps.org) file of your pages after every crawl, suitable for uploading to Google analytics. 
It can also export a JavaScript version of the entire search engine that works with offline mirrors, such as those generated by [HTTrack](https://www.httrack.com). +Optionally, **Orcinus** can generate a [sitemap .xml or .xml.gz](https://www.sitemaps.org) file of your pages after every crawl, suitable for uploading to the [Google Search Console](https://search.google.com/search-console/sitemaps). It can also export a JavaScript version of the entire search engine that works with offline mirrors, such as those generated by [HTTrack](https://www.httrack.com). ### Requirements: - PHP >= 7.2.x -- MySQL / MariaDB +- MySQL / MariaDB >= 10.0.5 ### 3rd Party Libraries: Included: @@ -27,7 +27,6 @@ Optional: - [Maxmind GeoIP2](https://github.com/maxmind/GeoIP2-php) ## Getting Started - 1. Copy the `orcinus` directory to your root web directory. 2. Fill out your SQL and desired credential details in the `orcinus/config.ini.php` file. 3. Visit `yourdomain.com/orcinus/admin.php` in your favourite web browser and log in. diff --git a/orcinus/admin.php b/orcinus/admin.php index 9696d88..7607ef1 100644 --- a/orcinus/admin.php +++ b/orcinus/admin.php @@ -181,7 +181,7 @@ if (!$_SESSION['admin_username']) { } break; - // Download a text file log of the most recent crawl + // Download a text file of the most recent crawl or query log case 'download': if (empty($_POST->content)) $_POST->content = ''; switch ($_POST->content) { @@ -202,6 +202,7 @@ if (!$_SESSION['admin_username']) { header('Content-type: text/plain; charset='.strtolower($_ODATA['s_charset'])); header('Content-disposition: attachment; filename="'. 
'crawl-log'.$_POST->grep.'_'.date('Y-m-d', $_ODATA['sp_time_end']).'.txt"'); + die(implode("\n", $lines)); } else { @@ -218,6 +219,55 @@ if (!$_SESSION['admin_username']) { } break; + case 'query_log': + $querylog = $_DDATA['pdo']->query( + 'SELECT `query`, `results`, `stamp`, INET_NTOA(`ip`) AS `ipaddr` + FROM `'.$_DDATA['tbprefix'].'query` ORDER BY `stamp` DESC;' + ); + $err = $querylog->errorInfo(); + if ($err[0] == '00000') { + + $querylog = $querylog->fetchAll(PDO::FETCH_ASSOC); /* FETCH_ASSOC: FETCH_BOTH would duplicate every CSV column */ + if (count($querylog)) { + + header('Content-type: text/csv; charset='.strtolower($_ODATA['s_charset'])); + header('Content-disposition: attachment; filename="'. + 'query-log_'.date('Y-m-d').'.csv"'); + + $output = fopen('php://output', 'w'); + + $headings = array('Query', 'Results', 'Time Stamp', 'IP'); + if ($_GEOIP2) $headings[] = 'Country'; + + fputcsv($output, $headings); + foreach ($querylog as $line) { + $line['stamp'] = date('c', $line['stamp']); + + if ($_GEOIP2) { + try { + $geo = $_GEOIP2->country($line['ipaddr']); + } catch(Exception $e) { $geo = false; } + } else $geo = false; + if ($_GEOIP2) $line['country'] = $geo ? $geo->raw['country']['names']['en'] : ''; /* always fill the Country column so rows stay aligned when lookup fails */ + + fputcsv($output, $line); + } + die(); + + } else { + $response = array( + 'status' => 'Error', + 'message' => 'The query log is empty; nothing to download' + ); + } + } else { + $response = array( + 'status' => 'Error', + 'message' => 'Could not read the query log database' + ); + } + break; + default: $response = array( 'status' => 'Error', @@ -2804,9 +2854,13 @@ document.write(mustache.render( * Query Log *********************************************** */ case 'queries': ?>
-
+

Query Log

-
+
+ +
@@ -3004,7 +3058,7 @@ document.write(mustache.render( Note: You may close this popup and/or leave the page while the crawler is running.

+ if ($_ODATA['sp_crawling']) echo ' disabled="disabled"'; ?>>Download Crawl Log
diff --git a/orcinus/config.php b/orcinus/config.php index 3a1b3e7..2ae7df2 100644 --- a/orcinus/config.php +++ b/orcinus/config.php @@ -744,7 +744,6 @@ $_RDATA['s_latin'] = array( ); $_RDATA['s_filetypes'] = array( 'PDF' => array('application/pdf'), - 'JPG' => array('image/jpeg'), 'HTML' => array('text/html', 'application/xhtml+xml'), 'XML' => array('text/xml', 'application/xml'), 'TXT' => array('text/plain') ); diff --git a/orcinus/js/admin.js b/orcinus/js/admin.js index ef78608..639c1f4 100644 --- a/orcinus/js/admin.js +++ b/orcinus/js/admin.js @@ -1,6 +1,45 @@ /* ***** Orcinus Site Search - Administration UI Javascript ******** */ +/** + * Request a file from the server and trigger a download prompt + * + */ +let os_download = function(defaultFilename, postValues) { + fetch(new Request('./admin.php'), { + method: 'POST', + headers: { 'Content-type': 'application/json' }, + body: JSON.stringify(postValues) + }) + .then((response) => { + if (response.status === 200) { + let ct = (response.headers.get('content-type') || '').trim(); + if (ct.indexOf('application/json') === 0) { + response.json().then((data) => { + if (data.status == 'Error') + alert(data.message); + }); + } else { + let cd = response.headers.get('content-disposition'); + if (cd) { + let filename = cd.match(/filename="([^"]+)"/); + filename = (filename && filename.length > 1) ? 
filename[1] : defaultFilename; + response.blob().then((blob) => { + let file = window.URL.createObjectURL(blob); + let a = document.createElement('a'); + a.href = file; + a.download = filename; + document.body.appendChild(a); + a.click(); + a.remove(); + }); + } else alert('Something went wrong!'); + } + } + }); +} + + // Enable Popper.js tooltips let toolTipElems = document.querySelectorAll('[data-bs-toggle="tooltip"]'); let toolTipList = [...toolTipElems].map(elem => new bootstrap.Tooltip(elem)); @@ -300,6 +339,16 @@ if (queriesModal) { }, false); } +let os_query_log_download = document.getElementById('os_query_log_download'); +if (os_query_log_download) { + os_query_log_download.addEventListener('click', function() { + os_download('query-log.txt', { + action: 'download', + content: 'query_log' + }); + }, false); +} + /* ***** Crawler Modal ********************************************* */ let os_get_crawl_progress = function() { @@ -512,38 +561,10 @@ os_crawl_cancel.addEventListener('click', function() { }, false); os_crawl_log_download.addEventListener('click', function() { - fetch(new Request('./admin.php'), { - method: 'POST', - headers: { 'Content-type': 'application/json' }, - body: JSON.stringify({ - action: 'download', - content: 'crawl_log', - grep: document.querySelector('input[name="os_crawl_grep"]:checked').value - }) - }) - .then((response) => { - if (response.status === 200) { - let ct = response.headers.get('content-type').trim(); - if (ct == 'application/json') { - response.json().then((data) => { - if (data.status == 'Error') - alert(data.message); - }); - } else { - let cd = response.headers.get('content-disposition'); - let filename = cd.match(/filename="([^"]+)"/); - filename = (filename.length > 1) ? 
filename[1] : 'log.txt'; - response.blob().then((blob) => { - let file = window.URL.createObjectURL(blob); - let a = document.createElement('a'); - a.href = file; - a.download = filename; - document.body.appendChild(a); - a.click(); - a.remove(); - }); - } - } + os_download('crawl-log.txt', { + action: 'download', + content: 'crawl_log', + grep: document.querySelector('input[name="os_crawl_grep"]:checked').value }); }, false);