From 8d99e4fd419346111e837da4944beaf3cef50da0 Mon Sep 17 00:00:00 2001 From: Brian Huisman Date: Fri, 21 Apr 2023 11:27:46 -0400 Subject: [PATCH] Enable downloading Query Log CSV --- README.md | 7 ++-- orcinus/admin.php | 62 ++++++++++++++++++++++++++++++--- orcinus/config.php | 1 - orcinus/js/admin.js | 85 ++++++++++++++++++++++++++++----------------- 4 files changed, 114 insertions(+), 41 deletions(-) diff --git a/README.md b/README.md index 8766cee..7d55d5c 100644 --- a/README.md +++ b/README.md @@ -6,15 +6,15 @@ ![banner](https://user-images.githubusercontent.com/137631/233513460-93269bbb-f218-4b00-a7bf-7fc4575d15d4.png) -The **Orcinus Site Search** PHP script is an all-in-one website crawler and search engine that extracts searchable content from XML, HTML and PDF files at a single, or multiple websites. It replaces 3rd party, remote search solutions such as Google etc. +The **Orcinus Site Search** PHP script is an all-in-one website crawler and search engine that extracts searchable content from plain text, XML, HTML and PDF files at a single, or multiple websites. It replaces 3rd party, remote search solutions such as Google etc. **Orcinus** will crawl your website content on a schedule, or at your command via the admin UI or even by CLI/crontab. Crawler log output conveniently informs you of missing pages, links that redirect, and other errors that you, as a webmaster can fix to keep your user experience tight. A full-featured, responsive administration GUI allows you to adjust crawl settings, view and edit all crawled pages, customize search results, and view a log of all searched queries. You also have complete control over the appearance of your search results with a [convenient templating system](https://mustache.github.io/). -Optionally, **Orcinus** can generate a [sitemap XML or XML.GZ](https://www.sitemaps.org) file of your pages after every crawl, suitable for uploading to Google analytics. 
It can also export a JavaScript version of the entire search engine that works with offline mirrors, such as those generated by [HTTrack](https://www.httrack.com). +Optionally, **Orcinus** can generate a [sitemap .xml or .xml.gz](https://www.sitemaps.org) file of your pages after every crawl, suitable for uploading to the [Google Search Console](https://search.google.com/search-console/sitemaps). It can also export a JavaScript version of the entire search engine that works with offline mirrors, such as those generated by [HTTrack](https://www.httrack.com). ### Requirements: - PHP >= 7.2.x -- MySQL / MariaDB +- MySQL / MariaDB >= 10.0.5 ### 3rd Party Libraries: Included: @@ -27,7 +27,6 @@ Optional: - [Maxmind GeoIP2](https://github.com/maxmind/GeoIP2-php) ## Getting Started - 1. Copy the `orcinus` directory to your root web directory. 2. Fill out your SQL and desired credential details in the `orcinus/config.ini.php` file. 3. Visit `yourdomain.com/orcinus/admin.php` in your favourite web browser and log in. diff --git a/orcinus/admin.php b/orcinus/admin.php index 9696d88..7607ef1 100644 --- a/orcinus/admin.php +++ b/orcinus/admin.php @@ -181,7 +181,7 @@ if (!$_SESSION['admin_username']) { } break; - // Download a text file log of the most recent crawl + // Download a text file of the most recent crawl or query log case 'download': if (empty($_POST->content)) $_POST->content = ''; switch ($_POST->content) { @@ -202,6 +202,7 @@ if (!$_SESSION['admin_username']) { header('Content-type: text/plain; charset='.strtolower($_ODATA['s_charset'])); header('Content-disposition: attachment; filename="'. 
'crawl-log'.$_POST->grep.'_'.date('Y-m-d', $_ODATA['sp_time_end']).'.txt"'); + die(implode("\n", $lines)); } else { @@ -218,6 +219,55 @@ if (!$_SESSION['admin_username']) { } break; + case 'query_log': + $querylog = $_DDATA['pdo']->query( + 'SELECT `query`, `results`, `stamp`, INET_NTOA(`ip`) AS `ipaddr` + FROM `'.$_DDATA['tbprefix'].'query` ORDER BY `stamp` DESC;' + ); + $err = $querylog->errorInfo(); + if ($err[0] == '00000') { + + $querylog = $querylog->fetchAll(PDO::FETCH_ASSOC); /* FETCH_ASSOC: FETCH_BOTH would duplicate every CSV column */ + if (count($querylog)) { + + header('Content-type: text/csv; charset='.strtolower($_ODATA['s_charset'])); + header('Content-disposition: attachment; filename="'. + 'query-log_'.date('Y-m-d').'.csv"'); + + $output = fopen('php://output', 'w'); + + $headings = array('Query', 'Results', 'Time Stamp', 'IP'); + if ($_GEOIP2) $headings[] = 'Country'; + + fputcsv($output, $headings); + foreach ($querylog as $line) { + $line['stamp'] = date('c', $line['stamp']); + + if ($_GEOIP2) { + try { + $geo = $_GEOIP2->country($line['ipaddr']); + } catch(Exception $e) { $geo = false; } + } else $geo = false; + if ($_GEOIP2) $line['country'] = $geo ? $geo->raw['country']['names']['en'] : ''; /* always fill the Country column so rows stay aligned when lookup fails */ + + fputcsv($output, $line); + } + die(); + + } else { + $response = array( + 'status' => 'Error', + 'message' => 'The query log is empty; nothing to download' + ); + } + } else { + $response = array( + 'status' => 'Error', + 'message' => 'Could not read the query log database' + ); + } + break; + default: $response = array( 'status' => 'Error', @@ -2804,9 +2854,13 @@ document.write(mustache.render( * Query Log *********************************************** */ case 'queries': ?>
-
+

Query Log

-
+
+ +
@@ -3004,7 +3058,7 @@ document.write(mustache.render( Note: You may close this popup and/or leave the page while the crawler is running.

+ if ($_ODATA['sp_crawling']) echo ' disabled="disabled"'; ?>>Download Crawl Log
diff --git a/orcinus/config.php b/orcinus/config.php index 3a1b3e7..2ae7df2 100644 --- a/orcinus/config.php +++ b/orcinus/config.php @@ -744,7 +744,6 @@ $_RDATA['s_latin'] = array( ); $_RDATA['s_filetypes'] = array( 'PDF' => array('application/pdf'), - 'JPG' => array('image/jpeg'), 'HTML' => array('text/html', 'application/xhtml+xml'), 'XML' => array('text/xml', 'application/xml'), 'TXT' => array('text/plain') ); diff --git a/orcinus/js/admin.js b/orcinus/js/admin.js index ef78608..639c1f4 100644 --- a/orcinus/js/admin.js +++ b/orcinus/js/admin.js @@ -1,6 +1,45 @@ /* ***** Orcinus Site Search - Administration UI Javascript ******** */ +/** + * Request a file from the server and trigger a download prompt + * + */ +let os_download = function(defaultFilename, postValues) { + fetch(new Request('./admin.php'), { + method: 'POST', + headers: { 'Content-type': 'application/json' }, + body: JSON.stringify(postValues) + }) + .then((response) => { + if (response.status === 200) { + let ct = (response.headers.get('content-type') || '').trim(); + if (ct.indexOf('application/json') === 0) { + response.json().then((data) => { + if (data.status == 'Error') + alert(data.message); + }); + } else { + let cd = response.headers.get('content-disposition'); + if (cd) { + let filename = cd.match(/filename="([^"]+)"/); + filename = (filename && filename.length > 1) ? 
filename[1] : defaultFilename; + response.blob().then((blob) => { + let file = window.URL.createObjectURL(blob); + let a = document.createElement('a'); + a.href = file; + a.download = filename; + document.body.appendChild(a); + a.click(); + a.remove(); + }); + } else alert('Something went wrong!'); + } + } + }); +} + + // Enable Popper.js tooltips let toolTipElems = document.querySelectorAll('[data-bs-toggle="tooltip"]'); let toolTipList = [...toolTipElems].map(elem => new bootstrap.Tooltip(elem)); @@ -300,6 +339,16 @@ if (queriesModal) { }, false); } +let os_query_log_download = document.getElementById('os_query_log_download'); +if (os_query_log_download) { + os_query_log_download.addEventListener('click', function() { + os_download('query-log.txt', { + action: 'download', + content: 'query_log' + }); + }, false); +} + /* ***** Crawler Modal ********************************************* */ let os_get_crawl_progress = function() { @@ -512,38 +561,10 @@ os_crawl_cancel.addEventListener('click', function() { }, false); os_crawl_log_download.addEventListener('click', function() { - fetch(new Request('./admin.php'), { - method: 'POST', - headers: { 'Content-type': 'application/json' }, - body: JSON.stringify({ - action: 'download', - content: 'crawl_log', - grep: document.querySelector('input[name="os_crawl_grep"]:checked').value - }) - }) - .then((response) => { - if (response.status === 200) { - let ct = response.headers.get('content-type').trim(); - if (ct == 'application/json') { - response.json().then((data) => { - if (data.status == 'Error') - alert(data.message); - }); - } else { - let cd = response.headers.get('content-disposition'); - let filename = cd.match(/filename="([^"]+)"/); - filename = (filename.length > 1) ? 
filename[1] : 'log.txt'; - response.blob().then((blob) => { - let file = window.URL.createObjectURL(blob); - let a = document.createElement('a'); - a.href = file; - a.download = filename; - document.body.appendChild(a); - a.click(); - a.remove(); - }); - } - } + os_download('crawl-log.txt', { + action: 'download', + content: 'crawl_log', + grep: document.querySelector('input[name="os_crawl_grep"]:checked').value }); }, false);