Enable ligature / alternate-spelling matching

2023-04-24 09:52:05 -04:00 · 2023-04-24 09:52:05 -04:00 · 0f69a2d2c8
parent a6304d2f5d
commit 0f69a2d2c8
2 changed files with 52 additions and 28 deletions
--- a/orcinus/config.php
+++ b/orcinus/config.php
@ -423,7 +423,7 @@ if (!$_ODATA['s_result_template']) {
  {{#errors}}
    <ul>
      {{#error_list}}
-        <li>{{text}}</li>
+        <li>{{.}}</li>
      {{/error_list}}
    </ul>
  {{/errors}}
@ -721,7 +721,14 @@ $_RDATA['sp_smart'] = array(
  '‖' => '|'
 );
 $_RDATA['s_latin'] = array(
+  'center' => array('centre'),
+  'color' => array('colour'),
+  'fiber' => array('fibre'),
+
  'ae' => array('æ', 'Æ'),
+  'oe' => array('œ', 'Œ'),
+  'sz' => array('ß'),
+  'th' => array('þ', 'Þ'),
   'a' => array('á', 'Á', 'à', 'À', 'â', 'Â', 'ä', 'Ä', 'ã', 'Ã', 'å', 'Å', 'ą', 'Ą', 'ă', 'Ă'),
   'c' => array('ç', 'Ç', 'ć', 'Ć', 'č', 'Č'),
   'd' => array('ð', 'Ð', 'ď', 'Ď', 'đ', 'Đ'),
@ -730,12 +737,9 @@ $_RDATA['s_latin'] = array(
   'i' => array('í', 'Í', 'ì', 'Ì', 'î', 'Î', 'ï', 'Ï', 'ı', 'İ'),
   'l' => array('ł', 'Ł', 'ľ', 'Ľ', 'ĺ', 'Ĺ'),
   'n' => array('ñ', 'Ñ', 'ń', 'Ń', 'ň', 'Ň'),
-  'oe' => array('œ', 'Œ'),
   'o' => array('ó', 'Ó', 'ò', 'Ò', 'ô', 'Ô', 'ö', 'Ö', 'õ', 'Õ', 'ø', 'Ø', 'ő', 'Ő'),
   'r' => array('ŕ', 'Ŕ', 'ř', 'Ř'),
-  'sz' => array('ß'),
   's' => array('ş', 'Ş', 'ś', 'Ś', 'š', 'Š'),
-  'th' => array('þ', 'Þ'),
   't' => array('ť', 'Ť', 'ţ', 'Ţ'),
   'u' => array('ú', 'Ú', 'ù', 'Ù', 'û', 'Û', 'ü', 'Ü', 'ů', 'Ů', 'ű', 'Ű'),
   'x' => array('×'),
--- a/orcinus/search.php
+++ b/orcinus/search.php
@ -23,6 +23,23 @@ $_SDATA = array(
 foreach ($_RDATA['s_weights'] as $key => $weight)
  $_RDATA['s_weights'][$key] = (float)$weight;

+// Prepare regexp translation arrays for accented characters, ligatures and alternate spellings
+$_RDATA['s_latin_pcre'] = array();
+$_RDATA['s_latin_pcre_multi'] = array();
+foreach ($_RDATA['s_latin'] as $char => $latin) {
+  if (strlen($char) > 1) {
+    $pcre = '('.$char.'|'.implode('|', $latin).')';
+  } else $pcre = '['.$char.implode('', $latin).']';
+  $_RDATA['s_latin_pcre'][$char] = $pcre;
+  foreach ($latin as $lchar)
+    $_RDATA['s_latin_pcre'][$lchar] = $pcre;
+  if (strlen($char) > 1) {
+    $_RDATA['s_latin_pcre_multi'][$char] = $pcre;
+    foreach ($latin as $lchar)
+      $_RDATA['s_latin_pcre_multi'][$lchar] = $pcre;
+  }
+}
+

 // {{{{{ Initialize the Mustache templating engine
 class OS_Mustache {
@ -164,13 +181,9 @@ if ($_RDATA['s_searchable_pages']) {
            if ($type == 'term')
              $_SDATA['formatted'][] = $term;

+            // Regexp for later use when pattern-matching results
            $_SDATA['terms'][$key][2] = preg_quote(strtolower($term), '/');
-            foreach ($_RDATA['s_latin'] as $char => $latin) {
-              $_SDATA['terms'][$key][2] = str_replace($latin, $char, $_SDATA['terms'][$key][2]);
-              if (strlen($char) > 1) {
-                $_SDATA['terms'][$key][2] = str_replace($char, '('.$char.'|'.implode('|', $latin).')', $_SDATA['terms'][$key][2]);
-              } else $_SDATA['terms'][$key][2] = str_replace($char, '['.$char.implode('', $latin).']', $_SDATA['terms'][$key][2]);
-            }
+            $_SDATA['terms'][$key][2] = strtr($_SDATA['terms'][$key][2], $_RDATA['s_latin_pcre']);
            $_SDATA['terms'][$key][2] = '/('.$_SDATA['terms'][$key][2].')/iu';

        }
@ -240,37 +253,44 @@ if ($_RDATA['s_searchable_pages']) {
        $ors = array();
        $negs = array();
        foreach ($_SDATA['terms'] as list($type, $term, $pcre)) {
+
+          // Regexp only for SQL use
+          $term = preg_quote(strtolower($term), '\'');
+
+          // Regexp alternation for multi-character ligatures
+          $term = strtr($term, $_RDATA['s_latin_pcre_multi']);
+
          switch ($type) {
            case 'filetype': // Nothing for filetype yet
              break;

            case 'exclude':
-              $negs[] = '`content` NOT LIKE \'%'.addslashes($term).'%\'';
-              $negs[] = '`url` NOT LIKE \'%'.addslashes($term).'%\'';
-              $negs[] = '`title` NOT LIKE \'%'.addslashes($term).'%\'';
-              $negs[] = '`description` NOT LIKE \'%'.addslashes($term).'%\'';
-              $negs[] = '`keywords` NOT LIKE \'%'.addslashes($term).'%\'';
-              $negs[] = '`weighted` NOT LIKE \'%'.addslashes($term).'%\'';
+              $negs[] = '`content` NOT REGEXP \''.$term.'\'';
+              $negs[] = '`url` NOT REGEXP \''.$term.'\'';
+              $negs[] = '`title` NOT REGEXP \''.$term.'\'';
+              $negs[] = '`description` NOT REGEXP \''.$term.'\'';
+              $negs[] = '`keywords` NOT REGEXP \''.$term.'\'';
+              $negs[] = '`weighted` NOT REGEXP \''.$term.'\'';
              break;

            case 'phrase':
              $ands[] = '('.implode(' OR ', array(
-                '`content` LIKE \'%'.addslashes($term).'%\'',
-                '`url` LIKE \'%'.addslashes($term).'%\'',
-                '`title` LIKE \'%'.addslashes($term).'%\'',
-                '`description` LIKE \'%'.addslashes($term).'%\'',
-                '`keywords` LIKE \'%'.addslashes($term).'%\'',
-                '`weighted` LIKE \'%'.addslashes($term).'%\''
+                '`content` REGEXP \''.$term.'\'',
+                '`url` REGEXP \''.$term.'\'',
+                '`title` REGEXP \''.$term.'\'',
+                '`description` REGEXP \''.$term.'\'',
+                '`keywords` REGEXP \''.$term.'\'',
+                '`weighted` REGEXP \''.$term.'\''
              )).')';
              break;

            case 'term':
-              $ors[] = '`content` LIKE \'%'.addslashes($term).'%\'';
-              $ors[] = '`url` LIKE \'%'.addslashes($term).'%\'';
-              $ors[] = '`title` LIKE \'%'.addslashes($term).'%\'';
-              $ors[] = '`description` LIKE \'%'.addslashes($term).'%\'';
-              $ors[] = '`keywords` LIKE \'%'.addslashes($term).'%\'';
-              $ors[] = '`weighted` LIKE \'%'.addslashes($term).'%\'';
+              $ors[] = '`content` REGEXP \''.$term.'\'';
+              $ors[] = '`url` REGEXP \''.$term.'\'';
+              $ors[] = '`title` REGEXP \''.$term.'\'';
+              $ors[] = '`description` REGEXP \''.$term.'\'';
+              $ors[] = '`keywords` REGEXP \''.$term.'\'';
+              $ors[] = '`weighted` REGEXP \''.$term.'\'';

          }
        }