Enable ligature / alternate-spelling matching

This commit is contained in:
Brian Huisman 2023-04-24 09:52:05 -04:00
parent a6304d2f5d
commit 0f69a2d2c8
2 changed files with 52 additions and 28 deletions

View file

@ -423,7 +423,7 @@ if (!$_ODATA['s_result_template']) {
{{#errors}}
<ul>
{{#error_list}}
<li>{{text}}</li>
<li>{{.}}</li>
{{/error_list}}
</ul>
{{/errors}}
@ -721,7 +721,14 @@ $_RDATA['sp_smart'] = array(
'‖' => '|'
);
$_RDATA['s_latin'] = array(
'center' => array('centre'),
'color' => array('colour'),
'fiber' => array('fibre'),
'ae' => array('æ', 'Æ'),
'oe' => array('œ', 'Œ'),
'sz' => array('ß'),
'th' => array('þ', 'Þ'),
'a' => array('á', 'Á', 'à', 'À', 'â', 'Â', 'ä', 'Ä', 'ã', 'Ã', 'å', 'Å', 'ą', 'Ą', 'ă', 'Ă'),
'c' => array('ç', 'Ç', 'ć', 'Ć', 'č', 'Č'),
'd' => array('ð', 'Ð', 'ď', 'Ď', 'đ', 'Đ'),
@ -730,12 +737,9 @@ $_RDATA['s_latin'] = array(
'i' => array('í', 'Í', 'ì', 'Ì', 'î', 'Î', 'ï', 'Ï', 'ı', 'İ'),
'l' => array('ł', 'Ł', 'ľ', 'Ľ', 'ĺ', 'Ĺ'),
'n' => array('ñ', 'Ñ', 'ń', 'Ń', 'ň', 'Ň'),
'oe' => array('œ', 'Œ'),
'o' => array('ó', 'Ó', 'ò', 'Ò', 'ô', 'Ô', 'ö', 'Ö', 'õ', 'Õ', 'ø', 'Ø', 'ő', 'Ő'),
'r' => array('ŕ', 'Ŕ', 'ř', 'Ř'),
'sz' => array('ß'),
's' => array('ş', 'Ş', 'ś', 'Ś', 'š', 'Š'),
'th' => array('þ', 'Þ'),
't' => array('ť', 'Ť', 'ţ', 'Ţ'),
'u' => array('ú', 'Ú', 'ù', 'Ù', 'û', 'Û', 'ü', 'Ü', 'ů', 'Ů', 'ű', 'Ű'),
'x' => array('×'),

View file

@ -23,6 +23,23 @@ $_SDATA = array(
foreach ($_RDATA['s_weights'] as $key => $weight)
$_RDATA['s_weights'][$key] = (float)$weight;
// Build the lookup tables that map every base character or ligature key,
// and each of its listed variants, to one regexp fragment matching them all
$_RDATA['s_latin_pcre'] = array();
$_RDATA['s_latin_pcre_multi'] = array();
foreach ($_RDATA['s_latin'] as $base => $variants) {
  // Keys longer than one byte cannot be placed inside a character class,
  // so they are expressed as an alternation group instead
  $is_multi = strlen($base) > 1;
  $fragment = ($is_multi)
    ? '('.$base.'|'.implode('|', $variants).')'
    : '['.$base.implode('', $variants).']';

  // Register the key itself; the multi-character table only receives
  // entries whose fragment is an alternation group
  $_RDATA['s_latin_pcre'][$base] = $fragment;
  if ($is_multi)
    $_RDATA['s_latin_pcre_multi'][$base] = $fragment;

  // Every variant resolves to the same fragment as its key
  foreach ($variants as $variant) {
    $_RDATA['s_latin_pcre'][$variant] = $fragment;
    if ($is_multi)
      $_RDATA['s_latin_pcre_multi'][$variant] = $fragment;
  }
}
// {{{{{ Initialize the Mustache templating engine
class OS_Mustache {
@ -164,13 +181,9 @@ if ($_RDATA['s_searchable_pages']) {
if ($type == 'term')
$_SDATA['formatted'][] = $term;
// Regexp for later use pattern matching results
$_SDATA['terms'][$key][2] = preg_quote(strtolower($term), '/');
foreach ($_RDATA['s_latin'] as $char => $latin) {
$_SDATA['terms'][$key][2] = str_replace($latin, $char, $_SDATA['terms'][$key][2]);
if (strlen($char) > 1) {
$_SDATA['terms'][$key][2] = str_replace($char, '('.$char.'|'.implode('|', $latin).')', $_SDATA['terms'][$key][2]);
} else $_SDATA['terms'][$key][2] = str_replace($char, '['.$char.implode('', $latin).']', $_SDATA['terms'][$key][2]);
}
$_SDATA['terms'][$key][2] = strtr($_SDATA['terms'][$key][2], $_RDATA['s_latin_pcre']);
$_SDATA['terms'][$key][2] = '/('.$_SDATA['terms'][$key][2].')/iu';
}
@ -240,37 +253,44 @@ if ($_RDATA['s_searchable_pages']) {
$ors = array();
$negs = array();
foreach ($_SDATA['terms'] as list($type, $term, $pcre)) {
// Regexp only for SQL use: quote regexp metacharacters so the search term
// is matched literally. No delimiter argument is needed here because the
// quote character is escaped for the SQL literal below
$term = preg_quote(strtolower($term));
// Regexp alternation for multi-character ligatures
$term = strtr($term, $_RDATA['s_latin_pcre_multi']);
// The pattern is embedded in a single-quoted SQL string literal. Assuming
// MySQL-style backslash escapes (the default SQL mode), the server strips
// one level of backslashes before the regexp engine sees the pattern, so
// 'a\.b' would arrive as 'a.b' and 'foo\(' as an unbalanced 'foo('.
// Escape backslashes and quotes so the quoting above survives
$term = addslashes($term);
switch ($type) {
case 'filetype': // Nothing for filetype yet
break;
case 'exclude':
$negs[] = '`content` NOT LIKE \'%'.addslashes($term).'%\'';
$negs[] = '`url` NOT LIKE \'%'.addslashes($term).'%\'';
$negs[] = '`title` NOT LIKE \'%'.addslashes($term).'%\'';
$negs[] = '`description` NOT LIKE \'%'.addslashes($term).'%\'';
$negs[] = '`keywords` NOT LIKE \'%'.addslashes($term).'%\'';
$negs[] = '`weighted` NOT LIKE \'%'.addslashes($term).'%\'';
$negs[] = '`content` NOT REGEXP \''.$term.'\'';
$negs[] = '`url` NOT REGEXP \''.$term.'\'';
$negs[] = '`title` NOT REGEXP \''.$term.'\'';
$negs[] = '`description` NOT REGEXP \''.$term.'\'';
$negs[] = '`keywords` NOT REGEXP \''.$term.'\'';
$negs[] = '`weighted` NOT REGEXP \''.$term.'\'';
break;
case 'phrase':
$ands[] = '('.implode(' OR ', array(
'`content` LIKE \'%'.addslashes($term).'%\'',
'`url` LIKE \'%'.addslashes($term).'%\'',
'`title` LIKE \'%'.addslashes($term).'%\'',
'`description` LIKE \'%'.addslashes($term).'%\'',
'`keywords` LIKE \'%'.addslashes($term).'%\'',
'`weighted` LIKE \'%'.addslashes($term).'%\''
'`content` REGEXP \''.$term.'\'',
'`url` REGEXP \''.$term.'\'',
'`title` REGEXP \''.$term.'\'',
'`description` REGEXP \''.$term.'\'',
'`keywords` REGEXP \''.$term.'\'',
'`weighted` REGEXP \''.$term.'\''
)).')';
break;
case 'term':
$ors[] = '`content` LIKE \'%'.addslashes($term).'%\'';
$ors[] = '`url` LIKE \'%'.addslashes($term).'%\'';
$ors[] = '`title` LIKE \'%'.addslashes($term).'%\'';
$ors[] = '`description` LIKE \'%'.addslashes($term).'%\'';
$ors[] = '`keywords` LIKE \'%'.addslashes($term).'%\'';
$ors[] = '`weighted` LIKE \'%'.addslashes($term).'%\'';
$ors[] = '`content` REGEXP \''.$term.'\'';
$ors[] = '`url` REGEXP \''.$term.'\'';
$ors[] = '`title` REGEXP \''.$term.'\'';
$ors[] = '`description` REGEXP \''.$term.'\'';
$ors[] = '`keywords` REGEXP \''.$term.'\'';
$ors[] = '`weighted` REGEXP \''.$term.'\'';
}
}