diff --git a/orcinus/config.php b/orcinus/config.php
index bb8e917..f2d7464 100644
--- a/orcinus/config.php
+++ b/orcinus/config.php
@@ -423,7 +423,7 @@ if (!$_ODATA['s_result_template']) {
 {{#errors}}
 {{/errors}}
@@ -721,7 +721,14 @@ $_RDATA['sp_smart'] = array(
   '‖' => '|'
 );
 $_RDATA['s_latin'] = array(
+  'center' => array('centre'),
+  'color' => array('colour'),
+  'fiber' => array('fibre'),
+  'ae' => array('æ', 'Æ'),
+  'oe' => array('œ', 'Œ'),
+  'sz' => array('ß'),
+  'th' => array('þ', 'Þ'),
   'a' => array('á', 'Á', 'à', 'À', 'â', 'Â', 'ä', 'Ä', 'ã', 'Ã', 'å', 'Å', 'ą', 'Ą', 'ă', 'Ă'),
   'c' => array('ç', 'Ç', 'ć', 'Ć', 'č', 'Č'),
   'd' => array('ð', 'Ð', 'ď', 'Ď', 'đ', 'Đ'),
@@ -730,12 +737,9 @@ $_RDATA['s_latin'] = array(
   'i' => array('í', 'Í', 'ì', 'Ì', 'î', 'Î', 'ï', 'Ï', 'ı', 'İ'),
   'l' => array('ł', 'Ł', 'ľ', 'Ľ', 'ĺ', 'Ĺ'),
   'n' => array('ñ', 'Ñ', 'ń', 'Ń', 'ň', 'Ň'),
-  'oe' => array('œ', 'Œ'),
   'o' => array('ó', 'Ó', 'ò', 'Ò', 'ô', 'Ô', 'ö', 'Ö', 'õ', 'Õ', 'ø', 'Ø', 'ő', 'Ő'),
   'r' => array('ŕ', 'Ŕ', 'ř', 'Ř'),
-  'sz' => array('ß'),
   's' => array('ş', 'Ş', 'ś', 'Ś', 'š', 'Š'),
-  'th' => array('þ', 'Þ'),
   't' => array('ť', 'Ť', 'ţ', 'Ţ'),
   'u' => array('ú', 'Ú', 'ù', 'Ù', 'û', 'Û', 'ü', 'Ü', 'ů', 'Ů', 'ű', 'Ű'),
   'x' => array('×'),
diff --git a/orcinus/search.php b/orcinus/search.php
index ad050e0..00a3457 100644
--- a/orcinus/search.php
+++ b/orcinus/search.php
@@ -23,6 +23,23 @@ $_SDATA = array(
 foreach ($_RDATA['s_weights'] as $key => $weight)
   $_RDATA['s_weights'][$key] = (float)$weight;
+// Prepare regexp translation array for accented / ligature characters
+$_RDATA['s_latin_pcre'] = array();
+$_RDATA['s_latin_pcre_multi'] = array();
+foreach ($_RDATA['s_latin'] as $char => $latin) {
+  if (strlen($char) > 1) {
+    $pcre = '('.$char.'|'.implode('|', $latin).')';
+  } else $pcre = '['.$char.implode('', $latin).']';
+  $_RDATA['s_latin_pcre'][$char] = $pcre;
+  foreach ($latin as $lchar)
+    $_RDATA['s_latin_pcre'][$lchar] = $pcre;
+  if (strlen($char) > 1) {
+    $_RDATA['s_latin_pcre_multi'][$char] = $pcre;
+    foreach ($latin as $lchar)
+      $_RDATA['s_latin_pcre_multi'][$lchar] = $pcre;
+  }
+}
+
 // {{{{{ Initialize the Mustache templating engine
 class OS_Mustache {
@@ -164,13 +181,9 @@ if ($_RDATA['s_searchable_pages']) {
       if ($type == 'term')
         $_SDATA['formatted'][] = $term;
+      // Regexp for later use pattern matching results
       $_SDATA['terms'][$key][2] = preg_quote(strtolower($term), '/');
-      foreach ($_RDATA['s_latin'] as $char => $latin) {
-        $_SDATA['terms'][$key][2] = str_replace($latin, $char, $_SDATA['terms'][$key][2]);
-        if (strlen($char) > 1) {
-          $_SDATA['terms'][$key][2] = str_replace($char, '('.$char.'|'.implode('|', $latin).')', $_SDATA['terms'][$key][2]);
-        } else $_SDATA['terms'][$key][2] = str_replace($char, '['.$char.implode('', $latin).']', $_SDATA['terms'][$key][2]);
-      }
+      $_SDATA['terms'][$key][2] = strtr($_SDATA['terms'][$key][2], $_RDATA['s_latin_pcre']);
       $_SDATA['terms'][$key][2] = '/('.$_SDATA['terms'][$key][2].')/iu';
     }
@@ -240,37 +253,44 @@ if ($_RDATA['s_searchable_pages']) {
     $ors = array();
     $negs = array();
     foreach ($_SDATA['terms'] as list($type, $term, $pcre)) {
+
+      // Regexp only for SQL use: backslash-quote regexp metacharacters so the search term matches literally
+      $term = preg_quote(strtolower($term));
+
+      // Regexp alternation for multi-character ligatures; then addslashes() so the pattern's backslashes and any quotes survive the SQL string literal parser
+      $term = addslashes(strtr($term, $_RDATA['s_latin_pcre_multi']));
+
       switch ($type) {
         case 'filetype':
           // Nothing for filetype yet
           break;
         case 'exclude':
-          $negs[] = '`content` NOT LIKE \'%'.addslashes($term).'%\'';
-          $negs[] = '`url` NOT LIKE \'%'.addslashes($term).'%\'';
-          $negs[] = '`title` NOT LIKE \'%'.addslashes($term).'%\'';
-          $negs[] = '`description` NOT LIKE \'%'.addslashes($term).'%\'';
-          $negs[] = '`keywords` NOT LIKE \'%'.addslashes($term).'%\'';
-          $negs[] = '`weighted` NOT LIKE \'%'.addslashes($term).'%\'';
+          $negs[] = '`content` NOT REGEXP \''.$term.'\'';
+          $negs[] = '`url` NOT REGEXP \''.$term.'\'';
+          $negs[] = '`title` NOT REGEXP \''.$term.'\'';
+          $negs[] = '`description` NOT REGEXP \''.$term.'\'';
+          $negs[] = '`keywords` NOT REGEXP \''.$term.'\'';
+          $negs[] = '`weighted` NOT REGEXP \''.$term.'\'';
           break;
         case 'phrase':
           $ands[] = '('.implode(' OR ', array(
-            '`content` LIKE \'%'.addslashes($term).'%\'',
-            '`url` LIKE \'%'.addslashes($term).'%\'',
-            '`title` LIKE \'%'.addslashes($term).'%\'',
-            '`description` LIKE \'%'.addslashes($term).'%\'',
-            '`keywords` LIKE \'%'.addslashes($term).'%\'',
-            '`weighted` LIKE \'%'.addslashes($term).'%\''
+            '`content` REGEXP \''.$term.'\'',
+            '`url` REGEXP \''.$term.'\'',
+            '`title` REGEXP \''.$term.'\'',
+            '`description` REGEXP \''.$term.'\'',
+            '`keywords` REGEXP \''.$term.'\'',
+            '`weighted` REGEXP \''.$term.'\''
           )).')';
           break;
         case 'term':
-          $ors[] = '`content` LIKE \'%'.addslashes($term).'%\'';
-          $ors[] = '`url` LIKE \'%'.addslashes($term).'%\'';
-          $ors[] = '`title` LIKE \'%'.addslashes($term).'%\'';
-          $ors[] = '`description` LIKE \'%'.addslashes($term).'%\'';
-          $ors[] = '`keywords` LIKE \'%'.addslashes($term).'%\'';
-          $ors[] = '`weighted` LIKE \'%'.addslashes($term).'%\'';
+          $ors[] = '`content` REGEXP \''.$term.'\'';
+          $ors[] = '`url` REGEXP \''.$term.'\'';
+          $ors[] = '`title` REGEXP \''.$term.'\'';
+          $ors[] = '`description` REGEXP \''.$term.'\'';
+          $ors[] = '`keywords` REGEXP \''.$term.'\'';
+          $ors[] = '`weighted` REGEXP \''.$term.'\'';
       }
     }