mirror of
https://github.com/phpbb/phpbb.git
synced 2025-06-28 06:08:52 +00:00
Added: support for CJK and Hangul in the search engine
git-svn-id: file:///svn/phpbb/trunk@6182 89ea8834-ac86-4346-8a33-228a782c2dd0
This commit is contained in:
parent
3b4944a476
commit
5f88af1a75
2 changed files with 51 additions and 39 deletions
|
@ -864,6 +864,14 @@ class fulltext_native_improved extends search_backend
|
||||||
|
|
||||||
$isset_min = $min - 1;
|
$isset_min = $min - 1;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Load the UTF tools
|
||||||
|
*/
|
||||||
|
if (!function_exists('utf8_strlen'))
|
||||||
|
{
|
||||||
|
include($phpbb_root_path . 'includes/utf/utf_tools.' . $phpEx);
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Clean up the string, remove HTML tags, remove BBCodes
|
* Clean up the string, remove HTML tags, remove BBCodes
|
||||||
*/
|
*/
|
||||||
|
@ -871,49 +879,42 @@ class fulltext_native_improved extends search_backend
|
||||||
|
|
||||||
while (isset($word[0]))
|
while (isset($word[0]))
|
||||||
{
|
{
|
||||||
/**
|
if (isset($word[252])
|
||||||
* We check the length in octets to get an idea of the length
|
|| !isset($word[$isset_min]))
|
||||||
* in chars. If it is greater than or equal to $min and lower than
|
|
||||||
* or equal to $max then we can safely assume they are within the
|
|
||||||
* char limits
|
|
||||||
*
|
|
||||||
* Words that take more than 255 bytes are ignored
|
|
||||||
*/
|
|
||||||
if (isset($word[$isset_min])
|
|
||||||
&& !isset($word[255]))
|
|
||||||
{
|
{
|
||||||
/**
|
/**
|
||||||
* This word does not exceed the SQL size, but we don't know
|
* Words longer than 252 bytes are ignored. This will have to be
|
||||||
* yet if its length in chars exceeds the admin-defined one
|
* changed whenever we change the length of search_wordlist.word_text
|
||||||
|
*
|
||||||
|
* Words shorter than $min bytes are ignored, too
|
||||||
*/
|
*/
|
||||||
if (!isset($word[$max]))
|
$word = strtok(' ');
|
||||||
{
|
continue;
|
||||||
/**
|
}
|
||||||
* No chance, its length in bytes is lower than our limit
|
|
||||||
* and a single byte can't represent two chars
|
|
||||||
*/
|
|
||||||
$words[] = $word;
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
/**
|
|
||||||
* We have to find the length in chars
|
|
||||||
*/
|
|
||||||
if (!function_exists('utf8_strlen'))
|
|
||||||
{
|
|
||||||
include($phpbb_root_path . 'includes/utf/utf_tools.' . $phpEx);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (utf8_strlen($word) <= $max)
|
$len = utf8_strlen($word);
|
||||||
{
|
|
||||||
/**
|
/**
|
||||||
* Hurray for us, the word is the right size
|
* Test whether the word is too short to be indexed.
|
||||||
*/
|
*
|
||||||
$words[] = $word;
|
* Note that this limit does NOT apply to CJK and Hangul
|
||||||
}
|
*/
|
||||||
|
if ($len < $min)
|
||||||
|
{
|
||||||
|
/**
|
||||||
|
* Note: this could be optimized. If the codepoint is lower than UTF8_CJK_FIRST
|
||||||
|
* we know that it will also be lower than the Hangul and CJK B ranges
|
||||||
|
*/
|
||||||
|
if ((strncmp($word, UTF8_HANGUL_FIRST, 3) < 0 || strncmp($word, UTF8_HANGUL_LAST, 3) > 0)
|
||||||
|
&& (strncmp($word, UTF8_CJK_FIRST, 3) < 0 || strncmp($word, UTF8_CJK_LAST, 3) > 0)
|
||||||
|
&& (strncmp($word, UTF8_CJK_B_FIRST, 4) < 0 || strncmp($word, UTF8_CJK_B_LAST, 4) > 0))
|
||||||
|
{
|
||||||
|
$word = strtok(' ');
|
||||||
|
continue;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
$words[] = $word;
|
||||||
$word = strtok(' ');
|
$word = strtok(' ');
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1377,12 +1378,17 @@ class fulltext_native_improved extends search_backend
|
||||||
$utf_char = substr($text, $pos, $utf_len);
|
$utf_char = substr($text, $pos, $utf_len);
|
||||||
$pos += $utf_len;
|
$pos += $utf_len;
|
||||||
|
|
||||||
if ($utf_char >= UTF8_HANGUL_FIRST && $utf_char <= UTF8_HANGUL_LAST)
|
if (($utf_char >= UTF8_HANGUL_FIRST && $utf_char <= UTF8_HANGUL_LAST)
|
||||||
|
|| ($utf_char >= UTF8_CJK_FIRST && $utf_char <= UTF8_CJK_LAST)
|
||||||
|
|| ($utf_char >= UTF8_CJK_B_FIRST && $utf_char <= UTF8_CJK_B_LAST))
|
||||||
{
|
{
|
||||||
/**
|
/**
|
||||||
* All characters within this range are valid
|
* All characters within these ranges are valid
|
||||||
|
*
|
||||||
|
* We index all the characters separately and we pad them to make them
|
||||||
|
* long enough to be indexed
|
||||||
*/
|
*/
|
||||||
$ret .= $utf_char;
|
$ret .= ' chr' . $utf_char;
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -22,6 +22,12 @@ define('UTF8_SURROGATE_LAST', "\xED\xBF\xBF");
|
||||||
define('UTF8_HANGUL_FIRST', "\xEA\xB0\x80");
|
define('UTF8_HANGUL_FIRST', "\xEA\xB0\x80");
|
||||||
define('UTF8_HANGUL_LAST', "\xED\x9E\xA3");
|
define('UTF8_HANGUL_LAST', "\xED\x9E\xA3");
|
||||||
|
|
||||||
|
define('UTF8_CJK_FIRST', "\xE4\xB8\x80");
|
||||||
|
define('UTF8_CJK_LAST', "\xE9\xBE\xBB");
|
||||||
|
define('UTF8_CJK_B_FIRST', "\xF0\xA0\x80\x80");
|
||||||
|
define('UTF8_CJK_B_LAST', "\xF0\xAA\x9B\x96");
|
||||||
|
|
||||||
|
|
||||||
if (function_exists('utf8_normalize'))
|
if (function_exists('utf8_normalize'))
|
||||||
{
|
{
|
||||||
|
|
||||||
|
|
Loading…
Add table
Reference in a new issue