Added: support for CJK and Hangul in the search engine

git-svn-id: file:///svn/phpbb/trunk@6182 89ea8834-ac86-4346-8a33-228a782c2dd0
2025-06-28 06:08:52 +00:00 · 2006-07-15 15:44:54 +00:00 · 2006-07-15 15:44:54 +00:00 · 5f88af1a75
commit 5f88af1a75
parent 3b4944a476
2 changed files with 51 additions and 39 deletions
--- a/phpBB/includes/search/fulltext_native_improved.php
+++ b/phpBB/includes/search/fulltext_native_improved.php
@ -864,6 +864,14 @@ class fulltext_native_improved extends search_backend
 		$isset_min = $min - 1;
 		/**
 		* Load the UTF tools
 		*/
 		if (!function_exists('utf8_strlen'))
 		{
 			include($phpbb_root_path . 'includes/utf/utf_tools.' . $phpEx);
 		}
 		/**
 		* Clean up the string, remove HTML tags, remove BBCodes
 		*/
@ -871,49 +879,42 @@ class fulltext_native_improved extends search_backend
 		while (isset($word[0]))
 		{
-			/**
+			if (isset($word[252])
-			* We check the length in octets to get an idea of the length
+			 || !isset($word[$isset_min]))
 			* in chars. If it is greater than or equal to $min and lower than
 			* or equal to $max then we can safely assume they are within the
 			* char limits
 			*
 			* Words that take more than 255 bytes are ignored
 			*/
 			if (isset($word[$isset_min])
 			 && !isset($word[255]))
 			{
 				/**
-				* This word does not exceed the SQL size, but we don't know
+				* Words longer than 252 bytes are ignored. This will have to be
-				* yet if its length in chars exceed the admin-defined one
+				* changed whenever we change the length of search_wordlist.word_text
 				*
 				* Words shorter than $isset_min bytes are ignored, too
 				*/
-				if (!isset($word[$max]))
+				$word = strtok(' ');
-				{
+				continue;
-					/**
+			}
 					* No chance, its length in bytes is lower than our limit
 					* and a single byte can't represent two chars
 					*/
 					$words[] = $word;
 				}
 				else
 				{
 					/**
 					* We have to find the length in chars
 					*/
 					if (!function_exists('utf8_strlen'))
 					{
 						include($phpbb_root_path . 'includes/utf/utf_tools.' . $phpEx);
 					}
-					if (utf8_strlen($word) <= $max)
+			$len = utf8_strlen($word);
-					{
+
-						/**
+			/**
-						* Hurray for us, the word is the right size
+			* Test whether the word is too short to be indexed.
-						*/
+			*
-						$words[] = $word;
+			* Note that this limit does NOT apply to CJK and Hangul
-					}
+			*/
 			if ($len < $min)
 			{
 				/**
 				* Note: this could be optimized. If the codepoint is lower than the CJK range
 				* we know that it will also be lower than the Hangul and CJK_B ranges
 				*/
 				if ((strncmp($word, UTF8_HANGUL_FIRST, 3) < 0 || strncmp($word, UTF8_HANGUL_LAST, 3) > 0)
 				 && (strncmp($word, UTF8_CJK_FIRST, 3) < 0 || strncmp($word, UTF8_CJK_LAST, 3) > 0)
 				 && (strncmp($word, UTF8_CJK_B_FIRST, 4) < 0 || strncmp($word, UTF8_CJK_B_LAST, 4) > 0))
 				{
 					$word = strtok(' ');
 					continue;
 				}
 			}
 			$words[] = $word;
 			$word = strtok(' ');
 		}
@ -1377,12 +1378,17 @@ class fulltext_native_improved extends search_backend
 			$utf_char = substr($text, $pos, $utf_len);
 			$pos += $utf_len;
-			if ($utf_char >= UTF8_HANGUL_FIRST && $utf_char <= UTF8_HANGUL_LAST)
+			if (($utf_char >= UTF8_HANGUL_FIRST && $utf_char <= UTF8_HANGUL_LAST)
 			 || ($utf_char >= UTF8_CJK_FIRST && $utf_char <= UTF8_CJK_LAST)
 			 || ($utf_char >= UTF8_CJK_B_FIRST && $utf_char <= UTF8_CJK_B_LAST))
 			{
 				/**
-				* All characters within this range are valid
+				* All characters within these ranges are valid
 				*
 				* We index all the characters separately and we pad them to make them
 				* long enough to be indexed
 				*/
-				$ret .= $utf_char;
+				$ret .= ' chr' . $utf_char;
 				continue;
 			}
--- a/phpBB/includes/utf/utf_normalizer.php
+++ b/phpBB/includes/utf/utf_normalizer.php
@ -22,6 +22,12 @@ define('UTF8_SURROGATE_LAST', "\xED\xBF\xBF");
 define('UTF8_HANGUL_FIRST', "\xEA\xB0\x80");
 define('UTF8_HANGUL_LAST', "\xED\x9E\xA3");
 define('UTF8_CJK_FIRST', "\xE4\xB8\x80");
 define('UTF8_CJK_LAST', "\xE9\xBE\xBB");
 define('UTF8_CJK_B_FIRST', "\xF0\xA0\x80\x80");
 define('UTF8_CJK_B_LAST', "\xF0\xAA\x9B\x96");
 if (function_exists('utf8_normalize'))
 {