mirror of
https://github.com/phpbb/phpbb.git
synced 2025-06-28 22:28:51 +00:00
Changed: moved functions that encode/decode NCRs from and to UTF-8 to utf_tools.php
git-svn-id: file:///svn/phpbb/trunk@6187 89ea8834-ac86-4346-8a33-228a782c2dd0
This commit is contained in:
parent
7b8f0da356
commit
29d92430c5
2 changed files with 96 additions and 68 deletions
|
@ -47,10 +47,18 @@ class fulltext_native_improved extends search_backend
|
||||||
|
|
||||||
$this->word_length = array('min' => $config['fulltext_native_min_chars'], 'max' => $config['fulltext_native_max_chars']);
|
$this->word_length = array('min' => $config['fulltext_native_min_chars'], 'max' => $config['fulltext_native_max_chars']);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Load the UTF tools
|
||||||
|
*/
|
||||||
if (!class_exists('utf_normalizer'))
|
if (!class_exists('utf_normalizer'))
|
||||||
{
|
{
|
||||||
include($phpbb_root_path . 'includes/utf/utf_normalizer.' . $phpEx);
|
include($phpbb_root_path . 'includes/utf/utf_normalizer.' . $phpEx);
|
||||||
}
|
}
|
||||||
|
if (!function_exists('utf8_strlen'))
|
||||||
|
{
|
||||||
|
include($phpbb_root_path . 'includes/utf/utf_tools.' . $phpEx);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
$error = false;
|
$error = false;
|
||||||
}
|
}
|
||||||
|
@ -864,14 +872,6 @@ class fulltext_native_improved extends search_backend
|
||||||
|
|
||||||
$isset_min = $min - 1;
|
$isset_min = $min - 1;
|
||||||
|
|
||||||
/**
|
|
||||||
* Load the UTF tools
|
|
||||||
*/
|
|
||||||
if (!function_exists('utf8_strlen'))
|
|
||||||
{
|
|
||||||
include($phpbb_root_path . 'includes/utf/utf_tools.' . $phpEx);
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Clean up the string, remove HTML tags, remove BBCodes
|
* Clean up the string, remove HTML tags, remove BBCodes
|
||||||
*/
|
*/
|
||||||
|
@ -1259,11 +1259,6 @@ class fulltext_native_improved extends search_backend
|
||||||
$encoding = strtolower($encoding);
|
$encoding = strtolower($encoding);
|
||||||
if ($encoding != 'utf-8')
|
if ($encoding != 'utf-8')
|
||||||
{
|
{
|
||||||
if (!function_exists('utf8_recode'))
|
|
||||||
{
|
|
||||||
include($phpbb_root_path . 'includes/utf/utf_tools.' . $phpEx);
|
|
||||||
}
|
|
||||||
|
|
||||||
$text = utf8_recode($text, $encoding);
|
$text = utf8_recode($text, $encoding);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1277,7 +1272,7 @@ class fulltext_native_improved extends search_backend
|
||||||
/**
|
/**
|
||||||
* Replace HTML entities and NCRs
|
* Replace HTML entities and NCRs
|
||||||
*/
|
*/
|
||||||
$text = html_entity_decode($this->decode_ncr($text), ENT_QUOTES);
|
$text = html_entity_decode(utf8_decode_ncr($text), ENT_QUOTES);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Load the UTF-8 normalizer
|
* Load the UTF-8 normalizer
|
||||||
|
@ -1481,60 +1476,6 @@ class fulltext_native_improved extends search_backend
|
||||||
return $ret;
|
return $ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
* Convert Numeric Character References to UTF-8 chars
*
* Notes:
*	- hexadecimal NCRs are replaced in a first pass, decimal NCRs in a second pass
*	- within one pass NCRs are not converted recursively, if you pass &#38;#38; it will return &#38;
*	- we DO NOT check for the existence of the Unicode characters, therefore an entity
*	  may be converted to an inexistent codepoint
*	- decimal NCRs are always read as base 10, leading zeros included
*
* @param	string	$text		String to convert, encoded in UTF-8 (no normal form required)
* @return	string				UTF-8 string where NCRs have been replaced with the actual chars
*/
function decode_ncr($text)
{
	// Callbacks replace the former /e (eval) patterns: that modifier no longer exists in
	// current PHP and it evaluated the matched digits as PHP source, so a leading zero
	// turned the number into an octal literal.
	$text = preg_replace_callback('/&#x([0-9A-F]{1,5});/i', array($this, '_decode_ncr_hex_callback'), $text);

	return preg_replace_callback('/&#([0-9]{1,6});/', array($this, '_decode_ncr_dec_callback'), $text);
}

/**
* Callback used in decode_ncr() for hexadecimal NCRs
*
* @param	array	$m			0-based numerically indexed array passed by preg_replace_callback()
* @return	string				UTF-8 char
*/
function _decode_ncr_hex_callback($m)
{
	return $this->cp_to_utf(hexdec($m[1]));
}

/**
* Callback used in decode_ncr() for decimal NCRs
*
* @param	array	$m			0-based numerically indexed array passed by preg_replace_callback()
* @return	string				UTF-8 char
*/
function _decode_ncr_dec_callback($m)
{
	return $this->cp_to_utf((int) $m[1]);
}
|
|
||||||
|
|
||||||
/**
* Encode a single Unicode codepoint as a UTF-8 byte sequence
*
* The codepoint is not validated: one byte is produced up to 0x7F, two bytes up to 0x7FF,
* three bytes up to 0xFFFF and four bytes for anything larger.
*
* @param	integer	$cp			Unicode codepoint
* @return	string				UTF-8 string
*/
function cp_to_utf($cp)
{
	// Plain ASCII, single byte
	if ($cp <= 0x7F)
	{
		return chr($cp);
	}

	// Every multibyte form ends with a continuation byte holding the lowest 6 bits
	$tail = chr(0x80 | ($cp & 0x3F));

	if ($cp <= 0x7FF)
	{
		return chr(0xC0 | ($cp >> 6)) . $tail;
	}

	// Three and four byte forms share the second-to-last continuation byte as well
	$tail = chr(0x80 | (($cp >> 6) & 0x3F)) . $tail;

	if ($cp <= 0xFFFF)
	{
		return chr(0xE0 | ($cp >> 12)) . $tail;
	}

	return chr(0xF0 | ($cp >> 18)) . chr(0x80 | (($cp >> 12) & 0x3F)) . $tail;
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Returns a list of options for the ACP to display
|
* Returns a list of options for the ACP to display
|
||||||
*/
|
*/
|
||||||
|
|
|
@ -127,4 +127,91 @@ function utf8_recode($string, $encoding)
|
||||||
die('Finish me!! '.basename(__FILE__).' at line '.__LINE__);
|
die('Finish me!! '.basename(__FILE__).' at line '.__LINE__);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
* Convert every non-ASCII UTF-8 character of a string into its decimal NCR
*
* Each multibyte sequence found is handed to utf8_encode_ncr_callback().
*
* @param	string	$text		UTF-8 string in NFC
* @return	string				ASCII string using NCRs for non-ASCII chars
*/
function utf8_encode_ncr($text)
{
	// A lead byte (0xC2-0xF4) followed by one or more continuation bytes (0x80-0xBF)
	$multibyte_sequence = '#[\\xC2-\\xF4][\\x80-\\xBF]+#';

	$encoded = preg_replace_callback($multibyte_sequence, 'utf8_encode_ncr_callback', $text);

	return $encoded;
}
|
||||||
|
|
||||||
|
/**
* Callback used in utf8_encode_ncr()
*
* Turns one UTF-8 byte sequence into its decimal NCR. Attention, $m is an array.
* Only the length of the sequence is looked at (1 to 4 bytes), the bytes themselves
* are not validated.
*
* @param	array	$m			0-based numerically indexed array passed by preg_replace_callback()
* @return	string				A HTML NCR if the sequence is 1 to 4 bytes long, or the original string otherwise
*/
function utf8_encode_ncr_callback($m)
{
	$sequence = $m[0];
	$length = strlen($sequence);

	// Empty or overlong matches are handed back untouched
	if ($length < 1 || $length > 4)
	{
		return $sequence;
	}

	if ($length == 1)
	{
		return '&#' . ord($sequence) . ';';
	}

	// Payload bits carried by the lead byte, indexed by sequence length
	$lead_masks = array(2 => 0x1F, 3 => 0x0F, 4 => 0x07);

	$codepoint = ord($sequence[0]) & $lead_masks[$length];

	// Each continuation byte contributes its 6 low bits
	for ($i = 1; $i < $length; $i++)
	{
		$codepoint = ($codepoint << 6) | (ord($sequence[$i]) & 0x3F);
	}

	return '&#' . $codepoint . ';';
}
|
||||||
|
|
||||||
|
/**
* Convert Numeric Character References to UTF-8 chars
*
* Notes:
*	- we do not convert NCRs recursively, if you pass &#38;#38; it will return &#38;
*	- we DO NOT check for the existence of the Unicode characters, therefore an entity
*	  may be converted to an inexistent codepoint
*	- references above U+10FFFF or inside the surrogate range U+D800..U+DFFF cannot be
*	  represented in UTF-8 and are left untouched
*
* @param	string	$text		String to convert, encoded in UTF-8 (no normal form required)
* @return	string				UTF-8 string where NCRs have been replaced with the actual chars
*/
function utf8_decode_ncr($text)
{
	// U+10FFFF is 1114111 in decimal: 7 decimal or 6 hexadecimal digits are required to
	// reach the top of the Unicode range. The callback rejects anything beyond it.
	return preg_replace_callback('/&#([0-9]{1,7}|x[0-9A-F]{1,6});/i', 'utf8_decode_ncr_callback', $text);
}

/**
* Callback used in utf8_decode_ncr()
*
* Takes a NCR (in decimal or hexadecimal) and returns a UTF-8 char. Attention, $m is an array.
* NCRs that do not designate a Unicode scalar value are returned unchanged; unassigned
* codepoints are still converted.
*
* @param	array	$m			0-based numerically indexed array passed by preg_replace_callback()
* @return	string				UTF-8 char, or the original NCR if it cannot be encoded
*/
function utf8_decode_ncr_callback($m)
{
	// strncasecmp() returns 0 when the reference starts with "x", i.e. when it is hexadecimal
	$cp = (strncasecmp($m[1], 'x', 1)) ? (int) $m[1] : hexdec(substr($m[1], 1));

	// Not encodable as well-formed UTF-8: keep the reference as it was written
	if ($cp > 0x10FFFF || ($cp >= 0xD800 && $cp <= 0xDFFF))
	{
		return $m[0];
	}

	if ($cp > 0xFFFF)
	{
		return chr(0xF0 | ($cp >> 18)) . chr(0x80 | (($cp >> 12) & 0x3F)) . chr(0x80 | (($cp >> 6) & 0x3F)) . chr(0x80 | ($cp & 0x3F));
	}
	elseif ($cp > 0x7FF)
	{
		return chr(0xE0 | ($cp >> 12)) . chr(0x80 | (($cp >> 6) & 0x3F)) . chr(0x80 | ($cp & 0x3F));
	}
	elseif ($cp > 0x7F)
	{
		return chr(0xC0 | ($cp >> 6)) . chr(0x80 | ($cp & 0x3F));
	}
	else
	{
		return chr($cp);
	}
}
|
||||||
|
|
||||||
?>
|
?>
|
Loading…
Add table
Reference in a new issue