mirror of
https://github.com/phpbb/phpbb.git
synced 2025-06-28 22:28:51 +00:00
Merge pull request #1880 from nicolas-grekas/develop
[3.2] Rely on Intl and mbstring, use patchwork/utf8 as fallback * nicolas-grekas/develop: [feature/patchwork-utf8] Rely on mbstring, use patchwork/utf8 as fallback [feature/patchwork-utf8] Remove utf8_str_replace [feature/patchwork-utf8] Normalize with intl, use patchwork/utf8 as fallback
This commit is contained in:
commit
16de388437
21 changed files with 180 additions and 3452 deletions
|
@ -27,6 +27,7 @@
|
||||||
"require": {
|
"require": {
|
||||||
"php": ">=5.3.3",
|
"php": ">=5.3.3",
|
||||||
"lusitanian/oauth": "0.2.*",
|
"lusitanian/oauth": "0.2.*",
|
||||||
|
"patchwork/utf8": "1.1.*",
|
||||||
"symfony/config": "2.5.*",
|
"symfony/config": "2.5.*",
|
||||||
"symfony/console": "2.5.*",
|
"symfony/console": "2.5.*",
|
||||||
"symfony/dependency-injection": "2.5.*",
|
"symfony/dependency-injection": "2.5.*",
|
||||||
|
|
56
phpBB/composer.lock
generated
56
phpBB/composer.lock
generated
|
@ -68,6 +68,62 @@
|
||||||
],
|
],
|
||||||
"time": "2013-08-29 21:40:04"
|
"time": "2013-08-29 21:40:04"
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"name": "patchwork/utf8",
|
||||||
|
"version": "v1.1.26",
|
||||||
|
"source": {
|
||||||
|
"type": "git",
|
||||||
|
"url": "https://github.com/tchwork/utf8.git",
|
||||||
|
"reference": "6b8e46603b49ee87ad6bceb314da94cc04ffcdce"
|
||||||
|
},
|
||||||
|
"dist": {
|
||||||
|
"type": "zip",
|
||||||
|
"url": "https://api.github.com/repos/tchwork/utf8/zipball/6b8e46603b49ee87ad6bceb314da94cc04ffcdce",
|
||||||
|
"reference": "6b8e46603b49ee87ad6bceb314da94cc04ffcdce",
|
||||||
|
"shasum": ""
|
||||||
|
},
|
||||||
|
"require": {
|
||||||
|
"lib-pcre": ">=7.3",
|
||||||
|
"php": ">=5.3.0"
|
||||||
|
},
|
||||||
|
"suggest": {
|
||||||
|
"ext-iconv": "Use iconv for best performance",
|
||||||
|
"ext-intl": "Use Intl for best performance",
|
||||||
|
"ext-mbstring": "Use Mbstring for best performance"
|
||||||
|
},
|
||||||
|
"type": "library",
|
||||||
|
"extra": {
|
||||||
|
"branch-alias": {
|
||||||
|
"dev-master": "1.1-dev"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"autoload": {
|
||||||
|
"psr-0": {
|
||||||
|
"Patchwork": "class/",
|
||||||
|
"Normalizer": "class/"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"notification-url": "https://packagist.org/downloads/",
|
||||||
|
"license": [
|
||||||
|
"(Apache-2.0 or GPL-2.0)"
|
||||||
|
],
|
||||||
|
"authors": [
|
||||||
|
{
|
||||||
|
"name": "Nicolas Grekas",
|
||||||
|
"email": "p@tchwork.com"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"description": "Portable and performant UTF-8, Unicode and Grapheme Clusters for PHP",
|
||||||
|
"homepage": "https://github.com/tchwork/utf8",
|
||||||
|
"keywords": [
|
||||||
|
"grapheme",
|
||||||
|
"i18n",
|
||||||
|
"unicode",
|
||||||
|
"utf-8",
|
||||||
|
"utf8"
|
||||||
|
],
|
||||||
|
"time": "2014-11-08 10:13:25"
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"name": "psr/log",
|
"name": "psr/log",
|
||||||
"version": "1.0.0",
|
"version": "1.0.0",
|
||||||
|
|
|
@ -32,262 +32,11 @@ $phpbb_root_path = '../';
|
||||||
$phpEx = substr(strrchr(__FILE__, '.'), 1);
|
$phpEx = substr(strrchr(__FILE__, '.'), 1);
|
||||||
|
|
||||||
echo "Checking for required files\n";
|
echo "Checking for required files\n";
|
||||||
download('http://www.unicode.org/Public/UNIDATA/CompositionExclusions.txt');
|
|
||||||
download('http://www.unicode.org/Public/UNIDATA/DerivedNormalizationProps.txt');
|
|
||||||
download('http://www.unicode.org/Public/UNIDATA/UnicodeData.txt');
|
download('http://www.unicode.org/Public/UNIDATA/UnicodeData.txt');
|
||||||
echo "\n";
|
echo "\n";
|
||||||
|
|
||||||
require_once($phpbb_root_path . 'includes/utf/utf_normalizer.' . $phpEx);
|
|
||||||
$file_contents = array();
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Generate some Hangul/Jamo stuff
|
* Generate the files needed by the search indexer
|
||||||
*/
|
|
||||||
echo "\nGenerating Hangul and Jamo tables\n";
|
|
||||||
for ($i = 0; $i < UNICODE_HANGUL_LCOUNT; ++$i)
|
|
||||||
{
|
|
||||||
$utf_char = cp_to_utf(UNICODE_HANGUL_LBASE + $i);
|
|
||||||
$file_contents['utf_normalizer_common']['utf_jamo_index'][$utf_char] = $i * UNICODE_HANGUL_VCOUNT * UNICODE_HANGUL_TCOUNT + UNICODE_HANGUL_SBASE;
|
|
||||||
$file_contents['utf_normalizer_common']['utf_jamo_type'][$utf_char] = UNICODE_JAMO_L;
|
|
||||||
}
|
|
||||||
|
|
||||||
for ($i = 0; $i < UNICODE_HANGUL_VCOUNT; ++$i)
|
|
||||||
{
|
|
||||||
$utf_char = cp_to_utf(UNICODE_HANGUL_VBASE + $i);
|
|
||||||
$file_contents['utf_normalizer_common']['utf_jamo_index'][$utf_char] = $i * UNICODE_HANGUL_TCOUNT;
|
|
||||||
$file_contents['utf_normalizer_common']['utf_jamo_type'][$utf_char] = UNICODE_JAMO_V;
|
|
||||||
}
|
|
||||||
|
|
||||||
for ($i = 0; $i < UNICODE_HANGUL_TCOUNT; ++$i)
|
|
||||||
{
|
|
||||||
$utf_char = cp_to_utf(UNICODE_HANGUL_TBASE + $i);
|
|
||||||
$file_contents['utf_normalizer_common']['utf_jamo_index'][$utf_char] = $i;
|
|
||||||
$file_contents['utf_normalizer_common']['utf_jamo_type'][$utf_char] = UNICODE_JAMO_T;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Load the CompositionExclusions table
|
|
||||||
*/
|
|
||||||
echo "Loading CompositionExclusion\n";
|
|
||||||
$fp = fopen('CompositionExclusions.txt', 'rt');
|
|
||||||
|
|
||||||
$exclude = array();
|
|
||||||
while (!feof($fp))
|
|
||||||
{
|
|
||||||
$line = fgets($fp, 1024);
|
|
||||||
|
|
||||||
if (!strpos(' 0123456789ABCDEFabcdef', $line[0]))
|
|
||||||
{
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
$cp = strtok($line, ' ');
|
|
||||||
|
|
||||||
if ($pos = strpos($cp, '..'))
|
|
||||||
{
|
|
||||||
$start = hexdec(substr($cp, 0, $pos));
|
|
||||||
$end = hexdec(substr($cp, $pos + 2));
|
|
||||||
|
|
||||||
for ($i = $start; $i < $end; ++$i)
|
|
||||||
{
|
|
||||||
$exclude[$i] = 1;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
$exclude[hexdec($cp)] = 1;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
fclose($fp);
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Load QuickCheck tables
|
|
||||||
*/
|
|
||||||
echo "Generating QuickCheck tables\n";
|
|
||||||
$fp = fopen('DerivedNormalizationProps.txt', 'rt');
|
|
||||||
|
|
||||||
while (!feof($fp))
|
|
||||||
{
|
|
||||||
$line = fgets($fp, 1024);
|
|
||||||
|
|
||||||
if (!strpos(' 0123456789ABCDEFabcdef', $line[0]))
|
|
||||||
{
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
$p = array_map('trim', explode(';', strtok($line, '#')));
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Capture only NFC_QC, NFKC_QC
|
|
||||||
*/
|
|
||||||
if (!preg_match('#^NFK?C_QC$#', $p[1]))
|
|
||||||
{
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
if ($pos = strpos($p[0], '..'))
|
|
||||||
{
|
|
||||||
$start = hexdec(substr($p[0], 0, $pos));
|
|
||||||
$end = hexdec(substr($p[0], $pos + 2));
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
$start = $end = hexdec($p[0]);
|
|
||||||
}
|
|
||||||
|
|
||||||
if ($start >= UTF8_HANGUL_FIRST && $end <= UTF8_HANGUL_LAST)
|
|
||||||
{
|
|
||||||
/**
|
|
||||||
* We do not store Hangul syllables in the array
|
|
||||||
*/
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
if ($p[2] == 'M')
|
|
||||||
{
|
|
||||||
$val = UNICODE_QC_MAYBE;
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
$val = UNICODE_QC_NO;
|
|
||||||
}
|
|
||||||
|
|
||||||
if ($p[1] == 'NFKC_QC')
|
|
||||||
{
|
|
||||||
$file = 'utf_nfkc_qc';
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
$file = 'utf_nfc_qc';
|
|
||||||
}
|
|
||||||
|
|
||||||
for ($i = $start; $i <= $end; ++$i)
|
|
||||||
{
|
|
||||||
/**
|
|
||||||
* The vars have the same name as the file: $utf_nfc_qc is in utf_nfc_qc.php
|
|
||||||
*/
|
|
||||||
$file_contents[$file][$file][cp_to_utf($i)] = $val;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
fclose($fp);
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Do mappings
|
|
||||||
*/
|
|
||||||
echo "Loading Unicode decomposition mappings\n";
|
|
||||||
$fp = fopen($phpbb_root_path . 'develop/UnicodeData.txt', 'rt');
|
|
||||||
|
|
||||||
$map = array();
|
|
||||||
while (!feof($fp))
|
|
||||||
{
|
|
||||||
$p = explode(';', fgets($fp, 1024));
|
|
||||||
$cp = hexdec($p[0]);
|
|
||||||
|
|
||||||
if (!empty($p[3]))
|
|
||||||
{
|
|
||||||
/**
|
|
||||||
* Store combining class > 0
|
|
||||||
*/
|
|
||||||
$file_contents['utf_normalizer_common']['utf_combining_class'][cp_to_utf($cp)] = (int) $p[3];
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!isset($p[5]) || !preg_match_all('#[0-9A-F]+#', strip_tags($p[5]), $m))
|
|
||||||
{
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (strpos($p[5], '>'))
|
|
||||||
{
|
|
||||||
$map['NFKD'][$cp] = implode(' ', array_map('hexdec', $m[0]));
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
$map['NFD'][$cp] = $map['NFKD'][$cp] = implode(' ', array_map('hexdec', $m[0]));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
fclose($fp);
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Build the canonical composition table
|
|
||||||
*/
|
|
||||||
echo "Generating the Canonical Composition table\n";
|
|
||||||
foreach ($map['NFD'] as $cp => $decomp_seq)
|
|
||||||
{
|
|
||||||
if (!strpos($decomp_seq, ' ') || isset($exclude[$cp]))
|
|
||||||
{
|
|
||||||
/**
|
|
||||||
* Singletons are excluded from canonical composition
|
|
||||||
*/
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
$utf_seq = implode('', array_map('cp_to_utf', explode(' ', $decomp_seq)));
|
|
||||||
|
|
||||||
if (!isset($file_contents['utf_canonical_comp']['utf_canonical_comp'][$utf_seq]))
|
|
||||||
{
|
|
||||||
$file_contents['utf_canonical_comp']['utf_canonical_comp'][$utf_seq] = cp_to_utf($cp);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Decompose the NF[K]D mappings recursively and prepare the file contents
|
|
||||||
*/
|
|
||||||
echo "Generating the Canonical and Compatibility Decomposition tables\n\n";
|
|
||||||
foreach ($map as $type => $decomp_map)
|
|
||||||
{
|
|
||||||
foreach ($decomp_map as $cp => $decomp_seq)
|
|
||||||
{
|
|
||||||
$decomp_map[$cp] = decompose($decomp_map, $decomp_seq);
|
|
||||||
}
|
|
||||||
unset($decomp_seq);
|
|
||||||
|
|
||||||
if ($type == 'NFKD')
|
|
||||||
{
|
|
||||||
$file = 'utf_compatibility_decomp';
|
|
||||||
$var = 'utf_compatibility_decomp';
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
$file = 'utf_canonical_decomp';
|
|
||||||
$var = 'utf_canonical_decomp';
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Generate the corresponding file
|
|
||||||
*/
|
|
||||||
foreach ($decomp_map as $cp => $decomp_seq)
|
|
||||||
{
|
|
||||||
$file_contents[$file][$var][cp_to_utf($cp)] = implode('', array_map('cp_to_utf', explode(' ', $decomp_seq)));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Generate and/or alter the files
|
|
||||||
*/
|
|
||||||
foreach ($file_contents as $file => $contents)
|
|
||||||
{
|
|
||||||
/**
|
|
||||||
* Generate a new file
|
|
||||||
*/
|
|
||||||
echo "Writing to $file.$phpEx\n";
|
|
||||||
|
|
||||||
if (!$fp = fopen($phpbb_root_path . 'includes/utf/data/' . $file . '.' . $phpEx, 'wb'))
|
|
||||||
{
|
|
||||||
trigger_error('Cannot open ' . $file . ' for write');
|
|
||||||
}
|
|
||||||
|
|
||||||
fwrite($fp, '<?php');
|
|
||||||
foreach ($contents as $var => $val)
|
|
||||||
{
|
|
||||||
fwrite($fp, "\n\$GLOBALS[" . my_var_export($var) . ']=' . my_var_export($val) . ";");
|
|
||||||
}
|
|
||||||
fclose($fp);
|
|
||||||
}
|
|
||||||
|
|
||||||
echo "\n*** UTF-8 normalization tables done\n\n";
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Now we'll generate the files needed by the search indexer
|
|
||||||
*/
|
*/
|
||||||
echo "Generating search indexer tables\n";
|
echo "Generating search indexer tables\n";
|
||||||
|
|
||||||
|
@ -424,32 +173,6 @@ die("\nAll done!\n");
|
||||||
// Internal functions //
|
// Internal functions //
|
||||||
////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
|
||||||
/**
|
|
||||||
* Decompose a sequence recursively
|
|
||||||
*
|
|
||||||
* @param array $decomp_map Decomposition mapping, passed by reference
|
|
||||||
* @param string $decomp_seq Decomposition sequence as decimal codepoints separated with a space
|
|
||||||
* @return string Decomposition sequence, fully decomposed
|
|
||||||
*/
|
|
||||||
function decompose(&$decomp_map, $decomp_seq)
|
|
||||||
{
|
|
||||||
$ret = array();
|
|
||||||
foreach (explode(' ', $decomp_seq) as $cp)
|
|
||||||
{
|
|
||||||
if (isset($decomp_map[$cp]))
|
|
||||||
{
|
|
||||||
$ret[] = decompose($decomp_map, $decomp_map[$cp]);
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
$ret[] = $cp;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return implode(' ', $ret);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Return a parsable string representation of a variable
|
* Return a parsable string representation of a variable
|
||||||
*
|
*
|
||||||
|
@ -537,17 +260,6 @@ function hex_to_utf($hex)
|
||||||
return cp_to_utf(hexdec($hex));
|
return cp_to_utf(hexdec($hex));
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* Return a UTF string formed from a sequence of codepoints in hexadecimal
|
|
||||||
*
|
|
||||||
* @param string $seq Sequence of codepoints, separated with a space
|
|
||||||
* @return string UTF-8 string
|
|
||||||
*/
|
|
||||||
function hexseq_to_utf($seq)
|
|
||||||
{
|
|
||||||
return implode('', array_map('hex_to_utf', explode(' ', $seq)));
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Convert a codepoint to a UTF-8 char
|
* Convert a codepoint to a UTF-8 char
|
||||||
*
|
*
|
||||||
|
|
|
@ -81,38 +81,3 @@ function utf8_to_unicode_callback($m)
|
||||||
{
|
{
|
||||||
return '\u' . str_pad(base_convert(utf8_ord($m[0]), 10, 16), 4, '0', STR_PAD_LEFT) . '';
|
return '\u' . str_pad(base_convert(utf8_ord($m[0]), 10, 16), 4, '0', STR_PAD_LEFT) . '';
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* A wrapper function for the normalizer which takes care of including the class if required and modifies the passed strings
|
|
||||||
* to be in NFKC
|
|
||||||
*
|
|
||||||
* @param mixed $strings a string or an array of strings to normalize
|
|
||||||
* @return mixed the normalized content, preserving array keys if array given.
|
|
||||||
*/
|
|
||||||
function utf8_normalize_nfkc($strings)
|
|
||||||
{
|
|
||||||
if (empty($strings))
|
|
||||||
{
|
|
||||||
return $strings;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!class_exists('utf_normalizer'))
|
|
||||||
{
|
|
||||||
global $phpbb_root_path, $phpEx;
|
|
||||||
include($phpbb_root_path . 'includes/utf/utf_normalizer.' . $phpEx);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!is_array($strings))
|
|
||||||
{
|
|
||||||
utf_normalizer::nfkc($strings);
|
|
||||||
}
|
|
||||||
else if (is_array($strings))
|
|
||||||
{
|
|
||||||
foreach ($strings as $key => $string)
|
|
||||||
{
|
|
||||||
utf_normalizer::nfkc($strings[$key]);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return $strings;
|
|
||||||
}
|
|
||||||
|
|
|
@ -1,394 +0,0 @@
|
||||||
<?php
|
|
||||||
/**
|
|
||||||
*
|
|
||||||
* This file is part of the phpBB Forum Software package.
|
|
||||||
*
|
|
||||||
* @copyright (c) phpBB Limited <https://www.phpbb.com>
|
|
||||||
* @license GNU General Public License, version 2 (GPL-2.0)
|
|
||||||
*
|
|
||||||
* For full copyright and license information, please see
|
|
||||||
* the docs/CREDITS.txt file.
|
|
||||||
*
|
|
||||||
*/
|
|
||||||
|
|
||||||
if (php_sapi_name() != 'cli')
|
|
||||||
{
|
|
||||||
die("This program must be run from the command line.\n");
|
|
||||||
}
|
|
||||||
|
|
||||||
//
|
|
||||||
// Security message:
|
|
||||||
//
|
|
||||||
// This script is potentially dangerous.
|
|
||||||
// Remove or comment the next line (die(".... ) to enable this script.
|
|
||||||
// Do NOT FORGET to either remove this script or disable it after you have used it.
|
|
||||||
//
|
|
||||||
die("Please read the first lines of this script for instructions on how to enable it");
|
|
||||||
|
|
||||||
set_time_limit(0);
|
|
||||||
error_reporting(E_ALL);
|
|
||||||
|
|
||||||
define('IN_PHPBB', true);
|
|
||||||
$phpbb_root_path = '../';
|
|
||||||
$phpEx = substr(strrchr(__FILE__, '.'), 1);
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Let's download some files we need
|
|
||||||
*/
|
|
||||||
download('http://www.unicode.org/Public/UNIDATA/NormalizationTest.txt');
|
|
||||||
download('http://www.unicode.org/Public/UNIDATA/UnicodeData.txt');
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Those are the tests we run
|
|
||||||
*/
|
|
||||||
$test_suite = array(
|
|
||||||
/**
|
|
||||||
* NFC
|
|
||||||
* c2 == NFC(c1) == NFC(c2) == NFC(c3)
|
|
||||||
* c4 == NFC(c4) == NFC(c5)
|
|
||||||
*/
|
|
||||||
'NFC' => array(
|
|
||||||
'c2' => array('c1', 'c2', 'c3'),
|
|
||||||
'c4' => array('c4', 'c5')
|
|
||||||
),
|
|
||||||
|
|
||||||
/**
|
|
||||||
* NFD
|
|
||||||
* c3 == NFD(c1) == NFD(c2) == NFD(c3)
|
|
||||||
* c5 == NFD(c4) == NFD(c5)
|
|
||||||
*/
|
|
||||||
'NFD' => array(
|
|
||||||
'c3' => array('c1', 'c2', 'c3'),
|
|
||||||
'c5' => array('c4', 'c5')
|
|
||||||
),
|
|
||||||
|
|
||||||
/**
|
|
||||||
* NFKC
|
|
||||||
* c4 == NFKC(c1) == NFKC(c2) == NFKC(c3) == NFKC(c4) == NFKC(c5)
|
|
||||||
*/
|
|
||||||
'NFKC' => array(
|
|
||||||
'c4' => array('c1', 'c2', 'c3', 'c4', 'c5')
|
|
||||||
),
|
|
||||||
|
|
||||||
/**
|
|
||||||
* NFKD
|
|
||||||
* c5 == NFKD(c1) == NFKD(c2) == NFKD(c3) == NFKD(c4) == NFKD(c5)
|
|
||||||
*/
|
|
||||||
'NFKD' => array(
|
|
||||||
'c5' => array('c1', 'c2', 'c3', 'c4', 'c5')
|
|
||||||
)
|
|
||||||
);
|
|
||||||
|
|
||||||
require_once($phpbb_root_path . 'includes/utf/utf_normalizer.' . $phpEx);
|
|
||||||
|
|
||||||
$i = $n = 0;
|
|
||||||
$failed = false;
|
|
||||||
$tested_chars = array();
|
|
||||||
|
|
||||||
$fp = fopen($phpbb_root_path . 'develop/NormalizationTest.txt', 'rb');
|
|
||||||
while (!feof($fp))
|
|
||||||
{
|
|
||||||
$line = fgets($fp);
|
|
||||||
++$n;
|
|
||||||
|
|
||||||
if ($line[0] == '@')
|
|
||||||
{
|
|
||||||
if ($i)
|
|
||||||
{
|
|
||||||
echo "done\n";
|
|
||||||
}
|
|
||||||
|
|
||||||
$i = 0;
|
|
||||||
echo "\n", substr($line, 1), "\n\n";
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!strpos(' 0123456789ABCDEF', $line[0]))
|
|
||||||
{
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (++$i % 100 == 0)
|
|
||||||
{
|
|
||||||
echo $i, ' ';
|
|
||||||
}
|
|
||||||
|
|
||||||
list($c1, $c2, $c3, $c4, $c5) = explode(';', $line);
|
|
||||||
|
|
||||||
if (!strpos($c1, ' '))
|
|
||||||
{
|
|
||||||
/**
|
|
||||||
* We are currently testing a single character, we add it to the list of
|
|
||||||
* characters we have processed so that we can exclude it when testing
|
|
||||||
* for invariants
|
|
||||||
*/
|
|
||||||
$tested_chars[$c1] = 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
foreach ($test_suite as $form => $serie)
|
|
||||||
{
|
|
||||||
foreach ($serie as $expected => $tests)
|
|
||||||
{
|
|
||||||
$hex_expected = ${$expected};
|
|
||||||
$utf_expected = hexseq_to_utf($hex_expected);
|
|
||||||
|
|
||||||
foreach ($tests as $test)
|
|
||||||
{
|
|
||||||
$utf_result = $utf_expected;
|
|
||||||
call_user_func(array('utf_normalizer', $form), $utf_result);
|
|
||||||
|
|
||||||
if (strcmp($utf_expected, $utf_result))
|
|
||||||
{
|
|
||||||
$failed = true;
|
|
||||||
$hex_result = utf_to_hexseq($utf_result);
|
|
||||||
|
|
||||||
echo "\nFAILED $expected == $form($test) ($hex_expected != $hex_result)";
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if ($failed)
|
|
||||||
{
|
|
||||||
die("\n\nFailed at line $n\n");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
fclose($fp);
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Test for invariants
|
|
||||||
*/
|
|
||||||
echo "\n\nTesting for invariants...\n\n";
|
|
||||||
|
|
||||||
$fp = fopen($phpbb_root_path . 'develop/UnicodeData.txt', 'rt');
|
|
||||||
|
|
||||||
$n = 0;
|
|
||||||
while (!feof($fp))
|
|
||||||
{
|
|
||||||
if (++$n % 100 == 0)
|
|
||||||
{
|
|
||||||
echo $n, ' ';
|
|
||||||
}
|
|
||||||
|
|
||||||
$line = fgets($fp, 1024);
|
|
||||||
|
|
||||||
if (!$pos = strpos($line, ';'))
|
|
||||||
{
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
$hex_tested = $hex_expected = substr($line, 0, $pos);
|
|
||||||
|
|
||||||
if (isset($tested_chars[$hex_tested]))
|
|
||||||
{
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
$utf_expected = hex_to_utf($hex_expected);
|
|
||||||
|
|
||||||
if ($utf_expected >= UTF8_SURROGATE_FIRST
|
|
||||||
&& $utf_expected <= UTF8_SURROGATE_LAST)
|
|
||||||
{
|
|
||||||
/**
|
|
||||||
* Surrogates are illegal on their own, we expect the normalizer
|
|
||||||
* to return a replacement char
|
|
||||||
*/
|
|
||||||
$utf_expected = UTF8_REPLACEMENT;
|
|
||||||
$hex_expected = utf_to_hexseq($utf_expected);
|
|
||||||
}
|
|
||||||
|
|
||||||
foreach (array('nfc', 'nfkc', 'nfd', 'nfkd') as $form)
|
|
||||||
{
|
|
||||||
$utf_result = $utf_expected;
|
|
||||||
utf_normalizer::$form($utf_result);
|
|
||||||
$hex_result = utf_to_hexseq($utf_result);
|
|
||||||
// echo "$form($utf_expected) == $utf_result\n";
|
|
||||||
|
|
||||||
if (strcmp($utf_expected, $utf_result))
|
|
||||||
{
|
|
||||||
$failed = 1;
|
|
||||||
|
|
||||||
echo "\nFAILED $hex_expected == $form($hex_tested) ($hex_expected != $hex_result)";
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if ($failed)
|
|
||||||
{
|
|
||||||
die("\n\nFailed at line $n\n");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
fclose($fp);
|
|
||||||
|
|
||||||
die("\n\nALL TESTS PASSED SUCCESSFULLY\n");
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Download a file to the develop/ dir
|
|
||||||
*
|
|
||||||
* @param string $url URL of the file to download
|
|
||||||
* @return null
|
|
||||||
*/
|
|
||||||
function download($url)
|
|
||||||
{
|
|
||||||
global $phpbb_root_path;
|
|
||||||
|
|
||||||
if (file_exists($phpbb_root_path . 'develop/' . basename($url)))
|
|
||||||
{
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
echo 'Downloading from ', $url, ' ';
|
|
||||||
|
|
||||||
if (!$fpr = fopen($url, 'rb'))
|
|
||||||
{
|
|
||||||
die("Can't download from $url\nPlease download it yourself and put it in the develop/ dir, kthxbai");
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!$fpw = fopen($phpbb_root_path . 'develop/' . basename($url), 'wb'))
|
|
||||||
{
|
|
||||||
die("Can't open develop/" . basename($url) . " for output... please check your permissions or something");
|
|
||||||
}
|
|
||||||
|
|
||||||
$i = 0;
|
|
||||||
$chunk = 32768;
|
|
||||||
$done = '';
|
|
||||||
|
|
||||||
while (!feof($fpr))
|
|
||||||
{
|
|
||||||
$i += fwrite($fpw, fread($fpr, $chunk));
|
|
||||||
echo str_repeat("\x08", strlen($done));
|
|
||||||
|
|
||||||
$done = ($i >> 10) . ' KiB';
|
|
||||||
echo $done;
|
|
||||||
}
|
|
||||||
fclose($fpr);
|
|
||||||
fclose($fpw);
|
|
||||||
|
|
||||||
echo "\n";
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Convert a UTF string to a sequence of codepoints in hexadecimal
|
|
||||||
*
|
|
||||||
* @param string $str UTF string
|
|
||||||
* @return string Unicode codepoints in uppercase hexadecimal, separated with a space
|
|
||||||
*/
|
|
||||||
function utf_to_hexseq($str)
|
|
||||||
{
|
|
||||||
$pos = 0;
|
|
||||||
$len = strlen($str);
|
|
||||||
$ret = array();
|
|
||||||
|
|
||||||
while ($pos < $len)
|
|
||||||
{
|
|
||||||
$c = $str[$pos];
|
|
||||||
switch ($c & "\xF0")
|
|
||||||
{
|
|
||||||
case "\xC0":
|
|
||||||
case "\xD0":
|
|
||||||
$utf_char = substr($str, $pos, 2);
|
|
||||||
$pos += 2;
|
|
||||||
break;
|
|
||||||
|
|
||||||
case "\xE0":
|
|
||||||
$utf_char = substr($str, $pos, 3);
|
|
||||||
$pos += 3;
|
|
||||||
break;
|
|
||||||
|
|
||||||
case "\xF0":
|
|
||||||
$utf_char = substr($str, $pos, 4);
|
|
||||||
$pos += 4;
|
|
||||||
break;
|
|
||||||
|
|
||||||
default:
|
|
||||||
$utf_char = $c;
|
|
||||||
++$pos;
|
|
||||||
}
|
|
||||||
|
|
||||||
$hex = dechex(utf_to_cp($utf_char));
|
|
||||||
|
|
||||||
if (!isset($hex[3]))
|
|
||||||
{
|
|
||||||
$hex = substr('000' . $hex, -4);
|
|
||||||
}
|
|
||||||
|
|
||||||
$ret[] = $hex;
|
|
||||||
}
|
|
||||||
|
|
||||||
return strtr(implode(' ', $ret), 'abcdef', 'ABCDEF');
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Convert a UTF-8 char to its codepoint
|
|
||||||
*
|
|
||||||
* @param string $utf_char UTF-8 char
|
|
||||||
* @return integer Unicode codepoint
|
|
||||||
*/
|
|
||||||
function utf_to_cp($utf_char)
|
|
||||||
{
|
|
||||||
switch (strlen($utf_char))
|
|
||||||
{
|
|
||||||
case 1:
|
|
||||||
return ord($utf_char);
|
|
||||||
|
|
||||||
case 2:
|
|
||||||
return ((ord($utf_char[0]) & 0x1F) << 6) | (ord($utf_char[1]) & 0x3F);
|
|
||||||
|
|
||||||
case 3:
|
|
||||||
return ((ord($utf_char[0]) & 0x0F) << 12) | ((ord($utf_char[1]) & 0x3F) << 6) | (ord($utf_char[2]) & 0x3F);
|
|
||||||
|
|
||||||
case 4:
|
|
||||||
return ((ord($utf_char[0]) & 0x07) << 18) | ((ord($utf_char[1]) & 0x3F) << 12) | ((ord($utf_char[2]) & 0x3F) << 6) | (ord($utf_char[3]) & 0x3F);
|
|
||||||
|
|
||||||
default:
|
|
||||||
die('UTF-8 chars can only be 1-4 bytes long');
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Return a UTF string formed from a sequence of codepoints in hexadecimal
|
|
||||||
*
|
|
||||||
* @param string $seq Sequence of codepoints, separated with a space
|
|
||||||
* @return string UTF-8 string
|
|
||||||
*/
|
|
||||||
function hexseq_to_utf($seq)
|
|
||||||
{
|
|
||||||
return implode('', array_map('hex_to_utf', explode(' ', $seq)));
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Convert a codepoint in hexadecimal to a UTF-8 char
|
|
||||||
*
|
|
||||||
* @param string $hex Codepoint, in hexadecimal
|
|
||||||
* @return string UTF-8 char
|
|
||||||
*/
|
|
||||||
function hex_to_utf($hex)
|
|
||||||
{
|
|
||||||
return cp_to_utf(hexdec($hex));
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Convert a codepoint to a UTF-8 char
|
|
||||||
*
|
|
||||||
* @param integer $cp Unicode codepoint
|
|
||||||
* @return string UTF-8 string
|
|
||||||
*/
|
|
||||||
function cp_to_utf($cp)
|
|
||||||
{
|
|
||||||
if ($cp > 0xFFFF)
|
|
||||||
{
|
|
||||||
return chr(0xF0 | ($cp >> 18)) . chr(0x80 | (($cp >> 12) & 0x3F)) . chr(0x80 | (($cp >> 6) & 0x3F)) . chr(0x80 | ($cp & 0x3F));
|
|
||||||
}
|
|
||||||
else if ($cp > 0x7FF)
|
|
||||||
{
|
|
||||||
return chr(0xE0 | ($cp >> 12)) . chr(0x80 | (($cp >> 6) & 0x3F)) . chr(0x80 | ($cp & 0x3F));
|
|
||||||
}
|
|
||||||
else if ($cp > 0x7F)
|
|
||||||
{
|
|
||||||
return chr(0xC0 | ($cp >> 6)) . chr(0x80 | ($cp & 0x3F));
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
return chr($cp);
|
|
||||||
}
|
|
||||||
}
|
|
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
|
@ -1,4 +0,0 @@
|
||||||
<?php
|
|
||||||
$GLOBALS['utf_jamo_index']=array('ᄀ'=>44032,'ᄁ'=>44620,'ᄂ'=>45208,'ᄃ'=>45796,'ᄄ'=>46384,'ᄅ'=>46972,'ᄆ'=>47560,'ᄇ'=>48148,'ᄈ'=>48736,'ᄉ'=>49324,'ᄊ'=>49912,'ᄋ'=>50500,'ᄌ'=>51088,'ᄍ'=>51676,'ᄎ'=>52264,'ᄏ'=>52852,'ᄐ'=>53440,'ᄑ'=>54028,'ᄒ'=>54616,'ᅡ'=>0,'ᅢ'=>28,'ᅣ'=>56,'ᅤ'=>84,'ᅥ'=>112,'ᅦ'=>140,'ᅧ'=>168,'ᅨ'=>196,'ᅩ'=>224,'ᅪ'=>252,'ᅫ'=>280,'ᅬ'=>308,'ᅭ'=>336,'ᅮ'=>364,'ᅯ'=>392,'ᅰ'=>420,'ᅱ'=>448,'ᅲ'=>476,'ᅳ'=>504,'ᅴ'=>532,'ᅵ'=>560,'ᆧ'=>0,'ᆨ'=>1,'ᆩ'=>2,'ᆪ'=>3,'ᆫ'=>4,'ᆬ'=>5,'ᆭ'=>6,'ᆮ'=>7,'ᆯ'=>8,'ᆰ'=>9,'ᆱ'=>10,'ᆲ'=>11,'ᆳ'=>12,'ᆴ'=>13,'ᆵ'=>14,'ᆶ'=>15,'ᆷ'=>16,'ᆸ'=>17,'ᆹ'=>18,'ᆺ'=>19,'ᆻ'=>20,'ᆼ'=>21,'ᆽ'=>22,'ᆾ'=>23,'ᆿ'=>24,'ᇀ'=>25,'ᇁ'=>26,'ᇂ'=>27);
|
|
||||||
$GLOBALS['utf_jamo_type']=array('ᄀ'=>0,'ᄁ'=>0,'ᄂ'=>0,'ᄃ'=>0,'ᄄ'=>0,'ᄅ'=>0,'ᄆ'=>0,'ᄇ'=>0,'ᄈ'=>0,'ᄉ'=>0,'ᄊ'=>0,'ᄋ'=>0,'ᄌ'=>0,'ᄍ'=>0,'ᄎ'=>0,'ᄏ'=>0,'ᄐ'=>0,'ᄑ'=>0,'ᄒ'=>0,'ᅡ'=>1,'ᅢ'=>1,'ᅣ'=>1,'ᅤ'=>1,'ᅥ'=>1,'ᅦ'=>1,'ᅧ'=>1,'ᅨ'=>1,'ᅩ'=>1,'ᅪ'=>1,'ᅫ'=>1,'ᅬ'=>1,'ᅭ'=>1,'ᅮ'=>1,'ᅯ'=>1,'ᅰ'=>1,'ᅱ'=>1,'ᅲ'=>1,'ᅳ'=>1,'ᅴ'=>1,'ᅵ'=>1,'ᆧ'=>2,'ᆨ'=>2,'ᆩ'=>2,'ᆪ'=>2,'ᆫ'=>2,'ᆬ'=>2,'ᆭ'=>2,'ᆮ'=>2,'ᆯ'=>2,'ᆰ'=>2,'ᆱ'=>2,'ᆲ'=>2,'ᆳ'=>2,'ᆴ'=>2,'ᆵ'=>2,'ᆶ'=>2,'ᆷ'=>2,'ᆸ'=>2,'ᆹ'=>2,'ᆺ'=>2,'ᆻ'=>2,'ᆼ'=>2,'ᆽ'=>2,'ᆾ'=>2,'ᆿ'=>2,'ᇀ'=>2,'ᇁ'=>2,'ᇂ'=>2);
|
|
||||||
$GLOBALS['utf_combining_class']=array('̀'=>230,'́'=>230,'̂'=>230,'̃'=>230,'̄'=>230,'̅'=>230,'̆'=>230,'̇'=>230,'̈'=>230,'̉'=>230,'̊'=>230,'̋'=>230,'̌'=>230,'̍'=>230,'̎'=>230,'̏'=>230,'̐'=>230,'̑'=>230,'̒'=>230,'̓'=>230,'̔'=>230,'̕'=>232,'̖'=>220,'̗'=>220,'̘'=>220,'̙'=>220,'̚'=>232,'̛'=>216,'̜'=>220,'̝'=>220,'̞'=>220,'̟'=>220,'̠'=>220,'̡'=>202,'̢'=>202,'̣'=>220,'̤'=>220,'̥'=>220,'̦'=>220,'̧'=>202,'̨'=>202,'̩'=>220,'̪'=>220,'̫'=>220,'̬'=>220,'̭'=>220,'̮'=>220,'̯'=>220,'̰'=>220,'̱'=>220,'̲'=>220,'̳'=>220,'̴'=>1,'̵'=>1,'̶'=>1,'̷'=>1,'̸'=>1,'̹'=>220,'̺'=>220,'̻'=>220,'̼'=>220,'̽'=>230,'̾'=>230,'̿'=>230,'̀'=>230,'́'=>230,'͂'=>230,'̓'=>230,'̈́'=>230,'ͅ'=>240,'͆'=>230,'͇'=>220,'͈'=>220,'͉'=>220,'͊'=>230,'͋'=>230,'͌'=>230,'͍'=>220,'͎'=>220,'͐'=>230,'͑'=>230,'͒'=>230,'͓'=>220,'͔'=>220,'͕'=>220,'͖'=>220,'͗'=>230,'͘'=>232,'͙'=>220,'͚'=>220,'͛'=>230,'͜'=>233,'͝'=>234,'͞'=>234,'͟'=>233,'͠'=>234,'͡'=>234,'͢'=>233,'ͣ'=>230,'ͤ'=>230,'ͥ'=>230,'ͦ'=>230,'ͧ'=>230,'ͨ'=>230,'ͩ'=>230,'ͪ'=>230,'ͫ'=>230,'ͬ'=>230,'ͭ'=>230,'ͮ'=>230,'ͯ'=>230,'҃'=>230,'҄'=>230,'҅'=>230,'҆'=>230,'֑'=>220,'֒'=>230,'֓'=>230,'֔'=>230,'֕'=>230,'֖'=>220,'֗'=>230,'֘'=>230,'֙'=>230,'֚'=>222,'֛'=>220,'֜'=>230,'֝'=>230,'֞'=>230,'֟'=>230,'֠'=>230,'֡'=>230,'֢'=>220,'֣'=>220,'֤'=>220,'֥'=>220,'֦'=>220,'֧'=>220,'֨'=>230,'֩'=>230,'֪'=>220,'֫'=>230,'֬'=>230,'֭'=>222,'֮'=>228,'֯'=>230,'ְ'=>10,'ֱ'=>11,'ֲ'=>12,'ֳ'=>13,'ִ'=>14,'ֵ'=>15,'ֶ'=>16,'ַ'=>17,'ָ'=>18,'ֹ'=>19,'ֺ'=>19,'ֻ'=>20,'ּ'=>21,'ֽ'=>22,'ֿ'=>23,'ׁ'=>24,'ׂ'=>25,'ׄ'=>230,'ׅ'=>220,'ׇ'=>18,'ؐ'=>230,'ؑ'=>230,'ؒ'=>230,'ؓ'=>230,'ؔ'=>230,'ؕ'=>230,'ً'=>27,'ٌ'=>28,'ٍ'=>29,'َ'=>30,'ُ'=>31,'ِ'=>32,'ّ'=>33,'ْ'=>34,'ٓ'=>230,'ٔ'=>230,'ٕ'=>220,'ٖ'=>220,'ٗ'=>230,'٘'=>230,'ٙ'=>230,'ٚ'=>230,'ٛ'=>230,'ٜ'=>220,'ٝ'=>230,'ٞ'=>230,'ٰ'=>35,'ۖ'=>230,'ۗ'=>230,'ۘ'=>230,'ۙ'=>230,'ۚ'=>230,'ۛ'=>230,'ۜ'=>230,'۟'=>230,'۠'=>230,'ۡ'=>230,'ۢ'=>230,'ۣ'=>220,'ۤ'=>230,'ۧ'=>230,'ۨ'=>230,'۪'=>220,'۫'=>230,'۬'=>230,'ۭ'=>220,'ܑ'=>36,'ܰ'=>230,'ܱ'=>220,'ܲ'=>230,'ܳ'=>230,'ܴ'=>220,'ܵ'=>230,'ܶ'=>230,'ܷ'=>220,'ܸ'=>220,'
ܹ'=>220,'ܺ'=>230,'ܻ'=>220,'ܼ'=>220,'ܽ'=>230,'ܾ'=>220,'ܿ'=>230,'݀'=>230,'݁'=>230,'݂'=>220,'݃'=>230,'݄'=>220,'݅'=>230,'݆'=>220,'݇'=>230,'݈'=>220,'݉'=>230,'݊'=>230,'߫'=>230,'߬'=>230,'߭'=>230,'߮'=>230,'߯'=>230,'߰'=>230,'߱'=>230,'߲'=>220,'߳'=>230,'़'=>7,'्'=>9,'॑'=>230,'॒'=>220,'॓'=>230,'॔'=>230,'়'=>7,'্'=>9,'਼'=>7,'੍'=>9,'઼'=>7,'્'=>9,'଼'=>7,'୍'=>9,'்'=>9,'్'=>9,'ౕ'=>84,'ౖ'=>91,'಼'=>7,'್'=>9,'്'=>9,'්'=>9,'ุ'=>103,'ู'=>103,'ฺ'=>9,'่'=>107,'้'=>107,'๊'=>107,'๋'=>107,'ຸ'=>118,'ູ'=>118,'່'=>122,'້'=>122,'໊'=>122,'໋'=>122,'༘'=>220,'༙'=>220,'༵'=>220,'༷'=>220,'༹'=>216,'ཱ'=>129,'ི'=>130,'ུ'=>132,'ེ'=>130,'ཻ'=>130,'ོ'=>130,'ཽ'=>130,'ྀ'=>130,'ྂ'=>230,'ྃ'=>230,'྄'=>9,'྆'=>230,'྇'=>230,'࿆'=>220,'့'=>7,'္'=>9,'፟'=>230,'᜔'=>9,'᜴'=>9,'្'=>9,'៝'=>230,'ᢩ'=>228,'᤹'=>222,'᤺'=>230,'᤻'=>220,'ᨗ'=>230,'ᨘ'=>220,'᬴'=>7,'᭄'=>9,'᭫'=>230,'᭬'=>220,'᭭'=>230,'᭮'=>230,'᭯'=>230,'᭰'=>230,'᭱'=>230,'᭲'=>230,'᭳'=>230,'᷀'=>230,'᷁'=>230,'᷂'=>220,'᷃'=>230,'᷄'=>230,'᷅'=>230,'᷆'=>230,'᷇'=>230,'᷈'=>230,'᷉'=>230,'᷊'=>220,'᷾'=>230,'᷿'=>220,'⃐'=>230,'⃑'=>230,'⃒'=>1,'⃓'=>1,'⃔'=>230,'⃕'=>230,'⃖'=>230,'⃗'=>230,'⃘'=>1,'⃙'=>1,'⃚'=>1,'⃛'=>230,'⃜'=>230,'⃡'=>230,'⃥'=>1,'⃦'=>1,'⃧'=>230,'⃨'=>220,'⃩'=>230,'⃪'=>1,'⃫'=>1,'⃬'=>220,'⃭'=>220,'⃮'=>220,'⃯'=>220,'〪'=>218,'〫'=>228,'〬'=>232,'〭'=>222,'〮'=>224,'〯'=>224,'゙'=>8,'゚'=>8,'꠆'=>9,'ﬞ'=>26,'︠'=>230,'︡'=>230,'︢'=>230,'︣'=>230,'𐨍'=>220,'𐨏'=>230,'𐨸'=>230,'𐨹'=>1,'𐨺'=>220,'𐨿'=>9,'𝅥'=>216,'𝅦'=>216,'𝅧'=>1,'𝅨'=>1,'𝅩'=>1,'𝅭'=>226,'𝅮'=>216,'𝅯'=>216,'𝅰'=>216,'𝅱'=>216,'𝅲'=>216,'𝅻'=>220,'𝅼'=>220,'𝅽'=>220,'𝅾'=>220,'𝅿'=>220,'𝆀'=>220,'𝆁'=>220,'𝆂'=>220,'𝆅'=>230,'𝆆'=>230,'𝆇'=>230,'𝆈'=>230,'𝆉'=>230,'𝆊'=>220,'𝆋'=>220,'𝆪'=>230,'𝆫'=>230,'𝆬'=>230,'𝆭'=>230,'𝉂'=>230,'𝉃'=>230,'𝉄'=>230);
|
|
File diff suppressed because it is too large
Load diff
|
@ -21,6 +21,13 @@ if (!defined('IN_PHPBB'))
|
||||||
// Enforce ASCII only string handling
|
// Enforce ASCII only string handling
|
||||||
setlocale(LC_CTYPE, 'C');
|
setlocale(LC_CTYPE, 'C');
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Setup the UTF-8 portability layer
|
||||||
|
*/
|
||||||
|
Patchwork\Utf8\Bootup::initUtf8Encode();
|
||||||
|
Patchwork\Utf8\Bootup::initMbstring();
|
||||||
|
Patchwork\Utf8\Bootup::initIntl();
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* UTF-8 tools
|
* UTF-8 tools
|
||||||
*
|
*
|
||||||
|
@ -29,544 +36,85 @@ setlocale(LC_CTYPE, 'C');
|
||||||
*
|
*
|
||||||
*/
|
*/
|
||||||
|
|
||||||
if (!extension_loaded('xml'))
|
/**
|
||||||
|
* UTF-8 aware alternative to strrpos
|
||||||
|
* @ignore
|
||||||
|
*/
|
||||||
|
function utf8_strrpos($str, $needle, $offset = null)
|
||||||
{
|
{
|
||||||
/**
|
// Emulate behaviour of strrpos rather than raising warning
|
||||||
* Implementation of PHP's native utf8_encode for people without XML support
|
if (empty($str))
|
||||||
* This function exploits some nice things that ISO-8859-1 and UTF-8 have in common
|
|
||||||
*
|
|
||||||
* @param string $str ISO-8859-1 encoded data
|
|
||||||
* @return string UTF-8 encoded data
|
|
||||||
*/
|
|
||||||
function utf8_encode($str)
|
|
||||||
{
|
{
|
||||||
$out = '';
|
return false;
|
||||||
for ($i = 0, $len = strlen($str); $i < $len; $i++)
|
|
||||||
{
|
|
||||||
$letter = $str[$i];
|
|
||||||
$num = ord($letter);
|
|
||||||
if ($num < 0x80)
|
|
||||||
{
|
|
||||||
$out .= $letter;
|
|
||||||
}
|
|
||||||
else if ($num < 0xC0)
|
|
||||||
{
|
|
||||||
$out .= "\xC2" . $letter;
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
$out .= "\xC3" . chr($num - 64);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return $out;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
if (is_null($offset))
|
||||||
* Implementation of PHP's native utf8_decode for people without XML support
|
|
||||||
*
|
|
||||||
* @param string $str UTF-8 encoded data
|
|
||||||
* @return string ISO-8859-1 encoded data
|
|
||||||
*/
|
|
||||||
function utf8_decode($str)
|
|
||||||
{
|
{
|
||||||
$pos = 0;
|
return mb_strrpos($str, $needle);
|
||||||
$len = strlen($str);
|
}
|
||||||
$ret = '';
|
else
|
||||||
|
{
|
||||||
while ($pos < $len)
|
return mb_strrpos($str, $needle, $offset);
|
||||||
{
|
|
||||||
$ord = ord($str[$pos]) & 0xF0;
|
|
||||||
if ($ord === 0xC0 || $ord === 0xD0)
|
|
||||||
{
|
|
||||||
$charval = ((ord($str[$pos]) & 0x1F) << 6) | (ord($str[$pos + 1]) & 0x3F);
|
|
||||||
$pos += 2;
|
|
||||||
$ret .= (($charval < 256) ? chr($charval) : '?');
|
|
||||||
}
|
|
||||||
else if ($ord === 0xE0)
|
|
||||||
{
|
|
||||||
$ret .= '?';
|
|
||||||
$pos += 3;
|
|
||||||
}
|
|
||||||
else if ($ord === 0xF0)
|
|
||||||
{
|
|
||||||
$ret .= '?';
|
|
||||||
$pos += 4;
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
$ret .= $str[$pos];
|
|
||||||
++$pos;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return $ret;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// mbstring is old and has it's functions around for older versions of PHP.
|
/**
|
||||||
// if mbstring is not loaded, we go into native mode.
|
* UTF-8 aware alternative to strpos
|
||||||
if (extension_loaded('mbstring'))
|
* @ignore
|
||||||
|
*/
|
||||||
|
function utf8_strpos($str, $needle, $offset = null)
|
||||||
{
|
{
|
||||||
mb_internal_encoding('UTF-8');
|
if (is_null($offset))
|
||||||
|
|
||||||
/**
|
|
||||||
* UTF-8 aware alternative to strrpos
|
|
||||||
* Find position of last occurrence of a char in a string
|
|
||||||
*/
|
|
||||||
/**
|
|
||||||
* UTF-8 aware alternative to strrpos
|
|
||||||
* @ignore
|
|
||||||
*/
|
|
||||||
function utf8_strrpos($str, $needle, $offset = null)
|
|
||||||
{
|
{
|
||||||
// Emulate behaviour of strrpos rather than raising warning
|
return mb_strpos($str, $needle);
|
||||||
if (empty($str))
|
|
||||||
{
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (is_null($offset))
|
|
||||||
{
|
|
||||||
return mb_strrpos($str, $needle);
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
return mb_strrpos($str, $needle, $offset);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
else
|
||||||
/**
|
|
||||||
* UTF-8 aware alternative to strpos
|
|
||||||
* @ignore
|
|
||||||
*/
|
|
||||||
function utf8_strpos($str, $needle, $offset = null)
|
|
||||||
{
|
{
|
||||||
if (is_null($offset))
|
return mb_strpos($str, $needle, $offset);
|
||||||
{
|
|
||||||
return mb_strpos($str, $needle);
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
return mb_strpos($str, $needle, $offset);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* UTF-8 aware alternative to strtolower
|
|
||||||
* @ignore
|
|
||||||
*/
|
|
||||||
function utf8_strtolower($str)
|
|
||||||
{
|
|
||||||
return mb_strtolower($str);
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* UTF-8 aware alternative to strtoupper
|
|
||||||
* @ignore
|
|
||||||
*/
|
|
||||||
function utf8_strtoupper($str)
|
|
||||||
{
|
|
||||||
return mb_strtoupper($str);
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* UTF-8 aware alternative to substr
|
|
||||||
* @ignore
|
|
||||||
*/
|
|
||||||
function utf8_substr($str, $offset, $length = null)
|
|
||||||
{
|
|
||||||
if (is_null($length))
|
|
||||||
{
|
|
||||||
return mb_substr($str, $offset);
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
return mb_substr($str, $offset, $length);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Return the length (in characters) of a UTF-8 string
|
|
||||||
* @ignore
|
|
||||||
*/
|
|
||||||
function utf8_strlen($text)
|
|
||||||
{
|
|
||||||
return mb_strlen($text, 'utf-8');
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else
|
|
||||||
|
/**
|
||||||
|
* UTF-8 aware alternative to strtolower
|
||||||
|
* @ignore
|
||||||
|
*/
|
||||||
|
function utf8_strtolower($str)
|
||||||
{
|
{
|
||||||
/**
|
return mb_strtolower($str);
|
||||||
* UTF-8 aware alternative to strrpos
|
}
|
||||||
* Find position of last occurrence of a char in a string
|
|
||||||
*
|
/**
|
||||||
* @author Harry Fuecks
|
* UTF-8 aware alternative to strtoupper
|
||||||
* @param string $str haystack
|
* @ignore
|
||||||
* @param string $needle needle
|
*/
|
||||||
* @param integer $offset (optional) offset (from left)
|
function utf8_strtoupper($str)
|
||||||
* @return mixed integer position or FALSE on failure
|
{
|
||||||
*/
|
return mb_strtoupper($str);
|
||||||
function utf8_strrpos($str, $needle, $offset = null)
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* UTF-8 aware alternative to substr
|
||||||
|
* @ignore
|
||||||
|
*/
|
||||||
|
function utf8_substr($str, $offset, $length = null)
|
||||||
|
{
|
||||||
|
if (is_null($length))
|
||||||
{
|
{
|
||||||
if (is_null($offset))
|
return mb_substr($str, $offset);
|
||||||
{
|
|
||||||
$ar = explode($needle, $str);
|
|
||||||
|
|
||||||
if (sizeof($ar) > 1)
|
|
||||||
{
|
|
||||||
// Pop off the end of the string where the last match was made
|
|
||||||
array_pop($ar);
|
|
||||||
$str = join($needle, $ar);
|
|
||||||
|
|
||||||
return utf8_strlen($str);
|
|
||||||
}
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
if (!is_int($offset))
|
|
||||||
{
|
|
||||||
trigger_error('utf8_strrpos expects parameter 3 to be long', E_USER_ERROR);
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
$str = utf8_substr($str, $offset);
|
|
||||||
|
|
||||||
if (false !== ($pos = utf8_strrpos($str, $needle)))
|
|
||||||
{
|
|
||||||
return $pos + $offset;
|
|
||||||
}
|
|
||||||
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
else
|
||||||
/**
|
|
||||||
* UTF-8 aware alternative to strpos
|
|
||||||
* Find position of first occurrence of a string
|
|
||||||
*
|
|
||||||
* @author Harry Fuecks
|
|
||||||
* @param string $str haystack
|
|
||||||
* @param string $needle needle
|
|
||||||
* @param integer $offset offset in characters (from left)
|
|
||||||
* @return mixed integer position or FALSE on failure
|
|
||||||
*/
|
|
||||||
function utf8_strpos($str, $needle, $offset = null)
|
|
||||||
{
|
{
|
||||||
if (is_null($offset))
|
return mb_substr($str, $offset, $length);
|
||||||
{
|
|
||||||
$ar = explode($needle, $str);
|
|
||||||
if (sizeof($ar) > 1)
|
|
||||||
{
|
|
||||||
return utf8_strlen($ar[0]);
|
|
||||||
}
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
if (!is_int($offset))
|
|
||||||
{
|
|
||||||
trigger_error('utf8_strpos: Offset must be an integer', E_USER_ERROR);
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
$str = utf8_substr($str, $offset);
|
|
||||||
|
|
||||||
if (false !== ($pos = utf8_strpos($str, $needle)))
|
|
||||||
{
|
|
||||||
return $pos + $offset;
|
|
||||||
}
|
|
||||||
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* UTF-8 aware alternative to strtolower
|
* Return the length (in characters) of a UTF-8 string
|
||||||
* Make a string lowercase
|
* @ignore
|
||||||
* Note: The concept of a characters "case" only exists is some alphabets
|
*/
|
||||||
* such as Latin, Greek, Cyrillic, Armenian and archaic Georgian - it does
|
function utf8_strlen($text)
|
||||||
* not exist in the Chinese alphabet, for example. See Unicode Standard
|
{
|
||||||
* Annex #21: Case Mappings
|
return mb_strlen($text, 'utf-8');
|
||||||
*
|
|
||||||
* @param string
|
|
||||||
* @return string string in lowercase
|
|
||||||
*/
|
|
||||||
function utf8_strtolower($string)
|
|
||||||
{
|
|
||||||
static $utf8_upper_to_lower = array(
|
|
||||||
"\xC3\x80" => "\xC3\xA0", "\xC3\x81" => "\xC3\xA1",
|
|
||||||
"\xC3\x82" => "\xC3\xA2", "\xC3\x83" => "\xC3\xA3", "\xC3\x84" => "\xC3\xA4", "\xC3\x85" => "\xC3\xA5",
|
|
||||||
"\xC3\x86" => "\xC3\xA6", "\xC3\x87" => "\xC3\xA7", "\xC3\x88" => "\xC3\xA8", "\xC3\x89" => "\xC3\xA9",
|
|
||||||
"\xC3\x8A" => "\xC3\xAA", "\xC3\x8B" => "\xC3\xAB", "\xC3\x8C" => "\xC3\xAC", "\xC3\x8D" => "\xC3\xAD",
|
|
||||||
"\xC3\x8E" => "\xC3\xAE", "\xC3\x8F" => "\xC3\xAF", "\xC3\x90" => "\xC3\xB0", "\xC3\x91" => "\xC3\xB1",
|
|
||||||
"\xC3\x92" => "\xC3\xB2", "\xC3\x93" => "\xC3\xB3", "\xC3\x94" => "\xC3\xB4", "\xC3\x95" => "\xC3\xB5",
|
|
||||||
"\xC3\x96" => "\xC3\xB6", "\xC3\x98" => "\xC3\xB8", "\xC3\x99" => "\xC3\xB9", "\xC3\x9A" => "\xC3\xBA",
|
|
||||||
"\xC3\x9B" => "\xC3\xBB", "\xC3\x9C" => "\xC3\xBC", "\xC3\x9D" => "\xC3\xBD", "\xC3\x9E" => "\xC3\xBE",
|
|
||||||
"\xC4\x80" => "\xC4\x81", "\xC4\x82" => "\xC4\x83", "\xC4\x84" => "\xC4\x85", "\xC4\x86" => "\xC4\x87",
|
|
||||||
"\xC4\x88" => "\xC4\x89", "\xC4\x8A" => "\xC4\x8B", "\xC4\x8C" => "\xC4\x8D", "\xC4\x8E" => "\xC4\x8F",
|
|
||||||
"\xC4\x90" => "\xC4\x91", "\xC4\x92" => "\xC4\x93", "\xC4\x96" => "\xC4\x97", "\xC4\x98" => "\xC4\x99",
|
|
||||||
"\xC4\x9A" => "\xC4\x9B", "\xC4\x9C" => "\xC4\x9D", "\xC4\x9E" => "\xC4\x9F", "\xC4\xA0" => "\xC4\xA1",
|
|
||||||
"\xC4\xA2" => "\xC4\xA3", "\xC4\xA4" => "\xC4\xA5", "\xC4\xA6" => "\xC4\xA7", "\xC4\xA8" => "\xC4\xA9",
|
|
||||||
"\xC4\xAA" => "\xC4\xAB", "\xC4\xAE" => "\xC4\xAF", "\xC4\xB4" => "\xC4\xB5", "\xC4\xB6" => "\xC4\xB7",
|
|
||||||
"\xC4\xB9" => "\xC4\xBA", "\xC4\xBB" => "\xC4\xBC", "\xC4\xBD" => "\xC4\xBE", "\xC5\x81" => "\xC5\x82",
|
|
||||||
"\xC5\x83" => "\xC5\x84", "\xC5\x85" => "\xC5\x86", "\xC5\x87" => "\xC5\x88", "\xC5\x8A" => "\xC5\x8B",
|
|
||||||
"\xC5\x8C" => "\xC5\x8D", "\xC5\x90" => "\xC5\x91", "\xC5\x94" => "\xC5\x95", "\xC5\x96" => "\xC5\x97",
|
|
||||||
"\xC5\x98" => "\xC5\x99", "\xC5\x9A" => "\xC5\x9B", "\xC5\x9C" => "\xC5\x9D", "\xC5\x9E" => "\xC5\x9F",
|
|
||||||
"\xC5\xA0" => "\xC5\xA1", "\xC5\xA2" => "\xC5\xA3", "\xC5\xA4" => "\xC5\xA5", "\xC5\xA6" => "\xC5\xA7",
|
|
||||||
"\xC5\xA8" => "\xC5\xA9", "\xC5\xAA" => "\xC5\xAB", "\xC5\xAC" => "\xC5\xAD", "\xC5\xAE" => "\xC5\xAF",
|
|
||||||
"\xC5\xB0" => "\xC5\xB1", "\xC5\xB2" => "\xC5\xB3", "\xC5\xB4" => "\xC5\xB5", "\xC5\xB6" => "\xC5\xB7",
|
|
||||||
"\xC5\xB8" => "\xC3\xBF", "\xC5\xB9" => "\xC5\xBA", "\xC5\xBB" => "\xC5\xBC", "\xC5\xBD" => "\xC5\xBE",
|
|
||||||
"\xC6\xA0" => "\xC6\xA1", "\xC6\xAF" => "\xC6\xB0", "\xC8\x98" => "\xC8\x99", "\xC8\x9A" => "\xC8\x9B",
|
|
||||||
"\xCE\x86" => "\xCE\xAC", "\xCE\x88" => "\xCE\xAD", "\xCE\x89" => "\xCE\xAE", "\xCE\x8A" => "\xCE\xAF",
|
|
||||||
"\xCE\x8C" => "\xCF\x8C", "\xCE\x8E" => "\xCF\x8D", "\xCE\x8F" => "\xCF\x8E", "\xCE\x91" => "\xCE\xB1",
|
|
||||||
"\xCE\x92" => "\xCE\xB2", "\xCE\x93" => "\xCE\xB3", "\xCE\x94" => "\xCE\xB4", "\xCE\x95" => "\xCE\xB5",
|
|
||||||
"\xCE\x96" => "\xCE\xB6", "\xCE\x97" => "\xCE\xB7", "\xCE\x98" => "\xCE\xB8", "\xCE\x99" => "\xCE\xB9",
|
|
||||||
"\xCE\x9A" => "\xCE\xBA", "\xCE\x9B" => "\xCE\xBB", "\xCE\x9C" => "\xCE\xBC", "\xCE\x9D" => "\xCE\xBD",
|
|
||||||
"\xCE\x9E" => "\xCE\xBE", "\xCE\x9F" => "\xCE\xBF", "\xCE\xA0" => "\xCF\x80", "\xCE\xA1" => "\xCF\x81",
|
|
||||||
"\xCE\xA3" => "\xCF\x83", "\xCE\xA4" => "\xCF\x84", "\xCE\xA5" => "\xCF\x85", "\xCE\xA6" => "\xCF\x86",
|
|
||||||
"\xCE\xA7" => "\xCF\x87", "\xCE\xA8" => "\xCF\x88", "\xCE\xA9" => "\xCF\x89", "\xCE\xAA" => "\xCF\x8A",
|
|
||||||
"\xCE\xAB" => "\xCF\x8B", "\xD0\x81" => "\xD1\x91", "\xD0\x82" => "\xD1\x92", "\xD0\x83" => "\xD1\x93",
|
|
||||||
"\xD0\x84" => "\xD1\x94", "\xD0\x85" => "\xD1\x95", "\xD0\x86" => "\xD1\x96", "\xD0\x87" => "\xD1\x97",
|
|
||||||
"\xD0\x88" => "\xD1\x98", "\xD0\x89" => "\xD1\x99", "\xD0\x8A" => "\xD1\x9A", "\xD0\x8B" => "\xD1\x9B",
|
|
||||||
"\xD0\x8C" => "\xD1\x9C", "\xD0\x8E" => "\xD1\x9E", "\xD0\x8F" => "\xD1\x9F", "\xD0\x90" => "\xD0\xB0",
|
|
||||||
"\xD0\x91" => "\xD0\xB1", "\xD0\x92" => "\xD0\xB2", "\xD0\x93" => "\xD0\xB3", "\xD0\x94" => "\xD0\xB4",
|
|
||||||
"\xD0\x95" => "\xD0\xB5", "\xD0\x96" => "\xD0\xB6", "\xD0\x97" => "\xD0\xB7", "\xD0\x98" => "\xD0\xB8",
|
|
||||||
"\xD0\x99" => "\xD0\xB9", "\xD0\x9A" => "\xD0\xBA", "\xD0\x9B" => "\xD0\xBB", "\xD0\x9C" => "\xD0\xBC",
|
|
||||||
"\xD0\x9D" => "\xD0\xBD", "\xD0\x9E" => "\xD0\xBE", "\xD0\x9F" => "\xD0\xBF", "\xD0\xA0" => "\xD1\x80",
|
|
||||||
"\xD0\xA1" => "\xD1\x81", "\xD0\xA2" => "\xD1\x82", "\xD0\xA3" => "\xD1\x83", "\xD0\xA4" => "\xD1\x84",
|
|
||||||
"\xD0\xA5" => "\xD1\x85", "\xD0\xA6" => "\xD1\x86", "\xD0\xA7" => "\xD1\x87", "\xD0\xA8" => "\xD1\x88",
|
|
||||||
"\xD0\xA9" => "\xD1\x89", "\xD0\xAA" => "\xD1\x8A", "\xD0\xAB" => "\xD1\x8B", "\xD0\xAC" => "\xD1\x8C",
|
|
||||||
"\xD0\xAD" => "\xD1\x8D", "\xD0\xAE" => "\xD1\x8E", "\xD0\xAF" => "\xD1\x8F", "\xD2\x90" => "\xD2\x91",
|
|
||||||
"\xE1\xB8\x82" => "\xE1\xB8\x83", "\xE1\xB8\x8A" => "\xE1\xB8\x8B", "\xE1\xB8\x9E" => "\xE1\xB8\x9F", "\xE1\xB9\x80" => "\xE1\xB9\x81",
|
|
||||||
"\xE1\xB9\x96" => "\xE1\xB9\x97", "\xE1\xB9\xA0" => "\xE1\xB9\xA1", "\xE1\xB9\xAA" => "\xE1\xB9\xAB", "\xE1\xBA\x80" => "\xE1\xBA\x81",
|
|
||||||
"\xE1\xBA\x82" => "\xE1\xBA\x83", "\xE1\xBA\x84" => "\xE1\xBA\x85", "\xE1\xBB\xB2" => "\xE1\xBB\xB3"
|
|
||||||
);
|
|
||||||
|
|
||||||
return strtr(strtolower($string), $utf8_upper_to_lower);
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* UTF-8 aware alternative to strtoupper
|
|
||||||
* Make a string uppercase
|
|
||||||
* Note: The concept of a characters "case" only exists is some alphabets
|
|
||||||
* such as Latin, Greek, Cyrillic, Armenian and archaic Georgian - it does
|
|
||||||
* not exist in the Chinese alphabet, for example. See Unicode Standard
|
|
||||||
* Annex #21: Case Mappings
|
|
||||||
*
|
|
||||||
* @param string
|
|
||||||
* @return string string in uppercase
|
|
||||||
*/
|
|
||||||
function utf8_strtoupper($string)
|
|
||||||
{
|
|
||||||
static $utf8_lower_to_upper = array(
|
|
||||||
"\xC3\xA0" => "\xC3\x80", "\xC3\xA1" => "\xC3\x81",
|
|
||||||
"\xC3\xA2" => "\xC3\x82", "\xC3\xA3" => "\xC3\x83", "\xC3\xA4" => "\xC3\x84", "\xC3\xA5" => "\xC3\x85",
|
|
||||||
"\xC3\xA6" => "\xC3\x86", "\xC3\xA7" => "\xC3\x87", "\xC3\xA8" => "\xC3\x88", "\xC3\xA9" => "\xC3\x89",
|
|
||||||
"\xC3\xAA" => "\xC3\x8A", "\xC3\xAB" => "\xC3\x8B", "\xC3\xAC" => "\xC3\x8C", "\xC3\xAD" => "\xC3\x8D",
|
|
||||||
"\xC3\xAE" => "\xC3\x8E", "\xC3\xAF" => "\xC3\x8F", "\xC3\xB0" => "\xC3\x90", "\xC3\xB1" => "\xC3\x91",
|
|
||||||
"\xC3\xB2" => "\xC3\x92", "\xC3\xB3" => "\xC3\x93", "\xC3\xB4" => "\xC3\x94", "\xC3\xB5" => "\xC3\x95",
|
|
||||||
"\xC3\xB6" => "\xC3\x96", "\xC3\xB8" => "\xC3\x98", "\xC3\xB9" => "\xC3\x99", "\xC3\xBA" => "\xC3\x9A",
|
|
||||||
"\xC3\xBB" => "\xC3\x9B", "\xC3\xBC" => "\xC3\x9C", "\xC3\xBD" => "\xC3\x9D", "\xC3\xBE" => "\xC3\x9E",
|
|
||||||
"\xC3\xBF" => "\xC5\xB8", "\xC4\x81" => "\xC4\x80", "\xC4\x83" => "\xC4\x82", "\xC4\x85" => "\xC4\x84",
|
|
||||||
"\xC4\x87" => "\xC4\x86", "\xC4\x89" => "\xC4\x88", "\xC4\x8B" => "\xC4\x8A", "\xC4\x8D" => "\xC4\x8C",
|
|
||||||
"\xC4\x8F" => "\xC4\x8E", "\xC4\x91" => "\xC4\x90", "\xC4\x93" => "\xC4\x92", "\xC4\x97" => "\xC4\x96",
|
|
||||||
"\xC4\x99" => "\xC4\x98", "\xC4\x9B" => "\xC4\x9A", "\xC4\x9D" => "\xC4\x9C", "\xC4\x9F" => "\xC4\x9E",
|
|
||||||
"\xC4\xA1" => "\xC4\xA0", "\xC4\xA3" => "\xC4\xA2", "\xC4\xA5" => "\xC4\xA4", "\xC4\xA7" => "\xC4\xA6",
|
|
||||||
"\xC4\xA9" => "\xC4\xA8", "\xC4\xAB" => "\xC4\xAA", "\xC4\xAF" => "\xC4\xAE", "\xC4\xB5" => "\xC4\xB4",
|
|
||||||
"\xC4\xB7" => "\xC4\xB6", "\xC4\xBA" => "\xC4\xB9", "\xC4\xBC" => "\xC4\xBB", "\xC4\xBE" => "\xC4\xBD",
|
|
||||||
"\xC5\x82" => "\xC5\x81", "\xC5\x84" => "\xC5\x83", "\xC5\x86" => "\xC5\x85", "\xC5\x88" => "\xC5\x87",
|
|
||||||
"\xC5\x8B" => "\xC5\x8A", "\xC5\x8D" => "\xC5\x8C", "\xC5\x91" => "\xC5\x90", "\xC5\x95" => "\xC5\x94",
|
|
||||||
"\xC5\x97" => "\xC5\x96", "\xC5\x99" => "\xC5\x98", "\xC5\x9B" => "\xC5\x9A", "\xC5\x9D" => "\xC5\x9C",
|
|
||||||
"\xC5\x9F" => "\xC5\x9E", "\xC5\xA1" => "\xC5\xA0", "\xC5\xA3" => "\xC5\xA2", "\xC5\xA5" => "\xC5\xA4",
|
|
||||||
"\xC5\xA7" => "\xC5\xA6", "\xC5\xA9" => "\xC5\xA8", "\xC5\xAB" => "\xC5\xAA", "\xC5\xAD" => "\xC5\xAC",
|
|
||||||
"\xC5\xAF" => "\xC5\xAE", "\xC5\xB1" => "\xC5\xB0", "\xC5\xB3" => "\xC5\xB2", "\xC5\xB5" => "\xC5\xB4",
|
|
||||||
"\xC5\xB7" => "\xC5\xB6", "\xC5\xBA" => "\xC5\xB9", "\xC5\xBC" => "\xC5\xBB", "\xC5\xBE" => "\xC5\xBD",
|
|
||||||
"\xC6\xA1" => "\xC6\xA0", "\xC6\xB0" => "\xC6\xAF", "\xC8\x99" => "\xC8\x98", "\xC8\x9B" => "\xC8\x9A",
|
|
||||||
"\xCE\xAC" => "\xCE\x86", "\xCE\xAD" => "\xCE\x88", "\xCE\xAE" => "\xCE\x89", "\xCE\xAF" => "\xCE\x8A",
|
|
||||||
"\xCE\xB1" => "\xCE\x91", "\xCE\xB2" => "\xCE\x92", "\xCE\xB3" => "\xCE\x93", "\xCE\xB4" => "\xCE\x94",
|
|
||||||
"\xCE\xB5" => "\xCE\x95", "\xCE\xB6" => "\xCE\x96", "\xCE\xB7" => "\xCE\x97", "\xCE\xB8" => "\xCE\x98",
|
|
||||||
"\xCE\xB9" => "\xCE\x99", "\xCE\xBA" => "\xCE\x9A", "\xCE\xBB" => "\xCE\x9B", "\xCE\xBC" => "\xCE\x9C",
|
|
||||||
"\xCE\xBD" => "\xCE\x9D", "\xCE\xBE" => "\xCE\x9E", "\xCE\xBF" => "\xCE\x9F", "\xCF\x80" => "\xCE\xA0",
|
|
||||||
"\xCF\x81" => "\xCE\xA1", "\xCF\x83" => "\xCE\xA3", "\xCF\x84" => "\xCE\xA4", "\xCF\x85" => "\xCE\xA5",
|
|
||||||
"\xCF\x86" => "\xCE\xA6", "\xCF\x87" => "\xCE\xA7", "\xCF\x88" => "\xCE\xA8", "\xCF\x89" => "\xCE\xA9",
|
|
||||||
"\xCF\x8A" => "\xCE\xAA", "\xCF\x8B" => "\xCE\xAB", "\xCF\x8C" => "\xCE\x8C", "\xCF\x8D" => "\xCE\x8E",
|
|
||||||
"\xCF\x8E" => "\xCE\x8F", "\xD0\xB0" => "\xD0\x90", "\xD0\xB1" => "\xD0\x91", "\xD0\xB2" => "\xD0\x92",
|
|
||||||
"\xD0\xB3" => "\xD0\x93", "\xD0\xB4" => "\xD0\x94", "\xD0\xB5" => "\xD0\x95", "\xD0\xB6" => "\xD0\x96",
|
|
||||||
"\xD0\xB7" => "\xD0\x97", "\xD0\xB8" => "\xD0\x98", "\xD0\xB9" => "\xD0\x99", "\xD0\xBA" => "\xD0\x9A",
|
|
||||||
"\xD0\xBB" => "\xD0\x9B", "\xD0\xBC" => "\xD0\x9C", "\xD0\xBD" => "\xD0\x9D", "\xD0\xBE" => "\xD0\x9E",
|
|
||||||
"\xD0\xBF" => "\xD0\x9F", "\xD1\x80" => "\xD0\xA0", "\xD1\x81" => "\xD0\xA1", "\xD1\x82" => "\xD0\xA2",
|
|
||||||
"\xD1\x83" => "\xD0\xA3", "\xD1\x84" => "\xD0\xA4", "\xD1\x85" => "\xD0\xA5", "\xD1\x86" => "\xD0\xA6",
|
|
||||||
"\xD1\x87" => "\xD0\xA7", "\xD1\x88" => "\xD0\xA8", "\xD1\x89" => "\xD0\xA9", "\xD1\x8A" => "\xD0\xAA",
|
|
||||||
"\xD1\x8B" => "\xD0\xAB", "\xD1\x8C" => "\xD0\xAC", "\xD1\x8D" => "\xD0\xAD", "\xD1\x8E" => "\xD0\xAE",
|
|
||||||
"\xD1\x8F" => "\xD0\xAF", "\xD1\x91" => "\xD0\x81", "\xD1\x92" => "\xD0\x82", "\xD1\x93" => "\xD0\x83",
|
|
||||||
"\xD1\x94" => "\xD0\x84", "\xD1\x95" => "\xD0\x85", "\xD1\x96" => "\xD0\x86", "\xD1\x97" => "\xD0\x87",
|
|
||||||
"\xD1\x98" => "\xD0\x88", "\xD1\x99" => "\xD0\x89", "\xD1\x9A" => "\xD0\x8A", "\xD1\x9B" => "\xD0\x8B",
|
|
||||||
"\xD1\x9C" => "\xD0\x8C", "\xD1\x9E" => "\xD0\x8E", "\xD1\x9F" => "\xD0\x8F", "\xD2\x91" => "\xD2\x90",
|
|
||||||
"\xE1\xB8\x83" => "\xE1\xB8\x82", "\xE1\xB8\x8B" => "\xE1\xB8\x8A", "\xE1\xB8\x9F" => "\xE1\xB8\x9E", "\xE1\xB9\x81" => "\xE1\xB9\x80",
|
|
||||||
"\xE1\xB9\x97" => "\xE1\xB9\x96", "\xE1\xB9\xA1" => "\xE1\xB9\xA0", "\xE1\xB9\xAB" => "\xE1\xB9\xAA", "\xE1\xBA\x81" => "\xE1\xBA\x80",
|
|
||||||
"\xE1\xBA\x83" => "\xE1\xBA\x82", "\xE1\xBA\x85" => "\xE1\xBA\x84", "\xE1\xBB\xB3" => "\xE1\xBB\xB2"
|
|
||||||
);
|
|
||||||
|
|
||||||
return strtr(strtoupper($string), $utf8_lower_to_upper);
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* UTF-8 aware alternative to substr
|
|
||||||
* Return part of a string given character offset (and optionally length)
|
|
||||||
*
|
|
||||||
* Note arguments: comparied to substr - if offset or length are
|
|
||||||
* not integers, this version will not complain but rather massages them
|
|
||||||
* into an integer.
|
|
||||||
*
|
|
||||||
* Note on returned values: substr documentation states false can be
|
|
||||||
* returned in some cases (e.g. offset > string length)
|
|
||||||
* mb_substr never returns false, it will return an empty string instead.
|
|
||||||
* This adopts the mb_substr approach
|
|
||||||
*
|
|
||||||
* Note on implementation: PCRE only supports repetitions of less than
|
|
||||||
* 65536, in order to accept up to MAXINT values for offset and length,
|
|
||||||
* we'll repeat a group of 65535 characters when needed.
|
|
||||||
*
|
|
||||||
* Note on implementation: calculating the number of characters in the
|
|
||||||
* string is a relatively expensive operation, so we only carry it out when
|
|
||||||
* necessary. It isn't necessary for +ve offsets and no specified length
|
|
||||||
*
|
|
||||||
* @author Chris Smith<chris@jalakai.co.uk>
|
|
||||||
* @param string $str
|
|
||||||
* @param integer $offset number of UTF-8 characters offset (from left)
|
|
||||||
* @param integer $length (optional) length in UTF-8 characters from offset
|
|
||||||
* @return mixed string or FALSE if failure
|
|
||||||
*/
|
|
||||||
function utf8_substr($str, $offset, $length = NULL)
|
|
||||||
{
|
|
||||||
// generates E_NOTICE
|
|
||||||
// for PHP4 objects, but not PHP5 objects
|
|
||||||
$str = (string) $str;
|
|
||||||
$offset = (int) $offset;
|
|
||||||
if (!is_null($length))
|
|
||||||
{
|
|
||||||
$length = (int) $length;
|
|
||||||
}
|
|
||||||
|
|
||||||
// handle trivial cases
|
|
||||||
if ($length === 0 || ($offset < 0 && $length < 0 && $length < $offset))
|
|
||||||
{
|
|
||||||
return '';
|
|
||||||
}
|
|
||||||
|
|
||||||
// normalise negative offsets (we could use a tail
|
|
||||||
// anchored pattern, but they are horribly slow!)
|
|
||||||
if ($offset < 0)
|
|
||||||
{
|
|
||||||
// see notes
|
|
||||||
$strlen = utf8_strlen($str);
|
|
||||||
$offset = $strlen + $offset;
|
|
||||||
if ($offset < 0)
|
|
||||||
{
|
|
||||||
$offset = 0;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
$op = '';
|
|
||||||
$lp = '';
|
|
||||||
|
|
||||||
// establish a pattern for offset, a
|
|
||||||
// non-captured group equal in length to offset
|
|
||||||
if ($offset > 0)
|
|
||||||
{
|
|
||||||
$ox = (int) ($offset / 65535);
|
|
||||||
$oy = $offset % 65535;
|
|
||||||
|
|
||||||
if ($ox)
|
|
||||||
{
|
|
||||||
$op = '(?:.{65535}){' . $ox . '}';
|
|
||||||
}
|
|
||||||
|
|
||||||
$op = '^(?:' . $op . '.{' . $oy . '})';
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
// offset == 0; just anchor the pattern
|
|
||||||
$op = '^';
|
|
||||||
}
|
|
||||||
|
|
||||||
// establish a pattern for length
|
|
||||||
if (is_null($length))
|
|
||||||
{
|
|
||||||
// the rest of the string
|
|
||||||
$lp = '(.*)$';
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
if (!isset($strlen))
|
|
||||||
{
|
|
||||||
// see notes
|
|
||||||
$strlen = utf8_strlen($str);
|
|
||||||
}
|
|
||||||
|
|
||||||
// another trivial case
|
|
||||||
if ($offset > $strlen)
|
|
||||||
{
|
|
||||||
return '';
|
|
||||||
}
|
|
||||||
|
|
||||||
if ($length > 0)
|
|
||||||
{
|
|
||||||
// reduce any length that would
|
|
||||||
// go passed the end of the string
|
|
||||||
$length = min($strlen - $offset, $length);
|
|
||||||
|
|
||||||
$lx = (int) ($length / 65535);
|
|
||||||
$ly = $length % 65535;
|
|
||||||
|
|
||||||
// negative length requires a captured group
|
|
||||||
// of length characters
|
|
||||||
if ($lx)
|
|
||||||
{
|
|
||||||
$lp = '(?:.{65535}){' . $lx . '}';
|
|
||||||
}
|
|
||||||
$lp = '(' . $lp . '.{'. $ly . '})';
|
|
||||||
}
|
|
||||||
else if ($length < 0)
|
|
||||||
{
|
|
||||||
if ($length < ($offset - $strlen))
|
|
||||||
{
|
|
||||||
return '';
|
|
||||||
}
|
|
||||||
|
|
||||||
$lx = (int) ((-$length) / 65535);
|
|
||||||
$ly = (-$length) % 65535;
|
|
||||||
|
|
||||||
// negative length requires ... capture everything
|
|
||||||
// except a group of -length characters
|
|
||||||
// anchored at the tail-end of the string
|
|
||||||
if ($lx)
|
|
||||||
{
|
|
||||||
$lp = '(?:.{65535}){' . $lx . '}';
|
|
||||||
}
|
|
||||||
$lp = '(.*)(?:' . $lp . '.{' . $ly . '})$';
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!preg_match('#' . $op . $lp . '#us', $str, $match))
|
|
||||||
{
|
|
||||||
return '';
|
|
||||||
}
|
|
||||||
|
|
||||||
return $match[1];
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Return the length (in characters) of a UTF-8 string
|
|
||||||
*
|
|
||||||
* @param string $text UTF-8 string
|
|
||||||
* @return integer Length (in chars) of given string
|
|
||||||
*/
|
|
||||||
function utf8_strlen($text)
|
|
||||||
{
|
|
||||||
// Since utf8_decode is replacing multibyte characters to ? strlen works fine
|
|
||||||
return strlen(utf8_decode($text));
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -867,7 +415,6 @@ function utf8_recode($string, $encoding)
|
||||||
|
|
||||||
// Trigger an error?! Fow now just give bad data :-(
|
// Trigger an error?! Fow now just give bad data :-(
|
||||||
trigger_error('Unknown encoding: ' . $encoding, E_USER_ERROR);
|
trigger_error('Unknown encoding: ' . $encoding, E_USER_ERROR);
|
||||||
//return $string; // use utf_normalizer::cleanup() ?
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -1611,14 +1158,8 @@ function utf8_case_fold_nfkc($text, $option = 'full')
|
||||||
// do the case fold
|
// do the case fold
|
||||||
$text = utf8_case_fold($text, $option);
|
$text = utf8_case_fold($text, $option);
|
||||||
|
|
||||||
if (!class_exists('utf_normalizer'))
|
|
||||||
{
|
|
||||||
global $phpbb_root_path, $phpEx;
|
|
||||||
include($phpbb_root_path . 'includes/utf/utf_normalizer.' . $phpEx);
|
|
||||||
}
|
|
||||||
|
|
||||||
// convert to NFKC
|
// convert to NFKC
|
||||||
utf_normalizer::nfkc($text);
|
Normalizer::normalize($text, Normalizer::NFKC);
|
||||||
|
|
||||||
// FC_NFKC_Closure, http://www.unicode.org/Public/5.0.0/ucd/DerivedNormalizationProps.txt
|
// FC_NFKC_Closure, http://www.unicode.org/Public/5.0.0/ucd/DerivedNormalizationProps.txt
|
||||||
$text = strtr($text, $fc_nfkc_closure);
|
$text = strtr($text, $fc_nfkc_closure);
|
||||||
|
@ -1714,106 +1255,56 @@ function utf8_case_fold_nfc($text, $option = 'full')
|
||||||
return $text;
|
return $text;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (extension_loaded('intl'))
|
/**
|
||||||
|
* wrapper around PHP's native normalizer from intl
|
||||||
|
* previously a PECL extension, included in the core since PHP 5.3.0
|
||||||
|
* http://php.net/manual/en/normalizer.normalize.php
|
||||||
|
*
|
||||||
|
* @param mixed $strings a string or an array of strings to normalize
|
||||||
|
* @return mixed the normalized content, preserving array keys if array given.
|
||||||
|
*/
|
||||||
|
function utf8_normalize_nfc($strings)
|
||||||
{
|
{
|
||||||
/**
|
if (empty($strings))
|
||||||
* wrapper around PHP's native normalizer from intl
|
|
||||||
* previously a PECL extension, included in the core since PHP 5.3.0
|
|
||||||
* http://php.net/manual/en/normalizer.normalize.php
|
|
||||||
*
|
|
||||||
* @param mixed $strings a string or an array of strings to normalize
|
|
||||||
* @return mixed the normalized content, preserving array keys if array given.
|
|
||||||
*/
|
|
||||||
function utf8_normalize_nfc($strings)
|
|
||||||
{
|
{
|
||||||
if (empty($strings))
|
return $strings;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!is_array($strings))
|
||||||
|
{
|
||||||
|
if (Normalizer::isNormalized($strings))
|
||||||
{
|
{
|
||||||
return $strings;
|
return $strings;
|
||||||
}
|
}
|
||||||
|
return (string) Normalizer::normalize($strings);
|
||||||
if (!is_array($strings))
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
foreach ($strings as $key => $string)
|
||||||
{
|
{
|
||||||
if (Normalizer::isNormalized($strings))
|
if (is_array($string))
|
||||||
{
|
{
|
||||||
return $strings;
|
foreach ($string as $_key => $_string)
|
||||||
}
|
|
||||||
return (string) Normalizer::normalize($strings);
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
foreach ($strings as $key => $string)
|
|
||||||
{
|
|
||||||
if (is_array($string))
|
|
||||||
{
|
{
|
||||||
foreach ($string as $_key => $_string)
|
if (Normalizer::isNormalized($strings[$key][$_key]))
|
||||||
{
|
|
||||||
if (Normalizer::isNormalized($strings[$key][$_key]))
|
|
||||||
{
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
$strings[$key][$_key] = (string) Normalizer::normalize($strings[$key][$_key]);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
if (Normalizer::isNormalized($strings[$key]))
|
|
||||||
{
|
{
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
$strings[$key] = (string) Normalizer::normalize($strings[$key]);
|
$strings[$key][$_key] = (string) Normalizer::normalize($strings[$key][$_key]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
else
|
||||||
|
|
||||||
return $strings;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
/**
|
|
||||||
* A wrapper function for the normalizer which takes care of including the class if
|
|
||||||
* required and modifies the passed strings to be in NFC (Normalization Form Composition).
|
|
||||||
*
|
|
||||||
* @param mixed $strings a string or an array of strings to normalize
|
|
||||||
* @return mixed the normalized content, preserving array keys if array given.
|
|
||||||
*/
|
|
||||||
function utf8_normalize_nfc($strings)
|
|
||||||
{
|
|
||||||
if (empty($strings))
|
|
||||||
{
|
|
||||||
return $strings;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!class_exists('utf_normalizer'))
|
|
||||||
{
|
|
||||||
global $phpbb_root_path, $phpEx;
|
|
||||||
include($phpbb_root_path . 'includes/utf/utf_normalizer.' . $phpEx);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!is_array($strings))
|
|
||||||
{
|
|
||||||
utf_normalizer::nfc($strings);
|
|
||||||
}
|
|
||||||
else if (is_array($strings))
|
|
||||||
{
|
|
||||||
foreach ($strings as $key => $string)
|
|
||||||
{
|
{
|
||||||
if (is_array($string))
|
if (Normalizer::isNormalized($strings[$key]))
|
||||||
{
|
{
|
||||||
foreach ($string as $_key => $_string)
|
continue;
|
||||||
{
|
|
||||||
utf_normalizer::nfc($strings[$key][$_key]);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
utf_normalizer::nfc($strings[$key]);
|
|
||||||
}
|
}
|
||||||
|
$strings[$key] = (string) Normalizer::normalize($strings[$key]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return $strings;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
return $strings;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -1959,50 +1450,3 @@ function utf8_basename($filename)
|
||||||
|
|
||||||
return $filename;
|
return $filename;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* UTF8-safe str_replace() function
|
|
||||||
*
|
|
||||||
* @param string $search The value to search for
|
|
||||||
* @param string $replace The replacement string
|
|
||||||
* @param string $subject The target string
|
|
||||||
* @return string The resultant string
|
|
||||||
*/
|
|
||||||
function utf8_str_replace($search, $replace, $subject)
|
|
||||||
{
|
|
||||||
if (!is_array($search))
|
|
||||||
{
|
|
||||||
$search = array($search);
|
|
||||||
if (is_array($replace))
|
|
||||||
{
|
|
||||||
$replace = (string) $replace;
|
|
||||||
trigger_error('Array to string conversion', E_USER_NOTICE);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
$length = sizeof($search);
|
|
||||||
|
|
||||||
if (!is_array($replace))
|
|
||||||
{
|
|
||||||
$replace = array_fill(0, $length, $replace);
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
$replace = array_pad($replace, $length, '');
|
|
||||||
}
|
|
||||||
|
|
||||||
for ($i = 0; $i < $length; $i++)
|
|
||||||
{
|
|
||||||
$search_length = utf8_strlen($search[$i]);
|
|
||||||
$replace_length = utf8_strlen($replace[$i]);
|
|
||||||
|
|
||||||
$offset = 0;
|
|
||||||
while (($start = utf8_strpos($subject, $search[$i], $offset)) !== false)
|
|
||||||
{
|
|
||||||
$subject = utf8_substr($subject, 0, $start) . $replace[$i] . utf8_substr($subject, $start + $search_length);
|
|
||||||
$offset = $start + $replace_length;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return $subject;
|
|
||||||
}
|
|
||||||
|
|
|
@ -633,14 +633,8 @@ function utf8_new_case_fold_nfkc($text, $option = 'full')
|
||||||
// do the case fold
|
// do the case fold
|
||||||
$text = utf8_new_case_fold($text, $option);
|
$text = utf8_new_case_fold($text, $option);
|
||||||
|
|
||||||
if (!class_exists('utf_normalizer'))
|
|
||||||
{
|
|
||||||
global $phpbb_root_path, $phpEx;
|
|
||||||
include($phpbb_root_path . 'includes/utf/utf_normalizer.' . $phpEx);
|
|
||||||
}
|
|
||||||
|
|
||||||
// convert to NFKC
|
// convert to NFKC
|
||||||
utf_new_normalizer::nfkc($text);
|
$text = Normalizer::normalize($text, Normalizer::NFKC);
|
||||||
|
|
||||||
// FC_NFKC_Closure, http://www.unicode.org/Public/5.0.0/ucd/DerivedNormalizationProps.txt
|
// FC_NFKC_Closure, http://www.unicode.org/Public/5.0.0/ucd/DerivedNormalizationProps.txt
|
||||||
$text = strtr($text, $fc_nfkc_closure);
|
$text = strtr($text, $fc_nfkc_closure);
|
||||||
|
|
|
@ -1,197 +0,0 @@
|
||||||
<?php
|
|
||||||
/**
|
|
||||||
*
|
|
||||||
* This file is part of the phpBB Forum Software package.
|
|
||||||
*
|
|
||||||
* @copyright (c) phpBB Limited <https://www.phpbb.com>
|
|
||||||
* @license GNU General Public License, version 2 (GPL-2.0)
|
|
||||||
*
|
|
||||||
* For full copyright and license information, please see
|
|
||||||
* the docs/CREDITS.txt file.
|
|
||||||
*
|
|
||||||
*/
|
|
||||||
|
|
||||||
/**
|
|
||||||
* @ignore
|
|
||||||
*/
|
|
||||||
if (!defined('IN_PHPBB'))
|
|
||||||
{
|
|
||||||
exit;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* A wrapper function for the normalizer which takes care of including the class if required and modifies the passed strings
|
|
||||||
* to be in NFC (Normalization Form Composition).
|
|
||||||
*
|
|
||||||
* @param mixed $strings a string or an array of strings to normalize
|
|
||||||
* @return mixed the normalized content, preserving array keys if array given.
|
|
||||||
*/
|
|
||||||
function utf8_new_normalize_nfc($strings)
|
|
||||||
{
|
|
||||||
if (empty($strings))
|
|
||||||
{
|
|
||||||
return $strings;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!is_array($strings))
|
|
||||||
{
|
|
||||||
utf_new_normalizer::nfc($strings);
|
|
||||||
}
|
|
||||||
else if (is_array($strings))
|
|
||||||
{
|
|
||||||
foreach ($strings as $key => $string)
|
|
||||||
{
|
|
||||||
if (is_array($string))
|
|
||||||
{
|
|
||||||
foreach ($string as $_key => $_string)
|
|
||||||
{
|
|
||||||
utf_new_normalizer::nfc($strings[$key][$_key]);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
utf_new_normalizer::nfc($strings[$key]);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return $strings;
|
|
||||||
}
|
|
||||||
|
|
||||||
class utf_new_normalizer
|
|
||||||
{
|
|
||||||
/**
|
|
||||||
* Validate, cleanup and normalize a string
|
|
||||||
*
|
|
||||||
* The ultimate convenience function! Clean up invalid UTF-8 sequences,
|
|
||||||
* and convert to Normal Form C, canonical composition.
|
|
||||||
*
|
|
||||||
* @param string &$str The dirty string
|
|
||||||
* @return null The string is passed by reference and cleaned up in place
|
|
||||||
*/
|
|
||||||
function cleanup(&$str)
|
|
||||||
{
|
|
||||||
// The string below is the list of all authorized characters, sorted by frequency in Latin text
|
|
||||||
$pos = strspn($str, "\x20\x65\x69\x61\x73\x6E\x74\x72\x6F\x6C\x75\x64\x5D\x5B\x63\x6D\x70\x27\x0A\x67\x7C\x68\x76\x2E\x66\x62\x2C\x3A\x3D\x2D\x71\x31\x30\x43\x32\x2A\x79\x78\x29\x28\x4C\x39\x41\x53\x2F\x50\x22\x45\x6A\x4D\x49\x6B\x33\x3E\x35\x54\x3C\x44\x34\x7D\x42\x7B\x38\x46\x77\x52\x36\x37\x55\x47\x4E\x3B\x4A\x7A\x56\x23\x48\x4F\x57\x5F\x26\x21\x4B\x3F\x58\x51\x25\x59\x5C\x09\x5A\x2B\x7E\x5E\x24\x40\x60\x7F\x0D");
|
|
||||||
$len = strlen($str);
|
|
||||||
|
|
||||||
if ($pos == $len)
|
|
||||||
{
|
|
||||||
// ASCII strings with no special chars return immediately
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Note: we do not check for $GLOBALS['utf_canonical_decomp']. It is assumed they are always loaded together
|
|
||||||
if (!isset($GLOBALS['utf_nfc_qc']))
|
|
||||||
{
|
|
||||||
global $phpbb_root_path, $phpEx;
|
|
||||||
include($phpbb_root_path . 'includes/utf/data/utf_nfc_qc.' . $phpEx);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!isset($GLOBALS['utf_canonical_decomp']))
|
|
||||||
{
|
|
||||||
global $phpbb_root_path, $phpEx;
|
|
||||||
include($phpbb_root_path . 'includes/utf/data/utf_canonical_decomp.' . $phpEx);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Replace any byte in the range 0x00..0x1F, except for \r, \n and \t
|
|
||||||
// We replace those characters with a 0xFF byte, which is illegal in UTF-8 and will in turn be replaced with a UTF replacement char
|
|
||||||
$str = strtr(
|
|
||||||
$str,
|
|
||||||
"\x00\x01\x02\x03\x04\x05\x06\x07\x08\x0B\x0C\x0E\x0F\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1A\x1B\x1C\x1D\x1E\x1F",
|
|
||||||
"\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF"
|
|
||||||
);
|
|
||||||
|
|
||||||
$str = utf_new_normalizer::recompose($str, $pos, $len, $GLOBALS['utf_nfc_qc'], $GLOBALS['utf_canonical_decomp']);
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Validate and normalize a UTF string to NFC
|
|
||||||
*
|
|
||||||
* @param string &$str Unchecked UTF string
|
|
||||||
* @return null The string is passed by reference and normalized in place
|
|
||||||
*/
|
|
||||||
function nfc(&$str)
|
|
||||||
{
|
|
||||||
$pos = strspn($str, UTF8_ASCII_RANGE);
|
|
||||||
$len = strlen($str);
|
|
||||||
|
|
||||||
if ($pos == $len)
|
|
||||||
{
|
|
||||||
// ASCII strings return immediately
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!isset($GLOBALS['utf_nfc_qc']))
|
|
||||||
{
|
|
||||||
global $phpbb_root_path, $phpEx;
|
|
||||||
include($phpbb_root_path . 'includes/utf/data/utf_nfc_qc.' . $phpEx);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!isset($GLOBALS['utf_canonical_decomp']))
|
|
||||||
{
|
|
||||||
global $phpbb_root_path, $phpEx;
|
|
||||||
include($phpbb_root_path . 'includes/utf/data/utf_canonical_decomp.' . $phpEx);
|
|
||||||
}
|
|
||||||
|
|
||||||
$str = utf_new_normalizer::recompose($str, $pos, $len, $GLOBALS['utf_nfc_qc'], $GLOBALS['utf_canonical_decomp']);
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Validate and normalize a UTF string to NFKC
|
|
||||||
*
|
|
||||||
* @param string &$str Unchecked UTF string
|
|
||||||
* @return null The string is passed by reference and normalized in place
|
|
||||||
*/
|
|
||||||
function nfkc(&$str)
|
|
||||||
{
|
|
||||||
$pos = strspn($str, UTF8_ASCII_RANGE);
|
|
||||||
$len = strlen($str);
|
|
||||||
|
|
||||||
if ($pos == $len)
|
|
||||||
{
|
|
||||||
// ASCII strings return immediately
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!isset($GLOBALS['utf_nfkc_qc']))
|
|
||||||
{
|
|
||||||
global $phpbb_root_path, $phpEx;
|
|
||||||
include($phpbb_root_path . 'includes/utf/data/utf_nfkc_qc.' . $phpEx);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!isset($GLOBALS['utf_compatibility_decomp']))
|
|
||||||
{
|
|
||||||
global $phpbb_root_path, $phpEx;
|
|
||||||
include($phpbb_root_path . 'includes/utf/data/utf_compatibility_decomp.' . $phpEx);
|
|
||||||
}
|
|
||||||
|
|
||||||
$str = utf_new_normalizer::recompose($str, $pos, $len, $GLOBALS['utf_nfkc_qc'], $GLOBALS['utf_compatibility_decomp']);
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Recompose a UTF string
|
|
||||||
*
|
|
||||||
* @param string $str Unchecked UTF string
|
|
||||||
* @param integer $pos Position of the first UTF char (in bytes)
|
|
||||||
* @param integer $len Length of the string (in bytes)
|
|
||||||
* @param array &$qc Quick-check array, passed by reference but never modified
|
|
||||||
* @param array &$decomp_map Decomposition mapping, passed by reference but never modified
|
|
||||||
* @return string The string, validated and recomposed
|
|
||||||
*
|
|
||||||
* @access private
|
|
||||||
*/
|
|
||||||
function recompose($str, $pos, $len, &$qc, &$decomp_map)
|
|
||||||
{
|
|
||||||
global $utf_canonical_comp;
|
|
||||||
|
|
||||||
// Load the canonical composition table
|
|
||||||
if (!isset($utf_canonical_comp))
|
|
||||||
{
|
|
||||||
global $phpbb_root_path, $phpEx;
|
|
||||||
include($phpbb_root_path . 'includes/utf/data/utf_canonical_comp.' . $phpEx);
|
|
||||||
}
|
|
||||||
|
|
||||||
return utf_normalizer::recompose($str, $pos, $len, $qc, $decomp_map);
|
|
||||||
}
|
|
||||||
}
|
|
|
@ -74,7 +74,6 @@ require($phpbb_root_path . 'includes/functions.' . $phpEx);
|
||||||
require($phpbb_root_path . 'includes/functions_content.' . $phpEx);
|
require($phpbb_root_path . 'includes/functions_content.' . $phpEx);
|
||||||
|
|
||||||
require($phpbb_root_path . 'includes/constants.' . $phpEx);
|
require($phpbb_root_path . 'includes/constants.' . $phpEx);
|
||||||
include($phpbb_root_path . 'includes/utf/utf_normalizer.' . $phpEx);
|
|
||||||
require($phpbb_root_path . 'includes/utf/utf_tools.' . $phpEx);
|
require($phpbb_root_path . 'includes/utf/utf_tools.' . $phpEx);
|
||||||
|
|
||||||
// Set PHP error handler to ours
|
// Set PHP error handler to ours
|
||||||
|
|
|
@ -102,7 +102,6 @@ phpbb_require_updated('includes/functions.' . $phpEx);
|
||||||
phpbb_require_updated('includes/functions_content.' . $phpEx, true);
|
phpbb_require_updated('includes/functions_content.' . $phpEx, true);
|
||||||
|
|
||||||
phpbb_include_updated('includes/functions_admin.' . $phpEx);
|
phpbb_include_updated('includes/functions_admin.' . $phpEx);
|
||||||
phpbb_include_updated('includes/utf/utf_normalizer.' . $phpEx);
|
|
||||||
phpbb_include_updated('includes/utf/utf_tools.' . $phpEx);
|
phpbb_include_updated('includes/utf/utf_tools.' . $phpEx);
|
||||||
phpbb_require_updated('includes/functions_install.' . $phpEx);
|
phpbb_require_updated('includes/functions_install.' . $phpEx);
|
||||||
|
|
||||||
|
|
|
@ -363,8 +363,8 @@ abstract class driver implements driver_interface
|
||||||
*/
|
*/
|
||||||
function sql_like_expression($expression)
|
function sql_like_expression($expression)
|
||||||
{
|
{
|
||||||
$expression = utf8_str_replace(array('_', '%'), array("\_", "\%"), $expression);
|
$expression = str_replace(array('_', '%'), array("\_", "\%"), $expression);
|
||||||
$expression = utf8_str_replace(array(chr(0) . "\_", chr(0) . "\%"), array('_', '%'), $expression);
|
$expression = str_replace(array(chr(0) . "\_", chr(0) . "\%"), array('_', '%'), $expression);
|
||||||
|
|
||||||
return $this->_sql_like_expression('LIKE \'' . $this->sql_escape($expression) . '\'');
|
return $this->_sql_like_expression('LIKE \'' . $this->sql_escape($expression) . '\'');
|
||||||
}
|
}
|
||||||
|
@ -374,8 +374,8 @@ abstract class driver implements driver_interface
|
||||||
*/
|
*/
|
||||||
function sql_not_like_expression($expression)
|
function sql_not_like_expression($expression)
|
||||||
{
|
{
|
||||||
$expression = utf8_str_replace(array('_', '%'), array("\_", "\%"), $expression);
|
$expression = str_replace(array('_', '%'), array("\_", "\%"), $expression);
|
||||||
$expression = utf8_str_replace(array(chr(0) . "\_", chr(0) . "\%"), array('_', '%'), $expression);
|
$expression = str_replace(array(chr(0) . "\_", chr(0) . "\%"), array('_', '%'), $expression);
|
||||||
|
|
||||||
return $this->_sql_not_like_expression('NOT LIKE \'' . $this->sql_escape($expression) . '\'');
|
return $this->_sql_not_like_expression('NOT LIKE \'' . $this->sql_escape($expression) . '\'');
|
||||||
}
|
}
|
||||||
|
|
|
@ -18,6 +18,13 @@ namespace phpbb\search;
|
||||||
*/
|
*/
|
||||||
class fulltext_native extends \phpbb\search\base
|
class fulltext_native extends \phpbb\search\base
|
||||||
{
|
{
|
||||||
|
const UTF8_HANGUL_FIRST = "\xEA\xB0\x80";
|
||||||
|
const UTF8_HANGUL_LAST = "\xED\x9E\xA3";
|
||||||
|
const UTF8_CJK_FIRST = "\xE4\xB8\x80";
|
||||||
|
const UTF8_CJK_LAST = "\xE9\xBE\xBB";
|
||||||
|
const UTF8_CJK_B_FIRST = "\xF0\xA0\x80\x80";
|
||||||
|
const UTF8_CJK_B_LAST = "\xF0\xAA\x9B\x96";
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Associative array holding index stats
|
* Associative array holding index stats
|
||||||
* @var array
|
* @var array
|
||||||
|
@ -93,7 +100,7 @@ class fulltext_native extends \phpbb\search\base
|
||||||
protected $user;
|
protected $user;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Initialises the fulltext_native search backend with min/max word length and makes sure the UTF-8 normalizer is loaded
|
* Initialises the fulltext_native search backend with min/max word length
|
||||||
*
|
*
|
||||||
* @param boolean|string &$error is passed by reference and should either be set to false on success or an error message on failure
|
* @param boolean|string &$error is passed by reference and should either be set to false on success or an error message on failure
|
||||||
*/
|
*/
|
||||||
|
@ -110,10 +117,6 @@ class fulltext_native extends \phpbb\search\base
|
||||||
/**
|
/**
|
||||||
* Load the UTF tools
|
* Load the UTF tools
|
||||||
*/
|
*/
|
||||||
if (!class_exists('utf_normalizer'))
|
|
||||||
{
|
|
||||||
include($this->phpbb_root_path . 'includes/utf/utf_normalizer.' . $this->php_ext);
|
|
||||||
}
|
|
||||||
if (!function_exists('utf8_decode_ncr'))
|
if (!function_exists('utf8_decode_ncr'))
|
||||||
{
|
{
|
||||||
include($this->phpbb_root_path . 'includes/utf/utf_tools.' . $this->php_ext);
|
include($this->phpbb_root_path . 'includes/utf/utf_tools.' . $this->php_ext);
|
||||||
|
@ -1175,9 +1178,9 @@ class fulltext_native extends \phpbb\search\base
|
||||||
* Note: this could be optimized. If the codepoint is lower than Hangul's range
|
* Note: this could be optimized. If the codepoint is lower than Hangul's range
|
||||||
* we know that it will also be lower than CJK ranges
|
* we know that it will also be lower than CJK ranges
|
||||||
*/
|
*/
|
||||||
if ((strncmp($word, UTF8_HANGUL_FIRST, 3) < 0 || strncmp($word, UTF8_HANGUL_LAST, 3) > 0)
|
if ((strncmp($word, self::UTF8_HANGUL_FIRST, 3) < 0 || strncmp($word, self::UTF8_HANGUL_LAST, 3) > 0)
|
||||||
&& (strncmp($word, UTF8_CJK_FIRST, 3) < 0 || strncmp($word, UTF8_CJK_LAST, 3) > 0)
|
&& (strncmp($word, self::UTF8_CJK_FIRST, 3) < 0 || strncmp($word, self::UTF8_CJK_LAST, 3) > 0)
|
||||||
&& (strncmp($word, UTF8_CJK_B_FIRST, 4) < 0 || strncmp($word, UTF8_CJK_B_LAST, 4) > 0))
|
&& (strncmp($word, self::UTF8_CJK_B_FIRST, 4) < 0 || strncmp($word, self::UTF8_CJK_B_LAST, 4) > 0))
|
||||||
{
|
{
|
||||||
$word = strtok(' ');
|
$word = strtok(' ');
|
||||||
continue;
|
continue;
|
||||||
|
@ -1544,8 +1547,6 @@ class fulltext_native extends \phpbb\search\base
|
||||||
* @param string $allowed_chars String of special chars to allow
|
* @param string $allowed_chars String of special chars to allow
|
||||||
* @param string $encoding Text encoding
|
* @param string $encoding Text encoding
|
||||||
* @return string Cleaned up text, only alphanumeric chars are left
|
* @return string Cleaned up text, only alphanumeric chars are left
|
||||||
*
|
|
||||||
* @todo \normalizer::cleanup being able to be used?
|
|
||||||
*/
|
*/
|
||||||
protected function cleanup($text, $allowed_chars = null, $encoding = 'utf-8')
|
protected function cleanup($text, $allowed_chars = null, $encoding = 'utf-8')
|
||||||
{
|
{
|
||||||
|
@ -1572,12 +1573,9 @@ class fulltext_native extends \phpbb\search\base
|
||||||
$text = htmlspecialchars_decode(utf8_decode_ncr($text), ENT_QUOTES);
|
$text = htmlspecialchars_decode(utf8_decode_ncr($text), ENT_QUOTES);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Load the UTF-8 normalizer
|
* Normalize to NFC
|
||||||
*
|
|
||||||
* If we use it more widely, an instance of that class should be held in a
|
|
||||||
* global variable instead
|
|
||||||
*/
|
*/
|
||||||
\utf_normalizer::nfc($text);
|
$text = \Normalizer::normalize($text);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* The first thing we do is:
|
* The first thing we do is:
|
||||||
|
@ -1670,9 +1668,9 @@ class fulltext_native extends \phpbb\search\base
|
||||||
$utf_char = substr($text, $pos, $utf_len);
|
$utf_char = substr($text, $pos, $utf_len);
|
||||||
$pos += $utf_len;
|
$pos += $utf_len;
|
||||||
|
|
||||||
if (($utf_char >= UTF8_HANGUL_FIRST && $utf_char <= UTF8_HANGUL_LAST)
|
if (($utf_char >= self::UTF8_HANGUL_FIRST && $utf_char <= self::UTF8_HANGUL_LAST)
|
||||||
|| ($utf_char >= UTF8_CJK_FIRST && $utf_char <= UTF8_CJK_LAST)
|
|| ($utf_char >= self::UTF8_CJK_FIRST && $utf_char <= self::UTF8_CJK_LAST)
|
||||||
|| ($utf_char >= UTF8_CJK_B_FIRST && $utf_char <= UTF8_CJK_B_LAST))
|
|| ($utf_char >= self::UTF8_CJK_B_FIRST && $utf_char <= self::UTF8_CJK_B_LAST))
|
||||||
{
|
{
|
||||||
/**
|
/**
|
||||||
* All characters within these ranges are valid
|
* All characters within these ranges are valid
|
||||||
|
|
|
@ -120,8 +120,9 @@ directory (above phpBB):
|
||||||
Slow tests
|
Slow tests
|
||||||
--------------
|
--------------
|
||||||
|
|
||||||
Certain tests, such as the UTF-8 normalizer or the DNS tests tend to be slow.
|
Certain tests, such as the DNS tests, tend to be slow.
|
||||||
Thus these tests are in the `slow` group, which is excluded by default. If you
|
Thus these tests are in the `slow` group, which is excluded by default. You can
|
||||||
|
enable slow tests by copying the phpunit.xml.all file to phpunit.xml. If you
|
||||||
only want the slow tests, run:
|
only want the slow tests, run:
|
||||||
|
|
||||||
$ phpBB/vendor/bin/phpunit --group slow
|
$ phpBB/vendor/bin/phpunit --group slow
|
||||||
|
|
|
@ -1,327 +0,0 @@
|
||||||
<?php
|
|
||||||
/**
|
|
||||||
*
|
|
||||||
* This file is part of the phpBB Forum Software package.
|
|
||||||
*
|
|
||||||
* @copyright (c) phpBB Limited <https://www.phpbb.com>
|
|
||||||
* @license GNU General Public License, version 2 (GPL-2.0)
|
|
||||||
*
|
|
||||||
* For full copyright and license information, please see
|
|
||||||
* the docs/CREDITS.txt file.
|
|
||||||
*
|
|
||||||
*/
|
|
||||||
|
|
||||||
require_once dirname(__FILE__) . '/../../phpBB/includes/utf/utf_normalizer.php';
|
|
||||||
|
|
||||||
/**
|
|
||||||
* @group slow
|
|
||||||
*/
|
|
||||||
class phpbb_utf_normalizer_test extends phpbb_test_case
|
|
||||||
{
|
|
||||||
static private $data_dir;
|
|
||||||
|
|
||||||
static public function setUpBeforeClass()
|
|
||||||
{
|
|
||||||
self::$data_dir = dirname(__file__) . '/../tmp';
|
|
||||||
self::download('http://www.unicode.org/Public/UNIDATA/NormalizationTest.txt', self::$data_dir);
|
|
||||||
self::download('http://www.unicode.org/Public/UNIDATA/UnicodeData.txt', self::$data_dir);
|
|
||||||
}
|
|
||||||
|
|
||||||
public function test_normalizer()
|
|
||||||
{
|
|
||||||
$test_suite = array(
|
|
||||||
/**
|
|
||||||
* NFC
|
|
||||||
* c2 == NFC(c1) == NFC(c2) == NFC(c3)
|
|
||||||
* c4 == NFC(c4) == NFC(c5)
|
|
||||||
*/
|
|
||||||
'NFC' => array(
|
|
||||||
'c2' => array('c1', 'c2', 'c3'),
|
|
||||||
'c4' => array('c4', 'c5')
|
|
||||||
),
|
|
||||||
|
|
||||||
/**
|
|
||||||
* NFD
|
|
||||||
* c3 == NFD(c1) == NFD(c2) == NFD(c3)
|
|
||||||
* c5 == NFD(c4) == NFD(c5)
|
|
||||||
*/
|
|
||||||
'NFD' => array(
|
|
||||||
'c3' => array('c1', 'c2', 'c3'),
|
|
||||||
'c5' => array('c4', 'c5')
|
|
||||||
),
|
|
||||||
|
|
||||||
/**
|
|
||||||
* NFKC
|
|
||||||
* c4 == NFKC(c1) == NFKC(c2) == NFKC(c3) == NFKC(c4) == NFKC(c5)
|
|
||||||
*/
|
|
||||||
'NFKC' => array(
|
|
||||||
'c4' => array('c1', 'c2', 'c3', 'c4', 'c5')
|
|
||||||
),
|
|
||||||
|
|
||||||
/**
|
|
||||||
* NFKD
|
|
||||||
* c5 == NFKD(c1) == NFKD(c2) == NFKD(c3) == NFKD(c4) == NFKD(c5)
|
|
||||||
*/
|
|
||||||
'NFKD' => array(
|
|
||||||
'c5' => array('c1', 'c2', 'c3', 'c4', 'c5')
|
|
||||||
)
|
|
||||||
);
|
|
||||||
|
|
||||||
$tested_chars = array();
|
|
||||||
|
|
||||||
$fp = fopen(self::$data_dir . '/NormalizationTest.txt', 'rb');
|
|
||||||
while (!feof($fp))
|
|
||||||
{
|
|
||||||
$line = fgets($fp);
|
|
||||||
|
|
||||||
if ($line[0] == '@')
|
|
||||||
{
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!strpos(' 0123456789ABCDEF', $line[0]))
|
|
||||||
{
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
list($c1, $c2, $c3, $c4, $c5) = explode(';', $line);
|
|
||||||
|
|
||||||
if (!strpos($c1, ' '))
|
|
||||||
{
|
|
||||||
/**
|
|
||||||
* We are currently testing a single character, we add it to the list of
|
|
||||||
* characters we have processed so that we can exclude it when testing
|
|
||||||
* for invariants
|
|
||||||
*/
|
|
||||||
$tested_chars[$c1] = 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
foreach ($test_suite as $form => $serie)
|
|
||||||
{
|
|
||||||
foreach ($serie as $expected => $tests)
|
|
||||||
{
|
|
||||||
$hex_expected = ${$expected};
|
|
||||||
$utf_expected = $this->hexseq_to_utf($hex_expected);
|
|
||||||
|
|
||||||
foreach ($tests as $test)
|
|
||||||
{
|
|
||||||
$utf_result = $utf_expected;
|
|
||||||
call_user_func_array(array('utf_normalizer', $form), array(&$utf_result));
|
|
||||||
|
|
||||||
$hex_result = $this->utf_to_hexseq($utf_result);
|
|
||||||
$this->assertEquals($utf_expected, $utf_result, "$expected == $form($test) ($hex_expected != $hex_result)");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
fclose($fp);
|
|
||||||
|
|
||||||
return $tested_chars;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* @depends test_normalizer
|
|
||||||
*/
|
|
||||||
public function test_invariants(array $tested_chars)
|
|
||||||
{
|
|
||||||
$fp = fopen(self::$data_dir . '/UnicodeData.txt', 'rb');
|
|
||||||
|
|
||||||
while (!feof($fp))
|
|
||||||
{
|
|
||||||
$line = fgets($fp, 1024);
|
|
||||||
|
|
||||||
if (!$pos = strpos($line, ';'))
|
|
||||||
{
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
$hex_tested = $hex_expected = substr($line, 0, $pos);
|
|
||||||
|
|
||||||
if (isset($tested_chars[$hex_tested]))
|
|
||||||
{
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
$utf_expected = $this->hex_to_utf($hex_expected);
|
|
||||||
|
|
||||||
if ($utf_expected >= UTF8_SURROGATE_FIRST
|
|
||||||
&& $utf_expected <= UTF8_SURROGATE_LAST)
|
|
||||||
{
|
|
||||||
/**
|
|
||||||
* Surrogates are illegal on their own, we expect the normalizer
|
|
||||||
* to return a replacement char
|
|
||||||
*/
|
|
||||||
$utf_expected = UTF8_REPLACEMENT;
|
|
||||||
$hex_expected = $this->utf_to_hexseq($utf_expected);
|
|
||||||
}
|
|
||||||
|
|
||||||
foreach (array('nfc', 'nfkc', 'nfd', 'nfkd') as $form)
|
|
||||||
{
|
|
||||||
$utf_result = $utf_expected;
|
|
||||||
call_user_func_array(array('utf_normalizer', $form), array(&$utf_result));
|
|
||||||
$hex_result = $this->utf_to_hexseq($utf_result);
|
|
||||||
|
|
||||||
$this->assertEquals($utf_expected, $utf_result, "$hex_expected == $form($hex_tested) ($hex_expected != $hex_result)");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
fclose($fp);
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Convert a UTF string to a sequence of codepoints in hexadecimal
|
|
||||||
*
|
|
||||||
* @param string $str UTF string
|
|
||||||
* @return string Unicode codepoints in uppercase hex (at least four digits each), separated by spaces
|
|
||||||
*/
|
|
||||||
protected function utf_to_hexseq($str)
|
|
||||||
{
|
|
||||||
$pos = 0;
|
|
||||||
$len = strlen($str);
|
|
||||||
$ret = array();
|
|
||||||
|
|
||||||
while ($pos < $len)
|
|
||||||
{
|
|
||||||
$c = $str[$pos];
|
|
||||||
switch ($c & "\xF0")
|
|
||||||
{
|
|
||||||
case "\xC0":
|
|
||||||
case "\xD0":
|
|
||||||
$utf_char = substr($str, $pos, 2);
|
|
||||||
$pos += 2;
|
|
||||||
break;
|
|
||||||
|
|
||||||
case "\xE0":
|
|
||||||
$utf_char = substr($str, $pos, 3);
|
|
||||||
$pos += 3;
|
|
||||||
break;
|
|
||||||
|
|
||||||
case "\xF0":
|
|
||||||
$utf_char = substr($str, $pos, 4);
|
|
||||||
$pos += 4;
|
|
||||||
break;
|
|
||||||
|
|
||||||
default:
|
|
||||||
$utf_char = $c;
|
|
||||||
++$pos;
|
|
||||||
}
|
|
||||||
|
|
||||||
$hex = dechex($this->utf_to_cp($utf_char));
|
|
||||||
|
|
||||||
if (!isset($hex[3]))
|
|
||||||
{
|
|
||||||
$hex = substr('000' . $hex, -4);
|
|
||||||
}
|
|
||||||
|
|
||||||
$ret[] = $hex;
|
|
||||||
}
|
|
||||||
|
|
||||||
return strtr(implode(' ', $ret), 'abcdef', 'ABCDEF');
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Convert a UTF-8 char to its codepoint
|
|
||||||
*
|
|
||||||
* @param string $utf_char UTF-8 char
|
|
||||||
* @return integer Unicode codepoint
|
|
||||||
*/
|
|
||||||
protected function utf_to_cp($utf_char)
|
|
||||||
{
|
|
||||||
switch (strlen($utf_char))
|
|
||||||
{
|
|
||||||
case 1:
|
|
||||||
return ord($utf_char);
|
|
||||||
|
|
||||||
case 2:
|
|
||||||
return ((ord($utf_char[0]) & 0x1F) << 6) | (ord($utf_char[1]) & 0x3F);
|
|
||||||
|
|
||||||
case 3:
|
|
||||||
return ((ord($utf_char[0]) & 0x0F) << 12) | ((ord($utf_char[1]) & 0x3F) << 6) | (ord($utf_char[2]) & 0x3F);
|
|
||||||
|
|
||||||
case 4:
|
|
||||||
return ((ord($utf_char[0]) & 0x07) << 18) | ((ord($utf_char[1]) & 0x3F) << 12) | ((ord($utf_char[2]) & 0x3F) << 6) | (ord($utf_char[3]) & 0x3F);
|
|
||||||
|
|
||||||
default:
|
|
||||||
throw new RuntimeException('UTF-8 chars can only be 1-4 bytes long');
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Return a UTF string formed from a sequence of codepoints in hexadecimal
|
|
||||||
*
|
|
||||||
* @param string $seq Sequence of codepoints, separated with a space
|
|
||||||
* @return string UTF-8 string
|
|
||||||
*/
|
|
||||||
protected function hexseq_to_utf($seq)
|
|
||||||
{
|
|
||||||
return implode('', array_map(array($this, 'hex_to_utf'), explode(' ', $seq)));
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Convert a codepoint in hexadecimal to a UTF-8 char
|
|
||||||
*
|
|
||||||
* @param string $hex Codepoint, in hexadecimal
|
|
||||||
* @return string UTF-8 char
|
|
||||||
*/
|
|
||||||
protected function hex_to_utf($hex)
|
|
||||||
{
|
|
||||||
return $this->cp_to_utf(hexdec($hex));
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Convert a codepoint to a UTF-8 char
|
|
||||||
*
|
|
||||||
* @param integer $cp Unicode codepoint
|
|
||||||
* @return string UTF-8 string
|
|
||||||
*/
|
|
||||||
protected function cp_to_utf($cp)
|
|
||||||
{
|
|
||||||
if ($cp > 0xFFFF)
|
|
||||||
{
|
|
||||||
return chr(0xF0 | ($cp >> 18)) . chr(0x80 | (($cp >> 12) & 0x3F)) . chr(0x80 | (($cp >> 6) & 0x3F)) . chr(0x80 | ($cp & 0x3F));
|
|
||||||
}
|
|
||||||
else if ($cp > 0x7FF)
|
|
||||||
{
|
|
||||||
return chr(0xE0 | ($cp >> 12)) . chr(0x80 | (($cp >> 6) & 0x3F)) . chr(0x80 | ($cp & 0x3F));
|
|
||||||
}
|
|
||||||
else if ($cp > 0x7F)
|
|
||||||
{
|
|
||||||
return chr(0xC0 | ($cp >> 6)) . chr(0x80 | ($cp & 0x3F));
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
return chr($cp);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// chunked download helper
|
|
||||||
static protected function download($url, $to)
|
|
||||||
{
|
|
||||||
$target = $to . '/' . basename($url);
|
|
||||||
|
|
||||||
if (file_exists($target))
|
|
||||||
{
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!$fpr = fopen($url, 'rb'))
|
|
||||||
{
|
|
||||||
echo "Failed to download $url\n";
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!$fpw = fopen($target, 'wb'))
|
|
||||||
{
|
|
||||||
echo "Failed to open $target for writing\n";
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
$chunk = 32768;
|
|
||||||
|
|
||||||
while (!feof($fpr))
|
|
||||||
{
|
|
||||||
fwrite($fpw, fread($fpr, $chunk));
|
|
||||||
}
|
|
||||||
fclose($fpr);
|
|
||||||
fclose($fpw);
|
|
||||||
}
|
|
||||||
}
|
|
Loading…
Add table
Reference in a new issue