diff --git a/phpBB/includes/utf/data/recode_basic.php b/phpBB/includes/utf/data/recode_basic.php index 8950b0550b..fe29e3ddb9 100644 --- a/phpBB/includes/utf/data/recode_basic.php +++ b/phpBB/includes/utf/data/recode_basic.php @@ -929,6 +929,40 @@ function cp1251($string) return strtr($string, $transform); } +function cp1252($string) +{ + static $transform = array( + "\xC2\x80" => "\xE2\x82\xAC", + "\xC2\x82" => "\xE2\x80\x9A", + "\xC2\x83" => "\xC6\x92", + "\xC2\x84" => "\xE2\x80\x9E", + "\xC2\x85" => "\xE2\x80\xA6", + "\xC2\x86" => "\xE2\x80\xA0", + "\xC2\x87" => "\xE2\x80\xA1", + "\xC2\x88" => "\xCB\x86", + "\xC2\x89" => "\xE2\x80\xB0", + "\xC2\x8A" => "\xC5\xA0", + "\xC2\x8B" => "\xE2\x80\xB9", + "\xC2\x8C" => "\xC5\x92", + "\xC2\x8E" => "\xC5\xBD", + "\xC2\x91" => "\xE2\x80\x98", + "\xC2\x92" => "\xE2\x80\x99", + "\xC2\x93" => "\xE2\x80\x9C", + "\xC2\x94" => "\xE2\x80\x9D", + "\xC2\x95" => "\xE2\x80\xA2", + "\xC2\x96" => "\xE2\x80\x93", + "\xC2\x97" => "\xE2\x80\x94", + "\xC2\x98" => "\xCB\x9C", + "\xC2\x99" => "\xE2\x84\xA2", + "\xC2\x9A" => "\xC5\xA1", + "\xC2\x9B" => "\xE2\x80\xBA", + "\xC2\x9C" => "\xC5\x93", + "\xC2\x9E" => "\xC5\xBE", + "\xC2\x9F" => "\xC5\xB8" + ); + return strtr(utf8_encode($string), $transform); +} + function cp1254($string) { static $tranform = array( diff --git a/phpBB/includes/utf/utf_tools.php b/phpBB/includes/utf/utf_tools.php index f9558c12b4..a0bee0ad74 100644 --- a/phpBB/includes/utf/utf_tools.php +++ b/phpBB/includes/utf/utf_tools.php @@ -6,6 +6,9 @@ * @copyright (c) 2006 phpBB Group * @license http://opensource.org/licenses/gpl-license.php GNU Public License * +* @todo make sure the replacements are called correctly +* already done: strtolower, strtoupper, ucfirst, str_split, strrpos, strlen (hopefully!), strpos, substr, htmlspecialchars +* remaining: strspn, chr, ord */ /** @@ -307,7 +310,7 @@ else { if (!is_int($offset)) { - trigger_error('utf8_strpos: Offset must be an integer', E_USER_ERROR); + 
trigger_error('utf8_strpos: Offset must be an integer', E_USER_ERROR); return false; } @@ -580,7 +583,7 @@ else $ly = (-$length) % 65535; // negative length requires ... capture everything - // except a group of -length characters + // except a group of -length characters // anchored at the tail-end of the string if ($lx) { @@ -700,15 +703,15 @@ function utf8_recode($string, $encoding) { $encoding = strtolower($encoding); - if ($encoding == 'utf-8' || !is_string($string) || !isset($string[0])) + if ($encoding == 'utf-8' || !is_string($string) || empty($string)) { return $string; } - // start with something simple + // we force iso-8859-1 to be cp1252 if ($encoding == 'iso-8859-1') { - return utf8_encode($string); + $encoding = 'cp1252'; } // First, try iconv() @@ -790,6 +793,7 @@ function utf8_recode($string, $encoding) break; case '1250': case '1251': + case '1252': case '1254': case '1255': case '1256': @@ -1103,6 +1107,9 @@ function utf8_clean_string($text) utf_normalizer::nfc($text); static $homographs = array( + "\x08" => '', // BACKSPACE => empty string + "\x09" => "\x20", // CHARACTER TABULATION => SPACE + "\x11" => "\x20", // Device Controls => SPACE "\xC2\xA1" => "\x69", // EXCLAMATION MARK, INVERTED => LATIN SMALL LETTER I "\xC2\xAD" => '', // HYPHEN, SOFT => empty string "\xC4\x90" => "\xC3\x90", // LATIN CAPITAL LETTER D WITH STROKE => LATIN CAPITAL LETTER ETH @@ -1172,6 +1179,7 @@ function utf8_clean_string($text) "\xE1\xB4\xA8" => "\xD0\xBF", // GREEK LETTER SMALL CAPITAL PI => CYRILLIC SMALL LETTER PE "\xE1\xB4\xA9" => "\xE1\xB4\x98", // GREEK LETTER SMALL CAPITAL RHO => LATIN LETTER SMALL CAPITAL P "\xE1\xB4\xAB" => "\xD0\xBB", // CYRILLIC LETTER SMALL CAPITAL EL => CYRILLIC SMALL LETTER EL + "\xE2\x80\x81" => "\x20", // EM QUAD => SPACE "\xE2\x8D\xB3" => "\xC9\xA9", // APL FUNCTIONAL SYMBOL IOTA => LATIN SMALL LETTER IOTA "\xE2\x8D\xB4" => "\xCF\x81", // APL FUNCTIONAL SYMBOL RHO => GREEK SMALL LETTER RHO "\xE2\x8D\xB5" => "\xCF\x89", // APL 
FUNCTIONAL SYMBOL OMEGA => GREEK SMALL LETTER OMEGA @@ -1182,63 +1190,10 @@ function utf8_clean_string($text) "\xF0\x90\x8F\x93" => "\xF0\x90\x8E\x93", // OLD PERSIAN NUMBER TEN => UGARITIC LETTER AIN "\xF0\x90\x92\xA0" => "\xF0\x90\x92\x86", // OSMANYA DIGIT ZERO => OSMANYA LETTER DEEL "\xF0\x92\x80\xB8" => "\xF0\x90\x8E\x9A", // CUNEIFORM SIGN ASH => UGARITIC LETTER TO - - "\xC2\xA0" => "\x20", // NO-BREAK SPACE - "\xE1\x9A\x80" => "\x20", // OGHAM SPACE MARK - "\xE2\x80\x80" => "\x20", // EN QUAD - "\xE2\x80\x81" => "\x20", // EM QUAD - "\xE2\x80\x82" => "\x20", // EN SPACE - "\xE2\x80\x83" => "\x20", // EM SPACE - "\xE2\x80\x84" => "\x20", // THREE-PER-EM SPACE - "\xE2\x80\x85" => "\x20", // FOUR-PER-EM SPACE - "\xE2\x80\x86" => "\x20", // SIX-PER-EM SPACE - "\xE2\x80\x87" => "\x20", // FIGURE SPACE - "\xE2\x80\x88" => "\x20", // PUNCTUATION SPACE - "\xE2\x80\x89" => "\x20", // THIN SPACE - "\xE2\x80\x8A" => "\x20", // HAIR SPACE - "\xE2\x80\xAF" => "\x20", // NARROW NO-BREAK SPACE - "\xE2\x81\x9F" => "\x20", // MEDIUM MATHEMATICAL SPACE - "\xE3\x80\x80" => "\x20", // IDEOGRAPHIC SPACE - - "\xDB\x9D" => '', // ARABIC END OF AYAH - "\xDC\x8F" => '', // SYRIAC ABBREVIATION MARK - "\xE1\xA0\x86" => '', // MONGOLIAN TODO SOFT HYPHEN - "\xE1\xA0\x8E" => '', // MONGOLIAN VOWEL SEPARATOR - "\xE2\x80\x8B" => '', // ZERO WIDTH SPACE - "\xE2\x80\x8C" => '', // ZERO WIDTH NON-JOINER - "\xE2\x80\x8D" => '', // ZERO WIDTH JOINER - "\xE2\x80\xA8" => '', // LINE SEPARATOR - "\xE2\x80\xA9" => '', // PARAGRAPH SEPARATOR - "\xE2\x81\xA0" => '', // WORD JOINER - "\xE2\x81\xA1" => '', // FUNCTION APPLICATION - "\xE2\x81\xA2" => '', // INVISIBLE TIMES - "\xE2\x81\xA3" => '', // INVISIBLE SEPARATOR - "\xE2\x81\xAA" => '', // [CONTROL CHARACTERS] - "\xE2\x81\xAB" => '', // [CONTROL CHARACTERS] - "\xE2\x81\xAC" => '', // [CONTROL CHARACTERS] - "\xE2\x81\xAD" => '', // [CONTROL CHARACTERS] - "\xE2\x81\xAE" => '', // [CONTROL CHARACTERS] - "\xE2\x81\xAF" => '', // [CONTROL CHARACTERS] 
- "\xEF\xBB\xBF" => '', // ZERO WIDTH NO-BREAK SPACE - "\xEF\xBF\xB9" => '', // [CONTROL CHARACTERS] - "\xEF\xBF\xBA" => '', // [CONTROL CHARACTERS] - "\xEF\xBF\xBB" => '', // [CONTROL CHARACTERS] - "\xEF\xBF\xBC" => '', // [CONTROL CHARACTERS] - "\xF0\x9D\x85\xB3" => '', // [MUSICAL CONTROL CHARACTERS] - "\xF0\x9D\x85\xB4" => '', // [MUSICAL CONTROL CHARACTERS] - "\xF0\x9D\x85\xB5" => '', // [MUSICAL CONTROL CHARACTERS] - "\xF0\x9D\x85\xB6" => '', // [MUSICAL CONTROL CHARACTERS] - "\xF0\x9D\x85\xB7" => '', // [MUSICAL CONTROL CHARACTERS] - "\xF0\x9D\x85\xB8" => '', // [MUSICAL CONTROL CHARACTERS] - "\xF0\x9D\x85\xB9" => '', // [MUSICAL CONTROL CHARACTERS] - "\xF0\x9D\x85\xBA" => '', // [MUSICAL CONTROL CHARACTERS] ); $text = strtr($text, $homographs); - // Other control characters - $text = preg_replace('#(?:[\x00-\x1F\x7F]+|(?:\xC2[\x80-\x9F])+)#', '', $text); - return $text; }