- search result extract shouldn't end in the middle of a multibyte character [Bug #11863]

- missing localisation for an imageset shouldn't create lots of "imageset refreshed" log messages [Bug #12027] - explain that themes which need parsing cannot be stored on the filesystem [Bug #11134] - normalize usernames (we really need to make sure we normalize everything) - improved utf8_clean_string, more complete list of homographs and NFKC normalization, also the resulting string is now trimmed - corrected searching subforums explanation [Bug #12209] git-svn-id: file:///svn/phpbb/trunk@7890 89ea8834-ac86-4346-8a33-228a782c2dd0
2025-06-29 06:38:52 +00:00 · 2007-07-15 20:53:27 +00:00 · 2007-07-15 20:53:27 +00:00 · 909e195a9b
commit 909e195a9b
parent f27fa04b8c
14 changed files with 406 additions and 182 deletions
--- a/phpBB/develop/generate_utf_confusables.php
+++ b/phpBB/develop/generate_utf_confusables.php
@ -0,0 +1,211 @@
+<?php
+/** 
+*
+* @package phpBB3
+* @version $Id$
+* @copyright (c) 2005 phpBB Group 
+* @license http://opensource.org/licenses/gpl-license.php GNU Public License 
+*
+*/
+
+if (php_sapi_name() != 'cli')
+{
+	die("This program must be run from the command line.\n");
+}
+
+//
+// Security message:
+//
+// This script is potentially dangerous.
+// Remove or comment the next line (die(".... ) to enable this script.
+// Do NOT FORGET to either remove this script or disable it after you have used it.
+//
+die("Please read the first lines of this script for instructions on how to enable it");
+
+// Lift the execution time limit for this run
+set_time_limit(0);
+
+define('IN_PHPBB', true);
+$phpbb_root_path = '../';
+// File extension of this very script (everything after the last dot of __FILE__)
+$phpEx = substr(strrchr(__FILE__, '.'), 1);
+
+echo "Checking for required files\n";
+// download() returns right away when the file already exists in the develop/ dir
+download('http://unicode.org/reports/tr39/data/confusables.txt');
+echo "\n";
+
+
+/**
+* Load the confusables table
+*
+* NOTE: the file is read relative to the current working directory, while download()
+* stores it under $phpbb_root_path . 'develop/'. Both only refer to the same file
+* when the script is started from within the develop/ dir.
+*/
+echo "Loading confusables\n";
+$unidata = file_get_contents('confusables.txt');
+
+
+/**
+* Encode a single code point as a UTF-8 byte sequence
+*
+* The number of bytes (one to four) is chosen from the magnitude of the code point.
+* No range validation is performed.
+*
+* @param	int		$cp			Code point
+* @return	string				UTF-8 encoded character
+*/
+function utf8_chr($cp)
+{
+	// one byte: 0x00 - 0x7F
+	if ($cp <= 0x7F)
+	{
+		return chr($cp);
+	}
+
+	// every following byte carries the next lower six bits of the code point
+	$last = chr(0x80 | ($cp & 0x3F));
+
+	// two bytes: 0x80 - 0x7FF
+	if ($cp <= 0x7FF)
+	{
+		return chr(0xC0 | ($cp >> 6)) . $last;
+	}
+
+	$second_last = chr(0x80 | (($cp >> 6) & 0x3F));
+
+	// three bytes: 0x800 - 0xFFFF
+	if ($cp <= 0xFFFF)
+	{
+		return chr(0xE0 | ($cp >> 12)) . $second_last . $last;
+	}
+
+	// four bytes: everything above 0xFFFF
+	return chr(0xF0 | ($cp >> 18)) . chr(0x80 | (($cp >> 12) & 0x3F)) . $second_last . $last;
+}
+
+// Collect the mapping lines of the loaded data into $array, one entry per matched line (PREG_SET_ORDER).
+// Capture group 1 is the source code point in hex, capture group 2 holds the hex target code points,
+// each of them followed by a single space. "i" makes the hex digits case-insensitive, "m" lets ^ match
+// at the start of every line.
+preg_match_all('/^([0-9A-F]+) ;\s((?:[0-9A-F]+ )*);/im', $unidata, $array, PREG_SET_ORDER);
+
+// some that we defined ourselves
+$uniarray = array(
+		"\xC2\xA1"			=>	"\x69",	// EXCLAMATION MARK, INVERTED => LATIN SMALL LETTER I
+		"\xC7\x83"			=>	"\x21",	// LATIN LETTER RETROFLEX CLICK => EXCLAMATION MARK
+		"\xCE\xB1"			=>	"\x61",	// GREEK SMALL LETTER ALPHA => LATIN SMALL LETTER A
+		"\xE1\x9A\x80"		=>	"\x20",	// OGHAM SPACE MARK
+
+		"\xC2\xAD"			=>	'',		// HYPHEN, SOFT => empty string
+		"\xDB\x9D"			=>	'',		// ARABIC END OF AYAH
+		"\xDC\x8F"			=>	'',		// SYRIAC ABBREVIATION MARK
+		"\xE1\xA0\x86"		=>	'',		// MONGOLIAN TODO SOFT HYPHEN
+		"\xE1\xA0\x8E"		=>	'',		// MONGOLIAN VOWEL SEPARATOR
+		"\xE2\x80\x8B"		=>	'',		// ZERO WIDTH SPACE
+		"\xE2\x80\x8C"		=>	'',		// ZERO WIDTH NON-JOINER
+		"\xE2\x80\x8D"		=>	'',		// ZERO WIDTH JOINER
+		"\xE2\x80\xA8"		=>	'',		// LINE SEPARATOR
+		"\xE2\x80\xA9"		=>	'',		// PARAGRAPH SEPARATOR
+		"\xE2\x81\xA0"		=>	'',		// WORD JOINER
+		"\xE2\x81\xA1"		=>	'',		// FUNCTION APPLICATION
+		"\xE2\x81\xA2"		=>	'',		// INVISIBLE TIMES
+		"\xE2\x81\xA3"		=>	'',		// INVISIBLE SEPARATOR
+		"\xE2\x81\xAA"		=>	'',		// [CONTROL CHARACTERS]
+		"\xE2\x81\xAB"		=>	'',		// [CONTROL CHARACTERS]
+		"\xE2\x81\xAC"		=>	'',		// [CONTROL CHARACTERS]
+		"\xE2\x81\xAD"		=>	'',		// [CONTROL CHARACTERS]
+		"\xE2\x81\xAE"		=>	'',		// [CONTROL CHARACTERS]
+		"\xE2\x81\xAF"		=>	'',		// [CONTROL CHARACTERS]
+		"\xEF\xBB\xBF"		=>	'',		// ZERO WIDTH NO-BREAK SPACE
+		"\xEF\xBF\xB9"		=>	'',		// [CONTROL CHARACTERS]
+		"\xEF\xBF\xBA"		=>	'',		// [CONTROL CHARACTERS]
+		"\xEF\xBF\xBB"		=>	'',		// [CONTROL CHARACTERS]
+		"\xEF\xBF\xBC"		=>	'',		// [CONTROL CHARACTERS]
+		"\xF0\x9D\x85\xB3"	=>	'',		// [MUSICAL CONTROL CHARACTERS]
+		"\xF0\x9D\x85\xB4"	=>	'',		// [MUSICAL CONTROL CHARACTERS]
+		"\xF0\x9D\x85\xB5"	=>	'',		// [MUSICAL CONTROL CHARACTERS]
+		"\xF0\x9D\x85\xB6"	=>	'',		// [MUSICAL CONTROL CHARACTERS]
+		"\xF0\x9D\x85\xB7"	=>	'',		// [MUSICAL CONTROL CHARACTERS]
+		"\xF0\x9D\x85\xB8"	=>	'',		// [MUSICAL CONTROL CHARACTERS]
+		"\xF0\x9D\x85\xB9"	=>	'',		// [MUSICAL CONTROL CHARACTERS]
+		"\xF0\x9D\x85\xBA"	=>	'',		// [MUSICAL CONTROL CHARACTERS]
+);
+
+// Snapshot of the entries we defined ourselves, taken before the parsed data is merged in
+$copy = $uniarray;
+
+foreach ($array as $value)
+{
+	// $value[1] is the source code point, $value[2] the space separated target code points (all hex)
+	$string = utf8_chr(hexdec((string) $value[1]));
+	$replacement = implode(array_map('utf8_chr', array_map('hexdec', explode(' ', trim($value[2])))));
+
+	if (isset($copy[$string]))
+	{
+		// The character is one of our own entries: print its bytes in \xNN notation ...
+		$num = '';
+		for ($i = 0, $size = strlen($string); $i < $size; $i++)
+		{
+			$num .= sprintf('\x%02x', ord($string[$i]));
+		}
+		echo $num . "\n";
+
+		// ... and show both sides if the parsed data maps it to something else
+		if ($uniarray[$string] != $replacement)
+		{
+			echo "  --> $string\n";
+			echo "  --> " . $replacement . "\n";
+		}
+	}
+
+	// The parsed mapping always replaces whatever was stored before
+	$uniarray[$string] = $replacement;
+}
+
+echo "Writing to confusables.$phpEx\n";
+
+// Write the merged table as a PHP file which returns the array when included.
+// Abort with a message if the target cannot be opened, otherwise fwrite() and
+// fclose() would be handed false instead of a file handle.
+if (!$fp = fopen($phpbb_root_path . 'includes/utf/data/confusables.' . $phpEx, 'wb'))
+{
+	die("Can't open includes/utf/data/confusables.$phpEx for output... please check your permissions or something\n");
+}
+fwrite($fp, '<?php return ' . my_var_export($uniarray) . ';');
+fclose($fp);
+
+/**
+* Return a parsable string representation of a variable
+*
+* This function is limited to arrays/strings/integers. Arrays are exported
+* recursively (keys and values), strings are single-quoted with backslashes
+* and single quotes escaped, anything else is handed back unchanged.
+*
+* @param	mixed	$var		Variable
+* @return	string				PHP code representing the variable
+*/
+function my_var_export($var)
+{
+	if (is_string($var))
+	{
+		// a single pass is enough: both characters simply get a backslash prepended
+		return "'" . strtr($var, array('\\' => '\\\\', "'" => "\\'")) . "'";
+	}
+
+	if (!is_array($var))
+	{
+		return $var;
+	}
+
+	$pairs = array();
+
+	foreach ($var as $key => $item)
+	{
+		$pairs[] = my_var_export($key) . '=>' . my_var_export($item);
+	}
+
+	return 'array(' . implode(',', $pairs) . ')';
+}
+
+/**
+* Download a file to the develop/ dir
+*
+* Nothing happens if a file with the same name is already present there.
+* The script is terminated if either the source or the target cannot be opened.
+* While copying, the amount written so far is printed and continuously overwritten.
+*
+* @param	string	$url		URL of the file to download
+* @return	void
+*/
+function download($url)
+{
+	global $phpbb_root_path;
+
+	$target = $phpbb_root_path . 'develop/' . basename($url);
+
+	if (file_exists($target))
+	{
+		return;
+	}
+
+	echo 'Downloading from ' . $url . ' ';
+
+	$source = fopen($url, 'rb');
+	if (!$source)
+	{
+		die("Can't download from $url\nPlease download it yourself and put it in the develop/ dir, kthxbai");
+	}
+
+	$destination = fopen($target, 'wb');
+	if (!$destination)
+	{
+		die("Can't open develop/" . basename($url) . " for output... please check your permissions or something");
+	}
+
+	$written = 0;
+	$progress = '';
+
+	while (!feof($source))
+	{
+		// copy in blocks of 32 KiB
+		$written += fwrite($destination, fread($source, 32768));
+
+		// backspace over the previous progress output before printing the new one
+		echo str_repeat("\x08", strlen($progress));
+
+		$progress = ($written >> 10) . ' KiB';
+		echo $progress;
+	}
+
+	fclose($source);
+	fclose($destination);
+
+	echo "\n";
+}
+
+?>
--- a/phpBB/develop/unicode_testing.php
+++ b/phpBB/develop/unicode_testing.php
@ -0,0 +1,120 @@
+<?php
+//
+// This file provides some useful functions for debugging the unicode/UTF-8 library
+// It requires utf_tools.php to be loaded
+//
+// The die() call below stops execution as soon as this file is run.
+// Remove or comment it to enable this script, and disable it again after use.
+//
+die("Please read the first lines of this script for instructions on how to enable it");
+
+// Announce UTF-8 output, but only while headers can still be sent
+if (!headers_sent())
+{
+	header('Content-type: text/html; charset=UTF-8');
+}
+
+/**
+ * Converts unicode escape sequences (\u0123) into UTF-8 characters
+ *
+ * Only sequences of a backslash, a "u" and exactly four hex digits are converted,
+ * every other byte of the input is copied unchanged.
+ *
+ * @param	string	A unicode sequence
+ * @return	string	UTF-8 representation of the given unicode sequence
+ */
+function unicode_to_utf8($string)
+{
+	$utf8 = '';
+	$length = strlen($string);
+
+	for ($i = 0; $i < $length; $i++)
+	{
+		// a complete escape sequence occupies six bytes starting at $i
+		if (isset($string[$i + 5]) && substr($string, $i, 2) == '\\u' && ctype_xdigit(substr($string, $i + 2, 4)))
+		{
+			// hand over the code point as an integer
+			$utf8 .= utf8_from_unicode(array(hexdec(substr($string, $i + 2, 4))));
+
+			// skip the rest of the sequence, the loop increment accounts for its sixth byte
+			$i += 5;
+		}
+		else
+		{
+			$utf8 .= $string[$i];
+		}
+	}
+
+	return $utf8;
+}
+
+/**
+ * Builds a UTF-8 string out of a list of unicode code points by passing
+ * each of them to utf8_chr() and joining the results in order.
+ *
+ * @param array $array array of unicode code points representing a string
+ * @return string UTF-8 character string
+ */
+function utf8_from_unicode($array)
+{
+	$chars = array();
+
+	foreach ($array as $code_point)
+	{
+		$chars[] = utf8_chr($code_point);
+	}
+
+	return implode('', $chars);
+}
+
+/**
+* Converts a UTF-8 string to unicode code points
+*
+* The conversion runs in two passes which both use utf8_to_unicode_callback():
+* single byte characters first, multibyte sequences afterwards.
+*
+* @param	string	$text		UTF-8 string
+* @return	string				Unicode code points
+*/
+function utf8_to_unicode($text)
+{
+	// first pass: every byte in the ASCII range
+	$ascii_converted = preg_replace_callback('#[\\x00-\\x7f]#', 'utf8_to_unicode_callback', $text);
+
+	// second pass: a lead byte followed by one to three continuation bytes
+	return preg_replace_callback(
+		'#[\\xC2-\\xF4][\\x80-\\xBF]?[\\x80-\\xBF]?[\\x80-\\xBF]#',
+		'utf8_to_unicode_callback',
+		$ascii_converted
+	);
+}
+
+/**
+* Replaces a single UTF-8 char with its unicode escape sequence: "\u" followed by the
+* hexadecimal value of utf8_ord(), left-padded with zeros to at least four digits.
+* Attention, $m is an array
+*
+* @param	array	$m			0-based numerically indexed array passed by preg_replace_callback()
+* @return	string				A unicode escape sequence
+*/
+function utf8_to_unicode_callback($m)
+{
+	$hex = base_convert(utf8_ord($m[0]), 10, 16);
+
+	return '\u' . str_pad($hex, 4, '0', STR_PAD_LEFT);
+}
+
+/**
+* Wrapper around utf_normalizer::nfkc() which includes the normalizer class on demand.
+* Empty input is handed back untouched without loading the class.
+*
+* @param	mixed	$strings	a string or an array of strings to normalize
+* @return	mixed				the normalized content, preserving array keys if array given.
+*/
+function utf8_normalize_nfkc($strings)
+{
+	if (empty($strings))
+	{
+		return $strings;
+	}
+
+	if (!class_exists('utf_normalizer'))
+	{
+		global $phpbb_root_path, $phpEx;
+		include($phpbb_root_path . 'includes/utf/utf_normalizer.' . $phpEx);
+	}
+
+	if (is_array($strings))
+	{
+		// pass every element on its own so that the keys stay as they are
+		foreach (array_keys($strings) as $key)
+		{
+			utf_normalizer::nfkc($strings[$key]);
+		}
+	}
+	else
+	{
+		utf_normalizer::nfkc($strings);
+	}
+
+	return $strings;
+}
+
+?>
--- a/phpBB/docs/CHANGELOG.html
+++ b/phpBB/docs/CHANGELOG.html
@ -242,7 +242,11 @@ p a {
 		<li>[Fix] Do not display return to search link in prosilver if search is not allowed (Bug #11393)</li>
 		<li>[Fix] Use global url validation for img bbcode tag (Bug #11935)</li>
 		<li>[Fix] Added proper unicode support to style names (Bug #12165)</li>
-
+		<li>[Fix] Search result extract should not end in the middle of a multibyte character (Bug #11863)</li>
+		<li>[Fix] Missing localisation for an imageset no longer triggers a lot of "imageset refreshed" log messages (Bug #12027)</li>
+		<li>[Fix] Explain that themes which need parsing cannot be stored on the filesystem (Bug #11134)</li>
+		<li>[Fix] Normalize usernames</li>
+		<li>[Change] Improved utf8_clean_string with a more complete list of homographs and NFKC normalization</li>
 	
 	</ul>

--- a/phpBB/includes/acp/acp_styles.php
+++ b/phpBB/includes/acp/acp_styles.php
@ -2051,6 +2051,18 @@ parse_css_file = {PARSE_CSS_FILE}
 					$error[] = $user->lang['EDIT_' . strtoupper($mode) . '_STORED_DB'];
 					$store_db = 1;
 				}
+
+				// themes which have to be parsed have to go into db
+				if ($mode == 'theme')
+				{
+					$cfg = parse_cfg_file("{$phpbb_root_path}styles/" . $style_row["{$mode}_path"] . "/theme/theme.cfg");
+
+					if (isset($cfg['parse_css_file']) && $cfg['parse_css_file'])
+					{
+						$error[] = $user->lang['EDIT_THEME_STORE_PARSED'];
+						$store_db = 1;
+					}
+				}
 			}
 			
 			if (!sizeof($error))
--- a/phpBB/includes/functions.php
+++ b/phpBB/includes/functions.php
@ -2394,7 +2394,7 @@ function get_context($text, $words, $length = 400)
 			{
 				if (preg_match('#(?:[^\w]|^)(' . $word . ')(?:[^\w]|$)#i', $text, $match))
 				{
-					$pos = strpos($text, $match[1]);
+					$pos = utf8_strpos($text, $match[1]);
 					if ($pos !== false)
 					{
 						$word_indizes[] = $pos;
@ -2417,21 +2417,21 @@ function get_context($text, $words, $length = 400)
 			$final_text_index = -1;

 			// cycle through every character in the original text
-			for ($i = $word_indizes[$word], $n = strlen($text); $i < $n; $i++)
+			for ($i = $word_indizes[$word], $n = utf8_strlen($text); $i < $n; $i++)
 			{
 				// if the current position is the start of one of the words then append $sequence_length characters to the final text
 				if (isset($word_indizes[$word]) && ($i == $word_indizes[$word]))
 				{
 					if ($final_text_index < $i - $sequence_length - 1)
 					{
-						$final_text .= '... ' . preg_replace('#^([^ ]*)#', '', substr($text, $i - $sequence_length, $sequence_length));
+						$final_text .= '... ' . preg_replace('#^([^ ]*)#', '', utf8_substr($text, $i - $sequence_length, $sequence_length));
 					}
 					else
 					{
 						// if the final text is already nearer to the current word than $sequence_length we only append the text
 						// from its current index on and distribute the unused length to all other sequenes
 						$sequence_length += (int) (($final_text_index - $i + $sequence_length + 1) / (2 * $wordnum));
-						$final_text .= substr($text, $final_text_index + 1, $i - $final_text_index - 1);
+						$final_text .= utf8_substr($text, $final_text_index + 1, $i - $final_text_index - 1);
 					}
 					$final_text_index = $i - 1;

@ -2443,17 +2443,17 @@ function get_context($text, $words, $length = 400)
 				if ($j > 0)
 				{
 					// add the character to the final text and increment the sequence counter
-					$final_text .= $text[$i];
+					$final_text .= utf8_substr($text, $i, 1);
 					$final_text_index++;
 					$j++;

 					// if this is a whitespace then check whether we are done with this sequence
-					if ($text[$i] == ' ')
+					if (utf8_substr($text, $i, 1) == ' ')
 					{
 						// only check whether we have to exit the context generation completely if we haven't already reached the end anyway
 						if ($i + 4 < $n)
 						{
-							if (($j > $sequence_length && $word >= $wordnum) || strlen($final_text) > $length)
+							if (($j > $sequence_length && $word >= $wordnum) || utf8_strlen($final_text) > $length)
 							{
 								$final_text .= ' ...';
 								break;
@ -2479,7 +2479,7 @@ function get_context($text, $words, $length = 400)

 	if (!sizeof($words) || !sizeof($word_indizes))
 	{
-		return (strlen($text) >= $length + 3) ? substr($text, 0, $length) . '...' : $text;
+		return (utf8_strlen($text) >= $length + 3) ? utf8_substr($text, 0, $length) . '...' : $text;
 	}
 }

--- a/phpBB/includes/session.php
+++ b/phpBB/includes/session.php
@ -1465,13 +1465,19 @@ class user extends session
 				}
 			}

+			if (sizeof($sql_ary))
+			{
 				$db->sql_multi_insert(STYLES_IMAGESET_DATA_TABLE, $sql_ary);
-	
 				$db->sql_transaction('commit');
-	
 				$cache->destroy('sql', STYLES_IMAGESET_DATA_TABLE);

-			add_log('admin', 'LOG_IMAGESET_REFRESHED', $this->theme['imageset_name'], $this->img_lang);
+				add_log('admin', 'LOG_IMAGESET_LANG_REFRESHED', $this->theme['imageset_name'], $this->img_lang);
+			}
+			else
+			{
+				$db->sql_transaction('commit');
+				add_log('admin', 'LOG_IMAGESET_LANG_MISSING', $this->theme['imageset_name'], $this->img_lang);
+			}
 		}

 		// If this function got called from the error handler we are finished here.
--- a/phpBB/includes/ucp/ucp_profile.php
+++ b/phpBB/includes/ucp/ucp_profile.php
@ -36,7 +36,7 @@ class ucp_profile
 			case 'reg_details':

 				$data = array(
-					'username'			=> request_var('username', $user->data['username'], true),
+					'username'			=> utf8_normalize_nfc(request_var('username', $user->data['username'], true)),
 					'email'				=> strtolower(request_var('email', $user->data['user_email'])),
 					'email_confirm'		=> strtolower(request_var('email_confirm', '')),
 					'new_password'		=> request_var('new_password', '', true),
--- a/phpBB/includes/ucp/ucp_register.php
+++ b/phpBB/includes/ucp/ucp_register.php
@ -142,7 +142,7 @@ class ucp_register
 		}

 		$data = array(
-			'username'			=> request_var('username', '', true),
+			'username'			=> utf8_normalize_nfc(request_var('username', '', true)),
 			'new_password'		=> request_var('new_password', '', true),
 			'password_confirm'	=> request_var('password_confirm', '', true),
 			'email'				=> strtolower(request_var('email', '')),
--- a/phpBB/includes/utf/data/confusables.php
+++ b/phpBB/includes/utf/data/confusables.php
--- a/phpBB/includes/utf/utf_tools.php
+++ b/phpBB/includes/utf/utf_tools.php
@ -1847,155 +1847,23 @@ function utf8_normalize_nfc($strings)
 * @return	string			Cleaned up version of the input string
 */
 function utf8_clean_string($text)
-{
-	$text = utf8_case_fold($text);
-	
-	if (!class_exists('utf_normalizer'))
 {
 	global $phpbb_root_path, $phpEx;
-		include($phpbb_root_path . 'includes/utf/utf_normalizer.' . $phpEx);
+
+	static $homographs = array();
+	if (empty($homographs))
+	{
+		$homographs = include($phpbb_root_path . 'includes/utf/data/confusables.' . $phpEx);
 	}

-	utf_normalizer::nfc($text);
-
-	static $homographs = array(
-		"\xC2\xA1"			=>	"\x69",				// EXCLAMATION MARK, INVERTED => LATIN SMALL LETTER I
-		"\xC2\xAD"			=>	'',					// HYPHEN, SOFT => empty string
-		"\xC4\x90"			=>	"\xC3\x90",			// LATIN CAPITAL LETTER D WITH STROKE => LATIN CAPITAL LETTER ETH
-		"\xC7\x83"			=>	"\x21",				// LATIN LETTER RETROFLEX CLICK => EXCLAMATION MARK
-		"\xC9\x85"			=>	"\xCE\x9B",			// LATIN CAPITAL LETTER TURNED V => GREEK CAPITAL LETTER LAMDA
-		"\xC9\x99"			=>	"\xC7\x9D",			// LATIN SMALL LETTER SCHWA => LATIN SMALL LETTER TURNED E
-		"\xCA\x99"			=>	"\xD0\xB2",			// LATIN LETTER SMALL CAPITAL B => CYRILLIC SMALL LETTER VE
-		"\xCA\x9C"			=>	"\xD0\xBD",			// LATIN LETTER SMALL CAPITAL H => CYRILLIC SMALL LETTER EN
-		"\xCE\x91"			=>	"\x41",				// GREEK CAPITAL LETTER ALPHA => LATIN CAPITAL LETTER A
-		"\xCE\x92"			=>	"\x42",				// GREEK CAPITAL LETTER BETA => LATIN CAPITAL LETTER B
-		"\xCE\x95"			=>	"\x45",				// GREEK CAPITAL LETTER EPSILON => LATIN CAPITAL LETTER E
-		"\xCE\x96"			=>	"\x5A",				// GREEK CAPITAL LETTER ZETA => LATIN CAPITAL LETTER Z
-		"\xCE\x97"			=>	"\x48",				// GREEK CAPITAL LETTER ETA => LATIN CAPITAL LETTER H
-		"\xCE\x99"			=>	"\x49",				// GREEK CAPITAL LETTER IOTA => LATIN CAPITAL LETTER I
-		"\xCE\x9A"			=>	"\x4B",				// GREEK CAPITAL LETTER KAPPA => LATIN CAPITAL LETTER K
-		"\xCE\x9C"			=>	"\x4D",				// GREEK CAPITAL LETTER MU => LATIN CAPITAL LETTER M
-		"\xCE\x9D"			=>	"\x4E",				// GREEK CAPITAL LETTER NU => LATIN CAPITAL LETTER N
-		"\xCE\x9F"			=>	"\x4F",				// GREEK CAPITAL LETTER OMICRON => LATIN CAPITAL LETTER O
-		"\xCE\xA1"			=>	"\x50",				// GREEK CAPITAL LETTER RHO => LATIN CAPITAL LETTER P
-		"\xCE\xA3"			=>	"\xC6\xA9",			// GREEK CAPITAL LETTER SIGMA => LATIN CAPITAL LETTER ESH
-		"\xCE\xA4"			=>	"\x54",				// GREEK CAPITAL LETTER TAU => LATIN CAPITAL LETTER T
-		"\xCE\xA5"			=>	"\x59",				// GREEK CAPITAL LETTER UPSILON => LATIN CAPITAL LETTER Y
-		"\xCE\xA7"			=>	"\x58",				// GREEK CAPITAL LETTER CHI => LATIN CAPITAL LETTER X
-		"\xCE\xB1"			=>	"\x61",				// GREEK SMALL LETTER ALPHA => LATIN SMALL LETTER A
-		"\xCE\xB5"			=>	"\xC9\x9B",			// GREEK SMALL LETTER EPSILON => LATIN SMALL LETTER OPEN E
-		"\xCE\xB9"			=>	"\xC9\xA9",			// GREEK SMALL LETTER IOTA => LATIN SMALL LETTER IOTA
-		"\xCE\xBF"			=>	"\x6F",				// GREEK SMALL LETTER OMICRON => LATIN SMALL LETTER O
-		"\xCF\xB3"			=>	"\x6A",				// GREEK LETTER YOT => LATIN SMALL LETTER J
-		"\xD0\x85"			=>	"\x53",				// CYRILLIC CAPITAL LETTER DZE => LATIN CAPITAL LETTER S
-		"\xD0\x88"			=>	"\x4A",				// CYRILLIC CAPITAL LETTER JE => LATIN CAPITAL LETTER J
-		"\xD0\x91"			=>	"\xC6\x82",			// CYRILLIC CAPITAL LETTER BE => LATIN CAPITAL LETTER B WITH TOPBAR
-		"\xD0\x93"			=>	"\xCE\x93",			// CYRILLIC CAPITAL LETTER GHE => GREEK CAPITAL LETTER GAMMA
-		"\xD0\x9F"			=>	"\xCE\xA0",			// CYRILLIC CAPITAL LETTER PE => GREEK CAPITAL LETTER PI
-		"\xD0\xA1"			=>	"\x43",				// CYRILLIC CAPITAL LETTER ES => LATIN CAPITAL LETTER C
-		"\xD0\xB0"			=>	"\x61",				// CYRILLIC SMALL LETTER A => LATIN SMALL LETTER A
-		"\xD0\xB5"			=>	"\x65",				// CYRILLIC SMALL LETTER IE => LATIN SMALL LETTER E
-		"\xD0\xBA"			=>	"\xC4\xB8",			// CYRILLIC SMALL LETTER KA => LATIN SMALL LETTER KRA
-		"\xD0\xBE"			=>	"\x6F",				// CYRILLIC SMALL LETTER O => LATIN SMALL LETTER O
-		"\xD1\x80"			=>	"\x70",				// CYRILLIC SMALL LETTER ER => LATIN SMALL LETTER P
-		"\xD1\x81"			=>	"\x63",				// CYRILLIC SMALL LETTER ES => LATIN SMALL LETTER C
-		"\xD1\x83"			=>	"\x79",				// CYRILLIC SMALL LETTER U => LATIN SMALL LETTER Y
-		"\xD1\x85"			=>	"\x78",				// CYRILLIC SMALL LETTER HA => LATIN SMALL LETTER X
-		"\xD1\x95"			=>	"\x73",				// CYRILLIC SMALL LETTER DZE => LATIN SMALL LETTER S
-		"\xD1\x96"			=>	"\x69",				// CYRILLIC SMALL LETTER BYELORUSSIAN-UKRAINIAN I => LATIN SMALL LETTER I
-		"\xD1\x98"			=>	"\x6A",				// CYRILLIC SMALL LETTER JE => LATIN SMALL LETTER J
-		"\xD2\xBB"			=>	"\x68",				// CYRILLIC SMALL LETTER SHHA => LATIN SMALL LETTER H
-		"\xD3\x8F"			=>	"\xC9\xAA",			// CYRILLIC SMALL LETTER PALOCHKA => LATIN LETTER SMALL CAPITAL I
-		"\xD3\x94"			=>	"\xC3\x86",			// CYRILLIC CAPITAL LIGATURE A IE => LATIN CAPITAL LETTER AE
-		"\xD3\x95"			=>	"\xC3\xA6",			// CYRILLIC SMALL LIGATURE A IE => LATIN SMALL LETTER AE
-		"\xD3\x98"			=>	"\xC6\x8E",			// CYRILLIC CAPITAL LETTER SCHWA => LATIN CAPITAL LETTER REVERSED E
-		"\xD3\x99"			=>	"\xC7\x9D",			// CYRILLIC SMALL LETTER SCHWA => LATIN SMALL LETTER TURNED E
-		"\xD3\xA1"			=>	"\xCA\x92",			// CYRILLIC SMALL LETTER ABKHASIAN DZE => LATIN SMALL LETTER EZH
-		"\xD3\xA8"			=>	"\xC6\x9F",			// CYRILLIC CAPITAL LETTER BARRED O => LATIN CAPITAL LETTER O WITH MIDDLE TILDE
-		"\xD3\xA9"			=>	"\xC9\xB5",			// CYRILLIC SMALL LETTER BARRED O => LATIN SMALL LETTER BARRED O
-		"\xD4\x81"			=>	"\x64",				// CYRILLIC SMALL LETTER KOMI DE => LATIN SMALL LETTER D
-		"\xE1\x81\x80"		=>	"\xE1\x80\x9D",		// MYANMAR DIGIT ZERO => MYANMAR LETTER WA
-		"\xE1\x9E\xA3"		=>	"\xE1\x9E\xA2",		// KHMER INDEPENDENT VOWEL QAQ => KHMER LETTER QA
-		"\xE1\xA1\x95"		=>	"\xE1\xA0\xB5",		// MONGOLIAN LETTER TODO YA => MONGOLIAN LETTER JA
-		"\xE1\xA7\x90"		=>	"\xE1\xA6\x9E",		// NEW TAI LUE DIGIT ZERO => NEW TAI LUE LETTER LOW VA
-		"\xE1\xAD\x92"		=>	"\xE1\xAC\x8D",		// BALINESE DIGIT TWO => BALINESE LETTER LA LENGA
-		"\xE1\xAD\x93"		=>	"\xE1\xAC\x91",		// BALINESE DIGIT THREE => BALINESE LETTER OKARA
-		"\xE1\xAD\x98"		=>	"\xE1\xAC\xA8",		// BALINESE DIGIT EIGHT => BALINESE LETTER PA KAPAL
-		"\xE1\xAD\x9C"		=>	"\xE1\xAD\x90",		// BALINESE WINDU => BALINESE DIGIT ZERO
-		"\xE1\xB4\x8D"		=>	"\xD0\xBC",			// LATIN LETTER SMALL CAPITAL M => CYRILLIC SMALL LETTER EM
-		"\xE1\xB4\x9B"		=>	"\xD1\x82",			// LATIN LETTER SMALL CAPITAL T => CYRILLIC SMALL LETTER TE
-		"\xE1\xB4\xA6"		=>	"\xD0\xB3",			// GREEK LETTER SMALL CAPITAL GAMMA => CYRILLIC SMALL LETTER GHE
-		"\xE1\xB4\xA8"		=>	"\xD0\xBF",			// GREEK LETTER SMALL CAPITAL PI => CYRILLIC SMALL LETTER PE
-		"\xE1\xB4\xA9"		=>	"\xE1\xB4\x98",		// GREEK LETTER SMALL CAPITAL RHO => LATIN LETTER SMALL CAPITAL P
-		"\xE1\xB4\xAB"		=>	"\xD0\xBB",			// CYRILLIC LETTER SMALL CAPITAL EL => CYRILLIC SMALL LETTER EL
-		"\xE2\x8D\xB3"		=>	"\xC9\xA9",			// APL FUNCTIONAL SYMBOL IOTA => LATIN SMALL LETTER IOTA
-		"\xE2\x8D\xB4"		=>	"\xCF\x81",			// APL FUNCTIONAL SYMBOL RHO => GREEK SMALL LETTER RHO
-		"\xE2\x8D\xB5"		=>	"\xCF\x89",			// APL FUNCTIONAL SYMBOL OMEGA => GREEK SMALL LETTER OMEGA
-		"\xE2\x8D\xBA"		=>	"\xCE\xB1",			// APL FUNCTIONAL SYMBOL ALPHA => GREEK SMALL LETTER ALPHA
-		"\xE2\xB1\xA7"		=>	"\xD2\xA2",			// LATIN CAPITAL LETTER H WITH DESCENDER => CYRILLIC CAPITAL LETTER EN WITH DESCENDER
-		"\xE2\xB1\xA9"		=>	"\xD2\x9A",			// LATIN CAPITAL LETTER K WITH DESCENDER => CYRILLIC CAPITAL LETTER KA WITH DESCENDER
-		"\xF0\x90\x8F\x91"	=>	"\xF0\x90\x8E\x82",	// OLD PERSIAN NUMBER ONE => UGARITIC LETTER GAMLA
-		"\xF0\x90\x8F\x93"	=>	"\xF0\x90\x8E\x93",	// OLD PERSIAN NUMBER TEN => UGARITIC LETTER AIN
-		"\xF0\x90\x92\xA0"	=>	"\xF0\x90\x92\x86",	// OSMANYA DIGIT ZERO => OSMANYA LETTER DEEL
-		"\xF0\x92\x80\xB8"	=>	"\xF0\x90\x8E\x9A",	// CUNEIFORM SIGN ASH => UGARITIC LETTER TO
-
-		"\xC2\xA0"			=>	"\x20",				// NO-BREAK SPACE
-		"\xE1\x9A\x80"		=>	"\x20",				// OGHAM SPACE MARK
-		"\xE2\x80\x80"		=>	"\x20",				// EN QUAD
-		"\xE2\x80\x81"		=>	"\x20",				// EM QUAD
-		"\xE2\x80\x82"		=>	"\x20",				// EN SPACE
-		"\xE2\x80\x83"		=>	"\x20",				// EM SPACE
-		"\xE2\x80\x84"		=>	"\x20",				// THREE-PER-EM SPACE
-		"\xE2\x80\x85"		=>	"\x20",				// FOUR-PER-EM SPACE
-		"\xE2\x80\x86"		=>	"\x20",				// SIX-PER-EM SPACE
-		"\xE2\x80\x87"		=>	"\x20",				// FIGURE SPACE
-		"\xE2\x80\x88"		=>	"\x20",				// PUNCTUATION SPACE
-		"\xE2\x80\x89"		=>	"\x20",				// THIN SPACE
-		"\xE2\x80\x8A"		=>	"\x20",				// HAIR SPACE
-		"\xE2\x80\xAF"		=>	"\x20",				// NARROW NO-BREAK SPACE
-		"\xE2\x81\x9F"		=>	"\x20",				// MEDIUM MATHEMATICAL SPACE
-		"\xE3\x80\x80"		=>	"\x20",				// IDEOGRAPHIC SPACE
-
-		"\xDB\x9D"			=>	'',					// ARABIC END OF AYAH
-		"\xDC\x8F"			=>	'',					// SYRIAC ABBREVIATION MARK
-		"\xE1\xA0\x86"		=>	'',					// MONGOLIAN TODO SOFT HYPHEN
-		"\xE1\xA0\x8E"		=>	'',					// MONGOLIAN VOWEL SEPARATOR
-		"\xE2\x80\x8B"		=>	'',					// ZERO WIDTH SPACE
-		"\xE2\x80\x8C"		=>	'',					// ZERO WIDTH NON-JOINER
-		"\xE2\x80\x8D"		=>	'',					// ZERO WIDTH JOINER
-		"\xE2\x80\xA8"		=>	'',					// LINE SEPARATOR
-		"\xE2\x80\xA9"		=>	'',					// PARAGRAPH SEPARATOR
-		"\xE2\x81\xA0"		=>	'',					// WORD JOINER
-		"\xE2\x81\xA1"		=>	'',					// FUNCTION APPLICATION
-		"\xE2\x81\xA2"		=>	'',					// INVISIBLE TIMES
-		"\xE2\x81\xA3"		=>	'',					// INVISIBLE SEPARATOR
-		"\xE2\x81\xAA"		=>	'',					// [CONTROL CHARACTERS]
-		"\xE2\x81\xAB"		=>	'',					// [CONTROL CHARACTERS]
-		"\xE2\x81\xAC"		=>	'',					// [CONTROL CHARACTERS]
-		"\xE2\x81\xAD"		=>	'',					// [CONTROL CHARACTERS]
-		"\xE2\x81\xAE"		=>	'',					// [CONTROL CHARACTERS]
-		"\xE2\x81\xAF"		=>	'',					// [CONTROL CHARACTERS]
-		"\xEF\xBB\xBF"		=>	'',					// ZERO WIDTH NO-BREAK SPACE
-		"\xEF\xBF\xB9"		=>	'',					// [CONTROL CHARACTERS]
-		"\xEF\xBF\xBA"		=>	'',					// [CONTROL CHARACTERS]
-		"\xEF\xBF\xBB"		=>	'',					// [CONTROL CHARACTERS]
-		"\xEF\xBF\xBC"		=>	'',					// [CONTROL CHARACTERS]
-		"\xF0\x9D\x85\xB3"	=>	'',					// [MUSICAL CONTROL CHARACTERS]
-		"\xF0\x9D\x85\xB4"	=>	'',					// [MUSICAL CONTROL CHARACTERS]
-		"\xF0\x9D\x85\xB5"	=>	'',					// [MUSICAL CONTROL CHARACTERS]
-		"\xF0\x9D\x85\xB6"	=>	'',					// [MUSICAL CONTROL CHARACTERS]
-		"\xF0\x9D\x85\xB7"	=>	'',					// [MUSICAL CONTROL CHARACTERS]
-		"\xF0\x9D\x85\xB8"	=>	'',					// [MUSICAL CONTROL CHARACTERS]
-		"\xF0\x9D\x85\xB9"	=>	'',					// [MUSICAL CONTROL CHARACTERS]
-		"\xF0\x9D\x85\xBA"	=>	'',					// [MUSICAL CONTROL CHARACTERS]
-	);
-
+	$text = utf8_case_fold_nfkc($text);
 	$text = strtr($text, $homographs);
-
 	// Other control characters
 	$text = preg_replace('#(?:[\x00-\x1F\x7F]+|(?:\xC2[\x80-\x9F])+)#', '', $text);

-	return $text;
+	// we can use trim here as all the other space characters should have been turned
+	// into normal ASCII spaces by now
+	return trim($text);
 }

 /**
--- a/phpBB/install/data/confusables.php
+++ b/phpBB/install/data/confusables.php
--- a/phpBB/language/en/acp/common.php
+++ b/phpBB/language/en/acp/common.php
@ -523,6 +523,7 @@ $lang = array_merge($lang, array(
 	'LOG_IMAGESET_EDIT_DETAILS'		=> '<strong>Edited imageset details</strong><br />» %s',
 	'LOG_IMAGESET_EDIT'				=> '<strong>Edited imageset</strong><br />» %s',
 	'LOG_IMAGESET_EXPORT'			=> '<strong>Exported imageset</strong><br />» %s',
+	'LOG_IMAGESET_LANG_MISSING'		=> '<strong>Imageset misses “%2$s” localisation</strong><br />» %1$s',
 	'LOG_IMAGESET_LANG_REFRESHED'	=> '<strong>Refreshed “%2$s” localisation of imageset</strong><br />» %1$s',
 	'LOG_IMAGESET_REFRESHED'		=> '<strong>Refreshed imageset</strong><br />» %s',

--- a/phpBB/language/en/search.php
+++ b/phpBB/language/en/search.php
@ -73,7 +73,7 @@ $lang = array_merge($lang, array(
 	'SEARCH_AUTHOR_EXPLAIN'		=> 'Use * as a wildcard for partial matches.',
 	'SEARCH_FIRST_POST'			=> 'First post of topics only',
 	'SEARCH_FORUMS'				=> 'Search in forums',
-	'SEARCH_FORUMS_EXPLAIN'		=> 'Select the forum or forums you wish to search in. For speed all subforums can be searched by selecting the parent and setting enable search subforums below.',
+	'SEARCH_FORUMS_EXPLAIN'		=> 'Select the forum or forums you wish to search in. Subforums are searched automatically if you do not disable “search subforums“ below.',
 	'SEARCH_IN_RESULTS'			=> 'Search these results',
 	'SEARCH_KEYWORDS_EXPLAIN'	=> 'Place <strong>+</strong> in front of a word which must be found and <strong>-</strong> in front of a word which must not be found. Put a list of words separated by <strong>|</strong> into brackets if only one of the words must be found. Use * as a wildcard for partial matches.',
 	'SEARCH_MSG_ONLY'			=> 'Message text only',
--- a/phpBB/search.php
+++ b/phpBB/search.php
@ -44,7 +44,7 @@ $sort_days		= request_var('st', 0);
 $sort_key		= request_var('sk', 't');
 $sort_dir		= request_var('sd', 'd');

-$return_chars	= request_var('ch', ($topic_id) ? -1 : 200);
+$return_chars	= request_var('ch', ($topic_id) ? -1 : 300);
 $search_forum	= request_var('fid', array(0));

 // Is user able to search? Has search been disabled?
@ -476,7 +476,7 @@ if ($keywords || $author || $author_id || $search_id || $submit)
 	$u_search .= ($u_search_forum) ? '&amp;fid%5B%5D=' . $u_search_forum : '';
 	$u_search .= (!$search_child) ? '&amp;sc=0' : '';
 	$u_search .= ($search_fields != 'all') ? '&amp;sf=' . $search_fields : '';
-	$u_search .= ($return_chars != 200) ? '&amp;ch=' . $return_chars : '';
+	$u_search .= ($return_chars != 300) ? '&amp;ch=' . $return_chars : '';

 	$template->assign_vars(array(
 		'SEARCH_TITLE'		=> $l_search_title,
@ -1009,7 +1009,7 @@ $s_characters .= '<option value="50">50</option>';

 for ($i = 100; $i <= 1000 ; $i += 100)
 {
-	$selected = ($i == 200) ? ' selected="selected"' : '';
+	$selected = ($i == 300) ? ' selected="selected"' : '';
 	$s_characters .= '<option value="' . $i . '"' . $selected . '>' . $i . '</option>';
 }