Merge pull request #6712 from rxu/ticket/17387

[ticket/17387] Fix PHP warnings in search results
2025-07-23 10:28:55 +00:00 · 2024-09-09 21:04:04 +02:00 · 2024-09-09 21:04:04 +02:00 · 9eb30b5d8e
commit 9eb30b5d8e
parent bf4e26a0b8 544e0900e6
2 changed files with 119 additions and 14 deletions
--- a/phpBB/includes/functions_content.php
+++ b/phpBB/includes/functions_content.php
@@ -329,14 +329,14 @@ function get_context(string $text, array $words, int $length = 400): string
 {
 	if ($length <= 0)
 	{
-		return '...';
+		return $text;
 	}

-	// we need to turn the entities back into their original form, to not cut the message in between them
-	$text = html_entity_decode($text);
+	// We need to turn the entities back into their original form, to not cut the message in between them
+	$text = htmlspecialchars_decode($text);

 	// Replace all spaces/invisible characters with single spaces
-	$text = preg_replace("/\s+/u", ' ', $text);
+	$text = preg_replace("/[\p{Z}\h\v]+/u", ' ', $text);

 	$text_length = utf8_strlen($text);

@@ -351,7 +351,6 @@ function get_context(string $text, array $words, int $length = 400): string
 			$word_indexes[$pos] = $word;
 		}
 	}
-
 	if (!empty($word_indexes))
 	{
 		ksort($word_indexes);
@@ -400,23 +399,21 @@ function get_context(string $text, array $words, int $length = 400): string
 		$fragment_end = $end - $start + 1;

 		// Find the first valid alphanumeric character in the fragment to don't cut words
-		if ($start > 0)
+		if ($start > 0 && preg_match('/[^\p{L}\p{N}][\p{L}\p{N}]/u', $fragment, $matches, PREG_OFFSET_CAPTURE))
 		{
-			preg_match('/[^a-zA-Z0-9][a-zA-Z0-9]/u', $fragment, $matches, PREG_OFFSET_CAPTURE);
-			$fragment_start = (int) $matches[0][1] + 1; // first valid alphanumeric character
+			$fragment_start = utf8_strlen(substr($fragment, 0, (int) $matches[0][1])) + 1;
 		}

 		// Find the last valid alphanumeric character in the fragment to don't cut words
-		if ($end < $text_length - 1)
+		if ($end < $text_length - 1 && preg_match_all('/[\p{L}\p{N}][^\p{L}\p{N}]/u', $fragment, $matches, PREG_OFFSET_CAPTURE))
 		{
-			preg_match_all('/[a-zA-Z0-9][^a-zA-Z0-9]/u', $fragment, $matches, PREG_OFFSET_CAPTURE);
-			$fragment_end = end($matches[0])[1]; // last valid alphanumeric character
+			$fragment_end = utf8_strlen(substr($fragment, 0, end($matches[0])[1]));
 		}

 		$output[] = utf8_substr($fragment, $fragment_start, $fragment_end - $fragment_start + 1);
 	}

-	return ($fragments[0][0] !== 0 ? '... ' : '') . htmlentities(implode(' ... ', $output)) . ($end < $text_length - 1 ? ' ...' : '');
+	return ($fragments[0][0] !== 0 ? '... ' : '') . utf8_htmlspecialchars(implode(' ... ', $output)) . ($end < $text_length - 1 ? ' ...' : '');
 }

 /**
--- a/tests/functions_content/get_context_test.php
+++ b/tests/functions_content/get_context_test.php
@@ -81,13 +81,13 @@ class phpbb_functions_content_get_context_test extends TestCase
 				'text' => 'This is a sample text.',
 				'words' => ['sample'],
 				'length' => 0,
-				'expected' => '...',
+				'expected' => 'This is a sample text.',
 			],
 			'negative length' => [
 				'text' => 'This is a sample text.',
 				'words' => ['sample'],
 				'length' => -10,
-				'expected' => '...',
+				'expected' => 'This is a sample text.',
 			],
 			'ellipses_beginning' => [
 				'text' => 'foo foo foo foo foo foo foo foo bar',
@@ -112,12 +112,120 @@ class phpbb_functions_content_get_context_test extends TestCase
 				'words' => ['word1', 'word2'],
 				'length' => 10,
 				'expected' => 'word1 ... word2',
+			],
+		];
+	}
+
+	/**
+	 * Data provider for unicode get_context test cases.
+	 *
+	 * @return array
+	 */
+	public function data_get_context_unicode(): array
+	{
+		return [
+			'text contains words and length greater than text' => [
+				'text' => 'Это пример текста, содержащего разнообразные слова, включая пример, текст и слова.',
+				'words' => ['пример', 'слова'],
+				'length' => 100,
+				'expected' => 'Это пример текста, содержащего разнообразные слова, включая пример, текст и слова.',
+			],
+			'text contains words and length less than text' => [
+				'text' => 'Это пример текста, содержащего разнообразные слова, включая шаблон, текст и слова.',
+				'words' => ['пример', 'слова'],
+				'length' => 50,
+				'expected' => 'Это пример текста, содержащего разнообразные слова ...',
+			],
+			'text does not contain words' => [
+				'text' => 'Это пример текста, содержащего разнообразные слова, но ни одно из них не совпадает с искомыми.',
+				'words' => ['nonexistent'],
+				'length' => 50,
+				'expected' => 'Это пример текста, содержащего разнообразные слова ...',
+			],
+			'desired length equal to text length' => [
+				'text' => 'Текст точной длины.',
+				'words' => ['Текст', 'точной'],
+				'length' => 19,
+				'expected' => 'Текст точной длины.',
+			],
+			'text with html entities' => [
+				'text' => 'Это пример текста, содержащего &amp; и &lt; и &gt; лексемы.',
+				'words' => ['пример', 'содержащего'],
+				'length' => 40,
+				'expected' => 'Это пример текста, содержащего &amp; и &lt; и ...',
+			],
+			'text with html entities and contains last word' => [
+				'text' => 'Это пример текста, содержащего &amp; и &lt; и &gt; лексемы.',
+				'words' => ['пример', 'лексемы'],
+				'length' => 40,
+				'expected' => 'Это пример текста ... и &lt; и &gt; лексемы.',
+			],
+			'text with multiple spaces and special characters' => [
+				'text' => 'Это    пример   текста, содержащего    разнообразные   слова.',
+				'words' => ['пример', 'разнообразные'],
+				'length' => 50,
+				'expected' => 'Это пример текста, содержащего разнообразные слова.',
+			],
+			'empty text' => [
+				'text' => '',
+				'words' => ['пример', 'слова'],
+				'length' => 50,
+				'expected' => '',
+			],
+			'empty words array' => [
+				'text' => 'Это пример текста, содержащего разнообразные слова.',
+				'words' => [],
+				'length' => 50,
+				'expected' => 'Это пример текста, содержащего разнообразные слова.',
+			],
+			'zero length' => [
+				'text' => 'Это пример текста.',
+				'words' => ['пример'],
+				'length' => 0,
+				'expected' => 'Это пример текста.',
+			],
+			'negative length' => [
+				'text' => 'Это пример текста.',
+				'words' => ['sample'],
+				'length' => -10,
+				'expected' => 'Это пример текста.',
+			],
+			'ellipses_beginning' => [
+				'text' => 'раз раз раз раз раз раз раз раз два',
+				'words' => ['два'],
+				'length' => 10,
+				'expected' => '... раз раз два',
+			],
+			'ellipsis_end' => [
+				'text' => 'два раз раз раз раз раз раз раз раз',
+				'words' => ['два'],
+				'length' => 10,
+				'expected' => 'два раз раз ...',
+			],
+			'ellipsis_middle' => [
+				'text' => 'раз слово1 раз раз раз раз раз раз раз раз раз слово2 раз',
+				'words' => ['слово1', 'слово2'],
+				'length' => 15,
+				'expected' => '... слово1 ... слово2 ...',
+			],
+			'ellipsis_middle2' => [
+				'text' => 'слово1 foo foo foo foo foo foo foo foo foo слово2',
+				'words' => ['слово1', 'слово2'],
+				'length' => 10,
+				'expected' => 'слово1 ... слово2',
+			],
+			'fruits_spanish' => [
+				'text' => 'Manzana,plátano,naranja,fresa,mango,uva,piña,pera,kiwi,cereza,sandía,melón,papaya,arándano,durazno',
+				'words' => ['piña'],
+				'length' => 20,
+				'expected' => '... uva,piña,pera ...',
 			]
 		];
 	}

 	/**
 	 * @dataProvider data_get_context
+	 * @dataProvider data_get_context_unicode
 	 */
 	public function test_get_context($text, $words, $length, $expected)
 	{