From a7b673a1b60ad477d19ae0607f4d46e804ea831c Mon Sep 17 00:00:00 2001 From: rxu Date: Sat, 31 Aug 2024 23:21:33 +0700 Subject: [PATCH 1/6] [ticket/17387] Fix PHP warnings in search results PHPBB-17387 --- phpBB/includes/functions_content.php | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/phpBB/includes/functions_content.php b/phpBB/includes/functions_content.php index 2e52863ef0..3dded3c6e9 100644 --- a/phpBB/includes/functions_content.php +++ b/phpBB/includes/functions_content.php @@ -400,16 +400,14 @@ function get_context(string $text, array $words, int $length = 400): string $fragment_end = $end - $start + 1; // Find the first valid alphanumeric character in the fragment to don't cut words - if ($start > 0) + if ($start > 0 && preg_match('/[^a-zA-Z0-9][a-zA-Z0-9]/u', $fragment, $matches, PREG_OFFSET_CAPTURE)) { - preg_match('/[^a-zA-Z0-9][a-zA-Z0-9]/u', $fragment, $matches, PREG_OFFSET_CAPTURE); $fragment_start = (int) $matches[0][1] + 1; // first valid alphanumeric character } // Find the last valid alphanumeric character in the fragment to don't cut words - if ($end < $text_length - 1) + if ($end < $text_length - 1 && preg_match_all('/[a-zA-Z0-9][^a-zA-Z0-9]/u', $fragment, $matches, PREG_OFFSET_CAPTURE)) { - preg_match_all('/[a-zA-Z0-9][^a-zA-Z0-9]/u', $fragment, $matches, PREG_OFFSET_CAPTURE); $fragment_end = end($matches[0])[1]; // last valid alphanumeric character } From 8acba2db024aa2a2c96037db65fbaab8f82aa1c8 Mon Sep 17 00:00:00 2001 From: rxu Date: Sun, 1 Sep 2024 22:31:22 +0700 Subject: [PATCH 2/6] [ticket/17387] Return entire post text for zero or negative length value According to the ACP explanation of the 'Default number of returned characters' setting, `A value of 0 will return the entire post`. 
Do the same for negative values too PHPBB-17387 --- phpBB/includes/functions_content.php | 4 ++-- tests/functions_content/get_context_test.php | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/phpBB/includes/functions_content.php b/phpBB/includes/functions_content.php index 3dded3c6e9..a2038d12a6 100644 --- a/phpBB/includes/functions_content.php +++ b/phpBB/includes/functions_content.php @@ -329,10 +329,10 @@ function get_context(string $text, array $words, int $length = 400): string { if ($length <= 0) { - return '...'; + return $text; } - // we need to turn the entities back into their original form, to not cut the message in between them + // We need to turn the entities back into their original form, to not cut the message in between them $text = html_entity_decode($text); // Replace all spaces/invisible characters with single spaces diff --git a/tests/functions_content/get_context_test.php b/tests/functions_content/get_context_test.php index 59f6cdf846..be8618dd8a 100644 --- a/tests/functions_content/get_context_test.php +++ b/tests/functions_content/get_context_test.php @@ -81,13 +81,13 @@ class phpbb_functions_content_get_context_test extends TestCase 'text' => 'This is a sample text.', 'words' => ['sample'], 'length' => 0, - 'expected' => '...', + 'expected' => 'This is a sample text.', ], 'negative length' => [ 'text' => 'This is a sample text.', 'words' => ['sample'], 'length' => -10, - 'expected' => '...', + 'expected' => 'This is a sample text.', ], 'ellipses_beginning' => [ 'text' => 'foo foo foo foo foo foo foo foo bar', From 66b6a5e1f396c5f6150fe8301ae8afaeb0255720 Mon Sep 17 00:00:00 2001 From: rxu Date: Mon, 2 Sep 2024 17:46:16 +0700 Subject: [PATCH 3/6] [ticket/17387] Make regex match unicode characters PHPBB-17387 --- phpBB/includes/functions_content.php | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/phpBB/includes/functions_content.php b/phpBB/includes/functions_content.php index a2038d12a6..d24f938238 
100644 --- a/phpBB/includes/functions_content.php +++ b/phpBB/includes/functions_content.php @@ -400,13 +400,13 @@ function get_context(string $text, array $words, int $length = 400): string $fragment_end = $end - $start + 1; // Find the first valid alphanumeric character in the fragment to don't cut words - if ($start > 0 && preg_match('/[^a-zA-Z0-9][a-zA-Z0-9]/u', $fragment, $matches, PREG_OFFSET_CAPTURE)) + if ($start > 0 && preg_match('/[^\p{L}\p{N}][\p{L}\p{N}]/ui', $fragment, $matches, PREG_OFFSET_CAPTURE)) { $fragment_start = (int) $matches[0][1] + 1; // first valid alphanumeric character } // Find the last valid alphanumeric character in the fragment to don't cut words - if ($end < $text_length - 1 && preg_match_all('/[a-zA-Z0-9][^a-zA-Z0-9]/u', $fragment, $matches, PREG_OFFSET_CAPTURE)) + if ($end < $text_length - 1 && preg_match_all('/[\p{L}\p{N}][^\p{L}\p{N}]/ui', $fragment, $matches, PREG_OFFSET_CAPTURE)) { $fragment_end = end($matches[0])[1]; // last valid alphanumeric character } From 472b36877c5c95bfdbf603ee3dd35f6c1be7085c Mon Sep 17 00:00:00 2001 From: Ruben Calvo Date: Mon, 2 Sep 2024 13:52:46 +0200 Subject: [PATCH 4/6] [ticket/17387] Add test for when words are separated by a non-space character PHPBB-17387 --- phpBB/includes/functions_content.php | 4 ++-- tests/functions_content/get_context_test.php | 6 ++++++ 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/phpBB/includes/functions_content.php b/phpBB/includes/functions_content.php index d24f938238..cdbc4aee09 100644 --- a/phpBB/includes/functions_content.php +++ b/phpBB/includes/functions_content.php @@ -333,7 +333,7 @@ function get_context(string $text, array $words, int $length = 400): string } // We need to turn the entities back into their original form, to not cut the message in between them - $text = html_entity_decode($text); + $text = htmlspecialchars_decode($text); // Replace all spaces/invisible characters with single spaces $text = preg_replace("/\s+/u", ' ', $text); @@ 
-414,7 +414,7 @@ function get_context(string $text, array $words, int $length = 400): string $output[] = utf8_substr($fragment, $fragment_start, $fragment_end - $fragment_start + 1); } - return ($fragments[0][0] !== 0 ? '... ' : '') . htmlentities(implode(' ... ', $output)) . ($end < $text_length - 1 ? ' ...' : ''); + return ($fragments[0][0] !== 0 ? '... ' : '') . htmlspecialchars(implode(' ... ', $output)) . ($end < $text_length - 1 ? ' ...' : ''); } /** diff --git a/tests/functions_content/get_context_test.php b/tests/functions_content/get_context_test.php index be8618dd8a..d4d087d675 100644 --- a/tests/functions_content/get_context_test.php +++ b/tests/functions_content/get_context_test.php @@ -112,6 +112,12 @@ class phpbb_functions_content_get_context_test extends TestCase 'words' => ['word1', 'word2'], 'length' => 10, 'expected' => 'word1 ... word2', + ], + 'fruits_spanish' => [ + 'text' => 'Manzana,plátano,naranja,fresa,mango,uva,piña,pera,kiwi,cereza,sandía,melón,papaya,arándano,durazno', + 'words' => ['piña'], + 'length' => 20, + 'expected' => '... 
uva,piña,pera, ...', ] ]; } From c26ded6025643f78cf39f2a530f186fe1b4923b8 Mon Sep 17 00:00:00 2001 From: rxu Date: Mon, 2 Sep 2024 22:26:58 +0700 Subject: [PATCH 5/6] [ticket/17387] Fix handling unicode strings PHPBB-17387 --- phpBB/includes/functions_content.php | 13 ++++++------- tests/functions_content/get_context_test.php | 2 +- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/phpBB/includes/functions_content.php b/phpBB/includes/functions_content.php index cdbc4aee09..9034f785ff 100644 --- a/phpBB/includes/functions_content.php +++ b/phpBB/includes/functions_content.php @@ -336,7 +336,7 @@ function get_context(string $text, array $words, int $length = 400): string $text = htmlspecialchars_decode($text); // Replace all spaces/invisible characters with single spaces - $text = preg_replace("/\s+/u", ' ', $text); + $text = preg_replace("/[\p{Z}\h\v]+/u", ' ', $text); $text_length = utf8_strlen($text); @@ -351,7 +351,6 @@ function get_context(string $text, array $words, int $length = 400): string $word_indexes[$pos] = $word; } } - if (!empty($word_indexes)) { ksort($word_indexes); @@ -400,21 +399,21 @@ function get_context(string $text, array $words, int $length = 400): string $fragment_end = $end - $start + 1; // Find the first valid alphanumeric character in the fragment to don't cut words - if ($start > 0 && preg_match('/[^\p{L}\p{N}][\p{L}\p{N}]/ui', $fragment, $matches, PREG_OFFSET_CAPTURE)) + if ($start > 0 && preg_match('/[^\p{L}\p{N}][\p{L}\p{N}]/u', $fragment, $matches, PREG_OFFSET_CAPTURE)) { - $fragment_start = (int) $matches[0][1] + 1; // first valid alphanumeric character + $fragment_start = mb_strlen(substr($fragment, 0, (int) $matches[0][1])) + 1; } // Find the last valid alphanumeric character in the fragment to don't cut words - if ($end < $text_length - 1 && preg_match_all('/[\p{L}\p{N}][^\p{L}\p{N}]/ui', $fragment, $matches, PREG_OFFSET_CAPTURE)) + if ($end < $text_length - 1 && preg_match_all('/[\p{L}\p{N}][^\p{L}\p{N}]/u', 
$fragment, $matches, PREG_OFFSET_CAPTURE)) { - $fragment_end = end($matches[0])[1]; // last valid alphanumeric character + $fragment_end = mb_strlen(substr($fragment, 0, end($matches[0])[1])); } $output[] = utf8_substr($fragment, $fragment_start, $fragment_end - $fragment_start + 1); } - return ($fragments[0][0] !== 0 ? '... ' : '') . htmlspecialchars(implode(' ... ', $output)) . ($end < $text_length - 1 ? ' ...' : ''); + return ($fragments[0][0] !== 0 ? '... ' : '') . utf8_htmlspecialchars(implode(' ... ', $output)) . ($end < $text_length - 1 ? ' ...' : ''); } /** diff --git a/tests/functions_content/get_context_test.php b/tests/functions_content/get_context_test.php index d4d087d675..93136d69fb 100644 --- a/tests/functions_content/get_context_test.php +++ b/tests/functions_content/get_context_test.php @@ -117,7 +117,7 @@ class phpbb_functions_content_get_context_test extends TestCase 'text' => 'Manzana,plátano,naranja,fresa,mango,uva,piña,pera,kiwi,cereza,sandía,melón,papaya,arándano,durazno', 'words' => ['piña'], 'length' => 20, - 'expected' => '... uva,piña,pera, ...', + 'expected' => '... 
uva,piña,pera ...', ] ]; } From 544e0900e6d8aa24a302f5d80a9325af589bdcf8 Mon Sep 17 00:00:00 2001 From: rxu Date: Tue, 3 Sep 2024 00:04:25 +0700 Subject: [PATCH 6/6] [ticket/17387] Add more unicode tests PHPBB-17387 --- phpBB/includes/functions_content.php | 4 +- tests/functions_content/get_context_test.php | 102 +++++++++++++++++++ 2 files changed, 104 insertions(+), 2 deletions(-) diff --git a/phpBB/includes/functions_content.php b/phpBB/includes/functions_content.php index 9034f785ff..d50a515c87 100644 --- a/phpBB/includes/functions_content.php +++ b/phpBB/includes/functions_content.php @@ -401,13 +401,13 @@ function get_context(string $text, array $words, int $length = 400): string // Find the first valid alphanumeric character in the fragment to don't cut words if ($start > 0 && preg_match('/[^\p{L}\p{N}][\p{L}\p{N}]/u', $fragment, $matches, PREG_OFFSET_CAPTURE)) { - $fragment_start = mb_strlen(substr($fragment, 0, (int) $matches[0][1])) + 1; + $fragment_start = utf8_strlen(substr($fragment, 0, (int) $matches[0][1])) + 1; } // Find the last valid alphanumeric character in the fragment to don't cut words if ($end < $text_length - 1 && preg_match_all('/[\p{L}\p{N}][^\p{L}\p{N}]/u', $fragment, $matches, PREG_OFFSET_CAPTURE)) { - $fragment_end = mb_strlen(substr($fragment, 0, end($matches[0])[1])); + $fragment_end = utf8_strlen(substr($fragment, 0, end($matches[0])[1])); } $output[] = utf8_substr($fragment, $fragment_start, $fragment_end - $fragment_start + 1); diff --git a/tests/functions_content/get_context_test.php b/tests/functions_content/get_context_test.php index 93136d69fb..f2865f31ee 100644 --- a/tests/functions_content/get_context_test.php +++ b/tests/functions_content/get_context_test.php @@ -113,6 +113,107 @@ class phpbb_functions_content_get_context_test extends TestCase 'length' => 10, 'expected' => 'word1 ... word2', ], + ]; + } + + /** + * Data provider for unicode get_context test cases. 
+ * + * @return array + */ + public function data_get_context_unicode(): array + { + return [ + 'text contains words and length greater than text' => [ + 'text' => 'Это пример текста, содержащего разнообразные слова, включая пример, текст и слова.', + 'words' => ['пример', 'слова'], + 'length' => 100, + 'expected' => 'Это пример текста, содержащего разнообразные слова, включая пример, текст и слова.', + ], + 'text contains words and length less than text' => [ + 'text' => 'Это пример текста, содержащего разнообразные слова, включая шаблон, текст и слова.', + 'words' => ['пример', 'слова'], + 'length' => 50, + 'expected' => 'Это пример текста, содержащего разнообразные слова ...', + ], + 'text does not contain words' => [ + 'text' => 'Это пример текста, содержащего разнообразные слова, но ни одно из них не совпадает с искомыми.', + 'words' => ['nonexistent'], + 'length' => 50, + 'expected' => 'Это пример текста, содержащего разнообразные слова ...', + ], + 'desired length equal to text length' => [ + 'text' => 'Текст точной длины.', + 'words' => ['Текст', 'точной'], + 'length' => 19, + 'expected' => 'Текст точной длины.', + ], + 'text with html entities' => [ + 'text' => 'Это пример текста, содержащего & и < и > лексемы.', + 'words' => ['пример', 'содержащего'], + 'length' => 40, + 'expected' => 'Это пример текста, содержащего & и < и ...', + ], + 'text with html entities and contains last word' => [ + 'text' => 'Это пример текста, содержащего & и < и > лексемы.', + 'words' => ['пример', 'лексемы'], + 'length' => 40, + 'expected' => 'Это пример текста ... 
и < и > лексемы.', + ], + 'text with multiple spaces and special characters' => [ + 'text' => 'Это пример текста, содержащего разнообразные слова.', + 'words' => ['пример', 'разнообразные'], + 'length' => 50, + 'expected' => 'Это пример текста, содержащего разнообразные слова.', + ], + 'empty text' => [ + 'text' => '', + 'words' => ['пример', 'слова'], + 'length' => 50, + 'expected' => '', + ], + 'empty words array' => [ + 'text' => 'Это пример текста, содержащего разнообразные слова.', + 'words' => [], + 'length' => 50, + 'expected' => 'Это пример текста, содержащего разнообразные слова.', + ], + 'zero length' => [ + 'text' => 'Это пример текста.', + 'words' => ['пример'], + 'length' => 0, + 'expected' => 'Это пример текста.', + ], + 'negative length' => [ + 'text' => 'Это пример текста.', + 'words' => ['sample'], + 'length' => -10, + 'expected' => 'Это пример текста.', + ], + 'ellipses_beginning' => [ + 'text' => 'раз раз раз раз раз раз раз раз два', + 'words' => ['два'], + 'length' => 10, + 'expected' => '... раз раз два', + ], + 'ellipsis_end' => [ + 'text' => 'два раз раз раз раз раз раз раз раз', + 'words' => ['два'], + 'length' => 10, + 'expected' => 'два раз раз ...', + ], + 'ellipsis_middle' => [ + 'text' => 'раз слово1 раз раз раз раз раз раз раз раз раз слово2 раз', + 'words' => ['слово1', 'слово2'], + 'length' => 15, + 'expected' => '... слово1 ... слово2 ...', + ], + 'ellipsis_middle2' => [ + 'text' => 'слово1 foo foo foo foo foo foo foo foo foo слово2', + 'words' => ['слово1', 'слово2'], + 'length' => 10, + 'expected' => 'слово1 ... слово2', + ], 'fruits_spanish' => [ 'text' => 'Manzana,plátano,naranja,fresa,mango,uva,piña,pera,kiwi,cereza,sandía,melón,papaya,arándano,durazno', 'words' => ['piña'], @@ -124,6 +225,7 @@ class phpbb_functions_content_get_context_test extends TestCase /** * @dataProvider data_get_context + * @dataProvider data_get_context_unicode */ public function test_get_context($text, $words, $length, $expected) {