From dd4c98279212e8f0a818387851c63c841ac8b7d7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rub=C3=A9n=20Calvo?= Date: Wed, 26 Sep 2018 13:02:36 +0200 Subject: [PATCH 01/12] [ticket/15043] Rewrite get_context() PHPBB3-15043 --- phpBB/includes/functions_content.php | 143 ++++++++++----------------- phpBB/includes/utf/utf_tools.php | 16 +++ 2 files changed, 69 insertions(+), 90 deletions(-) diff --git a/phpBB/includes/functions_content.php b/phpBB/includes/functions_content.php index 40e1b64239..33bc8daa1f 100644 --- a/phpBB/includes/functions_content.php +++ b/phpBB/includes/functions_content.php @@ -327,117 +327,80 @@ function bump_topic_allowed($forum_id, $topic_bumped, $last_post_time, $topic_po */ function get_context($text, $words, $length = 400) { - // first replace all whitespaces with single spaces - $text = preg_replace('/ +/', ' ', strtr($text, "\t\n\r\x0C ", ' ')); + $output = ''; + $text_length = utf8_strlen($text); + + // Replace all spaces/invisible characters with single spaces + $text = preg_replace("/[[:^print:] ]+/", ' ', $text); // we need to turn the entities back into their original form, to not cut the message in between them - $entities = array('<', '>', '[', ']', '.', ':', ':'); - $characters = array('<', '>', '[', ']', '.', ':', ':'); - $text = str_replace($entities, $characters, $text); + $text = html_entity_decode($text); - $word_indizes = array(); - if (count($words)) + // Get first ocurrence of each word + $word_indizes = []; + foreach ($words as $word) { - $match = ''; - // find the starting indizes of all words - foreach ($words as $word) - { - if ($word) - { - if (preg_match('#(?:[^\w]|^)(' . $word . ')(?:[^\w]|$)#i', $text, $match)) - { - if (empty($match[1])) - { - continue; - } + $pos = utf8_stripos($text, $word); - $pos = utf8_strpos($text, $match[1]); - if ($pos !== false) - { - $word_indizes[] = $pos; - } - } - } + if ($pos !== false) + { + $word_indizes[$pos] = $word; } - unset($match); + } - if (count($word_indizes)) - { - $word_indizes = array_unique($word_indizes); - sort($word_indizes); + // If there are coincidences + if (!empty($word_indizes)) + { + ksort($word_indizes); $wordnum = count($word_indizes); - // number of characters on the right and left side of each word - $sequence_length = (int) ($length / (2 * $wordnum)) - 2; - $final_text = ''; - $word = $j = 0; - $final_text_index = -1; + // Size of the fragment of text per word + $characters_per_word = (int) ($length / $wordnum); - // cycle through every character in the original text - for ($i = $word_indizes[$word], $n = utf8_strlen($text); $i < $n; $i++) + // Get text fragments + $fragments = []; + $start = $end = 0; + foreach ($word_indizes as $indize => $word) { - // if the current position is the start of one of the words then append $sequence_length characters to the final text - if (isset($word_indizes[$word]) && ($i == $word_indizes[$word])) + if ($end+$characters_per_word+utf8_strlen($word) < $indize) { - if ($final_text_index < $i - $sequence_length - 1) + $fragment = utf8_substr($text, $start, $end-$start); + if ($start != 0) { - $final_text .= '... ' . preg_replace('#^([^ ]*)#', '', utf8_substr($text, $i - $sequence_length, $sequence_length)); + $fragment = '... ' . $fragment; } - else - { - // if the final text is already nearer to the current word than $sequence_length we only append the text - // from its current index on and distribute the unused length to all other sequenes - $sequence_length += (int) (($final_text_index - $i + $sequence_length + 1) / (2 * $wordnum)); - $final_text .= utf8_substr($text, $final_text_index + 1, $i - $final_text_index - 1); - } - $final_text_index = $i - 1; - // add the following characters to the final text (see below) - $word++; - $j = 1; + $fragments[] = $fragment; + + $start = $indize-($characters_per_word/2); + // Start fragment at the begining of a word + $end = $start = ($start > 0) ? (utf8_strpos($text, ' ', $start-1)+1) : 0; } - if ($j > 0) - { - // add the character to the final text and increment the sequence counter - $final_text .= utf8_substr($text, $i, 1); - $final_text_index++; - $j++; + $end += $characters_per_word; - // if this is a whitespace then check whether we are done with this sequence - if (utf8_substr($text, $i, 1) == ' ') - { - // only check whether we have to exit the context generation completely if we haven't already reached the end anyway - if ($i + 4 < $n) - { - if (($j > $sequence_length && $word >= $wordnum) || utf8_strlen($final_text) > $length) - { - $final_text .= ' ...'; - break; - } - } - else - { - // make sure the text really reaches the end - $j -= 4; - } - - // stop context generation and wait for the next word - if ($j > $sequence_length) - { - $j = 0; - } - } - } + // End fragment at the end of a word + $substring = utf8_substr($text, $start, $end-$start); + $end = $start+utf8_strrpos($substring, ' '); } - return str_replace($characters, $entities, $final_text); - } + + $fragment = utf8_substr($text, $start, $end-$start); + if ($start != 0) + { + $fragment = '... ' . $fragment; + } + if ($end < $text_length) + { + $fragment .= ' ...'; + } + + // Get the last fragment + $fragments[] = $fragment; + + $output = htmlentities(implode($fragments, '')); } - if (!count($words) || !count($word_indizes)) - { - return str_replace($characters, $entities, ((utf8_strlen($text) >= $length + 3) ? utf8_substr($text, 0, $length) . '...' : $text)); - } + return $output; } /** diff --git a/phpBB/includes/utf/utf_tools.php b/phpBB/includes/utf/utf_tools.php index 5a1ca5b6fe..4ed9c72bb1 100644 --- a/phpBB/includes/utf/utf_tools.php +++ b/phpBB/includes/utf/utf_tools.php @@ -72,6 +72,22 @@ function utf8_strpos($str, $needle, $offset = null) } } +/** +* UTF-8 aware alternative to stripos +* @ignore +*/ +function utf8_stripos($str, $needle, $offset = null) +{ + if (is_null($offset)) + { + return mb_stripos($str, $needle); + } + else + { + return mb_stripos($str, $needle, $offset); + } +} + /** * UTF-8 aware alternative to strtolower * @ignore From 7365764476a130e7af0300249d21ca5f46aa94ed Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rub=C3=A9n=20Calvo?= Date: Wed, 26 Sep 2018 14:17:53 +0200 Subject: [PATCH 02/12] [ticket/15043] New get_context displays more text PHPBB3-15043 --- tests/functional/search/base.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/functional/search/base.php b/tests/functional/search/base.php index 033c31c396..0e3aa80fa3 100644 --- a/tests/functional/search/base.php +++ b/tests/functional/search/base.php @@ -146,7 +146,7 @@ abstract class phpbb_functional_search_base extends phpbb_functional_test_case foreach (['', 'a', 't', 'f', 'i', 's'] as $sort_key) { - $this->assert_search_found('phpbb3+installation', 1, 3, $sort_key); + $this->assert_search_found('phpbb3+installation', 1, 4, $sort_key); $this->assert_search_found('foosubject+barsearch', 1, 2, $sort_key); $this->assert_search_found('barsearch-testing', 1, 2, $sort_key); // test hyphen ignored $this->assert_search_found('barsearch+-+testing', 1, 2, $sort_key); // test hyphen wrapped with space ignored From 350c9213ee09d79269cd3f80e023faa206f238ac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rub=C3=A9n=20Calvo?= Date: Wed, 26 Sep 2018 15:15:07 +0200 Subject: [PATCH 03/12] [ticket/15043] Update regexp PHPBB3-15043 --- phpBB/includes/functions_content.php | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/phpBB/includes/functions_content.php b/phpBB/includes/functions_content.php index 33bc8daa1f..7731d63a48 100644 --- a/phpBB/includes/functions_content.php +++ b/phpBB/includes/functions_content.php @@ -331,7 +331,7 @@ function get_context($text, $words, $length = 400) $text_length = utf8_strlen($text); // Replace all spaces/invisible characters with single spaces - $text = preg_replace("/[[:^print:] ]+/", ' ', $text); + $text = preg_replace("/\s+/", ' ', $text); // we need to turn the entities back into their original form, to not cut the message in between them $text = html_entity_decode($text); @@ -362,9 +362,11 @@ function get_context($text, $words, $length = 400) $start = $end = 0; foreach ($word_indizes as $indize => $word) { - if ($end+$characters_per_word+utf8_strlen($word) < $indize) + // Check if the next word can be inside the current fragment of text + if ($end + $characters_per_word + utf8_strlen($word) < $indize) { $fragment = utf8_substr($text, $start, $end-$start); + if ($start != 0) { $fragment = '... ' . $fragment; @@ -372,16 +374,16 @@ function get_context($text, $words, $length = 400) $fragments[] = $fragment; - $start = $indize-($characters_per_word/2); + $start = $indize - ($characters_per_word / 2); // Start fragment at the begining of a word - $end = $start = ($start > 0) ? (utf8_strpos($text, ' ', $start-1)+1) : 0; + $end = $start = ($start > 0) ? (utf8_strpos($text, ' ', $start - 1) + 1) : 0; } $end += $characters_per_word; // End fragment at the end of a word $substring = utf8_substr($text, $start, $end-$start); - $end = $start+utf8_strrpos($substring, ' '); + $end = $start + utf8_strrpos($substring, ' '); } $fragment = utf8_substr($text, $start, $end-$start); From a2e720515444c07f37e259f4aca39cd74919455e Mon Sep 17 00:00:00 2001 From: Ruben Calvo Date: Tue, 9 Jul 2024 00:24:47 +0200 Subject: [PATCH 04/12] [ticket/15403] Fix order of parameters in implode PHPBB-15403 --- phpBB/includes/functions_content.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/phpBB/includes/functions_content.php b/phpBB/includes/functions_content.php index 7731d63a48..5b1d6244b3 100644 --- a/phpBB/includes/functions_content.php +++ b/phpBB/includes/functions_content.php @@ -399,7 +399,7 @@ function get_context($text, $words, $length = 400) // Get the last fragment $fragments[] = $fragment; - $output = htmlentities(implode($fragments, '')); + $output = htmlentities(implode('', $fragments)); } return $output; From 776396962514770a3740ea847182ed0bd614d85c Mon Sep 17 00:00:00 2001 From: Ruben Calvo Date: Tue, 9 Jul 2024 17:09:06 +0200 Subject: [PATCH 05/12] [ticket/15403] Return first fragment of text if there is no coincidences PHPBB-15403 --- phpBB/includes/functions_content.php | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/phpBB/includes/functions_content.php b/phpBB/includes/functions_content.php index 5b1d6244b3..e536c5ad24 100644 --- a/phpBB/includes/functions_content.php +++ b/phpBB/includes/functions_content.php @@ -399,10 +399,10 @@ function get_context($text, $words, $length = 400) // Get the last fragment $fragments[] = $fragment; - $output = htmlentities(implode('', $fragments)); + return htmlentities(implode('', $fragments)); } - return $output; + return htmlentities($text_length >= $length + 3 ? utf8_substr($text, 0, $length) . ' ...' : $text); } /** From a62a303318773c66b9880a9e1467630a245bb24e Mon Sep 17 00:00:00 2001 From: Marc Alexander Date: Tue, 9 Jul 2024 21:20:09 +0200 Subject: [PATCH 06/12] [ticket/15043] Add unit test for get_context PHPBB-15043 --- tests/functions_content/get_context_test.php | 102 +++++++++++++++++++ 1 file changed, 102 insertions(+) create mode 100644 tests/functions_content/get_context_test.php diff --git a/tests/functions_content/get_context_test.php b/tests/functions_content/get_context_test.php new file mode 100644 index 0000000000..b5d5b2ef73 --- /dev/null +++ b/tests/functions_content/get_context_test.php @@ -0,0 +1,102 @@ + + * @license GNU General Public License, version 2 (GPL-2.0) + * + * For full copyright and license information, please see + * the docs/CREDITS.txt file. + * + */ + +use PHPUnit\Framework\TestCase; + +class phpbb_functions_content_get_context_test extends TestCase +{ + /** + * Data provider for get_context test cases. + * + * @return array + */ + public function data_get_context(): array + { + return [ + 'text contains words and length greater than text' => [ + 'text' => 'This is a sample text containing several words, including sample, text, and words.', + 'words' => ['sample', 'words'], + 'length' => 100, + 'expected' => 'This is a sample text containing several words, including sample, text, and words.', + ], + 'text contains words and length less than text' => [ + 'text' => 'This is a sample text containing several words, including sample, text, and words.', + 'words' => ['sample', 'words'], + 'length' => 50, + 'expected' => 'This is a sample text containing several words ...', + ], + 'text does not contain words' => [ + 'text' => 'This is a sample text containing several words, but none of them match the given words.', + 'words' => ['nonexistent'], + 'length' => 50, + 'expected' => 'This is a sample text containing several words, bu ...', + ], + 'desired length equal to text length' => [ + 'text' => 'Exact length text.', + 'words' => ['Exact', 'text'], + 'length' => 18, + 'expected' => 'Exact length text.', + ], + 'text with html entities' => [ + 'text' => 'This is a sample text containing & and < and > entities.', + 'words' => ['sample', 'containing'], + 'length' => 50, + 'expected' => 'This is a sample text containing & and < and ...', + ], + 'text with html entities and contains last word' => [ + 'text' => 'This is a sample text containing & and < and > entities.', + 'words' => ['sample', 'entities'], + 'length' => 50, + 'expected' => 'This is a sample text containing & and < and ...', + ], + 'text with multiple spaces and special characters' => [ + 'text' => 'This is a sample text containing several words.', + 'words' => ['sample', 'several'], + 'length' => 50, + 'expected' => 'This is a sample text containing several words ...', + ], + 'empty text' => [ + 'text' => '', + 'words' => ['sample', 'words'], + 'length' => 50, + 'expected' => '', + ], + 'empty words array' => [ + 'text' => 'This is a sample text containing several words.', + 'words' => [], + 'length' => 50, + 'expected' => 'This is a sample text containing several words.', + ], + 'zero length' => [ + 'text' => 'This is a sample text.', + 'words' => ['sample'], + 'length' => 0, + 'expected' => '...', + ], + 'negative length' => [ + 'text' => 'This is a sample text.', + 'words' => ['sample'], + 'length' => -10, + 'expected' => '...', + ], + ]; + } + + /** + * @dataProvider data_get_context + */ + public function test_get_context($text, $words, $length, $expected) + { + $this->assertEquals($expected, get_context($text, $words, $length)); + } +} From 89afa0cb5eaf2684ffa79c9d6e1415eab41063f9 Mon Sep 17 00:00:00 2001 From: Marc Alexander Date: Tue, 9 Jul 2024 21:46:37 +0200 Subject: [PATCH 07/12] [ticket/15043] Small code cleanup PHPBB-15043 --- phpBB/includes/functions_content.php | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/phpBB/includes/functions_content.php b/phpBB/includes/functions_content.php index e536c5ad24..94de0b7849 100644 --- a/phpBB/includes/functions_content.php +++ b/phpBB/includes/functions_content.php @@ -327,7 +327,6 @@ function bump_topic_allowed($forum_id, $topic_bumped, $last_post_time, $topic_po */ function get_context($text, $words, $length = 400) { - $output = ''; $text_length = utf8_strlen($text); // Replace all spaces/invisible characters with single spaces @@ -336,7 +335,7 @@ function get_context($text, $words, $length = 400) // we need to turn the entities back into their original form, to not cut the message in between them $text = html_entity_decode($text); - // Get first ocurrence of each word + // Get first occurrence of each word $word_indizes = []; foreach ($words as $word) { @@ -375,14 +374,14 @@ function get_context($text, $words, $length = 400) $fragments[] = $fragment; $start = $indize - ($characters_per_word / 2); - // Start fragment at the begining of a word + // Start fragment at the beginning of a word $end = $start = ($start > 0) ? (utf8_strpos($text, ' ', $start - 1) + 1) : 0; } $end += $characters_per_word; // End fragment at the end of a word - $substring = utf8_substr($text, $start, $end-$start); + $substring = utf8_substr($text, $start, $end - $start); $end = $start + utf8_strrpos($substring, ' '); } From f41c8eef473d6d970d9f02ccfec32c72d0c82186 Mon Sep 17 00:00:00 2001 From: Ruben Calvo Date: Sat, 13 Jul 2024 15:34:37 +0200 Subject: [PATCH 08/12] [ticket/15403] Refactor get_context PHPBB-15403 --- phpBB/includes/functions_content.php | 122 +++++++++++-------- tests/functions_content/get_context_test.php | 6 +- 2 files changed, 73 insertions(+), 55 deletions(-) diff --git a/phpBB/includes/functions_content.php b/phpBB/includes/functions_content.php index 94de0b7849..148421880f 100644 --- a/phpBB/includes/functions_content.php +++ b/phpBB/includes/functions_content.php @@ -325,83 +325,101 @@ function bump_topic_allowed($forum_id, $topic_bumped, $last_post_time, $topic_po * * @return string Context of the specified words separated by "..." */ -function get_context($text, $words, $length = 400) +function get_context(string $text, array $words, int $length = 400): string { - $text_length = utf8_strlen($text); - - // Replace all spaces/invisible characters with single spaces - $text = preg_replace("/\s+/", ' ', $text); + if ($length <= 0) + { + return '...'; + } // we need to turn the entities back into their original form, to not cut the message in between them $text = html_entity_decode($text); + // Replace all spaces/invisible characters with single spaces + $text = preg_replace("/\s+/u", ' ', $text); + + $text_length = utf8_strlen($text); + // Get first occurrence of each word - $word_indizes = []; + $word_indexes = []; foreach ($words as $word) { $pos = utf8_stripos($text, $word); if ($pos !== false) { - $word_indizes[$pos] = $word; + $word_indexes[$pos] = $word; } } - // If there are coincidences - if (!empty($word_indizes)) + if (!empty($word_indexes)) { - ksort($word_indizes); + ksort($word_indexes); - $wordnum = count($word_indizes); - // Size of the fragment of text per word - $characters_per_word = (int) ($length / $wordnum); + // Size of the fragment of text per word + $num_indexes = count($word_indexes); + $characters_per_word = (int) ($length / $num_indexes) + 2; // 2 to leave one character of margin at the sides to don't cut words - // Get text fragments - $fragments = []; - $start = $end = 0; - foreach ($word_indizes as $indize => $word) + // Get text fragment indexes + $fragments = []; + foreach ($word_indexes as $index => $word) + { + $word_length = utf8_strlen($word); + $start = max(0, min($text_length - 1 - $characters_per_word, (int) ($index + ($word_length / 2) - ($characters_per_word / 2)))); + $end = $start + $characters_per_word; + + // Check if we can merge this fragment into the previous fragment + $last_element = array_pop($fragments); + if ($last_element !== null) { - // Check if the next word can be inside the current fragment of text - if ($end + $characters_per_word + utf8_strlen($word) < $indize) + [$prev_start, $prev_end] = $last_element; + + if ($prev_end + $characters_per_word >= $index + $word_length) { - $fragment = utf8_substr($text, $start, $end-$start); - - if ($start != 0) - { - $fragment = '... ' . $fragment; - } - - $fragments[] = $fragment; - - $start = $indize - ($characters_per_word / 2); - // Start fragment at the beginning of a word - $end = $start = ($start > 0) ? (utf8_strpos($text, ' ', $start - 1) + 1) : 0; + $start = $prev_start; + $end = $prev_end + $characters_per_word; + } + else + { + $fragments[] = $last_element; } - - $end += $characters_per_word; - - // End fragment at the end of a word - $substring = utf8_substr($text, $start, $end - $start); - $end = $start + utf8_strrpos($substring, ' '); } - $fragment = utf8_substr($text, $start, $end-$start); - if ($start != 0) - { - $fragment = '... ' . $fragment; - } - if ($end < $text_length) - { - $fragment .= ' ...'; - } - - // Get the last fragment - $fragments[] = $fragment; - - return htmlentities(implode('', $fragments)); + $fragments[] = [$start, $end]; + } + } + else + { + // There is no coincidences, so we just create a fragment with the first $length characters + $fragments[] = [0, $length]; + $end = $length; } - return htmlentities($text_length >= $length + 3 ? utf8_substr($text, 0, $length) . ' ...' : $text); + $output = []; + foreach ($fragments as [$start, $end]) + { + $fragment = utf8_substr($text, $start, $end - $start + 1); + + $offset = $start; + + // Find the first valid alphanumeric character in the fragment to don't cut words + if ($start > 0) + { + preg_match('/[^a-zA-Z0-9][a-zA-Z0-9]/u', $fragment, $matches, PREG_OFFSET_CAPTURE); + $start = $offset + (int) $matches[0][1] + 1; // first valid alphanumeric character + } + + // Find the last valid alphanumeric character in the fragment to don't cut words + if ($end < $text_length - 1) + { + preg_match_all('/[a-zA-Z0-9][^a-zA-Z0-9]/u', $fragment, $matches, PREG_OFFSET_CAPTURE); + $end = $offset + end($matches[0])[1]; // last valid alphanumeric character + } + + $output[] = utf8_substr($text, $start, $end - $start + 1); + } + + return htmlentities(implode(' ... ', $output)) . ($end < $text_length - 1 ? ' ...' : ''); } /** diff --git a/tests/functions_content/get_context_test.php b/tests/functions_content/get_context_test.php index b5d5b2ef73..4da1be8ed1 100644 --- a/tests/functions_content/get_context_test.php +++ b/tests/functions_content/get_context_test.php @@ -39,7 +39,7 @@ class phpbb_functions_content_get_context_test extends TestCase 'text' => 'This is a sample text containing several words, but none of them match the given words.', 'words' => ['nonexistent'], 'length' => 50, - 'expected' => 'This is a sample text containing several words, bu ...', + 'expected' => 'This is a sample text containing several words ...', ], 'desired length equal to text length' => [ 'text' => 'Exact length text.', @@ -57,13 +57,13 @@ class phpbb_functions_content_get_context_test extends TestCase 'text' => 'This is a sample text containing & and < and > entities.', 'words' => ['sample', 'entities'], 'length' => 50, - 'expected' => 'This is a sample text containing & and < and ...', + 'expected' => 'This is a sample text ... and < and > entities.', ], 'text with multiple spaces and special characters' => [ 'text' => 'This is a sample text containing several words.', 'words' => ['sample', 'several'], 'length' => 50, - 'expected' => 'This is a sample text containing several words ...', + 'expected' => 'This is a sample text containing several words.', ], 'empty text' => [ 'text' => '', From dd9267b6785424d3a42e16e74164c15270014cc8 Mon Sep 17 00:00:00 2001 From: Ruben Calvo Date: Sun, 14 Jul 2024 01:10:25 +0200 Subject: [PATCH 09/12] [ticket/15403] Use substr over the fragment and not the whole text PHPBB-15403 --- phpBB/includes/functions_content.php | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/phpBB/includes/functions_content.php b/phpBB/includes/functions_content.php index 148421880f..0ef52b1b81 100644 --- a/phpBB/includes/functions_content.php +++ b/phpBB/includes/functions_content.php @@ -400,23 +400,24 @@ function get_context(string $text, array $words, int $length = 400): string { $fragment = utf8_substr($text, $start, $end - $start + 1); - $offset = $start; + $fragment_start = 0; + $fragment_end = $end - $start + 1; // Find the first valid alphanumeric character in the fragment to don't cut words if ($start > 0) { preg_match('/[^a-zA-Z0-9][a-zA-Z0-9]/u', $fragment, $matches, PREG_OFFSET_CAPTURE); - $start = $offset + (int) $matches[0][1] + 1; // first valid alphanumeric character + $fragment_start = (int) $matches[0][1] + 1; // first valid alphanumeric character } // Find the last valid alphanumeric character in the fragment to don't cut words if ($end < $text_length - 1) { preg_match_all('/[a-zA-Z0-9][^a-zA-Z0-9]/u', $fragment, $matches, PREG_OFFSET_CAPTURE); - $end = $offset + end($matches[0])[1]; // last valid alphanumeric character + $fragment_end = end($matches[0])[1]; // last valid alphanumeric character } - $output[] = utf8_substr($text, $start, $end - $start + 1); + $output[] = utf8_substr($fragment, $fragment_start, $fragment_end - $fragment_start + 1); } return htmlentities(implode(' ... ', $output)) . ($end < $text_length - 1 ? ' ...' : ''); From f4b144424837b035d2c8a360c6d119bf740432c8 Mon Sep 17 00:00:00 2001 From: Ruben Calvo Date: Sun, 14 Jul 2024 12:37:34 +0200 Subject: [PATCH 10/12] [ticket/15403] Handle ellipsis at the beginning of context and add tests PHPBB-15403 --- phpBB/includes/functions_content.php | 2 +- tests/functions_content/get_context_test.php | 19 +++++++++++++++++++ 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/phpBB/includes/functions_content.php b/phpBB/includes/functions_content.php index 0ef52b1b81..6a360fc4a2 100644 --- a/phpBB/includes/functions_content.php +++ b/phpBB/includes/functions_content.php @@ -420,7 +420,7 @@ function get_context(string $text, array $words, int $length = 400): string $output[] = utf8_substr($fragment, $fragment_start, $fragment_end - $fragment_start + 1); } - return htmlentities(implode(' ... ', $output)) . ($end < $text_length - 1 ? ' ...' : ''); + return ($fragments[0][0] != 0 ? '... ' : '') . htmlentities(implode(' ... ', $output)) . ($end < $text_length - 1 ? ' ...' : ''); } /** diff --git a/tests/functions_content/get_context_test.php b/tests/functions_content/get_context_test.php index 4da1be8ed1..cce72c6652 100644 --- a/tests/functions_content/get_context_test.php +++ b/tests/functions_content/get_context_test.php @@ -89,6 +89,24 @@ class phpbb_functions_content_get_context_test extends TestCase 'length' => -10, 'expected' => '...', ], + 'ellipses_beginning' => [ + 'text' => 'foo foo foo foo foo foo foo foo bar', + 'words' => ['bar'], + 'length' => 10, + 'expected' => '... foo foo bar', + ], + 'ellipsis_end' => [ + 'text' => 'bar foo foo foo foo foo foo foo foo', + 'words' => ['bar'], + 'length' => 10, + 'expected' => 'bar foo foo ...', + ], + 'ellipsis_middle' => [ + 'text' => 'foo word1 foo foo foo foo foo foo foo foo foo word2 foo', + 'words' => ['word1', 'word2'], + 'length' => 10, + 'expected' => '... word1 ... word2 ...', + ], ]; } @@ -99,4 +117,5 @@ class phpbb_functions_content_get_context_test extends TestCase { $this->assertEquals($expected, get_context($text, $words, $length)); } + } From e60288921309884291b4cec93c2d060faf8f444f Mon Sep 17 00:00:00 2001 From: Ruben Calvo Date: Sun, 14 Jul 2024 12:47:06 +0200 Subject: [PATCH 11/12] [ticket/15403] Add another test PHPBB-15403 --- tests/functions_content/get_context_test.php | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tests/functions_content/get_context_test.php b/tests/functions_content/get_context_test.php index cce72c6652..59f6cdf846 100644 --- a/tests/functions_content/get_context_test.php +++ b/tests/functions_content/get_context_test.php @@ -107,6 +107,12 @@ class phpbb_functions_content_get_context_test extends TestCase 'length' => 10, 'expected' => '... word1 ... word2 ...', ], + 'ellipsis_middle2' => [ + 'text' => 'word1 foo foo foo foo foo foo foo foo foo word2', + 'words' => ['word1', 'word2'], + 'length' => 10, + 'expected' => 'word1 ... word2', + ] ]; } From 5c40766dc4d9f5ca7bca3f87c94a15867e0a1069 Mon Sep 17 00:00:00 2001 From: Ruben Calvo Date: Mon, 15 Jul 2024 12:30:10 +0200 Subject: [PATCH 12/12] [ticket/15403] Remove last element of array only if needed PHPBB-15403 --- phpBB/includes/functions_content.php | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/phpBB/includes/functions_content.php b/phpBB/includes/functions_content.php index 6a360fc4a2..2e52863ef0 100644 --- a/phpBB/includes/functions_content.php +++ b/phpBB/includes/functions_content.php @@ -369,20 +369,16 @@ function get_context(string $text, array $words, int $length = 400): string $end = $start + $characters_per_word; // Check if we can merge this fragment into the previous fragment - $last_element = array_pop($fragments); - if ($last_element !== null) + if (!empty($fragments)) { - [$prev_start, $prev_end] = $last_element; + [$prev_start, $prev_end] = end($fragments); if ($prev_end + $characters_per_word >= $index + $word_length) { + array_pop($fragments); $start = $prev_start; $end = $prev_end + $characters_per_word; } - else - { - $fragments[] = $last_element; - } } $fragments[] = [$start, $end]; @@ -420,7 +416,7 @@ function get_context(string $text, array $words, int $length = 400): string $output[] = utf8_substr($fragment, $fragment_start, $fragment_end - $fragment_start + 1); } - return ($fragments[0][0] != 0 ? '... ' : '') . htmlentities(implode(' ... ', $output)) . ($end < $text_length - 1 ? ' ...' : ''); + return ($fragments[0][0] !== 0 ? '... ' : '') . htmlentities(implode(' ... ', $output)) . ($end < $text_length - 1 ? ' ...' : ''); } /**