diff --git a/phpBB/composer.json b/phpBB/composer.json index 9d4f28689c..0a04cde181 100644 --- a/phpBB/composer.json +++ b/phpBB/composer.json @@ -73,9 +73,6 @@ "vimeo/psalm": "^5.18.0", "psalm/plugin-symfony": "^v5.1.0" }, - "suggest": { - "ext-mbstring": "Better performance in search" - }, "extra": { "branch-alias": { "dev-master": "4.0.x-dev" diff --git a/phpBB/includes/functions_content.php b/phpBB/includes/functions_content.php index 941b2eb246..2950fe762a 100644 --- a/phpBB/includes/functions_content.php +++ b/phpBB/includes/functions_content.php @@ -324,121 +324,98 @@ function bump_topic_allowed($forum_id, $topic_bumped, $last_post_time, $topic_po * * @return string Context of the specified words separated by "..." */ -function get_context(string $text, array $words, int $length = 400) +function get_context(string $text, array $words, int $length = 400): string { - // first replace all whitespaces with single spaces - $text = preg_replace('/ +/', ' ', strtr($text, "\t\n\r\x0C ", ' ')); + if ($length <= 0) + { + return '...'; + } // we need to turn the entities back into their original form, to not cut the message in between them - $entities = array('<', '>', '[', ']', '.', ':', ':'); - $characters = array('<', '>', '[', ']', '.', ':', ':'); - $text = str_replace($entities, $characters, $text); + $text = html_entity_decode($text); - $word_indizes = array(); - if (count($words)) + // Replace all spaces/invisible characters with single spaces + $text = preg_replace("/\s+/u", ' ', $text); + + $text_length = utf8_strlen($text); + + // Get first occurrence of each word + $word_indexes = []; + foreach ($words as $word) { - $match = ''; - // find the starting indizes of all words - foreach ($words as $word) + $pos = utf8_stripos($text, $word); + + if ($pos !== false) { - if ($word) - { - if (preg_match('#(?:[^\w]|^)(' . $word . ')(?:[^\w]|$)#i', $text, $match)) - { - if (empty($match[1])) - { - continue; - } - - $pos = utf8_strpos($text, $match[1]); - if ($pos !== false) - { - $word_indizes[] = $pos; - } - } - } - } - unset($match); - - if (count($word_indizes)) - { - $word_indizes = array_unique($word_indizes); - sort($word_indizes); - - $wordnum = count($word_indizes); - // number of characters on the right and left side of each word - $sequence_length = (int) ($length / (2 * $wordnum)) - 2; - $final_text = ''; - $word = $j = 0; - $final_text_index = -1; - - // cycle through every character in the original text - for ($i = $word_indizes[$word], $n = utf8_strlen($text); $i < $n; $i++) - { - // if the current position is the start of one of the words then append $sequence_length characters to the final text - if (isset($word_indizes[$word]) && ($i == $word_indizes[$word])) - { - if ($final_text_index < $i - $sequence_length - 1) - { - $final_text .= '... ' . preg_replace('#^([^ ]*)#', '', utf8_substr($text, $i - $sequence_length, $sequence_length)); - } - else - { - // if the final text is already nearer to the current word than $sequence_length we only append the text - // from its current index on and distribute the unused length to all other sequenes - $sequence_length += (int) (($final_text_index - $i + $sequence_length + 1) / (2 * $wordnum)); - $final_text .= utf8_substr($text, $final_text_index + 1, $i - $final_text_index - 1); - } - $final_text_index = $i - 1; - - // add the following characters to the final text (see below) - $word++; - $j = 1; - } - - if ($j > 0) - { - // add the character to the final text and increment the sequence counter - $final_text .= utf8_substr($text, $i, 1); - $final_text_index++; - $j++; - - // if this is a whitespace then check whether we are done with this sequence - if (utf8_substr($text, $i, 1) == ' ') - { - // only check whether we have to exit the context generation completely if we haven't already reached the end anyway - if ($i + 4 < $n) - { - if (($j > $sequence_length && $word >= $wordnum) || utf8_strlen($final_text) > $length) - { - $final_text .= ' ...'; - break; - } - } - else - { - // make sure the text really reaches the end - $j -= 4; - } - - // stop context generation and wait for the next word - if ($j > $sequence_length) - { - $j = 0; - } - } - } - } - return str_replace($characters, $entities, $final_text); + $word_indexes[$pos] = $word; } } - if (!count($words) || !count($word_indizes)) + if (!empty($word_indexes)) { - return str_replace($characters, $entities, ((utf8_strlen($text) >= $length + 3) ? utf8_substr($text, 0, $length) . '...' : $text)); + ksort($word_indexes); + + // Size of the fragment of text per word + $num_indexes = count($word_indexes); + $characters_per_word = (int) ($length / $num_indexes) + 2; // 2 to leave one character of margin at the sides to don't cut words + + // Get text fragment indexes + $fragments = []; + foreach ($word_indexes as $index => $word) + { + $word_length = utf8_strlen($word); + $start = max(0, min($text_length - 1 - $characters_per_word, (int) ($index + ($word_length / 2) - ($characters_per_word / 2)))); + $end = $start + $characters_per_word; + + // Check if we can merge this fragment into the previous fragment + if (!empty($fragments)) + { + [$prev_start, $prev_end] = end($fragments); + + if ($prev_end + $characters_per_word >= $index + $word_length) + { + array_pop($fragments); + $start = $prev_start; + $end = $prev_end + $characters_per_word; + } + } + + $fragments[] = [$start, $end]; + } + } + else + { + // There is no coincidences, so we just create a fragment with the first $length characters + $fragments[] = [0, $length]; + $end = $length; } - return ''; + $output = []; + foreach ($fragments as [$start, $end]) + { + $fragment = utf8_substr($text, $start, $end - $start + 1); + + $fragment_start = 0; + $fragment_end = $end - $start + 1; + + // Find the first valid alphanumeric character in the fragment to don't cut words + if ($start > 0) + { + preg_match('/[^a-zA-Z0-9][a-zA-Z0-9]/u', $fragment, $matches, PREG_OFFSET_CAPTURE); + $fragment_start = (int) $matches[0][1] + 1; // first valid alphanumeric character + } + + // Find the last valid alphanumeric character in the fragment to don't cut words + if ($end < $text_length - 1) + { + preg_match_all('/[a-zA-Z0-9][^a-zA-Z0-9]/u', $fragment, $matches, PREG_OFFSET_CAPTURE); + $fragment_end = end($matches[0])[1]; // last valid alphanumeric character + } + + $output[] = utf8_substr($fragment, $fragment_start, $fragment_end - $fragment_start + 1); + } + + return ($fragments[0][0] !== 0 ? '... ' : '') . htmlentities(implode(' ... ', $output)) . ($end < $text_length - 1 ? ' ...' : ''); } /** diff --git a/phpBB/includes/utf/utf_tools.php b/phpBB/includes/utf/utf_tools.php index b8c35a5048..ed86829b97 100644 --- a/phpBB/includes/utf/utf_tools.php +++ b/phpBB/includes/utf/utf_tools.php @@ -72,6 +72,22 @@ function utf8_strpos($str, $needle, $offset = null) } } +/** +* UTF-8 aware alternative to stripos +* @ignore +*/ +function utf8_stripos($str, $needle, $offset = null) +{ + if (is_null($offset)) + { + return mb_stripos($str, $needle); + } + else + { + return mb_stripos($str, $needle, $offset); + } +} + /** * UTF-8 aware alternative to strtolower * @ignore diff --git a/tests/functional/search/base.php b/tests/functional/search/base.php index c3f3f6f7dd..1b2087adc0 100644 --- a/tests/functional/search/base.php +++ b/tests/functional/search/base.php @@ -165,7 +165,7 @@ abstract class phpbb_functional_search_base extends phpbb_functional_test_case foreach (['', 'a', 't', 'f', 'i', 's'] as $sort_key) { - $this->assert_search_found('phpbb3+installation', 1, 3, $sort_key); + $this->assert_search_found('phpbb3+installation', 1, 4, $sort_key); $this->assert_search_found('foosubject+barsearch', 1, 2, $sort_key); $this->assert_search_found('barsearch-testing', 1, 2, $sort_key); // test hyphen ignored $this->assert_search_found('barsearch+-+testing', 1, 2, $sort_key); // test hyphen wrapped with space ignored diff --git a/tests/functions_content/get_context_test.php b/tests/functions_content/get_context_test.php new file mode 100644 index 0000000000..59f6cdf846 --- /dev/null +++ b/tests/functions_content/get_context_test.php @@ -0,0 +1,127 @@ + + * @license GNU General Public License, version 2 (GPL-2.0) + * + * For full copyright and license information, please see + * the docs/CREDITS.txt file. + * + */ + +use PHPUnit\Framework\TestCase; + +class phpbb_functions_content_get_context_test extends TestCase +{ + /** + * Data provider for get_context test cases. + * + * @return array + */ + public function data_get_context(): array + { + return [ + 'text contains words and length greater than text' => [ + 'text' => 'This is a sample text containing several words, including sample, text, and words.', + 'words' => ['sample', 'words'], + 'length' => 100, + 'expected' => 'This is a sample text containing several words, including sample, text, and words.', + ], + 'text contains words and length less than text' => [ + 'text' => 'This is a sample text containing several words, including sample, text, and words.', + 'words' => ['sample', 'words'], + 'length' => 50, + 'expected' => 'This is a sample text containing several words ...', + ], + 'text does not contain words' => [ + 'text' => 'This is a sample text containing several words, but none of them match the given words.', + 'words' => ['nonexistent'], + 'length' => 50, + 'expected' => 'This is a sample text containing several words ...', + ], + 'desired length equal to text length' => [ + 'text' => 'Exact length text.', + 'words' => ['Exact', 'text'], + 'length' => 18, + 'expected' => 'Exact length text.', + ], + 'text with html entities' => [ + 'text' => 'This is a sample text containing & and < and > entities.', + 'words' => ['sample', 'containing'], + 'length' => 50, + 'expected' => 'This is a sample text containing & and < and ...', + ], + 'text with html entities and contains last word' => [ + 'text' => 'This is a sample text containing & and < and > entities.', + 'words' => ['sample', 'entities'], + 'length' => 50, + 'expected' => 'This is a sample text ... and < and > entities.', + ], + 'text with multiple spaces and special characters' => [ + 'text' => 'This is a sample text containing several words.', + 'words' => ['sample', 'several'], + 'length' => 50, + 'expected' => 'This is a sample text containing several words.', + ], + 'empty text' => [ + 'text' => '', + 'words' => ['sample', 'words'], + 'length' => 50, + 'expected' => '', + ], + 'empty words array' => [ + 'text' => 'This is a sample text containing several words.', + 'words' => [], + 'length' => 50, + 'expected' => 'This is a sample text containing several words.', + ], + 'zero length' => [ + 'text' => 'This is a sample text.', + 'words' => ['sample'], + 'length' => 0, + 'expected' => '...', + ], + 'negative length' => [ + 'text' => 'This is a sample text.', + 'words' => ['sample'], + 'length' => -10, + 'expected' => '...', + ], + 'ellipses_beginning' => [ + 'text' => 'foo foo foo foo foo foo foo foo bar', + 'words' => ['bar'], + 'length' => 10, + 'expected' => '... foo foo bar', + ], + 'ellipsis_end' => [ + 'text' => 'bar foo foo foo foo foo foo foo foo', + 'words' => ['bar'], + 'length' => 10, + 'expected' => 'bar foo foo ...', + ], + 'ellipsis_middle' => [ + 'text' => 'foo word1 foo foo foo foo foo foo foo foo foo word2 foo', + 'words' => ['word1', 'word2'], + 'length' => 10, + 'expected' => '... word1 ... word2 ...', + ], + 'ellipsis_middle2' => [ + 'text' => 'word1 foo foo foo foo foo foo foo foo foo word2', + 'words' => ['word1', 'word2'], + 'length' => 10, + 'expected' => 'word1 ... word2', + ] + ]; + } + + /** + * @dataProvider data_get_context + */ + public function test_get_context($text, $words, $length, $expected) + { + $this->assertEquals($expected, get_context($text, $words, $length)); + } + +}