[ticket/15403] Refactor get_context

PHPBB-15403
2025-07-23 10:28:55 +00:00 · 2024-07-13 15:34:37 +02:00 · 2024-07-13 15:34:37 +02:00 · f41c8eef47
commit f41c8eef47
parent 89afa0cb5e
2 changed files with 73 additions and 55 deletions
--- a/phpBB/includes/functions_content.php
+++ b/phpBB/includes/functions_content.php
@ -325,83 +325,101 @@ function bump_topic_allowed($forum_id, $topic_bumped, $last_post_time, $topic_po
 *
 * @return	string			Context of the specified words separated by "..."
 */
-function get_context($text, $words, $length = 400)
+function get_context(string $text, array $words, int $length = 400): string
 {
-	$text_length = utf8_strlen($text);
-
-	// Replace all spaces/invisible characters with single spaces
-	$text = preg_replace("/\s+/", ' ', $text);
+	if ($length <= 0)
+	{
+		return '...';
+	}

 	// we need to turn the entities back into their original form, to not cut the message in between them
 	$text = html_entity_decode($text);

+	// Replace all spaces/invisible characters with single spaces
+	$text = preg_replace("/\s+/u", ' ', $text);
+
+	$text_length = utf8_strlen($text);
+
 	// Get first occurrence of each word
-	$word_indizes = [];
+	$word_indexes = [];
 	foreach ($words as $word)
 	{
 		$pos = utf8_stripos($text, $word);

 		if ($pos !== false)
 		{
-			$word_indizes[$pos] = $word;
+			$word_indexes[$pos] = $word;
 		}
 	}

-	// If there are coincidences
-	if (!empty($word_indizes))
+	if (!empty($word_indexes))
 	{
-			ksort($word_indizes);
+		ksort($word_indexes);

-			$wordnum = count($word_indizes);
 		// Size of the fragment of text per word
-			$characters_per_word = (int) ($length / $wordnum);
+		$num_indexes = count($word_indexes);
+		$characters_per_word = (int) ($length / $num_indexes) + 2; // +2 leaves one character of margin on each side to avoid cutting words

-			// Get text fragments
+		// Get text fragment indexes
 		$fragments = [];
-			$start = $end = 0;
-			foreach ($word_indizes as $indize => $word)
+		foreach ($word_indexes as $index => $word)
 		{
-				// Check if the next word can be inside the current fragment of text
-				if ($end + $characters_per_word + utf8_strlen($word) < $indize)
+			$word_length = utf8_strlen($word);
+			$start = max(0, min($text_length - 1 - $characters_per_word, (int) ($index + ($word_length / 2) - ($characters_per_word / 2))));
+			$end = $start + $characters_per_word;
+
+			// Check if we can merge this fragment into the previous fragment
+			$last_element = array_pop($fragments);
+			if ($last_element !== null)
 			{
-					$fragment = utf8_substr($text, $start, $end-$start);
+				[$prev_start, $prev_end] = $last_element;

-					if ($start != 0)
+				if ($prev_end + $characters_per_word >= $index + $word_length)
 				{
-						$fragment = '... ' . $fragment;
+					$start = $prev_start;
+					$end = $prev_end + $characters_per_word;
 				}
-
-					$fragments[] = $fragment;
-
-					$start = $indize - ($characters_per_word / 2);
-					// Start fragment at the beginning of a word
-					$end = $start = ($start > 0) ? (utf8_strpos($text, ' ', $start - 1) + 1) : 0;
-				}
-
-				$end += $characters_per_word;
-
-				// End fragment at the end of a word
-				$substring = utf8_substr($text, $start, $end - $start);
-				$end = $start + utf8_strrpos($substring, ' ');
-			}
-
-			$fragment = utf8_substr($text, $start, $end-$start);
-			if ($start != 0)
+				else
 				{
-				$fragment = '... ' . $fragment;
+					$fragments[] = $last_element;
 				}
-			if ($end < $text_length)
+			}
+
+			$fragments[] = [$start, $end];
+		}
+	}
+	else
 	{
-				$fragment .= ' ...';
+		// There are no matches, so we just create a fragment with the first $length characters
+		$fragments[] = [0, $length];
+		$end = $length;
 	}

-			// Get the last fragment
-			$fragments[] = $fragment;
+	$output = [];
+	foreach ($fragments as [$start, $end])
+	{
+		$fragment = utf8_substr($text, $start, $end - $start + 1);

-			return htmlentities(implode('', $fragments));
+		$offset = $start;
+
+		// Find the first valid alphanumeric character in the fragment to avoid cutting words
+		if ($start > 0)
+		{
+			preg_match('/[^a-zA-Z0-9][a-zA-Z0-9]/u', $fragment, $matches, PREG_OFFSET_CAPTURE);
+			$start = $offset + (int) $matches[0][1] + 1; // first valid alphanumeric character
 		}

-	return htmlentities($text_length >= $length + 3 ? utf8_substr($text, 0, $length) . ' ...' : $text);
+		// Find the last valid alphanumeric character in the fragment to avoid cutting words
+		if ($end < $text_length - 1)
+		{
+			preg_match_all('/[a-zA-Z0-9][^a-zA-Z0-9]/u', $fragment, $matches, PREG_OFFSET_CAPTURE);
+			$end = $offset + end($matches[0])[1]; // last valid alphanumeric character
+		}
+
+		$output[] = utf8_substr($text, $start, $end - $start + 1);
+	}
+
+	return htmlentities(implode(' ... ', $output)) . ($end < $text_length - 1 ? ' ...' : '');
 }

 /**
--- a/tests/functions_content/get_context_test.php
+++ b/tests/functions_content/get_context_test.php
@ -39,7 +39,7 @@ class phpbb_functions_content_get_context_test extends TestCase
 				'text' => 'This is a sample text containing several words, but none of them match the given words.',
 				'words' => ['nonexistent'],
 				'length' => 50,
-				'expected' => 'This is a sample text containing several words, bu ...',
+				'expected' => 'This is a sample text containing several words ...',
 			],
 			'desired length equal to text length' => [
 				'text' => 'Exact length text.',
@ -57,13 +57,13 @@ class phpbb_functions_content_get_context_test extends TestCase
 				'text' => 'This is a sample text containing &amp; and &lt; and &gt; entities.',
 				'words' => ['sample', 'entities'],
 				'length' => 50,
-				'expected' => 'This is a sample text containing &amp; and &lt; and ...',
+				'expected' => 'This is a sample text ... and &lt; and &gt; entities.',
 			],
 			'text with multiple spaces and special characters' => [
 				'text' => 'This    is    a   sample   text containing    several   words.',
 				'words' => ['sample', 'several'],
 				'length' => 50,
-				'expected' => 'This is a sample text containing several words ...',
+				'expected' => 'This is a sample text containing several words.',
 			],
 			'empty text' => [
 				'text' => '',