* @license GNU General Public License, version 2 (GPL-2.0) * * For full copyright and license information, please see * the docs/CREDITS.txt file. * */ namespace phpbb\search\backend; use phpbb\config\config; use phpbb\db\driver\driver_interface; use phpbb\db\tools\tools_interface; use phpbb\event\dispatcher_interface; use phpbb\language\language; use phpbb\user; /** * phpBB's own db driven fulltext search, version 2 */ class fulltext_native extends base implements search_backend_interface { protected const UTF8_HANGUL_FIRST = "\xEA\xB0\x80"; protected const UTF8_HANGUL_LAST = "\xED\x9E\xA3"; protected const UTF8_CJK_FIRST = "\xE4\xB8\x80"; protected const UTF8_CJK_LAST = "\xE9\xBE\xBB"; protected const UTF8_CJK_B_FIRST = "\xF0\xA0\x80\x80"; protected const UTF8_CJK_B_LAST = "\xF0\xAA\x9B\x96"; /** * Associative array holding index stats * @var array */ protected $stats = array(); /** * Associative array stores the min and max word length to be searched * @var array */ protected $word_length = array(); /** * Contains tidied search query. * Operators are prefixed in search query and common words excluded * @var string */ protected $search_query = ''; /** * Contains common words. * Common words are words with length less/more than min/max length * @var array */ protected $common_words = array(); /** * Post ids of posts containing words that are to be included * @var array */ protected $must_contain_ids = array(); /** * Post ids of posts containing words that should not be included * @var array */ protected $must_not_contain_ids = array(); /** * Post ids of posts containing at least one word that needs to be excluded * @var array */ protected $must_exclude_one_ids = array(); /** * Relative path to board root * @var string */ protected $phpbb_root_path; /** * PHP Extension * @var string */ protected $php_ext; /** * DBAL tools * @var tools_interface */ protected $db_tools; /** * phpBB event dispatcher object * @var dispatcher_interface */ protected $phpbb_dispatcher; /** @var language */ protected $language; /** @var string */ protected $search_wordlist_table; /** @var string */ protected $search_wordmatch_table; /** * Initialises the fulltext_native search backend with min/max word length * * @param config $config Config object * @param driver_interface $db Database object * @param tools_interface $db_tools Database tools * @param dispatcher_interface $phpbb_dispatcher Event dispatcher object * @param language $language * @param user $user User object * @param string $search_results_table * @param string $search_wordlist_table * @param string $search_wordmatch_table * @param string $phpbb_root_path phpBB root path * @param string $phpEx PHP file extension */ public function __construct(config $config, driver_interface $db, tools_interface $db_tools, dispatcher_interface $phpbb_dispatcher, language $language, user $user, string $search_results_table, string $search_wordlist_table, string $search_wordmatch_table, string $phpbb_root_path, string $phpEx) { global $cache; parent::__construct($cache, $config, $db, $user, $search_results_table); $this->db_tools = $db_tools; $this->phpbb_dispatcher = $phpbb_dispatcher; $this->language = $language; $this->search_wordlist_table = $search_wordlist_table; $this->search_wordmatch_table = $search_wordmatch_table; $this->phpbb_root_path = $phpbb_root_path; $this->php_ext = $phpEx; $this->word_length = array('min' => (int) $this->config['fulltext_native_min_chars'], 'max' => (int) $this->config['fulltext_native_max_chars']); /** * Load the UTF tools */ if (!function_exists('utf8_decode_ncr')) { include($this->phpbb_root_path . 'includes/utf/utf_tools.' . $this->php_ext); } } /** * {@inheritdoc} */ public function get_name(): string { return 'phpBB Native Fulltext'; } /** * {@inheritdoc} */ public function is_available(): bool { return true; } /** * {@inheritdoc} */ public function init() { return false; } /** * {@inheritdoc} */ public function get_search_query(): string { return $this->search_query; } /** * {@inheritdoc} */ public function get_common_words(): array { return $this->common_words; } /** * {@inheritdoc} */ public function get_word_length() { return $this->word_length; } /** * {@inheritdoc} */ public function split_keywords(string &$keywords, string $terms): bool { $tokens = '+-|()* '; $keywords = trim($this->cleanup($keywords, $tokens)); // allow word|word|word without brackets if ((strpos($keywords, ' ') === false) && (strpos($keywords, '|') !== false) && (strpos($keywords, '(') === false)) { $keywords = '(' . $keywords . ')'; } $open_bracket = $space = false; for ($i = 0, $n = strlen($keywords); $i < $n; $i++) { if ($open_bracket !== false) { switch ($keywords[$i]) { case ')': if ($open_bracket + 1 == $i) { $keywords[$i - 1] = '|'; $keywords[$i] = '|'; } $open_bracket = false; break; case '(': $keywords[$i] = '|'; break; case '+': case '-': case ' ': $keywords[$i] = '|'; break; case '*': // $i can never be 0 here since $open_bracket is initialised to false if (strpos($tokens, $keywords[$i - 1]) !== false && ($i + 1 === $n || strpos($tokens, $keywords[$i + 1]) !== false)) { $keywords[$i] = '|'; } break; } } else { switch ($keywords[$i]) { case ')': $keywords[$i] = ' '; break; case '(': $open_bracket = $i; $space = false; break; case '|': $keywords[$i] = ' '; break; case '-': // Ignore hyphen if followed by a space if (isset($keywords[$i + 1]) && $keywords[$i + 1] == ' ') { $keywords[$i] = ' '; } else { $space = $keywords[$i]; } break; case '+': $space = $keywords[$i]; break; case ' ': if ($space !== false) { $keywords[$i] = $space; } break; default: $space = false; } } } if ($open_bracket !== false) { $keywords .= ')'; } $match = array( '# +#', '#\|\|+#', '#(\+|\-)(?:\+|\-)+#', '#\(\|#', '#\|\)#', ); $replace = array( ' ', '|', '$1', '(', ')', ); $keywords = preg_replace($match, $replace, $keywords); // Ensure a space exists before +, - and | to make the split and count work correctly $countable_keywords = preg_replace('/(?config['max_num_search_keywords'] && $num_keywords > $this->config['max_num_search_keywords']) { trigger_error($this->language->lang('MAX_NUM_SEARCH_KEYWORDS_REFINE', (int) $this->config['max_num_search_keywords'], $num_keywords)); } // $keywords input format: each word separated by a space, words in a bracket are not separated // the user wants to search for any word, convert the search query if ($terms == 'any') { $words = array(); preg_match_all('#([^\\s+\\-|()]+)(?:$|[\\s+\\-|()])#u', $keywords, $words); if (count($words[1])) { $keywords = '(' . implode('|', $words[1]) . ')'; } } // Remove non trailing wildcards from each word to prevent a full table scan (it's now using the database index) $match = '#\*(?!$|\s)#'; $replace = '$1'; $keywords = preg_replace($match, $replace, $keywords); // Only allow one wildcard in the search query to limit the database load $match = '#\*#'; $replace = '$1'; $count_wildcards = substr_count($keywords, '*'); // Reverse the string to remove all wildcards except the first one $keywords = strrev(preg_replace($match, $replace, strrev($keywords), $count_wildcards - 1)); unset($count_wildcards); // set the search_query which is shown to the user $this->search_query = $keywords; $exact_words = array(); preg_match_all('#([^\\s+\\-|()]+)(?:$|[\\s+\\-|()])#u', $keywords, $exact_words); $exact_words = $exact_words[1]; $common_ids = $words = array(); if (count($exact_words)) { $sql = 'SELECT word_id, word_text, word_common FROM ' . $this->search_wordlist_table . ' WHERE ' . $this->db->sql_in_set('word_text', $exact_words) . ' ORDER BY word_count ASC'; $result = $this->db->sql_query($sql); // store an array of words and ids, remove common words while ($row = $this->db->sql_fetchrow($result)) { if ($row['word_common']) { $this->common_words[] = $row['word_text']; $common_ids[$row['word_text']] = (int) $row['word_id']; continue; } $words[$row['word_text']] = (int) $row['word_id']; } $this->db->sql_freeresult($result); } // Handle +, - without preceding whitespace character $match = array('#(\S)\+#', '#(\S)-#'); $replace = array('$1 +', '$1 +'); $keywords = preg_replace($match, $replace, $keywords); // now analyse the search query, first split it using the spaces $query = explode(' ', $keywords); $this->must_contain_ids = array(); $this->must_not_contain_ids = array(); $this->must_exclude_one_ids = array(); foreach ($query as $word) { if (empty($word)) { continue; } // words which should not be included if ($word[0] == '-') { $word = substr($word, 1); // a group of which at least one may not be in the resulting posts if (isset($word[0]) && $word[0] == '(') { $word = array_unique(explode('|', substr($word, 1, -1))); $mode = 'must_exclude_one'; } // one word which should not be in the resulting posts else { $mode = 'must_not_contain'; } $ignore_no_id = true; } // words which have to be included else { // no prefix is the same as a +prefix if ($word[0] == '+') { $word = substr($word, 1); } // a group of words of which at least one word should be in every resulting post if (isset($word[0]) && $word[0] == '(') { $word = array_unique(explode('|', substr($word, 1, -1))); } $ignore_no_id = false; $mode = 'must_contain'; } if (empty($word)) { continue; } // if this is an array of words then retrieve an id for each if (is_array($word)) { $non_common_words = array(); $id_words = array(); foreach ($word as $i => $word_part) { if (strpos($word_part, '*') !== false) { $len = utf8_strlen(str_replace('*', '', $word_part)); if ($len >= $this->word_length['min'] && $len <= $this->word_length['max']) { $id_words[] = '\'' . $this->db->sql_escape(str_replace('*', '%', $word_part)) . '\''; $non_common_words[] = $word_part; } else { $this->common_words[] = $word_part; } } else if (isset($words[$word_part])) { $id_words[] = $words[$word_part]; $non_common_words[] = $word_part; } else { $len = utf8_strlen($word_part); if ($len < $this->word_length['min'] || $len > $this->word_length['max']) { $this->common_words[] = $word_part; } } } if (count($id_words)) { sort($id_words); if (count($id_words) > 1) { $this->{$mode . '_ids'}[] = $id_words; } else { $mode = ($mode == 'must_exclude_one') ? 'must_not_contain' : $mode; $this->{$mode . '_ids'}[] = $id_words[0]; } } // throw an error if we shall not ignore unexistant words else if (!$ignore_no_id && count($non_common_words)) { trigger_error(sprintf($this->language->lang('WORDS_IN_NO_POST'), implode($this->language->lang('COMMA_SEPARATOR'), $non_common_words))); } unset($non_common_words); } // else we only need one id else if (($wildcard = strpos($word, '*') !== false) || isset($words[$word])) { if ($wildcard) { $len = utf8_strlen(str_replace('*', '', $word)); if ($len >= $this->word_length['min'] && $len <= $this->word_length['max']) { $this->{$mode . '_ids'}[] = '\'' . $this->db->sql_escape(str_replace('*', '%', $word)) . '\''; } else { $this->common_words[] = $word; } } else { $this->{$mode . '_ids'}[] = $words[$word]; } } else { if (!isset($common_ids[$word])) { $len = utf8_strlen($word); if ($len < $this->word_length['min'] || $len > $this->word_length['max']) { $this->common_words[] = $word; } } } } // Return true if all words are not common words if (count($exact_words) - count($this->common_words) > 0) { return true; } return false; } /** * {@inheritdoc} */ public function keyword_search(string $type, string $fields, string $terms, array $sort_by_sql, string $sort_key, string $sort_dir, string $sort_days, array $ex_fid_ary, string $post_visibility, int $topic_id, array $author_ary, string $author_name, array &$id_ary, int &$start, int $per_page) { // No keywords? No posts. if (empty($this->search_query)) { return false; } // we can't search for negatives only if (empty($this->must_contain_ids)) { return false; } $must_contain_ids = $this->must_contain_ids; $must_not_contain_ids = $this->must_not_contain_ids; $must_exclude_one_ids = $this->must_exclude_one_ids; sort($must_contain_ids); sort($must_not_contain_ids); sort($must_exclude_one_ids); // generate a search_key from all the options to identify the results $search_key_array = array( serialize($must_contain_ids), serialize($must_not_contain_ids), serialize($must_exclude_one_ids), $type, $fields, $terms, $sort_days, $sort_key, $topic_id, implode(',', $ex_fid_ary), $post_visibility, implode(',', $author_ary), $author_name, ); /** * Allow changing the search_key for cached results * * @event core.search_native_by_keyword_modify_search_key * @var array search_key_array Array with search parameters to generate the search_key * @var array must_contain_ids Array with post ids of posts containing words that are to be included * @var array must_not_contain_ids Array with post ids of posts containing words that should not be included * @var array must_exclude_one_ids Array with post ids of posts containing at least one word that needs to be excluded * @var string type Searching type ('posts', 'topics') * @var string fields Searching fields ('titleonly', 'msgonly', 'firstpost', 'all') * @var string terms Searching terms ('all', 'any') * @var int sort_days Time, in days, of the oldest possible post to list * @var string sort_key The sort type used from the possible sort types * @var int topic_id Limit the search to this topic_id only * @var array ex_fid_ary Which forums not to search on * @var string post_visibility Post visibility data * @var array author_ary Array of user_id containing the users to filter the results to * @since 3.1.7-RC1 */ $vars = array( 'search_key_array', 'must_contain_ids', 'must_not_contain_ids', 'must_exclude_one_ids', 'type', 'fields', 'terms', 'sort_days', 'sort_key', 'topic_id', 'ex_fid_ary', 'post_visibility', 'author_ary', ); extract($this->phpbb_dispatcher->trigger_event('core.search_native_by_keyword_modify_search_key', compact($vars))); $search_key = md5(implode('#', $search_key_array)); // try reading the results from cache $total_results = 0; if ($this->obtain_ids($search_key, $total_results, $id_ary, $start, $per_page, $sort_dir) == self::SEARCH_RESULT_IN_CACHE) { return $total_results; } $id_ary = array(); $sql_where = array(); $m_num = 0; $w_num = 0; $sql_array = array( 'SELECT' => ($type == 'posts') ? 'DISTINCT p.post_id' : 'DISTINCT p.topic_id', 'FROM' => array( $this->search_wordmatch_table => array(), $this->search_wordlist_table => array(), ), 'LEFT_JOIN' => array(array( 'FROM' => array(POSTS_TABLE => 'p'), 'ON' => 'm0.post_id = p.post_id', )), ); $title_match = ''; $left_join_topics = false; $group_by = true; // Build some display specific sql strings switch ($fields) { case 'titleonly': $title_match = 'title_match = 1'; $group_by = false; // no break case 'firstpost': $left_join_topics = true; $sql_where[] = 'p.post_id = t.topic_first_post_id'; break; case 'msgonly': $title_match = 'title_match = 0'; $group_by = false; break; } if ($type == 'topics') { $left_join_topics = true; $group_by = true; } /** * @todo Add a query optimizer (handle stuff like "+(4|3) +4") */ foreach ($this->must_contain_ids as $subquery) { if (is_array($subquery)) { $group_by = true; $word_id_sql = array(); $word_ids = array(); foreach ($subquery as $id) { if (is_string($id)) { $sql_array['LEFT_JOIN'][] = array( 'FROM' => array($this->search_wordlist_table => 'w' . $w_num), 'ON' => "w$w_num.word_text LIKE $id" ); $word_ids[] = "w$w_num.word_id"; $w_num++; } else { $word_ids[] = $id; } } $sql_where[] = $this->db->sql_in_set("m$m_num.word_id", $word_ids); unset($word_id_sql); unset($word_ids); } else if (is_string($subquery)) { $sql_array['FROM'][$this->search_wordlist_table][] = 'w' . $w_num; $sql_where[] = "w$w_num.word_text LIKE $subquery"; $sql_where[] = "m$m_num.word_id = w$w_num.word_id"; $group_by = true; $w_num++; } else { $sql_where[] = "m$m_num.word_id = $subquery"; } $sql_array['FROM'][$this->search_wordmatch_table][] = 'm' . $m_num; if ($title_match) { $sql_where[] = "m$m_num.$title_match"; } if ($m_num != 0) { $sql_where[] = "m$m_num.post_id = m0.post_id"; } $m_num++; } foreach ($this->must_not_contain_ids as $key => $subquery) { if (is_string($subquery)) { $sql_array['LEFT_JOIN'][] = array( 'FROM' => array($this->search_wordlist_table => 'w' . $w_num), 'ON' => "w$w_num.word_text LIKE $subquery" ); $this->must_not_contain_ids[$key] = "w$w_num.word_id"; $group_by = true; $w_num++; } } if (count($this->must_not_contain_ids)) { $sql_array['LEFT_JOIN'][] = array( 'FROM' => array($this->search_wordmatch_table => 'm' . $m_num), 'ON' => $this->db->sql_in_set("m$m_num.word_id", $this->must_not_contain_ids) . (($title_match) ? " AND m$m_num.$title_match" : '') . " AND m$m_num.post_id = m0.post_id" ); $sql_where[] = "m$m_num.word_id IS NULL"; $m_num++; } foreach ($this->must_exclude_one_ids as $ids) { $is_null_joins = array(); foreach ($ids as $id) { if (is_string($id)) { $sql_array['LEFT_JOIN'][] = array( 'FROM' => array($this->search_wordlist_table => 'w' . $w_num), 'ON' => "w$w_num.word_text LIKE $id" ); $id = "w$w_num.word_id"; $group_by = true; $w_num++; } $sql_array['LEFT_JOIN'][] = array( 'FROM' => array($this->search_wordmatch_table => 'm' . $m_num), 'ON' => "m$m_num.word_id = $id AND m$m_num.post_id = m0.post_id" . (($title_match) ? " AND m$m_num.$title_match" : '') ); $is_null_joins[] = "m$m_num.word_id IS NULL"; $m_num++; } $sql_where[] = '(' . implode(' OR ', $is_null_joins) . ')'; } $sql_where[] = $post_visibility; $search_query = $this->search_query; $must_exclude_one_ids = $this->must_exclude_one_ids; $must_not_contain_ids = $this->must_not_contain_ids; $must_contain_ids = $this->must_contain_ids; $sql_sort_table = $sql_sort_join = $sql_match = $sql_match_where = $sql_sort = ''; /** * Allow changing the query used for counting for posts using fulltext_native * * @event core.search_native_keywords_count_query_before * @var string search_query The parsed keywords used for this search * @var array must_not_contain_ids Ids that cannot be taken into account for the results * @var array must_exclude_one_ids Ids that cannot be on the results * @var array must_contain_ids Ids that must be on the results * @var int total_results The previous result count for the format of the query * Set to 0 to force a re-count * @var array sql_array The data on how to search in the DB at this point * @var bool left_join_topics Whether or not TOPICS_TABLE should be CROSS JOIN'ED * @var array author_ary Array of user_id containing the users to filter the results to * @var string author_name An extra username to search on (!empty(author_ary) must be true, to be relevant) * @var array ex_fid_ary Which forums not to search on * @var int topic_id Limit the search to this topic_id only * @var string sql_sort_table Extra tables to include in the SQL query. * Used in conjunction with sql_sort_join * @var string sql_sort_join SQL conditions to join all the tables used together. * Used in conjunction with sql_sort_table * @var int sort_days Time, in days, of the oldest possible post to list * @var string sql_where An array of the current WHERE clause conditions * @var string sql_match Which columns to do the search on * @var string sql_match_where Extra conditions to use to properly filter the matching process * @var bool group_by Whether or not the SQL query requires a GROUP BY for the elements in the SELECT clause * @var string sort_by_sql The possible predefined sort types * @var string sort_key The sort type used from the possible sort types * @var string sort_dir "a" for ASC or "d" dor DESC for the sort order used * @var string sql_sort The result SQL when processing sort_by_sql + sort_key + sort_dir * @var int start How many posts to skip in the search results (used for pagination) * @since 3.1.5-RC1 */ $vars = array( 'search_query', 'must_not_contain_ids', 'must_exclude_one_ids', 'must_contain_ids', 'total_results', 'sql_array', 'left_join_topics', 'author_ary', 'author_name', 'ex_fid_ary', 'topic_id', 'sql_sort_table', 'sql_sort_join', 'sort_days', 'sql_where', 'sql_match', 'sql_match_where', 'group_by', 'sort_by_sql', 'sort_key', 'sort_dir', 'sql_sort', 'start', ); extract($this->phpbb_dispatcher->trigger_event('core.search_native_keywords_count_query_before', compact($vars))); if ($topic_id) { $sql_where[] = 'p.topic_id = ' . $topic_id; } if (count($author_ary)) { if ($author_name) { // first one matches post of registered users, second one guests and deleted users $sql_author = '(' . $this->db->sql_in_set('p.poster_id', array_diff($author_ary, array(ANONYMOUS)), false, true) . ' OR p.post_username ' . $author_name . ')'; } else { $sql_author = $this->db->sql_in_set('p.poster_id', $author_ary); } $sql_where[] = $sql_author; } if (count($ex_fid_ary)) { $sql_where[] = $this->db->sql_in_set('p.forum_id', $ex_fid_ary, true); } if ($sort_days) { $sql_where[] = 'p.post_time >= ' . (time() - ($sort_days * 86400)); } $sql_array['WHERE'] = implode(' AND ', $sql_where); $is_mysql = false; // if the total result count is not cached yet, retrieve it from the db if (!$total_results) { $sql = ''; $sql_array_count = $sql_array; if ($left_join_topics) { $sql_array_count['LEFT_JOIN'][] = array( 'FROM' => array(TOPICS_TABLE => 't'), 'ON' => 'p.topic_id = t.topic_id' ); } switch ($this->db->get_sql_layer()) { case 'mysqli': $is_mysql = true; break; case 'sqlite3': $sql_array_count['SELECT'] = ($type == 'posts') ? 'DISTINCT p.post_id' : 'DISTINCT p.topic_id'; $sql = 'SELECT COUNT(' . (($type == 'posts') ? 'post_id' : 'topic_id') . ') as total_results FROM (' . $this->db->sql_build_query('SELECT', $sql_array_count) . ')'; // no break default: $sql_array_count['SELECT'] = ($type == 'posts') ? 'COUNT(DISTINCT p.post_id) AS total_results' : 'COUNT(DISTINCT p.topic_id) AS total_results'; $sql = (!$sql) ? $this->db->sql_build_query('SELECT', $sql_array_count) : $sql; $result = $this->db->sql_query($sql); $total_results = (int) $this->db->sql_fetchfield('total_results'); $this->db->sql_freeresult($result); if (!$total_results) { return false; } break; } unset($sql_array_count, $sql); } // Build sql strings for sorting $sql_sort = $sort_by_sql[$sort_key] . (($sort_dir == 'a') ? ' ASC' : ' DESC'); switch ($sql_sort[0]) { case 'u': $sql_array['FROM'][USERS_TABLE] = 'u'; $sql_where[] = 'u.user_id = p.poster_id '; break; case 't': $left_join_topics = true; break; case 'f': $sql_array['FROM'][FORUMS_TABLE] = 'f'; $sql_where[] = 'f.forum_id = p.forum_id'; break; } if ($left_join_topics) { $sql_array['LEFT_JOIN'][] = array( 'FROM' => array(TOPICS_TABLE => 't'), 'ON' => 'p.topic_id = t.topic_id' ); } $sql_array['WHERE'] = implode(' AND ', $sql_where); $sql_array['GROUP_BY'] = ($group_by) ? (($type == 'posts') ? 'p.post_id' : 'p.topic_id') . ', ' . $sort_by_sql[$sort_key] : ''; $sql_array['ORDER_BY'] = $sql_sort; $sql_array['SELECT'] .= $sort_by_sql[$sort_key] ? ", {$sort_by_sql[$sort_key]}" : ''; unset($sql_where, $sql_sort, $group_by); $sql = $this->db->sql_build_query('SELECT', $sql_array); $result = $this->db->sql_query_limit($sql, $this->config['search_block_size'], $start); while ($row = $this->db->sql_fetchrow($result)) { $id_ary[] = (int) $row[(($type == 'posts') ? 'post_id' : 'topic_id')]; } $this->db->sql_freeresult($result); // If using mysql and the total result count is not calculated yet, get it from the db if (!$total_results && $is_mysql) { $sql_count = str_replace("SELECT {$sql_array['SELECT']}", "SELECT COUNT({$sql_array['SELECT']}) as total_results", $sql); $result = $this->db->sql_query($sql_count); $total_results = $sql_array['GROUP_BY'] ? count($this->db->sql_fetchrowset($result)) : $this->db->sql_fetchfield('total_results'); $this->db->sql_freeresult($result); if (!$total_results) { return false; } } if ($start >= $total_results) { $start = floor(($total_results - 1) / $per_page) * $per_page; $result = $this->db->sql_query_limit($sql, $this->config['search_block_size'], $start); while ($row = $this->db->sql_fetchrow($result)) { $id_ary[] = (int) $row[(($type == 'posts') ? 'post_id' : 'topic_id')]; } $this->db->sql_freeresult($result); } $id_ary = array_unique($id_ary); // store the ids, from start on then delete anything that isn't on the current page because we only need ids for one page $this->save_ids($search_key, $this->search_query, $author_ary, $total_results, $id_ary, $start, $sort_dir); $id_ary = array_slice($id_ary, 0, (int) $per_page); return $total_results; } /** * {@inheritdoc} */ public function author_search(string $type, bool $firstpost_only, array $sort_by_sql, string $sort_key, string $sort_dir, string $sort_days, array $ex_fid_ary, string $post_visibility, int $topic_id, array $author_ary, string $author_name, array &$id_ary, int &$start, int $per_page) { // No author? No posts if (!count($author_ary)) { return 0; } // generate a search_key from all the options to identify the results $search_key_array = array( '', $type, ($firstpost_only) ? 'firstpost' : '', '', '', $sort_days, $sort_key, $topic_id, implode(',', $ex_fid_ary), $post_visibility, implode(',', $author_ary), $author_name, ); /** * Allow changing the search_key for cached results * * @event core.search_native_by_author_modify_search_key * @var array search_key_array Array with search parameters to generate the search_key * @var string type Searching type ('posts', 'topics') * @var boolean firstpost_only Flag indicating if only topic starting posts are considered * @var int sort_days Time, in days, of the oldest possible post to list * @var string sort_key The sort type used from the possible sort types * @var int topic_id Limit the search to this topic_id only * @var array ex_fid_ary Which forums not to search on * @var string post_visibility Post visibility data * @var array author_ary Array of user_id containing the users to filter the results to * @var string author_name The username to search on * @since 3.1.7-RC1 */ $vars = array( 'search_key_array', 'type', 'firstpost_only', 'sort_days', 'sort_key', 'topic_id', 'ex_fid_ary', 'post_visibility', 'author_ary', 'author_name', ); extract($this->phpbb_dispatcher->trigger_event('core.search_native_by_author_modify_search_key', compact($vars))); $search_key = md5(implode('#', $search_key_array)); // try reading the results from cache $total_results = 0; if ($this->obtain_ids($search_key, $total_results, $id_ary, $start, $per_page, $sort_dir) == self::SEARCH_RESULT_IN_CACHE) { return $total_results; } $id_ary = array(); // Create some display specific sql strings if ($author_name) { // first one matches post of registered users, second one guests and deleted users $sql_author = '(' . $this->db->sql_in_set('p.poster_id', array_diff($author_ary, array(ANONYMOUS)), false, true) . ' OR p.post_username ' . $author_name . ')'; } else { $sql_author = $this->db->sql_in_set('p.poster_id', $author_ary); } $sql_fora = (count($ex_fid_ary)) ? ' AND ' . $this->db->sql_in_set('p.forum_id', $ex_fid_ary, true) : ''; $sql_time = ($sort_days) ? ' AND p.post_time >= ' . (time() - ($sort_days * 86400)) : ''; $sql_topic_id = ($topic_id) ? ' AND p.topic_id = ' . (int) $topic_id : ''; $sql_firstpost = ($firstpost_only) ? ' AND p.post_id = t.topic_first_post_id' : ''; $post_visibility = ($post_visibility) ? ' AND ' . $post_visibility : ''; // Build sql strings for sorting $sql_sort = $sort_by_sql[$sort_key] . (($sort_dir == 'a') ? ' ASC' : ' DESC'); $sql_sort_table = $sql_sort_join = ''; switch ($sql_sort[0]) { case 'u': $sql_sort_table = USERS_TABLE . ' u, '; $sql_sort_join = ' AND u.user_id = p.poster_id '; break; case 't': $sql_sort_table = ($type == 'posts' && !$firstpost_only) ? TOPICS_TABLE . ' t, ' : ''; $sql_sort_join = ($type == 'posts' && !$firstpost_only) ? ' AND t.topic_id = p.topic_id ' : ''; break; case 'f': $sql_sort_table = FORUMS_TABLE . ' f, '; $sql_sort_join = ' AND f.forum_id = p.forum_id '; break; } $select = ($type == 'posts') ? 'p.post_id' : 't.topic_id'; $select .= $sort_by_sql[$sort_key] ? ", {$sort_by_sql[$sort_key]}" : ''; $is_mysql = false; /** * Allow changing the query used to search for posts by author in fulltext_native * * @event core.search_native_author_count_query_before * @var int total_results The previous result count for the format of the query. * Set to 0 to force a re-count * @var string type The type of search being made * @var string select SQL SELECT clause for what to get * @var string sql_sort_table CROSS JOIN'ed table to allow doing the sort chosen * @var string sql_sort_join Condition to define how to join the CROSS JOIN'ed table specifyed in sql_sort_table * @var array sql_author SQL WHERE condition for the post author ids * @var int topic_id Limit the search to this topic_id only * @var string sort_by_sql The possible predefined sort types * @var string sort_key The sort type used from the possible sort types * @var string sort_dir "a" for ASC or "d" dor DESC for the sort order used * @var string sql_sort The result SQL when processing sort_by_sql + sort_key + sort_dir * @var string sort_days Time, in days, that the oldest post showing can have * @var string sql_time The SQL to search on the time specifyed by sort_days * @var bool firstpost_only Wether or not to search only on the first post of the topics * @var string sql_firstpost The SQL used in the WHERE claused to filter by firstpost. * @var array ex_fid_ary Forum ids that must not be searched on * @var array sql_fora SQL query for ex_fid_ary * @var int start How many posts to skip in the search results (used for pagination) * @since 3.1.5-RC1 */ $vars = array( 'total_results', 'type', 'select', 'sql_sort_table', 'sql_sort_join', 'sql_author', 'topic_id', 'sort_by_sql', 'sort_key', 'sort_dir', 'sql_sort', 'sort_days', 'sql_time', 'firstpost_only', 'sql_firstpost', 'ex_fid_ary', 'sql_fora', 'start', ); extract($this->phpbb_dispatcher->trigger_event('core.search_native_author_count_query_before', compact($vars))); // If the cache was completely empty count the results if (!$total_results) { switch ($this->db->get_sql_layer()) { case 'mysqli': $is_mysql = true; break; default: if ($type == 'posts') { $sql = 'SELECT COUNT(p.post_id) as total_results FROM ' . POSTS_TABLE . ' p' . (($firstpost_only) ? ', ' . TOPICS_TABLE . ' t ' : ' ') . " WHERE $sql_author $sql_topic_id $sql_firstpost $post_visibility $sql_fora $sql_time"; } else { if ($this->db->get_sql_layer() == 'sqlite3') { $sql = 'SELECT COUNT(topic_id) as total_results FROM (SELECT DISTINCT t.topic_id'; } else { $sql = 'SELECT COUNT(DISTINCT t.topic_id) as total_results'; } $sql .= ' FROM ' . TOPICS_TABLE . ' t, ' . POSTS_TABLE . " p WHERE $sql_author $sql_topic_id $sql_firstpost $post_visibility $sql_fora AND t.topic_id = p.topic_id $sql_time" . ($this->db->get_sql_layer() == 'sqlite3' ? ')' : ''); } $result = $this->db->sql_query($sql); $total_results = (int) $this->db->sql_fetchfield('total_results'); $this->db->sql_freeresult($result); if (!$total_results) { return false; } break; } } // Build the query for really selecting the post_ids if ($type == 'posts') { // For sorting by non-unique columns, add unique sort key to avoid duplicated rows in results $sql_sort .= ', p.post_id' . (($sort_dir == 'a') ? ' ASC' : ' DESC'); $sql = "SELECT $select FROM " . $sql_sort_table . POSTS_TABLE . ' p' . (($firstpost_only) ? ', ' . TOPICS_TABLE . ' t' : '') . " WHERE $sql_author $sql_topic_id $sql_firstpost $post_visibility $sql_fora $sql_sort_join $sql_time ORDER BY $sql_sort"; $field = 'post_id'; } else { $sql = "SELECT $select FROM " . $sql_sort_table . TOPICS_TABLE . ' t, ' . POSTS_TABLE . " p WHERE $sql_author $sql_topic_id $sql_firstpost $post_visibility $sql_fora AND t.topic_id = p.topic_id $sql_sort_join $sql_time GROUP BY t.topic_id, " . $sort_by_sql[$sort_key] . ' ORDER BY ' . $sql_sort; $field = 'topic_id'; } // Only read one block of posts from the db and then cache it $result = $this->db->sql_query_limit($sql, $this->config['search_block_size'], $start); while ($row = $this->db->sql_fetchrow($result)) { $id_ary[] = (int) $row[$field]; } $this->db->sql_freeresult($result); if (!$total_results && $is_mysql) { $sql_count = str_replace("SELECT $select", "SELECT COUNT(*) as total_results", $sql); $result = $this->db->sql_query($sql_count); $total_results = ($type == 'posts') ? (int) $this->db->sql_fetchfield('total_results') : count($this->db->sql_fetchrowset($result)); $this->db->sql_freeresult($result); if (!$total_results) { return false; } } if ($start >= $total_results) { $start = floor(($total_results - 1) / $per_page) * $per_page; $result = $this->db->sql_query_limit($sql, $this->config['search_block_size'], $start); while ($row = $this->db->sql_fetchrow($result)) { $id_ary[] = (int) $row[$field]; } $this->db->sql_freeresult($result); } $id_ary = array_unique($id_ary); if (count($id_ary)) { $this->save_ids($search_key, '', $author_ary, $total_results, $id_ary, $start, $sort_dir); $id_ary = array_slice($id_ary, 0, $per_page); return $total_results; } return false; } /** * {@inheritdoc} */ public function supports_phrase_search(): bool { return false; } /** * {@inheritdoc} */ public function index(string $mode, int $post_id, string &$message, string &$subject, int $poster_id, int $forum_id) { if (!$this->config['fulltext_native_load_upd']) { /** * The search indexer is disabled, return */ return; } // Split old and new post/subject to obtain array of 'words' $split_text = $this->split_message($message); $split_title = $this->split_message($subject); $cur_words = array('post' => array(), 'title' => array()); $words = array(); if ($mode == 'edit') { $words['add']['post'] = array(); $words['add']['title'] = array(); $words['del']['post'] = array(); $words['del']['title'] = array(); $sql = 'SELECT w.word_id, w.word_text, m.title_match FROM ' . $this->search_wordlist_table . ' w, ' . $this->search_wordmatch_table . " m WHERE m.post_id = $post_id AND w.word_id = m.word_id"; $result = $this->db->sql_query($sql); while ($row = $this->db->sql_fetchrow($result)) { $which = ($row['title_match']) ? 'title' : 'post'; $cur_words[$which][$row['word_text']] = $row['word_id']; } $this->db->sql_freeresult($result); $words['add']['post'] = array_diff($split_text, array_keys($cur_words['post'])); $words['add']['title'] = array_diff($split_title, array_keys($cur_words['title'])); $words['del']['post'] = array_diff(array_keys($cur_words['post']), $split_text); $words['del']['title'] = array_diff(array_keys($cur_words['title']), $split_title); } else { $words['add']['post'] = $split_text; $words['add']['title'] = $split_title; $words['del']['post'] = array(); $words['del']['title'] = array(); } /** * Event to modify method arguments and words before the native search index is updated * * @event core.search_native_index_before * @var string mode Contains the post mode: edit, post, reply, quote * @var int post_id The id of the post which is modified/created * @var string message New or updated post content * @var string subject New or updated post subject * @var int poster_id Post author's user id * @var int forum_id The id of the forum in which the post is located * @var array words Grouped lists of words added to or remove from the index * @var array split_text Array of words from the message * @var array split_title Array of words from the title * @var array cur_words Array of words currently in the index for comparing to new words * when mode is edit. Empty for other modes. * @since 3.2.3-RC1 */ $vars = array( 'mode', 'post_id', 'message', 'subject', 'poster_id', 'forum_id', 'words', 'split_text', 'split_title', 'cur_words', ); extract($this->phpbb_dispatcher->trigger_event('core.search_native_index_before', compact($vars))); unset($split_text); unset($split_title); // Get unique words from the above arrays $unique_add_words = array_unique(array_merge($words['add']['post'], $words['add']['title'])); // We now have unique arrays of all words to be added and removed and // individual arrays of added and removed words for text and title. What // we need to do now is add the new words (if they don't already exist) // and then add (or remove) matches between the words and this post if (count($unique_add_words)) { $sql = 'SELECT word_id, word_text FROM ' . $this->search_wordlist_table . ' WHERE ' . $this->db->sql_in_set('word_text', $unique_add_words); $result = $this->db->sql_query($sql); $word_ids = array(); while ($row = $this->db->sql_fetchrow($result)) { $word_ids[$row['word_text']] = $row['word_id']; } $this->db->sql_freeresult($result); $new_words = array_diff($unique_add_words, array_keys($word_ids)); $this->db->sql_transaction('begin'); if (count($new_words)) { $sql_ary = array(); foreach ($new_words as $word) { $sql_ary[] = array('word_text' => (string) $word, 'word_count' => 0); } $this->db->sql_return_on_error(true); $this->db->sql_multi_insert($this->search_wordlist_table, $sql_ary); $this->db->sql_return_on_error(false); } unset($new_words, $sql_ary); } else { $this->db->sql_transaction('begin'); } // now update the search match table, remove links to removed words and add links to new words foreach ($words['del'] as $word_in => $word_ary) { $title_match = ($word_in == 'title') ? 1 : 0; if (count($word_ary)) { $sql_in = array(); foreach ($word_ary as $word) { $sql_in[] = $cur_words[$word_in][$word]; } $sql = 'DELETE FROM ' . $this->search_wordmatch_table . ' WHERE ' . $this->db->sql_in_set('word_id', $sql_in) . ' AND post_id = ' . intval($post_id) . " AND title_match = $title_match"; $this->db->sql_query($sql); $sql = 'UPDATE ' . $this->search_wordlist_table . ' SET word_count = word_count - 1 WHERE ' . $this->db->sql_in_set('word_id', $sql_in) . ' AND word_count > 0'; $this->db->sql_query($sql); unset($sql_in); } } $this->db->sql_return_on_error(true); foreach ($words['add'] as $word_in => $word_ary) { $title_match = ($word_in == 'title') ? 1 : 0; if (count($word_ary)) { $sql = 'INSERT INTO ' . $this->search_wordmatch_table . ' (post_id, word_id, title_match) SELECT ' . (int) $post_id . ', word_id, ' . (int) $title_match . ' FROM ' . $this->search_wordlist_table . ' WHERE ' . $this->db->sql_in_set('word_text', $word_ary); $this->db->sql_query($sql); $sql = 'UPDATE ' . $this->search_wordlist_table . ' SET word_count = word_count + 1 WHERE ' . $this->db->sql_in_set('word_text', $word_ary); $this->db->sql_query($sql); } } $this->db->sql_return_on_error(false); $this->db->sql_transaction('commit'); // destroy cached search results containing any of the words removed or added $this->destroy_cache(array_unique(array_merge($words['add']['post'], $words['add']['title'], $words['del']['post'], $words['del']['title'])), array($poster_id)); unset($unique_add_words); unset($words); unset($cur_words); } /** * {@inheritdoc} */ public function index_remove(array $post_ids, array $author_ids, array $forum_ids): void { if (count($post_ids)) { $sql = 'SELECT w.word_id, w.word_text, m.title_match FROM ' . $this->search_wordmatch_table . ' m, ' . $this->search_wordlist_table . ' w WHERE ' . $this->db->sql_in_set('m.post_id', $post_ids) . ' AND w.word_id = m.word_id'; $result = $this->db->sql_query($sql); $message_word_ids = $title_word_ids = $word_texts = array(); while ($row = $this->db->sql_fetchrow($result)) { if ($row['title_match']) { $title_word_ids[] = $row['word_id']; } else { $message_word_ids[] = $row['word_id']; } $word_texts[] = $row['word_text']; } $this->db->sql_freeresult($result); if (count($title_word_ids)) { $sql = 'UPDATE ' . $this->search_wordlist_table . ' SET word_count = word_count - 1 WHERE ' . $this->db->sql_in_set('word_id', $title_word_ids) . ' AND word_count > 0'; $this->db->sql_query($sql); } if (count($message_word_ids)) { $sql = 'UPDATE ' . $this->search_wordlist_table . ' SET word_count = word_count - 1 WHERE ' . $this->db->sql_in_set('word_id', $message_word_ids) . ' AND word_count > 0'; $this->db->sql_query($sql); } unset($title_word_ids); unset($message_word_ids); $sql = 'DELETE FROM ' . $this->search_wordmatch_table . ' WHERE ' . $this->db->sql_in_set('post_id', $post_ids); $this->db->sql_query($sql); } $this->destroy_cache(array_unique($word_texts), array_unique($author_ids)); } /** * {@inheritdoc} */ public function tidy(): void { // Is the fulltext indexer disabled? If yes then we need not // carry on ... it's okay ... I know when I'm not wanted boo hoo if (!$this->config['fulltext_native_load_upd']) { $this->config->set('search_last_gc', time(), false); return; } $destroy_cache_words = array(); // Remove common words if ($this->config['num_posts'] >= 100 && $this->config['fulltext_native_common_thres']) { $common_threshold = ((double) $this->config['fulltext_native_common_thres']) / 100.0; // First, get the IDs of common words $sql = 'SELECT word_id, word_text FROM ' . $this->search_wordlist_table . ' WHERE word_count > ' . floor($this->config['num_posts'] * $common_threshold) . ' OR word_common = 1'; $result = $this->db->sql_query($sql); $sql_in = array(); while ($row = $this->db->sql_fetchrow($result)) { $sql_in[] = $row['word_id']; $destroy_cache_words[] = $row['word_text']; } $this->db->sql_freeresult($result); if (count($sql_in)) { // Flag the words $sql = 'UPDATE ' . $this->search_wordlist_table . ' SET word_common = 1 WHERE ' . $this->db->sql_in_set('word_id', $sql_in); $this->db->sql_query($sql); // by setting search_last_gc to the new time here we make sure that if a user reloads because the // following query takes too long, he won't run into it again $this->config->set('search_last_gc', time(), false); // Delete the matches $sql = 'DELETE FROM ' . $this->search_wordmatch_table . ' WHERE ' . $this->db->sql_in_set('word_id', $sql_in); $this->db->sql_query($sql); } unset($sql_in); } if (count($destroy_cache_words)) { // destroy cached search results containing any of the words that are now common or were removed $this->destroy_cache(array_unique($destroy_cache_words)); } $this->config->set('search_last_gc', time(), false); } // create_index is inherited from base.php /** * {@inheritdoc} */ public function delete_index(int|null &$post_counter = null): array|null { $truncate_tables = [ $this->search_wordlist_table, $this->search_wordmatch_table, $this->search_results_table, ]; $stats = $this->stats; /** * Event to modify SQL queries before the native search index is deleted * * @event core.search_native_delete_index_before * * @var array stats Array with statistics of the current index (read only) * @var array truncate_tables Array with tables that will be truncated * * @since 3.2.3-RC1 * @changed 4.0.0-a1 Removed sql_queries, only add/remove tables to truncate to truncate_tables */ $vars = array( 'stats', 'truncate_tables', ); extract($this->phpbb_dispatcher->trigger_event('core.search_native_delete_index_before', compact($vars))); foreach ($truncate_tables as $table) { $this->db_tools->sql_truncate_table($table); } return null; } /** * {@inheritdoc} */ public function index_created(): bool { if (!count($this->stats)) { $this->get_stats(); } return $this->stats['total_words'] && $this->stats['total_matches']; } /** * {@inheritdoc} */ public function index_stats() { if (!count($this->stats)) { $this->get_stats(); } return array( $this->language->lang('TOTAL_WORDS') => $this->stats['total_words'], $this->language->lang('TOTAL_MATCHES') => $this->stats['total_matches']); } /** * Computes the stats and store them in the $this->stats associative array */ protected function get_stats() { $this->stats['total_words'] = $this->db->get_estimated_row_count($this->search_wordlist_table); $this->stats['total_matches'] = $this->db->get_estimated_row_count($this->search_wordmatch_table); } /** * Split a text into words of a given length * * The text is converted to UTF-8, cleaned up, and split. Then, words that * conform to the defined length range are returned in an array. * * NOTE: duplicates are NOT removed from the return array * * @param string $text Text to split, encoded in UTF-8 * @return array Array of UTF-8 words */ protected function split_message($text) { $match = $words = array(); /** * Taken from the original code */ // Do not index code $match[] = '#\[code(?:=.*?)?(\:?[0-9a-z]{5,})\].*?\[\/code(\:?[0-9a-z]{5,})\]#is'; // BBcode $match[] = '#\[\/?[a-z0-9\*\+\-]+(?:=.*?)?(?::[a-z])?(\:?[0-9a-z]{5,})\]#'; $min = $this->word_length['min']; $isset_min = $min - 1; /** * Clean up the string, remove HTML tags, remove BBCodes */ $word = strtok($this->cleanup(preg_replace($match, ' ', strip_tags($text)), '-1'), ' '); while (strlen($word)) { if (strlen($word) > 255 || strlen($word) <= $isset_min) { /** * Words longer than 255 bytes are ignored. This will have to be * changed whenever we change the length of search_wordlist.word_text * * Words shorter than $isset_min bytes are ignored, too */ $word = strtok(' '); continue; } $len = utf8_strlen($word); /** * Test whether the word is too short to be indexed. * * Note that this limit does NOT apply to CJK and Hangul */ if ($len < $min) { /** * Note: this could be optimized. If the codepoint is lower than Hangul's range * we know that it will also be lower than CJK ranges */ if ((strncmp($word, self::UTF8_HANGUL_FIRST, 3) < 0 || strncmp($word, self::UTF8_HANGUL_LAST, 3) > 0) && (strncmp($word, self::UTF8_CJK_FIRST, 3) < 0 || strncmp($word, self::UTF8_CJK_LAST, 3) > 0) && (strncmp($word, self::UTF8_CJK_B_FIRST, 4) < 0 || strncmp($word, self::UTF8_CJK_B_LAST, 4) > 0)) { $word = strtok(' '); continue; } } $words[] = $word; $word = strtok(' '); } return $words; } /** * Clean up a text to remove non-alphanumeric characters * * This method receives a UTF-8 string, normalizes and validates it, replaces all * non-alphanumeric characters with strings then returns the result. * * Any number of "allowed chars" can be passed as a UTF-8 string in NFC. * * @param string $text Text to split, in UTF-8 (not normalized or sanitized) * @param string $allowed_chars String of special chars to allow * @param string $encoding Text encoding * @return string Cleaned up text, only alphanumeric chars are left */ protected function cleanup($text, $allowed_chars = null, $encoding = 'utf-8') { static $conv = array(), $conv_loaded = array(); $allow = array(); // Convert the text to UTF-8 $encoding = strtolower($encoding); if ($encoding != 'utf-8') { $text = utf8_recode($text, $encoding); } $utf_len_mask = array( "\xC0" => 2, "\xD0" => 2, "\xE0" => 3, "\xF0" => 4 ); /** * Replace HTML entities and NCRs */ $text = html_entity_decode(utf8_decode_ncr($text), ENT_QUOTES); /** * Normalize to NFC */ $text = \Normalizer::normalize($text); /** * The first thing we do is: * * - convert ASCII-7 letters to lowercase * - remove the ASCII-7 non-alpha characters * - remove the bytes that should not appear in a valid UTF-8 string: 0xC0, * 0xC1 and 0xF5-0xFF * * @todo in theory, the third one is already taken care of during normalization and those chars should have been replaced by Unicode replacement chars */ $sb_match = "ISTCPAMELRDOJBNHFGVWUQKYXZ\r\n\t!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~\x00\x01\x02\x03\x04\x05\x06\x07\x08\x0B\x0C\x0E\x0F\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1A\x1B\x1C\x1D\x1E\x1F\xC0\xC1\xF5\xF6\xF7\xF8\xF9\xFA\xFB\xFC\xFD\xFE\xFF"; $sb_replace = 'istcpamelrdojbnhfgvwuqkyxz '; /** * This is the list of legal ASCII chars, it is automatically extended * with ASCII chars from $allowed_chars */ $legal_ascii = ' eaisntroludcpmghbfvq10xy2j9kw354867z'; /** * Prepare an array containing the extra chars to allow */ if (isset($allowed_chars[0])) { $pos = 0; $len = strlen($allowed_chars); do { $c = $allowed_chars[$pos]; if ($c < "\x80") { /** * ASCII char */ $sb_pos = strpos($sb_match, $c); if (is_int($sb_pos)) { /** * Remove the char from $sb_match and its corresponding * replacement in $sb_replace */ $sb_match = substr($sb_match, 0, $sb_pos) . substr($sb_match, $sb_pos + 1); $sb_replace = substr($sb_replace, 0, $sb_pos) . substr($sb_replace, $sb_pos + 1); $legal_ascii .= $c; } ++$pos; } else { /** * UTF-8 char */ $utf_len = $utf_len_mask[$c & "\xF0"]; $allow[substr($allowed_chars, $pos, $utf_len)] = 1; $pos += $utf_len; } } while ($pos < $len); } $text = strtr($text, $sb_match, $sb_replace); $ret = ''; $pos = 0; $len = strlen($text); do { /** * Do all consecutive ASCII chars at once */ if ($spn = strspn($text, $legal_ascii, $pos)) { $ret .= substr($text, $pos, $spn); $pos += $spn; } if ($pos >= $len) { return $ret; } /** * Capture the UTF char */ $utf_len = $utf_len_mask[$text[$pos] & "\xF0"]; $utf_char = substr($text, $pos, $utf_len); $pos += $utf_len; if (($utf_char >= self::UTF8_HANGUL_FIRST && $utf_char <= self::UTF8_HANGUL_LAST) || ($utf_char >= self::UTF8_CJK_FIRST && $utf_char <= self::UTF8_CJK_LAST) || ($utf_char >= self::UTF8_CJK_B_FIRST && $utf_char <= self::UTF8_CJK_B_LAST)) { /** * All characters within these ranges are valid * * We separate them with a space in order to index each character * individually */ $ret .= ' ' . $utf_char . ' '; continue; } if (isset($allow[$utf_char])) { /** * The char is explicitly allowed */ $ret .= $utf_char; continue; } if (isset($conv[$utf_char])) { /** * The char is mapped to something, maybe to itself actually */ $ret .= $conv[$utf_char]; continue; } /** * The char isn't mapped, but did we load its conversion table? * * The search indexer table is split into blocks. The block number of * each char is equal to its codepoint right-shifted for 11 bits. It * means that out of the 11, 16 or 21 meaningful bits of a 2-, 3- or * 4- byte sequence we only keep the leftmost 0, 5 or 10 bits. Thus, * all UTF chars encoded in 2 bytes are in the same first block. */ if (isset($utf_char[2])) { if (isset($utf_char[3])) { /** * 1111 0nnn 10nn nnnn 10nx xxxx 10xx xxxx * 0000 0111 0011 1111 0010 0000 */ $idx = ((ord($utf_char[0]) & 0x07) << 7) | ((ord($utf_char[1]) & 0x3F) << 1) | ((ord($utf_char[2]) & 0x20) >> 5); } else { /** * 1110 nnnn 10nx xxxx 10xx xxxx * 0000 0111 0010 0000 */ $idx = ((ord($utf_char[0]) & 0x07) << 1) | ((ord($utf_char[1]) & 0x20) >> 5); } } else { /** * 110x xxxx 10xx xxxx * 0000 0000 0000 0000 */ $idx = 0; } /** * Check if the required conv table has been loaded already */ if (!isset($conv_loaded[$idx])) { $conv_loaded[$idx] = 1; $file = $this->phpbb_root_path . 'includes/utf/data/search_indexer_' . $idx . '.' . $this->php_ext; if (file_exists($file)) { $conv += include($file); } } if (isset($conv[$utf_char])) { $ret .= $conv[$utf_char]; } else { /** * We add an entry to the conversion table so that we * don't have to convert to codepoint and perform the checks * that are above this block */ $conv[$utf_char] = ' '; $ret .= ' '; } } while (1); return $ret; } /** * {@inheritdoc} */ public function get_acp_options(): array { /** * if we need any options, copied from fulltext_native for now, will have to be adjusted or removed */ $tpl = '