From eeb9c387cc4b2f131e3a98afa255992042d9904e Mon Sep 17 00:00:00 2001 From: rvelices Date: Fri, 21 Mar 2014 05:16:35 +0000 Subject: bug 3056: Improve/rewrite quick search engine: by default AND is used to match all entered terms, OR operator, grouping using brackets () still work in progress git-svn-id: http://piwigo.org/svn/trunk@27868 68402e56-0260-453c-a942-63ccdbb3a9ee --- include/functions_search.inc.php | 623 +++++++++++++++++++++------------------ 1 file changed, 338 insertions(+), 285 deletions(-) (limited to 'include/functions_search.inc.php') diff --git a/include/functions_search.inc.php b/include/functions_search.inc.php index 9cf50d602..c67cfdf0b 100644 --- a/include/functions_search.inc.php +++ b/include/functions_search.inc.php @@ -291,8 +291,9 @@ function is_odd_wbreak_end($ch) define('QST_QUOTED', 0x01); define('QST_NOT', 0x02); -define('QST_WILDCARD_BEGIN', 0x04); -define('QST_WILDCARD_END', 0x08); +define('QST_OR', 0x04); +define('QST_WILDCARD_BEGIN', 0x08); +define('QST_WILDCARD_END', 0x10); define('QST_WILDCARD', QST_WILDCARD_BEGIN|QST_WILDCARD_END); /** @@ -302,143 +303,275 @@ define('QST_WILDCARD', QST_WILDCARD_BEGIN|QST_WILDCARD_END); * The query can contain a phrase: 'Pierre "New York"' will return 'pierre' qnd 'new york'. * * @param string $q - * @param array &$qtokens - * @param array &$qtoken_modifiers */ -function analyse_qsearch($q, &$qtokens, &$qtoken_modifiers) + +class QSingleToken +{ + var $is_single = true; + var $token; + var $idx; + + function __construct($token) + { + $this->token = $token; + } +} + +class QMultiToken { - $q = stripslashes($q); - $tokens = array(); - $token_modifiers = array(); - $crt_token = ""; - $crt_token_modifier = 0; + var $is_single = false; + var $tokens = array(); + var $token_modifiers = array(); - for ($i=0; $itokens); $i++) { - if ($ch=='"') + $modifier = $this->token_modifiers[$i]; + if ($i) + $s .= ' '; + if ($modifier & QST_OR) + $s .= 'OR '; + if ($modifier & QST_NOT) + $s .= 'NOT '; + if ($modifier & QST_WILDCARD_BEGIN) + $s .= '*'; + if ($modifier & QST_QUOTED) + $s .= '"'; + if (! ($this->tokens[$i]->is_single) ) + { + $s .= '('; + $s .= $this->tokens[$i]; + $s .= ')'; + } + else + { + $s .= $this->tokens[$i]->token; + } + if ($modifier & QST_QUOTED) + $s .= '"'; + if ($modifier & QST_WILDCARD_END) + $s .= '*'; + + } + return $s; + } + + function push(&$token, &$modifier) + { + $this->tokens[] = new QSingleToken($token); + $this->token_modifiers[] = $modifier; + $token = ""; + $modifier = 0; + } + + protected function parse_expression($q, &$qi, $level) + { + $crt_token = ""; + $crt_modifier = 0; + + for ($stop=false; !$stop && $qi<~')==0 ) - { //special full text modifier - if (strlen($crt_token)) - { - $crt_token .= $ch; - } - else - { - if ( $ch=='*' ) - $crt_token_modifier |= QST_WILDCARD_BEGIN; - if ( $ch=='-' ) - $crt_token_modifier |= QST_NOT; - } + case '(': + if (strlen($crt_token)) + $this->push($crt_token, $crt_modifier); + $sub = new QMultiToken; + $qi++; + $sub->parse_expression($q, $qi, $level+1); + $this->tokens[] = $sub; + $this->token_modifiers[] = $crt_modifier; + $crt_modifier = 0; + break; + case ')': + if ($level>0) + $stop = true; + break; + case '"': + if (strlen($crt_token)) + $this->push($crt_token, $crt_modifier); + $crt_modifier |= QST_QUOTED; + break; + case '-': + if (strlen($crt_token)) + $crt_token .= $ch; + else + $crt_modifier |= QST_NOT; + break; + case '*': + if (strlen($crt_token)) + $crt_token .= $ch; // wildcard end later + else + $crt_modifier |= QST_WILDCARD_BEGIN; + break; + default: + if (preg_match('/[\s,.;!\?]+/', $ch)) + { // white space + if (strlen($crt_token)) + $this->push($crt_token, $crt_modifier); + $crt_modifier = 0; + } + else + $crt_token .= $ch; + break; } - elseif (preg_match('/[\s,.;!\?]+/', $ch)) - { // white space - if (strlen($crt_token)) + } + else + {// quoted + if ($ch=='"') + { + if ($qi+1 < strlen($q) && $q[$qi+1]=='*') { - $tokens[] = $crt_token; $token_modifiers[] = $crt_token_modifier; - $crt_token = ""; + $crt_modifier |= QST_WILDCARD_END; + $ai++; } - $crt_token_modifier = 0; + $this->push($crt_token, $crt_modifier); } else - { $crt_token .= $ch; - } + } } - else // qualified with quotes + + if (strlen($crt_token)) + $this->push($crt_token, $crt_modifier); + + for ($i=0; $itokens); $i++) { - if ($ch=='"') + $token = $this->tokens[$i]; + $remove = false; + if ($token->is_single) { - if ($i+1 < strlen($q) && $q[$i+1]=='*') + if ( ($this->token_modifiers[$i]&QST_QUOTED)==0 ) { - $crt_token_modifier |= QST_WILDCARD_END; - $i++; + if ('not' == strtolower($token->token)) + { + if ($i+1 < count($this->tokens)) + $this->token_modifiers[$i+1] |= QST_NOT; + $token->token = ""; + } + if ('or' == strtolower($token->token)) + { + if ($i+1 < count($this->tokens)) + $this->token_modifiers[$i+1] |= QST_OR; + $token->token = ""; + } + if ('and' == strtolower($token->token)) + { + $token->token = ""; + } + if ( substr($token->token, -1)=='*' ) + { + $token->token = rtrim($token->token, '*'); + $this->token_modifiers[$i] |= QST_WILDCARD_END; + } } - $tokens[] = $crt_token; $token_modifiers[] = $crt_token_modifier; - $crt_token = ""; $crt_token_modifier = 0; - $state=0; + if (!strlen($token->token)) + $remove = true; } else - $crt_token .= $ch; + { + if (!count($token->tokens)) + $remove = true; + } + if ($remove) + { + array_splice($this->tokens, $i, 1); + array_splice($this->token_modifiers, $i, 1); + $i--; + } } } +} + +class QExpression extends QMultiToken +{ + var $stokens = array(); + var $stoken_modifiers = array(); - if (strlen($crt_token)) + function __construct($q) { - $tokens[] = $crt_token; - $token_modifiers[] = $crt_token_modifier; + $i = 0; + $this->parse_expression($q, $i, 0); + //@TODO: manipulate the tree so that 'a OR b c' is the same as 'b c OR a' + $this->build_single_tokens($this); } - $qtokens = array(); - $qtoken_modifiers = array(); - for ($i=0; $itokens); $i++) { - if ( substr($tokens[$i], -1)=='*' ) + $token = $expr->tokens[$i]; + if ($token->is_single) { - $tokens[$i] = rtrim($tokens[$i], '*'); - $token_modifiers[$i] |= QST_WILDCARD_END; + $token->idx = count($this->stokens); + $this->stokens[] = $token->token; + $this->stoken_modifiers[] = $expr->token_modifiers[$i]; } + else + $this->build_single_tokens($token); } - if ( strlen($tokens[$i])==0) - continue; - $qtokens[] = $tokens[$i]; - $qtoken_modifiers[] = $token_modifiers[$i]; } } -/** - * Returns the LIKE SQL clause corresponding to the quick search query - * that has been split into tokens. - * for example file LIKE '%john%' OR file LIKE '%bill%'. - * - * @param array $tokens - * @param array $token_modifiers - * @param string $field - * @return string|null - */ -function get_qsearch_like_clause($tokens, $token_modifiers, $field) +class QResults { - $clauses = array(); - for ($i=0; $iimages_iids = array_fill(0, count($expr->tokens), array()); + $query_base = 'SELECT id from '.IMAGES_TABLE.' i WHERE '; + for ($i=0; $istokens); $i++) { - $token = trim($tokens[$i], '%'); - if ($token_modifiers[$i]&QST_NOT) - continue; - if ( strlen($token)==0 ) - continue; - $token = addslashes($token); - $token = str_replace( array('%','_'), array('\\%','\\_'), $token); // escape LIKE specials %_ - $clauses[] = $field.' LIKE \'%'.$token.'%\''; - } + $token = $expr->stokens[$i]; + $clauses = array(); - return count($clauses) ? '('.implode(' OR ', $clauses).')' : null; + $like = addslashes($token); + $like = str_replace( array('%','_'), array('\\%','\\_'), $like); // escape LIKE specials %_ + $clauses[] = 'CONVERT(file, CHAR) LIKE \'%'.$like.'%\''; + + if (strlen($token)>3) // default minimum full text index + { + $ft = $token; + if ($expr->stoken_modifiers[$i] & QST_QUOTED) + $ft = '"'.$ft.'"'; + if ($expr->stoken_modifiers[$i] & QST_WILDCARD_END) + $ft .= '*'; + $clauses[] = 'MATCH(i.name, i.comment) AGAINST( \''.addslashes($ft).'\' IN BOOLEAN MODE)'; + } + else + { + foreach( array('i.name', 'i.comment') as $field) + { + $clauses[] = $field.' LIKE \''.$like.' %\''; + $clauses[] = $field.' LIKE \'% '.$like.'\''; + $clauses[] = $field.' LIKE \'% '.$like.' %\''; + } + } + $query = $query_base.'('.implode(' OR ', $clauses).')'; + $qsr->images_iids[$i] = query2array($query,null,'id'); + } } -/** - * Returns tags corresponding to the quick search query that has been split into tokens. - * - * @param array $tokens - * @param array $token_modifiers - * @param array &$token_tag_ids - * @param array &$not_tag_ids - * @param array &$all_tags - */ -function get_qsearch_tags($tokens, $token_modifiers, &$token_tag_ids, &$not_tag_ids, &$all_tags) +function qsearch_get_tags(QExpression $expr, QResults $qsr) { + $tokens = $expr->stokens; + $token_modifiers = $expr->stoken_modifiers; + $token_tag_ids = array_fill(0, count($tokens), array() ); - $not_tag_ids = $all_tags = array(); + $all_tags = array(); $token_tag_scores = $token_tag_ids; $transliterated_tokens = array(); @@ -531,65 +664,111 @@ SELECT t.*, COUNT(image_id) AS counter } } - // process not tags + // process tags + $not_tag_ids = array(); for ($i=0; $i0 && $token_tag_scores[$i][$j] < $token_tag_scores[$i][0]) - break; - $tag_id = $token_tag_ids[$i][$j]; - if ( isset($all_tags[$tag_id]) ) + if ($is_not) + { + if ($token_tag_scores[$i][$j] < 0.8 || + ($j>0 && $token_tag_scores[$i][$j] < $token_tag_scores[$i][0]) ) + { + array_splice($token_tag_scores[$i], $j); + array_splice($token_tag_ids[$i], $j); + } + } + else { - unset($all_tags[$tag_id]); - $not_tag_ids[] = $tag_id; + $tag_id = $token_tag_ids[$i][$j]; + $counter += $all_tags[$tag_id]['counter']; + if ($counter > 200 && $j>0 && $token_tag_scores[$i][0] > $token_tag_scores[$i][$j] ) + {// "many" images in previous tags and starting from this tag is less relevent + array_splice($token_tag_ids[$i], $j); + array_splice($token_tag_scores[$i], $j); + break; + } } } - $token_tag_ids[$i] = array(); + + if ($is_not) + { + $not_tag_ids = array_merge($not_tag_ids, $token_tag_ids[$i]); + } + } + + $all_tags = array_diff_key($all_tags, array_flip($not_tag_ids)); + usort($all_tags, 'tag_alpha_compare'); + foreach ( $all_tags as &$tag ) + { + $tag['name'] = trigger_event('render_tag_name', $tag['name'], $tag); } + $qsr->all_tags = $all_tags; + + $qsr->tag_ids = $token_tag_ids; + $qsr->tag_iids = array_fill(0, count($tokens), array() ); - // process regular tags for ($i=0; $itag_iids[$i] = query2array($query, null, 'image_id'); + } + } +} - $counter = 0; - for ($j=0; $jtokens); $i++) + { + $current = $crt_expr->tokens[$i]; + if ($current->is_single) { - $tag_id = $token_tag_ids[$i][$j]; - if ( ! isset($all_tags[$tag_id]) ) - { - array_splice($token_tag_ids[$i], $j, 1); - array_splice($token_tag_scores[$i], $j, 1); - $j--; - continue; - } + $crt_ids = $qsr->iids[$current->idx] = array_unique( array_merge($qsr->images_iids[$current->idx], $qsr->tag_iids[$current->idx]) ); + } + else + $crt_ids = qsearch_eval($expr, $qsr, $current); + $modifier = $crt_expr->token_modifiers[$i]; - $counter += $all_tags[$tag_id]['counter']; - if ($counter > 200 && $j>0 && $token_tag_scores[$i][0] > $token_tag_scores[$i][$j] ) - {// "many" images in previous tags and starting from this tag is less relevent - array_splice($token_tag_ids[$i], $j); - array_splice($token_tag_scores[$i], $j); - break; + if ($modifier & QST_NOT) + $not_ids = array_unique( array_merge($not_ids, $crt_ids)); + else + { + if ($modifier & QST_OR) + $ids = array_unique( array_merge($ids, $crt_ids) ); + else + { + if ($current->is_single && empty($crt_ids)) + { + //@TODO: mark this term as unmatched and tell users + //@TODO: if we don't find a term at all, maybe ignore it and produce some results + } + if ($first) + $ids = $crt_ids; + else + $ids = array_intersect($ids, $crt_ids); + $first= false; } } } - usort($all_tags, 'tag_alpha_compare'); - foreach ( $all_tags as &$tag ) - { - $tag['name'] = trigger_event('render_tag_name', $tag['name'], $tag); - } + if (count($not_ids)) + $ids = array_diff($ids, $not_ids); + return $ids; } /** @@ -620,144 +799,36 @@ function get_quick_search_results($q, $super_order_by, $images_where='') 'qs' => array('q'=>stripslashes($q)), ); $q = trim($q); - analyse_qsearch($q, $tokens, $token_modifiers); - if (count($tokens)==0) - { - return $search_results; - } - $debug[] = ''; + $template->append('footer_elements', implode("\n", $debug) ); return $search_results; } - if (!empty($not_tag_ids)) - { - $query = ' -SELECT image_id FROM '.IMAGE_TAG_TABLE.' - WHERE tag_id IN ('.implode(',',$not_tag_ids).') - GROUP BY image_id'; - $result = pwg_query($query); - while ($row = pwg_db_fetch_row($result)) - { - $id = $row[0]; - unset($by_weights[$id]); - } - $debug[] = count($by_weights).' after not tags'; - } - // Step 4 - now we have $by_weights ( array image id => weight ) that need - // permission checks and/or matching categories to get images from $where_clauses = array(); - if ( !empty($by_weights) ) - { - $where_clauses[]='i.id IN (' - . implode(',', array_keys($by_weights)) . ')'; - } - if ( !empty($search_results['qs']['matching_cats']) ) - { - $where_clauses[]='category_id IN ('. - implode(',',array_keys($search_results['qs']['matching_cats'])).')'; - } - $where_clauses = array( '('.implode("\n OR ",$where_clauses).')' ); + $where_clauses[]='i.id IN ('. implode(',', $ids) . ')'; if (!empty($images_where)) { $where_clauses[]='('.$images_where.')'; @@ -779,30 +850,12 @@ SELECT DISTINCT(id) WHERE '.implode("\n AND ", $where_clauses)."\n". $conf['order_by']; - $allowed_images = array_from_query( $query, 'id'); - - $debug[] = count($allowed_images).' final photo count -->'; - global $template; - $template->append('footer_elements', implode(', ', $debug) ); - - if ( $super_order_by or empty($by_weights) ) - { - $search_results['items'] = $allowed_images; - return $search_results; - } + $ids = query2array($query, null, 'id'); - $allowed_images = array_flip( $allowed_images ); - $divisor = 5.0 * count($allowed_images); - foreach ($allowed_images as $id=> &$rank ) - { - $weight = isset($by_weights[$id]) ? $by_weights[$id] : 1; - $weight -= $rank/$divisor; - $rank = $weight; - } - unset($rank); + $debug[] = count($ids).' final photo count -->'; + $template->append('footer_elements', implode("\n", $debug) ); - arsort($allowed_images, SORT_NUMERIC); - $search_results['items'] = array_keys($allowed_images); + $search_results['items'] = $ids; return $search_results; } -- cgit v1.2.3