From 42324452602d2d53a560af5b328e97ed45d0e69a Mon Sep 17 00:00:00 2001 From: rvelices Date: Tue, 12 Apr 2011 19:46:36 +0000 Subject: feature:2248 Improve quick/query search results git-svn-id: http://piwigo.org/svn/trunk@10340 68402e56-0260-453c-a942-63ccdbb3a9ee --- include/functions_search.inc.php | 246 +++++++++++++++++++++++++++++---------- 1 file changed, 187 insertions(+), 59 deletions(-) (limited to 'include/functions_search.inc.php') diff --git a/include/functions_search.inc.php b/include/functions_search.inc.php index 220ee0d4e..b25d41050 100644 --- a/include/functions_search.inc.php +++ b/include/functions_search.inc.php @@ -265,17 +265,34 @@ SELECT DISTINCT(id) return $items; } + +if (function_exists('mb_strtolower')) +{ + function transliterate($term) + { + return remove_accents( mb_strtolower($term) ); + } +} +else +{ + function transliterate($term) + { + return remove_accents( strtolower($term) ); + } +} + +function is_word_char($ch) +{ + return ($ch>='0' && $ch<='9') || ($ch>='a' && $ch<='z') || ($ch>='A' && $ch<='Z') || ord($ch)>127; +} + /** - * returns the LIKE sql clause corresponding to the quick search query $q - * and the field $field. example q='john bill', field='file' will return - * file LIKE '%john%' OR file LIKE '%bill%'. Special characters for MySql full - * text search (+,<,>,~) are omitted. The query can contain a phrase: - * 'Pierre "New York"' will return LIKE '%Pierre%' OR LIKE '%New York%'. - * @param string q - * @param string field - * @return string + * analyzes and splits the quick/query search query $q into tokens + * q='john bill' => 2 tokens 'john' 'bill' + * Special characters for MySql full text search (+,<,>,~) appear in the token modifiers. + * The query can contain a phrase: 'Pierre "New York"' will return 'pierre' and 'new york'. 
*/ -function get_qsearch_like_clause($q, $field, $before='%', $after='%') +function analyse_qsearch($q, &$qtokens, &$qtoken_modifiers) { $q = stripslashes($q); $tokens = array(); @@ -292,27 +309,27 @@ function get_qsearch_like_clause($q, $field, $before='%', $after='%') case 0: if ($ch=='"') { - if (strlen($crt_token)) - { - $tokens[] = $crt_token; - $token_modifiers[] = $crt_token_modifier; - $crt_token = ""; - $crt_token_modifier = ""; - } + $tokens[] = $crt_token; $token_modifiers[] = $crt_token_modifier; + $crt_token = ""; $crt_token_modifier = "q"; $state=1; } elseif ( $ch=='*' ) { // wild card - $crt_token .= '%'; + if (strlen($crt_token)) + { + $crt_token .= $ch; + } + else + { + $crt_token_modifier .= '*'; + } } elseif ( strcspn($ch, '+-><~')==0 ) { //special full text modifier if (strlen($crt_token)) { - $tokens[] = $crt_token; - $token_modifiers[] = $crt_token_modifier; - $crt_token = ""; - $crt_token_modifier = ""; + $tokens[] = $crt_token; $token_modifiers[] = $crt_token_modifier; + $crt_token = ""; $crt_token_modifier = ""; } $crt_token_modifier .= $ch; } @@ -320,18 +337,12 @@ function get_qsearch_like_clause($q, $field, $before='%', $after='%') { // white space if (strlen($crt_token)) { - $tokens[] = $crt_token; - $token_modifiers[] = $crt_token_modifier; - $crt_token = ""; - $crt_token_modifier = ""; + $tokens[] = $crt_token; $token_modifiers[] = $crt_token_modifier; + $crt_token = ""; $crt_token_modifier = ""; } } else { - if ( strcspn($ch, '%_')==0) - {// escape LIKE specials %_ - $ch = '\\'.$ch; - } $crt_token .= $ch; } break; @@ -339,17 +350,11 @@ function get_qsearch_like_clause($q, $field, $before='%', $after='%') switch ($ch) { case '"': - $tokens[] = $crt_token; - $token_modifiers[] = $crt_token_modifier; - $crt_token = ""; - $crt_token_modifier = ""; + $tokens[] = $crt_token; $token_modifiers[] = $crt_token_modifier; + $crt_token = ""; $crt_token_modifier = ""; $state=0; break; default: - if ( strcspn($ch, '%_')==0) - {// escape LIKE 
specials %_ - $ch = '\\'.$ch; - } $crt_token .= $ch; } break; @@ -361,21 +366,49 @@ function get_qsearch_like_clause($q, $field, $before='%', $after='%') $token_modifiers[] = $crt_token_modifier; } + $qtokens = array(); + $qtoken_modifiers = array(); + for ($i=0; $i array(), @@ -405,9 +440,11 @@ function get_quick_search_results($q, $super_order_by, $images_where='') { return $search_results; } - $q_like_field = '@@__db_field__@@'; //something never in a search - $q_like_clause = get_qsearch_like_clause($q, $q_like_field ); + + analyse_qsearch($q, $tokens, $token_modifiers); + $q_like_field = '@@__db_field__@@'; //something never in a search + $q_like_clause = get_qsearch_like_clause($tokens, $token_modifiers, $q_like_field ); // Step 1 - first we find matches in #images table =========================== $where_clauses='MATCH(i.name, i.comment) AGAINST( \''.$q.'\' IN BOOLEAN MODE)'; @@ -448,34 +485,126 @@ SELECT i.id, // Step 2 - search tags corresponding to the query $q ======================== - if (!empty($q_like_clause)) - { // search name and url name (without accents) - $query = ' -SELECT id, name, url_name + $transliterated_tokens = array(); + $token_tags = array(); + foreach ($tokens as $token) + { + $transliterated_tokens[] = transliterate($token); + $token_tags[] = array(); + } + + // Step 2.1 - find match tags for every token in the query search + $all_tags = array(); + $query = ' +SELECT id, name, url_name, COUNT(image_id) AS nb_images FROM '.TAGS_TABLE.' - WHERE ('.str_replace($q_like_field, 'CONVERT(name, CHAR)', $q_like_clause).' - OR '.str_replace($q_like_field, 'url_name', $q_like_clause).')'; - $tags = hash_from_query($query, 'id'); - if ( !empty($tags) ) - { // we got some tags; get the images - $search_results['qs']['matching_tags']=$tags; + INNER JOIN '.IMAGE_TAG_TABLE.' 
ON id=tag_id + GROUP BY id'; + $result = pwg_query($query); + while ($tag = pwg_db_fetch_assoc($result)) + { + $transliterated_tag = transliterate($tag['name']); + + // find how this tag matches query tokens + for ($i=0; $i0) + { + if (! is_word_char($transliterated_tag[$word_begin-1]) ) + break; + $word_begin--; + } + + $word_end = $pos + $token_len; + while ($word_end 4) + $this_score = 0; + } + + if ($this_score>0) + $match = max($match, $this_score ); + $pos++; + } + + if ($match) + { + $tag_id = (int)$tag['id']; + $all_tags[$tag_id] = $tag; + $token_tags[$i][] = array('tag_id'=>$tag_id, 'score'=>$match); + } + } + } + $search_results['qs']['matching_tags']=$all_tags; + + // Step 2.2 - reduce matching tags for every token in the query search + $score_cmp_fn = create_function('$a,$b', 'return 100*($b["score"]-$a["score"]);'); + foreach ($token_tags as &$tt) + { + usort($tt, $score_cmp_fn); + $nb_images = 0; + $prev_score = 0; + for ($j=0; $j 200 && $prev_score > $tt[$j]['score'] ) + {// "many" images in previous tags and starting from this tag is less relevant + $tt = array_slice( $tt, 0, $j); + break; + } + $nb_images += $all_tags[ $tt[$j]['tag_id'] ]['nb_images']; + $prev_score = $tt[$j]['score']; + } + } + + // Step 2.3 - get the images for tags + for ($i=0; $i