diff options
author | rvelices <rv-github@modusoptimus.com> | 2014-04-09 21:23:49 +0000 |
---|---|---|
committer | rvelices <rv-github@modusoptimus.com> | 2014-04-09 21:23:49 +0000 |
commit | 3ceca0d755df8f7dc229582df2efe3b96853c263 (patch) | |
tree | 1e784c7c924181efef9f256f40601182e6f1ed16 /include/functions_search.inc.php | |
parent | 2c3eb75ee9808b798e7cc3231f0bf1f2f1e86960 (diff) |
bug 3056: quick search - now tag search is the same as image search (full text match or like operator)
git-svn-id: http://piwigo.org/svn/trunk@28144 68402e56-0260-453c-a942-63ccdbb3a9ee
Diffstat (limited to '')
-rw-r--r-- | include/functions_search.inc.php | 297 |
1 files changed, 89 insertions, 208 deletions
diff --git a/include/functions_search.inc.php b/include/functions_search.inc.php index 6c83df76a..b3240f967 100644 --- a/include/functions_search.inc.php +++ b/include/functions_search.inc.php @@ -255,38 +255,6 @@ SELECT DISTINCT(id) return $items; } -/** - * Finds if a char is a letter, a figure or any char of the extended ASCII table (>127). - * - * @param char $ch - * @return bool - */ -function is_word_char($ch) -{ - return ($ch>='0' && $ch<='9') || ($ch>='a' && $ch<='z') || ($ch>='A' && $ch<='Z') || ord($ch)>127; -} - -/** - * Finds if a char is a special token for word start: [{<=*+ - * - * @param char $ch - * @return bool - */ -function is_odd_wbreak_begin($ch) -{ - return strpos('[{<=*+', $ch)===false ? false:true; -} - -/** - * Finds if a char is a special token for word end: ]}>=*+ - * - * @param char $ch - * @return bool - */ -function is_odd_wbreak_end($ch) -{ - return strpos(']}>=*+', $ch)===false ? false:true; -} define('QST_QUOTED', 0x01); @@ -295,7 +263,7 @@ define('QST_OR', 0x04); define('QST_WILDCARD_BEGIN', 0x08); define('QST_WILDCARD_END', 0x10); define('QST_WILDCARD', QST_WILDCARD_BEGIN|QST_WILDCARD_END); - +define('QST_BREAK', 0x20); class QSearchScope { @@ -552,6 +520,8 @@ class QMultiToken { if (strlen($token) || (isset($scope) && $scope->nullable)) { + if (isset($scope)) + $modifier |= QST_BREAK; $this->tokens[] = new QSingleToken($token, $modifier, $scope); } $token = ""; @@ -713,9 +683,18 @@ class QMultiToken if ($remove) { array_splice($this->tokens, $i, 1); + if ($i<count($this->tokens) && $this->tokens[$i]->is_single) + { + $this->tokens[$i]->modifier |= QST_BREAK; + } $i--; } } + + if ($level>0 && count($this->tokens) && $this->tokens[0]->is_single) + { + $this->tokens[0]->modifier |= QST_BREAK; + } } private function apply_scope(QSearchScope $scope) @@ -836,6 +815,39 @@ class QResults var $iids; } +function qsearch_get_text_token_search_sql($token, $fields) +{ + $clauses = array(); + $variants = array_merge(array($token->term), $token->variants); + $fts = array(); + foreach ($variants as $variant) + { + if (mb_strlen($variant)<=3 + || strcspn($variant, '!"#$%&()*+,./:;<=>?@[\]^`{|}~') < 3) + {// odd term or too short for full text search; fallback to regex but unfortunately this is diacritic/accent sensitive + $pre = ($token->modifier & QST_WILDCARD_BEGIN) ? '' : '[[:<:]]'; + $post = ($token->modifier & QST_WILDCARD_END) ? '' : '[[:>:]]'; + foreach( $fields as $field) + $clauses[] = $field.' REGEXP \''.$pre.addslashes(preg_quote($variant)).$post.'\''; + } + else + { + $ft = $variant; + if ($token->modifier & QST_QUOTED) + $ft = '"'.$ft.'"'; + if ($token->modifier & QST_WILDCARD_END) + $ft .= '*'; + $fts[] = $ft; + } + } + + if (count($fts)) + { + $clauses[] = 'MATCH('.implode(', ',$fields).') AGAINST( \''.addslashes(implode(' ',$fts)).'\' IN BOOLEAN MODE)'; + } + return $clauses; +} + function qsearch_get_images(QExpression $expr, QResults $qsr) { $qsr->images_iids = array_fill(0, count($expr->stokens), array()); @@ -856,34 +868,7 @@ function qsearch_get_images(QExpression $expr, QResults $qsr) { case 'photo': $clauses[] = $file_like; - - $variants = array_merge(array($token->term), $token->variants); - $fts = array(); - foreach ($variants as $variant) - { - if (mb_strlen($variant)<=3 - || strcspn($variant, '!"#$%&()*+,./:;<=>?@[\]^`{|}~') < 3) - {// odd term or too short for full text search; fallback to regex but unfortunately this is diacritic/accent sensitive - $pre = ($token->modifier & QST_WILDCARD_BEGIN) ? '' : '[[:<:]]'; - $post = ($token->modifier & QST_WILDCARD_END) ? '' : '[[:>:]]'; - foreach( array('i.name', 'i.comment') as $field) - $clauses[] = $field.' REGEXP \''.$pre.addslashes(preg_quote($variant)).$post.'\''; - } - else - { - $ft = $variant; - if ($expr->stoken_modifiers[$i] & QST_QUOTED) - $ft = '"'.$ft.'"'; - if ($expr->stoken_modifiers[$i] & QST_WILDCARD_END) - $ft .= '*'; - $fts[] = $ft; - } - } - - if (count($fts)) - { - $clauses[] = 'MATCH(i.name, i.comment) AGAINST( \''.addslashes(implode(' ',$fts)).'\' IN BOOLEAN MODE)'; - } + $clauses = array_merge($clauses, qsearch_get_text_token_search_sql($token, array('name','comment'))); break; case 'file': @@ -929,168 +914,49 @@ function qsearch_get_images(QExpression $expr, QResults $qsr) function qsearch_get_tags(QExpression $expr, QResults $qsr) { - $tokens = $expr->stokens; - $token_modifiers = $expr->stoken_modifiers; - - $token_tag_ids = array_fill(0, count($tokens), array() ); + $token_tag_ids = $qsr->tag_iids = array_fill(0, count($expr->stokens), array() ); $all_tags = array(); - $token_tag_scores = $token_tag_ids; - $transliterated_tokens = array(); - foreach ($tokens as $token) + for ($i=0; $i<count($expr->stokens); $i++) { - if (!isset($token->scope) || 'tag' == $token->scope->id) - { - $transliterated_tokens[] = transliterate($token->term); - } - else + $token = $expr->stokens[$i]; + if (isset($token->scope) && 'tag' != $token->scope->id) + continue; + if (empty($token->term)) + continue; + + $clauses = qsearch_get_text_token_search_sql( $token, array('name')); + $query = 'SELECT * FROM '.TAGS_TABLE.' +WHERE ('. implode("\n OR ",$clauses) .')'; + $result = pwg_query($query); + while ($tag = pwg_db_fetch_assoc($result)) { - $transliterated_tokens[] = ''; + $token_tag_ids[$i][] = $tag['id']; + $all_tags[$tag['id']] = $tag; } } - $query = ' -SELECT t.*, COUNT(image_id) AS counter - FROM '.TAGS_TABLE.' t - INNER JOIN '.IMAGE_TAG_TABLE.' ON id=tag_id - GROUP BY id'; - $result = pwg_query($query); - while ($tag = pwg_db_fetch_assoc($result)) + // check adjacent short words + for ($i=0; $i<count($expr->stokens)-1; $i++) { - $transliterated_tag = transliterate($tag['name']); - - // find how this tag matches query tokens - for ($i=0; $i<count($tokens); $i++) + if ( (strlen($expr->stokens[$i])<=3 || strlen($expr->stokens[$i+1])<=3) + && (($expr->stoken_modifiers[$i] & (QST_QUOTED|QST_WILDCARD)) == 0) + && (($expr->stoken_modifiers[$i+1] & (QST_BREAK|QST_QUOTED|QST_WILDCARD)) == 0) ) { - $transliterated_token = $transliterated_tokens[$i]; - if (strlen($transliterated_token)==0) - continue; - - $match = false; - $pos = 0; - while ( ($pos = strpos($transliterated_tag, $transliterated_token, $pos)) !== false) + $common = array_intersect( $token_tag_ids[$i], $token_tag_ids[$i+1] ); + if (count($common)) { - if ( ($token_modifiers[$i]&QST_WILDCARD)==QST_WILDCARD ) - {// wildcard in this token - $match = 1; - break; - } - $token_len = strlen($transliterated_token); - - // search begin of word - $wbegin_len=0; $wbegin_char=' '; - while ($pos-$wbegin_len > 0) - { - if (! is_word_char($transliterated_tag[$pos-$wbegin_len-1]) ) - { - $wbegin_char = $transliterated_tag[$pos-$wbegin_len-1]; - break; - } - $wbegin_len++; - } - - // search end of word - $wend_len=0; $wend_char=' '; - while ($pos+$token_len+$wend_len < strlen($transliterated_tag)) - { - if (! is_word_char($transliterated_tag[$pos+$token_len+$wend_len]) ) - { - $wend_char = $transliterated_tag[$pos+$token_len+$wend_len]; - break; - } - $wend_len++; - } - - $this_score = 0; - if ( ($token_modifiers[$i]&QST_WILDCARD)==0 ) - {// no wildcard begin or end - if ($token_len <= 2) - {// search for 1 or 2 characters must match exactly to avoid retrieving too much data - if ($wbegin_len==0 && $wend_len==0 && !is_odd_wbreak_begin($wbegin_char) && !is_odd_wbreak_end($wend_char) ) - $this_score = 1; - } - elseif ($token_len == 3) - { - if ($wbegin_len==0) - $this_score = $token_len / ($token_len + $wend_len); - } - else - { - $this_score = $token_len / ($token_len + 1.1 * $wbegin_len + 0.9 * $wend_len); - } - } - - if ($this_score>0) - $match = max($match, $this_score ); - $pos++; - } - - if ($match) - { - $tag_id = (int)$tag['id']; - $all_tags[$tag_id] = $tag; - $token_tag_ids[$i][] = $tag_id; - $token_tag_scores[$i][] = $match; + $token_tag_ids[$i] = $token_tag_ids[$i+1] = $common; } } } - // process tags - $not_tag_ids = array(); - for ($i=0; $i<count($tokens); $i++) - { - array_multisort($token_tag_scores[$i], SORT_DESC|SORT_NUMERIC, $token_tag_ids[$i]); - $is_not = $token_modifiers[$i]&QST_NOT; - $counter = 0; - - for ($j=0; $j<count($token_tag_scores[$i]); $j++) - { - if ($is_not) - { - if ($token_tag_scores[$i][$j] < 0.8 || - ($j>0 && $token_tag_scores[$i][$j] < $token_tag_scores[$i][0]) ) - { - array_splice($token_tag_scores[$i], $j); - array_splice($token_tag_ids[$i], $j); - } - } - else - { - $tag_id = $token_tag_ids[$i][$j]; - $counter += $all_tags[$tag_id]['counter']; - if ( $j>0 && ( - ($counter > 100 && $token_tag_scores[$i][0] > $token_tag_scores[$i][$j]) // "many" images in previous tags and starting from this tag is less relevant - || ($token_tag_scores[$i][0]==1 && $token_tag_scores[$i][$j]<0.8) - || ($token_tag_scores[$i][0]>0.8 && $token_tag_scores[$i][$j]<0.5) - )) - {// we remove this tag from the results, but we still leave it in all_tags list so that if we are wrong, the user chooses it - array_splice($token_tag_ids[$i], $j); - array_splice($token_tag_scores[$i], $j); - break; - } - } - } - - if ($is_not) - { - $not_tag_ids = array_merge($not_tag_ids, $token_tag_ids[$i]); - } - } - - $all_tags = array_diff_key($all_tags, array_flip($not_tag_ids)); - usort($all_tags, 'tag_alpha_compare'); - foreach ( $all_tags as &$tag ) - { - $tag['name'] = trigger_event('render_tag_name', $tag['name'], $tag); - } - $qsr->all_tags = $all_tags; - - $qsr->tag_ids = $token_tag_ids; - $qsr->tag_iids = array_fill(0, count($tokens), array() ); - - for ($i=0; $i<count($tokens); $i++) + // get images + $positive_ids = $not_ids = array(); + for ($i=0; $i<count($expr->stokens); $i++) { $tag_ids = $token_tag_ids[$i]; + $token = $expr->stokens[$i]; if (!empty($tag_ids)) { @@ -1099,8 +965,12 @@ SELECT image_id FROM '.IMAGE_TAG_TABLE.' WHERE tag_id IN ('.implode(',',$tag_ids).') GROUP BY image_id'; $qsr->tag_iids[$i] = query2array($query, null, 'image_id'); + if ($expr->stoken_modifiers[$i]&QST_NOT) + $not_ids = array_merge($not_ids, $tag_ids); + else + $positive_ids = array_merge($positive_ids, $tag_ids); } - elseif (isset($tokens[$i]->scope) && 'tag' == $tokens[$i]->scope->id && strlen($token->term)==0) + elseif (isset($token->scope) && 'tag' == $token->scope->id && strlen($token->term)==0) { if ($tokens[$i]->modifier & QST_WILDCARD) {// eg. 'tag:*' returns all tagged images @@ -1112,9 +982,19 @@ SELECT image_id FROM '.IMAGE_TAG_TABLE.' } } } + + $all_tags = array_intersect_key($all_tags, array_flip( array_diff($positive_ids, $not_ids) ) ); + usort($all_tags, 'tag_alpha_compare'); + foreach ( $all_tags as &$tag ) + { + $tag['name'] = trigger_event('render_tag_name', $tag['name'], $tag); + } + $qsr->all_tags = $all_tags; + $qsr->tag_ids = $token_tag_ids; } + function qsearch_eval(QMultiToken $expr, QResults $qsr, &$qualifies, &$ignored_terms) { $qualifies = false; // until we find at least one positive term @@ -1259,6 +1139,7 @@ function get_quick_search_results($q, $options) for ($i=0; $i<count($expression->stokens); $i++) { $debug[] = $expression->stokens[$i].': '.count($qsr->tag_ids[$i]).' tags, '.count($qsr->tag_iids[$i]).' tiids, '.count($qsr->images_iids[$i]).' iiids, '.count($qsr->iids[$i]).' iids' + .' modifier:'.dechex($expression->stoken_modifiers[$i]) .( !empty($expression->stokens[$i]->variants) ? ' variants: '.implode(', ',$expression->stokens[$i]->variants): ''); } $debug[] = 'before perms '.count($ids); |