bug 3056: quick search - now tag search is the same as image search (full text match or like operator)

git-svn-id: http://piwigo.org/svn/trunk@28144 68402e56-0260-453c-a942-63ccdbb3a9ee
This commit is contained in:
rvelices 2014-04-09 21:23:49 +00:00
parent 2c3eb75ee9
commit 3ceca0d755

View file

@ -255,38 +255,6 @@ SELECT DISTINCT(id)
return $items;
}
/**
* Finds if a char is a letter, a figure or any char of the extended ASCII table (>127).
*
* @param char $ch
* @return bool
*/
function is_word_char($ch)
{
return ($ch>='0' && $ch<='9') || ($ch>='a' && $ch<='z') || ($ch>='A' && $ch<='Z') || ord($ch)>127;
}
/**
* Finds if a char is a special token for word start: [{<=*+
*
* @param char $ch
* @return bool
*/
function is_odd_wbreak_begin($ch)
{
return strpos('[{<=*+', $ch)===false ? false:true;
}
/**
* Finds if a char is a special token for word end: ]}>=*+
*
* @param char $ch
* @return bool
*/
function is_odd_wbreak_end($ch)
{
return strpos(']}>=*+', $ch)===false ? false:true;
}
define('QST_QUOTED', 0x01);
@ -295,7 +263,7 @@ define('QST_OR', 0x04);
define('QST_WILDCARD_BEGIN', 0x08);
define('QST_WILDCARD_END', 0x10);
define('QST_WILDCARD', QST_WILDCARD_BEGIN|QST_WILDCARD_END);
define('QST_BREAK', 0x20);
class QSearchScope
{
@ -552,6 +520,8 @@ class QMultiToken
{
if (strlen($token) || (isset($scope) && $scope->nullable))
{
if (isset($scope))
$modifier |= QST_BREAK;
$this->tokens[] = new QSingleToken($token, $modifier, $scope);
}
$token = "";
@ -713,9 +683,18 @@ class QMultiToken
if ($remove)
{
array_splice($this->tokens, $i, 1);
if ($i<count($this->tokens) && $this->tokens[$i]->is_single)
{
$this->tokens[$i]->modifier |= QST_BREAK;
}
$i--;
}
}
if ($level>0 && count($this->tokens) && $this->tokens[0]->is_single)
{
$this->tokens[0]->modifier |= QST_BREAK;
}
}
private function apply_scope(QSearchScope $scope)
@ -836,6 +815,39 @@ class QResults
var $iids;
}
function qsearch_get_text_token_search_sql($token, $fields)
{
$clauses = array();
$variants = array_merge(array($token->term), $token->variants);
$fts = array();
foreach ($variants as $variant)
{
if (mb_strlen($variant)<=3
|| strcspn($variant, '!"#$%&()*+,./:;<=>?@[\]^`{|}~') < 3)
{// odd term or too short for full text search; fallback to regex but unfortunately this is diacritic/accent sensitive
$pre = ($token->modifier & QST_WILDCARD_BEGIN) ? '' : '[[:<:]]';
$post = ($token->modifier & QST_WILDCARD_END) ? '' : '[[:>:]]';
foreach( $fields as $field)
$clauses[] = $field.' REGEXP \''.$pre.addslashes(preg_quote($variant)).$post.'\'';
}
else
{
$ft = $variant;
if ($token->modifier & QST_QUOTED)
$ft = '"'.$ft.'"';
if ($token->modifier & QST_WILDCARD_END)
$ft .= '*';
$fts[] = $ft;
}
}
if (count($fts))
{
$clauses[] = 'MATCH('.implode(', ',$fields).') AGAINST( \''.addslashes(implode(' ',$fts)).'\' IN BOOLEAN MODE)';
}
return $clauses;
}
function qsearch_get_images(QExpression $expr, QResults $qsr)
{
$qsr->images_iids = array_fill(0, count($expr->stokens), array());
@ -856,34 +868,7 @@ function qsearch_get_images(QExpression $expr, QResults $qsr)
{
case 'photo':
$clauses[] = $file_like;
$variants = array_merge(array($token->term), $token->variants);
$fts = array();
foreach ($variants as $variant)
{
if (mb_strlen($variant)<=3
|| strcspn($variant, '!"#$%&()*+,./:;<=>?@[\]^`{|}~') < 3)
{// odd term or too short for full text search; fallback to regex but unfortunately this is diacritic/accent sensitive
$pre = ($token->modifier & QST_WILDCARD_BEGIN) ? '' : '[[:<:]]';
$post = ($token->modifier & QST_WILDCARD_END) ? '' : '[[:>:]]';
foreach( array('i.name', 'i.comment') as $field)
$clauses[] = $field.' REGEXP \''.$pre.addslashes(preg_quote($variant)).$post.'\'';
}
else
{
$ft = $variant;
if ($expr->stoken_modifiers[$i] & QST_QUOTED)
$ft = '"'.$ft.'"';
if ($expr->stoken_modifiers[$i] & QST_WILDCARD_END)
$ft .= '*';
$fts[] = $ft;
}
}
if (count($fts))
{
$clauses[] = 'MATCH(i.name, i.comment) AGAINST( \''.addslashes(implode(' ',$fts)).'\' IN BOOLEAN MODE)';
}
$clauses = array_merge($clauses, qsearch_get_text_token_search_sql($token, array('name','comment')));
break;
case 'file':
@ -929,168 +914,49 @@ function qsearch_get_images(QExpression $expr, QResults $qsr)
function qsearch_get_tags(QExpression $expr, QResults $qsr)
{
$tokens = $expr->stokens;
$token_modifiers = $expr->stoken_modifiers;
$token_tag_ids = array_fill(0, count($tokens), array() );
$token_tag_ids = $qsr->tag_iids = array_fill(0, count($expr->stokens), array() );
$all_tags = array();
$token_tag_scores = $token_tag_ids;
$transliterated_tokens = array();
foreach ($tokens as $token)
for ($i=0; $i<count($expr->stokens); $i++)
{
if (!isset($token->scope) || 'tag' == $token->scope->id)
{
$transliterated_tokens[] = transliterate($token->term);
}
else
{
$transliterated_tokens[] = '';
}
}
$token = $expr->stokens[$i];
if (isset($token->scope) && 'tag' != $token->scope->id)
continue;
if (empty($token->term))
continue;
$query = '
SELECT t.*, COUNT(image_id) AS counter
FROM '.TAGS_TABLE.' t
INNER JOIN '.IMAGE_TAG_TABLE.' ON id=tag_id
GROUP BY id';
$clauses = qsearch_get_text_token_search_sql( $token, array('name'));
$query = 'SELECT * FROM '.TAGS_TABLE.'
WHERE ('. implode("\n OR ",$clauses) .')';
$result = pwg_query($query);
while ($tag = pwg_db_fetch_assoc($result))
{
$transliterated_tag = transliterate($tag['name']);
// find how this tag matches query tokens
for ($i=0; $i<count($tokens); $i++)
{
$transliterated_token = $transliterated_tokens[$i];
if (strlen($transliterated_token)==0)
continue;
$match = false;
$pos = 0;
while ( ($pos = strpos($transliterated_tag, $transliterated_token, $pos)) !== false)
{
if ( ($token_modifiers[$i]&QST_WILDCARD)==QST_WILDCARD )
{// wildcard in this token
$match = 1;
break;
}
$token_len = strlen($transliterated_token);
// search begin of word
$wbegin_len=0; $wbegin_char=' ';
while ($pos-$wbegin_len > 0)
{
if (! is_word_char($transliterated_tag[$pos-$wbegin_len-1]) )
{
$wbegin_char = $transliterated_tag[$pos-$wbegin_len-1];
break;
}
$wbegin_len++;
}
// search end of word
$wend_len=0; $wend_char=' ';
while ($pos+$token_len+$wend_len < strlen($transliterated_tag))
{
if (! is_word_char($transliterated_tag[$pos+$token_len+$wend_len]) )
{
$wend_char = $transliterated_tag[$pos+$token_len+$wend_len];
break;
}
$wend_len++;
}
$this_score = 0;
if ( ($token_modifiers[$i]&QST_WILDCARD)==0 )
{// no wildcard begin or end
if ($token_len <= 2)
{// search for 1 or 2 characters must match exactly to avoid retrieving too much data
if ($wbegin_len==0 && $wend_len==0 && !is_odd_wbreak_begin($wbegin_char) && !is_odd_wbreak_end($wend_char) )
$this_score = 1;
}
elseif ($token_len == 3)
{
if ($wbegin_len==0)
$this_score = $token_len / ($token_len + $wend_len);
}
else
{
$this_score = $token_len / ($token_len + 1.1 * $wbegin_len + 0.9 * $wend_len);
$token_tag_ids[$i][] = $tag['id'];
$all_tags[$tag['id']] = $tag;
}
}
if ($this_score>0)
$match = max($match, $this_score );
$pos++;
}
if ($match)
// check adjacent short words
for ($i=0; $i<count($expr->stokens)-1; $i++)
{
$tag_id = (int)$tag['id'];
$all_tags[$tag_id] = $tag;
$token_tag_ids[$i][] = $tag_id;
$token_tag_scores[$i][] = $match;
if ( (strlen($expr->stokens[$i])<=3 || strlen($expr->stokens[$i+1])<=3)
&& (($expr->stoken_modifiers[$i] & (QST_QUOTED|QST_WILDCARD)) == 0)
&& (($expr->stoken_modifiers[$i+1] & (QST_BREAK|QST_QUOTED|QST_WILDCARD)) == 0) )
{
$common = array_intersect( $token_tag_ids[$i], $token_tag_ids[$i+1] );
if (count($common))
{
$token_tag_ids[$i] = $token_tag_ids[$i+1] = $common;
}
}
}
// process tags
$not_tag_ids = array();
for ($i=0; $i<count($tokens); $i++)
{
array_multisort($token_tag_scores[$i], SORT_DESC|SORT_NUMERIC, $token_tag_ids[$i]);
$is_not = $token_modifiers[$i]&QST_NOT;
$counter = 0;
for ($j=0; $j<count($token_tag_scores[$i]); $j++)
{
if ($is_not)
{
if ($token_tag_scores[$i][$j] < 0.8 ||
($j>0 && $token_tag_scores[$i][$j] < $token_tag_scores[$i][0]) )
{
array_splice($token_tag_scores[$i], $j);
array_splice($token_tag_ids[$i], $j);
}
}
else
{
$tag_id = $token_tag_ids[$i][$j];
$counter += $all_tags[$tag_id]['counter'];
if ( $j>0 && (
($counter > 100 && $token_tag_scores[$i][0] > $token_tag_scores[$i][$j]) // "many" images in previous tags and starting from this tag is less relevant
|| ($token_tag_scores[$i][0]==1 && $token_tag_scores[$i][$j]<0.8)
|| ($token_tag_scores[$i][0]>0.8 && $token_tag_scores[$i][$j]<0.5)
))
{// we remove this tag from the results, but we still leave it in all_tags list so that if we are wrong, the user chooses it
array_splice($token_tag_ids[$i], $j);
array_splice($token_tag_scores[$i], $j);
break;
}
}
}
if ($is_not)
{
$not_tag_ids = array_merge($not_tag_ids, $token_tag_ids[$i]);
}
}
$all_tags = array_diff_key($all_tags, array_flip($not_tag_ids));
usort($all_tags, 'tag_alpha_compare');
foreach ( $all_tags as &$tag )
{
$tag['name'] = trigger_event('render_tag_name', $tag['name'], $tag);
}
$qsr->all_tags = $all_tags;
$qsr->tag_ids = $token_tag_ids;
$qsr->tag_iids = array_fill(0, count($tokens), array() );
for ($i=0; $i<count($tokens); $i++)
// get images
$positive_ids = $not_ids = array();
for ($i=0; $i<count($expr->stokens); $i++)
{
$tag_ids = $token_tag_ids[$i];
$token = $expr->stokens[$i];
if (!empty($tag_ids))
{
@ -1099,8 +965,12 @@ SELECT image_id FROM '.IMAGE_TAG_TABLE.'
WHERE tag_id IN ('.implode(',',$tag_ids).')
GROUP BY image_id';
$qsr->tag_iids[$i] = query2array($query, null, 'image_id');
if ($expr->stoken_modifiers[$i]&QST_NOT)
$not_ids = array_merge($not_ids, $tag_ids);
else
$positive_ids = array_merge($positive_ids, $tag_ids);
}
elseif (isset($tokens[$i]->scope) && 'tag' == $tokens[$i]->scope->id && strlen($token->term)==0)
elseif (isset($token->scope) && 'tag' == $token->scope->id && strlen($token->term)==0)
{
if ($tokens[$i]->modifier & QST_WILDCARD)
{// eg. 'tag:*' returns all tagged images
@ -1112,7 +982,17 @@ SELECT image_id FROM '.IMAGE_TAG_TABLE.'
}
}
}
$all_tags = array_intersect_key($all_tags, array_flip( array_diff($positive_ids, $not_ids) ) );
usort($all_tags, 'tag_alpha_compare');
foreach ( $all_tags as &$tag )
{
$tag['name'] = trigger_event('render_tag_name', $tag['name'], $tag);
}
$qsr->all_tags = $all_tags;
$qsr->tag_ids = $token_tag_ids;
}
function qsearch_eval(QMultiToken $expr, QResults $qsr, &$qualifies, &$ignored_terms)
@ -1259,6 +1139,7 @@ function get_quick_search_results($q, $options)
for ($i=0; $i<count($expression->stokens); $i++)
{
$debug[] = $expression->stokens[$i].': '.count($qsr->tag_ids[$i]).' tags, '.count($qsr->tag_iids[$i]).' tiids, '.count($qsr->images_iids[$i]).' iiids, '.count($qsr->iids[$i]).' iids'
.' modifier:'.dechex($expression->stoken_modifiers[$i])
.( !empty($expression->stokens[$i]->variants) ? ' variants: '.implode(', ',$expression->stokens[$i]->variants): '');
}
$debug[] = 'before perms '.count($ids);