bug 3056: quick search - now tag search is the same as image search (full text match or like operator)

git-svn-id: http://piwigo.org/svn/trunk@28144 68402e56-0260-453c-a942-63ccdbb3a9ee
author: rvelices <rv-github@modusoptimus.com> 2014-04-09 21:23:49 +0000
committer: rvelices <rv-github@modusoptimus.com> 2014-04-09 21:23:49 +0000
commit: 3ceca0d755df8f7dc229582df2efe3b96853c263 (patch)
tree: 1e784c7c924181efef9f256f40601182e6f1ed16 /include/functions_search.inc.php
parent: 2c3eb75ee9808b798e7cc3231f0bf1f2f1e86960 (diff)
1 files changed, 89 insertions, 208 deletions
diff --git a/include/functions_search.inc.php b/include/functions_search.inc.php
index 6c83df76a..b3240f967 100644
--- a/include/functions_search.inc.php
+++ b/include/functions_search.inc.php
@@ -255,38 +255,6 @@ SELECT DISTINCT(id)
   return $items;
 }
 
-/**
- * Finds if a char is a letter, a figure or any char of the extended ASCII table (>127).
- *
- * @param char $ch
- * @return bool
- */
-function is_word_char($ch)
-{
-  return ($ch>='0' && $ch<='9') || ($ch>='a' && $ch<='z') || ($ch>='A' && $ch<='Z') || ord($ch)>127;
-}
-
-/**
- * Finds if a char is a special token for word start: [{<=*+
- *
- * @param char $ch
- * @return bool
- */
-function is_odd_wbreak_begin($ch)
-{
-  return strpos('[{<=*+', $ch)===false ? false:true;
-}
-
-/**
- * Finds if a char is a special token for word end: ]}>=*+
- *
- * @param char $ch
- * @return bool
- */
-function is_odd_wbreak_end($ch)
-{
-  return strpos(']}>=*+', $ch)===false ? false:true;
-}
 
 
 define('QST_QUOTED',         0x01);
@@ -295,7 +263,7 @@ define('QST_OR',             0x04);
 define('QST_WILDCARD_BEGIN', 0x08);
 define('QST_WILDCARD_END',   0x10);
 define('QST_WILDCARD', QST_WILDCARD_BEGIN|QST_WILDCARD_END);
-
+define('QST_BREAK',          0x20);
 
 class QSearchScope
 {
@@ -552,6 +520,8 @@ class QMultiToken
   {
     if (strlen($token) || (isset($scope) && $scope->nullable))
     {
+      if (isset($scope))
+        $modifier |= QST_BREAK;
       $this->tokens[] = new QSingleToken($token, $modifier, $scope);
     }
     $token = "";
@@ -713,9 +683,18 @@ class QMultiToken
       if ($remove)
       {
         array_splice($this->tokens, $i, 1);
+        if ($i<count($this->tokens) && $this->tokens[$i]->is_single)
+        {
+          $this->tokens[$i]->modifier |= QST_BREAK;
+        }
         $i--;
       }
     }
+
+    if ($level>0 && count($this->tokens) && $this->tokens[0]->is_single)
+    {
+      $this->tokens[0]->modifier |= QST_BREAK;
+    }
   }
 
   private function apply_scope(QSearchScope $scope)
@@ -836,6 +815,39 @@ class QResults
   var $iids;
 }
 
+function qsearch_get_text_token_search_sql($token, $fields)
+{
+  $clauses = array();
+  $variants = array_merge(array($token->term), $token->variants);
+  $fts = array();
+  foreach ($variants as $variant)
+  {
+    if (mb_strlen($variant)<=3
+      || strcspn($variant, '!"#$%&()*+,./:;<=>?@[\]^`{|}~') < 3)
+    {// odd term or too short for full text search; fallback to regex but unfortunately this is diacritic/accent sensitive
+      $pre = ($token->modifier & QST_WILDCARD_BEGIN) ? '' : '[[:<:]]';
+      $post = ($token->modifier & QST_WILDCARD_END) ? '' : '[[:>:]]';
+      foreach( $fields as $field)
+        $clauses[] = $field.' REGEXP \''.$pre.addslashes(preg_quote($variant)).$post.'\'';
+    }
+    else
+    {
+      $ft = $variant;
+      if ($token->modifier & QST_QUOTED)
+        $ft = '"'.$ft.'"';
+      if ($token->modifier & QST_WILDCARD_END)
+        $ft .= '*';
+      $fts[] = $ft;
+    }
+  }
+
+  if (count($fts))
+  {
+    $clauses[] = 'MATCH('.implode(', ',$fields).') AGAINST( \''.addslashes(implode(' ',$fts)).'\' IN BOOLEAN MODE)';
+  }
+  return $clauses;
+}
+
 function qsearch_get_images(QExpression $expr, QResults $qsr)
 {
   $qsr->images_iids = array_fill(0, count($expr->stokens), array());
@@ -856,34 +868,7 @@ function qsearch_get_images(QExpression $expr, QResults $qsr)
     {
       case 'photo':
         $clauses[] = $file_like;
-
-        $variants = array_merge(array($token->term), $token->variants);
-        $fts = array();
-        foreach ($variants as $variant)
-        {
-          if (mb_strlen($variant)<=3
-            || strcspn($variant, '!"#$%&()*+,./:;<=>?@[\]^`{|}~') < 3)
-          {// odd term or too short for full text search; fallback to regex but unfortunately this is diacritic/accent sensitive
-            $pre = ($token->modifier & QST_WILDCARD_BEGIN) ? '' : '[[:<:]]';
-            $post = ($token->modifier & QST_WILDCARD_END) ? '' : '[[:>:]]';
-            foreach( array('i.name', 'i.comment') as $field)
-              $clauses[] = $field.' REGEXP \''.$pre.addslashes(preg_quote($variant)).$post.'\'';
-          }
-          else
-          {
-            $ft = $variant;
-            if ($expr->stoken_modifiers[$i] & QST_QUOTED)
-              $ft = '"'.$ft.'"';
-            if ($expr->stoken_modifiers[$i] & QST_WILDCARD_END)
-              $ft .= '*';
-            $fts[] = $ft;
-          }
-        }
-
-        if (count($fts))
-        {
-          $clauses[] = 'MATCH(i.name, i.comment) AGAINST( \''.addslashes(implode(' ',$fts)).'\' IN BOOLEAN MODE)';
-        }
+        $clauses = array_merge($clauses, qsearch_get_text_token_search_sql($token, array('name','comment')));
         break;
 
       case 'file':
@@ -929,168 +914,49 @@ function qsearch_get_images(QExpression $expr, QResults $qsr)
 
 function qsearch_get_tags(QExpression $expr, QResults $qsr)
 {
-  $tokens = $expr->stokens;
-  $token_modifiers = $expr->stoken_modifiers;
-
-  $token_tag_ids = array_fill(0, count($tokens), array() );
+  $token_tag_ids = $qsr->tag_iids = array_fill(0, count($expr->stokens), array() );
   $all_tags = array();
 
-  $token_tag_scores = $token_tag_ids;
-  $transliterated_tokens = array();
-  foreach ($tokens as $token)
+  for ($i=0; $i<count($expr->stokens); $i++)
   {
-    if (!isset($token->scope) || 'tag' == $token->scope->id)
-    {
-      $transliterated_tokens[] = transliterate($token->term);
-    }
-    else
+    $token = $expr->stokens[$i];
+    if (isset($token->scope) && 'tag' != $token->scope->id)
+      continue;
+    if (empty($token->term))
+      continue;
+
+    $clauses = qsearch_get_text_token_search_sql( $token, array('name'));
+    $query = 'SELECT * FROM '.TAGS_TABLE.'
+WHERE ('. implode("\n OR ",$clauses) .')';
+    $result = pwg_query($query);
+    while ($tag = pwg_db_fetch_assoc($result))
     {
-      $transliterated_tokens[] = '';
+      $token_tag_ids[$i][] = $tag['id'];
+      $all_tags[$tag['id']] = $tag;
     }
   }
 
-  $query = '
-SELECT t.*, COUNT(image_id) AS counter
-  FROM '.TAGS_TABLE.' t
-    INNER JOIN '.IMAGE_TAG_TABLE.' ON id=tag_id
-  GROUP BY id';
-  $result = pwg_query($query);
-  while ($tag = pwg_db_fetch_assoc($result))
+  // check adjacent short words
+  for ($i=0; $i<count($expr->stokens)-1; $i++)
   {
-    $transliterated_tag = transliterate($tag['name']);
-
-    // find how this tag matches query tokens
-    for ($i=0; $i<count($tokens); $i++)
+    if ( (strlen($expr->stokens[$i])<=3 || strlen($expr->stokens[$i+1])<=3)
+      && (($expr->stoken_modifiers[$i] & (QST_QUOTED|QST_WILDCARD)) == 0)
+      && (($expr->stoken_modifiers[$i+1] & (QST_BREAK|QST_QUOTED|QST_WILDCARD)) == 0) )
     {
-      $transliterated_token = $transliterated_tokens[$i];
-      if (strlen($transliterated_token)==0)
-        continue;
-
-      $match = false;
-      $pos = 0;
-      while ( ($pos = strpos($transliterated_tag, $transliterated_token, $pos)) !== false)
+      $common = array_intersect( $token_tag_ids[$i], $token_tag_ids[$i+1] );
+      if (count($common))
       {
-        if ( ($token_modifiers[$i]&QST_WILDCARD)==QST_WILDCARD )
-        {// wildcard in this token
-          $match = 1;
-          break;
-        }
-        $token_len = strlen($transliterated_token);
-
-        // search begin of word
-        $wbegin_len=0; $wbegin_char=' ';
-        while ($pos-$wbegin_len > 0)
-        {
-          if (! is_word_char($transliterated_tag[$pos-$wbegin_len-1]) )
-          {
-            $wbegin_char = $transliterated_tag[$pos-$wbegin_len-1];
-            break;
-          }
-          $wbegin_len++;
-        }
-
-        // search end of word
-        $wend_len=0; $wend_char=' ';
-        while ($pos+$token_len+$wend_len < strlen($transliterated_tag))
-        {
-          if (! is_word_char($transliterated_tag[$pos+$token_len+$wend_len]) )
-          {
-            $wend_char = $transliterated_tag[$pos+$token_len+$wend_len];
-            break;
-          }
-          $wend_len++;
-        }
-
-        $this_score = 0;
-        if ( ($token_modifiers[$i]&QST_WILDCARD)==0 )
-        {// no wildcard begin or end
-          if ($token_len <= 2)
-          {// search for 1 or 2 characters must match exactly to avoid retrieving too much data
-            if ($wbegin_len==0 && $wend_len==0 && !is_odd_wbreak_begin($wbegin_char) && !is_odd_wbreak_end($wend_char) )
-              $this_score = 1;
-          }
-          elseif ($token_len == 3)
-          {
-            if ($wbegin_len==0)
-              $this_score = $token_len / ($token_len + $wend_len);
-          }
-          else
-          {
-            $this_score = $token_len / ($token_len + 1.1 * $wbegin_len + 0.9 * $wend_len);
-          }
-        }
-
-        if ($this_score>0)
-          $match = max($match, $this_score );
-        $pos++;
-      }
-
-      if ($match)
-      {
-        $tag_id = (int)$tag['id'];
-        $all_tags[$tag_id] = $tag;
-        $token_tag_ids[$i][] = $tag_id;
-        $token_tag_scores[$i][] = $match;
+        $token_tag_ids[$i] = $token_tag_ids[$i+1] = $common;
       }
     }
   }
 
-  // process tags
-  $not_tag_ids = array();
-  for ($i=0; $i<count($tokens); $i++)
-  {
-    array_multisort($token_tag_scores[$i], SORT_DESC|SORT_NUMERIC, $token_tag_ids[$i]);
-    $is_not = $token_modifiers[$i]&QST_NOT;
-    $counter = 0;
-
-    for ($j=0; $j<count($token_tag_scores[$i]); $j++)
-    {
-      if ($is_not)
-      {
-        if ($token_tag_scores[$i][$j] < 0.8 ||
-              ($j>0 && $token_tag_scores[$i][$j] < $token_tag_scores[$i][0]) )
-        {
-          array_splice($token_tag_scores[$i], $j);
-          array_splice($token_tag_ids[$i], $j);
-        }
-      }
-      else
-      {
-        $tag_id = $token_tag_ids[$i][$j];
-        $counter += $all_tags[$tag_id]['counter'];
-        if ( $j>0 && (
-          ($counter > 100 && $token_tag_scores[$i][0] > $token_tag_scores[$i][$j]) // "many" images in previous tags and starting from this tag is less relevant
-          || ($token_tag_scores[$i][0]==1 && $token_tag_scores[$i][$j]<0.8)
-          || ($token_tag_scores[$i][0]>0.8 && $token_tag_scores[$i][$j]<0.5)
-          ))
-        {// we remove this tag from the results, but we still leave it in all_tags list so that if we are wrong, the user chooses it
-          array_splice($token_tag_ids[$i], $j);
-          array_splice($token_tag_scores[$i], $j);
-          break;
-        }
-      }
-    }
-
-    if ($is_not)
-    {
-      $not_tag_ids = array_merge($not_tag_ids, $token_tag_ids[$i]);
-    }
-  }
-
-  $all_tags = array_diff_key($all_tags, array_flip($not_tag_ids));
-  usort($all_tags, 'tag_alpha_compare');
-  foreach ( $all_tags as &$tag )
-  {
-    $tag['name'] = trigger_event('render_tag_name', $tag['name'], $tag);
-  }
-  $qsr->all_tags = $all_tags;
-
-  $qsr->tag_ids = $token_tag_ids;
-  $qsr->tag_iids = array_fill(0, count($tokens), array() );
-
-  for ($i=0; $i<count($tokens); $i++)
+  // get images
+  $positive_ids = $not_ids = array();
+  for ($i=0; $i<count($expr->stokens); $i++)
   {
     $tag_ids = $token_tag_ids[$i];
+    $token = $expr->stokens[$i];
 
     if (!empty($tag_ids))
     {
@@ -1099,8 +965,12 @@ SELECT image_id FROM '.IMAGE_TAG_TABLE.'
   WHERE tag_id IN ('.implode(',',$tag_ids).')
   GROUP BY image_id';
       $qsr->tag_iids[$i] = query2array($query, null, 'image_id');
+      if ($expr->stoken_modifiers[$i]&QST_NOT)
+        $not_ids = array_merge($not_ids, $tag_ids);
+      else
+        $positive_ids = array_merge($positive_ids, $tag_ids);
     }
-    elseif (isset($tokens[$i]->scope) && 'tag' == $tokens[$i]->scope->id && strlen($token->term)==0)
+    elseif (isset($token->scope) && 'tag' == $token->scope->id && strlen($token->term)==0)
     {
       if ($tokens[$i]->modifier & QST_WILDCARD)
       {// eg. 'tag:*' returns all tagged images
@@ -1112,9 +982,19 @@ SELECT image_id FROM '.IMAGE_TAG_TABLE.'
       }
     }
   }
+
+  $all_tags = array_intersect_key($all_tags, array_flip( array_diff($positive_ids, $not_ids) ) );
+  usort($all_tags, 'tag_alpha_compare');
+  foreach ( $all_tags as &$tag )
+  {
+    $tag['name'] = trigger_event('render_tag_name', $tag['name'], $tag);
+  }
+  $qsr->all_tags = $all_tags;
+  $qsr->tag_ids = $token_tag_ids;
 }
 
 
+
 function qsearch_eval(QMultiToken $expr, QResults $qsr, &$qualifies, &$ignored_terms)
 {
   $qualifies = false; // until we find at least one positive term
@@ -1259,6 +1139,7 @@ function get_quick_search_results($q, $options)
   for ($i=0; $i<count($expression->stokens); $i++)
   {
     $debug[] = $expression->stokens[$i].': '.count($qsr->tag_ids[$i]).' tags, '.count($qsr->tag_iids[$i]).' tiids, '.count($qsr->images_iids[$i]).' iiids, '.count($qsr->iids[$i]).' iids'
+      .' modifier:'.dechex($expression->stoken_modifiers[$i])
       .( !empty($expression->stokens[$i]->variants) ? ' variants: '.implode(', ',$expression->stokens[$i]->variants): '');
   }
   $debug[] = 'before perms '.count($ids);
author	rvelices <rv-github@modusoptimus.com>	2014-04-09 21:23:49 +0000
committer	rvelices <rv-github@modusoptimus.com>	2014-04-09 21:23:49 +0000
commit	3ceca0d755df8f7dc229582df2efe3b96853c263 (patch)
tree	1e784c7c924181efef9f256f40601182e6f1ed16 /include/functions_search.inc.php
parent	2c3eb75ee9808b798e7cc3231f0bf1f2f1e86960 (diff)