merge -r10340 from trunk feature:2248 Improve quick/query search results

git-svn-id: http://piwigo.org/svn/branches/2.2@10427 68402e56-0260-453c-a942-63ccdbb3a9ee
2011-04-16 18:15:52 +00:00 · 2011-04-16 18:15:52 +00:00 · 3594e552d7
commit 3594e552d7
parent bc1cf1e420
1 changed files with 187 additions and 59 deletions
--- a/include/functions_search.inc.php
+++ b/include/functions_search.inc.php
@ -265,17 +265,34 @@ SELECT DISTINCT(id)
  return $items;
 }

+
+if (function_exists('mb_strtolower'))
+{
+  function transliterate($term)
+  {
+    return remove_accents( mb_strtolower($term) );
+  }
+}
+else
+{
+  function transliterate($term)
+  {
+    return remove_accents( strtolower($term) );
+  }
+}
+
+function is_word_char($ch)
+{
+  return ($ch>='0' && $ch<='9') || ($ch>='a' && $ch<='z') || ($ch>='A' && $ch<='Z') || ord($ch)>127;
+}
+
 /**
- * returns the LIKE sql clause corresponding to the quick search query $q
- * and the field $field. example q='john bill', field='file' will return
- * file LIKE '%john%' OR file LIKE '%bill%'. Special characters for MySql full
- * text search (+,<,>,~) are omitted. The query can contain a phrase:
- * 'Pierre "New York"' will return LIKE '%Pierre%' OR LIKE '%New York%'.
- * @param string q
- * @param string field
- * @return string
+ * analyzes and splits the quick/query search query $q into tokens
+ * q='john bill' => 2 tokens 'john' 'bill'
+ * Special characters for MySql full text search (+,<,>,~) appear in the token modifiers.
+ * The query can contain a phrase: 'Pierre "New York"' will return 'pierre' qnd 'new york'.
 */
-function get_qsearch_like_clause($q, $field, $before='%', $after='%')
+function analyse_qsearch($q, &$qtokens, &$qtoken_modifiers)
 {
  $q = stripslashes($q);
  $tokens = array();
@ -292,27 +309,27 @@ function get_qsearch_like_clause($q, $field, $before='%', $after='%')
      case 0:
        if ($ch=='"')
        {
-          if (strlen($crt_token))
-          {
-            $tokens[] = $crt_token;
-            $token_modifiers[] = $crt_token_modifier;
-            $crt_token = "";
-            $crt_token_modifier = "";
-          }
+          $tokens[] = $crt_token; $token_modifiers[] = $crt_token_modifier;
+          $crt_token = ""; $crt_token_modifier = "q";
          $state=1;
        }
        elseif ( $ch=='*' )
        { // wild card
-          $crt_token .= '%';
+          if (strlen($crt_token))
+          {
+            $crt_token .= $ch;
+          }
+          else
+          {
+            $crt_token_modifier .= '*';
+          }
        }
        elseif ( strcspn($ch, '+-><~')==0 )
        { //special full text modifier
          if (strlen($crt_token))
          {
-            $tokens[] = $crt_token;
-            $token_modifiers[] = $crt_token_modifier;
-            $crt_token = "";
-            $crt_token_modifier = "";
+            $tokens[] = $crt_token; $token_modifiers[] = $crt_token_modifier;
+            $crt_token = ""; $crt_token_modifier = "";
          }
          $crt_token_modifier .= $ch;
        }
@ -320,18 +337,12 @@ function get_qsearch_like_clause($q, $field, $before='%', $after='%')
        { // white space
          if (strlen($crt_token))
          {
-            $tokens[] = $crt_token;
-            $token_modifiers[] = $crt_token_modifier;
-            $crt_token = "";
-            $crt_token_modifier = "";
+            $tokens[] = $crt_token; $token_modifiers[] = $crt_token_modifier;
+            $crt_token = ""; $crt_token_modifier = "";
          }
        }
        else
        {
-          if ( strcspn($ch, '%_')==0)
-          {// escape LIKE specials %_
-            $ch = '\\'.$ch;
-          }
          $crt_token .= $ch;
        }
        break;
@ -339,17 +350,11 @@ function get_qsearch_like_clause($q, $field, $before='%', $after='%')
        switch ($ch)
        {
          case '"':
-            $tokens[] = $crt_token;
-            $token_modifiers[] = $crt_token_modifier;
-            $crt_token = "";
-            $crt_token_modifier = "";
+            $tokens[] = $crt_token; $token_modifiers[] = $crt_token_modifier;
+            $crt_token = ""; $crt_token_modifier = "";
            $state=0;
            break;
          default:
-            if ( strcspn($ch, '%_')==0)
-            {// escape LIKE specials %_
-            	$ch = '\\'.$ch;
-            }
            $crt_token .= $ch;
        }
        break;
@ -361,21 +366,49 @@ function get_qsearch_like_clause($q, $field, $before='%', $after='%')
    $token_modifiers[] = $crt_token_modifier;
  }

+  $qtokens = array();
+  $qtoken_modifiers = array();
+  for ($i=0; $i<count($tokens); $i++)
+  {
+    if (strstr($token_modifiers[$i], 'q')===false)
+    {
+      if ( substr($tokens[$i], -1)=='*' )
+      {
+        $tokens[$i] = rtrim($tokens[$i], '*');
+        $token_modifiers[$i] .= '*';
+      }
+    }
+    if ( strlen($tokens[$i])==0)
+      continue;
+    $qtokens[] = $tokens[$i];
+    $qtoken_modifiers[] = $token_modifiers[$i];
+  }
+}
+
+
+/**
+ * returns the LIKE sql clause corresponding to the quick search query 
+ * that has been split into tokens
+ * for example file LIKE '%john%' OR file LIKE '%bill%'.
+ */
+function get_qsearch_like_clause($tokens, $token_modifiers, $field)
+{
  $clauses = array();
  for ($i=0; $i<count($tokens); $i++)
  {
-    $tokens[$i] = trim($tokens[$i], '%');
+    $token = trim($tokens[$i], '%');
    if (strstr($token_modifiers[$i], '-')!==false)
      continue;
-    if ( strlen($tokens[$i])==0)
+    if ( strlen($token==0) )
      continue;
-    $clauses[] = $field.' LIKE \''.$before.addslashes($tokens[$i]).$after.'\'';
+    $token = addslashes($token);
+    $token = str_replace( array('%','_'), array('\\%','\\_'), $token); // escape LIKE specials %_
+    $clauses[] = $field.' LIKE \'%'.$token.'%\'';
  }

  return count($clauses) ? '('.implode(' OR ', $clauses).')' : null;
 }

-
 /**
 * returns the search results corresponding to a quick/query search.
 * A quick/query search returns many items (search is not strict), but results
@ -395,6 +428,8 @@ function get_qsearch_like_clause($q, $field, $before='%', $after='%')
 */
 function get_quick_search_results($q, $super_order_by, $images_where='')
 {
+  global $user, $conf;
+
  $search_results =
    array(
      'items' => array(),
@ -405,9 +440,11 @@ function get_quick_search_results($q, $super_order_by, $images_where='')
  {
    return $search_results;
  }
-  $q_like_field = '@@__db_field__@@'; //something never in a search
-  $q_like_clause = get_qsearch_like_clause($q, $q_like_field );
+  
+  analyse_qsearch($q, $tokens, $token_modifiers);

+  $q_like_field = '@@__db_field__@@'; //something never in a search
+  $q_like_clause = get_qsearch_like_clause($tokens, $token_modifiers, $q_like_field );

  // Step 1 - first we find matches in #images table ===========================
  $where_clauses='MATCH(i.name, i.comment) AGAINST( \''.$q.'\' IN BOOLEAN MODE)';
@ -448,34 +485,126 @@ SELECT i.id,


  // Step 2 - search tags corresponding to the query $q ========================
-  if (!empty($q_like_clause))
-  { // search name and url name (without accents)
-    $query = '
-SELECT id, name, url_name
+  $transliterated_tokens = array();
+  $token_tags = array();
+  foreach ($tokens as $token)
+  {
+    $transliterated_tokens[] = transliterate($token);
+    $token_tags[] = array();
+  }
+
+  // Step 2.1 - find match tags for every token in the query search
+  $all_tags = array();
+  $query = '
+SELECT id, name, url_name, COUNT(image_id) AS nb_images
  FROM '.TAGS_TABLE.'
-  WHERE ('.str_replace($q_like_field, 'CONVERT(name, CHAR)', $q_like_clause).'
-    OR '.str_replace($q_like_field, 'url_name', $q_like_clause).')';
-    $tags = hash_from_query($query, 'id');
-    if ( !empty($tags) )
-    { // we got some tags; get the images
-      $search_results['qs']['matching_tags']=$tags;
+    INNER JOIN '.IMAGE_TAG_TABLE.' ON id=tag_id
+  GROUP BY id';
+  $result = pwg_query($query);
+  while ($tag = pwg_db_fetch_assoc($result))
+  {
+    $transliterated_tag = transliterate($tag['name']);
+
+    // find how this tag matches query tokens
+    for ($i=0; $i<count($tokens); $i++)
+    {
+      if (strstr($token_modifiers[$i], '-')!==false)
+        continue;// ignore this NOT token
+      $transliterated_token = $transliterated_tokens[$i];
+
+      $match = false;
+      $pos = 0;
+      while ( ($pos = strpos($transliterated_tag, $transliterated_token, $pos)) !== false)
+      {
+        if (strstr($token_modifiers[$i], '*')!==false)
+        {// wildcard in this token
+          $match = 1;
+          break;
+        }
+        $token_len = strlen($transliterated_token);
+
+        $word_begin = $pos;
+        while ($word_begin>0)
+        {
+          if (! is_word_char($transliterated_tag[$word_begin-1]) )
+            break;
+          $word_begin--;
+        }
+
+        $word_end = $pos + $token_len;
+        while ($word_end<strlen($transliterated_tag) && is_word_char($transliterated_tag[$word_end]) )
+          $word_end++;
+
+        $this_score = $token_len / ($word_end-$word_begin);
+        if ($token_len <= 2)
+        {// search for 1 or 2 characters must match exactly to avoid retrieving too much data
+          if ($token_len != $word_end-$word_begin)
+            $this_score = 0;
+        }
+        elseif ($token_len == 3)
+        {
+          if ($word_end-$word_begin > 4)
+            $this_score = 0;
+        }
+
+        if ($this_score>0)
+          $match = max($match, $this_score );
+        $pos++;
+      }
+
+      if ($match)
+      {
+        $tag_id = (int)$tag['id'];
+        $all_tags[$tag_id] = $tag;
+        $token_tags[$i][] = array('tag_id'=>$tag_id, 'score'=>$match);
+      }
+    }
+  }
+  $search_results['qs']['matching_tags']=$all_tags;
+
+  // Step 2.2 - reduce matching tags for every token in the query search
+  $score_cmp_fn = create_function('$a,$b', 'return 100*($b["score"]-$a["score"]);');
+  foreach ($token_tags as &$tt)
+  {
+    usort($tt, $score_cmp_fn);
+    $nb_images = 0;
+    $prev_score = 0;
+    for ($j=0; $j<count($tt); $j++)
+    {
+      if ($nb_images > 200 && $prev_score > $tt[$j]['score'] )
+      {// "many" images in previous tags and starting from this tag is less relevent
+        $tt = array_slice( $tt, 0, $j);
+        break;
+      }
+      $nb_images += $all_tags[ $tt[$j]['tag_id'] ]['nb_images'];
+      $prev_score = $tt[$j]['score'];
+    }
+  }
+
+  // Step 2.3 - get the images for tags
+  for ($i=0; $i<count($token_tags); $i++)
+  {
+    $tag_ids = array();
+    foreach($token_tags[$i] as $arr)
+      $tag_ids[] = $arr['tag_id'];
+
+    if (!empty($tag_ids))
+    {
      $query = '
-SELECT image_id, COUNT(tag_id) AS weight
+SELECT image_id
  FROM '.IMAGE_TAG_TABLE.'
-  WHERE tag_id IN ('.implode(',',array_keys($tags)).')
+  WHERE tag_id IN ('.implode(',',$tag_ids).')
  GROUP BY image_id';
      $result = pwg_query($query);
      while ($row = pwg_db_fetch_assoc($result))
      { // weight is important when sorting images by relevance
        $image_id=(int)$row['image_id'];
-        @$by_weights[$image_id] += $row['weight'];
+        @$by_weights[$image_id] += 1;
      }
    }
  }

-
  // Step 3 - search categories corresponding to the query $q ==================
-  global $user;
  $query = '
 SELECT id, name, permalink, nb_images
  FROM '.CATEGORIES_TABLE.'
@ -531,7 +660,6 @@ SELECT id, name, permalink, nb_images
      null,true
    );

-  global $conf;
  $query = '
 SELECT DISTINCT(id)
  FROM '.IMAGES_TABLE.' i