aboutsummaryrefslogtreecommitdiffstats
path: root/include
diff options
context:
space:
mode:
authorrvelices <rv-github@modusoptimus.com>2011-04-16 18:15:52 +0000
committerrvelices <rv-github@modusoptimus.com>2011-04-16 18:15:52 +0000
commit3594e552d73ff20d4d8be119d38b29c0ae55ffdd (patch)
tree13e1d837cad86be21df0571fbca602287696b1d0 /include
parentbc1cf1e4201eac68edbf4f8f6887604ee23c38de (diff)
merge -r10340 from trunk feature:2248 Improve quick/query search results
git-svn-id: http://piwigo.org/svn/branches/2.2@10427 68402e56-0260-453c-a942-63ccdbb3a9ee
Diffstat (limited to '')
-rw-r--r--include/functions_search.inc.php246
1 files changed, 187 insertions, 59 deletions
diff --git a/include/functions_search.inc.php b/include/functions_search.inc.php
index 220ee0d4e..b25d41050 100644
--- a/include/functions_search.inc.php
+++ b/include/functions_search.inc.php
@@ -265,17 +265,34 @@ SELECT DISTINCT(id)
return $items;
}
+
+if (function_exists('mb_strtolower'))
+{
+ function transliterate($term)
+ {
+ return remove_accents( mb_strtolower($term) );
+ }
+}
+else
+{
+ function transliterate($term)
+ {
+ return remove_accents( strtolower($term) );
+ }
+}
+
+function is_word_char($ch)
+{
+ return ($ch>='0' && $ch<='9') || ($ch>='a' && $ch<='z') || ($ch>='A' && $ch<='Z') || ord($ch)>127;
+}
+
/**
- * returns the LIKE sql clause corresponding to the quick search query $q
- * and the field $field. example q='john bill', field='file' will return
- * file LIKE '%john%' OR file LIKE '%bill%'. Special characters for MySql full
- * text search (+,<,>,~) are omitted. The query can contain a phrase:
- * 'Pierre "New York"' will return LIKE '%Pierre%' OR LIKE '%New York%'.
- * @param string q
- * @param string field
- * @return string
+ * analyzes and splits the quick/query search query $q into tokens
+ * q='john bill' => 2 tokens 'john' 'bill'
+ * Special characters for MySql full text search (+,<,>,~) appear in the token modifiers.
+ * The query can contain a phrase: 'Pierre "New York"' will return 'pierre' qnd 'new york'.
*/
-function get_qsearch_like_clause($q, $field, $before='%', $after='%')
+function analyse_qsearch($q, &$qtokens, &$qtoken_modifiers)
{
$q = stripslashes($q);
$tokens = array();
@@ -292,27 +309,27 @@ function get_qsearch_like_clause($q, $field, $before='%', $after='%')
case 0:
if ($ch=='"')
{
- if (strlen($crt_token))
- {
- $tokens[] = $crt_token;
- $token_modifiers[] = $crt_token_modifier;
- $crt_token = "";
- $crt_token_modifier = "";
- }
+ $tokens[] = $crt_token; $token_modifiers[] = $crt_token_modifier;
+ $crt_token = ""; $crt_token_modifier = "q";
$state=1;
}
elseif ( $ch=='*' )
{ // wild card
- $crt_token .= '%';
+ if (strlen($crt_token))
+ {
+ $crt_token .= $ch;
+ }
+ else
+ {
+ $crt_token_modifier .= '*';
+ }
}
elseif ( strcspn($ch, '+-><~')==0 )
{ //special full text modifier
if (strlen($crt_token))
{
- $tokens[] = $crt_token;
- $token_modifiers[] = $crt_token_modifier;
- $crt_token = "";
- $crt_token_modifier = "";
+ $tokens[] = $crt_token; $token_modifiers[] = $crt_token_modifier;
+ $crt_token = ""; $crt_token_modifier = "";
}
$crt_token_modifier .= $ch;
}
@@ -320,18 +337,12 @@ function get_qsearch_like_clause($q, $field, $before='%', $after='%')
{ // white space
if (strlen($crt_token))
{
- $tokens[] = $crt_token;
- $token_modifiers[] = $crt_token_modifier;
- $crt_token = "";
- $crt_token_modifier = "";
+ $tokens[] = $crt_token; $token_modifiers[] = $crt_token_modifier;
+ $crt_token = ""; $crt_token_modifier = "";
}
}
else
{
- if ( strcspn($ch, '%_')==0)
- {// escape LIKE specials %_
- $ch = '\\'.$ch;
- }
$crt_token .= $ch;
}
break;
@@ -339,17 +350,11 @@ function get_qsearch_like_clause($q, $field, $before='%', $after='%')
switch ($ch)
{
case '"':
- $tokens[] = $crt_token;
- $token_modifiers[] = $crt_token_modifier;
- $crt_token = "";
- $crt_token_modifier = "";
+ $tokens[] = $crt_token; $token_modifiers[] = $crt_token_modifier;
+ $crt_token = ""; $crt_token_modifier = "";
$state=0;
break;
default:
- if ( strcspn($ch, '%_')==0)
- {// escape LIKE specials %_
- $ch = '\\'.$ch;
- }
$crt_token .= $ch;
}
break;
@@ -361,21 +366,49 @@ function get_qsearch_like_clause($q, $field, $before='%', $after='%')
$token_modifiers[] = $crt_token_modifier;
}
+ $qtokens = array();
+ $qtoken_modifiers = array();
+ for ($i=0; $i<count($tokens); $i++)
+ {
+ if (strstr($token_modifiers[$i], 'q')===false)
+ {
+ if ( substr($tokens[$i], -1)=='*' )
+ {
+ $tokens[$i] = rtrim($tokens[$i], '*');
+ $token_modifiers[$i] .= '*';
+ }
+ }
+ if ( strlen($tokens[$i])==0)
+ continue;
+ $qtokens[] = $tokens[$i];
+ $qtoken_modifiers[] = $token_modifiers[$i];
+ }
+}
+
+
+/**
+ * returns the LIKE sql clause corresponding to the quick search query
+ * that has been split into tokens
+ * for example file LIKE '%john%' OR file LIKE '%bill%'.
+ */
+function get_qsearch_like_clause($tokens, $token_modifiers, $field)
+{
$clauses = array();
for ($i=0; $i<count($tokens); $i++)
{
- $tokens[$i] = trim($tokens[$i], '%');
+ $token = trim($tokens[$i], '%');
if (strstr($token_modifiers[$i], '-')!==false)
continue;
- if ( strlen($tokens[$i])==0)
+ if ( strlen($token==0) )
continue;
- $clauses[] = $field.' LIKE \''.$before.addslashes($tokens[$i]).$after.'\'';
+ $token = addslashes($token);
+ $token = str_replace( array('%','_'), array('\\%','\\_'), $token); // escape LIKE specials %_
+ $clauses[] = $field.' LIKE \'%'.$token.'%\'';
}
return count($clauses) ? '('.implode(' OR ', $clauses).')' : null;
}
-
/**
* returns the search results corresponding to a quick/query search.
* A quick/query search returns many items (search is not strict), but results
@@ -395,6 +428,8 @@ function get_qsearch_like_clause($q, $field, $before='%', $after='%')
*/
function get_quick_search_results($q, $super_order_by, $images_where='')
{
+ global $user, $conf;
+
$search_results =
array(
'items' => array(),
@@ -405,9 +440,11 @@ function get_quick_search_results($q, $super_order_by, $images_where='')
{
return $search_results;
}
- $q_like_field = '@@__db_field__@@'; //something never in a search
- $q_like_clause = get_qsearch_like_clause($q, $q_like_field );
+
+ analyse_qsearch($q, $tokens, $token_modifiers);
+ $q_like_field = '@@__db_field__@@'; //something never in a search
+ $q_like_clause = get_qsearch_like_clause($tokens, $token_modifiers, $q_like_field );
// Step 1 - first we find matches in #images table ===========================
$where_clauses='MATCH(i.name, i.comment) AGAINST( \''.$q.'\' IN BOOLEAN MODE)';
@@ -448,34 +485,126 @@ SELECT i.id,
// Step 2 - search tags corresponding to the query $q ========================
- if (!empty($q_like_clause))
- { // search name and url name (without accents)
- $query = '
-SELECT id, name, url_name
+ $transliterated_tokens = array();
+ $token_tags = array();
+ foreach ($tokens as $token)
+ {
+ $transliterated_tokens[] = transliterate($token);
+ $token_tags[] = array();
+ }
+
+ // Step 2.1 - find match tags for every token in the query search
+ $all_tags = array();
+ $query = '
+SELECT id, name, url_name, COUNT(image_id) AS nb_images
FROM '.TAGS_TABLE.'
- WHERE ('.str_replace($q_like_field, 'CONVERT(name, CHAR)', $q_like_clause).'
- OR '.str_replace($q_like_field, 'url_name', $q_like_clause).')';
- $tags = hash_from_query($query, 'id');
- if ( !empty($tags) )
- { // we got some tags; get the images
- $search_results['qs']['matching_tags']=$tags;
+ INNER JOIN '.IMAGE_TAG_TABLE.' ON id=tag_id
+ GROUP BY id';
+ $result = pwg_query($query);
+ while ($tag = pwg_db_fetch_assoc($result))
+ {
+ $transliterated_tag = transliterate($tag['name']);
+
+ // find how this tag matches query tokens
+ for ($i=0; $i<count($tokens); $i++)
+ {
+ if (strstr($token_modifiers[$i], '-')!==false)
+ continue;// ignore this NOT token
+ $transliterated_token = $transliterated_tokens[$i];
+
+ $match = false;
+ $pos = 0;
+ while ( ($pos = strpos($transliterated_tag, $transliterated_token, $pos)) !== false)
+ {
+ if (strstr($token_modifiers[$i], '*')!==false)
+ {// wildcard in this token
+ $match = 1;
+ break;
+ }
+ $token_len = strlen($transliterated_token);
+
+ $word_begin = $pos;
+ while ($word_begin>0)
+ {
+ if (! is_word_char($transliterated_tag[$word_begin-1]) )
+ break;
+ $word_begin--;
+ }
+
+ $word_end = $pos + $token_len;
+ while ($word_end<strlen($transliterated_tag) && is_word_char($transliterated_tag[$word_end]) )
+ $word_end++;
+
+ $this_score = $token_len / ($word_end-$word_begin);
+ if ($token_len <= 2)
+ {// search for 1 or 2 characters must match exactly to avoid retrieving too much data
+ if ($token_len != $word_end-$word_begin)
+ $this_score = 0;
+ }
+ elseif ($token_len == 3)
+ {
+ if ($word_end-$word_begin > 4)
+ $this_score = 0;
+ }
+
+ if ($this_score>0)
+ $match = max($match, $this_score );
+ $pos++;
+ }
+
+ if ($match)
+ {
+ $tag_id = (int)$tag['id'];
+ $all_tags[$tag_id] = $tag;
+ $token_tags[$i][] = array('tag_id'=>$tag_id, 'score'=>$match);
+ }
+ }
+ }
+ $search_results['qs']['matching_tags']=$all_tags;
+
+ // Step 2.2 - reduce matching tags for every token in the query search
+ $score_cmp_fn = create_function('$a,$b', 'return 100*($b["score"]-$a["score"]);');
+ foreach ($token_tags as &$tt)
+ {
+ usort($tt, $score_cmp_fn);
+ $nb_images = 0;
+ $prev_score = 0;
+ for ($j=0; $j<count($tt); $j++)
+ {
+ if ($nb_images > 200 && $prev_score > $tt[$j]['score'] )
+ {// "many" images in previous tags and starting from this tag is less relevent
+ $tt = array_slice( $tt, 0, $j);
+ break;
+ }
+ $nb_images += $all_tags[ $tt[$j]['tag_id'] ]['nb_images'];
+ $prev_score = $tt[$j]['score'];
+ }
+ }
+
+ // Step 2.3 - get the images for tags
+ for ($i=0; $i<count($token_tags); $i++)
+ {
+ $tag_ids = array();
+ foreach($token_tags[$i] as $arr)
+ $tag_ids[] = $arr['tag_id'];
+
+ if (!empty($tag_ids))
+ {
$query = '
-SELECT image_id, COUNT(tag_id) AS weight
+SELECT image_id
FROM '.IMAGE_TAG_TABLE.'
- WHERE tag_id IN ('.implode(',',array_keys($tags)).')
+ WHERE tag_id IN ('.implode(',',$tag_ids).')
GROUP BY image_id';
$result = pwg_query($query);
while ($row = pwg_db_fetch_assoc($result))
{ // weight is important when sorting images by relevance
$image_id=(int)$row['image_id'];
- @$by_weights[$image_id] += $row['weight'];
+ @$by_weights[$image_id] += 1;
}
}
}
-
// Step 3 - search categories corresponding to the query $q ==================
- global $user;
$query = '
SELECT id, name, permalink, nb_images
FROM '.CATEGORIES_TABLE.'
@@ -531,7 +660,6 @@ SELECT id, name, permalink, nb_images
null,true
);
- global $conf;
$query = '
SELECT DISTINCT(id)
FROM '.IMAGES_TABLE.' i