Initial checkin of the new boolean fulltext search code

This commit is contained in:
unknown 2001-09-21 18:38:17 +02:00
parent 2204fbea33
commit 51c261ff23
7 changed files with 294 additions and 197 deletions

View file

@ -53,8 +53,8 @@ void ft_free_stopwords(void);
/* Prototypes of the search entry points; both operate on an FT_DOCLIST. */
FT_DOCLIST * ft_init_search(void *, uint, byte *, uint, my_bool);
int ft_read_next(FT_DOCLIST *, char *);
/* Releases a search handle with my_free(). */
#define ft_close_search(handler) my_free(((gptr)(handler)),MYF(0))
/*
  NOTE(review): ft_get_relevance and ft_get_docid are each defined twice
  below. The first pair dereferences handler directly; the second pair casts
  handler to FT_DOCLIST * first, so it also accepts a void * handle. Only one
  definition of each macro should remain.
*/
#define ft_get_relevance(handler) ((handler)->doc[(handler)->curdoc].weight)
#define ft_get_docid(handler) ((handler)->doc[(handler)->curdoc].dpos)
#define ft_get_relevance(handler) (((FT_DOCLIST *)(handler))->doc[((FT_DOCLIST *)(handler))->curdoc].weight)
#define ft_get_docid(handler) (((FT_DOCLIST *)(handler))->doc[((FT_DOCLIST *)(handler))->curdoc].dpos)
/* Rewinds the handle by setting curdoc to -1. */
#define ft_reinit_search(handler) (((FT_DOCLIST *)(handler))->curdoc=-1)
#ifdef __cplusplus

View file

@ -17,52 +17,10 @@
/* Written by Sergei A. Golubchik, who has a shared copyright to this code */
#include "ftdefs.h"
#include <queues.h>
/* search with boolean queries */
/*
  Working state for one run of the tree-based boolean search: the table and
  index being searched, the query text still to be tokenized, and the tree
  of per-document accumulators.
*/
typedef struct st_all_in_one {
MI_INFO *info; /* table handle being searched */
uint keynr; /* number of the fulltext index in that table */
uchar *keybuff; /* scratch buffer the search key is built in */
MI_KEYDEF *keyinfo; /* definition of index keynr */
my_off_t key_root; /* root position of index keynr */
TREE dtree; /* FT_SUPERDOC elements, compared by FT_SUPERDOC_cmp */
byte *start, *end; /* not yet tokenized part of the query, and its end */
uint total_yes, total_no; /* query words seen with yesno>0 / yesno<0 */
} ALL_IN_ONE;
/*
  Per-document accumulator stored in ALL_IN_ONE.dtree while a boolean query
  is evaluated.
*/
typedef struct st_ft_superdoc {
FT_DOC doc; /* document position (dpos) and accumulated weight */
//FT_WORD *word_ptr;
//double tmp_weight;
uint yes; /* distinct query words with yesno>0 found in this document */
uint no; /* distinct query words with yesno<0 found in this document */
uint wno; /* number of the last query word counted, so a word counts once */
ALL_IN_ONE *aio; /* owning search state; walk_and_copy reads total_yes from it */
} FT_SUPERDOC;
/*
  Tree comparison callback: orders FT_SUPERDOC elements by ascending
  document position. Returns -1, 0 or 1; cmp_arg is not used.
*/
static int FT_SUPERDOC_cmp(void* cmp_arg __attribute__((unused)),
                           FT_SUPERDOC *p1, FT_SUPERDOC *p2)
{
  return (p1->doc.dpos < p2->doc.dpos) ? -1 :
         (p1->doc.dpos == p2->doc.dpos) ? 0 : 1;
}
/*
  tree_walk() action: appends a document to the output array when it matched
  every "yes" word of the query and none of the "no" words.

  from   accumulator of the document being visited
  count  not used
  to     in/out cursor into the output array; advanced past each copied entry

  Always returns 0.
*/
static int walk_and_copy(FT_SUPERDOC *from,
                         uint32 count __attribute__((unused)), FT_DOC **to)
{
  FT_DOC *slot;

  /* Skip documents that miss a required word or contain a forbidden one. */
  if (from->yes != from->aio->total_yes || from->no)
    return 0;

  slot= *to;
  slot->dpos= from->doc.dpos;
  slot->weight= from->doc.weight;
  *to= slot + 1;
  return 0;
}
static double _wghts[11]={
0.131687242798354,
0.197530864197531,
@ -91,136 +49,268 @@ static double _nwghts[11]={
-3.796875000000000};
static double *nwghts=_nwghts+5; // nwghts[i] = -0.5*1.5**i
int do_boolean(ALL_IN_ONE *aio, uint nested __attribute__((unused)),
int yesno __attribute__((unused)),
int plusminus, bool pmsign)
/*
  Node of the parsed boolean expression tree: one per parenthesized
  subexpression, plus the root created by ft_boolean_search_init().
*/
typedef struct st_ftb_expr FTB_EXPR;
struct st_ftb_expr {
FTB_EXPR *up; /* enclosing expression, 0 for the root */
float weight; /* factor applied to cur_weight when this expression matches */
int yesno; /* sign passed to the parent on a match: >0 "yes", <0 "no", 0 neither */
my_off_t docid; /* document that cur_weight/yesses/nos currently describe */
float cur_weight; /* weight accumulated for docid so far */
int yesses; /* number of "yes" words matched */
int nos; /* number of "no" words matched */
int ythresh; /* number of "yes" words in expr */
};
/*
  One word of a boolean query. Instances are the elements of FTB.queue.
  The first three members have the same types and order as the first three
  members of FTB_EXPR.
*/
typedef struct {
FTB_EXPR *up; /* expression the word belongs to */
float weight; /* weight contributed when the word is found in a document */
int yesno; /* >0 "yes" word, <0 "no" word, 0 neither */
int trunc; /* 0 or 1, from the parser; also the offset where key comparison starts */
my_off_t docid; /* document the word currently points at; HA_POS_ERROR until found */
uint ndepth; /* secondary sort key of the queue (descending) */
int len; /* number of key bytes searched for and compared */
/* ... there can be docid cache added here. SerG */
byte word[1]; /* key buffer, allocated larger than declared: length byte + word,
                 overwritten with the last index key found */
} FTB_WORD;
/*
  Handle of a boolean search in progress, created by
  ft_boolean_search_init() and stepped by ft_boolean_search_next().
*/
typedef struct st_ftb_handler {
MI_INFO *info; /* table handle being searched */
uint keynr; /* number of the fulltext index in that table */
int ok; /* set to 1 at init; cleared when a required top-level word is missing */
FTB_EXPR *root; /* root of the expression tree */
QUEUE queue; /* FTB_WORD elements ordered by FTB_WORD_cmp */
MEM_ROOT mem_root; /* allocator for the expressions, words and queue array */
} FTB;
/*
  Queue ordering callback for FTB_WORD elements:
  ORDER BY docid, ndepth DESC.

  v     queue argument, not used
  a, b  the two FTB_WORD elements to compare

  Returns -1, 0 or 1.

  The fields are compared directly instead of subtracting them: docid is a
  my_off_t and ndepth is unsigned, so a difference squeezed into an int can
  be truncated or wrap and report the wrong order.
*/
int FTB_WORD_cmp(void *v __attribute__((unused)), byte *a, byte *b)
{
  const FTB_WORD *wa= (const FTB_WORD *) a;
  const FTB_WORD *wb= (const FTB_WORD *) b;

  if (wa->docid != wb->docid)
    return (wa->docid < wb->docid) ? -1 : 1;
  if (wa->ndepth != wb->ndepth)
    return (wa->ndepth > wb->ndepth) ? -1 : 1;   /* deeper first */
  return 0;
}
FT_DOCLIST *ft_boolean_search(MI_INFO *info, uint keynr, byte *query,
/*
  Parses (part of) a boolean query into FTB_EXPR / FTB_WORD nodes and
  positions every word on its first index entry.

  ftb     search handle; ftb->ok is cleared when the search cannot match
          (a required top-level word is absent, or an allocation failed)
  start   in/out parse position, shared with the recursive calls
  end     end of the query text
  up      expression that the parsed words and subexpressions belong to
  ndepth  queue sort depth inherited by words whose yesno is not negative
  depth   nesting depth of `up`

  Returns when the text is exhausted, on a closing bracket, or as soon as
  ftb->ok is cleared.
*/
void _ftb_parse_query(FTB *ftb, byte **start, byte *end,
                      FTB_EXPR *up, uint ndepth, uint depth)
{
  byte       res;
  FTB_PARAM  param;
  FT_WORD    w;
  FTB_WORD  *ftbw;
  FTB_EXPR  *ftbe;
  MI_INFO   *info=ftb->info;
  int        r;
  MI_KEYDEF *keyinfo=info->s->keyinfo+ftb->keynr;
  my_off_t   keyroot=info->s->state.key_root[ftb->keynr];
  uint       extra=HA_FT_WLEN+info->s->rec_reflength; /* just a shortcut */

  if (! ftb->ok)
    return;

  /* NOTE(review): presumably the tokenizer reads param.prev as the
     character preceding the parse position; start from a blank instead of
     an indeterminate value - confirm against the tokenizer. */
  param.prev=' ';

  /* start is already the caller's byte **; it is passed through unchanged
     so that the parse position advances for every nesting level. */
  while ((res=ftb_get_word(start,end,&w,&param)))
  {
    /* int, not byte: the value is compared with negative bounds */
    int   pm=param.plusminus;
    float weight=(param.pmsign ? nwghts : wghts)[(pm>5)?5:((pm<-5)?-5:pm)];

    switch (res) {
    case FTB_LBR:
      ftbe=(FTB_EXPR *)alloc_root(&ftb->mem_root, sizeof(FTB_EXPR));
      if (!ftbe)
      {
        ftb->ok=0;
        return;
      }
      ftbe->yesno=param.yesno;
      ftbe->weight=weight;
      ftbe->up=up;
      ftbe->ythresh=0;
      ftbe->docid=HA_POS_ERROR;
      /* count the subexpression itself, not whatever word was parsed last */
      if (ftbe->yesno > 0) up->ythresh++;
      /* NOTE(review): arguments follow the (ndepth, depth) order of the
         signature; confirm the intended queue ordering of nested words. */
      _ftb_parse_query(ftb, start, end, ftbe,
                       (param.yesno<0 ? depth+1 : ndepth), depth+1);
      if (! ftb->ok)
        return;
      break;
    case FTB_RBR:
      return;
    case 1:
      ftbw=(FTB_WORD *)alloc_root(&ftb->mem_root,
              sizeof(FTB_WORD) + (param.trunc ? MI_MAX_KEY_BUFF : w.len+extra));
      if (!ftbw)
      {
        ftb->ok=0;
        return;
      }
      ftbw->len=w.len + !param.trunc;
      ftbw->yesno=param.yesno;
      ftbw->trunc=param.trunc; /* 0 or 1 */
      ftbw->weight=weight;
      ftbw->up=up;
      ftbw->docid=HA_POS_ERROR;
      ftbw->ndepth= param.yesno<0 ? depth : ndepth;
      memcpy(ftbw->word+1, w.pos, w.len);
      ftbw->word[0]=w.len;
      if (ftbw->yesno > 0) up->ythresh++;
      /*****************************************/
      r=_mi_search(info, keyinfo, ftbw->word, ftbw->len,
                   SEARCH_FIND | SEARCH_PREFIX, keyroot);
      if (!r)
      {
        r=_mi_compare_text(default_charset_info,
                           info->lastkey+ftbw->trunc,ftbw->len,
                           ftbw->word+ftbw->trunc,ftbw->len,0);
      }
      if (r) /* not found */
      {
        if (ftbw->yesno>0 && ftbw->up->up==0)
        { /* this word MUST BE present in every document returned,
             so we can abort the search right now */
          ftb->ok=0;
          return;
        }
      }
      else
      {
        /* remember the full key found and queue the word on its document */
        memcpy(ftbw->word, info->lastkey, info->lastkey_length);
        ftbw->docid=info->lastpos;
        queue_insert(& ftb->queue, (byte *)ftbw);
      }
      /*****************************************/
      break;
    }
  }
  return;
}
/*
  Creates the handle for a boolean fulltext search and parses the query.

  info       table handle
  keynr      number of the fulltext index to search
  query      query text
  query_len  length of the query text in bytes

  Returns the new handle, or 0 if the handle itself cannot be allocated.
  When a later allocation fails the handle is returned with ok=0, so
  ft_boolean_search_next() reports end of file without touching the queue.
*/
FTB * ft_boolean_search_init(MI_INFO *info, uint keynr, byte *query,
                             uint query_len)
{
  FTB       *ftb;
  FTB_EXPR  *ftbe;
  uint       res;

  if (!(ftb=(FTB *)my_malloc(sizeof(FTB), MYF(MY_WME))))
    return 0;
  ftb->ok=1;
  ftb->info=info;
  ftb->keynr=keynr;
  ftb->root=0;

  init_alloc_root(&ftb->mem_root, query_len,0);

  /* hack: instead of init_queue, we'll use reinit queue to be able
   * to alloc queue with alloc_root()
   */
  /* n words of at least ft_min_word_len bytes need n-1 separators, so up to
     (query_len+1)/(ft_min_word_len+1) of them fit in the query; without the
     +1 a query made of a single minimum-length word gets a capacity of 0.
     NOTE(review): assumes the tokenizer never returns shorter words. */
  res=ftb->queue.max_elements=(query_len+1)/(ft_min_word_len+1);
  ftb->queue.root=(byte **)alloc_root(&ftb->mem_root, (res+1)*sizeof(void*));
  ftbe=(FTB_EXPR *)alloc_root(&ftb->mem_root, sizeof(FTB_EXPR));
  if (!ftb->queue.root || !ftbe)
  {
    ftb->ok=0;
    return ftb;
  }
  reinit_queue(& ftb->queue, res, 0, 0, FTB_WORD_cmp, ftb);

  ftbe->weight=1;
  ftbe->yesno=1;
  ftbe->nos=1;           /* root reads as "not matched" until first reset */
  ftbe->cur_weight=0;    /* defined values in case the root is tested */
  ftbe->yesses=0;        /* before any word has propagated up to it   */
  ftbe->up=0;
  ftbe->ythresh=0;
  ftbe->docid=HA_POS_ERROR;
  ftb->root=ftbe;
  _ftb_parse_query(ftb, &query, query+query_len, ftbe, 0, 0);
  return ftb;
}
/*
  Finds the next document that satisfies the boolean query and reads its row.

  ftb     search handle from ft_boolean_search_init()
  record  buffer passed to the table's read_record function

  Returns 0 when a matching row was read, HA_ERR_END_OF_FILE (also stored in
  my_errno) when the search is exhausted, or my_errno after a failed index
  check, lock or row read.

  Words are consumed from the queue in docid order. Each word found in the
  current document updates the counters of its expression; a completed
  expression passes its own yesno and weight on to the enclosing one.
*/
int ft_boolean_search_next(FTB *ftb, char *record)
{
  FTB_EXPR  *ftbe;
  FTB_WORD  *ftbw;
  MI_INFO   *info=ftb->info;
  MI_KEYDEF *keyinfo=info->s->keyinfo+ftb->keynr;
  my_off_t   keyroot=info->s->state.key_root[ftb->keynr];
  my_off_t   curdoc;
  int        r;

  /* black magic ON */
  if ((int) _mi_check_index(info, ftb->keynr) < 0)
    return my_errno;
  if (_mi_readinfo(info, F_RDLCK, 1))
    return my_errno;
  /* black magic OFF */

  while(ftb->ok && ftb->queue.elements)
  {
    curdoc=((FTB_WORD *)queue_top(& ftb->queue))->docid;

    /* queue_remove() below can empty the queue, so test it before
       looking at the top element again */
    while (ftb->queue.elements &&
           curdoc==(ftbw=(FTB_WORD *)queue_top(& ftb->queue))->docid)
    {
      float weight=ftbw->weight;
      /* signed: an unsigned copy would make the yn<0 branch unreachable
         and count "no" words as "yes" words */
      int   yn=ftbw->yesno;

      for (ftbe=ftbw->up; ftbe; ftbe=ftbe->up)
      {
        if (ftbe->docid != curdoc)
        {
          /* first event for this expression in curdoc: reset counters */
          ftbe->cur_weight=ftbe->yesses=ftbe->nos=0;
          ftbe->docid=curdoc;
        }
        if (yn>0)
        {
          ftbe->cur_weight+=weight;
          if (++ftbe->yesses >= ftbe->ythresh && !ftbe->nos)
          {
            yn=ftbe->yesno;
            weight=ftbe->cur_weight*ftbe->weight;
          }
          else
            break;
        }
        else
        if (yn<0)
        {
          /* NOTE: special sort function of queue assures that all yn<0
           * events for every particular subexpression will happen
           * BEFORE all yn>=0 events. So no already matched expression
           * can become not-matched again.
           */
          ++ftbe->nos;
          break;
        }
        else
     /* if (yn==0) */
        {
          if (ftbe->yesses >= ftbe->ythresh && !ftbe->nos)
          {
            yn=ftbe->yesno;
            weight*=ftbe->weight;
          }
          else
          {
            ftbe->cur_weight+=weight;
            break;
          }
        }
      }

      /* update queue */
      /* NOTE(review): this repeats the SEARCH_FIND | SEARCH_PREFIX lookup
         used while parsing, with the same buffer and length. It is not
         evident from this function how the lookup moves past the entry just
         consumed; confirm that it cannot return curdoc again, which would
         keep this loop spinning. */
      r=_mi_search(info, keyinfo, ftbw->word, ftbw->len,
                   SEARCH_FIND | SEARCH_PREFIX, keyroot);
      if (!r)
      {
        r=_mi_compare_text(default_charset_info,
                           info->lastkey+ftbw->trunc,ftbw->len,
                           ftbw->word+ftbw->trunc,ftbw->len,0);
      }
      if (r) /* not found */
      {
        queue_remove(& ftb->queue, 0);
        if (ftbw->yesno>0 && ftbw->up->up==0)
        { /* this word MUST BE present in every document returned,
             so we can stop the search right now */
          ftb->ok=0;
        }
      }
      else
      {
        memcpy(ftbw->word, info->lastkey, info->lastkey_length);
        ftbw->docid=info->lastpos;
        queue_replaced(& ftb->queue);
      }
    }

    ftbe=ftb->root;
    /* the root's counters only describe curdoc if some word propagated up
       to it for this document; otherwise they are left from an earlier one */
    if (ftbe->docid==curdoc &&
        ftbe->cur_weight>0 && ftbe->yesses>=ftbe->ythresh && !ftbe->nos)
    {
      /* curdoc matched ! */
      info->update&= (HA_STATE_CHANGED | HA_STATE_ROW_CHANGED); /* why is this ? */
      /* info->lastpos=curdoc; */ /* do I need this ? */
      if (!(*info->read_record)(info,curdoc,record))
      {
        info->update|= HA_STATE_AKTIV;          /* Record is read */
        return 0;
      }
      return my_errno;
    }
  }
  return my_errno=HA_ERR_END_OF_FILE;
}

View file

@ -38,9 +38,9 @@ FT_DOCLIST *ft_init_search(void *info, uint keynr, byte *query,
return NULL;
/* black magic OFF */
if (is_boolean(query, query_len))
dlist=ft_boolean_search(info,keynr,query,query_len);
else
// if (is_boolean(query, query_len))
// dlist=ft_boolean_search(info,keynr,query,query_len);
// else
dlist=ft_nlq_search(info,keynr,query,query_len);
if(dlist && presort)
@ -72,3 +72,4 @@ int ft_read_next(FT_DOCLIST *handler, char *record)
}
return my_errno;
}

View file

@ -1903,7 +1903,7 @@ err:
return 0;
}
double Item_func_match::val()
double Item_func_match_nl::val()
{
if (ft_handler==NULL)
init_search(1);
@ -1922,7 +1922,7 @@ double Item_func_match::val()
/* we'll have to find ft_relevance manually in ft_handler array */
int a,b,c;
FT_DOC *docs=ft_handler->doc;
FT_DOC *docs=((FT_DOCLIST *)ft_handler)->doc;
my_off_t docid=table->file->row_position();
if ((null_value=(docid==HA_OFFSET_ERROR)))
@ -1930,7 +1930,7 @@ double Item_func_match::val()
// Assuming docs[] is sorted by dpos...
for (a=0, b=ft_handler->ndocs, c=(a+b)/2; b-a>1; c=(a+b)/2)
for (a=0, b=((FT_DOCLIST *)ft_handler)->ndocs, c=(a+b)/2; b-a>1; c=(a+b)/2)
{
if (docs[c].dpos > docid)
b=c;
@ -1941,7 +1941,6 @@ double Item_func_match::val()
return docs[a].weight;
else
return 0.0;
}
void Item_func_match::init_search(bool no_order)
@ -1969,9 +1968,7 @@ void Item_func_match::init_search(bool no_order)
tmp2.set("",0);
}
ft_handler=(FT_DOCLIST *)
table->file->ft_init_ext(key, (byte*) ft_tmp->ptr(), ft_tmp->length(),
join_key && !no_order);
ft_handler_init(ft_tmp->ptr(), ft_tmp->length(), join_key && !no_order);
if (join_key)
{
@ -2024,7 +2021,6 @@ bool Item_func_match::fix_fields(THD *thd,struct st_table_list *tlist)
return 0;
}
bool Item_func_match::fix_index()
{
List_iterator_fast<Item> li(fields);

View file

@ -863,30 +863,40 @@ public:
uint key;
bool join_key;
Item_func_match *master;
FT_DOCLIST *ft_handler;
void * ft_handler;
Item_func_match(List<Item> &a, Item *b): Item_real_func(b),
fields(a), table(0), join_key(0), master(0), ft_handler(0) {}
~Item_func_match()
{
if (!master)
if (!master && ft_handler)
{
if (ft_handler)
{
ft_close_search(ft_handler);
ft_handler_close();
if(join_key)
table->file->ft_handler=0;
}
}
}
const char *func_name() const { return "match"; }
virtual int ft_handler_init(const byte *key, uint keylen, bool presort)
{ return 1; }
virtual int ft_handler_close() { return 1; }
enum Functype functype() const { return FT_FUNC; }
void update_used_tables() {}
bool fix_fields(THD *thd,struct st_table_list *tlist);
bool eq(const Item *) const;
double val();
longlong val_int() { return val()!=0.0; }
bool fix_index();
void init_search(bool no_order);
};
class Item_func_match_nl :public Item_func_match
{
public:
Item_func_match_nl(List<Item> &a, Item *b): Item_func_match(a,b) {}
const char *func_name() const { return "match_NL"; }
double val();
int ft_handler_init(const byte *query, uint querylen, bool presort)
{ ft_handler=table->file->ft_init_ext(key, query, querylen, presort); }
int ft_handler_close() { ft_close_search(ft_handler); ft_handler=0; }
};

View file

@ -605,7 +605,7 @@ mysql_select(THD *thd,TABLE_LIST *tables,List<Item> &fields,COND *conds,
List_iterator_fast<Item_func_match> li(ftfuncs);
Item_func_match *ifm;
DBUG_PRINT("info",("Performing FULLTEXT search"));
thd->proc_info="FULLTEXT searching";
thd->proc_info="FULLTEXT search init";
while ((ifm=li++))
{
@ -1455,13 +1455,13 @@ add_ft_keys(DYNAMIC_ARRAY *keyuse_array,
functype == Item_func::GT_FUNC) &&
arg0->type() == Item::FUNC_ITEM &&
arg0->functype() == Item_func::FT_FUNC &&
arg1->const_item() && arg1->val()>=0)
arg1->const_item() && arg1->val()>0)
cond_func=(Item_func_match *) arg0;
else if ((functype == Item_func::LE_FUNC ||
functype == Item_func::LT_FUNC) &&
arg1->type() == Item::FUNC_ITEM &&
arg1->functype() == Item_func::FT_FUNC &&
arg0->const_item() && arg0->val()>=0)
arg0->const_item() && arg0->val()>0)
cond_func=(Item_func_match *) arg1;
}
}
@ -1473,7 +1473,7 @@ add_ft_keys(DYNAMIC_ARRAY *keyuse_array,
{
Item *item;
/*
I', (Sergei) too lazy to implement proper recursive descent here,
I, (Sergei) too lazy to implement proper recursive descent here,
and anyway, nobody will use such a stupid queries
that will require it :-)
May be later...

View file

@ -1549,10 +1549,10 @@ simple_expr:
| '{' ident expr '}' { $$= $3; }
| MATCH '(' ident_list ')' AGAINST '(' expr ')'
{ Select->ftfunc_list.push_back(
(Item_func_match *)($$=new Item_func_match(*$3,$7))); }
(Item_func_match *)($$=new Item_func_match_nl(*$3,$7))); }
| MATCH ident_list AGAINST '(' expr ')'
{ Select->ftfunc_list.push_back(
(Item_func_match *)($$=new Item_func_match(*$2,$5))); }
(Item_func_match *)($$=new Item_func_match_nl(*$2,$5))); }
| BINARY expr %prec NEG { $$= new Item_func_binary($2); }
| CASE_SYM opt_expr WHEN_SYM when_list opt_else END
{ $$= new Item_func_case(* $4, $2, $5 ) }