mariadb/storage/innobase/include/fts0tokenize.h

189 lines
4.9 KiB
C
Raw Normal View History

/*****************************************************************************
Copyright (c) 2014, 2015, Oracle and/or its affiliates. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; version 2 of the License.
This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
You should have received a copy of the GNU General Public License along with
this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
*****************************************************************************/
/******************************************************************//**
@file include/fts0tokenize.h
Full Text Search plugin tokenizer, adapted from the MyISAM tokenizer
Created 2014/11/17 Shaohua Wang
***********************************************************************/
#include "ft_global.h"
#include "mysql/plugin_ftparser.h"
#include "m_ctype.h"
/* Macros and structs below are from ftdefs.h in MyISAM */
/** Check whether a character is a "true" word character.
@param c  ctype bit mask of the character, as filled in by cs->cset->ctype()
@param ch first byte of the character
Evaluates to non-zero when c has any of the _MY_U, _MY_L or _MY_NMR bits set
(presumably the upper-case, lower-case and numeric classes; see m_ctype.h),
or when ch is an underscore. */
#define true_word_char(c, ch) ((c) & (_MY_U | _MY_L | _MY_NMR) || (ch) == '_')
/** Check whether a character is a "misc" word character.
Always expands to 0 here, i.e. no character is treated as a misc word
character; the argument is ignored. */
#define misc_word_char(X) 0
/** Boolean search syntax: a string of operator characters, initialized from
DEFAULT_FTB_SYNTAX and indexed by the FTB_* macros below. */
static const char* fts_boolean_syntax = DEFAULT_FTB_SYNTAX;
/* Operator characters, described by the effect fts_get_word() applies when
it meets them:
FTB_YES		sets info->yesno to +1
FTB_EGAL	sets info->yesno to 0
FTB_NO		sets info->yesno to -1
FTB_INC		increments info->weight_adjust
FTB_DEC		decrements info->weight_adjust
FTB_LBR		reported as FT_TOKEN_LEFT_PAREN
FTB_RBR		reported as FT_TOKEN_RIGHT_PAREN
FTB_NEG		toggles info->wasign
FTB_TRUNC	directly after a word, sets info->trunc
FTB_LQUOT	opens a quoted phrase (sets info->quot)
FTB_RQUOT	closes a quoted phrase (only honoured while info->quot is set)
Index 9 of the syntax string is not referenced by any macro here. */
#define FTB_YES (fts_boolean_syntax[0])
#define FTB_EGAL (fts_boolean_syntax[1])
#define FTB_NO (fts_boolean_syntax[2])
#define FTB_INC (fts_boolean_syntax[3])
#define FTB_DEC (fts_boolean_syntax[4])
#define FTB_LBR (fts_boolean_syntax[5])
#define FTB_RBR (fts_boolean_syntax[6])
#define FTB_NEG (fts_boolean_syntax[7])
#define FTB_TRUNC (fts_boolean_syntax[8])
#define FTB_LQUOT (fts_boolean_syntax[10])
#define FTB_RQUOT (fts_boolean_syntax[11])
/** FTS query token: describes one word located by fts_get_word().
The word is not copied; pos points into the buffer that was scanned. */
typedef struct st_ft_word {
uchar* pos; /*!< pointer to the first byte of the word */
uint len; /*!< word length in bytes */
double weight; /*!< word weight, unused in innodb */
} FT_WORD;
/** Extract the next token from a boolean full-text search string.
Adapted from ft_get_word() (ft_parser.c) in MyISAM.
Differences: a. code format changed; b. stopword processing removed, so the
"stopword found" result (4) of the MyISAM original is never produced here.

The scan first skips non-word characters, interpreting the boolean operators
from fts_boolean_syntax on the way, and then collects a run of word
characters.

NOTE: no minimum length is enforced. If only non-word characters remain
before end, a word with len == 0 is returned, so callers have to tolerate
empty words.

@param[in]	cs	charset; cs->cset->ctype() classifies each character
@param[in,out]	start	in: position to start scanning from;
			out: position just after the consumed token
@param[in]	end	end of the document (scanning stops when it is reached)
@param[out]	word	pos and len of the word found; weight is not touched
@param[in,out]	info	token info: quot and prev are read and updated;
			yesno, weight_adjust, wasign, trunc and type are set
@retval	FT_TOKEN_EOF (0)		nothing left to scan
@retval	FT_TOKEN_WORD (1)		word found
@retval	FT_TOKEN_LEFT_PAREN (2)		left bracket or opening quote
@retval	FT_TOKEN_RIGHT_PAREN (3)	right bracket, closing quote, or end of
					input reached inside a quote */
inline
uchar
fts_get_word(
	const CHARSET_INFO*	cs,
	uchar**			start,
	uchar*			end,
	FT_WORD*		word,
	MYSQL_FTPARSER_BOOLEAN_INFO*
				info)
{
	uchar*	doc = *start;
	int	ctype;
	uint	mwc;
	int	mbl;

	info->yesno = (FTB_YES ==' ') ? 1 : (info->quot != 0);
	info->weight_adjust = info->wasign = 0;
	info->type = FT_TOKEN_EOF;

	/* Every path through this body returns, so a plain branch is enough
	(the MyISAM original loops here only to skip stopwords). */
	if (doc < end) {
		/* Phase 1: skip non-word characters and apply operators.
		The step is the byte count reported by ctype(): mbl when
		positive, -mbl when negative, and 1 when it is zero. */
		for (; doc < end;
		     doc += (mbl > 0 ? mbl : (mbl < 0 ? -mbl : 1))) {
			mbl = cs->cset->ctype(cs, &ctype, doc, end);

			if (true_word_char(ctype, *doc)) {
				/* Start of a word: go on to phase 2. */
				break;
			}

			if (*doc == FTB_RQUOT && info->quot) {
				/* Closing quote of a phrase. */
				*start = doc + 1;
				info->type = FT_TOKEN_RIGHT_PAREN;
				return(info->type);
			}

			/* Operators are only interpreted outside quotes. */
			if (!info->quot) {
				if (*doc == FTB_LBR
				    || *doc == FTB_RBR
				    || *doc == FTB_LQUOT) {
					/* param->prev=' '; */
					*start = doc + 1;

					if (*doc == FTB_LQUOT) {
						/* Non-NULL marks "inside a
						quoted phrase". */
						info->quot = (char*)1;
					}

					info->type = (*doc == FTB_RBR ?
						      FT_TOKEN_RIGHT_PAREN :
						      FT_TOKEN_LEFT_PAREN);
					return(info->type);
				}

				/* Prefix operators count only while the last
				non-operator character was a space; several
				of them may be chained. */
				if (info->prev == ' ') {
					if (*doc == FTB_YES) {
						info->yesno = +1;
						continue;
					} else if (*doc == FTB_EGAL) {
						info->yesno = 0;
						continue;
					} else if (*doc == FTB_NO) {
						info->yesno = -1;
						continue;
					} else if (*doc == FTB_INC) {
						info->weight_adjust++;
						continue;
					} else if (*doc == FTB_DEC) {
						info->weight_adjust--;
						continue;
					} else if (*doc == FTB_NEG) {
						info->wasign = !info->wasign;
						continue;
					}
				}
			}

			/* Any other character discards the operators
			collected so far. */
			info->prev = *doc;
			info->yesno = (FTB_YES == ' ')
				? 1 : (info->quot != 0);
			info->weight_adjust = info->wasign = 0;
		}

		/* Phase 2: collect the word. mwc counts trailing misc word
		characters, which are excluded from the word length. */
		mwc = 0;

		for (word->pos = doc;
		     doc < end;
		     doc += (mbl > 0 ? mbl : (mbl < 0 ? -mbl : 1))) {
			mbl = cs->cset->ctype(cs, &ctype, doc, end);

			if (true_word_char(ctype, *doc)) {
				mwc = 0;
			} else if (!misc_word_char(*doc) || mwc) {
				break;
			} else {
				mwc++;
			}
		}

		/* Be sure *prev is true_word_char. */
		info->prev = 'A';
		word->len = (uint)(doc-word->pos) - mwc;

		/* A truncation operator right after the word is consumed
		together with it. */
		if ((info->trunc = (doc < end && *doc == FTB_TRUNC))) {
			doc++;
		}

		/* We don't check stopword here. */
		*start = doc;
		info->type = FT_TOKEN_WORD;
		return(info->type);
	}

	/* Input exhausted while still inside a quote: report the implicit
	closing quote; otherwise the FT_TOKEN_EOF set above is returned. */
	if (info->quot) {
		*start = doc;
		info->type = FT_TOKEN_RIGHT_PAREN;
	}

	return(info->type);
}