mirror of
				https://github.com/MariaDB/server.git
				synced 2025-10-31 19:06:14 +01:00 
			
		
		
		
	
		
			
				
	
	
		
			189 lines
		
	
	
	
		
			4.9 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
			
		
		
	
	
			189 lines
		
	
	
	
		
			4.9 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
| /*****************************************************************************
 | |
| 
 | |
| Copyright (c) 2014, 2015, Oracle and/or its affiliates. All Rights Reserved.
 | |
| Copyright (c) 2020, MariaDB Corporation.
 | |
| 
 | |
| This program is free software; you can redistribute it and/or modify it under
 | |
| the terms of the GNU General Public License as published by the Free Software
 | |
| Foundation; version 2 of the License.
 | |
| 
 | |
| This program is distributed in the hope that it will be useful, but WITHOUT
 | |
| ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 | |
| FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 | |
| 
 | |
| You should have received a copy of the GNU General Public License along with
 | |
| this program; if not, write to the Free Software Foundation, Inc.,
 | |
| 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
 | |
| 
 | |
| *****************************************************************************/
 | |
| 
 | |
| /******************************************************************//**
 | |
| @file fts/fts0tokenize.cc
 | |
| Full Text Search plugin tokenizer, modelled on the MyISAM tokenizer
 | |
| 
 | |
| Created 2014/11/17 Shaohua Wang
 | |
| ***********************************************************************/
 | |
| 
 | |
| #include "ft_global.h"
 | |
| #include "mysql/plugin_ftparser.h"
 | |
| #include "m_ctype.h"
 | |
| 
 | |
/* Macros and structs below are from ftdefs.h in MyISAM */

/** Test whether a character counts as part of a word.
@param c	character class bits of the character
@param ch	the character value itself
@return nonzero if any of _MY_U, _MY_L or _MY_NMR is set in c,
or if ch is an underscore */
#define true_word_char(c, ch)	\
	(((c) & (_MY_U | _MY_L | _MY_NMR)) || ((ch) == '_'))

/** Test whether a character is a "misc" word character, i.e. one that
may appear inside a word without being a true word character.
Always evaluates to 0 here: no such characters are recognised. */
#define misc_word_char(X)	(0)
 | |
| 
 | |
| /** Boolean search syntax */
 | |
| static const char* fts_boolean_syntax = DEFAULT_FTB_SYNTAX;
 | |
| 
 | |
| #define FTB_YES   (fts_boolean_syntax[0])
 | |
| #define FTB_EGAL  (fts_boolean_syntax[1])
 | |
| #define FTB_NO    (fts_boolean_syntax[2])
 | |
| #define FTB_INC   (fts_boolean_syntax[3])
 | |
| #define FTB_DEC   (fts_boolean_syntax[4])
 | |
| #define FTB_LBR   (fts_boolean_syntax[5])
 | |
| #define FTB_RBR   (fts_boolean_syntax[6])
 | |
| #define FTB_NEG   (fts_boolean_syntax[7])
 | |
| #define FTB_TRUNC (fts_boolean_syntax[8])
 | |
| #define FTB_LQUOT (fts_boolean_syntax[10])
 | |
| #define FTB_RQUOT (fts_boolean_syntax[11])
 | |
| 
 | |
| /** FTS query token */
 | |
| typedef struct st_ft_word {
 | |
|         uchar* pos;     /*!< word start pointer */
 | |
|         uint   len;     /*!< word len */
 | |
|         double weight;  /*!< word weight, unused in innodb */
 | |
| } FT_WORD;
 | |
| 
 | |
| /** Tokenizer for ngram referring to ft_get_word(ft_parser.c) in MyISAM.
 | |
| Differences: a. code format changed; b. stopword processing removed.
 | |
| @param[in]	cs	charset
 | |
| @param[in,out]	start	doc start pointer
 | |
| @param[in,out]	end	doc end pointer
 | |
| @param[in,out]	word	token
 | |
| @param[in,out]	info	token info
 | |
| @retval	0	eof
 | |
| @retval	1	word found
 | |
| @retval	2	left bracket
 | |
| @retval	3	right bracket
 | |
| @retval	4	stopword found */
 | |
| inline
 | |
| uchar
 | |
| fts_get_word(
 | |
| 	const CHARSET_INFO*	cs,
 | |
| 	uchar**			start,
 | |
| 	uchar*			end,
 | |
| 	FT_WORD*		word,
 | |
| 	MYSQL_FTPARSER_BOOLEAN_INFO*
 | |
| 				info)
 | |
| {
 | |
| 	uchar*	doc = *start;
 | |
| 	int	ctype;
 | |
| 	uint	mwc;
 | |
| 	uint	length;
 | |
| 	int	mbl;
 | |
| 
 | |
| 	info->yesno = (FTB_YES ==' ') ? 1 : (info->quot != 0);
 | |
| 	info->weight_adjust = info->wasign = 0;
 | |
| 	info->type = FT_TOKEN_EOF;
 | |
| 
 | |
| 	while (doc < end) {
 | |
| 		for (; doc < end;
 | |
| 		     doc += (mbl > 0 ? mbl : (mbl < 0 ? -mbl : 1))) {
 | |
| 			mbl = cs->ctype(&ctype, doc, end);
 | |
| 
 | |
| 			if (true_word_char(ctype, *doc)) {
 | |
| 				break;
 | |
| 			}
 | |
| 
 | |
| 			if (*doc == FTB_RQUOT && info->quot) {
 | |
| 				*start = doc + 1;
 | |
| 				info->type = FT_TOKEN_RIGHT_PAREN;
 | |
| 
 | |
| 				return(info->type);
 | |
| 			}
 | |
| 
 | |
| 			if (!info->quot) {
 | |
| 				if (*doc == FTB_LBR
 | |
| 				    || *doc == FTB_RBR
 | |
| 				    || *doc == FTB_LQUOT) {
 | |
| 					/* param->prev=' '; */
 | |
| 					*start = doc + 1;
 | |
| 					if (*doc == FTB_LQUOT) {
 | |
| 						info->quot = (char*)1;
 | |
| 					}
 | |
| 
 | |
| 					info->type = (*doc == FTB_RBR ?
 | |
| 						       FT_TOKEN_RIGHT_PAREN :
 | |
| 						       FT_TOKEN_LEFT_PAREN);
 | |
| 
 | |
| 					return(info->type);
 | |
| 				}
 | |
| 
 | |
| 				if (info->prev == ' ') {
 | |
| 					if (*doc == FTB_YES) {
 | |
| 						info->yesno = +1;
 | |
| 						continue;
 | |
| 					} else if (*doc == FTB_EGAL) {
 | |
| 						info->yesno = 0;
 | |
| 						continue;
 | |
| 					} else if (*doc == FTB_NO) {
 | |
| 						info->yesno = -1;
 | |
| 						continue;
 | |
| 					} else if (*doc == FTB_INC) {
 | |
| 						info->weight_adjust++;
 | |
| 						continue;
 | |
| 					} else if (*doc == FTB_DEC) {
 | |
| 						info->weight_adjust--;
 | |
| 						continue;
 | |
| 					} else if (*doc == FTB_NEG) {
 | |
| 						info->wasign = !info->wasign;
 | |
| 						continue;
 | |
| 					}
 | |
| 				}
 | |
| 			}
 | |
| 
 | |
| 			info->prev = char(*doc);
 | |
| 			info->yesno = (FTB_YES == ' ') ? 1 : (info->quot != 0);
 | |
| 			info->weight_adjust = info->wasign = 0;
 | |
| 		}
 | |
| 
 | |
| 		mwc = length = 0;
 | |
| 		for (word->pos = doc;
 | |
| 		     doc < end;
 | |
| 		     length++, doc += (mbl > 0 ? mbl : (mbl < 0 ? -mbl : 1))) {
 | |
| 			mbl = cs->ctype(&ctype, doc, end);
 | |
| 
 | |
| 			if (true_word_char(ctype, *doc)) {
 | |
| 				mwc = 0;
 | |
| 			} else if (!misc_word_char(*doc) || mwc) {
 | |
| 				break;
 | |
| 			} else {
 | |
| 				mwc++;
 | |
| 			}
 | |
| 		}
 | |
| 
 | |
| 		/* Be sure *prev is true_word_char. */
 | |
| 		info->prev = 'A';
 | |
| 		word->len = (uint)(doc-word->pos) - mwc;
 | |
| 
 | |
| 		if ((info->trunc = (doc < end && *doc == FTB_TRUNC))) {
 | |
| 			doc++;
 | |
| 		}
 | |
| 
 | |
| 		/* We don't check stopword here. */
 | |
| 		*start = doc;
 | |
| 		info->type = FT_TOKEN_WORD;
 | |
| 
 | |
| 		return(info->type);
 | |
| 	}
 | |
| 
 | |
| 	if (info->quot) {
 | |
| 		*start = doc;
 | |
| 		info->type = FT_TOKEN_RIGHT_PAREN;
 | |
| 	}
 | |
| 
 | |
| 	return(info->type);
 | |
| }
 | 
