mariadb/storage/innobase/include/fts0tokenize.h

/*****************************************************************************

Copyright (c) 2014, 2015, Oracle and/or its affiliates. All Rights Reserved.
Copyright (c) 2020, MariaDB Corporation.

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; version 2 of the License.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.

You should have received a copy of the GNU General Public License along with
this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA

*****************************************************************************/

/******************************************************************//**
@file include/fts0tokenize.h
Full Text Search plugin tokenizer, modelled on the MyISAM tokenizer

Created 2014/11/17 Shaohua Wang
***********************************************************************/

#include "ft_global.h"
#include "mysql/plugin_ftparser.h"
#include "m_ctype.h"

/* Macros and structs below are from ftdefs.h in MyISAM */
/** Test whether a character counts as part of a word: either its
character-type bits contain any of _MY_U, _MY_L or _MY_NMR, or the
character itself is an underscore.
@param ctype_bits	character-type bit mask of the character
@param byte		the character value itself */
#define true_word_char(ctype_bits, byte) \
	((((ctype_bits) & (_MY_U | _MY_L | _MY_NMR)) != 0) || ((byte) == '_'))

/** Test whether a character is a "misc" word character.
Always evaluates to 0; the argument is ignored. */
#define misc_word_char(byte)    0

/** Boolean search syntax: a string whose characters, selected by position,
are the operator characters that fts_get_word() recognizes. It is
initialized to DEFAULT_FTB_SYNTAX. */
static const char* fts_boolean_syntax = DEFAULT_FTB_SYNTAX;

/* Accessors for the individual operator characters. The notes say what
fts_get_word() does when it meets the character outside a word.
FTB_YES .. FTB_DEC and FTB_NEG are only honoured outside quotes and when
info->prev is a blank. Position 9 of the string is not given a name here. */

/* FTB_YES: sets info->yesno to +1 */
#define FTB_YES   (fts_boolean_syntax[0])
/* FTB_EGAL: sets info->yesno to 0 */
#define FTB_EGAL  (fts_boolean_syntax[1])
/* FTB_NO: sets info->yesno to -1 */
#define FTB_NO    (fts_boolean_syntax[2])
/* FTB_INC: increments info->weight_adjust */
#define FTB_INC   (fts_boolean_syntax[3])
/* FTB_DEC: decrements info->weight_adjust */
#define FTB_DEC   (fts_boolean_syntax[4])
/* FTB_LBR: reported as FT_TOKEN_LEFT_PAREN (outside quotes only) */
#define FTB_LBR   (fts_boolean_syntax[5])
/* FTB_RBR: reported as FT_TOKEN_RIGHT_PAREN (outside quotes only) */
#define FTB_RBR   (fts_boolean_syntax[6])
/* FTB_NEG: toggles info->wasign */
#define FTB_NEG   (fts_boolean_syntax[7])
/* FTB_TRUNC: when it directly follows a word, it is consumed and
info->trunc is set */
#define FTB_TRUNC (fts_boolean_syntax[8])
/* FTB_LQUOT: opens a quote; sets info->quot and is reported as
FT_TOKEN_LEFT_PAREN */
#define FTB_LQUOT (fts_boolean_syntax[10])
/* FTB_RQUOT: inside a quote, reported as FT_TOKEN_RIGHT_PAREN */
#define FTB_RQUOT (fts_boolean_syntax[11])

/** FTS query token, as filled in by fts_get_word(). The word is not copied:
pos points into the buffer that is being tokenized. */
typedef struct st_ft_word {
        uchar* pos;     /*!< pointer to the first byte of the word */
        uint   len;     /*!< word length in bytes */
        double weight;  /*!< word weight, unused in innodb
                        (fts_get_word() never sets it) */
} FT_WORD;

/** Tokenizer for ngram, modelled on ft_get_word() (ft_parser.c) in MyISAM.
Differences: a. code format changed; b. stopword processing removed.

Each call first skips the characters that are not word characters,
applying any boolean-search operators found among them to info, and then
returns the next token: a bracket or quote, or the word that follows.

If the input runs out while non-word characters are being skipped, the
function still reports FT_TOKEN_WORD, with word->len == 0; callers must be
prepared for such an empty word.

@param[in]	cs	charset
@param[in,out]	start	in: position to start scanning from;
			out: position just past the returned token
@param[in]	end	end of the document (one past the last byte)
@param[out]	word	start pointer and byte length of the word; written
			only when FT_TOKEN_WORD is returned, and weight is
			never written
@param[in,out]	info	token info; prev and quot carry state from one call
			to the next, while type, yesno, weight_adjust and
			wasign are set by every call and trunc by every call
			that returns a word
@retval	FT_TOKEN_EOF		end of input reached outside a quote
@retval	FT_TOKEN_WORD		word found (possibly empty, see above)
@retval	FT_TOKEN_LEFT_PAREN	left bracket or opening quote
@retval	FT_TOKEN_RIGHT_PAREN	right bracket, closing quote, or end of
				input reached inside a quote
FT_TOKEN_STOPWORD is never returned, because stopwords are not checked
here. */
inline
uchar
fts_get_word(
	const CHARSET_INFO*	cs,
	uchar**			start,
	uchar*			end,
	FT_WORD*		word,
	MYSQL_FTPARSER_BOOLEAN_INFO*
				info)
{
	uchar*	doc = *start;
	int	ctype;
	uint	mwc;
	int	mbl;

	/* Reset the per-token operator state. yesno defaults to 1 when
	FTB_YES is a blank, otherwise to 1 only inside a quote. */
	info->yesno = (FTB_YES == ' ') ? 1 : (info->quot != 0);
	info->weight_adjust = info->wasign = 0;
	info->type = FT_TOKEN_EOF;

	if (doc >= end) {
		/* Nothing left to scan. Inside a quote this is reported as
		a right bracket; otherwise the FT_TOKEN_EOF set above is
		returned. */
		if (info->quot) {
			*start = doc;
			info->type = FT_TOKEN_RIGHT_PAREN;
		}

		return(info->type);
	}

	/* Skip non-word characters, handling operators on the way.
	The step is the length reported by ctype(): a negative value is
	negated, and 0 is replaced by 1 so that the scan always advances. */
	for (; doc < end;
	     doc += (mbl > 0 ? mbl : (mbl < 0 ? -mbl : 1))) {
		mbl = cs->ctype(&ctype, doc, end);

		if (true_word_char(ctype, *doc)) {
			/* Start of a word. */
			break;
		}

		if (*doc == FTB_RQUOT && info->quot) {
			/* Closing quote. info->quot is left as it is. */
			*start = doc + 1;
			info->type = FT_TOKEN_RIGHT_PAREN;

			return(info->type);
		}

		if (!info->quot) {
			if (*doc == FTB_LBR
			    || *doc == FTB_RBR
			    || *doc == FTB_LQUOT) {
				/* param->prev=' '; */
				*start = doc + 1;
				if (*doc == FTB_LQUOT) {
					/* Only used as a non-NULL flag. */
					info->quot = (char*)1;
				}

				info->type = (*doc == FTB_RBR ?
					       FT_TOKEN_RIGHT_PAREN :
					       FT_TOKEN_LEFT_PAREN);

				return(info->type);
			}

			/* Prefix operators are honoured only right after a
			blank; "continue" keeps the state they set instead
			of resetting it below. */
			if (info->prev == ' ') {
				if (*doc == FTB_YES) {
					info->yesno = +1;
					continue;
				} else if (*doc == FTB_EGAL) {
					info->yesno = 0;
					continue;
				} else if (*doc == FTB_NO) {
					info->yesno = -1;
					continue;
				} else if (*doc == FTB_INC) {
					info->weight_adjust++;
					continue;
				} else if (*doc == FTB_DEC) {
					info->weight_adjust--;
					continue;
				} else if (*doc == FTB_NEG) {
					info->wasign = !info->wasign;
					continue;
				}
			}
		}

		/* Any other character: remember it and drop the operator
		state collected so far. */
		info->prev = char(*doc);
		info->yesno = (FTB_YES == ' ') ? 1 : (info->quot != 0);
		info->weight_adjust = info->wasign = 0;
	}

	/* Collect the word: advance until the first character that ends
	it. mwc counts trailing misc word characters, which are excluded
	from the word length. */
	mwc = 0;
	for (word->pos = doc;
	     doc < end;
	     doc += (mbl > 0 ? mbl : (mbl < 0 ? -mbl : 1))) {
		mbl = cs->ctype(&ctype, doc, end);

		if (true_word_char(ctype, *doc)) {
			mwc = 0;
		} else if (!misc_word_char(*doc) || mwc) {
			break;
		} else {
			mwc++;
		}
	}

	/* Be sure *prev is true_word_char. */
	info->prev = 'A';
	word->len = (uint)(doc - word->pos) - mwc;

	/* A truncation operator directly after the word is consumed. */
	if ((info->trunc = (doc < end && *doc == FTB_TRUNC))) {
		doc++;
	}

	/* We don't check stopword here. */
	*start = doc;
	info->type = FT_TOKEN_WORD;

	return(info->type);
}