/* -*- c-basic-offset: 2 -*- */
/*
  Copyright(C) 2012-2014 Brazil

  This library is free software; you can redistribute it and/or
  modify it under the terms of the GNU Lesser General Public
  License version 2.1 as published by the Free Software Foundation.

  This library is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  Lesser General Public License for more details.

  You should have received a copy of the GNU Lesser General Public
  License along with this library; if not, write to the Free Software
  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1335  USA
*/

#include "grn.h"
#include <groonga/tokenizer.h>

#include <string.h>

#include "grn_ctx.h"
#include "grn_db.h"
#include "grn_str.h"
#include "grn_string.h"
#include "grn_token_cursor.h"

/*
  Just for backward compatibility. See grn_plugin_charlen() instead.
 */
int
grn_tokenizer_charlen(grn_ctx *ctx, const char *str_ptr,
                      unsigned int str_length, grn_encoding encoding)
{
  return grn_plugin_charlen(ctx, str_ptr, str_length, encoding);
}

/*
  Just for backward compatibility. See grn_plugin_isspace() instead.
 */
int
grn_tokenizer_isspace(grn_ctx *ctx, const char *str_ptr,
                      unsigned int str_length, grn_encoding encoding)
{
  return grn_plugin_isspace(ctx, str_ptr, str_length, encoding);
}

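/*
  Returns GRN_TRUE if the str_length bytes at str_ptr are exactly one
  tokenized delimiter character (U+FFFE encoded in UTF-8). The
  delimiter is only defined for UTF-8, so any other encoding yields
  GRN_FALSE.
 */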
grn_bool
grn_tokenizer_is_tokenized_delimiter(grn_ctx *ctx,
                                     const char *str_ptr,
                                     unsigned int str_length,
                                     grn_encoding encoding)
{
  if (encoding != GRN_ENC_UTF8) {
    return GRN_FALSE;
  }

  if (str_length != GRN_TOKENIZER_TOKENIZED_DELIMITER_UTF8_LEN) {
    return GRN_FALSE;
  }

  return memcmp(str_ptr,
                GRN_TOKENIZER_TOKENIZED_DELIMITER_UTF8,
                GRN_TOKENIZER_TOKENIZED_DELIMITER_UTF8_LEN) == 0;
}

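/*
  Scans the string character by character and returns GRN_TRUE as soon
  as one tokenized delimiter is found. As above, only UTF-8 input can
  contain the delimiter.
 */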
grn_bool
grn_tokenizer_have_tokenized_delimiter(grn_ctx *ctx,
                                       const char *str_ptr,
                                       unsigned int str_length,
                                       grn_encoding encoding)
{
  int char_length;
  const char *current = str_ptr;
  const char *end = str_ptr + str_length;

  if (encoding != GRN_ENC_UTF8) {
    return GRN_FALSE;
  }

  if (str_length == 0) {
    return GRN_FALSE;
  }

  while ((char_length = grn_charlen_(ctx, current, end, encoding)) > 0) {
    if (grn_tokenizer_is_tokenized_delimiter(ctx,
                                             current, char_length,
                                             encoding)) {
      return GRN_TRUE;
    }
    current += char_length;
  }
  return GRN_FALSE;
}

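/*
  Builds a grn_tokenizer_query from the arguments that the tokenizer
  protocol pushes onto the context stack: the flags, the query string
  and the tokenize mode are popped from `ctx' (in that order), the
  query string is duplicated into a NUL-terminated buffer, and it is
  normalized with the normalizer of the lexicon table passed in
  args[0]. The caller owns the result and must release it with
  grn_tokenizer_query_close().
 */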
grn_tokenizer_query *
grn_tokenizer_query_open(grn_ctx *ctx, int num_args, grn_obj **args,
                         unsigned int normalize_flags)
{
  grn_obj *flags = grn_ctx_pop(ctx);
  grn_obj *query_str = grn_ctx_pop(ctx);
  grn_obj *tokenize_mode = grn_ctx_pop(ctx);

  if (query_str == NULL) {
    GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT, "missing argument");
    return NULL;
  }

  if ((num_args < 1) || (args == NULL) || (args[0] == NULL)) {
    GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT, "invalid NULL pointer");
    return NULL;
  }

  {
    grn_tokenizer_query * const query =
        GRN_PLUGIN_MALLOC(ctx, sizeof(grn_tokenizer_query));
    if (query == NULL) {
      return NULL;
    }
    query->normalized_query = NULL;
    query->query_buf = NULL;
    if (flags) {
      query->flags = GRN_UINT32_VALUE(flags);
    } else {
      query->flags = 0;
    }
    if (tokenize_mode) {
      query->tokenize_mode = GRN_UINT32_VALUE(tokenize_mode);
    } else {
      query->tokenize_mode = GRN_TOKENIZE_ADD;
    }
    query->token_mode = query->tokenize_mode;

    {
      grn_obj * const table = args[0];
      grn_table_flags table_flags;
      grn_encoding table_encoding;
      unsigned int query_length = GRN_TEXT_LEN(query_str);
      char *query_buf = (char *)GRN_PLUGIN_MALLOC(ctx, query_length + 1);
      grn_obj *normalizer = NULL;

      if (query_buf == NULL) {
        GRN_PLUGIN_FREE(ctx, query);
        GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
                         "[tokenizer] failed to duplicate query");
        return NULL;
      }
      grn_table_get_info(ctx, table, &table_flags, &table_encoding, NULL,
                         &normalizer, NULL);
      {
        grn_obj *normalized_query;
        if (table_flags & GRN_OBJ_KEY_NORMALIZE) {
          normalizer = GRN_NORMALIZER_AUTO;
        }
        normalized_query = grn_string_open_(ctx,
                                            GRN_TEXT_VALUE(query_str),
                                            GRN_TEXT_LEN(query_str),
                                            normalizer,
                                            normalize_flags,
                                            table_encoding);
        if (!normalized_query) {
          GRN_PLUGIN_FREE(ctx, query_buf);
          GRN_PLUGIN_FREE(ctx, query);
          GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
                           "[tokenizer] failed to open normalized string");
          return NULL;
        }
        query->normalized_query = normalized_query;
        grn_memcpy(query_buf, GRN_TEXT_VALUE(query_str), query_length);
        query_buf[query_length] = '\0';
        query->query_buf = query_buf;
        query->ptr = query_buf;
        query->length = query_length;
      }
      query->encoding = table_encoding;

      if (query->flags & GRN_TOKEN_CURSOR_ENABLE_TOKENIZED_DELIMITER) {
        const char *normalized_string;
        unsigned int normalized_string_length;

        grn_string_get_normalized(ctx,
                                  query->normalized_query,
                                  &normalized_string,
                                  &normalized_string_length,
                                  NULL);
        query->have_tokenized_delimiter =
          grn_tokenizer_have_tokenized_delimiter(ctx,
                                                 normalized_string,
                                                 normalized_string_length,
                                                 query->encoding);
      } else {
        query->have_tokenized_delimiter = GRN_FALSE;
      }
    }
    return query;
  }
}

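/*
  Just for backward compatibility. See grn_tokenizer_query_open()
  instead.
 */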
grn_tokenizer_query *
grn_tokenizer_query_create(grn_ctx *ctx, int num_args, grn_obj **args)
{
  return grn_tokenizer_query_open(ctx, num_args, args, 0);
}

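/*
  Releases a query created by grn_tokenizer_query_open(): the
  normalized string, the duplicated query buffer and the query object
  itself. Passing NULL is a no-op.
 */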
void
grn_tokenizer_query_close(grn_ctx *ctx, grn_tokenizer_query *query)
{
  if (query != NULL) {
    if (query->normalized_query != NULL) {
      grn_obj_unlink(ctx, query->normalized_query);
    }
    if (query->query_buf != NULL) {
      GRN_PLUGIN_FREE(ctx, query->query_buf);
    }
    GRN_PLUGIN_FREE(ctx, query);
  }
}

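/*
  Just for backward compatibility. See grn_tokenizer_query_close()
  instead.
 */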
void
grn_tokenizer_query_destroy(grn_ctx *ctx, grn_tokenizer_query *query)
{
  grn_tokenizer_query_close(ctx, query);
}

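/*
  Initializes the per-tokenizer output buffers: `str' as a
  shallow-copy text object and `status' as a 32bit unsigned integer.
  Pair with grn_tokenizer_token_fin().
 */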
void
grn_tokenizer_token_init(grn_ctx *ctx, grn_tokenizer_token *token)
{
  GRN_TEXT_INIT(&token->str, GRN_OBJ_DO_SHALLOW_COPY);
  GRN_UINT32_INIT(&token->status, 0);
}

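/*
  Finalizes the buffers initialized by grn_tokenizer_token_init().
 */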
void
grn_tokenizer_token_fin(grn_ctx *ctx, grn_tokenizer_token *token)
{
  GRN_OBJ_FIN(ctx, &(token->str));
  GRN_OBJ_FIN(ctx, &(token->status));
}

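/*
  Emits one token from a tokenizer's `next' callback: `str' keeps a
  shallow reference to str_ptr (the bytes are not copied, so they must
  stay valid until the token is consumed), and both the text and the
  status are pushed onto the context stack, which is how tokenizers
  return their results.
 */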
void
grn_tokenizer_token_push(grn_ctx *ctx, grn_tokenizer_token *token,
                         const char *str_ptr, unsigned int str_length,
                         grn_token_status status)
{
  GRN_TEXT_SET_REF(&token->str, str_ptr, str_length);
  GRN_UINT32_SET(ctx, &token->status, status);
  grn_ctx_push(ctx, &token->str);
  grn_ctx_push(ctx, &token->status);
}

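/*
  Pushes the next token of a pre-tokenized string: the token runs from
  str_ptr up to (but not including) the next tokenized delimiter.
  Returns a pointer just past that delimiter, to be passed as str_ptr
  on the next call, or NULL when the rest of the string contains no
  delimiter (when the scan reaches the end of the string, the token is
  pushed with GRN_TOKENIZER_LAST).
 */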
const char *
grn_tokenizer_tokenized_delimiter_next(grn_ctx *ctx,
                                       grn_tokenizer_token *token,
                                       const char *str_ptr,
                                       unsigned int str_length,
                                       grn_encoding encoding)
{
  size_t char_length = 0;
  const char *start = str_ptr;
  const char *current;
  const char *end = str_ptr + str_length;
  const char *next_start = NULL;
  unsigned int token_length;
  grn_token_status status;

  for (current = start; current < end; current += char_length) {
    char_length = grn_charlen_(ctx, current, end, encoding);
    if (char_length == 0) {
      break;
    }
    if (grn_tokenizer_is_tokenized_delimiter(ctx, current, char_length,
                                             encoding)) {
      next_start = str_ptr + (current - start + char_length);
      break;
    }
  }

  token_length = current - start;
  if (current == end) {
    status = GRN_TOKENIZER_LAST;
  } else {
    status = GRN_TOKENIZER_CONTINUE;
  }
  grn_tokenizer_token_push(ctx, token, start, token_length, status);

  return next_start;
}

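/*
  Registers a tokenizer under the given name by creating a
  GRN_PROC_TOKENIZER proc with the three standard variables (mode,
  string and flags) that grn_tokenizer_query_open() expects on the
  context stack.

  A minimal sketch of a plugin entry point built on this function (the
  example_* callback names are hypothetical; each is a grn_proc_func
  that typically pairs grn_tokenizer_query_open() with
  grn_tokenizer_token_push()):

    grn_rc
    GRN_PLUGIN_REGISTER(grn_ctx *ctx)
    {
      return grn_tokenizer_register(ctx,
                                    "TokenExample",
                                    strlen("TokenExample"),
                                    example_init,
                                    example_next,
                                    example_fin);
    }
 */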
grn_rc
grn_tokenizer_register(grn_ctx *ctx, const char *plugin_name_ptr,
                       unsigned int plugin_name_length,
                       grn_proc_func *init, grn_proc_func *next,
                       grn_proc_func *fin)
{
  grn_expr_var vars[3];
  vars[0].name = NULL;
  vars[0].name_size = 0;
  vars[1].name = NULL;
  vars[1].name_size = 0;
  vars[2].name = NULL;
  vars[2].name_size = 0;
  GRN_TEXT_INIT(&vars[0].value, 0);
  GRN_TEXT_INIT(&vars[1].value, 0);
  GRN_UINT32_INIT(&vars[2].value, 0);

  {
    /*
      grn_proc_create() registers the tokenizer in the database associated
      with `ctx'. The returned object must not be finalized here.
     */
    grn_obj * const obj = grn_proc_create(ctx, plugin_name_ptr,
                                          plugin_name_length,
                                          GRN_PROC_TOKENIZER,
                                          init, next, fin, 3, vars);
    if (obj == NULL) {
      GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR, "grn_proc_create() failed");
      return ctx->rc;
    }
  }
  return GRN_SUCCESS;
}

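/*
  The accessors below operate on grn_token objects directly (rather
  than on the context stack); token filters use them to inspect and
  rewrite tokens. grn_token_get_data() returns the token's text as a
  grn_obj, or NULL if `token' is NULL.
 */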
grn_obj *
grn_token_get_data(grn_ctx *ctx, grn_token *token)
{
  GRN_API_ENTER;
  if (!token) {
    ERR(GRN_INVALID_ARGUMENT, "token must not be NULL");
    GRN_API_RETURN(NULL);
  }
  GRN_API_RETURN(&(token->data));
}

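/*
  Copies str_length bytes from str_ptr into the token's data buffer.
  A str_length of -1 means str_ptr is NUL-terminated and its length is
  computed with strlen().
 */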
grn_rc
grn_token_set_data(grn_ctx *ctx,
                   grn_token *token,
                   const char *str_ptr,
                   int str_length)
{
  GRN_API_ENTER;
  if (!token) {
    ERR(GRN_INVALID_ARGUMENT, "token must not be NULL");
    goto exit;
  }
  if (str_length == -1) {
    str_length = strlen(str_ptr);
  }
  GRN_TEXT_SET(ctx, &(token->data), str_ptr, str_length);
exit:
  GRN_API_RETURN(ctx->rc);
}

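/*
  The token status tells the token cursor whether more tokens follow
  (e.g. GRN_TOKEN_CONTINUE). On a NULL token, grn_token_get_status()
  reports an error and returns GRN_TOKEN_CONTINUE.
 */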
grn_token_status
grn_token_get_status(grn_ctx *ctx, grn_token *token)
{
  GRN_API_ENTER;
  if (!token) {
    ERR(GRN_INVALID_ARGUMENT, "token must not be NULL");
    GRN_API_RETURN(GRN_TOKEN_CONTINUE);
  }
  GRN_API_RETURN(token->status);
}

grn_rc
grn_token_set_status(grn_ctx *ctx,
                     grn_token *token,
                     grn_token_status status)
{
  GRN_API_ENTER;
  if (!token) {
    ERR(GRN_INVALID_ARGUMENT, "token must not be NULL");
    goto exit;
  }
  token->status = status;
exit:
  GRN_API_RETURN(ctx->rc);
}