/*
 * Mirror of https://github.com/MariaDB/server.git
 * (synced 2025-01-18 13:02:28 +01:00; 413 lines, 11 KiB, C)
 */
/* -*- c-basic-offset: 2 -*- */
/*
  Copyright(C) 2009-2012 Brazil

  This library is free software; you can redistribute it and/or
  modify it under the terms of the GNU Lesser General Public
  License version 2.1 as published by the Free Software Foundation.

  This library is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  Lesser General Public License for more details.

  You should have received a copy of the GNU Lesser General Public
  License along with this library; if not, write to the Free Software
  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301  USA
*/
|
|
|
|
#include "grn.h"
|
|
#include <string.h>
|
|
#include "grn_string.h"
|
|
#include "grn_normalizer.h"
|
|
#include "grn_str.h"
|
|
#include "grn_util.h"
|
|
|
|
#include <groonga/tokenizer.h>
|
|
|
|
static grn_string *
|
|
grn_fake_string_open(grn_ctx *ctx, grn_string *string)
|
|
{
|
|
/* TODO: support GRN_STRING_REMOVE_BLANK flag and ctypes */
|
|
grn_string *nstr = string;
|
|
const char *str;
|
|
unsigned int str_len;
|
|
|
|
str = nstr->original;
|
|
str_len = nstr->original_length_in_bytes;
|
|
|
|
if (!(nstr->normalized = GRN_MALLOC(str_len + 1))) {
|
|
ERR(GRN_NO_MEMORY_AVAILABLE,
|
|
"[strinig][fake] failed to allocate normalized text space");
|
|
grn_string_close(ctx, (grn_obj *)nstr);
|
|
return NULL;
|
|
}
|
|
|
|
if (nstr->flags & GRN_STRING_REMOVE_TOKENIZED_DELIMITER &&
|
|
ctx->encoding == GRN_ENC_UTF8) {
|
|
int char_length;
|
|
const char *source_current = str;
|
|
const char *source_end = str + str_len;
|
|
char *destination = nstr->normalized;
|
|
unsigned int destination_length = 0;
|
|
while ((char_length = grn_charlen(ctx, source_current, source_end)) > 0) {
|
|
if (!grn_tokenizer_is_tokenized_delimiter(ctx,
|
|
source_current, char_length,
|
|
ctx->encoding)) {
|
|
grn_memcpy(destination, source_current, char_length);
|
|
destination += char_length;
|
|
destination_length += char_length;
|
|
}
|
|
source_current += char_length;
|
|
}
|
|
nstr->normalized[destination_length] = '\0';
|
|
nstr->normalized_length_in_bytes = destination_length;
|
|
} else {
|
|
grn_memcpy(nstr->normalized, str, str_len);
|
|
nstr->normalized[str_len] = '\0';
|
|
nstr->normalized_length_in_bytes = str_len;
|
|
}
|
|
|
|
if (nstr->flags & GRN_STRING_WITH_CHECKS) {
|
|
int16_t f = 0;
|
|
unsigned char c;
|
|
size_t i;
|
|
if (!(nstr->checks = (int16_t *) GRN_MALLOC(sizeof(int16_t) * str_len))) {
|
|
grn_string_close(ctx, (grn_obj *)nstr);
|
|
ERR(GRN_NO_MEMORY_AVAILABLE,
|
|
"[strinig][fake] failed to allocate checks space");
|
|
return NULL;
|
|
}
|
|
switch (nstr->encoding) {
|
|
case GRN_ENC_EUC_JP:
|
|
for (i = 0; i < str_len; i++) {
|
|
if (!f) {
|
|
c = (unsigned char) str[i];
|
|
f = ((c >= 0xa1U && c <= 0xfeU) || c == 0x8eU ? 2 : (c == 0x8fU ? 3 : 1)
|
|
);
|
|
nstr->checks[i] = f;
|
|
} else {
|
|
nstr->checks[i] = 0;
|
|
}
|
|
f--;
|
|
}
|
|
break;
|
|
case GRN_ENC_SJIS:
|
|
for (i = 0; i < str_len; i++) {
|
|
if (!f) {
|
|
c = (unsigned char) str[i];
|
|
f = (c >= 0x81U && ((c <= 0x9fU) || (c >= 0xe0U && c <= 0xfcU)) ? 2 : 1);
|
|
nstr->checks[i] = f;
|
|
} else {
|
|
nstr->checks[i] = 0;
|
|
}
|
|
f--;
|
|
}
|
|
break;
|
|
case GRN_ENC_UTF8:
|
|
for (i = 0; i < str_len; i++) {
|
|
if (!f) {
|
|
c = (unsigned char) str[i];
|
|
f = (c & 0x80U ? (c & 0x20U ? (c & 0x10U ? 4 : 3)
|
|
: 2)
|
|
: 1);
|
|
nstr->checks[i] = f;
|
|
} else {
|
|
nstr->checks[i] = 0;
|
|
}
|
|
f--;
|
|
}
|
|
break;
|
|
default:
|
|
for (i = 0; i < str_len; i++) {
|
|
nstr->checks[i] = 1;
|
|
}
|
|
break;
|
|
}
|
|
}
|
|
return nstr;
|
|
}
|
|
|
|
grn_obj *
|
|
grn_string_open_(grn_ctx *ctx, const char *str, unsigned int str_len,
|
|
grn_obj *normalizer, int flags, grn_encoding encoding)
|
|
{
|
|
grn_string *string;
|
|
grn_obj *obj;
|
|
grn_bool is_normalizer_auto;
|
|
|
|
if (!str || !str_len) {
|
|
return NULL;
|
|
}
|
|
|
|
is_normalizer_auto = (normalizer == GRN_NORMALIZER_AUTO);
|
|
if (is_normalizer_auto) {
|
|
normalizer = grn_ctx_get(ctx, GRN_NORMALIZER_AUTO_NAME, -1);
|
|
if (!normalizer) {
|
|
ERR(GRN_INVALID_ARGUMENT,
|
|
"[string][open] NormalizerAuto normalizer isn't available");
|
|
return NULL;
|
|
}
|
|
}
|
|
|
|
string = GRN_MALLOCN(grn_string, 1);
|
|
if (!string) {
|
|
if (is_normalizer_auto) {
|
|
grn_obj_unlink(ctx, normalizer);
|
|
}
|
|
GRN_LOG(ctx, GRN_LOG_ALERT,
|
|
"[string][open] failed to allocate memory");
|
|
return NULL;
|
|
}
|
|
|
|
obj = (grn_obj *)string;
|
|
GRN_OBJ_INIT(obj, GRN_STRING, GRN_OBJ_ALLOCATED, GRN_ID_NIL);
|
|
string->original = str;
|
|
string->original_length_in_bytes = str_len;
|
|
string->normalized = NULL;
|
|
string->normalized_length_in_bytes = 0;
|
|
string->n_characters = 0;
|
|
string->checks = NULL;
|
|
string->ctypes = NULL;
|
|
string->encoding = encoding;
|
|
string->flags = flags;
|
|
|
|
if (!normalizer) {
|
|
return (grn_obj *)grn_fake_string_open(ctx, string);
|
|
}
|
|
|
|
grn_normalizer_normalize(ctx, normalizer, (grn_obj *)string);
|
|
if (ctx->rc) {
|
|
grn_obj_close(ctx, obj);
|
|
obj = NULL;
|
|
}
|
|
|
|
if (is_normalizer_auto) {
|
|
grn_obj_unlink(ctx, normalizer);
|
|
}
|
|
|
|
return obj;
|
|
}
|
|
|
|
grn_obj *
|
|
grn_string_open(grn_ctx *ctx, const char *str, unsigned int str_len,
|
|
grn_obj *normalizer, int flags)
|
|
{
|
|
return grn_string_open_(ctx, str, str_len, normalizer, flags, ctx->encoding);
|
|
}
|
|
|
|
grn_rc
|
|
grn_string_get_original(grn_ctx *ctx, grn_obj *string,
|
|
const char **original,
|
|
unsigned int *length_in_bytes)
|
|
{
|
|
grn_rc rc;
|
|
grn_string *string_ = (grn_string *)string;
|
|
GRN_API_ENTER;
|
|
if (string_) {
|
|
if (original) { *original = string_->original; }
|
|
if (length_in_bytes) {
|
|
*length_in_bytes = string_->original_length_in_bytes;
|
|
}
|
|
rc = GRN_SUCCESS;
|
|
} else {
|
|
rc = GRN_INVALID_ARGUMENT;
|
|
}
|
|
GRN_API_RETURN(rc);
|
|
}
|
|
|
|
int
|
|
grn_string_get_flags(grn_ctx *ctx, grn_obj *string)
|
|
{
|
|
int flags = 0;
|
|
grn_string *string_ = (grn_string *)string;
|
|
GRN_API_ENTER;
|
|
if (string_) {
|
|
flags = string_->flags;
|
|
}
|
|
GRN_API_RETURN(flags);
|
|
}
|
|
|
|
grn_rc
|
|
grn_string_get_normalized(grn_ctx *ctx, grn_obj *string,
|
|
const char **normalized,
|
|
unsigned int *length_in_bytes,
|
|
unsigned int *n_characters)
|
|
{
|
|
grn_rc rc;
|
|
grn_string *string_ = (grn_string *)string;
|
|
GRN_API_ENTER;
|
|
if (string_) {
|
|
if (normalized) { *normalized = string_->normalized; }
|
|
if (length_in_bytes) {
|
|
*length_in_bytes = string_->normalized_length_in_bytes;
|
|
}
|
|
if (n_characters) { *n_characters = string_->n_characters; }
|
|
rc = GRN_SUCCESS;
|
|
} else {
|
|
rc = GRN_INVALID_ARGUMENT;
|
|
}
|
|
GRN_API_RETURN(rc);
|
|
}
|
|
|
|
grn_rc
|
|
grn_string_set_normalized(grn_ctx *ctx, grn_obj *string,
|
|
char *normalized, unsigned int length_in_bytes,
|
|
unsigned int n_characters)
|
|
{
|
|
grn_rc rc;
|
|
grn_string *string_ = (grn_string *)string;
|
|
GRN_API_ENTER;
|
|
if (string_) {
|
|
if (string_->normalized) { GRN_FREE(string_->normalized); }
|
|
string_->normalized = normalized;
|
|
string_->normalized_length_in_bytes = length_in_bytes;
|
|
string_->n_characters = n_characters;
|
|
rc = GRN_SUCCESS;
|
|
} else {
|
|
rc = GRN_INVALID_ARGUMENT;
|
|
}
|
|
GRN_API_RETURN(rc);
|
|
}
|
|
|
|
const short *
|
|
grn_string_get_checks(grn_ctx *ctx, grn_obj *string)
|
|
{
|
|
int16_t *checks = NULL;
|
|
grn_string *string_ = (grn_string *)string;
|
|
GRN_API_ENTER;
|
|
if (string_) {
|
|
checks = string_->checks;
|
|
} else {
|
|
checks = NULL;
|
|
}
|
|
GRN_API_RETURN(checks);
|
|
}
|
|
|
|
grn_rc
|
|
grn_string_set_checks(grn_ctx *ctx, grn_obj *string, short *checks)
|
|
{
|
|
grn_rc rc;
|
|
grn_string *string_ = (grn_string *)string;
|
|
GRN_API_ENTER;
|
|
if (string_) {
|
|
if (string_->checks) { GRN_FREE(string_->checks); }
|
|
string_->checks = checks;
|
|
rc = GRN_SUCCESS;
|
|
} else {
|
|
rc = GRN_INVALID_ARGUMENT;
|
|
}
|
|
GRN_API_RETURN(rc);
|
|
}
|
|
|
|
const unsigned char *
|
|
grn_string_get_types(grn_ctx *ctx, grn_obj *string)
|
|
{
|
|
unsigned char *types = NULL;
|
|
grn_string *string_ = (grn_string *)string;
|
|
GRN_API_ENTER;
|
|
if (string_) {
|
|
types = string_->ctypes;
|
|
} else {
|
|
types = NULL;
|
|
}
|
|
GRN_API_RETURN(types);
|
|
}
|
|
|
|
grn_rc
|
|
grn_string_set_types(grn_ctx *ctx, grn_obj *string, unsigned char *types)
|
|
{
|
|
grn_rc rc;
|
|
grn_string *string_ = (grn_string *)string;
|
|
GRN_API_ENTER;
|
|
if (string_) {
|
|
if (string_->ctypes) { GRN_FREE(string_->ctypes); }
|
|
string_->ctypes = types;
|
|
rc = GRN_SUCCESS;
|
|
} else {
|
|
rc = GRN_INVALID_ARGUMENT;
|
|
}
|
|
GRN_API_RETURN(rc);
|
|
}
|
|
|
|
grn_encoding
|
|
grn_string_get_encoding(grn_ctx *ctx, grn_obj *string)
|
|
{
|
|
grn_encoding encoding = GRN_ENC_NONE;
|
|
grn_string *string_ = (grn_string *)string;
|
|
GRN_API_ENTER;
|
|
if (string_) {
|
|
encoding = string_->encoding;
|
|
}
|
|
GRN_API_RETURN(encoding);
|
|
}
|
|
|
|
grn_rc
|
|
grn_string_inspect(grn_ctx *ctx, grn_obj *buffer, grn_obj *string)
|
|
{
|
|
grn_string *string_ = (grn_string *)string;
|
|
|
|
GRN_TEXT_PUTS(ctx, buffer, "#<string:");
|
|
|
|
GRN_TEXT_PUTS(ctx, buffer, " original:<");
|
|
GRN_TEXT_PUT(ctx, buffer,
|
|
string_->original,
|
|
string_->original_length_in_bytes);
|
|
GRN_TEXT_PUTS(ctx, buffer, ">");
|
|
GRN_TEXT_PUTS(ctx, buffer, "(");
|
|
grn_text_itoa(ctx, buffer, string_->original_length_in_bytes);
|
|
GRN_TEXT_PUTS(ctx, buffer, ")");
|
|
|
|
GRN_TEXT_PUTS(ctx, buffer, " normalized:<");
|
|
GRN_TEXT_PUT(ctx, buffer,
|
|
string_->normalized,
|
|
string_->normalized_length_in_bytes);
|
|
GRN_TEXT_PUTS(ctx, buffer, ">");
|
|
GRN_TEXT_PUTS(ctx, buffer, "(");
|
|
grn_text_itoa(ctx, buffer, string_->normalized_length_in_bytes);
|
|
GRN_TEXT_PUTS(ctx, buffer, ")");
|
|
|
|
GRN_TEXT_PUTS(ctx, buffer, " n_characters:");
|
|
grn_text_itoa(ctx, buffer, string_->n_characters);
|
|
|
|
GRN_TEXT_PUTS(ctx, buffer, " encoding:");
|
|
grn_inspect_encoding(ctx, buffer, string_->encoding);
|
|
|
|
GRN_TEXT_PUTS(ctx, buffer, " flags:");
|
|
if (string_->flags & GRN_STRING_REMOVE_BLANK) {
|
|
GRN_TEXT_PUTS(ctx, buffer, "REMOVE_BLANK|");
|
|
}
|
|
if (string_->flags & GRN_STRING_WITH_TYPES) {
|
|
GRN_TEXT_PUTS(ctx, buffer, "WITH_TYPES|");
|
|
}
|
|
if (string_->flags & GRN_STRING_WITH_CHECKS) {
|
|
GRN_TEXT_PUTS(ctx, buffer, "WITH_CHECKS|");
|
|
}
|
|
if (string_->flags & GRN_STRING_REMOVE_TOKENIZED_DELIMITER) {
|
|
GRN_TEXT_PUTS(ctx, buffer, "REMOVE_TOKENIZED_DELIMITER|");
|
|
}
|
|
if (GRN_TEXT_VALUE(buffer)[GRN_TEXT_LEN(buffer) - 1] == '|') {
|
|
grn_bulk_truncate(ctx, buffer, GRN_TEXT_LEN(buffer) - 1);
|
|
}
|
|
|
|
GRN_TEXT_PUTS(ctx, buffer, ">");
|
|
|
|
return GRN_SUCCESS;
|
|
}
|
|
|
|
grn_rc
|
|
grn_string_close(grn_ctx *ctx, grn_obj *string)
|
|
{
|
|
grn_rc rc;
|
|
grn_string *string_ = (grn_string *)string;
|
|
if (string_) {
|
|
if (string_->normalized) { GRN_FREE(string_->normalized); }
|
|
if (string_->ctypes) { GRN_FREE(string_->ctypes); }
|
|
if (string_->checks) { GRN_FREE(string_->checks); }
|
|
GRN_FREE(string);
|
|
rc = GRN_SUCCESS;
|
|
} else {
|
|
rc = GRN_INVALID_ARGUMENT;
|
|
}
|
|
return rc;
|
|
}
|