From 228f4a43a353e9e7c56e1a617749fc9c0e875f6d Mon Sep 17 00:00:00 2001 From: unknown Date: Mon, 20 Oct 2003 15:53:48 +0200 Subject: [PATCH] FULLTEXT: correct charset support (UTF included, UCS2 - not) code cleanup include/m_ctype.h: my_mbcharlen_8bit() { return 1 } mysql-test/r/fulltext.result: fulltext on UTF mysql-test/t/fulltext.test: fulltext on UTF sql/item_cmpfunc.h: cleanup sql/sql_table.cc: FULLTEXT: UCS2 is not allowed sql/sql_yacc.yy: FULLTEXT: code cleanup strings/ctype-bin.c: my_mbcharlen_8bit() { return 1 } strings/ctype-latin1.c: my_mbcharlen_8bit() { return 1 } strings/ctype-simple.c: my_mbcharlen_8bit() { return 1 } strings/ctype-tis620.c: my_mbcharlen_8bit() { return 1 } strings/ctype-utf8.c: hack: (to be fixed properly later) all multi-byte sequences are considered isalpha() now --- include/m_ctype.h | 1 + mysql-test/r/fulltext.result | 21 ++++++++++ mysql-test/t/fulltext.test | 18 +++++++++ sql/item_cmpfunc.h | 1 - sql/item_func.cc | 75 +++++++++++++++++------------------- sql/item_func.h | 30 ++++++--------- sql/sql_table.cc | 1 + sql/sql_yacc.yy | 18 +++++---- strings/ctype-bin.c | 26 ++++++++----- strings/ctype-latin1.c | 4 +- strings/ctype-simple.c | 2 +- strings/ctype-tis620.c | 2 +- strings/ctype-utf8.c | 8 ++-- 13 files changed, 121 insertions(+), 86 deletions(-) diff --git a/include/m_ctype.h b/include/m_ctype.h index b1557e5293b..f39cbf8b659 100644 --- a/include/m_ctype.h +++ b/include/m_ctype.h @@ -310,6 +310,7 @@ int my_wildcmp_8bit(CHARSET_INFO *, uint my_numchars_8bit(CHARSET_INFO *, const char *b, const char *e); uint my_charpos_8bit(CHARSET_INFO *, const char *b, const char *e, uint pos); +int my_mbcharlen_8bit(CHARSET_INFO *, uint c); /* Functions for multibyte charsets */ diff --git a/mysql-test/r/fulltext.result b/mysql-test/r/fulltext.result index 87416ae97eb..63338869fdd 100644 --- a/mysql-test/r/fulltext.result +++ b/mysql-test/r/fulltext.result @@ -279,3 +279,24 @@ select * from t1 join t2 using(`t1_id`) where match (t1.name, t2.name) against(' t1_id name t2_id t1_id name 1 data1 1 1 xxfoo drop table t1,t2; +SET NAMES latin1; +CREATE TABLE t1 (t text character set utf8 not null, fulltext(t)); +INSERT t1 VALUES ('Mit freundlichem Grüß'), ('aus Osnabrück'); +SET NAMES koi8r; +INSERT t1 VALUES ("üÔÏ ÍÙ - ÏÐÉÌËÉ"),("ïÔÌÅÚØ, ÇÎÉÄÁ!"), +("îÅ ×ÌÅÚÁÊ, ÕÂØÅÔ!"),("É ÂÕÄÅÔ ÐÒÁ×!"); +SELECT t, charset(t) FROM t1 WHERE MATCH t AGAINST ('ïðéìëé'); +t charset(t) +üÔÏ ÍÙ - ÏÐÉÌËÉ utf8 +SELECT t, charset(t) FROM t1 WHERE MATCH t AGAINST ('ðÒá*' IN BOOLEAN MODE); +t charset(t) +É ÂÕÄÅÔ ÐÒÁ×! utf8 +SELECT * FROM t1 WHERE MATCH t AGAINST ('ÜÔÏ' IN BOOLEAN MODE); +t +SELECT t, charset(t) FROM t1 WHERE MATCH t AGAINST ('Osnabrück'); +t charset(t) +SET NAMES latin1; +SELECT t, charset(t) FROM t1 WHERE MATCH t AGAINST ('Osnabrück'); +t charset(t) +aus Osnabrück utf8 +DROP TABLE t1; diff --git a/mysql-test/t/fulltext.test b/mysql-test/t/fulltext.test index 04b0c1e6afd..56a27c83cf1 100644 --- a/mysql-test/t/fulltext.test +++ b/mysql-test/t/fulltext.test @@ -226,3 +226,21 @@ insert into t2 values (2, 1, 'xxbar'); insert into t2 values (3, 1, 'xxbuz'); select * from t1 join t2 using(`t1_id`) where match (t1.name, t2.name) against('xxfoo' in boolean mode); drop table t1,t2; + +# +# UTF8 +# +SET NAMES latin1; +CREATE TABLE t1 (t text character set utf8 not null, fulltext(t)); +INSERT t1 VALUES ('Mit freundlichem Grüß'), ('aus Osnabrück'); +SET NAMES koi8r; +INSERT t1 VALUES ("üÔÏ ÍÙ - ÏÐÉÌËÉ"),("ïÔÌÅÚØ, ÇÎÉÄÁ!"), + ("îÅ ×ÌÅÚÁÊ, ÕÂØÅÔ!"),("É ÂÕÄÅÔ ÐÒÁ×!"); +SELECT t, charset(t) FROM t1 WHERE MATCH t AGAINST ('ïðéìëé'); +SELECT t, charset(t) FROM t1 WHERE MATCH t AGAINST ('ðÒá*' IN BOOLEAN MODE); +SELECT * FROM t1 WHERE MATCH t AGAINST ('ÜÔÏ' IN BOOLEAN MODE); +SELECT t, charset(t) FROM t1 WHERE MATCH t AGAINST ('Osnabrück'); +SET NAMES latin1; +SELECT t, charset(t) FROM t1 WHERE MATCH t AGAINST ('Osnabrück'); +DROP TABLE t1; + diff --git a/sql/item_cmpfunc.h b/sql/item_cmpfunc.h index 42b73c48606..41e7060335c 100644 --- a/sql/item_cmpfunc.h +++ b/sql/item_cmpfunc.h @@ -647,7 +647,6 @@ class Item_func_in :public Item_int_func ~Item_func_in() { delete array; delete in_item; } optimize_type select_optimize() const { return array ? OPTIMIZE_KEY : OPTIMIZE_NONE; } - Item *key_item() const { return args[0]; } void print(String *str); enum Functype functype() const { return IN_FUNC; } const char *func_name() const { return " IN "; } diff --git a/sql/item_func.cc b/sql/item_func.cc index af9fa692f8b..dde830b61c9 100644 --- a/sql/item_func.cc +++ b/sql/item_func.cc @@ -81,12 +81,12 @@ bool Item_func::agg_arg_collations(DTCollation &c, Item **av, uint count) } -bool Item_func::agg_arg_collations_for_comparison(DTCollation &c, +bool Item_func::agg_arg_collations_for_comparison(DTCollation &c, Item **av, uint count) { if (agg_arg_collations(c, av, count)) return TRUE; - + if (c.derivation == DERIVATION_NONE) { my_coll_agg_error(av, count, func_name()); @@ -211,7 +211,7 @@ Item_func::fix_fields(THD *thd, TABLE_LIST *tables, Item **ref) item= *arg; if (item->maybe_null) maybe_null=1; - + with_sum_func= with_sum_func || item->with_sum_func; used_tables_cache|= item->used_tables(); not_null_tables_cache|= item->not_null_tables(); @@ -2545,9 +2545,13 @@ void Item_func_match::init_search(bool no_order) DBUG_VOID_RETURN; if (key == NO_SUCH_KEY) + { + List fields; + for (uint i=1; i < arg_count; i++) + fields.push_back(args[i]); concat=new Item_func_concat_ws(new Item_string(" ",1, - default_charset_info), - fields); + cmp_collation.collation), fields); + } if (master) { @@ -2559,14 +2563,19 @@ void Item_func_match::init_search(bool no_order) } String *ft_tmp= 0; - char tmp1[FT_QUERY_MAXLEN]; - String tmp2(tmp1,sizeof(tmp1),default_charset_info); // MATCH ... AGAINST (NULL) is meaningless, but possible - if (!(ft_tmp=key_item()->val_str(&tmp2))) + if (!(ft_tmp=key_item()->val_str(&value))) { - ft_tmp= &tmp2; - tmp2.set("",0,default_charset_info); + ft_tmp= &value; + value.set("",0,cmp_collation.collation); + } + + if (ft_tmp->charset() != cmp_collation.collation) + { + search_value.copy(ft_tmp->ptr(), ft_tmp->length(), ft_tmp->charset(), + cmp_collation.collation); + ft_tmp= &search_value; } ft_handler=table->file->ft_init_ext(mode, key, @@ -2583,7 +2592,6 @@ void Item_func_match::init_search(bool no_order) bool Item_func_match::fix_fields(THD *thd, TABLE_LIST *tlist, Item **ref) { - List_iterator li(fields); Item *item; maybe_null=1; @@ -2595,51 +2603,37 @@ bool Item_func_match::fix_fields(THD *thd, TABLE_LIST *tlist, Item **ref) modifications to find_best and auto_close as complement to auto_init code above. */ - if (Item_func::fix_fields(thd, tlist, ref) || !const_item()) + if (Item_func::fix_fields(thd, tlist, ref) || !args[0]->const_item()) { my_error(ER_WRONG_ARGUMENTS,MYF(0),"AGAINST"); return 1; } - while ((item=li++)) + const_item_cache=0; + for (uint i=1 ; i < arg_count ; i++) { - if (item->fix_fields(thd, tlist, li.ref()) || item->check_cols(1)) - return 1; + item=args[i]; if (item->type() == Item::REF_ITEM) - li.replace(item= *((Item_ref *)item)->ref); - if (item->type() != Item::FIELD_ITEM || !item->used_tables()) + args[i]= item= *((Item_ref *)item)->ref; + if (item->type() != Item::FIELD_ITEM) key=NO_SUCH_KEY; used_tables_cache|=item->used_tables(); } /* check that all columns come from the same table */ if (my_count_bits(used_tables_cache) != 1) key=NO_SUCH_KEY; - const_item_cache=0; - table=((Item_field *)fields.head())->field->table; - table->fulltext_searched=1; - record=table->record[0]; if (key == NO_SUCH_KEY && mode != FT_BOOL) { my_error(ER_WRONG_ARGUMENTS,MYF(0),"MATCH"); return 1; } - - return 0; -} - -bool Item_func_match::walk(Item_processor processor, byte *arg) -{ - List_iterator_fast li(fields); - Item *item; - while ((item= li++)) - if (item->walk(processor, arg)) - return 1; - return Item_func::walk(processor, arg); + table=((Item_field *)item)->field->table; + table->fulltext_searched=1; + return agg_arg_collations_for_comparison(cmp_collation, args+1, arg_count-1); } bool Item_func_match::fix_index() { - List_iterator_fast li(fields); Item_field *item; uint ft_to_key[MAX_KEY], ft_cnt[MAX_KEY], fts=0, keynr; uint max_cnt=0, mkeys=0; @@ -2661,8 +2655,9 @@ bool Item_func_match::fix_index() if (!fts) goto err; - while ((item=(Item_field*)(li++))) + for (uint i=1; i < arg_count; i++) { + item=(Item_field*)args[i]; for (keynr=0 ; keynr < fts ; keynr++) { KEY *ft_key=&table->key_info[ft_to_key[keynr]]; @@ -2696,8 +2691,8 @@ bool Item_func_match::fix_index() for (keynr=0 ; keynr <= mkeys ; keynr++) { - // for now, partial keys won't work. SerG - if (max_cnt < fields.elements || + // partial keys doesn't work + if (max_cnt < arg_count-1 || max_cnt < table->key_info[ft_to_key[keynr]].key_parts) continue; @@ -2712,8 +2707,7 @@ err: key=NO_SUCH_KEY; return 0; } - my_printf_error(ER_FT_MATCHING_KEY_NOT_FOUND, - ER(ER_FT_MATCHING_KEY_NOT_FOUND),MYF(0)); + my_error(ER_FT_MATCHING_KEY_NOT_FOUND,MYF(0)); return 1; } @@ -2759,7 +2753,8 @@ double Item_func_match::val() (byte *)a->ptr(), a->length())); } else - DBUG_RETURN(ft_handler->please->find_relevance(ft_handler, record, 0)); + DBUG_RETURN(ft_handler->please->find_relevance(ft_handler, + table->record[0], 0)); } diff --git a/sql/item_func.h b/sql/item_func.h index 8086e65786d..33609694fe9 100644 --- a/sql/item_func.h +++ b/sql/item_func.h @@ -962,20 +962,18 @@ public: class Item_func_match :public Item_real_func { public: - List fields; - String value; - TABLE *table; - Item_func_match *master; - FT_INFO * ft_handler; - Item *concat; - byte *record; uint key, mode; bool join_key; + DTCollation cmp_collation; + FT_INFO *ft_handler; + TABLE *table; + Item_func_match *master; // for master-slave optimization + Item *concat; // Item_func_concat_ws + String value; // value of concat + String search_value; // key_item()'s value converted to cmp_collation - Item_func_match(List &a, Item *b): Item_real_func(b), - fields(a), table(0), master(0), ft_handler(0), - concat(0), key(0), join_key(0) - {} + Item_func_match(List &a): Item_real_func(a), + table(0), master(0), ft_handler(0), concat(0), key(0), join_key(0) { } ~Item_func_match() { if (!master && ft_handler) @@ -999,17 +997,13 @@ public: bool fix_index(); void init_search(bool no_order); - - bool walk(Item_processor processor, byte *arg); }; class Item_func_match_nl :public Item_func_match { public: - Item_func_match_nl(List &a, Item *b) - :Item_func_match(a,b) - { mode=FT_NL; } + Item_func_match_nl(List &a) :Item_func_match(a) { mode=FT_NL; } const char *func_name() const { return "match_nl"; } }; @@ -1017,9 +1011,7 @@ public: class Item_func_match_bool :public Item_func_match { public: - Item_func_match_bool(List &a, Item *b) - :Item_func_match(a,b) - { mode=FT_BOOL; } + Item_func_match_bool(List &a) :Item_func_match(a) { mode=FT_BOOL; } const char *func_name() const { return "match_bool"; } }; diff --git a/sql/sql_table.cc b/sql/sql_table.cc index 46dc1191ef8..5112dfb59cd 100644 --- a/sql/sql_table.cc +++ b/sql/sql_table.cc @@ -733,6 +733,7 @@ int mysql_create_table(THD *thd,const char *db, const char *table_name, sql_field->sql_type != FIELD_TYPE_VAR_STRING && !f_is_blob(sql_field->pack_flag)) || sql_field->charset == &my_charset_bin || + sql_field->charset->state & MY_CS_NONTEXT || // ucs2 doesn't work yet (ft_key_charset && sql_field->charset != ft_key_charset)) { my_printf_error(ER_BAD_FT_COLUMN,ER(ER_BAD_FT_COLUMN),MYF(0), diff --git a/sql/sql_yacc.yy b/sql/sql_yacc.yy index 51df4e358b4..f7cd22545c5 100644 --- a/sql/sql_yacc.yy +++ b/sql/sql_yacc.yy @@ -2446,11 +2446,13 @@ simple_expr: | singlerow_subselect { $$= $1; } | '{' ident expr '}' { $$= $3; } | MATCH ident_list_arg AGAINST '(' expr ')' - { Select->add_ftfunc_to_list((Item_func_match *) - ($$=new Item_func_match_nl(*$2,$5))); } + { $2->push_front($5); + Select->add_ftfunc_to_list((Item_func_match *) + ($$=new Item_func_match_nl(*$2))); } | MATCH ident_list_arg AGAINST '(' expr IN_SYM BOOLEAN_SYM MODE_SYM ')' - { Select->add_ftfunc_to_list((Item_func_match *) - ($$=new Item_func_match_bool(*$2,$5))); } + { $2->push_front($5); + Select->add_ftfunc_to_list((Item_func_match *) + ($$=new Item_func_match_bool(*$2))); } | ASCII_SYM '(' expr ')' { $$= new Item_func_ascii($3); } | BINARY expr %prec NEG { @@ -2458,10 +2460,10 @@ simple_expr: 6, &my_charset_latin1)); } | CAST_SYM '(' expr AS cast_type ')' - { - $$= create_func_cast($3, $5, + { + $$= create_func_cast($3, $5, Lex->length ? atoi(Lex->length) : -1, - Lex->charset); + Lex->charset); } | CASE_SYM opt_expr WHEN_SYM when_list opt_else END { $$= new Item_func_case(* $4, $2, $5 ); } @@ -2962,7 +2964,7 @@ ident_list2: opt_expr: /* empty */ { $$= NULL; } - | expr { $$= $1; }; + | expr { $$= $1; }; opt_else: /* empty */ { $$= NULL; } diff --git a/strings/ctype-bin.c b/strings/ctype-bin.c index 340084ad848..cd1b1399506 100644 --- a/strings/ctype-bin.c +++ b/strings/ctype-bin.c @@ -118,6 +118,12 @@ static int my_strcasecmp_bin(CHARSET_INFO * cs __attribute__((unused)), return strcmp(s,t); } +int my_mbcharlen_8bit(CHARSET_INFO *cs __attribute__((unused)), + uint c __attribute__((unused))) +{ + return 1; +} + static int my_mb_wc_bin(CHARSET_INFO *cs __attribute__((unused)), my_wc_t *wc, const unsigned char *str, @@ -264,12 +270,12 @@ static int my_strnxfrm_bin(CHARSET_INFO *cs __attribute__((unused)), static uint my_instr_bin(CHARSET_INFO *cs __attribute__((unused)), - const char *big, uint b_length, + const char *big, uint b_length, const char *small, uint s_length, my_match_t *match, uint nmatch) { register const uchar *str, *search, *end, *search_end; - + if (s_length <= b_length) { if (!s_length) @@ -282,32 +288,32 @@ uint my_instr_bin(CHARSET_INFO *cs __attribute__((unused)), } return 1; /* Empty string is always found */ } - + str= (const uchar*) big; search= (const uchar*) small; end= (const uchar*) big+b_length-s_length+1; search_end= (const uchar*) small + s_length; - + skipp: while (str != end) { if ( (*str++) == (*search)) { register const uchar *i,*j; - - i= str; + + i= str; j= search+1; - + while (j != search_end) if ((*i++) != (*j++)) goto skipp; - + if (nmatch > 0) { match[0].beg= 0; match[0].end= str- (const uchar*)big-1; match[0].mblen= match[0].end; - + if (nmatch > 1) { match[1].beg= match[0].end; @@ -338,7 +344,7 @@ MY_COLLATION_HANDLER my_collation_8bit_bin_handler = static MY_CHARSET_HANDLER my_charset_handler= { NULL, /* ismbchar */ - NULL, /* mbcharlen */ + my_mbcharlen_8bit, /* mbcharlen */ my_numchars_8bit, my_charpos_8bit, my_lengthsp_8bit, diff --git a/strings/ctype-latin1.c b/strings/ctype-latin1.c index a8a5329f844..15798abb85b 100644 --- a/strings/ctype-latin1.c +++ b/strings/ctype-latin1.c @@ -170,14 +170,14 @@ int my_wc_mb_latin1(CHARSET_INFO *cs __attribute__((unused)), { if (str >= end) return MY_CS_TOOSMALL; - + return ((wc < 256) && ((str[0]=uni_latin1[wc]) || (!wc))) ? 1 : MY_CS_ILUNI; } static MY_CHARSET_HANDLER my_charset_handler= { NULL, - NULL, + my_mbcharlen_8bit, my_numchars_8bit, my_charpos_8bit, my_lengthsp_8bit, diff --git a/strings/ctype-simple.c b/strings/ctype-simple.c index f85ce5e7a2b..ed1d2c77049 100644 --- a/strings/ctype-simple.c +++ b/strings/ctype-simple.c @@ -1093,7 +1093,7 @@ skipp: MY_CHARSET_HANDLER my_charset_8bit_handler= { NULL, /* ismbchar */ - NULL, /* mbcharlen */ + my_mbcharlen_8bit, /* mbcharlen */ my_numchars_8bit, my_charpos_8bit, my_lengthsp_8bit, diff --git a/strings/ctype-tis620.c b/strings/ctype-tis620.c index a4d8a7d1f79..fd5e58ad8a7 100644 --- a/strings/ctype-tis620.c +++ b/strings/ctype-tis620.c @@ -717,7 +717,7 @@ static MY_COLLATION_HANDLER my_collation_ci_handler = static MY_CHARSET_HANDLER my_charset_handler= { NULL, /* ismbchar */ - NULL, /* mbcharlen */ + my_mbcharlen_8bit, /* mbcharlen */ my_numchars_8bit, my_charpos_8bit, my_lengthsp_8bit, diff --git a/strings/ctype-utf8.c b/strings/ctype-utf8.c index 3ede1aa26f6..b5716c53ea2 100644 --- a/strings/ctype-utf8.c +++ b/strings/ctype-utf8.c @@ -1540,10 +1540,10 @@ static uchar ctype_utf8[] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + 0, 0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0 }; static uchar to_lower_utf8[] = {