From a48db602dd9bb54439cabef8c5921639837e639e Mon Sep 17 00:00:00 2001
From: "jan@hundin.mysql.fi" <>
Date: Fri, 3 Sep 2004 15:26:29 +0300
Subject: [PATCH] Fixed unique prefix key bug for multibyte character sets (BUG
 #4521) for InnoDB. This also fixes a second part of the same problem with
 prefix keys on a multibyte string column for InnoDB.

---
 innobase/btr/btr0btr.c         |  9 +++--
 innobase/rem/rem0cmp.c         | 34 ++----------------
 innobase/row/row0ins.c         | 17 ++++++---
 innobase/row/row0row.c         | 33 +++++++++++++----
 innobase/row/row0sel.c         | 15 ++++++--
 innobase/row/row0upd.c         | 32 +++++++++++++----
 mysql-test/r/ctype_utf8.result | 66 ++++++++++++++++++++++++++++++++++
 mysql-test/t/ctype_utf8.test   | 61 ++++++++++++++++++++++++++++++-
 sql/ha_innodb.cc               | 58 ++++++++++++++++++++++++++++++
 sql/ha_innodb.h                |  1 +
 10 files changed, 271 insertions(+), 55 deletions(-)

diff --git a/innobase/btr/btr0btr.c b/innobase/btr/btr0btr.c
index 27d798f925a..e31aadbbfff 100644
--- a/innobase/btr/btr0btr.c
+++ b/innobase/btr/btr0btr.c
@@ -2400,14 +2400,17 @@ btr_index_rec_validate(
 		dtype_t*	type = dict_index_get_nth_type(index, i);
 
 		rec_get_nth_field(rec, i, &len);
-		
+
+		/* Note that prefix indexes are not fixed size even when
+		their type is CHAR. */
+
 		if ((dict_index_get_nth_field(index, i)->prefix_len == 0
 		    && len != UNIV_SQL_NULL && dtype_is_fixed_size(type)
 		    && len != dtype_get_fixed_size(type))
 		   ||
 		   (dict_index_get_nth_field(index, i)->prefix_len > 0
-		    && len != UNIV_SQL_NULL && dtype_is_fixed_size(type)
-		    && len !=
+		    && len != UNIV_SQL_NULL
+		    && len >
 			   dict_index_get_nth_field(index, i)->prefix_len)) {
 
 			btr_index_rec_validate_report(page, rec, index);
diff --git a/innobase/rem/rem0cmp.c b/innobase/rem/rem0cmp.c
index f6c82102839..041fb7914e2 100644
--- a/innobase/rem/rem0cmp.c
+++ b/innobase/rem/rem0cmp.c
@@ -14,9 +14,6 @@ Created 7/1/1994 Heikki Tuuri
 
 #include "srv0srv.h"
 
-#include <m_ctype.h>
-#include <my_sys.h>
-
 /*		ALPHABETICAL ORDER
 		==================
 		
@@ -455,8 +452,6 @@ cmp_dtuple_rec_with_match(
 	ulint		cur_bytes; 	/* number of already matched bytes 
 					in current field */
 	int		ret = 3333;	/* return value */
-	
-	CHARSET_INFO*   charset;        /* charset used in the field */
 
 	ut_ad(dtuple && rec && matched_fields && matched_bytes);
 	ut_ad(dtuple_check_typed(dtuple));
@@ -546,33 +541,8 @@ cmp_dtuple_rec_with_match(
 			&& dtype_get_charset_coll(cur_type->prtype) !=
 				data_mysql_latin1_swedish_charset_coll)) {
 
-		  	/* If character set is not latin1_swedish
-			we have to devide character length by the
-			maximum bytes needed for that character
-			set. For example if we have unique prefix
-			index for 1 utf8 character then we have
-			actually 3 bytes allocated in the index.
-			Therefore, we have to divide that with
-			maximum bytes needed for utf8 character i.e.
-			3 byges.*/
-
-                        if ( dtuple_f_len > 0) {
-			  charset = get_charset(
-				dtype_get_charset_coll(cur_type->prtype), 
-				MYF(MY_WME));
-
-			  ut_ad(charset);
-			  ut_ad(charset->mbmaxlen);
-
-			  dtuple_f_len = dtuple_f_len / charset->mbmaxlen;
-
-                          if ( dtuple_f_len == 0)
-			    dtuple_f_len = 1;
-
-			  rec_f_len = dtuple_f_len;
-			} 
-
-			ret = cmp_whole_field(cur_type,
+			ret = cmp_whole_field(
+				cur_type,
 				dfield_get_data(dtuple_field), dtuple_f_len,
 				rec_b_ptr, rec_f_len);
 
diff --git a/innobase/row/row0ins.c b/innobase/row/row0ins.c
index edd3099b5f3..2429b8f2bf3 100644
--- a/innobase/row/row0ins.c
+++ b/innobase/row/row0ins.c
@@ -1999,6 +1999,7 @@ row_ins_index_entry_set_vals(
 	dfield_t*	row_field;
 	ulint		n_fields;
 	ulint		i;
+	dtype_t*        cur_type;
 
 	ut_ad(entry && row);
 
@@ -2012,10 +2013,18 @@ row_ins_index_entry_set_vals(
 
 		/* Check column prefix indexes */
 		if (ind_field->prefix_len > 0
-		    && dfield_get_len(row_field) != UNIV_SQL_NULL
-		    && dfield_get_len(row_field) > ind_field->prefix_len) {
-		    
-		        field->len = ind_field->prefix_len;
+		    && dfield_get_len(row_field) != UNIV_SQL_NULL) {
+
+			/* For prefix keys get the storage length of the
+			prefix, measured on the source value in row_field. */
+
+			cur_type = dict_col_get_type(
+				dict_field_get_col(ind_field));
+
+			field->len = innobase_get_at_most_n_mbchars(
+				dtype_get_charset_coll(cur_type->prtype),
+				ind_field->prefix_len,
+				dfield_get_len(row_field),row_field->data);
 		} else {
 		        field->len = row_field->len;
 		}
diff --git a/innobase/row/row0row.c b/innobase/row/row0row.c
index 680539764fd..ed6462b7377 100644
--- a/innobase/row/row0row.c
+++ b/innobase/row/row0row.c
@@ -113,6 +113,8 @@ row_build_index_entry(
 	dfield_t*	dfield2;
 	dict_col_t*	col;
 	ulint		i;
+        ulint           storage_len;
+	dtype_t*	cur_type;
 
 	ut_ad(row && index && heap);
 	ut_ad(dtuple_check_typed(row));
@@ -139,10 +141,20 @@ row_build_index_entry(
 
 		/* If a column prefix index, take only the prefix */
 		if (ind_field->prefix_len > 0
-		    && dfield_get_len(dfield2) != UNIV_SQL_NULL
-		    && dfield_get_len(dfield2) > ind_field->prefix_len) {
+		    && dfield_get_len(dfield2) != UNIV_SQL_NULL) {
 			
-			dfield_set_len(dfield, ind_field->prefix_len);
+			/* For prefix keys get the storage length
+			for the prefix_len characters. */
+
+			cur_type = dict_col_get_type(
+				dict_field_get_col(ind_field));
+
+			storage_len = innobase_get_at_most_n_mbchars(
+				dtype_get_charset_coll(cur_type->prtype),
+				ind_field->prefix_len,
+				dfield_get_len(dfield2),dfield2->data);
+
+			dfield_set_len(dfield,storage_len);
 		}
 	}
 
@@ -460,6 +472,7 @@ row_build_row_ref_from_row(
 	dict_col_t*	col;
 	ulint		ref_len;
 	ulint		i;
+	dtype_t*	cur_type;
 	
 	ut_ad(ref && table && row);
 		
@@ -481,10 +494,18 @@ row_build_row_ref_from_row(
 		dfield_copy(dfield, dfield2);
 
 		if (field->prefix_len > 0
-		    && dfield->len != UNIV_SQL_NULL
-		    && dfield->len > field->prefix_len) {
+		    && dfield->len != UNIV_SQL_NULL) {
 
-		        dfield->len = field->prefix_len;
+			/* For prefix keys get the storage length
+			for the prefix_len characters. */
+
+			cur_type = dict_col_get_type(
+				dict_field_get_col(field));
+
+			dfield->len = innobase_get_at_most_n_mbchars(
+				dtype_get_charset_coll(cur_type->prtype),
+				field->prefix_len,
+				dfield->len,dfield->data);
 		}
 	}
 
diff --git a/innobase/row/row0sel.c b/innobase/row/row0sel.c
index 2c0092adc6e..d87cc857651 100644
--- a/innobase/row/row0sel.c
+++ b/innobase/row/row0sel.c
@@ -76,6 +76,7 @@ row_sel_sec_rec_is_for_clust_rec(
         ulint           clust_len;
         ulint           n;
         ulint           i;
+	dtype_t*	cur_type;
 
 	UT_NOT_USED(clust_index);
 
@@ -91,10 +92,18 @@ row_sel_sec_rec_is_for_clust_rec(
                 sec_field = rec_get_nth_field(sec_rec, i, &sec_len);
 
 		if (ifield->prefix_len > 0
-		    && clust_len != UNIV_SQL_NULL
-		    && clust_len > ifield->prefix_len) {
+		    && clust_len != UNIV_SQL_NULL) {
 
-		       clust_len = ifield->prefix_len;
+			/* For prefix keys get the storage length
+			for the prefix_len characters. */
+
+			cur_type = dict_col_get_type(
+				dict_field_get_col(ifield));
+
+			clust_len = innobase_get_at_most_n_mbchars(
+				dtype_get_charset_coll(cur_type->prtype),
+				ifield->prefix_len,
+				clust_len,clust_field);
 		}
 
                 if (0 != cmp_data_data(dict_col_get_type(col),
diff --git a/innobase/row/row0upd.c b/innobase/row/row0upd.c
index d35ae0a3e38..75400e06059 100644
--- a/innobase/row/row0upd.c
+++ b/innobase/row/row0upd.c
@@ -842,6 +842,7 @@ row_upd_index_replace_new_col_vals_index_pos(
 	dfield_t*	new_val;
 	ulint		j;
 	ulint		i;
+	dtype_t*	cur_type;
 
 	ut_ad(index);
 
@@ -871,10 +872,19 @@ row_upd_index_replace_new_col_vals_index_pos(
 				}
 
 				if (field->prefix_len > 0
-			            && new_val->len != UNIV_SQL_NULL
-			            && new_val->len > field->prefix_len) {
+			            && new_val->len != UNIV_SQL_NULL) {
 
-				        dfield->len = field->prefix_len;
+				/* For prefix keys get the storage length
+				for the prefix_len characters. */
+
+				  cur_type = dict_col_get_type(
+					dict_field_get_col(field));
+
+				  dfield->len = 
+				    innobase_get_at_most_n_mbchars(
+				      dtype_get_charset_coll(cur_type->prtype),
+					field->prefix_len,
+					new_val->len,new_val->data);
 				}
 			}
 		}
@@ -904,6 +914,7 @@ row_upd_index_replace_new_col_vals(
 	dfield_t*	new_val;
 	ulint		j;
 	ulint		i;
+	dtype_t*	cur_type;
 
 	ut_ad(index);
 
@@ -933,10 +944,19 @@ row_upd_index_replace_new_col_vals(
 				}
 
 				if (field->prefix_len > 0
-			            && new_val->len != UNIV_SQL_NULL
-			            && new_val->len > field->prefix_len) {
+			            && new_val->len != UNIV_SQL_NULL) {
 
-				        dfield->len = field->prefix_len;
+				/* For prefix keys get the storage length
+				for the prefix_len characters. */
+
+				cur_type = dict_col_get_type(
+					dict_field_get_col(field));
+
+				  dfield->len = 
+				    innobase_get_at_most_n_mbchars(
+				      dtype_get_charset_coll(cur_type->prtype),
+					field->prefix_len,
+					new_val->len,new_val->data);
 				}
 			}
 		}
diff --git a/mysql-test/r/ctype_utf8.result b/mysql-test/r/ctype_utf8.result
index f3be539251a..7fb7a508a4e 100644
--- a/mysql-test/r/ctype_utf8.result
+++ b/mysql-test/r/ctype_utf8.result
@@ -316,6 +316,39 @@ select c cb20 from t1 where c=repeat('b',20);
 cb20
 bbbbbbbbbbbbbbbbbbbb
 drop table t1;
+create table t1 (c varchar(30) character set utf8, unique(c(10))) engine=innodb;
+insert into t1 values ('1'),('2'),('3'),('x'),('y'),('z');
+insert into t1 values ('aaaaaaaaaa');
+insert into t1 values ('aaaaaaaaaaa');
+ERROR 23000: Duplicate entry 'aaaaaaaaaaa' for key 1
+insert into t1 values ('aaaaaaaaaaaa');
+ERROR 23000: Duplicate entry 'aaaaaaaaaaaa' for key 1
+insert into t1 values (repeat('b',20));
+select c c1 from t1 where c='1';
+c1
+1
+select c c2 from t1 where c='2';
+c2
+2
+select c c3 from t1 where c='3';
+c3
+3
+select c cx from t1 where c='x';
+cx
+x
+select c cy from t1 where c='y';
+cy
+y
+select c cz from t1 where c='z';
+cz
+z
+select c ca10 from t1 where c='aaaaaaaaaa';
+ca10
+aaaaaaaaaa
+select c cb20 from t1 where c=repeat('b',20);
+cb20
+bbbbbbbbbbbbbbbbbbbb
+drop table t1;
 create table t1 (c char(3) character set utf8, unique (c(2)));
 insert into t1 values ('1'),('2'),('3'),('4'),('x'),('y'),('z');
 insert into t1 values ('a');
@@ -339,6 +372,29 @@ insert into t1 values ('ꪪꪪ');
 insert into t1 values ('ꪪꪪꪪ');
 ERROR 23000: Duplicate entry 'ꪪꪪ' for key 1
 drop table t1;
+create table t1 (c char(3) character set utf8, unique (c(2))) engine=innodb;
+insert into t1 values ('1'),('2'),('3'),('4'),('x'),('y'),('z');
+insert into t1 values ('a');
+insert into t1 values ('aa');
+insert into t1 values ('aaa');
+ERROR 23000: Duplicate entry 'aaa' for key 1
+insert into t1 values ('b');
+insert into t1 values ('bb');
+insert into t1 values ('bbb');
+ERROR 23000: Duplicate entry 'bbb' for key 1
+insert into t1 values ('а');
+insert into t1 values ('аа');
+insert into t1 values ('ааа');
+ERROR 23000: Duplicate entry 'ааа' for key 1
+insert into t1 values ('б');
+insert into t1 values ('бб');
+insert into t1 values ('ббб');
+ERROR 23000: Duplicate entry 'ббб' for key 1
+insert into t1 values ('ꪪ');
+insert into t1 values ('ꪪꪪ');
+insert into t1 values ('ꪪꪪꪪ');
+ERROR 23000: Duplicate entry 'ꪪꪪ' for key 1
+drop table t1;
 create table t1 (
 c char(10) character set utf8,
 unique key a using hash (c(1))
@@ -611,6 +667,16 @@ str
 drop table t1;
 create table t1 (
 str varchar(255) character set utf8 not null,
+key str  (str(2))
+) engine=innodb;
+INSERT INTO t1 VALUES ('str');
+INSERT INTO t1 VALUES ('str2');
+select * from t1 where str='str';
+str
+str
+drop table t1;
+create table t1 (
+str varchar(255) character set utf8 not null,
 key str using btree (str(2))
 ) engine=heap;
 INSERT INTO t1 VALUES ('str');
diff --git a/mysql-test/t/ctype_utf8.test b/mysql-test/t/ctype_utf8.test
index 2c531d4e5d2..97cdfedca99 100644
--- a/mysql-test/t/ctype_utf8.test
+++ b/mysql-test/t/ctype_utf8.test
@@ -217,6 +217,27 @@ select c ca10 from t1 where c='aaaaaaaaaa';
 select c cb20 from t1 where c=repeat('b',20);
 drop table t1;
 
+#
+# Bug 4521: unique key prefix interacts poorly with utf8
+# InnoDB: keys with prefix compression, case insensitive collation.
+#
+create table t1 (c varchar(30) character set utf8, unique(c(10))) engine=innodb;
+insert into t1 values ('1'),('2'),('3'),('x'),('y'),('z');
+insert into t1 values ('aaaaaaaaaa');
+--error 1062
+insert into t1 values ('aaaaaaaaaaa');
+--error 1062
+insert into t1 values ('aaaaaaaaaaaa');
+insert into t1 values (repeat('b',20));
+select c c1 from t1 where c='1';
+select c c2 from t1 where c='2';
+select c c3 from t1 where c='3';
+select c cx from t1 where c='x';
+select c cy from t1 where c='y';
+select c cz from t1 where c='z';
+select c ca10 from t1 where c='aaaaaaaaaa';
+select c cb20 from t1 where c=repeat('b',20);
+drop table t1;
 #
 # Bug 4521: unique key prefix interacts poorly with utf8
 # MYISAM: fixed length keys, case insensitive collation
@@ -244,7 +265,33 @@ insert into t1 values ('ꪪꪪ');
 --error 1062
 insert into t1 values ('ꪪꪪꪪ');
 drop table t1;
-
+#
+# Bug 4521: unique key prefix interacts poorly with utf8
+# InnoDB: fixed length keys, case insensitive collation
+#
+create table t1 (c char(3) character set utf8, unique (c(2))) engine=innodb;
+insert into t1 values ('1'),('2'),('3'),('4'),('x'),('y'),('z');
+insert into t1 values ('a');
+insert into t1 values ('aa');
+--error 1062
+insert into t1 values ('aaa');
+insert into t1 values ('b');
+insert into t1 values ('bb');
+--error 1062
+insert into t1 values ('bbb');
+insert into t1 values ('а');
+insert into t1 values ('аа');
+--error 1062
+insert into t1 values ('ааа');
+insert into t1 values ('б');
+insert into t1 values ('бб');
+--error 1062
+insert into t1 values ('ббб');
+insert into t1 values ('ꪪ');
+insert into t1 values ('ꪪꪪ');
+--error 1062
+insert into t1 values ('ꪪꪪꪪ');
+drop table t1;
 #
 # Bug 4531: unique key prefix interacts poorly with utf8
 # Check HEAP+HASH, case insensitive collation
@@ -454,6 +501,18 @@ INSERT INTO t1 VALUES ('str2');
 select * from t1 where str='str';
 drop table t1;
 
+# Bug#4594: column index make = failed for gbk, but like works
+# Check InnoDB
+#
+create table t1 (
+  str varchar(255) character set utf8 not null,
+  key str  (str(2))
+) engine=innodb;
+INSERT INTO t1 VALUES ('str');
+INSERT INTO t1 VALUES ('str2');
+select * from t1 where str='str';
+drop table t1;
+
 # the same for HEAP+BTREE
 #
 
diff --git a/sql/ha_innodb.cc b/sql/ha_innodb.cc
index 5aa7c02fcc0..00fa6bf34aa 100644
--- a/sql/ha_innodb.cc
+++ b/sql/ha_innodb.cc
@@ -41,6 +41,7 @@ have disables the InnoDB inlining in this file. */
 #include <hash.h>
 #include <myisampack.h>
 #include <mysys_err.h>
+#include <my_sys.h>
 
 #define MAX_ULONG_BIT ((ulong) 1 << (sizeof(ulong)*8-1))
 
@@ -5268,4 +5269,61 @@ ulonglong ha_innobase::get_mysql_bin_log_pos()
   return trx_sys_mysql_bin_log_pos;
 }
 
+extern "C" {
+/***********************************************************************
+Looks up the character set with the given id and returns its mbmaxlen,
+i.e. the maximum number of bytes one character of that set can need. */
+
+ulint innobase_get_charset_mbmaxlen(
+	ulint charset_id)	/* in: charset id */
+{
+  CHARSET_INFO*   charset;        /* charset looked up by charset_id */
+
+  charset = get_charset(charset_id,MYF(MY_WME));
+
+  ut_ad(charset);
+  ut_ad(charset->mbmaxlen);
+
+  return charset->mbmaxlen;
+}
+}
+
+extern "C" {
+/***********************************************************************
+Returns the storage length in bytes of the leading part of a string that
+a column prefix index stores: at most nth / mbmaxlen characters, and
+never more than data_len bytes. */
+
+ulint innobase_get_at_most_n_mbchars(
+	ulint charset_id,	/* in: character set id */
+	ulint nth,		/* in: prefix length in bytes */
+	ulint data_len,		/* in: length of the string in bytes */
+	const char *pos)	/* in: character string */
+{
+  ulint char_length;		/* length of the prefix, in bytes */
+  ulint n_chars;		/* max number of characters in prefix */
+  CHARSET_INFO* charset;	/* charset used in the field */
+
+  ut_ad(pos);
+  charset = get_charset(charset_id,MYF(MY_WME));
+  ut_ad(charset);
+  ut_ad(charset->mbmaxlen);
+
+  n_chars = nth / charset->mbmaxlen;
+  if (charset->mbmaxlen > 1)
+  {
+    /* Clamp in case the string holds fewer than n_chars characters. */
+    char_length= my_charpos(charset, pos, pos + data_len, n_chars);
+    set_if_smaller(char_length, data_len);
+  }
+  else
+  {
+    /* Single-byte charset: a short value keeps its own length. */
+    char_length = (data_len < nth) ? data_len : nth;
+  }
+
+  return char_length;
+}
+}
+
 #endif /* HAVE_INNOBASE_DB */
diff --git a/sql/ha_innodb.h b/sql/ha_innodb.h
index 6556931fa1a..2aca6e3be70 100644
--- a/sql/ha_innodb.h
+++ b/sql/ha_innodb.h
@@ -228,3 +228,4 @@ my_bool innobase_query_caching_of_table_permitted(THD* thd, char* full_name,
 void innobase_release_temporary_latches(void* innobase_tid);
 
 void innobase_store_binlog_offset_and_flush_log(char *binlog_name,longlong offset);
+