Merge abarkov@bk-internal.mysql.com:/home/bk/mysql-5.0-rpl

into mysql.com:/home/bar/mysql-work/mysql-5.0.b27345
2025-01-18 04:53:01 +01:00 · 2007-07-03 13:58:19 +05:00 · 2007-07-03 13:58:19 +05:00 · 825570f5a4
commit 825570f5a4
parent 4bbac9acfd d3f43c874e
5 changed files with 261 additions and 2 deletions
--- a/mysql-test/r/ctype_uca.result
+++ b/mysql-test/r/ctype_uca.result
@ -2663,3 +2663,95 @@ COUNT(*)	c1
 1	
 1	a
 DROP TABLE IF EXISTS t1;
+set names utf8;
+create table t1 (
+a varchar(255),
+key a(a)
+) character set utf8 collate utf8_danish_ci;
+insert into t1 values ('åaaaa'),('ååaaa'),('aaaaa');
+select a as like_a from t1 where a like 'a%';
+like_a
+aaaaa
+select a as like_aa from t1 where a like 'aa%';
+like_aa
+aaaaa
+select a as like_aaa from t1 where a like 'aaa%';
+like_aaa
+aaaaa
+select a as like_aaaa from t1 where a like 'aaaa%';
+like_aaaa
+aaaaa
+select a as like_aaaaa from t1 where a like 'aaaaa%';
+like_aaaaa
+aaaaa
+alter table t1 convert to character set ucs2 collate ucs2_danish_ci;
+select a as like_a from t1 where a like 'a%';
+like_a
+aaaaa
+select a as like_aa from t1 where a like 'aa%';
+like_aa
+aaaaa
+select a as like_aaa from t1 where a like 'aaa%';
+like_aaa
+aaaaa
+select a as like_aaaa from t1 where a like 'aaaa%';
+like_aaaa
+aaaaa
+select a as like_aaaaa from t1 where a like 'aaaaa%';
+like_aaaaa
+aaaaa
+drop table t1;
+create table t1 (
+a varchar(255),
+key(a)
+) character set utf8 collate utf8_spanish2_ci;
+insert into t1 values ('aaaaa'),('lllll'),('zzzzz');
+select a as like_l from t1 where a like 'l%';
+like_l
+lllll
+select a as like_ll from t1 where a like 'll%';
+like_ll
+lllll
+select a as like_lll from t1 where a like 'lll%';
+like_lll
+lllll
+select a as like_llll from t1 where a like 'llll%';
+like_llll
+lllll
+select a as like_lllll from t1 where a like 'lllll%';
+like_lllll
+lllll
+alter table t1 convert to character set ucs2 collate ucs2_spanish2_ci;
+select a as like_l from t1 where a like 'l%';
+like_l
+lllll
+select a as like_ll from t1 where a like 'll%';
+like_ll
+lllll
+select a as like_lll from t1 where a like 'lll%';
+like_lll
+lllll
+select a as like_llll from t1 where a like 'llll%';
+like_llll
+lllll
+select a as like_lllll from t1 where a like 'lllll%';
+like_lllll
+lllll
+drop table t1;
+create table t1 (
+a varchar(255),
+key a(a)
+) character set utf8 collate utf8_czech_ci;
+insert into t1 values
+('b'),('c'),('d'),('e'),('f'),('g'),('h'),('ch'),('i'),('j');
+select * from t1 where a like 'c%';
+a
+c
+ch
+alter table t1 convert to character set ucs2 collate ucs2_czech_ci;
+select * from t1 where a like 'c%';
+a
+c
+ch
+drop table t1;
+End for 5.0 tests
--- a/mysql-test/t/ctype_uca.test
+++ b/mysql-test/t/ctype_uca.test
@ -485,3 +485,57 @@ CREATE TABLE t1 (
 insert into t1 values (''),('a');
 SELECT COUNT(*), c1 FROM t1 GROUP BY c1;
 DROP TABLE IF EXISTS t1;
+
+#
+# Bug#27345 Incorrect data returned when range-read from utf8_danish_ci indexes
+#
+set names utf8;
+create table t1 (
+  a varchar(255),
+  key a(a)
+) character set utf8 collate utf8_danish_ci;
+insert into t1 values ('åaaaa'),('ååaaa'),('aaaaa');
+select a as like_a from t1 where a like 'a%';
+select a as like_aa from t1 where a like 'aa%';
+select a as like_aaa from t1 where a like 'aaa%';
+select a as like_aaaa from t1 where a like 'aaaa%';
+select a as like_aaaaa from t1 where a like 'aaaaa%';
+alter table t1 convert to character set ucs2 collate ucs2_danish_ci;
+select a as like_a from t1 where a like 'a%';
+select a as like_aa from t1 where a like 'aa%';
+select a as like_aaa from t1 where a like 'aaa%';
+select a as like_aaaa from t1 where a like 'aaaa%';
+select a as like_aaaaa from t1 where a like 'aaaaa%';
+drop table t1;
+
+create table t1 (
+  a varchar(255),
+  key(a)
+) character set utf8 collate utf8_spanish2_ci;
+insert into t1 values ('aaaaa'),('lllll'),('zzzzz');
+select a as like_l from t1 where a like 'l%';
+select a as like_ll from t1 where a like 'll%';
+select a as like_lll from t1 where a like 'lll%';
+select a as like_llll from t1 where a like 'llll%';
+select a as like_lllll from t1 where a like 'lllll%';
+alter table t1 convert to character set ucs2 collate ucs2_spanish2_ci;
+select a as like_l from t1 where a like 'l%';
+select a as like_ll from t1 where a like 'll%';
+select a as like_lll from t1 where a like 'lll%';
+select a as like_llll from t1 where a like 'llll%';
+select a as like_lllll from t1 where a like 'lllll%';
+drop table t1;
+
+create table t1 (
+  a varchar(255),
+  key a(a)
+) character set utf8 collate utf8_czech_ci;
+# In Czech, 'ch' is a single letter sorted between 'h' and 'i'
+insert into t1 values
+('b'),('c'),('d'),('e'),('f'),('g'),('h'),('ch'),('i'),('j');
+select * from t1 where a like 'c%';
+alter table t1 convert to character set ucs2 collate ucs2_czech_ci;
+select * from t1 where a like 'c%';
+drop table t1;
+
+-- echo End for 5.0 tests
--- a/strings/ctype-mb.c
+++ b/strings/ctype-mb.c
@ -563,6 +563,8 @@ my_bool my_like_range_mb(CHARSET_INFO *cs,
  char *min_end= min_str + res_length;
  char *max_end= max_str + res_length;
  uint maxcharlen= res_length / cs->mbmaxlen;
+  const char *contraction_flags= cs->contractions ? 
+              ((const char*) cs->contractions) + 0x40*0x40 : NULL;

  for (; ptr != end && min_str != min_end && maxcharlen ; maxcharlen--)
  {
@ -571,6 +573,7 @@ my_bool my_like_range_mb(CHARSET_INFO *cs,
      ptr++;                                    /* Skip escape */
    else if (*ptr == w_one || *ptr == w_many)   /* '_' and '%' in SQL */
    {      
+fill_max_and_min:
      /*
        Calculate length of keys:
        'a\0\0... is the smallest possible string when we have space expand
@ -602,8 +605,74 @@ my_bool my_like_range_mb(CHARSET_INFO *cs,
       *min_str++= *max_str++= *ptr++;
    }
    else
-       *min_str++= *max_str++= *ptr++;    
+    {
+      /*
+        Special case for collations with contractions.
+        For example, in Czech, 'ch' is a separate letter
+        which is sorted between 'h' and 'i'.
+        In the pattern 'abc%', 'c' at the end can mean:
+        - letter 'c' itself,
+        - beginning of the contraction 'ch'.

+        If we simply return this LIKE range:
+
+         'abc\min\min\min' and 'abc\max\max\max'
+
+        then this query: SELECT * FROM t1 WHERE a LIKE 'abc%'
+        will only find values starting with 'abc[^h]',
+        but won't find values starting with 'abch'.
+
+        We must ignore contraction heads followed by w_one or w_many.
+        ('Contraction head' means any letter which can be the first
+        letter in a contraction)
+
+        For example, for Czech 'abc%', we will return the LIKE range
+        that is equal to the LIKE range for 'ab%':
+
+        'ab\min\min\min\min' and 'ab\max\max\max\max'.
+
+      */
+      if (contraction_flags && ptr + 1 < end &&
+          contraction_flags[(uchar) *ptr])
+      {
+        /* Ptr[0] is a contraction head. */
+        
+        if (ptr[1] == w_one || ptr[1] == w_many)
+        {
+          /* Contraction head followed by a wildcard, quit. */
+          goto fill_max_and_min;
+        }
+        
+        /*
+          Some letters can be both contraction heads and contraction tails.
+          For example, in Danish 'aa' is a separate single letter which
+          is sorted after 'z'. So 'a' can be both head and tail.
+          
+          If ptr[0]+ptr[1] is a contraction,
+          then put both letters together.
+          
+          If ptr[1] can be a contraction part, but ptr[0]+ptr[1]
+          is not a contraction, then we put only ptr[0],
+          and continue with ptr[1] on the next loop.
+        */
+        if (contraction_flags[(uchar) ptr[1]] &&
+            cs->contractions[(*ptr-0x40)*0x40 + ptr[1] - 0x40])
+        {
+          /* Contraction found */
+          if (maxcharlen == 1 || min_str + 1 >= min_end)
+          {
+            /* Both contraction parts don't fit, quit */
+            goto fill_max_and_min;
+          }
+          
+          /* Put contraction head */
+          *min_str++= *max_str++= *ptr++;
+          maxcharlen--;
+        }
+      }
+      /* Put contraction tail, or a single character */
+      *min_str++= *max_str++= *ptr++;    
+    }
  }

  *min_length= *max_length = (uint) (min_str - min_org);
--- a/strings/ctype-uca.c
+++ b/strings/ctype-uca.c
@ -7937,10 +7937,16 @@ static my_bool create_tailoring(CHARSET_INFO *cs, void *(*alloc)(uint))
  /* Now process contractions */
  if (ncontractions)
  {
-    uint size= 0x40*0x40*sizeof(uint16); /* 8K, for basic latin letter only */
+    /*
+      8K for weights for basic latin letter pairs,
+      plus 256 bytes for "is contraction part" flags.
+    */
+    uint size= 0x40*0x40*sizeof(uint16) + 256;
+    char *contraction_flags;
    if (!(cs->contractions= (uint16*) (*alloc)(size)))
        return 1;
    bzero((void*)cs->contractions, size);
+    contraction_flags= ((char*) cs->contractions) + 0x40*0x40;
    for (i=0; i < rc; i++)
    {
      if (rule[i].curr[1])
@ -7966,6 +7972,9 @@ static my_bool create_tailoring(CHARSET_INFO *cs, void *(*alloc)(uint))
        
        /* Copy base weight applying primary difference */
        cs->contractions[offsc]= offsb[0] + rule[i].diff[0];
+        /* Mark both letters as "is contraction part" */
+        contraction_flags[rule[i].curr[0]]= 1;
+        contraction_flags[rule[i].curr[1]]= 1;
      }
    }
  }
--- a/strings/ctype-ucs2.c
+++ b/strings/ctype-ucs2.c
@ -1524,6 +1524,8 @@ my_bool my_like_range_ucs2(CHARSET_INFO *cs,
  char *min_org=min_str;
  char *min_end=min_str+res_length;
  uint charlen= res_length / cs->mbmaxlen;
+  const char *contraction_flags= cs->contractions ?
+             ((const char*) cs->contractions) + 0x40*0x40 : NULL;
  
  for ( ; ptr + 1 < end && min_str + 1 < min_end && charlen > 0
        ; ptr+=2, charlen--)
@ -1545,6 +1547,7 @@ my_bool my_like_range_ucs2(CHARSET_INFO *cs,
    }
    if (ptr[0] == '\0' && ptr[1] == w_many)	/* '%' in SQL */
    {
+fill_max_and_min:
      /*
        Calculate length of keys:
        'a\0\0... is the smallest possible string when we have space expand
@ -1561,6 +1564,38 @@ my_bool my_like_range_ucs2(CHARSET_INFO *cs,
      } while (min_str + 1 < min_end);
      return 0;
    }
+
+    if (contraction_flags && ptr + 3 < end &&
+        ptr[0] == '\0' && contraction_flags[(uchar) ptr[1]])
+    {
+      /* Contraction head found */
+      if (ptr[2] == '\0' && (ptr[3] == w_one || ptr[3] == w_many))
+      {
+        /* Contraction head followed by a wildcard, quit */
+        goto fill_max_and_min;
+      }
+      
+      /*
+        Check if the second letter can be contraction part,
+        and if two letters really produce a contraction.
+      */
+      if (ptr[2] == '\0' && contraction_flags[(uchar) ptr[3]] &&
+          cs->contractions[(ptr[1]-0x40)*0x40 + ptr[3] - 0x40])
+      {
+        /* Contraction found */
+        if (charlen == 1 || min_str + 2 >= min_end)
+        {
+          /* Full contraction doesn't fit, quit */
+          goto fill_max_and_min;
+        }
+        
+        /* Put contraction head */
+        *min_str++= *max_str++= *ptr++;
+        *min_str++= *max_str++= *ptr++;
+        charlen--;
+      }
+    }
+    /* Put contraction tail, or a single character */
    *min_str++= *max_str++ = ptr[0];
    *min_str++= *max_str++ = ptr[1];
  }