From 9bb464487d6c1c0359dba9aa79e88f927cd24c93 Mon Sep 17 00:00:00 2001
From: unknown <bar@mysql.com>
Date: Wed, 26 Jan 2005 16:34:09 +0400
Subject: [PATCH] CSC#4385: slow sorting for UTF8 large table: my_strnxfrm_utf8
 now requires 2 bytes per character in filesort key, instead of 3 bytes per
 character. Shorter filesort keys make sorting faster.

---
 include/m_ctype.h         |  4 +++-
 sql/filesort.cc           |  4 ++--
 strings/ctype-big5.c      |  1 +
 strings/ctype-bin.c       |  2 ++
 strings/ctype-cp932.c     |  1 +
 strings/ctype-czech.c     |  1 +
 strings/ctype-euc_kr.c    |  1 +
 strings/ctype-eucjpms.c   |  1 +
 strings/ctype-gb2312.c    |  1 +
 strings/ctype-gbk.c       |  1 +
 strings/ctype-latin1.c    |  1 +
 strings/ctype-mb.c        |  1 +
 strings/ctype-simple.c    | 10 ++++++++++
 strings/ctype-sjis.c      |  1 +
 strings/ctype-tis620.c    |  1 +
 strings/ctype-uca.c       |  2 ++
 strings/ctype-ucs2.c      |  2 ++
 strings/ctype-ujis.c      |  1 +
 strings/ctype-utf8.c      | 37 ++++++++++++++++++++++++-------------
 strings/ctype-win1250ch.c |  1 +
 20 files changed, 58 insertions(+), 16 deletions(-)

diff --git a/include/m_ctype.h b/include/m_ctype.h
index c2354c7feff..c41c7385b3d 100644
--- a/include/m_ctype.h
+++ b/include/m_ctype.h
@@ -110,6 +110,7 @@ typedef struct my_collation_handler_st
                          my_bool diff_if_only_endspace_difference);
   int     (*strnxfrm)(struct charset_info_st *,
 		      uchar *, uint, const uchar *, uint);
+  uint    (*strnxfrmlen)(struct charset_info_st *, uint); 
   my_bool (*like_range)(struct charset_info_st *,
 			const char *s, uint s_length,
 			pchar w_prefix, pchar w_one, pchar w_many, 
@@ -259,7 +260,8 @@ extern CHARSET_INFO my_charset_cp1250_czech_ci;
 
 /* declarations for simple charsets */
 extern int  my_strnxfrm_simple(CHARSET_INFO *, uchar *, uint, const uchar *,
-			       uint); 
+                               uint); 
+uint  my_strnxfrmlen_simple(CHARSET_INFO *, uint); 
 extern int  my_strnncoll_simple(CHARSET_INFO *, const uchar *, uint,
 				const uchar *, uint, my_bool);
 
diff --git a/sql/filesort.cc b/sql/filesort.cc
index 0e9fa8c79ed..1665358dbf0 100644
--- a/sql/filesort.cc
+++ b/sql/filesort.cc
@@ -1187,7 +1187,7 @@ sortlength(SORT_FIELD *sortorder, uint s_length, bool *multi_byte_charset)
 	{
 	  sortorder->need_strxnfrm= 1;
 	  *multi_byte_charset= 1;
-	  sortorder->length= sortorder->length*cs->strxfrm_multiply;
+          sortorder->length= cs->coll->strnxfrmlen(cs, sortorder->length);
 	}
       }
       if (sortorder->field->maybe_null())
@@ -1200,7 +1200,7 @@ sortlength(SORT_FIELD *sortorder, uint s_length, bool *multi_byte_charset)
 	sortorder->length=sortorder->item->max_length;
 	if (use_strnxfrm((cs=sortorder->item->collation.collation)))
 	{ 
-	  sortorder->length= sortorder->length*cs->strxfrm_multiply;
+          sortorder->length= cs->coll->strnxfrmlen(cs, sortorder->length);
 	  sortorder->need_strxnfrm= 1;
 	  *multi_byte_charset= 1;
 	}
diff --git a/strings/ctype-big5.c b/strings/ctype-big5.c
index a2db7de244e..70c5ec633be 100644
--- a/strings/ctype-big5.c
+++ b/strings/ctype-big5.c
@@ -6293,6 +6293,7 @@ static MY_COLLATION_HANDLER my_collation_big5_chinese_ci_handler =
   my_strnncoll_big5,
   my_strnncollsp_big5,
   my_strnxfrm_big5,
+  my_strnxfrmlen_simple,
   my_like_range_big5,
   my_wildcmp_mb,
   my_strcasecmp_mb,
diff --git a/strings/ctype-bin.c b/strings/ctype-bin.c
index 401605a462f..50c66a63e97 100644
--- a/strings/ctype-bin.c
+++ b/strings/ctype-bin.c
@@ -447,6 +447,7 @@ MY_COLLATION_HANDLER my_collation_8bit_bin_handler =
     my_strnncoll_8bit_bin,
     my_strnncollsp_8bit_bin,
     my_strnxfrm_8bit_bin,
+    my_strnxfrmlen_simple,
     my_like_range_simple,
     my_wildcmp_bin,
     my_strcasecmp_bin,
@@ -461,6 +462,7 @@ static MY_COLLATION_HANDLER my_collation_binary_handler =
     my_strnncoll_binary,
     my_strnncollsp_binary,
     my_strnxfrm_bin,
+    my_strnxfrmlen_simple,
     my_like_range_simple,
     my_wildcmp_bin,
     my_strcasecmp_bin,
diff --git a/strings/ctype-cp932.c b/strings/ctype-cp932.c
index 804f87b2a5b..c47f2c2d8ce 100644
--- a/strings/ctype-cp932.c
+++ b/strings/ctype-cp932.c
@@ -5454,6 +5454,7 @@ static MY_COLLATION_HANDLER my_collation_ci_handler =
   my_strnncoll_cp932,
   my_strnncollsp_cp932,
   my_strnxfrm_cp932,
+  my_strnxfrmlen_simple,
   my_like_range_cp932,
   my_wildcmp_mb,	/* wildcmp  */
   my_strcasecmp_8bit,
diff --git a/strings/ctype-czech.c b/strings/ctype-czech.c
index 2834dbb28ff..f5a410afc50 100644
--- a/strings/ctype-czech.c
+++ b/strings/ctype-czech.c
@@ -593,6 +593,7 @@ static MY_COLLATION_HANDLER my_collation_latin2_czech_ci_handler =
   my_strnncoll_czech,
   my_strnncollsp_czech,
   my_strnxfrm_czech,
+  my_strnxfrmlen_simple,
   my_like_range_czech,
   my_wildcmp_8bit,
   my_strcasecmp_8bit,
diff --git a/strings/ctype-euc_kr.c b/strings/ctype-euc_kr.c
index ee792d9c3e4..289b7309ea0 100644
--- a/strings/ctype-euc_kr.c
+++ b/strings/ctype-euc_kr.c
@@ -8641,6 +8641,7 @@ static MY_COLLATION_HANDLER my_collation_ci_handler =
   my_strnncoll_simple,  /* strnncoll  */
   my_strnncollsp_simple,
   my_strnxfrm_simple,	/* strnxfrm   */
+  my_strnxfrmlen_simple,
   my_like_range_simple, /* like_range */
   my_wildcmp_mb,	/* wildcmp    */
   my_strcasecmp_mb,
diff --git a/strings/ctype-eucjpms.c b/strings/ctype-eucjpms.c
index 5b108d24f4b..8c8d237cf48 100644
--- a/strings/ctype-eucjpms.c
+++ b/strings/ctype-eucjpms.c
@@ -8636,6 +8636,7 @@ static MY_COLLATION_HANDLER my_collation_ci_handler =
     my_strnncoll_simple,/* strnncoll    */
     my_strnncollsp_simple,
     my_strnxfrm_simple,	/* strnxfrm     */
+    my_strnxfrmlen_simple,
     my_like_range_simple,/* like_range   */
     my_wildcmp_mb,	/* wildcmp      */
     my_strcasecmp_mb,
diff --git a/strings/ctype-gb2312.c b/strings/ctype-gb2312.c
index f17cc94723f..73e4132dd7f 100644
--- a/strings/ctype-gb2312.c
+++ b/strings/ctype-gb2312.c
@@ -5692,6 +5692,7 @@ static MY_COLLATION_HANDLER my_collation_ci_handler =
   my_strnncoll_simple,  /* strnncoll  */
   my_strnncollsp_simple,
   my_strnxfrm_simple,	/* strnxfrm   */
+  my_strnxfrmlen_simple,
   my_like_range_simple, /* like_range */
   my_wildcmp_mb,	/* wildcmp    */
   my_strcasecmp_mb,     /* instr      */
diff --git a/strings/ctype-gbk.c b/strings/ctype-gbk.c
index dc4aea60096..6b47b537fb9 100644
--- a/strings/ctype-gbk.c
+++ b/strings/ctype-gbk.c
@@ -9939,6 +9939,7 @@ static MY_COLLATION_HANDLER my_collation_ci_handler =
   my_strnncoll_gbk,
   my_strnncollsp_gbk,
   my_strnxfrm_gbk,
+  my_strnxfrmlen_simple,
   my_like_range_gbk,
   my_wildcmp_mb,
   my_strcasecmp_mb,
diff --git a/strings/ctype-latin1.c b/strings/ctype-latin1.c
index b5da99a7452..043645684cf 100644
--- a/strings/ctype-latin1.c
+++ b/strings/ctype-latin1.c
@@ -693,6 +693,7 @@ static MY_COLLATION_HANDLER my_collation_german2_ci_handler=
   my_strnncoll_latin1_de,
   my_strnncollsp_latin1_de,
   my_strnxfrm_latin1_de,
+  my_strnxfrmlen_simple,
   my_like_range_simple,
   my_wildcmp_8bit,
   my_strcasecmp_8bit,
diff --git a/strings/ctype-mb.c b/strings/ctype-mb.c
index 4be21599fef..e902730d65a 100644
--- a/strings/ctype-mb.c
+++ b/strings/ctype-mb.c
@@ -912,6 +912,7 @@ MY_COLLATION_HANDLER my_collation_mb_bin_handler =
     my_strnncoll_mb_bin,
     my_strnncollsp_mb_bin,
     my_strnxfrm_mb_bin,
+    my_strnxfrmlen_simple,
     my_like_range_simple,
     my_wildcmp_mb_bin,
     my_strcasecmp_mb_bin,
diff --git a/strings/ctype-simple.c b/strings/ctype-simple.c
index bb623ef66f1..e436d5f8702 100644
--- a/strings/ctype-simple.c
+++ b/strings/ctype-simple.c
@@ -21,6 +21,15 @@
 
 #include "stdarg.h"
 
+/*
+  Returns the byte count strnxfrm() needs: len * strxfrm_multiply (0 acts as 1).
+*/
+uint my_strnxfrmlen_simple(CHARSET_INFO *cs, uint len)
+{
+  return len * (cs->strxfrm_multiply ? cs->strxfrm_multiply : 1);
+}
+
+
 /*
   Converts a string into its sort key.
   
@@ -1365,6 +1374,7 @@ MY_COLLATION_HANDLER my_collation_8bit_simple_ci_handler =
     my_strnncoll_simple,
     my_strnncollsp_simple,
     my_strnxfrm_simple,
+    my_strnxfrmlen_simple,
     my_like_range_simple,
     my_wildcmp_8bit,
     my_strcasecmp_8bit,
diff --git a/strings/ctype-sjis.c b/strings/ctype-sjis.c
index c1e41dc2d94..22cc8d9818d 100644
--- a/strings/ctype-sjis.c
+++ b/strings/ctype-sjis.c
@@ -4627,6 +4627,7 @@ static MY_COLLATION_HANDLER my_collation_ci_handler =
   my_strnncoll_sjis,
   my_strnncollsp_sjis,
   my_strnxfrm_sjis,
+  my_strnxfrmlen_simple,
   my_like_range_sjis,
   my_wildcmp_mb,	/* wildcmp  */
   my_strcasecmp_8bit,
diff --git a/strings/ctype-tis620.c b/strings/ctype-tis620.c
index c6bdd106ad4..9ba35e1c8ec 100644
--- a/strings/ctype-tis620.c
+++ b/strings/ctype-tis620.c
@@ -927,6 +927,7 @@ static MY_COLLATION_HANDLER my_collation_ci_handler =
     my_strnncoll_tis620,
     my_strnncollsp_tis620,
     my_strnxfrm_tis620,
+    my_strnxfrmlen_simple,
     my_like_range_tis620,
     my_wildcmp_8bit,	/* wildcmp   */
     my_strcasecmp_8bit,
diff --git a/strings/ctype-uca.c b/strings/ctype-uca.c
index 2353c9397a2..8345d0474f2 100644
--- a/strings/ctype-uca.c
+++ b/strings/ctype-uca.c
@@ -8024,6 +8024,7 @@ MY_COLLATION_HANDLER my_collation_ucs2_uca_handler =
     my_strnncoll_ucs2_uca,
     my_strnncollsp_ucs2_uca,
     my_strnxfrm_ucs2_uca,
+    my_strnxfrmlen_simple,
     my_like_range_ucs2,
     my_wildcmp_uca,
     NULL,
@@ -8504,6 +8505,7 @@ MY_COLLATION_HANDLER my_collation_any_uca_handler =
     my_strnncoll_any_uca,
     my_strnncollsp_any_uca,
     my_strnxfrm_any_uca,
+    my_strnxfrmlen_simple,
     my_like_range_mb,
     my_wildcmp_uca,
     NULL,
diff --git a/strings/ctype-ucs2.c b/strings/ctype-ucs2.c
index adfd4794e36..0d45cceb64d 100644
--- a/strings/ctype-ucs2.c
+++ b/strings/ctype-ucs2.c
@@ -1499,6 +1499,7 @@ static MY_COLLATION_HANDLER my_collation_ucs2_general_ci_handler =
     my_strnncoll_ucs2,
     my_strnncollsp_ucs2,
     my_strnxfrm_ucs2,
+    my_strnxfrmlen_simple,
     my_like_range_ucs2,
     my_wildcmp_ucs2_ci,
     my_strcasecmp_ucs2,
@@ -1513,6 +1514,7 @@ static MY_COLLATION_HANDLER my_collation_ucs2_bin_handler =
     my_strnncoll_ucs2_bin,
     my_strnncollsp_ucs2_bin,
     my_strnxfrm_ucs2_bin,
+    my_strnxfrmlen_simple,
     my_like_range_simple,
     my_wildcmp_ucs2_bin,
     my_strcasecmp_ucs2_bin,
diff --git a/strings/ctype-ujis.c b/strings/ctype-ujis.c
index fc1496df280..deaddcc76f6 100644
--- a/strings/ctype-ujis.c
+++ b/strings/ctype-ujis.c
@@ -8501,6 +8501,7 @@ static MY_COLLATION_HANDLER my_collation_ci_handler =
     my_strnncoll_simple,/* strnncoll    */
     my_strnncollsp_simple,
     my_strnxfrm_simple,	/* strnxfrm     */
+    my_strnxfrmlen_simple,
     my_like_range_simple,/* like_range   */
     my_wildcmp_mb,	/* wildcmp      */
     my_strcasecmp_mb,
diff --git a/strings/ctype-utf8.c b/strings/ctype-utf8.c
index 1f9f158a73d..e17e7587e85 100644
--- a/strings/ctype-utf8.c
+++ b/strings/ctype-utf8.c
@@ -2238,6 +2238,12 @@ int my_wildcmp_utf8(CHARSET_INFO *cs,
 }
 
 
+static
+uint my_strnxfrmlen_utf8(CHARSET_INFO *cs __attribute__((unused)), uint len)
+{
+  return (len * 2 + 2) / 3;  /* ceil(len * 2 / 3): 2 key bytes per 3 source bytes */
+}
+
 static int my_strnxfrm_utf8(CHARSET_INFO *cs,
                             uchar *dst, uint dstlen,
                             const uchar *src, uint srclen)
@@ -2245,29 +2251,33 @@ static int my_strnxfrm_utf8(CHARSET_INFO *cs,
   my_wc_t wc;
   int res;
   int plane;
-  uchar *de = dst + dstlen;
+  uchar *de= dst + dstlen;      /* one past the end of the key buffer */
+  uchar *de_beg= de - 1;        /* a 2-byte weight fits only while dst < de_beg */
   const uchar *se = src + srclen;
 
-  while( src < se && dst < de )
+  while (dst < de_beg)
   {
-    if ((res=my_utf8_uni(cs,&wc, src, se))<0)
-    {
+    if ((res=my_utf8_uni(cs,&wc, src, se)) <= 0)  /* stop when no character is decoded */
       break;
-    }
     src+=res;
-    srclen-=res;
 
     plane=(wc>>8) & 0xFF;
     wc = uni_plane[plane] ? uni_plane[plane][wc & 0xFF].sort : wc;
 
-    if ((res=my_uni_utf8(cs,wc,dst,de)) <0)
-    {
-      break;
-    }
-    dst+=res;
+    *dst++= wc >> 8;      /* weight is stored as two bytes, high byte first */
+    *dst++= wc & 0xFF;
+
   }
-  if (dst < de)
-    bfill(dst, de - dst, ' ');
+
+  while (dst < de_beg) /* Fill the tail with keys for space character */
+  {
+    *dst++= 0x00;
+    *dst++= 0x20;
+  }
+
+  if (dst < de)  /* Odd "dstlen": clear the last byte, which is at dst (de is past the end) */
+    *dst= 0x00;
+
   return dstlen;
 }
 
@@ -2306,6 +2316,7 @@ static MY_COLLATION_HANDLER my_collation_ci_handler =
     my_strnncoll_utf8,
     my_strnncollsp_utf8,
     my_strnxfrm_utf8,
+    my_strnxfrmlen_utf8,
     my_like_range_mb,
     my_wildcmp_utf8,
     my_strcasecmp_utf8,
diff --git a/strings/ctype-win1250ch.c b/strings/ctype-win1250ch.c
index b58a8f0f1e5..37611a5bd20 100644
--- a/strings/ctype-win1250ch.c
+++ b/strings/ctype-win1250ch.c
@@ -626,6 +626,7 @@ static MY_COLLATION_HANDLER my_collation_czech_ci_handler =
   my_strnncoll_win1250ch,
   my_strnncollsp_win1250ch,
   my_strnxfrm_win1250ch,
+  my_strnxfrmlen_simple,
   my_like_range_win1250ch,
   my_wildcmp_8bit,
   my_strcasecmp_8bit,