mirror of
				https://github.com/MariaDB/server.git
				synced 2025-10-31 19:06:14 +01:00 
			
		
		
		
	
		
			
				
	
	
		
			1415 lines
		
	
	
	
		
			40 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
			
		
		
	
	
			1415 lines
		
	
	
	
		
			40 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
| /* Copyright (c) 2000, 2013, Oracle and/or its affiliates.
 | |
|    Copyright (c) 2009, 2020, MariaDB Corporation.
 | |
| 
 | |
|    This program is free software; you can redistribute it and/or modify
 | |
|    it under the terms of the GNU General Public License as published by
 | |
|    the Free Software Foundation; version 2 of the License.
 | |
| 
 | |
|    This program is distributed in the hope that it will be useful,
 | |
|    but WITHOUT ANY WARRANTY; without even the implied warranty of
 | |
|    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 | |
|    GNU General Public License for more details.
 | |
| 
 | |
|    You should have received a copy of the GNU General Public License
 | |
|    along with this program; if not, write to the Free Software
 | |
|    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1335  USA */
 | |
| 
 | |
| #include "strings_def.h"
 | |
| #include <m_ctype.h>
 | |
| #include <my_xml.h>
 | |
| 
 | |
| /*
 | |
| 
 | |
|   This file implements routines which parse XML-based
 | |
|   character set and collation description files.
 | |
|   
 | |
|   Unicode collations are encoded according to
 | |
|   
 | |
|     Unicode Technical Standard #35
 | |
|     Locale Data Markup Language (LDML)
 | |
|     http://www.unicode.org/reports/tr35/
 | |
|   
 | |
|   and converted into ICU string according to
 | |
|   
 | |
|     Collation Customization
 | |
|     http://oss.software.ibm.com/icu/userguide/Collate_Customization.html
 | |
|   
 | |
| */
 | |
/*
  Character set names kept as named constants.  They have external
  linkage, so other translation units can refer to one shared copy
  of each string.
*/
const char charset_name_latin2[]=  "latin2";
const char charset_name_utf8mb3[]= "utf8mb3";
const char charset_name_utf16[]=   "utf16";
const char charset_name_utf32[]=   "utf32";
const char charset_name_ucs2[]=    "ucs2";
const char charset_name_utf8mb4[]= "utf8mb4";
 | |
| 
 | |
/*
  Avoid using my_snprintf

  We cannot use my_snprintf() here, because ctype.o is used to build
  conf_to_src, which must require minimum dependency.

  Any earlier macro definition of my_snprintf is dropped first, then
  the name is redefined as a string literal, so that an accidental
  call my_snprintf(...) in this file does not compile.
*/

#undef my_snprintf
#define my_snprintf "We cannot use my_snprintf in this file"
 | |
| 
 | |
| 
 | |
/*
  Optional callback taking an int and returning an int; unset (NULL)
  by default.
  NOTE(review): presumably installed by the server so that string
  routines can check the remaining stack - confirm against callers.
*/
int (*my_string_stack_guard)(int)= NULL;
 | |
| 
 | |
/*
  Copy the first min(l1, l2) bytes of src into str and NUL-terminate
  the result.

  @param str  Destination; must have room for min(l1, l2) + 1 bytes.
  @param src  Source bytes (need not be NUL-terminated).
  @param l1   Number of bytes available in src.
  @param l2   Maximum number of bytes to copy.
  @return     str
*/
static char *mstr(char *str,const char *src,size_t l1,size_t l2)
{
  size_t ncopy= l2;

  if (l1 < l2)
    ncopy= l1;
  memcpy(str, src, ncopy);
  str[ncopy]= '\0';
  return str;
}
 | |
| 
 | |
/*
  One row of the tag lookup table: an XML element path and the
  parser state code it maps to.
*/
struct my_cs_file_section_st
{
  int        state;  /* One of the _CS_* codes            */
  const char *str;   /* Element path, e.g. "charsets/..." */
};
 | |
| 
 | |
/*
  Parser state codes.  Every known XML element path is mapped to one
  of these values; the enter/value/leave handlers switch on them.

  The DIFF1..IDENTICAL groups (plain, expansion and abbreviated) must
  stay consecutive: the value handler uses (state - first code of the
  group) as an index into the format tables.
*/

/* Character set and collation description */
#define _CS_MISC                        1
#define _CS_ID                          2
#define _CS_CSNAME                      3
#define _CS_FAMILY                      4
#define _CS_ORDER                       5
#define _CS_COLNAME                     6
#define _CS_FLAG                        7
#define _CS_CHARSET                     8
#define _CS_COLLATION                   9
#define _CS_UPPERMAP                    10
#define _CS_LOWERMAP                    11
#define _CS_UNIMAP                      12
#define _CS_COLLMAP                     13
#define _CS_CTYPEMAP                    14
#define _CS_PRIMARY_ID                  15
#define _CS_BINARY_ID                   16
#define _CS_CSDESCRIPT                  17


/* Special purpose commands */
#define _CS_UCA_VERSION                 100
#define _CS_CL_SUPPRESS_CONTRACTIONS    101
#define _CS_CL_OPTIMIZE                 102
#define _CS_CL_SHIFT_AFTER_METHOD       103
#define _CS_CL_RULES_IMPORT             104
#define _CS_CL_RULES_IMPORT_SOURCE      105


/* Collation Settings */
#define _CS_ST_SETTINGS                 200
#define _CS_ST_STRENGTH                 201
#define _CS_ST_ALTERNATE                202
#define _CS_ST_BACKWARDS                203
#define _CS_ST_NORMALIZATION            204
#define _CS_ST_CASE_LEVEL               205
#define _CS_ST_CASE_FIRST               206
#define _CS_ST_HIRAGANA_QUATERNARY      207
#define _CS_ST_NUMERIC                  208
#define _CS_ST_VARIABLE_TOP             209
#define _CS_ST_MATCH_BOUNDARIES         210
#define _CS_ST_MATCH_STYLE              211


/* Rules */
#define _CS_RULES                       300
#define _CS_RESET                       301
#define _CS_DIFF1                       302
#define _CS_DIFF2                       303
#define _CS_DIFF3                       304
#define _CS_DIFF4                       305
#define _CS_IDENTICAL                   306

/* Rules: Expansions */
#define _CS_EXP_X                       320
#define _CS_EXP_EXTEND                  321
#define _CS_EXP_DIFF1                   322
#define _CS_EXP_DIFF2                   323
#define _CS_EXP_DIFF3                   324
#define _CS_EXP_DIFF4                   325
#define _CS_EXP_IDENTICAL               326

/* Rules: Abbreviating Ordering Specifications */
#define _CS_A_DIFF1                     351
#define _CS_A_DIFF2                     352
#define _CS_A_DIFF3                     353
#define _CS_A_DIFF4                     354
#define _CS_A_IDENTICAL                 355

/* Rules: Previous context */
#define _CS_CONTEXT                     370

/* Rules: Placing Characters Before Others */
#define _CS_RESET_BEFORE                380

/* Rules: Logical Reset Positions */
#define _CS_RESET_FIRST_PRIMARY_IGNORABLE     401
#define _CS_RESET_LAST_PRIMARY_IGNORABLE      402
#define _CS_RESET_FIRST_SECONDARY_IGNORABLE   403
#define _CS_RESET_LAST_SECONDARY_IGNORABLE    404
#define _CS_RESET_FIRST_TERTIARY_IGNORABLE    405
#define _CS_RESET_LAST_TERTIARY_IGNORABLE     406
#define _CS_RESET_FIRST_TRAILING              407
#define _CS_RESET_LAST_TRAILING               408
#define _CS_RESET_FIRST_VARIABLE              409
#define _CS_RESET_LAST_VARIABLE               410
#define _CS_RESET_FIRST_NON_IGNORABLE         411
#define _CS_RESET_LAST_NON_IGNORABLE          412
 | |
| 
 | |
| 
 | |
| 
 | |
| static const struct my_cs_file_section_st sec[] =
 | |
| {
 | |
|   {_CS_MISC,		"xml"},
 | |
|   {_CS_MISC,		"xml/version"},
 | |
|   {_CS_MISC,		"xml/encoding"},
 | |
|   {_CS_MISC,		"charsets"},
 | |
|   {_CS_MISC,		"charsets/max-id"},
 | |
|   {_CS_MISC,		"charsets/copyright"},
 | |
|   {_CS_MISC,		"charsets/description"},
 | |
|   {_CS_CHARSET,		"charsets/charset"},
 | |
|   {_CS_PRIMARY_ID,	"charsets/charset/primary-id"},
 | |
|   {_CS_BINARY_ID,	"charsets/charset/binary-id"},
 | |
|   {_CS_CSNAME,		"charsets/charset/name"},
 | |
|   {_CS_FAMILY,		"charsets/charset/family"},
 | |
|   {_CS_CSDESCRIPT,	"charsets/charset/description"},
 | |
|   {_CS_MISC,		"charsets/charset/alias"},
 | |
|   {_CS_MISC,		"charsets/charset/ctype"},
 | |
|   {_CS_CTYPEMAP,	"charsets/charset/ctype/map"},
 | |
|   {_CS_MISC,		"charsets/charset/upper"},
 | |
|   {_CS_UPPERMAP,	"charsets/charset/upper/map"},
 | |
|   {_CS_MISC,		"charsets/charset/lower"},
 | |
|   {_CS_LOWERMAP,	"charsets/charset/lower/map"},
 | |
|   {_CS_MISC,		"charsets/charset/unicode"},
 | |
|   {_CS_UNIMAP,		"charsets/charset/unicode/map"},
 | |
|   {_CS_COLLATION,	"charsets/charset/collation"},
 | |
|   {_CS_COLNAME,		"charsets/charset/collation/name"},
 | |
|   {_CS_ID,		"charsets/charset/collation/id"},
 | |
|   {_CS_ORDER,		"charsets/charset/collation/order"},
 | |
|   {_CS_FLAG,		"charsets/charset/collation/flag"},
 | |
|   {_CS_COLLMAP,		"charsets/charset/collation/map"},
 | |
| 
 | |
|   /* Special purpose commands */
 | |
|   {_CS_UCA_VERSION,              "charsets/charset/collation/version"},
 | |
|   {_CS_CL_SUPPRESS_CONTRACTIONS, "charsets/charset/collation/suppress_contractions"},
 | |
|   {_CS_CL_OPTIMIZE,              "charsets/charset/collation/optimize"},
 | |
|   {_CS_CL_SHIFT_AFTER_METHOD,    "charsets/charset/collation/shift-after-method"},
 | |
|   {_CS_CL_RULES_IMPORT,          "charsets/charset/collation/rules/import"},
 | |
|   {_CS_CL_RULES_IMPORT_SOURCE,   "charsets/charset/collation/rules/import/source"},
 | |
| 
 | |
|   /* Collation Settings */
 | |
|   {_CS_ST_SETTINGS,              "charsets/charset/collation/settings"},
 | |
|   {_CS_ST_STRENGTH,              "charsets/charset/collation/settings/strength"},
 | |
|   {_CS_ST_ALTERNATE,             "charsets/charset/collation/settings/alternate"},
 | |
|   {_CS_ST_BACKWARDS,             "charsets/charset/collation/settings/backwards"},
 | |
|   {_CS_ST_NORMALIZATION,         "charsets/charset/collation/settings/normalization"},
 | |
|   {_CS_ST_CASE_LEVEL,            "charsets/charset/collation/settings/caseLevel"},
 | |
|   {_CS_ST_CASE_FIRST,            "charsets/charset/collation/settings/caseFirst"},
 | |
|   {_CS_ST_HIRAGANA_QUATERNARY,   "charsets/charset/collation/settings/hiraganaQuaternary"},
 | |
|   {_CS_ST_NUMERIC,               "charsets/charset/collation/settings/numeric"},
 | |
|   {_CS_ST_VARIABLE_TOP,          "charsets/charset/collation/settings/variableTop"},
 | |
|   {_CS_ST_MATCH_BOUNDARIES,      "charsets/charset/collation/settings/match-boundaries"},
 | |
|   {_CS_ST_MATCH_STYLE,           "charsets/charset/collation/settings/match-style"},
 | |
| 
 | |
|   /* Rules */
 | |
|   {_CS_RULES,           "charsets/charset/collation/rules"},
 | |
|   {_CS_RESET,           "charsets/charset/collation/rules/reset"},
 | |
|   {_CS_DIFF1,           "charsets/charset/collation/rules/p"},
 | |
|   {_CS_DIFF2,           "charsets/charset/collation/rules/s"},
 | |
|   {_CS_DIFF3,           "charsets/charset/collation/rules/t"},
 | |
|   {_CS_DIFF4,           "charsets/charset/collation/rules/q"},
 | |
|   {_CS_IDENTICAL,       "charsets/charset/collation/rules/i"},
 | |
| 
 | |
|   /* Rules: expansions */
 | |
|   {_CS_EXP_X,           "charsets/charset/collation/rules/x"},
 | |
|   {_CS_EXP_EXTEND,      "charsets/charset/collation/rules/x/extend"},
 | |
|   {_CS_EXP_DIFF1,       "charsets/charset/collation/rules/x/p"},
 | |
|   {_CS_EXP_DIFF2,       "charsets/charset/collation/rules/x/s"},
 | |
|   {_CS_EXP_DIFF3,       "charsets/charset/collation/rules/x/t"},
 | |
|   {_CS_EXP_DIFF4,       "charsets/charset/collation/rules/x/q"},
 | |
|   {_CS_EXP_IDENTICAL,   "charsets/charset/collation/rules/x/i"},
 | |
|   
 | |
|   /* Rules: previous context */
 | |
|   {_CS_CONTEXT,         "charsets/charset/collation/rules/x/context"},
 | |
| 
 | |
|   /* Rules: Abbreviating Ordering Specifications */
 | |
|   {_CS_A_DIFF1,         "charsets/charset/collation/rules/pc"},
 | |
|   {_CS_A_DIFF2,         "charsets/charset/collation/rules/sc"},
 | |
|   {_CS_A_DIFF3,         "charsets/charset/collation/rules/tc"},
 | |
|   {_CS_A_DIFF4,         "charsets/charset/collation/rules/qc"},
 | |
|   {_CS_A_IDENTICAL,     "charsets/charset/collation/rules/ic"},
 | |
| 
 | |
|   /* Rules: Placing Characters Before Others*/
 | |
|   {_CS_RESET_BEFORE,    "charsets/charset/collation/rules/reset/before"},
 | |
| 
 | |
|   /* Rules: Logical Reset Positions */
 | |
|   {_CS_RESET_FIRST_NON_IGNORABLE,       "charsets/charset/collation/rules/reset/first_non_ignorable"},
 | |
|   {_CS_RESET_LAST_NON_IGNORABLE,        "charsets/charset/collation/rules/reset/last_non_ignorable"},
 | |
|   {_CS_RESET_FIRST_PRIMARY_IGNORABLE,   "charsets/charset/collation/rules/reset/first_primary_ignorable"},
 | |
|   {_CS_RESET_LAST_PRIMARY_IGNORABLE,    "charsets/charset/collation/rules/reset/last_primary_ignorable"},
 | |
|   {_CS_RESET_FIRST_SECONDARY_IGNORABLE, "charsets/charset/collation/rules/reset/first_secondary_ignorable"},
 | |
|   {_CS_RESET_LAST_SECONDARY_IGNORABLE,  "charsets/charset/collation/rules/reset/last_secondary_ignorable"},
 | |
|   {_CS_RESET_FIRST_TERTIARY_IGNORABLE,  "charsets/charset/collation/rules/reset/first_tertiary_ignorable"},
 | |
|   {_CS_RESET_LAST_TERTIARY_IGNORABLE,   "charsets/charset/collation/rules/reset/last_tertiary_ignorable"},
 | |
|   {_CS_RESET_FIRST_TRAILING,            "charsets/charset/collation/rules/reset/first_trailing"},
 | |
|   {_CS_RESET_LAST_TRAILING,             "charsets/charset/collation/rules/reset/last_trailing"},
 | |
|   {_CS_RESET_FIRST_VARIABLE,            "charsets/charset/collation/rules/reset/first_variable"},
 | |
|   {_CS_RESET_LAST_VARIABLE,             "charsets/charset/collation/rules/reset/last_variable"},
 | |
| 
 | |
|   {0,	NULL}
 | |
| };
 | |
| 
 | |
| static const struct my_cs_file_section_st
 | |
| *cs_file_sec(const char *attr, size_t len)
 | |
| {
 | |
|   const struct my_cs_file_section_st *s;
 | |
|   for (s=sec; s->str; s++)
 | |
|   {
 | |
|     if (!strncmp(attr, s->str, len) && s->str[len] == 0)
 | |
|       return s;
 | |
|   }
 | |
|   return NULL;
 | |
| }
 | |
| 
 | |
| #define MY_CS_CSDESCR_SIZE	64
 | |
| #define MY_CS_TAILORING_SIZE	(32*1024)
 | |
| #define MY_CS_UCA_VERSION_SIZE  64
 | |
| #define MY_CS_CONTEXT_SIZE      64
 | |
| 
 | |
| typedef struct my_cs_file_info
 | |
| {
 | |
|   char   csname[MY_CS_CHARACTER_SET_NAME_SIZE];
 | |
|   char   name[MY_CS_COLLATION_NAME_SIZE];
 | |
|   uchar  ctype[MY_CS_CTYPE_TABLE_SIZE];
 | |
|   uchar  to_lower[MY_CS_TO_LOWER_TABLE_SIZE];
 | |
|   uchar  to_upper[MY_CS_TO_UPPER_TABLE_SIZE];
 | |
|   uchar  sort_order[MY_CS_SORT_ORDER_TABLE_SIZE];
 | |
|   uint16 tab_to_uni[MY_CS_TO_UNI_TABLE_SIZE];
 | |
|   char   comment[MY_CS_CSDESCR_SIZE];
 | |
|   char  *tailoring;
 | |
|   size_t tailoring_length;
 | |
|   size_t tailoring_alloced_length;
 | |
|   char   context[MY_CS_CONTEXT_SIZE];
 | |
|   struct charset_info_st cs;
 | |
|   MY_CHARSET_LOADER *loader;
 | |
| } MY_CHARSET_FILE;
 | |
| 
 | |
| 
 | |
| static void
 | |
| my_charset_file_reset_charset(MY_CHARSET_FILE *i)
 | |
| {
 | |
|   memset(&i->cs, 0, sizeof(i->cs));
 | |
| }
 | |
| 
 | |
| 
 | |
| static void
 | |
| my_charset_file_reset_collation(MY_CHARSET_FILE *i)
 | |
| {
 | |
|   i->tailoring_length= 0;
 | |
|   i->context[0]= '\0';
 | |
| }
 | |
| 
 | |
| 
 | |
| static void
 | |
| my_charset_file_init(MY_CHARSET_FILE *i)
 | |
| {
 | |
|   my_charset_file_reset_charset(i);
 | |
|   my_charset_file_reset_collation(i);
 | |
|   i->tailoring= NULL;
 | |
|   i->tailoring_alloced_length= 0;
 | |
| }
 | |
| 
 | |
| 
 | |
| static void
 | |
| my_charset_file_free(MY_CHARSET_FILE *i)
 | |
| {
 | |
|   i->loader->free(i->tailoring);
 | |
| }
 | |
| 
 | |
| 
 | |
| static int
 | |
| my_charset_file_tailoring_realloc(MY_CHARSET_FILE *i, size_t newlen)
 | |
| {
 | |
|   if (i->tailoring_alloced_length > newlen ||
 | |
|      (i->tailoring= i->loader->realloc(i->tailoring,
 | |
|                                        (i->tailoring_alloced_length=
 | |
|                                         (newlen + 32*1024)))))
 | |
|   {
 | |
|     return MY_XML_OK;
 | |
|   }
 | |
|   return MY_XML_ERROR;
 | |
| }
 | |
| 
 | |
| 
 | |
/*
  Parse whitespace-separated hexadecimal numbers into a byte table.

  Tokens are separated by space, tab, CR or LF.  Parsing stops at the
  end of the input or as soon as "size" values have been stored, so
  a[size] is never written even if the text holds more tokens.
  Elements for which the text has no token are left untouched.

  @param a     Destination table with room for "size" elements.
  @param size  Number of elements in a.
  @param str   Text to parse; need not be NUL-terminated at str+len,
               but a token must be followed by a byte that is not a
               hexadecimal digit, because strtoul() is not
               length-bounded.
  @param len   Length of the text in bytes.
  @return      Always 0.
*/
static int fill_uchar(uchar *a,uint size,const char *str, size_t len)
{
  uint i= 0;
  const char *s= str, *e= str + len;

  while (s < e && i < size)
  {
    const char *b;
    /* Skip separators, then find the end of the token */
    for ( ; (s < e) && strchr(" \t\r\n", s[0]); s++) ;
    b= s;
    for ( ; (s < e) && !strchr(" \t\r\n", s[0]); s++) ;
    if (s == b)
      break;                                    /* No more tokens */
    a[i++]= (uchar) strtoul(b, NULL, 16);
  }
  return 0;
}
 | |
| 
 | |
/*
  Parse whitespace-separated hexadecimal numbers into a uint16 table.

  Tokens are separated by space, tab, CR or LF.  Parsing stops at the
  end of the input or as soon as "size" values have been stored, so
  a[size] is never written even if the text holds more tokens.
  Elements for which the text has no token are left untouched.

  @param a     Destination table with room for "size" elements.
  @param size  Number of elements in a.
  @param str   Text to parse; need not be NUL-terminated at str+len,
               but a token must be followed by a byte that is not a
               hexadecimal digit, because strtol() is not
               length-bounded.
  @param len   Length of the text in bytes.
  @return      Always 0.
*/
static int fill_uint16(uint16 *a,uint size,const char *str, size_t len)
{
  uint i= 0;
  const char *s= str, *e= str + len;

  while (s < e && i < size)
  {
    const char *b;
    /* Skip separators, then find the end of the token */
    for ( ; (s < e) && strchr(" \t\r\n", s[0]); s++) ;
    b= s;
    for ( ; (s < e) && !strchr(" \t\r\n", s[0]); s++) ;
    if (s == b)
      break;                                    /* No more tokens */
    a[i++]= (uint16) strtol(b, NULL, 16);
  }
  return 0;
}
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| static int
 | |
| tailoring_append(MY_XML_PARSER *st,
 | |
|                  const char *fmt, size_t len, const char *attr)
 | |
| {
 | |
|   struct my_cs_file_info *i= (struct my_cs_file_info *) st->user_data;
 | |
|   size_t newlen= i->tailoring_length + len + 64; /* 64 for format */ 
 | |
|   if (MY_XML_OK == my_charset_file_tailoring_realloc(i, newlen))
 | |
|   {
 | |
|     char *dst= i->tailoring + i->tailoring_length;
 | |
|     sprintf(dst, fmt, (int) len, attr);
 | |
|     i->tailoring_length+= strlen(dst);
 | |
|     return MY_XML_OK;
 | |
|   }
 | |
|   return MY_XML_ERROR;
 | |
| }
 | |
| 
 | |
| 
 | |
| static int
 | |
| tailoring_append2(MY_XML_PARSER *st,
 | |
|                   const char *fmt,
 | |
|                   size_t len1, const char *attr1,
 | |
|                   size_t len2, const char *attr2)
 | |
| {
 | |
|   struct my_cs_file_info *i= (struct my_cs_file_info *) st->user_data;
 | |
|   size_t newlen= i->tailoring_length + len1 + len2 + 64; /* 64 for format */
 | |
|   if (MY_XML_OK == my_charset_file_tailoring_realloc(i, newlen))
 | |
|   {
 | |
|     char *dst= i->tailoring + i->tailoring_length;
 | |
|     sprintf(dst, fmt, (int) len1, attr1, (int) len2, attr2);
 | |
|     i->tailoring_length+= strlen(dst);
 | |
|     return MY_XML_OK;
 | |
|   }
 | |
|   return MY_XML_ERROR;
 | |
| }
 | |
| 
 | |
| 
 | |
| static size_t
 | |
| scan_one_character(const char *s, const char *e, my_wc_t *wc)
 | |
| {
 | |
|   CHARSET_INFO *cs= &my_charset_utf8mb3_general_ci;
 | |
|   if (s >= e)
 | |
|     return 0;
 | |
| 
 | |
|   /* Escape sequence: \uXXXX */
 | |
|   if (s[0] == '\\' && s + 2 < e && s[1] == 'u' && my_isxdigit(cs, s[2]))
 | |
|   {
 | |
|     size_t len= 3; /* We have at least one digit */
 | |
|     for (s+= 3; s < e && my_isxdigit(cs, s[0]); s++, len++)
 | |
|     {
 | |
|     }
 | |
|     wc[0]= 0;
 | |
|     return len;
 | |
|   }
 | |
|   else if ((int8) s[0] > 0) /* 7-bit character */
 | |
|   {
 | |
|     wc[0]= 0;
 | |
|     return 1;
 | |
|   }
 | |
|   else /* Non-escaped character */
 | |
|   {
 | |
|     int rc= my_ci_mb_wc(cs, wc, (uchar *) s, (uchar *) e);
 | |
|     if (rc > 0)
 | |
|       return (size_t) rc;
 | |
|   }
 | |
|   return 0;
 | |
| }
 | |
| 
 | |
| 
 | |
| static int
 | |
| tailoring_append_abbreviation(MY_XML_PARSER *st,
 | |
|                               const char *fmt, size_t len, const char *attr)
 | |
| {
 | |
|   size_t clen;
 | |
|   const char *attrend= attr + len;
 | |
|   my_wc_t wc;
 | |
| 
 | |
|   for ( ; (clen= scan_one_character(attr, attrend, &wc)) > 0; attr+= clen)
 | |
|   {
 | |
|     DBUG_ASSERT(attr < attrend);
 | |
|     if (tailoring_append(st, fmt, clen, attr) != MY_XML_OK)
 | |
|       return MY_XML_ERROR;
 | |
|   }
 | |
|   return MY_XML_OK;
 | |
| }
 | |
| 
 | |
| 
 | |
| static int cs_enter(MY_XML_PARSER *st,const char *attr, size_t len)
 | |
| {
 | |
|   struct my_cs_file_info *i= (struct my_cs_file_info *)st->user_data;
 | |
|   const struct my_cs_file_section_st *s= cs_file_sec(attr,len);
 | |
|   int state= s ? s->state : 0;
 | |
|   
 | |
|   switch (state) {
 | |
|   case 0:
 | |
|     i->loader->reporter(WARNING_LEVEL, "Unknown LDML tag: '%.*s'", len, attr);
 | |
|     break;
 | |
| 
 | |
|   case _CS_CHARSET:
 | |
|     my_charset_file_reset_charset(i);
 | |
|     break;
 | |
| 
 | |
|   case _CS_COLLATION:
 | |
|     my_charset_file_reset_collation(i);
 | |
|     break;
 | |
| 
 | |
|   case _CS_RESET:
 | |
|     return tailoring_append(st, " &", 0, NULL);
 | |
| 
 | |
|   default:
 | |
|     break;
 | |
|   }
 | |
|   return MY_XML_OK;
 | |
| }
 | |
| 
 | |
| 
 | |
| static int cs_leave(MY_XML_PARSER *st,const char *attr, size_t len)
 | |
| {
 | |
|   struct my_cs_file_info *i= (struct my_cs_file_info *)st->user_data;
 | |
|   const struct my_cs_file_section_st *s= cs_file_sec(attr,len);
 | |
|   int    state= s ? s->state : 0;
 | |
|   int    rc;
 | |
|   
 | |
|   switch(state){
 | |
|   case _CS_COLLATION:
 | |
|     if (i->tailoring_length)
 | |
|       i->cs.tailoring= i->tailoring;
 | |
|     rc= i->loader->add_collation ? i->loader->add_collation(&i->cs) : MY_XML_OK;
 | |
|     break;
 | |
| 
 | |
|   /* Rules: Logical Reset Positions */
 | |
|   case _CS_RESET_FIRST_NON_IGNORABLE:
 | |
|     rc= tailoring_append(st, "[first non-ignorable]", 0, NULL);
 | |
|     break;
 | |
| 
 | |
|   case _CS_RESET_LAST_NON_IGNORABLE:
 | |
|     rc= tailoring_append(st, "[last non-ignorable]", 0, NULL);
 | |
|     break;
 | |
| 
 | |
|   case _CS_RESET_FIRST_PRIMARY_IGNORABLE:
 | |
|     rc= tailoring_append(st, "[first primary ignorable]", 0, NULL);
 | |
|     break;
 | |
| 
 | |
|   case _CS_RESET_LAST_PRIMARY_IGNORABLE:
 | |
|     rc= tailoring_append(st, "[last primary ignorable]", 0, NULL);
 | |
|     break;
 | |
| 
 | |
|   case _CS_RESET_FIRST_SECONDARY_IGNORABLE:
 | |
|     rc= tailoring_append(st, "[first secondary ignorable]", 0, NULL);
 | |
|     break;
 | |
| 
 | |
|   case _CS_RESET_LAST_SECONDARY_IGNORABLE:
 | |
|     rc= tailoring_append(st, "[last secondary ignorable]", 0, NULL);
 | |
|     break;
 | |
| 
 | |
|   case _CS_RESET_FIRST_TERTIARY_IGNORABLE:
 | |
|     rc= tailoring_append(st, "[first tertiary ignorable]", 0, NULL);
 | |
|     break;
 | |
| 
 | |
|   case _CS_RESET_LAST_TERTIARY_IGNORABLE:
 | |
|     rc= tailoring_append(st, "[last tertiary ignorable]", 0, NULL);
 | |
|     break;
 | |
| 
 | |
|   case _CS_RESET_FIRST_TRAILING:
 | |
|     rc= tailoring_append(st, "[first trailing]", 0, NULL);
 | |
|     break;
 | |
| 
 | |
|   case _CS_RESET_LAST_TRAILING:
 | |
|     rc= tailoring_append(st, "[last trailing]", 0, NULL);
 | |
|     break;
 | |
| 
 | |
|   case _CS_RESET_FIRST_VARIABLE:
 | |
|     rc= tailoring_append(st, "[first variable]", 0, NULL);
 | |
|     break;
 | |
| 
 | |
|   case _CS_RESET_LAST_VARIABLE:
 | |
|     rc= tailoring_append(st, "[last variable]", 0, NULL);
 | |
|     break;
 | |
| 
 | |
|   default:
 | |
|     rc=MY_XML_OK;
 | |
|   }
 | |
|   return rc;
 | |
| }
 | |
| 
 | |
| 
 | |
/*
  Formats for one ordering rule, indexed by difference level:
  primary, secondary, tertiary, quaternary, identical.
  Each takes one (int length, const char *text) pair.
*/
static const char *diff_fmt[5]=
{
  "<%.*s",      /* primary    */
  "<<%.*s",     /* secondary  */
  "<<<%.*s",    /* tertiary   */
  "<<<<%.*s",   /* quaternary */
  "=%.*s"       /* identical  */
};
 | |
| 
 | |
| 
 | |
/*
  Formats for one ordering rule with a preceding context, indexed by
  difference level like diff_fmt.  Each takes two (int length,
  const char *text) pairs: the context first, then the rule text.
*/
static const char *context_diff_fmt[5]=
{
  "<%.*s|%.*s",      /* primary    */
  "<<%.*s|%.*s",     /* secondary  */
  "<<<%.*s|%.*s",    /* tertiary   */
  "<<<<%.*s|%.*s",   /* quaternary */
  "=%.*s|%.*s"       /* identical  */
};
 | |
| 
 | |
| 
 | |
| static int cs_value(MY_XML_PARSER *st,const char *attr, size_t len)
 | |
| {
 | |
|   struct my_cs_file_info *i= (struct my_cs_file_info *)st->user_data;
 | |
|   const struct my_cs_file_section_st *s;
 | |
|   int    state= (int)((s= cs_file_sec(st->attr.start,
 | |
|                                       st->attr.end - st->attr.start)) ?
 | |
|                       s->state : 0);
 | |
|   int rc= MY_XML_OK;
 | |
| 
 | |
|   switch (state) {
 | |
|   case _CS_MISC:
 | |
|   case _CS_FAMILY:
 | |
|   case _CS_ORDER:
 | |
|     break;
 | |
|   case _CS_ID:
 | |
|     i->cs.number= strtol(attr,(char**)NULL,10);
 | |
|     break;
 | |
|   case _CS_BINARY_ID:
 | |
|     i->cs.binary_number= strtol(attr,(char**)NULL,10);
 | |
|     break;
 | |
|   case _CS_PRIMARY_ID:
 | |
|     i->cs.primary_number= strtol(attr,(char**)NULL,10);
 | |
|     break;
 | |
|   case _CS_COLNAME:
 | |
|     i->cs.coll_name.str= mstr(i->name,attr,len,MY_CS_COLLATION_NAME_SIZE-1);
 | |
|     i->cs.coll_name.length= strlen(i->cs.coll_name.str);
 | |
|     break;
 | |
|   case _CS_CSNAME:
 | |
|     i->cs.cs_name.str= mstr(i->csname,attr,len,MY_CS_CHARACTER_SET_NAME_SIZE-1);
 | |
|     i->cs.cs_name.length= strlen(i->cs.cs_name.str);
 | |
|     break;
 | |
|   case _CS_CSDESCRIPT:
 | |
|     i->cs.comment=mstr(i->comment,attr,len,MY_CS_CSDESCR_SIZE-1);
 | |
|     break;
 | |
|   case _CS_FLAG:
 | |
|     if (!strncmp("primary",attr,len))
 | |
|       i->cs.state|= MY_CS_PRIMARY;
 | |
|     else if (!strncmp("binary",attr,len))
 | |
|       i->cs.state|= MY_CS_BINSORT;
 | |
|     else if (!strncmp("compiled",attr,len))
 | |
|       i->cs.state|= MY_CS_COMPILED;
 | |
|     else if (!strncmp("nopad",attr,len))
 | |
|       i->cs.state|= MY_CS_NOPAD;
 | |
|     break;
 | |
|   case _CS_UPPERMAP:
 | |
|     fill_uchar(i->to_upper,MY_CS_TO_UPPER_TABLE_SIZE,attr,len);
 | |
|     i->cs.to_upper=i->to_upper;
 | |
|     break;
 | |
|   case _CS_LOWERMAP:
 | |
|     fill_uchar(i->to_lower,MY_CS_TO_LOWER_TABLE_SIZE,attr,len);
 | |
|     i->cs.to_lower=i->to_lower;
 | |
|     break;
 | |
|   case _CS_UNIMAP:
 | |
|     fill_uint16(i->tab_to_uni,MY_CS_TO_UNI_TABLE_SIZE,attr,len);
 | |
|     i->cs.tab_to_uni=i->tab_to_uni;
 | |
|     break;
 | |
|   case _CS_COLLMAP:
 | |
|     fill_uchar(i->sort_order,MY_CS_SORT_ORDER_TABLE_SIZE,attr,len);
 | |
|     i->cs.sort_order=i->sort_order;
 | |
|     break;
 | |
|   case _CS_CTYPEMAP:
 | |
|     fill_uchar(i->ctype,MY_CS_CTYPE_TABLE_SIZE,attr,len);
 | |
|     i->cs.m_ctype=i->ctype;
 | |
|     break;
 | |
| 
 | |
|   /* Special purpose commands */
 | |
|   case _CS_UCA_VERSION:
 | |
|     rc= tailoring_append(st, "[version %.*s]", len, attr);
 | |
|     break;
 | |
| 
 | |
|   case _CS_CL_RULES_IMPORT_SOURCE:
 | |
|     rc= tailoring_append(st, "[import %.*s]", len, attr);
 | |
|     break;
 | |
| 
 | |
|   case _CS_CL_SUPPRESS_CONTRACTIONS:
 | |
|     rc= tailoring_append(st, "[suppress contractions %.*s]", len, attr);
 | |
|     break;
 | |
| 
 | |
|   case _CS_CL_OPTIMIZE:
 | |
|     rc= tailoring_append(st, "[optimize %.*s]", len, attr);
 | |
|     break;
 | |
| 
 | |
|   case _CS_CL_SHIFT_AFTER_METHOD:
 | |
|     rc= tailoring_append(st, "[shift-after-method %.*s]", len, attr);
 | |
|     break;
 | |
| 
 | |
|   /* Collation Settings */
 | |
|   case _CS_ST_STRENGTH:
 | |
|     /* 1, 2, 3, 4, 5, or primary, secondary, tertiary, quaternary, identical */
 | |
|     rc= tailoring_append(st, "[strength %.*s]", len, attr);
 | |
|     if (len && attr[0] >= '1' && attr[0] <= '9')
 | |
|     {
 | |
|       uint strength= attr[0] - '0';
 | |
|       my_ci_set_strength(&i->cs, MY_MIN(strength, MY_UCA_WEIGHT_LEVELS));
 | |
|     }
 | |
|     break;
 | |
| 
 | |
|   case _CS_ST_ALTERNATE:
 | |
|     /* non-ignorable, shifted */
 | |
|     rc= tailoring_append(st, "[alternate %.*s]", len, attr);
 | |
|     break;
 | |
| 
 | |
|   case _CS_ST_BACKWARDS:
 | |
|     /* on, off, 2 */
 | |
|     rc= tailoring_append(st, "[backwards %.*s]", len, attr);
 | |
|     break;
 | |
| 
 | |
|   case _CS_ST_NORMALIZATION:
 | |
|     /*
 | |
|       TODO for WL#896: check collations for normalization: vi.xml
 | |
|       We want precomposed characters work well at this point.
 | |
|     */
 | |
|     /* on, off */
 | |
|     rc= tailoring_append(st, "[normalization %.*s]", len, attr);
 | |
|     break;
 | |
| 
 | |
|   case _CS_ST_CASE_LEVEL:
 | |
|     /* on, off */
 | |
|     rc= tailoring_append(st, "[caseLevel %.*s]", len, attr);
 | |
|     break;
 | |
| 
 | |
|   case _CS_ST_CASE_FIRST:
 | |
|     /* upper, lower, off */
 | |
|     rc= tailoring_append(st, "[caseFirst %.*s]", len, attr);
 | |
|     break;
 | |
| 
 | |
|   case _CS_ST_HIRAGANA_QUATERNARY:
 | |
|     /* on, off */
 | |
|     rc= tailoring_append(st, "[hiraganaQ %.*s]", len, attr);
 | |
|     break;
 | |
| 
 | |
|   case _CS_ST_NUMERIC:
 | |
|     /* on, off */
 | |
|     rc= tailoring_append(st, "[numeric %.*s]", len, attr);
 | |
|     break;
 | |
| 
 | |
|   case _CS_ST_VARIABLE_TOP:
 | |
|     /* TODO for WL#896: check value format */
 | |
|     rc= tailoring_append(st, "[variableTop %.*s]", len, attr);
 | |
|     break;
 | |
| 
 | |
|   case _CS_ST_MATCH_BOUNDARIES:
 | |
|     /* none, whole-character, whole-word */
 | |
|     rc= tailoring_append(st, "[match-boundaries %.*s]", len, attr);
 | |
|     break;
 | |
| 
 | |
|   case _CS_ST_MATCH_STYLE:
 | |
|     /* minimal, medial, maximal */
 | |
|     rc= tailoring_append(st, "[match-style %.*s]", len, attr);
 | |
|     break;
 | |
| 
 | |
| 
 | |
|   /* Rules */
 | |
|   case _CS_RESET:
 | |
|     rc= tailoring_append(st, "%.*s", len, attr);
 | |
|     break;
 | |
| 
 | |
|   case _CS_DIFF1:
 | |
|   case _CS_DIFF2:
 | |
|   case _CS_DIFF3:
 | |
|   case _CS_DIFF4:
 | |
|   case _CS_IDENTICAL:
 | |
|     rc= tailoring_append(st, diff_fmt[state - _CS_DIFF1], len, attr);
 | |
|     break;
 | |
| 
 | |
| 
 | |
|   /* Rules: Expansion */
 | |
|   case _CS_EXP_EXTEND:
 | |
|     rc= tailoring_append(st, " / %.*s", len, attr);
 | |
|     break;
 | |
| 
 | |
|   case _CS_EXP_DIFF1:
 | |
|   case _CS_EXP_DIFF2:
 | |
|   case _CS_EXP_DIFF3:
 | |
|   case _CS_EXP_DIFF4:
 | |
|   case _CS_EXP_IDENTICAL:
 | |
|     if (i->context[0])
 | |
|     {
 | |
|       rc= tailoring_append2(st, context_diff_fmt[state - _CS_EXP_DIFF1],
 | |
|                             strlen(i->context), i->context, len, attr);
 | |
|       i->context[0]= 0;
 | |
|     }
 | |
|     else
 | |
|       rc= tailoring_append(st, diff_fmt[state  - _CS_EXP_DIFF1], len, attr);
 | |
|     break;
 | |
| 
 | |
|   /* Rules: Context */
 | |
|   case _CS_CONTEXT:
 | |
|     if (len < sizeof(i->context))
 | |
|     {
 | |
|       memcpy(i->context, attr, len);
 | |
|       i->context[len]= '\0';
 | |
|     }
 | |
|     break;
 | |
| 
 | |
|   /* Rules: Abbreviating Ordering Specifications */
 | |
|   case _CS_A_DIFF1:
 | |
|   case _CS_A_DIFF2:
 | |
|   case _CS_A_DIFF3:
 | |
|   case _CS_A_DIFF4:
 | |
|   case _CS_A_IDENTICAL:
 | |
|     rc= tailoring_append_abbreviation(st, diff_fmt[state - _CS_A_DIFF1], len, attr);
 | |
|     break;
 | |
| 
 | |
|   /* Rules: Placing Characters Before Others */
 | |
|   case _CS_RESET_BEFORE:
 | |
|     /*
 | |
|       TODO for WL#896: Add this check into text customization parser:
 | |
|       It is an error if the strength of the before relation is not identical
 | |
|       to the relation after the reset. We'll need this for WL#896.
 | |
|     */
 | |
|     rc= tailoring_append(st, "[before %.*s]", len, attr);
 | |
|     break;
 | |
| 
 | |
| 
 | |
|   default:
 | |
|     break;
 | |
|   }
 | |
| 
 | |
|   return rc;
 | |
| }
 | |
| 
 | |
| 
 | |
| my_bool
 | |
| my_parse_charset_xml(MY_CHARSET_LOADER *loader, const char *buf, size_t len)
 | |
| {
 | |
|   MY_XML_PARSER p;
 | |
|   struct my_cs_file_info info;
 | |
|   my_bool rc;
 | |
|   
 | |
|   my_charset_file_init(&info);
 | |
|   my_xml_parser_create(&p);
 | |
|   my_xml_set_enter_handler(&p,cs_enter);
 | |
|   my_xml_set_value_handler(&p,cs_value);
 | |
|   my_xml_set_leave_handler(&p,cs_leave);
 | |
|   info.loader= loader;
 | |
|   my_xml_set_user_data(&p, (void *) &info);
 | |
|   rc= (my_xml_parse(&p,buf,len) == MY_XML_OK) ? FALSE : TRUE;
 | |
|   my_xml_parser_free(&p);
 | |
|   my_charset_file_free(&info);
 | |
|   if (rc != MY_XML_OK)
 | |
|   {
 | |
|     const char *errstr= my_xml_error_string(&p);
 | |
|     if (sizeof(loader->error) > 32 + strlen(errstr))
 | |
|     {
 | |
|       /* We cannot use my_snprintf() here. See previous comment. */
 | |
|       sprintf(loader->error, "at line %d pos %d: %s",
 | |
|                 my_xml_error_lineno(&p)+1,
 | |
|                 (int) my_xml_error_pos(&p),
 | |
|                 my_xml_error_string(&p));
 | |
|     }
 | |
|   }
 | |
|   return rc;
 | |
| }
 | |
| 
 | |
| 
 | |
| uint
 | |
| my_string_repertoire_8bit(CHARSET_INFO *cs, const char *str, size_t length)
 | |
| {
 | |
|   const char *strend;
 | |
|   if ((cs->state & MY_CS_NONASCII) && length > 0)
 | |
|     return MY_REPERTOIRE_UNICODE30;
 | |
|   if (!str) // Avoid UBSAN nullptr-with-offset
 | |
|     return MY_REPERTOIRE_ASCII;
 | |
|   for (strend= str + length; str < strend; str++)
 | |
|   {
 | |
|     if (((uchar) *str) > 0x7F)
 | |
|       return MY_REPERTOIRE_UNICODE30;
 | |
|   }
 | |
|   return MY_REPERTOIRE_ASCII;
 | |
| }
 | |
| 
 | |
| 
 | |
| static void
 | |
| my_string_metadata_init(MY_STRING_METADATA *metadata)
 | |
| {
 | |
|   metadata->repertoire= MY_REPERTOIRE_ASCII;
 | |
|   metadata->char_length= 0;
 | |
| }
 | |
| 
 | |
| 
 | |
| /**
 | |
|   This should probably eventually go as a virtual function into
 | |
|   MY_CHARSET_HANDLER or MY_COLLATION_HANDLER.
 | |
| */
 | |
| static void
 | |
| my_string_metadata_get_mb(MY_STRING_METADATA *metadata,
 | |
|                           CHARSET_INFO *cs, const char *str, ulong length)
 | |
| {
 | |
|   const char *strend= str ? str + length : NULL; // Avoid UB nullptr+0
 | |
|   for (my_string_metadata_init(metadata) ;
 | |
|        str < strend;
 | |
|        metadata->char_length++)
 | |
|   {
 | |
|     my_wc_t wc;
 | |
|     int mblen= my_ci_mb_wc(cs, &wc, (const uchar *) str, (const uchar *) strend);
 | |
|     if (mblen > 0) /* Assigned character */
 | |
|     {
 | |
|       if (wc > 0x7F)
 | |
|         metadata->repertoire|= MY_REPERTOIRE_EXTENDED;
 | |
|       str+= mblen;
 | |
|     }
 | |
|     else if (mblen == MY_CS_ILSEQ) /* Bad byte sequence */
 | |
|     {
 | |
|       metadata->repertoire|= MY_REPERTOIRE_EXTENDED;
 | |
|       str++;
 | |
|     }
 | |
|     else if (mblen > MY_CS_TOOSMALL) /* Unassigned character */ 
 | |
|     {
 | |
|       metadata->repertoire|= MY_REPERTOIRE_EXTENDED;
 | |
|       str+= (-mblen);
 | |
|     }
 | |
|     else /* Incomplete character, premature end-of-line */
 | |
|     {
 | |
|       metadata->repertoire|= MY_REPERTOIRE_EXTENDED; /* Just in case */
 | |
|       break;
 | |
|     }
 | |
|   }
 | |
| }
 | |
| 
 | |
| 
 | |
| /**
 | |
|   Collect string metadata: length in characters and repertoire.
 | |
| */
 | |
| void
 | |
| my_string_metadata_get(MY_STRING_METADATA *metadata,
 | |
|                        CHARSET_INFO *cs, const char *str, size_t length)
 | |
| {
 | |
|   if (cs->mbmaxlen == 1 && !(cs->state & MY_CS_NONASCII))
 | |
|   {
 | |
|     metadata->char_length= length;
 | |
|     metadata->repertoire= my_string_repertoire_8bit(cs, str, (ulong)length);
 | |
|   }
 | |
|   else
 | |
|   {
 | |
|     my_string_metadata_get_mb(metadata, cs, str, (ulong)length);
 | |
|   }
 | |
| }
 | |
| 
 | |
| 
 | |
| /*
 | |
|   Check repertoire: detect pure ascii strings
 | |
| */
 | |
| my_repertoire_t
 | |
| my_string_repertoire(CHARSET_INFO *cs, const char *str, size_t length)
 | |
| {
 | |
|   if (cs->mbminlen == 1 && !(cs->state & MY_CS_NONASCII))
 | |
|   {
 | |
|     return my_string_repertoire_8bit(cs, str, length);
 | |
|   }
 | |
|   else
 | |
|   {
 | |
|     const char *strend= str + length;
 | |
|     my_wc_t wc;
 | |
|     int chlen;
 | |
|     for (;
 | |
|          (chlen= my_ci_mb_wc(cs, &wc, (uchar*) str, (uchar*) strend)) > 0;
 | |
|          str+= chlen)
 | |
|     {
 | |
|       if (wc > 0x7F)
 | |
|         return MY_REPERTOIRE_UNICODE30;
 | |
|     }
 | |
|   }
 | |
|   return MY_REPERTOIRE_ASCII;
 | |
| }
 | |
| 
 | |
| 
 | |
| /*
 | |
|   Returns repertoire for charset
 | |
| */
 | |
| my_repertoire_t my_charset_repertoire(CHARSET_INFO *cs)
 | |
| {
 | |
|   return cs->state & MY_CS_PUREASCII ?
 | |
|     MY_REPERTOIRE_ASCII : MY_REPERTOIRE_UNICODE30;
 | |
| }
 | |
| 
 | |
| 
 | |
| /*
 | |
|   Detect whether a character set is ASCII compatible.
 | |
| 
 | |
|   Returns TRUE for:
 | |
|   
 | |
|   - all 8bit character sets whose Unicode mapping of 0x7B is '{'
 | |
|     (ignores swe7 which maps 0x7B to "LATIN LETTER A WITH DIAERESIS")
 | |
|   
 | |
|   - all multi-byte character sets having mbminlen == 1
 | |
|     (ignores ucs2 whose mbminlen is 2)
 | |
|   
 | |
|   TODO:
 | |
|   
 | |
|   When merging to 5.2, this function should be changed
 | |
|   to check a new flag MY_CS_NONASCII, 
 | |
|   
 | |
|      return (cs->state & MY_CS_NONASCII) ? 0 : 1;
 | |
|   
 | |
|   This flag was previously added into 5.2 under terms
 | |
|   of WL#3759 "Optimize identifier conversion in client-server protocol"
 | |
|   especially to mark character sets not compatible with ASCII.
 | |
|   
 | |
|   We won't backport this flag to 5.0 or 5.1.
 | |
|   This function is Ok for 5.0 and 5.1, because we're not going
 | |
|   to introduce new tricky character sets between 5.0 and 5.2.
 | |
| */
 | |
| my_bool
 | |
| my_charset_is_ascii_based(CHARSET_INFO *cs)
 | |
| {
 | |
|   return 
 | |
|     (cs->mbmaxlen == 1 && cs->tab_to_uni && cs->tab_to_uni['{'] == '{') ||
 | |
|     (cs->mbminlen == 1 && cs->mbmaxlen > 1);
 | |
| }
 | |
| 
 | |
| 
 | |
| /**
 | |
|   Detect if a Unicode code point is printable.
 | |
| */
 | |
| static inline my_bool
 | |
| my_is_printable(my_wc_t wc)
 | |
| {
 | |
|   /*
 | |
|     Blocks:
 | |
|       U+0000 .. U+001F     control
 | |
|       U+0020 .. U+007E     printable
 | |
|       U+007F .. U+009F     control
 | |
|       U+00A0 .. U+00FF     printable
 | |
|       U+0100 .. U+10FFFF   As of Unicode-6.1.0, this range does not have any
 | |
|                            characters of the "Cc" (Other, control) category.
 | |
|                            Should be mostly safe to print.
 | |
|                            Except for the surrogate halfs,
 | |
|                            which are encoding components, not real characters.
 | |
|   */
 | |
|   if (wc >= 0x20 && wc <= 0x7E) /* Quickly detect ASCII printable */
 | |
|     return TRUE;
 | |
|   if (wc <= 0x9F)    /* The rest of U+0000..U+009F are control characters */
 | |
|   {
 | |
|     /* NL, CR, TAB are Ok */
 | |
|     return (wc == '\r' || wc == '\n' || wc == '\t');
 | |
|   }
 | |
|   /*
 | |
|     Surrogate halfs (when alone) print badly in terminals:
 | |
|       SELECT _ucs2 0xD800;
 | |
|     Let's escape them as well.
 | |
|   */
 | |
|   if (wc >= 0xD800 && wc <= 0xDFFF)
 | |
|     return FALSE;
 | |
|   return TRUE;
 | |
| }
 | |
| 
 | |
| 
 | |
| static uint to_printable_8bit(uchar *dst, my_wc_t wc, uint bs)
 | |
| {
 | |
|   /*
 | |
|     This function is used only in context of error messages for now.
 | |
|     All non-BMP characters are currently replaced to question marks
 | |
|     when a message is put into diagnostics area.
 | |
|   */
 | |
|   DBUG_ASSERT(wc < 0x10000);
 | |
|   *dst++= (char) bs;
 | |
|   *dst++= _dig_vec_upper[(wc >> 12) & 0x0F];
 | |
|   *dst++= _dig_vec_upper[(wc >> 8) & 0x0F];
 | |
|   *dst++= _dig_vec_upper[(wc >> 4) & 0x0F];
 | |
|   *dst++= _dig_vec_upper[wc & 0x0F];
 | |
|   return MY_CS_PRINTABLE_CHAR_LENGTH;
 | |
| }
 | |
| 
 | |
| 
 | |
| static uint my_printable_length(uint bslen, uint diglen)
 | |
| {
 | |
|   return bslen + (MY_CS_PRINTABLE_CHAR_LENGTH - 1) * diglen;
 | |
| }
 | |
| 
 | |
| 
 | |
| /**
 | |
|   Encode an Unicode character "wc" into a printable string.
 | |
|   This function is suitable for any character set, including
 | |
|   ASCII-incompatible multi-byte character sets, e.g. ucs2, utf16, utf32.
 | |
| */
 | |
| int
 | |
| my_wc_to_printable_ex(CHARSET_INFO *cs, my_wc_t wc,
 | |
|                       uchar *str, uchar *end,
 | |
|                       uint bs, uint bslen, uint diglen)
 | |
| {
 | |
|   uchar *str0;
 | |
|   uint i, length;
 | |
|   uchar tmp[MY_CS_PRINTABLE_CHAR_LENGTH * MY_CS_MBMAXLEN];
 | |
| 
 | |
|   if (my_is_printable(wc))
 | |
|   {
 | |
|     int mblen= my_ci_wc_mb(cs, wc, str, end);
 | |
|     if (mblen > 0)
 | |
|       return mblen;
 | |
|   }
 | |
| 
 | |
|   if (str + my_printable_length(bslen, diglen) > end)
 | |
|     return MY_CS_TOOSMALLN(my_printable_length(bslen, diglen));
 | |
| 
 | |
|   if ((cs->state & MY_CS_NONASCII) == 0)
 | |
|     return to_printable_8bit(str, wc, bs);
 | |
| 
 | |
|   length= to_printable_8bit(tmp, wc, bs);
 | |
|   str0= str;
 | |
|   for (i= 0; i < length; i++)
 | |
|   {
 | |
|     uint expected_length= i == 0 ? bslen : diglen;
 | |
|     if (my_ci_wc_mb(cs, tmp[i], str, end) != (int) expected_length)
 | |
|     {
 | |
|       DBUG_ASSERT(0);
 | |
|       return MY_CS_ILSEQ;
 | |
|     }
 | |
|     str+= expected_length;
 | |
|   }
 | |
|   return (int) (str - str0);
 | |
| }
 | |
| 
 | |
| 
 | |
| int
 | |
| my_wc_to_printable_8bit(CHARSET_INFO *cs, my_wc_t wc,
 | |
|                         uchar *str, uchar *end)
 | |
| {
 | |
|   /*
 | |
|     Special case: swe7 does not have the backslash character.
 | |
|     Use dot instead of backslash for escaping.
 | |
|   */
 | |
|   uint bs= cs->tab_to_uni && cs->tab_to_uni['\\'] != '\\' ? '.' : '\\';
 | |
|   DBUG_ASSERT(cs->mbminlen == 1);
 | |
|   /*
 | |
|     Additionally, if the original swe7 string contains backslashes,
 | |
|     replace them to dots, so this error message:
 | |
|       Invalid swe7 character string: '\xEF\xBC\xB4'
 | |
|     is displayed as:
 | |
|       Invalid swe7 character string: '.xEF.xBC.xB4'
 | |
|     which is more readable than what would happen without '\'-to-dot mapping:
 | |
|       Invalid swe7 character string: '.005CxEF.005CxBC.005CxB4'
 | |
|   */
 | |
|   if (bs == '.' && wc == '\\')
 | |
|     wc= '.';
 | |
|   return my_wc_to_printable_ex(cs, wc, str, end, bs, 1, 1);
 | |
| }
 | |
| 
 | |
| 
 | |
| int
 | |
| my_wc_to_printable_generic(CHARSET_INFO *cs, my_wc_t wc,
 | |
|                            uchar *str, uchar *end)
 | |
| {
 | |
|   return my_wc_to_printable_ex(cs, wc, str, end, '\\',
 | |
|                                cs->mbminlen, cs->mbminlen);
 | |
| }
 | |
| 
 | |
| 
 | |
| /*
 | |
|   Convert a string between two character sets.
 | |
|   'to' must be large enough to store (form_length * to_cs->mbmaxlen) bytes.
 | |
| 
 | |
|   @param  to[OUT]       Store result here
 | |
|   @param  to_length     Size of "to" buffer
 | |
|   @param  to_cs         Character set of result string
 | |
|   @param  from          Copy from here
 | |
|   @param  from_length   Length of the "from" string
 | |
|   @param  from_cs       Character set of the "from" string
 | |
|   @param  errors[OUT]   Number of conversion errors
 | |
| 
 | |
|   @return Number of bytes copied to 'to' string
 | |
| */
 | |
| 
 | |
| uint32
 | |
| my_convert_using_func(char *to, size_t to_length,
 | |
|                       CHARSET_INFO *to_cs, my_charset_conv_wc_mb wc_mb,
 | |
|                       const char *from, size_t from_length,
 | |
|                       CHARSET_INFO *from_cs, my_charset_conv_mb_wc mb_wc,
 | |
|                       uint *errors)
 | |
| {
 | |
|   int         cnvres;
 | |
|   my_wc_t     wc;
 | |
|   const uchar *from_end= (const uchar*) from + from_length;
 | |
|   char *to_start= to;
 | |
|   uchar *to_end= (uchar*) to + to_length;
 | |
|   uint error_count= 0;
 | |
| 
 | |
|   while (1)
 | |
|   {
 | |
|     if ((cnvres= (*mb_wc)(from_cs, &wc, (uchar*) from, from_end)) > 0)
 | |
|       from+= cnvres;
 | |
|     else if (cnvres == MY_CS_ILSEQ)
 | |
|     {
 | |
|       error_count++;
 | |
|       from++;
 | |
|       wc= '?';
 | |
|     }
 | |
|     else if (cnvres > MY_CS_TOOSMALL)
 | |
|     {
 | |
|       /*
 | |
|         A correct multibyte sequence detected
 | |
|         But it doesn't have Unicode mapping.
 | |
|       */
 | |
|       error_count++;
 | |
|       from+= (-cnvres);
 | |
|       wc= '?';
 | |
|     }
 | |
|     else
 | |
|     {
 | |
|       if ((uchar *) from >= from_end)
 | |
|         break;  /* End of line */
 | |
|       /* Incomplete byte sequence */
 | |
|       error_count++;
 | |
|       from++;
 | |
|       wc= '?';
 | |
|     }
 | |
| 
 | |
| outp:
 | |
|     if ((cnvres= (*wc_mb)(to_cs, wc, (uchar*) to, to_end)) > 0)
 | |
|       to+= cnvres;
 | |
|     else if (cnvres == MY_CS_ILUNI && wc != '?')
 | |
|     {
 | |
|       error_count++;
 | |
|       wc= '?';
 | |
|       goto outp;
 | |
|     }
 | |
|     else
 | |
|       break;
 | |
|   }
 | |
|   *errors= error_count;
 | |
|   return (uint32) (to - to_start);
 | |
| }
 | |
| 
 | |
| 
 | |
| /*
 | |
|   Convert a string between two character sets.
 | |
|    Optimized for quick copying of ASCII characters in the range 0x00..0x7F.
 | |
|   'to' must be large enough to store (form_length * to_cs->mbmaxlen) bytes.
 | |
| 
 | |
|   @param  to[OUT]       Store result here
 | |
|   @param  to_length     Size of "to" buffer
 | |
|   @param  to_cs         Character set of result string
 | |
|   @param  from          Copy from here
 | |
|   @param  from_length   Length of the "from" string
 | |
|   @param  from_cs       Character set of the "from" string
 | |
|   @param  errors[OUT]   Number of conversion errors
 | |
| 
 | |
|   @return Number of bytes copied to 'to' string
 | |
| */
 | |
| 
 | |
| uint32
 | |
| my_convert(char *to, uint32 to_length, CHARSET_INFO *to_cs,
 | |
|            const char *from, uint32 from_length,
 | |
|            CHARSET_INFO *from_cs, uint *errors)
 | |
| {
 | |
|   uint32 length, length2;
 | |
|   /*
 | |
|     If any of the character sets is not ASCII compatible,
 | |
|     immediately switch to slow mb_wc->wc_mb method.
 | |
|   */
 | |
|   if ((to_cs->state | from_cs->state) & MY_CS_NONASCII)
 | |
|     return my_convert_using_func(to, to_length,
 | |
|                                  to_cs, to_cs->cset->wc_mb,
 | |
|                                  from, from_length,
 | |
|                                  from_cs, from_cs->cset->mb_wc,
 | |
|                                  errors);
 | |
| 
 | |
|   length= length2= MY_MIN(to_length, from_length);
 | |
| 
 | |
| #if defined(__i386__) || defined(__x86_64__)
 | |
|   /*
 | |
|     Special loop for i386, it allows to refer to a
 | |
|     non-aligned memory block as UINT32, which makes
 | |
|     it possible to copy four bytes at once. This
 | |
|     gives about 10% performance improvement comparing
 | |
|     to byte-by-byte loop.
 | |
|   */
 | |
|   for ( ; length >= 4; length-= 4, from+= 4, to+= 4)
 | |
|   {
 | |
|     if ((*(uint32*)from) & 0x80808080)
 | |
|       break;
 | |
|     *((uint32*) to)= *((const uint32*) from);
 | |
|   }
 | |
| #endif /* __i386__ */
 | |
| 
 | |
|   for (; ; *to++= *from++, length--)
 | |
|   {
 | |
|     if (!length)
 | |
|     {
 | |
|       *errors= 0;
 | |
|       return length2;
 | |
|     }
 | |
|     if (*((unsigned char*) from) > 0x7F) /* A non-ASCII character */
 | |
|     {
 | |
|       uint32 copied_length= length2 - length;
 | |
|       to_length-= copied_length;
 | |
|       from_length-= copied_length;
 | |
|       return copied_length + my_convert_using_func(to, to_length, to_cs,
 | |
|                                                    to_cs->cset->wc_mb,
 | |
|                                                    from, from_length, from_cs,
 | |
|                                                    from_cs->cset->mb_wc,
 | |
|                                                    errors);
 | |
|     }
 | |
|   }
 | |
| 
 | |
|   DBUG_ASSERT(FALSE); // Should never get to here
 | |
|   return 0;           // Make compiler happy
 | |
| }
 | |
| 
 | |
| 
 | |
| size_t
 | |
| my_convert_fix(CHARSET_INFO *to_cs, char *to, size_t to_length,
 | |
|                CHARSET_INFO *from_cs, const char *from, size_t from_length,
 | |
|                size_t nchars,
 | |
|                MY_STRCOPY_STATUS *copy_status,
 | |
|                MY_STRCONV_STATUS *conv_status)
 | |
| {
 | |
|   int cnvres;
 | |
|   my_wc_t wc;
 | |
|   my_charset_conv_mb_wc mb_wc= from_cs->cset->mb_wc;
 | |
|   my_charset_conv_wc_mb wc_mb= to_cs->cset->wc_mb;
 | |
|   const uchar *from_end= (const uchar*) from + from_length;
 | |
|   uchar *to_end= (uchar*) to + to_length;
 | |
|   char *to_start= to;
 | |
| 
 | |
|   DBUG_ASSERT(to_cs != &my_charset_bin);
 | |
|   DBUG_ASSERT(from_cs != &my_charset_bin);
 | |
| 
 | |
|   copy_status->m_well_formed_error_pos= NULL;
 | |
|   conv_status->m_cannot_convert_error_pos= NULL;
 | |
| 
 | |
|   for ( ; nchars; nchars--)
 | |
|   {
 | |
|     const char *from_prev= from;
 | |
|     if ((cnvres= (*mb_wc)(from_cs, &wc, (uchar*) from, from_end)) > 0)
 | |
|       from+= cnvres;
 | |
|     else if (cnvres == MY_CS_ILSEQ)
 | |
|     {
 | |
|       if (!copy_status->m_well_formed_error_pos)
 | |
|         copy_status->m_well_formed_error_pos= from;
 | |
|       from++;
 | |
|       wc= '?';
 | |
|     }
 | |
|     else if (cnvres > MY_CS_TOOSMALL)
 | |
|     {
 | |
|       /*
 | |
|         A correct multibyte sequence detected
 | |
|         But it doesn't have Unicode mapping.
 | |
|       */
 | |
|       if (!conv_status->m_cannot_convert_error_pos)
 | |
|         conv_status->m_cannot_convert_error_pos= from;
 | |
|       from+= (-cnvres);
 | |
|       wc= '?';
 | |
|     }
 | |
|     else
 | |
|     {
 | |
|       if ((uchar *) from >= from_end)
 | |
|         break; // End of line
 | |
|       // Incomplete byte sequence
 | |
|       if (!copy_status->m_well_formed_error_pos)
 | |
|         copy_status->m_well_formed_error_pos= from;
 | |
|       from++;
 | |
|       wc= '?';
 | |
|     }
 | |
| outp:
 | |
|     if ((cnvres= (*wc_mb)(to_cs, wc, (uchar*) to, to_end)) > 0)
 | |
|       to+= cnvres;
 | |
|     else if (cnvres == MY_CS_ILUNI && wc != '?')
 | |
|     {
 | |
|       if (!conv_status->m_cannot_convert_error_pos)
 | |
|         conv_status->m_cannot_convert_error_pos= from_prev;
 | |
|       wc= '?';
 | |
|       goto outp;
 | |
|     }
 | |
|     else
 | |
|     {
 | |
|       from= from_prev;
 | |
|       break;
 | |
|     }
 | |
|   }
 | |
|   copy_status->m_source_end_pos= from;
 | |
|   return to - to_start;
 | |
| }
 | |
| 
 | |
| 
 | |
| int my_strnncollsp_nchars_generic(CHARSET_INFO *cs,
 | |
|                                   const uchar *str1, size_t len1,
 | |
|                                   const uchar *str2, size_t len2,
 | |
|                                   size_t nchars,
 | |
|                                   uint flags)
 | |
| {
 | |
|   int error;
 | |
|   len1= my_well_formed_length(cs, (const char *) str1,
 | |
|                                   (const char *) str1 + len1,
 | |
|                                   nchars, &error);
 | |
|   len2= my_well_formed_length(cs, (const char *) str2,
 | |
|                                   (const char *) str2 + len2,
 | |
|                                   nchars, &error);
 | |
|   DBUG_ASSERT((cs->state & MY_CS_NOPAD) == 0);
 | |
|   return cs->coll->strnncollsp(cs, str1, len1, str2, len2);
 | |
| }
 | |
| 
 | |
| 
 | |
| int my_strnncollsp_nchars_generic_8bit(CHARSET_INFO *cs,
 | |
|                                        const uchar *str1, size_t len1,
 | |
|                                        const uchar *str2, size_t len2,
 | |
|                                        size_t nchars,
 | |
|                                        uint flags)
 | |
| {
 | |
|   set_if_smaller(len1, nchars);
 | |
|   set_if_smaller(len2, nchars);
 | |
|   DBUG_ASSERT((cs->state & MY_CS_NOPAD) == 0);
 | |
|   return cs->coll->strnncollsp(cs, str1, len1, str2, len2);
 | |
| }
 | |
| 
 | |
| 
 | |
| uint my_ci_get_id_generic(CHARSET_INFO *cs, my_collation_id_type_t type)
 | |
| {
 | |
|   return cs->number;
 | |
| }
 | |
| 
 | |
| 
 | |
| LEX_CSTRING my_ci_get_collation_name_generic(CHARSET_INFO *cs,
 | |
|                                              my_collation_name_mode_t mode)
 | |
| {
 | |
|   return cs->coll_name;
 | |
| }
 | |
| 
 | |
| 
 | |
| uint my_casefold_multiply_1(CHARSET_INFO *cs)
 | |
| {
 | |
|   return 1;
 | |
| }
 | |
| 
 | |
| 
 | |
| uint my_casefold_multiply_2(CHARSET_INFO *cs)
 | |
| {
 | |
|   return 2;
 | |
| }
 | 
