mariadb/strings/ctype-unidata.c

/* Copyright (c) 2000, 2013, Oracle and/or its affiliates.
   Copyright (c) 2009, 2020, MariaDB

   This library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Library General Public
   License as published by the Free Software Foundation; version 2
   of the License.

   This library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Library General Public License for more details.

   You should have received a copy of the GNU Library General Public
   License along with this library; if not, write to the Free
   Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston,
   MA 02110-1335  USA */

#include "strings_def.h"
#include <m_ctype.h>
#include "ctype-mb.h"

#ifndef EILSEQ
#define EILSEQ ENOENT
#endif


#include "ctype-unidata.h"
#include "ctype-unicode300-general_ci.h"
#include "ctype-unicode300-general_mysql500_ci.h"
#include "ctype-unicode300-casefold.h"
#include "ctype-unicode300-casefold-tr.h"
#include "ctype-unicode520-casefold.h"
#include "ctype-unicode1400-casefold.h"
#include "ctype-unicode1400-casefold-tr.h"


/*
  Default case folding descriptor.
  It references my_u300_casefold_index for case folding data and
  weight_general_ci_index for the general_ci weight data.
  NOTE(review): the first member (0xFFFF) is presumably the highest
  code point covered by these tables - confirm against the declaration
  of MY_CASEFOLD_INFO.
*/
MY_CASEFOLD_INFO my_casefold_default=
{
  0xFFFF,
  my_u300_casefold_index,
  weight_general_ci_index
};


/*
  Case folding descriptor with the Turkish lower/upper mapping:
  1. LOWER(0x0049 LATIN CAPITAL LETTER I) ->
           0x0131 LATIN SMALL   LETTER DOTLESS I
  2. UPPER(0x0069 LATIN SMALL   LETTER I) ->
           0x0130 LATIN CAPITAL LETTER I WITH DOT ABOVE

  It differs from my_casefold_default only in the case folding index
  (my_u300tr_casefold_index); the weight index (weight_general_ci_index)
  and the first member (0xFFFF) are the same.
*/

MY_CASEFOLD_INFO my_casefold_turkish=
{
  0xFFFF,
  my_u300tr_casefold_index,
  weight_general_ci_index
};


/*
  general_mysql500_ci is very similar to general_ci, but maps sorting order
  for U+00DF to 0x00DF instead of 0x0053.

  This descriptor therefore reuses the case folding index of
  my_casefold_default (my_u300_casefold_index) and differs only in the
  weight index (weight_general_mysql500_ci_index).
*/
MY_CASEFOLD_INFO my_casefold_mysql500=
{
  0xFFFF,
  my_u300_casefold_index,
  weight_general_mysql500_ci_index
};


/*
  Case folding descriptor referencing my_u520_casefold_index
  (Unicode 5.2.0 data, judging by the name of the included header
  ctype-unicode520-casefold.h).
  The third member is NULL: no weight index is attached here.
  NOTE(review): the first member (0x10FFFF) is presumably the highest
  code point covered, i.e. the full Unicode range rather than 0xFFFF
  as in the descriptors above - confirm against MY_CASEFOLD_INFO.
*/
MY_CASEFOLD_INFO my_casefold_unicode520=
{
  0x10FFFF,
  my_u520_casefold_index,
  NULL
};


/*
  Case folding descriptor referencing my_u1400_casefold_index
  (Unicode 14.0.0 data, judging by the name of the included header
  ctype-unicode1400-casefold.h).
  The third member is NULL: no weight index is attached here.
  NOTE(review): the first member (0x10FFFF) is presumably the highest
  code point covered - confirm against MY_CASEFOLD_INFO.
*/
MY_CASEFOLD_INFO my_casefold_unicode1400=
{
  0x10FFFF,
  my_u1400_casefold_index,
  NULL
};


/*
  Case folding descriptor referencing my_u1400tr_casefold_index.
  It differs from my_casefold_unicode1400 only in the case folding index.
  NOTE(review): the "tr" suffix presumably denotes the Turkish
  lower/upper mapping of I, as in my_casefold_turkish - confirm against
  ctype-unicode1400-casefold-tr.h.
  The third member is NULL: no weight index is attached here.
*/
MY_CASEFOLD_INFO my_casefold_unicode1400tr=
{
  0x10FFFF,
  my_u1400tr_casefold_index,
  NULL
};
MDEV-31071 Refactor case folding data types in Unicode collations This is a non-functional change. It changes how case folding data and weight data (for simple Unicode collations) are stored: - Removing data types MY_UNICASE_CHARACTER, MY_UNICASE_INFO - Using data types MY_CASEFOLD_CHARACTER, MY_CASEFOLD_INFO instead. This patch changes simple Unicode collations in a way similar to how MDEV-30695 previously changed Asian collations. No new MTR tests are needed. The underlying code is thoroughly covered by a number of ctype_*_ws.test and ctype_*_casefold.test files, which were added recently in preparation for this change. Old and new Unicode data layout ------------------------------- Case folding data is now stored in separate tables consisting of MY_CASEFOLD_CHARACTER elements with two members: typedef struct casefold_info_char_t { uint32 toupper; uint32 tolower; } MY_CASEFOLD_CHARACTER; while weight data (for simple non-UCA collations xxx_general_ci and xxx_general_mysql500_ci) is stored in separate arrays of uint16 elements. Before this change, case folding data and simple weight data were stored together, in tables of the following elements with three members: typedef struct unicase_info_char_st { uint32 toupper; uint32 tolower; uint32 sort; /* weights for simple collations */ } MY_UNICASE_CHARACTER; This data format was redundant, because weights (the "sort" member) were needed only for these two simple Unicode collations: - xxx_general_ci - xxx_general_mysql500_ci Adding case folding information for Unicode-14.0.0 using the old format would waste memory without purpose. Detailed changes ---------------- - Changing the underlying data types as described above - Including unidata-dump.c in the sources. This program was previously used to dump UnicodeData.txt (e.g. https://www.unicode.org/Public/14.0.0/ucd/UnicodeData.txt) into MySQL / MariaDB source files. 
It was originally written in 2002, but had not been distributed together with MySQL / MariaDB sources until now. - Removing the old-format Unicode data previously dumped from UnicodeData.txt (versions 3.0.0 and 5.2.0) from ctype-utf8.c. Adding Unicode data in the new format into separate header files, to make the code easier to maintain: - ctype-unicode300-casefold.h - ctype-unicode300-casefold-tr.h - ctype-unicode300-general_ci.h - ctype-unicode300-general_mysql500_ci.h - ctype-unicode520-casefold.h - Adding a new file ctype-unidata.c as an aggregator for the header files listed above. 2023-02-24 16:22:32 +01:00			`/* Copyright (c) 2000, 2013, Oracle and/or its affiliates.`
			`Copyright (c) 2009, 2020, MariaDB`

			`This library is free software; you can redistribute it and/or`
			`modify it under the terms of the GNU Library General Public`
			`License as published by the Free Software Foundation; version 2`
			`of the License.`

			`This library is distributed in the hope that it will be useful,`
			`but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU`
			`Library General Public License for more details.`

			`You should have received a copy of the GNU Library General Public`
			`License along with this library; if not, write to the Free`
			`Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston,`
			`MA 02110-1335 USA */`

			`#include "strings_def.h"`
			`#include <m_ctype.h>`
			`#include "ctype-mb.h"`

			`#ifndef EILSEQ`
			`#define EILSEQ ENOENT`
			`#endif`


			`#include "ctype-unidata.h"`
			`#include "ctype-unicode300-general_ci.h"`
			`#include "ctype-unicode300-general_mysql500_ci.h"`
			`#include "ctype-unicode300-casefold.h"`
			`#include "ctype-unicode300-casefold-tr.h"`
			`#include "ctype-unicode520-casefold.h"`
MDEV-30577 Case folding for uca1400 collations is not up to date Adding casefolding for Unicode-14.0.0 into uca1400 collations. 2023-03-02 14:37:36 +01:00			`#include "ctype-unicode1400-casefold.h"`
			`#include "ctype-unicode1400-casefold-tr.h"`
MDEV-31071 Refactor case folding data types in Unicode collations This is a non-functional change. It changes how case folding data and weight data (for simple Unicode collations) are stored: - Removing data types MY_UNICASE_CHARACTER, MY_UNICASE_INFO - Using data types MY_CASEFOLD_CHARACTER, MY_CASEFOLD_INFO instead. This patch changes simple Unicode collations in a way similar to how MDEV-30695 previously changed Asian collations. No new MTR tests are needed. The underlying code is thoroughly covered by a number of ctype_*_ws.test and ctype_*_casefold.test files, which were added recently in preparation for this change. Old and new Unicode data layout ------------------------------- Case folding data is now stored in separate tables consisting of MY_CASEFOLD_CHARACTER elements with two members: typedef struct casefold_info_char_t { uint32 toupper; uint32 tolower; } MY_CASEFOLD_CHARACTER; while weight data (for simple non-UCA collations xxx_general_ci and xxx_general_mysql500_ci) is stored in separate arrays of uint16 elements. Before this change, case folding data and simple weight data were stored together, in tables of the following elements with three members: typedef struct unicase_info_char_st { uint32 toupper; uint32 tolower; uint32 sort; /* weights for simple collations */ } MY_UNICASE_CHARACTER; This data format was redundant, because weights (the "sort" member) were needed only for these two simple Unicode collations: - xxx_general_ci - xxx_general_mysql500_ci Adding case folding information for Unicode-14.0.0 using the old format would waste memory without purpose. Detailed changes ---------------- - Changing the underlying data types as described above - Including unidata-dump.c in the sources. This program was previously used to dump UnicodeData.txt (e.g. https://www.unicode.org/Public/14.0.0/ucd/UnicodeData.txt) into MySQL / MariaDB source files. 
It was originally written in 2002, but had not been distributed together with MySQL / MariaDB sources until now. - Removing the old-format Unicode data previously dumped from UnicodeData.txt (versions 3.0.0 and 5.2.0) from ctype-utf8.c. Adding Unicode data in the new format into separate header files, to make the code easier to maintain: - ctype-unicode300-casefold.h - ctype-unicode300-casefold-tr.h - ctype-unicode300-general_ci.h - ctype-unicode300-general_mysql500_ci.h - ctype-unicode520-casefold.h - Adding a new file ctype-unidata.c as an aggregator for the header files listed above. 2023-02-24 16:22:32 +01:00


			`MY_CASEFOLD_INFO my_casefold_default=`
			`{`
			`0xFFFF,`
			`my_u300_casefold_index,`
			`weight_general_ci_index`
			`};`


			`/*`
			`Turkish lower/upper mapping:`
			`1. LOWER(0x0049 LATIN CAPITAL LETTER I) ->`
			`0x0131 LATIN SMALL LETTER DOTLESS I`
			`2. UPPER(0x0069 LATIN SMALL LETTER I) ->`
			`0x0130 LATIN CAPITAL LETTER I WITH DOT ABOVE`
			`*/`

			`MY_CASEFOLD_INFO my_casefold_turkish=`
			`{`
			`0xFFFF,`
			`my_u300tr_casefold_index,`
			`weight_general_ci_index`
			`};`


			`/*`
			`general_mysql500_ci is very similar to general_ci, but maps sorting order`
			`for U+00DF to 0x00DF instead of 0x0053.`
			`*/`
			`MY_CASEFOLD_INFO my_casefold_mysql500=`
			`{`
			`0xFFFF,`
			`my_u300_casefold_index,`
			`weight_general_mysql500_ci_index`
			`};`



			`MY_CASEFOLD_INFO my_casefold_unicode520=`
			`{`
			`0x10FFFF,`
			`my_u520_casefold_index,`
			`NULL`
			`};`
MDEV-30577 Case folding for uca1400 collations is not up to date Adding casefolding for Unicode-14.0.0 into uca1400 collations. 2023-03-02 14:37:36 +01:00

			`MY_CASEFOLD_INFO my_casefold_unicode1400=`
			`{`
			`0x10FFFF,`
			`my_u1400_casefold_index,`
			`NULL`
			`};`


			`MY_CASEFOLD_INFO my_casefold_unicode1400tr=`
			`{`
			`0x10FFFF,`
			`my_u1400tr_casefold_index,`
			`NULL`
			`};`