mirror of
				https://github.com/MariaDB/server.git
				synced 2025-10-31 19:06:14 +01:00 
			
		
		
		
	
		
			
				
	
	
		
			996 lines
		
	
	
	
		
			25 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
			
		
		
	
	
			996 lines
		
	
	
	
		
			25 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
| /* Copyright (c) 2004, 2006 MySQL AB
 | |
|    Copyright (c) 2009-2011, Monty Program Ab
 | |
|    Use is subject to license terms.
 | |
|    Copyright (c) 2009-2011, Monty Program Ab
 | |
| 
 | |
|    This program is free software; you can redistribute it and/or modify
 | |
|    it under the terms of the GNU General Public License as published by
 | |
|    the Free Software Foundation; version 2 of the License.
 | |
| 
 | |
|    This program is distributed in the hope that it will be useful,
 | |
|    but WITHOUT ANY WARRANTY; without even the implied warranty of
 | |
|    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 | |
|    GNU General Public License for more details.
 | |
| 
 | |
|    You should have received a copy of the GNU General Public License
 | |
|    along with this program; if not, write to the Free Software
 | |
|    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1335  USA */
 | |
| 
 | |
| #include <stdio.h>
 | |
| #include <stdlib.h>
 | |
| #include <string.h>
 | |
| 
 | |
| #include "my_global.h"
 | |
| #include "m_ctype.h"
 | |
| #include "ctype-uca.h"
 | |
| 
 | |
| 
 | |
| #define MAX_ALLOWED_CODE 0x10FFFF
 | |
| 
 | |
| 
 | |
| typedef struct opt_st
 | |
| {
 | |
|   const char *name_prefix; /* Name that goes into all array names */
 | |
|   const char *filename;    /* The filename or "-" for stdin */
 | |
|   uint levels;             /* The number of levels to dump */
 | |
|   my_bool no_contractions;
 | |
|   my_bool case_first_upper;
 | |
| } OPT;
 | |
| 
 | |
| 
 | |
| static OPT defaults=
 | |
| {
 | |
|   "uca",
 | |
|   "-",
 | |
|   3,
 | |
|   FALSE,
 | |
|   FALSE
 | |
| };
 | |
| 
 | |
| 
 | |
| typedef struct my_ducet_weight_st
 | |
| {
 | |
|   uint16 weight[4][MY_UCA_MAX_WEIGHT_SIZE];
 | |
|   size_t weight_length;
 | |
| } MY_DUCET_WEIGHT;
 | |
| 
 | |
| 
 | |
| typedef struct my_ducet_single_char_t
 | |
| {
 | |
|   MY_DUCET_WEIGHT weight;
 | |
|   my_bool is_variable;
 | |
| } MY_DUCET_SINGLE_CHAR;
 | |
| 
 | |
| 
 | |
| typedef struct my_ducet_char_t
 | |
| {
 | |
|   my_wc_t wc[MY_UCA_MAX_CONTRACTION];
 | |
|   size_t length;
 | |
| } MY_DUCET_CHARS;
 | |
| 
 | |
| 
 | |
| typedef struct my_ducet_contraction_t
 | |
| {
 | |
|   MY_DUCET_CHARS chars;
 | |
|   MY_DUCET_WEIGHT weights;
 | |
| } MY_DUCET_CONTRACTION;
 | |
| 
 | |
| 
 | |
| typedef struct my_ducet_contraction_list_st
 | |
| {
 | |
|   size_t nitems;
 | |
|   MY_DUCET_CONTRACTION item[4*1024];
 | |
| } MY_DUCET_CONTRACTION_LIST;
 | |
| 
 | |
| 
 | |
| typedef struct my_ducet_logical_posision_st
 | |
| {
 | |
|   my_wc_t first;
 | |
|   my_wc_t last;
 | |
| } MY_DUCET_LOGICAL_POSITION;
 | |
| 
 | |
| 
 | |
| typedef struct my_ducet_logical_positions_st
 | |
| {
 | |
|   MY_DUCET_LOGICAL_POSITION tertiary_ignorable;
 | |
|   MY_DUCET_LOGICAL_POSITION secondary_ignorable;
 | |
|   MY_DUCET_LOGICAL_POSITION primary_ignorable;
 | |
|   MY_DUCET_LOGICAL_POSITION variable;
 | |
|   MY_DUCET_LOGICAL_POSITION non_ignorable;
 | |
| } MY_DUCET_LOGICAL_POSITIONS;
 | |
| 
 | |
| 
 | |
| typedef struct my_allkeys_st
 | |
| {
 | |
|   MY_DUCET_SINGLE_CHAR single_chars[MAX_ALLOWED_CODE+1];
 | |
|   MY_DUCET_CONTRACTION_LIST contractions;
 | |
|   MY_DUCET_LOGICAL_POSITIONS logical_positions;
 | |
|   uint version;
 | |
|   char version_str[32];
 | |
| } MY_DUCET;
 | |
| 
 | |
| 
 | |
| static int
 | |
| my_ducet_weight_cmp_on_level(const MY_DUCET_WEIGHT *a,
 | |
|                              const MY_DUCET_WEIGHT *b,
 | |
|                              uint level)
 | |
| {
 | |
|   uint i;
 | |
|   for (i= 0; i < array_elements(a->weight[level]); i++)
 | |
|   {
 | |
|     int diff= (int) a->weight[level][i] - (int) b->weight[level][i];
 | |
|     if (diff)
 | |
|       return diff;
 | |
|   }
 | |
|   return 0;
 | |
| }
 | |
| 
 | |
| 
 | |
| static int
 | |
| my_ducet_weight_cmp(const MY_DUCET_WEIGHT *a,
 | |
|                     const MY_DUCET_WEIGHT *b)
 | |
| {
 | |
|   uint level;
 | |
|   for (level= 0; level < array_elements(a->weight); level++)
 | |
|   {
 | |
|     int diff= my_ducet_weight_cmp_on_level(a, b, level);
 | |
|     if (diff)
 | |
|       return diff;
 | |
|   }
 | |
|   return 0;
 | |
| }
 | |
| 
 | |
| 
 | |
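/*
  "3.11 Logical Reset Positions" says:

  The CLDR table (based on UCA) has the following overall structure for
  weights, going from low to high: tertiary ignorable, secondary ignorable,
  primary ignorable, variable and non-ignorable characters
  (the groups tracked in MY_DUCET_LOGICAL_POSITIONS).

  The predicates below classify a weight into these groups.
*/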
 | |
| 
 | |
| static my_bool
 | |
| my_ducet_weight_is_tertiary_ignorable(const MY_DUCET_WEIGHT *w)
 | |
| {
 | |
|   return w->weight[0][0] == 0 &&
 | |
|          w->weight[1][0] == 0 &&
 | |
|          w->weight[2][0] == 0;
 | |
| }
 | |
| 
 | |
| 
 | |
| static my_bool
 | |
| my_ducet_weight_is_secondary_ignorable(const MY_DUCET_WEIGHT *w)
 | |
| {
 | |
|   return w->weight[0][0] == 0 &&
 | |
|          w->weight[1][0] == 0 &&
 | |
|          w->weight[2][0] != 0;
 | |
| }
 | |
| 
 | |
| 
 | |
| static my_bool
 | |
| my_ducet_weight_is_primary_ignorable(const MY_DUCET_WEIGHT *w)
 | |
| {
 | |
|   return w->weight[0][0] == 0 &&
 | |
|          w->weight[1][0] != 0 &&
 | |
|          w->weight[2][0] != 0;
 | |
| }
 | |
| 
 | |
| 
 | |
| static my_bool
 | |
| my_ducet_weight_is_primary_non_ignorable(const MY_DUCET_WEIGHT *w)
 | |
| {
 | |
|   return w->weight[0][0] > 0 && w->weight[0][0] < 0xFB00;
 | |
| }
 | |
| 
 | |
| 
 | |
| /*
 | |
|   if alternate = non-ignorable
 | |
|   p != ignore,
 | |
|   if  alternate = shifted
 | |
|   p, s, t = ignore
 | |
| */
 | |
| static my_bool
 | |
| my_ducet_single_char_is_variable(const MY_DUCET_SINGLE_CHAR *ch)
 | |
| {
 | |
|   return ch->is_variable &&
 | |
|          my_ducet_weight_is_primary_non_ignorable(&ch->weight);
 | |
| }
 | |
| 
 | |
| 
 | |
| static void
 | |
| my_ducet_logical_position_set(MY_DUCET_LOGICAL_POSITION *dst, my_wc_t wc)
 | |
| {
 | |
|   dst->first= dst->last= wc;
 | |
| }
 | |
| 
 | |
| 
 | |
| static void
 | |
| my_ducet_logical_position_update(MY_DUCET_LOGICAL_POSITION *dst,
 | |
|                                  const MY_DUCET *ducet, my_wc_t current)
 | |
| {
 | |
|   const MY_DUCET_SINGLE_CHAR *chars= ducet->single_chars;
 | |
|   int diff;
 | |
|   if (current >= array_elements(ducet->single_chars))
 | |
|     return;
 | |
|   if ((diff= my_ducet_weight_cmp(&chars[current].weight,
 | |
|                                  &chars[dst->first].weight)) < 0 ||
 | |
|       (diff == 0 && current < dst->first))
 | |
|     dst->first= current;
 | |
|   if ((diff= my_ducet_weight_cmp(&chars[current].weight,
 | |
|                                  &chars[dst->last].weight)) > 0 ||
 | |
|       (diff == 0 && current > dst->last))
 | |
|     dst->last= current;
 | |
| }
 | |
| 
 | |
| 
 | |
| static void
 | |
| my_ducet_logical_positions_init(MY_DUCET_LOGICAL_POSITIONS *dst,
 | |
|                                 const MY_DUCET *ducet)
 | |
| {
 | |
|   uint i;
 | |
|   const MY_DUCET_SINGLE_CHAR *chars= ducet->single_chars;
 | |
| 
 | |
|   for (i= 0; i < array_elements(ducet->single_chars); i++)
 | |
|   {
 | |
|     if (my_ducet_weight_is_tertiary_ignorable(&chars[i].weight))
 | |
|     {
 | |
|       my_ducet_logical_position_set(&dst->tertiary_ignorable, i);
 | |
|       break;
 | |
|     }
 | |
|   }
 | |
| 
 | |
|   for (i= 0; i < array_elements(ducet->single_chars); i++)
 | |
|   {
 | |
|     if (my_ducet_weight_is_secondary_ignorable(&chars[i].weight))
 | |
|     {
 | |
|       my_ducet_logical_position_set(&dst->secondary_ignorable, i);
 | |
|       break;
 | |
|     }
 | |
|   }
 | |
| 
 | |
|   for (i= 0; i < array_elements(ducet->single_chars); i++)
 | |
|   {
 | |
|     if (my_ducet_weight_is_primary_ignorable(&chars[i].weight))
 | |
|     {
 | |
|       my_ducet_logical_position_set(&dst->primary_ignorable, i);
 | |
|       break;
 | |
|     }
 | |
|   }
 | |
| 
 | |
|   for (i= 0; i < array_elements(ducet->single_chars); i++)
 | |
|   {
 | |
|     if (my_ducet_weight_is_primary_non_ignorable(&chars[i].weight))
 | |
|     {
 | |
|       my_ducet_logical_position_set(&dst->non_ignorable, i);
 | |
|       break;
 | |
|     }
 | |
|   }
 | |
| 
 | |
|   for (i= 0; i < array_elements(ducet->single_chars); i++)
 | |
|   {
 | |
|     if (my_ducet_single_char_is_variable(&chars[i]))
 | |
|     {
 | |
|       my_ducet_logical_position_set(&dst->variable, i);
 | |
|       break;
 | |
|     }
 | |
|   }
 | |
| 
 | |
|   for (i= 1; i < array_elements(ducet->single_chars); i++)
 | |
|   {
 | |
|     if (my_ducet_weight_is_primary_non_ignorable(&chars[i].weight))
 | |
|       my_ducet_logical_position_update(&dst->non_ignorable, ducet, i);
 | |
|     if (my_ducet_weight_is_primary_ignorable(&chars[i].weight))
 | |
|       my_ducet_logical_position_update(&dst->primary_ignorable, ducet, i);
 | |
|     if (my_ducet_weight_is_secondary_ignorable(&chars[i].weight))
 | |
|       my_ducet_logical_position_update(&dst->secondary_ignorable, ducet, i);
 | |
|     if (my_ducet_weight_is_tertiary_ignorable(&chars[i].weight))
 | |
|       my_ducet_logical_position_update(&dst->tertiary_ignorable, ducet, i);
 | |
|     if (my_ducet_single_char_is_variable(&chars[i]))
 | |
|       my_ducet_logical_position_update(&dst->variable, ducet, i);
 | |
|   }
 | |
| 
 | |
|   /*
 | |
|     DUCET as of Unicode-14.0.0 does not have any secondary ignorable
 | |
|     characters, i.e. with weights [p=0000, s=0000, t!=0000]
 | |
|     For compatibility with 4.0.0 and 5.2.0 data in ctype-uca.c,
 | |
|     let copy tertiary_ignorable to secondary_ignorable.
 | |
|     It gives effectively the same result with just leaving
 | |
|     secondary_ignorable as {first=U+0000,last=U+0000}.
 | |
|   */
 | |
|   if (dst->secondary_ignorable.first == 0 && dst->secondary_ignorable.last == 0)
 | |
|   {
 | |
|     dst->secondary_ignorable.first= dst->tertiary_ignorable.first;
 | |
|     dst->secondary_ignorable.last= dst->tertiary_ignorable.last;
 | |
|   }
 | |
| }
 | |
| 
 | |
| 
 | |
| static void
 | |
| my_ducet_weight_normalize_on_level(MY_DUCET_WEIGHT *weights,
 | |
|                                    uint level,
 | |
|                                    const OPT *options)
 | |
| {
 | |
|   uint dst, src;
 | |
|   for (src= 0, dst= 0; src < array_elements(weights->weight[level]); src++)
 | |
|   {
 | |
|     if (weights->weight[level][src] != 0)
 | |
|       weights->weight[level][dst++]= weights->weight[level][src];
 | |
|   }
 | |
|   for ( ; dst < array_elements(weights->weight[level]) ; dst++)
 | |
|     weights->weight[level][dst]= 0;
 | |
|   if (options->case_first_upper && level == 2)
 | |
|   {
 | |
|     /*
 | |
|       Invert weights for secondary level to
 | |
|       sort upper case letters before their
 | |
|       lower case counter part.
 | |
|     */
 | |
|     for (dst= 0; dst < array_elements(weights->weight[level]); dst++)
 | |
|     {
 | |
|       if (weights->weight[level][dst] == 0)
 | |
|         break;
 | |
|       if (weights->weight[level][dst] >= 0x20)
 | |
|       {
 | |
|         fprintf(stderr, "Secondary level is too large: %04X\n",
 | |
|                 (int) weights->weight[level][dst]);
 | |
|       }
 | |
|       weights->weight[level][dst]= (uint16) (0x20 - weights->weight[level][dst]);
 | |
|     }
 | |
|   }
 | |
| }
 | |
| 
 | |
| 
 | |
| static void
 | |
| my_ducet_weight_normalize(MY_DUCET_WEIGHT *weights, const OPT *options)
 | |
| {
 | |
|   uint i;
 | |
|   for (i= 0; i < array_elements(weights->weight); i++)
 | |
|     my_ducet_weight_normalize_on_level(weights, i, options);
 | |
| }
 | |
| 
 | |
| 
 | |
| static void
 | |
| my_ducet_normalize(MY_DUCET *ducet, const OPT *options)
 | |
| {
 | |
|   uint i;
 | |
|   for (i= 0; i < array_elements(ducet->single_chars); i++)
 | |
|     my_ducet_weight_normalize(&ducet->single_chars[i].weight, options);
 | |
|   for (i= 0; i < array_elements(ducet->contractions.item); i++)
 | |
|     my_ducet_weight_normalize(&ducet->contractions.item[i].weights, options);
 | |
| }
 | |
| 
 | |
| 
 | |
| static my_bool
 | |
| my_ducet_contraction_list_add(MY_DUCET_CONTRACTION_LIST *dst,
 | |
|                               const MY_DUCET_CHARS *chars,
 | |
|                               const MY_DUCET_WEIGHT *weights)
 | |
| {
 | |
|   if (dst->nitems >= array_elements(dst->item))
 | |
|   {
 | |
|     fprintf(stderr, "Too many contractions\n");
 | |
|     return TRUE;
 | |
|   }
 | |
|   dst->item[dst->nitems].chars= *chars;
 | |
|   dst->item[dst->nitems].weights= *weights;
 | |
|   dst->nitems++;
 | |
|   return FALSE;
 | |
| }
 | |
| 
 | |
| 
 | |
/*
  Page layout of the generated weight tables.
  A code point is split as: page= code >> MY_UCA_PSHIFT,
  with MY_UCA_NCHARS characters per page.
*/
#if 0
/* Disabled layout: 1024 pages, 64 characters each */
#define MY_UCA_NPAGES	1024
#define MY_UCA_NCHARS	64
#define MY_UCA_CMASK	63
#define MY_UCA_PSHIFT	6
#else
/* Active layout: 256 characters per page */
#define MY_UCA_NPAGES	4352 /* 0x110000 characters / 0x100 chars per page */
#define MY_UCA_NCHARS	256
#define MY_UCA_CMASK	255
#define MY_UCA_PSHIFT	8
#endif
 | |
| 
 | |
| 
 | |
| 
 | |
/*
  Parts of the page weight array names, indexed by level.
  A page array is named: <name_prefix><pname_prefix><page><pname_suffix>
*/

/* Goes after the global name prefix and before the page number */
static const char *pname_prefix[]= {"_p", "_p", "_p"};

/* Goes after the page number */
static const char *pname_suffix[]= {"", "_secondary", "_tertiary"};
 | |
| 
 | |
| 
 | |
/* Print a short usage note to stdout. "prog" is the program name. */
void usage(const char *prog)
{
  printf("Usage:\n%s [options] filename\n", prog);
}
 | |
| 
 | |
| 
 | |
| static inline int lstrncmp(const char *str, const LEX_CSTRING lstr)
 | |
| {
 | |
|   return strncmp(lstr.str, str, lstr.length);
 | |
| }
 | |
| 
 | |
| 
 | |
| int process_option(OPT *options, const char *opt)
 | |
| {
 | |
|   static const LEX_CSTRING opt_name_prefix= {STRING_WITH_LEN("--name-prefix=")};
 | |
|   static const LEX_CSTRING opt_levels= {STRING_WITH_LEN("--levels=")};
 | |
|   static const LEX_CSTRING opt_no_contractions= {STRING_WITH_LEN("--no-contractions")};
 | |
|   static const LEX_CSTRING opt_case_first= {STRING_WITH_LEN("--case-first=")};
 | |
|   if (!lstrncmp(opt, opt_name_prefix))
 | |
|   {
 | |
|     options->name_prefix= opt + opt_name_prefix.length;
 | |
|     return 0;
 | |
|   }
 | |
|   if (!lstrncmp(opt, opt_levels))
 | |
|   {
 | |
|     options->levels= (uint) strtoul(opt + opt_levels.length, NULL, 10);
 | |
|     if (options->levels < 1 || options->levels > 3)
 | |
|     {
 | |
|       printf("Bad --levels value\n");
 | |
|       return 1;
 | |
|     }
 | |
|     return 0;
 | |
|   }
 | |
|   if (!lstrncmp(opt, opt_case_first))
 | |
|   {
 | |
|     const char *value= opt + opt_case_first.length;
 | |
|     if (!strcasecmp(value, "upper"))
 | |
|     {
 | |
|       options->case_first_upper= TRUE;
 | |
|       return 0;
 | |
|     }
 | |
|     if (!strcasecmp(value, "lower"))
 | |
|     {
 | |
|       options->case_first_upper= FALSE;
 | |
|       return 0;
 | |
|     }
 | |
|     fprintf(stderr, "Bad option: %s\n", opt);
 | |
|     return 1;
 | |
|   }
 | |
|   if (!strcmp(opt, opt_no_contractions.str))
 | |
|   {
 | |
|     options->no_contractions= TRUE;
 | |
|     return 0;
 | |
|   }
 | |
|   printf("Unknown option: %s\n", opt);
 | |
|   return 1;
 | |
| }
 | |
| 
 | |
| 
 | |
| int process_options(OPT *options, int ac, char **av)
 | |
| {
 | |
|   int i;
 | |
|   for (i= 1; i < ac; i++)
 | |
|   {
 | |
|     if (!strncmp(av[i], "--", 2))
 | |
|     {
 | |
|       if (process_option(options, av[i]))
 | |
|         return 1;
 | |
|     }
 | |
|     else
 | |
|     {
 | |
|       if (i + 1 != ac)
 | |
|       {
 | |
|         usage(av[0]);
 | |
|         return 1;
 | |
|       }
 | |
|       options->filename= av[i];
 | |
|       return 0;
 | |
|     }
 | |
|   }
 | |
|   usage(av[0]);
 | |
|   return 1;
 | |
| }
 | |
| 
 | |
| 
 | |
/*
  Open the input for reading.
  @param name  a file name, or "-" for stdin
  @return      the stream, or NULL if fopen() failed
*/
FILE *open_file(const char *name)
{
  const int use_stdin= (strcmp(name, "-") == 0);
  return use_stdin ? stdin : fopen(name, "r");
}
 | |
| 
 | |
| 
 | |
/* Close a stream returned by open_file(); stdin is left open */
void close_file(FILE *file)
{
  if (file == stdin)
    return;
  fclose(file);
}
 | |
| 
 | |
| 
 | |
/*
  Remove trailing '\r', '\n', ' ' and '\t' characters in place.
  @param str  a modifiable 0-terminated string
  @return     str
*/
char *strrtrim(char *str)
{
  size_t length= strlen(str);
  while (length > 0)
  {
    const char last= str[length - 1];
    if (last != '\r' && last != '\n' && last != ' ' && last != '\t')
      break;
    length--;
    str[length]= '\0';
  }
  return str;
}
 | |
| 
 | |
| 
 | |
| /*
 | |
|   Parse a line starting with '@'.
 | |
|   As of 14.0.0, allkeys.txt has @version and @implicitweights lines.
 | |
|   Only @version is parsed here.
 | |
| 
 | |
|   It could also be possible to parse @implicitweights to automatically
 | |
|   generate routines responsible for implicit weight handling for Siniform
 | |
|   ideographic scripts (Tangut, Nushu, Khitan). But as there are only a few
 | |
|   of them at the moment, it was easier to write these routines in ctype-uca.h
 | |
|   manually. So @implicitweights lines are ignored here.
 | |
| */
 | |
| my_bool parse_at_line(MY_DUCET *ducet, const char *str)
 | |
| {
 | |
|   static const LEX_CSTRING version= {STRING_WITH_LEN("@version ")};
 | |
|   if (!lstrncmp(str, version))
 | |
|   {
 | |
|     /*
 | |
|       Examples:
 | |
|         @version 4.0.0
 | |
|         @version 5.2.0
 | |
|         @version 14.0.0
 | |
|     */
 | |
|     const char *src= str + version.length;
 | |
|     long n[3]= {0};
 | |
|     uint pos;
 | |
|     int length;
 | |
| 
 | |
|     length= snprintf(ducet->version_str, sizeof(ducet->version_str)-1,
 | |
|                      "%s", src);
 | |
|     ducet->version_str[length]= '\0';
 | |
| 
 | |
|     for (pos= 0 ; pos < 3; pos++)
 | |
|     {
 | |
|       char *endptr;
 | |
|       n[pos]= strtol(src, &endptr, 10);
 | |
|       if (*endptr != '.' && *endptr != '\r' && *endptr != '\n' && *endptr != 0)
 | |
|         return TRUE;
 | |
|       src= endptr + 1;
 | |
|     }
 | |
|     ducet->version= MY_UCA_VERSION_ID(n[0], n[1], n[2]);
 | |
|   }
 | |
|   return FALSE;
 | |
| }
 | |
| 
 | |
| 
 | |
| static void
 | |
| parse_chars(MY_DUCET_CHARS *dst, char *str)
 | |
| {
 | |
|   char *s;
 | |
|   const char *delim= " \t";
 | |
|   dst->length= 0;
 | |
|   for (s= strtok(str, delim); s ; s= strtok(NULL, delim))
 | |
|   {
 | |
|     my_wc_t code= (my_wc_t) strtoul(s, NULL, 16);
 | |
|     if (dst->length < array_elements(dst->wc))
 | |
|       dst->wc[dst->length]= code;
 | |
|     dst->length++;
 | |
|   }
 | |
| }
 | |
| 
 | |
| 
 | |
| static void
 | |
| parse_weights(MY_DUCET_WEIGHT *dst, my_bool *is_variable, char *weight)
 | |
| {
 | |
|   const char *delim= " []";
 | |
|   size_t w;
 | |
|   char *weights[64];
 | |
|   char *s;
 | |
|   dst->weight_length= 0;
 | |
|   *is_variable= FALSE;
 | |
|   for (s= strtok(weight, delim) ; s ; s= strtok(NULL, delim))
 | |
|   {
 | |
|     if (dst->weight_length < array_elements(weights))
 | |
|       weights[dst->weight_length]= s;
 | |
|     dst->weight_length++;
 | |
|   }
 | |
| 
 | |
|   set_if_smaller(dst->weight_length, MY_UCA_MAX_WEIGHT_SIZE-1);
 | |
| 
 | |
|   for (w= 0; w < dst->weight_length ; w++)
 | |
|   {
 | |
|     size_t partnum= 0;
 | |
|     for (s= weights[w]; *s ;)
 | |
|     {
 | |
|       char *endptr;
 | |
|       uint part= (uint) strtoul(s + 1, &endptr, 16);
 | |
|       if (w == 0 && s[0] == '*')
 | |
|         *is_variable= TRUE;
 | |
|       if (part > 0xFFFF)
 | |
|         fprintf(stderr, "Weight is too large: %X\n", (uint) part);
 | |
|       dst->weight[partnum][w]= (uint16) part;
 | |
|       s= endptr;
 | |
|       partnum++;
 | |
|     }
 | |
|   }
 | |
| }
 | |
| 
 | |
| 
 | |
| static void
 | |
| print_one_logical_position(const OPT *options,
 | |
|                        const char *name,
 | |
|                        const char *name2,
 | |
|                        my_wc_t value)
 | |
| {
 | |
|   printf("#define %s_%s%s 0x%04X\n",
 | |
|          options->name_prefix, name, name2, (int) value);
 | |
| }
 | |
| 
 | |
| 
 | |
| static void
 | |
| my_ducet_weight_print_canonical(const MY_DUCET_WEIGHT *src)
 | |
| {
 | |
|   uint i;
 | |
|   for (i= 0; i < array_elements(src->weight[0]); i++)
 | |
|   {
 | |
|     my_bool zero= src->weight[0][i] == 0 &&
 | |
|                   src->weight[1][i] == 0 &&
 | |
|                   src->weight[2][i] == 0;
 | |
|     if (zero && i > 0)
 | |
|       break;
 | |
|     printf("[.%04X.%04X.%04X]",
 | |
|            src->weight[0][i],
 | |
|            src->weight[1][i],
 | |
|            src->weight[2][i]);
 | |
|   }
 | |
| }
 | |
| 
 | |
| 
 | |
| static void
 | |
| my_ducet_logical_position_print(const MY_DUCET_LOGICAL_POSITION *src,
 | |
|                                 const char *name,
 | |
|                                 const MY_DUCET *ducet,
 | |
|                                 const OPT *options)
 | |
| {
 | |
|   printf("/*\n");
 | |
|   my_ducet_weight_print_canonical(&ducet->single_chars[src->first].weight);
 | |
|   printf("\n");
 | |
|   my_ducet_weight_print_canonical(&ducet->single_chars[src->last].weight);
 | |
|   printf("\n*/\n");
 | |
|   print_one_logical_position(options, name, "_first", src->first);
 | |
|   print_one_logical_position(options, name, "_last", src->last);
 | |
|   printf("\n");
 | |
| }
 | |
| 
 | |
| 
 | |
| static void
 | |
| print_logical_positions(const MY_DUCET_LOGICAL_POSITIONS *src,
 | |
|                         const MY_DUCET *ducet,
 | |
|                         const OPT *opt)
 | |
| {
 | |
|   my_ducet_logical_position_print(&src->tertiary_ignorable, "tertiary_ignorable", ducet, opt);
 | |
|   my_ducet_logical_position_print(&src->secondary_ignorable, "secondary_ignorable", ducet, opt);
 | |
|   my_ducet_logical_position_print(&src->primary_ignorable, "primary_ignorable", ducet, opt);
 | |
|   my_ducet_logical_position_print(&src->variable, "variable", ducet, opt);
 | |
|   my_ducet_logical_position_print(&src->non_ignorable, "non_ignorable", ducet, opt);
 | |
| }
 | |
| 
 | |
| 
 | |
| static void
 | |
| print_version(const MY_DUCET *ducet, const OPT *opt)
 | |
| {
 | |
|   printf("\n");
 | |
|   printf("#define %s_version %d /* %s */\n",
 | |
|          opt->name_prefix, ducet->version, ducet->version_str);
 | |
|   printf("\n");
 | |
| }
 | |
| 
 | |
| 
 | |
| static void
 | |
| print_contraction(const MY_DUCET_CONTRACTION *c,
 | |
|                   uint level,
 | |
|                   const OPT *options)
 | |
| {
 | |
|   size_t j;
 | |
|   printf("{");
 | |
|   printf("{");
 | |
|   for (j= 0; j < array_elements(c->chars.wc); j++)
 | |
|   {
 | |
|     if (j > 0)
 | |
|       printf(", ");
 | |
|     if (c->chars.wc[j])
 | |
|       printf("0x%04X", (uint) c->chars.wc[j]);
 | |
|     else
 | |
|     {
 | |
|       printf("0");
 | |
|       break;
 | |
|     }
 | |
|   }
 | |
|   printf("}, ");
 | |
|   printf("{");
 | |
|   for (j= 0; j < array_elements(c->weights.weight[level]); j++)
 | |
|   {
 | |
|     if (j > 0)
 | |
|       printf(", ");
 | |
|     if (c->weights.weight[level][j])
 | |
|       printf("0x%04X", (uint) c->weights.weight[level][j]);
 | |
|     else
 | |
|     {
 | |
|       printf("0");
 | |
|       break;
 | |
|     }
 | |
|   }
 | |
|   printf("}, FALSE");
 | |
|   printf("},\n");
 | |
| }
 | |
| 
 | |
| 
 | |
| static void
 | |
| print_contraction_list(const MY_DUCET_CONTRACTION_LIST *src, uint level, const OPT *opt)
 | |
| {
 | |
|   size_t i;
 | |
|   printf("\n\n/* Contractions, level %d */\n", level);
 | |
|   printf("static MY_CONTRACTION %s_contractions%s[%d]={\n",
 | |
|          opt->name_prefix, pname_suffix[level], (int) src->nitems);
 | |
|   for (i= 0; i < src->nitems; i++)
 | |
|   {
 | |
|     const MY_DUCET_CONTRACTION *c= &src->item[i];
 | |
|     print_contraction(c, level, opt);
 | |
|   }
 | |
|   printf("};\n\n");
 | |
| }
 | |
| 
 | |
| 
 | |
| int main(int ac, char **av)
 | |
| {
 | |
|   char str[1024];
 | |
|   static MY_DUCET ducet;
 | |
|   my_wc_t code;
 | |
|   uint w;
 | |
|   int pageloaded[MY_UCA_NPAGES];
 | |
|   FILE *file;
 | |
|   OPT options= defaults;
 | |
| 
 | |
|   if (process_options(&options, ac, av))
 | |
|     return 1;
 | |
| 
 | |
|   if (!(file= open_file(options.filename)))
 | |
|   {
 | |
|     printf("Could not open %s for reading\n", options.filename);
 | |
|     return 1;
 | |
|   }
 | |
| 
 | |
|   bzero(&ducet, sizeof(ducet));
 | |
|   bzero(pageloaded, sizeof(pageloaded));
 | |
|   
 | |
|   while (fgets(str, sizeof(str), file))
 | |
|   {
 | |
|     char *comment;
 | |
|     char *weight;
 | |
|     MY_DUCET_CHARS chr = {{0,0,0,0,0,0}, 0};
 | |
| 
 | |
|     if (str[0] == '#')
 | |
|       continue;
 | |
| 
 | |
|     if (str[0] == '@')
 | |
|     {
 | |
|       parse_at_line(&ducet, strrtrim(str));
 | |
|       continue;
 | |
|     }
 | |
| 
 | |
|     if ((weight= strchr(str, ';')))
 | |
|     {
 | |
|       *weight++= '\0';
 | |
|       for ( ; *weight==' ' ; weight++);
 | |
|     }
 | |
|     else
 | |
|       continue;
 | |
| 
 | |
|     if ((comment=strchr(weight, '#')))
 | |
|     {
 | |
|       *comment++= '\0';
 | |
|     }else
 | |
|       continue;
 | |
| 
 | |
|     parse_chars(&chr, str);
 | |
|     if (!chr.length)
 | |
|       continue;
 | |
| 
 | |
|     if (chr.length == 1)
 | |
|     {
 | |
|       if (chr.wc[0] > MAX_ALLOWED_CODE)
 | |
|         continue;
 | |
|       parse_weights(&ducet.single_chars[chr.wc[0]].weight,
 | |
|                     &ducet.single_chars[chr.wc[0]].is_variable,
 | |
|                     weight);
 | |
|       /* Mark that a character from this page was loaded */
 | |
|       pageloaded[chr.wc[0] >> MY_UCA_PSHIFT]++;
 | |
|     }
 | |
|     else
 | |
|     {
 | |
|       MY_DUCET_WEIGHT weights= {{{0,0,0,0,0,0,0,0,0}, {0,0,0,0,0,0,0,0,0},
 | |
| 				 {0,0,0,0,0,0,0,0,0}, {0,0,0,0,0,0,0,0,0}},
 | |
|                                 0};
 | |
|       my_bool dummy;
 | |
|       if (chr.length >= MY_UCA_MAX_CONTRACTION)
 | |
|       {
 | |
|         fprintf(stderr, "Too long contraction: %d\n", (int) chr.length);
 | |
|         continue;
 | |
|       }
 | |
|       parse_weights(&weights, &dummy, weight);
 | |
|       my_ducet_contraction_list_add(&ducet.contractions, &chr, &weights);
 | |
|       continue;
 | |
|     }
 | |
|   }
 | |
| 
 | |
|   close_file(file);
 | |
| 
 | |
|   /* Now set implicit weights */
 | |
|   for (code=0; code <= MAX_ALLOWED_CODE; code++)
 | |
|   {
 | |
|     uint level;
 | |
| 
 | |
|     if (ducet.single_chars[code].weight.weight_length)
 | |
|       continue;
 | |
| 
 | |
|     for (level= 0; level < 4; level++)
 | |
|     {
 | |
|       MY_UCA_IMPLICIT_WEIGHT weight;
 | |
|       weight= my_uca_implicit_weight_on_level(ducet.version, code, level);
 | |
|       ducet.single_chars[code].weight.weight[level][0]= weight.weight[0];
 | |
|       ducet.single_chars[code].weight.weight[level][1]= weight.weight[1];
 | |
|     }
 | |
|     ducet.single_chars[code].weight.weight_length= 2;
 | |
|   }
 | |
| 
 | |
|   my_ducet_normalize(&ducet, &options);
 | |
|   my_ducet_logical_positions_init(&ducet.logical_positions, &ducet);
 | |
| 
 | |
|   printf("/*\n");
 | |
|   printf("  Generated from allkeys.txt version '%s'\n", ducet.version_str);
 | |
|   printf("*/\n");
 | |
| 
 | |
|   for (w=0; w < options.levels; w++)
 | |
|   {
 | |
|     my_wc_t page;
 | |
|     int pagemaxlen[MY_UCA_NPAGES];
 | |
| 
 | |
|     for (page=0; page < MY_UCA_NPAGES; page++)
 | |
|     {
 | |
|       my_wc_t offs;
 | |
|       size_t maxnum= 0;
 | |
|       size_t nchars= 0;
 | |
|       size_t mchars;
 | |
|       size_t ndefs= 0;
 | |
|       size_t code_line_start= page * MY_UCA_NCHARS;
 | |
|       
 | |
|       pagemaxlen[page]= 0;
 | |
|       
 | |
|       /*
 | |
|         Skip this page if no weights were loaded
 | |
|       */
 | |
|       
 | |
|       if (!pageloaded[page])
 | |
|         continue;
 | |
|       
 | |
|       /* 
 | |
|         Calculate maximum weight
 | |
|         length for this page
 | |
|       */
 | |
|       
 | |
|       for (offs=0; offs < MY_UCA_NCHARS; offs++)
 | |
|       {
 | |
|         size_t i, num;
 | |
|         
 | |
|         code= page*MY_UCA_NCHARS+offs;
 | |
|         
 | |
|         /* Calculate only non-zero weights */
 | |
|         for (num=0, i=0; i < ducet.single_chars[code].weight.weight_length; i++)
 | |
|           if (ducet.single_chars[code].weight.weight[w][i])
 | |
|             num++;
 | |
|         
 | |
|         maxnum= maxnum < num ? num : maxnum;
 | |
|         
 | |
|         /* Check if default weight */
 | |
|         if (w == 1 && num == 1)
 | |
|         {
 | |
|           /* 0020 0000 ... */
 | |
|           if (ducet.single_chars[code].weight.weight[w][0] == 0x0020)
 | |
|             ndefs++;
 | |
|         }
 | |
|         else if (w == 2 && num == 1)
 | |
|         {
 | |
|           /* 0002 0000 ... */
 | |
|           if (ducet.single_chars[code].weight.weight[w][0] == 0x0002)
 | |
|             ndefs++;
 | |
|         }
 | |
|       } 
 | |
|       maxnum++;
 | |
|       
 | |
|       /*
 | |
|         If the page have only default weights
 | |
|         then no needs to dump it, skip.
 | |
|       */
 | |
|       if (ndefs == MY_UCA_NCHARS)
 | |
|       {
 | |
|         continue;
 | |
|       }
 | |
|       switch (maxnum)
 | |
|       {
 | |
|         case 0: mchars= 8; break;
 | |
|         case 1: mchars= 8; break;
 | |
|         case 2: mchars= 8; break;
 | |
|         case 3: mchars= 9; break;
 | |
|         case 4: mchars= 8; break;
 | |
|         default: mchars= ducet.single_chars[code].weight.weight_length;
 | |
|       }
 | |
|       
 | |
|       pagemaxlen[page]= (int) maxnum;
 | |
| 
 | |
| 
 | |
|       /*
 | |
|         Now print this page
 | |
|       */
 | |
|       
 | |
|       
 | |
|       printf("static const uint16 %s%s%03X%s[]= { /* %04X (%d weights per char) */\n",
 | |
|               options.name_prefix, pname_prefix[w], (int) page, pname_suffix[w],
 | |
|               (int) page*MY_UCA_NCHARS, (int) maxnum);
 | |
|       
 | |
|       for (offs=0; offs < MY_UCA_NCHARS; offs++)
 | |
|       {
 | |
|         size_t i;
 | |
|         
 | |
|         code= page*MY_UCA_NCHARS+offs;
 | |
|         
 | |
|         for (i=0; i < maxnum; i++)
 | |
|         {
 | |
|           int tmp= ducet.single_chars[code].weight.weight[w][i];
 | |
|           printf("0x%04X", tmp);
 | |
|           if ((offs+1 != MY_UCA_NCHARS) || (i+1!=maxnum))
 | |
|             printf(",");
 | |
|           else
 | |
|             printf(" ");
 | |
|           nchars++;
 | |
|         }
 | |
|         if (nchars >=mchars)
 | |
|         {
 | |
|           printf(" /* %04X */\n", (int) code_line_start);
 | |
|           code_line_start= code + 1;
 | |
|           nchars=0;
 | |
|         }
 | |
|         else
 | |
|         {
 | |
|           printf(" ");
 | |
|         }
 | |
|       }
 | |
|       printf("};\n\n");
 | |
|     }
 | |
| 
 | |
|     printf("const uchar %s_length%s[%d]={\n",
 | |
|            options.name_prefix, pname_suffix[w], MY_UCA_NPAGES);
 | |
|     for (page=0; page < MY_UCA_NPAGES; page++)
 | |
|     {
 | |
|       printf("%d%s%s",pagemaxlen[page],page<MY_UCA_NPAGES-1?",":"",(page+1) % 16 ? "":"\n");
 | |
|     }
 | |
|     printf("};\n");
 | |
| 
 | |
| 
 | |
|     printf("static const uint16 *%s_weight%s[%d]={\n",
 | |
|            options.name_prefix, pname_suffix[w], MY_UCA_NPAGES);
 | |
|     for (page=0; page < MY_UCA_NPAGES; page++)
 | |
|     {
 | |
|       const char *comma= page < MY_UCA_NPAGES-1 ? "," : "";
 | |
|       const char *nline= (page+1) % 4 ? "" : "\n";
 | |
|       if (!pagemaxlen[page])
 | |
|         printf("NULL       %s%s%s", w ? " ": "",  comma , nline);
 | |
|       else
 | |
|         printf("%s%s%03X%s%s%s",
 | |
|                options.name_prefix, pname_prefix[w], (int) page, pname_suffix[w],
 | |
|                comma, nline);
 | |
|     }
 | |
|     printf("};\n");
 | |
| 
 | |
|     if (!options.no_contractions)
 | |
|       print_contraction_list(&ducet.contractions, w, &options);
 | |
|   }
 | |
|   print_version(&ducet, &options);
 | |
|   print_logical_positions(&ducet.logical_positions, &ducet, &options);
 | |
|   
 | |
|   return 0;
 | |
| }
 | 
