From d7ffb7c3dd95c4c5e36f75b09662faafa0fb1ecd Mon Sep 17 00:00:00 2001
From: Alexander Barkov <bar@mariadb.com>
Date: Thu, 25 Nov 2021 06:48:17 +0400
Subject: [PATCH] MDEV-27009 Add UCA-14.0.0 collations - dump logical positions
 and contractions

- uca-dump can now dump logical positions as a set of "#define" directives.
  Logical positions for 4.0.0 and for 5.2.0 were calculated and put into
  ctype-uca.c manually. That required some effort: analyzing allkeys.txt
  with the help of grep and sort.
  Now when defining a new MY_UCA_INFO it's possible to use the new #define's
  instead of calculating logical positions manually.
  Logical positions also print their weights in DUCET format as a comment
  before the define:

/*
[.0000.0021.0002]
[.0000.0117.0002]
*/

  The comment helps to know weight ranges on various levels,
  which makes it easier to debug the code.

- uca-dump can now dump built-in DUCET contractions

- Adding a new uca-dump command line option --no-contractions. This is useful
  if one needs to re-dump 4.0.0 and 5.2.0 data in a ctype-uca.c compatible way.

- Adding a new uca-dump command line option --case-first=upper|lower.
  This can be useful if one needs to dump with UPPER case first by default.
  It's not yet decided if we'll use --case-first=upper during the dump though.

- Moving parts of the code from the main loop into separate functions
  parse_chars() and parse_weights(). This allows reusing the code between
  single characters and contractions.

- Adding a new function my_ducet_weight_normalize(), to cut zero weights
  from a weight string, e.g. [AAAA][0000][BBBB] -> [AAAA][BBBB].
  This helps to reuse the code between single characters and contractions.

- Weight normalization is now done before printing, in separate loops inside
  my_ducet_normalize(). Before this change, normalization was done during
  printing, inside the printing loop. This helps to separate the steps:
  loading -> normalizing -> printing.
  This makes it easier to follow what's going on, e.g. while debugging.

- Fixing ctype-uca.c to handle built-in contractions of any length.
  Previously we had only built-in contractions in utf8mb4_thai_520_w2,
  which contains only 2-character contractions.
---
 strings/ctype-uca.c |   9 +-
 strings/uca-dump.c  | 662 +++++++++++++++++++++++++++++++++++++-------
 2 files changed, 567 insertions(+), 104 deletions(-)

diff --git a/strings/ctype-uca.c b/strings/ctype-uca.c
index 06ea8dd251f..06dfe73ab39 100644
--- a/strings/ctype-uca.c
+++ b/strings/ctype-uca.c
@@ -33720,16 +33720,11 @@ init_weight_level(MY_CHARSET_LOADER *loader, MY_COLL_RULES *rules,
   for (i= 0; i != src->contractions.nitems; i++)
   {
     MY_CONTRACTION *item= &src->contractions.item[i];
-    /*
-      TODO: calculate length from item->ch.
-      Generally contractions can consist of more than 2 characters.
-    */
-    uint length= 2;
+    uint length= my_wstrnlen(item->ch, array_elements(item->ch));
     uint16 *weights= my_uca_init_one_contraction(&dst->contractions,
                                                  item->ch, length,
                                                  item->with_context);
-    memcpy(weights, item->weight, length * sizeof(uint16));
-    weights[length]= 0;
+    memcpy(weights, item->weight, sizeof(item->weight));
   }
   return FALSE;
 }
diff --git a/strings/uca-dump.c b/strings/uca-dump.c
index c9195229e0b..8846841bdce 100644
--- a/strings/uca-dump.c
+++ b/strings/uca-dump.c
@@ -24,17 +24,6 @@
 #include "m_ctype.h"
 #include "ctype-uca.h"
 
-#if 0
-#define MY_UCA_NPAGES	1024
-#define MY_UCA_NCHARS	64
-#define MY_UCA_CMASK	63
-#define MY_UCA_PSHIFT	6
-#else
-#define MY_UCA_NPAGES	4352 /* 0x110000 characters / 0x100 chars per page */
-#define MY_UCA_NCHARS	256
-#define MY_UCA_CMASK	255
-#define MY_UCA_PSHIFT	8
-#endif
 
 #define MAX_ALLOWED_CODE 0x10FFFF
 
@@ -44,6 +33,8 @@ typedef struct opt_st
   const char *name_prefix; /* Name that goes into all array names */
   const char *filename;    /* The filename or "-" for stdin */
   uint levels;             /* The number of levels to dump */
+  my_bool no_contractions;
+  my_bool case_first_upper;
 } OPT;
 
 
@@ -51,7 +42,9 @@ static OPT defaults=
 {
   "uca",
   "-",
-  3
+  3,
+  FALSE,
+  FALSE
 };
 
 
@@ -65,22 +58,344 @@ typedef struct my_ducet_weight_st
 typedef struct my_ducet_single_char_t
 {
   MY_DUCET_WEIGHT weight;
+  my_bool is_variable;
 } MY_DUCET_SINGLE_CHAR;
 
 
+typedef struct my_ducet_char_t
+{
+  my_wc_t wc[MY_UCA_MAX_CONTRACTION];
+  size_t length;
+} MY_DUCET_CHARS;
+
+
+typedef struct my_ducet_contraction_t
+{
+  MY_DUCET_CHARS chars;
+  MY_DUCET_WEIGHT weights;
+} MY_DUCET_CONTRACTION;
+
+
+typedef struct my_ducet_contraction_list_st
+{
+  size_t nitems;
+  MY_DUCET_CONTRACTION item[4*1024];
+} MY_DUCET_CONTRACTION_LIST;
+
+
+typedef struct my_ducet_logical_posision_st
+{
+  my_wc_t first;
+  my_wc_t last;
+} MY_DUCET_LOGICAL_POSITION;
+
+
+typedef struct my_ducet_logical_positions_st
+{
+  MY_DUCET_LOGICAL_POSITION tertiary_ignorable;
+  MY_DUCET_LOGICAL_POSITION secondary_ignorable;
+  MY_DUCET_LOGICAL_POSITION primary_ignorable;
+  MY_DUCET_LOGICAL_POSITION variable;
+  MY_DUCET_LOGICAL_POSITION non_ignorable;
+} MY_DUCET_LOGICAL_POSITIONS;
+
+
 typedef struct my_allkeys_st
 {
   MY_DUCET_SINGLE_CHAR single_chars[MAX_ALLOWED_CODE+1];
+  MY_DUCET_CONTRACTION_LIST contractions;
+  MY_DUCET_LOGICAL_POSITIONS logical_positions;
   uint version;
   char version_str[32];
 } MY_DUCET;
 
 
+static int
+my_ducet_weight_cmp_on_level(const MY_DUCET_WEIGHT *a,
+                             const MY_DUCET_WEIGHT *b,
+                             uint level)
+{
+  uint i;
+  for (i= 0; i < array_elements(a->weight[level]); i++)
+  {
+    int diff= (int) a->weight[level][i] - (int) b->weight[level][i];
+    if (diff)
+      return diff;
+  }
+  return 0;
+}
+
+
+static int
+my_ducet_weight_cmp(const MY_DUCET_WEIGHT *a,
+                    const MY_DUCET_WEIGHT *b)
+{
+  uint level;
+  for (level= 0; level < array_elements(a->weight); level++)
+  {
+    int diff= my_ducet_weight_cmp_on_level(a, b, level);
+    if (diff)
+      return diff;
+  }
+  return 0;
+}
+
+
+/*
+"3.11 Logical Reset Positions" says:
+
+The CLDR table (based on UCA) has the following overall structure for weights,
+going from low to high.
+
+*/
+
+static my_bool
+my_ducet_weight_is_tertiary_ignorable(const MY_DUCET_WEIGHT *w)
+{
+  return w->weight[0][0] == 0 &&
+         w->weight[1][0] == 0 &&
+         w->weight[2][0] == 0;
+}
+
+
+static my_bool
+my_ducet_weight_is_secondary_ignorable(const MY_DUCET_WEIGHT *w)
+{
+  return w->weight[0][0] == 0 &&
+         w->weight[1][0] == 0 &&
+         w->weight[2][0] != 0;
+}
+
+
+static my_bool
+my_ducet_weight_is_primary_ignorable(const MY_DUCET_WEIGHT *w)
+{
+  return w->weight[0][0] == 0 &&
+         w->weight[1][0] != 0 &&
+         w->weight[2][0] != 0;
+}
+
+
+static my_bool
+my_ducet_weight_is_primary_non_ignorable(const MY_DUCET_WEIGHT *w)
+{
+  return w->weight[0][0] > 0 && w->weight[0][0] < 0xFB00;
+}
+
+
+/*
+  if alternate = non-ignorable
+  p != ignore,
+  if  alternate = shifted
+  p, s, t = ignore
+*/
+static my_bool
+my_ducet_single_char_is_variable(const MY_DUCET_SINGLE_CHAR *ch)
+{
+  return ch->is_variable &&
+         my_ducet_weight_is_primary_non_ignorable(&ch->weight);
+}
+
+
+static void
+my_ducet_logical_position_set(MY_DUCET_LOGICAL_POSITION *dst, my_wc_t wc)
+{
+  dst->first= dst->last= wc;
+}
+
+
+static void
+my_ducet_logical_position_update(MY_DUCET_LOGICAL_POSITION *dst,
+                                 const MY_DUCET *ducet, my_wc_t current)
+{
+  const MY_DUCET_SINGLE_CHAR *chars= ducet->single_chars;
+  int diff;
+  if (current >= array_elements(ducet->single_chars))
+    return;
+  if ((diff= my_ducet_weight_cmp(&chars[current].weight,
+                                 &chars[dst->first].weight)) < 0 ||
+      (diff == 0 && current < dst->first))
+    dst->first= current;
+  if ((diff= my_ducet_weight_cmp(&chars[current].weight,
+                                 &chars[dst->last].weight)) > 0 ||
+      (diff == 0 && current > dst->last))
+    dst->last= current;
+}
+
+
+static void
+my_ducet_logical_positions_init(MY_DUCET_LOGICAL_POSITIONS *dst,
+                                const MY_DUCET *ducet)
+{
+  uint i;
+  const MY_DUCET_SINGLE_CHAR *chars= ducet->single_chars;
+
+  for (i= 0; i < array_elements(ducet->single_chars); i++)
+  {
+    if (my_ducet_weight_is_tertiary_ignorable(&chars[i].weight))
+    {
+      my_ducet_logical_position_set(&dst->tertiary_ignorable, i);
+      break;
+    }
+  }
+
+  for (i= 0; i < array_elements(ducet->single_chars); i++)
+  {
+    if (my_ducet_weight_is_secondary_ignorable(&chars[i].weight))
+    {
+      my_ducet_logical_position_set(&dst->secondary_ignorable, i);
+      break;
+    }
+  }
+
+  for (i= 0; i < array_elements(ducet->single_chars); i++)
+  {
+    if (my_ducet_weight_is_primary_ignorable(&chars[i].weight))
+    {
+      my_ducet_logical_position_set(&dst->primary_ignorable, i);
+      break;
+    }
+  }
+
+  for (i= 0; i < array_elements(ducet->single_chars); i++)
+  {
+    if (my_ducet_weight_is_primary_non_ignorable(&chars[i].weight))
+    {
+      my_ducet_logical_position_set(&dst->non_ignorable, i);
+      break;
+    }
+  }
+
+  for (i= 0; i < array_elements(ducet->single_chars); i++)
+  {
+    if (my_ducet_single_char_is_variable(&chars[i]))
+    {
+      my_ducet_logical_position_set(&dst->variable, i);
+      break;
+    }
+  }
+
+  for (i= 1; i < array_elements(ducet->single_chars); i++)
+  {
+    if (my_ducet_weight_is_primary_non_ignorable(&chars[i].weight))
+      my_ducet_logical_position_update(&dst->non_ignorable, ducet, i);
+    if (my_ducet_weight_is_primary_ignorable(&chars[i].weight))
+      my_ducet_logical_position_update(&dst->primary_ignorable, ducet, i);
+    if (my_ducet_weight_is_secondary_ignorable(&chars[i].weight))
+      my_ducet_logical_position_update(&dst->secondary_ignorable, ducet, i);
+    if (my_ducet_weight_is_tertiary_ignorable(&chars[i].weight))
+      my_ducet_logical_position_update(&dst->tertiary_ignorable, ducet, i);
+    if (my_ducet_single_char_is_variable(&chars[i]))
+      my_ducet_logical_position_update(&dst->variable, ducet, i);
+  }
+
+  /*
+    DUCET as of Unicode-14.0.0 does not have any secondary ignorable
+    characters, i.e. with weights [p=0000, s=0000, t!=0000]
+    For compatibility with 4.0.0 and 5.2.0 data in ctype-uca.c,
+    let's copy tertiary_ignorable to secondary_ignorable.
+    It gives effectively the same result as just leaving
+    secondary_ignorable as {first=U+0000,last=U+0000}.
+  */
+  if (dst->secondary_ignorable.first == 0 && dst->secondary_ignorable.last == 0)
+  {
+    dst->secondary_ignorable.first= dst->tertiary_ignorable.first;
+    dst->secondary_ignorable.last= dst->tertiary_ignorable.last;
+  }
+}
+
+
+static void
+my_ducet_weight_normalize_on_level(MY_DUCET_WEIGHT *weights,
+                                   uint level,
+                                   const OPT *options)
+{
+  uint dst, src;
+  for (src= 0, dst= 0; src < array_elements(weights->weight[level]); src++)
+  {
+    if (weights->weight[level][src] != 0)
+      weights->weight[level][dst++]= weights->weight[level][src];
+  }
+  for ( ; dst < array_elements(weights->weight[level]) ; dst++)
+    weights->weight[level][dst]= 0;
+  if (options->case_first_upper && level == 2)
+  {
+    /*
+      Invert weights for tertiary level to
+      sort upper case letters before their
+      lower case counterparts.
+    */
+    for (dst= 0; dst < array_elements(weights->weight[level]); dst++)
+    {
+      if (weights->weight[level][dst] == 0)
+        break;
+      if (weights->weight[level][dst] >= 0x20)
+      {
+        fprintf(stderr, "Secondary level is too large: %04X\n",
+                (int) weights->weight[level][dst]);
+      }
+      weights->weight[level][dst]= (uint16) (0x20 - weights->weight[level][dst]);
+    }
+  }
+}
+
+
+static void
+my_ducet_weight_normalize(MY_DUCET_WEIGHT *weights, const OPT *options)
+{
+  uint i;
+  for (i= 0; i < array_elements(weights->weight); i++)
+    my_ducet_weight_normalize_on_level(weights, i, options);
+}
+
+
+static void
+my_ducet_normalize(MY_DUCET *ducet, const OPT *options)
+{
+  uint i;
+  for (i= 0; i < array_elements(ducet->single_chars); i++)
+    my_ducet_weight_normalize(&ducet->single_chars[i].weight, options);
+  for (i= 0; i < array_elements(ducet->contractions.item); i++)
+    my_ducet_weight_normalize(&ducet->contractions.item[i].weights, options);
+}
+
+
+static my_bool
+my_ducet_contraction_list_add(MY_DUCET_CONTRACTION_LIST *dst,
+                              const MY_DUCET_CHARS *chars,
+                              const MY_DUCET_WEIGHT *weights)
+{
+  if (dst->nitems >= array_elements(dst->item))
+  {
+    fprintf(stderr, "Too many contractions\n");
+    return TRUE;
+  }
+  dst->item[dst->nitems].chars= *chars;
+  dst->item[dst->nitems].weights= *weights;
+  dst->nitems++;
+  return FALSE;
+}
+
+
+#if 0
+#define MY_UCA_NPAGES	1024
+#define MY_UCA_NCHARS	64
+#define MY_UCA_CMASK	63
+#define MY_UCA_PSHIFT	6
+#else
+#define MY_UCA_NPAGES	4352 /* 0x110000 characters / 0x100 chars per page */
+#define MY_UCA_NCHARS	256
+#define MY_UCA_CMASK	255
+#define MY_UCA_PSHIFT	8
+#endif
+
+
+
 /* Name prefix that goes into page weight array names after global_name_prefix */
 static const char *pname_prefix[]= {"_p", "_p", "_p"};
 
 /* Name suffix that goes into page weight array names after page number */
-static const char *pname_suffix[]= {"", "_w2", "_w3"};
+static const char *pname_suffix[]= {"", "_secondary", "_tertiary"};
 
 
 void usage(const char *prog)
@@ -100,6 +415,8 @@ int process_option(OPT *options, const char *opt)
 {
   static const LEX_CSTRING opt_name_prefix= {STRING_WITH_LEN("--name-prefix=")};
   static const LEX_CSTRING opt_levels= {STRING_WITH_LEN("--levels=")};
+  static const LEX_CSTRING opt_no_contractions= {STRING_WITH_LEN("--no-contractions")};
+  static const LEX_CSTRING opt_case_first= {STRING_WITH_LEN("--case-first=")};
   if (!lstrncmp(opt, opt_name_prefix))
   {
     options->name_prefix= opt + opt_name_prefix.length;
@@ -115,6 +432,27 @@ int process_option(OPT *options, const char *opt)
     }
     return 0;
   }
+  if (!lstrncmp(opt, opt_case_first))
+  {
+    const char *value= opt + opt_case_first.length;
+    if (!strcasecmp(value, "upper"))
+    {
+      options->case_first_upper= TRUE;
+      return 0;
+    }
+    if (!strcasecmp(value, "lower"))
+    {
+      options->case_first_upper= FALSE;
+      return 0;
+    }
+    fprintf(stderr, "Bad option: %s\n", opt);
+    return 1;
+  }
+  if (!strcmp(opt, opt_no_contractions.str))
+  {
+    options->no_contractions= TRUE;
+    return 0;
+  }
   printf("Unknown option: %s\n", opt);
   return 1;
 }
@@ -189,7 +527,7 @@ char *strrtrim(char *str)
 my_bool parse_at_line(MY_DUCET *ducet, const char *str)
 {
   static const LEX_CSTRING version= {STRING_WITH_LEN("@version ")};
-  if (!strncmp(str, version.str, version.length))
+  if (!lstrncmp(str, version))
   {
     /*
       Examples:
@@ -220,6 +558,119 @@ my_bool parse_at_line(MY_DUCET *ducet, const char *str)
 }
 
 
+static void
+parse_chars(MY_DUCET_CHARS *dst, char *str)
+{
+  char *s;
+  const char *delim= " \t";
+  dst->length= 0;
+  for (s= strtok(str, delim); s ; s= strtok(NULL, delim))
+  {
+    my_wc_t code= (my_wc_t) strtoul(s, NULL, 16);
+    if (dst->length < array_elements(dst->wc))
+      dst->wc[dst->length]= code;
+    dst->length++;
+  }
+}
+
+
+static void
+parse_weights(MY_DUCET_WEIGHT *dst, my_bool *is_variable, char *weight)
+{
+  const char *delim= " []";
+  size_t w;
+  char *weights[64];
+  char *s;
+  dst->weight_length= 0;
+  *is_variable= FALSE;
+  for (s= strtok(weight, delim) ; s ; s= strtok(NULL, delim))
+  {
+    if (dst->weight_length < array_elements(weights))
+      weights[dst->weight_length]= s;
+    dst->weight_length++;
+  }
+
+  set_if_smaller(dst->weight_length, MY_UCA_MAX_WEIGHT_SIZE-1);
+
+  for (w= 0; w < dst->weight_length ; w++)
+  {
+    size_t partnum= 0;
+    for (s= weights[w]; *s ;)
+    {
+      char *endptr;
+      uint part= (uint) strtoul(s + 1, &endptr, 16);
+      if (w == 0 && s[0] == '*')
+        *is_variable= TRUE;
+      if (part > 0xFFFF)
+        fprintf(stderr, "Weight is too large: %X\n", (uint) part);
+      dst->weight[partnum][w]= (uint16) part;
+      s= endptr;
+      partnum++;
+    }
+  }
+}
+
+
+static void
+print_one_logical_position(const OPT *options,
+                       const char *name,
+                       const char *name2,
+                       my_wc_t value)
+{
+  printf("#define %s_%s%s 0x%04X\n",
+         options->name_prefix, name, name2, (int) value);
+}
+
+
+static void
+my_ducet_weight_print_canonical(const MY_DUCET_WEIGHT *src)
+{
+  uint i;
+  for (i= 0; i < array_elements(src->weight[0]); i++)
+  {
+    my_bool zero= src->weight[0][i] == 0 &&
+                  src->weight[1][i] == 0 &&
+                  src->weight[2][i] == 0;
+    if (zero && i > 0)
+      break;
+    printf("[.%04X.%04X.%04X]",
+           src->weight[0][i],
+           src->weight[1][i],
+           src->weight[2][i]);
+  }
+}
+
+
+static void
+my_ducet_logical_position_print(const MY_DUCET_LOGICAL_POSITION *src,
+                                const char *name,
+                                const MY_DUCET *ducet,
+                                const OPT *options)
+{
+  printf("/*\n");
+  my_ducet_weight_print_canonical(&ducet->single_chars[src->first].weight);
+  printf("\n");
+  my_ducet_weight_print_canonical(&ducet->single_chars[src->last].weight);
+  printf("\n*/\n");
+  print_one_logical_position(options, name, "_first", src->first);
+  print_one_logical_position(options, name, "_last", src->last);
+  printf("\n");
+}
+
+
+static void
+print_logical_positions(const MY_DUCET_LOGICAL_POSITIONS *src,
+                        const MY_DUCET *ducet,
+                        const OPT *opt)
+{
+  my_ducet_logical_position_print(&src->tertiary_ignorable, "tertiary_ignorable", ducet, opt);
+  my_ducet_logical_position_print(&src->secondary_ignorable, "secondary_ignorable", ducet, opt);
+  my_ducet_logical_position_print(&src->primary_ignorable, "primary_ignorable", ducet, opt);
+  my_ducet_logical_position_print(&src->variable, "variable", ducet, opt);
+  my_ducet_logical_position_print(&src->non_ignorable, "non_ignorable", ducet, opt);
+}
+
+
 static void
 print_version(const MY_DUCET *ducet, const OPT *opt)
 {
@@ -230,10 +681,64 @@ print_version(const MY_DUCET *ducet, const OPT *opt)
 }
 
 
+static void
+print_contraction(const MY_DUCET_CONTRACTION *c,
+                  uint level,
+                  const OPT *options)
+{
+  size_t j;
+  printf("{");
+  printf("{");
+  for (j= 0; j < array_elements(c->chars.wc); j++)
+  {
+    if (j > 0)
+      printf(", ");
+    if (c->chars.wc[j])
+      printf("0x%04X", (uint) c->chars.wc[j]);
+    else
+    {
+      printf("0");
+      break;
+    }
+  }
+  printf("}, ");
+  printf("{");
+  for (j= 0; j < array_elements(c->weights.weight[level]); j++)
+  {
+    if (j > 0)
+      printf(", ");
+    if (c->weights.weight[level][j])
+      printf("0x%04X", (uint) c->weights.weight[level][j]);
+    else
+    {
+      printf("0");
+      break;
+    }
+  }
+  printf("}, FALSE");
+  printf("},\n");
+}
+
+
+static void
+print_contraction_list(const MY_DUCET_CONTRACTION_LIST *src, uint level, const OPT *opt)
+{
+  size_t i;
+  printf("\n\n/* Contractions, level %d */\n", level);
+  printf("static MY_CONTRACTION %s_contractions%s[%d]={\n",
+         opt->name_prefix, pname_suffix[level], (int) src->nitems);
+  for (i= 0; i < src->nitems; i++)
+  {
+    const MY_DUCET_CONTRACTION *c= &src->item[i];
+    print_contraction(c, level, opt);
+  }
+  printf("};\n\n");
+}
+
+
 int main(int ac, char **av)
 {
   char str[1024];
-  char *weights[64];
   static MY_DUCET ducet;
   my_wc_t code;
   uint w;
@@ -257,79 +762,58 @@ int main(int ac, char **av)
   {
     char *comment;
     char *weight;
-    char *s;
-    size_t codenum;
+    MY_DUCET_CHARS chr = {0};
+
+    if (str[0] == '#')
+      continue;
 
     if (str[0] == '@')
     {
       parse_at_line(&ducet, strrtrim(str));
       continue;
     }
-    
-    code= (my_wc_t) strtol(str,NULL,16);
-    
-    if (str[0]=='#' || (code > MAX_ALLOWED_CODE))
-      continue;
-    if ((comment=strchr(str,'#')))
-    {
-      *comment++= '\0';
-      for ( ; *comment==' ' ; comment++);
-    }else
-      continue;
-    
-    if ((weight=strchr(str,';')))
+
+    if ((weight= strchr(str, ';')))
     {
       *weight++= '\0';
       for ( ; *weight==' ' ; weight++);
     }
     else
       continue;
-    
-    codenum= 0;
-    s= strtok(str, " \t");
-    while (s)
+
+    if ((comment=strchr(weight, '#')))
     {
-      s= strtok(NULL, " \t");
-      codenum++;
+      *comment++= '\0';
+    }else
+      continue;
+
+    parse_chars(&chr, str);
+    if (!chr.length)
+      continue;
+
+    if (chr.length == 1)
+    {
+      if (chr.wc[0] > MAX_ALLOWED_CODE)
+        continue;
+      parse_weights(&ducet.single_chars[chr.wc[0]].weight,
+                    &ducet.single_chars[chr.wc[0]].is_variable,
+                    weight);
+      /* Mark that a character from this page was loaded */
+      pageloaded[chr.wc[0] >> MY_UCA_PSHIFT]++;
     }
-    
-    if (codenum>1)
+    else
     {
-      /* Multi-character weight, 
-         i.e. contraction. 
-         Not supported yet.
-      */
+      MY_DUCET_WEIGHT weights= {0};
+      my_bool dummy;
+      if (chr.length >= MY_UCA_MAX_CONTRACTION)
+      {
+        fprintf(stderr, "Too long contraction: %d\n", (int) chr.length);
+        continue;
+      }
+      parse_weights(&weights, &dummy, weight);
+      my_ducet_contraction_list_add(&ducet.contractions, &chr, &weights);
       continue;
     }
-    
-    ducet.single_chars[code].weight.weight_length= 0;
-    s= strtok(weight, " []");
-    while (s)
-    {
-      weights[ducet.single_chars[code].weight.weight_length]= s;
-      s= strtok(NULL, " []");
-      ducet.single_chars[code].weight.weight_length++;
-    }
-    
-    set_if_smaller(ducet.single_chars[code].weight.weight_length, MY_UCA_MAX_WEIGHT_SIZE-1);
-
-    for (w=0; w < ducet.single_chars[code].weight.weight_length ; w++)
-    {
-      size_t partnum;
-      
-      partnum= 0;
-      s= weights[w];
-      while (*s)
-      {
-        char *endptr;
-        uint part= (uint) strtoul(s + 1, &endptr, 16);
-        ducet.single_chars[code].weight.weight[partnum][w]= (uint16) part;
-        s= endptr;
-        partnum++;
-      }
-    }
-    /* Mark that a character from this page was loaded */
-    pageloaded[code >> MY_UCA_PSHIFT]++;
   }
 
   close_file(file);
@@ -351,7 +835,10 @@ int main(int ac, char **av)
     }
     ducet.single_chars[code].weight.weight_length= 2;
   }
-  
+
+  my_ducet_normalize(&ducet, &options);
+  my_ducet_logical_positions_init(&ducet.logical_positions, &ducet);
+
   printf("/*\n");
   printf("  Generated from allkeys.txt version '%s'\n", ducet.version_str);
   printf("*/\n");
@@ -445,35 +932,13 @@ int main(int ac, char **av)
       
       for (offs=0; offs < MY_UCA_NCHARS; offs++)
       {
-        uint16 weight[MY_UCA_MAX_WEIGHT_SIZE];
-        size_t num, i;
+        size_t i;
         
         code= page*MY_UCA_NCHARS+offs;
         
-        bzero(weight,sizeof(weight));
-        
-        /* Copy non-zero weights */
-        for (num=0, i=0; i < ducet.single_chars[code].weight.weight_length; i++)
-        {
-          if (ducet.single_chars[code].weight.weight[w][i])
-          {
-            weight[num]= ducet.single_chars[code].weight.weight[w][i];
-            num++;
-          }
-        }
-        
         for (i=0; i < maxnum; i++)
         {
-          /* 
-            Invert weights for secondary level to
-            sort upper case letters before their
-            lower case counter part.
-          */
-          int tmp= weight[i];
-          if (w == 2 && tmp)
-            tmp= (int)(0x20 - weight[i]);
-          
-          
+          int tmp= ducet.single_chars[code].weight.weight[w][i];
           printf("0x%04X", tmp);
           if ((offs+1 != MY_UCA_NCHARS) || (i+1!=maxnum))
             printf(",");
@@ -518,9 +983,12 @@ int main(int ac, char **av)
                comma, nline);
     }
     printf("};\n");
-  }
 
+    if (!options.no_contractions)
+      print_contraction_list(&ducet.contractions, w, &options);
+  }
   print_version(&ducet, &options);
+  print_logical_positions(&ducet.logical_positions, &ducet, &options);
   
   return 0;
 }