#!/usr/bin/env ruby
# -*- coding: utf-8 -*-
#
# Copyright(C) 2010-2016 Brazil
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License version 2.1 as published by the Free Software Foundation.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1335  USA

# Optional tab-separated file ("source<TAB>destination" per line) whose
# entries are added to the decompose map when the file is readable.
CUSTOM_RULE_PATH = 'nfkc-custom-rules.txt'.freeze

# Emits the three C functions (char type, decompose, compose) as nested
# `if` / `switch` statements over the bytes of each key string.
#
# All text is written to the +output+ object given to the constructor;
# progress messages go to STDERR.
class SwitchGenerator
  # unicode_version:: string embedded in the generated function names
  #                   (grn_nfkc<version>_...).
  # output::          IO-like object responding to puts / print / printf.
  def initialize(unicode_version, output)
    @unicode_version = unicode_version
    @output = output
  end

  # Writes the char type, decompose and compose functions, in that order.
  #
  # bc::            hash of key string => char type code.
  # decompose_map:: hash of source string => decomposed string.
  # compose_map::   hash of source string => composed string.
  def generate(bc, decompose_map, compose_map)
    STDERR.puts('generating char type code..')
    generate_blockcode_char_type(bc)
    STDERR.puts('generating decompose code..')
    generate_decompose(decompose_map)
    STDERR.puts('generating compose code..')
    generate_compose(compose_map)
  end

  private

  # Writes grn_nfkc<version>_char_type(); the body comes from gen_bc.
  def generate_blockcode_char_type(bc)
    @output.puts(<<-HEADER)

grn_char_type
grn_nfkc#{@unicode_version}_char_type(const unsigned char *str)
{
    HEADER

    # @lv holds the most recently seen type code; gen_bc emits it as the
    # return value for byte ranges below the next key.
    @lv = 0
    gen_bc(bc, 0)

    @output.puts(<<-FOOTER)
  return -1;
}
    FOOTER
  end

  # Buckets the entries of +hash+ by the first byte of each key.
  #
  # Returns { first_byte => { remaining_bytes => value } } where
  # remaining_bytes is a binary string. The input hash and its keys are
  # left untouched (keys are duplicated before being re-encoded).
  def group_by_head_byte(hash)
    grouped = {}
    hash.each do |key, val|
      bytes = key.dup
      bytes.force_encoding("ASCII-8BIT")
      (grouped[bytes.bytes[0]] ||= {})[bytes[1..-1]] = val
    end
    grouped
  end

  # Formats +str+ as a double-quoted C string literal in which every byte
  # is written as an upper-case \xHH escape.
  def c_string_literal(str)
    escaped = str.each_byte.map { |b| format('\x%02X', b) }.join
    "\"#{escaped}\""
  end

  # Recursively emits the range-based type lookup for byte index +level+.
  #
  # Keys are processed in ascending first-byte order. A key whose remaining
  # bytes are all 0x80 (or empty) only updates @lv; any other key opens a
  # nested test on the next byte.
  def gen_bc(hash, level)
    bl = ' ' * (level * 2)
    h2 = group_by_head_byte(hash)
    if h2.size < 3
      # Few distinct bytes: a chain of `if` statements.
      h2.keys.sort.each do |k|
        if 0x80 < k
          @output.printf("#{bl}if (str[#{level}] < 0x%02X) { return #{@lv}; }\n", k)
        end
        h = h2[k]
        if h.keys.join =~ /^\x80*$/n
          @lv = h.values.first
        else
          @output.printf("#{bl}if (str[#{level}] == 0x%02X) {\n", k)
          gen_bc(h, level + 1)
          @output.puts bl + '}'
        end
      end
      @output.puts bl + "return #{@lv};"
    else
      @output.puts bl + "switch (str[#{level}]) {"
      lk = 0x80    # next byte value that has not received a `case` label yet
      br = true    # true while no `return` is pending for the open labels
      h2.keys.sort.each do |k|
        if lk < k
          # Bytes between the previous key and this one share the current type.
          (lk...k).each do |j|
            @output.printf("#{bl}case 0x%02X :\n", j)
          end
          br = false
        end
        unless br
          @output.puts bl + "  return #{@lv};"
          @output.puts bl + '  break;'
        end
        h = h2[k]
        @output.printf("#{bl}case 0x%02X :\n", k)
        if h.keys.join =~ /^\x80*$/n
          @lv = h.values.first
          br = false
        else
          gen_bc(h, level + 1)
          @output.puts bl + '  break;'
          br = true
        end
        lk = k + 1
      end
      @output.puts bl + 'default :'
      @output.puts bl + "  return #{@lv};"
      @output.puts bl + '  break;'
      @output.puts bl + '}'
    end
  end

  # Writes grn_nfkc<version>_decompose(); the body comes from gen_decompose.
  def generate_decompose(hash)
    @output.puts(<<-HEADER)

const char *
grn_nfkc#{@unicode_version}_decompose(const unsigned char *str)
{
    HEADER

    gen_decompose(hash, 0)

    @output.puts(<<-FOOTER)
  return 0;
}
    FOOTER
  end

  # Shared walker behind gen_decompose, gen_compose_sub2 and gen_compose_sub.
  #
  # Emits a test of <variable>[level] for every distinct first byte of the
  # keys in +hash+ (a single `if` when there is one byte, a `switch`
  # otherwise) and recurses on the remaining bytes. When +hash+ holds an
  # entry for the empty key, that value is handed to the block together
  # with the current indentation string and level before the byte tests
  # are emitted. +hash+ itself is not modified.
  def emit_byte_trie(hash, level, indent, variable, &on_terminal)
    bl = ' ' * ((level + indent) * 2)
    terminal = hash['']
    on_terminal.call(terminal, bl, level) if terminal

    grouped = group_by_head_byte(hash.reject { |key, _| key == '' })
    return if grouped.empty?

    if grouped.size == 1
      head, sub = grouped.first
      @output.printf("#{bl}if (#{variable}[#{level}] == 0x%02X) {\n", head)
      emit_byte_trie(sub, level + 1, indent, variable, &on_terminal)
      @output.puts bl + '}'
    else
      @output.puts "#{bl}switch (#{variable}[#{level}]) {"
      grouped.keys.sort.each do |head|
        @output.printf("#{bl}case 0x%02X :\n", head)
        emit_byte_trie(grouped[head], level + 1, indent, variable, &on_terminal)
        @output.puts("#{bl}  break;")
      end
      @output.puts bl + '}'
    end
  end

  # Emits the byte tests on `str` that return the decomposed string.
  def gen_decompose(hash, level)
    emit_byte_trie(hash, level, 0, 'str') do |dst, bl, _level|
      @output.puts "#{bl}return #{c_string_literal(dst)};"
    end
  end

  # Writes grn_nfkc<version>_compose().
  #
  # Every source pattern is split into its last character (the suffix) and
  # the characters before it (the prefix); the generated code tests the
  # suffix bytes first and the prefix bytes second.
  def generate_compose(compose_map)
    @output.puts(<<-HEADER)

const char *
grn_nfkc#{@unicode_version}_compose(const unsigned char *prefix, const unsigned char *suffix)
{
    HEADER
    suffix = {}
    compose_map.each do |src, dst|
      chars = src.chars
      if chars.size != 2
        STDERR.puts "caution: more than two chars in pattern #{chars.join('|')}"
      end
      s = chars.pop
      (suffix[s] ||= {})[chars.join] = dst
    end
    gen_compose_sub(suffix, 0)
    @output.puts(<<-FOOTER)
  return 0;
}
    FOOTER
  end

  # Emits the byte tests on `prefix`; +indent+ is the extra nesting depth
  # inherited from the enclosing suffix tests.
  def gen_compose_sub2(hash, level, indent)
    emit_byte_trie(hash, level, indent, 'prefix') do |dst, bl, _level|
      @output.puts "#{bl}return #{c_string_literal(dst)};"
    end
  end

  # Emits the byte tests on `suffix`; once a whole suffix has been matched
  # the prefix tests for that suffix are nested inside.
  def gen_compose_sub(hash, level)
    emit_byte_trie(hash, level, 0, 'suffix') do |prefix_map, _bl, matched_level|
      gen_compose_sub2(prefix_map, 0, matched_level)
    end
  end
end

# Variant of SwitchGenerator that switches on the leading bytes of a
# character and looks the last byte up in a generated static C array.
class TableGenerator < SwitchGenerator
  private

  # Common prefix of every generated C identifier.
  def name_prefix
    "grn_nfkc#{@unicode_version}_"
  end

  # Name of the lookup table for characters that share +common_bytes+
  # (an array of byte values) as their leading bytes.
  def table_name(type, common_bytes)
    suffix = common_bytes.collect {|byte| "%02x" % byte}.join("")
    "#{name_prefix}#{type}_table_#{suffix}"
  end

  # Name of the generated C function for +type+.
  def function_name(type)
    "#{name_prefix}#{type}"
  end

  # Builds a UTF-8 tagged string from an array of byte values.
  def char_from_bytes(bytes)
    bytes.pack("c*").force_encoding("UTF-8")
  end

  # Returns +value+ as a double-quoted C string literal with every byte
  # written as a lower-case \xhh escape, or +default+ when +value+ is nil
  # or false.
  def string_literal_or_default(value, default)
    return default unless value
    escaped_value = value.bytes.collect {|byte| "\\x%02x" % byte}.join("")
    "\"#{escaped_value}\""
  end

  # Writes one static lookup table per group of characters sharing the
  # same leading bytes.
  #
  # byte_size_groups:: hash of leading byte array => array of characters.
  #
  # The block receives each candidate character (every last byte between
  # the smallest and largest one in the group) and returns the C expression
  # to store for it. A group whose entries are all equal gets no table.
  def generate_char_convert_tables(type, return_type, byte_size_groups)
    space = return_type.end_with?("*") ? "" : " "
    byte_size_groups.keys.sort.each do |common_bytes|
      chars = byte_size_groups[common_bytes]
      lines = []
      all_values = []
      last_bytes = chars.collect {|char| char.bytes.last}
      (last_bytes.min..last_bytes.max).each_slice(8) do |slice|
        values = slice.collect do |last_byte|
          yield(char_from_bytes(common_bytes + [last_byte]))
        end
        all_values.concat(values)
        lines << ("  " + values.join(", "))
      end

      next if all_values.uniq.size == 1

      @output.puts(<<-TABLE_HEADER)

static #{return_type}#{space}#{table_name(type, common_bytes)}[] = {
      TABLE_HEADER
      @output.puts(lines.join(",\n"))
      @output.puts(<<-TABLE_FOOTER)
};
      TABLE_FOOTER
    end
  end

  # Writes the skeleton of a converter function: nested `switch`
  # statements over the leading bytes of +char_variable+, one `case` per
  # distinct leading byte, closed again at the end.
  #
  # For every group the block is called to emit the innermost code, with
  #   (:no_common_bytes, indent, chars, chars_bytes)            or
  #   (:have_common_bytes, indent, chars, chars_bytes, n, common_bytes)
  # where n is the index of the last byte of the group's characters.
  #
  # options[:internal]:: when true the function is `static inline`.
  def generate_char_convert_function(type,
                                     argument_list,
                                     char_variable,
                                     default,
                                     return_type,
                                     byte_size_groups,
                                     options={})
    modifier = options[:internal] ? "static inline " : ""
    @output.puts(<<-HEADER)

#{modifier}#{return_type}
#{function_name(type)}(#{argument_list})
{
    HEADER

    prev_common_bytes = []
    prev_n_common_bytes = 0
    first_group = true
    byte_size_groups.keys.sort.each do |common_bytes|
      chars = byte_size_groups[common_bytes]
      chars_bytes = chars.collect(&:bytes).sort
      n_common_bytes = 0
      if common_bytes.empty?
        indent = "  "
        yield(:no_common_bytes, indent, chars, chars_bytes)
      else
        if first_group
          # No single-byte group opened an `else {`, so open a plain
          # block for the closing brace in the footer to match.
          @output.puts(<<-BODY)
  {
          BODY
        end

        found_different_byte = false
        common_bytes.each_with_index do |common_byte, i|
          # Leading bytes shared with the previous group are already
          # covered by `case` labels that are still open.
          unless found_different_byte
            if prev_common_bytes[i] == common_byte
              n_common_bytes += 1
              next
            end
            found_different_byte = true
          end
          indent = "  " * i
          # TODO: The following code may be able to be simplified.
          if prev_common_bytes[i].nil?
            # The previous group had no byte at this depth.
            @output.puts(<<-BODY)
    #{indent}switch (#{char_variable}[#{i}]) {
            BODY
          elsif i < prev_n_common_bytes
            @output.puts(<<-BODY)
    #{indent}  default :
    #{indent}    break;
    #{indent}  }
    #{indent}  break;
            BODY
          elsif n_common_bytes < prev_n_common_bytes
            @output.puts(<<-BODY)
    #{indent}switch (#{char_variable}[#{i}]) {
            BODY
          else
            # Close the switches of the previous group that are nested
            # deeper than this group goes.
            prev_common_bytes.size.downto(common_bytes.size + 1) do |j|
              sub_indent = "  " * (j - 1)
              @output.puts(<<-BODY)
    #{indent}#{sub_indent}default :
    #{indent}#{sub_indent}  break;
    #{indent}#{sub_indent}}
    #{indent}#{sub_indent}break;
              BODY
            end
          end
          @output.puts(<<-BODY)
    #{indent}case #{"%#04x" % common_byte} :
          BODY
        end

        n = chars_bytes.first.size - 1
        indent = "    " + ("  " * common_bytes.size)
        yield(:have_common_bytes, indent, chars, chars_bytes, n, common_bytes)
      end

      prev_common_bytes = common_bytes
      prev_n_common_bytes = n_common_bytes
      first_group = false
    end

    # Close every switch still open for the last group, innermost first.
    (prev_common_bytes.size - 1).step(0, -1) do |i|
      indent = "  " * i
      @output.puts(<<-BODY)
    #{indent}default :
    #{indent}  break;
    #{indent}}
      BODY
      if i > 0
        @output.puts(<<-BODY)
    #{indent}break;
        BODY
      end
    end

    @output.puts(<<-FOOTER)
  }

  return #{default};
}
    FOOTER
  end

  # Writes the lookup tables and the converter function for +char_map+.
  #
  # type::          used in the table names.
  # function_type:: used in the function name.
  # char_map::      hash whose keys are the characters to convert.
  # default::       C expression returned for characters not covered.
  # return_type::   C return type of the function / element type of tables.
  #
  # The block maps a character to the C expression to return for it.
  def generate_char_converter(type,
                              function_type,
                              char_map,
                              default,
                              return_type,
                              options={},
                              &converter)
    # Group the characters by all their bytes except the last one.
    byte_size_groups = char_map.keys.group_by do |from|
      bytes = from.bytes
      bytes[0..-2]
    end

    generate_char_convert_tables(type,
                                 return_type,
                                 byte_size_groups,
                                 &converter)

    char_variable = "utf8"
    generate_char_convert_function(function_type,
                                   "const unsigned char *#{char_variable}",
                                   char_variable,
                                   default,
                                   return_type,
                                   byte_size_groups,
                                   options) do |state, *args|
      case state
      when :no_common_bytes
        indent, chars, chars_bytes = args
        if chars.size == 1
          char = chars[0]
          char_byte = chars_bytes.first.first
          value = yield(char)
          @output.puts(<<-BODY)
#{indent}if (#{char_variable}[0] < 0x80) {
#{indent}  if (#{char_variable}[0] == #{"%#04x" % char_byte}) {
#{indent}    return #{value};
#{indent}  } else {
#{indent}    return #{default};
#{indent}  }
#{indent}} else {
          BODY
        else
          min = chars_bytes.first.first
          max = chars_bytes.last.first
          # generate_char_convert_tables emits no table when every entry
          # is equal, so return that single value instead of indexing a
          # table that does not exist.
          all_values = (min..max).collect do |byte|
            yield(char_from_bytes([byte]))
          end
          if all_values.uniq.size == 1
            value = all_values.first
          else
            value = "#{table_name(type, [])}[#{char_variable}[0] - #{"%#04x" % min}]"
          end
          @output.puts(<<-BODY)
#{indent}if (#{char_variable}[0] < 0x80) {
#{indent}  if (#{char_variable}[0] >= #{"%#04x" % min} &&
#{indent}      #{char_variable}[0] <= #{"%#04x" % max}) {
#{indent}    return #{value};
#{indent}  } else {
#{indent}    return #{default};
#{indent}  }
#{indent}} else {
          BODY
        end
      when :have_common_bytes
        indent, chars, chars_bytes, n, common_bytes = args
        if chars.size == 1
          char = chars[0]
          char_byte = chars_bytes.first.last
          value = yield(char)
          @output.puts(<<-BODY)
#{indent}if (#{char_variable}[#{n}] == #{"%#04x" % char_byte}) {
#{indent}  return #{value};
#{indent}}
#{indent}break;
          BODY
        else
          min = chars_bytes.first.last
          max = chars_bytes.last.last
          all_values = (min..max).collect do |last_byte|
            yield(char_from_bytes(common_bytes + [last_byte]))
          end
          if all_values.uniq.size == 1
            value = all_values.first
          else
            value = "#{table_name(type, common_bytes)}[#{char_variable}[#{n}] - #{"%#04x" % min}]"
          end
          last_n_bits_for_char_in_utf8 = 6
          max_n_chars_in_byte = 2 ** last_n_bits_for_char_in_utf8
          if all_values.size == max_n_chars_in_byte
            # All 64 possible last-byte values are covered, so no range
            # check is emitted.
            @output.puts(<<-BODY)
#{indent}return #{value};
            BODY
          else
            @output.puts(<<-BODY)
#{indent}if (#{char_variable}[#{n}] >= #{"%#04x" % min} &&
#{indent}    #{char_variable}[#{n}] <= #{"%#04x" % max}) {
#{indent}  return #{value};
#{indent}}
#{indent}break;
            BODY
          end
        end
      end
    end
  end

  # Writes the char type tables and function.
  #
  # Each key of +block_codes+ starts a run of code points that carries its
  # type up to (excluding) the next key; runs whose type is the default
  # are not stored.
  def generate_blockcode_char_type(block_codes)
    default = "GRN_CHAR_OTHERS"

    char_types = {}
    current_type = default
    prev_char = nil
    block_codes.keys.sort.each do |char|
      type = block_codes[char]
      if current_type != default
        prev_code_point = prev_char.codepoints[0]
        code_point = char.codepoints[0]
        (prev_code_point...code_point).each do |target_code_point|
          target_char = [target_code_point].pack("U*")
          char_types[target_char] = current_type
        end
      end
      current_type = type
      prev_char = char
    end
    unless current_type == default
      # The last run would have to extend to U+10FFFF; not implemented.
      raise "TODO: Consider the max unicode character"
    end

    generate_char_converter("char_type",
                            "char_type",
                            char_types,
                            default,
                            "grn_char_type") do |char|
      char_types[char] || default
    end
  end

  # Writes the decompose tables and function.
  def generate_decompose(decompose_map)
    default = "NULL"
    generate_char_converter("decompose",
                            "decompose",
                            decompose_map,
                            default,
                            "const char *") do |from|
      string_literal_or_default(decompose_map[from], default)
    end
  end

  # Writes one internal prefix converter per suffix character, followed by
  # the public compose function that dispatches on the suffix bytes.
  #
  # Nothing is written when a source pattern is not exactly two characters
  # long; a caution is printed to STDERR instead.
  def generate_compose(compose_map)
    suffix_char_map = {}
    compose_map.each do |source, destination|
      chars = source.chars
      if chars.size != 2
        STDERR.puts "caution: more than two chars in pattern #{chars.join('|')}"
        return
      end
      prefix, suffix = chars
      suffix_char_map[suffix] ||= {}
      suffix_char_map[suffix][prefix] = destination
    end

    suffix_char_map.each do |suffix, prefix_char_map|
      suffix_bytes = suffix.bytes.collect {|byte| "%02x" % byte}.join("")
      default = "NULL"
      generate_char_converter("compose_prefix_#{suffix_bytes}",
                              "compose_prefix_#{suffix_bytes}",
                              prefix_char_map,
                              default,
                              "const char *",
                              :internal => true) do |prefix|
        string_literal_or_default(prefix_char_map[prefix], default)
      end
    end


    char_variable = "suffix_utf8"
    argument_list =
      "const unsigned char *prefix_utf8, " +
      "const unsigned char *#{char_variable}"
    default = "NULL"
    byte_size_groups = suffix_char_map.keys.group_by do |from|
      bytes = from.bytes
      bytes[0..-2]
    end
    generate_char_convert_function("compose",
                                   argument_list,
                                   char_variable,
                                   default,
                                   "const char *",
                                   byte_size_groups) do |state, *args|
      # Both states emit the same dispatch; only the index of the byte
      # being switched on differs.
      case state
      when :no_common_bytes
        indent, chars = args
        index = 0
      when :have_common_bytes
        indent, chars, _chars_bytes, index = args
      else
        next
      end
      @output.puts(<<-BODY)
#{indent}switch (#{char_variable}[#{index}]) {
      BODY
      chars.each do |char|
        suffix_bytes = char.bytes.collect {|byte| "%02x" % byte}.join("")
        prefix_type = "compose_prefix_#{suffix_bytes}"
        @output.puts(<<-BODY)
#{indent}case #{"%#04x" % char.bytes.last} :
#{indent}  return #{function_name(prefix_type)}(prefix_utf8);
        BODY
      end
      @output.puts(<<-BODY)
#{indent}default :
#{indent}  return #{default};
#{indent}}
#{indent}break;
      BODY
    end
  end

  # Converts { char => value } into nested hashes keyed by byte value,
  # one level per byte of the character.
  def to_bytes_map(char_map)
    bytes_map = {}
    char_map.each_key do |from|
      parent = bytes_map
      from.bytes[0..-2].each do |byte|
        parent[byte] ||= {}
        parent = parent[byte]
      end
      parent[from.bytes.last] = char_map[from]
    end
    bytes_map
  end
end

# Runs `./icudump --<option>` and returns a hash of character => code.
#
# Each output line is split on tabs; the first field is a colon-separated
# list of hex byte values forming the character and the third field is the
# code stored for it. The second field is ignored.
#
# The pipe is opened in block form so it is closed (and the child reaped)
# when reading finishes, and the command is passed as an argument array so
# that +option+ is never interpreted by a shell.
def create_bc(option)
  bc = {}
  IO.popen(["./icudump", "--#{option}"]) do |io|
    io.each_line do |line|
      src, _, code = line.chomp.split("\t")
      str = src.split(':').collect(&:hex).pack("c*")
      str.force_encoding("UTF-8")
      bc[str] = code
    end
  end
  bc
end

# Inserts +dst+ into the nested-hash trie +hash+ under the path given by
# the elements of +src+.
#
# One hash level is created per element; the value is stored under the
# +nil+ key of the final level. +src+ is consumed (emptied) in the
# process. Returns +dst+.
def ccpush(hash, src, dst)
  node = hash
  while (head = src.shift)
    node = (node[head] ||= {})
  end
  # +head+ is the falsy value that ended the walk (nil once src is empty).
  node[head] = dst
end

# Replaces one occurrence of a trie pattern in +str+.
#
# +hash+ is a trie as built by ccpush: one nested hash per character with
# the replacement stored under the +nil+ key. The match starting at the
# smallest index wins, and among matches starting there the longest one.
# Returns the rewritten string, or +str+ itself when nothing matches.
def subst(hash, str)
  chars = str.chars
  chars.each_index do |start|
    node = hash
    candidate = nil
    (start...chars.size).each do |stop|
      node = node[chars[stop]]
      break unless node
      replacement = node[nil]
      next unless replacement
      # A longer match found later overwrites this candidate.
      candidate = chars[0, start].join("") + replacement + chars[(stop + 1)..-1].join("")
    end
    return candidate if candidate
  end
  str
end

# Normalizes +dst+ and records decompose[src] = dst when the two differ.
#
# Steps, in order:
# 1. +dst+ is lower-cased unless $case_sensitive is set. A copy is made,
#    so the caller's string is left untouched.
# 2. subst is applied with the trie +cc+ until the string stops changing.
# 3. Unless $keep_space is set, leading spaces are removed when a
#    non-space character follows them.
def map_entry(decompose, cc, src, dst)
  dst = dst.downcase unless $case_sensitive
  loop do
    replaced = subst(cc, dst)
    break if replaced == dst
    dst = replaced
  end
  unless $keep_space
    dst = $1 if dst =~ /^ +([^ ].*)$/
  end
  decompose[src] = dst if src != dst
end

# Builds the decompose map (source string => normalized string).
#
# 1. `./icudump --cc` lines (fields: ignored, source, destination) are
#    loaded into a trie with ccpush.
# 2. `./icudump --nfkd` lines (same field layout) are added through
#    map_entry.
# 3. When CUSTOM_RULE_PATH is readable, its "source<TAB>destination" lines
#    are added the same way; lines lacking either field are skipped.
# 4. Unless $case_sensitive is set, 'A'..'Z' are mapped to lower case.
#
# Pipes and the rule file are opened in block form so they are closed once
# they have been read.
def create_decompose_map
  cc = {}
  IO.popen(["./icudump", "--cc"]) do |io|
    io.each_line do |line|
      _, src, dst = line.chomp.split("\t")
      # NOTE(review): cc is a trie keyed by single characters, so this
      # lookup only hits for sources already present as a top-level key
      # and then compares a nested hash with dst; confirm the intent.
      if cc[src]
        STDERR.puts "caution: ambiguous mapping #{src}|#{cc[src]}|#{dst}" if cc[src] != dst
      end
      ccpush(cc, src.chars, dst)
    end
  end

  decompose_map = {}
  IO.popen(["./icudump", "--nfkd"]) do |io|
    io.each_line do |line|
      _, src, dst = line.chomp.split("\t")
      map_entry(decompose_map, cc, src, dst)
    end
  end

  if File.readable?(CUSTOM_RULE_PATH)
    File.foreach(CUSTOM_RULE_PATH) do |line|
      src, dst = line.chomp.split("\t")
      # Blank or malformed lines carry no rule.
      next unless src && dst
      map_entry(decompose_map, cc, src, dst)
    end
  end

  unless $case_sensitive
    ('A'..'Z').each do |c|
      decompose_map[c] = c.downcase
    end
  end
  decompose_map
end

# Builds the compose map (source sequence => composed string).
#
# `./icudump --cc` lines (fields: ignored, source, destination) are read;
# every character of the source and the destination as a whole are first
# replaced by their entry in +decompose_map+ when one exists. Then, until
# a pass makes no substitution, every proper sub-sequence of a source that
# is itself a key of the map is replaced by its composed form.
#
# The pipe is opened in block form so it is closed after reading.
def create_compose_map(decompose_map)
  cc = {}
  IO.popen(["./icudump", "--cc"]) do |io|
    io.each_line do |line|
      _, src, dst = line.chomp.split("\t")
      src = src.chars.collect { |c| decompose_map[c] || c }.join
      dst = decompose_map[dst] || dst
      if cc[src] && cc[src] != dst
        STDERR.puts("caution: inconsitent mapping '#{src}' => '#{cc[src]}'|'#{dst}'")
      end
      cc[src] = dst if src != dst
    end
  end

  loop do
    noccur = 0
    cc2 = {}
    cc.each do |src, dst|
      src2 = src
      chars = src.chars
      l = chars.size - 1
      (0..l).each do |i|
        (i..l).each do |j|
          # Skip the whole source; only proper sub-sequences are examined.
          next if i == 0 && j == l
          str = chars[i..j].join
          if decompose_map[str]
            STDERR.printf("caution: recursive mapping '%s'=>'%s'\n",
                          str, decompose_map[str])
          end
          if cc[str]
            # Always rebuilt from the original characters, so the last
            # matching sub-sequence decides the rewritten source.
            src2 = (i > 0 ? chars[0..i-1].join : '') + cc[str] + (j < l ? chars[j+1..l].join : '')
            noccur += 1
          end
        end
      end
      cc2[src2] = dst if src2 != dst
    end
    cc = cc2
    STDERR.puts("substituted #{noccur} patterns.")
    break if noccur == 0
    STDERR.puts('try again..')
  end
  cc
end

######## main #######

# Command line:
#   --impl=switch | --impl=table   select the generator class
#   an argument starting with optional dashes and "c"   sets $case_sensitive
#   an argument starting with optional dashes and "s"   sets $keep_space
#
# The --impl= options are tested first and the patterns are anchored at the
# start of the argument, so that "--impl=switch" (which contains both "s"
# and "c") is no longer mistaken for one of the flags.
generator_class = SwitchGenerator
ARGV.each do |arg|
  case arg
  when "--impl=switch"
    generator_class = SwitchGenerator
  when "--impl=table"
    generator_class = TableGenerator
  when /\A-*c/i
    $case_sensitive = true
  when /\A-*s/i
    $keep_space = true
  end
end

STDERR.puts('compiling icudump')
unless system('cc -Wall -O3 -o icudump -I/tmp/local/include -L/tmp/local/lib icudump.c -licuuc -licui18n')
  # Without a usable binary every later step would fail; an existing
  # binary from an earlier build is still accepted.
  abort('failed to compile icudump') unless File.executable?('./icudump')
  STDERR.puts('warning: failed to compile icudump; using the existing binary')
end

STDERR.puts('getting Unicode version')
unicode_version = `./icudump --version`.strip.gsub(".", "")
abort('failed to get Unicode version from icudump') if unicode_version.empty?

STDERR.puts('creating bc..')
bc = create_bc("gc")

STDERR.puts('creating decompose map..')
decompose_map = create_decompose_map()

STDERR.puts('creating compose map..')
compose_map = create_compose_map(decompose_map)

# The generated source is written to nfkc<version>.c in the current directory.
File.open("nfkc#{unicode_version}.c", "w") do |output|
  output.puts(<<-HEADER)
/* -*- c-basic-offset: 2 -*- */
/*
  Copyright(C) 2010-2016 Brazil

  This library is free software; you can redistribute it and/or
  modify it under the terms of the GNU Lesser General Public
  License version 2.1 as published by the Free Software Foundation.

  This library is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  Lesser General Public License for more details.

  You should have received a copy of the GNU Lesser General Public
  License along with this library; if not, write to the Free Software
  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1335  USA
*/

/*
  Don't edit this file by hand. it generated automatically by nfkc.rb.
*/

#include "grn.h"
#include "grn_nfkc.h"
#include <groonga/nfkc.h>

#ifdef GRN_WITH_NFKC
  HEADER

  generator = generator_class.new(unicode_version, output)
  generator.generate(bc, decompose_map, compose_map)

  output.puts(<<-FOOTER)

#endif /* GRN_WITH_NFKC */

  FOOTER
end