mirror of
https://github.com/MariaDB/server.git
synced 2025-02-21 04:43:29 +01:00
897 lines
24 KiB
Ruby
Executable file
897 lines
24 KiB
Ruby
Executable file
#!/usr/bin/env ruby
|
|
# -*- coding: utf-8 -*-
|
|
#
|
|
# Copyright(C) 2010-2016 Brazil
|
|
#
|
|
# This library is free software; you can redistribute it and/or
|
|
# modify it under the terms of the GNU Lesser General Public
|
|
# License version 2.1 as published by the Free Software Foundation.
|
|
#
|
|
# This library is distributed in the hope that it will be useful,
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
# Lesser General Public License for more details.
|
|
#
|
|
# You should have received a copy of the GNU Lesser General Public
|
|
# License along with this library; if not, write to the Free Software
|
|
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
|
|
|
|
# Path of the optional tab-separated custom rule file; it is read by
# create_decompose_map only when File.readable? reports it as readable.
CUSTOM_RULE_PATH = 'nfkc-custom-rules.txt'
|
|
|
|
# Writes the three NFKC helper functions (char type, decompose, compose) as C
# source made of nested `if`/`switch` statements that inspect a character one
# byte at a time.
class SwitchGenerator
  # unicode_version:: text embedded in the generated function names
  #                   (grn_nfkc<unicode_version>_...).
  # output::          IO-like sink; only #puts, #print and #printf are used.
  def initialize(unicode_version, output)
    @unicode_version = unicode_version
    @output = output
  end

  # Emits the char type, decompose and compose functions, in that order,
  # reporting progress on STDERR before each one.
  #
  # bc::            Hash of character string => char type code.
  # decompose_map:: Hash of source string => replacement string.
  # compose_map::   Hash of source string => composed string.
  def generate(bc, decompose_map, compose_map)
    STDERR.puts('generating char type code..')
    generate_blockcode_char_type(bc)
    STDERR.puts('generating decompose code..')
    generate_decompose(decompose_map)
    STDERR.puts('generating compose code..')
    generate_compose(compose_map)
  end

  private

  # Groups the entries of +hash+ by the first byte of their (string) key.
  # Returns { first_byte => { remaining_bytes => value } }, where the
  # remaining bytes are an ASCII-8BIT string. Keys are duplicated before the
  # encoding is changed, so the caller's keys are left untouched.
  def split_by_head_byte(hash)
    grouped = {}
    hash.each do |key, val|
      binary = key.dup
      binary.force_encoding("ASCII-8BIT")
      (grouped[binary.bytes[0]] ||= {})[binary[1..-1]] = val
    end
    grouped
  end

  # True-ish when the concatenated keys of +children+ consist only of 0x80
  # bytes (this includes the empty string). gen_bc treats such a node as a
  # leaf and does not descend into it.
  def bc_leaf?(children)
    children.keys.join =~ /^\x80*$/n
  end

  # Renders every byte of +string+ as an upper-case \xNN escape for use
  # inside a C string literal.
  def escape_bytes(string)
    string.each_byte.map { |b| format('\x%02X', b) }.join
  end

  # Emits grn_nfkc<version>_char_type(). @lv holds the value that gen_bc
  # writes into the `return` statements it emits; it starts at 0.
  def generate_blockcode_char_type(bc)
    @output.puts(<<-HEADER)

grn_char_type
grn_nfkc#{@unicode_version}_char_type(const unsigned char *str)
{
    HEADER

    @lv = 0
    gen_bc(bc, 0)

    @output.puts(<<-FOOTER)
return -1;
}
    FOOTER
  end

  # Recursively emits the body of the char type function for byte position
  # +level+. With fewer than three distinct bytes at this position a chain
  # of `if` statements is written, otherwise a `switch`. Leaf nodes only
  # update @lv; the updated value appears in the next emitted `return`.
  def gen_bc(hash, level)
    bl = ' ' * (level * 2)
    by_head = split_by_head_byte(hash)
    if by_head.size < 3
      by_head.keys.sort.each do |byte|
        if 0x80 < byte
          @output.printf("#{bl}if (str[#{level}] < 0x%02X) { return #{@lv}; }\n", byte)
        end
        children = by_head[byte]
        if bc_leaf?(children)
          @lv = children.values.first
        else
          @output.printf("#{bl}if (str[#{level}] == 0x%02X) {\n", byte)
          gen_bc(children, level + 1)
          @output.puts bl + '}'
        end
      end
      @output.puts bl + "return #{@lv};"
    else
      @output.puts bl + "switch (str[#{level}]) {"
      next_byte = 0x80
      # True while the previously emitted case has already been terminated.
      closed = true
      by_head.keys.sort.each do |byte|
        if next_byte < byte
          # Bytes not present in the map fall through to the current value.
          (next_byte...byte).each do |gap|
            @output.printf("#{bl}case 0x%02X :\n", gap)
          end
          closed = false
        end
        unless closed
          @output.puts bl + " return #{@lv};"
          @output.puts bl + ' break;'
        end
        children = by_head[byte]
        @output.printf("#{bl}case 0x%02X :\n", byte)
        if bc_leaf?(children)
          @lv = children.values.first
          closed = false
        else
          gen_bc(children, level + 1)
          @output.puts bl + ' break;'
          closed = true
        end
        next_byte = byte + 1
      end
      @output.puts bl + 'default :'
      @output.puts bl + " return #{@lv};"
      @output.puts bl + ' break;'
      @output.puts bl + '}'
    end
  end

  # Emits grn_nfkc<version>_decompose().
  def generate_decompose(hash)
    @output.puts(<<-HEADER)

const char *
grn_nfkc#{@unicode_version}_decompose(const unsigned char *str)
{
    HEADER

    gen_decompose(hash, 0)

    @output.puts(<<-FOOTER)
return 0;
}
    FOOTER
  end

  # Recursively emits the byte dispatch of the decompose function for byte
  # position +level+. An entry whose key is '' means all bytes of a source
  # string were matched, so its value is returned as an escaped literal.
  # NOTE: the '' entry is deleted from +hash+ (the argument is mutated).
  def gen_decompose(hash, level)
    bl = ' ' * ((level + 0) * 2)
    if hash['']
      @output.puts "#{bl}return \"#{escape_bytes(hash[''])}\";"
      hash.delete('')
    end
    return if hash.empty?

    by_head = split_by_head_byte(hash)
    if by_head.size == 1
      by_head.each do |byte, children|
        @output.printf("#{bl}if (str[#{level}] == 0x%02X) {\n", byte)
        gen_decompose(children, level + 1)
        @output.puts bl + '}'
      end
    else
      @output.puts "#{bl}switch (str[#{level}]) {"
      by_head.keys.sort.each do |byte|
        @output.printf("#{bl}case 0x%02X :\n", byte)
        gen_decompose(by_head[byte], level + 1)
        @output.puts("#{bl} break;")
      end
      @output.puts bl + '}'
    end
  end

  # Emits grn_nfkc<version>_compose(). Each source string is split into its
  # last character (the suffix) and everything before it (the prefix); the
  # generated code dispatches on the suffix bytes first, then on the prefix.
  # A source that does not have exactly two characters is reported on STDERR
  # but still processed.
  def generate_compose(compose_map)
    @output.puts(<<-HEADER)

const char *
grn_nfkc#{@unicode_version}_compose(const unsigned char *prefix, const unsigned char *suffix)
{
    HEADER
    by_suffix = {}
    compose_map.each do |src, dst|
      chars = src.chars
      if chars.size != 2
        STDERR.puts "caution: more than two chars in pattern #{chars.join('|')}"
      end
      last_char = chars.pop
      (by_suffix[last_char] ||= {})[chars.join] = dst
    end
    gen_compose_sub(by_suffix, 0)
    @output.puts(<<-FOOTER)
return 0;
}
    FOOTER
  end

  # Recursively emits the dispatch over the bytes of `prefix`.
  #
  # hash::   remaining prefix bytes => composed string.
  # level::  byte position inside `prefix`.
  # indent:: extra nesting depth inherited from the suffix dispatch.
  # NOTE: the '' entry is deleted from +hash+ (the argument is mutated).
  def gen_compose_sub2(hash, level, indent)
    bl = ' ' * ((level + indent + 0) * 2)
    if hash['']
      @output.print "#{bl}return \""
      @output.print escape_bytes(hash[''])
      @output.puts "\";"
      hash.delete('')
    end
    return if hash.empty?

    by_head = split_by_head_byte(hash)

    if by_head.size == 1
      by_head.each do |byte, children|
        @output.printf("#{bl}if (prefix[#{level}] == 0x%02X) {\n", byte)
        gen_compose_sub2(children, level + 1, indent)
        @output.puts bl + '}'
      end
    else
      @output.puts "#{bl}switch (prefix[#{level}]) {"
      by_head.keys.sort.each do |byte|
        @output.printf("#{bl}case 0x%02X :\n", byte)
        gen_compose_sub2(by_head[byte], level + 1, indent)
        @output.puts("#{bl} break;")
      end
      @output.puts bl + '}'
    end
  end

  # Recursively emits the dispatch over the bytes of `suffix`. Once a whole
  # suffix has been matched (key ''), the prefix dispatch for that suffix is
  # emitted through gen_compose_sub2.
  # NOTE: the '' entry is deleted from +hash+ (the argument is mutated).
  def gen_compose_sub(hash, level)
    bl = ' ' * ((level + 0) * 2)
    if hash['']
      gen_compose_sub2(hash[''], 0, level)
      hash.delete('')
    end
    return if hash.empty?

    by_head = split_by_head_byte(hash)
    if by_head.size == 1
      by_head.each do |byte, children|
        @output.printf("#{bl}if (suffix[#{level}] == 0x%02X) {\n", byte)
        gen_compose_sub(children, level + 1)
        @output.puts bl + '}'
      end
    else
      @output.puts "#{bl}switch (suffix[#{level}]) {"
      by_head.keys.sort.each do |byte|
        @output.printf("#{bl}case 0x%02X :\n", byte)
        gen_compose_sub(by_head[byte], level + 1)
        @output.puts("#{bl} break;")
      end
      @output.puts bl + '}'
    end
  end
end
|
|
|
|
# Variant of SwitchGenerator that emits static C lookup tables indexed by the
# last byte of a character, plus functions that switch over the leading
# bytes and then index into those tables.
class TableGenerator < SwitchGenerator
  private

  # Common prefix of every generated C identifier.
  def name_prefix
    "grn_nfkc#{@unicode_version}_"
  end

  # Name of the lookup table for +type+ and the given leading bytes; the
  # bytes are appended as lower-case hex.
  def table_name(type, common_bytes)
    suffix = common_bytes.map { |byte| "%02x" % byte }.join("")
    "#{name_prefix}#{type}_table_#{suffix}"
  end

  # Name of the generated C function for +type+.
  def function_name(type)
    "#{name_prefix}#{type}"
  end

  # Returns +string+ as a double-quoted C literal with every byte written as
  # a lower-case \xNN escape.
  def c_string_literal(string)
    escaped = string.bytes.map { |byte| "\\x%02x" % byte }.join("")
    "\"#{escaped}\""
  end

  # Emits one `static` table per group of characters sharing the same
  # leading bytes. The table covers every last byte from the smallest to the
  # largest one seen in the group; each entry is obtained by yielding the
  # candidate character to the caller's block. A group whose entries are all
  # identical gets no table.
  def generate_char_convert_tables(type, return_type, byte_size_groups)
    # Pointer types already end in "*", so no separating space is needed.
    space = return_type.end_with?("*") ? "" : " "
    byte_size_groups.keys.sort.each do |common_bytes|
      chars = byte_size_groups[common_bytes]
      lines = []
      all_values = []
      last_bytes = chars.map { |char| char.bytes.last }
      (last_bytes.min..last_bytes.max).each_slice(8) do |slice|
        values = slice.map do |last_byte|
          char = (common_bytes + [last_byte]).pack("c*")
          char.force_encoding("UTF-8")
          yield(char)
        end
        all_values.concat(values)
        lines << (" " + values.join(", "))
      end

      next if all_values.uniq.size == 1

      @output.puts(<<-TABLE_HEADER)

static #{return_type}#{space}#{table_name(type, common_bytes)}[] = {
      TABLE_HEADER
      @output.puts(lines.join(",\n"))
      @output.puts(<<-TABLE_FOOTER)
};
      TABLE_FOOTER
    end
  end

  # Emits the C function function_name(type). Groups are visited in sorted
  # order of their leading bytes; for each group the nested `switch`/`case`
  # scaffolding is opened or closed as needed relative to the previous
  # group, then the block is called to emit the innermost code:
  #
  #   yield(:no_common_bytes, indent, chars, chars_bytes)
  #   yield(:have_common_bytes, indent, chars, chars_bytes, n, common_bytes)
  #
  # where +n+ is the index of the last byte of the characters in the group.
  # options[:internal] makes the function `static inline`.
  def generate_char_convert_function(type,
                                     argument_list,
                                     char_variable,
                                     default,
                                     return_type,
                                     byte_size_groups,
                                     options={})
    modifier = options[:internal] ? "static inline " : ""
    @output.puts(<<-HEADER)

#{modifier}#{return_type}
#{function_name(type)}(#{argument_list})
{
    HEADER

    prev_common_bytes = []
    prev_n_common_bytes = 0
    first_group = true
    byte_size_groups.keys.sort.each do |common_bytes|
      chars = byte_size_groups[common_bytes]
      chars_bytes = chars.map(&:bytes).sort
      # Number of leading bytes this group shares with the previous group.
      n_common_bytes = 0
      if common_bytes.empty?
        indent = " "
        yield(:no_common_bytes, indent, chars, chars_bytes)
      else
        if first_group
          @output.puts(<<-BODY)
{
          BODY
        end

        found_different_byte = false
        common_bytes.each_with_index do |common_byte, i|
          unless found_different_byte
            if prev_common_bytes[i] == common_byte
              n_common_bytes += 1
              next
            end
            found_different_byte = true
          end
          indent = " " * i
          # TODO: The following code may be able to be simplified.
          if prev_common_bytes[i].nil?
            @output.puts(<<-BODY)
#{indent}switch (#{char_variable}[#{i}]) {
            BODY
          elsif i < prev_n_common_bytes
            @output.puts(<<-BODY)
#{indent} default :
#{indent} break;
#{indent} }
#{indent} break;
            BODY
          elsif n_common_bytes < prev_n_common_bytes
            @output.puts(<<-BODY)
#{indent}switch (#{char_variable}[#{i}]) {
            BODY
          else
            prev_common_bytes.size.downto(common_bytes.size + 1) do |j|
              sub_indent = " " * (j - 1)
              @output.puts(<<-BODY)
#{indent}#{sub_indent}default :
#{indent}#{sub_indent} break;
#{indent}#{sub_indent}}
#{indent}#{sub_indent}break;
              BODY
            end
          end
          @output.puts(<<-BODY)
#{indent}case #{"%#04x" % common_byte} :
          BODY
        end

        n = chars_bytes.first.size - 1
        indent = " " + (" " * common_bytes.size)
        yield(:have_common_bytes, indent, chars, chars_bytes, n, common_bytes)
      end

      prev_common_bytes = common_bytes
      prev_n_common_bytes = n_common_bytes
      first_group = false
    end

    # Close every switch that is still open, innermost first.
    (prev_common_bytes.size - 1).downto(0) do |i|
      indent = " " * i
      @output.puts(<<-BODY)
#{indent}default :
#{indent} break;
#{indent}}
      BODY
      if i > 0
        @output.puts(<<-BODY)
#{indent}break;
        BODY
      end
    end

    @output.puts(<<-FOOTER)
}

return #{default};
}
    FOOTER
  end

  # Emits the lookup tables and the lookup function for +char_map+.
  #
  # type::          used to build the table names.
  # function_type:: used to build the function name.
  # char_map::      Hash whose keys are the characters to handle; they are
  #                 grouped by all of their bytes except the last one.
  # default::       C expression returned when nothing matches.
  # return_type::   C return type of the function / element type of tables.
  # options::       forwarded to generate_char_convert_function.
  # converter::     block mapping a character to the C expression to return.
  def generate_char_converter(type,
                              function_type,
                              char_map,
                              default,
                              return_type,
                              options={},
                              &converter)
    byte_size_groups = char_map.keys.group_by do |from|
      from.bytes[0..-2]
    end

    generate_char_convert_tables(type,
                                 return_type,
                                 byte_size_groups,
                                 &converter)

    char_variable = "utf8"
    generate_char_convert_function(function_type,
                                   "const unsigned char *#{char_variable}",
                                   char_variable,
                                   default,
                                   return_type,
                                   byte_size_groups,
                                   options) do |state, *args|
      case state
      when :no_common_bytes
        indent, chars, chars_bytes = args
        if chars.size == 1
          char = chars[0]
          char_byte = chars_bytes.first.first
          value = yield(char)
          @output.puts(<<-BODY)
#{indent}if (#{char_variable}[0] < 0x80) {
#{indent} if (#{char_variable}[0] == #{"%#04x" % char_byte}) {
#{indent} return #{value};
#{indent} } else {
#{indent} return #{default};
#{indent} }
#{indent}} else {
          BODY
        else
          min = chars_bytes.first.first
          max = chars_bytes.last.first
          # generate_char_convert_tables emits no table when every value in
          # the range is identical, so only reference the table when it
          # exists (same rule as the :have_common_bytes branch below).
          all_values = (min..max).map do |byte|
            char = [byte].pack("c*")
            char.force_encoding("UTF-8")
            yield(char)
          end
          if all_values.uniq.size == 1
            value = all_values.first
          else
            value = "#{table_name(type, [])}[#{char_variable}[0] - #{"%#04x" % min}]"
          end
          @output.puts(<<-BODY)
#{indent}if (#{char_variable}[0] < 0x80) {
#{indent} if (#{char_variable}[0] >= #{"%#04x" % min} &&
#{indent} #{char_variable}[0] <= #{"%#04x" % max}) {
#{indent} return #{value};
#{indent} } else {
#{indent} return #{default};
#{indent} }
#{indent}} else {
          BODY
        end
      when :have_common_bytes
        indent, chars, chars_bytes, n, common_bytes = args
        if chars.size == 1
          char = chars[0]
          char_byte = chars_bytes.first.last
          value = yield(char)
          @output.puts(<<-BODY)
#{indent}if (#{char_variable}[#{n}] == #{"%#04x" % char_byte}) {
#{indent} return #{value};
#{indent}}
#{indent}break;
          BODY
        else
          min = chars_bytes.first.last
          max = chars_bytes.last.last
          all_values = (min..max).map do |last_byte|
            char = (common_bytes + [last_byte]).pack("c*")
            char.force_encoding("UTF-8")
            yield(char)
          end
          if all_values.uniq.size == 1
            value = all_values.first
          else
            value = "#{table_name(type, common_bytes)}[#{char_variable}[#{n}] - #{"%#04x" % min}]"
          end
          last_n_bits_for_char_in_utf8 = 6
          max_n_chars_in_byte = 2 ** last_n_bits_for_char_in_utf8
          if all_values.size == max_n_chars_in_byte
            # The range spans 2**6 consecutive last bytes, so the bounds
            # check is omitted.
            @output.puts(<<-BODY)
#{indent}return #{value};
            BODY
          else
            @output.puts(<<-BODY)
#{indent}if (#{char_variable}[#{n}] >= #{"%#04x" % min} &&
#{indent} #{char_variable}[#{n}] <= #{"%#04x" % max}) {
#{indent} return #{value};
#{indent}}
#{indent}break;
            BODY
          end
        end
      end
    end
  end

  # Emits the char type tables and function. +block_codes+ maps a character
  # to a type; when the type in effect is not the default, every code point
  # from the previous key up to (but excluding) the next key is assigned
  # that type.
  #
  # Raises RuntimeError when the type of the last key is not the default.
  def generate_blockcode_char_type(block_codes)
    default = "GRN_CHAR_OTHERS"

    char_types = {}
    current_type = default
    prev_char = nil
    block_codes.keys.sort.each do |char|
      type = block_codes[char]
      if current_type != default
        prev_code_point = prev_char.codepoints[0]
        code_point = char.codepoints[0]
        (prev_code_point...code_point).each do |target_code_point|
          target_char = [target_code_point].pack("U*")
          char_types[target_char] = current_type
        end
      end
      current_type = type
      prev_char = char
    end
    unless current_type == default
      # Not implemented: the range from prev_char up to "\u{10ffff}" would
      # have to be assigned current_type here.
      raise "TODO: Consider the max unicode character"
    end

    generate_char_converter("char_type",
                            "char_type",
                            char_types,
                            default,
                            "grn_char_type") do |char|
      char_types[char] || default
    end
  end

  # Emits the decompose tables and function; characters without an entry in
  # +decompose_map+ yield NULL.
  def generate_decompose(decompose_map)
    default = "NULL"
    generate_char_converter("decompose",
                            "decompose",
                            decompose_map,
                            default,
                            "const char *") do |from|
      to = decompose_map[from]
      to ? c_string_literal(to) : default
    end
  end

  # Emits, inside a `switch` on byte +index+ of +char_variable+, one `case`
  # per suffix character that forwards to that suffix's compose_prefix
  # function, followed by a `default` returning +default+.
  def emit_compose_dispatch(indent, char_variable, index, chars, default)
    @output.puts(<<-BODY)
#{indent}switch (#{char_variable}[#{index}]) {
    BODY
    chars.each do |char|
      suffix_bytes = char.bytes.map { |byte| "%02x" % byte }.join("")
      prefix_function = function_name("compose_prefix_#{suffix_bytes}")
      @output.puts(<<-BODY)
#{indent}case #{"%#04x" % char.bytes.last} :
#{indent} return #{prefix_function}(prefix_utf8);
      BODY
    end
    @output.puts(<<-BODY)
#{indent}default :
#{indent} return #{default};
#{indent}}
#{indent}break;
    BODY
  end

  # Emits one internal compose_prefix_<suffix bytes> function (with tables)
  # per suffix character, then the public compose function that dispatches
  # on the suffix bytes.
  #
  # If any source in +compose_map+ does not consist of exactly two
  # characters, a caution is printed to STDERR and the method returns
  # without emitting anything.
  def generate_compose(compose_map)
    suffix_char_map = {}
    compose_map.each do |source, destination|
      chars = source.chars
      if chars.size != 2
        STDERR.puts "caution: more than two chars in pattern #{chars.join('|')}"
        return
      end
      prefix, suffix = chars
      suffix_char_map[suffix] ||= {}
      suffix_char_map[suffix][prefix] = destination
    end

    suffix_char_map.each do |suffix, prefix_char_map|
      suffix_bytes = suffix.bytes.map { |byte| "%02x" % byte }.join("")
      default = "NULL"
      generate_char_converter("compose_prefix_#{suffix_bytes}",
                              "compose_prefix_#{suffix_bytes}",
                              prefix_char_map,
                              default,
                              "const char *",
                              :internal => true) do |prefix|
        to = prefix_char_map[prefix]
        to ? c_string_literal(to) : default
      end
    end


    char_variable = "suffix_utf8"
    argument_list =
      "const unsigned char *prefix_utf8, " +
      "const unsigned char *#{char_variable}"
    default = "NULL"
    byte_size_groups = suffix_char_map.keys.group_by do |from|
      from.bytes[0..-2]
    end
    generate_char_convert_function("compose",
                                   argument_list,
                                   char_variable,
                                   default,
                                   "const char *",
                                   byte_size_groups) do |state, *args|
      case state
      when :no_common_bytes
        indent, chars = args
        emit_compose_dispatch(indent, char_variable, 0, chars, default)
      when :have_common_bytes
        indent, chars, _chars_bytes, n = args
        emit_compose_dispatch(indent, char_variable, n, chars, default)
      end
    end
  end

  # Converts +char_map+ into nested hashes keyed by successive bytes of each
  # key, with the mapped value stored under the last byte.
  # No caller is visible in this file.
  def to_bytes_map(char_map)
    bytes_map = {}
    char_map.each_key do |from|
      parent = bytes_map
      from.bytes[0..-2].each do |byte|
        parent[byte] ||= {}
        parent = parent[byte]
      end
      parent[from.bytes.last] = char_map[from]
    end
    bytes_map
  end
end
|
|
|
|
# Runs `./icudump --<option>` and builds a Hash from its output.
#
# Each output line is split on tabs: the first field is a ':'-separated list
# of hex byte values, the second field is ignored and the third field is the
# value. The bytes are packed into a string tagged as UTF-8 and used as key.
#
# The command is started through IO.popen with an argument array (no shell
# is involved) and the pipe is closed when the block returns.
def create_bc(option)
  bc = {}
  IO.popen(['./icudump', "--#{option}"]) do |io|
    io.each_line do |line|
      src, _, code = line.chomp.split("\t")
      str = src.split(':').collect(&:hex).pack("c*")
      str.force_encoding("UTF-8")
      bc[str] = code
    end
  end
  bc
end
|
|
|
|
# Inserts +dst+ into the nested Hash +hash+ along the path given by the
# elements of +src+; the value is stored under the nil key of the final
# node. Elements are consumed from +src+ with #shift, so the array is
# modified. Returns +dst+.
def ccpush(hash, src, dst)
  node = hash
  loop do
    head = src.shift
    # A nil head means +src+ is exhausted: store the value and stop.
    return node[head] = dst unless head
    node = (node[head] ||= {})
  end
end
|
|
|
|
# Replaces one occurrence of a pattern stored in the nested Hash +hash+
# (values live under the nil key of a node) inside +str+.
#
# Start positions are tried from left to right; at the first position where
# at least one pattern matches, the longest match is replaced and the new
# string is returned. Returns +str+ itself when nothing matches.
def subst(hash, str)
  chars = str.chars
  chars.each_index do |start|
    node = hash
    replaced = nil
    (start...chars.size).each do |stop|
      node = node[chars[stop]]
      break unless node
      next unless node[nil]
      # Keep going: a longer match found later overrides this one.
      replaced = chars[0, start].join("") + node[nil] + chars[stop + 1..-1].join("")
    end
    return replaced if replaced
  end
  str
end
|
|
|
|
# Normalizes +dst+ and records the mapping +src+ => +dst+ in +decompose+.
#
# * Unless $case_sensitive is set, +dst+ is downcased in place.
# * subst(cc, dst) is applied repeatedly until it no longer changes +dst+.
# * Unless $keep_space is set, leading spaces are stripped when something
#   other than a space follows them.
#
# Nothing is stored when the result equals +src+.
def map_entry(decompose, cc, src, dst)
  dst.downcase! unless $case_sensitive
  while (replaced = subst(cc, dst)) != dst
    dst = replaced
  end
  if !$keep_space && dst =~ /^ +([^ ].*)$/
    dst = Regexp.last_match(1)
  end
  decompose[src] = dst unless src == dst
end
|
|
|
|
# Builds the decompose map (source string => replacement string).
#
# 1. `./icudump --cc` output (tab-separated; fields 2 and 3 are used) is
#    loaded into a nested Hash through ccpush.
# 2. Every `./icudump --nfkd` line (fields 2 and 3) is passed to map_entry.
# 3. If CUSTOM_RULE_PATH is readable, each of its "src<TAB>dst" lines is
#    passed to map_entry as well; lines without both fields are skipped.
# 4. Unless $case_sensitive is set, 'A'..'Z' are mapped to lower case.
#
# Pipes and the rule file are opened in block form so they are closed.
def create_decompose_map
  cc = {}
  IO.popen(['./icudump', '--cc']) do |io|
    io.each_line do |line|
      _, src, dst = line.chomp.split("\t")
      # NOTE(review): cc is keyed by single characters (see ccpush), so this
      # lookup with the whole string looks like it can only hit for
      # one-character sources; kept as in the original.
      if cc[src]
        STDERR.puts "caution: ambiguous mapping #{src}|#{cc[src]}|#{dst}" if cc[src] != dst
      end
      ccpush(cc, src.chars, dst)
    end
  end

  decompose_map = {}
  IO.popen(['./icudump', '--nfkd']) do |io|
    io.each_line do |line|
      _, src, dst = line.chomp.split("\t")
      map_entry(decompose_map, cc, src, dst)
    end
  end

  if File.readable?(CUSTOM_RULE_PATH)
    File.foreach(CUSTOM_RULE_PATH) do |line|
      src, dst = line.chomp.split("\t")
      # A line without a tab-separated destination cannot be mapped.
      next unless src && dst
      map_entry(decompose_map, cc, src, dst)
    end
  end

  unless $case_sensitive
    ('A'..'Z').each do |c|
      decompose_map[c] = c.downcase
    end
  end
  decompose_map
end
|
|
|
|
# Builds the compose map (source string => composed string) from the output
# of `./icudump --cc` (tab-separated; fields 2 and 3 are used).
#
# Each character of a source and the whole destination are first replaced
# through +decompose_map+ when it has an entry for them. Afterwards, any
# proper substring of a source that is itself a source is replaced by its
# destination; this is repeated until a pass performs no substitution.
# Progress and cautions are written to STDERR.
#
# The pipe is opened in block form so that it is closed after reading.
def create_compose_map(decompose_map)
  cc = {}
  IO.popen(['./icudump', '--cc']) do |io|
    io.each_line do |line|
      _, src, dst = line.chomp.split("\t")
      src = src.chars.collect { |c| decompose_map[c] || c }.join
      dst = decompose_map[dst] || dst
      if cc[src] && cc[src] != dst
        STDERR.puts("caution: inconsitent mapping '#{src}' => '#{cc[src]}'|'#{dst}'")
      end
      cc[src] = dst if src != dst
    end
  end

  loop do
    noccur = 0
    cc2 = {}
    cc.each do |src, dst|
      src2 = src
      chars = src.chars
      last = chars.size - 1
      (0..last).each do |i|
        (i..last).each do |j|
          # Skip the whole string; only proper substrings are candidates.
          next if i == 0 && j == last
          str = chars[i..j].join
          if decompose_map[str]
            STDERR.printf("caution: recursive mapping '%s'=>'%s'\n",
                          str, decompose_map[str])
          end
          if cc[str]
            src2 = (i > 0 ? chars[0..i-1].join : '') + cc[str] + (j < last ? chars[j+1..last].join : '')
            noccur += 1
          end
        end
      end
      cc2[src2] = dst if src2 != dst
    end
    cc = cc2
    STDERR.puts("substituted #{noccur} patterns.")
    break if noccur == 0
    STDERR.puts('try again..')
  end
  cc
end
|
|
|
|
######## main #######

# Command line handling. The exact --impl=... options are tested first and
# the flag patterns are anchored at the start of the argument; otherwise an
# argument such as "--impl=switch" (which contains a "c") would be taken for
# the case-sensitive flag.
#
#   --impl=switch / --impl=table  select the generator class
#   argument starting with c      (after optional dashes) sets $case_sensitive
#   argument starting with s      (after optional dashes) sets $keep_space
generator_class = SwitchGenerator
ARGV.each do |arg|
  case arg
  when "--impl=switch"
    generator_class = SwitchGenerator
  when "--impl=table"
    generator_class = TableGenerator
  when /\A-*c/i
    $case_sensitive = true
  when /\A-*s/i
    $keep_space = true
  end
end

STDERR.puts('compiling icudump')
unless system('cc -Wall -O3 -o icudump -I/tmp/local/include -L/tmp/local/lib icudump.c -licuuc -licui18n')
  # Without a binary nothing below can work; with an older binary present
  # keep going, but make the failed build visible.
  abort('failed to compile icudump') unless File.executable?('icudump')
  STDERR.puts('warning: failed to compile icudump; using the existing binary')
end

STDERR.puts('getting Unicode version')
unicode_version = `./icudump --version`.strip.gsub(".", "")

STDERR.puts('creating bc..')
bc = create_bc("gc")

STDERR.puts('creating decompose map..')
decompose_map = create_decompose_map()

STDERR.puts('creating compose map..')
compose_map = create_compose_map(decompose_map)

# The generated C file is named after the Unicode version reported by
# icudump with the dots removed.
File.open("nfkc#{unicode_version}.c", "w") do |output|
  output.puts(<<-HEADER)
/* -*- c-basic-offset: 2 -*- */
/*
Copyright(C) 2010-2016 Brazil

This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License version 2.1 as published by the Free Software Foundation.

This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.

You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
*/

/*
Don't edit this file by hand. it generated automatically by nfkc.rb.
*/

#include "grn.h"
#include "grn_nfkc.h"
#include <groonga/nfkc.h>

#ifdef GRN_WITH_NFKC
  HEADER

  generator = generator_class.new(unicode_version, output)
  generator.generate(bc, decompose_map, compose_map)

  output.puts(<<-FOOTER)

#endif /* GRN_WITH_NFKC */

  FOOTER
end
|