# Bytes 0x00-0x1f get a three-digit octal escape ("\\ooo") unless C_ESC
# already holds an entry for that byte (||= keeps an existing entry).
11 0x00.upto(0x1f) {|ch| C_ESC[[ch].pack("C")] ||= "\\%03o" % ch }
# Bytes 0x7f-0xff always get the octal escape; plain = replaces any entry.
12 0x7f.upto(0xff) {|ch| C_ESC[[ch].pack("C")] = "\\%03o" % ch }
# A single regexp that matches any key currently stored in C_ESC.
13 C_ESC_PAT = Regexp.union(*C_ESC.keys)
16 '"' + str.gsub(C_ESC_PAT) { C_ESC[$&] } + '"'
# Parses the textual byte-pattern description given in +pattern+.
#
# The pattern is first checked as a whole: it must consist of
# whitespace-separated sequences, each made of two-hex-digit bytes and/or
# brace sets such as {xx,yy-zz}. Anything else raises ArgumentError
# ("invalid pattern: ..."). Each whitespace-delimited sequence is then
# decoded piece by piece. The value finally returned is built on lines that
# are not visible in this excerpt.
#
# NOTE(review): the whole-pattern check and the leading-byte match use /i,
# but the two regexps applied to entries inside braces do not. An upper-case
# hex digit inside {...} therefore appears to pass validation and then reach
# the "invalid range" raise -- confirm whether that is intended.
20 def self.parse(pattern)
21 if /\A\s*(([0-9a-f][0-9a-f]|\{([0-9a-f][0-9a-f]|[0-9a-f][0-9a-f]-[0-9a-f][0-9a-f])(,([0-9a-f][0-9a-f]|[0-9a-f][0-9a-f]-[0-9a-f][0-9a-f]))*\})+(\s+|\z))*\z/i !~ pattern
22 raise ArgumentError, "invalid pattern: #{pattern.inspect}"
# Visit every run of non-whitespace characters; each run is one sequence.
25 pattern.scan(/\S+/) {|seq|
# A leading pair of hex digits is one literal byte; it is appended as a
# one-element list holding the range byte..byte.
28 if /\A([0-9a-f][0-9a-f])/i =~ seq
30 seq_result << [byte..byte]
# A leading {...} group is a set of comma-separated alternatives.
32 elsif /\A\{([^\}]+)\}/ =~ seq
36 set.scan(/[^,]+/) {|range|
# Entry of the form "xx-yy".
37 if /\A([0-9a-f][0-9a-f])-([0-9a-f][0-9a-f])\z/ =~ range
# Entry of the form "xx": a single byte, stored as the range byte..byte.
41 elsif /\A([0-9a-f][0-9a-f])\z/ =~ range
43 set_result << (byte..byte)
# The entry matched neither form (raises RuntimeError).
45 raise "invalid range: #{range.inspect}"
# The completed set becomes the next element of the sequence.
48 seq_result << set_result
# The remaining text starts with neither a hex pair nor a brace group
# (raises RuntimeError).
50 raise "invalid sequence: #{seq.inspect}"
67 self.class == other.class &&
68 @pat == other.instance_eval { @pat }
82 if byteset.length == 1 && byteset[0].begin == byteset[0].end
83 "%02x" % byteset[0].begin
87 if range.begin == range.end
90 "%02x-%02x" % [range.begin, range.end]
102 "\#<#{self.class}: #{self.to_s}>"
109 @pat.map {|seq| seq.length }.min
117 @pat.map {|seq| seq.length }.max
131 seq.first.each {|range|
144 seq.first.each {|range|
146 (h[byte] ||= []) << seq[1..-1]
150 h.keys.sort.each {|byte|
151 yield byte, StrSet.new(h[byte])
159 hash.each {|pat, action|
160 h[StrSet.parse(pat)] = action
172 hash ^= k.hash ^ v.hash
178 self.class == other.class &&
179 @map == other.instance_eval { @map }
185 "\#<#{self.class}:" +
186 @map.map {|k, v| " [" + k.to_s + "]=>" + v.inspect }.join('') +
191 @map.keys.map {|k| k.max_length }.max
195 @map.each {|ss, action|
196 return action if ss.emptyable?
# Regroups the (pattern set, action) pairs of @map by first byte.
#
# For every pattern set +ss+ in @map, ss.each_firstbyte supplies
# (byte, rest) pairs and the action is recorded under h[byte][rest].
# The result for each byte is wrapped in a new ActionMap.
#
# valid_encoding:: optional (defaults to nil); its each_firstbyte is called
#                  with a (byte, rest) block. For some bytes an ActionMap
#                  mapping +rest+ to :undef is built instead of one from
#                  h[byte]; the condition choosing between the two is not
#                  visible in this excerpt.
#
# What is yielded to the caller is not visible here; generate_node calls this
# method with a block taking (byte, rest, rest_valid_encoding).
201 def each_firstbyte(valid_encoding=nil)
203 @map.each {|ss, action|
# Raised for some pattern sets; the guarding condition is not visible here
# (presumably the pattern set can match the empty string -- TODO confirm).
205 raise "emptyable pattern"
207 ss.each_firstbyte {|byte, rest|
212 h[byte][rest] = action
# Branch that walks the first bytes of valid_encoding.
217 valid_encoding.each_firstbyte {|byte, rest|
219 am = ActionMap.new(h[byte])
222 am = ActionMap.new(rest => :undef)
# Branch that walks only the bytes recorded in h, in ascending order.
227 h.keys.sort.each {|byte|
228 am = ActionMap.new(h[byte])
# Builds the text of a C initializer for an offsets array.
#
# min, max:: first and last index of +offsets+ to emit; both values are also
#            written as the first two numbers of the initializer.
# offsets::  indexable collection; only offsets[min..max] is used.
#
# The return value is produced on lines not visible in this excerpt
# (presumably the accumulated +code+ string -- TODO confirm).
237 def format_offsets(min, max, offsets)
238 offsets = offsets[min..max]
239 code = "{ %d, %d,\n" % [min, max]
# Advance 16 entries per step; each step emits up to two groups of 8.
240 0.step(offsets.length-1,16) {|i|
# Each entry is rendered with "%3d,"; the value is passed as a string
# (off.to_s) and converted back to an integer by the %d directive.
242 code << offsets[i,8].map {|off| "%3d," % off.to_s }.join('')
# Second group of 8, only when entries remain past the first group.
243 if i+8 < offsets.length
245 code << offsets[i+8,8].map {|off| "%3d," % off.to_s }.join('')
# Translates one action value (+info+) into the C expression text used in a
# generated infos table. Only part of the case expression is visible in this
# excerpt; the visible branches are keyed on hex strings of one to four
# bytes and on strings starting with "&". Anything unmatched raises
# RuntimeError ("unexpected action: ...").
253 def generate_info(info)
# One byte (two hex digits); the generated text is on a line not shown.
269 when /\A([0-9a-f][0-9a-f])\z/i
# Two bytes; the generated text is on a line not shown.
271 when /\A([0-9a-f][0-9a-f])([0-9a-f][0-9a-f])\z/i
# Three bytes -> o3(0xAA,0xBB,0xCC)
273 when /\A([0-9a-f][0-9a-f])([0-9a-f][0-9a-f])([0-9a-f][0-9a-f])\z/i
274 "o3(0x#$1,0x#$2,0x#$3)"
# Four bytes -> o4(0xAA,0xBB,0xCC,0xDD)
275 when /\A([0-9a-f][0-9a-f])([0-9a-f][0-9a-f])([0-9a-f][0-9a-f])([0-9a-f][0-9a-f])\z/i
276 "o4(0x#$1,0x#$2,0x#$3,0x#$4)"
277 when /\A&/ # pointer to BYTE_LOOKUP structure
280 raise "unexpected action: #{info.inspect}"
# Builds the text of a C initializer listing every entry of +infos+.
#
# Each entry is first converted with generate_info. Entries are laid out in
# rows of 4 when the longest converted entry is at most 16 characters, and in
# rows of 2 otherwise. Every entry is right-aligned to the longest width and
# followed by a comma.
#
# NOTE(review): with an empty +infos+, maxlen is nil and the comparison with
# 16 raises NoMethodError -- confirm callers never pass an empty list.
284 def format_infos(infos)
285 infos = infos.map {|info| generate_info(info) }
286 maxlen = infos.map {|info| info.length }.max
287 columns = maxlen <= 16 ? 4 : 2
# One iteration per output row.
289 0.step(infos.length-1, columns) {|i|
291 is = infos[i,columns]
293 code << sprintf(" %#{maxlen}s,", info)
# Produces the C source text for one lookup node.
#
# name::  base identifier; the generated arrays are called
#         "<name>_offsets" and "<name>_infos".
# table:: enumerable of actions whose position is the byte value; entries
#         equal to :invalid are treated separately from all others.
#
# Offsets arrays and infos arrays are memoized in OffsetsMemo and InfosMemo.
# When a memo lookup succeeds the stored name is bound to +n+; how it is used
# is on lines not visible in this excerpt (presumably the existing array is
# reused instead of emitting a duplicate -- TODO confirm).
301 def generate_lookup_node(name, table)
306 table.each_with_index {|action, byte|
308 if action != :invalid
# Give each distinct action one slot: reuse its index when already recorded,
# otherwise assign the next free index (the current length of infos).
312 unless o = infomap[action]
313 infomap[action] = o = infos.length
# Memo key for the offsets array: the bounds plus the slice between them.
322 offsets_key = [min, max, offsets[min..max]]
323 if n = OffsetsMemo[offsets_key]
# The declared C array length is 2 + (max - min + 1): format_offsets writes
# min and max ahead of the max-min+1 offset entries.
327 offsets_name = "#{name}_offsets"
328 offsets_code = <<"End"
329 static const unsigned char
330 #{offsets_name}[#{2+max-min+1}] = #{format_offsets(min,max,offsets)};
332 OffsetsMemo[offsets_key] = offsets_name
# Same memoization for the infos array, keyed by the infos list itself.
335 if n = InfosMemo[infos]
339 infos_name = "#{name}_infos"
341 static const struct byte_lookup* const
342 #{infos_name}[#{infos.length}] = #{format_infos(infos)};
344 InfosMemo[infos] = infos_name
# Result text: offsets array, infos array, then the BYTE_LOOKUP definition.
347 r = offsets_code + infos_code + <<"End"
348 static const BYTE_LOOKUP
# Generates the lookup node for this action map, appending C source to +code+.
#
# code::           object receiving generated text via <<.
# name_hint::      optional base name; child nodes get "_XX" appended, where
#                  XX is the byte in upper-case hex.
# valid_encoding:: optional value passed through to each_firstbyte.
#
# Two memo tables are consulted: PreMemo keyed by [self, valid_encoding]
# before any work is done, and PostMemo keyed by the finished 256-entry
# table. What happens on a memo hit, and the method's return value, are on
# lines not visible in this excerpt (presumably the memoized node name is
# returned -- TODO confirm).
362 def generate_node(code, name_hint=nil, valid_encoding=nil)
363 if n = PreMemo[[self,valid_encoding]]
# Every byte starts out as :invalid.
367 table = Array.new(0x100, :invalid)
368 each_firstbyte(valid_encoding) {|byte, rest, rest_valid_encoding|
# When the remainder has an empty_action, that action is bound to +a+; the
# line that uses it is not visible here.
369 if a = rest.empty_action
# Otherwise a child node is generated for the remainder and this byte's
# entry becomes "&" followed by the value generate_node returns.
373 name_hint2 = "#{name_hint}_#{'%02X' % byte}" if name_hint
374 table[byte] = "&" + rest.generate_node(code, name_hint2, rest_valid_encoding)
378 if n = PostMemo[table]
# Name built from a copy of NextName; the guarding condition is not visible
# (presumably used when no name_hint was supplied -- TODO confirm).
383 name_hint = "fun_" + NextName.dup
# Record the chosen name under both memo keys.
387 PreMemo[[self,valid_encoding]] = PostMemo[table] = name_hint
389 code << generate_lookup_node(name_hint, table)
397 # integer means UTF-8 encoded sequence.
398 k = [k].pack("U").unpack("H*")[0].upcase if Integer === k
399 v = [v].pack("U").unpack("H*")[0].upcase if Integer === v
# Compiles a mapping into lookup-node source code.
#
# name:: name hint passed to generate_node for the root node.
# from:: source encoding name; used as the key into ValidEncoding.
# map::  the mapping; it is passed through encode_utf8 first.
#
# Returns three values: the name reported by generate_node, the generated
# code, and the ActionMap's max_input_length.
405 def transcode_compile_tree(name, from, map)
406 map = encode_utf8(map)
# +h+ is prepared from +map+ on lines not visible in this excerpt.
411 am = ActionMap.parse(h)
413 max_input = am.max_input_length
# When ValidEncoding has an entry for +from+, it is parsed into a StrSet.
# What valid_encoding is otherwise is not visible here.
415 if ValidEncoding[from]
416 valid_encoding = StrSet.parse(ValidEncoding[from])
422 defined_name = am.generate_node(code, name, valid_encoding)
423 return defined_name, code, max_input
# Generates the C source for one transcoder: the lookup tree followed by an
# rb_transcoder definition. It also appends the transcoder's name to
# TRANSCODERS.
#
# from, to:: encoding names; characters other than ASCII letters and digits
#            are replaced by "_" to form C identifiers.
# map::      the conversion mapping.
#
# Returns the tree code and the transcoder definition joined by a newline.
#
# NOTE(review): +map+ goes through encode_utf8 here and again inside
# transcode_compile_tree; presumably the second pass is a no-op -- confirm.
428 def transcode_tblgen(from, to, map)
429 STDERR.puts "converter from #{from} to #{to}" if VERBOSE_MODE
430 id_from = from.tr('^0-9A-Za-z', '_')
431 id_to = to.tr('^0-9A-Za-z', '_')
# One of three naming schemes is chosen; the conditions are on lines not
# visible in this excerpt.
433 tree_name = "to_#{id_to}"
435 tree_name = "from_#{id_from}"
437 tree_name = "from_#{id_from}_to_#{id_to}"
439 map = encode_utf8(map)
440 real_tree_name, tree_code, max_input = transcode_compile_tree(tree_name, from, map)
441 transcoder_name = "rb_#{tree_name}"
442 TRANSCODERS << transcoder_name
443 input_unit_length = UnitLength[from]
# String values count as half their length (two hex digits per byte);
# every other value counts as 1.
444 max_output = map.map {|k,v| String === v ? v.length/2 : 1 }.max
445 transcoder_code = <<"End"
446 static const rb_transcoder
447 #{transcoder_name} = {
448 #{c_esc from}, #{c_esc to}, &#{real_tree_name},
449 #{input_unit_length}, /* input_unit_length */
450 #{max_input}, /* max_input */
451 #{max_output}, /* max_output */
452 stateless_converter, /* stateful_type */
453 NULL, NULL, NULL, NULL,
457 tree_code + "\n" + transcoder_code
# Generates lookup-node code for the action map +am+ by calling its
# generate_node with +name_hint+ (optional, defaults to nil). Prints a
# progress line to STDERR when VERBOSE_MODE is set. How +code+ is created and
# what is returned are on lines not visible in this excerpt.
460 def transcode_generate_node(am, name_hint=nil)
461 STDERR.puts "converter for #{name_hint}" if VERBOSE_MODE
463 am.generate_node(code, name_hint)
# Builds C statements that register every transcoder recorded in
# TRANSCODERS, one rb_register_transcoder(&name); line per entry. The
# creation of +code+ and the return value are on lines not visible in this
# excerpt.
467 def transcode_register_code
469 TRANSCODERS.each {|transcoder_name|
470 code << "    rb_register_transcoder(&#{transcoder_name});\n"
# Encodings without an explicit UnitLength entry use a unit length of 1.
# The entries that follow map an encoding name to a byte pattern written in
# the hex-pair / {set} notation that is handed to StrSet.parse elsewhere in
# this file (see transcode_compile_tree).
481 UnitLength.default = 1
484 '1byte' => '{00-ff}',
485 '2byte' => '{00-ff}{00-ff}',
486 '4byte' => '{00-ff}{00-ff}{00-ff}{00-ff}',
487 'US-ASCII' => '{00-7f}',
491 {e1-ec}{80-bf}{80-bf}
493 {ee-ef}{80-bf}{80-bf}
494 f0{90-bf}{80-bf}{80-bf}
495 {f1-f3}{80-bf}{80-bf}{80-bf}
496 f4{80-8f}{80-bf}{80-bf}',
497 'UTF-16BE' => '{00-d7,e0-ff}{00-ff}
498 {d8-db}{00-ff}{dc-df}{00-ff}',
499 'UTF-16LE' => '{00-ff}{00-d7,e0-ff}
500 {00-ff}{d8-db}{00-ff}{dc-df}',
501 'UTF-32BE' => '0000{00-d7,e0-ff}{00-ff}
502 00{01-10}{00-ff}{00-ff}',
503 'UTF-32LE' => '{00-ff}{00-d7,e0-ff}0000
504 {00-ff}{00-ff}{01-10}00',
509 'CP51932' => '{00-7f}
512 'Shift_JIS' => '{00-7f}
513 {81-9f,e0-fc}{40-7e,80-fc}
518 {81-fe}{41-5a,61-7a,81-fe}',
520 {81-fe}{40-7e,a1-fe}',
523 8e{a1-b0}{a1-fe}{a1-fe}',
525 {81-fe}{40-7e,80-fe}',
526 'GB18030' => '{00-7f}
528 {81-fe}{30-39}{81-fe}{30-39}',
# Alias entries: each name on the left takes the pattern already registered
# under the name on the right.
532 'ASCII-8BIT' => '1byte',
533 'ISO-8859-1' => '1byte',
534 'ISO-8859-2' => '1byte',
535 'ISO-8859-3' => '1byte',
536 'ISO-8859-4' => '1byte',
537 'ISO-8859-5' => '1byte',
538 'ISO-8859-6' => '1byte',
539 'ISO-8859-7' => '1byte',
540 'ISO-8859-8' => '1byte',
541 'ISO-8859-9' => '1byte',
542 'ISO-8859-10' => '1byte',
543 'ISO-8859-11' => '1byte',
544 'ISO-8859-13' => '1byte',
545 'ISO-8859-14' => '1byte',
546 'ISO-8859-15' => '1byte',
547 'Windows-31J' => 'Shift_JIS',
# fetch raises KeyError when the referenced name has no entry.
549 ValidEncoding[k] = ValidEncoding.fetch(v)
# Returns a one-line fingerprint of a source file: its name (escaped with
# String#dump), the length of +src+, and the checksum from String#sum.
#
# filename:: name recorded in the signature.
# src::      the file's contents as a String.
552 def make_signature(filename, src)
553 "src=#{filename.dump}, len=#{src.length}, checksum=#{src.sum}"
# Command-line driver. Many lines of this section are not visible in this
# excerpt, so the comments describe only the visible steps.
556 output_filename = nil
# Options: --help prints usage and exits 0; --verbose and --force set flags;
# --output=FILE names the file to generate.
560 op = OptionParser.new
561 op.def_option("--help", "show help message") { puts op; exit 0 }
562 op.def_option("--verbose", "verbose mode") { verbose_mode = true }
563 op.def_option("--force", "force table generation") { force_mode = true }
564 op.def_option("--output=FILE", "specify output file") {|arg| output_filename = arg }
567 VERBOSE_MODE = verbose_mode
# Put the input file's directory on the load path (once).
570 dir = File.dirname(arg)
571 $:.unshift dir unless $:.include? dir
# Treat the template and this script as raw bytes where force_encoding exists.
573 src.force_encoding("ascii-8bit") if src.respond_to? :force_encoding
574 this_script = File.read(__FILE__)
575 this_script.force_encoding("ascii-8bit") if this_script.respond_to? :force_encoding
# Signature header: a marker line, then one signature line for this script
# and one for the input file.
577 base_signature = "/* autogenerated. */\n"
578 base_signature << "/* #{make_signature(File.basename(__FILE__), this_script)} */\n"
579 base_signature << "/* #{make_signature(File.basename(arg), src)} */\n"
# Up-to-date check, skipped with --force or when there is no readable output.
581 if !force_mode && output_filename && File.readable?(output_filename)
# gets("") reads the first paragraph (up to a blank line) of the old output.
# NOTE(review): for an empty file gets returns nil and .chomp would raise
# NoMethodError -- confirm whether that case can occur.
582 old_signature = File.open(output_filename) {|f| f.gets("").chomp }
583 chk_signature = base_signature.dup
# Recompute the signature of every other file named in the old header.
584 old_signature.each_line {|line|
585 if %r{/\* src="([0-9a-z_.-]+)",} =~ line
587 next if name == File.basename(arg) || name == File.basename(__FILE__)
588 path = File.join(dir, name)
589 if File.readable? path
590 chk_signature << "/* #{make_signature(name, File.read(path))} */\n"
# Identical signatures: touch the output's timestamps instead of rebuilding.
594 if old_signature == chk_signature
596 File.utime(now, now, output_filename)
597 STDERR.puts "already up-to-date: #{output_filename}" if VERBOSE_MODE
604 STDERR.puts "generating #{output_filename} ..."
# Evaluate the template with '%' trim mode.
# NOTE(review): the positional safe_level/trim_mode arguments of ERB.new were
# deprecated in favour of the trim_mode: keyword; newer erb releases may
# reject this call -- verify against the Ruby version used for the build.
609 erb_result = ERB.new(src, nil, '%').result(binding)
# Add a signature line for each library file found next to the input.
615 lib = File.basename(lib)
616 path = File.join(dir, lib)
617 if File.readable? path
618 lib_sigs << "/* #{make_signature(lib, File.read(path))} */\n"
623 result << base_signature
# Write to "<output>.new" first, then rename it over the final name.
630 new_filename = output_filename + ".new"
631 FileUtils.mkdir_p(File.dirname(output_filename))
632 File.open(new_filename, "wb") {|f| f << result }
633 File.rename(new_filename, output_filename)
634 STDERR.puts "done." if VERBOSE_MODE