* transcode.c (econv_init): accept Encoding object as source_encoding
[ruby-svn.git] / tool / transcode-tblgen.rb
blobb0d35f623073f40755fb78f781c0428b1d997512
1 require 'optparse'
2 require 'erb'
3 require 'fileutils'
# Escape table for C string literals: maps a single byte (as a one-character
# string) to the text that represents it inside a double-quoted C string.
C_ESC = {
  "\\" => "\\\\",
  '"' => '\"',
  "\n" => '\n',
}

# Control characters that have no symbolic escape above, and every byte from
# DEL upward, are written as three-digit octal escapes.
0x00.upto(0x1f) {|ch| C_ESC[[ch].pack("C")] ||= "\\%03o" % ch }
0x7f.upto(0xff) {|ch| C_ESC[[ch].pack("C")] = "\\%03o" % ch }

# Matches any single byte that has an entry in C_ESC.
C_ESC_PAT = Regexp.union(*C_ESC.keys)

# Returns +str+ as a double-quoted C string literal.
#
# The escape pattern is built from raw bytes, so the input is escaped byte by
# byte.  On Rubies with string encodings a binary copy is matched; matching
# the byte-oriented pattern directly against a non-ASCII UTF-8 string would
# raise Encoding::CompatibilityError.  The argument itself is never modified.
def c_esc(str)
  bytes = str.dup
  bytes.force_encoding("ascii-8bit") if bytes.respond_to? :force_encoding
  '"' + bytes.gsub(C_ESC_PAT) { C_ESC[$&] } + '"'
end
# A set of byte strings described by a compact pattern.
#
# Internally (@pat) the set is an array of sequences; each sequence is an
# array of byte sets (one per byte position); each byte set is an array of
# Integer ranges.  A pattern such as "e0{a0-bf}{80-bf} 41" therefore denotes
# two alternatives: a three-byte sequence and the single byte 0x41.
class StrSet
  # Parses a whitespace-separated pattern into a StrSet.
  #
  # Each alternative is a run of two-digit hex bytes and/or brace groups such
  # as "{00-7f,a1}".  Hex digits are accepted in either case everywhere (the
  # up-front validation always was case-insensitive; the range parser now
  # agrees with it).
  #
  # Raises ArgumentError when the pattern is malformed.
  def self.parse(pattern)
    if /\A\s*(([0-9a-f][0-9a-f]|\{([0-9a-f][0-9a-f]|[0-9a-f][0-9a-f]-[0-9a-f][0-9a-f])(,([0-9a-f][0-9a-f]|[0-9a-f][0-9a-f]-[0-9a-f][0-9a-f]))*\})+(\s+|\z))*\z/i !~ pattern
      raise ArgumentError, "invalid pattern: #{pattern.inspect}"
    end
    result = []
    pattern.scan(/\S+/) {|seq|
      seq_result = []
      until seq.empty?
        if /\A([0-9a-f][0-9a-f])/i =~ seq
          byte = $1.to_i(16)
          seq = $'
          seq_result << [byte..byte]
        elsif /\A\{([^\}]+)\}/ =~ seq
          set = $1
          seq = $'
          seq_result << parse_byteset(set)
        else
          raise "invalid sequence: #{seq.inspect}"
        end
      end
      result << seq_result
    }
    self.new(result)
  end

  # Converts the inside of one brace group ("a1-fe,8e") into an array of
  # Integer ranges.
  def self.parse_byteset(set)
    set.scan(/[^,]+/).map {|range|
      if /\A([0-9a-f][0-9a-f])-([0-9a-f][0-9a-f])\z/i =~ range
        ($1.to_i(16)..$2.to_i(16))
      elsif /\A([0-9a-f][0-9a-f])\z/i =~ range
        byte = $1.to_i(16)
        (byte..byte)
      else
        raise "invalid range: #{range.inspect}"
      end
    }
  end
  private_class_method :parse_byteset

  # +pat+ is the nested array structure described on the class.
  def initialize(pat)
    @pat = pat
  end

  # hash/eql? are defined together so a StrSet can be used as a Hash key.
  def hash
    @pat.hash
  end

  def eql?(other)
    self.class == other.class &&
    @pat == other.instance_eval { @pat }
  end

  alias == eql?

  # Renders the set back in pattern syntax (lower-case hex).  The empty set
  # is shown as "(empset)" and an empty sequence as "(empstr)".
  def to_s
    return "(empset)" if @pat.empty?
    @pat.map {|seq|
      if seq.empty?
        "(empstr)"
      else
        seq.map {|byteset| byteset_to_s(byteset) }.join('')
      end
    }.join(' ')
  end

  def inspect
    "\#<#{self.class}: #{self.to_s}>"
  end

  # Length of the shortest sequence, or nil for the empty set.
  def min_length
    @pat.empty? ? nil : @pat.map {|seq| seq.length }.min
  end

  # Length of the longest sequence, or nil for the empty set.
  def max_length
    @pat.empty? ? nil : @pat.map {|seq| seq.length }.max
  end

  # True when the set contains the empty string.
  def emptyable?
    @pat.any? {|seq| seq.empty? }
  end

  # Sorted list of every byte value that can start a non-empty sequence.
  def first_bytes
    result = {}
    @pat.each {|seq|
      next if seq.empty?
      seq.first.each {|range|
        range.each {|byte| result[byte] = true }
      }
    }
    result.keys.sort
  end

  # Yields each possible first byte, in ascending order, together with the
  # StrSet of what may follow that byte.
  def each_firstbyte
    h = {}
    @pat.each {|seq|
      next if seq.empty?
      seq.first.each {|range|
        range.each {|byte|
          (h[byte] ||= []) << seq[1..-1]
        }
      }
    }
    h.keys.sort.each {|byte|
      yield byte, StrSet.new(h[byte])
    }
  end

  private

  # Formats one byte position: a bare hex pair when it is a single byte,
  # otherwise a brace group.
  def byteset_to_s(byteset)
    if byteset.length == 1 && byteset[0].begin == byteset[0].end
      "%02x" % byteset[0].begin
    else
      "{" +
      byteset.map {|range|
        if range.begin == range.end
          "%02x" % range.begin
        else
          "%02x-%02x" % [range.begin, range.end]
        end
      }.join(',') +
      "}"
    end
  end
end
# Maps StrSet keys (sets of input byte strings) to transcoding actions and
# generates the C lookup tables that implement the mapping.
#
# An action is one of the symbols handled by #generate_info, a string of
# 1-4 hex byte pairs (the output bytes), or a string starting with "&" that
# names another generated BYTE_LOOKUP structure.
class ActionMap
  # Builds an ActionMap from a hash of pattern string => action; every
  # pattern is parsed with StrSet.parse.
  def self.parse(hash)
    h = {}
    hash.each {|pat, action|
      h[StrSet.parse(pat)] = action
    }
    self.new(h)
  end

  # +h+ is a hash of StrSet => action.
  def initialize(h)
    @map = h
  end

  # Order-independent hash over all entries; defined together with eql? so
  # that ActionMaps can be used as memo keys.
  def hash
    @map.inject(0) {|acc, (k, v)| acc ^ k.hash ^ v.hash }
  end

  def eql?(other)
    self.class == other.class &&
    @map == other.instance_eval { @map }
  end

  alias == eql?

  def inspect
    "\#<#{self.class}:" +
    @map.map {|k, v| " [" + k.to_s + "]=>" + v.inspect }.join('') +
    ">"
  end

  # Longest input sequence over all keys.
  def max_input_length
    @map.keys.map {|k| k.max_length }.max
  end

  # Action of the first key that can match the empty string, or nil.
  def empty_action
    @map.each {|ss, action|
      return action if ss.emptyable?
    }
    nil
  end

  # Splits the map by first input byte.
  #
  # Yields (byte, ActionMap for the remaining bytes, remaining valid-encoding
  # StrSet or nil).  When +valid_encoding+ is given, iteration follows its
  # first bytes, and a byte that has no mapping yields a map that sends
  # everything remaining to :undef.  Without it, only mapped bytes are
  # yielded, in ascending order.
  #
  # Raises RuntimeError for a key that can match the empty string or when two
  # keys lead to the same remainder after the same first byte.
  def each_firstbyte(valid_encoding=nil)
    h = {}
    @map.each {|ss, action|
      if ss.emptyable?
        raise "emptyable pattern"
      else
        ss.each_firstbyte {|byte, rest|
          h[byte] ||= {}
          if h[byte][rest]
            raise "ambiguous"
          end
          h[byte][rest] = action
        }
      end
    }
    if valid_encoding
      valid_encoding.each_firstbyte {|byte, rest|
        if h[byte]
          am = ActionMap.new(h[byte])
          yield byte, am, rest
        else
          am = ActionMap.new(rest => :undef)
          yield byte, am, nil
        end
      }
    else
      h.keys.sort.each {|byte|
        am = ActionMap.new(h[byte])
        yield byte, am, nil
      }
    end
  end

  # Memo tables shared by all instances so identical C arrays are emitted
  # only once: key => name of the array already generated.
  OffsetsMemo = {}
  InfosMemo = {}

  # Formats the C initializer for an offsets array: the min and max byte
  # followed by offsets[min..max], sixteen values per line in two groups of
  # eight.
  def format_offsets(min, max, offsets)
    offsets = offsets[min..max]
    code = "{ %d, %d,\n" % [min, max]
    0.step(offsets.length-1,16) {|i|
      code << "    "
      code << offsets[i,8].map {|off| "%3d," % off }.join('')
      if i+8 < offsets.length
        code << "  "
        code << offsets[i+8,8].map {|off| "%3d," % off }.join('')
      end
      code << "\n"
    }
    code << '}'
    code
  end

  # Translates one action into the C expression used in an infos array.
  # Raises RuntimeError for an action it does not recognize.
  def generate_info(info)
    case info
    when :nomap
      "NOMAP"
    when :undef
      "UNDEF"
    when :invalid
      "INVALID"
    when :func_ii
      "FUNii"
    when :func_si
      "FUNsi"
    when :func_io
      "FUNio"
    when :func_so
      "FUNso"
    when /\A([0-9a-f][0-9a-f])\z/i
      "o1(0x#$1)"
    when /\A([0-9a-f][0-9a-f])([0-9a-f][0-9a-f])\z/i
      "o2(0x#$1,0x#$2)"
    when /\A([0-9a-f][0-9a-f])([0-9a-f][0-9a-f])([0-9a-f][0-9a-f])\z/i
      "o3(0x#$1,0x#$2,0x#$3)"
    when /\A([0-9a-f][0-9a-f])([0-9a-f][0-9a-f])([0-9a-f][0-9a-f])([0-9a-f][0-9a-f])\z/i
      "o4(0x#$1,0x#$2,0x#$3,0x#$4)"
    when /\A&/ # pointer to BYTE_LOOKUP structure
      info.to_s
    else
      raise "unexpected action: #{info.inspect}"
    end
  end

  # Formats the C initializer for an infos array, right-aligning entries in
  # four columns (two when any entry is wider than 16 characters).
  def format_infos(infos)
    infos = infos.map {|info| generate_info(info) }
    maxlen = infos.map {|info| info.length }.max
    columns = maxlen <= 16 ? 4 : 2
    code = "{\n"
    0.step(infos.length-1, columns) {|i|
      code << "    "
      is = infos[i,columns]
      is.each {|info|
        code << sprintf(" %#{maxlen}s,", info)
      }
      code << "\n"
    }
    code << "}"
    code
  end

  # Returns the C source for one BYTE_LOOKUP node called +name+.
  #
  # +table+ holds one action per byte value (nil counts as :invalid).
  # Distinct actions are numbered in order of first appearance; the offsets
  # array stores that number for each byte between the first and last byte
  # whose action is not :invalid.  Arrays identical to ones emitted earlier
  # are reused by name instead of being generated again.
  def generate_lookup_node(name, table)
    offsets = []
    infos = []
    infomap = {}
    min = max = nil
    table.each_with_index {|action, byte|
      action ||= :invalid
      if action != :invalid
        min = byte if !min
        max = byte
      end
      unless o = infomap[action]
        infomap[action] = o = infos.length
        infos[o] = action
      end
      offsets[byte] = o
    }
    if !min
      min = max = 0
    end

    offsets_key = [min, max, offsets[min..max]]
    if n = OffsetsMemo[offsets_key]
      offsets_name = n
      offsets_code = ''
    else
      offsets_name = "#{name}_offsets"
      offsets_code = <<"End"
static const unsigned char
#{offsets_name}[#{2+max-min+1}] = #{format_offsets(min,max,offsets)};
End
      OffsetsMemo[offsets_key] = offsets_name
    end

    if n = InfosMemo[infos]
      infos_name = n
      infos_code = ''
    else
      infos_name = "#{name}_infos"
      infos_code = <<"End"
static const struct byte_lookup* const
#{infos_name}[#{infos.length}] = #{format_infos(infos)};
End
      InfosMemo[infos] = infos_name
    end

    r = offsets_code + infos_code + <<"End"
static const BYTE_LOOKUP
#{name} = {
    #{offsets_name},
    #{infos_name}
};

End
    r
  end

  # PreMemo:  [ActionMap, valid_encoding] => node name, consulted before any
  #           work is done.
  # PostMemo: byte table => node name, consulted after the table is built so
  #           different maps that produce the same table share one node.
  # NextName: suffix for the next automatically named node; advanced in
  #           place, hence an explicitly unfrozen string.
  PreMemo = {}
  PostMemo = {}
  NextName = "a".dup

  # Appends to +code+ the C definitions for this map and everything reachable
  # from it, and returns the name of the node for this map.
  #
  # +name_hint+ becomes the node name (child nodes append "_XX" for their
  # byte); without a hint an automatic "fun_<suffix>" name is used.
  def generate_node(code, name_hint=nil, valid_encoding=nil)
    memo_key = [self, valid_encoding]
    if n = PreMemo[memo_key]
      return n
    end

    table = Array.new(0x100, :invalid)
    each_firstbyte(valid_encoding) {|byte, rest, rest_valid_encoding|
      if a = rest.empty_action
        table[byte] = a
      else
        name_hint2 = nil
        name_hint2 = "#{name_hint}_#{'%02X' % byte}" if name_hint
        table[byte] = "&" + rest.generate_node(code, name_hint2, rest_valid_encoding)
      end
    }

    if n = PostMemo[table]
      # Remember the hit so the table is not rebuilt for this map again.
      PreMemo[memo_key] = n
      return n
    end

    if !name_hint
      name_hint = "fun_" + NextName.dup
      NextName.succ!
    end

    PreMemo[memo_key] = PostMemo[table] = name_hint

    code << generate_lookup_node(name_hint, table)
    name_hint
  end
end
# Returns +map+ as an array of [key, value] pairs in which every Integer key
# or value (a code point) has been replaced by the upper-case hex text of its
# UTF-8 byte sequence.  Anything that is not an Integer passes through as is.
def encode_utf8(map)
  map.map {|src, dst|
    src = [src].pack("U").unpack("H*")[0].upcase if Integer === src
    dst = [dst].pack("U").unpack("H*")[0].upcase if Integer === dst
    [src, dst]
  }
end
# Compiles +map+ into C lookup tables rooted at a node named +name+.
#
# When ValidEncoding has an entry for +from+, that pattern is parsed and
# passed on as the valid-encoding set for node generation.
#
# Returns [name of the root node, generated C code, maximum input length].
def transcode_compile_tree(name, from, map)
  actions = {}
  encode_utf8(map).each {|pattern, action| actions[pattern] = action }
  am = ActionMap.parse(actions)

  max_input = am.max_input_length

  valid_pattern = ValidEncoding[from]
  valid_encoding = valid_pattern ? StrSet.parse(valid_pattern) : nil

  code = ''
  defined_name = am.generate_node(code, name, valid_encoding)
  [defined_name, code, max_input]
end
# Names of every rb_transcoder structure generated so far, in order.
TRANSCODERS = []

# Generates the C code for a converter from encoding +from+ to encoding +to+:
# the lookup tables for +map+ followed by the rb_transcoder definition.
# The transcoder's name is also appended to TRANSCODERS.
def transcode_tblgen(from, to, map)
  if VERBOSE_MODE
    STDERR.puts "converter from #{from} to #{to}"
  end

  # Encoding names reduced to characters usable in a C identifier.
  id_from = from.tr('^0-9A-Za-z', '_')
  id_to = to.tr('^0-9A-Za-z', '_')
  tree_name =
    if from == "UTF-8"
      "to_#{id_to}"
    elsif to == "UTF-8"
      "from_#{id_from}"
    else
      "from_#{id_from}_to_#{id_to}"
    end

  map = encode_utf8(map)
  real_tree_name, tree_code, max_input = transcode_compile_tree(tree_name, from, map)

  transcoder_name = "rb_#{tree_name}"
  TRANSCODERS << transcoder_name

  input_unit_length = UnitLength[from]
  # A string action holds two hex digits per output byte; any other action
  # counts as one byte.
  max_output = map.map {|_, action| String === action ? action.length/2 : 1 }.max

  transcoder_code = <<"End"
static const rb_transcoder
#{transcoder_name} = {
    #{c_esc from}, #{c_esc to}, &#{real_tree_name},
    #{input_unit_length}, /* input_unit_length */
    #{max_input}, /* max_input */
    #{max_output}, /* max_output */
    NULL, NULL, NULL, NULL, NULL, NULL
};
End
  tree_code + "\n" + transcoder_code
end
# Returns the C code produced by am.generate_node for +am+, using +name_hint+
# as the node name when one is given.
def transcode_generate_node(am, name_hint=nil)
  if VERBOSE_MODE
    STDERR.puts "converter for #{name_hint}"
  end
  generated = ''
  am.generate_node(generated, name_hint)
  generated
end
# Returns one rb_register_transcoder() call per entry of TRANSCODERS, each on
# its own indented line.
def transcode_register_code
  TRANSCODERS.inject('') {|code, transcoder_name|
    code << "    rb_register_transcoder(&#{transcoder_name});\n"
  }
end
# Input unit length per source encoding, as emitted into the
# input_unit_length field; encodings not listed here default to 1.
UnitLength = Hash.new(1)
UnitLength.update(
  'UTF-16BE' => 2,
  'UTF-16LE' => 2,
  'UTF-32BE' => 4,
  'UTF-32LE' => 4
)
# StrSet patterns describing the well-formed byte sequences of each source
# encoding.  Each whitespace-separated item is one alternative.
ValidEncoding = {
  '1byte'       => '{00-ff}',
  '2byte'       => '{00-ff}{00-ff}',
  '4byte'       => '{00-ff}{00-ff}{00-ff}{00-ff}',
  'US-ASCII'    => '{00-7f}',
  'UTF-8'       => '{00-7f}
                    {c2-df}{80-bf}
                         e0{a0-bf}{80-bf}
                    {e1-ec}{80-bf}{80-bf}
                         ed{80-9f}{80-bf}
                    {ee-ef}{80-bf}{80-bf}
                         f0{90-bf}{80-bf}{80-bf}
                    {f1-f3}{80-bf}{80-bf}{80-bf}
                         f4{80-8f}{80-bf}{80-bf}',
  'UTF-16BE'    => '{00-d7,e0-ff}{00-ff}
                    {d8-db}{00-ff}{dc-df}{00-ff}',
  'UTF-16LE'    => '{00-ff}{00-d7,e0-ff}
                    {00-ff}{d8-db}{00-ff}{dc-df}',
  'UTF-32BE'    => '0000{00-d7,e0-ff}{00-ff}
                    00{01-10}{00-ff}{00-ff}',
  'UTF-32LE'    => '{00-ff}{00-d7,e0-ff}0000
                    {00-ff}{00-ff}{01-10}00',
  'EUC-JP'      => '{00-7f}
                    {a1-fe}{a1-fe}
                    8e{a1-fe}
                    8f{a1-fe}{a1-fe}',
  'CP51932'     => '{00-7f}
                    {a1-fe}{a1-fe}
                    8e{a1-fe}',
  'Shift_JIS'   => '{00-7f}
                    {81-9f,e0-fc}{40-7e,80-fc}
                    {a1-df}',
  'EUC-KR'      => '{00-7f}
                    {a1-fe}{a1-fe}',
  'CP949'       => '{00-7f}
                    {81-fe}{41-5a,61-7a,81-fe}',
  'Big5'        => '{00-7f}
                    {81-fe}{40-7e,a1-fe}',
  'EUC-TW'      => '{00-7f}
                    {a1-fe}{a1-fe}
                    8e{a1-b0}{a1-fe}{a1-fe}',
  'GBK'         => '{00-80}
                    {81-fe}{40-7e,80-fe}',
  'GB18030'     => '{00-7f}
                    {81-fe}{40-7e,80-fe}
                    {81-fe}{30-39}{81-fe}{30-39}',
}

# Encodings that share the pattern of an entry above, grouped by that entry.
# fetch raises if the base entry is missing, so a typo cannot silently
# register nil.
{
  '1byte'     => %w[ASCII-8BIT
                    ISO-8859-1 ISO-8859-2 ISO-8859-3 ISO-8859-4 ISO-8859-5
                    ISO-8859-6 ISO-8859-7 ISO-8859-8 ISO-8859-9 ISO-8859-10
                    ISO-8859-11 ISO-8859-13 ISO-8859-14 ISO-8859-15],
  'Shift_JIS' => %w[Windows-31J],
}.each {|base, names|
  names.each {|name|
    ValidEncoding[name] = ValidEncoding.fetch(base)
  }
}
# Builds the signature text recorded for one input file: its name (dumped as
# a quoted string), the length of its contents and their String#sum checksum.
def make_signature(filename, src)
  fields = [
    "src=#{filename.dump}",
    "len=#{src.length}",
    "checksum=#{src.sum}",
  ]
  fields.join(", ")
end
# ---- command-line driver ----
#
# Usage: transcode-tblgen.rb [--verbose] [--force] [--output=FILE] SOURCE
#
# SOURCE is an ERB template; it is evaluated with this script's top-level
# binding, so the top-level locals below are visible to the template and
# their names are kept stable.

output_filename = nil
verbose_mode = false
force_mode = false

op = OptionParser.new
op.def_option("--help", "show help message") { puts op; exit 0 }
op.def_option("--verbose", "verbose mode") { verbose_mode = true }
op.def_option("--force", "force table generation") { force_mode = true }
op.def_option("--output=FILE", "specify output file") {|arg| output_filename = arg }
op.parse!

VERBOSE_MODE = verbose_mode

arg = ARGV.shift
unless arg
  # Without a source file there is nothing to generate; show the option
  # summary instead of failing later inside File.dirname(nil).
  STDERR.puts "no source file given"
  STDERR.puts op
  exit 1
end
dir = File.dirname(arg)
# Let the template require helper files that live next to it.
$:.unshift dir unless $:.include? dir
src = File.read(arg)
src.force_encoding("ascii-8bit") if src.respond_to? :force_encoding
this_script = File.read(__FILE__)
this_script.force_encoding("ascii-8bit") if this_script.respond_to? :force_encoding

# The output starts with a signature block naming this script and the source
# (length and checksum each), which is what the up-to-date check compares.
base_signature = "/* autogenerated. */\n"
base_signature << "/* #{make_signature(File.basename(__FILE__), this_script)} */\n"
base_signature << "/* #{make_signature(File.basename(arg), src)} */\n"

if !force_mode && output_filename && File.readable?(output_filename)
  # Paragraph-mode gets reads the leading signature block; it returns nil for
  # an empty file, which is treated as "no signature" (forces regeneration).
  old_signature = File.open(output_filename) {|f| f.gets("").to_s.chomp }
  chk_signature = base_signature.dup
  # Recompute the signature of every other file named in the old block so a
  # change in any of them is detected as well.
  old_signature.each_line {|line|
    if %r{/\* src="([0-9a-z_.-]+)",} =~ line
      name = $1
      next if name == File.basename(arg) || name == File.basename(__FILE__)
      path = File.join(dir, name)
      if File.readable? path
        chk_signature << "/* #{make_signature(name, File.read(path))} */\n"
      end
    end
  }
  if old_signature == chk_signature
    now = Time.now
    File.utime(now, now, output_filename)
    STDERR.puts "already up-to-date: #{output_filename}" if VERBOSE_MODE
    exit
  end
end

if VERBOSE_MODE
  if output_filename
    STDERR.puts "generating #{output_filename} ..."
  end
end

# Whatever the template requires while it runs shows up as new entries in $".
libs1 = $".dup
erb_result = ERB.new(src, nil, '%').result(binding)
libs2 = $".dup

libs = libs2 - libs1
lib_sigs = ''
libs.each {|lib|
  lib = File.basename(lib)
  path = File.join(dir, lib)
  # Only files found next to the source are recorded in the signature.
  if File.readable? path
    lib_sigs << "/* #{make_signature(lib, File.read(path))} */\n"
  end
}

result = ''
result << base_signature
result << lib_sigs
result << "\n"
result << erb_result
result << "\n"

if output_filename
  # Write to a temporary name and rename, so an interrupted run does not
  # leave a truncated file under the real output name.
  new_filename = output_filename + ".new"
  FileUtils.mkdir_p(File.dirname(output_filename))
  File.open(new_filename, "wb") {|f| f << result }
  File.rename(new_filename, output_filename)
  STDERR.puts "done." if VERBOSE_MODE
else
  print result
end
634   print result