Bug 468575 - Scrape some gunk off the config/ grout, r=ted
[wine-gecko.git] / intl / chardet / tools / genutf8.pl
blob17d29f57583fd82257a4ff1849de427615918717
1 #!/usr/local/bin/perl
2 use strict;
3 require "genverifier.pm";
4 use genverifier;
# Character-class ranges, one [ first_byte, last_byte, class ] triple each.
my @utf8_cls;
# Flattened state-transition table: 16 class columns per state row.
my @utf8_st;
# Value returned by genverifier::GenVerifier; printed at the end of the script.
my $utf8_ver;
13 # UTF8 encode the UCS4 into 1 to 6 bytes
15 # 1 byte 00 00 00 00 00 00 00 7f
16 # 2 bytes 00 00 00 80 00 00 07 ff
17 # 3 bytes 00 00 08 00 00 00 ff ff
18 # 4 bytes 00 01 00 00 00 1f ff ff
19 # 5 bytes 00 20 00 00 03 ff ff ff
20 # 6 bytes 04 00 00 00 7f ff ff ff
22 # However, since the surrogate area should not be encoded into UTF8 as
23 # a surrogate pair, we can remove the surrogate area (d800-dfff) from UTF8
25 # 1 byte 00 00 00 00 00 00 00 7f
26 # 2 bytes 00 00 00 80 00 00 07 ff
27 # 3 bytes 00 00 08 00 00 00 d7 ff
28 # 00 00 e0 00 00 00 ff ff
29 # 4 bytes 00 01 00 00 00 1f ff ff
30 # 5 bytes 00 20 00 00 03 ff ff ff
31 # 6 bytes 04 00 00 00 7f ff ff ff
33 # Now we break them into 6-bit groups for 2-6 byte UTF8
35 # 1 byte 00 7f
36 # 2 bytes 02 00 1f 3f
37 # 3 bytes 00 20 00 0d 1f 3f
38 # 0e 00 00 0f 3f 3f
39 # 4 bytes 00 20 00 00 07 3f 3f 3f
40 # 5 bytes 00 08 00 00 00 03 3f 3f 3f 3f
41 # 6 bytes 00 04 00 00 00 00 01 3f 3f 3f 3f 3f
43 # Break down more
45 # 1 byte 00 7f
46 # 2 bytes 02 00 1f 3f
47 # 3 bytes 00 20 00 00 3f 3f
48 # 01 00 00 0c 3f 3f
49 # 0d 00 00 0d 1f 3f
50 # 0e 00 00 0f 3f 3f
51 # 4 bytes 00 20 00 00 00 3f 3f 3f
52 # 01 00 00 00 07 3f 3f 3f
53 # 5 bytes 00 08 00 00 00 00 3f 3f 3f 3f
54 # 01 00 00 00 00 03 3f 3f 3f 3f
55 # 6 bytes 00 04 00 00 00 00 00 3f 3f 3f 3f 3f
56 # 01 00 00 00 00 00 01 3f 3f 3f 3f 3f
58 # Now, add
59 # c0 to the lead byte of 2 bytes UTF8
60 # e0 to the lead byte of 3 bytes UTF8
61 # f0 to the lead byte of 4 bytes UTF8
62 # f8 to the lead byte of 5 bytes UTF8
63 # fc to the lead byte of 6 bytes UTF8
64 # 80 to the trail bytes of 2 - 6 bytes UTF8
66 # 1 byte 00 7f
67 # 2 bytes c2 80 df bf
68 # 3 bytes e0 a0 80 e0 bf bf
69 # e1 80 80 ec bf bf
70 # ed 80 80 ed 9f bf
71 # ee 80 80 ef bf bf
72 # 4 bytes f0 a0 80 80 f0 bf bf bf
73 # f1 80 80 80 f7 bf bf bf
74 # 5 bytes f8 88 80 80 80 f8 bf bf bf bf
75 # f9 80 80 80 80 fb bf bf bf bf
76 # 6 bytes fc 84 80 80 80 80 fc bf bf bf bf bf
77 # fd 80 80 80 80 80 fd bf bf bf bf bf
80 # Now we can construct our state diagram
82 # 0:0x0e,0x0f,0x1b->Error
83 # 0:[0-0x7f]->0
84 # 0:fd->3
85 # 0:fc->4
86 # 0:[f9-fb]->5
87 # 0:f8->6
88 # 0:[f1-f7]->7
89 # 0:f0->8
90 # 0:[e1-ec,ee-ef]->9
91 # 0:e0->10
92 # 0:ed->11
93 # 0:[c2-df]->12
94 # 0:*->Error
95 # 3:[80-bf]->5
96 # 3:*->Error
97 # 4:[84-bf]->5
98 # 4:*->Error
99 # 5:[80-bf]->7
100 # 5:*->Error
101 # 6:[88-bf]->7
102 # 6:*->Error
103 # 7:[80-bf]->9
104 # 7:*->Error
105 # 8:[a0-bf]->9
106 # 8:*->Error
107 # 9:[80-bf]->12
108 # 9:*->Error
109 # 10:[a0-bf]->12
110 # 10:*->Error
111 # 11:[80-9f]->12
112 # 11:*->Error
113 # 12:[80-bf]->0
114 # 12:*->Error
116 # Now, we classify the chars into classes
118 # 0e,0f,1b:k0
119 # 00-0d,10-1a,1c-7f:k1
120 # 80-83:k2
121 # 84-87:k3
122 # 88-9f:k4
123 # a0-bf:k5
124 # c0-c1:k0
125 # c2-df:k6
126 # e0:k7
127 # e1-ec:k8
128 # ed:k9
129 # ee-ef:k8
130 # f0:k10
131 # f1-f7:k11
132 # f8:k12
133 # f9-fb:k13
134 # fc:k14
135 # fd:k15
136 # fe-ff:k0
138 # Now, let's put them into array form
# Byte-value -> character-class map for the UTF-8 verifier.
#
# Each entry is [ first_byte, last_byte, class ]; both bounds are inclusive
# (single-byte ranges such as 0xe0 repeat the same value twice).  The 23
# ranges are pairwise disjoint and together cover every value 0x00-0xff.
#
# Class 0 collects the bytes that the Start state rejects (0x0e, 0x0f, 0x1b,
# 0xc0-0xc1, 0xfe-0xff).  0x00 is listed on its own and assigned class 1,
# the same class as the other accepted single-byte values.
#
# The entry order is kept exactly as it was; NOTE(review): whether
# genverifier::GenVerifier depends on that order is not visible from here.
@utf8_cls = (
    [ 0x00, 0x00,  1 ],    # NUL, accepted like the rest of 00-7f
    [ 0x0e, 0x0f,  0 ],    # rejected
    [ 0x1b, 0x1b,  0 ],    # rejected
    [ 0x01, 0x0d,  1 ],    # single-byte values
    [ 0x10, 0x1a,  1 ],
    [ 0x1c, 0x7f,  1 ],
    [ 0x80, 0x83,  2 ],    # trail bytes, split where lead-byte limits differ
    [ 0x84, 0x87,  3 ],
    [ 0x88, 0x9f,  4 ],
    [ 0xa0, 0xbf,  5 ],
    [ 0xc0, 0xc1,  0 ],    # rejected
    [ 0xc2, 0xdf,  6 ],    # lead byte, 2-byte sequence
    [ 0xe0, 0xe0,  7 ],    # lead byte, 3-byte sequence, next byte a0-bf
    [ 0xe1, 0xec,  8 ],    # lead byte, 3-byte sequence
    [ 0xed, 0xed,  9 ],    # lead byte, 3-byte sequence, next byte 80-9f
    [ 0xee, 0xef,  8 ],    # lead byte, 3-byte sequence
    [ 0xf0, 0xf0, 10 ],    # lead byte, 4-byte sequence, next byte a0-bf
    [ 0xf1, 0xf7, 11 ],    # lead byte, 4-byte sequence
    [ 0xf8, 0xf8, 12 ],    # lead byte, 5-byte sequence, next byte 88-bf
    [ 0xf9, 0xfb, 13 ],    # lead byte, 5-byte sequence
    [ 0xfc, 0xfc, 14 ],    # lead byte, 6-byte sequence, next byte 84-bf
    [ 0xfd, 0xfd, 15 ],    # lead byte, 6-byte sequence
    [ 0xfe, 0xff,  0 ],    # rejected
);
166 # Now, we rewrite the state diagram in terms of classes
168 # 0:k0->Error
169 # 0:k1->0
170 # 0:k15->3
171 # 0:k14->4
172 # 0:k13->5
173 # 0:k12->6
174 # 0:k11->7
175 # 0:k10->8
176 # 0:k8->9
177 # 0:k7->10
178 # 0:k9->11
179 # 0:k6->12
180 # 0:*->Error
181 # 3:k2,k3,k4,k5->5
182 # 3:*->Error
183 # 4:k3,k4,k5->5
184 # 4:*->Error
185 # 5:k2,k3,k4,k5->7
186 # 5:*->Error
187 # 6:k4,k5->7
188 # 6:*->Error
189 # 7:k2,k3,k4,k5->9
190 # 7:*->Error
191 # 8:k5->9
192 # 8:*->Error
193 # 9:k2,k3,k4,k5->12
194 # 9:*->Error
195 # 10:k5->12
196 # 10:*->Error
197 # 11:k2,k3,k4->12
198 # 11:*->Error
199 # 12:k2,k3,k4,k5->0
200 # 12:*->Error
202 # Now, let's put them into array
# NOTE(review): the script switches to package genverifier here.  @utf8_st is
# declared with "my" near the top of the script, so this assignment is not
# affected by the package switch -- confirm whether the switch is still needed.
package genverifier;

# State-transition table for the UTF-8 verifier, stored as one flat list.
# Each row is one state and each of its 16 columns is one character class
# (k0-k15); the value is the state to move to.  State 0 is Start, state 1 is
# Error and state 2 is ItsMe; states 3-12 track how many trail bytes are
# still expected and which range the next trail byte must fall in.
@utf8_st = (
#   k0  k1  k2  k3  k4  k5  k6  k7  k8  k9 k10 k11 k12 k13 k14 k15
     1,  0,  1,  1,  1,  1, 12, 10,  9, 11,  8,  7,  6,  5,  4,  3,  # state 0  Start
     1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  # state 1  Error
     2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  # state 2  ItsMe
     1,  1,  5,  5,  5,  5,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  # state 3
     1,  1,  1,  5,  5,  5,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  # state 4
     1,  1,  7,  7,  7,  7,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  # state 5
     1,  1,  1,  1,  7,  7,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  # state 6
     1,  1,  9,  9,  9,  9,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  # state 7
     1,  1,  1,  1,  1,  9,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  # state 8
     1,  1, 12, 12, 12, 12,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  # state 9
     1,  1,  1,  1,  1, 12,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  # state 10
     1,  1, 12, 12, 12,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  # state 11
     1,  1,  0,  0,  0,  0,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  # state 12
);
# Hand the class ranges and the state table to genverifier::GenVerifier and
# print whatever it returns.
$utf8_ver = genverifier::GenVerifier(
    "UTF8",
    "UTF-8",
    \@utf8_cls,
    16,    # matches the 16 class columns (k0-k15) per state row;
           # presumably the class count GenVerifier expects -- confirm
           # against genverifier.pm
    \@utf8_st,
);
print $utf8_ver;