3 require "genverifier.pm";
13 # UTF8 encode the UCS4 into 1 to 6 bytes
15 # 1 byte 00 00 00 00 00 00 00 7f
16 # 2 bytes 00 00 00 80 00 00 07 ff
17 # 3 bytes 00 00 08 00 00 00 ff ff
18 # 4 bytes 00 01 00 00 00 1f ff ff
19 # 5 bytes 00 20 00 00 03 ff ff ff
20 # 6 bytes 04 00 00 00 7f ff ff ff
22 # However, since the surrogate area should not be encoded into UTF8 as
23 # a surrogate pair, we can remove the surrogate area from UTF8
25 # 1 byte 00 00 00 00 00 00 00 7f
26 # 2 bytes 00 00 00 80 00 00 07 ff
27 # 3 bytes 00 00 08 00 00 00 d7 ff
28 # 00 00 e0 00 00 00 ff ff
29 # 4 bytes 00 01 00 00 00 1f ff ff
30 # 5 bytes 00 20 00 00 03 ff ff ff
31 # 6 bytes 04 00 00 00 7f ff ff ff
33 # Now we break them into 6-bit groups for 2-6 byte UTF8
37 # 3 bytes 00 20 00 0d 1f 3f
39 # 4 bytes 00 10 00 00 07 3f 3f 3f
40 # 5 bytes 00 08 00 00 00 03 3f 3f 3f 3f
41 # 6 bytes 00 04 00 00 00 00 01 3f 3f 3f 3f 3f
47 # 3 bytes 00 20 00 00 3f 3f
51 # 4 bytes 00 10 00 00 00 3f 3f 3f
52 # 01 00 00 00 07 3f 3f 3f
53 # 5 bytes 00 08 00 00 00 00 3f 3f 3f 3f
54 # 01 00 00 00 00 03 3f 3f 3f 3f
55 # 6 bytes 00 04 00 00 00 00 00 3f 3f 3f 3f 3f
56 # 01 00 00 00 00 00 01 3f 3f 3f 3f 3f
59 # c0 to the lead byte of 2 bytes UTF8
60 # e0 to the lead byte of 3 bytes UTF8
61 # f0 to the lead byte of 4 bytes UTF8
62 # f8 to the lead byte of 5 bytes UTF8
63 # fc to the lead byte of 6 bytes UTF8
64 # 80 to the trail bytes of 2 - 6 bytes UTF8
68 # 3 bytes e0 a0 80 e0 bf bf
72 # 4 bytes f0 90 80 80 f0 bf bf bf
73 # f1 80 80 80 f7 bf bf bf
74 # 5 bytes f8 88 80 80 80 f8 bf bf bf bf
75 # f9 80 80 80 80 fb bf bf bf bf
76 # 6 bytes fc 84 80 80 80 80 fc bf bf bf bf bf
77 # fd 80 80 80 80 80 fd bf bf bf bf bf
80 # Now we can construct our state diagram
82 # 0:0x00,0x0e,0x0f,0x1b->Error
116 # Now we classify the chars into classes
119 # 01-0d,10-1a,1c-7f:k1
138 # Now, let's put them into array form
157 [ 0xf0 , 0xf0 , 10 ],
158 [ 0xf1 , 0xf7 , 11 ],
159 [ 0xf8 , 0xf8 , 12 ],
160 [ 0xf9 , 0xfb , 13 ],
161 [ 0xfc , 0xfc , 14 ],
162 [ 0xfd , 0xfd , 15 ],
166 # Now we write the state diagram in terms of classes
202 # Now, let's put them into array
206 # 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
207 1, 0, 1, 1, 1, 1,12,10, 9,11, 8, 7, 6, 5, 4, 3, # state 0 Start
208 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # state 1 Error
209 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, # state 2 ItsMe
210 1, 1, 5, 5, 5, 5, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # state 3
211 1, 1, 1, 5, 5, 5, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # state 4
212 1, 1, 7, 7, 7, 7, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # state 5
213 1, 1, 1, 1, 7, 7, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # state 6
214 1, 1, 9, 9, 9, 9, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # state 7
215 1, 1, 1, 1, 1, 9, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # state 8
216 1, 1,12,12,12,12, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # state 9
217 1, 1, 1, 1, 1,12, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # state 10
218 1, 1,12,12,12, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # state 11
219 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # state 12
# Build the UTF-8 verifier by handing the byte-class table (@utf8_cls) and
# the state-transition table (@utf8_st) to genverifier::GenVerifier and keep
# the result in $utf8_ver.
#
# The package-qualified sub name must be written as one token: with
# "genverifier" and "::GenVerifier" on separate lines Perl sees two words
# and no longer calls genverifier::GenVerifier.
#
# NOTE(review): 16 presumably is the number of character classes (the state
# table has columns 0-15); the exact meaning of each argument is defined in
# genverifier.pm, which is not visible here -- confirm against that module.
$utf8_ver = genverifier::GenVerifier(
    "UTF8", "UTF-8",
    \@utf8_cls, 16,
    \@utf8_st
);