2 * utf_validate.c: Validate a UTF-8 string
4 * ====================================================================
5 * Copyright (c) 2004 CollabNet. All rights reserved.
7 * This software is licensed as described in the file COPYING, which
8 * you should have received as part of this distribution. The terms
9 * are also available at http://subversion.tigris.org/license-1.html.
10 * If newer versions of this license are posted there, you may use a
11 * newer version instead, at your option.
13 * This software consists of voluntary contributions made by many
14 * individuals. For exact contribution history, see the revision
15 * history and logs, available at http://subversion.tigris.org/.
16 * ====================================================================
19 /* Validate a UTF-8 string according to the rules in
21 * Table 3-6. Well-Formed UTF-8 Bytes Sequences
25 * The Unicode Standard, Version 4.0
27 * which is available at
29 * http://www.unicode.org/
31 * UTF-8 was originally defined in RFC-2279, Unicode's "well-formed UTF-8"
32 * is a subset of that enconding. The Unicode enconding prohibits things
33 * like non-shortest encodings (some characters can be represented by more
34 * than one multi-byte encoding) and the encodings for the surrogate code
35 * points. RFC-3629 superceeds RFC-2279 and adopts the same well-formed
36 * rules as Unicode. This is the ABNF in RFC-3629 that describes
37 * well-formed UTF-8 rules:
39 * UTF8-octets = *( UTF8-char )
40 * UTF8-char = UTF8-1 / UTF8-2 / UTF8-3 / UTF8-4
42 * UTF8-2 = %xC2-DF UTF8-tail
43 * UTF8-3 = %xE0 %xA0-BF UTF8-tail /
44 * %xE1-EC 2( UTF8-tail ) /
45 * %xED %x80-9F UTF8-tail /
46 * %xEE-EF 2( UTF8-tail )
47 * UTF8-4 = %xF0 %x90-BF 2( UTF8-tail ) /
48 * %xF1-F3 3( UTF8-tail ) /
49 * %xF4 %x80-8F 2( UTF8-tail )
56 /* Lookup table to categorise each octet in the string. */
57 static const char octet_category
[256] = {
58 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x00-0x7f */
59 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
60 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
61 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
62 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
63 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
64 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
65 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
66 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x80-0x8f */
67 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 0x90-0x9f */
68 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* 0xa0-0xbf */
69 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
71 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, /* 0xc2-0xdf */
72 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
74 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, /* 0xe1-0xec */
78 11, 11, 11, /* 0xf1-0xf3 */
80 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13 /* 0xf5-0xff */
87 #define FSM_80BF80BF 3
90 #define FSM_80BF80BF80BF 6
94 /* In the FSM it appears that categories 0xc0-0xc1 and 0xf5-0xff make the
95 same transitions, as do categories 0xe1-0xec and 0xee-0xef. I wonder if
96 there is any great benefit in combining categories? It would reduce the
97 memory footprint of the transition table by 16 bytes, but might it be
98 harder to understand? */
100 /* Machine transition table */
101 static const char machine
[9][14] = {
103 {FSM_START
, /* 0x00-0x7f */
104 FSM_ERROR
, /* 0x80-0x8f */
105 FSM_ERROR
, /* 0x90-0x9f */
106 FSM_ERROR
, /* 0xa0-0xbf */
107 FSM_ERROR
, /* 0xc0-0xc1 */
108 FSM_80BF
, /* 0xc2-0xdf */
110 FSM_80BF80BF
, /* 0xe1-0xec */
112 FSM_80BF80BF
, /* 0xee-0xef */
114 FSM_80BF80BF80BF
, /* 0xf1-0xf3 */
116 FSM_ERROR
}, /* 0xf5-0xff */
119 {FSM_ERROR
, /* 0x00-0x7f */
120 FSM_START
, /* 0x80-0x8f */
121 FSM_START
, /* 0x90-0x9f */
122 FSM_START
, /* 0xa0-0xbf */
123 FSM_ERROR
, /* 0xc0-0xc1 */
124 FSM_ERROR
, /* 0xc2-0xdf */
125 FSM_ERROR
, /* 0xe0 */
126 FSM_ERROR
, /* 0xe1-0xec */
127 FSM_ERROR
, /* 0xed */
128 FSM_ERROR
, /* 0xee-0xef */
129 FSM_ERROR
, /* 0xf0 */
130 FSM_ERROR
, /* 0xf1-0xf3 */
131 FSM_ERROR
, /* 0xf4 */
132 FSM_ERROR
}, /* 0xf5-0xff */
135 {FSM_ERROR
, /* 0x00-0x7f */
136 FSM_ERROR
, /* 0x80-0x8f */
137 FSM_ERROR
, /* 0x90-0x9f */
138 FSM_80BF
, /* 0xa0-0xbf */
139 FSM_ERROR
, /* 0xc0-0xc1 */
140 FSM_ERROR
, /* 0xc2-0xdf */
141 FSM_ERROR
, /* 0xe0 */
142 FSM_ERROR
, /* 0xe1-0xec */
143 FSM_ERROR
, /* 0xed */
144 FSM_ERROR
, /* 0xee-0xef */
145 FSM_ERROR
, /* 0xf0 */
146 FSM_ERROR
, /* 0xf1-0xf3 */
147 FSM_ERROR
, /* 0xf4 */
148 FSM_ERROR
}, /* 0xf5-0xff */
151 {FSM_ERROR
, /* 0x00-0x7f */
152 FSM_80BF
, /* 0x80-0x8f */
153 FSM_80BF
, /* 0x90-0x9f */
154 FSM_80BF
, /* 0xa0-0xbf */
155 FSM_ERROR
, /* 0xc0-0xc1 */
156 FSM_ERROR
, /* 0xc2-0xdf */
157 FSM_ERROR
, /* 0xe0 */
158 FSM_ERROR
, /* 0xe1-0xec */
159 FSM_ERROR
, /* 0xed */
160 FSM_ERROR
, /* 0xee-0xef */
161 FSM_ERROR
, /* 0xf0 */
162 FSM_ERROR
, /* 0xf1-0xf3 */
163 FSM_ERROR
, /* 0xf4 */
164 FSM_ERROR
}, /* 0xf5-0xff */
167 {FSM_ERROR
, /* 0x00-0x7f */
168 FSM_80BF
, /* 0x80-0x8f */
169 FSM_80BF
, /* 0x90-0x9f */
170 FSM_ERROR
, /* 0xa0-0xbf */
171 FSM_ERROR
, /* 0xc0-0xc1 */
172 FSM_ERROR
, /* 0xc2-0xdf */
173 FSM_ERROR
, /* 0xe0 */
174 FSM_ERROR
, /* 0xe1-0xec */
175 FSM_ERROR
, /* 0xed */
176 FSM_ERROR
, /* 0xee-0xef */
177 FSM_ERROR
, /* 0xf0 */
178 FSM_ERROR
, /* 0xf1-0xf3 */
179 FSM_ERROR
, /* 0xf4 */
180 FSM_ERROR
}, /* 0xf5-0xff */
183 {FSM_ERROR
, /* 0x00-0x7f */
184 FSM_ERROR
, /* 0x80-0x8f */
185 FSM_80BF80BF
, /* 0x90-0x9f */
186 FSM_80BF80BF
, /* 0xa0-0xbf */
187 FSM_ERROR
, /* 0xc0-0xc1 */
188 FSM_ERROR
, /* 0xc2-0xdf */
189 FSM_ERROR
, /* 0xe0 */
190 FSM_ERROR
, /* 0xe1-0xec */
191 FSM_ERROR
, /* 0xed */
192 FSM_ERROR
, /* 0xee-0xef */
193 FSM_ERROR
, /* 0xf0 */
194 FSM_ERROR
, /* 0xf1-0xf3 */
195 FSM_ERROR
, /* 0xf4 */
196 FSM_ERROR
}, /* 0xf5-0xff */
198 /* FSM_80BF80BF80BF */
199 {FSM_ERROR
, /* 0x00-0x7f */
200 FSM_80BF80BF
, /* 0x80-0x8f */
201 FSM_80BF80BF
, /* 0x90-0x9f */
202 FSM_80BF80BF
, /* 0xa0-0xbf */
203 FSM_ERROR
, /* 0xc0-0xc1 */
204 FSM_ERROR
, /* 0xc2-0xdf */
205 FSM_ERROR
, /* 0xe0 */
206 FSM_ERROR
, /* 0xe1-0xec */
207 FSM_ERROR
, /* 0xed */
208 FSM_ERROR
, /* 0xee-0xef */
209 FSM_ERROR
, /* 0xf0 */
210 FSM_ERROR
, /* 0xf1-0xf3 */
211 FSM_ERROR
, /* 0xf4 */
212 FSM_ERROR
}, /* 0xf5-0xff */
215 {FSM_ERROR
, /* 0x00-0x7f */
216 FSM_80BF80BF
, /* 0x80-0x8f */
217 FSM_ERROR
, /* 0x90-0x9f */
218 FSM_ERROR
, /* 0xa0-0xbf */
219 FSM_ERROR
, /* 0xc0-0xc1 */
220 FSM_ERROR
, /* 0xc2-0xdf */
221 FSM_ERROR
, /* 0xe0 */
222 FSM_ERROR
, /* 0xe1-0xec */
223 FSM_ERROR
, /* 0xed */
224 FSM_ERROR
, /* 0xee-0xef */
225 FSM_ERROR
, /* 0xf0 */
226 FSM_ERROR
, /* 0xf1-0xf3 */
227 FSM_ERROR
, /* 0xf4 */
228 FSM_ERROR
}, /* 0xf5-0xff */
231 {FSM_ERROR
, /* 0x00-0x7f */
232 FSM_ERROR
, /* 0x80-0x8f */
233 FSM_ERROR
, /* 0x90-0x9f */
234 FSM_ERROR
, /* 0xa0-0xbf */
235 FSM_ERROR
, /* 0xc0-0xc1 */
236 FSM_ERROR
, /* 0xc2-0xdf */
237 FSM_ERROR
, /* 0xe0 */
238 FSM_ERROR
, /* 0xe1-0xec */
239 FSM_ERROR
, /* 0xed */
240 FSM_ERROR
, /* 0xee-0xef */
241 FSM_ERROR
, /* 0xf0 */
242 FSM_ERROR
, /* 0xf1-0xf3 */
243 FSM_ERROR
, /* 0xf4 */
244 FSM_ERROR
}, /* 0xf5-0xff */
249 svn_utf__last_valid(const char *data
, apr_size_t len
)
251 const char *start
= data
, *end
= data
+ len
;
252 int state
= FSM_START
;
255 unsigned char octet
= *data
++;
256 int category
= octet_category
[octet
];
257 state
= machine
[state
][category
];
258 if (state
== FSM_START
)
265 svn_utf__cstring_is_valid(const char *data
)
267 int state
= FSM_START
;
270 unsigned char octet
= *data
++;
271 int category
= octet_category
[octet
];
272 state
= machine
[state
][category
];
274 return state
== FSM_START
? TRUE
: FALSE
;
278 svn_utf__is_valid(const char *data
, apr_size_t len
)
280 const char *end
= data
+ len
;
281 int state
= FSM_START
;
284 unsigned char octet
= *data
++;
285 int category
= octet_category
[octet
];
286 state
= machine
[state
][category
];
288 return state
== FSM_START
? TRUE
: FALSE
;
292 svn_utf__last_valid2(const char *data
, apr_size_t len
)
294 const char *start
= data
, *end
= data
+ len
;
295 int state
= FSM_START
;
298 unsigned char octet
= *data
++;
304 else if (octet
<= 0xC1)
306 else if (octet
<= 0xDF)
308 else if (octet
== 0xE0)
310 else if (octet
<= 0xEC)
311 state
= FSM_80BF80BF
;
312 else if (octet
== 0xED)
314 else if (octet
<= 0xEF)
315 state
= FSM_80BF80BF
;
316 else if (octet
== 0xF0)
318 else if (octet
<= 0xF3)
319 state
= FSM_80BF80BF80BF
;
320 else if (octet
<= 0xF4)
326 if (octet
>= 0x80 && octet
<= 0xBF)
332 if (octet
>= 0xA0 && octet
<= 0xBF)
338 if (octet
>= 0x80 && octet
<= 0xBF)
344 if (octet
>= 0x80 && octet
<= 0x9F)
350 if (octet
>= 0x90 && octet
<= 0xBF)
351 state
= FSM_80BF80BF
;
355 case FSM_80BF80BF80BF
:
356 if (octet
>= 0x80 && octet
<= 0xBF)
357 state
= FSM_80BF80BF
;
362 if (octet
>= 0x80 && octet
<= 0x8F)
363 state
= FSM_80BF80BF
;
371 if (state
== FSM_START
)