* subversion/libsvn_subr/validate.c
[svn.git] / subversion / libsvn_subr / utf_validate.c
blobf77a73b99c31079b3e5f70c996cfbfa2297f6ebe
1 /*
2 * utf_validate.c: Validate a UTF-8 string
4 * ====================================================================
5 * Copyright (c) 2004 CollabNet. All rights reserved.
7 * This software is licensed as described in the file COPYING, which
8 * you should have received as part of this distribution. The terms
9 * are also available at http://subversion.tigris.org/license-1.html.
10 * If newer versions of this license are posted there, you may use a
11 * newer version instead, at your option.
13 * This software consists of voluntary contributions made by many
14 * individuals. For exact contribution history, see the revision
15 * history and logs, available at http://subversion.tigris.org/.
16 * ====================================================================
19 /* Validate a UTF-8 string according to the rules in
21 * Table 3-6. Well-Formed UTF-8 Bytes Sequences
23 * in
25 * The Unicode Standard, Version 4.0
27 * which is available at
29 * http://www.unicode.org/
31 * UTF-8 was originally defined in RFC-2279, Unicode's "well-formed UTF-8"
32 * is a subset of that enconding. The Unicode enconding prohibits things
33 * like non-shortest encodings (some characters can be represented by more
34 * than one multi-byte encoding) and the encodings for the surrogate code
35 * points. RFC-3629 superceeds RFC-2279 and adopts the same well-formed
36 * rules as Unicode. This is the ABNF in RFC-3629 that describes
37 * well-formed UTF-8 rules:
39 * UTF8-octets = *( UTF8-char )
40 * UTF8-char = UTF8-1 / UTF8-2 / UTF8-3 / UTF8-4
41 * UTF8-1 = %x00-7F
42 * UTF8-2 = %xC2-DF UTF8-tail
43 * UTF8-3 = %xE0 %xA0-BF UTF8-tail /
44 * %xE1-EC 2( UTF8-tail ) /
45 * %xED %x80-9F UTF8-tail /
46 * %xEE-EF 2( UTF8-tail )
47 * UTF8-4 = %xF0 %x90-BF 2( UTF8-tail ) /
48 * %xF1-F3 3( UTF8-tail ) /
49 * %xF4 %x80-8F 2( UTF8-tail )
50 * UTF8-tail = %x80-BF
54 #include "utf_impl.h"
56 /* Lookup table to categorise each octet in the string. */
57 static const char octet_category[256] = {
58 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x00-0x7f */
59 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
60 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
61 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
62 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
63 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
64 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
65 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
66 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x80-0x8f */
67 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 0x90-0x9f */
68 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* 0xa0-0xbf */
69 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
70 4, 4, /* 0xc0-0xc1 */
71 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, /* 0xc2-0xdf */
72 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
73 6, /* 0xe0 */
74 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, /* 0xe1-0xec */
75 8, /* 0xed */
76 9, 9, /* 0xee-0xef */
77 10, /* 0xf0 */
78 11, 11, 11, /* 0xf1-0xf3 */
79 12, /* 0xf4 */
80 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13 /* 0xf5-0xff */
83 /* Machine states */
84 #define FSM_START 0
85 #define FSM_80BF 1
86 #define FSM_A0BF 2
87 #define FSM_80BF80BF 3
88 #define FSM_809F 4
89 #define FSM_90BF 5
90 #define FSM_80BF80BF80BF 6
91 #define FSM_808F 7
92 #define FSM_ERROR 8
94 /* In the FSM it appears that categories 0xc0-0xc1 and 0xf5-0xff make the
95 same transitions, as do categories 0xe1-0xec and 0xee-0xef. I wonder if
96 there is any great benefit in combining categories? It would reduce the
97 memory footprint of the transition table by 16 bytes, but might it be
98 harder to understand? */
100 /* Machine transition table */
101 static const char machine [9][14] = {
102 /* FSM_START */
103 {FSM_START, /* 0x00-0x7f */
104 FSM_ERROR, /* 0x80-0x8f */
105 FSM_ERROR, /* 0x90-0x9f */
106 FSM_ERROR, /* 0xa0-0xbf */
107 FSM_ERROR, /* 0xc0-0xc1 */
108 FSM_80BF, /* 0xc2-0xdf */
109 FSM_A0BF, /* 0xe0 */
110 FSM_80BF80BF, /* 0xe1-0xec */
111 FSM_809F, /* 0xed */
112 FSM_80BF80BF, /* 0xee-0xef */
113 FSM_90BF, /* 0xf0 */
114 FSM_80BF80BF80BF, /* 0xf1-0xf3 */
115 FSM_808F, /* 0xf4 */
116 FSM_ERROR}, /* 0xf5-0xff */
118 /* FSM_80BF */
119 {FSM_ERROR, /* 0x00-0x7f */
120 FSM_START, /* 0x80-0x8f */
121 FSM_START, /* 0x90-0x9f */
122 FSM_START, /* 0xa0-0xbf */
123 FSM_ERROR, /* 0xc0-0xc1 */
124 FSM_ERROR, /* 0xc2-0xdf */
125 FSM_ERROR, /* 0xe0 */
126 FSM_ERROR, /* 0xe1-0xec */
127 FSM_ERROR, /* 0xed */
128 FSM_ERROR, /* 0xee-0xef */
129 FSM_ERROR, /* 0xf0 */
130 FSM_ERROR, /* 0xf1-0xf3 */
131 FSM_ERROR, /* 0xf4 */
132 FSM_ERROR}, /* 0xf5-0xff */
134 /* FSM_A0BF */
135 {FSM_ERROR, /* 0x00-0x7f */
136 FSM_ERROR, /* 0x80-0x8f */
137 FSM_ERROR, /* 0x90-0x9f */
138 FSM_80BF, /* 0xa0-0xbf */
139 FSM_ERROR, /* 0xc0-0xc1 */
140 FSM_ERROR, /* 0xc2-0xdf */
141 FSM_ERROR, /* 0xe0 */
142 FSM_ERROR, /* 0xe1-0xec */
143 FSM_ERROR, /* 0xed */
144 FSM_ERROR, /* 0xee-0xef */
145 FSM_ERROR, /* 0xf0 */
146 FSM_ERROR, /* 0xf1-0xf3 */
147 FSM_ERROR, /* 0xf4 */
148 FSM_ERROR}, /* 0xf5-0xff */
150 /* FSM_80BF80BF */
151 {FSM_ERROR, /* 0x00-0x7f */
152 FSM_80BF, /* 0x80-0x8f */
153 FSM_80BF, /* 0x90-0x9f */
154 FSM_80BF, /* 0xa0-0xbf */
155 FSM_ERROR, /* 0xc0-0xc1 */
156 FSM_ERROR, /* 0xc2-0xdf */
157 FSM_ERROR, /* 0xe0 */
158 FSM_ERROR, /* 0xe1-0xec */
159 FSM_ERROR, /* 0xed */
160 FSM_ERROR, /* 0xee-0xef */
161 FSM_ERROR, /* 0xf0 */
162 FSM_ERROR, /* 0xf1-0xf3 */
163 FSM_ERROR, /* 0xf4 */
164 FSM_ERROR}, /* 0xf5-0xff */
166 /* FSM_809F */
167 {FSM_ERROR, /* 0x00-0x7f */
168 FSM_80BF, /* 0x80-0x8f */
169 FSM_80BF, /* 0x90-0x9f */
170 FSM_ERROR, /* 0xa0-0xbf */
171 FSM_ERROR, /* 0xc0-0xc1 */
172 FSM_ERROR, /* 0xc2-0xdf */
173 FSM_ERROR, /* 0xe0 */
174 FSM_ERROR, /* 0xe1-0xec */
175 FSM_ERROR, /* 0xed */
176 FSM_ERROR, /* 0xee-0xef */
177 FSM_ERROR, /* 0xf0 */
178 FSM_ERROR, /* 0xf1-0xf3 */
179 FSM_ERROR, /* 0xf4 */
180 FSM_ERROR}, /* 0xf5-0xff */
182 /* FSM_90BF */
183 {FSM_ERROR, /* 0x00-0x7f */
184 FSM_ERROR, /* 0x80-0x8f */
185 FSM_80BF80BF, /* 0x90-0x9f */
186 FSM_80BF80BF, /* 0xa0-0xbf */
187 FSM_ERROR, /* 0xc0-0xc1 */
188 FSM_ERROR, /* 0xc2-0xdf */
189 FSM_ERROR, /* 0xe0 */
190 FSM_ERROR, /* 0xe1-0xec */
191 FSM_ERROR, /* 0xed */
192 FSM_ERROR, /* 0xee-0xef */
193 FSM_ERROR, /* 0xf0 */
194 FSM_ERROR, /* 0xf1-0xf3 */
195 FSM_ERROR, /* 0xf4 */
196 FSM_ERROR}, /* 0xf5-0xff */
198 /* FSM_80BF80BF80BF */
199 {FSM_ERROR, /* 0x00-0x7f */
200 FSM_80BF80BF, /* 0x80-0x8f */
201 FSM_80BF80BF, /* 0x90-0x9f */
202 FSM_80BF80BF, /* 0xa0-0xbf */
203 FSM_ERROR, /* 0xc0-0xc1 */
204 FSM_ERROR, /* 0xc2-0xdf */
205 FSM_ERROR, /* 0xe0 */
206 FSM_ERROR, /* 0xe1-0xec */
207 FSM_ERROR, /* 0xed */
208 FSM_ERROR, /* 0xee-0xef */
209 FSM_ERROR, /* 0xf0 */
210 FSM_ERROR, /* 0xf1-0xf3 */
211 FSM_ERROR, /* 0xf4 */
212 FSM_ERROR}, /* 0xf5-0xff */
214 /* FSM_808F */
215 {FSM_ERROR, /* 0x00-0x7f */
216 FSM_80BF80BF, /* 0x80-0x8f */
217 FSM_ERROR, /* 0x90-0x9f */
218 FSM_ERROR, /* 0xa0-0xbf */
219 FSM_ERROR, /* 0xc0-0xc1 */
220 FSM_ERROR, /* 0xc2-0xdf */
221 FSM_ERROR, /* 0xe0 */
222 FSM_ERROR, /* 0xe1-0xec */
223 FSM_ERROR, /* 0xed */
224 FSM_ERROR, /* 0xee-0xef */
225 FSM_ERROR, /* 0xf0 */
226 FSM_ERROR, /* 0xf1-0xf3 */
227 FSM_ERROR, /* 0xf4 */
228 FSM_ERROR}, /* 0xf5-0xff */
230 /* FSM_ERROR */
231 {FSM_ERROR, /* 0x00-0x7f */
232 FSM_ERROR, /* 0x80-0x8f */
233 FSM_ERROR, /* 0x90-0x9f */
234 FSM_ERROR, /* 0xa0-0xbf */
235 FSM_ERROR, /* 0xc0-0xc1 */
236 FSM_ERROR, /* 0xc2-0xdf */
237 FSM_ERROR, /* 0xe0 */
238 FSM_ERROR, /* 0xe1-0xec */
239 FSM_ERROR, /* 0xed */
240 FSM_ERROR, /* 0xee-0xef */
241 FSM_ERROR, /* 0xf0 */
242 FSM_ERROR, /* 0xf1-0xf3 */
243 FSM_ERROR, /* 0xf4 */
244 FSM_ERROR}, /* 0xf5-0xff */
248 const char *
249 svn_utf__last_valid(const char *data, apr_size_t len)
251 const char *start = data, *end = data + len;
252 int state = FSM_START;
253 while (data < end)
255 unsigned char octet = *data++;
256 int category = octet_category[octet];
257 state = machine[state][category];
258 if (state == FSM_START)
259 start = data;
261 return start;
264 svn_boolean_t
265 svn_utf__cstring_is_valid(const char *data)
267 int state = FSM_START;
268 while (*data)
270 unsigned char octet = *data++;
271 int category = octet_category[octet];
272 state = machine[state][category];
274 return state == FSM_START ? TRUE : FALSE;
277 svn_boolean_t
278 svn_utf__is_valid(const char *data, apr_size_t len)
280 const char *end = data + len;
281 int state = FSM_START;
282 while (data < end)
284 unsigned char octet = *data++;
285 int category = octet_category[octet];
286 state = machine[state][category];
288 return state == FSM_START ? TRUE : FALSE;
291 const char *
292 svn_utf__last_valid2(const char *data, apr_size_t len)
294 const char *start = data, *end = data + len;
295 int state = FSM_START;
296 while (data < end)
298 unsigned char octet = *data++;
299 switch (state)
301 case FSM_START:
302 if (octet <= 0x7F)
303 break;
304 else if (octet <= 0xC1)
305 state = FSM_ERROR;
306 else if (octet <= 0xDF)
307 state = FSM_80BF;
308 else if (octet == 0xE0)
309 state = FSM_A0BF;
310 else if (octet <= 0xEC)
311 state = FSM_80BF80BF;
312 else if (octet == 0xED)
313 state = FSM_809F;
314 else if (octet <= 0xEF)
315 state = FSM_80BF80BF;
316 else if (octet == 0xF0)
317 state = FSM_90BF;
318 else if (octet <= 0xF3)
319 state = FSM_80BF80BF80BF;
320 else if (octet <= 0xF4)
321 state = FSM_808F;
322 else
323 state = FSM_ERROR;
324 break;
325 case FSM_80BF:
326 if (octet >= 0x80 && octet <= 0xBF)
327 state = FSM_START;
328 else
329 state = FSM_ERROR;
330 break;
331 case FSM_A0BF:
332 if (octet >= 0xA0 && octet <= 0xBF)
333 state = FSM_80BF;
334 else
335 state = FSM_ERROR;
336 break;
337 case FSM_80BF80BF:
338 if (octet >= 0x80 && octet <= 0xBF)
339 state = FSM_80BF;
340 else
341 state = FSM_ERROR;
342 break;
343 case FSM_809F:
344 if (octet >= 0x80 && octet <= 0x9F)
345 state = FSM_80BF;
346 else
347 state = FSM_ERROR;
348 break;
349 case FSM_90BF:
350 if (octet >= 0x90 && octet <= 0xBF)
351 state = FSM_80BF80BF;
352 else
353 state = FSM_ERROR;
354 break;
355 case FSM_80BF80BF80BF:
356 if (octet >= 0x80 && octet <= 0xBF)
357 state = FSM_80BF80BF;
358 else
359 state = FSM_ERROR;
360 break;
361 case FSM_808F:
362 if (octet >= 0x80 && octet <= 0x8F)
363 state = FSM_80BF80BF;
364 else
365 state = FSM_ERROR;
366 break;
367 default:
368 case FSM_ERROR:
369 return start;
371 if (state == FSM_START)
372 start = data;
374 return start;