Win32: fix an incorrect error status being propagated to the caller in case
[svn/apache.git] / subversion / libsvn_subr / utf8proc.c
blob0e22af8d94ae56babe6917f13f7acc26f244204d
1 /*
2 * utf8proc.c: Wrappers for the utf8proc library
4 * ====================================================================
5 * Licensed to the Apache Software Foundation (ASF) under one
6 * or more contributor license agreements. See the NOTICE file
7 * distributed with this work for additional information
8 * regarding copyright ownership. The ASF licenses this file
9 * to you under the Apache License, Version 2.0 (the
10 * "License"); you may not use this file except in compliance
11 * with the License. You may obtain a copy of the License at
13 * http://www.apache.org/licenses/LICENSE-2.0
15 * Unless required by applicable law or agreed to in writing,
16 * software distributed under the License is distributed on an
17 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
18 * KIND, either express or implied. See the License for the
19 * specific language governing permissions and limitations
20 * under the License.
21 * ====================================================================
26 #include <apr_fnmatch.h>
28 #include "private/svn_string_private.h"
29 #include "private/svn_utf_private.h"
30 #include "svn_private_config.h"
32 #if SVN_INTERNAL_UTF8PROC
33 #define UTF8PROC_INLINE
34 /* Somehow utf8proc thinks it is nice to use strlen as an argument name,
35 while this function is already defined via apr.h */
36 #define strlen svn__strlen_var
37 #include "utf8proc/utf8proc.c"
38 #undef strlen
39 #else
40 #include <utf8proc.h>
41 #endif
45 const char *
46 svn_utf__utf8proc_compiled_version(void)
48 static const char utf8proc_version[] =
49 APR_STRINGIFY(UTF8PROC_VERSION_MAJOR) "."
50 APR_STRINGIFY(UTF8PROC_VERSION_MINOR) "."
51 APR_STRINGIFY(UTF8PROC_VERSION_PATCH);
52 return utf8proc_version;
55 const char *
56 svn_utf__utf8proc_runtime_version(void)
58 /* Unused static function warning removal hack. */
59 SVN_UNUSED(utf8proc_grapheme_break);
60 SVN_UNUSED(utf8proc_tolower);
61 SVN_UNUSED(utf8proc_toupper);
62 #if UTF8PROC_VERSION_MAJOR >= 2
63 SVN_UNUSED(utf8proc_totitle);
64 #endif
65 SVN_UNUSED(utf8proc_charwidth);
66 SVN_UNUSED(utf8proc_category_string);
67 SVN_UNUSED(utf8proc_NFD);
68 SVN_UNUSED(utf8proc_NFC);
69 SVN_UNUSED(utf8proc_NFKD);
70 SVN_UNUSED(utf8proc_NFKC);
72 return utf8proc_version();
77 /* Fill the given BUFFER with decomposed UCS-4 representation of the
78 * UTF-8 STRING. If LENGTH is SVN_UTF__UNKNOWN_LENGTH, assume STRING
79 * is NUL-terminated; otherwise look only at the first LENGTH bytes in
80 * STRING. Upon return, BUFFER->data points at an array of UCS-4
81 * characters, and return the length of the array. TRANSFORM_FLAGS
82 * define exactly how the decomposition is performed.
84 * A negative return value is an utf8proc error code and may indicate
85 * that STRING contains invalid UTF-8 or was so long that an overflow
86 * occurred.
88 static apr_ssize_t
89 unicode_decomposition(int transform_flags,
90 const char *string, apr_size_t length,
91 svn_membuf_t *buffer)
93 const int nullterm = (length == SVN_UTF__UNKNOWN_LENGTH
94 ? UTF8PROC_NULLTERM : 0);
96 for (;;)
98 apr_int32_t *const ucs4buf = buffer->data;
99 const apr_ssize_t ucs4len = buffer->size / sizeof(*ucs4buf);
100 const apr_ssize_t result =
101 utf8proc_decompose((const void*) string, length, ucs4buf, ucs4len,
102 UTF8PROC_DECOMPOSE | UTF8PROC_STABLE
103 | transform_flags | nullterm);
105 if (result < 0 || result <= ucs4len)
106 return result;
108 /* Increase the decomposition buffer size and retry */
109 svn_membuf__ensure(buffer, result * sizeof(*ucs4buf));
113 /* Fill the given BUFFER with an NFD UCS-4 representation of the UTF-8
114 * STRING. If LENGTH is SVN_UTF__UNKNOWN_LENGTH, assume STRING is
115 * NUL-terminated; otherwise look only at the first LENGTH bytes in
116 * STRING. Upon return, BUFFER->data points at an array of UCS-4
117 * characters and *RESULT_LENGTH contains the length of the array.
119 * A returned error may indicate that STRING contains invalid UTF-8 or
120 * invalid Unicode codepoints. Any error message comes from utf8proc.
122 static svn_error_t *
123 decompose_normalized(apr_size_t *result_length,
124 const char *string, apr_size_t length,
125 svn_membuf_t *buffer)
127 apr_ssize_t result = unicode_decomposition(0, string, length, buffer);
128 if (result < 0)
129 return svn_error_create(SVN_ERR_UTF8PROC_ERROR, NULL,
130 gettext(utf8proc_errmsg(result)));
131 *result_length = result;
132 return SVN_NO_ERROR;
135 /* Fill the given BUFFER with an NFC UTF-8 representation of the UTF-8
136 * STRING. If LENGTH is SVN_UTF__UNKNOWN_LENGTH, assume STRING is
137 * NUL-terminated; otherwise look only at the first LENGTH bytes in
138 * STRING. Upon return, BUFFER->data points at a NUL-terminated string
139 * of UTF-8 characters.
141 * If CASEFOLD is non-zero, perform Unicode case folding, e.g., for
142 * case-insensitive string comparison. If STRIPMARK is non-zero, strip
143 * all diacritical marks (e.g., accents) from the string.
145 * A returned error may indicate that STRING contains invalid UTF-8 or
146 * invalid Unicode codepoints. Any error message comes from utf8proc.
148 static svn_error_t *
149 normalize_cstring(apr_size_t *result_length,
150 const char *string, apr_size_t length,
151 svn_boolean_t casefold,
152 svn_boolean_t stripmark,
153 svn_membuf_t *buffer)
155 int flags = 0;
156 apr_ssize_t result;
158 if (casefold)
159 flags |= UTF8PROC_CASEFOLD;
161 if (stripmark)
162 flags |= UTF8PROC_STRIPMARK;
164 result = unicode_decomposition(flags, string, length, buffer);
165 if (result >= 0)
167 svn_membuf__resize(buffer, result * sizeof(apr_int32_t) + 1);
168 result = utf8proc_reencode(buffer->data, result,
169 UTF8PROC_COMPOSE | UTF8PROC_STABLE);
171 if (result < 0)
172 return svn_error_create(SVN_ERR_UTF8PROC_ERROR, NULL,
173 gettext(utf8proc_errmsg(result)));
174 *result_length = result;
175 return SVN_NO_ERROR;
178 /* Compare two arrays of UCS-4 codes, BUFA of length LENA and BUFB of
179 * length LENB. Return 0 if they're equal, a negative value if BUFA is
180 * less than BUFB, otherwise a positive value.
182 * Yes, this is strcmp for known-length UCS-4 strings.
184 static int
185 ucs4cmp(const apr_int32_t *bufa, apr_size_t lena,
186 const apr_int32_t *bufb, apr_size_t lenb)
188 const apr_size_t len = (lena < lenb ? lena : lenb);
189 apr_size_t i;
191 for (i = 0; i < len; ++i)
193 const int diff = bufa[i] - bufb[i];
194 if (diff)
195 return diff;
197 return (lena == lenb ? 0 : (lena < lenb ? -1 : 1));
200 svn_error_t *
201 svn_utf__normcmp(int *result,
202 const char *str1, apr_size_t len1,
203 const char *str2, apr_size_t len2,
204 svn_membuf_t *buf1, svn_membuf_t *buf2)
206 apr_size_t buflen1;
207 apr_size_t buflen2;
209 /* Shortcut-circuit the decision if at least one of the strings is empty. */
210 const svn_boolean_t empty1 =
211 (0 == len1 || (len1 == SVN_UTF__UNKNOWN_LENGTH && !*str1));
212 const svn_boolean_t empty2 =
213 (0 == len2 || (len2 == SVN_UTF__UNKNOWN_LENGTH && !*str2));
214 if (empty1 || empty2)
216 *result = (empty1 == empty2 ? 0 : (empty1 ? -1 : 1));
217 return SVN_NO_ERROR;
220 SVN_ERR(decompose_normalized(&buflen1, str1, len1, buf1));
221 SVN_ERR(decompose_normalized(&buflen2, str2, len2, buf2));
222 *result = ucs4cmp(buf1->data, buflen1, buf2->data, buflen2);
223 return SVN_NO_ERROR;
226 svn_error_t*
227 svn_utf__normalize(const char **result,
228 const char *str, apr_size_t len,
229 svn_membuf_t *buf)
231 apr_size_t result_length;
232 SVN_ERR(normalize_cstring(&result_length, str, len, FALSE, FALSE, buf));
233 *result = (const char*)(buf->data);
234 return SVN_NO_ERROR;
237 svn_error_t *
238 svn_utf__xfrm(const char **result,
239 const char *str, apr_size_t len,
240 svn_boolean_t case_insensitive,
241 svn_boolean_t accent_insensitive,
242 svn_membuf_t *buf)
244 apr_size_t result_length;
245 SVN_ERR(normalize_cstring(&result_length, str, len,
246 case_insensitive, accent_insensitive, buf));
247 *result = (const char*)(buf->data);
248 return SVN_NO_ERROR;
251 svn_boolean_t
252 svn_utf__fuzzy_glob_match(const char *str,
253 const apr_array_header_t *patterns,
254 svn_membuf_t *buf)
256 const char *normalized;
257 svn_error_t *err;
258 int i;
260 /* Try to normalize case and accents in STR.
262 * If that should fail for some reason, consider STR a mismatch. */
263 err = svn_utf__xfrm(&normalized, str, strlen(str), TRUE, TRUE, buf);
264 if (err)
266 svn_error_clear(err);
267 return FALSE;
270 /* Now see whether it matches any/all of the patterns. */
271 for (i = 0; i < patterns->nelts; ++i)
273 const char *pattern = APR_ARRAY_IDX(patterns, i, const char *);
274 if (apr_fnmatch(pattern, normalized, 0) == APR_SUCCESS)
275 return TRUE;
278 return FALSE;
281 /* Decode a single UCS-4 code point to UTF-8, appending the result to BUFFER.
282 * Assume BUFFER is already filled to *LENGTH and return the new size there.
283 * This function does *not* nul-terminate the stringbuf!
285 * A returned error indicates that the codepoint is invalid.
287 static svn_error_t *
288 encode_ucs4(svn_membuf_t *buffer, apr_int32_t ucs4chr, apr_size_t *length)
290 apr_size_t utf8len;
292 if (buffer->size - *length < 4)
293 svn_membuf__resize(buffer, buffer->size + 4);
295 utf8len = utf8proc_encode_char(ucs4chr, ((apr_byte_t*)buffer->data + *length));
296 if (!utf8len)
297 return svn_error_createf(SVN_ERR_UTF8PROC_ERROR, NULL,
298 _("Invalid Unicode character U+%04lX"),
299 (long)ucs4chr);
300 *length += utf8len;
301 return SVN_NO_ERROR;
304 svn_error_t *
305 svn_utf__encode_ucs4_string(svn_membuf_t *buffer,
306 const apr_int32_t *ucs4str,
307 apr_size_t length,
308 apr_size_t *result_length)
310 *result_length = 0;
311 while (length-- > 0)
312 SVN_ERR(encode_ucs4(buffer, *ucs4str++, result_length));
313 svn_membuf__resize(buffer, *result_length + 1);
314 ((char*)buffer->data)[*result_length] = '\0';
315 return SVN_NO_ERROR;
319 svn_error_t *
320 svn_utf__glob(svn_boolean_t *match,
321 const char *pattern, apr_size_t pattern_len,
322 const char *string, apr_size_t string_len,
323 const char *escape, apr_size_t escape_len,
324 svn_boolean_t sql_like,
325 svn_membuf_t *pattern_buf,
326 svn_membuf_t *string_buf,
327 svn_membuf_t *temp_buf)
329 apr_size_t patternbuf_len;
330 apr_size_t tempbuf_len;
332 /* If we're in GLOB mode, we don't do custom escape chars. */
333 if (escape && !sql_like)
334 return svn_error_create(SVN_ERR_UTF8_GLOB, NULL,
335 _("Cannot use a custom escape token"
336 " in glob matching mode"));
338 /* Convert the patern to NFD UTF-8. We can't use the UCS-4 result
339 because apr_fnmatch can't handle it.*/
340 SVN_ERR(decompose_normalized(&tempbuf_len, pattern, pattern_len, temp_buf));
341 if (!sql_like)
342 SVN_ERR(svn_utf__encode_ucs4_string(pattern_buf, temp_buf->data,
343 tempbuf_len, &patternbuf_len));
344 else
346 /* Convert a LIKE pattern to a GLOB pattern that apr_fnmatch can use. */
347 const apr_int32_t *like = temp_buf->data;
348 apr_int32_t ucs4esc;
349 svn_boolean_t escaped;
350 apr_size_t i;
352 if (!escape)
353 ucs4esc = -1; /* Definitely an invalid UCS-4 character. */
354 else
356 const int nullterm = (escape_len == SVN_UTF__UNKNOWN_LENGTH
357 ? UTF8PROC_NULLTERM : 0);
358 apr_ssize_t result =
359 utf8proc_decompose((const void*) escape, escape_len, &ucs4esc, 1,
360 UTF8PROC_DECOMPOSE | UTF8PROC_STABLE | nullterm);
361 if (result < 0)
362 return svn_error_create(SVN_ERR_UTF8PROC_ERROR, NULL,
363 gettext(utf8proc_errmsg(result)));
364 if (result == 0 || result > 1)
365 return svn_error_create(SVN_ERR_UTF8_GLOB, NULL,
366 _("Escape token must be one character"));
367 if ((ucs4esc & 0xFF) != ucs4esc)
368 return svn_error_createf(SVN_ERR_UTF8_GLOB, NULL,
369 _("Invalid escape character U+%04lX"),
370 (long)ucs4esc);
373 patternbuf_len = 0;
374 svn_membuf__ensure(pattern_buf, tempbuf_len + 1);
375 for (i = 0, escaped = FALSE; i < tempbuf_len; ++i, ++like)
377 if (*like == ucs4esc && !escaped)
379 svn_membuf__resize(pattern_buf, patternbuf_len + 1);
380 ((char*)pattern_buf->data)[patternbuf_len++] = '\\';
381 escaped = TRUE;
383 else if (escaped)
385 SVN_ERR(encode_ucs4(pattern_buf, *like, &patternbuf_len));
386 escaped = FALSE;
388 else
390 if ((*like == '[' || *like == '\\') && !escaped)
392 /* Escape brackets and backslashes which are always
393 literals in LIKE patterns. */
394 svn_membuf__resize(pattern_buf, patternbuf_len + 1);
395 ((char*)pattern_buf->data)[patternbuf_len++] = '\\';
396 escaped = TRUE;
397 --i; --like;
398 continue;
401 /* Replace LIKE wildcards with their GLOB equivalents. */
402 if (*like == '%' || *like == '_')
404 const char wildcard = (*like == '%' ? '*' : '?');
405 svn_membuf__resize(pattern_buf, patternbuf_len + 1);
406 ((char*)pattern_buf->data)[patternbuf_len++] = wildcard;
408 else
409 SVN_ERR(encode_ucs4(pattern_buf, *like, &patternbuf_len));
412 svn_membuf__resize(pattern_buf, patternbuf_len + 1);
413 ((char*)pattern_buf->data)[patternbuf_len] = '\0';
416 /* Now normalize the string */
417 SVN_ERR(decompose_normalized(&tempbuf_len, string, string_len, temp_buf));
418 SVN_ERR(svn_utf__encode_ucs4_string(string_buf, temp_buf->data,
419 tempbuf_len, &tempbuf_len));
421 *match = !apr_fnmatch(pattern_buf->data, string_buf->data, 0);
422 return SVN_NO_ERROR;
425 svn_boolean_t
426 svn_utf__is_normalized(const char *string, apr_pool_t *scratch_pool)
428 svn_error_t *err;
429 svn_membuf_t buffer;
430 apr_size_t result_length;
431 const apr_size_t length = strlen(string);
432 svn_membuf__create(&buffer, length * sizeof(apr_int32_t), scratch_pool);
433 err = normalize_cstring(&result_length, string, length,
434 FALSE, FALSE, &buffer);
435 if (err)
437 svn_error_clear(err);
438 return FALSE;
440 return (length == result_length && 0 == strcmp(string, buffer.data));
443 const char *
444 svn_utf__fuzzy_escape(const char *src, apr_size_t length, apr_pool_t *pool)
446 /* Hexadecimal digits for code conversion. */
447 static const char digits[] = "0123456789ABCDEF";
449 /* Flags used for Unicode decomposition. */
450 static const int decomp_flags = (
451 UTF8PROC_COMPAT | UTF8PROC_STABLE | UTF8PROC_LUMP
452 | UTF8PROC_NLF2LF | UTF8PROC_STRIPCC | UTF8PROC_STRIPMARK);
454 svn_stringbuf_t *result;
455 svn_membuf_t buffer;
456 apr_ssize_t decomp_length;
457 apr_ssize_t len;
459 /* Decompose to a non-reversible compatibility format. */
460 svn_membuf__create(&buffer, length * sizeof(apr_int32_t), pool);
461 decomp_length = unicode_decomposition(decomp_flags, src, length, &buffer);
462 if (decomp_length < 0)
464 svn_membuf_t part;
465 apr_size_t done, prev;
467 /* The only other error we can receive here indicates an integer
468 overflow due to the length of the input string. Not very
469 likely, but we certainly shouldn't continue in that case. */
470 SVN_ERR_ASSERT_NO_RETURN(decomp_length == UTF8PROC_ERROR_INVALIDUTF8);
472 /* Break the decomposition into parts that are valid UTF-8, and
473 bytes that are not. Represent the invalid bytes in the target
474 erray by their negative value. This works because utf8proc
475 will not generate Unicode code points with values larger than
476 U+10FFFF. */
477 svn_membuf__create(&part, sizeof(apr_int32_t), pool);
478 decomp_length = 0;
479 done = prev = 0;
480 while (done < length)
482 apr_int32_t uc;
484 while (done < length)
486 len = utf8proc_iterate((apr_byte_t*)src + done, length - done, &uc);
487 if (len < 0)
488 break;
489 done += len;
492 /* Decompose the valid part */
493 if (done > prev)
495 len = unicode_decomposition(
496 decomp_flags, src + prev, done - prev, &part);
497 SVN_ERR_ASSERT_NO_RETURN(len > 0);
498 svn_membuf__resize(
499 &buffer, (decomp_length + len) * sizeof(apr_int32_t));
500 memcpy((apr_int32_t*)buffer.data + decomp_length,
501 part.data, len * sizeof(apr_int32_t));
502 decomp_length += len;
503 prev = done;
506 /* What follows could be a valid UTF-8 sequence, but not
507 a valid Unicode character. */
508 if (done < length)
510 const char *last;
512 /* Determine the length of the UTF-8 sequence */
513 const char *const p = src + done;
514 len = utf8proc_utf8class[(apr_byte_t)*p];
516 /* Check if the multi-byte sequence is valid UTF-8. */
517 if (len > 1 && len <= (apr_ssize_t)(length - done))
518 last = svn_utf__last_valid(p, len);
519 else
520 last = NULL;
522 /* Might not be a valid UTF-8 sequence at all */
523 if (!last || (last && last - p < len))
525 uc = -((apr_int32_t)(*p & 0xff));
526 len = 1;
528 else
530 switch (len)
532 /* Decode the UTF-8 sequence without validation. */
533 case 2:
534 uc = ((p[0] & 0x1f) << 6) + (p[1] & 0x3f);
535 break;
536 case 3:
537 uc = (((p[0] & 0x0f) << 12) + ((p[1] & 0x3f) << 6)
538 + (p[2] & 0x3f));
539 break;
540 case 4:
541 uc = (((p[0] & 0x07) << 18) + ((p[1] & 0x3f) << 12)
542 + ((p[2] & 0x3f) << 6) + (p[3] & 0x3f));
543 break;
544 default:
545 SVN_ERR_ASSERT_NO_RETURN(
546 !"Unexpected invalid UTF-8 byte");
551 svn_membuf__resize(
552 &buffer, (decomp_length + 1) * sizeof(apr_int32_t));
553 ((apr_int32_t*)buffer.data)[decomp_length++] = uc;
554 done += len;
555 prev = done;
560 /* Scan the result and deleting any combining diacriticals and
561 inserting placeholders where any non-ascii characters remain. */
562 result = svn_stringbuf_create_ensure(decomp_length, pool);
563 for (len = 0; len < decomp_length; ++len)
565 const apr_int32_t cp = ((apr_int32_t*)buffer.data)[len];
566 if (cp > 0 && cp < 127)
567 svn_stringbuf_appendbyte(result, (char)cp);
568 else if (cp == 0)
569 svn_stringbuf_appendcstr(result, "\\0");
570 else if (cp < 0)
572 const apr_int32_t rcp = ((-cp) & 0xff);
573 svn_stringbuf_appendcstr(result, "?\\");
574 svn_stringbuf_appendbyte(result, digits[(rcp & 0x00f0) >> 4]);
575 svn_stringbuf_appendbyte(result, digits[(rcp & 0x000f)]);
577 else
579 if (utf8proc_codepoint_valid(cp))
581 const utf8proc_property_t *prop = utf8proc_get_property(cp);
582 if (prop->combining_class != 0)
583 continue; /* Combining mark; ignore */
584 svn_stringbuf_appendcstr(result, "{U+");
586 else
587 svn_stringbuf_appendcstr(result, "{U?");
588 if (cp > 0xffff)
590 svn_stringbuf_appendbyte(result, digits[(cp & 0xf00000) >> 20]);
591 svn_stringbuf_appendbyte(result, digits[(cp & 0x0f0000) >> 16]);
593 svn_stringbuf_appendbyte(result, digits[(cp & 0xf000) >> 12]);
594 svn_stringbuf_appendbyte(result, digits[(cp & 0x0f00) >> 8]);
595 svn_stringbuf_appendbyte(result, digits[(cp & 0x00f0) >> 4]);
596 svn_stringbuf_appendbyte(result, digits[(cp & 0x000f)]);
597 svn_stringbuf_appendbyte(result, '}');
601 return result->data;