2 * utf8proc.c: Wrappers for the utf8proc library
4 * ====================================================================
5 * Licensed to the Apache Software Foundation (ASF) under one
6 * or more contributor license agreements. See the NOTICE file
7 * distributed with this work for additional information
8 * regarding copyright ownership. The ASF licenses this file
9 * to you under the Apache License, Version 2.0 (the
10 * "License"); you may not use this file except in compliance
11 * with the License. You may obtain a copy of the License at
13 * http://www.apache.org/licenses/LICENSE-2.0
15 * Unless required by applicable law or agreed to in writing,
16 * software distributed under the License is distributed on an
17 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
18 * KIND, either express or implied. See the License for the
19 * specific language governing permissions and limitations
21 * ====================================================================
26 #include <apr_fnmatch.h>
28 #include "private/svn_string_private.h"
29 #include "private/svn_utf_private.h"
30 #include "svn_private_config.h"
32 #if SVN_INTERNAL_UTF8PROC
33 #define UTF8PROC_INLINE
34 /* Somehow utf8proc thinks it is nice to use strlen as an argument name,
35 while this function is already defined via apr.h */
36 #define strlen svn__strlen_var
37 #include "utf8proc/utf8proc.c"
46 svn_utf__utf8proc_compiled_version(void)
48 static const char utf8proc_version
[] =
49 APR_STRINGIFY(UTF8PROC_VERSION_MAJOR
) "."
50 APR_STRINGIFY(UTF8PROC_VERSION_MINOR
) "."
51 APR_STRINGIFY(UTF8PROC_VERSION_PATCH
);
52 return utf8proc_version
;
56 svn_utf__utf8proc_runtime_version(void)
58 /* Unused static function warning removal hack. */
59 SVN_UNUSED(utf8proc_grapheme_break
);
60 SVN_UNUSED(utf8proc_tolower
);
61 SVN_UNUSED(utf8proc_toupper
);
62 #if UTF8PROC_VERSION_MAJOR >= 2
63 SVN_UNUSED(utf8proc_totitle
);
65 SVN_UNUSED(utf8proc_charwidth
);
66 SVN_UNUSED(utf8proc_category_string
);
67 SVN_UNUSED(utf8proc_NFD
);
68 SVN_UNUSED(utf8proc_NFC
);
69 SVN_UNUSED(utf8proc_NFKD
);
70 SVN_UNUSED(utf8proc_NFKC
);
72 return utf8proc_version();
77 /* Fill the given BUFFER with decomposed UCS-4 representation of the
78 * UTF-8 STRING. If LENGTH is SVN_UTF__UNKNOWN_LENGTH, assume STRING
79 * is NUL-terminated; otherwise look only at the first LENGTH bytes in
80 * STRING. Upon return, BUFFER->data points at an array of UCS-4
81 * characters, and return the length of the array. TRANSFORM_FLAGS
82 * define exactly how the decomposition is performed.
84 * A negative return value is an utf8proc error code and may indicate
85 * that STRING contains invalid UTF-8 or was so long that an overflow
89 unicode_decomposition(int transform_flags
,
90 const char *string
, apr_size_t length
,
93 const int nullterm
= (length
== SVN_UTF__UNKNOWN_LENGTH
94 ? UTF8PROC_NULLTERM
: 0);
98 apr_int32_t
*const ucs4buf
= buffer
->data
;
99 const apr_ssize_t ucs4len
= buffer
->size
/ sizeof(*ucs4buf
);
100 const apr_ssize_t result
=
101 utf8proc_decompose((const void*) string
, length
, ucs4buf
, ucs4len
,
102 UTF8PROC_DECOMPOSE
| UTF8PROC_STABLE
103 | transform_flags
| nullterm
);
105 if (result
< 0 || result
<= ucs4len
)
108 /* Increase the decomposition buffer size and retry */
109 svn_membuf__ensure(buffer
, result
* sizeof(*ucs4buf
));
113 /* Fill the given BUFFER with an NFD UCS-4 representation of the UTF-8
114 * STRING. If LENGTH is SVN_UTF__UNKNOWN_LENGTH, assume STRING is
115 * NUL-terminated; otherwise look only at the first LENGTH bytes in
116 * STRING. Upon return, BUFFER->data points at an array of UCS-4
117 * characters and *RESULT_LENGTH contains the length of the array.
119 * A returned error may indicate that STRING contains invalid UTF-8 or
120 * invalid Unicode codepoints. Any error message comes from utf8proc.
123 decompose_normalized(apr_size_t
*result_length
,
124 const char *string
, apr_size_t length
,
125 svn_membuf_t
*buffer
)
127 apr_ssize_t result
= unicode_decomposition(0, string
, length
, buffer
);
129 return svn_error_create(SVN_ERR_UTF8PROC_ERROR
, NULL
,
130 gettext(utf8proc_errmsg(result
)));
131 *result_length
= result
;
135 /* Fill the given BUFFER with an NFC UTF-8 representation of the UTF-8
136 * STRING. If LENGTH is SVN_UTF__UNKNOWN_LENGTH, assume STRING is
137 * NUL-terminated; otherwise look only at the first LENGTH bytes in
138 * STRING. Upon return, BUFFER->data points at a NUL-terminated string
139 * of UTF-8 characters.
141 * If CASEFOLD is non-zero, perform Unicode case folding, e.g., for
142 * case-insensitive string comparison. If STRIPMARK is non-zero, strip
143 * all diacritical marks (e.g., accents) from the string.
145 * A returned error may indicate that STRING contains invalid UTF-8 or
146 * invalid Unicode codepoints. Any error message comes from utf8proc.
149 normalize_cstring(apr_size_t
*result_length
,
150 const char *string
, apr_size_t length
,
151 svn_boolean_t casefold
,
152 svn_boolean_t stripmark
,
153 svn_membuf_t
*buffer
)
159 flags
|= UTF8PROC_CASEFOLD
;
162 flags
|= UTF8PROC_STRIPMARK
;
164 result
= unicode_decomposition(flags
, string
, length
, buffer
);
167 svn_membuf__resize(buffer
, result
* sizeof(apr_int32_t
) + 1);
168 result
= utf8proc_reencode(buffer
->data
, result
,
169 UTF8PROC_COMPOSE
| UTF8PROC_STABLE
);
172 return svn_error_create(SVN_ERR_UTF8PROC_ERROR
, NULL
,
173 gettext(utf8proc_errmsg(result
)));
174 *result_length
= result
;
178 /* Compare two arrays of UCS-4 codes, BUFA of length LENA and BUFB of
179 * length LENB. Return 0 if they're equal, a negative value if BUFA is
180 * less than BUFB, otherwise a positive value.
182 * Yes, this is strcmp for known-length UCS-4 strings.
185 ucs4cmp(const apr_int32_t
*bufa
, apr_size_t lena
,
186 const apr_int32_t
*bufb
, apr_size_t lenb
)
188 const apr_size_t len
= (lena
< lenb
? lena
: lenb
);
191 for (i
= 0; i
< len
; ++i
)
193 const int diff
= bufa
[i
] - bufb
[i
];
197 return (lena
== lenb
? 0 : (lena
< lenb
? -1 : 1));
201 svn_utf__normcmp(int *result
,
202 const char *str1
, apr_size_t len1
,
203 const char *str2
, apr_size_t len2
,
204 svn_membuf_t
*buf1
, svn_membuf_t
*buf2
)
209 /* Shortcut-circuit the decision if at least one of the strings is empty. */
210 const svn_boolean_t empty1
=
211 (0 == len1
|| (len1
== SVN_UTF__UNKNOWN_LENGTH
&& !*str1
));
212 const svn_boolean_t empty2
=
213 (0 == len2
|| (len2
== SVN_UTF__UNKNOWN_LENGTH
&& !*str2
));
214 if (empty1
|| empty2
)
216 *result
= (empty1
== empty2
? 0 : (empty1
? -1 : 1));
220 SVN_ERR(decompose_normalized(&buflen1
, str1
, len1
, buf1
));
221 SVN_ERR(decompose_normalized(&buflen2
, str2
, len2
, buf2
));
222 *result
= ucs4cmp(buf1
->data
, buflen1
, buf2
->data
, buflen2
);
227 svn_utf__normalize(const char **result
,
228 const char *str
, apr_size_t len
,
231 apr_size_t result_length
;
232 SVN_ERR(normalize_cstring(&result_length
, str
, len
, FALSE
, FALSE
, buf
));
233 *result
= (const char*)(buf
->data
);
238 svn_utf__xfrm(const char **result
,
239 const char *str
, apr_size_t len
,
240 svn_boolean_t case_insensitive
,
241 svn_boolean_t accent_insensitive
,
244 apr_size_t result_length
;
245 SVN_ERR(normalize_cstring(&result_length
, str
, len
,
246 case_insensitive
, accent_insensitive
, buf
));
247 *result
= (const char*)(buf
->data
);
252 svn_utf__fuzzy_glob_match(const char *str
,
253 const apr_array_header_t
*patterns
,
256 const char *normalized
;
260 /* Try to normalize case and accents in STR.
262 * If that should fail for some reason, consider STR a mismatch. */
263 err
= svn_utf__xfrm(&normalized
, str
, strlen(str
), TRUE
, TRUE
, buf
);
266 svn_error_clear(err
);
270 /* Now see whether it matches any/all of the patterns. */
271 for (i
= 0; i
< patterns
->nelts
; ++i
)
273 const char *pattern
= APR_ARRAY_IDX(patterns
, i
, const char *);
274 if (apr_fnmatch(pattern
, normalized
, 0) == APR_SUCCESS
)
281 /* Decode a single UCS-4 code point to UTF-8, appending the result to BUFFER.
282 * Assume BUFFER is already filled to *LENGTH and return the new size there.
283 * This function does *not* nul-terminate the stringbuf!
285 * A returned error indicates that the codepoint is invalid.
288 encode_ucs4(svn_membuf_t
*buffer
, apr_int32_t ucs4chr
, apr_size_t
*length
)
292 if (buffer
->size
- *length
< 4)
293 svn_membuf__resize(buffer
, buffer
->size
+ 4);
295 utf8len
= utf8proc_encode_char(ucs4chr
, ((apr_byte_t
*)buffer
->data
+ *length
));
297 return svn_error_createf(SVN_ERR_UTF8PROC_ERROR
, NULL
,
298 _("Invalid Unicode character U+%04lX"),
305 svn_utf__encode_ucs4_string(svn_membuf_t
*buffer
,
306 const apr_int32_t
*ucs4str
,
308 apr_size_t
*result_length
)
312 SVN_ERR(encode_ucs4(buffer
, *ucs4str
++, result_length
));
313 svn_membuf__resize(buffer
, *result_length
+ 1);
314 ((char*)buffer
->data
)[*result_length
] = '\0';
320 svn_utf__glob(svn_boolean_t
*match
,
321 const char *pattern
, apr_size_t pattern_len
,
322 const char *string
, apr_size_t string_len
,
323 const char *escape
, apr_size_t escape_len
,
324 svn_boolean_t sql_like
,
325 svn_membuf_t
*pattern_buf
,
326 svn_membuf_t
*string_buf
,
327 svn_membuf_t
*temp_buf
)
329 apr_size_t patternbuf_len
;
330 apr_size_t tempbuf_len
;
332 /* If we're in GLOB mode, we don't do custom escape chars. */
333 if (escape
&& !sql_like
)
334 return svn_error_create(SVN_ERR_UTF8_GLOB
, NULL
,
335 _("Cannot use a custom escape token"
336 " in glob matching mode"));
338 /* Convert the patern to NFD UTF-8. We can't use the UCS-4 result
339 because apr_fnmatch can't handle it.*/
340 SVN_ERR(decompose_normalized(&tempbuf_len
, pattern
, pattern_len
, temp_buf
));
342 SVN_ERR(svn_utf__encode_ucs4_string(pattern_buf
, temp_buf
->data
,
343 tempbuf_len
, &patternbuf_len
));
346 /* Convert a LIKE pattern to a GLOB pattern that apr_fnmatch can use. */
347 const apr_int32_t
*like
= temp_buf
->data
;
349 svn_boolean_t escaped
;
353 ucs4esc
= -1; /* Definitely an invalid UCS-4 character. */
356 const int nullterm
= (escape_len
== SVN_UTF__UNKNOWN_LENGTH
357 ? UTF8PROC_NULLTERM
: 0);
359 utf8proc_decompose((const void*) escape
, escape_len
, &ucs4esc
, 1,
360 UTF8PROC_DECOMPOSE
| UTF8PROC_STABLE
| nullterm
);
362 return svn_error_create(SVN_ERR_UTF8PROC_ERROR
, NULL
,
363 gettext(utf8proc_errmsg(result
)));
364 if (result
== 0 || result
> 1)
365 return svn_error_create(SVN_ERR_UTF8_GLOB
, NULL
,
366 _("Escape token must be one character"));
367 if ((ucs4esc
& 0xFF) != ucs4esc
)
368 return svn_error_createf(SVN_ERR_UTF8_GLOB
, NULL
,
369 _("Invalid escape character U+%04lX"),
374 svn_membuf__ensure(pattern_buf
, tempbuf_len
+ 1);
375 for (i
= 0, escaped
= FALSE
; i
< tempbuf_len
; ++i
, ++like
)
377 if (*like
== ucs4esc
&& !escaped
)
379 svn_membuf__resize(pattern_buf
, patternbuf_len
+ 1);
380 ((char*)pattern_buf
->data
)[patternbuf_len
++] = '\\';
385 SVN_ERR(encode_ucs4(pattern_buf
, *like
, &patternbuf_len
));
390 if ((*like
== '[' || *like
== '\\') && !escaped
)
392 /* Escape brackets and backslashes which are always
393 literals in LIKE patterns. */
394 svn_membuf__resize(pattern_buf
, patternbuf_len
+ 1);
395 ((char*)pattern_buf
->data
)[patternbuf_len
++] = '\\';
401 /* Replace LIKE wildcards with their GLOB equivalents. */
402 if (*like
== '%' || *like
== '_')
404 const char wildcard
= (*like
== '%' ? '*' : '?');
405 svn_membuf__resize(pattern_buf
, patternbuf_len
+ 1);
406 ((char*)pattern_buf
->data
)[patternbuf_len
++] = wildcard
;
409 SVN_ERR(encode_ucs4(pattern_buf
, *like
, &patternbuf_len
));
412 svn_membuf__resize(pattern_buf
, patternbuf_len
+ 1);
413 ((char*)pattern_buf
->data
)[patternbuf_len
] = '\0';
416 /* Now normalize the string */
417 SVN_ERR(decompose_normalized(&tempbuf_len
, string
, string_len
, temp_buf
));
418 SVN_ERR(svn_utf__encode_ucs4_string(string_buf
, temp_buf
->data
,
419 tempbuf_len
, &tempbuf_len
));
421 *match
= !apr_fnmatch(pattern_buf
->data
, string_buf
->data
, 0);
426 svn_utf__is_normalized(const char *string
, apr_pool_t
*scratch_pool
)
430 apr_size_t result_length
;
431 const apr_size_t length
= strlen(string
);
432 svn_membuf__create(&buffer
, length
* sizeof(apr_int32_t
), scratch_pool
);
433 err
= normalize_cstring(&result_length
, string
, length
,
434 FALSE
, FALSE
, &buffer
);
437 svn_error_clear(err
);
440 return (length
== result_length
&& 0 == strcmp(string
, buffer
.data
));
444 svn_utf__fuzzy_escape(const char *src
, apr_size_t length
, apr_pool_t
*pool
)
446 /* Hexadecimal digits for code conversion. */
447 static const char digits
[] = "0123456789ABCDEF";
449 /* Flags used for Unicode decomposition. */
450 static const int decomp_flags
= (
451 UTF8PROC_COMPAT
| UTF8PROC_STABLE
| UTF8PROC_LUMP
452 | UTF8PROC_NLF2LF
| UTF8PROC_STRIPCC
| UTF8PROC_STRIPMARK
);
454 svn_stringbuf_t
*result
;
456 apr_ssize_t decomp_length
;
459 /* Decompose to a non-reversible compatibility format. */
460 svn_membuf__create(&buffer
, length
* sizeof(apr_int32_t
), pool
);
461 decomp_length
= unicode_decomposition(decomp_flags
, src
, length
, &buffer
);
462 if (decomp_length
< 0)
465 apr_size_t done
, prev
;
467 /* The only other error we can receive here indicates an integer
468 overflow due to the length of the input string. Not very
469 likely, but we certainly shouldn't continue in that case. */
470 SVN_ERR_ASSERT_NO_RETURN(decomp_length
== UTF8PROC_ERROR_INVALIDUTF8
);
472 /* Break the decomposition into parts that are valid UTF-8, and
473 bytes that are not. Represent the invalid bytes in the target
474 array by their negative value. This works because utf8proc
475 will not generate Unicode code points with values larger than
477 svn_membuf__create(&part
, sizeof(apr_int32_t
), pool
);
480 while (done
< length
)
484 while (done
< length
)
486 len
= utf8proc_iterate((apr_byte_t
*)src
+ done
, length
- done
, &uc
);
492 /* Decompose the valid part */
495 len
= unicode_decomposition(
496 decomp_flags
, src
+ prev
, done
- prev
, &part
);
497 SVN_ERR_ASSERT_NO_RETURN(len
> 0);
499 &buffer
, (decomp_length
+ len
) * sizeof(apr_int32_t
));
500 memcpy((apr_int32_t
*)buffer
.data
+ decomp_length
,
501 part
.data
, len
* sizeof(apr_int32_t
));
502 decomp_length
+= len
;
506 /* What follows could be a valid UTF-8 sequence, but not
507 a valid Unicode character. */
512 /* Determine the length of the UTF-8 sequence */
513 const char *const p
= src
+ done
;
514 len
= utf8proc_utf8class
[(apr_byte_t
)*p
];
516 /* Check if the multi-byte sequence is valid UTF-8. */
517 if (len
> 1 && len
<= (apr_ssize_t
)(length
- done
))
518 last
= svn_utf__last_valid(p
, len
);
522 /* Might not be a valid UTF-8 sequence at all */
523 if (!last
|| (last
&& last
- p
< len
))
525 uc
= -((apr_int32_t
)(*p
& 0xff));
532 /* Decode the UTF-8 sequence without validation. */
534 uc
= ((p
[0] & 0x1f) << 6) + (p
[1] & 0x3f);
537 uc
= (((p
[0] & 0x0f) << 12) + ((p
[1] & 0x3f) << 6)
541 uc
= (((p
[0] & 0x07) << 18) + ((p
[1] & 0x3f) << 12)
542 + ((p
[2] & 0x3f) << 6) + (p
[3] & 0x3f));
545 SVN_ERR_ASSERT_NO_RETURN(
546 !"Unexpected invalid UTF-8 byte");
552 &buffer
, (decomp_length
+ 1) * sizeof(apr_int32_t
));
553 ((apr_int32_t
*)buffer
.data
)[decomp_length
++] = uc
;
560 /* Scan the result and deleting any combining diacriticals and
561 inserting placeholders where any non-ascii characters remain. */
562 result
= svn_stringbuf_create_ensure(decomp_length
, pool
);
563 for (len
= 0; len
< decomp_length
; ++len
)
565 const apr_int32_t cp
= ((apr_int32_t
*)buffer
.data
)[len
];
566 if (cp
> 0 && cp
< 127)
567 svn_stringbuf_appendbyte(result
, (char)cp
);
569 svn_stringbuf_appendcstr(result
, "\\0");
572 const apr_int32_t rcp
= ((-cp
) & 0xff);
573 svn_stringbuf_appendcstr(result
, "?\\");
574 svn_stringbuf_appendbyte(result
, digits
[(rcp
& 0x00f0) >> 4]);
575 svn_stringbuf_appendbyte(result
, digits
[(rcp
& 0x000f)]);
579 if (utf8proc_codepoint_valid(cp
))
581 const utf8proc_property_t
*prop
= utf8proc_get_property(cp
);
582 if (prop
->combining_class
!= 0)
583 continue; /* Combining mark; ignore */
584 svn_stringbuf_appendcstr(result
, "{U+");
587 svn_stringbuf_appendcstr(result
, "{U?");
590 svn_stringbuf_appendbyte(result
, digits
[(cp
& 0xf00000) >> 20]);
591 svn_stringbuf_appendbyte(result
, digits
[(cp
& 0x0f0000) >> 16]);
593 svn_stringbuf_appendbyte(result
, digits
[(cp
& 0xf000) >> 12]);
594 svn_stringbuf_appendbyte(result
, digits
[(cp
& 0x0f00) >> 8]);
595 svn_stringbuf_appendbyte(result
, digits
[(cp
& 0x00f0) >> 4]);
596 svn_stringbuf_appendbyte(result
, digits
[(cp
& 0x000f)]);
597 svn_stringbuf_appendbyte(result
, '}');