1 // This file is part of Deark.
2 // Copyright (C) 2016 Jason Summers
3 // See the file COPYING for terms of use.
5 // Implementation of de_ucstring Unicode string object
7 #define DE_NOT_IN_MODULE
8 #include "deark-config.h"
10 #include "deark-private.h"
12 // de_ucstring is a Unicode (utf-32) string object.
// Create a new ucstring object. 'c' is the context used for the allocation.
// The memory is obtained from de_malloc() and is sized for one de_ucstring.
13 de_ucstring
*ucstring_create(deark
*c
)
16 s
= de_malloc(c
, sizeof(de_ucstring
));
// Make the string empty, by truncating it to zero characters.
21 void ucstring_empty(de_ucstring
*s
)
23 ucstring_truncate(s
, 0);
26 // Reduce the string's length to newlen, by deleting the characters after
// that position.
28 // 'newlen' is expected to be no larger than the string's current length.
// (If it is larger, the length is left as it is.)
29 void ucstring_truncate(de_ucstring
*s
, i64 newlen
)
// A negative length is treated as 0.
32 if(newlen
<0) newlen
=0;
// Only ever shrink; this function never extends the string.
33 if(newlen
<s
->len
) s
->len
= newlen
;
36 // There's no requirement to free tmp_string here, but it's no
37 // longer needed, and maybe it's nice to have a way to do it.
38 de_free(s
->c
, s
->tmp_string
);
43 // Delete the first U+0000 character, and everything after it.
// The loop below calls ucstring_truncate() only when a U+0000 is found.
44 void ucstring_truncate_at_NUL(de_ucstring
*s
)
// Scan forward from the start of the string.
48 for(i
=0; i
<s
->len
; i
++) {
49 if(s
->str
[i
]==0x0000) {
// Truncating to length i removes the character at index i and all that follow.
50 ucstring_truncate(s
, i
);
56 // If the string ends with U+0000, delete that character.
57 // If not, do nothing.
// This is a single test, not a loop, so at most one character is removed.
58 void ucstring_strip_trailing_NUL(de_ucstring
*s
)
// The len>=1 test keeps the str[len-1] access in bounds for an empty string.
60 if(s
->len
>=1 && s
->str
[s
->len
-1]==0x0000) {
61 ucstring_truncate(s
, s
->len
-1);
// Delete every U+0020 (space) character at the end of the string.
// Only ' ' is tested for; other whitespace characters are not stripped.
65 void ucstring_strip_trailing_spaces(de_ucstring
*s
)
// Remove one character per iteration until the last character is not a
// space, or the string is empty.
67 while(s
->len
>=1 && s
->str
[s
->len
-1]==' ') {
68 ucstring_truncate(s
, s
->len
-1);
// Append all of the characters in s2 to the end of s1.
// s2 is not modified (it is const).
73 void ucstring_append_ucstring(de_ucstring
*s1
, const de_ucstring
*s2
)
78 // TODO: This could be done more efficiently.
// Characters are copied one at a time, via ucstring_append_char().
79 for(i
=0; i
<s2
->len
; i
++) {
80 ucstring_append_char(s1
, s2
->str
[i
]);
// va_list version of ucstring_printf().
// The text is first formatted into the local buffer 'buf' (bounded by
// sizeof(buf)), then appended to 's', with its bytes interpreted according
// to the encoding 'ee'.
84 void ucstring_vprintf(de_ucstring
*s
, de_ext_encoding ee
, const char *fmt
, va_list ap
)
87 de_vsnprintf(buf
, sizeof(buf
), fmt
, ap
);
88 ucstring_append_sz(s
, buf
, ee
);
91 // Appends a formatted C-style string.
92 // (Unfortunately, there is no format specifier for a ucstring.)
93 // There is a limit to how many characters will be appended.
// 'ee' is passed through, unchanged, to ucstring_vprintf(), which does the
// actual formatting and appending.
94 void ucstring_printf(de_ucstring
*s
, de_ext_encoding ee
, const char *fmt
, ...)
99 ucstring_vprintf(s
, ee
, fmt
, ap
);
// Make a new ucstring that has a copy of src's characters.
// The new string is created using the same context as src (src->c).
// If src is NULL, returns NULL.
103 de_ucstring
*ucstring_clone(const de_ucstring
*src
)
107 if(!src
) return NULL
;
108 dst
= ucstring_create(src
->c
);
109 ucstring_append_ucstring(dst
, src
);
// Destroy a ucstring.
// NOTE(review): Only the freeing of the cached tmp_string is visible in this
// excerpt. Confirm the rest of the teardown against the complete file.
113 void ucstring_destroy(de_ucstring
*s
)
119 de_free(c
, s
->tmp_string
);
// Test whether a string has no characters.
// The return statement shown yields 1 if s->len is 0, and 0 otherwise.
// NOTE(review): Any special handling of s==NULL is not visible here.
124 int ucstring_isempty(const de_ucstring
*s
)
127 return (s
->len
== 0);
// Returns 1 if s is non-NULL and contains at least one character.
// Otherwise (including when s is NULL) returns 0.
130 int ucstring_isnonempty(const de_ucstring
*s
)
// 's' is tested first, so s->len is never read through a NULL pointer.
132 return (s
&& (s
->len
> 0));
// Append the single codepoint 'ch' to the end of 's'.
// When one more character would not fit in s->alloc, the character array is
// enlarged: the allocation is doubled, with a minimum of 32 elements.
135 void ucstring_append_char(de_ucstring
*s
, de_rune ch
)
// Upper limit on the length of a string.
// NOTE(review): What happens when the limit is reached is not visible in
// this excerpt.
140 if(s
->len
>= 100000000) {
143 new_len
= s
->len
+ 1;
144 if(new_len
> s
->alloc
) {
145 new_alloc
= s
->alloc
* 2;
// This also handles the first append, when s->alloc*2 is less than 32.
146 if(new_alloc
<32) new_alloc
=32;
// Presumably resizes from s->alloc to new_alloc elements of sizeof(i32)
// bytes each -- verify against de_reallocarray()'s declaration.
147 s
->str
= de_reallocarray(s
->c
, s
->str
, s
->alloc
, sizeof(i32
), new_alloc
);
148 s
->alloc
= new_alloc
;
// Append a representation of a byte that could not be decoded.
// The byte value 'n' is mapped to the codepoint DE_CODEPOINT_BYTE00 + n, so
// each possible byte value gets its own distinct codepoint.
155 static void handle_invalid_byte(de_ucstring
*s
, u8 n
)
159 ch
= DE_CODEPOINT_BYTE00
+ (i32
)n
;
160 ucstring_append_char(s
, ch
);
// Call handle_invalid_byte() for each of the first 'buflen' bytes of 'buf',
// in order.
163 static void handle_invalid_bytes(de_ucstring
*s
, const u8
*buf
, UI buflen
)
167 for(k
=0; k
<buflen
; k
++) {
168 handle_invalid_byte(s
, buf
[k
]);
172 // This function does not validate the "control" bits of the UTF-8 sequence.
173 // Out-of-range codepoints are returned as U+FFFD.
// 'b' contains the bytes of one sequence, and 'seqlen' is the number of bytes.
// Each byte after the first contributes its low 6 bits (mask 0x3f).
174 static de_rune
decode_utf8_sequence(const u8
*b
, UI seqlen
)
178 if(seqlen
==2) { // 2-byte
180 ch
= (ch
<<6) | (b
[1] & 0x3f);
// A value below 0x80 does not need two bytes (overlong encoding).
181 if(ch
<0x80) ch
= 0xfffd;
183 else if(seqlen
==3) { // 3-byte
185 ch
= (ch
<<6) | (b
[1] & 0x3f);
186 ch
= (ch
<<6) | (b
[2] & 0x3f);
// A value below 0x800 does not need three bytes (overlong encoding).
187 if(ch
<0x800) ch
= 0xfffd;
// The remaining case uses b[1] through b[3], i.e. a 4-byte sequence.
191 ch
= (ch
<<6) | (b
[1] & 0x3f);
192 ch
= (ch
<<6) | (b
[2] & 0x3f);
193 ch
= (ch
<<6) | (b
[3] & 0x3f);
// Reject overlong encodings, and values above the last codepoint, U+10FFFF.
194 if(ch
<0x10000 || ch
>0x10ffff) ch
= 0xfffd;
// The UTF-8 decoder keeps its state in the es->buf array:
//  buf[7] = number of continuation bytes still needed to finish the sequence
//  buf[6] = number of bytes of the current sequence saved so far
//  The saved bytes themselves are stored starting at buf[0].
202 #define UTF8_NBYTES_EXPECTED (es->buf[7])
203 #define UTF8_NBYTES_SAVED (es->buf[6])
// Decode 'buflen' bytes of UTF-8 from 'cbuf', and append the resulting
// characters to 's'.
// Bytes that are not part of a valid sequence are appended by way of
// handle_invalid_byte(s).
// Unless DE_CONVFLAG_PARTIAL_DATA is set in conv_flags, a sequence that is
// still incomplete at the end of the buffer is treated as invalid.
205 static void append_bytes_utf8(de_ucstring
*s
, const u8
*cbuf
, i64 buflen
,
206 unsigned int conv_flags
, struct de_encconv_state
*es
)
210 for(pos
=0; pos
<buflen
; pos
++) {
213 if(n
>=0x80 && n
<=0xbf) { // continuation byte
// A continuation byte when no sequence is in progress is invalid.
214 if(UTF8_NBYTES_EXPECTED
==0) {
215 handle_invalid_byte(s
, n
);
217 else { // valid continuation byte
218 es
->buf
[UTF8_NBYTES_SAVED
] = n
;
220 UTF8_NBYTES_EXPECTED
--;
// If that was the last byte needed, decode and append the character.
221 if(UTF8_NBYTES_EXPECTED
==0) {
222 ucstring_append_char(s
,
223 decode_utf8_sequence(es
->buf
, (UI
)UTF8_NBYTES_SAVED
));
227 else { // not a continuation byte
228 // Should not be any pending bytes. If there are, they're invalid.
229 if(UTF8_NBYTES_EXPECTED
!= 0) {
230 handle_invalid_bytes(s
, es
->buf
, (UI
)UTF8_NBYTES_SAVED
);
231 UTF8_NBYTES_EXPECTED
= 0;
// The byte is appended as-is, as a one-byte character.
235 ucstring_append_char(s
, (i32
)n
);
236 UTF8_NBYTES_EXPECTED
= 0; // number of additional continuation bytes expected
238 else if(n
<=0xdf) { // 2-byte UTF-8 char
239 es
->buf
[0] = n
; // save this byte for later
240 UTF8_NBYTES_SAVED
= 1; // number of bytes saved in buf[0]...buf[3]
241 UTF8_NBYTES_EXPECTED
= 1; // 1 more byte expected in this sequence
243 else if(n
<=0xef) { // 3-byte UTF-8 char
245 UTF8_NBYTES_SAVED
= 1;
246 UTF8_NBYTES_EXPECTED
= 2;
248 else if(n
<=0xf7) { // 4-byte UTF-8 char
250 UTF8_NBYTES_SAVED
= 1;
251 UTF8_NBYTES_EXPECTED
= 3;
// Any other byte value cannot begin a sequence.
254 handle_invalid_byte(s
, n
);
255 UTF8_NBYTES_EXPECTED
= 0;
260 // Check if there are unprocessed bytes at the end of the string, when there
// shouldn't be (i.e. the caller did not say that this is partial data).
262 if(UTF8_NBYTES_EXPECTED
!=0 && !(conv_flags
& DE_CONVFLAG_PARTIAL_DATA
)) {
263 handle_invalid_bytes(s
, es
->buf
, (UI
)UTF8_NBYTES_SAVED
);
264 UTF8_NBYTES_EXPECTED
= 0;
268 #undef UTF8_NBYTES_EXPECTED
269 #undef UTF8_NBYTES_SAVED
// The UTF-16 decoder uses es->buf[7] to count the bytes it has saved in
// es->buf (starting at buf[0]).
271 #define UTF16_NBYTES_SAVED (es->buf[7])
// Read a 16-bit integer from the bytes at 'm', using either the little-endian
// or the big-endian reader.
// NOTE(review): Presumably 'is_le' chooses between the two calls below; the
// test itself is not visible in this excerpt.
273 static i64
getu16x_direct(const u8
*m
, int is_le
)
276 return de_getu16le_direct(m
);
277 return de_getu16be_direct(m
);
// Decode 'buflen' bytes of UTF-16 from 'cbuf', and append the resulting
// characters to 's'. 'is_le' is passed to getu16x_direct() for every code
// unit that is read.
// Input bytes are accumulated in es->buf. A decision is made when 2 bytes
// (one code unit) or 4 bytes (two code units) have been saved.
// Unpaired surrogates are replaced by U+FFFD.
280 static void append_bytes_utf16(de_ucstring
*s
, const u8
*cbuf
, i64 buflen
,
281 unsigned int conv_flags
, struct de_encconv_state
*es
, int is_le
)
286 for(pos
=0; pos
<buflen
; pos
++) {
// Save the byte, then see if there is enough saved data to act on.
289 es
->buf
[UTF16_NBYTES_SAVED
] = n
;
290 UTF16_NBYTES_SAVED
++;
// One complete code unit has been saved.
292 if(UTF16_NBYTES_SAVED
==2) {
293 ch
= (i32
)getu16x_direct(es
->buf
, is_le
);
294 if(ch
>=0xd800 && ch
<0xdc00) {
// Keep the saved bytes, and wait for the next code unit.
295 ; // lead surrogate; do nothing
297 else if(ch
>=0xdc00 && ch
<0xe000) {
298 // trail surrogate, shouldn't be here
299 ucstring_append_char(s
, 0xfffd);
300 UTF16_NBYTES_SAVED
= 0;
302 else { // non-surrogate
303 ucstring_append_char(s
, ch
);
304 UTF16_NBYTES_SAVED
= 0;
// Two code units have been saved: a lead surrogate, then one more unit.
307 else if(UTF16_NBYTES_SAVED
>=4) { // >4 is impossible
310 ch
= (i32
)getu16x_direct(es
->buf
, is_le
);
311 ch2
= (i32
)getu16x_direct(&es
->buf
[2], is_le
);
313 if(ch2
>=0xd800 && ch2
<0xdc00) {
314 ; // lead surrogate immediately following another lead surrogate
// Emit U+FFFD for the first one, and move the second one to the front of
// the buffer, so that it becomes the pending lead surrogate.
315 ucstring_append_char(s
, 0xfffd);
316 es
->buf
[0] = es
->buf
[2];
317 es
->buf
[1] = es
->buf
[3];
318 UTF16_NBYTES_SAVED
= 2;
320 else if(ch2
>=0xdc00 && ch2
<0xe000) {
321 // well-formed surrogate pair
// The lead supplies the high 10 bits and the trail the low 10 bits of the
// offset from 0x10000.
322 ucstring_append_char(s
, 0x10000 + (((ch
-0xd800)<<10) | (ch2
-0xdc00)));
323 UTF16_NBYTES_SAVED
= 0;
325 else { // non-surrogate immediately following a lead surrogate
// U+FFFD stands in for the unpaired lead; the second unit is kept as-is.
326 ucstring_append_char(s
, 0xfffd);
327 ucstring_append_char(s
, ch2
);
328 UTF16_NBYTES_SAVED
= 0;
334 // Check if there are unprocessed bytes at the end of the string, when there
// shouldn't be (i.e. the caller did not say that this is partial data).
// Whatever was left over is replaced by a single U+FFFD.
336 if(UTF16_NBYTES_SAVED
!=0 && !(conv_flags
& DE_CONVFLAG_PARTIAL_DATA
)) {
337 ucstring_append_char(s
, 0xfffd);
338 UTF16_NBYTES_SAVED
= 0;
342 #undef UTF16_NBYTES_SAVED
// Convert 'buflen' bytes at 'buf' to Unicode, and append them to 's'.
// The encoding is taken from es->ee. 'es' also holds the conversion state.
345 // DE_CONVFLAG_PARTIAL_DATA: There might be more data after this; if 'buf' ends
346 // in a way it shouldn't, it's not necessarily an error.
// DE_CONVFLAG_STOP_AT_NUL: Search 'buf' for a 0x00 byte, which can shorten
// the range of bytes that gets converted.
347 void ucstring_append_bytes_ex(de_ucstring
*s
, const u8
*buf
, i64 buflen
,
348 unsigned int conv_flags
, struct de_encconv_state
*es
)
350 de_encoding encoding
= DE_EXTENC_GET_BASE(es
->ee
);
352 // Adjust buflen if necessary.
353 if(conv_flags
& DE_CONVFLAG_STOP_AT_NUL
) {
355 tmpp
= de_memchr(buf
, 0, (size_t)buflen
);
// The new length is the number of bytes that precede the 0x00 byte.
357 buflen
= (const u8
*)tmpp
- buf
;
// UTF-8 and UTF-16 have dedicated multi-byte decoders.
361 if(encoding
==DE_ENCODING_UTF8
) {
362 append_bytes_utf8(s
, buf
, buflen
, conv_flags
, es
);
364 else if(encoding
==DE_ENCODING_UTF16LE
|| encoding
==DE_ENCODING_UTF16BE
) {
365 append_bytes_utf16(s
, buf
, buflen
, conv_flags
, es
, (encoding
==DE_ENCODING_UTF16LE
));
// For the bytes handled by this loop, each byte is converted individually.
370 for(pos
=0; pos
<buflen
; pos
++) {
373 ch
= de_char_to_unicode_ex(buf
[pos
], es
);
374 if(ch
==DE_CODEPOINT_INVALID
) {
375 handle_invalid_byte(s
, buf
[pos
]);
378 ucstring_append_char(s
, ch
);
// A wrapper for ucstring_append_bytes_ex(), for callers that don't have
// their own conversion-state object.
// A local de_encconv_state is initialized for encoding 'ee' on every call,
// so no decoder state is carried over from one call to the next.
384 void ucstring_append_bytes(de_ucstring
*s
, const u8
*buf
, i64 buflen
,
385 unsigned int conv_flags
, de_ext_encoding ee
)
387 struct de_encconv_state es
;
389 de_encconv_init(&es
, ee
);
390 ucstring_append_bytes_ex(s
, buf
, buflen
, conv_flags
, &es
);
// Append the C-style string 'sz' to 's'. 'ee' is the encoding of 'sz'.
393 void ucstring_append_sz(de_ucstring
*s
, const char *sz
, de_ext_encoding ee
)
397 if(ee
==DE_ENCODING_LATIN1
) { // Fast path for this common case
// The string is accessed as unsigned bytes, and each byte's value is
// appended directly as a codepoint.
398 const u8
*sz_u8
= (const u8
*)sz
;
401 ucstring_append_char(s
, (de_rune
)(*sz_u8
));
// General case: measure the string, and use the byte converter. No
// conversion flags are set.
407 len
= (i64
)de_strlen(sz
);
408 ucstring_append_bytes(s
, (const u8
*)sz
, len
, 0, ee
);
// Test whether every character in the string is in the range 0 to 0x7f.
// NOTE(review): The return statements are not visible in this excerpt.
411 static int ucstring_is_ascii(const de_ucstring
*s
)
414 for(i
=0; i
<s
->len
; i
++) {
// This condition is true for a character that is outside the ASCII range.
415 if(s
->str
[i
]<0 || s
->str
[i
]>=0x80)
// Add up a per-character byte count for every character in the string.
// The name suggests that this is the size of the string when encoded as
// UTF-8.
// NOTE(review): The case for characters from 0x800 to 0xffff is not visible
// in this excerpt.
421 i64
ucstring_count_utf8_bytes(de_ucstring
*s
)
426 for(i
=0; i
<s
->len
; i
++) {
// Negative values, and values above 0xffff, are counted as 4 bytes.
427 if(s
->str
[i
]<0 || s
->str
[i
]>0xffff) n
+=4;
428 else if(s
->str
[i
]<=0x7f) n
+=1;
429 else if(s
->str
[i
]<=0x7ff) n
+=2;
// Write the characters of 's' to 'outf', each one encoded as UTF-8.
435 // If add_bom_if_needed is set, we'll prepend a BOM if the global c->write_bom
436 // option is enabled, and 's' has any non-ASCII characters, and 's' doesn't already
// start with a BOM (U+FEFF).
438 void ucstring_write_as_utf8(deark
*c
, de_ucstring
*s
, dbuf
*outf
, int add_bom_if_needed
)
442 if(add_bom_if_needed
&&
// Nonempty, and the first character is not already U+FEFF.
444 (s
->len
>0 && s
->str
[0]!=0xfeff) &&
445 !ucstring_is_ascii(s
))
448 dbuf_write_uchar_as_utf8(outf
, 0xfeff);
// Write the characters themselves.
451 for(i
=0; i
<s
->len
; i
++) {
452 dbuf_write_uchar_as_utf8(outf
, s
->str
[i
]);
// Convert 's' to a C-style string, stored in the caller-supplied buffer
// 'szbuf', whose size in bytes is 'szbuf_len'.
// Conversion stops at the first character whose encoded form, plus one more
// byte, would not fit in the buffer.
// If DE_CONVFLAG_MAKE_PRINTABLE is set in 'flags', a character that fails
// the de_is_printable_uchar() test is replaced by an escape sequence
// bracketed by the sc1 and sc2 marker strings.
456 // Note: This function is similar to de_finfo_set_name_from_ucstring().
457 // Maybe they should be consolidated.
458 // TODO: Should we remove the 'encoding' param, and always assume UTF-8?
459 void ucstring_to_sz(de_ucstring
*s
, char *szbuf
, size_t szbuf_len
,
460 unsigned int flags
, de_ext_encoding ee
)
466 de_encoding encoding
= DE_EXTENC_GET_BASE(ee
);
467 static const char *sc1
= "\x01<"; // DE_CODEPOINT_HL in UTF-8
468 static const char *sc2
= ">\x02"; // DE_CODEPOINT_UNHL
// Nothing can be stored in a zero-length buffer.
471 if(szbuf_len
<1) return;
473 for(i
=0; i
<s
->len
; i
++) {
// First, encode the character into charcodebuf.
475 if(encoding
==DE_ENCODING_UTF8
) {
476 de_uchar_to_utf8(ch
, charcodebuf
, &charcodelen
);
478 else { // DE_ENCODING_LATIN1 or DE_ENCODING_ASCII
479 // TODO: This may not work right if DE_CONVFLAG_MAKE_PRINTABLE is used,
480 // but currently that never happens.
// The largest value that can be stored directly is 255 for Latin-1, and
// 127 otherwise.
481 if(ch
>=0 && ch
<=(encoding
==DE_ENCODING_LATIN1
?255:127))
482 charcodebuf
[0] = (u8
)ch
;
// Substitute for a character that can't be represented.
484 charcodebuf
[0] = '_';
488 if(flags
& DE_CONVFLAG_MAKE_PRINTABLE
) {
489 // TODO: This is slightly inefficient, because we're overwriting the
490 // conversion we already did.
491 if(!de_is_printable_uchar(ch
)) {
// The escapes written below are the literal two-character sequences
// \n, \r, \t, and \0.
493 de_snprintf((char*)charcodebuf
, sizeof(charcodebuf
),
494 "%s\\n%s", sc1
, sc2
);
497 de_snprintf((char*)charcodebuf
, sizeof(charcodebuf
),
498 "%s\\r%s", sc1
, sc2
);
501 de_snprintf((char*)charcodebuf
, sizeof(charcodebuf
),
502 "%s\\t%s", sc1
, sc2
);
505 de_snprintf((char*)charcodebuf
, sizeof(charcodebuf
),
506 "%s\\0%s", sc1
, sc2
);
// A codepoint in this range is printed as a two-digit hex number (its
// offset from DE_CODEPOINT_BYTE00).
508 else if(ch
>=DE_CODEPOINT_BYTE00
&& ch
<=DE_CODEPOINT_BYTEFF
) {
509 de_snprintf((char*)charcodebuf
, sizeof(charcodebuf
), "%s%02X%s",
510 sc1
, (int)(ch
-DE_CODEPOINT_BYTE00
), sc2
);
// Generic form: "U+" followed by at least 4 hex digits.
513 de_snprintf((char*)charcodebuf
, sizeof(charcodebuf
),
514 "%sU+%04X%s", sc1
, (unsigned int)ch
, sc2
);
// The replacement text is NUL-terminated, so its length can be measured.
516 charcodelen
= (i64
)de_strlen((const char*)charcodebuf
);
// Stop if this character would not fit. The "+ 1" keeps one byte in reserve.
520 if(szpos
+ charcodelen
+ 1 > (i64
)szbuf_len
) break;
522 szbuf
[szpos
] = charcodebuf
[0];
525 de_memcpy(&szbuf
[szpos
], charcodebuf
, (size_t)charcodelen
);
527 szpos
+= charcodelen
;
533 // Try to determine if a Unicode codepoint (presumed to be from an untrusted source)
534 // is "safe" to print to a terminal.
535 // We try to ban control characters, formatting characters, private-use characters,
536 // and noncharacters.
537 // It would be good to also ban incorrectly-used "combining" and other context-
538 // sensitive characters, but that's too difficult.
// This is implemented as a whitelist: 1 is returned if 'ch' is inside any of
// the ranges in the table.
539 int de_is_printable_uchar(de_rune ch
)
// Each range is inclusive at both ends: n1 <= ch <= n2.
541 struct pr_range
{ i32 n1
, n2
; };
542 static const struct pr_range ranges
[] = {
553 { 0x10000, 0x101ff },
555 // TODO: Whitelist more codepoints
558 const size_t num_ranges
= DE_ARRAYCOUNT(ranges
);
// Linear search of the table.
560 for(i
=0; i
<num_ranges
; i
++) {
561 if(ch
>=ranges
[i
].n1
&& ch
<=ranges
[i
].n2
) return 1;
// Common code for the ucstring_getpsz*() functions.
// Converts 's' to a UTF-8 C-style string, with the "make printable" option
// enabled, and returns a pointer to it.
// The converted string is stored in s->tmp_string. Any previous tmp_string
// is freed first, so a pointer returned by an earlier call on the same
// string object is no longer valid.
// has_max: Nonzero if max_bytes is to be respected.
// max_bytes: Size limit for the converted string.
566 static const char *ucstring_getpsz_internal(de_ucstring
*s
,
567 int has_max
, i64 max_bytes
)
// A limit below 6 bytes is not supported; an empty string is returned.
572 if(has_max
&& max_bytes
<6) return "";
// One extra byte, beyond max_bytes.
577 allocsize
= max_bytes
+ 1;
580 // TODO: Calculating the proper allocsize could be difficult,
581 // depending on how DE_CONVFLAG_MAKE_PRINTABLE is implemented.
582 allocsize
= s
->len
* 4 + 1 + 100;
586 de_free(s
->c
, s
->tmp_string
);
587 s
->tmp_string
= de_malloc(s
->c
, allocsize
);
589 ucstring_to_sz(s
, s
->tmp_string
, (size_t)allocsize
, DE_CONVFLAG_MAKE_PRINTABLE
, DE_ENCODING_UTF8
);
591 return s
->tmp_string
;
// Get a printable C-style version of 's', with no maximum size requested
// (has_max=0).
594 const char *ucstring_getpsz(de_ucstring
*s
)
596 return ucstring_getpsz_internal(s
, 0, 0);
// Same as ucstring_getpsz(), but with a maximum size, max_bytes.
599 // It might make more sense to limit the number of visible characters, instead
600 // of the number of bytes in the encoded string, but that's too difficult.
601 const char *ucstring_getpsz_n(de_ucstring
*s
, i64 max_bytes
)
603 return ucstring_getpsz_internal(s
, 1, max_bytes
);
// Same as ucstring_getpsz_n(), with the maximum size set to
// DE_DBG_MAX_STRLEN.
606 const char *ucstring_getpsz_d(de_ucstring
*s
)
608 return ucstring_getpsz_internal(s
, 1, DE_DBG_MAX_STRLEN
);
// Append 'str' to a list of items that are separated by " | ".
// The separator is written first if 's' is not empty, so it only ever
// appears between items.
// 'str' is interpreted as UTF-8.
611 void ucstring_append_flags_item(de_ucstring
*s
, const char *str
)
613 ucstring_printf(s
, DE_ENCODING_UTF8
, "%s%s", (s
->len
>0)?" | ":"", str
);
// printf-style version of ucstring_append_flags_item().
// The item is formatted into the local buffer 'buf' (bounded by
// sizeof(buf)), then appended.
616 void ucstring_append_flags_itemf(de_ucstring
*s
, const char *fmt
, ...)
622 de_vsnprintf(buf
, sizeof(buf
), fmt
, ap
);
623 ucstring_append_flags_item(s
, buf
);
627 // strarray: A mini library, intended mainly to help manage directory paths.
634 de_ucstring
**ss
; // array of 'num_alloc' ucstring pointers
// Create a new strarray object.
// max_elems is saved in the object; de_strarray_push() refuses to add an
// element once the array contains that many.
637 struct de_strarray
*de_strarray_create(deark
*c
, size_t max_elems
)
639 struct de_strarray
*sa
;
640 sa
= de_malloc(c
, sizeof(struct de_strarray
));
642 sa
->max_elems
= max_elems
;
// Destroy a strarray object.
// NOTE(review): The body of this function is not visible in this excerpt;
// confirm what it frees against the complete file.
646 void de_strarray_destroy(struct de_strarray
*sa
)
// Add a string to the end of the array.
659 // This makes a copy of 's'. The caller still owns 's'.
// Returns 0, without adding anything, if the array already contains
// max_elems elements.
660 int de_strarray_push(struct de_strarray
*sa
, de_ucstring
*s
)
// The new element goes just after the current last element.
663 size_t newidx
= sa
->count
;
665 if(sa
->count
>= sa
->max_elems
) return 0;
// Enlarge the pointer array if it has no unused slot.
666 if(newidx
>= sa
->num_alloc
) {
// The old size is remembered, because it's needed by the realloc call.
667 size_t old_num_alloc
= sa
->num_alloc
;
// The array always has room for at least 8 pointers.
669 if(sa
->num_alloc
<8) sa
->num_alloc
=8;
670 sa
->ss
= de_reallocarray(c
, sa
->ss
, old_num_alloc
, sizeof(de_ucstring
*),
// Store a clone, not the caller's pointer.
673 sa
->ss
[newidx
] = ucstring_clone(s
);
// Remove the last string from the array, and destroy it.
// Returns 0 if the array is empty.
678 int de_strarray_pop(struct de_strarray
*sa
)
680 if(sa
->count
<1) return 0;
681 ucstring_destroy(sa
->ss
[sa
->count
-1]);
// Clear the slot, so that it doesn't contain a dangling pointer.
682 sa
->ss
[sa
->count
-1] = NULL
;
687 // Replace slashes in a string, starting at the given position.
// Characters before index pos1 are not examined.
// NOTE(review): The body of the loop (what a slash is replaced with) is not
// visible in this excerpt.
688 static void mp_squash_slashes(de_ucstring
*s
, i64 pos1
)
692 for(i
=pos1
; i
<s
->len
; i
++) {
// Construct a path from the strings in the array, by appending each one to
// 'path', with '/' characters as separators.
699 // Caller allocates 'path' to receive the path.
// If DE_MPFLAG_NOTRAILINGSLASH is set in 'flags', no '/' is appended after
// the last component.
700 void de_strarray_make_path(struct de_strarray
*sa
, de_ucstring
*path
, unsigned int flags
)
704 for(i
=0; i
<sa
->count
; i
++) {
// Remember where this component starts, so that only this component is
// passed through mp_squash_slashes() below.
705 i64 oldlen
= path
->len
;
707 if(ucstring_isnonempty(sa
->ss
[i
])) {
708 ucstring_append_ucstring(path
, sa
->ss
[i
]);
// NOTE(review): Presumably this is the alternative, used for an empty
// component; the 'else' is not visible in this excerpt.
711 ucstring_append_char(path
, '_');
714 mp_squash_slashes(path
, oldlen
);
// Append a separator after every component but the last. After the last one,
// it is appended only if the NOTRAILINGSLASH flag is not set.
715 if((i
+1 < sa
->count
) || !(flags
& DE_MPFLAG_NOTRAILINGSLASH
)) {
716 ucstring_append_char(path
, '/');