Updated version number for development
[deark.git] / src / deark-ucstring.c
blob4716755bd910336a733c89121a24bb53e20f7d58
1 // This file is part of Deark.
2 // Copyright (C) 2016 Jason Summers
3 // See the file COPYING for terms of use.
5 // Implementation of de_ucstring Unicode string object
7 #define DE_NOT_IN_MODULE
8 #include "deark-config.h"
10 #include "deark-private.h"
12 // de_ucstring is a Unicode (utf-32) string object.
13 de_ucstring *ucstring_create(deark *c)
15 de_ucstring *s;
16 s = de_malloc(c, sizeof(de_ucstring));
17 s->c = c;
18 return s;
21 void ucstring_empty(de_ucstring *s)
23 ucstring_truncate(s, 0);
26 // Reduce the string's length to newlen, by deleting the characters after
27 // that point.
28 // 'newlen' is expected to be no larger than the string's current length.
29 void ucstring_truncate(de_ucstring *s, i64 newlen)
31 if(!s) return;
32 if(newlen<0) newlen=0;
33 if(newlen<s->len) s->len = newlen;
35 if(s->tmp_string) {
36 // There's no requirement to free tmp_string here, but it's no
37 // longer needed, and maybe it's nice to have a way to do it.
38 de_free(s->c, s->tmp_string);
39 s->tmp_string = NULL;
43 // Delete the first U+0000 character, and everything after it.
44 void ucstring_truncate_at_NUL(de_ucstring *s)
46 i64 i;
48 for(i=0; i<s->len; i++) {
49 if(s->str[i]==0x0000) {
50 ucstring_truncate(s, i);
51 return;
56 // If the string ends with U+0000, delete that character.
57 // If not, do nothing.
58 void ucstring_strip_trailing_NUL(de_ucstring *s)
60 if(s->len>=1 && s->str[s->len-1]==0x0000) {
61 ucstring_truncate(s, s->len-1);
65 void ucstring_strip_trailing_spaces(de_ucstring *s)
67 while(s->len>=1 && s->str[s->len-1]==' ') {
68 ucstring_truncate(s, s->len-1);
72 // Append s2 to s1
73 void ucstring_append_ucstring(de_ucstring *s1, const de_ucstring *s2)
75 i64 i;
77 if(!s2) return;
78 // TODO: This could be done more efficiently.
79 for(i=0; i<s2->len; i++) {
80 ucstring_append_char(s1, s2->str[i]);
84 void ucstring_vprintf(de_ucstring *s, de_ext_encoding ee, const char *fmt, va_list ap)
86 char buf[1024];
87 de_vsnprintf(buf, sizeof(buf), fmt, ap);
88 ucstring_append_sz(s, buf, ee);
91 // Appends a formatted C-style string.
92 // (Unfortunately, there is no format specifier for a ucstring.)
93 // There is a limit to how many characters will be appended.
94 void ucstring_printf(de_ucstring *s, de_ext_encoding ee, const char *fmt, ...)
96 va_list ap;
98 va_start(ap, fmt);
99 ucstring_vprintf(s, ee, fmt, ap);
100 va_end(ap);
103 de_ucstring *ucstring_clone(const de_ucstring *src)
105 de_ucstring *dst;
107 if(!src) return NULL;
108 dst = ucstring_create(src->c);
109 ucstring_append_ucstring(dst, src);
110 return dst;
113 void ucstring_destroy(de_ucstring *s)
115 deark *c;
116 if(s) {
117 c = s->c;
118 de_free(c, s->str);
119 de_free(c, s->tmp_string);
120 de_free(c, s);
124 int ucstring_isempty(const de_ucstring *s)
126 if(!s) return 1;
127 return (s->len == 0);
130 int ucstring_isnonempty(const de_ucstring *s)
132 return (s && (s->len > 0));
135 void ucstring_append_char(de_ucstring *s, de_rune ch)
137 i64 new_len;
138 i64 new_alloc;
140 if(s->len >= 100000000) {
141 return;
143 new_len = s->len + 1;
144 if(new_len > s->alloc) {
145 new_alloc = s->alloc * 2;
146 if(new_alloc<32) new_alloc=32;
147 s->str = de_reallocarray(s->c, s->str, s->alloc, sizeof(i32), new_alloc);
148 s->alloc = new_alloc;
151 s->str[s->len] = ch;
152 s->len++;
155 static void handle_invalid_byte(de_ucstring *s, u8 n)
157 i32 ch;
159 ch = DE_CODEPOINT_BYTE00 + (i32)n;
160 ucstring_append_char(s, ch);
163 static void handle_invalid_bytes(de_ucstring *s, const u8 *buf, UI buflen)
165 UI k;
167 for(k=0; k<buflen; k++) {
168 handle_invalid_byte(s, buf[k]);
172 // This function does not validate the "control" bits of the UTF-8 sequence.
173 // Out-of-range codepoints are return as U+FFFD.
174 static de_rune decode_utf8_sequence(const u8 *b, UI seqlen)
176 UI ch;
178 if(seqlen==2) { // 2-byte
179 ch = b[0] & 0x1f;
180 ch = (ch<<6) | (b[1] & 0x3f);
181 if(ch<0x80) ch = 0xfffd;
183 else if(seqlen==3) { // 3-byte
184 ch = b[0] & 0x0f;
185 ch = (ch<<6) | (b[1] & 0x3f);
186 ch = (ch<<6) | (b[2] & 0x3f);
187 if(ch<0x800) ch = 0xfffd;
189 else if(seqlen==4) {
190 ch = b[0] & 0x07;
191 ch = (ch<<6) | (b[1] & 0x3f);
192 ch = (ch<<6) | (b[2] & 0x3f);
193 ch = (ch<<6) | (b[3] & 0x3f);
194 if(ch<0x10000 || ch>0x10ffff) ch = 0xfffd;
196 else {
197 ch = b[0] & 0x7f;
199 return (de_rune)ch;
202 #define UTF8_NBYTES_EXPECTED (es->buf[7])
203 #define UTF8_NBYTES_SAVED (es->buf[6])
205 static void append_bytes_utf8(de_ucstring *s, const u8 *cbuf, i64 buflen,
206 unsigned int conv_flags, struct de_encconv_state *es)
208 i64 pos;
210 for(pos=0; pos<buflen; pos++) {
211 u8 n = cbuf[pos];
213 if(n>=0x80 && n<=0xbf) { // continuation byte
214 if(UTF8_NBYTES_EXPECTED==0) {
215 handle_invalid_byte(s, n);
217 else { // valid continuation byte
218 es->buf[UTF8_NBYTES_SAVED] = n;
219 UTF8_NBYTES_SAVED++;
220 UTF8_NBYTES_EXPECTED--;
221 if(UTF8_NBYTES_EXPECTED==0) {
222 ucstring_append_char(s,
223 decode_utf8_sequence(es->buf, (UI)UTF8_NBYTES_SAVED));
227 else { // not a continuation byte
228 // Should not be any pending bytes. If there are, they're invalid.
229 if(UTF8_NBYTES_EXPECTED != 0) {
230 handle_invalid_bytes(s, es->buf, (UI)UTF8_NBYTES_SAVED);
231 UTF8_NBYTES_EXPECTED = 0;
234 if(n<=0x7f) {
235 ucstring_append_char(s, (i32)n);
236 UTF8_NBYTES_EXPECTED = 0; // number of additional continuation bytes expected
238 else if(n<=0xdf) { // 2-byte UTF-8 char
239 es->buf[0] = n; // save this byte for later
240 UTF8_NBYTES_SAVED = 1; // number of bytes saved in buf[0]...buf[3]
241 UTF8_NBYTES_EXPECTED = 1; // 1 more byte expected in this sequence
243 else if(n<=0xef) { // 3-byte UTF-8 char
244 es->buf[0] = n;
245 UTF8_NBYTES_SAVED = 1;
246 UTF8_NBYTES_EXPECTED = 2;
248 else if(n<=0xf7) { // 4-byte UTF-8 char
249 es->buf[0] = n;
250 UTF8_NBYTES_SAVED = 1;
251 UTF8_NBYTES_EXPECTED = 3;
253 else {
254 handle_invalid_byte(s, n);
255 UTF8_NBYTES_EXPECTED = 0;
260 // Check if there are unprocessed bytes at the end of the string, when there
261 // shouldn't be.
262 if(UTF8_NBYTES_EXPECTED!=0 && !(conv_flags & DE_CONVFLAG_PARTIAL_DATA)) {
263 handle_invalid_bytes(s, es->buf, (UI)UTF8_NBYTES_SAVED);
264 UTF8_NBYTES_EXPECTED = 0;
268 #undef UTF8_NBYTES_EXPECTED
269 #undef UTF8_NBYTES_SAVED
271 #define UTF16_NBYTES_SAVED (es->buf[7])
273 static i64 getu16x_direct(const u8 *m, int is_le)
275 if(is_le)
276 return de_getu16le_direct(m);
277 return de_getu16be_direct(m);
280 static void append_bytes_utf16(de_ucstring *s, const u8 *cbuf, i64 buflen,
281 unsigned int conv_flags, struct de_encconv_state *es, int is_le)
283 i64 pos = 0;
284 i32 ch;
286 for(pos=0; pos<buflen; pos++) {
287 u8 n = cbuf[pos];
289 es->buf[UTF16_NBYTES_SAVED] = n;
290 UTF16_NBYTES_SAVED++;
292 if(UTF16_NBYTES_SAVED==2) {
293 ch = (i32)getu16x_direct(es->buf, is_le);
294 if(ch>=0xd800 && ch<0xdc00) {
295 ; // lead surrogate; do nothing
297 else if(ch>=0xdc00 && ch<0xe000) {
298 // trail surrogate, shouldn't be here
299 ucstring_append_char(s, 0xfffd);
300 UTF16_NBYTES_SAVED = 0;
302 else { // non-surrogate
303 ucstring_append_char(s, ch);
304 UTF16_NBYTES_SAVED = 0;
307 else if(UTF16_NBYTES_SAVED>=4) { // >4 is impossible
308 i32 ch2;
310 ch = (i32)getu16x_direct(es->buf, is_le);
311 ch2 = (i32)getu16x_direct(&es->buf[2], is_le);
313 if(ch2>=0xd800 && ch2<0xdc00) {
314 ; // lead surrogate immediately following another lead surrogate
315 ucstring_append_char(s, 0xfffd);
316 es->buf[0] = es->buf[2];
317 es->buf[1] = es->buf[3];
318 UTF16_NBYTES_SAVED = 2;
320 else if(ch2>=0xdc00 && ch2<0xe000) {
321 // well-formed surrogate pair
322 ucstring_append_char(s, 0x10000 + (((ch-0xd800)<<10) | (ch2-0xdc00)));
323 UTF16_NBYTES_SAVED = 0;
325 else { // non-surrogate immediately following a lead surrogate
326 ucstring_append_char(s, 0xfffd);
327 ucstring_append_char(s, ch2);
328 UTF16_NBYTES_SAVED = 0;
334 // Check if there are unprocessed bytes at the end of the string, when there
335 // shouldn't be.
336 if(UTF16_NBYTES_SAVED!=0 && !(conv_flags & DE_CONVFLAG_PARTIAL_DATA)) {
337 ucstring_append_char(s, 0xfffd);
338 UTF16_NBYTES_SAVED = 0;
342 #undef UTF16_NBYTES_SAVED
344 // conv_flags:
345 // DE_CONVFLAG_PARTIAL_DATA: There might be more data after this; if 'buf' ends
346 // in a way it shouldn't, it's not necessarily an error.
347 void ucstring_append_bytes_ex(de_ucstring *s, const u8 *buf, i64 buflen,
348 unsigned int conv_flags, struct de_encconv_state *es)
350 de_encoding encoding = DE_EXTENC_GET_BASE(es->ee);
352 // Adjust buflen if necessary.
353 if(conv_flags & DE_CONVFLAG_STOP_AT_NUL) {
354 char *tmpp;
355 tmpp = de_memchr(buf, 0, (size_t)buflen);
356 if(tmpp) {
357 buflen = (const u8*)tmpp - buf;
361 if(encoding==DE_ENCODING_UTF8) {
362 append_bytes_utf8(s, buf, buflen, conv_flags, es);
364 else if(encoding==DE_ENCODING_UTF16LE || encoding==DE_ENCODING_UTF16BE) {
365 append_bytes_utf16(s, buf, buflen, conv_flags, es, (encoding==DE_ENCODING_UTF16LE));
367 else {
368 i64 pos;
370 for(pos=0; pos<buflen; pos++) {
371 i32 ch;
373 ch = de_char_to_unicode_ex(buf[pos], es);
374 if(ch==DE_CODEPOINT_INVALID) {
375 handle_invalid_byte(s, buf[pos]);
377 else {
378 ucstring_append_char(s, ch);
384 void ucstring_append_bytes(de_ucstring *s, const u8 *buf, i64 buflen,
385 unsigned int conv_flags, de_ext_encoding ee)
387 struct de_encconv_state es;
389 de_encconv_init(&es, ee);
390 ucstring_append_bytes_ex(s, buf, buflen, conv_flags, &es);
393 void ucstring_append_sz(de_ucstring *s, const char *sz, de_ext_encoding ee)
395 i64 len;
397 if(ee==DE_ENCODING_LATIN1) { // Fast path for this common case
398 const u8 *sz_u8 = (const u8*)sz;
400 while(*sz_u8) {
401 ucstring_append_char(s, (de_rune)(*sz_u8));
402 sz_u8++;
404 return;
407 len = (i64)de_strlen(sz);
408 ucstring_append_bytes(s, (const u8*)sz, len, 0, ee);
411 static int ucstring_is_ascii(const de_ucstring *s)
413 i64 i;
414 for(i=0; i<s->len; i++) {
415 if(s->str[i]<0 || s->str[i]>=0x80)
416 return 0;
418 return 1;
421 i64 ucstring_count_utf8_bytes(de_ucstring *s)
423 i64 i;
424 i64 n = 0;
425 if(!s) return n;
426 for(i=0; i<s->len; i++) {
427 if(s->str[i]<0 || s->str[i]>0xffff) n+=4;
428 else if(s->str[i]<=0x7f) n+=1;
429 else if(s->str[i]<=0x7ff) n+=2;
430 else n+=3;
432 return n;
435 // If add_bom_if_needed is set, we'll prepend a BOM if the global c->write_bom
436 // option is enabled, and 's' has any non-ASCII characters, and 's' doesn't already
437 // start with a BOM.
438 void ucstring_write_as_utf8(deark *c, de_ucstring *s, dbuf *outf, int add_bom_if_needed)
440 i64 i;
442 if(add_bom_if_needed &&
443 c->write_bom &&
444 (s->len>0 && s->str[0]!=0xfeff) &&
445 !ucstring_is_ascii(s))
447 // Write a BOM
448 dbuf_write_uchar_as_utf8(outf, 0xfeff);
451 for(i=0; i<s->len; i++) {
452 dbuf_write_uchar_as_utf8(outf, s->str[i]);
456 // Note: This function is similar to de_finfo_set_name_from_ucstring().
457 // Maybe they should be consolidated.
458 // TODO: Should we remove the 'encoding' param, and always assume UTF-8?
459 void ucstring_to_sz(de_ucstring *s, char *szbuf, size_t szbuf_len,
460 unsigned int flags, de_ext_encoding ee)
462 i64 i;
463 i64 szpos = 0;
464 i32 ch;
465 i64 charcodelen;
466 de_encoding encoding = DE_EXTENC_GET_BASE(ee);
467 static const char *sc1 = "\x01<"; // DE_CODEPOINT_HL in UTF-8
468 static const char *sc2 = ">\x02"; // DE_CODEPOINT_UNHL
469 u8 charcodebuf[32];
471 if(szbuf_len<1) return;
473 for(i=0; i<s->len; i++) {
474 ch = s->str[i];
475 if(encoding==DE_ENCODING_UTF8) {
476 de_uchar_to_utf8(ch, charcodebuf, &charcodelen);
478 else { // DE_ENCODING_LATIN1 or DE_ENCODING_ASCII
479 // TODO: This may not work right if DE_CONVFLAG_MAKE_PRINTABLE is used,
480 // but currently that never happens.
481 if(ch>=0 && ch<=(encoding==DE_ENCODING_LATIN1?255:127))
482 charcodebuf[0] = (u8)ch;
483 else
484 charcodebuf[0] = '_';
485 charcodelen = 1;
488 if(flags & DE_CONVFLAG_MAKE_PRINTABLE) {
489 // TODO: This is slightly inefficient, because we're overwriting the
490 // conversion we already did.
491 if(!de_is_printable_uchar(ch)) {
492 if(ch==0x0a) {
493 de_snprintf((char*)charcodebuf, sizeof(charcodebuf),
494 "%s\\n%s", sc1, sc2);
496 else if(ch==0x0d) {
497 de_snprintf((char*)charcodebuf, sizeof(charcodebuf),
498 "%s\\r%s", sc1, sc2);
500 else if(ch==0x09) {
501 de_snprintf((char*)charcodebuf, sizeof(charcodebuf),
502 "%s\\t%s", sc1, sc2);
504 else if(ch==0x00) {
505 de_snprintf((char*)charcodebuf, sizeof(charcodebuf),
506 "%s\\0%s", sc1, sc2);
508 else if(ch>=DE_CODEPOINT_BYTE00 && ch<=DE_CODEPOINT_BYTEFF) {
509 de_snprintf((char*)charcodebuf, sizeof(charcodebuf), "%s%02X%s",
510 sc1, (int)(ch-DE_CODEPOINT_BYTE00), sc2);
512 else {
513 de_snprintf((char*)charcodebuf, sizeof(charcodebuf),
514 "%sU+%04X%s", sc1, (unsigned int)ch, sc2);
516 charcodelen = (i64)de_strlen((const char*)charcodebuf);
520 if(szpos + charcodelen + 1 > (i64)szbuf_len) break;
521 if(charcodelen==1) {
522 szbuf[szpos] = charcodebuf[0];
524 else {
525 de_memcpy(&szbuf[szpos], charcodebuf, (size_t)charcodelen);
527 szpos += charcodelen;
530 szbuf[szpos] = '\0';
533 // Try to determine if a Unicode codepoint (presumed to be from an untrusted source)
534 // is "safe" to print to a terminal.
535 // We try to ban control characters, formatting characters, private-use characters,
536 // and noncharacters.
537 // It would be good to also ban incorrectly-used "combining" and other context-
538 // sensitive characters, but that's too difficult.
539 int de_is_printable_uchar(de_rune ch)
541 struct pr_range { i32 n1, n2; };
542 static const struct pr_range ranges[] = {
543 { 0x0020, 0x007e },
544 { 0x00a0, 0x200d },
545 { 0x2010, 0x2027 },
546 { 0x202f, 0x2065 },
547 { 0x2070, 0xd7ff },
548 { 0xf900, 0xfdcf },
549 { 0xfdf0, 0xfdff },
550 { 0xfe10, 0xfefe },
551 { 0xff00, 0xffef },
552 { 0xfffd, 0xfffd },
553 { 0x10000, 0x101ff },
554 { 0x1f000, 0x1fbff }
555 // TODO: Whitelist more codepoints
557 size_t i;
558 const size_t num_ranges = DE_ARRAYCOUNT(ranges);
560 for(i=0; i<num_ranges; i++) {
561 if(ch>=ranges[i].n1 && ch<=ranges[i].n2) return 1;
563 return 0;
566 static const char *ucstring_getpsz_internal(de_ucstring *s,
567 int has_max, i64 max_bytes)
569 i64 allocsize;
571 if(!s) {
572 if(has_max && max_bytes<6) return "";
573 return "(null)";
576 if(has_max) {
577 allocsize = max_bytes + 1;
579 else {
580 // TODO: Calculating the proper allocsize could be difficult,
581 // depending on how DE_CONVFLAG_MAKE_PRINTABLE is implemented.
582 allocsize = s->len * 4 + 1 + 100;
585 if(s->tmp_string)
586 de_free(s->c, s->tmp_string);
587 s->tmp_string = de_malloc(s->c, allocsize);
589 ucstring_to_sz(s, s->tmp_string, (size_t)allocsize, DE_CONVFLAG_MAKE_PRINTABLE, DE_ENCODING_UTF8);
591 return s->tmp_string;
594 const char *ucstring_getpsz(de_ucstring *s)
596 return ucstring_getpsz_internal(s, 0, 0);
599 // It might make more sense to limit the number of visible characters, instead
600 // of the number of bytes in the encoded string, but that's too difficult.
601 const char *ucstring_getpsz_n(de_ucstring *s, i64 max_bytes)
603 return ucstring_getpsz_internal(s, 1, max_bytes);
606 const char *ucstring_getpsz_d(de_ucstring *s)
608 return ucstring_getpsz_internal(s, 1, DE_DBG_MAX_STRLEN);
611 void ucstring_append_flags_item(de_ucstring *s, const char *str)
613 ucstring_printf(s, DE_ENCODING_UTF8, "%s%s", (s->len>0)?" | ":"", str);
616 void ucstring_append_flags_itemf(de_ucstring *s, const char *fmt, ...)
618 va_list ap;
619 char buf[1024];
621 va_start(ap, fmt);
622 de_vsnprintf(buf, sizeof(buf), fmt, ap);
623 ucstring_append_flags_item(s, buf);
624 va_end(ap);
627 // strarray: A mini library, intended mainly to help manage directory paths.
629 struct de_strarray {
630 deark *c;
631 size_t max_elems;
632 size_t count;
633 size_t num_alloc;
634 de_ucstring **ss; // array of 'num_alloc' ucstring pointers
637 struct de_strarray *de_strarray_create(deark *c, size_t max_elems)
639 struct de_strarray *sa;
640 sa = de_malloc(c, sizeof(struct de_strarray));
641 sa->c = c;
642 sa->max_elems = max_elems;
643 return sa;
646 void de_strarray_destroy(struct de_strarray *sa)
648 deark *c;
650 if(!sa) return;
651 c = sa->c;
652 while(sa->count>0) {
653 de_strarray_pop(sa);
655 de_free(c, sa->ss);
656 de_free(c, sa);
659 // This makes a copy of 's'. The caller still owns 's'.
660 int de_strarray_push(struct de_strarray *sa, de_ucstring *s)
662 deark *c = sa->c;
663 size_t newidx = sa->count;
665 if(sa->count >= sa->max_elems) return 0;
666 if(newidx >= sa->num_alloc) {
667 size_t old_num_alloc = sa->num_alloc;
668 sa->num_alloc *= 2;
669 if(sa->num_alloc<8) sa->num_alloc=8;
670 sa->ss = de_reallocarray(c, sa->ss, old_num_alloc, sizeof(de_ucstring*),
671 sa->num_alloc);
673 sa->ss[newidx] = ucstring_clone(s);
674 sa->count++;
675 return 1;
678 int de_strarray_pop(struct de_strarray *sa)
680 if(sa->count<1) return 0;
681 ucstring_destroy(sa->ss[sa->count-1]);
682 sa->ss[sa->count-1] = NULL;
683 sa->count--;
684 return 1;
687 // Replace slashes in a string, starting at the given position.
688 static void mp_squash_slashes(de_ucstring *s, i64 pos1)
690 i64 i;
692 for(i=pos1; i<s->len; i++) {
693 if(s->str[i]=='/') {
694 s->str[i] = '_';
699 // Caller allocates 'path' to receive the path.
700 void de_strarray_make_path(struct de_strarray *sa, de_ucstring *path, unsigned int flags)
702 size_t i;
704 for(i=0; i<sa->count; i++) {
705 i64 oldlen = path->len;
707 if(ucstring_isnonempty(sa->ss[i])) {
708 ucstring_append_ucstring(path, sa->ss[i]);
710 else {
711 ucstring_append_char(path, '_');
714 mp_squash_slashes(path, oldlen);
715 if((i+1 < sa->count) || !(flags & DE_MPFLAG_NOTRAILINGSLASH)) {
716 ucstring_append_char(path, '/');