2 // "$Id: fl_utf8.cxx 7975 2010-12-08 12:15:48Z AlbrechtS $"
4 // Unicode to UTF-8 conversion functions.
6 // Author: Jean-Marc Lienher ( http://oksid.ch )
7 // Copyright 2000-2010 by O'ksi'D.
9 // This library is free software; you can redistribute it and/or
10 // modify it under the terms of the GNU Library General Public
11 // License as published by the Free Software Foundation; either
12 // version 2 of the License, or (at your option) any later version.
14 // This library is distributed in the hope that it will be useful,
15 // but WITHOUT ANY WARRANTY; without even the implied warranty of
16 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 // Library General Public License for more details.
19 // You should have received a copy of the GNU Library General Public
20 // License along with this library; if not, write to the Free Software
21 // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
24 // Please report all bugs and problems on the following page:
26 // http://www.fltk.org/str.php
30 #include <FL/filename.H>
33 #if defined(WIN32) && !defined(__CYGWIN__)
45 int XUtf8Tolower(int ucs
);
46 unsigned short XUtf8IsNonSpacing(unsigned int ucs
);
49 #elif defined(__APPLE__)
57 # include <sys/types.h>
58 # include <sys/stat.h>
61 int XUtf8Tolower(int ucs
);
62 unsigned short XUtf8IsNonSpacing(unsigned int ucs
);
65 #else // X-windows platform
67 # include <FL/Xutf8.h>
68 # include <sys/types.h>
69 # include <sys/stat.h>
74 #include <FL/fl_utf8.h>
80 /** \addtogroup fl_unicode
84 /*** NOTE : all functions are LIMITED to 24 bits Unicode values !!! ***/
85 /*** but only 16 bits are really used under Linux and win32 ***/
/* Number of entries in the case-conversion lookup table: one slot for each
 * value in 0x0000..0xFFFF. Code points >= NBC are passed through unchanged
 * by the lookup. The expansion is parenthesized so the macro evaluates as a
 * single value in any expression (e.g. `2 * NBC`, `x % NBC`). */
#define NBC (0xFFFF + 1)
95 static unsigned short *table
= NULL
;
98 table
= (unsigned short*) malloc(
99 sizeof(unsigned short) * (NBC
));
100 for (i
= 0; i
< NBC
; i
++) {
101 table
[i
] = (unsigned short) i
;
103 for (i
= 0; i
< NBC
; i
++) {
106 if (l
!= i
) table
[l
] = (unsigned short) i
;
110 if (ucs
>= NBC
|| ucs
< 0) return ucs
;
115 return the byte length of the UTF-8 sequence with first byte \p c,
116 or -1 if \p c is not valid.
117 This function is helpful for finding faulty UTF8 sequences.
120 int fl_utf8len(char c
)
122 if (!(c
& 0x80)) return 1;
143 Return the byte length of the UTF-8 sequence with first byte \p c,
144 or 1 if \p c is not valid.
145 This function can be used to scan a faulty UTF-8 sequence, albeit ignoring invalid
149 int fl_utf8len1(char c
)
151 if (!(c
& 0x80)) return 1;
172 returns the number of Unicode chars in the UTF-8 string
176 const unsigned char *buf
,
182 int cl
= fl_utf8len((buf
+i
)[0]);//fl_utflen(buf + i, len - i);
191 * compare only the first n bytes
192 * return 0 if the strings are equal;
193 * return 1 if s1 is greater than s2
194 * return -1 if s1 is less than s2
197 UTF-8 aware strncasecmp - converts to lower case Unicode and tests.
199 \todo Correct the incorrect logic where the length of the strings is tested
200 \todo Clarify whether n means number of bytes, or characters.
202 int fl_utf_strncasecmp(const char *s1
, const char *s2
, int n
)
207 char *e1
, *e2
; // string end pointers
210 while (s1_l
< n
&& s1
[s1_l
]) s1_l
++;
212 while (s2_l
< n
&& s2
[s2_l
]) s2_l
++;
216 } else if (s1_l
> s2_l
) {
219 e1
= (char *)&s1
[s1_l
]; // last char to test
220 e2
= (char *)&s2
[s2_l
];
221 for (i
= 0; i
< n
;) {
226 // l1 = fl_utf2ucs((unsigned char*)s1 + i, n - i, &u1);
227 u1
= fl_utf8decode(s1
+ i
, e1
, &l1
);
228 // l2 = fl_utf2ucs((unsigned char*)s2 + i, n - i, &u2);
229 u2
= fl_utf8decode(s2
+ i
, e2
, &l2
);
230 if (l1
- l2
!= 0) return l1
- l2
;
231 res
= XUtf8Tolower(u1
) - XUtf8Tolower(u2
);
232 if (res
!= 0) return res
;
243 * return 0 if the strings are equal;
244 * return 1 if s1 is greater than s2
245 * return -1 if s1 is less than s2
248 UTF-8 aware strcasecmp - converts to Unicode and tests.
250 \todo Correct the incorrect logic where the length of the strings is tested
252 int fl_utf_strcasecmp(const char *s1
, const char *s2
)
254 int s1_l
= strlen(s1
);
255 int s2_l
= strlen(s2
);
259 } else if (s1_l
> s2_l
) {
262 return fl_utf_strncasecmp(s1
, s2
, s1_l
);
266 return the Unicode lower case value of \p ucs
268 int fl_tolower(unsigned int ucs
)
270 return XUtf8Tolower(ucs
);
274 return the Unicode upper case value of \p ucs
276 int fl_toupper(unsigned int ucs
)
282 Converts the string \p str to its lower-case equivalent and stores the result in \p buf.
283 Warning: to be safe buf length must be at least 3 * len [for 16-bit Unicode]
285 int fl_utf_tolower(const unsigned char *str
, int len
, char *buf
)
289 char *end
= (char *)&str
[len
];
290 for (i
= 0; i
< len
;) {
294 // l1 = fl_utf2ucs((unsigned char*)str + i, len - i, &u1);
295 u1
= fl_utf8decode((const char*)(str
+ i
), end
, &l1
);
296 l2
= fl_utf8encode((unsigned int) XUtf8Tolower(u1
), buf
+ l
);
313 Converts the string \p str to its upper-case equivalent and stores the result in \p buf.
314 Warning: to be safe buf length must be at least 3 * len [for 16-bit Unicode]
316 int fl_utf_toupper(const unsigned char *str
, int len
, char *buf
)
320 char *end
= (char *)&str
[len
];
321 for (i
= 0; i
< len
;) {
325 // l1 = fl_utf2ucs((unsigned char*)str + i, len - i, &u1);
326 u1
= fl_utf8decode((const char*)(str
+ i
), end
, &l1
);
327 l2
= fl_utf8encode((unsigned int) Toupper(u1
), buf
+ l
);
342 #if 0 // deprecated in favour of FLTK2's fl_utf8toa
344 * convert UTF-8 str to latin1
345 * Warning: buf must be at least len long
347 int fl_utf2latin1(const unsigned char *str
, int len
, char *buf
)
351 char *end
= (char *)&str
[len
];
352 for (i
= 0; i
< len
;) {
356 // l1 = fl_utf2ucs((unsigned char*)str + i, len - i, &u1);
357 u1
= fl_utf8decode((const char*)(str
+ i
), end
, &l1
);
358 if (u1
> 0xFF) u1
= '?';
372 #if 0 // deprecated in favour of FLTK2's fl_utf8froma
374 * convert latin1 str to UTF-8
375 * Warning: buf must be at least 2 * len long
377 int fl_latin12utf(const unsigned char *str
, int len
, char *buf
)
382 for (i
= 0; i
< len
; i
++) {
383 unsigned int n
= (unsigned int) str
[i
];
384 l1
= fl_utf8encode(n
, buf
+ l
);
397 returns true if the character is non-spacing.
398 \todo explain what non-spacing means.
400 unsigned int fl_nonspacing(unsigned int ucs
)
403 return (ucs
==0x20); // FIXME: what does this really do?
405 return (unsigned int) XUtf8IsNonSpacing(ucs
);
409 #if defined(WIN32) && !defined(__CYGWIN__)
410 static xchar
*mbwbuf
= NULL
;
414 unsigned int fl_codepage
= 0;
417 #if defined (WIN32) && !defined(__CYGWIN__)
419 static char *buf
= NULL
;
420 static int buf_len
= 0;
421 static unsigned short *wbufa
= NULL
;
423 // FIXME: This should *maybe* return 'const char *' instead of 'char *'
424 char *fl_utf8_to_locale(const char *s
, int len
, UINT codepage
)
426 if (!s
) return (char *)"";
428 // if (buf_len < len * 2 + 1) {
429 // buf_len = len * 2 + 1;
430 // buf = (char*) realloc(buf, buf_len);
431 // wbufa = (unsigned short*) realloc(wbufa, buf_len * sizeof(short));
433 unsigned wn
= fl_utf8toUtf16(s
, len
, NULL
, 0); // Query length
435 if (wn
>= (unsigned)buf_len
) {
437 buf
= (char*) realloc(buf
, buf_len
);
438 wbufa
= (unsigned short*) realloc(wbufa
, buf_len
* sizeof(short));
440 if (codepage
< 1) codepage
= fl_codepage
;
441 // l = fl_utf2unicode((const unsigned char *)s, len, (xchar*) wbufa);
442 l
= fl_utf8toUtf16(s
, len
, wbufa
, wn
); // Convert string
445 l
= WideCharToMultiByte(codepage
, 0, (WCHAR
*)wbufa
, l
, buf
, buf_len
, NULL
, NULL
);
451 // FIXME: This should maybe return 'const char *' instead of 'char *'
452 char *fl_locale_to_utf8(const char *s
, int len
, UINT codepage
)
454 if (!s
) return (char *)"";
456 if (buf_len
< len
* 5 + 1) {
457 buf_len
= len
* 5 + 1;
458 buf
= (char*) realloc(buf
, buf_len
);
459 wbufa
= (unsigned short*) realloc(wbufa
, buf_len
* sizeof(short));
461 if (codepage
< 1) codepage
= fl_codepage
;
464 l
= MultiByteToWideChar(codepage
, 0, s
, len
, (WCHAR
*)wbufa
, buf_len
);
467 // l = fl_unicode2utf((xchar*)wbufa, l, buf);
468 l
= fl_utf8fromwc(buf
, buf_len
, (xchar
*)wbufa
, l
);
475 converts UTF8 to a local multi-byte character string.
477 char * fl_utf2mbcs(const char *s
)
480 #if defined(WIN32) && !defined(__CYGWIN__)
482 static char *buf
= NULL
;
484 // mbwbuf = (xchar*)realloc(mbwbuf, (l+6) * sizeof(xchar));
485 // l = fl_utf2unicode((unsigned char*)s, l, mbwbuf);
487 unsigned wn
= fl_utf8toUtf16(s
, l
, NULL
, 0) + 7; // Query length
488 mbwbuf
= (xchar
*)realloc(mbwbuf
, sizeof(xchar
)*wn
);
489 l
= fl_utf8toUtf16(s
, l
, (unsigned short *)mbwbuf
, wn
); // Convert string
492 buf
= (char*)realloc(buf
, l
* 6 + 1);
493 l
= wcstombs(buf
, mbwbuf
, l
* 6);
502 #if 0 // deprecated in favour of FLTK2's fl_utf8from_mb
503 char * fl_mbcs2utf(const char *s
)
509 static char *buf
= NULL
;
511 mbwbuf
= (xchar
*)realloc(mbwbuf
,(l
* 6 + 6) * sizeof(xchar
));
512 l
= mbstowcs(mbwbuf
, s
, l
);
514 buf
= (char*)realloc(buf
, dstlen
);
515 // l = fl_unicode2utf(mbwbuf, l, buf);
516 l
= fl_utf8fromwc(buf
, dstlen
, mbwbuf
, l
);
525 #if defined(WIN32) && !defined(__CYGWIN__)
526 static xchar
*wbuf
= NULL
;
527 static xchar
*wbuf1
= NULL
;
531 char *fl_getenv(const char* v
)
533 #if defined (WIN32) && !defined(__CYGWIN__)
535 // static xchar* wbuf = NULL;
536 // wbuf = (xchar*)realloc(wbuf, sizeof(xchar) * (l+1));
537 // wbuf[fl_utf2unicode((const unsigned char*)v, l, wbuf)] = 0;
538 unsigned wn
= fl_utf8toUtf16(v
, l
, NULL
, 0) + 1; // Query length
539 wbuf
= (xchar
*)realloc(wbuf
, sizeof(xchar
)*wn
);
540 wn
= fl_utf8toUtf16(v
, l
, (unsigned short *)wbuf
, wn
); // Convert string
542 xchar
*ret
= _wgetenv(wbuf
);
543 static char *buf
= NULL
;
546 wn
= fl_utf8fromwc(NULL
, 0, ret
, l
) + 1; // query length
547 buf
= (char*) realloc(buf
, wn
);
548 // buf[fl_unicode2utf(ret, l, buf)] = 0;
549 wn
= fl_utf8fromwc(buf
, wn
, ret
, l
); // convert string
560 int fl_open(const char* f
, int oflags
, ...)
564 va_start(ap
, oflags
);
565 pmode
= va_arg (ap
, int);
567 #if defined (WIN32) && !defined(__CYGWIN__)
569 // wbuf = (xchar*)realloc(wbuf, sizeof(xchar) * (l+1));
570 // wbuf[fl_utf2unicode((const unsigned char*)f, l, wbuf)] = 0;
571 unsigned wn
= fl_utf8toUtf16(f
, l
, NULL
, 0) + 1; // Query length
572 wbuf
= (xchar
*)realloc(wbuf
, sizeof(xchar
)*wn
);
573 wn
= fl_utf8toUtf16(f
, l
, (unsigned short *)wbuf
, wn
); // Convert string
575 if (pmode
== -1) return _wopen(wbuf
, oflags
);
576 else return _wopen(wbuf
, oflags
, pmode
);
578 if (pmode
== -1) return open(f
, oflags
);
579 else return open(f
, oflags
, pmode
);
583 FILE *fl_fopen(const char* f
, const char *mode
)
585 #if defined (WIN32) && !defined(__CYGWIN__)
587 // wbuf = (xchar*)realloc(wbuf, sizeof(xchar) * (l+1));
588 // wbuf[fl_utf2unicode((const unsigned char*)f, l, wbuf)] = 0;
589 unsigned wn
= fl_utf8toUtf16(f
, l
, NULL
, 0) + 1; // Query length
590 wbuf
= (xchar
*)realloc(wbuf
, sizeof(xchar
)*wn
);
591 wn
= fl_utf8toUtf16(f
, l
, (unsigned short *)wbuf
, wn
); // Convert string
594 // wbuf1 = (xchar*)realloc(wbuf1, sizeof(xchar) * (l+1));
595 // wbuf1[fl_utf2unicode((const unsigned char*)mode, l, wbuf1)] = 0;
596 wn
= fl_utf8toUtf16(mode
, l
, NULL
, 0) + 1; // Query length
597 wbuf1
= (xchar
*)realloc(wbuf1
, sizeof(xchar
)*wn
);
598 wn
= fl_utf8toUtf16(mode
, l
, (unsigned short *)wbuf1
, wn
); // Convert string
600 return _wfopen(wbuf
, wbuf1
);
602 return fopen(f
, mode
);
606 int fl_system(const char* f
)
608 #if defined (WIN32) && !defined(__CYGWIN__)
610 return system(fl_utf2mbcs(f
));
613 // wbuf = (xchar*)realloc(wbuf, sizeof(xchar) * (l+1));
614 // wbuf[fl_utf2unicode((const unsigned char*)f, l, wbuf)] = 0;
615 unsigned wn
= fl_utf8toUtf16(f
, l
, NULL
, 0) + 1; // Query length
616 wbuf
= (xchar
*)realloc(wbuf
, sizeof(xchar
)*wn
);
617 wn
= fl_utf8toUtf16(f
, l
, (unsigned short *)wbuf
, wn
); // Convert string
619 return _wsystem(wbuf
);
626 int fl_execvp(const char *file
, char *const *argv
)
628 #if defined (WIN32) && !defined(__CYGWIN__)
630 return _execvp(fl_utf2mbcs(file
), argv
);
632 int l
= strlen(file
);
635 // wbuf = (xchar*)realloc(wbuf, sizeof(xchar) * (l+1));
636 // wbuf[fl_utf2unicode((const unsigned char*)file, l, wbuf)] = 0;
637 unsigned wn
= fl_utf8toUtf16(file
, l
, NULL
, 0) + 1; // Query length
638 wbuf
= (xchar
*)realloc(wbuf
, sizeof(xchar
)*wn
);
639 wn
= fl_utf8toUtf16(file
, l
, (unsigned short *)wbuf
, wn
); // Convert string
643 while (argv
[i
]) {i
++; n
++;}
644 ar
= (xchar
**) malloc(sizeof(xchar
*) * (n
+ 1));
649 // ar[i] = (xchar *)malloc(sizeof(xchar) * (l+1));
650 // ar[i][fl_utf2unicode((const unsigned char*)argv[i], l, ar[i])] = 0;
651 wn
= fl_utf8toUtf16(argv
[i
], l
, NULL
, 0) + 1; // Query length
652 ar
[i
] = (xchar
*)malloc(sizeof(xchar
)*wn
);
653 wn
= fl_utf8toUtf16(argv
[i
], l
, (unsigned short *)ar
[i
], wn
); // Convert string
658 ret
= _wexecvp(wbuf
, ar
);
668 return execvp(file
, argv
);
674 int fl_chmod(const char* f
, int mode
)
676 #if defined (WIN32) && !defined(__CYGWIN__)
678 // wbuf = (xchar*)realloc(wbuf, sizeof(xchar) * (l+1));
679 // wbuf[fl_utf2unicode((const unsigned char*)f, l, wbuf)] = 0;
680 unsigned wn
= fl_utf8toUtf16(f
, l
, NULL
, 0) + 1; // Query length
681 wbuf
= (xchar
*)realloc(wbuf
, sizeof(xchar
)*wn
);
682 wn
= fl_utf8toUtf16(f
, l
, (unsigned short *)wbuf
, wn
); // Convert string
684 return _wchmod(wbuf
, mode
);
686 return chmod(f
, mode
);
690 int fl_access(const char* f
, int mode
)
692 #if defined (WIN32) && !defined(__CYGWIN__)
694 // wbuf = (xchar*)realloc(wbuf, sizeof(xchar) * (l+1));
695 // wbuf[fl_utf2unicode((const unsigned char*)f, l, wbuf)] = 0;
696 unsigned wn
= fl_utf8toUtf16(f
, l
, NULL
, 0) + 1; // Query length
697 wbuf
= (xchar
*)realloc(wbuf
, sizeof(xchar
)*wn
);
698 wn
= fl_utf8toUtf16(f
, l
, (unsigned short *)wbuf
, wn
); // Convert string
700 return _waccess(wbuf
, mode
);
702 return access(f
, mode
);
707 int fl_stat(const char* f
, struct stat
*b
)
709 #if defined(WIN32) && !defined(__CYGWIN__)
711 // wbuf = (xchar*)realloc(wbuf, sizeof(xchar) * (l+1));
712 // wbuf[fl_utf2unicode((const unsigned char*)f, l, wbuf)] = 0;
713 unsigned wn
= fl_utf8toUtf16(f
, l
, NULL
, 0) + 1; // Query length
714 wbuf
= (xchar
*)realloc(wbuf
, sizeof(xchar
)*wn
);
715 wn
= fl_utf8toUtf16(f
, l
, (unsigned short *)wbuf
, wn
); // Convert string
717 return _wstat(wbuf
, (struct _stat
*)b
);
723 char *fl_getcwd(char* b
, int l
)
726 b
= (char*) malloc(l
+1);
728 #if defined(WIN32) && !defined(__CYGWIN__)
729 static xchar
*wbuf
= NULL
;
730 wbuf
= (xchar
*)realloc(wbuf
, sizeof(xchar
) * (l
+1));
731 // xchar *ret = _wgetcwd(wbuf, l / 5);
732 xchar
*ret
= _wgetcwd(wbuf
, l
);
736 // b[fl_unicode2utf(wbuf, l, b)] = 0;
737 dstlen
= fl_utf8fromwc(b
, dstlen
, wbuf
, l
);
749 int fl_unlink(const char* f
)
751 #if defined(WIN32) && !defined(__CYGWIN__)
753 // wbuf = (xchar*)realloc(wbuf, sizeof(xchar) * (l+1));
754 // wbuf[fl_utf2unicode((const unsigned char*)f, l, wbuf)] = 0;
755 unsigned wn
= fl_utf8toUtf16(f
, l
, NULL
, 0) + 1; // Query length
756 wbuf
= (xchar
*)realloc(wbuf
, sizeof(xchar
)*wn
);
757 wn
= fl_utf8toUtf16(f
, l
, (unsigned short *)wbuf
, wn
); // Convert string
759 return _wunlink(wbuf
);
765 int fl_mkdir(const char* f
, int mode
)
767 #if defined(WIN32) && !defined(__CYGWIN__)
769 // wbuf = (xchar*)realloc(wbuf, sizeof(short) * (l+1));
770 // wbuf[fl_utf2unicode((const unsigned char*)f, l, wbuf)] = 0;
771 unsigned wn
= fl_utf8toUtf16(f
, l
, NULL
, 0) + 1; // Query length
772 wbuf
= (xchar
*)realloc(wbuf
, sizeof(xchar
)*wn
);
773 wn
= fl_utf8toUtf16(f
, l
, (unsigned short *)wbuf
, wn
); // Convert string
775 return _wmkdir(wbuf
);
777 return mkdir(f
, mode
);
782 int fl_rmdir(const char* f
)
784 #if defined (WIN32) && !defined(__CYGWIN__)
786 // wbuf = (xchar*)realloc(wbuf, sizeof(xchar) * (l+1));
787 // wbuf[fl_utf2unicode((const unsigned char*)f, l, wbuf)] = 0;
788 unsigned wn
= fl_utf8toUtf16(f
, l
, NULL
, 0) + 1; // Query length
789 wbuf
= (xchar
*)realloc(wbuf
, sizeof(xchar
)*wn
);
790 wn
= fl_utf8toUtf16(f
, l
, (unsigned short *)wbuf
, wn
); // Convert string
792 return _wrmdir(wbuf
);
798 int fl_rename(const char* f
, const char *n
)
800 #if defined (WIN32) && !defined(__CYGWIN__)
802 unsigned wn
= fl_utf8toUtf16(f
, l
, NULL
, 0) + 1; // Query length
803 wbuf
= (xchar
*)realloc(wbuf
, sizeof(xchar
)*wn
);
804 wn
= fl_utf8toUtf16(f
, l
, (unsigned short *)wbuf
, wn
); // Convert string
807 wn
= fl_utf8toUtf16(n
, l
, NULL
, 0) + 1; // Query length
808 wbuf1
= (xchar
*)realloc(wbuf1
, sizeof(xchar
)*wn
);
809 wn
= fl_utf8toUtf16(n
, l
, (unsigned short *)wbuf1
, wn
); // Convert string
811 return _wrename(wbuf
, wbuf1
);
817 // recursively create a path in the file system
818 char fl_make_path( const char *path
) {
819 if (fl_access(path
, 0)) {
820 const char *s
= strrchr( path
, '/' );
823 char *p
= (char*)malloc( len
+1 );
824 memcpy( p
, path
, len
);
828 fl_mkdir(path
, 0700);
833 // strip the filename and create a path
834 void fl_make_path_for_file( const char *path
)
836 const char *s
= strrchr( path
, '/' );
839 char *p
= (char*)malloc( len
+1 );
840 memcpy( p
, path
, len
);
849 // End of "$Id: fl_utf8.cxx 7975 2010-12-08 12:15:48Z AlbrechtS $".