On x86 compilers without fastcall, simulate it when invoking traces and un-simulate...
[wine-gecko.git] / xpcom / io / nsNativeCharsetUtils.cpp
blobeabcc8c9be51e729b14a9279689da1434b2b28aa
1 /* ***** BEGIN LICENSE BLOCK *****
2 * Version: MPL 1.1/GPL 2.0/LGPL 2.1
4 * The contents of this file are subject to the Mozilla Public License Version
5 * 1.1 (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 * http://www.mozilla.org/MPL/
9 * Software distributed under the License is distributed on an "AS IS" basis,
10 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
11 * for the specific language governing rights and limitations under the
12 * License.
14 * The Original Code is Mozilla.
16 * The Initial Developer of the Original Code is
17 * Netscape Communications Corporation.
18 * Portions created by the Initial Developer are Copyright (C) 2002
19 * the Initial Developer. All Rights Reserved.
21 * Contributor(s):
22 * Darin Fisher <darin@netscape.com>
23 * Brian Stell <bstell@ix.netcom.com>
24 * Frank Tang <ftang@netscape.com>
25 * Brendan Eich <brendan@mozilla.org>
26 * Sergei Dolgov <sergei_d@fi.fi.tartu.ee>
27 * Jungshik Shin <jshin@i18nl10n.com>
29 * Alternatively, the contents of this file may be used under the terms of
30 * either the GNU General Public License Version 2 or later (the "GPL"), or
31 * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
32 * in which case the provisions of the GPL or the LGPL are applicable instead
33 * of those above. If you wish to allow use of your version of this file only
34 * under the terms of either the GPL or the LGPL, and not to allow others to
35 * use your version of this file under the terms of the MPL, indicate your
36 * decision by deleting the provisions above and replace them with the notice
37 * and other provisions required by the GPL or the LGPL. If you do not delete
38 * the provisions above, a recipient may use your version of this file under
39 * the terms of any one of the MPL, the GPL or the LGPL.
41 * ***** END LICENSE BLOCK ***** */
43 #include "xpcom-private.h"
45 //-----------------------------------------------------------------------------
46 // XP_MACOSX or XP_BEOS
47 //-----------------------------------------------------------------------------
48 #if defined(XP_BEOS) || defined(XP_MACOSX)
50 #include "nsAString.h"
51 #include "nsReadableUtils.h"
52 #include "nsString.h"
54 NS_COM nsresult
55 NS_CopyNativeToUnicode(const nsACString &input, nsAString &output)
57 CopyUTF8toUTF16(input, output);
58 return NS_OK;
61 NS_COM nsresult
62 NS_CopyUnicodeToNative(const nsAString &input, nsACString &output)
64 CopyUTF16toUTF8(input, output);
65 return NS_OK;
// Mac OS X / BeOS flavour: intentionally empty, this variant performs no
// start-up work.
void
NS_StartupNativeCharsetUtils()
{
}
// Mac OS X / BeOS flavour: intentionally empty, this variant performs no
// shutdown work.
void
NS_ShutdownNativeCharsetUtils()
{
}
79 //-----------------------------------------------------------------------------
80 // XP_UNIX
81 //-----------------------------------------------------------------------------
82 #elif defined(XP_UNIX)
84 #include <stdlib.h> // mbtowc, wctomb
85 #include <locale.h> // setlocale
86 #include "nscore.h"
87 #include "prlock.h"
88 #include "nsAString.h"
89 #include "nsReadableUtils.h"
92 // choose a conversion library. we used to use mbrtowc/wcrtomb under Linux,
93 // but that doesn't work for non-BMP characters whether we use '-fshort-wchar'
94 // or not (see bug 206811 and
95 // news://news.mozilla.org:119/bajml3$fvr1@ripley.netscape.com). we now use
96 // iconv for all platforms where nltypes.h and nllanginfo.h are present
97 // along with iconv.
99 #if defined(HAVE_ICONV) && defined(HAVE_NL_TYPES_H) && defined(HAVE_LANGINFO_CODESET)
100 #define USE_ICONV 1
101 #else
102 #define USE_STDCONV 1
103 #endif
105 static void
106 isolatin1_to_utf16(const char **input, PRUint32 *inputLeft, PRUnichar **output, PRUint32 *outputLeft)
108 while (*inputLeft && *outputLeft) {
109 **output = (unsigned char) **input;
110 (*input)++;
111 (*inputLeft)--;
112 (*output)++;
113 (*outputLeft)--;
117 static void
118 utf16_to_isolatin1(const PRUnichar **input, PRUint32 *inputLeft, char **output, PRUint32 *outputLeft)
120 while (*inputLeft && *outputLeft) {
121 **output = (unsigned char) **input;
122 (*input)++;
123 (*inputLeft)--;
124 (*output)++;
125 (*outputLeft)--;
129 //-----------------------------------------------------------------------------
130 // conversion using iconv
131 //-----------------------------------------------------------------------------
132 #if defined(USE_ICONV)
133 #include <nl_types.h> // CODESET
134 #include <langinfo.h> // nl_langinfo
135 #include <iconv.h> // iconv_open, iconv, iconv_close
136 #include <errno.h>
137 #include "plstr.h"
139 #if defined(HAVE_ICONV_WITH_CONST_INPUT)
140 #define ICONV_INPUT(x) (x)
141 #else
142 #define ICONV_INPUT(x) ((char **)x)
143 #endif
145 // solaris definitely needs this, but we'll enable it by default
146 // just in case... but we know for sure that iconv(3) in glibc
147 // doesn't need this.
148 #if !defined(__GLIBC__)
149 #define ENABLE_UTF8_FALLBACK_SUPPORT
150 #endif
152 #define INVALID_ICONV_T ((iconv_t) -1)
154 static inline size_t
155 xp_iconv(iconv_t converter,
156 const char **input,
157 size_t *inputLeft,
158 char **output,
159 size_t *outputLeft)
161 size_t res, outputAvail = outputLeft ? *outputLeft : 0;
162 res = iconv(converter, ICONV_INPUT(input), inputLeft, output, outputLeft);
163 if (res == (size_t) -1) {
164 // on some platforms (e.g., linux) iconv will fail with
165 // E2BIG if it cannot convert _all_ of its input. it'll
166 // still adjust all of the in/out params correctly, so we
167 // can ignore this error. the assumption is that we will
168 // be called again to complete the conversion.
169 if ((errno == E2BIG) && (*outputLeft < outputAvail))
170 res = 0;
172 return res;
175 static inline void
176 xp_iconv_reset(iconv_t converter)
178 // NOTE: the man pages on Solaris claim that you can pass NULL
179 // for all parameter to reset the converter, but beware the
180 // evil Solaris crash if you go down this route >:-)
182 const char *zero_char_in_ptr = NULL;
183 char *zero_char_out_ptr = NULL;
184 size_t zero_size_in = 0,
185 zero_size_out = 0;
187 xp_iconv(converter, &zero_char_in_ptr,
188 &zero_size_in,
189 &zero_char_out_ptr,
190 &zero_size_out);
193 static inline iconv_t
194 xp_iconv_open(const char **to_list, const char **from_list)
196 iconv_t res;
197 const char **from_name;
198 const char **to_name;
200 // try all possible combinations to locate a converter.
201 to_name = to_list;
202 while (*to_name) {
203 if (**to_name) {
204 from_name = from_list;
205 while (*from_name) {
206 if (**from_name) {
207 res = iconv_open(*to_name, *from_name);
208 if (res != INVALID_ICONV_T)
209 return res;
211 from_name++;
214 to_name++;
217 return INVALID_ICONV_T;
221 * PRUnichar[] is NOT a UCS-2 array BUT a UTF-16 string. Therefore, we
222 * have to use UTF-16 with iconv(3) on platforms where it's supported.
223 * However, the way UTF-16 and UCS-2 are interpreted varies across platforms
224 * and implementations of iconv(3). On Tru64, it also depends on the environment
225 * variable. To avoid the trouble arising from byte-swapping
226 * (bug 208809), we have to try UTF-16LE/BE and UCS-2LE/BE before falling
227 * back to UTF-16 and UCS-2 and variants. We assume that UTF-16 and UCS-2
228 * on systems without UTF-16LE/BE and UCS-2LE/BE have the native endianness,
229 * which isn't the case of glibc 2.1.x, for which we use 'UNICODELITTLE'
230 * and 'UNICODEBIG'. It's also not true of Tru64 V4 when the environment
231 * variable ICONV_BYTEORDER is set to 'big-endian', about which not much
232 * can be done other than adding a note in the release notes. (bug 206811)
// NULL-terminated list of candidate iconv names for the UTF-16 side of a
// conversion.  The endian-specific names for the build's byte order are
// listed first, followed by the generic UTF-16 / UCS-2 spellings.
234 static const char *UTF_16_NAMES[] = {
235 #if defined(IS_LITTLE_ENDIAN)
236 "UTF-16LE",
237 #if defined(__GLIBC__)
238 "UNICODELITTLE",
239 #endif
240 "UCS-2LE",
241 #else
242 "UTF-16BE",
243 #if defined(__GLIBC__)
244 "UNICODEBIG",
245 #endif
246 "UCS-2BE",
247 #endif
248 "UTF-16",
249 "UCS-2",
250 "UCS2",
251 "UCS_2",
252 "ucs-2",
253 "ucs2",
254 "ucs_2",
255 NULL
258 #if defined(ENABLE_UTF8_FALLBACK_SUPPORT)
// NULL-terminated list of spellings under which an iconv implementation may
// know UTF-8.  Only compiled when ENABLE_UTF8_FALLBACK_SUPPORT is defined.
259 static const char *UTF_8_NAMES[] = {
260 "UTF-8",
261 "UTF8",
262 "UTF_8",
263 "utf-8",
264 "utf8",
265 "utf_8",
266 NULL
268 #endif
// NULL-terminated list of spellings for ISO-8859-1.  With glibc only the
// canonical "ISO-8859-1" entry is compiled in; other platforms get the
// additional variants as well.
270 static const char *ISO_8859_1_NAMES[] = {
271 "ISO-8859-1",
272 #if !defined(__GLIBC__)
273 "ISO8859-1",
274 "ISO88591",
275 "ISO_8859_1",
276 "ISO8859_1",
277 "iso-8859-1",
278 "iso8859-1",
279 "iso88591",
280 "iso_8859_1",
281 "iso8859_1",
282 #endif
283 NULL
286 class nsNativeCharsetConverter
288 public:
289 nsNativeCharsetConverter();
290 ~nsNativeCharsetConverter();
292 nsresult NativeToUnicode(const char **input , PRUint32 *inputLeft,
293 PRUnichar **output, PRUint32 *outputLeft);
294 nsresult UnicodeToNative(const PRUnichar **input , PRUint32 *inputLeft,
295 char **output, PRUint32 *outputLeft);
297 static void GlobalInit();
298 static void GlobalShutdown();
299 static PRBool IsNativeUTF8();
301 private:
302 static iconv_t gNativeToUnicode;
303 static iconv_t gUnicodeToNative;
304 #if defined(ENABLE_UTF8_FALLBACK_SUPPORT)
305 static iconv_t gNativeToUTF8;
306 static iconv_t gUTF8ToNative;
307 static iconv_t gUnicodeToUTF8;
308 static iconv_t gUTF8ToUnicode;
309 #endif
310 static PRLock *gLock;
311 static PRBool gInitialized;
312 static PRBool gIsNativeUTF8;
314 static void LazyInit();
316 static void Lock() { if (gLock) PR_Lock(gLock); }
317 static void Unlock() { if (gLock) PR_Unlock(gLock); }
// Storage for the class's static members.  Every converter handle starts out
// as INVALID_ICONV_T, there is no lock yet, and both flags start PR_FALSE.
320 iconv_t nsNativeCharsetConverter::gNativeToUnicode = INVALID_ICONV_T;
321 iconv_t nsNativeCharsetConverter::gUnicodeToNative = INVALID_ICONV_T;
322 #if defined(ENABLE_UTF8_FALLBACK_SUPPORT)
323 iconv_t nsNativeCharsetConverter::gNativeToUTF8 = INVALID_ICONV_T;
324 iconv_t nsNativeCharsetConverter::gUTF8ToNative = INVALID_ICONV_T;
325 iconv_t nsNativeCharsetConverter::gUnicodeToUTF8 = INVALID_ICONV_T;
326 iconv_t nsNativeCharsetConverter::gUTF8ToUnicode = INVALID_ICONV_T;
327 #endif
328 PRLock *nsNativeCharsetConverter::gLock = nsnull;
329 PRBool nsNativeCharsetConverter::gInitialized = PR_FALSE;
330 PRBool nsNativeCharsetConverter::gIsNativeUTF8 = PR_FALSE;
332 void
333 nsNativeCharsetConverter::LazyInit()
335 const char *blank_list[] = { "", NULL };
336 const char **native_charset_list = blank_list;
337 const char *native_charset = nl_langinfo(CODESET);
338 if (native_charset == nsnull) {
339 NS_ERROR("native charset is unknown");
340 // fallback to ISO-8859-1
341 native_charset_list = ISO_8859_1_NAMES;
343 else
344 native_charset_list[0] = native_charset;
346 // Most, if not all, Unixen supporting UTF-8 and nl_langinfo(CODESET)
347 // return 'UTF-8' (or 'utf-8')
348 if (!PL_strcasecmp(native_charset, "UTF-8"))
349 gIsNativeUTF8 = PR_TRUE;
351 gNativeToUnicode = xp_iconv_open(UTF_16_NAMES, native_charset_list);
352 gUnicodeToNative = xp_iconv_open(native_charset_list, UTF_16_NAMES);
354 #if defined(ENABLE_UTF8_FALLBACK_SUPPORT)
355 if (gNativeToUnicode == INVALID_ICONV_T) {
356 gNativeToUTF8 = xp_iconv_open(UTF_8_NAMES, native_charset_list);
357 gUTF8ToUnicode = xp_iconv_open(UTF_16_NAMES, UTF_8_NAMES);
358 NS_ASSERTION(gNativeToUTF8 != INVALID_ICONV_T, "no native to utf-8 converter");
359 NS_ASSERTION(gUTF8ToUnicode != INVALID_ICONV_T, "no utf-8 to utf-16 converter");
361 if (gUnicodeToNative == INVALID_ICONV_T) {
362 gUnicodeToUTF8 = xp_iconv_open(UTF_8_NAMES, UTF_16_NAMES);
363 gUTF8ToNative = xp_iconv_open(native_charset_list, UTF_8_NAMES);
364 NS_ASSERTION(gUnicodeToUTF8 != INVALID_ICONV_T, "no utf-16 to utf-8 converter");
365 NS_ASSERTION(gUTF8ToNative != INVALID_ICONV_T, "no utf-8 to native converter");
367 #else
368 NS_ASSERTION(gNativeToUnicode != INVALID_ICONV_T, "no native to utf-16 converter");
369 NS_ASSERTION(gUnicodeToNative != INVALID_ICONV_T, "no utf-16 to native converter");
370 #endif
373 * On Solaris 8 (and newer?), the iconv modules converting to UCS-2
374 * prepend a byte order mark unicode character (BOM, u+FEFF) during
375 * the first use of the iconv converter. The same is the case of
376 * glibc 2.2.9x and Tru64 V5 (see bug 208809) when 'UTF-16' is used.
377 * However, we use 'UTF-16LE/BE' in both cases, instead so that we
378 * should be safe. But just in case...
380 * This dummy conversion gets rid of the BOMs and fixes bug 153562.
382 char dummy_input[1] = { ' ' };
383 char dummy_output[4];
385 if (gNativeToUnicode != INVALID_ICONV_T) {
386 const char *input = dummy_input;
387 size_t input_left = sizeof(dummy_input);
388 char *output = dummy_output;
389 size_t output_left = sizeof(dummy_output);
391 xp_iconv(gNativeToUnicode, &input, &input_left, &output, &output_left);
393 #if defined(ENABLE_UTF8_FALLBACK_SUPPORT)
394 if (gUTF8ToUnicode != INVALID_ICONV_T) {
395 const char *input = dummy_input;
396 size_t input_left = sizeof(dummy_input);
397 char *output = dummy_output;
398 size_t output_left = sizeof(dummy_output);
400 xp_iconv(gUTF8ToUnicode, &input, &input_left, &output, &output_left);
402 #endif
404 gInitialized = PR_TRUE;
407 void
408 nsNativeCharsetConverter::GlobalInit()
410 gLock = PR_NewLock();
411 NS_ASSERTION(gLock, "lock creation failed");
414 void
415 nsNativeCharsetConverter::GlobalShutdown()
417 if (gLock) {
418 PR_DestroyLock(gLock);
419 gLock = nsnull;
422 if (gNativeToUnicode != INVALID_ICONV_T) {
423 iconv_close(gNativeToUnicode);
424 gNativeToUnicode = INVALID_ICONV_T;
427 if (gUnicodeToNative != INVALID_ICONV_T) {
428 iconv_close(gUnicodeToNative);
429 gUnicodeToNative = INVALID_ICONV_T;
432 #if defined(ENABLE_UTF8_FALLBACK_SUPPORT)
433 if (gNativeToUTF8 != INVALID_ICONV_T) {
434 iconv_close(gNativeToUTF8);
435 gNativeToUTF8 = INVALID_ICONV_T;
437 if (gUTF8ToNative != INVALID_ICONV_T) {
438 iconv_close(gUTF8ToNative);
439 gUTF8ToNative = INVALID_ICONV_T;
441 if (gUnicodeToUTF8 != INVALID_ICONV_T) {
442 iconv_close(gUnicodeToUTF8);
443 gUnicodeToUTF8 = INVALID_ICONV_T;
445 if (gUTF8ToUnicode != INVALID_ICONV_T) {
446 iconv_close(gUTF8ToUnicode);
447 gUTF8ToUnicode = INVALID_ICONV_T;
449 #endif
451 gInitialized = PR_FALSE;
454 nsNativeCharsetConverter::nsNativeCharsetConverter()
456 Lock();
457 if (!gInitialized)
458 LazyInit();
461 nsNativeCharsetConverter::~nsNativeCharsetConverter()
463 // reset converters for next time
464 if (gNativeToUnicode != INVALID_ICONV_T)
465 xp_iconv_reset(gNativeToUnicode);
466 if (gUnicodeToNative != INVALID_ICONV_T)
467 xp_iconv_reset(gUnicodeToNative);
468 #if defined(ENABLE_UTF8_FALLBACK_SUPPORT)
469 if (gNativeToUTF8 != INVALID_ICONV_T)
470 xp_iconv_reset(gNativeToUTF8);
471 if (gUTF8ToNative != INVALID_ICONV_T)
472 xp_iconv_reset(gUTF8ToNative);
473 if (gUnicodeToUTF8 != INVALID_ICONV_T)
474 xp_iconv_reset(gUnicodeToUTF8);
475 if (gUTF8ToUnicode != INVALID_ICONV_T)
476 xp_iconv_reset(gUTF8ToUnicode);
477 #endif
478 Unlock();
// Converts native-charset bytes at *input into UTF-16 code units at *output.
//
// |inputLeft| counts bytes and |outputLeft| counts PRUnichar units; both the
// pointers and the counts are updated to reflect what was consumed/produced.
// Tries the direct converter first, otherwise (when compiled in) the two-step
// path through UTF-8.  If the chosen path fails, or no converter exists, the
// remaining input is widened byte-for-byte.  Always returns NS_OK.
481 nsresult
482 nsNativeCharsetConverter::NativeToUnicode(const char **input,
483 PRUint32 *inputLeft,
484 PRUnichar **output,
485 PRUint32 *outputLeft)
487 size_t res = 0;
488 size_t inLeft = (size_t) *inputLeft;
// iconv counts bytes, so the output budget in units is doubled here and
// halved again when written back.
489 size_t outLeft = (size_t) *outputLeft * 2;
491 if (gNativeToUnicode != INVALID_ICONV_T) {
493 res = xp_iconv(gNativeToUnicode, input, &inLeft, (char **) output, &outLeft);
// publish the counts before looking at |res| so the fallback at the bottom
// sees values that match the already-advanced pointers.
495 *inputLeft = inLeft;
496 *outputLeft = outLeft / 2;
497 if (res != (size_t) -1)
498 return NS_OK;
500 NS_WARNING("conversion from native to utf-16 failed");
502 // reset converter
503 xp_iconv_reset(gNativeToUnicode);
505 #if defined(ENABLE_UTF8_FALLBACK_SUPPORT)
506 else if ((gNativeToUTF8 != INVALID_ICONV_T) &&
507 (gUTF8ToUnicode != INVALID_ICONV_T)) {
508 // convert first to UTF8, then from UTF8 to UCS2
// |in| is a private cursor; *input itself is advanced after the loop.
509 const char *in = *input;
511 char ubuf[1024];
513 // we assume we're always called with enough space in |output|,
514 // so convert many chars at a time...
515 while (inLeft) {
516 char *p = ubuf;
517 size_t n = sizeof(ubuf);
518 res = xp_iconv(gNativeToUTF8, &in, &inLeft, &p, &n);
519 if (res == (size_t) -1) {
520 NS_ERROR("conversion from native to utf-8 failed");
521 break;
523 NS_ASSERTION(outLeft > 0, "bad assumption");
// rewind to the start of |ubuf|; |n| becomes the number of bytes just written.
524 p = ubuf;
525 n = sizeof(ubuf) - n;
526 res = xp_iconv(gUTF8ToUnicode, (const char **) &p, &n, (char **) output, &outLeft);
527 if (res == (size_t) -1) {
528 NS_ERROR("conversion from utf-8 to utf-16 failed");
529 break;
// advance *input by the number of bytes the loop consumed, then publish counts.
533 (*input) += (*inputLeft - inLeft);
534 *inputLeft = inLeft;
535 *outputLeft = outLeft / 2;
537 if (res != (size_t) -1)
538 return NS_OK;
540 // reset converters
541 xp_iconv_reset(gNativeToUTF8);
542 xp_iconv_reset(gUTF8ToUnicode);
544 #endif
546 // fallback: zero-pad and hope for the best
547 // XXX This is lame and we have to do better.
548 isolatin1_to_utf16(input, inputLeft, output, outputLeft);
550 return NS_OK;
553 nsresult
554 nsNativeCharsetConverter::UnicodeToNative(const PRUnichar **input,
555 PRUint32 *inputLeft,
556 char **output,
557 PRUint32 *outputLeft)
559 size_t res = 0;
560 size_t inLeft = (size_t) *inputLeft * 2;
561 size_t outLeft = (size_t) *outputLeft;
563 if (gUnicodeToNative != INVALID_ICONV_T) {
564 res = xp_iconv(gUnicodeToNative, (const char **) input, &inLeft, output, &outLeft);
566 if (res != (size_t) -1) {
567 *inputLeft = inLeft / 2;
568 *outputLeft = outLeft;
569 return NS_OK;
572 NS_ERROR("iconv failed");
574 // reset converter
575 xp_iconv_reset(gUnicodeToNative);
577 #if defined(ENABLE_UTF8_FALLBACK_SUPPORT)
578 else if ((gUnicodeToUTF8 != INVALID_ICONV_T) &&
579 (gUTF8ToNative != INVALID_ICONV_T)) {
580 const char *in = (const char *) *input;
582 char ubuf[6]; // max utf-8 char length (really only needs to be 4 bytes)
584 // convert one uchar at a time...
585 while (inLeft && outLeft) {
586 char *p = ubuf;
587 size_t n = sizeof(ubuf), one_uchar = sizeof(PRUnichar);
588 res = xp_iconv(gUnicodeToUTF8, &in, &one_uchar, &p, &n);
589 if (res == (size_t) -1) {
590 NS_ERROR("conversion from utf-16 to utf-8 failed");
591 break;
593 p = ubuf;
594 n = sizeof(ubuf) - n;
595 res = xp_iconv(gUTF8ToNative, (const char **) &p, &n, output, &outLeft);
596 if (res == (size_t) -1) {
597 if (errno == E2BIG) {
598 // not enough room for last uchar... back up and return.
599 in -= sizeof(PRUnichar);
600 res = 0;
602 else
603 NS_ERROR("conversion from utf-8 to native failed");
604 break;
606 inLeft -= sizeof(PRUnichar);
609 if (res != (size_t) -1) {
610 (*input) += (*inputLeft - inLeft/2);
611 *inputLeft = inLeft/2;
612 *outputLeft = outLeft;
613 return NS_OK;
616 // reset converters
617 xp_iconv_reset(gUnicodeToUTF8);
618 xp_iconv_reset(gUTF8ToNative);
620 #endif
622 // fallback: truncate and hope for the best
623 utf16_to_isolatin1(input, inputLeft, output, outputLeft);
625 return NS_OK;
628 PRBool
629 nsNativeCharsetConverter::IsNativeUTF8()
631 if (!gInitialized) {
632 Lock();
633 if (!gInitialized)
634 LazyInit();
635 Unlock();
637 return gIsNativeUTF8;
640 #endif // USE_ICONV
642 //-----------------------------------------------------------------------------
643 // conversion using mb[r]towc/wc[r]tomb
644 //-----------------------------------------------------------------------------
645 #if defined(USE_STDCONV)
646 #if defined(HAVE_WCRTOMB) || defined(HAVE_MBRTOWC)
647 #include <wchar.h> // mbrtowc, wcrtomb
648 #endif
650 class nsNativeCharsetConverter
652 public:
653 nsNativeCharsetConverter();
655 nsresult NativeToUnicode(const char **input , PRUint32 *inputLeft,
656 PRUnichar **output, PRUint32 *outputLeft);
657 nsresult UnicodeToNative(const PRUnichar **input , PRUint32 *inputLeft,
658 char **output, PRUint32 *outputLeft);
660 static void GlobalInit();
661 static void GlobalShutdown() { }
662 static PRBool IsNativeUTF8();
664 private:
665 static PRBool gWCharIsUnicode;
667 #if defined(HAVE_WCRTOMB) || defined(HAVE_MBRTOWC)
668 mbstate_t ps;
669 #endif
// Storage for the probe result; stays PR_FALSE until GlobalInit() sets it.
672 PRBool nsNativeCharsetConverter::gWCharIsUnicode = PR_FALSE;
674 nsNativeCharsetConverter::nsNativeCharsetConverter()
676 #if defined(HAVE_WCRTOMB) || defined(HAVE_MBRTOWC)
677 memset(&ps, 0, sizeof(ps));
678 #endif
681 void
682 nsNativeCharsetConverter::GlobalInit()
684 // verify that wchar_t for the current locale is actually unicode.
685 // if it is not, then we should avoid calling mbtowc/wctomb and
686 // just fallback on zero-pad/truncation conversion.
688 // this test cannot be done at build time because the encoding of
689 // wchar_t may depend on the runtime locale. sad, but true!!
691 // so, if wchar_t is unicode then converting an ASCII character
692 // to wchar_t should not change its numeric value. we'll just
693 // check what happens with the ASCII 'a' character.
695 // this test is not perfect... obviously, it could yield false
696 // positives, but then at least ASCII text would be converted
697 // properly (or maybe just the 'a' character) -- oh well :(
699 char a = 'a';
700 unsigned int w = 0;
702 int res = mbtowc((wchar_t *) &w, &a, 1);
704 gWCharIsUnicode = (res != -1 && w == 'a');
706 #ifdef DEBUG
707 if (!gWCharIsUnicode)
708 NS_WARNING("wchar_t is not unicode (unicode conversion will be lossy)");
709 #endif
712 nsresult
713 nsNativeCharsetConverter::NativeToUnicode(const char **input,
714 PRUint32 *inputLeft,
715 PRUnichar **output,
716 PRUint32 *outputLeft)
718 if (gWCharIsUnicode) {
719 int incr;
721 // cannot use wchar_t here since it may have been redefined (e.g.,
722 // via -fshort-wchar). hopefully, sizeof(tmp) is sufficient XP.
723 unsigned int tmp = 0;
724 while (*inputLeft && *outputLeft) {
725 #ifdef HAVE_MBRTOWC
726 incr = (int) mbrtowc((wchar_t *) &tmp, *input, *inputLeft, &ps);
727 #else
728 // XXX is this thread-safe?
729 incr = (int) mbtowc((wchar_t *) &tmp, *input, *inputLeft);
730 #endif
731 if (incr < 0) {
732 NS_WARNING("mbtowc failed: possible charset mismatch");
733 // zero-pad and hope for the best
734 tmp = (unsigned char) **input;
735 incr = 1;
737 **output = (PRUnichar) tmp;
738 (*input) += incr;
739 (*inputLeft) -= incr;
740 (*output)++;
741 (*outputLeft)--;
744 else {
745 // wchar_t isn't unicode, so the best we can do is treat the
746 // input as if it is isolatin1 :(
747 isolatin1_to_utf16(input, inputLeft, output, outputLeft);
750 return NS_OK;
753 nsresult
754 nsNativeCharsetConverter::UnicodeToNative(const PRUnichar **input,
755 PRUint32 *inputLeft,
756 char **output,
757 PRUint32 *outputLeft)
759 if (gWCharIsUnicode) {
760 int incr;
762 while (*inputLeft && *outputLeft >= MB_CUR_MAX) {
763 #ifdef HAVE_WCRTOMB
764 incr = (int) wcrtomb(*output, (wchar_t) **input, &ps);
765 #else
766 // XXX is this thread-safe?
767 incr = (int) wctomb(*output, (wchar_t) **input);
768 #endif
769 if (incr < 0) {
770 NS_WARNING("mbtowc failed: possible charset mismatch");
771 **output = (unsigned char) **input; // truncate
772 incr = 1;
774 // most likely we're dead anyways if this assertion should fire
775 NS_ASSERTION(PRUint32(incr) <= *outputLeft, "wrote beyond end of string");
776 (*output) += incr;
777 (*outputLeft) -= incr;
778 (*input)++;
779 (*inputLeft)--;
782 else {
783 // wchar_t isn't unicode, so the best we can do is treat the
784 // input as if it is isolatin1 :(
785 utf16_to_isolatin1(input, inputLeft, output, outputLeft);
788 return NS_OK;
791 // XXX : for now, return false
792 PRBool
793 nsNativeCharsetConverter::IsNativeUTF8()
795 return PR_FALSE;
798 #endif // USE_STDCONV
800 //-----------------------------------------------------------------------------
801 // API implementation
802 //-----------------------------------------------------------------------------
804 NS_COM nsresult
805 NS_CopyNativeToUnicode(const nsACString &input, nsAString &output)
807 output.Truncate();
809 PRUint32 inputLen = input.Length();
811 nsACString::const_iterator iter;
812 input.BeginReading(iter);
815 // OPTIMIZATION: preallocate space for largest possible result; convert
816 // directly into the result buffer to avoid intermediate buffer copy.
818 // this will generally result in a larger allocation, but that seems
819 // better than an extra buffer copy.
821 if (!EnsureStringLength(output, inputLen))
822 return NS_ERROR_OUT_OF_MEMORY;
823 nsAString::iterator out_iter;
824 output.BeginWriting(out_iter);
826 PRUnichar *result = out_iter.get();
827 PRUint32 resultLeft = inputLen;
829 const char *buf = iter.get();
830 PRUint32 bufLeft = inputLen;
832 nsNativeCharsetConverter conv;
833 nsresult rv = conv.NativeToUnicode(&buf, &bufLeft, &result, &resultLeft);
834 if (NS_SUCCEEDED(rv)) {
835 NS_ASSERTION(bufLeft == 0, "did not consume entire input buffer");
836 output.SetLength(inputLen - resultLeft);
838 return rv;
841 NS_COM nsresult
842 NS_CopyUnicodeToNative(const nsAString &input, nsACString &output)
844 output.Truncate();
846 nsAString::const_iterator iter, end;
847 input.BeginReading(iter);
848 input.EndReading(end);
850 // cannot easily avoid intermediate buffer copy.
851 char temp[4096];
853 nsNativeCharsetConverter conv;
855 const PRUnichar *buf = iter.get();
856 PRUint32 bufLeft = Distance(iter, end);
857 while (bufLeft) {
858 char *p = temp;
859 PRUint32 tempLeft = sizeof(temp);
861 nsresult rv = conv.UnicodeToNative(&buf, &bufLeft, &p, &tempLeft);
862 if (NS_FAILED(rv)) return rv;
864 if (tempLeft < sizeof(temp))
865 output.Append(temp, sizeof(temp) - tempLeft);
867 return NS_OK;
870 NS_COM PRBool
871 NS_IsNativeUTF8()
873 return nsNativeCharsetConverter::IsNativeUTF8();
876 void
877 NS_StartupNativeCharsetUtils()
880 // need to initialize the locale or else charset conversion will fail.
881 // better not delay this in case some other component alters the locale
882 // settings.
884 // XXX we assume that we are called early enough that we should
885 // always be the first to care about the locale's charset.
887 setlocale(LC_CTYPE, "");
889 nsNativeCharsetConverter::GlobalInit();
892 void
893 NS_ShutdownNativeCharsetUtils()
895 nsNativeCharsetConverter::GlobalShutdown();
898 //-----------------------------------------------------------------------------
899 // XP_WIN
900 //-----------------------------------------------------------------------------
901 #elif defined(XP_WIN)
903 #include <windows.h>
904 #include "nsAString.h"
905 #include "nsReadableUtils.h"
907 NS_COM nsresult
908 NS_CopyNativeToUnicode(const nsACString &input, nsAString &output)
910 PRUint32 inputLen = input.Length();
912 nsACString::const_iterator iter;
913 input.BeginReading(iter);
915 const char *buf = iter.get();
917 // determine length of result
918 PRUint32 resultLen = 0;
919 int n = ::MultiByteToWideChar(CP_ACP, 0, buf, inputLen, NULL, 0);
920 if (n > 0)
921 resultLen += n;
923 // allocate sufficient space
924 if (!EnsureStringLength(output, resultLen))
925 return NS_ERROR_OUT_OF_MEMORY;
926 if (resultLen > 0) {
927 nsAString::iterator out_iter;
928 output.BeginWriting(out_iter);
930 PRUnichar *result = out_iter.get();
932 ::MultiByteToWideChar(CP_ACP, 0, buf, inputLen, result, resultLen);
934 return NS_OK;
937 NS_COM nsresult
938 NS_CopyUnicodeToNative(const nsAString &input, nsACString &output)
940 PRUint32 inputLen = input.Length();
942 nsAString::const_iterator iter;
943 input.BeginReading(iter);
945 const PRUnichar *buf = iter.get();
947 // determine length of result
948 PRUint32 resultLen = 0;
950 int n = ::WideCharToMultiByte(CP_ACP, 0, buf, inputLen, NULL, 0, NULL, NULL);
951 if (n > 0)
952 resultLen += n;
954 // allocate sufficient space
955 if (!EnsureStringLength(output, resultLen))
956 return NS_ERROR_OUT_OF_MEMORY;
957 if (resultLen > 0) {
958 nsACString::iterator out_iter;
959 output.BeginWriting(out_iter);
961 // default "defaultChar" is '?', which is an illegal character on windows
962 // file system. That will cause file uncreatable. Change it to '_'
963 const char defaultChar = '_';
965 char *result = out_iter.get();
967 ::WideCharToMultiByte(CP_ACP, 0, buf, inputLen, result, resultLen,
968 &defaultChar, NULL);
970 return NS_OK;
973 // moved from widget/src/windows/nsToolkit.cpp
974 NS_COM PRInt32
975 NS_ConvertAtoW(const char *aStrInA, int aBufferSize, PRUnichar *aStrOutW)
977 return MultiByteToWideChar(CP_ACP, 0, aStrInA, -1, aStrOutW, aBufferSize);
980 NS_COM PRInt32
981 NS_ConvertWtoA(const PRUnichar *aStrInW, int aBufferSizeOut,
982 char *aStrOutA, const char *aDefault)
984 if ((!aStrInW) || (!aStrOutA) || (aBufferSizeOut <= 0))
985 return 0;
987 int numCharsConverted = WideCharToMultiByte(CP_ACP, 0, aStrInW, -1,
988 aStrOutA, aBufferSizeOut,
989 aDefault, NULL);
991 if (!numCharsConverted) {
992 if (GetLastError() == ERROR_INSUFFICIENT_BUFFER) {
993 // Overflow, add missing null termination but return 0
994 aStrOutA[aBufferSizeOut-1] = '\0';
996 else {
997 // Other error, clear string and return 0
998 aStrOutA[0] = '\0';
1001 else if (numCharsConverted < aBufferSizeOut) {
1002 // Add 2nd null (really necessary?)
1003 aStrOutA[numCharsConverted] = '\0';
1006 return numCharsConverted;
1009 //-----------------------------------------------------------------------------
1010 // XP_OS2
1011 //-----------------------------------------------------------------------------
1012 #elif defined(XP_OS2)
1014 #define INCL_DOS
1015 #include <os2.h>
1016 #include <uconv.h>
1017 #include "nsAString.h"
1018 #include "nsReadableUtils.h"
1019 #include <ulserrno.h>
1020 #include "nsNativeCharsetUtils.h"
// Shared conversion object for the OS/2 code paths; NULL until
// NS_StartupNativeCharsetUtils() creates it.
1022 static UconvObject UnicodeConverter = NULL;
1024 NS_COM nsresult
1025 NS_CopyNativeToUnicode(const nsACString &input, nsAString &output)
1027 PRUint32 inputLen = input.Length();
1029 nsACString::const_iterator iter;
1030 input.BeginReading(iter);
1031 const char *inputStr = iter.get();
1033 // determine length of result
1034 PRUint32 resultLen = inputLen;
1035 if (!EnsureStringLength(output, resultLen))
1036 return NS_ERROR_OUT_OF_MEMORY;
1038 nsAString::iterator out_iter;
1039 output.BeginWriting(out_iter);
1040 UniChar *result = (UniChar*)out_iter.get();
1042 size_t cSubs = 0;
1043 size_t resultLeft = resultLen;
1045 if (!UnicodeConverter)
1046 NS_StartupNativeCharsetUtils();
1048 int unirc = ::UniUconvToUcs(UnicodeConverter, (void**)&inputStr, &inputLen,
1049 &result, &resultLeft, &cSubs);
1051 NS_ASSERTION(unirc != UCONV_E2BIG, "Path too big");
1053 if (unirc != ULS_SUCCESS) {
1054 output.Truncate();
1055 return NS_ERROR_FAILURE;
1058 // Need to update string length to reflect how many bytes were actually
1059 // written.
1060 output.Truncate(resultLen - resultLeft);
1061 return NS_OK;
1064 NS_COM nsresult
1065 NS_CopyUnicodeToNative(const nsAString &input, nsACString &output)
1067 size_t inputLen = input.Length();
1069 nsAString::const_iterator iter;
1070 input.BeginReading(iter);
1071 UniChar* inputStr = (UniChar*) const_cast<PRUnichar*>(iter.get());
1073 // maximum length of unicode string of length x converted to native
1074 // codepage is x*2
1075 size_t resultLen = inputLen * 2;
1076 if (!EnsureStringLength(output, resultLen))
1077 return NS_ERROR_OUT_OF_MEMORY;
1079 nsACString::iterator out_iter;
1080 output.BeginWriting(out_iter);
1081 char *result = out_iter.get();
1083 size_t cSubs = 0;
1084 size_t resultLeft = resultLen;
1086 if (!UnicodeConverter)
1087 NS_StartupNativeCharsetUtils();
1089 int unirc = ::UniUconvFromUcs(UnicodeConverter, &inputStr, &inputLen,
1090 (void**)&result, &resultLeft, &cSubs);
1092 NS_ASSERTION(unirc != UCONV_E2BIG, "Path too big");
1094 if (unirc != ULS_SUCCESS) {
1095 output.Truncate();
1096 return NS_ERROR_FAILURE;
1099 // Need to update string length to reflect how many bytes were actually
1100 // written.
1101 output.Truncate(resultLen - resultLeft);
1102 return NS_OK;
1105 void
1106 NS_StartupNativeCharsetUtils()
1108 ULONG ulLength;
1109 ULONG ulCodePage;
1110 DosQueryCp(sizeof(ULONG), &ulCodePage, &ulLength);
1112 UniChar codepage[20];
1113 int unirc = ::UniMapCpToUcsCp(ulCodePage, codepage, 20);
1114 if (unirc == ULS_SUCCESS) {
1115 unirc = ::UniCreateUconvObject(codepage, &UnicodeConverter);
1116 if (unirc == ULS_SUCCESS) {
1117 uconv_attribute_t attr;
1118 ::UniQueryUconvObject(UnicodeConverter, &attr, sizeof(uconv_attribute_t),
1119 NULL, NULL, NULL);
1120 attr.options = UCONV_OPTION_SUBSTITUTE_BOTH;
1121 attr.subchar_len=1;
1122 attr.subchar[0]='_';
1123 ::UniSetUconvObject(UnicodeConverter, &attr);
1128 void
1129 NS_ShutdownNativeCharsetUtils()
1131 ::UniFreeUconvObject(UnicodeConverter);
1134 #else
1136 #include "nsReadableUtils.h"
1138 NS_COM nsresult
1139 NS_CopyNativeToUnicode(const nsACString &input, nsAString &output)
1141 CopyASCIItoUTF16(input, output);
1142 return NS_OK;
1145 NS_COM nsresult
1146 NS_CopyUnicodeToNative(const nsAString &input, nsACString &output)
1148 LossyCopyUTF16toASCII(input, output);
1149 return NS_OK;
// Generic fallback: intentionally empty, this variant performs no start-up
// work.
void
NS_StartupNativeCharsetUtils()
{
}
// Generic fallback: intentionally empty, this variant performs no shutdown
// work.
void
NS_ShutdownNativeCharsetUtils()
{
}
1162 #endif