xpcom/io/nsNativeCharsetUtils.cpp

   1 /* ***** BEGIN LICENSE BLOCK *****
   2  * Version: MPL 1.1/GPL 2.0/LGPL 2.1
   3  *
   4  * The contents of this file are subject to the Mozilla Public License Version
   5  * 1.1 (the "License"); you may not use this file except in compliance with
   6  * the License. You may obtain a copy of the License at
   7  * http://www.mozilla.org/MPL/
   8  *
   9  * Software distributed under the License is distributed on an "AS IS" basis,
  10  * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
  11  * for the specific language governing rights and limitations under the
  12  * License.
  13  *
  14  * The Original Code is Mozilla.
  15  *
  16  * The Initial Developer of the Original Code is
  17  * Netscape Communications Corporation.
  18  * Portions created by the Initial Developer are Copyright (C) 2002
  19  * the Initial Developer. All Rights Reserved.
  20  *
  21  * Contributor(s):
  22  *   Darin Fisher <darin@netscape.com>
  23  *   Brian Stell <bstell@ix.netcom.com>
  24  *   Frank Tang <ftang@netscape.com>
  25  *   Brendan Eich <brendan@mozilla.org>
  26  *   Sergei Dolgov <sergei_d@fi.fi.tartu.ee>
  27  *   Jungshik Shin <jshin@i18nl10n.com>
  28  *
  29  * Alternatively, the contents of this file may be used under the terms of
  30  * either the GNU General Public License Version 2 or later (the "GPL"), or
  31  * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
  32  * in which case the provisions of the GPL or the LGPL are applicable instead
  33  * of those above. If you wish to allow use of your version of this file only
  34  * under the terms of either the GPL or the LGPL, and not to allow others to
  35  * use your version of this file under the terms of the MPL, indicate your
  36  * decision by deleting the provisions above and replace them with the notice
  37  * and other provisions required by the GPL or the LGPL. If you do not delete
  38  * the provisions above, a recipient may use your version of this file under
  39  * the terms of any one of the MPL, the GPL or the LGPL.
  40  *
  41  * ***** END LICENSE BLOCK ***** */
  42
  43 #include "xpcom-private.h"
  44
  45 //-----------------------------------------------------------------------------
  46 // XP_MACOSX or XP_BEOS
  47 //-----------------------------------------------------------------------------
  48 #if defined(XP_BEOS) || defined(XP_MACOSX)
  49
  50 #include "nsAString.h"
  51 #include "nsReadableUtils.h"
  52 #include "nsString.h"
  53
  54 NS_COM nsresult
  55 NS_CopyNativeToUnicode(const nsACString &input, nsAString  &output)
  56 {
  57     CopyUTF8toUTF16(input, output);
  58     return NS_OK;
  59 }
  60
  61 NS_COM nsresult
  62 NS_CopyUnicodeToNative(const nsAString  &input, nsACString &output)
  63 {
  64     CopyUTF16toUTF8(input, output);
  65     return NS_OK;
  66 }
  67
  68 void
  69 NS_StartupNativeCharsetUtils()
  70 {
  71 }
  72
  73 void
  74 NS_ShutdownNativeCharsetUtils()
  75 {
  76 }
  77
  78
  79 //-----------------------------------------------------------------------------
  80 // XP_UNIX
  81 //-----------------------------------------------------------------------------
  82 #elif defined(XP_UNIX)
  83
  84 #include <stdlib.h>   // mbtowc, wctomb
  85 #include <locale.h>   // setlocale
  86 #include "nscore.h"
  87 #include "prlock.h"
  88 #include "nsAString.h"
  89 #include "nsReadableUtils.h"
  90
  91 //
  92 // choose a conversion library.  we used to use mbrtowc/wcrtomb under Linux,
  93 // but that doesn't work for non-BMP characters whether we use '-fshort-wchar'
  94 // or not (see bug 206811 and
  95 // news://news.mozilla.org:119/bajml3$fvr1@ripley.netscape.com). we now use
   96 // iconv for all platforms where nl_types.h and langinfo.h are present
  97 // along with iconv.
  98 //
  99 #if defined(HAVE_ICONV) && defined(HAVE_NL_TYPES_H) && defined(HAVE_LANGINFO_CODESET)
 100 #define USE_ICONV 1
 101 #else
 102 #define USE_STDCONV 1
 103 #endif
 104
 105 static void
 106 isolatin1_to_utf16(const char **input, PRUint32 *inputLeft, PRUnichar **output, PRUint32 *outputLeft)
 107 {
 108     while (*inputLeft && *outputLeft) {
 109         **output = (unsigned char) **input;
 110         (*input)++;
 111         (*inputLeft)--;
 112         (*output)++;
 113         (*outputLeft)--;
 114     }
 115 }
 116
 117 static void
 118 utf16_to_isolatin1(const PRUnichar **input, PRUint32 *inputLeft, char **output, PRUint32 *outputLeft)
 119 {
 120     while (*inputLeft && *outputLeft) {
 121         **output = (unsigned char) **input;
 122         (*input)++;
 123         (*inputLeft)--;
 124         (*output)++;
 125         (*outputLeft)--;
 126     }
 127 }
 128
 129 //-----------------------------------------------------------------------------
 130 // conversion using iconv
 131 //-----------------------------------------------------------------------------
 132 #if defined(USE_ICONV)
 133 #include <nl_types.h> // CODESET
 134 #include <langinfo.h> // nl_langinfo
 135 #include <iconv.h>    // iconv_open, iconv, iconv_close
 136 #include <errno.h>
 137 #include "plstr.h"
 138
 139 #if defined(HAVE_ICONV_WITH_CONST_INPUT)
 140 #define ICONV_INPUT(x) (x)
 141 #else
 142 #define ICONV_INPUT(x) ((char **)x)
 143 #endif
 144
 145 // solaris definitely needs this, but we'll enable it by default
 146 // just in case... but we know for sure that iconv(3) in glibc
 147 // doesn't need this.
 148 #if !defined(__GLIBC__)
 149 #define ENABLE_UTF8_FALLBACK_SUPPORT
 150 #endif
 151
 152 #define INVALID_ICONV_T ((iconv_t) -1)
 153
 154 static inline size_t
 155 xp_iconv(iconv_t converter,
 156          const char **input,
 157          size_t      *inputLeft,
 158          char       **output,
 159          size_t      *outputLeft)
 160 {
 161     size_t res, outputAvail = outputLeft ? *outputLeft : 0;
 162     res = iconv(converter, ICONV_INPUT(input), inputLeft, output, outputLeft);
 163     if (res == (size_t) -1) {
 164         // on some platforms (e.g., linux) iconv will fail with
 165         // E2BIG if it cannot convert _all_ of its input.  it'll
 166         // still adjust all of the in/out params correctly, so we
 167         // can ignore this error.  the assumption is that we will
 168         // be called again to complete the conversion.
 169         if ((errno == E2BIG) && (*outputLeft < outputAvail))
 170             res = 0;
 171     }
 172     return res;
 173 }
 174
 175 static inline void
 176 xp_iconv_reset(iconv_t converter)
 177 {
 178     // NOTE: the man pages on Solaris claim that you can pass NULL
 179     // for all parameter to reset the converter, but beware the
 180     // evil Solaris crash if you go down this route >:-)
 181
 182     const char *zero_char_in_ptr  = NULL;
 183     char       *zero_char_out_ptr = NULL;
 184     size_t      zero_size_in      = 0,
 185                 zero_size_out     = 0;
 186
 187     xp_iconv(converter, &zero_char_in_ptr,
 188                         &zero_size_in,
 189                         &zero_char_out_ptr,
 190                         &zero_size_out);
 191 }
 192
 193 static inline iconv_t
 194 xp_iconv_open(const char **to_list, const char **from_list)
 195 {
 196     iconv_t res;
 197     const char **from_name;
 198     const char **to_name;
 199
 200     // try all possible combinations to locate a converter.
 201     to_name = to_list;
 202     while (*to_name) {
 203         if (**to_name) {
 204             from_name = from_list;
 205             while (*from_name) {
 206                 if (**from_name) {
 207                     res = iconv_open(*to_name, *from_name);
 208                     if (res != INVALID_ICONV_T)
 209                         return res;
 210                 }
 211                 from_name++;
 212             }
 213         }
 214         to_name++;
 215     }
 216
 217     return INVALID_ICONV_T;
 218 }
 219
 220 /*
 221  * PRUnichar[] is NOT a UCS-2 array BUT a UTF-16 string. Therefore, we
 222  * have to use UTF-16 with iconv(3) on platforms where it's supported.
 223  * However, the way UTF-16 and UCS-2 are interpreted varies across platforms
 224  * and implementations of iconv(3). On Tru64, it also depends on the environment
 225  * variable. To avoid the trouble arising from byte-swapping
 226  * (bug 208809), we have to try UTF-16LE/BE and UCS-2LE/BE before falling
 227  * back to UTF-16 and UCS-2 and variants. We assume that UTF-16 and UCS-2
 228  * on systems without UTF-16LE/BE and UCS-2LE/BE have the native endianness,
 229  * which isn't the case of glibc 2.1.x, for which we use 'UNICODELITTLE'
 230  * and 'UNICODEBIG'. It's also not true of Tru64 V4 when the environment
 231  * variable ICONV_BYTEORDER is set to 'big-endian', about which not much
 232  * can be done other than adding a note in the release notes. (bug 206811)
 233  */
 234 static const char *UTF_16_NAMES[] = {
 235 #if defined(IS_LITTLE_ENDIAN)
 236     "UTF-16LE",
 237 #if defined(__GLIBC__)
 238     "UNICODELITTLE",
 239 #endif
 240     "UCS-2LE",
 241 #else
 242     "UTF-16BE",
 243 #if defined(__GLIBC__)
 244     "UNICODEBIG",
 245 #endif
 246     "UCS-2BE",
 247 #endif
 248     "UTF-16",
 249     "UCS-2",
 250     "UCS2",
 251     "UCS_2",
 252     "ucs-2",
 253     "ucs2",
 254     "ucs_2",
 255     NULL
 256 };
 257
 258 #if defined(ENABLE_UTF8_FALLBACK_SUPPORT)
 259 static const char *UTF_8_NAMES[] = {
 260     "UTF-8",
 261     "UTF8",
 262     "UTF_8",
 263     "utf-8",
 264     "utf8",
 265     "utf_8",
 266     NULL
 267 };
 268 #endif
 269
 270 static const char *ISO_8859_1_NAMES[] = {
 271     "ISO-8859-1",
 272 #if !defined(__GLIBC__)
 273     "ISO8859-1",
 274     "ISO88591",
 275     "ISO_8859_1",
 276     "ISO8859_1",
 277     "iso-8859-1",
 278     "iso8859-1",
 279     "iso88591",
 280     "iso_8859_1",
 281     "iso8859_1",
 282 #endif
 283     NULL
 284 };
 285
 286 class nsNativeCharsetConverter
 287 {
 288 public:
 289     nsNativeCharsetConverter();
 290    ~nsNativeCharsetConverter();
 291
 292     nsresult NativeToUnicode(const char      **input , PRUint32 *inputLeft,
 293                              PRUnichar       **output, PRUint32 *outputLeft);
 294     nsresult UnicodeToNative(const PRUnichar **input , PRUint32 *inputLeft,
 295                              char            **output, PRUint32 *outputLeft);
 296
 297     static void GlobalInit();
 298     static void GlobalShutdown();
 299     static PRBool IsNativeUTF8();
 300
 301 private:
 302     static iconv_t gNativeToUnicode;
 303     static iconv_t gUnicodeToNative;
 304 #if defined(ENABLE_UTF8_FALLBACK_SUPPORT)
 305     static iconv_t gNativeToUTF8;
 306     static iconv_t gUTF8ToNative;
 307     static iconv_t gUnicodeToUTF8;
 308     static iconv_t gUTF8ToUnicode;
 309 #endif
 310     static PRLock *gLock;
 311     static PRBool  gInitialized;
 312     static PRBool  gIsNativeUTF8;
 313
 314     static void LazyInit();
 315
 316     static void Lock()   { if (gLock) PR_Lock(gLock);   }
 317     static void Unlock() { if (gLock) PR_Unlock(gLock); }
 318 };
 319
 320 iconv_t nsNativeCharsetConverter::gNativeToUnicode = INVALID_ICONV_T;
 321 iconv_t nsNativeCharsetConverter::gUnicodeToNative = INVALID_ICONV_T;
 322 #if defined(ENABLE_UTF8_FALLBACK_SUPPORT)
 323 iconv_t nsNativeCharsetConverter::gNativeToUTF8    = INVALID_ICONV_T;
 324 iconv_t nsNativeCharsetConverter::gUTF8ToNative    = INVALID_ICONV_T;
 325 iconv_t nsNativeCharsetConverter::gUnicodeToUTF8   = INVALID_ICONV_T;
 326 iconv_t nsNativeCharsetConverter::gUTF8ToUnicode   = INVALID_ICONV_T;
 327 #endif
 328 PRLock *nsNativeCharsetConverter::gLock            = nsnull;
 329 PRBool  nsNativeCharsetConverter::gInitialized     = PR_FALSE;
 330 PRBool  nsNativeCharsetConverter::gIsNativeUTF8    = PR_FALSE;
 331
 332 void
 333 nsNativeCharsetConverter::LazyInit()
 334 {
 335     const char  *blank_list[] = { "", NULL };
 336     const char **native_charset_list = blank_list;
 337     const char  *native_charset = nl_langinfo(CODESET);
 338     if (native_charset == nsnull) {
 339         NS_ERROR("native charset is unknown");
 340         // fallback to ISO-8859-1
 341         native_charset_list = ISO_8859_1_NAMES;
 342     }
 343     else
 344         native_charset_list[0] = native_charset;
 345
 346     // Most, if not all, Unixen supporting UTF-8 and nl_langinfo(CODESET)
 347     // return 'UTF-8' (or 'utf-8')
 348     if (!PL_strcasecmp(native_charset, "UTF-8"))
 349         gIsNativeUTF8 = PR_TRUE;
 350
 351     gNativeToUnicode = xp_iconv_open(UTF_16_NAMES, native_charset_list);
 352     gUnicodeToNative = xp_iconv_open(native_charset_list, UTF_16_NAMES);
 353
 354 #if defined(ENABLE_UTF8_FALLBACK_SUPPORT)
 355     if (gNativeToUnicode == INVALID_ICONV_T) {
 356         gNativeToUTF8 = xp_iconv_open(UTF_8_NAMES, native_charset_list);
 357         gUTF8ToUnicode = xp_iconv_open(UTF_16_NAMES, UTF_8_NAMES);
 358         NS_ASSERTION(gNativeToUTF8 != INVALID_ICONV_T, "no native to utf-8 converter");
 359         NS_ASSERTION(gUTF8ToUnicode != INVALID_ICONV_T, "no utf-8 to utf-16 converter");
 360     }
 361     if (gUnicodeToNative == INVALID_ICONV_T) {
 362         gUnicodeToUTF8 = xp_iconv_open(UTF_8_NAMES, UTF_16_NAMES);
 363         gUTF8ToNative = xp_iconv_open(native_charset_list, UTF_8_NAMES);
 364         NS_ASSERTION(gUnicodeToUTF8 != INVALID_ICONV_T, "no utf-16 to utf-8 converter");
 365         NS_ASSERTION(gUTF8ToNative != INVALID_ICONV_T, "no utf-8 to native converter");
 366     }
 367 #else
 368     NS_ASSERTION(gNativeToUnicode != INVALID_ICONV_T, "no native to utf-16 converter");
 369     NS_ASSERTION(gUnicodeToNative != INVALID_ICONV_T, "no utf-16 to native converter");
 370 #endif
 371
 372     /*
 373      * On Solaris 8 (and newer?), the iconv modules converting to UCS-2
 374      * prepend a byte order mark unicode character (BOM, u+FEFF) during
 375      * the first use of the iconv converter. The same is the case of
 376      * glibc 2.2.9x and Tru64 V5 (see bug 208809) when 'UTF-16' is used.
 377      * However, we use 'UTF-16LE/BE' in both cases, instead so that we
 378      * should be safe. But just in case...
 379      *
 380      * This dummy conversion gets rid of the BOMs and fixes bug 153562.
 381      */
 382     char dummy_input[1] = { ' ' };
 383     char dummy_output[4];
 384
 385     if (gNativeToUnicode != INVALID_ICONV_T) {
 386         const char *input = dummy_input;
 387         size_t input_left = sizeof(dummy_input);
 388         char *output = dummy_output;
 389         size_t output_left = sizeof(dummy_output);
 390
 391         xp_iconv(gNativeToUnicode, &input, &input_left, &output, &output_left);
 392     }
 393 #if defined(ENABLE_UTF8_FALLBACK_SUPPORT)
 394     if (gUTF8ToUnicode != INVALID_ICONV_T) {
 395         const char *input = dummy_input;
 396         size_t input_left = sizeof(dummy_input);
 397         char *output = dummy_output;
 398         size_t output_left = sizeof(dummy_output);
 399
 400         xp_iconv(gUTF8ToUnicode, &input, &input_left, &output, &output_left);
 401     }
 402 #endif
 403
 404     gInitialized = PR_TRUE;
 405 }
 406
 407 void
 408 nsNativeCharsetConverter::GlobalInit()
 409 {
 410     gLock = PR_NewLock();
 411     NS_ASSERTION(gLock, "lock creation failed");
 412 }
 413
 414 void
 415 nsNativeCharsetConverter::GlobalShutdown()
 416 {
 417     if (gLock) {
 418         PR_DestroyLock(gLock);
 419         gLock = nsnull;
 420     }
 421
 422     if (gNativeToUnicode != INVALID_ICONV_T) {
 423         iconv_close(gNativeToUnicode);
 424         gNativeToUnicode = INVALID_ICONV_T;
 425     }
 426
 427     if (gUnicodeToNative != INVALID_ICONV_T) {
 428         iconv_close(gUnicodeToNative);
 429         gUnicodeToNative = INVALID_ICONV_T;
 430     }
 431
 432 #if defined(ENABLE_UTF8_FALLBACK_SUPPORT)
 433     if (gNativeToUTF8 != INVALID_ICONV_T) {
 434         iconv_close(gNativeToUTF8);
 435         gNativeToUTF8 = INVALID_ICONV_T;
 436     }
 437     if (gUTF8ToNative != INVALID_ICONV_T) {
 438         iconv_close(gUTF8ToNative);
 439         gUTF8ToNative = INVALID_ICONV_T;
 440     }
 441     if (gUnicodeToUTF8 != INVALID_ICONV_T) {
 442         iconv_close(gUnicodeToUTF8);
 443         gUnicodeToUTF8 = INVALID_ICONV_T;
 444     }
 445     if (gUTF8ToUnicode != INVALID_ICONV_T) {
 446         iconv_close(gUTF8ToUnicode);
 447         gUTF8ToUnicode = INVALID_ICONV_T;
 448     }
 449 #endif
 450
 451     gInitialized = PR_FALSE;
 452 }
 453
 454 nsNativeCharsetConverter::nsNativeCharsetConverter()
 455 {
 456     Lock();
 457     if (!gInitialized)
 458         LazyInit();
 459 }
 460
 461 nsNativeCharsetConverter::~nsNativeCharsetConverter()
 462 {
 463     // reset converters for next time
 464     if (gNativeToUnicode != INVALID_ICONV_T)
 465         xp_iconv_reset(gNativeToUnicode);
 466     if (gUnicodeToNative != INVALID_ICONV_T)
 467         xp_iconv_reset(gUnicodeToNative);
 468 #if defined(ENABLE_UTF8_FALLBACK_SUPPORT)
 469     if (gNativeToUTF8 != INVALID_ICONV_T)
 470         xp_iconv_reset(gNativeToUTF8);
 471     if (gUTF8ToNative != INVALID_ICONV_T)
 472         xp_iconv_reset(gUTF8ToNative);
 473     if (gUnicodeToUTF8 != INVALID_ICONV_T)
 474         xp_iconv_reset(gUnicodeToUTF8);
 475     if (gUTF8ToUnicode != INVALID_ICONV_T)
 476         xp_iconv_reset(gUTF8ToUnicode);
 477 #endif
 478     Unlock();
 479 }
 480
 481 nsresult
 482 nsNativeCharsetConverter::NativeToUnicode(const char **input,
 483                                           PRUint32    *inputLeft,
 484                                           PRUnichar  **output,
 485                                           PRUint32    *outputLeft)
 486 {
 487     size_t res = 0;
 488     size_t inLeft = (size_t) *inputLeft;
 489     size_t outLeft = (size_t) *outputLeft * 2;
 490
 491     if (gNativeToUnicode != INVALID_ICONV_T) {
 492
 493         res = xp_iconv(gNativeToUnicode, input, &inLeft, (char **) output, &outLeft);
 494
 495         *inputLeft = inLeft;
 496         *outputLeft = outLeft / 2;
 497         if (res != (size_t) -1)
 498             return NS_OK;
 499
 500         NS_WARNING("conversion from native to utf-16 failed");
 501
 502         // reset converter
 503         xp_iconv_reset(gNativeToUnicode);
 504     }
 505 #if defined(ENABLE_UTF8_FALLBACK_SUPPORT)
 506     else if ((gNativeToUTF8 != INVALID_ICONV_T) &&
 507              (gUTF8ToUnicode != INVALID_ICONV_T)) {
 508         // convert first to UTF8, then from UTF8 to UCS2
 509         const char *in = *input;
 510
 511         char ubuf[1024];
 512
 513         // we assume we're always called with enough space in |output|,
 514         // so convert many chars at a time...
 515         while (inLeft) {
 516             char *p = ubuf;
 517             size_t n = sizeof(ubuf);
 518             res = xp_iconv(gNativeToUTF8, &in, &inLeft, &p, &n);
 519             if (res == (size_t) -1) {
 520                 NS_ERROR("conversion from native to utf-8 failed");
 521                 break;
 522             }
 523             NS_ASSERTION(outLeft > 0, "bad assumption");
 524             p = ubuf;
 525             n = sizeof(ubuf) - n;
 526             res = xp_iconv(gUTF8ToUnicode, (const char **) &p, &n, (char **) output, &outLeft);
 527             if (res == (size_t) -1) {
 528                 NS_ERROR("conversion from utf-8 to utf-16 failed");
 529                 break;
 530             }
 531         }
 532
 533         (*input) += (*inputLeft - inLeft);
 534         *inputLeft = inLeft;
 535         *outputLeft = outLeft / 2;
 536
 537         if (res != (size_t) -1)
 538             return NS_OK;
 539
 540         // reset converters
 541         xp_iconv_reset(gNativeToUTF8);
 542         xp_iconv_reset(gUTF8ToUnicode);
 543     }
 544 #endif
 545
 546     // fallback: zero-pad and hope for the best
 547     // XXX This is lame and we have to do better.
 548     isolatin1_to_utf16(input, inputLeft, output, outputLeft);
 549
 550     return NS_OK;
 551 }
 552
 553 nsresult
 554 nsNativeCharsetConverter::UnicodeToNative(const PRUnichar **input,
 555                                           PRUint32         *inputLeft,
 556                                           char            **output,
 557                                           PRUint32         *outputLeft)
 558 {
 559     size_t res = 0;
 560     size_t inLeft = (size_t) *inputLeft * 2;
 561     size_t outLeft = (size_t) *outputLeft;
 562
 563     if (gUnicodeToNative != INVALID_ICONV_T) {
 564         res = xp_iconv(gUnicodeToNative, (const char **) input, &inLeft, output, &outLeft);
 565
 566         if (res != (size_t) -1) {
 567             *inputLeft = inLeft / 2;
 568             *outputLeft = outLeft;
 569             return NS_OK;
 570         }
 571
 572         NS_ERROR("iconv failed");
 573
 574         // reset converter
 575         xp_iconv_reset(gUnicodeToNative);
 576     }
 577 #if defined(ENABLE_UTF8_FALLBACK_SUPPORT)
 578     else if ((gUnicodeToUTF8 != INVALID_ICONV_T) &&
 579              (gUTF8ToNative != INVALID_ICONV_T)) {
 580         const char *in = (const char *) *input;
 581
 582         char ubuf[6]; // max utf-8 char length (really only needs to be 4 bytes)
 583
 584         // convert one uchar at a time...
 585         while (inLeft && outLeft) {
 586             char *p = ubuf;
 587             size_t n = sizeof(ubuf), one_uchar = sizeof(PRUnichar);
 588             res = xp_iconv(gUnicodeToUTF8, &in, &one_uchar, &p, &n);
 589             if (res == (size_t) -1) {
 590                 NS_ERROR("conversion from utf-16 to utf-8 failed");
 591                 break;
 592             }
 593             p = ubuf;
 594             n = sizeof(ubuf) - n;
 595             res = xp_iconv(gUTF8ToNative, (const char **) &p, &n, output, &outLeft);
 596             if (res == (size_t) -1) {
 597                 if (errno == E2BIG) {
 598                     // not enough room for last uchar... back up and return.
 599                     in -= sizeof(PRUnichar);
 600                     res = 0;
 601                 }
 602                 else
 603                     NS_ERROR("conversion from utf-8 to native failed");
 604                 break;
 605             }
 606             inLeft -= sizeof(PRUnichar);
 607         }
 608
 609         if (res != (size_t) -1) {
 610             (*input) += (*inputLeft - inLeft/2);
 611             *inputLeft = inLeft/2;
 612             *outputLeft = outLeft;
 613             return NS_OK;
 614         }
 615
 616         // reset converters
 617         xp_iconv_reset(gUnicodeToUTF8);
 618         xp_iconv_reset(gUTF8ToNative);
 619     }
 620 #endif
 621
 622     // fallback: truncate and hope for the best
 623     utf16_to_isolatin1(input, inputLeft, output, outputLeft);
 624
 625     return NS_OK;
 626 }
 627
 628 PRBool
 629 nsNativeCharsetConverter::IsNativeUTF8()
 630 {
 631     if (!gInitialized) {
 632         Lock();
 633         if (!gInitialized)
 634            LazyInit();
 635         Unlock();
 636     }
 637     return gIsNativeUTF8;
 638 }
 639
 640 #endif // USE_ICONV
 641
 642 //-----------------------------------------------------------------------------
 643 // conversion using mb[r]towc/wc[r]tomb
 644 //-----------------------------------------------------------------------------
 645 #if defined(USE_STDCONV)
 646 #if defined(HAVE_WCRTOMB) || defined(HAVE_MBRTOWC)
 647 #include <wchar.h>    // mbrtowc, wcrtomb
 648 #endif
 649
 650 class nsNativeCharsetConverter
 651 {
 652 public:
 653     nsNativeCharsetConverter();
 654
 655     nsresult NativeToUnicode(const char      **input , PRUint32 *inputLeft,
 656                              PRUnichar       **output, PRUint32 *outputLeft);
 657     nsresult UnicodeToNative(const PRUnichar **input , PRUint32 *inputLeft,
 658                              char            **output, PRUint32 *outputLeft);
 659
 660     static void GlobalInit();
 661     static void GlobalShutdown() { }
 662     static PRBool IsNativeUTF8();
 663
 664 private:
 665     static PRBool gWCharIsUnicode;
 666
 667 #if defined(HAVE_WCRTOMB) || defined(HAVE_MBRTOWC)
 668     mbstate_t ps;
 669 #endif
 670 };
 671
 672 PRBool nsNativeCharsetConverter::gWCharIsUnicode = PR_FALSE;
 673
 674 nsNativeCharsetConverter::nsNativeCharsetConverter()
 675 {
 676 #if defined(HAVE_WCRTOMB) || defined(HAVE_MBRTOWC)
 677     memset(&ps, 0, sizeof(ps));
 678 #endif
 679 }
 680
 681 void
 682 nsNativeCharsetConverter::GlobalInit()
 683 {
 684     // verify that wchar_t for the current locale is actually unicode.
 685     // if it is not, then we should avoid calling mbtowc/wctomb and
 686     // just fallback on zero-pad/truncation conversion.
 687     //
 688     // this test cannot be done at build time because the encoding of
 689     // wchar_t may depend on the runtime locale.  sad, but true!!
 690     //
 691     // so, if wchar_t is unicode then converting an ASCII character
 692     // to wchar_t should not change its numeric value.  we'll just
 693     // check what happens with the ASCII 'a' character.
 694     //
 695     // this test is not perfect... obviously, it could yield false
 696     // positives, but then at least ASCII text would be converted
 697     // properly (or maybe just the 'a' character) -- oh well :(
 698
 699     char a = 'a';
 700     unsigned int w = 0;
 701
 702     int res = mbtowc((wchar_t *) &w, &a, 1);
 703
 704     gWCharIsUnicode = (res != -1 && w == 'a');
 705
 706 #ifdef DEBUG
 707     if (!gWCharIsUnicode)
 708         NS_WARNING("wchar_t is not unicode (unicode conversion will be lossy)");
 709 #endif
 710 }
 711
 712 nsresult
 713 nsNativeCharsetConverter::NativeToUnicode(const char **input,
 714                                           PRUint32    *inputLeft,
 715                                           PRUnichar  **output,
 716                                           PRUint32    *outputLeft)
 717 {
 718     if (gWCharIsUnicode) {
 719         int incr;
 720
 721         // cannot use wchar_t here since it may have been redefined (e.g.,
 722         // via -fshort-wchar).  hopefully, sizeof(tmp) is sufficient XP.
 723         unsigned int tmp = 0;
 724         while (*inputLeft && *outputLeft) {
 725 #ifdef HAVE_MBRTOWC
 726             incr = (int) mbrtowc((wchar_t *) &tmp, *input, *inputLeft, &ps);
 727 #else
 728             // XXX is this thread-safe?
 729             incr = (int) mbtowc((wchar_t *) &tmp, *input, *inputLeft);
 730 #endif
 731             if (incr < 0) {
 732                 NS_WARNING("mbtowc failed: possible charset mismatch");
 733                 // zero-pad and hope for the best
 734                 tmp = (unsigned char) **input;
 735                 incr = 1;
 736             }
 737             **output = (PRUnichar) tmp;
 738             (*input) += incr;
 739             (*inputLeft) -= incr;
 740             (*output)++;
 741             (*outputLeft)--;
 742         }
 743     }
 744     else {
 745         // wchar_t isn't unicode, so the best we can do is treat the
 746         // input as if it is isolatin1 :(
 747         isolatin1_to_utf16(input, inputLeft, output, outputLeft);
 748     }
 749
 750     return NS_OK;
 751 }
 752
 753 nsresult
 754 nsNativeCharsetConverter::UnicodeToNative(const PRUnichar **input,
 755                                           PRUint32         *inputLeft,
 756                                           char            **output,
 757                                           PRUint32         *outputLeft)
 758 {
 759     if (gWCharIsUnicode) {
 760         int incr;
 761
 762         while (*inputLeft && *outputLeft >= MB_CUR_MAX) {
 763 #ifdef HAVE_WCRTOMB
 764             incr = (int) wcrtomb(*output, (wchar_t) **input, &ps);
 765 #else
 766             // XXX is this thread-safe?
 767             incr = (int) wctomb(*output, (wchar_t) **input);
 768 #endif
 769             if (incr < 0) {
 770                 NS_WARNING("mbtowc failed: possible charset mismatch");
 771                 **output = (unsigned char) **input; // truncate
 772                 incr = 1;
 773             }
 774             // most likely we're dead anyways if this assertion should fire
 775             NS_ASSERTION(PRUint32(incr) <= *outputLeft, "wrote beyond end of string");
 776             (*output) += incr;
 777             (*outputLeft) -= incr;
 778             (*input)++;
 779             (*inputLeft)--;
 780         }
 781     }
 782     else {
 783         // wchar_t isn't unicode, so the best we can do is treat the
 784         // input as if it is isolatin1 :(
 785         utf16_to_isolatin1(input, inputLeft, output, outputLeft);
 786     }
 787
 788     return NS_OK;
 789 }
 790
 791 // XXX : for now, return false
 792 PRBool
 793 nsNativeCharsetConverter::IsNativeUTF8()
 794 {
 795     return PR_FALSE;
 796 }
 797
 798 #endif // USE_STDCONV
 799
 800 //-----------------------------------------------------------------------------
 801 // API implementation
 802 //-----------------------------------------------------------------------------
 803
 804 NS_COM nsresult
 805 NS_CopyNativeToUnicode(const nsACString &input, nsAString &output)
 806 {
 807     output.Truncate();
 808
 809     PRUint32 inputLen = input.Length();
 810
 811     nsACString::const_iterator iter;
 812     input.BeginReading(iter);
 813
 814     //
 815     // OPTIMIZATION: preallocate space for largest possible result; convert
 816     // directly into the result buffer to avoid intermediate buffer copy.
 817     //
 818     // this will generally result in a larger allocation, but that seems
 819     // better than an extra buffer copy.
 820     //
 821     if (!EnsureStringLength(output, inputLen))
 822         return NS_ERROR_OUT_OF_MEMORY;
 823     nsAString::iterator out_iter;
 824     output.BeginWriting(out_iter);
 825
 826     PRUnichar *result = out_iter.get();
 827     PRUint32 resultLeft = inputLen;
 828
 829     const char *buf = iter.get();
 830     PRUint32 bufLeft = inputLen;
 831
 832     nsNativeCharsetConverter conv;
 833     nsresult rv = conv.NativeToUnicode(&buf, &bufLeft, &result, &resultLeft);
 834     if (NS_SUCCEEDED(rv)) {
 835         NS_ASSERTION(bufLeft == 0, "did not consume entire input buffer");
 836         output.SetLength(inputLen - resultLeft);
 837     }
 838     return rv;
 839 }
 840
 841 NS_COM nsresult
 842 NS_CopyUnicodeToNative(const nsAString &input, nsACString &output)
 843 {
 844     output.Truncate();
 845
 846     nsAString::const_iterator iter, end;
 847     input.BeginReading(iter);
 848     input.EndReading(end);
 849
 850     // cannot easily avoid intermediate buffer copy.
 851     char temp[4096];
 852
 853     nsNativeCharsetConverter conv;
 854
 855     const PRUnichar *buf = iter.get();
 856     PRUint32 bufLeft = Distance(iter, end);
 857     while (bufLeft) {
 858         char *p = temp;
 859         PRUint32 tempLeft = sizeof(temp);
 860
 861         nsresult rv = conv.UnicodeToNative(&buf, &bufLeft, &p, &tempLeft);
 862         if (NS_FAILED(rv)) return rv;
 863
 864         if (tempLeft < sizeof(temp))
 865             output.Append(temp, sizeof(temp) - tempLeft);
 866     }
 867     return NS_OK;
 868 }
 869
 870 NS_COM PRBool
 871 NS_IsNativeUTF8()
 872 {
 873     return nsNativeCharsetConverter::IsNativeUTF8();
 874 }
 875
 876 void
 877 NS_StartupNativeCharsetUtils()
 878 {
 879     //
 880     // need to initialize the locale or else charset conversion will fail.
 881     // better not delay this in case some other component alters the locale
 882     // settings.
 883     //
 884     // XXX we assume that we are called early enough that we should
 885     // always be the first to care about the locale's charset.
 886     //
 887     setlocale(LC_CTYPE, "");
 888
 889     nsNativeCharsetConverter::GlobalInit();
 890 }
 891
// Shutdown hook for this platform branch: forwards to the converter's
// global shutdown routine, the counterpart of the GlobalInit() call made in
// NS_StartupNativeCharsetUtils().
void
NS_ShutdownNativeCharsetUtils()
{
    nsNativeCharsetConverter::GlobalShutdown();
}
 897
 898 //-----------------------------------------------------------------------------
 899 // XP_WIN
 900 //-----------------------------------------------------------------------------
 901 #elif defined(XP_WIN)
 902
 903 #include <windows.h>
 904 #include "nsAString.h"
 905 #include "nsReadableUtils.h"
 906
 907 NS_COM nsresult
 908 NS_CopyNativeToUnicode(const nsACString &input, nsAString &output)
 909 {
 910     PRUint32 inputLen = input.Length();
 911
 912     nsACString::const_iterator iter;
 913     input.BeginReading(iter);
 914
 915     const char *buf = iter.get();
 916
 917     // determine length of result
 918     PRUint32 resultLen = 0;
 919     int n = ::MultiByteToWideChar(CP_ACP, 0, buf, inputLen, NULL, 0);
 920     if (n > 0)
 921         resultLen += n;
 922
 923     // allocate sufficient space
 924     if (!EnsureStringLength(output, resultLen))
 925         return NS_ERROR_OUT_OF_MEMORY;
 926     if (resultLen > 0) {
 927         nsAString::iterator out_iter;
 928         output.BeginWriting(out_iter);
 929
 930         PRUnichar *result = out_iter.get();
 931
 932         ::MultiByteToWideChar(CP_ACP, 0, buf, inputLen, result, resultLen);
 933     }
 934     return NS_OK;
 935 }
 936
 937 NS_COM nsresult
 938 NS_CopyUnicodeToNative(const nsAString  &input, nsACString &output)
 939 {
 940     PRUint32 inputLen = input.Length();
 941
 942     nsAString::const_iterator iter;
 943     input.BeginReading(iter);
 944
 945     const PRUnichar *buf = iter.get();
 946
 947     // determine length of result
 948     PRUint32 resultLen = 0;
 949
 950     int n = ::WideCharToMultiByte(CP_ACP, 0, buf, inputLen, NULL, 0, NULL, NULL);
 951     if (n > 0)
 952         resultLen += n;
 953
 954     // allocate sufficient space
 955     if (!EnsureStringLength(output, resultLen))
 956         return NS_ERROR_OUT_OF_MEMORY;
 957     if (resultLen > 0) {
 958         nsACString::iterator out_iter;
 959         output.BeginWriting(out_iter);
 960
 961         // default "defaultChar" is '?', which is an illegal character on windows
 962         // file system.  That will cause file uncreatable. Change it to '_'
 963         const char defaultChar = '_';
 964
 965         char *result = out_iter.get();
 966
 967         ::WideCharToMultiByte(CP_ACP, 0, buf, inputLen, result, resultLen,
 968                               &defaultChar, NULL);
 969     }
 970     return NS_OK;
 971 }
 972
// moved from widget/src/windows/nsToolkit.cpp
//
// Converts the null-terminated (length argument is -1) ANSI code page string
// |aStrInA| into |aStrOutW|, passing |aBufferSize| as the size of the
// output buffer.  Returns the value of MultiByteToWideChar() unchanged;
// no argument validation is done here.
NS_COM PRInt32
NS_ConvertAtoW(const char *aStrInA, int aBufferSize, PRUnichar *aStrOutW)
{
    return MultiByteToWideChar(CP_ACP, 0, aStrInA, -1, aStrOutW, aBufferSize);
}
 979
 980 NS_COM PRInt32
 981 NS_ConvertWtoA(const PRUnichar *aStrInW, int aBufferSizeOut,
 982                char *aStrOutA, const char *aDefault)
 983 {
 984     if ((!aStrInW) || (!aStrOutA) || (aBufferSizeOut <= 0))
 985         return 0;
 986
 987     int numCharsConverted = WideCharToMultiByte(CP_ACP, 0, aStrInW, -1,
 988                                                 aStrOutA, aBufferSizeOut,
 989                                                 aDefault, NULL);
 990
 991     if (!numCharsConverted) {
 992         if (GetLastError() == ERROR_INSUFFICIENT_BUFFER) {
 993             // Overflow, add missing null termination but return 0
 994             aStrOutA[aBufferSizeOut-1] = '\0';
 995         }
 996         else {
 997             // Other error, clear string and return 0
 998             aStrOutA[0] = '\0';
 999         }
1000     }
1001     else if (numCharsConverted < aBufferSizeOut) {
1002         // Add 2nd null (really necessary?)
1003         aStrOutA[numCharsConverted] = '\0';
1004     }
1005
1006     return numCharsConverted;
1007 }
1008
1009 //-----------------------------------------------------------------------------
1010 // XP_OS2
1011 //-----------------------------------------------------------------------------
1012 #elif defined(XP_OS2)
1013
1014 #define INCL_DOS
1015 #include <os2.h>
1016 #include <uconv.h>
1017 #include "nsAString.h"
1018 #include "nsReadableUtils.h"
1019 #include <ulserrno.h>
1020 #include "nsNativeCharsetUtils.h"
1021
// File-local conversion object shared by the OS/2 copy functions below.
// NULL until NS_StartupNativeCharsetUtils() has created it; the copy
// functions call that routine themselves when they find it still NULL.
static UconvObject UnicodeConverter = NULL;
1023
1024 NS_COM nsresult
1025 NS_CopyNativeToUnicode(const nsACString &input, nsAString  &output)
1026 {
1027     PRUint32 inputLen = input.Length();
1028
1029     nsACString::const_iterator iter;
1030     input.BeginReading(iter);
1031     const char *inputStr = iter.get();
1032
1033     // determine length of result
1034     PRUint32 resultLen = inputLen;
1035     if (!EnsureStringLength(output, resultLen))
1036         return NS_ERROR_OUT_OF_MEMORY;
1037
1038     nsAString::iterator out_iter;
1039     output.BeginWriting(out_iter);
1040     UniChar *result = (UniChar*)out_iter.get();
1041
1042     size_t cSubs = 0;
1043     size_t resultLeft = resultLen;
1044
1045     if (!UnicodeConverter)
1046       NS_StartupNativeCharsetUtils();
1047
1048     int unirc = ::UniUconvToUcs(UnicodeConverter, (void**)&inputStr, &inputLen,
1049                                 &result, &resultLeft, &cSubs);
1050
1051     NS_ASSERTION(unirc != UCONV_E2BIG, "Path too big");
1052
1053     if (unirc != ULS_SUCCESS) {
1054         output.Truncate();
1055         return NS_ERROR_FAILURE;
1056     }
1057
1058     // Need to update string length to reflect how many bytes were actually
1059     // written.
1060     output.Truncate(resultLen - resultLeft);
1061     return NS_OK;
1062 }
1063
1064 NS_COM nsresult
1065 NS_CopyUnicodeToNative(const nsAString &input, nsACString &output)
1066 {
1067     size_t inputLen = input.Length();
1068
1069     nsAString::const_iterator iter;
1070     input.BeginReading(iter);
1071     UniChar* inputStr = (UniChar*) const_cast<PRUnichar*>(iter.get());
1072
1073     // maximum length of unicode string of length x converted to native
1074     // codepage is x*2
1075     size_t resultLen = inputLen * 2;
1076     if (!EnsureStringLength(output, resultLen))
1077         return NS_ERROR_OUT_OF_MEMORY;
1078
1079     nsACString::iterator out_iter;
1080     output.BeginWriting(out_iter);
1081     char *result = out_iter.get();
1082
1083     size_t cSubs = 0;
1084     size_t resultLeft = resultLen;
1085
1086     if (!UnicodeConverter)
1087       NS_StartupNativeCharsetUtils();
1088
1089     int unirc = ::UniUconvFromUcs(UnicodeConverter, &inputStr, &inputLen,
1090                                   (void**)&result, &resultLeft, &cSubs);
1091
1092     NS_ASSERTION(unirc != UCONV_E2BIG, "Path too big");
1093
1094     if (unirc != ULS_SUCCESS) {
1095         output.Truncate();
1096         return NS_ERROR_FAILURE;
1097     }
1098
1099     // Need to update string length to reflect how many bytes were actually
1100     // written.
1101     output.Truncate(resultLen - resultLeft);
1102     return NS_OK;
1103 }
1104
1105 void
1106 NS_StartupNativeCharsetUtils()
1107 {
1108     ULONG ulLength;
1109     ULONG ulCodePage;
1110     DosQueryCp(sizeof(ULONG), &ulCodePage, &ulLength);
1111
1112     UniChar codepage[20];
1113     int unirc = ::UniMapCpToUcsCp(ulCodePage, codepage, 20);
1114     if (unirc == ULS_SUCCESS) {
1115         unirc = ::UniCreateUconvObject(codepage, &UnicodeConverter);
1116         if (unirc == ULS_SUCCESS) {
1117             uconv_attribute_t attr;
1118             ::UniQueryUconvObject(UnicodeConverter, &attr, sizeof(uconv_attribute_t),
1119                                   NULL, NULL, NULL);
1120             attr.options = UCONV_OPTION_SUBSTITUTE_BOTH;
1121             attr.subchar_len=1;
1122             attr.subchar[0]='_';
1123             ::UniSetUconvObject(UnicodeConverter, &attr);
1124         }
1125     }
1126 }
1127
1128 void
1129 NS_ShutdownNativeCharsetUtils()
1130 {
1131     ::UniFreeUconvObject(UnicodeConverter);
1132 }
1133
1134 #else
1135
1136 #include "nsReadableUtils.h"
1137
// Fallback used when none of the platform-specific branches above is
// compiled: no native charset conversion is attempted, |input| is simply
// passed through CopyASCIItoUTF16().  Always returns NS_OK.
NS_COM nsresult
NS_CopyNativeToUnicode(const nsACString &input, nsAString  &output)
{
    CopyASCIItoUTF16(input, output);
    return NS_OK;
}
1144
// Fallback used when none of the platform-specific branches above is
// compiled: no native charset conversion is attempted, |input| is simply
// passed through LossyCopyUTF16toASCII().  Always returns NS_OK.
NS_COM nsresult
NS_CopyUnicodeToNative(const nsAString  &input, nsACString &output)
{
    LossyCopyUTF16toASCII(input, output);
    return NS_OK;
}
1151
// Fallback startup hook: intentionally empty, the fallback copy functions
// in this branch need no global state.
void
NS_StartupNativeCharsetUtils()
{
}
1156
// Fallback shutdown hook: intentionally empty, the matching startup hook in
// this branch sets nothing up.
void
NS_ShutdownNativeCharsetUtils()
{
}
1161
1162 #endif