intl/uconv/src/nsUnicodeToUTF8.cpp

   1 /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
   2 /* ***** BEGIN LICENSE BLOCK *****
   3  * Version: MPL 1.1/GPL 2.0/LGPL 2.1
   4  *
   5  * The contents of this file are subject to the Mozilla Public License Version
   6  * 1.1 (the "License"); you may not use this file except in compliance with
   7  * the License. You may obtain a copy of the License at
   8  * http://www.mozilla.org/MPL/
   9  *
  10  * Software distributed under the License is distributed on an "AS IS" basis,
  11  * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
  12  * for the specific language governing rights and limitations under the
  13  * License.
  14  *
  15  * The Original Code is Mozilla Communicator client code.
  16  *
  17  * The Initial Developer of the Original Code is
  18  * Netscape Communications Corporation.
  19  * Portions created by the Initial Developer are Copyright (C) 1998
  20  * the Initial Developer. All Rights Reserved.
  21  *
  22  * Contributor(s):
  23  *
  24  * Alternatively, the contents of this file may be used under the terms of
  25  * either of the GNU General Public License Version 2 or later (the "GPL"),
  26  * or the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
  27  * in which case the provisions of the GPL or the LGPL are applicable instead
  28  * of those above. If you wish to allow use of your version of this file only
  29  * under the terms of either the GPL or the LGPL, and not to allow others to
  30  * use your version of this file under the terms of the MPL, indicate your
  31  * decision by deleting the provisions above and replace them with the notice
  32  * and other provisions required by the GPL or the LGPL. If you do not delete
  33  * the provisions above, a recipient may use your version of this file under
  34  * the terms of any one of the MPL, the GPL or the LGPL.
  35  *
  36  * ***** END LICENSE BLOCK ***** */
  37
  38 //----------------------------------------------------------------------
  39 // Global functions and data [declaration]
  40 #include "nsUCSupport.h"
  41 #include "nsUnicodeToUTF8.h"
  42 #include <string.h>
  43
  44 NS_IMPL_ISUPPORTS1(nsUnicodeToUTF8, nsIUnicodeEncoder) // XPCOM nsISupports boilerplate macro; nsIUnicodeEncoder is the single interface listed for this class
  45
  46 //----------------------------------------------------------------------
  47 // nsUnicodeToUTF8 class [implementation]
  48
  49 NS_IMETHODIMP nsUnicodeToUTF8::GetMaxLength(const PRUnichar * aSrc,
  50                                               PRInt32 aSrcLength,
  51                                               PRInt32 * aDestLength)
  52 {
  53   // aSrc is interpreted as UTF16, 3 is normally enough.
  54   // But when previous buffer only contains part of the surrogate pair, we
  55   // need to complete it here. If the first word in following buffer is not
  56   // in valid surrogate rang, we need to convert the remaining of last buffer
  57   // to 3 bytes.
  58   *aDestLength = 3*aSrcLength + 3;
  59   return NS_OK;
  60 }
  61
  62 NS_IMETHODIMP nsUnicodeToUTF8::FillInfo(PRUint32 *aInfo)
  63 {
  64   memset(aInfo, 0xFF, (0x10000L >> 3));
  65   return NS_OK;
  66 }
  67
  68 NS_IMETHODIMP nsUnicodeToUTF8::Convert(const PRUnichar * aSrc,
  69                                 PRInt32 * aSrcLength,
  70                                 char * aDest,
  71                                 PRInt32 * aDestLength)
  72 {
  73   const PRUnichar * src = aSrc;
  74   const PRUnichar * srcEnd = aSrc + *aSrcLength;
  75   char * dest = aDest;
  76   PRInt32 destLen = *aDestLength;
  77   PRUint32 n;
  78
  79   //complete remaining of last conversion
  80   if (mHighSurrogate) {
  81     if (src < srcEnd) {
  82       *aDestLength = 0;
  83       return NS_OK_UENC_MOREINPUT;
  84     }
  85     if (*aDestLength < 4) {
  86       *aSrcLength = 0;
  87       *aDestLength = 0;
  88       return NS_OK_UENC_MOREOUTPUT;
  89     }
  90     if (*src < (PRUnichar)0xdc00 || *src > (PRUnichar)0xdfff) { //not a pair
  91       *dest++ = (char)0xe0 | (mHighSurrogate >> 12);
  92       *dest++ = (char)0x80 | ((mHighSurrogate >> 6) & 0x003f);
  93       *dest++ = (char)0x80 | (mHighSurrogate & 0x003f);
  94       destLen -= 3;
  95     } else {
  96       n = ((mHighSurrogate - (PRUnichar)0xd800) << 10) +
  97               (*src - (PRUnichar)0xdc00) + 0x10000;
  98       *dest++ = (char)0xf0 | (n >> 18);
  99       *dest++ = (char)0x80 | ((n >> 12) & 0x3f);
 100       *dest++ = (char)0x80 | ((n >> 6) & 0x3f);
 101       *dest++ = (char)0x80 | (n & 0x3f);
 102       ++src;
 103       destLen -= 4;
 104     }
 105     mHighSurrogate = 0;
 106   }
 107
 108   while (src < srcEnd) {
 109     if ( *src <= 0x007f) {
 110       if (destLen < 1)
 111         goto error_more_output;
 112       *dest++ = (char)*src;
 113       --destLen;
 114     } else if (*src <= 0x07ff) {
 115       if (destLen < 2)
 116         goto error_more_output;
 117       *dest++ = (char)0xc0 | (*src >> 6);
 118       *dest++ = (char)0x80 | (*src & 0x003f);
 119       destLen -= 2;
 120     } else if (*src >= (PRUnichar)0xD800 && *src < (PRUnichar)0xDC00) {
 121       if ((src+1) >= srcEnd) {
 122         //we need another surrogate to complete this unicode char
 123         mHighSurrogate = *src;
 124         *aDestLength = dest - aDest;
 125         return NS_OK_UENC_MOREINPUT;
 126       }
 127       //handle surrogate
 128       if (destLen < 4)
 129         goto error_more_output;
 130       if (*(src+1) < (PRUnichar)0xdc00 || *(src+1) > 0xdfff) { //not a pair
 131         *dest++ = (char)0xe0 | (*src >> 12);
 132         *dest++ = (char)0x80 | ((*src >> 6) & 0x003f);
 133         *dest++ = (char)0x80 | (*src & 0x003f);
 134         destLen -= 3;
 135       } else {
 136         n = ((*src - (PRUnichar)0xd800) << 10) + (*(src+1) - (PRUnichar)0xdc00) + (PRUint32)0x10000;
 137         *dest++ = (char)0xf0 | (n >> 18);
 138         *dest++ = (char)0x80 | ((n >> 12) & 0x3f);
 139         *dest++ = (char)0x80 | ((n >> 6) & 0x3f);
 140         *dest++ = (char)0x80 | (n & 0x3f);
 141         destLen -= 4;
 142         ++src;
 143       }
 144     } else {
 145       if (destLen < 3)
 146         goto error_more_output;
 147       //treat rest of the character as BMP
 148       *dest++ = (char)0xe0 | (*src >> 12);
 149       *dest++ = (char)0x80 | ((*src >> 6) & 0x003f);
 150       *dest++ = (char)0x80 | (*src & 0x003f);
 151       destLen -= 3;
 152     }
 153     ++src;
 154   }
 155
 156   *aDestLength = dest - aDest;
 157   return NS_OK;
 158
 159 error_more_output:
 160   *aSrcLength = src - aSrc;
 161   *aDestLength = dest - aDest;
 162   return NS_OK_UENC_MOREOUTPUT;
 163 }
 164
 165 NS_IMETHODIMP nsUnicodeToUTF8::Finish(char * aDest, PRInt32 * aDestLength)
 166 {
 167   char * dest = aDest;
 168
 169   if (mHighSurrogate) {
 170     if (*aDestLength < 3) {
 171       *aDestLength = 0;
 172       return NS_OK_UENC_MOREOUTPUT;
 173     }
 174     *dest++ = (char)0xe0 | (mHighSurrogate >> 12);
 175     *dest++ = (char)0x80 | ((mHighSurrogate >> 6) & 0x003f);
 176     *dest++ = (char)0x80 | (mHighSurrogate & 0x003f);
 177     mHighSurrogate = 0;
 178     *aDestLength = 3;
 179     return NS_OK;
 180   }
 181
 182   *aDestLength  = 0;
 183   return NS_OK;
 184 }