intl/uconv/src/nsUTF8ToUnicode.cpp

   1 /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
   2 /* ***** BEGIN LICENSE BLOCK *****
   3  * Version: MPL 1.1/GPL 2.0/LGPL 2.1
   4  *
   5  * The contents of this file are subject to the Mozilla Public License Version
   6  * 1.1 (the "License"); you may not use this file except in compliance with
   7  * the License. You may obtain a copy of the License at
   8  * http://www.mozilla.org/MPL/
   9  *
  10  * Software distributed under the License is distributed on an "AS IS" basis,
  11  * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
  12  * for the specific language governing rights and limitations under the
  13  * License.
  14  *
  15  * The Original Code is Mozilla Communicator client code.
  16  *
  17  * The Initial Developer of the Original Code is
  18  * Netscape Communications Corporation.
  19  * Portions created by the Initial Developer are Copyright (C) 1998
  20  * the Initial Developer. All Rights Reserved.
  21  *
  22  * Contributor(s):
  23  *
  24  * Alternatively, the contents of this file may be used under the terms of
  25  * either of the GNU General Public License Version 2 or later (the "GPL"),
  26  * or the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
  27  * in which case the provisions of the GPL or the LGPL are applicable instead
  28  * of those above. If you wish to allow use of your version of this file only
  29  * under the terms of either the GPL or the LGPL, and not to allow others to
  30  * use your version of this file under the terms of the MPL, indicate your
  31  * decision by deleting the provisions above and replace them with the notice
  32  * and other provisions required by the GPL or the LGPL. If you do not delete
  33  * the provisions above, a recipient may use your version of this file under
  34  * the terms of any one of the MPL, the GPL or the LGPL.
  35  *
  36  * ***** END LICENSE BLOCK ***** */
  37
  38 #include "nsUCSupport.h"
  39 #include "nsUTF8ToUnicode.h"
  40
  41 #define UNICODE_BYTE_ORDER_MARK    0xFEFF
  42
  43 NS_IMETHODIMP NS_NewUTF8ToUnicode(nsISupports* aOuter,
  44                                   const nsIID& aIID,
  45                                   void** aResult)
  46 {
  47   if (!aResult) {
  48     return NS_ERROR_NULL_POINTER;
  49   }
  50   if (aOuter) {
  51     *aResult = nsnull;
  52     return NS_ERROR_NO_AGGREGATION;
  53   }
  54   nsUTF8ToUnicode * inst = new nsUTF8ToUnicode();
  55   if (!inst) {
  56     *aResult = nsnull;
  57     return NS_ERROR_OUT_OF_MEMORY;
  58   }
  59   nsresult res = inst->QueryInterface(aIID, aResult);
  60   if (NS_FAILED(res)) {
  61     *aResult = nsnull;
  62     delete inst;
  63   }
  64   return res;
  65 }
  66
  67 //----------------------------------------------------------------------
  68 // Class nsUTF8ToUnicode [implementation]
  69
  70 nsUTF8ToUnicode::nsUTF8ToUnicode()
  71 : nsBasicDecoderSupport()
  72 {
  73   Reset();
  74 }
  75
  76 //----------------------------------------------------------------------
// Subclassing of nsBasicDecoderSupport class [implementation]
  78
  79 /**
  80  * Normally the maximum length of the output of the UTF8 decoder in UTF16
  81  *  code units is the same as the length of the input in UTF8 code units,
  82  *  since 1-byte, 2-byte and 3-byte UTF-8 sequences decode to a single
  83  *  UTF-16 character, and 4-byte UTF-8 sequences decode to a surrogate pair.
  84  *
  85  * However, there is an edge case where the output can be longer than the
  86  *  input: if the previous buffer ended with an incomplete multi-byte
  87  *  sequence and this buffer does not begin with a valid continuation
  88  *  byte, we will return NS_ERROR_UNEXPECTED and the caller may insert a
  89  *  replacement character in the output buffer which corresponds to no
  90  *  character in the input buffer. So in the worst case the destination
  91  *  will need to be one code unit longer than the source.
  92  *  See bug 301797.
  93  */
  94 NS_IMETHODIMP nsUTF8ToUnicode::GetMaxLength(const char * aSrc,
  95                                             PRInt32 aSrcLength,
  96                                             PRInt32 * aDestLength)
  97 {
  98   *aDestLength = aSrcLength + 1;
  99   return NS_OK;
 100 }
 101
 102
 103 //----------------------------------------------------------------------
 104 // Subclassing of nsBasicDecoderSupport class [implementation]
 105
 106 NS_IMETHODIMP nsUTF8ToUnicode::Reset()
 107 {
 108
 109   mUcs4  = 0;     // cached Unicode character
 110   mState = 0;     // cached expected number of octets after the current octet
 111                   // until the beginning of the next UTF8 character sequence
 112   mBytes = 1;     // cached expected number of octets in the current sequence
 113   mFirst = PR_TRUE;
 114
 115   return NS_OK;
 116
 117 }
 118
 119 //----------------------------------------------------------------------
 120 // Subclassing of nsBasicDecoderSupport class [implementation]
 121
 122
 123 NS_IMETHODIMP nsUTF8ToUnicode::Convert(const char * aSrc,
 124                                        PRInt32 * aSrcLength,
 125                                        PRUnichar * aDest,
 126                                        PRInt32 * aDestLength)
 127 {
 128   PRUint32 aSrcLen   = (PRUint32) (*aSrcLength);
 129   PRUint32 aDestLen = (PRUint32) (*aDestLength);
 130
 131   const char *in, *inend;
 132   inend = aSrc + aSrcLen;
 133
 134   PRUnichar *out, *outend;
 135   outend = aDest + aDestLen;
 136
 137   nsresult res = NS_OK; // conversion result
 138
 139   // Set mFirst to PR_FALSE now so we don't have to every time through the ASCII
 140   // branch within the loop.
 141   if (mFirst && aSrcLen && (0 == (0x80 & (*aSrc))))
 142     mFirst = PR_FALSE;
 143
 144   for (in = aSrc, out = aDest; ((in < inend) && (out < outend)); ++in) {
 145     if (0 == mState) {
 146       // When mState is zero we expect either a US-ASCII character or a
 147       // multi-octet sequence.
 148       if (0 == (0x80 & (*in))) {
 149         // US-ASCII, pass straight through.
 150         *out++ = (PRUnichar)*in;
 151         mBytes = 1;
 152       } else if (0xC0 == (0xE0 & (*in))) {
 153         // First octet of 2 octet sequence
 154         mUcs4 = (PRUint32)(*in);
 155         mUcs4 = (mUcs4 & 0x1F) << 6;
 156         mState = 1;
 157         mBytes = 2;
 158       } else if (0xE0 == (0xF0 & (*in))) {
 159         // First octet of 3 octet sequence
 160         mUcs4 = (PRUint32)(*in);
 161         mUcs4 = (mUcs4 & 0x0F) << 12;
 162         mState = 2;
 163         mBytes = 3;
 164       } else if (0xF0 == (0xF8 & (*in))) {
 165         // First octet of 4 octet sequence
 166         mUcs4 = (PRUint32)(*in);
 167         mUcs4 = (mUcs4 & 0x07) << 18;
 168         mState = 3;
 169         mBytes = 4;
 170       } else if (0xF8 == (0xFC & (*in))) {
 171         /* First octet of 5 octet sequence.
 172          *
 173          * This is illegal because the encoded codepoint must be either
 174          * (a) not the shortest form or
 175          * (b) outside the Unicode range of 0-0x10FFFF.
 176          * Rather than trying to resynchronize, we will carry on until the end
 177          * of the sequence and let the later error handling code catch it.
 178          */
 179         mUcs4 = (PRUint32)(*in);
 180         mUcs4 = (mUcs4 & 0x03) << 24;
 181         mState = 4;
 182         mBytes = 5;
 183       } else if (0xFC == (0xFE & (*in))) {
 184         // First octet of 6 octet sequence, see comments for 5 octet sequence.
 185         mUcs4 = (PRUint32)(*in);
 186         mUcs4 = (mUcs4 & 1) << 30;
 187         mState = 5;
 188         mBytes = 6;
 189       } else {
 190         /* Current octet is neither in the US-ASCII range nor a legal first
 191          * octet of a multi-octet sequence.
 192          *
 193          * Return an error condition. Caller is responsible for flushing and
 194          * refilling the buffer and resetting state.
 195          */
 196         res = NS_ERROR_UNEXPECTED;
 197         break;
 198       }
 199     } else {
 200       // When mState is non-zero, we expect a continuation of the multi-octet
 201       // sequence
 202       if (0x80 == (0xC0 & (*in))) {
 203         // Legal continuation.
 204         PRUint32 shift = (mState - 1) * 6;
 205         PRUint32 tmp = *in;
 206         tmp = (tmp & 0x0000003FL) << shift;
 207         mUcs4 |= tmp;
 208
 209         if (0 == --mState) {
 210           /* End of the multi-octet sequence. mUcs4 now contains the final
 211            * Unicode codepoint to be output
 212            *
 213            * Check for illegal sequences and codepoints.
 214            */
 215
 216           // From Unicode 3.1, non-shortest form is illegal
 217           if (((2 == mBytes) && (mUcs4 < 0x0080)) ||
 218               ((3 == mBytes) && (mUcs4 < 0x0800)) ||
 219               ((4 == mBytes) && (mUcs4 < 0x10000)) ||
 220               (4 < mBytes) ||
 221               // From Unicode 3.2, surrogate characters are illegal
 222               ((mUcs4 & 0xFFFFF800) == 0xD800) ||
 223               // Codepoints outside the Unicode range are illegal
 224               (mUcs4 > 0x10FFFF)) {
 225             res = NS_ERROR_UNEXPECTED;
 226             break;
 227           }
 228           if (mUcs4 > 0xFFFF) {
 229             // mUcs4 is in the range 0x10000 - 0x10FFFF. Output a UTF-16 pair
 230             mUcs4 -= 0x00010000;
 231             *out++ = 0xD800 | (0x000003FF & (mUcs4 >> 10));
 232             *out++ = 0xDC00 | (0x000003FF & mUcs4);
 233           } else if (UNICODE_BYTE_ORDER_MARK != mUcs4 || !mFirst) {
 234             // Don't output the BOM only if it is the first character
 235             *out++ = mUcs4;
 236           }
 237           //initialize UTF8 cache
 238           mUcs4  = 0;
 239           mState = 0;
 240           mBytes = 1;
 241           mFirst = PR_FALSE;
 242         }
 243       } else {
 244         /* ((0xC0 & (*in) != 0x80) && (mState != 0))
 245          *
 246          * Incomplete multi-octet sequence. Unconsume this
 247          * octet and return an error condition. Caller is responsible
 248          * for flushing and refilling the buffer and resetting state.
 249          */
 250         in--;
 251         res = NS_ERROR_UNEXPECTED;
 252         break;
 253       }
 254     }
 255   }
 256
 257   // output not finished, output buffer too short
 258   if ((NS_OK == res) && (in < inend) && (out >= outend))
 259     res = NS_OK_UDEC_MOREOUTPUT;
 260
 261   // last UCS4 is incomplete, make sure the caller
 262   // returns with properly aligned continuation of the buffer
 263   if ((NS_OK == res) && (mState != 0))
 264     res = NS_OK_UDEC_MOREINPUT;
 265
 266   *aSrcLength = in - aSrc;
 267   *aDestLength = out - aDest;
 268
 269   return(res);
 270 }