sal/textenc/tcvtutf8.cxx

   1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
   2 /*************************************************************************
   3  *
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * Copyright 2000, 2010 Oracle and/or its affiliates.
   7  *
   8  * OpenOffice.org - a multi-platform office productivity suite
   9  *
  10  * This file is part of OpenOffice.org.
  11  *
  12  * OpenOffice.org is free software: you can redistribute it and/or modify
  13  * it under the terms of the GNU Lesser General Public License version 3
  14  * only, as published by the Free Software Foundation.
  15  *
  16  * OpenOffice.org is distributed in the hope that it will be useful,
  17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  19  * GNU Lesser General Public License version 3 for more details
  20  * (a copy is included in the LICENSE file that accompanied this code).
  21  *
  22  * You should have received a copy of the GNU Lesser General Public License
  23  * version 3 along with OpenOffice.org.  If not, see
  24  * <http://www.openoffice.org/license.html>
  25  * for a copy of the LGPLv3 License.
  26  *
  27  ************************************************************************/
  28
  29 #include "sal/config.h"
  30
  31 #include "sal/types.h"
  32 #include "rtl/textcvt.h"
  33
  34 #include "converter.hxx"
  35 #include "tcvtutf8.hxx"
  36 #include "tenchelp.hxx"
  37 #include "unichars.hxx"
  38
/*
   State that ImplConvertUtf8ToUnicode carries from one call to the next, so
   that a UTF-8 byte sequence may be split across source buffers.
 */
struct ImplUtf8ToUnicodeContext
{
    /* Bits of the code point collected so far from the current sequence: */
    sal_uInt32 nUtf32;
    /* Left shift to apply to the payload bits of the next continuation byte,
       or -1 while no multi-byte sequence is in progress: */
    int nShift;
    /* True until the first character (or piece of bad input) has been
       handled; only while it is true can a leading U+FEFF be dropped as a
       signature: */
    bool bCheckBom;
};
  45
/*
   State that ImplConvertUnicodeToUtf8 carries from one call to the next.
 */
struct ImplUnicodeToUtf8Context
{
    /* 0xFFFF: initial state, the BOM has not been dealt with yet (write it
       if requested); 0: no surrogate pending; any other value: a high
       surrogate that is still waiting for its low surrogate. */
    sal_Unicode nHighSurrogate; /* 0xFFFF: write BOM */
};
  50
  51 void * ImplCreateUtf8ToUnicodeContext()
  52 {
  53     ImplUtf8ToUnicodeContext * p = new ImplUtf8ToUnicodeContext;
  54     ImplResetUtf8ToUnicodeContext(p);
  55     return p;
  56 }
  57
  58 void ImplResetUtf8ToUnicodeContext(void * pContext)
  59 {
  60     if (pContext != NULL)
  61     {
  62         static_cast< ImplUtf8ToUnicodeContext * >(pContext)->nShift = -1;
  63         static_cast< ImplUtf8ToUnicodeContext * >(pContext)->bCheckBom = true;
  64     }
  65 }
  66
  67 void ImplDestroyUtf8ToUnicodeContext(void * pContext)
  68 {
  69     delete static_cast< ImplUtf8ToUnicodeContext * >(pContext);
  70 }
  71
  72 sal_Size ImplConvertUtf8ToUnicode(
  73     void const * pData, void * pContext, char const * pSrcBuf,
  74     sal_Size nSrcBytes, sal_Unicode * pDestBuf, sal_Size nDestChars,
  75     sal_uInt32 nFlags, sal_uInt32 * pInfo, sal_Size * pSrcCvtBytes)
  76 {
  77     /*
  78        This function is very liberal with the UTF-8 input.  Accepted are:
  79        - non-shortest forms (e.g., C0 41 instead of 41 to represent U+0041)
  80        - surrogates (e.g., ED A0 80 to represent U+D800)
  81        - encodings with up to six bytes (everything outside the range
  82          U+0000..10FFFF is considered "undefined")
  83        The first two of these points allow this routine to translate from both
  84        RTL_TEXTENCODING_UTF8 and RTL_TEXTENCODING_JAVA_UTF8.
  85       */
  86
  87     int bJavaUtf8 = pData != NULL;
  88     sal_uInt32 nUtf32 = 0;
  89     int nShift = -1;
  90     bool bCheckBom = true;
  91     sal_uInt32 nInfo = 0;
  92     sal_uChar const * pSrcBufPtr = (sal_uChar const *) pSrcBuf;
  93     sal_uChar const * pSrcBufEnd = pSrcBufPtr + nSrcBytes;
  94     sal_Unicode * pDestBufPtr = pDestBuf;
  95     sal_Unicode * pDestBufEnd = pDestBufPtr + nDestChars;
  96
  97     if (pContext != NULL)
  98     {
  99         nUtf32 = static_cast< ImplUtf8ToUnicodeContext * >(pContext)->nUtf32;
 100         nShift = static_cast< ImplUtf8ToUnicodeContext * >(pContext)->nShift;
 101         bCheckBom = static_cast< ImplUtf8ToUnicodeContext * >(pContext)->bCheckBom;
 102     }
 103
 104     while (pSrcBufPtr < pSrcBufEnd)
 105     {
 106         bool bUndefined = false;
 107         int bConsume = true;
 108         sal_uInt32 nChar = *pSrcBufPtr++;
 109         if (nShift < 0)
 110             if (nChar <= 0x7F)
 111             {
 112                 nUtf32 = nChar;
 113                 goto transform;
 114             }
 115             else if (nChar <= 0xBF)
 116                 goto bad_input;
 117             else if (nChar <= 0xDF)
 118             {
 119                 nUtf32 = (nChar & 0x1F) << 6;
 120                 nShift = 0;
 121             }
 122             else if (nChar <= 0xEF)
 123             {
 124                 nUtf32 = (nChar & 0x0F) << 12;
 125                 nShift = 6;
 126             }
 127             else if (nChar <= 0xF7)
 128             {
 129                 nUtf32 = (nChar & 0x07) << 18;
 130                 nShift = 12;
 131             }
 132             else if (nChar <= 0xFB)
 133             {
 134                 nUtf32 = (nChar & 0x03) << 24;
 135                 nShift = 18;
 136             }
 137             else if (nChar <= 0xFD)
 138             {
 139                 nUtf32 = (nChar & 0x01) << 30;
 140                 nShift = 24;
 141             }
 142             else
 143                 goto bad_input;
 144         else if ((nChar & 0xC0) == 0x80)
 145         {
 146             nUtf32 |= (nChar & 0x3F) << nShift;
 147             if (nShift == 0)
 148                 goto transform;
 149             else
 150                 nShift -= 6;
 151         }
 152         else
 153         {
 154             /*
 155              This byte is preceeded by a broken UTF-8 sequence; if this byte
 156              is neither in the range [0x80..0xBF] nor in the range
 157              [0xFE..0xFF], assume that this byte does not belong to that
 158              broken sequence, but instead starts a new, legal UTF-8 sequence:
 159              */
 160             bConsume = nChar >= 0xFE;
 161             goto bad_input;
 162         }
 163         continue;
 164
 165     transform:
 166         if (!bCheckBom || nUtf32 != 0xFEFF
 167             || (nFlags & RTL_TEXTTOUNICODE_FLAGS_GLOBAL_SIGNATURE) == 0
 168             || bJavaUtf8)
 169         {
 170             if (nUtf32 <= 0xFFFF)
 171                 if (pDestBufPtr != pDestBufEnd)
 172                     *pDestBufPtr++ = (sal_Unicode) nUtf32;
 173                 else
 174                     goto no_output;
 175             else if (nUtf32 <= 0x10FFFF)
 176                 if (pDestBufEnd - pDestBufPtr >= 2)
 177                 {
 178                     *pDestBufPtr++ = (sal_Unicode) ImplGetHighSurrogate(nUtf32);
 179                     *pDestBufPtr++ = (sal_Unicode) ImplGetLowSurrogate(nUtf32);
 180                 }
 181                 else
 182                     goto no_output;
 183             else
 184             {
 185                 bUndefined = true;
 186                 goto bad_input;
 187             }
 188         }
 189         nShift = -1;
 190         bCheckBom = false;
 191         continue;
 192
 193     bad_input:
 194         switch (sal::detail::textenc::handleBadInputTextToUnicodeConversion(
 195                     bUndefined, true, 0, nFlags, &pDestBufPtr, pDestBufEnd,
 196                     &nInfo))
 197         {
 198         case sal::detail::textenc::BAD_INPUT_STOP:
 199             nShift = -1;
 200             bCheckBom = false;
 201             if (!bConsume)
 202                 --pSrcBufPtr;
 203             break;
 204
 205         case sal::detail::textenc::BAD_INPUT_CONTINUE:
 206             nShift = -1;
 207             bCheckBom = false;
 208             if (!bConsume)
 209                 --pSrcBufPtr;
 210             continue;
 211
 212         case sal::detail::textenc::BAD_INPUT_NO_OUTPUT:
 213             goto no_output;
 214         }
 215         break;
 216
 217     no_output:
 218         --pSrcBufPtr;
 219         nInfo |= RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL;
 220         break;
 221     }
 222
 223     if (nShift >= 0
 224         && (nInfo & (RTL_TEXTTOUNICODE_INFO_ERROR
 225                          | RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL))
 226                == 0)
 227     {
 228         if ((nFlags & RTL_TEXTTOUNICODE_FLAGS_FLUSH) == 0)
 229             nInfo |= RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOSMALL;
 230         else
 231             switch (sal::detail::textenc::handleBadInputTextToUnicodeConversion(
 232                         false, true, 0, nFlags, &pDestBufPtr, pDestBufEnd,
 233                         &nInfo))
 234             {
 235             case sal::detail::textenc::BAD_INPUT_STOP:
 236             case sal::detail::textenc::BAD_INPUT_CONTINUE:
 237                 nShift = -1;
 238                 bCheckBom = false;
 239                 break;
 240
 241             case sal::detail::textenc::BAD_INPUT_NO_OUTPUT:
 242                 nInfo |= RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL;
 243                 break;
 244             }
 245     }
 246
 247     if (pContext != NULL)
 248     {
 249         static_cast< ImplUtf8ToUnicodeContext * >(pContext)->nUtf32 = nUtf32;
 250         static_cast< ImplUtf8ToUnicodeContext * >(pContext)->nShift = nShift;
 251         static_cast< ImplUtf8ToUnicodeContext * >(pContext)->bCheckBom = bCheckBom;
 252     }
 253     if (pInfo != NULL)
 254         *pInfo = nInfo;
 255     if (pSrcCvtBytes != NULL)
 256         *pSrcCvtBytes = reinterpret_cast< char const * >(pSrcBufPtr) - pSrcBuf;
 257     return pDestBufPtr - pDestBuf;
 258 }
 259
 260 void * ImplCreateUnicodeToUtf8Context()
 261 {
 262     ImplUnicodeToUtf8Context * p = new ImplUnicodeToUtf8Context;
 263     ImplResetUnicodeToUtf8Context(p);
 264     return p;
 265 }
 266
 267 void ImplResetUnicodeToUtf8Context(void * pContext)
 268 {
 269     if (pContext != NULL)
 270         static_cast< ImplUnicodeToUtf8Context * >(pContext)->nHighSurrogate = 0xFFFF;
 271 }
 272
 273 void ImplDestroyUnicodeToUtf8Context(void * pContext)
 274 {
 275     delete static_cast< ImplUnicodeToUtf8Context * >(pContext);
 276 }
 277
 278 sal_Size ImplConvertUnicodeToUtf8(
 279     void const * pData, void * pContext, sal_Unicode const * pSrcBuf,
 280     sal_Size nSrcChars, char * pDestBuf, sal_Size nDestBytes, sal_uInt32 nFlags,
 281     sal_uInt32 * pInfo, sal_Size * pSrcCvtChars)
 282 {
 283     int bJavaUtf8 = pData != NULL;
 284     sal_Unicode nHighSurrogate = 0xFFFF;
 285     sal_uInt32 nInfo = 0;
 286     sal_Unicode const * pSrcBufPtr = pSrcBuf;
 287     sal_Unicode const * pSrcBufEnd = pSrcBufPtr + nSrcChars;
 288     char * pDestBufPtr = pDestBuf;
 289     char * pDestBufEnd = pDestBufPtr + nDestBytes;
 290
 291     if (pContext != NULL)
 292         nHighSurrogate
 293             = static_cast< ImplUnicodeToUtf8Context * >(pContext)->nHighSurrogate;
 294
 295     if (nHighSurrogate == 0xFFFF)
 296     {
 297         if ((nFlags & RTL_UNICODETOTEXT_FLAGS_GLOBAL_SIGNATURE) != 0
 298             && !bJavaUtf8)
 299         {
 300             if (pDestBufEnd - pDestBufPtr >= 3)
 301             {
 302                 /* Write BOM (U+FEFF) as UTF-8: */
 303                 *pDestBufPtr++ = static_cast< char >(static_cast< unsigned char >(0xEF));
 304                 *pDestBufPtr++ = static_cast< char >(static_cast< unsigned char >(0xBB));
 305                 *pDestBufPtr++ = static_cast< char >(static_cast< unsigned char >(0xBF));
 306             }
 307             else
 308             {
 309                 nInfo |= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL;
 310                 goto done;
 311             }
 312         }
 313         nHighSurrogate = 0;
 314     }
 315
 316     while (pSrcBufPtr < pSrcBufEnd)
 317     {
 318         sal_uInt32 nChar = *pSrcBufPtr++;
 319         if (nHighSurrogate == 0)
 320         {
 321             if (ImplIsHighSurrogate(nChar) && !bJavaUtf8)
 322             {
 323                 nHighSurrogate = (sal_Unicode) nChar;
 324                 continue;
 325             }
 326         }
 327         else if (ImplIsLowSurrogate(nChar) && !bJavaUtf8)
 328             nChar = ImplCombineSurrogates(nHighSurrogate, nChar);
 329         else
 330             goto bad_input;
 331
 332         if ((ImplIsLowSurrogate(nChar) && !bJavaUtf8)
 333             || ImplIsNoncharacter(nChar))
 334             goto bad_input;
 335
 336         if (nChar <= 0x7F && (!bJavaUtf8 || nChar != 0))
 337             if (pDestBufPtr != pDestBufEnd)
 338                 *pDestBufPtr++ = static_cast< char >(nChar);
 339             else
 340                 goto no_output;
 341         else if (nChar <= 0x7FF)
 342             if (pDestBufEnd - pDestBufPtr >= 2)
 343             {
 344                 *pDestBufPtr++ = static_cast< char >(0xC0 | (nChar >> 6));
 345                 *pDestBufPtr++ = static_cast< char >(0x80 | (nChar & 0x3F));
 346             }
 347             else
 348                 goto no_output;
 349         else if (nChar <= 0xFFFF)
 350             if (pDestBufEnd - pDestBufPtr >= 3)
 351             {
 352                 *pDestBufPtr++ = static_cast< char >(0xE0 | (nChar >> 12));
 353                 *pDestBufPtr++ = static_cast< char >(0x80 | ((nChar >> 6) & 0x3F));
 354                 *pDestBufPtr++ = static_cast< char >(0x80 | (nChar & 0x3F));
 355             }
 356             else
 357                 goto no_output;
 358         else if (pDestBufEnd - pDestBufPtr >= 4)
 359         {
 360             *pDestBufPtr++ = static_cast< char >(0xF0 | (nChar >> 18));
 361             *pDestBufPtr++ = static_cast< char >(0x80 | ((nChar >> 12) & 0x3F));
 362             *pDestBufPtr++ = static_cast< char >(0x80 | ((nChar >> 6) & 0x3F));
 363             *pDestBufPtr++ = static_cast< char >(0x80 | (nChar & 0x3F));
 364         }
 365         else
 366             goto no_output;
 367         nHighSurrogate = 0;
 368         continue;
 369
 370     bad_input:
 371         switch (sal::detail::textenc::handleBadInputUnicodeToTextConversion(
 372                     false, 0, nFlags, &pDestBufPtr, pDestBufEnd, &nInfo, NULL,
 373                     0, NULL))
 374         {
 375         case sal::detail::textenc::BAD_INPUT_STOP:
 376             nHighSurrogate = 0;
 377             break;
 378
 379         case sal::detail::textenc::BAD_INPUT_CONTINUE:
 380             nHighSurrogate = 0;
 381             continue;
 382
 383         case sal::detail::textenc::BAD_INPUT_NO_OUTPUT:
 384             goto no_output;
 385         }
 386         break;
 387
 388     no_output:
 389         --pSrcBufPtr;
 390         nInfo |= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL;
 391         break;
 392     }
 393
 394     if (nHighSurrogate != 0
 395         && (nInfo & (RTL_UNICODETOTEXT_INFO_ERROR
 396                          | RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL))
 397                == 0)
 398     {
 399         if ((nFlags & RTL_UNICODETOTEXT_FLAGS_FLUSH) != 0)
 400             nInfo |= RTL_UNICODETOTEXT_INFO_SRCBUFFERTOSMALL;
 401         else
 402             switch (sal::detail::textenc::handleBadInputUnicodeToTextConversion(
 403                         false, 0, nFlags, &pDestBufPtr, pDestBufEnd, &nInfo,
 404                         NULL, 0, NULL))
 405             {
 406             case sal::detail::textenc::BAD_INPUT_STOP:
 407             case sal::detail::textenc::BAD_INPUT_CONTINUE:
 408                 nHighSurrogate = 0;
 409                 break;
 410
 411             case sal::detail::textenc::BAD_INPUT_NO_OUTPUT:
 412                 nInfo |= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL;
 413                 break;
 414             }
 415     }
 416
 417  done:
 418     if (pContext != NULL)
 419         static_cast< ImplUnicodeToUtf8Context * >(pContext)->nHighSurrogate
 420             = nHighSurrogate;
 421     if (pInfo != NULL)
 422         *pInfo = nInfo;
 423     if (pSrcCvtChars != NULL)
 424         *pSrcCvtChars = pSrcBufPtr - pSrcBuf;
 425     return pDestBufPtr - pDestBuf;
 426 }
 427
 428 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */