sal/textenc/tcvtutf8.cxx

   1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
   2 /*
   3  * This file is part of the LibreOffice project.
   4  *
   5  * This Source Code Form is subject to the terms of the Mozilla Public
   6  * License, v. 2.0. If a copy of the MPL was not distributed with this
   7  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
   8  *
   9  * This file incorporates work covered by the following license notice:
  10  *
  11  *   Licensed to the Apache Software Foundation (ASF) under one or more
  12  *   contributor license agreements. See the NOTICE file distributed
  13  *   with this work for additional information regarding copyright
  14  *   ownership. The ASF licenses this file to you under the Apache
  15  *   License, Version 2.0 (the "License"); you may not use this file
  16  *   except in compliance with the License. You may obtain a copy of
  17  *   the License at http://www.apache.org/licenses/LICENSE-2.0 .
  18  */
  19
  20 #include "sal/config.h"
  21
  22 #include "sal/types.h"
  23 #include "rtl/textcvt.h"
  24
  25 #include "converter.hxx"
  26 #include "tcvtutf8.hxx"
  27 #include "tenchelp.hxx"
  28 #include "unichars.hxx"
  29
  30 struct ImplUtf8ToUnicodeContext
  31 {
  32     sal_uInt32 nUtf32;
  33     int nShift;
  34     bool bCheckBom;
  35 };
  36
  37 struct ImplUnicodeToUtf8Context
  38 {
  39     sal_Unicode nHighSurrogate; /* 0xFFFF: write BOM */
  40 };
  41
  42 void * ImplCreateUtf8ToUnicodeContext()
  43 {
  44     ImplUtf8ToUnicodeContext * p = new ImplUtf8ToUnicodeContext;
  45     ImplResetUtf8ToUnicodeContext(p);
  46     return p;
  47 }
  48
  49 void ImplResetUtf8ToUnicodeContext(void * pContext)
  50 {
  51     if (pContext != NULL)
  52     {
  53         static_cast< ImplUtf8ToUnicodeContext * >(pContext)->nShift = -1;
  54         static_cast< ImplUtf8ToUnicodeContext * >(pContext)->bCheckBom = true;
  55     }
  56 }
  57
  58 void ImplDestroyUtf8ToUnicodeContext(void * pContext)
  59 {
  60     delete static_cast< ImplUtf8ToUnicodeContext * >(pContext);
  61 }
  62
  63 sal_Size ImplConvertUtf8ToUnicode(
  64     void const * pData, void * pContext, char const * pSrcBuf,
  65     sal_Size nSrcBytes, sal_Unicode * pDestBuf, sal_Size nDestChars,
  66     sal_uInt32 nFlags, sal_uInt32 * pInfo, sal_Size * pSrcCvtBytes)
  67 {
  68     /*
  69        This function is very liberal with the UTF-8 input.  Accepted are:
  70        - non-shortest forms (e.g., C0 41 instead of 41 to represent U+0041)
  71        - surrogates (e.g., ED A0 80 to represent U+D800)
  72        - encodings with up to six bytes (everything outside the range
  73          U+0000..10FFFF is considered "undefined")
  74        The first two of these points allow this routine to translate from both
  75        RTL_TEXTENCODING_UTF8 and RTL_TEXTENCODING_JAVA_UTF8.
  76       */
  77
  78     bool bJavaUtf8 = pData != NULL;
  79     sal_uInt32 nUtf32 = 0;
  80     int nShift = -1;
  81     bool bCheckBom = true;
  82     sal_uInt32 nInfo = 0;
  83     unsigned char const * pSrcBufPtr = reinterpret_cast<unsigned char const *>(pSrcBuf);
  84     unsigned char const * pSrcBufEnd = pSrcBufPtr + nSrcBytes;
  85     sal_Unicode * pDestBufPtr = pDestBuf;
  86     sal_Unicode * pDestBufEnd = pDestBufPtr + nDestChars;
  87
  88     if (pContext != NULL)
  89     {
  90         nUtf32 = static_cast< ImplUtf8ToUnicodeContext * >(pContext)->nUtf32;
  91         nShift = static_cast< ImplUtf8ToUnicodeContext * >(pContext)->nShift;
  92         bCheckBom = static_cast< ImplUtf8ToUnicodeContext * >(pContext)->bCheckBom;
  93     }
  94
  95     while (pSrcBufPtr < pSrcBufEnd)
  96     {
  97         bool bUndefined = false;
  98         bool bConsume = true;
  99         sal_uInt32 nChar = *pSrcBufPtr++;
 100         if (nShift < 0)
 101             if (nChar <= 0x7F)
 102             {
 103                 nUtf32 = nChar;
 104                 goto transform;
 105             }
 106             else if (nChar <= 0xBF)
 107                 goto bad_input;
 108             else if (nChar <= 0xDF)
 109             {
 110                 nUtf32 = (nChar & 0x1F) << 6;
 111                 nShift = 0;
 112             }
 113             else if (nChar <= 0xEF)
 114             {
 115                 nUtf32 = (nChar & 0x0F) << 12;
 116                 nShift = 6;
 117             }
 118             else if (nChar <= 0xF7)
 119             {
 120                 nUtf32 = (nChar & 0x07) << 18;
 121                 nShift = 12;
 122             }
 123             else if (nChar <= 0xFB)
 124             {
 125                 nUtf32 = (nChar & 0x03) << 24;
 126                 nShift = 18;
 127             }
 128             else if (nChar <= 0xFD)
 129             {
 130                 nUtf32 = (nChar & 0x01) << 30;
 131                 nShift = 24;
 132             }
 133             else
 134                 goto bad_input;
 135         else if ((nChar & 0xC0) == 0x80)
 136         {
 137             nUtf32 |= (nChar & 0x3F) << nShift;
 138             if (nShift == 0)
 139                 goto transform;
 140             else
 141                 nShift -= 6;
 142         }
 143         else
 144         {
 145             /*
 146              This byte is preceded by a broken UTF-8 sequence; if this byte
 147              is neither in the range [0x80..0xBF] nor in the range
 148              [0xFE..0xFF], assume that this byte does not belong to that
 149              broken sequence, but instead starts a new, legal UTF-8 sequence:
 150              */
 151             bConsume = nChar >= 0xFE;
 152             goto bad_input;
 153         }
 154         continue;
 155
 156     transform:
 157         if (!bCheckBom || nUtf32 != 0xFEFF
 158             || (nFlags & RTL_TEXTTOUNICODE_FLAGS_GLOBAL_SIGNATURE) == 0
 159             || bJavaUtf8)
 160         {
 161             if (nUtf32 <= 0xFFFF)
 162                 if (pDestBufPtr != pDestBufEnd)
 163                     *pDestBufPtr++ = (sal_Unicode) nUtf32;
 164                 else
 165                     goto no_output;
 166             else if (nUtf32 <= 0x10FFFF)
 167                 if (pDestBufEnd - pDestBufPtr >= 2)
 168                 {
 169                     *pDestBufPtr++ = (sal_Unicode) ImplGetHighSurrogate(nUtf32);
 170                     *pDestBufPtr++ = (sal_Unicode) ImplGetLowSurrogate(nUtf32);
 171                 }
 172                 else
 173                     goto no_output;
 174             else
 175             {
 176                 bUndefined = true;
 177                 goto bad_input;
 178             }
 179         }
 180         nShift = -1;
 181         bCheckBom = false;
 182         continue;
 183
 184     bad_input:
 185         switch (sal::detail::textenc::handleBadInputTextToUnicodeConversion(
 186                     bUndefined, true, 0, nFlags, &pDestBufPtr, pDestBufEnd,
 187                     &nInfo))
 188         {
 189         case sal::detail::textenc::BAD_INPUT_STOP:
 190             nShift = -1;
 191             bCheckBom = false;
 192             if (!bConsume)
 193                 --pSrcBufPtr;
 194             break;
 195
 196         case sal::detail::textenc::BAD_INPUT_CONTINUE:
 197             nShift = -1;
 198             bCheckBom = false;
 199             if (!bConsume)
 200                 --pSrcBufPtr;
 201             continue;
 202
 203         case sal::detail::textenc::BAD_INPUT_NO_OUTPUT:
 204             goto no_output;
 205         }
 206         break;
 207
 208     no_output:
 209         --pSrcBufPtr;
 210         nInfo |= RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL;
 211         break;
 212     }
 213
 214     if (nShift >= 0
 215         && (nInfo & (RTL_TEXTTOUNICODE_INFO_ERROR
 216                          | RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL))
 217                == 0)
 218     {
 219         if ((nFlags & RTL_TEXTTOUNICODE_FLAGS_FLUSH) == 0)
 220             nInfo |= RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOSMALL;
 221         else
 222             switch (sal::detail::textenc::handleBadInputTextToUnicodeConversion(
 223                         false, true, 0, nFlags, &pDestBufPtr, pDestBufEnd,
 224                         &nInfo))
 225             {
 226             case sal::detail::textenc::BAD_INPUT_STOP:
 227             case sal::detail::textenc::BAD_INPUT_CONTINUE:
 228                 nShift = -1;
 229                 bCheckBom = false;
 230                 break;
 231
 232             case sal::detail::textenc::BAD_INPUT_NO_OUTPUT:
 233                 nInfo |= RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL;
 234                 break;
 235             }
 236     }
 237
 238     if (pContext != NULL)
 239     {
 240         static_cast< ImplUtf8ToUnicodeContext * >(pContext)->nUtf32 = nUtf32;
 241         static_cast< ImplUtf8ToUnicodeContext * >(pContext)->nShift = nShift;
 242         static_cast< ImplUtf8ToUnicodeContext * >(pContext)->bCheckBom = bCheckBom;
 243     }
 244     if (pInfo != NULL)
 245         *pInfo = nInfo;
 246     if (pSrcCvtBytes != NULL)
 247         *pSrcCvtBytes = reinterpret_cast< char const * >(pSrcBufPtr) - pSrcBuf;
 248     return pDestBufPtr - pDestBuf;
 249 }
 250
 251 void * ImplCreateUnicodeToUtf8Context()
 252 {
 253     ImplUnicodeToUtf8Context * p = new ImplUnicodeToUtf8Context;
 254     ImplResetUnicodeToUtf8Context(p);
 255     return p;
 256 }
 257
 258 void ImplResetUnicodeToUtf8Context(void * pContext)
 259 {
 260     if (pContext != NULL)
 261         static_cast< ImplUnicodeToUtf8Context * >(pContext)->nHighSurrogate = 0xFFFF;
 262 }
 263
 264 void ImplDestroyUnicodeToUtf8Context(void * pContext)
 265 {
 266     delete static_cast< ImplUnicodeToUtf8Context * >(pContext);
 267 }
 268
 269 sal_Size ImplConvertUnicodeToUtf8(
 270     void const * pData, void * pContext, sal_Unicode const * pSrcBuf,
 271     sal_Size nSrcChars, char * pDestBuf, sal_Size nDestBytes, sal_uInt32 nFlags,
 272     sal_uInt32 * pInfo, sal_Size * pSrcCvtChars)
 273 {
 274     bool bJavaUtf8 = pData != NULL;
 275     sal_Unicode nHighSurrogate = 0xFFFF;
 276     sal_uInt32 nInfo = 0;
 277     sal_Unicode const * pSrcBufPtr = pSrcBuf;
 278     sal_Unicode const * pSrcBufEnd = pSrcBufPtr + nSrcChars;
 279     char * pDestBufPtr = pDestBuf;
 280     char * pDestBufEnd = pDestBufPtr + nDestBytes;
 281
 282     if (pContext != NULL)
 283         nHighSurrogate
 284             = static_cast< ImplUnicodeToUtf8Context * >(pContext)->nHighSurrogate;
 285
 286     if (nHighSurrogate == 0xFFFF)
 287     {
 288         if ((nFlags & RTL_UNICODETOTEXT_FLAGS_GLOBAL_SIGNATURE) != 0
 289             && !bJavaUtf8)
 290         {
 291             if (pDestBufEnd - pDestBufPtr >= 3)
 292             {
 293                 /* Write BOM (U+FEFF) as UTF-8: */
 294                 *pDestBufPtr++ = static_cast< char >(static_cast< unsigned char >(0xEF));
 295                 *pDestBufPtr++ = static_cast< char >(static_cast< unsigned char >(0xBB));
 296                 *pDestBufPtr++ = static_cast< char >(static_cast< unsigned char >(0xBF));
 297             }
 298             else
 299             {
 300                 nInfo |= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL;
 301                 goto done;
 302             }
 303         }
 304         nHighSurrogate = 0;
 305     }
 306
 307     while (pSrcBufPtr < pSrcBufEnd)
 308     {
 309         sal_uInt32 nChar = *pSrcBufPtr++;
 310         if (nHighSurrogate == 0)
 311         {
 312             if (ImplIsHighSurrogate(nChar) && !bJavaUtf8)
 313             {
 314                 nHighSurrogate = (sal_Unicode) nChar;
 315                 continue;
 316             }
 317         }
 318         else if (ImplIsLowSurrogate(nChar) && !bJavaUtf8)
 319             nChar = ImplCombineSurrogates(nHighSurrogate, nChar);
 320         else
 321             goto bad_input;
 322
 323         if ((ImplIsLowSurrogate(nChar) && !bJavaUtf8)
 324             || ImplIsNoncharacter(nChar))
 325             goto bad_input;
 326
 327         if (nChar <= 0x7F && (!bJavaUtf8 || nChar != 0))
 328             if (pDestBufPtr != pDestBufEnd)
 329                 *pDestBufPtr++ = static_cast< char >(nChar);
 330             else
 331                 goto no_output;
 332         else if (nChar <= 0x7FF)
 333             if (pDestBufEnd - pDestBufPtr >= 2)
 334             {
 335                 *pDestBufPtr++ = static_cast< char >(0xC0 | (nChar >> 6));
 336                 *pDestBufPtr++ = static_cast< char >(0x80 | (nChar & 0x3F));
 337             }
 338             else
 339                 goto no_output;
 340         else if (nChar <= 0xFFFF)
 341             if (pDestBufEnd - pDestBufPtr >= 3)
 342             {
 343                 *pDestBufPtr++ = static_cast< char >(0xE0 | (nChar >> 12));
 344                 *pDestBufPtr++ = static_cast< char >(0x80 | ((nChar >> 6) & 0x3F));
 345                 *pDestBufPtr++ = static_cast< char >(0x80 | (nChar & 0x3F));
 346             }
 347             else
 348                 goto no_output;
 349         else if (pDestBufEnd - pDestBufPtr >= 4)
 350         {
 351             *pDestBufPtr++ = static_cast< char >(0xF0 | (nChar >> 18));
 352             *pDestBufPtr++ = static_cast< char >(0x80 | ((nChar >> 12) & 0x3F));
 353             *pDestBufPtr++ = static_cast< char >(0x80 | ((nChar >> 6) & 0x3F));
 354             *pDestBufPtr++ = static_cast< char >(0x80 | (nChar & 0x3F));
 355         }
 356         else
 357             goto no_output;
 358         nHighSurrogate = 0;
 359         continue;
 360
 361     bad_input:
 362         switch (sal::detail::textenc::handleBadInputUnicodeToTextConversion(
 363                     false, 0, nFlags, &pDestBufPtr, pDestBufEnd, &nInfo, NULL,
 364                     0, NULL))
 365         {
 366         case sal::detail::textenc::BAD_INPUT_STOP:
 367             nHighSurrogate = 0;
 368             break;
 369
 370         case sal::detail::textenc::BAD_INPUT_CONTINUE:
 371             nHighSurrogate = 0;
 372             continue;
 373
 374         case sal::detail::textenc::BAD_INPUT_NO_OUTPUT:
 375             goto no_output;
 376         }
 377         break;
 378
 379     no_output:
 380         --pSrcBufPtr;
 381         nInfo |= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL;
 382         break;
 383     }
 384
 385     if (nHighSurrogate != 0
 386         && (nInfo & (RTL_UNICODETOTEXT_INFO_ERROR
 387                          | RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL))
 388                == 0)
 389     {
 390         if ((nFlags & RTL_UNICODETOTEXT_FLAGS_FLUSH) != 0)
 391             nInfo |= RTL_UNICODETOTEXT_INFO_SRCBUFFERTOSMALL;
 392         else
 393             switch (sal::detail::textenc::handleBadInputUnicodeToTextConversion(
 394                         false, 0, nFlags, &pDestBufPtr, pDestBufEnd, &nInfo,
 395                         NULL, 0, NULL))
 396             {
 397             case sal::detail::textenc::BAD_INPUT_STOP:
 398             case sal::detail::textenc::BAD_INPUT_CONTINUE:
 399                 nHighSurrogate = 0;
 400                 break;
 401
 402             case sal::detail::textenc::BAD_INPUT_NO_OUTPUT:
 403                 nInfo |= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL;
 404                 break;
 405             }
 406     }
 407
 408  done:
 409     if (pContext != NULL)
 410         static_cast< ImplUnicodeToUtf8Context * >(pContext)->nHighSurrogate
 411             = nHighSurrogate;
 412     if (pInfo != NULL)
 413         *pInfo = nInfo;
 414     if (pSrcCvtChars != NULL)
 415         *pSrcCvtChars = pSrcBufPtr - pSrcBuf;
 416     return pDestBufPtr - pDestBuf;
 417 }
 418
 419 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */