sal/textenc/tcvtutf8.cxx

   1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
   2 /*
   3  * This file is part of the LibreOffice project.
   4  *
   5  * This Source Code Form is subject to the terms of the Mozilla Public
   6  * License, v. 2.0. If a copy of the MPL was not distributed with this
   7  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
   8  *
   9  * This file incorporates work covered by the following license notice:
  10  *
  11  *   Licensed to the Apache Software Foundation (ASF) under one or more
  12  *   contributor license agreements. See the NOTICE file distributed
  13  *   with this work for additional information regarding copyright
  14  *   ownership. The ASF licenses this file to you under the Apache
  15  *   License, Version 2.0 (the "License"); you may not use this file
  16  *   except in compliance with the License. You may obtain a copy of
  17  *   the License at http://www.apache.org/licenses/LICENSE-2.0 .
  18  */
  19
  20 #include <sal/config.h>
  21
  22 #include <sal/types.h>
  23 #include <rtl/textcvt.h>
  24
  25 #include "converter.hxx"
  26 #include "tcvtutf8.hxx"
  27 #include "tenchelp.hxx"
  28 #include "unichars.hxx"
  29
  30 struct ImplUtf8ToUnicodeContext
  31 {
  32     sal_uInt32 nUtf32;
  33     int nBytes;
  34     int nShift;
  35     bool bCheckBom;
  36 };
  37
  38 struct ImplUnicodeToUtf8Context
  39 {
  40     sal_Unicode nHighSurrogate; /* 0xFFFF: write BOM */
  41 };
  42
  43 void * ImplCreateUtf8ToUnicodeContext()
  44 {
  45     ImplUtf8ToUnicodeContext * p = new ImplUtf8ToUnicodeContext;
  46     ImplResetUtf8ToUnicodeContext(p);
  47     return p;
  48 }
  49
  50 void ImplResetUtf8ToUnicodeContext(void * pContext)
  51 {
  52     if (pContext != nullptr)
  53     {
  54         static_cast< ImplUtf8ToUnicodeContext * >(pContext)->nShift = -1;
  55         static_cast< ImplUtf8ToUnicodeContext * >(pContext)->bCheckBom = true;
  56     }
  57 }
  58
  59 void ImplDestroyUtf8ToUnicodeContext(void * pContext)
  60 {
  61     delete static_cast< ImplUtf8ToUnicodeContext * >(pContext);
  62 }
  63
  64 sal_Size ImplConvertUtf8ToUnicode(
  65     void const * pData, void * pContext, char const * pSrcBuf,
  66     sal_Size nSrcBytes, sal_Unicode * pDestBuf, sal_Size nDestChars,
  67     sal_uInt32 nFlags, sal_uInt32 * pInfo, sal_Size * pSrcCvtBytes)
  68 {
  69     bool bJavaUtf8 = pData != nullptr;
  70     sal_uInt32 nUtf32 = 0;
  71     int nBytes = int();
  72     int nShift = -1;
  73     bool bCheckBom = true;
  74     sal_uInt32 nInfo = 0;
  75     unsigned char const * pSrcBufPtr = reinterpret_cast<unsigned char const *>(pSrcBuf);
  76     unsigned char const * pSrcBufEnd = pSrcBufPtr + nSrcBytes;
  77     sal_Unicode * pDestBufPtr = pDestBuf;
  78     sal_Unicode * pDestBufEnd = pDestBufPtr + nDestChars;
  79
  80     if (pContext != nullptr)
  81     {
  82         nUtf32 = static_cast< ImplUtf8ToUnicodeContext * >(pContext)->nUtf32;
  83         nBytes = static_cast< ImplUtf8ToUnicodeContext * >(pContext)->nBytes;
  84         nShift = static_cast< ImplUtf8ToUnicodeContext * >(pContext)->nShift;
  85         bCheckBom = static_cast< ImplUtf8ToUnicodeContext * >(pContext)->bCheckBom;
  86     }
  87
  88     while (pSrcBufPtr < pSrcBufEnd)
  89     {
  90         bool bConsume = true;
  91         sal_uInt32 nChar = *pSrcBufPtr++;
  92         if (nShift < 0)
  93             // Allow (illegal) 5 and 6 byte sequences, so they are read as a
  94             // single individual bad character:
  95             if (nChar <= 0x7F)
  96             {
  97                 nUtf32 = nChar;
  98                 nBytes = 1;
  99                 goto transform;
 100             }
 101             else if (nChar <= 0xBF)
 102                 goto bad_input;
 103             else if (nChar <= 0xDF)
 104             {
 105                 nUtf32 = (nChar & 0x1F) << 6;
 106                 nBytes = 2;
 107                 nShift = 0;
 108             }
 109             else if (nChar <= 0xEF)
 110             {
 111                 nUtf32 = (nChar & 0x0F) << 12;
 112                 nBytes = 3;
 113                 nShift = 6;
 114             }
 115             else if (nChar <= 0xF7)
 116             {
 117                 nUtf32 = (nChar & 0x07) << 18;
 118                 nBytes = 4;
 119                 nShift = 12;
 120             }
 121             else if (nChar <= 0xFB)
 122             {
 123                 nUtf32 = (nChar & 0x03) << 24;
 124                 nBytes = 5;
 125                 nShift = 18;
 126             }
 127             else if (nChar <= 0xFD)
 128             {
 129                 nUtf32 = (nChar & 0x01) << 30;
 130                 nBytes = 6;
 131                 nShift = 24;
 132             }
 133             else
 134                 goto bad_input;
 135         else if ((nChar & 0xC0) == 0x80)
 136         {
 137             nUtf32 |= (nChar & 0x3F) << nShift;
 138             if (nShift == 0)
 139                 goto transform;
 140             else
 141                 nShift -= 6;
 142         }
 143         else
 144         {
 145             /*
 146              This byte is preceded by a broken UTF-8 sequence; if this byte
 147              is neither in the range [0x80..0xBF] nor in the range
 148              [0xFE..0xFF], assume that this byte does not belong to that
 149              broken sequence, but instead starts a new, legal UTF-8 sequence:
 150              */
 151             bConsume = nChar >= 0xFE;
 152             goto bad_input;
 153         }
 154         continue;
 155
 156     transform:
 157         if (!bCheckBom || nUtf32 != 0xFEFF || nBytes != 3
 158             || (nFlags & RTL_TEXTTOUNICODE_FLAGS_GLOBAL_SIGNATURE) == 0
 159             || bJavaUtf8)
 160         {
 161             switch (nBytes) {
 162             case 1:
 163                 if (bJavaUtf8 && nUtf32 == 0) {
 164                     goto bad_input;
 165                 }
 166                 break;
 167             case 2:
 168                 if (nUtf32 < 0x80 && !(bJavaUtf8 && nUtf32 == 0)) {
 169                     goto bad_input;
 170                 }
 171                 break;
 172             case 3:
 173                 if (nUtf32 < 0x800 || (!bJavaUtf8 && rtl::isSurrogate(nUtf32)))
 174                 {
 175                     goto bad_input;
 176                 }
 177                 break;
 178             case 4:
 179                 if (nUtf32 < 0x10000 || !rtl::isUnicodeCodePoint(nUtf32)
 180                     || bJavaUtf8)
 181                 {
 182                     goto bad_input;
 183                 }
 184                 break;
 185             default:
 186                 goto bad_input;
 187             }
 188             if (nUtf32 <= 0xFFFF)
 189                 if (pDestBufPtr != pDestBufEnd)
 190                     *pDestBufPtr++ = static_cast<sal_Unicode>(nUtf32);
 191                 else
 192                     goto no_output;
 193             else if (pDestBufEnd - pDestBufPtr >= 2)
 194             {
 195                 *pDestBufPtr++ = static_cast<sal_Unicode>(ImplGetHighSurrogate(nUtf32));
 196                 *pDestBufPtr++ = static_cast<sal_Unicode>(ImplGetLowSurrogate(nUtf32));
 197             }
 198             else
 199                 goto no_output;
 200         }
 201         nShift = -1;
 202         bCheckBom = false;
 203         continue;
 204
 205     bad_input:
 206         switch (sal::detail::textenc::handleBadInputTextToUnicodeConversion(
 207                     false, nBytes != 1, 0, nFlags, &pDestBufPtr, pDestBufEnd,
 208                     &nInfo))
 209         {
 210         case sal::detail::textenc::BAD_INPUT_STOP:
 211             nShift = -1;
 212             bCheckBom = false;
 213             if (!bConsume)
 214                 --pSrcBufPtr;
 215             break;
 216
 217         case sal::detail::textenc::BAD_INPUT_CONTINUE:
 218             nShift = -1;
 219             bCheckBom = false;
 220             if (!bConsume)
 221                 --pSrcBufPtr;
 222             continue;
 223
 224         case sal::detail::textenc::BAD_INPUT_NO_OUTPUT:
 225             goto no_output;
 226         }
 227         break;
 228
 229     no_output:
 230         --pSrcBufPtr;
 231         nInfo |= RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOOSMALL;
 232         break;
 233     }
 234
 235     if (nShift >= 0
 236         && (nInfo & (RTL_TEXTTOUNICODE_INFO_ERROR
 237                          | RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOOSMALL))
 238                == 0)
 239     {
 240         if ((nFlags & RTL_TEXTTOUNICODE_FLAGS_FLUSH) == 0)
 241             nInfo |= RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOOSMALL;
 242         else
 243             switch (sal::detail::textenc::handleBadInputTextToUnicodeConversion(
 244                         false, true, 0, nFlags, &pDestBufPtr, pDestBufEnd,
 245                         &nInfo))
 246             {
 247             case sal::detail::textenc::BAD_INPUT_STOP:
 248             case sal::detail::textenc::BAD_INPUT_CONTINUE:
 249                 nShift = -1;
 250                 bCheckBom = false;
 251                 break;
 252
 253             case sal::detail::textenc::BAD_INPUT_NO_OUTPUT:
 254                 nInfo |= RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOOSMALL;
 255                 break;
 256             }
 257     }
 258
 259     if (pContext != nullptr)
 260     {
 261         static_cast< ImplUtf8ToUnicodeContext * >(pContext)->nUtf32 = nUtf32;
 262         static_cast< ImplUtf8ToUnicodeContext * >(pContext)->nBytes = nBytes;
 263         static_cast< ImplUtf8ToUnicodeContext * >(pContext)->nShift = nShift;
 264         static_cast< ImplUtf8ToUnicodeContext * >(pContext)->bCheckBom = bCheckBom;
 265     }
 266     if (pInfo != nullptr)
 267         *pInfo = nInfo;
 268     if (pSrcCvtBytes != nullptr)
 269         *pSrcCvtBytes = reinterpret_cast< char const * >(pSrcBufPtr) - pSrcBuf;
 270     return pDestBufPtr - pDestBuf;
 271 }
 272
 273 void * ImplCreateUnicodeToUtf8Context()
 274 {
 275     ImplUnicodeToUtf8Context * p = new ImplUnicodeToUtf8Context;
 276     ImplResetUnicodeToUtf8Context(p);
 277     return p;
 278 }
 279
 280 void ImplResetUnicodeToUtf8Context(void * pContext)
 281 {
 282     if (pContext != nullptr)
 283         static_cast< ImplUnicodeToUtf8Context * >(pContext)->nHighSurrogate = 0xFFFF;
 284 }
 285
 286 void ImplDestroyUnicodeToUtf8Context(void * pContext)
 287 {
 288     delete static_cast< ImplUnicodeToUtf8Context * >(pContext);
 289 }
 290
 291 sal_Size ImplConvertUnicodeToUtf8(
 292     void const * pData, void * pContext, sal_Unicode const * pSrcBuf,
 293     sal_Size nSrcChars, char * pDestBuf, sal_Size nDestBytes, sal_uInt32 nFlags,
 294     sal_uInt32 * pInfo, sal_Size * pSrcCvtChars)
 295 {
 296     bool bJavaUtf8 = pData != nullptr;
 297     sal_Unicode nHighSurrogate = 0xFFFF;
 298     sal_uInt32 nInfo = 0;
 299     sal_Unicode const * pSrcBufPtr = pSrcBuf;
 300     sal_Unicode const * pSrcBufEnd = pSrcBufPtr + nSrcChars;
 301     char * pDestBufPtr = pDestBuf;
 302     char * pDestBufEnd = pDestBufPtr + nDestBytes;
 303
 304     if (pContext != nullptr)
 305         nHighSurrogate
 306             = static_cast< ImplUnicodeToUtf8Context * >(pContext)->nHighSurrogate;
 307
 308     if (nHighSurrogate == 0xFFFF)
 309     {
 310         if ((nFlags & RTL_UNICODETOTEXT_FLAGS_GLOBAL_SIGNATURE) != 0
 311             && !bJavaUtf8)
 312         {
 313             if (pDestBufEnd - pDestBufPtr >= 3)
 314             {
 315                 /* Write BOM (U+FEFF) as UTF-8: */
 316                 *pDestBufPtr++ = static_cast< char >(static_cast< unsigned char >(0xEF));
 317                 *pDestBufPtr++ = static_cast< char >(static_cast< unsigned char >(0xBB));
 318                 *pDestBufPtr++ = static_cast< char >(static_cast< unsigned char >(0xBF));
 319             }
 320             else
 321             {
 322                 nInfo |= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL;
 323                 goto done;
 324             }
 325         }
 326         nHighSurrogate = 0;
 327     }
 328
 329     while (pSrcBufPtr < pSrcBufEnd)
 330     {
 331         sal_uInt32 nChar = *pSrcBufPtr++;
 332         if (nHighSurrogate == 0)
 333         {
 334             if (ImplIsHighSurrogate(nChar) && !bJavaUtf8)
 335             {
 336                 nHighSurrogate = static_cast<sal_Unicode>(nChar);
 337                 continue;
 338             }
 339         }
 340         else if (ImplIsLowSurrogate(nChar) && !bJavaUtf8)
 341             nChar = ImplCombineSurrogates(nHighSurrogate, nChar);
 342         else
 343             goto bad_input;
 344
 345         if ((ImplIsLowSurrogate(nChar) && !bJavaUtf8)
 346             || ImplIsNoncharacter(nChar))
 347             goto bad_input;
 348
 349         if (nChar <= 0x7F && (!bJavaUtf8 || nChar != 0))
 350             if (pDestBufPtr != pDestBufEnd)
 351                 *pDestBufPtr++ = static_cast< char >(nChar);
 352             else
 353                 goto no_output;
 354         else if (nChar <= 0x7FF)
 355             if (pDestBufEnd - pDestBufPtr >= 2)
 356             {
 357                 *pDestBufPtr++ = static_cast< char >(0xC0 | (nChar >> 6));
 358                 *pDestBufPtr++ = static_cast< char >(0x80 | (nChar & 0x3F));
 359             }
 360             else
 361                 goto no_output;
 362         else if (nChar <= 0xFFFF)
 363             if (pDestBufEnd - pDestBufPtr >= 3)
 364             {
 365                 *pDestBufPtr++ = static_cast< char >(0xE0 | (nChar >> 12));
 366                 *pDestBufPtr++ = static_cast< char >(0x80 | ((nChar >> 6) & 0x3F));
 367                 *pDestBufPtr++ = static_cast< char >(0x80 | (nChar & 0x3F));
 368             }
 369             else
 370                 goto no_output;
 371         else if (pDestBufEnd - pDestBufPtr >= 4)
 372         {
 373             *pDestBufPtr++ = static_cast< char >(0xF0 | (nChar >> 18));
 374             *pDestBufPtr++ = static_cast< char >(0x80 | ((nChar >> 12) & 0x3F));
 375             *pDestBufPtr++ = static_cast< char >(0x80 | ((nChar >> 6) & 0x3F));
 376             *pDestBufPtr++ = static_cast< char >(0x80 | (nChar & 0x3F));
 377         }
 378         else
 379             goto no_output;
 380         nHighSurrogate = 0;
 381         continue;
 382
 383     bad_input:
 384         switch (sal::detail::textenc::handleBadInputUnicodeToTextConversion(
 385                     false, 0, nFlags, &pDestBufPtr, pDestBufEnd, &nInfo, nullptr,
 386                     0, nullptr))
 387         {
 388         case sal::detail::textenc::BAD_INPUT_STOP:
 389             nHighSurrogate = 0;
 390             break;
 391
 392         case sal::detail::textenc::BAD_INPUT_CONTINUE:
 393             nHighSurrogate = 0;
 394             continue;
 395
 396         case sal::detail::textenc::BAD_INPUT_NO_OUTPUT:
 397             goto no_output;
 398         }
 399         break;
 400
 401     no_output:
 402         --pSrcBufPtr;
 403         nInfo |= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL;
 404         break;
 405     }
 406
 407     if (nHighSurrogate != 0
 408         && (nInfo & (RTL_UNICODETOTEXT_INFO_ERROR
 409                          | RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL))
 410                == 0)
 411     {
 412         if ((nFlags & RTL_UNICODETOTEXT_FLAGS_FLUSH) != 0)
 413             nInfo |= RTL_UNICODETOTEXT_INFO_SRCBUFFERTOSMALL;
 414         else
 415             switch (sal::detail::textenc::handleBadInputUnicodeToTextConversion(
 416                         false, 0, nFlags, &pDestBufPtr, pDestBufEnd, &nInfo,
 417                         nullptr, 0, nullptr))
 418             {
 419             case sal::detail::textenc::BAD_INPUT_STOP:
 420             case sal::detail::textenc::BAD_INPUT_CONTINUE:
 421                 nHighSurrogate = 0;
 422                 break;
 423
 424             case sal::detail::textenc::BAD_INPUT_NO_OUTPUT:
 425                 nInfo |= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL;
 426                 break;
 427             }
 428     }
 429
 430  done:
 431     if (pContext != nullptr)
 432         static_cast< ImplUnicodeToUtf8Context * >(pContext)->nHighSurrogate
 433             = nHighSurrogate;
 434     if (pInfo != nullptr)
 435         *pInfo = nInfo;
 436     if (pSrcCvtChars != nullptr)
 437         *pSrcCvtChars = pSrcBufPtr - pSrcBuf;
 438     return pDestBufPtr - pDestBuf;
 439 }
 440
 441 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */