src/kits/textencoding/utf8_conversions.cpp

   1 /*
   2  * Copyright 2003-2008, Haiku, Inc. All Rights Reserved.
   3  * Distributed under the terms of the MIT License.
   4  *
   5  * Authors:
   6  *              Andrew Bachmann
   7  */
   8
   9
  10 #include <CharacterSet.h>
  11 #include <CharacterSetRoster.h>
  12 #include <UTF8.h>
  13
  14 #include <errno.h>
  15 #include <iconv.h>
  16 #include <stdio.h>
  17
  18
  19 //#define DEBUG_CONV 1
  20
  21 #ifdef DEBUG_CONV
  22 #       define DEBPRINT(ARGS) printf ARGS;
  23 #else
  24 #       define DEBPRINT(ARGS) ;
  25 #endif
  26
  27 using namespace BPrivate;
  28
  29 int iconvctl(iconv_t icd, int request, void* argument);
  30
  31
  32 static void
  33 discard_invalid_input_character(iconv_t* conversion, char** inputBuffer,
  34         size_t* inputLeft)
  35 {
  36         if (*inputLeft == 0)
  37                 return;
  38
  39         char outputBuffer[1];
  40
  41         // skip the invalid input character only
  42         size_t left = 1;
  43         for (; left <= *inputLeft; left ++) {
  44                 // reset internal state
  45                 iconv(*conversion, NULL, NULL, NULL, NULL);
  46
  47                 char* buffer = *inputBuffer;
  48                 char* output = outputBuffer;
  49                 size_t outputLeft = 1;
  50                 size_t size = iconv(*conversion, &buffer, &left,
  51                         &output, &outputLeft);
  52
  53                 if (size != (size_t)-1) {
  54                         // should not reach here
  55                         break;
  56                 }
  57
  58                 if (errno == EINVAL) {
  59                         // too few input bytes provided,
  60                         // increase input buffer size and try again
  61                         continue;
  62                 }
  63
  64                 if (errno == EILSEQ) {
  65                         // minimal size of input buffer found
  66                         break;
  67                 }
  68
  69                 // should not reach here
  70         };
  71
  72         *inputBuffer += left;
  73         *inputLeft -= left;
  74 }
  75
  76
  77 status_t
  78 convert_encoding(const char* from, const char* to, const char* src,
  79         int32* srcLen, char* dst, int32* dstLen, int32* state,
  80         char substitute)
  81 {
  82         if (*srcLen == 0) {
  83                 // nothing to do!
  84                 *dstLen = 0;
  85                 return B_OK;
  86         }
  87
  88         // TODO: this doesn't work, as the state is reset every time!
  89         iconv_t conversion = iconv_open(to, from);
  90         if (conversion == (iconv_t)-1) {
  91                 DEBPRINT(("iconv_open failed\n"));
  92                 return B_ERROR;
  93         }
  94
  95         size_t outputLeft = *dstLen;
  96
  97         if (state == NULL || *state == 0) {
  98                 if (state != NULL)
  99                         *state = 1;
 100
 101                 iconv(conversion, NULL, NULL, &dst, &outputLeft);
 102         }
 103
 104         char** inputBuffer = const_cast<char**>(&src);
 105         size_t inputLeft = *srcLen;
 106         do {
 107                 size_t nonReversibleConversions = iconv(conversion, inputBuffer,
 108                         &inputLeft, &dst, &outputLeft);
 109                 if (nonReversibleConversions == (size_t)-1) {
 110                         if (errno == E2BIG) {
 111                                 // Not enough room in the output buffer for the next converted character
 112                                 // This is not a "real" error, we just quit out.
 113                                 break;
 114                         }
 115
 116                         switch (errno) {
 117                                 case EILSEQ: // unable to generate a corresponding character
 118                                 {
 119                                         discard_invalid_input_character(&conversion, inputBuffer,
 120                                                 &inputLeft);
 121
 122                                         // prepare to convert the substitute character to target encoding
 123                                         char original = substitute;
 124                                         size_t len = 1;
 125                                         char* copy = &original;
 126
 127                                         // Perform the conversion
 128                                         // We ignore any errors during this as part of robustness/best-effort
 129                                         // We use ISO-8859-1 as a source because it is a single byte encoding
 130                                         // It also overlaps UTF-8 for the lower 128 characters.  It is also
 131                                         // likely to have a mapping to almost any target encoding.
 132                                         iconv_t iso8859_1to = iconv_open(to,"ISO-8859-1");
 133                                         if (iso8859_1to != (iconv_t)-1) {
 134                                                 iconv(iso8859_1to, 0, 0, 0, 0);
 135                                                 iconv(iso8859_1to, &copy, &len, &dst, &outputLeft);
 136                                                 iconv_close(iso8859_1to);
 137                                         }
 138                                         break;
 139                                 }
 140
 141                                 case EINVAL: // incomplete multibyte sequence at the end of the input
 142                                         // TODO inputLeft bytes from inputBuffer should
 143                                         // be stored in state variable, so that conversion
 144                                         // can continue when the caller provides the missing
 145                                         // bytes with the next call of this method
 146
 147                                         // we just eat bad bytes, as part of robustness/best-effort
 148                                         inputBuffer++;
 149                                         inputLeft--;
 150                                         break;
 151
 152                                 default:
 153                                         // unknown error, completely bail
 154                                         status_t status = errno;
 155                                         iconv_close(conversion);
 156                                         return status;
 157                         }
 158                 }
 159         } while (inputLeft > 0 && outputLeft > 0);
 160
 161         *srcLen -= inputLeft;
 162         *dstLen -= outputLeft;
 163         iconv_close(conversion);
 164
 165         return B_OK;
 166 }
 167
 168
 169 status_t
 170 convert_to_utf8(uint32 srcEncoding, const char* src, int32* srcLen,
 171         char* dst, int32* dstLen, int32* state, char substitute)
 172 {
 173         const BCharacterSet* charset = BCharacterSetRoster::GetCharacterSetByConversionID(
 174                 srcEncoding);
 175         if (charset == NULL)
 176                 return B_ERROR;
 177
 178 #if DEBUG_CONV
 179         fprintf(stderr, "convert_to_utf8(%s) : \"", charset->GetName());
 180         for (int i = 0 ; i < *srcLen ; i++) {
 181                 fprintf(stderr, "%c", src[i]);
 182         }
 183         fprintf(stderr, "\"\n");
 184 #endif
 185
 186         return convert_encoding(charset->GetName(), "UTF-8", src, srcLen,
 187                 dst, dstLen, state, substitute);
 188 }
 189
 190
 191 status_t
 192 convert_from_utf8(uint32 dstEncoding, const char* src, int32* srcLen,
 193         char* dst, int32* dstLen, int32* state, char substitute)
 194 {
 195         const BCharacterSet* charset = BCharacterSetRoster::GetCharacterSetByConversionID(
 196                 dstEncoding);
 197         if (charset == NULL)
 198                 return B_ERROR;
 199
 200 #if DEBUG_CONV
 201         fprintf(stderr, "convert_from_utf8(%s) : \"", charset->GetName());
 202         for (int i = 0 ; i < *srcLen ; i++) {
 203                 fprintf(stderr, "%c", src[i]);
 204         }
 205         fprintf(stderr, "\"\n");
 206 #endif
 207
 208         return convert_encoding("UTF-8", charset->GetName(), src, srcLen,
 209                 dst, dstLen, state, substitute);
 210 }
 211