vfs: check userland buffers before reading them.
[haiku.git] / src / kits / textencoding / utf8_conversions.cpp
blobf7486fc51ff7fc9463f12a2e20ac5de064f995f8
1 /*
2 * Copyright 2003-2008, Haiku, Inc. All Rights Reserved.
3 * Distributed under the terms of the MIT License.
5 * Authors:
6 * Andrew Bachmann
7 */
10 #include <CharacterSet.h>
11 #include <CharacterSetRoster.h>
12 #include <UTF8.h>
14 #include <errno.h>
15 #include <iconv.h>
16 #include <stdio.h>
// Uncomment to have DEBPRINT() forward its arguments to printf().
//#define DEBUG_CONV 1

// DEBPRINT takes a parenthesized printf() argument list, e.g.
// DEBPRINT(("value %d\n", value));
// Both variants expand to exactly one statement that still needs its
// terminating semicolon, so the macro is safe inside an unbraced if/else.
#ifdef DEBUG_CONV
#	define DEBPRINT(ARGS) do { printf ARGS; } while (0)
#else
#	define DEBPRINT(ARGS) do { } while (0)
#endif
// Make the names declared in namespace BPrivate usable unqualified below.
27 using namespace BPrivate;
// Forward declaration of iconvctl().
// NOTE(review): no call to iconvctl() appears in the visible part of this
// file -- confirm the declaration is still needed.
29 int iconvctl(iconv_t icd, int request, void* argument);
// Skips the input character that made iconv() fail.
//
// The length of the offending character is not known in advance, so the
// input is probed with a growing number of bytes, starting with one, until
// iconv() reports something other than "too few input bytes". The conversion
// state is reset before every probe.
//
// conversion:  the conversion descriptor to probe with
// inputBuffer: in/out, advanced past the discarded bytes
// inputLeft:   in/out, reduced by the number of discarded bytes; never
//              more than the bytes that were available
static void
discard_invalid_input_character(iconv_t* conversion, char** inputBuffer,
	size_t* inputLeft)
{
	if (*inputLeft == 0)
		return;

	char outputBuffer[1];

	// skip the invalid input character only
	size_t skip = 1;
	for (; skip <= *inputLeft; skip++) {
		// reset internal state
		iconv(*conversion, NULL, NULL, NULL, NULL);

		char* probe = *inputBuffer;
		char* output = outputBuffer;
		size_t outputLeft = sizeof(outputBuffer);

		// iconv() writes the remaining byte count back through its
		// argument, so it gets a copy and the loop counter stays intact.
		size_t probeLeft = skip;
		size_t size = iconv(*conversion, &probe, &probeLeft,
			&output, &outputLeft);

		if (size != (size_t)-1) {
			// should not reach here
			break;
		}

		if (errno == EILSEQ) {
			// minimal size of input buffer found
			break;
		}

		// EINVAL: too few input bytes provided, try again with one more.
		// Any other error is unexpected and handled the same way.
	}

	// When no probe length settled the matter the counter ran one past the
	// end of the input; never discard more than what is actually there.
	if (skip > *inputLeft)
		skip = *inputLeft;

	*inputBuffer += skip;
	*inputLeft -= skip;
}
77 status_t
78 convert_encoding(const char* from, const char* to, const char* src,
79 int32* srcLen, char* dst, int32* dstLen, int32* state,
80 char substitute)
82 if (*srcLen == 0) {
83 // nothing to do!
84 *dstLen = 0;
85 return B_OK;
88 // TODO: this doesn't work, as the state is reset every time!
89 iconv_t conversion = iconv_open(to, from);
90 if (conversion == (iconv_t)-1) {
91 DEBPRINT(("iconv_open failed\n"));
92 return B_ERROR;
95 size_t outputLeft = *dstLen;
97 if (state == NULL || *state == 0) {
98 if (state != NULL)
99 *state = 1;
101 iconv(conversion, NULL, NULL, &dst, &outputLeft);
104 char** inputBuffer = const_cast<char**>(&src);
105 size_t inputLeft = *srcLen;
106 do {
107 size_t nonReversibleConversions = iconv(conversion, inputBuffer,
108 &inputLeft, &dst, &outputLeft);
109 if (nonReversibleConversions == (size_t)-1) {
110 if (errno == E2BIG) {
111 // Not enough room in the output buffer for the next converted character
112 // This is not a "real" error, we just quit out.
113 break;
116 switch (errno) {
117 case EILSEQ: // unable to generate a corresponding character
119 discard_invalid_input_character(&conversion, inputBuffer,
120 &inputLeft);
122 // prepare to convert the substitute character to target encoding
123 char original = substitute;
124 size_t len = 1;
125 char* copy = &original;
127 // Perform the conversion
128 // We ignore any errors during this as part of robustness/best-effort
129 // We use ISO-8859-1 as a source because it is a single byte encoding
130 // It also overlaps UTF-8 for the lower 128 characters. It is also
131 // likely to have a mapping to almost any target encoding.
132 iconv_t iso8859_1to = iconv_open(to,"ISO-8859-1");
133 if (iso8859_1to != (iconv_t)-1) {
134 iconv(iso8859_1to, 0, 0, 0, 0);
135 iconv(iso8859_1to, &copy, &len, &dst, &outputLeft);
136 iconv_close(iso8859_1to);
138 break;
141 case EINVAL: // incomplete multibyte sequence at the end of the input
142 // TODO inputLeft bytes from inputBuffer should
143 // be stored in state variable, so that conversion
144 // can continue when the caller provides the missing
145 // bytes with the next call of this method
147 // we just eat bad bytes, as part of robustness/best-effort
148 inputBuffer++;
149 inputLeft--;
150 break;
152 default:
153 // unknown error, completely bail
154 status_t status = errno;
155 iconv_close(conversion);
156 return status;
159 } while (inputLeft > 0 && outputLeft > 0);
161 *srcLen -= inputLeft;
162 *dstLen -= outputLeft;
163 iconv_close(conversion);
165 return B_OK;
169 status_t
170 convert_to_utf8(uint32 srcEncoding, const char* src, int32* srcLen,
171 char* dst, int32* dstLen, int32* state, char substitute)
173 const BCharacterSet* charset = BCharacterSetRoster::GetCharacterSetByConversionID(
174 srcEncoding);
175 if (charset == NULL)
176 return B_ERROR;
178 #if DEBUG_CONV
179 fprintf(stderr, "convert_to_utf8(%s) : \"", charset->GetName());
180 for (int i = 0 ; i < *srcLen ; i++) {
181 fprintf(stderr, "%c", src[i]);
183 fprintf(stderr, "\"\n");
184 #endif
186 return convert_encoding(charset->GetName(), "UTF-8", src, srcLen,
187 dst, dstLen, state, substitute);
191 status_t
192 convert_from_utf8(uint32 dstEncoding, const char* src, int32* srcLen,
193 char* dst, int32* dstLen, int32* state, char substitute)
195 const BCharacterSet* charset = BCharacterSetRoster::GetCharacterSetByConversionID(
196 dstEncoding);
197 if (charset == NULL)
198 return B_ERROR;
200 #if DEBUG_CONV
201 fprintf(stderr, "convert_from_utf8(%s) : \"", charset->GetName());
202 for (int i = 0 ; i < *srcLen ; i++) {
203 fprintf(stderr, "%c", src[i]);
205 fprintf(stderr, "\"\n");
206 #endif
208 return convert_encoding("UTF-8", charset->GetName(), src, srcLen,
209 dst, dstLen, state, substitute);