revert between 56095 -> 55830 in arch
[AROS.git] / workbench / network / smbfs / source_code / utf-8-iso-8859-1-conversion.c
blobdb0a6bc6e535216a68f7b220cd3093d93ad95e8c
/*
 * :ts=4
 *
 * SMB file system wrapper for AmigaOS, using the AmiTCP V3 API
 *
 * Copyright (C) 2000-2016 by Olaf `Olsen' Barthel <obarthel -at- gmx -dot- net>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */
23 #include <stddef.h>
25 #include "utf-8-iso-8859-1-conversion.h"
/* Encode a single ISO 8859 Latin 1 character (default character set
 * for the Amiga) in UTF-8 representation (rfc2279).
 *
 * c      - the Latin 1 character to encode
 * string - output buffer, or NULL to only measure
 * size   - number of bytes available in the output buffer
 *
 * Returns the number of bytes written to the buffer (1 for a seven
 * bit ASCII character, 2 for anything else), or -2 for 'buffer
 * overflow' in which case no data is written.
 *
 * If the address of the buffer to write to is NULL, then no data
 * will be written and the size is not checked; only the number of
 * bytes that would have been written if the buffer address were not
 * NULL will be returned.
 */
static int
encode_iso8859_1_as_utf8_char(unsigned char c,unsigned char * string,int size)
{
	int len;

	if((c & 0x80) == 0)
	{
		/* ASCII characters can be encoded as a single octet. */
		if(string == NULL || size >= 1)
		{
			len = 1;

			if(string != NULL)
				string[0] = c;
		}
		else
		{
			/* Not enough room... */
			len = -2;
		}
	}
	else
	{
		/* ISO 8859 Latin 1 characters must be encoded as two octets. */
		if(string == NULL || size >= 2)
		{
			len = 2;

			if(string != NULL)
			{
				/* 110000xx carries the two most significant bits,
				 * 10xxxxxx carries the remaining six bits.
				 */
				string[0] = (unsigned char)(0xc0 | ((c >> 6) & 0x03));
				string[1] = (unsigned char)(0x80 | (c & 0x3f));
			}
		}
		else
		{
			/* Not enough room... */
			len = -2;
		}
	}

	return(len);
}
80 /****************************************************************************/
/* Data used by the decoder. */
struct utf8_decoding_entry
{
	unsigned char	mask;		/* Mask and pattern are used to identify */
	unsigned char	pattern;	/* the type of multi-octet sequence */
	int				len;		/* Number of octets in the sequence */
	long			first;		/* First and last are for checking the */
	long			last;		/* resulting character against its code range */
};

/****************************************************************************/

/* Decode a character in UTF-8 representation (rfc2279) and return
 * how many bytes contributed to that character (1-6).
 *
 * string     - input octets; string[0] is the lead octet
 * size       - number of octets available in the input
 * result_ptr - receives the decoded code, or NULL to only measure
 *
 * Returns 0 if size is not positive, -1 if the character could not
 * be decoded (unknown lead octet or a continuation octet that is not
 * of the form 10xxxxxx), -2 if more bytes would be required for
 * decoding than the input buffer holds, and -3 if the decoded value
 * lies outside the code range of its sequence length, i.e. the
 * character was not encoded as the shortest possible UTF-8 sequence.
 *
 * If an error is indicated, no data will be written.
 *
 * If the address of the output buffer to write to is NULL, then no
 * data will be written; only the number of bytes that would have
 * been decoded if the buffer address were not NULL will be returned.
 */
static int
decode_utf8_char(const unsigned char * const string,int size,unsigned long * result_ptr)
{
	static const struct utf8_decoding_entry utf8_decoding_table[] =
	{
		{ 0xfe,0xfc,6,0x04000000,0x7FFFFFFF },	/* 1111110x (UCS-4 range 04000000-7FFFFFFF) */
		{ 0xfc,0xf8,5,0x00200000,0x03FFFFFF },	/* 111110xx (UCS-4 range 00200000-03FFFFFF) */
		{ 0xf8,0xf0,4,0x00010000,0x001FFFFF },	/* 11110xxx (UCS-4 range 00010000-001FFFFF) */
		{ 0xf0,0xe0,3,0x00000800,0x0000FFFF },	/* 1110xxxx (UCS-4 range 00000800-0000FFFF) */
		{ 0xe0,0xc0,2,0x00000080,0x000007FF }	/* 110xxxxx (UCS-4 range 00000080-000007FF) */
	};

	const int num_entries = (int)(sizeof(utf8_decoding_table) / sizeof(utf8_decoding_table[0]));

	/* The code is accumulated as an unsigned value so that the
	 * repeated left shifts never operate on a signed quantity.
	 */
	unsigned long c;
	int len;
	int i,j;

	if(size <= 0)
		return(0);

	/* Assume a seven bit ASCII character. */
	c = string[0];

	if((c & 0x80) == 0)
	{
		len = 1;
	}
	else
	{
		/* Find the bit pattern that corresponds to the
		 * code; if none matches, then we have an
		 * invalid code.
		 */
		len = -1;

		for(i = 0 ; i < num_entries ; i++)
		{
			const struct utf8_decoding_entry * entry = &utf8_decoding_table[i];

			if((c & entry->mask) != entry->pattern)
				continue;

			/* Strip the encoding pattern and retain
			 * the 'payload'.
			 */
			c &= ~(unsigned long)entry->mask;

			/* If the character would consist of more octets
			 * than the input buffer holds, we flag an underflow
			 * error.
			 */
			if(entry->len > size)
			{
				len = -2;
				break;
			}

			len = entry->len;

			/* The next few octets contain six bits of
			 * character data each.
			 */
			for(j = 1 ; j < entry->len ; j++)
			{
				unsigned int d = string[j];

				/* Each octet must be in the form
				 * of 10xxxxxx.
				 */
				if((d & 0xc0) != 0x80)
				{
					/* Bad code... */
					len = -1;
					break;
				}

				c = (c << 6) | (d & 0x3f);
			}

			/* Verify that the character was encoded
			 * in the shortest form possible. The value
			 * never exceeds 31 bits, so it fits a long.
			 */
			if(len > 0 && ((long)c < entry->first || (long)c > entry->last))
				len = -3;

			break;
		}
	}

	if(len > 0 && result_ptr != NULL)
		(*result_ptr) = c;

	return(len);
}
214 /****************************************************************************/
/* Encode a string of characters in ISO 8859 Latin-1 encoding into
 * UTF-8 representation (rfc2279) and NUL-terminate the result.
 *
 * from     - the Latin-1 characters to encode
 * from_len - number of characters to encode
 * to       - output buffer, or NULL to only measure
 * to_size  - size of the output buffer in bytes, including the
 *            room needed for the terminating NUL
 *
 * Returns the number of bytes which make up the UTF-8 encoded
 * string, not counting the terminating NUL. If a character does
 * not fit into the output buffer, encoding stops and the negative
 * result of encode_iso8859_1_as_utf8_char() is returned instead;
 * the characters encoded up to that point remain in the buffer
 * and are NUL-terminated.
 *
 * If the output buffer address is NULL, nothing is written and
 * only the number of bytes required (without the NUL) is returned.
 */
int
encode_iso8859_1_as_utf8_string(const unsigned char * const from,int from_len,unsigned char * to,int to_size)
{
	int i,char_len,total_len;
	int result = 0;

	total_len = 0;

	for(i = 0 ; i < from_len ; i++)
	{
		/* One byte is always held back for the terminating NUL. */
		result = encode_iso8859_1_as_utf8_char(from[i],to,to_size-1);
		if(result < 0)
		{
			/* Stop on buffer overflow or error. */
			goto out;
		}

		char_len = result;

		if(to != NULL)
		{
			to += char_len;

			to_size -= char_len;
		}

		total_len += char_len;
	}

	result = total_len;

 out:

	/* Provide for NUL termination, also if encoding had to
	 * stop early, so that the buffer never holds an
	 * unterminated string.
	 */
	if(to != NULL && to_size > 0)
		(*to) = '\0';

	return(result);
}
261 /****************************************************************************/
/* Decode a string of characters encoded in UTF-8 representation (rfc2279)
 * into ISO 8859 Latin-1 and NUL-terminate the result.
 *
 * from     - the UTF-8 encoded octets to decode
 * from_len - number of octets to decode
 * to       - output buffer, or NULL to only count
 * to_size  - size of the output buffer in bytes, including the
 *            room needed for the terminating NUL
 *
 * Only characters which decode properly and which fit into the
 * ASCII/BMP Latin-1 supplementary range (codes below 256) are
 * accepted. Decoding stops quietly once the output buffer is full.
 *
 * Returns the number of characters stored in the output buffer (or
 * which would have been stored if the output buffer address is NULL),
 * not counting the terminating NUL. Returns -1 if a sequence could
 * not be decoded or a character outside the Latin-1 range was found;
 * the characters decoded up to that point remain in the buffer and
 * are NUL-terminated.
 *
 * Note that decoding will stop once a NUL has been found in the
 * input string to be decoded.
 */
int
decode_utf8_as_iso8859_1_string(const unsigned char * const from,int from_len,unsigned char * to,int to_size)
{
	unsigned long c;
	int i,char_len,total_len;
	int result = -1;

	total_len = 0;

	i = 0;

	/* Process the entire input buffer unless we hit
	 * a NUL first.
	 */
	while(from_len > 0)
	{
		char_len = decode_utf8_char(&from[i],from_len,&c);
		if(char_len <= 0)
		{
			/* Underflow or invalid code. */
			goto out;
		}

		from_len -= char_len;
		i += char_len;

		/* Allow only for ASCII/BMP Latin-1 supplementary
		 * characters.
		 */
		if(c >= 256)
			goto out;

		/* Is there still enough room for the character
		 * and a terminating NUL byte? This only matters
		 * if there is a buffer to write to.
		 */
		if(to != NULL && to_size-1 <= 0)
		{
			/* No more room in the buffer. */
			break;
		}

		/* Found a terminating NUL byte? */
		if(c == '\0')
			break;

		if(to != NULL)
		{
			/* The range check above guarantees that
			 * the code fits into a single byte.
			 */
			(*to++) = (unsigned char)c;

			to_size--;
		}

		total_len++;
	}

	result = total_len;

 out:

	/* Provide for NUL-termination, also if decoding had to
	 * stop with an error, so that the buffer never holds an
	 * unterminated string.
	 */
	if(to != NULL && to_size > 0)
		(*to) = '\0';

	return(result);
}