4 * SMB file system wrapper for AmigaOS, using the AmiTCP V3 API
6 * Copyright (C) 2000-2016 by Olaf `Olsen' Barthel <obarthel -at- gmx -dot- net>
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 2 of the License, or
11 * (at your option) any later version.
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, write to the Free Software
20 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25 #include "utf-8-iso-8859-1-conversion.h"
/* Encode an ISO 8859 Latin 1 character (default character set for
 * the Amiga) in UTF-8 representation (rfc2279). Returns the number
 * of octets written to the buffer (1 or 2), or -2 for 'buffer overflow'
 * in which case no data is written.
 *
 * If the address of the buffer to write to is NULL, then no data
 * will be written and the 'size' parameter is ignored; only the number
 * of octets that would have been written if the buffer address were
 * not NULL will be returned.
 */
int
encode_iso8859_1_as_utf8_char(unsigned char c,unsigned char * string,int size)
{
	int len;

	/* ASCII characters can be encoded as a single octet. */
	if(c < 0x80)
	{
		if(string == NULL || size >= 1)
		{
			if(string != NULL)
				string[0] = c;

			len = 1;
		}
		/* Not enough room... */
		else
		{
			len = -2;
		}
	}
	/* ISO 8859 Latin 1 characters must be encoded as two octets. */
	else
	{
		if(string == NULL || size >= 2)
		{
			if(string != NULL)
			{
				/* 110000xx carries the two most significant bits,
				 * 10xxxxxx carries the remaining six bits.
				 */
				string[0] = 0xc0 | ((c >> 6) & 0x03);
				string[1] = 0x80 | (c & 0x3f);
			}

			len = 2;
		}
		/* Not enough room... */
		else
		{
			len = -2;
		}
	}

	return(len);
}
/****************************************************************************/

/* Data used by the decoder. */
struct utf8_decoding_entry
{
	unsigned char	mask;		/* Mask and pattern are used to identify */
	unsigned char	pattern;	/* the type of multi-octet sequence */
	int				len;		/* Number of octets in the sequence */
	long			first;		/* First and last are for checking the */
	long			last;		/* resulting character against its code range */
};

/****************************************************************************/

/* Decode a character in UTF-8 representation (rfc2279) and return
 * how many bytes contributed to that character (1-6). Returns
 * -1 if the character could not be decoded or -2 if more bytes
 * would be required for decoding than the input buffer holds
 * (this includes an input buffer size of zero or less).
 * Returns -3 if the character was not encoded as the shortest
 * possible UTF-8 sequence.
 *
 * If an error is indicated, no data will be written.
 *
 * If the address of the output buffer to write to is NULL, then no
 * data will be written; only the number of bytes that would have
 * been decoded if the buffer address were not NULL will be returned.
 */
int
decode_utf8_char(const unsigned char * const string,int size,unsigned long * result_ptr)
{
	unsigned long c;
	int len;

	/* Without at least one octet there is nothing to look at. */
	if(size <= 0)
		return(-2);

	/* Assume a seven bit ASCII character. */
	c = string[0];
	len = 1;

	/* Could this be an UTF-8 encoded character? */
	if(c >= 0x80)
	{
		static const struct utf8_decoding_entry utf8_decoding_table[5] =
		{
			{ 0xfe,0xfc,6,0x04000000,0x7FFFFFFF },	/* 1111110x (UCS-4 range 04000000-7FFFFFFF) */
			{ 0xfc,0xf8,5,0x00200000,0x03FFFFFF },	/* 111110xx (UCS-4 range 00200000-03FFFFFF) */
			{ 0xf8,0xf0,4,0x00010000,0x001FFFFF },	/* 11110xxx (UCS-4 range 00010000-001FFFFF) */
			{ 0xf0,0xe0,3,0x00000800,0x0000FFFF },	/* 1110xxxx (UCS-4 range 00000800-0000FFFF) */
			{ 0xe0,0xc0,2,0x00000080,0x000007FF }	/* 110xxxxx (UCS-4 range 00000080-000007FF) */
		};

		unsigned char d;
		int i,j;

		/* Find the bit pattern that corresponds to the
		 * code; if none matches, then we have an
		 * invalid code (e.g. a stray 10xxxxxx octet).
		 */
		len = -1;

		for(i = 0 ; i < 5 ; i++)
		{
			if((c & utf8_decoding_table[i].mask) == utf8_decoding_table[i].pattern)
			{
				/* Strip the encoding pattern and retain
				 * only the payload bits of the first octet.
				 */
				c &= ~utf8_decoding_table[i].mask;

				/* If the character would consist of more octets
				 * than the input buffer holds, we flag an underflow
				 * error.
				 */
				len = utf8_decoding_table[i].len;
				if(len > size)
				{
					len = -2;
					break;
				}

				/* The next few octets contain six bits of
				 * character data each.
				 */
				for(j = 1 ; j < len ; j++)
				{
					d = string[j];

					/* Each octet must be in the form
					 * of 10xxxxxx.
					 */
					if((d & 0xc0) == 0x80)
					{
						c = (c << 6) | (d & 0x3f);
					}
					else
					{
						len = -1;
						break;
					}
				}

				/* Verify that the character was encoded
				 * in the shortest form possible.
				 */
				if(len > 0)
				{
					if(c < (unsigned long)utf8_decoding_table[i].first ||
					   c > (unsigned long)utf8_decoding_table[i].last)
					{
						len = -3;
					}
				}

				/* The lead octet patterns are mutually exclusive,
				 * so there is no point in trying the remaining ones.
				 */
				break;
			}
		}
	}

	/* Hand back the decoded character only on success. */
	if(len > 0 && result_ptr != NULL)
		(*result_ptr) = c;

	return(len);
}
/****************************************************************************/

/* Encode a string of characters in ISO 8859 Latin-1 encoding into
 * UTF-8 representation (rfc2279). Will encode as many characters as
 * will fit into the output buffer, and NUL-terminates the result.
 * Returns the number of octets which make up the UTF-8 encoded
 * string, not counting the terminating NUL.
 *
 * If the address of the output buffer is NULL, then no data will be
 * written; only the number of octets required for encoding the
 * entire input string will be returned.
 */
int
encode_iso8859_1_as_utf8_string(const unsigned char * const from,int from_len,unsigned char * to,int to_size)
{
	int i,char_len,total_len;

	total_len = 0;

	for(i = 0 ; i < from_len ; i++)
	{
		/* One octet of the output buffer is always held back
		 * for the terminating NUL, hence the 'to_size-1'.
		 */
		char_len = encode_iso8859_1_as_utf8_char(from[i],to,to_size-1);

		/* Stop on buffer overflow or error. */
		if(char_len < 0)
			break;

		/* Advance the output position only if there
		 * actually is an output buffer.
		 */
		if(to != NULL)
		{
			to += char_len;
			to_size -= char_len;
		}

		total_len += char_len;
	}

	/* Provide for NUL termination. */
	if(to != NULL && to_size > 0)
		(*to) = '\0';

	return(total_len);
}
/****************************************************************************/

/* Decode a string of characters encoded in UTF-8 representation (rfc2279).
 * Will decode and retain only characters that can be decoded properly
 * and which fit into the ASCII/BMP Latin-1 supplementary range
 * (code points 0-255); characters outside that range are skipped. Will
 * decode as many characters as will fit into the output buffer, and
 * NUL-terminates the result. Returns the number of characters in the
 * output buffer, or -1 for decoding error.
 *
 * Note that decoding will stop once a NUL has been found in the
 * input string to be decoded.
 *
 * If the address of the output buffer is NULL, then no data will be
 * written; only the number of characters which would have been
 * stored will be returned.
 */
int
decode_utf8_as_iso8859_1_string(const unsigned char * const from,int from_len,unsigned char * to,int to_size)
{
	int i,char_len,total_len;
	unsigned long c;

	total_len = 0;
	i = 0;

	/* Process the entire input buffer unless we hit
	 * a NUL byte, run out of output buffer space or
	 * find something which cannot be decoded.
	 */
	while(from_len > 0)
	{
		char_len = decode_utf8_char(&from[i],from_len,&c);
		if(char_len > 0)
		{
			i += char_len;
			from_len -= char_len;

			/* Allow only for ASCII/BMP Latin-1 supplementary
			 * characters; anything else is dropped.
			 */
			if(c < 256)
			{
				/* Is there still enough room for the character
				 * and a terminating NUL byte?
				 */
				if(to == NULL || to_size-1 > 0)
				{
					/* Add this only if it's not the terminating
					 * NUL byte.
					 */
					if(c != 0)
					{
						if(to != NULL)
						{
							(*to++) = (unsigned char)c;
							to_size--;
						}

						total_len++;
					}
					/* Found a terminating NUL byte. */
					else
					{
						break;
					}
				}
				/* No more room in the buffer. */
				else
				{
					break;
				}
			}
		}
		/* Underflow or invalid code. */
		else
		{
			total_len = -1;
			break;
		}
	}

	/* Provide for NUL-termination. */
	if(to != NULL && to_size > 0)
		(*to) = '\0';

	return(total_len);
}