workbench/network/smbfs/source_code/utf-8-iso-8859-1-conversion.c

   1 /*
   2  * :ts=4
   3  *
   4  * SMB file system wrapper for AmigaOS, using the AmiTCP V3 API
   5  *
   6  * Copyright (C) 2000-2016 by Olaf `Olsen' Barthel <obarthel -at- gmx -dot- net>
   7  *
   8  * This program is free software; you can redistribute it and/or modify
   9  * it under the terms of the GNU General Public License as published by
  10  * the Free Software Foundation; either version 2 of the License, or
  11  * (at your option) any later version.
  12  *
  13  * This program is distributed in the hope that it will be useful,
  14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  16  * GNU General Public License for more details.
  17  *
  18  * You should have received a copy of the GNU General Public License
  19  * along with this program; if not, write to the Free Software
  20  * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  21  */
  22
  23 #include <stddef.h>
  24
  25 #include "utf-8-iso-8859-1-conversion.h"
  26
  27 /* Encode an ISO 8859 Latin 1 character (default character set for
  28  * the Amiga) in UTF-8 representation (rfc2279). Returns the number
  29  * of characters written to the buffer, or -2 for 'buffer overflow'
  30  * in which case no data is written.
  31  *
  32  * If the address of the buffer to write to is NULL, then no data
  33  * will be written; only the number of bytes that would have been
  34  * written if the buffer address were not NULL will be returned.
  35  */
  36 static int
  37 encode_iso8859_1_as_utf8_char(unsigned char c,unsigned char * string,int size)
  38 {
  39         int len;
  40
  41         if((c & 0x80) == 0)
  42         {
  43                 /* ASCII characters can be encoded as a single octet. */
  44                 if(string == NULL || size >= 1)
  45                 {
  46                         len = 1;
  47
  48                         if(string != NULL)
  49                                 string[0] = c;
  50                 }
  51                 else
  52                 {
  53                         /* Not enough room... */
  54                         len = -2;
  55                 }
  56         }
  57         else
  58         {
  59                 /* ISO 8859 Latin 1 characters must be encoded as two octets. */
  60                 if(string == NULL || size >= 2)
  61                 {
  62                         len = 2;
  63
  64                         if(string != NULL)
  65                         {
  66                                 string[0] = 0xc0 | ((c >> 6) & 0x03);
  67                                 string[1] = 0x80 | (c & 0x3f);
  68                         }
  69                 }
  70                 else
  71                 {
  72                         /* Not enough room... */
  73                         len = -2;
  74                 }
  75         }
  76
  77         return(len);
  78 }
  79
  80 /****************************************************************************/
  81
  82 /* Data used by the decoder. */
  83 struct utf8_decoding_entry
  84 {
  85         unsigned char   mask;           /* Mask and pattern are used to identify */
  86         unsigned char   pattern;        /* the type of multi-octet sequence */
  87         int                             len;            /* Number of octets in the sequence */
  88         long                    first;          /* First and last are for checking the */
  89         long                    last;           /* resulting character against its code range */
  90 };
  91
  92 /****************************************************************************/
  93
  94 /* Decode a character in UTF-8 representation (rfc2279) and return
  95  * how many bytes contributed to that character (1-6). Returns
  96  * -1 if the character could not be decoded or -2 if more bytes
  97  * would be required for decoding than the input buffer holds.
  98  * Returns -3 if the character was not encoded as the shortest
  99  * possible UTF-8 sequence.
 100  *
 101  * If an error is indicated, no data will be written.
 102  *
 103  * If the address of the output buffer to write to is NULL, then no
 104  * data will be written; only the number of bytes that would have
 105  * been decoded if the buffer address were not NULL will be returned.
 106  */
 107 static int
 108 decode_utf8_char(const unsigned char * const string,int size,unsigned long * result_ptr)
 109 {
 110         int len;
 111
 112         if(size > 0)
 113         {
 114                 int c,i;
 115
 116                 /* Assume a seven bit ASCII character. */
 117                 c = string[0];
 118
 119                 /* Could this be an UTF-8 encoded character? */
 120                 if((c & 0x80) != 0)
 121                 {
 122                         static const struct utf8_decoding_entry utf8_decoding_table[5] =
 123                         {
 124                                 { 0xfe,0xfc,6,0x04000000,0x7FFFFFFF }, /* 1111110x (UCS-4 range 04000000-7FFFFFFF) */
 125                                 { 0xfc,0xf8,5,0x00200000,0x03FFFFFF }, /* 111110xx (UCS-4 range 00200000-03FFFFFF) */
 126                                 { 0xf8,0xf0,4,0x00010000,0x001FFFFF }, /* 11110xxx (UCS-4 range 00010000-001FFFFF) */
 127                                 { 0xf0,0xe0,3,0x00000800,0x0000FFFF }, /* 1110xxxx (UCS-4 range 00000800-0000FFFF) */
 128                                 { 0xe0,0xc0,2,0x00000080,0x000007FF }  /* 110xxxxx (UCS-4 range 00000080-000007FF) */
 129                         };
 130
 131                         /* Find the bit pattern that corresponds to the
 132                          * code; if none matches, then we have an
 133                          * invalid code.
 134                          */
 135                         len = -1;
 136
 137                         for(i = 0 ; i < 5 ; i++)
 138                         {
 139                                 if((c & utf8_decoding_table[i].mask) == utf8_decoding_table[i].pattern)
 140                                 {
 141                                         /* Strip the encoding pattern and retain
 142                                          * the 'payload'.
 143                                          */
 144                                         c &= ~utf8_decoding_table[i].mask;
 145
 146                                         /* If the character would consist of more octects
 147                                          * than the input buffer holds, we flag an underflow
 148                                          * error.
 149                                          */
 150                                         len = utf8_decoding_table[i].len;
 151                                         if(len <= size)
 152                                         {
 153                                                 int j,d;
 154
 155                                                 /* The next few octets contain six bits of
 156                                                  * character data each.
 157                                                  */
 158                                                 for(j = 1 ; j < len ; j++)
 159                                                 {
 160                                                         d = string[j];
 161
 162                                                         /* Each octet must be in the form
 163                                                          * of 10xxxxxx.
 164                                                          */
 165                                                         if((d & 0xc0) == 0x80)
 166                                                         {
 167                                                                 c = (c << 6) | (d & 0x3f);
 168                                                         }
 169                                                         else
 170                                                         {
 171                                                                 /* Bad code... */
 172                                                                 len = -1;
 173                                                                 break;
 174                                                         }
 175                                                 }
 176
 177                                                 if(len > 0)
 178                                                 {
 179                                                         /* Verify that the character was encoded
 180                                                          * in the shortest form possible.
 181                                                          */
 182                                                         if(c < utf8_decoding_table[i].first ||
 183                                                            c > utf8_decoding_table[i].last)
 184                                                         {
 185                                                                 len = -3;
 186                                                         }
 187                                                 }
 188                                         }
 189                                         else
 190                                         {
 191                                                 len = -2;
 192                                         }
 193
 194                                         break;
 195                                 }
 196                         }
 197                 }
 198                 else
 199                 {
 200                         len = 1;
 201                 }
 202
 203                 if(len > 0 && result_ptr != NULL)
 204                         (*result_ptr) = c;
 205         }
 206         else
 207         {
 208                 len = 0;
 209         }
 210
 211         return(len);
 212 }
 213
 214 /****************************************************************************/
 215
 216 /* Encode a string of characters in ISO 8859 Latin-1 encoding into
 217  * UTF-8 representation (rfc2279). Will encode as many characters as
 218  * will fit into the output buffer, and NUL-terminates the result.
 219  * Returns the number of UTF-8 characters in the output buffer.
 220  */
 221 int
 222 encode_iso8859_1_as_utf8_string(const unsigned char * const from,int from_len,unsigned char * to,int to_size)
 223 {
 224         int i,char_len,total_len;
 225         int result;
 226
 227         total_len = 0;
 228
 229         for(i = 0 ; i < from_len ; i++)
 230         {
 231                 result = encode_iso8859_1_as_utf8_char(from[i],to,to_size-1);
 232                 if(result < 0)
 233                 {
 234                         /* Stop on buffer overflow or error. */
 235                         goto out;
 236                 }
 237
 238                 char_len = result;
 239
 240                 if(to != NULL)
 241                 {
 242                         to += char_len;
 243
 244                         to_size -= char_len;
 245                 }
 246
 247                 total_len += char_len;
 248         }
 249
 250         /* Provide for NUL termination. */
 251         if(to != NULL && to_size > 0)
 252                 (*to) = '\0';
 253
 254         result = total_len;
 255
 256  out:
 257
 258         return(result);
 259 }
 260
 261 /****************************************************************************/
 262
 263 /* Decode a string of characters encoded in UTF-8 representation (rfc2279).
 264  * Will decode and retain only characters that can be decoded properly
 265  * and which fit into the ASCII/BMP Latin-1 supplementary range. Will
 266  * decode as many characters as will fit into the output buffer, and
 267  * NUL-terminates the result. Returns the number of characters in the
 268  * output buffer, or -1 for decoding error.
 269  *
 270  * Note that decoding will stop once a NUL has been found in the
 271  * input string to be decoded.
 272  */
 273 int
 274 decode_utf8_as_iso8859_1_string(const unsigned char * const from,int from_len,unsigned char * to,int to_size)
 275 {
 276         unsigned long c;
 277         int i,char_len,total_len;
 278         int result = -1;
 279
 280         total_len = 0;
 281
 282         i = 0;
 283
 284         /* Process the entire input buffer unless we hit
 285          * a NUL first.
 286          */
 287         while(from_len > 0)
 288         {
 289                 char_len = decode_utf8_char(&from[i],from_len,&c);
 290                 if(char_len > 0)
 291                 {
 292                         from_len -= char_len;
 293                         i += char_len;
 294
 295                         /* Allow only for ASCII/BMP Latin-1 supplementary
 296                          * characters.
 297                          */
 298                         if(c >= 256)
 299                                 goto out;
 300
 301                         /* Is there still enough room for the character
 302                          * and a terminating NUL byte?
 303                          */
 304                         if(to == NULL || to_size-1 > 0)
 305                         {
 306                                 /* Add this only if it's not the terminating
 307                                  * NUL byte.
 308                                  */
 309                                 if(c != '\0')
 310                                 {
 311                                         if(to != NULL)
 312                                         {
 313                                                 (*to++) = c;
 314
 315                                                 to_size--;
 316                                         }
 317
 318                                         total_len++;
 319                                 }
 320                                 else
 321                                 {
 322                                         /* Found a terminating NUL byte. */
 323                                         break;
 324                                 }
 325                         }
 326                         else
 327                         {
 328                                 /* No more room in the buffer. */
 329                                 if(to != NULL)
 330                                         break;
 331                         }
 332                 }
 333                 else
 334                 {
 335                         /* Underflow or invalid code. */
 336                         goto out;
 337                 }
 338         }
 339
 340         /* Provide for NUL-termination. */
 341         if(to != NULL && to_size > 0)
 342                 (*to) = '\0';
 343
 344         result = total_len;
 345
 346  out:
 347
 348         return(result);
 349 }