libpurple/protocols/oscar/encoding.c

   1 /*
   2  * Purple's oscar protocol plugin
   3  * This file is the legal property of its developers.
   4  * Please see the AUTHORS file distributed alongside this file.
   5  *
   6  * This library is free software; you can redistribute it and/or
   7  * modify it under the terms of the GNU Lesser General Public
   8  * License as published by the Free Software Foundation; either
   9  * version 2 of the License, or (at your option) any later version.
  10  *
  11  * This library is distributed in the hope that it will be useful,
  12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14  * Lesser General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU Lesser General Public
  17  * License along with this library; if not, write to the Free Software
  18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111-1301  USA
  19 */
  20
  21 #include "encoding.h"
  22
  23 static gchar *
  24 encoding_multi_convert_to_utf8(const gchar *text, gssize textlen, const gchar *encodings, GError **error, gboolean fallback)
  25 {
  26         gchar *utf8 = NULL;
  27         const gchar *begin = encodings;
  28         const gchar *end = NULL;
  29         gchar *curr_encoding = NULL; /* allocated buffer for encoding name */
  30         const gchar *curr_encoding_ro = NULL; /* read-only encoding name */
  31
  32         if (!encodings) {
  33                 purple_debug_error("oscar", "encodings is NULL");
  34                 return NULL;
  35         }
  36
  37         for (;;)
  38         {
  39                 /* extract next encoding */
  40                 end = strchr(begin, ',');
  41                 if (!end) {
  42                         curr_encoding_ro = begin;
  43                 }       else { /* allocate buffer for encoding */
  44                         curr_encoding = g_strndup(begin, end - begin);
  45                         if (!curr_encoding) {
  46                                 purple_debug_error("oscar", "Error allocating memory for encoding");
  47                                 break;
  48                         }
  49                         curr_encoding_ro = curr_encoding;
  50                 }
  51
  52                 if (!g_ascii_strcasecmp(curr_encoding_ro, "utf-8") && g_utf8_validate(text, textlen, NULL)) {
  53                         break;
  54                 }
  55
  56                 utf8 = g_convert(text, textlen, "UTF-8", curr_encoding_ro, NULL, NULL, NULL);
  57
  58                 if (!end) /* last occurence. do not free curr_encoding: buffer was'nt allocated */
  59                         break;
  60
  61                 g_free(curr_encoding); /* free allocated buffer for encoding here */
  62
  63                 if (utf8) /* text was successfully converted */
  64                         break;
  65
  66                 begin = end + 1;
  67         }
  68
  69         if (!utf8 && fallback)
  70         { /* "begin" points to last encoding */
  71                 utf8 = g_convert_with_fallback(text, textlen, "UTF-8", begin, "?", NULL, NULL, error);
  72         }
  73
  74         return utf8;
  75 }
  76
  77 static gchar *
  78 encoding_extract(const char *encoding)
  79 {
  80         char *begin, *end;
  81
  82         if (encoding == NULL) {
  83                 return NULL;
  84         }
  85
  86         if (!g_str_has_prefix(encoding, "text/aolrtf; charset=") &&
  87                 !g_str_has_prefix(encoding, "text/x-aolrtf; charset=") &&
  88                 !g_str_has_prefix(encoding, "text/plain; charset=")) {
  89                 return g_strdup(encoding);
  90         }
  91
  92         begin = strchr(encoding, '"');
  93         end = strrchr(encoding, '"');
  94
  95         if ((begin == NULL) || (end == NULL) || (begin >= end)) {
  96                 return g_strdup(encoding);
  97         }
  98
  99         return g_strndup(begin+1, (end-1) - begin);
 100 }
 101
 102 gchar *
 103 oscar_encoding_to_utf8(const char *encoding, const char *text, int textlen)
 104 {
 105         gchar *utf8 = NULL;
 106         const gchar *glib_encoding = NULL;
 107         gchar *extracted_encoding = encoding_extract(encoding);
 108
 109         if (extracted_encoding == NULL || *extracted_encoding == '\0') {
 110                 purple_debug_info("oscar", "Empty encoding, assuming UTF-8\n");
 111         } else if (!g_ascii_strcasecmp(extracted_encoding, "iso-8859-1")) {
 112                 glib_encoding = "iso-8859-1";
 113         } else if (!g_ascii_strcasecmp(extracted_encoding, "ISO-8859-1-Windows-3.1-Latin-1") || !g_ascii_strcasecmp(extracted_encoding, "us-ascii")) {
 114                 glib_encoding = "Windows-1252";
 115         } else if (!g_ascii_strcasecmp(extracted_encoding, "unicode-2-0")) {
 116                 glib_encoding = "UTF-16BE";
 117         } else if (g_ascii_strcasecmp(extracted_encoding, "utf-8")) {
 118                 glib_encoding = extracted_encoding;
 119         }
 120
 121         if (glib_encoding != NULL) {
 122                 utf8 = encoding_multi_convert_to_utf8(text, textlen, glib_encoding, NULL, FALSE);
 123         }
 124
 125         /*
 126          * If utf8 is still NULL then either the encoding is utf-8 or
 127          * we have been unable to convert the text to utf-8 from the encoding
 128          * that was specified.  So we check if the text is valid utf-8 then
 129          * just copy it.
 130          */
 131         if (utf8 == NULL) {
 132                 if (textlen != 0 && *text != '\0' && !g_utf8_validate(text, textlen, NULL))
 133                         utf8 = g_strdup(_("(There was an error receiving this message.  The buddy you are speaking with is probably using a different encoding than expected.  If you know what encoding he is using, you can specify it in the advanced account options for your AIM/ICQ account.)"));
 134                 else
 135                         utf8 = g_strndup(text, textlen);
 136         }
 137
 138         g_free(extracted_encoding);
 139         return utf8;
 140 }
 141
 142 gchar *
 143 oscar_utf8_try_convert(PurpleAccount *account, OscarData *od, const gchar *msg)
 144 {
 145         const char *charset = NULL;
 146         char *ret = NULL;
 147
 148         if (msg == NULL)
 149                 return NULL;
 150
 151         if (g_utf8_validate(msg, -1, NULL))
 152                 return g_strdup(msg);
 153
 154         if (od->icq)
 155                 charset = purple_account_get_string(account, "encoding", NULL);
 156
 157         if(charset && *charset)
 158                 ret = encoding_multi_convert_to_utf8(msg, -1, charset, NULL, FALSE);
 159
 160         if(!ret)
 161                 ret = purple_utf8_try_convert(msg);
 162
 163         return ret;
 164 }
 165
 166 static gchar *
 167 oscar_convert_to_utf8(const gchar *data, gsize datalen, const char *charsetstr, gboolean fallback)
 168 {
 169         gchar *ret = NULL;
 170         GError *err = NULL;
 171
 172         if ((charsetstr == NULL) || (*charsetstr == '\0'))
 173                 return NULL;
 174
 175         if (g_ascii_strcasecmp("UTF-8", charsetstr)) {
 176                 ret = encoding_multi_convert_to_utf8(data, datalen, charsetstr, &err, fallback);
 177                 if (err != NULL) {
 178                         purple_debug_warning("oscar", "Conversion from %s failed: %s.\n",
 179                                                            charsetstr, err->message);
 180                         g_error_free(err);
 181                 }
 182         } else {
 183                 if (g_utf8_validate(data, datalen, NULL))
 184                         ret = g_strndup(data, datalen);
 185                 else
 186                         purple_debug_warning("oscar", "String is not valid UTF-8.\n");
 187         }
 188
 189         return ret;
 190 }
 191
 192 gchar *
 193 oscar_decode_im(PurpleAccount *account, const char *sourcebn, guint16 charset, const gchar *data, gsize datalen)
 194 {
 195         gchar *ret = NULL;
 196         /* charsetstr1 is always set to what the correct encoding should be. */
 197         const gchar *charsetstr1, *charsetstr2, *charsetstr3 = NULL;
 198
 199         if ((datalen == 0) || (data == NULL))
 200                 return NULL;
 201
 202         if (charset == AIM_CHARSET_UNICODE) {
 203                 charsetstr1 = "UTF-16BE";
 204                 charsetstr2 = "UTF-8";
 205         } else if (charset == AIM_CHARSET_LATIN_1) {
 206                 if ((sourcebn != NULL) && oscar_util_valid_name_icq(sourcebn))
 207                         charsetstr1 = purple_account_get_string(account, "encoding", OSCAR_DEFAULT_CUSTOM_ENCODING);
 208                 else
 209                         charsetstr1 = "ISO-8859-1";
 210                 charsetstr2 = "UTF-8";
 211         } else if (charset == AIM_CHARSET_ASCII) {
 212                 /* Should just be "ASCII" */
 213                 charsetstr1 = "ASCII";
 214                 charsetstr2 = purple_account_get_string(account, "encoding", OSCAR_DEFAULT_CUSTOM_ENCODING);
 215         } else if (charset == 0x000d) {
 216                 /* iChat sending unicode over a Direct IM connection = UTF-8 */
 217                 /* Mobile AIM client on multiple devices (including Blackberry Tour, Nokia 3100, and LG VX6000) = ISO-8859-1 */
 218                 charsetstr1 = "UTF-8";
 219                 charsetstr2 = "ISO-8859-1";
 220                 charsetstr3 = purple_account_get_string(account, "encoding", OSCAR_DEFAULT_CUSTOM_ENCODING);
 221         } else {
 222                 /* Unknown, hope for valid UTF-8... */
 223                 charsetstr1 = "UTF-8";
 224                 charsetstr2 = purple_account_get_string(account, "encoding", OSCAR_DEFAULT_CUSTOM_ENCODING);
 225         }
 226
 227         purple_debug_info("oscar", "Parsing IM, charset=0x%04hx, datalen=%" G_GSIZE_FORMAT ", choice1=%s, choice2=%s, choice3=%s\n",
 228                                           charset, datalen, charsetstr1, charsetstr2, (charsetstr3 ? charsetstr3 : ""));
 229
 230         ret = oscar_convert_to_utf8(data, datalen, charsetstr1, FALSE);
 231         if (ret == NULL) {
 232                 if (charsetstr3 != NULL) {
 233                         /* Try charsetstr2 without allowing substitutions, then fall through to charsetstr3 if needed */
 234                         ret = oscar_convert_to_utf8(data, datalen, charsetstr2, FALSE);
 235                         if (ret == NULL)
 236                                 ret = oscar_convert_to_utf8(data, datalen, charsetstr3, TRUE);
 237                 } else {
 238                         /* Try charsetstr2, allowing substitutions */
 239                         ret = oscar_convert_to_utf8(data, datalen, charsetstr2, TRUE);
 240                 }
 241         }
 242         if (ret == NULL) {
 243                 char *str, *salvage, *tmp;
 244
 245                 str = g_malloc(datalen + 1);
 246                 strncpy(str, data, datalen);
 247                 str[datalen] = '\0';
 248                 salvage = purple_utf8_salvage(str);
 249                 tmp = g_strdup_printf(_("(There was an error receiving this message.  Either you and %s have different encodings selected, or %s has a buggy client.)"),
 250                                           sourcebn, sourcebn);
 251                 ret = g_strdup_printf("%s %s", salvage, tmp);
 252                 g_free(tmp);
 253                 g_free(str);
 254                 g_free(salvage);
 255         }
 256
 257         return ret;
 258 }
 259
 260 static guint16
 261 get_simplest_charset(const char *utf8)
 262 {
 263         while (*utf8)
 264         {
 265                 if ((unsigned char)(*utf8) > 0x7f) {
 266                         /* not ASCII! */
 267                         return AIM_CHARSET_UNICODE;
 268                 }
 269                 utf8++;
 270         }
 271         return AIM_CHARSET_ASCII;
 272 }
 273
 274 gchar *
 275 oscar_encode_im(const gchar *msg, gsize *result_len, guint16 *charset, gchar **charsetstr)
 276 {
 277         guint16 msg_charset = get_simplest_charset(msg);
 278         if (charset != NULL) {
 279                 *charset = msg_charset;
 280         }
 281         if (charsetstr != NULL) {
 282                 *charsetstr = msg_charset == AIM_CHARSET_ASCII ? "us-ascii" : "unicode-2-0";
 283         }
 284         return g_convert(msg, -1, msg_charset == AIM_CHARSET_ASCII ? "ASCII" : "UTF-16BE", "UTF-8", NULL, result_len, NULL);
 285 }