webcit/decode.c

   1 #include "webcit.h"
   2 #ifdef HAVE_ICONV
   3
   4 /*
   5  * Wrapper around iconv_open()
   6  * Our version adds aliases for non-standard Microsoft charsets
   7  * such as 'MS950', aliasing them to names like 'CP950'
   8  *
   9  * tocode       Target encoding
  10  * fromcode     Source encoding
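 *
 * NOTE(review): the "* /" on the next line does not terminate this comment,
 * so the two-argument definition below stays commented out up to the closing
 * marker that follows its body.  The calls further down in this file pass a
 * third argument (&ic), so the ctdl_iconv_open() actually in use is
 * presumably provided elsewhere -- confirm before re-enabling this copy.
 * /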
  12 iconv_t ctdl_iconv_open(const char *tocode, const char *fromcode)
  13 {
  14         iconv_t ic = (iconv_t)(-1) ;
  15         ic = iconv_open(tocode, fromcode);
  16         if (ic == (iconv_t)(-1) ) {
  17                 char alias_fromcode[64];
  18                 if ( (strlen(fromcode) == 5) && (!strncasecmp(fromcode, "MS", 2)) ) {
  19                         safestrncpy(alias_fromcode, fromcode, sizeof alias_fromcode);
  20                         alias_fromcode[0] = 'C';
  21                         alias_fromcode[1] = 'P';
  22                         ic = iconv_open(tocode, alias_fromcode);
  23                 }
  24         }
  25         return(ic);
  26 }
  27 */
  28
  29
  30 inline char *FindNextEnd (char *bptr)
  31 {
  32         char * end;
  33         /* Find the next ?Q? */
  34         end = strchr(bptr + 2, '?');
  35         if (end == NULL) return NULL;
  36         if (((*(end + 1) == 'B') || (*(end + 1) == 'Q')) &&
  37             (*(end + 2) == '?')) {
  38                 /* skip on to the end of the cluster, the next ?= */
  39                 end = strstr(end + 3, "?=");
  40         }
  41         else
  42                 /* sort of half valid encoding, try to find an end. */
  43                 end = strstr(bptr, "?=");
  44         return end;
  45 }
  46
  47 /*
  48  * Handle subjects with RFC2047 encoding such as:
  49  * =?koi8-r?B?78bP0s3Mxc7JxSDXz9rE1dvO2c3JINvB0sHNySDP?=
  50  */
  51 void utf8ify_rfc822_string(char *buf) {
  52         char *start, *end, *next, *nextend, *ptr;
  53         char newbuf[1024];
  54         char charset[128];
  55         char encoding[16];
  56         char istr[1024];
  57         iconv_t ic = (iconv_t)(-1) ;
  58         char *ibuf;                     /**< Buffer of characters to be converted */
  59         char *obuf;                     /**< Buffer for converted characters */
  60         size_t ibuflen;                 /**< Length of input buffer */
  61         size_t obuflen;                 /**< Length of output buffer */
  62         char *isav;                     /**< Saved pointer to input buffer */
  63         char *osav;                     /**< Saved pointer to output buffer */
  64         int passes = 0;
  65         int i, len, delta;
  66         int illegal_non_rfc2047_encoding = 0;
  67
  68         /* Sometimes, badly formed messages contain strings which were simply
  69          *  written out directly in some foreign character set instead of
  70          *  using RFC2047 encoding.  This is illegal but we will attempt to
  71          *  handle it anyway by converting from a user-specified default
  72          *  charset to UTF-8 if we see any nonprintable characters.
  73          */
  74         len = strlen(buf);
  75         for (i=0; i<len; ++i) {
  76                 if ((buf[i] < 32) || (buf[i] > 126)) {
  77                         illegal_non_rfc2047_encoding = 1;
  78                         i = len; /*< take a shortcut, it won't be more than one. */
  79                 }
  80         }
  81         if (illegal_non_rfc2047_encoding) {
  82                 StrBuf *default_header_charset;
  83                 get_preference("default_header_charset", &default_header_charset);
  84                 if ( (strcasecmp(ChrPtr(default_header_charset), "UTF-8")) &&
  85                      (strcasecmp(ChrPtr(default_header_charset), "us-ascii")) ) {
  86                         ctdl_iconv_open("UTF-8", ChrPtr(default_header_charset), &ic);
  87                         if (ic != (iconv_t)(-1) ) {
  88                                 ibuf = malloc(1024);
  89                                 isav = ibuf;
  90                                 safestrncpy(ibuf, buf, 1024);
  91                                 ibuflen = strlen(ibuf);
  92                                 obuflen = 1024;
  93                                 obuf = (char *) malloc(obuflen);
  94                                 osav = obuf;
  95                                 iconv(ic, &ibuf, &ibuflen, &obuf, &obuflen);
  96                                 osav[1024-obuflen] = 0;
  97                                 strcpy(buf, osav);
  98                                 free(osav);
  99                                 iconv_close(ic);
 100                                 free(isav);
 101                         }
 102                 }
 103         }
 104
 105         /* pre evaluate the first pair */
 106         nextend = end = NULL;
 107         len = strlen(buf);
 108         start = strstr(buf, "=?");
 109         if (start != NULL)
 110                 end = FindNextEnd (start);
 111
 112         while ((start != NULL) && (end != NULL))
 113         {
 114                 next = strstr(end, "=?");
 115                 if (next != NULL)
 116                         nextend = FindNextEnd(next);
 117                 if (nextend == NULL)
 118                         next = NULL;
 119
 120                 /* did we find two partitions */
 121                 if ((next != NULL) &&
 122                     ((next - end) > 2))
 123                 {
 124                         ptr = end + 2;
 125                         while ((ptr < next) &&
 126                                (isspace(*ptr) ||
 127                                 (*ptr == '\r') ||
 128                                 (*ptr == '\n') ||
 129                                 (*ptr == '\t')))
 130                                 ptr ++;
 131                         /* did we find a gab just filled with blanks? */
 132                         if (ptr == next)
 133                         {
 134                                 memmove (end + 2,
 135                                          next,
 136                                          len - (next - start));
 137
 138                                 /* now terminate the gab at the end */
 139                                 delta = (next - end) - 2;
 140                                 len -= delta;
 141                                 buf[len] = '\0';
 142
 143                                 /* move next to its new location. */
 144                                 next -= delta;
 145                                 nextend -= delta;
 146                         }
 147                 }
 148                 /* our next-pair is our new first pair now. */
 149                 start = next;
 150                 end = nextend;
 151         }
 152
 153         /* Now we handle foreign character sets properly encoded
 154          * in RFC2047 format.
 155          */
 156         while (start=strstr(buf, "=?"), end=FindNextEnd((start != NULL)? start : buf),
 157                 ((start != NULL) && (end != NULL) && (end > start)) )
 158         {
 159                 extract_token(charset, start, 1, '?', sizeof charset);
 160                 extract_token(encoding, start, 2, '?', sizeof encoding);
 161                 extract_token(istr, start, 3, '?', sizeof istr);
 162
 163                 ibuf = malloc(1024);
 164                 isav = ibuf;
 165                 if (!strcasecmp(encoding, "B")) {       /**< base64 */
 166                         ibuflen = CtdlDecodeBase64(ibuf, istr, strlen(istr));
 167                 }
 168                 else if (!strcasecmp(encoding, "Q")) {  /**< quoted-printable */
 169                         size_t len;
 170                         long pos;
 171
 172                         len = strlen(istr);
 173                         pos = 0;
 174                         while (pos < len)
 175                         {
 176                                 if (istr[pos] == '_') istr[pos] = ' ';
 177                                 pos++;
 178                         }
 179
 180                         ibuflen = CtdlDecodeQuotedPrintable(ibuf, istr, len);
 181                 }
 182                 else {
 183                         strcpy(ibuf, istr);             /**< unknown encoding */
 184                         ibuflen = strlen(istr);
 185                 }
 186
 187                 ctdl_iconv_open("UTF-8", charset, &ic);
 188                 if (ic != (iconv_t)(-1) ) {
 189                         obuflen = 1024;
 190                         obuf = (char *) malloc(obuflen);
 191                         osav = obuf;
 192                         iconv(ic, &ibuf, &ibuflen, &obuf, &obuflen);
 193                         osav[1024-obuflen] = 0;
 194
 195                         end = start;
 196                         end++;
 197                         strcpy(start, "");
 198                         remove_token(end, 0, '?');
 199                         remove_token(end, 0, '?');
 200                         remove_token(end, 0, '?');
 201                         remove_token(end, 0, '?');
 202                         strcpy(end, &end[1]);
 203
 204                         snprintf(newbuf, sizeof newbuf, "%s%s%s", buf, osav, end);
 205                         strcpy(buf, newbuf);
 206                         free(osav);
 207                         iconv_close(ic);
 208                 }
 209                 else {
 210                         end = start;
 211                         end++;
 212                         strcpy(start, "");
 213                         remove_token(end, 0, '?');
 214                         remove_token(end, 0, '?');
 215                         remove_token(end, 0, '?');
 216                         remove_token(end, 0, '?');
 217                         strcpy(end, &end[1]);
 218
 219                         snprintf(newbuf, sizeof newbuf, "%s(unreadable)%s", buf, end);
 220                         strcpy(buf, newbuf);
 221                 }
 222
 223                 free(isav);
 224
 225                 /*
 226                  * Since spammers will go to all sorts of absurd lengths to get their
 227                  * messages through, there are LOTS of corrupt headers out there.
 228                  * So, prevent a really badly formed RFC2047 header from throwing
 229                  * this function into an infinite loop.
 230                  */
 231                 ++passes;
 232                 if (passes > 20) return;
 233         }
 234
 235 }
 236 #else
 237 inline void utf8ify_rfc822_string(char *a){};
 238
 239 #endif
 240
 241
 242
 243
 244 /**
 245  * \brief       RFC2047-encode a header field if necessary.
 246  *              If no non-ASCII characters are found, the string
 247  *              will be copied verbatim without encoding.
 248  *
 249  * \param       target          Target buffer.
 250  * \param       maxlen          Maximum size of target buffer.
 251  * \param       source          Source string to be encoded.
 252  * \param       SourceLen       Length of the source string
 253  * \returns     encoded length; -1 if non success.
 254  */
 255 int webcit_rfc2047encode(char *target, int maxlen, char *source, long SourceLen)
 256 {
 257         const char headerStr[] = "=?UTF-8?Q?";
 258         int need_to_encode = 0;
 259         int i = 0;
 260         int len;
 261         unsigned char ch;
 262
 263         if ((source == NULL) ||
 264             (target == NULL) ||
 265             (SourceLen > maxlen)) return -1;
 266
 267         while ((!IsEmptyStr (&source[i])) &&
 268                (need_to_encode == 0) &&
 269                (i < SourceLen) ) {
 270                 if (((unsigned char) source[i] < 32) ||
 271                     ((unsigned char) source[i] > 126)) {
 272                         need_to_encode = 1;
 273                 }
 274                 i++;
 275         }
 276
 277         if (!need_to_encode) {
 278                 memcpy (target, source, SourceLen);
 279                 target[SourceLen] = '\0';
 280                 return SourceLen;
 281         }
 282
 283         if (sizeof (headerStr + SourceLen + 2) > maxlen)
 284                 return -1;
 285         memcpy (target, headerStr, sizeof (headerStr));
 286         len = sizeof (headerStr) - 1;
 287         for (i=0; (i < SourceLen) && (len + 3< maxlen) ; ++i) {
 288                 ch = (unsigned char) source[i];
 289                 if ((ch < 32) || (ch > 126) || (ch == 61)) {
 290                         sprintf(&target[len], "=%02X", ch);
 291                         len += 3;
 292                 }
 293                 else {
 294                         sprintf(&target[len], "%c", ch);
 295                         len ++;
 296                 }
 297         }
 298
 299         if (len + 2 < maxlen) {
 300                 strcat(&target[len], "?=");
 301                 len +=2;
 302                 return len;
 303         }
 304         else
 305                 return -1;
 306 }
 307