src/kits/mail/mail_util.cpp

   1 /*
   2  * Copyright 2011, Haiku, Inc. All rights reserved.
   3  * Copyright 2001-2003 Dr. Zoidberg Enterprises. All rights reserved.
   4  */
   5
   6
   7 #include <mail_util.h>
   8
   9 #include <stdlib.h>
  10 #include <strings.h>
  11 #include <stdio.h>
  12 #define __USE_GNU
  13 #include <regex.h>
  14 #include <ctype.h>
  15 #include <errno.h>
  16
  17 #include <List.h>
  18 #include <Locker.h>
  19 #include <parsedate.h>
  20 #include <String.h>
  21 #include <UTF8.h>
  22
  23 #include <mail_encoding.h>
  24
  25 #include <CharacterSet.h>
  26 #include <CharacterSetRoster.h>
  27
  28
  29 using namespace BPrivate;
  30
  31
  32 #define CRLF   "\r\n"
  33
// One row of the mail_charsets table: pairs a character set name with the
// conversion constant that belongs to it.
struct CharsetConversionEntry {
        // Character set name such as "iso-8859-1"; NULL in the entry that
        // terminates the table.
        const char *charset;
        // Conversion constant (B_*_CONVERSION style value) for that name.
        uint32 flavor;
};
  38
  39 extern const CharsetConversionEntry mail_charsets[] = {
  40         // In order of authority, so when searching for the name for a particular
  41         // numbered conversion, start at the beginning of the array.
  42         {"iso-8859-1",  B_ISO1_CONVERSION}, // MIME STANDARD
  43         {"iso-8859-2",  B_ISO2_CONVERSION}, // MIME STANDARD
  44         {"iso-8859-3",  B_ISO3_CONVERSION}, // MIME STANDARD
  45         {"iso-8859-4",  B_ISO4_CONVERSION}, // MIME STANDARD
  46         {"iso-8859-5",  B_ISO5_CONVERSION}, // MIME STANDARD
  47         {"iso-8859-6",  B_ISO6_CONVERSION}, // MIME STANDARD
  48         {"iso-8859-7",  B_ISO7_CONVERSION}, // MIME STANDARD
  49         {"iso-8859-8",  B_ISO8_CONVERSION}, // MIME STANDARD
  50         {"iso-8859-9",  B_ISO9_CONVERSION}, // MIME STANDARD
  51         {"iso-8859-10", B_ISO10_CONVERSION}, // MIME STANDARD
  52         {"iso-8859-13", B_ISO13_CONVERSION}, // MIME STANDARD
  53         {"iso-8859-14", B_ISO14_CONVERSION}, // MIME STANDARD
  54         {"iso-8859-15", B_ISO15_CONVERSION}, // MIME STANDARD
  55
  56         {"shift_jis",   B_SJIS_CONVERSION}, // MIME STANDARD
  57         {"shift-jis",   B_SJIS_CONVERSION},
  58         {"iso-2022-jp", B_JIS_CONVERSION}, // MIME STANDARD
  59         {"euc-jp",              B_EUC_CONVERSION}, // MIME STANDARD
  60
  61         {"euc-kr",      B_EUC_KR_CONVERSION}, // Shift encoding 7 bit and KSC-5601 if bit 8 is on. // MIME STANDARD
  62         {"ksc5601",             B_EUC_KR_CONVERSION},    // Not sure if 7 or 8 bit. // COMPATIBLE?
  63         {"ks_c_5601-1987", B_EUC_KR_CONVERSION}, // Not sure if 7 or 8 bit. // COMPATIBLE with stupid MS software
  64
  65         {"koi8-r",      B_KOI8R_CONVERSION},           // MIME STANDARD
  66         {"windows-1251",B_MS_WINDOWS_1251_CONVERSION}, // MIME STANDARD
  67         {"windows-1252",B_MS_WINDOWS_CONVERSION},      // MIME STANDARD
  68
  69         {"dos-437",     B_MS_DOS_CONVERSION},     // WRONG NAME : MIME STANDARD NAME = NONE ( IBM437? )
  70         {"dos-866",     B_MS_DOS_866_CONVERSION}, // WRONG NAME : MIME STANDARD NAME = NONE ( IBM866? )
  71         {"x-mac-roman", B_MAC_ROMAN_CONVERSION},  // WRONG NAME : MIME STANDARD NAME = NONE ( macintosh? + x-mac-roman? )
  72
  73     {"big5",        24}, // MIME STANDARD
  74
  75     {"gb18030",     25}, // WRONG NAME : MIME STANDARD NAME = NONE ( GB18030? )
  76     {"gb2312",      25}, // COMPATIBLE
  77     {"gbk",         25}, // COMPATIBLE
  78
  79         /* {"utf-16",           B_UNICODE_CONVERSION}, Might not work due to NULs in text, needs testing. */
  80         {"us-ascii",    B_MAIL_US_ASCII_CONVERSION},                                  // MIME STANDARD
  81         {"utf-8",               B_MAIL_UTF8_CONVERSION /* Special code for no conversion */}, // MIME STANDARD
  82
  83         {NULL, (uint32) -1} /* End of list marker, NULL string pointer is the key. */
  84 };
  85
  86
// File-scope state.  None of these variables is referenced by the functions
// visible in this part of the file.
// NOTE(review): judging by the names and the <regex.h> include they presumably
// hold a lazily compiled regular expression (pattern buffer, translation
// table, guard counter) used further down -- confirm against the rest of the
// file.
static int32 gLocker = 0;
static size_t gNsub = 1;
static re_pattern_buffer gRe;
static re_pattern_buffer *gRebuf = NULL;
static unsigned char gTranslation[256];
  92
  93
  94 static int
  95 handle_non_rfc2047_encoding(char **buffer, size_t *bufferLength,
  96         size_t *sourceLength)
  97 {
  98         char *string = *buffer;
  99         int32 length = *sourceLength;
 100         int32 i;
 101
 102         // check for 8-bit characters
 103         for (i = 0;i < length;i++)
 104                 if (string[i] & 0x80)
 105                         break;
 106         if (i == length)
 107                 return false;
 108
 109         // check for groups of 8-bit characters - this code is not very smart;
 110         // it just can detect some sort of single-byte encoded stuff, the rest
 111         // is regarded as UTF-8
 112
 113         int32 singletons = 0,doubles = 0;
 114
 115         for (i = 0;i < length;i++)
 116         {
 117                 if (string[i] & 0x80)
 118                 {
 119                         if ((string[i + 1] & 0x80) == 0)
 120                                 singletons++;
 121                         else doubles++;
 122                         i++;
 123                 }
 124         }
 125
 126         if (singletons != 0)    // can't be valid UTF-8 anymore, so we assume ISO-Latin-1
 127         {
 128                 int32 state = 0;
 129                 // just to be sure
 130                 int32 destLength = length * 4 + 1;
 131                 int32 destBufferLength = destLength;
 132                 char *dest = (char*)malloc(destLength);
 133                 if (dest == NULL)
 134                         return 0;
 135
 136                 if (convert_to_utf8(B_ISO1_CONVERSION, string, &length,dest,
 137                         &destLength, &state) == B_OK) {
 138                         *buffer = dest;
 139                         *bufferLength = destBufferLength;
 140                         *sourceLength = destLength;
 141                         return true;
 142                 }
 143                 free(dest);
 144                 return false;
 145         }
 146
 147         // we assume a valid UTF-8 string here, but yes, we don't check it
 148         return true;
 149 }
 150
 151
 152 // #pragma mark -
 153
 154
 155 status_t
 156 write_read_attr(BNode& node, read_flags flag)
 157 {
 158         if (node.WriteAttr(B_MAIL_ATTR_READ, B_INT32_TYPE, 0, &flag, sizeof(int32))
 159                         < 0)
 160                 return B_ERROR;
 161
 162         // manage the status string only if it currently has a "read" status
 163         BString currentStatus;
 164         if (node.ReadAttrString(B_MAIL_ATTR_STATUS, &currentStatus) == B_OK) {
 165                 if (currentStatus.ICompare("New") != 0
 166                         && currentStatus.ICompare("Read") != 0
 167                         && currentStatus.ICompare("Seen") != 0)
 168                         return B_OK;
 169         }
 170
 171         const char* statusString = flag == B_READ ? "Read"
 172                 : flag  == B_SEEN ? "Seen" : "New";
 173         if (node.WriteAttr(B_MAIL_ATTR_STATUS, B_STRING_TYPE, 0, statusString,
 174                         strlen(statusString)) < 0)
 175                 return B_ERROR;
 176
 177         return B_OK;
 178 }
 179
 180
 181 status_t
 182 read_read_attr(BNode& node, read_flags& flag)
 183 {
 184         if (node.ReadAttr(B_MAIL_ATTR_READ, B_INT32_TYPE, 0, &flag, sizeof(int32))
 185                         == sizeof(int32))
 186                 return B_OK;
 187
 188         BString statusString;
 189         if (node.ReadAttrString(B_MAIL_ATTR_STATUS, &statusString) == B_OK) {
 190                 if (statusString.ICompare("New"))
 191                         flag = B_UNREAD;
 192                 else
 193                         flag = B_READ;
 194
 195                 return B_OK;
 196         }
 197
 198         return B_ERROR;
 199 }
 200
 201
// The next two functions are our wrappers around convert_to_utf8 and
// convert_from_utf8, so that they can also "convert" from UTF-8 to UTF-8 when
// the B_MAIL_UTF8_CONVERSION constant is given as the conversion operation.
// They also let us add new conversions, like B_MAIL_US_ASCII_CONVERSION.
 206
 207
 208 status_t
 209 mail_convert_to_utf8(uint32 srcEncoding, const char *src, int32 *srcLen,
 210         char *dst, int32 *dstLen, int32 *state, char substitute)
 211 {
 212         int32 copyAmount;
 213         char *originalDst = dst;
 214         status_t returnCode = -1;
 215
 216         if (srcEncoding == B_MAIL_UTF8_CONVERSION) {
 217                 copyAmount = *srcLen;
 218                 if (*dstLen < copyAmount)
 219                         copyAmount = *dstLen;
 220                 memcpy (dst, src, copyAmount);
 221                 *srcLen = copyAmount;
 222                 *dstLen = copyAmount;
 223                 returnCode = B_OK;
 224         } else if (srcEncoding == B_MAIL_US_ASCII_CONVERSION) {
 225                 int32 i;
 226                 unsigned char letter;
 227                 copyAmount = *srcLen;
 228                 if (*dstLen < copyAmount)
 229                         copyAmount = *dstLen;
 230                 for (i = 0; i < copyAmount; i++) {
 231                         letter = *src++;
 232                         if (letter > 0x80U)
 233                                 // Invalid, could also use substitute, but better to strip high bit.
 234                                 *dst++ = letter - 0x80U;
 235                         else if (letter == 0x80U)
 236                                 // Can't convert to 0x00 since that's NUL, which would cause problems.
 237                                 *dst++ = substitute;
 238                         else
 239                                 *dst++ = letter;
 240                 }
 241                 *srcLen = copyAmount;
 242                 *dstLen = copyAmount;
 243                 returnCode = B_OK;
 244         } else
 245                 returnCode = convert_to_utf8 (srcEncoding, src, srcLen,
 246                         dst, dstLen, state, substitute);
 247
 248         if (returnCode == B_OK) {
 249                 // Replace spurious NUL bytes, which should normally not be in the
 250                 // output of the decoding (not normal UTF-8 characters, and no NULs are
 251                 // in our usual input strings).  They happen for some odd ISO-2022-JP
 252                 // byte pair combinations which are improperly handled by the BeOS
 253                 // routines.  Like "\e$ByD\e(B" where \e is the ESC character $1B, the
 254                 // first ESC $ B switches to a Japanese character set, then the next
 255                 // two bytes "yD" specify a character, then ESC ( B switches back to
 256                 // the ASCII character set.  The UTF-8 conversion yields a NUL byte.
 257                 int32 i;
 258                 for (i = 0; i < *dstLen; i++)
 259                         if (originalDst[i] == 0)
 260                                 originalDst[i] = substitute;
 261         }
 262         return returnCode;
 263 }
 264
 265
 266 status_t
 267 mail_convert_from_utf8(uint32 dstEncoding, const char *src, int32 *srcLen,
 268         char *dst, int32 *dstLen, int32 *state, char substitute)
 269 {
 270         int32 copyAmount;
 271         status_t errorCode;
 272         int32 originalDstLen = *dstLen;
 273         int32 tempDstLen;
 274         int32 tempSrcLen;
 275
 276         if (dstEncoding == B_MAIL_UTF8_CONVERSION) {
 277                 copyAmount = *srcLen;
 278                 if (*dstLen < copyAmount)
 279                         copyAmount = *dstLen;
 280                 memcpy (dst, src, copyAmount);
 281                 *srcLen = copyAmount;
 282                 *dstLen = copyAmount;
 283                 return B_OK;
 284         }
 285
 286         if (dstEncoding == B_MAIL_US_ASCII_CONVERSION) {
 287                 int32 characterLength;
 288                 int32 dstRemaining = *dstLen;
 289                 unsigned char letter;
 290                 int32 srcRemaining = *srcLen;
 291
 292                 // state contains the number of source bytes to skip, left over from a
 293                 // partial UTF-8 character split over the end of the buffer from last
 294                 // time.
 295                 if (srcRemaining <= *state) {
 296                         *state -= srcRemaining;
 297                         *dstLen = 0;
 298                         return B_OK;
 299                 }
 300                 srcRemaining -= *state;
 301                 src += *state;
 302                 *state = 0;
 303
 304                 while (true) {
 305                         if (srcRemaining <= 0 || dstRemaining <= 0)
 306                                 break;
 307                         letter = *src;
 308                         if (letter < 0x80)
 309                                 characterLength = 1; // Regular ASCII equivalent code.
 310                         else if (letter < 0xC0)
 311                                 characterLength = 1; // Invalid in-between data byte 10xxxxxx.
 312                         else if (letter < 0xE0)
 313                                 characterLength = 2;
 314                         else if (letter < 0xF0)
 315                                 characterLength = 3;
 316                         else if (letter < 0xF8)
 317                                 characterLength = 4;
 318                         else if (letter < 0xFC)
 319                                 characterLength = 5;
 320                         else if (letter < 0xFE)
 321                                 characterLength = 6;
 322                         else
 323                                 characterLength = 1; // 0xFE and 0xFF are invalid in UTF-8.
 324                         if (letter < 0x80)
 325                                 *dst++ = *src;
 326                         else
 327                                 *dst++ = substitute;
 328                         dstRemaining--;
 329                         if (srcRemaining < characterLength) {
 330                                 // Character split past the end of the buffer.
 331                                 *state = characterLength - srcRemaining;
 332                                 srcRemaining = 0;
 333                         } else {
 334                                 src += characterLength;
 335                                 srcRemaining -= characterLength;
 336                         }
 337                 }
 338                 // Update with the amounts used.
 339                 *srcLen = *srcLen - srcRemaining;
 340                 *dstLen = *dstLen - dstRemaining;
 341                 return B_OK;
 342         }
 343
 344         errorCode = convert_from_utf8(dstEncoding, src, srcLen, dst, dstLen, state,
 345                 substitute);
 346         if (errorCode != B_OK)
 347                 return errorCode;
 348
 349         if (dstEncoding != B_JIS_CONVERSION)
 350                 return B_OK;
 351
 352         // B_JIS_CONVERSION (ISO-2022-JP) works by shifting between different
 353         // character subsets.  For E-mail headers (and other uses), it needs to be
 354         // switched back to ASCII at the end (otherwise the last character gets
 355         // lost or other weird things happen in the headers).  Note that we can't
 356         // just append the escape code since the convert_from_utf8 "state" will be
 357         // wrong.  So we append an ASCII letter and throw it away, leaving just the
 358         // escape code.  Well, it actually switches to the Roman character set, not
 359         // ASCII, but that should be OK.
 360
 361         tempDstLen = originalDstLen - *dstLen;
 362         if (tempDstLen < 3) // Not enough space remaining in the output.
 363                 return B_OK; // Sort of an error, but we did convert the rest OK.
 364         tempSrcLen = 1;
 365         errorCode = convert_from_utf8(dstEncoding, "a", &tempSrcLen,
 366                 dst + *dstLen, &tempDstLen, state, substitute);
 367         if (errorCode != B_OK)
 368                 return errorCode;
 369         *dstLen += tempDstLen - 1 /* don't include the ASCII letter */;
 370         return B_OK;
 371 }
 372
 373
 374 ssize_t
 375 rfc2047_to_utf8(char **bufp, size_t *bufLen, size_t strLen)
 376 {
 377         char *head, *tail;
 378         char *charset, *encoding, *end;
 379         ssize_t ret = B_OK;
 380
 381         if (bufp == NULL || *bufp == NULL)
 382                 return -1;
 383
 384         char *string = *bufp;
 385
 386         //---------Handle *&&^%*&^ non-RFC compliant, 8bit mail
 387         if (handle_non_rfc2047_encoding(bufp,bufLen,&strLen))
 388                 return strLen;
 389
 390         // set up string length
 391         if (strLen == 0)
 392                 strLen = strlen(*bufp);
 393         char lastChar = (*bufp)[strLen];
 394         (*bufp)[strLen] = '\0';
 395
 396         //---------Whew! Now for RFC compliant mail
 397         bool encodedWordFoundPreviously = false;
 398         for (head = tail = string;
 399                 ((charset = strstr(tail, "=?")) != NULL)
 400                 && (((encoding = strchr(charset + 2, '?')) != NULL)
 401                         && encoding[1] && (encoding[2] == '?') && encoding[3])
 402                 && (end = strstr(encoding + 3, "?=")) != NULL;
 403                 // found "=?...charset...?e?...text...?=   (e == encoding)
 404                 //        ^charset       ^encoding    ^end
 405                 tail = end)
 406         {
 407                 // Copy non-encoded text (from tail up to charset) to the output.
 408                 // Ignore spaces between two encoded "words".  RFC2047 says the words
 409                 // should be concatenated without the space (designed for Asian
 410                 // sentences which have no spaces yet need to be broken into "words" to
 411                 // keep within the line length limits).
 412                 bool nonSpaceFound = false;
 413                 for (int i = 0; i < charset-tail; i++) {
 414                         if (!isspace (tail[i])) {
 415                                 nonSpaceFound = true;
 416                                 break;
 417                         }
 418                 }
 419                 if (!encodedWordFoundPreviously || nonSpaceFound) {
 420                         if (string != tail && tail != charset)
 421                                 memmove(string, tail, charset-tail);
 422                         string += charset-tail;
 423                 }
 424                 tail = charset;
 425                 encodedWordFoundPreviously = true;
 426
 427                 // move things to point at what they should:
 428                 //   =?...charset...?e?...text...?=   (e == encoding)
 429                 //     ^charset      ^encoding     ^end
 430                 charset += 2;
 431                 encoding += 1;
 432                 end += 2;
 433
 434                 // find the charset this text is in now
 435                 size_t cLen = encoding - 1 - charset;
 436                 bool base64encoded = toupper(*encoding) == 'B';
 437
 438                 uint32 convertID = B_MAIL_NULL_CONVERSION;
 439                 char charsetName[cLen + 1];
 440                 memcpy(charsetName, charset, cLen);
 441                 charsetName[cLen] = '\0';
 442                 if (strcasecmp(charsetName, "us-ascii") == 0) {
 443                         convertID = B_MAIL_US_ASCII_CONVERSION;
 444                 } else if (strcasecmp(charsetName, "utf-8") == 0) {
 445                         convertID = B_MAIL_UTF8_CONVERSION;
 446                 } else {
 447                         const BCharacterSet* charSet
 448                                 = BCharacterSetRoster::FindCharacterSetByName(charsetName);
 449                         if (charSet != NULL) {
 450                                 convertID = charSet->GetConversionID();
 451                         }
 452                 }
 453                 if (convertID == B_MAIL_NULL_CONVERSION) {
 454                         // unidentified charset
 455                         // what to do? doing nothing skips the encoded text;
 456                         // but we should keep it: we copy it to the output.
 457                         if (string != tail && tail != end)
 458                                 memmove(string, tail, end-tail);
 459                         string += end-tail;
 460                         continue;
 461                 }
 462                 // else we've successfully identified the charset
 463
 464                 char *src = encoding+2;
 465                 int32 srcLen = end - 2 - src;
 466                 // encoded text: src..src+srcLen
 467
 468                 // decode text, get decoded length (reducing xforms)
 469                 srcLen = !base64encoded ? decode_qp(src, src, srcLen, 1)
 470                         : decode_base64(src, src, srcLen);
 471
 472                 // allocate space for the converted text
 473                 int32 dstLen = end-string + *bufLen-strLen;
 474                 char *dst = (char*)malloc(dstLen);
 475                 int32 cvLen = srcLen;
 476                 int32 convState = 0;
 477
 478                 //
 479                 // do the conversion
 480                 //
 481                 ret = mail_convert_to_utf8(convertID, src, &cvLen, dst, &dstLen,
 482                         &convState);
 483                 if (ret != B_OK) {
 484                         // what to do? doing nothing skips the encoded text
 485                         // but we should keep it: we copy it to the output.
 486
 487                         free(dst);
 488
 489                         if (string != tail && tail != end)
 490                                 memmove(string, tail, end-tail);
 491                         string += end-tail;
 492                         continue;
 493                 }
 494                 /* convert_to_ is either returning something wrong or my
 495                    test data is screwed up.  Whatever it is, Not Enough
 496                    Space is not the only cause of the below, so we just
 497                    assume it succeeds if it converts anything at all.
 498                 else if (cvLen < srcLen)
 499                 {
 500                         // not enough room to convert the data;
 501                         // grow *buf and retry
 502
 503                         free(dst);
 504
 505                         char *temp = (char*)realloc(*bufp, 2*(*bufLen + 1));
 506                         if (temp == NULL)
 507                         {
 508                                 ret = B_NO_MEMORY;
 509                                 break;
 510                         }
 511
 512                         *bufp = temp;
 513                         *bufLen = 2*(*bufLen + 1);
 514
 515                         string = *bufp + (string-head);
 516                         tail = *bufp + (tail-head);
 517                         charset = *bufp + (charset-head);
 518                         encoding = *bufp + (encoding-head);
 519                         end = *bufp + (end-head);
 520                         src = *bufp + (src-head);
 521                         head = *bufp;
 522                         continue;
 523                 }
 524                 */
 525                 else {
 526                         if (dstLen > end-string) {
 527                                 // copy the string forward...
 528                                 memmove(string+dstLen, end, strLen - (end-head) + 1);
 529                                 strLen += string+dstLen - end;
 530                                 end = string + dstLen;
 531                         }
 532
 533                         memcpy(string, dst, dstLen);
 534                         string += dstLen;
 535                         free(dst);
 536                         continue;
 537                 }
 538         }
 539
 540         // copy everything that's left
 541         size_t tailLen = strLen - (tail - head);
 542         memmove(string, tail, tailLen+1);
 543         string += tailLen;
 544
 545         // replace the last char
 546         (*bufp)[strLen] = lastChar;
 547
 548         return ret < B_OK ? ret : string-head;
 549 }
 550
 551
 552 ssize_t
 553 utf8_to_rfc2047 (char **bufp, ssize_t length, uint32 charset, char encoding)
 554 {
 555         struct word {
 556                 BString originalWord;
 557                 BString convertedWord;
 558                 bool    needsEncoding;
 559
 560                 // Convert the word from UTF-8 to the desired character set.  The
 561                 // converted version also includes the escape codes to return to ASCII
 562                 // mode, if relevant.  Also note if it uses unprintable characters,
 563                 // which means it will need that special encoding treatment later.
 564                 void ConvertWordToCharset (uint32 charset) {
 565                         int32 state = 0;
 566                         int32 originalLength = originalWord.Length();
 567                         int32 convertedLength = originalLength * 5 + 1;
 568                         char *convertedBuffer = convertedWord.LockBuffer (convertedLength);
 569                         mail_convert_from_utf8 (charset, originalWord.String(),
 570                                 &originalLength, convertedBuffer, &convertedLength, &state);
 571                         for (int i = 0; i < convertedLength; i++) {
 572                                 if ((convertedBuffer[i] & (1 << 7)) ||
 573                                         (convertedBuffer[i] >= 0 && convertedBuffer[i] < 32)) {
 574                                         needsEncoding = true;
 575                                         break;
 576                                 }
 577                         }
 578                         convertedWord.UnlockBuffer (convertedLength);
 579                 };
 580         };
 581         struct word *currentWord;
 582         BList words;
 583
 584         // Break the header into words.  White space characters (including tabs and
 585         // newlines) separate the words.  Each word includes any space before it as
 586         // part of the word.  Actually, quotes and other special characters
 587         // (",()<>@) are treated as separate words of their own so that they don't
 588         // get encoded (because MIME headers get the quotes parsed before character
 589         // set unconversion is done).  The reader is supposed to ignore all white
 590         // space between encoded words, which can be inserted so that older mail
 591         // parsers don't have overly long line length problems.
 592
 593         const char *source = *bufp;
 594         const char *bufEnd = *bufp + length;
 595         const char *specialChars = "\"()<>@,";
 596
 597         while (source < bufEnd) {
 598                 currentWord = new struct word;
 599                 currentWord->needsEncoding = false;
 600
 601                 int wordEnd = 0;
 602
 603                 // Include leading spaces as part of the word.
 604                 while (source + wordEnd < bufEnd && isspace (source[wordEnd]))
 605                         wordEnd++;
 606
 607                 if (source + wordEnd < bufEnd &&
 608                         strchr (specialChars, source[wordEnd]) != NULL) {
 609                         // Got a quote mark or other special character, which is treated as
 610                         // a word in itself since it shouldn't be encoded, which would hide
 611                         // it from the mail system.
 612                         wordEnd++;
 613                 } else {
 614                         // Find the end of the word.  Leave wordEnd pointing just after the
 615                         // last character in the word.
 616                         while (source + wordEnd < bufEnd) {
 617                                 if (isspace(source[wordEnd]) ||
 618                                         strchr (specialChars, source[wordEnd]) != NULL)
 619                                         break;
 620                                 if (wordEnd > 51 /* Makes Base64 ISO-2022-JP "word" a multiple of 4 bytes */ &&
 621                                         0xC0 == (0xC0 & (unsigned int) source[wordEnd])) {
 622                                         // No English words are that long (46 is the longest),
 623                                         // break up what is likely Asian text (which has no spaces)
 624                                         // at the start of the next non-ASCII UTF-8 character (high
 625                                         // two bits are both ones).  Note that two encoded words in
 626                                         // a row get joined together, even if there is a space
 627                                         // between them in the final output text, according to the
 628                                         // standard.  Next word will also be conveniently get
 629                                         // encoded due to the 0xC0 test.
 630                                         currentWord->needsEncoding = true;
 631                                         break;
 632                                 }
 633                                 wordEnd++;
 634                         }
 635                 }
 636                 currentWord->originalWord.SetTo (source, wordEnd);
 637                 currentWord->ConvertWordToCharset (charset);
 638                 words.AddItem(currentWord);
 639                 source += wordEnd;
 640         }
 641
 642         // Combine adjacent words which contain unprintable text so that the
 643         // overhead of switching back and forth between regular text and specially
 644         // encoded text is reduced.  However, the combined word must be shorter
 645         // than the maximum of 75 bytes, including character set specification and
 646         // all those delimiters (worst case 22 bytes of overhead).
 647
 648         struct word *run;
 649
 650         for (int32 i = 0; (currentWord = (struct word *) words.ItemAt (i)) != NULL; i++) {
 651                 if (!currentWord->needsEncoding)
 652                         continue; // No need to combine unencoded words.
 653                 for (int32 g = i+1; (run = (struct word *) words.ItemAt (g)) != NULL; g++) {
 654                         if (!run->needsEncoding)
 655                                 break; // Don't want to combine encoded and unencoded words.
 656                         if ((currentWord->convertedWord.Length() + run->convertedWord.Length() <= 53)) {
 657                                 currentWord->originalWord.Append (run->originalWord);
 658                                 currentWord->ConvertWordToCharset (charset);
 659                                 words.RemoveItem(g);
 660                                 delete run;
 661                                 g--;
 662                         } else // Can't merge this word, result would be too long.
 663                                 break;
 664                 }
 665         }
 666
 667         // Combine the encoded and unencoded words into one line, doing the
 668         // quoted-printable or base64 encoding.  Insert an extra space between
 669         // words which are both encoded to make word wrapping easier, since there
 670         // is normally none, and you're allowed to insert space (the receiver
 671         // throws it away if it is between encoded words).
 672
 673         BString rfc2047;
 674         bool    previousWordNeededEncoding = false;
 675
 676         const char *charset_dec = "none-bug";
 677         for (int32 i = 0; mail_charsets[i].charset != NULL; i++) {
 678                 if (mail_charsets[i].flavor == charset) {
 679                         charset_dec = mail_charsets[i].charset;
 680                         break;
 681                 }
 682         }
 683
 684         while ((currentWord = (struct word *)words.RemoveItem((int32)0)) != NULL) {
 685                 if ((encoding != quoted_printable && encoding != base64) ||
 686                 !currentWord->needsEncoding) {
 687                         rfc2047.Append (currentWord->convertedWord);
 688                 } else {
 689                         // This word needs encoding.  Try to insert a space between it and
 690                         // the previous word.
 691                         if (previousWordNeededEncoding)
 692                                 rfc2047 << ' '; // Can insert as many spaces as you want between encoded words.
 693                         else {
 694                                 // Previous word is not encoded, spaces are significant.  Try
 695                                 // to move a space from the start of this word to be outside of
 696                                 // the encoded text, so that there is a bit of space between
 697                                 // this word and the previous one to enhance word wrapping
 698                                 // chances later on.
 699                                 if (currentWord->originalWord.Length() > 1 &&
 700                                         isspace (currentWord->originalWord[0])) {
 701                                         rfc2047 << currentWord->originalWord[0];
 702                                         currentWord->originalWord.Remove (0 /* offset */, 1 /* length */);
 703                                         currentWord->ConvertWordToCharset (charset);
 704                                 }
 705                         }
 706
 707                         char *encoded = NULL;
 708                         ssize_t encoded_len = 0;
 709                         int32 convertedLength = currentWord->convertedWord.Length ();
 710                         const char *convertedBuffer = currentWord->convertedWord.String ();
 711
 712                         switch (encoding) {
 713                                 case quoted_printable:
 714                                         encoded = (char *) malloc (convertedLength * 3);
 715                                         encoded_len = encode_qp (encoded, convertedBuffer, convertedLength, true /* headerMode */);
 716                                         break;
 717                                 case base64:
 718                                         encoded = (char *) malloc (convertedLength * 2);
 719                                         encoded_len = encode_base64 (encoded, convertedBuffer, convertedLength, true /* headerMode */);
 720                                         break;
 721                                 default: // Unknown encoding type, shouldn't happen.
 722                                         encoded = (char *) convertedBuffer;
 723                                         encoded_len = convertedLength;
 724                                         break;
 725                         }
 726
 727                         rfc2047 << "=?" << charset_dec << '?' << encoding << '?';
 728                         rfc2047.Append (encoded, encoded_len);
 729                         rfc2047 << "?=";
 730
 731                         if (encoding == quoted_printable || encoding == base64)
 732                                 free(encoded);
 733                 }
 734                 previousWordNeededEncoding = currentWord->needsEncoding;
 735                 delete currentWord;
 736         }
 737
 738         free(*bufp);
 739
 740         ssize_t finalLength = rfc2047.Length ();
 741         *bufp = (char *) (malloc (finalLength + 1));
 742         memcpy (*bufp, rfc2047.String(), finalLength);
 743         (*bufp)[finalLength] = 0;
 744
 745         return finalLength;
 746 }
 747
 748
 749 void
 750 FoldLineAtWhiteSpaceAndAddCRLF(BString &string)
 751 {
 752         int inputLength = string.Length();
 753         int lineStartIndex;
 754         const int maxLineLength = 78; // Doesn't include CRLF.
 755         BString output;
 756         int splitIndex;
 757         int tempIndex;
 758
 759         lineStartIndex = 0;
 760         while (true) {
 761                 // If we don't need to wrap the text, just output the remainder, if any.
 762
 763                 if (lineStartIndex + maxLineLength >= inputLength) {
 764                         if (lineStartIndex < inputLength) {
 765                                 output.Insert (string, lineStartIndex /* source offset */,
 766                                         inputLength - lineStartIndex /* count */,
 767                                         output.Length() /* insert at */);
 768                                 output.Append (CRLF);
 769                         }
 770                         break;
 771                 }
 772
 773                 // Look ahead for a convenient spot to split it, between a comma and
 774                 // space, which you often see between e-mail addresses like this:
 775                 // "Joe Who" joe@dot.com, "Someone Else" else@blot.com
 776
 777                 tempIndex = lineStartIndex + maxLineLength;
 778                 if (tempIndex > inputLength)
 779                         tempIndex = inputLength;
 780                 splitIndex = string.FindLast (", ", tempIndex);
 781                 if (splitIndex >= lineStartIndex)
 782                         splitIndex++; // Point to the space character.
 783
 784                 // If none of those exist, try splitting at any white space.
 785
 786                 if (splitIndex <= lineStartIndex)
 787                         splitIndex = string.FindLast (" ", tempIndex);
 788                 if (splitIndex <= lineStartIndex)
 789                         splitIndex = string.FindLast ("\t", tempIndex);
 790
 791                 // If none of those exist, allow for a longer word - split at the next
 792                 // available white space.
 793
 794                 if (splitIndex <= lineStartIndex)
 795                         splitIndex = string.FindFirst (" ", lineStartIndex + 1);
 796                 if (splitIndex <= lineStartIndex)
 797                         splitIndex = string.FindFirst ("\t", lineStartIndex + 1);
 798
 799                 // Give up, the whole rest of the line can't be split, just dump it
 800                 // out.
 801
 802                 if (splitIndex <= lineStartIndex) {
 803                         if (lineStartIndex < inputLength) {
 804                                 output.Insert (string, lineStartIndex /* source offset */,
 805                                         inputLength - lineStartIndex /* count */,
 806                                         output.Length() /* insert at */);
 807                                 output.Append (CRLF);
 808                         }
 809                         break;
 810                 }
 811
 812                 // Do the split.  The current line up to but not including the space
 813                 // gets output, followed by a CRLF.  The space remains to become the
 814                 // start of the next line (and that tells the message reader that it is
 815                 // a continuation line).
 816
 817                 output.Insert (string, lineStartIndex /* source offset */,
 818                         splitIndex - lineStartIndex /* count */,
 819                         output.Length() /* insert at */);
 820                 output.Append (CRLF);
 821                 lineStartIndex = splitIndex;
 822         }
 823         string.SetTo (output);
 824 }
 825
 826
 827 ssize_t
 828 readfoldedline(FILE *file, char **buffer, size_t *buflen)
 829 {
 830         ssize_t len = buflen && *buflen ? *buflen : 0;
 831         char * buf = buffer && *buffer ? *buffer : NULL;
 832         ssize_t cnt = 0; // Number of characters currently in the buffer.
 833         int c;
 834
 835         while (true) {
 836                 // Make sure there is space in the buffer for two more characters (one
 837                 // for the next character, and one for the end of string NUL byte).
 838                 if (buf == NULL || cnt + 2 >= len) {
 839                         char *temp = (char *)realloc(buf, len + 64);
 840                         if (temp == NULL) {
 841                                 // Out of memory, however existing buffer remains allocated.
 842                                 cnt = ENOMEM;
 843                                 break;
 844                         }
 845                         len += 64;
 846                         buf = temp;
 847                 }
 848
 849                 // Read the next character, or end of file, or IO error.
 850                 if ((c = fgetc(file)) == EOF) {
 851                         if (ferror (file)) {
 852                                 cnt = errno;
 853                                 if (cnt >= 0)
 854                                         cnt = -1; // Error codes must be negative.
 855                         } else {
 856                                 // Really is end of file.  Also make it end of line if there is
 857                                 // some text already read in.  If the first thing read was EOF,
 858                                 // just return an empty string.
 859                                 if (cnt > 0) {
 860                                         buf[cnt++] = '\n';
 861                                         if (buf[cnt-2] == '\r') {
 862                                                 buf[cnt-2] = '\n';
 863                                                 --cnt;
 864                                         }
 865                                 }
 866                         }
 867                         break;
 868                 }
 869
 870                 buf[cnt++] = c;
 871
 872                 if (c == '\n') {
 873                         // Convert CRLF end of line to just a LF.  Do it before folding, in
 874                         // case we don't need to fold.
 875                         if (cnt >= 2 && buf[cnt-2] == '\r') {
 876                                 buf[cnt-2] = '\n';
 877                                 --cnt;
 878                         }
 879                         // If the current line is empty then return it (so that empty lines
 880                         // don't disappear if the next line starts with a space).
 881                         if (cnt <= 1)
 882                                 break;
 883                         // Fold if first character on the next line is whitespace.
 884                         c = fgetc(file); // Note it's OK to read EOF and ungetc it too.
 885                         if (c == ' ' || c == '\t')
 886                                 buf[cnt-1] = c; // Replace \n with the white space character.
 887                         else {
 888                                 // Not folding, we finished reading a line; break out of the loop
 889                                 ungetc(c,file);
 890                                 break;
 891                         }
 892                 }
 893         }
 894
 895         if (buf != NULL && cnt >= 0)
 896                 buf[cnt] = '\0';
 897
 898         if (buffer)
 899                 *buffer = buf;
 900         else if (buf)
 901                 free(buf);
 902
 903         if (buflen)
 904                 *buflen = len;
 905
 906         return cnt;
 907 }
 908
 909
 910 ssize_t
 911 readfoldedline(BPositionIO &in, char **buffer, size_t *buflen)
 912 {
 913         ssize_t len = buflen && *buflen ? *buflen : 0;
 914         char * buf = buffer && *buffer ? *buffer : NULL;
 915         ssize_t cnt = 0; // Number of characters currently in the buffer.
 916         char c;
 917         status_t errorCode;
 918
 919         while (true) {
 920                 // Make sure there is space in the buffer for two more characters (one
 921                 // for the next character, and one for the end of string NUL byte).
 922                 if (buf == NULL || cnt + 2 >= len) {
 923                         char *temp = (char *)realloc(buf, len + 64);
 924                         if (temp == NULL) {
 925                                 // Out of memory, however existing buffer remains allocated.
 926                                 cnt = ENOMEM;
 927                                 break;
 928                         }
 929                         len += 64;
 930                         buf = temp;
 931                 }
 932
 933                 errorCode = in.Read (&c,1); // A really slow way of reading - unbuffered.
 934                 if (errorCode != 1) {
 935                         if (errorCode < 0) {
 936                                 cnt = errorCode; // IO error encountered, just return the code.
 937                         } else {
 938                                 // Really is end of file.  Also make it end of line if there is
 939                                 // some text already read in.  If the first thing read was EOF,
 940                                 // just return an empty string.
 941                                 if (cnt > 0) {
 942                                         buf[cnt++] = '\n';
 943                                         if (buf[cnt-2] == '\r') {
 944                                                 buf[cnt-2] = '\n';
 945                                                 --cnt;
 946                                         }
 947                                 }
 948                         }
 949                         break;
 950                 }
 951
 952                 buf[cnt++] = c;
 953
 954                 if (c == '\n') {
 955                         // Convert CRLF end of line to just a LF.  Do it before folding, in
 956                         // case we don't need to fold.
 957                         if (cnt >= 2 && buf[cnt-2] == '\r') {
 958                                 buf[cnt-2] = '\n';
 959                                 --cnt;
 960                         }
 961                         // If the current line is empty then return it (so that empty lines
 962                         // don't disappear if the next line starts with a space).
 963                         if (cnt <= 1)
 964                                 break;
 965                         // if first character on the next line is whitespace, fold lines
 966                         errorCode = in.Read(&c,1);
 967                         if (errorCode == 1) {
 968                                 if (c == ' ' || c == '\t')
 969                                         buf[cnt-1] = c; // Replace \n with the white space character.
 970                                 else {
 971                                         // Not folding, we finished reading a whole line.
 972                                         in.Seek(-1,SEEK_CUR); // Undo the look-ahead character read.
 973                                         break;
 974                                 }
 975                         } else if (errorCode < 0) {
 976                                 cnt = errorCode;
 977                                 break;
 978                         } else // No next line; at the end of the file.  Return the line.
 979                                 break;
 980                 }
 981         }
 982
 983         if (buf != NULL && cnt >= 0)
 984                 buf[cnt] = '\0';
 985
 986         if (buffer)
 987                 *buffer = buf;
 988         else if (buf)
 989                 free(buf);
 990
 991         if (buflen)
 992                 *buflen = len;
 993
 994         return cnt;
 995 }
 996
 997
 998 ssize_t
 999 nextfoldedline(const char** header, char **buffer, size_t *buflen)
1000 {
1001         ssize_t len = buflen && *buflen ? *buflen : 0;
1002         char * buf = buffer && *buffer ? *buffer : NULL;
1003         ssize_t cnt = 0; // Number of characters currently in the buffer.
1004         char c;
1005
1006         while (true)
1007         {
1008                 // Make sure there is space in the buffer for two more characters (one
1009                 // for the next character, and one for the end of string NUL byte).
1010                 if (buf == NULL || cnt + 2 >= len)
1011                 {
1012                         char *temp = (char *)realloc(buf, len + 64);
1013                         if (temp == NULL) {
1014                                 // Out of memory, however existing buffer remains allocated.
1015                                 cnt = ENOMEM;
1016                                 break;
1017                         }
1018                         len += 64;
1019                         buf = temp;
1020                 }
1021
1022                 // Read the next character, or end of file.
1023                 if ((c = *(*header)++) == 0) {
1024                         // End of file.  Also make it end of line if there is some text
1025                         // already read in.  If the first thing read was EOF, just return
1026                         // an empty string.
1027                         if (cnt > 0) {
1028                                 buf[cnt++] = '\n';
1029                                 if (buf[cnt-2] == '\r') {
1030                                         buf[cnt-2] = '\n';
1031                                         --cnt;
1032                                 }
1033                         }
1034                         break;
1035                 }
1036
1037                 buf[cnt++] = c;
1038
1039                 if (c == '\n') {
1040                         // Convert CRLF end of line to just a LF.  Do it before folding, in
1041                         // case we don't need to fold.
1042                         if (cnt >= 2 && buf[cnt-2] == '\r') {
1043                                 buf[cnt-2] = '\n';
1044                                 --cnt;
1045                         }
1046                         // If the current line is empty then return it (so that empty lines
1047                         // don't disappear if the next line starts with a space).
1048                         if (cnt <= 1)
1049                                 break;
1050                         // if first character on the next line is whitespace, fold lines
1051                         c = *(*header)++;
1052                         if (c == ' ' || c == '\t')
1053                                 buf[cnt-1] = c; // Replace \n with the white space character.
1054                         else {
1055                                 // Not folding, we finished reading a line; break out of the loop
1056                                 (*header)--; // Undo read of the non-whitespace.
1057                                 break;
1058                         }
1059                 }
1060         }
1061
1062
1063         if (buf != NULL && cnt >= 0)
1064                 buf[cnt] = '\0';
1065
1066         if (buffer)
1067                 *buffer = buf;
1068         else if (buf)
1069                 free(buf);
1070
1071         if (buflen)
1072                 *buflen = len;
1073
1074         return cnt;
1075 }
1076
1077
1078 void
1079 trim_white_space(BString &string)
1080 {
1081         int32 i;
1082         int32 length = string.Length();
1083         char *buffer = string.LockBuffer(length + 1);
1084
1085         while (length > 0 && isspace(buffer[length - 1]))
1086                 length--;
1087         buffer[length] = '\0';
1088
1089         for (i = 0; buffer[i] && isspace(buffer[i]); i++) {}
1090         if (i != 0) {
1091                 length -= i;
1092                 memmove(buffer,buffer + i,length + 1);
1093         }
1094         string.UnlockBuffer(length);
1095 }
1096
1097
1098 /*!     Tries to return a human-readable name from the specified
1099         header parameter (should be from "To:" or "From:").
1100         Tries to return the name rather than the eMail address.
1101 */
1102 void
1103 extract_address_name(BString &header)
1104 {
1105         BString name;
1106         const char *start = header.String();
1107         const char *stop = start + strlen (start);
1108
1109         // Find a string S in the header (email foo) that matches:
1110         //   Old style name in brackets: foo@bar.com (S)
1111         //   New style quotes: "S" <foo@bar.com>
1112         //   New style no quotes if nothing else found: S <foo@bar.com>
1113         //   If nothing else found then use the whole thing: S
1114
1115         for (int i = 0; i <= 3; i++) {
1116                 // Set p1 to the first letter in the name and p2 to just past the last
1117                 // letter in the name.  p2 stays NULL if a name wasn't found in this
1118                 // pass.
1119                 const char *p1 = NULL, *p2 = NULL;
1120
1121                 switch (i) {
1122                         case 0: // foo@bar.com (S)
1123                                 if ((p1 = strchr(start,'(')) != NULL) {
1124                                         p1++; // Advance to first letter in the name.
1125                                         size_t nest = 1; // Handle nested brackets.
1126                                         for (p2 = p1; p2 < stop; ++p2)
1127                                         {
1128                                                 if (*p2 == ')')
1129                                                         --nest;
1130                                                 else if (*p2 == '(')
1131                                                         ++nest;
1132                                                 if (nest <= 0)
1133                                                         break;
1134                                         }
1135                                         if (nest != 0)
1136                                                 p2 = NULL; // False alarm, no terminating bracket.
1137                                 }
1138                                 break;
1139                         case 1: // "S" <foo@bar.com>
1140                                 if ((p1 = strchr(start, '\"')) != NULL)
1141                                         p2 = strchr(++p1, '\"');
1142                                 break;
1143                         case 2: // S <foo@bar.com>
1144                                 p1 = start;
1145                                 if (name.Length() == 0)
1146                                         p2 = strchr(start, '<');
1147                                 break;
1148                         case 3: // S
1149                                 p1 = start;
1150                                 if (name.Length() == 0)
1151                                         p2 = stop;
1152                                 break;
1153                 }
1154
1155                 // Remove leading and trailing space-like characters and save the
1156                 // result if it is longer than any other likely names found.
1157                 if (p2 != NULL) {
1158                         while (p1 < p2 && (isspace (*p1)))
1159                                 ++p1;
1160
1161                         while (p1 < p2 && (isspace (p2[-1])))
1162                                 --p2;
1163
1164                         int newLength = p2 - p1;
1165                         if (name.Length() < newLength)
1166                                 name.SetTo(p1, newLength);
1167                 }
1168         }
1169
1170         int32 lessIndex = name.FindFirst('<');
1171         int32 greaterIndex = name.FindLast('>');
1172
1173         if (lessIndex == 0) {
1174                 // Have an address of the form <address> and nothing else, so remove
1175                 // the greater and less than signs, if any.
1176                 if (greaterIndex > 0)
1177                         name.Remove(greaterIndex, 1);
1178                 name.Remove(lessIndex, 1);
1179         } else if (lessIndex > 0 && lessIndex < greaterIndex) {
1180                 // Yahoo stupidly inserts the e-mail address into the name string, so
1181                 // this bit of code fixes: "Joe <joe@yahoo.com>" <joe@yahoo.com>
1182                 name.Remove(lessIndex, greaterIndex - lessIndex + 1);
1183         }
1184
1185         trim_white_space(name);
1186         header = name;
1187 }
1188
1189
/*!     Given a subject in a BString, remove the extraneous RE: re: and other stuff
        to get down to the core subject string, which should be identical for all
        messages posted about a topic.  The input string is modified in place to
        become the output core subject string.

        The regular expression used for this is compiled only once, on the
        first call, and kept in file-level state (gRe, gRebuf, gNsub,
        gTranslation, gLocker - declared outside this function).  If the
        expression is not available, only the final white space trimming is
        done.
*/
void
SubjectToThread (BString &string)
{
// a regex that matches a non-ASCII UTF8 character:
#define U8C \
        "[\302-\337][\200-\277]" \
        "|\340[\302-\337][\200-\277]" \
        "|[\341-\357][\200-\277][\200-\277]" \
        "|\360[\220-\277][\200-\277][\200-\277]" \
        "|[\361-\367][\200-\277][\200-\277][\200-\277]" \
        "|\370[\210-\277][\200-\277][\200-\277][\200-\277]" \
        "|[\371-\373][\200-\277][\200-\277][\200-\277][\200-\277]" \
        "|\374[\204-\277][\200-\277][\200-\277][\200-\277][\200-\277]" \
        "|\375[\200-\277][\200-\277][\200-\277][\200-\277][\200-\277]"

// The text to strip from a subject.  Four alternatives, in this order:
//   1. spaces at the very start;
//   2. a leading [bracketed] group (sub-expression 1) followed by
//      sub-expression 2: a word start, a run of two or more spaces, or one or
//      more prefixes made of 2-3 word characters, an optional [bracketed]
//      group and a colon (presumably things like "Re:" - "\<" is assumed to
//      be the GNU word-start operator);
//   3. the same spaces/prefixes without the leading bracketed group;
//   4. a trailing "(fwd)".
#define PATTERN \
        "^ +" \
        "|^(\\[[^]]*\\])(\\<|  +| *(\\<(\\w|" U8C "){2,3} *(\\[[^\\]]*\\])? *:)+ *)" \
        "|^(  +| *(\\<(\\w|" U8C "){2,3} *(\\[[^\\]]*\\])? *:)+ *)" \
        "| *\\(fwd\\) *$"

        // Only the caller that bumps gLocker from 0 gets to compile the
        // pattern; everybody else ends up in the else branch below.
        if (gRebuf == NULL && atomic_add(&gLocker, 1) == 0) {
                // the idea is to compile the regexp once to speed up testing

                // Identity table, except that 'a'..'z' map to upper case.  It
                // is handed to the regex as its translate table (presumably
                // making the match case insensitive for ASCII letters).
                for (int i=0; i<256; ++i) gTranslation[i]=i;
                for (int i='a'; i<='z'; ++i) gTranslation[i]=toupper(i);

                gRe.translate = gTranslation;
                gRe.regs_allocated = REGS_FIXED;
                re_syntax_options = RE_SYNTAX_POSIX_EXTENDED;

                const char *pattern = PATTERN;
                // count subexpressions in PATTERN
                // (every '(' that is not preceded by a backslash; the result
                // sizes the register arrays allocated further down)
                for (unsigned int i=0; pattern[i] != 0; ++i)
                {
                        if (pattern[i] == '\\')
                                ++i;
                        else if (pattern[i] == '(')
                                ++gNsub;
                }

                const char *err = re_compile_pattern(pattern,strlen(pattern),&gRe);
                if (err == NULL)
                        gRebuf = &gRe;
                else
                        fprintf(stderr, "Failed to compile the regex: %s\n", err);
        } else {
                // Someone else is (or was) responsible for compiling.  Poll
                // for the result, up to 200 times with a snooze(10000) in
                // between, then carry on regardless.
                int32 tries = 200;
                while (gRebuf == NULL && tries-- > 0)
                        snooze(10000);
        }

        if (gRebuf) {
                struct re_registers regs;
                // can't be static if this function is to be thread-safe

                // Caller-provided register arrays with gNsub entries each
                // (gRe.regs_allocated was set to REGS_FIXED above).
                // NOTE(review): the malloc() results are not checked.
                regs.num_regs = gNsub;
                regs.start = (regoff_t*)malloc(gNsub*sizeof(regoff_t));
                regs.end = (regoff_t*)malloc(gNsub*sizeof(regoff_t));

                // Keep searching the whole string from offset 0 and cutting
                // out what matched, until nothing matches any more.
                for (int start = 0; (start = re_search(gRebuf, string.String(),
                                string.Length(), 0, string.Length(), &regs)) >= 0;) {
                        //
                        // we found something
                        //

                        // don't delete [bemaildaemon]...
                        // (if the match begins with sub-expression 1, the
                        // bracketed group, only remove from sub-expression 2 on)
                        if (start == regs.start[1])
                                start = regs.start[2];

                        // Cut from start to the end of the whole match, and put
                        // a single space in its place unless at the very start.
                        string.Remove(start,regs.end[0]-start);
                        if (start)
                                string.Insert(' ',1,start);

                        // TODO: for some subjects this results in an endless loop, check
                        // why this happen.
                        // Until then, stop as soon as a pass removed at most one
                        // character.
                        if (regs.end[0] - start <= 1)
                                break;
                }

                free(regs.start);
                free(regs.end);
        }

        // Finally remove leading and trailing space.  Some software, like
        // tm-edit 1.8, appends a space to the subject, which would break
        // threading if we left it in.
        trim_white_space(string);
}
1284
1285
1286 /*!     Converts a date to a time.  Handles numeric time zones too, unlike
1287         parsedate().  Returns -1 if it fails.
1288 */
1289 time_t
1290 ParseDateWithTimeZone(const char *DateString)
1291 {
1292         time_t currentTime;
1293         time_t dateAsTime;
1294         char tempDateString[80];
1295         char tempZoneString[6];
1296         time_t zoneDeltaTime;
1297         int zoneIndex;
1298         char *zonePntr;
1299
1300         // See if we can remove the time zone portion.  parsedate understands time
1301         // zone 3 letter names, but doesn't understand the numeric +9999 time zone
1302         // format.  To do: see if a newer parsedate exists.
1303
1304         strncpy (tempDateString, DateString, sizeof (tempDateString));
1305         tempDateString[sizeof (tempDateString) - 1] = 0;
1306
1307         // Remove trailing spaces.
1308         zonePntr = tempDateString + strlen (tempDateString) - 1;
1309         while (zonePntr >= tempDateString && isspace (*zonePntr))
1310                 *zonePntr-- = 0;
1311         if (zonePntr < tempDateString)
1312                 return -1; // Empty string.
1313
1314         // Remove the trailing time zone in round brackets, like in
1315         // Fri, 22 Feb 2002 15:22:42 EST (-0500)
1316         // Thu, 25 Apr 1996 11:44:19 -0400 (EDT)
1317         if (tempDateString[strlen(tempDateString)-1] == ')')
1318         {
1319                 zonePntr = strrchr (tempDateString, '(');
1320                 if (zonePntr != NULL)
1321                 {
1322                         *zonePntr-- = 0; // Zap the '(', then remove trailing spaces.
1323                         while (zonePntr >= tempDateString && isspace (*zonePntr))
1324                                 *zonePntr-- = 0;
1325                         if (zonePntr < tempDateString)
1326                                 return -1; // Empty string.
1327                 }
1328         }
1329
1330         // Look for a numeric time zone like  Tue, 30 Dec 2003 05:01:40 +0000
1331         for (zoneIndex = strlen (tempDateString); zoneIndex >= 0; zoneIndex--)
1332         {
1333                 zonePntr = tempDateString + zoneIndex;
1334                 if (zonePntr[0] == '+' || zonePntr[0] == '-')
1335                 {
1336                         if (zonePntr[1] >= '0' && zonePntr[1] <= '9' &&
1337                                 zonePntr[2] >= '0' && zonePntr[2] <= '9' &&
1338                                 zonePntr[3] >= '0' && zonePntr[3] <= '9' &&
1339                                 zonePntr[4] >= '0' && zonePntr[4] <= '9')
1340                                 break;
1341                 }
1342         }
1343         if (zoneIndex >= 0)
1344         {
1345                 // Remove the zone from the date string and any following time zone
1346                 // letter codes.  Also put in GMT so that the date gets parsed as GMT.
1347                 memcpy (tempZoneString, zonePntr, 5);
1348                 tempZoneString [5] = 0;
1349                 strcpy (zonePntr, "GMT");
1350         }
1351         else // No numeric time zone found.
1352                 strcpy (tempZoneString, "+0000");
1353
1354         time (&currentTime);
1355         dateAsTime = parsedate (tempDateString, currentTime);
1356         if (dateAsTime == (time_t) -1)
1357                 return -1; // Failure.
1358
1359         zoneDeltaTime = 60 * atol (tempZoneString + 3); // Get the last two digits - minutes.
1360         tempZoneString[3] = 0;
1361         zoneDeltaTime += atol (tempZoneString + 1) * 60 * 60; // Get the first two digits - hours.
1362         if (tempZoneString[0] == '+')
1363                 zoneDeltaTime = 0 - zoneDeltaTime;
1364         dateAsTime += zoneDeltaTime;
1365
1366         return dateAsTime;
1367 }
1368
1369
1370 /*! Parses a mail header and fills the headers BMessage
1371 */
1372 status_t
1373 parse_header(BMessage &headers, BPositionIO &input)
1374 {
1375         char *buffer = NULL;
1376         size_t bufferSize = 0;
1377         int32 length;
1378
1379         while ((length = readfoldedline(input, &buffer, &bufferSize)) >= 2) {
1380                 --length;
1381                         // Don't include the \n at the end of the buffer.
1382
1383                 // convert to UTF-8 and null-terminate the buffer
1384                 length = rfc2047_to_utf8(&buffer, &bufferSize, length);
1385                 buffer[length] = '\0';
1386
1387                 const char *delimiter = strstr(buffer, ":");
1388                 if (delimiter == NULL)
1389                         continue;
1390
1391                 BString header(buffer, delimiter - buffer);
1392                 header.CapitalizeEachWord();
1393                         // unified case for later fetch
1394
1395                 delimiter++; // Skip the colon.
1396                 // Skip over leading white space and tabs.
1397                 // TODO: (comments in brackets).
1398                 while (isspace(*delimiter))
1399                         delimiter++;
1400
1401                 // TODO: implement joining of multiple header tags (i.e. multiple "Cc:"s)
1402                 headers.AddString(header.String(), delimiter);
1403         }
1404         free(buffer);
1405
1406         return B_OK;
1407 }
1408
1409
1410 status_t
1411 extract_from_header(const BString& header, const BString& field,
1412         BString& target)
1413 {
1414         int32 headerLength = header.Length();
1415         int32 fieldEndPos = 0;
1416         while (true) {
1417                 int32 pos = header.IFindFirst(field, fieldEndPos);
1418                 if (pos < 0)
1419                         return B_BAD_VALUE;
1420                 fieldEndPos = pos + field.Length();
1421
1422                 if (pos != 0 && header.ByteAt(pos - 1) != '\n')
1423                         continue;
1424                 if (header.ByteAt(fieldEndPos) == ':')
1425                         break;
1426         }
1427         fieldEndPos++;
1428
1429         int32 crPos = fieldEndPos;
1430         while (true) {
1431                 fieldEndPos = crPos;
1432                 crPos = header.FindFirst('\n', crPos);
1433                 if (crPos < 0)
1434                         crPos = headerLength;
1435                 BString temp;
1436                 header.CopyInto(temp, fieldEndPos, crPos - fieldEndPos);
1437                 if (header.ByteAt(crPos - 1) == '\r') {
1438                         temp.Truncate(temp.Length() - 1);
1439                         temp += " ";
1440                 }
1441                 target += temp;
1442                 crPos++;
1443                 if (crPos >= headerLength)
1444                         break;
1445                 char nextByte = header.ByteAt(crPos);
1446                 if (nextByte != ' ' && nextByte != '\t')
1447                         break;
1448                 crPos++;
1449         }
1450
1451         size_t bufferSize = target.Length();
1452         char* buffer = target.LockBuffer(bufferSize);
1453         size_t length = rfc2047_to_utf8(&buffer, &bufferSize, bufferSize);
1454         target.UnlockBuffer(length);
1455
1456         trim_white_space(target);
1457
1458         return B_OK;
1459 }
1460
1461
1462 void
1463 extract_address(BString &address)
1464 {
1465         const char *string = address.String();
1466         int32 first;
1467
1468         // first, remove all quoted text
1469
1470         if ((first = address.FindFirst('"')) >= 0) {
1471                 int32 last = first + 1;
1472                 while (string[last] && string[last] != '"')
1473                         last++;
1474
1475                 if (string[last] == '"')
1476                         address.Remove(first, last + 1 - first);
1477         }
1478
1479         // try to extract the address now
1480
1481         if ((first = address.FindFirst('<')) >= 0) {
1482                 // the world likes us and we can just get the address the easy way...
1483                 int32 last = address.FindFirst('>');
1484                 if (last >= 0) {
1485                         address.Truncate(last);
1486                         address.Remove(0, first + 1);
1487
1488                         return;
1489                 }
1490         }
1491
1492         // then, see if there is anything in parenthesis to throw away
1493
1494         if ((first = address.FindFirst('(')) >= 0) {
1495                 int32 last = first + 1;
1496                 while (string[last] && string[last] != ')')
1497                         last++;
1498
1499                 if (string[last] == ')')
1500                         address.Remove(first, last + 1 - first);
1501         }
1502
1503         // now, there shouldn't be much else left
1504
1505         trim_white_space(address);
1506 }
1507
1508
1509 void
1510 get_address_list(BList &list, const char *string,
1511         void (*cleanupFunc)(BString &))
1512 {
1513         if (string == NULL || !string[0])
1514                 return;
1515
1516         const char *start = string;
1517
1518         while (true) {
1519                 if (string[0] == '"') {
1520                         const char *quoteEnd = ++string;
1521
1522                         while (quoteEnd[0] && quoteEnd[0] != '"')
1523                                 quoteEnd++;
1524
1525                         if (!quoteEnd[0])       // string exceeds line!
1526                                 quoteEnd = string;
1527
1528                         string = quoteEnd + 1;
1529                 }
1530
1531                 if (string[0] == ',' || string[0] == '\0') {
1532                         BString address(start, string - start);
1533                         trim_white_space(address);
1534
1535                         if (cleanupFunc)
1536                                 cleanupFunc(address);
1537
1538                         list.AddItem(strdup(address.String()));
1539
1540                         start = string + 1;
1541                 }
1542
1543                 if (!string[0])
1544                         break;
1545
1546                 string++;
1547         }
1548 }
1549