src/kits/mail/mail_util.cpp

   1 /*
   2  * Copyright 2011-2016, Haiku, Inc. All rights reserved.
   3  * Copyright 2001-2003 Dr. Zoidberg Enterprises. All rights reserved.
   4  */
   5
   6
   7 #include <mail_util.h>
   8
   9 #include <stdlib.h>
  10 #include <strings.h>
  11 #include <stdio.h>
  12 #define __USE_GNU
  13 #include <regex.h>
  14 #include <ctype.h>
  15 #include <errno.h>
  16
  17 #include <FindDirectory.h>
  18 #include <List.h>
  19 #include <Locker.h>
  20 #include <parsedate.h>
  21 #include <Path.h>
  22 #include <String.h>
  23 #include <UTF8.h>
  24
  25 #include <mail_encoding.h>
  26
  27 #include <AttributeUtilities.h>
  28 #include <CharacterSet.h>
  29 #include <CharacterSetRoster.h>
  30
  31
  32 using namespace BPrivate;
  33
  34
  35 #define CRLF   "\r\n"
  36
// One row of the charset lookup table below: pairs a MIME charset name with
// the numeric conversion identifier used by the conversion routines.
struct CharsetConversionEntry {
	const char *charset;
		// Charset name as it appears in mail headers; NULL in the entry
		// that terminates the table.
	uint32 flavor;
		// Conversion identifier (B_*_CONVERSION style constant).
};
  41
// Table mapping MIME charset names to conversion identifiers.  Several names
// may share one identifier; the list ends with an entry whose charset pointer
// is NULL.
extern const CharsetConversionEntry mail_charsets[] = {
	// In order of authority, so when searching for the name for a particular
	// numbered conversion, start at the beginning of the array.
	{"iso-8859-1",  B_ISO1_CONVERSION}, // MIME STANDARD
	{"iso-8859-2",  B_ISO2_CONVERSION}, // MIME STANDARD
	{"iso-8859-3",  B_ISO3_CONVERSION}, // MIME STANDARD
	{"iso-8859-4",  B_ISO4_CONVERSION}, // MIME STANDARD
	{"iso-8859-5",  B_ISO5_CONVERSION}, // MIME STANDARD
	{"iso-8859-6",  B_ISO6_CONVERSION}, // MIME STANDARD
	{"iso-8859-7",  B_ISO7_CONVERSION}, // MIME STANDARD
	{"iso-8859-8",  B_ISO8_CONVERSION}, // MIME STANDARD
	{"iso-8859-9",  B_ISO9_CONVERSION}, // MIME STANDARD
	{"iso-8859-10", B_ISO10_CONVERSION}, // MIME STANDARD
	{"iso-8859-13", B_ISO13_CONVERSION}, // MIME STANDARD
	{"iso-8859-14", B_ISO14_CONVERSION}, // MIME STANDARD
	{"iso-8859-15", B_ISO15_CONVERSION}, // MIME STANDARD

	{"shift_jis",   B_SJIS_CONVERSION}, // MIME STANDARD
	{"shift-jis",   B_SJIS_CONVERSION},
	{"iso-2022-jp", B_JIS_CONVERSION}, // MIME STANDARD
	{"euc-jp",      B_EUC_CONVERSION}, // MIME STANDARD

	{"euc-kr",      B_EUC_KR_CONVERSION}, // Shift encoding 7 bit and KSC-5601 if bit 8 is on. // MIME STANDARD
	{"ksc5601",     B_EUC_KR_CONVERSION}, // Not sure if 7 or 8 bit. // COMPATIBLE?
	{"ks_c_5601-1987", B_EUC_KR_CONVERSION}, // Not sure if 7 or 8 bit. // COMPATIBLE with Microsoft software

	{"koi8-r",      B_KOI8R_CONVERSION},           // MIME STANDARD
	{"windows-1251",B_MS_WINDOWS_1251_CONVERSION}, // MIME STANDARD
	{"windows-1252",B_MS_WINDOWS_CONVERSION},      // MIME STANDARD

	{"dos-437",     B_MS_DOS_CONVERSION},     // WRONG NAME : MIME STANDARD NAME = NONE ( IBM437? )
	{"dos-866",     B_MS_DOS_866_CONVERSION}, // WRONG NAME : MIME STANDARD NAME = NONE ( IBM866? )
	{"x-mac-roman", B_MAC_ROMAN_CONVERSION},  // WRONG NAME : MIME STANDARD NAME = NONE ( macintosh? + x-mac-roman? )

	// NOTE(review): the next four entries use raw numeric identifiers instead
	// of named constants; presumably 24 is the Big5 conversion and 25 the
	// GBK/GB18030 conversion - confirm against UTF8.h.
	{"big5",        24}, // MIME STANDARD

	{"gb18030",     25}, // WRONG NAME : MIME STANDARD NAME = NONE ( GB18030? )
	{"gb2312",      25}, // COMPATIBLE
	{"gbk",         25}, // COMPATIBLE

	/* {"utf-16",           B_UNICODE_CONVERSION}, Might not work due to NULs in text, needs testing. */
	{"us-ascii",    B_MAIL_US_ASCII_CONVERSION},                                  // MIME STANDARD
	{"utf-8",       B_MAIL_UTF8_CONVERSION /* Special code for no conversion */}, // MIME STANDARD

	{NULL, (uint32) -1} /* End of list marker, NULL string pointer is the key. */
};
  88
  89
// File-scope state.  None of these variables is referenced in the part of the
// file reviewed here.
// NOTE(review): presumably they back the regular-expression based helpers
// further down (the buffer types come from <regex.h>) - confirm their users
// before changing them.
static int32 gLocker = 0;
static size_t gNsub = 1;
static re_pattern_buffer gRe;
static re_pattern_buffer *gRebuf = NULL;
static unsigned char gTranslation[256];
  95
  96
  97 static int
  98 handle_non_rfc2047_encoding(char **buffer, size_t *bufferLength,
  99         size_t *sourceLength)
 100 {
 101         char *string = *buffer;
 102         int32 length = *sourceLength;
 103         int32 i;
 104
 105         // check for 8-bit characters
 106         for (i = 0;i < length;i++)
 107                 if (string[i] & 0x80)
 108                         break;
 109         if (i == length)
 110                 return false;
 111
 112         // check for groups of 8-bit characters - this code is not very smart;
 113         // it just can detect some sort of single-byte encoded stuff, the rest
 114         // is regarded as UTF-8
 115
 116         int32 singletons = 0,doubles = 0;
 117
 118         for (i = 0;i < length;i++)
 119         {
 120                 if (string[i] & 0x80)
 121                 {
 122                         if ((string[i + 1] & 0x80) == 0)
 123                                 singletons++;
 124                         else doubles++;
 125                         i++;
 126                 }
 127         }
 128
 129         if (singletons != 0)    // can't be valid UTF-8 anymore, so we assume ISO-Latin-1
 130         {
 131                 int32 state = 0;
 132                 // just to be sure
 133                 int32 destLength = length * 4 + 1;
 134                 int32 destBufferLength = destLength;
 135                 char *dest = (char*)malloc(destLength);
 136                 if (dest == NULL)
 137                         return 0;
 138
 139                 if (convert_to_utf8(B_ISO1_CONVERSION, string, &length,dest,
 140                         &destLength, &state) == B_OK) {
 141                         *buffer = dest;
 142                         *bufferLength = destBufferLength;
 143                         *sourceLength = destLength;
 144                         return true;
 145                 }
 146                 free(dest);
 147                 return false;
 148         }
 149
 150         // we assume a valid UTF-8 string here, but yes, we don't check it
 151         return true;
 152 }
 153
 154
 155 // #pragma mark -
 156
 157
 158 status_t
 159 write_read_attr(BNode& node, read_flags flag)
 160 {
 161         if (node.WriteAttr(B_MAIL_ATTR_READ, B_INT32_TYPE, 0, &flag, sizeof(int32))
 162                         < 0)
 163                 return B_ERROR;
 164
 165         // Manage the status string only if it currently has a known state
 166         BString currentStatus;
 167         if (node.ReadAttrString(B_MAIL_ATTR_STATUS, &currentStatus) == B_OK
 168                 && currentStatus.ICompare("New") != 0
 169                 && currentStatus.ICompare("Read") != 0
 170                 && currentStatus.ICompare("Seen") != 0) {
 171                 return B_OK;
 172         }
 173
 174         const char* statusString = flag == B_READ ? "Read"
 175                 : flag  == B_SEEN ? "Seen" : "New";
 176         if (node.WriteAttr(B_MAIL_ATTR_STATUS, B_STRING_TYPE, 0, statusString,
 177                         strlen(statusString)) < 0)
 178                 return B_ERROR;
 179
 180         return B_OK;
 181 }
 182
 183
 184 status_t
 185 read_read_attr(BNode& node, read_flags& flag)
 186 {
 187         if (node.ReadAttr(B_MAIL_ATTR_READ, B_INT32_TYPE, 0, &flag, sizeof(int32))
 188                         == sizeof(int32))
 189                 return B_OK;
 190
 191         BString statusString;
 192         if (node.ReadAttrString(B_MAIL_ATTR_STATUS, &statusString) == B_OK) {
 193                 if (statusString.ICompare("New"))
 194                         flag = B_UNREAD;
 195                 else
 196                         flag = B_READ;
 197
 198                 return B_OK;
 199         }
 200
 201         return B_ERROR;
 202 }
 203
 204
// The next two functions are our wrappers around convert_to_utf8 and
// convert_from_utf8 so that they can also convert from UTF-8 to UTF-8 by
// specifying the B_MAIL_UTF8_CONVERSION constant as the conversion operation.
// They also let us add new conversions, like B_MAIL_US_ASCII_CONVERSION.
 209
 210
 211 status_t
 212 mail_convert_to_utf8(uint32 srcEncoding, const char *src, int32 *srcLen,
 213         char *dst, int32 *dstLen, int32 *state, char substitute)
 214 {
 215         int32 copyAmount;
 216         char *originalDst = dst;
 217         status_t returnCode = -1;
 218
 219         if (srcEncoding == B_MAIL_UTF8_CONVERSION) {
 220                 copyAmount = *srcLen;
 221                 if (*dstLen < copyAmount)
 222                         copyAmount = *dstLen;
 223                 memcpy (dst, src, copyAmount);
 224                 *srcLen = copyAmount;
 225                 *dstLen = copyAmount;
 226                 returnCode = B_OK;
 227         } else if (srcEncoding == B_MAIL_US_ASCII_CONVERSION) {
 228                 int32 i;
 229                 unsigned char letter;
 230                 copyAmount = *srcLen;
 231                 if (*dstLen < copyAmount)
 232                         copyAmount = *dstLen;
 233                 for (i = 0; i < copyAmount; i++) {
 234                         letter = *src++;
 235                         if (letter > 0x80U)
 236                                 // Invalid, could also use substitute, but better to strip high bit.
 237                                 *dst++ = letter - 0x80U;
 238                         else if (letter == 0x80U)
 239                                 // Can't convert to 0x00 since that's NUL, which would cause problems.
 240                                 *dst++ = substitute;
 241                         else
 242                                 *dst++ = letter;
 243                 }
 244                 *srcLen = copyAmount;
 245                 *dstLen = copyAmount;
 246                 returnCode = B_OK;
 247         } else
 248                 returnCode = convert_to_utf8 (srcEncoding, src, srcLen,
 249                         dst, dstLen, state, substitute);
 250
 251         if (returnCode == B_OK) {
 252                 // Replace spurious NUL bytes, which should normally not be in the
 253                 // output of the decoding (not normal UTF-8 characters, and no NULs are
 254                 // in our usual input strings).  They happen for some odd ISO-2022-JP
 255                 // byte pair combinations which are improperly handled by the BeOS
 256                 // routines.  Like "\e$ByD\e(B" where \e is the ESC character $1B, the
 257                 // first ESC $ B switches to a Japanese character set, then the next
 258                 // two bytes "yD" specify a character, then ESC ( B switches back to
 259                 // the ASCII character set.  The UTF-8 conversion yields a NUL byte.
 260                 int32 i;
 261                 for (i = 0; i < *dstLen; i++)
 262                         if (originalDst[i] == 0)
 263                                 originalDst[i] = substitute;
 264         }
 265         return returnCode;
 266 }
 267
 268
 269 status_t
 270 mail_convert_from_utf8(uint32 dstEncoding, const char *src, int32 *srcLen,
 271         char *dst, int32 *dstLen, int32 *state, char substitute)
 272 {
 273         int32 copyAmount;
 274         status_t errorCode;
 275         int32 originalDstLen = *dstLen;
 276         int32 tempDstLen;
 277         int32 tempSrcLen;
 278
 279         if (dstEncoding == B_MAIL_UTF8_CONVERSION) {
 280                 copyAmount = *srcLen;
 281                 if (*dstLen < copyAmount)
 282                         copyAmount = *dstLen;
 283                 memcpy (dst, src, copyAmount);
 284                 *srcLen = copyAmount;
 285                 *dstLen = copyAmount;
 286                 return B_OK;
 287         }
 288
 289         if (dstEncoding == B_MAIL_US_ASCII_CONVERSION) {
 290                 int32 characterLength;
 291                 int32 dstRemaining = *dstLen;
 292                 unsigned char letter;
 293                 int32 srcRemaining = *srcLen;
 294
 295                 // state contains the number of source bytes to skip, left over from a
 296                 // partial UTF-8 character split over the end of the buffer from last
 297                 // time.
 298                 if (srcRemaining <= *state) {
 299                         *state -= srcRemaining;
 300                         *dstLen = 0;
 301                         return B_OK;
 302                 }
 303                 srcRemaining -= *state;
 304                 src += *state;
 305                 *state = 0;
 306
 307                 while (true) {
 308                         if (srcRemaining <= 0 || dstRemaining <= 0)
 309                                 break;
 310                         letter = *src;
 311                         if (letter < 0x80)
 312                                 characterLength = 1; // Regular ASCII equivalent code.
 313                         else if (letter < 0xC0)
 314                                 characterLength = 1; // Invalid in-between data byte 10xxxxxx.
 315                         else if (letter < 0xE0)
 316                                 characterLength = 2;
 317                         else if (letter < 0xF0)
 318                                 characterLength = 3;
 319                         else if (letter < 0xF8)
 320                                 characterLength = 4;
 321                         else if (letter < 0xFC)
 322                                 characterLength = 5;
 323                         else if (letter < 0xFE)
 324                                 characterLength = 6;
 325                         else
 326                                 characterLength = 1; // 0xFE and 0xFF are invalid in UTF-8.
 327                         if (letter < 0x80)
 328                                 *dst++ = *src;
 329                         else
 330                                 *dst++ = substitute;
 331                         dstRemaining--;
 332                         if (srcRemaining < characterLength) {
 333                                 // Character split past the end of the buffer.
 334                                 *state = characterLength - srcRemaining;
 335                                 srcRemaining = 0;
 336                         } else {
 337                                 src += characterLength;
 338                                 srcRemaining -= characterLength;
 339                         }
 340                 }
 341                 // Update with the amounts used.
 342                 *srcLen = *srcLen - srcRemaining;
 343                 *dstLen = *dstLen - dstRemaining;
 344                 return B_OK;
 345         }
 346
 347         errorCode = convert_from_utf8(dstEncoding, src, srcLen, dst, dstLen, state,
 348                 substitute);
 349         if (errorCode != B_OK)
 350                 return errorCode;
 351
 352         if (dstEncoding != B_JIS_CONVERSION)
 353                 return B_OK;
 354
 355         // B_JIS_CONVERSION (ISO-2022-JP) works by shifting between different
 356         // character subsets.  For E-mail headers (and other uses), it needs to be
 357         // switched back to ASCII at the end (otherwise the last character gets
 358         // lost or other weird things happen in the headers).  Note that we can't
 359         // just append the escape code since the convert_from_utf8 "state" will be
 360         // wrong.  So we append an ASCII letter and throw it away, leaving just the
 361         // escape code.  Well, it actually switches to the Roman character set, not
 362         // ASCII, but that should be OK.
 363
 364         tempDstLen = originalDstLen - *dstLen;
 365         if (tempDstLen < 3) // Not enough space remaining in the output.
 366                 return B_OK; // Sort of an error, but we did convert the rest OK.
 367         tempSrcLen = 1;
 368         errorCode = convert_from_utf8(dstEncoding, "a", &tempSrcLen,
 369                 dst + *dstLen, &tempDstLen, state, substitute);
 370         if (errorCode != B_OK)
 371                 return errorCode;
 372         *dstLen += tempDstLen - 1 /* don't include the ASCII letter */;
 373         return B_OK;
 374 }
 375
 376
 377 ssize_t
 378 rfc2047_to_utf8(char **bufp, size_t *bufLen, size_t strLen)
 379 {
 380         char *head, *tail;
 381         char *charset, *encoding, *end;
 382         ssize_t ret = B_OK;
 383
 384         if (bufp == NULL || *bufp == NULL)
 385                 return -1;
 386
 387         char *string = *bufp;
 388
 389         //---------Handle *&&^%*&^ non-RFC compliant, 8bit mail
 390         if (handle_non_rfc2047_encoding(bufp,bufLen,&strLen))
 391                 return strLen;
 392
 393         // set up string length
 394         if (strLen == 0)
 395                 strLen = strlen(*bufp);
 396         char lastChar = (*bufp)[strLen];
 397         (*bufp)[strLen] = '\0';
 398
 399         //---------Whew! Now for RFC compliant mail
 400         bool encodedWordFoundPreviously = false;
 401         for (head = tail = string;
 402                 ((charset = strstr(tail, "=?")) != NULL)
 403                 && (((encoding = strchr(charset + 2, '?')) != NULL)
 404                         && encoding[1] && (encoding[2] == '?') && encoding[3])
 405                 && (end = strstr(encoding + 3, "?=")) != NULL;
 406                 // found "=?...charset...?e?...text...?=   (e == encoding)
 407                 //        ^charset       ^encoding    ^end
 408                 tail = end)
 409         {
 410                 // Copy non-encoded text (from tail up to charset) to the output.
 411                 // Ignore spaces between two encoded "words".  RFC2047 says the words
 412                 // should be concatenated without the space (designed for Asian
 413                 // sentences which have no spaces yet need to be broken into "words" to
 414                 // keep within the line length limits).
 415                 bool nonSpaceFound = false;
 416                 for (int i = 0; i < charset-tail; i++) {
 417                         if (!isspace (tail[i])) {
 418                                 nonSpaceFound = true;
 419                                 break;
 420                         }
 421                 }
 422                 if (!encodedWordFoundPreviously || nonSpaceFound) {
 423                         if (string != tail && tail != charset)
 424                                 memmove(string, tail, charset-tail);
 425                         string += charset-tail;
 426                 }
 427                 tail = charset;
 428                 encodedWordFoundPreviously = true;
 429
 430                 // move things to point at what they should:
 431                 //   =?...charset...?e?...text...?=   (e == encoding)
 432                 //     ^charset      ^encoding     ^end
 433                 charset += 2;
 434                 encoding += 1;
 435                 end += 2;
 436
 437                 // find the charset this text is in now
 438                 size_t cLen = encoding - 1 - charset;
 439                 bool base64encoded = toupper(*encoding) == 'B';
 440
 441                 uint32 convertID = B_MAIL_NULL_CONVERSION;
 442                 char charsetName[cLen + 1];
 443                 memcpy(charsetName, charset, cLen);
 444                 charsetName[cLen] = '\0';
 445                 if (strcasecmp(charsetName, "us-ascii") == 0) {
 446                         convertID = B_MAIL_US_ASCII_CONVERSION;
 447                 } else if (strcasecmp(charsetName, "utf-8") == 0) {
 448                         convertID = B_MAIL_UTF8_CONVERSION;
 449                 } else {
 450                         const BCharacterSet* charSet
 451                                 = BCharacterSetRoster::FindCharacterSetByName(charsetName);
 452                         if (charSet != NULL) {
 453                                 convertID = charSet->GetConversionID();
 454                         }
 455                 }
 456                 if (convertID == B_MAIL_NULL_CONVERSION) {
 457                         // unidentified charset
 458                         // what to do? doing nothing skips the encoded text;
 459                         // but we should keep it: we copy it to the output.
 460                         if (string != tail && tail != end)
 461                                 memmove(string, tail, end-tail);
 462                         string += end-tail;
 463                         continue;
 464                 }
 465                 // else we've successfully identified the charset
 466
 467                 char *src = encoding+2;
 468                 int32 srcLen = end - 2 - src;
 469                 // encoded text: src..src+srcLen
 470
 471                 // decode text, get decoded length (reducing xforms)
 472                 srcLen = !base64encoded ? decode_qp(src, src, srcLen, 1)
 473                         : decode_base64(src, src, srcLen);
 474
 475                 // allocate space for the converted text
 476                 int32 dstLen = end-string + *bufLen-strLen;
 477                 char *dst = (char*)malloc(dstLen);
 478                 int32 cvLen = srcLen;
 479                 int32 convState = 0;
 480
 481                 //
 482                 // do the conversion
 483                 //
 484                 ret = mail_convert_to_utf8(convertID, src, &cvLen, dst, &dstLen,
 485                         &convState);
 486                 if (ret != B_OK) {
 487                         // what to do? doing nothing skips the encoded text
 488                         // but we should keep it: we copy it to the output.
 489
 490                         free(dst);
 491
 492                         if (string != tail && tail != end)
 493                                 memmove(string, tail, end-tail);
 494                         string += end-tail;
 495                         continue;
 496                 }
 497                 /* convert_to_ is either returning something wrong or my
 498                    test data is screwed up.  Whatever it is, Not Enough
 499                    Space is not the only cause of the below, so we just
 500                    assume it succeeds if it converts anything at all.
 501                 else if (cvLen < srcLen)
 502                 {
 503                         // not enough room to convert the data;
 504                         // grow *buf and retry
 505
 506                         free(dst);
 507
 508                         char *temp = (char*)realloc(*bufp, 2*(*bufLen + 1));
 509                         if (temp == NULL)
 510                         {
 511                                 ret = B_NO_MEMORY;
 512                                 break;
 513                         }
 514
 515                         *bufp = temp;
 516                         *bufLen = 2*(*bufLen + 1);
 517
 518                         string = *bufp + (string-head);
 519                         tail = *bufp + (tail-head);
 520                         charset = *bufp + (charset-head);
 521                         encoding = *bufp + (encoding-head);
 522                         end = *bufp + (end-head);
 523                         src = *bufp + (src-head);
 524                         head = *bufp;
 525                         continue;
 526                 }
 527                 */
 528                 else {
 529                         if (dstLen > end-string) {
 530                                 // copy the string forward...
 531                                 memmove(string+dstLen, end, strLen - (end-head) + 1);
 532                                 strLen += string+dstLen - end;
 533                                 end = string + dstLen;
 534                         }
 535
 536                         memcpy(string, dst, dstLen);
 537                         string += dstLen;
 538                         free(dst);
 539                         continue;
 540                 }
 541         }
 542
 543         // copy everything that's left
 544         size_t tailLen = strLen - (tail - head);
 545         memmove(string, tail, tailLen+1);
 546         string += tailLen;
 547
 548         // replace the last char
 549         (*bufp)[strLen] = lastChar;
 550
 551         return ret < B_OK ? ret : string-head;
 552 }
 553
 554
 555 ssize_t
 556 utf8_to_rfc2047 (char **bufp, ssize_t length, uint32 charset, char encoding)
 557 {
 558         struct word {
 559                 BString originalWord;
 560                 BString convertedWord;
 561                 bool    needsEncoding;
 562
 563                 // Convert the word from UTF-8 to the desired character set.  The
 564                 // converted version also includes the escape codes to return to ASCII
 565                 // mode, if relevant.  Also note if it uses unprintable characters,
 566                 // which means it will need that special encoding treatment later.
 567                 void ConvertWordToCharset (uint32 charset) {
 568                         int32 state = 0;
 569                         int32 originalLength = originalWord.Length();
 570                         int32 convertedLength = originalLength * 5 + 1;
 571                         char *convertedBuffer = convertedWord.LockBuffer (convertedLength);
 572                         mail_convert_from_utf8 (charset, originalWord.String(),
 573                                 &originalLength, convertedBuffer, &convertedLength, &state);
 574                         for (int i = 0; i < convertedLength; i++) {
 575                                 if ((convertedBuffer[i] & (1 << 7)) ||
 576                                         (convertedBuffer[i] >= 0 && convertedBuffer[i] < 32)) {
 577                                         needsEncoding = true;
 578                                         break;
 579                                 }
 580                         }
 581                         convertedWord.UnlockBuffer (convertedLength);
 582                 };
 583         };
 584         struct word *currentWord;
 585         BList words;
 586
 587         // Break the header into words.  White space characters (including tabs and
 588         // newlines) separate the words.  Each word includes any space before it as
 589         // part of the word.  Actually, quotes and other special characters
 590         // (",()<>@) are treated as separate words of their own so that they don't
 591         // get encoded (because MIME headers get the quotes parsed before character
 592         // set unconversion is done).  The reader is supposed to ignore all white
 593         // space between encoded words, which can be inserted so that older mail
 594         // parsers don't have overly long line length problems.
 595
 596         const char *source = *bufp;
 597         const char *bufEnd = *bufp + length;
 598         const char *specialChars = "\"()<>@,";
 599
 600         while (source < bufEnd) {
 601                 currentWord = new struct word;
 602                 currentWord->needsEncoding = false;
 603
 604                 int wordEnd = 0;
 605
 606                 // Include leading spaces as part of the word.
 607                 while (source + wordEnd < bufEnd && isspace (source[wordEnd]))
 608                         wordEnd++;
 609
 610                 if (source + wordEnd < bufEnd &&
 611                         strchr (specialChars, source[wordEnd]) != NULL) {
 612                         // Got a quote mark or other special character, which is treated as
 613                         // a word in itself since it shouldn't be encoded, which would hide
 614                         // it from the mail system.
 615                         wordEnd++;
 616                 } else {
 617                         // Find the end of the word.  Leave wordEnd pointing just after the
 618                         // last character in the word.
 619                         while (source + wordEnd < bufEnd) {
 620                                 if (isspace(source[wordEnd]) ||
 621                                         strchr (specialChars, source[wordEnd]) != NULL)
 622                                         break;
 623                                 if (wordEnd > 51 /* Makes Base64 ISO-2022-JP "word" a multiple of 4 bytes */ &&
 624                                         0xC0 == (0xC0 & (unsigned int) source[wordEnd])) {
 625                                         // No English words are that long (46 is the longest),
 626                                         // break up what is likely Asian text (which has no spaces)
 627                                         // at the start of the next non-ASCII UTF-8 character (high
 628                                         // two bits are both ones).  Note that two encoded words in
 629                                         // a row get joined together, even if there is a space
 630                                         // between them in the final output text, according to the
 631                                         // standard.  Next word will also be conveniently get
 632                                         // encoded due to the 0xC0 test.
 633                                         currentWord->needsEncoding = true;
 634                                         break;
 635                                 }
 636                                 wordEnd++;
 637                         }
 638                 }
 639                 currentWord->originalWord.SetTo (source, wordEnd);
 640                 currentWord->ConvertWordToCharset (charset);
 641                 words.AddItem(currentWord);
 642                 source += wordEnd;
 643         }
 644
 645         // Combine adjacent words which contain unprintable text so that the
 646         // overhead of switching back and forth between regular text and specially
 647         // encoded text is reduced.  However, the combined word must be shorter
 648         // than the maximum of 75 bytes, including character set specification and
 649         // all those delimiters (worst case 22 bytes of overhead).
 650
 651         struct word *run;
 652
 653         for (int32 i = 0; (currentWord = (struct word *) words.ItemAt (i)) != NULL; i++) {
 654                 if (!currentWord->needsEncoding)
 655                         continue; // No need to combine unencoded words.
 656                 for (int32 g = i+1; (run = (struct word *) words.ItemAt (g)) != NULL; g++) {
 657                         if (!run->needsEncoding)
 658                                 break; // Don't want to combine encoded and unencoded words.
 659                         if ((currentWord->convertedWord.Length() + run->convertedWord.Length() <= 53)) {
 660                                 currentWord->originalWord.Append (run->originalWord);
 661                                 currentWord->ConvertWordToCharset (charset);
 662                                 words.RemoveItem(g);
 663                                 delete run;
 664                                 g--;
 665                         } else // Can't merge this word, result would be too long.
 666                                 break;
 667                 }
 668         }
 669
 670         // Combine the encoded and unencoded words into one line, doing the
 671         // quoted-printable or base64 encoding.  Insert an extra space between
 672         // words which are both encoded to make word wrapping easier, since there
 673         // is normally none, and you're allowed to insert space (the receiver
 674         // throws it away if it is between encoded words).
 675
 676         BString rfc2047;
 677         bool    previousWordNeededEncoding = false;
 678
 679         const char *charset_dec = "none-bug";
 680         for (int32 i = 0; mail_charsets[i].charset != NULL; i++) {
 681                 if (mail_charsets[i].flavor == charset) {
 682                         charset_dec = mail_charsets[i].charset;
 683                         break;
 684                 }
 685         }
 686
 687         while ((currentWord = (struct word *)words.RemoveItem((int32)0)) != NULL) {
 688                 if ((encoding != quoted_printable && encoding != base64) ||
 689                 !currentWord->needsEncoding) {
 690                         rfc2047.Append (currentWord->convertedWord);
 691                 } else {
 692                         // This word needs encoding.  Try to insert a space between it and
 693                         // the previous word.
 694                         if (previousWordNeededEncoding)
 695                                 rfc2047 << ' '; // Can insert as many spaces as you want between encoded words.
 696                         else {
 697                                 // Previous word is not encoded, spaces are significant.  Try
 698                                 // to move a space from the start of this word to be outside of
 699                                 // the encoded text, so that there is a bit of space between
 700                                 // this word and the previous one to enhance word wrapping
 701                                 // chances later on.
 702                                 if (currentWord->originalWord.Length() > 1 &&
 703                                         isspace (currentWord->originalWord[0])) {
 704                                         rfc2047 << currentWord->originalWord[0];
 705                                         currentWord->originalWord.Remove (0 /* offset */, 1 /* length */);
 706                                         currentWord->ConvertWordToCharset (charset);
 707                                 }
 708                         }
 709
 710                         char *encoded = NULL;
 711                         ssize_t encoded_len = 0;
 712                         int32 convertedLength = currentWord->convertedWord.Length ();
 713                         const char *convertedBuffer = currentWord->convertedWord.String ();
 714
 715                         switch (encoding) {
 716                                 case quoted_printable:
 717                                         encoded = (char *) malloc (convertedLength * 3);
 718                                         encoded_len = encode_qp (encoded, convertedBuffer, convertedLength, true /* headerMode */);
 719                                         break;
 720                                 case base64:
 721                                         encoded = (char *) malloc (convertedLength * 2);
 722                                         encoded_len = encode_base64 (encoded, convertedBuffer, convertedLength, true /* headerMode */);
 723                                         break;
 724                                 default: // Unknown encoding type, shouldn't happen.
 725                                         encoded = (char *) convertedBuffer;
 726                                         encoded_len = convertedLength;
 727                                         break;
 728                         }
 729
 730                         rfc2047 << "=?" << charset_dec << '?' << encoding << '?';
 731                         rfc2047.Append (encoded, encoded_len);
 732                         rfc2047 << "?=";
 733
 734                         if (encoding == quoted_printable || encoding == base64)
 735                                 free(encoded);
 736                 }
 737                 previousWordNeededEncoding = currentWord->needsEncoding;
 738                 delete currentWord;
 739         }
 740
 741         free(*bufp);
 742
 743         ssize_t finalLength = rfc2047.Length ();
 744         *bufp = (char *) (malloc (finalLength + 1));
 745         memcpy (*bufp, rfc2047.String(), finalLength);
 746         (*bufp)[finalLength] = 0;
 747
 748         return finalLength;
 749 }
 750
 751
 752 void
 753 FoldLineAtWhiteSpaceAndAddCRLF(BString &string)
 754 {
 755         int inputLength = string.Length();
 756         int lineStartIndex;
 757         const int maxLineLength = 78; // Doesn't include CRLF.
 758         BString output;
 759         int splitIndex;
 760         int tempIndex;
 761
 762         lineStartIndex = 0;
 763         while (true) {
 764                 // If we don't need to wrap the text, just output the remainder, if any.
 765
 766                 if (lineStartIndex + maxLineLength >= inputLength) {
 767                         if (lineStartIndex < inputLength) {
 768                                 output.Insert (string, lineStartIndex /* source offset */,
 769                                         inputLength - lineStartIndex /* count */,
 770                                         output.Length() /* insert at */);
 771                                 output.Append (CRLF);
 772                         }
 773                         break;
 774                 }
 775
 776                 // Look ahead for a convenient spot to split it, between a comma and
 777                 // space, which you often see between e-mail addresses like this:
 778                 // "Joe Who" joe@dot.com, "Someone Else" else@blot.com
 779
 780                 tempIndex = lineStartIndex + maxLineLength;
 781                 if (tempIndex > inputLength)
 782                         tempIndex = inputLength;
 783                 splitIndex = string.FindLast (", ", tempIndex);
 784                 if (splitIndex >= lineStartIndex)
 785                         splitIndex++; // Point to the space character.
 786
 787                 // If none of those exist, try splitting at any white space.
 788
 789                 if (splitIndex <= lineStartIndex)
 790                         splitIndex = string.FindLast (" ", tempIndex);
 791                 if (splitIndex <= lineStartIndex)
 792                         splitIndex = string.FindLast ("\t", tempIndex);
 793
 794                 // If none of those exist, allow for a longer word - split at the next
 795                 // available white space.
 796
 797                 if (splitIndex <= lineStartIndex)
 798                         splitIndex = string.FindFirst (" ", lineStartIndex + 1);
 799                 if (splitIndex <= lineStartIndex)
 800                         splitIndex = string.FindFirst ("\t", lineStartIndex + 1);
 801
 802                 // Give up, the whole rest of the line can't be split, just dump it
 803                 // out.
 804
 805                 if (splitIndex <= lineStartIndex) {
 806                         if (lineStartIndex < inputLength) {
 807                                 output.Insert (string, lineStartIndex /* source offset */,
 808                                         inputLength - lineStartIndex /* count */,
 809                                         output.Length() /* insert at */);
 810                                 output.Append (CRLF);
 811                         }
 812                         break;
 813                 }
 814
 815                 // Do the split.  The current line up to but not including the space
 816                 // gets output, followed by a CRLF.  The space remains to become the
 817                 // start of the next line (and that tells the message reader that it is
 818                 // a continuation line).
 819
 820                 output.Insert (string, lineStartIndex /* source offset */,
 821                         splitIndex - lineStartIndex /* count */,
 822                         output.Length() /* insert at */);
 823                 output.Append (CRLF);
 824                 lineStartIndex = splitIndex;
 825         }
 826         string.SetTo (output);
 827 }
 828
 829
 830 ssize_t
 831 readfoldedline(FILE *file, char **buffer, size_t *buflen)
 832 {
 833         ssize_t len = buflen && *buflen ? *buflen : 0;
 834         char * buf = buffer && *buffer ? *buffer : NULL;
 835         ssize_t cnt = 0; // Number of characters currently in the buffer.
 836         int c;
 837
 838         while (true) {
 839                 // Make sure there is space in the buffer for two more characters (one
 840                 // for the next character, and one for the end of string NUL byte).
 841                 if (buf == NULL || cnt + 2 >= len) {
 842                         char *temp = (char *)realloc(buf, len + 64);
 843                         if (temp == NULL) {
 844                                 // Out of memory, however existing buffer remains allocated.
 845                                 cnt = ENOMEM;
 846                                 break;
 847                         }
 848                         len += 64;
 849                         buf = temp;
 850                 }
 851
 852                 // Read the next character, or end of file, or IO error.
 853                 if ((c = fgetc(file)) == EOF) {
 854                         if (ferror (file)) {
 855                                 cnt = errno;
 856                                 if (cnt >= 0)
 857                                         cnt = -1; // Error codes must be negative.
 858                         } else {
 859                                 // Really is end of file.  Also make it end of line if there is
 860                                 // some text already read in.  If the first thing read was EOF,
 861                                 // just return an empty string.
 862                                 if (cnt > 0) {
 863                                         buf[cnt++] = '\n';
 864                                         if (buf[cnt-2] == '\r') {
 865                                                 buf[cnt-2] = '\n';
 866                                                 --cnt;
 867                                         }
 868                                 }
 869                         }
 870                         break;
 871                 }
 872
 873                 buf[cnt++] = c;
 874
 875                 if (c == '\n') {
 876                         // Convert CRLF end of line to just a LF.  Do it before folding, in
 877                         // case we don't need to fold.
 878                         if (cnt >= 2 && buf[cnt-2] == '\r') {
 879                                 buf[cnt-2] = '\n';
 880                                 --cnt;
 881                         }
 882                         // If the current line is empty then return it (so that empty lines
 883                         // don't disappear if the next line starts with a space).
 884                         if (cnt <= 1)
 885                                 break;
 886                         // Fold if first character on the next line is whitespace.
 887                         c = fgetc(file); // Note it's OK to read EOF and ungetc it too.
 888                         if (c == ' ' || c == '\t')
 889                                 buf[cnt-1] = c; // Replace \n with the white space character.
 890                         else {
 891                                 // Not folding, we finished reading a line; break out of the loop
 892                                 ungetc(c,file);
 893                                 break;
 894                         }
 895                 }
 896         }
 897
 898         if (buf != NULL && cnt >= 0)
 899                 buf[cnt] = '\0';
 900
 901         if (buffer)
 902                 *buffer = buf;
 903         else if (buf)
 904                 free(buf);
 905
 906         if (buflen)
 907                 *buflen = len;
 908
 909         return cnt;
 910 }
 911
 912
 913 ssize_t
 914 readfoldedline(BPositionIO &in, char **buffer, size_t *buflen)
 915 {
 916         ssize_t len = buflen && *buflen ? *buflen : 0;
 917         char * buf = buffer && *buffer ? *buffer : NULL;
 918         ssize_t cnt = 0; // Number of characters currently in the buffer.
 919         char c;
 920         status_t errorCode;
 921
 922         while (true) {
 923                 // Make sure there is space in the buffer for two more characters (one
 924                 // for the next character, and one for the end of string NUL byte).
 925                 if (buf == NULL || cnt + 2 >= len) {
 926                         char *temp = (char *)realloc(buf, len + 64);
 927                         if (temp == NULL) {
 928                                 // Out of memory, however existing buffer remains allocated.
 929                                 cnt = ENOMEM;
 930                                 break;
 931                         }
 932                         len += 64;
 933                         buf = temp;
 934                 }
 935
 936                 errorCode = in.Read (&c,1); // A really slow way of reading - unbuffered.
 937                 if (errorCode != 1) {
 938                         if (errorCode < 0) {
 939                                 cnt = errorCode; // IO error encountered, just return the code.
 940                         } else {
 941                                 // Really is end of file.  Also make it end of line if there is
 942                                 // some text already read in.  If the first thing read was EOF,
 943                                 // just return an empty string.
 944                                 if (cnt > 0) {
 945                                         buf[cnt++] = '\n';
 946                                         if (buf[cnt-2] == '\r') {
 947                                                 buf[cnt-2] = '\n';
 948                                                 --cnt;
 949                                         }
 950                                 }
 951                         }
 952                         break;
 953                 }
 954
 955                 buf[cnt++] = c;
 956
 957                 if (c == '\n') {
 958                         // Convert CRLF end of line to just a LF.  Do it before folding, in
 959                         // case we don't need to fold.
 960                         if (cnt >= 2 && buf[cnt-2] == '\r') {
 961                                 buf[cnt-2] = '\n';
 962                                 --cnt;
 963                         }
 964                         // If the current line is empty then return it (so that empty lines
 965                         // don't disappear if the next line starts with a space).
 966                         if (cnt <= 1)
 967                                 break;
 968                         // if first character on the next line is whitespace, fold lines
 969                         errorCode = in.Read(&c,1);
 970                         if (errorCode == 1) {
 971                                 if (c == ' ' || c == '\t')
 972                                         buf[cnt-1] = c; // Replace \n with the white space character.
 973                                 else {
 974                                         // Not folding, we finished reading a whole line.
 975                                         in.Seek(-1,SEEK_CUR); // Undo the look-ahead character read.
 976                                         break;
 977                                 }
 978                         } else if (errorCode < 0) {
 979                                 cnt = errorCode;
 980                                 break;
 981                         } else // No next line; at the end of the file.  Return the line.
 982                                 break;
 983                 }
 984         }
 985
 986         if (buf != NULL && cnt >= 0)
 987                 buf[cnt] = '\0';
 988
 989         if (buffer)
 990                 *buffer = buf;
 991         else if (buf)
 992                 free(buf);
 993
 994         if (buflen)
 995                 *buflen = len;
 996
 997         return cnt;
 998 }
 999
1000
/*!     Extracts the next logical line from the NUL terminated text that
        \a *header points at, joining folded lines (a line break followed by a
        space or tab) into a single line.  A CRLF line ending is turned into a
        plain LF, and a final line without line ending gets an LF appended.

        \param header In: position to start reading at.  Out: position of the
                first character after the line.  At the end of the text it is left
                pointing at the terminating NUL, never beyond it, so calling the
                function again simply returns 0.
        \param buffer In: NULL, or pointer to a buffer that can be passed to
                realloc() (or to NULL).  Out: the possibly reallocated buffer holding
                the NUL terminated line.  If \a buffer itself is NULL the line is
                extracted and then thrown away.
        \param buflen In: size of \a *buffer, if any.  Out: its new size.
        \return Number of characters in the line, including the trailing LF;
                0 at the end of the text.  If the buffer can't be grown, ENOMEM.
*/
ssize_t
nextfoldedline(const char** header, char **buffer, size_t *buflen)
{
        ssize_t len = buflen && *buflen ? *buflen : 0;
        char * buf = buffer && *buffer ? *buffer : NULL;
        ssize_t cnt = 0; // Number of characters currently in the buffer.
        char c;

        while (true) {
                // Make sure there is space in the buffer for two more characters (one
                // for the next character, and one for the end of string NUL byte).
                if (buf == NULL || cnt + 2 >= len) {
                        char *temp = (char *)realloc(buf, len + 64);
                        if (temp == NULL) {
                                // Out of memory, however existing buffer remains allocated.
                                // NOTE(review): this only reads as an error to the caller
                                // if ENOMEM is negative - confirm for the target platform.
                                cnt = ENOMEM;
                                break;
                        }
                        len += 64;
                        buf = temp;
                }

                // Peek at the next character before consuming it.  The terminating
                // NUL must not be stepped over, otherwise the next call would start
                // reading behind the end of the caller's string.
                c = **header;
                if (c == 0) {
                        // End of text.  Also make it end of line if there is some text
                        // already read in.  If the first thing read was the NUL, just
                        // return an empty string.
                        if (cnt > 0) {
                                buf[cnt++] = '\n';
                                if (buf[cnt-2] == '\r') {
                                        buf[cnt-2] = '\n';
                                        --cnt;
                                }
                        }
                        break;
                }
                (*header)++;

                buf[cnt++] = c;

                if (c == '\n') {
                        // Convert CRLF end of line to just a LF.  Do it before folding, in
                        // case we don't need to fold.
                        if (cnt >= 2 && buf[cnt-2] == '\r') {
                                buf[cnt-2] = '\n';
                                --cnt;
                        }
                        // If the current line is empty then return it (so that empty lines
                        // don't disappear if the next line starts with a space).
                        if (cnt <= 1)
                                break;
                        // If the first character on the next line is white space, fold
                        // the lines.  Anything else (including the terminating NUL) is
                        // left unconsumed for the next call.
                        c = **header;
                        if (c != ' ' && c != '\t')
                                break;
                        (*header)++;
                        buf[cnt-1] = c; // Replace \n with the white space character.
                }
        }

        if (buf != NULL && cnt >= 0)
                buf[cnt] = '\0';

        if (buffer)
                *buffer = buf;
        else if (buf)
                free(buf);

        if (buflen)
                *buflen = len;

        return cnt;
}
1079
1080
1081 void
1082 trim_white_space(BString &string)
1083 {
1084         int32 i;
1085         int32 length = string.Length();
1086         char *buffer = string.LockBuffer(length + 1);
1087
1088         while (length > 0 && isspace(buffer[length - 1]))
1089                 length--;
1090         buffer[length] = '\0';
1091
1092         for (i = 0; buffer[i] && isspace(buffer[i]); i++) {}
1093         if (i != 0) {
1094                 length -= i;
1095                 memmove(buffer,buffer + i,length + 1);
1096         }
1097         string.UnlockBuffer(length);
1098 }
1099
1100
1101 /*!     Tries to return a human-readable name from the specified
1102         header parameter (should be from "To:" or "From:").
1103         Tries to return the name rather than the eMail address.
1104 */
1105 void
1106 extract_address_name(BString &header)
1107 {
1108         BString name;
1109         const char *start = header.String();
1110         const char *stop = start + strlen (start);
1111
1112         // Find a string S in the header (email foo) that matches:
1113         //   Old style name in brackets: foo@bar.com (S)
1114         //   New style quotes: "S" <foo@bar.com>
1115         //   New style no quotes if nothing else found: S <foo@bar.com>
1116         //   If nothing else found then use the whole thing: S
1117
1118         for (int i = 0; i <= 3; i++) {
1119                 // Set p1 to the first letter in the name and p2 to just past the last
1120                 // letter in the name.  p2 stays NULL if a name wasn't found in this
1121                 // pass.
1122                 const char *p1 = NULL, *p2 = NULL;
1123
1124                 switch (i) {
1125                         case 0: // foo@bar.com (S)
1126                                 if ((p1 = strchr(start,'(')) != NULL) {
1127                                         p1++; // Advance to first letter in the name.
1128                                         size_t nest = 1; // Handle nested brackets.
1129                                         for (p2 = p1; p2 < stop; ++p2)
1130                                         {
1131                                                 if (*p2 == ')')
1132                                                         --nest;
1133                                                 else if (*p2 == '(')
1134                                                         ++nest;
1135                                                 if (nest <= 0)
1136                                                         break;
1137                                         }
1138                                         if (nest != 0)
1139                                                 p2 = NULL; // False alarm, no terminating bracket.
1140                                 }
1141                                 break;
1142                         case 1: // "S" <foo@bar.com>
1143                                 if ((p1 = strchr(start, '\"')) != NULL)
1144                                         p2 = strchr(++p1, '\"');
1145                                 break;
1146                         case 2: // S <foo@bar.com>
1147                                 p1 = start;
1148                                 if (name.Length() == 0)
1149                                         p2 = strchr(start, '<');
1150                                 break;
1151                         case 3: // S
1152                                 p1 = start;
1153                                 if (name.Length() == 0)
1154                                         p2 = stop;
1155                                 break;
1156                 }
1157
1158                 // Remove leading and trailing space-like characters and save the
1159                 // result if it is longer than any other likely names found.
1160                 if (p2 != NULL) {
1161                         while (p1 < p2 && (isspace (*p1)))
1162                                 ++p1;
1163
1164                         while (p1 < p2 && (isspace (p2[-1])))
1165                                 --p2;
1166
1167                         int newLength = p2 - p1;
1168                         if (name.Length() < newLength)
1169                                 name.SetTo(p1, newLength);
1170                 }
1171         }
1172
1173         int32 lessIndex = name.FindFirst('<');
1174         int32 greaterIndex = name.FindLast('>');
1175
1176         if (lessIndex == 0) {
1177                 // Have an address of the form <address> and nothing else, so remove
1178                 // the greater and less than signs, if any.
1179                 if (greaterIndex > 0)
1180                         name.Remove(greaterIndex, 1);
1181                 name.Remove(lessIndex, 1);
1182         } else if (lessIndex > 0 && lessIndex < greaterIndex) {
1183                 // Yahoo stupidly inserts the e-mail address into the name string, so
1184                 // this bit of code fixes: "Joe <joe@yahoo.com>" <joe@yahoo.com>
1185                 name.Remove(lessIndex, greaterIndex - lessIndex + 1);
1186         }
1187
1188         trim_white_space(name);
1189         header = name;
1190 }
1191
1192
/*!     Given a subject in a BString, remove the extraneous RE: re: and other stuff
        to get down to the core subject string, which should be identical for all
        messages posted about a topic.  The input string is modified in place to
        become the output core subject string.

        The work is done with a regular expression that is compiled on first use
        and kept in gRe/gRebuf (declared outside of this function).  If the
        expression can't be compiled, only leading and trailing white space is
        removed.
*/
void
SubjectToThread (BString &string)
{
// A regex that matches a single non-ASCII UTF8 character (one alternative
// per lead byte range):
#define U8C \
        "[\302-\337][\200-\277]" \
        "|\340[\302-\337][\200-\277]" \
        "|[\341-\357][\200-\277][\200-\277]" \
        "|\360[\220-\277][\200-\277][\200-\277]" \
        "|[\361-\367][\200-\277][\200-\277][\200-\277]" \
        "|\370[\210-\277][\200-\277][\200-\277][\200-\277]" \
        "|[\371-\373][\200-\277][\200-\277][\200-\277][\200-\277]" \
        "|\374[\204-\277][\200-\277][\200-\277][\200-\277][\200-\277]" \
        "|\375[\200-\277][\200-\277][\200-\277][\200-\277][\200-\277]"

// The things that get stripped from a subject, one alternative per line:
//  - spaces at the very start,
//  - a leading "[...]" tag (sub-expression 1) together with what follows it
//    (sub-expression 2): a run of spaces or reply-style prefixes,
//  - a run of spaces or reply-style prefixes at the very start,
//  - a trailing "(fwd)".
// A reply-style prefix is two or three word/non-ASCII characters, optionally
// followed by something in square brackets, followed by a colon.
#define PATTERN \
        "^ +" \
        "|^(\\[[^]]*\\])(\\<|  +| *(\\<(\\w|" U8C "){2,3} *(\\[[^\\]]*\\])? *:)+ *)" \
        "|^(  +| *(\\<(\\w|" U8C "){2,3} *(\\[[^\\]]*\\])? *:)+ *)" \
        "| *\\(fwd\\) *$"

        // Only the caller that finds gLocker at 0 compiles the pattern
        // (atomic_add() presumably returns the previous value - confirm); all
        // other callers arriving before gRebuf is set wait in the else branch.
        if (gRebuf == NULL && atomic_add(&gLocker, 1) == 0) {
                // the idea is to compile the regexp once to speed up testing

                // Translation table that maps lower case ASCII letters to upper
                // case and leaves every other byte alone.
                for (int i=0; i<256; ++i) gTranslation[i]=i;
                for (int i='a'; i<='z'; ++i) gTranslation[i]=toupper(i);

                gRe.translate = gTranslation;
                gRe.regs_allocated = REGS_FIXED;
                        // The register arrays are supplied by this function (see
                        // below).
                re_syntax_options = RE_SYNTAX_POSIX_EXTENDED;

                const char *pattern = PATTERN;
                // count subexpressions in PATTERN
                // (every '(' that isn't preceded by a backslash adds to gNsub)
                for (unsigned int i=0; pattern[i] != 0; ++i)
                {
                        if (pattern[i] == '\\')
                                ++i;
                        else if (pattern[i] == '(')
                                ++gNsub;
                }

                // gRebuf is only set on success, it doubles as the "ready" flag.
                const char *err = re_compile_pattern(pattern,strlen(pattern),&gRe);
                if (err == NULL)
                        gRebuf = &gRe;
                else
                        fprintf(stderr, "Failed to compile the regex: %s\n", err);
        } else {
                // Someone else is (or was) in charge of compiling.  Poll for the
                // result, at most 200 times with a snooze(10000) in between, then
                // carry on regardless.
                int32 tries = 200;
                while (gRebuf == NULL && tries-- > 0)
                        snooze(10000);
        }

        if (gRebuf) {
                struct re_registers regs;
                // can't be static if this function is to be thread-safe

                // gNsub register slots per array.
                // NOTE(review): the malloc() results are not checked.
                regs.num_regs = gNsub;
                regs.start = (regoff_t*)malloc(gNsub*sizeof(regoff_t));
                regs.end = (regoff_t*)malloc(gNsub*sizeof(regoff_t));

                // Search the whole string again after every removal, until the
                // pattern doesn't match anywhere any more.
                for (int start = 0; (start = re_search(gRebuf, string.String(),
                                string.Length(), 0, string.Length(), &regs)) >= 0;) {
                        // We found something: regs.start[0]/regs.end[0] delimit the
                        // whole match.

                        // don't delete [bemaildaemon]...
                        // If the match begins with the bracketed tag (sub-expression
                        // 1), only remove what follows the tag (sub-expression 2).
                        if (start == regs.start[1])
                                start = regs.start[2];

                        // Cut the matched text out and, unless that happened at the
                        // very start of the subject, put a single space in its place.
                        string.Remove(start,regs.end[0]-start);
                        if (start)
                                string.Insert(' ',1,start);

                        // TODO: for some subjects this results in an endless loop, check
                        // why this happen.
                        // Until then, stop as soon as a pass removed at most one
                        // character.
                        if (regs.end[0] - start <= 1)
                                break;
                }

                free(regs.start);
                free(regs.end);
        }

        // Finally remove leading and trailing space.  Some software, like
        // tm-edit 1.8, appends a space to the subject, which would break
        // threading if we left it in.
        trim_white_space(string);
}
1287
1288
1289 /*!     Converts a date to a time.  Handles numeric time zones too, unlike
1290         parsedate().  Returns -1 if it fails.
1291 */
1292 time_t
1293 ParseDateWithTimeZone(const char *DateString)
1294 {
1295         time_t currentTime;
1296         time_t dateAsTime;
1297         char tempDateString[80];
1298         char tempZoneString[6];
1299         time_t zoneDeltaTime;
1300         int zoneIndex;
1301         char *zonePntr;
1302
1303         // See if we can remove the time zone portion.  parsedate understands time
1304         // zone 3 letter names, but doesn't understand the numeric +9999 time zone
1305         // format.  To do: see if a newer parsedate exists.
1306
1307         strncpy (tempDateString, DateString, sizeof (tempDateString));
1308         tempDateString[sizeof (tempDateString) - 1] = 0;
1309
1310         // Remove trailing spaces.
1311         zonePntr = tempDateString + strlen (tempDateString) - 1;
1312         while (zonePntr >= tempDateString && isspace (*zonePntr))
1313                 *zonePntr-- = 0;
1314         if (zonePntr < tempDateString)
1315                 return -1; // Empty string.
1316
1317         // Remove the trailing time zone in round brackets, like in
1318         // Fri, 22 Feb 2002 15:22:42 EST (-0500)
1319         // Thu, 25 Apr 1996 11:44:19 -0400 (EDT)
1320         if (tempDateString[strlen(tempDateString)-1] == ')')
1321         {
1322                 zonePntr = strrchr (tempDateString, '(');
1323                 if (zonePntr != NULL)
1324                 {
1325                         *zonePntr-- = 0; // Zap the '(', then remove trailing spaces.
1326                         while (zonePntr >= tempDateString && isspace (*zonePntr))
1327                                 *zonePntr-- = 0;
1328                         if (zonePntr < tempDateString)
1329                                 return -1; // Empty string.
1330                 }
1331         }
1332
1333         // Look for a numeric time zone like  Tue, 30 Dec 2003 05:01:40 +0000
1334         for (zoneIndex = strlen (tempDateString); zoneIndex >= 0; zoneIndex--)
1335         {
1336                 zonePntr = tempDateString + zoneIndex;
1337                 if (zonePntr[0] == '+' || zonePntr[0] == '-')
1338                 {
1339                         if (zonePntr[1] >= '0' && zonePntr[1] <= '9' &&
1340                                 zonePntr[2] >= '0' && zonePntr[2] <= '9' &&
1341                                 zonePntr[3] >= '0' && zonePntr[3] <= '9' &&
1342                                 zonePntr[4] >= '0' && zonePntr[4] <= '9')
1343                                 break;
1344                 }
1345         }
1346         if (zoneIndex >= 0)
1347         {
1348                 // Remove the zone from the date string and any following time zone
1349                 // letter codes.  Also put in GMT so that the date gets parsed as GMT.
1350                 memcpy (tempZoneString, zonePntr, 5);
1351                 tempZoneString [5] = 0;
1352                 strcpy (zonePntr, "GMT");
1353         }
1354         else // No numeric time zone found.
1355                 strcpy (tempZoneString, "+0000");
1356
1357         time (&currentTime);
1358         dateAsTime = parsedate (tempDateString, currentTime);
1359         if (dateAsTime == (time_t) -1)
1360                 return -1; // Failure.
1361
1362         zoneDeltaTime = 60 * atol (tempZoneString + 3); // Get the last two digits - minutes.
1363         tempZoneString[3] = 0;
1364         zoneDeltaTime += atol (tempZoneString + 1) * 60 * 60; // Get the first two digits - hours.
1365         if (tempZoneString[0] == '+')
1366                 zoneDeltaTime = 0 - zoneDeltaTime;
1367         dateAsTime += zoneDeltaTime;
1368
1369         return dateAsTime;
1370 }
1371
1372
1373 /*! Parses a mail header and fills the headers BMessage
1374 */
1375 status_t
1376 parse_header(BMessage &headers, BPositionIO &input)
1377 {
1378         char *buffer = NULL;
1379         size_t bufferSize = 0;
1380         int32 length;
1381
1382         while ((length = readfoldedline(input, &buffer, &bufferSize)) >= 2) {
1383                 --length;
1384                         // Don't include the \n at the end of the buffer.
1385
1386                 // convert to UTF-8 and null-terminate the buffer
1387                 length = rfc2047_to_utf8(&buffer, &bufferSize, length);
1388                 buffer[length] = '\0';
1389
1390                 const char *delimiter = strstr(buffer, ":");
1391                 if (delimiter == NULL)
1392                         continue;
1393
1394                 BString header(buffer, delimiter - buffer);
1395                 header.CapitalizeEachWord();
1396                         // unified case for later fetch
1397
1398                 delimiter++; // Skip the colon.
1399                 // Skip over leading white space and tabs.
1400                 // TODO: (comments in brackets).
1401                 while (isspace(*delimiter))
1402                         delimiter++;
1403
1404                 // TODO: implement joining of multiple header tags (i.e. multiple "Cc:"s)
1405                 headers.AddString(header.String(), delimiter);
1406         }
1407         free(buffer);
1408
1409         return B_OK;
1410 }
1411
1412
1413 status_t
1414 extract_from_header(const BString& header, const BString& field,
1415         BString& target)
1416 {
1417         int32 headerLength = header.Length();
1418         int32 fieldEndPos = 0;
1419         while (true) {
1420                 int32 pos = header.IFindFirst(field, fieldEndPos);
1421                 if (pos < 0)
1422                         return B_BAD_VALUE;
1423                 fieldEndPos = pos + field.Length();
1424
1425                 if (pos != 0 && header.ByteAt(pos - 1) != '\n')
1426                         continue;
1427                 if (header.ByteAt(fieldEndPos) == ':')
1428                         break;
1429         }
1430         fieldEndPos++;
1431
1432         int32 crPos = fieldEndPos;
1433         while (true) {
1434                 fieldEndPos = crPos;
1435                 crPos = header.FindFirst('\n', crPos);
1436                 if (crPos < 0)
1437                         crPos = headerLength;
1438                 BString temp;
1439                 header.CopyInto(temp, fieldEndPos, crPos - fieldEndPos);
1440                 if (header.ByteAt(crPos - 1) == '\r') {
1441                         temp.Truncate(temp.Length() - 1);
1442                         temp += " ";
1443                 }
1444                 target += temp;
1445                 crPos++;
1446                 if (crPos >= headerLength)
1447                         break;
1448                 char nextByte = header.ByteAt(crPos);
1449                 if (nextByte != ' ' && nextByte != '\t')
1450                         break;
1451                 crPos++;
1452         }
1453
1454         size_t bufferSize = target.Length();
1455         char* buffer = target.LockBuffer(bufferSize);
1456         size_t length = rfc2047_to_utf8(&buffer, &bufferSize, bufferSize);
1457         target.UnlockBuffer(length);
1458
1459         trim_white_space(target);
1460
1461         return B_OK;
1462 }
1463
1464
1465 void
1466 extract_address(BString &address)
1467 {
1468         const char *string = address.String();
1469         int32 first;
1470
1471         // first, remove all quoted text
1472
1473         if ((first = address.FindFirst('"')) >= 0) {
1474                 int32 last = first + 1;
1475                 while (string[last] && string[last] != '"')
1476                         last++;
1477
1478                 if (string[last] == '"')
1479                         address.Remove(first, last + 1 - first);
1480         }
1481
1482         // try to extract the address now
1483
1484         if ((first = address.FindFirst('<')) >= 0) {
1485                 // the world likes us and we can just get the address the easy way...
1486                 int32 last = address.FindFirst('>');
1487                 if (last >= 0) {
1488                         address.Truncate(last);
1489                         address.Remove(0, first + 1);
1490
1491                         return;
1492                 }
1493         }
1494
1495         // then, see if there is anything in parenthesis to throw away
1496
1497         if ((first = address.FindFirst('(')) >= 0) {
1498                 int32 last = first + 1;
1499                 while (string[last] && string[last] != ')')
1500                         last++;
1501
1502                 if (string[last] == ')')
1503                         address.Remove(first, last + 1 - first);
1504         }
1505
1506         // now, there shouldn't be much else left
1507
1508         trim_white_space(address);
1509 }
1510
1511
1512 void
1513 get_address_list(BList &list, const char *string,
1514         void (*cleanupFunc)(BString &))
1515 {
1516         if (string == NULL || !string[0])
1517                 return;
1518
1519         const char *start = string;
1520
1521         while (true) {
1522                 if (string[0] == '"') {
1523                         const char *quoteEnd = ++string;
1524
1525                         while (quoteEnd[0] && quoteEnd[0] != '"')
1526                                 quoteEnd++;
1527
1528                         if (!quoteEnd[0])       // string exceeds line!
1529                                 quoteEnd = string;
1530
1531                         string = quoteEnd + 1;
1532                 }
1533
1534                 if (string[0] == ',' || string[0] == '\0') {
1535                         BString address(start, string - start);
1536                         trim_white_space(address);
1537
1538                         if (cleanupFunc)
1539                                 cleanupFunc(address);
1540
1541                         list.AddItem(strdup(address.String()));
1542
1543                         start = string + 1;
1544                 }
1545
1546                 if (!string[0])
1547                         break;
1548
1549                 string++;
1550         }
1551 }
1552
1553
1554 status_t
1555 CopyMailFolderAttributes(const char* targetPath)
1556 {
1557         BPath path;
1558         status_t status = find_directory(B_USER_SETTINGS_DIRECTORY, &path);
1559         if (status != B_OK)
1560                 return status;
1561
1562         path.Append("Tracker");
1563         path.Append("DefaultQueryTemplates");
1564         path.Append("text_x-email");
1565
1566         BNode source(path.Path());
1567         BNode target(targetPath);
1568         return BPrivate::CopyAttributes(source, target);
1569 }