include/tools/inetmime.hxx

   1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
   2 /*
   3  * This file is part of the LibreOffice project.
   4  *
   5  * This Source Code Form is subject to the terms of the Mozilla Public
   6  * License, v. 2.0. If a copy of the MPL was not distributed with this
   7  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
   8  *
   9  * This file incorporates work covered by the following license notice:
  10  *
  11  *   Licensed to the Apache Software Foundation (ASF) under one or more
  12  *   contributor license agreements. See the NOTICE file distributed
  13  *   with this work for additional information regarding copyright
  14  *   ownership. The ASF licenses this file to you under the Apache
  15  *   License, Version 2.0 (the "License"); you may not use this file
  16  *   except in compliance with the License. You may obtain a copy of
  17  *   the License at http://www.apache.org/licenses/LICENSE-2.0 .
  18  */
  19 #ifndef INCLUDED_TOOLS_INETMIME_HXX
  20 #define INCLUDED_TOOLS_INETMIME_HXX
  21
  22 #include <boost/ptr_container/ptr_vector.hpp>
  23
  24 #include <tools/toolsdllapi.h>
  25 #include <rtl/alloc.h>
  26 #include <rtl/character.hxx>
  27 #include <rtl/string.hxx>
  28 #include <rtl/strbuf.hxx>
  29 #include <rtl/ustring.hxx>
  30 #include <rtl/tencinfo.h>
  31 #include <tools/debug.hxx>
  32 #include <tools/errcode.hxx>
  33
  34 class DateTime;
  35 class INetContentTypeParameterList;
  36 class INetMIMECharsetList_Impl;
  37 class INetMIMEOutputSink;
  38
  39 class TOOLS_DLLPUBLIC INetMIME
  40 {
  41 public:
  42     enum { SOFT_LINE_LENGTH_LIMIT = 76,
  43            HARD_LINE_LENGTH_LIMIT = 998 };
  44
  45     /** The various types of message header field bodies, with respect to
  46         encoding and decoding them.
  47
  48         @descr  At the moment, five different types of header fields suffice
  49         to describe how to encode and decode any known message header field
  50         body, but need for more types may arise in the future as new header
  51         fields are introduced.
  52
  53         @descr  The following is an exhaustive list of all the header fields
  54         currently known to our implementation.  For every header field, it
  55         includes a 'canonic' (with regard to capitalization) name, a grammar
  56         rule for the body (using RFC 822 and RFC 2234 conventions), a list of
  57         relevant sources of information, and the HeaderFieldType value to use
  58         with that header field.  The list is based on RFC 2076 and draft-
  59         palme-mailext-headers-02.txt (see also <http://www.dsv.su.se/~jpalme/
  60         ietf/jp-ietf-home.html#anchor1003783>).
  61
  62         Approved: address  ;RFC 1036; HEADER_FIELD_ADDRESS
  63         bcc: #address  ;RFCs 822, 2047; HEADER_FIELD_ADDRESS
  64         cc: 1#address  ;RFCs 822, 2047; HEADER_FIELD_ADDRESS
  65         Comments: *text  ;RFCs 822, RFC 2047; HEADER_FIELD_TEXT
  66         Content-Base: absoluteURI  ;RFC 2110; HEADER_FIELD_TEXT
  67         Content-Description: *text  ;RFC 2045, RFC 2047; HEADER_FIELD_TEXT
  68         Content-Disposition: disposition-type *(";" disposition-parm)
  69             ;RFC 1806; HEADER_FIELD_STRUCTURED
  70         Content-ID: msg-id  ;RFC 2045, RFC 2047; HEADER_FIELD_MESSAGE_ID
  71         Content-Location: absoluteURI / relativeURI  ;RFC 2110;
  72             HEADER_FIELD_TEXT
  73         Content-Transfer-Encoding: mechanism  ;RFC 2045, RFC 2047;
  74             HEADER_FIELD_STRUCTURED
  75         Content-Type: type "/" subtype *(";" parameter)  ;RFC 2045, RFC 2047;
  76             HEADER_FIELD_STRUCTURED
  77         Control:  *text ;RFC 1036; HEADER_FIELD_TEXT
  78         Date: date-time  ;RFC 822, RFC 1123, RFC 2047; HEADER_FIELD_STRUCTURED
  79         Distribution: 1#atom  ;RFC 1036; HEADER_FIELD_STRUCTURED
  80         Encrypted: 1#2word  ;RFC 822, RFC 2047; HEADER_FIELD_STRUCTURED
  81         Expires: date-time  ;RFC 1036; HEADER_FIELD_STRUCTURED
  82         Followup-To: 1#(atom *("." atom))  ;RFC 1036; HEADER_FIELD_STRUCTURED
  83         From: mailbox / 1#mailbox  ;RFC 822, RFC 2047; HEADER_FIELD_ADDRESS
  84         In-Reply-To: *(phrase / msg-id)  ;RFC 822, RFC 2047;
  85             HEADER_FIELD_ADDRESS
  86         Keywords: #phrase  ;RFC 822, RFC 2047; HEADER_FIELD_PHRASE
  87         MIME-Version: 1*DIGIT "." 1*DIGIT  ;RFC 2045, RFC 2047;
  88             HEADER_FIELD_STRUCTURED
  89         Message-ID: msg-id  ;RFC 822, RFC 2047; HEADER_FIELD_MESSAGE_ID
  90         Newsgroups: 1#(atom *("." atom))  ;RFC 1036, RFC 2047;
  91             HEADER_FIELD_STRUCTURED
  92         Organization: *text  ;RFC 1036; HEADER_FIELD_TEXT
  93         Received: ["from" domain] ["by" domain] ["via" atom] *("with" atom)
  94             ["id" msg-id] ["for" addr-spec] ";" date-time  ;RFC 822, RFC 1123,
  95             RFC 2047; HEADER_FIELD_STRUCTURED
  96         References: *(phrase / msg-id)  ;RFC 822, RFC 2047;
  97             HEADER_FIELD_ADDRESS
  98         Reply-To: 1#address  ;RFC 822, RFC 2047; HEADER_FIELD_ADDRESS
  99         Resent-Date: date-time  ;RFC 822, RFC 1123, RFC 2047;
 100             HEADER_FIELD_STRUCTURED
 101         Resent-From: mailbox / 1#mailbox  ;RFC 822, RFC 2047;
 102             HEADER_FIELD_ADDRESS
 103         Resent-Message-ID: msg-id  ;RFC 822, RFC 2047; HEADER_FIELD_MESSAGE_ID
 104         Resent-Reply-To: 1#address  ;RFC 822, RFC 2047; HEADER_FIELD_ADDRESS
 105         Resent-Sender: mailbox  ;RFC 822, RFC 2047; HEADER_FIELD_ADDRESS
 106         Resent-To: 1#address  ;RFC 822, RFC 2047; HEADER_FIELD_ADDRESS
 107         Resent-bcc: #address  ;RFC 822, RFC 2047; HEADER_FIELD_ADDRESS
 108         Resent-cc: 1#address  ;RFC 822, RFC 2047; HEADER_FIELD_ADDRESS
 109         Return-path: route-addr / ("<" ">")  ;RFC 822, RFC 1123, RFC 2047;
 110             HEADER_FIELD_STRUCTURED
 111         Return-Receipt-To: address  ;Not Internet standard;
 112             HEADER_FIELD_ADDRESS
 113         Sender: mailbox  ;RFC 822, RFC 2047; HEADER_FIELD_ADDRESS
 114         Subject: *text  ;RFC 822, RFC 2047; HEADER_FIELD_TEXT
 115         Summary: *text  ;RFC 1036; HEADER_FIELD_TEXT
 116         To: 1#address  ;RFC 822, RFC 2047; HEADER_FIELD_ADDRESS
 117         X-CHAOS-Marked: "YES" / "NO"  ;local; HEADER_FIELD_STRUCTURED
 118         X-CHAOS-Read: "YES" / "NO"  ;local; HEADER_FIELD_STRUCTURED
 119         X-CHAOS-Recipients: #*("<" atom word ">")  ;local;
 120             HEADER_FIELD_STRUCTURED
 121         X-CHAOS-Size: 1*DIGIT  ;local; HEADER_FIELD_STRUCTURED
 122         X-Mailer: *text  ;Not Internet standard; HEADER_FIELD_TEXT
 123         X-Mozilla-Status: 4HEXDIG  ;Mozilla; HEADER_FIELD_STRUCTURED
 124         X-Newsreader: *text  ;Not Internet standard; HEADER_FIELD_TEXT
 125         X-Priority: "1" / "2" / "3" / "4" / "5"  ;Not Internet standard;
 126             HEADER_FIELD_STRUCTURED
 127         Xref: sub-domain
 128             1*((atom / string) *("." (atom / string)) ":" msg-number)
 129             ;RFCs 1036, 2047, local; HEADER_FIELD_STRUCTURED
 130      */
 131     enum HeaderFieldType
 132     {
 133         HEADER_FIELD_TEXT,
 134         HEADER_FIELD_STRUCTURED,
 135         HEADER_FIELD_PHRASE,
 136         HEADER_FIELD_MESSAGE_ID,
 137         HEADER_FIELD_ADDRESS
 138     };
 139
 140     /** Check for ISO 8859-1 character.
 141
 142         @param nChar  Some UCS-4 character.
 143
 144         @return  True if nChar is an ISO 8859-1 character (0x00--0xFF).
 145      */
 146     static inline bool isISO88591(sal_uInt32 nChar);
 147
 148     /** Check for US-ASCII control character.
 149
 150         @param nChar  Some UCS-4 character.
 151
 152         @return  True if nChar is a US-ASCII control character (US-ASCII
 153         0x00--0x1F or 0x7F).
 154      */
 155     static inline bool isControl(sal_uInt32 nChar);
 156
 157     /** Check for US-ASCII white space character.
 158
 159         @param nChar  Some UCS-4 character.
 160
 161         @return  True if nChar is a US-ASCII white space character (US-ASCII
 162         0x09 or 0x20).
 163      */
 164     static inline bool isWhiteSpace(sal_uInt32 nChar);
 165
 166     /** Check for US-ASCII visible character.
 167
 168         @param nChar  Some UCS-4 character.
 169
 170         @return  True if nChar is a US-ASCII visible character (US-ASCII
 171         0x21--0x7E).
 172      */
 173     static inline bool isVisible(sal_uInt32 nChar);
 174
 175     /** Check for US-ASCII Base 64 digit character.
 176
 177         @param nChar  Some UCS-4 character.
 178
 179         @return  True if nChar is a US-ASCII Base 64 digit character (US-ASCII
 180         'A'--'Z', 'a'--'z', '0'--'9', '+', or '/').
 181      */
 182     static inline bool isBase64Digit(sal_uInt32 nChar);
 183
 184     /** Check whether some character is valid within an RFC 822 <atom>.
 185
 186         @param nChar  Some UCS-4 character.
 187
 188         @return  True if nChar is valid within an RFC 822 <atom> (US-ASCII
 189         'A'--'Z', 'a'--'z', '0'--'9', '!', '#', '$', '%', '&', ''', '*', '+',
 190         '-', '/', '=', '?', '^', '_', '`', '{', '|', '}', or '~').
 191      */
 192     static bool isAtomChar(sal_uInt32 nChar);
 193
 194     /** Check whether some character is valid within an RFC 2045 <token>.
 195
 196         @param nChar  Some UCS-4 character.
 197
 198         @return  True if nChar is valid within an RFC 2045 <token> (US-ASCII
 199         'A'--'Z', 'a'--'z', '0'--'9', '!', '#', '$', '%', '&', ''', '*', '+',
 200         '-', '.', '^', '_', '`', '{', '|', '}', or '~').
 201      */
 202     static bool isTokenChar(sal_uInt32 nChar);
 203
 204     /** Check whether some character is valid within an RFC 2047 <token>.
 205
 206         @param nChar  Some UCS-4 character.
 207
 208         @return  True if nChar is valid within an RFC 2047 <token> (US-ASCII
 209         'A'--'Z', 'a'--'z', '0'--'9', '!', '#', '$', '%', '&', ''', '*', '+',
 210         '-', '^', '_', '`', '{', '|', '}', or '~').
 211      */
 212     static bool isEncodedWordTokenChar(sal_uInt32 nChar);
 213
 214     /** Check whether some character is valid within an RFC 2060 <atom>.
 215
 216         @param nChar  Some UCS-4 character.
 217
 218         @return  True if nChar is valid within an RFC 2060 <atom> (US-ASCII
 219         'A'--'Z', 'a'--'z', '0'--'9', '!', '#', '$', '&', ''', '+', ',', '-',
 220         '.', '/', ':', ';', '<', '=', '>', '?', '@', '[', ']', '^', '_', '`',
 221         '|', '}', or '~').
 222      */
 223     static bool isIMAPAtomChar(sal_uInt32 nChar);
 224
 225     /** Get the digit weight of a US-ASCII character.
 226
 227         @param nChar  Some UCS-4 character.
 228
 229         @return  If nChar is a US-ASCII (decimal) digit character (US-ASCII
 230         '0'--'9'), return the corresponding weight (0--9); otherwise,
 231         return -1.
 232      */
 233     static inline int getWeight(sal_uInt32 nChar);
 234
 235     /** Get the hexadecimal digit weight of a US-ASCII character.
 236
 237         @param nChar  Some UCS-4 character.
 238
 239         @return  If nChar is a US-ASCII hexadecimal digit character (US-ASCII
 240         '0'--'9', 'A'--'F', or 'a'--'f'), return the corresponding weight
 241         (0--15); otherwise, return -1.
 242      */
 243     static inline int getHexWeight(sal_uInt32 nChar);
 244
 245     /** Get the Base 64 digit weight of a US-ASCII character.
 246
 247         @param nChar  Some UCS-4 character.
 248
 249         @return  If nChar is a US-ASCII Base 64 digit character (US-ASCII
 250         'A'--'Z', 'a'--'z', '0'--'9', '+', or '/'), return the
 251         corresponding weight (0--63); if nChar is the US-ASCII Base 64 padding
 252         character (US-ASCII '='), return -1; otherwise, return -2.
 253      */
 254     static inline int getBase64Weight(sal_uInt32 nChar);
 255
 256     /** Get a hexadecimal digit encoded as US-ASCII.
 257
 258         @param nWeight  Must be in the range 0--15, inclusive.
 259
 260         @return  The canonic (i.e., upper case) hexadecimal digit
 261         corresponding to nWeight (US-ASCII '0'--'9' or 'A'--'F').
 262      */
 263     static sal_uInt32 getHexDigit(int nWeight);
 264
 265     /** Check two US-ASCII strings for equality, ignoring case.
 266
 267         @param pBegin1  Points to the start of the first string, must not be
 268         null.
 269
 270         @param pEnd1  Points past the end of the first string, must be >=
 271         pBegin1.
 272
 273         @param pString2  Points to the start of the null terminated second
 274         string, must not be null.
 275
 276         @return  True if the two strings are equal, ignoring the case of US-
 277         ASCII alphabetic characters (US-ASCII 'A'--'Z' and 'a'--'z').
 278      */
 279     static bool equalIgnoreCase(const sal_Char * pBegin1,
 280                                 const sal_Char * pEnd1,
 281                                 const sal_Char * pString2);
 282
 283     /** Check two US-ASCII strings for equality, ignoring case.
 284
 285         @param pBegin1  Points to the start of the first string, must not be
 286         null.
 287
 288         @param pEnd1  Points past the end of the first string, must be >=
 289         pBegin1.
 290
 291         @param pString2  Points to the start of the null terminated second
 292         string, must not be null.
 293
 294         @return  True if the two strings are equal, ignoring the case of US-
 295         ASCII alphabetic characters (US-ASCII 'A'--'Z' and 'a'--'z').
 296      */
 297     static bool equalIgnoreCase(const sal_Unicode * pBegin1,
 298                                 const sal_Unicode * pEnd1,
 299                                 const sal_Char * pString2);
 300
 301     static inline bool startsWithLineBreak(const sal_Char * pBegin,
 302                                            const sal_Char * pEnd);
 303
 304     static inline bool startsWithLineBreak(const sal_Unicode * pBegin,
 305                                            const sal_Unicode * pEnd);
 306
 307     static inline bool startsWithLineFolding(const sal_Char * pBegin,
 308                                              const sal_Char * pEnd);
 309
 310     static inline bool startsWithLineFolding(const sal_Unicode * pBegin,
 311                                              const sal_Unicode * pEnd);
 312
 313     static bool startsWithLinearWhiteSpace(const sal_Char * pBegin,
 314                                            const sal_Char * pEnd);
 315
 316     static const sal_Unicode * skipLinearWhiteSpace(const sal_Unicode *
 317                                                         pBegin,
 318                                                     const sal_Unicode * pEnd);
 319
 320     static const sal_Unicode * skipComment(const sal_Unicode * pBegin,
 321                                            const sal_Unicode * pEnd);
 322
 323     static const sal_Unicode * skipLinearWhiteSpaceComment(const sal_Unicode *
 324                                                                pBegin,
 325                                                            const sal_Unicode *
 326                                                                pEnd);
 327
 328     static inline bool needsQuotedStringEscape(sal_uInt32 nChar);
 329
 330     static const sal_Char * skipQuotedString(const sal_Char * pBegin,
 331                                              const sal_Char * pEnd);
 332
 333     static const sal_Unicode * skipQuotedString(const sal_Unicode * pBegin,
 334                                                 const sal_Unicode * pEnd);
 335
 336     static bool scanUnsigned(const sal_Unicode *& rBegin,
 337                              const sal_Unicode * pEnd, bool bLeadingZeroes,
 338                              sal_uInt32 & rValue);
 339
 340     static const sal_Unicode * scanQuotedBlock(const sal_Unicode * pBegin,
 341                                                const sal_Unicode * pEnd,
 342                                                sal_uInt32 nOpening,
 343                                                sal_uInt32 nClosing,
 344                                                sal_Size & rLength,
 345                                                bool & rModify);
 346
 347     static sal_Unicode const * scanParameters(sal_Unicode const * pBegin,
 348                                               sal_Unicode const * pEnd,
 349                                               INetContentTypeParameterList *
 350                                                   pParameters);
 351
 352     /** Parse the body of an RFC 2045 Content-Type header field.
 353
 354         @param pBegin  The range (that must be valid) from non-null pBegin,
 355         inclusive, to non-null pEnd, exclusive, forms the body of the
 356         Content-Type header field.  It must be of the form
 357
 358           token "/" token *(";" token "=" (token / quoted-string))
 359
 360         with intervening linear white space and comments (cf. RFCs 822, 2045).
 361         The RFC 2231 extensions are supported.  The encoding of rMediaType
 362         should be US-ASCII, but any Unicode values in the range U+0080..U+FFFF
 363         are interpreted 'as appropriate.'
 364
 365         @param pType  If not null, returns the type (the first of the above
 366         tokens), in US-ASCII encoding and converted to lower case.
 367
 368         @param pSubType  If not null, returns the sub-type (the second of the
 369         above tokens), in US-ASCII encoding and converted to lower case.
 370
 371         @param pParameters  If not null, returns the parameters as a list of
 372         INetContentTypeParameters (the attributes are in US-ASCII encoding and
 373         converted to lower case, the values are in Unicode encoding).  If
 374         null, only the syntax of the parameters is checked, but they are not
 375         returned.
 376
 377         @return  Null if the syntax of the field body is incorrect (i.e., does
 378         not start with type and sub-type tokens).  Otherwise, a pointer past the
 379         longest valid input prefix.  If null is returned, none of the output
 380         parameters will be modified.
 381      */
 382     static sal_Unicode const * scanContentType(
 383         sal_Unicode const *pBegin, sal_Unicode const * pEnd,
 384         OUString * pType = 0, OUString * pSubType = 0,
 385         INetContentTypeParameterList * pParameters = 0);
 386
 387     static inline rtl_TextEncoding translateToMIME(rtl_TextEncoding
 388                                                        eEncoding);
 389
 390     static inline rtl_TextEncoding translateFromMIME(rtl_TextEncoding
 391                                                          eEncoding);
 392
 393     static const sal_Char * getCharsetName(rtl_TextEncoding eEncoding);
 394
 395     static rtl_TextEncoding getCharsetEncoding(const sal_Char * pBegin,
 396                                                const sal_Char * pEnd);
 397
 398     static inline bool isMIMECharsetEncoding(rtl_TextEncoding eEncoding);
 399
 400     static INetMIMECharsetList_Impl *
 401     createPreferredCharsetList(rtl_TextEncoding eEncoding);
 402
 403     static sal_Unicode * convertToUnicode(const sal_Char * pBegin,
 404                                           const sal_Char * pEnd,
 405                                           rtl_TextEncoding eEncoding,
 406                                           sal_Size & rSize);
 407
 408     static sal_Char * convertFromUnicode(const sal_Unicode * pBegin,
 409                                          const sal_Unicode * pEnd,
 410                                          rtl_TextEncoding eEncoding,
 411                                          sal_Size & rSize);
 412
 413     /** Get the number of octets required to encode an UCS-4 character using
 414         UTF-8 encoding.
 415
 416         @param nChar  Some UCS-4 character.
 417
 418         @return  The number of octets required (in the range 1--6, inclusive).
 419      */
 420     static inline int getUTF8OctetCount(sal_uInt32 nChar);
 421
 422     static inline void writeEscapeSequence(INetMIMEOutputSink & rSink,
 423                                            sal_uInt32 nChar);
 424
 425     static void writeUTF8(INetMIMEOutputSink & rSink, sal_uInt32 nChar);
 426
 427     static void writeHeaderFieldBody(INetMIMEOutputSink & rSink,
 428                                      HeaderFieldType eType,
 429                                      const OUString& rBody,
 430                                      rtl_TextEncoding ePreferredEncoding,
 431                                      bool bInitialSpace = true);
 432
 433     static bool translateUTF8Char(const sal_Char *& rBegin,
 434                                   const sal_Char * pEnd,
 435                                   rtl_TextEncoding eEncoding,
 436                                   sal_uInt32 & rCharacter);
 437
 438     static OUString decodeHeaderFieldBody(HeaderFieldType eType,
 439                                            const OString& rBody);
 440
 441     /** Get the UTF-32 character at the head of a UTF-16 encoded string.
 442
 443         @param rBegin  Points to the start of the UTF-16 encoded string, must
 444         not be null.  On exit, it points past the first UTF-32 character's
 445         encoding.
 446
 447         @param pEnd  Points past the end of the UTF-16 encoded string, must be
 448         strictly greater than rBegin.
 449
 450         @return  The UCS-4 character at the head of the UTF-16 encoded string.
 451         If the string does not start with the UTF-16 encoding of a UCS-4
 452         character, the first UTF-16 value is returned.
 453      */
 454     static inline sal_uInt32 getUTF32Character(const sal_Unicode *& rBegin,
 455                                                const sal_Unicode * pEnd);
 456
 457     /** Put the UTF-16 encoding of a UTF-32 character into a buffer.
 458
 459         @param pBuffer  Points to a buffer, must not be null.
 460
 461         @param nUTF32  An UTF-32 character, must be in the range 0..0x10FFFF.
 462
 463         @return  A pointer past the UTF-16 characters put into the buffer
 464         (i.e., pBuffer + 1 or pBuffer + 2).
 465      */
 466     static inline sal_Unicode * putUTF32Character(sal_Unicode * pBuffer,
 467                                                   sal_uInt32 nUTF32);
 468 };
 469
 470 // static
 471 inline bool INetMIME::isISO88591(sal_uInt32 nChar)
 472 {
 473     return nChar <= 0xFF;
 474 }
 475
 476 // static
 477 inline bool INetMIME::isControl(sal_uInt32 nChar)
 478 {
 479     return nChar <= 0x1F || nChar == 0x7F;
 480 }
 481
 482 // static
 483 inline bool INetMIME::isWhiteSpace(sal_uInt32 nChar)
 484 {
 485     return nChar == '\t' || nChar == ' ';
 486 }
 487
 488 // static
 489 inline bool INetMIME::isVisible(sal_uInt32 nChar)
 490 {
 491     return nChar >= '!' && nChar <= '~';
 492 }
 493
 494 // static
 495 inline bool INetMIME::isBase64Digit(sal_uInt32 nChar)
 496 {
 497     return rtl::isAsciiUpperCase(nChar) || rtl::isAsciiLowerCase(nChar) || rtl::isAsciiDigit(nChar)
 498            || nChar == '+' || nChar == '/';
 499 }
 500
 501 // static
 502 inline int INetMIME::getWeight(sal_uInt32 nChar)
 503 {
 504     return rtl::isAsciiDigit(nChar) ? int(nChar - '0') : -1;
 505 }
 506
 507 // static
 508 inline int INetMIME::getHexWeight(sal_uInt32 nChar)
 509 {
 510     return rtl::isAsciiDigit(nChar) ? int(nChar - '0') :
 511            nChar >= 'A' && nChar <= 'F' ? int(nChar - 'A' + 10) :
 512            nChar >= 'a' && nChar <= 'f' ? int(nChar - 'a' + 10) : -1;
 513 }
 514
 515 // static
 516 inline int INetMIME::getBase64Weight(sal_uInt32 nChar)
 517 {
 518     return rtl::isAsciiUpperCase(nChar) ? int(nChar - 'A') :
 519            rtl::isAsciiLowerCase(nChar) ? int(nChar - 'a' + 26) :
 520            rtl::isAsciiDigit(nChar) ? int(nChar - '0' + 52) :
 521            nChar == '+' ? 62 :
 522            nChar == '/' ? 63 :
 523            nChar == '=' ? -1 : -2;
 524 }
 525
 526 // static
 527 inline bool INetMIME::startsWithLineBreak(const sal_Char * pBegin,
 528                                           const sal_Char * pEnd)
 529 {
 530     DBG_ASSERT(pBegin && pBegin <= pEnd,
 531                "INetMIME::startsWithLineBreak(): Bad sequence");
 532
 533     return pEnd - pBegin >= 2 && pBegin[0] == 0x0D && pBegin[1] == 0x0A;
 534         // CR, LF
 535 }
 536
 537 // static
 538 inline bool INetMIME::startsWithLineBreak(const sal_Unicode * pBegin,
 539                                               const sal_Unicode * pEnd)
 540 {
 541     DBG_ASSERT(pBegin && pBegin <= pEnd,
 542                "INetMIME::startsWithLineBreak(): Bad sequence");
 543
 544     return pEnd - pBegin >= 2 && pBegin[0] == 0x0D && pBegin[1] == 0x0A;
 545         // CR, LF
 546 }
 547
 548 // static
 549 inline bool INetMIME::startsWithLineFolding(const sal_Char * pBegin,
 550                                             const sal_Char * pEnd)
 551 {
 552     DBG_ASSERT(pBegin && pBegin <= pEnd,
 553                "INetMIME::startsWithLineFolding(): Bad sequence");
 554
 555     return pEnd - pBegin >= 3 && pBegin[0] == 0x0D && pBegin[1] == 0x0A
 556            && isWhiteSpace(pBegin[2]); // CR, LF
 557 }
 558
 559 // static
 560 inline bool INetMIME::startsWithLineFolding(const sal_Unicode * pBegin,
 561                                             const sal_Unicode * pEnd)
 562 {
 563     DBG_ASSERT(pBegin && pBegin <= pEnd,
 564                "INetMIME::startsWithLineFolding(): Bad sequence");
 565
 566     return pEnd - pBegin >= 3 && pBegin[0] == 0x0D && pBegin[1] == 0x0A
 567            && isWhiteSpace(pBegin[2]); // CR, LF
 568 }
 569
 570 // static
 571 inline bool INetMIME::startsWithLinearWhiteSpace(const sal_Char * pBegin,
 572                                                  const sal_Char * pEnd)
 573 {
 574     DBG_ASSERT(pBegin && pBegin <= pEnd,
 575                "INetMIME::startsWithLinearWhiteSpace(): Bad sequence");
 576
 577     return pBegin != pEnd
 578            && (isWhiteSpace(*pBegin) || startsWithLineFolding(pBegin, pEnd));
 579 }
 580
 581 // static
 582 inline bool INetMIME::needsQuotedStringEscape(sal_uInt32 nChar)
 583 {
 584     return nChar == '"' || nChar == '\\';
 585 }
 586
 587 // static
 588 inline rtl_TextEncoding INetMIME::translateToMIME(rtl_TextEncoding eEncoding)
 589 {
 590 #if defined WNT
 591     return eEncoding == RTL_TEXTENCODING_MS_1252 ?
 592                RTL_TEXTENCODING_ISO_8859_1 : eEncoding;
 593 #else // WNT
 594     return eEncoding;
 595 #endif // WNT
 596 }
 597
 598 // static
 599 inline rtl_TextEncoding INetMIME::translateFromMIME(rtl_TextEncoding
 600                                                         eEncoding)
 601 {
 602 #if defined WNT
 603     return eEncoding == RTL_TEXTENCODING_ISO_8859_1 ?
 604                RTL_TEXTENCODING_MS_1252 : eEncoding;
 605 #else
 606     return eEncoding;
 607 #endif
 608 }
 609
 610 // static
 611 inline bool INetMIME::isMIMECharsetEncoding(rtl_TextEncoding eEncoding)
 612 {
 613     return ( rtl_isOctetTextEncoding(eEncoding) == sal_True );
 614 }
 615
 616 // static
 617 inline int INetMIME::getUTF8OctetCount(sal_uInt32 nChar)
 618 {
 619     DBG_ASSERT(nChar < 0x80000000, "INetMIME::getUTF8OctetCount(): Bad char");
 620
 621     return nChar < 0x80 ? 1 :
 622            nChar < 0x800 ? 2 :
 623            nChar <= 0x10000 ? 3 :
 624            nChar <= 0x200000 ? 4 :
 625            nChar <= 0x4000000 ? 5 : 6;
 626 }
 627
 628 // static
 629 inline sal_uInt32 INetMIME::getUTF32Character(const sal_Unicode *& rBegin,
 630                                               const sal_Unicode * pEnd)
 631 {
 632     DBG_ASSERT(rBegin && rBegin < pEnd,
 633                "INetMIME::getUTF32Character(): Bad sequence");
 634     if (rBegin + 1 < pEnd && rBegin[0] >= 0xD800 && rBegin[0] <= 0xDBFF
 635         && rBegin[1] >= 0xDC00 && rBegin[1] <= 0xDFFF)
 636     {
 637         sal_uInt32 nUTF32 = sal_uInt32(*rBegin++ & 0x3FF) << 10;
 638         return (nUTF32 | (*rBegin++ & 0x3FF)) + 0x10000;
 639     }
 640     else
 641         return *rBegin++;
 642 }
 643
 644 // static
 645 inline sal_Unicode * INetMIME::putUTF32Character(sal_Unicode * pBuffer,
 646                                                  sal_uInt32 nUTF32)
 647 {
 648     DBG_ASSERT(nUTF32 <= 0x10FFFF, "INetMIME::putUTF32Character(): Bad char");
 649     if (nUTF32 < 0x10000)
 650         *pBuffer++ = sal_Unicode(nUTF32);
 651     else
 652     {
 653         nUTF32 -= 0x10000;
 654         *pBuffer++ = sal_Unicode(0xD800 | (nUTF32 >> 10));
 655         *pBuffer++ = sal_Unicode(0xDC00 | (nUTF32 & 0x3FF));
 656     }
 657     return pBuffer;
 658 }
 659
 660 class INetMIMEOutputSink
 661 {
 662 public:
 663     static sal_uInt32 const NO_LINE_LENGTH_LIMIT = SAL_MAX_UINT32;
 664
 665 private:
 666     sal_uInt32 m_nColumn;
 667     sal_uInt32 m_nLineLengthLimit;
 668
 669 protected:
 670     /** Write a sequence of octets.
 671
 672         @param pBegin  Points to the start of the sequence, must not be null.
 673
 674         @param pEnd  Points past the end of the sequence, must be >= pBegin.
 675      */
 676     virtual void writeSequence(const sal_Char * pBegin,
 677                                const sal_Char * pEnd) = 0;
 678
 679     /** Write a null terminated sequence of octets (without the terminating
 680         null).
 681
 682         @param pSequence  A null terminated sequence of octets, must not be
 683         null.
 684
 685         @return  The length of pSequence (without the terminating null).
 686      */
 687     sal_Size writeSequence(const sal_Char * pSequence);
 688
 689     /** Write a sequence of octets.
 690
 691         @descr  The supplied sequence of Unicode characters is interpreted as
 692         a sequence of octets.  It is an error if any of the elements of the
 693         sequence has a numerical value greater than 255.
 694
 695         @param pBegin  Points to the start of the sequence, must not be null.
 696
 697         @param pEnd  Points past the end of the sequence, must be >= pBegin.
 698      */
 699     void writeSequence(const sal_Unicode * pBegin,
 700                                const sal_Unicode * pEnd);
 701
 702 public:
 703     INetMIMEOutputSink(sal_uInt32 nTheColumn = 0,
 704                        sal_uInt32 nTheLineLengthLimit
 705                            = INetMIME::SOFT_LINE_LENGTH_LIMIT):
 706         m_nColumn(nTheColumn), m_nLineLengthLimit(nTheLineLengthLimit) {}
 707
 708     virtual ~INetMIMEOutputSink() {}
 709
 710     /** Get the current column.
 711
 712         @return  The current column (starting from zero).
 713      */
 714     sal_uInt32 getColumn() const { return m_nColumn; }
 715
 716     sal_uInt32 getLineLengthLimit() const { return m_nLineLengthLimit; }
 717
 718     void setLineLengthLimit(sal_uInt32 nTheLineLengthLimit)
 719     { m_nLineLengthLimit = nTheLineLengthLimit; }
 720
 721     virtual ErrCode getError() const;
 722
 723     /** Write a sequence of octets.
 724
 725         @param pBegin  Points to the start of the sequence, must not be null.
 726
 727         @param pEnd  Points past the end of the sequence, must be >= pBegin.
 728      */
 729     inline void write(const sal_Char * pBegin, const sal_Char * pEnd);
 730
 731     /** Write a sequence of octets.
 732
 733         @param pBegin  Points to the start of the sequence, must not be null.
 734
 735         @param nLength  The length of the sequence.
 736      */
 737     void write(const sal_Char * pBegin, sal_Size nLength)
 738     { write(pBegin, pBegin + nLength); }
 739
 740     /** Write a sequence of octets and advance the column by its length.
 741
 742         @descr  The supplied sequence of Unicode characters is interpreted as
 743         a sequence of octets.  It is an error if any element of the sequence
 744         has a numerical value greater than 255.
 745
 746         @param pBegin  Points to the start of the sequence, must not be null.
 747
 748         @param pEnd  Points past the end of the sequence, must be >= pBegin.
 749      */
 750     inline void write(const sal_Unicode * pBegin, const sal_Unicode * pEnd);
 751
 752     /** Write the part of an OString that lies between two offsets.
 753
 754         @param rOctets  An OString, interpreted as a sequence of octets.
 755         @param nBegin  The offset of the first character to write.
 756         @param nEnd  The offset past the last character to write.
 757      */
 758     void write(const OString& rOctets, sal_Int32 nBegin, sal_Int32 nEnd)
 759     {
 760         const sal_Char * const pOctets = rOctets.getStr();
 761         writeSequence(pOctets + nBegin, pOctets + nEnd);
 762         // the column advances by the number of octets just written
 763         m_nColumn += nEnd - nBegin;
 764     }
 765
 766     /** Write a single octet and advance the column by one.
 767
 768         @param nOctet  Some octet.
 769
 770         @return  This instance.
 771      */
 772     inline INetMIMEOutputSink & operator <<(sal_Char nOctet);
 773
 774     /** Write a null terminated sequence of octets (without the terminating
 775         null).  The column advances by what writeSequence() returns.
 776
 777         @param pOctets  A null terminated sequence of octets, must not be
 778         null.
 779
 780         @return  This instance.
 781      */
 782     inline INetMIMEOutputSink & operator <<(const sal_Char * pOctets);
 783
 784     /** Write all octets of an OString.
 785         @param rOctets  An OString, interpreted as a sequence of octets.
 786         @return  This instance.
 787      */
 788     INetMIMEOutputSink & operator <<(const OString& rOctets)
 789     {
 790         const sal_Char * const pFirst = rOctets.getStr();
 791         const sal_Int32 nCount = rOctets.getLength();
 792         writeSequence(pFirst, pFirst + nCount);
 793         m_nColumn += nCount;
 794         return *this;
 795     }
 796
 797     /** Apply a manipulator function to this sink.
 798         @param  pManipulator  The function to call with this instance.
 799         @return  Whatever the manipulator function returns.
 800      */
 801     INetMIMEOutputSink &
 802     operator <<(INetMIMEOutputSink & (* pManipulator)(INetMIMEOutputSink &))
 803     {
 804         return (*pManipulator)(*this);
 805     }
 806
 807     /** Write a line end (CR LF); the endl manipulator calls this.
 808      */
 809     void writeLineEnd();
 810
 811     /** A manipulator function that writes a line end (CR LF) to rSink.
 812
 813         @param rSink  The sink whose writeLineEnd() is called.
 814
 815         @return  The sink rSink.
 816      */
 817     static inline INetMIMEOutputSink & endl(INetMIMEOutputSink & rSink);
 818 };
 819
 820 inline void INetMIMEOutputSink::write(
 821     const sal_Char * pBegin, const sal_Char * pEnd)
 822 {   // emit the octets first, then count them into the current column
 823     writeSequence(pBegin, pEnd);
 824     m_nColumn += (pEnd - pBegin);
 825 }
 826
 827 inline void INetMIMEOutputSink::write(
 828     const sal_Unicode * pBegin, const sal_Unicode * pEnd)
 829 {   // forward the elements first, then count them into the column
 830     writeSequence(pBegin, pEnd);
 831     m_nColumn += (pEnd - pBegin);
 832 }
 833
 834 inline INetMIMEOutputSink & INetMIMEOutputSink::operator <<(sal_Char nOctet)
 835 {
 836     // write() forwards the one-octet range and advances the column by one
 837     write(&nOctet, &nOctet + 1);
 838     return *this;
 839 }
 840
 841 inline INetMIMEOutputSink &
 842 INetMIMEOutputSink::operator <<(const sal_Char * pOctets)
 843 {   // writeSequence(pOctets) yields the amount to advance the column by
 844     m_nColumn += writeSequence(pOctets);
 845     return *this;
 846 }
 847
 848 // static manipulator: calls rSink.writeLineEnd() and hands rSink back
 849 inline INetMIMEOutputSink &
 850 INetMIMEOutputSink::endl(INetMIMEOutputSink & rSink)
 851 {
 852     rSink.writeLineEnd();
 853     return rSink;
 854 }
 855
 856 // static; writes '=' plus getHexDigit() of the high and the low nibble
 857 inline void INetMIME::writeEscapeSequence(INetMIMEOutputSink & rSink,
 858                                           sal_uInt32 nChar)
 859 {
 860     DBG_ASSERT(nChar <= 0xFF, "INetMIME::writeEscapeSequence(): Bad char");
 861     rSink << '=' << sal_uInt8(getHexDigit(nChar >> 4));
 862     rSink << sal_uInt8(getHexDigit(nChar & 15));
 863 }
 864
 865 /** INetMIMEOutputSink with an OStringBuffer member (see takeBuffer()). */
 866 class INetMIMEStringOutputSink: public INetMIMEOutputSink
 867 {
 868     OStringBuffer m_aBuffer;
 869
 870     using INetMIMEOutputSink::writeSequence;
 871
 872     virtual void writeSequence(const sal_Char * pBegin,
 873                                const sal_Char * pEnd) SAL_OVERRIDE;
 874
 875 public:
 876     inline INetMIMEStringOutputSink(
 877             sal_uInt32 nColumn = 0,
 878             sal_uInt32 nLineLengthLimit = INetMIME::SOFT_LINE_LENGTH_LIMIT)
 879         : INetMIMEOutputSink(nColumn, nLineLengthLimit)
 880     {}
 881
 882     virtual ErrCode getError() const SAL_OVERRIDE;
 883
 884     /** @return  The content of m_aBuffer, which is cleared by the call. */
 885     OString takeBuffer() { return m_aBuffer.makeStringAndClear(); }
 886 };
 887
 888 /** Character sink layered on an INetMIMEOutputSink (see m_rSink). */
 889 class INetMIMEEncodedWordOutputSink
 890 {
 891 public:
 892     enum Context { CONTEXT_TEXT = 1, CONTEXT_COMMENT = 2, CONTEXT_PHRASE = 4 };
 893
 894     enum Space { SPACE_NO, SPACE_ENCODED, SPACE_ALWAYS };
 895
 896 private:
 897     enum { BUFFER_SIZE = 256 }; // initial element count of m_pBuffer
 898
 899     enum Coding { CODING_NONE, CODING_QUOTED, CODING_ENCODED,
 900                   CODING_ENCODED_TERMINATED };
 901
 902     enum EncodedWordState { STATE_INITIAL, STATE_FIRST_EQUALS,
 903         STATE_FIRST_QUESTION, STATE_CHARSET, STATE_SECOND_QUESTION,
 904         STATE_ENCODING, STATE_THIRD_QUESTION, STATE_ENCODED_TEXT,
 905         STATE_FOURTH_QUESTION, STATE_SECOND_EQUALS, STATE_BAD };
 906
 907     INetMIMEOutputSink & m_rSink;
 908     Context m_eContext;
 909     Space m_eInitialSpace;
 910     sal_uInt32 m_nExtraSpaces;
 911     INetMIMECharsetList_Impl * m_pEncodingList;
 912     sal_Unicode * m_pBuffer;
 913     sal_uInt32 m_nBufferSize;
 914     sal_Unicode * m_pBufferEnd;
 915     Coding m_ePrevCoding;
 916     rtl_TextEncoding m_ePrevMIMEEncoding;
 917     Coding m_eCoding;
 918     sal_uInt32 m_nQuotedEscaped;
 919     EncodedWordState m_eEncodedWordState;
 920
 921     inline bool needsEncodedWordEscape(sal_uInt32 nChar) const;
 922     void finish(bool bWriteTrailer);
 923
 924     // Not copyable (declared, never defined): a copy would share the raw
 925     // m_pBuffer / m_pEncodingList pointers set up by the constructor.
 926     INetMIMEEncodedWordOutputSink(const INetMIMEEncodedWordOutputSink &);
 927     INetMIMEEncodedWordOutputSink & operator =(
 928         const INetMIMEEncodedWordOutputSink &);
 929
 930 public:
 931     inline INetMIMEEncodedWordOutputSink(
 932         INetMIMEOutputSink & rTheSink, Context eTheContext,
 933         Space eTheInitialSpace, rtl_TextEncoding ePreferredEncoding);
 934
 935     ~INetMIMEEncodedWordOutputSink();
 936
 937     INetMIMEEncodedWordOutputSink & WriteUInt32(sal_uInt32 nChar);
 938
 939     inline void write(const sal_Char * pBegin, const sal_Char * pEnd);
 940     inline void write(const sal_Unicode * pBegin, const sal_Unicode * pEnd);
 941
 942     inline bool flush();
 943 };
 944
 945 // Sets up the initial state; the character buffer starts out empty with
 946 // room for BUFFER_SIZE elements.
 947 inline INetMIMEEncodedWordOutputSink::INetMIMEEncodedWordOutputSink(
 948            INetMIMEOutputSink & rTheSink, Context eTheContext,
 949            Space eTheInitialSpace, rtl_TextEncoding ePreferredEncoding):
 950     m_rSink(rTheSink),
 951     m_eContext(eTheContext),
 952     m_eInitialSpace(eTheInitialSpace),
 953     m_nExtraSpaces(0),
 954     m_pEncodingList(INetMIME::createPreferredCharsetList(ePreferredEncoding)),
 955     m_pBuffer(static_cast< sal_Unicode * >(
 956                   rtl_allocateMemory(BUFFER_SIZE * sizeof (sal_Unicode)))),
 957     m_nBufferSize(BUFFER_SIZE),
 958     m_pBufferEnd(m_pBuffer),
 959     m_ePrevCoding(CODING_NONE),
 960     m_ePrevMIMEEncoding(RTL_TEXTENCODING_DONTKNOW),
 961     m_eCoding(CODING_NONE),
 962     m_nQuotedEscaped(0),
 963     m_eEncodedWordState(STATE_INITIAL)
 964 {}
 965
 966 // Feed each octet of [pBegin, pEnd) to WriteUInt32() as a value 0..255.
 967 inline void INetMIMEEncodedWordOutputSink::write(const sal_Char * pBegin,
 968                                                  const sal_Char * pEnd)
 969 {
 970     DBG_ASSERT(pBegin && pBegin <= pEnd,
 971                "INetMIMEEncodedWordOutputSink::write(): Bad sequence");
 972     for (; pBegin != pEnd; ++pBegin) // cast: plain char may be signed
 973         WriteUInt32(static_cast< unsigned char >(*pBegin));
 974 }
 975
 976 inline void INetMIMEEncodedWordOutputSink::write(const sal_Unicode * pBegin,
 977                                                  const sal_Unicode * pEnd)
 978 {
 979     DBG_ASSERT(pBegin && pBegin <= pEnd,
 980                "INetMIMEEncodedWordOutputSink::write(): Bad sequence");
 981     // hand every element of [pBegin, pEnd) to WriteUInt32(), in order
 982     for (; pBegin != pEnd; ++pBegin)
 983         WriteUInt32(*pBegin);
 984 }
 985
 986 inline bool INetMIMEEncodedWordOutputSink::flush()
 987 {   // finish(true) runs first; the result reflects m_ePrevCoding afterwards
 988     finish(true);
 989     return !(m_ePrevCoding == CODING_NONE);
 990 }
 991
 992 struct INetContentTypeParameter
 993 {
 994     /** The name of the attribute, in US-ASCII encoding and converted to lower
 995         case.  If a parameter value is split as described in RFC 2231, there
 996         will only be one item for the complete parameter, with the attribute
 997         name lacking any section suffix.
 998      */
 999     const OString m_sAttribute;
1000
1001     /** The optional character set specification (see RFC 2231), in US-ASCII
1002         encoding and converted to lower case.
1003      */
1004     const OString m_sCharset;
1005
1006     /** The optional language specification (see RFC 2231), in US-ASCII
1007         encoding and converted to lower case.
1008      */
1009     const OString m_sLanguage;
1010
1011     /** The attribute value.  If the value is a quoted-string, it is
1012         'unpacked.'  If a character set is specified, and the value can be
1013         converted to Unicode, this is done.  Also, if no character set is
1014         specified, conversion from UTF-8 encoding to Unicode is tried first,
1015         and if that doesn't work (because the value is not in UTF-8
1016         encoding), it is converted from ISO-8859-1 encoding to Unicode
1017         (which will always work).  But if a character set is specified and the
1018         value cannot be converted from that character set to Unicode, special
1019         action is taken to produce a value that can possibly be transformed
1020         back into its original form:  Any 8-bit character from a non-encoded
1021         part of the original value is directly converted to Unicode
1022         (effectively handling it as if it were ISO-8859-1 encoded), and any
1023         8-bit character from an encoded part of the original value is mapped
1024         to the range U+F800..U+F8FF at the top of the Corporate Use Subarea
1025         within Unicode's Private Use Area (effectively adding 0xF800 to the
1026         character's numeric value).
1027      */
1028     const OUString m_sValue;
1029
1030     /** This is true if the value was successfully converted to Unicode, and
1031         false if the value is a special mixture of ISO-LATIN-1 characters and
1032         characters from Unicode's Private Use Area.
1033      */
1034     const bool m_bConverted;
1035     // Stores the five arguments unchanged in the corresponding members.
1036     INetContentTypeParameter(const OString& rTheAttribute,
1037         const OString& rTheCharset, const OString& rTheLanguage,
1038         const OUString& rTheValue, bool bTheConverted)
1039     : m_sAttribute(rTheAttribute)
1040     , m_sCharset(rTheCharset)
1041     , m_sLanguage(rTheLanguage)
1042     , m_sValue(rTheValue)
1043     , m_bConverted(bTheConverted)
1044     {
1045     }
1046 };
1047
1048 /** List of content type parameters; the list owns the entries it holds. */
1049 class TOOLS_DLLPUBLIC INetContentTypeParameterList
1050 {
1051 public:
1052     void Clear();
1053
1054     /** Take ownership of pParameter and insert it before position nIndex;
1055         an nIndex past the end is clamped, so the entry is appended. */
1056     void Insert(INetContentTypeParameter * pParameter, sal_uIntPtr nIndex)
1057     {
1058         if (nIndex > maEntries.size()) nIndex = maEntries.size();
1059         maEntries.insert(maEntries.begin() + nIndex, pParameter);
1060     }
1061
1062     /** Take ownership of pParameter and add it at the end of the list. */
1063     void Append(INetContentTypeParameter * pParameter)
1064     { maEntries.push_back(pParameter); }
1065
1066     /** @return  The entry at nIndex, or null if nIndex is out of range. */
1067     const INetContentTypeParameter * GetObject(sal_uIntPtr nIndex) const
1068     { return nIndex < maEntries.size() ? &maEntries[nIndex] : NULL; }
1069
1070     const INetContentTypeParameter * find(const OString& rAttribute) const;
1071
1072 private:
1073     boost::ptr_vector<INetContentTypeParameter> maEntries;
1074 };
1075
1076 #endif
1077
1078 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */