update emoji autocorrect entries from po-files
[LibreOffice.git] / include / tools / inetmime.hxx
blob45bf02a5878e102afa2f5dd8e1e0afe59cebde26
1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 /*
3 * This file is part of the LibreOffice project.
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 * This file incorporates work covered by the following license notice:
11 * Licensed to the Apache Software Foundation (ASF) under one or more
12 * contributor license agreements. See the NOTICE file distributed
13 * with this work for additional information regarding copyright
14 * ownership. The ASF licenses this file to you under the Apache
15 * License, Version 2.0 (the "License"); you may not use this file
16 * except in compliance with the License. You may obtain a copy of
17 * the License at http://www.apache.org/licenses/LICENSE-2.0 .
19 #ifndef INCLUDED_TOOLS_INETMIME_HXX
20 #define INCLUDED_TOOLS_INETMIME_HXX
22 #include <boost/ptr_container/ptr_vector.hpp>
24 #include <tools/toolsdllapi.h>
25 #include <rtl/alloc.h>
26 #include <rtl/character.hxx>
27 #include <rtl/string.hxx>
28 #include <rtl/strbuf.hxx>
29 #include <rtl/ustring.hxx>
30 #include <rtl/tencinfo.h>
31 #include <tools/debug.hxx>
32 #include <tools/errcode.hxx>
34 class DateTime;
35 class INetContentTypeParameterList;
36 class INetMIMECharsetList_Impl;
37 class INetMIMEOutputSink;
39 class TOOLS_DLLPUBLIC INetMIME
41 public:
42 enum { SOFT_LINE_LENGTH_LIMIT = 76,
43 HARD_LINE_LENGTH_LIMIT = 998 };
45 /** The various types of message header field bodies, with respect to
46 encoding and decoding them.
48 @descr At the moment, five different types of header fields suffice
49 to describe how to encode and decode any known message header field
50 body, but need for more types may arise in the future as new header
51 fields are introduced.
53 @descr The following is an exhaustive list of all the header fields
54 currently known to our implementation. For every header field, it
55 includes a 'canonic' (with regard to capitalization) name, a grammar
56 rule for the body (using RFC 822 and RFC 2234 conventions), a list of
57 relevant sources of information, and the HeaderFieldType value to use
58 with that header field. The list is based on RFC 2076 and draft-
59 palme-mailext-headers-02.txt (see also <http://www.dsv.su.se/~jpalme/
60 ietf/jp-ietf-home.html#anchor1003783>).
62 Approved: address ;RFC 1036; HEADER_FIELD_ADDRESS
63 bcc: #address ;RFCs 822, 2047; HEADER_FIELD_ADDRESS
64 cc: 1#address ;RFCs 822, 2047; HEADER_FIELD_ADDRESS
65 Comments: *text ;RFCs 822, RFC 2047; HEADER_FIELD_TEXT
66 Content-Base: absoluteURI ;RFC 2110; HEADER_FIELD_TEXT
67 Content-Description: *text ;RFC 2045, RFC 2047; HEADER_FIELD_TEXT
68 Content-Disposition: disposition-type *(";" disposition-parm)
69 ;RFC 1806; HEADER_FIELD_STRUCTURED
70 Content-ID: msg-id ;RFC 2045, RFC 2047; HEADER_FIELD_MESSAGE_ID
71 Content-Location: absoluteURI / relativeURI ;RFC 2110;
72 HEADER_FIELD_TEXT
73 Content-Transfer-Encoding: mechanism ;RFC 2045, RFC 2047;
74 HEADER_FIELD_STRUCTURED
75 Content-Type: type "/" subtype *(";" parameter) ;RFC 2045, RFC 2047;
76 HEADER_FIELD_STRUCTURED
77 Control: *text ;RFC 1036; HEADER_FIELD_TEXT
78 Date: date-time ;RFC 822, RFC 1123, RFC 2047; HEADER_FIELD_STRUCTURED
79 Distribution: 1#atom ;RFC 1036; HEADER_FIELD_STRUCTURED
80 Encrypted: 1#2word ;RFC 822, RFC 2047; HEADER_FIELD_STRUCTURED
81 Expires: date-time ;RFC 1036; HEADER_FIELD_STRUCTURED
82 Followup-To: 1#(atom *("." atom)) ;RFC 1036; HEADER_FIELD_STRUCTURED
83 From: mailbox / 1#mailbox ;RFC 822, RFC 2047; HEADER_FIELD_ADDRESS
84 In-Reply-To: *(phrase / msg-id) ;RFC 822, RFC 2047;
85 HEADER_FIELD_ADDRESS
86 Keywords: #phrase ;RFC 822, RFC 2047; HEADER_FIELD_PHRASE
87 MIME-Version: 1*DIGIT "." 1*DIGIT ;RFC 2045, RFC 2047;
88 HEADER_FIELD_STRUCTURED
89 Message-ID: msg-id ;RFC 822, RFC 2047; HEADER_FIELD_MESSAGE_ID
90 Newsgroups: 1#(atom *("." atom)) ;RFC 1036, RFC 2047;
91 HEADER_FIELD_STRUCTURED
92 Organization: *text ;RFC 1036; HEADER_FIELD_TEXT
93 Received: ["from" domain] ["by" domain] ["via" atom] *("with" atom)
94 ["id" msg-id] ["for" addr-spec] ";" date-time ;RFC 822, RFC 1123,
95 RFC 2047; HEADER_FIELD_STRUCTURED
96 References: *(phrase / msg-id) ;RFC 822, RFC 2047;
97 HEADER_FIELD_ADDRESS
98 Reply-To: 1#address ;RFC 822, RFC 2047; HEADER_FIELD_ADDRESS
99 Resent-Date: date-time ;RFC 822, RFC 1123, RFC 2047;
100 HEADER_FIELD_STRUCTURED
101 Resent-From: mailbox / 1#mailbox ;RFC 822, RFC 2047;
102 HEADER_FIELD_ADDRESS
103 Resent-Message-ID: msg-id ;RFC 822, RFC 2047; HEADER_FIELD_MESSAGE_ID
104 Resent-Reply-To: 1#address ;RFC 822, RFC 2047; HEADER_FIELD_ADDRESS
105 Resent-Sender: mailbox ;RFC 822, RFC 2047; HEADER_FIELD_ADDRESS
106 Resent-To: 1#address ;RFC 822, RFC 2047; HEADER_FIELD_ADDRESS
107 Resent-bcc: #address ;RFC 822, RFC 2047; HEADER_FIELD_ADDRESS
108 Resent-cc: 1#address ;RFC 822, RFC 2047; HEADER_FIELD_ADDRESS
109 Return-path: route-addr / ("<" ">") ;RFC 822, RFC 1123, RFC 2047;
110 HEADER_FIELD_STRUCTURED
111 Return-Receipt-To: address ;Not Internet standard;
112 HEADER_FIELD_ADDRESS
113 Sender: mailbox ;RFC 822, RFC 2047; HEADER_FIELD_ADDRESS
114 Subject: *text ;RFC 822, RFC 2047; HEADER_FIELD_TEXT
115 Summary: *text ;RFC 1036; HEADER_FIELD_TEXT
116 To: 1#address ;RFC 822, RFC 2047; HEADER_FIELD_ADDRESS
117 X-CHAOS-Marked: "YES" / "NO" ;local; HEADER_FIELD_STRUCTURED
118 X-CHAOS-Read: "YES" / "NO" ;local; HEADER_FIELD_STRUCTURED
119 X-CHAOS-Recipients: #*("<" atom word ">") ;local;
120 HEADER_FIELD_STRUCTURED
121 X-CHAOS-Size: 1*DIGIT ;local; HEADER_FIELD_STRUCTURED
122 X-Mailer: *text ;Not Internet standard; HEADER_FIELD_TEXT
123 X-Mozilla-Status: 4HEXDIG ;Mozilla; HEADER_FIELD_STRUCTURED
124 X-Newsreader: *text ;Not Internet standard; HEADER_FIELD_TEXT
125 X-Priority: "1" / "2" / "3" / "4" / "5" ;Not Internet standard;
126 HEADER_FIELD_STRUCTURED
127 Xref: sub-domain
128 1*((atom / string) *("." (atom / string)) ":" msg-number)
129 ;RFCs 1036, 2047, local; HEADER_FIELD_STRUCTURED
131 enum HeaderFieldType
133 HEADER_FIELD_TEXT,
134 HEADER_FIELD_STRUCTURED,
135 HEADER_FIELD_PHRASE,
136 HEADER_FIELD_MESSAGE_ID,
137 HEADER_FIELD_ADDRESS
140 /** Check for ISO 8859-1 character.
142 @param nChar Some UCS-4 character.
144 @return True if nChar is an ISO 8859-1 character (0x00--0xFF).
146 static inline bool isISO88591(sal_uInt32 nChar);
148 /** Check for US-ASCII control character.
150 @param nChar Some UCS-4 character.
152 @return True if nChar is a US-ASCII control character (US-ASCII
153 0x00--0x1F or 0x7F).
155 static inline bool isControl(sal_uInt32 nChar);
157 /** Check for US-ASCII white space character.
159 @param nChar Some UCS-4 character.
161 @return True if nChar is a US-ASCII white space character (US-ASCII
162 0x09 or 0x20).
164 static inline bool isWhiteSpace(sal_uInt32 nChar);
166 /** Check for US-ASCII visible character.
168 @param nChar Some UCS-4 character.
170 @return True if nChar is a US-ASCII visible character (US-ASCII
171 0x21--0x7E).
173 static inline bool isVisible(sal_uInt32 nChar);
175 /** Check for US-ASCII Base 64 digit character.
177 @param nChar Some UCS-4 character.
179 @return True if nChar is a US-ASCII Base 64 digit character (US-ASCII
180 'A'--'Z', 'a'--'z', '0'--'9', '+', or '/').
182 static inline bool isBase64Digit(sal_uInt32 nChar);
184 /** Check whether some character is valid within an RFC 822 <atom>.
186 @param nChar Some UCS-4 character.
188 @return True if nChar is valid within an RFC 822 <atom> (US-ASCII
189 'A'--'Z', 'a'--'z', '0'--'9', '!', '#', '$', '%', '&', ''', '*', '+',
190 '-', '/', '=', '?', '^', '_', '`', '{', '|', '}', or '~').
192 static bool isAtomChar(sal_uInt32 nChar);
194 /** Check whether some character is valid within an RFC 2045 <token>.
196 @param nChar Some UCS-4 character.
198 @return True if nChar is valid within an RFC 2045 <token> (US-ASCII
199 'A'--'Z', 'a'--'z', '0'--'9', '!', '#', '$', '%', '&', ''', '*', '+',
200 '-', '.', '^', '_', '`', '{', '|', '}', or '~').
202 static bool isTokenChar(sal_uInt32 nChar);
204 /** Check whether some character is valid within an RFC 2047 <token>.
206 @param nChar Some UCS-4 character.
208 @return True if nChar is valid within an RFC 2047 <token> (US-ASCII
209 'A'--'Z', 'a'--'z', '0'--'9', '!', '#', '$', '%', '&', ''', '*', '+',
210 '-', '^', '_', '`', '{', '|', '}', or '~').
212 static bool isEncodedWordTokenChar(sal_uInt32 nChar);
214 /** Check whether some character is valid within an RFC 2060 <atom>.
216 @param nChar Some UCS-4 character.
218 @return True if nChar is valid within an RFC 2060 <atom> (US-ASCII
219 'A'--'Z', 'a'--'z', '0'--'9', '!', '#', '$', '&', ''', '+', ',', '-',
220 '.', '/', ':', ';', '<', '=', '>', '?', '@', '[', ']', '^', '_', '`',
221 '|', '}', or '~').
223 static bool isIMAPAtomChar(sal_uInt32 nChar);
225 /** Get the digit weight of a US-ASCII character.
227 @param nChar Some UCS-4 character.
229 @return If nChar is a US-ASCII (decimal) digit character (US-ASCII
230 '0'--'9'), return the corresponding weight (0--9); otherwise,
231 return -1.
233 static inline int getWeight(sal_uInt32 nChar);
235 /** Get the hexadecimal digit weight of a US-ASCII character.
237 @param nChar Some UCS-4 character.
239 @return If nChar is a US-ASCII hexadecimal digit character (US-ASCII
240 '0'--'9', 'A'--'F', or 'a'--'f'), return the corresponding weight
241 (0--15); otherwise, return -1.
243 static inline int getHexWeight(sal_uInt32 nChar);
245 /** Get the Base 64 digit weight of a US-ASCII character.
247 @param nChar Some UCS-4 character.
249 @return If nChar is a US-ASCII Base 64 digit character (US-ASCII
250 'A'--'Z', 'a'--'z', '0'--'9', '+', or '/'), return the
251 corresponding weight (0--63); if nChar is the US-ASCII Base 64 padding
252 character (US-ASCII '='), return -1; otherwise, return -2.
254 static inline int getBase64Weight(sal_uInt32 nChar);
256 /** Get a hexadecimal digit encoded as US-ASCII.
258 @param nWeight Must be in the range 0--15, inclusive.
260 @return The canonic (i.e., upper case) hexadecimal digit
261 corresponding to nWeight (US-ASCII '0'--'9' or 'A'--'F').
263 static sal_uInt32 getHexDigit(int nWeight);
265 /** Check two US-ASCII strings for equality, ignoring case.
267 @param pBegin1 Points to the start of the first string, must not be
268 null.
270 @param pEnd1 Points past the end of the first string, must be >=
271 pBegin1.
273 @param pString2 Points to the start of the null terminated second
274 string, must not be null.
276 @return True if the two strings are equal, ignoring the case of US-
277 ASCII alphabetic characters (US-ASCII 'A'--'Z' and 'a'--'z').
279 static bool equalIgnoreCase(const sal_Char * pBegin1,
280 const sal_Char * pEnd1,
281 const sal_Char * pString2);
283 /** Check two US-ASCII strings for equality, ignoring case.
285 @param pBegin1 Points to the start of the first string, must not be
286 null.
288 @param pEnd1 Points past the end of the first string, must be >=
289 pBegin1.
291 @param pString2 Points to the start of the null terminated second
292 string, must not be null.
294 @return True if the two strings are equal, ignoring the case of US-
295 ASCII alphabetic characters (US-ASCII 'A'--'Z' and 'a'--'z').
297 static bool equalIgnoreCase(const sal_Unicode * pBegin1,
298 const sal_Unicode * pEnd1,
299 const sal_Char * pString2);
301 static inline bool startsWithLineBreak(const sal_Char * pBegin,
302 const sal_Char * pEnd);
304 static inline bool startsWithLineBreak(const sal_Unicode * pBegin,
305 const sal_Unicode * pEnd);
307 static inline bool startsWithLineFolding(const sal_Char * pBegin,
308 const sal_Char * pEnd);
310 static inline bool startsWithLineFolding(const sal_Unicode * pBegin,
311 const sal_Unicode * pEnd);
313 static bool startsWithLinearWhiteSpace(const sal_Char * pBegin,
314 const sal_Char * pEnd);
316 static const sal_Unicode * skipLinearWhiteSpace(const sal_Unicode *
317 pBegin,
318 const sal_Unicode * pEnd);
320 static const sal_Unicode * skipComment(const sal_Unicode * pBegin,
321 const sal_Unicode * pEnd);
323 static const sal_Unicode * skipLinearWhiteSpaceComment(const sal_Unicode *
324 pBegin,
325 const sal_Unicode *
326 pEnd);
328 static inline bool needsQuotedStringEscape(sal_uInt32 nChar);
330 static const sal_Char * skipQuotedString(const sal_Char * pBegin,
331 const sal_Char * pEnd);
333 static const sal_Unicode * skipQuotedString(const sal_Unicode * pBegin,
334 const sal_Unicode * pEnd);
336 static bool scanUnsigned(const sal_Unicode *& rBegin,
337 const sal_Unicode * pEnd, bool bLeadingZeroes,
338 sal_uInt32 & rValue);
340 static const sal_Unicode * scanQuotedBlock(const sal_Unicode * pBegin,
341 const sal_Unicode * pEnd,
342 sal_uInt32 nOpening,
343 sal_uInt32 nClosing,
344 sal_Size & rLength,
345 bool & rModify);
347 static sal_Unicode const * scanParameters(sal_Unicode const * pBegin,
348 sal_Unicode const * pEnd,
349 INetContentTypeParameterList *
350 pParameters);
352 /** Parse the body of an RFC 2045 Content-Type header field.
354 @param pBegin The range (that must be valid) from non-null pBegin,
355 inclusive, to non-null pEnd, exclusive, forms the body of the
356 Content-Type header field. It must be of the form
358 token "/" token *(";" token "=" (token / quoted-string))
360 with intervening linear white space and comments (cf. RFCs 822, 2045).
361 The RFC 2231 extensions are supported. The encoding of the field body
362 should be US-ASCII, but any Unicode values in the range U+0080..U+FFFF
363 are interpreted 'as appropriate.'
365 @param pType If not null, returns the type (the first of the above
366 tokens), in US-ASCII encoding and converted to lower case.
368 @param pSubType If not null, returns the sub-type (the second of the
369 above tokens), in US-ASCII encoding and converted to lower case.
371 @param pParameters If not null, returns the parameters as a list of
372 INetContentTypeParameters (the attributes are in US-ASCII encoding and
373 converted to lower case, the values are in Unicode encoding). If
374 null, only the syntax of the parameters is checked, but they are not
375 returned.
377 @return Null if the syntax of the field body is incorrect (i.e., does
378 not start with type and sub-type tokens). Otherwise, a pointer past the
379 longest valid input prefix. If null is returned, none of the output
380 parameters will be modified.
382 static sal_Unicode const * scanContentType(
383 sal_Unicode const *pBegin, sal_Unicode const * pEnd,
384 OUString * pType = 0, OUString * pSubType = 0,
385 INetContentTypeParameterList * pParameters = 0);
387 static inline rtl_TextEncoding translateToMIME(rtl_TextEncoding
388 eEncoding);
390 static inline rtl_TextEncoding translateFromMIME(rtl_TextEncoding
391 eEncoding);
393 static const sal_Char * getCharsetName(rtl_TextEncoding eEncoding);
395 static rtl_TextEncoding getCharsetEncoding(const sal_Char * pBegin,
396 const sal_Char * pEnd);
398 static inline bool isMIMECharsetEncoding(rtl_TextEncoding eEncoding);
400 static INetMIMECharsetList_Impl *
401 createPreferredCharsetList(rtl_TextEncoding eEncoding);
403 static sal_Unicode * convertToUnicode(const sal_Char * pBegin,
404 const sal_Char * pEnd,
405 rtl_TextEncoding eEncoding,
406 sal_Size & rSize);
408 static sal_Char * convertFromUnicode(const sal_Unicode * pBegin,
409 const sal_Unicode * pEnd,
410 rtl_TextEncoding eEncoding,
411 sal_Size & rSize);
413 /** Get the number of octets required to encode a UCS-4 character using
414 UTF-8 encoding.
416 @param nChar Some UCS-4 character.
418 @return The number of octets required (in the range 1--6, inclusive).
420 static inline int getUTF8OctetCount(sal_uInt32 nChar);
422 static inline void writeEscapeSequence(INetMIMEOutputSink & rSink,
423 sal_uInt32 nChar);
425 static void writeUTF8(INetMIMEOutputSink & rSink, sal_uInt32 nChar);
427 static void writeHeaderFieldBody(INetMIMEOutputSink & rSink,
428 HeaderFieldType eType,
429 const OUString& rBody,
430 rtl_TextEncoding ePreferredEncoding,
431 bool bInitialSpace = true);
433 static bool translateUTF8Char(const sal_Char *& rBegin,
434 const sal_Char * pEnd,
435 rtl_TextEncoding eEncoding,
436 sal_uInt32 & rCharacter);
438 static OUString decodeHeaderFieldBody(HeaderFieldType eType,
439 const OString& rBody);
441 /** Get the UTF-32 character at the head of a UTF-16 encoded string.
443 @param rBegin Points to the start of the UTF-16 encoded string, must
444 not be null. On exit, it points past the first UTF-32 character's
445 encoding.
447 @param pEnd Points past the end of the UTF-16 encoded string, must be
448 strictly greater than rBegin.
450 @return The UCS-4 character at the head of the UTF-16 encoded string.
451 If the string does not start with the UTF-16 encoding of a UCS-4
452 character, the first UTF-16 value is returned.
454 static inline sal_uInt32 getUTF32Character(const sal_Unicode *& rBegin,
455 const sal_Unicode * pEnd);
457 /** Put the UTF-16 encoding of a UTF-32 character into a buffer.
459 @param pBuffer Points to a buffer, must not be null.
461 @param nUTF32 A UTF-32 character, must be in the range 0..0x10FFFF.
463 @return A pointer past the UTF-16 characters put into the buffer
464 (i.e., pBuffer + 1 or pBuffer + 2).
466 static inline sal_Unicode * putUTF32Character(sal_Unicode * pBuffer,
467 sal_uInt32 nUTF32);
470 // static
471 inline bool INetMIME::isISO88591(sal_uInt32 nChar)
473 return nChar <= 0xFF;
476 // static
477 inline bool INetMIME::isControl(sal_uInt32 nChar)
479 return nChar <= 0x1F || nChar == 0x7F;
482 // static
483 inline bool INetMIME::isWhiteSpace(sal_uInt32 nChar)
485 return nChar == '\t' || nChar == ' ';
488 // static
489 inline bool INetMIME::isVisible(sal_uInt32 nChar)
491 return nChar >= '!' && nChar <= '~';
494 // static
495 inline bool INetMIME::isBase64Digit(sal_uInt32 nChar)
497 return rtl::isAsciiUpperCase(nChar) || rtl::isAsciiLowerCase(nChar) || rtl::isAsciiDigit(nChar)
498 || nChar == '+' || nChar == '/';
501 // static
502 inline int INetMIME::getWeight(sal_uInt32 nChar)
504 return rtl::isAsciiDigit(nChar) ? int(nChar - '0') : -1;
507 // static
508 inline int INetMIME::getHexWeight(sal_uInt32 nChar)
510 return rtl::isAsciiDigit(nChar) ? int(nChar - '0') :
511 nChar >= 'A' && nChar <= 'F' ? int(nChar - 'A' + 10) :
512 nChar >= 'a' && nChar <= 'f' ? int(nChar - 'a' + 10) : -1;
515 // static
516 inline int INetMIME::getBase64Weight(sal_uInt32 nChar)
518 return rtl::isAsciiUpperCase(nChar) ? int(nChar - 'A') :
519 rtl::isAsciiLowerCase(nChar) ? int(nChar - 'a' + 26) :
520 rtl::isAsciiDigit(nChar) ? int(nChar - '0' + 52) :
521 nChar == '+' ? 62 :
522 nChar == '/' ? 63 :
523 nChar == '=' ? -1 : -2;
526 // static
527 inline bool INetMIME::startsWithLineBreak(const sal_Char * pBegin,
528 const sal_Char * pEnd)
530 DBG_ASSERT(pBegin && pBegin <= pEnd,
531 "INetMIME::startsWithLineBreak(): Bad sequence");
533 return pEnd - pBegin >= 2 && pBegin[0] == 0x0D && pBegin[1] == 0x0A;
534 // CR, LF
537 // static
538 inline bool INetMIME::startsWithLineBreak(const sal_Unicode * pBegin,
539 const sal_Unicode * pEnd)
541 DBG_ASSERT(pBegin && pBegin <= pEnd,
542 "INetMIME::startsWithLineBreak(): Bad sequence");
544 return pEnd - pBegin >= 2 && pBegin[0] == 0x0D && pBegin[1] == 0x0A;
545 // CR, LF
548 // static
549 inline bool INetMIME::startsWithLineFolding(const sal_Char * pBegin,
550 const sal_Char * pEnd)
552 DBG_ASSERT(pBegin && pBegin <= pEnd,
553 "INetMIME::startsWithLineFolding(): Bad sequence");
555 return pEnd - pBegin >= 3 && pBegin[0] == 0x0D && pBegin[1] == 0x0A
556 && isWhiteSpace(pBegin[2]); // CR, LF
559 // static
560 inline bool INetMIME::startsWithLineFolding(const sal_Unicode * pBegin,
561 const sal_Unicode * pEnd)
563 DBG_ASSERT(pBegin && pBegin <= pEnd,
564 "INetMIME::startsWithLineFolding(): Bad sequence");
566 return pEnd - pBegin >= 3 && pBegin[0] == 0x0D && pBegin[1] == 0x0A
567 && isWhiteSpace(pBegin[2]); // CR, LF
570 // static
571 inline bool INetMIME::startsWithLinearWhiteSpace(const sal_Char * pBegin,
572 const sal_Char * pEnd)
574 DBG_ASSERT(pBegin && pBegin <= pEnd,
575 "INetMIME::startsWithLinearWhiteSpace(): Bad sequence");
577 return pBegin != pEnd
578 && (isWhiteSpace(*pBegin) || startsWithLineFolding(pBegin, pEnd));
581 // static
582 inline bool INetMIME::needsQuotedStringEscape(sal_uInt32 nChar)
584 return nChar == '"' || nChar == '\\';
587 // static
588 inline rtl_TextEncoding INetMIME::translateToMIME(rtl_TextEncoding eEncoding)
590 #if defined WNT
591 return eEncoding == RTL_TEXTENCODING_MS_1252 ?
592 RTL_TEXTENCODING_ISO_8859_1 : eEncoding;
593 #else // WNT
594 return eEncoding;
595 #endif // WNT
598 // static
599 inline rtl_TextEncoding INetMIME::translateFromMIME(rtl_TextEncoding
600 eEncoding)
602 #if defined WNT
603 return eEncoding == RTL_TEXTENCODING_ISO_8859_1 ?
604 RTL_TEXTENCODING_MS_1252 : eEncoding;
605 #else
606 return eEncoding;
607 #endif
610 // static
611 inline bool INetMIME::isMIMECharsetEncoding(rtl_TextEncoding eEncoding)
613 return ( rtl_isOctetTextEncoding(eEncoding) == sal_True );
616 // static
617 inline int INetMIME::getUTF8OctetCount(sal_uInt32 nChar)
619 DBG_ASSERT(nChar < 0x80000000, "INetMIME::getUTF8OctetCount(): Bad char");
621 return nChar < 0x80 ? 1 :
622 nChar < 0x800 ? 2 :
623 nChar <= 0x10000 ? 3 :
624 nChar <= 0x200000 ? 4 :
625 nChar <= 0x4000000 ? 5 : 6;
628 // static
629 inline sal_uInt32 INetMIME::getUTF32Character(const sal_Unicode *& rBegin,
630 const sal_Unicode * pEnd)
632 DBG_ASSERT(rBegin && rBegin < pEnd,
633 "INetMIME::getUTF32Character(): Bad sequence");
634 if (rBegin + 1 < pEnd && rBegin[0] >= 0xD800 && rBegin[0] <= 0xDBFF
635 && rBegin[1] >= 0xDC00 && rBegin[1] <= 0xDFFF)
637 sal_uInt32 nUTF32 = sal_uInt32(*rBegin++ & 0x3FF) << 10;
638 return (nUTF32 | (*rBegin++ & 0x3FF)) + 0x10000;
640 else
641 return *rBegin++;
644 // static
645 inline sal_Unicode * INetMIME::putUTF32Character(sal_Unicode * pBuffer,
646 sal_uInt32 nUTF32)
648 DBG_ASSERT(nUTF32 <= 0x10FFFF, "INetMIME::putUTF32Character(): Bad char");
649 if (nUTF32 < 0x10000)
650 *pBuffer++ = sal_Unicode(nUTF32);
651 else
653 nUTF32 -= 0x10000;
654 *pBuffer++ = sal_Unicode(0xD800 | (nUTF32 >> 10));
655 *pBuffer++ = sal_Unicode(0xDC00 | (nUTF32 & 0x3FF));
657 return pBuffer;
660 class INetMIMEOutputSink
662 public:
663 static sal_uInt32 const NO_LINE_LENGTH_LIMIT = SAL_MAX_UINT32;
665 private:
666 sal_uInt32 m_nColumn;
667 sal_uInt32 m_nLineLengthLimit;
669 protected:
670 /** Write a sequence of octets.
672 @param pBegin Points to the start of the sequence, must not be null.
674 @param pEnd Points past the end of the sequence, must be >= pBegin.
676 virtual void writeSequence(const sal_Char * pBegin,
677 const sal_Char * pEnd) = 0;
679 /** Write a null terminated sequence of octets (without the terminating
680 null).
682 @param pOctets A null terminated sequence of octets, must not be
683 null.
685 @return The length of pOctets (without the terminating null).
687 sal_Size writeSequence(const sal_Char * pSequence);
689 /** Write a sequence of octets.
691 @descr The supplied sequence of Unicode characters is interpreted as
692 a sequence of octets. It is an error if any of the elements of the
693 sequence has a numerical value greater than 255.
695 @param pBegin Points to the start of the sequence, must not be null.
697 @param pEnd Points past the end of the sequence, must be >= pBegin.
699 void writeSequence(const sal_Unicode * pBegin,
700 const sal_Unicode * pEnd);
702 public:
703 INetMIMEOutputSink(sal_uInt32 nTheColumn = 0,
704 sal_uInt32 nTheLineLengthLimit
705 = INetMIME::SOFT_LINE_LENGTH_LIMIT):
706 m_nColumn(nTheColumn), m_nLineLengthLimit(nTheLineLengthLimit) {}
708 virtual ~INetMIMEOutputSink() {}
710 /** Get the current column.
712 @return The current column (starting from zero).
714 sal_uInt32 getColumn() const { return m_nColumn; }
716 sal_uInt32 getLineLengthLimit() const { return m_nLineLengthLimit; }
718 void setLineLengthLimit(sal_uInt32 nTheLineLengthLimit)
719 { m_nLineLengthLimit = nTheLineLengthLimit; }
721 virtual ErrCode getError() const;
723 /** Write a sequence of octets.
725 @param pBegin Points to the start of the sequence, must not be null.
727 @param pEnd Points past the end of the sequence, must be >= pBegin.
729 inline void write(const sal_Char * pBegin, const sal_Char * pEnd);
731 /** Write a sequence of octets.
733 @param pBegin Points to the start of the sequence, must not be null.
735 @param nLength The length of the sequence.
737 void write(const sal_Char * pBegin, sal_Size nLength)
738 { write(pBegin, pBegin + nLength); }
740 /** Write a sequence of octets.
742 @descr The supplied sequence of Unicode characters is interpreted as
743 a sequence of octets. It is an error if any of the elements of the
744 sequence has a numerical value greater than 255.
746 @param pBegin Points to the start of the sequence, must not be null.
748 @param pEnd Points past the end of the sequence, must be >= pBegin.
750 inline void write(const sal_Unicode * pBegin, const sal_Unicode * pEnd);
752 /** Write a sequence of octets.
754 @param rOctets A OString, interpreted as a sequence of octets.
756 @param nBegin The offset of the first character to write.
758 @param nEnd The offset past the last character to write.
760 void write(const OString& rOctets, sal_Int32 nBegin, sal_Int32 nEnd)
762 writeSequence(rOctets.getStr() + nBegin, rOctets.getStr() + nEnd);
763 m_nColumn += nEnd - nBegin;
766 /** Write a single octet.
768 @param nOctet Some octet.
770 @return This instance.
772 inline INetMIMEOutputSink & operator <<(sal_Char nOctet);
774 /** Write a null terminated sequence of octets (without the terminating
775 null).
777 @param pOctets A null terminated sequence of octets, must not be
778 null.
780 @return This instance.
782 inline INetMIMEOutputSink & operator <<(const sal_Char * pOctets);
784 /** Write a sequence of octets.
786 @param rOctets A OString, interpreted as a sequence of octets.
788 @return This instance.
790 INetMIMEOutputSink & operator <<(const OString& rOctets)
792 writeSequence(rOctets.getStr(), rOctets.getStr() + rOctets.getLength());
793 m_nColumn += rOctets.getLength();
794 return *this;
797 /** Call a manipulator function.
799 @param pManipulator A manipulator function.
801 @return Whatever the manipulator function returns.
803 INetMIMEOutputSink &
804 operator <<(INetMIMEOutputSink & (* pManipulator)(INetMIMEOutputSink &))
805 { return pManipulator(*this); }
807 /** Write a line end (CR LF).
809 void writeLineEnd();
811 /** A manipulator function that writes a line end (CR LF).
813 @param rSink Some sink.
815 @return The sink rSink.
817 static inline INetMIMEOutputSink & endl(INetMIMEOutputSink & rSink);
820 inline void INetMIMEOutputSink::write(const sal_Char * pBegin,
821 const sal_Char * pEnd)
823 writeSequence(pBegin, pEnd);
824 m_nColumn += pEnd - pBegin;
827 inline void INetMIMEOutputSink::write(const sal_Unicode * pBegin,
828 const sal_Unicode * pEnd)
830 writeSequence(pBegin, pEnd);
831 m_nColumn += pEnd - pBegin;
834 inline INetMIMEOutputSink & INetMIMEOutputSink::operator <<(sal_Char nOctet)
836 writeSequence(&nOctet, &nOctet + 1);
837 ++m_nColumn;
838 return *this;
841 inline INetMIMEOutputSink & INetMIMEOutputSink::operator <<(const sal_Char *
842 pOctets)
844 m_nColumn += writeSequence(pOctets);
845 return *this;
848 // static
849 inline INetMIMEOutputSink & INetMIMEOutputSink::endl(INetMIMEOutputSink &
850 rSink)
852 rSink.writeLineEnd();
853 return rSink;
856 // static
857 inline void INetMIME::writeEscapeSequence(INetMIMEOutputSink & rSink,
858 sal_uInt32 nChar)
860 DBG_ASSERT(nChar <= 0xFF, "INetMIME::writeEscapeSequence(): Bad char");
861 rSink << '=' << sal_uInt8(getHexDigit(nChar >> 4))
862 << sal_uInt8(getHexDigit(nChar & 15));
865 class INetMIMEStringOutputSink: public INetMIMEOutputSink
867 OStringBuffer m_aBuffer;
869 using INetMIMEOutputSink::writeSequence;
871 virtual void writeSequence(const sal_Char * pBegin,
872 const sal_Char * pEnd) SAL_OVERRIDE;
874 public:
875 inline INetMIMEStringOutputSink(sal_uInt32 nColumn = 0,
876 sal_uInt32 nLineLengthLimit
877 = INetMIME::SOFT_LINE_LENGTH_LIMIT):
878 INetMIMEOutputSink(nColumn, nLineLengthLimit) {}
880 virtual ErrCode getError() const SAL_OVERRIDE;
882 OString takeBuffer()
884 return m_aBuffer.makeStringAndClear();
888 class INetMIMEEncodedWordOutputSink
890 public:
891 enum Context { CONTEXT_TEXT = 1,
892 CONTEXT_COMMENT = 2,
893 CONTEXT_PHRASE = 4 };
895 enum Space { SPACE_NO, SPACE_ENCODED, SPACE_ALWAYS };
897 private:
898 enum { BUFFER_SIZE = 256 };
900 enum Coding { CODING_NONE, CODING_QUOTED, CODING_ENCODED,
901 CODING_ENCODED_TERMINATED };
903 enum EncodedWordState { STATE_INITIAL, STATE_FIRST_EQUALS,
904 STATE_FIRST_QUESTION, STATE_CHARSET,
905 STATE_SECOND_QUESTION, STATE_ENCODING,
906 STATE_THIRD_QUESTION, STATE_ENCODED_TEXT,
907 STATE_FOURTH_QUESTION, STATE_SECOND_EQUALS,
908 STATE_BAD };
910 INetMIMEOutputSink & m_rSink;
911 Context m_eContext;
912 Space m_eInitialSpace;
913 sal_uInt32 m_nExtraSpaces;
914 INetMIMECharsetList_Impl * m_pEncodingList;
915 sal_Unicode * m_pBuffer;
916 sal_uInt32 m_nBufferSize;
917 sal_Unicode * m_pBufferEnd;
918 Coding m_ePrevCoding;
919 rtl_TextEncoding m_ePrevMIMEEncoding;
920 Coding m_eCoding;
921 sal_uInt32 m_nQuotedEscaped;
922 EncodedWordState m_eEncodedWordState;
924 inline bool needsEncodedWordEscape(sal_uInt32 nChar) const;
926 void finish(bool bWriteTrailer);
928 public:
929 inline INetMIMEEncodedWordOutputSink(INetMIMEOutputSink & rTheSink,
930 Context eTheContext,
931 Space eTheInitialSpace,
932 rtl_TextEncoding ePreferredEncoding);
934 ~INetMIMEEncodedWordOutputSink();
936 INetMIMEEncodedWordOutputSink & WriteUInt32(sal_uInt32 nChar);
938 inline void write(const sal_Char * pBegin, const sal_Char * pEnd);
940 inline void write(const sal_Unicode * pBegin, const sal_Unicode * pEnd);
942 inline bool flush();
945 inline INetMIMEEncodedWordOutputSink::INetMIMEEncodedWordOutputSink(
946 INetMIMEOutputSink & rTheSink, Context eTheContext,
947 Space eTheInitialSpace, rtl_TextEncoding ePreferredEncoding):
948 m_rSink(rTheSink),
949 m_eContext(eTheContext),
950 m_eInitialSpace(eTheInitialSpace),
951 m_nExtraSpaces(0),
952 m_pEncodingList(INetMIME::createPreferredCharsetList(ePreferredEncoding)),
953 m_ePrevCoding(CODING_NONE),
954 m_ePrevMIMEEncoding(RTL_TEXTENCODING_DONTKNOW),
955 m_eCoding(CODING_NONE),
956 m_nQuotedEscaped(0),
957 m_eEncodedWordState(STATE_INITIAL)
959 m_nBufferSize = BUFFER_SIZE;
960 m_pBuffer = static_cast< sal_Unicode * >(rtl_allocateMemory(
961 m_nBufferSize
962 * sizeof (sal_Unicode)));
963 m_pBufferEnd = m_pBuffer;
966 inline void INetMIMEEncodedWordOutputSink::write(const sal_Char * pBegin,
967 const sal_Char * pEnd)
969 DBG_ASSERT(pBegin && pBegin <= pEnd,
970 "INetMIMEEncodedWordOutputSink::write(): Bad sequence");
972 while (pBegin != pEnd)
973 WriteUInt32(*pBegin++);
976 inline void INetMIMEEncodedWordOutputSink::write(const sal_Unicode * pBegin,
977 const sal_Unicode * pEnd)
979 DBG_ASSERT(pBegin && pBegin <= pEnd,
980 "INetMIMEEncodedWordOutputSink::write(): Bad sequence");
982 while (pBegin != pEnd)
983 WriteUInt32(*pBegin++);
986 inline bool INetMIMEEncodedWordOutputSink::flush()
988 finish(true);
989 return m_ePrevCoding != CODING_NONE;
992 struct INetContentTypeParameter
994 /** The name of the attribute, in US-ASCII encoding and converted to lower
995 case. If a parameter value is split as described in RFC 2231, there
996 will only be one item for the complete parameter, with the attribute
997 name lacking any section suffix.
999 const OString m_sAttribute;
1001 /** The optional character set specification (see RFC 2231), in US-ASCII
1002 encoding and converted to lower case.
1004 const OString m_sCharset;
1006 /** The optional language specification (see RFC 2231), in US-ASCII
1007 encoding and converted to lower case.
1009 const OString m_sLanguage;
1011 /** The attribute value. If the value is a quoted-string, it is
1012 'unpacked.' If a character set is specified, and the value can be
1013 converted to Unicode, this is done. Also, if no character set is
1014 specified, it is first tried to convert the value from UTF-8 encoding
1015 to Unicode, and if that doesn't work (because the value is not in
1016 UTF-8 encoding), it is converted from ISO-8859-1 encoding to Unicode
1017 (which will always work). But if a character set is specified and the
1018 value cannot be converted from that character set to Unicode, special
1019 action is taken to produce a value that can possibly be transformed
1020 back into its original form: Any 8-bit character from a non-encoded
1021 part of the original value is directly converted to Unicode
1022 (effectively handling it as if it was ISO-8859-1 encoded), and any
1023 8-bit character from an encoded part of the original value is mapped
1024 to the range U+F800..U+F8FF at the top of the Corporate Use Subarea
1025 within Unicode's Private Use Area (effectively adding 0xF800 to the
1026 character's numeric value).
1028 const OUString m_sValue;
1030 /** This is true if the value is successfully converted to Unicode, and
1031 false if the value is a special mixture of ISO-LATIN-1 characters and
1032 characters from Unicode's Private Use Area.
1034 const bool m_bConverted;
1036 INetContentTypeParameter(const OString& rTheAttribute,
1037 const OString& rTheCharset, const OString& rTheLanguage,
1038 const OUString& rTheValue, bool bTheConverted)
1039 : m_sAttribute(rTheAttribute)
1040 , m_sCharset(rTheCharset)
1041 , m_sLanguage(rTheLanguage)
1042 , m_sValue(rTheValue)
1043 , m_bConverted(bTheConverted)
1048 class TOOLS_DLLPUBLIC INetContentTypeParameterList
1050 public:
1052 void Clear();
1054 void Insert(INetContentTypeParameter * pParameter, sal_uIntPtr nIndex)
1056 maEntries.insert(maEntries.begin()+nIndex,pParameter);
1059 void Append(INetContentTypeParameter *pParameter)
1061 maEntries.push_back(pParameter);
1064 inline const INetContentTypeParameter * GetObject(sal_uIntPtr nIndex) const
1066 return &(maEntries[nIndex]);
1069 const INetContentTypeParameter * find(const OString& rAttribute) const;
1071 private:
1073 boost::ptr_vector<INetContentTypeParameter> maEntries;
1076 #endif
1078 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */