1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
3 * This file is part of the LibreOffice project.
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 * This file incorporates work covered by the following license notice:
11 * Licensed to the Apache Software Foundation (ASF) under one or more
12 * contributor license agreements. See the NOTICE file distributed
13 * with this work for additional information regarding copyright
14 * ownership. The ASF licenses this file to you under the Apache
15 * License, Version 2.0 (the "License"); you may not use this file
16 * except in compliance with the License. You may obtain a copy of
17 * the License at http://www.apache.org/licenses/LICENSE-2.0 .
22 #include <forward_list>
25 #include <sal/log.hxx>
26 #include <rtl/ustring.hxx>
27 #include <rtl/strbuf.hxx>
28 #include <rtl/ustrbuf.hxx>
29 #include <rtl/tencinfo.h>
30 #include <tools/inetmime.hxx>
31 #include <rtl/character.hxx>
35 rtl_TextEncoding
getCharsetEncoding(const char * pBegin
,
38 /** Check for US-ASCII white space character.
40 @param nChar Some UCS-4 character.
42 @return True if nChar is a US-ASCII white space character (US-ASCII
45 bool isWhiteSpace(sal_uInt32 nChar
)
47 return nChar
== '\t' || nChar
== ' ';
50 /** Get the Base 64 digit weight of a US-ASCII character.
52 @param nChar Some UCS-4 character.
54 @return If nChar is a US-ASCII Base 64 digit character (US-ASCII
55 'A'--'F', or 'a'--'f', '0'--'9', '+', or '/'), return the
56 corresponding weight (0--63); if nChar is the US-ASCII Base 64 padding
57 character (US-ASCII '='), return -1; otherwise, return -2.
59 int getBase64Weight(sal_uInt32 nChar
)
61 return rtl::isAsciiUpperCase(nChar
) ? int(nChar
- 'A') :
62 rtl::isAsciiLowerCase(nChar
) ? int(nChar
- 'a' + 26) :
63 rtl::isAsciiDigit(nChar
) ? int(nChar
- '0' + 52) :
66 nChar
== '=' ? -1 : -2;
69 bool startsWithLineFolding(const sal_Unicode
* pBegin
,
70 const sal_Unicode
* pEnd
)
72 DBG_ASSERT(pBegin
&& pBegin
<= pEnd
,
73 "startsWithLineFolding(): Bad sequence");
75 return pEnd
- pBegin
>= 3 && pBegin
[0] == 0x0D && pBegin
[1] == 0x0A
76 && isWhiteSpace(pBegin
[2]); // CR, LF
79 rtl_TextEncoding
translateFromMIME(rtl_TextEncoding
83 return eEncoding
== RTL_TEXTENCODING_ISO_8859_1
?
84 RTL_TEXTENCODING_MS_1252
: eEncoding
;
90 bool isMIMECharsetEncoding(rtl_TextEncoding eEncoding
)
92 return rtl_isOctetTextEncoding(eEncoding
);
95 std::unique_ptr
<sal_Unicode
[]> convertToUnicode(const char * pBegin
,
97 rtl_TextEncoding eEncoding
,
100 if (eEncoding
== RTL_TEXTENCODING_DONTKNOW
)
102 rtl_TextToUnicodeConverter hConverter
103 = rtl_createTextToUnicodeConverter(eEncoding
);
104 rtl_TextToUnicodeContext hContext
105 = rtl_createTextToUnicodeContext(hConverter
);
106 std::unique_ptr
<sal_Unicode
[]> pBuffer
;
108 for (sal_Size nBufferSize
= pEnd
- pBegin
;;
109 nBufferSize
+= nBufferSize
/ 3 + 1)
111 pBuffer
.reset(new sal_Unicode
[nBufferSize
]);
112 sal_Size nSrcCvtBytes
;
113 rSize
= rtl_convertTextToUnicode(
114 hConverter
, hContext
, pBegin
, pEnd
- pBegin
, pBuffer
.get(),
116 RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_ERROR
117 | RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_ERROR
118 | RTL_TEXTTOUNICODE_FLAGS_INVALID_ERROR
,
119 &nInfo
, &nSrcCvtBytes
);
120 if (nInfo
!= RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOOSMALL
)
123 rtl_resetTextToUnicodeContext(hConverter
, hContext
);
125 rtl_destroyTextToUnicodeContext(hConverter
, hContext
);
126 rtl_destroyTextToUnicodeConverter(hConverter
);
134 /** Put the UTF-16 encoding of a UTF-32 character into a buffer.
136 @param pBuffer Points to a buffer, must not be null.
138 @param nUTF32 A UTF-32 character, must be in the range 0..0x10FFFF.
140 @return A pointer past the UTF-16 characters put into the buffer
141 (i.e., pBuffer + 1 or pBuffer + 2).
143 sal_Unicode
* putUTF32Character(sal_Unicode
* pBuffer
,
146 DBG_ASSERT(rtl::isUnicodeCodePoint(nUTF32
), "putUTF32Character(): Bad char");
147 if (nUTF32
< 0x10000)
148 *pBuffer
++ = sal_Unicode(nUTF32
);
152 *pBuffer
++ = sal_Unicode(0xD800 | (nUTF32
>> 10));
153 *pBuffer
++ = sal_Unicode(0xDC00 | (nUTF32
& 0x3FF));
158 void writeUTF8(OStringBuffer
& rSink
, sal_uInt32 nChar
)
160 // See RFC 2279 for a discussion of UTF-8.
161 DBG_ASSERT(nChar
< 0x80000000, "writeUTF8(): Bad char");
164 rSink
.append(char(nChar
));
165 else if (nChar
< 0x800)
166 rSink
.append(char(nChar
>> 6 | 0xC0))
167 .append(char((nChar
& 0x3F) | 0x80));
168 else if (nChar
< 0x10000)
169 rSink
.append(char(nChar
>> 12 | 0xE0))
170 .append(char((nChar
>> 6 & 0x3F) | 0x80))
171 .append(char((nChar
& 0x3F) | 0x80));
172 else if (nChar
< 0x200000)
173 rSink
.append(char(nChar
>> 18 | 0xF0))
174 .append(char((nChar
>> 12 & 0x3F) | 0x80))
175 .append(char((nChar
>> 6 & 0x3F) | 0x80))
176 .append(char((nChar
& 0x3F) | 0x80));
177 else if (nChar
< 0x4000000)
178 rSink
.append(char(nChar
>> 24 | 0xF8))
179 .append(char((nChar
>> 18 & 0x3F) | 0x80))
180 .append(char((nChar
>> 12 & 0x3F) | 0x80))
181 .append(char((nChar
>> 6 & 0x3F) | 0x80))
182 .append(char((nChar
& 0x3F) | 0x80));
184 rSink
.append(char(nChar
>> 30 | 0xFC))
185 .append(char((nChar
>> 24 & 0x3F) | 0x80))
186 .append(char((nChar
>> 18 & 0x3F) | 0x80))
187 .append(char((nChar
>> 12 & 0x3F) | 0x80))
188 .append(char((nChar
>> 6 & 0x3F) | 0x80))
189 .append(char((nChar
& 0x3F) | 0x80));
192 bool translateUTF8Char(const char *& rBegin
,
194 sal_uInt32
& rCharacter
)
196 if (rBegin
== pEnd
|| static_cast< unsigned char >(*rBegin
) < 0x80
197 || static_cast< unsigned char >(*rBegin
) >= 0xFE)
203 const char * p
= rBegin
;
204 if (static_cast< unsigned char >(*p
) < 0xE0)
208 nUCS4
= static_cast< unsigned char >(*p
) & 0x1F;
210 else if (static_cast< unsigned char >(*p
) < 0xF0)
214 nUCS4
= static_cast< unsigned char >(*p
) & 0xF;
216 else if (static_cast< unsigned char >(*p
) < 0xF8)
220 nUCS4
= static_cast< unsigned char >(*p
) & 7;
222 else if (static_cast< unsigned char >(*p
) < 0xFC)
226 nUCS4
= static_cast< unsigned char >(*p
) & 3;
232 nUCS4
= static_cast< unsigned char >(*p
) & 1;
236 for (; nCount
-- > 0; ++p
)
237 if ((static_cast< unsigned char >(*p
) & 0xC0) == 0x80)
238 nUCS4
= (nUCS4
<< 6) | (static_cast< unsigned char >(*p
) & 0x3F);
242 if (!rtl::isUnicodeCodePoint(nUCS4
) || nUCS4
< nMin
)
250 void appendISO88591(OUStringBuffer
& rText
, char const * pBegin
,
255 OString m_aAttribute
;
259 sal_uInt32 m_nSection
;
262 bool operator<(const Parameter
& rhs
) const // is used by std::list<Parameter>::sort
264 int nComp
= m_aAttribute
.compareTo(rhs
.m_aAttribute
);
266 (nComp
== 0 && m_nSection
< rhs
.m_nSection
);
268 struct IsSameSection
// is used to check container for duplicates with std::any_of
270 const OString
& rAttribute
;
271 const sal_uInt32 nSection
;
272 bool operator()(const Parameter
& r
) const
273 { return r
.m_aAttribute
== rAttribute
&& r
.m_nSection
== nSection
; }
277 typedef std::forward_list
<Parameter
> ParameterList
;
279 bool parseParameters(ParameterList
const & rInput
,
280 INetContentTypeParameterList
* pOutput
);
284 void appendISO88591(OUStringBuffer
& rText
, char const * pBegin
,
287 sal_Int32 nLength
= pEnd
- pBegin
;
288 std::unique_ptr
<sal_Unicode
[]> pBuffer(new sal_Unicode
[nLength
]);
289 for (sal_Unicode
* p
= pBuffer
.get(); pBegin
!= pEnd
;)
290 *p
++ = static_cast<unsigned char>(*pBegin
++);
291 rText
.append(pBuffer
.get(), nLength
);
296 bool parseParameters(ParameterList
const & rInput
,
297 INetContentTypeParameterList
* pOutput
)
302 for (auto it
= rInput
.begin(), itPrev
= rInput
.end(); it
!= rInput
.end() ; itPrev
= it
++)
304 if (it
->m_nSection
> 0
305 && (itPrev
== rInput
.end()
306 || itPrev
->m_nSection
!= it
->m_nSection
- 1
307 || itPrev
->m_aAttribute
!= it
->m_aAttribute
))
312 for (auto it
= rInput
.begin(), itNext
= rInput
.begin(); it
!= rInput
.end(); it
= itNext
)
314 bool bCharset
= !it
->m_aCharset
.isEmpty();
315 rtl_TextEncoding eEncoding
= RTL_TEXTENCODING_DONTKNOW
;
318 = getCharsetEncoding(it
->m_aCharset
.getStr(),
319 it
->m_aCharset
.getStr()
320 + it
->m_aCharset
.getLength());
321 OUStringBuffer
aValue(64);
322 bool bBadEncoding
= false;
327 std::unique_ptr
<sal_Unicode
[]> pUnicode
328 = convertToUnicode(itNext
->m_aValue
.getStr(),
329 itNext
->m_aValue
.getStr()
330 + itNext
->m_aValue
.getLength(),
331 bCharset
&& it
->m_bExtended
?
333 RTL_TEXTENCODING_UTF8
,
335 if (!pUnicode
&& !(bCharset
&& it
->m_bExtended
))
336 pUnicode
= convertToUnicode(
337 itNext
->m_aValue
.getStr(),
338 itNext
->m_aValue
.getStr()
339 + itNext
->m_aValue
.getLength(),
340 RTL_TEXTENCODING_ISO_8859_1
, nSize
);
346 aValue
.append(pUnicode
.get(), static_cast<sal_Int32
>(nSize
));
349 while (itNext
!= rInput
.end() && itNext
->m_nSection
!= 0);
357 if (itNext
->m_bExtended
)
359 for (sal_Int32 i
= 0; i
< itNext
->m_aValue
.getLength(); ++i
)
361 static_cast<sal_Unicode
>(
362 static_cast<unsigned char>(itNext
->m_aValue
[i
])
363 | 0xF800)); // map to unicode corporate use sub area
367 for (sal_Int32 i
= 0; i
< itNext
->m_aValue
.getLength(); ++i
)
368 aValue
.append( itNext
->m_aValue
[i
] );
372 while (itNext
!= rInput
.end() && itNext
->m_nSection
!= 0);
374 auto const ret
= pOutput
->insert(
376 {it
->m_aCharset
, it
->m_aLanguage
, aValue
.makeStringAndClear(), !bBadEncoding
}});
377 SAL_INFO_IF(!ret
.second
, "tools",
378 "INetMIME: dropping duplicate parameter: " << it
->m_aAttribute
);
383 /** Check whether some character is valid within an RFC 2045 <token>.
385 @param nChar Some UCS-4 character.
387 @return True if nChar is valid within an RFC 2047 <token> (US-ASCII
388 'A'--'Z', 'a'--'z', '0'--'9', '!', '#', '$', '%', '&', ''', '*', '+',
389 '-', '.', '^', '_', '`', '{', '|', '}', or '~').
391 bool isTokenChar(sal_uInt32 nChar
)
393 static const bool aMap
[128]
394 = { false, false, false, false, false, false, false, false,
395 false, false, false, false, false, false, false, false,
396 false, false, false, false, false, false, false, false,
397 false, false, false, false, false, false, false, false,
398 false, true, false, true, true, true, true, true, // !"#$%&'
399 false, false, true, true, false, true, true, false, //()*+,-./
400 true, true, true, true, true, true, true, true, //01234567
401 true, true, false, false, false, false, false, false, //89:;<=>?
402 false, true, true, true, true, true, true, true, //@ABCDEFG
403 true, true, true, true, true, true, true, true, //HIJKLMNO
404 true, true, true, true, true, true, true, true, //PQRSTUVW
405 true, true, true, false, false, false, true, true, //XYZ[\]^_
406 true, true, true, true, true, true, true, true, //`abcdefg
407 true, true, true, true, true, true, true, true, //hijklmno
408 true, true, true, true, true, true, true, true, //pqrstuvw
409 true, true, true, true, true, true, true, false //xyz{|}~
411 return rtl::isAscii(nChar
) && aMap
[nChar
];
414 const sal_Unicode
* skipComment(const sal_Unicode
* pBegin
,
415 const sal_Unicode
* pEnd
)
417 DBG_ASSERT(pBegin
&& pBegin
<= pEnd
,
418 "skipComment(): Bad sequence");
420 if (pBegin
!= pEnd
&& *pBegin
== '(')
422 sal_uInt32 nLevel
= 0;
423 for (const sal_Unicode
* p
= pBegin
; p
!= pEnd
;)
444 const sal_Unicode
* skipLinearWhiteSpaceComment(const sal_Unicode
*
449 DBG_ASSERT(pBegin
&& pBegin
<= pEnd
,
450 "skipLinearWhiteSpaceComment(): Bad sequence");
452 while (pBegin
!= pEnd
)
461 if (startsWithLineFolding(pBegin
, pEnd
))
469 const sal_Unicode
* p
= skipComment(pBegin
, pEnd
);
482 const sal_Unicode
* skipQuotedString(const sal_Unicode
* pBegin
,
483 const sal_Unicode
* pEnd
)
485 DBG_ASSERT(pBegin
&& pBegin
<= pEnd
,
486 "skipQuotedString(): Bad sequence");
488 if (pBegin
!= pEnd
&& *pBegin
== '"')
489 for (const sal_Unicode
* p
= pBegin
+ 1; p
!= pEnd
;)
493 if (pEnd
- p
< 2 || *p
++ != 0x0A // LF
494 || !isWhiteSpace(*p
++))
509 sal_Unicode
const * scanParameters(sal_Unicode
const * pBegin
,
510 sal_Unicode
const * pEnd
,
511 INetContentTypeParameterList
*
515 sal_Unicode
const * pParameterBegin
= pBegin
;
516 for (sal_Unicode
const * p
= pParameterBegin
;;)
518 pParameterBegin
= skipLinearWhiteSpaceComment(p
, pEnd
);
519 if (pParameterBegin
== pEnd
|| *pParameterBegin
!= ';')
521 p
= pParameterBegin
+ 1;
523 sal_Unicode
const * pAttributeBegin
524 = skipLinearWhiteSpaceComment(p
, pEnd
);
526 bool bDowncaseAttribute
= false;
527 while (p
!= pEnd
&& isTokenChar(*p
) && *p
!= '*')
529 bDowncaseAttribute
= bDowncaseAttribute
|| rtl::isAsciiUpperCase(*p
);
532 if (p
== pAttributeBegin
)
534 OString
aAttribute(pAttributeBegin
, p
- pAttributeBegin
, RTL_TEXTENCODING_ASCII_US
);
535 if (bDowncaseAttribute
)
536 aAttribute
= aAttribute
.toAsciiLowerCase();
538 sal_uInt32 nSection
= 0;
539 if (p
!= pEnd
&& *p
== '*')
542 if (p
!= pEnd
&& rtl::isAsciiDigit(*p
)
543 && !INetMIME::scanUnsigned(p
, pEnd
, false, nSection
))
547 bool bPresent
= std::any_of(aList
.begin(), aList
.end(),
548 Parameter::IsSameSection
{aAttribute
, nSection
});
552 bool bExtended
= false;
553 if (p
!= pEnd
&& *p
== '*')
559 p
= skipLinearWhiteSpaceComment(p
, pEnd
);
561 if (p
== pEnd
|| *p
!= '=')
564 p
= skipLinearWhiteSpaceComment(p
+ 1, pEnd
);
573 sal_Unicode
const * pCharsetBegin
= p
;
574 bool bDowncaseCharset
= false;
575 while (p
!= pEnd
&& isTokenChar(*p
) && *p
!= '\'')
577 bDowncaseCharset
= bDowncaseCharset
|| rtl::isAsciiUpperCase(*p
);
580 if (p
== pCharsetBegin
)
587 RTL_TEXTENCODING_ASCII_US
);
588 if (bDowncaseCharset
)
589 aCharset
= aCharset
.toAsciiLowerCase();
592 if (p
== pEnd
|| *p
!= '\'')
596 sal_Unicode
const * pLanguageBegin
= p
;
597 bool bDowncaseLanguage
= false;
599 for (; p
!= pEnd
; ++p
)
600 if (rtl::isAsciiAlpha(*p
))
604 bDowncaseLanguage
= bDowncaseLanguage
605 || rtl::isAsciiUpperCase(*p
);
615 if (nLetters
== 0 || nLetters
> 8)
622 RTL_TEXTENCODING_ASCII_US
);
623 if (bDowncaseLanguage
)
624 aLanguage
= aLanguage
.toAsciiLowerCase();
627 if (p
== pEnd
|| *p
!= '\'')
637 sal_uInt32 nChar
= INetMIME::getUTF32Character(q
, pEnd
);
638 if (rtl::isAscii(nChar
) && !isTokenChar(nChar
))
641 if (nChar
== '%' && p
+ 1 < pEnd
)
643 int nWeight1
= INetMIME::getHexWeight(p
[0]);
644 int nWeight2
= INetMIME::getHexWeight(p
[1]);
645 if (nWeight1
>= 0 && nWeight2
>= 0)
647 aSink
.append(char(nWeight1
<< 4 | nWeight2
));
652 writeUTF8(aSink
, nChar
);
654 aValue
= aSink
.makeStringAndClear();
657 while (p
!= pEnd
&& (isTokenChar(*p
) || !rtl::isAscii(*p
)))
660 else if (p
!= pEnd
&& *p
== '"')
663 OStringBuffer
aSink(256);
664 bool bInvalid
= false;
672 sal_uInt32 nChar
= INetMIME::getUTF32Character(p
, pEnd
);
675 else if (nChar
== 0x0D) // CR
677 if (pEnd
- p
< 2 || *p
++ != 0x0A // LF
678 || !isWhiteSpace(*p
))
683 nChar
= static_cast<unsigned char>(*p
++);
685 else if (nChar
== '\\')
692 nChar
= INetMIME::getUTF32Character(p
, pEnd
);
694 writeUTF8(aSink
, nChar
);
698 aValue
= aSink
.makeStringAndClear();
702 sal_Unicode
const * pStringEnd
= skipQuotedString(p
, pEnd
);
709 sal_Unicode
const * pTokenBegin
= p
;
710 while (p
!= pEnd
&& (isTokenChar(*p
) || !rtl::isAscii(*p
)))
712 if (p
== pTokenBegin
)
716 pTokenBegin
, p
- pTokenBegin
,
717 RTL_TEXTENCODING_UTF8
);
719 aList
.emplace_front(Parameter
{aAttribute
, aCharset
, aLanguage
, aValue
, nSection
, bExtended
});
722 return parseParameters(aList
, pParameters
) ? pParameterBegin
: pBegin
;
725 bool equalIgnoreCase(const char * pBegin1
,
727 const char * pString2
)
729 DBG_ASSERT(pBegin1
&& pBegin1
<= pEnd1
&& pString2
,
730 "equalIgnoreCase(): Bad sequences");
732 while (*pString2
!= 0)
734 || (rtl::toAsciiUpperCase(static_cast<unsigned char>(*pBegin1
++))
735 != rtl::toAsciiUpperCase(
736 static_cast<unsigned char>(*pString2
++))))
738 return pBegin1
== pEnd1
;
743 char const * m_aName
;
744 rtl_TextEncoding m_eEncoding
;
747 // The source for the following table is <ftp://ftp.iana.org/in-notes/iana/
748 // assignments/character-sets> as of Jan, 21 2000 12:46:00, unless otherwise
750 EncodingEntry
const aEncodingMap
[]
751 = { { "US-ASCII", RTL_TEXTENCODING_ASCII_US
},
752 { "ANSI_X3.4-1968", RTL_TEXTENCODING_ASCII_US
},
753 { "ISO-IR-6", RTL_TEXTENCODING_ASCII_US
},
754 { "ANSI_X3.4-1986", RTL_TEXTENCODING_ASCII_US
},
755 { "ISO_646.IRV:1991", RTL_TEXTENCODING_ASCII_US
},
756 { "ASCII", RTL_TEXTENCODING_ASCII_US
},
757 { "ISO646-US", RTL_TEXTENCODING_ASCII_US
},
758 { "US", RTL_TEXTENCODING_ASCII_US
},
759 { "IBM367", RTL_TEXTENCODING_ASCII_US
},
760 { "CP367", RTL_TEXTENCODING_ASCII_US
},
761 { "CSASCII", RTL_TEXTENCODING_ASCII_US
},
762 { "ISO-8859-1", RTL_TEXTENCODING_ISO_8859_1
},
763 { "ISO_8859-1:1987", RTL_TEXTENCODING_ISO_8859_1
},
764 { "ISO-IR-100", RTL_TEXTENCODING_ISO_8859_1
},
765 { "ISO_8859-1", RTL_TEXTENCODING_ISO_8859_1
},
766 { "LATIN1", RTL_TEXTENCODING_ISO_8859_1
},
767 { "L1", RTL_TEXTENCODING_ISO_8859_1
},
768 { "IBM819", RTL_TEXTENCODING_ISO_8859_1
},
769 { "CP819", RTL_TEXTENCODING_ISO_8859_1
},
770 { "CSISOLATIN1", RTL_TEXTENCODING_ISO_8859_1
},
771 { "ISO-8859-2", RTL_TEXTENCODING_ISO_8859_2
},
772 { "ISO_8859-2:1987", RTL_TEXTENCODING_ISO_8859_2
},
773 { "ISO-IR-101", RTL_TEXTENCODING_ISO_8859_2
},
774 { "ISO_8859-2", RTL_TEXTENCODING_ISO_8859_2
},
775 { "LATIN2", RTL_TEXTENCODING_ISO_8859_2
},
776 { "L2", RTL_TEXTENCODING_ISO_8859_2
},
777 { "CSISOLATIN2", RTL_TEXTENCODING_ISO_8859_2
},
778 { "ISO-8859-3", RTL_TEXTENCODING_ISO_8859_3
},
779 { "ISO_8859-3:1988", RTL_TEXTENCODING_ISO_8859_3
},
780 { "ISO-IR-109", RTL_TEXTENCODING_ISO_8859_3
},
781 { "ISO_8859-3", RTL_TEXTENCODING_ISO_8859_3
},
782 { "LATIN3", RTL_TEXTENCODING_ISO_8859_3
},
783 { "L3", RTL_TEXTENCODING_ISO_8859_3
},
784 { "CSISOLATIN3", RTL_TEXTENCODING_ISO_8859_3
},
785 { "ISO-8859-4", RTL_TEXTENCODING_ISO_8859_4
},
786 { "ISO_8859-4:1988", RTL_TEXTENCODING_ISO_8859_4
},
787 { "ISO-IR-110", RTL_TEXTENCODING_ISO_8859_4
},
788 { "ISO_8859-4", RTL_TEXTENCODING_ISO_8859_4
},
789 { "LATIN4", RTL_TEXTENCODING_ISO_8859_4
},
790 { "L4", RTL_TEXTENCODING_ISO_8859_4
},
791 { "CSISOLATIN4", RTL_TEXTENCODING_ISO_8859_4
},
792 { "ISO-8859-5", RTL_TEXTENCODING_ISO_8859_5
},
793 { "ISO_8859-5:1988", RTL_TEXTENCODING_ISO_8859_5
},
794 { "ISO-IR-144", RTL_TEXTENCODING_ISO_8859_5
},
795 { "ISO_8859-5", RTL_TEXTENCODING_ISO_8859_5
},
796 { "CYRILLIC", RTL_TEXTENCODING_ISO_8859_5
},
797 { "CSISOLATINCYRILLIC", RTL_TEXTENCODING_ISO_8859_5
},
798 { "ISO-8859-6", RTL_TEXTENCODING_ISO_8859_6
},
799 { "ISO_8859-6:1987", RTL_TEXTENCODING_ISO_8859_6
},
800 { "ISO-IR-127", RTL_TEXTENCODING_ISO_8859_6
},
801 { "ISO_8859-6", RTL_TEXTENCODING_ISO_8859_6
},
802 { "ECMA-114", RTL_TEXTENCODING_ISO_8859_6
},
803 { "ASMO-708", RTL_TEXTENCODING_ISO_8859_6
},
804 { "ARABIC", RTL_TEXTENCODING_ISO_8859_6
},
805 { "CSISOLATINARABIC", RTL_TEXTENCODING_ISO_8859_6
},
806 { "ISO-8859-7", RTL_TEXTENCODING_ISO_8859_7
},
807 { "ISO_8859-7:1987", RTL_TEXTENCODING_ISO_8859_7
},
808 { "ISO-IR-126", RTL_TEXTENCODING_ISO_8859_7
},
809 { "ISO_8859-7", RTL_TEXTENCODING_ISO_8859_7
},
810 { "ELOT_928", RTL_TEXTENCODING_ISO_8859_7
},
811 { "ECMA-118", RTL_TEXTENCODING_ISO_8859_7
},
812 { "GREEK", RTL_TEXTENCODING_ISO_8859_7
},
813 { "GREEK8", RTL_TEXTENCODING_ISO_8859_7
},
814 { "CSISOLATINGREEK", RTL_TEXTENCODING_ISO_8859_7
},
815 { "ISO-8859-8", RTL_TEXTENCODING_ISO_8859_8
},
816 { "ISO_8859-8:1988", RTL_TEXTENCODING_ISO_8859_8
},
817 { "ISO-IR-138", RTL_TEXTENCODING_ISO_8859_8
},
818 { "ISO_8859-8", RTL_TEXTENCODING_ISO_8859_8
},
819 { "HEBREW", RTL_TEXTENCODING_ISO_8859_8
},
820 { "CSISOLATINHEBREW", RTL_TEXTENCODING_ISO_8859_8
},
821 { "ISO-8859-9", RTL_TEXTENCODING_ISO_8859_9
},
822 { "ISO_8859-9:1989", RTL_TEXTENCODING_ISO_8859_9
},
823 { "ISO-IR-148", RTL_TEXTENCODING_ISO_8859_9
},
824 { "ISO_8859-9", RTL_TEXTENCODING_ISO_8859_9
},
825 { "LATIN5", RTL_TEXTENCODING_ISO_8859_9
},
826 { "L5", RTL_TEXTENCODING_ISO_8859_9
},
827 { "CSISOLATIN5", RTL_TEXTENCODING_ISO_8859_9
},
828 { "ISO-8859-14", RTL_TEXTENCODING_ISO_8859_14
}, // RFC 2047
829 { "ISO_8859-15", RTL_TEXTENCODING_ISO_8859_15
},
830 { "ISO-8859-15", RTL_TEXTENCODING_ISO_8859_15
}, // RFC 2047
831 { "MACINTOSH", RTL_TEXTENCODING_APPLE_ROMAN
},
832 { "MAC", RTL_TEXTENCODING_APPLE_ROMAN
},
833 { "CSMACINTOSH", RTL_TEXTENCODING_APPLE_ROMAN
},
834 { "IBM437", RTL_TEXTENCODING_IBM_437
},
835 { "CP437", RTL_TEXTENCODING_IBM_437
},
836 { "437", RTL_TEXTENCODING_IBM_437
},
837 { "CSPC8CODEPAGE437", RTL_TEXTENCODING_IBM_437
},
838 { "IBM850", RTL_TEXTENCODING_IBM_850
},
839 { "CP850", RTL_TEXTENCODING_IBM_850
},
840 { "850", RTL_TEXTENCODING_IBM_850
},
841 { "CSPC850MULTILINGUAL", RTL_TEXTENCODING_IBM_850
},
842 { "IBM860", RTL_TEXTENCODING_IBM_860
},
843 { "CP860", RTL_TEXTENCODING_IBM_860
},
844 { "860", RTL_TEXTENCODING_IBM_860
},
845 { "CSIBM860", RTL_TEXTENCODING_IBM_860
},
846 { "IBM861", RTL_TEXTENCODING_IBM_861
},
847 { "CP861", RTL_TEXTENCODING_IBM_861
},
848 { "861", RTL_TEXTENCODING_IBM_861
},
849 { "CP-IS", RTL_TEXTENCODING_IBM_861
},
850 { "CSIBM861", RTL_TEXTENCODING_IBM_861
},
851 { "IBM863", RTL_TEXTENCODING_IBM_863
},
852 { "CP863", RTL_TEXTENCODING_IBM_863
},
853 { "863", RTL_TEXTENCODING_IBM_863
},
854 { "CSIBM863", RTL_TEXTENCODING_IBM_863
},
855 { "IBM865", RTL_TEXTENCODING_IBM_865
},
856 { "CP865", RTL_TEXTENCODING_IBM_865
},
857 { "865", RTL_TEXTENCODING_IBM_865
},
858 { "CSIBM865", RTL_TEXTENCODING_IBM_865
},
859 { "IBM775", RTL_TEXTENCODING_IBM_775
},
860 { "CP775", RTL_TEXTENCODING_IBM_775
},
861 { "CSPC775BALTIC", RTL_TEXTENCODING_IBM_775
},
862 { "IBM852", RTL_TEXTENCODING_IBM_852
},
863 { "CP852", RTL_TEXTENCODING_IBM_852
},
864 { "852", RTL_TEXTENCODING_IBM_852
},
865 { "CSPCP852", RTL_TEXTENCODING_IBM_852
},
866 { "IBM855", RTL_TEXTENCODING_IBM_855
},
867 { "CP855", RTL_TEXTENCODING_IBM_855
},
868 { "855", RTL_TEXTENCODING_IBM_855
},
869 { "CSIBM855", RTL_TEXTENCODING_IBM_855
},
870 { "IBM857", RTL_TEXTENCODING_IBM_857
},
871 { "CP857", RTL_TEXTENCODING_IBM_857
},
872 { "857", RTL_TEXTENCODING_IBM_857
},
873 { "CSIBM857", RTL_TEXTENCODING_IBM_857
},
874 { "IBM862", RTL_TEXTENCODING_IBM_862
},
875 { "CP862", RTL_TEXTENCODING_IBM_862
},
876 { "862", RTL_TEXTENCODING_IBM_862
},
877 { "CSPC862LATINHEBREW", RTL_TEXTENCODING_IBM_862
},
878 { "IBM864", RTL_TEXTENCODING_IBM_864
},
879 { "CP864", RTL_TEXTENCODING_IBM_864
},
880 { "CSIBM864", RTL_TEXTENCODING_IBM_864
},
881 { "IBM866", RTL_TEXTENCODING_IBM_866
},
882 { "CP866", RTL_TEXTENCODING_IBM_866
},
883 { "866", RTL_TEXTENCODING_IBM_866
},
884 { "CSIBM866", RTL_TEXTENCODING_IBM_866
},
885 { "IBM869", RTL_TEXTENCODING_IBM_869
},
886 { "CP869", RTL_TEXTENCODING_IBM_869
},
887 { "869", RTL_TEXTENCODING_IBM_869
},
888 { "CP-GR", RTL_TEXTENCODING_IBM_869
},
889 { "CSIBM869", RTL_TEXTENCODING_IBM_869
},
890 { "WINDOWS-1250", RTL_TEXTENCODING_MS_1250
},
891 { "WINDOWS-1251", RTL_TEXTENCODING_MS_1251
},
892 { "WINDOWS-1253", RTL_TEXTENCODING_MS_1253
},
893 { "WINDOWS-1254", RTL_TEXTENCODING_MS_1254
},
894 { "WINDOWS-1255", RTL_TEXTENCODING_MS_1255
},
895 { "WINDOWS-1256", RTL_TEXTENCODING_MS_1256
},
896 { "WINDOWS-1257", RTL_TEXTENCODING_MS_1257
},
897 { "WINDOWS-1258", RTL_TEXTENCODING_MS_1258
},
898 { "SHIFT_JIS", RTL_TEXTENCODING_SHIFT_JIS
},
899 { "MS_KANJI", RTL_TEXTENCODING_SHIFT_JIS
},
900 { "CSSHIFTJIS", RTL_TEXTENCODING_SHIFT_JIS
},
901 { "GB2312", RTL_TEXTENCODING_GB_2312
},
902 { "CSGB2312", RTL_TEXTENCODING_GB_2312
},
903 { "BIG5", RTL_TEXTENCODING_BIG5
},
904 { "CSBIG5", RTL_TEXTENCODING_BIG5
},
905 { "EUC-JP", RTL_TEXTENCODING_EUC_JP
},
906 { "EXTENDED_UNIX_CODE_PACKED_FORMAT_FOR_JAPANESE",
907 RTL_TEXTENCODING_EUC_JP
},
908 { "CSEUCPKDFMTJAPANESE", RTL_TEXTENCODING_EUC_JP
},
909 { "ISO-2022-JP", RTL_TEXTENCODING_ISO_2022_JP
},
910 { "CSISO2022JP", RTL_TEXTENCODING_ISO_2022_JP
},
911 { "ISO-2022-CN", RTL_TEXTENCODING_ISO_2022_CN
},
912 { "KOI8-R", RTL_TEXTENCODING_KOI8_R
},
913 { "CSKOI8R", RTL_TEXTENCODING_KOI8_R
},
914 { "UTF-7", RTL_TEXTENCODING_UTF7
},
915 { "UTF-8", RTL_TEXTENCODING_UTF8
},
916 { "ISO-8859-10", RTL_TEXTENCODING_ISO_8859_10
}, // RFC 2047
917 { "ISO-8859-13", RTL_TEXTENCODING_ISO_8859_13
}, // RFC 2047
918 { "EUC-KR", RTL_TEXTENCODING_EUC_KR
},
919 { "CSEUCKR", RTL_TEXTENCODING_EUC_KR
},
920 { "ISO-2022-KR", RTL_TEXTENCODING_ISO_2022_KR
},
921 { "CSISO2022KR", RTL_TEXTENCODING_ISO_2022_KR
},
922 { "ISO-10646-UCS-4", RTL_TEXTENCODING_UCS4
},
923 { "CSUCS4", RTL_TEXTENCODING_UCS4
},
924 { "ISO-10646-UCS-2", RTL_TEXTENCODING_UCS2
},
925 { "CSUNICODE", RTL_TEXTENCODING_UCS2
} };
927 rtl_TextEncoding
getCharsetEncoding(char const * pBegin
,
930 for (const EncodingEntry
& i
: aEncodingMap
)
931 if (equalIgnoreCase(pBegin
, pEnd
, i
.m_aName
))
932 return i
.m_eEncoding
;
933 return RTL_TEXTENCODING_DONTKNOW
;
941 bool INetMIME::isAtomChar(sal_uInt32 nChar
)
943 static const bool aMap
[128]
944 = { false, false, false, false, false, false, false, false,
945 false, false, false, false, false, false, false, false,
946 false, false, false, false, false, false, false, false,
947 false, false, false, false, false, false, false, false,
948 false, true, false, true, true, true, true, true, // !"#$%&'
949 false, false, true, true, false, true, false, true, //()*+,-./
950 true, true, true, true, true, true, true, true, //01234567
951 true, true, false, false, false, true, false, true, //89:;<=>?
952 false, true, true, true, true, true, true, true, //@ABCDEFG
953 true, true, true, true, true, true, true, true, //HIJKLMNO
954 true, true, true, true, true, true, true, true, //PQRSTUVW
955 true, true, true, false, false, false, true, true, //XYZ[\]^_
956 true, true, true, true, true, true, true, true, //`abcdefg
957 true, true, true, true, true, true, true, true, //hijklmno
958 true, true, true, true, true, true, true, true, //pqrstuvw
959 true, true, true, true, true, true, true, false //xyz{|}~
961 return rtl::isAscii(nChar
) && aMap
[nChar
];
965 bool INetMIME::isIMAPAtomChar(sal_uInt32 nChar
)
967 static const bool aMap
[128]
968 = { false, false, false, false, false, false, false, false,
969 false, false, false, false, false, false, false, false,
970 false, false, false, false, false, false, false, false,
971 false, false, false, false, false, false, false, false,
972 false, true, false, true, true, false, true, true, // !"#$%&'
973 false, false, false, true, true, true, true, true, //()*+,-./
974 true, true, true, true, true, true, true, true, //01234567
975 true, true, true, true, true, true, true, true, //89:;<=>?
976 true, true, true, true, true, true, true, true, //@ABCDEFG
977 true, true, true, true, true, true, true, true, //HIJKLMNO
978 true, true, true, true, true, true, true, true, //PQRSTUVW
979 true, true, true, true, false, true, true, true, //XYZ[\]^_
980 true, true, true, true, true, true, true, true, //`abcdefg
981 true, true, true, true, true, true, true, true, //hijklmno
982 true, true, true, true, true, true, true, true, //pqrstuvw
983 true, true, true, false, true, true, true, false //xyz{|}~
985 return rtl::isAscii(nChar
) && aMap
[nChar
];
989 bool INetMIME::equalIgnoreCase(const sal_Unicode
* pBegin1
,
990 const sal_Unicode
* pEnd1
,
991 const char * pString2
)
993 DBG_ASSERT(pBegin1
&& pBegin1
<= pEnd1
&& pString2
,
994 "INetMIME::equalIgnoreCase(): Bad sequences");
996 while (*pString2
!= 0)
998 || (rtl::toAsciiUpperCase(*pBegin1
++)
999 != rtl::toAsciiUpperCase(
1000 static_cast<unsigned char>(*pString2
++))))
1002 return pBegin1
== pEnd1
;
1006 bool INetMIME::scanUnsigned(const sal_Unicode
*& rBegin
,
1007 const sal_Unicode
* pEnd
, bool bLeadingZeroes
,
1008 sal_uInt32
& rValue
)
1010 sal_uInt64 nTheValue
= 0;
1011 const sal_Unicode
* p
= rBegin
;
1012 for ( ; p
!= pEnd
; ++p
)
1014 int nWeight
= getWeight(*p
);
1017 nTheValue
= 10 * nTheValue
+ nWeight
;
1018 if (nTheValue
> std::numeric_limits
< sal_uInt32
>::max())
1021 if (nTheValue
== 0 && (p
== rBegin
|| (!bLeadingZeroes
&& p
- rBegin
!= 1)))
1024 rValue
= sal_uInt32(nTheValue
);
1029 sal_Unicode
const * INetMIME::scanContentType(
1030 OUString
const & rStr
, OUString
* pType
,
1031 OUString
* pSubType
, INetContentTypeParameterList
* pParameters
)
1033 sal_Unicode
const * pBegin
= rStr
.getStr();
1034 sal_Unicode
const * pEnd
= pBegin
+ rStr
.getLength();
1035 sal_Unicode
const * p
= skipLinearWhiteSpaceComment(pBegin
, pEnd
);
1036 sal_Unicode
const * pTypeBegin
= p
;
1037 while (p
!= pEnd
&& isTokenChar(*p
))
1041 if (p
== pTypeBegin
)
1043 sal_Unicode
const * pTypeEnd
= p
;
1045 p
= skipLinearWhiteSpaceComment(p
, pEnd
);
1046 if (p
== pEnd
|| *p
++ != '/')
1049 p
= skipLinearWhiteSpaceComment(p
, pEnd
);
1050 sal_Unicode
const * pSubTypeBegin
= p
;
1051 while (p
!= pEnd
&& isTokenChar(*p
))
1055 if (p
== pSubTypeBegin
)
1057 sal_Unicode
const * pSubTypeEnd
= p
;
1059 if (pType
!= nullptr)
1061 *pType
= OUString(pTypeBegin
, pTypeEnd
- pTypeBegin
).toAsciiLowerCase();
1063 if (pSubType
!= nullptr)
1065 *pSubType
= OUString(pSubTypeBegin
, pSubTypeEnd
- pSubTypeBegin
)
1066 .toAsciiLowerCase();
1069 return scanParameters(p
, pEnd
, pParameters
);
1073 OUString
INetMIME::decodeHeaderFieldBody(const OString
& rBody
)
1075 // Due to a bug in INetCoreRFC822MessageStream::ConvertTo7Bit(), old
1076 // versions of StarOffice send mails with header fields where encoded
1077 // words can be preceded by '=', ',', '.', '"', or '(', and followed by
1078 // '=', ',', '.', '"', ')', without any required white space in between.
1079 // And there appear to exist some broken mailers that only encode single
1080 // letters within words, like "Appel
1081 // =?iso-8859-1?Q?=E0?=t=?iso-8859-1?Q?=E9?=moin", so it seems best to
1082 // detect encoded words even when not properly surrounded by white space.
1084 // Non US-ASCII characters in rBody are treated as ISO-8859-1.
1086 // encoded-word = "=?"
1087 // 1*(%x21 / %x23-27 / %x2A-2B / %x2D / %30-39 / %x41-5A / %x5E-7E)
1088 // ["*" 1*8ALPHA *("-" 1*8ALPHA)] "?"
1089 // ("B?" *(4base64) (4base64 / 3base64 "=" / 2base64 "==")
1090 // / "Q?" 1*(%x21-3C / %x3E / %x40-7E / "=" 2HEXDIG))
1093 // base64 = ALPHA / DIGIT / "+" / "/"
1095 const char * pBegin
= rBody
.getStr();
1096 const char * pEnd
= pBegin
+ rBody
.getLength();
1098 OUStringBuffer sDecoded
;
1099 const char * pCopyBegin
= pBegin
;
1101 /* bool bStartEncodedWord = true; */
1102 const char * pWSPBegin
= pBegin
;
1104 for (const char * p
= pBegin
; p
!= pEnd
;)
1106 if (*p
== '=' /* && bStartEncodedWord */)
1108 const char * q
= p
+ 1;
1109 bool bEncodedWord
= q
!= pEnd
&& *q
++ == '?';
1111 rtl_TextEncoding eCharsetEncoding
= RTL_TEXTENCODING_DONTKNOW
;
1114 const char * pCharsetBegin
= q
;
1115 const char * pLanguageBegin
= nullptr;
1116 int nAlphaCount
= 0;
1117 for (bool bDone
= false; !bDone
;)
1120 bEncodedWord
= false;
1129 pLanguageBegin
= q
- 1;
1134 if (pLanguageBegin
!= nullptr)
1136 if (nAlphaCount
== 0)
1137 pLanguageBegin
= nullptr;
1144 if (pCharsetBegin
== q
- 1)
1145 bEncodedWord
= false;
1149 = getCharsetEncoding(
1151 pLanguageBegin
== nullptr
1152 || nAlphaCount
== 0 ?
1153 q
- 1 : pLanguageBegin
);
1154 bEncodedWord
= isMIMECharsetEncoding(
1157 = translateFromMIME(eCharsetEncoding
);
1163 if (pLanguageBegin
!= nullptr
1164 && (!rtl::isAsciiAlpha(
1165 static_cast<unsigned char>(cChar
))
1166 || ++nAlphaCount
> 8))
1167 pLanguageBegin
= nullptr;
1173 bool bEncodingB
= false;
1177 bEncodedWord
= false;
1193 bEncodedWord
= false;
1199 bEncodedWord
= bEncodedWord
&& q
!= pEnd
&& *q
++ == '?';
1201 OStringBuffer sText
;
1206 for (bool bDone
= false; !bDone
;)
1210 bEncodedWord
= false;
1215 bool bFinal
= false;
1217 sal_uInt32 nValue
= 0;
1218 for (int nShift
= 18; nShift
>= 0; nShift
-= 6)
1220 int nWeight
= getBase64Weight(*q
++);
1223 bEncodedWord
= false;
1233 bEncodedWord
= false;
1238 nCount
= nShift
== 6 ? 1 : 2;
1242 nValue
|= nWeight
<< nShift
;
1246 for (int nShift
= 16; nCount
-- > 0; nShift
-= 8)
1247 sText
.append(char(nValue
>> nShift
& 0xFF));
1253 if (bFinal
&& !bDone
)
1255 bEncodedWord
= false;
1264 const char * pEncodedTextBegin
= q
;
1265 const char * pEncodedTextCopyBegin
= q
;
1266 for (bool bDone
= false; !bDone
;)
1269 bEncodedWord
= false;
1274 sal_uInt32 nChar
= static_cast<unsigned char>(*q
++);
1281 bEncodedWord
= false;
1285 int nDigit1
= getHexWeight(q
[0]);
1286 int nDigit2
= getHexWeight(q
[1]);
1287 if (nDigit1
< 0 || nDigit2
< 0)
1289 bEncodedWord
= false;
1293 sText
.append(rBody
.copy(
1294 (pEncodedTextCopyBegin
- pBegin
),
1295 (q
- 1 - pEncodedTextCopyBegin
)));
1296 sText
.append(char(nDigit1
<< 4 | nDigit2
));
1298 pEncodedTextCopyBegin
= q
;
1303 if (q
- pEncodedTextBegin
> 1)
1304 sText
.append(rBody
.copy(
1305 (pEncodedTextCopyBegin
- pBegin
),
1306 (q
- 1 - pEncodedTextCopyBegin
)));
1308 bEncodedWord
= false;
1313 sText
.append(rBody
.copy(
1314 (pEncodedTextCopyBegin
- pBegin
),
1315 (q
- 1 - pEncodedTextCopyBegin
)));
1317 pEncodedTextCopyBegin
= q
;
1321 if (!isVisible(nChar
))
1323 bEncodedWord
= false;
1332 bEncodedWord
= bEncodedWord
&& q
!= pEnd
&& *q
++ == '=';
1334 std::unique_ptr
<sal_Unicode
[]> pUnicodeBuffer
;
1335 sal_Size nUnicodeSize
= 0;
1339 = convertToUnicode(sText
.getStr(),
1340 sText
.getStr() + sText
.getLength(),
1341 eCharsetEncoding
, nUnicodeSize
);
1342 if (!pUnicodeBuffer
)
1343 bEncodedWord
= false;
1348 appendISO88591(sDecoded
, pCopyBegin
, pWSPBegin
);
1350 pUnicodeBuffer
.get(),
1351 static_cast< sal_Int32
>(nUnicodeSize
));
1352 pUnicodeBuffer
.reset();
1357 while (p
!= pEnd
&& isWhiteSpace(*p
))
1359 /* bStartEncodedWord = p != pWSPBegin; */
1370 /* bStartEncodedWord = true; */
1374 /* bStartEncodedWord = true; */
1378 /* bStartEncodedWord = false; */
1383 const char * pUTF8Begin
= p
- 1;
1384 const char * pUTF8End
= pUTF8Begin
;
1385 sal_uInt32 nCharacter
= 0;
1386 if (translateUTF8Char(pUTF8End
, pEnd
, nCharacter
))
1388 appendISO88591(sDecoded
, pCopyBegin
, p
- 1);
1389 sal_Unicode aUTF16Buf
[2];
1390 sal_Int32 nUTF16Len
= putUTF32Character(aUTF16Buf
, nCharacter
) - aUTF16Buf
;
1391 sDecoded
.append(aUTF16Buf
, nUTF16Len
);
1395 /* bStartEncodedWord = false; */
1402 appendISO88591(sDecoded
, pCopyBegin
, pEnd
);
1403 return sDecoded
.makeStringAndClear();
1406 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */