1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
3 * This file is part of the LibreOffice project.
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 * This file incorporates work covered by the following license notice:
11 * Licensed to the Apache Software Foundation (ASF) under one or more
12 * contributor license agreements. See the NOTICE file distributed
13 * with this work for additional information regarding copyright
14 * ownership. The ASF licenses this file to you under the Apache
15 * License, Version 2.0 (the "License"); you may not use this file
16 * except in compliance with the License. You may obtain a copy of
17 * the License at http://www.apache.org/licenses/LICENSE-2.0 .
22 #include <forward_list>
25 #include <sal/log.hxx>
26 #include <rtl/ustring.hxx>
27 #include <rtl/strbuf.hxx>
28 #include <rtl/ustrbuf.hxx>
29 #include <rtl/tencinfo.h>
30 #include <tools/debug.hxx>
31 #include <tools/inetmime.hxx>
32 #include <rtl/character.hxx>
36 rtl_TextEncoding
getCharsetEncoding(const char * pBegin
,
39 /** Check for US-ASCII white space character.
41 @param nChar Some UCS-4 character.
43 @return True if nChar is a US-ASCII white space character (US-ASCII
46 bool isWhiteSpace(sal_uInt32 nChar
)
48 return nChar
== '\t' || nChar
== ' ';
51 /** Get the Base 64 digit weight of a US-ASCII character.
53 @param nChar Some UCS-4 character.
55 @return If nChar is a US-ASCII Base 64 digit character (US-ASCII
56 'A'--'F', or 'a'--'f', '0'--'9', '+', or '/'), return the
57 corresponding weight (0--63); if nChar is the US-ASCII Base 64 padding
58 character (US-ASCII '='), return -1; otherwise, return -2.
60 int getBase64Weight(sal_uInt32 nChar
)
62 return rtl::isAsciiUpperCase(nChar
) ? int(nChar
- 'A') :
63 rtl::isAsciiLowerCase(nChar
) ? int(nChar
- 'a' + 26) :
64 rtl::isAsciiDigit(nChar
) ? int(nChar
- '0' + 52) :
67 nChar
== '=' ? -1 : -2;
70 bool startsWithLineFolding(const sal_Unicode
* pBegin
,
71 const sal_Unicode
* pEnd
)
73 assert(pBegin
&& pBegin
<= pEnd
&& "startsWithLineFolding(): Bad sequence");
75 return pEnd
- pBegin
>= 3 && pBegin
[0] == 0x0D && pBegin
[1] == 0x0A
76 && isWhiteSpace(pBegin
[2]); // CR, LF
79 rtl_TextEncoding
translateFromMIME(rtl_TextEncoding
83 return eEncoding
== RTL_TEXTENCODING_ISO_8859_1
?
84 RTL_TEXTENCODING_MS_1252
: eEncoding
;
90 bool isMIMECharsetEncoding(rtl_TextEncoding eEncoding
)
92 return rtl_isOctetTextEncoding(eEncoding
);
95 std::unique_ptr
<sal_Unicode
[]> convertToUnicode(const char * pBegin
,
97 rtl_TextEncoding eEncoding
,
100 if (eEncoding
== RTL_TEXTENCODING_DONTKNOW
)
102 rtl_TextToUnicodeConverter hConverter
103 = rtl_createTextToUnicodeConverter(eEncoding
);
104 rtl_TextToUnicodeContext hContext
105 = rtl_createTextToUnicodeContext(hConverter
);
106 std::unique_ptr
<sal_Unicode
[]> pBuffer
;
108 for (sal_Size nBufferSize
= pEnd
- pBegin
;;
109 nBufferSize
+= nBufferSize
/ 3 + 1)
111 pBuffer
.reset(new sal_Unicode
[nBufferSize
]);
112 sal_Size nSrcCvtBytes
;
113 rSize
= rtl_convertTextToUnicode(
114 hConverter
, hContext
, pBegin
, pEnd
- pBegin
, pBuffer
.get(),
116 RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_ERROR
117 | RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_ERROR
118 | RTL_TEXTTOUNICODE_FLAGS_INVALID_ERROR
,
119 &nInfo
, &nSrcCvtBytes
);
120 if (nInfo
!= RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOOSMALL
)
123 rtl_resetTextToUnicodeContext(hConverter
, hContext
);
125 rtl_destroyTextToUnicodeContext(hConverter
, hContext
);
126 rtl_destroyTextToUnicodeConverter(hConverter
);
134 void writeUTF8(OStringBuffer
& rSink
, sal_uInt32 nChar
)
136 // See RFC 2279 for a discussion of UTF-8.
137 DBG_ASSERT(nChar
< 0x80000000, "writeUTF8(): Bad char");
140 rSink
.append(char(nChar
));
141 else if (nChar
< 0x800)
142 rSink
.append(OStringChar(char(nChar
>> 6 | 0xC0))
143 + OStringChar(char((nChar
& 0x3F) | 0x80)));
144 else if (nChar
< 0x10000)
146 OStringChar(char(nChar
>> 12 | 0xE0))
147 + OStringChar(char((nChar
>> 6 & 0x3F) | 0x80))
148 + OStringChar(char((nChar
& 0x3F) | 0x80)));
149 else if (nChar
< 0x200000)
151 OStringChar(char(nChar
>> 18 | 0xF0))
152 + OStringChar(char((nChar
>> 12 & 0x3F) | 0x80))
153 + OStringChar(char((nChar
>> 6 & 0x3F) | 0x80))
154 + OStringChar(char((nChar
& 0x3F) | 0x80)));
155 else if (nChar
< 0x4000000)
157 OStringChar(char(nChar
>> 24 | 0xF8))
158 + OStringChar(char((nChar
>> 18 & 0x3F) | 0x80))
159 + OStringChar(char((nChar
>> 12 & 0x3F) | 0x80))
160 + OStringChar(char((nChar
>> 6 & 0x3F) | 0x80))
161 + OStringChar(char((nChar
& 0x3F) | 0x80)));
164 OStringChar(char(nChar
>> 30 | 0xFC))
165 + OStringChar(char((nChar
>> 24 & 0x3F) | 0x80))
166 + OStringChar(char((nChar
>> 18 & 0x3F) | 0x80))
167 + OStringChar(char((nChar
>> 12 & 0x3F) | 0x80))
168 + OStringChar(char((nChar
>> 6 & 0x3F) | 0x80))
169 + OStringChar(char((nChar
& 0x3F) | 0x80)));
172 bool translateUTF8Char(const char *& rBegin
,
174 sal_uInt32
& rCharacter
)
176 if (rBegin
== pEnd
|| static_cast< unsigned char >(*rBegin
) < 0x80
177 || static_cast< unsigned char >(*rBegin
) >= 0xFE)
183 const char * p
= rBegin
;
184 if (static_cast< unsigned char >(*p
) < 0xE0)
188 nUCS4
= static_cast< unsigned char >(*p
) & 0x1F;
190 else if (static_cast< unsigned char >(*p
) < 0xF0)
194 nUCS4
= static_cast< unsigned char >(*p
) & 0xF;
196 else if (static_cast< unsigned char >(*p
) < 0xF8)
200 nUCS4
= static_cast< unsigned char >(*p
) & 7;
202 else if (static_cast< unsigned char >(*p
) < 0xFC)
206 nUCS4
= static_cast< unsigned char >(*p
) & 3;
212 nUCS4
= static_cast< unsigned char >(*p
) & 1;
216 for (; nCount
-- > 0; ++p
)
217 if ((static_cast< unsigned char >(*p
) & 0xC0) == 0x80)
218 nUCS4
= (nUCS4
<< 6) | (static_cast< unsigned char >(*p
) & 0x3F);
222 if (!rtl::isUnicodeCodePoint(nUCS4
) || nUCS4
< nMin
)
230 void appendISO88591(OUStringBuffer
& rText
, char const * pBegin
,
235 OString m_aAttribute
;
239 sal_uInt32 m_nSection
;
// Ordering of parameters: the attribute names are compared first (via
// compareTo); the section numbers are only compared when the names are equal.
242 bool operator<(const Parameter
& rhs
) const // is used when sorting a ParameterList (a std::forward_list<Parameter>)
244 int nComp
= m_aAttribute
.compareTo(rhs
.m_aAttribute
);
246 (nComp
== 0 && m_nSection
< rhs
.m_nSection
);
248 struct IsSameSection
// is used to check container for duplicates with std::any_of
250 const OString
& rAttribute
;
251 const sal_uInt32 nSection
;
252 bool operator()(const Parameter
& r
) const
253 { return r
.m_aAttribute
== rAttribute
&& r
.m_nSection
== nSection
; }
257 typedef std::forward_list
<Parameter
> ParameterList
;
259 bool parseParameters(ParameterList
const & rInput
,
260 INetContentTypeParameterList
* pOutput
);
264 void appendISO88591(OUStringBuffer
& rText
, char const * pBegin
,
267 sal_Int32 nLength
= pEnd
- pBegin
;
268 std::unique_ptr
<sal_Unicode
[]> pBuffer(new sal_Unicode
[nLength
]);
269 for (sal_Unicode
* p
= pBuffer
.get(); pBegin
!= pEnd
;)
270 *p
++ = static_cast<unsigned char>(*pBegin
++);
271 rText
.append(pBuffer
.get(), nLength
);
276 bool parseParameters(ParameterList
const & rInput
,
277 INetContentTypeParameterList
* pOutput
)
282 for (auto it
= rInput
.begin(), itPrev
= rInput
.end(); it
!= rInput
.end() ; itPrev
= it
++)
284 if (it
->m_nSection
> 0
285 && (itPrev
== rInput
.end()
286 || itPrev
->m_nSection
!= it
->m_nSection
- 1
287 || itPrev
->m_aAttribute
!= it
->m_aAttribute
))
292 for (auto it
= rInput
.begin(), itNext
= rInput
.begin(); it
!= rInput
.end(); it
= itNext
)
294 bool bCharset
= !it
->m_aCharset
.isEmpty();
295 rtl_TextEncoding eEncoding
= RTL_TEXTENCODING_DONTKNOW
;
298 = getCharsetEncoding(it
->m_aCharset
.getStr(),
299 it
->m_aCharset
.getStr()
300 + it
->m_aCharset
.getLength());
301 OUStringBuffer
aValue(64);
302 bool bBadEncoding
= false;
307 std::unique_ptr
<sal_Unicode
[]> pUnicode
308 = convertToUnicode(itNext
->m_aValue
.getStr(),
309 itNext
->m_aValue
.getStr()
310 + itNext
->m_aValue
.getLength(),
311 bCharset
&& it
->m_bExtended
?
313 RTL_TEXTENCODING_UTF8
,
315 if (!pUnicode
&& !(bCharset
&& it
->m_bExtended
))
316 pUnicode
= convertToUnicode(
317 itNext
->m_aValue
.getStr(),
318 itNext
->m_aValue
.getStr()
319 + itNext
->m_aValue
.getLength(),
320 RTL_TEXTENCODING_ISO_8859_1
, nSize
);
326 aValue
.append(pUnicode
.get(), static_cast<sal_Int32
>(nSize
));
329 while (itNext
!= rInput
.end() && itNext
->m_nSection
!= 0);
337 if (itNext
->m_bExtended
)
339 for (sal_Int32 i
= 0; i
< itNext
->m_aValue
.getLength(); ++i
)
341 static_cast<sal_Unicode
>(
342 static_cast<unsigned char>(itNext
->m_aValue
[i
])
343 | 0xF800)); // map to unicode corporate use sub area
347 for (sal_Int32 i
= 0; i
< itNext
->m_aValue
.getLength(); ++i
)
348 aValue
.append( itNext
->m_aValue
[i
] );
352 while (itNext
!= rInput
.end() && itNext
->m_nSection
!= 0);
354 auto const ret
= pOutput
->insert(
356 {it
->m_aCharset
, it
->m_aLanguage
, aValue
.makeStringAndClear(), !bBadEncoding
}});
357 SAL_INFO_IF(!ret
.second
, "tools",
358 "INetMIME: dropping duplicate parameter: " << it
->m_aAttribute
);
363 /** Check whether some character is valid within an RFC 2045 <token>.
365 @param nChar Some UCS-4 character.
367 @return True if nChar is valid within an RFC 2047 <token> (US-ASCII
368 'A'--'Z', 'a'--'z', '0'--'9', '!', '#', '$', '%', '&', ''', '*', '+',
369 '-', '.', '^', '_', '`', '{', '|', '}', or '~').
371 bool isTokenChar(sal_uInt32 nChar
)
373 static const bool aMap
[128]
374 = { false, false, false, false, false, false, false, false,
375 false, false, false, false, false, false, false, false,
376 false, false, false, false, false, false, false, false,
377 false, false, false, false, false, false, false, false,
378 false, true, false, true, true, true, true, true, // !"#$%&'
379 false, false, true, true, false, true, true, false, //()*+,-./
380 true, true, true, true, true, true, true, true, //01234567
381 true, true, false, false, false, false, false, false, //89:;<=>?
382 false, true, true, true, true, true, true, true, //@ABCDEFG
383 true, true, true, true, true, true, true, true, //HIJKLMNO
384 true, true, true, true, true, true, true, true, //PQRSTUVW
385 true, true, true, false, false, false, true, true, //XYZ[\]^_
386 true, true, true, true, true, true, true, true, //`abcdefg
387 true, true, true, true, true, true, true, true, //hijklmno
388 true, true, true, true, true, true, true, true, //pqrstuvw
389 true, true, true, true, true, true, true, false //xyz{|}~
391 return rtl::isAscii(nChar
) && aMap
[nChar
];
394 const sal_Unicode
* skipComment(const sal_Unicode
* pBegin
,
395 const sal_Unicode
* pEnd
)
397 assert(pBegin
&& pBegin
<= pEnd
&& "skipComment(): Bad sequence");
399 if (pBegin
!= pEnd
&& *pBegin
== '(')
401 sal_uInt32 nLevel
= 0;
402 for (const sal_Unicode
* p
= pBegin
; p
!= pEnd
;)
423 const sal_Unicode
* skipLinearWhiteSpaceComment(const sal_Unicode
*
428 assert(pBegin
&& pBegin
<= pEnd
&& "skipLinearWhiteSpaceComment(): Bad sequence");
430 while (pBegin
!= pEnd
)
439 if (startsWithLineFolding(pBegin
, pEnd
))
447 const sal_Unicode
* p
= skipComment(pBegin
, pEnd
);
460 const sal_Unicode
* skipQuotedString(const sal_Unicode
* pBegin
,
461 const sal_Unicode
* pEnd
)
463 assert(pBegin
&& pBegin
<= pEnd
&& "skipQuotedString(): Bad sequence");
465 if (pBegin
!= pEnd
&& *pBegin
== '"')
466 for (const sal_Unicode
* p
= pBegin
+ 1; p
!= pEnd
;)
470 if (pEnd
- p
< 2 || *p
++ != 0x0A // LF
471 || !isWhiteSpace(*p
++))
486 sal_Unicode
const * scanParameters(sal_Unicode
const * pBegin
,
487 sal_Unicode
const * pEnd
,
488 INetContentTypeParameterList
*
492 sal_Unicode
const * pParameterBegin
= pBegin
;
493 for (sal_Unicode
const * p
= pParameterBegin
;;)
495 pParameterBegin
= skipLinearWhiteSpaceComment(p
, pEnd
);
496 if (pParameterBegin
== pEnd
|| *pParameterBegin
!= ';')
498 p
= pParameterBegin
+ 1;
500 sal_Unicode
const * pAttributeBegin
501 = skipLinearWhiteSpaceComment(p
, pEnd
);
503 bool bDowncaseAttribute
= false;
504 while (p
!= pEnd
&& isTokenChar(*p
) && *p
!= '*')
506 bDowncaseAttribute
= bDowncaseAttribute
|| rtl::isAsciiUpperCase(*p
);
509 if (p
== pAttributeBegin
)
511 OString
aAttribute(pAttributeBegin
, p
- pAttributeBegin
, RTL_TEXTENCODING_ASCII_US
);
512 if (bDowncaseAttribute
)
513 aAttribute
= aAttribute
.toAsciiLowerCase();
515 sal_uInt32 nSection
= 0;
516 if (p
!= pEnd
&& *p
== '*')
519 if (p
!= pEnd
&& rtl::isAsciiDigit(*p
)
520 && !INetMIME::scanUnsigned(p
, pEnd
, false, nSection
))
524 bool bPresent
= std::any_of(aList
.begin(), aList
.end(),
525 Parameter::IsSameSection
{aAttribute
, nSection
});
529 bool bExtended
= false;
530 if (p
!= pEnd
&& *p
== '*')
536 p
= skipLinearWhiteSpaceComment(p
, pEnd
);
538 if (p
== pEnd
|| *p
!= '=')
541 p
= skipLinearWhiteSpaceComment(p
+ 1, pEnd
);
550 sal_Unicode
const * pCharsetBegin
= p
;
551 bool bDowncaseCharset
= false;
552 while (p
!= pEnd
&& isTokenChar(*p
) && *p
!= '\'')
554 bDowncaseCharset
= bDowncaseCharset
|| rtl::isAsciiUpperCase(*p
);
557 if (p
== pCharsetBegin
)
564 RTL_TEXTENCODING_ASCII_US
);
565 if (bDowncaseCharset
)
566 aCharset
= aCharset
.toAsciiLowerCase();
569 if (p
== pEnd
|| *p
!= '\'')
573 sal_Unicode
const * pLanguageBegin
= p
;
574 bool bDowncaseLanguage
= false;
576 for (; p
!= pEnd
; ++p
)
577 if (rtl::isAsciiAlpha(*p
))
581 bDowncaseLanguage
= bDowncaseLanguage
582 || rtl::isAsciiUpperCase(*p
);
592 if (nLetters
== 0 || nLetters
> 8)
599 RTL_TEXTENCODING_ASCII_US
);
600 if (bDowncaseLanguage
)
601 aLanguage
= aLanguage
.toAsciiLowerCase();
604 if (p
== pEnd
|| *p
!= '\'')
614 sal_uInt32 nChar
= INetMIME::getUTF32Character(q
, pEnd
);
615 if (rtl::isAscii(nChar
) && !isTokenChar(nChar
))
618 if (nChar
== '%' && p
+ 1 < pEnd
)
620 int nWeight1
= INetMIME::getHexWeight(p
[0]);
621 int nWeight2
= INetMIME::getHexWeight(p
[1]);
622 if (nWeight1
>= 0 && nWeight2
>= 0)
624 aSink
.append(char(nWeight1
<< 4 | nWeight2
));
629 writeUTF8(aSink
, nChar
);
631 aValue
= aSink
.makeStringAndClear();
634 while (p
!= pEnd
&& (isTokenChar(*p
) || !rtl::isAscii(*p
)))
637 else if (p
!= pEnd
&& *p
== '"')
640 OStringBuffer
aSink(256);
641 bool bInvalid
= false;
649 sal_uInt32 nChar
= INetMIME::getUTF32Character(p
, pEnd
);
652 else if (nChar
== 0x0D) // CR
654 if (pEnd
- p
< 2 || *p
++ != 0x0A // LF
655 || !isWhiteSpace(*p
))
660 nChar
= static_cast<unsigned char>(*p
++);
662 else if (nChar
== '\\')
669 nChar
= INetMIME::getUTF32Character(p
, pEnd
);
671 writeUTF8(aSink
, nChar
);
675 aValue
= aSink
.makeStringAndClear();
679 sal_Unicode
const * pStringEnd
= skipQuotedString(p
, pEnd
);
686 sal_Unicode
const * pTokenBegin
= p
;
687 while (p
!= pEnd
&& (isTokenChar(*p
) || !rtl::isAscii(*p
)))
689 if (p
== pTokenBegin
)
693 pTokenBegin
, p
- pTokenBegin
,
694 RTL_TEXTENCODING_UTF8
);
696 aList
.emplace_front(Parameter
{aAttribute
, aCharset
, aLanguage
, aValue
, nSection
, bExtended
});
699 return parseParameters(aList
, pParameters
) ? pParameterBegin
: pBegin
;
702 bool equalIgnoreCase(const char * pBegin1
,
704 const char * pString2
)
706 assert(pBegin1
&& pBegin1
<= pEnd1
&& pString2
&&
707 "equalIgnoreCase(): Bad sequences");
709 while (*pString2
!= 0)
711 || (rtl::toAsciiUpperCase(static_cast<unsigned char>(*pBegin1
++))
712 != rtl::toAsciiUpperCase(
713 static_cast<unsigned char>(*pString2
++))))
715 return pBegin1
== pEnd1
;
720 char const * m_aName
;
721 rtl_TextEncoding m_eEncoding
;
724 // The source for the following table is <ftp://ftp.iana.org/in-notes/iana/
725 // assignments/character-sets> as of Jan, 21 2000 12:46:00, unless otherwise
727 EncodingEntry
const aEncodingMap
[]
728 = { { "US-ASCII", RTL_TEXTENCODING_ASCII_US
},
729 { "ANSI_X3.4-1968", RTL_TEXTENCODING_ASCII_US
},
730 { "ISO-IR-6", RTL_TEXTENCODING_ASCII_US
},
731 { "ANSI_X3.4-1986", RTL_TEXTENCODING_ASCII_US
},
732 { "ISO_646.IRV:1991", RTL_TEXTENCODING_ASCII_US
},
733 { "ASCII", RTL_TEXTENCODING_ASCII_US
},
734 { "ISO646-US", RTL_TEXTENCODING_ASCII_US
},
735 { "US", RTL_TEXTENCODING_ASCII_US
},
736 { "IBM367", RTL_TEXTENCODING_ASCII_US
},
737 { "CP367", RTL_TEXTENCODING_ASCII_US
},
738 { "CSASCII", RTL_TEXTENCODING_ASCII_US
},
739 { "ISO-8859-1", RTL_TEXTENCODING_ISO_8859_1
},
740 { "ISO_8859-1:1987", RTL_TEXTENCODING_ISO_8859_1
},
741 { "ISO-IR-100", RTL_TEXTENCODING_ISO_8859_1
},
742 { "ISO_8859-1", RTL_TEXTENCODING_ISO_8859_1
},
743 { "LATIN1", RTL_TEXTENCODING_ISO_8859_1
},
744 { "L1", RTL_TEXTENCODING_ISO_8859_1
},
745 { "IBM819", RTL_TEXTENCODING_ISO_8859_1
},
746 { "CP819", RTL_TEXTENCODING_ISO_8859_1
},
747 { "CSISOLATIN1", RTL_TEXTENCODING_ISO_8859_1
},
748 { "ISO-8859-2", RTL_TEXTENCODING_ISO_8859_2
},
749 { "ISO_8859-2:1987", RTL_TEXTENCODING_ISO_8859_2
},
750 { "ISO-IR-101", RTL_TEXTENCODING_ISO_8859_2
},
751 { "ISO_8859-2", RTL_TEXTENCODING_ISO_8859_2
},
752 { "LATIN2", RTL_TEXTENCODING_ISO_8859_2
},
753 { "L2", RTL_TEXTENCODING_ISO_8859_2
},
754 { "CSISOLATIN2", RTL_TEXTENCODING_ISO_8859_2
},
755 { "ISO-8859-3", RTL_TEXTENCODING_ISO_8859_3
},
756 { "ISO_8859-3:1988", RTL_TEXTENCODING_ISO_8859_3
},
757 { "ISO-IR-109", RTL_TEXTENCODING_ISO_8859_3
},
758 { "ISO_8859-3", RTL_TEXTENCODING_ISO_8859_3
},
759 { "LATIN3", RTL_TEXTENCODING_ISO_8859_3
},
760 { "L3", RTL_TEXTENCODING_ISO_8859_3
},
761 { "CSISOLATIN3", RTL_TEXTENCODING_ISO_8859_3
},
762 { "ISO-8859-4", RTL_TEXTENCODING_ISO_8859_4
},
763 { "ISO_8859-4:1988", RTL_TEXTENCODING_ISO_8859_4
},
764 { "ISO-IR-110", RTL_TEXTENCODING_ISO_8859_4
},
765 { "ISO_8859-4", RTL_TEXTENCODING_ISO_8859_4
},
766 { "LATIN4", RTL_TEXTENCODING_ISO_8859_4
},
767 { "L4", RTL_TEXTENCODING_ISO_8859_4
},
768 { "CSISOLATIN4", RTL_TEXTENCODING_ISO_8859_4
},
769 { "ISO-8859-5", RTL_TEXTENCODING_ISO_8859_5
},
770 { "ISO_8859-5:1988", RTL_TEXTENCODING_ISO_8859_5
},
771 { "ISO-IR-144", RTL_TEXTENCODING_ISO_8859_5
},
772 { "ISO_8859-5", RTL_TEXTENCODING_ISO_8859_5
},
773 { "CYRILLIC", RTL_TEXTENCODING_ISO_8859_5
},
774 { "CSISOLATINCYRILLIC", RTL_TEXTENCODING_ISO_8859_5
},
775 { "ISO-8859-6", RTL_TEXTENCODING_ISO_8859_6
},
776 { "ISO_8859-6:1987", RTL_TEXTENCODING_ISO_8859_6
},
777 { "ISO-IR-127", RTL_TEXTENCODING_ISO_8859_6
},
778 { "ISO_8859-6", RTL_TEXTENCODING_ISO_8859_6
},
779 { "ECMA-114", RTL_TEXTENCODING_ISO_8859_6
},
780 { "ASMO-708", RTL_TEXTENCODING_ISO_8859_6
},
781 { "ARABIC", RTL_TEXTENCODING_ISO_8859_6
},
782 { "CSISOLATINARABIC", RTL_TEXTENCODING_ISO_8859_6
},
783 { "ISO-8859-7", RTL_TEXTENCODING_ISO_8859_7
},
784 { "ISO_8859-7:1987", RTL_TEXTENCODING_ISO_8859_7
},
785 { "ISO-IR-126", RTL_TEXTENCODING_ISO_8859_7
},
786 { "ISO_8859-7", RTL_TEXTENCODING_ISO_8859_7
},
787 { "ELOT_928", RTL_TEXTENCODING_ISO_8859_7
},
788 { "ECMA-118", RTL_TEXTENCODING_ISO_8859_7
},
789 { "GREEK", RTL_TEXTENCODING_ISO_8859_7
},
790 { "GREEK8", RTL_TEXTENCODING_ISO_8859_7
},
791 { "CSISOLATINGREEK", RTL_TEXTENCODING_ISO_8859_7
},
792 { "ISO-8859-8", RTL_TEXTENCODING_ISO_8859_8
},
793 { "ISO_8859-8:1988", RTL_TEXTENCODING_ISO_8859_8
},
794 { "ISO-IR-138", RTL_TEXTENCODING_ISO_8859_8
},
795 { "ISO_8859-8", RTL_TEXTENCODING_ISO_8859_8
},
796 { "HEBREW", RTL_TEXTENCODING_ISO_8859_8
},
797 { "CSISOLATINHEBREW", RTL_TEXTENCODING_ISO_8859_8
},
798 { "ISO-8859-9", RTL_TEXTENCODING_ISO_8859_9
},
799 { "ISO_8859-9:1989", RTL_TEXTENCODING_ISO_8859_9
},
800 { "ISO-IR-148", RTL_TEXTENCODING_ISO_8859_9
},
801 { "ISO_8859-9", RTL_TEXTENCODING_ISO_8859_9
},
802 { "LATIN5", RTL_TEXTENCODING_ISO_8859_9
},
803 { "L5", RTL_TEXTENCODING_ISO_8859_9
},
804 { "CSISOLATIN5", RTL_TEXTENCODING_ISO_8859_9
},
805 { "ISO-8859-14", RTL_TEXTENCODING_ISO_8859_14
}, // RFC 2047
806 { "ISO_8859-15", RTL_TEXTENCODING_ISO_8859_15
},
807 { "ISO-8859-15", RTL_TEXTENCODING_ISO_8859_15
}, // RFC 2047
808 { "MACINTOSH", RTL_TEXTENCODING_APPLE_ROMAN
},
809 { "MAC", RTL_TEXTENCODING_APPLE_ROMAN
},
810 { "CSMACINTOSH", RTL_TEXTENCODING_APPLE_ROMAN
},
811 { "IBM437", RTL_TEXTENCODING_IBM_437
},
812 { "CP437", RTL_TEXTENCODING_IBM_437
},
813 { "437", RTL_TEXTENCODING_IBM_437
},
814 { "CSPC8CODEPAGE437", RTL_TEXTENCODING_IBM_437
},
815 { "IBM850", RTL_TEXTENCODING_IBM_850
},
816 { "CP850", RTL_TEXTENCODING_IBM_850
},
817 { "850", RTL_TEXTENCODING_IBM_850
},
818 { "CSPC850MULTILINGUAL", RTL_TEXTENCODING_IBM_850
},
819 { "IBM860", RTL_TEXTENCODING_IBM_860
},
820 { "CP860", RTL_TEXTENCODING_IBM_860
},
821 { "860", RTL_TEXTENCODING_IBM_860
},
822 { "CSIBM860", RTL_TEXTENCODING_IBM_860
},
823 { "IBM861", RTL_TEXTENCODING_IBM_861
},
824 { "CP861", RTL_TEXTENCODING_IBM_861
},
825 { "861", RTL_TEXTENCODING_IBM_861
},
826 { "CP-IS", RTL_TEXTENCODING_IBM_861
},
827 { "CSIBM861", RTL_TEXTENCODING_IBM_861
},
828 { "IBM863", RTL_TEXTENCODING_IBM_863
},
829 { "CP863", RTL_TEXTENCODING_IBM_863
},
830 { "863", RTL_TEXTENCODING_IBM_863
},
831 { "CSIBM863", RTL_TEXTENCODING_IBM_863
},
832 { "IBM865", RTL_TEXTENCODING_IBM_865
},
833 { "CP865", RTL_TEXTENCODING_IBM_865
},
834 { "865", RTL_TEXTENCODING_IBM_865
},
835 { "CSIBM865", RTL_TEXTENCODING_IBM_865
},
836 { "IBM775", RTL_TEXTENCODING_IBM_775
},
837 { "CP775", RTL_TEXTENCODING_IBM_775
},
838 { "CSPC775BALTIC", RTL_TEXTENCODING_IBM_775
},
839 { "IBM852", RTL_TEXTENCODING_IBM_852
},
840 { "CP852", RTL_TEXTENCODING_IBM_852
},
841 { "852", RTL_TEXTENCODING_IBM_852
},
842 { "CSPCP852", RTL_TEXTENCODING_IBM_852
},
843 { "IBM855", RTL_TEXTENCODING_IBM_855
},
844 { "CP855", RTL_TEXTENCODING_IBM_855
},
845 { "855", RTL_TEXTENCODING_IBM_855
},
846 { "CSIBM855", RTL_TEXTENCODING_IBM_855
},
847 { "IBM857", RTL_TEXTENCODING_IBM_857
},
848 { "CP857", RTL_TEXTENCODING_IBM_857
},
849 { "857", RTL_TEXTENCODING_IBM_857
},
850 { "CSIBM857", RTL_TEXTENCODING_IBM_857
},
851 { "IBM862", RTL_TEXTENCODING_IBM_862
},
852 { "CP862", RTL_TEXTENCODING_IBM_862
},
853 { "862", RTL_TEXTENCODING_IBM_862
},
854 { "CSPC862LATINHEBREW", RTL_TEXTENCODING_IBM_862
},
855 { "IBM864", RTL_TEXTENCODING_IBM_864
},
856 { "CP864", RTL_TEXTENCODING_IBM_864
},
857 { "CSIBM864", RTL_TEXTENCODING_IBM_864
},
858 { "IBM866", RTL_TEXTENCODING_IBM_866
},
859 { "CP866", RTL_TEXTENCODING_IBM_866
},
860 { "866", RTL_TEXTENCODING_IBM_866
},
861 { "CSIBM866", RTL_TEXTENCODING_IBM_866
},
862 { "IBM869", RTL_TEXTENCODING_IBM_869
},
863 { "CP869", RTL_TEXTENCODING_IBM_869
},
864 { "869", RTL_TEXTENCODING_IBM_869
},
865 { "CP-GR", RTL_TEXTENCODING_IBM_869
},
866 { "CSIBM869", RTL_TEXTENCODING_IBM_869
},
867 { "WINDOWS-1250", RTL_TEXTENCODING_MS_1250
},
868 { "WINDOWS-1251", RTL_TEXTENCODING_MS_1251
},
869 { "WINDOWS-1253", RTL_TEXTENCODING_MS_1253
},
870 { "WINDOWS-1254", RTL_TEXTENCODING_MS_1254
},
871 { "WINDOWS-1255", RTL_TEXTENCODING_MS_1255
},
872 { "WINDOWS-1256", RTL_TEXTENCODING_MS_1256
},
873 { "WINDOWS-1257", RTL_TEXTENCODING_MS_1257
},
874 { "WINDOWS-1258", RTL_TEXTENCODING_MS_1258
},
875 { "SHIFT_JIS", RTL_TEXTENCODING_SHIFT_JIS
},
876 { "MS_KANJI", RTL_TEXTENCODING_SHIFT_JIS
},
877 { "CSSHIFTJIS", RTL_TEXTENCODING_SHIFT_JIS
},
878 { "GB2312", RTL_TEXTENCODING_GB_2312
},
879 { "CSGB2312", RTL_TEXTENCODING_GB_2312
},
880 { "BIG5", RTL_TEXTENCODING_BIG5
},
881 { "CSBIG5", RTL_TEXTENCODING_BIG5
},
882 { "EUC-JP", RTL_TEXTENCODING_EUC_JP
},
883 { "EXTENDED_UNIX_CODE_PACKED_FORMAT_FOR_JAPANESE",
884 RTL_TEXTENCODING_EUC_JP
},
885 { "CSEUCPKDFMTJAPANESE", RTL_TEXTENCODING_EUC_JP
},
886 { "ISO-2022-JP", RTL_TEXTENCODING_ISO_2022_JP
},
887 { "CSISO2022JP", RTL_TEXTENCODING_ISO_2022_JP
},
888 { "ISO-2022-CN", RTL_TEXTENCODING_ISO_2022_CN
},
889 { "KOI8-R", RTL_TEXTENCODING_KOI8_R
},
890 { "CSKOI8R", RTL_TEXTENCODING_KOI8_R
},
891 { "UTF-7", RTL_TEXTENCODING_UTF7
},
892 { "UTF-8", RTL_TEXTENCODING_UTF8
},
893 { "ISO-8859-10", RTL_TEXTENCODING_ISO_8859_10
}, // RFC 2047
894 { "ISO-8859-13", RTL_TEXTENCODING_ISO_8859_13
}, // RFC 2047
895 { "EUC-KR", RTL_TEXTENCODING_EUC_KR
},
896 { "CSEUCKR", RTL_TEXTENCODING_EUC_KR
},
897 { "ISO-2022-KR", RTL_TEXTENCODING_ISO_2022_KR
},
898 { "CSISO2022KR", RTL_TEXTENCODING_ISO_2022_KR
},
899 { "ISO-10646-UCS-4", RTL_TEXTENCODING_UCS4
},
900 { "CSUCS4", RTL_TEXTENCODING_UCS4
},
901 { "ISO-10646-UCS-2", RTL_TEXTENCODING_UCS2
},
902 { "CSUNICODE", RTL_TEXTENCODING_UCS2
} };
904 rtl_TextEncoding
getCharsetEncoding(char const * pBegin
,
907 for (const EncodingEntry
& i
: aEncodingMap
)
908 if (equalIgnoreCase(pBegin
, pEnd
, i
.m_aName
))
909 return i
.m_eEncoding
;
910 return RTL_TEXTENCODING_DONTKNOW
;
918 bool INetMIME::isAtomChar(sal_uInt32 nChar
)
920 static const bool aMap
[128]
921 = { false, false, false, false, false, false, false, false,
922 false, false, false, false, false, false, false, false,
923 false, false, false, false, false, false, false, false,
924 false, false, false, false, false, false, false, false,
925 false, true, false, true, true, true, true, true, // !"#$%&'
926 false, false, true, true, false, true, false, true, //()*+,-./
927 true, true, true, true, true, true, true, true, //01234567
928 true, true, false, false, false, true, false, true, //89:;<=>?
929 false, true, true, true, true, true, true, true, //@ABCDEFG
930 true, true, true, true, true, true, true, true, //HIJKLMNO
931 true, true, true, true, true, true, true, true, //PQRSTUVW
932 true, true, true, false, false, false, true, true, //XYZ[\]^_
933 true, true, true, true, true, true, true, true, //`abcdefg
934 true, true, true, true, true, true, true, true, //hijklmno
935 true, true, true, true, true, true, true, true, //pqrstuvw
936 true, true, true, true, true, true, true, false //xyz{|}~
938 return rtl::isAscii(nChar
) && aMap
[nChar
];
942 bool INetMIME::isIMAPAtomChar(sal_uInt32 nChar
)
944 static const bool aMap
[128]
945 = { false, false, false, false, false, false, false, false,
946 false, false, false, false, false, false, false, false,
947 false, false, false, false, false, false, false, false,
948 false, false, false, false, false, false, false, false,
949 false, true, false, true, true, false, true, true, // !"#$%&'
950 false, false, false, true, true, true, true, true, //()*+,-./
951 true, true, true, true, true, true, true, true, //01234567
952 true, true, true, true, true, true, true, true, //89:;<=>?
953 true, true, true, true, true, true, true, true, //@ABCDEFG
954 true, true, true, true, true, true, true, true, //HIJKLMNO
955 true, true, true, true, true, true, true, true, //PQRSTUVW
956 true, true, true, true, false, true, true, true, //XYZ[\]^_
957 true, true, true, true, true, true, true, true, //`abcdefg
958 true, true, true, true, true, true, true, true, //hijklmno
959 true, true, true, true, true, true, true, true, //pqrstuvw
960 true, true, true, false, true, true, true, false //xyz{|}~
962 return rtl::isAscii(nChar
) && aMap
[nChar
];
966 bool INetMIME::equalIgnoreCase(const sal_Unicode
* pBegin1
,
967 const sal_Unicode
* pEnd1
,
968 const char * pString2
)
970 assert(pBegin1
&& pBegin1
<= pEnd1
&& pString2
&&
971 "INetMIME::equalIgnoreCase(): Bad sequences");
973 while (*pString2
!= 0)
975 || (rtl::toAsciiUpperCase(*pBegin1
++)
976 != rtl::toAsciiUpperCase(
977 static_cast<unsigned char>(*pString2
++))))
979 return pBegin1
== pEnd1
;
983 bool INetMIME::scanUnsigned(const sal_Unicode
*& rBegin
,
984 const sal_Unicode
* pEnd
, bool bLeadingZeroes
,
987 sal_uInt64 nTheValue
= 0;
988 const sal_Unicode
* p
= rBegin
;
989 for ( ; p
!= pEnd
; ++p
)
991 int nWeight
= getWeight(*p
);
994 nTheValue
= 10 * nTheValue
+ nWeight
;
995 if (nTheValue
> std::numeric_limits
< sal_uInt32
>::max())
998 if (nTheValue
== 0 && (p
== rBegin
|| (!bLeadingZeroes
&& p
- rBegin
!= 1)))
1001 rValue
= sal_uInt32(nTheValue
);
1006 sal_Unicode
const * INetMIME::scanContentType(
1007 std::u16string_view rStr
, OUString
* pType
,
1008 OUString
* pSubType
, INetContentTypeParameterList
* pParameters
)
1010 sal_Unicode
const * pBegin
= rStr
.data();
1011 sal_Unicode
const * pEnd
= pBegin
+ rStr
.size();
1012 sal_Unicode
const * p
= skipLinearWhiteSpaceComment(pBegin
, pEnd
);
1013 sal_Unicode
const * pTypeBegin
= p
;
1014 while (p
!= pEnd
&& isTokenChar(*p
))
1018 if (p
== pTypeBegin
)
1020 sal_Unicode
const * pTypeEnd
= p
;
1022 p
= skipLinearWhiteSpaceComment(p
, pEnd
);
1023 if (p
== pEnd
|| *p
++ != '/')
1026 p
= skipLinearWhiteSpaceComment(p
, pEnd
);
1027 sal_Unicode
const * pSubTypeBegin
= p
;
1028 while (p
!= pEnd
&& isTokenChar(*p
))
1032 if (p
== pSubTypeBegin
)
1034 sal_Unicode
const * pSubTypeEnd
= p
;
1036 if (pType
!= nullptr)
1038 *pType
= OUString(pTypeBegin
, pTypeEnd
- pTypeBegin
).toAsciiLowerCase();
1040 if (pSubType
!= nullptr)
1042 *pSubType
= OUString(pSubTypeBegin
, pSubTypeEnd
- pSubTypeBegin
)
1043 .toAsciiLowerCase();
1046 return scanParameters(p
, pEnd
, pParameters
);
1050 OUString
INetMIME::decodeHeaderFieldBody(const OString
& rBody
)
1052 // Due to a bug in INetCoreRFC822MessageStream::ConvertTo7Bit(), old
1053 // versions of StarOffice send mails with header fields where encoded
1054 // words can be preceded by '=', ',', '.', '"', or '(', and followed by
1055 // '=', ',', '.', '"', ')', without any required white space in between.
1056 // And there appear to exist some broken mailers that only encode single
1057 // letters within words, like "Appel
1058 // =?iso-8859-1?Q?=E0?=t=?iso-8859-1?Q?=E9?=moin", so it seems best to
1059 // detect encoded words even when not properly surrounded by white space.
1061 // Non US-ASCII characters in rBody are treated as ISO-8859-1.
1063 // encoded-word = "=?"
1064 // 1*(%x21 / %x23-27 / %x2A-2B / %x2D / %x30-39 / %x41-5A / %x5E-7E)
1065 // ["*" 1*8ALPHA *("-" 1*8ALPHA)] "?"
1066 // ("B?" *(4base64) (4base64 / 3base64 "=" / 2base64 "==")
1067 // / "Q?" 1*(%x21-3C / %x3E / %x40-7E / "=" 2HEXDIG))
1070 // base64 = ALPHA / DIGIT / "+" / "/"
1072 const char * pBegin
= rBody
.getStr();
1073 const char * pEnd
= pBegin
+ rBody
.getLength();
1075 OUStringBuffer sDecoded
;
1076 const char * pCopyBegin
= pBegin
;
1078 /* bool bStartEncodedWord = true; */
1079 const char * pWSPBegin
= pBegin
;
1081 for (const char * p
= pBegin
; p
!= pEnd
;)
1083 if (*p
== '=' /* && bStartEncodedWord */)
1085 const char * q
= p
+ 1;
1086 bool bEncodedWord
= q
!= pEnd
&& *q
++ == '?';
1088 rtl_TextEncoding eCharsetEncoding
= RTL_TEXTENCODING_DONTKNOW
;
1091 const char * pCharsetBegin
= q
;
1092 const char * pLanguageBegin
= nullptr;
1093 int nAlphaCount
= 0;
1094 for (bool bDone
= false; !bDone
;)
1097 bEncodedWord
= false;
1106 pLanguageBegin
= q
- 1;
1111 if (pLanguageBegin
!= nullptr)
1113 if (nAlphaCount
== 0)
1114 pLanguageBegin
= nullptr;
1121 if (pCharsetBegin
== q
- 1)
1122 bEncodedWord
= false;
1126 = getCharsetEncoding(
1128 pLanguageBegin
== nullptr
1129 || nAlphaCount
== 0 ?
1130 q
- 1 : pLanguageBegin
);
1131 bEncodedWord
= isMIMECharsetEncoding(
1134 = translateFromMIME(eCharsetEncoding
);
1140 if (pLanguageBegin
!= nullptr
1141 && (!rtl::isAsciiAlpha(
1142 static_cast<unsigned char>(cChar
))
1143 || ++nAlphaCount
> 8))
1144 pLanguageBegin
= nullptr;
1150 bool bEncodingB
= false;
1154 bEncodedWord
= false;
1170 bEncodedWord
= false;
1176 bEncodedWord
= bEncodedWord
&& q
!= pEnd
&& *q
++ == '?';
1178 OStringBuffer sText
;
1183 for (bool bDone
= false; !bDone
;)
1187 bEncodedWord
= false;
1192 bool bFinal
= false;
1194 sal_uInt32 nValue
= 0;
1195 for (int nShift
= 18; nShift
>= 0; nShift
-= 6)
1197 int nWeight
= getBase64Weight(*q
++);
1200 bEncodedWord
= false;
1210 bEncodedWord
= false;
1215 nCount
= nShift
== 6 ? 1 : 2;
1219 nValue
|= nWeight
<< nShift
;
1223 for (int nShift
= 16; nCount
-- > 0; nShift
-= 8)
1224 sText
.append(char(nValue
>> nShift
& 0xFF));
1230 if (bFinal
&& !bDone
)
1232 bEncodedWord
= false;
1241 const char * pEncodedTextBegin
= q
;
1242 const char * pEncodedTextCopyBegin
= q
;
1243 for (bool bDone
= false; !bDone
;)
1246 bEncodedWord
= false;
1251 sal_uInt32 nChar
= static_cast<unsigned char>(*q
++);
1258 bEncodedWord
= false;
1262 int nDigit1
= getHexWeight(q
[0]);
1263 int nDigit2
= getHexWeight(q
[1]);
1264 if (nDigit1
< 0 || nDigit2
< 0)
1266 bEncodedWord
= false;
1272 (pEncodedTextCopyBegin
- pBegin
),
1273 (q
- 1 - pEncodedTextCopyBegin
))
1274 + OStringChar(char(nDigit1
<< 4 | nDigit2
)));
1276 pEncodedTextCopyBegin
= q
;
1281 if (q
- pEncodedTextBegin
> 1)
1282 sText
.append(rBody
.subView(
1283 (pEncodedTextCopyBegin
- pBegin
),
1284 (q
- 1 - pEncodedTextCopyBegin
)));
1286 bEncodedWord
= false;
1293 (pEncodedTextCopyBegin
- pBegin
),
1294 (q
- 1 - pEncodedTextCopyBegin
))
1295 + OString::Concat(" "));
1296 pEncodedTextCopyBegin
= q
;
1300 if (!isVisible(nChar
))
1302 bEncodedWord
= false;
1311 bEncodedWord
= bEncodedWord
&& q
!= pEnd
&& *q
++ == '=';
1313 std::unique_ptr
<sal_Unicode
[]> pUnicodeBuffer
;
1314 sal_Size nUnicodeSize
= 0;
1318 = convertToUnicode(sText
.getStr(),
1319 sText
.getStr() + sText
.getLength(),
1320 eCharsetEncoding
, nUnicodeSize
);
1321 if (!pUnicodeBuffer
)
1322 bEncodedWord
= false;
1327 appendISO88591(sDecoded
, pCopyBegin
, pWSPBegin
);
1329 pUnicodeBuffer
.get(),
1330 static_cast< sal_Int32
>(nUnicodeSize
));
1331 pUnicodeBuffer
.reset();
1336 while (p
!= pEnd
&& isWhiteSpace(*p
))
1338 /* bStartEncodedWord = p != pWSPBegin; */
1349 /* bStartEncodedWord = true; */
1353 /* bStartEncodedWord = true; */
1357 /* bStartEncodedWord = false; */
1362 const char * pUTF8Begin
= p
- 1;
1363 const char * pUTF8End
= pUTF8Begin
;
1364 sal_uInt32 nCharacter
= 0;
1365 if (translateUTF8Char(pUTF8End
, pEnd
, nCharacter
))
1367 appendISO88591(sDecoded
, pCopyBegin
, p
- 1);
1368 sDecoded
.appendUtf32(nCharacter
);
1372 /* bStartEncodedWord = false; */
1379 appendISO88591(sDecoded
, pCopyBegin
, pEnd
);
1380 return sDecoded
.makeStringAndClear();
1383 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */