1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
3 * This file is part of the LibreOffice project.
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 * This file incorporates work covered by the following license notice:
11 * Licensed to the Apache Software Foundation (ASF) under one or more
12 * contributor license agreements. See the NOTICE file distributed
13 * with this work for additional information regarding copyright
14 * ownership. The ASF licenses this file to you under the Apache
15 * License, Version 2.0 (the "License"); you may not use this file
16 * except in compliance with the License. You may obtain a copy of
17 * the License at http://www.apache.org/licenses/LICENSE-2.0 .
22 #include <forward_list>
25 #include <sal/log.hxx>
26 #include <rtl/ustring.hxx>
27 #include <rtl/strbuf.hxx>
28 #include <rtl/ustrbuf.hxx>
29 #include <rtl/tencinfo.h>
30 #include <tools/inetmime.hxx>
31 #include <rtl/character.hxx>
35 rtl_TextEncoding
getCharsetEncoding(const char * pBegin
,
38 /** Check for US-ASCII white space character.
40 @param nChar Some UCS-4 character.
42 @return True if nChar is a US-ASCII white space character (US-ASCII
45 bool isWhiteSpace(sal_uInt32 nChar
)
47 return nChar
== '\t' || nChar
== ' ';
50 /** Get the Base 64 digit weight of a US-ASCII character.
52 @param nChar Some UCS-4 character.
54 @return If nChar is a US-ASCII Base 64 digit character (US-ASCII
55 'A'--'F', or 'a'--'f', '0'--'9', '+', or '/'), return the
56 corresponding weight (0--63); if nChar is the US-ASCII Base 64 padding
57 character (US-ASCII '='), return -1; otherwise, return -2.
59 int getBase64Weight(sal_uInt32 nChar
)
61 return rtl::isAsciiUpperCase(nChar
) ? int(nChar
- 'A') :
62 rtl::isAsciiLowerCase(nChar
) ? int(nChar
- 'a' + 26) :
63 rtl::isAsciiDigit(nChar
) ? int(nChar
- '0' + 52) :
66 nChar
== '=' ? -1 : -2;
69 bool startsWithLineFolding(const sal_Unicode
* pBegin
,
70 const sal_Unicode
* pEnd
)
72 DBG_ASSERT(pBegin
&& pBegin
<= pEnd
,
73 "startsWithLineFolding(): Bad sequence");
75 return pEnd
- pBegin
>= 3 && pBegin
[0] == 0x0D && pBegin
[1] == 0x0A
76 && isWhiteSpace(pBegin
[2]); // CR, LF
79 rtl_TextEncoding
translateFromMIME(rtl_TextEncoding
83 return eEncoding
== RTL_TEXTENCODING_ISO_8859_1
?
84 RTL_TEXTENCODING_MS_1252
: eEncoding
;
90 bool isMIMECharsetEncoding(rtl_TextEncoding eEncoding
)
92 return rtl_isOctetTextEncoding(eEncoding
);
95 std::unique_ptr
<sal_Unicode
[]> convertToUnicode(const char * pBegin
,
97 rtl_TextEncoding eEncoding
,
100 if (eEncoding
== RTL_TEXTENCODING_DONTKNOW
)
102 rtl_TextToUnicodeConverter hConverter
103 = rtl_createTextToUnicodeConverter(eEncoding
);
104 rtl_TextToUnicodeContext hContext
105 = rtl_createTextToUnicodeContext(hConverter
);
106 std::unique_ptr
<sal_Unicode
[]> pBuffer
;
108 for (sal_Size nBufferSize
= pEnd
- pBegin
;;
109 nBufferSize
+= nBufferSize
/ 3 + 1)
111 pBuffer
.reset(new sal_Unicode
[nBufferSize
]);
112 sal_Size nSrcCvtBytes
;
113 rSize
= rtl_convertTextToUnicode(
114 hConverter
, hContext
, pBegin
, pEnd
- pBegin
, pBuffer
.get(),
116 RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_ERROR
117 | RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_ERROR
118 | RTL_TEXTTOUNICODE_FLAGS_INVALID_ERROR
,
119 &nInfo
, &nSrcCvtBytes
);
120 if (nInfo
!= RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOOSMALL
)
123 rtl_resetTextToUnicodeContext(hConverter
, hContext
);
125 rtl_destroyTextToUnicodeContext(hConverter
, hContext
);
126 rtl_destroyTextToUnicodeConverter(hConverter
);
134 void writeUTF8(OStringBuffer
& rSink
, sal_uInt32 nChar
)
136 // See RFC 2279 for a discussion of UTF-8.
137 DBG_ASSERT(nChar
< 0x80000000, "writeUTF8(): Bad char");
140 rSink
.append(char(nChar
));
141 else if (nChar
< 0x800)
142 rSink
.append(OStringChar(char(nChar
>> 6 | 0xC0))
143 + OStringChar(char((nChar
& 0x3F) | 0x80)));
144 else if (nChar
< 0x10000)
146 OStringChar(char(nChar
>> 12 | 0xE0))
147 + OStringChar(char((nChar
>> 6 & 0x3F) | 0x80))
148 + OStringChar(char((nChar
& 0x3F) | 0x80)));
149 else if (nChar
< 0x200000)
151 OStringChar(char(nChar
>> 18 | 0xF0))
152 + OStringChar(char((nChar
>> 12 & 0x3F) | 0x80))
153 + OStringChar(char((nChar
>> 6 & 0x3F) | 0x80))
154 + OStringChar(char((nChar
& 0x3F) | 0x80)));
155 else if (nChar
< 0x4000000)
157 OStringChar(char(nChar
>> 24 | 0xF8))
158 + OStringChar(char((nChar
>> 18 & 0x3F) | 0x80))
159 + OStringChar(char((nChar
>> 12 & 0x3F) | 0x80))
160 + OStringChar(char((nChar
>> 6 & 0x3F) | 0x80))
161 + OStringChar(char((nChar
& 0x3F) | 0x80)));
164 OStringChar(char(nChar
>> 30 | 0xFC))
165 + OStringChar(char((nChar
>> 24 & 0x3F) | 0x80))
166 + OStringChar(char((nChar
>> 18 & 0x3F) | 0x80))
167 + OStringChar(char((nChar
>> 12 & 0x3F) | 0x80))
168 + OStringChar(char((nChar
>> 6 & 0x3F) | 0x80))
169 + OStringChar(char((nChar
& 0x3F) | 0x80)));
172 bool translateUTF8Char(const char *& rBegin
,
174 sal_uInt32
& rCharacter
)
176 if (rBegin
== pEnd
|| static_cast< unsigned char >(*rBegin
) < 0x80
177 || static_cast< unsigned char >(*rBegin
) >= 0xFE)
183 const char * p
= rBegin
;
184 if (static_cast< unsigned char >(*p
) < 0xE0)
188 nUCS4
= static_cast< unsigned char >(*p
) & 0x1F;
190 else if (static_cast< unsigned char >(*p
) < 0xF0)
194 nUCS4
= static_cast< unsigned char >(*p
) & 0xF;
196 else if (static_cast< unsigned char >(*p
) < 0xF8)
200 nUCS4
= static_cast< unsigned char >(*p
) & 7;
202 else if (static_cast< unsigned char >(*p
) < 0xFC)
206 nUCS4
= static_cast< unsigned char >(*p
) & 3;
212 nUCS4
= static_cast< unsigned char >(*p
) & 1;
216 for (; nCount
-- > 0; ++p
)
217 if ((static_cast< unsigned char >(*p
) & 0xC0) == 0x80)
218 nUCS4
= (nUCS4
<< 6) | (static_cast< unsigned char >(*p
) & 0x3F);
222 if (!rtl::isUnicodeCodePoint(nUCS4
) || nUCS4
< nMin
)
230 void appendISO88591(OUStringBuffer
& rText
, char const * pBegin
,
235 OString m_aAttribute
;
239 sal_uInt32 m_nSection
;
242 bool operator<(const Parameter
& rhs
) const // is used by std::list<Parameter>::sort
244 int nComp
= m_aAttribute
.compareTo(rhs
.m_aAttribute
);
246 (nComp
== 0 && m_nSection
< rhs
.m_nSection
);
248 struct IsSameSection
// is used to check container for duplicates with std::any_of
250 const OString
& rAttribute
;
251 const sal_uInt32 nSection
;
252 bool operator()(const Parameter
& r
) const
253 { return r
.m_aAttribute
== rAttribute
&& r
.m_nSection
== nSection
; }
257 typedef std::forward_list
<Parameter
> ParameterList
;
259 bool parseParameters(ParameterList
const & rInput
,
260 INetContentTypeParameterList
* pOutput
);
264 void appendISO88591(OUStringBuffer
& rText
, char const * pBegin
,
267 sal_Int32 nLength
= pEnd
- pBegin
;
268 std::unique_ptr
<sal_Unicode
[]> pBuffer(new sal_Unicode
[nLength
]);
269 for (sal_Unicode
* p
= pBuffer
.get(); pBegin
!= pEnd
;)
270 *p
++ = static_cast<unsigned char>(*pBegin
++);
271 rText
.append(pBuffer
.get(), nLength
);
276 bool parseParameters(ParameterList
const & rInput
,
277 INetContentTypeParameterList
* pOutput
)
282 for (auto it
= rInput
.begin(), itPrev
= rInput
.end(); it
!= rInput
.end() ; itPrev
= it
++)
284 if (it
->m_nSection
> 0
285 && (itPrev
== rInput
.end()
286 || itPrev
->m_nSection
!= it
->m_nSection
- 1
287 || itPrev
->m_aAttribute
!= it
->m_aAttribute
))
292 for (auto it
= rInput
.begin(), itNext
= rInput
.begin(); it
!= rInput
.end(); it
= itNext
)
294 bool bCharset
= !it
->m_aCharset
.isEmpty();
295 rtl_TextEncoding eEncoding
= RTL_TEXTENCODING_DONTKNOW
;
298 = getCharsetEncoding(it
->m_aCharset
.getStr(),
299 it
->m_aCharset
.getStr()
300 + it
->m_aCharset
.getLength());
301 OUStringBuffer
aValue(64);
302 bool bBadEncoding
= false;
307 std::unique_ptr
<sal_Unicode
[]> pUnicode
308 = convertToUnicode(itNext
->m_aValue
.getStr(),
309 itNext
->m_aValue
.getStr()
310 + itNext
->m_aValue
.getLength(),
311 bCharset
&& it
->m_bExtended
?
313 RTL_TEXTENCODING_UTF8
,
315 if (!pUnicode
&& !(bCharset
&& it
->m_bExtended
))
316 pUnicode
= convertToUnicode(
317 itNext
->m_aValue
.getStr(),
318 itNext
->m_aValue
.getStr()
319 + itNext
->m_aValue
.getLength(),
320 RTL_TEXTENCODING_ISO_8859_1
, nSize
);
326 aValue
.append(pUnicode
.get(), static_cast<sal_Int32
>(nSize
));
329 while (itNext
!= rInput
.end() && itNext
->m_nSection
!= 0);
337 if (itNext
->m_bExtended
)
339 for (sal_Int32 i
= 0; i
< itNext
->m_aValue
.getLength(); ++i
)
341 static_cast<sal_Unicode
>(
342 static_cast<unsigned char>(itNext
->m_aValue
[i
])
343 | 0xF800)); // map to unicode corporate use sub area
347 for (sal_Int32 i
= 0; i
< itNext
->m_aValue
.getLength(); ++i
)
348 aValue
.append( itNext
->m_aValue
[i
] );
352 while (itNext
!= rInput
.end() && itNext
->m_nSection
!= 0);
354 auto const ret
= pOutput
->insert(
356 {it
->m_aCharset
, it
->m_aLanguage
, aValue
.makeStringAndClear(), !bBadEncoding
}});
357 SAL_INFO_IF(!ret
.second
, "tools",
358 "INetMIME: dropping duplicate parameter: " << it
->m_aAttribute
);
363 /** Check whether some character is valid within an RFC 2045 <token>.
365 @param nChar Some UCS-4 character.
367 @return True if nChar is valid within an RFC 2047 <token> (US-ASCII
368 'A'--'Z', 'a'--'z', '0'--'9', '!', '#', '$', '%', '&', ''', '*', '+',
369 '-', '.', '^', '_', '`', '{', '|', '}', or '~').
371 bool isTokenChar(sal_uInt32 nChar
)
373 static const bool aMap
[128]
374 = { false, false, false, false, false, false, false, false,
375 false, false, false, false, false, false, false, false,
376 false, false, false, false, false, false, false, false,
377 false, false, false, false, false, false, false, false,
378 false, true, false, true, true, true, true, true, // !"#$%&'
379 false, false, true, true, false, true, true, false, //()*+,-./
380 true, true, true, true, true, true, true, true, //01234567
381 true, true, false, false, false, false, false, false, //89:;<=>?
382 false, true, true, true, true, true, true, true, //@ABCDEFG
383 true, true, true, true, true, true, true, true, //HIJKLMNO
384 true, true, true, true, true, true, true, true, //PQRSTUVW
385 true, true, true, false, false, false, true, true, //XYZ[\]^_
386 true, true, true, true, true, true, true, true, //`abcdefg
387 true, true, true, true, true, true, true, true, //hijklmno
388 true, true, true, true, true, true, true, true, //pqrstuvw
389 true, true, true, true, true, true, true, false //xyz{|}~
391 return rtl::isAscii(nChar
) && aMap
[nChar
];
394 const sal_Unicode
* skipComment(const sal_Unicode
* pBegin
,
395 const sal_Unicode
* pEnd
)
397 DBG_ASSERT(pBegin
&& pBegin
<= pEnd
,
398 "skipComment(): Bad sequence");
400 if (pBegin
!= pEnd
&& *pBegin
== '(')
402 sal_uInt32 nLevel
= 0;
403 for (const sal_Unicode
* p
= pBegin
; p
!= pEnd
;)
424 const sal_Unicode
* skipLinearWhiteSpaceComment(const sal_Unicode
*
429 DBG_ASSERT(pBegin
&& pBegin
<= pEnd
,
430 "skipLinearWhiteSpaceComment(): Bad sequence");
432 while (pBegin
!= pEnd
)
441 if (startsWithLineFolding(pBegin
, pEnd
))
449 const sal_Unicode
* p
= skipComment(pBegin
, pEnd
);
462 const sal_Unicode
* skipQuotedString(const sal_Unicode
* pBegin
,
463 const sal_Unicode
* pEnd
)
465 DBG_ASSERT(pBegin
&& pBegin
<= pEnd
,
466 "skipQuotedString(): Bad sequence");
468 if (pBegin
!= pEnd
&& *pBegin
== '"')
469 for (const sal_Unicode
* p
= pBegin
+ 1; p
!= pEnd
;)
473 if (pEnd
- p
< 2 || *p
++ != 0x0A // LF
474 || !isWhiteSpace(*p
++))
489 sal_Unicode
const * scanParameters(sal_Unicode
const * pBegin
,
490 sal_Unicode
const * pEnd
,
491 INetContentTypeParameterList
*
495 sal_Unicode
const * pParameterBegin
= pBegin
;
496 for (sal_Unicode
const * p
= pParameterBegin
;;)
498 pParameterBegin
= skipLinearWhiteSpaceComment(p
, pEnd
);
499 if (pParameterBegin
== pEnd
|| *pParameterBegin
!= ';')
501 p
= pParameterBegin
+ 1;
503 sal_Unicode
const * pAttributeBegin
504 = skipLinearWhiteSpaceComment(p
, pEnd
);
506 bool bDowncaseAttribute
= false;
507 while (p
!= pEnd
&& isTokenChar(*p
) && *p
!= '*')
509 bDowncaseAttribute
= bDowncaseAttribute
|| rtl::isAsciiUpperCase(*p
);
512 if (p
== pAttributeBegin
)
514 OString
aAttribute(pAttributeBegin
, p
- pAttributeBegin
, RTL_TEXTENCODING_ASCII_US
);
515 if (bDowncaseAttribute
)
516 aAttribute
= aAttribute
.toAsciiLowerCase();
518 sal_uInt32 nSection
= 0;
519 if (p
!= pEnd
&& *p
== '*')
522 if (p
!= pEnd
&& rtl::isAsciiDigit(*p
)
523 && !INetMIME::scanUnsigned(p
, pEnd
, false, nSection
))
527 bool bPresent
= std::any_of(aList
.begin(), aList
.end(),
528 Parameter::IsSameSection
{aAttribute
, nSection
});
532 bool bExtended
= false;
533 if (p
!= pEnd
&& *p
== '*')
539 p
= skipLinearWhiteSpaceComment(p
, pEnd
);
541 if (p
== pEnd
|| *p
!= '=')
544 p
= skipLinearWhiteSpaceComment(p
+ 1, pEnd
);
553 sal_Unicode
const * pCharsetBegin
= p
;
554 bool bDowncaseCharset
= false;
555 while (p
!= pEnd
&& isTokenChar(*p
) && *p
!= '\'')
557 bDowncaseCharset
= bDowncaseCharset
|| rtl::isAsciiUpperCase(*p
);
560 if (p
== pCharsetBegin
)
567 RTL_TEXTENCODING_ASCII_US
);
568 if (bDowncaseCharset
)
569 aCharset
= aCharset
.toAsciiLowerCase();
572 if (p
== pEnd
|| *p
!= '\'')
576 sal_Unicode
const * pLanguageBegin
= p
;
577 bool bDowncaseLanguage
= false;
579 for (; p
!= pEnd
; ++p
)
580 if (rtl::isAsciiAlpha(*p
))
584 bDowncaseLanguage
= bDowncaseLanguage
585 || rtl::isAsciiUpperCase(*p
);
595 if (nLetters
== 0 || nLetters
> 8)
602 RTL_TEXTENCODING_ASCII_US
);
603 if (bDowncaseLanguage
)
604 aLanguage
= aLanguage
.toAsciiLowerCase();
607 if (p
== pEnd
|| *p
!= '\'')
617 sal_uInt32 nChar
= INetMIME::getUTF32Character(q
, pEnd
);
618 if (rtl::isAscii(nChar
) && !isTokenChar(nChar
))
621 if (nChar
== '%' && p
+ 1 < pEnd
)
623 int nWeight1
= INetMIME::getHexWeight(p
[0]);
624 int nWeight2
= INetMIME::getHexWeight(p
[1]);
625 if (nWeight1
>= 0 && nWeight2
>= 0)
627 aSink
.append(char(nWeight1
<< 4 | nWeight2
));
632 writeUTF8(aSink
, nChar
);
634 aValue
= aSink
.makeStringAndClear();
637 while (p
!= pEnd
&& (isTokenChar(*p
) || !rtl::isAscii(*p
)))
640 else if (p
!= pEnd
&& *p
== '"')
643 OStringBuffer
aSink(256);
644 bool bInvalid
= false;
652 sal_uInt32 nChar
= INetMIME::getUTF32Character(p
, pEnd
);
655 else if (nChar
== 0x0D) // CR
657 if (pEnd
- p
< 2 || *p
++ != 0x0A // LF
658 || !isWhiteSpace(*p
))
663 nChar
= static_cast<unsigned char>(*p
++);
665 else if (nChar
== '\\')
672 nChar
= INetMIME::getUTF32Character(p
, pEnd
);
674 writeUTF8(aSink
, nChar
);
678 aValue
= aSink
.makeStringAndClear();
682 sal_Unicode
const * pStringEnd
= skipQuotedString(p
, pEnd
);
689 sal_Unicode
const * pTokenBegin
= p
;
690 while (p
!= pEnd
&& (isTokenChar(*p
) || !rtl::isAscii(*p
)))
692 if (p
== pTokenBegin
)
696 pTokenBegin
, p
- pTokenBegin
,
697 RTL_TEXTENCODING_UTF8
);
699 aList
.emplace_front(Parameter
{aAttribute
, aCharset
, aLanguage
, aValue
, nSection
, bExtended
});
702 return parseParameters(aList
, pParameters
) ? pParameterBegin
: pBegin
;
705 bool equalIgnoreCase(const char * pBegin1
,
707 const char * pString2
)
709 DBG_ASSERT(pBegin1
&& pBegin1
<= pEnd1
&& pString2
,
710 "equalIgnoreCase(): Bad sequences");
712 while (*pString2
!= 0)
714 || (rtl::toAsciiUpperCase(static_cast<unsigned char>(*pBegin1
++))
715 != rtl::toAsciiUpperCase(
716 static_cast<unsigned char>(*pString2
++))))
718 return pBegin1
== pEnd1
;
723 char const * m_aName
;
724 rtl_TextEncoding m_eEncoding
;
727 // The source for the following table is <ftp://ftp.iana.org/in-notes/iana/
728 // assignments/character-sets> as of Jan, 21 2000 12:46:00, unless otherwise
730 EncodingEntry
const aEncodingMap
[]
731 = { { "US-ASCII", RTL_TEXTENCODING_ASCII_US
},
732 { "ANSI_X3.4-1968", RTL_TEXTENCODING_ASCII_US
},
733 { "ISO-IR-6", RTL_TEXTENCODING_ASCII_US
},
734 { "ANSI_X3.4-1986", RTL_TEXTENCODING_ASCII_US
},
735 { "ISO_646.IRV:1991", RTL_TEXTENCODING_ASCII_US
},
736 { "ASCII", RTL_TEXTENCODING_ASCII_US
},
737 { "ISO646-US", RTL_TEXTENCODING_ASCII_US
},
738 { "US", RTL_TEXTENCODING_ASCII_US
},
739 { "IBM367", RTL_TEXTENCODING_ASCII_US
},
740 { "CP367", RTL_TEXTENCODING_ASCII_US
},
741 { "CSASCII", RTL_TEXTENCODING_ASCII_US
},
742 { "ISO-8859-1", RTL_TEXTENCODING_ISO_8859_1
},
743 { "ISO_8859-1:1987", RTL_TEXTENCODING_ISO_8859_1
},
744 { "ISO-IR-100", RTL_TEXTENCODING_ISO_8859_1
},
745 { "ISO_8859-1", RTL_TEXTENCODING_ISO_8859_1
},
746 { "LATIN1", RTL_TEXTENCODING_ISO_8859_1
},
747 { "L1", RTL_TEXTENCODING_ISO_8859_1
},
748 { "IBM819", RTL_TEXTENCODING_ISO_8859_1
},
749 { "CP819", RTL_TEXTENCODING_ISO_8859_1
},
750 { "CSISOLATIN1", RTL_TEXTENCODING_ISO_8859_1
},
751 { "ISO-8859-2", RTL_TEXTENCODING_ISO_8859_2
},
752 { "ISO_8859-2:1987", RTL_TEXTENCODING_ISO_8859_2
},
753 { "ISO-IR-101", RTL_TEXTENCODING_ISO_8859_2
},
754 { "ISO_8859-2", RTL_TEXTENCODING_ISO_8859_2
},
755 { "LATIN2", RTL_TEXTENCODING_ISO_8859_2
},
756 { "L2", RTL_TEXTENCODING_ISO_8859_2
},
757 { "CSISOLATIN2", RTL_TEXTENCODING_ISO_8859_2
},
758 { "ISO-8859-3", RTL_TEXTENCODING_ISO_8859_3
},
759 { "ISO_8859-3:1988", RTL_TEXTENCODING_ISO_8859_3
},
760 { "ISO-IR-109", RTL_TEXTENCODING_ISO_8859_3
},
761 { "ISO_8859-3", RTL_TEXTENCODING_ISO_8859_3
},
762 { "LATIN3", RTL_TEXTENCODING_ISO_8859_3
},
763 { "L3", RTL_TEXTENCODING_ISO_8859_3
},
764 { "CSISOLATIN3", RTL_TEXTENCODING_ISO_8859_3
},
765 { "ISO-8859-4", RTL_TEXTENCODING_ISO_8859_4
},
766 { "ISO_8859-4:1988", RTL_TEXTENCODING_ISO_8859_4
},
767 { "ISO-IR-110", RTL_TEXTENCODING_ISO_8859_4
},
768 { "ISO_8859-4", RTL_TEXTENCODING_ISO_8859_4
},
769 { "LATIN4", RTL_TEXTENCODING_ISO_8859_4
},
770 { "L4", RTL_TEXTENCODING_ISO_8859_4
},
771 { "CSISOLATIN4", RTL_TEXTENCODING_ISO_8859_4
},
772 { "ISO-8859-5", RTL_TEXTENCODING_ISO_8859_5
},
773 { "ISO_8859-5:1988", RTL_TEXTENCODING_ISO_8859_5
},
774 { "ISO-IR-144", RTL_TEXTENCODING_ISO_8859_5
},
775 { "ISO_8859-5", RTL_TEXTENCODING_ISO_8859_5
},
776 { "CYRILLIC", RTL_TEXTENCODING_ISO_8859_5
},
777 { "CSISOLATINCYRILLIC", RTL_TEXTENCODING_ISO_8859_5
},
778 { "ISO-8859-6", RTL_TEXTENCODING_ISO_8859_6
},
779 { "ISO_8859-6:1987", RTL_TEXTENCODING_ISO_8859_6
},
780 { "ISO-IR-127", RTL_TEXTENCODING_ISO_8859_6
},
781 { "ISO_8859-6", RTL_TEXTENCODING_ISO_8859_6
},
782 { "ECMA-114", RTL_TEXTENCODING_ISO_8859_6
},
783 { "ASMO-708", RTL_TEXTENCODING_ISO_8859_6
},
784 { "ARABIC", RTL_TEXTENCODING_ISO_8859_6
},
785 { "CSISOLATINARABIC", RTL_TEXTENCODING_ISO_8859_6
},
786 { "ISO-8859-7", RTL_TEXTENCODING_ISO_8859_7
},
787 { "ISO_8859-7:1987", RTL_TEXTENCODING_ISO_8859_7
},
788 { "ISO-IR-126", RTL_TEXTENCODING_ISO_8859_7
},
789 { "ISO_8859-7", RTL_TEXTENCODING_ISO_8859_7
},
790 { "ELOT_928", RTL_TEXTENCODING_ISO_8859_7
},
791 { "ECMA-118", RTL_TEXTENCODING_ISO_8859_7
},
792 { "GREEK", RTL_TEXTENCODING_ISO_8859_7
},
793 { "GREEK8", RTL_TEXTENCODING_ISO_8859_7
},
794 { "CSISOLATINGREEK", RTL_TEXTENCODING_ISO_8859_7
},
795 { "ISO-8859-8", RTL_TEXTENCODING_ISO_8859_8
},
796 { "ISO_8859-8:1988", RTL_TEXTENCODING_ISO_8859_8
},
797 { "ISO-IR-138", RTL_TEXTENCODING_ISO_8859_8
},
798 { "ISO_8859-8", RTL_TEXTENCODING_ISO_8859_8
},
799 { "HEBREW", RTL_TEXTENCODING_ISO_8859_8
},
800 { "CSISOLATINHEBREW", RTL_TEXTENCODING_ISO_8859_8
},
801 { "ISO-8859-9", RTL_TEXTENCODING_ISO_8859_9
},
802 { "ISO_8859-9:1989", RTL_TEXTENCODING_ISO_8859_9
},
803 { "ISO-IR-148", RTL_TEXTENCODING_ISO_8859_9
},
804 { "ISO_8859-9", RTL_TEXTENCODING_ISO_8859_9
},
805 { "LATIN5", RTL_TEXTENCODING_ISO_8859_9
},
806 { "L5", RTL_TEXTENCODING_ISO_8859_9
},
807 { "CSISOLATIN5", RTL_TEXTENCODING_ISO_8859_9
},
808 { "ISO-8859-14", RTL_TEXTENCODING_ISO_8859_14
}, // RFC 2047
809 { "ISO_8859-15", RTL_TEXTENCODING_ISO_8859_15
},
810 { "ISO-8859-15", RTL_TEXTENCODING_ISO_8859_15
}, // RFC 2047
811 { "MACINTOSH", RTL_TEXTENCODING_APPLE_ROMAN
},
812 { "MAC", RTL_TEXTENCODING_APPLE_ROMAN
},
813 { "CSMACINTOSH", RTL_TEXTENCODING_APPLE_ROMAN
},
814 { "IBM437", RTL_TEXTENCODING_IBM_437
},
815 { "CP437", RTL_TEXTENCODING_IBM_437
},
816 { "437", RTL_TEXTENCODING_IBM_437
},
817 { "CSPC8CODEPAGE437", RTL_TEXTENCODING_IBM_437
},
818 { "IBM850", RTL_TEXTENCODING_IBM_850
},
819 { "CP850", RTL_TEXTENCODING_IBM_850
},
820 { "850", RTL_TEXTENCODING_IBM_850
},
821 { "CSPC850MULTILINGUAL", RTL_TEXTENCODING_IBM_850
},
822 { "IBM860", RTL_TEXTENCODING_IBM_860
},
823 { "CP860", RTL_TEXTENCODING_IBM_860
},
824 { "860", RTL_TEXTENCODING_IBM_860
},
825 { "CSIBM860", RTL_TEXTENCODING_IBM_860
},
826 { "IBM861", RTL_TEXTENCODING_IBM_861
},
827 { "CP861", RTL_TEXTENCODING_IBM_861
},
828 { "861", RTL_TEXTENCODING_IBM_861
},
829 { "CP-IS", RTL_TEXTENCODING_IBM_861
},
830 { "CSIBM861", RTL_TEXTENCODING_IBM_861
},
831 { "IBM863", RTL_TEXTENCODING_IBM_863
},
832 { "CP863", RTL_TEXTENCODING_IBM_863
},
833 { "863", RTL_TEXTENCODING_IBM_863
},
834 { "CSIBM863", RTL_TEXTENCODING_IBM_863
},
835 { "IBM865", RTL_TEXTENCODING_IBM_865
},
836 { "CP865", RTL_TEXTENCODING_IBM_865
},
837 { "865", RTL_TEXTENCODING_IBM_865
},
838 { "CSIBM865", RTL_TEXTENCODING_IBM_865
},
839 { "IBM775", RTL_TEXTENCODING_IBM_775
},
840 { "CP775", RTL_TEXTENCODING_IBM_775
},
841 { "CSPC775BALTIC", RTL_TEXTENCODING_IBM_775
},
842 { "IBM852", RTL_TEXTENCODING_IBM_852
},
843 { "CP852", RTL_TEXTENCODING_IBM_852
},
844 { "852", RTL_TEXTENCODING_IBM_852
},
845 { "CSPCP852", RTL_TEXTENCODING_IBM_852
},
846 { "IBM855", RTL_TEXTENCODING_IBM_855
},
847 { "CP855", RTL_TEXTENCODING_IBM_855
},
848 { "855", RTL_TEXTENCODING_IBM_855
},
849 { "CSIBM855", RTL_TEXTENCODING_IBM_855
},
850 { "IBM857", RTL_TEXTENCODING_IBM_857
},
851 { "CP857", RTL_TEXTENCODING_IBM_857
},
852 { "857", RTL_TEXTENCODING_IBM_857
},
853 { "CSIBM857", RTL_TEXTENCODING_IBM_857
},
854 { "IBM862", RTL_TEXTENCODING_IBM_862
},
855 { "CP862", RTL_TEXTENCODING_IBM_862
},
856 { "862", RTL_TEXTENCODING_IBM_862
},
857 { "CSPC862LATINHEBREW", RTL_TEXTENCODING_IBM_862
},
858 { "IBM864", RTL_TEXTENCODING_IBM_864
},
859 { "CP864", RTL_TEXTENCODING_IBM_864
},
860 { "CSIBM864", RTL_TEXTENCODING_IBM_864
},
861 { "IBM866", RTL_TEXTENCODING_IBM_866
},
862 { "CP866", RTL_TEXTENCODING_IBM_866
},
863 { "866", RTL_TEXTENCODING_IBM_866
},
864 { "CSIBM866", RTL_TEXTENCODING_IBM_866
},
865 { "IBM869", RTL_TEXTENCODING_IBM_869
},
866 { "CP869", RTL_TEXTENCODING_IBM_869
},
867 { "869", RTL_TEXTENCODING_IBM_869
},
868 { "CP-GR", RTL_TEXTENCODING_IBM_869
},
869 { "CSIBM869", RTL_TEXTENCODING_IBM_869
},
870 { "WINDOWS-1250", RTL_TEXTENCODING_MS_1250
},
871 { "WINDOWS-1251", RTL_TEXTENCODING_MS_1251
},
872 { "WINDOWS-1253", RTL_TEXTENCODING_MS_1253
},
873 { "WINDOWS-1254", RTL_TEXTENCODING_MS_1254
},
874 { "WINDOWS-1255", RTL_TEXTENCODING_MS_1255
},
875 { "WINDOWS-1256", RTL_TEXTENCODING_MS_1256
},
876 { "WINDOWS-1257", RTL_TEXTENCODING_MS_1257
},
877 { "WINDOWS-1258", RTL_TEXTENCODING_MS_1258
},
878 { "SHIFT_JIS", RTL_TEXTENCODING_SHIFT_JIS
},
879 { "MS_KANJI", RTL_TEXTENCODING_SHIFT_JIS
},
880 { "CSSHIFTJIS", RTL_TEXTENCODING_SHIFT_JIS
},
881 { "GB2312", RTL_TEXTENCODING_GB_2312
},
882 { "CSGB2312", RTL_TEXTENCODING_GB_2312
},
883 { "BIG5", RTL_TEXTENCODING_BIG5
},
884 { "CSBIG5", RTL_TEXTENCODING_BIG5
},
885 { "EUC-JP", RTL_TEXTENCODING_EUC_JP
},
886 { "EXTENDED_UNIX_CODE_PACKED_FORMAT_FOR_JAPANESE",
887 RTL_TEXTENCODING_EUC_JP
},
888 { "CSEUCPKDFMTJAPANESE", RTL_TEXTENCODING_EUC_JP
},
889 { "ISO-2022-JP", RTL_TEXTENCODING_ISO_2022_JP
},
890 { "CSISO2022JP", RTL_TEXTENCODING_ISO_2022_JP
},
891 { "ISO-2022-CN", RTL_TEXTENCODING_ISO_2022_CN
},
892 { "KOI8-R", RTL_TEXTENCODING_KOI8_R
},
893 { "CSKOI8R", RTL_TEXTENCODING_KOI8_R
},
894 { "UTF-7", RTL_TEXTENCODING_UTF7
},
895 { "UTF-8", RTL_TEXTENCODING_UTF8
},
896 { "ISO-8859-10", RTL_TEXTENCODING_ISO_8859_10
}, // RFC 2047
897 { "ISO-8859-13", RTL_TEXTENCODING_ISO_8859_13
}, // RFC 2047
898 { "EUC-KR", RTL_TEXTENCODING_EUC_KR
},
899 { "CSEUCKR", RTL_TEXTENCODING_EUC_KR
},
900 { "ISO-2022-KR", RTL_TEXTENCODING_ISO_2022_KR
},
901 { "CSISO2022KR", RTL_TEXTENCODING_ISO_2022_KR
},
902 { "ISO-10646-UCS-4", RTL_TEXTENCODING_UCS4
},
903 { "CSUCS4", RTL_TEXTENCODING_UCS4
},
904 { "ISO-10646-UCS-2", RTL_TEXTENCODING_UCS2
},
905 { "CSUNICODE", RTL_TEXTENCODING_UCS2
} };
907 rtl_TextEncoding
getCharsetEncoding(char const * pBegin
,
910 for (const EncodingEntry
& i
: aEncodingMap
)
911 if (equalIgnoreCase(pBegin
, pEnd
, i
.m_aName
))
912 return i
.m_eEncoding
;
913 return RTL_TEXTENCODING_DONTKNOW
;
921 bool INetMIME::isAtomChar(sal_uInt32 nChar
)
923 static const bool aMap
[128]
924 = { false, false, false, false, false, false, false, false,
925 false, false, false, false, false, false, false, false,
926 false, false, false, false, false, false, false, false,
927 false, false, false, false, false, false, false, false,
928 false, true, false, true, true, true, true, true, // !"#$%&'
929 false, false, true, true, false, true, false, true, //()*+,-./
930 true, true, true, true, true, true, true, true, //01234567
931 true, true, false, false, false, true, false, true, //89:;<=>?
932 false, true, true, true, true, true, true, true, //@ABCDEFG
933 true, true, true, true, true, true, true, true, //HIJKLMNO
934 true, true, true, true, true, true, true, true, //PQRSTUVW
935 true, true, true, false, false, false, true, true, //XYZ[\]^_
936 true, true, true, true, true, true, true, true, //`abcdefg
937 true, true, true, true, true, true, true, true, //hijklmno
938 true, true, true, true, true, true, true, true, //pqrstuvw
939 true, true, true, true, true, true, true, false //xyz{|}~
941 return rtl::isAscii(nChar
) && aMap
[nChar
];
945 bool INetMIME::isIMAPAtomChar(sal_uInt32 nChar
)
947 static const bool aMap
[128]
948 = { false, false, false, false, false, false, false, false,
949 false, false, false, false, false, false, false, false,
950 false, false, false, false, false, false, false, false,
951 false, false, false, false, false, false, false, false,
952 false, true, false, true, true, false, true, true, // !"#$%&'
953 false, false, false, true, true, true, true, true, //()*+,-./
954 true, true, true, true, true, true, true, true, //01234567
955 true, true, true, true, true, true, true, true, //89:;<=>?
956 true, true, true, true, true, true, true, true, //@ABCDEFG
957 true, true, true, true, true, true, true, true, //HIJKLMNO
958 true, true, true, true, true, true, true, true, //PQRSTUVW
959 true, true, true, true, false, true, true, true, //XYZ[\]^_
960 true, true, true, true, true, true, true, true, //`abcdefg
961 true, true, true, true, true, true, true, true, //hijklmno
962 true, true, true, true, true, true, true, true, //pqrstuvw
963 true, true, true, false, true, true, true, false //xyz{|}~
965 return rtl::isAscii(nChar
) && aMap
[nChar
];
969 bool INetMIME::equalIgnoreCase(const sal_Unicode
* pBegin1
,
970 const sal_Unicode
* pEnd1
,
971 const char * pString2
)
973 DBG_ASSERT(pBegin1
&& pBegin1
<= pEnd1
&& pString2
,
974 "INetMIME::equalIgnoreCase(): Bad sequences");
976 while (*pString2
!= 0)
978 || (rtl::toAsciiUpperCase(*pBegin1
++)
979 != rtl::toAsciiUpperCase(
980 static_cast<unsigned char>(*pString2
++))))
982 return pBegin1
== pEnd1
;
986 bool INetMIME::scanUnsigned(const sal_Unicode
*& rBegin
,
987 const sal_Unicode
* pEnd
, bool bLeadingZeroes
,
990 sal_uInt64 nTheValue
= 0;
991 const sal_Unicode
* p
= rBegin
;
992 for ( ; p
!= pEnd
; ++p
)
994 int nWeight
= getWeight(*p
);
997 nTheValue
= 10 * nTheValue
+ nWeight
;
998 if (nTheValue
> std::numeric_limits
< sal_uInt32
>::max())
1001 if (nTheValue
== 0 && (p
== rBegin
|| (!bLeadingZeroes
&& p
- rBegin
!= 1)))
1004 rValue
= sal_uInt32(nTheValue
);
1009 sal_Unicode
const * INetMIME::scanContentType(
1010 std::u16string_view rStr
, OUString
* pType
,
1011 OUString
* pSubType
, INetContentTypeParameterList
* pParameters
)
1013 sal_Unicode
const * pBegin
= rStr
.data();
1014 sal_Unicode
const * pEnd
= pBegin
+ rStr
.size();
1015 sal_Unicode
const * p
= skipLinearWhiteSpaceComment(pBegin
, pEnd
);
1016 sal_Unicode
const * pTypeBegin
= p
;
1017 while (p
!= pEnd
&& isTokenChar(*p
))
1021 if (p
== pTypeBegin
)
1023 sal_Unicode
const * pTypeEnd
= p
;
1025 p
= skipLinearWhiteSpaceComment(p
, pEnd
);
1026 if (p
== pEnd
|| *p
++ != '/')
1029 p
= skipLinearWhiteSpaceComment(p
, pEnd
);
1030 sal_Unicode
const * pSubTypeBegin
= p
;
1031 while (p
!= pEnd
&& isTokenChar(*p
))
1035 if (p
== pSubTypeBegin
)
1037 sal_Unicode
const * pSubTypeEnd
= p
;
1039 if (pType
!= nullptr)
1041 *pType
= OUString(pTypeBegin
, pTypeEnd
- pTypeBegin
).toAsciiLowerCase();
1043 if (pSubType
!= nullptr)
1045 *pSubType
= OUString(pSubTypeBegin
, pSubTypeEnd
- pSubTypeBegin
)
1046 .toAsciiLowerCase();
1049 return scanParameters(p
, pEnd
, pParameters
);
1053 OUString
INetMIME::decodeHeaderFieldBody(const OString
& rBody
)
1055 // Due to a bug in INetCoreRFC822MessageStream::ConvertTo7Bit(), old
1056 // versions of StarOffice send mails with header fields where encoded
1057 // words can be preceded by '=', ',', '.', '"', or '(', and followed by
1058 // '=', ',', '.', '"', ')', without any required white space in between.
1059 // And there appear to exist some broken mailers that only encode single
1060 // letters within words, like "Appel
1061 // =?iso-8859-1?Q?=E0?=t=?iso-8859-1?Q?=E9?=moin", so it seems best to
1062 // detect encoded words even when not properly surrounded by white space.
1064 // Non US-ASCII characters in rBody are treated as ISO-8859-1.
1066 // encoded-word = "=?"
1067 // 1*(%x21 / %x23-27 / %x2A-2B / %x2D / %x30-39 / %x41-5A / %x5E-7E)
1068 // ["*" 1*8ALPHA *("-" 1*8ALPHA)] "?"
1069 // ("B?" *(4base64) (4base64 / 3base64 "=" / 2base64 "==")
1070 // / "Q?" 1*(%x21-3C / %x3E / %x40-7E / "=" 2HEXDIG))
1073 // base64 = ALPHA / DIGIT / "+" / "/"
1075 const char * pBegin
= rBody
.getStr();
1076 const char * pEnd
= pBegin
+ rBody
.getLength();
1078 OUStringBuffer sDecoded
;
1079 const char * pCopyBegin
= pBegin
;
1081 /* bool bStartEncodedWord = true; */
1082 const char * pWSPBegin
= pBegin
;
1084 for (const char * p
= pBegin
; p
!= pEnd
;)
1086 if (*p
== '=' /* && bStartEncodedWord */)
1088 const char * q
= p
+ 1;
1089 bool bEncodedWord
= q
!= pEnd
&& *q
++ == '?';
1091 rtl_TextEncoding eCharsetEncoding
= RTL_TEXTENCODING_DONTKNOW
;
1094 const char * pCharsetBegin
= q
;
1095 const char * pLanguageBegin
= nullptr;
1096 int nAlphaCount
= 0;
1097 for (bool bDone
= false; !bDone
;)
1100 bEncodedWord
= false;
1109 pLanguageBegin
= q
- 1;
1114 if (pLanguageBegin
!= nullptr)
1116 if (nAlphaCount
== 0)
1117 pLanguageBegin
= nullptr;
1124 if (pCharsetBegin
== q
- 1)
1125 bEncodedWord
= false;
1129 = getCharsetEncoding(
1131 pLanguageBegin
== nullptr
1132 || nAlphaCount
== 0 ?
1133 q
- 1 : pLanguageBegin
);
1134 bEncodedWord
= isMIMECharsetEncoding(
1137 = translateFromMIME(eCharsetEncoding
);
1143 if (pLanguageBegin
!= nullptr
1144 && (!rtl::isAsciiAlpha(
1145 static_cast<unsigned char>(cChar
))
1146 || ++nAlphaCount
> 8))
1147 pLanguageBegin
= nullptr;
1153 bool bEncodingB
= false;
1157 bEncodedWord
= false;
1173 bEncodedWord
= false;
1179 bEncodedWord
= bEncodedWord
&& q
!= pEnd
&& *q
++ == '?';
1181 OStringBuffer sText
;
1186 for (bool bDone
= false; !bDone
;)
1190 bEncodedWord
= false;
1195 bool bFinal
= false;
1197 sal_uInt32 nValue
= 0;
1198 for (int nShift
= 18; nShift
>= 0; nShift
-= 6)
1200 int nWeight
= getBase64Weight(*q
++);
1203 bEncodedWord
= false;
1213 bEncodedWord
= false;
1218 nCount
= nShift
== 6 ? 1 : 2;
1222 nValue
|= nWeight
<< nShift
;
1226 for (int nShift
= 16; nCount
-- > 0; nShift
-= 8)
1227 sText
.append(char(nValue
>> nShift
& 0xFF));
1233 if (bFinal
&& !bDone
)
1235 bEncodedWord
= false;
1244 const char * pEncodedTextBegin
= q
;
1245 const char * pEncodedTextCopyBegin
= q
;
1246 for (bool bDone
= false; !bDone
;)
1249 bEncodedWord
= false;
1254 sal_uInt32 nChar
= static_cast<unsigned char>(*q
++);
1261 bEncodedWord
= false;
1265 int nDigit1
= getHexWeight(q
[0]);
1266 int nDigit2
= getHexWeight(q
[1]);
1267 if (nDigit1
< 0 || nDigit2
< 0)
1269 bEncodedWord
= false;
1275 (pEncodedTextCopyBegin
- pBegin
),
1276 (q
- 1 - pEncodedTextCopyBegin
))
1277 + OStringChar(char(nDigit1
<< 4 | nDigit2
)));
1279 pEncodedTextCopyBegin
= q
;
1284 if (q
- pEncodedTextBegin
> 1)
1285 sText
.append(rBody
.subView(
1286 (pEncodedTextCopyBegin
- pBegin
),
1287 (q
- 1 - pEncodedTextCopyBegin
)));
1289 bEncodedWord
= false;
1296 (pEncodedTextCopyBegin
- pBegin
),
1297 (q
- 1 - pEncodedTextCopyBegin
))
1298 + OString::Concat(" "));
1299 pEncodedTextCopyBegin
= q
;
1303 if (!isVisible(nChar
))
1305 bEncodedWord
= false;
1314 bEncodedWord
= bEncodedWord
&& q
!= pEnd
&& *q
++ == '=';
1316 std::unique_ptr
<sal_Unicode
[]> pUnicodeBuffer
;
1317 sal_Size nUnicodeSize
= 0;
1321 = convertToUnicode(sText
.getStr(),
1322 sText
.getStr() + sText
.getLength(),
1323 eCharsetEncoding
, nUnicodeSize
);
1324 if (!pUnicodeBuffer
)
1325 bEncodedWord
= false;
1330 appendISO88591(sDecoded
, pCopyBegin
, pWSPBegin
);
1332 pUnicodeBuffer
.get(),
1333 static_cast< sal_Int32
>(nUnicodeSize
));
1334 pUnicodeBuffer
.reset();
1339 while (p
!= pEnd
&& isWhiteSpace(*p
))
1341 /* bStartEncodedWord = p != pWSPBegin; */
1352 /* bStartEncodedWord = true; */
1356 /* bStartEncodedWord = true; */
1360 /* bStartEncodedWord = false; */
1365 const char * pUTF8Begin
= p
- 1;
1366 const char * pUTF8End
= pUTF8Begin
;
1367 sal_uInt32 nCharacter
= 0;
1368 if (translateUTF8Char(pUTF8End
, pEnd
, nCharacter
))
1370 appendISO88591(sDecoded
, pCopyBegin
, p
- 1);
1371 sDecoded
.appendUtf32(nCharacter
);
1375 /* bStartEncodedWord = false; */
1382 appendISO88591(sDecoded
, pCopyBegin
, pEnd
);
1383 return sDecoded
.makeStringAndClear();
1386 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */