1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
3 * This file is part of the LibreOffice project.
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 * This file incorporates work covered by the following license notice:
11 * Licensed to the Apache Software Foundation (ASF) under one or more
12 * contributor license agreements. See the NOTICE file distributed
13 * with this work for additional information regarding copyright
14 * ownership. The ASF licenses this file to you under the Apache
15 * License, Version 2.0 (the "License"); you may not use this file
16 * except in compliance with the License. You may obtain a copy of
17 * the License at http://www.apache.org/licenses/LICENSE-2.0 .
22 #include <forward_list>
25 #include <sal/log.hxx>
26 #include <rtl/ustring.hxx>
27 #include <rtl/strbuf.hxx>
28 #include <rtl/ustrbuf.hxx>
29 #include <rtl/tencinfo.h>
30 #include <tools/debug.hxx>
31 #include <tools/inetmime.hxx>
32 #include <rtl/character.hxx>
36 rtl_TextEncoding
getCharsetEncoding(const char * pBegin
,
39 /** Check for US-ASCII white space character.
41 @param nChar Some UCS-4 character.
43 @return True if nChar is a US-ASCII white space character (US-ASCII
46 bool isWhiteSpace(sal_uInt32 nChar
)
48 return nChar
== '\t' || nChar
== ' ';
51 /** Get the Base 64 digit weight of a US-ASCII character.
53 @param nChar Some UCS-4 character.
55 @return If nChar is a US-ASCII Base 64 digit character (US-ASCII
56 'A'--'F', or 'a'--'f', '0'--'9', '+', or '/'), return the
57 corresponding weight (0--63); if nChar is the US-ASCII Base 64 padding
58 character (US-ASCII '='), return -1; otherwise, return -2.
60 int getBase64Weight(sal_uInt32 nChar
)
62 return rtl::isAsciiUpperCase(nChar
) ? int(nChar
- 'A') :
63 rtl::isAsciiLowerCase(nChar
) ? int(nChar
- 'a' + 26) :
64 rtl::isAsciiDigit(nChar
) ? int(nChar
- '0' + 52) :
67 nChar
== '=' ? -1 : -2;
70 bool startsWithLineFolding(const sal_Unicode
* pBegin
,
71 const sal_Unicode
* pEnd
)
73 DBG_ASSERT(pBegin
&& pBegin
<= pEnd
,
74 "startsWithLineFolding(): Bad sequence");
76 return pEnd
- pBegin
>= 3 && pBegin
[0] == 0x0D && pBegin
[1] == 0x0A
77 && isWhiteSpace(pBegin
[2]); // CR, LF
// Translate the text encoding named by a MIME charset into the encoding that
// is actually used for decoding.  The return statement shown hands back
// RTL_TEXTENCODING_MS_1252 in place of RTL_TEXTENCODING_ISO_8859_1 and passes
// every other encoding through unchanged.
// NOTE(review): the end of the parameter list and the lines surrounding the
// return statement are not visible in this view -- confirm against the full
// file whether the substitution is unconditional.
80 rtl_TextEncoding
translateFromMIME(rtl_TextEncoding
84 return eEncoding
== RTL_TEXTENCODING_ISO_8859_1
?
85 RTL_TEXTENCODING_MS_1252
: eEncoding
;
91 bool isMIMECharsetEncoding(rtl_TextEncoding eEncoding
)
93 return rtl_isOctetTextEncoding(eEncoding
);
96 std::unique_ptr
<sal_Unicode
[]> convertToUnicode(const char * pBegin
,
98 rtl_TextEncoding eEncoding
,
101 if (eEncoding
== RTL_TEXTENCODING_DONTKNOW
)
103 rtl_TextToUnicodeConverter hConverter
104 = rtl_createTextToUnicodeConverter(eEncoding
);
105 rtl_TextToUnicodeContext hContext
106 = rtl_createTextToUnicodeContext(hConverter
);
107 std::unique_ptr
<sal_Unicode
[]> pBuffer
;
109 for (sal_Size nBufferSize
= pEnd
- pBegin
;;
110 nBufferSize
+= nBufferSize
/ 3 + 1)
112 pBuffer
.reset(new sal_Unicode
[nBufferSize
]);
113 sal_Size nSrcCvtBytes
;
114 rSize
= rtl_convertTextToUnicode(
115 hConverter
, hContext
, pBegin
, pEnd
- pBegin
, pBuffer
.get(),
117 RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_ERROR
118 | RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_ERROR
119 | RTL_TEXTTOUNICODE_FLAGS_INVALID_ERROR
,
120 &nInfo
, &nSrcCvtBytes
);
121 if (nInfo
!= RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOOSMALL
)
124 rtl_resetTextToUnicodeContext(hConverter
, hContext
);
126 rtl_destroyTextToUnicodeContext(hConverter
, hContext
);
127 rtl_destroyTextToUnicodeConverter(hConverter
);
// Append the UTF-8 encoding of the UCS-4 character nChar to rSink.
// The visible branches choose the sequence length from the magnitude of
// nChar: two bytes below 0x800, three below 0x10000, four below 0x200000,
// five below 0x4000000, and the last group of lines emits six bytes (lead
// byte 0xFC).  Each continuation byte carries six payload bits (mask 0x3F)
// tagged with 0x80.  Values of 0x80000000 and above trip the DBG_ASSERT.
// NOTE(review): the condition in front of the single-byte append and several
// "append(" openers are not visible in this view -- presumably the single
// byte case is nChar < 0x80; confirm against the full file.
135 void writeUTF8(OStringBuffer
& rSink
, sal_uInt32 nChar
)
137 // See RFC 2279 for a discussion of UTF-8.
138 DBG_ASSERT(nChar
< 0x80000000, "writeUTF8(): Bad char");
// single byte: the character value is appended as is
141 rSink
.append(char(nChar
));
// two bytes: lead byte 0xC0
142 else if (nChar
< 0x800)
143 rSink
.append(OStringChar(char(nChar
>> 6 | 0xC0))
144 + OStringChar(char((nChar
& 0x3F) | 0x80)));
// three bytes: lead byte 0xE0
145 else if (nChar
< 0x10000)
147 OStringChar(char(nChar
>> 12 | 0xE0))
148 + OStringChar(char((nChar
>> 6 & 0x3F) | 0x80))
149 + OStringChar(char((nChar
& 0x3F) | 0x80)));
// four bytes: lead byte 0xF0
150 else if (nChar
< 0x200000)
152 OStringChar(char(nChar
>> 18 | 0xF0))
153 + OStringChar(char((nChar
>> 12 & 0x3F) | 0x80))
154 + OStringChar(char((nChar
>> 6 & 0x3F) | 0x80))
155 + OStringChar(char((nChar
& 0x3F) | 0x80)));
// five bytes: lead byte 0xF8
156 else if (nChar
< 0x4000000)
158 OStringChar(char(nChar
>> 24 | 0xF8))
159 + OStringChar(char((nChar
>> 18 & 0x3F) | 0x80))
160 + OStringChar(char((nChar
>> 12 & 0x3F) | 0x80))
161 + OStringChar(char((nChar
>> 6 & 0x3F) | 0x80))
162 + OStringChar(char((nChar
& 0x3F) | 0x80)));
// six bytes: lead byte 0xFC
165 OStringChar(char(nChar
>> 30 | 0xFC))
166 + OStringChar(char((nChar
>> 24 & 0x3F) | 0x80))
167 + OStringChar(char((nChar
>> 18 & 0x3F) | 0x80))
168 + OStringChar(char((nChar
>> 12 & 0x3F) | 0x80))
169 + OStringChar(char((nChar
>> 6 & 0x3F) | 0x80))
170 + OStringChar(char((nChar
& 0x3F) | 0x80)));
173 bool translateUTF8Char(const char *& rBegin
,
175 sal_uInt32
& rCharacter
)
177 if (rBegin
== pEnd
|| static_cast< unsigned char >(*rBegin
) < 0x80
178 || static_cast< unsigned char >(*rBegin
) >= 0xFE)
184 const char * p
= rBegin
;
185 if (static_cast< unsigned char >(*p
) < 0xE0)
189 nUCS4
= static_cast< unsigned char >(*p
) & 0x1F;
191 else if (static_cast< unsigned char >(*p
) < 0xF0)
195 nUCS4
= static_cast< unsigned char >(*p
) & 0xF;
197 else if (static_cast< unsigned char >(*p
) < 0xF8)
201 nUCS4
= static_cast< unsigned char >(*p
) & 7;
203 else if (static_cast< unsigned char >(*p
) < 0xFC)
207 nUCS4
= static_cast< unsigned char >(*p
) & 3;
213 nUCS4
= static_cast< unsigned char >(*p
) & 1;
217 for (; nCount
-- > 0; ++p
)
218 if ((static_cast< unsigned char >(*p
) & 0xC0) == 0x80)
219 nUCS4
= (nUCS4
<< 6) | (static_cast< unsigned char >(*p
) & 0x3F);
223 if (!rtl::isUnicodeCodePoint(nUCS4
) || nUCS4
< nMin
)
231 void appendISO88591(OUStringBuffer
& rText
, char const * pBegin
,
236 OString m_aAttribute
;
240 sal_uInt32 m_nSection
;
243 bool operator<(const Parameter
& rhs
) const // is used by std::list<Parameter>::sort
245 int nComp
= m_aAttribute
.compareTo(rhs
.m_aAttribute
);
247 (nComp
== 0 && m_nSection
< rhs
.m_nSection
);
249 struct IsSameSection
// is used to check container for duplicates with std::any_of
251 const OString
& rAttribute
;
252 const sal_uInt32 nSection
;
253 bool operator()(const Parameter
& r
) const
254 { return r
.m_aAttribute
== rAttribute
&& r
.m_nSection
== nSection
; }
258 typedef std::forward_list
<Parameter
> ParameterList
;
260 bool parseParameters(ParameterList
const & rInput
,
261 INetContentTypeParameterList
* pOutput
);
// Append the bytes in [pBegin, pEnd) to rText, widening each byte to one
// sal_Unicode through unsigned char, so that a byte value b becomes the
// character with code b (the ISO 8859-1 interpretation the name refers to).
// The characters are first gathered in a temporary buffer of nLength
// elements and then appended in a single call.
// NOTE(review): the line declaring pEnd is not visible in this view; from
// its use it is presumably a second "char const *" parameter -- confirm.
265 void appendISO88591(OUStringBuffer
& rText
, char const * pBegin
,
// number of bytes to convert
268 sal_Int32 nLength
= pEnd
- pBegin
;
269 std::unique_ptr
<sal_Unicode
[]> pBuffer(new sal_Unicode
[nLength
]);
// the cast to unsigned char keeps bytes >= 0x80 from being sign-extended
270 for (sal_Unicode
* p
= pBuffer
.get(); pBegin
!= pEnd
;)
271 *p
++ = static_cast<unsigned char>(*pBegin
++);
272 rText
.append(pBuffer
.get(), nLength
);
277 bool parseParameters(ParameterList
const & rInput
,
278 INetContentTypeParameterList
* pOutput
)
283 for (auto it
= rInput
.begin(), itPrev
= rInput
.end(); it
!= rInput
.end() ; itPrev
= it
++)
285 if (it
->m_nSection
> 0
286 && (itPrev
== rInput
.end()
287 || itPrev
->m_nSection
!= it
->m_nSection
- 1
288 || itPrev
->m_aAttribute
!= it
->m_aAttribute
))
293 for (auto it
= rInput
.begin(), itNext
= rInput
.begin(); it
!= rInput
.end(); it
= itNext
)
295 bool bCharset
= !it
->m_aCharset
.isEmpty();
296 rtl_TextEncoding eEncoding
= RTL_TEXTENCODING_DONTKNOW
;
299 = getCharsetEncoding(it
->m_aCharset
.getStr(),
300 it
->m_aCharset
.getStr()
301 + it
->m_aCharset
.getLength());
302 OUStringBuffer
aValue(64);
303 bool bBadEncoding
= false;
308 std::unique_ptr
<sal_Unicode
[]> pUnicode
309 = convertToUnicode(itNext
->m_aValue
.getStr(),
310 itNext
->m_aValue
.getStr()
311 + itNext
->m_aValue
.getLength(),
312 bCharset
&& it
->m_bExtended
?
314 RTL_TEXTENCODING_UTF8
,
316 if (!pUnicode
&& !(bCharset
&& it
->m_bExtended
))
317 pUnicode
= convertToUnicode(
318 itNext
->m_aValue
.getStr(),
319 itNext
->m_aValue
.getStr()
320 + itNext
->m_aValue
.getLength(),
321 RTL_TEXTENCODING_ISO_8859_1
, nSize
);
327 aValue
.append(pUnicode
.get(), static_cast<sal_Int32
>(nSize
));
330 while (itNext
!= rInput
.end() && itNext
->m_nSection
!= 0);
338 if (itNext
->m_bExtended
)
340 for (sal_Int32 i
= 0; i
< itNext
->m_aValue
.getLength(); ++i
)
342 static_cast<sal_Unicode
>(
343 static_cast<unsigned char>(itNext
->m_aValue
[i
])
344 | 0xF800)); // map to unicode corporate use sub area
348 for (sal_Int32 i
= 0; i
< itNext
->m_aValue
.getLength(); ++i
)
349 aValue
.append( itNext
->m_aValue
[i
] );
353 while (itNext
!= rInput
.end() && itNext
->m_nSection
!= 0);
355 auto const ret
= pOutput
->insert(
357 {it
->m_aCharset
, it
->m_aLanguage
, aValue
.makeStringAndClear(), !bBadEncoding
}});
358 SAL_INFO_IF(!ret
.second
, "tools",
359 "INetMIME: dropping duplicate parameter: " << it
->m_aAttribute
);
364 /** Check whether some character is valid within an RFC 2045 <token>.
366 @param nChar Some UCS-4 character.
368 @return True if nChar is valid within an RFC 2047 <token> (US-ASCII
369 'A'--'Z', 'a'--'z', '0'--'9', '!', '#', '$', '%', '&', ''', '*', '+',
370 '-', '.', '^', '_', '`', '{', '|', '}', or '~').
372 bool isTokenChar(sal_uInt32 nChar
)
374 static const bool aMap
[128]
375 = { false, false, false, false, false, false, false, false,
376 false, false, false, false, false, false, false, false,
377 false, false, false, false, false, false, false, false,
378 false, false, false, false, false, false, false, false,
379 false, true, false, true, true, true, true, true, // !"#$%&'
380 false, false, true, true, false, true, true, false, //()*+,-./
381 true, true, true, true, true, true, true, true, //01234567
382 true, true, false, false, false, false, false, false, //89:;<=>?
383 false, true, true, true, true, true, true, true, //@ABCDEFG
384 true, true, true, true, true, true, true, true, //HIJKLMNO
385 true, true, true, true, true, true, true, true, //PQRSTUVW
386 true, true, true, false, false, false, true, true, //XYZ[\]^_
387 true, true, true, true, true, true, true, true, //`abcdefg
388 true, true, true, true, true, true, true, true, //hijklmno
389 true, true, true, true, true, true, true, true, //pqrstuvw
390 true, true, true, true, true, true, true, false //xyz{|}~
392 return rtl::isAscii(nChar
) && aMap
[nChar
];
395 const sal_Unicode
* skipComment(const sal_Unicode
* pBegin
,
396 const sal_Unicode
* pEnd
)
398 DBG_ASSERT(pBegin
&& pBegin
<= pEnd
,
399 "skipComment(): Bad sequence");
401 if (pBegin
!= pEnd
&& *pBegin
== '(')
403 sal_uInt32 nLevel
= 0;
404 for (const sal_Unicode
* p
= pBegin
; p
!= pEnd
;)
425 const sal_Unicode
* skipLinearWhiteSpaceComment(const sal_Unicode
*
430 DBG_ASSERT(pBegin
&& pBegin
<= pEnd
,
431 "skipLinearWhiteSpaceComment(): Bad sequence");
433 while (pBegin
!= pEnd
)
442 if (startsWithLineFolding(pBegin
, pEnd
))
450 const sal_Unicode
* p
= skipComment(pBegin
, pEnd
);
463 const sal_Unicode
* skipQuotedString(const sal_Unicode
* pBegin
,
464 const sal_Unicode
* pEnd
)
466 DBG_ASSERT(pBegin
&& pBegin
<= pEnd
,
467 "skipQuotedString(): Bad sequence");
469 if (pBegin
!= pEnd
&& *pBegin
== '"')
470 for (const sal_Unicode
* p
= pBegin
+ 1; p
!= pEnd
;)
474 if (pEnd
- p
< 2 || *p
++ != 0x0A // LF
475 || !isWhiteSpace(*p
++))
490 sal_Unicode
const * scanParameters(sal_Unicode
const * pBegin
,
491 sal_Unicode
const * pEnd
,
492 INetContentTypeParameterList
*
496 sal_Unicode
const * pParameterBegin
= pBegin
;
497 for (sal_Unicode
const * p
= pParameterBegin
;;)
499 pParameterBegin
= skipLinearWhiteSpaceComment(p
, pEnd
);
500 if (pParameterBegin
== pEnd
|| *pParameterBegin
!= ';')
502 p
= pParameterBegin
+ 1;
504 sal_Unicode
const * pAttributeBegin
505 = skipLinearWhiteSpaceComment(p
, pEnd
);
507 bool bDowncaseAttribute
= false;
508 while (p
!= pEnd
&& isTokenChar(*p
) && *p
!= '*')
510 bDowncaseAttribute
= bDowncaseAttribute
|| rtl::isAsciiUpperCase(*p
);
513 if (p
== pAttributeBegin
)
515 OString
aAttribute(pAttributeBegin
, p
- pAttributeBegin
, RTL_TEXTENCODING_ASCII_US
);
516 if (bDowncaseAttribute
)
517 aAttribute
= aAttribute
.toAsciiLowerCase();
519 sal_uInt32 nSection
= 0;
520 if (p
!= pEnd
&& *p
== '*')
523 if (p
!= pEnd
&& rtl::isAsciiDigit(*p
)
524 && !INetMIME::scanUnsigned(p
, pEnd
, false, nSection
))
528 bool bPresent
= std::any_of(aList
.begin(), aList
.end(),
529 Parameter::IsSameSection
{aAttribute
, nSection
});
533 bool bExtended
= false;
534 if (p
!= pEnd
&& *p
== '*')
540 p
= skipLinearWhiteSpaceComment(p
, pEnd
);
542 if (p
== pEnd
|| *p
!= '=')
545 p
= skipLinearWhiteSpaceComment(p
+ 1, pEnd
);
554 sal_Unicode
const * pCharsetBegin
= p
;
555 bool bDowncaseCharset
= false;
556 while (p
!= pEnd
&& isTokenChar(*p
) && *p
!= '\'')
558 bDowncaseCharset
= bDowncaseCharset
|| rtl::isAsciiUpperCase(*p
);
561 if (p
== pCharsetBegin
)
568 RTL_TEXTENCODING_ASCII_US
);
569 if (bDowncaseCharset
)
570 aCharset
= aCharset
.toAsciiLowerCase();
573 if (p
== pEnd
|| *p
!= '\'')
577 sal_Unicode
const * pLanguageBegin
= p
;
578 bool bDowncaseLanguage
= false;
580 for (; p
!= pEnd
; ++p
)
581 if (rtl::isAsciiAlpha(*p
))
585 bDowncaseLanguage
= bDowncaseLanguage
586 || rtl::isAsciiUpperCase(*p
);
596 if (nLetters
== 0 || nLetters
> 8)
603 RTL_TEXTENCODING_ASCII_US
);
604 if (bDowncaseLanguage
)
605 aLanguage
= aLanguage
.toAsciiLowerCase();
608 if (p
== pEnd
|| *p
!= '\'')
618 sal_uInt32 nChar
= INetMIME::getUTF32Character(q
, pEnd
);
619 if (rtl::isAscii(nChar
) && !isTokenChar(nChar
))
622 if (nChar
== '%' && p
+ 1 < pEnd
)
624 int nWeight1
= INetMIME::getHexWeight(p
[0]);
625 int nWeight2
= INetMIME::getHexWeight(p
[1]);
626 if (nWeight1
>= 0 && nWeight2
>= 0)
628 aSink
.append(char(nWeight1
<< 4 | nWeight2
));
633 writeUTF8(aSink
, nChar
);
635 aValue
= aSink
.makeStringAndClear();
638 while (p
!= pEnd
&& (isTokenChar(*p
) || !rtl::isAscii(*p
)))
641 else if (p
!= pEnd
&& *p
== '"')
644 OStringBuffer
aSink(256);
645 bool bInvalid
= false;
653 sal_uInt32 nChar
= INetMIME::getUTF32Character(p
, pEnd
);
656 else if (nChar
== 0x0D) // CR
658 if (pEnd
- p
< 2 || *p
++ != 0x0A // LF
659 || !isWhiteSpace(*p
))
664 nChar
= static_cast<unsigned char>(*p
++);
666 else if (nChar
== '\\')
673 nChar
= INetMIME::getUTF32Character(p
, pEnd
);
675 writeUTF8(aSink
, nChar
);
679 aValue
= aSink
.makeStringAndClear();
683 sal_Unicode
const * pStringEnd
= skipQuotedString(p
, pEnd
);
690 sal_Unicode
const * pTokenBegin
= p
;
691 while (p
!= pEnd
&& (isTokenChar(*p
) || !rtl::isAscii(*p
)))
693 if (p
== pTokenBegin
)
697 pTokenBegin
, p
- pTokenBegin
,
698 RTL_TEXTENCODING_UTF8
);
700 aList
.emplace_front(Parameter
{aAttribute
, aCharset
, aLanguage
, aValue
, nSection
, bExtended
});
703 return parseParameters(aList
, pParameters
) ? pParameterBegin
: pBegin
;
// Compare the character range starting at pBegin1 and ending at pEnd1 with
// the NUL-terminated string pString2, folding both sides with
// rtl::toAsciiUpperCase so that US-ASCII letter case is ignored.  Both
// pointers advance in step while pString2 has characters left; once
// pString2 is exhausted the result is true only if the range has been
// consumed completely as well.
// NOTE(review): the pEnd1 parameter line, the first operand of the "||"
// inside the loop, and the statement taken on a mismatch are not visible in
// this view -- presumably a bounds check followed by "return false"; confirm.
706 bool equalIgnoreCase(const char * pBegin1
,
708 const char * pString2
)
710 DBG_ASSERT(pBegin1
&& pBegin1
<= pEnd1
&& pString2
,
711 "equalIgnoreCase(): Bad sequences");
713 while (*pString2
!= 0)
// the casts to unsigned char keep bytes >= 0x80 from being sign-extended
715 || (rtl::toAsciiUpperCase(static_cast<unsigned char>(*pBegin1
++))
716 != rtl::toAsciiUpperCase(
717 static_cast<unsigned char>(*pString2
++))))
// pString2 is exhausted: equal only if the range is exhausted, too
719 return pBegin1
== pEnd1
;
724 char const * m_aName
;
725 rtl_TextEncoding m_eEncoding
;
728 // The source for the following table is <ftp://ftp.iana.org/in-notes/iana/
729 // assignments/character-sets> as of Jan, 21 2000 12:46:00, unless otherwise
731 EncodingEntry
const aEncodingMap
[]
732 = { { "US-ASCII", RTL_TEXTENCODING_ASCII_US
},
733 { "ANSI_X3.4-1968", RTL_TEXTENCODING_ASCII_US
},
734 { "ISO-IR-6", RTL_TEXTENCODING_ASCII_US
},
735 { "ANSI_X3.4-1986", RTL_TEXTENCODING_ASCII_US
},
736 { "ISO_646.IRV:1991", RTL_TEXTENCODING_ASCII_US
},
737 { "ASCII", RTL_TEXTENCODING_ASCII_US
},
738 { "ISO646-US", RTL_TEXTENCODING_ASCII_US
},
739 { "US", RTL_TEXTENCODING_ASCII_US
},
740 { "IBM367", RTL_TEXTENCODING_ASCII_US
},
741 { "CP367", RTL_TEXTENCODING_ASCII_US
},
742 { "CSASCII", RTL_TEXTENCODING_ASCII_US
},
743 { "ISO-8859-1", RTL_TEXTENCODING_ISO_8859_1
},
744 { "ISO_8859-1:1987", RTL_TEXTENCODING_ISO_8859_1
},
745 { "ISO-IR-100", RTL_TEXTENCODING_ISO_8859_1
},
746 { "ISO_8859-1", RTL_TEXTENCODING_ISO_8859_1
},
747 { "LATIN1", RTL_TEXTENCODING_ISO_8859_1
},
748 { "L1", RTL_TEXTENCODING_ISO_8859_1
},
749 { "IBM819", RTL_TEXTENCODING_ISO_8859_1
},
750 { "CP819", RTL_TEXTENCODING_ISO_8859_1
},
751 { "CSISOLATIN1", RTL_TEXTENCODING_ISO_8859_1
},
752 { "ISO-8859-2", RTL_TEXTENCODING_ISO_8859_2
},
753 { "ISO_8859-2:1987", RTL_TEXTENCODING_ISO_8859_2
},
754 { "ISO-IR-101", RTL_TEXTENCODING_ISO_8859_2
},
755 { "ISO_8859-2", RTL_TEXTENCODING_ISO_8859_2
},
756 { "LATIN2", RTL_TEXTENCODING_ISO_8859_2
},
757 { "L2", RTL_TEXTENCODING_ISO_8859_2
},
758 { "CSISOLATIN2", RTL_TEXTENCODING_ISO_8859_2
},
759 { "ISO-8859-3", RTL_TEXTENCODING_ISO_8859_3
},
760 { "ISO_8859-3:1988", RTL_TEXTENCODING_ISO_8859_3
},
761 { "ISO-IR-109", RTL_TEXTENCODING_ISO_8859_3
},
762 { "ISO_8859-3", RTL_TEXTENCODING_ISO_8859_3
},
763 { "LATIN3", RTL_TEXTENCODING_ISO_8859_3
},
764 { "L3", RTL_TEXTENCODING_ISO_8859_3
},
765 { "CSISOLATIN3", RTL_TEXTENCODING_ISO_8859_3
},
766 { "ISO-8859-4", RTL_TEXTENCODING_ISO_8859_4
},
767 { "ISO_8859-4:1988", RTL_TEXTENCODING_ISO_8859_4
},
768 { "ISO-IR-110", RTL_TEXTENCODING_ISO_8859_4
},
769 { "ISO_8859-4", RTL_TEXTENCODING_ISO_8859_4
},
770 { "LATIN4", RTL_TEXTENCODING_ISO_8859_4
},
771 { "L4", RTL_TEXTENCODING_ISO_8859_4
},
772 { "CSISOLATIN4", RTL_TEXTENCODING_ISO_8859_4
},
773 { "ISO-8859-5", RTL_TEXTENCODING_ISO_8859_5
},
774 { "ISO_8859-5:1988", RTL_TEXTENCODING_ISO_8859_5
},
775 { "ISO-IR-144", RTL_TEXTENCODING_ISO_8859_5
},
776 { "ISO_8859-5", RTL_TEXTENCODING_ISO_8859_5
},
777 { "CYRILLIC", RTL_TEXTENCODING_ISO_8859_5
},
778 { "CSISOLATINCYRILLIC", RTL_TEXTENCODING_ISO_8859_5
},
779 { "ISO-8859-6", RTL_TEXTENCODING_ISO_8859_6
},
780 { "ISO_8859-6:1987", RTL_TEXTENCODING_ISO_8859_6
},
781 { "ISO-IR-127", RTL_TEXTENCODING_ISO_8859_6
},
782 { "ISO_8859-6", RTL_TEXTENCODING_ISO_8859_6
},
783 { "ECMA-114", RTL_TEXTENCODING_ISO_8859_6
},
784 { "ASMO-708", RTL_TEXTENCODING_ISO_8859_6
},
785 { "ARABIC", RTL_TEXTENCODING_ISO_8859_6
},
786 { "CSISOLATINARABIC", RTL_TEXTENCODING_ISO_8859_6
},
787 { "ISO-8859-7", RTL_TEXTENCODING_ISO_8859_7
},
788 { "ISO_8859-7:1987", RTL_TEXTENCODING_ISO_8859_7
},
789 { "ISO-IR-126", RTL_TEXTENCODING_ISO_8859_7
},
790 { "ISO_8859-7", RTL_TEXTENCODING_ISO_8859_7
},
791 { "ELOT_928", RTL_TEXTENCODING_ISO_8859_7
},
792 { "ECMA-118", RTL_TEXTENCODING_ISO_8859_7
},
793 { "GREEK", RTL_TEXTENCODING_ISO_8859_7
},
794 { "GREEK8", RTL_TEXTENCODING_ISO_8859_7
},
795 { "CSISOLATINGREEK", RTL_TEXTENCODING_ISO_8859_7
},
796 { "ISO-8859-8", RTL_TEXTENCODING_ISO_8859_8
},
797 { "ISO_8859-8:1988", RTL_TEXTENCODING_ISO_8859_8
},
798 { "ISO-IR-138", RTL_TEXTENCODING_ISO_8859_8
},
799 { "ISO_8859-8", RTL_TEXTENCODING_ISO_8859_8
},
800 { "HEBREW", RTL_TEXTENCODING_ISO_8859_8
},
801 { "CSISOLATINHEBREW", RTL_TEXTENCODING_ISO_8859_8
},
802 { "ISO-8859-9", RTL_TEXTENCODING_ISO_8859_9
},
803 { "ISO_8859-9:1989", RTL_TEXTENCODING_ISO_8859_9
},
804 { "ISO-IR-148", RTL_TEXTENCODING_ISO_8859_9
},
805 { "ISO_8859-9", RTL_TEXTENCODING_ISO_8859_9
},
806 { "LATIN5", RTL_TEXTENCODING_ISO_8859_9
},
807 { "L5", RTL_TEXTENCODING_ISO_8859_9
},
808 { "CSISOLATIN5", RTL_TEXTENCODING_ISO_8859_9
},
809 { "ISO-8859-14", RTL_TEXTENCODING_ISO_8859_14
}, // RFC 2047
810 { "ISO_8859-15", RTL_TEXTENCODING_ISO_8859_15
},
811 { "ISO-8859-15", RTL_TEXTENCODING_ISO_8859_15
}, // RFC 2047
812 { "MACINTOSH", RTL_TEXTENCODING_APPLE_ROMAN
},
813 { "MAC", RTL_TEXTENCODING_APPLE_ROMAN
},
814 { "CSMACINTOSH", RTL_TEXTENCODING_APPLE_ROMAN
},
815 { "IBM437", RTL_TEXTENCODING_IBM_437
},
816 { "CP437", RTL_TEXTENCODING_IBM_437
},
817 { "437", RTL_TEXTENCODING_IBM_437
},
818 { "CSPC8CODEPAGE437", RTL_TEXTENCODING_IBM_437
},
819 { "IBM850", RTL_TEXTENCODING_IBM_850
},
820 { "CP850", RTL_TEXTENCODING_IBM_850
},
821 { "850", RTL_TEXTENCODING_IBM_850
},
822 { "CSPC850MULTILINGUAL", RTL_TEXTENCODING_IBM_850
},
823 { "IBM860", RTL_TEXTENCODING_IBM_860
},
824 { "CP860", RTL_TEXTENCODING_IBM_860
},
825 { "860", RTL_TEXTENCODING_IBM_860
},
826 { "CSIBM860", RTL_TEXTENCODING_IBM_860
},
827 { "IBM861", RTL_TEXTENCODING_IBM_861
},
828 { "CP861", RTL_TEXTENCODING_IBM_861
},
829 { "861", RTL_TEXTENCODING_IBM_861
},
830 { "CP-IS", RTL_TEXTENCODING_IBM_861
},
831 { "CSIBM861", RTL_TEXTENCODING_IBM_861
},
832 { "IBM863", RTL_TEXTENCODING_IBM_863
},
833 { "CP863", RTL_TEXTENCODING_IBM_863
},
834 { "863", RTL_TEXTENCODING_IBM_863
},
835 { "CSIBM863", RTL_TEXTENCODING_IBM_863
},
836 { "IBM865", RTL_TEXTENCODING_IBM_865
},
837 { "CP865", RTL_TEXTENCODING_IBM_865
},
838 { "865", RTL_TEXTENCODING_IBM_865
},
839 { "CSIBM865", RTL_TEXTENCODING_IBM_865
},
840 { "IBM775", RTL_TEXTENCODING_IBM_775
},
841 { "CP775", RTL_TEXTENCODING_IBM_775
},
842 { "CSPC775BALTIC", RTL_TEXTENCODING_IBM_775
},
843 { "IBM852", RTL_TEXTENCODING_IBM_852
},
844 { "CP852", RTL_TEXTENCODING_IBM_852
},
845 { "852", RTL_TEXTENCODING_IBM_852
},
846 { "CSPCP852", RTL_TEXTENCODING_IBM_852
},
847 { "IBM855", RTL_TEXTENCODING_IBM_855
},
848 { "CP855", RTL_TEXTENCODING_IBM_855
},
849 { "855", RTL_TEXTENCODING_IBM_855
},
850 { "CSIBM855", RTL_TEXTENCODING_IBM_855
},
851 { "IBM857", RTL_TEXTENCODING_IBM_857
},
852 { "CP857", RTL_TEXTENCODING_IBM_857
},
853 { "857", RTL_TEXTENCODING_IBM_857
},
854 { "CSIBM857", RTL_TEXTENCODING_IBM_857
},
855 { "IBM862", RTL_TEXTENCODING_IBM_862
},
856 { "CP862", RTL_TEXTENCODING_IBM_862
},
857 { "862", RTL_TEXTENCODING_IBM_862
},
858 { "CSPC862LATINHEBREW", RTL_TEXTENCODING_IBM_862
},
859 { "IBM864", RTL_TEXTENCODING_IBM_864
},
860 { "CP864", RTL_TEXTENCODING_IBM_864
},
861 { "CSIBM864", RTL_TEXTENCODING_IBM_864
},
862 { "IBM866", RTL_TEXTENCODING_IBM_866
},
863 { "CP866", RTL_TEXTENCODING_IBM_866
},
864 { "866", RTL_TEXTENCODING_IBM_866
},
865 { "CSIBM866", RTL_TEXTENCODING_IBM_866
},
866 { "IBM869", RTL_TEXTENCODING_IBM_869
},
867 { "CP869", RTL_TEXTENCODING_IBM_869
},
868 { "869", RTL_TEXTENCODING_IBM_869
},
869 { "CP-GR", RTL_TEXTENCODING_IBM_869
},
870 { "CSIBM869", RTL_TEXTENCODING_IBM_869
},
871 { "WINDOWS-1250", RTL_TEXTENCODING_MS_1250
},
872 { "WINDOWS-1251", RTL_TEXTENCODING_MS_1251
},
873 { "WINDOWS-1253", RTL_TEXTENCODING_MS_1253
},
874 { "WINDOWS-1254", RTL_TEXTENCODING_MS_1254
},
875 { "WINDOWS-1255", RTL_TEXTENCODING_MS_1255
},
876 { "WINDOWS-1256", RTL_TEXTENCODING_MS_1256
},
877 { "WINDOWS-1257", RTL_TEXTENCODING_MS_1257
},
878 { "WINDOWS-1258", RTL_TEXTENCODING_MS_1258
},
879 { "SHIFT_JIS", RTL_TEXTENCODING_SHIFT_JIS
},
880 { "MS_KANJI", RTL_TEXTENCODING_SHIFT_JIS
},
881 { "CSSHIFTJIS", RTL_TEXTENCODING_SHIFT_JIS
},
882 { "GB2312", RTL_TEXTENCODING_GB_2312
},
883 { "CSGB2312", RTL_TEXTENCODING_GB_2312
},
884 { "BIG5", RTL_TEXTENCODING_BIG5
},
885 { "CSBIG5", RTL_TEXTENCODING_BIG5
},
886 { "EUC-JP", RTL_TEXTENCODING_EUC_JP
},
887 { "EXTENDED_UNIX_CODE_PACKED_FORMAT_FOR_JAPANESE",
888 RTL_TEXTENCODING_EUC_JP
},
889 { "CSEUCPKDFMTJAPANESE", RTL_TEXTENCODING_EUC_JP
},
890 { "ISO-2022-JP", RTL_TEXTENCODING_ISO_2022_JP
},
891 { "CSISO2022JP", RTL_TEXTENCODING_ISO_2022_JP
},
892 { "ISO-2022-CN", RTL_TEXTENCODING_ISO_2022_CN
},
893 { "KOI8-R", RTL_TEXTENCODING_KOI8_R
},
894 { "CSKOI8R", RTL_TEXTENCODING_KOI8_R
},
895 { "UTF-7", RTL_TEXTENCODING_UTF7
},
896 { "UTF-8", RTL_TEXTENCODING_UTF8
},
897 { "ISO-8859-10", RTL_TEXTENCODING_ISO_8859_10
}, // RFC 2047
898 { "ISO-8859-13", RTL_TEXTENCODING_ISO_8859_13
}, // RFC 2047
899 { "EUC-KR", RTL_TEXTENCODING_EUC_KR
},
900 { "CSEUCKR", RTL_TEXTENCODING_EUC_KR
},
901 { "ISO-2022-KR", RTL_TEXTENCODING_ISO_2022_KR
},
902 { "CSISO2022KR", RTL_TEXTENCODING_ISO_2022_KR
},
903 { "ISO-10646-UCS-4", RTL_TEXTENCODING_UCS4
},
904 { "CSUCS4", RTL_TEXTENCODING_UCS4
},
905 { "ISO-10646-UCS-2", RTL_TEXTENCODING_UCS2
},
906 { "CSUNICODE", RTL_TEXTENCODING_UCS2
} };
// Look up the text encoding for a charset name.  The name given by pBegin
// and pEnd is compared with the m_aName of every aEncodingMap entry using
// equalIgnoreCase(); the m_eEncoding of the first matching entry is
// returned.  If no entry matches, the result is RTL_TEXTENCODING_DONTKNOW.
// NOTE(review): the line declaring pEnd is not visible in this view; from
// its use it is presumably a second "char const *" parameter -- confirm.
908 rtl_TextEncoding
getCharsetEncoding(char const * pBegin
,
// linear scan; the first entry whose name matches wins
911 for (const EncodingEntry
& i
: aEncodingMap
)
912 if (equalIgnoreCase(pBegin
, pEnd
, i
.m_aName
))
913 return i
.m_eEncoding
;
914 return RTL_TEXTENCODING_DONTKNOW
;
922 bool INetMIME::isAtomChar(sal_uInt32 nChar
)
924 static const bool aMap
[128]
925 = { false, false, false, false, false, false, false, false,
926 false, false, false, false, false, false, false, false,
927 false, false, false, false, false, false, false, false,
928 false, false, false, false, false, false, false, false,
929 false, true, false, true, true, true, true, true, // !"#$%&'
930 false, false, true, true, false, true, false, true, //()*+,-./
931 true, true, true, true, true, true, true, true, //01234567
932 true, true, false, false, false, true, false, true, //89:;<=>?
933 false, true, true, true, true, true, true, true, //@ABCDEFG
934 true, true, true, true, true, true, true, true, //HIJKLMNO
935 true, true, true, true, true, true, true, true, //PQRSTUVW
936 true, true, true, false, false, false, true, true, //XYZ[\]^_
937 true, true, true, true, true, true, true, true, //`abcdefg
938 true, true, true, true, true, true, true, true, //hijklmno
939 true, true, true, true, true, true, true, true, //pqrstuvw
940 true, true, true, true, true, true, true, false //xyz{|}~
942 return rtl::isAscii(nChar
) && aMap
[nChar
];
946 bool INetMIME::isIMAPAtomChar(sal_uInt32 nChar
)
948 static const bool aMap
[128]
949 = { false, false, false, false, false, false, false, false,
950 false, false, false, false, false, false, false, false,
951 false, false, false, false, false, false, false, false,
952 false, false, false, false, false, false, false, false,
953 false, true, false, true, true, false, true, true, // !"#$%&'
954 false, false, false, true, true, true, true, true, //()*+,-./
955 true, true, true, true, true, true, true, true, //01234567
956 true, true, true, true, true, true, true, true, //89:;<=>?
957 true, true, true, true, true, true, true, true, //@ABCDEFG
958 true, true, true, true, true, true, true, true, //HIJKLMNO
959 true, true, true, true, true, true, true, true, //PQRSTUVW
960 true, true, true, true, false, true, true, true, //XYZ[\]^_
961 true, true, true, true, true, true, true, true, //`abcdefg
962 true, true, true, true, true, true, true, true, //hijklmno
963 true, true, true, true, true, true, true, true, //pqrstuvw
964 true, true, true, false, true, true, true, false //xyz{|}~
966 return rtl::isAscii(nChar
) && aMap
[nChar
];
// Compare the UTF-16 range [pBegin1, pEnd1) with the NUL-terminated string
// pString2, folding both sides with rtl::toAsciiUpperCase so that US-ASCII
// letter case is ignored.  Both pointers advance in step while pString2 has
// characters left; once pString2 is exhausted the result is true only if
// the range has been consumed completely as well.
// NOTE(review): the first operand of the "||" inside the loop and the
// statement taken on a mismatch are not visible in this view -- presumably
// a bounds check followed by "return false"; confirm against the full file.
970 bool INetMIME::equalIgnoreCase(const sal_Unicode
* pBegin1
,
971 const sal_Unicode
* pEnd1
,
972 const char * pString2
)
974 DBG_ASSERT(pBegin1
&& pBegin1
<= pEnd1
&& pString2
,
975 "INetMIME::equalIgnoreCase(): Bad sequences");
977 while (*pString2
!= 0)
// the cast to unsigned char keeps bytes >= 0x80 from being sign-extended
979 || (rtl::toAsciiUpperCase(*pBegin1
++)
980 != rtl::toAsciiUpperCase(
981 static_cast<unsigned char>(*pString2
++))))
// pString2 is exhausted: equal only if the range is exhausted, too
983 return pBegin1
== pEnd1
;
987 bool INetMIME::scanUnsigned(const sal_Unicode
*& rBegin
,
988 const sal_Unicode
* pEnd
, bool bLeadingZeroes
,
991 sal_uInt64 nTheValue
= 0;
992 const sal_Unicode
* p
= rBegin
;
993 for ( ; p
!= pEnd
; ++p
)
995 int nWeight
= getWeight(*p
);
998 nTheValue
= 10 * nTheValue
+ nWeight
;
999 if (nTheValue
> std::numeric_limits
< sal_uInt32
>::max())
1002 if (nTheValue
== 0 && (p
== rBegin
|| (!bLeadingZeroes
&& p
- rBegin
!= 1)))
1005 rValue
= sal_uInt32(nTheValue
);
1010 sal_Unicode
const * INetMIME::scanContentType(
1011 std::u16string_view rStr
, OUString
* pType
,
1012 OUString
* pSubType
, INetContentTypeParameterList
* pParameters
)
1014 sal_Unicode
const * pBegin
= rStr
.data();
1015 sal_Unicode
const * pEnd
= pBegin
+ rStr
.size();
1016 sal_Unicode
const * p
= skipLinearWhiteSpaceComment(pBegin
, pEnd
);
1017 sal_Unicode
const * pTypeBegin
= p
;
1018 while (p
!= pEnd
&& isTokenChar(*p
))
1022 if (p
== pTypeBegin
)
1024 sal_Unicode
const * pTypeEnd
= p
;
1026 p
= skipLinearWhiteSpaceComment(p
, pEnd
);
1027 if (p
== pEnd
|| *p
++ != '/')
1030 p
= skipLinearWhiteSpaceComment(p
, pEnd
);
1031 sal_Unicode
const * pSubTypeBegin
= p
;
1032 while (p
!= pEnd
&& isTokenChar(*p
))
1036 if (p
== pSubTypeBegin
)
1038 sal_Unicode
const * pSubTypeEnd
= p
;
1040 if (pType
!= nullptr)
1042 *pType
= OUString(pTypeBegin
, pTypeEnd
- pTypeBegin
).toAsciiLowerCase();
1044 if (pSubType
!= nullptr)
1046 *pSubType
= OUString(pSubTypeBegin
, pSubTypeEnd
- pSubTypeBegin
)
1047 .toAsciiLowerCase();
1050 return scanParameters(p
, pEnd
, pParameters
);
1054 OUString
INetMIME::decodeHeaderFieldBody(const OString
& rBody
)
1056 // Due to a bug in INetCoreRFC822MessageStream::ConvertTo7Bit(), old
1057 // versions of StarOffice send mails with header fields where encoded
1058 // words can be preceded by '=', ',', '.', '"', or '(', and followed by
1059 // '=', ',', '.', '"', ')', without any required white space in between.
1060 // And there appear to exist some broken mailers that only encode single
1061 // letters within words, like "Appel
1062 // =?iso-8859-1?Q?=E0?=t=?iso-8859-1?Q?=E9?=moin", so it seems best to
1063 // detect encoded words even when not properly surrounded by white space.
1065 // Non US-ASCII characters in rBody are treated as ISO-8859-1.
1067 // encoded-word = "=?"
    //   1*(%x21 / %x23-27 / %x2A-2B / %x2D / %x30-39 / %x41-5A / %x5E-7E)
1069 // ["*" 1*8ALPHA *("-" 1*8ALPHA)] "?"
1070 // ("B?" *(4base64) (4base64 / 3base64 "=" / 2base64 "==")
1071 // / "Q?" 1*(%x21-3C / %x3E / %x40-7E / "=" 2HEXDIG))
1074 // base64 = ALPHA / DIGIT / "+" / "/"
1076 const char * pBegin
= rBody
.getStr();
1077 const char * pEnd
= pBegin
+ rBody
.getLength();
1079 OUStringBuffer sDecoded
;
1080 const char * pCopyBegin
= pBegin
;
1082 /* bool bStartEncodedWord = true; */
1083 const char * pWSPBegin
= pBegin
;
1085 for (const char * p
= pBegin
; p
!= pEnd
;)
1087 if (*p
== '=' /* && bStartEncodedWord */)
1089 const char * q
= p
+ 1;
1090 bool bEncodedWord
= q
!= pEnd
&& *q
++ == '?';
1092 rtl_TextEncoding eCharsetEncoding
= RTL_TEXTENCODING_DONTKNOW
;
1095 const char * pCharsetBegin
= q
;
1096 const char * pLanguageBegin
= nullptr;
1097 int nAlphaCount
= 0;
1098 for (bool bDone
= false; !bDone
;)
1101 bEncodedWord
= false;
1110 pLanguageBegin
= q
- 1;
1115 if (pLanguageBegin
!= nullptr)
1117 if (nAlphaCount
== 0)
1118 pLanguageBegin
= nullptr;
1125 if (pCharsetBegin
== q
- 1)
1126 bEncodedWord
= false;
1130 = getCharsetEncoding(
1132 pLanguageBegin
== nullptr
1133 || nAlphaCount
== 0 ?
1134 q
- 1 : pLanguageBegin
);
1135 bEncodedWord
= isMIMECharsetEncoding(
1138 = translateFromMIME(eCharsetEncoding
);
1144 if (pLanguageBegin
!= nullptr
1145 && (!rtl::isAsciiAlpha(
1146 static_cast<unsigned char>(cChar
))
1147 || ++nAlphaCount
> 8))
1148 pLanguageBegin
= nullptr;
1154 bool bEncodingB
= false;
1158 bEncodedWord
= false;
1174 bEncodedWord
= false;
1180 bEncodedWord
= bEncodedWord
&& q
!= pEnd
&& *q
++ == '?';
1182 OStringBuffer sText
;
1187 for (bool bDone
= false; !bDone
;)
1191 bEncodedWord
= false;
1196 bool bFinal
= false;
1198 sal_uInt32 nValue
= 0;
1199 for (int nShift
= 18; nShift
>= 0; nShift
-= 6)
1201 int nWeight
= getBase64Weight(*q
++);
1204 bEncodedWord
= false;
1214 bEncodedWord
= false;
1219 nCount
= nShift
== 6 ? 1 : 2;
1223 nValue
|= nWeight
<< nShift
;
1227 for (int nShift
= 16; nCount
-- > 0; nShift
-= 8)
1228 sText
.append(char(nValue
>> nShift
& 0xFF));
1234 if (bFinal
&& !bDone
)
1236 bEncodedWord
= false;
1245 const char * pEncodedTextBegin
= q
;
1246 const char * pEncodedTextCopyBegin
= q
;
1247 for (bool bDone
= false; !bDone
;)
1250 bEncodedWord
= false;
1255 sal_uInt32 nChar
= static_cast<unsigned char>(*q
++);
1262 bEncodedWord
= false;
1266 int nDigit1
= getHexWeight(q
[0]);
1267 int nDigit2
= getHexWeight(q
[1]);
1268 if (nDigit1
< 0 || nDigit2
< 0)
1270 bEncodedWord
= false;
1276 (pEncodedTextCopyBegin
- pBegin
),
1277 (q
- 1 - pEncodedTextCopyBegin
))
1278 + OStringChar(char(nDigit1
<< 4 | nDigit2
)));
1280 pEncodedTextCopyBegin
= q
;
1285 if (q
- pEncodedTextBegin
> 1)
1286 sText
.append(rBody
.subView(
1287 (pEncodedTextCopyBegin
- pBegin
),
1288 (q
- 1 - pEncodedTextCopyBegin
)));
1290 bEncodedWord
= false;
1297 (pEncodedTextCopyBegin
- pBegin
),
1298 (q
- 1 - pEncodedTextCopyBegin
))
1299 + OString::Concat(" "));
1300 pEncodedTextCopyBegin
= q
;
1304 if (!isVisible(nChar
))
1306 bEncodedWord
= false;
1315 bEncodedWord
= bEncodedWord
&& q
!= pEnd
&& *q
++ == '=';
1317 std::unique_ptr
<sal_Unicode
[]> pUnicodeBuffer
;
1318 sal_Size nUnicodeSize
= 0;
1322 = convertToUnicode(sText
.getStr(),
1323 sText
.getStr() + sText
.getLength(),
1324 eCharsetEncoding
, nUnicodeSize
);
1325 if (!pUnicodeBuffer
)
1326 bEncodedWord
= false;
1331 appendISO88591(sDecoded
, pCopyBegin
, pWSPBegin
);
1333 pUnicodeBuffer
.get(),
1334 static_cast< sal_Int32
>(nUnicodeSize
));
1335 pUnicodeBuffer
.reset();
1340 while (p
!= pEnd
&& isWhiteSpace(*p
))
1342 /* bStartEncodedWord = p != pWSPBegin; */
1353 /* bStartEncodedWord = true; */
1357 /* bStartEncodedWord = true; */
1361 /* bStartEncodedWord = false; */
1366 const char * pUTF8Begin
= p
- 1;
1367 const char * pUTF8End
= pUTF8Begin
;
1368 sal_uInt32 nCharacter
= 0;
1369 if (translateUTF8Char(pUTF8End
, pEnd
, nCharacter
))
1371 appendISO88591(sDecoded
, pCopyBegin
, p
- 1);
1372 sDecoded
.appendUtf32(nCharacter
);
1376 /* bStartEncodedWord = false; */
1383 appendISO88591(sDecoded
, pCopyBegin
, pEnd
);
1384 return sDecoded
.makeStringAndClear();
1387 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */