include/tools/inetmime.hxx

   1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
   2 /*
   3  * This file is part of the LibreOffice project.
   4  *
   5  * This Source Code Form is subject to the terms of the Mozilla Public
   6  * License, v. 2.0. If a copy of the MPL was not distributed with this
   7  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
   8  *
   9  * This file incorporates work covered by the following license notice:
  10  *
  11  *   Licensed to the Apache Software Foundation (ASF) under one or more
  12  *   contributor license agreements. See the NOTICE file distributed
  13  *   with this work for additional information regarding copyright
  14  *   ownership. The ASF licenses this file to you under the Apache
  15  *   License, Version 2.0 (the "License"); you may not use this file
  16  *   except in compliance with the License. You may obtain a copy of
  17  *   the License at http://www.apache.org/licenses/LICENSE-2.0 .
  18  */
  19 #ifndef INCLUDED_TOOLS_INETMIME_HXX
  20 #define INCLUDED_TOOLS_INETMIME_HXX
  21
  22 #include <tools/toolsdllapi.h>
  23 #include <rtl/character.hxx>
  24 #include <rtl/string.hxx>
  25 #include <rtl/strbuf.hxx>
  26 #include <rtl/ustring.hxx>
  27 #include <rtl/tencinfo.h>
  28 #include <tools/debug.hxx>
  29
  30 #include <unordered_map>
  31
  32 class INetMIMEOutputSink;
  33
  34 struct INetContentTypeParameter
  35 {
  36     /** The attribute value.  If the value is a quoted-string, it is
  37         'unpacked.'  If a character set is specified, and the value can be
  38         converted to Unicode, this is done.  Also, if no character set is
  39         specified, it is first tried to convert the value from UTF-8 encoding
  40         to Unicode, and if that doesn't work (because the value is not in
  41         UTF-8 encoding), it is converted from ISO-8859-1 encoding to Unicode
  42         (which will always work).  But if a character set is specified and the
  43         value cannot be converted from that character set to Unicode, special
  44         action is taken to produce a value that can possibly be transformed
  45         back into its original form:  Any 8-bit character from a non-encoded
  46         part of the original value is directly converted to Unicode
  47         (effectively handling it as if it was ISO-8859-1 encoded), and any
  48         8-bit character from an encoded part of the original value is mapped
  49         to the range U+F800..U+F8FF at the top of the Corporate Use Subarea
  50         within Unicode's Private Use Area (effectively adding 0xF800 to the
  51         character's numeric value).
  52      */
  53     OUString m_sValue;
  54
  55 };
  56
  57 // the key is the m_sAttribute again; all keys are lower case:
  58 typedef std::unordered_map<OString, INetContentTypeParameter, OStringHash>
  59     INetContentTypeParameterList;
  60
  61
  62 class TOOLS_DLLPUBLIC INetMIME
  63 {
  64 public:
  65     /** Check for US-ASCII visible character.
  66
  67         @param nChar  Some UCS-4 character.
  68
  69         @return  True if nChar is a US-ASCII visible character (US-ASCII
  70         0x21--0x7E).
  71      */
  72     static inline bool isVisible(sal_uInt32 nChar);
  73
  74     /** Check whether some character is valid within an RFC 822 <atom>.
  75
  76         @param nChar  Some UCS-4 character.
  77
  78         @return  True if nChar is valid within an RFC 822 <atom> (US-ASCII
  79         'A'--'Z', 'a'--'z', '0'--'9', '!', '#', '$', '%', '&', ''', '*', '+',
  80         '-', '/', '=', '?', '^', '_', '`', '{', '|', '}', or '~').
  81      */
  82     static bool isAtomChar(sal_uInt32 nChar);
  83
  84     /** Check whether some character is valid within an RFC 2060 <atom>.
  85
  86         @param nChar  Some UCS-4 character.
  87
  88         @return  True if nChar is valid within an RFC 2060 <atom> (US-ASCII
  89         'A'--'Z', 'a'--'z', '0'--'9', '!', '#', '$', '&', ''', '+', ',', '-',
  90         '.', '/', ':', ';', '<', '=', '>', '?', '@', '[', ']', '^', '_', '`',
  91         '|', '}', or '~').
  92      */
  93     static bool isIMAPAtomChar(sal_uInt32 nChar);
  94
  95     /** Get the digit weight of a US-ASCII character.
  96
  97         @param nChar  Some UCS-4 character.
  98
  99         @return  If nChar is a US-ASCII (decimal) digit character (US-ASCII
 100         '0'--'9'), return the corresponding weight (0--9); otherwise,
 101         return -1.
 102      */
 103     static inline int getWeight(sal_uInt32 nChar);
 104
 105     /** Get the hexadecimal digit weight of a US-ASCII character.
 106
 107         @param nChar  Some UCS-4 character.
 108
 109         @return  If nChar is a US-ASCII hexadecimal digit character (US-ASCII
 110         '0'--'9', 'A'--'F', or 'a'--'f'), return the corresponding weight
 111         (0--15); otherwise, return -1.
 112      */
 113     static inline int getHexWeight(sal_uInt32 nChar);
 114
 115     /** Get a hexadecimal digit encoded as US-ASCII.
 116
 117         @param nWeight  Must be in the range 0--15, inclusive.
 118
 119         @return  The canonic (i.e., upper case) hexadecimal digit
 120         corresponding to nWeight (US-ASCII '0'--'9' or 'A'--'F').
 121      */
 122     static sal_uInt32 getHexDigit(int nWeight);
 123
 124     /** Check two US-ASCII strings for equality, ignoring case.
 125
 126         @param pBegin1  Points to the start of the first string, must not be
 127         null.
 128
 129         @param pEnd1  Points past the end of the first string, must be >=
 130         pBegin1.
 131
 132         @param pString2  Points to the start of the null terminated second
 133         string, must not be null.
 134
 135         @return  True if the two strings are equal, ignoring the case of US-
 136         ASCII alphabetic characters (US-ASCII 'A'--'Z' and 'a'--'z').
 137      */
 138     static bool equalIgnoreCase(const sal_Unicode * pBegin1,
 139                                 const sal_Unicode * pEnd1,
 140                                 const sal_Char * pString2);
 141
 142     static bool scanUnsigned(const sal_Unicode *& rBegin,
 143                              const sal_Unicode * pEnd, bool bLeadingZeroes,
 144                              sal_uInt32 & rValue);
 145
 146     /** Parse the body of an RFC 2045 Content-Type header field.
 147
 148         @param pBegin  The range (that must be valid) from non-null pBegin,
 149         inclusive. to non-null pEnd, exclusive, forms the body of the
 150         Content-Type header field.  It must be of the form
 151
 152           token "/" token *(";" token "=" (token / quoted-string))
 153
 154         with intervening linear white space and comments (cf. RFCs 822, 2045).
 155         The RFC 2231 extension are supported.  The encoding of rMediaType
 156         should be US-ASCII, but any Unicode values in the range U+0080..U+FFFF
 157         are interpreted 'as appropriate.'
 158
 159         @param pType  If not null, returns the type (the first of the above
 160         tokens), in US-ASCII encoding and converted to lower case.
 161
 162         @param pSubType  If not null, returns the sub-type (the second of the
 163         above tokens), in US-ASCII encoding and converted to lower case.
 164
 165         @param pParameters  If not null, returns the parameters as a list of
 166         INetContentTypeParameters (the attributes are in US-ASCII encoding and
 167         converted to lower case, the values are in Unicode encoding).  If
 168         null, only the syntax of the parameters is checked, but they are not
 169         returned.
 170
 171         @return  Null if the syntax of the field body is incorrect (i.e., does
 172         not start with type and sub-type tokens).  Otherwise, a pointer past the
 173         longest valid input prefix.  If null is returned, none of the output
 174         parameters will be modified.
 175      */
 176     static sal_Unicode const * scanContentType(
 177         sal_Unicode const *pBegin, sal_Unicode const * pEnd,
 178         OUString * pType = nullptr, OUString * pSubType = nullptr,
 179         INetContentTypeParameterList * pParameters = nullptr);
 180
 181     static void writeHeaderFieldBody(INetMIMEOutputSink & rSink,
 182                                      const OUString& rBody,
 183                                      rtl_TextEncoding ePreferredEncoding);
 184
 185     static OUString decodeHeaderFieldBody(const OString& rBody);
 186
 187     /** Get the UTF-32 character at the head of a UTF-16 encoded string.
 188
 189         @param rBegin  Points to the start of the UTF-16 encoded string, must
 190         not be null.  On exit, it points past the first UTF-32 character's
 191         encoding.
 192
 193         @param pEnd  Points past the end of the UTF-16 encoded string, must be
 194         strictly greater than rBegin.
 195
 196         @return  The UCS-4 character at the head of the UTF-16 encoded string.
 197         If the string does not start with the UTF-16 encoding of a UCS-32
 198         character, the first UTF-16 value is returned.
 199      */
 200     static inline sal_uInt32 getUTF32Character(const sal_Unicode *& rBegin,
 201                                                const sal_Unicode * pEnd);
 202 };
 203
 204 // static
 205 inline bool INetMIME::isVisible(sal_uInt32 nChar)
 206 {
 207     return nChar >= '!' && nChar <= '~';
 208 }
 209
 210 // static
 211 inline int INetMIME::getWeight(sal_uInt32 nChar)
 212 {
 213     return rtl::isAsciiDigit(nChar) ? int(nChar - '0') : -1;
 214 }
 215
 216 // static
 217 inline int INetMIME::getHexWeight(sal_uInt32 nChar)
 218 {
 219     return rtl::isAsciiDigit(nChar) ? int(nChar - '0') :
 220            nChar >= 'A' && nChar <= 'F' ? int(nChar - 'A' + 10) :
 221            nChar >= 'a' && nChar <= 'f' ? int(nChar - 'a' + 10) : -1;
 222 }
 223
 224 // static
 225 inline sal_uInt32 INetMIME::getUTF32Character(const sal_Unicode *& rBegin,
 226                                               const sal_Unicode * pEnd)
 227 {
 228     DBG_ASSERT(rBegin && rBegin < pEnd,
 229                "INetMIME::getUTF32Character(): Bad sequence");
 230     if (rBegin + 1 < pEnd && rBegin[0] >= 0xD800 && rBegin[0] <= 0xDBFF
 231         && rBegin[1] >= 0xDC00 && rBegin[1] <= 0xDFFF)
 232     {
 233         sal_uInt32 nUTF32 = sal_uInt32(*rBegin++ & 0x3FF) << 10;
 234         return (nUTF32 | (*rBegin++ & 0x3FF)) + 0x10000;
 235     }
 236     else
 237         return *rBegin++;
 238 }
 239
 240 class INetMIMEOutputSink
 241 {
 242 private:
 243     OStringBuffer m_aBuffer;
 244
 245     /** Write a sequence of octets.
 246
 247         @param pBegin  Points to the start of the sequence, must not be null.
 248
 249         @param pEnd  Points past the end of the sequence, must be >= pBegin.
 250      */
 251     void writeSequence(const sal_Char * pBegin, const sal_Char * pEnd);
 252
 253     /** Write a null terminated sequence of octets (without the terminating
 254         null).
 255
 256         @param pOctets  A null terminated sequence of octets, must not be
 257         null.
 258      */
 259     void writeSequence(const sal_Char * pSequence);
 260
 261     /** Write a sequence of octets.
 262
 263         @descr  The supplied sequence of Unicode characters is interpreted as
 264         a sequence of octets.  It is an error if any of the elements of the
 265         sequence has a numerical value greater than 255.
 266
 267         @param pBegin  Points to the start of the sequence, must not be null.
 268
 269         @param pEnd  Points past the end of the sequence, must be >= pBegin.
 270      */
 271     void writeSequence(const sal_Unicode * pBegin,
 272                                const sal_Unicode * pEnd);
 273
 274 public:
 275     /** Write a sequence of octets.
 276
 277         @descr  The supplied sequence of Unicode characters is interpreted as
 278         a sequence of octets.  It is an error if any of the elements of the
 279         sequence has a numerical value greater than 255.
 280
 281         @param pBegin  Points to the start of the sequence, must not be null.
 282
 283         @param pEnd  Points past the end of the sequence, must be >= pBegin.
 284      */
 285     inline void write(const sal_Unicode * pBegin, const sal_Unicode * pEnd);
 286
 287     /** Write a single octet.
 288
 289         @param nOctet  Some octet.
 290
 291         @return  This instance.
 292      */
 293     inline INetMIMEOutputSink & operator <<(sal_Char nOctet);
 294
 295     /** Write a null terminated sequence of octets (without the terminating
 296         null).
 297
 298         @param pOctets  A null terminated sequence of octets, must not be
 299         null.
 300
 301         @return  This instance.
 302      */
 303     inline INetMIMEOutputSink & operator <<(const sal_Char * pOctets);
 304
 305     OString takeBuffer()
 306     {
 307         return m_aBuffer.makeStringAndClear();
 308     }
 309 };
 310
 311
 312 inline void INetMIMEOutputSink::write(const sal_Unicode * pBegin,
 313                                       const sal_Unicode * pEnd)
 314 {
 315     writeSequence(pBegin, pEnd);
 316 }
 317
 318 inline INetMIMEOutputSink & INetMIMEOutputSink::operator <<(sal_Char nOctet)
 319 {
 320     writeSequence(&nOctet, &nOctet + 1);
 321     return *this;
 322 }
 323
 324 inline INetMIMEOutputSink & INetMIMEOutputSink::operator <<(const sal_Char *
 325                                                                 pOctets)
 326 {
 327     writeSequence(pOctets);
 328     return *this;
 329 }
 330
 331 #endif
 332
 333 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */