sal/rtl/uri.cxx

   1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
   2 /*
   3  * This file is part of the LibreOffice project.
   4  *
   5  * This Source Code Form is subject to the terms of the Mozilla Public
   6  * License, v. 2.0. If a copy of the MPL was not distributed with this
   7  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
   8  *
   9  * This file incorporates work covered by the following license notice:
  10  *
  11  *   Licensed to the Apache Software Foundation (ASF) under one or more
  12  *   contributor license agreements. See the NOTICE file distributed
  13  *   with this work for additional information regarding copyright
  14  *   ownership. The ASF licenses this file to you under the Apache
  15  *   License, Version 2.0 (the "License"); you may not use this file
  16  *   except in compliance with the License. You may obtain a copy of
  17  *   the License at http://www.apache.org/licenses/LICENSE-2.0 .
  18  */
  19
  20 #include "surrogates.hxx"
  21
  22 #include "osl/diagnose.h"
  23 #include "rtl/character.hxx"
  24 #include "rtl/strbuf.hxx"
  25 #include "rtl/textenc.h"
  26 #include "rtl/textcvt.h"
  27 #include "rtl/uri.h"
  28 #include "rtl/ustrbuf.h"
  29 #include "rtl/ustrbuf.hxx"
  30 #include "rtl/ustring.h"
  31 #include "rtl/ustring.hxx"
  32 #include "sal/types.h"
  33 #include "sal/macros.h"
  34
  35 #include <cstddef>
  36
  37 namespace {
  38
  39 std::size_t const nCharClassSize = 128;
  40
  41 sal_Unicode const cEscapePrefix = 0x25; // '%'
  42
  43 inline bool isHighSurrogate(sal_uInt32 nUtf16)
  44 {
  45     return SAL_RTL_IS_HIGH_SURROGATE(nUtf16);
  46 }
  47
  48 inline bool isLowSurrogate(sal_uInt32 nUtf16)
  49 {
  50     return SAL_RTL_IS_LOW_SURROGATE(nUtf16);
  51 }
  52
  53 inline sal_uInt32 combineSurrogates(sal_uInt32 high, sal_uInt32 low)
  54 {
  55     return SAL_RTL_COMBINE_SURROGATES(high, low);
  56 }
  57
  58 inline int getHexWeight(sal_uInt32 nUtf32)
  59 {
  60     return nUtf32 >= 0x30 && nUtf32 <= 0x39 ? // '0'--'9'
  61                static_cast< int >(nUtf32 - 0x30) :
  62            nUtf32 >= 0x41 && nUtf32 <= 0x46 ? // 'A'--'F'
  63                static_cast< int >(nUtf32 - 0x41 + 10) :
  64            nUtf32 >= 0x61 && nUtf32 <= 0x66 ? // 'a'--'f'
  65                static_cast< int >(nUtf32 - 0x61 + 10) :
  66                -1; // not a hex digit
  67 }
  68
  69 inline bool isValid(sal_Bool const * pCharClass, sal_uInt32 nUtf32)
  70 {
  71     return nUtf32 < nCharClassSize && pCharClass[nUtf32];
  72 }
  73
  74 inline void writeUnicode(rtl_uString ** pBuffer, sal_Int32 * pCapacity,
  75                          sal_Unicode cChar)
  76 {
  77     rtl_uStringbuffer_insert(pBuffer, pCapacity, (*pBuffer)->length, &cChar, 1);
  78 }
  79
  80 enum EscapeType
  81 {
  82     EscapeNo,
  83     EscapeChar,
  84     EscapeOctet
  85 };
  86
  87 /* Read any of the following:
  88
  89    - sequence of escape sequences representing character from eCharset,
  90      translated to single UCS4 character; or
  91
  92    - pair of UTF-16 surrogates, translated to single UCS4 character; or
  93
  94    _ single UTF-16 character, extended to UCS4 character.
  95  */
  96 sal_uInt32 readUcs4(sal_Unicode const ** pBegin, sal_Unicode const * pEnd,
  97                     bool bEncoded, rtl_TextEncoding eCharset,
  98                     EscapeType * pType)
  99 {
 100     sal_uInt32 nChar = *(*pBegin)++;
 101     int nWeight1;
 102     int nWeight2;
 103     if (nChar == cEscapePrefix && bEncoded && pEnd - *pBegin >= 2
 104         && (nWeight1 = getHexWeight((*pBegin)[0])) >= 0
 105         && (nWeight2 = getHexWeight((*pBegin)[1])) >= 0)
 106     {
 107         *pBegin += 2;
 108         nChar = static_cast< sal_uInt32 >(nWeight1 << 4 | nWeight2);
 109         if (nChar <= 0x7F)
 110             *pType = EscapeChar;
 111         else if (eCharset == RTL_TEXTENCODING_UTF8)
 112         {
 113             if (nChar >= 0xC0 && nChar <= 0xF4)
 114             {
 115                 sal_uInt32 nEncoded;
 116                 int nShift;
 117                 sal_uInt32 nMin;
 118                 if (nChar <= 0xDF)
 119                 {
 120                     nEncoded = (nChar & 0x1F) << 6;
 121                     nShift = 0;
 122                     nMin = 0x80;
 123                 }
 124                 else if (nChar <= 0xEF)
 125                 {
 126                     nEncoded = (nChar & 0x0F) << 12;
 127                     nShift = 6;
 128                     nMin = 0x800;
 129                 }
 130                 else
 131                 {
 132                     nEncoded = (nChar & 0x07) << 18;
 133                     nShift = 12;
 134                     nMin = 0x10000;
 135                 }
 136                 sal_Unicode const * p = *pBegin;
 137                 bool bUTF8 = true;
 138                 for (; nShift >= 0; nShift -= 6)
 139                 {
 140                     if (pEnd - p < 3 || p[0] != cEscapePrefix
 141                         || (nWeight1 = getHexWeight(p[1])) < 8
 142                         || nWeight1 > 11
 143                         || (nWeight2 = getHexWeight(p[2])) < 0)
 144                     {
 145                         bUTF8 = sal_False;
 146                         break;
 147                     }
 148                     p += 3;
 149                     nEncoded |= ((nWeight1 & 3) << 4 | nWeight2) << nShift;
 150                 }
 151                 if (bUTF8 && nEncoded >= nMin && !isHighSurrogate(nEncoded)
 152                     && !isLowSurrogate(nEncoded) && nEncoded <= 0x10FFFF)
 153                 {
 154                     *pBegin = p;
 155                     *pType = EscapeChar;
 156                     return nEncoded;
 157                 }
 158             }
 159             *pType = EscapeOctet;
 160         }
 161         else
 162         {
 163             rtl::OStringBuffer aBuf;
 164             aBuf.append(static_cast< char >(nChar));
 165             rtl_TextToUnicodeConverter aConverter
 166                 = rtl_createTextToUnicodeConverter(eCharset);
 167             sal_Unicode const * p = *pBegin;
 168             for (;;)
 169             {
 170                 sal_Unicode aDst[2];
 171                 sal_uInt32 nInfo;
 172                 sal_Size nConverted;
 173                 sal_Size nDstSize = rtl_convertTextToUnicode(
 174                     aConverter, 0, aBuf.getStr(), aBuf.getLength(), aDst,
 175                     SAL_N_ELEMENTS( aDst ),
 176                     (RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_ERROR
 177                      | RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_ERROR
 178                      | RTL_TEXTTOUNICODE_FLAGS_INVALID_ERROR),
 179                     &nInfo, &nConverted);
 180                 if (nInfo == 0)
 181                 {
 182                     assert( nConverted
 183                         == sal::static_int_cast< sal_uInt32 >(
 184                             aBuf.getLength()));
 185                     rtl_destroyTextToUnicodeConverter(aConverter);
 186                     *pBegin = p;
 187                     *pType = EscapeChar;
 188                     assert( nDstSize == 1
 189                         || (nDstSize == 2 && isHighSurrogate(aDst[0])
 190                             && isLowSurrogate(aDst[1])));
 191                     return nDstSize == 1
 192                         ? aDst[0] : combineSurrogates(aDst[0], aDst[1]);
 193                 }
 194                 else if (nInfo == RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOSMALL
 195                          && pEnd - p >= 3 && p[0] == cEscapePrefix
 196                          && (nWeight1 = getHexWeight(p[1])) >= 0
 197                          && (nWeight2 = getHexWeight(p[2])) >= 0)
 198                 {
 199                     p += 3;
 200                     aBuf.append(static_cast< char >(nWeight1 << 4 | nWeight2));
 201                 }
 202                 else if (nInfo == RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOSMALL
 203                          && p != pEnd && *p <= 0x7F)
 204                 {
 205                     aBuf.append(static_cast< char >(*p++));
 206                 }
 207                 else
 208                 {
 209                     assert(
 210                         (nInfo & RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL)
 211                         == 0);
 212                     break;
 213                 }
 214             }
 215             rtl_destroyTextToUnicodeConverter(aConverter);
 216             *pType = EscapeOctet;
 217         }
 218         return nChar;
 219     }
 220     else
 221     {
 222         *pType = EscapeNo;
 223         return isHighSurrogate(nChar) && *pBegin < pEnd
 224                && isLowSurrogate(**pBegin) ?
 225                    combineSurrogates(nChar, *(*pBegin)++) : nChar;
 226     }
 227 }
 228
 229 void writeUcs4(rtl_uString ** pBuffer, sal_Int32 * pCapacity, sal_uInt32 nUtf32)
 230 {
 231     assert(nUtf32 <= 0x10FFFF); // bad UTF-32 char
 232     if (nUtf32 <= 0xFFFF) {
 233         writeUnicode(
 234             pBuffer, pCapacity, static_cast< sal_Unicode >(nUtf32));
 235     } else {
 236         nUtf32 -= 0x10000;
 237         writeUnicode(
 238             pBuffer, pCapacity,
 239             static_cast< sal_Unicode >(nUtf32 >> 10 | 0xD800));
 240         writeUnicode(
 241             pBuffer, pCapacity,
 242             static_cast< sal_Unicode >((nUtf32 & 0x3FF) | 0xDC00));
 243     }
 244 }
 245
 246 void writeEscapeOctet(rtl_uString ** pBuffer, sal_Int32 * pCapacity,
 247                       sal_uInt32 nOctet)
 248 {
 249     assert(nOctet <= 0xFF); // bad octet
 250
 251     static sal_Unicode const aHex[16]
 252         = { 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39,
 253             0x41, 0x42, 0x43, 0x44, 0x45, 0x46 }; /* '0'--'9', 'A'--'F' */
 254
 255     writeUnicode(pBuffer, pCapacity, cEscapePrefix);
 256     writeUnicode(pBuffer, pCapacity, aHex[nOctet >> 4]);
 257     writeUnicode(pBuffer, pCapacity, aHex[nOctet & 15]);
 258 }
 259
 260 bool writeEscapeChar(rtl_uString ** pBuffer, sal_Int32 * pCapacity,
 261                      sal_uInt32 nUtf32, rtl_TextEncoding eCharset, bool bStrict)
 262 {
 263     assert(nUtf32 <= 0x10FFFF); // bad UTF-32 char
 264     if (eCharset == RTL_TEXTENCODING_UTF8) {
 265         if (nUtf32 < 0x80)
 266             writeEscapeOctet(pBuffer, pCapacity, nUtf32);
 267         else if (nUtf32 < 0x800)
 268         {
 269             writeEscapeOctet(pBuffer, pCapacity, nUtf32 >> 6 | 0xC0);
 270             writeEscapeOctet(pBuffer, pCapacity, (nUtf32 & 0x3F) | 0x80);
 271         }
 272         else if (nUtf32 < 0x10000)
 273         {
 274             writeEscapeOctet(pBuffer, pCapacity, nUtf32 >> 12 | 0xE0);
 275             writeEscapeOctet(pBuffer, pCapacity, (nUtf32 >> 6 & 0x3F) | 0x80);
 276             writeEscapeOctet(pBuffer, pCapacity, (nUtf32 & 0x3F) | 0x80);
 277         }
 278         else
 279         {
 280             writeEscapeOctet(pBuffer, pCapacity, nUtf32 >> 18 | 0xF0);
 281             writeEscapeOctet(pBuffer, pCapacity, (nUtf32 >> 12 & 0x3F) | 0x80);
 282             writeEscapeOctet(pBuffer, pCapacity, (nUtf32 >> 6 & 0x3F) | 0x80);
 283             writeEscapeOctet(pBuffer, pCapacity, (nUtf32 & 0x3F) | 0x80);
 284         }
 285     } else {
 286         rtl_UnicodeToTextConverter aConverter
 287             = rtl_createUnicodeToTextConverter(eCharset);
 288         sal_Unicode aSrc[2];
 289         sal_Size nSrcSize;
 290         if (nUtf32 <= 0xFFFF)
 291         {
 292             aSrc[0] = static_cast< sal_Unicode >(nUtf32);
 293             nSrcSize = 1;
 294         }
 295         else
 296         {
 297             aSrc[0] = static_cast< sal_Unicode >(
 298                 ((nUtf32 - 0x10000) >> 10) | 0xD800);
 299             aSrc[1] = static_cast< sal_Unicode >(
 300                 ((nUtf32 - 0x10000) & 0x3FF) | 0xDC00);
 301             nSrcSize = 2;
 302         }
 303         sal_Char aDst[32]; // FIXME  random value
 304         sal_uInt32 nInfo;
 305         sal_Size nConverted;
 306         sal_Size nDstSize = rtl_convertUnicodeToText(
 307             aConverter, 0, aSrc, nSrcSize, aDst, sizeof aDst,
 308             RTL_UNICODETOTEXT_FLAGS_UNDEFINED_ERROR
 309             | RTL_UNICODETOTEXT_FLAGS_INVALID_ERROR
 310             | RTL_UNICODETOTEXT_FLAGS_FLUSH,
 311             &nInfo, &nConverted);
 312         assert((nInfo & RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL) == 0);
 313         rtl_destroyUnicodeToTextConverter(aConverter);
 314         if (nInfo == 0) {
 315             assert(nConverted == nSrcSize); // bad rtl_convertUnicodeToText
 316             for (sal_Size i = 0; i < nDstSize; ++i)
 317                 writeEscapeOctet(pBuffer, pCapacity,
 318                                  static_cast< unsigned char >(aDst[i]));
 319                     // FIXME  all octets are escaped, even if there is no need
 320         } else {
 321             if (bStrict) {
 322                 return false;
 323             } else {
 324                 writeUcs4(pBuffer, pCapacity, nUtf32);
 325             }
 326         }
 327     }
 328     return true;
 329 }
 330
 331 struct Component
 332 {
 333     sal_Unicode const * pBegin;
 334     sal_Unicode const * pEnd;
 335
 336     inline Component(): pBegin(0), pEnd(0) {}
 337
 338     inline bool isPresent() const { return pBegin != 0; }
 339
 340     inline sal_Int32 getLength() const;
 341 };
 342
 343 inline sal_Int32 Component::getLength() const
 344 {
 345     assert(isPresent()); // taking length of non-present component
 346     return static_cast< sal_Int32 >(pEnd - pBegin);
 347 }
 348
 349 struct Components
 350 {
 351     Component aScheme;
 352     Component aAuthority;
 353     Component aPath;
 354     Component aQuery;
 355     Component aFragment;
 356 };
 357
 358 void parseUriRef(rtl_uString const * pUriRef, Components * pComponents)
 359 {
 360     // This algorithm is liberal and accepts various forms of illegal input.
 361
 362     sal_Unicode const * pBegin = pUriRef->buffer;
 363     sal_Unicode const * pEnd = pBegin + pUriRef->length;
 364     sal_Unicode const * pPos = pBegin;
 365
 366     if (pPos != pEnd && rtl::isAsciiAlpha(*pPos))
 367     {
 368         for (sal_Unicode const * p = pPos + 1; p != pEnd; ++p)
 369         {
 370             if (*p == ':')
 371             {
 372                 pComponents->aScheme.pBegin = pBegin;
 373                 pComponents->aScheme.pEnd = ++p;
 374                 pPos = p;
 375                 break;
 376             }
 377             else if (!rtl::isAsciiAlphanumeric(*p) && *p != '+' && *p != '-'
 378                      && *p != '.')
 379             {
 380                 break;
 381             }
 382         }
 383     }
 384
 385     if (pEnd - pPos >= 2 && pPos[0] == '/' && pPos[1] == '/')
 386     {
 387         pComponents->aAuthority.pBegin = pPos;
 388         pPos += 2;
 389         while (pPos != pEnd && *pPos != '/' && *pPos != '?' && *pPos != '#')
 390             ++pPos;
 391         pComponents->aAuthority.pEnd = pPos;
 392     }
 393
 394     pComponents->aPath.pBegin = pPos;
 395     while (pPos != pEnd && *pPos != '?' && * pPos != '#')
 396         ++pPos;
 397     pComponents->aPath.pEnd = pPos;
 398
 399     if (pPos != pEnd && *pPos == '?')
 400     {
 401         pComponents->aQuery.pBegin = pPos++;
 402         while (pPos != pEnd && * pPos != '#')
 403             ++pPos;
 404         pComponents->aQuery.pEnd = pPos;
 405     }
 406
 407     if (pPos != pEnd)
 408     {
 409         assert(*pPos == '#');
 410         pComponents->aFragment.pBegin = pPos;
 411         pComponents->aFragment.pEnd = pEnd;
 412     }
 413 }
 414
 415 rtl::OUString joinPaths(Component const & rBasePath, Component const & rRelPath)
 416 {
 417     assert(rBasePath.isPresent() && *rBasePath.pBegin == '/');
 418     assert(rRelPath.isPresent());
 419
 420     // The invariant of aBuffer is that it always starts and ends with a slash
 421     // (until probably right at the end of the algorithm, when the last segment
 422     // of rRelPath is added, which does not necessarily end in a slash):
 423     rtl::OUStringBuffer aBuffer(rBasePath.getLength() + rRelPath.getLength());
 424         // XXX  numeric overflow
 425
 426     // Segments "." and ".." within rBasePath are not conisdered special (but
 427     // are also not removed by ".." segments within rRelPath), RFC 2396 seems a
 428     // bit unclear about this point:
 429     sal_Int32 nFixed = 1;
 430     sal_Unicode const * p = rBasePath.pBegin + 1;
 431     for (sal_Unicode const * q = p; q != rBasePath.pEnd; ++q)
 432         if (*q == '/')
 433         {
 434             if (
 435                 (q - p == 1 && p[0] == '.') ||
 436                 (q - p == 2 && p[0] == '.' && p[1] == '.')
 437                )
 438             {
 439                 nFixed = q + 1 - rBasePath.pBegin;
 440             }
 441             p = q + 1;
 442         }
 443     aBuffer.append(rBasePath.pBegin, p - rBasePath.pBegin);
 444
 445     p = rRelPath.pBegin;
 446     if (p != rRelPath.pEnd)
 447         for (;;)
 448         {
 449             sal_Unicode const * q = p;
 450             sal_Unicode const * r;
 451             for (;;)
 452             {
 453                 if (q == rRelPath.pEnd)
 454                 {
 455                     r = q;
 456                     break;
 457                 }
 458                 if (*q == '/')
 459                 {
 460                     r = q + 1;
 461                     break;
 462                 }
 463                 ++q;
 464             }
 465             if (q - p == 2 && p[0] == '.' && p[1] == '.')
 466             {
 467                 // Erroneous excess segments ".." within rRelPath are left
 468                 // intact, as the examples in RFC 2396, section C.2, suggest:
 469                 sal_Int32 i = aBuffer.getLength() - 1;
 470                 if (i < nFixed)
 471                 {
 472                     aBuffer.append(p, r - p);
 473                     nFixed += 3;
 474                 }
 475                 else
 476                 {
 477                     while (i > 0 && aBuffer[i - 1] != '/')
 478                         --i;
 479                     aBuffer.setLength(i);
 480                 }
 481             }
 482             else if (q - p != 1 || *p != '.')
 483                 aBuffer.append(p, r - p);
 484             if (q == rRelPath.pEnd)
 485                 break;
 486             p = q + 1;
 487         }
 488
 489     return aBuffer.makeStringAndClear();
 490 }
 491
 492 }
 493
 494 sal_Bool const * SAL_CALL rtl_getUriCharClass(rtl_UriCharClass eCharClass)
 495     SAL_THROW_EXTERN_C()
 496 {
 497     static sal_Bool const aCharClass[][nCharClassSize]
 498     = {{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* None */
 499          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 500          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* !"#$%&'()*+,-./*/
 501          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /*0123456789:;<=>?*/
 502          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /*@ABCDEFGHIJKLMNO*/
 503          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /*PQRSTUVWXYZ[\]^_*/
 504          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /*`abcdefghijklmno*/
 505          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /*pqrstuvwxyz{|}~ */
 506        },
 507        { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* Uric */
 508          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 509          0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* !"#$%&'()*+,-./*/
 510          1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, /*0123456789:;<=>?*/
 511          1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /*@ABCDEFGHIJKLMNO*/
 512          1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, /*PQRSTUVWXYZ[\]^_*/
 513          0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /*`abcdefghijklmno*/
 514          1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0  /*pqrstuvwxyz{|}~ */
 515        },
 516        { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* UricNoSlash */
 517          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 518          0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, /* !"#$%&'()*+,-./*/
 519          1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, /*0123456789:;<=>?*/
 520          1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /*@ABCDEFGHIJKLMNO*/
 521          1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, /*PQRSTUVWXYZ[\]^_*/
 522          0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /*`abcdefghijklmno*/
 523          1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0  /*pqrstuvwxyz{|}~ */
 524        },
 525        { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* RelSegment */
 526          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 527          0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, /* !"#$%&'()*+,-./*/
 528          1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, /*0123456789:;<=>?*/
 529          1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /*@ABCDEFGHIJKLMNO*/
 530          1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, /*PQRSTUVWXYZ[\]^_*/
 531          0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /*`abcdefghijklmno*/
 532          1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0  /*pqrstuvwxyz{|}~ */
 533        },
 534        { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* RegName */
 535          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 536          0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, /* !"#$%&'()*+,-./*/
 537          1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, /*0123456789:;<=>?*/
 538          1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /*@ABCDEFGHIJKLMNO*/
 539          1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, /*PQRSTUVWXYZ[\]^_*/
 540          0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /*`abcdefghijklmno*/
 541          1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0  /*pqrstuvwxyz{|}~ */
 542        },
 543        { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* Userinfo */
 544          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 545          0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, /* !"#$%&'()*+,-./*/
 546          1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, /*0123456789:;<=>?*/
 547          0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /*@ABCDEFGHIJKLMNO*/
 548          1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, /*PQRSTUVWXYZ[\]^_*/
 549          0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /*`abcdefghijklmno*/
 550          1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0  /*pqrstuvwxyz{|}~ */
 551        },
 552        { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* Pchar */
 553          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 554          0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, /* !"#$%&'()*+,-./*/
 555          1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, /*0123456789:;<=>?*/
 556          1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /*@ABCDEFGHIJKLMNO*/
 557          1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, /*PQRSTUVWXYZ[\]^_*/
 558          0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /*`abcdefghijklmno*/
 559          1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0  /*pqrstuvwxyz{|}~ */
 560        },
 561        { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* UnoParamValue */
 562          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 563          0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, /* !"#$%&'()*+,-./*/
 564          1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, /*0123456789:;<=>?*/
 565          1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /*@ABCDEFGHIJKLMNO*/
 566          1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, /*PQRSTUVWXYZ[\]^_*/
 567          0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /*`abcdefghijklmno*/
 568          1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0  /*pqrstuvwxyz{|}~ */
 569        }};
 570     assert(
 571         (eCharClass >= 0
 572          && (sal::static_int_cast< std::size_t >(eCharClass)
 573              < SAL_N_ELEMENTS(aCharClass)))); // bad eCharClass
 574     return aCharClass[eCharClass];
 575 }
 576
 577 void SAL_CALL rtl_uriEncode(rtl_uString * pText, sal_Bool const * pCharClass,
 578                             rtl_UriEncodeMechanism eMechanism,
 579                             rtl_TextEncoding eCharset, rtl_uString ** pResult)
 580     SAL_THROW_EXTERN_C()
 581 {
 582     assert(!pCharClass[0x25]); // make sure the percent sign is encoded...
 583
 584     sal_Unicode const * p = pText->buffer;
 585     sal_Unicode const * pEnd = p + pText->length;
 586     sal_Int32 nCapacity = pText->length;
 587     rtl_uString_new_WithLength(pResult, nCapacity);
 588     while (p < pEnd)
 589     {
 590         EscapeType eType;
 591         sal_uInt32 nUtf32 = readUcs4(
 592             &p, pEnd,
 593             (eMechanism == rtl_UriEncodeKeepEscapes
 594              || eMechanism == rtl_UriEncodeCheckEscapes
 595              || eMechanism == rtl_UriEncodeStrictKeepEscapes),
 596             eCharset, &eType);
 597         switch (eType)
 598         {
 599         case EscapeNo:
 600             if (isValid(pCharClass, nUtf32)) // implies nUtf32 <= 0x7F
 601                 writeUnicode(pResult, &nCapacity,
 602                              static_cast< sal_Unicode >(nUtf32));
 603             else if (!writeEscapeChar(
 604                          pResult, &nCapacity, nUtf32, eCharset,
 605                          (eMechanism == rtl_UriEncodeStrict
 606                           || eMechanism == rtl_UriEncodeStrictKeepEscapes)))
 607             {
 608                 rtl_uString_new(pResult);
 609                 return;
 610             }
 611             break;
 612
 613         case EscapeChar:
 614             if (eMechanism == rtl_UriEncodeCheckEscapes
 615                 && isValid(pCharClass, nUtf32)) // implies nUtf32 <= 0x7F
 616                 writeUnicode(pResult, &nCapacity,
 617                              static_cast< sal_Unicode >(nUtf32));
 618             else if (!writeEscapeChar(
 619                          pResult, &nCapacity, nUtf32, eCharset,
 620                          (eMechanism == rtl_UriEncodeStrict
 621                           || eMechanism == rtl_UriEncodeStrictKeepEscapes)))
 622             {
 623                 rtl_uString_new(pResult);
 624                 return;
 625             }
 626             break;
 627
 628         case EscapeOctet:
 629             writeEscapeOctet(pResult, &nCapacity, nUtf32);
 630             break;
 631         }
 632     }
 633     *pResult = rtl_uStringBuffer_makeStringAndClear( pResult, &nCapacity );
 634 }
 635
 636 void SAL_CALL rtl_uriDecode(rtl_uString * pText,
 637                             rtl_UriDecodeMechanism eMechanism,
 638                             rtl_TextEncoding eCharset, rtl_uString ** pResult)
 639     SAL_THROW_EXTERN_C()
 640 {
 641     switch (eMechanism)
 642     {
 643     case rtl_UriDecodeNone:
 644         rtl_uString_assign(pResult, pText);
 645         break;
 646
 647     case rtl_UriDecodeToIuri:
 648         eCharset = RTL_TEXTENCODING_UTF8;
 649     default: // rtl_UriDecodeWithCharset, rtl_UriDecodeStrict
 650         {
 651             sal_Unicode const * p = pText->buffer;
 652             sal_Unicode const * pEnd = p + pText->length;
 653             sal_Int32 nCapacity = pText->length;
 654             rtl_uString_new_WithLength(pResult, nCapacity);
 655             while (p < pEnd)
 656             {
 657                 EscapeType eType;
 658                 sal_uInt32 nUtf32 = readUcs4(&p, pEnd, true, eCharset, &eType);
 659                 switch (eType)
 660                 {
 661                 case EscapeChar:
 662                     if (nUtf32 <= 0x7F && eMechanism == rtl_UriDecodeToIuri)
 663                     {
 664                         writeEscapeOctet(pResult, &nCapacity, nUtf32);
 665                         break;
 666                     }
 667                 case EscapeNo:
 668                     writeUcs4(pResult, &nCapacity, nUtf32);
 669                     break;
 670
 671                 case EscapeOctet:
 672                     if (eMechanism == rtl_UriDecodeStrict) {
 673                         rtl_uString_new(pResult);
 674                         return;
 675                     }
 676                     writeEscapeOctet(pResult, &nCapacity, nUtf32);
 677                     break;
 678                 }
 679             }
 680             *pResult = rtl_uStringBuffer_makeStringAndClear( pResult, &nCapacity );
 681         }
 682         break;
 683     }
 684 }
 685
 686 sal_Bool SAL_CALL rtl_uriConvertRelToAbs(rtl_uString * pBaseUriRef,
 687                                          rtl_uString * pRelUriRef,
 688                                          rtl_uString ** pResult,
 689                                          rtl_uString ** pException)
 690     SAL_THROW_EXTERN_C()
 691 {
 692     // If pRelUriRef starts with a scheme component it is an absolute URI
 693     // reference, and we are done (i.e., this algorithm does not support
 694     // backwards-compatible relative URIs starting with a scheme component, see
 695     // RFC 2396, section 5.2, step 3):
 696     Components aRelComponents;
 697     parseUriRef(pRelUriRef, &aRelComponents);
 698     if (aRelComponents.aScheme.isPresent())
 699     {
 700         rtl_uString_assign(pResult, pRelUriRef);
 701         return true;
 702     }
 703
 704     // Parse pBaseUriRef; if the scheme component is not present or not valid,
 705     // or the path component is not empty and starts with anything but a slash,
 706     // an exception is raised:
 707     Components aBaseComponents;
 708     parseUriRef(pBaseUriRef, &aBaseComponents);
 709     if (!aBaseComponents.aScheme.isPresent())
 710     {
 711         rtl::OUString aMessage(pBaseUriRef);
 712         aMessage += rtl::OUString(
 713                             " does not start with a scheme component");
 714         rtl_uString_assign(pException,
 715                            const_cast< rtl::OUString & >(aMessage).pData);
 716         return false;
 717     }
 718     if (aBaseComponents.aPath.pBegin != aBaseComponents.aPath.pEnd
 719         && *aBaseComponents.aPath.pBegin != '/')
 720     {
 721         rtl::OUString aMessage(pBaseUriRef);
 722         aMessage += rtl::OUString(
 723                             "path component does not start with slash");
 724         rtl_uString_assign(pException, aMessage.pData);
 725         return false;
 726     }
 727
 728     // Use the algorithm from RFC 2396, section 5.2, to turn the relative URI
 729     // into an absolute one (if the relative URI is a reference to the "current
 730     // document," the "current document" is here taken to be the base URI):
 731     rtl::OUStringBuffer aBuffer;
 732     aBuffer.append(aBaseComponents.aScheme.pBegin,
 733                    aBaseComponents.aScheme.getLength());
 734     if (aRelComponents.aAuthority.isPresent())
 735     {
 736         aBuffer.append(aRelComponents.aAuthority.pBegin,
 737                        aRelComponents.aAuthority.getLength());
 738         aBuffer.append(aRelComponents.aPath.pBegin,
 739                        aRelComponents.aPath.getLength());
 740         if (aRelComponents.aQuery.isPresent())
 741             aBuffer.append(aRelComponents.aQuery.pBegin,
 742                            aRelComponents.aQuery.getLength());
 743     }
 744     else
 745     {
 746         if (aBaseComponents.aAuthority.isPresent())
 747             aBuffer.append(aBaseComponents.aAuthority.pBegin,
 748                            aBaseComponents.aAuthority.getLength());
 749         if (aRelComponents.aPath.pBegin == aRelComponents.aPath.pEnd
 750             && !aRelComponents.aQuery.isPresent())
 751         {
 752             aBuffer.append(aBaseComponents.aPath.pBegin,
 753                            aBaseComponents.aPath.getLength());
 754             if (aBaseComponents.aQuery.isPresent())
 755                 aBuffer.append(aBaseComponents.aQuery.pBegin,
 756                                aBaseComponents.aQuery.getLength());
 757         }
 758         else
 759         {
 760             if (*aRelComponents.aPath.pBegin == '/')
 761                 aBuffer.append(aRelComponents.aPath.pBegin,
 762                                aRelComponents.aPath.getLength());
 763             else
 764                 aBuffer.append(joinPaths(aBaseComponents.aPath,
 765                                          aRelComponents.aPath));
 766             if (aRelComponents.aQuery.isPresent())
 767                 aBuffer.append(aRelComponents.aQuery.pBegin,
 768                                aRelComponents.aQuery.getLength());
 769         }
 770     }
 771     if (aRelComponents.aFragment.isPresent())
 772         aBuffer.append(aRelComponents.aFragment.pBegin,
 773                        aRelComponents.aFragment.getLength());
 774     rtl_uString_assign(pResult, aBuffer.makeStringAndClear().pData);
 775     return true;
 776 }
 777
 778 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */