sal/rtl/source/uri.cxx

   1 /*************************************************************************
   2  *
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * Copyright 2000, 2010 Oracle and/or its affiliates.
   6  *
   7  * OpenOffice.org - a multi-platform office productivity suite
   8  *
   9  * This file is part of OpenOffice.org.
  10  *
  11  * OpenOffice.org is free software: you can redistribute it and/or modify
  12  * it under the terms of the GNU Lesser General Public License version 3
  13  * only, as published by the Free Software Foundation.
  14  *
  15  * OpenOffice.org is distributed in the hope that it will be useful,
  16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  18  * GNU Lesser General Public License version 3 for more details
  19  * (a copy is included in the LICENSE file that accompanied this code).
  20  *
  21  * You should have received a copy of the GNU Lesser General Public License
  22  * version 3 along with OpenOffice.org.  If not, see
  23  * <http://www.openoffice.org/license.html>
  24  * for a copy of the LGPLv3 License.
  25  *
  26  ************************************************************************/
  27
  28 // MARKER(update_precomp.py): autogen include statement, do not remove
  29 #include "precompiled_sal.hxx"
  30
  31 #include "rtl/uri.h"
  32
  33 #include "surrogates.h"
  34
  35 #include "osl/diagnose.h"
  36 #include "rtl/strbuf.hxx"
  37 #include "rtl/textenc.h"
  38 #include "rtl/textcvt.h"
  39 #include "rtl/uri.h"
  40 #include "rtl/ustrbuf.h"
  41 #include "rtl/ustrbuf.hxx"
  42 #include "rtl/ustring.h"
  43 #include "rtl/ustring.hxx"
  44 #include "sal/types.h"
  45
  46 #include <cstddef>
  47
  48 namespace {
  49
// Number of entries in a character class table: one flag per code point below
// this bound (isValid treats every code point at or above it as not valid).
std::size_t const nCharClassSize = 128;

// The character that introduces a "%XX" escape sequence.
sal_Unicode const cEscapePrefix = 0x25; // '%'
  53
  54 inline bool isDigit(sal_uInt32 nUtf32)
  55 {
  56     return nUtf32 >= 0x30 && nUtf32 <= 0x39; // '0'--'9'
  57 }
  58
  59 inline bool isAlpha(sal_uInt32 nUtf32)
  60 {
  61     // 'A'--'Z', 'a'--'z'
  62     return (
  63             (nUtf32 >= 0x41 && nUtf32 <= 0x5A) ||
  64             (nUtf32 >= 0x61 && nUtf32 <= 0x7A)
  65            );
  66 }
  67
// Typed wrapper around the SAL_RTL_IS_HIGH_SURROGATE macro (from
// surrogates.h): tests whether the UTF-16 code unit nUtf16 is a high
// surrogate.
inline bool isHighSurrogate(sal_uInt32 nUtf16)
{
    return SAL_RTL_IS_HIGH_SURROGATE(nUtf16);
}
  72
// Typed wrapper around the SAL_RTL_IS_LOW_SURROGATE macro (from
// surrogates.h): tests whether the UTF-16 code unit nUtf16 is a low
// surrogate.
inline bool isLowSurrogate(sal_uInt32 nUtf16)
{
    return SAL_RTL_IS_LOW_SURROGATE(nUtf16);
}
  77
// Typed wrapper around the SAL_RTL_COMBINE_SURROGATES macro (from
// surrogates.h): combines a high/low surrogate pair into one code point.
// Callers in this file only pass pairs already checked with isHighSurrogate
// and isLowSurrogate.
inline sal_uInt32 combineSurrogates(sal_uInt32 high, sal_uInt32 low)
{
    return SAL_RTL_COMBINE_SURROGATES(high, low);
}
  82
  83 inline int getHexWeight(sal_uInt32 nUtf32)
  84 {
  85     return nUtf32 >= 0x30 && nUtf32 <= 0x39 ? // '0'--'9'
  86                static_cast< int >(nUtf32 - 0x30) :
  87            nUtf32 >= 0x41 && nUtf32 <= 0x46 ? // 'A'--'F'
  88                static_cast< int >(nUtf32 - 0x41 + 10) :
  89            nUtf32 >= 0x61 && nUtf32 <= 0x66 ? // 'a'--'f'
  90                static_cast< int >(nUtf32 - 0x61 + 10) :
  91                -1; // not a hex digit
  92 }
  93
  94 inline bool isValid(sal_Bool const * pCharClass, sal_uInt32 nUtf32)
  95 {
  96     return nUtf32 < nCharClassSize && pCharClass[nUtf32];
  97 }
  98
// Appends the single UTF-16 code unit cChar at the end of the string buffer
// *pBuffer (inserting at its current length); pCapacity is the capacity
// bookkeeping passed through to rtl_uStringbuffer_insert.
inline void writeUnicode(rtl_uString ** pBuffer, sal_Int32 * pCapacity,
                         sal_Unicode cChar)
{
    rtl_uStringbuffer_insert(pBuffer, pCapacity, (*pBuffer)->length, &cChar, 1);
}
 104
// Classification, reported by readUcs4, of what was consumed from the input.
enum EscapeType
{
    EscapeNo,    // plain input: a single UTF-16 unit or a surrogate pair
    EscapeChar,  // one or more "%XX" escapes decoded to a single character
    EscapeOctet  // one "%XX" escape that could not be decoded to a character
};
 111
/* Read any of the following:

   - sequence of escape sequences representing character from eCharset,
     translated to single UCS4 character; or

   - pair of UTF-16 surrogates, translated to single UCS4 character; or

   - single UTF-16 character, extended to UCS4 character.

   *pBegin is advanced past everything that was consumed.  The first code unit
   is read unconditionally, so the caller must ensure *pBegin < pEnd.

   Escape sequences are only recognized if bEncoded is true.  *pType reports
   what was read:

   - EscapeNo:  a literal character (or surrogate pair); the character is
     returned.

   - EscapeChar:  escape sequence(s) that decode to one character; the
     character is returned.

   - EscapeOctet:  a single "%XX" escape with a value above 0x7F that could not
     be decoded as (part of) a character in eCharset; only that one escape is
     consumed and its octet value is returned.
 */
sal_uInt32 readUcs4(sal_Unicode const ** pBegin, sal_Unicode const * pEnd,
                    bool bEncoded, rtl_TextEncoding eCharset,
                    EscapeType * pType)
{
    sal_uInt32 nChar = *(*pBegin)++;
    int nWeight1;
    int nWeight2;
    // A '%' followed by two hex digits starts an escape sequence:
    if (nChar == cEscapePrefix && bEncoded && pEnd - *pBegin >= 2
        && (nWeight1 = getHexWeight((*pBegin)[0])) >= 0
        && (nWeight2 = getHexWeight((*pBegin)[1])) >= 0)
    {
        *pBegin += 2;
        nChar = static_cast< sal_uInt32 >(nWeight1 << 4 | nWeight2);
        if (nChar <= 0x7F)
            // Octets up to 0x7F are taken as characters in any eCharset:
            *pType = EscapeChar;
        else if (eCharset == RTL_TEXTENCODING_UTF8)
        {
            // Hand-rolled UTF-8 decoding; only lead octets 0xC0--0xF4 can
            // start a multi-octet sequence here:
            if (nChar >= 0xC0 && nChar <= 0xF4)
            {
                sal_uInt32 nEncoded; // bits contributed by the lead octet
                int nShift; // shift of the first continuation octet, minus 6
                sal_uInt32 nMin; // smallest value not overlong for this length
                if (nChar <= 0xDF)
                {
                    // two-octet sequence
                    nEncoded = (nChar & 0x1F) << 6;
                    nShift = 0;
                    nMin = 0x80;
                }
                else if (nChar <= 0xEF)
                {
                    // three-octet sequence
                    nEncoded = (nChar & 0x0F) << 12;
                    nShift = 6;
                    nMin = 0x800;
                }
                else
                {
                    // four-octet sequence
                    nEncoded = (nChar & 0x07) << 18;
                    nShift = 12;
                    nMin = 0x10000;
                }
                // Look ahead with p; *pBegin is only advanced on success:
                sal_Unicode const * p = *pBegin;
                bool bUTF8 = true;
                for (; nShift >= 0; nShift -= 6)
                {
                    // Each continuation octet must be an escape "%XY" whose
                    // high nibble X is 8--B (i.e., an octet 0x80--0xBF):
                    if (pEnd - p < 3 || p[0] != cEscapePrefix
                        || (nWeight1 = getHexWeight(p[1])) < 8
                        || nWeight1 > 11
                        || (nWeight2 = getHexWeight(p[2])) < 0)
                    {
                        bUTF8 = sal_False;
                        break;
                    }
                    p += 3;
                    nEncoded |= ((nWeight1 & 3) << 4 | nWeight2) << nShift;
                }
                // Reject overlong encodings, surrogates, and values beyond
                // U+10FFFF:
                if (bUTF8 && nEncoded >= nMin && !isHighSurrogate(nEncoded)
                    && !isLowSurrogate(nEncoded) && nEncoded <= 0x10FFFF)
                {
                    *pBegin = p;
                    *pType = EscapeChar;
                    return nEncoded;
                }
            }
            // Not valid UTF-8: report just the first escaped octet.
            *pType = EscapeOctet;
        }
        else
        {
            // Any other charset: collect octets in aBuf and let the text
            // converter decide when they form a complete character.
            rtl::OStringBuffer aBuf;
            aBuf.append(static_cast< char >(nChar));
            rtl_TextToUnicodeConverter aConverter
                = rtl_createTextToUnicodeConverter(eCharset);
            // Look ahead with p; *pBegin is only advanced on success:
            sal_Unicode const * p = *pBegin;
            for (;;)
            {
                sal_Unicode aDst[2];
                sal_uInt32 nInfo;
                sal_Size nConverted;
                // Convert everything collected so far, from scratch (no
                // conversion context is passed):
                sal_Size nDstSize = rtl_convertTextToUnicode(
                    aConverter, 0, aBuf.getStr(), aBuf.getLength(), aDst,
                    sizeof aDst / sizeof aDst[0],
                    (RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_ERROR
                     | RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_ERROR
                     | RTL_TEXTTOUNICODE_FLAGS_INVALID_ERROR),
                    &nInfo, &nConverted);
                if (nInfo == 0)
                {
                    // Success: the collected octets form one character.
                    OSL_ASSERT(
                        nConverted
                        == sal::static_int_cast< sal_uInt32 >(
                            aBuf.getLength()));
                    rtl_destroyTextToUnicodeConverter(aConverter);
                    *pBegin = p;
                    *pType = EscapeChar;
                    OSL_ASSERT(
                        nDstSize == 1
                        || (nDstSize == 2 && isHighSurrogate(aDst[0])
                            && isLowSurrogate(aDst[1])));
                    return nDstSize == 1
                        ? aDst[0] : combineSurrogates(aDst[0], aDst[1]);
                }
                else if (nInfo == RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOSMALL
                         && pEnd - p >= 3 && p[0] == cEscapePrefix
                         && (nWeight1 = getHexWeight(p[1])) >= 0
                         && (nWeight2 = getHexWeight(p[2])) >= 0)
                {
                    // The converter needs more input and the next item is an
                    // escaped octet: add it and retry.
                    p += 3;
                    aBuf.append(static_cast< char >(nWeight1 << 4 | nWeight2));
                }
                else if (nInfo == RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOSMALL
                         && p != pEnd && *p <= 0x7F)
                {
                    // The converter needs more input and the next item is an
                    // unescaped character up to 0x7F: take it as an octet and
                    // retry.
                    aBuf.append(static_cast< char >(*p++));
                }
                else
                {
                    // Conversion failed, or no further input is available.
                    OSL_ASSERT(
                        (nInfo & RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL)
                        == 0);
                    break;
                }
            }
            rtl_destroyTextToUnicodeConverter(aConverter);
            // Report just the first escaped octet (*pBegin was not advanced
            // beyond it).
            *pType = EscapeOctet;
        }
        return nChar;
    }
    else
    {
        // No escape sequence: return the character itself, combining a
        // high surrogate with a directly following low surrogate.
        *pType = EscapeNo;
        return isHighSurrogate(nChar) && *pBegin < pEnd
               && isLowSurrogate(**pBegin) ?
                   combineSurrogates(nChar, *(*pBegin)++) : nChar;
    }
}
 255
 256 void writeUcs4(rtl_uString ** pBuffer, sal_Int32 * pCapacity, sal_uInt32 nUtf32)
 257 {
 258     OSL_ENSURE(nUtf32 <= 0x10FFFF, "bad UTF-32 char");
 259     if (nUtf32 <= 0xFFFF) {
 260         writeUnicode(
 261             pBuffer, pCapacity, static_cast< sal_Unicode >(nUtf32));
 262     } else {
 263         nUtf32 -= 0x10000;
 264         writeUnicode(
 265             pBuffer, pCapacity,
 266             static_cast< sal_Unicode >(nUtf32 >> 10 | 0xD800));
 267         writeUnicode(
 268             pBuffer, pCapacity,
 269             static_cast< sal_Unicode >((nUtf32 & 0x3FF) | 0xDC00));
 270     }
 271 }
 272
 273 void writeEscapeOctet(rtl_uString ** pBuffer, sal_Int32 * pCapacity,
 274                       sal_uInt32 nOctet)
 275 {
 276     OSL_ENSURE(nOctet <= 0xFF, "bad octet");
 277
 278     static sal_Unicode const aHex[16]
 279         = { 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39,
 280             0x41, 0x42, 0x43, 0x44, 0x45, 0x46 }; /* '0'--'9', 'A'--'F' */
 281
 282     writeUnicode(pBuffer, pCapacity, cEscapePrefix);
 283     writeUnicode(pBuffer, pCapacity, aHex[nOctet >> 4]);
 284     writeUnicode(pBuffer, pCapacity, aHex[nOctet & 15]);
 285 }
 286
 287 bool writeEscapeChar(rtl_uString ** pBuffer, sal_Int32 * pCapacity,
 288                      sal_uInt32 nUtf32, rtl_TextEncoding eCharset, bool bStrict)
 289 {
 290     OSL_ENSURE(nUtf32 <= 0x10FFFF, "bad UTF-32 char");
 291     if (eCharset == RTL_TEXTENCODING_UTF8) {
 292         if (nUtf32 < 0x80)
 293             writeEscapeOctet(pBuffer, pCapacity, nUtf32);
 294         else if (nUtf32 < 0x800)
 295         {
 296             writeEscapeOctet(pBuffer, pCapacity, nUtf32 >> 6 | 0xC0);
 297             writeEscapeOctet(pBuffer, pCapacity, (nUtf32 & 0x3F) | 0x80);
 298         }
 299         else if (nUtf32 < 0x10000)
 300         {
 301             writeEscapeOctet(pBuffer, pCapacity, nUtf32 >> 12 | 0xE0);
 302             writeEscapeOctet(pBuffer, pCapacity, (nUtf32 >> 6 & 0x3F) | 0x80);
 303             writeEscapeOctet(pBuffer, pCapacity, (nUtf32 & 0x3F) | 0x80);
 304         }
 305         else
 306         {
 307             writeEscapeOctet(pBuffer, pCapacity, nUtf32 >> 18 | 0xF0);
 308             writeEscapeOctet(pBuffer, pCapacity, (nUtf32 >> 12 & 0x3F) | 0x80);
 309             writeEscapeOctet(pBuffer, pCapacity, (nUtf32 >> 6 & 0x3F) | 0x80);
 310             writeEscapeOctet(pBuffer, pCapacity, (nUtf32 & 0x3F) | 0x80);
 311         }
 312     } else {
 313         rtl_UnicodeToTextConverter aConverter
 314             = rtl_createUnicodeToTextConverter(eCharset);
 315         sal_Unicode aSrc[2];
 316         sal_Size nSrcSize;
 317         if (nUtf32 <= 0xFFFF)
 318         {
 319             aSrc[0] = static_cast< sal_Unicode >(nUtf32);
 320             nSrcSize = 1;
 321         }
 322         else
 323         {
 324             aSrc[0] = static_cast< sal_Unicode >(
 325                 ((nUtf32 - 0x10000) >> 10) | 0xD800);
 326             aSrc[1] = static_cast< sal_Unicode >(
 327                 ((nUtf32 - 0x10000) & 0x3FF) | 0xDC00);
 328             nSrcSize = 2;
 329         }
 330         sal_Char aDst[32]; // FIXME  random value
 331         sal_uInt32 nInfo;
 332         sal_Size nConverted;
 333         sal_Size nDstSize = rtl_convertUnicodeToText(
 334             aConverter, 0, aSrc, nSrcSize, aDst, sizeof aDst,
 335             RTL_UNICODETOTEXT_FLAGS_UNDEFINED_ERROR
 336             | RTL_UNICODETOTEXT_FLAGS_INVALID_ERROR
 337             | RTL_UNICODETOTEXT_FLAGS_FLUSH,
 338             &nInfo, &nConverted);
 339         OSL_ASSERT((nInfo & RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL) == 0);
 340         rtl_destroyUnicodeToTextConverter(aConverter);
 341         if (nInfo == 0) {
 342             OSL_ENSURE(nConverted == nSrcSize, "bad rtl_convertUnicodeToText");
 343             for (sal_Size i = 0; i < nDstSize; ++i)
 344                 writeEscapeOctet(pBuffer, pCapacity,
 345                                  static_cast< unsigned char >(aDst[i]));
 346                     // FIXME  all octets are escaped, even if there is no need
 347         } else {
 348             if (bStrict) {
 349                 return false;
 350             } else {
 351                 writeUcs4(pBuffer, pCapacity, nUtf32);
 352             }
 353         }
 354     }
 355     return true;
 356 }
 357
 358 struct Component
 359 {
 360     sal_Unicode const * pBegin;
 361     sal_Unicode const * pEnd;
 362
 363     inline Component(): pBegin(0) {}
 364
 365     inline bool isPresent() const { return pBegin != 0; }
 366
 367     inline sal_Int32 getLength() const;
 368 };
 369
 370 inline sal_Int32 Component::getLength() const
 371 {
 372     OSL_ENSURE(isPresent(), "taking length of non-present component");
 373     return static_cast< sal_Int32 >(pEnd - pBegin);
 374 }
 375
// The five components of a URI reference as split up by parseUriRef.  A
// component that does not occur in the input stays non-present.
struct Components
{
    Component aScheme;    // includes the trailing ':'
    Component aAuthority; // includes the leading "//"
    Component aPath;      // always set by parseUriRef, but may be empty
    Component aQuery;     // includes the leading '?'
    Component aFragment;  // includes the leading '#'
};
 384
 385 void parseUriRef(rtl_uString const * pUriRef, Components * pComponents)
 386 {
 387     // This algorithm is liberal and accepts various forms of illegal input.
 388
 389     sal_Unicode const * pBegin = pUriRef->buffer;
 390     sal_Unicode const * pEnd = pBegin + pUriRef->length;
 391     sal_Unicode const * pPos = pBegin;
 392
 393     if (pPos != pEnd && isAlpha(*pPos))
 394         for (sal_Unicode const * p = pPos + 1; p != pEnd; ++p)
 395             if (*p == ':')
 396             {
 397                 pComponents->aScheme.pBegin = pBegin;
 398                 pComponents->aScheme.pEnd = ++p;
 399                 pPos = p;
 400                 break;
 401             }
 402             else if (!isAlpha(*p) && !isDigit(*p) && *p != '+' && *p != '-'
 403                      && *p != '.')
 404                 break;
 405
 406     if (pEnd - pPos >= 2 && pPos[0] == '/' && pPos[1] == '/')
 407     {
 408         pComponents->aAuthority.pBegin = pPos;
 409         pPos += 2;
 410         while (pPos != pEnd && *pPos != '/' && *pPos != '?' && *pPos != '#')
 411             ++pPos;
 412         pComponents->aAuthority.pEnd = pPos;
 413     }
 414
 415     pComponents->aPath.pBegin = pPos;
 416     while (pPos != pEnd && *pPos != '?' && * pPos != '#')
 417         ++pPos;
 418     pComponents->aPath.pEnd = pPos;
 419
 420     if (pPos != pEnd && *pPos == '?')
 421     {
 422         pComponents->aQuery.pBegin = pPos++;
 423         while (pPos != pEnd && * pPos != '#')
 424             ++pPos;
 425         pComponents->aQuery.pEnd = pPos;
 426     }
 427
 428     if (pPos != pEnd)
 429     {
 430         OSL_ASSERT(*pPos == '#');
 431         pComponents->aFragment.pBegin = pPos;
 432         pComponents->aFragment.pEnd = pEnd;
 433     }
 434 }
 435
 436 rtl::OUString joinPaths(Component const & rBasePath, Component const & rRelPath)
 437 {
 438     OSL_ASSERT(rBasePath.isPresent() && *rBasePath.pBegin == '/');
 439     OSL_ASSERT(rRelPath.isPresent());
 440
 441     // The invariant of aBuffer is that it always starts and ends with a slash
 442     // (until probably right at the end of the algorithm, when the last segment
 443     // of rRelPath is added, which does not necessarily end in a slash):
 444     rtl::OUStringBuffer aBuffer(rBasePath.getLength() + rRelPath.getLength());
 445         // XXX  numeric overflow
 446
 447     // Segments "." and ".." within rBasePath are not conisdered special (but
 448     // are also not removed by ".." segments within rRelPath), RFC 2396 seems a
 449     // bit unclear about this point:
 450     sal_Int32 nFixed = 1;
 451     sal_Unicode const * p = rBasePath.pBegin + 1;
 452     for (sal_Unicode const * q = p; q != rBasePath.pEnd; ++q)
 453         if (*q == '/')
 454         {
 455             if (
 456                 (q - p == 1 && p[0] == '.') ||
 457                 (q - p == 2 && p[0] == '.' && p[1] == '.')
 458                )
 459             {
 460                 nFixed = q + 1 - rBasePath.pBegin;
 461             }
 462             p = q + 1;
 463         }
 464     aBuffer.append(rBasePath.pBegin, p - rBasePath.pBegin);
 465
 466     p = rRelPath.pBegin;
 467     if (p != rRelPath.pEnd)
 468         for (;;)
 469         {
 470             sal_Unicode const * q = p;
 471             sal_Unicode const * r;
 472             for (;;)
 473             {
 474                 if (q == rRelPath.pEnd)
 475                 {
 476                     r = q;
 477                     break;
 478                 }
 479                 if (*q == '/')
 480                 {
 481                     r = q + 1;
 482                     break;
 483                 }
 484                 ++q;
 485             }
 486             if (q - p == 2 && p[0] == '.' && p[1] == '.')
 487             {
 488                 // Erroneous excess segments ".." within rRelPath are left
 489                 // intact, as the examples in RFC 2396, section C.2, suggest:
 490                 sal_Int32 i = aBuffer.getLength() - 1;
 491                 if (i < nFixed)
 492                 {
 493                     aBuffer.append(p, r - p);
 494                     nFixed += 3;
 495                 }
 496                 else
 497                 {
 498                     while (aBuffer.charAt(i - 1) != '/')
 499                         --i;
 500                     aBuffer.setLength(i);
 501                 }
 502             }
 503             else if (q - p != 1 || *p != '.')
 504                 aBuffer.append(p, r - p);
 505             if (q == rRelPath.pEnd)
 506                 break;
 507             p = q + 1;
 508         }
 509
 510     return aBuffer.makeStringAndClear();
 511 }
 512
 513 }
 514
// Returns the predefined character class table for eCharClass: an array of
// nCharClassSize flags, indexed by code point, where a non-zero entry means
// the character belongs to the class.  The '%' entry (0x25) is zero in every
// table.
//
// eCharClass is used directly as the row index into the table below; the rows
// are labelled None, Uric, UricNoSlash, RelSegment, RegName, Userinfo, Pchar,
// and UnoParamValue (presumably matching the order of the rtl_UriCharClass
// enumerators -- confirm against rtl/uri.h).  The range check is only an
// OSL_ENSURE diagnostic; an out-of-range eCharClass is still used as index.
sal_Bool const * SAL_CALL rtl_getUriCharClass(rtl_UriCharClass eCharClass)
    SAL_THROW_EXTERN_C()
{
    static sal_Bool const aCharClass[][nCharClassSize]
    = {{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* None */
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* !"#$%&'()*+,-./*/
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /*0123456789:;<=>?*/
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /*@ABCDEFGHIJKLMNO*/
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /*PQRSTUVWXYZ[\]^_*/
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /*`abcdefghijklmno*/
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /*pqrstuvwxyz{|}~ */
       },
       { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* Uric */
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* !"#$%&'()*+,-./*/
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, /*0123456789:;<=>?*/
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /*@ABCDEFGHIJKLMNO*/
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, /*PQRSTUVWXYZ[\]^_*/
         0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /*`abcdefghijklmno*/
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0  /*pqrstuvwxyz{|}~ */
       },
       { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* UricNoSlash */
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, /* !"#$%&'()*+,-./*/
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, /*0123456789:;<=>?*/
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /*@ABCDEFGHIJKLMNO*/
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, /*PQRSTUVWXYZ[\]^_*/
         0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /*`abcdefghijklmno*/
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0  /*pqrstuvwxyz{|}~ */
       },
       { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* RelSegment */
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, /* !"#$%&'()*+,-./*/
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, /*0123456789:;<=>?*/
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /*@ABCDEFGHIJKLMNO*/
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, /*PQRSTUVWXYZ[\]^_*/
         0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /*`abcdefghijklmno*/
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0  /*pqrstuvwxyz{|}~ */
       },
       { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* RegName */
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, /* !"#$%&'()*+,-./*/
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, /*0123456789:;<=>?*/
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /*@ABCDEFGHIJKLMNO*/
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, /*PQRSTUVWXYZ[\]^_*/
         0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /*`abcdefghijklmno*/
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0  /*pqrstuvwxyz{|}~ */
       },
       { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* Userinfo */
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, /* !"#$%&'()*+,-./*/
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, /*0123456789:;<=>?*/
         0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /*@ABCDEFGHIJKLMNO*/
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, /*PQRSTUVWXYZ[\]^_*/
         0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /*`abcdefghijklmno*/
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0  /*pqrstuvwxyz{|}~ */
       },
       { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* Pchar */
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, /* !"#$%&'()*+,-./*/
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, /*0123456789:;<=>?*/
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /*@ABCDEFGHIJKLMNO*/
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, /*PQRSTUVWXYZ[\]^_*/
         0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /*`abcdefghijklmno*/
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0  /*pqrstuvwxyz{|}~ */
       },
       { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* UnoParamValue */
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, /* !"#$%&'()*+,-./*/
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, /*0123456789:;<=>?*/
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /*@ABCDEFGHIJKLMNO*/
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, /*PQRSTUVWXYZ[\]^_*/
         0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /*`abcdefghijklmno*/
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0  /*pqrstuvwxyz{|}~ */
       }};
    // Diagnose (but do not otherwise guard against) an out-of-range class:
    OSL_ENSURE(
        (eCharClass >= 0
         && (sal::static_int_cast< std::size_t >(eCharClass)
             < sizeof aCharClass / sizeof aCharClass[0])),
        "bad eCharClass");
    return aCharClass[eCharClass];
}
 598
 599 void SAL_CALL rtl_uriEncode(rtl_uString * pText, sal_Bool const * pCharClass,
 600                             rtl_UriEncodeMechanism eMechanism,
 601                             rtl_TextEncoding eCharset, rtl_uString ** pResult)
 602     SAL_THROW_EXTERN_C()
 603 {
 604     OSL_ENSURE(!pCharClass[0x25], "bad pCharClass");
 605         // make sure the percent sign is encoded...
 606
 607     sal_Unicode const * p = pText->buffer;
 608     sal_Unicode const * pEnd = p + pText->length;
 609     sal_Int32 nCapacity = 0;
 610     rtl_uString_new(pResult);
 611     while (p < pEnd)
 612     {
 613         EscapeType eType;
 614         sal_uInt32 nUtf32 = readUcs4(
 615             &p, pEnd,
 616             (eMechanism == rtl_UriEncodeKeepEscapes
 617              || eMechanism == rtl_UriEncodeCheckEscapes
 618              || eMechanism == rtl_UriEncodeStrictKeepEscapes),
 619             eCharset, &eType);
 620         switch (eType)
 621         {
 622         case EscapeNo:
 623             if (isValid(pCharClass, nUtf32)) // implies nUtf32 <= 0x7F
 624                 writeUnicode(pResult, &nCapacity,
 625                              static_cast< sal_Unicode >(nUtf32));
 626             else if (!writeEscapeChar(
 627                          pResult, &nCapacity, nUtf32, eCharset,
 628                          (eMechanism == rtl_UriEncodeStrict
 629                           || eMechanism == rtl_UriEncodeStrictKeepEscapes)))
 630             {
 631                 rtl_uString_new(pResult);
 632                 return;
 633             }
 634             break;
 635
 636         case EscapeChar:
 637             if (eMechanism == rtl_UriEncodeCheckEscapes
 638                 && isValid(pCharClass, nUtf32)) // implies nUtf32 <= 0x7F
 639                 writeUnicode(pResult, &nCapacity,
 640                              static_cast< sal_Unicode >(nUtf32));
 641             else if (!writeEscapeChar(
 642                          pResult, &nCapacity, nUtf32, eCharset,
 643                          (eMechanism == rtl_UriEncodeStrict
 644                           || eMechanism == rtl_UriEncodeStrictKeepEscapes)))
 645             {
 646                 rtl_uString_new(pResult);
 647                 return;
 648             }
 649             break;
 650
 651         case EscapeOctet:
 652             writeEscapeOctet(pResult, &nCapacity, nUtf32);
 653             break;
 654         }
 655     }
 656 }
 657
 658 void SAL_CALL rtl_uriDecode(rtl_uString * pText,
 659                             rtl_UriDecodeMechanism eMechanism,
 660                             rtl_TextEncoding eCharset, rtl_uString ** pResult)
 661     SAL_THROW_EXTERN_C()
 662 {
 663     switch (eMechanism)
 664     {
 665     case rtl_UriDecodeNone:
 666         rtl_uString_assign(pResult, pText);
 667         break;
 668
 669     case rtl_UriDecodeToIuri:
 670         eCharset = RTL_TEXTENCODING_UTF8;
 671     default: // rtl_UriDecodeWithCharset, rtl_UriDecodeStrict
 672         {
 673             sal_Unicode const * p = pText->buffer;
 674             sal_Unicode const * pEnd = p + pText->length;
 675             sal_Int32 nCapacity = 0;
 676             rtl_uString_new(pResult);
 677             while (p < pEnd)
 678             {
 679                 EscapeType eType;
 680                 sal_uInt32 nUtf32 = readUcs4(&p, pEnd, true, eCharset, &eType);
 681                 switch (eType)
 682                 {
 683                 case EscapeChar:
 684                     if (nUtf32 <= 0x7F && eMechanism == rtl_UriDecodeToIuri)
 685                     {
 686                         writeEscapeOctet(pResult, &nCapacity, nUtf32);
 687                         break;
 688                     }
 689                 case EscapeNo:
 690                     writeUcs4(pResult, &nCapacity, nUtf32);
 691                     break;
 692
 693                 case EscapeOctet:
 694                     if (eMechanism == rtl_UriDecodeStrict) {
 695                         rtl_uString_new(pResult);
 696                         return;
 697                     }
 698                     writeEscapeOctet(pResult, &nCapacity, nUtf32);
 699                     break;
 700                 }
 701             }
 702         }
 703         break;
 704     }
 705 }
 706
 707 sal_Bool SAL_CALL rtl_uriConvertRelToAbs(rtl_uString * pBaseUriRef,
 708                                          rtl_uString * pRelUriRef,
 709                                          rtl_uString ** pResult,
 710                                          rtl_uString ** pException)
 711     SAL_THROW_EXTERN_C()
 712 {
 713     // If pRelUriRef starts with a scheme component it is an absolute URI
 714     // reference, and we are done (i.e., this algorithm does not support
 715     // backwards-compatible relative URIs starting with a scheme component, see
 716     // RFC 2396, section 5.2, step 3):
 717     Components aRelComponents;
 718     parseUriRef(pRelUriRef, &aRelComponents);
 719     if (aRelComponents.aScheme.isPresent())
 720     {
 721         rtl_uString_assign(pResult, pRelUriRef);
 722         return true;
 723     }
 724
 725     // Parse pBaseUriRef; if the scheme component is not present or not valid,
 726     // or the path component is not empty and starts with anything but a slash,
 727     // an exception is raised:
 728     Components aBaseComponents;
 729     parseUriRef(pBaseUriRef, &aBaseComponents);
 730     if (!aBaseComponents.aScheme.isPresent())
 731     {
 732         rtl::OUString aMessage(pBaseUriRef);
 733         aMessage += rtl::OUString(
 734                         RTL_CONSTASCII_USTRINGPARAM(
 735                             " does not start with a scheme component"));
 736         rtl_uString_assign(pException,
 737                            const_cast< rtl::OUString & >(aMessage).pData);
 738         return false;
 739     }
 740     if (aBaseComponents.aPath.pBegin != aBaseComponents.aPath.pEnd
 741         && *aBaseComponents.aPath.pBegin != '/')
 742     {
 743         rtl::OUString aMessage(pBaseUriRef);
 744         aMessage += rtl::OUString(
 745                         RTL_CONSTASCII_USTRINGPARAM(
 746                             "path component does not start with slash"));
 747         rtl_uString_assign(pException, aMessage.pData);
 748         return false;
 749     }
 750
 751     // Use the algorithm from RFC 2396, section 5.2, to turn the relative URI
 752     // into an absolute one (if the relative URI is a reference to the "current
 753     // document," the "current document" is here taken to be the base URI):
 754     rtl::OUStringBuffer aBuffer;
 755     aBuffer.append(aBaseComponents.aScheme.pBegin,
 756                    aBaseComponents.aScheme.getLength());
 757     if (aRelComponents.aAuthority.isPresent())
 758     {
 759         aBuffer.append(aRelComponents.aAuthority.pBegin,
 760                        aRelComponents.aAuthority.getLength());
 761         aBuffer.append(aRelComponents.aPath.pBegin,
 762                        aRelComponents.aPath.getLength());
 763         if (aRelComponents.aQuery.isPresent())
 764             aBuffer.append(aRelComponents.aQuery.pBegin,
 765                            aRelComponents.aQuery.getLength());
 766     }
 767     else
 768     {
 769         if (aBaseComponents.aAuthority.isPresent())
 770             aBuffer.append(aBaseComponents.aAuthority.pBegin,
 771                            aBaseComponents.aAuthority.getLength());
 772         if (aRelComponents.aPath.pBegin == aRelComponents.aPath.pEnd
 773             && !aRelComponents.aQuery.isPresent())
 774         {
 775             aBuffer.append(aBaseComponents.aPath.pBegin,
 776                            aBaseComponents.aPath.getLength());
 777             if (aBaseComponents.aQuery.isPresent())
 778                 aBuffer.append(aBaseComponents.aQuery.pBegin,
 779                                aBaseComponents.aQuery.getLength());
 780         }
 781         else
 782         {
 783             if (*aRelComponents.aPath.pBegin == '/')
 784                 aBuffer.append(aRelComponents.aPath.pBegin,
 785                                aRelComponents.aPath.getLength());
 786             else
 787                 aBuffer.append(joinPaths(aBaseComponents.aPath,
 788                                          aRelComponents.aPath));
 789             if (aRelComponents.aQuery.isPresent())
 790                 aBuffer.append(aRelComponents.aQuery.pBegin,
 791                                aRelComponents.aQuery.getLength());
 792         }
 793     }
 794     if (aRelComponents.aFragment.isPresent())
 795         aBuffer.append(aRelComponents.aFragment.pBegin,
 796                        aRelComponents.aFragment.getLength());
 797     rtl_uString_assign(pResult, aBuffer.makeStringAndClear().pData);
 798     return true;
 799 }