sal/rtl/source/uri.cxx

   1 /*************************************************************************
   2  *
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * Copyright 2008 by Sun Microsystems, Inc.
   6  *
   7  * OpenOffice.org - a multi-platform office productivity suite
   8  *
   9  * $RCSfile: uri.cxx,v $
  10  * $Revision: 1.12 $
  11  *
  12  * This file is part of OpenOffice.org.
  13  *
  14  * OpenOffice.org is free software: you can redistribute it and/or modify
  15  * it under the terms of the GNU Lesser General Public License version 3
  16  * only, as published by the Free Software Foundation.
  17  *
  18  * OpenOffice.org is distributed in the hope that it will be useful,
  19  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  20  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  21  * GNU Lesser General Public License version 3 for more details
  22  * (a copy is included in the LICENSE file that accompanied this code).
  23  *
  24  * You should have received a copy of the GNU Lesser General Public License
  25  * version 3 along with OpenOffice.org.  If not, see
  26  * <http://www.openoffice.org/license.html>
  27  * for a copy of the LGPLv3 License.
  28  *
  29  ************************************************************************/
  30
  31 // MARKER(update_precomp.py): autogen include statement, do not remove
  32 #include "precompiled_sal.hxx"
  33
  34 #include "rtl/uri.h"
  35
  36 #include "surrogates.h"
  37
  38 #include "osl/diagnose.h"
  39 #include "rtl/strbuf.hxx"
  40 #include "rtl/textenc.h"
  41 #include "rtl/textcvt.h"
  42 #include "rtl/uri.h"
  43 #include "rtl/ustrbuf.h"
  44 #include "rtl/ustrbuf.hxx"
  45 #include "rtl/ustring.h"
  46 #include "rtl/ustring.hxx"
  47 #include "sal/types.h"
  48
  49 #include <cstddef>
  50
  51 namespace {
  52
  53 std::size_t const nCharClassSize = 128;
  54
  55 sal_Unicode const cEscapePrefix = 0x25; // '%'
  56
  57 inline bool isDigit(sal_uInt32 nUtf32)
  58 {
  59     return nUtf32 >= 0x30 && nUtf32 <= 0x39; // '0'--'9'
  60 }
  61
  62 inline bool isAlpha(sal_uInt32 nUtf32)
  63 {
  64     // 'A'--'Z', 'a'--'z'
  65     return (
  66             (nUtf32 >= 0x41 && nUtf32 <= 0x5A) ||
  67             (nUtf32 >= 0x61 && nUtf32 <= 0x7A)
  68            );
  69 }
  70
  71 inline bool isHighSurrogate(sal_uInt32 nUtf16)
  72 {
  73     return SAL_RTL_IS_HIGH_SURROGATE(nUtf16);
  74 }
  75
  76 inline bool isLowSurrogate(sal_uInt32 nUtf16)
  77 {
  78     return SAL_RTL_IS_LOW_SURROGATE(nUtf16);
  79 }
  80
  81 inline sal_uInt32 combineSurrogates(sal_uInt32 high, sal_uInt32 low)
  82 {
  83     return SAL_RTL_COMBINE_SURROGATES(high, low);
  84 }
  85
  86 inline int getHexWeight(sal_uInt32 nUtf32)
  87 {
  88     return nUtf32 >= 0x30 && nUtf32 <= 0x39 ? // '0'--'9'
  89                static_cast< int >(nUtf32 - 0x30) :
  90            nUtf32 >= 0x41 && nUtf32 <= 0x46 ? // 'A'--'F'
  91                static_cast< int >(nUtf32 - 0x41 + 10) :
  92            nUtf32 >= 0x61 && nUtf32 <= 0x66 ? // 'a'--'f'
  93                static_cast< int >(nUtf32 - 0x61 + 10) :
  94                -1; // not a hex digit
  95 }
  96
  97 inline bool isValid(sal_Bool const * pCharClass, sal_uInt32 nUtf32)
  98 {
  99     return nUtf32 < nCharClassSize && pCharClass[nUtf32];
 100 }
 101
 102 inline void writeUnicode(rtl_uString ** pBuffer, sal_Int32 * pCapacity,
 103                          sal_Unicode cChar)
 104 {
 105     rtl_uStringbuffer_insert(pBuffer, pCapacity, (*pBuffer)->length, &cChar, 1);
 106 }
 107
 108 enum EscapeType
 109 {
 110     EscapeNo,
 111     EscapeChar,
 112     EscapeOctet
 113 };
 114
 115 /* Read any of the following:
 116
 117    - sequence of escape sequences representing character from eCharset,
 118      translated to single UCS4 character; or
 119
 120    - pair of UTF-16 surrogates, translated to single UCS4 character; or
 121
 122    _ single UTF-16 character, extended to UCS4 character.
 123  */
 124 sal_uInt32 readUcs4(sal_Unicode const ** pBegin, sal_Unicode const * pEnd,
 125                     bool bEncoded, rtl_TextEncoding eCharset,
 126                     EscapeType * pType)
 127 {
 128     sal_uInt32 nChar = *(*pBegin)++;
 129     int nWeight1;
 130     int nWeight2;
 131     if (nChar == cEscapePrefix && bEncoded && pEnd - *pBegin >= 2
 132         && (nWeight1 = getHexWeight((*pBegin)[0])) >= 0
 133         && (nWeight2 = getHexWeight((*pBegin)[1])) >= 0)
 134     {
 135         *pBegin += 2;
 136         nChar = static_cast< sal_uInt32 >(nWeight1 << 4 | nWeight2);
 137         if (nChar <= 0x7F)
 138             *pType = EscapeChar;
 139         else if (eCharset == RTL_TEXTENCODING_UTF8)
 140         {
 141             if (nChar >= 0xC0 && nChar <= 0xF4)
 142             {
 143                 sal_uInt32 nEncoded;
 144                 int nShift;
 145                 sal_uInt32 nMin;
 146                 if (nChar <= 0xDF)
 147                 {
 148                     nEncoded = (nChar & 0x1F) << 6;
 149                     nShift = 0;
 150                     nMin = 0x80;
 151                 }
 152                 else if (nChar <= 0xEF)
 153                 {
 154                     nEncoded = (nChar & 0x0F) << 12;
 155                     nShift = 6;
 156                     nMin = 0x800;
 157                 }
 158                 else
 159                 {
 160                     nEncoded = (nChar & 0x07) << 18;
 161                     nShift = 12;
 162                     nMin = 0x10000;
 163                 }
 164                 sal_Unicode const * p = *pBegin;
 165                 bool bUTF8 = true;
 166                 for (; nShift >= 0; nShift -= 6)
 167                 {
 168                     if (pEnd - p < 3 || p[0] != cEscapePrefix
 169                         || (nWeight1 = getHexWeight(p[1])) < 8
 170                         || nWeight1 > 11
 171                         || (nWeight2 = getHexWeight(p[2])) < 0)
 172                     {
 173                         bUTF8 = sal_False;
 174                         break;
 175                     }
 176                     p += 3;
 177                     nEncoded |= ((nWeight1 & 3) << 4 | nWeight2) << nShift;
 178                 }
 179                 if (bUTF8 && nEncoded >= nMin && !isHighSurrogate(nEncoded)
 180                     && !isLowSurrogate(nEncoded) && nEncoded <= 0x10FFFF)
 181                 {
 182                     *pBegin = p;
 183                     *pType = EscapeChar;
 184                     return nEncoded;
 185                 }
 186             }
 187             *pType = EscapeOctet;
 188         }
 189         else
 190         {
 191             rtl::OStringBuffer aBuf;
 192             aBuf.append(static_cast< char >(nChar));
 193             rtl_TextToUnicodeConverter aConverter
 194                 = rtl_createTextToUnicodeConverter(eCharset);
 195             sal_Unicode const * p = *pBegin;
 196             for (;;)
 197             {
 198                 sal_Unicode aDst[2];
 199                 sal_uInt32 nInfo;
 200                 sal_Size nConverted;
 201                 sal_Size nDstSize = rtl_convertTextToUnicode(
 202                     aConverter, 0, aBuf.getStr(), aBuf.getLength(), aDst,
 203                     sizeof aDst / sizeof aDst[0],
 204                     (RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_ERROR
 205                      | RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_ERROR
 206                      | RTL_TEXTTOUNICODE_FLAGS_INVALID_ERROR),
 207                     &nInfo, &nConverted);
 208                 if (nInfo == 0)
 209                 {
 210                     OSL_ASSERT(
 211                         nConverted
 212                         == sal::static_int_cast< sal_uInt32 >(
 213                             aBuf.getLength()));
 214                     rtl_destroyTextToUnicodeConverter(aConverter);
 215                     *pBegin = p;
 216                     *pType = EscapeChar;
 217                     OSL_ASSERT(
 218                         nDstSize == 1
 219                         || (nDstSize == 2 && isHighSurrogate(aDst[0])
 220                             && isLowSurrogate(aDst[1])));
 221                     return nDstSize == 1
 222                         ? aDst[0] : combineSurrogates(aDst[0], aDst[1]);
 223                 }
 224                 else if (nInfo == RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOSMALL
 225                          && pEnd - p >= 3 && p[0] == cEscapePrefix
 226                          && (nWeight1 = getHexWeight(p[1])) >= 0
 227                          && (nWeight2 = getHexWeight(p[2])) >= 0)
 228                 {
 229                     p += 3;
 230                     aBuf.append(static_cast< char >(nWeight1 << 4 | nWeight2));
 231                 }
 232                 else if (nInfo == RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOSMALL
 233                          && p != pEnd && *p <= 0x7F)
 234                 {
 235                     aBuf.append(static_cast< char >(*p++));
 236                 }
 237                 else
 238                 {
 239                     OSL_ASSERT(
 240                         (nInfo & RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL)
 241                         == 0);
 242                     break;
 243                 }
 244             }
 245             rtl_destroyTextToUnicodeConverter(aConverter);
 246             *pType = EscapeOctet;
 247         }
 248         return nChar;
 249     }
 250     else
 251     {
 252         *pType = EscapeNo;
 253         return isHighSurrogate(nChar) && *pBegin < pEnd
 254                && isLowSurrogate(**pBegin) ?
 255                    combineSurrogates(nChar, *(*pBegin)++) : nChar;
 256     }
 257 }
 258
 259 void writeUcs4(rtl_uString ** pBuffer, sal_Int32 * pCapacity, sal_uInt32 nUtf32)
 260 {
 261     OSL_ENSURE(nUtf32 <= 0x10FFFF, "bad UTF-32 char");
 262     if (nUtf32 <= 0xFFFF) {
 263         writeUnicode(
 264             pBuffer, pCapacity, static_cast< sal_Unicode >(nUtf32));
 265     } else {
 266         nUtf32 -= 0x10000;
 267         writeUnicode(
 268             pBuffer, pCapacity,
 269             static_cast< sal_Unicode >(nUtf32 >> 10 | 0xD800));
 270         writeUnicode(
 271             pBuffer, pCapacity,
 272             static_cast< sal_Unicode >((nUtf32 & 0x3FF) | 0xDC00));
 273     }
 274 }
 275
 276 void writeEscapeOctet(rtl_uString ** pBuffer, sal_Int32 * pCapacity,
 277                       sal_uInt32 nOctet)
 278 {
 279     OSL_ENSURE(nOctet <= 0xFF, "bad octet");
 280
 281     static sal_Unicode const aHex[16]
 282         = { 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39,
 283             0x41, 0x42, 0x43, 0x44, 0x45, 0x46 }; /* '0'--'9', 'A'--'F' */
 284
 285     writeUnicode(pBuffer, pCapacity, cEscapePrefix);
 286     writeUnicode(pBuffer, pCapacity, aHex[nOctet >> 4]);
 287     writeUnicode(pBuffer, pCapacity, aHex[nOctet & 15]);
 288 }
 289
 290 bool writeEscapeChar(rtl_uString ** pBuffer, sal_Int32 * pCapacity,
 291                      sal_uInt32 nUtf32, rtl_TextEncoding eCharset, bool bStrict)
 292 {
 293     OSL_ENSURE(nUtf32 <= 0x10FFFF, "bad UTF-32 char");
 294     if (eCharset == RTL_TEXTENCODING_UTF8) {
 295         if (nUtf32 < 0x80)
 296             writeEscapeOctet(pBuffer, pCapacity, nUtf32);
 297         else if (nUtf32 < 0x800)
 298         {
 299             writeEscapeOctet(pBuffer, pCapacity, nUtf32 >> 6 | 0xC0);
 300             writeEscapeOctet(pBuffer, pCapacity, (nUtf32 & 0x3F) | 0x80);
 301         }
 302         else if (nUtf32 < 0x10000)
 303         {
 304             writeEscapeOctet(pBuffer, pCapacity, nUtf32 >> 12 | 0xE0);
 305             writeEscapeOctet(pBuffer, pCapacity, (nUtf32 >> 6 & 0x3F) | 0x80);
 306             writeEscapeOctet(pBuffer, pCapacity, (nUtf32 & 0x3F) | 0x80);
 307         }
 308         else
 309         {
 310             writeEscapeOctet(pBuffer, pCapacity, nUtf32 >> 18 | 0xF0);
 311             writeEscapeOctet(pBuffer, pCapacity, (nUtf32 >> 12 & 0x3F) | 0x80);
 312             writeEscapeOctet(pBuffer, pCapacity, (nUtf32 >> 6 & 0x3F) | 0x80);
 313             writeEscapeOctet(pBuffer, pCapacity, (nUtf32 & 0x3F) | 0x80);
 314         }
 315     } else {
 316         rtl_UnicodeToTextConverter aConverter
 317             = rtl_createUnicodeToTextConverter(eCharset);
 318         sal_Unicode aSrc[2];
 319         sal_Size nSrcSize;
 320         if (nUtf32 <= 0xFFFF)
 321         {
 322             aSrc[0] = static_cast< sal_Unicode >(nUtf32);
 323             nSrcSize = 1;
 324         }
 325         else
 326         {
 327             aSrc[0] = static_cast< sal_Unicode >(
 328                 ((nUtf32 - 0x10000) >> 10) | 0xD800);
 329             aSrc[1] = static_cast< sal_Unicode >(
 330                 ((nUtf32 - 0x10000) & 0x3FF) | 0xDC00);
 331             nSrcSize = 2;
 332         }
 333         sal_Char aDst[32]; // FIXME  random value
 334         sal_uInt32 nInfo;
 335         sal_Size nConverted;
 336         sal_Size nDstSize = rtl_convertUnicodeToText(
 337             aConverter, 0, aSrc, nSrcSize, aDst, sizeof aDst,
 338             RTL_UNICODETOTEXT_FLAGS_UNDEFINED_ERROR
 339             | RTL_UNICODETOTEXT_FLAGS_INVALID_ERROR
 340             | RTL_UNICODETOTEXT_FLAGS_FLUSH,
 341             &nInfo, &nConverted);
 342         OSL_ASSERT((nInfo & RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL) == 0);
 343         rtl_destroyUnicodeToTextConverter(aConverter);
 344         if (nInfo == 0) {
 345             OSL_ENSURE(nConverted == nSrcSize, "bad rtl_convertUnicodeToText");
 346             for (sal_Size i = 0; i < nDstSize; ++i)
 347                 writeEscapeOctet(pBuffer, pCapacity,
 348                                  static_cast< unsigned char >(aDst[i]));
 349                     // FIXME  all octets are escaped, even if there is no need
 350         } else {
 351             if (bStrict) {
 352                 return false;
 353             } else {
 354                 writeUcs4(pBuffer, pCapacity, nUtf32);
 355             }
 356         }
 357     }
 358     return true;
 359 }
 360
 361 struct Component
 362 {
 363     sal_Unicode const * pBegin;
 364     sal_Unicode const * pEnd;
 365
 366     inline Component(): pBegin(0) {}
 367
 368     inline bool isPresent() const { return pBegin != 0; }
 369
 370     inline sal_Int32 getLength() const;
 371 };
 372
 373 inline sal_Int32 Component::getLength() const
 374 {
 375     OSL_ENSURE(isPresent(), "taking length of non-present component");
 376     return static_cast< sal_Int32 >(pEnd - pBegin);
 377 }
 378
 379 struct Components
 380 {
 381     Component aScheme;
 382     Component aAuthority;
 383     Component aPath;
 384     Component aQuery;
 385     Component aFragment;
 386 };
 387
 388 void parseUriRef(rtl_uString const * pUriRef, Components * pComponents)
 389 {
 390     // This algorithm is liberal and accepts various forms of illegal input.
 391
 392     sal_Unicode const * pBegin = pUriRef->buffer;
 393     sal_Unicode const * pEnd = pBegin + pUriRef->length;
 394     sal_Unicode const * pPos = pBegin;
 395
 396     if (pPos != pEnd && isAlpha(*pPos))
 397         for (sal_Unicode const * p = pPos + 1; p != pEnd; ++p)
 398             if (*p == ':')
 399             {
 400                 pComponents->aScheme.pBegin = pBegin;
 401                 pComponents->aScheme.pEnd = ++p;
 402                 pPos = p;
 403                 break;
 404             }
 405             else if (!isAlpha(*p) && !isDigit(*p) && *p != '+' && *p != '-'
 406                      && *p != '.')
 407                 break;
 408
 409     if (pEnd - pPos >= 2 && pPos[0] == '/' && pPos[1] == '/')
 410     {
 411         pComponents->aAuthority.pBegin = pPos;
 412         pPos += 2;
 413         while (pPos != pEnd && *pPos != '/' && *pPos != '?' && *pPos != '#')
 414             ++pPos;
 415         pComponents->aAuthority.pEnd = pPos;
 416     }
 417
 418     pComponents->aPath.pBegin = pPos;
 419     while (pPos != pEnd && *pPos != '?' && * pPos != '#')
 420         ++pPos;
 421     pComponents->aPath.pEnd = pPos;
 422
 423     if (pPos != pEnd && *pPos == '?')
 424     {
 425         pComponents->aQuery.pBegin = pPos++;
 426         while (pPos != pEnd && * pPos != '#')
 427             ++pPos;
 428         pComponents->aQuery.pEnd = pPos;
 429     }
 430
 431     if (pPos != pEnd)
 432     {
 433         OSL_ASSERT(*pPos == '#');
 434         pComponents->aFragment.pBegin = pPos;
 435         pComponents->aFragment.pEnd = pEnd;
 436     }
 437 }
 438
 439 rtl::OUString joinPaths(Component const & rBasePath, Component const & rRelPath)
 440 {
 441     OSL_ASSERT(rBasePath.isPresent() && *rBasePath.pBegin == '/');
 442     OSL_ASSERT(rRelPath.isPresent());
 443
 444     // The invariant of aBuffer is that it always starts and ends with a slash
 445     // (until probably right at the end of the algorithm, when the last segment
 446     // of rRelPath is added, which does not necessarily end in a slash):
 447     rtl::OUStringBuffer aBuffer(rBasePath.getLength() + rRelPath.getLength());
 448         // XXX  numeric overflow
 449
 450     // Segments "." and ".." within rBasePath are not conisdered special (but
 451     // are also not removed by ".." segments within rRelPath), RFC 2396 seems a
 452     // bit unclear about this point:
 453     sal_Int32 nFixed = 1;
 454     sal_Unicode const * p = rBasePath.pBegin + 1;
 455     for (sal_Unicode const * q = p; q != rBasePath.pEnd; ++q)
 456         if (*q == '/')
 457         {
 458             if (
 459                 (q - p == 1 && p[0] == '.') ||
 460                 (q - p == 2 && p[0] == '.' && p[1] == '.')
 461                )
 462             {
 463                 nFixed = q + 1 - rBasePath.pBegin;
 464             }
 465             p = q + 1;
 466         }
 467     aBuffer.append(rBasePath.pBegin, p - rBasePath.pBegin);
 468
 469     p = rRelPath.pBegin;
 470     if (p != rRelPath.pEnd)
 471         for (;;)
 472         {
 473             sal_Unicode const * q = p;
 474             sal_Unicode const * r;
 475             for (;;)
 476             {
 477                 if (q == rRelPath.pEnd)
 478                 {
 479                     r = q;
 480                     break;
 481                 }
 482                 if (*q == '/')
 483                 {
 484                     r = q + 1;
 485                     break;
 486                 }
 487                 ++q;
 488             }
 489             if (q - p == 2 && p[0] == '.' && p[1] == '.')
 490             {
 491                 // Erroneous excess segments ".." within rRelPath are left
 492                 // intact, as the examples in RFC 2396, section C.2, suggest:
 493                 sal_Int32 i = aBuffer.getLength() - 1;
 494                 if (i < nFixed)
 495                 {
 496                     aBuffer.append(p, r - p);
 497                     nFixed += 3;
 498                 }
 499                 else
 500                 {
 501                     while (aBuffer.charAt(i - 1) != '/')
 502                         --i;
 503                     aBuffer.setLength(i);
 504                 }
 505             }
 506             else if (q - p != 1 || *p != '.')
 507                 aBuffer.append(p, r - p);
 508             if (q == rRelPath.pEnd)
 509                 break;
 510             p = q + 1;
 511         }
 512
 513     return aBuffer.makeStringAndClear();
 514 }
 515
 516 }
 517
 518 sal_Bool const * SAL_CALL rtl_getUriCharClass(rtl_UriCharClass eCharClass)
 519     SAL_THROW_EXTERN_C()
 520 {
 521     static sal_Bool const aCharClass[][nCharClassSize]
 522     = {{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* None */
 523          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 524          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* !"#$%&'()*+,-./*/
 525          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /*0123456789:;<=>?*/
 526          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /*@ABCDEFGHIJKLMNO*/
 527          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /*PQRSTUVWXYZ[\]^_*/
 528          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /*`abcdefghijklmno*/
 529          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /*pqrstuvwxyz{|}~ */
 530        },
 531        { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* Uric */
 532          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 533          0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* !"#$%&'()*+,-./*/
 534          1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, /*0123456789:;<=>?*/
 535          1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /*@ABCDEFGHIJKLMNO*/
 536          1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, /*PQRSTUVWXYZ[\]^_*/
 537          0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /*`abcdefghijklmno*/
 538          1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0  /*pqrstuvwxyz{|}~ */
 539        },
 540        { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* UricNoSlash */
 541          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 542          0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, /* !"#$%&'()*+,-./*/
 543          1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, /*0123456789:;<=>?*/
 544          1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /*@ABCDEFGHIJKLMNO*/
 545          1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, /*PQRSTUVWXYZ[\]^_*/
 546          0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /*`abcdefghijklmno*/
 547          1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0  /*pqrstuvwxyz{|}~ */
 548        },
 549        { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* RelSegment */
 550          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 551          0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, /* !"#$%&'()*+,-./*/
 552          1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, /*0123456789:;<=>?*/
 553          1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /*@ABCDEFGHIJKLMNO*/
 554          1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, /*PQRSTUVWXYZ[\]^_*/
 555          0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /*`abcdefghijklmno*/
 556          1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0  /*pqrstuvwxyz{|}~ */
 557        },
 558        { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* RegName */
 559          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 560          0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, /* !"#$%&'()*+,-./*/
 561          1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, /*0123456789:;<=>?*/
 562          1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /*@ABCDEFGHIJKLMNO*/
 563          1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, /*PQRSTUVWXYZ[\]^_*/
 564          0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /*`abcdefghijklmno*/
 565          1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0  /*pqrstuvwxyz{|}~ */
 566        },
 567        { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* Userinfo */
 568          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 569          0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, /* !"#$%&'()*+,-./*/
 570          1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, /*0123456789:;<=>?*/
 571          0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /*@ABCDEFGHIJKLMNO*/
 572          1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, /*PQRSTUVWXYZ[\]^_*/
 573          0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /*`abcdefghijklmno*/
 574          1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0  /*pqrstuvwxyz{|}~ */
 575        },
 576        { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* Pchar */
 577          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 578          0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, /* !"#$%&'()*+,-./*/
 579          1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, /*0123456789:;<=>?*/
 580          1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /*@ABCDEFGHIJKLMNO*/
 581          1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, /*PQRSTUVWXYZ[\]^_*/
 582          0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /*`abcdefghijklmno*/
 583          1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0  /*pqrstuvwxyz{|}~ */
 584        },
 585        { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* UnoParamValue */
 586          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 587          0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, /* !"#$%&'()*+,-./*/
 588          1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, /*0123456789:;<=>?*/
 589          1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /*@ABCDEFGHIJKLMNO*/
 590          1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, /*PQRSTUVWXYZ[\]^_*/
 591          0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /*`abcdefghijklmno*/
 592          1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0  /*pqrstuvwxyz{|}~ */
 593        }};
 594     OSL_ENSURE(
 595         (eCharClass >= 0
 596          && (sal::static_int_cast< std::size_t >(eCharClass)
 597              < sizeof aCharClass / sizeof aCharClass[0])),
 598         "bad eCharClass");
 599     return aCharClass[eCharClass];
 600 }
 601
 602 void SAL_CALL rtl_uriEncode(rtl_uString * pText, sal_Bool const * pCharClass,
 603                             rtl_UriEncodeMechanism eMechanism,
 604                             rtl_TextEncoding eCharset, rtl_uString ** pResult)
 605     SAL_THROW_EXTERN_C()
 606 {
 607     OSL_ENSURE(!pCharClass[0x25], "bad pCharClass");
 608         // make sure the percent sign is encoded...
 609
 610     sal_Unicode const * p = pText->buffer;
 611     sal_Unicode const * pEnd = p + pText->length;
 612     sal_Int32 nCapacity = 0;
 613     rtl_uString_new(pResult);
 614     while (p < pEnd)
 615     {
 616         EscapeType eType;
 617         sal_uInt32 nUtf32 = readUcs4(
 618             &p, pEnd,
 619             (eMechanism == rtl_UriEncodeKeepEscapes
 620              || eMechanism == rtl_UriEncodeCheckEscapes
 621              || eMechanism == rtl_UriEncodeStrictKeepEscapes),
 622             eCharset, &eType);
 623         switch (eType)
 624         {
 625         case EscapeNo:
 626             if (isValid(pCharClass, nUtf32)) // implies nUtf32 <= 0x7F
 627                 writeUnicode(pResult, &nCapacity,
 628                              static_cast< sal_Unicode >(nUtf32));
 629             else if (!writeEscapeChar(
 630                          pResult, &nCapacity, nUtf32, eCharset,
 631                          (eMechanism == rtl_UriEncodeStrict
 632                           || eMechanism == rtl_UriEncodeStrictKeepEscapes)))
 633             {
 634                 rtl_uString_new(pResult);
 635                 return;
 636             }
 637             break;
 638
 639         case EscapeChar:
 640             if (eMechanism == rtl_UriEncodeCheckEscapes
 641                 && isValid(pCharClass, nUtf32)) // implies nUtf32 <= 0x7F
 642                 writeUnicode(pResult, &nCapacity,
 643                              static_cast< sal_Unicode >(nUtf32));
 644             else if (!writeEscapeChar(
 645                          pResult, &nCapacity, nUtf32, eCharset,
 646                          (eMechanism == rtl_UriEncodeStrict
 647                           || eMechanism == rtl_UriEncodeStrictKeepEscapes)))
 648             {
 649                 rtl_uString_new(pResult);
 650                 return;
 651             }
 652             break;
 653
 654         case EscapeOctet:
 655             writeEscapeOctet(pResult, &nCapacity, nUtf32);
 656             break;
 657         }
 658     }
 659 }
 660
 661 void SAL_CALL rtl_uriDecode(rtl_uString * pText,
 662                             rtl_UriDecodeMechanism eMechanism,
 663                             rtl_TextEncoding eCharset, rtl_uString ** pResult)
 664     SAL_THROW_EXTERN_C()
 665 {
 666     switch (eMechanism)
 667     {
 668     case rtl_UriDecodeNone:
 669         rtl_uString_assign(pResult, pText);
 670         break;
 671
 672     case rtl_UriDecodeToIuri:
 673         eCharset = RTL_TEXTENCODING_UTF8;
 674     default: // rtl_UriDecodeWithCharset, rtl_UriDecodeStrict
 675         {
 676             sal_Unicode const * p = pText->buffer;
 677             sal_Unicode const * pEnd = p + pText->length;
 678             sal_Int32 nCapacity = 0;
 679             rtl_uString_new(pResult);
 680             while (p < pEnd)
 681             {
 682                 EscapeType eType;
 683                 sal_uInt32 nUtf32 = readUcs4(&p, pEnd, true, eCharset, &eType);
 684                 switch (eType)
 685                 {
 686                 case EscapeChar:
 687                     if (nUtf32 <= 0x7F && eMechanism == rtl_UriDecodeToIuri)
 688                     {
 689                         writeEscapeOctet(pResult, &nCapacity, nUtf32);
 690                         break;
 691                     }
 692                 case EscapeNo:
 693                     writeUcs4(pResult, &nCapacity, nUtf32);
 694                     break;
 695
 696                 case EscapeOctet:
 697                     if (eMechanism == rtl_UriDecodeStrict) {
 698                         rtl_uString_new(pResult);
 699                         return;
 700                     }
 701                     writeEscapeOctet(pResult, &nCapacity, nUtf32);
 702                     break;
 703                 }
 704             }
 705         }
 706         break;
 707     }
 708 }
 709
 710 sal_Bool SAL_CALL rtl_uriConvertRelToAbs(rtl_uString * pBaseUriRef,
 711                                          rtl_uString * pRelUriRef,
 712                                          rtl_uString ** pResult,
 713                                          rtl_uString ** pException)
 714     SAL_THROW_EXTERN_C()
 715 {
 716     // If pRelUriRef starts with a scheme component it is an absolute URI
 717     // reference, and we are done (i.e., this algorithm does not support
 718     // backwards-compatible relative URIs starting with a scheme component, see
 719     // RFC 2396, section 5.2, step 3):
 720     Components aRelComponents;
 721     parseUriRef(pRelUriRef, &aRelComponents);
 722     if (aRelComponents.aScheme.isPresent())
 723     {
 724         rtl_uString_assign(pResult, pRelUriRef);
 725         return true;
 726     }
 727
 728     // Parse pBaseUriRef; if the scheme component is not present or not valid,
 729     // or the path component is not empty and starts with anything but a slash,
 730     // an exception is raised:
 731     Components aBaseComponents;
 732     parseUriRef(pBaseUriRef, &aBaseComponents);
 733     if (!aBaseComponents.aScheme.isPresent())
 734     {
 735         rtl::OUString aMessage(pBaseUriRef);
 736         aMessage += rtl::OUString(
 737                         RTL_CONSTASCII_USTRINGPARAM(
 738                             " does not start with a scheme component"));
 739         rtl_uString_assign(pException,
 740                            const_cast< rtl::OUString & >(aMessage).pData);
 741         return false;
 742     }
 743     if (aBaseComponents.aPath.pBegin != aBaseComponents.aPath.pEnd
 744         && *aBaseComponents.aPath.pBegin != '/')
 745     {
 746         rtl::OUString aMessage(pBaseUriRef);
 747         aMessage += rtl::OUString(
 748                         RTL_CONSTASCII_USTRINGPARAM(
 749                             "path component does not start with slash"));
 750         rtl_uString_assign(pException, aMessage.pData);
 751         return false;
 752     }
 753
 754     // Use the algorithm from RFC 2396, section 5.2, to turn the relative URI
 755     // into an absolute one (if the relative URI is a reference to the "current
 756     // document," the "current document" is here taken to be the base URI):
 757     rtl::OUStringBuffer aBuffer;
 758     aBuffer.append(aBaseComponents.aScheme.pBegin,
 759                    aBaseComponents.aScheme.getLength());
 760     if (aRelComponents.aAuthority.isPresent())
 761     {
 762         aBuffer.append(aRelComponents.aAuthority.pBegin,
 763                        aRelComponents.aAuthority.getLength());
 764         aBuffer.append(aRelComponents.aPath.pBegin,
 765                        aRelComponents.aPath.getLength());
 766         if (aRelComponents.aQuery.isPresent())
 767             aBuffer.append(aRelComponents.aQuery.pBegin,
 768                            aRelComponents.aQuery.getLength());
 769     }
 770     else
 771     {
 772         if (aBaseComponents.aAuthority.isPresent())
 773             aBuffer.append(aBaseComponents.aAuthority.pBegin,
 774                            aBaseComponents.aAuthority.getLength());
 775         if (aRelComponents.aPath.pBegin == aRelComponents.aPath.pEnd
 776             && !aRelComponents.aQuery.isPresent())
 777         {
 778             aBuffer.append(aBaseComponents.aPath.pBegin,
 779                            aBaseComponents.aPath.getLength());
 780             if (aBaseComponents.aQuery.isPresent())
 781                 aBuffer.append(aBaseComponents.aQuery.pBegin,
 782                                aBaseComponents.aQuery.getLength());
 783         }
 784         else
 785         {
 786             if (*aRelComponents.aPath.pBegin == '/')
 787                 aBuffer.append(aRelComponents.aPath.pBegin,
 788                                aRelComponents.aPath.getLength());
 789             else
 790                 aBuffer.append(joinPaths(aBaseComponents.aPath,
 791                                          aRelComponents.aPath));
 792             if (aRelComponents.aQuery.isPresent())
 793                 aBuffer.append(aRelComponents.aQuery.pBegin,
 794                                aRelComponents.aQuery.getLength());
 795         }
 796     }
 797     if (aRelComponents.aFragment.isPresent())
 798         aBuffer.append(aRelComponents.aFragment.pBegin,
 799                        aRelComponents.aFragment.getLength());
 800     rtl_uString_assign(pResult, aBuffer.makeStringAndClear().pData);
 801     return true;
 802 }