ucb/source/regexp/regexp.cxx

   1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
   2 /*
   3  * This file is part of the LibreOffice project.
   4  *
   5  * This Source Code Form is subject to the terms of the Mozilla Public
   6  * License, v. 2.0. If a copy of the MPL was not distributed with this
   7  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
   8  *
   9  * This file incorporates work covered by the following license notice:
  10  *
  11  *   Licensed to the Apache Software Foundation (ASF) under one or more
  12  *   contributor license agreements. See the NOTICE file distributed
  13  *   with this work for additional information regarding copyright
  14  *   ownership. The ASF licenses this file to you under the Apache
  15  *   License, Version 2.0 (the "License"); you may not use this file
  16  *   except in compliance with the License. You may obtain a copy of
  17  *   the License at http://www.apache.org/licenses/LICENSE-2.0 .
  18  */
  19
  20 #include <regexp.hxx>
  21
  22 #include <cstddef>
  23
  24 #include <osl/diagnose.h>
  25 #include <com/sun/star/lang/IllegalArgumentException.hpp>
  26 #include <rtl/character.hxx>
  27 #include <rtl/ustrbuf.hxx>
  28 #include <rtl/ustring.hxx>
  29
  30 using namespace com::sun::star;
  31 using namespace ucb_impl;
  32
  33
  34 //  Regexp
  35
  36
  37 inline Regexp::Regexp(Kind eTheKind, OUString const & rThePrefix,
  38                       bool bTheEmptyDomain, OUString const & rTheInfix,
  39                       bool bTheTranslation,
  40                       OUString const & rTheReversePrefix):
  41     m_eKind(eTheKind),
  42     m_aPrefix(rThePrefix),
  43     m_aInfix(rTheInfix),
  44     m_aReversePrefix(rTheReversePrefix),
  45     m_bEmptyDomain(bTheEmptyDomain),
  46     m_bTranslation(bTheTranslation)
  47 {
  48     OSL_ASSERT(m_eKind == KIND_DOMAIN
  49                || (!m_bEmptyDomain && m_aInfix.isEmpty()));
  50     OSL_ASSERT(m_bTranslation || m_aReversePrefix.isEmpty());
  51 }
  52
  53
  54 namespace {
  55
  56 bool matchStringIgnoreCase(sal_Unicode const ** pBegin,
  57                            sal_Unicode const * pEnd,
  58                            OUString const & rString)
  59 {
  60     sal_Unicode const * p = *pBegin;
  61
  62     sal_Unicode const * q = rString.getStr();
  63     sal_Unicode const * qEnd = q + rString.getLength();
  64
  65     if (pEnd - p < qEnd - q)
  66         return false;
  67
  68     while (q != qEnd)
  69     {
  70         if (rtl::compareIgnoreAsciiCase(*p++, *q++) != 0)
  71             return false;
  72     }
  73
  74     *pBegin = p;
  75     return true;
  76 }
  77
  78 }
  79
  80 bool Regexp::matches(OUString const & rString) const
  81 {
  82     sal_Unicode const * pBegin = rString.getStr();
  83     sal_Unicode const * pEnd = pBegin + rString.getLength();
  84
  85     bool bMatches = false;
  86
  87     sal_Unicode const * p = pBegin;
  88     if (matchStringIgnoreCase(&p, pEnd, m_aPrefix))
  89     {
  90         switch (m_eKind)
  91         {
  92             case KIND_PREFIX:
  93                 bMatches = true;
  94                 break;
  95
  96             case KIND_AUTHORITY:
  97                 bMatches = p == pEnd || *p == '/' || *p == '?' || *p == '#';
  98                 break;
  99
 100             case KIND_DOMAIN:
 101                 if (!m_bEmptyDomain)
 102                 {
 103                     if (p == pEnd || *p == '/' || *p == '?' || *p == '#')
 104                         break;
 105                     ++p;
 106                 }
 107                 for (;;)
 108                 {
 109                     sal_Unicode const * q = p;
 110                     if (matchStringIgnoreCase(&q, pEnd, m_aInfix)
 111                         && (q == pEnd || *q == '/' || *q == '?' || *q == '#'))
 112                     {
 113                         bMatches = true;
 114                         break;
 115                     }
 116
 117                     if (p == pEnd)
 118                         break;
 119
 120                     sal_Unicode c = *p++;
 121                     if (c == '/' || c == '?' || c == '#')
 122                         break;
 123                 }
 124                 break;
 125         }
 126     }
 127
 128     return bMatches;
 129 }
 130
 131
 132 namespace {
 133
 134 bool isScheme(OUString const & rString, bool bColon)
 135 {
 136     // Return true if rString matches <scheme> (plus a trailing ":" if bColon
 137     // is true) from RFC 2396:
 138     sal_Unicode const * p = rString.getStr();
 139     sal_Unicode const * pEnd = p + rString.getLength();
 140     if (p != pEnd && rtl::isAsciiAlpha(*p))
 141         for (++p;;)
 142         {
 143             if (p == pEnd)
 144                 return !bColon;
 145             sal_Unicode c = *p++;
 146             if (!(rtl::isAsciiAlphanumeric(c)
 147                   || c == '+' || c == '-' || c == '.'))
 148                 return bColon && c == ':' && p == pEnd;
 149         }
 150     return false;
 151 }
 152
 153 void appendStringLiteral(OUStringBuffer * pBuffer,
 154                          OUString const & rString)
 155 {
 156     OSL_ASSERT(pBuffer);
 157
 158     pBuffer->append('"');
 159     sal_Unicode const * p = rString.getStr();
 160     sal_Unicode const * pEnd = p + rString.getLength();
 161     while (p != pEnd)
 162     {
 163         sal_Unicode c = *p++;
 164         if (c == '"' || c == '\\')
 165             pBuffer->append('\\');
 166         pBuffer->append(c);
 167     }
 168     pBuffer->append('"');
 169 }
 170
 171 }
 172
 173 OUString Regexp::getRegexp() const
 174 {
 175     if (m_bTranslation)
 176     {
 177         OUStringBuffer aBuffer;
 178         if (!m_aPrefix.isEmpty())
 179             appendStringLiteral(&aBuffer, m_aPrefix);
 180         switch (m_eKind)
 181         {
 182             case KIND_PREFIX:
 183                 aBuffer.append("(.*)");
 184                 break;
 185
 186             case KIND_AUTHORITY:
 187                 aBuffer.append("(([/?#].*)?)");
 188                 break;
 189
 190             case KIND_DOMAIN:
 191                 aBuffer.append("([^/?#]");
 192                 aBuffer.append(sal_Unicode(m_bEmptyDomain ? '*' : '+'));
 193                 if (!m_aInfix.isEmpty())
 194                     appendStringLiteral(&aBuffer, m_aInfix);
 195                 aBuffer.append("([/?#].*)?)");
 196                 break;
 197         }
 198         aBuffer.append("->");
 199         if (!m_aReversePrefix.isEmpty())
 200             appendStringLiteral(&aBuffer, m_aReversePrefix);
 201         aBuffer.append("\\1");
 202         return aBuffer.makeStringAndClear();
 203     }
 204     else if (m_eKind == KIND_PREFIX && isScheme(m_aPrefix, true))
 205         return m_aPrefix.copy(0, m_aPrefix.getLength() - 1);
 206     else
 207     {
 208         OUStringBuffer aBuffer;
 209         if (!m_aPrefix.isEmpty())
 210             appendStringLiteral(&aBuffer, m_aPrefix);
 211         switch (m_eKind)
 212         {
 213             case KIND_PREFIX:
 214                 aBuffer.append(".*");
 215                 break;
 216
 217             case KIND_AUTHORITY:
 218                 aBuffer.append("([/?#].*)?");
 219                 break;
 220
 221             case KIND_DOMAIN:
 222                 aBuffer.append("[^/?#]");
 223                 aBuffer.append( m_bEmptyDomain ? '*' : '+' );
 224                 if (!m_aInfix.isEmpty())
 225                     appendStringLiteral(&aBuffer, m_aInfix);
 226                 aBuffer.append("([/?#].*)?");
 227                 break;
 228         }
 229         return aBuffer.makeStringAndClear();
 230     }
 231 }
 232
 233
 234 namespace {
 235
 236 bool matchString(sal_Unicode const ** pBegin, sal_Unicode const * pEnd,
 237                  sal_Char const * pString, size_t nStringLength)
 238 {
 239     sal_Unicode const * p = *pBegin;
 240
 241     unsigned char const * q = reinterpret_cast< unsigned char const * >(pString);
 242     unsigned char const * qEnd = q + nStringLength;
 243
 244     if (pEnd - p < qEnd - q)
 245         return false;
 246
 247     while (q != qEnd)
 248     {
 249         sal_Unicode c1 = *p++;
 250         sal_Unicode c2 = *q++;
 251         if (c1 != c2)
 252             return false;
 253     }
 254
 255     *pBegin = p;
 256     return true;
 257 }
 258
 259 bool scanStringLiteral(sal_Unicode const ** pBegin, sal_Unicode const * pEnd,
 260                        OUString * pString)
 261 {
 262     sal_Unicode const * p = *pBegin;
 263
 264     if (p == pEnd || *p++ != '"')
 265         return false;
 266
 267     OUStringBuffer aBuffer;
 268     for (;;)
 269     {
 270         if (p == pEnd)
 271             return false;
 272         sal_Unicode c = *p++;
 273         if (c == '"')
 274             break;
 275         if (c == '\\')
 276         {
 277             if (p == pEnd)
 278                 return false;
 279             c = *p++;
 280             if (c != '"' && c != '\\')
 281                 return false;
 282         }
 283         aBuffer.append(c);
 284     }
 285
 286     *pBegin = p;
 287     *pString = aBuffer.makeStringAndClear();
 288     return true;
 289 }
 290
 291 }
 292
 293 Regexp Regexp::parse(OUString const & rRegexp)
 294 {
 295     // Detect an input of '<scheme>' as an abbreviation of '"<scheme>:".*'
 296     // where <scheme> is as defined in RFC 2396:
 297     if (isScheme(rRegexp, false))
 298         return Regexp(Regexp::KIND_PREFIX,
 299                       rRegexp + ":",
 300                       false,
 301                       OUString(),
 302                       false,
 303                       OUString());
 304
 305     sal_Unicode const * p = rRegexp.getStr();
 306     sal_Unicode const * pEnd = p + rRegexp.getLength();
 307
 308     OUString aPrefix;
 309     scanStringLiteral(&p, pEnd, &aPrefix);
 310
 311     if (p == pEnd)
 312         throw lang::IllegalArgumentException();
 313
 314     // This and the matchString() calls below are some of the few places where
 315     // RTL_CONSTASCII_STRINGPARAM() should NOT be removed.
 316     // (c.f. https://gerrit.libreoffice.org/3117)
 317     if (matchString(&p, pEnd, RTL_CONSTASCII_STRINGPARAM(".*")))
 318     {
 319         if (p != pEnd)
 320             throw lang::IllegalArgumentException();
 321
 322         return Regexp(Regexp::KIND_PREFIX, aPrefix, false, OUString(),
 323                       false, OUString());
 324     }
 325     else if (matchString(&p, pEnd, RTL_CONSTASCII_STRINGPARAM("(.*)->")))
 326     {
 327         OUString aReversePrefix;
 328         scanStringLiteral(&p, pEnd, &aReversePrefix);
 329
 330         if (!matchString(&p, pEnd, RTL_CONSTASCII_STRINGPARAM("\\1"))
 331             || p != pEnd)
 332             throw lang::IllegalArgumentException();
 333
 334         return Regexp(Regexp::KIND_PREFIX, aPrefix, false, OUString(),
 335                       true, aReversePrefix);
 336     }
 337     else if (matchString(&p, pEnd, RTL_CONSTASCII_STRINGPARAM("([/?#].*)?")))
 338     {
 339         if (p != pEnd)
 340             throw lang::IllegalArgumentException();
 341
 342         return Regexp(Regexp::KIND_AUTHORITY, aPrefix, false, OUString(),
 343                       false, OUString());
 344     }
 345     else if (matchString(&p, pEnd,
 346                          RTL_CONSTASCII_STRINGPARAM("(([/?#].*)?)->")))
 347     {
 348         OUString aReversePrefix;
 349         if (!(scanStringLiteral(&p, pEnd, &aReversePrefix)
 350               && matchString(&p, pEnd, RTL_CONSTASCII_STRINGPARAM("\\1"))
 351               && p == pEnd))
 352             throw lang::IllegalArgumentException();
 353
 354         return Regexp(Regexp::KIND_AUTHORITY, aPrefix, false, OUString(),
 355                       true, aReversePrefix);
 356     }
 357     else
 358     {
 359         bool bOpen = false;
 360         if (p != pEnd && *p == '(')
 361         {
 362             ++p;
 363             bOpen = true;
 364         }
 365
 366         if (!matchString(&p, pEnd, RTL_CONSTASCII_STRINGPARAM("[^/?#]")))
 367             throw lang::IllegalArgumentException();
 368
 369         if (p == pEnd || (*p != '*' && *p != '+'))
 370             throw lang::IllegalArgumentException();
 371         bool bEmptyDomain = *p++ == '*';
 372
 373         OUString aInfix;
 374         scanStringLiteral(&p, pEnd, &aInfix);
 375
 376         if (!matchString(&p, pEnd, RTL_CONSTASCII_STRINGPARAM("([/?#].*)?")))
 377             throw lang::IllegalArgumentException();
 378
 379         OUString aReversePrefix;
 380         if (bOpen
 381             && !(matchString(&p, pEnd, RTL_CONSTASCII_STRINGPARAM(")->"))
 382                  && scanStringLiteral(&p, pEnd, &aReversePrefix)
 383                  && matchString(&p, pEnd, RTL_CONSTASCII_STRINGPARAM("\\1"))))
 384             throw lang::IllegalArgumentException();
 385
 386         if (p != pEnd)
 387             throw lang::IllegalArgumentException();
 388
 389         return Regexp(Regexp::KIND_DOMAIN, aPrefix, bEmptyDomain, aInfix,
 390                       bOpen, aReversePrefix);
 391     }
 392 }
 393
 394 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */