ucb/source/regexp/regexp.cxx

   1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
   2 /*
   3  * This file is part of the LibreOffice project.
   4  *
   5  * This Source Code Form is subject to the terms of the Mozilla Public
   6  * License, v. 2.0. If a copy of the MPL was not distributed with this
   7  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
   8  *
   9  * This file incorporates work covered by the following license notice:
  10  *
  11  *   Licensed to the Apache Software Foundation (ASF) under one or more
  12  *   contributor license agreements. See the NOTICE file distributed
  13  *   with this work for additional information regarding copyright
  14  *   ownership. The ASF licenses this file to you under the Apache
  15  *   License, Version 2.0 (the "License"); you may not use this file
  16  *   except in compliance with the License. You may obtain a copy of
  17  *   the License at http://www.apache.org/licenses/LICENSE-2.0 .
  18  */
  19
  20 #include <regexp.hxx>
  21
  22 #include <cstddef>
  23
  24 #include <osl/diagnose.h>
  25 #include <com/sun/star/lang/IllegalArgumentException.hpp>
  26 #include <rtl/character.hxx>
  27 #include <rtl/ustrbuf.hxx>
  28 #include <rtl/ustring.hxx>
  29 #include <utility>
  30
  31 using namespace com::sun::star;
  32 using namespace ucb_impl;
  33
  34
  35 //  Regexp
  36
  37
  38 inline Regexp::Regexp(Kind eTheKind, OUString aThePrefix,
  39                       bool bTheEmptyDomain, OUString aTheInfix,
  40                       bool bTheTranslation,
  41                       OUString aTheReversePrefix):
  42     m_eKind(eTheKind),
  43     m_aPrefix(std::move(aThePrefix)),
  44     m_aInfix(std::move(aTheInfix)),
  45     m_aReversePrefix(std::move(aTheReversePrefix)),
  46     m_bEmptyDomain(bTheEmptyDomain),
  47     m_bTranslation(bTheTranslation)
  48 {
  49     OSL_ASSERT(m_eKind == KIND_DOMAIN
  50                || (!m_bEmptyDomain && m_aInfix.isEmpty()));
  51     OSL_ASSERT(m_bTranslation || m_aReversePrefix.isEmpty());
  52 }
  53
  54
  55 namespace {
  56
  57 bool matchStringIgnoreCase(sal_Unicode const ** pBegin,
  58                            sal_Unicode const * pEnd,
  59                            OUString const & rString)
  60 {
  61     sal_Unicode const * p = *pBegin;
  62
  63     sal_Unicode const * q = rString.getStr();
  64     sal_Unicode const * qEnd = q + rString.getLength();
  65
  66     if (pEnd - p < qEnd - q)
  67         return false;
  68
  69     while (q != qEnd)
  70     {
  71         if (rtl::compareIgnoreAsciiCase(*p++, *q++) != 0)
  72             return false;
  73     }
  74
  75     *pBegin = p;
  76     return true;
  77 }
  78
  79 }
  80
  81 bool Regexp::matches(OUString const & rString) const
  82 {
  83     sal_Unicode const * pBegin = rString.getStr();
  84     sal_Unicode const * pEnd = pBegin + rString.getLength();
  85
  86     bool bMatches = false;
  87
  88     sal_Unicode const * p = pBegin;
  89     if (matchStringIgnoreCase(&p, pEnd, m_aPrefix))
  90     {
  91         switch (m_eKind)
  92         {
  93             case KIND_PREFIX:
  94                 bMatches = true;
  95                 break;
  96
  97             case KIND_AUTHORITY:
  98                 bMatches = p == pEnd || *p == '/' || *p == '?' || *p == '#';
  99                 break;
 100
 101             case KIND_DOMAIN:
 102                 if (!m_bEmptyDomain)
 103                 {
 104                     if (p == pEnd || *p == '/' || *p == '?' || *p == '#')
 105                         break;
 106                     ++p;
 107                 }
 108                 for (;;)
 109                 {
 110                     sal_Unicode const * q = p;
 111                     if (matchStringIgnoreCase(&q, pEnd, m_aInfix)
 112                         && (q == pEnd || *q == '/' || *q == '?' || *q == '#'))
 113                     {
 114                         bMatches = true;
 115                         break;
 116                     }
 117
 118                     if (p == pEnd)
 119                         break;
 120
 121                     sal_Unicode c = *p++;
 122                     if (c == '/' || c == '?' || c == '#')
 123                         break;
 124                 }
 125                 break;
 126         }
 127     }
 128
 129     return bMatches;
 130 }
 131
 132
 133 namespace {
 134
 135 bool isScheme(OUString const & rString, bool bColon)
 136 {
 137     // Return true if rString matches <scheme> (plus a trailing ":" if bColon
 138     // is true) from RFC 2396:
 139     sal_Unicode const * p = rString.getStr();
 140     sal_Unicode const * pEnd = p + rString.getLength();
 141     if (p != pEnd && rtl::isAsciiAlpha(*p))
 142         for (++p;;)
 143         {
 144             if (p == pEnd)
 145                 return !bColon;
 146             sal_Unicode c = *p++;
 147             if (!(rtl::isAsciiAlphanumeric(c)
 148                   || c == '+' || c == '-' || c == '.'))
 149                 return bColon && c == ':' && p == pEnd;
 150         }
 151     return false;
 152 }
 153
 154 void appendStringLiteral(OUStringBuffer * pBuffer,
 155                          OUString const & rString)
 156 {
 157     OSL_ASSERT(pBuffer);
 158
 159     pBuffer->append('"');
 160     sal_Unicode const * p = rString.getStr();
 161     sal_Unicode const * pEnd = p + rString.getLength();
 162     while (p != pEnd)
 163     {
 164         sal_Unicode c = *p++;
 165         if (c == '"' || c == '\\')
 166             pBuffer->append('\\');
 167         pBuffer->append(c);
 168     }
 169     pBuffer->append('"');
 170 }
 171
 172 }
 173
 174 OUString Regexp::getRegexp() const
 175 {
 176     if (m_bTranslation)
 177     {
 178         OUStringBuffer aBuffer;
 179         if (!m_aPrefix.isEmpty())
 180             appendStringLiteral(&aBuffer, m_aPrefix);
 181         switch (m_eKind)
 182         {
 183             case KIND_PREFIX:
 184                 aBuffer.append("(.*)");
 185                 break;
 186
 187             case KIND_AUTHORITY:
 188                 aBuffer.append("(([/?#].*)?)");
 189                 break;
 190
 191             case KIND_DOMAIN:
 192                 aBuffer.append("([^/?#]" + OUStringChar(sal_Unicode(m_bEmptyDomain ? '*' : '+')));
 193                 if (!m_aInfix.isEmpty())
 194                     appendStringLiteral(&aBuffer, m_aInfix);
 195                 aBuffer.append("([/?#].*)?)");
 196                 break;
 197         }
 198         aBuffer.append("->");
 199         if (!m_aReversePrefix.isEmpty())
 200             appendStringLiteral(&aBuffer, m_aReversePrefix);
 201         aBuffer.append("\\1");
 202         return aBuffer.makeStringAndClear();
 203     }
 204     else if (m_eKind == KIND_PREFIX && isScheme(m_aPrefix, true))
 205         return m_aPrefix.copy(0, m_aPrefix.getLength() - 1);
 206     else
 207     {
 208         OUStringBuffer aBuffer;
 209         if (!m_aPrefix.isEmpty())
 210             appendStringLiteral(&aBuffer, m_aPrefix);
 211         switch (m_eKind)
 212         {
 213             case KIND_PREFIX:
 214                 aBuffer.append(".*");
 215                 break;
 216
 217             case KIND_AUTHORITY:
 218                 aBuffer.append("([/?#].*)?");
 219                 break;
 220
 221             case KIND_DOMAIN:
 222                 aBuffer.append("[^/?#]" + OUStringChar( m_bEmptyDomain ? '*' : '+' ));
 223                 if (!m_aInfix.isEmpty())
 224                     appendStringLiteral(&aBuffer, m_aInfix);
 225                 aBuffer.append("([/?#].*)?");
 226                 break;
 227         }
 228         return aBuffer.makeStringAndClear();
 229     }
 230 }
 231
 232
 233 namespace {
 234
 235 bool matchString(sal_Unicode const ** pBegin, sal_Unicode const * pEnd,
 236                  char const * pString, size_t nStringLength)
 237 {
 238     sal_Unicode const * p = *pBegin;
 239
 240     unsigned char const * q = reinterpret_cast< unsigned char const * >(pString);
 241     unsigned char const * qEnd = q + nStringLength;
 242
 243     if (pEnd - p < qEnd - q)
 244         return false;
 245
 246     while (q != qEnd)
 247     {
 248         sal_Unicode c1 = *p++;
 249         sal_Unicode c2 = *q++;
 250         if (c1 != c2)
 251             return false;
 252     }
 253
 254     *pBegin = p;
 255     return true;
 256 }
 257
 258 bool scanStringLiteral(sal_Unicode const ** pBegin, sal_Unicode const * pEnd,
 259                        OUString * pString)
 260 {
 261     sal_Unicode const * p = *pBegin;
 262
 263     if (p == pEnd || *p++ != '"')
 264         return false;
 265
 266     OUStringBuffer aBuffer;
 267     for (;;)
 268     {
 269         if (p == pEnd)
 270             return false;
 271         sal_Unicode c = *p++;
 272         if (c == '"')
 273             break;
 274         if (c == '\\')
 275         {
 276             if (p == pEnd)
 277                 return false;
 278             c = *p++;
 279             if (c != '"' && c != '\\')
 280                 return false;
 281         }
 282         aBuffer.append(c);
 283     }
 284
 285     *pBegin = p;
 286     *pString = aBuffer.makeStringAndClear();
 287     return true;
 288 }
 289
 290 }
 291
 292 Regexp Regexp::parse(OUString const & rRegexp)
 293 {
 294     // Detect an input of '<scheme>' as an abbreviation of '"<scheme>:".*'
 295     // where <scheme> is as defined in RFC 2396:
 296     if (isScheme(rRegexp, false))
 297         return Regexp(Regexp::KIND_PREFIX,
 298                       rRegexp + ":",
 299                       false,
 300                       OUString(),
 301                       false,
 302                       OUString());
 303
 304     sal_Unicode const * p = rRegexp.getStr();
 305     sal_Unicode const * pEnd = p + rRegexp.getLength();
 306
 307     OUString aPrefix;
 308     scanStringLiteral(&p, pEnd, &aPrefix);
 309
 310     if (p == pEnd)
 311         throw lang::IllegalArgumentException();
 312
 313     // This and the matchString() calls below are some of the few places where
 314     // RTL_CONSTASCII_STRINGPARAM() should NOT be removed.
 315     // (c.f. https://gerrit.libreoffice.org/3117)
 316     if (matchString(&p, pEnd, RTL_CONSTASCII_STRINGPARAM(".*")))
 317     {
 318         if (p != pEnd)
 319             throw lang::IllegalArgumentException();
 320
 321         return Regexp(Regexp::KIND_PREFIX, aPrefix, false, OUString(),
 322                       false, OUString());
 323     }
 324     else if (matchString(&p, pEnd, RTL_CONSTASCII_STRINGPARAM("(.*)->")))
 325     {
 326         OUString aReversePrefix;
 327         scanStringLiteral(&p, pEnd, &aReversePrefix);
 328
 329         if (!matchString(&p, pEnd, RTL_CONSTASCII_STRINGPARAM("\\1"))
 330             || p != pEnd)
 331             throw lang::IllegalArgumentException();
 332
 333         return Regexp(Regexp::KIND_PREFIX, aPrefix, false, OUString(),
 334                       true, aReversePrefix);
 335     }
 336     else if (matchString(&p, pEnd, RTL_CONSTASCII_STRINGPARAM("([/?#].*)?")))
 337     {
 338         if (p != pEnd)
 339             throw lang::IllegalArgumentException();
 340
 341         return Regexp(Regexp::KIND_AUTHORITY, aPrefix, false, OUString(),
 342                       false, OUString());
 343     }
 344     else if (matchString(&p, pEnd,
 345                          RTL_CONSTASCII_STRINGPARAM("(([/?#].*)?)->")))
 346     {
 347         OUString aReversePrefix;
 348         if (!(scanStringLiteral(&p, pEnd, &aReversePrefix)
 349               && matchString(&p, pEnd, RTL_CONSTASCII_STRINGPARAM("\\1"))
 350               && p == pEnd))
 351             throw lang::IllegalArgumentException();
 352
 353         return Regexp(Regexp::KIND_AUTHORITY, aPrefix, false, OUString(),
 354                       true, aReversePrefix);
 355     }
 356     else
 357     {
 358         bool bOpen = false;
 359         if (p != pEnd && *p == '(')
 360         {
 361             ++p;
 362             bOpen = true;
 363         }
 364
 365         if (!matchString(&p, pEnd, RTL_CONSTASCII_STRINGPARAM("[^/?#]")))
 366             throw lang::IllegalArgumentException();
 367
 368         if (p == pEnd || (*p != '*' && *p != '+'))
 369             throw lang::IllegalArgumentException();
 370         bool bEmptyDomain = *p++ == '*';
 371
 372         OUString aInfix;
 373         scanStringLiteral(&p, pEnd, &aInfix);
 374
 375         if (!matchString(&p, pEnd, RTL_CONSTASCII_STRINGPARAM("([/?#].*)?")))
 376             throw lang::IllegalArgumentException();
 377
 378         OUString aReversePrefix;
 379         if (bOpen
 380             && !(matchString(&p, pEnd, RTL_CONSTASCII_STRINGPARAM(")->"))
 381                  && scanStringLiteral(&p, pEnd, &aReversePrefix)
 382                  && matchString(&p, pEnd, RTL_CONSTASCII_STRINGPARAM("\\1"))))
 383             throw lang::IllegalArgumentException();
 384
 385         if (p != pEnd)
 386             throw lang::IllegalArgumentException();
 387
 388         return Regexp(Regexp::KIND_DOMAIN, aPrefix, bEmptyDomain, aInfix,
 389                       bOpen, aReversePrefix);
 390     }
 391 }
 392
 393 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */