ucb/source/regexp/regexp.cxx

   1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
   2 /*
   3  * This file is part of the LibreOffice project.
   4  *
   5  * This Source Code Form is subject to the terms of the Mozilla Public
   6  * License, v. 2.0. If a copy of the MPL was not distributed with this
   7  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
   8  *
   9  * This file incorporates work covered by the following license notice:
  10  *
  11  *   Licensed to the Apache Software Foundation (ASF) under one or more
  12  *   contributor license agreements. See the NOTICE file distributed
  13  *   with this work for additional information regarding copyright
  14  *   ownership. The ASF licenses this file to you under the Apache
  15  *   License, Version 2.0 (the "License"); you may not use this file
  16  *   except in compliance with the License. You may obtain a copy of
  17  *   the License at http://www.apache.org/licenses/LICENSE-2.0 .
  18  */
  19
  20 #include <regexp.hxx>
  21
  22 #include <cstddef>
  23
  24 #include "osl/diagnose.h"
  25 #include <com/sun/star/lang/IllegalArgumentException.hpp>
  26 #include <rtl/ustrbuf.hxx>
  27 #include <rtl/ustring.hxx>
  28 #include <comphelper/string.hxx>
  29
  30 namespace unnamed_ucb_regexp {} using namespace unnamed_ucb_regexp;
  31     // unnamed namespaces don't work well yet...
  32
  33 using namespace com::sun::star;
  34 using namespace ucb_impl;
  35
  36
  37
  38 //  Regexp
  39
  40
  41
// Construct a Regexp from its already-parsed components.
//
// eTheKind           which rule shape this is (prefix, authority or domain)
// rThePrefix         literal that a matching string must start with
// bTheEmptyDomain    only allowed to be true for KIND_DOMAIN (see assertion)
// rTheInfix          only allowed to be non-empty for KIND_DOMAIN (see
//                    assertion)
// bTheTranslation    whether this rule carries a "->" translation part
// rTheReversePrefix  only allowed to be non-empty when bTheTranslation is true
//                    (see assertion)
inline Regexp::Regexp(Kind eTheKind, OUString const & rThePrefix,
                      bool bTheEmptyDomain, OUString const & rTheInfix,
                      bool bTheTranslation,
                      OUString const & rTheReversePrefix):
    m_eKind(eTheKind),
    m_aPrefix(rThePrefix),
    m_aInfix(rTheInfix),
    m_aReversePrefix(rTheReversePrefix),
    m_bEmptyDomain(bTheEmptyDomain),
    m_bTranslation(bTheTranslation)
{
    // Non-domain rules must not carry domain-specific state.
    OSL_ASSERT(m_eKind == KIND_DOMAIN
               || (!m_bEmptyDomain && m_aInfix.isEmpty()));
    // A reverse prefix without a translation would never be used.
    OSL_ASSERT(m_bTranslation || m_aReversePrefix.isEmpty());
}
  57
  58
  59 namespace unnamed_ucb_regexp {
  60
  61 bool matchStringIgnoreCase(sal_Unicode const ** pBegin,
  62                            sal_Unicode const * pEnd,
  63                            OUString const & rString)
  64 {
  65     sal_Unicode const * p = *pBegin;
  66
  67     sal_Unicode const * q = rString.getStr();
  68     sal_Unicode const * qEnd = q + rString.getLength();
  69
  70     if (pEnd - p < qEnd - q)
  71         return false;
  72
  73     while (q != qEnd)
  74     {
  75         sal_Unicode c1 = *p++;
  76         sal_Unicode c2 = *q++;
  77         if (c1 >= 'a' && c1 <= 'z')
  78             c1 -= 'a' - 'A';
  79         if (c2 >= 'a' && c2 <= 'z')
  80             c2 -= 'a' - 'A';
  81         if (c1 != c2)
  82             return false;
  83     }
  84
  85     *pBegin = p;
  86     return true;
  87 }
  88
  89 }
  90
  91 bool Regexp::matches(OUString const & rString,
  92                      OUString * pTranslation, bool * pTranslated) const
  93 {
  94     sal_Unicode const * pBegin = rString.getStr();
  95     sal_Unicode const * pEnd = pBegin + rString.getLength();
  96
  97     bool bMatches = false;
  98
  99     sal_Unicode const * p = pBegin;
 100     if (matchStringIgnoreCase(&p, pEnd, m_aPrefix))
 101     {
 102         sal_Unicode const * pBlock1Begin = p;
 103         sal_Unicode const * pBlock1End = pEnd;
 104
 105         sal_Unicode const * pBlock2Begin = 0;
 106         sal_Unicode const * pBlock2End = 0;
 107
 108         switch (m_eKind)
 109         {
 110             case KIND_PREFIX:
 111                 bMatches = true;
 112                 break;
 113
 114             case KIND_AUTHORITY:
 115                 bMatches = p == pEnd || *p == '/' || *p == '?' || *p == '#';
 116                 break;
 117
 118             case KIND_DOMAIN:
 119                 if (!m_bEmptyDomain)
 120                 {
 121                     if (p == pEnd || *p == '/' || *p == '?' || *p == '#')
 122                         break;
 123                     ++p;
 124                 }
 125                 for (;;)
 126                 {
 127                     sal_Unicode const * q = p;
 128                     if (matchStringIgnoreCase(&q, pEnd, m_aInfix)
 129                         && (q == pEnd || *q == '/' || *q == '?' || *q == '#'))
 130                     {
 131                         bMatches = true;
 132                         pBlock1End = p;
 133                         pBlock2Begin = q;
 134                         pBlock2End = pEnd;
 135                         break;
 136                     }
 137
 138                     if (p == pEnd)
 139                         break;
 140
 141                     sal_Unicode c = *p++;
 142                     if (c == '/' || c == '?' || c == '#')
 143                         break;
 144                 }
 145                 break;
 146         }
 147
 148         if (bMatches)
 149         {
 150             if (m_bTranslation)
 151             {
 152                 if (pTranslation)
 153                 {
 154                     OUStringBuffer aBuffer(m_aReversePrefix);
 155                     aBuffer.append(pBlock1Begin, pBlock1End - pBlock1Begin);
 156                     aBuffer.append(m_aInfix);
 157                     aBuffer.append(pBlock2Begin, pBlock2End - pBlock2Begin);
 158                     *pTranslation = aBuffer.makeStringAndClear();
 159                 }
 160                 if (pTranslated)
 161                     *pTranslated = true;
 162             }
 163             else
 164             {
 165                 if (pTranslation)
 166                     *pTranslation = rString;
 167                 if (pTranslated)
 168                     *pTranslated = false;
 169             }
 170         }
 171     }
 172
 173     return bMatches;
 174 }
 175
 176
 177 namespace unnamed_ucb_regexp {
 178
 179 bool isScheme(OUString const & rString, bool bColon)
 180 {
 181     using comphelper::string::isalphaAscii;
 182     using comphelper::string::isdigitAscii;
 183     // Return true if rString matches <scheme> (plus a trailing ":" if bColon
 184     // is true) from RFC 2396:
 185     sal_Unicode const * p = rString.getStr();
 186     sal_Unicode const * pEnd = p + rString.getLength();
 187     if (p != pEnd && isalphaAscii(*p))
 188         for (++p;;)
 189         {
 190             if (p == pEnd)
 191                 return !bColon;
 192             sal_Unicode c = *p++;
 193             if (!(isalphaAscii(c) || isdigitAscii(c)
 194                   || c == '+' || c == '-' || c == '.'))
 195                 return bColon && c == ':' && p == pEnd;
 196         }
 197     return false;
 198 }
 199
 200 void appendStringLiteral(OUStringBuffer * pBuffer,
 201                          OUString const & rString)
 202 {
 203     OSL_ASSERT(pBuffer);
 204
 205     pBuffer->append('"');
 206     sal_Unicode const * p = rString.getStr();
 207     sal_Unicode const * pEnd = p + rString.getLength();
 208     while (p != pEnd)
 209     {
 210         sal_Unicode c = *p++;
 211         if (c == '"' || c == '\\')
 212             pBuffer->append('\\');
 213         pBuffer->append(c);
 214     }
 215     pBuffer->append('"');
 216 }
 217
 218 }
 219
 220 OUString Regexp::getRegexp(bool bReverse) const
 221 {
 222     if (m_bTranslation)
 223     {
 224         OUStringBuffer aBuffer;
 225         if (bReverse)
 226         {
 227             if (!m_aReversePrefix.isEmpty())
 228                 appendStringLiteral(&aBuffer, m_aReversePrefix);
 229         }
 230         else
 231         {
 232             if (!m_aPrefix.isEmpty())
 233                 appendStringLiteral(&aBuffer, m_aPrefix);
 234         }
 235         switch (m_eKind)
 236         {
 237             case KIND_PREFIX:
 238                 aBuffer.append("(.*)");
 239                 break;
 240
 241             case KIND_AUTHORITY:
 242                 aBuffer.append("(([/?#].*)?)");
 243                 break;
 244
 245             case KIND_DOMAIN:
 246                 aBuffer.append("([^/?#]");
 247                 aBuffer.append(sal_Unicode(m_bEmptyDomain ? '*' : '+'));
 248                 if (!m_aInfix.isEmpty())
 249                     appendStringLiteral(&aBuffer, m_aInfix);
 250                 aBuffer.append("([/?#].*)?)");
 251                 break;
 252         }
 253         aBuffer.append("->");
 254         if (bReverse)
 255         {
 256             if (!m_aPrefix.isEmpty())
 257                 appendStringLiteral(&aBuffer, m_aPrefix);
 258         }
 259         else
 260         {
 261             if (!m_aReversePrefix.isEmpty())
 262                 appendStringLiteral(&aBuffer, m_aReversePrefix);
 263         }
 264         aBuffer.append("\\1");
 265         return aBuffer.makeStringAndClear();
 266     }
 267     else if (m_eKind == KIND_PREFIX && isScheme(m_aPrefix, true))
 268         return m_aPrefix.copy(0, m_aPrefix.getLength() - 1);
 269     else
 270     {
 271         OUStringBuffer aBuffer;
 272         if (!m_aPrefix.isEmpty())
 273             appendStringLiteral(&aBuffer, m_aPrefix);
 274         switch (m_eKind)
 275         {
 276             case KIND_PREFIX:
 277                 aBuffer.append(".*");
 278                 break;
 279
 280             case KIND_AUTHORITY:
 281                 aBuffer.append("([/?#].*)?");
 282                 break;
 283
 284             case KIND_DOMAIN:
 285                 aBuffer.append("[^/?#]");
 286                 aBuffer.append( m_bEmptyDomain ? '*' : '+' );
 287                 if (!m_aInfix.isEmpty())
 288                     appendStringLiteral(&aBuffer, m_aInfix);
 289                 aBuffer.append("([/?#].*)?");
 290                 break;
 291         }
 292         return aBuffer.makeStringAndClear();
 293     }
 294 }
 295
 296
 297 namespace unnamed_ucb_regexp {
 298
 299 bool matchString(sal_Unicode const ** pBegin, sal_Unicode const * pEnd,
 300                  sal_Char const * pString, size_t nStringLength)
 301 {
 302     sal_Unicode const * p = *pBegin;
 303
 304     unsigned char const * q = reinterpret_cast< unsigned char const * >(pString);
 305     unsigned char const * qEnd = q + nStringLength;
 306
 307     if (pEnd - p < qEnd - q)
 308         return false;
 309
 310     while (q != qEnd)
 311     {
 312         sal_Unicode c1 = *p++;
 313         sal_Unicode c2 = *q++;
 314         if (c1 != c2)
 315             return false;
 316     }
 317
 318     *pBegin = p;
 319     return true;
 320 }
 321
 322 bool scanStringLiteral(sal_Unicode const ** pBegin, sal_Unicode const * pEnd,
 323                        OUString * pString)
 324 {
 325     sal_Unicode const * p = *pBegin;
 326
 327     if (p == pEnd || *p++ != '"')
 328         return false;
 329
 330     OUStringBuffer aBuffer;
 331     for (;;)
 332     {
 333         if (p == pEnd)
 334             return false;
 335         sal_Unicode c = *p++;
 336         if (c == '"')
 337             break;
 338         if (c == '\\')
 339         {
 340             if (p == pEnd)
 341                 return false;
 342             c = *p++;
 343             if (c != '"' && c != '\\')
 344                 return false;
 345         }
 346         aBuffer.append(c);
 347     }
 348
 349     *pBegin = p;
 350     *pString = aBuffer.makeStringAndClear();
 351     return true;
 352 }
 353
 354 }
 355
 356 Regexp Regexp::parse(OUString const & rRegexp)
 357 {
 358     // Detect an input of '<scheme>' as an abbreviation of '"<scheme>:".*'
 359     // where <scheme> is as defined in RFC 2396:
 360     if (isScheme(rRegexp, false))
 361         return Regexp(Regexp::KIND_PREFIX,
 362                       rRegexp + ":",
 363                       false,
 364                       OUString(),
 365                       false,
 366                       OUString());
 367
 368     sal_Unicode const * p = rRegexp.getStr();
 369     sal_Unicode const * pEnd = p + rRegexp.getLength();
 370
 371     OUString aPrefix;
 372     scanStringLiteral(&p, pEnd, &aPrefix);
 373
 374     if (p == pEnd)
 375         throw lang::IllegalArgumentException();
 376
 377     // This and the matchString() calls below are some of the few places where
 378     // RTL_CONSTASCII_STRINGPARAM() should NOT be removed.
 379     // (c.f. https://gerrit.libreoffice.org/3117)
 380     if (matchString(&p, pEnd, RTL_CONSTASCII_STRINGPARAM(".*")))
 381     {
 382         if (p != pEnd)
 383             throw lang::IllegalArgumentException();
 384
 385         return Regexp(Regexp::KIND_PREFIX, aPrefix, false, OUString(),
 386                       false, OUString());
 387     }
 388     else if (matchString(&p, pEnd, RTL_CONSTASCII_STRINGPARAM("(.*)->")))
 389     {
 390         OUString aReversePrefix;
 391         scanStringLiteral(&p, pEnd, &aReversePrefix);
 392
 393         if (!matchString(&p, pEnd, RTL_CONSTASCII_STRINGPARAM("\\1"))
 394             || p != pEnd)
 395             throw lang::IllegalArgumentException();
 396
 397         return Regexp(Regexp::KIND_PREFIX, aPrefix, false, OUString(),
 398                       true, aReversePrefix);
 399     }
 400     else if (matchString(&p, pEnd, RTL_CONSTASCII_STRINGPARAM("([/?#].*)?")))
 401     {
 402         if (p != pEnd)
 403             throw lang::IllegalArgumentException();
 404
 405         return Regexp(Regexp::KIND_AUTHORITY, aPrefix, false, OUString(),
 406                       false, OUString());
 407     }
 408     else if (matchString(&p, pEnd,
 409                          RTL_CONSTASCII_STRINGPARAM("(([/?#].*)?)->")))
 410     {
 411         OUString aReversePrefix;
 412         if (!(scanStringLiteral(&p, pEnd, &aReversePrefix)
 413               && matchString(&p, pEnd, RTL_CONSTASCII_STRINGPARAM("\\1"))
 414               && p == pEnd))
 415             throw lang::IllegalArgumentException();
 416
 417         return Regexp(Regexp::KIND_AUTHORITY, aPrefix, false, OUString(),
 418                       true, aReversePrefix);
 419     }
 420     else
 421     {
 422         bool bOpen = false;
 423         if (p != pEnd && *p == '(')
 424         {
 425             ++p;
 426             bOpen = true;
 427         }
 428
 429         if (!matchString(&p, pEnd, RTL_CONSTASCII_STRINGPARAM("[^/?#]")))
 430             throw lang::IllegalArgumentException();
 431
 432         if (p == pEnd || (*p != '*' && *p != '+'))
 433             throw lang::IllegalArgumentException();
 434         bool bEmptyDomain = *p++ == '*';
 435
 436         OUString aInfix;
 437         scanStringLiteral(&p, pEnd, &aInfix);
 438
 439         if (!matchString(&p, pEnd, RTL_CONSTASCII_STRINGPARAM("([/?#].*)?")))
 440             throw lang::IllegalArgumentException();
 441
 442         OUString aReversePrefix;
 443         if (bOpen
 444             && !(matchString(&p, pEnd, RTL_CONSTASCII_STRINGPARAM(")->"))
 445                  && scanStringLiteral(&p, pEnd, &aReversePrefix)
 446                  && matchString(&p, pEnd, RTL_CONSTASCII_STRINGPARAM("\\1"))))
 447             throw lang::IllegalArgumentException();
 448
 449         if (p != pEnd)
 450             throw lang::IllegalArgumentException();
 451
 452         return Regexp(Regexp::KIND_DOMAIN, aPrefix, bEmptyDomain, aInfix,
 453                       bOpen, aReversePrefix);
 454     }
 455 }
 456
 457 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */