ucb/source/regexp/regexp.cxx

   1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
   2 /*************************************************************************
   3  *
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * Copyright 2000, 2010 Oracle and/or its affiliates.
   7  *
   8  * OpenOffice.org - a multi-platform office productivity suite
   9  *
  10  * This file is part of OpenOffice.org.
  11  *
  12  * OpenOffice.org is free software: you can redistribute it and/or modify
  13  * it under the terms of the GNU Lesser General Public License version 3
  14  * only, as published by the Free Software Foundation.
  15  *
  16  * OpenOffice.org is distributed in the hope that it will be useful,
  17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  19  * GNU Lesser General Public License version 3 for more details
  20  * (a copy is included in the LICENSE file that accompanied this code).
  21  *
  22  * You should have received a copy of the GNU Lesser General Public License
  23  * version 3 along with OpenOffice.org.  If not, see
  24  * <http://www.openoffice.org/license.html>
  25  * for a copy of the LGPLv3 License.
  26  *
  27  ************************************************************************/
  28
  29 #include <regexp.hxx>
  30
  31 #include <cstddef>
  32
  33 #include "osl/diagnose.h"
  34 #include <com/sun/star/lang/IllegalArgumentException.hpp>
  35 #include <rtl/ustrbuf.hxx>
  36 #include <rtl/ustring.hxx>
  37 #include <comphelper/string.hxx>
  38
  39 namespace unnamed_ucb_regexp {} using namespace unnamed_ucb_regexp;
  40     // unnamed namespaces don't work well yet...
  41
  42 using namespace com::sun::star;
  43 using namespace ucb_impl;
  44
  45 //============================================================================
  46 //
  47 //  Regexp
  48 //
  49 //============================================================================
  50
  51 inline Regexp::Regexp(Kind eTheKind, rtl::OUString const & rThePrefix,
  52                       bool bTheEmptyDomain, rtl::OUString const & rTheInfix,
  53                       bool bTheTranslation,
  54                       rtl::OUString const & rTheReversePrefix):
  55     m_eKind(eTheKind),
  56     m_aPrefix(rThePrefix),
  57     m_aInfix(rTheInfix),
  58     m_aReversePrefix(rTheReversePrefix),
  59     m_bEmptyDomain(bTheEmptyDomain),
  60     m_bTranslation(bTheTranslation)
  61 {
  62     OSL_ASSERT(m_eKind == KIND_DOMAIN
  63                || (!m_bEmptyDomain && m_aInfix.isEmpty()));
  64     OSL_ASSERT(m_bTranslation || m_aReversePrefix.isEmpty());
  65 }
  66
  67 //============================================================================
  68 namespace unnamed_ucb_regexp {
  69
  70 bool matchStringIgnoreCase(sal_Unicode const ** pBegin,
  71                            sal_Unicode const * pEnd,
  72                            rtl::OUString const & rString)
  73 {
  74     sal_Unicode const * p = *pBegin;
  75
  76     sal_Unicode const * q = rString.getStr();
  77     sal_Unicode const * qEnd = q + rString.getLength();
  78
  79     if (pEnd - p < qEnd - q)
  80         return false;
  81
  82     while (q != qEnd)
  83     {
  84         sal_Unicode c1 = *p++;
  85         sal_Unicode c2 = *q++;
  86         if (c1 >= 'a' && c1 <= 'z')
  87             c1 -= 'a' - 'A';
  88         if (c2 >= 'a' && c2 <= 'z')
  89             c2 -= 'a' - 'A';
  90         if (c1 != c2)
  91             return false;
  92     }
  93
  94     *pBegin = p;
  95     return true;
  96 }
  97
  98 }
  99
 100 bool Regexp::matches(rtl::OUString const & rString,
 101                      rtl::OUString * pTranslation, bool * pTranslated) const
 102 {
 103     sal_Unicode const * pBegin = rString.getStr();
 104     sal_Unicode const * pEnd = pBegin + rString.getLength();
 105
 106     bool bMatches = false;
 107
 108     sal_Unicode const * p = pBegin;
 109     if (matchStringIgnoreCase(&p, pEnd, m_aPrefix))
 110     {
 111         sal_Unicode const * pBlock1Begin = p;
 112         sal_Unicode const * pBlock1End = pEnd;
 113
 114         sal_Unicode const * pBlock2Begin = 0;
 115         sal_Unicode const * pBlock2End = 0;
 116
 117         switch (m_eKind)
 118         {
 119             case KIND_PREFIX:
 120                 bMatches = true;
 121                 break;
 122
 123             case KIND_AUTHORITY:
 124                 bMatches = p == pEnd || *p == '/' || *p == '?' || *p == '#';
 125                 break;
 126
 127             case KIND_DOMAIN:
 128                 if (!m_bEmptyDomain)
 129                 {
 130                     if (p == pEnd || *p == '/' || *p == '?' || *p == '#')
 131                         break;
 132                     ++p;
 133                 }
 134                 for (;;)
 135                 {
 136                     sal_Unicode const * q = p;
 137                     if (matchStringIgnoreCase(&q, pEnd, m_aInfix)
 138                         && (q == pEnd || *q == '/' || *q == '?' || *q == '#'))
 139                     {
 140                         bMatches = true;
 141                         pBlock1End = p;
 142                         pBlock2Begin = q;
 143                         pBlock2End = pEnd;
 144                         break;
 145                     }
 146
 147                     if (p == pEnd)
 148                         break;
 149
 150                     sal_Unicode c = *p++;
 151                     if (c == '/' || c == '?' || c == '#')
 152                         break;
 153                 }
 154                 break;
 155         }
 156
 157         if (bMatches)
 158         {
 159             if (m_bTranslation)
 160             {
 161                 if (pTranslation)
 162                 {
 163                     rtl::OUStringBuffer aBuffer(m_aReversePrefix);
 164                     aBuffer.append(pBlock1Begin, pBlock1End - pBlock1Begin);
 165                     aBuffer.append(m_aInfix);
 166                     aBuffer.append(pBlock2Begin, pBlock2End - pBlock2Begin);
 167                     *pTranslation = aBuffer.makeStringAndClear();
 168                 }
 169                 if (pTranslated)
 170                     *pTranslated = true;
 171             }
 172             else
 173             {
 174                 if (pTranslation)
 175                     *pTranslation = rString;
 176                 if (pTranslated)
 177                     *pTranslated = false;
 178             }
 179         }
 180     }
 181
 182     return bMatches;
 183 }
 184
 185 //============================================================================
 186 namespace unnamed_ucb_regexp {
 187
 188 bool isScheme(rtl::OUString const & rString, bool bColon)
 189 {
 190     using comphelper::string::isalphaAscii;
 191     using comphelper::string::isdigitAscii;
 192     // Return true if rString matches <scheme> (plus a trailing ":" if bColon
 193     // is true) from RFC 2396:
 194     sal_Unicode const * p = rString.getStr();
 195     sal_Unicode const * pEnd = p + rString.getLength();
 196     if (p != pEnd && isalphaAscii(*p))
 197         for (++p;;)
 198         {
 199             if (p == pEnd)
 200                 return !bColon;
 201             sal_Unicode c = *p++;
 202             if (!(isalphaAscii(c) || isdigitAscii(c)
 203                   || c == '+' || c == '-' || c == '.'))
 204                 return bColon && c == ':' && p == pEnd;
 205         }
 206     return false;
 207 }
 208
 209 void appendStringLiteral(rtl::OUStringBuffer * pBuffer,
 210                          rtl::OUString const & rString)
 211 {
 212     OSL_ASSERT(pBuffer);
 213
 214     pBuffer->append(sal_Unicode('"'));
 215     sal_Unicode const * p = rString.getStr();
 216     sal_Unicode const * pEnd = p + rString.getLength();
 217     while (p != pEnd)
 218     {
 219         sal_Unicode c = *p++;
 220         if (c == '"' || c == '\\')
 221             pBuffer->append(sal_Unicode('\\'));
 222         pBuffer->append(c);
 223     }
 224     pBuffer->append(sal_Unicode('"'));
 225 }
 226
 227 }
 228
 229 rtl::OUString Regexp::getRegexp(bool bReverse) const
 230 {
 231     if (m_bTranslation)
 232     {
 233         rtl::OUStringBuffer aBuffer;
 234         if (bReverse)
 235         {
 236             if (!m_aReversePrefix.isEmpty())
 237                 appendStringLiteral(&aBuffer, m_aReversePrefix);
 238         }
 239         else
 240         {
 241             if (!m_aPrefix.isEmpty())
 242                 appendStringLiteral(&aBuffer, m_aPrefix);
 243         }
 244         switch (m_eKind)
 245         {
 246             case KIND_PREFIX:
 247                 aBuffer.appendAscii(RTL_CONSTASCII_STRINGPARAM("(.*)"));
 248                 break;
 249
 250             case KIND_AUTHORITY:
 251                 aBuffer.
 252                     appendAscii(RTL_CONSTASCII_STRINGPARAM("(([/?#].*)?)"));
 253                 break;
 254
 255             case KIND_DOMAIN:
 256                 aBuffer.appendAscii(RTL_CONSTASCII_STRINGPARAM("([^/?#]"));
 257                 aBuffer.append(sal_Unicode(m_bEmptyDomain ? '*' : '+'));
 258                 if (!m_aInfix.isEmpty())
 259                     appendStringLiteral(&aBuffer, m_aInfix);
 260                 aBuffer.
 261                     appendAscii(RTL_CONSTASCII_STRINGPARAM("([/?#].*)?)"));
 262                 break;
 263         }
 264         aBuffer.appendAscii(RTL_CONSTASCII_STRINGPARAM("->"));
 265         if (bReverse)
 266         {
 267             if (!m_aPrefix.isEmpty())
 268                 appendStringLiteral(&aBuffer, m_aPrefix);
 269         }
 270         else
 271         {
 272             if (!m_aReversePrefix.isEmpty())
 273                 appendStringLiteral(&aBuffer, m_aReversePrefix);
 274         }
 275         aBuffer.appendAscii(RTL_CONSTASCII_STRINGPARAM("\\1"));
 276         return aBuffer.makeStringAndClear();
 277     }
 278     else if (m_eKind == KIND_PREFIX && isScheme(m_aPrefix, true))
 279         return m_aPrefix.copy(0, m_aPrefix.getLength() - 1);
 280     else
 281     {
 282         rtl::OUStringBuffer aBuffer;
 283         if (!m_aPrefix.isEmpty())
 284             appendStringLiteral(&aBuffer, m_aPrefix);
 285         switch (m_eKind)
 286         {
 287             case KIND_PREFIX:
 288                 aBuffer.appendAscii(RTL_CONSTASCII_STRINGPARAM(".*"));
 289                 break;
 290
 291             case KIND_AUTHORITY:
 292                 aBuffer.appendAscii(RTL_CONSTASCII_STRINGPARAM("([/?#].*)?"));
 293                 break;
 294
 295             case KIND_DOMAIN:
 296                 aBuffer.appendAscii(RTL_CONSTASCII_STRINGPARAM("[^/?#]"));
 297                 aBuffer.append(sal_Unicode(m_bEmptyDomain ? '*' : '+'));
 298                 if (!m_aInfix.isEmpty())
 299                     appendStringLiteral(&aBuffer, m_aInfix);
 300                 aBuffer.appendAscii(RTL_CONSTASCII_STRINGPARAM("([/?#].*)?"));
 301                 break;
 302         }
 303         return aBuffer.makeStringAndClear();
 304     }
 305 }
 306
 307 //============================================================================
 308 namespace unnamed_ucb_regexp {
 309
 310 bool matchString(sal_Unicode const ** pBegin, sal_Unicode const * pEnd,
 311                  sal_Char const * pString, size_t nStringLength)
 312 {
 313     sal_Unicode const * p = *pBegin;
 314
 315     sal_uChar const * q = reinterpret_cast< sal_uChar const * >(pString);
 316     sal_uChar const * qEnd = q + nStringLength;
 317
 318     if (pEnd - p < qEnd - q)
 319         return false;
 320
 321     while (q != qEnd)
 322     {
 323         sal_Unicode c1 = *p++;
 324         sal_Unicode c2 = *q++;
 325         if (c1 != c2)
 326             return false;
 327     }
 328
 329     *pBegin = p;
 330     return true;
 331 }
 332
 333 bool scanStringLiteral(sal_Unicode const ** pBegin, sal_Unicode const * pEnd,
 334                        rtl::OUString * pString)
 335 {
 336     sal_Unicode const * p = *pBegin;
 337
 338     if (p == pEnd || *p++ != '"')
 339         return false;
 340
 341     rtl::OUStringBuffer aBuffer;
 342     for (;;)
 343     {
 344         if (p == pEnd)
 345             return false;
 346         sal_Unicode c = *p++;
 347         if (c == '"')
 348             break;
 349         if (c == '\\')
 350         {
 351             if (p == pEnd)
 352                 return false;
 353             c = *p++;
 354             if (c != '"' && c != '\\')
 355                 return false;
 356         }
 357         aBuffer.append(c);
 358     }
 359
 360     *pBegin = p;
 361     *pString = aBuffer.makeStringAndClear();
 362     return true;
 363 }
 364
 365 }
 366
 367 Regexp Regexp::parse(rtl::OUString const & rRegexp)
 368 {
 369     // Detect an input of '<scheme>' as an abbreviation of '"<scheme>:".*'
 370     // where <scheme> is as defined in RFC 2396:
 371     if (isScheme(rRegexp, false))
 372         return Regexp(Regexp::KIND_PREFIX,
 373                       rRegexp
 374                           + rtl::OUString(RTL_CONSTASCII_USTRINGPARAM(":")),
 375                       false,
 376                       rtl::OUString(),
 377                       false,
 378                       rtl::OUString());
 379
 380     sal_Unicode const * p = rRegexp.getStr();
 381     sal_Unicode const * pEnd = p + rRegexp.getLength();
 382
 383     rtl::OUString aPrefix;
 384     scanStringLiteral(&p, pEnd, &aPrefix);
 385
 386     if (p == pEnd)
 387         throw lang::IllegalArgumentException();
 388
 389     if (matchString(&p, pEnd, RTL_CONSTASCII_STRINGPARAM(".*")))
 390     {
 391         if (p != pEnd)
 392             throw lang::IllegalArgumentException();
 393
 394         return Regexp(Regexp::KIND_PREFIX, aPrefix, false, rtl::OUString(),
 395                       false, rtl::OUString());
 396     }
 397     else if (matchString(&p, pEnd, RTL_CONSTASCII_STRINGPARAM("(.*)->")))
 398     {
 399         rtl::OUString aReversePrefix;
 400         scanStringLiteral(&p, pEnd, &aReversePrefix);
 401
 402         if (!matchString(&p, pEnd, RTL_CONSTASCII_STRINGPARAM("\\1"))
 403             || p != pEnd)
 404             throw lang::IllegalArgumentException();
 405
 406         return Regexp(Regexp::KIND_PREFIX, aPrefix, false, rtl::OUString(),
 407                       true, aReversePrefix);
 408     }
 409     else if (matchString(&p, pEnd, RTL_CONSTASCII_STRINGPARAM("([/?#].*)?")))
 410     {
 411         if (p != pEnd)
 412             throw lang::IllegalArgumentException();
 413
 414         return Regexp(Regexp::KIND_AUTHORITY, aPrefix, false, rtl::OUString(),
 415                       false, rtl::OUString());
 416     }
 417     else if (matchString(&p, pEnd,
 418                          RTL_CONSTASCII_STRINGPARAM("(([/?#].*)?)->")))
 419     {
 420         rtl::OUString aReversePrefix;
 421         if (!(scanStringLiteral(&p, pEnd, &aReversePrefix)
 422               && matchString(&p, pEnd, RTL_CONSTASCII_STRINGPARAM("\\1"))
 423               && p == pEnd))
 424             throw lang::IllegalArgumentException();
 425
 426         return Regexp(Regexp::KIND_AUTHORITY, aPrefix, false, rtl::OUString(),
 427                       true, aReversePrefix);
 428     }
 429     else
 430     {
 431         bool bOpen = false;
 432         if (p != pEnd && *p == '(')
 433         {
 434             ++p;
 435             bOpen = true;
 436         }
 437
 438         if (!matchString(&p, pEnd, RTL_CONSTASCII_STRINGPARAM("[^/?#]")))
 439             throw lang::IllegalArgumentException();
 440
 441         if (p == pEnd || (*p != '*' && *p != '+'))
 442             throw lang::IllegalArgumentException();
 443         bool bEmptyDomain = *p++ == '*';
 444
 445         rtl::OUString aInfix;
 446         scanStringLiteral(&p, pEnd, &aInfix);
 447
 448         if (!matchString(&p, pEnd, RTL_CONSTASCII_STRINGPARAM("([/?#].*)?")))
 449             throw lang::IllegalArgumentException();
 450
 451         rtl::OUString aReversePrefix;
 452         if (bOpen
 453             && !(matchString(&p, pEnd, RTL_CONSTASCII_STRINGPARAM(")->"))
 454                  && scanStringLiteral(&p, pEnd, &aReversePrefix)
 455                  && matchString(&p, pEnd, RTL_CONSTASCII_STRINGPARAM("\\1"))))
 456             throw lang::IllegalArgumentException();
 457
 458         if (p != pEnd)
 459             throw lang::IllegalArgumentException();
 460
 461         return Regexp(Regexp::KIND_DOMAIN, aPrefix, bEmptyDomain, aInfix,
 462                       bOpen, aReversePrefix);
 463     }
 464 }
 465
 466 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */