ucb/source/regexp/regexp.cxx

   1 /*************************************************************************
   2  *
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * Copyright 2008 by Sun Microsystems, Inc.
   6  *
   7  * OpenOffice.org - a multi-platform office productivity suite
   8  *
   9  * $RCSfile: regexp.cxx,v $
  10  * $Revision: 1.8 $
  11  *
  12  * This file is part of OpenOffice.org.
  13  *
  14  * OpenOffice.org is free software: you can redistribute it and/or modify
  15  * it under the terms of the GNU Lesser General Public License version 3
  16  * only, as published by the Free Software Foundation.
  17  *
  18  * OpenOffice.org is distributed in the hope that it will be useful,
  19  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  20  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  21  * GNU Lesser General Public License version 3 for more details
  22  * (a copy is included in the LICENSE file that accompanied this code).
  23  *
  24  * You should have received a copy of the GNU Lesser General Public License
  25  * version 3 along with OpenOffice.org.  If not, see
  26  * <http://www.openoffice.org/license.html>
  27  * for a copy of the LGPLv3 License.
  28  *
  29  ************************************************************************/
  30
  31 // MARKER(update_precomp.py): autogen include statement, do not remove
  32 #include "precompiled_ucb.hxx"
  33 #include <regexp.hxx>
  34
  35 #include <cstddef>
  36
  37 #include "osl/diagnose.h"
  38 #include <com/sun/star/lang/IllegalArgumentException.hpp>
  39 #include <rtl/ustrbuf.hxx>
  40 #include <rtl/ustring.hxx>
  41
  42 namespace unnamed_ucb_regexp {} using namespace unnamed_ucb_regexp;
  43     // unnamed namespaces don't work well yet...
  44
  45 using namespace com::sun::star;
  46 using namespace ucb_impl;
  47
  48 //============================================================================
  49 //
  50 //  Regexp
  51 //
  52 //============================================================================
  53
  54 inline Regexp::Regexp(Kind eTheKind, rtl::OUString const & rThePrefix,
  55                       bool bTheEmptyDomain, rtl::OUString const & rTheInfix,
  56                       bool bTheTranslation,
  57                       rtl::OUString const & rTheReversePrefix):
  58     m_eKind(eTheKind),
  59     m_aPrefix(rThePrefix),
  60     m_aInfix(rTheInfix),
  61     m_aReversePrefix(rTheReversePrefix),
  62     m_bEmptyDomain(bTheEmptyDomain),
  63     m_bTranslation(bTheTranslation)
  64 {
  65     OSL_ASSERT(m_eKind == KIND_DOMAIN
  66                || !m_bEmptyDomain && m_aInfix.getLength() == 0);
  67     OSL_ASSERT(m_bTranslation || m_aReversePrefix.getLength() == 0);
  68 }
  69
  70 //============================================================================
  71 namespace unnamed_ucb_regexp {
  72
  73 bool matchStringIgnoreCase(sal_Unicode const ** pBegin,
  74                            sal_Unicode const * pEnd,
  75                            rtl::OUString const & rString)
  76 {
  77     sal_Unicode const * p = *pBegin;
  78
  79     sal_Unicode const * q = rString.getStr();
  80     sal_Unicode const * qEnd = q + rString.getLength();
  81
  82     if (pEnd - p < qEnd - q)
  83         return false;
  84
  85     while (q != qEnd)
  86     {
  87         sal_Unicode c1 = *p++;
  88         sal_Unicode c2 = *q++;
  89         if (c1 >= 'a' && c1 <= 'z')
  90             c1 -= 'a' - 'A';
  91         if (c2 >= 'a' && c2 <= 'z')
  92             c2 -= 'a' - 'A';
  93         if (c1 != c2)
  94             return false;
  95     }
  96
  97     *pBegin = p;
  98     return true;
  99 }
 100
 101 }
 102
 103 bool Regexp::matches(rtl::OUString const & rString,
 104                      rtl::OUString * pTranslation, bool * pTranslated) const
 105 {
 106     sal_Unicode const * pBegin = rString.getStr();
 107     sal_Unicode const * pEnd = pBegin + rString.getLength();
 108
 109     bool bMatches = false;
 110
 111     sal_Unicode const * p = pBegin;
 112     if (matchStringIgnoreCase(&p, pEnd, m_aPrefix))
 113     {
 114         sal_Unicode const * pBlock1Begin = p;
 115         sal_Unicode const * pBlock1End = pEnd;
 116
 117         sal_Unicode const * pBlock2Begin = 0;
 118         sal_Unicode const * pBlock2End = 0;
 119
 120         switch (m_eKind)
 121         {
 122             case KIND_PREFIX:
 123                 bMatches = true;
 124                 break;
 125
 126             case KIND_AUTHORITY:
 127                 bMatches = p == pEnd || *p == '/' || *p == '?' || *p == '#';
 128                 break;
 129
 130             case KIND_DOMAIN:
 131                 if (!m_bEmptyDomain)
 132                 {
 133                     if (p == pEnd || *p == '/' || *p == '?' || *p == '#')
 134                         break;
 135                     ++p;
 136                 }
 137                 for (;;)
 138                 {
 139                     sal_Unicode const * q = p;
 140                     if (matchStringIgnoreCase(&q, pEnd, m_aInfix)
 141                         && (q == pEnd || *q == '/' || *q == '?' || *q == '#'))
 142                     {
 143                         bMatches = true;
 144                         pBlock1End = p;
 145                         pBlock2Begin = q;
 146                         pBlock2End = pEnd;
 147                         break;
 148                     }
 149
 150                     if (p == pEnd)
 151                         break;
 152
 153                     sal_Unicode c = *p++;
 154                     if (c == '/' || c == '?' || c == '#')
 155                         break;
 156                 }
 157                 break;
 158         }
 159
 160         if (bMatches)
 161         {
 162             if (m_bTranslation)
 163             {
 164                 if (pTranslation)
 165                 {
 166                     rtl::OUStringBuffer aBuffer(m_aReversePrefix);
 167                     aBuffer.append(pBlock1Begin, pBlock1End - pBlock1Begin);
 168                     aBuffer.append(m_aInfix);
 169                     aBuffer.append(pBlock2Begin, pBlock2End - pBlock2Begin);
 170                     *pTranslation = aBuffer.makeStringAndClear();
 171                 }
 172                 if (pTranslated)
 173                     *pTranslated = true;
 174             }
 175             else
 176             {
 177                 if (pTranslation)
 178                     *pTranslation = rString;
 179                 if (pTranslated)
 180                     *pTranslated = false;
 181             }
 182         }
 183     }
 184
 185     return bMatches;
 186 }
 187
 188 //============================================================================
 189 namespace unnamed_ucb_regexp {
 190
 191 inline bool isAlpha(sal_Unicode c)
 192 {
 193     return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z');
 194 }
 195
 196 inline bool isDigit(sal_Unicode c)
 197 {
 198     return c >= '0' && c <= '9';
 199 }
 200
 201 bool isScheme(rtl::OUString const & rString, bool bColon)
 202 {
 203     // Return true if rString matches <scheme> (plus a trailing ":" if bColon
 204     // is true) from RFC 2396:
 205     sal_Unicode const * p = rString.getStr();
 206     sal_Unicode const * pEnd = p + rString.getLength();
 207     if (p != pEnd && isAlpha(*p))
 208         for (++p;;)
 209         {
 210             if (p == pEnd)
 211                 return !bColon;
 212             sal_Unicode c = *p++;
 213             if (!(isAlpha(c) || isDigit(c)
 214                   || c == '+' || c == '-' || c == '.'))
 215                 return bColon && c == ':' && p == pEnd;
 216         }
 217     return false;
 218 }
 219
 220 void appendStringLiteral(rtl::OUStringBuffer * pBuffer,
 221                          rtl::OUString const & rString)
 222 {
 223     OSL_ASSERT(pBuffer);
 224
 225     pBuffer->append(sal_Unicode('"'));
 226     sal_Unicode const * p = rString.getStr();
 227     sal_Unicode const * pEnd = p + rString.getLength();
 228     while (p != pEnd)
 229     {
 230         sal_Unicode c = *p++;
 231         if (c == '"' || c == '\\')
 232             pBuffer->append(sal_Unicode('\\'));
 233         pBuffer->append(c);
 234     }
 235     pBuffer->append(sal_Unicode('"'));
 236 }
 237
 238 }
 239
 240 rtl::OUString Regexp::getRegexp(bool bReverse) const
 241 {
 242     if (m_bTranslation)
 243     {
 244         rtl::OUStringBuffer aBuffer;
 245         if (bReverse)
 246         {
 247             if (m_aReversePrefix.getLength() != 0)
 248                 appendStringLiteral(&aBuffer, m_aReversePrefix);
 249         }
 250         else
 251         {
 252             if (m_aPrefix.getLength() != 0)
 253                 appendStringLiteral(&aBuffer, m_aPrefix);
 254         }
 255         switch (m_eKind)
 256         {
 257             case KIND_PREFIX:
 258                 aBuffer.appendAscii(RTL_CONSTASCII_STRINGPARAM("(.*)"));
 259                 break;
 260
 261             case KIND_AUTHORITY:
 262                 aBuffer.
 263                     appendAscii(RTL_CONSTASCII_STRINGPARAM("(([/?#].*)?)"));
 264                 break;
 265
 266             case KIND_DOMAIN:
 267                 aBuffer.appendAscii(RTL_CONSTASCII_STRINGPARAM("([^/?#]"));
 268                 aBuffer.append(sal_Unicode(m_bEmptyDomain ? '*' : '+'));
 269                 if (m_aInfix.getLength() != 0)
 270                     appendStringLiteral(&aBuffer, m_aInfix);
 271                 aBuffer.
 272                     appendAscii(RTL_CONSTASCII_STRINGPARAM("([/?#].*)?)"));
 273                 break;
 274         }
 275         aBuffer.appendAscii(RTL_CONSTASCII_STRINGPARAM("->"));
 276         if (bReverse)
 277         {
 278             if (m_aPrefix.getLength() != 0)
 279                 appendStringLiteral(&aBuffer, m_aPrefix);
 280         }
 281         else
 282         {
 283             if (m_aReversePrefix.getLength() != 0)
 284                 appendStringLiteral(&aBuffer, m_aReversePrefix);
 285         }
 286         aBuffer.appendAscii(RTL_CONSTASCII_STRINGPARAM("\\1"));
 287         return aBuffer.makeStringAndClear();
 288     }
 289     else if (m_eKind == KIND_PREFIX && isScheme(m_aPrefix, true))
 290         return m_aPrefix.copy(0, m_aPrefix.getLength() - 1);
 291     else
 292     {
 293         rtl::OUStringBuffer aBuffer;
 294         if (m_aPrefix.getLength() != 0)
 295             appendStringLiteral(&aBuffer, m_aPrefix);
 296         switch (m_eKind)
 297         {
 298             case KIND_PREFIX:
 299                 aBuffer.appendAscii(RTL_CONSTASCII_STRINGPARAM(".*"));
 300                 break;
 301
 302             case KIND_AUTHORITY:
 303                 aBuffer.appendAscii(RTL_CONSTASCII_STRINGPARAM("([/?#].*)?"));
 304                 break;
 305
 306             case KIND_DOMAIN:
 307                 aBuffer.appendAscii(RTL_CONSTASCII_STRINGPARAM("[^/?#]"));
 308                 aBuffer.append(sal_Unicode(m_bEmptyDomain ? '*' : '+'));
 309                 if (m_aInfix.getLength() != 0)
 310                     appendStringLiteral(&aBuffer, m_aInfix);
 311                 aBuffer.appendAscii(RTL_CONSTASCII_STRINGPARAM("([/?#].*)?"));
 312                 break;
 313         }
 314         return aBuffer.makeStringAndClear();
 315     }
 316 }
 317
 318 //============================================================================
 319 namespace unnamed_ucb_regexp {
 320
 321 bool matchString(sal_Unicode const ** pBegin, sal_Unicode const * pEnd,
 322                  sal_Char const * pString, size_t nStringLength)
 323 {
 324     sal_Unicode const * p = *pBegin;
 325
 326     sal_uChar const * q = reinterpret_cast< sal_uChar const * >(pString);
 327     sal_uChar const * qEnd = q + nStringLength;
 328
 329     if (pEnd - p < qEnd - q)
 330         return false;
 331
 332     while (q != qEnd)
 333     {
 334         sal_Unicode c1 = *p++;
 335         sal_Unicode c2 = *q++;
 336         if (c1 != c2)
 337             return false;
 338     }
 339
 340     *pBegin = p;
 341     return true;
 342 }
 343
 344 bool scanStringLiteral(sal_Unicode const ** pBegin, sal_Unicode const * pEnd,
 345                        rtl::OUString * pString)
 346 {
 347     sal_Unicode const * p = *pBegin;
 348
 349     if (p == pEnd || *p++ != '"')
 350         return false;
 351
 352     rtl::OUStringBuffer aBuffer;
 353     for (;;)
 354     {
 355         if (p == pEnd)
 356             return false;
 357         sal_Unicode c = *p++;
 358         if (c == '"')
 359             break;
 360         if (c == '\\')
 361         {
 362             if (p == pEnd)
 363                 return false;
 364             c = *p++;
 365             if (c != '"' && c != '\\')
 366                 return false;
 367         }
 368         aBuffer.append(c);
 369     }
 370
 371     *pBegin = p;
 372     *pString = aBuffer.makeStringAndClear();
 373     return true;
 374 }
 375
 376 }
 377
 378 Regexp Regexp::parse(rtl::OUString const & rRegexp)
 379 {
 380     // Detect an input of '<scheme>' as an abbreviation of '"<scheme>:".*'
 381     // where <scheme> is as defined in RFC 2396:
 382     if (isScheme(rRegexp, false))
 383         return Regexp(Regexp::KIND_PREFIX,
 384                       rRegexp
 385                           + rtl::OUString(RTL_CONSTASCII_USTRINGPARAM(":")),
 386                       false,
 387                       rtl::OUString(),
 388                       false,
 389                       rtl::OUString());
 390
 391     sal_Unicode const * p = rRegexp.getStr();
 392     sal_Unicode const * pEnd = p + rRegexp.getLength();
 393
 394     rtl::OUString aPrefix;
 395     scanStringLiteral(&p, pEnd, &aPrefix);
 396
 397     if (p == pEnd)
 398         throw lang::IllegalArgumentException();
 399
 400     if (matchString(&p, pEnd, RTL_CONSTASCII_STRINGPARAM(".*")))
 401     {
 402         if (p != pEnd)
 403             throw lang::IllegalArgumentException();
 404
 405         return Regexp(Regexp::KIND_PREFIX, aPrefix, false, rtl::OUString(),
 406                       false, rtl::OUString());
 407     }
 408     else if (matchString(&p, pEnd, RTL_CONSTASCII_STRINGPARAM("(.*)->")))
 409     {
 410         rtl::OUString aReversePrefix;
 411         scanStringLiteral(&p, pEnd, &aReversePrefix);
 412
 413         if (!matchString(&p, pEnd, RTL_CONSTASCII_STRINGPARAM("\\1"))
 414             || p != pEnd)
 415             throw lang::IllegalArgumentException();
 416
 417         return Regexp(Regexp::KIND_PREFIX, aPrefix, false, rtl::OUString(),
 418                       true, aReversePrefix);
 419     }
 420     else if (matchString(&p, pEnd, RTL_CONSTASCII_STRINGPARAM("([/?#].*)?")))
 421     {
 422         if (p != pEnd)
 423             throw lang::IllegalArgumentException();
 424
 425         return Regexp(Regexp::KIND_AUTHORITY, aPrefix, false, rtl::OUString(),
 426                       false, rtl::OUString());
 427     }
 428     else if (matchString(&p, pEnd,
 429                          RTL_CONSTASCII_STRINGPARAM("(([/?#].*)?)->")))
 430     {
 431         rtl::OUString aReversePrefix;
 432         if (!(scanStringLiteral(&p, pEnd, &aReversePrefix)
 433               && matchString(&p, pEnd, RTL_CONSTASCII_STRINGPARAM("\\1"))
 434               && p == pEnd))
 435             throw lang::IllegalArgumentException();
 436
 437         return Regexp(Regexp::KIND_AUTHORITY, aPrefix, false, rtl::OUString(),
 438                       true, aReversePrefix);
 439     }
 440     else
 441     {
 442         bool bOpen = false;
 443         if (p != pEnd && *p == '(')
 444         {
 445             ++p;
 446             bOpen = true;
 447         }
 448
 449         if (!matchString(&p, pEnd, RTL_CONSTASCII_STRINGPARAM("[^/?#]")))
 450             throw lang::IllegalArgumentException();
 451
 452         if (p == pEnd || (*p != '*' && *p != '+'))
 453             throw lang::IllegalArgumentException();
 454         bool bEmptyDomain = *p++ == '*';
 455
 456         rtl::OUString aInfix;
 457         scanStringLiteral(&p, pEnd, &aInfix);
 458
 459         if (!matchString(&p, pEnd, RTL_CONSTASCII_STRINGPARAM("([/?#].*)?")))
 460             throw lang::IllegalArgumentException();
 461
 462         rtl::OUString aReversePrefix;
 463         if (bOpen
 464             && !(matchString(&p, pEnd, RTL_CONSTASCII_STRINGPARAM(")->"))
 465                  && scanStringLiteral(&p, pEnd, &aReversePrefix)
 466                  && matchString(&p, pEnd, RTL_CONSTASCII_STRINGPARAM("\\1"))))
 467             throw lang::IllegalArgumentException();
 468
 469         if (p != pEnd)
 470             throw lang::IllegalArgumentException();
 471
 472         return Regexp(Regexp::KIND_DOMAIN, aPrefix, bEmptyDomain, aInfix,
 473                       bOpen, aReversePrefix);
 474     }
 475 }
 476