ucb/source/regexp/regexp.cxx

   1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
   2 /*
   3  * This file is part of the LibreOffice project.
   4  *
   5  * This Source Code Form is subject to the terms of the Mozilla Public
   6  * License, v. 2.0. If a copy of the MPL was not distributed with this
   7  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
   8  *
   9  * This file incorporates work covered by the following license notice:
  10  *
  11  *   Licensed to the Apache Software Foundation (ASF) under one or more
  12  *   contributor license agreements. See the NOTICE file distributed
  13  *   with this work for additional information regarding copyright
  14  *   ownership. The ASF licenses this file to you under the Apache
  15  *   License, Version 2.0 (the "License"); you may not use this file
  16  *   except in compliance with the License. You may obtain a copy of
  17  *   the License at http://www.apache.org/licenses/LICENSE-2.0 .
  18  */
  19
  20 #include <regexp.hxx>
  21
  22 #include <cstddef>
  23
  24 #include "osl/diagnose.h"
  25 #include <com/sun/star/lang/IllegalArgumentException.hpp>
  26 #include <rtl/ustrbuf.hxx>
  27 #include <rtl/ustring.hxx>
  28 #include <comphelper/string.hxx>
  29
  30 namespace unnamed_ucb_regexp {} using namespace unnamed_ucb_regexp;
  31     // unnamed namespaces don't work well yet...
  32
  33 using namespace com::sun::star;
  34 using namespace ucb_impl;
  35
  36 //============================================================================
  37 //
  38 //  Regexp
  39 //
  40 //============================================================================
  41
  42 inline Regexp::Regexp(Kind eTheKind, rtl::OUString const & rThePrefix,
  43                       bool bTheEmptyDomain, rtl::OUString const & rTheInfix,
  44                       bool bTheTranslation,
  45                       rtl::OUString const & rTheReversePrefix):
  46     m_eKind(eTheKind),
  47     m_aPrefix(rThePrefix),
  48     m_aInfix(rTheInfix),
  49     m_aReversePrefix(rTheReversePrefix),
  50     m_bEmptyDomain(bTheEmptyDomain),
  51     m_bTranslation(bTheTranslation)
  52 {
  53     OSL_ASSERT(m_eKind == KIND_DOMAIN
  54                || (!m_bEmptyDomain && m_aInfix.isEmpty()));
  55     OSL_ASSERT(m_bTranslation || m_aReversePrefix.isEmpty());
  56 }
  57
  58 //============================================================================
  59 namespace unnamed_ucb_regexp {
  60
  61 bool matchStringIgnoreCase(sal_Unicode const ** pBegin,
  62                            sal_Unicode const * pEnd,
  63                            rtl::OUString const & rString)
  64 {
  65     sal_Unicode const * p = *pBegin;
  66
  67     sal_Unicode const * q = rString.getStr();
  68     sal_Unicode const * qEnd = q + rString.getLength();
  69
  70     if (pEnd - p < qEnd - q)
  71         return false;
  72
  73     while (q != qEnd)
  74     {
  75         sal_Unicode c1 = *p++;
  76         sal_Unicode c2 = *q++;
  77         if (c1 >= 'a' && c1 <= 'z')
  78             c1 -= 'a' - 'A';
  79         if (c2 >= 'a' && c2 <= 'z')
  80             c2 -= 'a' - 'A';
  81         if (c1 != c2)
  82             return false;
  83     }
  84
  85     *pBegin = p;
  86     return true;
  87 }
  88
  89 }
  90
  91 bool Regexp::matches(rtl::OUString const & rString,
  92                      rtl::OUString * pTranslation, bool * pTranslated) const
  93 {
  94     sal_Unicode const * pBegin = rString.getStr();
  95     sal_Unicode const * pEnd = pBegin + rString.getLength();
  96
  97     bool bMatches = false;
  98
  99     sal_Unicode const * p = pBegin;
 100     if (matchStringIgnoreCase(&p, pEnd, m_aPrefix))
 101     {
 102         sal_Unicode const * pBlock1Begin = p;
 103         sal_Unicode const * pBlock1End = pEnd;
 104
 105         sal_Unicode const * pBlock2Begin = 0;
 106         sal_Unicode const * pBlock2End = 0;
 107
 108         switch (m_eKind)
 109         {
 110             case KIND_PREFIX:
 111                 bMatches = true;
 112                 break;
 113
 114             case KIND_AUTHORITY:
 115                 bMatches = p == pEnd || *p == '/' || *p == '?' || *p == '#';
 116                 break;
 117
 118             case KIND_DOMAIN:
 119                 if (!m_bEmptyDomain)
 120                 {
 121                     if (p == pEnd || *p == '/' || *p == '?' || *p == '#')
 122                         break;
 123                     ++p;
 124                 }
 125                 for (;;)
 126                 {
 127                     sal_Unicode const * q = p;
 128                     if (matchStringIgnoreCase(&q, pEnd, m_aInfix)
 129                         && (q == pEnd || *q == '/' || *q == '?' || *q == '#'))
 130                     {
 131                         bMatches = true;
 132                         pBlock1End = p;
 133                         pBlock2Begin = q;
 134                         pBlock2End = pEnd;
 135                         break;
 136                     }
 137
 138                     if (p == pEnd)
 139                         break;
 140
 141                     sal_Unicode c = *p++;
 142                     if (c == '/' || c == '?' || c == '#')
 143                         break;
 144                 }
 145                 break;
 146         }
 147
 148         if (bMatches)
 149         {
 150             if (m_bTranslation)
 151             {
 152                 if (pTranslation)
 153                 {
 154                     rtl::OUStringBuffer aBuffer(m_aReversePrefix);
 155                     aBuffer.append(pBlock1Begin, pBlock1End - pBlock1Begin);
 156                     aBuffer.append(m_aInfix);
 157                     aBuffer.append(pBlock2Begin, pBlock2End - pBlock2Begin);
 158                     *pTranslation = aBuffer.makeStringAndClear();
 159                 }
 160                 if (pTranslated)
 161                     *pTranslated = true;
 162             }
 163             else
 164             {
 165                 if (pTranslation)
 166                     *pTranslation = rString;
 167                 if (pTranslated)
 168                     *pTranslated = false;
 169             }
 170         }
 171     }
 172
 173     return bMatches;
 174 }
 175
 176 //============================================================================
 177 namespace unnamed_ucb_regexp {
 178
 179 bool isScheme(rtl::OUString const & rString, bool bColon)
 180 {
 181     using comphelper::string::isalphaAscii;
 182     using comphelper::string::isdigitAscii;
 183     // Return true if rString matches <scheme> (plus a trailing ":" if bColon
 184     // is true) from RFC 2396:
 185     sal_Unicode const * p = rString.getStr();
 186     sal_Unicode const * pEnd = p + rString.getLength();
 187     if (p != pEnd && isalphaAscii(*p))
 188         for (++p;;)
 189         {
 190             if (p == pEnd)
 191                 return !bColon;
 192             sal_Unicode c = *p++;
 193             if (!(isalphaAscii(c) || isdigitAscii(c)
 194                   || c == '+' || c == '-' || c == '.'))
 195                 return bColon && c == ':' && p == pEnd;
 196         }
 197     return false;
 198 }
 199
 200 void appendStringLiteral(rtl::OUStringBuffer * pBuffer,
 201                          rtl::OUString const & rString)
 202 {
 203     OSL_ASSERT(pBuffer);
 204
 205     pBuffer->append(sal_Unicode('"'));
 206     sal_Unicode const * p = rString.getStr();
 207     sal_Unicode const * pEnd = p + rString.getLength();
 208     while (p != pEnd)
 209     {
 210         sal_Unicode c = *p++;
 211         if (c == '"' || c == '\\')
 212             pBuffer->append(sal_Unicode('\\'));
 213         pBuffer->append(c);
 214     }
 215     pBuffer->append(sal_Unicode('"'));
 216 }
 217
 218 }
 219
 220 rtl::OUString Regexp::getRegexp(bool bReverse) const
 221 {
 222     if (m_bTranslation)
 223     {
 224         rtl::OUStringBuffer aBuffer;
 225         if (bReverse)
 226         {
 227             if (!m_aReversePrefix.isEmpty())
 228                 appendStringLiteral(&aBuffer, m_aReversePrefix);
 229         }
 230         else
 231         {
 232             if (!m_aPrefix.isEmpty())
 233                 appendStringLiteral(&aBuffer, m_aPrefix);
 234         }
 235         switch (m_eKind)
 236         {
 237             case KIND_PREFIX:
 238                 aBuffer.appendAscii(RTL_CONSTASCII_STRINGPARAM("(.*)"));
 239                 break;
 240
 241             case KIND_AUTHORITY:
 242                 aBuffer.
 243                     appendAscii(RTL_CONSTASCII_STRINGPARAM("(([/?#].*)?)"));
 244                 break;
 245
 246             case KIND_DOMAIN:
 247                 aBuffer.appendAscii(RTL_CONSTASCII_STRINGPARAM("([^/?#]"));
 248                 aBuffer.append(sal_Unicode(m_bEmptyDomain ? '*' : '+'));
 249                 if (!m_aInfix.isEmpty())
 250                     appendStringLiteral(&aBuffer, m_aInfix);
 251                 aBuffer.
 252                     appendAscii(RTL_CONSTASCII_STRINGPARAM("([/?#].*)?)"));
 253                 break;
 254         }
 255         aBuffer.appendAscii(RTL_CONSTASCII_STRINGPARAM("->"));
 256         if (bReverse)
 257         {
 258             if (!m_aPrefix.isEmpty())
 259                 appendStringLiteral(&aBuffer, m_aPrefix);
 260         }
 261         else
 262         {
 263             if (!m_aReversePrefix.isEmpty())
 264                 appendStringLiteral(&aBuffer, m_aReversePrefix);
 265         }
 266         aBuffer.appendAscii(RTL_CONSTASCII_STRINGPARAM("\\1"));
 267         return aBuffer.makeStringAndClear();
 268     }
 269     else if (m_eKind == KIND_PREFIX && isScheme(m_aPrefix, true))
 270         return m_aPrefix.copy(0, m_aPrefix.getLength() - 1);
 271     else
 272     {
 273         rtl::OUStringBuffer aBuffer;
 274         if (!m_aPrefix.isEmpty())
 275             appendStringLiteral(&aBuffer, m_aPrefix);
 276         switch (m_eKind)
 277         {
 278             case KIND_PREFIX:
 279                 aBuffer.appendAscii(RTL_CONSTASCII_STRINGPARAM(".*"));
 280                 break;
 281
 282             case KIND_AUTHORITY:
 283                 aBuffer.appendAscii(RTL_CONSTASCII_STRINGPARAM("([/?#].*)?"));
 284                 break;
 285
 286             case KIND_DOMAIN:
 287                 aBuffer.appendAscii(RTL_CONSTASCII_STRINGPARAM("[^/?#]"));
 288                 aBuffer.append(sal_Unicode(m_bEmptyDomain ? '*' : '+'));
 289                 if (!m_aInfix.isEmpty())
 290                     appendStringLiteral(&aBuffer, m_aInfix);
 291                 aBuffer.appendAscii(RTL_CONSTASCII_STRINGPARAM("([/?#].*)?"));
 292                 break;
 293         }
 294         return aBuffer.makeStringAndClear();
 295     }
 296 }
 297
 298 //============================================================================
 299 namespace unnamed_ucb_regexp {
 300
 301 bool matchString(sal_Unicode const ** pBegin, sal_Unicode const * pEnd,
 302                  sal_Char const * pString, size_t nStringLength)
 303 {
 304     sal_Unicode const * p = *pBegin;
 305
 306     sal_uChar const * q = reinterpret_cast< sal_uChar const * >(pString);
 307     sal_uChar const * qEnd = q + nStringLength;
 308
 309     if (pEnd - p < qEnd - q)
 310         return false;
 311
 312     while (q != qEnd)
 313     {
 314         sal_Unicode c1 = *p++;
 315         sal_Unicode c2 = *q++;
 316         if (c1 != c2)
 317             return false;
 318     }
 319
 320     *pBegin = p;
 321     return true;
 322 }
 323
 324 bool scanStringLiteral(sal_Unicode const ** pBegin, sal_Unicode const * pEnd,
 325                        rtl::OUString * pString)
 326 {
 327     sal_Unicode const * p = *pBegin;
 328
 329     if (p == pEnd || *p++ != '"')
 330         return false;
 331
 332     rtl::OUStringBuffer aBuffer;
 333     for (;;)
 334     {
 335         if (p == pEnd)
 336             return false;
 337         sal_Unicode c = *p++;
 338         if (c == '"')
 339             break;
 340         if (c == '\\')
 341         {
 342             if (p == pEnd)
 343                 return false;
 344             c = *p++;
 345             if (c != '"' && c != '\\')
 346                 return false;
 347         }
 348         aBuffer.append(c);
 349     }
 350
 351     *pBegin = p;
 352     *pString = aBuffer.makeStringAndClear();
 353     return true;
 354 }
 355
 356 }
 357
 358 Regexp Regexp::parse(rtl::OUString const & rRegexp)
 359 {
 360     // Detect an input of '<scheme>' as an abbreviation of '"<scheme>:".*'
 361     // where <scheme> is as defined in RFC 2396:
 362     if (isScheme(rRegexp, false))
 363         return Regexp(Regexp::KIND_PREFIX,
 364                       rRegexp
 365                           + rtl::OUString(RTL_CONSTASCII_USTRINGPARAM(":")),
 366                       false,
 367                       rtl::OUString(),
 368                       false,
 369                       rtl::OUString());
 370
 371     sal_Unicode const * p = rRegexp.getStr();
 372     sal_Unicode const * pEnd = p + rRegexp.getLength();
 373
 374     rtl::OUString aPrefix;
 375     scanStringLiteral(&p, pEnd, &aPrefix);
 376
 377     if (p == pEnd)
 378         throw lang::IllegalArgumentException();
 379
 380     if (matchString(&p, pEnd, RTL_CONSTASCII_STRINGPARAM(".*")))
 381     {
 382         if (p != pEnd)
 383             throw lang::IllegalArgumentException();
 384
 385         return Regexp(Regexp::KIND_PREFIX, aPrefix, false, rtl::OUString(),
 386                       false, rtl::OUString());
 387     }
 388     else if (matchString(&p, pEnd, RTL_CONSTASCII_STRINGPARAM("(.*)->")))
 389     {
 390         rtl::OUString aReversePrefix;
 391         scanStringLiteral(&p, pEnd, &aReversePrefix);
 392
 393         if (!matchString(&p, pEnd, RTL_CONSTASCII_STRINGPARAM("\\1"))
 394             || p != pEnd)
 395             throw lang::IllegalArgumentException();
 396
 397         return Regexp(Regexp::KIND_PREFIX, aPrefix, false, rtl::OUString(),
 398                       true, aReversePrefix);
 399     }
 400     else if (matchString(&p, pEnd, RTL_CONSTASCII_STRINGPARAM("([/?#].*)?")))
 401     {
 402         if (p != pEnd)
 403             throw lang::IllegalArgumentException();
 404
 405         return Regexp(Regexp::KIND_AUTHORITY, aPrefix, false, rtl::OUString(),
 406                       false, rtl::OUString());
 407     }
 408     else if (matchString(&p, pEnd,
 409                          RTL_CONSTASCII_STRINGPARAM("(([/?#].*)?)->")))
 410     {
 411         rtl::OUString aReversePrefix;
 412         if (!(scanStringLiteral(&p, pEnd, &aReversePrefix)
 413               && matchString(&p, pEnd, RTL_CONSTASCII_STRINGPARAM("\\1"))
 414               && p == pEnd))
 415             throw lang::IllegalArgumentException();
 416
 417         return Regexp(Regexp::KIND_AUTHORITY, aPrefix, false, rtl::OUString(),
 418                       true, aReversePrefix);
 419     }
 420     else
 421     {
 422         bool bOpen = false;
 423         if (p != pEnd && *p == '(')
 424         {
 425             ++p;
 426             bOpen = true;
 427         }
 428
 429         if (!matchString(&p, pEnd, RTL_CONSTASCII_STRINGPARAM("[^/?#]")))
 430             throw lang::IllegalArgumentException();
 431
 432         if (p == pEnd || (*p != '*' && *p != '+'))
 433             throw lang::IllegalArgumentException();
 434         bool bEmptyDomain = *p++ == '*';
 435
 436         rtl::OUString aInfix;
 437         scanStringLiteral(&p, pEnd, &aInfix);
 438
 439         if (!matchString(&p, pEnd, RTL_CONSTASCII_STRINGPARAM("([/?#].*)?")))
 440             throw lang::IllegalArgumentException();
 441
 442         rtl::OUString aReversePrefix;
 443         if (bOpen
 444             && !(matchString(&p, pEnd, RTL_CONSTASCII_STRINGPARAM(")->"))
 445                  && scanStringLiteral(&p, pEnd, &aReversePrefix)
 446                  && matchString(&p, pEnd, RTL_CONSTASCII_STRINGPARAM("\\1"))))
 447             throw lang::IllegalArgumentException();
 448
 449         if (p != pEnd)
 450             throw lang::IllegalArgumentException();
 451
 452         return Regexp(Regexp::KIND_DOMAIN, aPrefix, bEmptyDomain, aInfix,
 453                       bOpen, aReversePrefix);
 454     }
 455 }
 456
 457 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */