Version 6.4.0.0.beta1, tag libreoffice-6.4.0.0.beta1
[LibreOffice.git] / ucb / source / regexp / regexp.cxx
bloba1504cb63720d59bcafa45125402638d97f84a63
1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 /*
3 * This file is part of the LibreOffice project.
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 * This file incorporates work covered by the following license notice:
11 * Licensed to the Apache Software Foundation (ASF) under one or more
12 * contributor license agreements. See the NOTICE file distributed
13 * with this work for additional information regarding copyright
14 * ownership. The ASF licenses this file to you under the Apache
15 * License, Version 2.0 (the "License"); you may not use this file
16 * except in compliance with the License. You may obtain a copy of
17 * the License at http://www.apache.org/licenses/LICENSE-2.0 .
20 #include <regexp.hxx>
22 #include <cstddef>
24 #include <osl/diagnose.h>
25 #include <com/sun/star/lang/IllegalArgumentException.hpp>
26 #include <rtl/character.hxx>
27 #include <rtl/ustrbuf.hxx>
28 #include <rtl/ustring.hxx>
30 using namespace com::sun::star;
31 using namespace ucb_impl;
34 // Regexp
37 inline Regexp::Regexp(Kind eTheKind, OUString const & rThePrefix,
38 bool bTheEmptyDomain, OUString const & rTheInfix,
39 bool bTheTranslation,
40 OUString const & rTheReversePrefix):
41 m_eKind(eTheKind),
42 m_aPrefix(rThePrefix),
43 m_aInfix(rTheInfix),
44 m_aReversePrefix(rTheReversePrefix),
45 m_bEmptyDomain(bTheEmptyDomain),
46 m_bTranslation(bTheTranslation)
48 OSL_ASSERT(m_eKind == KIND_DOMAIN
49 || (!m_bEmptyDomain && m_aInfix.isEmpty()));
50 OSL_ASSERT(m_bTranslation || m_aReversePrefix.isEmpty());
54 namespace {
56 bool matchStringIgnoreCase(sal_Unicode const ** pBegin,
57 sal_Unicode const * pEnd,
58 OUString const & rString)
60 sal_Unicode const * p = *pBegin;
62 sal_Unicode const * q = rString.getStr();
63 sal_Unicode const * qEnd = q + rString.getLength();
65 if (pEnd - p < qEnd - q)
66 return false;
68 while (q != qEnd)
70 if (rtl::compareIgnoreAsciiCase(*p++, *q++) != 0)
71 return false;
74 *pBegin = p;
75 return true;
80 bool Regexp::matches(OUString const & rString) const
82 sal_Unicode const * pBegin = rString.getStr();
83 sal_Unicode const * pEnd = pBegin + rString.getLength();
85 bool bMatches = false;
87 sal_Unicode const * p = pBegin;
88 if (matchStringIgnoreCase(&p, pEnd, m_aPrefix))
90 switch (m_eKind)
92 case KIND_PREFIX:
93 bMatches = true;
94 break;
96 case KIND_AUTHORITY:
97 bMatches = p == pEnd || *p == '/' || *p == '?' || *p == '#';
98 break;
100 case KIND_DOMAIN:
101 if (!m_bEmptyDomain)
103 if (p == pEnd || *p == '/' || *p == '?' || *p == '#')
104 break;
105 ++p;
107 for (;;)
109 sal_Unicode const * q = p;
110 if (matchStringIgnoreCase(&q, pEnd, m_aInfix)
111 && (q == pEnd || *q == '/' || *q == '?' || *q == '#'))
113 bMatches = true;
114 break;
117 if (p == pEnd)
118 break;
120 sal_Unicode c = *p++;
121 if (c == '/' || c == '?' || c == '#')
122 break;
124 break;
128 return bMatches;
132 namespace {
134 bool isScheme(OUString const & rString, bool bColon)
136 // Return true if rString matches <scheme> (plus a trailing ":" if bColon
137 // is true) from RFC 2396:
138 sal_Unicode const * p = rString.getStr();
139 sal_Unicode const * pEnd = p + rString.getLength();
140 if (p != pEnd && rtl::isAsciiAlpha(*p))
141 for (++p;;)
143 if (p == pEnd)
144 return !bColon;
145 sal_Unicode c = *p++;
146 if (!(rtl::isAsciiAlphanumeric(c)
147 || c == '+' || c == '-' || c == '.'))
148 return bColon && c == ':' && p == pEnd;
150 return false;
153 void appendStringLiteral(OUStringBuffer * pBuffer,
154 OUString const & rString)
156 OSL_ASSERT(pBuffer);
158 pBuffer->append('"');
159 sal_Unicode const * p = rString.getStr();
160 sal_Unicode const * pEnd = p + rString.getLength();
161 while (p != pEnd)
163 sal_Unicode c = *p++;
164 if (c == '"' || c == '\\')
165 pBuffer->append('\\');
166 pBuffer->append(c);
168 pBuffer->append('"');
173 OUString Regexp::getRegexp() const
175 if (m_bTranslation)
177 OUStringBuffer aBuffer;
178 if (!m_aPrefix.isEmpty())
179 appendStringLiteral(&aBuffer, m_aPrefix);
180 switch (m_eKind)
182 case KIND_PREFIX:
183 aBuffer.append("(.*)");
184 break;
186 case KIND_AUTHORITY:
187 aBuffer.append("(([/?#].*)?)");
188 break;
190 case KIND_DOMAIN:
191 aBuffer.append("([^/?#]");
192 aBuffer.append(sal_Unicode(m_bEmptyDomain ? '*' : '+'));
193 if (!m_aInfix.isEmpty())
194 appendStringLiteral(&aBuffer, m_aInfix);
195 aBuffer.append("([/?#].*)?)");
196 break;
198 aBuffer.append("->");
199 if (!m_aReversePrefix.isEmpty())
200 appendStringLiteral(&aBuffer, m_aReversePrefix);
201 aBuffer.append("\\1");
202 return aBuffer.makeStringAndClear();
204 else if (m_eKind == KIND_PREFIX && isScheme(m_aPrefix, true))
205 return m_aPrefix.copy(0, m_aPrefix.getLength() - 1);
206 else
208 OUStringBuffer aBuffer;
209 if (!m_aPrefix.isEmpty())
210 appendStringLiteral(&aBuffer, m_aPrefix);
211 switch (m_eKind)
213 case KIND_PREFIX:
214 aBuffer.append(".*");
215 break;
217 case KIND_AUTHORITY:
218 aBuffer.append("([/?#].*)?");
219 break;
221 case KIND_DOMAIN:
222 aBuffer.append("[^/?#]");
223 aBuffer.append( m_bEmptyDomain ? '*' : '+' );
224 if (!m_aInfix.isEmpty())
225 appendStringLiteral(&aBuffer, m_aInfix);
226 aBuffer.append("([/?#].*)?");
227 break;
229 return aBuffer.makeStringAndClear();
234 namespace {
236 bool matchString(sal_Unicode const ** pBegin, sal_Unicode const * pEnd,
237 sal_Char const * pString, size_t nStringLength)
239 sal_Unicode const * p = *pBegin;
241 unsigned char const * q = reinterpret_cast< unsigned char const * >(pString);
242 unsigned char const * qEnd = q + nStringLength;
244 if (pEnd - p < qEnd - q)
245 return false;
247 while (q != qEnd)
249 sal_Unicode c1 = *p++;
250 sal_Unicode c2 = *q++;
251 if (c1 != c2)
252 return false;
255 *pBegin = p;
256 return true;
259 bool scanStringLiteral(sal_Unicode const ** pBegin, sal_Unicode const * pEnd,
260 OUString * pString)
262 sal_Unicode const * p = *pBegin;
264 if (p == pEnd || *p++ != '"')
265 return false;
267 OUStringBuffer aBuffer;
268 for (;;)
270 if (p == pEnd)
271 return false;
272 sal_Unicode c = *p++;
273 if (c == '"')
274 break;
275 if (c == '\\')
277 if (p == pEnd)
278 return false;
279 c = *p++;
280 if (c != '"' && c != '\\')
281 return false;
283 aBuffer.append(c);
286 *pBegin = p;
287 *pString = aBuffer.makeStringAndClear();
288 return true;
293 Regexp Regexp::parse(OUString const & rRegexp)
295 // Detect an input of '<scheme>' as an abbreviation of '"<scheme>:".*'
296 // where <scheme> is as defined in RFC 2396:
297 if (isScheme(rRegexp, false))
298 return Regexp(Regexp::KIND_PREFIX,
299 rRegexp + ":",
300 false,
301 OUString(),
302 false,
303 OUString());
305 sal_Unicode const * p = rRegexp.getStr();
306 sal_Unicode const * pEnd = p + rRegexp.getLength();
308 OUString aPrefix;
309 scanStringLiteral(&p, pEnd, &aPrefix);
311 if (p == pEnd)
312 throw lang::IllegalArgumentException();
314 // This and the matchString() calls below are some of the few places where
315 // RTL_CONSTASCII_STRINGPARAM() should NOT be removed.
316 // (c.f. https://gerrit.libreoffice.org/3117)
317 if (matchString(&p, pEnd, RTL_CONSTASCII_STRINGPARAM(".*")))
319 if (p != pEnd)
320 throw lang::IllegalArgumentException();
322 return Regexp(Regexp::KIND_PREFIX, aPrefix, false, OUString(),
323 false, OUString());
325 else if (matchString(&p, pEnd, RTL_CONSTASCII_STRINGPARAM("(.*)->")))
327 OUString aReversePrefix;
328 scanStringLiteral(&p, pEnd, &aReversePrefix);
330 if (!matchString(&p, pEnd, RTL_CONSTASCII_STRINGPARAM("\\1"))
331 || p != pEnd)
332 throw lang::IllegalArgumentException();
334 return Regexp(Regexp::KIND_PREFIX, aPrefix, false, OUString(),
335 true, aReversePrefix);
337 else if (matchString(&p, pEnd, RTL_CONSTASCII_STRINGPARAM("([/?#].*)?")))
339 if (p != pEnd)
340 throw lang::IllegalArgumentException();
342 return Regexp(Regexp::KIND_AUTHORITY, aPrefix, false, OUString(),
343 false, OUString());
345 else if (matchString(&p, pEnd,
346 RTL_CONSTASCII_STRINGPARAM("(([/?#].*)?)->")))
348 OUString aReversePrefix;
349 if (!(scanStringLiteral(&p, pEnd, &aReversePrefix)
350 && matchString(&p, pEnd, RTL_CONSTASCII_STRINGPARAM("\\1"))
351 && p == pEnd))
352 throw lang::IllegalArgumentException();
354 return Regexp(Regexp::KIND_AUTHORITY, aPrefix, false, OUString(),
355 true, aReversePrefix);
357 else
359 bool bOpen = false;
360 if (p != pEnd && *p == '(')
362 ++p;
363 bOpen = true;
366 if (!matchString(&p, pEnd, RTL_CONSTASCII_STRINGPARAM("[^/?#]")))
367 throw lang::IllegalArgumentException();
369 if (p == pEnd || (*p != '*' && *p != '+'))
370 throw lang::IllegalArgumentException();
371 bool bEmptyDomain = *p++ == '*';
373 OUString aInfix;
374 scanStringLiteral(&p, pEnd, &aInfix);
376 if (!matchString(&p, pEnd, RTL_CONSTASCII_STRINGPARAM("([/?#].*)?")))
377 throw lang::IllegalArgumentException();
379 OUString aReversePrefix;
380 if (bOpen
381 && !(matchString(&p, pEnd, RTL_CONSTASCII_STRINGPARAM(")->"))
382 && scanStringLiteral(&p, pEnd, &aReversePrefix)
383 && matchString(&p, pEnd, RTL_CONSTASCII_STRINGPARAM("\\1"))))
384 throw lang::IllegalArgumentException();
386 if (p != pEnd)
387 throw lang::IllegalArgumentException();
389 return Regexp(Regexp::KIND_DOMAIN, aPrefix, bEmptyDomain, aInfix,
390 bOpen, aReversePrefix);
394 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */