Update ooo320-m1
[ooovba.git] / ucb / source / regexp / regexp.cxx
blob0e38b25e3a34a8988c90bc9990882cbd901190b1
1 /*************************************************************************
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * Copyright 2008 by Sun Microsystems, Inc.
7 * OpenOffice.org - a multi-platform office productivity suite
9 * $RCSfile: regexp.cxx,v $
10 * $Revision: 1.8 $
12 * This file is part of OpenOffice.org.
14 * OpenOffice.org is free software: you can redistribute it and/or modify
15 * it under the terms of the GNU Lesser General Public License version 3
16 * only, as published by the Free Software Foundation.
18 * OpenOffice.org is distributed in the hope that it will be useful,
19 * but WITHOUT ANY WARRANTY; without even the implied warranty of
20 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 * GNU Lesser General Public License version 3 for more details
22 * (a copy is included in the LICENSE file that accompanied this code).
24 * You should have received a copy of the GNU Lesser General Public License
25 * version 3 along with OpenOffice.org. If not, see
26 * <http://www.openoffice.org/license.html>
27 * for a copy of the LGPLv3 License.
29 ************************************************************************/
31 // MARKER(update_precomp.py): autogen include statement, do not remove
32 #include "precompiled_ucb.hxx"
33 #include <regexp.hxx>
35 #include <cstddef>
37 #include "osl/diagnose.h"
38 #include <com/sun/star/lang/IllegalArgumentException.hpp>
39 #include <rtl/ustrbuf.hxx>
40 #include <rtl/ustring.hxx>
42 namespace unnamed_ucb_regexp {} using namespace unnamed_ucb_regexp;
43 // unnamed namespaces don't work well yet...
45 using namespace com::sun::star;
46 using namespace ucb_impl;
48 //============================================================================
50 // Regexp
52 //============================================================================
54 inline Regexp::Regexp(Kind eTheKind, rtl::OUString const & rThePrefix,
55 bool bTheEmptyDomain, rtl::OUString const & rTheInfix,
56 bool bTheTranslation,
57 rtl::OUString const & rTheReversePrefix):
58 m_eKind(eTheKind),
59 m_aPrefix(rThePrefix),
60 m_aInfix(rTheInfix),
61 m_aReversePrefix(rTheReversePrefix),
62 m_bEmptyDomain(bTheEmptyDomain),
63 m_bTranslation(bTheTranslation)
65 OSL_ASSERT(m_eKind == KIND_DOMAIN
66 || !m_bEmptyDomain && m_aInfix.getLength() == 0);
67 OSL_ASSERT(m_bTranslation || m_aReversePrefix.getLength() == 0);
70 //============================================================================
71 namespace unnamed_ucb_regexp {
73 bool matchStringIgnoreCase(sal_Unicode const ** pBegin,
74 sal_Unicode const * pEnd,
75 rtl::OUString const & rString)
77 sal_Unicode const * p = *pBegin;
79 sal_Unicode const * q = rString.getStr();
80 sal_Unicode const * qEnd = q + rString.getLength();
82 if (pEnd - p < qEnd - q)
83 return false;
85 while (q != qEnd)
87 sal_Unicode c1 = *p++;
88 sal_Unicode c2 = *q++;
89 if (c1 >= 'a' && c1 <= 'z')
90 c1 -= 'a' - 'A';
91 if (c2 >= 'a' && c2 <= 'z')
92 c2 -= 'a' - 'A';
93 if (c1 != c2)
94 return false;
97 *pBegin = p;
98 return true;
103 bool Regexp::matches(rtl::OUString const & rString,
104 rtl::OUString * pTranslation, bool * pTranslated) const
106 sal_Unicode const * pBegin = rString.getStr();
107 sal_Unicode const * pEnd = pBegin + rString.getLength();
109 bool bMatches = false;
111 sal_Unicode const * p = pBegin;
112 if (matchStringIgnoreCase(&p, pEnd, m_aPrefix))
114 sal_Unicode const * pBlock1Begin = p;
115 sal_Unicode const * pBlock1End = pEnd;
117 sal_Unicode const * pBlock2Begin = 0;
118 sal_Unicode const * pBlock2End = 0;
120 switch (m_eKind)
122 case KIND_PREFIX:
123 bMatches = true;
124 break;
126 case KIND_AUTHORITY:
127 bMatches = p == pEnd || *p == '/' || *p == '?' || *p == '#';
128 break;
130 case KIND_DOMAIN:
131 if (!m_bEmptyDomain)
133 if (p == pEnd || *p == '/' || *p == '?' || *p == '#')
134 break;
135 ++p;
137 for (;;)
139 sal_Unicode const * q = p;
140 if (matchStringIgnoreCase(&q, pEnd, m_aInfix)
141 && (q == pEnd || *q == '/' || *q == '?' || *q == '#'))
143 bMatches = true;
144 pBlock1End = p;
145 pBlock2Begin = q;
146 pBlock2End = pEnd;
147 break;
150 if (p == pEnd)
151 break;
153 sal_Unicode c = *p++;
154 if (c == '/' || c == '?' || c == '#')
155 break;
157 break;
160 if (bMatches)
162 if (m_bTranslation)
164 if (pTranslation)
166 rtl::OUStringBuffer aBuffer(m_aReversePrefix);
167 aBuffer.append(pBlock1Begin, pBlock1End - pBlock1Begin);
168 aBuffer.append(m_aInfix);
169 aBuffer.append(pBlock2Begin, pBlock2End - pBlock2Begin);
170 *pTranslation = aBuffer.makeStringAndClear();
172 if (pTranslated)
173 *pTranslated = true;
175 else
177 if (pTranslation)
178 *pTranslation = rString;
179 if (pTranslated)
180 *pTranslated = false;
185 return bMatches;
188 //============================================================================
189 namespace unnamed_ucb_regexp {
191 inline bool isAlpha(sal_Unicode c)
193 return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z');
196 inline bool isDigit(sal_Unicode c)
198 return c >= '0' && c <= '9';
201 bool isScheme(rtl::OUString const & rString, bool bColon)
203 // Return true if rString matches <scheme> (plus a trailing ":" if bColon
204 // is true) from RFC 2396:
205 sal_Unicode const * p = rString.getStr();
206 sal_Unicode const * pEnd = p + rString.getLength();
207 if (p != pEnd && isAlpha(*p))
208 for (++p;;)
210 if (p == pEnd)
211 return !bColon;
212 sal_Unicode c = *p++;
213 if (!(isAlpha(c) || isDigit(c)
214 || c == '+' || c == '-' || c == '.'))
215 return bColon && c == ':' && p == pEnd;
217 return false;
220 void appendStringLiteral(rtl::OUStringBuffer * pBuffer,
221 rtl::OUString const & rString)
223 OSL_ASSERT(pBuffer);
225 pBuffer->append(sal_Unicode('"'));
226 sal_Unicode const * p = rString.getStr();
227 sal_Unicode const * pEnd = p + rString.getLength();
228 while (p != pEnd)
230 sal_Unicode c = *p++;
231 if (c == '"' || c == '\\')
232 pBuffer->append(sal_Unicode('\\'));
233 pBuffer->append(c);
235 pBuffer->append(sal_Unicode('"'));
240 rtl::OUString Regexp::getRegexp(bool bReverse) const
242 if (m_bTranslation)
244 rtl::OUStringBuffer aBuffer;
245 if (bReverse)
247 if (m_aReversePrefix.getLength() != 0)
248 appendStringLiteral(&aBuffer, m_aReversePrefix);
250 else
252 if (m_aPrefix.getLength() != 0)
253 appendStringLiteral(&aBuffer, m_aPrefix);
255 switch (m_eKind)
257 case KIND_PREFIX:
258 aBuffer.appendAscii(RTL_CONSTASCII_STRINGPARAM("(.*)"));
259 break;
261 case KIND_AUTHORITY:
262 aBuffer.
263 appendAscii(RTL_CONSTASCII_STRINGPARAM("(([/?#].*)?)"));
264 break;
266 case KIND_DOMAIN:
267 aBuffer.appendAscii(RTL_CONSTASCII_STRINGPARAM("([^/?#]"));
268 aBuffer.append(sal_Unicode(m_bEmptyDomain ? '*' : '+'));
269 if (m_aInfix.getLength() != 0)
270 appendStringLiteral(&aBuffer, m_aInfix);
271 aBuffer.
272 appendAscii(RTL_CONSTASCII_STRINGPARAM("([/?#].*)?)"));
273 break;
275 aBuffer.appendAscii(RTL_CONSTASCII_STRINGPARAM("->"));
276 if (bReverse)
278 if (m_aPrefix.getLength() != 0)
279 appendStringLiteral(&aBuffer, m_aPrefix);
281 else
283 if (m_aReversePrefix.getLength() != 0)
284 appendStringLiteral(&aBuffer, m_aReversePrefix);
286 aBuffer.appendAscii(RTL_CONSTASCII_STRINGPARAM("\\1"));
287 return aBuffer.makeStringAndClear();
289 else if (m_eKind == KIND_PREFIX && isScheme(m_aPrefix, true))
290 return m_aPrefix.copy(0, m_aPrefix.getLength() - 1);
291 else
293 rtl::OUStringBuffer aBuffer;
294 if (m_aPrefix.getLength() != 0)
295 appendStringLiteral(&aBuffer, m_aPrefix);
296 switch (m_eKind)
298 case KIND_PREFIX:
299 aBuffer.appendAscii(RTL_CONSTASCII_STRINGPARAM(".*"));
300 break;
302 case KIND_AUTHORITY:
303 aBuffer.appendAscii(RTL_CONSTASCII_STRINGPARAM("([/?#].*)?"));
304 break;
306 case KIND_DOMAIN:
307 aBuffer.appendAscii(RTL_CONSTASCII_STRINGPARAM("[^/?#]"));
308 aBuffer.append(sal_Unicode(m_bEmptyDomain ? '*' : '+'));
309 if (m_aInfix.getLength() != 0)
310 appendStringLiteral(&aBuffer, m_aInfix);
311 aBuffer.appendAscii(RTL_CONSTASCII_STRINGPARAM("([/?#].*)?"));
312 break;
314 return aBuffer.makeStringAndClear();
318 //============================================================================
319 namespace unnamed_ucb_regexp {
321 bool matchString(sal_Unicode const ** pBegin, sal_Unicode const * pEnd,
322 sal_Char const * pString, size_t nStringLength)
324 sal_Unicode const * p = *pBegin;
326 sal_uChar const * q = reinterpret_cast< sal_uChar const * >(pString);
327 sal_uChar const * qEnd = q + nStringLength;
329 if (pEnd - p < qEnd - q)
330 return false;
332 while (q != qEnd)
334 sal_Unicode c1 = *p++;
335 sal_Unicode c2 = *q++;
336 if (c1 != c2)
337 return false;
340 *pBegin = p;
341 return true;
344 bool scanStringLiteral(sal_Unicode const ** pBegin, sal_Unicode const * pEnd,
345 rtl::OUString * pString)
347 sal_Unicode const * p = *pBegin;
349 if (p == pEnd || *p++ != '"')
350 return false;
352 rtl::OUStringBuffer aBuffer;
353 for (;;)
355 if (p == pEnd)
356 return false;
357 sal_Unicode c = *p++;
358 if (c == '"')
359 break;
360 if (c == '\\')
362 if (p == pEnd)
363 return false;
364 c = *p++;
365 if (c != '"' && c != '\\')
366 return false;
368 aBuffer.append(c);
371 *pBegin = p;
372 *pString = aBuffer.makeStringAndClear();
373 return true;
378 Regexp Regexp::parse(rtl::OUString const & rRegexp)
380 // Detect an input of '<scheme>' as an abbreviation of '"<scheme>:".*'
381 // where <scheme> is as defined in RFC 2396:
382 if (isScheme(rRegexp, false))
383 return Regexp(Regexp::KIND_PREFIX,
384 rRegexp
385 + rtl::OUString(RTL_CONSTASCII_USTRINGPARAM(":")),
386 false,
387 rtl::OUString(),
388 false,
389 rtl::OUString());
391 sal_Unicode const * p = rRegexp.getStr();
392 sal_Unicode const * pEnd = p + rRegexp.getLength();
394 rtl::OUString aPrefix;
395 scanStringLiteral(&p, pEnd, &aPrefix);
397 if (p == pEnd)
398 throw lang::IllegalArgumentException();
400 if (matchString(&p, pEnd, RTL_CONSTASCII_STRINGPARAM(".*")))
402 if (p != pEnd)
403 throw lang::IllegalArgumentException();
405 return Regexp(Regexp::KIND_PREFIX, aPrefix, false, rtl::OUString(),
406 false, rtl::OUString());
408 else if (matchString(&p, pEnd, RTL_CONSTASCII_STRINGPARAM("(.*)->")))
410 rtl::OUString aReversePrefix;
411 scanStringLiteral(&p, pEnd, &aReversePrefix);
413 if (!matchString(&p, pEnd, RTL_CONSTASCII_STRINGPARAM("\\1"))
414 || p != pEnd)
415 throw lang::IllegalArgumentException();
417 return Regexp(Regexp::KIND_PREFIX, aPrefix, false, rtl::OUString(),
418 true, aReversePrefix);
420 else if (matchString(&p, pEnd, RTL_CONSTASCII_STRINGPARAM("([/?#].*)?")))
422 if (p != pEnd)
423 throw lang::IllegalArgumentException();
425 return Regexp(Regexp::KIND_AUTHORITY, aPrefix, false, rtl::OUString(),
426 false, rtl::OUString());
428 else if (matchString(&p, pEnd,
429 RTL_CONSTASCII_STRINGPARAM("(([/?#].*)?)->")))
431 rtl::OUString aReversePrefix;
432 if (!(scanStringLiteral(&p, pEnd, &aReversePrefix)
433 && matchString(&p, pEnd, RTL_CONSTASCII_STRINGPARAM("\\1"))
434 && p == pEnd))
435 throw lang::IllegalArgumentException();
437 return Regexp(Regexp::KIND_AUTHORITY, aPrefix, false, rtl::OUString(),
438 true, aReversePrefix);
440 else
442 bool bOpen = false;
443 if (p != pEnd && *p == '(')
445 ++p;
446 bOpen = true;
449 if (!matchString(&p, pEnd, RTL_CONSTASCII_STRINGPARAM("[^/?#]")))
450 throw lang::IllegalArgumentException();
452 if (p == pEnd || (*p != '*' && *p != '+'))
453 throw lang::IllegalArgumentException();
454 bool bEmptyDomain = *p++ == '*';
456 rtl::OUString aInfix;
457 scanStringLiteral(&p, pEnd, &aInfix);
459 if (!matchString(&p, pEnd, RTL_CONSTASCII_STRINGPARAM("([/?#].*)?")))
460 throw lang::IllegalArgumentException();
462 rtl::OUString aReversePrefix;
463 if (bOpen
464 && !(matchString(&p, pEnd, RTL_CONSTASCII_STRINGPARAM(")->"))
465 && scanStringLiteral(&p, pEnd, &aReversePrefix)
466 && matchString(&p, pEnd, RTL_CONSTASCII_STRINGPARAM("\\1"))))
467 throw lang::IllegalArgumentException();
469 if (p != pEnd)
470 throw lang::IllegalArgumentException();
472 return Regexp(Regexp::KIND_DOMAIN, aPrefix, bEmptyDomain, aInfix,
473 bOpen, aReversePrefix);