Version 6.4.0.0.beta1, tag libreoffice-6.4.0.0.beta1
[LibreOffice.git] / i18npool / source / search / textsearch.cxx
blob4174c6cd1e86eebb5d3f408699d284a0e534cf15
1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 /*
3 * This file is part of the LibreOffice project.
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 * This file incorporates work covered by the following license notice:
11 * Licensed to the Apache Software Foundation (ASF) under one or more
12 * contributor license agreements. See the NOTICE file distributed
13 * with this work for additional information regarding copyright
14 * ownership. The ASF licenses this file to you under the Apache
15 * License, Version 2.0 (the "License"); you may not use this file
16 * except in compliance with the License. You may obtain a copy of
17 * the License at http://www.apache.org/licenses/LICENSE-2.0 .
20 #include "textsearch.hxx"
21 #include "levdis.hxx"
22 #include <com/sun/star/lang/XSingleServiceFactory.hpp>
23 #include <comphelper/processfactory.hxx>
24 #include <com/sun/star/i18n/BreakIterator.hpp>
25 #include <com/sun/star/util/SearchAlgorithms2.hpp>
26 #include <com/sun/star/util/SearchFlags.hpp>
27 #include <com/sun/star/i18n/WordType.hpp>
28 #include <com/sun/star/i18n/ScriptType.hpp>
29 #include <com/sun/star/i18n/CharacterIteratorMode.hpp>
30 #include <com/sun/star/i18n/CharacterClassification.hpp>
31 #include <com/sun/star/i18n/KCharacterType.hpp>
32 #include <com/sun/star/i18n/Transliteration.hpp>
33 #include <cppuhelper/factory.hxx>
34 #include <cppuhelper/supportsservice.hxx>
35 #include <cppuhelper/weak.hxx>
36 #include <i18nutil/transliteration.hxx>
37 #include <rtl/ustrbuf.hxx>
38 #include <sal/log.hxx>
40 #include <unicode/regex.h>
42 using namespace ::com::sun::star::util;
43 using namespace ::com::sun::star::uno;
44 using namespace ::com::sun::star::lang;
45 using namespace ::com::sun::star::i18n;
46 using namespace ::com::sun::star;
48 const TransliterationFlags COMPLEX_TRANS_MASK =
49 TransliterationFlags::ignoreBaFa_ja_JP |
50 TransliterationFlags::ignoreIterationMark_ja_JP |
51 TransliterationFlags::ignoreTiJi_ja_JP |
52 TransliterationFlags::ignoreHyuByu_ja_JP |
53 TransliterationFlags::ignoreSeZe_ja_JP |
54 TransliterationFlags::ignoreIandEfollowedByYa_ja_JP |
55 TransliterationFlags::ignoreKiKuFollowedBySa_ja_JP |
56 TransliterationFlags::ignoreProlongedSoundMark_ja_JP;
58 namespace
60 TransliterationFlags maskComplexTrans( TransliterationFlags n )
62 // IGNORE_KANA and FULLWIDTH_HALFWIDTH are simple but need to take effect
63 // in complex transliteration.
64 return
65 n & (COMPLEX_TRANS_MASK | // all set ignore bits
66 TransliterationFlags::IGNORE_KANA | // plus IGNORE_KANA bit
67 TransliterationFlags::FULLWIDTH_HALFWIDTH); // and the FULLWIDTH_HALFWIDTH value
70 bool isComplexTrans( TransliterationFlags n )
72 return bool(n & COMPLEX_TRANS_MASK);
75 TransliterationFlags maskSimpleTrans( TransliterationFlags n )
77 return n & ~COMPLEX_TRANS_MASK;
80 bool isSimpleTrans( TransliterationFlags n )
82 return bool(maskSimpleTrans(n));
85 // Regex patterns are case sensitive.
86 TransliterationFlags maskSimpleRegexTrans( TransliterationFlags n )
88 TransliterationFlags m = (n & TransliterationFlags::IGNORE_MASK) & ~TransliterationFlags::IGNORE_CASE;
89 TransliterationFlags v = n & TransliterationFlags::NON_IGNORE_MASK;
90 if (v == TransliterationFlags::UPPERCASE_LOWERCASE || v == TransliterationFlags::LOWERCASE_UPPERCASE)
91 v = TransliterationFlags::NONE;
92 return (m | v) & ~COMPLEX_TRANS_MASK;
95 bool isSimpleRegexTrans( TransliterationFlags n )
97 return bool(maskSimpleRegexTrans(n));
101 TextSearch::TextSearch(const Reference < XComponentContext > & rxContext)
102 : m_xContext( rxContext )
104 SearchOptions2 aOpt;
105 aOpt.AlgorithmType2 = SearchAlgorithms2::ABSOLUTE;
106 aOpt.algorithmType = SearchAlgorithms_ABSOLUTE;
107 aOpt.searchFlag = SearchFlags::ALL_IGNORE_CASE;
108 //aOpt.Locale = ???;
109 setOptions( aOpt );
112 TextSearch::~TextSearch()
114 pRegexMatcher.reset();
115 pWLD.reset();
116 pJumpTable.reset();
117 pJumpTable2.reset();
120 void TextSearch::setOptions2( const SearchOptions2& rOptions )
122 osl::MutexGuard g(m_aMutex);
124 aSrchPara = rOptions;
126 pRegexMatcher.reset();
127 pWLD.reset();
128 pJumpTable.reset();
129 pJumpTable2.reset();
130 maWildcardReversePattern.clear();
131 maWildcardReversePattern2.clear();
132 TransliterationFlags transliterateFlags = static_cast<TransliterationFlags>(aSrchPara.transliterateFlags);
134 // Create Transliteration class
135 if( isSimpleTrans( transliterateFlags) )
137 if( !xTranslit.is() )
138 xTranslit.set( Transliteration::create( m_xContext ) );
139 xTranslit->loadModule(
140 static_cast<TransliterationModules>(maskSimpleTrans(transliterateFlags)),
141 aSrchPara.Locale);
143 else if( xTranslit.is() )
144 xTranslit = nullptr;
146 // Create Transliteration for 2<->1, 2<->2 transliteration
147 if ( isComplexTrans( transliterateFlags) )
149 if( !xTranslit2.is() )
150 xTranslit2.set( Transliteration::create( m_xContext ) );
151 // Load transliteration module
152 xTranslit2->loadModule(
153 static_cast<TransliterationModules>(maskComplexTrans(transliterateFlags)),
154 aSrchPara.Locale);
157 if ( !xBreak.is() )
158 xBreak = css::i18n::BreakIterator::create( m_xContext );
160 sSrchStr = aSrchPara.searchString;
162 // Transliterate search string.
163 if (aSrchPara.AlgorithmType2 == SearchAlgorithms2::REGEXP)
165 if (isSimpleRegexTrans(transliterateFlags))
167 if (maskSimpleRegexTrans(transliterateFlags) !=
168 maskSimpleTrans(transliterateFlags))
170 css::uno::Reference< XExtendedTransliteration > xTranslitPattern(
171 Transliteration::create( m_xContext ));
172 if (xTranslitPattern.is())
174 xTranslitPattern->loadModule(
175 static_cast<TransliterationModules>(maskSimpleRegexTrans(transliterateFlags)),
176 aSrchPara.Locale);
177 sSrchStr = xTranslitPattern->transliterateString2String(
178 aSrchPara.searchString, 0, aSrchPara.searchString.getLength());
181 else
183 if (xTranslit.is())
184 sSrchStr = xTranslit->transliterateString2String(
185 aSrchPara.searchString, 0, aSrchPara.searchString.getLength());
187 // xTranslit2 complex transliterated sSrchStr2 is not used in
188 // regex, see TextSearch::searchForward() and
189 // TextSearch::searchBackward()
192 else
194 if ( xTranslit.is() && isSimpleTrans(transliterateFlags) )
195 sSrchStr = xTranslit->transliterateString2String(
196 aSrchPara.searchString, 0, aSrchPara.searchString.getLength());
198 if ( xTranslit2.is() && isComplexTrans(transliterateFlags) )
199 sSrchStr2 = xTranslit2->transliterateString2String(
200 aSrchPara.searchString, 0, aSrchPara.searchString.getLength());
203 // When start or end of search string is a complex script type, we need to
204 // make sure the result boundary is not located in the middle of cell.
205 checkCTLStart = (xBreak.is() && (xBreak->getScriptType(sSrchStr, 0) ==
206 ScriptType::COMPLEX));
207 checkCTLEnd = (xBreak.is() && (xBreak->getScriptType(sSrchStr,
208 sSrchStr.getLength()-1) == ScriptType::COMPLEX));
210 // Take the new SearchOptions2::AlgorithmType2 field and ignore
211 // SearchOptions::algorithmType
212 switch( aSrchPara.AlgorithmType2)
214 case SearchAlgorithms2::REGEXP:
215 fnForward = &TextSearch::RESrchFrwrd;
216 fnBackward = &TextSearch::RESrchBkwrd;
217 RESrchPrepare( aSrchPara);
218 break;
220 case SearchAlgorithms2::APPROXIMATE:
221 fnForward = &TextSearch::ApproxSrchFrwrd;
222 fnBackward = &TextSearch::ApproxSrchBkwrd;
224 pWLD.reset( new WLevDistance( sSrchStr.getStr(), aSrchPara.changedChars,
225 aSrchPara.insertedChars, aSrchPara.deletedChars,
226 0 != (SearchFlags::LEV_RELAXED & aSrchPara.searchFlag ) ) );
228 nLimit = pWLD->GetLimit();
229 break;
231 case SearchAlgorithms2::WILDCARD:
232 mcWildcardEscapeChar = static_cast<sal_uInt32>(aSrchPara.WildcardEscapeCharacter);
233 mbWildcardAllowSubstring = ((aSrchPara.searchFlag & SearchFlags::WILD_MATCH_SELECTION) == 0);
234 fnForward = &TextSearch::WildcardSrchFrwrd;
235 fnBackward = &TextSearch::WildcardSrchBkwrd;
236 break;
238 default:
239 SAL_WARN("i18npool","TextSearch::setOptions2 - default what?");
240 [[fallthrough]];
241 case SearchAlgorithms2::ABSOLUTE:
242 fnForward = &TextSearch::NSrchFrwrd;
243 fnBackward = &TextSearch::NSrchBkwrd;
244 break;
248 void TextSearch::setOptions( const SearchOptions& rOptions )
250 osl::MutexGuard g(m_aMutex);
252 sal_Int16 nAlgorithmType2;
253 switch (rOptions.algorithmType)
255 case SearchAlgorithms_REGEXP:
256 nAlgorithmType2 = SearchAlgorithms2::REGEXP;
257 break;
258 case SearchAlgorithms_APPROXIMATE:
259 nAlgorithmType2 = SearchAlgorithms2::APPROXIMATE;
260 break;
261 default:
262 SAL_WARN("i18npool","TextSearch::setOptions - default what?");
263 [[fallthrough]];
264 case SearchAlgorithms_ABSOLUTE:
265 nAlgorithmType2 = SearchAlgorithms2::ABSOLUTE;
266 break;
268 // It would be nice if an inherited struct had a ctor that takes an
269 // instance of the object the struct derived from...
270 SearchOptions2 aOptions2(
271 rOptions.algorithmType,
272 rOptions.searchFlag,
273 rOptions.searchString,
274 rOptions.replaceString,
275 rOptions.Locale,
276 rOptions.changedChars,
277 rOptions.deletedChars,
278 rOptions.insertedChars,
279 rOptions.transliterateFlags,
280 nAlgorithmType2,
281 0 // no wildcard search, no escape character...
283 setOptions2( aOptions2);
286 static sal_Int32 FindPosInSeq_Impl( const Sequence <sal_Int32>& rOff, sal_Int32 nPos )
288 auto pOff = std::find_if(rOff.begin(), rOff.end(),
289 [nPos](const sal_Int32 nOff) { return nOff >= nPos; });
290 return static_cast<sal_Int32>(std::distance(rOff.begin(), pOff));
293 bool TextSearch::isCellStart(const OUString& searchStr, sal_Int32 nPos)
295 sal_Int32 nDone;
296 return nPos == xBreak->previousCharacters(searchStr, nPos+1,
297 aSrchPara.Locale, CharacterIteratorMode::SKIPCELL, 1, nDone);
300 SearchResult TextSearch::searchForward( const OUString& searchStr, sal_Int32 startPos, sal_Int32 endPos )
302 osl::MutexGuard g(m_aMutex);
304 SearchResult sres;
306 OUString in_str(searchStr);
308 bUsePrimarySrchStr = true;
310 if ( xTranslit.is() )
312 // apply normal transliteration (1<->1, 1<->0)
313 css::uno::Sequence<sal_Int32> offset(endPos - startPos);
314 in_str = xTranslit->transliterate( searchStr, startPos, endPos - startPos, offset );
316 // JP 20.6.2001: also the start and end positions must be corrected!
317 sal_Int32 newStartPos =
318 (startPos == 0) ? 0 : FindPosInSeq_Impl( offset, startPos );
320 sal_Int32 newEndPos = (endPos < searchStr.getLength())
321 ? FindPosInSeq_Impl( offset, endPos )
322 : in_str.getLength();
324 sal_Int32 nExtraOffset = 0;
325 if (pRegexMatcher && startPos > 0)
327 // avoid matching ^ here - in_str omits a prefix of the searchStr
328 // this is a really lame way to do it, but ICU only offers
329 // useAnchoringBounds() to disable *both* bounds but what is needed
330 // here is to disable only one bound and respect the other
331 in_str = "X" + in_str;
332 nExtraOffset = 1;
333 newStartPos += nExtraOffset;
334 newEndPos += nExtraOffset;
337 sres = (this->*fnForward)( in_str, newStartPos, newEndPos );
339 // Map offsets back to untransliterated string.
340 const sal_Int32 nOffsets = offset.getLength();
341 if (nOffsets)
343 // For regex nGroups is the number of groups+1 with group 0 being
344 // the entire match.
345 const sal_Int32 nGroups = sres.startOffset.getLength();
346 for ( sal_Int32 k = 0; k < nGroups; k++ )
348 const sal_Int32 nStart = sres.startOffset[k] - nExtraOffset;
349 // Result offsets are negative (-1) if a group expression was
350 // not matched.
351 if (nStart >= 0)
352 sres.startOffset[k] = (nStart < nOffsets ? offset[nStart] : (offset[nOffsets - 1] + 1));
353 // JP 20.6.2001: end is ever exclusive and then don't return
354 // the position of the next character - return the
355 // next position behind the last found character!
356 // "a b c" find "b" must return 2,3 and not 2,4!!!
357 const sal_Int32 nStop = sres.endOffset[k] - nExtraOffset;
358 if (nStop >= 0)
360 if (nStop > 0)
361 sres.endOffset[k] = offset[(nStop <= nOffsets ? nStop : nOffsets) - 1] + 1;
362 else
363 sres.endOffset[k] = offset[0];
368 else
370 sres = (this->*fnForward)( in_str, startPos, endPos );
373 if ( xTranslit2.is() && aSrchPara.AlgorithmType2 != SearchAlgorithms2::REGEXP)
375 SearchResult sres2;
377 in_str = searchStr;
378 css::uno::Sequence <sal_Int32> offset( in_str.getLength());
380 in_str = xTranslit2->transliterate( searchStr, 0, in_str.getLength(), offset );
382 if( startPos )
383 startPos = FindPosInSeq_Impl( offset, startPos );
385 if( endPos < searchStr.getLength() )
386 endPos = FindPosInSeq_Impl( offset, endPos );
387 else
388 endPos = in_str.getLength();
390 bUsePrimarySrchStr = false;
391 sres2 = (this->*fnForward)( in_str, startPos, endPos );
393 for ( int k = 0; k < sres2.startOffset.getLength(); k++ )
395 if (sres2.startOffset[k])
396 sres2.startOffset[k] = offset[sres2.startOffset[k]-1] + 1;
397 if (sres2.endOffset[k])
398 sres2.endOffset[k] = offset[sres2.endOffset[k]-1] + 1;
401 // pick first and long one
402 if ( sres.subRegExpressions == 0)
403 return sres2;
404 if ( sres2.subRegExpressions == 1)
406 if ( sres.startOffset[0] > sres2.startOffset[0])
407 return sres2;
408 else if ( sres.startOffset[0] == sres2.startOffset[0] &&
409 sres.endOffset[0] < sres2.endOffset[0])
410 return sres2;
414 return sres;
417 SearchResult TextSearch::searchBackward( const OUString& searchStr, sal_Int32 startPos, sal_Int32 endPos )
419 osl::MutexGuard g(m_aMutex);
421 SearchResult sres;
423 OUString in_str(searchStr);
425 bUsePrimarySrchStr = true;
427 if ( xTranslit.is() )
429 // apply only simple 1<->1 transliteration here
430 css::uno::Sequence<sal_Int32> offset(startPos - endPos);
431 in_str = xTranslit->transliterate( searchStr, endPos, startPos - endPos, offset );
433 // JP 20.6.2001: also the start and end positions must be corrected!
434 sal_Int32 const newStartPos = (startPos < searchStr.getLength())
435 ? FindPosInSeq_Impl( offset, startPos )
436 : in_str.getLength();
438 sal_Int32 const newEndPos =
439 (endPos == 0) ? 0 : FindPosInSeq_Impl( offset, endPos );
441 // TODO: this would need nExtraOffset handling to avoid $ matching
442 // if (pRegexMatcher && startPos < searchStr.getLength())
443 // but that appears to be impossible with ICU regex
445 sres = (this->*fnBackward)( in_str, newStartPos, newEndPos );
447 // Map offsets back to untransliterated string.
448 const sal_Int32 nOffsets = offset.getLength();
449 if (nOffsets)
451 // For regex nGroups is the number of groups+1 with group 0 being
452 // the entire match.
453 const sal_Int32 nGroups = sres.startOffset.getLength();
454 for ( sal_Int32 k = 0; k < nGroups; k++ )
456 const sal_Int32 nStart = sres.startOffset[k];
457 // Result offsets are negative (-1) if a group expression was
458 // not matched.
459 if (nStart >= 0)
461 if (nStart > 0)
462 sres.startOffset[k] = offset[(nStart <= nOffsets ? nStart : nOffsets) - 1] + 1;
463 else
464 sres.startOffset[k] = offset[0];
466 // JP 20.6.2001: end is ever exclusive and then don't return
467 // the position of the next character - return the
468 // next position behind the last found character!
469 // "a b c" find "b" must return 2,3 and not 2,4!!!
470 const sal_Int32 nStop = sres.endOffset[k];
471 if (nStop >= 0)
472 sres.endOffset[k] = (nStop < nOffsets ? offset[nStop] : (offset[nOffsets - 1] + 1));
476 else
478 sres = (this->*fnBackward)( in_str, startPos, endPos );
481 if ( xTranslit2.is() && aSrchPara.AlgorithmType2 != SearchAlgorithms2::REGEXP )
483 SearchResult sres2;
485 in_str = searchStr;
486 css::uno::Sequence <sal_Int32> offset( in_str.getLength());
488 in_str = xTranslit2->transliterate(searchStr, 0, in_str.getLength(), offset);
490 if( startPos < searchStr.getLength() )
491 startPos = FindPosInSeq_Impl( offset, startPos );
492 else
493 startPos = in_str.getLength();
495 if( endPos )
496 endPos = FindPosInSeq_Impl( offset, endPos );
498 bUsePrimarySrchStr = false;
499 sres2 = (this->*fnBackward)( in_str, startPos, endPos );
501 for( int k = 0; k < sres2.startOffset.getLength(); k++ )
503 if (sres2.startOffset[k])
504 sres2.startOffset[k] = offset[sres2.startOffset[k]-1]+1;
505 if (sres2.endOffset[k])
506 sres2.endOffset[k] = offset[sres2.endOffset[k]-1]+1;
509 // pick last and long one
510 if ( sres.subRegExpressions == 0 )
511 return sres2;
512 if ( sres2.subRegExpressions == 1 )
514 if ( sres.startOffset[0] < sres2.startOffset[0] )
515 return sres2;
516 if ( sres.startOffset[0] == sres2.startOffset[0] &&
517 sres.endOffset[0] > sres2.endOffset[0] )
518 return sres2;
522 return sres;
526 bool TextSearch::IsDelimiter( const OUString& rStr, sal_Int32 nPos ) const
528 bool bRet = true;
529 if( '\x7f' != rStr[nPos])
531 if ( !xCharClass.is() )
532 xCharClass = CharacterClassification::create( m_xContext );
533 sal_Int32 nCType = xCharClass->getCharacterType( rStr, nPos,
534 aSrchPara.Locale );
535 if( 0 != (( KCharacterType::DIGIT | KCharacterType::ALPHA |
536 KCharacterType::LETTER ) & nCType ) )
537 bRet = false;
539 return bRet;
542 // --------- helper methods for Boyer-Moore like text searching ----------
543 // TODO: use ICU's regex UREGEX_LITERAL mode instead when it becomes available
545 void TextSearch::MakeForwardTab()
547 // create the jumptable for the search text
549 if( pJumpTable && bIsForwardTab )
551 return; // the jumpTable is ok
553 bIsForwardTab = true;
555 sal_Int32 n, nLen = sSrchStr.getLength();
556 pJumpTable.reset( new TextSearchJumpTable );
558 for( n = 0; n < nLen - 1; ++n )
560 sal_Unicode cCh = sSrchStr[n];
561 sal_Int32 nDiff = nLen - n - 1;
562 TextSearchJumpTable::value_type aEntry( cCh, nDiff );
564 ::std::pair< TextSearchJumpTable::iterator, bool > aPair =
565 pJumpTable->insert( aEntry );
566 if ( !aPair.second )
567 (*(aPair.first)).second = nDiff;
571 void TextSearch::MakeForwardTab2()
573 // create the jumptable for the search text
574 if( pJumpTable2 && bIsForwardTab )
576 return; // the jumpTable is ok
578 bIsForwardTab = true;
580 sal_Int32 n, nLen = sSrchStr2.getLength();
581 pJumpTable2.reset( new TextSearchJumpTable );
583 for( n = 0; n < nLen - 1; ++n )
585 sal_Unicode cCh = sSrchStr2[n];
586 sal_Int32 nDiff = nLen - n - 1;
588 TextSearchJumpTable::value_type aEntry( cCh, nDiff );
589 ::std::pair< TextSearchJumpTable::iterator, bool > aPair =
590 pJumpTable2->insert( aEntry );
591 if ( !aPair.second )
592 (*(aPair.first)).second = nDiff;
596 void TextSearch::MakeBackwardTab()
598 // create the jumptable for the search text
599 if( pJumpTable && !bIsForwardTab)
601 return; // the jumpTable is ok
603 bIsForwardTab = false;
605 sal_Int32 n, nLen = sSrchStr.getLength();
606 pJumpTable.reset( new TextSearchJumpTable );
608 for( n = nLen-1; n > 0; --n )
610 sal_Unicode cCh = sSrchStr[n];
611 TextSearchJumpTable::value_type aEntry( cCh, n );
612 ::std::pair< TextSearchJumpTable::iterator, bool > aPair =
613 pJumpTable->insert( aEntry );
614 if ( !aPair.second )
615 (*(aPair.first)).second = n;
619 void TextSearch::MakeBackwardTab2()
621 // create the jumptable for the search text
622 if( pJumpTable2 && !bIsForwardTab )
624 return; // the jumpTable is ok
626 bIsForwardTab = false;
628 sal_Int32 n, nLen = sSrchStr2.getLength();
629 pJumpTable2.reset( new TextSearchJumpTable );
631 for( n = nLen-1; n > 0; --n )
633 sal_Unicode cCh = sSrchStr2[n];
634 TextSearchJumpTable::value_type aEntry( cCh, n );
635 ::std::pair< TextSearchJumpTable::iterator, bool > aPair =
636 pJumpTable2->insert( aEntry );
637 if ( !aPair.second )
638 (*(aPair.first)).second = n;
642 sal_Int32 TextSearch::GetDiff( const sal_Unicode cChr ) const
644 TextSearchJumpTable *pJump;
645 OUString sSearchKey;
647 if ( bUsePrimarySrchStr ) {
648 pJump = pJumpTable.get();
649 sSearchKey = sSrchStr;
650 } else {
651 pJump = pJumpTable2.get();
652 sSearchKey = sSrchStr2;
655 TextSearchJumpTable::const_iterator iLook = pJump->find( cChr );
656 if ( iLook == pJump->end() )
657 return sSearchKey.getLength();
658 return (*iLook).second;
662 SearchResult TextSearch::NSrchFrwrd( const OUString& searchStr, sal_Int32 startPos, sal_Int32 endPos )
664 SearchResult aRet;
665 aRet.subRegExpressions = 0;
667 OUString sSearchKey = bUsePrimarySrchStr ? sSrchStr : sSrchStr2;
669 sal_Int32 nSuchIdx = searchStr.getLength();
670 sal_Int32 nEnde = endPos;
671 if( !nSuchIdx || !sSearchKey.getLength() || sSearchKey.getLength() > nSuchIdx )
672 return aRet;
675 if( nEnde < sSearchKey.getLength() ) // position inside the search region ?
676 return aRet;
678 nEnde -= sSearchKey.getLength();
680 if (bUsePrimarySrchStr)
681 MakeForwardTab(); // create the jumptable
682 else
683 MakeForwardTab2();
685 for (sal_Int32 nCmpIdx = startPos; // start position for the search
686 nCmpIdx <= nEnde;
687 nCmpIdx += GetDiff( searchStr[nCmpIdx + sSearchKey.getLength()-1]))
689 // if the match would be the completed cells, skip it.
690 if ( (checkCTLStart && !isCellStart( searchStr, nCmpIdx )) || (checkCTLEnd
691 && !isCellStart( searchStr, nCmpIdx + sSearchKey.getLength())) )
692 continue;
694 nSuchIdx = sSearchKey.getLength() - 1;
695 while( nSuchIdx >= 0 && sSearchKey[nSuchIdx] == searchStr[nCmpIdx + nSuchIdx])
697 if( nSuchIdx == 0 )
699 if( SearchFlags::NORM_WORD_ONLY & aSrchPara.searchFlag )
701 sal_Int32 nFndEnd = nCmpIdx + sSearchKey.getLength();
702 bool bAtStart = !nCmpIdx;
703 bool bAtEnd = nFndEnd == endPos;
704 bool bDelimBefore = bAtStart || IsDelimiter( searchStr, nCmpIdx-1 );
705 bool bDelimBehind = bAtEnd || IsDelimiter( searchStr, nFndEnd );
706 // * 1 -> only one word in the paragraph
707 // * 2 -> at begin of paragraph
708 // * 3 -> at end of paragraph
709 // * 4 -> inside the paragraph
710 if( !( ( bAtStart && bAtEnd ) || // 1
711 ( bAtStart && bDelimBehind ) || // 2
712 ( bAtEnd && bDelimBefore ) || // 3
713 ( bDelimBefore && bDelimBehind ))) // 4
714 break;
717 aRet.subRegExpressions = 1;
718 aRet.startOffset.realloc( 1 );
719 aRet.startOffset[ 0 ] = nCmpIdx;
720 aRet.endOffset.realloc( 1 );
721 aRet.endOffset[ 0 ] = nCmpIdx + sSearchKey.getLength();
723 return aRet;
725 else
726 nSuchIdx--;
729 return aRet;
732 SearchResult TextSearch::NSrchBkwrd( const OUString& searchStr, sal_Int32 startPos, sal_Int32 endPos )
734 SearchResult aRet;
735 aRet.subRegExpressions = 0;
737 OUString sSearchKey = bUsePrimarySrchStr ? sSrchStr : sSrchStr2;
739 sal_Int32 nSuchIdx = searchStr.getLength();
740 sal_Int32 nEnde = endPos;
741 if( nSuchIdx == 0 || sSearchKey.isEmpty() || sSearchKey.getLength() > nSuchIdx)
742 return aRet;
744 if (bUsePrimarySrchStr)
745 MakeBackwardTab(); // create the jumptable
746 else
747 MakeBackwardTab2();
749 if( nEnde == nSuchIdx ) // end position for the search
750 nEnde = sSearchKey.getLength();
751 else
752 nEnde += sSearchKey.getLength();
754 sal_Int32 nCmpIdx = startPos; // start position for the search
756 while (nCmpIdx >= nEnde)
758 // if the match would be the completed cells, skip it.
759 if ( (!checkCTLStart || isCellStart( searchStr, nCmpIdx -
760 sSearchKey.getLength() )) && (!checkCTLEnd ||
761 isCellStart( searchStr, nCmpIdx)))
763 nSuchIdx = 0;
764 while( nSuchIdx < sSearchKey.getLength() && sSearchKey[nSuchIdx] ==
765 searchStr[nCmpIdx + nSuchIdx - sSearchKey.getLength()] )
766 nSuchIdx++;
767 if( nSuchIdx >= sSearchKey.getLength() )
769 if( SearchFlags::NORM_WORD_ONLY & aSrchPara.searchFlag )
771 sal_Int32 nFndStt = nCmpIdx - sSearchKey.getLength();
772 bool bAtStart = !nFndStt;
773 bool bAtEnd = nCmpIdx == startPos;
774 bool bDelimBehind = bAtEnd || IsDelimiter( searchStr, nCmpIdx );
775 bool bDelimBefore = bAtStart || // begin of paragraph
776 IsDelimiter( searchStr, nFndStt-1 );
777 // * 1 -> only one word in the paragraph
778 // * 2 -> at begin of paragraph
779 // * 3 -> at end of paragraph
780 // * 4 -> inside the paragraph
781 if( ( bAtStart && bAtEnd ) || // 1
782 ( bAtStart && bDelimBehind ) || // 2
783 ( bAtEnd && bDelimBefore ) || // 3
784 ( bDelimBefore && bDelimBehind )) // 4
786 aRet.subRegExpressions = 1;
787 aRet.startOffset.realloc( 1 );
788 aRet.startOffset[ 0 ] = nCmpIdx;
789 aRet.endOffset.realloc( 1 );
790 aRet.endOffset[ 0 ] = nCmpIdx - sSearchKey.getLength();
791 return aRet;
794 else
796 aRet.subRegExpressions = 1;
797 aRet.startOffset.realloc( 1 );
798 aRet.startOffset[ 0 ] = nCmpIdx;
799 aRet.endOffset.realloc( 1 );
800 aRet.endOffset[ 0 ] = nCmpIdx - sSearchKey.getLength();
801 return aRet;
805 nSuchIdx = GetDiff( searchStr[nCmpIdx - sSearchKey.getLength()] );
806 if( nCmpIdx < nSuchIdx )
807 return aRet;
808 nCmpIdx -= nSuchIdx;
810 return aRet;
813 void TextSearch::RESrchPrepare( const css::util::SearchOptions2& rOptions)
815 TransliterationFlags transliterateFlags = static_cast<TransliterationFlags>(rOptions.transliterateFlags);
816 // select the transliterated pattern string
817 const OUString& rPatternStr =
818 (isSimpleTrans(transliterateFlags) ? sSrchStr
819 : (isComplexTrans(transliterateFlags) ? sSrchStr2 : rOptions.searchString));
821 sal_uInt32 nIcuSearchFlags = UREGEX_UWORD; // request UAX#29 unicode capability
822 // map css::util::SearchFlags to ICU uregex.h flags
823 // TODO: REG_EXTENDED, REG_NOT_BEGINOFLINE, REG_NOT_ENDOFLINE
824 // REG_NEWLINE is neither properly defined nor used anywhere => not implemented
825 // REG_NOSUB is not used anywhere => not implemented
826 // NORM_WORD_ONLY is only used for SearchAlgorithm==Absolute
827 // LEV_RELAXED is only used for SearchAlgorithm==Approximate
828 // Note that the search flag ALL_IGNORE_CASE is deprecated in UNO
829 // probably because the transliteration flag IGNORE_CASE handles it as well.
830 if( (rOptions.searchFlag & css::util::SearchFlags::ALL_IGNORE_CASE) != 0
831 || (transliterateFlags & TransliterationFlags::IGNORE_CASE))
832 nIcuSearchFlags |= UREGEX_CASE_INSENSITIVE;
833 UErrorCode nIcuErr = U_ZERO_ERROR;
834 // assumption: transliteration didn't mangle regexp control chars
835 IcuUniString aIcuSearchPatStr( reinterpret_cast<const UChar*>(rPatternStr.getStr()), rPatternStr.getLength());
836 #ifndef DISABLE_WORDBOUND_EMULATION
837 // for convenience specific syntax elements of the old regex engine are emulated
838 // - by replacing \< with "word-break followed by a look-ahead word-char"
839 static const IcuUniString aChevronPatternB( "\\\\<", -1, IcuUniString::kInvariant);
840 static const IcuUniString aChevronReplaceB( "\\\\b(?=\\\\w)", -1, IcuUniString::kInvariant);
841 static icu::RegexMatcher aChevronMatcherB( aChevronPatternB, 0, nIcuErr);
842 aChevronMatcherB.reset( aIcuSearchPatStr);
843 aIcuSearchPatStr = aChevronMatcherB.replaceAll( aChevronReplaceB, nIcuErr);
844 aChevronMatcherB.reset();
845 // - by replacing \> with "look-behind word-char followed by a word-break"
846 static const IcuUniString aChevronPatternE( "\\\\>", -1, IcuUniString::kInvariant);
847 static const IcuUniString aChevronReplaceE( "(?<=\\\\w)\\\\b", -1, IcuUniString::kInvariant);
848 static icu::RegexMatcher aChevronMatcherE( aChevronPatternE, 0, nIcuErr);
849 aChevronMatcherE.reset( aIcuSearchPatStr);
850 aIcuSearchPatStr = aChevronMatcherE.replaceAll( aChevronReplaceE, nIcuErr);
851 aChevronMatcherE.reset();
852 #endif
853 pRegexMatcher.reset( new icu::RegexMatcher( aIcuSearchPatStr, nIcuSearchFlags, nIcuErr) );
854 if (nIcuErr)
856 SAL_INFO( "i18npool", "TextSearch::RESrchPrepare UErrorCode " << nIcuErr);
857 pRegexMatcher.reset();
859 else
861 // Pathological patterns may result in exponential run time making the
862 // application appear to be frozen. Limit that. Documentation for this
863 // call says
864 // https://ssl.icu-project.org/apiref/icu4c/classicu_1_1RegexMatcher.html#a6ebcfcab4fe6a38678c0291643a03a00
865 // "The units of the limit are steps of the match engine.
866 // Correspondence with actual processor time will depend on the speed
867 // of the processor and the details of the specific pattern, but will
868 // typically be on the order of milliseconds."
869 // Just what is a good value? 42 is always an answer ... the 23 enigma
870 // as well... which on the dev's machine is roughly 50 seconds with the
871 // pattern of fdo#70627.
872 /* TODO: make this a configuration settable value and possibly take
873 * complexity of expression into account and maybe even length of text
874 * to be matched; currently (2013-11-25) that is at most one 64k
875 * paragraph per RESrchFrwrd()/RESrchBkwrd() call. */
876 pRegexMatcher->setTimeLimit( 23*1000, nIcuErr);
881 static bool lcl_findRegex( std::unique_ptr<icu::RegexMatcher> const & pRegexMatcher, sal_Int32 nStartPos, UErrorCode & rIcuErr )
883 if (!pRegexMatcher->find( nStartPos, rIcuErr))
885 /* TODO: future versions could pass the UErrorCode or translations
886 * thereof to the caller, for example to inform the user of
887 * U_REGEX_TIME_OUT. The strange thing though is that an error is set
888 * only after the second call that returns immediately and not if
889 * timeout occurred on the first call?!? */
890 SAL_INFO( "i18npool", "lcl_findRegex UErrorCode " << rIcuErr);
891 return false;
893 return true;
896 SearchResult TextSearch::RESrchFrwrd( const OUString& searchStr,
897 sal_Int32 startPos, sal_Int32 endPos )
899 SearchResult aRet;
900 aRet.subRegExpressions = 0;
901 if( !pRegexMatcher)
902 return aRet;
904 if( endPos > searchStr.getLength())
905 endPos = searchStr.getLength();
907 // use the ICU RegexMatcher to find the matches
908 UErrorCode nIcuErr = U_ZERO_ERROR;
909 const IcuUniString aSearchTargetStr( reinterpret_cast<const UChar*>(searchStr.getStr()), endPos);
910 pRegexMatcher->reset( aSearchTargetStr);
911 // search until there is a valid match
912 for(;;)
914 if (!lcl_findRegex( pRegexMatcher, startPos, nIcuErr))
915 return aRet;
917 // #i118887# ignore zero-length matches e.g. "a*" in "bc"
918 int nStartOfs = pRegexMatcher->start( nIcuErr);
919 int nEndOfs = pRegexMatcher->end( nIcuErr);
920 if( nStartOfs < nEndOfs)
921 break;
922 // If the zero-length match is behind the string, do not match it again
923 // and again until startPos reaches there. A match behind the string is
924 // a "$" anchor.
925 if (nStartOfs == endPos)
926 break;
927 // try at next position if there was a zero-length match
928 if( ++startPos >= endPos)
929 return aRet;
932 // extract the result of the search
933 const int nGroupCount = pRegexMatcher->groupCount();
934 aRet.subRegExpressions = nGroupCount + 1;
935 aRet.startOffset.realloc( aRet.subRegExpressions);
936 aRet.endOffset.realloc( aRet.subRegExpressions);
937 aRet.startOffset[0] = pRegexMatcher->start( nIcuErr);
938 aRet.endOffset[0] = pRegexMatcher->end( nIcuErr);
939 for( int i = 1; i <= nGroupCount; ++i) {
940 aRet.startOffset[i] = pRegexMatcher->start( i, nIcuErr);
941 aRet.endOffset[i] = pRegexMatcher->end( i, nIcuErr);
944 return aRet;
947 SearchResult TextSearch::RESrchBkwrd( const OUString& searchStr,
948 sal_Int32 startPos, sal_Int32 endPos )
950 // NOTE: for backwards search callers provide startPos/endPos inverted!
951 SearchResult aRet;
952 aRet.subRegExpressions = 0;
953 if( !pRegexMatcher)
954 return aRet;
956 if( startPos > searchStr.getLength())
957 startPos = searchStr.getLength();
959 // use the ICU RegexMatcher to find the matches
960 // TODO: use ICU's backward searching once it becomes available
961 // as its replacement using forward search is not as good as the real thing
962 UErrorCode nIcuErr = U_ZERO_ERROR;
963 const IcuUniString aSearchTargetStr( reinterpret_cast<const UChar*>(searchStr.getStr()), startPos);
964 pRegexMatcher->reset( aSearchTargetStr);
965 if (!lcl_findRegex( pRegexMatcher, endPos, nIcuErr))
966 return aRet;
968 // find the last match
969 int nLastPos = 0;
970 int nFoundEnd = 0;
971 int nGoodPos = 0, nGoodEnd = 0;
972 bool bFirst = true;
973 do {
974 nLastPos = pRegexMatcher->start( nIcuErr);
975 nFoundEnd = pRegexMatcher->end( nIcuErr);
976 if (nLastPos < nFoundEnd)
978 // remember last non-zero-length match
979 nGoodPos = nLastPos;
980 nGoodEnd = nFoundEnd;
982 if( nFoundEnd >= startPos)
983 break;
984 bFirst = false;
985 if( nFoundEnd == nLastPos)
986 ++nFoundEnd;
987 } while( lcl_findRegex( pRegexMatcher, nFoundEnd, nIcuErr));
989 // Ignore all zero-length matches except "$" anchor on first match.
990 if (nGoodPos == nGoodEnd)
992 if (bFirst && nLastPos == startPos)
993 nGoodPos = nLastPos;
994 else
995 return aRet;
998 // find last match again to get its details
999 lcl_findRegex( pRegexMatcher, nGoodPos, nIcuErr);
1001 // fill in the details of the last match
1002 const int nGroupCount = pRegexMatcher->groupCount();
1003 aRet.subRegExpressions = nGroupCount + 1;
1004 aRet.startOffset.realloc( aRet.subRegExpressions);
1005 aRet.endOffset.realloc( aRet.subRegExpressions);
1006 // NOTE: existing users of backward search seem to expect startOfs/endOfs being inverted!
1007 aRet.startOffset[0] = pRegexMatcher->end( nIcuErr);
1008 aRet.endOffset[0] = pRegexMatcher->start( nIcuErr);
1009 for( int i = 1; i <= nGroupCount; ++i) {
1010 aRet.startOffset[i] = pRegexMatcher->end( i, nIcuErr);
1011 aRet.endOffset[i] = pRegexMatcher->start( i, nIcuErr);
1014 return aRet;
1018 // search for words phonetically
1019 SearchResult TextSearch::ApproxSrchFrwrd( const OUString& searchStr,
1020 sal_Int32 startPos, sal_Int32 endPos )
1022 SearchResult aRet;
1023 aRet.subRegExpressions = 0;
1025 if( !xBreak.is() )
1026 return aRet;
1028 sal_Int32 nStt, nEnd;
1030 Boundary aWBnd = xBreak->getWordBoundary( searchStr, startPos,
1031 aSrchPara.Locale,
1032 WordType::ANYWORD_IGNOREWHITESPACES, true );
1036 if( aWBnd.startPos >= endPos )
1037 break;
1038 nStt = aWBnd.startPos < startPos ? startPos : aWBnd.startPos;
1039 nEnd = std::min(aWBnd.endPos, endPos);
1041 if( nStt < nEnd &&
1042 pWLD->WLD( searchStr.getStr() + nStt, nEnd - nStt ) <= nLimit )
1044 aRet.subRegExpressions = 1;
1045 aRet.startOffset.realloc( 1 );
1046 aRet.startOffset[ 0 ] = nStt;
1047 aRet.endOffset.realloc( 1 );
1048 aRet.endOffset[ 0 ] = nEnd;
1049 break;
1052 nStt = nEnd - 1;
1053 aWBnd = xBreak->nextWord( searchStr, nStt, aSrchPara.Locale,
1054 WordType::ANYWORD_IGNOREWHITESPACES);
1055 } while( aWBnd.startPos != aWBnd.endPos ||
1056 (aWBnd.endPos != searchStr.getLength() && aWBnd.endPos != nEnd) );
1057 // #i50244# aWBnd.endPos != nEnd : in case there is _no_ word (only
1058 // whitespace) in searchStr, getWordBoundary() returned startPos,startPos
1059 // and nextWord() does also => don't loop forever.
1060 return aRet;
1063 SearchResult TextSearch::ApproxSrchBkwrd( const OUString& searchStr,
1064 sal_Int32 startPos, sal_Int32 endPos )
1066 SearchResult aRet;
1067 aRet.subRegExpressions = 0;
1069 if( !xBreak.is() )
1070 return aRet;
1072 sal_Int32 nStt, nEnd;
1074 Boundary aWBnd = xBreak->getWordBoundary( searchStr, startPos,
1075 aSrchPara.Locale,
1076 WordType::ANYWORD_IGNOREWHITESPACES, true );
1080 if( aWBnd.endPos <= endPos )
1081 break;
1082 nStt = aWBnd.startPos < endPos ? endPos : aWBnd.startPos;
1083 nEnd = std::min(aWBnd.endPos, startPos);
1085 if( nStt < nEnd &&
1086 pWLD->WLD( searchStr.getStr() + nStt, nEnd - nStt ) <= nLimit )
1088 aRet.subRegExpressions = 1;
1089 aRet.startOffset.realloc( 1 );
1090 aRet.startOffset[ 0 ] = nEnd;
1091 aRet.endOffset.realloc( 1 );
1092 aRet.endOffset[ 0 ] = nStt;
1093 break;
1095 if( !nStt )
1096 break;
1098 aWBnd = xBreak->previousWord( searchStr, nStt, aSrchPara.Locale,
1099 WordType::ANYWORD_IGNOREWHITESPACES);
1100 } while( aWBnd.startPos != aWBnd.endPos || aWBnd.endPos != searchStr.getLength() );
1101 return aRet;
1105 namespace {
1106 void setWildcardMatch( css::util::SearchResult& rRes, sal_Int32 nStartOffset, sal_Int32 nEndOffset )
1108 rRes.subRegExpressions = 1;
1109 rRes.startOffset.realloc(1);
1110 rRes.endOffset.realloc(1);
1111 rRes.startOffset[0] = nStartOffset;
1112 rRes.endOffset[0] = nEndOffset;
1116 SearchResult TextSearch::WildcardSrchFrwrd( const OUString& searchStr, sal_Int32 nStartPos, sal_Int32 nEndPos )
1118 SearchResult aRes;
1119 aRes.subRegExpressions = 0; // no match
1120 sal_Int32 nStartOffset = nStartPos;
1121 sal_Int32 nEndOffset = nEndPos;
1123 const sal_Int32 nStringLen = searchStr.getLength();
1125 // Forward nStartPos inclusive, nEndPos exclusive, but allow for empty
1126 // string match with [0,0).
1127 if (nStartPos < 0 || nEndPos > nStringLen || nEndPos < nStartPos ||
1128 (nStartPos == nStringLen && (nStringLen != 0 || nStartPos != nEndPos)))
1129 return aRes;
1131 const OUString& rPattern = (bUsePrimarySrchStr ? sSrchStr : sSrchStr2);
1132 const sal_Int32 nPatternLen = rPattern.getLength();
1134 // Handle special cases empty pattern and/or string outside of the loop to
1135 // not add performance penalties there and simplify.
1136 if (nStartPos == nEndPos)
1138 sal_Int32 i = 0;
1139 while (i < nPatternLen && rPattern[i] == '*')
1140 ++i;
1141 if (i == nPatternLen)
1142 setWildcardMatch( aRes, nStartOffset, nEndOffset);
1143 return aRes;
1146 // Empty pattern does not match any non-empty string.
1147 if (!nPatternLen)
1148 return aRes;
1150 bool bRewind = false;
1151 sal_uInt32 cPattern = 0;
1152 sal_Int32 nPattern = 0;
1153 sal_Int32 nAfterFakePattern = nPattern;
1154 if (mbWildcardAllowSubstring)
1156 // Fake a leading '*' wildcard.
1157 cPattern = '*';
1158 bRewind = true;
1159 // Assume a non-'*' pattern character follows. If it is a '*' instead
1160 // that will be handled in the loop by setting nPat.
1161 sal_uInt32 cu = rPattern.iterateCodePoints( &nAfterFakePattern);
1162 if (cu == mcWildcardEscapeChar && mcWildcardEscapeChar && nAfterFakePattern < nPatternLen)
1163 rPattern.iterateCodePoints( &nAfterFakePattern);
1166 sal_Int32 nString = nStartPos, nPat = -1, nStr = -1, nLastAsterisk = -1;
1167 sal_uInt32 cPatternAfterAsterisk = 0;
1168 bool bEscaped = false, bEscapedAfterAsterisk = false;
1170 // The loop code tries to avoid costly calls to iterateCodePoints() when
1171 // possible.
1175 if (bRewind)
1177 // Reuse cPattern after '*', nPattern was correspondingly
1178 // incremented to point behind cPattern.
1179 bRewind = false;
1181 else if (nPattern < nPatternLen)
1183 // nPattern will be incremented by iterateCodePoints().
1184 cPattern = rPattern.iterateCodePoints( &nPattern);
1185 if (cPattern == mcWildcardEscapeChar && mcWildcardEscapeChar && nPattern < nPatternLen)
1187 bEscaped = true;
1188 cPattern = rPattern.iterateCodePoints( &nPattern);
1191 else
1193 // A trailing '*' is handled below.
1194 if (mbWildcardAllowSubstring)
1196 // If the pattern is consumed and substring match allowed we're good.
1197 setWildcardMatch( aRes, nStartOffset, nString);
1198 return aRes;
1200 else if (nString < nEndPos && nLastAsterisk >= 0)
1202 // If substring match is not allowed try a greedy '*' match.
1203 nPattern = nLastAsterisk;
1204 continue; // do
1206 else
1207 return aRes;
1210 if (cPattern == '*' && !bEscaped)
1212 // '*' is one code unit, so not using iterateCodePoints() is ok.
1213 while (nPattern < nPatternLen && rPattern[nPattern] == '*')
1214 ++nPattern;
1216 if (nPattern >= nPatternLen)
1218 // Last pattern is '*', remaining string matches.
1219 setWildcardMatch( aRes, nStartOffset, nEndOffset);
1220 return aRes;
1223 nLastAsterisk = nPattern; // Remember last encountered '*'.
1225 // cPattern will be the next non-'*' character, nPattern
1226 // incremented.
1227 cPattern = rPattern.iterateCodePoints( &nPattern);
1228 if (cPattern == mcWildcardEscapeChar && mcWildcardEscapeChar && nPattern < nPatternLen)
1230 bEscaped = true;
1231 cPattern = rPattern.iterateCodePoints( &nPattern);
1234 cPatternAfterAsterisk = cPattern;
1235 bEscapedAfterAsterisk = bEscaped;
1236 nPat = nPattern; // Remember position of pattern behind '*', already incremented.
1237 nStr = nString; // Remember the current string to be matched.
1240 if (nString >= nEndPos)
1241 // Whatever follows in pattern, string will not match.
1242 return aRes;
1244 // nString will be incremented by iterateCodePoints().
1245 sal_uInt32 cString = searchStr.iterateCodePoints( &nString);
1247 if ((cPattern != '?' || bEscaped) && cPattern != cString)
1249 if (nPat == -1)
1250 // Non-match already without any '*' pattern.
1251 return aRes;
1253 bRewind = true;
1254 nPattern = nPat; // Rewind pattern to character behind '*', already incremented.
1255 cPattern = cPatternAfterAsterisk;
1256 bEscaped = bEscapedAfterAsterisk;
1257 searchStr.iterateCodePoints( &nStr);
1258 nString = nStr; // Restore incremented remembered string position.
1259 if (nPat == nAfterFakePattern)
1261 // Next start offset will be the next character.
1262 nStartOffset = nString;
1265 else
1267 // An unescaped '?' pattern matched any character, or characters
1268 // matched. Reset only escaped state.
1269 bEscaped = false;
1272 while (nString < nEndPos);
1274 if (bRewind)
1275 return aRes;
1277 // Eat trailing '*' pattern that matches anything, including nothing.
1278 // '*' is one code unit, so not using iterateCodePoints() is ok.
1279 while (nPattern < nPatternLen && rPattern[nPattern] == '*')
1280 ++nPattern;
1282 if (nPattern == nPatternLen)
1283 setWildcardMatch( aRes, nStartOffset, nEndOffset);
1284 return aRes;
1287 SearchResult TextSearch::WildcardSrchBkwrd( const OUString& searchStr, sal_Int32 nStartPos, sal_Int32 nEndPos )
1289 SearchResult aRes;
1290 aRes.subRegExpressions = 0; // no match
1292 sal_Int32 nStartOffset = nStartPos;
1293 sal_Int32 nEndOffset = nEndPos;
1295 const sal_Int32 nStringLen = searchStr.getLength();
1297 // Backward nStartPos exclusive, nEndPos inclusive, but allow for empty
1298 // string match with (0,0].
1299 if (nStartPos > nStringLen || nEndPos < 0 || nStartPos < nEndPos ||
1300 (nEndPos == nStringLen && (nStringLen != 0 || nStartPos != nEndPos)))
1301 return aRes;
1303 const OUString& rPattern = (bUsePrimarySrchStr ? sSrchStr : sSrchStr2);
1304 sal_Int32 nPatternLen = rPattern.getLength();
1306 // Handle special cases empty pattern and/or string outside of the loop to
1307 // not add performance penalties there and simplify.
1308 if (nStartPos == nEndPos)
1310 sal_Int32 i = 0;
1311 while (i < nPatternLen && rPattern[i] == '*')
1312 ++i;
1313 if (i == nPatternLen)
1314 setWildcardMatch( aRes, nStartOffset, nEndOffset);
1315 return aRes;
1318 // Empty pattern does not match any non-empty string.
1319 if (!nPatternLen)
1320 return aRes;
1322 // Reverse escaped patterns to ease the handling of escapes, keeping escape
1323 // and following character as one sequence in backward direction.
1324 if ((bUsePrimarySrchStr && maWildcardReversePattern.isEmpty()) ||
1325 (!bUsePrimarySrchStr && maWildcardReversePattern2.isEmpty()))
1327 OUStringBuffer aPatternBuf( rPattern);
1328 sal_Int32 nIndex = 0;
1329 while (nIndex < nPatternLen)
1331 const sal_Int32 nOld = nIndex;
1332 const sal_uInt32 cu = rPattern.iterateCodePoints( &nIndex);
1333 if (cu == mcWildcardEscapeChar)
1335 if (nIndex < nPatternLen)
1337 if (nIndex - nOld == 1)
1339 // Simply move code units, we already memorized the one
1340 // in 'cu'.
1341 const sal_Int32 nOld2 = nIndex;
1342 rPattern.iterateCodePoints( &nIndex);
1343 for (sal_Int32 i=0; i < nIndex - nOld2; ++i)
1344 aPatternBuf[nOld+i] = rPattern[nOld2+i];
1345 aPatternBuf[nIndex-1] = static_cast<sal_Unicode>(cu);
1347 else
1349 // Copy the escape character code units first in the
1350 // unlikely case that it would not be of BMP.
1351 assert(nIndex - nOld == 2); // it's UTF-16, so...
1352 sal_Unicode buf[2];
1353 buf[0] = rPattern[nOld];
1354 buf[1] = rPattern[nOld+1];
1355 const sal_Int32 nOld2 = nIndex;
1356 rPattern.iterateCodePoints( &nIndex);
1357 for (sal_Int32 i=0; i < nIndex - nOld2; ++i)
1358 aPatternBuf[nOld+i] = rPattern[nOld2+i];
1359 aPatternBuf[nIndex-2] = buf[0];
1360 aPatternBuf[nIndex-1] = buf[1];
1363 else
1365 // Trailing escape would become leading escape, do what?
1366 // Eliminate.
1367 aPatternBuf.remove( nOld, nIndex - nOld);
1371 if (bUsePrimarySrchStr)
1372 maWildcardReversePattern = aPatternBuf.makeStringAndClear();
1373 else
1374 maWildcardReversePattern2 = aPatternBuf.makeStringAndClear();
1376 const OUString& rReversePattern = (bUsePrimarySrchStr ? maWildcardReversePattern : maWildcardReversePattern2);
1377 nPatternLen = rReversePattern.getLength();
1379 bool bRewind = false;
1380 sal_uInt32 cPattern = 0;
1381 sal_Int32 nPattern = nPatternLen;
1382 sal_Int32 nAfterFakePattern = nPattern;
1383 if (mbWildcardAllowSubstring)
1385 // Fake a trailing '*' wildcard.
1386 cPattern = '*';
1387 bRewind = true;
1388 // Assume a non-'*' pattern character follows. If it is a '*' instead
1389 // that will be handled in the loop by setting nPat.
1390 sal_uInt32 cu = rReversePattern.iterateCodePoints( &nAfterFakePattern, -1);
1391 if (cu == mcWildcardEscapeChar && mcWildcardEscapeChar && nAfterFakePattern > 0)
1392 rReversePattern.iterateCodePoints( &nAfterFakePattern, -1);
1395 sal_Int32 nString = nStartPos, nPat = -1, nStr = -1, nLastAsterisk = -1;
1396 sal_uInt32 cPatternAfterAsterisk = 0;
1397 bool bEscaped = false, bEscapedAfterAsterisk = false;
1399 // The loop code tries to avoid costly calls to iterateCodePoints() when
1400 // possible.
1404 if (bRewind)
1406 // Reuse cPattern after '*', nPattern was correspondingly
1407 // decremented to point before cPattern.
1408 bRewind = false;
1410 else if (nPattern > 0)
1412 // nPattern will be decremented by iterateCodePoints().
1413 cPattern = rReversePattern.iterateCodePoints( &nPattern, -1);
1414 if (cPattern == mcWildcardEscapeChar && mcWildcardEscapeChar && nPattern > 0)
1416 bEscaped = true;
1417 cPattern = rReversePattern.iterateCodePoints( &nPattern, -1);
1420 else
1422 // A trailing '*' is handled below.
1423 if (mbWildcardAllowSubstring)
1425 // If the pattern is consumed and substring match allowed we're good.
1426 setWildcardMatch( aRes, nStartOffset, nString);
1427 return aRes;
1429 else if (nString > nEndPos && nLastAsterisk >= 0)
1431 // If substring match is not allowed try a greedy '*' match.
1432 nPattern = nLastAsterisk;
1433 continue; // do
1435 else
1436 return aRes;
1439 if (cPattern == '*' && !bEscaped)
1441 // '*' is one code unit, so not using iterateCodePoints() is ok.
1442 while (nPattern > 0 && rReversePattern[nPattern-1] == '*')
1443 --nPattern;
1445 if (nPattern <= 0)
1447 // First pattern is '*', remaining string matches.
1448 setWildcardMatch( aRes, nStartOffset, nEndOffset);
1449 return aRes;
1452 nLastAsterisk = nPattern; // Remember last encountered '*'.
1454 // cPattern will be the previous non-'*' character, nPattern
1455 // decremented.
1456 cPattern = rReversePattern.iterateCodePoints( &nPattern, -1);
1457 if (cPattern == mcWildcardEscapeChar && mcWildcardEscapeChar && nPattern > 0)
1459 bEscaped = true;
1460 cPattern = rReversePattern.iterateCodePoints( &nPattern, -1);
1463 cPatternAfterAsterisk = cPattern;
1464 bEscapedAfterAsterisk = bEscaped;
1465 nPat = nPattern; // Remember position of pattern before '*', already decremented.
1466 nStr = nString; // Remember the current string to be matched.
1469 if (nString <= nEndPos)
1470 // Whatever leads in pattern, string will not match.
1471 return aRes;
1473 // nString will be decremented by iterateCodePoints().
1474 sal_uInt32 cString = searchStr.iterateCodePoints( &nString, -1);
1476 if ((cPattern != '?' || bEscaped) && cPattern != cString)
1478 if (nPat == -1)
1479 // Non-match already without any '*' pattern.
1480 return aRes;
1482 bRewind = true;
1483 nPattern = nPat; // Rewind pattern to character before '*', already decremented.
1484 cPattern = cPatternAfterAsterisk;
1485 bEscaped = bEscapedAfterAsterisk;
1486 searchStr.iterateCodePoints( &nStr, -1);
1487 nString = nStr; // Restore decremented remembered string position.
1488 if (nPat == nAfterFakePattern)
1490 // Next start offset will be this character (exclusive).
1491 nStartOffset = nString;
1494 else
1496 // An unescaped '?' pattern matched any character, or characters
1497 // matched. Reset only escaped state.
1498 bEscaped = false;
1501 while (nString > nEndPos);
1503 if (bRewind)
1504 return aRes;
1506 // Eat leading '*' pattern that matches anything, including nothing.
1507 // '*' is one code unit, so not using iterateCodePoints() is ok.
1508 while (nPattern > 0 && rReversePattern[nPattern-1] == '*')
1509 --nPattern;
1511 if (nPattern == 0)
1512 setWildcardMatch( aRes, nStartOffset, nEndOffset);
1513 return aRes;
1517 static const sal_Char cSearchImpl[] = "com.sun.star.util.TextSearch_i18n";
1519 static uno::Sequence< OUString > getServiceName_Static()
1521 uno::Sequence< OUString > aRet(2);
1522 aRet[0] = "com.sun.star.util.TextSearch";
1523 aRet[1] = "com.sun.star.util.TextSearch2";
1524 return aRet;
1527 static OUString getImplementationName_Static()
1529 return cSearchImpl;
1532 OUString SAL_CALL
1533 TextSearch::getImplementationName()
1535 return getImplementationName_Static();
1538 sal_Bool SAL_CALL TextSearch::supportsService(const OUString& rServiceName)
1540 return cppu::supportsService(this, rServiceName);
1543 Sequence< OUString > SAL_CALL
1544 TextSearch::getSupportedServiceNames()
1546 Sequence< OUString > aRet { getServiceName_Static() };
1547 return aRet;
1550 static css::uno::Reference< css::uno::XInterface >
1551 TextSearch_CreateInstance(
1552 const css::uno::Reference<
1553 css::lang::XMultiServiceFactory >& rxMSF )
1555 return css::uno::Reference<
1556 css::uno::XInterface >(
1557 static_cast<cppu::OWeakObject*>(new TextSearch(
1558 comphelper::getComponentContext( rxMSF ) )) );
1561 extern "C"
1563 SAL_DLLPUBLIC_EXPORT void*
1564 i18nsearch_component_getFactory( const sal_Char* sImplementationName,
1565 void* _pServiceManager,
1566 SAL_UNUSED_PARAMETER void* )
1568 void* pRet = nullptr;
1570 css::lang::XMultiServiceFactory* pServiceManager =
1571 static_cast< css::lang::XMultiServiceFactory* >
1572 ( _pServiceManager );
1573 css::uno::Reference<
1574 css::lang::XSingleServiceFactory > xFactory;
1576 if ( 0 == rtl_str_compare( sImplementationName, cSearchImpl) )
1578 css::uno::Sequence< OUString > aServiceNames { getServiceName_Static() };
1579 xFactory = ::cppu::createSingleFactory(
1580 pServiceManager, getImplementationName_Static(),
1581 &TextSearch_CreateInstance, aServiceNames );
1584 if ( xFactory.is() )
1586 xFactory->acquire();
1587 pRet = xFactory.get();
1590 return pRet;
1593 } // extern "C"
1595 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */