1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
3 * This file is part of the LibreOffice project.
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 * This file incorporates work covered by the following license notice:
11 * Licensed to the Apache Software Foundation (ASF) under one or more
12 * contributor license agreements. See the NOTICE file distributed
13 * with this work for additional information regarding copyright
14 * ownership. The ASF licenses this file to you under the Apache
15 * License, Version 2.0 (the "License"); you may not use this file
16 * except in compliance with the License. You may obtain a copy of
17 * the License at http://www.apache.org/licenses/LICENSE-2.0 .
20 #include "textsearch.hxx"
22 #include <com/sun/star/lang/XSingleServiceFactory.hpp>
23 #include <comphelper/processfactory.hxx>
24 #include <com/sun/star/i18n/BreakIterator.hpp>
25 #include <com/sun/star/util/SearchAlgorithms2.hpp>
26 #include <com/sun/star/util/SearchFlags.hpp>
27 #include <com/sun/star/i18n/WordType.hpp>
28 #include <com/sun/star/i18n/ScriptType.hpp>
29 #include <com/sun/star/i18n/CharacterIteratorMode.hpp>
30 #include <com/sun/star/i18n/CharacterClassification.hpp>
31 #include <com/sun/star/i18n/KCharacterType.hpp>
32 #include <com/sun/star/i18n/Transliteration.hpp>
33 #include <cppuhelper/factory.hxx>
34 #include <cppuhelper/supportsservice.hxx>
35 #include <cppuhelper/weak.hxx>
36 #include <i18nutil/transliteration.hxx>
37 #include <rtl/ustrbuf.hxx>
38 #include <sal/log.hxx>
40 #include <unicode/regex.h>
42 using namespace ::com::sun::star::util
;
43 using namespace ::com::sun::star::uno
;
44 using namespace ::com::sun::star::lang
;
45 using namespace ::com::sun::star::i18n
;
46 using namespace ::com::sun::star
;
48 const TransliterationFlags COMPLEX_TRANS_MASK
=
49 TransliterationFlags::ignoreBaFa_ja_JP
|
50 TransliterationFlags::ignoreIterationMark_ja_JP
|
51 TransliterationFlags::ignoreTiJi_ja_JP
|
52 TransliterationFlags::ignoreHyuByu_ja_JP
|
53 TransliterationFlags::ignoreSeZe_ja_JP
|
54 TransliterationFlags::ignoreIandEfollowedByYa_ja_JP
|
55 TransliterationFlags::ignoreKiKuFollowedBySa_ja_JP
|
56 TransliterationFlags::ignoreProlongedSoundMark_ja_JP
;
60 TransliterationFlags
maskComplexTrans( TransliterationFlags n
)
62 // IGNORE_KANA and FULLWIDTH_HALFWIDTH are simple but need to take effect
63 // in complex transliteration.
65 n
& (COMPLEX_TRANS_MASK
| // all set ignore bits
66 TransliterationFlags::IGNORE_KANA
| // plus IGNORE_KANA bit
67 TransliterationFlags::FULLWIDTH_HALFWIDTH
); // and the FULLWIDTH_HALFWIDTH value
70 bool isComplexTrans( TransliterationFlags n
)
72 return bool(n
& COMPLEX_TRANS_MASK
);
75 TransliterationFlags
maskSimpleTrans( TransliterationFlags n
)
77 return n
& ~COMPLEX_TRANS_MASK
;
80 bool isSimpleTrans( TransliterationFlags n
)
82 return bool(maskSimpleTrans(n
));
85 // Regex patterns are case sensitive.
86 TransliterationFlags
maskSimpleRegexTrans( TransliterationFlags n
)
88 TransliterationFlags m
= (n
& TransliterationFlags::IGNORE_MASK
) & ~TransliterationFlags::IGNORE_CASE
;
89 TransliterationFlags v
= n
& TransliterationFlags::NON_IGNORE_MASK
;
90 if (v
== TransliterationFlags::UPPERCASE_LOWERCASE
|| v
== TransliterationFlags::LOWERCASE_UPPERCASE
)
91 v
= TransliterationFlags::NONE
;
92 return (m
| v
) & ~COMPLEX_TRANS_MASK
;
95 bool isSimpleRegexTrans( TransliterationFlags n
)
97 return bool(maskSimpleRegexTrans(n
));
101 TextSearch::TextSearch(const Reference
< XComponentContext
> & rxContext
)
102 : m_xContext( rxContext
)
105 aOpt
.AlgorithmType2
= SearchAlgorithms2::ABSOLUTE
;
106 aOpt
.algorithmType
= SearchAlgorithms_ABSOLUTE
;
107 aOpt
.searchFlag
= SearchFlags::ALL_IGNORE_CASE
;
112 TextSearch::~TextSearch()
114 pRegexMatcher
.reset();
120 void TextSearch::setOptions2( const SearchOptions2
& rOptions
)
122 osl::MutexGuard
g(m_aMutex
);
124 aSrchPara
= rOptions
;
126 pRegexMatcher
.reset();
130 maWildcardReversePattern
.clear();
131 maWildcardReversePattern2
.clear();
132 TransliterationFlags transliterateFlags
= static_cast<TransliterationFlags
>(aSrchPara
.transliterateFlags
);
134 // Create Transliteration class
135 if( isSimpleTrans( transliterateFlags
) )
137 if( !xTranslit
.is() )
138 xTranslit
.set( Transliteration::create( m_xContext
) );
139 xTranslit
->loadModule(
140 static_cast<TransliterationModules
>(maskSimpleTrans(transliterateFlags
)),
143 else if( xTranslit
.is() )
146 // Create Transliteration for 2<->1, 2<->2 transliteration
147 if ( isComplexTrans( transliterateFlags
) )
149 if( !xTranslit2
.is() )
150 xTranslit2
.set( Transliteration::create( m_xContext
) );
151 // Load transliteration module
152 xTranslit2
->loadModule(
153 static_cast<TransliterationModules
>(maskComplexTrans(transliterateFlags
)),
158 xBreak
= css::i18n::BreakIterator::create( m_xContext
);
160 sSrchStr
= aSrchPara
.searchString
;
162 // Transliterate search string.
163 if (aSrchPara
.AlgorithmType2
== SearchAlgorithms2::REGEXP
)
165 if (isSimpleRegexTrans(transliterateFlags
))
167 if (maskSimpleRegexTrans(transliterateFlags
) !=
168 maskSimpleTrans(transliterateFlags
))
170 css::uno::Reference
< XExtendedTransliteration
> xTranslitPattern(
171 Transliteration::create( m_xContext
));
172 if (xTranslitPattern
.is())
174 xTranslitPattern
->loadModule(
175 static_cast<TransliterationModules
>(maskSimpleRegexTrans(transliterateFlags
)),
177 sSrchStr
= xTranslitPattern
->transliterateString2String(
178 aSrchPara
.searchString
, 0, aSrchPara
.searchString
.getLength());
184 sSrchStr
= xTranslit
->transliterateString2String(
185 aSrchPara
.searchString
, 0, aSrchPara
.searchString
.getLength());
187 // xTranslit2 complex transliterated sSrchStr2 is not used in
188 // regex, see TextSearch::searchForward() and
189 // TextSearch::searchBackward()
194 if ( xTranslit
.is() && isSimpleTrans(transliterateFlags
) )
195 sSrchStr
= xTranslit
->transliterateString2String(
196 aSrchPara
.searchString
, 0, aSrchPara
.searchString
.getLength());
198 if ( xTranslit2
.is() && isComplexTrans(transliterateFlags
) )
199 sSrchStr2
= xTranslit2
->transliterateString2String(
200 aSrchPara
.searchString
, 0, aSrchPara
.searchString
.getLength());
203 // When start or end of search string is a complex script type, we need to
204 // make sure the result boundary is not located in the middle of cell.
205 checkCTLStart
= (xBreak
.is() && (xBreak
->getScriptType(sSrchStr
, 0) ==
206 ScriptType::COMPLEX
));
207 checkCTLEnd
= (xBreak
.is() && (xBreak
->getScriptType(sSrchStr
,
208 sSrchStr
.getLength()-1) == ScriptType::COMPLEX
));
210 // Take the new SearchOptions2::AlgorithmType2 field and ignore
211 // SearchOptions::algorithmType
212 switch( aSrchPara
.AlgorithmType2
)
214 case SearchAlgorithms2::REGEXP
:
215 fnForward
= &TextSearch::RESrchFrwrd
;
216 fnBackward
= &TextSearch::RESrchBkwrd
;
217 RESrchPrepare( aSrchPara
);
220 case SearchAlgorithms2::APPROXIMATE
:
221 fnForward
= &TextSearch::ApproxSrchFrwrd
;
222 fnBackward
= &TextSearch::ApproxSrchBkwrd
;
224 pWLD
.reset( new WLevDistance( sSrchStr
.getStr(), aSrchPara
.changedChars
,
225 aSrchPara
.insertedChars
, aSrchPara
.deletedChars
,
226 0 != (SearchFlags::LEV_RELAXED
& aSrchPara
.searchFlag
) ) );
228 nLimit
= pWLD
->GetLimit();
231 case SearchAlgorithms2::WILDCARD
:
232 mcWildcardEscapeChar
= static_cast<sal_uInt32
>(aSrchPara
.WildcardEscapeCharacter
);
233 mbWildcardAllowSubstring
= ((aSrchPara
.searchFlag
& SearchFlags::WILD_MATCH_SELECTION
) == 0);
234 fnForward
= &TextSearch::WildcardSrchFrwrd
;
235 fnBackward
= &TextSearch::WildcardSrchBkwrd
;
239 SAL_WARN("i18npool","TextSearch::setOptions2 - default what?");
241 case SearchAlgorithms2::ABSOLUTE
:
242 fnForward
= &TextSearch::NSrchFrwrd
;
243 fnBackward
= &TextSearch::NSrchBkwrd
;
248 void TextSearch::setOptions( const SearchOptions
& rOptions
)
250 osl::MutexGuard
g(m_aMutex
);
252 sal_Int16 nAlgorithmType2
;
253 switch (rOptions
.algorithmType
)
255 case SearchAlgorithms_REGEXP
:
256 nAlgorithmType2
= SearchAlgorithms2::REGEXP
;
258 case SearchAlgorithms_APPROXIMATE
:
259 nAlgorithmType2
= SearchAlgorithms2::APPROXIMATE
;
262 SAL_WARN("i18npool","TextSearch::setOptions - default what?");
264 case SearchAlgorithms_ABSOLUTE
:
265 nAlgorithmType2
= SearchAlgorithms2::ABSOLUTE
;
268 // It would be nice if an inherited struct had a ctor that takes an
269 // instance of the object the struct derived from...
270 SearchOptions2
aOptions2(
271 rOptions
.algorithmType
,
273 rOptions
.searchString
,
274 rOptions
.replaceString
,
276 rOptions
.changedChars
,
277 rOptions
.deletedChars
,
278 rOptions
.insertedChars
,
279 rOptions
.transliterateFlags
,
281 0 // no wildcard search, no escape character...
283 setOptions2( aOptions2
);
286 static sal_Int32
FindPosInSeq_Impl( const Sequence
<sal_Int32
>& rOff
, sal_Int32 nPos
)
288 auto pOff
= std::find_if(rOff
.begin(), rOff
.end(),
289 [nPos
](const sal_Int32 nOff
) { return nOff
>= nPos
; });
290 return static_cast<sal_Int32
>(std::distance(rOff
.begin(), pOff
));
293 bool TextSearch::isCellStart(const OUString
& searchStr
, sal_Int32 nPos
)
296 return nPos
== xBreak
->previousCharacters(searchStr
, nPos
+1,
297 aSrchPara
.Locale
, CharacterIteratorMode::SKIPCELL
, 1, nDone
);
300 SearchResult
TextSearch::searchForward( const OUString
& searchStr
, sal_Int32 startPos
, sal_Int32 endPos
)
302 osl::MutexGuard
g(m_aMutex
);
306 OUString
in_str(searchStr
);
308 bUsePrimarySrchStr
= true;
310 if ( xTranslit
.is() )
312 // apply normal transliteration (1<->1, 1<->0)
313 css::uno::Sequence
<sal_Int32
> offset(endPos
- startPos
);
314 in_str
= xTranslit
->transliterate( searchStr
, startPos
, endPos
- startPos
, offset
);
316 // JP 20.6.2001: also the start and end positions must be corrected!
317 sal_Int32 newStartPos
=
318 (startPos
== 0) ? 0 : FindPosInSeq_Impl( offset
, startPos
);
320 sal_Int32 newEndPos
= (endPos
< searchStr
.getLength())
321 ? FindPosInSeq_Impl( offset
, endPos
)
322 : in_str
.getLength();
324 sal_Int32 nExtraOffset
= 0;
325 if (pRegexMatcher
&& startPos
> 0)
327 // avoid matching ^ here - in_str omits a prefix of the searchStr
328 // this is a really lame way to do it, but ICU only offers
329 // useAnchoringBounds() to disable *both* bounds but what is needed
330 // here is to disable only one bound and respect the other
331 in_str
= "X" + in_str
;
333 newStartPos
+= nExtraOffset
;
334 newEndPos
+= nExtraOffset
;
337 sres
= (this->*fnForward
)( in_str
, newStartPos
, newEndPos
);
339 // Map offsets back to untransliterated string.
340 const sal_Int32 nOffsets
= offset
.getLength();
343 // For regex nGroups is the number of groups+1 with group 0 being
345 const sal_Int32 nGroups
= sres
.startOffset
.getLength();
346 for ( sal_Int32 k
= 0; k
< nGroups
; k
++ )
348 const sal_Int32 nStart
= sres
.startOffset
[k
] - nExtraOffset
;
349 // Result offsets are negative (-1) if a group expression was
352 sres
.startOffset
[k
] = (nStart
< nOffsets
? offset
[nStart
] : (offset
[nOffsets
- 1] + 1));
353 // JP 20.6.2001: end is ever exclusive and then don't return
354 // the position of the next character - return the
355 // next position behind the last found character!
356 // "a b c" find "b" must return 2,3 and not 2,4!!!
357 const sal_Int32 nStop
= sres
.endOffset
[k
] - nExtraOffset
;
361 sres
.endOffset
[k
] = offset
[(nStop
<= nOffsets
? nStop
: nOffsets
) - 1] + 1;
363 sres
.endOffset
[k
] = offset
[0];
370 sres
= (this->*fnForward
)( in_str
, startPos
, endPos
);
373 if ( xTranslit2
.is() && aSrchPara
.AlgorithmType2
!= SearchAlgorithms2::REGEXP
)
378 css::uno::Sequence
<sal_Int32
> offset( in_str
.getLength());
380 in_str
= xTranslit2
->transliterate( searchStr
, 0, in_str
.getLength(), offset
);
383 startPos
= FindPosInSeq_Impl( offset
, startPos
);
385 if( endPos
< searchStr
.getLength() )
386 endPos
= FindPosInSeq_Impl( offset
, endPos
);
388 endPos
= in_str
.getLength();
390 bUsePrimarySrchStr
= false;
391 sres2
= (this->*fnForward
)( in_str
, startPos
, endPos
);
393 for ( int k
= 0; k
< sres2
.startOffset
.getLength(); k
++ )
395 if (sres2
.startOffset
[k
])
396 sres2
.startOffset
[k
] = offset
[sres2
.startOffset
[k
]-1] + 1;
397 if (sres2
.endOffset
[k
])
398 sres2
.endOffset
[k
] = offset
[sres2
.endOffset
[k
]-1] + 1;
401 // pick first and long one
402 if ( sres
.subRegExpressions
== 0)
404 if ( sres2
.subRegExpressions
== 1)
406 if ( sres
.startOffset
[0] > sres2
.startOffset
[0])
408 else if ( sres
.startOffset
[0] == sres2
.startOffset
[0] &&
409 sres
.endOffset
[0] < sres2
.endOffset
[0])
417 SearchResult
TextSearch::searchBackward( const OUString
& searchStr
, sal_Int32 startPos
, sal_Int32 endPos
)
419 osl::MutexGuard
g(m_aMutex
);
423 OUString
in_str(searchStr
);
425 bUsePrimarySrchStr
= true;
427 if ( xTranslit
.is() )
429 // apply only simple 1<->1 transliteration here
430 css::uno::Sequence
<sal_Int32
> offset(startPos
- endPos
);
431 in_str
= xTranslit
->transliterate( searchStr
, endPos
, startPos
- endPos
, offset
);
433 // JP 20.6.2001: also the start and end positions must be corrected!
434 sal_Int32
const newStartPos
= (startPos
< searchStr
.getLength())
435 ? FindPosInSeq_Impl( offset
, startPos
)
436 : in_str
.getLength();
438 sal_Int32
const newEndPos
=
439 (endPos
== 0) ? 0 : FindPosInSeq_Impl( offset
, endPos
);
441 // TODO: this would need nExtraOffset handling to avoid $ matching
442 // if (pRegexMatcher && startPos < searchStr.getLength())
443 // but that appears to be impossible with ICU regex
445 sres
= (this->*fnBackward
)( in_str
, newStartPos
, newEndPos
);
447 // Map offsets back to untransliterated string.
448 const sal_Int32 nOffsets
= offset
.getLength();
451 // For regex nGroups is the number of groups+1 with group 0 being
453 const sal_Int32 nGroups
= sres
.startOffset
.getLength();
454 for ( sal_Int32 k
= 0; k
< nGroups
; k
++ )
456 const sal_Int32 nStart
= sres
.startOffset
[k
];
457 // Result offsets are negative (-1) if a group expression was
462 sres
.startOffset
[k
] = offset
[(nStart
<= nOffsets
? nStart
: nOffsets
) - 1] + 1;
464 sres
.startOffset
[k
] = offset
[0];
466 // JP 20.6.2001: end is ever exclusive and then don't return
467 // the position of the next character - return the
468 // next position behind the last found character!
469 // "a b c" find "b" must return 2,3 and not 2,4!!!
470 const sal_Int32 nStop
= sres
.endOffset
[k
];
472 sres
.endOffset
[k
] = (nStop
< nOffsets
? offset
[nStop
] : (offset
[nOffsets
- 1] + 1));
478 sres
= (this->*fnBackward
)( in_str
, startPos
, endPos
);
481 if ( xTranslit2
.is() && aSrchPara
.AlgorithmType2
!= SearchAlgorithms2::REGEXP
)
486 css::uno::Sequence
<sal_Int32
> offset( in_str
.getLength());
488 in_str
= xTranslit2
->transliterate(searchStr
, 0, in_str
.getLength(), offset
);
490 if( startPos
< searchStr
.getLength() )
491 startPos
= FindPosInSeq_Impl( offset
, startPos
);
493 startPos
= in_str
.getLength();
496 endPos
= FindPosInSeq_Impl( offset
, endPos
);
498 bUsePrimarySrchStr
= false;
499 sres2
= (this->*fnBackward
)( in_str
, startPos
, endPos
);
501 for( int k
= 0; k
< sres2
.startOffset
.getLength(); k
++ )
503 if (sres2
.startOffset
[k
])
504 sres2
.startOffset
[k
] = offset
[sres2
.startOffset
[k
]-1]+1;
505 if (sres2
.endOffset
[k
])
506 sres2
.endOffset
[k
] = offset
[sres2
.endOffset
[k
]-1]+1;
509 // pick last and long one
510 if ( sres
.subRegExpressions
== 0 )
512 if ( sres2
.subRegExpressions
== 1 )
514 if ( sres
.startOffset
[0] < sres2
.startOffset
[0] )
516 if ( sres
.startOffset
[0] == sres2
.startOffset
[0] &&
517 sres
.endOffset
[0] > sres2
.endOffset
[0] )
526 bool TextSearch::IsDelimiter( const OUString
& rStr
, sal_Int32 nPos
) const
529 if( '\x7f' != rStr
[nPos
])
531 if ( !xCharClass
.is() )
532 xCharClass
= CharacterClassification::create( m_xContext
);
533 sal_Int32 nCType
= xCharClass
->getCharacterType( rStr
, nPos
,
535 if( 0 != (( KCharacterType::DIGIT
| KCharacterType::ALPHA
|
536 KCharacterType::LETTER
) & nCType
) )
542 // --------- helper methods for Boyer-Moore like text searching ----------
543 // TODO: use ICU's regex UREGEX_LITERAL mode instead when it becomes available
545 void TextSearch::MakeForwardTab()
547 // create the jumptable for the search text
549 if( pJumpTable
&& bIsForwardTab
)
551 return; // the jumpTable is ok
553 bIsForwardTab
= true;
555 sal_Int32 n
, nLen
= sSrchStr
.getLength();
556 pJumpTable
.reset( new TextSearchJumpTable
);
558 for( n
= 0; n
< nLen
- 1; ++n
)
560 sal_Unicode cCh
= sSrchStr
[n
];
561 sal_Int32 nDiff
= nLen
- n
- 1;
562 TextSearchJumpTable::value_type
aEntry( cCh
, nDiff
);
564 ::std::pair
< TextSearchJumpTable::iterator
, bool > aPair
=
565 pJumpTable
->insert( aEntry
);
567 (*(aPair
.first
)).second
= nDiff
;
571 void TextSearch::MakeForwardTab2()
573 // create the jumptable for the search text
574 if( pJumpTable2
&& bIsForwardTab
)
576 return; // the jumpTable is ok
578 bIsForwardTab
= true;
580 sal_Int32 n
, nLen
= sSrchStr2
.getLength();
581 pJumpTable2
.reset( new TextSearchJumpTable
);
583 for( n
= 0; n
< nLen
- 1; ++n
)
585 sal_Unicode cCh
= sSrchStr2
[n
];
586 sal_Int32 nDiff
= nLen
- n
- 1;
588 TextSearchJumpTable::value_type
aEntry( cCh
, nDiff
);
589 ::std::pair
< TextSearchJumpTable::iterator
, bool > aPair
=
590 pJumpTable2
->insert( aEntry
);
592 (*(aPair
.first
)).second
= nDiff
;
596 void TextSearch::MakeBackwardTab()
598 // create the jumptable for the search text
599 if( pJumpTable
&& !bIsForwardTab
)
601 return; // the jumpTable is ok
603 bIsForwardTab
= false;
605 sal_Int32 n
, nLen
= sSrchStr
.getLength();
606 pJumpTable
.reset( new TextSearchJumpTable
);
608 for( n
= nLen
-1; n
> 0; --n
)
610 sal_Unicode cCh
= sSrchStr
[n
];
611 TextSearchJumpTable::value_type
aEntry( cCh
, n
);
612 ::std::pair
< TextSearchJumpTable::iterator
, bool > aPair
=
613 pJumpTable
->insert( aEntry
);
615 (*(aPair
.first
)).second
= n
;
619 void TextSearch::MakeBackwardTab2()
621 // create the jumptable for the search text
622 if( pJumpTable2
&& !bIsForwardTab
)
624 return; // the jumpTable is ok
626 bIsForwardTab
= false;
628 sal_Int32 n
, nLen
= sSrchStr2
.getLength();
629 pJumpTable2
.reset( new TextSearchJumpTable
);
631 for( n
= nLen
-1; n
> 0; --n
)
633 sal_Unicode cCh
= sSrchStr2
[n
];
634 TextSearchJumpTable::value_type
aEntry( cCh
, n
);
635 ::std::pair
< TextSearchJumpTable::iterator
, bool > aPair
=
636 pJumpTable2
->insert( aEntry
);
638 (*(aPair
.first
)).second
= n
;
642 sal_Int32
TextSearch::GetDiff( const sal_Unicode cChr
) const
644 TextSearchJumpTable
*pJump
;
647 if ( bUsePrimarySrchStr
) {
648 pJump
= pJumpTable
.get();
649 sSearchKey
= sSrchStr
;
651 pJump
= pJumpTable2
.get();
652 sSearchKey
= sSrchStr2
;
655 TextSearchJumpTable::const_iterator iLook
= pJump
->find( cChr
);
656 if ( iLook
== pJump
->end() )
657 return sSearchKey
.getLength();
658 return (*iLook
).second
;
662 SearchResult
TextSearch::NSrchFrwrd( const OUString
& searchStr
, sal_Int32 startPos
, sal_Int32 endPos
)
665 aRet
.subRegExpressions
= 0;
667 OUString sSearchKey
= bUsePrimarySrchStr
? sSrchStr
: sSrchStr2
;
669 sal_Int32 nSuchIdx
= searchStr
.getLength();
670 sal_Int32 nEnde
= endPos
;
671 if( !nSuchIdx
|| !sSearchKey
.getLength() || sSearchKey
.getLength() > nSuchIdx
)
675 if( nEnde
< sSearchKey
.getLength() ) // position inside the search region ?
678 nEnde
-= sSearchKey
.getLength();
680 if (bUsePrimarySrchStr
)
681 MakeForwardTab(); // create the jumptable
685 for (sal_Int32 nCmpIdx
= startPos
; // start position for the search
687 nCmpIdx
+= GetDiff( searchStr
[nCmpIdx
+ sSearchKey
.getLength()-1]))
689 // if the match would be the completed cells, skip it.
690 if ( (checkCTLStart
&& !isCellStart( searchStr
, nCmpIdx
)) || (checkCTLEnd
691 && !isCellStart( searchStr
, nCmpIdx
+ sSearchKey
.getLength())) )
694 nSuchIdx
= sSearchKey
.getLength() - 1;
695 while( nSuchIdx
>= 0 && sSearchKey
[nSuchIdx
] == searchStr
[nCmpIdx
+ nSuchIdx
])
699 if( SearchFlags::NORM_WORD_ONLY
& aSrchPara
.searchFlag
)
701 sal_Int32 nFndEnd
= nCmpIdx
+ sSearchKey
.getLength();
702 bool bAtStart
= !nCmpIdx
;
703 bool bAtEnd
= nFndEnd
== endPos
;
704 bool bDelimBefore
= bAtStart
|| IsDelimiter( searchStr
, nCmpIdx
-1 );
705 bool bDelimBehind
= bAtEnd
|| IsDelimiter( searchStr
, nFndEnd
);
706 // * 1 -> only one word in the paragraph
707 // * 2 -> at begin of paragraph
708 // * 3 -> at end of paragraph
709 // * 4 -> inside the paragraph
710 if( !( ( bAtStart
&& bAtEnd
) || // 1
711 ( bAtStart
&& bDelimBehind
) || // 2
712 ( bAtEnd
&& bDelimBefore
) || // 3
713 ( bDelimBefore
&& bDelimBehind
))) // 4
717 aRet
.subRegExpressions
= 1;
718 aRet
.startOffset
.realloc( 1 );
719 aRet
.startOffset
[ 0 ] = nCmpIdx
;
720 aRet
.endOffset
.realloc( 1 );
721 aRet
.endOffset
[ 0 ] = nCmpIdx
+ sSearchKey
.getLength();
732 SearchResult
TextSearch::NSrchBkwrd( const OUString
& searchStr
, sal_Int32 startPos
, sal_Int32 endPos
)
735 aRet
.subRegExpressions
= 0;
737 OUString sSearchKey
= bUsePrimarySrchStr
? sSrchStr
: sSrchStr2
;
739 sal_Int32 nSuchIdx
= searchStr
.getLength();
740 sal_Int32 nEnde
= endPos
;
741 if( nSuchIdx
== 0 || sSearchKey
.isEmpty() || sSearchKey
.getLength() > nSuchIdx
)
744 if (bUsePrimarySrchStr
)
745 MakeBackwardTab(); // create the jumptable
749 if( nEnde
== nSuchIdx
) // end position for the search
750 nEnde
= sSearchKey
.getLength();
752 nEnde
+= sSearchKey
.getLength();
754 sal_Int32 nCmpIdx
= startPos
; // start position for the search
756 while (nCmpIdx
>= nEnde
)
758 // if the match would be the completed cells, skip it.
759 if ( (!checkCTLStart
|| isCellStart( searchStr
, nCmpIdx
-
760 sSearchKey
.getLength() )) && (!checkCTLEnd
||
761 isCellStart( searchStr
, nCmpIdx
)))
764 while( nSuchIdx
< sSearchKey
.getLength() && sSearchKey
[nSuchIdx
] ==
765 searchStr
[nCmpIdx
+ nSuchIdx
- sSearchKey
.getLength()] )
767 if( nSuchIdx
>= sSearchKey
.getLength() )
769 if( SearchFlags::NORM_WORD_ONLY
& aSrchPara
.searchFlag
)
771 sal_Int32 nFndStt
= nCmpIdx
- sSearchKey
.getLength();
772 bool bAtStart
= !nFndStt
;
773 bool bAtEnd
= nCmpIdx
== startPos
;
774 bool bDelimBehind
= bAtEnd
|| IsDelimiter( searchStr
, nCmpIdx
);
775 bool bDelimBefore
= bAtStart
|| // begin of paragraph
776 IsDelimiter( searchStr
, nFndStt
-1 );
777 // * 1 -> only one word in the paragraph
778 // * 2 -> at begin of paragraph
779 // * 3 -> at end of paragraph
780 // * 4 -> inside the paragraph
781 if( ( bAtStart
&& bAtEnd
) || // 1
782 ( bAtStart
&& bDelimBehind
) || // 2
783 ( bAtEnd
&& bDelimBefore
) || // 3
784 ( bDelimBefore
&& bDelimBehind
)) // 4
786 aRet
.subRegExpressions
= 1;
787 aRet
.startOffset
.realloc( 1 );
788 aRet
.startOffset
[ 0 ] = nCmpIdx
;
789 aRet
.endOffset
.realloc( 1 );
790 aRet
.endOffset
[ 0 ] = nCmpIdx
- sSearchKey
.getLength();
796 aRet
.subRegExpressions
= 1;
797 aRet
.startOffset
.realloc( 1 );
798 aRet
.startOffset
[ 0 ] = nCmpIdx
;
799 aRet
.endOffset
.realloc( 1 );
800 aRet
.endOffset
[ 0 ] = nCmpIdx
- sSearchKey
.getLength();
805 nSuchIdx
= GetDiff( searchStr
[nCmpIdx
- sSearchKey
.getLength()] );
806 if( nCmpIdx
< nSuchIdx
)
813 void TextSearch::RESrchPrepare( const css::util::SearchOptions2
& rOptions
)
815 TransliterationFlags transliterateFlags
= static_cast<TransliterationFlags
>(rOptions
.transliterateFlags
);
816 // select the transliterated pattern string
817 const OUString
& rPatternStr
=
818 (isSimpleTrans(transliterateFlags
) ? sSrchStr
819 : (isComplexTrans(transliterateFlags
) ? sSrchStr2
: rOptions
.searchString
));
821 sal_uInt32 nIcuSearchFlags
= UREGEX_UWORD
; // request UAX#29 unicode capability
822 // map css::util::SearchFlags to ICU uregex.h flags
823 // TODO: REG_EXTENDED, REG_NOT_BEGINOFLINE, REG_NOT_ENDOFLINE
824 // REG_NEWLINE is neither properly defined nor used anywhere => not implemented
825 // REG_NOSUB is not used anywhere => not implemented
826 // NORM_WORD_ONLY is only used for SearchAlgorithm==Absolute
827 // LEV_RELAXED is only used for SearchAlgorithm==Approximate
828 // Note that the search flag ALL_IGNORE_CASE is deprecated in UNO
829 // probably because the transliteration flag IGNORE_CASE handles it as well.
830 if( (rOptions
.searchFlag
& css::util::SearchFlags::ALL_IGNORE_CASE
) != 0
831 || (transliterateFlags
& TransliterationFlags::IGNORE_CASE
))
832 nIcuSearchFlags
|= UREGEX_CASE_INSENSITIVE
;
833 UErrorCode nIcuErr
= U_ZERO_ERROR
;
834 // assumption: transliteration didn't mangle regexp control chars
835 IcuUniString
aIcuSearchPatStr( reinterpret_cast<const UChar
*>(rPatternStr
.getStr()), rPatternStr
.getLength());
836 #ifndef DISABLE_WORDBOUND_EMULATION
837 // for convenience specific syntax elements of the old regex engine are emulated
838 // - by replacing \< with "word-break followed by a look-ahead word-char"
839 static const IcuUniString
aChevronPatternB( "\\\\<", -1, IcuUniString::kInvariant
);
840 static const IcuUniString
aChevronReplaceB( "\\\\b(?=\\\\w)", -1, IcuUniString::kInvariant
);
841 static icu::RegexMatcher
aChevronMatcherB( aChevronPatternB
, 0, nIcuErr
);
842 aChevronMatcherB
.reset( aIcuSearchPatStr
);
843 aIcuSearchPatStr
= aChevronMatcherB
.replaceAll( aChevronReplaceB
, nIcuErr
);
844 aChevronMatcherB
.reset();
845 // - by replacing \> with "look-behind word-char followed by a word-break"
846 static const IcuUniString
aChevronPatternE( "\\\\>", -1, IcuUniString::kInvariant
);
847 static const IcuUniString
aChevronReplaceE( "(?<=\\\\w)\\\\b", -1, IcuUniString::kInvariant
);
848 static icu::RegexMatcher
aChevronMatcherE( aChevronPatternE
, 0, nIcuErr
);
849 aChevronMatcherE
.reset( aIcuSearchPatStr
);
850 aIcuSearchPatStr
= aChevronMatcherE
.replaceAll( aChevronReplaceE
, nIcuErr
);
851 aChevronMatcherE
.reset();
853 pRegexMatcher
.reset( new icu::RegexMatcher( aIcuSearchPatStr
, nIcuSearchFlags
, nIcuErr
) );
856 SAL_INFO( "i18npool", "TextSearch::RESrchPrepare UErrorCode " << nIcuErr
);
857 pRegexMatcher
.reset();
861 // Pathological patterns may result in exponential run time making the
862 // application appear to be frozen. Limit that. Documentation for this
864 // https://ssl.icu-project.org/apiref/icu4c/classicu_1_1RegexMatcher.html#a6ebcfcab4fe6a38678c0291643a03a00
865 // "The units of the limit are steps of the match engine.
866 // Correspondence with actual processor time will depend on the speed
867 // of the processor and the details of the specific pattern, but will
868 // typically be on the order of milliseconds."
869 // Just what is a good value? 42 is always an answer ... the 23 enigma
870 // as well... which on the dev's machine is roughly 50 seconds with the
871 // pattern of fdo#70627.
872 /* TODO: make this a configuration settable value and possibly take
873 * complexity of expression into account and maybe even length of text
874 * to be matched; currently (2013-11-25) that is at most one 64k
875 * paragraph per RESrchFrwrd()/RESrchBkwrd() call. */
876 pRegexMatcher
->setTimeLimit( 23*1000, nIcuErr
);
881 static bool lcl_findRegex( std::unique_ptr
<icu::RegexMatcher
> const & pRegexMatcher
, sal_Int32 nStartPos
, UErrorCode
& rIcuErr
)
883 if (!pRegexMatcher
->find( nStartPos
, rIcuErr
))
885 /* TODO: future versions could pass the UErrorCode or translations
886 * thereof to the caller, for example to inform the user of
887 * U_REGEX_TIME_OUT. The strange thing though is that an error is set
888 * only after the second call that returns immediately and not if
889 * timeout occurred on the first call?!? */
890 SAL_INFO( "i18npool", "lcl_findRegex UErrorCode " << rIcuErr
);
896 SearchResult
TextSearch::RESrchFrwrd( const OUString
& searchStr
,
897 sal_Int32 startPos
, sal_Int32 endPos
)
900 aRet
.subRegExpressions
= 0;
904 if( endPos
> searchStr
.getLength())
905 endPos
= searchStr
.getLength();
907 // use the ICU RegexMatcher to find the matches
908 UErrorCode nIcuErr
= U_ZERO_ERROR
;
909 const IcuUniString
aSearchTargetStr( reinterpret_cast<const UChar
*>(searchStr
.getStr()), endPos
);
910 pRegexMatcher
->reset( aSearchTargetStr
);
911 // search until there is a valid match
914 if (!lcl_findRegex( pRegexMatcher
, startPos
, nIcuErr
))
917 // #i118887# ignore zero-length matches e.g. "a*" in "bc"
918 int nStartOfs
= pRegexMatcher
->start( nIcuErr
);
919 int nEndOfs
= pRegexMatcher
->end( nIcuErr
);
920 if( nStartOfs
< nEndOfs
)
922 // If the zero-length match is behind the string, do not match it again
923 // and again until startPos reaches there. A match behind the string is
925 if (nStartOfs
== endPos
)
927 // try at next position if there was a zero-length match
928 if( ++startPos
>= endPos
)
932 // extract the result of the search
933 const int nGroupCount
= pRegexMatcher
->groupCount();
934 aRet
.subRegExpressions
= nGroupCount
+ 1;
935 aRet
.startOffset
.realloc( aRet
.subRegExpressions
);
936 aRet
.endOffset
.realloc( aRet
.subRegExpressions
);
937 aRet
.startOffset
[0] = pRegexMatcher
->start( nIcuErr
);
938 aRet
.endOffset
[0] = pRegexMatcher
->end( nIcuErr
);
939 for( int i
= 1; i
<= nGroupCount
; ++i
) {
940 aRet
.startOffset
[i
] = pRegexMatcher
->start( i
, nIcuErr
);
941 aRet
.endOffset
[i
] = pRegexMatcher
->end( i
, nIcuErr
);
947 SearchResult
TextSearch::RESrchBkwrd( const OUString
& searchStr
,
948 sal_Int32 startPos
, sal_Int32 endPos
)
950 // NOTE: for backwards search callers provide startPos/endPos inverted!
952 aRet
.subRegExpressions
= 0;
956 if( startPos
> searchStr
.getLength())
957 startPos
= searchStr
.getLength();
959 // use the ICU RegexMatcher to find the matches
960 // TODO: use ICU's backward searching once it becomes available
961 // as its replacement using forward search is not as good as the real thing
962 UErrorCode nIcuErr
= U_ZERO_ERROR
;
963 const IcuUniString
aSearchTargetStr( reinterpret_cast<const UChar
*>(searchStr
.getStr()), startPos
);
964 pRegexMatcher
->reset( aSearchTargetStr
);
965 if (!lcl_findRegex( pRegexMatcher
, endPos
, nIcuErr
))
968 // find the last match
971 int nGoodPos
= 0, nGoodEnd
= 0;
974 nLastPos
= pRegexMatcher
->start( nIcuErr
);
975 nFoundEnd
= pRegexMatcher
->end( nIcuErr
);
976 if (nLastPos
< nFoundEnd
)
978 // remember last non-zero-length match
980 nGoodEnd
= nFoundEnd
;
982 if( nFoundEnd
>= startPos
)
985 if( nFoundEnd
== nLastPos
)
987 } while( lcl_findRegex( pRegexMatcher
, nFoundEnd
, nIcuErr
));
989 // Ignore all zero-length matches except "$" anchor on first match.
990 if (nGoodPos
== nGoodEnd
)
992 if (bFirst
&& nLastPos
== startPos
)
998 // find last match again to get its details
999 lcl_findRegex( pRegexMatcher
, nGoodPos
, nIcuErr
);
1001 // fill in the details of the last match
1002 const int nGroupCount
= pRegexMatcher
->groupCount();
1003 aRet
.subRegExpressions
= nGroupCount
+ 1;
1004 aRet
.startOffset
.realloc( aRet
.subRegExpressions
);
1005 aRet
.endOffset
.realloc( aRet
.subRegExpressions
);
1006 // NOTE: existing users of backward search seem to expect startOfs/endOfs being inverted!
1007 aRet
.startOffset
[0] = pRegexMatcher
->end( nIcuErr
);
1008 aRet
.endOffset
[0] = pRegexMatcher
->start( nIcuErr
);
1009 for( int i
= 1; i
<= nGroupCount
; ++i
) {
1010 aRet
.startOffset
[i
] = pRegexMatcher
->end( i
, nIcuErr
);
1011 aRet
.endOffset
[i
] = pRegexMatcher
->start( i
, nIcuErr
);
1018 // search for words phonetically
1019 SearchResult
TextSearch::ApproxSrchFrwrd( const OUString
& searchStr
,
1020 sal_Int32 startPos
, sal_Int32 endPos
)
1023 aRet
.subRegExpressions
= 0;
1028 sal_Int32 nStt
, nEnd
;
1030 Boundary aWBnd
= xBreak
->getWordBoundary( searchStr
, startPos
,
1032 WordType::ANYWORD_IGNOREWHITESPACES
, true );
1036 if( aWBnd
.startPos
>= endPos
)
1038 nStt
= aWBnd
.startPos
< startPos
? startPos
: aWBnd
.startPos
;
1039 nEnd
= std::min(aWBnd
.endPos
, endPos
);
1042 pWLD
->WLD( searchStr
.getStr() + nStt
, nEnd
- nStt
) <= nLimit
)
1044 aRet
.subRegExpressions
= 1;
1045 aRet
.startOffset
.realloc( 1 );
1046 aRet
.startOffset
[ 0 ] = nStt
;
1047 aRet
.endOffset
.realloc( 1 );
1048 aRet
.endOffset
[ 0 ] = nEnd
;
1053 aWBnd
= xBreak
->nextWord( searchStr
, nStt
, aSrchPara
.Locale
,
1054 WordType::ANYWORD_IGNOREWHITESPACES
);
1055 } while( aWBnd
.startPos
!= aWBnd
.endPos
||
1056 (aWBnd
.endPos
!= searchStr
.getLength() && aWBnd
.endPos
!= nEnd
) );
1057 // #i50244# aWBnd.endPos != nEnd : in case there is _no_ word (only
1058 // whitespace) in searchStr, getWordBoundary() returned startPos,startPos
1059 // and nextWord() does also => don't loop forever.
1063 SearchResult
TextSearch::ApproxSrchBkwrd( const OUString
& searchStr
,
1064 sal_Int32 startPos
, sal_Int32 endPos
)
1067 aRet
.subRegExpressions
= 0;
1072 sal_Int32 nStt
, nEnd
;
1074 Boundary aWBnd
= xBreak
->getWordBoundary( searchStr
, startPos
,
1076 WordType::ANYWORD_IGNOREWHITESPACES
, true );
1080 if( aWBnd
.endPos
<= endPos
)
1082 nStt
= aWBnd
.startPos
< endPos
? endPos
: aWBnd
.startPos
;
1083 nEnd
= std::min(aWBnd
.endPos
, startPos
);
1086 pWLD
->WLD( searchStr
.getStr() + nStt
, nEnd
- nStt
) <= nLimit
)
1088 aRet
.subRegExpressions
= 1;
1089 aRet
.startOffset
.realloc( 1 );
1090 aRet
.startOffset
[ 0 ] = nEnd
;
1091 aRet
.endOffset
.realloc( 1 );
1092 aRet
.endOffset
[ 0 ] = nStt
;
1098 aWBnd
= xBreak
->previousWord( searchStr
, nStt
, aSrchPara
.Locale
,
1099 WordType::ANYWORD_IGNOREWHITESPACES
);
1100 } while( aWBnd
.startPos
!= aWBnd
.endPos
|| aWBnd
.endPos
!= searchStr
.getLength() );
1106 void setWildcardMatch( css::util::SearchResult
& rRes
, sal_Int32 nStartOffset
, sal_Int32 nEndOffset
)
1108 rRes
.subRegExpressions
= 1;
1109 rRes
.startOffset
.realloc(1);
1110 rRes
.endOffset
.realloc(1);
1111 rRes
.startOffset
[0] = nStartOffset
;
1112 rRes
.endOffset
[0] = nEndOffset
;
1116 SearchResult
TextSearch::WildcardSrchFrwrd( const OUString
& searchStr
, sal_Int32 nStartPos
, sal_Int32 nEndPos
)
1119 aRes
.subRegExpressions
= 0; // no match
1120 sal_Int32 nStartOffset
= nStartPos
;
1121 sal_Int32 nEndOffset
= nEndPos
;
1123 const sal_Int32 nStringLen
= searchStr
.getLength();
1125 // Forward nStartPos inclusive, nEndPos exclusive, but allow for empty
1126 // string match with [0,0).
1127 if (nStartPos
< 0 || nEndPos
> nStringLen
|| nEndPos
< nStartPos
||
1128 (nStartPos
== nStringLen
&& (nStringLen
!= 0 || nStartPos
!= nEndPos
)))
1131 const OUString
& rPattern
= (bUsePrimarySrchStr
? sSrchStr
: sSrchStr2
);
1132 const sal_Int32 nPatternLen
= rPattern
.getLength();
1134 // Handle special cases empty pattern and/or string outside of the loop to
1135 // not add performance penalties there and simplify.
1136 if (nStartPos
== nEndPos
)
1139 while (i
< nPatternLen
&& rPattern
[i
] == '*')
1141 if (i
== nPatternLen
)
1142 setWildcardMatch( aRes
, nStartOffset
, nEndOffset
);
1146 // Empty pattern does not match any non-empty string.
1150 bool bRewind
= false;
1151 sal_uInt32 cPattern
= 0;
1152 sal_Int32 nPattern
= 0;
1153 sal_Int32 nAfterFakePattern
= nPattern
;
1154 if (mbWildcardAllowSubstring
)
1156 // Fake a leading '*' wildcard.
1159 // Assume a non-'*' pattern character follows. If it is a '*' instead
1160 // that will be handled in the loop by setting nPat.
1161 sal_uInt32 cu
= rPattern
.iterateCodePoints( &nAfterFakePattern
);
1162 if (cu
== mcWildcardEscapeChar
&& mcWildcardEscapeChar
&& nAfterFakePattern
< nPatternLen
)
1163 rPattern
.iterateCodePoints( &nAfterFakePattern
);
1166 sal_Int32 nString
= nStartPos
, nPat
= -1, nStr
= -1, nLastAsterisk
= -1;
1167 sal_uInt32 cPatternAfterAsterisk
= 0;
1168 bool bEscaped
= false, bEscapedAfterAsterisk
= false;
1170 // The loop code tries to avoid costly calls to iterateCodePoints() when
1177 // Reuse cPattern after '*', nPattern was correspondingly
1178 // incremented to point behind cPattern.
1181 else if (nPattern
< nPatternLen
)
1183 // nPattern will be incremented by iterateCodePoints().
1184 cPattern
= rPattern
.iterateCodePoints( &nPattern
);
1185 if (cPattern
== mcWildcardEscapeChar
&& mcWildcardEscapeChar
&& nPattern
< nPatternLen
)
1188 cPattern
= rPattern
.iterateCodePoints( &nPattern
);
1193 // A trailing '*' is handled below.
1194 if (mbWildcardAllowSubstring
)
1196 // If the pattern is consumed and substring match allowed we're good.
1197 setWildcardMatch( aRes
, nStartOffset
, nString
);
1200 else if (nString
< nEndPos
&& nLastAsterisk
>= 0)
1202 // If substring match is not allowed try a greedy '*' match.
1203 nPattern
= nLastAsterisk
;
1210 if (cPattern
== '*' && !bEscaped
)
1212 // '*' is one code unit, so not using iterateCodePoints() is ok.
1213 while (nPattern
< nPatternLen
&& rPattern
[nPattern
] == '*')
1216 if (nPattern
>= nPatternLen
)
1218 // Last pattern is '*', remaining string matches.
1219 setWildcardMatch( aRes
, nStartOffset
, nEndOffset
);
1223 nLastAsterisk
= nPattern
; // Remember last encountered '*'.
1225 // cPattern will be the next non-'*' character, nPattern
1227 cPattern
= rPattern
.iterateCodePoints( &nPattern
);
1228 if (cPattern
== mcWildcardEscapeChar
&& mcWildcardEscapeChar
&& nPattern
< nPatternLen
)
1231 cPattern
= rPattern
.iterateCodePoints( &nPattern
);
1234 cPatternAfterAsterisk
= cPattern
;
1235 bEscapedAfterAsterisk
= bEscaped
;
1236 nPat
= nPattern
; // Remember position of pattern behind '*', already incremented.
1237 nStr
= nString
; // Remember the current string to be matched.
1240 if (nString
>= nEndPos
)
1241 // Whatever follows in pattern, string will not match.
1244 // nString will be incremented by iterateCodePoints().
1245 sal_uInt32 cString
= searchStr
.iterateCodePoints( &nString
);
1247 if ((cPattern
!= '?' || bEscaped
) && cPattern
!= cString
)
1250 // Non-match already without any '*' pattern.
1254 nPattern
= nPat
; // Rewind pattern to character behind '*', already incremented.
1255 cPattern
= cPatternAfterAsterisk
;
1256 bEscaped
= bEscapedAfterAsterisk
;
1257 searchStr
.iterateCodePoints( &nStr
);
1258 nString
= nStr
; // Restore incremented remembered string position.
1259 if (nPat
== nAfterFakePattern
)
1261 // Next start offset will be the next character.
1262 nStartOffset
= nString
;
1267 // An unescaped '?' pattern matched any character, or characters
1268 // matched. Reset only escaped state.
1272 while (nString
< nEndPos
);
1277 // Eat trailing '*' pattern that matches anything, including nothing.
1278 // '*' is one code unit, so not using iterateCodePoints() is ok.
1279 while (nPattern
< nPatternLen
&& rPattern
[nPattern
] == '*')
1282 if (nPattern
== nPatternLen
)
1283 setWildcardMatch( aRes
, nStartOffset
, nEndOffset
);
1287 SearchResult
TextSearch::WildcardSrchBkwrd( const OUString
& searchStr
, sal_Int32 nStartPos
, sal_Int32 nEndPos
)
1290 aRes
.subRegExpressions
= 0; // no match
1292 sal_Int32 nStartOffset
= nStartPos
;
1293 sal_Int32 nEndOffset
= nEndPos
;
1295 const sal_Int32 nStringLen
= searchStr
.getLength();
1297 // Backward nStartPos exclusive, nEndPos inclusive, but allow for empty
1298 // string match with (0,0].
1299 if (nStartPos
> nStringLen
|| nEndPos
< 0 || nStartPos
< nEndPos
||
1300 (nEndPos
== nStringLen
&& (nStringLen
!= 0 || nStartPos
!= nEndPos
)))
1303 const OUString
& rPattern
= (bUsePrimarySrchStr
? sSrchStr
: sSrchStr2
);
1304 sal_Int32 nPatternLen
= rPattern
.getLength();
1306 // Handle special cases empty pattern and/or string outside of the loop to
1307 // not add performance penalties there and simplify.
1308 if (nStartPos
== nEndPos
)
1311 while (i
< nPatternLen
&& rPattern
[i
] == '*')
1313 if (i
== nPatternLen
)
1314 setWildcardMatch( aRes
, nStartOffset
, nEndOffset
);
1318 // Empty pattern does not match any non-empty string.
1322 // Reverse escaped patterns to ease the handling of escapes, keeping escape
1323 // and following character as one sequence in backward direction.
1324 if ((bUsePrimarySrchStr
&& maWildcardReversePattern
.isEmpty()) ||
1325 (!bUsePrimarySrchStr
&& maWildcardReversePattern2
.isEmpty()))
1327 OUStringBuffer
aPatternBuf( rPattern
);
1328 sal_Int32 nIndex
= 0;
1329 while (nIndex
< nPatternLen
)
1331 const sal_Int32 nOld
= nIndex
;
1332 const sal_uInt32 cu
= rPattern
.iterateCodePoints( &nIndex
);
1333 if (cu
== mcWildcardEscapeChar
)
1335 if (nIndex
< nPatternLen
)
1337 if (nIndex
- nOld
== 1)
1339 // Simply move code units, we already memorized the one
1341 const sal_Int32 nOld2
= nIndex
;
1342 rPattern
.iterateCodePoints( &nIndex
);
1343 for (sal_Int32 i
=0; i
< nIndex
- nOld2
; ++i
)
1344 aPatternBuf
[nOld
+i
] = rPattern
[nOld2
+i
];
1345 aPatternBuf
[nIndex
-1] = static_cast<sal_Unicode
>(cu
);
1349 // Copy the escape character code units first in the
1350 // unlikely case that it would not be of BMP.
1351 assert(nIndex
- nOld
== 2); // it's UTF-16, so...
1353 buf
[0] = rPattern
[nOld
];
1354 buf
[1] = rPattern
[nOld
+1];
1355 const sal_Int32 nOld2
= nIndex
;
1356 rPattern
.iterateCodePoints( &nIndex
);
1357 for (sal_Int32 i
=0; i
< nIndex
- nOld2
; ++i
)
1358 aPatternBuf
[nOld
+i
] = rPattern
[nOld2
+i
];
1359 aPatternBuf
[nIndex
-2] = buf
[0];
1360 aPatternBuf
[nIndex
-1] = buf
[1];
1365 // Trailing escape would become leading escape, do what?
1367 aPatternBuf
.remove( nOld
, nIndex
- nOld
);
1371 if (bUsePrimarySrchStr
)
1372 maWildcardReversePattern
= aPatternBuf
.makeStringAndClear();
1374 maWildcardReversePattern2
= aPatternBuf
.makeStringAndClear();
1376 const OUString
& rReversePattern
= (bUsePrimarySrchStr
? maWildcardReversePattern
: maWildcardReversePattern2
);
1377 nPatternLen
= rReversePattern
.getLength();
1379 bool bRewind
= false;
1380 sal_uInt32 cPattern
= 0;
1381 sal_Int32 nPattern
= nPatternLen
;
1382 sal_Int32 nAfterFakePattern
= nPattern
;
1383 if (mbWildcardAllowSubstring
)
1385 // Fake a trailing '*' wildcard.
1388 // Assume a non-'*' pattern character follows. If it is a '*' instead
1389 // that will be handled in the loop by setting nPat.
1390 sal_uInt32 cu
= rReversePattern
.iterateCodePoints( &nAfterFakePattern
, -1);
1391 if (cu
== mcWildcardEscapeChar
&& mcWildcardEscapeChar
&& nAfterFakePattern
> 0)
1392 rReversePattern
.iterateCodePoints( &nAfterFakePattern
, -1);
1395 sal_Int32 nString
= nStartPos
, nPat
= -1, nStr
= -1, nLastAsterisk
= -1;
1396 sal_uInt32 cPatternAfterAsterisk
= 0;
1397 bool bEscaped
= false, bEscapedAfterAsterisk
= false;
1399 // The loop code tries to avoid costly calls to iterateCodePoints() when
1406 // Reuse cPattern after '*', nPattern was correspondingly
1407 // decremented to point before cPattern.
1410 else if (nPattern
> 0)
1412 // nPattern will be decremented by iterateCodePoints().
1413 cPattern
= rReversePattern
.iterateCodePoints( &nPattern
, -1);
1414 if (cPattern
== mcWildcardEscapeChar
&& mcWildcardEscapeChar
&& nPattern
> 0)
1417 cPattern
= rReversePattern
.iterateCodePoints( &nPattern
, -1);
1422 // A trailing '*' is handled below.
1423 if (mbWildcardAllowSubstring
)
1425 // If the pattern is consumed and substring match allowed we're good.
1426 setWildcardMatch( aRes
, nStartOffset
, nString
);
1429 else if (nString
> nEndPos
&& nLastAsterisk
>= 0)
1431 // If substring match is not allowed try a greedy '*' match.
1432 nPattern
= nLastAsterisk
;
1439 if (cPattern
== '*' && !bEscaped
)
1441 // '*' is one code unit, so not using iterateCodePoints() is ok.
1442 while (nPattern
> 0 && rReversePattern
[nPattern
-1] == '*')
1447 // First pattern is '*', remaining string matches.
1448 setWildcardMatch( aRes
, nStartOffset
, nEndOffset
);
1452 nLastAsterisk
= nPattern
; // Remember last encountered '*'.
1454 // cPattern will be the previous non-'*' character, nPattern
1456 cPattern
= rReversePattern
.iterateCodePoints( &nPattern
, -1);
1457 if (cPattern
== mcWildcardEscapeChar
&& mcWildcardEscapeChar
&& nPattern
> 0)
1460 cPattern
= rReversePattern
.iterateCodePoints( &nPattern
, -1);
1463 cPatternAfterAsterisk
= cPattern
;
1464 bEscapedAfterAsterisk
= bEscaped
;
1465 nPat
= nPattern
; // Remember position of pattern before '*', already decremented.
1466 nStr
= nString
; // Remember the current string to be matched.
1469 if (nString
<= nEndPos
)
1470 // Whatever leads in pattern, string will not match.
1473 // nString will be decremented by iterateCodePoints().
1474 sal_uInt32 cString
= searchStr
.iterateCodePoints( &nString
, -1);
1476 if ((cPattern
!= '?' || bEscaped
) && cPattern
!= cString
)
1479 // Non-match already without any '*' pattern.
1483 nPattern
= nPat
; // Rewind pattern to character before '*', already decremented.
1484 cPattern
= cPatternAfterAsterisk
;
1485 bEscaped
= bEscapedAfterAsterisk
;
1486 searchStr
.iterateCodePoints( &nStr
, -1);
1487 nString
= nStr
; // Restore decremented remembered string position.
1488 if (nPat
== nAfterFakePattern
)
1490 // Next start offset will be this character (exclusive).
1491 nStartOffset
= nString
;
1496 // An unescaped '?' pattern matched any character, or characters
1497 // matched. Reset only escaped state.
1501 while (nString
> nEndPos
);
1506 // Eat leading '*' pattern that matches anything, including nothing.
1507 // '*' is one code unit, so not using iterateCodePoints() is ok.
1508 while (nPattern
> 0 && rReversePattern
[nPattern
-1] == '*')
1512 setWildcardMatch( aRes
, nStartOffset
, nEndOffset
);
1517 static const sal_Char cSearchImpl
[] = "com.sun.star.util.TextSearch_i18n";
1519 static uno::Sequence
< OUString
> getServiceName_Static()
1521 uno::Sequence
< OUString
> aRet(2);
1522 aRet
[0] = "com.sun.star.util.TextSearch";
1523 aRet
[1] = "com.sun.star.util.TextSearch2";
1527 static OUString
getImplementationName_Static()
1533 TextSearch::getImplementationName()
1535 return getImplementationName_Static();
1538 sal_Bool SAL_CALL
TextSearch::supportsService(const OUString
& rServiceName
)
1540 return cppu::supportsService(this, rServiceName
);
1543 Sequence
< OUString
> SAL_CALL
1544 TextSearch::getSupportedServiceNames()
1546 Sequence
< OUString
> aRet
{ getServiceName_Static() };
1550 static css::uno::Reference
< css::uno::XInterface
>
1551 TextSearch_CreateInstance(
1552 const css::uno::Reference
<
1553 css::lang::XMultiServiceFactory
>& rxMSF
)
1555 return css::uno::Reference
<
1556 css::uno::XInterface
>(
1557 static_cast<cppu::OWeakObject
*>(new TextSearch(
1558 comphelper::getComponentContext( rxMSF
) )) );
1563 SAL_DLLPUBLIC_EXPORT
void*
1564 i18nsearch_component_getFactory( const sal_Char
* sImplementationName
,
1565 void* _pServiceManager
,
1566 SAL_UNUSED_PARAMETER
void* )
1568 void* pRet
= nullptr;
1570 css::lang::XMultiServiceFactory
* pServiceManager
=
1571 static_cast< css::lang::XMultiServiceFactory
* >
1572 ( _pServiceManager
);
1573 css::uno::Reference
<
1574 css::lang::XSingleServiceFactory
> xFactory
;
1576 if ( 0 == rtl_str_compare( sImplementationName
, cSearchImpl
) )
1578 css::uno::Sequence
< OUString
> aServiceNames
{ getServiceName_Static() };
1579 xFactory
= ::cppu::createSingleFactory(
1580 pServiceManager
, getImplementationName_Static(),
1581 &TextSearch_CreateInstance
, aServiceNames
);
1584 if ( xFactory
.is() )
1586 xFactory
->acquire();
1587 pRet
= xFactory
.get();
1595 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */