1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
3 * This file is part of the LibreOffice project.
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 * This file incorporates work covered by the following license notice:
11 * Licensed to the Apache Software Foundation (ASF) under one or more
12 * contributor license agreements. See the NOTICE file distributed
13 * with this work for additional information regarding copyright
14 * ownership. The ASF licenses this file to you under the Apache
15 * License, Version 2.0 (the "License"); you may not use this file
16 * except in compliance with the License. You may obtain a copy of
17 * the License at http://www.apache.org/licenses/LICENSE-2.0 .
20 #include "textsearch.hxx"
22 #include <com/sun/star/i18n/BreakIterator.hpp>
23 #include <com/sun/star/util/SearchAlgorithms2.hpp>
24 #include <com/sun/star/util/SearchFlags.hpp>
25 #include <com/sun/star/i18n/WordType.hpp>
26 #include <com/sun/star/i18n/ScriptType.hpp>
27 #include <com/sun/star/i18n/CharacterIteratorMode.hpp>
28 #include <com/sun/star/i18n/CharacterClassification.hpp>
29 #include <com/sun/star/i18n/KCharacterType.hpp>
30 #include <com/sun/star/i18n/Transliteration.hpp>
31 #include <cppuhelper/supportsservice.hxx>
32 #include <cppuhelper/weak.hxx>
33 #include <i18nutil/transliteration.hxx>
34 #include <rtl/ustrbuf.hxx>
35 #include <sal/log.hxx>
37 #include <unicode/regex.h>
39 using namespace ::com::sun::star::util
;
40 using namespace ::com::sun::star::uno
;
41 using namespace ::com::sun::star::lang
;
42 using namespace ::com::sun::star::i18n
;
43 using namespace ::com::sun::star
;
45 const TransliterationFlags COMPLEX_TRANS_MASK
=
46 TransliterationFlags::ignoreBaFa_ja_JP
|
47 TransliterationFlags::ignoreIterationMark_ja_JP
|
48 TransliterationFlags::ignoreTiJi_ja_JP
|
49 TransliterationFlags::ignoreHyuByu_ja_JP
|
50 TransliterationFlags::ignoreSeZe_ja_JP
|
51 TransliterationFlags::ignoreIandEfollowedByYa_ja_JP
|
52 TransliterationFlags::ignoreKiKuFollowedBySa_ja_JP
|
53 TransliterationFlags::ignoreProlongedSoundMark_ja_JP
;
57 TransliterationFlags
maskComplexTrans( TransliterationFlags n
)
59 // IGNORE_KANA and FULLWIDTH_HALFWIDTH are simple but need to take effect
60 // in complex transliteration.
62 n
& (COMPLEX_TRANS_MASK
| // all set ignore bits
63 TransliterationFlags::IGNORE_KANA
| // plus IGNORE_KANA bit
64 TransliterationFlags::FULLWIDTH_HALFWIDTH
); // and the FULLWIDTH_HALFWIDTH value
67 bool isComplexTrans( TransliterationFlags n
)
69 return bool(n
& COMPLEX_TRANS_MASK
);
72 TransliterationFlags
maskSimpleTrans( TransliterationFlags n
)
74 return n
& ~COMPLEX_TRANS_MASK
;
77 bool isSimpleTrans( TransliterationFlags n
)
79 return bool(maskSimpleTrans(n
));
82 // Regex patterns are case sensitive.
83 TransliterationFlags
maskSimpleRegexTrans( TransliterationFlags n
)
85 TransliterationFlags m
= (n
& TransliterationFlags::IGNORE_MASK
) & ~TransliterationFlags::IGNORE_CASE
;
86 TransliterationFlags v
= n
& TransliterationFlags::NON_IGNORE_MASK
;
87 if (v
== TransliterationFlags::UPPERCASE_LOWERCASE
|| v
== TransliterationFlags::LOWERCASE_UPPERCASE
)
88 v
= TransliterationFlags::NONE
;
89 return (m
| v
) & ~COMPLEX_TRANS_MASK
;
92 bool isSimpleRegexTrans( TransliterationFlags n
)
94 return bool(maskSimpleRegexTrans(n
));
97 bool isReplacePunctuation( OUString
&rStr
)
99 return rStr
.indexOf(u
'\u2018') > -1 ||
100 rStr
.indexOf(u
'\u2019') > -1 ||
101 rStr
.indexOf(u
'\u201A') > -1 ||
102 rStr
.indexOf(u
'\u201B') > -1 ||
103 rStr
.indexOf(u
'\u201C') > -1 ||
104 rStr
.indexOf(u
'\u201D') > -1 ||
105 rStr
.indexOf(u
'\u201E') > -1 ||
106 rStr
.indexOf(u
'\u201F') > -1;
109 OUString
replacePunctuation( OUString
&rStr
)
111 return rStr
.replace(u
'\u2018', '\'')
112 .replace(u
'\u2019', '\'')
113 .replace(u
'\u201A', '\'')
114 .replace(u
'\u201B', '\'')
115 .replace(u
'\u201C', '"')
116 .replace(u
'\u201D', '"')
117 .replace(u
'\u201E', '"')
118 .replace(u
'\u201F', '"');
122 TextSearch::TextSearch(const Reference
< XComponentContext
> & rxContext
)
123 : m_xContext( rxContext
)
126 aOpt
.AlgorithmType2
= SearchAlgorithms2::ABSOLUTE
;
127 aOpt
.algorithmType
= SearchAlgorithms_ABSOLUTE
;
128 aOpt
.searchFlag
= SearchFlags::ALL_IGNORE_CASE
;
133 TextSearch::~TextSearch()
135 pRegexMatcher
.reset();
141 void TextSearch::setOptions2( const SearchOptions2
& rOptions
)
143 std::unique_lock
g(m_aMutex
);
145 aSrchPara
= rOptions
;
147 pRegexMatcher
.reset();
151 maWildcardReversePattern
.clear();
152 maWildcardReversePattern2
.clear();
153 TransliterationFlags transliterateFlags
= static_cast<TransliterationFlags
>(aSrchPara
.transliterateFlags
);
154 bSearchApostrophe
= false;
155 bool bReplaceApostrophe
= false;
156 if (aSrchPara
.AlgorithmType2
== SearchAlgorithms2::REGEXP
)
158 // RESrchPrepare will consider aSrchPara.transliterateFlags when
159 // picking the actual regex pattern
160 // (sSrchStr|sSrchStr2|SearchOptions2::searchString) and setting
161 // case-insensitivity. Create transliteration instance, if any, without
162 // ignore-case so later in TextSearch::searchForward() the string to
163 // match is not case-altered, leave case-(in)sensitive to regex engine.
164 transliterateFlags
&= ~TransliterationFlags::IGNORE_CASE
;
166 else if ( aSrchPara
.searchString
.indexOf('\'') > - 1 || aSrchPara
.searchString
.indexOf('"') > - 1 )
168 bSearchApostrophe
= true;
169 bReplaceApostrophe
= isReplacePunctuation(aSrchPara
.searchString
);
172 // Create Transliteration class
173 if( isSimpleTrans( transliterateFlags
) )
175 if( !xTranslit
.is() )
176 xTranslit
.set( Transliteration::create( m_xContext
) );
177 xTranslit
->loadModule(
178 static_cast<TransliterationModules
>(maskSimpleTrans(transliterateFlags
)),
181 else if( xTranslit
.is() )
184 // Create Transliteration for 2<->1, 2<->2 transliteration
185 if ( isComplexTrans( transliterateFlags
) )
187 if( !xTranslit2
.is() )
188 xTranslit2
.set( Transliteration::create( m_xContext
) );
189 // Load transliteration module
190 xTranslit2
->loadModule(
191 static_cast<TransliterationModules
>(maskComplexTrans(transliterateFlags
)),
196 xBreak
= css::i18n::BreakIterator::create( m_xContext
);
198 sSrchStr
= aSrchPara
.searchString
;
200 // Transliterate search string.
201 if (aSrchPara
.AlgorithmType2
== SearchAlgorithms2::REGEXP
)
203 if (isSimpleRegexTrans(transliterateFlags
))
205 if (maskSimpleRegexTrans(transliterateFlags
) !=
206 maskSimpleTrans(transliterateFlags
))
208 css::uno::Reference
< XExtendedTransliteration
> xTranslitPattern(
209 Transliteration::create( m_xContext
));
210 if (xTranslitPattern
.is())
212 xTranslitPattern
->loadModule(
213 static_cast<TransliterationModules
>(maskSimpleRegexTrans(transliterateFlags
)),
215 sSrchStr
= xTranslitPattern
->transliterateString2String(
216 aSrchPara
.searchString
, 0, aSrchPara
.searchString
.getLength());
222 sSrchStr
= xTranslit
->transliterateString2String(
223 aSrchPara
.searchString
, 0, aSrchPara
.searchString
.getLength());
225 // xTranslit2 complex transliterated sSrchStr2 is not used in
226 // regex, see TextSearch::searchForward() and
227 // TextSearch::searchBackward()
232 if ( xTranslit
.is() && isSimpleTrans(transliterateFlags
) )
233 sSrchStr
= xTranslit
->transliterateString2String(
234 aSrchPara
.searchString
, 0, aSrchPara
.searchString
.getLength());
236 if ( xTranslit2
.is() && isComplexTrans(transliterateFlags
) )
237 sSrchStr2
= xTranslit2
->transliterateString2String(
238 aSrchPara
.searchString
, 0, aSrchPara
.searchString
.getLength());
241 if ( bReplaceApostrophe
)
242 sSrchStr
= replacePunctuation(sSrchStr
);
244 // Take the new SearchOptions2::AlgorithmType2 field and ignore
245 // SearchOptions::algorithmType
246 switch( aSrchPara
.AlgorithmType2
)
248 case SearchAlgorithms2::REGEXP
:
249 fnForward
= &TextSearch::RESrchFrwrd
;
250 fnBackward
= &TextSearch::RESrchBkwrd
;
251 RESrchPrepare( aSrchPara
);
254 case SearchAlgorithms2::APPROXIMATE
:
255 fnForward
= &TextSearch::ApproxSrchFrwrd
;
256 fnBackward
= &TextSearch::ApproxSrchBkwrd
;
258 pWLD
.reset( new WLevDistance( sSrchStr
.getStr(), aSrchPara
.changedChars
,
259 aSrchPara
.insertedChars
, aSrchPara
.deletedChars
,
260 0 != (SearchFlags::LEV_RELAXED
& aSrchPara
.searchFlag
) ) );
262 nLimit
= pWLD
->GetLimit();
265 case SearchAlgorithms2::WILDCARD
:
266 mcWildcardEscapeChar
= static_cast<sal_uInt32
>(aSrchPara
.WildcardEscapeCharacter
);
267 mbWildcardAllowSubstring
= ((aSrchPara
.searchFlag
& SearchFlags::WILD_MATCH_SELECTION
) == 0);
268 fnForward
= &TextSearch::WildcardSrchFrwrd
;
269 fnBackward
= &TextSearch::WildcardSrchBkwrd
;
273 SAL_WARN("i18npool","TextSearch::setOptions2 - default what?");
275 case SearchAlgorithms2::ABSOLUTE
:
276 fnForward
= &TextSearch::NSrchFrwrd
;
277 fnBackward
= &TextSearch::NSrchBkwrd
;
282 void TextSearch::setOptions( const SearchOptions
& rOptions
)
284 sal_Int16 nAlgorithmType2
;
285 switch (rOptions
.algorithmType
)
287 case SearchAlgorithms_REGEXP
:
288 nAlgorithmType2
= SearchAlgorithms2::REGEXP
;
290 case SearchAlgorithms_APPROXIMATE
:
291 nAlgorithmType2
= SearchAlgorithms2::APPROXIMATE
;
294 SAL_WARN("i18npool","TextSearch::setOptions - default what?");
296 case SearchAlgorithms_ABSOLUTE
:
297 nAlgorithmType2
= SearchAlgorithms2::ABSOLUTE
;
300 // It would be nice if an inherited struct had a ctor that takes an
301 // instance of the object the struct derived from...
302 SearchOptions2
aOptions2(
303 rOptions
.algorithmType
,
305 rOptions
.searchString
,
306 rOptions
.replaceString
,
308 rOptions
.changedChars
,
309 rOptions
.deletedChars
,
310 rOptions
.insertedChars
,
311 rOptions
.transliterateFlags
,
313 0 // no wildcard search, no escape character...
315 setOptions2( aOptions2
);
318 static sal_Int32
FindPosInSeq_Impl( const Sequence
<sal_Int32
>& rOff
, sal_Int32 nPos
)
320 auto pOff
= std::find_if(rOff
.begin(), rOff
.end(),
321 [nPos
](const sal_Int32 nOff
) { return nOff
>= nPos
; });
322 return static_cast<sal_Int32
>(std::distance(rOff
.begin(), pOff
));
325 SearchResult
TextSearch::searchForward( const OUString
& searchStr
, sal_Int32 startPos
, sal_Int32 endPos
)
327 std::unique_lock
g(m_aMutex
);
331 OUString
in_str(searchStr
);
333 // in non-regex mode, allow searching typographical apostrophe with the ASCII one
334 // to avoid regression after using automatic conversion to U+2019 during typing in Writer
335 bool bReplaceApostrophe
= bSearchApostrophe
&& isReplacePunctuation(in_str
);
337 bUsePrimarySrchStr
= true;
339 if ( xTranslit
.is() )
341 // apply normal transliteration (1<->1, 1<->0)
343 sal_Int32 nInStartPos
= startPos
;
344 if (pRegexMatcher
&& startPos
> 0)
346 // tdf#89665, tdf#75806: An optimization to avoid transliterating the whole string, yet
347 // transliterate enough of the leading text to allow sensible look-behind assertions.
348 // 100 is chosen arbitrarily in the hope that look-behind assertions would largely fit.
349 // See http://userguide.icu-project.org/strings/regexp for look-behind assertion syntax.
350 // When search regex doesn't start with an assertion, 3 is to allow startPos to be in
351 // the middle of a surrogate pair, preceded by another surrogate pair.
352 const sal_Int32 nMaxLeadingLen
= aSrchPara
.searchString
.startsWith("(?") ? 100 : 3;
353 nInStartPos
-= std::min(nMaxLeadingLen
, startPos
);
355 sal_Int32 nInEndPos
= endPos
;
356 if (pRegexMatcher
&& endPos
< searchStr
.getLength())
358 // tdf#65038: ditto for look-ahead assertions
359 const sal_Int32 nMaxTrailingLen
= aSrchPara
.searchString
.endsWith(")") ? 100 : 3;
360 nInEndPos
+= std::min(nMaxTrailingLen
, searchStr
.getLength() - endPos
);
363 css::uno::Sequence
<sal_Int32
> offset(nInEndPos
- nInStartPos
);
364 in_str
= xTranslit
->transliterate(searchStr
, nInStartPos
, nInEndPos
- nInStartPos
, offset
);
366 if ( bReplaceApostrophe
)
367 in_str
= replacePunctuation(in_str
);
369 // JP 20.6.2001: also the start and end positions must be corrected!
370 sal_Int32 newStartPos
=
371 (startPos
== 0) ? 0 : FindPosInSeq_Impl( offset
, startPos
);
373 sal_Int32 newEndPos
= (endPos
< searchStr
.getLength())
374 ? FindPosInSeq_Impl( offset
, endPos
)
375 : in_str
.getLength();
377 sres
= (this->*fnForward
)( g
, in_str
, newStartPos
, newEndPos
);
379 // Map offsets back to untransliterated string.
380 const sal_Int32 nOffsets
= offset
.getLength();
383 auto sres_startOffsetRange
= asNonConstRange(sres
.startOffset
);
384 auto sres_endOffsetRange
= asNonConstRange(sres
.endOffset
);
385 // For regex nGroups is the number of groups+1 with group 0 being
387 const sal_Int32 nGroups
= sres
.startOffset
.getLength();
388 for ( sal_Int32 k
= 0; k
< nGroups
; k
++ )
390 const sal_Int32 nStart
= sres
.startOffset
[k
];
391 // Result offsets are negative (-1) if a group expression was
394 sres_startOffsetRange
[k
] = (nStart
< nOffsets
? offset
[nStart
] : (offset
[nOffsets
- 1] + 1));
395 // JP 20.6.2001: end is ever exclusive and then don't return
396 // the position of the next character - return the
397 // next position behind the last found character!
398 // "a b c" find "b" must return 2,3 and not 2,4!!!
399 const sal_Int32 nStop
= sres
.endOffset
[k
];
403 sres_endOffsetRange
[k
] = offset
[(nStop
<= nOffsets
? nStop
: nOffsets
) - 1] + 1;
405 sres_endOffsetRange
[k
] = offset
[0];
412 if ( bReplaceApostrophe
)
413 in_str
= in_str
.replace(u
'\u2019', '\'');
415 sres
= (this->*fnForward
)( g
, in_str
, startPos
, endPos
);
418 if ( xTranslit2
.is() && aSrchPara
.AlgorithmType2
!= SearchAlgorithms2::REGEXP
)
423 css::uno::Sequence
<sal_Int32
> offset( in_str
.getLength());
425 in_str
= xTranslit2
->transliterate( searchStr
, 0, in_str
.getLength(), offset
);
428 startPos
= FindPosInSeq_Impl( offset
, startPos
);
430 if( endPos
< searchStr
.getLength() )
431 endPos
= FindPosInSeq_Impl( offset
, endPos
);
433 endPos
= in_str
.getLength();
435 bUsePrimarySrchStr
= false;
436 sres2
= (this->*fnForward
)( g
, in_str
, startPos
, endPos
);
437 auto sres2_startOffsetRange
= asNonConstRange(sres2
.startOffset
);
438 auto sres2_endOffsetRange
= asNonConstRange(sres2
.endOffset
);
440 for ( int k
= 0; k
< sres2
.startOffset
.getLength(); k
++ )
442 if (sres2
.startOffset
[k
])
443 sres2_startOffsetRange
[k
] = offset
[sres2
.startOffset
[k
]-1] + 1;
444 if (sres2
.endOffset
[k
])
445 sres2_endOffsetRange
[k
] = offset
[sres2
.endOffset
[k
]-1] + 1;
448 // pick first and long one
449 if ( sres
.subRegExpressions
== 0)
451 if ( sres2
.subRegExpressions
== 1)
453 if ( sres
.startOffset
[0] > sres2
.startOffset
[0])
455 else if ( sres
.startOffset
[0] == sres2
.startOffset
[0] &&
456 sres
.endOffset
[0] < sres2
.endOffset
[0])
464 SearchResult
TextSearch::searchBackward( const OUString
& searchStr
, sal_Int32 startPos
, sal_Int32 endPos
)
466 std::unique_lock
g(m_aMutex
);
470 OUString
in_str(searchStr
);
472 // in non-regex mode, allow searching typographical apostrophe with the ASCII one
473 // to avoid regression after using automatic conversion to U+2019 during typing in Writer
474 bool bReplaceApostrophe
= bSearchApostrophe
&& isReplacePunctuation(in_str
);
476 bUsePrimarySrchStr
= true;
478 if ( xTranslit
.is() )
480 // apply only simple 1<->1 transliteration here
481 css::uno::Sequence
<sal_Int32
> offset(startPos
- endPos
);
482 in_str
= xTranslit
->transliterate( searchStr
, endPos
, startPos
- endPos
, offset
);
484 if ( bReplaceApostrophe
)
485 in_str
= replacePunctuation(in_str
);
487 // JP 20.6.2001: also the start and end positions must be corrected!
488 sal_Int32
const newStartPos
= (startPos
< searchStr
.getLength())
489 ? FindPosInSeq_Impl( offset
, startPos
)
490 : in_str
.getLength();
492 sal_Int32
const newEndPos
=
493 (endPos
== 0) ? 0 : FindPosInSeq_Impl( offset
, endPos
);
495 // TODO: this would need nExtraOffset handling to avoid $ matching
496 // if (pRegexMatcher && startPos < searchStr.getLength())
497 // but that appears to be impossible with ICU regex
499 sres
= (this->*fnBackward
)( g
, in_str
, newStartPos
, newEndPos
);
501 // Map offsets back to untransliterated string.
502 const sal_Int32 nOffsets
= offset
.getLength();
505 auto sres_startOffsetRange
= asNonConstRange(sres
.startOffset
);
506 auto sres_endOffsetRange
= asNonConstRange(sres
.endOffset
);
507 // For regex nGroups is the number of groups+1 with group 0 being
509 const sal_Int32 nGroups
= sres
.startOffset
.getLength();
510 for ( sal_Int32 k
= 0; k
< nGroups
; k
++ )
512 const sal_Int32 nStart
= sres
.startOffset
[k
];
513 // Result offsets are negative (-1) if a group expression was
518 sres_startOffsetRange
[k
] = offset
[(nStart
<= nOffsets
? nStart
: nOffsets
) - 1] + 1;
520 sres_startOffsetRange
[k
] = offset
[0];
522 // JP 20.6.2001: end is ever exclusive and then don't return
523 // the position of the next character - return the
524 // next position behind the last found character!
525 // "a b c" find "b" must return 2,3 and not 2,4!!!
526 const sal_Int32 nStop
= sres
.endOffset
[k
];
528 sres_endOffsetRange
[k
] = (nStop
< nOffsets
? offset
[nStop
] : (offset
[nOffsets
- 1] + 1));
534 if ( bReplaceApostrophe
)
535 in_str
= replacePunctuation(in_str
);
537 sres
= (this->*fnBackward
)( g
, in_str
, startPos
, endPos
);
540 if ( xTranslit2
.is() && aSrchPara
.AlgorithmType2
!= SearchAlgorithms2::REGEXP
)
545 css::uno::Sequence
<sal_Int32
> offset( in_str
.getLength());
547 in_str
= xTranslit2
->transliterate(searchStr
, 0, in_str
.getLength(), offset
);
549 if( startPos
< searchStr
.getLength() )
550 startPos
= FindPosInSeq_Impl( offset
, startPos
);
552 startPos
= in_str
.getLength();
555 endPos
= FindPosInSeq_Impl( offset
, endPos
);
557 bUsePrimarySrchStr
= false;
558 sres2
= (this->*fnBackward
)( g
, in_str
, startPos
, endPos
);
559 auto sres2_startOffsetRange
= asNonConstRange(sres2
.startOffset
);
560 auto sres2_endOffsetRange
= asNonConstRange(sres2
.endOffset
);
562 for( int k
= 0; k
< sres2
.startOffset
.getLength(); k
++ )
564 if (sres2
.startOffset
[k
])
565 sres2_startOffsetRange
[k
] = offset
[sres2
.startOffset
[k
]-1]+1;
566 if (sres2
.endOffset
[k
])
567 sres2_endOffsetRange
[k
] = offset
[sres2
.endOffset
[k
]-1]+1;
570 // pick last and long one
571 if ( sres
.subRegExpressions
== 0 )
573 if ( sres2
.subRegExpressions
== 1 )
575 if ( sres
.startOffset
[0] < sres2
.startOffset
[0] )
577 if ( sres
.startOffset
[0] == sres2
.startOffset
[0] &&
578 sres
.endOffset
[0] > sres2
.endOffset
[0] )
587 bool TextSearch::IsDelimiter( const OUString
& rStr
, sal_Int32 nPos
) const
590 if( '\x7f' != rStr
[nPos
])
592 if ( !xCharClass
.is() )
593 xCharClass
= CharacterClassification::create( m_xContext
);
594 sal_Int32 nCType
= xCharClass
->getCharacterType( rStr
, nPos
,
596 if( 0 != (( KCharacterType::DIGIT
| KCharacterType::ALPHA
|
597 KCharacterType::LETTER
) & nCType
) )
603 // --------- helper methods for Boyer-Moore like text searching ----------
604 // TODO: use ICU's regex UREGEX_LITERAL mode instead when it becomes available
606 void TextSearch::MakeForwardTab()
608 // create the jumptable for the search text
610 if( pJumpTable
&& bIsForwardTab
)
612 return; // the jumpTable is ok
614 bIsForwardTab
= true;
616 sal_Int32 n
, nLen
= sSrchStr
.getLength();
617 pJumpTable
.reset( new TextSearchJumpTable
);
619 for( n
= 0; n
< nLen
- 1; ++n
)
621 sal_Unicode cCh
= sSrchStr
[n
];
622 sal_Int32 nDiff
= nLen
- n
- 1;
623 TextSearchJumpTable::value_type
aEntry( cCh
, nDiff
);
625 ::std::pair
< TextSearchJumpTable::iterator
, bool > aPair
=
626 pJumpTable
->insert( aEntry
);
628 (*(aPair
.first
)).second
= nDiff
;
632 void TextSearch::MakeForwardTab2()
634 // create the jumptable for the search text
635 if( pJumpTable2
&& bIsForwardTab
)
637 return; // the jumpTable is ok
639 bIsForwardTab
= true;
641 sal_Int32 n
, nLen
= sSrchStr2
.getLength();
642 pJumpTable2
.reset( new TextSearchJumpTable
);
644 for( n
= 0; n
< nLen
- 1; ++n
)
646 sal_Unicode cCh
= sSrchStr2
[n
];
647 sal_Int32 nDiff
= nLen
- n
- 1;
649 TextSearchJumpTable::value_type
aEntry( cCh
, nDiff
);
650 ::std::pair
< TextSearchJumpTable::iterator
, bool > aPair
=
651 pJumpTable2
->insert( aEntry
);
653 (*(aPair
.first
)).second
= nDiff
;
657 void TextSearch::MakeBackwardTab()
659 // create the jumptable for the search text
660 if( pJumpTable
&& !bIsForwardTab
)
662 return; // the jumpTable is ok
664 bIsForwardTab
= false;
666 sal_Int32 n
, nLen
= sSrchStr
.getLength();
667 pJumpTable
.reset( new TextSearchJumpTable
);
669 for( n
= nLen
-1; n
> 0; --n
)
671 sal_Unicode cCh
= sSrchStr
[n
];
672 TextSearchJumpTable::value_type
aEntry( cCh
, n
);
673 ::std::pair
< TextSearchJumpTable::iterator
, bool > aPair
=
674 pJumpTable
->insert( aEntry
);
676 (*(aPair
.first
)).second
= n
;
680 void TextSearch::MakeBackwardTab2()
682 // create the jumptable for the search text
683 if( pJumpTable2
&& !bIsForwardTab
)
685 return; // the jumpTable is ok
687 bIsForwardTab
= false;
689 sal_Int32 n
, nLen
= sSrchStr2
.getLength();
690 pJumpTable2
.reset( new TextSearchJumpTable
);
692 for( n
= nLen
-1; n
> 0; --n
)
694 sal_Unicode cCh
= sSrchStr2
[n
];
695 TextSearchJumpTable::value_type
aEntry( cCh
, n
);
696 ::std::pair
< TextSearchJumpTable::iterator
, bool > aPair
=
697 pJumpTable2
->insert( aEntry
);
699 (*(aPair
.first
)).second
= n
;
703 sal_Int32
TextSearch::GetDiff( const sal_Unicode cChr
) const
705 TextSearchJumpTable
*pJump
;
708 if ( bUsePrimarySrchStr
) {
709 pJump
= pJumpTable
.get();
710 sSearchKey
= sSrchStr
;
712 pJump
= pJumpTable2
.get();
713 sSearchKey
= sSrchStr2
;
716 TextSearchJumpTable::const_iterator iLook
= pJump
->find( cChr
);
717 if ( iLook
== pJump
->end() )
718 return sSearchKey
.getLength();
719 return (*iLook
).second
;
723 SearchResult
TextSearch::NSrchFrwrd( std::unique_lock
<std::mutex
>& /*rGuard*/, const OUString
& searchStr
, sal_Int32 startPos
, sal_Int32 endPos
)
726 aRet
.subRegExpressions
= 0;
728 OUString sSearchKey
= bUsePrimarySrchStr
? sSrchStr
: sSrchStr2
;
730 sal_Int32 nSuchIdx
= searchStr
.getLength();
731 sal_Int32 nEnd
= endPos
;
732 if( !nSuchIdx
|| !sSearchKey
.getLength() || sSearchKey
.getLength() > nSuchIdx
)
736 if( nEnd
< sSearchKey
.getLength() ) // position inside the search region ?
739 nEnd
-= sSearchKey
.getLength();
741 if (bUsePrimarySrchStr
)
742 MakeForwardTab(); // create the jumptable
746 for (sal_Int32 nCmpIdx
= startPos
; // start position for the search
748 nCmpIdx
+= GetDiff( searchStr
[nCmpIdx
+ sSearchKey
.getLength()-1]))
750 nSuchIdx
= sSearchKey
.getLength() - 1;
751 while( nSuchIdx
>= 0 && sSearchKey
[nSuchIdx
] == searchStr
[nCmpIdx
+ nSuchIdx
])
755 if( SearchFlags::NORM_WORD_ONLY
& aSrchPara
.searchFlag
)
757 sal_Int32 nFndEnd
= nCmpIdx
+ sSearchKey
.getLength();
758 bool bAtStart
= !nCmpIdx
;
759 bool bAtEnd
= nFndEnd
== endPos
;
760 bool bDelimBefore
= bAtStart
|| IsDelimiter( searchStr
, nCmpIdx
-1 );
761 bool bDelimBehind
= bAtEnd
|| IsDelimiter( searchStr
, nFndEnd
);
762 // * 1 -> only one word in the paragraph
763 // * 2 -> at begin of paragraph
764 // * 3 -> at end of paragraph
765 // * 4 -> inside the paragraph
766 if( !( ( bAtStart
&& bAtEnd
) || // 1
767 ( bAtStart
&& bDelimBehind
) || // 2
768 ( bAtEnd
&& bDelimBefore
) || // 3
769 ( bDelimBefore
&& bDelimBehind
))) // 4
773 aRet
.subRegExpressions
= 1;
774 aRet
.startOffset
= { nCmpIdx
};
775 aRet
.endOffset
= { nCmpIdx
+ sSearchKey
.getLength() };
786 SearchResult
TextSearch::NSrchBkwrd( std::unique_lock
<std::mutex
>& /*rGuard*/,const OUString
& searchStr
, sal_Int32 startPos
, sal_Int32 endPos
)
789 aRet
.subRegExpressions
= 0;
791 OUString sSearchKey
= bUsePrimarySrchStr
? sSrchStr
: sSrchStr2
;
793 sal_Int32 nSuchIdx
= searchStr
.getLength();
794 sal_Int32 nEnd
= endPos
;
795 if( nSuchIdx
== 0 || sSearchKey
.isEmpty() || sSearchKey
.getLength() > nSuchIdx
)
798 if (bUsePrimarySrchStr
)
799 MakeBackwardTab(); // create the jumptable
803 if( nEnd
== nSuchIdx
) // end position for the search
804 nEnd
= sSearchKey
.getLength();
806 nEnd
+= sSearchKey
.getLength();
808 sal_Int32 nCmpIdx
= startPos
; // start position for the search
810 while (nCmpIdx
>= nEnd
)
813 while( nSuchIdx
< sSearchKey
.getLength() && sSearchKey
[nSuchIdx
] ==
814 searchStr
[nCmpIdx
+ nSuchIdx
- sSearchKey
.getLength()] )
816 if( nSuchIdx
>= sSearchKey
.getLength() )
818 if( SearchFlags::NORM_WORD_ONLY
& aSrchPara
.searchFlag
)
820 sal_Int32 nFndStt
= nCmpIdx
- sSearchKey
.getLength();
821 bool bAtStart
= !nFndStt
;
822 bool bAtEnd
= nCmpIdx
== startPos
;
823 bool bDelimBehind
= bAtEnd
|| IsDelimiter( searchStr
, nCmpIdx
);
824 bool bDelimBefore
= bAtStart
|| // begin of paragraph
825 IsDelimiter( searchStr
, nFndStt
-1 );
826 // * 1 -> only one word in the paragraph
827 // * 2 -> at begin of paragraph
828 // * 3 -> at end of paragraph
829 // * 4 -> inside the paragraph
830 if( ( bAtStart
&& bAtEnd
) || // 1
831 ( bAtStart
&& bDelimBehind
) || // 2
832 ( bAtEnd
&& bDelimBefore
) || // 3
833 ( bDelimBefore
&& bDelimBehind
)) // 4
835 aRet
.subRegExpressions
= 1;
836 aRet
.startOffset
= { nCmpIdx
};
837 aRet
.endOffset
= { nCmpIdx
- sSearchKey
.getLength() };
843 aRet
.subRegExpressions
= 1;
844 aRet
.startOffset
= { nCmpIdx
};
845 aRet
.endOffset
= { nCmpIdx
- sSearchKey
.getLength() };
849 nSuchIdx
= GetDiff( searchStr
[nCmpIdx
- sSearchKey
.getLength()] );
850 if( nCmpIdx
< nSuchIdx
)
857 void TextSearch::RESrchPrepare( const css::util::SearchOptions2
& rOptions
)
859 TransliterationFlags transliterateFlags
= static_cast<TransliterationFlags
>(rOptions
.transliterateFlags
);
860 // select the transliterated pattern string
861 const OUString
& rPatternStr
=
862 (isSimpleTrans(transliterateFlags
) ? sSrchStr
863 : (isComplexTrans(transliterateFlags
) ? sSrchStr2
: rOptions
.searchString
));
865 sal_uInt32 nIcuSearchFlags
= UREGEX_UWORD
; // request UAX#29 unicode capability
866 // map css::util::SearchFlags to ICU uregex.h flags
867 // TODO: REG_EXTENDED, REG_NOT_BEGINOFLINE, REG_NOT_ENDOFLINE
868 // REG_NEWLINE is neither properly defined nor used anywhere => not implemented
869 // REG_NOSUB is not used anywhere => not implemented
870 // NORM_WORD_ONLY is only used for SearchAlgorithm==Absolute
871 // LEV_RELAXED is only used for SearchAlgorithm==Approximate
872 // Note that the search flag ALL_IGNORE_CASE is deprecated in UNO
873 // probably because the transliteration flag IGNORE_CASE handles it as well.
874 if( (rOptions
.searchFlag
& css::util::SearchFlags::ALL_IGNORE_CASE
) != 0
875 || (transliterateFlags
& TransliterationFlags::IGNORE_CASE
))
876 nIcuSearchFlags
|= UREGEX_CASE_INSENSITIVE
;
877 UErrorCode nIcuErr
= U_ZERO_ERROR
;
878 // assumption: transliteration didn't mangle regexp control chars
879 icu::UnicodeString
aIcuSearchPatStr( reinterpret_cast<const UChar
*>(rPatternStr
.getStr()), rPatternStr
.getLength());
880 #ifndef DISABLE_WORDBOUND_EMULATION
881 // for convenience specific syntax elements of the old regex engine are emulated
882 // - by replacing \< with "word-break followed by a look-ahead word-char"
883 static const icu::UnicodeString
aChevronPatternB( "\\\\<", -1, icu::UnicodeString::kInvariant
);
884 static const icu::UnicodeString
aChevronReplaceB( "\\\\b(?=\\\\w)", -1, icu::UnicodeString::kInvariant
);
885 static icu::RegexMatcher
aChevronMatcherB( aChevronPatternB
, 0, nIcuErr
);
886 aChevronMatcherB
.reset( aIcuSearchPatStr
);
887 aIcuSearchPatStr
= aChevronMatcherB
.replaceAll( aChevronReplaceB
, nIcuErr
);
888 aChevronMatcherB
.reset();
889 // - by replacing \> with "look-behind word-char followed by a word-break"
890 static const icu::UnicodeString
aChevronPatternE( "\\\\>", -1, icu::UnicodeString::kInvariant
);
891 static const icu::UnicodeString
aChevronReplaceE( "(?<=\\\\w)\\\\b", -1, icu::UnicodeString::kInvariant
);
892 static icu::RegexMatcher
aChevronMatcherE( aChevronPatternE
, 0, nIcuErr
);
893 aChevronMatcherE
.reset( aIcuSearchPatStr
);
894 aIcuSearchPatStr
= aChevronMatcherE
.replaceAll( aChevronReplaceE
, nIcuErr
);
895 aChevronMatcherE
.reset();
897 pRegexMatcher
.reset( new icu::RegexMatcher( aIcuSearchPatStr
, nIcuSearchFlags
, nIcuErr
) );
900 SAL_INFO( "i18npool", "TextSearch::RESrchPrepare UErrorCode " << nIcuErr
);
901 pRegexMatcher
.reset();
905 // Pathological patterns may result in exponential run time making the
906 // application appear to be frozen. Limit that. Documentation for this
908 // https://unicode-org.github.io/icu-docs/apidoc/released/icu4c/classicu_1_1RegexMatcher.html#a6ebcfcab4fe6a38678c0291643a03a00
909 // "The units of the limit are steps of the match engine.
910 // Correspondence with actual processor time will depend on the speed
911 // of the processor and the details of the specific pattern, but will
912 // typically be on the order of milliseconds."
913 // Just what is a good value? 42 is always an answer ... the 23 enigma
914 // as well... which on the dev's machine is roughly 50 seconds with the
915 // pattern of fdo#70627.
916 /* TODO: make this a configuration settable value and possibly take
917 * complexity of expression into account and maybe even length of text
918 * to be matched; currently (2013-11-25) that is at most one 64k
919 * paragraph per RESrchFrwrd()/RESrchBkwrd() call. */
920 pRegexMatcher
->setTimeLimit( 23*1000, nIcuErr
);
925 static bool lcl_findRegex(std::unique_ptr
<icu::RegexMatcher
> const& pRegexMatcher
,
926 sal_Int32 nStartPos
, sal_Int32 nEndPos
, UErrorCode
& rIcuErr
)
928 pRegexMatcher
->region(nStartPos
, nEndPos
, rIcuErr
);
929 pRegexMatcher
->useAnchoringBounds(false); // use whole text's anchoring bounds, not region's
930 pRegexMatcher
->useTransparentBounds(true); // take text outside of the region into account for
931 // look-ahead/behind assertions
933 if (!pRegexMatcher
->find(rIcuErr
))
935 /* TODO: future versions could pass the UErrorCode or translations
936 * thereof to the caller, for example to inform the user of
937 * U_REGEX_TIME_OUT. The strange thing though is that an error is set
938 * only after the second call that returns immediately and not if
939 * timeout occurred on the first call?!? */
940 SAL_INFO( "i18npool", "lcl_findRegex UErrorCode " << rIcuErr
);
946 SearchResult
TextSearch::RESrchFrwrd( std::unique_lock
<std::mutex
>& /*rGuard*/, const OUString
& searchStr
,
947 sal_Int32 startPos
, sal_Int32 endPos
)
950 aRet
.subRegExpressions
= 0;
954 if( endPos
> searchStr
.getLength())
955 endPos
= searchStr
.getLength();
957 // use the ICU RegexMatcher to find the matches
958 UErrorCode nIcuErr
= U_ZERO_ERROR
;
959 const icu::UnicodeString
aSearchTargetStr(false, reinterpret_cast<const UChar
*>(searchStr
.getStr()),
960 searchStr
.getLength());
961 pRegexMatcher
->reset( aSearchTargetStr
);
962 // search until there is a valid match
965 if (!lcl_findRegex( pRegexMatcher
, startPos
, endPos
, nIcuErr
))
968 // #i118887# ignore zero-length matches e.g. "a*" in "bc"
969 int nStartOfs
= pRegexMatcher
->start( nIcuErr
);
970 int nEndOfs
= pRegexMatcher
->end( nIcuErr
);
971 if( nStartOfs
< nEndOfs
)
973 // If the zero-length match is behind the string, do not match it again
974 // and again until startPos reaches there. A match behind the string is
976 if (nStartOfs
== endPos
)
978 // try at next position if there was a zero-length match
979 if( ++startPos
>= endPos
)
983 // extract the result of the search
984 const int nGroupCount
= pRegexMatcher
->groupCount();
985 aRet
.subRegExpressions
= nGroupCount
+ 1;
986 aRet
.startOffset
.realloc( aRet
.subRegExpressions
);
987 auto pstartOffset
= aRet
.startOffset
.getArray();
988 aRet
.endOffset
.realloc( aRet
.subRegExpressions
);
989 auto pendOffset
= aRet
.endOffset
.getArray();
990 pstartOffset
[0] = pRegexMatcher
->start( nIcuErr
);
991 pendOffset
[0] = pRegexMatcher
->end( nIcuErr
);
992 for( int i
= 1; i
<= nGroupCount
; ++i
) {
993 pstartOffset
[i
] = pRegexMatcher
->start( i
, nIcuErr
);
994 pendOffset
[i
] = pRegexMatcher
->end( i
, nIcuErr
);
1000 SearchResult
TextSearch::RESrchBkwrd( std::unique_lock
<std::mutex
>& /*rGuard*/, const OUString
& searchStr
,
1001 sal_Int32 startPos
, sal_Int32 endPos
)
1003 // NOTE: for backwards search callers provide startPos/endPos inverted!
1005 aRet
.subRegExpressions
= 0;
1009 if( startPos
> searchStr
.getLength())
1010 startPos
= searchStr
.getLength();
1012 // use the ICU RegexMatcher to find the matches
1013 // TODO: use ICU's backward searching once it becomes available
1014 // as its replacement using forward search is not as good as the real thing
1015 UErrorCode nIcuErr
= U_ZERO_ERROR
;
1016 const icu::UnicodeString
aSearchTargetStr(false, reinterpret_cast<const UChar
*>(searchStr
.getStr()),
1017 searchStr
.getLength());
1018 pRegexMatcher
->reset( aSearchTargetStr
);
1019 if (!lcl_findRegex( pRegexMatcher
, endPos
, startPos
, nIcuErr
))
1022 // find the last match
1025 int nGoodPos
= 0, nGoodEnd
= 0;
1028 nLastPos
= pRegexMatcher
->start( nIcuErr
);
1029 nFoundEnd
= pRegexMatcher
->end( nIcuErr
);
1030 if (nLastPos
< nFoundEnd
)
1032 // remember last non-zero-length match
1033 nGoodPos
= nLastPos
;
1034 nGoodEnd
= nFoundEnd
;
1036 if( nFoundEnd
>= startPos
)
1039 if( nFoundEnd
== nLastPos
)
1041 } while( lcl_findRegex( pRegexMatcher
, nFoundEnd
, startPos
, nIcuErr
));
1043 // Ignore all zero-length matches except "$" anchor on first match.
1044 if (nGoodPos
== nGoodEnd
)
1046 if (bFirst
&& nLastPos
== startPos
)
1047 nGoodPos
= nLastPos
;
1052 // find last match again to get its details
1053 lcl_findRegex( pRegexMatcher
, nGoodPos
, startPos
, nIcuErr
);
1055 // fill in the details of the last match
1056 const int nGroupCount
= pRegexMatcher
->groupCount();
1057 aRet
.subRegExpressions
= nGroupCount
+ 1;
1058 aRet
.startOffset
.realloc( aRet
.subRegExpressions
);
1059 auto pstartOffset
= aRet
.startOffset
.getArray();
1060 aRet
.endOffset
.realloc( aRet
.subRegExpressions
);
1061 auto pendOffset
= aRet
.endOffset
.getArray();
1062 // NOTE: existing users of backward search seem to expect startOfs/endOfs being inverted!
1063 pstartOffset
[0] = pRegexMatcher
->end( nIcuErr
);
1064 pendOffset
[0] = pRegexMatcher
->start( nIcuErr
);
1065 for( int i
= 1; i
<= nGroupCount
; ++i
) {
1066 pstartOffset
[i
] = pRegexMatcher
->end( i
, nIcuErr
);
1067 pendOffset
[i
] = pRegexMatcher
->start( i
, nIcuErr
);
1074 // search for words phonetically
1075 SearchResult
TextSearch::ApproxSrchFrwrd( std::unique_lock
<std::mutex
>& /*rGuard*/, const OUString
& searchStr
,
1076 sal_Int32 startPos
, sal_Int32 endPos
)
1079 aRet
.subRegExpressions
= 0;
1084 sal_Int32 nStt
, nEnd
;
1086 Boundary aWBnd
= xBreak
->getWordBoundary( searchStr
, startPos
,
1088 WordType::ANYWORD_IGNOREWHITESPACES
, true );
1092 if( aWBnd
.startPos
>= endPos
)
1094 nStt
= aWBnd
.startPos
< startPos
? startPos
: aWBnd
.startPos
;
1095 nEnd
= std::min(aWBnd
.endPos
, endPos
);
1098 pWLD
->WLD( searchStr
.getStr() + nStt
, nEnd
- nStt
) <= nLimit
)
1100 aRet
.subRegExpressions
= 1;
1101 aRet
.startOffset
= { nStt
};
1102 aRet
.endOffset
= { nEnd
};
1107 aWBnd
= xBreak
->nextWord( searchStr
, nStt
, aSrchPara
.Locale
,
1108 WordType::ANYWORD_IGNOREWHITESPACES
);
1109 } while( aWBnd
.startPos
!= aWBnd
.endPos
||
1110 (aWBnd
.endPos
!= searchStr
.getLength() && aWBnd
.endPos
!= nEnd
) );
1111 // #i50244# aWBnd.endPos != nEnd : in case there is _no_ word (only
1112 // whitespace) in searchStr, getWordBoundary() returned startPos,startPos
1113 // and nextWord() does also => don't loop forever.
1117 SearchResult
TextSearch::ApproxSrchBkwrd( std::unique_lock
<std::mutex
>& /*rGuard*/, const OUString
& searchStr
,
1118 sal_Int32 startPos
, sal_Int32 endPos
)
1121 aRet
.subRegExpressions
= 0;
1126 sal_Int32 nStt
, nEnd
;
1128 Boundary aWBnd
= xBreak
->getWordBoundary( searchStr
, startPos
,
1130 WordType::ANYWORD_IGNOREWHITESPACES
, true );
1134 if( aWBnd
.endPos
<= endPos
)
1136 nStt
= aWBnd
.startPos
< endPos
? endPos
: aWBnd
.startPos
;
1137 nEnd
= std::min(aWBnd
.endPos
, startPos
);
1140 pWLD
->WLD( searchStr
.getStr() + nStt
, nEnd
- nStt
) <= nLimit
)
1142 aRet
.subRegExpressions
= 1;
1143 aRet
.startOffset
= { nEnd
};
1144 aRet
.endOffset
= { nStt
};
1150 aWBnd
= xBreak
->previousWord( searchStr
, nStt
, aSrchPara
.Locale
,
1151 WordType::ANYWORD_IGNOREWHITESPACES
);
1152 } while( aWBnd
.startPos
!= aWBnd
.endPos
|| aWBnd
.endPos
!= searchStr
.getLength() );
1158 void setWildcardMatch( css::util::SearchResult
& rRes
, sal_Int32 nStartOffset
, sal_Int32 nEndOffset
)
1160 rRes
.subRegExpressions
= 1;
1161 rRes
.startOffset
= { nStartOffset
};
1162 rRes
.endOffset
= { nEndOffset
};
1166 SearchResult
TextSearch::WildcardSrchFrwrd( std::unique_lock
<std::mutex
>& /*rGuard*/, const OUString
& searchStr
, sal_Int32 nStartPos
, sal_Int32 nEndPos
)
1169 aRes
.subRegExpressions
= 0; // no match
1170 sal_Int32 nStartOffset
= nStartPos
;
1171 sal_Int32 nEndOffset
= nEndPos
;
1173 const sal_Int32 nStringLen
= searchStr
.getLength();
1175 // Forward nStartPos inclusive, nEndPos exclusive, but allow for empty
1176 // string match with [0,0).
1177 if (nStartPos
< 0 || nEndPos
> nStringLen
|| nEndPos
< nStartPos
||
1178 (nStartPos
== nStringLen
&& (nStringLen
!= 0 || nStartPos
!= nEndPos
)))
1181 const OUString
& rPattern
= (bUsePrimarySrchStr
? sSrchStr
: sSrchStr2
);
1182 const sal_Int32 nPatternLen
= rPattern
.getLength();
1184 // Handle special cases empty pattern and/or string outside of the loop to
1185 // not add performance penalties there and simplify.
1186 if (nStartPos
== nEndPos
)
1189 while (i
< nPatternLen
&& rPattern
[i
] == '*')
1191 if (i
== nPatternLen
)
1192 setWildcardMatch( aRes
, nStartOffset
, nEndOffset
);
1196 // Empty pattern does not match any non-empty string.
1200 bool bRewind
= false;
1201 sal_uInt32 cPattern
= 0;
1202 sal_Int32 nPattern
= 0;
1203 sal_Int32 nAfterFakePattern
= nPattern
;
1204 if (mbWildcardAllowSubstring
)
1206 // Fake a leading '*' wildcard.
1209 // Assume a non-'*' pattern character follows. If it is a '*' instead
1210 // that will be handled in the loop by setting nPat.
1211 sal_uInt32 cu
= rPattern
.iterateCodePoints( &nAfterFakePattern
);
1212 if (cu
== mcWildcardEscapeChar
&& mcWildcardEscapeChar
&& nAfterFakePattern
< nPatternLen
)
1213 rPattern
.iterateCodePoints( &nAfterFakePattern
);
1216 sal_Int32 nString
= nStartPos
, nPat
= -1, nStr
= -1, nLastAsterisk
= -1;
1217 sal_uInt32 cPatternAfterAsterisk
= 0;
1218 bool bEscaped
= false, bEscapedAfterAsterisk
= false;
1220 // The loop code tries to avoid costly calls to iterateCodePoints() when
1227 // Reuse cPattern after '*', nPattern was correspondingly
1228 // incremented to point behind cPattern.
1231 else if (nPattern
< nPatternLen
)
1233 // nPattern will be incremented by iterateCodePoints().
1234 cPattern
= rPattern
.iterateCodePoints( &nPattern
);
1235 if (cPattern
== mcWildcardEscapeChar
&& mcWildcardEscapeChar
&& nPattern
< nPatternLen
)
1238 cPattern
= rPattern
.iterateCodePoints( &nPattern
);
1243 // A trailing '*' is handled below.
1244 if (mbWildcardAllowSubstring
)
1246 // If the pattern is consumed and substring match allowed we're good.
1247 setWildcardMatch( aRes
, nStartOffset
, nString
);
1250 else if (nString
< nEndPos
&& nLastAsterisk
>= 0)
1252 // If substring match is not allowed try a greedy '*' match.
1253 nPattern
= nLastAsterisk
;
1260 if (cPattern
== '*' && !bEscaped
)
1262 // '*' is one code unit, so not using iterateCodePoints() is ok.
1263 while (nPattern
< nPatternLen
&& rPattern
[nPattern
] == '*')
1266 if (nPattern
>= nPatternLen
)
1268 // Last pattern is '*', remaining string matches.
1269 setWildcardMatch( aRes
, nStartOffset
, nEndOffset
);
1273 nLastAsterisk
= nPattern
; // Remember last encountered '*'.
1275 // cPattern will be the next non-'*' character, nPattern
1277 cPattern
= rPattern
.iterateCodePoints( &nPattern
);
1278 if (cPattern
== mcWildcardEscapeChar
&& mcWildcardEscapeChar
&& nPattern
< nPatternLen
)
1281 cPattern
= rPattern
.iterateCodePoints( &nPattern
);
1284 cPatternAfterAsterisk
= cPattern
;
1285 bEscapedAfterAsterisk
= bEscaped
;
1286 nPat
= nPattern
; // Remember position of pattern behind '*', already incremented.
1287 nStr
= nString
; // Remember the current string to be matched.
1290 if (nString
>= nEndPos
)
1291 // Whatever follows in pattern, string will not match.
1294 // nString will be incremented by iterateCodePoints().
1295 sal_uInt32 cString
= searchStr
.iterateCodePoints( &nString
);
1297 if ((cPattern
!= '?' || bEscaped
) && cPattern
!= cString
)
1300 // Non-match already without any '*' pattern.
1304 nPattern
= nPat
; // Rewind pattern to character behind '*', already incremented.
1305 cPattern
= cPatternAfterAsterisk
;
1306 bEscaped
= bEscapedAfterAsterisk
;
1307 searchStr
.iterateCodePoints( &nStr
);
1308 nString
= nStr
; // Restore incremented remembered string position.
1309 if (nPat
== nAfterFakePattern
)
1311 // Next start offset will be the next character.
1312 nStartOffset
= nString
;
1317 // An unescaped '?' pattern matched any character, or characters
1318 // matched. Reset only escaped state.
1322 while (nString
< nEndPos
);
1327 // Eat trailing '*' pattern that matches anything, including nothing.
1328 // '*' is one code unit, so not using iterateCodePoints() is ok.
1329 while (nPattern
< nPatternLen
&& rPattern
[nPattern
] == '*')
1332 if (nPattern
== nPatternLen
)
1333 setWildcardMatch( aRes
, nStartOffset
, nEndOffset
);
1337 SearchResult
TextSearch::WildcardSrchBkwrd( std::unique_lock
<std::mutex
>& /*rGuard*/, const OUString
& searchStr
, sal_Int32 nStartPos
, sal_Int32 nEndPos
)
1340 aRes
.subRegExpressions
= 0; // no match
1342 sal_Int32 nStartOffset
= nStartPos
;
1343 sal_Int32 nEndOffset
= nEndPos
;
1345 const sal_Int32 nStringLen
= searchStr
.getLength();
1347 // Backward nStartPos exclusive, nEndPos inclusive, but allow for empty
1348 // string match with (0,0].
1349 if (nStartPos
> nStringLen
|| nEndPos
< 0 || nStartPos
< nEndPos
||
1350 (nEndPos
== nStringLen
&& (nStringLen
!= 0 || nStartPos
!= nEndPos
)))
1353 const OUString
& rPattern
= (bUsePrimarySrchStr
? sSrchStr
: sSrchStr2
);
1354 sal_Int32 nPatternLen
= rPattern
.getLength();
1356 // Handle special cases empty pattern and/or string outside of the loop to
1357 // not add performance penalties there and simplify.
1358 if (nStartPos
== nEndPos
)
1361 while (i
< nPatternLen
&& rPattern
[i
] == '*')
1363 if (i
== nPatternLen
)
1364 setWildcardMatch( aRes
, nStartOffset
, nEndOffset
);
1368 // Empty pattern does not match any non-empty string.
1372 // Reverse escaped patterns to ease the handling of escapes, keeping escape
1373 // and following character as one sequence in backward direction.
1374 if ((bUsePrimarySrchStr
&& maWildcardReversePattern
.isEmpty()) ||
1375 (!bUsePrimarySrchStr
&& maWildcardReversePattern2
.isEmpty()))
1377 OUStringBuffer
aPatternBuf( rPattern
);
1378 sal_Int32 nIndex
= 0;
1379 while (nIndex
< nPatternLen
)
1381 const sal_Int32 nOld
= nIndex
;
1382 const sal_uInt32 cu
= rPattern
.iterateCodePoints( &nIndex
);
1383 if (cu
== mcWildcardEscapeChar
)
1385 if (nIndex
< nPatternLen
)
1387 if (nIndex
- nOld
== 1)
1389 // Simply move code units, we already memorized the one
1391 const sal_Int32 nOld2
= nIndex
;
1392 rPattern
.iterateCodePoints( &nIndex
);
1393 for (sal_Int32 i
=0; i
< nIndex
- nOld2
; ++i
)
1394 aPatternBuf
[nOld
+i
] = rPattern
[nOld2
+i
];
1395 aPatternBuf
[nIndex
-1] = static_cast<sal_Unicode
>(cu
);
1399 // Copy the escape character code units first in the
1400 // unlikely case that it would not be of BMP.
1401 assert(nIndex
- nOld
== 2); // it's UTF-16, so...
1403 buf
[0] = rPattern
[nOld
];
1404 buf
[1] = rPattern
[nOld
+1];
1405 const sal_Int32 nOld2
= nIndex
;
1406 rPattern
.iterateCodePoints( &nIndex
);
1407 for (sal_Int32 i
=0; i
< nIndex
- nOld2
; ++i
)
1408 aPatternBuf
[nOld
+i
] = rPattern
[nOld2
+i
];
1409 aPatternBuf
[nIndex
-2] = buf
[0];
1410 aPatternBuf
[nIndex
-1] = buf
[1];
1415 // Trailing escape would become leading escape, do what?
1417 aPatternBuf
.remove( nOld
, nIndex
- nOld
);
1421 if (bUsePrimarySrchStr
)
1422 maWildcardReversePattern
= aPatternBuf
.makeStringAndClear();
1424 maWildcardReversePattern2
= aPatternBuf
.makeStringAndClear();
1426 const OUString
& rReversePattern
= (bUsePrimarySrchStr
? maWildcardReversePattern
: maWildcardReversePattern2
);
1427 nPatternLen
= rReversePattern
.getLength();
1429 bool bRewind
= false;
1430 sal_uInt32 cPattern
= 0;
1431 sal_Int32 nPattern
= nPatternLen
;
1432 sal_Int32 nAfterFakePattern
= nPattern
;
1433 if (mbWildcardAllowSubstring
)
1435 // Fake a trailing '*' wildcard.
1438 // Assume a non-'*' pattern character follows. If it is a '*' instead
1439 // that will be handled in the loop by setting nPat.
1440 sal_uInt32 cu
= rReversePattern
.iterateCodePoints( &nAfterFakePattern
, -1);
1441 if (cu
== mcWildcardEscapeChar
&& mcWildcardEscapeChar
&& nAfterFakePattern
> 0)
1442 rReversePattern
.iterateCodePoints( &nAfterFakePattern
, -1);
1445 sal_Int32 nString
= nStartPos
, nPat
= -1, nStr
= -1, nLastAsterisk
= -1;
1446 sal_uInt32 cPatternAfterAsterisk
= 0;
1447 bool bEscaped
= false, bEscapedAfterAsterisk
= false;
1449 // The loop code tries to avoid costly calls to iterateCodePoints() when
1456 // Reuse cPattern after '*', nPattern was correspondingly
1457 // decremented to point before cPattern.
1460 else if (nPattern
> 0)
1462 // nPattern will be decremented by iterateCodePoints().
1463 cPattern
= rReversePattern
.iterateCodePoints( &nPattern
, -1);
1464 if (cPattern
== mcWildcardEscapeChar
&& mcWildcardEscapeChar
&& nPattern
> 0)
1467 cPattern
= rReversePattern
.iterateCodePoints( &nPattern
, -1);
1472 // A trailing '*' is handled below.
1473 if (mbWildcardAllowSubstring
)
1475 // If the pattern is consumed and substring match allowed we're good.
1476 setWildcardMatch( aRes
, nStartOffset
, nString
);
1479 else if (nString
> nEndPos
&& nLastAsterisk
>= 0)
1481 // If substring match is not allowed try a greedy '*' match.
1482 nPattern
= nLastAsterisk
;
1489 if (cPattern
== '*' && !bEscaped
)
1491 // '*' is one code unit, so not using iterateCodePoints() is ok.
1492 while (nPattern
> 0 && rReversePattern
[nPattern
-1] == '*')
1497 // First pattern is '*', remaining string matches.
1498 setWildcardMatch( aRes
, nStartOffset
, nEndOffset
);
1502 nLastAsterisk
= nPattern
; // Remember last encountered '*'.
1504 // cPattern will be the previous non-'*' character, nPattern
1506 cPattern
= rReversePattern
.iterateCodePoints( &nPattern
, -1);
1507 if (cPattern
== mcWildcardEscapeChar
&& mcWildcardEscapeChar
&& nPattern
> 0)
1510 cPattern
= rReversePattern
.iterateCodePoints( &nPattern
, -1);
1513 cPatternAfterAsterisk
= cPattern
;
1514 bEscapedAfterAsterisk
= bEscaped
;
1515 nPat
= nPattern
; // Remember position of pattern before '*', already decremented.
1516 nStr
= nString
; // Remember the current string to be matched.
1519 if (nString
<= nEndPos
)
1520 // Whatever leads in pattern, string will not match.
1523 // nString will be decremented by iterateCodePoints().
1524 sal_uInt32 cString
= searchStr
.iterateCodePoints( &nString
, -1);
1526 if ((cPattern
!= '?' || bEscaped
) && cPattern
!= cString
)
1529 // Non-match already without any '*' pattern.
1533 nPattern
= nPat
; // Rewind pattern to character before '*', already decremented.
1534 cPattern
= cPatternAfterAsterisk
;
1535 bEscaped
= bEscapedAfterAsterisk
;
1536 searchStr
.iterateCodePoints( &nStr
, -1);
1537 nString
= nStr
; // Restore decremented remembered string position.
1538 if (nPat
== nAfterFakePattern
)
1540 // Next start offset will be this character (exclusive).
1541 nStartOffset
= nString
;
1546 // An unescaped '?' pattern matched any character, or characters
1547 // matched. Reset only escaped state.
1551 while (nString
> nEndPos
);
1556 // Eat leading '*' pattern that matches anything, including nothing.
1557 // '*' is one code unit, so not using iterateCodePoints() is ok.
1558 while (nPattern
> 0 && rReversePattern
[nPattern
-1] == '*')
1562 setWildcardMatch( aRes
, nStartOffset
, nEndOffset
);
1568 TextSearch::getImplementationName()
1570 return u
"com.sun.star.util.TextSearch_i18n"_ustr
;
1573 sal_Bool SAL_CALL
TextSearch::supportsService(const OUString
& rServiceName
)
1575 return cppu::supportsService(this, rServiceName
);
1578 Sequence
< OUString
> SAL_CALL
1579 TextSearch::getSupportedServiceNames()
1581 return { u
"com.sun.star.util.TextSearch"_ustr
, u
"com.sun.star.util.TextSearch2"_ustr
};
1584 extern "C" SAL_DLLPUBLIC_EXPORT
css::uno::XInterface
*
1585 i18npool_TextSearch_get_implementation(
1586 css::uno::XComponentContext
* context
, css::uno::Sequence
<css::uno::Any
> const&)
1588 return cppu::acquire(new TextSearch(context
));
1591 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */