tdf#130857 qt weld: Support mail merge "Server Auth" dialog
[LibreOffice.git] / i18npool / source / search / textsearch.cxx
blob5475ef5b7ddeaeaea203be13a823736e87e86a27
1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 /*
3 * This file is part of the LibreOffice project.
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 * This file incorporates work covered by the following license notice:
11 * Licensed to the Apache Software Foundation (ASF) under one or more
12 * contributor license agreements. See the NOTICE file distributed
13 * with this work for additional information regarding copyright
14 * ownership. The ASF licenses this file to you under the Apache
15 * License, Version 2.0 (the "License"); you may not use this file
16 * except in compliance with the License. You may obtain a copy of
17 * the License at http://www.apache.org/licenses/LICENSE-2.0 .
20 #include "textsearch.hxx"
21 #include "levdis.hxx"
22 #include <com/sun/star/i18n/BreakIterator.hpp>
23 #include <com/sun/star/util/SearchAlgorithms2.hpp>
24 #include <com/sun/star/util/SearchFlags.hpp>
25 #include <com/sun/star/i18n/WordType.hpp>
26 #include <com/sun/star/i18n/ScriptType.hpp>
27 #include <com/sun/star/i18n/CharacterIteratorMode.hpp>
28 #include <com/sun/star/i18n/CharacterClassification.hpp>
29 #include <com/sun/star/i18n/KCharacterType.hpp>
30 #include <com/sun/star/i18n/Transliteration.hpp>
31 #include <cppuhelper/supportsservice.hxx>
32 #include <cppuhelper/weak.hxx>
33 #include <i18nutil/transliteration.hxx>
34 #include <rtl/ustrbuf.hxx>
35 #include <sal/log.hxx>
37 #include <unicode/regex.h>
39 using namespace ::com::sun::star::util;
40 using namespace ::com::sun::star::uno;
41 using namespace ::com::sun::star::lang;
42 using namespace ::com::sun::star::i18n;
43 using namespace ::com::sun::star;
45 const TransliterationFlags COMPLEX_TRANS_MASK =
46 TransliterationFlags::ignoreBaFa_ja_JP |
47 TransliterationFlags::ignoreIterationMark_ja_JP |
48 TransliterationFlags::ignoreTiJi_ja_JP |
49 TransliterationFlags::ignoreHyuByu_ja_JP |
50 TransliterationFlags::ignoreSeZe_ja_JP |
51 TransliterationFlags::ignoreIandEfollowedByYa_ja_JP |
52 TransliterationFlags::ignoreKiKuFollowedBySa_ja_JP |
53 TransliterationFlags::ignoreProlongedSoundMark_ja_JP;
55 namespace
57 TransliterationFlags maskComplexTrans( TransliterationFlags n )
59 // IGNORE_KANA and FULLWIDTH_HALFWIDTH are simple but need to take effect
60 // in complex transliteration.
61 return
62 n & (COMPLEX_TRANS_MASK | // all set ignore bits
63 TransliterationFlags::IGNORE_KANA | // plus IGNORE_KANA bit
64 TransliterationFlags::FULLWIDTH_HALFWIDTH); // and the FULLWIDTH_HALFWIDTH value
67 bool isComplexTrans( TransliterationFlags n )
69 return bool(n & COMPLEX_TRANS_MASK);
72 TransliterationFlags maskSimpleTrans( TransliterationFlags n )
74 return n & ~COMPLEX_TRANS_MASK;
77 bool isSimpleTrans( TransliterationFlags n )
79 return bool(maskSimpleTrans(n));
82 // Regex patterns are case sensitive.
83 TransliterationFlags maskSimpleRegexTrans( TransliterationFlags n )
85 TransliterationFlags m = (n & TransliterationFlags::IGNORE_MASK) & ~TransliterationFlags::IGNORE_CASE;
86 TransliterationFlags v = n & TransliterationFlags::NON_IGNORE_MASK;
87 if (v == TransliterationFlags::UPPERCASE_LOWERCASE || v == TransliterationFlags::LOWERCASE_UPPERCASE)
88 v = TransliterationFlags::NONE;
89 return (m | v) & ~COMPLEX_TRANS_MASK;
92 bool isSimpleRegexTrans( TransliterationFlags n )
94 return bool(maskSimpleRegexTrans(n));
97 bool isReplacePunctuation( OUString &rStr )
99 return rStr.indexOf(u'\u2018') > -1 ||
100 rStr.indexOf(u'\u2019') > -1 ||
101 rStr.indexOf(u'\u201A') > -1 ||
102 rStr.indexOf(u'\u201B') > -1 ||
103 rStr.indexOf(u'\u201C') > -1 ||
104 rStr.indexOf(u'\u201D') > -1 ||
105 rStr.indexOf(u'\u201E') > -1 ||
106 rStr.indexOf(u'\u201F') > -1;
109 OUString replacePunctuation( OUString &rStr )
111 return rStr.replace(u'\u2018', '\'')
112 .replace(u'\u2019', '\'')
113 .replace(u'\u201A', '\'')
114 .replace(u'\u201B', '\'')
115 .replace(u'\u201C', '"')
116 .replace(u'\u201D', '"')
117 .replace(u'\u201E', '"')
118 .replace(u'\u201F', '"');
122 TextSearch::TextSearch(const Reference < XComponentContext > & rxContext)
123 : m_xContext( rxContext )
125 SearchOptions2 aOpt;
126 aOpt.AlgorithmType2 = SearchAlgorithms2::ABSOLUTE;
127 aOpt.algorithmType = SearchAlgorithms_ABSOLUTE;
128 aOpt.searchFlag = SearchFlags::ALL_IGNORE_CASE;
129 //aOpt.Locale = ???;
130 setOptions( aOpt );
133 TextSearch::~TextSearch()
135 pRegexMatcher.reset();
136 pWLD.reset();
137 pJumpTable.reset();
138 pJumpTable2.reset();
141 void TextSearch::setOptions2( const SearchOptions2& rOptions )
143 std::unique_lock g(m_aMutex);
145 aSrchPara = rOptions;
147 pRegexMatcher.reset();
148 pWLD.reset();
149 pJumpTable.reset();
150 pJumpTable2.reset();
151 maWildcardReversePattern.clear();
152 maWildcardReversePattern2.clear();
153 TransliterationFlags transliterateFlags = static_cast<TransliterationFlags>(aSrchPara.transliterateFlags);
154 bSearchApostrophe = false;
155 bool bReplaceApostrophe = false;
156 if (aSrchPara.AlgorithmType2 == SearchAlgorithms2::REGEXP)
158 // RESrchPrepare will consider aSrchPara.transliterateFlags when
159 // picking the actual regex pattern
160 // (sSrchStr|sSrchStr2|SearchOptions2::searchString) and setting
161 // case-insensitivity. Create transliteration instance, if any, without
162 // ignore-case so later in TextSearch::searchForward() the string to
163 // match is not case-altered, leave case-(in)sensitive to regex engine.
164 transliterateFlags &= ~TransliterationFlags::IGNORE_CASE;
166 else if ( aSrchPara.searchString.indexOf('\'') > - 1 || aSrchPara.searchString.indexOf('"') > - 1 )
168 bSearchApostrophe = true;
169 bReplaceApostrophe = isReplacePunctuation(aSrchPara.searchString);
172 // Create Transliteration class
173 if( isSimpleTrans( transliterateFlags) )
175 if( !xTranslit.is() )
176 xTranslit.set( Transliteration::create( m_xContext ) );
177 xTranslit->loadModule(
178 static_cast<TransliterationModules>(maskSimpleTrans(transliterateFlags)),
179 aSrchPara.Locale);
181 else if( xTranslit.is() )
182 xTranslit = nullptr;
184 // Create Transliteration for 2<->1, 2<->2 transliteration
185 if ( isComplexTrans( transliterateFlags) )
187 if( !xTranslit2.is() )
188 xTranslit2.set( Transliteration::create( m_xContext ) );
189 // Load transliteration module
190 xTranslit2->loadModule(
191 static_cast<TransliterationModules>(maskComplexTrans(transliterateFlags)),
192 aSrchPara.Locale);
195 if ( !xBreak.is() )
196 xBreak = css::i18n::BreakIterator::create( m_xContext );
198 sSrchStr = aSrchPara.searchString;
200 // Transliterate search string.
201 if (aSrchPara.AlgorithmType2 == SearchAlgorithms2::REGEXP)
203 if (isSimpleRegexTrans(transliterateFlags))
205 if (maskSimpleRegexTrans(transliterateFlags) !=
206 maskSimpleTrans(transliterateFlags))
208 css::uno::Reference< XExtendedTransliteration > xTranslitPattern(
209 Transliteration::create( m_xContext ));
210 if (xTranslitPattern.is())
212 xTranslitPattern->loadModule(
213 static_cast<TransliterationModules>(maskSimpleRegexTrans(transliterateFlags)),
214 aSrchPara.Locale);
215 sSrchStr = xTranslitPattern->transliterateString2String(
216 aSrchPara.searchString, 0, aSrchPara.searchString.getLength());
219 else
221 if (xTranslit.is())
222 sSrchStr = xTranslit->transliterateString2String(
223 aSrchPara.searchString, 0, aSrchPara.searchString.getLength());
225 // xTranslit2 complex transliterated sSrchStr2 is not used in
226 // regex, see TextSearch::searchForward() and
227 // TextSearch::searchBackward()
230 else
232 if ( xTranslit.is() && isSimpleTrans(transliterateFlags) )
233 sSrchStr = xTranslit->transliterateString2String(
234 aSrchPara.searchString, 0, aSrchPara.searchString.getLength());
236 if ( xTranslit2.is() && isComplexTrans(transliterateFlags) )
237 sSrchStr2 = xTranslit2->transliterateString2String(
238 aSrchPara.searchString, 0, aSrchPara.searchString.getLength());
241 if ( bReplaceApostrophe )
242 sSrchStr = replacePunctuation(sSrchStr);
244 // Take the new SearchOptions2::AlgorithmType2 field and ignore
245 // SearchOptions::algorithmType
246 switch( aSrchPara.AlgorithmType2)
248 case SearchAlgorithms2::REGEXP:
249 fnForward = &TextSearch::RESrchFrwrd;
250 fnBackward = &TextSearch::RESrchBkwrd;
251 RESrchPrepare( aSrchPara);
252 break;
254 case SearchAlgorithms2::APPROXIMATE:
255 fnForward = &TextSearch::ApproxSrchFrwrd;
256 fnBackward = &TextSearch::ApproxSrchBkwrd;
258 pWLD.reset( new WLevDistance( sSrchStr.getStr(), aSrchPara.changedChars,
259 aSrchPara.insertedChars, aSrchPara.deletedChars,
260 0 != (SearchFlags::LEV_RELAXED & aSrchPara.searchFlag ) ) );
262 nLimit = pWLD->GetLimit();
263 break;
265 case SearchAlgorithms2::WILDCARD:
266 mcWildcardEscapeChar = static_cast<sal_uInt32>(aSrchPara.WildcardEscapeCharacter);
267 mbWildcardAllowSubstring = ((aSrchPara.searchFlag & SearchFlags::WILD_MATCH_SELECTION) == 0);
268 fnForward = &TextSearch::WildcardSrchFrwrd;
269 fnBackward = &TextSearch::WildcardSrchBkwrd;
270 break;
272 default:
273 SAL_WARN("i18npool","TextSearch::setOptions2 - default what?");
274 [[fallthrough]];
275 case SearchAlgorithms2::ABSOLUTE:
276 fnForward = &TextSearch::NSrchFrwrd;
277 fnBackward = &TextSearch::NSrchBkwrd;
278 break;
282 void TextSearch::setOptions( const SearchOptions& rOptions )
284 sal_Int16 nAlgorithmType2;
285 switch (rOptions.algorithmType)
287 case SearchAlgorithms_REGEXP:
288 nAlgorithmType2 = SearchAlgorithms2::REGEXP;
289 break;
290 case SearchAlgorithms_APPROXIMATE:
291 nAlgorithmType2 = SearchAlgorithms2::APPROXIMATE;
292 break;
293 default:
294 SAL_WARN("i18npool","TextSearch::setOptions - default what?");
295 [[fallthrough]];
296 case SearchAlgorithms_ABSOLUTE:
297 nAlgorithmType2 = SearchAlgorithms2::ABSOLUTE;
298 break;
300 // It would be nice if an inherited struct had a ctor that takes an
301 // instance of the object the struct derived from...
302 SearchOptions2 aOptions2(
303 rOptions.algorithmType,
304 rOptions.searchFlag,
305 rOptions.searchString,
306 rOptions.replaceString,
307 rOptions.Locale,
308 rOptions.changedChars,
309 rOptions.deletedChars,
310 rOptions.insertedChars,
311 rOptions.transliterateFlags,
312 nAlgorithmType2,
313 0 // no wildcard search, no escape character...
315 setOptions2( aOptions2);
318 static sal_Int32 FindPosInSeq_Impl( const Sequence <sal_Int32>& rOff, sal_Int32 nPos )
320 auto pOff = std::find_if(rOff.begin(), rOff.end(),
321 [nPos](const sal_Int32 nOff) { return nOff >= nPos; });
322 return static_cast<sal_Int32>(std::distance(rOff.begin(), pOff));
325 SearchResult TextSearch::searchForward( const OUString& searchStr, sal_Int32 startPos, sal_Int32 endPos )
327 std::unique_lock g(m_aMutex);
329 SearchResult sres;
331 OUString in_str(searchStr);
333 // in non-regex mode, allow searching typographical apostrophe with the ASCII one
334 // to avoid regression after using automatic conversion to U+2019 during typing in Writer
335 bool bReplaceApostrophe = bSearchApostrophe && isReplacePunctuation(in_str);
337 bUsePrimarySrchStr = true;
339 if ( xTranslit.is() )
341 // apply normal transliteration (1<->1, 1<->0)
343 sal_Int32 nInStartPos = startPos;
344 if (pRegexMatcher && startPos > 0)
346 // tdf#89665, tdf#75806: An optimization to avoid transliterating the whole string, yet
347 // transliterate enough of the leading text to allow sensible look-behind assertions.
348 // 100 is chosen arbitrarily in the hope that look-behind assertions would largely fit.
349 // See http://userguide.icu-project.org/strings/regexp for look-behind assertion syntax.
350 // When search regex doesn't start with an assertion, 3 is to allow startPos to be in
351 // the middle of a surrogate pair, preceded by another surrogate pair.
352 const sal_Int32 nMaxLeadingLen = aSrchPara.searchString.startsWith("(?") ? 100 : 3;
353 nInStartPos -= std::min(nMaxLeadingLen, startPos);
355 sal_Int32 nInEndPos = endPos;
356 if (pRegexMatcher && endPos < searchStr.getLength())
358 // tdf#65038: ditto for look-ahead assertions
359 const sal_Int32 nMaxTrailingLen = aSrchPara.searchString.endsWith(")") ? 100 : 3;
360 nInEndPos += std::min(nMaxTrailingLen, searchStr.getLength() - endPos);
363 css::uno::Sequence<sal_Int32> offset(nInEndPos - nInStartPos);
364 in_str = xTranslit->transliterate(searchStr, nInStartPos, nInEndPos - nInStartPos, offset);
366 if ( bReplaceApostrophe )
367 in_str = replacePunctuation(in_str);
369 // JP 20.6.2001: also the start and end positions must be corrected!
370 sal_Int32 newStartPos =
371 (startPos == 0) ? 0 : FindPosInSeq_Impl( offset, startPos );
373 sal_Int32 newEndPos = (endPos < searchStr.getLength())
374 ? FindPosInSeq_Impl( offset, endPos )
375 : in_str.getLength();
377 sres = (this->*fnForward)( g, in_str, newStartPos, newEndPos );
379 // Map offsets back to untransliterated string.
380 const sal_Int32 nOffsets = offset.getLength();
381 if (nOffsets)
383 auto sres_startOffsetRange = asNonConstRange(sres.startOffset);
384 auto sres_endOffsetRange = asNonConstRange(sres.endOffset);
385 // For regex nGroups is the number of groups+1 with group 0 being
386 // the entire match.
387 const sal_Int32 nGroups = sres.startOffset.getLength();
388 for ( sal_Int32 k = 0; k < nGroups; k++ )
390 const sal_Int32 nStart = sres.startOffset[k];
391 // Result offsets are negative (-1) if a group expression was
392 // not matched.
393 if (nStart >= 0)
394 sres_startOffsetRange[k] = (nStart < nOffsets ? offset[nStart] : (offset[nOffsets - 1] + 1));
395 // JP 20.6.2001: end is ever exclusive and then don't return
396 // the position of the next character - return the
397 // next position behind the last found character!
398 // "a b c" find "b" must return 2,3 and not 2,4!!!
399 const sal_Int32 nStop = sres.endOffset[k];
400 if (nStop >= 0)
402 if (nStop > 0)
403 sres_endOffsetRange[k] = offset[(nStop <= nOffsets ? nStop : nOffsets) - 1] + 1;
404 else
405 sres_endOffsetRange[k] = offset[0];
410 else
412 if ( bReplaceApostrophe )
413 in_str = in_str.replace(u'\u2019', '\'');
415 sres = (this->*fnForward)( g, in_str, startPos, endPos );
418 if ( xTranslit2.is() && aSrchPara.AlgorithmType2 != SearchAlgorithms2::REGEXP)
420 SearchResult sres2;
422 in_str = searchStr;
423 css::uno::Sequence <sal_Int32> offset( in_str.getLength());
425 in_str = xTranslit2->transliterate( searchStr, 0, in_str.getLength(), offset );
427 if( startPos )
428 startPos = FindPosInSeq_Impl( offset, startPos );
430 if( endPos < searchStr.getLength() )
431 endPos = FindPosInSeq_Impl( offset, endPos );
432 else
433 endPos = in_str.getLength();
435 bUsePrimarySrchStr = false;
436 sres2 = (this->*fnForward)( g, in_str, startPos, endPos );
437 auto sres2_startOffsetRange = asNonConstRange(sres2.startOffset);
438 auto sres2_endOffsetRange = asNonConstRange(sres2.endOffset);
440 for ( int k = 0; k < sres2.startOffset.getLength(); k++ )
442 if (sres2.startOffset[k])
443 sres2_startOffsetRange[k] = offset[sres2.startOffset[k]-1] + 1;
444 if (sres2.endOffset[k])
445 sres2_endOffsetRange[k] = offset[sres2.endOffset[k]-1] + 1;
448 // pick first and long one
449 if ( sres.subRegExpressions == 0)
450 return sres2;
451 if ( sres2.subRegExpressions == 1)
453 if ( sres.startOffset[0] > sres2.startOffset[0])
454 return sres2;
455 else if ( sres.startOffset[0] == sres2.startOffset[0] &&
456 sres.endOffset[0] < sres2.endOffset[0])
457 return sres2;
461 return sres;
464 SearchResult TextSearch::searchBackward( const OUString& searchStr, sal_Int32 startPos, sal_Int32 endPos )
466 std::unique_lock g(m_aMutex);
468 SearchResult sres;
470 OUString in_str(searchStr);
472 // in non-regex mode, allow searching typographical apostrophe with the ASCII one
473 // to avoid regression after using automatic conversion to U+2019 during typing in Writer
474 bool bReplaceApostrophe = bSearchApostrophe && isReplacePunctuation(in_str);
476 bUsePrimarySrchStr = true;
478 if ( xTranslit.is() )
480 // apply only simple 1<->1 transliteration here
481 css::uno::Sequence<sal_Int32> offset(startPos - endPos);
482 in_str = xTranslit->transliterate( searchStr, endPos, startPos - endPos, offset );
484 if ( bReplaceApostrophe )
485 in_str = replacePunctuation(in_str);
487 // JP 20.6.2001: also the start and end positions must be corrected!
488 sal_Int32 const newStartPos = (startPos < searchStr.getLength())
489 ? FindPosInSeq_Impl( offset, startPos )
490 : in_str.getLength();
492 sal_Int32 const newEndPos =
493 (endPos == 0) ? 0 : FindPosInSeq_Impl( offset, endPos );
495 // TODO: this would need nExtraOffset handling to avoid $ matching
496 // if (pRegexMatcher && startPos < searchStr.getLength())
497 // but that appears to be impossible with ICU regex
499 sres = (this->*fnBackward)( g, in_str, newStartPos, newEndPos );
501 // Map offsets back to untransliterated string.
502 const sal_Int32 nOffsets = offset.getLength();
503 if (nOffsets)
505 auto sres_startOffsetRange = asNonConstRange(sres.startOffset);
506 auto sres_endOffsetRange = asNonConstRange(sres.endOffset);
507 // For regex nGroups is the number of groups+1 with group 0 being
508 // the entire match.
509 const sal_Int32 nGroups = sres.startOffset.getLength();
510 for ( sal_Int32 k = 0; k < nGroups; k++ )
512 const sal_Int32 nStart = sres.startOffset[k];
513 // Result offsets are negative (-1) if a group expression was
514 // not matched.
515 if (nStart >= 0)
517 if (nStart > 0)
518 sres_startOffsetRange[k] = offset[(nStart <= nOffsets ? nStart : nOffsets) - 1] + 1;
519 else
520 sres_startOffsetRange[k] = offset[0];
522 // JP 20.6.2001: end is ever exclusive and then don't return
523 // the position of the next character - return the
524 // next position behind the last found character!
525 // "a b c" find "b" must return 2,3 and not 2,4!!!
526 const sal_Int32 nStop = sres.endOffset[k];
527 if (nStop >= 0)
528 sres_endOffsetRange[k] = (nStop < nOffsets ? offset[nStop] : (offset[nOffsets - 1] + 1));
532 else
534 if ( bReplaceApostrophe )
535 in_str = replacePunctuation(in_str);
537 sres = (this->*fnBackward)( g, in_str, startPos, endPos );
540 if ( xTranslit2.is() && aSrchPara.AlgorithmType2 != SearchAlgorithms2::REGEXP )
542 SearchResult sres2;
544 in_str = searchStr;
545 css::uno::Sequence <sal_Int32> offset( in_str.getLength());
547 in_str = xTranslit2->transliterate(searchStr, 0, in_str.getLength(), offset);
549 if( startPos < searchStr.getLength() )
550 startPos = FindPosInSeq_Impl( offset, startPos );
551 else
552 startPos = in_str.getLength();
554 if( endPos )
555 endPos = FindPosInSeq_Impl( offset, endPos );
557 bUsePrimarySrchStr = false;
558 sres2 = (this->*fnBackward)( g, in_str, startPos, endPos );
559 auto sres2_startOffsetRange = asNonConstRange(sres2.startOffset);
560 auto sres2_endOffsetRange = asNonConstRange(sres2.endOffset);
562 for( int k = 0; k < sres2.startOffset.getLength(); k++ )
564 if (sres2.startOffset[k])
565 sres2_startOffsetRange[k] = offset[sres2.startOffset[k]-1]+1;
566 if (sres2.endOffset[k])
567 sres2_endOffsetRange[k] = offset[sres2.endOffset[k]-1]+1;
570 // pick last and long one
571 if ( sres.subRegExpressions == 0 )
572 return sres2;
573 if ( sres2.subRegExpressions == 1 )
575 if ( sres.startOffset[0] < sres2.startOffset[0] )
576 return sres2;
577 if ( sres.startOffset[0] == sres2.startOffset[0] &&
578 sres.endOffset[0] > sres2.endOffset[0] )
579 return sres2;
583 return sres;
587 bool TextSearch::IsDelimiter( const OUString& rStr, sal_Int32 nPos ) const
589 bool bRet = true;
590 if( '\x7f' != rStr[nPos])
592 if ( !xCharClass.is() )
593 xCharClass = CharacterClassification::create( m_xContext );
594 sal_Int32 nCType = xCharClass->getCharacterType( rStr, nPos,
595 aSrchPara.Locale );
596 if( 0 != (( KCharacterType::DIGIT | KCharacterType::ALPHA |
597 KCharacterType::LETTER ) & nCType ) )
598 bRet = false;
600 return bRet;
603 // --------- helper methods for Boyer-Moore like text searching ----------
604 // TODO: use ICU's regex UREGEX_LITERAL mode instead when it becomes available
606 void TextSearch::MakeForwardTab()
608 // create the jumptable for the search text
610 if( pJumpTable && bIsForwardTab )
612 return; // the jumpTable is ok
614 bIsForwardTab = true;
616 sal_Int32 n, nLen = sSrchStr.getLength();
617 pJumpTable.reset( new TextSearchJumpTable );
619 for( n = 0; n < nLen - 1; ++n )
621 sal_Unicode cCh = sSrchStr[n];
622 sal_Int32 nDiff = nLen - n - 1;
623 TextSearchJumpTable::value_type aEntry( cCh, nDiff );
625 ::std::pair< TextSearchJumpTable::iterator, bool > aPair =
626 pJumpTable->insert( aEntry );
627 if ( !aPair.second )
628 (*(aPair.first)).second = nDiff;
632 void TextSearch::MakeForwardTab2()
634 // create the jumptable for the search text
635 if( pJumpTable2 && bIsForwardTab )
637 return; // the jumpTable is ok
639 bIsForwardTab = true;
641 sal_Int32 n, nLen = sSrchStr2.getLength();
642 pJumpTable2.reset( new TextSearchJumpTable );
644 for( n = 0; n < nLen - 1; ++n )
646 sal_Unicode cCh = sSrchStr2[n];
647 sal_Int32 nDiff = nLen - n - 1;
649 TextSearchJumpTable::value_type aEntry( cCh, nDiff );
650 ::std::pair< TextSearchJumpTable::iterator, bool > aPair =
651 pJumpTable2->insert( aEntry );
652 if ( !aPair.second )
653 (*(aPair.first)).second = nDiff;
657 void TextSearch::MakeBackwardTab()
659 // create the jumptable for the search text
660 if( pJumpTable && !bIsForwardTab)
662 return; // the jumpTable is ok
664 bIsForwardTab = false;
666 sal_Int32 n, nLen = sSrchStr.getLength();
667 pJumpTable.reset( new TextSearchJumpTable );
669 for( n = nLen-1; n > 0; --n )
671 sal_Unicode cCh = sSrchStr[n];
672 TextSearchJumpTable::value_type aEntry( cCh, n );
673 ::std::pair< TextSearchJumpTable::iterator, bool > aPair =
674 pJumpTable->insert( aEntry );
675 if ( !aPair.second )
676 (*(aPair.first)).second = n;
680 void TextSearch::MakeBackwardTab2()
682 // create the jumptable for the search text
683 if( pJumpTable2 && !bIsForwardTab )
685 return; // the jumpTable is ok
687 bIsForwardTab = false;
689 sal_Int32 n, nLen = sSrchStr2.getLength();
690 pJumpTable2.reset( new TextSearchJumpTable );
692 for( n = nLen-1; n > 0; --n )
694 sal_Unicode cCh = sSrchStr2[n];
695 TextSearchJumpTable::value_type aEntry( cCh, n );
696 ::std::pair< TextSearchJumpTable::iterator, bool > aPair =
697 pJumpTable2->insert( aEntry );
698 if ( !aPair.second )
699 (*(aPair.first)).second = n;
703 sal_Int32 TextSearch::GetDiff( const sal_Unicode cChr ) const
705 TextSearchJumpTable *pJump;
706 OUString sSearchKey;
708 if ( bUsePrimarySrchStr ) {
709 pJump = pJumpTable.get();
710 sSearchKey = sSrchStr;
711 } else {
712 pJump = pJumpTable2.get();
713 sSearchKey = sSrchStr2;
716 TextSearchJumpTable::const_iterator iLook = pJump->find( cChr );
717 if ( iLook == pJump->end() )
718 return sSearchKey.getLength();
719 return (*iLook).second;
723 SearchResult TextSearch::NSrchFrwrd( std::unique_lock<std::mutex>& /*rGuard*/, const OUString& searchStr, sal_Int32 startPos, sal_Int32 endPos )
725 SearchResult aRet;
726 aRet.subRegExpressions = 0;
728 OUString sSearchKey = bUsePrimarySrchStr ? sSrchStr : sSrchStr2;
730 sal_Int32 nSuchIdx = searchStr.getLength();
731 sal_Int32 nEnd = endPos;
732 if( !nSuchIdx || !sSearchKey.getLength() || sSearchKey.getLength() > nSuchIdx )
733 return aRet;
736 if( nEnd < sSearchKey.getLength() ) // position inside the search region ?
737 return aRet;
739 nEnd -= sSearchKey.getLength();
741 if (bUsePrimarySrchStr)
742 MakeForwardTab(); // create the jumptable
743 else
744 MakeForwardTab2();
746 for (sal_Int32 nCmpIdx = startPos; // start position for the search
747 nCmpIdx <= nEnd;
748 nCmpIdx += GetDiff( searchStr[nCmpIdx + sSearchKey.getLength()-1]))
750 nSuchIdx = sSearchKey.getLength() - 1;
751 while( nSuchIdx >= 0 && sSearchKey[nSuchIdx] == searchStr[nCmpIdx + nSuchIdx])
753 if( nSuchIdx == 0 )
755 if( SearchFlags::NORM_WORD_ONLY & aSrchPara.searchFlag )
757 sal_Int32 nFndEnd = nCmpIdx + sSearchKey.getLength();
758 bool bAtStart = !nCmpIdx;
759 bool bAtEnd = nFndEnd == endPos;
760 bool bDelimBefore = bAtStart || IsDelimiter( searchStr, nCmpIdx-1 );
761 bool bDelimBehind = bAtEnd || IsDelimiter( searchStr, nFndEnd );
762 // * 1 -> only one word in the paragraph
763 // * 2 -> at begin of paragraph
764 // * 3 -> at end of paragraph
765 // * 4 -> inside the paragraph
766 if( !( ( bAtStart && bAtEnd ) || // 1
767 ( bAtStart && bDelimBehind ) || // 2
768 ( bAtEnd && bDelimBefore ) || // 3
769 ( bDelimBefore && bDelimBehind ))) // 4
770 break;
773 aRet.subRegExpressions = 1;
774 aRet.startOffset = { nCmpIdx };
775 aRet.endOffset = { nCmpIdx + sSearchKey.getLength() };
777 return aRet;
779 else
780 nSuchIdx--;
783 return aRet;
786 SearchResult TextSearch::NSrchBkwrd( std::unique_lock<std::mutex>& /*rGuard*/,const OUString& searchStr, sal_Int32 startPos, sal_Int32 endPos )
788 SearchResult aRet;
789 aRet.subRegExpressions = 0;
791 OUString sSearchKey = bUsePrimarySrchStr ? sSrchStr : sSrchStr2;
793 sal_Int32 nSuchIdx = searchStr.getLength();
794 sal_Int32 nEnd = endPos;
795 if( nSuchIdx == 0 || sSearchKey.isEmpty() || sSearchKey.getLength() > nSuchIdx)
796 return aRet;
798 if (bUsePrimarySrchStr)
799 MakeBackwardTab(); // create the jumptable
800 else
801 MakeBackwardTab2();
803 if( nEnd == nSuchIdx ) // end position for the search
804 nEnd = sSearchKey.getLength();
805 else
806 nEnd += sSearchKey.getLength();
808 sal_Int32 nCmpIdx = startPos; // start position for the search
810 while (nCmpIdx >= nEnd)
812 nSuchIdx = 0;
813 while( nSuchIdx < sSearchKey.getLength() && sSearchKey[nSuchIdx] ==
814 searchStr[nCmpIdx + nSuchIdx - sSearchKey.getLength()] )
815 nSuchIdx++;
816 if( nSuchIdx >= sSearchKey.getLength() )
818 if( SearchFlags::NORM_WORD_ONLY & aSrchPara.searchFlag )
820 sal_Int32 nFndStt = nCmpIdx - sSearchKey.getLength();
821 bool bAtStart = !nFndStt;
822 bool bAtEnd = nCmpIdx == startPos;
823 bool bDelimBehind = bAtEnd || IsDelimiter( searchStr, nCmpIdx );
824 bool bDelimBefore = bAtStart || // begin of paragraph
825 IsDelimiter( searchStr, nFndStt-1 );
826 // * 1 -> only one word in the paragraph
827 // * 2 -> at begin of paragraph
828 // * 3 -> at end of paragraph
829 // * 4 -> inside the paragraph
830 if( ( bAtStart && bAtEnd ) || // 1
831 ( bAtStart && bDelimBehind ) || // 2
832 ( bAtEnd && bDelimBefore ) || // 3
833 ( bDelimBefore && bDelimBehind )) // 4
835 aRet.subRegExpressions = 1;
836 aRet.startOffset = { nCmpIdx };
837 aRet.endOffset = { nCmpIdx - sSearchKey.getLength() };
838 return aRet;
841 else
843 aRet.subRegExpressions = 1;
844 aRet.startOffset = { nCmpIdx };
845 aRet.endOffset = { nCmpIdx - sSearchKey.getLength() };
846 return aRet;
849 nSuchIdx = GetDiff( searchStr[nCmpIdx - sSearchKey.getLength()] );
850 if( nCmpIdx < nSuchIdx )
851 return aRet;
852 nCmpIdx -= nSuchIdx;
854 return aRet;
857 void TextSearch::RESrchPrepare( const css::util::SearchOptions2& rOptions)
859 TransliterationFlags transliterateFlags = static_cast<TransliterationFlags>(rOptions.transliterateFlags);
860 // select the transliterated pattern string
861 const OUString& rPatternStr =
862 (isSimpleTrans(transliterateFlags) ? sSrchStr
863 : (isComplexTrans(transliterateFlags) ? sSrchStr2 : rOptions.searchString));
865 sal_uInt32 nIcuSearchFlags = UREGEX_UWORD; // request UAX#29 unicode capability
866 // map css::util::SearchFlags to ICU uregex.h flags
867 // TODO: REG_EXTENDED, REG_NOT_BEGINOFLINE, REG_NOT_ENDOFLINE
868 // REG_NEWLINE is neither properly defined nor used anywhere => not implemented
869 // REG_NOSUB is not used anywhere => not implemented
870 // NORM_WORD_ONLY is only used for SearchAlgorithm==Absolute
871 // LEV_RELAXED is only used for SearchAlgorithm==Approximate
872 // Note that the search flag ALL_IGNORE_CASE is deprecated in UNO
873 // probably because the transliteration flag IGNORE_CASE handles it as well.
874 if( (rOptions.searchFlag & css::util::SearchFlags::ALL_IGNORE_CASE) != 0
875 || (transliterateFlags & TransliterationFlags::IGNORE_CASE))
876 nIcuSearchFlags |= UREGEX_CASE_INSENSITIVE;
877 UErrorCode nIcuErr = U_ZERO_ERROR;
878 // assumption: transliteration didn't mangle regexp control chars
879 icu::UnicodeString aIcuSearchPatStr( reinterpret_cast<const UChar*>(rPatternStr.getStr()), rPatternStr.getLength());
880 #ifndef DISABLE_WORDBOUND_EMULATION
881 // for convenience specific syntax elements of the old regex engine are emulated
882 // - by replacing \< with "word-break followed by a look-ahead word-char"
883 static const icu::UnicodeString aChevronPatternB( "\\\\<", -1, icu::UnicodeString::kInvariant);
884 static const icu::UnicodeString aChevronReplaceB( "\\\\b(?=\\\\w)", -1, icu::UnicodeString::kInvariant);
885 static icu::RegexMatcher aChevronMatcherB( aChevronPatternB, 0, nIcuErr);
886 aChevronMatcherB.reset( aIcuSearchPatStr);
887 aIcuSearchPatStr = aChevronMatcherB.replaceAll( aChevronReplaceB, nIcuErr);
888 aChevronMatcherB.reset();
889 // - by replacing \> with "look-behind word-char followed by a word-break"
890 static const icu::UnicodeString aChevronPatternE( "\\\\>", -1, icu::UnicodeString::kInvariant);
891 static const icu::UnicodeString aChevronReplaceE( "(?<=\\\\w)\\\\b", -1, icu::UnicodeString::kInvariant);
892 static icu::RegexMatcher aChevronMatcherE( aChevronPatternE, 0, nIcuErr);
893 aChevronMatcherE.reset( aIcuSearchPatStr);
894 aIcuSearchPatStr = aChevronMatcherE.replaceAll( aChevronReplaceE, nIcuErr);
895 aChevronMatcherE.reset();
896 #endif
897 pRegexMatcher.reset( new icu::RegexMatcher( aIcuSearchPatStr, nIcuSearchFlags, nIcuErr) );
898 if (nIcuErr)
900 SAL_INFO( "i18npool", "TextSearch::RESrchPrepare UErrorCode " << nIcuErr);
901 pRegexMatcher.reset();
903 else
905 // Pathological patterns may result in exponential run time making the
906 // application appear to be frozen. Limit that. Documentation for this
907 // call says
908 // https://unicode-org.github.io/icu-docs/apidoc/released/icu4c/classicu_1_1RegexMatcher.html#a6ebcfcab4fe6a38678c0291643a03a00
909 // "The units of the limit are steps of the match engine.
910 // Correspondence with actual processor time will depend on the speed
911 // of the processor and the details of the specific pattern, but will
912 // typically be on the order of milliseconds."
913 // Just what is a good value? 42 is always an answer ... the 23 enigma
914 // as well... which on the dev's machine is roughly 50 seconds with the
915 // pattern of fdo#70627.
916 /* TODO: make this a configuration settable value and possibly take
917 * complexity of expression into account and maybe even length of text
918 * to be matched; currently (2013-11-25) that is at most one 64k
919 * paragraph per RESrchFrwrd()/RESrchBkwrd() call. */
920 pRegexMatcher->setTimeLimit( 23*1000, nIcuErr);
925 static bool lcl_findRegex(std::unique_ptr<icu::RegexMatcher> const& pRegexMatcher,
926 sal_Int32 nStartPos, sal_Int32 nEndPos, UErrorCode& rIcuErr)
928 pRegexMatcher->region(nStartPos, nEndPos, rIcuErr);
929 pRegexMatcher->useAnchoringBounds(false); // use whole text's anchoring bounds, not region's
930 pRegexMatcher->useTransparentBounds(true); // take text outside of the region into account for
931 // look-ahead/behind assertions
933 if (!pRegexMatcher->find(rIcuErr))
935 /* TODO: future versions could pass the UErrorCode or translations
936 * thereof to the caller, for example to inform the user of
937 * U_REGEX_TIME_OUT. The strange thing though is that an error is set
938 * only after the second call that returns immediately and not if
939 * timeout occurred on the first call?!? */
940 SAL_INFO( "i18npool", "lcl_findRegex UErrorCode " << rIcuErr);
941 return false;
943 return true;
946 SearchResult TextSearch::RESrchFrwrd( std::unique_lock<std::mutex>& /*rGuard*/, const OUString& searchStr,
947 sal_Int32 startPos, sal_Int32 endPos )
949 SearchResult aRet;
950 aRet.subRegExpressions = 0;
951 if( !pRegexMatcher)
952 return aRet;
954 if( endPos > searchStr.getLength())
955 endPos = searchStr.getLength();
957 // use the ICU RegexMatcher to find the matches
958 UErrorCode nIcuErr = U_ZERO_ERROR;
959 const icu::UnicodeString aSearchTargetStr(false, reinterpret_cast<const UChar*>(searchStr.getStr()),
960 searchStr.getLength());
961 pRegexMatcher->reset( aSearchTargetStr);
962 // search until there is a valid match
963 for(;;)
965 if (!lcl_findRegex( pRegexMatcher, startPos, endPos, nIcuErr))
966 return aRet;
968 // #i118887# ignore zero-length matches e.g. "a*" in "bc"
969 int nStartOfs = pRegexMatcher->start( nIcuErr);
970 int nEndOfs = pRegexMatcher->end( nIcuErr);
971 if( nStartOfs < nEndOfs)
972 break;
973 // If the zero-length match is behind the string, do not match it again
974 // and again until startPos reaches there. A match behind the string is
975 // a "$" anchor.
976 if (nStartOfs == endPos)
977 break;
978 // try at next position if there was a zero-length match
979 if( ++startPos >= endPos)
980 return aRet;
983 // extract the result of the search
984 const int nGroupCount = pRegexMatcher->groupCount();
985 aRet.subRegExpressions = nGroupCount + 1;
986 aRet.startOffset.realloc( aRet.subRegExpressions);
987 auto pstartOffset = aRet.startOffset.getArray();
988 aRet.endOffset.realloc( aRet.subRegExpressions);
989 auto pendOffset = aRet.endOffset.getArray();
990 pstartOffset[0] = pRegexMatcher->start( nIcuErr);
991 pendOffset[0] = pRegexMatcher->end( nIcuErr);
992 for( int i = 1; i <= nGroupCount; ++i) {
993 pstartOffset[i] = pRegexMatcher->start( i, nIcuErr);
994 pendOffset[i] = pRegexMatcher->end( i, nIcuErr);
997 return aRet;
1000 SearchResult TextSearch::RESrchBkwrd( std::unique_lock<std::mutex>& /*rGuard*/, const OUString& searchStr,
1001 sal_Int32 startPos, sal_Int32 endPos )
1003 // NOTE: for backwards search callers provide startPos/endPos inverted!
1004 SearchResult aRet;
1005 aRet.subRegExpressions = 0;
1006 if( !pRegexMatcher)
1007 return aRet;
1009 if( startPos > searchStr.getLength())
1010 startPos = searchStr.getLength();
1012 // use the ICU RegexMatcher to find the matches
1013 // TODO: use ICU's backward searching once it becomes available
1014 // as its replacement using forward search is not as good as the real thing
1015 UErrorCode nIcuErr = U_ZERO_ERROR;
1016 const icu::UnicodeString aSearchTargetStr(false, reinterpret_cast<const UChar*>(searchStr.getStr()),
1017 searchStr.getLength());
1018 pRegexMatcher->reset( aSearchTargetStr);
1019 if (!lcl_findRegex( pRegexMatcher, endPos, startPos, nIcuErr))
1020 return aRet;
1022 // find the last match
1023 int nLastPos = 0;
1024 int nFoundEnd = 0;
1025 int nGoodPos = 0, nGoodEnd = 0;
1026 bool bFirst = true;
1027 do {
1028 nLastPos = pRegexMatcher->start( nIcuErr);
1029 nFoundEnd = pRegexMatcher->end( nIcuErr);
1030 if (nLastPos < nFoundEnd)
1032 // remember last non-zero-length match
1033 nGoodPos = nLastPos;
1034 nGoodEnd = nFoundEnd;
1036 if( nFoundEnd >= startPos)
1037 break;
1038 bFirst = false;
1039 if( nFoundEnd == nLastPos)
1040 ++nFoundEnd;
1041 } while( lcl_findRegex( pRegexMatcher, nFoundEnd, startPos, nIcuErr));
1043 // Ignore all zero-length matches except "$" anchor on first match.
1044 if (nGoodPos == nGoodEnd)
1046 if (bFirst && nLastPos == startPos)
1047 nGoodPos = nLastPos;
1048 else
1049 return aRet;
1052 // find last match again to get its details
1053 lcl_findRegex( pRegexMatcher, nGoodPos, startPos, nIcuErr);
1055 // fill in the details of the last match
1056 const int nGroupCount = pRegexMatcher->groupCount();
1057 aRet.subRegExpressions = nGroupCount + 1;
1058 aRet.startOffset.realloc( aRet.subRegExpressions);
1059 auto pstartOffset = aRet.startOffset.getArray();
1060 aRet.endOffset.realloc( aRet.subRegExpressions);
1061 auto pendOffset = aRet.endOffset.getArray();
1062 // NOTE: existing users of backward search seem to expect startOfs/endOfs being inverted!
1063 pstartOffset[0] = pRegexMatcher->end( nIcuErr);
1064 pendOffset[0] = pRegexMatcher->start( nIcuErr);
1065 for( int i = 1; i <= nGroupCount; ++i) {
1066 pstartOffset[i] = pRegexMatcher->end( i, nIcuErr);
1067 pendOffset[i] = pRegexMatcher->start( i, nIcuErr);
1070 return aRet;
1074 // search for words phonetically
1075 SearchResult TextSearch::ApproxSrchFrwrd( std::unique_lock<std::mutex>& /*rGuard*/, const OUString& searchStr,
1076 sal_Int32 startPos, sal_Int32 endPos )
1078 SearchResult aRet;
1079 aRet.subRegExpressions = 0;
1081 if( !xBreak.is() )
1082 return aRet;
1084 sal_Int32 nStt, nEnd;
1086 Boundary aWBnd = xBreak->getWordBoundary( searchStr, startPos,
1087 aSrchPara.Locale,
1088 WordType::ANYWORD_IGNOREWHITESPACES, true );
1092 if( aWBnd.startPos >= endPos )
1093 break;
1094 nStt = aWBnd.startPos < startPos ? startPos : aWBnd.startPos;
1095 nEnd = std::min(aWBnd.endPos, endPos);
1097 if( nStt < nEnd &&
1098 pWLD->WLD( searchStr.getStr() + nStt, nEnd - nStt ) <= nLimit )
1100 aRet.subRegExpressions = 1;
1101 aRet.startOffset = { nStt };
1102 aRet.endOffset = { nEnd };
1103 break;
1106 nStt = nEnd - 1;
1107 aWBnd = xBreak->nextWord( searchStr, nStt, aSrchPara.Locale,
1108 WordType::ANYWORD_IGNOREWHITESPACES);
1109 } while( aWBnd.startPos != aWBnd.endPos ||
1110 (aWBnd.endPos != searchStr.getLength() && aWBnd.endPos != nEnd) );
1111 // #i50244# aWBnd.endPos != nEnd : in case there is _no_ word (only
1112 // whitespace) in searchStr, getWordBoundary() returned startPos,startPos
1113 // and nextWord() does also => don't loop forever.
1114 return aRet;
1117 SearchResult TextSearch::ApproxSrchBkwrd( std::unique_lock<std::mutex>& /*rGuard*/, const OUString& searchStr,
1118 sal_Int32 startPos, sal_Int32 endPos )
1120 SearchResult aRet;
1121 aRet.subRegExpressions = 0;
1123 if( !xBreak.is() )
1124 return aRet;
1126 sal_Int32 nStt, nEnd;
1128 Boundary aWBnd = xBreak->getWordBoundary( searchStr, startPos,
1129 aSrchPara.Locale,
1130 WordType::ANYWORD_IGNOREWHITESPACES, true );
1134 if( aWBnd.endPos <= endPos )
1135 break;
1136 nStt = aWBnd.startPos < endPos ? endPos : aWBnd.startPos;
1137 nEnd = std::min(aWBnd.endPos, startPos);
1139 if( nStt < nEnd &&
1140 pWLD->WLD( searchStr.getStr() + nStt, nEnd - nStt ) <= nLimit )
1142 aRet.subRegExpressions = 1;
1143 aRet.startOffset = { nEnd };
1144 aRet.endOffset = { nStt };
1145 break;
1147 if( !nStt )
1148 break;
1150 aWBnd = xBreak->previousWord( searchStr, nStt, aSrchPara.Locale,
1151 WordType::ANYWORD_IGNOREWHITESPACES);
1152 } while( aWBnd.startPos != aWBnd.endPos || aWBnd.endPos != searchStr.getLength() );
1153 return aRet;
1157 namespace {
1158 void setWildcardMatch( css::util::SearchResult& rRes, sal_Int32 nStartOffset, sal_Int32 nEndOffset )
1160 rRes.subRegExpressions = 1;
1161 rRes.startOffset = { nStartOffset };
1162 rRes.endOffset = { nEndOffset };
1166 SearchResult TextSearch::WildcardSrchFrwrd( std::unique_lock<std::mutex>& /*rGuard*/, const OUString& searchStr, sal_Int32 nStartPos, sal_Int32 nEndPos )
1168 SearchResult aRes;
1169 aRes.subRegExpressions = 0; // no match
1170 sal_Int32 nStartOffset = nStartPos;
1171 sal_Int32 nEndOffset = nEndPos;
1173 const sal_Int32 nStringLen = searchStr.getLength();
1175 // Forward nStartPos inclusive, nEndPos exclusive, but allow for empty
1176 // string match with [0,0).
1177 if (nStartPos < 0 || nEndPos > nStringLen || nEndPos < nStartPos ||
1178 (nStartPos == nStringLen && (nStringLen != 0 || nStartPos != nEndPos)))
1179 return aRes;
1181 const OUString& rPattern = (bUsePrimarySrchStr ? sSrchStr : sSrchStr2);
1182 const sal_Int32 nPatternLen = rPattern.getLength();
1184 // Handle special cases empty pattern and/or string outside of the loop to
1185 // not add performance penalties there and simplify.
1186 if (nStartPos == nEndPos)
1188 sal_Int32 i = 0;
1189 while (i < nPatternLen && rPattern[i] == '*')
1190 ++i;
1191 if (i == nPatternLen)
1192 setWildcardMatch( aRes, nStartOffset, nEndOffset);
1193 return aRes;
1196 // Empty pattern does not match any non-empty string.
1197 if (!nPatternLen)
1198 return aRes;
1200 bool bRewind = false;
1201 sal_uInt32 cPattern = 0;
1202 sal_Int32 nPattern = 0;
1203 sal_Int32 nAfterFakePattern = nPattern;
1204 if (mbWildcardAllowSubstring)
1206 // Fake a leading '*' wildcard.
1207 cPattern = '*';
1208 bRewind = true;
1209 // Assume a non-'*' pattern character follows. If it is a '*' instead
1210 // that will be handled in the loop by setting nPat.
1211 sal_uInt32 cu = rPattern.iterateCodePoints( &nAfterFakePattern);
1212 if (cu == mcWildcardEscapeChar && mcWildcardEscapeChar && nAfterFakePattern < nPatternLen)
1213 rPattern.iterateCodePoints( &nAfterFakePattern);
1216 sal_Int32 nString = nStartPos, nPat = -1, nStr = -1, nLastAsterisk = -1;
1217 sal_uInt32 cPatternAfterAsterisk = 0;
1218 bool bEscaped = false, bEscapedAfterAsterisk = false;
1220 // The loop code tries to avoid costly calls to iterateCodePoints() when
1221 // possible.
1225 if (bRewind)
1227 // Reuse cPattern after '*', nPattern was correspondingly
1228 // incremented to point behind cPattern.
1229 bRewind = false;
1231 else if (nPattern < nPatternLen)
1233 // nPattern will be incremented by iterateCodePoints().
1234 cPattern = rPattern.iterateCodePoints( &nPattern);
1235 if (cPattern == mcWildcardEscapeChar && mcWildcardEscapeChar && nPattern < nPatternLen)
1237 bEscaped = true;
1238 cPattern = rPattern.iterateCodePoints( &nPattern);
1241 else
1243 // A trailing '*' is handled below.
1244 if (mbWildcardAllowSubstring)
1246 // If the pattern is consumed and substring match allowed we're good.
1247 setWildcardMatch( aRes, nStartOffset, nString);
1248 return aRes;
1250 else if (nString < nEndPos && nLastAsterisk >= 0)
1252 // If substring match is not allowed try a greedy '*' match.
1253 nPattern = nLastAsterisk;
1254 continue; // do
1256 else
1257 return aRes;
1260 if (cPattern == '*' && !bEscaped)
1262 // '*' is one code unit, so not using iterateCodePoints() is ok.
1263 while (nPattern < nPatternLen && rPattern[nPattern] == '*')
1264 ++nPattern;
1266 if (nPattern >= nPatternLen)
1268 // Last pattern is '*', remaining string matches.
1269 setWildcardMatch( aRes, nStartOffset, nEndOffset);
1270 return aRes;
1273 nLastAsterisk = nPattern; // Remember last encountered '*'.
1275 // cPattern will be the next non-'*' character, nPattern
1276 // incremented.
1277 cPattern = rPattern.iterateCodePoints( &nPattern);
1278 if (cPattern == mcWildcardEscapeChar && mcWildcardEscapeChar && nPattern < nPatternLen)
1280 bEscaped = true;
1281 cPattern = rPattern.iterateCodePoints( &nPattern);
1284 cPatternAfterAsterisk = cPattern;
1285 bEscapedAfterAsterisk = bEscaped;
1286 nPat = nPattern; // Remember position of pattern behind '*', already incremented.
1287 nStr = nString; // Remember the current string to be matched.
1290 if (nString >= nEndPos)
1291 // Whatever follows in pattern, string will not match.
1292 return aRes;
1294 // nString will be incremented by iterateCodePoints().
1295 sal_uInt32 cString = searchStr.iterateCodePoints( &nString);
1297 if ((cPattern != '?' || bEscaped) && cPattern != cString)
1299 if (nPat == -1)
1300 // Non-match already without any '*' pattern.
1301 return aRes;
1303 bRewind = true;
1304 nPattern = nPat; // Rewind pattern to character behind '*', already incremented.
1305 cPattern = cPatternAfterAsterisk;
1306 bEscaped = bEscapedAfterAsterisk;
1307 searchStr.iterateCodePoints( &nStr);
1308 nString = nStr; // Restore incremented remembered string position.
1309 if (nPat == nAfterFakePattern)
1311 // Next start offset will be the next character.
1312 nStartOffset = nString;
1315 else
1317 // An unescaped '?' pattern matched any character, or characters
1318 // matched. Reset only escaped state.
1319 bEscaped = false;
1322 while (nString < nEndPos);
1324 if (bRewind)
1325 return aRes;
1327 // Eat trailing '*' pattern that matches anything, including nothing.
1328 // '*' is one code unit, so not using iterateCodePoints() is ok.
1329 while (nPattern < nPatternLen && rPattern[nPattern] == '*')
1330 ++nPattern;
1332 if (nPattern == nPatternLen)
1333 setWildcardMatch( aRes, nStartOffset, nEndOffset);
1334 return aRes;
1337 SearchResult TextSearch::WildcardSrchBkwrd( std::unique_lock<std::mutex>& /*rGuard*/, const OUString& searchStr, sal_Int32 nStartPos, sal_Int32 nEndPos )
1339 SearchResult aRes;
1340 aRes.subRegExpressions = 0; // no match
1342 sal_Int32 nStartOffset = nStartPos;
1343 sal_Int32 nEndOffset = nEndPos;
1345 const sal_Int32 nStringLen = searchStr.getLength();
1347 // Backward nStartPos exclusive, nEndPos inclusive, but allow for empty
1348 // string match with (0,0].
1349 if (nStartPos > nStringLen || nEndPos < 0 || nStartPos < nEndPos ||
1350 (nEndPos == nStringLen && (nStringLen != 0 || nStartPos != nEndPos)))
1351 return aRes;
1353 const OUString& rPattern = (bUsePrimarySrchStr ? sSrchStr : sSrchStr2);
1354 sal_Int32 nPatternLen = rPattern.getLength();
1356 // Handle special cases empty pattern and/or string outside of the loop to
1357 // not add performance penalties there and simplify.
1358 if (nStartPos == nEndPos)
1360 sal_Int32 i = 0;
1361 while (i < nPatternLen && rPattern[i] == '*')
1362 ++i;
1363 if (i == nPatternLen)
1364 setWildcardMatch( aRes, nStartOffset, nEndOffset);
1365 return aRes;
1368 // Empty pattern does not match any non-empty string.
1369 if (!nPatternLen)
1370 return aRes;
1372 // Reverse escaped patterns to ease the handling of escapes, keeping escape
1373 // and following character as one sequence in backward direction.
1374 if ((bUsePrimarySrchStr && maWildcardReversePattern.isEmpty()) ||
1375 (!bUsePrimarySrchStr && maWildcardReversePattern2.isEmpty()))
1377 OUStringBuffer aPatternBuf( rPattern);
1378 sal_Int32 nIndex = 0;
1379 while (nIndex < nPatternLen)
1381 const sal_Int32 nOld = nIndex;
1382 const sal_uInt32 cu = rPattern.iterateCodePoints( &nIndex);
1383 if (cu == mcWildcardEscapeChar)
1385 if (nIndex < nPatternLen)
1387 if (nIndex - nOld == 1)
1389 // Simply move code units, we already memorized the one
1390 // in 'cu'.
1391 const sal_Int32 nOld2 = nIndex;
1392 rPattern.iterateCodePoints( &nIndex);
1393 for (sal_Int32 i=0; i < nIndex - nOld2; ++i)
1394 aPatternBuf[nOld+i] = rPattern[nOld2+i];
1395 aPatternBuf[nIndex-1] = static_cast<sal_Unicode>(cu);
1397 else
1399 // Copy the escape character code units first in the
1400 // unlikely case that it would not be of BMP.
1401 assert(nIndex - nOld == 2); // it's UTF-16, so...
1402 sal_Unicode buf[2];
1403 buf[0] = rPattern[nOld];
1404 buf[1] = rPattern[nOld+1];
1405 const sal_Int32 nOld2 = nIndex;
1406 rPattern.iterateCodePoints( &nIndex);
1407 for (sal_Int32 i=0; i < nIndex - nOld2; ++i)
1408 aPatternBuf[nOld+i] = rPattern[nOld2+i];
1409 aPatternBuf[nIndex-2] = buf[0];
1410 aPatternBuf[nIndex-1] = buf[1];
1413 else
1415 // Trailing escape would become leading escape, do what?
1416 // Eliminate.
1417 aPatternBuf.remove( nOld, nIndex - nOld);
1421 if (bUsePrimarySrchStr)
1422 maWildcardReversePattern = aPatternBuf.makeStringAndClear();
1423 else
1424 maWildcardReversePattern2 = aPatternBuf.makeStringAndClear();
1426 const OUString& rReversePattern = (bUsePrimarySrchStr ? maWildcardReversePattern : maWildcardReversePattern2);
1427 nPatternLen = rReversePattern.getLength();
1429 bool bRewind = false;
1430 sal_uInt32 cPattern = 0;
1431 sal_Int32 nPattern = nPatternLen;
1432 sal_Int32 nAfterFakePattern = nPattern;
1433 if (mbWildcardAllowSubstring)
1435 // Fake a trailing '*' wildcard.
1436 cPattern = '*';
1437 bRewind = true;
1438 // Assume a non-'*' pattern character follows. If it is a '*' instead
1439 // that will be handled in the loop by setting nPat.
1440 sal_uInt32 cu = rReversePattern.iterateCodePoints( &nAfterFakePattern, -1);
1441 if (cu == mcWildcardEscapeChar && mcWildcardEscapeChar && nAfterFakePattern > 0)
1442 rReversePattern.iterateCodePoints( &nAfterFakePattern, -1);
1445 sal_Int32 nString = nStartPos, nPat = -1, nStr = -1, nLastAsterisk = -1;
1446 sal_uInt32 cPatternAfterAsterisk = 0;
1447 bool bEscaped = false, bEscapedAfterAsterisk = false;
1449 // The loop code tries to avoid costly calls to iterateCodePoints() when
1450 // possible.
1454 if (bRewind)
1456 // Reuse cPattern after '*', nPattern was correspondingly
1457 // decremented to point before cPattern.
1458 bRewind = false;
1460 else if (nPattern > 0)
1462 // nPattern will be decremented by iterateCodePoints().
1463 cPattern = rReversePattern.iterateCodePoints( &nPattern, -1);
1464 if (cPattern == mcWildcardEscapeChar && mcWildcardEscapeChar && nPattern > 0)
1466 bEscaped = true;
1467 cPattern = rReversePattern.iterateCodePoints( &nPattern, -1);
1470 else
1472 // A trailing '*' is handled below.
1473 if (mbWildcardAllowSubstring)
1475 // If the pattern is consumed and substring match allowed we're good.
1476 setWildcardMatch( aRes, nStartOffset, nString);
1477 return aRes;
1479 else if (nString > nEndPos && nLastAsterisk >= 0)
1481 // If substring match is not allowed try a greedy '*' match.
1482 nPattern = nLastAsterisk;
1483 continue; // do
1485 else
1486 return aRes;
1489 if (cPattern == '*' && !bEscaped)
1491 // '*' is one code unit, so not using iterateCodePoints() is ok.
1492 while (nPattern > 0 && rReversePattern[nPattern-1] == '*')
1493 --nPattern;
1495 if (nPattern <= 0)
1497 // First pattern is '*', remaining string matches.
1498 setWildcardMatch( aRes, nStartOffset, nEndOffset);
1499 return aRes;
1502 nLastAsterisk = nPattern; // Remember last encountered '*'.
1504 // cPattern will be the previous non-'*' character, nPattern
1505 // decremented.
1506 cPattern = rReversePattern.iterateCodePoints( &nPattern, -1);
1507 if (cPattern == mcWildcardEscapeChar && mcWildcardEscapeChar && nPattern > 0)
1509 bEscaped = true;
1510 cPattern = rReversePattern.iterateCodePoints( &nPattern, -1);
1513 cPatternAfterAsterisk = cPattern;
1514 bEscapedAfterAsterisk = bEscaped;
1515 nPat = nPattern; // Remember position of pattern before '*', already decremented.
1516 nStr = nString; // Remember the current string to be matched.
1519 if (nString <= nEndPos)
1520 // Whatever leads in pattern, string will not match.
1521 return aRes;
1523 // nString will be decremented by iterateCodePoints().
1524 sal_uInt32 cString = searchStr.iterateCodePoints( &nString, -1);
1526 if ((cPattern != '?' || bEscaped) && cPattern != cString)
1528 if (nPat == -1)
1529 // Non-match already without any '*' pattern.
1530 return aRes;
1532 bRewind = true;
1533 nPattern = nPat; // Rewind pattern to character before '*', already decremented.
1534 cPattern = cPatternAfterAsterisk;
1535 bEscaped = bEscapedAfterAsterisk;
1536 searchStr.iterateCodePoints( &nStr, -1);
1537 nString = nStr; // Restore decremented remembered string position.
1538 if (nPat == nAfterFakePattern)
1540 // Next start offset will be this character (exclusive).
1541 nStartOffset = nString;
1544 else
1546 // An unescaped '?' pattern matched any character, or characters
1547 // matched. Reset only escaped state.
1548 bEscaped = false;
1551 while (nString > nEndPos);
1553 if (bRewind)
1554 return aRes;
1556 // Eat leading '*' pattern that matches anything, including nothing.
1557 // '*' is one code unit, so not using iterateCodePoints() is ok.
1558 while (nPattern > 0 && rReversePattern[nPattern-1] == '*')
1559 --nPattern;
1561 if (nPattern == 0)
1562 setWildcardMatch( aRes, nStartOffset, nEndOffset);
1563 return aRes;
1567 OUString SAL_CALL
1568 TextSearch::getImplementationName()
1570 return u"com.sun.star.util.TextSearch_i18n"_ustr;
1573 sal_Bool SAL_CALL TextSearch::supportsService(const OUString& rServiceName)
1575 return cppu::supportsService(this, rServiceName);
1578 Sequence< OUString > SAL_CALL
1579 TextSearch::getSupportedServiceNames()
1581 return { u"com.sun.star.util.TextSearch"_ustr, u"com.sun.star.util.TextSearch2"_ustr };
1584 extern "C" SAL_DLLPUBLIC_EXPORT css::uno::XInterface*
1585 i18npool_TextSearch_get_implementation(
1586 css::uno::XComponentContext* context , css::uno::Sequence<css::uno::Any> const&)
1588 return cppu::acquire(new TextSearch(context));
1591 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */