i18npool/source/search/textsearch.cxx

   1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
   2 /*
   3  * This file is part of the LibreOffice project.
   4  *
   5  * This Source Code Form is subject to the terms of the Mozilla Public
   6  * License, v. 2.0. If a copy of the MPL was not distributed with this
   7  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
   8  *
   9  * This file incorporates work covered by the following license notice:
  10  *
  11  *   Licensed to the Apache Software Foundation (ASF) under one or more
  12  *   contributor license agreements. See the NOTICE file distributed
  13  *   with this work for additional information regarding copyright
  14  *   ownership. The ASF licenses this file to you under the Apache
  15  *   License, Version 2.0 (the "License"); you may not use this file
  16  *   except in compliance with the License. You may obtain a copy of
  17  *   the License at http://www.apache.org/licenses/LICENSE-2.0 .
  18  */
  19
  20 #include "textsearch.hxx"
  21 #include "levdis.hxx"
  22 #include <com/sun/star/i18n/BreakIterator.hpp>
  23 #include <com/sun/star/util/SearchAlgorithms2.hpp>
  24 #include <com/sun/star/util/SearchFlags.hpp>
  25 #include <com/sun/star/i18n/WordType.hpp>
  26 #include <com/sun/star/i18n/ScriptType.hpp>
  27 #include <com/sun/star/i18n/CharacterIteratorMode.hpp>
  28 #include <com/sun/star/i18n/CharacterClassification.hpp>
  29 #include <com/sun/star/i18n/KCharacterType.hpp>
  30 #include <com/sun/star/i18n/Transliteration.hpp>
  31 #include <cppuhelper/supportsservice.hxx>
  32 #include <cppuhelper/weak.hxx>
  33 #include <i18nutil/transliteration.hxx>
  34 #include <rtl/ustrbuf.hxx>
  35 #include <sal/log.hxx>
  36
  37 #include <unicode/regex.h>
  38
  39 using namespace ::com::sun::star::util;
  40 using namespace ::com::sun::star::uno;
  41 using namespace ::com::sun::star::lang;
  42 using namespace ::com::sun::star::i18n;
  43 using namespace ::com::sun::star;
  44
  45 const TransliterationFlags COMPLEX_TRANS_MASK =
  46     TransliterationFlags::ignoreBaFa_ja_JP |
  47     TransliterationFlags::ignoreIterationMark_ja_JP |
  48     TransliterationFlags::ignoreTiJi_ja_JP |
  49     TransliterationFlags::ignoreHyuByu_ja_JP |
  50     TransliterationFlags::ignoreSeZe_ja_JP |
  51     TransliterationFlags::ignoreIandEfollowedByYa_ja_JP |
  52     TransliterationFlags::ignoreKiKuFollowedBySa_ja_JP |
  53     TransliterationFlags::ignoreProlongedSoundMark_ja_JP;
  54
  55 namespace
  56 {
  57 TransliterationFlags maskComplexTrans( TransliterationFlags n )
  58 {
  59     // IGNORE_KANA and FULLWIDTH_HALFWIDTH are simple but need to take effect
  60     // in complex transliteration.
  61     return
  62         n & (COMPLEX_TRANS_MASK |                       // all set ignore bits
  63         TransliterationFlags::IGNORE_KANA |            // plus IGNORE_KANA bit
  64         TransliterationFlags::FULLWIDTH_HALFWIDTH);    // and the FULLWIDTH_HALFWIDTH value
  65 }
  66
  67 bool isComplexTrans( TransliterationFlags n )
  68 {
  69     return bool(n & COMPLEX_TRANS_MASK);
  70 }
  71
  72 TransliterationFlags maskSimpleTrans( TransliterationFlags n )
  73 {
  74     return n & ~COMPLEX_TRANS_MASK;
  75 }
  76
  77 bool isSimpleTrans( TransliterationFlags n )
  78 {
  79     return bool(maskSimpleTrans(n));
  80 }
  81
  82 // Regex patterns are case sensitive.
  83 TransliterationFlags maskSimpleRegexTrans( TransliterationFlags n )
  84 {
  85     TransliterationFlags m = (n & TransliterationFlags::IGNORE_MASK) & ~TransliterationFlags::IGNORE_CASE;
  86     TransliterationFlags v = n & TransliterationFlags::NON_IGNORE_MASK;
  87     if (v == TransliterationFlags::UPPERCASE_LOWERCASE || v == TransliterationFlags::LOWERCASE_UPPERCASE)
  88         v = TransliterationFlags::NONE;
  89     return (m | v) & ~COMPLEX_TRANS_MASK;
  90 }
  91
  92 bool isSimpleRegexTrans( TransliterationFlags n )
  93 {
  94     return bool(maskSimpleRegexTrans(n));
  95 }
  96
  97 bool isReplacePunctuation( OUString &rStr )
  98 {
  99     return rStr.indexOf(u'\u2018') > -1 ||
 100            rStr.indexOf(u'\u2019') > -1 ||
 101            rStr.indexOf(u'\u201A') > -1 ||
 102            rStr.indexOf(u'\u201B') > -1 ||
 103            rStr.indexOf(u'\u201C') > -1 ||
 104            rStr.indexOf(u'\u201D') > -1 ||
 105            rStr.indexOf(u'\u201E') > -1 ||
 106            rStr.indexOf(u'\u201F') > -1;
 107 }
 108
 109 OUString replacePunctuation( OUString &rStr )
 110 {
 111     return rStr.replace(u'\u2018', '\'')
 112                .replace(u'\u2019', '\'')
 113                .replace(u'\u201A', '\'')
 114                .replace(u'\u201B', '\'')
 115                .replace(u'\u201C', '"')
 116                .replace(u'\u201D', '"')
 117                .replace(u'\u201E', '"')
 118                .replace(u'\u201F', '"');
 119 }
 120 };
 121
 122 TextSearch::TextSearch(const Reference < XComponentContext > & rxContext)
 123         : m_xContext( rxContext )
 124 {
 125     SearchOptions2 aOpt;
 126     aOpt.AlgorithmType2 = SearchAlgorithms2::ABSOLUTE;
 127     aOpt.algorithmType = SearchAlgorithms_ABSOLUTE;
 128     aOpt.searchFlag = SearchFlags::ALL_IGNORE_CASE;
 129     //aOpt.Locale = ???;
 130     setOptions( aOpt );
 131 }
 132
 133 TextSearch::~TextSearch()
 134 {
 135     pRegexMatcher.reset();
 136     pWLD.reset();
 137     pJumpTable.reset();
 138     pJumpTable2.reset();
 139 }
 140
 141 void TextSearch::setOptions2( const SearchOptions2& rOptions )
 142 {
 143     std::unique_lock g(m_aMutex);
 144
 145     aSrchPara = rOptions;
 146
 147     pRegexMatcher.reset();
 148     pWLD.reset();
 149     pJumpTable.reset();
 150     pJumpTable2.reset();
 151     maWildcardReversePattern.clear();
 152     maWildcardReversePattern2.clear();
 153     TransliterationFlags transliterateFlags = static_cast<TransliterationFlags>(aSrchPara.transliterateFlags);
 154     bSearchApostrophe = false;
 155     bool bReplaceApostrophe = false;
 156     if (aSrchPara.AlgorithmType2 == SearchAlgorithms2::REGEXP)
 157     {
 158         // RESrchPrepare will consider aSrchPara.transliterateFlags when
 159         // picking the actual regex pattern
 160         // (sSrchStr|sSrchStr2|SearchOptions2::searchString) and setting
 161         // case-insensitivity. Create transliteration instance, if any, without
 162         // ignore-case so later in TextSearch::searchForward() the string to
 163         // match is not case-altered, leave case-(in)sensitive to regex engine.
 164         transliterateFlags &= ~TransliterationFlags::IGNORE_CASE;
 165     }
 166     else if ( aSrchPara.searchString.indexOf('\'') > - 1 || aSrchPara.searchString.indexOf('"') > - 1 )
 167     {
 168         bSearchApostrophe = true;
 169         bReplaceApostrophe = isReplacePunctuation(aSrchPara.searchString);
 170     }
 171
 172     // Create Transliteration class
 173     if( isSimpleTrans( transliterateFlags) )
 174     {
 175         if( !xTranslit.is() )
 176             xTranslit.set( Transliteration::create( m_xContext ) );
 177         xTranslit->loadModule(
 178              static_cast<TransliterationModules>(maskSimpleTrans(transliterateFlags)),
 179              aSrchPara.Locale);
 180     }
 181     else if( xTranslit.is() )
 182         xTranslit = nullptr;
 183
 184     // Create Transliteration for 2<->1, 2<->2 transliteration
 185     if ( isComplexTrans( transliterateFlags) )
 186     {
 187         if( !xTranslit2.is() )
 188             xTranslit2.set( Transliteration::create( m_xContext ) );
 189         // Load transliteration module
 190         xTranslit2->loadModule(
 191              static_cast<TransliterationModules>(maskComplexTrans(transliterateFlags)),
 192              aSrchPara.Locale);
 193     }
 194
 195     if ( !xBreak.is() )
 196         xBreak = css::i18n::BreakIterator::create( m_xContext );
 197
 198     sSrchStr = aSrchPara.searchString;
 199
 200     // Transliterate search string.
 201     if (aSrchPara.AlgorithmType2 == SearchAlgorithms2::REGEXP)
 202     {
 203         if (isSimpleRegexTrans(transliterateFlags))
 204         {
 205             if (maskSimpleRegexTrans(transliterateFlags) !=
 206                     maskSimpleTrans(transliterateFlags))
 207             {
 208                 css::uno::Reference< XExtendedTransliteration > xTranslitPattern(
 209                          Transliteration::create( m_xContext ));
 210                 if (xTranslitPattern.is())
 211                 {
 212                     xTranslitPattern->loadModule(
 213                             static_cast<TransliterationModules>(maskSimpleRegexTrans(transliterateFlags)),
 214                             aSrchPara.Locale);
 215                     sSrchStr = xTranslitPattern->transliterateString2String(
 216                             aSrchPara.searchString, 0, aSrchPara.searchString.getLength());
 217                 }
 218             }
 219             else
 220             {
 221                 if (xTranslit.is())
 222                     sSrchStr = xTranslit->transliterateString2String(
 223                             aSrchPara.searchString, 0, aSrchPara.searchString.getLength());
 224             }
 225             // xTranslit2 complex transliterated sSrchStr2 is not used in
 226             // regex, see TextSearch::searchForward() and
 227             // TextSearch::searchBackward()
 228         }
 229     }
 230     else
 231     {
 232         if ( xTranslit.is() && isSimpleTrans(transliterateFlags) )
 233             sSrchStr = xTranslit->transliterateString2String(
 234                     aSrchPara.searchString, 0, aSrchPara.searchString.getLength());
 235
 236         if ( xTranslit2.is() && isComplexTrans(transliterateFlags) )
 237             sSrchStr2 = xTranslit2->transliterateString2String(
 238                     aSrchPara.searchString, 0, aSrchPara.searchString.getLength());
 239     }
 240
 241     if ( bReplaceApostrophe )
 242         sSrchStr = replacePunctuation(sSrchStr);
 243
 244     // Take the new SearchOptions2::AlgorithmType2 field and ignore
 245     // SearchOptions::algorithmType
 246     switch( aSrchPara.AlgorithmType2)
 247     {
 248         case SearchAlgorithms2::REGEXP:
 249             fnForward = &TextSearch::RESrchFrwrd;
 250             fnBackward = &TextSearch::RESrchBkwrd;
 251             RESrchPrepare( aSrchPara);
 252             break;
 253
 254         case SearchAlgorithms2::APPROXIMATE:
 255             fnForward = &TextSearch::ApproxSrchFrwrd;
 256             fnBackward = &TextSearch::ApproxSrchBkwrd;
 257
 258             pWLD.reset( new WLevDistance( sSrchStr.getStr(), aSrchPara.changedChars,
 259                     aSrchPara.insertedChars, aSrchPara.deletedChars,
 260                     0 != (SearchFlags::LEV_RELAXED & aSrchPara.searchFlag ) ) );
 261
 262             nLimit = pWLD->GetLimit();
 263             break;
 264
 265         case SearchAlgorithms2::WILDCARD:
 266             mcWildcardEscapeChar = static_cast<sal_uInt32>(aSrchPara.WildcardEscapeCharacter);
 267             mbWildcardAllowSubstring = ((aSrchPara.searchFlag & SearchFlags::WILD_MATCH_SELECTION) == 0);
 268             fnForward = &TextSearch::WildcardSrchFrwrd;
 269             fnBackward = &TextSearch::WildcardSrchBkwrd;
 270             break;
 271
 272         default:
 273             SAL_WARN("i18npool","TextSearch::setOptions2 - default what?");
 274             [[fallthrough]];
 275         case SearchAlgorithms2::ABSOLUTE:
 276             fnForward = &TextSearch::NSrchFrwrd;
 277             fnBackward = &TextSearch::NSrchBkwrd;
 278             break;
 279     }
 280 }
 281
 282 void TextSearch::setOptions( const SearchOptions& rOptions )
 283 {
 284     sal_Int16 nAlgorithmType2;
 285     switch (rOptions.algorithmType)
 286     {
 287         case SearchAlgorithms_REGEXP:
 288             nAlgorithmType2 = SearchAlgorithms2::REGEXP;
 289             break;
 290         case SearchAlgorithms_APPROXIMATE:
 291             nAlgorithmType2 = SearchAlgorithms2::APPROXIMATE;
 292             break;
 293         default:
 294             SAL_WARN("i18npool","TextSearch::setOptions - default what?");
 295             [[fallthrough]];
 296         case SearchAlgorithms_ABSOLUTE:
 297             nAlgorithmType2 = SearchAlgorithms2::ABSOLUTE;
 298             break;
 299     }
 300     // It would be nice if an inherited struct had a ctor that takes an
 301     // instance of the object the struct derived from...
 302     SearchOptions2 aOptions2(
 303             rOptions.algorithmType,
 304             rOptions.searchFlag,
 305             rOptions.searchString,
 306             rOptions.replaceString,
 307             rOptions.Locale,
 308             rOptions.changedChars,
 309             rOptions.deletedChars,
 310             rOptions.insertedChars,
 311             rOptions.transliterateFlags,
 312             nAlgorithmType2,
 313             0   // no wildcard search, no escape character...
 314             );
 315     setOptions2( aOptions2);
 316 }
 317
 318 static sal_Int32 FindPosInSeq_Impl( const Sequence <sal_Int32>& rOff, sal_Int32 nPos )
 319 {
 320     auto pOff = std::find_if(rOff.begin(), rOff.end(),
 321         [nPos](const sal_Int32 nOff) { return nOff >= nPos; });
 322     return static_cast<sal_Int32>(std::distance(rOff.begin(), pOff));
 323 }
 324
 325 SearchResult TextSearch::searchForward( const OUString& searchStr, sal_Int32 startPos, sal_Int32 endPos )
 326 {
 327     std::unique_lock g(m_aMutex);
 328
 329     SearchResult sres;
 330
 331     OUString in_str(searchStr);
 332
 333     // in non-regex mode, allow searching typographical apostrophe with the ASCII one
 334     // to avoid regression after using automatic conversion to U+2019 during typing in Writer
 335     bool bReplaceApostrophe = bSearchApostrophe && isReplacePunctuation(in_str);
 336
 337     bUsePrimarySrchStr = true;
 338
 339     if ( xTranslit.is() )
 340     {
 341         // apply normal transliteration (1<->1, 1<->0)
 342
 343         sal_Int32 nInStartPos = startPos;
 344         if (pRegexMatcher && startPos > 0)
 345         {
 346             // tdf#89665, tdf#75806: An optimization to avoid transliterating the whole string, yet
 347             // transliterate enough of the leading text to allow sensible look-behind assertions.
 348             // 100 is chosen arbitrarily in the hope that look-behind assertions would largely fit.
 349             // See http://userguide.icu-project.org/strings/regexp for look-behind assertion syntax.
 350             // When search regex doesn't start with an assertion, 3 is to allow startPos to be in
 351             // the middle of a surrogate pair, preceded by another surrogate pair.
 352             const sal_Int32 nMaxLeadingLen = aSrchPara.searchString.startsWith("(?") ? 100 : 3;
 353             nInStartPos -= std::min(nMaxLeadingLen, startPos);
 354         }
 355         sal_Int32 nInEndPos = endPos;
 356         if (pRegexMatcher && endPos < searchStr.getLength())
 357         {
 358             // tdf#65038: ditto for look-ahead assertions
 359             const sal_Int32 nMaxTrailingLen = aSrchPara.searchString.endsWith(")") ? 100 : 3;
 360             nInEndPos += std::min(nMaxTrailingLen, searchStr.getLength() - endPos);
 361         }
 362
 363         css::uno::Sequence<sal_Int32> offset(nInEndPos - nInStartPos);
 364         in_str = xTranslit->transliterate(searchStr, nInStartPos, nInEndPos - nInStartPos, offset);
 365
 366         if ( bReplaceApostrophe )
 367             in_str = replacePunctuation(in_str);
 368
 369         // JP 20.6.2001: also the start and end positions must be corrected!
 370         sal_Int32 newStartPos =
 371             (startPos == 0) ? 0 : FindPosInSeq_Impl( offset, startPos );
 372
 373         sal_Int32 newEndPos = (endPos < searchStr.getLength())
 374             ? FindPosInSeq_Impl( offset, endPos )
 375             : in_str.getLength();
 376
 377         sres = (this->*fnForward)( g, in_str, newStartPos, newEndPos );
 378
 379         // Map offsets back to untransliterated string.
 380         const sal_Int32 nOffsets = offset.getLength();
 381         if (nOffsets)
 382         {
 383             auto sres_startOffsetRange = asNonConstRange(sres.startOffset);
 384             auto sres_endOffsetRange = asNonConstRange(sres.endOffset);
 385             // For regex nGroups is the number of groups+1 with group 0 being
 386             // the entire match.
 387             const sal_Int32 nGroups = sres.startOffset.getLength();
 388             for ( sal_Int32 k = 0; k < nGroups; k++ )
 389             {
 390                 const sal_Int32 nStart = sres.startOffset[k];
 391                 // Result offsets are negative (-1) if a group expression was
 392                 // not matched.
 393                 if (nStart >= 0)
 394                     sres_startOffsetRange[k] = (nStart < nOffsets ? offset[nStart] : (offset[nOffsets - 1] + 1));
 395                 // JP 20.6.2001: end is ever exclusive and then don't return
 396                 //               the position of the next character - return the
 397                 //               next position behind the last found character!
 398                 //               "a b c" find "b" must return 2,3 and not 2,4!!!
 399                 const sal_Int32 nStop = sres.endOffset[k];
 400                 if (nStop >= 0)
 401                 {
 402                     if (nStop > 0)
 403                         sres_endOffsetRange[k] = offset[(nStop <= nOffsets ? nStop : nOffsets) - 1] + 1;
 404                     else
 405                         sres_endOffsetRange[k] = offset[0];
 406                 }
 407             }
 408         }
 409     }
 410     else
 411     {
 412         if ( bReplaceApostrophe )
 413             in_str = in_str.replace(u'\u2019', '\'');
 414
 415         sres = (this->*fnForward)( g, in_str, startPos, endPos );
 416     }
 417
 418     if ( xTranslit2.is() && aSrchPara.AlgorithmType2 != SearchAlgorithms2::REGEXP)
 419     {
 420         SearchResult sres2;
 421
 422         in_str = searchStr;
 423         css::uno::Sequence <sal_Int32> offset( in_str.getLength());
 424
 425         in_str = xTranslit2->transliterate( searchStr, 0, in_str.getLength(), offset );
 426
 427         if( startPos )
 428             startPos = FindPosInSeq_Impl( offset, startPos );
 429
 430         if( endPos < searchStr.getLength() )
 431             endPos = FindPosInSeq_Impl( offset, endPos );
 432         else
 433             endPos = in_str.getLength();
 434
 435         bUsePrimarySrchStr = false;
 436         sres2 = (this->*fnForward)( g, in_str, startPos, endPos );
 437         auto sres2_startOffsetRange = asNonConstRange(sres2.startOffset);
 438         auto sres2_endOffsetRange = asNonConstRange(sres2.endOffset);
 439
 440         for ( int k = 0; k < sres2.startOffset.getLength(); k++ )
 441         {
 442             if (sres2.startOffset[k])
 443                 sres2_startOffsetRange[k] = offset[sres2.startOffset[k]-1] + 1;
 444             if (sres2.endOffset[k])
 445                 sres2_endOffsetRange[k] = offset[sres2.endOffset[k]-1] + 1;
 446         }
 447
 448         // pick first and long one
 449         if ( sres.subRegExpressions == 0)
 450             return sres2;
 451         if ( sres2.subRegExpressions == 1)
 452         {
 453             if ( sres.startOffset[0] > sres2.startOffset[0])
 454                 return sres2;
 455             else if ( sres.startOffset[0] == sres2.startOffset[0] &&
 456                     sres.endOffset[0] < sres2.endOffset[0])
 457                 return sres2;
 458         }
 459     }
 460
 461     return sres;
 462 }
 463
 464 SearchResult TextSearch::searchBackward( const OUString& searchStr, sal_Int32 startPos, sal_Int32 endPos )
 465 {
 466     std::unique_lock g(m_aMutex);
 467
 468     SearchResult sres;
 469
 470     OUString in_str(searchStr);
 471
 472     // in non-regex mode, allow searching typographical apostrophe with the ASCII one
 473     // to avoid regression after using automatic conversion to U+2019 during typing in Writer
 474     bool bReplaceApostrophe = bSearchApostrophe && isReplacePunctuation(in_str);
 475
 476     bUsePrimarySrchStr = true;
 477
 478     if ( xTranslit.is() )
 479     {
 480         // apply only simple 1<->1 transliteration here
 481         css::uno::Sequence<sal_Int32> offset(startPos - endPos);
 482         in_str = xTranslit->transliterate( searchStr, endPos, startPos - endPos, offset );
 483
 484         if ( bReplaceApostrophe )
 485             in_str = replacePunctuation(in_str);
 486
 487         // JP 20.6.2001: also the start and end positions must be corrected!
 488         sal_Int32 const newStartPos = (startPos < searchStr.getLength())
 489             ? FindPosInSeq_Impl( offset, startPos )
 490             : in_str.getLength();
 491
 492         sal_Int32 const newEndPos =
 493             (endPos == 0) ? 0 : FindPosInSeq_Impl( offset, endPos );
 494
 495         // TODO: this would need nExtraOffset handling to avoid $ matching
 496         // if (pRegexMatcher && startPos < searchStr.getLength())
 497         // but that appears to be impossible with ICU regex
 498
 499         sres = (this->*fnBackward)( g, in_str, newStartPos, newEndPos );
 500
 501         // Map offsets back to untransliterated string.
 502         const sal_Int32 nOffsets = offset.getLength();
 503         if (nOffsets)
 504         {
 505             auto sres_startOffsetRange = asNonConstRange(sres.startOffset);
 506             auto sres_endOffsetRange = asNonConstRange(sres.endOffset);
 507             // For regex nGroups is the number of groups+1 with group 0 being
 508             // the entire match.
 509             const sal_Int32 nGroups = sres.startOffset.getLength();
 510             for ( sal_Int32 k = 0; k < nGroups; k++ )
 511             {
 512                 const sal_Int32 nStart = sres.startOffset[k];
 513                 // Result offsets are negative (-1) if a group expression was
 514                 // not matched.
 515                 if (nStart >= 0)
 516                 {
 517                     if (nStart > 0)
 518                         sres_startOffsetRange[k] = offset[(nStart <= nOffsets ? nStart : nOffsets) - 1] + 1;
 519                     else
 520                         sres_startOffsetRange[k] = offset[0];
 521                 }
 522                 // JP 20.6.2001: end is ever exclusive and then don't return
 523                 //               the position of the next character - return the
 524                 //               next position behind the last found character!
 525                 //               "a b c" find "b" must return 2,3 and not 2,4!!!
 526                 const sal_Int32 nStop = sres.endOffset[k];
 527                 if (nStop >= 0)
 528                     sres_endOffsetRange[k] = (nStop < nOffsets ? offset[nStop] : (offset[nOffsets - 1] + 1));
 529             }
 530         }
 531     }
 532     else
 533     {
 534         if ( bReplaceApostrophe )
 535             in_str = replacePunctuation(in_str);
 536
 537         sres = (this->*fnBackward)( g, in_str, startPos, endPos );
 538     }
 539
 540     if ( xTranslit2.is() && aSrchPara.AlgorithmType2 != SearchAlgorithms2::REGEXP )
 541     {
 542         SearchResult sres2;
 543
 544         in_str = searchStr;
 545         css::uno::Sequence <sal_Int32> offset( in_str.getLength());
 546
 547         in_str = xTranslit2->transliterate(searchStr, 0, in_str.getLength(), offset);
 548
 549         if( startPos < searchStr.getLength() )
 550             startPos = FindPosInSeq_Impl( offset, startPos );
 551         else
 552             startPos = in_str.getLength();
 553
 554         if( endPos )
 555             endPos = FindPosInSeq_Impl( offset, endPos );
 556
 557         bUsePrimarySrchStr = false;
 558         sres2 = (this->*fnBackward)( g, in_str, startPos, endPos );
 559         auto sres2_startOffsetRange = asNonConstRange(sres2.startOffset);
 560         auto sres2_endOffsetRange = asNonConstRange(sres2.endOffset);
 561
 562         for( int k = 0; k < sres2.startOffset.getLength(); k++ )
 563         {
 564             if (sres2.startOffset[k])
 565                 sres2_startOffsetRange[k] = offset[sres2.startOffset[k]-1]+1;
 566             if (sres2.endOffset[k])
 567                 sres2_endOffsetRange[k] = offset[sres2.endOffset[k]-1]+1;
 568         }
 569
 570         // pick last and long one
 571         if ( sres.subRegExpressions == 0 )
 572             return sres2;
 573         if ( sres2.subRegExpressions == 1 )
 574         {
 575             if ( sres.startOffset[0] < sres2.startOffset[0] )
 576                 return sres2;
 577             if ( sres.startOffset[0] == sres2.startOffset[0] &&
 578                     sres.endOffset[0] > sres2.endOffset[0] )
 579                 return sres2;
 580         }
 581     }
 582
 583     return sres;
 584 }
 585
 586
 587 bool TextSearch::IsDelimiter( const OUString& rStr, sal_Int32 nPos ) const
 588 {
 589     bool bRet = true;
 590     if( '\x7f' != rStr[nPos])
 591     {
 592         if ( !xCharClass.is() )
 593              xCharClass = CharacterClassification::create( m_xContext );
 594         sal_Int32 nCType = xCharClass->getCharacterType( rStr, nPos,
 595                 aSrchPara.Locale );
 596         if( 0 != (( KCharacterType::DIGIT | KCharacterType::ALPHA |
 597                         KCharacterType::LETTER ) & nCType ) )
 598             bRet = false;
 599     }
 600     return bRet;
 601 }
 602
 603 // --------- helper methods for Boyer-Moore like text searching ----------
 604 // TODO: use ICU's regex UREGEX_LITERAL mode instead when it becomes available
 605
 606 void TextSearch::MakeForwardTab()
 607 {
 608     // create the jumptable for the search text
 609
 610     if( pJumpTable && bIsForwardTab )
 611     {
 612         return; // the jumpTable is ok
 613     }
 614     bIsForwardTab = true;
 615
 616     sal_Int32 n, nLen = sSrchStr.getLength();
 617     pJumpTable.reset( new TextSearchJumpTable );
 618
 619     for( n = 0; n < nLen - 1; ++n )
 620     {
 621         sal_Unicode cCh = sSrchStr[n];
 622         sal_Int32 nDiff = nLen - n - 1;
 623         TextSearchJumpTable::value_type aEntry( cCh, nDiff );
 624
 625         ::std::pair< TextSearchJumpTable::iterator, bool > aPair =
 626             pJumpTable->insert( aEntry );
 627         if ( !aPair.second )
 628             (*(aPair.first)).second = nDiff;
 629     }
 630 }
 631
 632 void TextSearch::MakeForwardTab2()
 633 {
 634     // create the jumptable for the search text
 635     if( pJumpTable2 && bIsForwardTab )
 636     {
 637         return;        // the jumpTable is ok
 638     }
 639     bIsForwardTab = true;
 640
 641     sal_Int32 n, nLen = sSrchStr2.getLength();
 642     pJumpTable2.reset( new TextSearchJumpTable );
 643
 644     for( n = 0; n < nLen - 1; ++n )
 645     {
 646         sal_Unicode cCh = sSrchStr2[n];
 647         sal_Int32 nDiff = nLen - n - 1;
 648
 649         TextSearchJumpTable::value_type aEntry( cCh, nDiff );
 650         ::std::pair< TextSearchJumpTable::iterator, bool > aPair =
 651             pJumpTable2->insert( aEntry );
 652         if ( !aPair.second )
 653             (*(aPair.first)).second = nDiff;
 654     }
 655 }
 656
 657 void TextSearch::MakeBackwardTab()
 658 {
 659     // create the jumptable for the search text
 660     if( pJumpTable && !bIsForwardTab)
 661     {
 662         return;   // the jumpTable is ok
 663     }
 664     bIsForwardTab = false;
 665
 666     sal_Int32 n, nLen = sSrchStr.getLength();
 667     pJumpTable.reset( new TextSearchJumpTable );
 668
 669     for( n = nLen-1; n > 0; --n )
 670     {
 671         sal_Unicode cCh = sSrchStr[n];
 672         TextSearchJumpTable::value_type aEntry( cCh, n );
 673         ::std::pair< TextSearchJumpTable::iterator, bool > aPair =
 674             pJumpTable->insert( aEntry );
 675         if ( !aPair.second )
 676             (*(aPair.first)).second = n;
 677     }
 678 }
 679
 680 void TextSearch::MakeBackwardTab2()
 681 {
 682     // create the jumptable for the search text
 683     if( pJumpTable2 && !bIsForwardTab )
 684     {
 685         return;    // the jumpTable is ok
 686     }
 687     bIsForwardTab = false;
 688
 689     sal_Int32 n, nLen = sSrchStr2.getLength();
 690     pJumpTable2.reset( new TextSearchJumpTable );
 691
 692     for( n = nLen-1; n > 0; --n )
 693     {
 694         sal_Unicode cCh = sSrchStr2[n];
 695         TextSearchJumpTable::value_type aEntry( cCh, n );
 696         ::std::pair< TextSearchJumpTable::iterator, bool > aPair =
 697             pJumpTable2->insert( aEntry );
 698         if ( !aPair.second )
 699             (*(aPair.first)).second = n;
 700     }
 701 }
 702
 703 sal_Int32 TextSearch::GetDiff( const sal_Unicode cChr ) const
 704 {
 705     TextSearchJumpTable *pJump;
 706     OUString sSearchKey;
 707
 708     if ( bUsePrimarySrchStr ) {
 709         pJump = pJumpTable.get();
 710         sSearchKey = sSrchStr;
 711     } else {
 712         pJump = pJumpTable2.get();
 713         sSearchKey = sSrchStr2;
 714     }
 715
 716     TextSearchJumpTable::const_iterator iLook = pJump->find( cChr );
 717     if ( iLook == pJump->end() )
 718         return sSearchKey.getLength();
 719     return (*iLook).second;
 720 }
 721
 722
 723 SearchResult TextSearch::NSrchFrwrd( std::unique_lock<std::mutex>& /*rGuard*/, const OUString& searchStr, sal_Int32 startPos, sal_Int32 endPos )
 724 {
 725     SearchResult aRet;
 726     aRet.subRegExpressions = 0;
 727
 728     OUString sSearchKey = bUsePrimarySrchStr ? sSrchStr : sSrchStr2;
 729
 730     sal_Int32 nSuchIdx = searchStr.getLength();
 731     sal_Int32 nEnd = endPos;
 732     if( !nSuchIdx || !sSearchKey.getLength() || sSearchKey.getLength() > nSuchIdx )
 733         return aRet;
 734
 735
 736     if( nEnd < sSearchKey.getLength() )   // position inside the search region ?
 737         return aRet;
 738
 739     nEnd -= sSearchKey.getLength();
 740
 741     if (bUsePrimarySrchStr)
 742       MakeForwardTab();                   // create the jumptable
 743     else
 744       MakeForwardTab2();
 745
 746     for (sal_Int32 nCmpIdx = startPos; // start position for the search
 747             nCmpIdx <= nEnd;
 748             nCmpIdx += GetDiff( searchStr[nCmpIdx + sSearchKey.getLength()-1]))
 749     {
 750         nSuchIdx = sSearchKey.getLength() - 1;
 751         while( nSuchIdx >= 0 && sSearchKey[nSuchIdx] == searchStr[nCmpIdx + nSuchIdx])
 752         {
 753             if( nSuchIdx == 0 )
 754             {
 755                 if( SearchFlags::NORM_WORD_ONLY & aSrchPara.searchFlag )
 756                 {
 757                     sal_Int32 nFndEnd = nCmpIdx + sSearchKey.getLength();
 758                     bool bAtStart = !nCmpIdx;
 759                     bool bAtEnd = nFndEnd == endPos;
 760                     bool bDelimBefore = bAtStart || IsDelimiter( searchStr, nCmpIdx-1 );
 761                     bool bDelimBehind = bAtEnd || IsDelimiter(  searchStr, nFndEnd );
 762                     //  *       1 -> only one word in the paragraph
 763                     //  *       2 -> at begin of paragraph
 764                     //  *       3 -> at end of paragraph
 765                     //  *       4 -> inside the paragraph
 766                     if( !(  ( bAtStart && bAtEnd ) ||           // 1
 767                                 ( bAtStart && bDelimBehind ) ||     // 2
 768                                 ( bAtEnd && bDelimBefore ) ||       // 3
 769                                 ( bDelimBefore && bDelimBehind )))  // 4
 770                         break;
 771                 }
 772
 773                 aRet.subRegExpressions = 1;
 774                 aRet.startOffset = { nCmpIdx };
 775                 aRet.endOffset = { nCmpIdx + sSearchKey.getLength() };
 776
 777                 return aRet;
 778             }
 779             else
 780                 nSuchIdx--;
 781         }
 782     }
 783     return aRet;
 784 }
 785
 786 SearchResult TextSearch::NSrchBkwrd( std::unique_lock<std::mutex>& /*rGuard*/,const OUString& searchStr, sal_Int32 startPos, sal_Int32 endPos )
 787 {
 788     SearchResult aRet;
 789     aRet.subRegExpressions = 0;
 790
 791     OUString sSearchKey = bUsePrimarySrchStr ? sSrchStr : sSrchStr2;
 792
 793     sal_Int32 nSuchIdx = searchStr.getLength();
 794     sal_Int32 nEnd = endPos;
 795     if( nSuchIdx == 0 || sSearchKey.isEmpty() || sSearchKey.getLength() > nSuchIdx)
 796         return aRet;
 797
 798     if (bUsePrimarySrchStr)
 799         MakeBackwardTab();                  // create the jumptable
 800     else
 801         MakeBackwardTab2();
 802
 803     if( nEnd == nSuchIdx )                  // end position for the search
 804         nEnd = sSearchKey.getLength();
 805     else
 806         nEnd += sSearchKey.getLength();
 807
 808     sal_Int32 nCmpIdx = startPos;          // start position for the search
 809
 810     while (nCmpIdx >= nEnd)
 811     {
 812         nSuchIdx = 0;
 813         while( nSuchIdx < sSearchKey.getLength() && sSearchKey[nSuchIdx] ==
 814                 searchStr[nCmpIdx + nSuchIdx - sSearchKey.getLength()] )
 815             nSuchIdx++;
 816         if( nSuchIdx >= sSearchKey.getLength() )
 817         {
 818             if( SearchFlags::NORM_WORD_ONLY & aSrchPara.searchFlag )
 819             {
 820                 sal_Int32 nFndStt = nCmpIdx - sSearchKey.getLength();
 821                 bool bAtStart = !nFndStt;
 822                 bool bAtEnd = nCmpIdx == startPos;
 823                 bool bDelimBehind = bAtEnd || IsDelimiter( searchStr, nCmpIdx );
 824                 bool bDelimBefore = bAtStart || // begin of paragraph
 825                     IsDelimiter( searchStr, nFndStt-1 );
 826                 //  *       1 -> only one word in the paragraph
 827                 //  *       2 -> at begin of paragraph
 828                 //  *       3 -> at end of paragraph
 829                 //  *       4 -> inside the paragraph
 830                 if( ( bAtStart && bAtEnd ) ||           // 1
 831                         ( bAtStart && bDelimBehind ) ||     // 2
 832                         ( bAtEnd && bDelimBefore ) ||       // 3
 833                         ( bDelimBefore && bDelimBehind ))   // 4
 834                 {
 835                     aRet.subRegExpressions = 1;
 836                     aRet.startOffset = { nCmpIdx };
 837                     aRet.endOffset = { nCmpIdx - sSearchKey.getLength() };
 838                     return aRet;
 839                 }
 840             }
 841             else
 842             {
 843                 aRet.subRegExpressions = 1;
 844                 aRet.startOffset = { nCmpIdx };
 845                 aRet.endOffset = { nCmpIdx - sSearchKey.getLength() };
 846                 return aRet;
 847             }
 848         }
 849         nSuchIdx = GetDiff( searchStr[nCmpIdx - sSearchKey.getLength()] );
 850         if( nCmpIdx < nSuchIdx )
 851             return aRet;
 852         nCmpIdx -= nSuchIdx;
 853     }
 854     return aRet;
 855 }
 856
 857 void TextSearch::RESrchPrepare( const css::util::SearchOptions2& rOptions)
 858 {
 859     TransliterationFlags transliterateFlags = static_cast<TransliterationFlags>(rOptions.transliterateFlags);
 860     // select the transliterated pattern string
 861     const OUString& rPatternStr =
 862         (isSimpleTrans(transliterateFlags) ? sSrchStr
 863         : (isComplexTrans(transliterateFlags) ? sSrchStr2 : rOptions.searchString));
 864
 865     sal_uInt32 nIcuSearchFlags = UREGEX_UWORD; // request UAX#29 unicode capability
 866     // map css::util::SearchFlags to ICU uregex.h flags
 867     // TODO: REG_EXTENDED, REG_NOT_BEGINOFLINE, REG_NOT_ENDOFLINE
 868     // REG_NEWLINE is neither properly defined nor used anywhere => not implemented
 869     // REG_NOSUB is not used anywhere => not implemented
 870     // NORM_WORD_ONLY is only used for SearchAlgorithm==Absolute
 871     // LEV_RELAXED is only used for SearchAlgorithm==Approximate
 872     // Note that the search flag ALL_IGNORE_CASE is deprecated in UNO
 873     // probably because the transliteration flag IGNORE_CASE handles it as well.
 874     if( (rOptions.searchFlag & css::util::SearchFlags::ALL_IGNORE_CASE) != 0
 875     ||  (transliterateFlags & TransliterationFlags::IGNORE_CASE))
 876         nIcuSearchFlags |= UREGEX_CASE_INSENSITIVE;
 877     UErrorCode nIcuErr = U_ZERO_ERROR;
 878     // assumption: transliteration didn't mangle regexp control chars
 879     icu::UnicodeString aIcuSearchPatStr( reinterpret_cast<const UChar*>(rPatternStr.getStr()), rPatternStr.getLength());
 880 #ifndef DISABLE_WORDBOUND_EMULATION
 881     // for convenience specific syntax elements of the old regex engine are emulated
 882     // - by replacing \< with "word-break followed by a look-ahead word-char"
 883     static const icu::UnicodeString aChevronPatternB( "\\\\<", -1, icu::UnicodeString::kInvariant);
 884     static const icu::UnicodeString aChevronReplaceB( "\\\\b(?=\\\\w)", -1, icu::UnicodeString::kInvariant);
 885     static icu::RegexMatcher aChevronMatcherB( aChevronPatternB, 0, nIcuErr);
 886     aChevronMatcherB.reset( aIcuSearchPatStr);
 887     aIcuSearchPatStr = aChevronMatcherB.replaceAll( aChevronReplaceB, nIcuErr);
 888     aChevronMatcherB.reset();
 889     // - by replacing \> with "look-behind word-char followed by a word-break"
 890     static const icu::UnicodeString aChevronPatternE( "\\\\>", -1, icu::UnicodeString::kInvariant);
 891     static const icu::UnicodeString aChevronReplaceE( "(?<=\\\\w)\\\\b", -1, icu::UnicodeString::kInvariant);
 892     static icu::RegexMatcher aChevronMatcherE( aChevronPatternE, 0, nIcuErr);
 893     aChevronMatcherE.reset( aIcuSearchPatStr);
 894     aIcuSearchPatStr = aChevronMatcherE.replaceAll( aChevronReplaceE, nIcuErr);
 895     aChevronMatcherE.reset();
 896 #endif
 897     pRegexMatcher.reset( new icu::RegexMatcher( aIcuSearchPatStr, nIcuSearchFlags, nIcuErr) );
 898     if (nIcuErr)
 899     {
 900         SAL_INFO( "i18npool", "TextSearch::RESrchPrepare UErrorCode " << nIcuErr);
 901         pRegexMatcher.reset();
 902     }
 903     else
 904     {
 905         // Pathological patterns may result in exponential run time making the
 906         // application appear to be frozen. Limit that. Documentation for this
 907         // call says
 908         // https://unicode-org.github.io/icu-docs/apidoc/released/icu4c/classicu_1_1RegexMatcher.html#a6ebcfcab4fe6a38678c0291643a03a00
 909         // "The units of the limit are steps of the match engine.
 910         // Correspondence with actual processor time will depend on the speed
 911         // of the processor and the details of the specific pattern, but will
 912         // typically be on the order of milliseconds."
 913         // Just what is a good value? 42 is always an answer ... the 23 enigma
 914         // as well... which on the dev's machine is roughly 50 seconds with the
 915         // pattern of fdo#70627.
 916         /* TODO: make this a configuration settable value and possibly take
 917          * complexity of expression into account and maybe even length of text
 918          * to be matched; currently (2013-11-25) that is at most one 64k
 919          * paragraph per RESrchFrwrd()/RESrchBkwrd() call. */
 920         pRegexMatcher->setTimeLimit( 23*1000, nIcuErr);
 921     }
 922 }
 923
 924
 925 static bool lcl_findRegex(std::unique_ptr<icu::RegexMatcher> const& pRegexMatcher,
 926                           sal_Int32 nStartPos, sal_Int32 nEndPos, UErrorCode& rIcuErr)
 927 {
 928     pRegexMatcher->region(nStartPos, nEndPos, rIcuErr);
 929     pRegexMatcher->useAnchoringBounds(false); // use whole text's anchoring bounds, not region's
 930     pRegexMatcher->useTransparentBounds(true); // take text outside of the region into account for
 931                                                // look-ahead/behind assertions
 932
 933     if (!pRegexMatcher->find(rIcuErr))
 934     {
 935         /* TODO: future versions could pass the UErrorCode or translations
 936          * thereof to the caller, for example to inform the user of
 937          * U_REGEX_TIME_OUT. The strange thing though is that an error is set
 938          * only after the second call that returns immediately and not if
 939          * timeout occurred on the first call?!? */
 940         SAL_INFO( "i18npool", "lcl_findRegex UErrorCode " << rIcuErr);
 941         return false;
 942     }
 943     return true;
 944 }
 945
 946 SearchResult TextSearch::RESrchFrwrd( std::unique_lock<std::mutex>& /*rGuard*/, const OUString& searchStr,
 947                                       sal_Int32 startPos, sal_Int32 endPos )
 948 {
 949     SearchResult aRet;
 950     aRet.subRegExpressions = 0;
 951     if( !pRegexMatcher)
 952         return aRet;
 953
 954     if( endPos > searchStr.getLength())
 955         endPos = searchStr.getLength();
 956
 957     // use the ICU RegexMatcher to find the matches
 958     UErrorCode nIcuErr = U_ZERO_ERROR;
 959     const icu::UnicodeString aSearchTargetStr(false, reinterpret_cast<const UChar*>(searchStr.getStr()),
 960                                         searchStr.getLength());
 961     pRegexMatcher->reset( aSearchTargetStr);
 962     // search until there is a valid match
 963     for(;;)
 964     {
 965         if (!lcl_findRegex( pRegexMatcher, startPos, endPos, nIcuErr))
 966             return aRet;
 967
 968         // #i118887# ignore zero-length matches e.g. "a*" in "bc"
 969         int nStartOfs = pRegexMatcher->start( nIcuErr);
 970         int nEndOfs = pRegexMatcher->end( nIcuErr);
 971         if( nStartOfs < nEndOfs)
 972             break;
 973         // If the zero-length match is behind the string, do not match it again
 974         // and again until startPos reaches there. A match behind the string is
 975         // a "$" anchor.
 976         if (nStartOfs == endPos)
 977             break;
 978         // try at next position if there was a zero-length match
 979         if( ++startPos >= endPos)
 980             return aRet;
 981     }
 982
 983     // extract the result of the search
 984     const int nGroupCount = pRegexMatcher->groupCount();
 985     aRet.subRegExpressions = nGroupCount + 1;
 986     aRet.startOffset.realloc( aRet.subRegExpressions);
 987     auto pstartOffset = aRet.startOffset.getArray();
 988     aRet.endOffset.realloc( aRet.subRegExpressions);
 989     auto pendOffset = aRet.endOffset.getArray();
 990     pstartOffset[0] = pRegexMatcher->start( nIcuErr);
 991     pendOffset[0]   = pRegexMatcher->end( nIcuErr);
 992     for( int i = 1; i <= nGroupCount; ++i) {
 993         pstartOffset[i] = pRegexMatcher->start( i, nIcuErr);
 994         pendOffset[i]   = pRegexMatcher->end( i, nIcuErr);
 995     }
 996
 997     return aRet;
 998 }
 999
1000 SearchResult TextSearch::RESrchBkwrd( std::unique_lock<std::mutex>& /*rGuard*/, const OUString& searchStr,
1001                                       sal_Int32 startPos, sal_Int32 endPos )
1002 {
1003     // NOTE: for backwards search callers provide startPos/endPos inverted!
1004     SearchResult aRet;
1005     aRet.subRegExpressions = 0;
1006     if( !pRegexMatcher)
1007         return aRet;
1008
1009     if( startPos > searchStr.getLength())
1010         startPos = searchStr.getLength();
1011
1012     // use the ICU RegexMatcher to find the matches
1013     // TODO: use ICU's backward searching once it becomes available
1014     //       as its replacement using forward search is not as good as the real thing
1015     UErrorCode nIcuErr = U_ZERO_ERROR;
1016     const icu::UnicodeString aSearchTargetStr(false, reinterpret_cast<const UChar*>(searchStr.getStr()),
1017                                         searchStr.getLength());
1018     pRegexMatcher->reset( aSearchTargetStr);
1019     if (!lcl_findRegex( pRegexMatcher, endPos, startPos, nIcuErr))
1020         return aRet;
1021
1022     // find the last match
1023     int nLastPos = 0;
1024     int nFoundEnd = 0;
1025     int nGoodPos = 0, nGoodEnd = 0;
1026     bool bFirst = true;
1027     do {
1028         nLastPos = pRegexMatcher->start( nIcuErr);
1029         nFoundEnd = pRegexMatcher->end( nIcuErr);
1030         if (nLastPos < nFoundEnd)
1031         {
1032             // remember last non-zero-length match
1033             nGoodPos = nLastPos;
1034             nGoodEnd = nFoundEnd;
1035         }
1036         if( nFoundEnd >= startPos)
1037             break;
1038         bFirst = false;
1039         if( nFoundEnd == nLastPos)
1040             ++nFoundEnd;
1041     } while( lcl_findRegex( pRegexMatcher, nFoundEnd, startPos, nIcuErr));
1042
1043     // Ignore all zero-length matches except "$" anchor on first match.
1044     if (nGoodPos == nGoodEnd)
1045     {
1046         if (bFirst && nLastPos == startPos)
1047             nGoodPos = nLastPos;
1048         else
1049             return aRet;
1050     }
1051
1052     // find last match again to get its details
1053     lcl_findRegex( pRegexMatcher, nGoodPos, startPos, nIcuErr);
1054
1055     // fill in the details of the last match
1056     const int nGroupCount = pRegexMatcher->groupCount();
1057     aRet.subRegExpressions = nGroupCount + 1;
1058     aRet.startOffset.realloc( aRet.subRegExpressions);
1059     auto pstartOffset = aRet.startOffset.getArray();
1060     aRet.endOffset.realloc( aRet.subRegExpressions);
1061     auto pendOffset = aRet.endOffset.getArray();
1062     // NOTE: existing users of backward search seem to expect startOfs/endOfs being inverted!
1063     pstartOffset[0] = pRegexMatcher->end( nIcuErr);
1064     pendOffset[0]   = pRegexMatcher->start( nIcuErr);
1065     for( int i = 1; i <= nGroupCount; ++i) {
1066         pstartOffset[i] = pRegexMatcher->end( i, nIcuErr);
1067         pendOffset[i]   = pRegexMatcher->start( i, nIcuErr);
1068     }
1069
1070     return aRet;
1071 }
1072
1073
1074 // search for words phonetically
1075 SearchResult TextSearch::ApproxSrchFrwrd( std::unique_lock<std::mutex>& /*rGuard*/, const OUString& searchStr,
1076                                           sal_Int32 startPos, sal_Int32 endPos )
1077 {
1078     SearchResult aRet;
1079     aRet.subRegExpressions = 0;
1080
1081     if( !xBreak.is() )
1082         return aRet;
1083
1084     sal_Int32 nStt, nEnd;
1085
1086     Boundary aWBnd = xBreak->getWordBoundary( searchStr, startPos,
1087             aSrchPara.Locale,
1088             WordType::ANYWORD_IGNOREWHITESPACES, true );
1089
1090     do
1091     {
1092         if( aWBnd.startPos >= endPos )
1093             break;
1094         nStt = aWBnd.startPos < startPos ? startPos : aWBnd.startPos;
1095         nEnd = std::min(aWBnd.endPos, endPos);
1096
1097         if( nStt < nEnd &&
1098                 pWLD->WLD( searchStr.getStr() + nStt, nEnd - nStt ) <= nLimit )
1099         {
1100             aRet.subRegExpressions = 1;
1101             aRet.startOffset = { nStt };
1102             aRet.endOffset = { nEnd };
1103             break;
1104         }
1105
1106         nStt = nEnd - 1;
1107         aWBnd = xBreak->nextWord( searchStr, nStt, aSrchPara.Locale,
1108                 WordType::ANYWORD_IGNOREWHITESPACES);
1109     } while( aWBnd.startPos != aWBnd.endPos ||
1110             (aWBnd.endPos != searchStr.getLength() && aWBnd.endPos != nEnd) );
1111     // #i50244# aWBnd.endPos != nEnd : in case there is _no_ word (only
1112     // whitespace) in searchStr, getWordBoundary() returned startPos,startPos
1113     // and nextWord() does also => don't loop forever.
1114     return aRet;
1115 }
1116
1117 SearchResult TextSearch::ApproxSrchBkwrd( std::unique_lock<std::mutex>& /*rGuard*/, const OUString& searchStr,
1118                                           sal_Int32 startPos, sal_Int32 endPos )
1119 {
1120     SearchResult aRet;
1121     aRet.subRegExpressions = 0;
1122
1123     if( !xBreak.is() )
1124         return aRet;
1125
1126     sal_Int32 nStt, nEnd;
1127
1128     Boundary aWBnd = xBreak->getWordBoundary( searchStr, startPos,
1129             aSrchPara.Locale,
1130             WordType::ANYWORD_IGNOREWHITESPACES, true );
1131
1132     do
1133     {
1134         if( aWBnd.endPos <= endPos )
1135             break;
1136         nStt = aWBnd.startPos < endPos ? endPos : aWBnd.startPos;
1137         nEnd = std::min(aWBnd.endPos, startPos);
1138
1139         if( nStt < nEnd &&
1140                 pWLD->WLD( searchStr.getStr() + nStt, nEnd - nStt ) <= nLimit )
1141         {
1142             aRet.subRegExpressions = 1;
1143             aRet.startOffset = { nEnd };
1144             aRet.endOffset = { nStt };
1145             break;
1146         }
1147         if( !nStt )
1148             break;
1149
1150         aWBnd = xBreak->previousWord( searchStr, nStt, aSrchPara.Locale,
1151                 WordType::ANYWORD_IGNOREWHITESPACES);
1152     } while( aWBnd.startPos != aWBnd.endPos || aWBnd.endPos != searchStr.getLength() );
1153     return aRet;
1154 }
1155
1156
1157 namespace {
1158 void setWildcardMatch( css::util::SearchResult& rRes, sal_Int32 nStartOffset, sal_Int32 nEndOffset )
1159 {
1160     rRes.subRegExpressions = 1;
1161     rRes.startOffset = { nStartOffset };
1162     rRes.endOffset = { nEndOffset };
1163 }
1164 }
1165
1166 SearchResult TextSearch::WildcardSrchFrwrd( std::unique_lock<std::mutex>& /*rGuard*/, const OUString& searchStr, sal_Int32 nStartPos, sal_Int32 nEndPos )
1167 {
1168     SearchResult aRes;
1169     aRes.subRegExpressions = 0;     // no match
1170     sal_Int32 nStartOffset = nStartPos;
1171     sal_Int32 nEndOffset = nEndPos;
1172
1173     const sal_Int32 nStringLen = searchStr.getLength();
1174
1175     // Forward nStartPos inclusive, nEndPos exclusive, but allow for empty
1176     // string match with [0,0).
1177     if (nStartPos < 0 || nEndPos > nStringLen || nEndPos < nStartPos ||
1178             (nStartPos == nStringLen && (nStringLen != 0 || nStartPos != nEndPos)))
1179         return aRes;
1180
1181     const OUString& rPattern = (bUsePrimarySrchStr ? sSrchStr : sSrchStr2);
1182     const sal_Int32 nPatternLen = rPattern.getLength();
1183
1184     // Handle special cases empty pattern and/or string outside of the loop to
1185     // not add performance penalties there and simplify.
1186     if (nStartPos == nEndPos)
1187     {
1188         sal_Int32 i = 0;
1189         while (i < nPatternLen && rPattern[i] == '*')
1190             ++i;
1191         if (i == nPatternLen)
1192             setWildcardMatch( aRes, nStartOffset, nEndOffset);
1193         return aRes;
1194     }
1195
1196     // Empty pattern does not match any non-empty string.
1197     if (!nPatternLen)
1198         return aRes;
1199
1200     bool bRewind = false;
1201     sal_uInt32 cPattern = 0;
1202     sal_Int32 nPattern = 0;
1203     sal_Int32 nAfterFakePattern = nPattern;
1204     if (mbWildcardAllowSubstring)
1205     {
1206         // Fake a leading '*' wildcard.
1207         cPattern = '*';
1208         bRewind = true;
1209         // Assume a non-'*' pattern character follows. If it is a '*' instead
1210         // that will be handled in the loop by setting nPat.
1211         sal_uInt32 cu = rPattern.iterateCodePoints( &nAfterFakePattern);
1212         if (cu == mcWildcardEscapeChar && mcWildcardEscapeChar && nAfterFakePattern < nPatternLen)
1213             rPattern.iterateCodePoints( &nAfterFakePattern);
1214     }
1215
1216     sal_Int32 nString = nStartPos, nPat = -1, nStr = -1, nLastAsterisk = -1;
1217     sal_uInt32 cPatternAfterAsterisk = 0;
1218     bool bEscaped = false, bEscapedAfterAsterisk = false;
1219
1220     // The loop code tries to avoid costly calls to iterateCodePoints() when
1221     // possible.
1222
1223     do
1224     {
1225         if (bRewind)
1226         {
1227             // Reuse cPattern after '*', nPattern was correspondingly
1228             // incremented to point behind cPattern.
1229             bRewind = false;
1230         }
1231         else if (nPattern < nPatternLen)
1232         {
1233             // nPattern will be incremented by iterateCodePoints().
1234             cPattern = rPattern.iterateCodePoints( &nPattern);
1235             if (cPattern == mcWildcardEscapeChar && mcWildcardEscapeChar && nPattern < nPatternLen)
1236             {
1237                 bEscaped = true;
1238                 cPattern = rPattern.iterateCodePoints( &nPattern);
1239             }
1240         }
1241         else
1242         {
1243             // A trailing '*' is handled below.
1244             if (mbWildcardAllowSubstring)
1245             {
1246                 // If the pattern is consumed and substring match allowed we're good.
1247                 setWildcardMatch( aRes, nStartOffset, nString);
1248                 return aRes;
1249             }
1250             else if (nString < nEndPos && nLastAsterisk >= 0)
1251             {
1252                 // If substring match is not allowed try a greedy '*' match.
1253                 nPattern = nLastAsterisk;
1254                 continue;   // do
1255             }
1256             else
1257                 return aRes;
1258         }
1259
1260         if (cPattern == '*' && !bEscaped)
1261         {
1262             // '*' is one code unit, so not using iterateCodePoints() is ok.
1263             while (nPattern < nPatternLen && rPattern[nPattern] == '*')
1264                 ++nPattern;
1265
1266             if (nPattern >= nPatternLen)
1267             {
1268                 // Last pattern is '*', remaining string matches.
1269                 setWildcardMatch( aRes, nStartOffset, nEndOffset);
1270                 return aRes;
1271             }
1272
1273             nLastAsterisk = nPattern;   // Remember last encountered '*'.
1274
1275             // cPattern will be the next non-'*' character, nPattern
1276             // incremented.
1277             cPattern = rPattern.iterateCodePoints( &nPattern);
1278             if (cPattern == mcWildcardEscapeChar && mcWildcardEscapeChar && nPattern < nPatternLen)
1279             {
1280                 bEscaped = true;
1281                 cPattern = rPattern.iterateCodePoints( &nPattern);
1282             }
1283
1284             cPatternAfterAsterisk = cPattern;
1285             bEscapedAfterAsterisk = bEscaped;
1286             nPat = nPattern;    // Remember position of pattern behind '*', already incremented.
1287             nStr = nString;     // Remember the current string to be matched.
1288         }
1289
1290         if (nString >= nEndPos)
1291             // Whatever follows in pattern, string will not match.
1292             return aRes;
1293
1294         // nString will be incremented by iterateCodePoints().
1295         sal_uInt32 cString = searchStr.iterateCodePoints( &nString);
1296
1297         if ((cPattern != '?' || bEscaped) && cPattern != cString)
1298         {
1299             if (nPat == -1)
1300                 // Non-match already without any '*' pattern.
1301                 return aRes;
1302
1303             bRewind = true;
1304             nPattern = nPat;                    // Rewind pattern to character behind '*', already incremented.
1305             cPattern = cPatternAfterAsterisk;
1306             bEscaped = bEscapedAfterAsterisk;
1307             searchStr.iterateCodePoints( &nStr);
1308             nString = nStr;                     // Restore incremented remembered string position.
1309             if (nPat == nAfterFakePattern)
1310             {
1311                 // Next start offset will be the next character.
1312                 nStartOffset = nString;
1313             }
1314         }
1315         else
1316         {
1317             // An unescaped '?' pattern matched any character, or characters
1318             // matched. Reset only escaped state.
1319             bEscaped = false;
1320         }
1321     }
1322     while (nString < nEndPos);
1323
1324     if (bRewind)
1325         return aRes;
1326
1327     // Eat trailing '*' pattern that matches anything, including nothing.
1328     // '*' is one code unit, so not using iterateCodePoints() is ok.
1329     while (nPattern < nPatternLen && rPattern[nPattern] == '*')
1330         ++nPattern;
1331
1332     if (nPattern == nPatternLen)
1333         setWildcardMatch( aRes, nStartOffset, nEndOffset);
1334     return aRes;
1335 }
1336
1337 SearchResult TextSearch::WildcardSrchBkwrd( std::unique_lock<std::mutex>& /*rGuard*/, const OUString& searchStr, sal_Int32 nStartPos, sal_Int32 nEndPos )
1338 {
1339     SearchResult aRes;
1340     aRes.subRegExpressions = 0;     // no match
1341
1342     sal_Int32 nStartOffset = nStartPos;
1343     sal_Int32 nEndOffset = nEndPos;
1344
1345     const sal_Int32 nStringLen = searchStr.getLength();
1346
1347     // Backward nStartPos exclusive, nEndPos inclusive, but allow for empty
1348     // string match with (0,0].
1349     if (nStartPos > nStringLen || nEndPos < 0 || nStartPos < nEndPos ||
1350             (nEndPos == nStringLen && (nStringLen != 0 || nStartPos != nEndPos)))
1351         return aRes;
1352
1353     const OUString& rPattern = (bUsePrimarySrchStr ? sSrchStr : sSrchStr2);
1354     sal_Int32 nPatternLen = rPattern.getLength();
1355
1356     // Handle special cases empty pattern and/or string outside of the loop to
1357     // not add performance penalties there and simplify.
1358     if (nStartPos == nEndPos)
1359     {
1360         sal_Int32 i = 0;
1361         while (i < nPatternLen && rPattern[i] == '*')
1362             ++i;
1363         if (i == nPatternLen)
1364             setWildcardMatch( aRes, nStartOffset, nEndOffset);
1365         return aRes;
1366     }
1367
1368     // Empty pattern does not match any non-empty string.
1369     if (!nPatternLen)
1370         return aRes;
1371
1372     // Reverse escaped patterns to ease the handling of escapes, keeping escape
1373     // and following character as one sequence in backward direction.
1374     if ((bUsePrimarySrchStr && maWildcardReversePattern.isEmpty()) ||
1375             (!bUsePrimarySrchStr && maWildcardReversePattern2.isEmpty()))
1376     {
1377         OUStringBuffer aPatternBuf( rPattern);
1378         sal_Int32 nIndex = 0;
1379         while (nIndex < nPatternLen)
1380         {
1381             const sal_Int32 nOld = nIndex;
1382             const sal_uInt32 cu = rPattern.iterateCodePoints( &nIndex);
1383             if (cu == mcWildcardEscapeChar)
1384             {
1385                 if (nIndex < nPatternLen)
1386                 {
1387                     if (nIndex - nOld == 1)
1388                     {
1389                         // Simply move code units, we already memorized the one
1390                         // in 'cu'.
1391                         const sal_Int32 nOld2 = nIndex;
1392                         rPattern.iterateCodePoints( &nIndex);
1393                         for (sal_Int32 i=0; i < nIndex - nOld2; ++i)
1394                             aPatternBuf[nOld+i] = rPattern[nOld2+i];
1395                         aPatternBuf[nIndex-1] = static_cast<sal_Unicode>(cu);
1396                     }
1397                     else
1398                     {
1399                         // Copy the escape character code units first in the
1400                         // unlikely case that it would not be of BMP.
1401                         assert(nIndex - nOld == 2);  // it's UTF-16, so...
1402                         sal_Unicode buf[2];
1403                         buf[0] = rPattern[nOld];
1404                         buf[1] = rPattern[nOld+1];
1405                         const sal_Int32 nOld2 = nIndex;
1406                         rPattern.iterateCodePoints( &nIndex);
1407                         for (sal_Int32 i=0; i < nIndex - nOld2; ++i)
1408                             aPatternBuf[nOld+i] = rPattern[nOld2+i];
1409                         aPatternBuf[nIndex-2] = buf[0];
1410                         aPatternBuf[nIndex-1] = buf[1];
1411                     }
1412                 }
1413                 else
1414                 {
1415                     // Trailing escape would become leading escape, do what?
1416                     // Eliminate.
1417                     aPatternBuf.remove( nOld, nIndex - nOld);
1418                 }
1419             }
1420         }
1421         if (bUsePrimarySrchStr)
1422             maWildcardReversePattern = aPatternBuf.makeStringAndClear();
1423         else
1424             maWildcardReversePattern2 = aPatternBuf.makeStringAndClear();
1425     }
1426     const OUString& rReversePattern = (bUsePrimarySrchStr ? maWildcardReversePattern : maWildcardReversePattern2);
1427     nPatternLen = rReversePattern.getLength();
1428
1429     bool bRewind = false;
1430     sal_uInt32 cPattern = 0;
1431     sal_Int32 nPattern = nPatternLen;
1432     sal_Int32 nAfterFakePattern = nPattern;
1433     if (mbWildcardAllowSubstring)
1434     {
1435         // Fake a trailing '*' wildcard.
1436         cPattern = '*';
1437         bRewind = true;
1438         // Assume a non-'*' pattern character follows. If it is a '*' instead
1439         // that will be handled in the loop by setting nPat.
1440         sal_uInt32 cu = rReversePattern.iterateCodePoints( &nAfterFakePattern, -1);
1441         if (cu == mcWildcardEscapeChar && mcWildcardEscapeChar && nAfterFakePattern > 0)
1442             rReversePattern.iterateCodePoints( &nAfterFakePattern, -1);
1443     }
1444
1445     sal_Int32 nString = nStartPos, nPat = -1, nStr = -1, nLastAsterisk = -1;
1446     sal_uInt32 cPatternAfterAsterisk = 0;
1447     bool bEscaped = false, bEscapedAfterAsterisk = false;
1448
1449     // The loop code tries to avoid costly calls to iterateCodePoints() when
1450     // possible.
1451
1452     do
1453     {
1454         if (bRewind)
1455         {
1456             // Reuse cPattern after '*', nPattern was correspondingly
1457             // decremented to point before cPattern.
1458             bRewind = false;
1459         }
1460         else if (nPattern > 0)
1461         {
1462             // nPattern will be decremented by iterateCodePoints().
1463             cPattern = rReversePattern.iterateCodePoints( &nPattern, -1);
1464             if (cPattern == mcWildcardEscapeChar && mcWildcardEscapeChar && nPattern > 0)
1465             {
1466                 bEscaped = true;
1467                 cPattern = rReversePattern.iterateCodePoints( &nPattern, -1);
1468             }
1469         }
1470         else
1471         {
1472             // A trailing '*' is handled below.
1473             if (mbWildcardAllowSubstring)
1474             {
1475                 // If the pattern is consumed and substring match allowed we're good.
1476                 setWildcardMatch( aRes, nStartOffset, nString);
1477                 return aRes;
1478             }
1479             else if (nString > nEndPos && nLastAsterisk >= 0)
1480             {
1481                 // If substring match is not allowed try a greedy '*' match.
1482                 nPattern = nLastAsterisk;
1483                 continue;   // do
1484             }
1485             else
1486                 return aRes;
1487         }
1488
1489         if (cPattern == '*' && !bEscaped)
1490         {
1491             // '*' is one code unit, so not using iterateCodePoints() is ok.
1492             while (nPattern > 0 && rReversePattern[nPattern-1] == '*')
1493                 --nPattern;
1494
1495             if (nPattern <= 0)
1496             {
1497                 // First pattern is '*', remaining string matches.
1498                 setWildcardMatch( aRes, nStartOffset, nEndOffset);
1499                 return aRes;
1500             }
1501
1502             nLastAsterisk = nPattern;   // Remember last encountered '*'.
1503
1504             // cPattern will be the previous non-'*' character, nPattern
1505             // decremented.
1506             cPattern = rReversePattern.iterateCodePoints( &nPattern, -1);
1507             if (cPattern == mcWildcardEscapeChar && mcWildcardEscapeChar && nPattern > 0)
1508             {
1509                 bEscaped = true;
1510                 cPattern = rReversePattern.iterateCodePoints( &nPattern, -1);
1511             }
1512
1513             cPatternAfterAsterisk = cPattern;
1514             bEscapedAfterAsterisk = bEscaped;
1515             nPat = nPattern;    // Remember position of pattern before '*', already decremented.
1516             nStr = nString;     // Remember the current string to be matched.
1517         }
1518
1519         if (nString <= nEndPos)
1520             // Whatever leads in pattern, string will not match.
1521             return aRes;
1522
1523         // nString will be decremented by iterateCodePoints().
1524         sal_uInt32 cString = searchStr.iterateCodePoints( &nString, -1);
1525
1526         if ((cPattern != '?' || bEscaped) && cPattern != cString)
1527         {
1528             if (nPat == -1)
1529                 // Non-match already without any '*' pattern.
1530                 return aRes;
1531
1532             bRewind = true;
1533             nPattern = nPat;                    // Rewind pattern to character before '*', already decremented.
1534             cPattern = cPatternAfterAsterisk;
1535             bEscaped = bEscapedAfterAsterisk;
1536             searchStr.iterateCodePoints( &nStr, -1);
1537             nString = nStr;                     // Restore decremented remembered string position.
1538             if (nPat == nAfterFakePattern)
1539             {
1540                 // Next start offset will be this character (exclusive).
1541                 nStartOffset = nString;
1542             }
1543         }
1544         else
1545         {
1546             // An unescaped '?' pattern matched any character, or characters
1547             // matched. Reset only escaped state.
1548             bEscaped = false;
1549         }
1550     }
1551     while (nString > nEndPos);
1552
1553     if (bRewind)
1554         return aRes;
1555
1556     // Eat leading '*' pattern that matches anything, including nothing.
1557     // '*' is one code unit, so not using iterateCodePoints() is ok.
1558     while (nPattern > 0 && rReversePattern[nPattern-1] == '*')
1559         --nPattern;
1560
1561     if (nPattern == 0)
1562         setWildcardMatch( aRes, nStartOffset, nEndOffset);
1563     return aRes;
1564 }
1565
1566
1567 OUString SAL_CALL
1568 TextSearch::getImplementationName()
1569 {
1570     return u"com.sun.star.util.TextSearch_i18n"_ustr;
1571 }
1572
1573 sal_Bool SAL_CALL TextSearch::supportsService(const OUString& rServiceName)
1574 {
1575     return cppu::supportsService(this, rServiceName);
1576 }
1577
1578 Sequence< OUString > SAL_CALL
1579 TextSearch::getSupportedServiceNames()
1580 {
1581     return { u"com.sun.star.util.TextSearch"_ustr, u"com.sun.star.util.TextSearch2"_ustr };
1582 }
1583
1584 extern "C" SAL_DLLPUBLIC_EXPORT css::uno::XInterface*
1585 i18npool_TextSearch_get_implementation(
1586     css::uno::XComponentContext* context , css::uno::Sequence<css::uno::Any> const&)
1587 {
1588     return cppu::acquire(new TextSearch(context));
1589 }
1590
1591 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */