bump product version to 4.1.6.2
[LibreOffice.git] / i18npool / source / search / textsearch.cxx
blob91a770bba3a6fd44ea53d53131aa67524b8c855e
1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 /*
3 * This file is part of the LibreOffice project.
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 * This file incorporates work covered by the following license notice:
11 * Licensed to the Apache Software Foundation (ASF) under one or more
12 * contributor license agreements. See the NOTICE file distributed
13 * with this work for additional information regarding copyright
14 * ownership. The ASF licenses this file to you under the Apache
15 * License, Version 2.0 (the "License"); you may not use this file
16 * except in compliance with the License. You may obtain a copy of
17 * the License at http://www.apache.org/licenses/LICENSE-2.0 .
20 #include "textsearch.hxx"
21 #include "levdis.hxx"
22 #include <com/sun/star/lang/Locale.hpp>
23 #include <com/sun/star/lang/XMultiServiceFactory.hpp>
24 #include <comphelper/processfactory.hxx>
25 #include <com/sun/star/i18n/BreakIterator.hpp>
26 #include <com/sun/star/i18n/UnicodeType.hpp>
27 #include <com/sun/star/util/SearchFlags.hpp>
28 #include <com/sun/star/i18n/WordType.hpp>
29 #include <com/sun/star/i18n/ScriptType.hpp>
30 #include <com/sun/star/i18n/CharacterIteratorMode.hpp>
31 #include <com/sun/star/i18n/CharacterClassification.hpp>
32 #include <com/sun/star/i18n/KCharacterType.hpp>
33 #include <com/sun/star/i18n/Transliteration.hpp>
34 #include <com/sun/star/registry/XRegistryKey.hpp>
35 #include <cppuhelper/factory.hxx>
36 #include <cppuhelper/weak.hxx>
38 #ifdef _MSC_VER
39 // get rid of that dumb compiler warning
40 // identifier was truncated to '255' characters in the debug information
41 // for STL template usage, if .pdb files are to be created
42 #pragma warning( disable: 4786 )
43 #endif
45 #include <string.h>
47 using namespace ::com::sun::star::util;
48 using namespace ::com::sun::star::uno;
49 using namespace ::com::sun::star::lang;
50 using namespace ::com::sun::star::i18n;
51 using namespace ::com::sun::star;
53 static const sal_Int32 COMPLEX_TRANS_MASK_TMP =
54 TransliterationModules_ignoreBaFa_ja_JP |
55 TransliterationModules_ignoreIterationMark_ja_JP |
56 TransliterationModules_ignoreTiJi_ja_JP |
57 TransliterationModules_ignoreHyuByu_ja_JP |
58 TransliterationModules_ignoreSeZe_ja_JP |
59 TransliterationModules_ignoreIandEfollowedByYa_ja_JP |
60 TransliterationModules_ignoreKiKuFollowedBySa_ja_JP |
61 TransliterationModules_ignoreProlongedSoundMark_ja_JP;
63 // These 2 transliterations are simple but need to take effect in
64 // complex transliteration.
65 static const sal_Int32 COMPLEX_TRANS_MASK =
66 COMPLEX_TRANS_MASK_TMP |
67 TransliterationModules_IGNORE_KANA |
68 TransliterationModules_FULLWIDTH_HALFWIDTH;
70 static const sal_Int32 SIMPLE_TRANS_MASK = ~COMPLEX_TRANS_MASK;
72 // Regex patterns are case sensitive.
73 static const sal_Int32 SIMPLE_TRANS_MASK_REPATTERN =
74 ~(COMPLEX_TRANS_MASK |
75 TransliterationModules_IGNORE_CASE |
76 TransliterationModules_UPPERCASE_LOWERCASE |
77 TransliterationModules_LOWERCASE_UPPERCASE);
80 TextSearch::TextSearch(const Reference < XComponentContext > & rxContext)
81 : m_xContext( rxContext )
82 , pJumpTable( 0 )
83 , pJumpTable2( 0 )
84 , pRegexMatcher( NULL )
85 , pWLD( 0 )
87 SearchOptions aOpt;
88 aOpt.algorithmType = SearchAlgorithms_ABSOLUTE;
89 aOpt.searchFlag = SearchFlags::ALL_IGNORE_CASE;
90 //aOpt.Locale = ???;
91 setOptions( aOpt );
94 TextSearch::~TextSearch()
96 delete pRegexMatcher;
97 delete pWLD;
98 delete pJumpTable;
99 delete pJumpTable2;
102 void TextSearch::setOptions( const SearchOptions& rOptions ) throw( RuntimeException )
104 aSrchPara = rOptions;
106 delete pRegexMatcher, pRegexMatcher = NULL;
107 delete pWLD, pWLD = 0;
108 delete pJumpTable, pJumpTable = 0;
109 delete pJumpTable2, pJumpTable2 = 0;
111 // Create Transliteration class
112 if( aSrchPara.transliterateFlags & SIMPLE_TRANS_MASK )
114 if( !xTranslit.is() )
115 xTranslit.set( Transliteration::create( m_xContext ) );
116 xTranslit->loadModule(
117 (TransliterationModules)( aSrchPara.transliterateFlags & SIMPLE_TRANS_MASK ),
118 aSrchPara.Locale);
120 else if( xTranslit.is() )
121 xTranslit = 0;
123 // Create Transliteration for 2<->1, 2<->2 transliteration
124 if ( aSrchPara.transliterateFlags & COMPLEX_TRANS_MASK )
126 if( !xTranslit2.is() )
127 xTranslit2.set( Transliteration::create( m_xContext ) );
128 // Load transliteration module
129 xTranslit2->loadModule(
130 (TransliterationModules)( aSrchPara.transliterateFlags & COMPLEX_TRANS_MASK ),
131 aSrchPara.Locale);
134 if ( !xBreak.is() )
135 xBreak = com::sun::star::i18n::BreakIterator::create( m_xContext );
137 sSrchStr = aSrchPara.searchString;
139 // Transliterate search string.
140 if (aSrchPara.algorithmType == SearchAlgorithms_REGEXP)
142 if (aSrchPara.transliterateFlags & SIMPLE_TRANS_MASK_REPATTERN)
144 if ((aSrchPara.transliterateFlags & SIMPLE_TRANS_MASK_REPATTERN) !=
145 (aSrchPara.transliterateFlags & SIMPLE_TRANS_MASK))
147 com::sun::star::uno::Reference< XExtendedTransliteration > xTranslitPattern(
148 Transliteration::create( m_xContext ));
149 if (xTranslitPattern.is())
151 xTranslitPattern->loadModule(
152 (TransliterationModules)( aSrchPara.transliterateFlags & SIMPLE_TRANS_MASK_REPATTERN ),
153 aSrchPara.Locale);
154 sSrchStr = xTranslitPattern->transliterateString2String(
155 aSrchPara.searchString, 0, aSrchPara.searchString.getLength());
158 else
160 if (xTranslit.is())
161 sSrchStr = xTranslit->transliterateString2String(
162 aSrchPara.searchString, 0, aSrchPara.searchString.getLength());
164 // xTranslit2 complex transliterated sSrchStr2 is not used in
165 // regex, see TextSearch::searchForward() and
166 // TextSearch::searchBackward()
169 else
171 if ( xTranslit.is() && aSrchPara.transliterateFlags & SIMPLE_TRANS_MASK )
172 sSrchStr = xTranslit->transliterateString2String(
173 aSrchPara.searchString, 0, aSrchPara.searchString.getLength());
175 if ( xTranslit2.is() && aSrchPara.transliterateFlags & COMPLEX_TRANS_MASK )
176 sSrchStr2 = xTranslit2->transliterateString2String(
177 aSrchPara.searchString, 0, aSrchPara.searchString.getLength());
180 // When start or end of search string is a complex script type, we need to
181 // make sure the result boundary is not located in the middle of cell.
182 checkCTLStart = (xBreak.is() && (xBreak->getScriptType(sSrchStr, 0) ==
183 ScriptType::COMPLEX));
184 checkCTLEnd = (xBreak.is() && (xBreak->getScriptType(sSrchStr,
185 sSrchStr.getLength()-1) == ScriptType::COMPLEX));
187 switch( aSrchPara.algorithmType)
189 case SearchAlgorithms_REGEXP:
190 fnForward = &TextSearch::RESrchFrwrd;
191 fnBackward = &TextSearch::RESrchBkwrd;
192 RESrchPrepare( aSrchPara);
193 break;
195 case SearchAlgorithms_APPROXIMATE:
196 fnForward = &TextSearch::ApproxSrchFrwrd;
197 fnBackward = &TextSearch::ApproxSrchBkwrd;
199 pWLD = new WLevDistance( sSrchStr.getStr(), aSrchPara.changedChars,
200 aSrchPara.insertedChars, aSrchPara.deletedChars,
201 0 != (SearchFlags::LEV_RELAXED & aSrchPara.searchFlag ) );
203 nLimit = pWLD->GetLimit();
204 break;
206 default:
207 fnForward = &TextSearch::NSrchFrwrd;
208 fnBackward = &TextSearch::NSrchBkwrd;
209 break;
213 sal_Int32 FindPosInSeq_Impl( const Sequence <sal_Int32>& rOff, sal_Int32 nPos )
215 sal_Int32 nRet = 0, nEnd = rOff.getLength();
216 while( nRet < nEnd && nPos > rOff[ nRet ] ) ++nRet;
217 return nRet;
220 sal_Bool TextSearch::isCellStart(const OUString& searchStr, sal_Int32 nPos)
221 throw( RuntimeException )
223 sal_Int32 nDone;
224 return nPos == xBreak->previousCharacters(searchStr, nPos+1,
225 aSrchPara.Locale, CharacterIteratorMode::SKIPCELL, 1, nDone);
228 SearchResult TextSearch::searchForward( const OUString& searchStr, sal_Int32 startPos, sal_Int32 endPos )
229 throw( RuntimeException )
231 SearchResult sres;
233 OUString in_str(searchStr);
234 sal_Int32 newStartPos = startPos;
235 sal_Int32 newEndPos = endPos;
237 bUsePrimarySrchStr = true;
239 if ( xTranslit.is() )
241 // apply normal transliteration (1<->1, 1<->0)
242 com::sun::star::uno::Sequence <sal_Int32> offset( in_str.getLength());
243 in_str = xTranslit->transliterate( searchStr, 0, in_str.getLength(), offset );
245 // JP 20.6.2001: also the start and end positions must be corrected!
246 if( startPos )
247 newStartPos = FindPosInSeq_Impl( offset, startPos );
249 if( endPos < searchStr.getLength() )
250 newEndPos = FindPosInSeq_Impl( offset, endPos );
251 else
252 newEndPos = in_str.getLength();
254 sres = (this->*fnForward)( in_str, newStartPos, newEndPos );
256 // Map offsets back to untransliterated string.
257 const sal_Int32 nOffsets = offset.getLength();
258 if (nOffsets)
260 // For regex nGroups is the number of groups+1 with group 0 being
261 // the entire match.
262 const sal_Int32 nGroups = sres.startOffset.getLength();
263 for ( sal_Int32 k = 0; k < nGroups; k++ )
265 const sal_Int32 nStart = sres.startOffset[k];
266 if (nStart > 0)
267 sres.startOffset[k] = (nStart < nOffsets ? offset[nStart] : (offset[nOffsets - 1] + 1));
268 // JP 20.6.2001: end is ever exclusive and then don't return
269 // the position of the next character - return the
270 // next position behind the last found character!
271 // "a b c" find "b" must return 2,3 and not 2,4!!!
272 const sal_Int32 nStop = sres.endOffset[k];
273 if (nStop > 0)
274 sres.endOffset[k] = offset[(nStop <= nOffsets ? nStop : nOffsets) - 1] + 1;
278 else
280 sres = (this->*fnForward)( in_str, startPos, endPos );
283 if ( xTranslit2.is() && aSrchPara.algorithmType != SearchAlgorithms_REGEXP)
285 SearchResult sres2;
287 in_str = OUString(searchStr);
288 com::sun::star::uno::Sequence <sal_Int32> offset( in_str.getLength());
290 in_str = xTranslit2->transliterate( searchStr, 0, in_str.getLength(), offset );
292 if( startPos )
293 startPos = FindPosInSeq_Impl( offset, startPos );
295 if( endPos < searchStr.getLength() )
296 endPos = FindPosInSeq_Impl( offset, endPos );
297 else
298 endPos = in_str.getLength();
300 bUsePrimarySrchStr = false;
301 sres2 = (this->*fnForward)( in_str, startPos, endPos );
303 for ( int k = 0; k < sres2.startOffset.getLength(); k++ )
305 if (sres2.startOffset[k])
306 sres2.startOffset[k] = offset[sres2.startOffset[k]-1] + 1;
307 if (sres2.endOffset[k])
308 sres2.endOffset[k] = offset[sres2.endOffset[k]-1] + 1;
311 // pick first and long one
312 if ( sres.subRegExpressions == 0)
313 return sres2;
314 if ( sres2.subRegExpressions == 1)
316 if ( sres.startOffset[0] > sres2.startOffset[0])
317 return sres2;
318 else if ( sres.startOffset[0] == sres2.startOffset[0] &&
319 sres.endOffset[0] < sres2.endOffset[0])
320 return sres2;
324 return sres;
327 SearchResult TextSearch::searchBackward( const OUString& searchStr, sal_Int32 startPos, sal_Int32 endPos )
328 throw(RuntimeException)
330 SearchResult sres;
332 OUString in_str(searchStr);
333 sal_Int32 newStartPos = startPos;
334 sal_Int32 newEndPos = endPos;
336 bUsePrimarySrchStr = true;
338 if ( xTranslit.is() )
340 // apply only simple 1<->1 transliteration here
341 com::sun::star::uno::Sequence <sal_Int32> offset( in_str.getLength());
342 in_str = xTranslit->transliterate( searchStr, 0, in_str.getLength(), offset );
344 // JP 20.6.2001: also the start and end positions must be corrected!
345 if( startPos < searchStr.getLength() )
346 newStartPos = FindPosInSeq_Impl( offset, startPos );
347 else
348 newStartPos = in_str.getLength();
350 if( endPos )
351 newEndPos = FindPosInSeq_Impl( offset, endPos );
353 sres = (this->*fnBackward)( in_str, newStartPos, newEndPos );
355 // Map offsets back to untransliterated string.
356 const sal_Int32 nOffsets = offset.getLength();
357 if (nOffsets)
359 // For regex nGroups is the number of groups+1 with group 0 being
360 // the entire match.
361 const sal_Int32 nGroups = sres.startOffset.getLength();
362 for ( sal_Int32 k = 0; k < nGroups; k++ )
364 const sal_Int32 nStart = sres.startOffset[k];
365 if (nStart > 0)
366 sres.startOffset[k] = offset[(nStart <= nOffsets ? nStart : nOffsets) - 1] + 1;
367 // JP 20.6.2001: end is ever exclusive and then don't return
368 // the position of the next character - return the
369 // next position behind the last found character!
370 // "a b c" find "b" must return 2,3 and not 2,4!!!
371 const sal_Int32 nStop = sres.endOffset[k];
372 if (nStop > 0)
373 sres.endOffset[k] = (nStop < nOffsets ? offset[nStop] : (offset[nOffsets - 1] + 1));
377 else
379 sres = (this->*fnBackward)( in_str, startPos, endPos );
382 if ( xTranslit2.is() && aSrchPara.algorithmType != SearchAlgorithms_REGEXP )
384 SearchResult sres2;
386 in_str = OUString(searchStr);
387 com::sun::star::uno::Sequence <sal_Int32> offset( in_str.getLength());
389 in_str = xTranslit2->transliterate(searchStr, 0, in_str.getLength(), offset);
391 if( startPos < searchStr.getLength() )
392 startPos = FindPosInSeq_Impl( offset, startPos );
393 else
394 startPos = in_str.getLength();
396 if( endPos )
397 endPos = FindPosInSeq_Impl( offset, endPos );
399 bUsePrimarySrchStr = false;
400 sres2 = (this->*fnBackward)( in_str, startPos, endPos );
402 for( int k = 0; k < sres2.startOffset.getLength(); k++ )
404 if (sres2.startOffset[k])
405 sres2.startOffset[k] = offset[sres2.startOffset[k]-1]+1;
406 if (sres2.endOffset[k])
407 sres2.endOffset[k] = offset[sres2.endOffset[k]-1]+1;
410 // pick last and long one
411 if ( sres.subRegExpressions == 0 )
412 return sres2;
413 if ( sres2.subRegExpressions == 1 )
415 if ( sres.startOffset[0] < sres2.startOffset[0] )
416 return sres2;
417 if ( sres.startOffset[0] == sres2.startOffset[0] &&
418 sres.endOffset[0] > sres2.endOffset[0] )
419 return sres2;
423 return sres;
426 //---------------------------------------------------------------------
428 bool TextSearch::IsDelimiter( const OUString& rStr, sal_Int32 nPos ) const
430 bool bRet = 1;
431 if( '\x7f' != rStr[nPos])
433 if ( !xCharClass.is() )
434 xCharClass = CharacterClassification::create( m_xContext );
435 sal_Int32 nCType = xCharClass->getCharacterType( rStr, nPos,
436 aSrchPara.Locale );
437 if( 0 != (( KCharacterType::DIGIT | KCharacterType::ALPHA |
438 KCharacterType::LETTER ) & nCType ) )
439 bRet = 0;
441 return bRet;
444 // --------- helper methods for Boyer-Moore like text searching ----------
445 // TODO: use ICU's regex UREGEX_LITERAL mode instead when it becomes available
447 void TextSearch::MakeForwardTab()
449 // create the jumptable for the search text
450 if( pJumpTable )
452 if( bIsForwardTab )
453 return ; // the jumpTable is ok
454 delete pJumpTable;
456 bIsForwardTab = true;
458 sal_Int32 n, nLen = sSrchStr.getLength();
459 pJumpTable = new TextSearchJumpTable;
461 for( n = 0; n < nLen - 1; ++n )
463 sal_Unicode cCh = sSrchStr[n];
464 sal_Int32 nDiff = nLen - n - 1;
465 TextSearchJumpTable::value_type aEntry( cCh, nDiff );
467 ::std::pair< TextSearchJumpTable::iterator, bool > aPair =
468 pJumpTable->insert( aEntry );
469 if ( !aPair.second )
470 (*(aPair.first)).second = nDiff;
474 void TextSearch::MakeForwardTab2()
476 // create the jumptable for the search text
477 if( pJumpTable2 )
479 if( bIsForwardTab )
480 return ; // the jumpTable is ok
481 delete pJumpTable2;
483 bIsForwardTab = true;
485 sal_Int32 n, nLen = sSrchStr2.getLength();
486 pJumpTable2 = new TextSearchJumpTable;
488 for( n = 0; n < nLen - 1; ++n )
490 sal_Unicode cCh = sSrchStr2[n];
491 sal_Int32 nDiff = nLen - n - 1;
493 TextSearchJumpTable::value_type aEntry( cCh, nDiff );
494 ::std::pair< TextSearchJumpTable::iterator, bool > aPair =
495 pJumpTable2->insert( aEntry );
496 if ( !aPair.second )
497 (*(aPair.first)).second = nDiff;
501 void TextSearch::MakeBackwardTab()
503 // create the jumptable for the search text
504 if( pJumpTable )
506 if( !bIsForwardTab )
507 return ; // the jumpTable is ok
508 delete pJumpTable;
510 bIsForwardTab = false;
512 sal_Int32 n, nLen = sSrchStr.getLength();
513 pJumpTable = new TextSearchJumpTable;
515 for( n = nLen-1; n > 0; --n )
517 sal_Unicode cCh = sSrchStr[n];
518 TextSearchJumpTable::value_type aEntry( cCh, n );
519 ::std::pair< TextSearchJumpTable::iterator, bool > aPair =
520 pJumpTable->insert( aEntry );
521 if ( !aPair.second )
522 (*(aPair.first)).second = n;
526 void TextSearch::MakeBackwardTab2()
528 // create the jumptable for the search text
529 if( pJumpTable2 )
531 if( !bIsForwardTab )
532 return ; // the jumpTable is ok
533 delete pJumpTable2;
535 bIsForwardTab = false;
537 sal_Int32 n, nLen = sSrchStr2.getLength();
538 pJumpTable2 = new TextSearchJumpTable;
540 for( n = nLen-1; n > 0; --n )
542 sal_Unicode cCh = sSrchStr2[n];
543 TextSearchJumpTable::value_type aEntry( cCh, n );
544 ::std::pair< TextSearchJumpTable::iterator, bool > aPair =
545 pJumpTable2->insert( aEntry );
546 if ( !aPair.second )
547 (*(aPair.first)).second = n;
551 sal_Int32 TextSearch::GetDiff( const sal_Unicode cChr ) const
553 TextSearchJumpTable *pJump;
554 OUString sSearchKey;
556 if ( bUsePrimarySrchStr ) {
557 pJump = pJumpTable;
558 sSearchKey = sSrchStr;
559 } else {
560 pJump = pJumpTable2;
561 sSearchKey = sSrchStr2;
564 TextSearchJumpTable::const_iterator iLook = pJump->find( cChr );
565 if ( iLook == pJump->end() )
566 return sSearchKey.getLength();
567 return (*iLook).second;
571 // TextSearch::NSrchFrwrd is mis-optimized on unxsoli (#i105945#)
572 SearchResult TextSearch::NSrchFrwrd( const OUString& searchStr, sal_Int32 startPos, sal_Int32 endPos )
573 throw(RuntimeException)
575 SearchResult aRet;
576 aRet.subRegExpressions = 0;
578 OUString sSearchKey = bUsePrimarySrchStr ? sSrchStr : sSrchStr2;
580 OUString aStr( searchStr );
581 sal_Int32 nSuchIdx = aStr.getLength();
582 sal_Int32 nEnde = endPos;
583 if( !nSuchIdx || !sSearchKey.getLength() || sSearchKey.getLength() > nSuchIdx )
584 return aRet;
587 if( nEnde < sSearchKey.getLength() ) // position inside the search region ?
588 return aRet;
590 nEnde -= sSearchKey.getLength();
592 if (bUsePrimarySrchStr)
593 MakeForwardTab(); // create the jumptable
594 else
595 MakeForwardTab2();
597 for (sal_Int32 nCmpIdx = startPos; // start position for the search
598 nCmpIdx <= nEnde;
599 nCmpIdx += GetDiff( aStr[nCmpIdx + sSearchKey.getLength()-1]))
601 // if the match would be the completed cells, skip it.
602 if ( (checkCTLStart && !isCellStart( aStr, nCmpIdx )) || (checkCTLEnd
603 && !isCellStart( aStr, nCmpIdx + sSearchKey.getLength())) )
604 continue;
606 nSuchIdx = sSearchKey.getLength() - 1;
607 while( nSuchIdx >= 0 && sSearchKey[nSuchIdx] == aStr[nCmpIdx + nSuchIdx])
609 if( nSuchIdx == 0 )
611 if( SearchFlags::NORM_WORD_ONLY & aSrchPara.searchFlag )
613 sal_Int32 nFndEnd = nCmpIdx + sSearchKey.getLength();
614 bool bAtStart = !nCmpIdx;
615 bool bAtEnd = nFndEnd == endPos;
616 bool bDelimBefore = bAtStart || IsDelimiter( aStr, nCmpIdx-1 );
617 bool bDelimBehind = IsDelimiter( aStr, nFndEnd );
618 // * 1 -> only one word in the paragraph
619 // * 2 -> at begin of paragraph
620 // * 3 -> at end of paragraph
621 // * 4 -> inside the paragraph
622 if( !( ( bAtStart && bAtEnd ) || // 1
623 ( bAtStart && bDelimBehind ) || // 2
624 ( bAtEnd && bDelimBefore ) || // 3
625 ( bDelimBefore && bDelimBehind ))) // 4
626 break;
629 aRet.subRegExpressions = 1;
630 aRet.startOffset.realloc( 1 );
631 aRet.startOffset[ 0 ] = nCmpIdx;
632 aRet.endOffset.realloc( 1 );
633 aRet.endOffset[ 0 ] = nCmpIdx + sSearchKey.getLength();
635 return aRet;
637 else
638 nSuchIdx--;
641 return aRet;
644 SearchResult TextSearch::NSrchBkwrd( const OUString& searchStr, sal_Int32 startPos, sal_Int32 endPos )
645 throw(RuntimeException)
647 SearchResult aRet;
648 aRet.subRegExpressions = 0;
650 OUString sSearchKey = bUsePrimarySrchStr ? sSrchStr : sSrchStr2;
652 OUString aStr( searchStr );
653 sal_Int32 nSuchIdx = aStr.getLength();
654 sal_Int32 nEnde = endPos;
655 if( nSuchIdx == 0 || sSearchKey.isEmpty() || sSearchKey.getLength() > nSuchIdx)
656 return aRet;
658 if (bUsePrimarySrchStr)
659 MakeBackwardTab(); // create the jumptable
660 else
661 MakeBackwardTab2();
663 if( nEnde == nSuchIdx ) // end position for the search
664 nEnde = sSearchKey.getLength();
665 else
666 nEnde += sSearchKey.getLength();
668 sal_Int32 nCmpIdx = startPos; // start position for the search
670 while (nCmpIdx >= nEnde)
672 // if the match would be the completed cells, skip it.
673 if ( (!checkCTLStart || isCellStart( aStr, nCmpIdx -
674 sSearchKey.getLength() )) && (!checkCTLEnd ||
675 isCellStart( aStr, nCmpIdx)))
677 nSuchIdx = 0;
678 while( nSuchIdx < sSearchKey.getLength() && sSearchKey[nSuchIdx] ==
679 aStr[nCmpIdx + nSuchIdx - sSearchKey.getLength()] )
680 nSuchIdx++;
681 if( nSuchIdx >= sSearchKey.getLength() )
683 if( SearchFlags::NORM_WORD_ONLY & aSrchPara.searchFlag )
685 sal_Int32 nFndStt = nCmpIdx - sSearchKey.getLength();
686 bool bAtStart = !nFndStt;
687 bool bAtEnd = nCmpIdx == startPos;
688 bool bDelimBehind = IsDelimiter( aStr, nCmpIdx );
689 bool bDelimBefore = bAtStart || // begin of paragraph
690 IsDelimiter( aStr, nFndStt-1 );
691 // * 1 -> only one word in the paragraph
692 // * 2 -> at begin of paragraph
693 // * 3 -> at end of paragraph
694 // * 4 -> inside the paragraph
695 if( ( bAtStart && bAtEnd ) || // 1
696 ( bAtStart && bDelimBehind ) || // 2
697 ( bAtEnd && bDelimBefore ) || // 3
698 ( bDelimBefore && bDelimBehind )) // 4
700 aRet.subRegExpressions = 1;
701 aRet.startOffset.realloc( 1 );
702 aRet.startOffset[ 0 ] = nCmpIdx;
703 aRet.endOffset.realloc( 1 );
704 aRet.endOffset[ 0 ] = nCmpIdx - sSearchKey.getLength();
705 return aRet;
708 else
710 aRet.subRegExpressions = 1;
711 aRet.startOffset.realloc( 1 );
712 aRet.startOffset[ 0 ] = nCmpIdx;
713 aRet.endOffset.realloc( 1 );
714 aRet.endOffset[ 0 ] = nCmpIdx - sSearchKey.getLength();
715 return aRet;
719 nSuchIdx = GetDiff( aStr[nCmpIdx - sSearchKey.getLength()] );
720 if( nCmpIdx < nSuchIdx )
721 return aRet;
722 nCmpIdx -= nSuchIdx;
724 return aRet;
727 void TextSearch::RESrchPrepare( const ::com::sun::star::util::SearchOptions& rOptions)
729 // select the transliterated pattern string
730 const OUString& rPatternStr =
731 (rOptions.transliterateFlags & SIMPLE_TRANS_MASK) ? sSrchStr
732 : ((rOptions.transliterateFlags & COMPLEX_TRANS_MASK) ? sSrchStr2 : rOptions.searchString);
734 sal_uInt32 nIcuSearchFlags = UREGEX_UWORD; // request UAX#29 unicode capability
735 // map com::sun::star::util::SearchFlags to ICU uregex.h flags
736 // TODO: REG_EXTENDED, REG_NOT_BEGINOFLINE, REG_NOT_ENDOFLINE
737 // REG_NEWLINE is neither properly defined nor used anywhere => not implemented
738 // REG_NOSUB is not used anywhere => not implemented
739 // NORM_WORD_ONLY is only used for SearchAlgorithm==Absolute
740 // LEV_RELAXED is only used for SearchAlgorithm==Approximate
741 // Note that the search flag ALL_IGNORE_CASE is deprecated in UNO
742 // probably because the transliteration flag IGNORE_CASE handles it as well.
743 if( (rOptions.searchFlag & com::sun::star::util::SearchFlags::ALL_IGNORE_CASE) != 0
744 || (rOptions.transliterateFlags & TransliterationModules_IGNORE_CASE) != 0)
745 nIcuSearchFlags |= UREGEX_CASE_INSENSITIVE;
746 UErrorCode nIcuErr = U_ZERO_ERROR;
747 // assumption: transliteration didn't mangle regexp control chars
748 IcuUniString aIcuSearchPatStr( (const UChar*)rPatternStr.getStr(), rPatternStr.getLength());
749 #ifndef DISABLE_WORDBOUND_EMULATION
750 // for conveniance specific syntax elements of the old regex engine are emulated
751 // - by replacing \< with "word-break followed by a look-ahead word-char"
752 static const IcuUniString aChevronPatternB( "\\\\<", -1, IcuUniString::kInvariant);
753 static const IcuUniString aChevronReplaceB( "\\\\b(?=\\\\w)", -1, IcuUniString::kInvariant);
754 static RegexMatcher aChevronMatcherB( aChevronPatternB, 0, nIcuErr);
755 aChevronMatcherB.reset( aIcuSearchPatStr);
756 aIcuSearchPatStr = aChevronMatcherB.replaceAll( aChevronReplaceB, nIcuErr);
757 aChevronMatcherB.reset();
758 // - by replacing \> with "look-behind word-char followed by a word-break"
759 static const IcuUniString aChevronPatternE( "\\\\>", -1, IcuUniString::kInvariant);
760 static const IcuUniString aChevronReplaceE( "(?<=\\\\w)\\\\b", -1, IcuUniString::kInvariant);
761 static RegexMatcher aChevronMatcherE( aChevronPatternE, 0, nIcuErr);
762 aChevronMatcherE.reset( aIcuSearchPatStr);
763 aIcuSearchPatStr = aChevronMatcherE.replaceAll( aChevronReplaceE, nIcuErr);
764 aChevronMatcherE.reset();
765 #endif
766 pRegexMatcher = new RegexMatcher( aIcuSearchPatStr, nIcuSearchFlags, nIcuErr);
767 if (nIcuErr)
769 delete pRegexMatcher;
770 pRegexMatcher = NULL;
772 else
774 // Pathological patterns may result in exponential run time making the
775 // application appear to be frozen. Limit that. Documentation for this
776 // call says
777 // https://ssl.icu-project.org/apiref/icu4c/classicu_1_1RegexMatcher.html#a6ebcfcab4fe6a38678c0291643a03a00
778 // "The units of the limit are steps of the match engine.
779 // Correspondence with actual processor time will depend on the speed
780 // of the processor and the details of the specific pattern, but will
781 // typically be on the order of milliseconds."
782 // Just what is a good value? 42 is always an answer ... the 23 enigma
783 // as well.. which on the dev's machine is roughly 50 seconds with the
784 // pattern of fdo#70627.
785 /* TODO: make this a configuration settable value and possibly take
786 * complexity of expression into account and maybe even length of text
787 * to be matched; currently (2013-11-25) that is at most one 64k
788 * paragraph per RESrchFrwrd()/RESrchBkwrd() call. */
789 pRegexMatcher->setTimeLimit( 23*1000, nIcuErr);
793 //---------------------------------------------------------------------------
795 SearchResult TextSearch::RESrchFrwrd( const OUString& searchStr,
796 sal_Int32 startPos, sal_Int32 endPos )
797 throw(RuntimeException)
799 SearchResult aRet;
800 aRet.subRegExpressions = 0;
801 if( !pRegexMatcher)
802 return aRet;
804 if( endPos > searchStr.getLength())
805 endPos = searchStr.getLength();
807 // use the ICU RegexMatcher to find the matches
808 UErrorCode nIcuErr = U_ZERO_ERROR;
809 const IcuUniString aSearchTargetStr( (const UChar*)searchStr.getStr(), endPos);
810 pRegexMatcher->reset( aSearchTargetStr);
811 // search until there is a valid match
812 for(;;)
814 if( !pRegexMatcher->find( startPos, nIcuErr))
815 return aRet;
817 // #i118887# ignore zero-length matches e.g. "a*" in "bc"
818 int nStartOfs = pRegexMatcher->start( nIcuErr);
819 int nEndOfs = pRegexMatcher->end( nIcuErr);
820 if( nStartOfs < nEndOfs)
821 break;
822 // If the zero-length match is behind the string, do not match it again
823 // and again until startPos reaches there. A match behind the string is
824 // a "$" anchor.
825 if (nStartOfs == endPos)
826 break;
827 // try at next position if there was a zero-length match
828 if( ++startPos >= endPos)
829 return aRet;
832 // extract the result of the search
833 const int nGroupCount = pRegexMatcher->groupCount();
834 aRet.subRegExpressions = nGroupCount + 1;
835 aRet.startOffset.realloc( aRet.subRegExpressions);
836 aRet.endOffset.realloc( aRet.subRegExpressions);
837 aRet.startOffset[0] = pRegexMatcher->start( nIcuErr);
838 aRet.endOffset[0] = pRegexMatcher->end( nIcuErr);
839 for( int i = 1; i <= nGroupCount; ++i) {
840 aRet.startOffset[i] = pRegexMatcher->start( i, nIcuErr);
841 aRet.endOffset[i] = pRegexMatcher->end( i, nIcuErr);
844 return aRet;
847 SearchResult TextSearch::RESrchBkwrd( const OUString& searchStr,
848 sal_Int32 startPos, sal_Int32 endPos )
849 throw(RuntimeException)
851 // NOTE: for backwards search callers provide startPos/endPos inverted!
852 SearchResult aRet;
853 aRet.subRegExpressions = 0;
854 if( !pRegexMatcher)
855 return aRet;
857 if( startPos > searchStr.getLength())
858 startPos = searchStr.getLength();
860 // use the ICU RegexMatcher to find the matches
861 // TODO: use ICU's backward searching once it becomes available
862 // as its replacement using forward search is not as good as the real thing
863 UErrorCode nIcuErr = U_ZERO_ERROR;
864 const IcuUniString aSearchTargetStr( (const UChar*)searchStr.getStr(), startPos);
865 pRegexMatcher->reset( aSearchTargetStr);
866 if( !pRegexMatcher->find( endPos, nIcuErr))
867 return aRet;
869 // find the last match
870 int nLastPos = 0;
871 int nFoundEnd = 0;
872 int nGoodPos = 0, nGoodEnd = 0;
873 bool bFirst = true;
874 do {
875 nLastPos = pRegexMatcher->start( nIcuErr);
876 nFoundEnd = pRegexMatcher->end( nIcuErr);
877 if (nLastPos < nFoundEnd)
879 // remember last non-zero-length match
880 nGoodPos = nLastPos;
881 nGoodEnd = nFoundEnd;
883 if( nFoundEnd >= startPos)
884 break;
885 bFirst = false;
886 if( nFoundEnd == nLastPos)
887 ++nFoundEnd;
888 } while( pRegexMatcher->find( nFoundEnd, nIcuErr));
890 // Ignore all zero-length matches except "$" anchor on first match.
891 if (nGoodPos == nGoodEnd)
893 if (bFirst && nLastPos == startPos)
894 nGoodPos = nLastPos;
895 else
896 return aRet;
899 // find last match again to get its details
900 pRegexMatcher->find( nGoodPos, nIcuErr);
902 // fill in the details of the last match
903 const int nGroupCount = pRegexMatcher->groupCount();
904 aRet.subRegExpressions = nGroupCount + 1;
905 aRet.startOffset.realloc( aRet.subRegExpressions);
906 aRet.endOffset.realloc( aRet.subRegExpressions);
907 // NOTE: existing users of backward search seem to expect startOfs/endOfs being inverted!
908 aRet.startOffset[0] = pRegexMatcher->end( nIcuErr);
909 aRet.endOffset[0] = pRegexMatcher->start( nIcuErr);
910 for( int i = 1; i <= nGroupCount; ++i) {
911 aRet.startOffset[i] = pRegexMatcher->end( i, nIcuErr);
912 aRet.endOffset[i] = pRegexMatcher->start( i, nIcuErr);
915 return aRet;
918 //---------------------------------------------------------------------------
920 // search for words phonetically
921 SearchResult TextSearch::ApproxSrchFrwrd( const OUString& searchStr,
922 sal_Int32 startPos, sal_Int32 endPos )
923 throw(RuntimeException)
925 SearchResult aRet;
926 aRet.subRegExpressions = 0;
928 if( !xBreak.is() )
929 return aRet;
931 OUString aWTemp( searchStr );
933 register sal_Int32 nStt, nEnd;
935 Boundary aWBnd = xBreak->getWordBoundary( aWTemp, startPos,
936 aSrchPara.Locale,
937 WordType::ANYWORD_IGNOREWHITESPACES, sal_True );
941 if( aWBnd.startPos >= endPos )
942 break;
943 nStt = aWBnd.startPos < startPos ? startPos : aWBnd.startPos;
944 nEnd = aWBnd.endPos > endPos ? endPos : aWBnd.endPos;
946 if( nStt < nEnd &&
947 pWLD->WLD( aWTemp.getStr() + nStt, nEnd - nStt ) <= nLimit )
949 aRet.subRegExpressions = 1;
950 aRet.startOffset.realloc( 1 );
951 aRet.startOffset[ 0 ] = nStt;
952 aRet.endOffset.realloc( 1 );
953 aRet.endOffset[ 0 ] = nEnd;
954 break;
957 nStt = nEnd - 1;
958 aWBnd = xBreak->nextWord( aWTemp, nStt, aSrchPara.Locale,
959 WordType::ANYWORD_IGNOREWHITESPACES);
960 } while( aWBnd.startPos != aWBnd.endPos ||
961 (aWBnd.endPos != aWTemp.getLength() && aWBnd.endPos != nEnd) );
962 // #i50244# aWBnd.endPos != nEnd : in case there is _no_ word (only
963 // whitespace) in searchStr, getWordBoundary() returned startPos,startPos
964 // and nextWord() does also => don't loop forever.
965 return aRet;
968 SearchResult TextSearch::ApproxSrchBkwrd( const OUString& searchStr,
969 sal_Int32 startPos, sal_Int32 endPos )
970 throw(RuntimeException)
972 SearchResult aRet;
973 aRet.subRegExpressions = 0;
975 if( !xBreak.is() )
976 return aRet;
978 OUString aWTemp( searchStr );
980 register sal_Int32 nStt, nEnd;
982 Boundary aWBnd = xBreak->getWordBoundary( aWTemp, startPos,
983 aSrchPara.Locale,
984 WordType::ANYWORD_IGNOREWHITESPACES, sal_True );
988 if( aWBnd.endPos <= endPos )
989 break;
990 nStt = aWBnd.startPos < endPos ? endPos : aWBnd.startPos;
991 nEnd = aWBnd.endPos > startPos ? startPos : aWBnd.endPos;
993 if( nStt < nEnd &&
994 pWLD->WLD( aWTemp.getStr() + nStt, nEnd - nStt ) <= nLimit )
996 aRet.subRegExpressions = 1;
997 aRet.startOffset.realloc( 1 );
998 aRet.startOffset[ 0 ] = nEnd;
999 aRet.endOffset.realloc( 1 );
1000 aRet.endOffset[ 0 ] = nStt;
1001 break;
1003 if( !nStt )
1004 break;
1006 aWBnd = xBreak->previousWord( aWTemp, nStt, aSrchPara.Locale,
1007 WordType::ANYWORD_IGNOREWHITESPACES);
1008 } while( aWBnd.startPos != aWBnd.endPos || aWBnd.endPos != aWTemp.getLength() );
1009 return aRet;
1013 static const sal_Char cSearchName[] = "com.sun.star.util.TextSearch";
1014 static const sal_Char cSearchImpl[] = "com.sun.star.util.TextSearch_i18n";
1016 static OUString getServiceName_Static()
1018 return OUString::createFromAscii( cSearchName );
1021 static OUString getImplementationName_Static()
1023 return OUString::createFromAscii( cSearchImpl );
1026 OUString SAL_CALL
1027 TextSearch::getImplementationName()
1028 throw( RuntimeException )
1030 return getImplementationName_Static();
1033 sal_Bool SAL_CALL
1034 TextSearch::supportsService(const OUString& rServiceName)
1035 throw( RuntimeException )
1037 return rServiceName == cSearchName;
1040 Sequence< OUString > SAL_CALL
1041 TextSearch::getSupportedServiceNames(void) throw( RuntimeException )
1043 Sequence< OUString > aRet(1);
1044 aRet[0] = getServiceName_Static();
1045 return aRet;
1048 ::com::sun::star::uno::Reference< ::com::sun::star::uno::XInterface >
1049 SAL_CALL TextSearch_CreateInstance(
1050 const ::com::sun::star::uno::Reference<
1051 ::com::sun::star::lang::XMultiServiceFactory >& rxMSF )
1053 return ::com::sun::star::uno::Reference<
1054 ::com::sun::star::uno::XInterface >(
1055 (::cppu::OWeakObject*) new TextSearch(
1056 comphelper::getComponentContext( rxMSF ) ) );
1059 extern "C"
1061 SAL_DLLPUBLIC_EXPORT void* SAL_CALL
1062 i18nsearch_component_getFactory( const sal_Char* sImplementationName,
1063 void* _pServiceManager,
1064 SAL_UNUSED_PARAMETER void* )
1066 void* pRet = NULL;
1068 ::com::sun::star::lang::XMultiServiceFactory* pServiceManager =
1069 reinterpret_cast< ::com::sun::star::lang::XMultiServiceFactory* >
1070 ( _pServiceManager );
1071 ::com::sun::star::uno::Reference<
1072 ::com::sun::star::lang::XSingleServiceFactory > xFactory;
1074 if ( 0 == rtl_str_compare( sImplementationName, cSearchImpl) )
1076 ::com::sun::star::uno::Sequence< OUString > aServiceNames(1);
1077 aServiceNames[0] = getServiceName_Static();
1078 xFactory = ::cppu::createSingleFactory(
1079 pServiceManager, getImplementationName_Static(),
1080 &TextSearch_CreateInstance, aServiceNames );
1083 if ( xFactory.is() )
1085 xFactory->acquire();
1086 pRet = xFactory.get();
1089 return pRet;
1092 } // extern "C"
1094 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */