Version 4.0.0.1, tag libreoffice-4.0.0.1
[LibreOffice.git] / i18npool / source / search / textsearch.cxx
blob314dd5b0e0b50eddd95869e54e7b980c6f39f64a
1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 /*
3 * This file is part of the LibreOffice project.
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 * This file incorporates work covered by the following license notice:
11 * Licensed to the Apache Software Foundation (ASF) under one or more
12 * contributor license agreements. See the NOTICE file distributed
13 * with this work for additional information regarding copyright
14 * ownership. The ASF licenses this file to you under the Apache
15 * License, Version 2.0 (the "License"); you may not use this file
16 * except in compliance with the License. You may obtain a copy of
17 * the License at http://www.apache.org/licenses/LICENSE-2.0 .
20 #include "textsearch.hxx"
21 #include "levdis.hxx"
22 #include <com/sun/star/lang/Locale.hpp>
23 #include <com/sun/star/lang/XMultiServiceFactory.hpp>
24 #include <comphelper/processfactory.hxx>
25 #include <com/sun/star/i18n/BreakIterator.hpp>
26 #include <com/sun/star/i18n/UnicodeType.hpp>
27 #include <com/sun/star/util/SearchFlags.hpp>
28 #include <com/sun/star/i18n/WordType.hpp>
29 #include <com/sun/star/i18n/ScriptType.hpp>
30 #include <com/sun/star/i18n/CharacterIteratorMode.hpp>
31 #include <com/sun/star/i18n/CharacterClassification.hpp>
32 #include <com/sun/star/i18n/KCharacterType.hpp>
33 #include <com/sun/star/i18n/Transliteration.hpp>
34 #include <com/sun/star/registry/XRegistryKey.hpp>
35 #include <cppuhelper/factory.hxx>
36 #include <cppuhelper/weak.hxx>
38 #ifdef _MSC_VER
39 // get rid of that dumb compiler warning
40 // identifier was truncated to '255' characters in the debug information
41 // for STL template usage, if .pdb files are to be created
42 #pragma warning( disable: 4786 )
43 #endif
45 #include <string.h>
47 using namespace ::com::sun::star::util;
48 using namespace ::com::sun::star::uno;
49 using namespace ::com::sun::star::lang;
50 using namespace ::com::sun::star::i18n;
51 using namespace ::com::sun::star;
53 static sal_Int32 COMPLEX_TRANS_MASK_TMP =
54 TransliterationModules_ignoreBaFa_ja_JP |
55 TransliterationModules_ignoreIterationMark_ja_JP |
56 TransliterationModules_ignoreTiJi_ja_JP |
57 TransliterationModules_ignoreHyuByu_ja_JP |
58 TransliterationModules_ignoreSeZe_ja_JP |
59 TransliterationModules_ignoreIandEfollowedByYa_ja_JP |
60 TransliterationModules_ignoreKiKuFollowedBySa_ja_JP |
61 TransliterationModules_ignoreProlongedSoundMark_ja_JP;
62 static const sal_Int32 COMPLEX_TRANS_MASK = COMPLEX_TRANS_MASK_TMP | TransliterationModules_IGNORE_KANA | TransliterationModules_FULLWIDTH_HALFWIDTH;
63 static const sal_Int32 SIMPLE_TRANS_MASK = ~COMPLEX_TRANS_MASK;
64 // Above 2 transliteration is simple but need to take effect in
65 // complex transliteration
67 TextSearch::TextSearch(const Reference < XComponentContext > & rxContext)
68 : m_xContext( rxContext )
69 , pJumpTable( 0 )
70 , pJumpTable2( 0 )
71 , pRegexMatcher( NULL )
72 , pWLD( 0 )
74 SearchOptions aOpt;
75 aOpt.algorithmType = SearchAlgorithms_ABSOLUTE;
76 aOpt.searchFlag = SearchFlags::ALL_IGNORE_CASE;
77 //aOpt.Locale = ???;
78 setOptions( aOpt );
81 TextSearch::~TextSearch()
83 delete pRegexMatcher;
84 delete pWLD;
85 delete pJumpTable;
86 delete pJumpTable2;
89 void TextSearch::setOptions( const SearchOptions& rOptions ) throw( RuntimeException )
91 aSrchPara = rOptions;
93 delete pRegexMatcher, pRegexMatcher = NULL;
94 delete pWLD, pWLD = 0;
95 delete pJumpTable, pJumpTable = 0;
96 delete pJumpTable2, pJumpTable2 = 0;
98 // Create Transliteration class
99 if( aSrchPara.transliterateFlags & SIMPLE_TRANS_MASK )
101 if( !xTranslit.is() )
102 xTranslit.set( Transliteration::create( m_xContext ) );
103 xTranslit->loadModule(
104 (TransliterationModules)( aSrchPara.transliterateFlags & SIMPLE_TRANS_MASK ),
105 aSrchPara.Locale);
107 else if( xTranslit.is() )
108 xTranslit = 0;
110 // Create Transliteration for 2<->1, 2<->2 transliteration
111 if ( aSrchPara.transliterateFlags & COMPLEX_TRANS_MASK )
113 if( !xTranslit2.is() )
114 xTranslit2.set( Transliteration::create( m_xContext ) );
115 // Load transliteration module
116 xTranslit2->loadModule(
117 (TransliterationModules)( aSrchPara.transliterateFlags & COMPLEX_TRANS_MASK ),
118 aSrchPara.Locale);
121 if ( !xBreak.is() )
122 xBreak = com::sun::star::i18n::BreakIterator::create( m_xContext );
124 sSrchStr = aSrchPara.searchString;
126 // use transliteration here
127 if ( xTranslit.is() &&
128 aSrchPara.transliterateFlags & SIMPLE_TRANS_MASK )
129 sSrchStr = xTranslit->transliterateString2String(
130 aSrchPara.searchString, 0, aSrchPara.searchString.getLength());
132 if ( xTranslit2.is() &&
133 aSrchPara.transliterateFlags & COMPLEX_TRANS_MASK )
134 sSrchStr2 = xTranslit2->transliterateString2String(
135 aSrchPara.searchString, 0, aSrchPara.searchString.getLength());
137 // When start or end of search string is a complex script type, we need to
138 // make sure the result boundary is not located in the middle of cell.
139 checkCTLStart = (xBreak.is() && (xBreak->getScriptType(sSrchStr, 0) ==
140 ScriptType::COMPLEX));
141 checkCTLEnd = (xBreak.is() && (xBreak->getScriptType(sSrchStr,
142 sSrchStr.getLength()-1) == ScriptType::COMPLEX));
144 switch( aSrchPara.algorithmType)
146 case SearchAlgorithms_REGEXP:
147 fnForward = &TextSearch::RESrchFrwrd;
148 fnBackward = &TextSearch::RESrchBkwrd;
149 RESrchPrepare( aSrchPara);
150 break;
152 case SearchAlgorithms_APPROXIMATE:
153 fnForward = &TextSearch::ApproxSrchFrwrd;
154 fnBackward = &TextSearch::ApproxSrchBkwrd;
156 pWLD = new WLevDistance( sSrchStr.getStr(), aSrchPara.changedChars,
157 aSrchPara.insertedChars, aSrchPara.deletedChars,
158 0 != (SearchFlags::LEV_RELAXED & aSrchPara.searchFlag ) );
160 nLimit = pWLD->GetLimit();
161 break;
163 default:
164 fnForward = &TextSearch::NSrchFrwrd;
165 fnBackward = &TextSearch::NSrchBkwrd;
166 break;
170 sal_Int32 FindPosInSeq_Impl( const Sequence <sal_Int32>& rOff, sal_Int32 nPos )
172 sal_Int32 nRet = 0, nEnd = rOff.getLength();
173 while( nRet < nEnd && nPos > rOff[ nRet ] ) ++nRet;
174 return nRet;
177 sal_Bool TextSearch::isCellStart(const OUString& searchStr, sal_Int32 nPos)
178 throw( RuntimeException )
180 sal_Int32 nDone;
181 return nPos == xBreak->previousCharacters(searchStr, nPos+1,
182 aSrchPara.Locale, CharacterIteratorMode::SKIPCELL, 1, nDone);
185 SearchResult TextSearch::searchForward( const OUString& searchStr, sal_Int32 startPos, sal_Int32 endPos )
186 throw( RuntimeException )
188 SearchResult sres;
190 OUString in_str(searchStr);
191 sal_Int32 newStartPos = startPos;
192 sal_Int32 newEndPos = endPos;
194 bUsePrimarySrchStr = true;
196 if ( xTranslit.is() )
198 // apply normal transliteration (1<->1, 1<->0)
199 com::sun::star::uno::Sequence <sal_Int32> offset( in_str.getLength());
200 in_str = xTranslit->transliterate( searchStr, 0, in_str.getLength(), offset );
202 // JP 20.6.2001: also the start and end positions must be corrected!
203 if( startPos )
204 newStartPos = FindPosInSeq_Impl( offset, startPos );
206 if( endPos < searchStr.getLength() )
207 newEndPos = FindPosInSeq_Impl( offset, endPos );
208 else
209 newEndPos = in_str.getLength();
211 sres = (this->*fnForward)( in_str, newStartPos, newEndPos );
213 for ( int k = 0; k < sres.startOffset.getLength(); k++ )
215 if (sres.startOffset[k])
216 sres.startOffset[k] = offset[sres.startOffset[k]];
217 // JP 20.6.2001: end is ever exclusive and then don't return
218 // the position of the next character - return the
219 // next position behind the last found character!
220 // "a b c" find "b" must return 2,3 and not 2,4!!!
221 if (sres.endOffset[k])
222 sres.endOffset[k] = offset[sres.endOffset[k]-1] + 1;
225 else
227 sres = (this->*fnForward)( in_str, startPos, endPos );
230 if ( xTranslit2.is() && aSrchPara.algorithmType != SearchAlgorithms_REGEXP)
232 SearchResult sres2;
234 in_str = OUString(searchStr);
235 com::sun::star::uno::Sequence <sal_Int32> offset( in_str.getLength());
237 in_str = xTranslit2->transliterate( searchStr, 0, in_str.getLength(), offset );
239 if( startPos )
240 startPos = FindPosInSeq_Impl( offset, startPos );
242 if( endPos < searchStr.getLength() )
243 endPos = FindPosInSeq_Impl( offset, endPos );
244 else
245 endPos = in_str.getLength();
247 bUsePrimarySrchStr = false;
248 sres2 = (this->*fnForward)( in_str, startPos, endPos );
250 for ( int k = 0; k < sres2.startOffset.getLength(); k++ )
252 if (sres2.startOffset[k])
253 sres2.startOffset[k] = offset[sres2.startOffset[k]-1] + 1;
254 if (sres2.endOffset[k])
255 sres2.endOffset[k] = offset[sres2.endOffset[k]-1] + 1;
258 // pick first and long one
259 if ( sres.subRegExpressions == 0)
260 return sres2;
261 if ( sres2.subRegExpressions == 1)
263 if ( sres.startOffset[0] > sres2.startOffset[0])
264 return sres2;
265 else if ( sres.startOffset[0] == sres2.startOffset[0] &&
266 sres.endOffset[0] < sres2.endOffset[0])
267 return sres2;
271 return sres;
274 SearchResult TextSearch::searchBackward( const OUString& searchStr, sal_Int32 startPos, sal_Int32 endPos )
275 throw(RuntimeException)
277 SearchResult sres;
279 OUString in_str(searchStr);
280 sal_Int32 newStartPos = startPos;
281 sal_Int32 newEndPos = endPos;
283 bUsePrimarySrchStr = true;
285 if ( xTranslit.is() )
287 // apply only simple 1<->1 transliteration here
288 com::sun::star::uno::Sequence <sal_Int32> offset( in_str.getLength());
289 in_str = xTranslit->transliterate( searchStr, 0, in_str.getLength(), offset );
291 // JP 20.6.2001: also the start and end positions must be corrected!
292 if( startPos < searchStr.getLength() )
293 newStartPos = FindPosInSeq_Impl( offset, startPos );
294 else
295 newStartPos = in_str.getLength();
297 if( endPos )
298 newEndPos = FindPosInSeq_Impl( offset, endPos );
300 sres = (this->*fnBackward)( in_str, newStartPos, newEndPos );
302 for ( int k = 0; k < sres.startOffset.getLength(); k++ )
304 if (sres.startOffset[k])
305 sres.startOffset[k] = offset[sres.startOffset[k] - 1] + 1;
306 // JP 20.6.2001: end is ever exclusive and then don't return
307 // the position of the next character - return the
308 // next position behind the last found character!
309 // "a b c" find "b" must return 2,3 and not 2,4!!!
310 if (sres.endOffset[k])
311 sres.endOffset[k] = offset[sres.endOffset[k]];
314 else
316 sres = (this->*fnBackward)( in_str, startPos, endPos );
319 if ( xTranslit2.is() && aSrchPara.algorithmType != SearchAlgorithms_REGEXP )
321 SearchResult sres2;
323 in_str = OUString(searchStr);
324 com::sun::star::uno::Sequence <sal_Int32> offset( in_str.getLength());
326 in_str = xTranslit2->transliterate(searchStr, 0, in_str.getLength(), offset);
328 if( startPos < searchStr.getLength() )
329 startPos = FindPosInSeq_Impl( offset, startPos );
330 else
331 startPos = in_str.getLength();
333 if( endPos )
334 endPos = FindPosInSeq_Impl( offset, endPos );
336 bUsePrimarySrchStr = false;
337 sres2 = (this->*fnBackward)( in_str, startPos, endPos );
339 for( int k = 0; k < sres2.startOffset.getLength(); k++ )
341 if (sres2.startOffset[k])
342 sres2.startOffset[k] = offset[sres2.startOffset[k]-1]+1;
343 if (sres2.endOffset[k])
344 sres2.endOffset[k] = offset[sres2.endOffset[k]-1]+1;
347 // pick last and long one
348 if ( sres.subRegExpressions == 0 )
349 return sres2;
350 if ( sres2.subRegExpressions == 1 )
352 if ( sres.startOffset[0] < sres2.startOffset[0] )
353 return sres2;
354 if ( sres.startOffset[0] == sres2.startOffset[0] &&
355 sres.endOffset[0] > sres2.endOffset[0] )
356 return sres2;
360 return sres;
363 //---------------------------------------------------------------------
365 bool TextSearch::IsDelimiter( const OUString& rStr, sal_Int32 nPos ) const
367 bool bRet = 1;
368 if( '\x7f' != rStr[nPos])
370 if ( !xCharClass.is() )
371 xCharClass = CharacterClassification::create( m_xContext );
372 sal_Int32 nCType = xCharClass->getCharacterType( rStr, nPos,
373 aSrchPara.Locale );
374 if( 0 != (( KCharacterType::DIGIT | KCharacterType::ALPHA |
375 KCharacterType::LETTER ) & nCType ) )
376 bRet = 0;
378 return bRet;
381 // --------- helper methods for Boyer-Moore like text searching ----------
382 // TODO: use ICU's regex UREGEX_LITERAL mode instead when it becomes available
384 void TextSearch::MakeForwardTab()
386 // create the jumptable for the search text
387 if( pJumpTable )
389 if( bIsForwardTab )
390 return ; // the jumpTable is ok
391 delete pJumpTable;
393 bIsForwardTab = true;
395 sal_Int32 n, nLen = sSrchStr.getLength();
396 pJumpTable = new TextSearchJumpTable;
398 for( n = 0; n < nLen - 1; ++n )
400 sal_Unicode cCh = sSrchStr[n];
401 sal_Int32 nDiff = nLen - n - 1;
402 TextSearchJumpTable::value_type aEntry( cCh, nDiff );
404 ::std::pair< TextSearchJumpTable::iterator, bool > aPair =
405 pJumpTable->insert( aEntry );
406 if ( !aPair.second )
407 (*(aPair.first)).second = nDiff;
411 void TextSearch::MakeForwardTab2()
413 // create the jumptable for the search text
414 if( pJumpTable2 )
416 if( bIsForwardTab )
417 return ; // the jumpTable is ok
418 delete pJumpTable2;
420 bIsForwardTab = true;
422 sal_Int32 n, nLen = sSrchStr2.getLength();
423 pJumpTable2 = new TextSearchJumpTable;
425 for( n = 0; n < nLen - 1; ++n )
427 sal_Unicode cCh = sSrchStr2[n];
428 sal_Int32 nDiff = nLen - n - 1;
430 TextSearchJumpTable::value_type aEntry( cCh, nDiff );
431 ::std::pair< TextSearchJumpTable::iterator, bool > aPair =
432 pJumpTable2->insert( aEntry );
433 if ( !aPair.second )
434 (*(aPair.first)).second = nDiff;
438 void TextSearch::MakeBackwardTab()
440 // create the jumptable for the search text
441 if( pJumpTable )
443 if( !bIsForwardTab )
444 return ; // the jumpTable is ok
445 delete pJumpTable;
447 bIsForwardTab = false;
449 sal_Int32 n, nLen = sSrchStr.getLength();
450 pJumpTable = new TextSearchJumpTable;
452 for( n = nLen-1; n > 0; --n )
454 sal_Unicode cCh = sSrchStr[n];
455 TextSearchJumpTable::value_type aEntry( cCh, n );
456 ::std::pair< TextSearchJumpTable::iterator, bool > aPair =
457 pJumpTable->insert( aEntry );
458 if ( !aPair.second )
459 (*(aPair.first)).second = n;
463 void TextSearch::MakeBackwardTab2()
465 // create the jumptable for the search text
466 if( pJumpTable2 )
468 if( !bIsForwardTab )
469 return ; // the jumpTable is ok
470 delete pJumpTable2;
472 bIsForwardTab = false;
474 sal_Int32 n, nLen = sSrchStr2.getLength();
475 pJumpTable2 = new TextSearchJumpTable;
477 for( n = nLen-1; n > 0; --n )
479 sal_Unicode cCh = sSrchStr2[n];
480 TextSearchJumpTable::value_type aEntry( cCh, n );
481 ::std::pair< TextSearchJumpTable::iterator, bool > aPair =
482 pJumpTable2->insert( aEntry );
483 if ( !aPair.second )
484 (*(aPair.first)).second = n;
488 sal_Int32 TextSearch::GetDiff( const sal_Unicode cChr ) const
490 TextSearchJumpTable *pJump;
491 OUString sSearchKey;
493 if ( bUsePrimarySrchStr ) {
494 pJump = pJumpTable;
495 sSearchKey = sSrchStr;
496 } else {
497 pJump = pJumpTable2;
498 sSearchKey = sSrchStr2;
501 TextSearchJumpTable::const_iterator iLook = pJump->find( cChr );
502 if ( iLook == pJump->end() )
503 return sSearchKey.getLength();
504 return (*iLook).second;
508 // TextSearch::NSrchFrwrd is mis-optimized on unxsoli (#i105945#)
509 SearchResult TextSearch::NSrchFrwrd( const OUString& searchStr, sal_Int32 startPos, sal_Int32 endPos )
510 throw(RuntimeException)
512 SearchResult aRet;
513 aRet.subRegExpressions = 0;
515 OUString sSearchKey = bUsePrimarySrchStr ? sSrchStr : sSrchStr2;
517 OUString aStr( searchStr );
518 sal_Int32 nSuchIdx = aStr.getLength();
519 sal_Int32 nEnde = endPos;
520 if( !nSuchIdx || !sSearchKey.getLength() || sSearchKey.getLength() > nSuchIdx )
521 return aRet;
524 if( nEnde < sSearchKey.getLength() ) // position inside the search region ?
525 return aRet;
527 nEnde -= sSearchKey.getLength();
529 if (bUsePrimarySrchStr)
530 MakeForwardTab(); // create the jumptable
531 else
532 MakeForwardTab2();
534 for (sal_Int32 nCmpIdx = startPos; // start position for the search
535 nCmpIdx <= nEnde;
536 nCmpIdx += GetDiff( aStr[nCmpIdx + sSearchKey.getLength()-1]))
538 // if the match would be the completed cells, skip it.
539 if ( (checkCTLStart && !isCellStart( aStr, nCmpIdx )) || (checkCTLEnd
540 && !isCellStart( aStr, nCmpIdx + sSearchKey.getLength())) )
541 continue;
543 nSuchIdx = sSearchKey.getLength() - 1;
544 while( nSuchIdx >= 0 && sSearchKey[nSuchIdx] == aStr[nCmpIdx + nSuchIdx])
546 if( nSuchIdx == 0 )
548 if( SearchFlags::NORM_WORD_ONLY & aSrchPara.searchFlag )
550 sal_Int32 nFndEnd = nCmpIdx + sSearchKey.getLength();
551 bool bAtStart = !nCmpIdx;
552 bool bAtEnd = nFndEnd == endPos;
553 bool bDelimBefore = bAtStart || IsDelimiter( aStr, nCmpIdx-1 );
554 bool bDelimBehind = IsDelimiter( aStr, nFndEnd );
555 // * 1 -> only one word in the paragraph
556 // * 2 -> at begin of paragraph
557 // * 3 -> at end of paragraph
558 // * 4 -> inside the paragraph
559 if( !( ( bAtStart && bAtEnd ) || // 1
560 ( bAtStart && bDelimBehind ) || // 2
561 ( bAtEnd && bDelimBefore ) || // 3
562 ( bDelimBefore && bDelimBehind ))) // 4
563 break;
566 aRet.subRegExpressions = 1;
567 aRet.startOffset.realloc( 1 );
568 aRet.startOffset[ 0 ] = nCmpIdx;
569 aRet.endOffset.realloc( 1 );
570 aRet.endOffset[ 0 ] = nCmpIdx + sSearchKey.getLength();
572 return aRet;
574 else
575 nSuchIdx--;
578 return aRet;
581 SearchResult TextSearch::NSrchBkwrd( const OUString& searchStr, sal_Int32 startPos, sal_Int32 endPos )
582 throw(RuntimeException)
584 SearchResult aRet;
585 aRet.subRegExpressions = 0;
587 OUString sSearchKey = bUsePrimarySrchStr ? sSrchStr : sSrchStr2;
589 OUString aStr( searchStr );
590 sal_Int32 nSuchIdx = aStr.getLength();
591 sal_Int32 nEnde = endPos;
592 if( nSuchIdx == 0 || sSearchKey.isEmpty() || sSearchKey.getLength() > nSuchIdx)
593 return aRet;
595 if (bUsePrimarySrchStr)
596 MakeBackwardTab(); // create the jumptable
597 else
598 MakeBackwardTab2();
600 if( nEnde == nSuchIdx ) // end position for the search
601 nEnde = sSearchKey.getLength();
602 else
603 nEnde += sSearchKey.getLength();
605 sal_Int32 nCmpIdx = startPos; // start position for the search
607 while (nCmpIdx >= nEnde)
609 // if the match would be the completed cells, skip it.
610 if ( (!checkCTLStart || isCellStart( aStr, nCmpIdx -
611 sSearchKey.getLength() )) && (!checkCTLEnd ||
612 isCellStart( aStr, nCmpIdx)))
614 nSuchIdx = 0;
615 while( nSuchIdx < sSearchKey.getLength() && sSearchKey[nSuchIdx] ==
616 aStr[nCmpIdx + nSuchIdx - sSearchKey.getLength()] )
617 nSuchIdx++;
618 if( nSuchIdx >= sSearchKey.getLength() )
620 if( SearchFlags::NORM_WORD_ONLY & aSrchPara.searchFlag )
622 sal_Int32 nFndStt = nCmpIdx - sSearchKey.getLength();
623 bool bAtStart = !nFndStt;
624 bool bAtEnd = nCmpIdx == startPos;
625 bool bDelimBehind = IsDelimiter( aStr, nCmpIdx );
626 bool bDelimBefore = bAtStart || // begin of paragraph
627 IsDelimiter( aStr, nFndStt-1 );
628 // * 1 -> only one word in the paragraph
629 // * 2 -> at begin of paragraph
630 // * 3 -> at end of paragraph
631 // * 4 -> inside the paragraph
632 if( ( bAtStart && bAtEnd ) || // 1
633 ( bAtStart && bDelimBehind ) || // 2
634 ( bAtEnd && bDelimBefore ) || // 3
635 ( bDelimBefore && bDelimBehind )) // 4
637 aRet.subRegExpressions = 1;
638 aRet.startOffset.realloc( 1 );
639 aRet.startOffset[ 0 ] = nCmpIdx;
640 aRet.endOffset.realloc( 1 );
641 aRet.endOffset[ 0 ] = nCmpIdx - sSearchKey.getLength();
642 return aRet;
645 else
647 aRet.subRegExpressions = 1;
648 aRet.startOffset.realloc( 1 );
649 aRet.startOffset[ 0 ] = nCmpIdx;
650 aRet.endOffset.realloc( 1 );
651 aRet.endOffset[ 0 ] = nCmpIdx - sSearchKey.getLength();
652 return aRet;
656 nSuchIdx = GetDiff( aStr[nCmpIdx - sSearchKey.getLength()] );
657 if( nCmpIdx < nSuchIdx )
658 return aRet;
659 nCmpIdx -= nSuchIdx;
661 return aRet;
664 void TextSearch::RESrchPrepare( const ::com::sun::star::util::SearchOptions& rOptions)
666 // select the transliterated pattern string
667 const OUString& rPatternStr =
668 (rOptions.transliterateFlags & SIMPLE_TRANS_MASK) ? sSrchStr
669 : ((rOptions.transliterateFlags & COMPLEX_TRANS_MASK) ? sSrchStr2 : rOptions.searchString);
671 sal_uInt32 nIcuSearchFlags = UREGEX_UWORD; // request UAX#29 unicode capability
672 // map com::sun::star::util::SearchFlags to ICU uregex.h flags
673 // TODO: REG_EXTENDED, REG_NOT_BEGINOFLINE, REG_NOT_ENDOFLINE
674 // REG_NEWLINE is neither properly defined nor used anywhere => not implemented
675 // REG_NOSUB is not used anywhere => not implemented
676 // NORM_WORD_ONLY is only used for SearchAlgorithm==Absolute
677 // LEV_RELAXED is only used for SearchAlgorithm==Approximate
678 // why is even ALL_IGNORE_CASE deprecated in UNO? because of transliteration taking care of it???
679 if( (rOptions.searchFlag & com::sun::star::util::SearchFlags::ALL_IGNORE_CASE) != 0)
680 nIcuSearchFlags |= UREGEX_CASE_INSENSITIVE;
681 UErrorCode nIcuErr = U_ZERO_ERROR;
682 // assumption: transliteration didn't mangle regexp control chars
683 IcuUniString aIcuSearchPatStr( (const UChar*)rPatternStr.getStr(), rPatternStr.getLength());
684 #ifndef DISABLE_WORDBOUND_EMULATION
685 // for conveniance specific syntax elements of the old regex engine are emulated
686 // by using regular word boundary matching \b to replace \< and \>
687 static const IcuUniString aChevronPattern( "\\\\<|\\\\>", -1, IcuUniString::kInvariant);
688 static const IcuUniString aChevronReplace( "\\\\b", -1, IcuUniString::kInvariant);
689 static RegexMatcher aChevronMatcher( aChevronPattern, 0, nIcuErr);
690 aChevronMatcher.reset( aIcuSearchPatStr);
691 aIcuSearchPatStr = aChevronMatcher.replaceAll( aChevronReplace, nIcuErr);
692 aChevronMatcher.reset();
693 #endif
694 pRegexMatcher = new RegexMatcher( aIcuSearchPatStr, nIcuSearchFlags, nIcuErr);
695 if( nIcuErr)
696 { delete pRegexMatcher; pRegexMatcher = NULL;}
699 //---------------------------------------------------------------------------
701 SearchResult TextSearch::RESrchFrwrd( const OUString& searchStr,
702 sal_Int32 startPos, sal_Int32 endPos )
703 throw(RuntimeException)
705 SearchResult aRet;
706 aRet.subRegExpressions = 0;
707 if( !pRegexMatcher)
708 return aRet;
710 if( endPos > searchStr.getLength())
711 endPos = searchStr.getLength();
713 // use the ICU RegexMatcher to find the matches
714 UErrorCode nIcuErr = U_ZERO_ERROR;
715 const IcuUniString aSearchTargetStr( (const UChar*)searchStr.getStr(), endPos);
716 pRegexMatcher->reset( aSearchTargetStr);
717 // search until there is a valid match
718 for(;;)
720 if( !pRegexMatcher->find( startPos, nIcuErr))
721 return aRet;
723 // #i118887# ignore zero-length matches e.g. "a*" in "bc"
724 int nStartOfs = pRegexMatcher->start( nIcuErr);
725 int nEndOfs = pRegexMatcher->end( nIcuErr);
726 if( nStartOfs < nEndOfs)
727 break;
728 // try at next position if there was a zero-length match
729 if( ++startPos >= endPos)
730 return aRet;
733 // extract the result of the search
734 const int nGroupCount = pRegexMatcher->groupCount();
735 aRet.subRegExpressions = nGroupCount + 1;
736 aRet.startOffset.realloc( aRet.subRegExpressions);
737 aRet.endOffset.realloc( aRet.subRegExpressions);
738 aRet.startOffset[0] = pRegexMatcher->start( nIcuErr);
739 aRet.endOffset[0] = pRegexMatcher->end( nIcuErr);
740 for( int i = 1; i <= nGroupCount; ++i) {
741 aRet.startOffset[i] = pRegexMatcher->start( i, nIcuErr);
742 aRet.endOffset[i] = pRegexMatcher->end( i, nIcuErr);
745 return aRet;
748 SearchResult TextSearch::RESrchBkwrd( const OUString& searchStr,
749 sal_Int32 startPos, sal_Int32 endPos )
750 throw(RuntimeException)
752 // NOTE: for backwards search callers provide startPos/endPos inverted!
753 SearchResult aRet;
754 aRet.subRegExpressions = 0;
755 if( !pRegexMatcher)
756 return aRet;
758 if( startPos > searchStr.getLength())
759 startPos = searchStr.getLength();
761 // use the ICU RegexMatcher to find the matches
762 // TODO: use ICU's backward searching once it becomes available
763 // as its replacement using forward search is not as good as the real thing
764 UErrorCode nIcuErr = U_ZERO_ERROR;
765 const IcuUniString aSearchTargetStr( (const UChar*)searchStr.getStr(), startPos);
766 pRegexMatcher->reset( aSearchTargetStr);
767 if( !pRegexMatcher->find( endPos, nIcuErr))
768 return aRet;
770 // find the last match
771 int nLastPos = 0;
772 do {
773 nLastPos = pRegexMatcher->start( nIcuErr);
774 } while( pRegexMatcher->find( nLastPos + 1, nIcuErr));
776 // find last match again to get its details
777 pRegexMatcher->find( nLastPos, nIcuErr);
779 // fill in the details of the last match
780 const int nGroupCount = pRegexMatcher->groupCount();
781 aRet.subRegExpressions = nGroupCount + 1;
782 aRet.startOffset.realloc( aRet.subRegExpressions);
783 aRet.endOffset.realloc( aRet.subRegExpressions);
784 // NOTE: existing users of backward search seem to expect startOfs/endOfs being inverted!
785 aRet.startOffset[0] = pRegexMatcher->end( nIcuErr);
786 aRet.endOffset[0] = pRegexMatcher->start( nIcuErr);
787 for( int i = 1; i <= nGroupCount; ++i) {
788 aRet.startOffset[i] = pRegexMatcher->end( i, nIcuErr);
789 aRet.endOffset[i] = pRegexMatcher->start( i, nIcuErr);
792 return aRet;
795 //---------------------------------------------------------------------------
797 // search for words phonetically
798 SearchResult TextSearch::ApproxSrchFrwrd( const OUString& searchStr,
799 sal_Int32 startPos, sal_Int32 endPos )
800 throw(RuntimeException)
802 SearchResult aRet;
803 aRet.subRegExpressions = 0;
805 if( !xBreak.is() )
806 return aRet;
808 OUString aWTemp( searchStr );
810 register sal_Int32 nStt, nEnd;
812 Boundary aWBnd = xBreak->getWordBoundary( aWTemp, startPos,
813 aSrchPara.Locale,
814 WordType::ANYWORD_IGNOREWHITESPACES, sal_True );
818 if( aWBnd.startPos >= endPos )
819 break;
820 nStt = aWBnd.startPos < startPos ? startPos : aWBnd.startPos;
821 nEnd = aWBnd.endPos > endPos ? endPos : aWBnd.endPos;
823 if( nStt < nEnd &&
824 pWLD->WLD( aWTemp.getStr() + nStt, nEnd - nStt ) <= nLimit )
826 aRet.subRegExpressions = 1;
827 aRet.startOffset.realloc( 1 );
828 aRet.startOffset[ 0 ] = nStt;
829 aRet.endOffset.realloc( 1 );
830 aRet.endOffset[ 0 ] = nEnd;
831 break;
834 nStt = nEnd - 1;
835 aWBnd = xBreak->nextWord( aWTemp, nStt, aSrchPara.Locale,
836 WordType::ANYWORD_IGNOREWHITESPACES);
837 } while( aWBnd.startPos != aWBnd.endPos ||
838 (aWBnd.endPos != aWTemp.getLength() && aWBnd.endPos != nEnd) );
839 // #i50244# aWBnd.endPos != nEnd : in case there is _no_ word (only
840 // whitespace) in searchStr, getWordBoundary() returned startPos,startPos
841 // and nextWord() does also => don't loop forever.
842 return aRet;
845 SearchResult TextSearch::ApproxSrchBkwrd( const OUString& searchStr,
846 sal_Int32 startPos, sal_Int32 endPos )
847 throw(RuntimeException)
849 SearchResult aRet;
850 aRet.subRegExpressions = 0;
852 if( !xBreak.is() )
853 return aRet;
855 OUString aWTemp( searchStr );
857 register sal_Int32 nStt, nEnd;
859 Boundary aWBnd = xBreak->getWordBoundary( aWTemp, startPos,
860 aSrchPara.Locale,
861 WordType::ANYWORD_IGNOREWHITESPACES, sal_True );
865 if( aWBnd.endPos <= endPos )
866 break;
867 nStt = aWBnd.startPos < endPos ? endPos : aWBnd.startPos;
868 nEnd = aWBnd.endPos > startPos ? startPos : aWBnd.endPos;
870 if( nStt < nEnd &&
871 pWLD->WLD( aWTemp.getStr() + nStt, nEnd - nStt ) <= nLimit )
873 aRet.subRegExpressions = 1;
874 aRet.startOffset.realloc( 1 );
875 aRet.startOffset[ 0 ] = nEnd;
876 aRet.endOffset.realloc( 1 );
877 aRet.endOffset[ 0 ] = nStt;
878 break;
880 if( !nStt )
881 break;
883 aWBnd = xBreak->previousWord( aWTemp, nStt, aSrchPara.Locale,
884 WordType::ANYWORD_IGNOREWHITESPACES);
885 } while( aWBnd.startPos != aWBnd.endPos || aWBnd.endPos != aWTemp.getLength() );
886 return aRet;
890 static const sal_Char cSearchName[] = "com.sun.star.util.TextSearch";
891 static const sal_Char cSearchImpl[] = "com.sun.star.util.TextSearch_i18n";
893 static OUString getServiceName_Static()
895 return OUString::createFromAscii( cSearchName );
898 static OUString getImplementationName_Static()
900 return OUString::createFromAscii( cSearchImpl );
903 OUString SAL_CALL
904 TextSearch::getImplementationName()
905 throw( RuntimeException )
907 return getImplementationName_Static();
910 sal_Bool SAL_CALL
911 TextSearch::supportsService(const OUString& rServiceName)
912 throw( RuntimeException )
914 return rServiceName == cSearchName;
917 Sequence< OUString > SAL_CALL
918 TextSearch::getSupportedServiceNames(void) throw( RuntimeException )
920 Sequence< OUString > aRet(1);
921 aRet[0] = getServiceName_Static();
922 return aRet;
925 ::com::sun::star::uno::Reference< ::com::sun::star::uno::XInterface >
926 SAL_CALL TextSearch_CreateInstance(
927 const ::com::sun::star::uno::Reference<
928 ::com::sun::star::lang::XMultiServiceFactory >& rxMSF )
930 return ::com::sun::star::uno::Reference<
931 ::com::sun::star::uno::XInterface >(
932 (::cppu::OWeakObject*) new TextSearch(
933 comphelper::getComponentContext( rxMSF ) ) );
936 extern "C"
938 SAL_DLLPUBLIC_EXPORT void* SAL_CALL
939 i18nsearch_component_getFactory( const sal_Char* sImplementationName,
940 void* _pServiceManager,
941 SAL_UNUSED_PARAMETER void* )
943 void* pRet = NULL;
945 ::com::sun::star::lang::XMultiServiceFactory* pServiceManager =
946 reinterpret_cast< ::com::sun::star::lang::XMultiServiceFactory* >
947 ( _pServiceManager );
948 ::com::sun::star::uno::Reference<
949 ::com::sun::star::lang::XSingleServiceFactory > xFactory;
951 if ( 0 == rtl_str_compare( sImplementationName, cSearchImpl) )
953 ::com::sun::star::uno::Sequence< OUString > aServiceNames(1);
954 aServiceNames[0] = getServiceName_Static();
955 xFactory = ::cppu::createSingleFactory(
956 pServiceManager, getImplementationName_Static(),
957 &TextSearch_CreateInstance, aServiceNames );
960 if ( xFactory.is() )
962 xFactory->acquire();
963 pRet = xFactory.get();
966 return pRet;
969 } // extern "C"
971 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */