1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
3 * This file is part of the LibreOffice project.
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 * This file incorporates work covered by the following license notice:
11 * Licensed to the Apache Software Foundation (ASF) under one or more
12 * contributor license agreements. See the NOTICE file distributed
13 * with this work for additional information regarding copyright
14 * ownership. The ASF licenses this file to you under the Apache
15 * License, Version 2.0 (the "License"); you may not use this file
16 * except in compliance with the License. You may obtain a copy of
17 * the License at http://www.apache.org/licenses/LICENSE-2.0 .
20 #include "textsearch.hxx"
22 #include <com/sun/star/lang/Locale.hpp>
23 #include <com/sun/star/lang/XMultiServiceFactory.hpp>
24 #include <comphelper/processfactory.hxx>
25 #include <com/sun/star/i18n/BreakIterator.hpp>
26 #include <com/sun/star/i18n/UnicodeType.hpp>
27 #include <com/sun/star/util/SearchFlags.hpp>
28 #include <com/sun/star/i18n/WordType.hpp>
29 #include <com/sun/star/i18n/ScriptType.hpp>
30 #include <com/sun/star/i18n/CharacterIteratorMode.hpp>
31 #include <com/sun/star/i18n/CharacterClassification.hpp>
32 #include <com/sun/star/i18n/KCharacterType.hpp>
33 #include <com/sun/star/i18n/Transliteration.hpp>
34 #include <com/sun/star/registry/XRegistryKey.hpp>
35 #include <cppuhelper/factory.hxx>
36 #include <cppuhelper/weak.hxx>
39 // get rid of that dumb compiler warning
40 // identifier was truncated to '255' characters in the debug information
41 // for STL template usage, if .pdb files are to be created
42 #pragma warning( disable: 4786 )
47 using namespace ::com::sun::star::util
;
48 using namespace ::com::sun::star::uno
;
49 using namespace ::com::sun::star::lang
;
50 using namespace ::com::sun::star::i18n
;
51 using namespace ::com::sun::star
;
53 static const sal_Int32 COMPLEX_TRANS_MASK_TMP
=
54 TransliterationModules_ignoreBaFa_ja_JP
|
55 TransliterationModules_ignoreIterationMark_ja_JP
|
56 TransliterationModules_ignoreTiJi_ja_JP
|
57 TransliterationModules_ignoreHyuByu_ja_JP
|
58 TransliterationModules_ignoreSeZe_ja_JP
|
59 TransliterationModules_ignoreIandEfollowedByYa_ja_JP
|
60 TransliterationModules_ignoreKiKuFollowedBySa_ja_JP
|
61 TransliterationModules_ignoreProlongedSoundMark_ja_JP
;
63 // These 2 transliterations are simple but need to take effect in
64 // complex transliteration.
65 static const sal_Int32 COMPLEX_TRANS_MASK
=
66 COMPLEX_TRANS_MASK_TMP
|
67 TransliterationModules_IGNORE_KANA
|
68 TransliterationModules_FULLWIDTH_HALFWIDTH
;
70 static const sal_Int32 SIMPLE_TRANS_MASK
= ~COMPLEX_TRANS_MASK
;
72 // Regex patterns are case sensitive.
73 static const sal_Int32 SIMPLE_TRANS_MASK_REPATTERN
=
74 ~(COMPLEX_TRANS_MASK
|
75 TransliterationModules_IGNORE_CASE
|
76 TransliterationModules_UPPERCASE_LOWERCASE
|
77 TransliterationModules_LOWERCASE_UPPERCASE
);
80 TextSearch::TextSearch(const Reference
< XComponentContext
> & rxContext
)
81 : m_xContext( rxContext
)
84 , pRegexMatcher( NULL
)
88 aOpt
.algorithmType
= SearchAlgorithms_ABSOLUTE
;
89 aOpt
.searchFlag
= SearchFlags::ALL_IGNORE_CASE
;
94 TextSearch::~TextSearch()
102 void TextSearch::setOptions( const SearchOptions
& rOptions
) throw( RuntimeException
)
104 aSrchPara
= rOptions
;
106 delete pRegexMatcher
, pRegexMatcher
= NULL
;
107 delete pWLD
, pWLD
= 0;
108 delete pJumpTable
, pJumpTable
= 0;
109 delete pJumpTable2
, pJumpTable2
= 0;
111 // Create Transliteration class
112 if( aSrchPara
.transliterateFlags
& SIMPLE_TRANS_MASK
)
114 if( !xTranslit
.is() )
115 xTranslit
.set( Transliteration::create( m_xContext
) );
116 xTranslit
->loadModule(
117 (TransliterationModules
)( aSrchPara
.transliterateFlags
& SIMPLE_TRANS_MASK
),
120 else if( xTranslit
.is() )
123 // Create Transliteration for 2<->1, 2<->2 transliteration
124 if ( aSrchPara
.transliterateFlags
& COMPLEX_TRANS_MASK
)
126 if( !xTranslit2
.is() )
127 xTranslit2
.set( Transliteration::create( m_xContext
) );
128 // Load transliteration module
129 xTranslit2
->loadModule(
130 (TransliterationModules
)( aSrchPara
.transliterateFlags
& COMPLEX_TRANS_MASK
),
135 xBreak
= com::sun::star::i18n::BreakIterator::create( m_xContext
);
137 sSrchStr
= aSrchPara
.searchString
;
139 // Transliterate search string.
140 if (aSrchPara
.algorithmType
== SearchAlgorithms_REGEXP
)
142 if (aSrchPara
.transliterateFlags
& SIMPLE_TRANS_MASK_REPATTERN
)
144 if ((aSrchPara
.transliterateFlags
& SIMPLE_TRANS_MASK_REPATTERN
) !=
145 (aSrchPara
.transliterateFlags
& SIMPLE_TRANS_MASK
))
147 com::sun::star::uno::Reference
< XExtendedTransliteration
> xTranslitPattern(
148 Transliteration::create( m_xContext
));
149 if (xTranslitPattern
.is())
151 xTranslitPattern
->loadModule(
152 (TransliterationModules
)( aSrchPara
.transliterateFlags
& SIMPLE_TRANS_MASK_REPATTERN
),
154 sSrchStr
= xTranslitPattern
->transliterateString2String(
155 aSrchPara
.searchString
, 0, aSrchPara
.searchString
.getLength());
161 sSrchStr
= xTranslit
->transliterateString2String(
162 aSrchPara
.searchString
, 0, aSrchPara
.searchString
.getLength());
164 // xTranslit2 complex transliterated sSrchStr2 is not used in
165 // regex, see TextSearch::searchForward() and
166 // TextSearch::searchBackward()
171 if ( xTranslit
.is() && aSrchPara
.transliterateFlags
& SIMPLE_TRANS_MASK
)
172 sSrchStr
= xTranslit
->transliterateString2String(
173 aSrchPara
.searchString
, 0, aSrchPara
.searchString
.getLength());
175 if ( xTranslit2
.is() && aSrchPara
.transliterateFlags
& COMPLEX_TRANS_MASK
)
176 sSrchStr2
= xTranslit2
->transliterateString2String(
177 aSrchPara
.searchString
, 0, aSrchPara
.searchString
.getLength());
180 // When start or end of search string is a complex script type, we need to
181 // make sure the result boundary is not located in the middle of cell.
182 checkCTLStart
= (xBreak
.is() && (xBreak
->getScriptType(sSrchStr
, 0) ==
183 ScriptType::COMPLEX
));
184 checkCTLEnd
= (xBreak
.is() && (xBreak
->getScriptType(sSrchStr
,
185 sSrchStr
.getLength()-1) == ScriptType::COMPLEX
));
187 switch( aSrchPara
.algorithmType
)
189 case SearchAlgorithms_REGEXP
:
190 fnForward
= &TextSearch::RESrchFrwrd
;
191 fnBackward
= &TextSearch::RESrchBkwrd
;
192 RESrchPrepare( aSrchPara
);
195 case SearchAlgorithms_APPROXIMATE
:
196 fnForward
= &TextSearch::ApproxSrchFrwrd
;
197 fnBackward
= &TextSearch::ApproxSrchBkwrd
;
199 pWLD
= new WLevDistance( sSrchStr
.getStr(), aSrchPara
.changedChars
,
200 aSrchPara
.insertedChars
, aSrchPara
.deletedChars
,
201 0 != (SearchFlags::LEV_RELAXED
& aSrchPara
.searchFlag
) );
203 nLimit
= pWLD
->GetLimit();
207 fnForward
= &TextSearch::NSrchFrwrd
;
208 fnBackward
= &TextSearch::NSrchBkwrd
;
213 sal_Int32
FindPosInSeq_Impl( const Sequence
<sal_Int32
>& rOff
, sal_Int32 nPos
)
215 sal_Int32 nRet
= 0, nEnd
= rOff
.getLength();
216 while( nRet
< nEnd
&& nPos
> rOff
[ nRet
] ) ++nRet
;
220 sal_Bool
TextSearch::isCellStart(const OUString
& searchStr
, sal_Int32 nPos
)
221 throw( RuntimeException
)
224 return nPos
== xBreak
->previousCharacters(searchStr
, nPos
+1,
225 aSrchPara
.Locale
, CharacterIteratorMode::SKIPCELL
, 1, nDone
);
228 SearchResult
TextSearch::searchForward( const OUString
& searchStr
, sal_Int32 startPos
, sal_Int32 endPos
)
229 throw( RuntimeException
)
233 OUString
in_str(searchStr
);
234 sal_Int32 newStartPos
= startPos
;
235 sal_Int32 newEndPos
= endPos
;
237 bUsePrimarySrchStr
= true;
239 if ( xTranslit
.is() )
241 // apply normal transliteration (1<->1, 1<->0)
242 com::sun::star::uno::Sequence
<sal_Int32
> offset( in_str
.getLength());
243 in_str
= xTranslit
->transliterate( searchStr
, 0, in_str
.getLength(), offset
);
245 // JP 20.6.2001: also the start and end positions must be corrected!
247 newStartPos
= FindPosInSeq_Impl( offset
, startPos
);
249 if( endPos
< searchStr
.getLength() )
250 newEndPos
= FindPosInSeq_Impl( offset
, endPos
);
252 newEndPos
= in_str
.getLength();
254 sres
= (this->*fnForward
)( in_str
, newStartPos
, newEndPos
);
256 // Map offsets back to untransliterated string.
257 const sal_Int32 nOffsets
= offset
.getLength();
260 // For regex nGroups is the number of groups+1 with group 0 being
262 const sal_Int32 nGroups
= sres
.startOffset
.getLength();
263 for ( sal_Int32 k
= 0; k
< nGroups
; k
++ )
265 const sal_Int32 nStart
= sres
.startOffset
[k
];
267 sres
.startOffset
[k
] = (nStart
< nOffsets
? offset
[nStart
] : (offset
[nOffsets
- 1] + 1));
268 // JP 20.6.2001: end is ever exclusive and then don't return
269 // the position of the next character - return the
270 // next position behind the last found character!
271 // "a b c" find "b" must return 2,3 and not 2,4!!!
272 const sal_Int32 nStop
= sres
.endOffset
[k
];
274 sres
.endOffset
[k
] = offset
[(nStop
<= nOffsets
? nStop
: nOffsets
) - 1] + 1;
280 sres
= (this->*fnForward
)( in_str
, startPos
, endPos
);
283 if ( xTranslit2
.is() && aSrchPara
.algorithmType
!= SearchAlgorithms_REGEXP
)
287 in_str
= OUString(searchStr
);
288 com::sun::star::uno::Sequence
<sal_Int32
> offset( in_str
.getLength());
290 in_str
= xTranslit2
->transliterate( searchStr
, 0, in_str
.getLength(), offset
);
293 startPos
= FindPosInSeq_Impl( offset
, startPos
);
295 if( endPos
< searchStr
.getLength() )
296 endPos
= FindPosInSeq_Impl( offset
, endPos
);
298 endPos
= in_str
.getLength();
300 bUsePrimarySrchStr
= false;
301 sres2
= (this->*fnForward
)( in_str
, startPos
, endPos
);
303 for ( int k
= 0; k
< sres2
.startOffset
.getLength(); k
++ )
305 if (sres2
.startOffset
[k
])
306 sres2
.startOffset
[k
] = offset
[sres2
.startOffset
[k
]-1] + 1;
307 if (sres2
.endOffset
[k
])
308 sres2
.endOffset
[k
] = offset
[sres2
.endOffset
[k
]-1] + 1;
311 // pick first and long one
312 if ( sres
.subRegExpressions
== 0)
314 if ( sres2
.subRegExpressions
== 1)
316 if ( sres
.startOffset
[0] > sres2
.startOffset
[0])
318 else if ( sres
.startOffset
[0] == sres2
.startOffset
[0] &&
319 sres
.endOffset
[0] < sres2
.endOffset
[0])
327 SearchResult
TextSearch::searchBackward( const OUString
& searchStr
, sal_Int32 startPos
, sal_Int32 endPos
)
328 throw(RuntimeException
)
332 OUString
in_str(searchStr
);
333 sal_Int32 newStartPos
= startPos
;
334 sal_Int32 newEndPos
= endPos
;
336 bUsePrimarySrchStr
= true;
338 if ( xTranslit
.is() )
340 // apply only simple 1<->1 transliteration here
341 com::sun::star::uno::Sequence
<sal_Int32
> offset( in_str
.getLength());
342 in_str
= xTranslit
->transliterate( searchStr
, 0, in_str
.getLength(), offset
);
344 // JP 20.6.2001: also the start and end positions must be corrected!
345 if( startPos
< searchStr
.getLength() )
346 newStartPos
= FindPosInSeq_Impl( offset
, startPos
);
348 newStartPos
= in_str
.getLength();
351 newEndPos
= FindPosInSeq_Impl( offset
, endPos
);
353 sres
= (this->*fnBackward
)( in_str
, newStartPos
, newEndPos
);
355 // Map offsets back to untransliterated string.
356 const sal_Int32 nOffsets
= offset
.getLength();
359 // For regex nGroups is the number of groups+1 with group 0 being
361 const sal_Int32 nGroups
= sres
.startOffset
.getLength();
362 for ( sal_Int32 k
= 0; k
< nGroups
; k
++ )
364 const sal_Int32 nStart
= sres
.startOffset
[k
];
366 sres
.startOffset
[k
] = offset
[(nStart
<= nOffsets
? nStart
: nOffsets
) - 1] + 1;
367 // JP 20.6.2001: end is ever exclusive and then don't return
368 // the position of the next character - return the
369 // next position behind the last found character!
370 // "a b c" find "b" must return 2,3 and not 2,4!!!
371 const sal_Int32 nStop
= sres
.endOffset
[k
];
373 sres
.endOffset
[k
] = (nStop
< nOffsets
? offset
[nStop
] : (offset
[nOffsets
- 1] + 1));
379 sres
= (this->*fnBackward
)( in_str
, startPos
, endPos
);
382 if ( xTranslit2
.is() && aSrchPara
.algorithmType
!= SearchAlgorithms_REGEXP
)
386 in_str
= OUString(searchStr
);
387 com::sun::star::uno::Sequence
<sal_Int32
> offset( in_str
.getLength());
389 in_str
= xTranslit2
->transliterate(searchStr
, 0, in_str
.getLength(), offset
);
391 if( startPos
< searchStr
.getLength() )
392 startPos
= FindPosInSeq_Impl( offset
, startPos
);
394 startPos
= in_str
.getLength();
397 endPos
= FindPosInSeq_Impl( offset
, endPos
);
399 bUsePrimarySrchStr
= false;
400 sres2
= (this->*fnBackward
)( in_str
, startPos
, endPos
);
402 for( int k
= 0; k
< sres2
.startOffset
.getLength(); k
++ )
404 if (sres2
.startOffset
[k
])
405 sres2
.startOffset
[k
] = offset
[sres2
.startOffset
[k
]-1]+1;
406 if (sres2
.endOffset
[k
])
407 sres2
.endOffset
[k
] = offset
[sres2
.endOffset
[k
]-1]+1;
410 // pick last and long one
411 if ( sres
.subRegExpressions
== 0 )
413 if ( sres2
.subRegExpressions
== 1 )
415 if ( sres
.startOffset
[0] < sres2
.startOffset
[0] )
417 if ( sres
.startOffset
[0] == sres2
.startOffset
[0] &&
418 sres
.endOffset
[0] > sres2
.endOffset
[0] )
426 //---------------------------------------------------------------------
428 bool TextSearch::IsDelimiter( const OUString
& rStr
, sal_Int32 nPos
) const
431 if( '\x7f' != rStr
[nPos
])
433 if ( !xCharClass
.is() )
434 xCharClass
= CharacterClassification::create( m_xContext
);
435 sal_Int32 nCType
= xCharClass
->getCharacterType( rStr
, nPos
,
437 if( 0 != (( KCharacterType::DIGIT
| KCharacterType::ALPHA
|
438 KCharacterType::LETTER
) & nCType
) )
444 // --------- helper methods for Boyer-Moore like text searching ----------
445 // TODO: use ICU's regex UREGEX_LITERAL mode instead when it becomes available
447 void TextSearch::MakeForwardTab()
449 // create the jumptable for the search text
453 return ; // the jumpTable is ok
456 bIsForwardTab
= true;
458 sal_Int32 n
, nLen
= sSrchStr
.getLength();
459 pJumpTable
= new TextSearchJumpTable
;
461 for( n
= 0; n
< nLen
- 1; ++n
)
463 sal_Unicode cCh
= sSrchStr
[n
];
464 sal_Int32 nDiff
= nLen
- n
- 1;
465 TextSearchJumpTable::value_type
aEntry( cCh
, nDiff
);
467 ::std::pair
< TextSearchJumpTable::iterator
, bool > aPair
=
468 pJumpTable
->insert( aEntry
);
470 (*(aPair
.first
)).second
= nDiff
;
474 void TextSearch::MakeForwardTab2()
476 // create the jumptable for the search text
480 return ; // the jumpTable is ok
483 bIsForwardTab
= true;
485 sal_Int32 n
, nLen
= sSrchStr2
.getLength();
486 pJumpTable2
= new TextSearchJumpTable
;
488 for( n
= 0; n
< nLen
- 1; ++n
)
490 sal_Unicode cCh
= sSrchStr2
[n
];
491 sal_Int32 nDiff
= nLen
- n
- 1;
493 TextSearchJumpTable::value_type
aEntry( cCh
, nDiff
);
494 ::std::pair
< TextSearchJumpTable::iterator
, bool > aPair
=
495 pJumpTable2
->insert( aEntry
);
497 (*(aPair
.first
)).second
= nDiff
;
501 void TextSearch::MakeBackwardTab()
503 // create the jumptable for the search text
507 return ; // the jumpTable is ok
510 bIsForwardTab
= false;
512 sal_Int32 n
, nLen
= sSrchStr
.getLength();
513 pJumpTable
= new TextSearchJumpTable
;
515 for( n
= nLen
-1; n
> 0; --n
)
517 sal_Unicode cCh
= sSrchStr
[n
];
518 TextSearchJumpTable::value_type
aEntry( cCh
, n
);
519 ::std::pair
< TextSearchJumpTable::iterator
, bool > aPair
=
520 pJumpTable
->insert( aEntry
);
522 (*(aPair
.first
)).second
= n
;
526 void TextSearch::MakeBackwardTab2()
528 // create the jumptable for the search text
532 return ; // the jumpTable is ok
535 bIsForwardTab
= false;
537 sal_Int32 n
, nLen
= sSrchStr2
.getLength();
538 pJumpTable2
= new TextSearchJumpTable
;
540 for( n
= nLen
-1; n
> 0; --n
)
542 sal_Unicode cCh
= sSrchStr2
[n
];
543 TextSearchJumpTable::value_type
aEntry( cCh
, n
);
544 ::std::pair
< TextSearchJumpTable::iterator
, bool > aPair
=
545 pJumpTable2
->insert( aEntry
);
547 (*(aPair
.first
)).second
= n
;
551 sal_Int32
TextSearch::GetDiff( const sal_Unicode cChr
) const
553 TextSearchJumpTable
*pJump
;
556 if ( bUsePrimarySrchStr
) {
558 sSearchKey
= sSrchStr
;
561 sSearchKey
= sSrchStr2
;
564 TextSearchJumpTable::const_iterator iLook
= pJump
->find( cChr
);
565 if ( iLook
== pJump
->end() )
566 return sSearchKey
.getLength();
567 return (*iLook
).second
;
571 // TextSearch::NSrchFrwrd is mis-optimized on unxsoli (#i105945#)
572 SearchResult
TextSearch::NSrchFrwrd( const OUString
& searchStr
, sal_Int32 startPos
, sal_Int32 endPos
)
573 throw(RuntimeException
)
576 aRet
.subRegExpressions
= 0;
578 OUString sSearchKey
= bUsePrimarySrchStr
? sSrchStr
: sSrchStr2
;
580 OUString
aStr( searchStr
);
581 sal_Int32 nSuchIdx
= aStr
.getLength();
582 sal_Int32 nEnde
= endPos
;
583 if( !nSuchIdx
|| !sSearchKey
.getLength() || sSearchKey
.getLength() > nSuchIdx
)
587 if( nEnde
< sSearchKey
.getLength() ) // position inside the search region ?
590 nEnde
-= sSearchKey
.getLength();
592 if (bUsePrimarySrchStr
)
593 MakeForwardTab(); // create the jumptable
597 for (sal_Int32 nCmpIdx
= startPos
; // start position for the search
599 nCmpIdx
+= GetDiff( aStr
[nCmpIdx
+ sSearchKey
.getLength()-1]))
601 // if the match would be the completed cells, skip it.
602 if ( (checkCTLStart
&& !isCellStart( aStr
, nCmpIdx
)) || (checkCTLEnd
603 && !isCellStart( aStr
, nCmpIdx
+ sSearchKey
.getLength())) )
606 nSuchIdx
= sSearchKey
.getLength() - 1;
607 while( nSuchIdx
>= 0 && sSearchKey
[nSuchIdx
] == aStr
[nCmpIdx
+ nSuchIdx
])
611 if( SearchFlags::NORM_WORD_ONLY
& aSrchPara
.searchFlag
)
613 sal_Int32 nFndEnd
= nCmpIdx
+ sSearchKey
.getLength();
614 bool bAtStart
= !nCmpIdx
;
615 bool bAtEnd
= nFndEnd
== endPos
;
616 bool bDelimBefore
= bAtStart
|| IsDelimiter( aStr
, nCmpIdx
-1 );
617 bool bDelimBehind
= IsDelimiter( aStr
, nFndEnd
);
618 // * 1 -> only one word in the paragraph
619 // * 2 -> at begin of paragraph
620 // * 3 -> at end of paragraph
621 // * 4 -> inside the paragraph
622 if( !( ( bAtStart
&& bAtEnd
) || // 1
623 ( bAtStart
&& bDelimBehind
) || // 2
624 ( bAtEnd
&& bDelimBefore
) || // 3
625 ( bDelimBefore
&& bDelimBehind
))) // 4
629 aRet
.subRegExpressions
= 1;
630 aRet
.startOffset
.realloc( 1 );
631 aRet
.startOffset
[ 0 ] = nCmpIdx
;
632 aRet
.endOffset
.realloc( 1 );
633 aRet
.endOffset
[ 0 ] = nCmpIdx
+ sSearchKey
.getLength();
644 SearchResult
TextSearch::NSrchBkwrd( const OUString
& searchStr
, sal_Int32 startPos
, sal_Int32 endPos
)
645 throw(RuntimeException
)
648 aRet
.subRegExpressions
= 0;
650 OUString sSearchKey
= bUsePrimarySrchStr
? sSrchStr
: sSrchStr2
;
652 OUString
aStr( searchStr
);
653 sal_Int32 nSuchIdx
= aStr
.getLength();
654 sal_Int32 nEnde
= endPos
;
655 if( nSuchIdx
== 0 || sSearchKey
.isEmpty() || sSearchKey
.getLength() > nSuchIdx
)
658 if (bUsePrimarySrchStr
)
659 MakeBackwardTab(); // create the jumptable
663 if( nEnde
== nSuchIdx
) // end position for the search
664 nEnde
= sSearchKey
.getLength();
666 nEnde
+= sSearchKey
.getLength();
668 sal_Int32 nCmpIdx
= startPos
; // start position for the search
670 while (nCmpIdx
>= nEnde
)
672 // if the match would be the completed cells, skip it.
673 if ( (!checkCTLStart
|| isCellStart( aStr
, nCmpIdx
-
674 sSearchKey
.getLength() )) && (!checkCTLEnd
||
675 isCellStart( aStr
, nCmpIdx
)))
678 while( nSuchIdx
< sSearchKey
.getLength() && sSearchKey
[nSuchIdx
] ==
679 aStr
[nCmpIdx
+ nSuchIdx
- sSearchKey
.getLength()] )
681 if( nSuchIdx
>= sSearchKey
.getLength() )
683 if( SearchFlags::NORM_WORD_ONLY
& aSrchPara
.searchFlag
)
685 sal_Int32 nFndStt
= nCmpIdx
- sSearchKey
.getLength();
686 bool bAtStart
= !nFndStt
;
687 bool bAtEnd
= nCmpIdx
== startPos
;
688 bool bDelimBehind
= IsDelimiter( aStr
, nCmpIdx
);
689 bool bDelimBefore
= bAtStart
|| // begin of paragraph
690 IsDelimiter( aStr
, nFndStt
-1 );
691 // * 1 -> only one word in the paragraph
692 // * 2 -> at begin of paragraph
693 // * 3 -> at end of paragraph
694 // * 4 -> inside the paragraph
695 if( ( bAtStart
&& bAtEnd
) || // 1
696 ( bAtStart
&& bDelimBehind
) || // 2
697 ( bAtEnd
&& bDelimBefore
) || // 3
698 ( bDelimBefore
&& bDelimBehind
)) // 4
700 aRet
.subRegExpressions
= 1;
701 aRet
.startOffset
.realloc( 1 );
702 aRet
.startOffset
[ 0 ] = nCmpIdx
;
703 aRet
.endOffset
.realloc( 1 );
704 aRet
.endOffset
[ 0 ] = nCmpIdx
- sSearchKey
.getLength();
710 aRet
.subRegExpressions
= 1;
711 aRet
.startOffset
.realloc( 1 );
712 aRet
.startOffset
[ 0 ] = nCmpIdx
;
713 aRet
.endOffset
.realloc( 1 );
714 aRet
.endOffset
[ 0 ] = nCmpIdx
- sSearchKey
.getLength();
719 nSuchIdx
= GetDiff( aStr
[nCmpIdx
- sSearchKey
.getLength()] );
720 if( nCmpIdx
< nSuchIdx
)
727 void TextSearch::RESrchPrepare( const ::com::sun::star::util::SearchOptions
& rOptions
)
729 // select the transliterated pattern string
730 const OUString
& rPatternStr
=
731 (rOptions
.transliterateFlags
& SIMPLE_TRANS_MASK
) ? sSrchStr
732 : ((rOptions
.transliterateFlags
& COMPLEX_TRANS_MASK
) ? sSrchStr2
: rOptions
.searchString
);
734 sal_uInt32 nIcuSearchFlags
= UREGEX_UWORD
; // request UAX#29 unicode capability
735 // map com::sun::star::util::SearchFlags to ICU uregex.h flags
736 // TODO: REG_EXTENDED, REG_NOT_BEGINOFLINE, REG_NOT_ENDOFLINE
737 // REG_NEWLINE is neither properly defined nor used anywhere => not implemented
738 // REG_NOSUB is not used anywhere => not implemented
739 // NORM_WORD_ONLY is only used for SearchAlgorithm==Absolute
740 // LEV_RELAXED is only used for SearchAlgorithm==Approximate
741 // Note that the search flag ALL_IGNORE_CASE is deprecated in UNO
742 // probably because the transliteration flag IGNORE_CASE handles it as well.
743 if( (rOptions
.searchFlag
& com::sun::star::util::SearchFlags::ALL_IGNORE_CASE
) != 0
744 || (rOptions
.transliterateFlags
& TransliterationModules_IGNORE_CASE
) != 0)
745 nIcuSearchFlags
|= UREGEX_CASE_INSENSITIVE
;
746 UErrorCode nIcuErr
= U_ZERO_ERROR
;
747 // assumption: transliteration didn't mangle regexp control chars
748 IcuUniString
aIcuSearchPatStr( (const UChar
*)rPatternStr
.getStr(), rPatternStr
.getLength());
749 #ifndef DISABLE_WORDBOUND_EMULATION
750 // for conveniance specific syntax elements of the old regex engine are emulated
751 // - by replacing \< with "word-break followed by a look-ahead word-char"
752 static const IcuUniString
aChevronPatternB( "\\\\<", -1, IcuUniString::kInvariant
);
753 static const IcuUniString
aChevronReplaceB( "\\\\b(?=\\\\w)", -1, IcuUniString::kInvariant
);
754 static RegexMatcher
aChevronMatcherB( aChevronPatternB
, 0, nIcuErr
);
755 aChevronMatcherB
.reset( aIcuSearchPatStr
);
756 aIcuSearchPatStr
= aChevronMatcherB
.replaceAll( aChevronReplaceB
, nIcuErr
);
757 aChevronMatcherB
.reset();
758 // - by replacing \> with "look-behind word-char followed by a word-break"
759 static const IcuUniString
aChevronPatternE( "\\\\>", -1, IcuUniString::kInvariant
);
760 static const IcuUniString
aChevronReplaceE( "(?<=\\\\w)\\\\b", -1, IcuUniString::kInvariant
);
761 static RegexMatcher
aChevronMatcherE( aChevronPatternE
, 0, nIcuErr
);
762 aChevronMatcherE
.reset( aIcuSearchPatStr
);
763 aIcuSearchPatStr
= aChevronMatcherE
.replaceAll( aChevronReplaceE
, nIcuErr
);
764 aChevronMatcherE
.reset();
766 pRegexMatcher
= new RegexMatcher( aIcuSearchPatStr
, nIcuSearchFlags
, nIcuErr
);
769 delete pRegexMatcher
;
770 pRegexMatcher
= NULL
;
774 // Pathological patterns may result in exponential run time making the
775 // application appear to be frozen. Limit that. Documentation for this
777 // https://ssl.icu-project.org/apiref/icu4c/classicu_1_1RegexMatcher.html#a6ebcfcab4fe6a38678c0291643a03a00
778 // "The units of the limit are steps of the match engine.
779 // Correspondence with actual processor time will depend on the speed
780 // of the processor and the details of the specific pattern, but will
781 // typically be on the order of milliseconds."
782 // Just what is a good value? 42 is always an answer ... the 23 enigma
783 // as well.. which on the dev's machine is roughly 50 seconds with the
784 // pattern of fdo#70627.
785 /* TODO: make this a configuration settable value and possibly take
786 * complexity of expression into account and maybe even length of text
787 * to be matched; currently (2013-11-25) that is at most one 64k
788 * paragraph per RESrchFrwrd()/RESrchBkwrd() call. */
789 pRegexMatcher
->setTimeLimit( 23*1000, nIcuErr
);
793 //---------------------------------------------------------------------------
795 SearchResult
TextSearch::RESrchFrwrd( const OUString
& searchStr
,
796 sal_Int32 startPos
, sal_Int32 endPos
)
797 throw(RuntimeException
)
800 aRet
.subRegExpressions
= 0;
804 if( endPos
> searchStr
.getLength())
805 endPos
= searchStr
.getLength();
807 // use the ICU RegexMatcher to find the matches
808 UErrorCode nIcuErr
= U_ZERO_ERROR
;
809 const IcuUniString
aSearchTargetStr( (const UChar
*)searchStr
.getStr(), endPos
);
810 pRegexMatcher
->reset( aSearchTargetStr
);
811 // search until there is a valid match
814 if( !pRegexMatcher
->find( startPos
, nIcuErr
))
817 // #i118887# ignore zero-length matches e.g. "a*" in "bc"
818 int nStartOfs
= pRegexMatcher
->start( nIcuErr
);
819 int nEndOfs
= pRegexMatcher
->end( nIcuErr
);
820 if( nStartOfs
< nEndOfs
)
822 // If the zero-length match is behind the string, do not match it again
823 // and again until startPos reaches there. A match behind the string is
825 if (nStartOfs
== endPos
)
827 // try at next position if there was a zero-length match
828 if( ++startPos
>= endPos
)
832 // extract the result of the search
833 const int nGroupCount
= pRegexMatcher
->groupCount();
834 aRet
.subRegExpressions
= nGroupCount
+ 1;
835 aRet
.startOffset
.realloc( aRet
.subRegExpressions
);
836 aRet
.endOffset
.realloc( aRet
.subRegExpressions
);
837 aRet
.startOffset
[0] = pRegexMatcher
->start( nIcuErr
);
838 aRet
.endOffset
[0] = pRegexMatcher
->end( nIcuErr
);
839 for( int i
= 1; i
<= nGroupCount
; ++i
) {
840 aRet
.startOffset
[i
] = pRegexMatcher
->start( i
, nIcuErr
);
841 aRet
.endOffset
[i
] = pRegexMatcher
->end( i
, nIcuErr
);
847 SearchResult
TextSearch::RESrchBkwrd( const OUString
& searchStr
,
848 sal_Int32 startPos
, sal_Int32 endPos
)
849 throw(RuntimeException
)
851 // NOTE: for backwards search callers provide startPos/endPos inverted!
853 aRet
.subRegExpressions
= 0;
857 if( startPos
> searchStr
.getLength())
858 startPos
= searchStr
.getLength();
860 // use the ICU RegexMatcher to find the matches
861 // TODO: use ICU's backward searching once it becomes available
862 // as its replacement using forward search is not as good as the real thing
863 UErrorCode nIcuErr
= U_ZERO_ERROR
;
864 const IcuUniString
aSearchTargetStr( (const UChar
*)searchStr
.getStr(), startPos
);
865 pRegexMatcher
->reset( aSearchTargetStr
);
866 if( !pRegexMatcher
->find( endPos
, nIcuErr
))
869 // find the last match
872 int nGoodPos
= 0, nGoodEnd
= 0;
875 nLastPos
= pRegexMatcher
->start( nIcuErr
);
876 nFoundEnd
= pRegexMatcher
->end( nIcuErr
);
877 if (nLastPos
< nFoundEnd
)
879 // remember last non-zero-length match
881 nGoodEnd
= nFoundEnd
;
883 if( nFoundEnd
>= startPos
)
886 if( nFoundEnd
== nLastPos
)
888 } while( pRegexMatcher
->find( nFoundEnd
, nIcuErr
));
890 // Ignore all zero-length matches except "$" anchor on first match.
891 if (nGoodPos
== nGoodEnd
)
893 if (bFirst
&& nLastPos
== startPos
)
899 // find last match again to get its details
900 pRegexMatcher
->find( nGoodPos
, nIcuErr
);
902 // fill in the details of the last match
903 const int nGroupCount
= pRegexMatcher
->groupCount();
904 aRet
.subRegExpressions
= nGroupCount
+ 1;
905 aRet
.startOffset
.realloc( aRet
.subRegExpressions
);
906 aRet
.endOffset
.realloc( aRet
.subRegExpressions
);
907 // NOTE: existing users of backward search seem to expect startOfs/endOfs being inverted!
908 aRet
.startOffset
[0] = pRegexMatcher
->end( nIcuErr
);
909 aRet
.endOffset
[0] = pRegexMatcher
->start( nIcuErr
);
910 for( int i
= 1; i
<= nGroupCount
; ++i
) {
911 aRet
.startOffset
[i
] = pRegexMatcher
->end( i
, nIcuErr
);
912 aRet
.endOffset
[i
] = pRegexMatcher
->start( i
, nIcuErr
);
918 //---------------------------------------------------------------------------
920 // search for words phonetically
921 SearchResult
TextSearch::ApproxSrchFrwrd( const OUString
& searchStr
,
922 sal_Int32 startPos
, sal_Int32 endPos
)
923 throw(RuntimeException
)
926 aRet
.subRegExpressions
= 0;
931 OUString
aWTemp( searchStr
);
933 register sal_Int32 nStt
, nEnd
;
935 Boundary aWBnd
= xBreak
->getWordBoundary( aWTemp
, startPos
,
937 WordType::ANYWORD_IGNOREWHITESPACES
, sal_True
);
941 if( aWBnd
.startPos
>= endPos
)
943 nStt
= aWBnd
.startPos
< startPos
? startPos
: aWBnd
.startPos
;
944 nEnd
= aWBnd
.endPos
> endPos
? endPos
: aWBnd
.endPos
;
947 pWLD
->WLD( aWTemp
.getStr() + nStt
, nEnd
- nStt
) <= nLimit
)
949 aRet
.subRegExpressions
= 1;
950 aRet
.startOffset
.realloc( 1 );
951 aRet
.startOffset
[ 0 ] = nStt
;
952 aRet
.endOffset
.realloc( 1 );
953 aRet
.endOffset
[ 0 ] = nEnd
;
958 aWBnd
= xBreak
->nextWord( aWTemp
, nStt
, aSrchPara
.Locale
,
959 WordType::ANYWORD_IGNOREWHITESPACES
);
960 } while( aWBnd
.startPos
!= aWBnd
.endPos
||
961 (aWBnd
.endPos
!= aWTemp
.getLength() && aWBnd
.endPos
!= nEnd
) );
962 // #i50244# aWBnd.endPos != nEnd : in case there is _no_ word (only
963 // whitespace) in searchStr, getWordBoundary() returned startPos,startPos
964 // and nextWord() does also => don't loop forever.
968 SearchResult
TextSearch::ApproxSrchBkwrd( const OUString
& searchStr
,
969 sal_Int32 startPos
, sal_Int32 endPos
)
970 throw(RuntimeException
)
973 aRet
.subRegExpressions
= 0;
978 OUString
aWTemp( searchStr
);
980 register sal_Int32 nStt
, nEnd
;
982 Boundary aWBnd
= xBreak
->getWordBoundary( aWTemp
, startPos
,
984 WordType::ANYWORD_IGNOREWHITESPACES
, sal_True
);
988 if( aWBnd
.endPos
<= endPos
)
990 nStt
= aWBnd
.startPos
< endPos
? endPos
: aWBnd
.startPos
;
991 nEnd
= aWBnd
.endPos
> startPos
? startPos
: aWBnd
.endPos
;
994 pWLD
->WLD( aWTemp
.getStr() + nStt
, nEnd
- nStt
) <= nLimit
)
996 aRet
.subRegExpressions
= 1;
997 aRet
.startOffset
.realloc( 1 );
998 aRet
.startOffset
[ 0 ] = nEnd
;
999 aRet
.endOffset
.realloc( 1 );
1000 aRet
.endOffset
[ 0 ] = nStt
;
1006 aWBnd
= xBreak
->previousWord( aWTemp
, nStt
, aSrchPara
.Locale
,
1007 WordType::ANYWORD_IGNOREWHITESPACES
);
1008 } while( aWBnd
.startPos
!= aWBnd
.endPos
|| aWBnd
.endPos
!= aWTemp
.getLength() );
1013 static const sal_Char cSearchName
[] = "com.sun.star.util.TextSearch";
1014 static const sal_Char cSearchImpl
[] = "com.sun.star.util.TextSearch_i18n";
1016 static OUString
getServiceName_Static()
1018 return OUString::createFromAscii( cSearchName
);
1021 static OUString
getImplementationName_Static()
1023 return OUString::createFromAscii( cSearchImpl
);
1027 TextSearch::getImplementationName()
1028 throw( RuntimeException
)
1030 return getImplementationName_Static();
1034 TextSearch::supportsService(const OUString
& rServiceName
)
1035 throw( RuntimeException
)
1037 return rServiceName
== cSearchName
;
1040 Sequence
< OUString
> SAL_CALL
1041 TextSearch::getSupportedServiceNames(void) throw( RuntimeException
)
1043 Sequence
< OUString
> aRet(1);
1044 aRet
[0] = getServiceName_Static();
1048 ::com::sun::star::uno::Reference
< ::com::sun::star::uno::XInterface
>
1049 SAL_CALL
TextSearch_CreateInstance(
1050 const ::com::sun::star::uno::Reference
<
1051 ::com::sun::star::lang::XMultiServiceFactory
>& rxMSF
)
1053 return ::com::sun::star::uno::Reference
<
1054 ::com::sun::star::uno::XInterface
>(
1055 (::cppu::OWeakObject
*) new TextSearch(
1056 comphelper::getComponentContext( rxMSF
) ) );
1061 SAL_DLLPUBLIC_EXPORT
void* SAL_CALL
1062 i18nsearch_component_getFactory( const sal_Char
* sImplementationName
,
1063 void* _pServiceManager
,
1064 SAL_UNUSED_PARAMETER
void* )
1068 ::com::sun::star::lang::XMultiServiceFactory
* pServiceManager
=
1069 reinterpret_cast< ::com::sun::star::lang::XMultiServiceFactory
* >
1070 ( _pServiceManager
);
1071 ::com::sun::star::uno::Reference
<
1072 ::com::sun::star::lang::XSingleServiceFactory
> xFactory
;
1074 if ( 0 == rtl_str_compare( sImplementationName
, cSearchImpl
) )
1076 ::com::sun::star::uno::Sequence
< OUString
> aServiceNames(1);
1077 aServiceNames
[0] = getServiceName_Static();
1078 xFactory
= ::cppu::createSingleFactory(
1079 pServiceManager
, getImplementationName_Static(),
1080 &TextSearch_CreateInstance
, aServiceNames
);
1083 if ( xFactory
.is() )
1085 xFactory
->acquire();
1086 pRet
= xFactory
.get();
1094 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */