1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
3 * This file is part of the LibreOffice project.
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 * This file incorporates work covered by the following license notice:
11 * Licensed to the Apache Software Foundation (ASF) under one or more
12 * contributor license agreements. See the NOTICE file distributed
13 * with this work for additional information regarding copyright
14 * ownership. The ASF licenses this file to you under the Apache
15 * License, Version 2.0 (the "License"); you may not use this file
16 * except in compliance with the License. You may obtain a copy of
17 * the License at http://www.apache.org/licenses/LICENSE-2.0 .
20 #include "textsearch.hxx"
22 #include <com/sun/star/lang/Locale.hpp>
23 #include <com/sun/star/lang/XMultiServiceFactory.hpp>
24 #include <comphelper/processfactory.hxx>
25 #include <com/sun/star/i18n/BreakIterator.hpp>
26 #include <com/sun/star/i18n/UnicodeType.hpp>
27 #include <com/sun/star/util/SearchFlags.hpp>
28 #include <com/sun/star/i18n/WordType.hpp>
29 #include <com/sun/star/i18n/ScriptType.hpp>
30 #include <com/sun/star/i18n/CharacterIteratorMode.hpp>
31 #include <com/sun/star/i18n/CharacterClassification.hpp>
32 #include <com/sun/star/i18n/KCharacterType.hpp>
33 #include <com/sun/star/i18n/Transliteration.hpp>
34 #include <com/sun/star/registry/XRegistryKey.hpp>
35 #include <cppuhelper/factory.hxx>
36 #include <cppuhelper/weak.hxx>
39 // get rid of that dumb compiler warning
40 // identifier was truncated to '255' characters in the debug information
41 // for STL template usage, if .pdb files are to be created
42 #pragma warning( disable: 4786 )
47 using namespace ::com::sun::star::util
;
48 using namespace ::com::sun::star::uno
;
49 using namespace ::com::sun::star::lang
;
50 using namespace ::com::sun::star::i18n
;
51 using namespace ::com::sun::star
;
53 static sal_Int32 COMPLEX_TRANS_MASK_TMP
=
54 TransliterationModules_ignoreBaFa_ja_JP
|
55 TransliterationModules_ignoreIterationMark_ja_JP
|
56 TransliterationModules_ignoreTiJi_ja_JP
|
57 TransliterationModules_ignoreHyuByu_ja_JP
|
58 TransliterationModules_ignoreSeZe_ja_JP
|
59 TransliterationModules_ignoreIandEfollowedByYa_ja_JP
|
60 TransliterationModules_ignoreKiKuFollowedBySa_ja_JP
|
61 TransliterationModules_ignoreProlongedSoundMark_ja_JP
;
62 static const sal_Int32 COMPLEX_TRANS_MASK
= COMPLEX_TRANS_MASK_TMP
| TransliterationModules_IGNORE_KANA
| TransliterationModules_FULLWIDTH_HALFWIDTH
;
63 static const sal_Int32 SIMPLE_TRANS_MASK
= ~COMPLEX_TRANS_MASK
;
64 // Above 2 transliteration is simple but need to take effect in
65 // complex transliteration
67 TextSearch::TextSearch(const Reference
< XComponentContext
> & rxContext
)
68 : m_xContext( rxContext
)
71 , pRegexMatcher( NULL
)
75 aOpt
.algorithmType
= SearchAlgorithms_ABSOLUTE
;
76 aOpt
.searchFlag
= SearchFlags::ALL_IGNORE_CASE
;
81 TextSearch::~TextSearch()
89 void TextSearch::setOptions( const SearchOptions
& rOptions
) throw( RuntimeException
)
93 delete pRegexMatcher
, pRegexMatcher
= NULL
;
94 delete pWLD
, pWLD
= 0;
95 delete pJumpTable
, pJumpTable
= 0;
96 delete pJumpTable2
, pJumpTable2
= 0;
98 // Create Transliteration class
99 if( aSrchPara
.transliterateFlags
& SIMPLE_TRANS_MASK
)
101 if( !xTranslit
.is() )
102 xTranslit
.set( Transliteration::create( m_xContext
) );
103 xTranslit
->loadModule(
104 (TransliterationModules
)( aSrchPara
.transliterateFlags
& SIMPLE_TRANS_MASK
),
107 else if( xTranslit
.is() )
110 // Create Transliteration for 2<->1, 2<->2 transliteration
111 if ( aSrchPara
.transliterateFlags
& COMPLEX_TRANS_MASK
)
113 if( !xTranslit2
.is() )
114 xTranslit2
.set( Transliteration::create( m_xContext
) );
115 // Load transliteration module
116 xTranslit2
->loadModule(
117 (TransliterationModules
)( aSrchPara
.transliterateFlags
& COMPLEX_TRANS_MASK
),
122 xBreak
= com::sun::star::i18n::BreakIterator::create( m_xContext
);
124 sSrchStr
= aSrchPara
.searchString
;
126 // use transliteration here
127 if ( xTranslit
.is() &&
128 aSrchPara
.transliterateFlags
& SIMPLE_TRANS_MASK
)
129 sSrchStr
= xTranslit
->transliterateString2String(
130 aSrchPara
.searchString
, 0, aSrchPara
.searchString
.getLength());
132 if ( xTranslit2
.is() &&
133 aSrchPara
.transliterateFlags
& COMPLEX_TRANS_MASK
)
134 sSrchStr2
= xTranslit2
->transliterateString2String(
135 aSrchPara
.searchString
, 0, aSrchPara
.searchString
.getLength());
137 // When start or end of search string is a complex script type, we need to
138 // make sure the result boundary is not located in the middle of cell.
139 checkCTLStart
= (xBreak
.is() && (xBreak
->getScriptType(sSrchStr
, 0) ==
140 ScriptType::COMPLEX
));
141 checkCTLEnd
= (xBreak
.is() && (xBreak
->getScriptType(sSrchStr
,
142 sSrchStr
.getLength()-1) == ScriptType::COMPLEX
));
144 switch( aSrchPara
.algorithmType
)
146 case SearchAlgorithms_REGEXP
:
147 fnForward
= &TextSearch::RESrchFrwrd
;
148 fnBackward
= &TextSearch::RESrchBkwrd
;
149 RESrchPrepare( aSrchPara
);
152 case SearchAlgorithms_APPROXIMATE
:
153 fnForward
= &TextSearch::ApproxSrchFrwrd
;
154 fnBackward
= &TextSearch::ApproxSrchBkwrd
;
156 pWLD
= new WLevDistance( sSrchStr
.getStr(), aSrchPara
.changedChars
,
157 aSrchPara
.insertedChars
, aSrchPara
.deletedChars
,
158 0 != (SearchFlags::LEV_RELAXED
& aSrchPara
.searchFlag
) );
160 nLimit
= pWLD
->GetLimit();
164 fnForward
= &TextSearch::NSrchFrwrd
;
165 fnBackward
= &TextSearch::NSrchBkwrd
;
170 sal_Int32
FindPosInSeq_Impl( const Sequence
<sal_Int32
>& rOff
, sal_Int32 nPos
)
172 sal_Int32 nRet
= 0, nEnd
= rOff
.getLength();
173 while( nRet
< nEnd
&& nPos
> rOff
[ nRet
] ) ++nRet
;
177 sal_Bool
TextSearch::isCellStart(const OUString
& searchStr
, sal_Int32 nPos
)
178 throw( RuntimeException
)
181 return nPos
== xBreak
->previousCharacters(searchStr
, nPos
+1,
182 aSrchPara
.Locale
, CharacterIteratorMode::SKIPCELL
, 1, nDone
);
185 SearchResult
TextSearch::searchForward( const OUString
& searchStr
, sal_Int32 startPos
, sal_Int32 endPos
)
186 throw( RuntimeException
)
190 OUString
in_str(searchStr
);
191 sal_Int32 newStartPos
= startPos
;
192 sal_Int32 newEndPos
= endPos
;
194 bUsePrimarySrchStr
= true;
196 if ( xTranslit
.is() )
198 // apply normal transliteration (1<->1, 1<->0)
199 com::sun::star::uno::Sequence
<sal_Int32
> offset( in_str
.getLength());
200 in_str
= xTranslit
->transliterate( searchStr
, 0, in_str
.getLength(), offset
);
202 // JP 20.6.2001: also the start and end positions must be corrected!
204 newStartPos
= FindPosInSeq_Impl( offset
, startPos
);
206 if( endPos
< searchStr
.getLength() )
207 newEndPos
= FindPosInSeq_Impl( offset
, endPos
);
209 newEndPos
= in_str
.getLength();
211 sres
= (this->*fnForward
)( in_str
, newStartPos
, newEndPos
);
213 for ( int k
= 0; k
< sres
.startOffset
.getLength(); k
++ )
215 if (sres
.startOffset
[k
])
216 sres
.startOffset
[k
] = offset
[sres
.startOffset
[k
]];
217 // JP 20.6.2001: end is ever exclusive and then don't return
218 // the position of the next character - return the
219 // next position behind the last found character!
220 // "a b c" find "b" must return 2,3 and not 2,4!!!
221 if (sres
.endOffset
[k
])
222 sres
.endOffset
[k
] = offset
[sres
.endOffset
[k
]-1] + 1;
227 sres
= (this->*fnForward
)( in_str
, startPos
, endPos
);
230 if ( xTranslit2
.is() && aSrchPara
.algorithmType
!= SearchAlgorithms_REGEXP
)
234 in_str
= OUString(searchStr
);
235 com::sun::star::uno::Sequence
<sal_Int32
> offset( in_str
.getLength());
237 in_str
= xTranslit2
->transliterate( searchStr
, 0, in_str
.getLength(), offset
);
240 startPos
= FindPosInSeq_Impl( offset
, startPos
);
242 if( endPos
< searchStr
.getLength() )
243 endPos
= FindPosInSeq_Impl( offset
, endPos
);
245 endPos
= in_str
.getLength();
247 bUsePrimarySrchStr
= false;
248 sres2
= (this->*fnForward
)( in_str
, startPos
, endPos
);
250 for ( int k
= 0; k
< sres2
.startOffset
.getLength(); k
++ )
252 if (sres2
.startOffset
[k
])
253 sres2
.startOffset
[k
] = offset
[sres2
.startOffset
[k
]-1] + 1;
254 if (sres2
.endOffset
[k
])
255 sres2
.endOffset
[k
] = offset
[sres2
.endOffset
[k
]-1] + 1;
258 // pick first and long one
259 if ( sres
.subRegExpressions
== 0)
261 if ( sres2
.subRegExpressions
== 1)
263 if ( sres
.startOffset
[0] > sres2
.startOffset
[0])
265 else if ( sres
.startOffset
[0] == sres2
.startOffset
[0] &&
266 sres
.endOffset
[0] < sres2
.endOffset
[0])
274 SearchResult
TextSearch::searchBackward( const OUString
& searchStr
, sal_Int32 startPos
, sal_Int32 endPos
)
275 throw(RuntimeException
)
279 OUString
in_str(searchStr
);
280 sal_Int32 newStartPos
= startPos
;
281 sal_Int32 newEndPos
= endPos
;
283 bUsePrimarySrchStr
= true;
285 if ( xTranslit
.is() )
287 // apply only simple 1<->1 transliteration here
288 com::sun::star::uno::Sequence
<sal_Int32
> offset( in_str
.getLength());
289 in_str
= xTranslit
->transliterate( searchStr
, 0, in_str
.getLength(), offset
);
291 // JP 20.6.2001: also the start and end positions must be corrected!
292 if( startPos
< searchStr
.getLength() )
293 newStartPos
= FindPosInSeq_Impl( offset
, startPos
);
295 newStartPos
= in_str
.getLength();
298 newEndPos
= FindPosInSeq_Impl( offset
, endPos
);
300 sres
= (this->*fnBackward
)( in_str
, newStartPos
, newEndPos
);
302 for ( int k
= 0; k
< sres
.startOffset
.getLength(); k
++ )
304 if (sres
.startOffset
[k
])
305 sres
.startOffset
[k
] = offset
[sres
.startOffset
[k
] - 1] + 1;
306 // JP 20.6.2001: end is ever exclusive and then don't return
307 // the position of the next character - return the
308 // next position behind the last found character!
309 // "a b c" find "b" must return 2,3 and not 2,4!!!
310 if (sres
.endOffset
[k
])
311 sres
.endOffset
[k
] = offset
[sres
.endOffset
[k
]];
316 sres
= (this->*fnBackward
)( in_str
, startPos
, endPos
);
319 if ( xTranslit2
.is() && aSrchPara
.algorithmType
!= SearchAlgorithms_REGEXP
)
323 in_str
= OUString(searchStr
);
324 com::sun::star::uno::Sequence
<sal_Int32
> offset( in_str
.getLength());
326 in_str
= xTranslit2
->transliterate(searchStr
, 0, in_str
.getLength(), offset
);
328 if( startPos
< searchStr
.getLength() )
329 startPos
= FindPosInSeq_Impl( offset
, startPos
);
331 startPos
= in_str
.getLength();
334 endPos
= FindPosInSeq_Impl( offset
, endPos
);
336 bUsePrimarySrchStr
= false;
337 sres2
= (this->*fnBackward
)( in_str
, startPos
, endPos
);
339 for( int k
= 0; k
< sres2
.startOffset
.getLength(); k
++ )
341 if (sres2
.startOffset
[k
])
342 sres2
.startOffset
[k
] = offset
[sres2
.startOffset
[k
]-1]+1;
343 if (sres2
.endOffset
[k
])
344 sres2
.endOffset
[k
] = offset
[sres2
.endOffset
[k
]-1]+1;
347 // pick last and long one
348 if ( sres
.subRegExpressions
== 0 )
350 if ( sres2
.subRegExpressions
== 1 )
352 if ( sres
.startOffset
[0] < sres2
.startOffset
[0] )
354 if ( sres
.startOffset
[0] == sres2
.startOffset
[0] &&
355 sres
.endOffset
[0] > sres2
.endOffset
[0] )
363 //---------------------------------------------------------------------
365 bool TextSearch::IsDelimiter( const OUString
& rStr
, sal_Int32 nPos
) const
368 if( '\x7f' != rStr
[nPos
])
370 if ( !xCharClass
.is() )
371 xCharClass
= CharacterClassification::create( m_xContext
);
372 sal_Int32 nCType
= xCharClass
->getCharacterType( rStr
, nPos
,
374 if( 0 != (( KCharacterType::DIGIT
| KCharacterType::ALPHA
|
375 KCharacterType::LETTER
) & nCType
) )
381 // --------- helper methods for Boyer-Moore like text searching ----------
382 // TODO: use ICU's regex UREGEX_LITERAL mode instead when it becomes available
384 void TextSearch::MakeForwardTab()
386 // create the jumptable for the search text
390 return ; // the jumpTable is ok
393 bIsForwardTab
= true;
395 sal_Int32 n
, nLen
= sSrchStr
.getLength();
396 pJumpTable
= new TextSearchJumpTable
;
398 for( n
= 0; n
< nLen
- 1; ++n
)
400 sal_Unicode cCh
= sSrchStr
[n
];
401 sal_Int32 nDiff
= nLen
- n
- 1;
402 TextSearchJumpTable::value_type
aEntry( cCh
, nDiff
);
404 ::std::pair
< TextSearchJumpTable::iterator
, bool > aPair
=
405 pJumpTable
->insert( aEntry
);
407 (*(aPair
.first
)).second
= nDiff
;
411 void TextSearch::MakeForwardTab2()
413 // create the jumptable for the search text
417 return ; // the jumpTable is ok
420 bIsForwardTab
= true;
422 sal_Int32 n
, nLen
= sSrchStr2
.getLength();
423 pJumpTable2
= new TextSearchJumpTable
;
425 for( n
= 0; n
< nLen
- 1; ++n
)
427 sal_Unicode cCh
= sSrchStr2
[n
];
428 sal_Int32 nDiff
= nLen
- n
- 1;
430 TextSearchJumpTable::value_type
aEntry( cCh
, nDiff
);
431 ::std::pair
< TextSearchJumpTable::iterator
, bool > aPair
=
432 pJumpTable2
->insert( aEntry
);
434 (*(aPair
.first
)).second
= nDiff
;
438 void TextSearch::MakeBackwardTab()
440 // create the jumptable for the search text
444 return ; // the jumpTable is ok
447 bIsForwardTab
= false;
449 sal_Int32 n
, nLen
= sSrchStr
.getLength();
450 pJumpTable
= new TextSearchJumpTable
;
452 for( n
= nLen
-1; n
> 0; --n
)
454 sal_Unicode cCh
= sSrchStr
[n
];
455 TextSearchJumpTable::value_type
aEntry( cCh
, n
);
456 ::std::pair
< TextSearchJumpTable::iterator
, bool > aPair
=
457 pJumpTable
->insert( aEntry
);
459 (*(aPair
.first
)).second
= n
;
463 void TextSearch::MakeBackwardTab2()
465 // create the jumptable for the search text
469 return ; // the jumpTable is ok
472 bIsForwardTab
= false;
474 sal_Int32 n
, nLen
= sSrchStr2
.getLength();
475 pJumpTable2
= new TextSearchJumpTable
;
477 for( n
= nLen
-1; n
> 0; --n
)
479 sal_Unicode cCh
= sSrchStr2
[n
];
480 TextSearchJumpTable::value_type
aEntry( cCh
, n
);
481 ::std::pair
< TextSearchJumpTable::iterator
, bool > aPair
=
482 pJumpTable2
->insert( aEntry
);
484 (*(aPair
.first
)).second
= n
;
488 sal_Int32
TextSearch::GetDiff( const sal_Unicode cChr
) const
490 TextSearchJumpTable
*pJump
;
493 if ( bUsePrimarySrchStr
) {
495 sSearchKey
= sSrchStr
;
498 sSearchKey
= sSrchStr2
;
501 TextSearchJumpTable::const_iterator iLook
= pJump
->find( cChr
);
502 if ( iLook
== pJump
->end() )
503 return sSearchKey
.getLength();
504 return (*iLook
).second
;
508 // TextSearch::NSrchFrwrd is mis-optimized on unxsoli (#i105945#)
509 SearchResult
TextSearch::NSrchFrwrd( const OUString
& searchStr
, sal_Int32 startPos
, sal_Int32 endPos
)
510 throw(RuntimeException
)
513 aRet
.subRegExpressions
= 0;
515 OUString sSearchKey
= bUsePrimarySrchStr
? sSrchStr
: sSrchStr2
;
517 OUString
aStr( searchStr
);
518 sal_Int32 nSuchIdx
= aStr
.getLength();
519 sal_Int32 nEnde
= endPos
;
520 if( !nSuchIdx
|| !sSearchKey
.getLength() || sSearchKey
.getLength() > nSuchIdx
)
524 if( nEnde
< sSearchKey
.getLength() ) // position inside the search region ?
527 nEnde
-= sSearchKey
.getLength();
529 if (bUsePrimarySrchStr
)
530 MakeForwardTab(); // create the jumptable
534 for (sal_Int32 nCmpIdx
= startPos
; // start position for the search
536 nCmpIdx
+= GetDiff( aStr
[nCmpIdx
+ sSearchKey
.getLength()-1]))
538 // if the match would be the completed cells, skip it.
539 if ( (checkCTLStart
&& !isCellStart( aStr
, nCmpIdx
)) || (checkCTLEnd
540 && !isCellStart( aStr
, nCmpIdx
+ sSearchKey
.getLength())) )
543 nSuchIdx
= sSearchKey
.getLength() - 1;
544 while( nSuchIdx
>= 0 && sSearchKey
[nSuchIdx
] == aStr
[nCmpIdx
+ nSuchIdx
])
548 if( SearchFlags::NORM_WORD_ONLY
& aSrchPara
.searchFlag
)
550 sal_Int32 nFndEnd
= nCmpIdx
+ sSearchKey
.getLength();
551 bool bAtStart
= !nCmpIdx
;
552 bool bAtEnd
= nFndEnd
== endPos
;
553 bool bDelimBefore
= bAtStart
|| IsDelimiter( aStr
, nCmpIdx
-1 );
554 bool bDelimBehind
= IsDelimiter( aStr
, nFndEnd
);
555 // * 1 -> only one word in the paragraph
556 // * 2 -> at begin of paragraph
557 // * 3 -> at end of paragraph
558 // * 4 -> inside the paragraph
559 if( !( ( bAtStart
&& bAtEnd
) || // 1
560 ( bAtStart
&& bDelimBehind
) || // 2
561 ( bAtEnd
&& bDelimBefore
) || // 3
562 ( bDelimBefore
&& bDelimBehind
))) // 4
566 aRet
.subRegExpressions
= 1;
567 aRet
.startOffset
.realloc( 1 );
568 aRet
.startOffset
[ 0 ] = nCmpIdx
;
569 aRet
.endOffset
.realloc( 1 );
570 aRet
.endOffset
[ 0 ] = nCmpIdx
+ sSearchKey
.getLength();
581 SearchResult
TextSearch::NSrchBkwrd( const OUString
& searchStr
, sal_Int32 startPos
, sal_Int32 endPos
)
582 throw(RuntimeException
)
585 aRet
.subRegExpressions
= 0;
587 OUString sSearchKey
= bUsePrimarySrchStr
? sSrchStr
: sSrchStr2
;
589 OUString
aStr( searchStr
);
590 sal_Int32 nSuchIdx
= aStr
.getLength();
591 sal_Int32 nEnde
= endPos
;
592 if( nSuchIdx
== 0 || sSearchKey
.isEmpty() || sSearchKey
.getLength() > nSuchIdx
)
595 if (bUsePrimarySrchStr
)
596 MakeBackwardTab(); // create the jumptable
600 if( nEnde
== nSuchIdx
) // end position for the search
601 nEnde
= sSearchKey
.getLength();
603 nEnde
+= sSearchKey
.getLength();
605 sal_Int32 nCmpIdx
= startPos
; // start position for the search
607 while (nCmpIdx
>= nEnde
)
609 // if the match would be the completed cells, skip it.
610 if ( (!checkCTLStart
|| isCellStart( aStr
, nCmpIdx
-
611 sSearchKey
.getLength() )) && (!checkCTLEnd
||
612 isCellStart( aStr
, nCmpIdx
)))
615 while( nSuchIdx
< sSearchKey
.getLength() && sSearchKey
[nSuchIdx
] ==
616 aStr
[nCmpIdx
+ nSuchIdx
- sSearchKey
.getLength()] )
618 if( nSuchIdx
>= sSearchKey
.getLength() )
620 if( SearchFlags::NORM_WORD_ONLY
& aSrchPara
.searchFlag
)
622 sal_Int32 nFndStt
= nCmpIdx
- sSearchKey
.getLength();
623 bool bAtStart
= !nFndStt
;
624 bool bAtEnd
= nCmpIdx
== startPos
;
625 bool bDelimBehind
= IsDelimiter( aStr
, nCmpIdx
);
626 bool bDelimBefore
= bAtStart
|| // begin of paragraph
627 IsDelimiter( aStr
, nFndStt
-1 );
628 // * 1 -> only one word in the paragraph
629 // * 2 -> at begin of paragraph
630 // * 3 -> at end of paragraph
631 // * 4 -> inside the paragraph
632 if( ( bAtStart
&& bAtEnd
) || // 1
633 ( bAtStart
&& bDelimBehind
) || // 2
634 ( bAtEnd
&& bDelimBefore
) || // 3
635 ( bDelimBefore
&& bDelimBehind
)) // 4
637 aRet
.subRegExpressions
= 1;
638 aRet
.startOffset
.realloc( 1 );
639 aRet
.startOffset
[ 0 ] = nCmpIdx
;
640 aRet
.endOffset
.realloc( 1 );
641 aRet
.endOffset
[ 0 ] = nCmpIdx
- sSearchKey
.getLength();
647 aRet
.subRegExpressions
= 1;
648 aRet
.startOffset
.realloc( 1 );
649 aRet
.startOffset
[ 0 ] = nCmpIdx
;
650 aRet
.endOffset
.realloc( 1 );
651 aRet
.endOffset
[ 0 ] = nCmpIdx
- sSearchKey
.getLength();
656 nSuchIdx
= GetDiff( aStr
[nCmpIdx
- sSearchKey
.getLength()] );
657 if( nCmpIdx
< nSuchIdx
)
664 void TextSearch::RESrchPrepare( const ::com::sun::star::util::SearchOptions
& rOptions
)
666 // select the transliterated pattern string
667 const OUString
& rPatternStr
=
668 (rOptions
.transliterateFlags
& SIMPLE_TRANS_MASK
) ? sSrchStr
669 : ((rOptions
.transliterateFlags
& COMPLEX_TRANS_MASK
) ? sSrchStr2
: rOptions
.searchString
);
671 sal_uInt32 nIcuSearchFlags
= UREGEX_UWORD
; // request UAX#29 unicode capability
672 // map com::sun::star::util::SearchFlags to ICU uregex.h flags
673 // TODO: REG_EXTENDED, REG_NOT_BEGINOFLINE, REG_NOT_ENDOFLINE
674 // REG_NEWLINE is neither properly defined nor used anywhere => not implemented
675 // REG_NOSUB is not used anywhere => not implemented
676 // NORM_WORD_ONLY is only used for SearchAlgorithm==Absolute
677 // LEV_RELAXED is only used for SearchAlgorithm==Approximate
678 // why is even ALL_IGNORE_CASE deprecated in UNO? because of transliteration taking care of it???
679 if( (rOptions
.searchFlag
& com::sun::star::util::SearchFlags::ALL_IGNORE_CASE
) != 0)
680 nIcuSearchFlags
|= UREGEX_CASE_INSENSITIVE
;
681 UErrorCode nIcuErr
= U_ZERO_ERROR
;
682 // assumption: transliteration didn't mangle regexp control chars
683 IcuUniString
aIcuSearchPatStr( (const UChar
*)rPatternStr
.getStr(), rPatternStr
.getLength());
684 #ifndef DISABLE_WORDBOUND_EMULATION
685 // for conveniance specific syntax elements of the old regex engine are emulated
686 // by using regular word boundary matching \b to replace \< and \>
687 static const IcuUniString
aChevronPattern( "\\\\<|\\\\>", -1, IcuUniString::kInvariant
);
688 static const IcuUniString
aChevronReplace( "\\\\b", -1, IcuUniString::kInvariant
);
689 static RegexMatcher
aChevronMatcher( aChevronPattern
, 0, nIcuErr
);
690 aChevronMatcher
.reset( aIcuSearchPatStr
);
691 aIcuSearchPatStr
= aChevronMatcher
.replaceAll( aChevronReplace
, nIcuErr
);
692 aChevronMatcher
.reset();
694 pRegexMatcher
= new RegexMatcher( aIcuSearchPatStr
, nIcuSearchFlags
, nIcuErr
);
696 { delete pRegexMatcher
; pRegexMatcher
= NULL
;}
699 //---------------------------------------------------------------------------
701 SearchResult
TextSearch::RESrchFrwrd( const OUString
& searchStr
,
702 sal_Int32 startPos
, sal_Int32 endPos
)
703 throw(RuntimeException
)
706 aRet
.subRegExpressions
= 0;
710 if( endPos
> searchStr
.getLength())
711 endPos
= searchStr
.getLength();
713 // use the ICU RegexMatcher to find the matches
714 UErrorCode nIcuErr
= U_ZERO_ERROR
;
715 const IcuUniString
aSearchTargetStr( (const UChar
*)searchStr
.getStr(), endPos
);
716 pRegexMatcher
->reset( aSearchTargetStr
);
717 // search until there is a valid match
720 if( !pRegexMatcher
->find( startPos
, nIcuErr
))
723 // #i118887# ignore zero-length matches e.g. "a*" in "bc"
724 int nStartOfs
= pRegexMatcher
->start( nIcuErr
);
725 int nEndOfs
= pRegexMatcher
->end( nIcuErr
);
726 if( nStartOfs
< nEndOfs
)
728 // try at next position if there was a zero-length match
729 if( ++startPos
>= endPos
)
733 // extract the result of the search
734 const int nGroupCount
= pRegexMatcher
->groupCount();
735 aRet
.subRegExpressions
= nGroupCount
+ 1;
736 aRet
.startOffset
.realloc( aRet
.subRegExpressions
);
737 aRet
.endOffset
.realloc( aRet
.subRegExpressions
);
738 aRet
.startOffset
[0] = pRegexMatcher
->start( nIcuErr
);
739 aRet
.endOffset
[0] = pRegexMatcher
->end( nIcuErr
);
740 for( int i
= 1; i
<= nGroupCount
; ++i
) {
741 aRet
.startOffset
[i
] = pRegexMatcher
->start( i
, nIcuErr
);
742 aRet
.endOffset
[i
] = pRegexMatcher
->end( i
, nIcuErr
);
748 SearchResult
TextSearch::RESrchBkwrd( const OUString
& searchStr
,
749 sal_Int32 startPos
, sal_Int32 endPos
)
750 throw(RuntimeException
)
752 // NOTE: for backwards search callers provide startPos/endPos inverted!
754 aRet
.subRegExpressions
= 0;
758 if( startPos
> searchStr
.getLength())
759 startPos
= searchStr
.getLength();
761 // use the ICU RegexMatcher to find the matches
762 // TODO: use ICU's backward searching once it becomes available
763 // as its replacement using forward search is not as good as the real thing
764 UErrorCode nIcuErr
= U_ZERO_ERROR
;
765 const IcuUniString
aSearchTargetStr( (const UChar
*)searchStr
.getStr(), startPos
);
766 pRegexMatcher
->reset( aSearchTargetStr
);
767 if( !pRegexMatcher
->find( endPos
, nIcuErr
))
770 // find the last match
773 nLastPos
= pRegexMatcher
->start( nIcuErr
);
774 } while( pRegexMatcher
->find( nLastPos
+ 1, nIcuErr
));
776 // find last match again to get its details
777 pRegexMatcher
->find( nLastPos
, nIcuErr
);
779 // fill in the details of the last match
780 const int nGroupCount
= pRegexMatcher
->groupCount();
781 aRet
.subRegExpressions
= nGroupCount
+ 1;
782 aRet
.startOffset
.realloc( aRet
.subRegExpressions
);
783 aRet
.endOffset
.realloc( aRet
.subRegExpressions
);
784 // NOTE: existing users of backward search seem to expect startOfs/endOfs being inverted!
785 aRet
.startOffset
[0] = pRegexMatcher
->end( nIcuErr
);
786 aRet
.endOffset
[0] = pRegexMatcher
->start( nIcuErr
);
787 for( int i
= 1; i
<= nGroupCount
; ++i
) {
788 aRet
.startOffset
[i
] = pRegexMatcher
->end( i
, nIcuErr
);
789 aRet
.endOffset
[i
] = pRegexMatcher
->start( i
, nIcuErr
);
795 //---------------------------------------------------------------------------
797 // search for words phonetically
798 SearchResult
TextSearch::ApproxSrchFrwrd( const OUString
& searchStr
,
799 sal_Int32 startPos
, sal_Int32 endPos
)
800 throw(RuntimeException
)
803 aRet
.subRegExpressions
= 0;
808 OUString
aWTemp( searchStr
);
810 register sal_Int32 nStt
, nEnd
;
812 Boundary aWBnd
= xBreak
->getWordBoundary( aWTemp
, startPos
,
814 WordType::ANYWORD_IGNOREWHITESPACES
, sal_True
);
818 if( aWBnd
.startPos
>= endPos
)
820 nStt
= aWBnd
.startPos
< startPos
? startPos
: aWBnd
.startPos
;
821 nEnd
= aWBnd
.endPos
> endPos
? endPos
: aWBnd
.endPos
;
824 pWLD
->WLD( aWTemp
.getStr() + nStt
, nEnd
- nStt
) <= nLimit
)
826 aRet
.subRegExpressions
= 1;
827 aRet
.startOffset
.realloc( 1 );
828 aRet
.startOffset
[ 0 ] = nStt
;
829 aRet
.endOffset
.realloc( 1 );
830 aRet
.endOffset
[ 0 ] = nEnd
;
835 aWBnd
= xBreak
->nextWord( aWTemp
, nStt
, aSrchPara
.Locale
,
836 WordType::ANYWORD_IGNOREWHITESPACES
);
837 } while( aWBnd
.startPos
!= aWBnd
.endPos
||
838 (aWBnd
.endPos
!= aWTemp
.getLength() && aWBnd
.endPos
!= nEnd
) );
839 // #i50244# aWBnd.endPos != nEnd : in case there is _no_ word (only
840 // whitespace) in searchStr, getWordBoundary() returned startPos,startPos
841 // and nextWord() does also => don't loop forever.
845 SearchResult
TextSearch::ApproxSrchBkwrd( const OUString
& searchStr
,
846 sal_Int32 startPos
, sal_Int32 endPos
)
847 throw(RuntimeException
)
850 aRet
.subRegExpressions
= 0;
855 OUString
aWTemp( searchStr
);
857 register sal_Int32 nStt
, nEnd
;
859 Boundary aWBnd
= xBreak
->getWordBoundary( aWTemp
, startPos
,
861 WordType::ANYWORD_IGNOREWHITESPACES
, sal_True
);
865 if( aWBnd
.endPos
<= endPos
)
867 nStt
= aWBnd
.startPos
< endPos
? endPos
: aWBnd
.startPos
;
868 nEnd
= aWBnd
.endPos
> startPos
? startPos
: aWBnd
.endPos
;
871 pWLD
->WLD( aWTemp
.getStr() + nStt
, nEnd
- nStt
) <= nLimit
)
873 aRet
.subRegExpressions
= 1;
874 aRet
.startOffset
.realloc( 1 );
875 aRet
.startOffset
[ 0 ] = nEnd
;
876 aRet
.endOffset
.realloc( 1 );
877 aRet
.endOffset
[ 0 ] = nStt
;
883 aWBnd
= xBreak
->previousWord( aWTemp
, nStt
, aSrchPara
.Locale
,
884 WordType::ANYWORD_IGNOREWHITESPACES
);
885 } while( aWBnd
.startPos
!= aWBnd
.endPos
|| aWBnd
.endPos
!= aWTemp
.getLength() );
890 static const sal_Char cSearchName
[] = "com.sun.star.util.TextSearch";
891 static const sal_Char cSearchImpl
[] = "com.sun.star.util.TextSearch_i18n";
893 static OUString
getServiceName_Static()
895 return OUString::createFromAscii( cSearchName
);
898 static OUString
getImplementationName_Static()
900 return OUString::createFromAscii( cSearchImpl
);
904 TextSearch::getImplementationName()
905 throw( RuntimeException
)
907 return getImplementationName_Static();
911 TextSearch::supportsService(const OUString
& rServiceName
)
912 throw( RuntimeException
)
914 return rServiceName
== cSearchName
;
917 Sequence
< OUString
> SAL_CALL
918 TextSearch::getSupportedServiceNames(void) throw( RuntimeException
)
920 Sequence
< OUString
> aRet(1);
921 aRet
[0] = getServiceName_Static();
925 ::com::sun::star::uno::Reference
< ::com::sun::star::uno::XInterface
>
926 SAL_CALL
TextSearch_CreateInstance(
927 const ::com::sun::star::uno::Reference
<
928 ::com::sun::star::lang::XMultiServiceFactory
>& rxMSF
)
930 return ::com::sun::star::uno::Reference
<
931 ::com::sun::star::uno::XInterface
>(
932 (::cppu::OWeakObject
*) new TextSearch(
933 comphelper::getComponentContext( rxMSF
) ) );
938 SAL_DLLPUBLIC_EXPORT
void* SAL_CALL
939 i18nsearch_component_getFactory( const sal_Char
* sImplementationName
,
940 void* _pServiceManager
,
941 SAL_UNUSED_PARAMETER
void* )
945 ::com::sun::star::lang::XMultiServiceFactory
* pServiceManager
=
946 reinterpret_cast< ::com::sun::star::lang::XMultiServiceFactory
* >
947 ( _pServiceManager
);
948 ::com::sun::star::uno::Reference
<
949 ::com::sun::star::lang::XSingleServiceFactory
> xFactory
;
951 if ( 0 == rtl_str_compare( sImplementationName
, cSearchImpl
) )
953 ::com::sun::star::uno::Sequence
< OUString
> aServiceNames(1);
954 aServiceNames
[0] = getServiceName_Static();
955 xFactory
= ::cppu::createSingleFactory(
956 pServiceManager
, getImplementationName_Static(),
957 &TextSearch_CreateInstance
, aServiceNames
);
963 pRet
= xFactory
.get();
971 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */