1 /*************************************************************************
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5 * Copyright 2000, 2010 Oracle and/or its affiliates.
7 * OpenOffice.org - a multi-platform office productivity suite
9 * This file is part of OpenOffice.org.
11 * OpenOffice.org is free software: you can redistribute it and/or modify
12 * it under the terms of the GNU Lesser General Public License version 3
13 * only, as published by the Free Software Foundation.
15 * OpenOffice.org is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU Lesser General Public License version 3 for more details
19 * (a copy is included in the LICENSE file that accompanied this code).
21 * You should have received a copy of the GNU Lesser General Public License
22 * version 3 along with OpenOffice.org. If not, see
23 * <http://www.openoffice.org/license.html>
24 * for a copy of the LGPLv3 License.
26 ************************************************************************/
28 // MARKER(update_precomp.py): autogen include statement, do not remove
29 #include "precompiled_i18npool.hxx"
30 #include <breakiterator_unicode.hxx>
31 #include <localedata.hxx>
32 #include <unicode/uchar.h>
33 #include <unicode/locid.h>
34 #include <unicode/rbbi.h>
35 #include <unicode/udata.h>
36 #include <rtl/strbuf.hxx>
37 #include <rtl/ustring.hxx>
// Data blob declared extern here; loadICUBreakIterator hands it to ICU with
// udata_setAppData under the package name "OpenOffice".
40 extern const char OpenOffice_dat
[];
43 using namespace ::com::sun::star
;
44 using namespace ::com::sun::star::lang
;
45 using namespace ::rtl
;
47 namespace com
{ namespace sun
{ namespace star
{ namespace i18n
{
// Shorthand for a default-constructed UNO RuntimeException; used as `throw ERROR`.
49 #define ERROR ::com::sun::star::uno::RuntimeException()
51 //#define ImplementName "com.sun.star.i18n.BreakIterator_Unicode";
// Constructor. Initializes cBreakIterator with the implementation name string;
// the same member is later returned by getImplementationName and compared in
// supportsService.
// NOTE(review): the remainder of the initializer list and the body are not
// visible in this excerpt.
54 BreakIterator_Unicode::BreakIterator_Unicode() :
55 cBreakIterator( "com.sun.star.i18n.BreakIterator_Unicode" ), // implementation name
// Destructor: deletes the ICU break iterators owned by this object.
71 BreakIterator_Unicode::~BreakIterator_Unicode()
// First delete the iterator reached through icuBI and null the pointer.
// loadICUBreakIterator points icuBI at one of character/word/sentence/line,
// so nulling it here keeps the matching check below from deleting the same
// object a second time.
73 if (icuBI
&& icuBI
->aBreakIterator
) {
74 delete icuBI
->aBreakIterator
;
75 icuBI
->aBreakIterator
=NULL
;
// Then delete whatever is still held in each of the four per-type slots.
77 if (character
.aBreakIterator
) delete character
.aBreakIterator
;
78 if (word
.aBreakIterator
) delete word
.aBreakIterator
;
79 if (sentence
.aBreakIterator
) delete sentence
.aBreakIterator
;
80 if (line
.aBreakIterator
) delete line
.aBreakIterator
;
84 // Wrapper class to provide public access to the RuleBasedBreakIterator's setBreakType method.
// Subclass of ICU's RuleBasedBreakIterator used by loadICUBreakIterator.
// It adds publicSetBreakType() and a constructor that builds the iterator
// from a UDataMemory image.
87 class OOoRuleBasedBreakIterator
: public RuleBasedBreakIterator
{
// Sets the break type of this iterator; callers pass UBRK_CHARACTER, UBRK_WORD,
// UBRK_SENTENCE or UBRK_LINE. NOTE(review): the body is not visible here.
89 inline void publicSetBreakType(int32_t type
) {
// Forwards the data image and status to the RuleBasedBreakIterator constructor.
92 OOoRuleBasedBreakIterator(UDataMemory
* image
,
94 RuleBasedBreakIterator(image
, status
) { };
98 // loading ICU breakiterator on demand.
//
// Points icuBI at the slot (character, word, sentence or line) that matches
// rBreakType. When that slot holds no iterator, or rWordType / rLocale differ
// from the cached aWordType / aLocale, any old iterator is deleted and a new
// one is built: first a rule-based iterator from the "OpenOffice" ICU data
// package ("brk" items), and, if the slot is still empty afterwards, one of
// ICU's stock per-locale iterators. Finally the iterator text is set when
// newBreak is set or rText differs from the cached aICUText.
//
// rLocale    - locale the iterator is loaded for
// rBreakType - one of the LOAD_*_BREAKITERATOR constants
// rWordType  - WordType constant; chooses the rule name for word iterators
// rule       - name of the rule data item (reassigned for word iterators)
// rText      - text the iterator is going to work on
// Throws uno::RuntimeException (via ERROR) when udata_setAppData fails.
99 void SAL_CALL
BreakIterator_Unicode::loadICUBreakIterator(const com::sun::star::lang::Locale
& rLocale
,
100 sal_Int16 rBreakType
, sal_Int16 rWordType
, const sal_Char
*rule
, const OUString
& rText
) throw(uno::RuntimeException
)
102 sal_Bool newBreak
= sal_False
;
103 UErrorCode status
= U_ZERO_ERROR
;
104 sal_Int16 breakType
= 0;
// Select the slot for this break type. breakType is the index used further
// down to look up an entry in the locale's break-iterator rule list.
105 switch (rBreakType
) {
106 case LOAD_CHARACTER_BREAKITERATOR
: icuBI
=&character
; breakType
= 3; break;
107 case LOAD_WORD_BREAKITERATOR
: icuBI
=&word
;
// Word iterators: the word type decides both the index and the rule name.
109 case WordType::ANYWORD_IGNOREWHITESPACES
: breakType
= 0; rule
=wordRule
= "edit_word"; break;
110 case WordType::DICTIONARY_WORD
: breakType
= 1; rule
=wordRule
= "dict_word"; break;
111 case WordType::WORD_COUNT
: breakType
= 2; rule
=wordRule
= "count_word"; break;
114 case LOAD_SENTENCE_BREAKITERATOR
: icuBI
=&sentence
; breakType
= 5; break;
115 case LOAD_LINE_BREAKITERATOR
: icuBI
=&line
; breakType
= 4; break;
// Rebuild when the slot is empty or the cached word type / locale do not match.
117 if (!icuBI
->aBreakIterator
|| rWordType
!= aWordType
||
118 rLocale
.Language
!= aLocale
.Language
|| rLocale
.Country
!= aLocale
.Country
||
119 rLocale
.Variant
!= aLocale
.Variant
) {
// Drop the stale iterator before creating a replacement.
120 if (icuBI
->aBreakIterator
) {
121 delete icuBI
->aBreakIterator
;
122 icuBI
->aBreakIterator
=NULL
;
// Rule names supplied by the locale data, indexed by breakType.
125 uno::Sequence
< OUString
> breakRules
= LocaleData().getBreakIteratorRules(rLocale
);
// Register OpenOffice_dat as the data for package "OpenOffice" so that the
// udata_open calls below can resolve items in it.
127 status
= U_ZERO_ERROR
;
128 udata_setAppData("OpenOffice", OpenOffice_dat
, &status
);
129 if ( !U_SUCCESS(status
) ) throw ERROR
;
131 OOoRuleBasedBreakIterator
*rbi
= NULL
;
// Prefer the rule item named by the locale data when one is listed for this type.
// NOTE(review): rbi is assigned from `new` before status is inspected; the
// visible lines do not show rbi being deleted if construction reports a
// failure status. Confirm this path does not leak.
133 if (breakRules
.getLength() > breakType
&& breakRules
[breakType
].getLength() > 0) {
134 rbi
= new OOoRuleBasedBreakIterator(udata_open("OpenOffice", "brk",
135 OUStringToOString(breakRules
[breakType
], RTL_TEXTENCODING_ASCII_US
).getStr(), &status
), status
);
// Otherwise try an item whose name is built from rule plus the locale language...
137 status
= U_ZERO_ERROR
;
138 OStringBuffer
aUDName(64);
139 aUDName
.append(rule
);
141 aUDName
.append( OUStringToOString(rLocale
.Language
, RTL_TEXTENCODING_ASCII_US
));
142 UDataMemory
* pUData
= udata_open("OpenOffice", "brk", aUDName
.getStr(), &status
);
143 if( U_SUCCESS(status
) )
144 rbi
= new OOoRuleBasedBreakIterator( pUData
, status
);
// ...and, failing that, the item named by rule alone.
145 if (!U_SUCCESS(status
) ) {
146 status
= U_ZERO_ERROR
;
147 pUData
= udata_open("OpenOffice", "brk", rule
, &status
);
148 if( U_SUCCESS(status
) )
149 rbi
= new OOoRuleBasedBreakIterator( pUData
, status
);
150 if (!U_SUCCESS(status
) ) icuBI
->aBreakIterator
=NULL
;
// Tell the rule-based iterator which kind of break it implements, then store
// it in the selected slot.
154 switch (rBreakType
) {
155 case LOAD_CHARACTER_BREAKITERATOR
: rbi
->publicSetBreakType(UBRK_CHARACTER
); break;
156 case LOAD_WORD_BREAKITERATOR
: rbi
->publicSetBreakType(UBRK_WORD
); break;
157 case LOAD_SENTENCE_BREAKITERATOR
: rbi
->publicSetBreakType(UBRK_SENTENCE
); break;
158 case LOAD_LINE_BREAKITERATOR
: rbi
->publicSetBreakType(UBRK_LINE
); break;
160 icuBI
->aBreakIterator
= rbi
;
// Slot still empty: fall back to ICU's own iterator for this locale.
164 if (!icuBI
->aBreakIterator
) {
165 icu::Locale
icuLocale(
166 OUStringToOString(rLocale
.Language
, RTL_TEXTENCODING_ASCII_US
).getStr(),
167 OUStringToOString(rLocale
.Country
, RTL_TEXTENCODING_ASCII_US
).getStr(),
168 OUStringToOString(rLocale
.Variant
, RTL_TEXTENCODING_ASCII_US
).getStr());
170 status
= U_ZERO_ERROR
;
171 switch (rBreakType
) {
172 case LOAD_CHARACTER_BREAKITERATOR
:
173 icuBI
->aBreakIterator
= icu::BreakIterator::createCharacterInstance(icuLocale
, status
);
175 case LOAD_WORD_BREAKITERATOR
:
176 icuBI
->aBreakIterator
= icu::BreakIterator::createWordInstance(icuLocale
, status
);
178 case LOAD_SENTENCE_BREAKITERATOR
:
179 icuBI
->aBreakIterator
= icu::BreakIterator::createSentenceInstance(icuLocale
, status
);
181 case LOAD_LINE_BREAKITERATOR
:
182 icuBI
->aBreakIterator
= icu::BreakIterator::createLineInstance(icuLocale
, status
);
// A failing status means the fallback result is not used.
185 if ( !U_SUCCESS(status
) ) {
186 icuBI
->aBreakIterator
=NULL
;
// An iterator is available: remember which break type was loaded.
190 if (icuBI
->aBreakIterator
) {
193 aBreakType
=rBreakType
;
// Hand the text to ICU only when needed: newBreak is set, or the cached copy
// differs from rText (compare() is non-zero for unequal strings).
200 if (newBreak
|| icuBI
->aICUText
.compare(UnicodeString(reinterpret_cast<const UChar
*>(rText
.getStr()), rText
.getLength()))) { // UChar != sal_Unicode in MinGW
201 icuBI
->aICUText
=UnicodeString(reinterpret_cast<const UChar
*>(rText
.getStr()), rText
.getLength());
202 icuBI
->aBreakIterator
->setText(icuBI
->aICUText
);
// Moves nStartPos forward by up to nCount steps and counts the completed steps
// in nDone.
// In SKIPCELL mode each step goes to the next boundary of the character break
// iterator loaded for rLocale; in any other mode each step is one code point.
// In SKIPCELL mode Text.getLength() is returned as soon as the iterator reports
// BreakIterator::DONE.
207 sal_Int32 SAL_CALL
BreakIterator_Unicode::nextCharacters( const OUString
& Text
,
208 sal_Int32 nStartPos
, const lang::Locale
&rLocale
,
209 sal_Int16 nCharacterIteratorMode
, sal_Int32 nCount
, sal_Int32
& nDone
)
210 throw(uno::RuntimeException
)
212 if (nCharacterIteratorMode
== CharacterIteratorMode::SKIPCELL
) { // for CELL mode
213 loadICUBreakIterator(rLocale
, LOAD_CHARACTER_BREAKITERATOR
, 0, "char", Text
);
214 for (nDone
= 0; nDone
< nCount
; nDone
++) {
215 nStartPos
= character
.aBreakIterator
->following(nStartPos
);
// No further boundary: nDone keeps the number of steps completed so far.
216 if (nStartPos
== BreakIterator::DONE
)
217 return Text
.getLength();
219 } else { // for CHARACTER mode
// Stop early at the end of the text.
220 for (nDone
= 0; nDone
< nCount
&& nStartPos
< Text
.getLength(); nDone
++)
221 Text
.iterateCodePoints(&nStartPos
, 1);
// Moves nStartPos backward by up to nCount steps and counts the completed steps
// in nDone.
// In SKIPCELL mode each step goes to the previous boundary of the character
// break iterator loaded for rLocale; in any other mode each step is one code
// point.
226 sal_Int32 SAL_CALL
BreakIterator_Unicode::previousCharacters( const OUString
& Text
,
227 sal_Int32 nStartPos
, const lang::Locale
& rLocale
,
228 sal_Int16 nCharacterIteratorMode
, sal_Int32 nCount
, sal_Int32
& nDone
)
229 throw(uno::RuntimeException
)
231 if (nCharacterIteratorMode
== CharacterIteratorMode::SKIPCELL
) { // for CELL mode
232 loadICUBreakIterator(rLocale
, LOAD_CHARACTER_BREAKITERATOR
, 0, "char", Text
);
233 for (nDone
= 0; nDone
< nCount
; nDone
++) {
234 nStartPos
= character
.aBreakIterator
->preceding(nStartPos
);
// NOTE(review): the statement executed when the iterator reports DONE is not
// visible in this excerpt.
235 if (nStartPos
== BreakIterator::DONE
)
238 } else { // for BS to delete one char and CHARACTER mode.
// Stop early at the start of the text.
239 for (nDone
= 0; nDone
< nCount
&& nStartPos
> 0; nDone
++)
240 Text
.iterateCodePoints(&nStartPos
, -1);
// Locates the word that follows nStartPos, using the word break iterator for
// rLocale / rWordType, and writes its bounds into result (declared outside the
// visible lines).
246 Boundary SAL_CALL
BreakIterator_Unicode::nextWord( const OUString
& Text
, sal_Int32 nStartPos
,
247 const lang::Locale
& rLocale
, sal_Int16 rWordType
) throw(uno::RuntimeException
)
249 loadICUBreakIterator(rLocale
, LOAD_WORD_BREAKITERATOR
, rWordType
, NULL
, Text
);
// Candidate start: first word boundary after nStartPos.
251 result
.startPos
= word
.aBreakIterator
->following(nStartPos
);
// At or beyond the end of the text, or no boundary at all: empty range.
252 if( result
.startPos
>= Text
.getLength() || result
.startPos
== BreakIterator::DONE
)
253 result
.endPos
= result
.startPos
;
// For ANYWORD_IGNOREWHITESPACES and DICTIONARY_WORD a boundary that sits on
// whitespace is skipped by moving on to the next boundary.
255 if ( (rWordType
== WordType::ANYWORD_IGNOREWHITESPACES
||
256 rWordType
== WordType::DICTIONARY_WORD
) &&
257 u_isWhitespace(Text
.iterateCodePoints(&result
.startPos
, 0)) )
258 result
.startPos
= word
.aBreakIterator
->following(result
.startPos
);
// End is the boundary after the start; collapse to the start when there is none.
260 result
.endPos
= word
.aBreakIterator
->following(result
.startPos
);
261 if(result
.endPos
== BreakIterator::DONE
)
262 result
.endPos
= result
.startPos
;
// Locates the word that precedes nStartPos, using the word break iterator for
// rLocale / rWordType, and writes its bounds into result (declared outside the
// visible lines).
268 Boundary SAL_CALL
BreakIterator_Unicode::previousWord(const OUString
& Text
, sal_Int32 nStartPos
,
269 const lang::Locale
& rLocale
, sal_Int16 rWordType
) throw(uno::RuntimeException
)
271 loadICUBreakIterator(rLocale
, LOAD_WORD_BREAKITERATOR
, rWordType
, NULL
, Text
);
// Candidate start: last word boundary before nStartPos.
273 result
.startPos
= word
.aBreakIterator
->preceding(nStartPos
);
// Negative position or no boundary at all: empty range.
274 if( result
.startPos
< 0 || result
.startPos
== BreakIterator::DONE
)
275 result
.endPos
= result
.startPos
;
// For ANYWORD_IGNOREWHITESPACES and DICTIONARY_WORD a boundary that sits on
// whitespace is skipped by moving back to the previous boundary.
277 if ( (rWordType
== WordType::ANYWORD_IGNOREWHITESPACES
||
278 rWordType
== WordType::DICTIONARY_WORD
) &&
279 u_isWhitespace(Text
.iterateCodePoints(&result
.startPos
, 0)) )
280 result
.startPos
= word
.aBreakIterator
->preceding(result
.startPos
);
// End is the boundary after the start; collapse to the start when there is none.
282 result
.endPos
= word
.aBreakIterator
->following(result
.startPos
);
283 if(result
.endPos
== BreakIterator::DONE
)
284 result
.endPos
= result
.startPos
;
// Computes the bounds of the word at nPos, using the word break iterator for
// rLocale / rWordType, and writes them into result (declared outside the
// visible lines). When nPos is itself a boundary, bDirection selects whether
// the word after it (forward) or the word before it is reported.
290 Boundary SAL_CALL
BreakIterator_Unicode::getWordBoundary( const OUString
& Text
, sal_Int32 nPos
, const lang::Locale
& rLocale
,
291 sal_Int16 rWordType
, sal_Bool bDirection
) throw(uno::RuntimeException
)
293 loadICUBreakIterator(rLocale
, LOAD_WORD_BREAKITERATOR
, rWordType
, NULL
, Text
);
294 sal_Int32 len
= Text
.getLength();
// nPos lies exactly on a boundary: start from an empty range at nPos and
// extend it in one direction.
296 if(word
.aBreakIterator
->isBoundary(nPos
)) {
297 result
.startPos
= result
.endPos
= nPos
;
// Forward is also used at position 0, but never at or past the end of the text.
298 if((bDirection
|| nPos
== 0) && nPos
< len
) //forward
299 result
.endPos
= word
.aBreakIterator
->following(nPos
);
301 result
.startPos
= word
.aBreakIterator
->preceding(nPos
);
// End is the first boundary after position 0, or 0 for empty text.
305 result
.endPos
= len
? word
.aBreakIterator
->following((sal_Int32
)0) : 0;
// nPos at or past the end: start is the last boundary before the end.
306 } else if(nPos
>= len
) {
307 result
.startPos
= word
.aBreakIterator
->preceding(len
);
// nPos inside the text: take the boundaries on either side.
310 result
.startPos
= word
.aBreakIterator
->preceding(nPos
);
311 result
.endPos
= word
.aBreakIterator
->following(nPos
);
// If one side came back as DONE, collapse it onto the other side.
314 if (result
.startPos
== BreakIterator::DONE
)
315 result
.startPos
= result
.endPos
;
316 else if (result
.endPos
== BreakIterator::DONE
)
317 result
.endPos
= result
.startPos
;
// Moves nStartPos to the start of the sentence containing it, using the
// sentence break iterator for rLocale, and then past any whitespace at the
// start of that sentence.
323 sal_Int32 SAL_CALL
BreakIterator_Unicode::beginOfSentence( const OUString
& Text
, sal_Int32 nStartPos
,
324 const lang::Locale
&rLocale
) throw(uno::RuntimeException
)
326 loadICUBreakIterator(rLocale
, LOAD_SENTENCE_BREAKITERATOR
, 0, "sent", Text
);
328 sal_Int32 len
= Text
.getLength();
329 if (len
> 0 && nStartPos
== len
)
330 Text
.iterateCodePoints(&nStartPos
, -1); // issue #i27703# treat end position as part of last sentence
// Snap back to the previous sentence boundary unless already on one.
// NOTE(review): preceding() can report BreakIterator::DONE; the visible code
// passes nStartPos on to iterateCodePoints without checking for it. Confirm.
331 if (!sentence
.aBreakIterator
->isBoundary(nStartPos
))
332 nStartPos
= sentence
.aBreakIterator
->preceding(nStartPos
);
334 // skip preceding space.
335 sal_uInt32 ch
= Text
.iterateCodePoints(&nStartPos
, 1);
336 while (nStartPos
< len
&& u_isWhitespace(ch
)) ch
= Text
.iterateCodePoints(&nStartPos
, 1);
// Each read above advanced nStartPos past the code point it returned, so step
// back one code point to land on the last one read.
337 Text
.iterateCodePoints(&nStartPos
, -1);
// Moves nStartPos to the end of the sentence containing it, using the sentence
// break iterator for rLocale, and then back over any whitespace that directly
// precedes that boundary.
342 sal_Int32 SAL_CALL
BreakIterator_Unicode::endOfSentence( const OUString
& Text
, sal_Int32 nStartPos
,
343 const lang::Locale
&rLocale
) throw(uno::RuntimeException
)
345 loadICUBreakIterator(rLocale
, LOAD_SENTENCE_BREAKITERATOR
, 0, "sent", Text
);
347 sal_Int32 len
= Text
.getLength();
348 if (len
> 0 && nStartPos
== len
)
349 Text
.iterateCodePoints(&nStartPos
, -1); // issue #i27703# treat end position as part of last sentence
// Next sentence boundary after nStartPos.
350 nStartPos
= sentence
.aBreakIterator
->following(nStartPos
);
// Walk backwards while the code point before nPos is whitespace, pulling
// nStartPos back with it.
352 sal_Int32 nPos
=nStartPos
;
353 while (nPos
> 0 && u_isWhitespace(Text
.iterateCodePoints(&nPos
, -1))) nStartPos
=nPos
;
// Determines where a line may be broken at or before nStartPos and fills a
// LineBreakResults (breakIndex, breakType, and rHyphenatedWord when a
// hyphenation was found).
//   - nStartPos at or past the end of Text: break at the end, WORDBOUNDARY.
//   - nStartPos is itself a line boundary: break there, WORDBOUNDARY.
//   - otherwise, if hOptions carries a hyphenator: try to hyphenate the
//     dictionary word around nStartPos.
//   - otherwise: break at the line boundary before nStartPos, WORDBOUNDARY.
// For WORDBOUNDARY results the code then looks backwards from the break index
// over whitespace and Word Joiner characters.
// The LineBreakUserOptions parameter is unnamed and unused.
358 LineBreakResults SAL_CALL
BreakIterator_Unicode::getLineBreak(
359 const OUString
& Text
, sal_Int32 nStartPos
,
360 const lang::Locale
& rLocale
, sal_Int32 nMinBreakPos
,
361 const LineBreakHyphenationOptions
& hOptions
,
362 const LineBreakUserOptions
& /*rOptions*/ ) throw(uno::RuntimeException
)
364 LineBreakResults lbr
;
// Nothing left to break: report the end of the text.
366 if (nStartPos
>= Text
.getLength()) {
367 lbr
.breakIndex
= Text
.getLength();
368 lbr
.breakType
= BreakType::WORDBOUNDARY
;
372 loadICUBreakIterator(rLocale
, LOAD_LINE_BREAKITERATOR
, 0, lineRule
, Text
);
// Tested further down together with nStartPos < 0.
374 sal_Bool GlueSpace
=sal_True
;
// nStartPos is a line boundary when the last boundary before nStartPos + 1 is
// nStartPos itself.
376 if (line
.aBreakIterator
->preceding(nStartPos
+ 1) == nStartPos
) { //Line boundary break
377 lbr
.breakIndex
= nStartPos
;
378 lbr
.breakType
= BreakType::WORDBOUNDARY
;
379 } else if (hOptions
.rHyphenator
.is()) { //Hyphenation break
// Hyphenate the dictionary word around nStartPos; the hyphen index is passed
// relative to the start of that word.
380 Boundary wBoundary
= getWordBoundary( Text
, nStartPos
, rLocale
,
381 WordType::DICTIONARY_WORD
, false);
382 uno::Reference
< linguistic2::XHyphenatedWord
> aHyphenatedWord
;
383 aHyphenatedWord
= hOptions
.rHyphenator
->hyphenate(Text
.copy(wBoundary
.startPos
,
384 wBoundary
.endPos
- wBoundary
.startPos
), rLocale
,
385 (sal_Int16
) (hOptions
.hyphenIndex
- wBoundary
.startPos
), hOptions
.aHyphenationOptions
);
386 if (aHyphenatedWord
.is()) {
387 lbr
.rHyphenatedWord
= aHyphenatedWord
;
// Checks whether the position just after the hyphen would fall before
// nMinBreakPos. NOTE(review): the statement guarded by this test is not
// visible in this excerpt.
388 if(wBoundary
.startPos
+ aHyphenatedWord
->getHyphenationPos() + 1 < nMinBreakPos
)
391 lbr
.breakIndex
= wBoundary
.startPos
; //aHyphenatedWord->getHyphenationPos();
392 lbr
.breakType
= BreakType::HYPHENATION
;
// Break at the line boundary before nStartPos.
394 lbr
.breakIndex
= line
.aBreakIterator
->preceding(nStartPos
);
395 lbr
.breakType
= BreakType::WORDBOUNDARY
;;
397 } else { //word boundary break
398 lbr
.breakIndex
= line
.aBreakIterator
->preceding(nStartPos
);
399 lbr
.breakType
= BreakType::WORDBOUNDARY
;
402 #define WJ 0x2060 // Word Joiner
// For word-boundary breaks, scan backwards from the break index over
// whitespace and Word Joiner characters.
// NOTE(review): Text[nStartPos--] is indexed with lbr.breakIndex without a
// visible range check, and preceding() can report BreakIterator::DONE.
// Confirm breakIndex is always a valid index at this point.
404 if (lbr
.breakType
== BreakType::WORDBOUNDARY
) {
405 nStartPos
= lbr
.breakIndex
;
406 if (Text
[nStartPos
--] == WJ
)
408 while (nStartPos
>= 0 &&
409 (u_isWhitespace(Text
.iterateCodePoints(&nStartPos
, 0)) || Text
[nStartPos
] == WJ
)) {
410 if (Text
[nStartPos
--] == WJ
)
// Reached when the backward scan ran past the start of the text while
// GlueSpace is still set.
413 if (GlueSpace
&& nStartPos
< 0) {
// Returns the implementation name stored in cBreakIterator as an OUString.
426 BreakIterator_Unicode::getImplementationName(void) throw( uno::RuntimeException
)
428 return OUString::createFromAscii(cBreakIterator
);
// Reports whether rServiceName matches cBreakIterator. compareToAscii yields 0
// for equal strings, hence the negation.
432 BreakIterator_Unicode::supportsService(const OUString
& rServiceName
) throw( uno::RuntimeException
)
434 return !rServiceName
.compareToAscii(cBreakIterator
);
// Builds a one-element sequence whose only entry is cBreakIterator.
// NOTE(review): the return statement is not visible in this excerpt.
437 uno::Sequence
< OUString
> SAL_CALL
438 BreakIterator_Unicode::getSupportedServiceNames(void) throw( uno::RuntimeException
)
440 uno::Sequence
< OUString
> aRet(1);
441 aRet
[0] = OUString::createFromAscii(cBreakIterator
);