1 /*************************************************************************
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5 * Copyright 2008 by Sun Microsystems, Inc.
7 * OpenOffice.org - a multi-platform office productivity suite
9 * $RCSfile: breakiterator_unicode.cxx,v $
10 * $Revision: 1.36.2.1 $
12 * This file is part of OpenOffice.org.
14 * OpenOffice.org is free software: you can redistribute it and/or modify
15 * it under the terms of the GNU Lesser General Public License version 3
16 * only, as published by the Free Software Foundation.
18 * OpenOffice.org is distributed in the hope that it will be useful,
19 * but WITHOUT ANY WARRANTY; without even the implied warranty of
20 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 * GNU Lesser General Public License version 3 for more details
22 * (a copy is included in the LICENSE file that accompanied this code).
24 * You should have received a copy of the GNU Lesser General Public License
25 * version 3 along with OpenOffice.org. If not, see
26 * <http://www.openoffice.org/license.html>
27 * for a copy of the LGPLv3 License.
29 ************************************************************************/
31 // MARKER(update_precomp.py): autogen include statement, do not remove
32 #include "precompiled_i18npool.hxx"
33 #include <breakiterator_unicode.hxx>
34 #include <localedata.hxx>
35 #include <unicode/uchar.h>
36 #include <unicode/locid.h>
37 #include <unicode/rbbi.h>
38 #include <unicode/udata.h>
39 #include <rtl/strbuf.hxx>
40 #include <rtl/ustring.hxx>
43 extern const char OpenOffice_dat
[];
46 using namespace ::com::sun::star
;
47 using namespace ::com::sun::star::lang
;
48 using namespace ::rtl
;
50 namespace com
{ namespace sun
{ namespace star
{ namespace i18n
{
52 #define ERROR ::com::sun::star::uno::RuntimeException()
54 //#define ImplementName "com.sun.star.i18n.BreakIterator_Unicode";
// Constructor. cBreakIterator is initialised with the implementation name;
// that same member is what getImplementationName(), supportsService() and
// getSupportedServiceNames() later hand back to / compare against for callers.
57 BreakIterator_Unicode::BreakIterator_Unicode() :
58 cBreakIterator( "com.sun.star.i18n.BreakIterator_Unicode" ), // implementation name
// Destructor: frees the ICU break iterators held by this object.
// icuBI is pointed at one of the character/word/sentence/line slots by
// loadICUBreakIterator(), so the iterator reached through icuBI is deleted
// first and its slot is set to NULL; the per-slot deletes that follow then
// skip that (now empty) slot instead of freeing it a second time.
74 BreakIterator_Unicode::~BreakIterator_Unicode()
76 if (icuBI
&& icuBI
->aBreakIterator
) {
77 delete icuBI
->aBreakIterator
;
78 icuBI
->aBreakIterator
=NULL
;
// Release whatever is still held in each per-type slot.
80 if (character
.aBreakIterator
) delete character
.aBreakIterator
;
81 if (word
.aBreakIterator
) delete word
.aBreakIterator
;
82 if (sentence
.aBreakIterator
) delete sentence
.aBreakIterator
;
83 if (line
.aBreakIterator
) delete line
.aBreakIterator
;
87 Wrapper class to provide public access to the RuleBasedBreakIterator's
// Subclass of ICU's RuleBasedBreakIterator used by loadICUBreakIterator().
// It adds publicSetBreakType() and a constructor that forwards a UDataMemory
// image plus a status argument to the RuleBasedBreakIterator constructor.
90 class OOoRuleBasedBreakIterator
: public RuleBasedBreakIterator
{
// Hook for selecting the break type from outside the class;
// loadICUBreakIterator() calls it with UBRK_CHARACTER / UBRK_WORD /
// UBRK_SENTENCE / UBRK_LINE.
92 inline void publicSetBreakType(int32_t type
) {
// Builds the iterator from pre-opened rule data (`image`), passing `status`
// through to the base class.
95 OOoRuleBasedBreakIterator(UDataMemory
* image
,
97 RuleBasedBreakIterator(image
, status
) { };
101 // loading ICU breakiterator on demand.
//
// Selects the per-type slot (character / word / sentence / line) through
// icuBI, (re)creates its ICU break iterator when needed, and makes sure the
// iterator is positioned on rText.
//
// rLocale     locale whose break rules are wanted; compared field by field
//             (Language, Country, Variant) against the cached aLocale.
// rBreakType  one of the LOAD_*_BREAKITERATOR constants.
// rWordType   WordType constant; compared against the cached aWordType and,
//             for word iterators, used to pick the rule name.
// rule        base name of the rule data to open; overwritten (together with
//             wordRule) for word iterators.
// rText       text the iterator has to work on.
//
// Throws uno::RuntimeException (via the ERROR macro) when udata_setAppData()
// reports a failure.
102 void SAL_CALL
BreakIterator_Unicode::loadICUBreakIterator(const com::sun::star::lang::Locale
& rLocale
,
103 sal_Int16 rBreakType
, sal_Int16 rWordType
, const sal_Char
*rule
, const OUString
& rText
) throw(uno::RuntimeException
)
105 sal_Bool newBreak
= sal_False
;
106 UErrorCode status
= U_ZERO_ERROR
;
107 sal_Int16 breakType
= 0;
// Choose the slot icuBI points at and the value of breakType, which is used
// further down as an index into the locale's break-rule sequence.
108 switch (rBreakType
) {
109 case LOAD_CHARACTER_BREAKITERATOR
: icuBI
=&character
; breakType
= 3; break;
110 case LOAD_WORD_BREAKITERATOR
: icuBI
=&word
;
// Word iterators additionally pick index and rule name from the word type.
112 case WordType::ANYWORD_IGNOREWHITESPACES
: breakType
= 0; rule
=wordRule
= "edit_word"; break;
113 case WordType::DICTIONARY_WORD
: breakType
= 1; rule
=wordRule
= "dict_word"; break;
114 case WordType::WORD_COUNT
: breakType
= 2; rule
=wordRule
= "count_word"; break;
117 case LOAD_SENTENCE_BREAKITERATOR
: icuBI
=&sentence
; breakType
= 5; break;
118 case LOAD_LINE_BREAKITERATOR
: icuBI
=&line
; breakType
= 4; break;
// Rebuild when the slot is empty, or when the requested word type or any
// locale field differs from the cached values.
120 if (!icuBI
->aBreakIterator
|| rWordType
!= aWordType
||
121 rLocale
.Language
!= aLocale
.Language
|| rLocale
.Country
!= aLocale
.Country
||
122 rLocale
.Variant
!= aLocale
.Variant
) {
// Drop a stale iterator before creating its replacement.
123 if (icuBI
->aBreakIterator
) {
124 delete icuBI
->aBreakIterator
;
125 icuBI
->aBreakIterator
=NULL
;
// Rule names configured for this locale in the locale data.
128 uno::Sequence
< OUString
> breakRules
= LocaleData().getBreakIteratorRules(rLocale
);
// Register the OpenOffice_dat blob with ICU under the name "OpenOffice" so
// that the udata_open("OpenOffice", ...) calls below can find it.
130 status
= U_ZERO_ERROR
;
131 udata_setAppData("OpenOffice", OpenOffice_dat
, &status
);
132 if ( !U_SUCCESS(status
) ) throw ERROR
;
134 OOoRuleBasedBreakIterator
*rbi
= NULL
;
// First choice: a non-empty rule name listed for this locale at index
// breakType.
136 if (breakRules
.getLength() > breakType
&& breakRules
[breakType
].getLength() > 0) {
137 rbi
= new OOoRuleBasedBreakIterator(udata_open("OpenOffice", "brk",
138 OUStringToOString(breakRules
[breakType
], RTL_TEXTENCODING_ASCII_US
).getStr(), &status
), status
);
// Next attempt: a data name built from `rule` and the locale's language.
140 status
= U_ZERO_ERROR
;
141 OStringBuffer
aUDName(64);
142 aUDName
.append(rule
);
144 aUDName
.append( OUStringToOString(rLocale
.Language
, RTL_TEXTENCODING_ASCII_US
));
145 UDataMemory
* pUData
= udata_open("OpenOffice", "brk", aUDName
.getStr(), &status
);
146 if( U_SUCCESS(status
) )
147 rbi
= new OOoRuleBasedBreakIterator( pUData
, status
);
// That failed: retry with the plain `rule` name.
148 if (!U_SUCCESS(status
) ) {
149 status
= U_ZERO_ERROR
;
150 pUData
= udata_open("OpenOffice", "brk", rule
, &status
);
151 if( U_SUCCESS(status
) )
152 rbi
= new OOoRuleBasedBreakIterator( pUData
, status
);
153 if (!U_SUCCESS(status
) ) icuBI
->aBreakIterator
=NULL
;
// Tell the rule-based iterator which kind of break it implements, then
// store it in the selected slot.
157 switch (rBreakType
) {
158 case LOAD_CHARACTER_BREAKITERATOR
: rbi
->publicSetBreakType(UBRK_CHARACTER
); break;
159 case LOAD_WORD_BREAKITERATOR
: rbi
->publicSetBreakType(UBRK_WORD
); break;
160 case LOAD_SENTENCE_BREAKITERATOR
: rbi
->publicSetBreakType(UBRK_SENTENCE
); break;
161 case LOAD_LINE_BREAKITERATOR
: rbi
->publicSetBreakType(UBRK_LINE
); break;
163 icuBI
->aBreakIterator
= rbi
;
// Slot still empty: fall back to the iterators ICU itself provides for the
// locale.
167 if (!icuBI
->aBreakIterator
) {
168 icu::Locale
icuLocale(
169 OUStringToOString(rLocale
.Language
, RTL_TEXTENCODING_ASCII_US
).getStr(),
170 OUStringToOString(rLocale
.Country
, RTL_TEXTENCODING_ASCII_US
).getStr(),
171 OUStringToOString(rLocale
.Variant
, RTL_TEXTENCODING_ASCII_US
).getStr());
173 status
= U_ZERO_ERROR
;
174 switch (rBreakType
) {
175 case LOAD_CHARACTER_BREAKITERATOR
:
176 icuBI
->aBreakIterator
= icu::BreakIterator::createCharacterInstance(icuLocale
, status
);
178 case LOAD_WORD_BREAKITERATOR
:
179 icuBI
->aBreakIterator
= icu::BreakIterator::createWordInstance(icuLocale
, status
);
181 case LOAD_SENTENCE_BREAKITERATOR
:
182 icuBI
->aBreakIterator
= icu::BreakIterator::createSentenceInstance(icuLocale
, status
);
184 case LOAD_LINE_BREAKITERATOR
:
185 icuBI
->aBreakIterator
= icu::BreakIterator::createLineInstance(icuLocale
, status
);
// A failed factory call leaves the slot empty.
188 if ( !U_SUCCESS(status
) ) {
189 icuBI
->aBreakIterator
=NULL
;
// An iterator was obtained: remember which break type it was built for.
193 if (icuBI
->aBreakIterator
) {
196 aBreakType
=rBreakType
;
// Hand the text to the iterator only when newBreak is set or the cached
// aICUText differs from rText (compare() yields non-zero for a difference).
203 if (newBreak
|| icuBI
->aICUText
.compare(UnicodeString(reinterpret_cast<const UChar
*>(rText
.getStr()), rText
.getLength()))) { // UChar != sal_Unicode in MinGW
204 icuBI
->aICUText
=UnicodeString(reinterpret_cast<const UChar
*>(rText
.getStr()), rText
.getLength());
205 icuBI
->aBreakIterator
->setText(icuBI
->aICUText
);
// Moves nStartPos forward through Text by up to nCount steps and records in
// nDone how many steps were taken.
//
// Text                    text to traverse.
// nStartPos               starting index (taken by value, advanced locally).
// rLocale                 locale used to load the character break iterator.
// nCharacterIteratorMode  SKIPCELL steps cell by cell with the ICU character
//                         iterator; any other mode steps by code point.
// nCount                  maximum number of steps.
// nDone                   out: number of steps completed.
//
// In SKIPCELL mode Text.getLength() is returned as soon as the iterator
// reports DONE.
210 sal_Int32 SAL_CALL
BreakIterator_Unicode::nextCharacters( const OUString
& Text
,
211 sal_Int32 nStartPos
, const lang::Locale
&rLocale
,
212 sal_Int16 nCharacterIteratorMode
, sal_Int32 nCount
, sal_Int32
& nDone
)
213 throw(uno::RuntimeException
)
215 if (nCharacterIteratorMode
== CharacterIteratorMode::SKIPCELL
) { // for CELL mode
216 loadICUBreakIterator(rLocale
, LOAD_CHARACTER_BREAKITERATOR
, 0, "char", Text
);
// One following() call per requested step.
217 for (nDone
= 0; nDone
< nCount
; nDone
++) {
218 nStartPos
= character
.aBreakIterator
->following(nStartPos
);
219 if (nStartPos
== BreakIterator::DONE
)
220 return Text
.getLength();
222 } else { // for CHARACTER mode
// Advance one code point per step, stopping at the end of the text.
223 for (nDone
= 0; nDone
< nCount
&& nStartPos
< Text
.getLength(); nDone
++)
224 Text
.iterateCodePoints(&nStartPos
, 1);
// Moves nStartPos backward through Text by up to nCount steps and records in
// nDone how many steps were taken.
//
// Text                    text to traverse.
// nStartPos               starting index (taken by value, moved locally).
// rLocale                 locale used to load the character break iterator.
// nCharacterIteratorMode  SKIPCELL steps cell by cell with the ICU character
//                         iterator; any other mode steps by code point.
// nCount                  maximum number of steps.
// nDone                   out: number of steps completed.
229 sal_Int32 SAL_CALL
BreakIterator_Unicode::previousCharacters( const OUString
& Text
,
230 sal_Int32 nStartPos
, const lang::Locale
& rLocale
,
231 sal_Int16 nCharacterIteratorMode
, sal_Int32 nCount
, sal_Int32
& nDone
)
232 throw(uno::RuntimeException
)
234 if (nCharacterIteratorMode
== CharacterIteratorMode::SKIPCELL
) { // for CELL mode
235 loadICUBreakIterator(rLocale
, LOAD_CHARACTER_BREAKITERATOR
, 0, "char", Text
);
// One preceding() call per requested step; DONE is checked after each.
236 for (nDone
= 0; nDone
< nCount
; nDone
++) {
237 nStartPos
= character
.aBreakIterator
->preceding(nStartPos
);
238 if (nStartPos
== BreakIterator::DONE
)
241 } else { // for BS to delete one char and CHARACTER mode.
// Step back one code point at a time, stopping at the start of the text.
242 for (nDone
= 0; nDone
< nCount
&& nStartPos
> 0; nDone
++)
243 Text
.iterateCodePoints(&nStartPos
, -1);
// Locates the word that follows nStartPos and stores its limits in
// result.startPos / result.endPos.
//
// Text       text to search.
// nStartPos  index to search forward from.
// rLocale    locale used to load the word break iterator.
// rWordType  WordType constant selecting the word rules.
249 Boundary SAL_CALL
BreakIterator_Unicode::nextWord( const OUString
& Text
, sal_Int32 nStartPos
,
250 const lang::Locale
& rLocale
, sal_Int16 rWordType
) throw(uno::RuntimeException
)
252 loadICUBreakIterator(rLocale
, LOAD_WORD_BREAKITERATOR
, rWordType
, NULL
, Text
);
// First boundary after nStartPos.
254 result
.startPos
= word
.aBreakIterator
->following(nStartPos
);
// Nothing follows: report an empty range.
255 if( result
.startPos
>= Text
.getLength() || result
.startPos
== BreakIterator::DONE
)
256 result
.endPos
= result
.startPos
;
// For these two word types a whitespace run does not count as a word:
// if the code point at startPos is whitespace, move on to the next boundary.
258 if ( (rWordType
== WordType::ANYWORD_IGNOREWHITESPACES
||
259 rWordType
== WordType::DICTIONARY_WORD
) &&
260 u_isWhitespace(Text
.iterateCodePoints(&result
.startPos
, 0)) )
261 result
.startPos
= word
.aBreakIterator
->following(result
.startPos
);
// The word ends at the next boundary; collapse to an empty range on DONE.
263 result
.endPos
= word
.aBreakIterator
->following(result
.startPos
);
264 if(result
.endPos
== BreakIterator::DONE
)
265 result
.endPos
= result
.startPos
;
// Locates the word that precedes nStartPos and stores its limits in
// result.startPos / result.endPos.
//
// Text       text to search.
// nStartPos  index to search backward from.
// rLocale    locale used to load the word break iterator.
// rWordType  WordType constant selecting the word rules.
271 Boundary SAL_CALL
BreakIterator_Unicode::previousWord(const OUString
& Text
, sal_Int32 nStartPos
,
272 const lang::Locale
& rLocale
, sal_Int16 rWordType
) throw(uno::RuntimeException
)
274 loadICUBreakIterator(rLocale
, LOAD_WORD_BREAKITERATOR
, rWordType
, NULL
, Text
);
// Last boundary before nStartPos.
276 result
.startPos
= word
.aBreakIterator
->preceding(nStartPos
);
// Nothing precedes: report an empty range.
277 if( result
.startPos
< 0 || result
.startPos
== BreakIterator::DONE
)
278 result
.endPos
= result
.startPos
;
// For these two word types a whitespace run does not count as a word:
// if the code point at startPos is whitespace, step back one more boundary.
280 if ( (rWordType
== WordType::ANYWORD_IGNOREWHITESPACES
||
281 rWordType
== WordType::DICTIONARY_WORD
) &&
282 u_isWhitespace(Text
.iterateCodePoints(&result
.startPos
, 0)) )
283 result
.startPos
= word
.aBreakIterator
->preceding(result
.startPos
);
// The word ends at the boundary following its start; collapse on DONE.
285 result
.endPos
= word
.aBreakIterator
->following(result
.startPos
);
286 if(result
.endPos
== BreakIterator::DONE
)
287 result
.endPos
= result
.startPos
;
// Determines the limits of the word at nPos and stores them in
// result.startPos / result.endPos.
//
// Text        text to search.
// nPos        index whose surrounding word is wanted.
// rLocale     locale used to load the word break iterator.
// rWordType   WordType constant selecting the word rules.
// bDirection  when nPos itself is a boundary, true selects the word that
//             starts there (forward); nPos == 0 is also treated as forward.
293 Boundary SAL_CALL
BreakIterator_Unicode::getWordBoundary( const OUString
& Text
, sal_Int32 nPos
, const lang::Locale
& rLocale
,
294 sal_Int16 rWordType
, sal_Bool bDirection
) throw(uno::RuntimeException
)
296 loadICUBreakIterator(rLocale
, LOAD_WORD_BREAKITERATOR
, rWordType
, NULL
, Text
);
297 sal_Int32 len
= Text
.getLength();
// nPos sits exactly on a boundary: start from an empty range at nPos and
// extend it on one side only.
299 if(word
.aBreakIterator
->isBoundary(nPos
)) {
300 result
.startPos
= result
.endPos
= nPos
;
301 if((bDirection
|| nPos
== 0) && nPos
< len
) //forward
302 result
.endPos
= word
.aBreakIterator
->following(nPos
);
304 result
.startPos
= word
.aBreakIterator
->preceding(nPos
);
// End of the first word of the text, or 0 for an empty text.
308 result
.endPos
= len
? word
.aBreakIterator
->following((sal_Int32
)0) : 0;
// nPos at or beyond the end: the word starts at the last boundary before len.
309 } else if(nPos
>= len
) {
310 result
.startPos
= word
.aBreakIterator
->preceding(len
);
// nPos inside a word: take the boundaries on either side.
313 result
.startPos
= word
.aBreakIterator
->preceding(nPos
);
314 result
.endPos
= word
.aBreakIterator
->following(nPos
);
// Replace a DONE on one side with the value of the other side.
317 if (result
.startPos
== BreakIterator::DONE
)
318 result
.startPos
= result
.endPos
;
319 else if (result
.endPos
== BreakIterator::DONE
)
320 result
.endPos
= result
.startPos
;
// Moves nStartPos to the beginning of the sentence it lies in, skipping
// whitespace at the front of that sentence.
//
// Text       text to search.
// nStartPos  index inside the sentence (taken by value, adjusted locally).
// rLocale    locale used to load the sentence break iterator.
326 sal_Int32 SAL_CALL
BreakIterator_Unicode::beginOfSentence( const OUString
& Text
, sal_Int32 nStartPos
,
327 const lang::Locale
&rLocale
) throw(uno::RuntimeException
)
329 loadICUBreakIterator(rLocale
, LOAD_SENTENCE_BREAKITERATOR
, 0, "sent", Text
);
331 sal_Int32 len
= Text
.getLength();
332 if (len
> 0 && nStartPos
== len
)
333 Text
.iterateCodePoints(&nStartPos
, -1); // issue #i27703# treat end position as part of last sentence
// Not already on a sentence boundary: go back to the previous one.
334 if (!sentence
.aBreakIterator
->isBoundary(nStartPos
))
335 nStartPos
= sentence
.aBreakIterator
->preceding(nStartPos
);
337 // skip preceding space.
// Each read advances nStartPos past the code point it returns, so once the
// loop stops one step back is taken to undo the last read.
338 sal_uInt32 ch
= Text
.iterateCodePoints(&nStartPos
, 1);
339 while (nStartPos
< len
&& u_isWhitespace(ch
)) ch
= Text
.iterateCodePoints(&nStartPos
, 1);
340 Text
.iterateCodePoints(&nStartPos
, -1);
// Moves nStartPos to the end of the sentence it lies in, excluding
// whitespace at the tail of that sentence.
//
// Text       text to search.
// nStartPos  index inside the sentence (taken by value, adjusted locally).
// rLocale    locale used to load the sentence break iterator.
345 sal_Int32 SAL_CALL
BreakIterator_Unicode::endOfSentence( const OUString
& Text
, sal_Int32 nStartPos
,
346 const lang::Locale
&rLocale
) throw(uno::RuntimeException
)
348 loadICUBreakIterator(rLocale
, LOAD_SENTENCE_BREAKITERATOR
, 0, "sent", Text
);
350 sal_Int32 len
= Text
.getLength();
351 if (len
> 0 && nStartPos
== len
)
352 Text
.iterateCodePoints(&nStartPos
, -1); // issue #i27703# treat end position as part of last sentence
// Next sentence boundary after nStartPos.
353 nStartPos
= sentence
.aBreakIterator
->following(nStartPos
);
// Walk backwards from that boundary; each whitespace code point found pulls
// nStartPos back onto it.
355 sal_Int32 nPos
=nStartPos
;
356 while (nPos
> 0 && u_isWhitespace(Text
.iterateCodePoints(&nPos
, -1))) nStartPos
=nPos
;
// Works out where a line may be broken at or before nStartPos and fills in a
// LineBreakResults (breakIndex, breakType and, for hyphenation,
// rHyphenatedWord).
//
// Text          text being laid out.
// nStartPos     index at which the line would otherwise have to end.
// rLocale       locale used for the line iterator and for hyphenation.
// nMinBreakPos  lower limit compared against the hyphenation position.
// hOptions      hyphenation settings: rHyphenator, hyphenIndex and
//               aHyphenationOptions are read.
// (last param)  LineBreakUserOptions, unnamed and unused here.
361 LineBreakResults SAL_CALL
BreakIterator_Unicode::getLineBreak(
362 const OUString
& Text
, sal_Int32 nStartPos
,
363 const lang::Locale
& rLocale
, sal_Int32 nMinBreakPos
,
364 const LineBreakHyphenationOptions
& hOptions
,
365 const LineBreakUserOptions
& /*rOptions*/ ) throw(uno::RuntimeException
)
367 LineBreakResults lbr
;
// At or past the end of the text: the break is the text end.
369 if (nStartPos
>= Text
.getLength()) {
370 lbr
.breakIndex
= Text
.getLength();
371 lbr
.breakType
= BreakType::WORDBOUNDARY
;
375 loadICUBreakIterator(rLocale
, LOAD_LINE_BREAKITERATOR
, 0, lineRule
, Text
);
377 sal_Bool GlueSpace
=sal_True
;
// preceding(nStartPos + 1) == nStartPos means nStartPos itself is a line
// break opportunity.
379 if (line
.aBreakIterator
->preceding(nStartPos
+ 1) == nStartPos
) { //Line boundary break
380 lbr
.breakIndex
= nStartPos
;
381 lbr
.breakType
= BreakType::WORDBOUNDARY
;
382 } else if (hOptions
.rHyphenator
.is()) { //Hyphenation break
// Find the dictionary word around nStartPos and ask the hyphenator to split
// it; hyphenIndex is made relative to the start of that word.
383 Boundary wBoundary
= getWordBoundary( Text
, nStartPos
, rLocale
,
384 WordType::DICTIONARY_WORD
, false);
385 uno::Reference
< linguistic2::XHyphenatedWord
> aHyphenatedWord
;
386 aHyphenatedWord
= hOptions
.rHyphenator
->hyphenate(Text
.copy(wBoundary
.startPos
,
387 wBoundary
.endPos
- wBoundary
.startPos
), rLocale
,
388 (sal_Int16
) (hOptions
.hyphenIndex
- wBoundary
.startPos
), hOptions
.aHyphenationOptions
);
389 if (aHyphenatedWord
.is()) {
390 lbr
.rHyphenatedWord
= aHyphenatedWord
;
// Absolute hyphenation position compared with the caller's minimum.
391 if(wBoundary
.startPos
+ aHyphenatedWord
->getHyphenationPos() + 1 < nMinBreakPos
)
394 lbr
.breakIndex
= wBoundary
.startPos
; //aHyphenatedWord->getHyphenationPos();
395 lbr
.breakType
= BreakType::HYPHENATION
;
// Break at the previous line break opportunity instead.
397 lbr
.breakIndex
= line
.aBreakIterator
->preceding(nStartPos
);
398 lbr
.breakType
= BreakType::WORDBOUNDARY
;;
400 } else { //word boundary break
401 lbr
.breakIndex
= line
.aBreakIterator
->preceding(nStartPos
);
402 lbr
.breakType
= BreakType::WORDBOUNDARY
;
405 #define WJ 0x2060 // Word Joiner
// For word-boundary breaks, look at the character at the break index and
// then scan backwards over whitespace and WORD JOINER (U+2060) characters.
407 if (lbr
.breakType
== BreakType::WORDBOUNDARY
) {
408 nStartPos
= lbr
.breakIndex
;
409 if (Text
[nStartPos
--] == WJ
)
411 while (nStartPos
>= 0 &&
412 (u_isWhitespace(Text
.iterateCodePoints(&nStartPos
, 0)) || Text
[nStartPos
] == WJ
)) {
413 if (Text
[nStartPos
--] == WJ
)
// The backward scan ran off the start of the text while GlueSpace was still set.
416 if (GlueSpace
&& nStartPos
< 0) {
// Returns the implementation name held in cBreakIterator, converted from
// ASCII to an OUString.
429 BreakIterator_Unicode::getImplementationName(void) throw( uno::RuntimeException
)
431 return OUString::createFromAscii(cBreakIterator
);
// Reports whether rServiceName matches the name held in cBreakIterator.
// compareToAscii() yields 0 for equal strings, hence the negation.
435 BreakIterator_Unicode::supportsService(const OUString
& rServiceName
) throw( uno::RuntimeException
)
437 return !rServiceName
.compareToAscii(cBreakIterator
);
// Builds a one-element sequence whose only entry is the name held in
// cBreakIterator.
440 uno::Sequence
< OUString
> SAL_CALL
441 BreakIterator_Unicode::getSupportedServiceNames(void) throw( uno::RuntimeException
)
443 uno::Sequence
< OUString
> aRet(1);
444 aRet
[0] = OUString::createFromAscii(cBreakIterator
);