1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
3 * This file is part of the LibreOffice project.
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 * This file incorporates work covered by the following license notice:
11 * Licensed to the Apache Software Foundation (ASF) under one or more
12 * contributor license agreements. See the NOTICE file distributed
13 * with this work for additional information regarding copyright
14 * ownership. The ASF licenses this file to you under the Apache
15 * License, Version 2.0 (the "License"); you may not use this file
16 * except in compliance with the License. You may obtain a copy of
17 * the License at http://www.apache.org/licenses/LICENSE-2.0 .
20 #include <breakiterator_unicode.hxx>
21 #include <cppuhelper/supportsservice.hxx>
22 #include <localedata.hxx>
23 #include <i18nlangtag/languagetag.hxx>
24 #include <i18nlangtag/languagetagicu.hxx>
25 #include <unicode/uchar.h>
26 #include <unicode/locid.h>
27 #include <unicode/rbbi.h>
28 #include <unicode/udata.h>
29 #include <rtl/strbuf.hxx>
30 #include <rtl/ustring.hxx>
32 #include <com/sun/star/i18n/BreakType.hpp>
33 #include <com/sun/star/i18n/CharacterIteratorMode.hpp>
34 #include <com/sun/star/i18n/WordType.hpp>
37 extern const char OpenOffice_dat
[];
40 using namespace ::com::sun::star
;
41 using namespace ::com::sun::star::i18n
;
42 using namespace ::com::sun::star::lang
;
46 // Cache map of breakiterators, stores state information so has to be
48 thread_local
static BreakIterator_Unicode::BIMap theBIMap
;
50 BreakIterator_Unicode::BreakIterator_Unicode()
51 : cBreakIterator( "com.sun.star.i18n.BreakIterator_Unicode" ) // implementation name
57 BreakIterator_Unicode::~BreakIterator_Unicode()
62 Wrapper class to provide public access to the icu::RuleBasedBreakIterator's
65 class OOoRuleBasedBreakIterator
: public icu::RuleBasedBreakIterator
68 #if (U_ICU_VERSION_MAJOR_NUM < 58)
69 // icu::RuleBasedBreakIterator::setBreakType() is private as of ICU 58.
70 void publicSetBreakType(int32_t type
)
75 OOoRuleBasedBreakIterator(UDataMemory
* image
,
77 : icu::RuleBasedBreakIterator(image
, status
)
82 // loading ICU breakiterator on demand.
83 void BreakIterator_Unicode::loadICUBreakIterator(const css::lang::Locale
& rLocale
,
84 sal_Int16 rBreakType
, sal_Int16 nWordType
, const sal_Char
*rule
, const OUString
& rText
)
86 bool bNewBreak
= false;
87 UErrorCode status
= U_ZERO_ERROR
;
88 sal_Int16 breakType
= 0;
90 case LOAD_CHARACTER_BREAKITERATOR
: icuBI
=&character
; breakType
= 3; break;
91 case LOAD_WORD_BREAKITERATOR
:
92 assert (nWordType
>= 0 && nWordType
<= WordType::WORD_COUNT
);
93 icuBI
=&words
[nWordType
];
95 case WordType::ANY_WORD
: break; // odd but previous behavior
96 case WordType::ANYWORD_IGNOREWHITESPACES
:
97 breakType
= 0; rule
= "edit_word"; break;
98 case WordType::DICTIONARY_WORD
:
99 breakType
= 1; rule
= "dict_word"; break;
101 case WordType::WORD_COUNT
:
102 breakType
= 2; rule
= "count_word"; break;
105 case LOAD_SENTENCE_BREAKITERATOR
: icuBI
=&sentence
; breakType
= 5; break;
106 case LOAD_LINE_BREAKITERATOR
: icuBI
=&line
; breakType
= 4; break;
109 // Using the cache map prevents accessing the file system for each
110 // udata_open() where ICU tries first files then data objects. And that for
111 // two fallbacks worst case... for each new allocated EditEngine, layout
112 // cell, ... *ouch* Also non-rule locale based iterators can be mapped.
113 // This also speeds up loading iterators for alternating or generally more
114 // than one language/locale in that iterators are not constructed and
115 // destroyed en masse.
116 // Four possible keys, locale rule based with break type, locale rule based
117 // only, rule based only, locale based with break type. A fifth global key
118 // for the initial lookup.
119 // Multiple global keys may map to identical value data.
120 // All enums used here should be in the range 0..9 so assert that and avoid
121 // expensive numeric conversion in append() for faster construction of the
122 // always used global key.
123 assert( 0 <= breakType
&& breakType
<= 9 && 0 <= rBreakType
&& rBreakType
<= 9 && 0 <= nWordType
&& nWordType
<= 9);
124 const OString
aLangtagStr( LanguageTag::convertToBcp47( rLocale
).toUtf8());
125 OStringBuffer
aKeyBuf(64);
126 aKeyBuf
.append( aLangtagStr
).append(';');
128 aKeyBuf
.append(rule
);
129 aKeyBuf
.append(';').append( static_cast<sal_Char
>('0'+breakType
)).append(';').
130 append( static_cast<sal_Char
>('0'+rBreakType
)).append(';').append( static_cast<sal_Char
>('0'+nWordType
));
131 // langtag;rule;breakType;rBreakType;nWordType
132 const OString
aBIMapGlobalKey( aKeyBuf
.makeStringAndClear());
134 if (icuBI
->maBIMapKey
!= aBIMapGlobalKey
|| !icuBI
->mpValue
|| !icuBI
->mpValue
->mpBreakIterator
)
137 auto aMapIt( theBIMap
.find( aBIMapGlobalKey
));
138 bool bInMap
= (aMapIt
!= theBIMap
.end());
140 icuBI
->mpValue
= aMapIt
->second
;
142 icuBI
->mpValue
.reset();
144 if (!bInMap
&& rule
) do {
145 const uno::Sequence
< OUString
> breakRules
= LocaleDataImpl::get()->getBreakIteratorRules(rLocale
);
147 status
= U_ZERO_ERROR
;
148 udata_setAppData("OpenOffice", OpenOffice_dat
, &status
);
149 if ( !U_SUCCESS(status
) ) throw uno::RuntimeException();
151 std::unique_ptr
<OOoRuleBasedBreakIterator
> rbi
;
153 if (breakRules
.getLength() > breakType
&& !breakRules
[breakType
].isEmpty())
155 // langtag;rule;breakType
156 const OString
aBIMapRuleTypeKey( aLangtagStr
+ ";" + rule
+ ";" + OString::number(breakType
));
157 aMapIt
= theBIMap
.find( aBIMapRuleTypeKey
);
158 bInMap
= (aMapIt
!= theBIMap
.end());
161 icuBI
->mpValue
= aMapIt
->second
;
162 icuBI
->maBIMapKey
= aBIMapGlobalKey
;
163 theBIMap
.insert( std::make_pair( aBIMapGlobalKey
, icuBI
->mpValue
));
167 rbi
.reset(new OOoRuleBasedBreakIterator(udata_open("OpenOffice", "brk",
168 OUStringToOString(breakRules
[breakType
], RTL_TEXTENCODING_ASCII_US
).getStr(), &status
), status
));
170 if (U_SUCCESS(status
))
172 icuBI
->mpValue
.reset( new BI_ValueData
);
173 icuBI
->mpValue
->mpBreakIterator
= std::move( rbi
);
174 theBIMap
.insert( std::make_pair( aBIMapRuleTypeKey
, icuBI
->mpValue
));
181 //use icu's breakiterator for Thai, Tibetan and Dzongkha
182 else if (rLocale
.Language
!= "th" && rLocale
.Language
!= "lo" && rLocale
.Language
!= "bo" && rLocale
.Language
!= "dz" && rLocale
.Language
!= "km")
184 // language;rule (not langtag, unless we'd actually load such)
185 OString
aLanguage( LanguageTag( rLocale
).getLanguage().toUtf8());
186 const OString
aBIMapRuleKey( aLanguage
+ ";" + rule
);
187 aMapIt
= theBIMap
.find( aBIMapRuleKey
);
188 bInMap
= (aMapIt
!= theBIMap
.end());
191 icuBI
->mpValue
= aMapIt
->second
;
192 icuBI
->maBIMapKey
= aBIMapGlobalKey
;
193 theBIMap
.insert( std::make_pair( aBIMapGlobalKey
, icuBI
->mpValue
));
197 status
= U_ZERO_ERROR
;
198 OString aUDName
= rtl::OStringView(rule
) + "_" + aLanguage
;
199 UDataMemory
* pUData
= udata_open("OpenOffice", "brk", aUDName
.getStr(), &status
);
200 if( U_SUCCESS(status
) )
201 rbi
.reset(new OOoRuleBasedBreakIterator( pUData
, status
));
202 if ( U_SUCCESS(status
) )
204 icuBI
->mpValue
.reset( new BI_ValueData
);
205 icuBI
->mpValue
->mpBreakIterator
= std::move( rbi
);
206 theBIMap
.insert( std::make_pair( aBIMapRuleKey
, icuBI
->mpValue
));
213 const OString
aBIMapRuleOnlyKey( OStringLiteral(";") + rule
);
214 aMapIt
= theBIMap
.find( aBIMapRuleOnlyKey
);
215 bInMap
= (aMapIt
!= theBIMap
.end());
218 icuBI
->mpValue
= aMapIt
->second
;
219 icuBI
->maBIMapKey
= aBIMapGlobalKey
;
220 theBIMap
.insert( std::make_pair( aBIMapGlobalKey
, icuBI
->mpValue
));
224 status
= U_ZERO_ERROR
;
225 pUData
= udata_open("OpenOffice", "brk", rule
, &status
);
226 if( U_SUCCESS(status
) )
227 rbi
.reset(new OOoRuleBasedBreakIterator( pUData
, status
));
228 if ( U_SUCCESS(status
) )
230 icuBI
->mpValue
.reset( new BI_ValueData
);
231 icuBI
->mpValue
->mpBreakIterator
= std::move( rbi
);
232 theBIMap
.insert( std::make_pair( aBIMapRuleOnlyKey
, icuBI
->mpValue
));
241 #if (U_ICU_VERSION_MAJOR_NUM < 58)
242 // ICU 58 made RuleBasedBreakIterator::setBreakType() private
243 // instead of protected, so the old workaround of
244 // https://ssl.icu-project.org/trac/ticket/5498
245 // doesn't work anymore. However, they also claim to have fixed
246 // the cause that an initial fBreakType==-1 would lead to an
247 // endless loop under some circumstances.
249 switch (rBreakType
) {
250 case LOAD_CHARACTER_BREAKITERATOR
: rbi
->publicSetBreakType(UBRK_CHARACTER
); break;
251 case LOAD_WORD_BREAKITERATOR
: rbi
->publicSetBreakType(UBRK_WORD
); break;
252 case LOAD_SENTENCE_BREAKITERATOR
: rbi
->publicSetBreakType(UBRK_SENTENCE
); break;
253 case LOAD_LINE_BREAKITERATOR
: rbi
->publicSetBreakType(UBRK_LINE
); break;
259 if (!icuBI
->mpValue
|| !icuBI
->mpValue
->mpBreakIterator
) do {
260 // langtag;;;rBreakType (empty rule; empty breakType)
261 const OString
aBIMapLocaleTypeKey( aLangtagStr
+ ";;;" + OString::number(rBreakType
));
262 aMapIt
= theBIMap
.find( aBIMapLocaleTypeKey
);
263 bInMap
= (aMapIt
!= theBIMap
.end());
266 icuBI
->mpValue
= aMapIt
->second
;
267 icuBI
->maBIMapKey
= aBIMapGlobalKey
;
268 theBIMap
.insert( std::make_pair( aBIMapGlobalKey
, icuBI
->mpValue
));
272 icu::Locale
icuLocale( LanguageTagIcu::getIcuLocale( LanguageTag( rLocale
)));
273 std::shared_ptr
< icu::BreakIterator
> pBI
;
275 status
= U_ZERO_ERROR
;
276 switch (rBreakType
) {
277 case LOAD_CHARACTER_BREAKITERATOR
:
278 pBI
.reset( icu::BreakIterator::createCharacterInstance(icuLocale
, status
) );
280 case LOAD_WORD_BREAKITERATOR
:
281 pBI
.reset( icu::BreakIterator::createWordInstance(icuLocale
, status
) );
283 case LOAD_SENTENCE_BREAKITERATOR
:
284 pBI
.reset( icu::BreakIterator::createSentenceInstance(icuLocale
, status
) );
286 case LOAD_LINE_BREAKITERATOR
:
287 pBI
.reset( icu::BreakIterator::createLineInstance(icuLocale
, status
) );
290 if ( !U_SUCCESS(status
) || !pBI
) {
291 throw uno::RuntimeException();
293 icuBI
->mpValue
.reset( new BI_ValueData
);
294 icuBI
->mpValue
->mpBreakIterator
= pBI
;
295 theBIMap
.insert( std::make_pair( aBIMapLocaleTypeKey
, icuBI
->mpValue
));
297 if (!icuBI
->mpValue
|| !icuBI
->mpValue
->mpBreakIterator
) {
298 throw uno::RuntimeException();
300 icuBI
->maBIMapKey
= aBIMapGlobalKey
;
302 theBIMap
.insert( std::make_pair( aBIMapGlobalKey
, icuBI
->mpValue
));
306 if (bNewBreak
|| icuBI
->mpValue
->maICUText
.pData
!= rText
.pData
)
308 const UChar
*pText
= reinterpret_cast<const UChar
*>(rText
.getStr());
310 status
= U_ZERO_ERROR
;
311 icuBI
->mpValue
->mpUt
= utext_openUChars(icuBI
->mpValue
->mpUt
, pText
, rText
.getLength(), &status
);
313 if (!U_SUCCESS(status
))
314 throw uno::RuntimeException();
316 icuBI
->mpValue
->mpBreakIterator
->setText(icuBI
->mpValue
->mpUt
, status
);
318 if (!U_SUCCESS(status
))
319 throw uno::RuntimeException();
321 icuBI
->mpValue
->maICUText
= rText
;
325 sal_Int32 SAL_CALL
BreakIterator_Unicode::nextCharacters( const OUString
& Text
,
326 sal_Int32 nStartPos
, const lang::Locale
&rLocale
,
327 sal_Int16 nCharacterIteratorMode
, sal_Int32 nCount
, sal_Int32
& nDone
)
329 if (nCharacterIteratorMode
== CharacterIteratorMode::SKIPCELL
) { // for CELL mode
330 loadICUBreakIterator(rLocale
, LOAD_CHARACTER_BREAKITERATOR
, 0, "char", Text
);
331 icu::BreakIterator
* pBI
= character
.mpValue
->mpBreakIterator
.get();
332 for (nDone
= 0; nDone
< nCount
; nDone
++) {
333 nStartPos
= pBI
->following(nStartPos
);
334 if (nStartPos
== icu::BreakIterator::DONE
)
335 return Text
.getLength();
337 } else { // for CHARACTER mode
338 for (nDone
= 0; nDone
< nCount
&& nStartPos
< Text
.getLength(); nDone
++)
339 Text
.iterateCodePoints(&nStartPos
);
344 sal_Int32 SAL_CALL
BreakIterator_Unicode::previousCharacters( const OUString
& Text
,
345 sal_Int32 nStartPos
, const lang::Locale
& rLocale
,
346 sal_Int16 nCharacterIteratorMode
, sal_Int32 nCount
, sal_Int32
& nDone
)
348 if (nCharacterIteratorMode
== CharacterIteratorMode::SKIPCELL
) { // for CELL mode
349 loadICUBreakIterator(rLocale
, LOAD_CHARACTER_BREAKITERATOR
, 0, "char", Text
);
350 icu::BreakIterator
* pBI
= character
.mpValue
->mpBreakIterator
.get();
351 for (nDone
= 0; nDone
< nCount
; nDone
++) {
352 nStartPos
= pBI
->preceding(nStartPos
);
353 if (nStartPos
== icu::BreakIterator::DONE
)
356 } else { // for BS to delete one char and CHARACTER mode.
357 for (nDone
= 0; nDone
< nCount
&& nStartPos
> 0; nDone
++)
358 Text
.iterateCodePoints(&nStartPos
, -1);
364 Boundary SAL_CALL
BreakIterator_Unicode::nextWord( const OUString
& Text
, sal_Int32 nStartPos
,
365 const lang::Locale
& rLocale
, sal_Int16 rWordType
)
367 loadICUBreakIterator(rLocale
, LOAD_WORD_BREAKITERATOR
, rWordType
, nullptr, Text
);
370 rv
.startPos
= icuBI
->mpValue
->mpBreakIterator
->following(nStartPos
);
371 if( rv
.startPos
>= Text
.getLength() || rv
.startPos
== icu::BreakIterator::DONE
)
372 rv
.endPos
= result
.startPos
;
374 if ( (rWordType
== WordType::ANYWORD_IGNOREWHITESPACES
||
375 rWordType
== WordType::DICTIONARY_WORD
) &&
376 u_isWhitespace(Text
.iterateCodePoints(&rv
.startPos
, 0)) )
377 rv
.startPos
= icuBI
->mpValue
->mpBreakIterator
->following(rv
.startPos
);
379 rv
.endPos
= icuBI
->mpValue
->mpBreakIterator
->following(rv
.startPos
);
380 if(rv
.endPos
== icu::BreakIterator::DONE
)
381 rv
.endPos
= rv
.startPos
;
387 Boundary SAL_CALL
BreakIterator_Unicode::previousWord(const OUString
& Text
, sal_Int32 nStartPos
,
388 const lang::Locale
& rLocale
, sal_Int16 rWordType
)
390 loadICUBreakIterator(rLocale
, LOAD_WORD_BREAKITERATOR
, rWordType
, nullptr, Text
);
393 rv
.startPos
= icuBI
->mpValue
->mpBreakIterator
->preceding(nStartPos
);
395 rv
.endPos
= rv
.startPos
;
397 if ( (rWordType
== WordType::ANYWORD_IGNOREWHITESPACES
||
398 rWordType
== WordType::DICTIONARY_WORD
) &&
399 u_isWhitespace(Text
.iterateCodePoints(&rv
.startPos
, 0)) )
400 rv
.startPos
= icuBI
->mpValue
->mpBreakIterator
->preceding(rv
.startPos
);
402 rv
.endPos
= icuBI
->mpValue
->mpBreakIterator
->following(rv
.startPos
);
403 if(rv
.endPos
== icu::BreakIterator::DONE
)
404 rv
.endPos
= rv
.startPos
;
410 Boundary SAL_CALL
BreakIterator_Unicode::getWordBoundary( const OUString
& Text
, sal_Int32 nPos
, const lang::Locale
& rLocale
,
411 sal_Int16 rWordType
, sal_Bool bDirection
)
413 loadICUBreakIterator(rLocale
, LOAD_WORD_BREAKITERATOR
, rWordType
, nullptr, Text
);
414 sal_Int32 len
= Text
.getLength();
417 if(icuBI
->mpValue
->mpBreakIterator
->isBoundary(nPos
)) {
418 rv
.startPos
= rv
.endPos
= nPos
;
419 if((bDirection
|| nPos
== 0) && nPos
< len
) //forward
420 rv
.endPos
= icuBI
->mpValue
->mpBreakIterator
->following(nPos
);
422 rv
.startPos
= icuBI
->mpValue
->mpBreakIterator
->preceding(nPos
);
426 rv
.endPos
= len
? icuBI
->mpValue
->mpBreakIterator
->following(sal_Int32(0)) : 0;
427 } else if(nPos
>= len
) {
428 rv
.startPos
= icuBI
->mpValue
->mpBreakIterator
->preceding(len
);
431 rv
.startPos
= icuBI
->mpValue
->mpBreakIterator
->preceding(nPos
);
432 rv
.endPos
= icuBI
->mpValue
->mpBreakIterator
->following(nPos
);
435 if (rv
.startPos
== icu::BreakIterator::DONE
)
436 rv
.startPos
= rv
.endPos
;
437 else if (rv
.endPos
== icu::BreakIterator::DONE
)
438 rv
.endPos
= rv
.startPos
;
444 sal_Int32 SAL_CALL
BreakIterator_Unicode::beginOfSentence( const OUString
& Text
, sal_Int32 nStartPos
,
445 const lang::Locale
&rLocale
)
447 loadICUBreakIterator(rLocale
, LOAD_SENTENCE_BREAKITERATOR
, 0, "sent", Text
);
449 sal_Int32 len
= Text
.getLength();
450 if (len
> 0 && nStartPos
== len
)
451 Text
.iterateCodePoints(&nStartPos
, -1); // issue #i27703# treat end position as part of last sentence
452 if (!sentence
.mpValue
->mpBreakIterator
->isBoundary(nStartPos
))
453 nStartPos
= sentence
.mpValue
->mpBreakIterator
->preceding(nStartPos
);
455 // skip preceding space.
456 sal_uInt32 ch
= Text
.iterateCodePoints(&nStartPos
);
457 while (nStartPos
< len
&& u_isWhitespace(ch
)) ch
= Text
.iterateCodePoints(&nStartPos
);
458 Text
.iterateCodePoints(&nStartPos
, -1);
463 sal_Int32 SAL_CALL
BreakIterator_Unicode::endOfSentence( const OUString
& Text
, sal_Int32 nStartPos
,
464 const lang::Locale
&rLocale
)
466 loadICUBreakIterator(rLocale
, LOAD_SENTENCE_BREAKITERATOR
, 0, "sent", Text
);
468 sal_Int32 len
= Text
.getLength();
469 if (len
> 0 && nStartPos
== len
)
470 Text
.iterateCodePoints(&nStartPos
, -1); // issue #i27703# treat end position as part of last sentence
471 nStartPos
= sentence
.mpValue
->mpBreakIterator
->following(nStartPos
);
473 sal_Int32 nPos
=nStartPos
;
474 while (nPos
> 0 && u_isWhitespace(Text
.iterateCodePoints(&nPos
, -1))) nStartPos
=nPos
;
479 LineBreakResults SAL_CALL
BreakIterator_Unicode::getLineBreak(
480 const OUString
& Text
, sal_Int32 nStartPos
,
481 const lang::Locale
& rLocale
, sal_Int32 nMinBreakPos
,
482 const LineBreakHyphenationOptions
& hOptions
,
483 const LineBreakUserOptions
& /*rOptions*/ )
485 LineBreakResults lbr
;
487 if (nStartPos
>= Text
.getLength()) {
488 lbr
.breakIndex
= Text
.getLength();
489 lbr
.breakType
= BreakType::WORDBOUNDARY
;
493 loadICUBreakIterator(rLocale
, LOAD_LINE_BREAKITERATOR
, 0, lineRule
, Text
);
495 icu::BreakIterator
* pLineBI
= line
.mpValue
->mpBreakIterator
.get();
498 if (pLineBI
->preceding(nStartPos
+ 1) == nStartPos
) { //Line boundary break
499 lbr
.breakIndex
= nStartPos
;
500 lbr
.breakType
= BreakType::WORDBOUNDARY
;
501 } else if (hOptions
.rHyphenator
.is()) { //Hyphenation break
502 sal_Int32 boundary_with_punctuation
= (pLineBI
->next() != icu::BreakIterator::DONE
) ? pLineBI
->current() : 0;
503 pLineBI
->preceding(nStartPos
+ 1); // reset to check correct hyphenation of "word-word"
505 sal_Int32 nStartPosWordEnd
= nStartPos
;
506 while (pLineBI
->current() < nStartPosWordEnd
&& u_ispunct(static_cast<sal_uInt32
>(Text
[nStartPosWordEnd
]))) // starting punctuation
509 Boundary wBoundary
= getWordBoundary( Text
, nStartPosWordEnd
, rLocale
,
510 WordType::DICTIONARY_WORD
, false);
512 nStartPosWordEnd
= wBoundary
.endPos
;
513 while (nStartPosWordEnd
< Text
.getLength() && (u_ispunct(static_cast<sal_uInt32
>(Text
[nStartPosWordEnd
])))) // ending punctuation
515 nStartPosWordEnd
= nStartPosWordEnd
- wBoundary
.endPos
;
516 if (hOptions
.hyphenIndex
- wBoundary
.startPos
< nStartPosWordEnd
) nStartPosWordEnd
= hOptions
.hyphenIndex
- wBoundary
.startPos
;
518 while (boundary_with_punctuation
> wBoundary
.endPos
&& Text
[--boundary_with_punctuation
] == SPACE
);
519 uno::Reference
< linguistic2::XHyphenatedWord
> aHyphenatedWord
= hOptions
.rHyphenator
->hyphenate(Text
.copy(wBoundary
.startPos
,
520 wBoundary
.endPos
- wBoundary
.startPos
), rLocale
,
521 static_cast<sal_Int16
>(hOptions
.hyphenIndex
- wBoundary
.startPos
- ((hOptions
.hyphenIndex
== wBoundary
.endPos
)? nStartPosWordEnd
: 0)), hOptions
.aHyphenationOptions
);
522 if (aHyphenatedWord
.is()) {
523 lbr
.rHyphenatedWord
= aHyphenatedWord
;
524 if(wBoundary
.startPos
+ aHyphenatedWord
->getHyphenationPos() + 1 < nMinBreakPos
)
527 lbr
.breakIndex
= wBoundary
.startPos
; //aHyphenatedWord->getHyphenationPos();
528 lbr
.breakType
= BreakType::HYPHENATION
;
530 // check not optimal hyphenation of "word-word" (word with hyphens)
531 if (lbr
.breakIndex
> -1 && wBoundary
.startPos
+ aHyphenatedWord
->getHyphenationPos() < pLineBI
->current()) {
532 lbr
.breakIndex
= pLineBI
->current();
533 lbr
.breakType
= BreakType::WORDBOUNDARY
;
537 lbr
.breakIndex
= pLineBI
->preceding(nStartPos
);
538 lbr
.breakType
= BreakType::WORDBOUNDARY
;
540 } else { //word boundary break
541 lbr
.breakIndex
= pLineBI
->preceding(nStartPos
);
542 lbr
.breakType
= BreakType::WORDBOUNDARY
;
544 // Special case for Slash U+002F SOLIDUS in URI and path names.
545 // TR14 defines that as SY: Symbols Allowing Break After (A).
546 // This is unwanted in paths, see also i#17155
547 if (lbr
.breakIndex
> 0 && Text
[lbr
.breakIndex
-1] == '/')
549 // Look backward and take any whitespace before as a break
550 // opportunity. This also glues something like "w/o".
551 // Avoid an overly long path and break it as was indicated.
552 // Overly long here is arbitrarily defined.
553 const sal_Int32 nOverlyLong
= 66;
554 sal_Int32 nPos
= lbr
.breakIndex
- 1;
555 while (nPos
> 0 && lbr
.breakIndex
- nPos
< nOverlyLong
)
557 if (u_isWhitespace(Text
.iterateCodePoints( &nPos
, -1)))
559 lbr
.breakIndex
= nPos
+ 1;
566 #define WJ 0x2060 // Word Joiner
568 if (lbr
.breakType
== BreakType::WORDBOUNDARY
) {
569 nStartPos
= lbr
.breakIndex
;
570 if (nStartPos
>= 0 && Text
[nStartPos
--] == WJ
)
572 while (nStartPos
>= 0 &&
573 (u_isWhitespace(Text
.iterateCodePoints(&nStartPos
, 0)) || Text
[nStartPos
] == WJ
)) {
574 if (Text
[nStartPos
--] == WJ
)
577 if (GlueSpace
&& nStartPos
< 0) {
588 BreakIterator_Unicode::getImplementationName()
590 return OUString::createFromAscii(cBreakIterator
);
594 BreakIterator_Unicode::supportsService(const OUString
& rServiceName
)
596 return cppu::supportsService(this, rServiceName
);
599 uno::Sequence
< OUString
> SAL_CALL
600 BreakIterator_Unicode::getSupportedServiceNames()
602 uno::Sequence
< OUString
> aRet
{ OUString::createFromAscii(cBreakIterator
) };
608 extern "C" SAL_DLLPUBLIC_EXPORT
css::uno::XInterface
*
609 com_sun_star_i18n_BreakIterator_Unicode_get_implementation(
610 css::uno::XComponentContext
*,
611 css::uno::Sequence
<css::uno::Any
> const &)
613 return cppu::acquire(new i18npool::BreakIterator_Unicode());
616 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */