1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
3 * This file is part of the LibreOffice project.
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 * This file incorporates work covered by the following license notice:
11 * Licensed to the Apache Software Foundation (ASF) under one or more
12 * contributor license agreements. See the NOTICE file distributed
13 * with this work for additional information regarding copyright
14 * ownership. The ASF licenses this file to you under the Apache
15 * License, Version 2.0 (the "License"); you may not use this file
16 * except in compliance with the License. You may obtain a copy of
17 * the License at http://www.apache.org/licenses/LICENSE-2.0 .
20 #include <breakiterator_unicode.hxx>
21 #include <cppuhelper/supportsservice.hxx>
22 #include <localedata.hxx>
23 #include <i18nlangtag/languagetag.hxx>
24 #include <i18nlangtag/languagetagicu.hxx>
25 #include <unicode/uchar.h>
26 #include <unicode/locid.h>
27 #include <unicode/rbbi.h>
28 #include <unicode/udata.h>
29 #include <rtl/strbuf.hxx>
30 #include <rtl/ustring.hxx>
34 extern const char OpenOffice_dat
[];
37 using namespace ::com::sun::star
;
38 using namespace ::com::sun::star::i18n
;
39 using namespace ::com::sun::star::lang
;
43 // Cache map of breakiterators, stores state information so has to be
45 thread_local
static BreakIterator_Unicode::BIMap theBIMap
;
47 BreakIterator_Unicode::BreakIterator_Unicode()
48 : cBreakIterator( "com.sun.star.i18n.BreakIterator_Unicode" ) // implementation name
54 BreakIterator_Unicode::~BreakIterator_Unicode()
59 Wrapper class to provide public access to the icu::RuleBasedBreakIterator's
62 class OOoRuleBasedBreakIterator
: public icu::RuleBasedBreakIterator
65 #if (U_ICU_VERSION_MAJOR_NUM < 58)
66 // icu::RuleBasedBreakIterator::setBreakType() is private as of ICU 58.
67 void publicSetBreakType(int32_t type
)
72 OOoRuleBasedBreakIterator(UDataMemory
* image
,
74 : icu::RuleBasedBreakIterator(image
, status
)
79 // loading ICU breakiterator on demand.
80 void BreakIterator_Unicode::loadICUBreakIterator(const css::lang::Locale
& rLocale
,
81 sal_Int16 rBreakType
, sal_Int16 nWordType
, const sal_Char
*rule
, const OUString
& rText
)
83 bool bNewBreak
= false;
84 UErrorCode status
= U_ZERO_ERROR
;
85 sal_Int16 breakType
= 0;
87 case LOAD_CHARACTER_BREAKITERATOR
: icuBI
=&character
; breakType
= 3; break;
88 case LOAD_WORD_BREAKITERATOR
:
89 assert (nWordType
>= 0 && nWordType
<= WordType::WORD_COUNT
);
90 icuBI
=&words
[nWordType
];
92 case WordType::ANY_WORD
: break; // odd but previous behavior
93 case WordType::ANYWORD_IGNOREWHITESPACES
:
94 breakType
= 0; rule
= "edit_word"; break;
95 case WordType::DICTIONARY_WORD
:
96 breakType
= 1; rule
= "dict_word"; break;
98 case WordType::WORD_COUNT
:
99 breakType
= 2; rule
= "count_word"; break;
102 case LOAD_SENTENCE_BREAKITERATOR
: icuBI
=&sentence
; breakType
= 5; break;
103 case LOAD_LINE_BREAKITERATOR
: icuBI
=&line
; breakType
= 4; break;
106 // Using the cache map prevents accessing the file system for each
107 // udata_open() where ICU tries first files then data objects. And that for
108 // two fallbacks worst case.. for each new allocated EditEngine, layout
109 // cell, ... *ouch* Also non-rule locale based iterators can be mapped.
110 // This also speeds up loading iterators for alternating or generally more
111 // than one language/locale in that iterators are not constructed and
112 // destroyed en masse.
113 // Four possible keys, locale rule based with break type, locale rule based
114 // only, rule based only, locale based with break type. A fifth global key
115 // for the initial lookup.
116 // Multiple global keys may map to identical value data.
117 // All enums used here should be in the range 0..9 so assert that and avoid
118 // expensive numeric conversion in append() for faster construction of the
119 // always used global key.
120 assert( 0 <= breakType
&& breakType
<= 9 && 0 <= rBreakType
&& rBreakType
<= 9 && 0 <= nWordType
&& nWordType
<= 9);
121 const OString
aLangtagStr( LanguageTag::convertToBcp47( rLocale
).toUtf8());
122 OStringBuffer
aKeyBuf(64);
123 aKeyBuf
.append( aLangtagStr
).append(';');
125 aKeyBuf
.append(rule
);
126 aKeyBuf
.append(';').append( static_cast<sal_Char
>('0'+breakType
)).append(';').
127 append( static_cast<sal_Char
>('0'+rBreakType
)).append(';').append( static_cast<sal_Char
>('0'+nWordType
));
128 // langtag;rule;breakType;rBreakType;nWordType
129 const OString
aBIMapGlobalKey( aKeyBuf
.makeStringAndClear());
131 if (icuBI
->maBIMapKey
!= aBIMapGlobalKey
|| !icuBI
->mpValue
|| !icuBI
->mpValue
->mpBreakIterator
)
134 auto aMapIt( theBIMap
.find( aBIMapGlobalKey
));
135 bool bInMap
= (aMapIt
!= theBIMap
.end());
137 icuBI
->mpValue
= aMapIt
->second
;
139 icuBI
->mpValue
.reset();
141 if (!bInMap
&& rule
) do {
142 uno::Sequence
< OUString
> breakRules
= LocaleDataImpl::get()->getBreakIteratorRules(rLocale
);
144 status
= U_ZERO_ERROR
;
145 udata_setAppData("OpenOffice", OpenOffice_dat
, &status
);
146 if ( !U_SUCCESS(status
) ) throw uno::RuntimeException();
148 OOoRuleBasedBreakIterator
*rbi
= nullptr;
150 if (breakRules
.getLength() > breakType
&& !breakRules
[breakType
].isEmpty())
152 // langtag;rule;breakType
153 const OString
aBIMapRuleTypeKey( aLangtagStr
+ ";" + rule
+ ";" + OString::number(breakType
));
154 aMapIt
= theBIMap
.find( aBIMapRuleTypeKey
);
155 bInMap
= (aMapIt
!= theBIMap
.end());
158 icuBI
->mpValue
= aMapIt
->second
;
159 icuBI
->maBIMapKey
= aBIMapGlobalKey
;
160 theBIMap
.insert( std::make_pair( aBIMapGlobalKey
, icuBI
->mpValue
));
164 rbi
= new OOoRuleBasedBreakIterator(udata_open("OpenOffice", "brk",
165 OUStringToOString(breakRules
[breakType
], RTL_TEXTENCODING_ASCII_US
).getStr(), &status
), status
);
167 if (U_SUCCESS(status
))
169 icuBI
->mpValue
.reset( new BI_ValueData
);
170 icuBI
->mpValue
->mpBreakIterator
.reset( rbi
);
171 theBIMap
.insert( std::make_pair( aBIMapRuleTypeKey
, icuBI
->mpValue
));
179 // use ICU's own break iterator for Thai, Lao, Tibetan, Dzongkha and Khmer
180 else if (rLocale
.Language
!= "th" && rLocale
.Language
!= "lo" && rLocale
.Language
!= "bo" && rLocale
.Language
!= "dz" && rLocale
.Language
!= "km")
182 // language;rule (not langtag, unless we'd actually load such)
183 OString
aLanguage( LanguageTag( rLocale
).getLanguage().toUtf8());
184 const OString
aBIMapRuleKey( aLanguage
+ ";" + rule
);
185 aMapIt
= theBIMap
.find( aBIMapRuleKey
);
186 bInMap
= (aMapIt
!= theBIMap
.end());
189 icuBI
->mpValue
= aMapIt
->second
;
190 icuBI
->maBIMapKey
= aBIMapGlobalKey
;
191 theBIMap
.insert( std::make_pair( aBIMapGlobalKey
, icuBI
->mpValue
));
195 status
= U_ZERO_ERROR
;
196 OStringBuffer
aUDName(64);
197 aUDName
.append(rule
);
199 aUDName
.append( aLanguage
);
200 UDataMemory
* pUData
= udata_open("OpenOffice", "brk", aUDName
.getStr(), &status
);
201 if( U_SUCCESS(status
) )
202 rbi
= new OOoRuleBasedBreakIterator( pUData
, status
);
203 if ( U_SUCCESS(status
) )
205 icuBI
->mpValue
.reset( new BI_ValueData
);
206 icuBI
->mpValue
->mpBreakIterator
.reset( rbi
);
207 theBIMap
.insert( std::make_pair( aBIMapRuleKey
, icuBI
->mpValue
));
215 const OString
aBIMapRuleOnlyKey( OString(";") + rule
);
216 aMapIt
= theBIMap
.find( aBIMapRuleOnlyKey
);
217 bInMap
= (aMapIt
!= theBIMap
.end());
220 icuBI
->mpValue
= aMapIt
->second
;
221 icuBI
->maBIMapKey
= aBIMapGlobalKey
;
222 theBIMap
.insert( std::make_pair( aBIMapGlobalKey
, icuBI
->mpValue
));
226 status
= U_ZERO_ERROR
;
227 pUData
= udata_open("OpenOffice", "brk", rule
, &status
);
228 if( U_SUCCESS(status
) )
229 rbi
= new OOoRuleBasedBreakIterator( pUData
, status
);
230 if ( U_SUCCESS(status
) )
232 icuBI
->mpValue
.reset( new BI_ValueData
);
233 icuBI
->mpValue
->mpBreakIterator
.reset( rbi
);
234 theBIMap
.insert( std::make_pair( aBIMapRuleOnlyKey
, icuBI
->mpValue
));
244 #if (U_ICU_VERSION_MAJOR_NUM < 58)
245 // ICU 58 made RuleBasedBreakIterator::setBreakType() private
246 // instead of protected, so the old workaround of
247 // https://ssl.icu-project.org/trac/ticket/5498
248 // doesn't work anymore. However, they also claim to have fixed
249 // the cause that an initial fBreakType==-1 would lead to an
250 // endless loop under some circumstances.
252 switch (rBreakType
) {
253 case LOAD_CHARACTER_BREAKITERATOR
: rbi
->publicSetBreakType(UBRK_CHARACTER
); break;
254 case LOAD_WORD_BREAKITERATOR
: rbi
->publicSetBreakType(UBRK_WORD
); break;
255 case LOAD_SENTENCE_BREAKITERATOR
: rbi
->publicSetBreakType(UBRK_SENTENCE
); break;
256 case LOAD_LINE_BREAKITERATOR
: rbi
->publicSetBreakType(UBRK_LINE
); break;
262 if (!icuBI
->mpValue
|| !icuBI
->mpValue
->mpBreakIterator
) do {
263 // langtag;;;rBreakType (empty rule; empty breakType)
264 const OString
aBIMapLocaleTypeKey( aLangtagStr
+ ";;;" + OString::number(rBreakType
));
265 aMapIt
= theBIMap
.find( aBIMapLocaleTypeKey
);
266 bInMap
= (aMapIt
!= theBIMap
.end());
269 icuBI
->mpValue
= aMapIt
->second
;
270 icuBI
->maBIMapKey
= aBIMapGlobalKey
;
271 theBIMap
.insert( std::make_pair( aBIMapGlobalKey
, icuBI
->mpValue
));
275 icu::Locale
icuLocale( LanguageTagIcu::getIcuLocale( LanguageTag( rLocale
)));
276 std::shared_ptr
< icu::BreakIterator
> pBI
;
278 status
= U_ZERO_ERROR
;
279 switch (rBreakType
) {
280 case LOAD_CHARACTER_BREAKITERATOR
:
281 pBI
.reset( icu::BreakIterator::createCharacterInstance(icuLocale
, status
) );
283 case LOAD_WORD_BREAKITERATOR
:
284 pBI
.reset( icu::BreakIterator::createWordInstance(icuLocale
, status
) );
286 case LOAD_SENTENCE_BREAKITERATOR
:
287 pBI
.reset( icu::BreakIterator::createSentenceInstance(icuLocale
, status
) );
289 case LOAD_LINE_BREAKITERATOR
:
290 pBI
.reset( icu::BreakIterator::createLineInstance(icuLocale
, status
) );
293 if ( !U_SUCCESS(status
) || !pBI
) {
294 throw uno::RuntimeException();
296 icuBI
->mpValue
.reset( new BI_ValueData
);
297 icuBI
->mpValue
->mpBreakIterator
= pBI
;
298 theBIMap
.insert( std::make_pair( aBIMapLocaleTypeKey
, icuBI
->mpValue
));
300 if (!icuBI
->mpValue
|| !icuBI
->mpValue
->mpBreakIterator
) {
301 throw uno::RuntimeException();
303 icuBI
->maBIMapKey
= aBIMapGlobalKey
;
305 theBIMap
.insert( std::make_pair( aBIMapGlobalKey
, icuBI
->mpValue
));
309 if (bNewBreak
|| icuBI
->mpValue
->maICUText
.pData
!= rText
.pData
)
311 const UChar
*pText
= reinterpret_cast<const UChar
*>(rText
.getStr());
313 status
= U_ZERO_ERROR
;
314 icuBI
->mpValue
->mpUt
= utext_openUChars(icuBI
->mpValue
->mpUt
, pText
, rText
.getLength(), &status
);
316 if (!U_SUCCESS(status
))
317 throw uno::RuntimeException();
319 icuBI
->mpValue
->mpBreakIterator
->setText(icuBI
->mpValue
->mpUt
, status
);
321 if (!U_SUCCESS(status
))
322 throw uno::RuntimeException();
324 icuBI
->mpValue
->maICUText
= rText
;
328 sal_Int32 SAL_CALL
BreakIterator_Unicode::nextCharacters( const OUString
& Text
,
329 sal_Int32 nStartPos
, const lang::Locale
&rLocale
,
330 sal_Int16 nCharacterIteratorMode
, sal_Int32 nCount
, sal_Int32
& nDone
)
332 if (nCharacterIteratorMode
== CharacterIteratorMode::SKIPCELL
) { // for CELL mode
333 loadICUBreakIterator(rLocale
, LOAD_CHARACTER_BREAKITERATOR
, 0, "char", Text
);
334 icu::BreakIterator
* pBI
= character
.mpValue
->mpBreakIterator
.get();
335 for (nDone
= 0; nDone
< nCount
; nDone
++) {
336 nStartPos
= pBI
->following(nStartPos
);
337 if (nStartPos
== icu::BreakIterator::DONE
)
338 return Text
.getLength();
340 } else { // for CHARACTER mode
341 for (nDone
= 0; nDone
< nCount
&& nStartPos
< Text
.getLength(); nDone
++)
342 Text
.iterateCodePoints(&nStartPos
);
347 sal_Int32 SAL_CALL
BreakIterator_Unicode::previousCharacters( const OUString
& Text
,
348 sal_Int32 nStartPos
, const lang::Locale
& rLocale
,
349 sal_Int16 nCharacterIteratorMode
, sal_Int32 nCount
, sal_Int32
& nDone
)
351 if (nCharacterIteratorMode
== CharacterIteratorMode::SKIPCELL
) { // for CELL mode
352 loadICUBreakIterator(rLocale
, LOAD_CHARACTER_BREAKITERATOR
, 0, "char", Text
);
353 icu::BreakIterator
* pBI
= character
.mpValue
->mpBreakIterator
.get();
354 for (nDone
= 0; nDone
< nCount
; nDone
++) {
355 nStartPos
= pBI
->preceding(nStartPos
);
356 if (nStartPos
== icu::BreakIterator::DONE
)
359 } else { // for BS to delete one char and CHARACTER mode.
360 for (nDone
= 0; nDone
< nCount
&& nStartPos
> 0; nDone
++)
361 Text
.iterateCodePoints(&nStartPos
, -1);
367 Boundary SAL_CALL
BreakIterator_Unicode::nextWord( const OUString
& Text
, sal_Int32 nStartPos
,
368 const lang::Locale
& rLocale
, sal_Int16 rWordType
)
370 loadICUBreakIterator(rLocale
, LOAD_WORD_BREAKITERATOR
, rWordType
, nullptr, Text
);
373 rv
.startPos
= icuBI
->mpValue
->mpBreakIterator
->following(nStartPos
);
374 if( rv
.startPos
>= Text
.getLength() || rv
.startPos
== icu::BreakIterator::DONE
)
375 rv
.endPos
= result
.startPos
;
377 if ( (rWordType
== WordType::ANYWORD_IGNOREWHITESPACES
||
378 rWordType
== WordType::DICTIONARY_WORD
) &&
379 u_isWhitespace(Text
.iterateCodePoints(&rv
.startPos
, 0)) )
380 rv
.startPos
= icuBI
->mpValue
->mpBreakIterator
->following(rv
.startPos
);
382 rv
.endPos
= icuBI
->mpValue
->mpBreakIterator
->following(rv
.startPos
);
383 if(rv
.endPos
== icu::BreakIterator::DONE
)
384 rv
.endPos
= rv
.startPos
;
390 Boundary SAL_CALL
BreakIterator_Unicode::previousWord(const OUString
& Text
, sal_Int32 nStartPos
,
391 const lang::Locale
& rLocale
, sal_Int16 rWordType
)
393 loadICUBreakIterator(rLocale
, LOAD_WORD_BREAKITERATOR
, rWordType
, nullptr, Text
);
396 rv
.startPos
= icuBI
->mpValue
->mpBreakIterator
->preceding(nStartPos
);
397 if( rv
.startPos
< 0 || rv
.startPos
== icu::BreakIterator::DONE
)
398 rv
.endPos
= rv
.startPos
;
400 if ( (rWordType
== WordType::ANYWORD_IGNOREWHITESPACES
||
401 rWordType
== WordType::DICTIONARY_WORD
) &&
402 u_isWhitespace(Text
.iterateCodePoints(&rv
.startPos
, 0)) )
403 rv
.startPos
= icuBI
->mpValue
->mpBreakIterator
->preceding(rv
.startPos
);
405 rv
.endPos
= icuBI
->mpValue
->mpBreakIterator
->following(rv
.startPos
);
406 if(rv
.endPos
== icu::BreakIterator::DONE
)
407 rv
.endPos
= rv
.startPos
;
413 Boundary SAL_CALL
BreakIterator_Unicode::getWordBoundary( const OUString
& Text
, sal_Int32 nPos
, const lang::Locale
& rLocale
,
414 sal_Int16 rWordType
, sal_Bool bDirection
)
416 loadICUBreakIterator(rLocale
, LOAD_WORD_BREAKITERATOR
, rWordType
, nullptr, Text
);
417 sal_Int32 len
= Text
.getLength();
420 if(icuBI
->mpValue
->mpBreakIterator
->isBoundary(nPos
)) {
421 rv
.startPos
= rv
.endPos
= nPos
;
422 if((bDirection
|| nPos
== 0) && nPos
< len
) //forward
423 rv
.endPos
= icuBI
->mpValue
->mpBreakIterator
->following(nPos
);
425 rv
.startPos
= icuBI
->mpValue
->mpBreakIterator
->preceding(nPos
);
429 rv
.endPos
= len
? icuBI
->mpValue
->mpBreakIterator
->following(sal_Int32(0)) : 0;
430 } else if(nPos
>= len
) {
431 rv
.startPos
= icuBI
->mpValue
->mpBreakIterator
->preceding(len
);
434 rv
.startPos
= icuBI
->mpValue
->mpBreakIterator
->preceding(nPos
);
435 rv
.endPos
= icuBI
->mpValue
->mpBreakIterator
->following(nPos
);
438 if (rv
.startPos
== icu::BreakIterator::DONE
)
439 rv
.startPos
= rv
.endPos
;
440 else if (rv
.endPos
== icu::BreakIterator::DONE
)
441 rv
.endPos
= rv
.startPos
;
447 sal_Int32 SAL_CALL
BreakIterator_Unicode::beginOfSentence( const OUString
& Text
, sal_Int32 nStartPos
,
448 const lang::Locale
&rLocale
)
450 loadICUBreakIterator(rLocale
, LOAD_SENTENCE_BREAKITERATOR
, 0, "sent", Text
);
452 sal_Int32 len
= Text
.getLength();
453 if (len
> 0 && nStartPos
== len
)
454 Text
.iterateCodePoints(&nStartPos
, -1); // issue #i27703# treat end position as part of last sentence
455 if (!sentence
.mpValue
->mpBreakIterator
->isBoundary(nStartPos
))
456 nStartPos
= sentence
.mpValue
->mpBreakIterator
->preceding(nStartPos
);
458 // skip preceding space.
459 sal_uInt32 ch
= Text
.iterateCodePoints(&nStartPos
);
460 while (nStartPos
< len
&& u_isWhitespace(ch
)) ch
= Text
.iterateCodePoints(&nStartPos
);
461 Text
.iterateCodePoints(&nStartPos
, -1);
466 sal_Int32 SAL_CALL
BreakIterator_Unicode::endOfSentence( const OUString
& Text
, sal_Int32 nStartPos
,
467 const lang::Locale
&rLocale
)
469 loadICUBreakIterator(rLocale
, LOAD_SENTENCE_BREAKITERATOR
, 0, "sent", Text
);
471 sal_Int32 len
= Text
.getLength();
472 if (len
> 0 && nStartPos
== len
)
473 Text
.iterateCodePoints(&nStartPos
, -1); // issue #i27703# treat end position as part of last sentence
474 nStartPos
= sentence
.mpValue
->mpBreakIterator
->following(nStartPos
);
476 sal_Int32 nPos
=nStartPos
;
477 while (nPos
> 0 && u_isWhitespace(Text
.iterateCodePoints(&nPos
, -1))) nStartPos
=nPos
;
482 LineBreakResults SAL_CALL
BreakIterator_Unicode::getLineBreak(
483 const OUString
& Text
, sal_Int32 nStartPos
,
484 const lang::Locale
& rLocale
, sal_Int32 nMinBreakPos
,
485 const LineBreakHyphenationOptions
& hOptions
,
486 const LineBreakUserOptions
& /*rOptions*/ )
488 LineBreakResults lbr
;
490 if (nStartPos
>= Text
.getLength()) {
491 lbr
.breakIndex
= Text
.getLength();
492 lbr
.breakType
= BreakType::WORDBOUNDARY
;
496 loadICUBreakIterator(rLocale
, LOAD_LINE_BREAKITERATOR
, 0, lineRule
, Text
);
498 icu::BreakIterator
* pLineBI
= line
.mpValue
->mpBreakIterator
.get();
501 if (pLineBI
->preceding(nStartPos
+ 1) == nStartPos
) { //Line boundary break
502 lbr
.breakIndex
= nStartPos
;
503 lbr
.breakType
= BreakType::WORDBOUNDARY
;
504 } else if (hOptions
.rHyphenator
.is()) { //Hyphenation break
505 sal_Int32 boundary_with_punctuation
= (pLineBI
->next() != icu::BreakIterator::DONE
) ? pLineBI
->current() : 0;
506 pLineBI
->preceding(nStartPos
+ 1); // reset to check correct hyphenation of "word-word"
508 sal_Int32 nStartPosWordEnd
= nStartPos
;
509 while (pLineBI
->current() < nStartPosWordEnd
&& u_ispunct(static_cast<sal_uInt32
>(Text
[nStartPosWordEnd
]))) // starting punctuation
512 Boundary wBoundary
= getWordBoundary( Text
, nStartPosWordEnd
, rLocale
,
513 WordType::DICTIONARY_WORD
, false);
515 nStartPosWordEnd
= wBoundary
.endPos
;
516 while (nStartPosWordEnd
< Text
.getLength() && (u_ispunct(static_cast<sal_uInt32
>(Text
[nStartPosWordEnd
])))) // ending punctuation
518 nStartPosWordEnd
= nStartPosWordEnd
- wBoundary
.endPos
;
519 if (hOptions
.hyphenIndex
- wBoundary
.startPos
< nStartPosWordEnd
) nStartPosWordEnd
= hOptions
.hyphenIndex
- wBoundary
.startPos
;
521 while (boundary_with_punctuation
> wBoundary
.endPos
&& Text
[--boundary_with_punctuation
] == SPACE
);
522 uno::Reference
< linguistic2::XHyphenatedWord
> aHyphenatedWord
;
523 aHyphenatedWord
= hOptions
.rHyphenator
->hyphenate(Text
.copy(wBoundary
.startPos
,
524 wBoundary
.endPos
- wBoundary
.startPos
), rLocale
,
525 static_cast<sal_Int16
>(hOptions
.hyphenIndex
- wBoundary
.startPos
- ((hOptions
.hyphenIndex
== wBoundary
.endPos
)? nStartPosWordEnd
: 0)), hOptions
.aHyphenationOptions
);
526 if (aHyphenatedWord
.is()) {
527 lbr
.rHyphenatedWord
= aHyphenatedWord
;
528 if(wBoundary
.startPos
+ aHyphenatedWord
->getHyphenationPos() + 1 < nMinBreakPos
)
531 lbr
.breakIndex
= wBoundary
.startPos
; //aHyphenatedWord->getHyphenationPos();
532 lbr
.breakType
= BreakType::HYPHENATION
;
534 // check not optimal hyphenation of "word-word" (word with hyphens)
535 if (lbr
.breakIndex
> -1 && wBoundary
.startPos
+ aHyphenatedWord
->getHyphenationPos() < pLineBI
->current()) {
536 lbr
.breakIndex
= pLineBI
->current();
537 lbr
.breakType
= BreakType::WORDBOUNDARY
;
541 lbr
.breakIndex
= pLineBI
->preceding(nStartPos
);
542 lbr
.breakType
= BreakType::WORDBOUNDARY
;
544 } else { //word boundary break
545 lbr
.breakIndex
= pLineBI
->preceding(nStartPos
);
546 lbr
.breakType
= BreakType::WORDBOUNDARY
;
548 // Special case for Slash U+002F SOLIDUS in URI and path names.
549 // TR14 defines that as SY: Symbols Allowing Break After (A).
550 // This is unwanted in paths, see also i#17155
551 if (lbr
.breakIndex
> 0 && Text
[lbr
.breakIndex
-1] == '/')
553 // Look backward and take any whitespace before as a break
554 // opportunity. This also glues something like "w/o".
555 // Avoid an overly long path and break it as was indicated.
556 // Overly long here is arbitrarily defined.
557 const sal_Int32 nOverlyLong
= 66;
558 sal_Int32 nPos
= lbr
.breakIndex
- 1;
559 while (nPos
> 0 && lbr
.breakIndex
- nPos
< nOverlyLong
)
561 if (u_isWhitespace(Text
.iterateCodePoints( &nPos
, -1)))
563 lbr
.breakIndex
= nPos
+ 1;
570 #define WJ 0x2060 // Word Joiner
572 if (lbr
.breakType
== BreakType::WORDBOUNDARY
) {
573 nStartPos
= lbr
.breakIndex
;
574 if (nStartPos
>= 0 && Text
[nStartPos
--] == WJ
)
576 while (nStartPos
>= 0 &&
577 (u_isWhitespace(Text
.iterateCodePoints(&nStartPos
, 0)) || Text
[nStartPos
] == WJ
)) {
578 if (Text
[nStartPos
--] == WJ
)
581 if (GlueSpace
&& nStartPos
< 0) {
592 BreakIterator_Unicode::getImplementationName()
594 return OUString::createFromAscii(cBreakIterator
);
598 BreakIterator_Unicode::supportsService(const OUString
& rServiceName
)
600 return cppu::supportsService(this, rServiceName
);
603 uno::Sequence
< OUString
> SAL_CALL
604 BreakIterator_Unicode::getSupportedServiceNames()
606 uno::Sequence
< OUString
> aRet
{ OUString::createFromAscii(cBreakIterator
) };
612 extern "C" SAL_DLLPUBLIC_EXPORT
css::uno::XInterface
*
613 com_sun_star_i18n_BreakIterator_Unicode_get_implementation(
614 css::uno::XComponentContext
*,
615 css::uno::Sequence
<css::uno::Any
> const &)
617 return cppu::acquire(new i18npool::BreakIterator_Unicode());
620 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */