1 // Copyright (c) 2011 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "base/i18n/break_iterator.h"
7 #include "base/logging.h"
8 #include "third_party/icu/source/common/unicode/ubrk.h"
9 #include "third_party/icu/source/common/unicode/uchar.h"
10 #include "third_party/icu/source/common/unicode/ustring.h"
// Sentinel "no position" value: -1 converted to size_t, i.e. the largest
// representable size_t. prev_ and pos_ are compared against it in this file.
15 const size_t npos
= static_cast<size_t>(-1);
// Constructor taking the text |str| and one of the predefined BreakType
// values. The visible initializer stores |break_type| in break_type_.
// NOTE(review): the remaining member initializers and the constructor body
// are not visible in this listing -- confirm against the full file.
17 BreakIterator::BreakIterator(const StringPiece16
& str
, BreakType break_type
)
20 break_type_(break_type
),
// Constructor taking the text |str| and a custom break-rule string |rules|.
// The visible initializer fixes break_type_ to RULE_BASED.
// NOTE(review): the remaining member initializers and the constructor body
// are not visible in this listing -- confirm against the full file.
25 BreakIterator::BreakIterator(const StringPiece16
& str
, const string16
& rules
)
29 break_type_(RULE_BASED
),
// Destructor: hands iter_ (cast to UBreakIterator*) to ubrk_close().
// NOTE(review): a line appears to be missing between the signature and the
// ubrk_close() call in this listing -- confirm whether the call is guarded.
34 BreakIterator::~BreakIterator() {
36 ubrk_close(static_cast<UBreakIterator
*>(iter_
));
// Creates the underlying ICU break iterator and stores it in iter_.
//  - When break_type_ is RULE_BASED, the iterator is built from rules_ with
//    ubrk_openRules().
//  - Otherwise it is built with ubrk_open() using the ICU iterator type
//    selected by the switch below.
// If ICU reports a failure (U_FAILURE(status)), NOTREACHED() fires with a
// diagnostic message. On the visible success path the iterator is positioned
// with ubrk_first().
// NOTE(review): several lines (case labels, the else branch, return
// statements, closing braces, some call arguments) are not visible in this
// listing -- confirm the exact return values against the full file.
39 bool BreakIterator::Init() {
40 UErrorCode status
= U_ZERO_ERROR
;
41 UParseError parse_error
;
42 UBreakIteratorType break_type
;
// Translate break_type_ into the ICU iterator type later passed to
// ubrk_open().
43 switch (break_type_
) {
45 break_type
= UBRK_CHARACTER
;
48 break_type
= UBRK_WORD
;
52 case RULE_BASED
: // (Keep compiler happy, break_type not used in this case)
53 break_type
= UBRK_LINE
;
56 NOTREACHED() << "invalid break_type_";
// Rule-based iterators are compiled from rules_; if opening fails, the line
// and offset recorded in parse_error are included in the diagnostic.
59 if (break_type_
== RULE_BASED
) {
60 iter_
= ubrk_openRules(rules_
.c_str(),
61 static_cast<int32_t>(rules_
.length()),
63 static_cast<int32_t>(string_
.size()),
66 if (U_FAILURE(status
)) {
67 NOTREACHED() << "ubrk_openRules failed to parse rule string at line "
68 << parse_error
.line
<< ", offset " << parse_error
.offset
;
// Non-rule-based path: open an ICU iterator of the type chosen by the switch.
71 iter_
= ubrk_open(break_type
,
74 static_cast<int32_t>(string_
.size()),
76 if (U_FAILURE(status
)) {
77 NOTREACHED() << "ubrk_open failed for type " << break_type
78 << " with error " << status
;
// Failure check on the ICU status after the open calls above.
82 if (U_FAILURE(status
)) {
86 // Move the iterator to the beginning of the string.
87 ubrk_first(static_cast<UBreakIterator
*>(iter_
));
// Steps the ICU iterator forward with ubrk_next() and records the new
// boundary in pos_. The switch on break_type_ has two visible strategies:
//  - a single ubrk_next() step, and
//  - a loop that keeps stepping while ubrk_getRuleStatus() reports a status
//    in [UBRK_LINE_SOFT, UBRK_LINE_SOFT_LIMIT), or until ICU returns
//    UBRK_DONE.
// An unexpected break_type_ hits NOTREACHED().
// NOTE(review): the case labels, the opening of the do-loop, local
// declarations and the return statements are not visible in this listing --
// confirm which break types map to which strategy in the full file.
91 bool BreakIterator::Advance() {
95 switch (break_type_
) {
// Single step: take the next ICU boundary; UBRK_DONE is checked before the
// position is stored in pos_.
100 pos
= ubrk_next(static_cast<UBreakIterator
*>(iter_
));
101 if (pos
== UBRK_DONE
) {
105 pos_
= static_cast<size_t>(pos
);
// Looping step: advance repeatedly, updating pos_ each time, until the rule
// status falls outside the soft line-break range (see the while condition).
109 pos
= ubrk_next(static_cast<UBreakIterator
*>(iter_
));
110 if (pos
== UBRK_DONE
)
112 pos_
= static_cast<size_t>(pos
);
113 status
= ubrk_getRuleStatus(static_cast<UBreakIterator
*>(iter_
));
114 } while (status
>= UBRK_LINE_SOFT
&& status
< UBRK_LINE_SOFT_LIMIT
);
// ICU reported the end of the text and pos_ still equals prev_.
115 if (pos
== UBRK_DONE
&& prev_
== pos_
) {
121 NOTREACHED() << "invalid break_type_";
// Re-targets the existing ICU iterator at a new buffer: |length| elements
// starting at |text|, via ubrk_setText(). pos_ is reset to 0. If ICU reports
// a failure, NOTREACHED() fires; on the visible success path string_ is
// re-pointed at the new buffer.
// NOTE(review): |length| is a size_t passed directly to ubrk_setText(), whose
// length parameter is an int32_t per the ICU API; other ICU calls in this
// file use an explicit static_cast<int32_t> -- confirm the implicit narrowing
// is intended.
// NOTE(review): the return statements are not visible in this listing.
126 bool BreakIterator::SetText(const base::char16
* text
, const size_t length
) {
127 UErrorCode status
= U_ZERO_ERROR
;
128 ubrk_setText(static_cast<UBreakIterator
*>(iter_
),
129 text
, length
, &status
);
130 pos_
= 0; // implicit when ubrk_setText is done
132 if (U_FAILURE(status
)) {
133 NOTREACHED() << "ubrk_setText failed";
// Keep string_ in sync with the text the ICU iterator now refers to.
136 string_
= StringPiece16(text
, length
);
// Returns true exactly when GetWordBreakStatus() reports IS_WORD_BREAK.
140 bool BreakIterator::IsWord() const {
141 return GetWordBreakStatus() == IS_WORD_BREAK
;
// Classifies the iterator's current break:
//  - IS_LINE_OR_CHAR_BREAK when break_type_ is neither BREAK_WORD nor
//    RULE_BASED;
//  - IS_SKIPPABLE_WORD when ICU's rule status is UBRK_WORD_NONE;
//  - IS_WORD_BREAK otherwise.
144 BreakIterator::WordBreakStatus
BreakIterator::GetWordBreakStatus() const {
// The rule status is queried up front, but is only consulted for word and
// rule-based iterators.
145 int32_t status
= ubrk_getRuleStatus(static_cast<UBreakIterator
*>(iter_
));
146 if (break_type_
!= BREAK_WORD
&& break_type_
!= RULE_BASED
)
147 return IS_LINE_OR_CHAR_BREAK
;
148 return status
== UBRK_WORD_NONE
? IS_SKIPPABLE_WORD
: IS_WORD_BREAK
;
// For BREAK_WORD / RULE_BASED iterators: true when |position| is a boundary
// according to ubrk_isBoundary() and the rule status read afterwards is not
// UBRK_WORD_NONE.
// NOTE(review): the statement controlled by the break-type guard is not
// visible in this listing -- confirm against the full file.
// NOTE(review): ICU documents ubrk_isBoundary() as repositioning the
// iterator; this method is const but may change ICU iterator state --
// confirm callers tolerate that.
151 bool BreakIterator::IsEndOfWord(size_t position
) const {
152 if (break_type_
!= BREAK_WORD
&& break_type_
!= RULE_BASED
)
155 UBreakIterator
* iter
= static_cast<UBreakIterator
*>(iter_
);
156 UBool boundary
= ubrk_isBoundary(iter
, static_cast<int32_t>(position
));
157 int32_t status
= ubrk_getRuleStatus(iter
);
// !! collapses ICU's UBool into a C++ bool.
158 return (!!boundary
&& status
!= UBRK_WORD_NONE
);
// For BREAK_WORD / RULE_BASED iterators: true when |position| is a boundary
// according to ubrk_isBoundary() and the rule status read into next_status is
// not UBRK_WORD_NONE.
// NOTE(review): the statement controlled by the break-type guard is not
// visible in this listing. The listing also appears to skip a line between
// the boundary test and the status read; the name next_status suggests the
// status of the following segment is intended -- confirm against the full
// file.
161 bool BreakIterator::IsStartOfWord(size_t position
) const {
162 if (break_type_
!= BREAK_WORD
&& break_type_
!= RULE_BASED
)
165 UBreakIterator
* iter
= static_cast<UBreakIterator
*>(iter_
);
166 UBool boundary
= ubrk_isBoundary(iter
, static_cast<int32_t>(position
));
168 int32_t next_status
= ubrk_getRuleStatus(iter
);
// !! collapses ICU's UBool into a C++ bool.
169 return (!!boundary
&& next_status
!= UBRK_WORD_NONE
);
// For BREAK_CHARACTER iterators: returns whether |position| is a boundary
// according to ubrk_isBoundary(), converted from UBool to bool.
// NOTE(review): the statement controlled by the break-type guard is not
// visible in this listing -- confirm against the full file.
172 bool BreakIterator::IsGraphemeBoundary(size_t position
) const {
173 if (break_type_
!= BREAK_CHARACTER
)
176 UBreakIterator
* iter
= static_cast<UBreakIterator
*>(iter_
);
177 return !!ubrk_isBoundary(iter
, static_cast<int32_t>(position
));
// Returns the current segment as a string16, produced by calling as_string()
// on the result of GetStringPiece().
180 string16
BreakIterator::GetString() const {
181 return GetStringPiece().as_string();
// Returns the piece of string_ that starts at prev_ and is (pos_ - prev_)
// elements long, i.e. the text between the previous and current positions.
// DCHECKs that neither prev_ nor pos_ is npos.
184 StringPiece16
BreakIterator::GetStringPiece() const {
185 DCHECK(prev_
!= npos
&& pos_
!= npos
);
186 return string_
.substr(prev_
, pos_
- prev_
);