1 // Copyright (c) 2011 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "base/i18n/break_iterator.h"
7 #include "base/logging.h"
8 #include "third_party/icu/source/common/unicode/ubrk.h"
9 #include "third_party/icu/source/common/unicode/uchar.h"
10 #include "third_party/icu/source/common/unicode/ustring.h"
// Sentinel position value: -1 converted to size_t, i.e. the largest value a
// size_t can hold. GetStringPiece() DCHECKs that neither prev_ nor pos_
// equals this value before using them as offsets.
15 const size_t npos
= static_cast<size_t>(-1);
// Constructor taking the text |str| and one of the predefined break types.
// The visible initializer records |break_type| in break_type_, which Init()
// and Advance() later switch on.
// NOTE(review): the remaining member initializers and the constructor body
// are not visible in this excerpt -- presumably |str| is stored in string_;
// confirm against the full file.
17 BreakIterator::BreakIterator(const StringPiece16
& str
, BreakType break_type
)
20 break_type_(break_type
),
// Constructor taking the text |str| and a custom break-rule string |rules|.
// The visible initializer fixes break_type_ to RULE_BASED; for that type
// Init() builds the ICU iterator from rules_ with ubrk_openRules().
// NOTE(review): the remaining member initializers and the constructor body
// are not visible in this excerpt -- presumably |rules| is stored in rules_
// and |str| in string_; confirm against the full file.
25 BreakIterator::BreakIterator(const StringPiece16
& str
, const string16
& rules
)
29 break_type_(RULE_BASED
),
// Destructor: releases the ICU break iterator referenced by iter_ by passing
// it to ubrk_close(). iter_ has to be cast back to UBreakIterator* first, so
// the member is evidently declared with a different (presumably untyped)
// pointer type.
// NOTE(review): original line 35 is not visible in this excerpt; confirm
// whether iter_ is checked for null before the ubrk_close() call below.
34 BreakIterator::~BreakIterator() {
36 ubrk_close(static_cast<UBreakIterator
*>(iter_
));
// Creates the underlying ICU break iterator and positions it at the start of
// the text.
//
// Steps visible in this excerpt:
//  1. break_type_ is translated into an ICU UBreakIteratorType.
//  2. For RULE_BASED iterators the ICU iterator is built from rules_ with
//     ubrk_openRules(); for every other type ubrk_open() is called with the
//     translated type. Both calls receive the size of string_.
//  3. A failing ICU status triggers NOTREACHED() with a diagnostic message.
//  4. On the remaining path ubrk_first() moves the iterator to the beginning.
//
// NOTE(review): the case labels for the non-RULE_BASED types, some call
// arguments and all return statements are not visible in this excerpt.
// Presumably the function returns false when |status| reports a failure or
// break_type_ is invalid, and true otherwise -- confirm against the full file.
39 bool BreakIterator::Init() {
40 UErrorCode status
= U_ZERO_ERROR
;
41 UParseError parse_error
;
42 UBreakIteratorType break_type
;
// Map the class-level break type onto ICU's iterator type.
43 switch (break_type_
) {
45 break_type
= UBRK_CHARACTER
;
48 break_type
= UBRK_WORD
;
52 case RULE_BASED
: // (Keep compiler happy, break_type not used in this case)
53 break_type
= UBRK_LINE
;
56 NOTREACHED() << "invalid break_type_";
// Custom rules go through ubrk_openRules(); all other types use ubrk_open().
59 if (break_type_
== RULE_BASED
) {
60 iter_
= ubrk_openRules(rules_
.c_str(),
61 static_cast<int32_t>(rules_
.length()),
63 static_cast<int32_t>(string_
.size()),
66 if (U_FAILURE(status
)) {
// The diagnostic reports the line and offset recorded in parse_error.
67 NOTREACHED() << "ubrk_openRules failed to parse rule string at line "
68 << parse_error
.line
<< ", offset " << parse_error
.offset
;
71 iter_
= ubrk_open(break_type
,
74 static_cast<int32_t>(string_
.size()),
76 if (U_FAILURE(status
)) {
77 NOTREACHED() << "ubrk_open failed";
// Failure check shared by both creation paths (its body is not visible here).
81 if (U_FAILURE(status
)) {
85 // Move the iterator to the beginning of the string.
86 ubrk_first(static_cast<UBreakIterator
*>(iter_
));
// Moves the iterator forward to the next break and records the new offset in
// pos_.
//
// Two strategies are visible, selected by the switch on break_type_:
//  * a single ubrk_next() step whose result is stored in pos_;
//  * a do/while loop that keeps stepping for as long as the rule status of
//    the break just reached lies in [UBRK_LINE_SOFT, UBRK_LINE_SOFT_LIMIT),
//    so it stops at the first break outside that range or when ubrk_next()
//    reports UBRK_DONE.
// An unknown break_type_ hits NOTREACHED().
//
// NOTE(review): the declarations of |pos| and |status|, the update of prev_,
// the case labels and every return statement are not visible in this
// excerpt; which break types use which strategy, and the exact return
// values, must be confirmed against the full file.
90 bool BreakIterator::Advance() {
94 switch (break_type_
) {
// Single-step strategy.
99 pos
= ubrk_next(static_cast<UBreakIterator
*>(iter_
));
// UBRK_DONE means ubrk_next() found no further boundary.
100 if (pos
== UBRK_DONE
) {
104 pos_
= static_cast<size_t>(pos
);
// Looping strategy: step until a break outside the soft-line-break status
// range is reached.
108 pos
= ubrk_next(static_cast<UBreakIterator
*>(iter_
));
109 if (pos
== UBRK_DONE
)
111 pos_
= static_cast<size_t>(pos
);
112 status
= ubrk_getRuleStatus(static_cast<UBreakIterator
*>(iter_
));
113 } while (status
>= UBRK_LINE_SOFT
&& status
< UBRK_LINE_SOFT_LIMIT
);
// End of text reached while pos_ still equals prev_, i.e. the loop above
// never stored a new position (presumably prev_ holds the position from
// before this call -- confirm).
114 if (pos
== UBRK_DONE
&& prev_
== pos_
) {
120 NOTREACHED() << "invalid break_type_";
// Points the existing ICU iterator at a new text buffer.
//
// |text| and |length| are handed to ubrk_setText() and afterwards wrapped in
// a StringPiece16 that replaces string_. pos_ is reset to 0 to mirror the
// iterator position after ubrk_setText(). A failing ICU status triggers
// NOTREACHED().
//
// NOTE(review): pos_ is reset before |status| is checked, so it is cleared
// even when ubrk_setText() fails. The return statements are not visible in
// this excerpt; presumably false is returned on failure (before string_ is
// updated) and true otherwise -- confirm against the full file.
// NOTE(review): |length| is a size_t but is passed to ubrk_setText() without
// the explicit static_cast<int32_t> that Init() applies to its length
// arguments; ICU declares that parameter as int32_t.
125 bool BreakIterator::SetText(const base::char16
* text
, const size_t length
) {
126 UErrorCode status
= U_ZERO_ERROR
;
127 ubrk_setText(static_cast<UBreakIterator
*>(iter_
),
128 text
, length
, &status
);
129 pos_
= 0; // implicit when ubrk_setText is done
131 if (U_FAILURE(status
)) {
132 NOTREACHED() << "ubrk_setText failed";
135 string_
= StringPiece16(text
, length
);
// Reports whether the break the iterator currently sits on carries a rule
// status other than UBRK_WORD_NONE (the status ICU assigns to non-word
// segments such as spaces and punctuation).
//
// The rule status is read from ICU first; the break type is checked
// afterwards. Only BREAK_WORD and RULE_BASED iterators reach the final
// comparison.
// NOTE(review): the statement executed for other break types (original line
// 142) is not visible in this excerpt -- presumably `return false;`.
139 bool BreakIterator::IsWord() const {
140 int32_t status
= ubrk_getRuleStatus(static_cast<UBreakIterator
*>(iter_
));
141 if (break_type_
!= BREAK_WORD
&& break_type_
!= RULE_BASED
)
143 return status
!= UBRK_WORD_NONE
;
// Returns true when |position| is a boundary and the rule status ICU reports
// after the boundary query differs from UBRK_WORD_NONE.
//
// Only BREAK_WORD and RULE_BASED iterators reach the query.
// NOTE(review): the statement executed for other break types is not visible
// in this excerpt -- presumably `return false;`.
// NOTE(review): according to the ICU documentation ubrk_isBoundary() also
// repositions the iterator, so this const method changes the state of the
// underlying ICU object -- keep in mind when mixing it with Advance().
146 bool BreakIterator::IsEndOfWord(size_t position
) const {
147 if (break_type_
!= BREAK_WORD
&& break_type_
!= RULE_BASED
)
150 UBreakIterator
* iter
= static_cast<UBreakIterator
*>(iter_
);
151 UBool boundary
= ubrk_isBoundary(iter
, static_cast<int32_t>(position
));
// Status of the boundary the iterator was left on by the query above.
152 int32_t status
= ubrk_getRuleStatus(iter
);
// `!!` converts ICU's UBool to a C++ bool.
153 return (!!boundary
&& status
!= UBRK_WORD_NONE
);
// Returns true when |position| is a boundary and the rule status read
// afterwards (|next_status|) differs from UBRK_WORD_NONE.
//
// Only BREAK_WORD and RULE_BASED iterators reach the query.
// NOTE(review): the statement executed for other break types is not visible
// in this excerpt -- presumably `return false;`.
// NOTE(review): original line 162, between the boundary query and the status
// read, is not visible either. The name |next_status| suggests the iterator
// is advanced to the following break there so that the status describes the
// segment starting at |position| -- confirm against the full file.
// NOTE(review): according to the ICU documentation ubrk_isBoundary() also
// repositions the iterator, so this const method changes the state of the
// underlying ICU object.
156 bool BreakIterator::IsStartOfWord(size_t position
) const {
157 if (break_type_
!= BREAK_WORD
&& break_type_
!= RULE_BASED
)
160 UBreakIterator
* iter
= static_cast<UBreakIterator
*>(iter_
);
161 UBool boundary
= ubrk_isBoundary(iter
, static_cast<int32_t>(position
));
163 int32_t next_status
= ubrk_getRuleStatus(iter
);
// `!!` converts ICU's UBool to a C++ bool.
164 return (!!boundary
&& next_status
!= UBRK_WORD_NONE
);
// Returns whether |position| is a boundary according to the underlying ICU
// iterator. Only BREAK_CHARACTER iterators reach the query.
// NOTE(review): the statement executed for other break types is not visible
// in this excerpt -- presumably `return false;`.
// NOTE(review): according to the ICU documentation ubrk_isBoundary() also
// repositions the iterator, so this const method changes the state of the
// underlying ICU object.
167 bool BreakIterator::IsGraphemeBoundary(size_t position
) const {
168 if (break_type_
!= BREAK_CHARACTER
)
171 UBreakIterator
* iter
= static_cast<UBreakIterator
*>(iter_
);
// `!!` converts ICU's UBool to a C++ bool.
172 return !!ubrk_isBoundary(iter
, static_cast<int32_t>(position
));
// Returns the segment described by GetStringPiece() as an owned string16,
// produced by calling as_string() on that piece.
175 string16
BreakIterator::GetString() const {
176 return GetStringPiece().as_string();
// Returns the part of string_ that starts at offset prev_ and is
// pos_ - prev_ characters long, i.e. the text between the two recorded
// positions.
// DCHECKs that neither prev_ nor pos_ holds the npos sentinel, because the
// offsets are meaningless in that state.
179 StringPiece16
BreakIterator::GetStringPiece() const {
180 DCHECK(prev_
!= npos
&& pos_
!= npos
);
181 return string_
.substr(prev_
, pos_
- prev_
);