1 // Copyright (c) 2009 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "base/i18n/word_iterator.h"
7 #include "base/logging.h"
8 #include "unicode/ubrk.h"
9 #include "unicode/ustring.h"
// Sentinel position value (the largest size_t). GetWord() requires that
// neither prev_ nor pos_ equals it.
const size_t npos = static_cast<size_t>(-1);
// Constructor. Takes the text |str| and the requested |break_type|; the
// latter is stored in break_type_, which Init() switches on to choose the
// ICU break-iterator type.
// NOTE(review): only the break_type_ initializer is visible here -- confirm
// how the remaining members (iter_, string_, prev_, pos_) are initialized.
13 WordIterator::WordIterator(const std::wstring
& str
, BreakType break_type
)
16 break_type_(break_type
),
// Destructor.
// NOTE(review): iter_ is obtained from ubrk_open() in Init(); confirm that
// the body releases it with ubrk_close() when it is non-NULL.
21 WordIterator::~WordIterator() {
// Opens the ICU break iterator (iter_) over the text held in string_ and
// positions it at the first boundary. The ICU iterator type is selected by
// switching on break_type_. A failing status from ubrk_open() is reported
// through NOTREACHED().
26 bool WordIterator::Init() {
27 UErrorCode status
= U_ZERO_ERROR
;
28 UBreakIteratorType break_type
;
// Translate break_type_ into the ICU enum: one branch selects UBRK_WORD,
// the other two select UBRK_LINE.
29 switch (break_type_
) {
31 break_type
= UBRK_WORD
;
34 break_type
= UBRK_LINE
;
38 break_type
= UBRK_LINE
;
40 #if defined(WCHAR_T_IS_UTF16)
// wchar_t is already UTF-16 here, so ICU reads string_'s buffer directly
// and no copy is made.
41 iter_
= ubrk_open(break_type
, NULL
,
42 string_
.data(), static_cast<int32_t>(string_
.size()),
44 #else // WCHAR_T_IS_UTF16
45 // When wchar_t is wider than UChar (16 bits), transform |string_| into a
46 // UChar* string. Size the UChar* buffer to be large enough to hold twice
47 // as many UTF-16 code units as there are UCS-4 characters, in case each
48 // character translates to a UTF-16 surrogate pair, and leave room for a NUL
// terminator.
// NOTE(review): the multiplier used below is sizeof(UChar); it matches the
// "twice" described above only because UChar is two bytes wide.
50 // TODO(avi): avoid this alloc
51 chars_
.resize(string_
.length() * sizeof(UChar
) + 1);
// Convert the wide string into chars_; the number of UChars written is
// returned through destLength.
// NOTE(review): |error| is not inspected in the lines visible here -- if the
// conversion fails, destLength may not hold a usable value; confirm.
53 UErrorCode error
= U_ZERO_ERROR
;
55 u_strFromWCS(&chars_
[0], chars_
.size(), &destLength
, string_
.data(),
56 string_
.length(), &error
);
// The iterator is opened over the converted copy, so the boundary offsets it
// reports index chars_ rather than string_.
58 iter_
= ubrk_open(break_type
, NULL
, &chars_
[0], destLength
, &status
);
60 if (U_FAILURE(status
)) {
61 NOTREACHED() << "ubrk_open failed";
64 ubrk_first(iter_
); // Move the iterator to the beginning of the string.
// Moves the ICU iterator to the next boundary with ubrk_next(). A result of
// UBRK_DONE is handled in its own branch; any other result is the boundary
// offset and is stored in pos_.
// NOTE(review): the return statements and the update of prev_ are not
// visible in these lines -- confirm the values returned on each path.
68 bool WordIterator::Advance() {
70 const int32_t pos
= ubrk_next(iter_
);
71 if (pos
== UBRK_DONE
) {
// |pos| is a real boundary offset on this path, so it is converted to size_t
// and recorded.
75 pos_
= static_cast<size_t>(pos
);
80 bool WordIterator::IsWord() const {
81 return (ubrk_getRuleStatus(iter_
) != UBRK_WORD_NONE
);
84 std::wstring
WordIterator::GetWord() const {
85 DCHECK(prev_
!= npos
&& pos_
!= npos
);
86 #if defined(WCHAR_T_IS_UTF16)
87 return string_
.substr(prev_
, pos_
- prev_
);
88 #else // WCHAR_T_IS_UTF16
89 // See comment in Init(). If there are no surrogate pairs,
90 // |out_length| will be exactly |in_length|, if there are surrogate
91 // pairs it will be less than |in_length|.
93 UErrorCode error
= U_ZERO_ERROR
;
94 const int32_t in_length
= pos_
- prev_
;
95 std::vector
<std::wstring::value_type
> out_buffer(in_length
);
96 u_strToWCS(&out_buffer
[0], in_length
, &out_length
,
97 &chars_
[prev_
], in_length
, &error
);
98 DCHECK_LE(out_length
, in_length
);
99 return std::wstring(&out_buffer
[0], out_length
);