1 // Copyright (c) 2011 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "base/strings/utf_offset_string_conversions.h"
9 #include "base/logging.h"
10 #include "base/memory/scoped_ptr.h"
11 #include "base/strings/string_piece.h"
12 #include "base/strings/utf_string_conversion_utils.h"
16 OffsetAdjuster::Adjustment::Adjustment(size_t original_offset
,
17 size_t original_length
,
19 : original_offset(original_offset
),
20 original_length(original_length
),
21 output_length(output_length
) {
25 void OffsetAdjuster::AdjustOffsets(
26 const Adjustments
& adjustments
,
27 std::vector
<size_t>* offsets_for_adjustment
) {
28 if (!offsets_for_adjustment
|| adjustments
.empty())
30 for (std::vector
<size_t>::iterator
i(offsets_for_adjustment
->begin());
31 i
!= offsets_for_adjustment
->end(); ++i
)
32 AdjustOffset(adjustments
, &(*i
));
36 void OffsetAdjuster::AdjustOffset(const Adjustments
& adjustments
,
38 if (*offset
== string16::npos
)
41 for (Adjustments::const_iterator i
= adjustments
.begin();
42 i
!= adjustments
.end(); ++i
) {
43 if (*offset
<= i
->original_offset
)
45 if (*offset
< (i
->original_offset
+ i
->original_length
)) {
46 *offset
= string16::npos
;
49 adjustment
+= static_cast<int>(i
->original_length
- i
->output_length
);
51 *offset
-= adjustment
;
55 void OffsetAdjuster::UnadjustOffsets(
56 const Adjustments
& adjustments
,
57 std::vector
<size_t>* offsets_for_unadjustment
) {
58 if (!offsets_for_unadjustment
|| adjustments
.empty())
60 for (std::vector
<size_t>::iterator
i(offsets_for_unadjustment
->begin());
61 i
!= offsets_for_unadjustment
->end(); ++i
)
62 UnadjustOffset(adjustments
, &(*i
));
66 void OffsetAdjuster::UnadjustOffset(const Adjustments
& adjustments
,
68 if (*offset
== string16::npos
)
71 for (Adjustments::const_iterator i
= adjustments
.begin();
72 i
!= adjustments
.end(); ++i
) {
73 if (*offset
+ adjustment
<= i
->original_offset
)
75 adjustment
+= static_cast<int>(i
->original_length
- i
->output_length
);
76 if ((*offset
+ adjustment
) <
77 (i
->original_offset
+ i
->original_length
)) {
78 *offset
= string16::npos
;
82 *offset
+= adjustment
;
86 void OffsetAdjuster::MergeSequentialAdjustments(
87 const Adjustments
& first_adjustments
,
88 Adjustments
* adjustments_on_adjusted_string
) {
89 Adjustments::iterator adjusted_iter
= adjustments_on_adjusted_string
->begin();
90 Adjustments::const_iterator first_iter
= first_adjustments
.begin();
91 // Simultaneously iterate over all |adjustments_on_adjusted_string| and
92 // |first_adjustments|, adding adjustments to or correcting the adjustments
93 // in |adjustments_on_adjusted_string| as we go. |shift| keeps track of the
94 // current number of characters collapsed by |first_adjustments| up to this
95 // point. |currently_collapsing| keeps track of the number of characters
96 // collapsed by |first_adjustments| into the current |adjusted_iter|'s
97 // length. These are characters that will change |shift| as soon as we're
98 // done processing the current |adjusted_iter|; they are not yet reflected in
101 size_t currently_collapsing
= 0;
102 while (adjusted_iter
!= adjustments_on_adjusted_string
->end()) {
103 if ((first_iter
== first_adjustments
.end()) ||
104 ((adjusted_iter
->original_offset
+ shift
+
105 adjusted_iter
->original_length
) <= first_iter
->original_offset
)) {
106 // Entire |adjusted_iter| (accounting for its shift and including its
107 // whole original length) comes before |first_iter|.
109 // Correct the offset at |adjusted_iter| and move onto the next
110 // adjustment that needs revising.
111 adjusted_iter
->original_offset
+= shift
;
112 shift
+= currently_collapsing
;
113 currently_collapsing
= 0;
115 } else if ((adjusted_iter
->original_offset
+ shift
) >
116 first_iter
->original_offset
) {
117 // |first_iter| comes before the |adjusted_iter| (as adjusted by |shift|).
119 // It's not possible for the adjustments to overlap. (It shouldn't
120 // be possible that we have an |adjusted_iter->original_offset| that,
121 // when adjusted by the computed |shift|, is in the middle of
122 // |first_iter|'s output's length. After all, that would mean the
123 // current adjustment_on_adjusted_string somehow points to an offset
124 // that was supposed to have been eliminated by the first set of
126 DCHECK_LE(first_iter
->original_offset
+ first_iter
->output_length
,
127 adjusted_iter
->original_offset
+ shift
);
129 // Add the |first_adjustment_iter| to the full set of adjustments while
130 // making sure |adjusted_iter| continues pointing to the same element.
131 // We do this by inserting the |first_adjustment_iter| right before
132 // |adjusted_iter|, then incrementing |adjusted_iter| so it points to
133 // the following element.
134 shift
+= first_iter
->original_length
- first_iter
->output_length
;
135 adjusted_iter
= adjustments_on_adjusted_string
->insert(
136 adjusted_iter
, *first_iter
);
140 // The first adjustment adjusted something that then got further adjusted
141 // by the second set of adjustments. In other words, |first_iter| points
142 // to something in the range covered by |adjusted_iter|'s length (after
143 // accounting for |shift|). Precisely,
144 // adjusted_iter->original_offset + shift
146 // first_iter->original_offset
148 // adjusted_iter->original_offset + shift +
149 // adjusted_iter->original_length
151 // Modify the current |adjusted_iter| to include whatever collapsing
152 // happened in |first_iter|, then advance to the next |first_adjustments|
153 // because we dealt with the current one.
154 const int collapse
= static_cast<int>(first_iter
->original_length
) -
155 static_cast<int>(first_iter
->output_length
);
156 // This function does not know how to deal with a string that expands and
157 // then gets modified, only strings that collapse and then get modified.
158 DCHECK_GT(collapse
, 0);
159 adjusted_iter
->original_length
+= collapse
;
160 currently_collapsing
+= collapse
;
164 DCHECK_EQ(0u, currently_collapsing
);
165 if (first_iter
!= first_adjustments
.end()) {
166 // Only first adjustments are left. These do not need to be modified.
167 // (Their offsets are already correct with respect to the original string.)
169 DCHECK(adjusted_iter
== adjustments_on_adjusted_string
->end());
170 adjustments_on_adjusted_string
->insert(
171 adjustments_on_adjusted_string
->end(), first_iter
,
172 first_adjustments
.end());
176 // Converts the given source Unicode character type to the given destination
177 // Unicode character type as a STL string. The given input buffer and size
178 // determine the source, and the given output STL string will be replaced by
179 // the result. If non-NULL, |adjustments| is set to reflect the all the
180 // alterations to the string that are not one-character-to-one-character.
181 // It will always be sorted by increasing offset.
182 template<typename SrcChar
, typename DestStdString
>
183 bool ConvertUnicode(const SrcChar
* src
,
185 DestStdString
* output
,
186 OffsetAdjuster::Adjustments
* adjustments
) {
188 adjustments
->clear();
189 // ICU requires 32-bit numbers.
191 int32 src_len32
= static_cast<int32
>(src_len
);
192 for (int32 i
= 0; i
< src_len32
; i
++) {
194 size_t original_i
= i
;
195 size_t chars_written
= 0;
196 if (ReadUnicodeCharacter(src
, src_len32
, &i
, &code_point
)) {
197 chars_written
= WriteUnicodeCharacter(code_point
, output
);
199 chars_written
= WriteUnicodeCharacter(0xFFFD, output
);
203 // Only bother writing an adjustment if this modification changed the
204 // length of this character.
205 // NOTE: ReadUnicodeCharacter() adjusts |i| to point _at_ the last
206 // character read, not after it (so that incrementing it in the loop
207 // increment will place it at the right location), so we need to account
208 // for that in determining the amount that was read.
209 if (adjustments
&& ((i
- original_i
+ 1) != chars_written
)) {
210 adjustments
->push_back(OffsetAdjuster::Adjustment(
211 original_i
, i
- original_i
+ 1, chars_written
));
217 bool UTF8ToUTF16WithAdjustments(
221 base::OffsetAdjuster::Adjustments
* adjustments
) {
222 PrepareForUTF16Or32Output(src
, src_len
, output
);
223 return ConvertUnicode(src
, src_len
, output
, adjustments
);
226 string16
UTF8ToUTF16WithAdjustments(
227 const base::StringPiece
& utf8
,
228 base::OffsetAdjuster::Adjustments
* adjustments
) {
230 UTF8ToUTF16WithAdjustments(utf8
.data(), utf8
.length(), &result
, adjustments
);
234 string16
UTF8ToUTF16AndAdjustOffsets(
235 const base::StringPiece
& utf8
,
236 std::vector
<size_t>* offsets_for_adjustment
) {
237 std::for_each(offsets_for_adjustment
->begin(),
238 offsets_for_adjustment
->end(),
239 LimitOffset
<base::StringPiece
>(utf8
.length()));
240 OffsetAdjuster::Adjustments adjustments
;
241 string16 result
= UTF8ToUTF16WithAdjustments(utf8
, &adjustments
);
242 OffsetAdjuster::AdjustOffsets(adjustments
, offsets_for_adjustment
);
246 std::string
UTF16ToUTF8AndAdjustOffsets(
247 const base::StringPiece16
& utf16
,
248 std::vector
<size_t>* offsets_for_adjustment
) {
249 std::for_each(offsets_for_adjustment
->begin(),
250 offsets_for_adjustment
->end(),
251 LimitOffset
<base::StringPiece16
>(utf16
.length()));
253 PrepareForUTF8Output(utf16
.data(), utf16
.length(), &result
);
254 OffsetAdjuster::Adjustments adjustments
;
255 ConvertUnicode(utf16
.data(), utf16
.length(), &result
, &adjustments
);
256 OffsetAdjuster::AdjustOffsets(adjustments
, offsets_for_adjustment
);