1 // Copyright (c) 2011 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
7 #include "base/logging.h"
8 #include "base/strings/string_piece.h"
9 #include "base/strings/utf_offset_string_conversions.h"
10 #include "testing/gtest/include/gtest/gtest.h"
// Shorthand for string16::npos. It appears in the test tables below both as an
// input offset and as an expected output offset.
16 static const size_t kNpos
= string16::npos
;
// Table-driven check of single-offset adjustment through
// UTF8ToUTF16AndAdjustOffsets and UTF16ToUTF8AndAdjustOffsets.
20 TEST(UTFOffsetStringConversionsTest
, AdjustOffset
) {
// Each case supplies a UTF-8 string (utf8), one offset into it (input_offset)
// and the value expected in that slot after the conversion (output_offset).
21 struct UTF8ToUTF16Case
{
25 } utf8_to_utf16_cases
[] = {
// Rows whose input offset lands inside a multi-byte sequence (offset 1 in the
// first row, offset 2 in the "A...z" rows) expect kNpos. An input of kNpos is
// expected to stay kNpos.
28 {"\xe4\xbd\xa0\xe5\xa5\xbd", 1, kNpos
},
29 {"\xe4\xbd\xa0\xe5\xa5\xbd", 3, 1},
30 {"\xed\xb0\x80z", 3, 1},
31 {"A\xF0\x90\x8C\x80z", 1, 1},
32 {"A\xF0\x90\x8C\x80z", 2, kNpos
},
33 {"A\xF0\x90\x8C\x80z", 5, 3},
34 {"A\xF0\x90\x8C\x80z", 6, 4},
35 {"A\xF0\x90\x8C\x80z", kNpos
, kNpos
},
// Run every case with a one-element offsets vector and compare the adjusted
// element against the expected value.
37 for (size_t i
= 0; i
< ARRAYSIZE_UNSAFE(utf8_to_utf16_cases
); ++i
) {
38 const size_t offset
= utf8_to_utf16_cases
[i
].input_offset
;
39 std::vector
<size_t> offsets
;
40 offsets
.push_back(offset
);
41 UTF8ToUTF16AndAdjustOffsets(utf8_to_utf16_cases
[i
].utf8
, &offsets
);
42 EXPECT_EQ(utf8_to_utf16_cases
[i
].output_offset
, offsets
[0]);
// Same idea in the other direction: each case supplies UTF-16 code units
// (utf16), an input_offset and the expected output_offset.
45 struct UTF16ToUTF8Case
{
49 } utf16_to_utf8_cases
[] = {
51 // Converted to 3-byte utf-8 sequences
// Offset 3 is past the end of the two-unit string and expects kNpos; offset 2
// (the end of the string) expects 6.
52 {{0x5909, 0x63DB}, 3, kNpos
},
53 {{0x5909, 0x63DB}, 2, 6},
54 {{0x5909, 0x63DB}, 1, 3},
55 {{0x5909, 0x63DB}, 0, 0},
56 // Converted to 2-byte utf-8 sequences
57 {{'A', 0x00bc, 0x00be, 'z'}, 1, 1},
58 {{'A', 0x00bc, 0x00be, 'z'}, 2, 3},
59 {{'A', 0x00bc, 0x00be, 'z'}, 3, 5},
60 {{'A', 0x00bc, 0x00be, 'z'}, 4, 6},
// Surrogate pair 0xd800 0xdf00: these rows expect the pair to occupy four
// output bytes (offset 3 -> 5) and the offset between its two halves to
// become kNpos.
62 {{'A', 0xd800, 0xdf00, 'z'}, 1, 1},
63 {{'A', 0xd800, 0xdf00, 'z'}, 2, kNpos
},
64 {{'A', 0xd800, 0xdf00, 'z'}, 3, 5},
65 {{'A', 0xd800, 0xdf00, 'z'}, 4, 6},
67 for (size_t i
= 0; i
< ARRAYSIZE_UNSAFE(utf16_to_utf8_cases
); ++i
) {
68 size_t offset
= utf16_to_utf8_cases
[i
].input_offset
;
69 std::vector
<size_t> offsets
;
70 offsets
.push_back(offset
);
71 UTF16ToUTF8AndAdjustOffsets(utf16_to_utf8_cases
[i
].utf16
, &offsets
);
// The case index is streamed into the failure message to identify the row.
72 EXPECT_EQ(utf16_to_utf8_cases
[i
].output_offset
, offsets
[0]) << i
;
// Applies LimitOffset<string16>(kLimit) to a vector of offsets, in two
// different orderings, and checks how many entries are tallied in
// unlimited_count afterwards.
76 TEST(UTFOffsetStringConversionsTest
, LimitOffsets
) {
77 const size_t kLimit
= 10;
78 const size_t kItems
= 20;
79 std::vector
<size_t> size_ts
;
80 for (size_t t
= 0; t
< kItems
; ++t
)
// Apply the LimitOffset<string16>(kLimit) functor to every element of size_ts.
82 std::for_each(size_ts
.begin(), size_ts
.end(),
83 LimitOffset
<string16
>(kLimit
));
84 size_t unlimited_count
= 0;
85 for (std::vector
<size_t>::iterator ti
= size_ts
.begin(); ti
!= size_ts
.end();
// With kLimit = 10, eleven entries are expected to be tallied.
// NOTE(review): presumably these are the offsets 0 through kLimit inclusive;
// confirm against LimitOffset.
90 EXPECT_EQ(11U, unlimited_count
);
92 // Reverse the values in the vector and try again.
// This pass pushes kItems - 1 down to 0, i.e. descending order.
94 for (size_t t
= kItems
; t
> 0; --t
)
95 size_ts
.push_back(t
- 1);
96 std::for_each(size_ts
.begin(), size_ts
.end(),
97 LimitOffset
<string16
>(kLimit
));
99 for (std::vector
<size_t>::iterator ti
= size_ts
.begin(); ti
!= size_ts
.end();
// The same count is expected regardless of element order.
104 EXPECT_EQ(11U, unlimited_count
);
// Feeds every offset of an imagined original string through
// OffsetAdjuster::AdjustOffsets and compares the result, element by element,
// with a hand-written expectation array.
107 TEST(UTFOffsetStringConversionsTest
, AdjustOffsets
) {
108 // Imagine we have strings as shown in the following cases where the
109 // X's represent encoded characters.
110 // 1: abcXXXdef ==> abcXdef
// Offsets 0..9 cover every position of the 9-character input plus its end.
112 std::vector
<size_t> offsets
;
113 for (size_t t
= 0; t
<= 9; ++t
)
114 offsets
.push_back(t
);
115 OffsetAdjuster::Adjustments adjustments
;
// NOTE(review): the arguments appear to be (original offset, original length,
// output length), which matches the "abcXXXdef ==> abcXdef" picture; confirm
// against the OffsetAdjuster::Adjustment constructor.
116 adjustments
.push_back(OffsetAdjuster::Adjustment(3, 3, 1));
117 OffsetAdjuster::AdjustOffsets(adjustments
, &offsets
);
// Offsets 4 and 5, inside the collapsed XXX run, are expected to become kNpos;
// offsets after the run are expected to shift down by 2.
118 size_t expected_1
[] = {0, 1, 2, 3, kNpos
, kNpos
, 4, 5, 6, 7};
// Sizes are compared first, then each element individually.
119 EXPECT_EQ(offsets
.size(), arraysize(expected_1
));
120 for (size_t i
= 0; i
< arraysize(expected_1
); ++i
)
121 EXPECT_EQ(expected_1
[i
], offsets
[i
]);
124 // 2: XXXaXXXXbcXXXXXXXdefXXX ==> XaXXbcXXXXdefX
// Offsets 0..23 cover the 23-character input plus its end; four adjustments
// are registered, one per X run.
126 std::vector
<size_t> offsets
;
127 for (size_t t
= 0; t
<= 23; ++t
)
128 offsets
.push_back(t
);
129 OffsetAdjuster::Adjustments adjustments
;
130 adjustments
.push_back(OffsetAdjuster::Adjustment(0, 3, 1));
131 adjustments
.push_back(OffsetAdjuster::Adjustment(4, 4, 2));
132 adjustments
.push_back(OffsetAdjuster::Adjustment(10, 7, 4));
133 adjustments
.push_back(OffsetAdjuster::Adjustment(20, 3, 1));
134 OffsetAdjuster::AdjustOffsets(adjustments
, &offsets
);
// The first offset of each run keeps a real value; the remaining offsets of
// the run are expected to be kNpos.
135 size_t expected_2
[] = {
136 0, kNpos
, kNpos
, 1, 2, kNpos
, kNpos
, kNpos
, 4, 5, 6, kNpos
, kNpos
, kNpos
,
137 kNpos
, kNpos
, kNpos
, 10, 11, 12, 13, kNpos
, kNpos
, 14
139 EXPECT_EQ(offsets
.size(), arraysize(expected_2
));
140 for (size_t i
= 0; i
< arraysize(expected_2
); ++i
)
141 EXPECT_EQ(expected_2
[i
], offsets
[i
]);
144 // 3: XXXaXXXXbcdXXXeXX ==> aXXXXbcdXXXe
// This case mixes runs that disappear entirely (output length 0) with runs
// whose output length equals their original length.
146 std::vector
<size_t> offsets
;
147 for (size_t t
= 0; t
<= 17; ++t
)
148 offsets
.push_back(t
);
149 OffsetAdjuster::Adjustments adjustments
;
150 adjustments
.push_back(OffsetAdjuster::Adjustment(0, 3, 0));
151 adjustments
.push_back(OffsetAdjuster::Adjustment(4, 4, 4));
152 adjustments
.push_back(OffsetAdjuster::Adjustment(11, 3, 3));
153 adjustments
.push_back(OffsetAdjuster::Adjustment(15, 2, 0));
154 OffsetAdjuster::AdjustOffsets(adjustments
, &offsets
);
// Even for the same-length runs (4 -> 4 and 3 -> 3) the offsets strictly
// inside the run are expected to be kNpos.
155 size_t expected_3
[] = {
156 0, kNpos
, kNpos
, 0, 1, kNpos
, kNpos
, kNpos
, 5, 6, 7, 8, kNpos
, kNpos
, 11,
159 EXPECT_EQ(offsets
.size(), arraysize(expected_3
));
160 for (size_t i
= 0; i
< arraysize(expected_3
); ++i
)
161 EXPECT_EQ(expected_3
[i
], offsets
[i
]);
// Counterpart of the AdjustOffsets test: the offsets here index the adjusted
// (right-hand) string, and the expectation arrays hold the positions they
// should map back to in the original (left-hand) string.
165 TEST(UTFOffsetStringConversionsTest
, UnadjustOffsets
) {
166 // Imagine we have strings as shown in the following cases where the
167 // X's represent encoded characters.
168 // 1: abcXXXdef ==> abcXdef
// Offsets 0..7 cover every position of the 7-character adjusted string plus
// its end.
170 std::vector
<size_t> offsets
;
171 for (size_t t
= 0; t
<= 7; ++t
)
172 offsets
.push_back(t
);
173 OffsetAdjuster::Adjustments adjustments
;
174 adjustments
.push_back(OffsetAdjuster::Adjustment(3, 3, 1));
175 OffsetAdjuster::UnadjustOffsets(adjustments
, &offsets
);
// Offset 4 (just past the single X) is expected to map to 6, skipping the
// original positions 4 and 5 inside the XXX run.
176 size_t expected_1
[] = {0, 1, 2, 3, 6, 7, 8, 9};
// Sizes are compared first, then each element individually.
177 EXPECT_EQ(offsets
.size(), arraysize(expected_1
));
178 for (size_t i
= 0; i
< arraysize(expected_1
); ++i
)
179 EXPECT_EQ(expected_1
[i
], offsets
[i
]);
182 // 2: XXXaXXXXbcXXXXXXXdefXXX ==> XaXXbcXXXXdefX
// Offsets 0..14 cover the 14-character adjusted string plus its end.
184 std::vector
<size_t> offsets
;
185 for (size_t t
= 0; t
<= 14; ++t
)
186 offsets
.push_back(t
);
187 OffsetAdjuster::Adjustments adjustments
;
188 adjustments
.push_back(OffsetAdjuster::Adjustment(0, 3, 1));
189 adjustments
.push_back(OffsetAdjuster::Adjustment(4, 4, 2));
190 adjustments
.push_back(OffsetAdjuster::Adjustment(10, 7, 4));
191 adjustments
.push_back(OffsetAdjuster::Adjustment(20, 3, 1));
192 OffsetAdjuster::UnadjustOffsets(adjustments
, &offsets
);
// Offsets that fall strictly inside a multi-character output run (3, and
// 7 through 9) are expected to be kNpos.
193 size_t expected_2
[] = {
194 0, 3, 4, kNpos
, 8, 9, 10, kNpos
, kNpos
, kNpos
, 17, 18, 19, 20, 23
196 EXPECT_EQ(offsets
.size(), arraysize(expected_2
));
197 for (size_t i
= 0; i
< arraysize(expected_2
); ++i
)
198 EXPECT_EQ(expected_2
[i
], offsets
[i
]);
201 // 3: XXXaXXXXbcdXXXeXX ==> aXXXXbcdXXXe
// Offsets 0..12 cover the 12-character adjusted string plus its end.
203 std::vector
<size_t> offsets
;
204 for (size_t t
= 0; t
<= 12; ++t
)
205 offsets
.push_back(t
);
206 OffsetAdjuster::Adjustments adjustments
;
207 adjustments
.push_back(OffsetAdjuster::Adjustment(0, 3, 0));
208 adjustments
.push_back(OffsetAdjuster::Adjustment(4, 4, 4));
209 adjustments
.push_back(OffsetAdjuster::Adjustment(11, 3, 3));
210 adjustments
.push_back(OffsetAdjuster::Adjustment(15, 2, 0));
211 OffsetAdjuster::UnadjustOffsets(adjustments
, &offsets
);
// The first and last entries sit next to runs that were removed entirely, so
// either edge of the removed run would be a defensible answer; the test pins
// the values listed here.
212 size_t expected_3
[] = {
213 0, // this could just as easily be 3
214 4, kNpos
, kNpos
, kNpos
, 8, 9, 10, 11, kNpos
, kNpos
, 14,
215 15 // this could just as easily be 17
217 EXPECT_EQ(offsets
.size(), arraysize(expected_3
));
218 for (size_t i
= 0; i
< arraysize(expected_3
); ++i
)
219 EXPECT_EQ(expected_3
[i
], offsets
[i
]);
223 // MergeSequentialAdjustments is used by net/base/escape.{h,cc} and
224 // net/base/net_util.{h,cc}. The two tests EscapeTest.AdjustOffset and
225 // NetUtilTest.FormatUrlWithOffsets test its behavior extensively. This
226 // is simply a short, additional test.
227 TEST(UTFOffsetStringConversionsTest
, MergeSequentialAdjustments
) {
228 // Pretend the input string is "abcdefghijklmnopqrstuvwxyz".
230 // Set up |first_adjustments| to
231 // - remove the leading "a"
232 // - combine the "bc" into one character (call it ".")
// - remove the "f"
234 // - remove the "tuv"
235 // The resulting string should be ".deghijklmnopqrswxyz".
236 OffsetAdjuster::Adjustments first_adjustments
;
237 first_adjustments
.push_back(OffsetAdjuster::Adjustment(0, 1, 0));
238 first_adjustments
.push_back(OffsetAdjuster::Adjustment(1, 2, 1));
239 first_adjustments
.push_back(OffsetAdjuster::Adjustment(5, 1, 0));
240 first_adjustments
.push_back(OffsetAdjuster::Adjustment(19, 3, 0));
242 // Set up |adjustments_on_adjusted_string| to
243 // - combine the "." character that replaced "bc" with "d" into one character
//   (call it "?")
245 // - remove the "egh"
246 // - expand the "i" into two characters (call them "12")
247 // - combine the "jkl" into one character (call it "@")
248 // - expand the "z" into two characters (call them "34")
249 // The resulting string should be "?12@mnopqrswxy34".
250 OffsetAdjuster::Adjustments adjustments_on_adjusted_string
;
251 adjustments_on_adjusted_string
.push_back(OffsetAdjuster::Adjustment(
253 adjustments_on_adjusted_string
.push_back(OffsetAdjuster::Adjustment(
255 adjustments_on_adjusted_string
.push_back(OffsetAdjuster::Adjustment(
257 adjustments_on_adjusted_string
.push_back(OffsetAdjuster::Adjustment(
259 adjustments_on_adjusted_string
.push_back(OffsetAdjuster::Adjustment(
262 // Now merge the adjustments and check the results.
// The merged result is written back into |adjustments_on_adjusted_string|,
// which is what the checks below inspect.
263 OffsetAdjuster::MergeSequentialAdjustments(first_adjustments
,
264 &adjustments_on_adjusted_string
);
265 // The merged adjustments should look like
266 // - combine abcd into "?"
267 // - note: it's also reasonable for the Merge function to instead produce
268 // two adjustments instead of this, one to remove a and another to
269 // combine bcd into "?". This test verifies the current behavior.
// - remove efgh
271 // - expand i into "12"
272 // - combine jkl into "@"
// - remove tuv
274 // - expand z into "34"
// ASSERT (not EXPECT) on the size so the indexed checks below never run
// against a shorter vector.
275 ASSERT_EQ(6u, adjustments_on_adjusted_string
.size());
// [0]: abcd -> "?"
276 EXPECT_EQ(0u, adjustments_on_adjusted_string
[0].original_offset
);
277 EXPECT_EQ(4u, adjustments_on_adjusted_string
[0].original_length
);
278 EXPECT_EQ(1u, adjustments_on_adjusted_string
[0].output_length
);
// [1]: efgh removed
279 EXPECT_EQ(4u, adjustments_on_adjusted_string
[1].original_offset
);
280 EXPECT_EQ(4u, adjustments_on_adjusted_string
[1].original_length
);
281 EXPECT_EQ(0u, adjustments_on_adjusted_string
[1].output_length
);
// [2]: i -> "12"
282 EXPECT_EQ(8u, adjustments_on_adjusted_string
[2].original_offset
);
283 EXPECT_EQ(1u, adjustments_on_adjusted_string
[2].original_length
);
284 EXPECT_EQ(2u, adjustments_on_adjusted_string
[2].output_length
);
// [3]: jkl -> "@"
285 EXPECT_EQ(9u, adjustments_on_adjusted_string
[3].original_offset
);
286 EXPECT_EQ(3u, adjustments_on_adjusted_string
[3].original_length
);
287 EXPECT_EQ(1u, adjustments_on_adjusted_string
[3].output_length
);
// [4]: tuv removed
288 EXPECT_EQ(19u, adjustments_on_adjusted_string
[4].original_offset
);
289 EXPECT_EQ(3u, adjustments_on_adjusted_string
[4].original_length
);
290 EXPECT_EQ(0u, adjustments_on_adjusted_string
[4].output_length
);
// [5]: z -> "34"
291 EXPECT_EQ(25u, adjustments_on_adjusted_string
[5].original_offset
);
292 EXPECT_EQ(1u, adjustments_on_adjusted_string
[5].original_length
);
293 EXPECT_EQ(2u, adjustments_on_adjusted_string
[5].output_length
);