Roll src/third_party/WebKit d9c6159:8139f33 (svn 201974:201975)
[chromium-blink-merge.git] / base / i18n / streaming_utf8_validator_unittest.cc
blob20ea564c0322a68d9ffc39b6a3911f4a0feafcd2
1 // Copyright 2014 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "base/i18n/streaming_utf8_validator.h"
7 #include <stdio.h>
8 #include <string.h>
10 #include <string>
12 #include "base/strings/string_piece.h"
13 #include "testing/gtest/include/gtest/gtest.h"
15 // Define BASE_I18N_UTF8_VALIDATOR_THOROUGH_TEST to verify that this class
16 // accepts exactly the same set of 4-byte strings as ICU-based validation. This
17 // tests every possible 4-byte string, so it is too slow to run routinely on
18 // low-powered machines.
20 // #define BASE_I18N_UTF8_VALIDATOR_THOROUGH_TEST
22 #ifdef BASE_I18N_UTF8_VALIDATOR_THOROUGH_TEST
24 #include "base/basictypes.h"
25 #include "base/bind.h"
26 #include "base/location.h"
27 #include "base/logging.h"
28 #include "base/memory/ref_counted.h"
29 #include "base/strings/string_util.h"
30 #include "base/strings/stringprintf.h"
31 #include "base/strings/utf_string_conversion_utils.h"
32 #include "base/synchronization/condition_variable.h"
33 #include "base/synchronization/lock.h"
34 #include "base/threading/sequenced_worker_pool.h"
35 #include "third_party/icu/source/common/unicode/utf8.h"
37 #endif // BASE_I18N_UTF8_VALIDATOR_THOROUGH_TEST
39 namespace base {
40 namespace {
42 // Avoid having to qualify the enum values in the tests.
43 const StreamingUtf8Validator::State VALID_ENDPOINT =
44 StreamingUtf8Validator::VALID_ENDPOINT;
45 const StreamingUtf8Validator::State VALID_MIDPOINT =
46 StreamingUtf8Validator::VALID_MIDPOINT;
47 const StreamingUtf8Validator::State INVALID = StreamingUtf8Validator::INVALID;
49 #ifdef BASE_I18N_UTF8_VALIDATOR_THOROUGH_TEST
51 const uint32 kThoroughTestChunkSize = 1 << 24;
53 class StreamingUtf8ValidatorThoroughTest : public ::testing::Test {
54 protected:
55 StreamingUtf8ValidatorThoroughTest()
56 : all_done_(&lock_), tasks_dispatched_(0), tasks_finished_(0) {}
58 // This uses the same logic as base::IsStringUTF8 except it considers
59 // non-characters valid (and doesn't require a string as input).
60 static bool IsStringUtf8(const char* src, int32 src_len) {
61 int32 char_index = 0;
63 while (char_index < src_len) {
64 int32 code_point;
65 U8_NEXT(src, char_index, src_len, code_point);
66 if (!base::IsValidCodepoint(code_point))
67 return false;
69 return true;
72 // Converts the passed-in integer to a 4 byte string and then
73 // verifies that IsStringUtf8 and StreamingUtf8Validator agree on
74 // whether it is valid UTF-8 or not.
75 void TestNumber(uint32 n) const {
76 char test[sizeof n];
77 memcpy(test, &n, sizeof n);
78 StreamingUtf8Validator validator;
79 EXPECT_EQ(IsStringUtf8(test, sizeof n),
80 validator.AddBytes(test, sizeof n) == VALID_ENDPOINT)
81 << "Difference of opinion for \""
82 << base::StringPrintf("\\x%02X\\x%02X\\x%02X\\x%02X",
83 test[0] & 0xFF,
84 test[1] & 0xFF,
85 test[2] & 0xFF,
86 test[3] & 0xFF) << "\"";
89 public:
90 // Tests the 4-byte sequences corresponding to the |size| integers
91 // starting at |begin|. This is intended to be run from a worker
92 // pool. Signals |all_done_| at the end if it thinks all tasks are
93 // finished.
94 void TestRange(uint32 begin, uint32 size) {
95 for (uint32 i = 0; i < size; ++i) {
96 TestNumber(begin + i);
98 base::AutoLock al(lock_);
99 ++tasks_finished_;
100 LOG(INFO) << tasks_finished_ << " / " << tasks_dispatched_
101 << " tasks done\n";
102 if (tasks_finished_ >= tasks_dispatched_) {
103 all_done_.Signal();
107 protected:
108 base::Lock lock_;
109 base::ConditionVariable all_done_;
110 int tasks_dispatched_;
111 int tasks_finished_;
114 TEST_F(StreamingUtf8ValidatorThoroughTest, TestEverything) {
115 scoped_refptr<base::SequencedWorkerPool> pool =
116 new base::SequencedWorkerPool(32, "TestEverything");
117 base::AutoLock al(lock_);
118 uint32 begin = 0;
119 do {
120 pool->PostWorkerTask(
121 FROM_HERE,
122 base::Bind(&StreamingUtf8ValidatorThoroughTest::TestRange,
123 base::Unretained(this),
124 begin,
125 kThoroughTestChunkSize));
126 ++tasks_dispatched_;
127 begin += kThoroughTestChunkSize;
128 } while (begin != 0);
129 while (tasks_finished_ < tasks_dispatched_)
130 all_done_.Wait();
133 #endif // BASE_I18N_UTF8_VALIDATOR_THOROUGH_TEST
135 // These valid and invalid UTF-8 sequences are based on the tests from
136 // base/strings/string_util_unittest.cc
// Every string in |valid| must encode exactly one code point, because the
// partial-sequence tests are built by taking non-empty prefixes of these
// strings.
const char* const valid[] = {
    "\r",
    "\n",
    "a",
    "\xc2\x81",
    "\xe1\x80\xbf",
    "\xf1\x80\xa0\xbf",
    "\xef\xbb\xbf",  // UTF-8 BOM
};
146 const char* const* const valid_end = valid + arraysize(valid);
// Byte sequences that the validator is expected to reject (state INVALID).
//
// Each entry must be followed by a comma: adjacent string literals are
// concatenated by the compiler, so a missing comma silently merges two test
// cases into one and shrinks the array.
const char* const invalid[] = {
    // always invalid bytes
    "\xc0", "\xc1",
    "\xf5", "\xf6", "\xf7",
    "\xf8", "\xf9", "\xfa", "\xfb", "\xfc", "\xfd", "\xfe", "\xff",
    // surrogate code points
    "\xed\xa0\x80",  // U+D800
    "\xed\xa0\x8f",  // U+D80F
    "\xed\xbf\xbf",  // U+DFFF

    // overlong sequences
    "\xc0\x80",                  // U+0000
    "\xc1\x80",                  // "A"
    "\xc1\x81",                  // "B"
    "\xe0\x80\x80",              // U+0000
    "\xe0\x82\x80",              // U+0080
    "\xe0\x9f\xbf",              // U+07ff
    "\xf0\x80\x80\x8D",          // U+000D
    "\xf0\x80\x82\x91",          // U+0091
    "\xf0\x80\xa0\x80",          // U+0800
    "\xf0\x8f\xbb\xbf",          // U+FEFF (BOM)
    "\xf8\x80\x80\x80\xbf",      // U+003F
    "\xfc\x80\x80\x80\xa0\xa5",

    // Beyond U+10FFFF
    "\xf4\x90\x80\x80",          // U+110000
    "\xf8\xa0\xbf\x80\xbf",      // 5 bytes
    "\xfc\x9c\xbf\x80\xbf\x80",  // 6 bytes

    // BOMs in UTF-16(BE|LE)
    "\xfe\xff", "\xff\xfe",
};
179 const char* const* const invalid_end = invalid + arraysize(invalid);
181 // A ForwardIterator which returns all the non-empty prefixes of the elements of
182 // "valid".
183 class PartialIterator {
184 public:
185 // The constructor returns the first iterator, ie. it is equivalent to
186 // begin().
187 PartialIterator() : index_(0), prefix_length_(0) { Advance(); }
188 // The trivial destructor left intentionally undefined.
189 // This is a value type; the default copy constructor and assignment operator
190 // generated by the compiler are used.
192 static PartialIterator end() { return PartialIterator(arraysize(valid), 1); }
194 PartialIterator& operator++() {
195 Advance();
196 return *this;
199 base::StringPiece operator*() const {
200 return base::StringPiece(valid[index_], prefix_length_);
203 bool operator==(const PartialIterator& rhs) const {
204 return index_ == rhs.index_ && prefix_length_ == rhs.prefix_length_;
207 bool operator!=(const PartialIterator& rhs) const { return !(rhs == *this); }
209 private:
210 // This constructor is used by the end() method.
211 PartialIterator(size_t index, size_t prefix_length)
212 : index_(index), prefix_length_(prefix_length) {}
214 void Advance() {
215 if (index_ < arraysize(valid) && prefix_length_ < strlen(valid[index_]))
216 ++prefix_length_;
217 while (index_ < arraysize(valid) &&
218 prefix_length_ == strlen(valid[index_])) {
219 ++index_;
220 prefix_length_ = 1;
224 // The UTF-8 sequence, as an offset into the |valid| array.
225 size_t index_;
226 size_t prefix_length_;
229 // A test fixture for tests which test one UTF-8 sequence (or invalid
230 // byte sequence) at a time.
231 class StreamingUtf8ValidatorSingleSequenceTest : public ::testing::Test {
232 protected:
233 // Iterator must be convertible when de-referenced to StringPiece.
234 template <typename Iterator>
235 void CheckRange(Iterator begin,
236 Iterator end,
237 StreamingUtf8Validator::State expected) {
238 for (Iterator it = begin; it != end; ++it) {
239 StreamingUtf8Validator validator;
240 base::StringPiece sequence = *it;
241 EXPECT_EQ(expected,
242 validator.AddBytes(sequence.data(), sequence.size()))
243 << "Failed for \"" << sequence << "\"";
247 // Adding input a byte at a time should make absolutely no difference.
248 template <typename Iterator>
249 void CheckRangeByteAtATime(Iterator begin,
250 Iterator end,
251 StreamingUtf8Validator::State expected) {
252 for (Iterator it = begin; it != end; ++it) {
253 StreamingUtf8Validator validator;
254 base::StringPiece sequence = *it;
255 StreamingUtf8Validator::State state = VALID_ENDPOINT;
256 for (base::StringPiece::const_iterator cit = sequence.begin();
257 cit != sequence.end();
258 ++cit) {
259 state = validator.AddBytes(&*cit, 1);
261 EXPECT_EQ(expected, state) << "Failed for \"" << sequence << "\"";
266 // A test fixture for tests which test the concatenation of byte sequences.
267 class StreamingUtf8ValidatorDoubleSequenceTest : public ::testing::Test {
268 protected:
269 // Check every possible concatenation of byte sequences from two
270 // ranges, and verify that the combination matches the expected
271 // state.
272 template <typename Iterator1, typename Iterator2>
273 void CheckCombinations(Iterator1 begin1,
274 Iterator1 end1,
275 Iterator2 begin2,
276 Iterator2 end2,
277 StreamingUtf8Validator::State expected) {
278 StreamingUtf8Validator validator;
279 for (Iterator1 it1 = begin1; it1 != end1; ++it1) {
280 base::StringPiece c1 = *it1;
281 for (Iterator2 it2 = begin2; it2 != end2; ++it2) {
282 base::StringPiece c2 = *it2;
283 validator.AddBytes(c1.data(), c1.size());
284 EXPECT_EQ(expected, validator.AddBytes(c2.data(), c2.size()))
285 << "Failed for \"" << c1 << c2 << "\"";
286 validator.Reset();
292 TEST(StreamingUtf8ValidatorTest, NothingIsValid) {
293 static const char kNothing[] = "";
294 EXPECT_EQ(VALID_ENDPOINT, StreamingUtf8Validator().AddBytes(kNothing, 0));
297 // Because the members of the |valid| array need to be non-zero length
298 // sequences and are measured with strlen(), |valid| cannot be used it
299 // to test the NUL character '\0', so the NUL character gets its own
300 // test.
301 TEST(StreamingUtf8ValidatorTest, NulIsValid) {
302 static const char kNul[] = "\x00";
303 EXPECT_EQ(VALID_ENDPOINT, StreamingUtf8Validator().AddBytes(kNul, 1));
306 // Just a basic sanity test before we start getting fancy.
307 TEST(StreamingUtf8ValidatorTest, HelloWorld) {
308 static const char kHelloWorld[] = "Hello, World!";
309 EXPECT_EQ(
310 VALID_ENDPOINT,
311 StreamingUtf8Validator().AddBytes(kHelloWorld, strlen(kHelloWorld)));
314 // Check that the Reset() method works.
315 TEST(StreamingUtf8ValidatorTest, ResetWorks) {
316 StreamingUtf8Validator validator;
317 EXPECT_EQ(INVALID, validator.AddBytes("\xC0", 1));
318 EXPECT_EQ(INVALID, validator.AddBytes("a", 1));
319 validator.Reset();
320 EXPECT_EQ(VALID_ENDPOINT, validator.AddBytes("a", 1));
323 TEST_F(StreamingUtf8ValidatorSingleSequenceTest, Valid) {
324 CheckRange(valid, valid_end, VALID_ENDPOINT);
327 TEST_F(StreamingUtf8ValidatorSingleSequenceTest, Partial) {
328 CheckRange(PartialIterator(), PartialIterator::end(), VALID_MIDPOINT);
331 TEST_F(StreamingUtf8ValidatorSingleSequenceTest, Invalid) {
332 CheckRange(invalid, invalid_end, INVALID);
335 TEST_F(StreamingUtf8ValidatorSingleSequenceTest, ValidByByte) {
336 CheckRangeByteAtATime(valid, valid_end, VALID_ENDPOINT);
339 TEST_F(StreamingUtf8ValidatorSingleSequenceTest, PartialByByte) {
340 CheckRangeByteAtATime(
341 PartialIterator(), PartialIterator::end(), VALID_MIDPOINT);
344 TEST_F(StreamingUtf8ValidatorSingleSequenceTest, InvalidByByte) {
345 CheckRangeByteAtATime(invalid, invalid_end, INVALID);
348 TEST_F(StreamingUtf8ValidatorDoubleSequenceTest, ValidPlusValidIsValid) {
349 CheckCombinations(valid, valid_end, valid, valid_end, VALID_ENDPOINT);
352 TEST_F(StreamingUtf8ValidatorDoubleSequenceTest, ValidPlusPartialIsPartial) {
353 CheckCombinations(valid,
354 valid_end,
355 PartialIterator(),
356 PartialIterator::end(),
357 VALID_MIDPOINT);
360 TEST_F(StreamingUtf8ValidatorDoubleSequenceTest, PartialPlusValidIsInvalid) {
361 CheckCombinations(
362 PartialIterator(), PartialIterator::end(), valid, valid_end, INVALID);
365 TEST_F(StreamingUtf8ValidatorDoubleSequenceTest, PartialPlusPartialIsInvalid) {
366 CheckCombinations(PartialIterator(),
367 PartialIterator::end(),
368 PartialIterator(),
369 PartialIterator::end(),
370 INVALID);
373 TEST_F(StreamingUtf8ValidatorDoubleSequenceTest, ValidPlusInvalidIsInvalid) {
374 CheckCombinations(valid, valid_end, invalid, invalid_end, INVALID);
377 TEST_F(StreamingUtf8ValidatorDoubleSequenceTest, InvalidPlusValidIsInvalid) {
378 CheckCombinations(invalid, invalid_end, valid, valid_end, INVALID);
381 TEST_F(StreamingUtf8ValidatorDoubleSequenceTest, InvalidPlusInvalidIsInvalid) {
382 CheckCombinations(invalid, invalid_end, invalid, invalid_end, INVALID);
385 TEST_F(StreamingUtf8ValidatorDoubleSequenceTest, InvalidPlusPartialIsInvalid) {
386 CheckCombinations(
387 invalid, invalid_end, PartialIterator(), PartialIterator::end(), INVALID);
390 TEST_F(StreamingUtf8ValidatorDoubleSequenceTest, PartialPlusInvalidIsInvalid) {
391 CheckCombinations(
392 PartialIterator(), PartialIterator::end(), invalid, invalid_end, INVALID);
395 TEST(StreamingUtf8ValidatorValidateTest, EmptyIsValid) {
396 EXPECT_TRUE(StreamingUtf8Validator::Validate(std::string()));
399 TEST(StreamingUtf8ValidatorValidateTest, SimpleValidCase) {
400 EXPECT_TRUE(StreamingUtf8Validator::Validate("\xc2\x81"));
403 TEST(StreamingUtf8ValidatorValidateTest, SimpleInvalidCase) {
404 EXPECT_FALSE(StreamingUtf8Validator::Validate("\xc0\x80"));
407 TEST(StreamingUtf8ValidatorValidateTest, TruncatedIsInvalid) {
408 EXPECT_FALSE(StreamingUtf8Validator::Validate("\xc2"));
411 } // namespace
412 } // namespace base