1 // Copyright 2014 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 // Create a state machine for validating UTF-8. The algorithm in brief:
6 // 1. Convert the complete unicode range of code points, except for the
7 // surrogate code points, to an ordered array of sequences of bytes in
9 // 2. Convert individual bytes to ranges, starting from the right of each byte
10 // sequence. For each range, ensure the bytes on the left and the ranges
11 // on the right are the identical.
12 // 3. Convert the resulting list of ranges into a state machine, collapsing
14 // 4. Convert the state machine to an array of bytes.
15 // 5. Output as a C++ file.
18 // $ ninja -C out/Release build_utf8_validator_tables
19 // $ out/Release/build_utf8_validator_tables
20 // --output=base/i18n/utf8_validator_tables.cc
21 // $ git add base/i18n/utf8_validator_tables.cc
23 // Because the table is not expected to ever change, it is checked into the
24 // repository rather than being regenerated at build time.
26 // This code uses type uint8 throughout to represent bytes, to avoid
27 // signed/unsigned char confusion.
38 #include "base/basictypes.h"
39 #include "base/command_line.h"
40 #include "base/file_util.h"
41 #include "base/files/file_path.h"
42 #include "base/logging.h"
43 #include "base/numerics/safe_conversions.h"
44 #include "base/strings/stringprintf.h"
45 #include "third_party/icu/source/common/unicode/utf8.h"
49 const char kHelpText
[] =
50 "Usage: build_utf8_validator_tables [ --help ] [ --output=<file> ]\n";
52 const char kProlog
[] =
53 "// Copyright 2013 The Chromium Authors. All rights reserved.\n"
54 "// Use of this source code is governed by a BSD-style license that can "
56 "// found in the LICENSE file.\n"
58 "// This file is auto-generated by build_utf8_validator_tables.\n"
61 "#include \"base/i18n/utf8_validator_tables.h\"\n"
64 "namespace internal {\n"
66 "const uint8 kUtf8ValidatorTables[] = {\n";
68 const char kEpilog
[] =
71 "const size_t kUtf8ValidatorTablesSize = arraysize(kUtf8ValidatorTables);\n"
73 "} // namespace internal\n"
74 "} // namespace base\n";
76 // Ranges are inclusive at both ends--they represent [from, to]
79 // Ranges always start with just one byte.
80 explicit Range(uint8 value
) : from_(value
), to_(value
) {}
82 // Range objects are copyable and assignable to be used in STL
83 // containers. Since they only contain non-pointer POD types, the default copy
84 // constructor, assignment operator and destructor will work.
86 // Add a byte to the range. We intentionally only support adding a byte at the
87 // end, since that is the only operation the code needs.
88 void AddByte(uint8 to
) {
93 uint8
from() const { return from_
; }
94 uint8
to() const { return to_
; }
96 bool operator<(const Range
& rhs
) const {
97 return (from() < rhs
.from() || (from() == rhs
.from() && to() < rhs
.to()));
100 bool operator==(const Range
& rhs
) const {
101 return from() == rhs
.from() && to() == rhs
.to();
109 // A vector of Ranges is like a simple regular expression--it corresponds to
110 // a set of strings of the same length that have bytes in each position in
111 // the appropriate range.
112 typedef std::vector
<Range
> StringSet
;
114 // A UTF-8 "character" is represented by a sequence of bytes.
115 typedef std::vector
<uint8
> Character
;
117 // In the second stage of the algorithm, we want to convert a large list of
118 // Characters into a small list of StringSets.
124 typedef std::vector
<Pair
> PairVector
;
126 // A class to print a table of numbers in the same style as clang-format.
129 explicit TablePrinter(FILE* stream
)
130 : stream_(stream
), values_on_this_line_(0), current_offset_(0) {}
132 void PrintValue(uint8 value
) {
133 if (values_on_this_line_
== 0) {
135 } else if (values_on_this_line_
== kMaxValuesPerLine
) {
136 fprintf(stream_
, " // 0x%02x\n ", current_offset_
);
137 values_on_this_line_
= 0;
139 fprintf(stream_
, " 0x%02x,", static_cast<int>(value
));
140 ++values_on_this_line_
;
145 while (values_on_this_line_
< kMaxValuesPerLine
) {
147 ++values_on_this_line_
;
149 fprintf(stream_
, " // 0x%02x\n", current_offset_
);
150 values_on_this_line_
= 0;
154 // stdio stream. Not owned.
157 // Number of values so far printed on this line.
158 int values_on_this_line_
;
160 // Total values printed so far.
163 static const int kMaxValuesPerLine
= 8;
165 DISALLOW_COPY_AND_ASSIGN(TablePrinter
);
168 // Start by filling a PairVector with characters. The resulting vector goes from
169 // "\x00" to "\xf4\x8f\xbf\xbf".
170 PairVector
InitializeCharacters() {
172 for (int i
= 0; i
<= 0x10FFFF; ++i
) {
173 if (i
>= 0xD800 && i
< 0xE000) {
174 // Surrogate codepoints are not permitted. Non-character code points are
175 // explicitly permitted.
179 unsigned int offset
= 0;
180 UBool is_error
= false;
181 U8_APPEND(bytes
, offset
, arraysize(bytes
), i
, is_error
);
183 DCHECK_GT(offset
, 0u);
184 DCHECK_LE(offset
, arraysize(bytes
));
185 Pair pair
= {Character(bytes
, bytes
+ offset
), StringSet()};
186 vector
.push_back(pair
);
191 // Construct a new Pair from |character| and the concatenation of |new_range|
192 // and |existing_set|, and append it to |pairs|.
193 void ConstructPairAndAppend(const Character
& character
,
194 const Range
& new_range
,
195 const StringSet
& existing_set
,
197 Pair new_pair
= {character
, StringSet(1, new_range
)};
199 new_pair
.set
.end(), existing_set
.begin(), existing_set
.end());
200 pairs
->push_back(new_pair
);
203 // Each pass over the PairVector strips one byte off the right-hand-side of the
204 // characters and adds a range to the set on the right. For example, the first
205 // pass converts the range from "\xe0\xa0\x80" to "\xe0\xa0\xbf" to ("\xe0\xa0",
206 // [\x80-\xbf]), then the second pass converts the range from ("\xe0\xa0",
207 // [\x80-\xbf]) to ("\xe0\xbf", [\x80-\xbf]) to ("\xe0",
208 // [\xa0-\xbf][\x80-\xbf]).
209 void MoveRightMostCharToSet(PairVector
* pairs
) {
210 PairVector new_pairs
;
211 PairVector::const_iterator it
= pairs
->begin();
212 while (it
!= pairs
->end() && it
->character
.empty()) {
213 new_pairs
.push_back(*it
);
216 CHECK(it
!= pairs
->end());
217 Character
unconverted_bytes(it
->character
.begin(), it
->character
.end() - 1);
218 Range
new_range(it
->character
.back());
219 StringSet converted
= it
->set
;
221 while (it
!= pairs
->end()) {
222 const Pair
& current_pair
= *it
++;
223 if (current_pair
.character
.size() == unconverted_bytes
.size() + 1 &&
224 std::equal(unconverted_bytes
.begin(),
225 unconverted_bytes
.end(),
226 current_pair
.character
.begin()) &&
227 converted
== current_pair
.set
) {
228 // The particular set of UTF-8 codepoints we are validating guarantees
229 // that each byte range will be contiguous. This would not necessarily be
230 // true for an arbitrary set of UTF-8 codepoints.
231 DCHECK_EQ(new_range
.to() + 1, current_pair
.character
.back());
232 new_range
.AddByte(current_pair
.character
.back());
235 ConstructPairAndAppend(unconverted_bytes
, new_range
, converted
, &new_pairs
);
236 unconverted_bytes
= Character(current_pair
.character
.begin(),
237 current_pair
.character
.end() - 1);
238 new_range
= Range(current_pair
.character
.back());
239 converted
= current_pair
.set
;
241 ConstructPairAndAppend(unconverted_bytes
, new_range
, converted
, &new_pairs
);
242 new_pairs
.swap(*pairs
);
245 void MoveAllCharsToSets(PairVector
* pairs
) {
246 // Since each pass of the function moves one character, and UTF-8 sequences
247 // are at most 4 characters long, this simply runs the algorithm four times.
248 for (int i
= 0; i
< 4; ++i
) {
249 MoveRightMostCharToSet(pairs
);
252 for (PairVector::const_iterator it
= pairs
->begin(); it
!= pairs
->end();
254 DCHECK(it
->character
.empty());
259 // Logs the generated string sets in regular-expression style, ie. [\x00-\x7f],
260 // [\xc2-\xdf][\x80-\xbf], etc. This can be a useful sanity-check that the
261 // algorithm is working. Use the command-line option
262 // --vmodule=build_utf8_validator_tables=1 to see this output.
263 void LogStringSets(const PairVector
& pairs
) {
264 for (PairVector::const_iterator pair_it
= pairs
.begin();
265 pair_it
!= pairs
.end();
267 std::string set_as_string
;
268 for (StringSet::const_iterator set_it
= pair_it
->set
.begin();
269 set_it
!= pair_it
->set
.end();
271 set_as_string
+= base::StringPrintf("[\\x%02x-\\x%02x]",
272 static_cast<int>(set_it
->from()),
273 static_cast<int>(set_it
->to()));
275 VLOG(1) << set_as_string
;
279 // A single state in the state machine is represented by a sorted vector of
280 // start bytes and target states. All input bytes in the range between the start
281 // byte and the next entry in the vector (or 0xFF) result in a transition to the
288 typedef std::vector
<StateRange
> State
;
290 // Generates a state where all bytes go to state 1 (invalid). This is also used
291 // as an initialiser for other states (since bytes from outside the desired
292 // range are invalid).
293 State
GenerateInvalidState() {
294 const StateRange range
= {0, 1};
295 return State(1, range
);
298 // A map from a state (ie. a set of strings which will match from this state) to
299 // a number (which is an index into the array of states).
300 typedef std::map
<StringSet
, uint8
> StateMap
;
302 // Create a new state corresponding to |set|, add it |states| and |state_map|
303 // and return the index it was given in |states|.
304 uint8
MakeState(const StringSet
& set
,
305 std::vector
<State
>* states
,
306 StateMap
* state_map
) {
307 DCHECK(!set
.empty());
308 const Range
& range
= set
.front();
309 const StringSet
rest(set
.begin() + 1, set
.end());
310 const StateMap::const_iterator where
= state_map
->find(rest
);
311 const uint8 target_state
= where
== state_map
->end()
312 ? MakeState(rest
, states
, state_map
)
314 DCHECK_LT(0, range
.from());
315 DCHECK_LT(range
.to(), 0xFF);
316 const StateRange new_state_initializer
[] = {
317 {0, 1}, {range
.from(), target_state
},
318 {static_cast<uint8
>(range
.to() + 1), 1}};
320 State(new_state_initializer
,
321 new_state_initializer
+ arraysize(new_state_initializer
)));
322 const uint8 new_state_number
=
323 base::checked_cast
<uint8
>(states
->size() - 1);
324 CHECK(state_map
->insert(std::make_pair(set
, new_state_number
)).second
);
325 return new_state_number
;
328 std::vector
<State
> GenerateStates(const PairVector
& pairs
) {
329 // States 0 and 1 are the initial/valid state and invalid state, respectively.
330 std::vector
<State
> states(2, GenerateInvalidState());
332 state_map
.insert(std::make_pair(StringSet(), 0));
333 for (PairVector::const_iterator it
= pairs
.begin(); it
!= pairs
.end(); ++it
) {
334 DCHECK(it
->character
.empty());
335 DCHECK(!it
->set
.empty());
336 const Range
& range
= it
->set
.front();
337 const StringSet
rest(it
->set
.begin() + 1, it
->set
.end());
338 const StateMap::const_iterator where
= state_map
.find(rest
);
339 const uint8 target_state
= where
== state_map
.end()
340 ? MakeState(rest
, &states
, &state_map
)
342 if (states
[0].back().from
== range
.from()) {
343 DCHECK_EQ(1, states
[0].back().target_state
);
344 states
[0].back().target_state
= target_state
;
345 DCHECK_LT(range
.to(), 0xFF);
346 const StateRange new_range
= {static_cast<uint8
>(range
.to() + 1), 1};
347 states
[0].push_back(new_range
);
349 DCHECK_LT(range
.to(), 0xFF);
350 const StateRange new_range_initializer
[] = {{range
.from(), target_state
},
351 {static_cast<uint8
>(range
.to() + 1), 1}};
353 .insert(states
[0].end(),
354 new_range_initializer
,
355 new_range_initializer
+ arraysize(new_range_initializer
));
361 // Output the generated states as a C++ table. Two tricks are used to compact
362 // the table: each state in the table starts with a shift value which indicates
363 // how many bits we can discard from the right-hand-side of the byte before
364 // doing the table lookup. Secondly, only the state-transitions for bytes
365 // with the top-bit set are included in the table; bytes without the top-bit set
366 // are just ASCII and are handled directly by the code.
367 void PrintStates(const std::vector
<State
>& states
, FILE* stream
) {
368 // First calculate the start-offset of each state. This allows the state
369 // machine to jump directly to the correct offset, avoiding an extra
370 // indirection. State 0 starts at offset 0.
371 std::vector
<uint8
> state_offset(1, 0);
372 std::vector
<uint8
> shifts
;
375 for (std::vector
<State
>::const_iterator state_it
= states
.begin();
376 state_it
!= states
.end();
378 // We want to set |shift| to the (0-based) index of the least-significant
379 // set bit in any of the ranges for this state, since this tells us how many
380 // bits we can discard and still determine what range a byte lies in. Sadly
381 // it appears that ffs() is not portable, so we do it clumsily.
383 for (State::const_iterator range_it
= state_it
->begin();
384 range_it
!= state_it
->end();
386 while (shift
> 0 && range_it
->from
% (1 << shift
) != 0) {
390 shifts
.push_back(shift
);
391 pos
+= 1 + (1 << (7 - shift
));
392 state_offset
.push_back(pos
);
395 DCHECK_EQ(129, state_offset
[1]);
397 fputs(kProlog
, stream
);
398 TablePrinter
table_printer(stream
);
400 for (uint8 state_index
= 0; state_index
< states
.size(); ++state_index
) {
401 const uint8 shift
= shifts
[state_index
];
402 uint8 next_range
= 0;
403 uint8 target_state
= 1;
405 " // State %d, offset 0x%02x\n",
406 static_cast<int>(state_index
),
407 static_cast<int>(state_offset
[state_index
]));
408 table_printer
.PrintValue(shift
);
409 for (int i
= 0; i
< 0x100; i
+= (1 << shift
)) {
410 if (next_range
< states
[state_index
].size() &&
411 states
[state_index
][next_range
].from
== i
) {
412 target_state
= states
[state_index
][next_range
].target_state
;
416 table_printer
.PrintValue(state_offset
[target_state
]);
419 table_printer
.NewLine();
422 fputs(kEpilog
, stream
);
427 int main(int argc
, char* argv
[]) {
428 CommandLine::Init(argc
, argv
);
429 logging::LoggingSettings settings
;
430 settings
.logging_dest
= logging::LOG_TO_SYSTEM_DEBUG_LOG
;
431 logging::InitLogging(settings
);
432 if (CommandLine::ForCurrentProcess()->HasSwitch("help")) {
433 fwrite(kHelpText
, 1, arraysize(kHelpText
), stdout
);
436 base::FilePath filename
=
437 CommandLine::ForCurrentProcess()->GetSwitchValuePath("output");
439 FILE* output
= stdout
;
440 if (!filename
.empty()) {
441 output
= base::OpenFile(filename
, "wb");
443 PLOG(FATAL
) << "Couldn't open '" << filename
.AsUTF8Unsafe()
447 // Step 1: Enumerate the characters
448 PairVector pairs
= InitializeCharacters();
449 // Step 2: Convert to sets.
450 MoveAllCharsToSets(&pairs
);
452 LogStringSets(pairs
);
454 // Step 3: Generate states.
455 std::vector
<State
> states
= GenerateStates(pairs
);
456 // Step 4/5: Print output
457 PrintStates(states
, output
);
459 if (!filename
.empty()) {
460 if (!base::CloseFile(output
))
461 PLOG(FATAL
) << "Couldn't finish writing '" << filename
.AsUTF8Unsafe()