1 // Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #ifndef ENCODINGS_COMPACT_LANG_DET_WIN_CLD_UTF8STATETABLE_H_
6 #define ENCODINGS_COMPACT_LANG_DET_WIN_CLD_UTF8STATETABLE_H_
8 #if !defined(CLD_WINDOWS)
10 #include "util/utf8/utf8statetable.h"
14 #include "encodings/compact_lang_det/win/cld_basictypes.h"
16 // These four-byte entries compactly encode how many bytes 0..255 to delete
17 // in making a string replacement, how many bytes to add 0..255, and the offset
18 // 0..64k-1 of the replacement string in remap_string.
25 // Exit type codes for state tables. All but the first get stuffed into
26 // signed one-byte entries. The first is only generated by executable code.
27 // To distinguish from next-state entries, these must be contiguous and
30 kExitDstSpaceFull
= 239,
31 kExitIllegalStructure
, // 240
50 kExitDstSpaceFull_2
= -32769,
51 kExitIllegalStructure_2
, // -32768
60 kExitReplaceOffset1_2
,
61 kExitReplaceOffset2_2
,
69 // This struct represents one entire state table. The three initialized byte
70 // areas are state_table, remap_base, and remap_string. state0 and state0_size
71 // give the byte offset and length within state_table of the initial state --
72 // table lookups are expected to start and end in this state, but for
73 // truncated UTF-8 strings, may end in a different state. These allow a quick
74 // test for that condition. entry_shift is 8 for tables subscripted by a full
75 // byte value and 6 for space-optimized tables subscripted by only six
76 // significant bits in UTF-8 continuation bytes.
// Length, within state_table, of the initial state. Lookups are expected to
// start and end in that state; a truncated UTF-8 string may end elsewhere.
79 const uint32 state0_size
;
// NOTE(review): presumably the overall size of the table; units are not
// stated alongside this field -- confirm before relying on it.
80 const uint32 total_size
;
// 8 for tables subscripted by a full byte value, 6 for space-optimized
// tables subscripted by six significant bits of a UTF-8 continuation byte.
82 const int entry_shift
;
// NOTE(review): presumably the width in bytes of one state_table entry
// (entries here are uint8) -- confirm.
83 const int bytes_per_entry
;
// Initialized byte area #1: the state table itself, one uint8 per entry.
86 const uint8
* state_table
;
// Initialized byte area #2: replacement entries.
// NOTE(review): presumably the four-byte delete/add/offset entries that
// index into remap_string -- confirm against the RemapEntry definition.
87 const RemapEntry
* remap_base
;
// Initialized byte area #3: the replacement strings that remap entries
// refer to by offset.
88 const uint8
* remap_string
;
// NOTE(review): role of this area is not described next to the struct;
// document it once confirmed.
89 const uint8
* fast_state
;
90 } UTF8StateMachineObj
;
92 // Near-duplicate declaration for tables with two-byte entries
// Length, within state_table, of the initial state (same meaning as in
// UTF8StateMachineObj).
95 const uint32 state0_size
;
// NOTE(review): presumably the overall size of the table; units are not
// stated alongside this field -- confirm.
96 const uint32 total_size
;
// 8 for tables subscripted by a full byte value, 6 for space-optimized
// tables subscripted by six significant bits of a UTF-8 continuation byte.
98 const int entry_shift
;
// NOTE(review): presumably the width in bytes of one state_table entry
// (entries here are signed short) -- confirm.
99 const int bytes_per_entry
;
// The state table. This is the field that differs from UTF8StateMachineObj:
// entries are signed short rather than uint8.
102 const signed short* state_table
;
// Replacement entries.
// NOTE(review): presumably the four-byte delete/add/offset entries that
// index into remap_string -- confirm against the RemapEntry definition.
103 const RemapEntry
* remap_base
;
// Replacement strings that remap entries refer to by offset.
104 const uint8
* remap_string
;
// NOTE(review): role of this area is not described next to the struct;
// document it once confirmed.
105 const uint8
* fast_state
;
106 } UTF8StateMachineObj_2
;
// Role-specific aliases for the state machine layouts:
//   UTF8PropObj   - UTF8StateMachineObj used as a property-lookup table
//   UTF8ScanObj   - UTF8StateMachineObj used as a scanning table
//   UTF8PropObj_2 - UTF8StateMachineObj_2 (two-byte entries) used as a
//                   property-lookup table
// The aliases introduce no new types; they only name the intended use.
109 typedef UTF8StateMachineObj UTF8PropObj
;
110 typedef UTF8StateMachineObj UTF8ScanObj
;
111 typedef UTF8StateMachineObj_2 UTF8PropObj_2
;
114 // Look up property of one UTF-8 character and advance over it
115 // Return 0 if input length is zero
116 // Return 0 and advance one byte if input is ill-formed
117 uint8
UTF8GenericProperty(const UTF8PropObj
* st
,
121 // BigOneByte versions are needed for tables > 240 states, but most
122 // won't need the TwoByte versions.
124 // Look up property of one UTF-8 character and advance over it
125 // Return 0 if input length is zero
126 // Return 0 and advance one byte if input is ill-formed
127 uint8
UTF8GenericPropertyBigOneByte(const UTF8PropObj
* st
,
131 // Scan a UTF-8 stringpiece based on a state table.
132 // Always scan complete UTF-8 characters
133 // Set number of bytes scanned. Return reason for exiting
134 int UTF8GenericScan(const UTF8ScanObj
* st
,
137 int* bytes_consumed
);
141 #endif // ENCODINGS_COMPACT_LANG_DET_WIN_CLD_UTF8STATETABLE_H_