1 // Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "encodings/compact_lang_det/win/cld_utf8statetable.h"
7 #include "base/basictypes.h"
9 // Return true if current Tbl pointer is within state0 range
10 // Note that unsigned compare checks both ends of range simultaneously
11 static inline bool InStateZero(const UTF8ScanObj
* st
, const uint8
* Tbl
) {
12 const uint8
* Tbl0
= &st
->state_table
[st
->state0
];
13 return (static_cast<uint32
>(Tbl
- Tbl0
) < st
->state0_size
);
17 // Look up property of one UTF-8 character and advance over it
18 // Return 0 if input length is zero
19 // Return 0 and advance one byte if input is ill-formed
20 uint8
UTF8GenericProperty(const UTF8PropObj
* st
,
27 const uint8
* lsrc
= *src
;
28 const uint8
* Tbl_0
= &st
->state_table
[st
->state0
];
29 const uint8
* Tbl
= Tbl_0
;
31 int eshift
= st
->entry_shift
;
33 // Short series of tests faster than switch, optimizes 7-bit ASCII
34 unsigned char c
= lsrc
[0];
35 if (static_cast<signed char>(c
) >= 0) { // one byte
39 } else if (((c
& 0xe0) == 0xc0) && (*srclen
>= 2)) { // two bytes
41 Tbl
= &Tbl_0
[e
<< eshift
];
45 } else if (((c
& 0xf0) == 0xe0) && (*srclen
>= 3)) { // three bytes
47 Tbl
= &Tbl_0
[e
<< eshift
];
49 Tbl
= &Tbl_0
[e
<< eshift
];
53 }else if (((c
& 0xf8) == 0xf0) && (*srclen
>= 4)) { // four bytes
55 Tbl
= &Tbl_0
[e
<< eshift
];
57 Tbl
= &Tbl_0
[e
<< eshift
];
59 Tbl
= &Tbl_0
[e
<< eshift
];
63 } else { // Ill-formed
71 // BigOneByte versions are needed for tables > 240 states, but most
72 // won't need the TwoByte versions.
73 // Internally, the next-to-last offset is multiplied by 16 and the last
74 // offset is relative instead of absolute.
75 // Look up property of one UTF-8 character and advance over it
76 // Return 0 if input length is zero
77 // Return 0 and advance one byte if input is ill-formed
78 uint8
UTF8GenericPropertyBigOneByte(const UTF8PropObj
* st
,
85 const uint8
* lsrc
= *src
;
86 const uint8
* Tbl_0
= &st
->state_table
[st
->state0
];
87 const uint8
* Tbl
= Tbl_0
;
89 int eshift
= st
->entry_shift
;
91 // Short series of tests faster than switch, optimizes 7-bit ASCII
92 unsigned char c
= lsrc
[0];
93 if (static_cast<signed char>(c
) >= 0) { // one byte
97 } else if (((c
& 0xe0) == 0xc0) && (*srclen
>= 2)) { // two bytes
99 Tbl
= &Tbl_0
[e
<< eshift
];
103 } else if (((c
& 0xf0) == 0xe0) && (*srclen
>= 3)) { // three bytes
105 Tbl
= &Tbl_0
[e
<< (eshift
+ 4)]; // 16x the range
106 e
= (reinterpret_cast<const int8
*>(Tbl
))[lsrc
[1]];
107 Tbl
= &Tbl
[e
<< eshift
]; // Relative +/-
111 }else if (((c
& 0xf8) == 0xf0) && (*srclen
>= 4)) { // four bytes
113 Tbl
= &Tbl_0
[e
<< eshift
];
115 Tbl
= &Tbl_0
[e
<< (eshift
+ 4)]; // 16x the range
116 e
= (reinterpret_cast<const int8
*>(Tbl
))[lsrc
[2]];
117 Tbl
= &Tbl
[e
<< eshift
]; // Relative +/-
121 } else { // Ill-formed
129 // Scan a UTF-8 stringpiece based on a state table.
130 // Always scan complete UTF-8 characters
131 // Set number of bytes scanned. Return reason for exiting
132 int UTF8GenericScan(const UTF8ScanObj
* st
,
135 int* bytes_consumed
) {
136 int eshift
= st
->entry_shift
; // 6 (space optimized) or 8
137 // int nEntries = (1 << eshift); // 64 or 256 entries per state
139 const uint8
* isrc
= str
;
140 //reinterpret_cast<const uint8*>(str.data());
141 const uint8
* src
= isrc
;
142 //const int len = str.length();
143 const uint8
* srclimit
= isrc
+ len
;
144 const uint8
* srclimit8
= srclimit
- 7;
146 if (len
== 0) return kExitOK
;
148 const uint8
* Tbl_0
= &st
->state_table
[st
->state0
];
151 // Do state-table scan
155 // Do fast for groups of 8 identity bytes.
156 // This covers a lot of 7-bit ASCII ~8x faster than the 1-byte loop,
157 // including slowing slightly on cr/lf/ht
158 //----------------------------
159 const uint8
* Tbl2
= &st
->fast_state
[0];
160 uint32 losub
= st
->losub
;
161 uint32 hiadd
= st
->hiadd
;
162 while (src
< srclimit8
) {
163 uint32 s0123
= UnalignedLoad32(src
);
164 uint32 s4567
= UnalignedLoad32(src
+ 4);
166 // This is a fast range check for all bytes in [losub..0x80-hiadd)
167 uint32 temp
= (s0123
- losub
) | (s0123
+ hiadd
) |
168 (s4567
- losub
) | (s4567
+ hiadd
);
169 if ((temp
& 0x80808080) != 0) {
170 // We typically end up here on cr/lf/ht; src was incremented
171 int e0123
= (Tbl2
[src
[-8]] | Tbl2
[src
[-7]]) |
172 (Tbl2
[src
[-6]] | Tbl2
[src
[-5]]);
173 if (e0123
!= 0) {src
-= 8; break;} // Exit on Non-interchange
174 e0123
= (Tbl2
[src
[-4]] | Tbl2
[src
[-3]]) |
175 (Tbl2
[src
[-2]] | Tbl2
[src
[-1]]);
176 if (e0123
!= 0) {src
-= 4; break;} // Exit on Non-interchange
177 // Else OK, go around again
180 //----------------------------
182 // Byte-at-a-time scan
183 //----------------------------
184 const uint8
* Tbl
= Tbl_0
;
185 while (src
< srclimit
) {
189 if (e
>= kExitIllegalStructure
) {break;}
190 Tbl
= &Tbl_0
[e
<< eshift
];
192 //----------------------------
195 // Exit possibilities:
196 // Some exit code, !state0, back up over last char
197 // Some exit code, state0, back up one byte exactly
198 // source consumed, !state0, back up over partial char
199 // source consumed, state0, exit OK
200 // For illegal byte in state0, avoid backing up over PREVIOUS char
201 // For truncated last char, back up to beginning of it
203 if (e
>= kExitIllegalStructure
) {
204 // Back up over exactly one byte of rejected/illegal UTF-8 character
206 // Back up more if needed
207 if (!InStateZero(st
, Tbl
)) {
208 do {src
--;} while ((src
> isrc
) && ((src
[0] & 0xc0) == 0x80));
210 } else if (!InStateZero(st
, Tbl
)) {
211 // Back up over truncated UTF-8 character
212 e
= kExitIllegalStructure
;
213 do {src
--;} while ((src
> isrc
) && ((src
[0] & 0xc0) == 0x80));
215 // Normal termination, source fully consumed
219 if (e
== kExitDoAgain
) {
220 // Loop back up to the fast scan
224 *bytes_consumed
= src
- isrc
;