5 #ifndef YAML_PREFETCH_SIZE
6 #define YAML_PREFETCH_SIZE 2048
9 #define S_ARRAY_SIZE( A ) (sizeof(A)/sizeof(*(A)))
10 #define S_ARRAY_END( A ) ((A) + S_ARRAY_SIZE(A))
12 #define CP_REPLACEMENT_CHARACTER (0xFFFD)
38 enum UtfIntroCharType
{
// Lookup table indexed by UtfIntroState. The character-set detection loop in the
// Stream constructor reads s_introFinalState[state] and keeps consuming intro
// bytes for as long as the entry of the current state is false.
// The trailing comment on each entry names the state that entry belongs to.
// NOTE(review): only some entries are visible in this excerpt; confirm the table
// has exactly one entry per UtfIntroState enumerator, in enumerator order.
50 static bool s_introFinalState
[] = {
53 false, //uis_utf32be_b2
54 false, //uis_utf32be_bom3
57 false, //uis_utf16be_bom1
58 false, //uis_utfle_bom1
59 false, //uis_utf16le_bom2
60 false, //uis_utf32le_bom3
64 false, //uis_utf16le_imp
65 false, //uis_utf32le_imp3
66 false, //uis_utf8_bom1
67 false, //uis_utf8_bom2
72 static UtfIntroState s_introTransitions
[][uictMax
] = {
73 // uict00, uictBB, uictBF, uictEF, uictFE, uictFF, uictAscii, uictOther
74 {uis_utfbe_b1
, uis_utf8
, uis_utf8
, uis_utf8_bom1
, uis_utf16be_bom1
, uis_utfle_bom1
, uis_utf8_imp
, uis_utf8
},
75 {uis_utf32be_b2
, uis_utf8
, uis_utf8
, uis_utf8
, uis_utf8
, uis_utf8
, uis_utf16be
, uis_utf8
},
76 {uis_utf32be
, uis_utf8
, uis_utf8
, uis_utf8
, uis_utf32be_bom3
, uis_utf8
, uis_utf8
, uis_utf8
},
77 {uis_utf8
, uis_utf8
, uis_utf8
, uis_utf8
, uis_utf8
, uis_utf32be
, uis_utf8
, uis_utf8
},
78 {uis_utf32be
, uis_utf32be
, uis_utf32be
, uis_utf32be
, uis_utf32be
, uis_utf32be
, uis_utf32be
, uis_utf32be
},
79 {uis_utf16be
, uis_utf16be
, uis_utf16be
, uis_utf16be
, uis_utf16be
, uis_utf16be
, uis_utf16be
, uis_utf16be
},
80 {uis_utf8
, uis_utf8
, uis_utf8
, uis_utf8
, uis_utf8
, uis_utf16be
, uis_utf8
, uis_utf8
},
81 {uis_utf8
, uis_utf8
, uis_utf8
, uis_utf8
, uis_utf16le_bom2
, uis_utf8
, uis_utf8
, uis_utf8
},
82 {uis_utf32le_bom3
, uis_utf16le
, uis_utf16le
, uis_utf16le
, uis_utf16le
, uis_utf16le
, uis_utf16le
, uis_utf16le
},
83 {uis_utf32le
, uis_utf16le
, uis_utf16le
, uis_utf16le
, uis_utf16le
, uis_utf16le
, uis_utf16le
, uis_utf16le
},
84 {uis_utf16le
, uis_utf16le
, uis_utf16le
, uis_utf16le
, uis_utf16le
, uis_utf16le
, uis_utf16le
, uis_utf16le
},
85 {uis_utf32le
, uis_utf32le
, uis_utf32le
, uis_utf32le
, uis_utf32le
, uis_utf32le
, uis_utf32le
, uis_utf32le
},
86 {uis_utf16le_imp
, uis_utf8
, uis_utf8
, uis_utf8
, uis_utf8
, uis_utf8
, uis_utf8
, uis_utf8
},
87 {uis_utf32le_imp3
, uis_utf16le
, uis_utf16le
, uis_utf16le
, uis_utf16le
, uis_utf16le
, uis_utf16le
, uis_utf16le
},
88 {uis_utf32le
, uis_utf16le
, uis_utf16le
, uis_utf16le
, uis_utf16le
, uis_utf16le
, uis_utf16le
, uis_utf16le
},
89 {uis_utf8
, uis_utf8_bom2
, uis_utf8
, uis_utf8
, uis_utf8
, uis_utf8
, uis_utf8
, uis_utf8
},
90 {uis_utf8
, uis_utf8
, uis_utf8
, uis_utf8
, uis_utf8
, uis_utf8
, uis_utf8
, uis_utf8
},
91 {uis_utf8
, uis_utf8
, uis_utf8
, uis_utf8
, uis_utf8
, uis_utf8
, uis_utf8
, uis_utf8
},
94 static char s_introUngetCount
[][uictMax
] = {
95 // uict00, uictBB, uictBF, uictEF, uictFE, uictFF, uictAscii, uictOther
96 {0, 1, 1, 0, 0, 0, 0, 1},
97 {0, 2, 2, 2, 2, 2, 2, 2},
98 {3, 3, 3, 3, 0, 3, 3, 3},
99 {4, 4, 4, 4, 4, 0, 4, 4},
100 {1, 1, 1, 1, 1, 1, 1, 1},
101 {1, 1, 1, 1, 1, 1, 1, 1},
102 {2, 2, 2, 2, 2, 0, 2, 2},
103 {2, 2, 2, 2, 0, 2, 2, 2},
104 {0, 1, 1, 1, 1, 1, 1, 1},
105 {0, 2, 2, 2, 2, 2, 2, 2},
106 {1, 1, 1, 1, 1, 1, 1, 1},
107 {1, 1, 1, 1, 1, 1, 1, 1},
108 {0, 2, 2, 2, 2, 2, 2, 2},
109 {0, 3, 3, 3, 3, 3, 3, 3},
110 {4, 4, 4, 4, 4, 4, 4, 4},
111 {2, 0, 2, 2, 2, 2, 2, 2},
112 {3, 3, 0, 3, 3, 3, 3, 3},
113 {1, 1, 1, 1, 1, 1, 1, 1},
// Classifies a value obtained from std::istream::get() into a UtfIntroCharType.
// The Stream constructor uses the result as the column index into
// s_introTransitions and s_introUngetCount.
// NOTE(review): the bodies of the EOF branch and of the final range branch are
// not visible in this excerpt, so the types they return are not documented here.
116 inline UtfIntroCharType
IntroCharTypeOf(std::istream::int_type ch
)
// End-of-file is tested first, before any byte value is examined.
118 if (std::istream::traits_type::eof() == ch
) {
// Byte values that can occur in a byte-order mark, plus the zero byte, each
// map to their own dedicated type.
123 case 0: return uict00
;
124 case 0xBB: return uictBB
;
125 case 0xBF: return uictBF
;
126 case 0xEF: return uictEF
;
127 case 0xFE: return uictFE
;
128 case 0xFF: return uictFF
;
// NOTE(review): this range is 0x01..0xFE, so bytes 0x80..0xFE satisfy it as well
// as 7-bit ASCII; confirm that is intended.
131 if ((ch
> 0) && (ch
< 0xFF)) {
// Builds a single byte of a UTF-8 sequence.
//   ch        - the code point being encoded
//   lead_bits - how many high-order bits of the byte are set as the UTF-8 marker
//               (0 for a one-byte sequence, 1 for a continuation byte, 2..4 for
//               the first byte of a 2..4 byte sequence)
//   rshift    - how far `ch` is shifted right to bring this byte's payload bits
//               into the low positions
// Returns the marker bits combined with the selected payload bits.
inline char Utf8Adjust(unsigned long ch, unsigned char lead_bits, unsigned char rshift)
{
	// The top `lead_bits` bits set, everything else clear.
	const unsigned char prefix = ((1 << lead_bits) - 1) << (8 - lead_bits);
	// The bits left for payload: all bits below the marker and the zero bit
	// that terminates it.
	const unsigned char payloadMask = (0xFF >> (lead_bits + 1));
	const unsigned long payload = (ch >> rshift) & payloadMask;
	return static_cast<char>(static_cast<unsigned char>(prefix | payload));
}
147 inline void QueueUnicodeCodepoint(std::deque
<char>& q
, unsigned long ch
)
149 // We are not allowed to queue the Stream::eof() codepoint, so
150 // replace it with CP_REPLACEMENT_CHARACTER
151 if (static_cast<unsigned long>(Stream::eof()) == ch
)
153 ch
= CP_REPLACEMENT_CHARACTER
;
158 q
.push_back(Utf8Adjust(ch
, 0, 0));
162 q
.push_back(Utf8Adjust(ch
, 2, 6));
163 q
.push_back(Utf8Adjust(ch
, 1, 0));
165 else if (ch
< 0x10000)
167 q
.push_back(Utf8Adjust(ch
, 3, 12));
168 q
.push_back(Utf8Adjust(ch
, 1, 6));
169 q
.push_back(Utf8Adjust(ch
, 1, 0));
173 q
.push_back(Utf8Adjust(ch
, 4, 18));
174 q
.push_back(Utf8Adjust(ch
, 1, 12));
175 q
.push_back(Utf8Adjust(ch
, 1, 6));
176 q
.push_back(Utf8Adjust(ch
, 1, 0));
// Constructs a Stream reading from `input`.
// Allocates a prefetch buffer of YAML_PREFETCH_SIZE bytes, then determines the
// character set by feeding the first values read from `input` through the intro
// state machine (s_introFinalState / s_introTransitions / s_introUngetCount) and
// stores the result in m_charSet.
// NOTE(review): parts of this constructor (the rest of the initializer list, the
// declaration of nIntroUsed, and the code between the visible statements) are
// not visible in this excerpt.
180 Stream::Stream(std::istream
& input
)
// NOTE(review): raw new[]; the matching delete[] must run exactly once, so copying
// a Stream would need to be disabled - confirm against the class declaration.
182 m_pPrefetched(new unsigned char[YAML_PREFETCH_SIZE
]),
183 m_nPrefetchedAvailable(0), m_nPrefetchedUsed(0)
185 typedef std::istream::traits_type char_traits
;
190 // Determine (or guess) the character-set by reading the BOM, if any. See
191 // the YAML specification for the determination algorithm.
// Values consumed so far during detection, kept so they can be put back.
192 char_traits::int_type intro
[4];
194 UtfIntroState state
= uis_start
;
// Runs until the current state is marked final in s_introFinalState.
195 for(; !s_introFinalState
[state
]; ) {
196 std::istream::int_type ch
= input
.get();
197 intro
[nIntroUsed
++] = ch
;
198 UtfIntroCharType charType
= IntroCharTypeOf(ch
);
199 UtfIntroState newState
= s_introTransitions
[state
][charType
];
200 int nUngets
= s_introUngetCount
[state
][charType
];
// Return the last nUngets recorded values to the input, most recent first.
// A recorded EOF is dropped from the record but not put back.
203 for(; nUngets
> 0; --nUngets
) {
204 if(char_traits::eof() != intro
[--nIntroUsed
])
205 input
.putback(char_traits::to_char_type(intro
[nIntroUsed
]));
// Translate the detection state into the character set used for decoding;
// any state without its own case is treated as UTF-8.
212 case uis_utf8
: m_charSet
= utf8
; break;
213 case uis_utf16le
: m_charSet
= utf16le
; break;
214 case uis_utf16be
: m_charSet
= utf16be
; break;
215 case uis_utf32le
: m_charSet
= utf32le
; break;
216 case uis_utf32be
: m_charSet
= utf32be
; break;
217 default: m_charSet
= utf8
; break;
225 delete[] m_pPrefetched
;
228 char Stream::peek() const
230 if (m_readahead
.empty())
232 return Stream::eof();
235 return m_readahead
[0];
238 Stream::operator bool() const
240 return m_input
.good() || (!m_readahead
.empty() && m_readahead
[0] != Stream::eof());
244 // . Extracts a character from the stream and updates our position
260 // . Extracts 'n' characters from the stream and updates our position
261 std::string
Stream::get(int n
)
271 // . Eats 'n' characters and updates our position.
272 void Stream::eat(int n
)
// Drops the character at the front of the read-ahead queue, if there is one.
// NOTE(review): the remainder of this function is not visible in this excerpt;
// anything it does after the pop is not documented here.
278 void Stream::AdvanceCurrent()
// Guard: pop_front() must not be called on an empty queue.
280 if (!m_readahead
.empty())
282 m_readahead
.pop_front();
// Tries to make index `i` of the read-ahead queue available.
// Returns true when m_readahead holds more than `i` entries afterwards.
289 bool Stream::_ReadAheadTo(size_t i
) const
// Keep decoding while the input is good() and the queue is still too short.
291 while (m_input
.good() && (m_readahead
.size() <= i
))
// Both byte orders of a given width share one decoder; StreamInUtf16 and
// StreamInUtf32 check m_charSet themselves to pick the byte order.
295 case utf8
: StreamInUtf8(); break;
296 case utf16le
: StreamInUtf16(); break;
297 case utf16be
: StreamInUtf16(); break;
298 case utf32le
: StreamInUtf32(); break;
299 case utf32be
: StreamInUtf32(); break;
303 // signal end of stream
// NOTE(review): the condition guarding this push is not visible in this excerpt.
305 m_readahead
.push_back(Stream::eof());
307 return m_readahead
.size() > i
;
// Takes one byte from the prefetch buffer via GetNextByte() and appends it to
// the read-ahead queue unchanged (no decoding or validation is visible here).
// NOTE(review): any check between the read and the push is not visible in this
// excerpt.
310 void Stream::StreamInUtf8() const
312 unsigned char b
= GetNextByte();
315 m_readahead
.push_back(b
);
319 void Stream::StreamInUtf16() const
321 unsigned long ch
= 0;
322 unsigned char bytes
[2];
323 int nBigEnd
= (m_charSet
== utf16be
) ? 0 : 1;
325 bytes
[0] = GetNextByte();
326 bytes
[1] = GetNextByte();
331 ch
= (static_cast<unsigned long>(bytes
[nBigEnd
]) << 8) |
332 static_cast<unsigned long>(bytes
[1 ^ nBigEnd
]);
334 if (ch
>= 0xDC00 && ch
< 0xE000)
336 // Trailing (low) surrogate...ugh, wrong order
337 QueueUnicodeCodepoint(m_readahead
, CP_REPLACEMENT_CHARACTER
);
340 else if (ch
>= 0xD800 && ch
< 0xDC00)
342 // ch is a leading (high) surrogate
344 // Four byte UTF-8 code point
346 // Read the trailing (low) surrogate
349 bytes
[0] = GetNextByte();
350 bytes
[1] = GetNextByte();
353 QueueUnicodeCodepoint(m_readahead
, CP_REPLACEMENT_CHARACTER
);
356 unsigned long chLow
= (static_cast<unsigned long>(bytes
[nBigEnd
]) << 8) |
357 static_cast<unsigned long>(bytes
[1 ^ nBigEnd
]);
358 if (chLow
< 0xDC00 || ch
>= 0xE000)
360 // Trouble...not a low surrogate. Dump a REPLACEMENT CHARACTER into the stream.
361 QueueUnicodeCodepoint(m_readahead
, CP_REPLACEMENT_CHARACTER
);
363 // Deal with the next UTF-16 unit
364 if (chLow
< 0xD800 || ch
>= 0xE000)
366 // Easiest case: queue the codepoint and return
367 QueueUnicodeCodepoint(m_readahead
, ch
);
372 // Start the loop over with the new high surrogate
378 // Select the payload bits from the high surrogate
382 // Include bits from low surrogate
383 ch
|= (chLow
& 0x3FF);
385 // Add the surrogacy offset
390 QueueUnicodeCodepoint(m_readahead
, ch
);
// Reinterprets an unsigned-char buffer as a char buffer.
// GetNextByte() uses this to pass the prefetch buffer to streambuf::sgetn().
// The returned pointer refers to the same storage as pBuffer.
inline char* ReadBuffer(unsigned char* pBuffer)
{
	char* const asChars = reinterpret_cast<char*>(pBuffer);
	return asChars;
}
// Returns the next raw byte of input, served from the prefetch buffer
// m_pPrefetched and refilling that buffer from the stream when it is used up.
// NOTE(review): the remaining arguments of the sgetn() call and the body of the
// "0 == m_nPrefetchedAvailable" branch are not visible in this excerpt.
398 unsigned char Stream::GetNextByte() const
// Every prefetched byte has been handed out: refill.
400 if (m_nPrefetchedUsed
>= m_nPrefetchedAvailable
)
// Read directly from the stream buffer, bypassing formatted input.
402 std::streambuf
*pBuf
= m_input
.rdbuf();
403 m_nPrefetchedAvailable
= pBuf
->sgetn(ReadBuffer(m_pPrefetched
),
405 m_nPrefetchedUsed
= 0;
// Nothing could be read: record end-of-file on the stream so that callers
// testing m_input.good() see it.
406 if (!m_nPrefetchedAvailable
)
408 m_input
.setstate(std::ios_base::eofbit
);
411 if (0 == m_nPrefetchedAvailable
)
// Hand out the next prefetched byte and advance the cursor.
417 return m_pPrefetched
[m_nPrefetchedUsed
++];
// Reads four bytes, combines them into one value in the order selected by
// m_charSet, and queues the result as UTF-8 on m_readahead.
// NOTE(review): the contents of `indexes` and part of the combining loop are not
// visible in this excerpt.
420 void Stream::StreamInUtf32() const
// Two byte-index orders, one per byte order; row 1 is used for utf32be and
// row 0 otherwise.
422 static int indexes
[2][4] = {
427 unsigned long ch
= 0;
428 unsigned char bytes
[4];
429 int* pIndexes
= (m_charSet
== utf32be
) ? indexes
[1] : indexes
[0];
431 bytes
[0] = GetNextByte();
432 bytes
[1] = GetNextByte();
433 bytes
[2] = GetNextByte();
434 bytes
[3] = GetNextByte();
// Fold the four bytes into ch, visiting them in the order given by pIndexes.
440 for (int i
= 0; i
< 4; ++i
)
443 ch
|= bytes
[pIndexes
[i
]];
446 QueueUnicodeCodepoint(m_readahead
, ch
);