/// Base functions to initialize and manipulate a UCS2 input stream
4 #include <antlr3input.h>
// Copyright (c) 2005-2009 Jim Idle, Temporal Wave LLC
// http://www.temporal-wave.com
// http://www.linkedin.com/in/jimidle
//
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// 1. Redistributions of source code must retain the above copyright
//    notice, this list of conditions and the following disclaimer.
// 2. Redistributions in binary form must reproduce the above copyright
//    notice, this list of conditions and the following disclaimer in the
//    documentation and/or other materials provided with the distribution.
// 3. The name of the author may not be used to endorse or promote products
//    derived from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
// IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
// IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
// NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
// THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
37 static void antlr3UCS2Consume (pANTLR3_INT_STREAM is
);
38 static ANTLR3_UCHAR
antlr3UCS2LA (pANTLR3_INT_STREAM is
, ANTLR3_INT32 la
);
39 static ANTLR3_MARKER
antlr3UCS2Index (pANTLR3_INT_STREAM is
);
40 static void antlr3UCS2Seek (pANTLR3_INT_STREAM is
, ANTLR3_MARKER seekPoint
);
42 // ucs2 Charstream API functions
44 static pANTLR3_STRING
antlr3UCS2Substr (pANTLR3_INPUT_STREAM input
, ANTLR3_MARKER start
, ANTLR3_MARKER stop
);
46 /// \brief Common function to setup function interface for a 16 bit "UCS2" input stream.
48 /// \param input Input stream context pointer
51 /// - Strictly speaking, there is no such thing as a UCS2 input stream as the term
52 /// tends to confuse the notions of character encoding, unicode and so on. However
53 /// because there will possibly be a need for a UTF-16 stream, I needed to identify 16 bit
54 /// streams that do not support surrogate encodings and UCS2 is how it is mostly referred to.
55 /// For instance Java, Oracle and others use a 16 bit encoding of characters and so this type
56 /// of stream is very common.
57 /// Take it to mean, therefore, a straight 16 bit uncomplicated encoding of Unicode code points.
60 antlr3UCS2SetupStream (pANTLR3_INPUT_STREAM input
, ANTLR3_UINT32 type
)
62 // Build a string factory for this stream. This is a 16 bit string "UCS2" factory which is a standard
63 // part of the ANTLR3 string. The string factory is then passed through the whole chain of lexer->parser->tree->treeparser
66 input
->strFactory
= antlr3UCS2StringFactoryNew();
68 // Install function pointers for an 8 bit ASCII input, which are good for almost
69 // all input stream functions. We will then override those that won't work for 16 bit characters.
71 antlr3GenericSetupStream (input
, type
);
73 // Intstream API overrides for UCS2
75 input
->istream
->consume
= antlr3UCS2Consume
; // Consume the next 16 bit character in the buffer
76 input
->istream
->_LA
= antlr3UCS2LA
; // Return the UTF32 character at offset n (1 based)
77 input
->istream
->index
= antlr3UCS2Index
; // Calculate current index in input stream, 16 bit based
78 input
->istream
->seek
= antlr3UCS2Seek
; // How to seek to a specific point in the stream
80 // Charstream API overrides for UCS2
82 input
->substr
= antlr3UCS2Substr
; // Return a string from the input stream
84 input
->charByteSize
= 2; // Size in bytes of characters in this stream.
88 /// \brief Consume the next character in an 8 bit ASCII input stream
90 /// \param input Input stream context pointer
93 antlr3UCS2Consume(pANTLR3_INT_STREAM is
)
95 pANTLR3_INPUT_STREAM input
;
97 input
= ((pANTLR3_INPUT_STREAM
) (is
->super
));
99 if ((pANTLR3_UINT16
)(input
->nextChar
) < (((pANTLR3_UINT16
)input
->data
) + input
->sizeBuf
))
101 // Indicate one more character in this line
103 input
->charPositionInLine
++;
105 if ((ANTLR3_UCHAR
)(*((pANTLR3_UINT16
)input
->nextChar
)) == input
->newlineChar
)
107 // Reset for start of a new line of input
110 input
->charPositionInLine
= 0;
111 input
->currentLine
= (void *)(((pANTLR3_UINT16
)input
->nextChar
) + 1);
114 // Increment to next character position
116 input
->nextChar
= (void *)(((pANTLR3_UINT16
)input
->nextChar
) + 1);
120 /// \brief Return the input element assuming an 8 bit ascii input
122 /// \param[in] input Input stream context pointer
123 /// \param[in] la 1 based offset of next input stream element
125 /// \return Next input character in internal ANTLR3 encoding (UTF32)
128 antlr3UCS2LA(pANTLR3_INT_STREAM is
, ANTLR3_INT32 la
)
130 pANTLR3_INPUT_STREAM input
;
132 input
= ((pANTLR3_INPUT_STREAM
) (is
->super
));
134 if (( ((pANTLR3_UINT16
)input
->nextChar
) + la
- 1) >= (((pANTLR3_UINT16
)input
->data
) + input
->sizeBuf
))
136 return ANTLR3_CHARSTREAM_EOF
;
140 return (ANTLR3_UCHAR
)(*((pANTLR3_UINT16
)input
->nextChar
+ la
- 1));
145 /// \brief Calculate the current index in the output stream.
146 /// \param[in] input Input stream context pointer
149 antlr3UCS2Index(pANTLR3_INT_STREAM is
)
151 pANTLR3_INPUT_STREAM input
;
153 input
= ((pANTLR3_INPUT_STREAM
) (is
->super
));
155 return (ANTLR3_MARKER
)(input
->nextChar
);
158 /// \brief Rewind the lexer input to the state specified by the supplied mark.
160 /// \param[in] input Input stream context pointer
163 /// Assumes ASCII (or at least, 8 Bit) input stream.
166 antlr3UCS2Seek (pANTLR3_INT_STREAM is
, ANTLR3_MARKER seekPoint
)
169 pANTLR3_INPUT_STREAM input
;
171 input
= ((pANTLR3_INPUT_STREAM
) is
->super
);
173 // If the requested seek point is less than the current
174 // input point, then we assume that we are resetting from a mark
175 // and do not need to scan, but can just set to there.
177 if (seekPoint
<= (ANTLR3_MARKER
)(input
->nextChar
))
179 input
->nextChar
= (void *)seekPoint
;
183 count
= (ANTLR3_UINT32
)((seekPoint
- (ANTLR3_MARKER
)(input
->nextChar
)) / 2); // 16 bits per character in UCS2
191 /// \brief Return a substring of the ucs2 (16 bit) input stream in
192 /// newly allocated memory.
194 /// \param input Input stream context pointer
195 /// \param start Offset in input stream where the string starts
196 /// \param stop Offset in the input stream where the string ends.
198 static pANTLR3_STRING
199 antlr3UCS2Substr (pANTLR3_INPUT_STREAM input
, ANTLR3_MARKER start
, ANTLR3_MARKER stop
)
201 return input
->strFactory
->newPtr(input
->strFactory
, (pANTLR3_UINT8
)start
, ((ANTLR3_UINT32_CAST(stop
- start
))/2) + 1);