4 Rudimentary lexer grammar for a non-validating XML parser.
5 The lexer is not intended to be used by a parser; it is standalone.
8 while ( lexer.nextToken().getType() != Token.EOF_TYPE );
10 to iterate through tokens.
12 Replace print statements (only there to make something visible) with your
13 own code and have fun.
16 - internal DTD is parsed but not processed
17 - only supported encoding is iso-8859-1 aka extended ASCII aka ISO-latin-1
18 - special entity references (like &amp; &lt;) do not get resolved (to '&', '<')
19 - uses SAX attribute implementation (could easily be dropped)
20 [TJP: commented out so it compiles w/o SAX.]
23 The good thing about some of these limitations is that the parsed XML
24 can be written *literally* unmodified.
26 Author: Olli Z. (oliver@zeigermann.de)
28 Initial date: 07.02.1999 (02/07/99)
29 Complete revision: 16.01.2003 (01/16/03)
31 Developed and tested with ANTLR 2.7.2
34 // import org.xml.sax.helpers.*;
37 class XMLLexer extends Lexer;
39 // needed to tell "<!DOCTYPE..."
40 // from "<?..." and "<tag..." and "</tag...>" and "<![CDATA...>"
41 // also on exit branch "]]>", "-->"
44 //charVocabulary = '\3'..'\377'; // extended ASCII (3-255 in octal notation)
45 charVocabulary='\u0000'..'\u007F'; // allow ascii
54 "<!DOCTYPE" WS rootElementName:NAME
55 { System.out.println("ROOTELEMENT: "+rootElementName.getText()); }
58 ( "SYSTEM" WS sys1:STRING
59 { System.out.println("SYSTEM: "+sys1.getText()); }
61 | "PUBLIC" WS pub:STRING WS sys2:STRING
62 { System.out.println("PUBLIC: "+pub.getText()); }
63 { System.out.println("SYSTEM: "+sys2.getText()); }
67 ( dtd:INTERNAL_DTD ( WS )?
68 { System.out.println("DTD: "+dtd.getText()); }
74 protected INTERNAL_DTD
77 // reports a warning, but is absolutely ok (checked generated code);
78 // besides, this warning was not generated with k=1, which is
79 // enough for this rule...
80 ( options {greedy=false;} : NL
81 | STRING // handle strings specially to avoid mistaking a ']' inside a string for the end of the DTD
89 // { AttributesImpl attributes = new AttributesImpl(); }
93 ( ATTR /*[attributes]*/ ( WS )? )*
95 if (target.getText().equalsIgnoreCase("xml")) {
96 // this is the xml declaration, handle it
97 System.out.println("XMLDECL: "+target.getText());
99 System.out.println("PI: "+target.getText());
108 : "<!--" c:COMMENT_DATA "-->"
109 { System.out.println("COMMENT: "+c.getText()); }
112 protected COMMENT_DATA
114 ( options {greedy=false;} : NL
122 "</" g:NAME ( WS )? '>'
123 { System.out.println("ENDTAG: "+g.getText()); }
131 // XXX should org.xml.sax.AttributesImpl be replaced by something else?
132 // { AttributesImpl attributes = new AttributesImpl(); }
136 ( ATTR /*[attributes]*/ ( WS )? )*
138 { System.out.println("EMTYTAG: "+g.getText()); }
140 { System.out.println("STARTTAG: "+g.getText()); }
146 { System.out.println("PCDATA: "+p.getText()); }
149 protected PCDATA_DATA
151 ( options {greedy=true;} : NL
152 | ~( '<' | '\n' | '\r' )
157 : "<![CDATA[" p:CDATA_DATA "]]>"
158 { System.out.println("CDATABLOCK: "+p.getText()); }
163 ( options {greedy=false;} : NL
168 protected ATTR // [AttributesImpl attributes]
169 : name:NAME ( WS )? '=' ( WS )? value:STRING_NO_QUOTE
171 { attributes.addAttribute("", "", name.getText(), "CDATA",
175 { System.out.println("ATTRIBUTE: "+name.getText()+"="+value.getText()); }
178 protected STRING_NO_QUOTE
180 | '\''! (~'\'')* '\''!
189 : ( LETTER | '_' | ':') ( options {greedy=true;} : NAMECHAR )*
193 : LETTER | DIGIT | '.' | '-' | '_' | ':'
221 // Alexander Hinds & Terence Parr
222 // from antlr 2.5.0: example/html
224 // '\r' '\n' can be matched in one alternative or by matching
225 // '\r' in one iteration and '\n' in another. I am trying to
226 // handle any flavor of newline that comes in, but the language
227 // that allows both "\r\n" and "\r" and "\n" to all be valid
228 // newline is ambiguous. Consequently, the resulting grammar
229 // must be ambiguous. I'm shutting this warning off.
232 generateAmbigWarnings=false;