2 Lightweight and fast xml parser.
4 Part of the swftools package.
6 Copyright (c) 2010 Matthias Kramm <kramm@quiss.org>
8 This program is free software; you can redistribute it and/or modify
9 it under the terms of the GNU General Public License as published by
10 the Free Software Foundation; either version 2 of the License, or
11 (at your option) any later version.
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
18 You should have received a copy of the GNU General Public License
19 along with this program; if not, write to the Free Software
20 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
29 group: 0=data 1=whitespace 2='"' 3='<' 4='>' 5='&' 6=';' 7='?' 8='/' 9='=' 10='!' 11='[' 12=']' 13=EOB (NUL byte)
// Character-class lookup table, indexed by byte value (0x00..0xff).
// xml_parse uses group[(unsigned char)c] as the column index into new_state.
// Classes assigned by the entries below:
//   0x09, 0x0a, 0x0d, 0x20 -> 1 (whitespace)   '"' -> 2   '<' -> 3   '>' -> 4
//   '&' -> 5   ';' -> 6   '?' -> 7   '/' -> 8   '=' -> 9   '!' -> 10
//   '[' -> 11  ']' -> 12  byte 0x00 -> 13      every other byte -> 0 (data)
// Note that the single quote (0x27) and '-' (0x2d) are both class 0.
32 static int group
[256] =
34 // 00 01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f
36 13, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0,
37 // 10 11 12 13 14 15 16 17 18 19 1a 1b 1c 1d 1e 1f
39 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
40 // 20 21 22 23 24 25 26 27 28 29 2a 2b 2c 2d 2e 2f
41 // ! " # $ % & ' ( ) * + , - . /
42 1,10, 2, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 8,
43 // 30 31 32 33 34 35 36 37 38 39 3a 3b 3c 3d 3e 3f
44 // 0 1 2 3 4 5 6 7 8 9 : ; < = > ?
45 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 3, 9, 4, 7,
46 // 40 41 42 43 44 45 46 47 48 49 4a 4b 4c 4d 4e 4f
47 // @ A B C D E F G H I J K L M N O
48 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
49 // 50 51 52 53 54 55 56 57 58 59 5a 5b 5c 5d 5e 5f
50 // P Q R S T U V W X Y Z [ \ ] ^ _
51 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,11, 0,12, 0, 0,
52 // 60 61 62 63 64 65 66 67 68 69 6a 6b 6c 6d 6e 6f
53 // ` a b c d e f g h i j k l m n o
54 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
55 // 70 71 72 73 74 75 76 77 78 79 7a 7b 7c 7d 7e 7f
56 // p q r s t u v w x y z { | } ~
57 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
58 // 80 81 82 83 84 85 86 87 88 89 8a 8b 8c 8d 8e 8f
60 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
61 // 90 91 92 93 94 95 96 97 98 99 9a 9b 9c 9d 9e 9f
63 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
64 // a0 a1 a2 a3 a4 a5 a6 a7 a8 a9 aa ab ac ad ae af
66 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
67 // b0 b1 b2 b3 b4 b5 b6 b7 b8 b9 ba bb bc bd be bf
69 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
70 // c0 c1 c2 c3 c4 c5 c6 c7 c8 c9 ca cb cc cd ce cf
72 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
73 // d0 d1 d2 d3 d4 d5 d6 d7 d8 d9 da db dc dd de df
75 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
76 // e0 e1 e2 e3 e4 e5 e6 e7 e8 e9 ea eb ec ed ee ef
78 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
79 // f0 f1 f2 f3 f4 f5 f6 f7 f8 f9 fa fb fc fd fe ff
81 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
// Human-readable messages for the parser's error codes. xml_parse prints
// errors[(-state)&0x3f] when the state machine yields an error value.
// NOTE(review): the definitions of E1..E5 and an entry for E3 are not visible
// in this excerpt; confirm that the array order matches the code values.
84 static const char*errors
[]=
88 /*E1*/"xml file must start with <?",
90 /*E2*/"<,; or & not allowed inside tags",
94 /*E4*/"internal error",
96 /*E5*/"attribute definition without =\"",
// State transition table of the parser: new_state[state][group[byte]] gives
// the value xml_parse assigns to `state` for the next step.
//   - non-negative entries are plain state numbers (row indices of this table)
//   - small negative entries (-1, -3, -4, ...) are handled as actions by the
//     switch in xml_parse; -63 is its "end of buffer" case
//   - E1..E5 are error codes (defined outside this excerpt) that xml_parse
//     turns into a message via errors[(-state)&0x3f]
// The trailing comment on each row shows the input position the state stands
// for, with '.' marking the cursor.
// NOTE(review): the column header names 15 columns (including '-'), but rows
// 0-19 carry 14 initializers; -63 sits at index 13, which is the class that
// the group table assigns to byte 0x00. Remaining elements of each 16-wide
// row are implicitly zero.
99 static int new_state
[][16]=
100 { /* dt ws " < > & ; ? / = ! [ ] - EOB*/
101 /* 0 */{ E1
, 0,E1
, 1,E1
,E1
,E1
,E1
,E1
,E1
,E3
,E1
,E1
,-63}, // .<
102 /* 1 */{ E1
,E1
,E1
,E1
,E1
,E1
,E1
, 9,E1
,E1
,E3
,E1
,E1
,-63}, // <.?
103 /* 2 */{ -3, 2,E3
,E2
,E2
,E2
,E2
,E2
,12,E2
,16,E2
,E2
,-63}, // <.
104 /* 3 */{ E3
,E3
,E3
,E3
,-1,E3
,E3
,E3
,E3
,E1
,E3
,E3
,E3
,-63}, // < /.>
105 /* 4 */{ E3
,E3
,E3
,E3
,-2,E3
,E3
,E3
,E3
,E1
,E1
,E3
,E3
,-63}, // < .>
106 /* 5 */{ 5, 5, 5,-4, 5, 5, 5, 5, 5, 5, 5,E3
,E3
,-63}, // da.ta
107 /* 6 */{ 6,-7,E3
,E2
,-6,E2
,E2
,E3
,-9,E3
,E3
,E3
,E3
,-63}, // <na.me
108 /* 7 */{ -8, 7,E3
,E2
,-2,E2
,E2
, 7, 3,E3
,E3
,E3
,E3
,-63}, // <name .
109 /* 8 */{ 8,-12,E3
,E2
,E3
,E2
,E2
,E3
,E3
,-10,E3
,E3
,E3
,-63}, // att.r
110 /* 9 */{ 9, 7,E3
,E3
,E3
,E3
,E3
,E3
,E3
,E3
,E3
,E3
,E3
,-63}, // <?x.ml
111 /* 10 */{ E5
,10,-11,E5
,E5
,E5
,E5
,E5
,E5
,E5
,E3
,E3
,E3
,-63},// attr=."
112 /* 11 */{ 11,11,-5 ,11,11,11,11,11,11,11,E3
,E3
,E3
,-63},// attr="va.l
113 /* 12 */{ -13,12,E3
,E3
,E3
,E3
,E3
,E3
,E3
,E3
,E3
,E3
,E3
,-63}, // </ . >
114 /* 13 */{ 13,-14,E3
,E3
,-16,E3
,E3
,E3
,E3
,E3
,E3
,E3
,E3
,-63},// </ na.me>
115 /* 14 */{ E3
,14,E3
,E3
,-15,E3
,E3
,E3
,E3
,E3
,E3
,E3
,E3
,-63},// </ name. >
116 /* 15 */{ E3
,15,E3
,E2
,E3
,E3
,E3
,E3
,E3
,10,E3
,E3
,E3
,-63}, // attr .=
117 /* 16 */{ E3
,E3
,E3
,E2
,E3
,E3
,E3
,E3
,E3
,E3
,E3
,17,E3
,-63}, // <!.[CDATA[ ]]>
118 /* 17 */{ 17,E3
,E3
,E3
,E3
,E3
,E3
,E3
,E3
,E3
,E3
,18,E3
,-63}, // <![C.DATA[ ]]>
119 /* 18 */{ 18,18,18,18,18,18,18,18,18,18,18,18,19,-63}, // <![CDATA[ . ]]>
120 /* 19 */{ 18,18,18,18,-20,18,18,18,18,18,18,18,19,-63}, // <![CDATA[ ].]>
// NOTE(review): rows 20-24 list only 12 initializers, so their -63 lands at
// index 11 (the '[' class) rather than index 13. No entry above transitions
// into states 20-24, so these look like unused placeholders - confirm.
121 /* 20 */{0,0,0,0,0,0,0,0,0,0,0,-63},
122 /* 21 */{0,0,0,0,0,0,0,0,0,0,0,-63},
123 /* 22 */{0,0,0,0,0,0,0,0,0,0,0,-63},
124 /* 23 */{0,0,0,0,0,0,0,0,0,0,0,-63},
125 /* 24 */{0,0,0,0,0,0,0,0,0,0,0,-63},
// Node of a singly linked stack; `prev` links to the next node.
// xml_parse allocates a node when a start tag is completed, stores the tag
// name pointer in its `name` member, and later walks the chain through `prev`,
// freeing each `name`.
// NOTE(review): the declaration of `name` is not visible in this excerpt.
128 typedef struct _tag_stack
{
130 struct _tag_stack
*prev
;
// Bookkeeping for one string that is being collected while parsing.
// The helper functions below access the members `start`, `current`, `len`
// and `result`; xml_parse initializes instances with four zero values.
// NOTE(review): the member declarations themselves are not visible in this
// excerpt - confirm their order and types against the full file.
133 typedef struct _stringstate
{
// Marks the beginning of a string: stores the address of buffer[pos] in
// s->start.
// NOTE(review): only this one statement of the body is visible here; the
// function may reset further members.
140 static void stringstate_start(stringstate_t
*s
, char*buffer
, int pos
)
143 s
->start
= &buffer
[pos
];
// Copies the bytes between s->start and &buffer[pos] into the heap string
// s->current and NUL-terminates it.
// Two copy paths are visible: a fresh malloc(add+1) followed by a copy to the
// start of s->current, and a realloc to s->len+add+1 followed by a copy at
// offset s->len (i.e. appending to what was saved before).
// NOTE(review): the conditions selecting between the two paths, and any
// update of s->len or s->start, are not visible in this excerpt.
147 static void stringstate_save(stringstate_t
*s
, char*buffer
, int pos
)
// number of bytes collected since s->start
151 int add
= &buffer
[pos
] - s
->start
;
// NOTE(review): malloc result is used without a NULL check
154 s
->current
= malloc(add
+1);
155 memcpy(s
->current
, s
->start
, add
);
// NOTE(review): assigning realloc's result straight back to s->current loses
// the old block if realloc fails; the result is also not NULL-checked
157 s
->current
= realloc(s
->current
, s
->len
+ add
+ 1);
158 memcpy(s
->current
+s
->len
, s
->start
, add
);
// terminate the accumulated string
161 s
->current
[s
->len
] = 0;
// Completes a string: saves the bytes up to buffer[pos] via stringstate_save
// and then publishes the accumulated string by assigning s->current to
// s->result.
// NOTE(review): any reset of s->current / s->start after the hand-over is not
// visible in this excerpt.
164 static void stringstate_finish(stringstate_t
*s
, char*buffer
, int pos
)
166 stringstate_save(s
, buffer
, pos
);
167 s
->result
= s
->current
;
// Releases the heap strings held in s->result and s->current and zeroes the
// whole structure so it can be reused.
// NOTE(review): free(NULL) is a no-op in C, so the two NULL guards are
// redundant (harmless).
171 static void stringstate_clear(stringstate_t
*s
)
173 if(s
->result
) free(s
->result
);
174 if(s
->current
) free(s
->current
);
175 memset(s
, 0, sizeof(stringstate_t
));
// Takes the head of an attribute list and returns an xmlattribute_t pointer.
// xml_parse prepends each new attribute to its list (a->next = attributes),
// so the list is in reverse order of appearance; it assigns this function's
// result back to `attributes` before handing the list to out->start_tag.
// NOTE(review): only the `prev = 0` initialization and the `next = attr->next`
// save are visible; presumably this is an in-place list reversal - confirm
// against the full file.
177 static xmlattribute_t
*attributes_reverse(xmlattribute_t
*attr
)
179 xmlattribute_t
*prev
= 0;
181 xmlattribute_t
*next
= attr
->next
;
// Releases the name and value strings of an attribute list node; the successor
// is read into `next` before the frees.
// NOTE(review): the loop structure and the free of the node itself are not
// visible in this excerpt.
189 static void attributes_free(xmlattribute_t
*attributes
)
// remember the successor before this node's members are freed
192 xmlattribute_t
*next
= attributes
->next
;
// the casts drop the qualifier of name/value so free() accepts them
193 free((void*)attributes
->name
);
194 free((void*)attributes
->value
);
// Table-driven XML parser. Reads input through reader->read in chunks of up
// to 4096 bytes, feeds every byte through the group/new_state tables and
// reports what it finds through the callbacks of `out`:
//   out->start_tag(out, name, attributes), out->data(out, text, len),
//   out->end_tag(out, name).
// Errors are reported on stderr.
// NOTE(review): loop/switch scaffolding, several case labels, the declarations
// of `buffer`, `pos`, `first`, `st` and `a`, and the return statements are not
// visible in this excerpt; the comments below describe visible lines only.
200 int xml_parse(reader_t
*reader
, xmlconsumer_t
*out
)
// `old` keeps the state before the current byte, for error messages
203 int old
= 0, state
= 0;
205 tag_stack_t
*stack
= 0;
// one collector per kind of string the parser extracts
207 stringstate_t tagname
= {0,0,0,0};
208 stringstate_t attr_name
= {0,0,0,0};
209 stringstate_t attr_value
= {0,0,0,0};
210 stringstate_t data
= {0,0,0,0};
// attributes of the tag currently being parsed, newest first
211 xmlattribute_t
*attributes
= 0;
214 int num
= reader
->read(reader
, buffer
, 4096);
219 /*printf("%c, state %d->%d\n",
220 buffer[pos], state, new_state[state][group[buffer[pos]]]);*/
// One step of the state machine: remember the current state in `old`,
// classify the next byte via group[] (the cast keeps the index in 0..255),
// look up the new state/action and advance pos past the byte.
224 state
= new_state
[old
=state
][group
[(unsigned char)buffer
[pos
++]]];
230 case -63: // end of buffer
232 // we could backtrace, but the spec says this is indeed illegal
233 fprintf(stderr
, "error: xml contains \\0 chars\n");
240 case -1: // self closing tag
// attributes were collected newest-first; restore document order
241 attributes
= attributes_reverse(attributes
);
// a self closing tag is reported as a start tag directly followed by an end tag
242 out
->start_tag(out
, tagname
.result
, attributes
);
243 out
->end_tag(out
, tagname
.result
);
244 stringstate_clear(&tagname
);
245 attributes_free(attributes
);attributes
= 0;
// whatever follows the tag is collected as character data
246 stringstate_start(&data
, buffer
, pos
);
249 case -6: // after <tagname, at >
// pos was already advanced past '>', hence pos-1
250 stringstate_finish(&tagname
, buffer
, pos
-1);
// NOTE(review): malloc result is used without a NULL check
253 st
= malloc(sizeof(tag_stack_t
));
// the stack node keeps the tag name pointer
254 st
->name
= tagname
.result
;
257 attributes
= attributes_reverse(attributes
);
258 if(!first
) out
->start_tag(out
, tagname
.result
, attributes
);
259 attributes_free(attributes
);attributes
= 0;
260 stringstate_start(&data
, buffer
, pos
);
263 case -3: case -13: // after <, start of tag name
// the byte just consumed (pos-1) is the first character of the name
265 stringstate_start(&tagname
, buffer
, pos
-1);
// continue in state 6 (start tag name) or 13 (end tag name)
266 state
= state
==-3?6:13;
268 case -14: // after </, end of tag name, begin of white space
269 stringstate_finish(&tagname
, buffer
, pos
-1);
272 case -16: // after </, at >, end of tag name
273 stringstate_finish(&tagname
, buffer
, pos
-1);
275 case -15: // after </, at >
276 out
->end_tag(out
, tagname
.result
);
277 stringstate_clear(&tagname
);
278 stringstate_start(&data
, buffer
, pos
);
281 case -4: // end of data
282 stringstate_finish(&data
, buffer
, pos
-1);
283 if(!first
) out
->data(out
, data
.result
, data
.len
);
284 stringstate_clear(&data
);
287 case -7: // after <, at whitespace, end of tag name
288 stringstate_finish(&tagname
, buffer
, pos
-1);
291 case -8: // inside tag, start of attribute name
292 stringstate_start(&attr_name
, buffer
, pos
-1);
296 stringstate_finish(&tagname
, buffer
, pos
-1);
299 case -12: // end of attribute name, at ws
300 stringstate_finish(&attr_name
, buffer
, pos
-1);
303 case -10: // end of attribute name, at =
304 stringstate_finish(&attr_name
, buffer
, pos
-1);
307 case -11: // start of attribute value
// pos already points past the opening quote, so the value starts at pos
308 stringstate_start(&attr_value
, buffer
, pos
);
311 case -5: // end of attribute value
312 stringstate_finish(&attr_value
, buffer
, pos
-1);
// NOTE(review): malloc result is used without a NULL check
313 a
= malloc(sizeof(xmlattribute_t
));
// the new node takes over both strings; clearing .result keeps a later
// stringstate_clear from freeing them
314 a
->name
= attr_name
.result
;attr_name
.result
=0;
315 a
->value
= attr_value
.result
;attr_value
.result
=0;
// prepend: the list ends up in reverse order of appearance
316 a
->next
= attributes
;
// error codes are negative; the low six bits of the negated code select the
// message in errors[]
325 fprintf(stderr
, "%s (state %d, char '%c')\n", errors
[(-state
)&0x3f], old
, buffer
[pos
-1]);
328 fprintf(stderr
, "internal error: no action %d\n", state
);
// save the parts of all strings collected so far in this buffer
// (presumably because the buffer is about to be refilled - confirm)
334 stringstate_save(&tagname
, buffer
, pos
);
335 stringstate_save(&attr_name
, buffer
, pos
);
336 stringstate_save(&attr_value
, buffer
, pos
);
337 stringstate_save(&data
, buffer
, pos
);
340 /* note: any of these except data *has* to be empty for a well formed xml */
341 stringstate_clear(&tagname
);
342 stringstate_clear(&attr_name
);
343 stringstate_clear(&attr_value
);
344 stringstate_clear(&data
);
// release the tag stack entries, following the prev links
347 tag_stack_t
*next
= stack
->prev
;
348 free((void*)stack
->name
);
// Start-tag callback that prints to stdout: walks the attribute list and
// prints each entry as ` name="value"`.
// NOTE(review): the statements printing the tag name and the closing '>' are
// not visible in this excerpt.
356 void my_start_tag(xmlconsumer_t
*c
, char*name
, xmlattribute_t
*attr
)
359 for(;attr
;attr
=attr
->next
) {
360 printf(" %s=\"%s\"", attr
->name
, attr
->value
);
// Character-data callback; receives the text and its length.
// NOTE(review): the body is not visible in this excerpt.
364 void my_data(xmlconsumer_t
*c
, char*data
, int len
)
// End-tag callback: prints the closing tag `</name>` to stdout.
368 void my_end_tag(xmlconsumer_t
*c
, char*name
)
370 printf("</%s>", name
);
// Test-driver fragment: builds a consumer from the three my_* callbacks (the
// fourth initializer is 0) and initializes a file reader on "test.xml".
// NOTE(review): the enclosing function header, the declaration of `r` and the
// call into xml_parse are not visible in this excerpt.
374 xmlconsumer_t c
= {my_start_tag
, my_data
, my_end_tag
, 0};
377 reader_init_filereader2(&r
, "test.xml");