3 Copyright (C) 2016 and later, Cockos Incorporated
5 This software is provided 'as-is', without any express or implied
6 warranty. In no event will the authors be held liable for any damages
7 arising from the use of this software.
9 Permission is granted to anyone to use this software for any purpose,
10 including commercial applications, and to alter it and redistribute it
11 freely, subject to the following restrictions:
13 1. The origin of this software must not be misrepresented; you must not
14 claim that you wrote the original software. If you use this software
15 in a product, an acknowledgment in the product documentation would be
16 appreciated but is not required.
17 2. Altered source versions must be plainly marked as such, and must not be
18 misrepresented as being the original software.
19 3. This notice may not be removed or altered from any source distribution.
21 very very very lightweight XML parser
23 reads: <?xml, <!DOCTYPE, <![CDATA[, &lt;&gt;&amp;&quot;&apos;&#xABC;&#123; top level <? blocks, ignores unknown <?tag blocks?> inside elements
24 always uses 8-bit characters, uses UTF-8 encoding for &#xyz
25 relatively strict. for overflow safety, enforces a token length limit of 512MB
29 #ifndef _WDL_XML_PARSE_H_
30 #define _WDL_XML_PARSE_H_
32 #include "assocarray.h"
33 #include "wdlstring.h"
35 #include "wdlcstring.h"
// One parsed XML element: holds a copy of the element name, the source
// line/column passed in by the parser, an attribute map, accumulated text
// in value, and a list of child elements.
// NOTE(review): several lines of this class (access specifier, the name/line/col
// member declarations, braces) are not present in this excerpt.
37 class wdl_xml_element
{
// Deallocator for attribute strings. It is passed twice to the attributes
// container constructor below, presumably as key and value disposer -- confirm
// against assocarray.h.
38 static void attr_free(char *a
) { free(a
); }
// Stores a strdup() copy of _name plus the source position. _sort_attr selects
// whether attributes are kept sorted (see get_attribute). m_has_discrete_close
// starts out false.
// NOTE(review): the strdup() result is not checked here.
40 wdl_xml_element(const char *_name
, int _line
, int _col
, bool _sort_attr
=true) :
41 attributes(WDL_assocarray_cmpstr
<char>,NULL
,attr_free
,attr_free
), name(strdup(_name
)), line(_line
), col(_col
),
42 m_sort_attributes(_sort_attr
), m_has_discrete_close(false) { }
// Releases the name copy and empties the child list; Empty(true) presumably
// also deletes each child element -- confirm against WDL_PtrList.
43 ~wdl_xml_element() { free(name
); elements
.Empty(true); }
// Child elements, in the order the parser added them.
45 WDL_PtrList
<wdl_xml_element
> elements
;
// Attribute name -> attribute value, both heap strings.
46 WDL_AssocArray
<char *, char *> attributes
;
47 WDL_FastString value
; // value excluding any leading whitespace and excluding any elements
// When false, attributes are appended unsorted by the parser and lookups
// fall back to a linear scan.
51 bool m_sort_attributes
;
// Set to true by the parser once a matching closing tag has been consumed.
52 bool m_has_discrete_close
;
// Looks up attribute v. With unsorted attributes the entries are enumerated
// and compared with strcmp(); otherwise the container lookup is used with def
// as its default argument.
// NOTE(review): the result of the unsorted path when nothing matches is on a
// line not visible here; presumably def.
54 const char *get_attribute(const char *v
, const char *def
=NULL
) const
56 if (!m_sort_attributes
)
58 const int n
= attributes
.GetSize();
59 for (int x
= 0; x
< n
; x
++)
62 const char *val
= attributes
.Enumerate(x
,&key
);
63 if (key
&& !strcmp(key
,v
)) return val
;
// Sorted case: delegate to the container lookup.
66 return attributes
.Get((char*)v
,(char*)def
);
// Parser over an in-memory byte range; more input can be supplied by
// overriding moredata(). Results are exposed through element_xml,
// element_root, element_root_meta and element_doctype_tokens.
70 class wdl_xml_parser
{
// rdptr/rdptr_len describe the initial input buffer (the pointer is only
// stored, not copied). sort_attributes is forwarded to every element created.
// Position tracking starts at line 1, column 0; no error is set and no
// elements exist yet.
72 wdl_xml_parser(const char *rdptr
, int rdptr_len
, bool sort_attributes
=true) :
73 element_xml(NULL
), element_root(NULL
),
74 m_rdptr((const unsigned char *)rdptr
), m_err(NULL
),
75 m_rdptr_len(rdptr_len
), m_line(1), m_col(0), m_lastchar(0),
76 m_last_line(1),m_last_col(0),
77 m_sort_attributes(sort_attributes
)
// Releases parse results. The DOCTYPE token strings are released with free()
// (they are created with strdup() during parsing); the top level <? elements
// list is emptied with Empty(true).
// NOTE(review): the lines between the signature and these two statements are
// not visible; presumably they release element_xml and element_root -- confirm.
80 virtual ~wdl_xml_parser()
84 element_doctype_tokens
.Empty(true,free
);
85 element_root_meta
.Empty(true);
// Entry point: primes the one-character lookahead, allocates the initial
// 256 byte token buffer, then parses the document level body.
// NOTE(review): the conditionals that select between the return paths are on
// lines not visible in this excerpt.
88 const char *parse() // call only once, returns NULL on success, error message on failure
// Prime the lookahead character.
90 m_lastchar
= nextchar();
92 if (!m_tok
.ResizeOK(256)) return "token buffer malloc fail";
// NULL element means document level.
94 const char *p
= parse_element_body(NULL
);
// An empty m_err string marks end of input (see the m_err declaration);
// replace it with a readable message.
96 if (!*m_err
) m_err
="unexpected end of file";
// Combine the structural message p with the detailed m_err text.
98 snprintf(m_errbuf
,sizeof(m_errbuf
),"%s: %s",p
,m_err
);
// Parse results. Token strings in element_doctype_tokens are strdup() copies
// added while parsing <!DOCTYPE.
103 WDL_PtrList
<char> element_doctype_tokens
; // tokens after <!DOCTYPE
// element_xml: the <?xml declaration, if any. element_root: the single top
// level element. Both start as NULL and are created with new during parsing.
104 wdl_xml_element
*element_xml
, *element_root
;
106 WDL_PtrList
<wdl_xml_element
> element_root_meta
; // any top level <? elements?> other than <?xml which goes into element_xml
108 // get location after parse() returns error
109 int getLine() const { return m_last_line
; }
110 int getCol() const { return m_last_col
; }
// Read cursor into the current input buffer; advanced one byte per character
// fetched, and replaced when moredata() supplies a new buffer.
116 const unsigned char *m_rdptr
;
117 const char *m_err
; // NULL if no error, "" if EOF
// m_rdptr_len: length value supplied by the constructor or by moredata().
// m_line/m_col: current read position (m_line is incremented on newline).
// m_lastchar: one-character lookahead (a value below 1 means no character).
// m_last_line/m_last_col: position reported by getLine()/getCol().
119 int m_rdptr_len
, m_line
, m_col
, m_lastchar
, m_last_line
,m_last_col
;
// Forwarded to each wdl_xml_element the parser creates.
120 bool m_sort_attributes
;
// Refill hook, called when the current buffer is used up. An override stores
// a pointer to new data in *dataOut and returns its length; a return value
// below 1 is treated by the caller as end of input. The base implementation
// never supplies more data.
122 virtual int moredata(const char **dataOut
) { return 0; }
// Body of the character-fetch routine.
// NOTE(review): the signature line is not visible in this excerpt; presumably
// this is nextchar(), which the rest of the class calls.
// When the buffer is used up, ask moredata() for a new one; give up with -1
// if it returns less than one byte.
126 if (m_rdptr_len
< 1 && (m_rdptr_len
= moredata((const char **)&m_rdptr
)) < 1) return -1;
// Fetch the next byte as an unsigned value and advance the cursor.
129 const int ret
= (int)*m_rdptr
++;
// A newline starts a new line and resets the column.
131 if (ret
== '\n') { m_line
++; m_col
=0; }
// Consumes characters for as long as char_type() classifies the lookahead as
// negative (the whitespace class), counting how many were consumed in rv.
// NOTE(review): the lines that store the lookahead back and return are not
// visible here; presumably m_lastchar is updated and rv returned.
137 int skip_whitespace()
139 int rv
=0, lc
= m_lastchar
;
140 while (char_type(lc
) < 0) { lc
= nextchar(); rv
++; }
// Classifies a character for the tokenizer. As used elsewhere in this class:
// a negative result means whitespace, a nonzero result means the character
// cannot be part of a name, and results 0 and 4 are both accepted while
// continuing a name token.
// NOTE(review): the return statements and some case labels are not visible in
// this excerpt, so the exact value of each class cannot be confirmed here.
// First visible group: whitespace. Following groups: punctuation.
145 static int char_type(int c
)
149 case ' ': case '\r': case '\n': case '\t':
152 case '/': case '!': case '\\': case '\'': case '"': case '#': case '$':
153 case '%': case '(': case ')': case '*': case '+': case ',': case ';':
154 case '=': case '>': case '?': case '@': case '[': case ']': case '^':
155 case '`': case '{': case '|': case '}': case '~':
// Grows the token buffer. tok_sz is updated in place to the new size; m_err is
// set when the size limit is reached or the resize fails.
// NOTE(review): the return statements are not visible; callers treat a NULL
// result as failure.
167 unsigned char *realloc_tok(int &tok_sz
)
// New size is 2.25 times the old size.
169 tok_sz
+= tok_sz
+ tok_sz
/ 4;
// Refuse to grow to 1<<29 bytes (512MB) or beyond.
170 if (tok_sz
>= (1<<29))
172 m_err
="token buffer tried to malloc() more than 512MB, probably unsafe and invalid XML";
175 unsigned char *t
= (unsigned char *) m_tok
.ResizeOK(tok_sz
);
176 if (!t
) m_err
="token buffer malloc fail";
// Tokenizer. Reads one token starting at the lookahead character into the
// shared m_tok buffer and returns a pointer to it; returns NULL on failure
// (m_err describes the problem when one is set).
// The returned pointer refers to m_tok storage, so it is overwritten by the
// next call.
// NOTE(review): many lines of this function (case labels, loop headers, the
// terminating write and several returns) are not visible in this excerpt.
180 // gets a token, normally skipping whitespace, but if get_tok(true), then return NULL on whitespace
181 const char *get_tok(bool no_skip_whitespace
=false)
183 if (!no_skip_whitespace
) skip_whitespace();
// Remember where this token starts for error reporting.
185 m_last_line
= m_line
;
188 int wrpos
=0, lc
= m_lastchar
, tok_sz
= m_tok
.GetSize();
189 unsigned char *tok_buf
= (unsigned char *)m_tok
.Get();
// Dispatch on the class of the first character; -2 is used when there is no
// character (end of input).
190 switch (lc
> 0 ? char_type(lc
) : -2)
// Name-like token: append characters, growing the buffer when full.
195 tok_buf
[wrpos
++] = lc
;
196 if (WDL_unlikely(wrpos
>= tok_sz
) && WDL_unlikely(!(tok_buf
=realloc_tok(tok_sz
)))) return NULL
;
// Continue while the next character has class 0 or 4.
199 while (lc
> 0 && !(char_type(lc
)&~4));
// Quoted string: the opening quote character is stored as part of the token.
205 if (lc
== '\'' || lc
== '\"')
208 tok_buf
[wrpos
++] = lc
;
// A raw less-than sign inside a quoted string is an error reported at the
// current position.
214 m_last_line
=m_line
; m_last_col
=m_col
;
215 m_err
="illegal '<' character in quoted string";
// Entity inside the string: make sure 8 bytes are available, since
// decode_entity() writes at most that many.
223 if (WDL_unlikely(wrpos
+8 >= tok_sz
) && WDL_unlikely(!(tok_buf
=realloc_tok(tok_sz
)))) return NULL
;
// Save the position of the entity start so a failure points at it.
225 const int tmp
[2]={m_line
,m_col
};
226 if (!decode_entity((char*)tok_buf
+wrpos
))
228 m_last_line
=tmp
[0]; m_last_col
=tmp
[1];
229 m_err
="unknown entity in quoted string";
// Advance past the decoded bytes; this relies on the decoded output being
// NUL terminated -- presumably guaranteed by WDL_MakeUTFChar, confirm.
233 while (tok_buf
[wrpos
]) wrpos
++;
// Ordinary character inside the string.
238 tok_buf
[wrpos
++] = lc
;
239 if (WDL_unlikely(wrpos
>= tok_sz
) && WDL_unlikely(!(tok_buf
=realloc_tok(tok_sz
)))) return NULL
;
// Stop once the character just stored is the closing quote.
242 if (llc
== endc
) break;
// Any other character is stored as a token of its own.
248 tok_buf
[wrpos
++] = lc
;
249 if (WDL_unlikely(wrpos
>= tok_sz
) && WDL_unlikely(!(tok_buf
=realloc_tok(tok_sz
)))) return NULL
;
// Error text used when whitespace is found where it is not permitted.
255 m_err
="unexpected whitespace";
263 return (char *)tok_buf
;
266 bool decode_entity(char *wr
) // will never write more than 8 bytes
270 while (i
< 31 && (m_lastchar
= nextchar()) > 0 && m_lastchar
!= ';')
272 if (char_type(m_lastchar
) && m_lastchar
!= '#') break;
273 tmp
[i
++] = m_lastchar
;
276 if (m_lastchar
== ';')
279 if (!strcmp(tmp
,"lt")) byteval
= '<';
280 else if (!strcmp(tmp
,"gt")) byteval
= '>';
281 else if (!strcmp(tmp
,"amp")) byteval
= '&';
282 else if (!strcmp(tmp
,"apos")) byteval
= '\'';
283 else if (!strcmp(tmp
,"quot")) byteval
= '"';
284 else if (tmp
[0] == '#')
286 if (tmp
[1] >= '0' && tmp
[1] <= '9') byteval
= atoi(tmp
+1);
287 if (tmp
[1] == 'x') byteval
= strtol(tmp
+1,NULL
,16);
290 if (!byteval
) return false;
291 WDL_MakeUTFChar((char*)wr
,byteval
,8);
292 m_lastchar
= nextchar();
// Consumes input until the literal string s has been seen, using a small
// match counter instead of tokenizing. The loop runs while characters are
// available and s has not been fully matched.
// NOTE(review): the line that fetches the next character, the store back to
// m_lastchar and the return statement are not visible in this excerpt.
296 bool skip_until(const char *s
) // raw search, no tokenization
298 int state
= 0, c
= m_lastchar
;
299 while (c
>0 && s
[state
])
// Extend the current partial match on a matching character; otherwise restart,
// counting this character as a first-character match when it equals s[0].
301 state
= (state
&& c
== (unsigned char)s
[state
]) ? (state
+1) : (c
== (unsigned char)s
[0]);
// Parses name=quoted-value pairs of an open tag into elem->attributes.
// Returns the first token that is not an attribute name (a token whose first
// character is not a name character), so the caller can inspect how the tag
// ends. Errors set m_err and leave the loop.
// NOTE(review): the loop header, the token fetches between checks, the value
// terminator write and the cleanup after the loop are not visible in this
// excerpt.
308 const char *parse_element_attributes(wdl_xml_element
*elem
)
// Heap copy of the attribute name currently being parsed.
310 char *attr_name
=NULL
;
313 const char *tok
= get_tok();
316 if (*tok
== '-' || *tok
== '.' || (*tok
>= '0' && *tok
<= '9')) { m_err
="attribute must not begin with .- or number"; break; }
// A token starting with a non-name character ends the attribute list.
318 if (char_type(*tok
)) return tok
;
// The token buffer is reused by the next get_tok() call, so copy the name.
320 attr_name
= strdup(tok
);
321 if (!attr_name
) { m_err
="malloc fail"; break; }
// Duplicate detection is only performed when attributes are kept sorted.
323 if (m_sort_attributes
&&
324 elem
->attributes
.Get(attr_name
))
326 m_err
="attribute specified more than once";
332 if (*tok
!= '=') { m_err
="attribute name must be followed by '='"; break; }
336 if (*tok
!= '\'' && *tok
!= '"') { m_err
="attribute value must be quoted string"; break; }
// The quoted token includes both quote characters; verify the closing one.
338 const size_t tok_len
= strlen(tok
);
339 if (tok_len
< 2 || tok
[tok_len
-1] != tok
[0]) { m_err
="attribute value missing trailing quote"; break; }
// Allocate room for the text between the quotes plus a terminator.
341 char *value
= (char *)malloc(tok_len
-2+1);
342 if (!value
) { m_err
="malloc fail"; break; }
// Copy the text between the quotes.
344 memcpy(value
,tok
+1,tok_len
-2);
// Hand name and value to the attribute container, sorted or appended.
347 if (m_sort_attributes
)
348 elem
->attributes
.Insert(attr_name
,value
);
350 elem
->attributes
.AddUnsorted(attr_name
,value
);
// Parses the content of elem, or the document level when elem is NULL:
// character data, comments, CDATA sections, <!DOCTYPE, <? blocks, closing
// tags and nested elements (handled by recursion).
// On failure the returned string names the construct being parsed; m_err may
// hold additional detail, and m_last_line/m_last_col give the position.
// NOTE(review): many lines of this function (loop headers, braces, token
// fetches between checks, several returns) are not visible in this excerpt.
358 const char *parse_element_body(wdl_xml_element
*elem
) // return NULL on success, error message on failure
// Character data. Leading whitespace is dropped: nothing is appended until the
// value is non-empty or a non-whitespace character is seen.
365 bool want_add
= elem
->value
.GetLength() > 0;
366 while (m_lastchar
!= '<' && m_lastchar
> 0)
368 if (!want_add
&& char_type(m_lastchar
)>=0) want_add
=true;
// Entity in character data: decode and append the result.
371 if (m_lastchar
== '&')
373 m_last_line
=m_line
; m_last_col
=m_col
;
375 if (!decode_entity(buf
)) return "unknown entity in element body";
376 elem
->value
.Append(buf
);
// Ordinary byte: append as is.
381 unsigned char c
= (unsigned char)m_lastchar
;
382 elem
->value
.Append((const char *)&c
,1);
385 if (adv
) m_lastchar
= nextchar();
// Next token. Inside an element whitespace is not skipped; at document level
// it is.
389 const char *tok
= get_tok(elem
!= NULL
);
// Remember where this construct starts for unterminated-block errors.
390 const int start_line
= m_last_line
, start_col
= m_last_col
;
// Running out of input is fine at document level but not inside an element.
393 if (m_err
&& *m_err
== 0 && !elem
) m_err
= NULL
; // clear m_error if EOF and top level
394 return elem
? "unterminated block" : NULL
;
396 if (*tok
!= '<') return "expected < tag";
399 if (!tok
) return "expected token after <";
404 if (!tok
) return "expected token following <!";
// Comment: skip raw input up to the next double dash, which must then be
// followed directly by the closing angle bracket.
409 if (!tok
) return "expected token following <!-";
410 if (*tok
!= '-') return "unknown token following <!-";
411 if (!skip_until("--"))
413 m_last_line
=start_line
;
414 m_last_col
=start_col
;
415 return "unterminated comment";
418 if (!tok
|| tok
[0] != '>') return "-- not allowed in comment";
// CDATA section, only permitted inside an element.
420 else if (*tok
== '[')
422 if (!elem
) return "<![ not allowed at document level";
424 if (!tok
|| strcmp(tok
,"CDATA")) return "unknown token beginning <![";
426 if (!tok
|| tok
[0] != '[') return "unknown token beginning <![CDATA but without trailing [";
428 // add content literally until ]]>
429 int lc
=m_lastchar
, last1
=0,last2
=0;
// The previous two characters are tracked to recognize the terminator.
432 if (lc
== '>' && last1
== ']' && last2
== ']') break;
434 unsigned char c
= (unsigned char)lc
;
435 elem
->value
.Append((const char *)&c
,1);
443 m_last_line
=start_line
;
444 m_last_col
=start_col
;
445 return "unterminated <![CDATA[";
448 elem
->value
.SetLen(elem
->value
.GetLength()-2); // remove ]]
449 m_lastchar
= nextchar();
// DOCTYPE: allowed once, at document level; every following token up to the
// closing angle bracket is stored as a strdup() copy.
// NOTE(review): the strdup() result is added without a NULL check.
452 else if (!strcmp(tok
,"DOCTYPE"))
454 if (elem
) return "<!DOCTYPE must be at top level";
455 if (element_doctype_tokens
.GetSize()) return "<!DOCTYPE already specified";
458 if (!tok
|| char_type(*tok
)) return "expected document type token following <!DOCTYPE";
461 element_doctype_tokens
.Add(strdup(tok
));
465 m_last_line
=start_line
;
466 m_last_col
=start_col
;
467 return "unterminated <!DOCTYPE";
469 } while (tok
[0] != '>');
471 else return "unknown token following <!";
// Processing instruction style block.
473 else if (tok
[0] == '?')
476 if (!tok
) return "expected token following <?";
// The xml declaration must be the very first thing in the document.
478 if (!strcmp(tok
,"xml"))
480 if (elem
|| cnt
|| element_xml
|| element_root_meta
.GetSize()) return "<?xml must begin document";
482 element_xml
= new wdl_xml_element("xml",start_line
,start_col
,m_sort_attributes
);
483 tok
= parse_element_attributes(element_xml
);
// The declaration must end with a question mark directly followed by the
// closing angle bracket.
484 if (!tok
|| tok
[0] != '?' || !(tok
=get_tok(true)) || tok
[0] != '>')
485 return "<?xml not terminated";
// Any other top level <? block is parsed like an element with attributes and
// collected in element_root_meta.
491 wdl_xml_element
*ne
= new wdl_xml_element(tok
,start_line
,start_col
,m_sort_attributes
);
492 tok
= parse_element_attributes(ne
);
493 if (!tok
|| tok
[0] != '?' || !(tok
=get_tok(true)) || tok
[0] != '>')
496 return "<? element not terminated";
498 element_root_meta
.Add(ne
);
500 else if (!skip_until("?>")) // ignore <? inside elements
502 m_last_line
=start_line
;
503 m_last_col
=start_col
;
504 return "unterminated <? block";
// Closing tag: the name must equal the name of the element being parsed.
508 else if (tok
[0] == '/')
510 if (!elem
) return "unexpected </ at root level";
513 if (strcmp(tok
,elem
->name
))
515 return "mismatched </ tag name";
519 if (!tok
|| tok
[0] != '>') return "expected > following </tag";
521 elem
->m_has_discrete_close
= true;
// Opening tag of a new element; only one element may exist at document level.
526 if (!elem
&& element_root
)
527 return "multiple top level elements";
529 if (*tok
== '-' || *tok
== '.' || (*tok
>= '0' && *tok
<= '9'))
530 return "element name must not begin with .- or number";
// The new element is attached to its parent (or becomes the root) before its
// attributes are parsed, so it is owned by the tree even if parsing fails.
532 wdl_xml_element
*sub
= new wdl_xml_element(tok
,start_line
,start_col
,m_sort_attributes
);
533 if (elem
) elem
->elements
.Add(sub
);
534 else element_root
= sub
;
536 tok
= parse_element_attributes(sub
);
537 if (!tok
) return "unterminated element";
// Self-closing form: a slash must be followed by the closing angle bracket.
542 if (!tok
|| *tok
!= '>') return "expected > following / to end element";
// Regular open tag: recurse to parse the content of the new element.
544 else if (*tok
== '>')
546 const char *ret
= parse_element_body(sub
);
551 return "unknown token in element";
// Parser variant that pulls its input from a FILE handle through the
// moredata() hook, converting UTF-16 input to UTF-8 when a UTF-16 byte order
// mark was found. The destructor closes the handle.
// NOTE(review): member declarations (m_fp, m_buf, m_charset), case labels,
// loop headers and braces are not visible in this excerpt.
559 class wdl_xml_fileread
: public wdl_xml_parser
{
// Refill hook: reads the next chunk of the file into m_buf.
// NOTE(review): the assignment to *dataptr and the final return are on lines
// not visible here.
564 virtual int moredata(const char **dataptr
)
567 const int cs
= m_charset
;
568 if (m_fp
) switch (cs
)
// Charset 0 (UTF-8): bytes are passed through unchanged.
571 return (int) fread(m_buf
,1,sizeof(m_buf
),m_fp
);
// UTF-16: read up to 128 bytes and convert each 16 bit unit to UTF-8.
575 unsigned char tmp
[128];
576 const int l
= (int) fread(tmp
,1,sizeof(tmp
),m_fp
);
// Charset 1 combines the byte pair high byte first (big endian); otherwise
// low byte first (little endian). Each unit is converted on its own, so
// surrogate pairs are not combined here.
580 const int amt
=wdl_utf8_makechar(cs
==1 ? ((tmp
[rd
]<<8)|tmp
[rd
+1]) : (tmp
[rd
]|(tmp
[rd
+1]<<8)),
582 (int)sizeof(m_buf
)-wpos
);
583 if (amt
>0) wpos
+= amt
;
// Starts the base parser with no initial buffer, so all input arrives through
// moredata(). Detects the encoding from a byte order mark at the start of fp.
593 wdl_xml_fileread(FILE *fp
) : wdl_xml_parser(NULL
,0)
596 m_charset
=0; // default to utf-8
599 unsigned char bom
[2];
// NOTE(review): when fewer than 2 bytes can be read, no rewind is visible, so
// a one byte file would lose its only byte -- confirm against the full source.
600 if (fread(bom
,1,2,fp
)==2)
// UTF-8 mark is three bytes; the third is read only when the first two match.
// If the third byte does not match, the final else below rewinds the file.
602 if (bom
[0] == 0xEF && bom
[1] == 0xBB && fgetc(fp
) == 0xBF) m_charset
=0;
603 else if (bom
[0] == 0xFE && bom
[1] == 0xFF) m_charset
=1; // utf-16 BE
604 else if (bom
[0] == 0xFF && bom
[1] == 0xFE) m_charset
=2; // utf-16 LE
605 else fseek(fp
,0,SEEK_SET
); // rewind
// Closes the file handle if one is held.
609 virtual ~wdl_xml_fileread() { if (m_fp
) fclose(m_fp
); }