3 Copyright (C) 2016 and later, Cockos Incorporated
5 This software is provided 'as-is', without any express or implied
6 warranty. In no event will the authors be held liable for any damages
7 arising from the use of this software.
9 Permission is granted to anyone to use this software for any purpose,
10 including commercial applications, and to alter it and redistribute it
11 freely, subject to the following restrictions:
13 1. The origin of this software must not be misrepresented; you must not
14 claim that you wrote the original software. If you use this software
15 in a product, an acknowledgment in the product documentation would be
16 appreciated but is not required.
17 2. Altered source versions must be plainly marked as such, and must not be
18 misrepresented as being the original software.
19 3. This notice may not be removed or altered from any source distribution.
21 very very very lightweight XML parser
23 reads: <?xml, <!DOCTYPE, <![CDATA[, &lt;&gt;&amp;&quot;&apos;&#xABC;&#123;; ignores unknown <?tag blocks?>
24 always uses 8-bit characters, uses UTF-8 encoding for &#xyz
25 relatively strict. for overflow safety, enforces a token length limit of 512MB
29 #ifndef _WDL_XML_PARSE_H_
30 #define _WDL_XML_PARSE_H_
32 #include "assocarray.h"
33 #include "wdlstring.h"
// One node of the parsed document. The constructor initialises a tag name,
// a source position (line/col), an attribute map, and the two flags below;
// child elements and the accumulated text value are held in `elements` and
// `value`.
36 class wdl_xml_element
{
// Key comparator handed to the attribute container: orders keys with strcmp().
37 static int attr_cmp(char **a
, char **b
) { return strcmp(*a
,*b
); }
// Disposal callback handed (twice) to the attribute container; releases a
// malloc()ed string. Presumably one slot is for keys and one for values --
// confirm against WDL_AssocArray.
38 static void attr_free(char *a
) { free(a
); }
// Builds an empty element. _name is duplicated with strdup() and the copy is
// released by the destructor; _line/_col are stored as given; _sort_attr
// selects the lookup strategy used by get_attribute().
// NOTE(review): the strdup() result is not checked for NULL here.
40 wdl_xml_element(const char *_name
, int _line
, int _col
, bool _sort_attr
=true) :
41 attributes(attr_cmp
,NULL
,attr_free
,attr_free
), name(strdup(_name
)), line(_line
), col(_col
),
42 m_sort_attributes(_sort_attr
), m_has_discrete_close(false) { }
// Releases the duplicated name and empties the child list. Empty(true)
// presumably also destroys the child elements -- confirm against WDL_PtrList.
43 ~wdl_xml_element() { free(name
); elements
.Empty(true); }
// Child elements (the parser appends each nested element here).
45 WDL_PtrList
<wdl_xml_element
> elements
;
// Attribute name -> attribute value, both heap strings.
46 WDL_AssocArray
<char *, char *> attributes
;
47 WDL_FastString value
; // value excluding any leading whitespace and excluding any elements
// When false, get_attribute() scans the attributes linearly instead of
// using the container's keyed lookup.
51 bool m_sort_attributes
;
// Set by the parser when an explicit </name> closing tag was consumed.
52 bool m_has_discrete_close
;
// Returns the value of attribute v. In the sorted case a missing attribute
// yields def. What the unsorted scan returns on a miss is on a line not
// visible here (presumably def -- TODO confirm).
54 const char *get_attribute(const char *v
, const char *def
=NULL
) const
// Unsorted storage: walk every entry and compare keys with strcmp().
56 if (!m_sort_attributes
)
58 const int n
= attributes
.GetSize();
59 for (int x
= 0; x
< n
; x
++)
62 const char *val
= attributes
.Enumerate(x
,&key
);
63 if (key
&& !strcmp(key
,v
)) return val
;
// Sorted storage: keyed lookup, passing def as the fallback value.
66 return attributes
.Get((char*)v
,(char*)def
);
70 class wdl_xml_parser
{
// Sets up a parser over the in-memory buffer rdptr of rdptr_len bytes.
// sort_attributes is stored and later passed to every element the parser
// creates. Position tracking starts at line 1, column 0; no error is set and
// no elements exist yet.
72 wdl_xml_parser(const char *rdptr
, int rdptr_len
, bool sort_attributes
=true) :
73 element_xml(NULL
), element_root(NULL
),
74 m_rdptr((const unsigned char *)rdptr
), m_err(NULL
),
75 m_rdptr_len(rdptr_len
), m_line(1), m_col(0), m_lastchar(0),
76 m_last_line(1),m_last_col(0),
77 m_sort_attributes(sort_attributes
)
// Virtual destructor (the class is subclassed by wdl_xml_fileread).
// The only cleanup visible here empties the DOCTYPE token list, passing
// free() as the disposal function for the strdup()ed tokens.
80 virtual ~wdl_xml_parser()
84 element_doctype_tokens
.Empty(true,free
);
// Entry point: primes the one-character lookahead, allocates the initial
// 256-byte token buffer, then parses document-level content.
87 const char *parse() // call only once, returns NULL on success, error message on failure
// Prime the lookahead character used by every other routine.
89 m_lastchar
= nextchar();
91 if (!m_tok
.ResizeOK(256)) return "token buffer malloc fail";
// NULL element == document level.
93 const char *p
= parse_element_body(NULL
);
// An error recorded in m_err takes precedence over the returned message.
94 if (m_err
) return m_err
;
// Any further token after the top-level parse is an error.
96 if (get_tok()) return "document: extra characters following root element";
102 WDL_PtrList
<char> element_doctype_tokens
; // tokens after <!DOCTYPE
// element_xml: element created for the <?xml ... ?> declaration, if any.
// element_root: the top-level element created at document level.
103 wdl_xml_element
*element_xml
, *element_root
;
105 // get location after parse() returns error
106 int getLine() const { return m_last_line
; }
107 int getCol() const { return m_last_col
; }
// Read cursor into the current input buffer.
113 const unsigned char *m_rdptr
;
// m_rdptr_len: bytes remaining at m_rdptr. m_line/m_col: current read
// position. m_lastchar: one-character lookahead. m_last_line/m_last_col:
// position reported by getLine()/getCol().
115 int m_rdptr_len
, m_line
, m_col
, m_lastchar
, m_last_line
,m_last_col
;
// Forwarded to each wdl_xml_element this parser constructs.
116 bool m_sort_attributes
;
// Refill hook, called when the current buffer is exhausted. An override
// stores a new buffer pointer in *dataOut and returns its length; this
// default supplies nothing and returns 0, which ends the input.
118 virtual int moredata(const char **dataOut
) { return 0; }
// Body of the character fetch routine; its signature line is not visible
// here, but the rest of the class calls it as nextchar().
// When the buffer is empty, ask moredata() for another one; if that also
// yields nothing, report end of input as -1.
122 if (m_rdptr_len
< 1 && (m_rdptr_len
= moredata((const char **)&m_rdptr
)) < 1) return -1;
// Fetch one byte as an unsigned value and advance the cursor.
125 const int ret
= (int)*m_rdptr
++;
// A newline starts a new line and resets the column counter.
127 if (ret
== '\n') { m_line
++; m_col
=0; }
// Advances past consecutive characters whose char_type() is negative,
// counting them in rv. Callers use the result as a "was there any
// whitespace" flag; the return statement itself is not visible here.
133 int skip_whitespace()
135 int rv
=0, lc
= m_lastchar
;
136 while (char_type(lc
) < 0) { lc
= nextchar(); rv
++; }
// Classifies a character for the tokenizer. The value returned for each
// group is on lines not visible here; callers treat a negative result as
// whitespace and zero as an ordinary name character.
141 static int char_type(int c
)
// Whitespace group.
145 case ' ': case '\r': case '\n': case '\t':
// Punctuation group.
148 case '/': case '!': case '\\': case '\'': case '"': case '#': case '$':
149 case '%': case '(': case ')': case '*': case '+': case ',': case ';':
150 case '=': case '>': case '?': case '@': case '[': case ']': case '^':
151 case '`': case '{': case '|': case '}': case '~':
// Grows the token buffer and updates the caller's tok_sz. On failure m_err
// is set; get_tok() treats a null result as failure.
163 unsigned char *realloc_tok(int &tok_sz
)
// New size = old size + old size + a quarter of it (2.25x).
165 tok_sz
+= tok_sz
+ tok_sz
/ 4;
// Hard ceiling of 1<<29 bytes (512MB) guards against runaway input.
166 if (tok_sz
>= (1<<29))
168 m_err
="token buffer tried to malloc() more than 512MB, probably unsafe and invalid XML";
171 unsigned char *t
= (unsigned char *) m_tok
.ResizeOK(tok_sz
);
172 if (!t
) m_err
="token buffer malloc fail";
176 // gets a token, normally skipping whitespace, but if get_tok(true), then return NULL on whitespace
// The token is built in the shared m_tok buffer, so the returned pointer is
// only valid until the next call. Errors are reported through m_err.
177 const char *get_tok(bool no_skip_whitespace
=false)
179 if (!no_skip_whitespace
) skip_whitespace();
// Remember where this token starts, for error reporting.
181 m_last_line
= m_line
;
// wrpos: write index into the token buffer; lc: current character;
// tok_sz: current buffer capacity.
184 int wrpos
=0, lc
= m_lastchar
, tok_sz
= m_tok
.GetSize();
185 unsigned char *tok_buf
= (unsigned char *)m_tok
.Get();
// Dispatch on the class of the first character; -2 stands for end of input.
186 switch (lc
> 0 ? char_type(lc
) : -2)
// Append the character, growing the buffer once it is full.
191 tok_buf
[wrpos
++] = lc
;
192 if (WDL_unlikely(wrpos
>= tok_sz
) && WDL_unlikely(!(tok_buf
=realloc_tok(tok_sz
)))) return NULL
;
// Continue while input remains and the character's type has no bits other
// than 4 set (i.e. type 0 or type 4).
195 while (lc
> 0 && !(char_type(lc
)&~4));
// Quoted string: the opening quote is stored as part of the token.
201 if (lc
== '\'' || lc
== '\"')
204 tok_buf
[wrpos
++] = lc
;
// A raw '<' inside a quoted string is rejected; report its position.
210 m_last_line
=m_line
; m_last_col
=m_col
;
211 m_err
="illegal '<' character in quoted string";
// Entity inside the string: ensure 8 bytes of headroom first, matching the
// "never more than 8 bytes" contract noted on decode_entity().
219 if (WDL_unlikely(wrpos
+8 >= tok_sz
) && WDL_unlikely(!(tok_buf
=realloc_tok(tok_sz
)))) return NULL
;
// Save the entity's position before decoding consumes more input.
221 const int tmp
[2]={m_line
,m_col
};
222 if (!decode_entity((char*)tok_buf
+wrpos
))
224 m_last_line
=tmp
[0]; m_last_col
=tmp
[1];
225 m_err
="unknown entity in quoted string";
// Advance past the decoded bytes. This relies on the decoded sequence being
// NUL-terminated -- presumably guaranteed by WDL_MakeUTFChar; confirm.
229 while (tok_buf
[wrpos
]) wrpos
++;
// Ordinary character inside the string.
234 tok_buf
[wrpos
++] = lc
;
235 if (WDL_unlikely(wrpos
>= tok_sz
) && WDL_unlikely(!(tok_buf
=realloc_tok(tok_sz
)))) return NULL
;
// Stop once the character just stored was the closing quote. The
// declarations of llc and endc are not visible here.
238 if (llc
== endc
) break;
// Store the current character, growing the buffer if it is full.
244 tok_buf
[wrpos
++] = lc
;
245 if (WDL_unlikely(wrpos
>= tok_sz
) && WDL_unlikely(!(tok_buf
=realloc_tok(tok_sz
)))) return NULL
;
251 m_err
="unexpected whitespace";
254 m_err
="unexpected end of file";
259 return (char *)tok_buf
;
// Decodes one entity reference. The entity name is read from the input up to
// the terminating ';', and the decoded character is written to wr as UTF-8.
// Recognised names: lt, gt, amp, apos, quot, decimal references (#123) and
// hexadecimal references (#x7B). Returns false for anything else, including
// a reference that decodes to 0.
262 bool decode_entity(char *wr
) // will never write more than 8 bytes
// Collect at most 31 name characters; stop at ';' or end of input.
266 while (i
< 31 && (m_lastchar
= nextchar()) > 0 && m_lastchar
!= ';')
// Only ordinary name characters are accepted, plus '#' for numeric references.
268 if (char_type(m_lastchar
) && m_lastchar
!= '#') break;
269 tmp
[i
++] = m_lastchar
;
// Only a properly terminated reference is looked up.
272 if (m_lastchar
== ';')
275 if (!strcmp(tmp
,"lt")) byteval
= '<';
276 else if (!strcmp(tmp
,"gt")) byteval
= '>';
277 else if (!strcmp(tmp
,"amp")) byteval
= '&';
278 else if (!strcmp(tmp
,"apos")) byteval
= '\'';
279 else if (!strcmp(tmp
,"quot")) byteval
= '"';
280 else if (tmp
[0] == '#')
// Decimal reference: digits start right after the '#'.
282 if (tmp
[1] >= '0' && tmp
[1] <= '9') byteval
= atoi(tmp
+1);
// Hex reference: skip both '#' and 'x'. strtol() in base 16 only accepts an
// optional "0x" prefix, so handing it "x41" converts nothing and yields 0,
// which made every &#x...; reference fail as an unknown entity.
283 if (tmp
[1] == 'x') byteval
= strtol(tmp
+2,NULL
,16);
// 0 means "not recognised" (or a NUL reference, which is also rejected).
286 if (!byteval
) return false;
287 WDL_MakeUTFChar((char*)wr
,byteval
,8);
// Step past the terminating ';'.
288 m_lastchar
= nextchar();
// Scans tokens for terminator a immediately followed by b, with no
// whitespace in between; when b is NULL, a alone is enough. tok is the first
// token to examine, or NULL to fetch one. Returns true when the terminator
// is found; the failure return is on a line not visible here.
292 bool skip_until(const char *tok
, const char *a
, const char *b
)
295 if (!tok
) tok
= get_tok();
// state is true when the previous token matched a. strcmp(b, ...) is only
// reached with state set, and the b == NULL case returns before that can
// happen on a later token.
298 if (state
&& !strcmp(b
,tok
)) return true;
299 state
= !strcmp(a
,tok
);
300 if (state
&& !b
) return true;
// Whitespace between a and b breaks the pair.
302 if (skip_whitespace()) state
=false;
// Parses name="value" pairs into elem->attributes. Returns the first
// punctuation token that is not part of an attribute (the caller checks it
// for '>', '/' or '?'). Errors are reported by setting m_err and leaving the
// loop; the final return is on a line not visible here.
309 const char *parse_element_attributes(wdl_xml_element
*elem
)
311 char *attr_name
=NULL
;
314 const char *tok
= get_tok();
317 if (*tok
== '-' || *tok
== '.' || (*tok
>= '0' && *tok
<= '9')) { m_err
="attribute must not begin with .- or number"; break; }
// Any punctuation token ends the attribute list and is handed back.
319 if (char_type(*tok
)) return tok
;
// NOTE(review): this declaration shadows the function-level attr_name
// declared above. The error paths below `break` out of this scope without
// freeing the copy, so it appears to leak; the outer variable looks intended
// to carry it to a cleanup after the loop. Confirm against the full source
// before changing it.
321 char *attr_name
= strdup(tok
);
322 if (!attr_name
) { m_err
="malloc fail"; break; }
// Duplicate detection is only done in sorted mode.
324 if (m_sort_attributes
&&
325 elem
->attributes
.Get(attr_name
))
327 m_err
="attribute specified more than once";
333 if (*tok
!= '=') { m_err
="attribute name must be followed by '='"; break; }
337 if (*tok
!= '\'' && *tok
!= '"') { m_err
="attribute value must be quoted string"; break; }
// The token still carries its quotes; require a matching closing quote.
339 const size_t tok_len
= strlen(tok
);
340 if (tok_len
< 2 || tok
[tok_len
-1] != tok
[0]) { m_err
="attribute value missing trailing quote"; break; }
// Room for the text between the quotes plus a terminator.
342 char *value
= (char *)malloc(tok_len
-2+1);
343 if (!value
) { m_err
="malloc fail"; break; }
345 memcpy(value
,tok
+1,tok_len
-2);
// Store the pair using the insertion mode that matches the lookup mode.
348 if (m_sort_attributes
)
349 elem
->attributes
.Insert(attr_name
,value
);
351 elem
->attributes
.AddUnsorted(attr_name
,value
);
// Parses the content of elem: character data, comments, CDATA sections,
// processing instructions, child elements and the closing tag. Called with
// elem == NULL for document-level content, where <?xml and <!DOCTYPE are
// also accepted. Recurses for each child element.
359 const char *parse_element_body(wdl_xml_element
*elem
) // return NULL on success, error message on failure
// Text is only accumulated once a non-whitespace character has been seen,
// unless the element already holds text from before a child element.
366 bool want_add
= elem
->value
.GetLength() > 0;
// Character data runs until the next '<' or end of input.
367 while (m_lastchar
!= '<' && m_lastchar
> 0)
// char_type() < 0 is whitespace; the first other character enables adding.
369 if (!want_add
&& char_type(m_lastchar
)>=0) want_add
=true;
// Entity reference in text: remember its position for error reporting,
// then decode it and append the result.
372 if (m_lastchar
== '&')
374 m_last_line
=m_line
; m_last_col
=m_col
;
376 if (!decode_entity(buf
)) return "unknown entity in element body";
377 elem
->value
.Append(buf
);
// Ordinary character: appended as a single byte.
382 unsigned char c
= (unsigned char)m_lastchar
;
383 elem
->value
.Append((const char *)&c
,1);
// adv is declared on a line not visible here; when set, move to the next
// input character.
386 if (adv
) m_lastchar
= nextchar();
// Inside an element whitespace is significant, so it is not skipped.
390 const char *tok
= get_tok(elem
!= NULL
);
// Position of the construct being parsed, used for "unterminated" errors.
391 const int start_line
= m_last_line
, start_col
= m_last_col
;
// No token: an error inside an element, normal completion at document level.
392 if (!tok
) return elem
? "unterminated block" : NULL
;
393 if (*tok
!= '<') return "expected < tag";
396 if (!tok
) return "expected token after <";
401 if (!tok
) return "expected token following <!";
// Comment: after the second '-', skip to the next "--" pair and require
// that '>' follows it.
406 if (!tok
) return "expected token following <!-";
407 if (*tok
!= '-') return "unknown token following <!-";
408 if (!skip_until(NULL
,"-","-"))
410 m_last_line
=start_line
;
411 m_last_col
=start_col
;
412 return m_err
= "unterminated comment";
415 if (!tok
|| tok
[0] != '>') return "-- not allowed in comment";
// CDATA section: only valid inside an element.
417 else if (*tok
== '[')
419 if (!elem
) return "<![ not allowed at document level";
421 if (!tok
|| strcmp(tok
,"CDATA")) return "unknown token beginning <![";
423 if (!tok
|| tok
[0] != '[') return "unknown token beginning <![CDATA but without trailing [";
425 // add content literally until ]]>
// last1/last2 track the two preceding characters so "]]>" can be recognised.
426 int lc
=m_lastchar
, last1
=0,last2
=0;
429 if (lc
== '>' && last1
== ']' && last2
== ']') break;
431 unsigned char c
= (unsigned char)lc
;
432 elem
->value
.Append((const char *)&c
,1);
440 m_last_line
=start_line
;
441 m_last_col
=start_col
;
442 return m_err
= "unterminated <![CDATA[";
445 elem
->value
.SetLen(elem
->value
.GetLength()-2); // remove ]]
// Step past the closing '>'.
446 m_lastchar
= nextchar();
// DOCTYPE: document level only, at most once. Tokens up to '>' are collected.
449 else if (!strcmp(tok
,"DOCTYPE"))
451 if (elem
) return "<!DOCTYPE must be at top level";
452 if (element_doctype_tokens
.GetSize()) return "<!DOCTYPE already specified";
455 if (!tok
|| char_type(*tok
)) return "expected document type token following <!DOCTYPE";
// Each token is stored as its own heap copy; the destructor frees them.
458 element_doctype_tokens
.Add(strdup(tok
));
462 m_last_line
=start_line
;
463 m_last_col
=start_col
;
464 return m_err
= "unterminated <!DOCTYPE";
466 } while (tok
[0] != '>');
468 else return "unknown token following <!";
// Processing instruction.
470 else if (tok
[0] == '?')
473 if (!tok
) return "expected token following <?";
// <?xml ... ?> is only accepted at document level, before anything else,
// and only once. cnt is declared on a line not visible here.
475 if (!strcmp(tok
,"xml"))
477 if (elem
|| cnt
|| element_xml
) return "<?xml must begin document";
479 element_xml
= new wdl_xml_element("xml",start_line
,start_col
,m_sort_attributes
);
480 tok
= parse_element_attributes(element_xml
);
// The attribute list must be followed by '?' and then '>' with no
// whitespace between them.
481 if (!tok
|| tok
[0] != '?' || !(tok
=get_tok(true)) || tok
[0] != '>')
482 return "<?xml not terminated";
// Any other <?...?> block is skipped up to the closing "?>".
486 if (!skip_until(tok
, "?",">"))
488 m_last_line
=start_line
;
489 m_last_col
=start_col
;
490 return m_err
= "unterminated <? block";
// Closing tag: must match the element currently being parsed.
494 else if (tok
[0] == '/')
496 if (!elem
) return "unexpected </ at root level";
// NOTE(review): the token fetch before this comparison is on a line not
// visible here and no null check is visible either; confirm tok cannot be
// NULL when it reaches strcmp().
499 if (strcmp(tok
,elem
->name
))
501 return "mismatched </ tag name";
505 if (!tok
|| tok
[0] != '>') return "expected > following </tag";
507 elem
->m_has_discrete_close
= true;
// Opening tag of a child element (or of the top-level element at document
// level).
512 if (*tok
== '-' || *tok
== '.' || (*tok
>= '0' && *tok
<= '9'))
513 return "element name must not begin with .- or number";
515 wdl_xml_element
*sub
= new wdl_xml_element(tok
,start_line
,start_col
,m_sort_attributes
);
// The new element is attached to its parent, or recorded as the document
// root, before its attributes are parsed.
516 if (elem
) elem
->elements
.Add(sub
);
517 else element_root
= sub
;
519 tok
= parse_element_attributes(sub
);
520 if (!tok
) return "unterminated element";
// After '/' the next token must be '>'.
525 if (!tok
|| *tok
!= '>') return "expected > following / to end element";
// '>' opens the element body: recurse into it.
527 else if (*tok
== '>')
529 const char *ret
= parse_element_body(sub
);
534 return "unknown token in element";
536 if (!elem
) return NULL
; // finish after parsing a top level block
// Parser that pulls its input from a FILE* in chunks. The constructor checks
// for a byte-order mark and selects the charset; UTF-16 input is converted
// to UTF-8 as it is read. The destructor closes the file.
543 class wdl_xml_fileread
: public wdl_xml_parser
{
// Refill hook called by the base class when its buffer runs out. Returns the
// number of bytes made available in m_buf.
548 virtual int moredata(const char **dataptr
)
// m_charset: 0 = UTF-8, 1 = UTF-16 big-endian, 2 = UTF-16 little-endian
// (as assigned in the constructor).
551 const int cs
= m_charset
;
552 if (m_fp
) switch (cs
)
// UTF-8: read straight into the output buffer.
555 return (int) fread(m_buf
,1,sizeof(m_buf
),m_fp
);
// UTF-16: read a small chunk and re-encode each 16-bit unit as UTF-8.
559 unsigned char tmp
[128];
560 const int l
= (int) fread(tmp
,1,sizeof(tmp
),m_fp
);
// Assemble one 16-bit unit in the byte order selected by cs.
// NOTE(review): units are converted one at a time; no surrogate-pair
// handling is visible here -- confirm whether characters above U+FFFF
// matter for callers.
564 const int amt
=wdl_utf8_makechar(cs
==1 ? ((tmp
[rd
]<<8)|tmp
[rd
+1]) : (tmp
[rd
]|(tmp
[rd
+1]<<8)),
566 (int)sizeof(m_buf
)-wpos
);
// Only advance the write position when the conversion produced output.
567 if (amt
>0) wpos
+= amt
;
// Starts the base parser with no in-memory data, so the first character is
// requested through moredata().
577 wdl_xml_fileread(FILE *fp
) : wdl_xml_parser(NULL
,0)
580 m_charset
=0; // default to utf-8
// Byte-order-mark detection on the first two (or three) bytes.
583 unsigned char bom
[2];
584 if (fread(bom
,1,2,fp
)==2)
// EF BB BF = UTF-8 BOM; the third byte is read only when the first two match.
586 if (bom
[0] == 0xEF && bom
[1] == 0xBB && fgetc(fp
) == 0xBF) m_charset
=0;
587 else if (bom
[0] == 0xFE && bom
[1] == 0xFF) m_charset
=1; // utf-16 BE
588 else if (bom
[0] == 0xFF && bom
[1] == 0xFE) m_charset
=2; // utf-16 LE
589 else fseek(fp
,0,SEEK_SET
); // rewind
// Closes the file handle held in m_fp.
593 virtual ~wdl_xml_fileread() { if (m_fp
) fclose(m_fp
); }