// Merge pull request #110 from tesselode/fixes
// [wdl/wdl-ol.git] / WDL / xmlparse.h
// blob 7a22c5e44f5c86c9781fd4c50e7b145ef8b66d82
/*
    WDL - xmlparse.h
    Copyright (C) 2016 and later, Cockos Incorporated

    This software is provided 'as-is', without any express or implied
    warranty.  In no event will the authors be held liable for any damages
    arising from the use of this software.

    Permission is granted to anyone to use this software for any purpose,
    including commercial applications, and to alter it and redistribute it
    freely, subject to the following restrictions:

    1. The origin of this software must not be misrepresented; you must not
       claim that you wrote the original software. If you use this software
       in a product, an acknowledgment in the product documentation would be
       appreciated but is not required.
    2. Altered source versions must be plainly marked as such, and must not be
       misrepresented as being the original software.
    3. This notice may not be removed or altered from any source distribution.

    very very very lightweight XML parser

    reads: <?xml, <!DOCTYPE, <![CDATA[, &lt;&gt;&amp;&quot;&apos;&#xABC;&#123; ignores unknown <?tag blocks?>
    always uses 8-bit characters, uses UTF-8 encoding for &#xyz
    relatively strict. for overflow safety, enforces a token length limit of 512MB

*/
29 #ifndef _WDL_XML_PARSE_H_
30 #define _WDL_XML_PARSE_H_
31 #include "ptrlist.h"
32 #include "assocarray.h"
33 #include "wdlstring.h"
34 #include "wdlutf8.h"
36 class wdl_xml_element {
37 static int attr_cmp(char **a, char **b) { return strcmp(*a,*b); }
38 static void attr_free(char *a) { free(a); }
39 public:
40 wdl_xml_element(const char *_name, int _line, int _col, bool _sort_attr=true) :
41 attributes(attr_cmp,NULL,attr_free,attr_free), name(strdup(_name)), line(_line), col(_col),
42 m_sort_attributes(_sort_attr), m_has_discrete_close(false) { }
43 ~wdl_xml_element() { free(name); elements.Empty(true); }
45 WDL_PtrList<wdl_xml_element> elements;
46 WDL_AssocArray<char *, char *> attributes;
47 WDL_FastString value; // value excluding any leading whitespace and excluding any elements
49 char *name;
50 int line, col;
51 bool m_sort_attributes;
52 bool m_has_discrete_close;
54 const char *get_attribute(const char *v, const char *def=NULL) const
56 if (!m_sort_attributes)
58 const int n = attributes.GetSize();
59 for (int x = 0; x < n; x ++)
61 char *key = NULL;
62 const char *val = attributes.Enumerate(x,&key);
63 if (key && !strcmp(key,v)) return val;
66 return attributes.Get((char*)v,(char*)def);
70 class wdl_xml_parser {
71 public:
72 wdl_xml_parser(const char *rdptr, int rdptr_len, bool sort_attributes=true) :
73 element_xml(NULL), element_root(NULL),
74 m_rdptr((const unsigned char *)rdptr), m_err(NULL),
75 m_rdptr_len(rdptr_len), m_line(1), m_col(0), m_lastchar(0),
76 m_last_line(1),m_last_col(0),
77 m_sort_attributes(sort_attributes)
80 virtual ~wdl_xml_parser()
82 delete element_xml;
83 delete element_root;
84 element_doctype_tokens.Empty(true,free);
87 const char *parse() // call only once, returns NULL on success, error message on failure
89 m_lastchar = nextchar();
91 if (!m_tok.ResizeOK(256)) return "token buffer malloc fail";
93 const char *p = parse_element_body(NULL);
94 if (m_err) return m_err;
95 if (p) return p;
96 if (get_tok()) return "document: extra characters following root element";
98 return NULL;
101 // output
102 WDL_PtrList<char> element_doctype_tokens; // tokens after <!DOCTYPE
103 wdl_xml_element *element_xml, *element_root;
105 // get location after parse() returns error
106 int getLine() const { return m_last_line; }
107 int getCol() const { return m_last_col; }
110 private:
112 WDL_HeapBuf m_tok;
113 const unsigned char *m_rdptr;
114 const char *m_err;
115 int m_rdptr_len, m_line, m_col, m_lastchar, m_last_line,m_last_col;
116 bool m_sort_attributes;
118 virtual int moredata(const char **dataOut) { return 0; }
120 int nextchar()
122 if (m_rdptr_len < 1 && (m_rdptr_len = moredata((const char **)&m_rdptr)) < 1) return -1;
124 m_rdptr_len--;
125 const int ret = (int)*m_rdptr++;
127 if (ret == '\n') { m_line++; m_col=0; }
128 else m_col++;
130 return ret;
133 int skip_whitespace()
135 int rv=0, lc = m_lastchar;
136 while (char_type(lc) < 0) { lc = nextchar(); rv++; }
137 m_lastchar = lc;
138 return rv;
141 static int char_type(int c)
143 switch (c)
145 case ' ': case '\r': case '\n': case '\t':
146 return -1;
148 case '/': case '!': case '\\': case '\'': case '"': case '#': case '$':
149 case '%': case '(': case ')': case '*': case '+': case ',': case ';':
150 case '=': case '>': case '?': case '@': case '[': case ']': case '^':
151 case '`': case '{': case '|': case '}': case '~':
152 return 1;
154 case '<': case '&':
155 return 2;
157 case '-': case '.':
158 return 4;
160 return 0;
163 unsigned char *realloc_tok(int &tok_sz)
165 tok_sz += tok_sz + tok_sz / 4;
166 if (tok_sz >= (1<<29))
168 m_err="token buffer tried to malloc() more than 512MB, probably unsafe and invalid XML";
169 return NULL;
171 unsigned char *t = (unsigned char *) m_tok.ResizeOK(tok_sz);
172 if (!t) m_err="token buffer malloc fail";
173 return t;
176 // gets a token, normally skipping whitespace, but if get_tok(true), then return NULL on whitespace
177 const char *get_tok(bool no_skip_whitespace=false)
179 if (!no_skip_whitespace) skip_whitespace();
181 m_last_line = m_line;
182 m_last_col = m_col;
184 int wrpos=0, lc = m_lastchar, tok_sz = m_tok.GetSize();
185 unsigned char *tok_buf = (unsigned char *)m_tok.Get();
186 switch (lc > 0 ? char_type(lc) : -2)
188 case 0:
191 tok_buf[wrpos++] = lc;
192 if (WDL_unlikely(wrpos >= tok_sz) && WDL_unlikely(!(tok_buf=realloc_tok(tok_sz)))) return NULL;
193 lc = nextchar();
195 while (lc > 0 && !(char_type(lc)&~4));
196 break;
198 case 1:
199 case 2:
200 case 4:
201 if (lc == '\'' || lc == '\"')
203 const int endc = lc;
204 tok_buf[wrpos++] = lc;
205 lc = nextchar();
206 while (lc > 0)
208 if (lc == '<')
210 m_last_line=m_line; m_last_col=m_col;
211 m_err="illegal '<' character in quoted string";
212 m_lastchar = lc;
213 return NULL;
216 if (lc == '&')
218 m_lastchar = lc;
219 if (WDL_unlikely(wrpos+8 >= tok_sz) && WDL_unlikely(!(tok_buf=realloc_tok(tok_sz)))) return NULL;
221 const int tmp[2]={m_line,m_col};
222 if (!decode_entity((char*)tok_buf+wrpos))
224 m_last_line=tmp[0]; m_last_col=tmp[1];
225 m_err="unknown entity in quoted string";
226 return NULL;
228 lc = m_lastchar;
229 while (tok_buf[wrpos]) wrpos++;
231 else
233 const int llc = lc;
234 tok_buf[wrpos++] = lc;
235 if (WDL_unlikely(wrpos >= tok_sz) && WDL_unlikely(!(tok_buf=realloc_tok(tok_sz)))) return NULL;
236 lc = nextchar();
238 if (llc == endc) break;
242 else
244 tok_buf[wrpos++] = lc;
245 if (WDL_unlikely(wrpos >= tok_sz) && WDL_unlikely(!(tok_buf=realloc_tok(tok_sz)))) return NULL;
247 lc = nextchar();
249 break;
250 case -1:
251 m_err="unexpected whitespace";
252 return NULL;
253 default:
254 m_err="unexpected end of file";
255 return NULL;
257 tok_buf[wrpos]=0;
258 m_lastchar = lc;
259 return (char *)tok_buf;
262 bool decode_entity(char *wr) // will never write more than 8 bytes
264 char tmp[32];
265 int i=0;
266 while (i < 31 && (m_lastchar = nextchar()) > 0 && m_lastchar != ';')
268 if (char_type(m_lastchar) && m_lastchar != '#') break;
269 tmp[i++] = m_lastchar;
271 int byteval = 0;
272 if (m_lastchar == ';')
274 tmp[i]=0;
275 if (!strcmp(tmp,"lt")) byteval = '<';
276 else if (!strcmp(tmp,"gt")) byteval = '>';
277 else if (!strcmp(tmp,"amp")) byteval = '&';
278 else if (!strcmp(tmp,"apos")) byteval = '\'';
279 else if (!strcmp(tmp,"quot")) byteval = '"';
280 else if (tmp[0] == '#')
282 if (tmp[1] >= '0' && tmp[1] <= '9') byteval = atoi(tmp+1);
283 if (tmp[1] == 'x') byteval = strtol(tmp+1,NULL,16);
286 if (!byteval) return false;
287 WDL_MakeUTFChar((char*)wr,byteval,8);
288 m_lastchar = nextchar();
289 return true;
292 bool skip_until(const char *tok, const char *a, const char *b)
294 bool state=false;
295 if (!tok) tok = get_tok();
296 while (tok)
298 if (state && !strcmp(b,tok)) return true;
299 state = !strcmp(a,tok);
300 if (state && !b) return true;
302 if (skip_whitespace()) state=false;
303 tok = get_tok(true);
306 return false;
309 const char *parse_element_attributes(wdl_xml_element *elem)
311 char *attr_name=NULL;
312 for (;;)
314 const char *tok = get_tok();
315 if (!tok) break;
317 if (*tok == '-' || *tok == '.' || (*tok >= '0' && *tok <= '9')) { m_err="attribute must not begin with .- or number"; break; }
319 if (char_type(*tok)) return tok;
321 char *attr_name = strdup(tok);
322 if (!attr_name) { m_err="malloc fail"; break; }
324 if (m_sort_attributes &&
325 elem->attributes.Get(attr_name))
327 m_err="attribute specified more than once";
328 break;
331 tok = get_tok();
332 if (!tok) break;
333 if (*tok != '=') { m_err="attribute name must be followed by '='"; break; }
335 tok = get_tok();
336 if (!tok) break;
337 if (*tok != '\'' && *tok != '"') { m_err="attribute value must be quoted string"; break; }
339 const size_t tok_len = strlen(tok);
340 if (tok_len < 2 || tok[tok_len-1] != tok[0]) { m_err="attribute value missing trailing quote"; break; }
342 char *value = (char *)malloc(tok_len-2+1);
343 if (!value) { m_err="malloc fail"; break; }
345 memcpy(value,tok+1,tok_len-2);
346 value[tok_len-2]=0;
348 if (m_sort_attributes)
349 elem->attributes.Insert(attr_name,value);
350 else
351 elem->attributes.AddUnsorted(attr_name,value);
353 attr_name = NULL;
355 free(attr_name);
356 return NULL;
359 const char *parse_element_body(wdl_xml_element *elem) // return NULL on success, error message on failure
361 int cnt=0;
362 for (;;)
364 if (elem)
366 bool want_add = elem->value.GetLength() > 0;
367 while (m_lastchar != '<' && m_lastchar > 0)
369 if (!want_add && char_type(m_lastchar)>=0) want_add=true;
371 bool adv=true;
372 if (m_lastchar == '&')
374 m_last_line=m_line; m_last_col=m_col;
375 char buf[8];
376 if (!decode_entity(buf)) return "unknown entity in element body";
377 elem->value.Append(buf);
378 adv=false;
380 else if (want_add)
382 unsigned char c = (unsigned char)m_lastchar;
383 elem->value.Append((const char *)&c,1);
386 if (adv) m_lastchar = nextchar();
390 const char *tok = get_tok(elem != NULL);
391 const int start_line = m_last_line, start_col = m_last_col;
392 if (!tok) return elem ? "unterminated block" : NULL;
393 if (*tok != '<') return "expected < tag";
395 tok = get_tok(true);
396 if (!tok) return "expected token after <";
398 if (tok[0] == '!')
400 tok = get_tok(true);
401 if (!tok) return "expected token following <!";
403 if (*tok == '-')
405 tok = get_tok(true);
406 if (!tok) return "expected token following <!-";
407 if (*tok != '-') return "unknown token following <!-";
408 if (!skip_until(NULL,"-","-"))
410 m_last_line=start_line;
411 m_last_col=start_col;
412 return m_err = "unterminated comment";
414 tok = get_tok(true);
415 if (!tok || tok[0] != '>') return "-- not allowed in comment";
417 else if (*tok == '[')
419 if (!elem) return "<![ not allowed at document level";
420 tok = get_tok(true);
421 if (!tok || strcmp(tok,"CDATA")) return "unknown token beginning <![";
422 tok=get_tok(true);
423 if (!tok || tok[0] != '[') return "unknown token beginning <![CDATA but without trailing [";
425 // add content literally until ]]>
426 int lc=m_lastchar, last1=0,last2=0;
427 for (;;)
429 if (lc == '>' && last1 == ']' && last2 == ']') break;
431 unsigned char c = (unsigned char)lc;
432 elem->value.Append((const char *)&c,1);
433 last2 = last1;
434 last1 = lc;
436 lc = nextchar();
437 if (lc <= 0)
439 m_lastchar = -1;
440 m_last_line=start_line;
441 m_last_col=start_col;
442 return m_err = "unterminated <![CDATA[";
445 elem->value.SetLen(elem->value.GetLength()-2); // remove ]]
446 m_lastchar = nextchar();
449 else if (!strcmp(tok,"DOCTYPE"))
451 if (elem) return "<!DOCTYPE must be at top level";
452 if (element_doctype_tokens.GetSize()) return "<!DOCTYPE already specified";
454 tok = get_tok();
455 if (!tok || char_type(*tok)) return "expected document type token following <!DOCTYPE";
458 element_doctype_tokens.Add(strdup(tok));
459 tok = get_tok();
460 if (!tok)
462 m_last_line=start_line;
463 m_last_col=start_col;
464 return m_err = "unterminated <!DOCTYPE";
466 } while (tok[0] != '>');
468 else return "unknown token following <!";
470 else if (tok[0] == '?')
472 tok = get_tok(true);
473 if (!tok) return "expected token following <?";
475 if (!strcmp(tok,"xml"))
477 if (elem || cnt || element_xml) return "<?xml must begin document";
479 element_xml = new wdl_xml_element("xml",start_line,start_col,m_sort_attributes);
480 tok = parse_element_attributes(element_xml);
481 if (!tok || tok[0] != '?' || !(tok=get_tok(true)) || tok[0] != '>')
482 return "<?xml not terminated";
484 else
486 if (!skip_until(tok, "?",">"))
488 m_last_line=start_line;
489 m_last_col=start_col;
490 return m_err = "unterminated <? block";
494 else if (tok[0] == '/')
496 if (!elem) return "unexpected </ at root level";
498 tok = get_tok(true);
499 if (strcmp(tok,elem->name))
501 return "mismatched </ tag name";
504 tok = get_tok();
505 if (!tok || tok[0] != '>') return "expected > following </tag";
506 // done!
507 elem->m_has_discrete_close = true;
508 return NULL;
510 else
512 if (*tok == '-' || *tok == '.' || (*tok >= '0' && *tok <= '9'))
513 return "element name must not begin with .- or number";
515 wdl_xml_element *sub = new wdl_xml_element(tok,start_line,start_col,m_sort_attributes);
516 if (elem) elem->elements.Add(sub);
517 else element_root = sub;
519 tok = parse_element_attributes(sub);
520 if (!tok) return "unterminated element";
522 if (*tok == '/')
524 tok = get_tok(true);
525 if (!tok || *tok != '>') return "expected > following / to end element";
527 else if (*tok == '>')
529 const char *ret = parse_element_body(sub);
530 if (ret) return ret;
532 else
534 return "unknown token in element";
536 if (!elem) return NULL; // finish after parsing a top level block
538 cnt++;
543 class wdl_xml_fileread : public wdl_xml_parser {
544 FILE *m_fp;
545 char m_buf[1024];
546 int m_charset;
548 virtual int moredata(const char **dataptr)
550 *dataptr = m_buf;
551 const int cs = m_charset;
552 if (m_fp) switch (cs)
554 case 0:
555 return (int) fread(m_buf,1,sizeof(m_buf),m_fp);
556 case 1:
557 case 2:
559 unsigned char tmp[128];
560 const int l = (int) fread(tmp,1,sizeof(tmp),m_fp);
561 int rd=0, wpos=0;
562 while (rd+1 < l)
564 const int amt=wdl_utf8_makechar(cs==1 ? ((tmp[rd]<<8)|tmp[rd+1]) : (tmp[rd]|(tmp[rd+1]<<8)),
565 m_buf+wpos,
566 (int)sizeof(m_buf)-wpos);
567 if (amt>0) wpos += amt;
569 rd+=2;
571 return wpos;
574 return 0;
576 public:
577 wdl_xml_fileread(FILE *fp) : wdl_xml_parser(NULL,0)
579 m_fp=fp;
580 m_charset=0; // default to utf-8
581 if (fp)
583 unsigned char bom[2];
584 if (fread(bom,1,2,fp)==2)
586 if (bom[0] == 0xEF && bom[1] == 0xBB && fgetc(fp) == 0xBF) m_charset=0;
587 else if (bom[0] == 0xFE && bom[1] == 0xFF) m_charset=1; // utf-16 BE
588 else if (bom[0] == 0xFF && bom[1] == 0xFE) m_charset=2; // utf-16 LE
589 else fseek(fp,0,SEEK_SET); // rewind
593 virtual ~wdl_xml_fileread() { if (m_fp) fclose(m_fp); }
596 #endif