langpackedit: sorting fixes, 0.015 -- from 8f06f769
[wdl.git] / WDL / xmlparse.h
bloba7964da6d60f1085833ed293b2295da6811893af
1 /*
2 WDL - xmlparse.h
3 Copyright (C) 2016 and later, Cockos Incorporated
5 This software is provided 'as-is', without any express or implied
6 warranty. In no event will the authors be held liable for any damages
7 arising from the use of this software.
9 Permission is granted to anyone to use this software for any purpose,
10 including commercial applications, and to alter it and redistribute it
11 freely, subject to the following restrictions:
13 1. The origin of this software must not be misrepresented; you must not
14 claim that you wrote the original software. If you use this software
15 in a product, an acknowledgment in the product documentation would be
16 appreciated but is not required.
17 2. Altered source versions must be plainly marked as such, and must not be
18 misrepresented as being the original software.
19 3. This notice may not be removed or altered from any source distribution.
21 very very very lightweight XML parser
23 reads: <?xml, <!DOCTYPE, <![CDATA[, &lt;&gt;&amp;&quot;&apos;&#xABC;&#123; top level <? blocks, ignores unknown <?tag blocks?> inside elements
24 always uses 8-bit characters, uses UTF-8 encoding for &#xyz
25 relatively strict. for overflow safety, enforces a token length limit of 512MB
29 #ifndef _WDL_XML_PARSE_H_
30 #define _WDL_XML_PARSE_H_
31 #include "ptrlist.h"
32 #include "assocarray.h"
33 #include "wdlstring.h"
34 #include "wdlutf8.h"
35 #include "wdlcstring.h"
37 class wdl_xml_element {
38 static void attr_free(char *a) { free(a); }
39 public:
40 wdl_xml_element(const char *_name, int _line, int _col, bool _sort_attr=true) :
41 attributes(WDL_assocarray_cmpstr<char>,NULL,attr_free,attr_free), name(strdup(_name)), line(_line), col(_col),
42 m_sort_attributes(_sort_attr), m_has_discrete_close(false) { }
43 ~wdl_xml_element() { free(name); elements.Empty(true); }
45 WDL_PtrList<wdl_xml_element> elements;
46 WDL_AssocArray<char *, char *> attributes;
47 WDL_FastString value; // value excluding any leading whitespace and excluding any elements
49 char *name;
50 int line, col;
51 bool m_sort_attributes;
52 bool m_has_discrete_close;
54 const char *get_attribute(const char *v, const char *def=NULL) const
56 if (!m_sort_attributes)
58 const int n = attributes.GetSize();
59 for (int x = 0; x < n; x ++)
61 char *key = NULL;
62 const char *val = attributes.Enumerate(x,&key);
63 if (key && !strcmp(key,v)) return val;
66 return attributes.Get((char*)v,(char*)def);
70 class wdl_xml_parser {
71 public:
72 wdl_xml_parser(const char *rdptr, int rdptr_len, bool sort_attributes=true) :
73 element_xml(NULL), element_root(NULL),
74 m_rdptr((const unsigned char *)rdptr), m_err(NULL),
75 m_rdptr_len(rdptr_len), m_line(1), m_col(0), m_lastchar(0),
76 m_last_line(1),m_last_col(0),
77 m_sort_attributes(sort_attributes)
80 virtual ~wdl_xml_parser()
82 delete element_xml;
83 delete element_root;
84 element_doctype_tokens.Empty(true,free);
85 element_root_meta.Empty(true);
88 const char *parse() // call only once, returns NULL on success, error message on failure
90 m_lastchar = nextchar();
92 if (!m_tok.ResizeOK(256)) return "token buffer malloc fail";
94 const char *p = parse_element_body(NULL);
95 if (!m_err) return p;
96 if (!*m_err) m_err="unexpected end of file";
97 if (!p) return m_err;
98 snprintf(m_errbuf,sizeof(m_errbuf),"%s: %s",p,m_err);
99 return m_errbuf;
102 // output
103 WDL_PtrList<char> element_doctype_tokens; // tokens after <!DOCTYPE
104 wdl_xml_element *element_xml, *element_root;
106 WDL_PtrList<wdl_xml_element> element_root_meta; // any topen level <? elements?> other than <?xml which goes into element_xml
108 // get location after parse() returns error
109 int getLine() const { return m_last_line; }
110 int getCol() const { return m_last_col; }
113 private:
115 WDL_HeapBuf m_tok;
116 const unsigned char *m_rdptr;
117 const char *m_err; // NULL if no error, "" if EOF
118 char m_errbuf[128];
119 int m_rdptr_len, m_line, m_col, m_lastchar, m_last_line,m_last_col;
120 bool m_sort_attributes;
122 virtual int moredata(const char **dataOut) { return 0; }
124 int nextchar()
126 if (m_rdptr_len < 1 && (m_rdptr_len = moredata((const char **)&m_rdptr)) < 1) return -1;
128 m_rdptr_len--;
129 const int ret = (int)*m_rdptr++;
131 if (ret == '\n') { m_line++; m_col=0; }
132 else m_col++;
134 return ret;
137 int skip_whitespace()
139 int rv=0, lc = m_lastchar;
140 while (char_type(lc) < 0) { lc = nextchar(); rv++; }
141 m_lastchar = lc;
142 return rv;
145 static int char_type(int c)
147 switch (c)
149 case ' ': case '\r': case '\n': case '\t':
150 return -1;
152 case '/': case '!': case '\\': case '\'': case '"': case '#': case '$':
153 case '%': case '(': case ')': case '*': case '+': case ',': case ';':
154 case '=': case '>': case '?': case '@': case '[': case ']': case '^':
155 case '`': case '{': case '|': case '}': case '~':
156 return 1;
158 case '<': case '&':
159 return 2;
161 case '-': case '.':
162 return 4;
164 return 0;
167 unsigned char *realloc_tok(int &tok_sz)
169 tok_sz += tok_sz + tok_sz / 4;
170 if (tok_sz >= (1<<29))
172 m_err="token buffer tried to malloc() more than 512MB, probably unsafe and invalid XML";
173 return NULL;
175 unsigned char *t = (unsigned char *) m_tok.ResizeOK(tok_sz);
176 if (!t) m_err="token buffer malloc fail";
177 return t;
180 // gets a token, normally skipping whitespace, but if get_tok(true), then return NULL on whitespace
181 const char *get_tok(bool no_skip_whitespace=false)
183 if (!no_skip_whitespace) skip_whitespace();
185 m_last_line = m_line;
186 m_last_col = m_col;
188 int wrpos=0, lc = m_lastchar, tok_sz = m_tok.GetSize();
189 unsigned char *tok_buf = (unsigned char *)m_tok.Get();
190 switch (lc > 0 ? char_type(lc) : -2)
192 case 0:
195 tok_buf[wrpos++] = lc;
196 if (WDL_unlikely(wrpos >= tok_sz) && WDL_unlikely(!(tok_buf=realloc_tok(tok_sz)))) return NULL;
197 lc = nextchar();
199 while (lc > 0 && !(char_type(lc)&~4));
200 break;
202 case 1:
203 case 2:
204 case 4:
205 if (lc == '\'' || lc == '\"')
207 const int endc = lc;
208 tok_buf[wrpos++] = lc;
209 lc = nextchar();
210 while (lc > 0)
212 if (lc == '<')
214 m_last_line=m_line; m_last_col=m_col;
215 m_err="illegal '<' character in quoted string";
216 m_lastchar = lc;
217 return NULL;
220 if (lc == '&')
222 m_lastchar = lc;
223 if (WDL_unlikely(wrpos+8 >= tok_sz) && WDL_unlikely(!(tok_buf=realloc_tok(tok_sz)))) return NULL;
225 const int tmp[2]={m_line,m_col};
226 if (!decode_entity((char*)tok_buf+wrpos))
228 m_last_line=tmp[0]; m_last_col=tmp[1];
229 m_err="unknown entity in quoted string";
230 return NULL;
232 lc = m_lastchar;
233 while (tok_buf[wrpos]) wrpos++;
235 else
237 const int llc = lc;
238 tok_buf[wrpos++] = lc;
239 if (WDL_unlikely(wrpos >= tok_sz) && WDL_unlikely(!(tok_buf=realloc_tok(tok_sz)))) return NULL;
240 lc = nextchar();
242 if (llc == endc) break;
246 else
248 tok_buf[wrpos++] = lc;
249 if (WDL_unlikely(wrpos >= tok_sz) && WDL_unlikely(!(tok_buf=realloc_tok(tok_sz)))) return NULL;
251 lc = nextchar();
253 break;
254 case -1:
255 m_err="unexpected whitespace";
256 return NULL;
257 default:
258 m_err=""; // EOF
259 return NULL;
261 tok_buf[wrpos]=0;
262 m_lastchar = lc;
263 return (char *)tok_buf;
266 bool decode_entity(char *wr) // will never write more than 8 bytes
268 char tmp[32];
269 int i=0;
270 while (i < 31 && (m_lastchar = nextchar()) > 0 && m_lastchar != ';')
272 if (char_type(m_lastchar) && m_lastchar != '#') break;
273 tmp[i++] = m_lastchar;
275 int byteval = 0;
276 if (m_lastchar == ';')
278 tmp[i]=0;
279 if (!strcmp(tmp,"lt")) byteval = '<';
280 else if (!strcmp(tmp,"gt")) byteval = '>';
281 else if (!strcmp(tmp,"amp")) byteval = '&';
282 else if (!strcmp(tmp,"apos")) byteval = '\'';
283 else if (!strcmp(tmp,"quot")) byteval = '"';
284 else if (tmp[0] == '#')
286 if (tmp[1] >= '0' && tmp[1] <= '9') byteval = atoi(tmp+1);
287 if (tmp[1] == 'x') byteval = strtol(tmp+1,NULL,16);
290 if (!byteval) return false;
291 WDL_MakeUTFChar((char*)wr,byteval,8);
292 m_lastchar = nextchar();
293 return true;
296 bool skip_until(const char *s) // raw search, no tokenization
298 int state = 0, c = m_lastchar;
299 while (c>0 && s[state])
301 state = (state && c == (unsigned char)s[state]) ? (state+1) : (c == (unsigned char)s[0]);
302 c = nextchar();
304 m_lastchar = c;
305 return !s[state];
308 const char *parse_element_attributes(wdl_xml_element *elem)
310 char *attr_name=NULL;
311 for (;;)
313 const char *tok = get_tok();
314 if (!tok) break;
316 if (*tok == '-' || *tok == '.' || (*tok >= '0' && *tok <= '9')) { m_err="attribute must not begin with .- or number"; break; }
318 if (char_type(*tok)) return tok;
320 attr_name = strdup(tok);
321 if (!attr_name) { m_err="malloc fail"; break; }
323 if (m_sort_attributes &&
324 elem->attributes.Get(attr_name))
326 m_err="attribute specified more than once";
327 break;
330 tok = get_tok();
331 if (!tok) break;
332 if (*tok != '=') { m_err="attribute name must be followed by '='"; break; }
334 tok = get_tok();
335 if (!tok) break;
336 if (*tok != '\'' && *tok != '"') { m_err="attribute value must be quoted string"; break; }
338 const size_t tok_len = strlen(tok);
339 if (tok_len < 2 || tok[tok_len-1] != tok[0]) { m_err="attribute value missing trailing quote"; break; }
341 char *value = (char *)malloc(tok_len-2+1);
342 if (!value) { m_err="malloc fail"; break; }
344 memcpy(value,tok+1,tok_len-2);
345 value[tok_len-2]=0;
347 if (m_sort_attributes)
348 elem->attributes.Insert(attr_name,value);
349 else
350 elem->attributes.AddUnsorted(attr_name,value);
352 attr_name = NULL;
354 free(attr_name);
355 return NULL;
358 const char *parse_element_body(wdl_xml_element *elem) // return NULL on success, error message on failure
360 int cnt=0;
361 for (;;)
363 if (elem)
365 bool want_add = elem->value.GetLength() > 0;
366 while (m_lastchar != '<' && m_lastchar > 0)
368 if (!want_add && char_type(m_lastchar)>=0) want_add=true;
370 bool adv=true;
371 if (m_lastchar == '&')
373 m_last_line=m_line; m_last_col=m_col;
374 char buf[8];
375 if (!decode_entity(buf)) return "unknown entity in element body";
376 elem->value.Append(buf);
377 adv=false;
379 else if (want_add)
381 unsigned char c = (unsigned char)m_lastchar;
382 elem->value.Append((const char *)&c,1);
385 if (adv) m_lastchar = nextchar();
389 const char *tok = get_tok(elem != NULL);
390 const int start_line = m_last_line, start_col = m_last_col;
391 if (!tok)
393 if (m_err && *m_err == 0 && !elem) m_err = NULL; // clear m_error if EOF and top level
394 return elem ? "unterminated block" : NULL;
396 if (*tok != '<') return "expected < tag";
398 tok = get_tok(true);
399 if (!tok) return "expected token after <";
401 if (tok[0] == '!')
403 tok = get_tok(true);
404 if (!tok) return "expected token following <!";
406 if (*tok == '-')
408 tok = get_tok(true);
409 if (!tok) return "expected token following <!-";
410 if (*tok != '-') return "unknown token following <!-";
411 if (!skip_until("--"))
413 m_last_line=start_line;
414 m_last_col=start_col;
415 return "unterminated comment";
417 tok = get_tok(true);
418 if (!tok || tok[0] != '>') return "-- not allowed in comment";
420 else if (*tok == '[')
422 if (!elem) return "<![ not allowed at document level";
423 tok = get_tok(true);
424 if (!tok || strcmp(tok,"CDATA")) return "unknown token beginning <![";
425 tok=get_tok(true);
426 if (!tok || tok[0] != '[') return "unknown token beginning <![CDATA but without trailing [";
428 // add content literally until ]]>
429 int lc=m_lastchar, last1=0,last2=0;
430 for (;;)
432 if (lc == '>' && last1 == ']' && last2 == ']') break;
434 unsigned char c = (unsigned char)lc;
435 elem->value.Append((const char *)&c,1);
436 last2 = last1;
437 last1 = lc;
439 lc = nextchar();
440 if (lc <= 0)
442 m_lastchar = -1;
443 m_last_line=start_line;
444 m_last_col=start_col;
445 return "unterminated <![CDATA[";
448 elem->value.SetLen(elem->value.GetLength()-2); // remove ]]
449 m_lastchar = nextchar();
452 else if (!strcmp(tok,"DOCTYPE"))
454 if (elem) return "<!DOCTYPE must be at top level";
455 if (element_doctype_tokens.GetSize()) return "<!DOCTYPE already specified";
457 tok = get_tok();
458 if (!tok || char_type(*tok)) return "expected document type token following <!DOCTYPE";
461 element_doctype_tokens.Add(strdup(tok));
462 tok = get_tok();
463 if (!tok)
465 m_last_line=start_line;
466 m_last_col=start_col;
467 return "unterminated <!DOCTYPE";
469 } while (tok[0] != '>');
471 else return "unknown token following <!";
473 else if (tok[0] == '?')
475 tok = get_tok(true);
476 if (!tok) return "expected token following <?";
478 if (!strcmp(tok,"xml"))
480 if (elem || cnt || element_xml || element_root_meta.GetSize()) return "<?xml must begin document";
482 element_xml = new wdl_xml_element("xml",start_line,start_col,m_sort_attributes);
483 tok = parse_element_attributes(element_xml);
484 if (!tok || tok[0] != '?' || !(tok=get_tok(true)) || tok[0] != '>')
485 return "<?xml not terminated";
487 else
489 if (!elem)
491 wdl_xml_element *ne = new wdl_xml_element(tok,start_line,start_col,m_sort_attributes);
492 tok = parse_element_attributes(ne);
493 if (!tok || tok[0] != '?' || !(tok=get_tok(true)) || tok[0] != '>')
495 delete ne;
496 return "<? element not terminated";
498 element_root_meta.Add(ne);
500 else if (!skip_until("?>")) // ignore <? inside elements
502 m_last_line=start_line;
503 m_last_col=start_col;
504 return "unterminated <? block";
508 else if (tok[0] == '/')
510 if (!elem) return "unexpected </ at root level";
512 tok = get_tok(true);
513 if (strcmp(tok,elem->name))
515 return "mismatched </ tag name";
518 tok = get_tok();
519 if (!tok || tok[0] != '>') return "expected > following </tag";
520 // done!
521 elem->m_has_discrete_close = true;
522 return NULL;
524 else
526 if (!elem && element_root)
527 return "multiple top level elements";
529 if (*tok == '-' || *tok == '.' || (*tok >= '0' && *tok <= '9'))
530 return "element name must not begin with .- or number";
532 wdl_xml_element *sub = new wdl_xml_element(tok,start_line,start_col,m_sort_attributes);
533 if (elem) elem->elements.Add(sub);
534 else element_root = sub;
536 tok = parse_element_attributes(sub);
537 if (!tok) return "unterminated element";
539 if (*tok == '/')
541 tok = get_tok(true);
542 if (!tok || *tok != '>') return "expected > following / to end element";
544 else if (*tok == '>')
546 const char *ret = parse_element_body(sub);
547 if (ret) return ret;
549 else
551 return "unknown token in element";
554 cnt++;
559 class wdl_xml_fileread : public wdl_xml_parser {
560 FILE *m_fp;
561 char m_buf[1024];
562 int m_charset;
564 virtual int moredata(const char **dataptr)
566 *dataptr = m_buf;
567 const int cs = m_charset;
568 if (m_fp) switch (cs)
570 case 0:
571 return (int) fread(m_buf,1,sizeof(m_buf),m_fp);
572 case 1:
573 case 2:
575 unsigned char tmp[128];
576 const int l = (int) fread(tmp,1,sizeof(tmp),m_fp);
577 int rd=0, wpos=0;
578 while (rd+1 < l)
580 const int amt=wdl_utf8_makechar(cs==1 ? ((tmp[rd]<<8)|tmp[rd+1]) : (tmp[rd]|(tmp[rd+1]<<8)),
581 m_buf+wpos,
582 (int)sizeof(m_buf)-wpos);
583 if (amt>0) wpos += amt;
585 rd+=2;
587 return wpos;
590 return 0;
592 public:
593 wdl_xml_fileread(FILE *fp) : wdl_xml_parser(NULL,0)
595 m_fp=fp;
596 m_charset=0; // default to utf-8
597 if (fp)
599 unsigned char bom[2];
600 if (fread(bom,1,2,fp)==2)
602 if (bom[0] == 0xEF && bom[1] == 0xBB && fgetc(fp) == 0xBF) m_charset=0;
603 else if (bom[0] == 0xFE && bom[1] == 0xFF) m_charset=1; // utf-16 BE
604 else if (bom[0] == 0xFF && bom[1] == 0xFE) m_charset=2; // utf-16 LE
605 else fseek(fp,0,SEEK_SET); // rewind
609 virtual ~wdl_xml_fileread() { if (m_fp) fclose(m_fp); }
612 #endif