WDL/xmlparse.h

   1 /*
   2     WDL - xmlparse.h
   3     Copyright (C) 2016 and later, Cockos Incorporated
   4
   5     This software is provided 'as-is', without any express or implied
   6     warranty.  In no event will the authors be held liable for any damages
   7     arising from the use of this software.
   8
   9     Permission is granted to anyone to use this software for any purpose,
  10     including commercial applications, and to alter it and redistribute it
  11     freely, subject to the following restrictions:
  12
  13     1. The origin of this software must not be misrepresented; you must not
  14        claim that you wrote the original software. If you use this software
  15        in a product, an acknowledgment in the product documentation would be
  16        appreciated but is not required.
  17     2. Altered source versions must be plainly marked as such, and must not be
  18        misrepresented as being the original software.
  19     3. This notice may not be removed or altered from any source distribution.
  20
  21 very very very lightweight XML parser
  22
  23 reads: <?xml, <!DOCTYPE, <![CDATA[, &lt;&gt;&amp;&quot;&apos;&#xABC;&#123; top level <? blocks, ignores unknown <?tag blocks?> inside elements
  24 always uses 8-bit characters, uses UTF-8 encoding for &#xyz
  25 relatively strict. for overflow safety, enforces a token length limit of 512MB
  26
  27 */
  28
  29 #ifndef _WDL_XML_PARSE_H_
  30 #define _WDL_XML_PARSE_H_
  31 #include "ptrlist.h"
  32 #include "assocarray.h"
  33 #include "wdlstring.h"
  34 #include "wdlutf8.h"
  35 #include "wdlcstring.h"
  36
  37 class wdl_xml_element {
  38     static void attr_free(char *a) { free(a); }
  39   public:
  40     wdl_xml_element(const char *_name, int _line, int _col, bool _sort_attr=true) :
  41       attributes(WDL_assocarray_cmpstr<char>,NULL,attr_free,attr_free), name(strdup(_name)), line(_line), col(_col),
  42       m_sort_attributes(_sort_attr), m_has_discrete_close(false) { }
  43     ~wdl_xml_element() { free(name); elements.Empty(true); }
  44
  45     WDL_PtrList<wdl_xml_element> elements;
  46     WDL_AssocArray<char *, char *> attributes;
  47     WDL_FastString value; // value excluding any leading whitespace and excluding any elements
  48
  49     char *name;
  50     int line, col;
  51     bool m_sort_attributes;
  52     bool m_has_discrete_close;
  53
  54     const char *get_attribute(const char *v, const char *def=NULL) const
  55     {
  56       if (!m_sort_attributes)
  57       {
  58         const int n = attributes.GetSize();
  59         for (int x = 0; x < n; x ++)
  60         {
  61           char *key = NULL;
  62           const char *val = attributes.Enumerate(x,&key);
  63           if (key && !strcmp(key,v)) return val;
  64         }
  65       }
  66       return attributes.Get((char*)v,(char*)def);
  67     }
  68 };
  69
  70 class wdl_xml_parser {
  71   public:
  72     wdl_xml_parser(const char *rdptr, int rdptr_len, bool sort_attributes=true) :
  73       element_xml(NULL), element_root(NULL),
  74       m_rdptr((const unsigned char *)rdptr), m_err(NULL),
  75       m_rdptr_len(rdptr_len), m_line(1), m_col(0), m_lastchar(0),
  76       m_last_line(1),m_last_col(0),
  77       m_sort_attributes(sort_attributes)
  78     {
  79     }
  80     virtual ~wdl_xml_parser()
  81     {
  82       delete element_xml;
  83       delete element_root;
  84       element_doctype_tokens.Empty(true,free);
  85       element_root_meta.Empty(true);
  86     }
  87
  88     const char *parse() // call only once, returns NULL on success, error message on failure
  89     {
  90       m_lastchar = nextchar();
  91
  92       if (!m_tok.ResizeOK(256)) return "token buffer malloc fail";
  93
  94       const char *p = parse_element_body(NULL);
  95       if (!m_err) return p;
  96       if (!*m_err) m_err="unexpected end of file";
  97       if (!p) return m_err;
  98       snprintf(m_errbuf,sizeof(m_errbuf),"%s: %s",p,m_err);
  99       return m_errbuf;
 100     }
 101
 102     // output
 103     WDL_PtrList<char> element_doctype_tokens; // tokens after <!DOCTYPE
 104     wdl_xml_element *element_xml, *element_root;
 105
 106     WDL_PtrList<wdl_xml_element> element_root_meta; // any topen level <? elements?> other than <?xml which goes into element_xml
 107
 108     // get location  after parse() returns error
 109     int getLine() const { return m_last_line; }
 110     int getCol() const { return m_last_col; }
 111
 112
 113   private:
 114
 115     WDL_HeapBuf m_tok;
 116     const unsigned char *m_rdptr;
 117     const char *m_err; // NULL if no error, "" if EOF
 118     char m_errbuf[128];
 119     int m_rdptr_len, m_line, m_col, m_lastchar, m_last_line,m_last_col;
 120     bool m_sort_attributes;
 121
 122     virtual int moredata(const char **dataOut) { return 0; }
 123
 124     int nextchar()
 125     {
 126       if (m_rdptr_len < 1 && (m_rdptr_len = moredata((const char **)&m_rdptr)) < 1) return -1;
 127
 128       m_rdptr_len--;
 129       const int ret = (int)*m_rdptr++;
 130
 131       if (ret == '\n') { m_line++; m_col=0; }
 132       else m_col++;
 133
 134       return ret;
 135     }
 136
 137     int skip_whitespace()
 138     {
 139       int rv=0, lc = m_lastchar;
 140       while (char_type(lc) < 0) { lc = nextchar(); rv++; }
 141       m_lastchar = lc;
 142       return rv;
 143     }
 144
 145     static int char_type(int c)
 146     {
 147       switch (c)
 148       {
 149         case ' ': case '\r': case '\n': case '\t':
 150           return -1;
 151
 152         case '/': case '!': case '\\': case '\'': case '"': case '#': case '$':
 153         case '%': case '(': case ')': case '*': case '+': case ',': case ';':
 154         case '=': case '>': case '?': case '@': case '[': case ']': case '^':
 155         case '`': case '{': case '|': case '}': case '~':
 156           return 1;
 157
 158         case '<': case '&':
 159           return 2;
 160
 161         case '-': case '.':
 162           return 4;
 163       }
 164       return 0;
 165     }
 166
 167     unsigned char *realloc_tok(int &tok_sz)
 168     {
 169       tok_sz += tok_sz + tok_sz / 4;
 170       if (tok_sz >= (1<<29))
 171       {
 172         m_err="token buffer tried to malloc() more than 512MB, probably unsafe and invalid XML";
 173         return NULL;
 174       }
 175       unsigned char *t = (unsigned char *) m_tok.ResizeOK(tok_sz);
 176       if (!t) m_err="token buffer malloc fail";
 177       return t;
 178     }
 179
 180     // gets a token, normally skipping whitespace, but if get_tok(true), then return NULL on whitespace
 181     const char *get_tok(bool no_skip_whitespace=false)
 182     {
 183       if (!no_skip_whitespace) skip_whitespace();
 184
 185       m_last_line = m_line;
 186       m_last_col = m_col;
 187
 188       int wrpos=0, lc = m_lastchar, tok_sz = m_tok.GetSize();
 189       unsigned char *tok_buf = (unsigned char *)m_tok.Get();
 190       switch (lc > 0 ? char_type(lc) : -2)
 191       {
 192         case 0:
 193           do
 194           {
 195             tok_buf[wrpos++] = lc;
 196             if (WDL_unlikely(wrpos >= tok_sz) && WDL_unlikely(!(tok_buf=realloc_tok(tok_sz)))) return NULL;
 197             lc = nextchar();
 198           }
 199           while (lc > 0 && !(char_type(lc)&~4));
 200         break;
 201
 202         case 1:
 203         case 2:
 204         case 4:
 205           if (lc == '\'' || lc == '\"')
 206           {
 207             const int endc = lc;
 208             tok_buf[wrpos++] = lc;
 209             lc = nextchar();
 210             while (lc > 0)
 211             {
 212               if (lc == '<')
 213               {
 214                 m_last_line=m_line; m_last_col=m_col;
 215                 m_err="illegal '<' character in quoted string";
 216                 m_lastchar = lc;
 217                 return NULL;
 218               }
 219
 220               if (lc == '&')
 221               {
 222                 m_lastchar = lc;
 223                 if (WDL_unlikely(wrpos+8 >= tok_sz) && WDL_unlikely(!(tok_buf=realloc_tok(tok_sz)))) return NULL;
 224
 225                 const int tmp[2]={m_line,m_col};
 226                 if (!decode_entity((char*)tok_buf+wrpos))
 227                 {
 228                   m_last_line=tmp[0]; m_last_col=tmp[1];
 229                   m_err="unknown entity in quoted string";
 230                   return NULL;
 231                 }
 232                 lc = m_lastchar;
 233                 while (tok_buf[wrpos]) wrpos++;
 234               }
 235               else
 236               {
 237                 const int llc = lc;
 238                 tok_buf[wrpos++] = lc;
 239                 if (WDL_unlikely(wrpos >= tok_sz) && WDL_unlikely(!(tok_buf=realloc_tok(tok_sz)))) return NULL;
 240                 lc = nextchar();
 241
 242                 if (llc == endc) break;
 243               }
 244             }
 245           }
 246           else
 247           {
 248             tok_buf[wrpos++] = lc;
 249             if (WDL_unlikely(wrpos >= tok_sz) && WDL_unlikely(!(tok_buf=realloc_tok(tok_sz)))) return NULL;
 250
 251             lc = nextchar();
 252           }
 253         break;
 254         case -1:
 255           m_err="unexpected whitespace";
 256         return NULL;
 257         default:
 258           m_err=""; // EOF
 259         return NULL;
 260       }
 261       tok_buf[wrpos]=0;
 262       m_lastchar = lc;
 263       return (char *)tok_buf;
 264     }
 265
 266     bool decode_entity(char *wr) // will never write more than 8 bytes
 267     {
 268       char tmp[32];
 269       int i=0;
 270       while (i < 31 && (m_lastchar = nextchar()) > 0 && m_lastchar != ';')
 271       {
 272         if (char_type(m_lastchar) && m_lastchar != '#') break;
 273         tmp[i++] = m_lastchar;
 274       }
 275       int byteval = 0;
 276       if (m_lastchar == ';')
 277       {
 278         tmp[i]=0;
 279         if (!strcmp(tmp,"lt")) byteval = '<';
 280         else if (!strcmp(tmp,"gt")) byteval = '>';
 281         else if (!strcmp(tmp,"amp")) byteval = '&';
 282         else if (!strcmp(tmp,"apos")) byteval = '\'';
 283         else if (!strcmp(tmp,"quot")) byteval = '"';
 284         else if (tmp[0] == '#')
 285         {
 286           if (tmp[1] >= '0' && tmp[1] <= '9') byteval = atoi(tmp+1);
 287           if (tmp[1] == 'x') byteval = strtol(tmp+1,NULL,16);
 288         }
 289       }
 290       if (!byteval) return false;
 291       WDL_MakeUTFChar((char*)wr,byteval,8);
 292       m_lastchar = nextchar();
 293       return true;
 294     }
 295
 296     bool skip_until(const char *s) // raw search, no tokenization
 297     {
 298       int state = 0, c = m_lastchar;
 299       while (c>0 && s[state])
 300       {
 301         state = (state && c == (unsigned char)s[state]) ? (state+1) : (c == (unsigned char)s[0]);
 302         c = nextchar();
 303       }
 304       m_lastchar = c;
 305       return !s[state];
 306     }
 307
 308     const char *parse_element_attributes(wdl_xml_element *elem)
 309     {
 310       char *attr_name=NULL;
 311       for (;;)
 312       {
 313         const char *tok = get_tok();
 314         if (!tok) break;
 315
 316         if (*tok == '-' || *tok == '.' || (*tok >= '0' && *tok <= '9')) { m_err="attribute must not begin with .- or number"; break; }
 317
 318         if (char_type(*tok)) return tok;
 319
 320         attr_name = strdup(tok);
 321         if (!attr_name) { m_err="malloc fail"; break; }
 322
 323         if (m_sort_attributes &&
 324             elem->attributes.Get(attr_name))
 325         {
 326           m_err="attribute specified more than once";
 327           break;
 328         }
 329
 330         tok = get_tok();
 331         if (!tok) break;
 332         if (*tok != '=') { m_err="attribute name must be followed by '='"; break; }
 333
 334         tok = get_tok();
 335         if (!tok) break;
 336         if (*tok != '\'' && *tok != '"') { m_err="attribute value must be quoted string"; break; }
 337
 338         const size_t tok_len = strlen(tok);
 339         if (tok_len < 2 || tok[tok_len-1] != tok[0]) { m_err="attribute value missing trailing quote"; break;  }
 340
 341         char *value = (char *)malloc(tok_len-2+1);
 342         if (!value) { m_err="malloc fail"; break; }
 343
 344         memcpy(value,tok+1,tok_len-2);
 345         value[tok_len-2]=0;
 346
 347         if (m_sort_attributes)
 348           elem->attributes.Insert(attr_name,value);
 349         else
 350           elem->attributes.AddUnsorted(attr_name,value);
 351
 352         attr_name = NULL;
 353       }
 354       free(attr_name);
 355       return NULL;
 356     }
 357
 358     const char *parse_element_body(wdl_xml_element *elem) // return NULL on success, error message on failure
 359     {
 360       int cnt=0;
 361       for (;;)
 362       {
 363         if (elem)
 364         {
 365           bool want_add = elem->value.GetLength() > 0;
 366           while (m_lastchar != '<' && m_lastchar > 0)
 367           {
 368             if (!want_add && char_type(m_lastchar)>=0) want_add=true;
 369
 370             bool adv=true;
 371             if (m_lastchar == '&')
 372             {
 373               m_last_line=m_line; m_last_col=m_col;
 374               char buf[8];
 375               if (!decode_entity(buf)) return "unknown entity in element body";
 376               elem->value.Append(buf);
 377               adv=false;
 378             }
 379             else if (want_add)
 380             {
 381               unsigned char c = (unsigned char)m_lastchar;
 382               elem->value.Append((const char *)&c,1);
 383             }
 384
 385             if (adv) m_lastchar = nextchar();
 386           }
 387         }
 388
 389         const char *tok = get_tok(elem != NULL);
 390         const int start_line = m_last_line, start_col = m_last_col;
 391         if (!tok)
 392         {
 393           if (m_err && *m_err == 0 && !elem) m_err = NULL; // clear m_error if EOF and top level
 394           return elem ? "unterminated block" : NULL;
 395         }
 396         if (*tok != '<') return "expected < tag";
 397
 398         tok = get_tok(true);
 399         if (!tok) return "expected token after <";
 400
 401         if (tok[0] == '!')
 402         {
 403           tok = get_tok(true);
 404           if (!tok) return "expected token following <!";
 405
 406           if (*tok == '-')
 407           {
 408             tok = get_tok(true);
 409             if (!tok) return "expected token following <!-";
 410             if (*tok != '-') return "unknown token following <!-";
 411             if (!skip_until("--"))
 412             {
 413               m_last_line=start_line;
 414               m_last_col=start_col;
 415               return "unterminated comment";
 416             }
 417             tok = get_tok(true);
 418             if (!tok || tok[0] != '>') return "-- not allowed in comment";
 419           }
 420           else if (*tok == '[')
 421           {
 422             if (!elem) return "<![ not allowed at document level";
 423             tok = get_tok(true);
 424             if (!tok || strcmp(tok,"CDATA")) return "unknown token beginning <![";
 425             tok=get_tok(true);
 426             if (!tok || tok[0] != '[') return "unknown token beginning <![CDATA but without trailing [";
 427
 428             // add content literally until ]]>
 429             int lc=m_lastchar, last1=0,last2=0;
 430             for (;;)
 431             {
 432               if (lc == '>' && last1 == ']' && last2 == ']') break;
 433
 434               unsigned char c = (unsigned char)lc;
 435               elem->value.Append((const char *)&c,1);
 436               last2 = last1;
 437               last1 = lc;
 438
 439               lc = nextchar();
 440               if (lc <= 0)
 441               {
 442                 m_lastchar = -1;
 443                 m_last_line=start_line;
 444                 m_last_col=start_col;
 445                 return "unterminated <![CDATA[";
 446               }
 447             }
 448             elem->value.SetLen(elem->value.GetLength()-2); // remove ]]
 449             m_lastchar = nextchar();
 450
 451           }
 452           else if (!strcmp(tok,"DOCTYPE"))
 453           {
 454             if (elem) return "<!DOCTYPE must be at top level";
 455             if (element_doctype_tokens.GetSize()) return "<!DOCTYPE already specified";
 456
 457             tok = get_tok();
 458             if (!tok || char_type(*tok)) return "expected document type token following <!DOCTYPE";
 459             do
 460             {
 461               element_doctype_tokens.Add(strdup(tok));
 462               tok = get_tok();
 463               if (!tok)
 464               {
 465                 m_last_line=start_line;
 466                 m_last_col=start_col;
 467                 return "unterminated <!DOCTYPE";
 468               }
 469             } while (tok[0] != '>');
 470           }
 471           else return "unknown token following <!";
 472         }
 473         else if (tok[0] == '?')
 474         {
 475           tok = get_tok(true);
 476           if (!tok) return "expected token following <?";
 477
 478           if (!strcmp(tok,"xml"))
 479           {
 480             if (elem || cnt || element_xml || element_root_meta.GetSize()) return "<?xml must begin document";
 481
 482             element_xml = new wdl_xml_element("xml",start_line,start_col,m_sort_attributes);
 483             tok = parse_element_attributes(element_xml);
 484             if (!tok || tok[0] != '?' || !(tok=get_tok(true)) || tok[0] != '>')
 485               return "<?xml not terminated";
 486           }
 487           else
 488           {
 489             if (!elem)
 490             {
 491               wdl_xml_element *ne = new wdl_xml_element(tok,start_line,start_col,m_sort_attributes);
 492               tok = parse_element_attributes(ne);
 493               if (!tok || tok[0] != '?' || !(tok=get_tok(true)) || tok[0] != '>')
 494               {
 495                 delete ne;
 496                 return "<? element not terminated";
 497               }
 498               element_root_meta.Add(ne);
 499             }
 500             else if (!skip_until("?>")) // ignore <? inside elements
 501             {
 502               m_last_line=start_line;
 503               m_last_col=start_col;
 504               return "unterminated <? block";
 505             }
 506           }
 507         }
 508         else if (tok[0] == '/')
 509         {
 510           if (!elem) return "unexpected </ at root level";
 511
 512           tok = get_tok(true);
 513           if (strcmp(tok,elem->name))
 514           {
 515             return "mismatched </ tag name";
 516           }
 517
 518           tok = get_tok();
 519           if (!tok || tok[0] != '>') return "expected > following </tag";
 520           // done!
 521           elem->m_has_discrete_close = true;
 522           return NULL;
 523         }
 524         else
 525         {
 526           if (!elem && element_root)
 527             return "multiple top level elements";
 528
 529           if (*tok == '-' || *tok == '.' || (*tok >= '0' && *tok <= '9'))
 530             return "element name must not begin with .- or number";
 531
 532           wdl_xml_element *sub = new wdl_xml_element(tok,start_line,start_col,m_sort_attributes);
 533           if (elem) elem->elements.Add(sub);
 534           else element_root = sub;
 535
 536           tok = parse_element_attributes(sub);
 537           if (!tok) return "unterminated element";
 538
 539           if (*tok == '/')
 540           {
 541             tok = get_tok(true);
 542             if (!tok || *tok != '>') return "expected > following / to end element";
 543           }
 544           else if (*tok == '>')
 545           {
 546             const char *ret = parse_element_body(sub);
 547             if (ret) return ret;
 548           }
 549           else
 550           {
 551             return "unknown token in element";
 552           }
 553         }
 554         cnt++;
 555       }
 556     }
 557 };
 558
 559 class wdl_xml_fileread : public wdl_xml_parser {
 560   FILE *m_fp;
 561   char m_buf[1024];
 562   int m_charset;
 563
 564   virtual int moredata(const char **dataptr)
 565   {
 566     *dataptr = m_buf;
 567     const int cs = m_charset;
 568     if (m_fp) switch (cs)
 569     {
 570       case 0:
 571         return (int) fread(m_buf,1,sizeof(m_buf),m_fp);
 572       case 1:
 573       case 2:
 574         {
 575           unsigned char tmp[128];
 576           const int l = (int) fread(tmp,1,sizeof(tmp),m_fp);
 577           int rd=0, wpos=0;
 578           while (rd+1 < l)
 579           {
 580             const int amt=wdl_utf8_makechar(cs==1 ? ((tmp[rd]<<8)|tmp[rd+1]) : (tmp[rd]|(tmp[rd+1]<<8)),
 581                 m_buf+wpos,
 582                 (int)sizeof(m_buf)-wpos);
 583             if (amt>0) wpos += amt;
 584
 585             rd+=2;
 586           }
 587           return wpos;
 588         }
 589     }
 590     return 0;
 591   }
 592 public:
 593   wdl_xml_fileread(FILE *fp) : wdl_xml_parser(NULL,0)
 594   {
 595     m_fp=fp;
 596     m_charset=0; // default to utf-8
 597     if (fp)
 598     {
 599       unsigned char bom[2];
 600       if (fread(bom,1,2,fp)==2)
 601       {
 602         if (bom[0] == 0xEF && bom[1] == 0xBB && fgetc(fp) == 0xBF) m_charset=0;
 603         else if (bom[0] == 0xFE && bom[1] == 0xFF) m_charset=1; // utf-16 BE
 604         else if (bom[0] == 0xFF && bom[1] == 0xFE) m_charset=2; // utf-16 LE
 605         else fseek(fp,0,SEEK_SET); // rewind
 606       }
 607     }
 608   }
 609   virtual ~wdl_xml_fileread() { if (m_fp) fclose(m_fp); }
 610 };
 611
 612 #endif
 613