nel/src/gui/html_parser.cpp

   1 // Ryzom - MMORPG Framework <http://dev.ryzom.com/projects/ryzom/>
   2 // Copyright (C) 2010  Winch Gate Property Limited
   3 //
   4 // This program is free software: you can redistribute it and/or modify
   5 // it under the terms of the GNU Affero General Public License as
   6 // published by the Free Software Foundation, either version 3 of the
   7 // License, or (at your option) any later version.
   8 //
   9 // This program is distributed in the hope that it will be useful,
  10 // but WITHOUT ANY WARRANTY; without even the implied warranty of
  11 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12 // GNU Affero General Public License for more details.
  13 //
  14 // You should have received a copy of the GNU Affero General Public License
  15 // along with this program.  If not, see <http://www.gnu.org/licenses/>.
  16
  17
  18 #include "stdpch.h"
  19
  20 #include "nel/gui/html_parser.h"
  21
  22 #include <string>
  23 #include <libxml/HTMLparser.h>
  24
  25 #include "nel/misc/types_nl.h"
  26 #include "nel/gui/libwww.h"
  27 #include "nel/gui/group_html.h"
  28
  29 using namespace std;
  30 using namespace NLMISC;
  31
  32 #ifdef DEBUG_NEW
  33 #define new DEBUG_NEW
  34 #endif
  35
  36 namespace NLGUI
  37 {
  38
  39         // ***************************************************************************
  40         void CHtmlParser::parseStyle(xmlNode *a_node, std::string &styleString) const
  41         {
  42                 xmlNode *node = a_node;
  43                 while(node)
  44                 {
  45                         if (node->type == XML_CDATA_SECTION_NODE)
  46                         {
  47                                 styleString += (const char*)node->content;
  48                         }
  49                         else
  50                         {
  51                                 nlwarning("<style> tag has child elements other than cdata[%d]", node->type);
  52                         }
  53
  54                         node = node->next;
  55                 }
  56         }
  57
  58         // ***************************************************************************
  59         // recursive function to walk html document
  60         void CHtmlParser::parseNode(xmlNode *a_node, CHtmlElement &parent, std::vector<std::string> &styles, std::vector<StyleLink> &links) const
  61         {
  62                 uint childIndex = 0;
  63                 uint element_number;
  64                 xmlNode *node = a_node;
  65                 while(node)
  66                 {
  67                         if (node->type == XML_TEXT_NODE)
  68                         {
  69                                 parent.Children.push_back(CHtmlElement(CHtmlElement::TEXT_NODE, (const char*)(node->content)));
  70                         }
  71                         else
  72                         if (node->type == XML_ELEMENT_NODE)
  73                         {
  74                                 // find html element
  75                                 element_number = htmlElementLookup((const char*)node->name);
  76
  77                                 // get pointer to previous sibling
  78                                 CHtmlElement *prevSibling = NULL;
  79                                 if (!parent.Children.empty())
  80                                 {
  81                                         // skip text nodes
  82                                         for(std::list<CHtmlElement>::reverse_iterator it = parent.Children.rbegin(); it != parent.Children.rend(); ++it)
  83                                         {
  84                                                 if (it->Type == CHtmlElement::ELEMENT_NODE)
  85                                                 {
  86                                                         prevSibling = &(*it);
  87                                                         break;
  88                                                 }
  89                                         }
  90                                 }
  91
  92                                 parent.Children.push_back(CHtmlElement(CHtmlElement::ELEMENT_NODE, toLowerAscii((const char*)node->name)));
  93                                 CHtmlElement &elm = parent.Children.back();
  94                                 elm.ID = element_number;
  95                                 elm.parent = &parent;
  96                                 elm.childIndex = childIndex;
  97
  98                                 // previous/next sibling that is ELEMENT_NODE
  99                                 elm.previousSibling = prevSibling;
 100                                 if (prevSibling)
 101                                 {
 102                                         prevSibling->nextSibling = &parent.Children.back();
 103                                 }
 104
 105                                 childIndex++;
 106
 107                                 // TODO: harvest <link type="css">, <style>, <img>
 108
 109                                 elm.Attributes.clear();
 110
 111                                 for (xmlAttr *cur_attr = node->properties; cur_attr; cur_attr = cur_attr->next) {
 112                                         std::string key(toLowerAscii((const char *)(cur_attr->name)));
 113                                         std::string value;
 114                                         if (cur_attr->children)
 115                                         {
 116                                                 value = (const char *)(cur_attr->children->content);
 117                                         }
 118                                         elm.Attributes[key] = value;
 119                                 }
 120
 121                                 if (elm.hasAttribute("class"))
 122                                 {
 123                                         std::vector<std::string> parts;
 124                                         NLMISC::splitString(elm.getAttribute("class"), " ", parts);
 125                                         for(uint i = 0; i<parts.size();++i)
 126                                         {
 127                                                 elm.ClassNames.insert(toLowerAscii(trim(parts[i])));
 128                                         }
 129                                 }
 130
 131                                 if (elm.Value == "style")
 132                                 {
 133                                         // <style type="text/css" media="all, screen">
 134                                         // ...
 135                                         // </style>
 136                                         bool useStyle = true;
 137                                         if (elm.hasAttribute("media"))
 138                                         {
 139                                                 std::string media = trim(toLowerAscii(elm.Attributes["media"]));
 140                                                 useStyle = media.empty() || media.find("all") != std::string::npos || media.find("screen") != std::string::npos;
 141
 142                                                 // <style media="ryzom"> for ingame browser
 143                                                 useStyle = useStyle || media == "ryzom";
 144                                         }
 145
 146                                         if (useStyle)
 147                                         {
 148                                                 std::string style;
 149                                                 parseStyle(node->children, style);
 150                                                 styles.push_back(style);
 151                                         }
 152                                         // style tag is kept in dom
 153                                 }
 154                                 if (elm.Value == "link" && elm.getAttribute("rel") == "stylesheet")
 155                                 {
 156                                         bool useStyle = true;
 157                                         if (elm.hasAttribute("media"))
 158                                         {
 159                                                 std::string media = trim(toLowerAscii(elm.Attributes["media"]));
 160                                                 useStyle = media.empty() || media.find("all") != std::string::npos || media.find("screen") != std::string::npos;
 161
 162                                                 // <style media="ryzom"> for ingame browser
 163                                                 useStyle = useStyle || media == "ryzom";
 164                                         }
 165
 166                                         if (useStyle)
 167                                         {
 168                                                 styles.push_back("");
 169                                                 links.push_back(StyleLink(styles.size()-1, elm.getAttribute("href")));
 170                                         }
 171                                         // link tag is kept in dom
 172                                 }
 173                                 else if (node->children)
 174                                 {
 175                                         parseNode(node->children, elm, styles, links);
 176
 177                                         // must cleanup nested tags that libxml2 does not fix
 178                                         // dt without end tag: <dl><dt><dt></dl>
 179                                         // dd without end tag: <dl><dd><dd></dl>
 180                                         if (!elm.Children.empty() && (elm.Value == "dt" || elm.Value == "dd"))
 181                                         {
 182                                                 std::string tag = elm.Value;
 183                                                 std::list<CHtmlElement>::iterator it;
 184                                                 for(it = elm.Children.begin(); it != elm.Children.end(); ++it)
 185                                                 {
 186                                                         if (it->Type == CHtmlElement::ELEMENT_NODE && it->Value == tag)
 187                                                         {
 188                                                                 // relocate this and remaining elements over to parent
 189                                                                 parent.Children.splice(parent.Children.end(), elm.Children, it, elm.Children.end());
 190                                                                 break;
 191                                                         }
 192                                                 }
 193                                                 elm.reindexChilds();
 194                                                 parent.reindexChilds();
 195                                         }
 196
 197                                         // move all <tr> directly under <table> to its own <tbody> ("table > tbody > tr" selector).
 198                                         // TODO: move first real <thead> to front, move first real <tfoot> at the end
 199                                         if (elm.ID == HTML_TABLE)
 200                                         {
 201                                                 std::list<CHtmlElement>::iterator it = elm.Children.begin();
 202                                                 std::list<CHtmlElement>::iterator tbody = elm.Children.end();
 203                                                 for(it = elm.Children.begin(); it != elm.Children.end(); ++it)
 204                                                 {
 205                                                         if (it->ID == HTML_TR)
 206                                                         {
 207                                                                 if (tbody == elm.Children.end())
 208                                                                 {
 209                                                                         tbody = elm.Children.insert(it, CHtmlElement(CHtmlElement::ELEMENT_NODE, "tbody"));
 210                                                                         tbody->ID = HTML_TBODY;
 211                                                                         tbody->parent = &elm;
 212                                                                 }
 213                                                                 tbody->Children.splice(tbody->Children.end(), elm.Children, it);
 214                                                                 it = tbody;
 215                                                         }
 216                                                         else if (tbody != elm.Children.end())
 217                                                         {
 218                                                                 tbody->reindexChilds();
 219                                                                 tbody = elm.Children.end();
 220                                                         }
 221                                                 }
 222
 223                                                 elm.reindexChilds();
 224                                         }
 225                                 }
 226                         }
 227
 228                         // move into next sibling
 229                         node = node->next;
 230                 }
 231         }
 232
 233         // ***************************************************************************
 234         // http://stackoverflow.com/a/18335183
 235         static std::string correctNonUtf8(const std::string &str)
 236         {
 237                 int i, f_size=str.size();
 238                 unsigned char c,c2,c3,c4;
 239                 std::string to;
 240                 to.reserve(f_size);
 241
 242                 for(i=0 ; i<f_size ; i++)
 243                 {
 244                         c=(unsigned char)(str[i]);
 245                         if (c<32)
 246                         {
 247                                 //control char
 248                                 if(c==9 || c==10 || c==13)
 249                                 {
 250                                         //allow only \t \n \r
 251                                         to.append(1,c);
 252                                 }
 253                                 continue;
 254                         }
 255                         else if (c<127)
 256                         {
 257                                 //normal ASCII
 258                                 to.append(1,c);
 259                                 continue;
 260                         }
 261                         else if (c < 160)
 262                         {
 263                                 //control char (nothing should be defined here either ASCI, ISO_8859-1 or UTF8, so skipping)
 264                                 if (c == 128)
 265                                 {
 266                                         //fix microsoft mess, add euro
 267                                         to.append(1,226);
 268                                         to.append(1,130);
 269                                         to.append(1,172);
 270                                 }
 271
 272                                 if (c == 133)
 273                                 {
 274                                         //fix IBM mess, add NEL = \n\r
 275                                         to.append(1,10);
 276                                         to.append(1,13);
 277                                 }
 278                                 continue;
 279                         }
 280                         else if (c < 192)
 281                         {
 282                                 //invalid for UTF8, converting ASCII
 283                                 to.append(1,(unsigned char)194);
 284                                 to.append(1,c);
 285                                 continue;
 286                         }
 287                         else if (c < 194)
 288                         {
 289                                 //invalid for UTF8, converting ASCII
 290                                 to.append(1,(unsigned char)195);
 291                                 to.append(1,c-64);
 292                                 continue;
 293                         }
 294                         else if (c < 224 && i + 1 < f_size)
 295                         {
 296                                 //possibly 2byte UTF8
 297                                 c2 = (unsigned char)(str[i+1]);
 298
 299                                 if (c2 > 127 && c2 < 192)
 300                                 {
 301                                         //valid 2byte UTF8
 302                                         if (c == 194 && c2 < 160)
 303                                         {
 304                                                 //control char, skipping
 305                                                 ;
 306                                         }
 307                                         else
 308                                         {
 309                                                 to.append(1,c);
 310                                                 to.append(1,c2);
 311                                         }
 312                                         i++;
 313                                         continue;
 314                                 }
 315                         }
 316                         else if (c < 240 && i + 2 < f_size)
 317                         {
 318                                 // possibly 3byte UTF8
 319                                 c2 = (unsigned char)(str[i+1]);
 320                                 c3 = (unsigned char)(str[i+2]);
 321
 322                                 if (c2 > 127 && c2 < 192 && c3 > 127 && c3 < 192)
 323                                 {
 324                                         // valid 3byte UTF8
 325                                         to.append(1,c);
 326                                         to.append(1,c2);
 327                                         to.append(1,c3);
 328                                         i+=2;
 329                                         continue;
 330                                 }
 331                         }
 332                         else if (c < 245 && i + 3 < f_size)
 333                         {
 334                                 //possibly 4byte UTF8
 335                                 c2 = (unsigned char)(str[i+1]);
 336                                 c3 = (unsigned char)(str[i+2]);
 337                                 c4 = (unsigned char)(str[i+3]);
 338                                 if (c2 > 127 && c2 < 192 && c3 > 127 && c3 < 192 && c4 > 127 && c4 < 192)
 339                                 {
 340                                         //valid 4byte UTF8
 341                                         to.append(1,c);
 342                                         to.append(1,c2);
 343                                         to.append(1,c3);
 344                                         to.append(1,c4);
 345                                         i+=3;
 346                                         continue;
 347                                 }
 348                         }
 349
 350                         //invalid UTF8, converting ASCII (c>245 || string too short for multi-byte))
 351                         to.append(1,(unsigned char)195);
 352                         to.append(1,c-64);
 353                 }
 354                 return to;
 355         }
 356
 357         // ***************************************************************************
 358         static void patchHtmlQuirks(std::string &htmlString)
 359         {
 360                 size_t npos = std::string::npos;
 361                 size_t pos;
 362
 363                 // get rid of BOM (some ingame help files does not show up otherwise)
 364                 if (htmlString.substr(0, 3) == "\xEF\xBB\xBF")
 365                 {
 366                         htmlString.erase(0, 3);
 367                 }
 368
 369                 // if any element is before <html>, then parser adds <html><body>
 370                 // and original tags are ignored (their attributes not processed)
 371                 //
 372                 // only fix situation when there is <body> tag with attributes
 373                 //
 374                 // tags are considered to be lowercase
 375
 376                 pos = htmlString.find("<body ");
 377                 if (pos != npos)
 378                 {
 379                         size_t start = htmlString.find("<");
 380                         // skip <!doctype html>
 381                         if (htmlString.substr(start, 2) == "<!")
 382                                 start = htmlString.find("<", start + 1);
 383
 384                         // if there is no html tag, then abort
 385                         size_t end = htmlString.find("<html>");
 386                         if (end != npos && start < end && end < pos)
 387                         {
 388                                 // body tag end position
 389                                 size_t insert = htmlString.find(">", pos);
 390                                 if (insert != npos)
 391                                 {
 392                                         std::string str = htmlString.substr(start, end - start);
 393                                         htmlString.insert(insert+1, str);
 394                                         htmlString.erase(start, str.size());
 395                                 }
 396                         }
 397                 }
 398
 399                 // make sure </html> (if present) is last in document or tags coming after it are ignored
 400                 pos = htmlString.find("</html>");
 401                 if (pos != npos && htmlString.find("<", pos+1) > pos)
 402                 {
 403                         htmlString.erase(pos, 7);
 404                         htmlString += "</html>";
 405                 }
 406
 407                 // if there is invalid utf-8 chars, then libxml will break everything after first it finds.
 408                 htmlString = correctNonUtf8(htmlString);
 409         }
 410
 411         // ***************************************************************************
 412         void CHtmlParser::getDOM(std::string htmlString, CHtmlElement &dom, std::vector<std::string> &styles, std::vector<StyleLink> &links) const
 413         {
 414                 htmlParserCtxtPtr parser = htmlCreatePushParserCtxt(NULL, NULL, NULL, 0, NULL, XML_CHAR_ENCODING_UTF8);
 415                 if (!parser)
 416                 {
 417                         nlwarning("Creating html parser context failed");
 418                         return;
 419                 }
 420
 421                 htmlCtxtUseOptions(parser, HTML_PARSE_NOBLANKS | HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING | HTML_PARSE_NONET);
 422
 423                 // parser is little strict on tag order, so fix whats needed
 424                 patchHtmlQuirks(htmlString);
 425
 426                 htmlParseChunk(parser, htmlString.c_str(), htmlString.size(), 0);
 427                 htmlParseChunk(parser, "", 0, 1);
 428
 429                 if (parser->myDoc)
 430                 {
 431                         xmlNode *root = xmlDocGetRootElement(parser->myDoc);
 432                         if (root)
 433                         {
 434                                 parseNode(root, dom, styles, links);
 435                         }
 436                         else
 437                         {
 438                                 nlwarning("html root node failed");
 439                         }
 440                         xmlFreeDoc(parser->myDoc);
 441                 }
 442                 else
 443                 {
 444                         nlwarning("htmlstring parsing failed");
 445                 }
 446
 447                 htmlFreeParserCtxt(parser);
 448         }
 449
 450 }
 451
 452