nel/src/gui/html_parser.cpp

   1 // Ryzom - MMORPG Framework <http://dev.ryzom.com/projects/ryzom/>
   2 // Copyright (C) 2010-2021  Winch Gate Property Limited
   3 //
   4 // This source file has been modified by the following contributors:
   5 // Copyright (C) 2020  Jan BOON (Kaetemi) <jan.boon@kaetemi.be>
   6 //
   7 // This program is free software: you can redistribute it and/or modify
   8 // it under the terms of the GNU Affero General Public License as
   9 // published by the Free Software Foundation, either version 3 of the
  10 // License, or (at your option) any later version.
  11 //
  12 // This program is distributed in the hope that it will be useful,
  13 // but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15 // GNU Affero General Public License for more details.
  16 //
  17 // You should have received a copy of the GNU Affero General Public License
  18 // along with this program.  If not, see <http://www.gnu.org/licenses/>.
  19
  20
  21 #include "stdpch.h"
  22
  23 #include "nel/gui/html_parser.h"
  24
  25 #include <string>
  26 #include <libxml/HTMLparser.h>
  27
  28 #include "nel/misc/types_nl.h"
  29 #include "nel/gui/libwww.h"
  30 #include "nel/gui/group_html.h"
  31
  32 using namespace std;
  33 using namespace NLMISC;
  34
  35 #ifdef DEBUG_NEW
  36 #define new DEBUG_NEW
  37 #endif
  38
  39 namespace NLGUI
  40 {
  41
  42         // ***************************************************************************
  43         void CHtmlParser::parseStyle(xmlNode *a_node, std::string &styleString) const
  44         {
  45                 xmlNode *node = a_node;
  46                 while(node)
  47                 {
  48                         if (node->type == XML_CDATA_SECTION_NODE)
  49                         {
  50                                 styleString += (const char*)node->content;
  51                         }
  52                         else
  53                         {
  54                                 nlwarning("<style> tag has child elements other than cdata[%d]", node->type);
  55                         }
  56
  57                         node = node->next;
  58                 }
  59         }
  60
  61         // ***************************************************************************
  62         // recursive function to walk html document
  63         void CHtmlParser::parseNode(xmlNode *a_node, CHtmlElement &parent, std::vector<std::string> &styles, std::vector<StyleLink> &links) const
  64         {
  65                 uint childIndex = 0;
  66                 uint element_number;
  67                 xmlNode *node = a_node;
  68                 while(node)
  69                 {
  70                         if (node->type == XML_TEXT_NODE)
  71                         {
  72                                 // linebreak right after pre,textare open tag should be removed
  73                                 if (parent.Children.empty() && (*node->content == '\n') && (parent.ID == HTML_PRE || parent.ID == HTML_TEXTAREA))
  74                                 {
  75                                         parent.Children.push_back(CHtmlElement(CHtmlElement::TEXT_NODE, (const char*)(node->content) + 1));
  76                                 }
  77                                 else
  78                                 {
  79                                         parent.Children.push_back(CHtmlElement(CHtmlElement::TEXT_NODE, (const char*)(node->content)));
  80                                 }
  81                         }
  82                         else
  83                         if (node->type == XML_ELEMENT_NODE)
  84                         {
  85                                 // find html element
  86                                 element_number = htmlElementLookup((const char*)node->name);
  87
  88                                 // get pointer to previous sibling
  89                                 CHtmlElement *prevSibling = NULL;
  90                                 if (!parent.Children.empty())
  91                                 {
  92                                         // skip text nodes
  93                                         for(std::list<CHtmlElement>::reverse_iterator it = parent.Children.rbegin(); it != parent.Children.rend(); ++it)
  94                                         {
  95                                                 if (it->Type == CHtmlElement::ELEMENT_NODE)
  96                                                 {
  97                                                         prevSibling = &(*it);
  98                                                         break;
  99                                                 }
 100                                         }
 101                                 }
 102
 103                                 parent.Children.push_back(CHtmlElement(CHtmlElement::ELEMENT_NODE, toLowerAscii((const char*)node->name)));
 104                                 CHtmlElement &elm = parent.Children.back();
 105                                 elm.ID = element_number;
 106                                 elm.parent = &parent;
 107                                 elm.childIndex = childIndex;
 108
 109                                 // previous/next sibling that is ELEMENT_NODE
 110                                 elm.previousSibling = prevSibling;
 111                                 if (prevSibling)
 112                                 {
 113                                         prevSibling->nextSibling = &parent.Children.back();
 114                                 }
 115
 116                                 childIndex++;
 117
 118                                 // TODO: harvest <link type="css">, <style>, <img>
 119
 120                                 elm.Attributes.clear();
 121
 122                                 for (xmlAttr *cur_attr = node->properties; cur_attr; cur_attr = cur_attr->next) {
 123                                         std::string key(toLowerAscii((const char *)(cur_attr->name)));
 124                                         std::string value;
 125                                         if (cur_attr->children)
 126                                         {
 127                                                 value = (const char *)(cur_attr->children->content);
 128                                         }
 129                                         elm.Attributes[key] = value;
 130                                 }
 131
 132                                 if (elm.hasAttribute("class"))
 133                                 {
 134                                         std::vector<std::string> parts;
 135                                         NLMISC::splitString(elm.getAttribute("class"), " ", parts);
 136                                         for(uint i = 0; i<parts.size();++i)
 137                                         {
 138                                                 elm.ClassNames.insert(toLowerAscii(trim(parts[i])));
 139                                         }
 140                                 }
 141
 142                                 if (elm.Value == "style")
 143                                 {
 144                                         // <style type="text/css" media="all, screen">
 145                                         // ...
 146                                         // </style>
 147                                         bool useStyle = true;
 148                                         if (elm.hasAttribute("media"))
 149                                         {
 150                                                 std::string media = trim(toLowerAscii(elm.Attributes["media"]));
 151                                                 useStyle = media.empty() || media.find("all") != std::string::npos || media.find("screen") != std::string::npos;
 152
 153                                                 // <style media="ryzom"> for ingame browser
 154                                                 useStyle = useStyle || media == "ryzom";
 155                                         }
 156
 157                                         if (useStyle)
 158                                         {
 159                                                 std::string style;
 160                                                 parseStyle(node->children, style);
 161                                                 styles.push_back(style);
 162                                         }
 163                                         // style tag is kept in dom
 164                                 }
 165                                 if (elm.Value == "link" && elm.getAttribute("rel") == "stylesheet")
 166                                 {
 167                                         bool useStyle = true;
 168                                         if (elm.hasAttribute("media"))
 169                                         {
 170                                                 std::string media = trim(toLowerAscii(elm.Attributes["media"]));
 171                                                 useStyle = media.empty() || media.find("all") != std::string::npos || media.find("screen") != std::string::npos;
 172
 173                                                 // <style media="ryzom"> for ingame browser
 174                                                 useStyle = useStyle || media == "ryzom";
 175                                         }
 176
 177                                         if (useStyle)
 178                                         {
 179                                                 styles.push_back("");
 180                                                 links.push_back(StyleLink(styles.size()-1, elm.getAttribute("href")));
 181                                         }
 182                                         // link tag is kept in dom
 183                                 }
 184                                 else if (node->children)
 185                                 {
 186                                         parseNode(node->children, elm, styles, links);
 187
 188                                         if (!elm.Children.empty() && elm.ID == HTML_PRE && elm.Children.back().Type == CHtmlElement::TEXT_NODE)
 189                                         {
 190                                                 std::string::size_type size = elm.Children.back().Value.size();
 191                                                 // strip last '\n' from non-empty line
 192                                                 if (size > 1 && elm.Children.back().Value[size-1] == '\n')
 193                                                 {
 194                                                         elm.Children.back().Value = elm.Children.back().Value.substr(0, size - 1);
 195                                                 }
 196                                         }
 197
 198                                         // must cleanup nested tags that libxml2 does not fix
 199                                         // dt without end tag: <dl><dt><dt></dl>
 200                                         // dd without end tag: <dl><dd><dd></dl>
 201                                         if (!elm.Children.empty() && (elm.Value == "dt" || elm.Value == "dd"))
 202                                         {
 203                                                 std::string tag = elm.Value;
 204                                                 std::list<CHtmlElement>::iterator it;
 205                                                 for(it = elm.Children.begin(); it != elm.Children.end(); ++it)
 206                                                 {
 207                                                         if (it->Type == CHtmlElement::ELEMENT_NODE && it->Value == tag)
 208                                                         {
 209                                                                 // relocate this and remaining elements over to parent
 210                                                                 parent.Children.splice(parent.Children.end(), elm.Children, it, elm.Children.end());
 211                                                                 break;
 212                                                         }
 213                                                 }
 214                                                 elm.reindexChilds();
 215                                                 parent.reindexChilds();
 216                                         }
 217
 218                                         // move all <tr> directly under <table> to its own <tbody> ("table > tbody > tr" selector).
 219                                         // TODO: move first real <thead> to front, move first real <tfoot> at the end
 220                                         if (elm.ID == HTML_TABLE)
 221                                         {
 222                                                 std::list<CHtmlElement>::iterator it = elm.Children.begin();
 223                                                 std::list<CHtmlElement>::iterator tbody = elm.Children.end();
 224                                                 for(it = elm.Children.begin(); it != elm.Children.end(); ++it)
 225                                                 {
 226                                                         if (it->ID == HTML_TR)
 227                                                         {
 228                                                                 if (tbody == elm.Children.end())
 229                                                                 {
 230                                                                         tbody = elm.Children.insert(it, CHtmlElement(CHtmlElement::ELEMENT_NODE, "tbody"));
 231                                                                         tbody->ID = HTML_TBODY;
 232                                                                         tbody->parent = &elm;
 233                                                                 }
 234                                                                 tbody->Children.splice(tbody->Children.end(), elm.Children, it);
 235                                                                 it = tbody;
 236                                                         }
 237                                                         else if (tbody != elm.Children.end())
 238                                                         {
 239                                                                 tbody->reindexChilds();
 240                                                                 tbody = elm.Children.end();
 241                                                         }
 242                                                 }
 243
 244                                                 elm.reindexChilds();
 245                                         }
 246                                 }
 247                         }
 248
 249                         // move into next sibling
 250                         node = node->next;
 251                 }
 252         }
 253
 254         // ***************************************************************************
	// Based on http://stackoverflow.com/a/18335183

	// true if c has the form of a UTF-8 continuation byte (0x80..0xBF)
	static bool isUtf8Trail(unsigned char c)
	{
		return c > 127 && c < 192;
	}

	// Returns a copy of str with bytes that do not form well-formed UTF-8
	// removed or re-encoded:
	//  - control characters other than \t \n \r are dropped (byte 127 and the
	//    two-byte forms of U+0080..U+009F included)
	//  - a stray byte 128 becomes the euro sign, a stray byte 133 becomes "\n\r",
	//    other stray bytes 129..159 are dropped
	//  - any other byte that does not start a well-formed sequence is treated as
	//    ISO-8859-1 and re-encoded as two UTF-8 bytes
	// Well-formed 2, 3 and 4 byte sequences are copied unchanged. Overlong
	// forms, UTF-16 surrogates and values above U+10FFFF are not considered
	// well-formed.
	static std::string correctNonUtf8(const std::string &str)
	{
		const std::string::size_type f_size = str.size();
		std::string to;
		to.reserve(f_size);

		for (std::string::size_type i = 0; i < f_size; i++)
		{
			const unsigned char c = (unsigned char)(str[i]);
			if (c < 32)
			{
				// control char, allow only \t \n \r
				if (c == 9 || c == 10 || c == 13)
				{
					to.append(1, (char)c);
				}
				continue;
			}

			if (c < 127)
			{
				// normal ASCII
				to.append(1, (char)c);
				continue;
			}

			if (c < 160)
			{
				// control char (nothing should be defined here in either ASCII,
				// ISO-8859-1 or UTF-8, so skipping); 127 lands here as well
				if (c == 128)
				{
					// fix microsoft mess, add euro
					to += "\xE2\x82\xAC";
				}
				else if (c == 133)
				{
					// fix IBM mess, add NEL = \n\r
					to += "\n\r";
				}
				continue;
			}

			if (c < 192)
			{
				// invalid as UTF-8 lead byte, convert from ISO-8859-1
				to.append(1, (char)194);
				to.append(1, (char)c);
				continue;
			}

			if (c < 194)
			{
				// 192 and 193 can only start overlong sequences, convert from ISO-8859-1
				to.append(1, (char)195);
				to.append(1, (char)(c - 64));
				continue;
			}

			if (c < 224)
			{
				// possibly 2byte UTF8
				if (i + 1 < f_size)
				{
					const unsigned char c2 = (unsigned char)(str[i+1]);
					if (isUtf8Trail(c2))
					{
						// valid 2byte UTF8; 194 followed by 128..159 is a control char and skipped
						if (!(c == 194 && c2 < 160))
						{
							to.append(1, (char)c);
							to.append(1, (char)c2);
						}
						i++;
						continue;
					}
				}
			}
			else if (c < 240)
			{
				// possibly 3byte UTF8
				if (i + 2 < f_size)
				{
					const unsigned char c2 = (unsigned char)(str[i+1]);
					const unsigned char c3 = (unsigned char)(str[i+2]);
					// 224 with c2 < 160 is an overlong form,
					// 237 with c2 >= 160 encodes a UTF-16 surrogate
					const bool overlong = (c == 224 && c2 < 160);
					const bool surrogate = (c == 237 && c2 >= 160);
					if (isUtf8Trail(c2) && isUtf8Trail(c3) && !overlong && !surrogate)
					{
						// valid 3byte UTF8
						to.append(1, (char)c);
						to.append(1, (char)c2);
						to.append(1, (char)c3);
						i += 2;
						continue;
					}
				}
			}
			else if (c < 245)
			{
				// possibly 4byte UTF8
				if (i + 3 < f_size)
				{
					const unsigned char c2 = (unsigned char)(str[i+1]);
					const unsigned char c3 = (unsigned char)(str[i+2]);
					const unsigned char c4 = (unsigned char)(str[i+3]);
					// 240 with c2 < 144 is an overlong form,
					// 244 with c2 >= 144 is above U+10FFFF
					const bool overlong = (c == 240 && c2 < 144);
					const bool tooLarge = (c == 244 && c2 >= 144);
					if (isUtf8Trail(c2) && isUtf8Trail(c3) && isUtf8Trail(c4) && !overlong && !tooLarge)
					{
						// valid 4byte UTF8
						to.append(1, (char)c);
						to.append(1, (char)c2);
						to.append(1, (char)c3);
						to.append(1, (char)c4);
						i += 3;
						continue;
					}
				}
			}

			// invalid UTF8 (bad lead byte, bad continuation or string too short for
			// the multi-byte form): convert this single byte from ISO-8859-1, the
			// bytes after it are examined on their own in the next iterations
			to.append(1, (char)195);
			to.append(1, (char)(c - 64));
		}
		return to;
	}
 377
 378         // ***************************************************************************
 379         static void patchHtmlQuirks(std::string &htmlString)
 380         {
 381                 size_t npos = std::string::npos;
 382                 size_t pos;
 383
 384                 // get rid of BOM (some ingame help files does not show up otherwise)
 385                 if (htmlString.substr(0, 3) == "\xEF\xBB\xBF")
 386                 {
 387                         htmlString.erase(0, 3);
 388                 }
 389
 390                 // if any element is before <html>, then parser adds <html><body>
 391                 // and original tags are ignored (their attributes not processed)
 392                 //
 393                 // only fix situation when there is <body> tag with attributes
 394                 //
 395                 // tags are considered to be lowercase
 396
 397                 pos = htmlString.find("<body ");
 398                 if (pos != npos)
 399                 {
 400                         size_t start = htmlString.find("<");
 401                         // skip <!doctype html>
 402                         if (htmlString.substr(start, 2) == "<!")
 403                                 start = htmlString.find("<", start + 1);
 404
 405                         // if there is no html tag, then abort
 406                         size_t end = htmlString.find("<html>");
 407                         if (end != npos && start < end && end < pos)
 408                         {
 409                                 // body tag end position
 410                                 size_t insert = htmlString.find(">", pos);
 411                                 if (insert != npos)
 412                                 {
 413                                         std::string str = htmlString.substr(start, end - start);
 414                                         htmlString.insert(insert+1, str);
 415                                         htmlString.erase(start, str.size());
 416                                 }
 417                         }
 418                 }
 419
 420                 // make sure </html> (if present) is last in document or tags coming after it are ignored
 421                 pos = htmlString.find("</html>");
 422                 if (pos != npos && htmlString.find("<", pos+1) > pos)
 423                 {
 424                         htmlString.erase(pos, 7);
 425                         htmlString += "</html>";
 426                 }
 427
 428                 // if there is invalid utf-8 chars, then libxml will break everything after first it finds.
 429                 htmlString = correctNonUtf8(htmlString);
 430         }
 431
 432         // ***************************************************************************
 433         void CHtmlParser::getDOM(std::string htmlString, CHtmlElement &dom, std::vector<std::string> &styles, std::vector<StyleLink> &links) const
 434         {
 435                 htmlParserCtxtPtr parser = htmlCreatePushParserCtxt(NULL, NULL, NULL, 0, NULL, XML_CHAR_ENCODING_UTF8);
 436                 if (!parser)
 437                 {
 438                         nlwarning("Creating html parser context failed");
 439                         return;
 440                 }
 441
 442                 htmlCtxtUseOptions(parser, HTML_PARSE_NOBLANKS | HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING | HTML_PARSE_NONET);
 443
 444                 // parser is little strict on tag order, so fix whats needed
 445                 patchHtmlQuirks(htmlString);
 446
 447                 htmlParseChunk(parser, htmlString.c_str(), htmlString.size(), 0);
 448                 htmlParseChunk(parser, "", 0, 1);
 449
 450                 if (parser->myDoc)
 451                 {
 452                         xmlNode *root = xmlDocGetRootElement(parser->myDoc);
 453                         if (root)
 454                         {
 455                                 parseNode(root, dom, styles, links);
 456                         }
 457                         else
 458                         {
 459                                 nlwarning("html root node failed");
 460                         }
 461                         xmlFreeDoc(parser->myDoc);
 462                 }
 463                 else
 464                 {
 465                         nlwarning("htmlstring parsing failed");
 466                 }
 467
 468                 htmlFreeParserCtxt(parser);
 469         }
 470
 471 }
 472
 473