Merge branch 'main/rendor-staging' into fixes
[ryzomcore.git] / nel / src / gui / html_parser.cpp
blob2425d7f9545851e9802370577a9d612613bb8af8
1 // Ryzom - MMORPG Framework <http://dev.ryzom.com/projects/ryzom/>
2 // Copyright (C) 2010-2021 Winch Gate Property Limited
3 //
4 // This source file has been modified by the following contributors:
5 // Copyright (C) 2020 Jan BOON (Kaetemi) <jan.boon@kaetemi.be>
6 //
7 // This program is free software: you can redistribute it and/or modify
8 // it under the terms of the GNU Affero General Public License as
9 // published by the Free Software Foundation, either version 3 of the
10 // License, or (at your option) any later version.
12 // This program is distributed in the hope that it will be useful,
13 // but WITHOUT ANY WARRANTY; without even the implied warranty of
14 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 // GNU Affero General Public License for more details.
17 // You should have received a copy of the GNU Affero General Public License
18 // along with this program. If not, see <http://www.gnu.org/licenses/>.
21 #include "stdpch.h"
23 #include "nel/gui/html_parser.h"
25 #include <string>
26 #include <libxml/HTMLparser.h>
28 #include "nel/misc/types_nl.h"
29 #include "nel/gui/libwww.h"
30 #include "nel/gui/group_html.h"
32 using namespace std;
33 using namespace NLMISC;
35 #ifdef DEBUG_NEW
36 #define new DEBUG_NEW
37 #endif
39 namespace NLGUI
42 // ***************************************************************************
43 void CHtmlParser::parseStyle(xmlNode *a_node, std::string &styleString) const
45 xmlNode *node = a_node;
46 while(node)
48 if (node->type == XML_CDATA_SECTION_NODE)
50 styleString += (const char*)node->content;
52 else
54 nlwarning("<style> tag has child elements other than cdata[%d]", node->type);
57 node = node->next;
61 // ***************************************************************************
62 // recursive function to walk html document
63 void CHtmlParser::parseNode(xmlNode *a_node, CHtmlElement &parent, std::vector<std::string> &styles, std::vector<StyleLink> &links) const
65 uint childIndex = 0;
66 uint element_number;
67 xmlNode *node = a_node;
68 while(node)
70 if (node->type == XML_TEXT_NODE)
72 // linebreak right after pre,textare open tag should be removed
73 if (parent.Children.empty() && (*node->content == '\n') && (parent.ID == HTML_PRE || parent.ID == HTML_TEXTAREA))
75 parent.Children.push_back(CHtmlElement(CHtmlElement::TEXT_NODE, (const char*)(node->content) + 1));
77 else
79 parent.Children.push_back(CHtmlElement(CHtmlElement::TEXT_NODE, (const char*)(node->content)));
82 else
83 if (node->type == XML_ELEMENT_NODE)
85 // find html element
86 element_number = htmlElementLookup((const char*)node->name);
88 // get pointer to previous sibling
89 CHtmlElement *prevSibling = NULL;
90 if (!parent.Children.empty())
92 // skip text nodes
93 for(std::list<CHtmlElement>::reverse_iterator it = parent.Children.rbegin(); it != parent.Children.rend(); ++it)
95 if (it->Type == CHtmlElement::ELEMENT_NODE)
97 prevSibling = &(*it);
98 break;
103 parent.Children.push_back(CHtmlElement(CHtmlElement::ELEMENT_NODE, toLowerAscii((const char*)node->name)));
104 CHtmlElement &elm = parent.Children.back();
105 elm.ID = element_number;
106 elm.parent = &parent;
107 elm.childIndex = childIndex;
109 // previous/next sibling that is ELEMENT_NODE
110 elm.previousSibling = prevSibling;
111 if (prevSibling)
113 prevSibling->nextSibling = &parent.Children.back();
116 childIndex++;
118 // TODO: harvest <link type="css">, <style>, <img>
120 elm.Attributes.clear();
122 for (xmlAttr *cur_attr = node->properties; cur_attr; cur_attr = cur_attr->next) {
123 std::string key(toLowerAscii((const char *)(cur_attr->name)));
124 std::string value;
125 if (cur_attr->children)
127 value = (const char *)(cur_attr->children->content);
129 elm.Attributes[key] = value;
132 if (elm.hasAttribute("class"))
134 std::vector<std::string> parts;
135 NLMISC::splitString(elm.getAttribute("class"), " ", parts);
136 for(uint i = 0; i<parts.size();++i)
138 elm.ClassNames.insert(toLowerAscii(trim(parts[i])));
142 if (elm.Value == "style")
144 // <style type="text/css" media="all, screen">
145 // ...
146 // </style>
147 bool useStyle = true;
148 if (elm.hasAttribute("media"))
150 std::string media = trim(toLowerAscii(elm.Attributes["media"]));
151 useStyle = media.empty() || media.find("all") != std::string::npos || media.find("screen") != std::string::npos;
153 // <style media="ryzom"> for ingame browser
154 useStyle = useStyle || media == "ryzom";
157 if (useStyle)
159 std::string style;
160 parseStyle(node->children, style);
161 styles.push_back(style);
163 // style tag is kept in dom
165 if (elm.Value == "link" && elm.getAttribute("rel") == "stylesheet")
167 bool useStyle = true;
168 if (elm.hasAttribute("media"))
170 std::string media = trim(toLowerAscii(elm.Attributes["media"]));
171 useStyle = media.empty() || media.find("all") != std::string::npos || media.find("screen") != std::string::npos;
173 // <style media="ryzom"> for ingame browser
174 useStyle = useStyle || media == "ryzom";
177 if (useStyle)
179 styles.push_back("");
180 links.push_back(StyleLink(styles.size()-1, elm.getAttribute("href")));
182 // link tag is kept in dom
184 else if (node->children)
186 parseNode(node->children, elm, styles, links);
188 if (!elm.Children.empty() && elm.ID == HTML_PRE && elm.Children.back().Type == CHtmlElement::TEXT_NODE)
190 std::string::size_type size = elm.Children.back().Value.size();
191 // strip last '\n' from non-empty line
192 if (size > 1 && elm.Children.back().Value[size-1] == '\n')
194 elm.Children.back().Value = elm.Children.back().Value.substr(0, size - 1);
198 // must cleanup nested tags that libxml2 does not fix
199 // dt without end tag: <dl><dt><dt></dl>
200 // dd without end tag: <dl><dd><dd></dl>
201 if (!elm.Children.empty() && (elm.Value == "dt" || elm.Value == "dd"))
203 std::string tag = elm.Value;
204 std::list<CHtmlElement>::iterator it;
205 for(it = elm.Children.begin(); it != elm.Children.end(); ++it)
207 if (it->Type == CHtmlElement::ELEMENT_NODE && it->Value == tag)
209 // relocate this and remaining elements over to parent
210 parent.Children.splice(parent.Children.end(), elm.Children, it, elm.Children.end());
211 break;
214 elm.reindexChilds();
215 parent.reindexChilds();
218 // move all <tr> directly under <table> to its own <tbody> ("table > tbody > tr" selector).
219 // TODO: move first real <thead> to front, move first real <tfoot> at the end
220 if (elm.ID == HTML_TABLE)
222 std::list<CHtmlElement>::iterator it = elm.Children.begin();
223 std::list<CHtmlElement>::iterator tbody = elm.Children.end();
224 for(it = elm.Children.begin(); it != elm.Children.end(); ++it)
226 if (it->ID == HTML_TR)
228 if (tbody == elm.Children.end())
230 tbody = elm.Children.insert(it, CHtmlElement(CHtmlElement::ELEMENT_NODE, "tbody"));
231 tbody->ID = HTML_TBODY;
232 tbody->parent = &elm;
234 tbody->Children.splice(tbody->Children.end(), elm.Children, it);
235 it = tbody;
237 else if (tbody != elm.Children.end())
239 tbody->reindexChilds();
240 tbody = elm.Children.end();
244 elm.reindexChilds();
249 // move into next sibling
250 node = node->next;
254 // ***************************************************************************
255 // http://stackoverflow.com/a/18335183
// True for a UTF-8 continuation byte (10xxxxxx, i.e. 0x80..0xBF).
static bool isUtf8Continuation(unsigned char c)
{
	return c > 127 && c < 192;
}

// Returns a copy of str in which every byte sequence that is not well-formed
// UTF-8 has been replaced or dropped, so the result is always valid UTF-8.
//
// - control chars below 32 are dropped, except \t \n \r
// - 127 and 129..159 are dropped; 128 becomes the euro sign, 133 becomes "\n\r"
// - well-formed 2/3/4 byte sequences are copied, except U+0080..U+009F
//   (C1 controls, encoded as C2 80..C2 9F) which are dropped
// - overlong 3/4 byte forms, UTF-16 surrogates (ED A0..BF ..) and code points
//   above U+10FFFF (F4 90.. and up) are NOT treated as valid
// - any other byte >= 160 is treated as ISO-8859-1 and converted to its
//   2 byte UTF-8 form
//
// Based on http://stackoverflow.com/a/18335183
static std::string correctNonUtf8(const std::string &str)
{
	const std::string::size_type size = str.size();
	std::string to;
	to.reserve(size);

	for (std::string::size_type i = 0; i < size; ++i)
	{
		const unsigned char c = (unsigned char)(str[i]);
		if (c < 32)
		{
			//control char, allow only \t \n \r
			if (c == 9 || c == 10 || c == 13)
			{
				to.append(1, (char)c);
			}
			continue;
		}
		else if (c < 127)
		{
			//normal ASCII
			to.append(1, (char)c);
			continue;
		}
		else if (c < 160)
		{
			//control char (nothing should be defined here either ASCII, ISO_8859-1 or UTF8, so skipping)
			if (c == 128)
			{
				//fix microsoft mess, add euro (U+20AC)
				to.append("\xE2\x82\xAC");
			}
			if (c == 133)
			{
				//fix IBM mess, add NEL = \n\r
				to.append("\n\r");
			}
			continue;
		}
		else if (c < 192)
		{
			//invalid for UTF8 as a lead byte, converting from ISO-8859-1 (U+00A0..U+00BF)
			to.append(1, '\xC2');
			to.append(1, (char)c);
			continue;
		}
		else if (c < 194)
		{
			//192/193 can only start overlong sequences; handled by the ISO-8859-1 fallback below
		}
		else if (c < 224)
		{
			//possibly 2byte UTF8
			if (i + 1 < size)
			{
				const unsigned char c2 = (unsigned char)(str[i+1]);
				if (isUtf8Continuation(c2))
				{
					//valid 2byte UTF8; C2 80..C2 9F are control chars and are skipped
					if (!(c == 194 && c2 < 160))
					{
						to.append(1, (char)c);
						to.append(1, (char)c2);
					}
					i++;
					continue;
				}
			}
		}
		else if (c < 240)
		{
			//possibly 3byte UTF8
			if (i + 2 < size)
			{
				const unsigned char c2 = (unsigned char)(str[i+1]);
				const unsigned char c3 = (unsigned char)(str[i+2]);
				// E0 80..9F is an overlong form, ED A0..BF encodes a UTF-16 surrogate
				const bool overlong = (c == 224 && c2 < 160);
				const bool surrogate = (c == 237 && c2 > 159);
				if (isUtf8Continuation(c2) && isUtf8Continuation(c3) && !overlong && !surrogate)
				{
					//valid 3byte UTF8
					to.append(1, (char)c);
					to.append(1, (char)c2);
					to.append(1, (char)c3);
					i += 2;
					continue;
				}
			}
		}
		else if (c < 245)
		{
			//possibly 4byte UTF8
			if (i + 3 < size)
			{
				const unsigned char c2 = (unsigned char)(str[i+1]);
				const unsigned char c3 = (unsigned char)(str[i+2]);
				const unsigned char c4 = (unsigned char)(str[i+3]);
				// F0 80..8F is an overlong form, F4 90..BF is above U+10FFFF
				const bool overlong = (c == 240 && c2 < 144);
				const bool outOfRange = (c == 244 && c2 > 143);
				if (isUtf8Continuation(c2) && isUtf8Continuation(c3) && isUtf8Continuation(c4) && !overlong && !outOfRange)
				{
					//valid 4byte UTF8
					to.append(1, (char)c);
					to.append(1, (char)c2);
					to.append(1, (char)c3);
					to.append(1, (char)c4);
					i += 3;
					continue;
				}
			}
		}

		//invalid UTF8 (bad lead byte, bad continuation or string too short):
		//treat the byte as ISO-8859-1 (U+00C0..U+00FF) and emit its 2 byte UTF-8 form
		to.append(1, '\xC3');
		to.append(1, (char)(c - 64));
	}
	return to;
}
378 // ***************************************************************************
379 static void patchHtmlQuirks(std::string &htmlString)
381 size_t npos = std::string::npos;
382 size_t pos;
384 // get rid of BOM (some ingame help files does not show up otherwise)
385 if (htmlString.substr(0, 3) == "\xEF\xBB\xBF")
387 htmlString.erase(0, 3);
390 // if any element is before <html>, then parser adds <html><body>
391 // and original tags are ignored (their attributes not processed)
393 // only fix situation when there is <body> tag with attributes
395 // tags are considered to be lowercase
397 pos = htmlString.find("<body ");
398 if (pos != npos)
400 size_t start = htmlString.find("<");
401 // skip <!doctype html>
402 if (htmlString.substr(start, 2) == "<!")
403 start = htmlString.find("<", start + 1);
405 // if there is no html tag, then abort
406 size_t end = htmlString.find("<html>");
407 if (end != npos && start < end && end < pos)
409 // body tag end position
410 size_t insert = htmlString.find(">", pos);
411 if (insert != npos)
413 std::string str = htmlString.substr(start, end - start);
414 htmlString.insert(insert+1, str);
415 htmlString.erase(start, str.size());
420 // make sure </html> (if present) is last in document or tags coming after it are ignored
421 pos = htmlString.find("</html>");
422 if (pos != npos && htmlString.find("<", pos+1) > pos)
424 htmlString.erase(pos, 7);
425 htmlString += "</html>";
428 // if there is invalid utf-8 chars, then libxml will break everything after first it finds.
429 htmlString = correctNonUtf8(htmlString);
432 // ***************************************************************************
433 void CHtmlParser::getDOM(std::string htmlString, CHtmlElement &dom, std::vector<std::string> &styles, std::vector<StyleLink> &links) const
435 htmlParserCtxtPtr parser = htmlCreatePushParserCtxt(NULL, NULL, NULL, 0, NULL, XML_CHAR_ENCODING_UTF8);
436 if (!parser)
438 nlwarning("Creating html parser context failed");
439 return;
442 htmlCtxtUseOptions(parser, HTML_PARSE_NOBLANKS | HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING | HTML_PARSE_NONET);
444 // parser is little strict on tag order, so fix whats needed
445 patchHtmlQuirks(htmlString);
447 htmlParseChunk(parser, htmlString.c_str(), htmlString.size(), 0);
448 htmlParseChunk(parser, "", 0, 1);
450 if (parser->myDoc)
452 xmlNode *root = xmlDocGetRootElement(parser->myDoc);
453 if (root)
455 parseNode(root, dom, styles, links);
457 else
459 nlwarning("html root node failed");
461 xmlFreeDoc(parser->myDoc);
463 else
465 nlwarning("htmlstring parsing failed");
468 htmlFreeParserCtxt(parser);