1 // Ryzom - MMORPG Framework <http://dev.ryzom.com/projects/ryzom/>
2 // Copyright (C) 2010-2021 Winch Gate Property Limited
4 // This source file has been modified by the following contributors:
5 // Copyright (C) 2020 Jan BOON (Kaetemi) <jan.boon@kaetemi.be>
7 // This program is free software: you can redistribute it and/or modify
8 // it under the terms of the GNU Affero General Public License as
9 // published by the Free Software Foundation, either version 3 of the
10 // License, or (at your option) any later version.
12 // This program is distributed in the hope that it will be useful,
13 // but WITHOUT ANY WARRANTY; without even the implied warranty of
14 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 // GNU Affero General Public License for more details.
17 // You should have received a copy of the GNU Affero General Public License
18 // along with this program. If not, see <http://www.gnu.org/licenses/>.
23 #include "nel/gui/html_parser.h"
26 #include <libxml/HTMLparser.h>
28 #include "nel/misc/types_nl.h"
29 #include "nel/gui/libwww.h"
30 #include "nel/gui/group_html.h"
33 using namespace NLMISC
;
42 // ***************************************************************************
// Collects the stylesheet text of a <style> element.
//   a_node      - node to inspect; parseNode() passes the <style> element's node->children
//   styleString - output; the content of a CDATA section node is appended, existing text is kept
// Only XML_CDATA_SECTION_NODE contributes text. The nlwarning() below reports the libxml2
// type id of a node that is something else (per its message text).
43 void CHtmlParser::parseStyle(xmlNode
*a_node
, std::string
&styleString
) const
45 xmlNode
*node
= a_node
;
48 if (node
->type
== XML_CDATA_SECTION_NODE
)
// xmlChar* content is reinterpreted as char* and appended unchanged
50 styleString
+= (const char*)node
->content
;
54 nlwarning("<style> tag has child elements other than cdata[%d]", node
->type
);
61 // ***************************************************************************
62 // recursive function to walk html document
// Converts libxml2 nodes into CHtmlElement children of `parent`.
//   a_node - libxml2 node to convert
//   parent - element that receives the converted nodes in parent.Children
//   styles - receives the text of accepted <style> elements; an empty placeholder entry
//            is pushed for every accepted <link rel="stylesheet">
//   links  - receives StyleLink(index of the placeholder in styles, href attribute)
// Element nodes with children are converted by calling parseNode() again with the new
// element as parent.
63 void CHtmlParser::parseNode(xmlNode
*a_node
, CHtmlElement
&parent
, std::vector
<std::string
> &styles
, std::vector
<StyleLink
> &links
) const
67 xmlNode
*node
= a_node
;
70 if (node
->type
== XML_TEXT_NODE
)
72 // linebreak right after pre,textare open tag should be removed
// applies only when this text is the first child and its first byte is '\n'
73 if (parent
.Children
.empty() && (*node
->content
== '\n') && (parent
.ID
== HTML_PRE
|| parent
.ID
== HTML_TEXTAREA
))
// "+ 1" skips the leading '\n' byte
75 parent
.Children
.push_back(CHtmlElement(CHtmlElement::TEXT_NODE
, (const char*)(node
->content
) + 1));
// text stored unchanged
79 parent
.Children
.push_back(CHtmlElement(CHtmlElement::TEXT_NODE
, (const char*)(node
->content
)));
83 if (node
->type
== XML_ELEMENT_NODE
)
// numeric element id resolved from the tag name; stored in elm.ID below
86 element_number
= htmlElementLookup((const char*)node
->name
);
88 // get pointer to previous sibling
89 CHtmlElement
*prevSibling
= NULL
;
90 if (!parent
.Children
.empty())
// walk backwards over already converted children looking for an ELEMENT_NODE
93 for(std::list
<CHtmlElement
>::reverse_iterator it
= parent
.Children
.rbegin(); it
!= parent
.Children
.rend(); ++it
)
95 if (it
->Type
== CHtmlElement::ELEMENT_NODE
)
// the new element is appended first and then configured through a reference to it;
// the tag name is stored lowercased
103 parent
.Children
.push_back(CHtmlElement(CHtmlElement::ELEMENT_NODE
, toLowerAscii((const char*)node
->name
)));
104 CHtmlElement
&elm
= parent
.Children
.back();
105 elm
.ID
= element_number
;
106 elm
.parent
= &parent
;
// NOTE(review): childIndex is declared and updated outside the lines visible here
107 elm
.childIndex
= childIndex
;
109 // previous/next sibling that is ELEMENT_NODE
110 elm
.previousSibling
= prevSibling
;
// back-link: the previous element sibling points forward to the element just added
113 prevSibling
->nextSibling
= &parent
.Children
.back();
118 // TODO: harvest <link type="css">, <style>, <img>
120 elm
.Attributes
.clear();
// copy attributes; names are lowercased, values are taken as-is
122 for (xmlAttr
*cur_attr
= node
->properties
; cur_attr
; cur_attr
= cur_attr
->next
) {
123 std::string
key(toLowerAscii((const char *)(cur_attr
->name
)));
// the value is read from the attribute's first child node only
125 if (cur_attr
->children
)
127 value
= (const char *)(cur_attr
->children
->content
);
129 elm
.Attributes
[key
] = value
;
// class attribute: split on a single space, each part trimmed and lowercased into ClassNames
132 if (elm
.hasAttribute("class"))
134 std::vector
<std::string
> parts
;
135 NLMISC::splitString(elm
.getAttribute("class"), " ", parts
);
136 for(uint i
= 0; i
<parts
.size();++i
)
138 elm
.ClassNames
.insert(toLowerAscii(trim(parts
[i
])));
142 if (elm
.Value
== "style")
144 // <style type="text/css" media="all, screen">
// without a media attribute the style is used
147 bool useStyle
= true;
148 if (elm
.hasAttribute("media"))
// accepted when media is empty or contains the substring "all" or "screen"
150 std::string media
= trim(toLowerAscii(elm
.Attributes
["media"]));
151 useStyle
= media
.empty() || media
.find("all") != std::string::npos
|| media
.find("screen") != std::string::npos
;
153 // <style media="ryzom"> for ingame browser
154 useStyle
= useStyle
|| media
== "ryzom";
// gather the CDATA text below <style> and store it as a new entry in styles
160 parseStyle(node
->children
, style
);
161 styles
.push_back(style
);
163 // style tag is kept in dom
165 if (elm
.Value
== "link" && elm
.getAttribute("rel") == "stylesheet")
// same media filter as used for <style> above
167 bool useStyle
= true;
168 if (elm
.hasAttribute("media"))
170 std::string media
= trim(toLowerAscii(elm
.Attributes
["media"]));
171 useStyle
= media
.empty() || media
.find("all") != std::string::npos
|| media
.find("screen") != std::string::npos
;
173 // <style media="ryzom"> for ingame browser
174 useStyle
= useStyle
|| media
== "ryzom";
// an empty entry in styles marks the position of the linked stylesheet;
// the StyleLink records that index together with the href
179 styles
.push_back("");
180 links
.push_back(StyleLink(styles
.size()-1, elm
.getAttribute("href")));
182 // link tag is kept in dom
// NOTE(review): this else appears to pair with the <link rel="stylesheet"> test above,
// so children of such a <link> would not be walked - confirm
184 else if (node
->children
)
// recurse: the new element becomes the parent for its own children
186 parseNode(node
->children
, elm
, styles
, links
);
// <pre> whose last child is text: drop one trailing '\n'
188 if (!elm
.Children
.empty() && elm
.ID
== HTML_PRE
&& elm
.Children
.back().Type
== CHtmlElement::TEXT_NODE
)
190 std::string::size_type size
= elm
.Children
.back().Value
.size();
191 // strip last '\n' from non-empty line
// size > 1 keeps a text that consists of a single '\n' untouched
192 if (size
> 1 && elm
.Children
.back().Value
[size
-1] == '\n')
194 elm
.Children
.back().Value
= elm
.Children
.back().Value
.substr(0, size
- 1);
198 // must cleanup nested tags that libxml2 does not fix
199 // dt without end tag: <dl><dt><dt></dl>
200 // dd without end tag: <dl><dd><dd></dl>
201 if (!elm
.Children
.empty() && (elm
.Value
== "dt" || elm
.Value
== "dd"))
203 std::string tag
= elm
.Value
;
204 std::list
<CHtmlElement
>::iterator it
;
// look for a child element with the same tag name as elm itself
205 for(it
= elm
.Children
.begin(); it
!= elm
.Children
.end(); ++it
)
207 if (it
->Type
== CHtmlElement::ELEMENT_NODE
&& it
->Value
== tag
)
209 // relocate this and remaining elements over to parent
// std::list::splice moves the list nodes from elm.Children to the end of
// parent.Children without copying the elements
210 parent
.Children
.splice(parent
.Children
.end(), elm
.Children
, it
, elm
.Children
.end());
215 parent
.reindexChilds();
218 // move all <tr> directly under <table> to its own <tbody> ("table > tbody > tr" selector).
219 // TODO: move first real <thead> to front, move first real <tfoot> at the end
220 if (elm
.ID
== HTML_TABLE
)
// tbody == elm.Children.end() means "no generated <tbody> is currently open"
222 std::list
<CHtmlElement
>::iterator it
= elm
.Children
.begin();
223 std::list
<CHtmlElement
>::iterator tbody
= elm
.Children
.end();
224 for(it
= elm
.Children
.begin(); it
!= elm
.Children
.end(); ++it
)
226 if (it
->ID
== HTML_TR
)
228 if (tbody
== elm
.Children
.end())
// first <tr> of a run: create the wrapping <tbody> right before it
230 tbody
= elm
.Children
.insert(it
, CHtmlElement(CHtmlElement::ELEMENT_NODE
, "tbody"));
231 tbody
->ID
= HTML_TBODY
;
232 tbody
->parent
= &elm
;
// move this single <tr> from the table into the generated <tbody>
234 tbody
->Children
.splice(tbody
->Children
.end(), elm
.Children
, it
);
// a child that is not <tr> closes the currently open generated <tbody>
237 else if (tbody
!= elm
.Children
.end())
239 tbody
->reindexChilds();
240 tbody
= elm
.Children
.end();
249 // move into next sibling
254 // ***************************************************************************
255 // http://stackoverflow.com/a/18335183
// Scans str byte by byte and appends bytes to `to`; patchHtmlQuirks() assigns the returned
// string back to the document because invalid UTF-8 breaks libxml parsing.
//   str - input bytes, not modified
// NOTE(review): the declaration of `to` and the return statement are outside the lines
// visible here.
256 static std::string
correctNonUtf8(const std::string
&str
)
// str.size() is stored in an int, so the loop index and bound are signed
258 int i
, f_size
=str
.size();
// c is the current byte, c2..c4 the following bytes of a possible multi-byte sequence
259 unsigned char c
,c2
,c3
,c4
;
263 for(i
=0 ; i
<f_size
; i
++)
265 c
=(unsigned char)(str
[i
]);
// 9, 10 and 13 are tab, line feed and carriage return
269 if(c
==9 || c
==10 || c
==13)
271 //allow only \t \n \r
284 //control char (nothing should be defined here either ASCI, ISO_8859-1 or UTF8, so skipping)
287 //fix microsoft mess, add euro
295 //fix IBM mess, add NEL = \n\r
303 //invalid for UTF8, converting ASCII
// 194 is 0xC2, a lead byte of a 2-byte UTF-8 sequence
304 to
.append(1,(unsigned char)194);
310 //invalid for UTF8, converting ASCII
// 195 is 0xC3, a lead byte of a 2-byte UTF-8 sequence
311 to
.append(1,(unsigned char)195);
// "i + 1 < f_size" guarantees that the byte read as c2 exists
315 else if (c
< 224 && i
+ 1 < f_size
)
317 //possibly 2byte UTF8
318 c2
= (unsigned char)(str
[i
+1]);
// 128..191 is the value range of a UTF-8 continuation byte
320 if (c2
> 127 && c2
< 192)
// 0xC2 followed by 0x80..0x9F encodes U+0080..U+009F
323 if (c
== 194 && c2
< 160)
325 //control char, skipping
// "i + 2 < f_size" guarantees that c2 and c3 exist
337 else if (c
< 240 && i
+ 2 < f_size
)
339 // possibly 3byte UTF8
340 c2
= (unsigned char)(str
[i
+1]);
341 c3
= (unsigned char)(str
[i
+2]);
// both trailing bytes must be continuation bytes
343 if (c2
> 127 && c2
< 192 && c3
> 127 && c3
< 192)
// "i + 3 < f_size" guarantees that c2, c3 and c4 exist
353 else if (c
< 245 && i
+ 3 < f_size
)
355 //possibly 4byte UTF8
356 c2
= (unsigned char)(str
[i
+1]);
357 c3
= (unsigned char)(str
[i
+2]);
358 c4
= (unsigned char)(str
[i
+3]);
// all three trailing bytes must be continuation bytes
359 if (c2
> 127 && c2
< 192 && c3
> 127 && c3
< 192 && c4
> 127 && c4
< 192)
371 //invalid UTF8, converting ASCII (c>245 || string too short for multi-byte))
372 to
.append(1,(unsigned char)195);
378 // ***************************************************************************
// Rewrites htmlString in place before it is handed to the libxml2 HTML parser:
// drops a UTF-8 byte order mark, moves content found before <html> to just after the
// <body ...> open tag, moves </html> to the end of the document and finally passes the
// text through correctNonUtf8().
//   htmlString - document text, modified in place
379 static void patchHtmlQuirks(std::string
&htmlString
)
// local shorthand for std::string::npos
381 size_t npos
= std::string::npos
;
384 // get rid of BOM (some ingame help files does not show up otherwise)
// EF BB BF is the UTF-8 encoding of the byte order mark
385 if (htmlString
.substr(0, 3) == "\xEF\xBB\xBF")
387 htmlString
.erase(0, 3);
390 // if any element is before <html>, then parser adds <html><body>
391 // and original tags are ignored (their attributes not processed)
393 // only fix situation when there is <body> tag with attributes
395 // tags are considered to be lowercase
// "<body " with the trailing space only matches a body tag that is followed by a space
397 pos
= htmlString
.find("<body ");
// start: first '<' in the document
400 size_t start
= htmlString
.find("<");
401 // skip <!doctype html>
402 if (htmlString
.substr(start
, 2) == "<!")
403 start
= htmlString
.find("<", start
+ 1);
405 // if there is no html tag, then abort
// the literal "<html>" only matches an html tag without attributes
406 size_t end
= htmlString
.find("<html>");
// proceed only when something starts before <html> and <html> comes before <body ...>
407 if (end
!= npos
&& start
< end
&& end
< pos
)
409 // body tag end position
410 size_t insert
= htmlString
.find(">", pos
);
// str is the text between the first tag and <html>; it is copied to just after the
// body open tag and then removed from its old place. start < insert, so the insert
// does not shift the range that is erased afterwards.
413 std::string str
= htmlString
.substr(start
, end
- start
);
414 htmlString
.insert(insert
+1, str
);
415 htmlString
.erase(start
, str
.size());
420 // make sure </html> (if present) is last in document or tags coming after it are ignored
421 pos
= htmlString
.find("</html>");
// NOTE(review): find("<", pos+1) yields either npos or an index >= pos+1, both of which
// are greater than pos, so the second test is true whenever </html> was found
422 if (pos
!= npos
&& htmlString
.find("<", pos
+1) > pos
)
// 7 is the length of "</html>"
424 htmlString
.erase(pos
, 7);
425 htmlString
+= "</html>";
428 // if there is invalid utf-8 chars, then libxml will break everything after first it finds.
429 htmlString
= correctNonUtf8(htmlString
);
432 // ***************************************************************************
// Parses an HTML document with the libxml2 push parser and converts the resulting tree
// through parseNode().
//   htmlString - document text; taken by value, the local copy is modified by patchHtmlQuirks()
//   dom        - passed to parseNode() as the parent that receives the converted root element
//   styles     - forwarded to parseNode()
//   links      - forwarded to parseNode()
// Problems are reported through nlwarning() only; the function has no return value.
433 void CHtmlParser::getDOM(std::string htmlString
, CHtmlElement
&dom
, std::vector
<std::string
> &styles
, std::vector
<StyleLink
> &links
) const
// push parser without SAX handler, user data, initial chunk or filename; input is UTF-8
435 htmlParserCtxtPtr parser
= htmlCreatePushParserCtxt(NULL
, NULL
, NULL
, 0, NULL
, XML_CHAR_ENCODING_UTF8
);
438 nlwarning("Creating html parser context failed");
// drop blank text nodes, keep libxml2 error/warning output silent, forbid network access
442 htmlCtxtUseOptions(parser
, HTML_PARSE_NOBLANKS
| HTML_PARSE_NOERROR
| HTML_PARSE_NOWARNING
| HTML_PARSE_NONET
);
444 // parser is little strict on tag order, so fix whats needed
445 patchHtmlQuirks(htmlString
);
// the whole document is fed as one chunk (terminate = 0) ...
447 htmlParseChunk(parser
, htmlString
.c_str(), htmlString
.size(), 0);
// ... followed by an empty chunk with terminate = 1 to finish parsing
448 htmlParseChunk(parser
, "", 0, 1);
452 xmlNode
*root
= xmlDocGetRootElement(parser
->myDoc
);
455 parseNode(root
, dom
, styles
, links
);
459 nlwarning("html root node failed");
// the document is released separately from the parser context
461 xmlFreeDoc(parser
->myDoc
);
465 nlwarning("htmlstring parsing failed");
468 htmlFreeParserCtxt(parser
);