1 // Ryzom - MMORPG Framework <http://dev.ryzom.com/projects/ryzom/>
2 // Copyright (C) 2010 Winch Gate Property Limited
4 // This program is free software: you can redistribute it and/or modify
5 // it under the terms of the GNU Affero General Public License as
6 // published by the Free Software Foundation, either version 3 of the
7 // License, or (at your option) any later version.
9 // This program is distributed in the hope that it will be useful,
10 // but WITHOUT ANY WARRANTY; without even the implied warranty of
11 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 // GNU Affero General Public License for more details.
14 // You should have received a copy of the GNU Affero General Public License
15 // along with this program. If not, see <http://www.gnu.org/licenses/>.
20 #include "nel/gui/html_parser.h"
23 #include <libxml/HTMLparser.h>
25 #include "nel/misc/types_nl.h"
26 #include "nel/gui/libwww.h"
27 #include "nel/gui/group_html.h"
30 using namespace NLMISC
;
39 // ***************************************************************************
40 void CHtmlParser::parseStyle(xmlNode
*a_node
, std::string
&styleString
) const
42 xmlNode
*node
= a_node
;
45 if (node
->type
== XML_CDATA_SECTION_NODE
)
47 styleString
+= (const char*)node
->content
;
51 nlwarning("<style> tag has child elements other than cdata[%d]", node
->type
);
58 // ***************************************************************************
59 // recursive function to walk html document
60 void CHtmlParser::parseNode(xmlNode
*a_node
, CHtmlElement
&parent
, std::vector
<std::string
> &styles
, std::vector
<StyleLink
> &links
) const
64 xmlNode
*node
= a_node
;
67 if (node
->type
== XML_TEXT_NODE
)
69 parent
.Children
.push_back(CHtmlElement(CHtmlElement::TEXT_NODE
, (const char*)(node
->content
)));
72 if (node
->type
== XML_ELEMENT_NODE
)
75 element_number
= htmlElementLookup((const char*)node
->name
);
77 // get pointer to previous sibling
78 CHtmlElement
*prevSibling
= NULL
;
79 if (!parent
.Children
.empty())
82 for(std::list
<CHtmlElement
>::reverse_iterator it
= parent
.Children
.rbegin(); it
!= parent
.Children
.rend(); ++it
)
84 if (it
->Type
== CHtmlElement::ELEMENT_NODE
)
92 parent
.Children
.push_back(CHtmlElement(CHtmlElement::ELEMENT_NODE
, toLowerAscii((const char*)node
->name
)));
93 CHtmlElement
&elm
= parent
.Children
.back();
94 elm
.ID
= element_number
;
96 elm
.childIndex
= childIndex
;
98 // previous/next sibling that is ELEMENT_NODE
99 elm
.previousSibling
= prevSibling
;
102 prevSibling
->nextSibling
= &parent
.Children
.back();
107 // TODO: harvest <link type="css">, <style>, <img>
109 elm
.Attributes
.clear();
111 for (xmlAttr
*cur_attr
= node
->properties
; cur_attr
; cur_attr
= cur_attr
->next
) {
112 std::string
key(toLowerAscii((const char *)(cur_attr
->name
)));
114 if (cur_attr
->children
)
116 value
= (const char *)(cur_attr
->children
->content
);
118 elm
.Attributes
[key
] = value
;
121 if (elm
.hasAttribute("class"))
123 std::vector
<std::string
> parts
;
124 NLMISC::splitString(elm
.getAttribute("class"), " ", parts
);
125 for(uint i
= 0; i
<parts
.size();++i
)
127 elm
.ClassNames
.insert(toLowerAscii(trim(parts
[i
])));
131 if (elm
.Value
== "style")
133 // <style type="text/css" media="all, screen">
136 bool useStyle
= true;
137 if (elm
.hasAttribute("media"))
139 std::string media
= trim(toLowerAscii(elm
.Attributes
["media"]));
140 useStyle
= media
.empty() || media
.find("all") != std::string::npos
|| media
.find("screen") != std::string::npos
;
142 // <style media="ryzom"> for ingame browser
143 useStyle
= useStyle
|| media
== "ryzom";
149 parseStyle(node
->children
, style
);
150 styles
.push_back(style
);
152 // style tag is kept in dom
154 if (elm
.Value
== "link" && elm
.getAttribute("rel") == "stylesheet")
156 bool useStyle
= true;
157 if (elm
.hasAttribute("media"))
159 std::string media
= trim(toLowerAscii(elm
.Attributes
["media"]));
160 useStyle
= media
.empty() || media
.find("all") != std::string::npos
|| media
.find("screen") != std::string::npos
;
162 // <style media="ryzom"> for ingame browser
163 useStyle
= useStyle
|| media
== "ryzom";
168 styles
.push_back("");
169 links
.push_back(StyleLink(styles
.size()-1, elm
.getAttribute("href")));
171 // link tag is kept in dom
173 else if (node
->children
)
175 parseNode(node
->children
, elm
, styles
, links
);
177 // must cleanup nested tags that libxml2 does not fix
178 // dt without end tag: <dl><dt><dt></dl>
179 // dd without end tag: <dl><dd><dd></dl>
180 if (!elm
.Children
.empty() && (elm
.Value
== "dt" || elm
.Value
== "dd"))
182 std::string tag
= elm
.Value
;
183 std::list
<CHtmlElement
>::iterator it
;
184 for(it
= elm
.Children
.begin(); it
!= elm
.Children
.end(); ++it
)
186 if (it
->Type
== CHtmlElement::ELEMENT_NODE
&& it
->Value
== tag
)
188 // relocate this and remaining elements over to parent
189 parent
.Children
.splice(parent
.Children
.end(), elm
.Children
, it
, elm
.Children
.end());
194 parent
.reindexChilds();
197 // move all <tr> directly under <table> to its own <tbody> ("table > tbody > tr" selector).
198 // TODO: move first real <thead> to front, move first real <tfoot> at the end
199 if (elm
.ID
== HTML_TABLE
)
201 std::list
<CHtmlElement
>::iterator it
= elm
.Children
.begin();
202 std::list
<CHtmlElement
>::iterator tbody
= elm
.Children
.end();
203 for(it
= elm
.Children
.begin(); it
!= elm
.Children
.end(); ++it
)
205 if (it
->ID
== HTML_TR
)
207 if (tbody
== elm
.Children
.end())
209 tbody
= elm
.Children
.insert(it
, CHtmlElement(CHtmlElement::ELEMENT_NODE
, "tbody"));
210 tbody
->ID
= HTML_TBODY
;
211 tbody
->parent
= &elm
;
213 tbody
->Children
.splice(tbody
->Children
.end(), elm
.Children
, it
);
216 else if (tbody
!= elm
.Children
.end())
218 tbody
->reindexChilds();
219 tbody
= elm
.Children
.end();
228 // move into next sibling
233 // ***************************************************************************
234 // http://stackoverflow.com/a/18335183
// Returns a copy of 'str' in which every byte sequence is well-formed UTF-8.
// Based on http://stackoverflow.com/a/18335183
//
// - ASCII control characters other than \t \n \r are dropped.
// - 0x7F and stray bytes 0x80-0x9F are dropped, except 0x80 which becomes
//   the euro sign and 0x85 (NEL) which becomes "\n\r".
// - Well-formed 2, 3 and 4 byte sequences are copied as is, except U+0080-U+009F
//   (0xC2 0x80-0x9F) which is dropped as control character.
// - Any other byte >= 0xA0 is taken as ISO-8859-1 and converted to 2 byte UTF-8.
//
// Overlong forms, UTF-16 surrogates and code points above U+10FFFF are
// not well-formed and go through the byte-wise fallback.
static std::string correctNonUtf8(const std::string &str)
{
	const size_t size = str.size();

	std::string to;
	to.reserve(size);

	for (size_t i = 0; i < size; ++i)
	{
		const unsigned char c = static_cast<unsigned char>(str[i]);

		if (c < 32)
		{
			//control char, allow only \t \n \r
			if (c == 9 || c == 10 || c == 13)
			{
				to.append(1, static_cast<char>(c));
			}
			continue;
		}

		if (c < 127)
		{
			//normal ASCII
			to.append(1, static_cast<char>(c));
			continue;
		}

		if (c < 160)
		{
			//control char (nothing should be defined here either ASCI, ISO_8859-1 or UTF8, so skipping)
			if (c == 128)
			{
				//fix microsoft mess, add euro
				to.append("\xE2\x82\xAC");
			}
			else if (c == 133)
			{
				//fix IBM mess, add NEL = \n\r
				to.append("\n\r");
			}
			continue;
		}

		if (c < 192)
		{
			//invalid for UTF8, converting ASCII
			to.append(1, static_cast<char>(194));
			to.append(1, static_cast<char>(c));
			continue;
		}

		if (c >= 194 && c < 245)
		{
			// number of continuation bytes announced by the lead byte
			const size_t extra = (c < 224) ? 1 : ((c < 240) ? 2 : 3);

			if (i + extra < size)
			{
				// allowed range of the first continuation byte; narrowed for lead
				// bytes that could otherwise encode overlong forms, surrogates or
				// code points above U+10FFFF
				unsigned char low = 0x80;
				unsigned char high = 0xBF;
				if (c == 0xE0) low = 0xA0;
				else if (c == 0xED) high = 0x9F;
				else if (c == 0xF0) low = 0x90;
				else if (c == 0xF4) high = 0x8F;

				const unsigned char c2 = static_cast<unsigned char>(str[i + 1]);
				bool valid = (c2 >= low && c2 <= high);
				for (size_t k = 2; valid && k <= extra; ++k)
				{
					const unsigned char cn = static_cast<unsigned char>(str[i + k]);
					valid = (cn > 127 && cn < 192);
				}

				if (valid)
				{
					// 0xC2 0x80-0x9F is control char, skipping
					if (!(c == 194 && c2 < 160))
					{
						to.append(str, i, extra + 1);
					}
					i += extra;
					continue;
				}
			}
		}

		//invalid UTF8, converting ASCII (c<194, c>=245, malformed or string too short for multi-byte)
		to.append(1, static_cast<char>(195));
		to.append(1, static_cast<char>(c - 64));
	}

	return to;
}
357 // ***************************************************************************
358 static void patchHtmlQuirks(std::string
&htmlString
)
360 size_t npos
= std::string::npos
;
363 // get rid of BOM (some ingame help files does not show up otherwise)
364 if (htmlString
.substr(0, 3) == "\xEF\xBB\xBF")
366 htmlString
.erase(0, 3);
369 // if any element is before <html>, then parser adds <html><body>
370 // and original tags are ignored (their attributes not processed)
372 // only fix situation when there is <body> tag with attributes
374 // tags are considered to be lowercase
376 pos
= htmlString
.find("<body ");
379 size_t start
= htmlString
.find("<");
380 // skip <!doctype html>
381 if (htmlString
.substr(start
, 2) == "<!")
382 start
= htmlString
.find("<", start
+ 1);
384 // if there is no html tag, then abort
385 size_t end
= htmlString
.find("<html>");
386 if (end
!= npos
&& start
< end
&& end
< pos
)
388 // body tag end position
389 size_t insert
= htmlString
.find(">", pos
);
392 std::string str
= htmlString
.substr(start
, end
- start
);
393 htmlString
.insert(insert
+1, str
);
394 htmlString
.erase(start
, str
.size());
399 // make sure </html> (if present) is last in document or tags coming after it are ignored
400 pos
= htmlString
.find("</html>");
401 if (pos
!= npos
&& htmlString
.find("<", pos
+1) > pos
)
403 htmlString
.erase(pos
, 7);
404 htmlString
+= "</html>";
407 // if there is invalid utf-8 chars, then libxml will break everything after first it finds.
408 htmlString
= correctNonUtf8(htmlString
);
411 // ***************************************************************************
412 void CHtmlParser::getDOM(std::string htmlString
, CHtmlElement
&dom
, std::vector
<std::string
> &styles
, std::vector
<StyleLink
> &links
) const
414 htmlParserCtxtPtr parser
= htmlCreatePushParserCtxt(NULL
, NULL
, NULL
, 0, NULL
, XML_CHAR_ENCODING_UTF8
);
417 nlwarning("Creating html parser context failed");
421 htmlCtxtUseOptions(parser
, HTML_PARSE_NOBLANKS
| HTML_PARSE_NOERROR
| HTML_PARSE_NOWARNING
| HTML_PARSE_NONET
);
423 // parser is little strict on tag order, so fix whats needed
424 patchHtmlQuirks(htmlString
);
426 htmlParseChunk(parser
, htmlString
.c_str(), htmlString
.size(), 0);
427 htmlParseChunk(parser
, "", 0, 1);
431 xmlNode
*root
= xmlDocGetRootElement(parser
->myDoc
);
434 parseNode(root
, dom
, styles
, links
);
438 nlwarning("html root node failed");
440 xmlFreeDoc(parser
->myDoc
);
444 nlwarning("htmlstring parsing failed");
447 htmlFreeParserCtxt(parser
);