Fix css style order when using external css files
[ryzomcore.git] / nel / src / gui / html_parser.cpp
blobbcb83bf8e5deac4fe9a72da0c193a8c7e9febfd6
1 // Ryzom - MMORPG Framework <http://dev.ryzom.com/projects/ryzom/>
2 // Copyright (C) 2010 Winch Gate Property Limited
3 //
4 // This program is free software: you can redistribute it and/or modify
5 // it under the terms of the GNU Affero General Public License as
6 // published by the Free Software Foundation, either version 3 of the
7 // License, or (at your option) any later version.
8 //
9 // This program is distributed in the hope that it will be useful,
10 // but WITHOUT ANY WARRANTY; without even the implied warranty of
11 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 // GNU Affero General Public License for more details.
14 // You should have received a copy of the GNU Affero General Public License
15 // along with this program. If not, see <http://www.gnu.org/licenses/>.
18 #include "stdpch.h"
20 #include "nel/gui/html_parser.h"
22 #include <string>
23 #include <libxml/HTMLparser.h>
25 #include "nel/misc/types_nl.h"
26 #include "nel/gui/libwww.h"
27 #include "nel/gui/group_html.h"
29 using namespace std;
30 using namespace NLMISC;
32 #ifdef DEBUG_NEW
33 #define new DEBUG_NEW
34 #endif
36 namespace NLGUI
39 // ***************************************************************************
40 void CHtmlParser::parseStyle(xmlNode *a_node, std::string &styleString) const
42 xmlNode *node = a_node;
43 while(node)
45 if (node->type == XML_CDATA_SECTION_NODE)
47 styleString += (const char*)node->content;
49 else
51 nlwarning("<style> tag has child elements other than cdata[%d]", node->type);
54 node = node->next;
58 // ***************************************************************************
59 // recursive function to walk html document
60 void CHtmlParser::parseNode(xmlNode *a_node, CHtmlElement &parent, std::vector<std::string> &styles, std::vector<StyleLink> &links) const
62 uint childIndex = 0;
63 uint element_number;
64 xmlNode *node = a_node;
65 while(node)
67 if (node->type == XML_TEXT_NODE)
69 parent.Children.push_back(CHtmlElement(CHtmlElement::TEXT_NODE, (const char*)(node->content)));
71 else
72 if (node->type == XML_ELEMENT_NODE)
74 // find html element
75 element_number = htmlElementLookup((const char*)node->name);
77 // get pointer to previous sibling
78 CHtmlElement *prevSibling = NULL;
79 if (!parent.Children.empty())
81 // skip text nodes
82 for(std::list<CHtmlElement>::reverse_iterator it = parent.Children.rbegin(); it != parent.Children.rend(); ++it)
84 if (it->Type == CHtmlElement::ELEMENT_NODE)
86 prevSibling = &(*it);
87 break;
92 parent.Children.push_back(CHtmlElement(CHtmlElement::ELEMENT_NODE, toLowerAscii((const char*)node->name)));
93 CHtmlElement &elm = parent.Children.back();
94 elm.ID = element_number;
95 elm.parent = &parent;
96 elm.childIndex = childIndex;
98 // previous/next sibling that is ELEMENT_NODE
99 elm.previousSibling = prevSibling;
100 if (prevSibling)
102 prevSibling->nextSibling = &parent.Children.back();
105 childIndex++;
107 // TODO: harvest <link type="css">, <style>, <img>
109 elm.Attributes.clear();
111 for (xmlAttr *cur_attr = node->properties; cur_attr; cur_attr = cur_attr->next) {
112 std::string key(toLowerAscii((const char *)(cur_attr->name)));
113 std::string value;
114 if (cur_attr->children)
116 value = (const char *)(cur_attr->children->content);
118 elm.Attributes[key] = value;
121 if (elm.hasAttribute("class"))
123 std::vector<std::string> parts;
124 NLMISC::splitString(elm.getAttribute("class"), " ", parts);
125 for(uint i = 0; i<parts.size();++i)
127 elm.ClassNames.insert(toLowerAscii(trim(parts[i])));
131 if (elm.Value == "style")
133 // <style type="text/css" media="all, screen">
134 // ...
135 // </style>
136 bool useStyle = true;
137 if (elm.hasAttribute("media"))
139 std::string media = trim(toLowerAscii(elm.Attributes["media"]));
140 useStyle = media.empty() || media.find("all") != std::string::npos || media.find("screen") != std::string::npos;
142 // <style media="ryzom"> for ingame browser
143 useStyle = useStyle || media == "ryzom";
146 if (useStyle)
148 std::string style;
149 parseStyle(node->children, style);
150 styles.push_back(style);
152 // style tag is kept in dom
154 if (elm.Value == "link" && elm.getAttribute("rel") == "stylesheet")
156 bool useStyle = true;
157 if (elm.hasAttribute("media"))
159 std::string media = trim(toLowerAscii(elm.Attributes["media"]));
160 useStyle = media.empty() || media.find("all") != std::string::npos || media.find("screen") != std::string::npos;
162 // <style media="ryzom"> for ingame browser
163 useStyle = useStyle || media == "ryzom";
166 if (useStyle)
168 styles.push_back("");
169 links.push_back(StyleLink(styles.size()-1, elm.getAttribute("href")));
171 // link tag is kept in dom
173 else if (node->children)
175 parseNode(node->children, elm, styles, links);
177 // must cleanup nested tags that libxml2 does not fix
178 // dt without end tag: <dl><dt><dt></dl>
179 // dd without end tag: <dl><dd><dd></dl>
180 if (!elm.Children.empty() && (elm.Value == "dt" || elm.Value == "dd"))
182 std::string tag = elm.Value;
183 std::list<CHtmlElement>::iterator it;
184 for(it = elm.Children.begin(); it != elm.Children.end(); ++it)
186 if (it->Type == CHtmlElement::ELEMENT_NODE && it->Value == tag)
188 // relocate this and remaining elements over to parent
189 parent.Children.splice(parent.Children.end(), elm.Children, it, elm.Children.end());
190 break;
193 elm.reindexChilds();
194 parent.reindexChilds();
197 // move all <tr> directly under <table> to its own <tbody> ("table > tbody > tr" selector).
198 // TODO: move first real <thead> to front, move first real <tfoot> at the end
199 if (elm.ID == HTML_TABLE)
201 std::list<CHtmlElement>::iterator it = elm.Children.begin();
202 std::list<CHtmlElement>::iterator tbody = elm.Children.end();
203 for(it = elm.Children.begin(); it != elm.Children.end(); ++it)
205 if (it->ID == HTML_TR)
207 if (tbody == elm.Children.end())
209 tbody = elm.Children.insert(it, CHtmlElement(CHtmlElement::ELEMENT_NODE, "tbody"));
210 tbody->ID = HTML_TBODY;
211 tbody->parent = &elm;
213 tbody->Children.splice(tbody->Children.end(), elm.Children, it);
214 it = tbody;
216 else if (tbody != elm.Children.end())
218 tbody->reindexChilds();
219 tbody = elm.Children.end();
223 elm.reindexChilds();
228 // move into next sibling
229 node = node->next;
233 // ***************************************************************************
234 // http://stackoverflow.com/a/18335183
// Returns a copy of str in which every byte sequence is well-formed UTF-8.
//
// Rules, applied byte by byte:
//  - ASCII control chars are dropped, except \t \n \r
//  - 0x20..0x7E is copied as is, 0x7F is dropped
//  - 0x80..0x9F is dropped, except 0x80 (written as euro sign) and
//    0x85 (written as "\n\r")
//  - 0xA0..0xC1 is treated as ISO-8859-1 and converted to 2 byte UTF-8
//  - a well-formed 2/3/4 byte UTF-8 sequence is copied as is, except
//    U+0080..U+009F (C2 80..C2 9F) which is dropped
//  - anything else (truncated or ill-formed sequence, lead byte >= 0xF5)
//    has its lead byte treated as ISO-8859-1; the bytes after it are then
//    examined on their own
//
// Overlong encodings, UTF-16 surrogates and code points above U+10FFFF are
// ill-formed and take the last path instead of being copied through.
static std::string correctNonUtf8(const std::string &str)
{
	const std::string::size_type size = str.size();
	std::string to;
	to.reserve(size);

	for (std::string::size_type i = 0; i < size; ++i)
	{
		const unsigned char c = static_cast<unsigned char>(str[i]);

		if (c < 32)
		{
			// control char, allow only \t \n \r
			if (c == 9 || c == 10 || c == 13)
			{
				to.push_back(static_cast<char>(c));
			}
			continue;
		}

		if (c < 127)
		{
			// normal ASCII
			to.push_back(static_cast<char>(c));
			continue;
		}

		if (c < 160)
		{
			// control char (nothing should be defined here in either ASCII, ISO-8859-1 or UTF-8)
			if (c == 128)
			{
				// fix microsoft mess, add euro
				to.append("\xE2\x82\xAC");
			}
			if (c == 133)
			{
				// fix IBM mess, add NEL = \n\r
				to.append("\n\r");
			}
			continue;
		}

		if (c < 192)
		{
			// invalid as UTF-8 lead byte, convert from ISO-8859-1
			to.push_back(static_cast<char>(194));
			to.push_back(static_cast<char>(c));
			continue;
		}

		if (c < 194)
		{
			// 0xC0, 0xC1 can only start overlong sequences, convert from ISO-8859-1
			to.push_back(static_cast<char>(195));
			to.push_back(static_cast<char>(c - 64));
			continue;
		}

		// number of continuation bytes announced by the lead byte and
		// allowed range of the first continuation byte
		std::string::size_type extra = 0;
		unsigned char secondMin = 0x80;
		unsigned char secondMax = 0xBF;
		if (c < 224)
		{
			extra = 1;
		}
		else if (c < 240)
		{
			extra = 2;
			if (c == 0xE0) secondMin = 0xA0;      // below is overlong
			else if (c == 0xED) secondMax = 0x9F; // above is UTF-16 surrogate
		}
		else if (c < 245)
		{
			extra = 3;
			if (c == 0xF0) secondMin = 0x90;      // below is overlong
			else if (c == 0xF4) secondMax = 0x8F; // above is > U+10FFFF
		}

		if (extra != 0 && i + extra < size)
		{
			bool valid = true;
			for (std::string::size_type k = 1; k <= extra; ++k)
			{
				const unsigned char b = static_cast<unsigned char>(str[i + k]);
				const unsigned char lo = (k == 1) ? secondMin : 0x80;
				const unsigned char hi = (k == 1) ? secondMax : 0xBF;
				if (b < lo || b > hi)
				{
					valid = false;
					break;
				}
			}

			if (valid)
			{
				// C2 80..C2 9F are control chars, skipping
				const bool isC1Control = (c == 194 && static_cast<unsigned char>(str[i + 1]) < 160);
				if (!isC1Control)
				{
					to.append(str, i, extra + 1);
				}
				i += extra;
				continue;
			}
		}

		// invalid UTF-8 (lead byte >= 0xF5, string too short or ill-formed
		// sequence), convert lead byte from ISO-8859-1
		to.push_back(static_cast<char>(195));
		to.push_back(static_cast<char>(c - 64));
	}

	return to;
}
357 // ***************************************************************************
358 static void patchHtmlQuirks(std::string &htmlString)
360 size_t npos = std::string::npos;
361 size_t pos;
363 // get rid of BOM (some ingame help files does not show up otherwise)
364 if (htmlString.substr(0, 3) == "\xEF\xBB\xBF")
366 htmlString.erase(0, 3);
369 // if any element is before <html>, then parser adds <html><body>
370 // and original tags are ignored (their attributes not processed)
372 // only fix situation when there is <body> tag with attributes
374 // tags are considered to be lowercase
376 pos = htmlString.find("<body ");
377 if (pos != npos)
379 size_t start = htmlString.find("<");
380 // skip <!doctype html>
381 if (htmlString.substr(start, 2) == "<!")
382 start = htmlString.find("<", start + 1);
384 // if there is no html tag, then abort
385 size_t end = htmlString.find("<html>");
386 if (end != npos && start < end && end < pos)
388 // body tag end position
389 size_t insert = htmlString.find(">", pos);
390 if (insert != npos)
392 std::string str = htmlString.substr(start, end - start);
393 htmlString.insert(insert+1, str);
394 htmlString.erase(start, str.size());
399 // make sure </html> (if present) is last in document or tags coming after it are ignored
400 pos = htmlString.find("</html>");
401 if (pos != npos && htmlString.find("<", pos+1) > pos)
403 htmlString.erase(pos, 7);
404 htmlString += "</html>";
407 // if there is invalid utf-8 chars, then libxml will break everything after first it finds.
408 htmlString = correctNonUtf8(htmlString);
411 // ***************************************************************************
412 void CHtmlParser::getDOM(std::string htmlString, CHtmlElement &dom, std::vector<std::string> &styles, std::vector<StyleLink> &links) const
414 htmlParserCtxtPtr parser = htmlCreatePushParserCtxt(NULL, NULL, NULL, 0, NULL, XML_CHAR_ENCODING_UTF8);
415 if (!parser)
417 nlwarning("Creating html parser context failed");
418 return;
421 htmlCtxtUseOptions(parser, HTML_PARSE_NOBLANKS | HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING | HTML_PARSE_NONET);
423 // parser is little strict on tag order, so fix whats needed
424 patchHtmlQuirks(htmlString);
426 htmlParseChunk(parser, htmlString.c_str(), htmlString.size(), 0);
427 htmlParseChunk(parser, "", 0, 1);
429 if (parser->myDoc)
431 xmlNode *root = xmlDocGetRootElement(parser->myDoc);
432 if (root)
434 parseNode(root, dom, styles, links);
436 else
438 nlwarning("html root node failed");
440 xmlFreeDoc(parser->myDoc);
442 else
444 nlwarning("htmlstring parsing failed");
447 htmlFreeParserCtxt(parser);