libs/xml2/HTMLtree.c

   1 /*
   2  * HTMLtree.c : implementation of access function for an HTML tree.
   3  *
   4  * See Copyright for the status of this software.
   5  *
   6  * daniel@veillard.com
   7  */
   8
   9
  10 #define IN_LIBXML
  11 #include "libxml.h"
  12 #ifdef LIBXML_HTML_ENABLED
  13
  14 #include <string.h> /* for memset() only ! */
  15 #include <ctype.h>
  16 #include <stdlib.h>
  17
  18 #include <libxml/xmlmemory.h>
  19 #include <libxml/HTMLparser.h>
  20 #include <libxml/HTMLtree.h>
  21 #include <libxml/entities.h>
  22 #include <libxml/valid.h>
  23 #include <libxml/xmlerror.h>
  24 #include <libxml/parserInternals.h>
  25 #include <libxml/globals.h>
  26 #include <libxml/uri.h>
  27
  28 #include "private/buf.h"
  29 #include "private/error.h"
  30 #include "private/io.h"
  31 #include "private/save.h"
  32
  33 /************************************************************************
  34  *                                                                      *
  35  *              Getting/Setting encoding meta tags                      *
  36  *                                                                      *
  37  ************************************************************************/
  38
  39 /**
  40  * htmlGetMetaEncoding:
  41  * @doc:  the document
  42  *
  43  * Encoding definition lookup in the Meta tags
  44  *
  45  * Returns the current encoding as flagged in the HTML source
  46  */
  47 const xmlChar *
  48 htmlGetMetaEncoding(htmlDocPtr doc) {
  49     htmlNodePtr cur;
  50     const xmlChar *content;
  51     const xmlChar *encoding;
  52
  53     if (doc == NULL)
  54         return(NULL);
  55     cur = doc->children;
  56
  57     /*
  58      * Search the html
  59      */
  60     while (cur != NULL) {
  61         if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
  62             if (xmlStrEqual(cur->name, BAD_CAST"html"))
  63                 break;
  64             if (xmlStrEqual(cur->name, BAD_CAST"head"))
  65                 goto found_head;
  66             if (xmlStrEqual(cur->name, BAD_CAST"meta"))
  67                 goto found_meta;
  68         }
  69         cur = cur->next;
  70     }
  71     if (cur == NULL)
  72         return(NULL);
  73     cur = cur->children;
  74
  75     /*
  76      * Search the head
  77      */
  78     while (cur != NULL) {
  79         if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
  80             if (xmlStrEqual(cur->name, BAD_CAST"head"))
  81                 break;
  82             if (xmlStrEqual(cur->name, BAD_CAST"meta"))
  83                 goto found_meta;
  84         }
  85         cur = cur->next;
  86     }
  87     if (cur == NULL)
  88         return(NULL);
  89 found_head:
  90     cur = cur->children;
  91
  92     /*
  93      * Search the meta elements
  94      */
  95 found_meta:
  96     while (cur != NULL) {
  97         if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
  98             if (xmlStrEqual(cur->name, BAD_CAST"meta")) {
  99                 xmlAttrPtr attr = cur->properties;
 100                 int http;
 101                 const xmlChar *value;
 102
 103                 content = NULL;
 104                 http = 0;
 105                 while (attr != NULL) {
 106                     if ((attr->children != NULL) &&
 107                         (attr->children->type == XML_TEXT_NODE) &&
 108                         (attr->children->next == NULL)) {
 109                         value = attr->children->content;
 110                         if ((!xmlStrcasecmp(attr->name, BAD_CAST"http-equiv"))
 111                          && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
 112                             http = 1;
 113                         else if ((value != NULL)
 114                          && (!xmlStrcasecmp(attr->name, BAD_CAST"content")))
 115                             content = value;
 116                         if ((http != 0) && (content != NULL))
 117                             goto found_content;
 118                     }
 119                     attr = attr->next;
 120                 }
 121             }
 122         }
 123         cur = cur->next;
 124     }
 125     return(NULL);
 126
 127 found_content:
 128     encoding = xmlStrstr(content, BAD_CAST"charset=");
 129     if (encoding == NULL)
 130         encoding = xmlStrstr(content, BAD_CAST"Charset=");
 131     if (encoding == NULL)
 132         encoding = xmlStrstr(content, BAD_CAST"CHARSET=");
 133     if (encoding != NULL) {
 134         encoding += 8;
 135     } else {
 136         encoding = xmlStrstr(content, BAD_CAST"charset =");
 137         if (encoding == NULL)
 138             encoding = xmlStrstr(content, BAD_CAST"Charset =");
 139         if (encoding == NULL)
 140             encoding = xmlStrstr(content, BAD_CAST"CHARSET =");
 141         if (encoding != NULL)
 142             encoding += 9;
 143     }
 144     if (encoding != NULL) {
 145         while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
 146     }
 147     return(encoding);
 148 }
 149
 150 /**
 151  * htmlSetMetaEncoding:
 152  * @doc:  the document
 153  * @encoding:  the encoding string
 154  *
 155  * Sets the current encoding in the Meta tags
 156  * NOTE: this will not change the document content encoding, just
 157  * the META flag associated.
 158  *
 159  * Returns 0 in case of success and -1 in case of error
 160  */
 161 int
 162 htmlSetMetaEncoding(htmlDocPtr doc, const xmlChar *encoding) {
 163     htmlNodePtr cur, meta = NULL, head = NULL;
 164     const xmlChar *content = NULL;
 165     char newcontent[100];
 166
 167     newcontent[0] = 0;
 168
 169     if (doc == NULL)
 170         return(-1);
 171
 172     /* html isn't a real encoding it's just libxml2 way to get entities */
 173     if (!xmlStrcasecmp(encoding, BAD_CAST "html"))
 174         return(-1);
 175
 176     if (encoding != NULL) {
 177         snprintf(newcontent, sizeof(newcontent), "text/html; charset=%s",
 178                 (char *)encoding);
 179         newcontent[sizeof(newcontent) - 1] = 0;
 180     }
 181
 182     cur = doc->children;
 183
 184     /*
 185      * Search the html
 186      */
 187     while (cur != NULL) {
 188         if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
 189             if (xmlStrcasecmp(cur->name, BAD_CAST"html") == 0)
 190                 break;
 191             if (xmlStrcasecmp(cur->name, BAD_CAST"head") == 0)
 192                 goto found_head;
 193             if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0)
 194                 goto found_meta;
 195         }
 196         cur = cur->next;
 197     }
 198     if (cur == NULL)
 199         return(-1);
 200     cur = cur->children;
 201
 202     /*
 203      * Search the head
 204      */
 205     while (cur != NULL) {
 206         if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
 207             if (xmlStrcasecmp(cur->name, BAD_CAST"head") == 0)
 208                 break;
 209             if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0) {
 210                 head = cur->parent;
 211                 goto found_meta;
 212             }
 213         }
 214         cur = cur->next;
 215     }
 216     if (cur == NULL)
 217         return(-1);
 218 found_head:
 219     head = cur;
 220     if (cur->children == NULL)
 221         goto create;
 222     cur = cur->children;
 223
 224 found_meta:
 225     /*
 226      * Search and update all the remaining the meta elements carrying
 227      * encoding information
 228      */
 229     while (cur != NULL) {
 230         if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
 231             if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0) {
 232                 xmlAttrPtr attr = cur->properties;
 233                 int http;
 234                 const xmlChar *value;
 235
 236                 content = NULL;
 237                 http = 0;
 238                 while (attr != NULL) {
 239                     if ((attr->children != NULL) &&
 240                         (attr->children->type == XML_TEXT_NODE) &&
 241                         (attr->children->next == NULL)) {
 242                         value = attr->children->content;
 243                         if ((!xmlStrcasecmp(attr->name, BAD_CAST"http-equiv"))
 244                          && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
 245                             http = 1;
 246                         else
 247                         {
 248                            if ((value != NULL) &&
 249                                (!xmlStrcasecmp(attr->name, BAD_CAST"content")))
 250                                content = value;
 251                         }
 252                         if ((http != 0) && (content != NULL))
 253                             break;
 254                     }
 255                     attr = attr->next;
 256                 }
 257                 if ((http != 0) && (content != NULL)) {
 258                     meta = cur;
 259                     break;
 260                 }
 261
 262             }
 263         }
 264         cur = cur->next;
 265     }
 266 create:
 267     if (meta == NULL) {
 268         if ((encoding != NULL) && (head != NULL)) {
 269             /*
 270              * Create a new Meta element with the right attributes
 271              */
 272
 273             meta = xmlNewDocNode(doc, NULL, BAD_CAST"meta", NULL);
 274             if (head->children == NULL)
 275                 xmlAddChild(head, meta);
 276             else
 277                 xmlAddPrevSibling(head->children, meta);
 278             xmlNewProp(meta, BAD_CAST"http-equiv", BAD_CAST"Content-Type");
 279             xmlNewProp(meta, BAD_CAST"content", BAD_CAST newcontent);
 280         }
 281     } else {
 282         /* remove the meta tag if NULL is passed */
 283         if (encoding == NULL) {
 284             xmlUnlinkNode(meta);
 285             xmlFreeNode(meta);
 286         }
 287         /* change the document only if there is a real encoding change */
 288         else if (xmlStrcasestr(content, encoding) == NULL) {
 289             xmlSetProp(meta, BAD_CAST"content", BAD_CAST newcontent);
 290         }
 291     }
 292
 293
 294     return(0);
 295 }
 296
 297 /**
 298  * booleanHTMLAttrs:
 299  *
 300  * These are the HTML attributes which will be output
 301  * in minimized form, i.e. <option selected="selected"> will be
 302  * output as <option selected>, as per XSLT 1.0 16.2 "HTML Output Method"
 303  *
 304  */
 305 static const char* const htmlBooleanAttrs[] = {
 306   "checked", "compact", "declare", "defer", "disabled", "ismap",
 307   "multiple", "nohref", "noresize", "noshade", "nowrap", "readonly",
 308   "selected", NULL
 309 };
 310
 311
 312 /**
 313  * htmlIsBooleanAttr:
 314  * @name:  the name of the attribute to check
 315  *
 316  * Determine if a given attribute is a boolean attribute.
 317  *
 318  * returns: false if the attribute is not boolean, true otherwise.
 319  */
 320 int
 321 htmlIsBooleanAttr(const xmlChar *name)
 322 {
 323     int i = 0;
 324
 325     while (htmlBooleanAttrs[i] != NULL) {
 326         if (xmlStrcasecmp((const xmlChar *)htmlBooleanAttrs[i], name) == 0)
 327             return 1;
 328         i++;
 329     }
 330     return 0;
 331 }
 332
 333 #ifdef LIBXML_OUTPUT_ENABLED
 334 /************************************************************************
 335  *                                                                      *
 336  *                      Output error handlers                           *
 337  *                                                                      *
 338  ************************************************************************/
 339 /**
 340  * htmlSaveErrMemory:
 341  * @extra:  extra information
 342  *
 343  * Handle an out of memory condition
 344  */
 345 static void
 346 htmlSaveErrMemory(const char *extra)
 347 {
 348     __xmlSimpleError(XML_FROM_OUTPUT, XML_ERR_NO_MEMORY, NULL, NULL, extra);
 349 }
 350
 351 /**
 352  * htmlSaveErr:
 353  * @code:  the error number
 354  * @node:  the location of the error.
 355  * @extra:  extra information
 356  *
 357  * Handle an out of memory condition
 358  */
 359 static void
 360 htmlSaveErr(int code, xmlNodePtr node, const char *extra)
 361 {
 362     const char *msg = NULL;
 363
 364     switch(code) {
 365         case XML_SAVE_NOT_UTF8:
 366             msg = "string is not in UTF-8\n";
 367             break;
 368         case XML_SAVE_CHAR_INVALID:
 369             msg = "invalid character value\n";
 370             break;
 371         case XML_SAVE_UNKNOWN_ENCODING:
 372             msg = "unknown encoding %s\n";
 373             break;
 374         case XML_SAVE_NO_DOCTYPE:
 375             msg = "HTML has no DOCTYPE\n";
 376             break;
 377         default:
 378             msg = "unexpected error number\n";
 379     }
 380     __xmlSimpleError(XML_FROM_OUTPUT, code, node, msg, extra);
 381 }
 382
 383 /************************************************************************
 384  *                                                                      *
 385  *              Dumping HTML tree content to a simple buffer            *
 386  *                                                                      *
 387  ************************************************************************/
 388
 389 /**
 390  * htmlBufNodeDumpFormat:
 391  * @buf:  the xmlBufPtr output
 392  * @doc:  the document
 393  * @cur:  the current node
 394  * @format:  should formatting spaces been added
 395  *
 396  * Dump an HTML node, recursive behaviour,children are printed too.
 397  *
 398  * Returns the number of byte written or -1 in case of error
 399  */
 400 static size_t
 401 htmlBufNodeDumpFormat(xmlBufPtr buf, xmlDocPtr doc, xmlNodePtr cur,
 402                    int format) {
 403     size_t use;
 404     int ret;
 405     xmlOutputBufferPtr outbuf;
 406
 407     if (cur == NULL) {
 408         return (-1);
 409     }
 410     if (buf == NULL) {
 411         return (-1);
 412     }
 413     outbuf = (xmlOutputBufferPtr) xmlMalloc(sizeof(xmlOutputBuffer));
 414     if (outbuf == NULL) {
 415         htmlSaveErrMemory("allocating HTML output buffer");
 416         return (-1);
 417     }
 418     memset(outbuf, 0, sizeof(xmlOutputBuffer));
 419     outbuf->buffer = buf;
 420     outbuf->encoder = NULL;
 421     outbuf->writecallback = NULL;
 422     outbuf->closecallback = NULL;
 423     outbuf->context = NULL;
 424     outbuf->written = 0;
 425
 426     use = xmlBufUse(buf);
 427     htmlNodeDumpFormatOutput(outbuf, doc, cur, NULL, format);
 428     xmlFree(outbuf);
 429     ret = xmlBufUse(buf) - use;
 430     return (ret);
 431 }
 432
 433 /**
 434  * htmlNodeDump:
 435  * @buf:  the HTML buffer output
 436  * @doc:  the document
 437  * @cur:  the current node
 438  *
 439  * Dump an HTML node, recursive behaviour,children are printed too,
 440  * and formatting returns are added.
 441  *
 442  * Returns the number of byte written or -1 in case of error
 443  */
 444 int
 445 htmlNodeDump(xmlBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur) {
 446     xmlBufPtr buffer;
 447     size_t ret;
 448
 449     if ((buf == NULL) || (cur == NULL))
 450         return(-1);
 451
 452     xmlInitParser();
 453     buffer = xmlBufFromBuffer(buf);
 454     if (buffer == NULL)
 455         return(-1);
 456
 457     ret = htmlBufNodeDumpFormat(buffer, doc, cur, 1);
 458
 459     xmlBufBackToBuffer(buffer);
 460
 461     if (ret > INT_MAX)
 462         return(-1);
 463     return((int) ret);
 464 }
 465
 466 /**
 467  * htmlNodeDumpFileFormat:
 468  * @out:  the FILE pointer
 469  * @doc:  the document
 470  * @cur:  the current node
 471  * @encoding: the document encoding
 472  * @format:  should formatting spaces been added
 473  *
 474  * Dump an HTML node, recursive behaviour,children are printed too.
 475  *
 476  * TODO: if encoding == NULL try to save in the doc encoding
 477  *
 478  * returns: the number of byte written or -1 in case of failure.
 479  */
 480 int
 481 htmlNodeDumpFileFormat(FILE *out, xmlDocPtr doc,
 482                        xmlNodePtr cur, const char *encoding, int format) {
 483     xmlOutputBufferPtr buf;
 484     xmlCharEncodingHandlerPtr handler = NULL;
 485     int ret;
 486
 487     xmlInitParser();
 488
 489     if (encoding != NULL) {
 490         xmlCharEncoding enc;
 491
 492         enc = xmlParseCharEncoding(encoding);
 493         if (enc != XML_CHAR_ENCODING_UTF8) {
 494             handler = xmlFindCharEncodingHandler(encoding);
 495             if (handler == NULL)
 496                 htmlSaveErr(XML_SAVE_UNKNOWN_ENCODING, NULL, encoding);
 497         }
 498     } else {
 499         /*
 500          * Fallback to HTML or ASCII when the encoding is unspecified
 501          */
 502         if (handler == NULL)
 503             handler = xmlFindCharEncodingHandler("HTML");
 504         if (handler == NULL)
 505             handler = xmlFindCharEncodingHandler("ascii");
 506     }
 507
 508     /*
 509      * save the content to a temp buffer.
 510      */
 511     buf = xmlOutputBufferCreateFile(out, handler);
 512     if (buf == NULL) return(0);
 513
 514     htmlNodeDumpFormatOutput(buf, doc, cur, NULL, format);
 515
 516     ret = xmlOutputBufferClose(buf);
 517     return(ret);
 518 }
 519
 520 /**
 521  * htmlNodeDumpFile:
 522  * @out:  the FILE pointer
 523  * @doc:  the document
 524  * @cur:  the current node
 525  *
 526  * Dump an HTML node, recursive behaviour,children are printed too,
 527  * and formatting returns are added.
 528  */
 529 void
 530 htmlNodeDumpFile(FILE *out, xmlDocPtr doc, xmlNodePtr cur) {
 531     htmlNodeDumpFileFormat(out, doc, cur, NULL, 1);
 532 }
 533
 534 /**
 535  * htmlDocDumpMemoryFormat:
 536  * @cur:  the document
 537  * @mem:  OUT: the memory pointer
 538  * @size:  OUT: the memory length
 539  * @format:  should formatting spaces been added
 540  *
 541  * Dump an HTML document in memory and return the xmlChar * and it's size.
 542  * It's up to the caller to free the memory.
 543  */
 544 void
 545 htmlDocDumpMemoryFormat(xmlDocPtr cur, xmlChar**mem, int *size, int format) {
 546     xmlOutputBufferPtr buf;
 547     xmlCharEncodingHandlerPtr handler = NULL;
 548     const char *encoding;
 549
 550     xmlInitParser();
 551
 552     if ((mem == NULL) || (size == NULL))
 553         return;
 554     if (cur == NULL) {
 555         *mem = NULL;
 556         *size = 0;
 557         return;
 558     }
 559
 560     encoding = (const char *) htmlGetMetaEncoding(cur);
 561
 562     if (encoding != NULL) {
 563         xmlCharEncoding enc;
 564
 565         enc = xmlParseCharEncoding(encoding);
 566         if (enc != XML_CHAR_ENCODING_UTF8) {
 567             handler = xmlFindCharEncodingHandler(encoding);
 568             if (handler == NULL)
 569                 htmlSaveErr(XML_SAVE_UNKNOWN_ENCODING, NULL, encoding);
 570
 571         }
 572     } else {
 573         /*
 574          * Fallback to HTML or ASCII when the encoding is unspecified
 575          */
 576         if (handler == NULL)
 577             handler = xmlFindCharEncodingHandler("HTML");
 578         if (handler == NULL)
 579             handler = xmlFindCharEncodingHandler("ascii");
 580     }
 581
 582     buf = xmlAllocOutputBufferInternal(handler);
 583     if (buf == NULL) {
 584         *mem = NULL;
 585         *size = 0;
 586         return;
 587     }
 588
 589     htmlDocContentDumpFormatOutput(buf, cur, NULL, format);
 590
 591     xmlOutputBufferFlush(buf);
 592     if (buf->conv != NULL) {
 593         *size = xmlBufUse(buf->conv);
 594         *mem = xmlStrndup(xmlBufContent(buf->conv), *size);
 595     } else {
 596         *size = xmlBufUse(buf->buffer);
 597         *mem = xmlStrndup(xmlBufContent(buf->buffer), *size);
 598     }
 599     (void)xmlOutputBufferClose(buf);
 600 }
 601
 602 /**
 603  * htmlDocDumpMemory:
 604  * @cur:  the document
 605  * @mem:  OUT: the memory pointer
 606  * @size:  OUT: the memory length
 607  *
 608  * Dump an HTML document in memory and return the xmlChar * and it's size.
 609  * It's up to the caller to free the memory.
 610  */
 611 void
 612 htmlDocDumpMemory(xmlDocPtr cur, xmlChar**mem, int *size) {
 613         htmlDocDumpMemoryFormat(cur, mem, size, 1);
 614 }
 615
 616
 617 /************************************************************************
 618  *                                                                      *
 619  *              Dumping HTML tree content to an I/O output buffer       *
 620  *                                                                      *
 621  ************************************************************************/
 622
 623 /**
 624  * htmlDtdDumpOutput:
 625  * @buf:  the HTML buffer output
 626  * @doc:  the document
 627  * @encoding:  the encoding string
 628  *
 629  * TODO: check whether encoding is needed
 630  *
 631  * Dump the HTML document DTD, if any.
 632  */
 633 static void
 634 htmlDtdDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
 635                   const char *encoding ATTRIBUTE_UNUSED) {
 636     xmlDtdPtr cur = doc->intSubset;
 637
 638     if (cur == NULL) {
 639         htmlSaveErr(XML_SAVE_NO_DOCTYPE, (xmlNodePtr) doc, NULL);
 640         return;
 641     }
 642     xmlOutputBufferWriteString(buf, "<!DOCTYPE ");
 643     xmlOutputBufferWriteString(buf, (const char *)cur->name);
 644     if (cur->ExternalID != NULL) {
 645         xmlOutputBufferWriteString(buf, " PUBLIC ");
 646         xmlBufWriteQuotedString(buf->buffer, cur->ExternalID);
 647         if (cur->SystemID != NULL) {
 648             xmlOutputBufferWriteString(buf, " ");
 649             xmlBufWriteQuotedString(buf->buffer, cur->SystemID);
 650         }
 651     } else if (cur->SystemID != NULL &&
 652                xmlStrcmp(cur->SystemID, BAD_CAST "about:legacy-compat")) {
 653         xmlOutputBufferWriteString(buf, " SYSTEM ");
 654         xmlBufWriteQuotedString(buf->buffer, cur->SystemID);
 655     }
 656     xmlOutputBufferWriteString(buf, ">\n");
 657 }
 658
 659 /**
 660  * htmlAttrDumpOutput:
 661  * @buf:  the HTML buffer output
 662  * @doc:  the document
 663  * @cur:  the attribute pointer
 664  *
 665  * Dump an HTML attribute
 666  */
 667 static void
 668 htmlAttrDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, xmlAttrPtr cur) {
 669     xmlChar *value;
 670
 671     /*
 672      * The html output method should not escape a & character
 673      * occurring in an attribute value immediately followed by
 674      * a { character (see Section B.7.1 of the HTML 4.0 Recommendation).
 675      * This is implemented in xmlEncodeEntitiesReentrant
 676      */
 677
 678     if (cur == NULL) {
 679         return;
 680     }
 681     xmlOutputBufferWriteString(buf, " ");
 682     if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
 683         xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix);
 684         xmlOutputBufferWriteString(buf, ":");
 685     }
 686     xmlOutputBufferWriteString(buf, (const char *)cur->name);
 687     if ((cur->children != NULL) && (!htmlIsBooleanAttr(cur->name))) {
 688         value = xmlNodeListGetString(doc, cur->children, 0);
 689         if (value) {
 690             xmlOutputBufferWriteString(buf, "=");
 691             if ((cur->ns == NULL) && (cur->parent != NULL) &&
 692                 (cur->parent->ns == NULL) &&
 693                 ((!xmlStrcasecmp(cur->name, BAD_CAST "href")) ||
 694                  (!xmlStrcasecmp(cur->name, BAD_CAST "action")) ||
 695                  (!xmlStrcasecmp(cur->name, BAD_CAST "src")) ||
 696                  ((!xmlStrcasecmp(cur->name, BAD_CAST "name")) &&
 697                   (!xmlStrcasecmp(cur->parent->name, BAD_CAST "a"))))) {
 698                 xmlChar *escaped;
 699                 xmlChar *tmp = value;
 700
 701                 while (IS_BLANK_CH(*tmp)) tmp++;
 702
 703                 /*
 704                  * Angle brackets are technically illegal in URIs, but they're
 705                  * used in server side includes, for example. Curly brackets
 706                  * are illegal as well and often used in templates.
 707                  * Don't escape non-whitespace, printable ASCII chars for
 708                  * improved interoperability. Only escape space, control
 709                  * and non-ASCII chars.
 710                  */
 711                 escaped = xmlURIEscapeStr(tmp,
 712                         BAD_CAST "\"#$%&+,/:;<=>?@[\\]^`{|}");
 713                 if (escaped != NULL) {
 714                     xmlBufWriteQuotedString(buf->buffer, escaped);
 715                     xmlFree(escaped);
 716                 } else {
 717                     xmlBufWriteQuotedString(buf->buffer, value);
 718                 }
 719             } else {
 720                 xmlBufWriteQuotedString(buf->buffer, value);
 721             }
 722             xmlFree(value);
 723         } else  {
 724             xmlOutputBufferWriteString(buf, "=\"\"");
 725         }
 726     }
 727 }
 728
 729 /**
 730  * htmlNodeDumpFormatOutput:
 731  * @buf:  the HTML buffer output
 732  * @doc:  the document
 733  * @cur:  the current node
 734  * @encoding:  the encoding string (unused)
 735  * @format:  should formatting spaces been added
 736  *
 737  * Dump an HTML node, recursive behaviour,children are printed too.
 738  */
 739 void
 740 htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
 741                          xmlNodePtr cur, const char *encoding ATTRIBUTE_UNUSED,
 742                          int format) {
 743     xmlNodePtr root, parent;
 744     xmlAttrPtr attr;
 745     const htmlElemDesc * info;
 746
 747     xmlInitParser();
 748
 749     if ((cur == NULL) || (buf == NULL)) {
 750         return;
 751     }
 752
 753     root = cur;
 754     parent = cur->parent;
 755     while (1) {
 756         switch (cur->type) {
 757         case XML_HTML_DOCUMENT_NODE:
 758         case XML_DOCUMENT_NODE:
 759             if (((xmlDocPtr) cur)->intSubset != NULL) {
 760                 htmlDtdDumpOutput(buf, (xmlDocPtr) cur, NULL);
 761             }
 762             if (cur->children != NULL) {
 763                 /* Always validate cur->parent when descending. */
 764                 if (cur->parent == parent) {
 765                     parent = cur;
 766                     cur = cur->children;
 767                     continue;
 768                 }
 769             } else {
 770                 xmlOutputBufferWriteString(buf, "\n");
 771             }
 772             break;
 773
 774         case XML_ELEMENT_NODE:
 775             /*
 776              * Some users like lxml are known to pass nodes with a corrupted
 777              * tree structure. Fall back to a recursive call to handle this
 778              * case.
 779              */
 780             if ((cur->parent != parent) && (cur->children != NULL)) {
 781                 htmlNodeDumpFormatOutput(buf, doc, cur, encoding, format);
 782                 break;
 783             }
 784
 785             /*
 786              * Get specific HTML info for that node.
 787              */
 788             if (cur->ns == NULL)
 789                 info = htmlTagLookup(cur->name);
 790             else
 791                 info = NULL;
 792
 793             xmlOutputBufferWriteString(buf, "<");
 794             if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
 795                 xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix);
 796                 xmlOutputBufferWriteString(buf, ":");
 797             }
 798             xmlOutputBufferWriteString(buf, (const char *)cur->name);
 799             if (cur->nsDef)
 800                 xmlNsListDumpOutput(buf, cur->nsDef);
 801             attr = cur->properties;
 802             while (attr != NULL) {
 803                 htmlAttrDumpOutput(buf, doc, attr);
 804                 attr = attr->next;
 805             }
 806
 807             if ((info != NULL) && (info->empty)) {
 808                 xmlOutputBufferWriteString(buf, ">");
 809             } else if (cur->children == NULL) {
 810                 if ((info != NULL) && (info->saveEndTag != 0) &&
 811                     (xmlStrcmp(BAD_CAST info->name, BAD_CAST "html")) &&
 812                     (xmlStrcmp(BAD_CAST info->name, BAD_CAST "body"))) {
 813                     xmlOutputBufferWriteString(buf, ">");
 814                 } else {
 815                     xmlOutputBufferWriteString(buf, "></");
 816                     if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
 817                         xmlOutputBufferWriteString(buf,
 818                                 (const char *)cur->ns->prefix);
 819                         xmlOutputBufferWriteString(buf, ":");
 820                     }
 821                     xmlOutputBufferWriteString(buf, (const char *)cur->name);
 822                     xmlOutputBufferWriteString(buf, ">");
 823                 }
 824             } else {
 825                 xmlOutputBufferWriteString(buf, ">");
 826                 if ((format) && (info != NULL) && (!info->isinline) &&
 827                     (cur->children->type != HTML_TEXT_NODE) &&
 828                     (cur->children->type != HTML_ENTITY_REF_NODE) &&
 829                     (cur->children != cur->last) &&
 830                     (cur->name != NULL) &&
 831                     (cur->name[0] != 'p')) /* p, pre, param */
 832                     xmlOutputBufferWriteString(buf, "\n");
 833                 parent = cur;
 834                 cur = cur->children;
 835                 continue;
 836             }
 837
 838             if ((format) && (cur->next != NULL) &&
 839                 (info != NULL) && (!info->isinline)) {
 840                 if ((cur->next->type != HTML_TEXT_NODE) &&
 841                     (cur->next->type != HTML_ENTITY_REF_NODE) &&
 842                     (parent != NULL) &&
 843                     (parent->name != NULL) &&
 844                     (parent->name[0] != 'p')) /* p, pre, param */
 845                     xmlOutputBufferWriteString(buf, "\n");
 846             }
 847
 848             break;
 849
 850         case XML_ATTRIBUTE_NODE:
 851             htmlAttrDumpOutput(buf, doc, (xmlAttrPtr) cur);
 852             break;
 853
 854         case HTML_TEXT_NODE:
 855             if (cur->content == NULL)
 856                 break;
 857             if (((cur->name == (const xmlChar *)xmlStringText) ||
 858                  (cur->name != (const xmlChar *)xmlStringTextNoenc)) &&
 859                 ((parent == NULL) ||
 860                  ((xmlStrcasecmp(parent->name, BAD_CAST "script")) &&
 861                   (xmlStrcasecmp(parent->name, BAD_CAST "style"))))) {
 862                 xmlChar *buffer;
 863
 864                 buffer = xmlEncodeEntitiesReentrant(doc, cur->content);
 865                 if (buffer != NULL) {
 866                     xmlOutputBufferWriteString(buf, (const char *)buffer);
 867                     xmlFree(buffer);
 868                 }
 869             } else {
 870                 xmlOutputBufferWriteString(buf, (const char *)cur->content);
 871             }
 872             break;
 873
 874         case HTML_COMMENT_NODE:
 875             if (cur->content != NULL) {
 876                 xmlOutputBufferWriteString(buf, "<!--");
 877                 xmlOutputBufferWriteString(buf, (const char *)cur->content);
 878                 xmlOutputBufferWriteString(buf, "-->");
 879             }
 880             break;
 881
 882         case HTML_PI_NODE:
 883             if (cur->name != NULL) {
 884                 xmlOutputBufferWriteString(buf, "<?");
 885                 xmlOutputBufferWriteString(buf, (const char *)cur->name);
 886                 if (cur->content != NULL) {
 887                     xmlOutputBufferWriteString(buf, " ");
 888                     xmlOutputBufferWriteString(buf,
 889                             (const char *)cur->content);
 890                 }
 891                 xmlOutputBufferWriteString(buf, ">");
 892             }
 893             break;
 894
 895         case HTML_ENTITY_REF_NODE:
 896             xmlOutputBufferWriteString(buf, "&");
 897             xmlOutputBufferWriteString(buf, (const char *)cur->name);
 898             xmlOutputBufferWriteString(buf, ";");
 899             break;
 900
 901         case HTML_PRESERVE_NODE:
 902             if (cur->content != NULL) {
 903                 xmlOutputBufferWriteString(buf, (const char *)cur->content);
 904             }
 905             break;
 906
 907         default:
 908             break;
 909         }
 910
 911         while (1) {
 912             if (cur == root)
 913                 return;
 914             if (cur->next != NULL) {
 915                 cur = cur->next;
 916                 break;
 917             }
 918
 919             cur = parent;
 920             /* cur->parent was validated when descending. */
 921             parent = cur->parent;
 922
 923             if ((cur->type == XML_HTML_DOCUMENT_NODE) ||
 924                 (cur->type == XML_DOCUMENT_NODE)) {
 925                 xmlOutputBufferWriteString(buf, "\n");
 926             } else {
 927                 if ((format) && (cur->ns == NULL))
 928                     info = htmlTagLookup(cur->name);
 929                 else
 930                     info = NULL;
 931
 932                 if ((format) && (info != NULL) && (!info->isinline) &&
 933                     (cur->last->type != HTML_TEXT_NODE) &&
 934                     (cur->last->type != HTML_ENTITY_REF_NODE) &&
 935                     (cur->children != cur->last) &&
 936                     (cur->name != NULL) &&
 937                     (cur->name[0] != 'p')) /* p, pre, param */
 938                     xmlOutputBufferWriteString(buf, "\n");
 939
 940                 xmlOutputBufferWriteString(buf, "</");
 941                 if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
 942                     xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix);
 943                     xmlOutputBufferWriteString(buf, ":");
 944                 }
 945                 xmlOutputBufferWriteString(buf, (const char *)cur->name);
 946                 xmlOutputBufferWriteString(buf, ">");
 947
 948                 if ((format) && (info != NULL) && (!info->isinline) &&
 949                     (cur->next != NULL)) {
 950                     if ((cur->next->type != HTML_TEXT_NODE) &&
 951                         (cur->next->type != HTML_ENTITY_REF_NODE) &&
 952                         (parent != NULL) &&
 953                         (parent->name != NULL) &&
 954                         (parent->name[0] != 'p')) /* p, pre, param */
 955                         xmlOutputBufferWriteString(buf, "\n");
 956                 }
 957             }
 958         }
 959     }
 960 }
 961
 962 /**
 963  * htmlNodeDumpOutput:
 964  * @buf:  the HTML buffer output
 965  * @doc:  the document
 966  * @cur:  the current node
 967  * @encoding:  the encoding string (unused)
 968  *
 969  * Dump an HTML node, recursive behaviour,children are printed too,
 970  * and formatting returns/spaces are added.
 971  */
 972 void
 973 htmlNodeDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
 974                    xmlNodePtr cur, const char *encoding ATTRIBUTE_UNUSED) {
 975     htmlNodeDumpFormatOutput(buf, doc, cur, NULL, 1);
 976 }
 977
 978 /**
 979  * htmlDocContentDumpFormatOutput:
 980  * @buf:  the HTML buffer output
 981  * @cur:  the document
 982  * @encoding:  the encoding string (unused)
 983  * @format:  should formatting spaces been added
 984  *
 985  * Dump an HTML document.
 986  */
 987 void
 988 htmlDocContentDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr cur,
 989                                const char *encoding ATTRIBUTE_UNUSED,
 990                                int format) {
 991     int type = 0;
 992     if (cur) {
 993         type = cur->type;
 994         cur->type = XML_HTML_DOCUMENT_NODE;
 995     }
 996     htmlNodeDumpFormatOutput(buf, cur, (xmlNodePtr) cur, NULL, format);
 997     if (cur)
 998         cur->type = (xmlElementType) type;
 999 }
1000
1001 /**
1002  * htmlDocContentDumpOutput:
1003  * @buf:  the HTML buffer output
1004  * @cur:  the document
1005  * @encoding:  the encoding string (unused)
1006  *
1007  * Dump an HTML document. Formatting return/spaces are added.
1008  */
1009 void
1010 htmlDocContentDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr cur,
1011                          const char *encoding ATTRIBUTE_UNUSED) {
1012     htmlNodeDumpFormatOutput(buf, cur, (xmlNodePtr) cur, NULL, 1);
1013 }
1014
1015 /************************************************************************
1016  *                                                                      *
1017  *              Saving functions front-ends                             *
1018  *                                                                      *
1019  ************************************************************************/
1020
1021 /**
1022  * htmlDocDump:
1023  * @f:  the FILE*
1024  * @cur:  the document
1025  *
1026  * Dump an HTML document to an open FILE.
1027  *
1028  * returns: the number of byte written or -1 in case of failure.
1029  */
1030 int
1031 htmlDocDump(FILE *f, xmlDocPtr cur) {
1032     xmlOutputBufferPtr buf;
1033     xmlCharEncodingHandlerPtr handler = NULL;
1034     const char *encoding;
1035     int ret;
1036
1037     xmlInitParser();
1038
1039     if ((cur == NULL) || (f == NULL)) {
1040         return(-1);
1041     }
1042
1043     encoding = (const char *) htmlGetMetaEncoding(cur);
1044
1045     if (encoding != NULL) {
1046         xmlCharEncoding enc;
1047
1048         enc = xmlParseCharEncoding(encoding);
1049         if (enc != XML_CHAR_ENCODING_UTF8) {
1050             handler = xmlFindCharEncodingHandler(encoding);
1051             if (handler == NULL)
1052                 htmlSaveErr(XML_SAVE_UNKNOWN_ENCODING, NULL, encoding);
1053         }
1054     } else {
1055         /*
1056          * Fallback to HTML or ASCII when the encoding is unspecified
1057          */
1058         if (handler == NULL)
1059             handler = xmlFindCharEncodingHandler("HTML");
1060         if (handler == NULL)
1061             handler = xmlFindCharEncodingHandler("ascii");
1062     }
1063
1064     buf = xmlOutputBufferCreateFile(f, handler);
1065     if (buf == NULL) return(-1);
1066     htmlDocContentDumpOutput(buf, cur, NULL);
1067
1068     ret = xmlOutputBufferClose(buf);
1069     return(ret);
1070 }
1071
1072 /**
1073  * htmlSaveFile:
1074  * @filename:  the filename (or URL)
1075  * @cur:  the document
1076  *
1077  * Dump an HTML document to a file. If @filename is "-" the stdout file is
1078  * used.
1079  * returns: the number of byte written or -1 in case of failure.
1080  */
1081 int
1082 htmlSaveFile(const char *filename, xmlDocPtr cur) {
1083     xmlOutputBufferPtr buf;
1084     xmlCharEncodingHandlerPtr handler = NULL;
1085     const char *encoding;
1086     int ret;
1087
1088     if ((cur == NULL) || (filename == NULL))
1089         return(-1);
1090
1091     xmlInitParser();
1092
1093     encoding = (const char *) htmlGetMetaEncoding(cur);
1094
1095     if (encoding != NULL) {
1096         xmlCharEncoding enc;
1097
1098         enc = xmlParseCharEncoding(encoding);
1099         if (enc != XML_CHAR_ENCODING_UTF8) {
1100             handler = xmlFindCharEncodingHandler(encoding);
1101             if (handler == NULL)
1102                 htmlSaveErr(XML_SAVE_UNKNOWN_ENCODING, NULL, encoding);
1103         }
1104     } else {
1105         /*
1106          * Fallback to HTML or ASCII when the encoding is unspecified
1107          */
1108         if (handler == NULL)
1109             handler = xmlFindCharEncodingHandler("HTML");
1110         if (handler == NULL)
1111             handler = xmlFindCharEncodingHandler("ascii");
1112     }
1113
1114     /*
1115      * save the content to a temp buffer.
1116      */
1117     buf = xmlOutputBufferCreateFilename(filename, handler, cur->compression);
1118     if (buf == NULL) return(0);
1119
1120     htmlDocContentDumpOutput(buf, cur, NULL);
1121
1122     ret = xmlOutputBufferClose(buf);
1123     return(ret);
1124 }
1125
1126 /**
1127  * htmlSaveFileFormat:
1128  * @filename:  the filename
1129  * @cur:  the document
1130  * @format:  should formatting spaces been added
1131  * @encoding: the document encoding
1132  *
1133  * Dump an HTML document to a file using a given encoding.
1134  *
1135  * returns: the number of byte written or -1 in case of failure.
1136  */
1137 int
1138 htmlSaveFileFormat(const char *filename, xmlDocPtr cur,
1139                    const char *encoding, int format) {
1140     xmlOutputBufferPtr buf;
1141     xmlCharEncodingHandlerPtr handler = NULL;
1142     int ret;
1143
1144     if ((cur == NULL) || (filename == NULL))
1145         return(-1);
1146
1147     xmlInitParser();
1148
1149     if (encoding != NULL) {
1150         xmlCharEncoding enc;
1151
1152         enc = xmlParseCharEncoding(encoding);
1153         if (enc != XML_CHAR_ENCODING_UTF8) {
1154             handler = xmlFindCharEncodingHandler(encoding);
1155             if (handler == NULL)
1156                 htmlSaveErr(XML_SAVE_UNKNOWN_ENCODING, NULL, encoding);
1157         }
1158         htmlSetMetaEncoding(cur, (const xmlChar *) encoding);
1159     } else {
1160         htmlSetMetaEncoding(cur, (const xmlChar *) "UTF-8");
1161
1162         /*
1163          * Fallback to HTML or ASCII when the encoding is unspecified
1164          */
1165         if (handler == NULL)
1166             handler = xmlFindCharEncodingHandler("HTML");
1167         if (handler == NULL)
1168             handler = xmlFindCharEncodingHandler("ascii");
1169     }
1170
1171     /*
1172      * save the content to a temp buffer.
1173      */
1174     buf = xmlOutputBufferCreateFilename(filename, handler, 0);
1175     if (buf == NULL) return(0);
1176
1177     htmlDocContentDumpFormatOutput(buf, cur, encoding, format);
1178
1179     ret = xmlOutputBufferClose(buf);
1180     return(ret);
1181 }
1182
1183 /**
1184  * htmlSaveFileEnc:
1185  * @filename:  the filename
1186  * @cur:  the document
1187  * @encoding: the document encoding
1188  *
1189  * Dump an HTML document to a file using a given encoding
1190  * and formatting returns/spaces are added.
1191  *
1192  * returns: the number of byte written or -1 in case of failure.
1193  */
1194 int
1195 htmlSaveFileEnc(const char *filename, xmlDocPtr cur, const char *encoding) {
1196     return(htmlSaveFileFormat(filename, cur, encoding, 1));
1197 }
1198
1199 #endif /* LIBXML_OUTPUT_ENABLED */
1200
1201 #endif /* LIBXML_HTML_ENABLED */