/*****************************************************************************
 *                                  _   _ ____  _
 *  Project                     ___| | | |  _ \| |
 *                             / __| | | | |_) | |
 *                            | (__| |_| |  _ <| |___
 *                             \___|\___/|_| \_\_____|
 *
 * $Id: htmltidy.c,v 1.1.1.1 2008-09-23 16:32:05 hoffman Exp $
 *
 * Download a document and use libtidy to parse the HTML.
 * Written by Jeff Pohlmeyer
 *
 * LibTidy => http://tidy.sourceforge.net
 *
 * gcc -Wall -I/usr/local/include tidycurl.c -lcurl -ltidy -o tidycurl
 *
 */
#include <stdio.h>
#include <tidy/tidy.h>
#include <tidy/buffio.h>
#include <curl/curl.h>
24 /* curl write callback, to fill tidy's input buffer... */
25 uint
write_cb(char *in
, uint size
, uint nmemb
, TidyBuffer
*out
)
29 tidyBufAppend( out
, in
, r
);
33 /* Traverse the document tree */
34 void dumpNode(TidyDoc doc
, TidyNode tnod
, int indent
)
37 for ( child
= tidyGetChild(tnod
); child
; child
= tidyGetNext(child
) )
39 ctmbstr name
= tidyNodeGetName( child
);
42 /* if it has a name, then it's an HTML tag ... */
44 printf( "%*.*s%s ", indent
, indent
, "<", name
);
45 /* walk the attribute list */
46 for ( attr
=tidyAttrFirst(child
); attr
; attr
=tidyAttrNext(attr
) ) {
47 printf(tidyAttrName(attr
));
48 tidyAttrValue(attr
)?printf("=\"%s\" ",
49 tidyAttrValue(attr
)):printf(" ");
54 /* if it doesn't have a name, then it's probably text, cdata, etc... */
57 tidyNodeGetText(doc
, child
, &buf
);
58 printf("%*.*s\n", indent
, indent
, buf
.bp
?(char *)buf
.bp
:"");
61 dumpNode( doc
, child
, indent
+ 4 ); /* recursive */
66 int main(int argc
, char **argv
)
69 char curl_errbuf
[CURL_ERROR_SIZE
];
71 TidyBuffer docbuf
= {0};
72 TidyBuffer tidy_errbuf
= {0};
75 curl
= curl_easy_init();
76 curl_easy_setopt(curl
, CURLOPT_URL
, argv
[1]);
77 curl_easy_setopt(curl
, CURLOPT_ERRORBUFFER
, curl_errbuf
);
78 curl_easy_setopt(curl
, CURLOPT_NOPROGRESS
, 0L);
79 curl_easy_setopt(curl
, CURLOPT_VERBOSE
, 1L);
80 curl_easy_setopt(curl
, CURLOPT_WRITEFUNCTION
, write_cb
);
83 tidyOptSetBool(tdoc
, TidyForceOutput
, yes
); /* try harder */
84 tidyOptSetInt(tdoc
, TidyWrapLen
, 4096);
85 tidySetErrorBuffer( tdoc
, &tidy_errbuf
);
88 curl_easy_setopt(curl
, CURLOPT_WRITEDATA
, &docbuf
);
89 err
=curl_easy_perform(curl
);
91 err
= tidyParseBuffer(tdoc
, &docbuf
); /* parse the input */
93 err
= tidyCleanAndRepair(tdoc
); /* fix any problems */
95 err
= tidyRunDiagnostics(tdoc
); /* load tidy error buffer */
97 dumpNode( tdoc
, tidyGetRoot(tdoc
), 0 ); /* walk the tree */
98 fprintf(stderr
, "%s\n", tidy_errbuf
.bp
); /* show errors */
104 fprintf(stderr
, "%s\n", curl_errbuf
);
107 curl_easy_cleanup(curl
);
108 tidyBufFree(&docbuf
);
109 tidyBufFree(&tidy_errbuf
);
115 printf( "usage: %s <url>\n", argv
[0] );