/*****************************************************************************
 *                                  _   _ ____  _
 *  Project                     ___| | | |  _ \| |
 *                             / __| | | | |_) | |
 *                            | (__| |_| |  _ <| |___
 *                             \___|\___/|_| \_\_____|
 *
 * $Id: htmltitle.cc,v 1.1.1.1 2008-09-23 16:32:05 hoffman Exp $
 */
// Get a web page, parse it with libxml.
//
// Written by Lars Nilsson
//
// GNU C++ compile command line suggestion (edit paths accordingly):
//
// g++ -Wall -I/opt/curl/include -I/opt/libxml/include/libxml2 htmltitle.cc \
// -o htmltitle -L/opt/curl/lib -L/opt/libxml/lib -lcurl -lxml2
24 #include <curl/curl.h>
25 #include <libxml/HTMLparser.h>
//
//  Case-insensitive string comparison
//
//  COMPARE(a, b) is true when the two C strings are equal ignoring case.
//  Exactly one definition is active per toolchain so the macro is never
//  redefined: stricmp under Microsoft's compiler, strcasecmp elsewhere.
//

#ifdef _MSC_VER
#define COMPARE(a, b) (!stricmp((a), (b)))
#else
#define COMPARE(a, b) (!strcasecmp((a), (b)))
#endif
//
//  libxml callback context structure
//
//  State shared by the SAX callbacks during one parse. addTitle is switched
//  on by StartElement and off by EndElement; while it is on,
//  handleCharacters appends incoming text to title.
//

struct Context
{
  Context(): addTitle(false) { }

  bool addTitle;      // true while text should be added to title
  std::string title;  // text collected so far
};
50 // libcurl variables for error strings and returned data
52 static char errorBuffer
[CURL_ERROR_SIZE
];
53 static std::string buffer
;
//
//  libcurl write callback function
//
//  Appends the size * nmemb bytes found at data to *writerData and returns
//  the number of bytes consumed. Returns 0 when writerData is null.
//
//  The return type is size_t (not int) because that is the type libcurl's
//  CURLOPT_WRITEFUNCTION contract uses for the consumed byte count; an int
//  cannot represent every size_t value.
//

static size_t writer(char *data, size_t size, size_t nmemb,
                     std::string *writerData)
{
  if (!writerData)
    return 0;

  const size_t bytes = size * nmemb;
  writerData->append(data, bytes);

  return bytes;
}
71 // libcurl connection initialization
74 static bool init(CURL
*&conn
, char *url
)
78 conn
= curl_easy_init();
82 fprintf(stderr
, "Failed to create CURL connection\n");
87 code
= curl_easy_setopt(conn
, CURLOPT_ERRORBUFFER
, errorBuffer
);
90 fprintf(stderr
, "Failed to set error buffer [%d]\n", code
);
95 code
= curl_easy_setopt(conn
, CURLOPT_URL
, url
);
98 fprintf(stderr
, "Failed to set URL [%s]\n", errorBuffer
);
103 code
= curl_easy_setopt(conn
, CURLOPT_FOLLOWLOCATION
, 1L);
104 if (code
!= CURLE_OK
)
106 fprintf(stderr
, "Failed to set redirect option [%s]\n", errorBuffer
);
111 code
= curl_easy_setopt(conn
, CURLOPT_WRITEFUNCTION
, writer
);
112 if (code
!= CURLE_OK
)
114 fprintf(stderr
, "Failed to set writer [%s]\n", errorBuffer
);
119 code
= curl_easy_setopt(conn
, CURLOPT_WRITEDATA
, &buffer
);
120 if (code
!= CURLE_OK
)
122 fprintf(stderr
, "Failed to set write data [%s]\n", errorBuffer
);
131 // libxml start element callback function
134 static void StartElement(void *voidContext
,
136 const xmlChar
**attributes
)
138 Context
*context
= (Context
*)voidContext
;
140 if (COMPARE((char *)name
, "TITLE"))
143 context
->addTitle
= true;
148 // libxml end element callback function
151 static void EndElement(void *voidContext
,
154 Context
*context
= (Context
*)voidContext
;
156 if (COMPARE((char *)name
, "TITLE"))
157 context
->addTitle
= false;
161 // Text handling helper function
164 static void handleCharacters(Context
*context
,
165 const xmlChar
*chars
,
168 if (context
->addTitle
)
169 context
->title
.append((char *)chars
, length
);
173 // libxml PCDATA callback function
176 static void Characters(void *voidContext
,
177 const xmlChar
*chars
,
180 Context
*context
= (Context
*)voidContext
;
182 handleCharacters(context
, chars
, length
);
186 // libxml CDATA callback function
189 static void cdata(void *voidContext
,
190 const xmlChar
*chars
,
193 Context
*context
= (Context
*)voidContext
;
195 handleCharacters(context
, chars
, length
);
199 // libxml SAX callback structure
202 static htmlSAXHandler saxHandler
=
234 // Parse given (assumed to be) HTML text and return the title
237 static void parseHtml(const std::string
&html
,
240 htmlParserCtxtPtr ctxt
;
243 ctxt
= htmlCreatePushParserCtxt(&saxHandler
, &context
, "", 0, "",
244 XML_CHAR_ENCODING_NONE
);
246 htmlParseChunk(ctxt
, html
.c_str(), html
.size(), 0);
247 htmlParseChunk(ctxt
, "", 0, 1);
249 htmlFreeParserCtxt(ctxt
);
251 title
= context
.title
;
254 int main(int argc
, char *argv
[])
260 // Ensure one argument is given
264 fprintf(stderr
, "Usage: %s <url>\n", argv
[0]);
269 curl_global_init(CURL_GLOBAL_DEFAULT
);
271 // Initialize CURL connection
273 if (!init(conn
, argv
[1]))
275 fprintf(stderr
, "Connection initializion failed\n");
280 // Retrieve content for the URL
282 code
= curl_easy_perform(conn
);
283 curl_easy_cleanup(conn
);
285 if (code
!= CURLE_OK
)
287 fprintf(stderr
, "Failed to get '%s' [%s]\n", argv
[1], errorBuffer
);
292 // Parse the (assumed) HTML code
294 parseHtml(buffer
, title
);
296 // Display the extracted title
298 printf("Title: %s\n", title
.c_str());