4 * Copyright (C) 1998-1999, Mark Spencer <markster@marko.net>
5 * 2003, Nathan Walp <faceprint@faceprint.com>
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
33 #include <sys/socket.h>
35 #include <netinet/in.h>
38 #include <sys/types.h>
/*
 * strip_html: works on a g_strdup copy of text (text2) and rewrites that copy
 * in place while scanning it.
 *
 * The visible branches react to the characters < and > (using a lookahead
 * index k and a flag named visible) and to an ampersand followed by a
 * six-character, case-insensitive match. The literal being compared appears
 * garbled in this listing. The last visible statement copies text2[i] to
 * text2[j++].
 *
 * NOTE(review): several lines of this function are not present in this
 * listing (the embedded line numbers skip), so the exact tag-skipping rules,
 * the termination of the result and the return statement cannot be confirmed
 * from here. Presumably the copy is returned and the caller must g_free it --
 * TODO confirm.
 */
48 gchar
*strip_html(const gchar
*text
)
52 gchar
*text2
= g_strdup(text
);
/* i is the read position, j the write position; stops at the NUL of text2 */
57 for (i
= 0, j
= 0; text2
[i
]; i
++) {
58 if (text2
[i
] == '<') {
60 if(g_ascii_isspace(text2
[k
])) {
64 if (text2
[k
] == '<') {
68 if (text2
[k
] == '>') {
75 } else if (text2
[i
] == '>' && !visible
) {
79 if (text2
[i
] == '&' && strncasecmp(text2
+i
,""",6) == 0) {
85 text2
[j
++] = text2
[i
];
/*
 * parse_url: allocates a struct g_url with malloc and fills its address,
 * page and port members from url using sscanf scan sets.
 *
 * A leading http:// or HTTP:// is located with strstr. The scan pattern is
 * assembled at run time from three character sets (address, port, page).
 * A second pattern is built from the address and page sets only, after which
 * the port text is set to 80. The page is set to an empty string on one
 * path, and the port text is finally converted into test->port.
 *
 * NOTE(review): the conditions that select between these paths, any check of
 * the malloc result and the return statement are not visible in this
 * listing. The scan-set conversions carry no field width, so they rely on
 * the destination buffers being large enough; the buffer sizes are not
 * visible here. The structure is allocated with malloc here while grab_url
 * releases it with g_free -- confirm that pairing is intended.
 */
92 struct g_url
*parse_url(char *url
)
94 struct g_url
*test
= (struct g_url
*)malloc(sizeof(struct g_url
));
99 /* hyphen at end includes it in control set */
100 char addr_ctrl
[] = "A-Za-z0-9.-";
101 char port_ctrl
[] = "0-9";
102 char page_ctrl
[] = "A-Za-z0-9.~_/&%%?=+^-";
104 if((turl
=strstr(url
, "http://")) || (turl
=strstr(url
, "HTTP://")))
/* First pattern: address, colon, port, slash, page */
107 snprintf(scan_info
, sizeof(scan_info
),
108 "%%[%s]:%%[%s]/%%[%s]",
109 addr_ctrl
, port_ctrl
, page_ctrl
);
111 f
= sscanf(url
, scan_info
, test
->address
, port
, test
->page
);
/*
 * Second pattern: built from the address and page sets only (its format
 * line is not visible in this listing); the port text then becomes 80.
 */
113 snprintf(scan_info
, sizeof(scan_info
),
115 addr_ctrl
, page_ctrl
);
116 f
= sscanf(url
, scan_info
, test
->address
, test
->page
);
117 snprintf(port
, sizeof(port
), "80");
120 snprintf(test
->page
, sizeof(test
->page
), "");
/* Convert the textual port into the numeric member */
122 sscanf(port
, "%d", &test
->port
);
/*
 * grab_url_data: per-request state shared by grab_url, grab_url_callback and
 * parse_redirect.
 *
 * Members visible in this listing:
 *   callback    - invoked with the user pointer, the received buffer (or
 *                 NULL) and a length when the request finishes or fails.
 *   website     - result of parse_url for the requested URL.
 *   startsaving - tested and set in grab_url_callback around the redirect
 *                 and content-length handling.
 *   data_len    - size passed to g_malloc / g_realloc for webdata.
 *
 * NOTE(review): other members used by the functions in this file (data, url,
 * full, inpa, sentreq, newline, webdata, len) are declared on lines that are
 * not visible in this listing.
 */
126 struct grab_url_data
{
127 void (* callback
)(gpointer
, char *, unsigned long);
129 struct g_url
*website
;
137 gboolean startsaving
;
140 unsigned long data_len
;
/*
 * parse_redirect: looks for a Location: header within the first data_len
 * bytes of data and, when one is found, restarts the fetch at that address.
 *
 * On a match the header value is delimited by the next carriage return (or
 * line feed as a fallback), copied into a new buffer, the current input
 * watch is removed, grab_url is called again with the new URL and the same
 * full flag and callback, and the webdata and website members of gunk are
 * freed.
 *
 * grab_url_callback uses the result as a truth value.
 *
 * NOTE(review): the return type, the computation of len, the remaining
 * grab_url arguments, and any release of new_url, gunk itself or sock are on
 * lines not visible in this listing.
 */
144 parse_redirect(const char *data
, size_t data_len
, gint sock
,
145 struct grab_url_data
*gunk
)
149 if ((s
= g_strstr_len(data
, data_len
, "Location: ")) != NULL
) {
150 gchar
*new_url
, *end
;
153 s
+= strlen("Location: ");
154 end
= strchr(s
, '\r');
156 /* Just in case :) */
158 end
= strchr(s
, '\n');
/*
 * Room for len bytes plus a terminator. strncpy alone does not terminate
 * the copy, so the terminator must be stored separately (that line is not
 * visible in this listing).
 */
162 new_url
= g_malloc(len
+ 1);
163 strncpy(new_url
, s
, len
);
166 /* Close the existing stuff. */
167 gaim_input_remove(gunk
->inpa
);
170 /* Try again, with this new location. */
171 grab_url(new_url
, gunk
->full
, gunk
->callback
,
176 g_free(gunk
->webdata
);
177 g_free(gunk
->website
);
/*
 * parse_content_len: tries to read the numeric value of a Content-Length:
 * header from data into content_len, which starts at 0.
 *
 * NOTE(review): sscanf matches the literal part of its format from the very
 * first character of data, so a value is stored only when data itself begins
 * with the header name. grab_url_callback passes the start of the received
 * buffer -- confirm whether a search for the header was intended.
 * NOTE(review): the %d conversion stores through an int pointer while
 * content_len is a size_t; the types do not match.
 * NOTE(review): data_len is not used in the visible lines; the return type
 * and return statement are not visible in this listing.
 */
188 parse_content_len(const char *data
, size_t data_len
)
190 size_t content_len
= 0;
192 sscanf(data
, "Content-Length: %d", &content_len
);
/*
 * grab_url_callback: connection and read handler for a grab_url request.
 * dat is the struct grab_url_data for the request and sock is the socket.
 *
 * Visible behaviour:
 *  - An early failure path reports NULL data to the user callback and frees
 *    the parsed URL (its condition is not visible in this listing).
 *  - The first time through (sentreq is false) an HTTP/1.0 GET request is
 *    written, the socket is switched to non-blocking mode, this function is
 *    registered as the read handler and a 4096 byte buffer is allocated.
 *  - Later calls read a single byte, growing the buffer by half of its size
 *    when needed and storing the byte at index len - 1.
 *  - Until startsaving is set, the data received so far is checked for a
 *    redirect and then for a content length, and the buffer is replaced by
 *    one of the reported size.
 *  - When the read fails with anything other than ETIMEDOUT, the buffer is
 *    terminated, logged and handed to the user callback; otherwise the
 *    callback receives NULL. In both cases the input watch is removed and
 *    webdata and website are freed.
 *
 * NOTE(review): cond is not used in the visible lines. Many lines of this
 * function (including where len is advanced, how the newline flag is used,
 * and whether gunk or the socket are released) are not visible here.
 */
197 static void grab_url_callback(gpointer dat
, gint sock
, GaimInputCondition cond
)
199 struct grab_url_data
*gunk
= dat
;
203 gunk
->callback(gunk
->data
, NULL
, 0);
204 g_free(gunk
->website
);
/* First invocation for this request: send the GET and set up reading */
210 if (!gunk
->sentreq
) {
213 g_snprintf(buf
, sizeof(buf
), "GET %s%s HTTP/1.0\r\n\r\n", gunk
->full
? "" : "/",
214 gunk
->full
? gunk
->url
: gunk
->website
->page
);
216 gaim_debug(GAIM_DEBUG_MISC
, "grab_url_callback",
217 "Request: %s\n", buf
);
/* NOTE(review): the result of write is not checked here */
219 write(sock
, buf
, strlen(buf
));
220 fcntl(sock
, F_SETFL
, O_NONBLOCK
);
221 gunk
->sentreq
= TRUE
;
222 gunk
->inpa
= gaim_input_add(sock
, GAIM_INPUT_READ
, grab_url_callback
, dat
);
223 gunk
->data_len
= 4096;
224 gunk
->webdata
= g_malloc(gunk
->data_len
);
/*
 * NOTE(review): errno is only meaningful after a failed call; when read
 * succeeds the tests of errno below may see a stale value.
 */
228 if (read(sock
, &data
, 1) > 0 || errno
== EWOULDBLOCK
) {
229 if (errno
== EWOULDBLOCK
) {
/* Buffer is full for the byte about to be stored: grow it by one half */
236 if (gunk
->len
== gunk
->data_len
+ 1) {
237 gunk
->data_len
+= (gunk
->data_len
) / 2;
239 gunk
->webdata
= g_realloc(gunk
->webdata
, gunk
->data_len
);
242 gunk
->webdata
[gunk
->len
- 1] = data
;
244 if (!gunk
->startsaving
) {
250 gunk
->startsaving
= TRUE
;
252 /* See if we can find a redirect. */
253 if (parse_redirect(gunk
->webdata
, gunk
->len
, sock
, gunk
))
256 /* No redirect. See if we can find a content length. */
257 content_len
= parse_content_len(gunk
->webdata
, gunk
->len
);
259 if (content_len
== 0) {
260 /* We'll stick with an initial 8192 */
264 /* Out with the old... */
266 g_free(gunk
->webdata
);
267 gunk
->webdata
= NULL
;
269 /* In with the new. */
270 gunk
->data_len
= content_len
;
271 gunk
->webdata
= g_malloc(gunk
->data_len
);
274 gunk
->newline
= TRUE
;
277 gunk
->newline
= FALSE
;
/* Read ended for a reason other than a timeout: deliver what was received */
279 } else if (errno
!= ETIMEDOUT
) {
280 gunk
->webdata
= g_realloc(gunk
->webdata
, gunk
->len
+ 1);
281 gunk
->webdata
[gunk
->len
] = 0;
283 gaim_debug(GAIM_DEBUG_MISC
, "grab_url_callback",
284 "Received: '%s'\n", gunk
->webdata
);
286 gaim_input_remove(gunk
->inpa
);
288 gunk
->callback(gunk
->data
, gunk
->webdata
, gunk
->len
);
290 g_free(gunk
->webdata
);
291 g_free(gunk
->website
);
/* Remaining path: report failure to the user callback with NULL data */
295 gaim_input_remove(gunk
->inpa
);
297 gunk
->callback(gunk
->data
, NULL
, 0);
299 g_free(gunk
->webdata
);
300 g_free(gunk
->website
);
/*
 * grab_url: starts an asynchronous fetch of url.
 *
 * url      - address to fetch; a copy is stored in the request state.
 * full     - stored in the request state; grab_url_callback uses it to
 *            choose between sending the whole url or only the parsed page
 *            in the GET line.
 * callback - called with data, the received buffer (or an error text) and a
 *            length.
 * data     - opaque pointer handed back to callback.
 *
 * A zero-initialised struct grab_url_data is allocated, the URL is parsed
 * with parse_url and a connection is requested through proxy_connect with
 * grab_url_callback as the handler. When proxy_connect returns a negative
 * value the parsed URL is freed and callback receives a newly duplicated
 * error string with length 0.
 *
 * NOTE(review): the assignments of the remaining request fields and any
 * further cleanup on the error path are not visible in this listing. Who
 * frees the duplicated error string is not visible here either.
 */
306 void grab_url(char *url
, gboolean full
, void callback(gpointer
, char *, unsigned long), gpointer data
)
309 struct grab_url_data
*gunk
= g_new0(struct grab_url_data
, 1);
311 gunk
->callback
= callback
;
313 gunk
->url
= g_strdup(url
);
314 gunk
->website
= parse_url(url
);
317 if ((sock
= proxy_connect(NULL
, gunk
->website
->address
, gunk
->website
->port
,
318 grab_url_callback
, gunk
)) < 0) {
319 g_free(gunk
->website
);
322 callback(data
, g_strdup(_("g003: Error opening connection.\n")), 0);
/*
 * gaim_parse_tag: one entry of the open-tag list kept by html_to_xhtml.
 * Code in this file reads and writes two members of it, src_tag (the tag
 * name as it appears in the input) and dest_tag (the tag name written to the
 * XHTML output).
 * NOTE(review): the member declarations are not visible in this listing.
 */
326 struct gaim_parse_tag
{
/*
 * ALLOW_TAG_ALT(x, y): handles an opening tag named x at the cursor c and
 * writes it to the output as tag y. ALLOW_TAG(x) is the same with y equal
 * to x.
 *
 * The expansion uses the locals c, xhtml, plain and tags of the enclosing
 * function.
 *
 * First form (tag name followed by a space): the rest of the tag is scanned
 * from o and collected into the GString innards. Quoted attribute values are
 * copied out, passed through g_markup_escape_text and appended between their
 * original quote characters. Unless the character before the closing
 * position p is a slash, a gaim_parse_tag is pushed onto tags. The output
 * receives the opening of tag y, the collected innards and a closing angle
 * bracket. Another visible path appends a single character to both xhtml and
 * plain instead, and innards is freed.
 *
 * Second form (tag name directly followed by the closing bracket or by a
 * slash and bracket): the opening of tag y is appended, then either a
 * gaim_parse_tag is pushed and a closing bracket appended, or a self-closing
 * ending is appended, and c is moved past the next closing bracket.
 *
 * NOTE(review): several lines of the macro are not visible in this listing,
 * including the loop that drives the scan and the statements that end each
 * branch.
 */
331 #define ALLOW_TAG_ALT(x, y) if(!g_ascii_strncasecmp(c, "<" x " ", strlen("<" x " "))) { \
332 const char *o = c + strlen("<" x); \
333 const char *p = NULL, *q = NULL, *r = NULL; \
334 GString *innards = g_string_new(""); \
336 if(!q && (*o == '\"' || *o == '\'') ) { \
340 char *unescaped = g_strndup(q+1, o-q-1); \
341 char *escaped = g_markup_escape_text(unescaped, -1); \
342 g_string_append_printf(innards, "%c%s%c", *q, escaped, *q); \
344 } else if(*c == '\\') { \
347 } else if(*o == '<') { \
349 } else if(*o == '>') { \
353 innards = g_string_append_c(innards, *o); \
358 if(*(p-1) != '/') { \
359 struct gaim_parse_tag *pt = g_new0(struct gaim_parse_tag, 1); \
362 tags = g_list_prepend(tags, pt); \
364 xhtml = g_string_append(xhtml, "<" y); \
365 c += strlen("<" x ); \
366 xhtml = g_string_append(xhtml, innards->str); \
367 xhtml = g_string_append_c(xhtml, '>'); \
370 xhtml = g_string_append(xhtml, "<"); \
371 plain = g_string_append_c(plain, '<'); \
374 g_string_free(innards, TRUE); \
377 if(!g_ascii_strncasecmp(c, "<" x, strlen("<" x)) && \
378 (*(c+strlen("<" x)) == '>' || \
379 !g_ascii_strncasecmp(c+strlen("<" x), "/>", 2))) { \
380 xhtml = g_string_append(xhtml, "<" y); \
381 c += strlen("<" x); \
383 struct gaim_parse_tag *pt = g_new0(struct gaim_parse_tag, 1); \
386 tags = g_list_prepend(tags, pt); \
387 xhtml = g_string_append_c(xhtml, '>'); \
389 xhtml = g_string_append(xhtml, "/>");\
391 c = strchr(c, '>') + 1; \
394 #define ALLOW_TAG(x) ALLOW_TAG_ALT(x, x)
/*
 * html_to_xhtml: converts the markup in html into an XHTML rendering and a
 * plain-text rendering.
 *
 * html      - input text, walked with the cursor c.
 * xhtml_out - receives a g_strdup copy of the accumulated XHTML string.
 * plain_out - receives a g_strdup copy of the accumulated plain string.
 *
 * Open tags are tracked in the list tags, newest first, as gaim_parse_tag
 * entries.
 *
 * Closing tags: the name after the slash is compared case-insensitively with
 * the src_tag of tracked entries; on a match the cursor skips the tag,
 * closing tags are written using dest_tag, and entries are removed from the
 * list. An unmatched closing tag takes a path that appends a single
 * character to both outputs.
 *
 * Opening tags: simple tags go through ALLOW_TAG / ALLOW_TAG_ALT (for
 * example b and bold become strong, i and italic become em). br and hr both
 * produce a br element in the XHTML and a newline in the plain text.
 * u/underline, s/strike, sub and sup become span elements with a matching
 * style. font collects its color, face and size attributes into a style
 * string for a span. body with a bgcolor attribute becomes a span with a
 * background style. An HTML comment opener is searched for its terminator.
 *
 * Other input: one visible path appends the current character to both
 * outputs.
 *
 * At the end a closing tag is written for each entry left in the list and
 * both GStrings are freed after being copied.
 *
 * NOTE(review): the final loop passes tag->data to a string conversion
 * through a char pointer cast, while everywhere else tag->data holds a
 * struct gaim_parse_tag pointer; printing the dest_tag member was probably
 * intended -- confirm.
 * NOTE(review): several branches advance c with strchr(c, ...) + 1 after an
 * attribute scan that can also stop at the end of the string, so they assume
 * a closing bracket is present -- confirm the input guarantees that.
 * NOTE(review): many lines of this function are not visible in this listing
 * (the embedded line numbers skip), including the main loop header, several
 * ALLOW_TAG uses, the size mapping for font, and the release of the list
 * entries.
 */
396 void html_to_xhtml(const char *html
, char **xhtml_out
, char **plain_out
) {
397 GString
*xhtml
= g_string_new("");
398 GString
*plain
= g_string_new("");
399 GList
*tags
= NULL
, *tag
;
400 const char *c
= html
;
404 if(*(c
+1) == '/') { /* closing tag */
407 struct gaim_parse_tag
*pt
= tag
->data
;
408 if(!g_ascii_strncasecmp((c
+2), pt
->src_tag
, strlen(pt
->src_tag
)) && *(c
+strlen(pt
->src_tag
)+2) == '>') {
409 c
+= strlen(pt
->src_tag
) + 3;
416 struct gaim_parse_tag
*pt
= tags
->data
;
417 g_string_append_printf(xhtml
, "</%s>", pt
->dest_tag
);
420 tags
= g_list_remove(tags
, pt
);
424 tags
= g_list_remove(tags
, tag
->data
);
426 /* we tried to close a tag we never opened! escape it
428 xhtml
= g_string_append(xhtml
, "<");
429 plain
= g_string_append_c(plain
, '<');
432 } else { /* opening tag */
434 ALLOW_TAG_ALT("b", "strong");
435 ALLOW_TAG("blockquote");
436 ALLOW_TAG_ALT("bold", "strong");
447 ALLOW_TAG_ALT("i", "em");
448 ALLOW_TAG_ALT("italic", "em");
458 /* we skip <HR> because it's not legal in XHTML-IM. However,
459 * we still want to send something sensible, so we put a
460 * linebreak in its place. <BR> also needs special handling
461 * because putting a </BR> to close it would just be dumb. */
462 if((!g_ascii_strncasecmp(c
, "<br", 3)
463 || !g_ascii_strncasecmp(c
, "<hr", 3))
465 !g_ascii_strncasecmp(c
+3, "/>", 2) ||
466 !g_ascii_strncasecmp(c
+3, " />", 3))) {
467 c
= strchr(c
, '>') + 1;
468 xhtml
= g_string_append(xhtml
, "<br/>");
470 plain
= g_string_append_c(plain
, '\n');
/*
 * Underline tags become a span with an underline style; the source tag name
 * is remembered so the matching closing tag can be recognised later.
 */
473 if(!g_ascii_strncasecmp(c
, "<u>", 3) || !g_ascii_strncasecmp(c
, "<underline>", strlen("<underline>"))) {
474 struct gaim_parse_tag
*pt
= g_new0(struct gaim_parse_tag
, 1);
475 pt
->src_tag
= *(c
+2) == '>' ? "u" : "underline";
476 pt
->dest_tag
= "span";
477 tags
= g_list_prepend(tags
, pt
);
478 c
= strchr(c
, '>') + 1;
479 xhtml
= g_string_append(xhtml
, "<span style='text-decoration: underline;'>");
/* Strike tags become a span with a line-through style */
482 if(!g_ascii_strncasecmp(c
, "<s>", 3) || !g_ascii_strncasecmp(c
, "<strike>", strlen("<strike>"))) {
483 struct gaim_parse_tag
*pt
= g_new0(struct gaim_parse_tag
, 1);
484 pt
->src_tag
= *(c
+2) == '>' ? "s" : "strike";
485 pt
->dest_tag
= "span";
486 tags
= g_list_prepend(tags
, pt
);
487 c
= strchr(c
, '>') + 1;
488 xhtml
= g_string_append(xhtml
, "<span style='text-decoration: line-through;'>");
/* Subscript becomes a span with a vertical-align style */
491 if(!g_ascii_strncasecmp(c
, "<sub>", 5)) {
492 struct gaim_parse_tag
*pt
= g_new0(struct gaim_parse_tag
, 1);
494 pt
->dest_tag
= "span";
495 tags
= g_list_prepend(tags
, pt
);
496 c
= strchr(c
, '>') + 1;
497 xhtml
= g_string_append(xhtml
, "<span style='vertical-align:sub;'>");
/* Superscript becomes a span with a vertical-align style */
500 if(!g_ascii_strncasecmp(c
, "<sup>", 5)) {
501 struct gaim_parse_tag
*pt
= g_new0(struct gaim_parse_tag
, 1);
503 pt
->dest_tag
= "span";
504 tags
= g_list_prepend(tags
, pt
);
505 c
= strchr(c
, '>') + 1;
506 xhtml
= g_string_append(xhtml
, "<span style='vertical-align:super;'>");
/*
 * Font tag: color, face and size attributes are gathered into a style
 * string that is attached to a span.
 */
509 if(!g_ascii_strncasecmp(c
, "<font", 5) && (*(c
+5) == '>' || *(c
+5) == ' ')) {
511 GString
*style
= g_string_new("");
512 struct gaim_parse_tag
*pt
;
513 while(*p
&& *p
!= '>') {
514 if(!g_ascii_strncasecmp(p
, "color=", strlen("color="))) {
515 const char *q
= p
+ strlen("color=");
516 GString
*color
= g_string_new("");
517 if(*q
== '\'' || *q
== '\"')
519 while(*q
&& *q
!= '\"' && *q
!= '\'' && *q
!= ' ') {
520 color
= g_string_append_c(color
, *q
);
523 g_string_append_printf(style
, "color: %s; ", color
->str
);
524 g_string_free(color
, TRUE
);
526 } else if(!g_ascii_strncasecmp(p
, "face=", strlen("face="))) {
527 const char *q
= p
+ strlen("face=");
528 gboolean space_allowed
= FALSE
;
529 GString
*face
= g_string_new("");
530 if(*q
== '\'' || *q
== '\"') {
531 space_allowed
= TRUE
;
534 while(*q
&& *q
!= '\"' && *q
!= '\'' && (space_allowed
|| *q
!= ' ')) {
535 face
= g_string_append_c(face
, *q
);
538 g_string_append_printf(style
, "font-family: %s; ", face
->str
);
539 g_string_free(face
, TRUE
);
541 } else if(!g_ascii_strncasecmp(p
, "size=", strlen("size="))) {
542 const char *q
= p
+ strlen("size=");
544 const char *size
= "medium";
545 if(*q
== '\'' || *q
== '\"')
552 g_string_append_printf(style
, "font-size: %s; ", size
);
557 c
= strchr(c
, '>') + 1;
558 pt
= g_new0(struct gaim_parse_tag
, 1);
559 pt
->src_tag
= "font";
560 pt
->dest_tag
= "span";
561 tags
= g_list_prepend(tags
, pt
);
562 xhtml
= g_string_append(xhtml
, "<span");
564 g_string_append_printf(xhtml
, " style='%s'", style
->str
);
565 xhtml
= g_string_append_c(xhtml
, '>');
566 g_string_free(style
, TRUE
);
569 if(!g_ascii_strncasecmp(c
, "<body ", 6)) {
571 gboolean did_something
= FALSE
;
572 while(*p
&& *p
!= '>') {
573 if(!g_ascii_strncasecmp(p
, "bgcolor=", strlen("bgcolor="))) {
574 const char *q
= p
+ strlen("bgcolor=");
575 struct gaim_parse_tag
*pt
= g_new0(struct gaim_parse_tag
, 1);
576 GString
*color
= g_string_new("");
577 if(*q
== '\'' || *q
== '\"')
579 while(*q
&& *q
!= '\"' && *q
!= '\'' && *q
!= ' ') {
580 color
= g_string_append_c(color
, *q
);
583 g_string_append_printf(xhtml
, "<span style='background: %s;'>", color
->str
);
584 g_string_free(color
, TRUE
);
585 c
= strchr(c
, '>') + 1;
586 pt
->src_tag
= "body";
587 pt
->dest_tag
= "span";
588 tags
= g_list_prepend(tags
, pt
);
589 did_something
= TRUE
;
594 if(did_something
) continue;
596 /* this has to come after the special case for bgcolor */
598 if(!g_ascii_strncasecmp(c
, "<!--", strlen("<!--"))) {
599 char *p
= strstr(c
+ strlen("<!--"), "-->");
601 xhtml
= g_string_append(xhtml
, "<!--");
607 xhtml
= g_string_append(xhtml
, "<");
608 plain
= g_string_append_c(plain
, '<');
612 xhtml
= g_string_append_c(xhtml
, *c
);
613 plain
= g_string_append_c(plain
, *c
);
619 g_string_append_printf(xhtml
, "</%s>", (char *)tag
->data
);
624 *xhtml_out
= g_strdup(xhtml
->str
);
626 *plain_out
= g_strdup(plain
->str
);
627 g_string_free(xhtml
, TRUE
);
628 g_string_free(plain
, TRUE
);