/*
 * Copyright (C) 1998-1999, Mark Spencer <markster@marko.net>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */
32 #include <sys/socket.h>
34 #include <netinet/in.h>
37 #include <sys/types.h>
/*
 * strip_html: operates on a g_strdup() copy of text (text2) and compacts
 * that copy in place, reading at index i and writing at index j.
 *
 * Steps visible in this view, per character of the copy:
 *   - at a less-than sign, the characters from index k are tested for
 *     white space, for another less-than sign and for a greater-than sign;
 *   - a greater-than sign met while the visible flag is clear takes its
 *     own branch;
 *   - an ampersand that begins a six-character, case-insensitive match
 *     is special-cased;
 *   - otherwise text2[i] is copied down to text2[j++].
 *
 * NOTE(review): the declarations, the bodies of most branches and the
 * return statement are not visible here, so how each branch changes the
 * visible flag and what is returned must be confirmed in the full file.
 * NOTE(review): the string literal handed to strncasecmp looks damaged;
 * given the compare length of 6 it was presumably the HTML entity for a
 * double quote - confirm and restore.
 */
47 gchar
*strip_html(const gchar
*text
)
51 gchar
*text2
= g_strdup(text
);
/* i scans the copy; j marks where the next kept character is written. */
56 for (i
= 0, j
= 0; text2
[i
]; i
++) {
57 if (text2
[i
] == '<') {
59 if(g_ascii_isspace(text2
[k
])) {
63 if (text2
[k
] == '<') {
67 if (text2
[k
] == '>') {
74 } else if (text2
[i
] == '>' && !visible
) {
78 if (text2
[i
] == '&' && strncasecmp(text2
+i
,""",6) == 0) {
84 text2
[j
++] = text2
[i
];
/*
 * parse_url: allocates a zeroed struct g_url with g_new0() and fills it by
 * running sscanf() over url with up to three patterns, each built into
 * scan_info with g_snprintf() (the doubled percent signs become single
 * ones in the resulting scanf format):
 *   1. host, port text and page;
 *   2. host and page, with the port text then set to 80;
 *   3. host only, with the page then set to the empty string.
 * Every pattern has a second form with the http scheme prefix, selected
 * when strstr() finds that prefix anywhere in url. The port text is
 * finally converted with sscanf and stored in test->port.
 *
 * NOTE(review): the declarations of scan_info, port and f, the conditions
 * that choose between the three attempts (presumably tests on f) and the
 * return statement are not visible here - confirm in the full file.
 * NOTE(review): the fallback write into the local port buffer is sized
 * with sizeof(test->port), i.e. the size of a different object.
 * NOTE(review): none of the bracket conversions carries a field width, so
 * nothing visible bounds the writes into test->address, test->page or
 * port.
 * NOTE(review): the page character set of the first prefixed pattern
 * lacks the caret that the other page sets include.
 */
91 struct g_url
*parse_url(char *url
)
93 struct g_url
*test
= g_new0(struct g_url
, 1);
/* Attempt 1: host, port text and page. */
98 if (strstr(url
, "http://"))
99 g_snprintf(scan_info
, sizeof(scan_info
),
100 "http://%%[A-Za-z0-9.]:%%[0-9]/%%[A-Za-z0-9.~_-/&%%?=+]");
102 g_snprintf(scan_info
, sizeof(scan_info
),
103 "%%[A-Za-z0-9.]:%%[0-9]/%%[A-Za-z0-9.~_-/&%%?=+^]");
104 f
= sscanf(url
, scan_info
, test
->address
, port
, test
->page
);
/* Attempt 2: host and page; the port text falls back to 80. */
106 if (strstr(url
, "http://"))
107 g_snprintf(scan_info
, sizeof(scan_info
),
108 "http://%%[A-Za-z0-9.]/%%[A-Za-z0-9.~_-/&%%?=+^]");
110 g_snprintf(scan_info
, sizeof(scan_info
),
111 "%%[A-Za-z0-9.]/%%[A-Za-z0-9.~_-/&%%?=+^]");
112 f
= sscanf(url
, scan_info
, test
->address
, test
->page
);
113 g_snprintf(port
, sizeof(test
->port
), "80");
/* Attempt 3: host only; the page becomes the empty string. */
117 if (strstr(url
, "http://"))
118 g_snprintf(scan_info
, sizeof(scan_info
), "http://%%[A-Za-z0-9.]");
120 g_snprintf(scan_info
, sizeof(scan_info
), "%%[A-Za-z0-9.]");
121 f
= sscanf(url
, scan_info
, test
->address
);
122 g_snprintf(test
->page
, sizeof(test
->page
), "%c", '\0');
/* Convert the port text into the integer member of the result. */
125 sscanf(port
, "%d", &test
->port
);
/*
 * struct grab_url_data: per-request state handed to grab_url_callback.
 *
 * Members visible in this view:
 *   callback     completion function; it is invoked with gunk->data plus
 *                either the received buffer and its length or NULL and 0
 *   website      result of parse_url() for the requested URL
 *   startsaving  set to TRUE by grab_url_callback before it looks for a
 *                redirect and a content length
 *   data_len     current allocation size of the webdata buffer
 *
 * NOTE(review): grab_url and grab_url_callback also use members named
 * data, url, full, inpa, sentreq, newline, webdata and len; their
 * declarations are not visible here.
 */
129 struct grab_url_data
{
130 void (* callback
)(gpointer
, char *, unsigned long);
132 struct g_url
*website
;
140 gboolean startsaving
;
143 unsigned long data_len
;
/*
 * parse_redirect: looks for a Location header inside the first data_len
 * bytes of data and, when one is found, restarts the fetch at that URL.
 *
 * Visible steps: g_strstr_len() locates the header name, s is advanced
 * past it, the end of the value is searched with strchr() for a carriage
 * return and then for a line feed, the value is copied into a freshly
 * allocated new_url of len + 1 bytes, the input watch gunk->inpa is
 * removed, grab_url() is called with new_url and the flags and callback
 * of the current request, and gunk->webdata and gunk->website are freed.
 *
 * The caller tests the result as a truth value.
 *
 * NOTE(review): the return type, the computation of len, any terminator
 * written after strncpy(), the remaining grab_url() arguments and the
 * return statements are not visible here - confirm in the full file.
 * NOTE(review): the header search is bounded by data_len but both
 * strchr() calls rely on a terminating NUL in data; confirm the buffer is
 * terminated when this runs.
 */
147 parse_redirect(const char *data
, size_t data_len
, gint sock
,
148 struct grab_url_data
*gunk
)
152 if ((s
= g_strstr_len(data
, data_len
, "Location: ")) != NULL
) {
153 gchar
*new_url
, *end
;
156 s
+= strlen("Location: ");
157 end
= strchr(s
, '\r');
159 /* Just in case :) */
161 end
= strchr(s
, '\n');
165 new_url
= g_malloc(len
+ 1);
166 strncpy(new_url
, s
, len
);
169 /* Close the existing stuff. */
170 gaim_input_remove(gunk
->inpa
);
173 /* Try again, with this new location. */
174 grab_url(new_url
, gunk
->full
, gunk
->callback
,
179 g_free(gunk
->webdata
);
180 g_free(gunk
->website
);
/*
 * parse_content_len: read the decimal value of a Content-Length header
 * that begins at the first byte of data.
 *
 * data      bytes received so far; does not need to be NUL-terminated
 * data_len  number of valid bytes in data; nothing past it is read
 *
 * Returns the parsed value, or 0 when data is NULL, does not begin with
 * the header name, carries no digits after it, or holds a value that does
 * not fit in a size_t.
 *
 * The previous implementation passed a size_t pointer to the %d
 * conversion of sscanf(), which is a format/argument mismatch, and let
 * sscanf() scan the buffer without consulting data_len. Parsing by hand
 * removes both problems.
 *
 * NOTE(review): as before, the header is recognised only at offset 0. The
 * caller passes its whole receive buffer, so a buffer that starts with
 * anything else still yields 0. Searching for the header further in would
 * change the size the caller allocates from this result and should be
 * done together with a review of that caller.
 * NOTE(review): the linkage of this function is not visible in this view;
 * add a static qualifier if the full file declares it that way.
 */
size_t
parse_content_len(const char *data, size_t data_len)
{
	static const char header[] = "Content-Length:";
	const size_t header_len = sizeof(header) - 1;
	const size_t max_len = (size_t)-1;
	size_t content_len = 0;
	size_t pos;

	if (data == NULL || data_len < header_len)
		return 0;

	for (pos = 0; pos < header_len; pos++) {
		if (data[pos] != header[pos])
			return 0;
	}

	/* Blanks between the colon and the number are optional. */
	while (pos < data_len && (data[pos] == ' ' || data[pos] == '\t'))
		pos++;

	for (; pos < data_len && data[pos] >= '0' && data[pos] <= '9'; pos++) {
		size_t digit = (size_t)(data[pos] - '0');

		/* Refuse values that would wrap around instead of truncating. */
		if (content_len > (max_len - digit) / 10)
			return 0;
		content_len = content_len * 10 + digit;
	}

	return content_len;
}
/*
 * grab_url_callback: input handler that drives one HTTP fetch described
 * by the struct grab_url_data passed as dat.
 *
 * Visible behaviour:
 *   - an early path reports failure through gunk->callback with NULL and
 *     0 and frees gunk->website (its condition is not visible here,
 *     presumably a failed connection - confirm);
 *   - while gunk->sentreq is unset, a GET request for HTTP/1.0 is built
 *     from gunk->url or gunk->website->page depending on gunk->full, is
 *     logged and written to sock, the socket flags are set to
 *     O_NONBLOCK, this function is registered for read events and a
 *     4096 byte buffer is allocated;
 *   - afterwards one byte is read per invocation, the buffer grows by
 *     half its size when full, and the byte is stored at len - 1;
 *   - until startsaving is set the bytes are treated as the response
 *     head: parse_redirect() and parse_content_len() are consulted and
 *     the buffer is replaced by one of content_len bytes;
 *   - when reading stops for a reason other than ETIMEDOUT the buffer is
 *     NUL-terminated, passed to gunk->callback with its length and then
 *     freed together with gunk->website;
 *   - otherwise the watch is removed, the callback receives NULL and 0
 *     and the same members are freed.
 *
 * NOTE(review): many lines are missing from this view (declarations,
 * early returns, the len increment, the newline handling), so the exact
 * ordering must be confirmed in the full file.
 * NOTE(review): the result of write() is not checked, and F_SETFL is
 * given O_NONBLOCK alone rather than the existing flags plus O_NONBLOCK.
 * NOTE(review): errno is compared with EWOULDBLOCK even when read()
 * succeeded, so a stale errno value could be observed.
 * NOTE(review): an existing comment mentions an initial 8192 while the
 * visible initial allocation is 4096.
 */
200 static void grab_url_callback(gpointer dat
, gint sock
, GaimInputCondition cond
)
202 struct grab_url_data
*gunk
= dat
;
206 gunk
->callback(gunk
->data
, NULL
, 0);
207 g_free(gunk
->website
);
/* Request not sent yet: send it, then wait for the socket to be readable. */
213 if (!gunk
->sentreq
) {
216 g_snprintf(buf
, sizeof(buf
), "GET %s%s HTTP/1.0\r\n\r\n", gunk
->full
? "" : "/",
217 gunk
->full
? gunk
->url
: gunk
->website
->page
);
218 debug_printf("Request: %s\n", buf
);
220 write(sock
, buf
, strlen(buf
));
221 fcntl(sock
, F_SETFL
, O_NONBLOCK
);
222 gunk
->sentreq
= TRUE
;
223 gunk
->inpa
= gaim_input_add(sock
, GAIM_INPUT_READ
, grab_url_callback
, dat
);
224 gunk
->data_len
= 4096;
225 gunk
->webdata
= g_malloc(gunk
->data_len
);
/* A single byte is requested from the socket per call. */
229 if (read(sock
, &data
, 1) > 0 || errno
== EWOULDBLOCK
) {
230 if (errno
== EWOULDBLOCK
) {
/* Grow the buffer by half its current size once len has passed data_len. */
237 if (gunk
->len
== gunk
->data_len
+ 1) {
238 gunk
->data_len
+= (gunk
->data_len
) / 2;
240 gunk
->webdata
= g_realloc(gunk
->webdata
, gunk
->data_len
);
243 gunk
->webdata
[gunk
->len
- 1] = data
;
245 if (!gunk
->startsaving
) {
251 gunk
->startsaving
= TRUE
;
253 /* See if we can find a redirect. */
254 if (parse_redirect(gunk
->webdata
, gunk
->len
, sock
, gunk
))
257 /* No redirect. See if we can find a content length. */
258 content_len
= parse_content_len(gunk
->webdata
, gunk
->len
);
260 if (content_len
== 0) {
261 /* We'll stick with an initial 8192 */
265 /* Out with the old... */
267 g_free(gunk
->webdata
);
268 gunk
->webdata
= NULL
;
270 /* In with the new. */
271 gunk
->data_len
= content_len
;
272 gunk
->webdata
= g_malloc(gunk
->data_len
);
275 gunk
->newline
= TRUE
;
278 gunk
->newline
= FALSE
;
280 } else if (errno
!= ETIMEDOUT
) {
281 gunk
->webdata
= g_realloc(gunk
->webdata
, gunk
->len
+ 1);
282 gunk
->webdata
[gunk
->len
] = 0;
284 debug_printf(_("Received: '%s'\n"), gunk
->webdata
);
286 gaim_input_remove(gunk
->inpa
);
288 gunk
->callback(gunk
->data
, gunk
->webdata
, gunk
->len
);
290 g_free(gunk
->webdata
);
291 g_free(gunk
->website
);
295 gaim_input_remove(gunk
->inpa
);
297 gunk
->callback(gunk
->data
, NULL
, 0);
299 g_free(gunk
->webdata
);
300 g_free(gunk
->website
);
/*
 * grab_url: starts an asynchronous fetch of url.
 *
 * url       address to fetch; it is duplicated with g_strdup() and also
 *           parsed with parse_url()
 * full      stored flag; grab_url_callback uses it to choose between
 *           requesting the whole url and only the parsed page
 * callback  invoked with data, a buffer (or NULL) and a length
 * data      opaque pointer handed back to callback
 *
 * A zeroed struct grab_url_data is allocated for the request and
 * proxy_connect() is asked to connect to the parsed address and port with
 * grab_url_callback as the handler. If proxy_connect() returns a negative
 * value, gunk->website is freed and callback receives a newly duplicated
 * error message with length 0.
 *
 * NOTE(review): the assignments of gunk->data and gunk->full and the
 * remaining cleanup on the failure path are not visible here - confirm
 * in the full file.
 * NOTE(review): the error string is allocated with g_strdup(); who
 * releases it is not visible here.
 */
306 void grab_url(char *url
, gboolean full
, void callback(gpointer
, char *, unsigned long), gpointer data
)
309 struct grab_url_data
*gunk
= g_new0(struct grab_url_data
, 1);
311 gunk
->callback
= callback
;
313 gunk
->url
= g_strdup(url
);
314 gunk
->website
= parse_url(url
);
317 if ((sock
= proxy_connect(NULL
, gunk
->website
->address
, gunk
->website
->port
,
318 grab_url_callback
, gunk
)) < 0) {
319 g_free(gunk
->website
);
322 callback(data
, g_strdup(_("g003: Error opening connection.\n")), 0);
/*
 * struct gaim_parse_tag: one entry of the open-tag list kept by
 * html_to_xhtml.
 *
 * NOTE(review): the member declarations are not visible here.
 * html_to_xhtml uses members named src_tag and dest_tag, assigns string
 * literals to them, and prints dest_tag when it closes a tag.
 */
326 struct gaim_parse_tag
{
/*
 * ALLOW_TAG_ALT(x, y): tag-matching helper for html_to_xhtml. It expects
 * the names c, tags, xhtml and plain to be in scope where it is expanded.
 * x is the source tag name and y the tag name written to xhtml; both must
 * be string literals because they are pasted next to other literals.
 *
 * Two forms are visible:
 *   - the tag followed by a space (a tag with attributes): the next
 *     greater-than sign must come before any further less-than sign; the
 *     opening of y is then appended, followed by the original text up to
 *     and including that greater-than sign, and a gaim_parse_tag entry
 *     is pushed unless the tag ends in a slash; otherwise a less-than
 *     sign is appended to both outputs;
 *   - the bare tag, closed directly or self-closed: the opening of y is
 *     appended, then either a pushed entry plus a greater-than sign or a
 *     self-closing terminator, and c is moved past the next greater-than
 *     sign.
 *
 * ALLOW_TAG(x) is the same helper with identical source and output names.
 *
 * NOTE(review): several continuation lines are missing from this view
 * (assignments to the pushed entry, else branches, continue statements);
 * confirm the exact flow in the full file.
 */
331 #define ALLOW_TAG_ALT(x, y) if(!g_ascii_strncasecmp(c, "<" x " ", strlen("<" x " "))) { \
332 char *o = strchr(c+1, '<'); \
333 char *p = strchr(c+1, '>'); \
334 if(p && (!o || p < o)) { \
335 if(*(p-1) != '/') { \
336 struct gaim_parse_tag *pt = g_new0(struct gaim_parse_tag, 1); \
339 tags = g_list_prepend(tags, pt); \
341 xhtml = g_string_append(xhtml, "<" y); \
342 c += strlen("<" x ); \
343 xhtml = g_string_append_len(xhtml, c, (p - c) + 1); \
346 xhtml = g_string_append(xhtml, "<"); \
347 plain = g_string_append_c(plain, '<'); \
351 if(!g_ascii_strncasecmp(c, "<" x, strlen("<" x)) && \
352 (*(c+strlen("<" x)) == '>' || \
353 !g_ascii_strncasecmp(c+strlen("<" x), "/>", 2))) { \
354 xhtml = g_string_append(xhtml, "<" y); \
355 c += strlen("<" x); \
357 struct gaim_parse_tag *pt = g_new0(struct gaim_parse_tag, 1); \
360 tags = g_list_prepend(tags, pt); \
361 xhtml = g_string_append_c(xhtml, '>'); \
363 xhtml = g_string_append(xhtml, "/>");\
365 c = strchr(c, '>') + 1; \
368 #define ALLOW_TAG(x) ALLOW_TAG_ALT(x, x)
/*
 * html_to_xhtml: walks html through the cursor c and builds two GStrings
 * side by side: xhtml (converted markup) and plain (text only).
 *
 * html       input markup
 * xhtml_out  receives a g_strdup() copy of the converted markup
 * plain_out  receives a g_strdup() copy of the plain text
 *
 * Both GStrings are freed before returning, so the two outputs are the
 * only allocations handed to the caller.
 *
 * Open tags are kept in the GList tags, newest first (g_list_prepend);
 * each entry is a struct gaim_parse_tag holding the source tag name and
 * the tag name written in its place.
 *
 * Handling visible in this view:
 *   - quote characters and backslashes are copied to both outputs;
 *   - a closing tag is compared, ignoring case, with the src_tag of
 *     tracked entries; closing markup is written from dest_tag and
 *     entries are removed from the list; a closing tag that matches
 *     nothing results in a less-than sign in both outputs;
 *   - opening tags go through ALLOW_TAG / ALLOW_TAG_ALT (b and bold
 *     become strong, i and italic become em, blockquote and hr keep
 *     their names);
 *   - u/underline, s/strike, sub and sup become a span with an inline
 *     style;
 *   - font: the color, face and size attributes are collected into a
 *     style string that is attached to a span;
 *   - body with a bgcolor attribute becomes a span with a background;
 *   - the comment opener is recognised and its terminator searched with
 *     strstr();
 *   - any other character is copied to both outputs;
 *   - after the walk, closing markup is written for what is left in the
 *     list.
 *
 * NOTE(review): a large share of the lines (loops, else branches,
 * continue statements, several assignments) is missing from this view;
 * confirm the exact flow in the full file.
 * NOTE(review): the cursor is advanced with strchr() plus one in several
 * places without a check for a NULL result; nothing visible guarantees
 * that a greater-than sign follows, for example after a font tag that
 * matched on a trailing space.
 * NOTE(review): an existing comment speaks of escaping an unopened
 * closing tag while the visible xhtml literal is a bare less-than sign;
 * presumably an entity was lost - confirm.
 */
370 void html_to_xhtml(const char *html
, char **xhtml_out
, char **plain_out
) {
371 GString
*xhtml
= g_string_new("");
372 GString
*plain
= g_string_new("");
373 GList
*tags
= NULL
, *tag
;
374 const char *q
= NULL
, *c
= html
;
376 if(!q
&& (*c
== '\"' || *c
== '\'')) {
378 xhtml
= g_string_append_c(xhtml
, *c
);
379 plain
= g_string_append_c(plain
, *c
);
384 } else if(*c
== '\\') {
385 xhtml
= g_string_append_c(xhtml
, *c
);
386 plain
= g_string_append_c(plain
, *c
);
389 xhtml
= g_string_append_c(xhtml
, *c
);
390 plain
= g_string_append_c(plain
, *c
);
392 } else if(*c
== '<') {
393 if(*(c
+1) == '/') { /* closing tag */
396 struct gaim_parse_tag
*pt
= tag
->data
;
397 if(!g_ascii_strncasecmp((c
+2), pt
->src_tag
, strlen(pt
->src_tag
)) && *(c
+strlen(pt
->src_tag
)+2) == '>') {
398 c
+= strlen(pt
->src_tag
) + 3;
405 struct gaim_parse_tag
*pt
= tags
->data
;
406 g_string_append_printf(xhtml
, "</%s>", pt
->dest_tag
);
409 tags
= g_list_remove(tags
, pt
);
413 tags
= g_list_remove(tags
, tag
->data
);
415 /* we tried to close a tag we never opened! escape it
417 xhtml
= g_string_append(xhtml
, "<");
418 plain
= g_string_append_c(plain
, '<');
421 } else { /* opening tag */
423 ALLOW_TAG_ALT("b", "strong");
424 ALLOW_TAG("blockquote");
425 ALLOW_TAG_ALT("bold", "strong");
436 ALLOW_TAG("hr"); /* FIXME: not valid, need to skip?? */
438 ALLOW_TAG_ALT("i", "em");
439 ALLOW_TAG_ALT("italic", "em");
/* NOTE(review): only the first 2 characters are compared for the short
 * underline tag, so any tag that begins with those 2 characters passes
 * this test - confirm the intended length. */
449 if(!g_ascii_strncasecmp(c
, "<u>", 2) || !g_ascii_strncasecmp(c
, "<underline>", strlen("<underline>"))) {
450 struct gaim_parse_tag
*pt
= g_new0(struct gaim_parse_tag
, 1);
451 pt
->src_tag
= *(c
+2) == '>' ? "u" : "underline";
452 pt
->dest_tag
= "span";
453 tags
= g_list_prepend(tags
, pt
);
454 c
= strchr(c
, '>') + 1;
455 xhtml
= g_string_append(xhtml
, "<span style='text-decoration: underline;'>");
/* NOTE(review): same 2-character comparison as for the underline tag, so
 * any tag that begins with those 2 characters passes - confirm. */
458 if(!g_ascii_strncasecmp(c
, "<s>", 2) || !g_ascii_strncasecmp(c
, "<strike>", strlen("<strike>"))) {
459 struct gaim_parse_tag
*pt
= g_new0(struct gaim_parse_tag
, 1);
460 pt
->src_tag
= *(c
+2) == '>' ? "s" : "strike";
461 pt
->dest_tag
= "span";
462 tags
= g_list_prepend(tags
, pt
);
463 c
= strchr(c
, '>') + 1;
464 xhtml
= g_string_append(xhtml
, "<span style='text-decoration: line-through;'>");
467 if(!g_ascii_strncasecmp(c
, "<sub>", 5)) {
468 struct gaim_parse_tag
*pt
= g_new0(struct gaim_parse_tag
, 1);
470 pt
->dest_tag
= "span";
471 tags
= g_list_prepend(tags
, pt
);
472 c
= strchr(c
, '>') + 1;
473 xhtml
= g_string_append(xhtml
, "<span style='vertical-align:sub;'>");
476 if(!g_ascii_strncasecmp(c
, "<sup>", 5)) {
477 struct gaim_parse_tag
*pt
= g_new0(struct gaim_parse_tag
, 1);
479 pt
->dest_tag
= "span";
480 tags
= g_list_prepend(tags
, pt
);
481 c
= strchr(c
, '>') + 1;
482 xhtml
= g_string_append(xhtml
, "<span style='vertical-align:super;'>");
/* Font tag: each recognised attribute adds one declaration to style. */
485 if(!g_ascii_strncasecmp(c
, "<font", 5) && (*(c
+5) == '>' || *(c
+5) == ' ')) {
487 GString
*style
= g_string_new("");
488 struct gaim_parse_tag
*pt
;
489 while(*p
&& *p
!= '>') {
490 if(!g_ascii_strncasecmp(p
, "color=", strlen("color="))) {
491 const char *q
= p
+ strlen("color=");
492 GString
*color
= g_string_new("");
493 if(*q
== '\'' || *q
== '\"')
495 while(*q
&& *q
!= '\"' && *q
!= '\'' && *q
!= ' ') {
496 color
= g_string_append_c(color
, *q
);
499 g_string_append_printf(style
, "color: %s; ", color
->str
);
500 g_string_free(color
, TRUE
);
502 } else if(!g_ascii_strncasecmp(p
, "face=", strlen("face="))) {
503 const char *q
= p
+ strlen("face=");
504 gboolean space_allowed
= FALSE
;
505 GString
*face
= g_string_new("");
506 if(*q
== '\'' || *q
== '\"') {
507 space_allowed
= TRUE
;
510 while(*q
&& *q
!= '\"' && *q
!= '\'' && (space_allowed
|| *q
!= ' ')) {
511 face
= g_string_append_c(face
, *q
);
514 g_string_append_printf(style
, "font-family: %s; ", face
->str
);
515 g_string_free(face
, TRUE
);
517 } else if(!g_ascii_strncasecmp(p
, "size=", strlen("size="))) {
518 const char *q
= p
+ strlen("size=");
520 const char *size
= "medium";
521 if(*q
== '\'' || *q
== '\"')
528 g_string_append_printf(style
, "font-size: %s; ", size
);
533 c
= strchr(c
, '>') + 1;
534 pt
= g_new0(struct gaim_parse_tag
, 1);
535 pt
->src_tag
= "font";
536 pt
->dest_tag
= "span";
537 tags
= g_list_prepend(tags
, pt
);
538 xhtml
= g_string_append(xhtml
, "<span");
540 g_string_append_printf(xhtml
, " style='%s'", style
->str
);
541 xhtml
= g_string_append_c(xhtml
, '>');
542 g_string_free(style
, TRUE
);
545 if(!g_ascii_strncasecmp(c
, "<body ", 6)) {
547 gboolean did_something
= FALSE
;
548 while(*p
&& *p
!= '>') {
549 if(!g_ascii_strncasecmp(p
, "bgcolor=", strlen("bgcolor="))) {
550 const char *q
= p
+ strlen("bgcolor=");
551 struct gaim_parse_tag
*pt
= g_new0(struct gaim_parse_tag
, 1);
552 GString
*color
= g_string_new("");
553 if(*q
== '\'' || *q
== '\"')
555 while(*q
&& *q
!= '\"' && *q
!= '\'' && *q
!= ' ') {
556 color
= g_string_append_c(color
, *q
);
559 g_string_append_printf(xhtml
, "<span style='background: %s;'>", color
->str
);
560 g_string_free(color
, TRUE
);
561 c
= strchr(c
, '>') + 1;
562 pt
->src_tag
= "body";
563 pt
->dest_tag
= "span";
564 tags
= g_list_prepend(tags
, pt
);
565 did_something
= TRUE
;
570 if(did_something
) continue;
572 /* this has to come after the special case for bgcolor */
574 if(!g_ascii_strncasecmp(c
, "<!--", strlen("<!--"))) {
575 char *p
= strstr(c
+ strlen("<!--"), "-->");
577 xhtml
= g_string_append(xhtml
, "<!--");
583 xhtml
= g_string_append(xhtml
, "<");
584 plain
= g_string_append_c(plain
, '<');
588 xhtml
= g_string_append_c(xhtml
, *c
);
589 plain
= g_string_append_c(plain
, *c
);
/* NOTE(review): tag->data is printed here as a string, while the closing
 * tag handling above reads tag->data as a struct gaim_parse_tag pointer -
 * confirm which member should be printed. */
595 g_string_append_printf(xhtml
, "</%s>", (char *)tag
->data
);
600 *xhtml_out
= g_strdup(xhtml
->str
);
602 *plain_out
= g_strdup(plain
->str
);
603 g_string_free(xhtml
, TRUE
);
604 g_string_free(plain
, TRUE
);