[gaim-migrate @ 5479]
[pidgin-git.git] / src / html.c
blobffaba15922b36c8d0d5b483f05ec883437b48db8
1 /*
2 * gaim
4 * Copyright (C) 1998-1999, Mark Spencer <markster@marko.net>
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22 #ifdef HAVE_CONFIG_H
23 #include <config.h>
24 #endif
25 #include <string.h>
26 #include <stdio.h>
27 #include <stdlib.h>
29 #ifndef _WIN32
30 #include <sys/time.h>
31 #include <unistd.h>
32 #include <sys/socket.h>
33 #include <netdb.h>
34 #include <netinet/in.h>
35 #endif
37 #include <sys/types.h>
38 #include <fcntl.h>
39 #include <errno.h>
40 #include "gaim.h"
41 #include "proxy.h"
43 #ifdef _WIN32
44 #include "win32dep.h"
45 #endif
47 gchar *strip_html(const gchar *text)
49 int i, j, k;
50 int visible = 1;
51 gchar *text2 = g_strdup(text);
53 if(!text)
54 return NULL;
56 for (i = 0, j = 0; text2[i]; i++) {
57 if (text2[i] == '<') {
58 k = i + 1;
59 if(g_ascii_isspace(text2[k])) {
60 visible = 1;
61 } else {
62 while (text2[k]) {
63 if (text2[k] == '<') {
64 visible = 1;
65 break;
67 if (text2[k] == '>') {
68 visible = 0;
69 break;
71 k++;
74 } else if (text2[i] == '>' && !visible) {
75 visible = 1;
76 continue;
78 if (text2[i] == '&' && strncasecmp(text2+i,"&quot;",6) == 0) {
79 text2[j++] = '\"';
80 i = i+5;
81 continue;
83 if (visible) {
84 text2[j++] = text2[i];
87 text2[j] = '\0';
88 return text2;
91 struct g_url *parse_url(char *url)
93 struct g_url *test = g_new0(struct g_url, 1);
94 char scan_info[255];
95 char port[5];
96 int f;
98 if (strstr(url, "http://"))
99 g_snprintf(scan_info, sizeof(scan_info),
100 "http://%%[A-Za-z0-9.]:%%[0-9]/%%[A-Za-z0-9.~_-/&%%?=+]");
101 else
102 g_snprintf(scan_info, sizeof(scan_info),
103 "%%[A-Za-z0-9.]:%%[0-9]/%%[A-Za-z0-9.~_-/&%%?=+^]");
104 f = sscanf(url, scan_info, test->address, port, test->page);
105 if (f == 1) {
106 if (strstr(url, "http://"))
107 g_snprintf(scan_info, sizeof(scan_info),
108 "http://%%[A-Za-z0-9.]/%%[A-Za-z0-9.~_-/&%%?=+^]");
109 else
110 g_snprintf(scan_info, sizeof(scan_info),
111 "%%[A-Za-z0-9.]/%%[A-Za-z0-9.~_-/&%%?=+^]");
112 f = sscanf(url, scan_info, test->address, test->page);
113 g_snprintf(port, sizeof(test->port), "80");
114 port[2] = 0;
116 if (f == 1) {
117 if (strstr(url, "http://"))
118 g_snprintf(scan_info, sizeof(scan_info), "http://%%[A-Za-z0-9.]");
119 else
120 g_snprintf(scan_info, sizeof(scan_info), "%%[A-Za-z0-9.]");
121 f = sscanf(url, scan_info, test->address);
122 g_snprintf(test->page, sizeof(test->page), "%c", '\0');
125 sscanf(port, "%d", &test->port);
126 return test;
129 struct grab_url_data {
130 void (* callback)(gpointer, char *, unsigned long);
131 gpointer data;
132 struct g_url *website;
133 char *url;
134 gboolean full;
136 int inpa;
138 gboolean sentreq;
139 gboolean newline;
140 gboolean startsaving;
141 char *webdata;
142 unsigned long len;
143 unsigned long data_len;
146 static gboolean
147 parse_redirect(const char *data, size_t data_len, gint sock,
148 struct grab_url_data *gunk)
150 gchar *s;
152 if ((s = g_strstr_len(data, data_len, "Location: ")) != NULL) {
153 gchar *new_url, *end;
154 int len;
156 s += strlen("Location: ");
157 end = strchr(s, '\r');
159 /* Just in case :) */
160 if (end == NULL)
161 end = strchr(s, '\n');
163 len = end - s;
165 new_url = g_malloc(len + 1);
166 strncpy(new_url, s, len);
167 new_url[len] = '\0';
169 /* Close the existing stuff. */
170 gaim_input_remove(gunk->inpa);
171 close(sock);
173 /* Try again, with this new location. */
174 grab_url(new_url, gunk->full, gunk->callback,
175 gunk->data);
177 /* Free up. */
178 g_free(new_url);
179 g_free(gunk->webdata);
180 g_free(gunk->website);
181 g_free(gunk->url);
182 g_free(gunk);
184 return TRUE;
187 return FALSE;
/*
 * Extract the value of the "Content-Length: " header from a block of HTTP
 * response headers.
 *
 * data need not be NUL-terminated; only the first data_len bytes are
 * examined.  The header is recognised anywhere in the block as long as it
 * starts a line (the old sscanf only matched when the buffer began with
 * it).  The header name is matched case-sensitively.
 *
 * Returns the parsed length, or 0 if the header is absent, has no digits,
 * or its value does not fit in a size_t.
 */
static size_t
parse_content_len(const char *data, size_t data_len)
{
	static const char header[] = "Content-Length: ";
	const size_t header_len = sizeof(header) - 1;
	size_t i;

	if (data == NULL || data_len < header_len)
		return 0;

	for (i = 0; i + header_len <= data_len; i++) {
		size_t content_len = 0;
		size_t pos;

		/* Must begin a line, so "X-Content-Length: " does not match. */
		if (i != 0 && data[i - 1] != '\n')
			continue;
		if (memcmp(data + i, header, header_len) != 0)
			continue;

		for (pos = i + header_len;
		     pos < data_len && data[pos] >= '0' && data[pos] <= '9';
		     pos++) {
			size_t digit = (size_t)(data[pos] - '0');

			/* Refuse values that would wrap around. */
			if (content_len > ((size_t)-1 - digit) / 10)
				return 0;
			content_len = content_len * 10 + digit;
		}

		return content_len;
	}

	return 0;
}
200 static void grab_url_callback(gpointer dat, gint sock, GaimInputCondition cond)
202 struct grab_url_data *gunk = dat;
203 char data;
205 if (sock == -1) {
206 gunk->callback(gunk->data, NULL, 0);
207 g_free(gunk->website);
208 g_free(gunk->url);
209 g_free(gunk);
210 return;
213 if (!gunk->sentreq) {
214 char buf[256];
216 g_snprintf(buf, sizeof(buf), "GET %s%s HTTP/1.0\r\n\r\n", gunk->full ? "" : "/",
217 gunk->full ? gunk->url : gunk->website->page);
218 debug_printf("Request: %s\n", buf);
220 write(sock, buf, strlen(buf));
221 fcntl(sock, F_SETFL, O_NONBLOCK);
222 gunk->sentreq = TRUE;
223 gunk->inpa = gaim_input_add(sock, GAIM_INPUT_READ, grab_url_callback, dat);
224 gunk->data_len = 4096;
225 gunk->webdata = g_malloc(gunk->data_len);
226 return;
229 if (read(sock, &data, 1) > 0 || errno == EWOULDBLOCK) {
230 if (errno == EWOULDBLOCK) {
231 errno = 0;
232 return;
235 gunk->len++;
237 if (gunk->len == gunk->data_len + 1) {
238 gunk->data_len += (gunk->data_len) / 2;
240 gunk->webdata = g_realloc(gunk->webdata, gunk->data_len);
243 gunk->webdata[gunk->len - 1] = data;
245 if (!gunk->startsaving) {
246 if (data == '\r')
247 return;
248 if (data == '\n') {
249 if (gunk->newline) {
250 size_t content_len;
251 gunk->startsaving = TRUE;
253 /* See if we can find a redirect. */
254 if (parse_redirect(gunk->webdata, gunk->len, sock, gunk))
255 return;
257 /* No redirect. See if we can find a content length. */
258 content_len = parse_content_len(gunk->webdata, gunk->len);
260 if (content_len == 0) {
261 /* We'll stick with an initial 8192 */
262 content_len = 8192;
265 /* Out with the old... */
266 gunk->len = 0;
267 g_free(gunk->webdata);
268 gunk->webdata = NULL;
270 /* In with the new. */
271 gunk->data_len = content_len;
272 gunk->webdata = g_malloc(gunk->data_len);
274 else
275 gunk->newline = TRUE;
276 return;
278 gunk->newline = FALSE;
280 } else if (errno != ETIMEDOUT) {
281 gunk->webdata = g_realloc(gunk->webdata, gunk->len + 1);
282 gunk->webdata[gunk->len] = 0;
284 debug_printf(_("Received: '%s'\n"), gunk->webdata);
286 gaim_input_remove(gunk->inpa);
287 close(sock);
288 gunk->callback(gunk->data, gunk->webdata, gunk->len);
289 if (gunk->webdata)
290 g_free(gunk->webdata);
291 g_free(gunk->website);
292 g_free(gunk->url);
293 g_free(gunk);
294 } else {
295 gaim_input_remove(gunk->inpa);
296 close(sock);
297 gunk->callback(gunk->data, NULL, 0);
298 if (gunk->webdata)
299 g_free(gunk->webdata);
300 g_free(gunk->website);
301 g_free(gunk->url);
302 g_free(gunk);
306 void grab_url(char *url, gboolean full, void callback(gpointer, char *, unsigned long), gpointer data)
308 int sock;
309 struct grab_url_data *gunk = g_new0(struct grab_url_data, 1);
311 gunk->callback = callback;
312 gunk->data = data;
313 gunk->url = g_strdup(url);
314 gunk->website = parse_url(url);
315 gunk->full = full;
317 if ((sock = proxy_connect(NULL, gunk->website->address, gunk->website->port,
318 grab_url_callback, gunk)) < 0) {
319 g_free(gunk->website);
320 g_free(gunk->url);
321 g_free(gunk);
322 callback(data, g_strdup(_("g003: Error opening connection.\n")), 0);
/* One currently-open tag tracked by html_to_xhtml(). */
struct gaim_parse_tag {
	/* Tag name as written in the input; matched against closing tags. */
	char *src_tag;
	/* Tag name written to the XHTML output when the tag is closed. */
	char *dest_tag;
};
/*
 * ALLOW_TAG_ALT(x, y): inside the parsing loop of html_to_xhtml(), accept
 * the opening tag named x and emit it as y.  x and y must be string
 * literals (they are pasted next to "<").
 *
 * The macro relies on these locals of the enclosing function: c (cursor
 * into the input), xhtml and plain (GString output buffers) and tags
 * (GList stack of open tags).  It ends each handled case with `continue`,
 * so it must be expanded directly inside the loop body.
 *
 * Two forms are recognised:
 *   "<x ...>"       - tag with attributes; the attribute text is copied
 *                     through unchanged.  If no '>' appears before the
 *                     next '<', the '<' is escaped as literal text.
 *   "<x>" / "<x/>"  - bare or self-closing tag.
 * Unless the tag is self-closing, a gaim_parse_tag is pushed on tags so a
 * matching close can be emitted later.
 *
 * The malformed-tag branch now advances c past the escaped '<'; without
 * that the enclosing loop re-examined the same character forever.
 */
#define ALLOW_TAG_ALT(x, y) if(!g_ascii_strncasecmp(c, "<" x " ", strlen("<" x " "))) { \
						const char *o = strchr(c+1, '<'); \
						const char *p = strchr(c+1, '>'); \
						if(p && (!o || p < o)) { \
							if(*(p-1) != '/') { \
								struct gaim_parse_tag *pt = g_new0(struct gaim_parse_tag, 1); \
								pt->src_tag = x; \
								pt->dest_tag = y; \
								tags = g_list_prepend(tags, pt); \
							} \
							xhtml = g_string_append(xhtml, "<" y); \
							c += strlen("<" x ); \
							xhtml = g_string_append_len(xhtml, c, (p - c) + 1); \
							c = p + 1; \
						} else { \
							xhtml = g_string_append(xhtml, "&lt;"); \
							plain = g_string_append_c(plain, '<'); \
							c++; \
						} \
						continue; \
					} \
					if(!g_ascii_strncasecmp(c, "<" x, strlen("<" x)) && \
							(*(c+strlen("<" x)) == '>' || \
							 !g_ascii_strncasecmp(c+strlen("<" x), "/>", 2))) { \
						xhtml = g_string_append(xhtml, "<" y); \
						c += strlen("<" x); \
						if(*c != '/') { \
							struct gaim_parse_tag *pt = g_new0(struct gaim_parse_tag, 1); \
							pt->src_tag = x; \
							pt->dest_tag = y; \
							tags = g_list_prepend(tags, pt); \
							xhtml = g_string_append_c(xhtml, '>'); \
						} else { \
							xhtml = g_string_append(xhtml, "/>");\
						} \
						c = strchr(c, '>') + 1; \
						continue; \
					}
/* ALLOW_TAG(x): accept tag x and emit it under the same name. */
#define ALLOW_TAG(x) ALLOW_TAG_ALT(x, x)
370 void html_to_xhtml(const char *html, char **xhtml_out, char **plain_out) {
371 GString *xhtml = g_string_new("");
372 GString *plain = g_string_new("");
373 GList *tags = NULL, *tag;
374 const char *q = NULL, *c = html;
375 while(*c) {
376 if(!q && (*c == '\"' || *c == '\'')) {
377 q = c;
378 xhtml = g_string_append_c(xhtml, *c);
379 plain = g_string_append_c(plain, *c);
380 c++;
381 } else if(q) {
382 if(*c == *q) {
383 q = NULL;
384 } else if(*c == '\\') {
385 xhtml = g_string_append_c(xhtml, *c);
386 plain = g_string_append_c(plain, *c);
387 c++;
389 xhtml = g_string_append_c(xhtml, *c);
390 plain = g_string_append_c(plain, *c);
391 c++;
392 } else if(*c == '<') {
393 if(*(c+1) == '/') { /* closing tag */
394 tag = tags;
395 while(tag) {
396 struct gaim_parse_tag *pt = tag->data;
397 if(!g_ascii_strncasecmp((c+2), pt->src_tag, strlen(pt->src_tag)) && *(c+strlen(pt->src_tag)+2) == '>') {
398 c += strlen(pt->src_tag) + 3;
399 break;
401 tag = tag->next;
403 if(tag) {
404 while(tags) {
405 struct gaim_parse_tag *pt = tags->data;
406 g_string_append_printf(xhtml, "</%s>", pt->dest_tag);
407 if(tags == tag)
408 break;
409 tags = g_list_remove(tags, pt);
410 g_free(pt);
412 g_free(tag->data);
413 tags = g_list_remove(tags, tag->data);
414 } else {
415 /* we tried to close a tag we never opened! escape it
416 * and move on */
417 xhtml = g_string_append(xhtml, "&lt;");
418 plain = g_string_append_c(plain, '<');
419 c++;
421 } else { /* opening tag */
422 ALLOW_TAG("a");
423 ALLOW_TAG_ALT("b", "strong");
424 ALLOW_TAG("blockquote");
425 ALLOW_TAG_ALT("bold", "strong");
426 ALLOW_TAG("br");
427 ALLOW_TAG("cite");
428 ALLOW_TAG("div");
429 ALLOW_TAG("em");
430 ALLOW_TAG("h1");
431 ALLOW_TAG("h2");
432 ALLOW_TAG("h3");
433 ALLOW_TAG("h4");
434 ALLOW_TAG("h5");
435 ALLOW_TAG("h6");
436 ALLOW_TAG("hr"); /* FIXME: not valid, need to skip?? */
437 ALLOW_TAG("html");
438 ALLOW_TAG_ALT("i", "em");
439 ALLOW_TAG_ALT("italic", "em");
440 ALLOW_TAG("li");
441 ALLOW_TAG("ol");
442 ALLOW_TAG("p");
443 ALLOW_TAG("pre");
444 ALLOW_TAG("q");
445 ALLOW_TAG("span");
446 ALLOW_TAG("strong");
447 ALLOW_TAG("ul");
449 if(!g_ascii_strncasecmp(c, "<u>", 2) || !g_ascii_strncasecmp(c, "<underline>", strlen("<underline>"))) {
450 struct gaim_parse_tag *pt = g_new0(struct gaim_parse_tag, 1);
451 pt->src_tag = *(c+2) == '>' ? "u" : "underline";
452 pt->dest_tag = "span";
453 tags = g_list_prepend(tags, pt);
454 c = strchr(c, '>') + 1;
455 xhtml = g_string_append(xhtml, "<span style='text-decoration: underline;'>");
456 continue;
458 if(!g_ascii_strncasecmp(c, "<s>", 2) || !g_ascii_strncasecmp(c, "<strike>", strlen("<strike>"))) {
459 struct gaim_parse_tag *pt = g_new0(struct gaim_parse_tag, 1);
460 pt->src_tag = *(c+2) == '>' ? "s" : "strike";
461 pt->dest_tag = "span";
462 tags = g_list_prepend(tags, pt);
463 c = strchr(c, '>') + 1;
464 xhtml = g_string_append(xhtml, "<span style='text-decoration: line-through;'>");
465 continue;
467 if(!g_ascii_strncasecmp(c, "<sub>", 5)) {
468 struct gaim_parse_tag *pt = g_new0(struct gaim_parse_tag, 1);
469 pt->src_tag = "sub";
470 pt->dest_tag = "span";
471 tags = g_list_prepend(tags, pt);
472 c = strchr(c, '>') + 1;
473 xhtml = g_string_append(xhtml, "<span style='vertical-align:sub;'>");
474 continue;
476 if(!g_ascii_strncasecmp(c, "<sup>", 5)) {
477 struct gaim_parse_tag *pt = g_new0(struct gaim_parse_tag, 1);
478 pt->src_tag = "sup";
479 pt->dest_tag = "span";
480 tags = g_list_prepend(tags, pt);
481 c = strchr(c, '>') + 1;
482 xhtml = g_string_append(xhtml, "<span style='vertical-align:super;'>");
483 continue;
485 if(!g_ascii_strncasecmp(c, "<font", 5) && (*(c+5) == '>' || *(c+5) == ' ')) {
486 const char *p = c;
487 GString *style = g_string_new("");
488 struct gaim_parse_tag *pt;
489 while(*p && *p != '>') {
490 if(!g_ascii_strncasecmp(p, "color=", strlen("color="))) {
491 const char *q = p + strlen("color=");
492 GString *color = g_string_new("");
493 if(*q == '\'' || *q == '\"')
494 q++;
495 while(*q && *q != '\"' && *q != '\'' && *q != ' ') {
496 color = g_string_append_c(color, *q);
497 q++;
499 g_string_append_printf(style, "color: %s; ", color->str);
500 g_string_free(color, TRUE);
501 p = q;
502 } else if(!g_ascii_strncasecmp(p, "face=", strlen("face="))) {
503 const char *q = p + strlen("face=");
504 gboolean space_allowed = FALSE;
505 GString *face = g_string_new("");
506 if(*q == '\'' || *q == '\"') {
507 space_allowed = TRUE;
508 q++;
510 while(*q && *q != '\"' && *q != '\'' && (space_allowed || *q != ' ')) {
511 face = g_string_append_c(face, *q);
512 q++;
514 g_string_append_printf(style, "font-family: %s; ", face->str);
515 g_string_free(face, TRUE);
516 p = q;
517 } else if(!g_ascii_strncasecmp(p, "size=", strlen("size="))) {
518 const char *q = p + strlen("size=");
519 int sz;
520 const char *size = "medium";
521 if(*q == '\'' || *q == '\"')
522 q++;
523 sz = atoi(q);
524 if(sz < 3)
525 size = "smaller";
526 else if(sz > 3)
527 size = "larger";
528 g_string_append_printf(style, "font-size: %s; ", size);
529 p = q;
531 p++;
533 c = strchr(c, '>') + 1;
534 pt = g_new0(struct gaim_parse_tag, 1);
535 pt->src_tag = "font";
536 pt->dest_tag = "span";
537 tags = g_list_prepend(tags, pt);
538 xhtml = g_string_append(xhtml, "<span");
539 if(style->len)
540 g_string_append_printf(xhtml, " style='%s'", style->str);
541 xhtml = g_string_append_c(xhtml, '>');
542 g_string_free(style, TRUE);
543 continue;
545 if(!g_ascii_strncasecmp(c, "<body ", 6)) {
546 const char *p = c;
547 gboolean did_something = FALSE;
548 while(*p && *p != '>') {
549 if(!g_ascii_strncasecmp(p, "bgcolor=", strlen("bgcolor="))) {
550 const char *q = p + strlen("bgcolor=");
551 struct gaim_parse_tag *pt = g_new0(struct gaim_parse_tag, 1);
552 GString *color = g_string_new("");
553 if(*q == '\'' || *q == '\"')
554 q++;
555 while(*q && *q != '\"' && *q != '\'' && *q != ' ') {
556 color = g_string_append_c(color, *q);
557 q++;
559 g_string_append_printf(xhtml, "<span style='background: %s;'>", color->str);
560 g_string_free(color, TRUE);
561 c = strchr(c, '>') + 1;
562 pt->src_tag = "body";
563 pt->dest_tag = "span";
564 tags = g_list_prepend(tags, pt);
565 did_something = TRUE;
566 break;
568 p++;
570 if(did_something) continue;
572 /* this has to come after the special case for bgcolor */
573 ALLOW_TAG("body");
574 if(!g_ascii_strncasecmp(c, "<!--", strlen("<!--"))) {
575 char *p = strstr(c + strlen("<!--"), "-->");
576 if(p) {
577 xhtml = g_string_append(xhtml, "<!--");
578 c += strlen("<!--");
579 continue;
583 xhtml = g_string_append(xhtml, "&lt;");
584 plain = g_string_append_c(plain, '<');
585 c++;
587 } else {
588 xhtml = g_string_append_c(xhtml, *c);
589 plain = g_string_append_c(plain, *c);
590 c++;
593 tag = tags;
594 while(tag) {
595 g_string_append_printf(xhtml, "</%s>", (char *)tag->data);
596 tag = tag->next;
598 g_list_free(tags);
599 if(xhtml_out)
600 *xhtml_out = g_strdup(xhtml->str);
601 if(plain_out)
602 *plain_out = g_strdup(plain->str);
603 g_string_free(xhtml, TRUE);
604 g_string_free(plain, TRUE);