[gaim-migrate @ 5900]
[pidgin-git.git] / src / html.c
blobed3a064c973cde6f43963ae6e99f9c51b2d12639
/*
 * gaim
 *
 * Copyright (C) 1998-1999, Mark Spencer <markster@marko.net>
 *               2003, Nathan Walp <faceprint@faceprint.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 *
 */
23 #ifdef HAVE_CONFIG_H
24 #include <config.h>
25 #endif
26 #include <string.h>
27 #include <stdio.h>
28 #include <stdlib.h>
30 #ifndef _WIN32
31 #include <sys/time.h>
32 #include <unistd.h>
33 #include <sys/socket.h>
34 #include <netdb.h>
35 #include <netinet/in.h>
36 #endif
38 #include <sys/types.h>
39 #include <fcntl.h>
40 #include <errno.h>
41 #include "gaim.h"
42 #include "proxy.h"
44 #ifdef _WIN32
45 #include "win32dep.h"
46 #endif
48 gchar *strip_html(const gchar *text)
50 int i, j, k;
51 int visible = 1;
52 gchar *text2 = g_strdup(text);
54 if(!text)
55 return NULL;
57 for (i = 0, j = 0; text2[i]; i++) {
58 if (text2[i] == '<') {
59 k = i + 1;
60 if(g_ascii_isspace(text2[k])) {
61 visible = 1;
62 } else {
63 while (text2[k]) {
64 if (text2[k] == '<') {
65 visible = 1;
66 break;
68 if (text2[k] == '>') {
69 visible = 0;
70 break;
72 k++;
75 } else if (text2[i] == '>' && !visible) {
76 visible = 1;
77 continue;
79 if (text2[i] == '&' && strncasecmp(text2+i,"&quot;",6) == 0) {
80 text2[j++] = '\"';
81 i = i+5;
82 continue;
84 if (visible) {
85 text2[j++] = text2[i];
88 text2[j] = '\0';
89 return text2;
92 struct g_url *parse_url(char *url)
94 struct g_url *test = (struct g_url*)malloc(sizeof(struct g_url));
95 char scan_info[255];
96 char port[5];
97 int f;
98 char* turl;
99 /* hyphen at end includes it in control set */
100 char addr_ctrl[] = "A-Za-z0-9.-";
101 char port_ctrl[] = "0-9";
102 char page_ctrl[] = "A-Za-z0-9.~_/&%%?=+^-";
104 if((turl=strstr(url, "http://")) || (turl=strstr(url, "HTTP://")))
105 url=turl+=7;
107 snprintf(scan_info, sizeof(scan_info),
108 "%%[%s]:%%[%s]/%%[%s]",
109 addr_ctrl, port_ctrl, page_ctrl);
111 f = sscanf(url, scan_info, test->address, port, test->page);
112 if (f == 1) {
113 snprintf(scan_info, sizeof(scan_info),
114 "%%[%s]/%%[%s]",
115 addr_ctrl, page_ctrl);
116 f = sscanf(url, scan_info, test->address, test->page);
117 snprintf(port, sizeof(port), "80");
119 if (f == 1)
120 snprintf(test->page, sizeof(test->page), "");
122 sscanf(port, "%d", &test->port);
123 return test;
126 struct grab_url_data {
127 void (* callback)(gpointer, char *, unsigned long);
128 gpointer data;
129 struct g_url *website;
130 char *url;
131 gboolean full;
133 int inpa;
135 gboolean sentreq;
136 gboolean newline;
137 gboolean startsaving;
138 char *webdata;
139 unsigned long len;
140 unsigned long data_len;
143 static gboolean
144 parse_redirect(const char *data, size_t data_len, gint sock,
145 struct grab_url_data *gunk)
147 gchar *s;
149 if ((s = g_strstr_len(data, data_len, "Location: ")) != NULL) {
150 gchar *new_url, *end;
151 int len;
153 s += strlen("Location: ");
154 end = strchr(s, '\r');
156 /* Just in case :) */
157 if (end == NULL)
158 end = strchr(s, '\n');
160 len = end - s;
162 new_url = g_malloc(len + 1);
163 strncpy(new_url, s, len);
164 new_url[len] = '\0';
166 /* Close the existing stuff. */
167 gaim_input_remove(gunk->inpa);
168 close(sock);
170 /* Try again, with this new location. */
171 grab_url(new_url, gunk->full, gunk->callback,
172 gunk->data);
174 /* Free up. */
175 g_free(new_url);
176 g_free(gunk->webdata);
177 g_free(gunk->website);
178 g_free(gunk->url);
179 g_free(gunk);
181 return TRUE;
184 return FALSE;
/**
 * Parses a "Content-Length:" header located at the very start of @data.
 *
 * The header name is compared case-insensitively, optional spaces or tabs
 * after the colon are skipped, and only the first @data_len bytes are
 * examined, so @data does not have to be NUL-terminated.  This function
 * does not search: if @data begins with anything else (for example the
 * HTTP status line) the result is 0.
 *
 * @param data      Bytes beginning at the header line to parse.
 * @param data_len  Number of valid bytes in @data.
 * @return The decimal value of the header, or 0 when the header is absent,
 *         has no digits, is negative, or does not fit in a size_t.
 */
static size_t
parse_content_len(const char *data, size_t data_len)
{
	static const char header[] = "content-length:";
	const size_t header_len = sizeof(header) - 1;
	size_t content_len = 0;
	size_t i;

	if (data == NULL || data_len <= header_len)
		return 0;

	for (i = 0; i < header_len; i++) {
		char ch = data[i];

		/* ASCII-only case folding; HTTP header names are ASCII. */
		if (ch >= 'A' && ch <= 'Z')
			ch = (char)(ch - 'A' + 'a');
		if (ch != header[i])
			return 0;
	}

	while (i < data_len && (data[i] == ' ' || data[i] == '\t'))
		i++;

	for (; i < data_len && data[i] >= '0' && data[i] <= '9'; i++) {
		size_t digit = (size_t)(data[i] - '0');

		/* Refuse values that would wrap around instead of returning a
		 * bogus small (or huge) length. */
		if (content_len > ((size_t)-1 - digit) / 10)
			return 0;
		content_len = content_len * 10 + digit;
	}

	return content_len;
}
197 static void grab_url_callback(gpointer dat, gint sock, GaimInputCondition cond)
199 struct grab_url_data *gunk = dat;
200 char data;
202 if (sock == -1) {
203 gunk->callback(gunk->data, NULL, 0);
204 g_free(gunk->website);
205 g_free(gunk->url);
206 g_free(gunk);
207 return;
210 if (!gunk->sentreq) {
211 char buf[256];
213 g_snprintf(buf, sizeof(buf), "GET %s%s HTTP/1.0\r\n\r\n", gunk->full ? "" : "/",
214 gunk->full ? gunk->url : gunk->website->page);
216 gaim_debug(GAIM_DEBUG_MISC, "grab_url_callback",
217 "Request: %s\n", buf);
219 write(sock, buf, strlen(buf));
220 fcntl(sock, F_SETFL, O_NONBLOCK);
221 gunk->sentreq = TRUE;
222 gunk->inpa = gaim_input_add(sock, GAIM_INPUT_READ, grab_url_callback, dat);
223 gunk->data_len = 4096;
224 gunk->webdata = g_malloc(gunk->data_len);
225 return;
228 if (read(sock, &data, 1) > 0 || errno == EWOULDBLOCK) {
229 if (errno == EWOULDBLOCK) {
230 errno = 0;
231 return;
234 gunk->len++;
236 if (gunk->len == gunk->data_len + 1) {
237 gunk->data_len += (gunk->data_len) / 2;
239 gunk->webdata = g_realloc(gunk->webdata, gunk->data_len);
242 gunk->webdata[gunk->len - 1] = data;
244 if (!gunk->startsaving) {
245 if (data == '\r')
246 return;
247 if (data == '\n') {
248 if (gunk->newline) {
249 size_t content_len;
250 gunk->startsaving = TRUE;
252 /* See if we can find a redirect. */
253 if (parse_redirect(gunk->webdata, gunk->len, sock, gunk))
254 return;
256 /* No redirect. See if we can find a content length. */
257 content_len = parse_content_len(gunk->webdata, gunk->len);
259 if (content_len == 0) {
260 /* We'll stick with an initial 8192 */
261 content_len = 8192;
264 /* Out with the old... */
265 gunk->len = 0;
266 g_free(gunk->webdata);
267 gunk->webdata = NULL;
269 /* In with the new. */
270 gunk->data_len = content_len;
271 gunk->webdata = g_malloc(gunk->data_len);
273 else
274 gunk->newline = TRUE;
275 return;
277 gunk->newline = FALSE;
279 } else if (errno != ETIMEDOUT) {
280 gunk->webdata = g_realloc(gunk->webdata, gunk->len + 1);
281 gunk->webdata[gunk->len] = 0;
283 gaim_debug(GAIM_DEBUG_MISC, "grab_url_callback",
284 "Received: '%s'\n", gunk->webdata);
286 gaim_input_remove(gunk->inpa);
287 close(sock);
288 gunk->callback(gunk->data, gunk->webdata, gunk->len);
289 if (gunk->webdata)
290 g_free(gunk->webdata);
291 g_free(gunk->website);
292 g_free(gunk->url);
293 g_free(gunk);
294 } else {
295 gaim_input_remove(gunk->inpa);
296 close(sock);
297 gunk->callback(gunk->data, NULL, 0);
298 if (gunk->webdata)
299 g_free(gunk->webdata);
300 g_free(gunk->website);
301 g_free(gunk->url);
302 g_free(gunk);
306 void grab_url(char *url, gboolean full, void callback(gpointer, char *, unsigned long), gpointer data)
308 int sock;
309 struct grab_url_data *gunk = g_new0(struct grab_url_data, 1);
311 gunk->callback = callback;
312 gunk->data = data;
313 gunk->url = g_strdup(url);
314 gunk->website = parse_url(url);
315 gunk->full = full;
317 if ((sock = proxy_connect(NULL, gunk->website->address, gunk->website->port,
318 grab_url_callback, gunk)) < 0) {
319 g_free(gunk->website);
320 g_free(gunk->url);
321 g_free(gunk);
322 callback(data, g_strdup(_("g003: Error opening connection.\n")), 0);
/* One currently open element on html_to_xhtml()'s tag stack. */
struct gaim_parse_tag {
	char *src_tag,		/* element name as it appears in the input HTML */
	     *dest_tag;		/* element name written to the XHTML output */
};
/*
 * ALLOW_TAG_ALT(x, y): accept the HTML element x and emit it as element y.
 *
 * Only usable inside the main loop of html_to_xhtml(): it reads and updates
 * the locals `c` (input cursor), `xhtml`, `plain` and `tags`, and ends with
 * `continue` when it consumed input.  For that reason it must NOT be wrapped
 * in do { } while (0) -- the `continue` has to reach the caller's loop.
 *
 * Two input forms are handled:
 *   "<x attrs...>"  attributes are copied through; quoted attribute values
 *                   are re-emitted via g_markup_escape_text().  If the tag
 *                   is unterminated or contains another '<', the leading
 *                   '<' is emitted as literal text instead.
 *   "<x>" / "<x/>"  copied through as "<y>" / "<y/>".
 * Unless the tag is self-closing, a gaim_parse_tag record is pushed onto
 * `tags` so that the matching end tag can be translated later.
 *
 * x and y must be string literals.
 */
#define ALLOW_TAG_ALT(x, y) \
	if(!g_ascii_strncasecmp(c, "<" x " ", strlen("<" x " "))) { \
		const char *o = c + strlen("<" x); \
		const char *p = NULL, *q = NULL, *r = NULL; \
		GString *innards = g_string_new(""); \
		while(*o) { \
			if(!q && (*o == '\"' || *o == '\'')) { \
				q = o; /* opening quote */ \
			} else if(q) { \
				if(*o == *q) { \
					/* closing quote: emit the escaped value */ \
					char *unescaped = g_strndup(q + 1, o - q - 1); \
					char *escaped = g_markup_escape_text(unescaped, -1); \
					g_string_append_printf(innards, "%c%s%c", *q, escaped, *q); \
					g_free(escaped); \
					g_free(unescaped); \
					q = NULL; \
				} \
			} else if(*o == '<') { \
				r = o; \
			} else if(*o == '>') { \
				p = o; \
				break; \
			} else { \
				innards = g_string_append_c(innards, *o); \
			} \
			o++; \
		} \
		if(p && !r) { \
			if(*(p - 1) != '/') { \
				struct gaim_parse_tag *pt = g_new0(struct gaim_parse_tag, 1); \
				pt->src_tag = x; \
				pt->dest_tag = y; \
				tags = g_list_prepend(tags, pt); \
			} \
			xhtml = g_string_append(xhtml, "<" y); \
			xhtml = g_string_append(xhtml, innards->str); \
			xhtml = g_string_append_c(xhtml, '>'); \
			c = p + 1; \
		} else { \
			xhtml = g_string_append(xhtml, "&lt;"); \
			plain = g_string_append_c(plain, '<'); \
			c++; \
		} \
		g_string_free(innards, TRUE); \
		continue; \
	} \
	if(!g_ascii_strncasecmp(c, "<" x, strlen("<" x)) && \
			(*(c + strlen("<" x)) == '>' || \
			 !g_ascii_strncasecmp(c + strlen("<" x), "/>", 2))) { \
		xhtml = g_string_append(xhtml, "<" y); \
		c += strlen("<" x); \
		if(*c != '/') { \
			struct gaim_parse_tag *pt = g_new0(struct gaim_parse_tag, 1); \
			pt->src_tag = x; \
			pt->dest_tag = y; \
			tags = g_list_prepend(tags, pt); \
			xhtml = g_string_append_c(xhtml, '>'); \
		} else { \
			xhtml = g_string_append(xhtml, "/>"); \
		} \
		c = strchr(c, '>') + 1; \
		continue; \
	}

/* ALLOW_TAG(x): accept element x and emit it under the same name. */
#define ALLOW_TAG(x) ALLOW_TAG_ALT(x, x)
396 void html_to_xhtml(const char *html, char **xhtml_out, char **plain_out) {
397 GString *xhtml = g_string_new("");
398 GString *plain = g_string_new("");
399 GList *tags = NULL, *tag;
400 const char *c = html;
402 while(c && *c) {
403 if(*c == '<') {
404 if(*(c+1) == '/') { /* closing tag */
405 tag = tags;
406 while(tag) {
407 struct gaim_parse_tag *pt = tag->data;
408 if(!g_ascii_strncasecmp((c+2), pt->src_tag, strlen(pt->src_tag)) && *(c+strlen(pt->src_tag)+2) == '>') {
409 c += strlen(pt->src_tag) + 3;
410 break;
412 tag = tag->next;
414 if(tag) {
415 while(tags) {
416 struct gaim_parse_tag *pt = tags->data;
417 g_string_append_printf(xhtml, "</%s>", pt->dest_tag);
418 if(tags == tag)
419 break;
420 tags = g_list_remove(tags, pt);
421 g_free(pt);
423 g_free(tag->data);
424 tags = g_list_remove(tags, tag->data);
425 } else {
426 /* we tried to close a tag we never opened! escape it
427 * and move on */
428 xhtml = g_string_append(xhtml, "&lt;");
429 plain = g_string_append_c(plain, '<');
430 c++;
432 } else { /* opening tag */
433 ALLOW_TAG("a");
434 ALLOW_TAG_ALT("b", "strong");
435 ALLOW_TAG("blockquote");
436 ALLOW_TAG_ALT("bold", "strong");
437 ALLOW_TAG("cite");
438 ALLOW_TAG("div");
439 ALLOW_TAG("em");
440 ALLOW_TAG("h1");
441 ALLOW_TAG("h2");
442 ALLOW_TAG("h3");
443 ALLOW_TAG("h4");
444 ALLOW_TAG("h5");
445 ALLOW_TAG("h6");
446 ALLOW_TAG("html");
447 ALLOW_TAG_ALT("i", "em");
448 ALLOW_TAG_ALT("italic", "em");
449 ALLOW_TAG("li");
450 ALLOW_TAG("ol");
451 ALLOW_TAG("p");
452 ALLOW_TAG("pre");
453 ALLOW_TAG("q");
454 ALLOW_TAG("span");
455 ALLOW_TAG("strong");
456 ALLOW_TAG("ul");
458 /* we skip <HR> because it's not legal in XHTML-IM. However,
459 * we still want to send something sensible, so we put a
460 * linebreak in its place. <BR> also needs special handling
461 * because putting a </BR> to close it would just be dumb. */
462 if((!g_ascii_strncasecmp(c, "<br", 3)
463 || !g_ascii_strncasecmp(c, "<hr", 3))
464 && (*(c+3) == '>' ||
465 !g_ascii_strncasecmp(c+3, "/>", 2) ||
466 !g_ascii_strncasecmp(c+3, " />", 3))) {
467 c = strchr(c, '>') + 1;
468 xhtml = g_string_append(xhtml, "<br/>");
469 if(*c != '\n')
470 plain = g_string_append_c(plain, '\n');
471 continue;
473 if(!g_ascii_strncasecmp(c, "<u>", 3) || !g_ascii_strncasecmp(c, "<underline>", strlen("<underline>"))) {
474 struct gaim_parse_tag *pt = g_new0(struct gaim_parse_tag, 1);
475 pt->src_tag = *(c+2) == '>' ? "u" : "underline";
476 pt->dest_tag = "span";
477 tags = g_list_prepend(tags, pt);
478 c = strchr(c, '>') + 1;
479 xhtml = g_string_append(xhtml, "<span style='text-decoration: underline;'>");
480 continue;
482 if(!g_ascii_strncasecmp(c, "<s>", 3) || !g_ascii_strncasecmp(c, "<strike>", strlen("<strike>"))) {
483 struct gaim_parse_tag *pt = g_new0(struct gaim_parse_tag, 1);
484 pt->src_tag = *(c+2) == '>' ? "s" : "strike";
485 pt->dest_tag = "span";
486 tags = g_list_prepend(tags, pt);
487 c = strchr(c, '>') + 1;
488 xhtml = g_string_append(xhtml, "<span style='text-decoration: line-through;'>");
489 continue;
491 if(!g_ascii_strncasecmp(c, "<sub>", 5)) {
492 struct gaim_parse_tag *pt = g_new0(struct gaim_parse_tag, 1);
493 pt->src_tag = "sub";
494 pt->dest_tag = "span";
495 tags = g_list_prepend(tags, pt);
496 c = strchr(c, '>') + 1;
497 xhtml = g_string_append(xhtml, "<span style='vertical-align:sub;'>");
498 continue;
500 if(!g_ascii_strncasecmp(c, "<sup>", 5)) {
501 struct gaim_parse_tag *pt = g_new0(struct gaim_parse_tag, 1);
502 pt->src_tag = "sup";
503 pt->dest_tag = "span";
504 tags = g_list_prepend(tags, pt);
505 c = strchr(c, '>') + 1;
506 xhtml = g_string_append(xhtml, "<span style='vertical-align:super;'>");
507 continue;
509 if(!g_ascii_strncasecmp(c, "<font", 5) && (*(c+5) == '>' || *(c+5) == ' ')) {
510 const char *p = c;
511 GString *style = g_string_new("");
512 struct gaim_parse_tag *pt;
513 while(*p && *p != '>') {
514 if(!g_ascii_strncasecmp(p, "color=", strlen("color="))) {
515 const char *q = p + strlen("color=");
516 GString *color = g_string_new("");
517 if(*q == '\'' || *q == '\"')
518 q++;
519 while(*q && *q != '\"' && *q != '\'' && *q != ' ') {
520 color = g_string_append_c(color, *q);
521 q++;
523 g_string_append_printf(style, "color: %s; ", color->str);
524 g_string_free(color, TRUE);
525 p = q;
526 } else if(!g_ascii_strncasecmp(p, "face=", strlen("face="))) {
527 const char *q = p + strlen("face=");
528 gboolean space_allowed = FALSE;
529 GString *face = g_string_new("");
530 if(*q == '\'' || *q == '\"') {
531 space_allowed = TRUE;
532 q++;
534 while(*q && *q != '\"' && *q != '\'' && (space_allowed || *q != ' ')) {
535 face = g_string_append_c(face, *q);
536 q++;
538 g_string_append_printf(style, "font-family: %s; ", face->str);
539 g_string_free(face, TRUE);
540 p = q;
541 } else if(!g_ascii_strncasecmp(p, "size=", strlen("size="))) {
542 const char *q = p + strlen("size=");
543 int sz;
544 const char *size = "medium";
545 if(*q == '\'' || *q == '\"')
546 q++;
547 sz = atoi(q);
548 if(sz < 3)
549 size = "smaller";
550 else if(sz > 3)
551 size = "larger";
552 g_string_append_printf(style, "font-size: %s; ", size);
553 p = q;
555 p++;
557 c = strchr(c, '>') + 1;
558 pt = g_new0(struct gaim_parse_tag, 1);
559 pt->src_tag = "font";
560 pt->dest_tag = "span";
561 tags = g_list_prepend(tags, pt);
562 xhtml = g_string_append(xhtml, "<span");
563 if(style->len)
564 g_string_append_printf(xhtml, " style='%s'", style->str);
565 xhtml = g_string_append_c(xhtml, '>');
566 g_string_free(style, TRUE);
567 continue;
569 if(!g_ascii_strncasecmp(c, "<body ", 6)) {
570 const char *p = c;
571 gboolean did_something = FALSE;
572 while(*p && *p != '>') {
573 if(!g_ascii_strncasecmp(p, "bgcolor=", strlen("bgcolor="))) {
574 const char *q = p + strlen("bgcolor=");
575 struct gaim_parse_tag *pt = g_new0(struct gaim_parse_tag, 1);
576 GString *color = g_string_new("");
577 if(*q == '\'' || *q == '\"')
578 q++;
579 while(*q && *q != '\"' && *q != '\'' && *q != ' ') {
580 color = g_string_append_c(color, *q);
581 q++;
583 g_string_append_printf(xhtml, "<span style='background: %s;'>", color->str);
584 g_string_free(color, TRUE);
585 c = strchr(c, '>') + 1;
586 pt->src_tag = "body";
587 pt->dest_tag = "span";
588 tags = g_list_prepend(tags, pt);
589 did_something = TRUE;
590 break;
592 p++;
594 if(did_something) continue;
596 /* this has to come after the special case for bgcolor */
597 ALLOW_TAG("body");
598 if(!g_ascii_strncasecmp(c, "<!--", strlen("<!--"))) {
599 char *p = strstr(c + strlen("<!--"), "-->");
600 if(p) {
601 xhtml = g_string_append(xhtml, "<!--");
602 c += strlen("<!--");
603 continue;
607 xhtml = g_string_append(xhtml, "&lt;");
608 plain = g_string_append_c(plain, '<');
609 c++;
611 } else {
612 xhtml = g_string_append_c(xhtml, *c);
613 plain = g_string_append_c(plain, *c);
614 c++;
617 tag = tags;
618 while(tag) {
619 g_string_append_printf(xhtml, "</%s>", (char *)tag->data);
620 tag = tag->next;
622 g_list_free(tags);
623 if(xhtml_out)
624 *xhtml_out = g_strdup(xhtml->str);
625 if(plain_out)
626 *plain_out = g_strdup(plain->str);
627 g_string_free(xhtml, TRUE);
628 g_string_free(plain, TRUE);