webcit/html2html.c

   1 /*
   2  * $Id$
   3  */
   4 /**
   5  * \defgroup HTML2HTML Output an HTML message, modifying it slightly to make sure it plays nice
   6  * with the rest of our web framework.
   7  * \ingroup WebcitHttpServer
   8  */
   9 /*@{*/
  10 #include "webcit.h"
  11 #include "webserver.h"
  12
  13
  14 /**
  15  * \brief       Strip surrounding single or double quotes from a string.
  16  *
  17  * \param s     String to be stripped.
  18  */
  19 void stripquotes(char *s)
  20 {
  21         int len;
  22
  23         if (!s) return;
  24
  25         len = strlen(s);
  26         if (len < 2) return;
  27
  28         if ( ( (s[0] == '\"') && (s[len-1] == '\"') ) || ( (s[0] == '\'') && (s[len-1] == '\'') ) ) {
  29                 s[len-1] = 0;
  30                 strcpy(s, &s[1]);
  31         }
  32 }
  33
  34
  35 /**
  36  * \brief Check to see if a META tag has overridden the declared MIME character set.
  37  *
  38  * \param charset               Character set name (left unchanged if we don't do anything)
  39  * \param meta_http_equiv       Content of the "http-equiv" portion of the META tag
  40  * \param meta_content          Content of the "content" portion of the META tag
  41  */
  42 void extract_charset_from_meta(char *charset, char *meta_http_equiv, char *meta_content)
  43 {
  44         char *ptr;
  45         char buf[64];
  46
  47         if (!charset) return;
  48         if (!meta_http_equiv) return;
  49         if (!meta_content) return;
  50
  51
  52         if (strcasecmp(meta_http_equiv, "Content-type")) return;
  53
  54         ptr = strchr(meta_content, ';');
  55         if (!ptr) return;
  56
  57         safestrncpy(buf, ++ptr, sizeof buf);
  58         striplt(buf);
  59         if (!strncasecmp(buf, "charset=", 8)) {
  60                 strcpy(charset, &buf[8]);
  61
  62                 /*
  63                  * The brain-damaged webmail program in Microsoft Exchange declares
  64                  * a charset of "unicode" when they really mean "UTF-8".  GNU iconv
  65                  * treats "unicode" as an alias for "UTF-16" so we have to manually
  66                  * fix this here, otherwise messages generated in Exchange webmail
  67                  * show up as a big pile of weird characters.
  68                  */
  69                 if (!strcasecmp(charset, "unicode")) {
  70                         strcpy(charset, "UTF-8");
  71                 }
  72
  73         }
  74 }
  75
  76
  77
  78 /**
  79  * \brief Sanitize and enhance an HTML message for display.
  80  *        Also convert weird character sets to UTF-8 if necessary.
  81  *        Also fixup img src="cid:..." type inline images to fetch the image
  82  *
  83  * \param supplied_charset the input charset as declared in the MIME headers
  84  */
  85 void output_html(const char *supplied_charset, int treat_as_wiki, int msgnum, StrBuf *Source, StrBuf *Target) {
  86         char buf[SIZ];
  87         char *msg;
  88         char *ptr;
  89         char *msgstart;
  90         char *msgend;
  91         StrBuf *converted_msg;
  92         int buffer_length = 1;
  93         int line_length = 0;
  94         int content_length = 0;
  95         char new_window[SIZ];
  96         int brak = 0;
  97         int alevel = 0;
  98         int scriptlevel = 0;
  99         int script_start_pos = (-1);
 100         int i;
 101         int linklen;
 102         char charset[128];
 103 #ifdef HAVE_ICONV
 104         iconv_t ic = (iconv_t)(-1) ;
 105         char *ibuf;                   /**< Buffer of characters to be converted */
 106         char *obuf;                   /**< Buffer for converted characters      */
 107         size_t ibuflen;               /**< Length of input buffer               */
 108         size_t obuflen;               /**< Length of output buffer              */
 109         char *osav;                   /**< Saved pointer to output buffer       */
 110 #endif
 111         if (Target == NULL)
 112                 Target = WC->WBuf;
 113
 114         safestrncpy(charset, supplied_charset, sizeof charset);
 115         msg = strdup("");
 116         sprintf(new_window, "<a target=\"%s\" href=", TARGET);
 117
 118         if (Source == NULL) while (serv_getln(buf, sizeof buf), strcmp(buf, "000")) {
 119                 line_length = strlen(buf);
 120                 buffer_length = content_length + line_length + 2;
 121                 ptr = realloc(msg, buffer_length);
 122                 if (ptr == NULL) {
 123                         StrBufAppendPrintf(Target, "<b>");
 124                         StrBufAppendPrintf(Target, _("realloc() error! couldn't get %d bytes: %s"),
 125                                 buffer_length + 1,
 126                                 strerror(errno));
 127                         StrBufAppendPrintf(Target, "</b><br /><br />\n");
 128                         while (serv_getln(buf, sizeof buf), strcmp(buf, "000")) {
 129                                 /** flush */
 130                         }
 131                         free(msg);
 132                         return;
 133                 }
 134                 msg = ptr;
 135                 strcpy(&msg[content_length], buf);
 136                 content_length += line_length;
 137                 strcpy(&msg[content_length], "\n");
 138                 content_length += 1;
 139         }
 140         else {
 141                 content_length = StrLength(Source);
 142                 free(msg);
 143                 msg = (char*) ChrPtr(Source);/* TODO: remove cast */
 144                 buffer_length = content_length;
 145         }
 146
 147         /** Do a first pass to isolate the message body */
 148         ptr = msg + 1;
 149         msgstart = msg;
 150         msgend = &msg[content_length];
 151
 152         while (ptr < msgend) {
 153
 154                 /** Advance to next tag */
 155                 ptr = strchr(ptr, '<');
 156                 if ((ptr == NULL) || (ptr >= msgend)) break;
 157                 ++ptr;
 158                 if ((ptr == NULL) || (ptr >= msgend)) break;
 159
 160                 /**
 161                  *  Look for META tags.  Some messages (particularly in
 162                  *  Asian locales) illegally declare a message's character
 163                  *  set in the HTML instead of in the MIME headers.  This
 164                  *  is wrong but we have to work around it anyway.
 165                  */
 166                 if (!strncasecmp(ptr, "META", 4)) {
 167
 168                         char *meta_start;
 169                         char *meta_end;
 170                         int meta_length;
 171                         char *meta;
 172                         char *meta_http_equiv;
 173                         char *meta_content;
 174                         char *spaceptr;
 175
 176                         meta_start = &ptr[4];
 177                         meta_end = strchr(ptr, '>');
 178                         if ((meta_end != NULL) && (meta_end <= msgend)) {
 179                                 meta_length = meta_end - meta_start + 1;
 180                                 meta = malloc(meta_length + 1);
 181                                 safestrncpy(meta, meta_start, meta_length);
 182                                 meta[meta_length] = 0;
 183                                 striplt(meta);
 184                                 if (!strncasecmp(meta, "HTTP-EQUIV=", 11)) {
 185                                         meta_http_equiv = strdup(&meta[11]);
 186                                         spaceptr = strchr(meta_http_equiv, ' ');
 187                                         if (spaceptr != NULL) {
 188                                                 *spaceptr = 0;
 189                                                 meta_content = strdup(++spaceptr);
 190                                                 if (!strncasecmp(meta_content, "content=", 8)) {
 191                                                         strcpy(meta_content, &meta_content[8]);
 192                                                         stripquotes(meta_http_equiv);
 193                                                         stripquotes(meta_content);
 194                                                         extract_charset_from_meta(charset,
 195                                                                 meta_http_equiv, meta_content);
 196                                                 }
 197                                                 free(meta_content);
 198                                         }
 199                                         free(meta_http_equiv);
 200                                 }
 201                                 free(meta);
 202                         }
 203                 }
 204
 205                 /**
 206                  * Any of these tags cause everything up to and including
 207                  * the tag to be removed.
 208                  */
 209                 if ( (!strncasecmp(ptr, "HTML", 4))
 210                    ||(!strncasecmp(ptr, "HEAD", 4))
 211                    ||(!strncasecmp(ptr, "/HEAD", 5))
 212                    ||(!strncasecmp(ptr, "BODY", 4)) ) {
 213                         ptr = strchr(ptr, '>');
 214                         if ((ptr == NULL) || (ptr >= msgend)) break;
 215                         ++ptr;
 216                         if ((ptr == NULL) || (ptr >= msgend)) break;
 217                         msgstart = ptr;
 218                 }
 219
 220                 /**
 221                  * Any of these tags cause everything including and following
 222                  * the tag to be removed.
 223                  */
 224                 if ( (!strncasecmp(ptr, "/HTML", 5))
 225                    ||(!strncasecmp(ptr, "/BODY", 5)) ) {
 226                         --ptr;
 227                         msgend = ptr;
 228                         strcpy(ptr, "");
 229
 230                 }
 231
 232                 ++ptr;
 233         }
 234         if (msgstart > msg) {
 235                 strcpy(msg, msgstart);
 236         }
 237
 238         /** Now go through the message, parsing tags as necessary. */
 239         converted_msg = NewStrBufPlain(NULL, content_length + 8192);
 240
 241
 242         /** Convert foreign character sets to UTF-8 if necessary. */
 243 #ifdef HAVE_ICONV
 244         if ( (strcasecmp(charset, "us-ascii"))
 245            && (strcasecmp(charset, "UTF-8"))
 246            && (strcasecmp(charset, ""))
 247         ) {
 248                 lprintf(9, "Converting %s to UTF-8\n", charset);
 249                 ctdl_iconv_open("UTF-8", charset, &ic);
 250                 if (ic == (iconv_t)(-1) ) {
 251                         lprintf(5, "%s:%d iconv_open() failed: %s\n",
 252                                 __FILE__, __LINE__, strerror(errno));
 253                 }
 254         }
 255         if  (Source == NULL) {
 256                 if (ic != (iconv_t)(-1) ) {
 257                         ibuf = msg;
 258                         ibuflen = content_length;
 259                         obuflen = content_length + (content_length / 2) ;
 260                         obuf = (char *) malloc(obuflen);
 261                         osav = obuf;
 262                         iconv(ic, &ibuf, &ibuflen, &obuf, &obuflen);
 263                         content_length = content_length + (content_length / 2) - obuflen;
 264                         osav[content_length] = 0;
 265                         free(msg);
 266                         msg = osav;
 267                         iconv_close(ic);
 268                 }
 269         }
 270         else {
 271                 if (ic != (iconv_t)(-1) ) {
 272                         StrBuf *Buf = NewStrBufPlain(NULL, StrLength(Source) + 8096);;
 273                         StrBufConvert(Source, Buf, &ic);
 274                         FreeStrBuf(&Buf);
 275                         iconv_close(ic);
 276                         msg = (char*)ChrPtr(Source); /* TODO: get rid of this. */
 277                 }
 278         }
 279
 280 #endif
 281
 282         /**
 283          *      At this point, the message has been stripped down to
 284          *      only the content inside the <BODY></BODY> tags, and has
 285          *      been converted to UTF-8 if it was originally in a foreign
 286          *      character set.  The text is also guaranteed to be null
 287          *      terminated now.
 288          */
 289
 290         if (converted_msg == NULL) {
 291                 StrBufAppendPrintf(Target, "Error %d: %s<br />%s:%d", errno, strerror(errno), __FILE__, __LINE__);
 292                 goto BAIL;
 293         }
 294
 295         ptr = msg;
 296         msgend = strchr(msg, 0);
 297         while (ptr < msgend) {
 298
 299                 /** Try to sanitize the html of any rogue scripts */
 300                 if (!strncasecmp(ptr, "<script", 7)) {
 301                         if (scriptlevel == 0) {
 302                                 script_start_pos = StrLength(converted_msg);
 303                         }
 304                         ++scriptlevel;
 305                 }
 306                 if (!strncasecmp(ptr, "</script", 8)) {
 307                         --scriptlevel;
 308                 }
 309
 310                 /**
 311                  * Change mailto: links to WebCit mail, by replacing the
 312                  * link with one that points back to our mail room.  Due to
 313                  * the way we parse URL's, it'll even handle mailto: links
 314                  * that have "?subject=" in them.
 315                  */
 316                 if (!strncasecmp(ptr, "<a href=\"mailto:", 16)) {
 317                         content_length += 64;
 318                         StrBufAppendPrintf(converted_msg,
 319                                 "<a href=\"display_enter?force_room=_MAIL_&recp=");
 320                         ptr = &ptr[16];
 321                         ++alevel;
 322                         ++brak;
 323                 }
 324                 /** Make external links open in a separate window */
 325                 else if (!strncasecmp(ptr, "<a href=\"", 9)) {
 326                         ++alevel;
 327                         ++brak;
 328                         if ( ((strchr(ptr, ':') < strchr(ptr, '/')))
 329                              &&  ((strchr(ptr, '/') < strchr(ptr, '>')))
 330                              ) {
 331                                 /* open external links to new window */
 332                                 StrBufAppendPrintf(converted_msg, new_window);
 333                                 ptr = &ptr[8];
 334                         }
 335                         else if ( (treat_as_wiki) && (strncasecmp(ptr, "<a href=\"wiki?", 14)) ) {
 336                                 content_length += 64;
 337                                 StrBufAppendPrintf(converted_msg, "<a href=\"wiki?page=");
 338                                 ptr = &ptr[9];
 339                         }
 340                         else {
 341                                 StrBufAppendPrintf(converted_msg, "<a href=\"");
 342                                 ptr = &ptr[9];
 343                         }
 344                 }
 345                 /** Fixup <img src="cid:... ...> to fetch the mime part */
 346                 else if (!strncasecmp(ptr, "<img ", 5)) {
 347                         char* tag_end=strchr(ptr,'>');
 348                         char* src=strstr(ptr, " src=\"cid:");
 349                         char *cid_start, *cid_end;
 350                         ++brak;
 351
 352                         if (src &&
 353                                 (cid_start=strchr(src,':')) &&
 354                                 (cid_end=strchr(cid_start,'"')) &&
 355                                 (cid_end < tag_end)) {
 356
 357                                 /* copy tag and attributes up to src="cid: */
 358                                 StrBufAppendBufPlain(converted_msg, ptr, src - ptr, 0);
 359                                 cid_start++;
 360
 361                                 /* add in /webcit/mimepart/<msgno>/CID/
 362                                    trailing / stops dumb URL filters getting excited */
 363                                 StrBufAppendPrintf(converted_msg,
 364                                         "src=\"/webcit/mimepart/%d/",msgnum);
 365                                 StrBufAppendBufPlain(converted_msg, cid_start, cid_end - cid_start, 0);
 366                                 StrBufAppendBufPlain(converted_msg, "/\"", -1, 0);
 367
 368                                 ptr = cid_end+1;
 369                         }
 370                         StrBufAppendBufPlain(converted_msg, ptr, tag_end - ptr, 0);
 371                         ptr = tag_end;
 372                 }
 373
 374                 /**
 375                  * Turn anything that looks like a URL into a real link, as long
 376                  * as it's not inside a tag already
 377                  */
 378                 else if ( (brak == 0) && (alevel == 0)
 379                      && (!strncasecmp(ptr, "http://", 7))) {
 380                                 /** Find the end of the link */
 381                                 int strlenptr;
 382                                 linklen = 0;
 383
 384                                 strlenptr = strlen(ptr);
 385                                 for (i=0; i<=strlenptr; ++i) {
 386                                         if ((ptr[i]==0)
 387                                            ||(isspace(ptr[i]))
 388                                            ||(ptr[i]==10)
 389                                            ||(ptr[i]==13)
 390                                            ||(ptr[i]=='(')
 391                                            ||(ptr[i]==')')
 392                                            ||(ptr[i]=='<')
 393                                            ||(ptr[i]=='>')
 394                                            ||(ptr[i]=='[')
 395                                            ||(ptr[i]==']')
 396                                            ||(ptr[i]=='"')
 397                                            ||(ptr[i]=='\'')
 398                                         ) linklen = i;
 399                                         /* did s.b. send us an entity? */
 400                                         if (ptr[i] == '&') {
 401                                                 if ((ptr[i+2] ==';') ||
 402                                                     (ptr[i+3] ==';') ||
 403                                                     (ptr[i+5] ==';') ||
 404                                                     (ptr[i+6] ==';') ||
 405                                                     (ptr[i+7] ==';'))
 406                                                         linklen = i;
 407                                         }
 408                                         if (linklen > 0) break;
 409                                 }
 410                                 if (linklen > 0) {
 411                                         char *ltreviewptr;
 412                                         char *nbspreviewptr;
 413                                         char linkedchar;
 414                                         int len = linklen;
 415
 416                                         len = linklen;
 417                                         linkedchar = ptr[len];
 418                                         ptr[len] = '\0';
 419                                         /* spot for some subject strings tinymce tends to give us. */
 420                                         ltreviewptr = strchr(ptr, '<');
 421                                         if (ltreviewptr != NULL) {
 422                                                 *ltreviewptr = '\0';
 423                                                 linklen = ltreviewptr - ptr;
 424                                         }
 425
 426                                         nbspreviewptr = strstr(ptr, "&nbsp;");
 427                                         if (nbspreviewptr != NULL) {
 428                                                 /* nbspreviewptr = '\0'; */
 429                                                 linklen = nbspreviewptr - ptr;
 430                                         }
 431                                         if (ltreviewptr != 0)
 432                                                 *ltreviewptr = '<';
 433
 434                                         ptr[len] = linkedchar;
 435
 436                                         content_length += (32 + linklen);
 437                                         StrBufAppendPrintf(converted_msg, "%s\"", new_window);
 438                                         StrBufAppendBufPlain(converted_msg, ptr, linklen, 0);
 439                                         StrBufAppendPrintf(converted_msg, "\">");
 440                                         StrBufAppendBufPlain(converted_msg, ptr, linklen, 0);
 441                                         ptr += linklen;
 442                                         StrBufAppendPrintf(converted_msg, "</A>");
 443                                 }
 444                 }
 445                 else {
 446                         StrBufAppendBufPlain(converted_msg, ptr, 1, 0);
 447                         ptr++;
 448                 }
 449
 450                 /**
 451                  * We need to know when we're inside a tag,
 452                  * so we don't turn things that look like URL's into
 453                  * links, when they're already links - or image sources.
 454                  */
 455                 if (*(ptr-1) == '<') {
 456                         ++brak;
 457                 }
 458                 if (*(ptr-1) == '>') {
 459                         --brak;
 460                         if ((scriptlevel == 0) && (script_start_pos >= 0)) {
 461                                 StrBufCutRight(converted_msg, StrLength(converted_msg) - script_start_pos);
 462                                 script_start_pos = (-1);
 463                         }
 464                 }
 465                 if (!strncasecmp(ptr, "</A>", 3)) --alevel;
 466         }
 467
 468         /**     uncomment these two lines to override conversion        */
 469         /**     memcpy(converted_msg, msg, content_length);             */
 470         /**     output_length = content_length;                         */
 471
 472         /** Output our big pile of markup */
 473         StrBufAppendBuf(Target, converted_msg, 0);
 474
 475 BAIL:   /** A little trailing vertical whitespace... */
 476         StrBufAppendPrintf(Target, "<br /><br />\n");
 477
 478         /** Now give back the memory */
 479         FreeStrBuf(&converted_msg);
 480         if ((msg != NULL) && (Source == NULL)) free(msg);
 481 }
 482
 483
 484
 485
 486
 487
 488 /*
 489  * Look for URL's embedded in a buffer and make them linkable.  We use a
 490  * target window in order to keep the Citadel session in its own window.
 491  */
 492 void UrlizeText(StrBuf* Target, StrBuf *Source, StrBuf *WrkBuf)
 493 {
 494         int len, UrlLen, Offset, TrailerLen;
 495         const char *start, *end, *pos;
 496
 497         FlushStrBuf(Target);
 498
 499         start = NULL;
 500         len = StrLength(Source);
 501         end = ChrPtr(Source) + len;
 502         for (pos = ChrPtr(Source); (pos < end) && (start == NULL); ++pos) {
 503                 if (!strncasecmp(pos, "http://", 7))
 504                         start = pos;
 505                 else if (!strncasecmp(pos, "ftp://", 6))
 506                         start = pos;
 507         }
 508
 509         if (start == NULL) {
 510                 StrBufAppendBuf(Target, Source, 0);
 511                 return;
 512         }
 513         FlushStrBuf(WrkBuf);
 514
 515         for (pos = ChrPtr(Source) + len; pos > start; --pos) {
 516                 if (  (!isprint(*pos))
 517                    || (isspace(*pos))
 518                    || (*pos == '{')
 519                    || (*pos == '}')
 520                    || (*pos == '|')
 521                    || (*pos == '\\')
 522                    || (*pos == '^')
 523                    || (*pos == '[')
 524                    || (*pos == ']')
 525                    || (*pos == '`')
 526                    || (*pos == '<')
 527                    || (*pos == '>')
 528                    || (*pos == '(')
 529                    || (*pos == ')')
 530                 ) {
 531                         end = pos;
 532                 }
 533         }
 534
 535         UrlLen = end - start;
 536         StrBufAppendBufPlain(WrkBuf, start, UrlLen, 0);
 537
 538         Offset = start - ChrPtr(Source);
 539         if (Offset != 0)
 540                 StrBufAppendBufPlain(Target, ChrPtr(Source), Offset, 0);
 541         StrBufAppendPrintf(Target, "%ca href=%c%s%c TARGET=%c%s%c%c%s%c/A%c",
 542                            LB, QU, ChrPtr(WrkBuf), QU, QU, TARGET,
 543                            QU, RB, ChrPtr(WrkBuf), LB, RB);
 544
 545         TrailerLen = StrLength(Source) - (end - ChrPtr(Source));
 546         if (TrailerLen > 0)
 547                 StrBufAppendBufPlain(Target, end, TrailerLen, 0);
 548 }
 549 void url(char *buf, size_t bufsize)
 550 {
 551         int len, UrlLen, Offset, TrailerLen, outpos;
 552         char *start, *end, *pos;
 553         char urlbuf[SIZ];
 554         char outbuf[SIZ];
 555
 556         start = NULL;
 557         len = strlen(buf);
 558         if (len > bufsize) {
 559                 lprintf(1, "URL: content longer than buffer!");
 560                 return;
 561         }
 562         end = buf + len;
 563         for (pos = buf; (pos < end) && (start == NULL); ++pos) {
 564                 if (!strncasecmp(pos, "http://", 7))
 565                         start = pos;
 566                 if (!strncasecmp(pos, "ftp://", 6))
 567                         start = pos;
 568         }
 569
 570         if (start == NULL)
 571                 return;
 572
 573         for (pos = buf+len; pos > start; --pos) {
 574                 if (  (!isprint(*pos))
 575                    || (isspace(*pos))
 576                    || (*pos == '{')
 577                    || (*pos == '}')
 578                    || (*pos == '|')
 579                    || (*pos == '\\')
 580                    || (*pos == '^')
 581                    || (*pos == '[')
 582                    || (*pos == ']')
 583                    || (*pos == '`')
 584                    || (*pos == '<')
 585                    || (*pos == '>')
 586                    || (*pos == '(')
 587                    || (*pos == ')')
 588                 ) {
 589                         end = pos;
 590                 }
 591         }
 592
 593         UrlLen = end - start;
 594         if (UrlLen > sizeof(urlbuf)){
 595                 lprintf(1, "URL: content longer than buffer!");
 596                 return;
 597         }
 598         memcpy(urlbuf, start, UrlLen);
 599         urlbuf[UrlLen] = '\0';
 600
 601         Offset = start - buf;
 602         if ((Offset != 0) && (Offset < sizeof(outbuf)))
 603                 memcpy(outbuf, buf, Offset);
 604         outpos = snprintf(&outbuf[Offset], sizeof(outbuf) - Offset,
 605                           "%ca href=%c%s%c TARGET=%c%s%c%c%s%c/A%c",
 606                           LB, QU, urlbuf, QU, QU, TARGET, QU, RB, urlbuf, LB, RB);
 607         if (outpos >= sizeof(outbuf) - Offset) {
 608                 lprintf(1, "URL: content longer than buffer!");
 609                 return;
 610         }
 611
 612         TrailerLen = len - (end - start);
 613         if (TrailerLen > 0)
 614                 memcpy(outbuf + Offset + outpos, end, TrailerLen);
 615         if (Offset + outpos + TrailerLen > bufsize) {
 616                 lprintf(1, "URL: content longer than buffer!");
 617                 return;
 618         }
 619         memcpy (buf, outbuf, Offset + outpos + TrailerLen);
 620         *(buf + Offset + outpos + TrailerLen) = '\0';
 621 }
 622
 623
 624
 625
 626 /*@}*/