* Fixed an error in the template that was causing the 'c_creataide' field to load...
[citadel.git] / libcitadel / lib / html_to_ascii.c
blobe2cb2b77abe96b48963b25f224f8e0ce8abaf419
1 /*
2 * $Id: html.c 6014 2008-02-04 18:38:35Z ajc $
4 * Functions which handle translation between HTML and plain text
5 * Copyright (c) 2000-2005 by Art Cancro and others. This program is
6 * released under the terms of the GNU General Public License.
7 */
9 #include <stdlib.h>
10 #include <unistd.h>
11 #include <stdio.h>
12 #include <signal.h>
13 #include <sys/types.h>
14 #include <ctype.h>
15 #include <string.h>
16 #include <sys/stat.h>
17 #include <errno.h>
18 #include <limits.h>
20 #if TIME_WITH_SYS_TIME
21 # include <sys/time.h>
22 # include <time.h>
23 #else
24 # if HAVE_SYS_TIME_H
25 # include <sys/time.h>
26 # else
27 # include <time.h>
28 # endif
29 #endif
31 #include "libcitadel.h"
35 * Convert HTML to plain text.
37 * inputmsg = pointer to raw HTML message (msglen = its length; pass 0 to have it measured with strlen)
38 * screenwidth = desired output screen width, in columns
39 * do_citaformat = set to 1 to indent each new output line with a space
41 char *html_to_ascii(char *inputmsg, int msglen, int screenwidth, int do_citaformat) {
42 char inbuf[SIZ];
43 int inbuf_len = 0;
44 char outbuf[SIZ];
45 char tag[1024];
46 int done_reading = 0;
47 char *inptr;
48 char *outptr;
49 size_t outptr_buffer_size;
50 size_t output_len = 0;
51 int i, j, ch, did_out, rb, scanch;
52 int nest = 0; /* Bracket nesting level */
53 int blockquote = 0; /* BLOCKQUOTE nesting level */
54 int styletag = 0; /* STYLE tag nesting level */
55 int styletag_start = 0;
56 int bytes_processed = 0;
57 char nl[128];
59 strcpy(nl, "\n");
60 inptr = inputmsg;
61 strcpy(inbuf, "");
62 strcpy(outbuf, "");
63 if (msglen == 0) msglen = strlen(inputmsg);
65 outptr_buffer_size = strlen(inptr) + SIZ;
66 outptr = malloc(outptr_buffer_size);
67 if (outptr == NULL) return NULL;
68 strcpy(outptr, "");
69 output_len = 0;
71 do {
72 /* Fill the input buffer */
73 inbuf_len = strlen(inbuf);
74 if ( (done_reading == 0) && (inbuf_len < (SIZ-128)) ) {
76 ch = *inptr++;
77 if (ch != 0) {
78 inbuf[inbuf_len++] = ch;
79 inbuf[inbuf_len] = 0;
81 else {
82 done_reading = 1;
85 ++bytes_processed;
86 if (bytes_processed > msglen) {
87 done_reading = 1;
92 /* Do some parsing */
93 if (!IsEmptyStr(inbuf)) {
96 /* Fold in all the spacing */
97 for (i=0; !IsEmptyStr(&inbuf[i]); ++i) {
98 if (inbuf[i]==10) inbuf[i]=32;
99 if (inbuf[i]==13) inbuf[i]=32;
100 if (inbuf[i]==9) inbuf[i]=32;
102 for (i=0; !IsEmptyStr(&inbuf[i]); ++i) {
103 while ((inbuf[i]==32)&&(inbuf[i+1]==32)) {
104 strcpy(&inbuf[i], &inbuf[i+1]);
108 for (i=0; !IsEmptyStr(&inbuf[i]); ++i) {
110 ch = inbuf[i];
112 if (ch == '<') {
113 ++nest;
114 strcpy(tag, "");
117 else if (ch == '>') { /* We have a tag. */
118 if (nest > 0) --nest;
120 /* Unqualify the tag (truncate at first space) */
121 if (strchr(tag, ' ') != NULL) {
122 strcpy(strchr(tag, ' '), "");
125 if (!strcasecmp(tag, "P")) {
126 strcat(outbuf, nl);
127 strcat(outbuf, nl);
130 if (!strcasecmp(tag, "/DIV")) {
131 strcat(outbuf, nl);
132 strcat(outbuf, nl);
135 if (!strcasecmp(tag, "LI")) {
136 strcat(outbuf, nl);
137 strcat(outbuf, " * ");
140 else if (!strcasecmp(tag, "/UL")) {
141 strcat(outbuf, nl);
142 strcat(outbuf, nl);
145 else if (!strcasecmp(tag, "H1")) {
146 strcat(outbuf, nl);
147 strcat(outbuf, nl);
150 else if (!strcasecmp(tag, "H2")) {
151 strcat(outbuf, nl);
152 strcat(outbuf, nl);
155 else if (!strcasecmp(tag, "H3")) {
156 strcat(outbuf, nl);
157 strcat(outbuf, nl);
160 else if (!strcasecmp(tag, "H4")) {
161 strcat(outbuf, nl);
162 strcat(outbuf, nl);
165 else if (!strcasecmp(tag, "/H1")) {
166 strcat(outbuf, nl);
169 else if (!strcasecmp(tag, "/H2")) {
170 strcat(outbuf, nl);
173 else if (!strcasecmp(tag, "/H3")) {
174 strcat(outbuf, nl);
177 else if (!strcasecmp(tag, "/H4")) {
178 strcat(outbuf, nl);
181 else if (!strcasecmp(tag, "HR")) {
182 strcat(outbuf, nl);
183 strcat(outbuf, " ");
184 for (j=0; j<screenwidth-2; ++j)
185 strcat(outbuf, "-");
186 strcat(outbuf, nl);
189 else if (!strcasecmp(tag, "BR")) {
190 strcat(outbuf, nl);
193 else if (!strcasecmp(tag, "TR")) {
194 strcat(outbuf, nl);
197 else if (!strcasecmp(tag, "/TABLE")) {
198 strcat(outbuf, nl);
201 else if (!strcasecmp(tag, "BLOCKQUOTE")) {
202 ++blockquote;
203 strcpy(nl, "\n");
204 for (j=0; j<blockquote; ++j) strcat(nl, ">");
205 strcat(outbuf, nl);
208 else if (!strcasecmp(tag, "/BLOCKQUOTE")) {
209 strcat(outbuf, "\n");
210 --blockquote;
211 strcpy(nl, "\n");
212 for (j=0; j<blockquote; ++j) strcat(nl, ">");
213 strcat(outbuf, nl);
216 else if (!strcasecmp(tag, "STYLE")) {
217 ++styletag;
218 if (styletag == 1) {
219 styletag_start = strlen(outbuf);
223 else if (!strcasecmp(tag, "/STYLE")) {
224 --styletag;
225 if (styletag == 0) {
226 outbuf[styletag_start] = 0;
232 else if ((nest > 0) && (strlen(tag)<(sizeof(tag)-1))) {
233 tag[strlen(tag)+1] = 0;
234 tag[strlen(tag)] = ch;
237 else if (!nest) {
238 outbuf[strlen(outbuf)+1] = 0;
239 outbuf[strlen(outbuf)] = ch;
242 strcpy(inbuf, &inbuf[i]);
245 /* Convert &; tags to the forbidden characters */
246 if (!IsEmptyStr(outbuf)) for (i=0; !IsEmptyStr(&outbuf[i]); ++i) {
248 /* Character entity references */
249 if (!strncasecmp(&outbuf[i], "&nbsp;", 6)) {
250 outbuf[i] = ' ';
251 strcpy(&outbuf[i+1], &outbuf[i+6]);
254 if (!strncasecmp(&outbuf[i], "&ensp;", 6)) {
255 outbuf[i] = ' ';
256 strcpy(&outbuf[i+1], &outbuf[i+6]);
259 if (!strncasecmp(&outbuf[i], "&emsp;", 6)) {
260 outbuf[i] = ' ';
261 strcpy(&outbuf[i+1], &outbuf[i+6]);
264 if (!strncasecmp(&outbuf[i], "&thinsp;", 8)) {
265 outbuf[i] = ' ';
266 strcpy(&outbuf[i+1], &outbuf[i+8]);
269 else if (!strncasecmp(&outbuf[i], "&lt;", 4)) {
270 outbuf[i] = '<';
271 strcpy(&outbuf[i+1], &outbuf[i+4]);
274 else if (!strncasecmp(&outbuf[i], "&gt;", 4)) {
275 outbuf[i] = '>';
276 strcpy(&outbuf[i+1], &outbuf[i+4]);
279 else if (!strncasecmp(&outbuf[i], "&amp;", 5)) {
280 strcpy(&outbuf[i+1], &outbuf[i+5]);
283 else if (!strncasecmp(&outbuf[i], "&quot;", 6)) {
284 outbuf[i] = '\"';
285 strcpy(&outbuf[i+1], &outbuf[i+6]);
288 else if (!strncasecmp(&outbuf[i], "&lsquo;", 7)) {
289 outbuf[i] = '`';
290 strcpy(&outbuf[i+1], &outbuf[i+7]);
293 else if (!strncasecmp(&outbuf[i], "&rsquo;", 7)) {
294 outbuf[i] = '\'';
295 strcpy(&outbuf[i+1], &outbuf[i+7]);
298 else if (!strncasecmp(&outbuf[i], "&copy;", 6)) {
299 outbuf[i] = '(';
300 outbuf[i+1] = 'c';
301 outbuf[i+2] = ')';
302 strcpy(&outbuf[i+3], &outbuf[i+6]);
305 else if (!strncasecmp(&outbuf[i], "&bull;", 6)) {
306 outbuf[i] = ' ';
307 outbuf[i+1] = '*';
308 outbuf[i+2] = ' ';
309 strcpy(&outbuf[i+3], &outbuf[i+6]);
312 else if (!strncasecmp(&outbuf[i], "&hellip;", 8)) {
313 outbuf[i] = '.';
314 outbuf[i+1] = '.';
315 outbuf[i+2] = '.';
316 strcpy(&outbuf[i+3], &outbuf[i+8]);
319 else if (!strncasecmp(&outbuf[i], "&trade;", 7)) {
320 outbuf[i] = '(';
321 outbuf[i+1] = 't';
322 outbuf[i+2] = 'm';
323 outbuf[i+3] = ')';
324 strcpy(&outbuf[i+4], &outbuf[i+7]);
327 else if (!strncasecmp(&outbuf[i], "&reg;", 5)) {
328 outbuf[i] = '(';
329 outbuf[i+1] = 'r';
330 outbuf[i+2] = ')';
331 strcpy(&outbuf[i+3], &outbuf[i+5]);
334 else if (!strncasecmp(&outbuf[i], "&frac14;", 8)) {
335 outbuf[i] = '1';
336 outbuf[i+1] = '/';
337 outbuf[i+2] = '4';
338 strcpy(&outbuf[i+3], &outbuf[i+8]);
341 else if (!strncasecmp(&outbuf[i], "&frac12;", 8)) {
342 outbuf[i] = '1';
343 outbuf[i+1] = '/';
344 outbuf[i+2] = '2';
345 strcpy(&outbuf[i+3], &outbuf[i+8]);
348 else if (!strncasecmp(&outbuf[i], "&frac34;", 8)) {
349 outbuf[i] = '3';
350 outbuf[i+1] = '/';
351 outbuf[i+2] = '4';
352 strcpy(&outbuf[i+3], &outbuf[i+8]);
355 else if (!strncasecmp(&outbuf[i], "&ndash;", 7)) {
356 outbuf[i] = '-';
357 outbuf[i+1] = '-';
358 strcpy(&outbuf[i+2], &outbuf[i+7]);
361 else if (!strncasecmp(&outbuf[i], "&mdash;", 7)) {
362 outbuf[i] = '-';
363 outbuf[i+1] = '-';
364 outbuf[i+2] = '-';
365 strcpy(&outbuf[i+3], &outbuf[i+7]);
368 else if (!strncmp(&outbuf[i], "&Ccedil;", 8)) {
369 outbuf[i] = 'C';
370 strcpy(&outbuf[i+1], &outbuf[i+8]);
373 else if (!strncasecmp(&outbuf[i], "&ccedil;", 8)) {
374 outbuf[i] = 'c';
375 strcpy(&outbuf[i+1], &outbuf[i+8]);
378 else if (!strncmp(&outbuf[i], "&Egrave;", 8)) {
379 outbuf[i] = 'E';
380 strcpy(&outbuf[i+1], &outbuf[i+8]);
383 else if (!strncasecmp(&outbuf[i], "&egrave;", 8)) {
384 outbuf[i] = 'e';
385 strcpy(&outbuf[i+1], &outbuf[i+8]);
388 else if (!strncmp(&outbuf[i], "&Ecirc;", 7)) {
389 outbuf[i] = 'E';
390 strcpy(&outbuf[i+1], &outbuf[i+7]);
393 else if (!strncasecmp(&outbuf[i], "&ecirc;", 7)) {
394 outbuf[i] = 'e';
395 strcpy(&outbuf[i+1], &outbuf[i+7]);
398 else if (!strncmp(&outbuf[i], "&Eacute;", 8)) {
399 outbuf[i] = 'E';
400 strcpy(&outbuf[i+1], &outbuf[i+8]);
403 else if (!strncasecmp(&outbuf[i], "&eacute;", 8)) {
404 outbuf[i] = 'e';
405 strcpy(&outbuf[i+1], &outbuf[i+8]);
408 else if (!strncmp(&outbuf[i], "&Agrave;", 8)) {
409 outbuf[i] = 'A';
410 strcpy(&outbuf[i+1], &outbuf[i+8]);
413 else if (!strncasecmp(&outbuf[i], "&agrave;", 8)) {
414 outbuf[i] = 'a';
415 strcpy(&outbuf[i+1], &outbuf[i+8]);
418 else if (!strncasecmp(&outbuf[i], "&ldquo;", 7)) {
419 outbuf[i] = '\"';
420 strcpy(&outbuf[i+1], &outbuf[i+7]);
423 else if (!strncasecmp(&outbuf[i], "&rdquo;", 7)) {
424 outbuf[i] = '\"';
425 strcpy(&outbuf[i+1], &outbuf[i+7]);
428 else if (!strncasecmp(&outbuf[i], "&acute;", 7)) {
429 outbuf[i] = '\'';
430 strcpy(&outbuf[i+1], &outbuf[i+7]);
433 /* two-digit decimal equivalents */
434 else if ((!strncmp(&outbuf[i], "&#", 2))
435 && (outbuf[i+4] == ';') ) {
436 scanch = 0;
437 sscanf(&outbuf[i+2], "%02d", &scanch);
438 outbuf[i] = scanch;
439 strcpy(&outbuf[i+1], &outbuf[i+5]);
442 /* three-digit decimal equivalents */
443 else if ((!strncmp(&outbuf[i], "&#", 2))
444 && (outbuf[i+5] == ';') ) {
445 scanch = 0;
446 sscanf(&outbuf[i+2], "%03d", &scanch);
447 outbuf[i] = scanch;
448 strcpy(&outbuf[i+1], &outbuf[i+6]);
453 /* Make sure the output buffer is big enough */
454 if ((output_len + strlen(outbuf) + SIZ) > outptr_buffer_size) {
455 outptr_buffer_size += SIZ;
456 outptr = realloc(outptr, outptr_buffer_size);
457 if (outptr == NULL) {
458 abort();
462 /* Output any lines terminated with hard line breaks */
463 do {
464 did_out = 0;
465 if (strlen(outbuf) > 0) {
466 for (i = 0; i<strlen(outbuf); ++i) {
467 if ( (i<(screenwidth-2)) && (outbuf[i]=='\n')) {
469 strncpy(&outptr[output_len], outbuf, i+1);
470 output_len += (i+1);
472 if (do_citaformat) {
473 strcpy(&outptr[output_len], " ");
474 ++output_len;
477 strcpy(outbuf, &outbuf[i+1]);
478 i = 0;
479 did_out = 1;
483 } while (did_out);
485 /* Add soft line breaks */
486 if (strlen(outbuf) > (screenwidth - 2 )) {
487 rb = (-1);
488 for (i=0; i<(screenwidth-2); ++i) {
489 if (outbuf[i]==32) rb = i;
491 if (rb>=0) {
492 strncpy(&outptr[output_len], outbuf, rb);
493 output_len += rb;
494 strcpy(&outptr[output_len], nl);
495 output_len += strlen(nl);
496 if (do_citaformat) {
497 strcpy(&outptr[output_len], " ");
498 ++output_len;
500 strcpy(outbuf, &outbuf[rb+1]);
501 } else {
502 strncpy(&outptr[output_len], outbuf,
503 screenwidth-2);
504 output_len += (screenwidth-2);
505 strcpy(&outptr[output_len], nl);
506 output_len += strlen(nl);
507 if (do_citaformat) {
508 strcpy(&outptr[output_len], " ");
509 ++output_len;
511 strcpy(outbuf, &outbuf[screenwidth-2]);
515 } while (done_reading == 0);
517 strcpy(&outptr[output_len], outbuf);
518 output_len += strlen(outbuf);
520 /* Strip leading/trailing whitespace. We can't do this with
521 * striplt() because it uses too many strlen()'s
523 while ((output_len > 0) && (isspace(outptr[0]))) {
524 strcpy(outptr, &outptr[1]);
525 --output_len;
527 while ((output_len > 0) && (isspace(outptr[output_len-1]))) {
528 outptr[output_len-1] = 0;
529 --output_len;
532 if (outptr[output_len-1] != '\n') {
533 strcat(outptr, "\n");
534 ++output_len;
537 return outptr;