/*
 * $Id: html.c 6014 2008-02-04 18:38:35Z ajc $
 *
 * Functions which handle translation between HTML and plain text.
 * Copyright (c) 2000-2005 by Art Cancro and others.  This program is
 * released under the terms of the GNU General Public License.
 */
13 #include <sys/types.h>
20 #if TIME_WITH_SYS_TIME
21 # include <sys/time.h>
25 # include <sys/time.h>
31 #include "libcitadel.h"
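/*
 * Convert HTML to plain text.
 *
 * inputmsg      = pointer to raw HTML message
 * msglen        = length of the message in bytes; if 0, the length is
 *                 computed with strlen(inputmsg)
 * screenwidth   = desired output screenwidth
 * do_citaformat = set to 1 to indent newlines with spaces
 *
 * Returns NULL if the output buffer cannot be allocated.
 */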
41 char *html_to_ascii(char *inputmsg
, int msglen
, int screenwidth
, int do_citaformat
) {
49 size_t outptr_buffer_size
;
50 size_t output_len
= 0;
51 int i
, j
, ch
, did_out
, rb
, scanch
;
52 int nest
= 0; /* Bracket nesting level */
53 int blockquote
= 0; /* BLOCKQUOTE nesting level */
54 int styletag
= 0; /* STYLE tag nesting level */
55 int styletag_start
= 0;
56 int bytes_processed
= 0;
63 if (msglen
== 0) msglen
= strlen(inputmsg
);
65 outptr_buffer_size
= strlen(inptr
) + SIZ
;
66 outptr
= malloc(outptr_buffer_size
);
67 if (outptr
== NULL
) return NULL
;
72 /* Fill the input buffer */
73 inbuf_len
= strlen(inbuf
);
74 if ( (done_reading
== 0) && (inbuf_len
< (SIZ
-128)) ) {
78 inbuf
[inbuf_len
++] = ch
;
86 if (bytes_processed
> msglen
) {
93 if (!IsEmptyStr(inbuf
)) {
96 /* Fold in all the spacing */
97 for (i
=0; !IsEmptyStr(&inbuf
[i
]); ++i
) {
98 if (inbuf
[i
]==10) inbuf
[i
]=32;
99 if (inbuf
[i
]==13) inbuf
[i
]=32;
100 if (inbuf
[i
]==9) inbuf
[i
]=32;
102 for (i
=0; !IsEmptyStr(&inbuf
[i
]); ++i
) {
103 while ((inbuf
[i
]==32)&&(inbuf
[i
+1]==32)) {
104 strcpy(&inbuf
[i
], &inbuf
[i
+1]);
108 for (i
=0; !IsEmptyStr(&inbuf
[i
]); ++i
) {
117 else if (ch
== '>') { /* We have a tag. */
118 if (nest
> 0) --nest
;
120 /* Unqualify the tag (truncate at first space) */
121 if (strchr(tag
, ' ') != NULL
) {
122 strcpy(strchr(tag
, ' '), "");
125 if (!strcasecmp(tag
, "P")) {
130 if (!strcasecmp(tag
, "/DIV")) {
135 if (!strcasecmp(tag
, "LI")) {
137 strcat(outbuf
, " * ");
140 else if (!strcasecmp(tag
, "/UL")) {
145 else if (!strcasecmp(tag
, "H1")) {
150 else if (!strcasecmp(tag
, "H2")) {
155 else if (!strcasecmp(tag
, "H3")) {
160 else if (!strcasecmp(tag
, "H4")) {
165 else if (!strcasecmp(tag
, "/H1")) {
169 else if (!strcasecmp(tag
, "/H2")) {
173 else if (!strcasecmp(tag
, "/H3")) {
177 else if (!strcasecmp(tag
, "/H4")) {
181 else if (!strcasecmp(tag
, "HR")) {
184 for (j
=0; j
<screenwidth
-2; ++j
)
189 else if (!strcasecmp(tag
, "BR")) {
193 else if (!strcasecmp(tag
, "TR")) {
197 else if (!strcasecmp(tag
, "/TABLE")) {
201 else if (!strcasecmp(tag
, "BLOCKQUOTE")) {
204 for (j
=0; j
<blockquote
; ++j
) strcat(nl
, ">");
208 else if (!strcasecmp(tag
, "/BLOCKQUOTE")) {
209 strcat(outbuf
, "\n");
212 for (j
=0; j
<blockquote
; ++j
) strcat(nl
, ">");
216 else if (!strcasecmp(tag
, "STYLE")) {
219 styletag_start
= strlen(outbuf
);
223 else if (!strcasecmp(tag
, "/STYLE")) {
226 outbuf
[styletag_start
] = 0;
232 else if ((nest
> 0) && (strlen(tag
)<(sizeof(tag
)-1))) {
233 tag
[strlen(tag
)+1] = 0;
234 tag
[strlen(tag
)] = ch
;
238 outbuf
[strlen(outbuf
)+1] = 0;
239 outbuf
[strlen(outbuf
)] = ch
;
242 strcpy(inbuf
, &inbuf
[i
]);
245 /* Convert &; tags to the forbidden characters */
246 if (!IsEmptyStr(outbuf
)) for (i
=0; !IsEmptyStr(&outbuf
[i
]); ++i
) {
248 /* Character entity references */
249 if (!strncasecmp(&outbuf
[i
], " ", 6)) {
251 strcpy(&outbuf
[i
+1], &outbuf
[i
+6]);
254 if (!strncasecmp(&outbuf
[i
], " ", 6)) {
256 strcpy(&outbuf
[i
+1], &outbuf
[i
+6]);
259 if (!strncasecmp(&outbuf
[i
], " ", 6)) {
261 strcpy(&outbuf
[i
+1], &outbuf
[i
+6]);
264 if (!strncasecmp(&outbuf
[i
], " ", 8)) {
266 strcpy(&outbuf
[i
+1], &outbuf
[i
+8]);
269 else if (!strncasecmp(&outbuf
[i
], "<", 4)) {
271 strcpy(&outbuf
[i
+1], &outbuf
[i
+4]);
274 else if (!strncasecmp(&outbuf
[i
], ">", 4)) {
276 strcpy(&outbuf
[i
+1], &outbuf
[i
+4]);
279 else if (!strncasecmp(&outbuf
[i
], "&", 5)) {
280 strcpy(&outbuf
[i
+1], &outbuf
[i
+5]);
283 else if (!strncasecmp(&outbuf
[i
], """, 6)) {
285 strcpy(&outbuf
[i
+1], &outbuf
[i
+6]);
288 else if (!strncasecmp(&outbuf
[i
], "‘", 7)) {
290 strcpy(&outbuf
[i
+1], &outbuf
[i
+7]);
293 else if (!strncasecmp(&outbuf
[i
], "’", 7)) {
295 strcpy(&outbuf
[i
+1], &outbuf
[i
+7]);
298 else if (!strncasecmp(&outbuf
[i
], "©", 6)) {
302 strcpy(&outbuf
[i
+3], &outbuf
[i
+6]);
305 else if (!strncasecmp(&outbuf
[i
], "•", 6)) {
309 strcpy(&outbuf
[i
+3], &outbuf
[i
+6]);
312 else if (!strncasecmp(&outbuf
[i
], "…", 8)) {
316 strcpy(&outbuf
[i
+3], &outbuf
[i
+8]);
319 else if (!strncasecmp(&outbuf
[i
], "™", 7)) {
324 strcpy(&outbuf
[i
+4], &outbuf
[i
+7]);
327 else if (!strncasecmp(&outbuf
[i
], "®", 5)) {
331 strcpy(&outbuf
[i
+3], &outbuf
[i
+5]);
334 else if (!strncasecmp(&outbuf
[i
], "¼", 8)) {
338 strcpy(&outbuf
[i
+3], &outbuf
[i
+8]);
341 else if (!strncasecmp(&outbuf
[i
], "½", 8)) {
345 strcpy(&outbuf
[i
+3], &outbuf
[i
+8]);
348 else if (!strncasecmp(&outbuf
[i
], "¾", 8)) {
352 strcpy(&outbuf
[i
+3], &outbuf
[i
+8]);
355 else if (!strncasecmp(&outbuf
[i
], "–", 7)) {
358 strcpy(&outbuf
[i
+2], &outbuf
[i
+7]);
361 else if (!strncasecmp(&outbuf
[i
], "—", 7)) {
365 strcpy(&outbuf
[i
+3], &outbuf
[i
+7]);
368 else if (!strncmp(&outbuf
[i
], "Ç", 8)) {
370 strcpy(&outbuf
[i
+1], &outbuf
[i
+8]);
373 else if (!strncasecmp(&outbuf
[i
], "ç", 8)) {
375 strcpy(&outbuf
[i
+1], &outbuf
[i
+8]);
378 else if (!strncmp(&outbuf
[i
], "È", 8)) {
380 strcpy(&outbuf
[i
+1], &outbuf
[i
+8]);
383 else if (!strncasecmp(&outbuf
[i
], "è", 8)) {
385 strcpy(&outbuf
[i
+1], &outbuf
[i
+8]);
388 else if (!strncmp(&outbuf
[i
], "Ê", 7)) {
390 strcpy(&outbuf
[i
+1], &outbuf
[i
+7]);
393 else if (!strncasecmp(&outbuf
[i
], "ê", 7)) {
395 strcpy(&outbuf
[i
+1], &outbuf
[i
+7]);
398 else if (!strncmp(&outbuf
[i
], "É", 8)) {
400 strcpy(&outbuf
[i
+1], &outbuf
[i
+8]);
403 else if (!strncasecmp(&outbuf
[i
], "é", 8)) {
405 strcpy(&outbuf
[i
+1], &outbuf
[i
+8]);
408 else if (!strncmp(&outbuf
[i
], "À", 8)) {
410 strcpy(&outbuf
[i
+1], &outbuf
[i
+8]);
413 else if (!strncasecmp(&outbuf
[i
], "à", 8)) {
415 strcpy(&outbuf
[i
+1], &outbuf
[i
+8]);
418 else if (!strncasecmp(&outbuf
[i
], "“", 7)) {
420 strcpy(&outbuf
[i
+1], &outbuf
[i
+7]);
423 else if (!strncasecmp(&outbuf
[i
], "”", 7)) {
425 strcpy(&outbuf
[i
+1], &outbuf
[i
+7]);
428 else if (!strncasecmp(&outbuf
[i
], "´", 7)) {
430 strcpy(&outbuf
[i
+1], &outbuf
[i
+7]);
433 /* two-digit decimal equivalents */
434 else if ((!strncmp(&outbuf
[i
], "&#", 2))
435 && (outbuf
[i
+4] == ';') ) {
437 sscanf(&outbuf
[i
+2], "%02d", &scanch
);
439 strcpy(&outbuf
[i
+1], &outbuf
[i
+5]);
442 /* three-digit decimal equivalents */
443 else if ((!strncmp(&outbuf
[i
], "&#", 2))
444 && (outbuf
[i
+5] == ';') ) {
446 sscanf(&outbuf
[i
+2], "%03d", &scanch
);
448 strcpy(&outbuf
[i
+1], &outbuf
[i
+6]);
453 /* Make sure the output buffer is big enough */
454 if ((output_len
+ strlen(outbuf
) + SIZ
) > outptr_buffer_size
) {
455 outptr_buffer_size
+= SIZ
;
456 outptr
= realloc(outptr
, outptr_buffer_size
);
457 if (outptr
== NULL
) {
462 /* Output any lines terminated with hard line breaks */
465 if (strlen(outbuf
) > 0) {
466 for (i
= 0; i
<strlen(outbuf
); ++i
) {
467 if ( (i
<(screenwidth
-2)) && (outbuf
[i
]=='\n')) {
469 strncpy(&outptr
[output_len
], outbuf
, i
+1);
473 strcpy(&outptr
[output_len
], " ");
477 strcpy(outbuf
, &outbuf
[i
+1]);
485 /* Add soft line breaks */
486 if (strlen(outbuf
) > (screenwidth
- 2 )) {
488 for (i
=0; i
<(screenwidth
-2); ++i
) {
489 if (outbuf
[i
]==32) rb
= i
;
492 strncpy(&outptr
[output_len
], outbuf
, rb
);
494 strcpy(&outptr
[output_len
], nl
);
495 output_len
+= strlen(nl
);
497 strcpy(&outptr
[output_len
], " ");
500 strcpy(outbuf
, &outbuf
[rb
+1]);
502 strncpy(&outptr
[output_len
], outbuf
,
504 output_len
+= (screenwidth
-2);
505 strcpy(&outptr
[output_len
], nl
);
506 output_len
+= strlen(nl
);
508 strcpy(&outptr
[output_len
], " ");
511 strcpy(outbuf
, &outbuf
[screenwidth
-2]);
515 } while (done_reading
== 0);
517 strcpy(&outptr
[output_len
], outbuf
);
518 output_len
+= strlen(outbuf
);
520 /* Strip leading/trailing whitespace. We can't do this with
521 * striplt() because it uses too many strlen()'s
523 while ((output_len
> 0) && (isspace(outptr
[0]))) {
524 strcpy(outptr
, &outptr
[1]);
527 while ((output_len
> 0) && (isspace(outptr
[output_len
-1]))) {
528 outptr
[output_len
-1] = 0;
532 if (outptr
[output_len
-1] != '\n') {
533 strcat(outptr
, "\n");