4 * Copyright (c) 2004 Novell, Inc. All Rights Reserved.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; version 2 of the License.
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111-1301 USA
21 /* This code was adapted from the sample RTF reader found here:
22 * http://msdn.microsoft.com/library/default.asp?url=/library/en-us/dnrtfspec/html/rtfspec.asp
/*
 * Status codes returned by the internal RTF parser routines.
 * NMRTF_OK is zero; every other code identifies a specific failure.
 */
#define NMRTF_OK               0  /* No error */
#define NMRTF_STACK_UNDERFLOW  1  /* A '}' was seen with no matching '{' */
#define NMRTF_STACK_OVERFLOW   2  /* Too many '{' -- memory exhausted */
#define NMRTF_UNMATCHED_BRACE  3  /* Input ended while a group was still open */
#define NMRTF_INVALID_HEX      4  /* A non-hex character appeared in hex data */
#define NMRTF_BAD_TABLE        5  /* The symbol or property table is invalid */
#define NMRTF_ASSERTION        6  /* Internal consistency check failed */
#define NMRTF_EOF              7  /* End of input reached while reading RTF */
#define NMRTF_CONVERT_ERROR    8  /* Character-set conversion of text failed */

/* Upper bound on group nesting; the parser compares its depth counter to it. */
#define NMRTF_MAX_DEPTH        256
52 NMRTF_STATE_FONTTABLE
,
55 } NMRtfState
; /* Rtf State */
57 /* Property types that we care about */
61 NMRTF_PROP_FONT_CHARSET
,
69 NMRTF_SPECIAL_UNICODE
,
77 } NMRtfDestinationType
;
89 /* All we care about for now is the font.
90 * bold, italic, underline, etc. should be
106 char *keyword
; /* RTF keyword */
107 int default_val
; /* default value to use */
108 gboolean pass_default
; /* true to use default value from this table */
109 NMRtfKeywordType kwd_type
; /* the type of the keyword */
110 int action
; /* property type if the keyword represents a property */
111 /* destination type if the keyword represents a destination */
112 /* character to print if the keyword represents a character */
/* Parser context fields (the enclosing struct header is not visible in this extract). */
126 NMRtfState rds
; /* destination state */
127 NMRtfState ris
; /* internal state */
128 NMRtfCharProp chp
; /* current character properties (font, bold, italic, etc.) */
129 GSList
*font_table
; /* the font table */
130 GSList
*saved
; /* saved state stack */
131 int param
; /* numeric parameter for the current keyword */
132 long bytes_to_skip
; /* number of bytes to skip (after encountering \bin) */
133 int depth
; /* how many groups deep we are */
134 gboolean skip_unknown
; /* if true, skip any unknown destinations (this is set after encountering '\*') */
135 char *input
; /* input string */
136 guchar nextch
; /* next char in input */
137 gboolean nextch_available
; /* nextch value is set */
138 GString
*ansi
; /* Temporary ansi text; it is converted and flushed to the output string */
139 GString
*output
; /* The plain text UTF8 string */
142 static int rtf_parse(NMRtfContext
*ctx
);
143 static int rtf_push_state(NMRtfContext
*ctx
);
144 static int rtf_pop_state(NMRtfContext
*ctx
);
145 static NMRtfFont
*rtf_get_font(NMRtfContext
*ctx
, int index
);
146 static int rtf_get_char(NMRtfContext
*ctx
, guchar
*ch
);
147 static int rtf_unget_char(NMRtfContext
*ctx
, guchar ch
);
148 static int rtf_flush_data(NMRtfContext
*ctx
);
149 static int rtf_parse_keyword(NMRtfContext
*ctx
);
150 static int rtf_dispatch_control(NMRtfContext
*ctx
, char *keyword
, int param
, gboolean param_set
);
151 static int rtf_dispatch_char(NMRtfContext
*ctx
, guchar ch
);
152 static int rtf_dispatch_unicode_char(NMRtfContext
*ctx
, gunichar ch
);
153 static int rtf_print_char(NMRtfContext
*ctx
, guchar ch
);
154 static int rtf_print_unicode_char(NMRtfContext
*ctx
, gunichar ch
);
155 static int rtf_change_destination(NMRtfContext
*ctx
, NMRtfDestinationType dest
);
156 static int rtf_dispatch_special(NMRtfContext
*ctx
, NMRtfSpecialKwd special
);
157 static int rtf_apply_property(NMRtfContext
*ctx
, NMRtfProperty prop
, int val
);
159 /* RTF parser tables */
161 /* Keyword descriptions */
162 NMRtfSymbol rtf_symbols
[] = {
163 /* keyword, default, pass_default, keyword_type, action */
164 {"fonttbl", 0, FALSE
, NMRTF_KWD_DEST
, NMRTF_DEST_FONTTABLE
},
165 {"f", 0, FALSE
, NMRTF_KWD_PROP
, NMRTF_PROP_FONT_IDX
},
166 {"fcharset", 0, FALSE
, NMRTF_KWD_PROP
, NMRTF_PROP_FONT_CHARSET
},
167 {"par", 0, FALSE
, NMRTF_KWD_CHAR
, 0x0a},
168 {"line", 0, FALSE
, NMRTF_KWD_CHAR
, 0x0a},
169 {"\0x0a", 0, FALSE
, NMRTF_KWD_CHAR
, 0x0a},
170 {"\0x0d", 0, FALSE
, NMRTF_KWD_CHAR
, 0x0a},
171 {"tab", 0, FALSE
, NMRTF_KWD_CHAR
, 0x09},
172 {"\r", 0, FALSE
, NMRTF_KWD_CHAR
, '\r'},
173 {"\n", 0, FALSE
, NMRTF_KWD_CHAR
, '\n'},
174 {"ldblquote",0, FALSE
, NMRTF_KWD_CHAR
, '"'},
175 {"rdblquote",0, FALSE
, NMRTF_KWD_CHAR
, '"'},
176 {"{", 0, FALSE
, NMRTF_KWD_CHAR
, '{'},
177 {"}", 0, FALSE
, NMRTF_KWD_CHAR
, '}'},
178 {"\\", 0, FALSE
, NMRTF_KWD_CHAR
, '\\'},
179 {"bin", 0, FALSE
, NMRTF_KWD_SPEC
, NMRTF_SPECIAL_BIN
},
180 {"*", 0, FALSE
, NMRTF_KWD_SPEC
, NMRTF_SPECIAL_SKIP
},
181 {"'", 0, FALSE
, NMRTF_KWD_SPEC
, NMRTF_SPECIAL_HEX
},
182 {"u", 0, FALSE
, NMRTF_KWD_SPEC
, NMRTF_SPECIAL_UNICODE
},
183 {"colortbl", 0, FALSE
, NMRTF_KWD_DEST
, NMRTF_DEST_SKIP
},
184 {"author", 0, FALSE
, NMRTF_KWD_DEST
, NMRTF_DEST_SKIP
},
185 {"buptim", 0, FALSE
, NMRTF_KWD_DEST
, NMRTF_DEST_SKIP
},
186 {"comment", 0, FALSE
, NMRTF_KWD_DEST
, NMRTF_DEST_SKIP
},
187 {"creatim", 0, FALSE
, NMRTF_KWD_DEST
, NMRTF_DEST_SKIP
},
188 {"doccomm", 0, FALSE
, NMRTF_KWD_DEST
, NMRTF_DEST_SKIP
},
189 {"footer", 0, FALSE
, NMRTF_KWD_DEST
, NMRTF_DEST_SKIP
},
190 {"footerf", 0, FALSE
, NMRTF_KWD_DEST
, NMRTF_DEST_SKIP
},
191 {"footerl", 0, FALSE
, NMRTF_KWD_DEST
, NMRTF_DEST_SKIP
},
192 {"footerr", 0, FALSE
, NMRTF_KWD_DEST
, NMRTF_DEST_SKIP
},
193 {"footnote", 0, FALSE
, NMRTF_KWD_DEST
, NMRTF_DEST_SKIP
},
194 {"ftncn", 0, FALSE
, NMRTF_KWD_DEST
, NMRTF_DEST_SKIP
},
195 {"ftnsep", 0, FALSE
, NMRTF_KWD_DEST
, NMRTF_DEST_SKIP
},
196 {"ftnsepc", 0, FALSE
, NMRTF_KWD_DEST
, NMRTF_DEST_SKIP
},
197 {"header", 0, FALSE
, NMRTF_KWD_DEST
, NMRTF_DEST_SKIP
},
198 {"headerf", 0, FALSE
, NMRTF_KWD_DEST
, NMRTF_DEST_SKIP
},
199 {"headerl", 0, FALSE
, NMRTF_KWD_DEST
, NMRTF_DEST_SKIP
},
200 {"headerr", 0, FALSE
, NMRTF_KWD_DEST
, NMRTF_DEST_SKIP
},
201 {"info", 0, FALSE
, NMRTF_KWD_DEST
, NMRTF_DEST_SKIP
},
202 {"keywords", 0, FALSE
, NMRTF_KWD_DEST
, NMRTF_DEST_SKIP
},
203 {"operator", 0, FALSE
, NMRTF_KWD_DEST
, NMRTF_DEST_SKIP
},
204 {"pict", 0, FALSE
, NMRTF_KWD_DEST
, NMRTF_DEST_SKIP
},
205 {"printim", 0, FALSE
, NMRTF_KWD_DEST
, NMRTF_DEST_SKIP
},
206 {"private1", 0, FALSE
, NMRTF_KWD_DEST
, NMRTF_DEST_SKIP
},
207 {"revtim", 0, FALSE
, NMRTF_KWD_DEST
, NMRTF_DEST_SKIP
},
208 {"rxe", 0, FALSE
, NMRTF_KWD_DEST
, NMRTF_DEST_SKIP
},
209 {"stylesheet", 0, FALSE
, NMRTF_KWD_DEST
, NMRTF_DEST_SKIP
},
210 {"subject", 0, FALSE
, NMRTF_KWD_DEST
, NMRTF_DEST_SKIP
},
211 {"tc", 0, FALSE
, NMRTF_KWD_DEST
, NMRTF_DEST_SKIP
},
212 {"title", 0, FALSE
, NMRTF_KWD_DEST
, NMRTF_DEST_SKIP
},
213 {"txe", 0, FALSE
, NMRTF_KWD_DEST
, NMRTF_DEST_SKIP
},
214 {"xe", 0, FALSE
, NMRTF_KWD_DEST
, NMRTF_DEST_SKIP
}
216 int table_size
= sizeof(rtf_symbols
) / sizeof(NMRtfSymbol
);
/*
 * Body of the context constructor (its signature and return statement are
 * not visible in this extract; presumably nm_rtf_init -- TODO confirm).
 * Allocates a zero-filled NMRtfContext, marks the one-character push-back
 * slot as empty, and creates empty GStrings for the temporary ansi buffer
 * and for the output text.
 */
221 NMRtfContext
*ctx
= g_new0(NMRtfContext
, 1);
222 ctx
->nextch_available
= FALSE
;
223 ctx
->ansi
= g_string_new("");
224 ctx
->output
= g_string_new("");
/*
 * Parse the RTF text in `input` using `ctx`.
 *
 * The input pointer is stored in ctx->input (the const qualifier is cast
 * away) and rtf_parse() is run.  When it reports NMRTF_OK the function
 * returns a g_strdup() copy of the accumulated output string.  Otherwise
 * the status code is written to the debug log; the value returned on that
 * path is on a line not visible in this extract.
 */
229 nm_rtf_strip_formatting(NMRtfContext
*ctx
, const char *input
)
233 ctx
->input
= (char *)input
;
234 status
= rtf_parse(ctx
);
235 if (status
== NMRTF_OK
)
236 return g_strdup(ctx
->output
->str
);
238 purple_debug_info("novell", "RTF parser failed with error code %d\n", status
);
/*
 * Release the resources held by `ctx`.
 *
 * Walks every node of the font table (the loop body is not visible in this
 * extract), frees the font-table list, frees the saved-state list together
 * with each saved element (g_free), and frees both GStrings including their
 * character data.  Whether `ctx` itself is freed is not visible here.
 */
243 nm_rtf_deinit(NMRtfContext
*ctx
)
249 for (node
= ctx
->font_table
; node
; node
= node
->next
) {
255 g_slist_free(ctx
->font_table
);
256 g_slist_free_full(ctx
->saved
, g_free
);
257 g_string_free(ctx
->ansi
, TRUE
);
258 g_string_free(ctx
->output
, TRUE
);
/*
 * Choose an encoding from the charset of the currently selected font.
 *
 * Looks up the font at ctx->chp.font_idx and switches on its charset; the
 * individual cases are not visible in this extract.  A charset with no
 * matching case is reported through the debug log.
 */
264 get_current_encoding(NMRtfContext
*ctx
)
268 font
= rtf_get_font(ctx
, ctx
->chp
.font_idx
);
/*
 * NOTE(review): rtf_get_font() returns the result of g_slist_nth_data(),
 * which is NULL when the position is past the end of the list.  No NULL
 * check on `font` is visible before the dereference below -- confirm
 * against the full file.
 */
270 switch (font
->charset
) {
311 purple_debug_info("novell", "Unhandled font charset %d\n", font
->charset
);
318 * Add an entry to the font table
/*
 * Create a font record and append it to the context's font table.
 *
 * A zero-filled NMRtfFont is allocated, `number` and `charset` are stored
 * as given, and `name` is duplicated with g_strdup().  The new entry is
 * logged and then appended to the end of ctx->font_table.
 */
321 rtf_add_font_entry(NMRtfContext
*ctx
, int number
, const char *name
, int charset
)
323 NMRtfFont
*font
= g_new0(NMRtfFont
, 1);
325 font
->number
= number
;
326 font
->name
= g_strdup(name
);
327 font
->charset
= charset
;
329 purple_debug_info("novell", "Adding font to table: #%d\t%s\t%d\n",
330 font
->number
, font
->name
, font
->charset
);
332 ctx
->font_table
= g_slist_append(ctx
->font_table
, font
);
338 * Return the nth entry in the font table
/*
 * Fetch the font stored at list position `nth` of ctx->font_table.
 *
 * NOTE(review): the lookup is by position in the list, not by the `number`
 * field that rtf_add_font_entry() records.  The two agree only when fonts
 * are added in order starting from 0 -- confirm that callers rely on this.
 * g_slist_nth_data() yields NULL for a position past the end of the list.
 */
341 rtf_get_font(NMRtfContext
*ctx
, int nth
)
345 font
= g_slist_nth_data(ctx
->font_table
, nth
);
352 * Isolate RTF keywords and send them to rtf_parse_keyword;
353 * Push and pop state at the start and end of RTF groups;
354 * Send text to rtf_dispatch_char for further processing.
/*
 * Main parse loop: reads the input one character at a time until
 * rtf_get_char() stops returning NMRTF_OK.
 *
 * The switch/case labels, braces and several returns are not visible in
 * this extract.  From the visible lines:
 *  - while the internal state is NMRTF_STATE_BIN, characters go straight
 *    to rtf_dispatch_char();
 *  - before pushing a state the depth counter is checked against
 *    NMRTF_MAX_DEPTH and NMRTF_STACK_OVERFLOW is returned when exceeded;
 *  - before popping a state, one trailing '\n' is trimmed from the output
 *    when the destination state is NMRTF_STATE_NORMAL;
 *  - keywords are handed to rtf_parse_keyword();
 *  - in NMRTF_STATE_HEX, hex digits are accumulated into hex_byte four
 *    bits at a time; when hex_count reaches 0 the byte is dispatched and
 *    the internal state returns to NMRTF_STATE_NORMAL; a character outside
 *    the accepted ranges yields NMRTF_INVALID_HEX.
 */
357 rtf_parse(NMRtfContext
*ctx
)
365 if (ctx
->input
== NULL
)
368 while (rtf_get_char(ctx
, &ch
) == NMRTF_OK
) {
370 return NMRTF_STACK_UNDERFLOW
;
372 /* if we're parsing binary data, handle it directly */
373 if (ctx
->ris
== NMRTF_STATE_BIN
) {
374 if ((status
= rtf_dispatch_char(ctx
, ch
)) != NMRTF_OK
)
379 if (ctx
->depth
> NMRTF_MAX_DEPTH
)
380 return NMRTF_STACK_OVERFLOW
;
382 if ((status
= rtf_push_state(ctx
)) != NMRTF_OK
)
388 /* for some reason there is always an unwanted '\par' at the end */
389 if (ctx
->rds
== NMRTF_STATE_NORMAL
) {
/*
 * NOTE(review): when the output is still empty, len is 0 and the
 * str[len-1] test below reads one byte before the buffer.  No len > 0
 * guard is visible in this extract -- confirm against the full file.
 */
390 len
= ctx
->output
->len
;
391 if (ctx
->output
->str
[len
-1] == '\n')
392 ctx
->output
= g_string_truncate(ctx
->output
, len
-1);
395 if ((status
= rtf_pop_state(ctx
)) != NMRTF_OK
)
399 return NMRTF_STACK_OVERFLOW
;
402 if ((status
= rtf_parse_keyword(ctx
)) != NMRTF_OK
)
406 case 0x0a: /* cr and lf are noise characters... */
409 if (ctx
->ris
== NMRTF_STATE_NORMAL
) {
410 if ((status
= rtf_dispatch_char(ctx
, ch
)) != NMRTF_OK
)
412 } else { /* parsing a hex encoded character */
413 if (ctx
->ris
!= NMRTF_STATE_HEX
)
414 return NMRTF_ASSERTION
;
/* make room for the next nibble in the low four bits */
416 hex_byte
= hex_byte
<< 4;
418 hex_byte
+= (char) ch
- '0';
421 if (ch
< 'a' || ch
> 'f')
422 return NMRTF_INVALID_HEX
;
423 hex_byte
+= (char) ch
- 'a' + 10;
425 if (ch
< 'A' || ch
> 'F')
426 return NMRTF_INVALID_HEX
;
427 hex_byte
+= (char) ch
- 'A' + 10;
431 if (hex_count
== 0) {
432 if ((status
= rtf_dispatch_char(ctx
, hex_byte
)) != NMRTF_OK
)
436 ctx
->ris
= NMRTF_STATE_NORMAL
;
444 return NMRTF_STACK_OVERFLOW
;
446 return NMRTF_UNMATCHED_BRACE
;
451 * Push the current state onto stack
/*
 * Save the current parser state on the stack.
 *
 * Copies the character properties, destination state and internal state
 * into a newly allocated NMRtfStateSave, puts it at the head of
 * ctx->saved, and resets the internal state to NMRTF_STATE_NORMAL.
 * The return statement is not visible in this extract.
 */
454 rtf_push_state(NMRtfContext
*ctx
)
456 NMRtfStateSave
*save
= g_new0(NMRtfStateSave
, 1);
457 save
->chp
= ctx
->chp
;
458 save
->rds
= ctx
->rds
;
459 save
->ris
= ctx
->ris
;
460 ctx
->saved
= g_slist_prepend(ctx
->saved
, save
);
461 ctx
->ris
= NMRTF_STATE_NORMAL
;
467 * Restore the state at the top of the stack
/*
 * Restore the most recently saved parser state.
 *
 * Returns NMRTF_STACK_UNDERFLOW when the saved-state list is empty.
 * Otherwise copies the character properties, destination state and
 * internal state back from the head element and unlinks that head node
 * from ctx->saved.  The lines between the restore and the unlink, and the
 * final return, are not visible in this extract.
 */
470 rtf_pop_state(NMRtfContext
*ctx
)
472 NMRtfStateSave
*save_old
;
475 if (ctx
->saved
== NULL
)
476 return NMRTF_STACK_UNDERFLOW
;
478 save_old
= ctx
->saved
->data
;
479 ctx
->chp
= save_old
->chp
;
480 ctx
->rds
= save_old
->rds
;
481 ctx
->ris
= save_old
->ris
;
485 link_old
= ctx
->saved
;
486 ctx
->saved
= g_slist_delete_link(ctx
->saved
, link_old
);
492 * Get a control word (and its associated value) and
493 * dispatch the control.
/*
 * Read one control word or control symbol, plus its optional numeric
 * parameter, and pass it to rtf_dispatch_control().
 *
 * Several conditions and braces are not visible in this extract.  From the
 * visible lines: a control symbol is dispatched immediately as a
 * one-character keyword with parameter 0; otherwise alphabetic characters
 * are collected into `keyword` and digits into `parameter`, each bounded
 * by its buffer size.  The parameter is converted with atoi(), stored in
 * ctx->param and, on a branch whose condition is not visible, negated.
 * The delimiter character is pushed back on a branch whose condition is
 * not visible.
 *
 * NOTE(review): the increment expressions of both collection loops call
 * rtf_get_char() and ignore its status.
 */
496 rtf_parse_keyword(NMRtfContext
*ctx
)
498 int status
= NMRTF_OK
;
500 gboolean param_set
= FALSE
;
501 gboolean is_neg
= FALSE
;
509 if ((status
= rtf_get_char(ctx
, &ch
)) != NMRTF_OK
)
513 /* a control symbol; no delimiter. */
514 keyword
[0] = (char) ch
;
516 return rtf_dispatch_control(ctx
, keyword
, 0, param_set
);
520 for (i
= 0; isalpha(ch
) && (i
< sizeof(keyword
) - 1); rtf_get_char(ctx
, &ch
)) {
521 keyword
[i
] = (char) ch
;
526 /* check for '-' indicating a negative parameter value */
529 if ((status
= rtf_get_char(ctx
, &ch
)) != NMRTF_OK
)
533 /* check for numerical param */
537 for (i
= 0; isdigit(ch
) && (i
< sizeof(parameter
) - 1); rtf_get_char(ctx
, &ch
)) {
538 parameter
[i
] = (char) ch
;
543 ctx
->param
= param
= atoi(parameter
);
545 ctx
->param
= param
= -param
;
548 /* space after control is optional, put character back if it is not a space */
550 rtf_unget_char(ctx
, ch
);
552 return rtf_dispatch_control(ctx
, keyword
, param
, param_set
);
556 * Route the character to the appropriate destination
/*
 * Route one 8-bit character according to the current state.
 *
 * While in NMRTF_STATE_BIN the remaining-byte counter is decremented and
 * the internal state returns to NMRTF_STATE_NORMAL once it reaches zero or
 * below.  The switch expression and some branch conditions are not visible
 * in this extract; from the visible cases, NMRTF_STATE_NORMAL prints the
 * character, and NMRTF_STATE_FONTTABLE either records a font entry from
 * the text gathered in ctx->ansi (then empties that buffer) or prints the
 * character.
 */
559 rtf_dispatch_char(NMRtfContext
*ctx
, guchar ch
)
561 if (ctx
->ris
== NMRTF_STATE_BIN
&& --(ctx
->bytes_to_skip
) <= 0)
562 ctx
->ris
= NMRTF_STATE_NORMAL
;
565 case NMRTF_STATE_SKIP
:
567 case NMRTF_STATE_NORMAL
:
568 return rtf_print_char(ctx
, ch
);
569 case NMRTF_STATE_FONTTABLE
:
/* the text gathered so far in ctx->ansi is used as the font name */
571 rtf_add_font_entry(ctx
, ctx
->chp
.font_idx
,
572 ctx
->ansi
->str
, ctx
->chp
.font_charset
);
573 g_string_truncate(ctx
->ansi
, 0);
576 return rtf_print_char(ctx
, ch
);
584 /* Handle a unicode character */
/*
 * Route one Unicode character according to the current state.
 * In the NMRTF_STATE_NORMAL and NMRTF_STATE_FONTTABLE cases the character
 * is passed to rtf_print_unicode_char(); the switch expression and the
 * body of the NMRTF_STATE_SKIP case are not visible in this extract.
 */
586 rtf_dispatch_unicode_char(NMRtfContext
*ctx
, gunichar ch
)
589 case NMRTF_STATE_SKIP
:
591 case NMRTF_STATE_NORMAL
:
592 case NMRTF_STATE_FONTTABLE
:
593 return rtf_print_unicode_char(ctx
, ch
);
/*
 * Append one 8-bit character to the temporary ansi buffer (ctx->ansi).
 * The return statement is not visible in this extract.
 */
603 rtf_print_char(NMRtfContext
*ctx
, guchar ch
)
606 ctx
->ansi
= g_string_append_c(ctx
->ansi
, ch
);
612 * Output a unicode character
/*
 * Encode one Unicode character as UTF-8 and append it to ctx->output.
 *
 * The existing comments indicate that the ansi buffer is flushed first;
 * the flush call itself is on a line not visible in this extract.
 *
 * NOTE(review): g_unichar_to_utf8() does not NUL-terminate `buf`, yet
 * `buf` is used as a C string below.  Presumably a terminator is written
 * using `num` on a line not visible here -- confirm against the full file.
 */
615 rtf_print_unicode_char(NMRtfContext
*ctx
, gunichar ch
)
620 /* convert and flush the ansi buffer to the utf8 buffer */
623 /* convert the unicode character to utf8 and add directly to the output buffer */
624 num
= g_unichar_to_utf8((gunichar
) ch
, buf
);
626 purple_debug_info("novell", "converted unichar 0x%X to utf8 char %s\n", ch
, buf
);
628 ctx
->output
= g_string_append(ctx
->output
, buf
);
633 * Flush the output text
/*
 * Convert the temporary ansi buffer to UTF-8 and append it to the output.
 *
 * Work is done only when the destination state is NMRTF_STATE_NORMAL and
 * the ansi buffer is non-empty.  The source encoding comes from
 * get_current_encoding().  The conditions selecting the success and
 * failure branches are not visible in this extract; the visible lines
 * append the converted text and empty the ansi buffer on one branch, and
 * set NMRTF_CONVERT_ERROR and log the GError code and message on the
 * other.  The GError is released with g_error_free().
 */
636 rtf_flush_data(NMRtfContext
*ctx
)
638 int status
= NMRTF_OK
;
639 char *conv_data
= NULL
;
640 const char *enc
= NULL
;
641 GError
*gerror
= NULL
;
643 if (ctx
->rds
== NMRTF_STATE_NORMAL
&& ctx
->ansi
->len
> 0) {
644 enc
= get_current_encoding(ctx
);
645 conv_data
= g_convert(ctx
->ansi
->str
, ctx
->ansi
->len
, "UTF-8", enc
,
646 NULL
, NULL
, &gerror
);
648 ctx
->output
= g_string_append(ctx
->output
, conv_data
);
650 ctx
->ansi
= g_string_truncate(ctx
->ansi
, 0);
652 status
= NMRTF_CONVERT_ERROR
;
653 purple_debug_info("novell", "failed to convert data! error code = %d msg = %s\n",
654 gerror
->code
, gerror
->message
);
657 g_error_free(gerror
);
663 * Handle a property change
/*
 * Apply a property keyword to the current character properties.
 *
 * Does nothing and returns NMRTF_OK while the destination state is
 * NMRTF_STATE_SKIP.  Otherwise NMRTF_PROP_FONT_IDX stores `val` as the
 * current font index and NMRTF_PROP_FONT_CHARSET stores it as the current
 * font charset.  NMRTF_BAD_TABLE is returned on a branch whose label is
 * not visible in this extract (presumably the default case).
 */
666 rtf_apply_property(NMRtfContext
*ctx
, NMRtfProperty prop
, int val
)
668 if (ctx
->rds
== NMRTF_STATE_SKIP
) /* If we're skipping text, */
669 return NMRTF_OK
; /* don't do anything. */
671 /* Need to flush any temporary data before a property change*/
675 case NMRTF_PROP_FONT_IDX
:
676 ctx
->chp
.font_idx
= val
;
678 case NMRTF_PROP_FONT_CHARSET
:
679 ctx
->chp
.font_charset
= val
;
682 return NMRTF_BAD_TABLE
;
690 * Search the table for keyword and evaluate it appropriately.
693 * keyword: The RTF control to evaluate.
694 * param: The parameter of the RTF control.
695 * param_set: TRUE if the control had a parameter; (that is, if param is valid)
696 * FALSE if it did not.
/*
 * Look `keyword` up in rtf_symbols and act on the matching row.
 *
 * The table is scanned from index 0 and compared with purple_strequal().
 * When no row matches (idx reaches table_size) the destination state is
 * switched to NMRTF_STATE_SKIP if skip_unknown was set, and skip_unknown
 * is cleared.  When a row matches, skip_unknown is cleared and the row's
 * kwd_type selects the handler; the case labels are not visible in this
 * extract, but the visible calls pass the row's action to
 * rtf_apply_property(), rtf_dispatch_char(), rtf_change_destination() or
 * rtf_dispatch_special().  For properties the table default replaces
 * `param` when pass_default is set or no parameter was supplied.
 */
699 rtf_dispatch_control(NMRtfContext
*ctx
, char *keyword
, int param
, gboolean param_set
)
703 for (idx
= 0; idx
< table_size
; idx
++) {
704 if (purple_strequal(keyword
, rtf_symbols
[idx
].keyword
))
708 if (idx
== table_size
) {
709 if (ctx
->skip_unknown
)
710 ctx
->rds
= NMRTF_STATE_SKIP
;
711 ctx
->skip_unknown
= FALSE
;
715 /* found it! use kwd_type and action to determine what to do with it. */
716 ctx
->skip_unknown
= FALSE
;
717 switch (rtf_symbols
[idx
].kwd_type
) {
719 if (rtf_symbols
[idx
].pass_default
|| !param_set
)
720 param
= rtf_symbols
[idx
].default_val
;
721 return rtf_apply_property(ctx
, rtf_symbols
[idx
].action
, param
);
723 return rtf_dispatch_char(ctx
, rtf_symbols
[idx
].action
);
725 return rtf_change_destination(ctx
, rtf_symbols
[idx
].action
);
727 return rtf_dispatch_special(ctx
, rtf_symbols
[idx
].action
);
729 return NMRTF_BAD_TABLE
;
731 return NMRTF_BAD_TABLE
;
735 * Change to the destination specified.
/*
 * Switch the destination state for the current group.
 *
 * Nothing is changed while the destination state is already
 * NMRTF_STATE_SKIP.  NMRTF_DEST_FONTTABLE selects NMRTF_STATE_FONTTABLE
 * and empties the temporary ansi buffer.  The remaining visible branch,
 * whose label is not visible in this extract, selects NMRTF_STATE_SKIP.
 */
738 rtf_change_destination(NMRtfContext
*ctx
, NMRtfDestinationType type
)
740 /* if we're skipping text, don't do anything */
741 if (ctx
->rds
== NMRTF_STATE_SKIP
)
745 case NMRTF_DEST_FONTTABLE
:
746 ctx
->rds
= NMRTF_STATE_FONTTABLE
;
747 g_string_truncate(ctx
->ansi
, 0);
750 ctx
->rds
= NMRTF_STATE_SKIP
; /* when in doubt, skip it... */
757 * Dispatch an RTF control that needs special processing
/*
 * Handle the control keywords that need special processing.
 *
 * While the destination state is NMRTF_STATE_SKIP everything except
 * NMRTF_SPECIAL_BIN is ignored.  Otherwise:
 *  - NMRTF_SPECIAL_BIN enters NMRTF_STATE_BIN and takes the byte count
 *    from ctx->param;
 *  - NMRTF_SPECIAL_SKIP sets skip_unknown;
 *  - NMRTF_SPECIAL_HEX enters NMRTF_STATE_HEX;
 *  - NMRTF_SPECIAL_UNICODE dispatches ctx->param as a Unicode character
 *    and, on success, reads one more input character into `ch`.
 * NMRTF_BAD_TABLE is assigned on a branch whose label is not visible in
 * this extract.
 *
 * NOTE(review): ctx->param is a signed int and is passed where a gunichar
 * is expected; a negative parameter would convert to a very large value.
 * Confirm whether negative values can reach this point.
 */
760 rtf_dispatch_special(NMRtfContext
*ctx
, NMRtfSpecialKwd type
)
762 int status
= NMRTF_OK
;
765 if (ctx
->rds
== NMRTF_STATE_SKIP
&& type
!= NMRTF_SPECIAL_BIN
) /* if we're skipping, and it's not */
766 return NMRTF_OK
; /* the \bin keyword, ignore it. */
769 case NMRTF_SPECIAL_BIN
:
770 ctx
->ris
= NMRTF_STATE_BIN
;
771 ctx
->bytes_to_skip
= ctx
->param
;
773 case NMRTF_SPECIAL_SKIP
:
774 ctx
->skip_unknown
= TRUE
;
776 case NMRTF_SPECIAL_HEX
:
777 ctx
->ris
= NMRTF_STATE_HEX
;
779 case NMRTF_SPECIAL_UNICODE
:
780 purple_debug_info("novell", "parsing unichar\n");
781 status
= rtf_dispatch_unicode_char(ctx
, ctx
->param
);
783 if (status
== NMRTF_OK
)
784 status
= rtf_get_char(ctx
, &ch
);
787 status
= NMRTF_BAD_TABLE
;
795 * Get the next character from the input stream
/*
 * Fetch the next input character into *ch.
 * When a pushed-back character is pending (nextch_available) that branch
 * is taken and the flag is cleared; the remaining lines, including the
 * read from ctx->input and the return values, are not visible in this
 * extract.
 */
798 rtf_get_char(NMRtfContext
*ctx
, guchar
*ch
)
800 if (ctx
->nextch_available
) {
802 ctx
->nextch_available
= FALSE
;
815 * Move a character back into the input stream
/*
 * Push one character back so that the next rtf_get_char() sees it.
 * The visible line sets nextch_available; the store of `ch` and the
 * return statement are on lines not visible in this extract.
 */
818 rtf_unget_char(NMRtfContext
*ctx
, guchar ch
)
821 ctx
->nextch_available
= TRUE
;