4 * Copyright (c) 2004 Novell, Inc. All Rights Reserved.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; version 2 of the License.
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111-1301 USA
21 /* This code was adapted from the sample RTF reader found here:
22 * http://msdn.microsoft.com/library/default.asp?url=/library/en-us/dnrtfspec/html/rtfspec.asp
/* Internal RTF parser error codes */
#define NMRTF_OK               0  /* Everything's fine! */
#define NMRTF_STACK_UNDERFLOW  1  /* Unmatched '}' */
#define NMRTF_STACK_OVERFLOW   2  /* Too many '{' -- memory exhausted */
#define NMRTF_UNMATCHED_BRACE  3  /* RTF ended during an open group. */
#define NMRTF_INVALID_HEX      4  /* invalid hex character found in data */
#define NMRTF_BAD_TABLE        5  /* RTF table (sym or prop) invalid */
#define NMRTF_ASSERTION        6  /* Assertion failure */
#define NMRTF_EOF              7  /* End of file reached while reading RTF */
#define NMRTF_CONVERT_ERROR    8  /* Error converting text */

/* Deepest group nesting rtf_parse accepts before it reports NMRTF_STACK_OVERFLOW */
#define NMRTF_MAX_DEPTH 256
52 NMRTF_STATE_FONTTABLE
,
55 } NMRtfState
; /* Rtf State */
57 /* Property types that we care about */
61 NMRTF_PROP_FONT_CHARSET
,
69 NMRTF_SPECIAL_UNICODE
,
77 } NMRtfDestinationType
;
89 /* All we care about for now is the font.
90 * bold, italic, underline, etc. should be
106 char *keyword
; /* RTF keyword */
107 int default_val
; /* default value to use */
108 gboolean pass_default
; /* true to use default value from this table */
109 NMRtfKeywordType kwd_type
; /* the type of the keyword */
110 int action
; /* property type if the keyword represents a property */
111 /* destination type if the keyword represents a destination */
112 /* character to print if the keyword represents a character */
126 NMRtfState rds
; /* destination state */
127 NMRtfState ris
; /* internal state */
128 NMRtfCharProp chp
; /* current character properties (ie. font, bold, italic, etc.) */
129 GSList
*font_table
; /* the font table */
130 GSList
*saved
; /* saved state stack */
131 int param
; /* numeric parameter for the current keyword */
132 long bytes_to_skip
; /* number of bytes to skip (after encountering \bin) */
133 int depth
; /* how many groups deep are we */
134 gboolean skip_unknown
; /* if true, skip any unknown destinations (this is set after encountering '\*') */
135 char *input
; /* input string */
136 guchar nextch
; /* next char in input */
137 gboolean nextch_available
; /* nextch value is set */
138 GString
*ansi
; /* Temporary ansi text, will be converted/flushed to the output string */
139 GString
*output
; /* The plain text UTF8 string */
142 static int rtf_parse(NMRtfContext
*ctx
);
143 static int rtf_push_state(NMRtfContext
*ctx
);
144 static int rtf_pop_state(NMRtfContext
*ctx
);
145 static NMRtfFont
*rtf_get_font(NMRtfContext
*ctx
, int index
);
146 static int rtf_get_char(NMRtfContext
*ctx
, guchar
*ch
);
147 static int rtf_unget_char(NMRtfContext
*ctx
, guchar ch
);
148 static int rtf_flush_data(NMRtfContext
*ctx
);
149 static int rtf_parse_keyword(NMRtfContext
*ctx
);
150 static int rtf_dispatch_control(NMRtfContext
*ctx
, char *keyword
, int param
, gboolean param_set
);
151 static int rtf_dispatch_char(NMRtfContext
*ctx
, guchar ch
);
152 static int rtf_dispatch_unicode_char(NMRtfContext
*ctx
, gunichar ch
);
153 static int rtf_print_char(NMRtfContext
*ctx
, guchar ch
);
154 static int rtf_print_unicode_char(NMRtfContext
*ctx
, gunichar ch
);
155 static int rtf_change_destination(NMRtfContext
*ctx
, NMRtfDestinationType dest
);
156 static int rtf_dispatch_special(NMRtfContext
*ctx
, NMRtfSpecialKwd special
);
157 static int rtf_apply_property(NMRtfContext
*ctx
, NMRtfProperty prop
, int val
);
159 /* RTF parser tables */
161 /* Keyword descriptions */
162 NMRtfSymbol rtf_symbols
[] = {
163 /* keyword, default, pass_default, keyword_type, action */
164 {"fonttbl", 0, FALSE
, NMRTF_KWD_DEST
, NMRTF_DEST_FONTTABLE
},
165 {"f", 0, FALSE
, NMRTF_KWD_PROP
, NMRTF_PROP_FONT_IDX
},
166 {"fcharset", 0, FALSE
, NMRTF_KWD_PROP
, NMRTF_PROP_FONT_CHARSET
},
167 {"par", 0, FALSE
, NMRTF_KWD_CHAR
, 0x0a},
168 {"line", 0, FALSE
, NMRTF_KWD_CHAR
, 0x0a},
169 {"\0x0a", 0, FALSE
, NMRTF_KWD_CHAR
, 0x0a},
170 {"\0x0d", 0, FALSE
, NMRTF_KWD_CHAR
, 0x0a},
171 {"tab", 0, FALSE
, NMRTF_KWD_CHAR
, 0x09},
172 {"\r", 0, FALSE
, NMRTF_KWD_CHAR
, '\r'},
173 {"\n", 0, FALSE
, NMRTF_KWD_CHAR
, '\n'},
174 {"ldblquote",0, FALSE
, NMRTF_KWD_CHAR
, '"'},
175 {"rdblquote",0, FALSE
, NMRTF_KWD_CHAR
, '"'},
176 {"{", 0, FALSE
, NMRTF_KWD_CHAR
, '{'},
177 {"}", 0, FALSE
, NMRTF_KWD_CHAR
, '}'},
178 {"\\", 0, FALSE
, NMRTF_KWD_CHAR
, '\\'},
179 {"bin", 0, FALSE
, NMRTF_KWD_SPEC
, NMRTF_SPECIAL_BIN
},
180 {"*", 0, FALSE
, NMRTF_KWD_SPEC
, NMRTF_SPECIAL_SKIP
},
181 {"'", 0, FALSE
, NMRTF_KWD_SPEC
, NMRTF_SPECIAL_HEX
},
182 {"u", 0, FALSE
, NMRTF_KWD_SPEC
, NMRTF_SPECIAL_UNICODE
},
183 {"colortbl", 0, FALSE
, NMRTF_KWD_DEST
, NMRTF_DEST_SKIP
},
184 {"author", 0, FALSE
, NMRTF_KWD_DEST
, NMRTF_DEST_SKIP
},
185 {"buptim", 0, FALSE
, NMRTF_KWD_DEST
, NMRTF_DEST_SKIP
},
186 {"comment", 0, FALSE
, NMRTF_KWD_DEST
, NMRTF_DEST_SKIP
},
187 {"creatim", 0, FALSE
, NMRTF_KWD_DEST
, NMRTF_DEST_SKIP
},
188 {"doccomm", 0, FALSE
, NMRTF_KWD_DEST
, NMRTF_DEST_SKIP
},
189 {"footer", 0, FALSE
, NMRTF_KWD_DEST
, NMRTF_DEST_SKIP
},
190 {"footerf", 0, FALSE
, NMRTF_KWD_DEST
, NMRTF_DEST_SKIP
},
191 {"footerl", 0, FALSE
, NMRTF_KWD_DEST
, NMRTF_DEST_SKIP
},
192 {"footerr", 0, FALSE
, NMRTF_KWD_DEST
, NMRTF_DEST_SKIP
},
193 {"footnote", 0, FALSE
, NMRTF_KWD_DEST
, NMRTF_DEST_SKIP
},
194 {"ftncn", 0, FALSE
, NMRTF_KWD_DEST
, NMRTF_DEST_SKIP
},
195 {"ftnsep", 0, FALSE
, NMRTF_KWD_DEST
, NMRTF_DEST_SKIP
},
196 {"ftnsepc", 0, FALSE
, NMRTF_KWD_DEST
, NMRTF_DEST_SKIP
},
197 {"header", 0, FALSE
, NMRTF_KWD_DEST
, NMRTF_DEST_SKIP
},
198 {"headerf", 0, FALSE
, NMRTF_KWD_DEST
, NMRTF_DEST_SKIP
},
199 {"headerl", 0, FALSE
, NMRTF_KWD_DEST
, NMRTF_DEST_SKIP
},
200 {"headerr", 0, FALSE
, NMRTF_KWD_DEST
, NMRTF_DEST_SKIP
},
201 {"info", 0, FALSE
, NMRTF_KWD_DEST
, NMRTF_DEST_SKIP
},
202 {"keywords", 0, FALSE
, NMRTF_KWD_DEST
, NMRTF_DEST_SKIP
},
203 {"operator", 0, FALSE
, NMRTF_KWD_DEST
, NMRTF_DEST_SKIP
},
204 {"pict", 0, FALSE
, NMRTF_KWD_DEST
, NMRTF_DEST_SKIP
},
205 {"printim", 0, FALSE
, NMRTF_KWD_DEST
, NMRTF_DEST_SKIP
},
206 {"private1", 0, FALSE
, NMRTF_KWD_DEST
, NMRTF_DEST_SKIP
},
207 {"revtim", 0, FALSE
, NMRTF_KWD_DEST
, NMRTF_DEST_SKIP
},
208 {"rxe", 0, FALSE
, NMRTF_KWD_DEST
, NMRTF_DEST_SKIP
},
209 {"stylesheet", 0, FALSE
, NMRTF_KWD_DEST
, NMRTF_DEST_SKIP
},
210 {"subject", 0, FALSE
, NMRTF_KWD_DEST
, NMRTF_DEST_SKIP
},
211 {"tc", 0, FALSE
, NMRTF_KWD_DEST
, NMRTF_DEST_SKIP
},
212 {"title", 0, FALSE
, NMRTF_KWD_DEST
, NMRTF_DEST_SKIP
},
213 {"txe", 0, FALSE
, NMRTF_KWD_DEST
, NMRTF_DEST_SKIP
},
214 {"xe", 0, FALSE
, NMRTF_KWD_DEST
, NMRTF_DEST_SKIP
}
216 int table_size
= sizeof(rtf_symbols
) / sizeof(NMRtfSymbol
);
221 NMRtfContext
*ctx
= g_new0(NMRtfContext
, 1);
222 ctx
->nextch_available
= FALSE
;
223 ctx
->ansi
= g_string_new("");
224 ctx
->output
= g_string_new("");
229 nm_rtf_strip_formatting(NMRtfContext
*ctx
, const char *input
)
233 ctx
->input
= (char *)input
;
234 status
= rtf_parse(ctx
);
235 if (status
== NMRTF_OK
)
236 return g_strdup(ctx
->output
->str
);
238 purple_debug_info("novell", "RTF parser failed with error code %d\n", status
);
243 nm_rtf_deinit(NMRtfContext
*ctx
)
247 NMRtfStateSave
*save
;
250 for (node
= ctx
->font_table
; node
; node
= node
->next
) {
256 g_slist_free(ctx
->font_table
);
257 for (node
= ctx
->saved
; node
; node
= node
->next
) {
262 g_slist_free(ctx
->saved
);
263 g_string_free(ctx
->ansi
, TRUE
);
264 g_string_free(ctx
->output
, TRUE
);
270 get_current_encoding(NMRtfContext
*ctx
)
274 font
= rtf_get_font(ctx
, ctx
->chp
.font_idx
);
276 switch (font
->charset
) {
317 purple_debug_info("novell", "Unhandled font charset %d\n", font
->charset
);
324 * Add an entry to the font table
327 rtf_add_font_entry(NMRtfContext
*ctx
, int number
, const char *name
, int charset
)
329 NMRtfFont
*font
= g_new0(NMRtfFont
, 1);
331 font
->number
= number
;
332 font
->name
= g_strdup(name
);
333 font
->charset
= charset
;
335 purple_debug_info("novell", "Adding font to table: #%d\t%s\t%d\n",
336 font
->number
, font
->name
, font
->charset
);
338 ctx
->font_table
= g_slist_append(ctx
->font_table
, font
);
344 * Return the nth entry in the font table
347 rtf_get_font(NMRtfContext
*ctx
, int nth
)
351 font
= g_slist_nth_data(ctx
->font_table
, nth
);
358 * Isolate RTF keywords and send them to rtf_parse_keyword;
359 * Push and pop state at the start and end of RTF groups;
360 * Send text to rtf_dispatch_char for further processing.
363 rtf_parse(NMRtfContext
*ctx
)
371 if (ctx
->input
== NULL
)
374 while (rtf_get_char(ctx
, &ch
) == NMRTF_OK
) {
376 return NMRTF_STACK_UNDERFLOW
;
378 /* if we're parsing binary data, handle it directly */
379 if (ctx
->ris
== NMRTF_STATE_BIN
) {
380 if ((status
= rtf_dispatch_char(ctx
, ch
)) != NMRTF_OK
)
385 if (ctx
->depth
> NMRTF_MAX_DEPTH
)
386 return NMRTF_STACK_OVERFLOW
;
388 if ((status
= rtf_push_state(ctx
)) != NMRTF_OK
)
394 /* for some reason there is always an unwanted '\par' at the end */
395 if (ctx
->rds
== NMRTF_STATE_NORMAL
) {
396 len
= ctx
->output
->len
;
397 if (ctx
->output
->str
[len
-1] == '\n')
398 ctx
->output
= g_string_truncate(ctx
->output
, len
-1);
401 if ((status
= rtf_pop_state(ctx
)) != NMRTF_OK
)
405 return NMRTF_STACK_OVERFLOW
;
408 if ((status
= rtf_parse_keyword(ctx
)) != NMRTF_OK
)
412 case 0x0a: /* cr and lf are noise characters... */
415 if (ctx
->ris
== NMRTF_STATE_NORMAL
) {
416 if ((status
= rtf_dispatch_char(ctx
, ch
)) != NMRTF_OK
)
418 } else { /* parsing a hex encoded character */
419 if (ctx
->ris
!= NMRTF_STATE_HEX
)
420 return NMRTF_ASSERTION
;
422 hex_byte
= hex_byte
<< 4;
424 hex_byte
+= (char) ch
- '0';
427 if (ch
< 'a' || ch
> 'f')
428 return NMRTF_INVALID_HEX
;
429 hex_byte
+= (char) ch
- 'a' + 10;
431 if (ch
< 'A' || ch
> 'F')
432 return NMRTF_INVALID_HEX
;
433 hex_byte
+= (char) ch
- 'A' + 10;
437 if (hex_count
== 0) {
438 if ((status
= rtf_dispatch_char(ctx
, hex_byte
)) != NMRTF_OK
)
442 ctx
->ris
= NMRTF_STATE_NORMAL
;
450 return NMRTF_STACK_OVERFLOW
;
452 return NMRTF_UNMATCHED_BRACE
;
457 * Push the current state onto stack
460 rtf_push_state(NMRtfContext
*ctx
)
462 NMRtfStateSave
*save
= g_new0(NMRtfStateSave
, 1);
463 save
->chp
= ctx
->chp
;
464 save
->rds
= ctx
->rds
;
465 save
->ris
= ctx
->ris
;
466 ctx
->saved
= g_slist_prepend(ctx
->saved
, save
);
467 ctx
->ris
= NMRTF_STATE_NORMAL
;
473 * Restore the state at the top of the stack
476 rtf_pop_state(NMRtfContext
*ctx
)
478 NMRtfStateSave
*save_old
;
481 if (ctx
->saved
== NULL
)
482 return NMRTF_STACK_UNDERFLOW
;
484 save_old
= ctx
->saved
->data
;
485 ctx
->chp
= save_old
->chp
;
486 ctx
->rds
= save_old
->rds
;
487 ctx
->ris
= save_old
->ris
;
491 link_old
= ctx
->saved
;
492 ctx
->saved
= g_slist_remove_link(ctx
->saved
, link_old
);
493 g_slist_free_1(link_old
);
499 * Get a control word (and its associated value) and
500 * dispatch the control.
503 rtf_parse_keyword(NMRtfContext
*ctx
)
505 int status
= NMRTF_OK
;
507 gboolean param_set
= FALSE
;
508 gboolean is_neg
= FALSE
;
516 if ((status
= rtf_get_char(ctx
, &ch
)) != NMRTF_OK
)
520 /* a control symbol; no delimiter. */
521 keyword
[0] = (char) ch
;
523 return rtf_dispatch_control(ctx
, keyword
, 0, param_set
);
527 for (i
= 0; isalpha(ch
) && (i
< sizeof(keyword
) - 1); rtf_get_char(ctx
, &ch
)) {
528 keyword
[i
] = (char) ch
;
533 /* check for '-' indicating a negative parameter value */
536 if ((status
= rtf_get_char(ctx
, &ch
)) != NMRTF_OK
)
540 /* check for numerical param */
544 for (i
= 0; isdigit(ch
) && (i
< sizeof(parameter
) - 1); rtf_get_char(ctx
, &ch
)) {
545 parameter
[i
] = (char) ch
;
550 ctx
->param
= param
= atoi(parameter
);
552 ctx
->param
= param
= -param
;
555 /* space after control is optional, put character back if it is not a space */
557 rtf_unget_char(ctx
, ch
);
559 return rtf_dispatch_control(ctx
, keyword
, param
, param_set
);
563 * Route the character to the appropriate destination
566 rtf_dispatch_char(NMRtfContext
*ctx
, guchar ch
)
568 if (ctx
->ris
== NMRTF_STATE_BIN
&& --(ctx
->bytes_to_skip
) <= 0)
569 ctx
->ris
= NMRTF_STATE_NORMAL
;
572 case NMRTF_STATE_SKIP
:
574 case NMRTF_STATE_NORMAL
:
575 return rtf_print_char(ctx
, ch
);
576 case NMRTF_STATE_FONTTABLE
:
578 rtf_add_font_entry(ctx
, ctx
->chp
.font_idx
,
579 ctx
->ansi
->str
, ctx
->chp
.font_charset
);
580 g_string_truncate(ctx
->ansi
, 0);
583 return rtf_print_char(ctx
, ch
);
591 /* Handle a unicode character */
593 rtf_dispatch_unicode_char(NMRtfContext
*ctx
, gunichar ch
)
596 case NMRTF_STATE_SKIP
:
598 case NMRTF_STATE_NORMAL
:
599 case NMRTF_STATE_FONTTABLE
:
600 return rtf_print_unicode_char(ctx
, ch
);
610 rtf_print_char(NMRtfContext
*ctx
, guchar ch
)
613 ctx
->ansi
= g_string_append_c(ctx
->ansi
, ch
);
619 * Output a unicode character
622 rtf_print_unicode_char(NMRtfContext
*ctx
, gunichar ch
)
627 /* convert and flush the ansi buffer to the utf8 buffer */
630 /* convert the unicode character to utf8 and add directly to the output buffer */
631 num
= g_unichar_to_utf8((gunichar
) ch
, buf
);
633 purple_debug_info("novell", "converted unichar 0x%X to utf8 char %s\n", ch
, buf
);
635 ctx
->output
= g_string_append(ctx
->output
, buf
);
640 * Flush the output text
643 rtf_flush_data(NMRtfContext
*ctx
)
645 int status
= NMRTF_OK
;
646 char *conv_data
= NULL
;
647 const char *enc
= NULL
;
648 GError
*gerror
= NULL
;
650 if (ctx
->rds
== NMRTF_STATE_NORMAL
&& ctx
->ansi
->len
> 0) {
651 enc
= get_current_encoding(ctx
);
652 conv_data
= g_convert(ctx
->ansi
->str
, ctx
->ansi
->len
, "UTF-8", enc
,
653 NULL
, NULL
, &gerror
);
655 ctx
->output
= g_string_append(ctx
->output
, conv_data
);
657 ctx
->ansi
= g_string_truncate(ctx
->ansi
, 0);
659 status
= NMRTF_CONVERT_ERROR
;
660 purple_debug_info("novell", "failed to convert data! error code = %d msg = %s\n",
661 gerror
->code
, gerror
->message
);
664 g_error_free(gerror
);
670 * Handle a property change
673 rtf_apply_property(NMRtfContext
*ctx
, NMRtfProperty prop
, int val
)
675 if (ctx
->rds
== NMRTF_STATE_SKIP
) /* If we're skipping text, */
676 return NMRTF_OK
; /* don't do anything. */
678 /* Need to flush any temporary data before a property change*/
682 case NMRTF_PROP_FONT_IDX
:
683 ctx
->chp
.font_idx
= val
;
685 case NMRTF_PROP_FONT_CHARSET
:
686 ctx
->chp
.font_charset
= val
;
689 return NMRTF_BAD_TABLE
;
697 * Search the table for keyword and evaluate it appropriately.
700 * keyword: The RTF control to evaluate.
701 * param: The parameter of the RTF control.
702 * param_set: TRUE if the control had a parameter; (that is, if param is valid)
703 * FALSE if it did not.
706 rtf_dispatch_control(NMRtfContext
*ctx
, char *keyword
, int param
, gboolean param_set
)
710 for (idx
= 0; idx
< table_size
; idx
++) {
711 if (purple_strequal(keyword
, rtf_symbols
[idx
].keyword
))
715 if (idx
== table_size
) {
716 if (ctx
->skip_unknown
)
717 ctx
->rds
= NMRTF_STATE_SKIP
;
718 ctx
->skip_unknown
= FALSE
;
722 /* found it! use kwd_type and action to determine what to do with it. */
723 ctx
->skip_unknown
= FALSE
;
724 switch (rtf_symbols
[idx
].kwd_type
) {
726 if (rtf_symbols
[idx
].pass_default
|| !param_set
)
727 param
= rtf_symbols
[idx
].default_val
;
728 return rtf_apply_property(ctx
, rtf_symbols
[idx
].action
, param
);
730 return rtf_dispatch_char(ctx
, rtf_symbols
[idx
].action
);
732 return rtf_change_destination(ctx
, rtf_symbols
[idx
].action
);
734 return rtf_dispatch_special(ctx
, rtf_symbols
[idx
].action
);
736 return NMRTF_BAD_TABLE
;
738 return NMRTF_BAD_TABLE
;
742 * Change to the destination specified.
745 rtf_change_destination(NMRtfContext
*ctx
, NMRtfDestinationType type
)
747 /* if we're skipping text, don't do anything */
748 if (ctx
->rds
== NMRTF_STATE_SKIP
)
752 case NMRTF_DEST_FONTTABLE
:
753 ctx
->rds
= NMRTF_STATE_FONTTABLE
;
754 g_string_truncate(ctx
->ansi
, 0);
757 ctx
->rds
= NMRTF_STATE_SKIP
; /* when in doubt, skip it... */
764 * Dispatch an RTF control that needs special processing
767 rtf_dispatch_special(NMRtfContext
*ctx
, NMRtfSpecialKwd type
)
769 int status
= NMRTF_OK
;
772 if (ctx
->rds
== NMRTF_STATE_SKIP
&& type
!= NMRTF_SPECIAL_BIN
) /* if we're skipping, and it's not */
773 return NMRTF_OK
; /* the \bin keyword, ignore it. */
776 case NMRTF_SPECIAL_BIN
:
777 ctx
->ris
= NMRTF_STATE_BIN
;
778 ctx
->bytes_to_skip
= ctx
->param
;
780 case NMRTF_SPECIAL_SKIP
:
781 ctx
->skip_unknown
= TRUE
;
783 case NMRTF_SPECIAL_HEX
:
784 ctx
->ris
= NMRTF_STATE_HEX
;
786 case NMRTF_SPECIAL_UNICODE
:
787 purple_debug_info("novell", "parsing unichar\n");
788 status
= rtf_dispatch_unicode_char(ctx
, ctx
->param
);
790 if (status
== NMRTF_OK
)
791 status
= rtf_get_char(ctx
, &ch
);
794 status
= NMRTF_BAD_TABLE
;
802 * Get the next character from the input stream
805 rtf_get_char(NMRtfContext
*ctx
, guchar
*ch
)
807 if (ctx
->nextch_available
) {
809 ctx
->nextch_available
= FALSE
;
822 * Move a character back into the input stream
825 rtf_unget_char(NMRtfContext
*ctx
, guchar ch
)
828 ctx
->nextch_available
= TRUE
;