1 /*************************************************************************
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5 * Copyright 2008 by Sun Microsystems, Inc.
7 * OpenOffice.org - a multi-platform office productivity suite
9 * $RCSfile: parrtf.cxx,v $
12 * This file is part of OpenOffice.org.
14 * OpenOffice.org is free software: you can redistribute it and/or modify
15 * it under the terms of the GNU Lesser General Public License version 3
16 * only, as published by the Free Software Foundation.
18 * OpenOffice.org is distributed in the hope that it will be useful,
19 * but WITHOUT ANY WARRANTY; without even the implied warranty of
20 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 * GNU Lesser General Public License version 3 for more details
22 * (a copy is included in the LICENSE file that accompanied this code).
24 * You should have received a copy of the GNU Lesser General Public License
25 * version 3 along with OpenOffice.org. If not, see
26 * <http://www.openoffice.org/license.html>
27 * for a copy of the LGPLv3 License.
29 ************************************************************************/
31 // MARKER(update_precomp.py): autogen include statement, do not remove
32 #include "precompiled_svtools.hxx"
34 /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil -*- */
36 #include <stdio.h> // for EOF
37 #include <rtl/tencinfo.h>
38 #include <tools/stream.hxx>
39 #include <tools/debug.hxx>
41 #include "rtfkeywd.hxx"
42 #include <svtools/parrtf.hxx>
// Upper bound on the number of characters ScanText() gathers in aStrBuffer
// before it stops collecting.
44 const int MAX_STRING_LEN
= 1024;
// Length that _GetNextToken() compares the running control-word name length
// against while it collects the name.
45 const int MAX_TOKEN_LEN
= 128;
// ASCII-only character tests. The macro argument is expanded more than once,
// so it must not have side effects.
47 #define RTF_ISDIGIT( c ) (c >= '0' && c <= '9')
48 #define RTF_ISALPHA( c ) ( (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') )
// NOTE(review): presumably emits the implementation of the RtfParserStates_Impl
// array type holding RtfParserState_Impl entries - confirm against the macro.
50 SV_IMPL_VARARR( RtfParserStates_Impl
, RtfParserState_Impl
)
// Constructs the parser: forwards the input stream and the stack size to
// SvParser and starts with Windows-1252 for both eUNICodeSet and the source
// encoding.
52 SvRTFParser::SvRTFParser( SvStream
& rIn
, BYTE nStackSize
)
53 : SvParser( rIn
, nStackSize
),
54 eUNICodeSet( RTL_TEXTENCODING_MS_1252
), // default is the ANSI code set
57 // default is the ANSI code set
58 SetSrcEncoding( RTL_TEXTENCODING_MS_1252
);
// cleared here; ScanText() raises this flag while it reads a Unicode (RTF_U)
// character and _GetNextToken() checks it
59 bRTF_InTextRead
= false;
// Destructor.
62 SvRTFParser::~SvRTFParser()
// Reads the next RTF token from the input. The enclosing do/while loop
// repeats until a non-zero token id (nRet) has been determined or eState is
// no longer SVPAR_WORKING. For control words it fills aToken with the name
// and nTokenValue with the optional numeric parameter, looks the name up via
// GetRTFToken() and falls back to RTF_UNKNOWNCONTROL. It also maintains
// nUCharOverread and keeps aParserStates in step with nOpenBrakets.
// NOTE(review): presumably returns nRet - confirm.
69 int SvRTFParser::_GetNextToken()
79 switch( nNextCh
= GetNextChar() )
84 case '+': // seen in an RTF file in the wild
85 case '~': // nonbreaking space
86 case '-': // optional hyphen
87 case '_': // nonbreaking hyphen
88 case '\'': // hex value
93 bNextCh
= 0 == nNextCh
;
96 case '*': // ignore flag
97 nRet
= RTF_IGNOREFLAG
;
99 case ':': // subentry in an index entry
100 nRet
= RTF_SUBENTRYINDEX
;
102 case '|': // formula character
// an alphabetic character starts a control-word name; collect it
112 if( RTF_ISALPHA( nNextCh
) )
117 sal_Unicode
* pStr
= aStrBuffer
.AllocBuffer(
119 xub_StrLen nStrLen
= 0;
121 *(pStr
+ nStrLen
++) = nNextCh
;
// the name length has reached MAX_TOKEN_LEN: append the buffer to aToken
122 if( MAX_TOKEN_LEN
== nStrLen
)
124 aToken
+= aStrBuffer
;
125 aToken
.GetBufferAccess(); // make unique string!
128 nNextCh
= GetNextChar();
129 } while( RTF_ISALPHA( nNextCh
) );
132 aStrBuffer
.ReleaseBufferAccess( nStrLen
);
133 aToken
+= aStrBuffer
;
137 // minus sign for a numeric parameter
138 int bNegValue
= false;
142 nNextCh
= GetNextChar();
145 // possibly a numeric parameter
146 if( RTF_ISDIGIT( nNextCh
) )
151 nTokenValue
+= nNextCh
- '0';
152 nNextCh
= GetNextChar();
153 } while( RTF_ISDIGIT( nNextCh
) );
155 nTokenValue
= -nTokenValue
;
158 else if( bNegValue
) // put the minus sign back
161 rInput
.SeekRel( -1 );
163 if( ' ' == nNextCh
) // the blank belongs to the token!
164 nNextCh
= GetNextChar();
166 // look up the token in the table:
167 if( 0 == (nRet
= GetRTFToken( aToken
)) )
169 nRet
= RTF_UNKNOWNCONTROL
;
171 // bug 76812 - unicode token handled as normal text
// a non-negative parameter becomes the new nUCharOverread value
176 if( 0 <= nTokenValue
)
178 nUCharOverread
= (BYTE
)nTokenValue
;
180 //cmc: other ifdef breaks #i3584
// store the value in the last entry of aParserStates
181 aParserStates
[ aParserStates
.Count()-1].
182 nUCharOverread
= nUCharOverread
;
184 if( !nUCharOverread
)
185 nUCharOverread
= aParserStates
[
186 aParserStates
.Count()-1].nUCharOverread
;
188 aParserStates
[ aParserStates
.Count()-1].
189 nUCharOverread
= nUCharOverread
;
192 aToken
.Erase(); // #i47831# erase token to prevent the token from being treated as text
199 // UPR - skip over the group with the ansi
201 while( '{' != _GetNextToken() )
204 _GetNextToken(); // skip over the last bracket
210 if( !bRTF_InTextRead
)
212 nRet
= RTF_TEXTTOKEN
;
213 aToken
= (sal_Unicode
)nTokenValue
;
215 // skip over the next n "RTF" characters. These
216 // can also be \{, \}, \'88
217 for( BYTE m
= 0; m
< nUCharOverread
; ++m
)
219 sal_Unicode cAnsi
= nNextCh
;
220 while( 0xD == cAnsi
)
221 cAnsi
= GetNextChar();
222 while( 0xA == cAnsi
)
223 cAnsi
= GetNextChar();
226 '\'' == ( cAnsi
= GetNextChar() ))
227 // skip over the hex value
228 cAnsi
= GetHexValue();
229 nNextCh
= GetNextChar();
232 bNextCh
= 0 == nNextCh
;
237 else if( SVPAR_PENDING
!= eState
)
239 // Bug 34631 - skip over "\ " - treat the blank as a character
240 // eState = SVPAR_ERROR;
// end of input: the parse is accepted
248 case sal_Unicode(EOF
):
249 eState
= SVPAR_ACCEPTED
;
// record a parser state (nUCharOverread plus the current source encoding)
// at index nOpenBrakets
255 if( 0 <= nOpenBrakets
)
257 RtfParserState_Impl
aState( nUCharOverread
, GetSrcEncoding() );
258 aParserStates
.Insert(
259 aState
, sal::static_int_cast
< USHORT
>(nOpenBrakets
) );
262 DBG_ASSERT( nOpenBrakets
== aParserStates
.Count(),
263 "ParserStateStack unequal to bracket count" );
// remove the parser state at index nOpenBrakets; if entries remain, restore
// nUCharOverread and the source encoding from the last one
270 if( 0 <= nOpenBrakets
)
272 aParserStates
.Remove(
273 sal::static_int_cast
< USHORT
>(nOpenBrakets
) );
274 if( aParserStates
.Count() )
276 const RtfParserState_Impl
& rRPS
=
277 aParserStates
[ aParserStates
.Count() - 1 ];
278 nUCharOverread
= rRPS
.nUCharOverread
;
279 SetSrcEncoding( rRPS
.eCodeSet
);
284 SetSrcEncoding( GetCodeSet() );
287 DBG_ASSERT( nOpenBrakets
== aParserStates
.Count(),
288 "ParserStateStack unequal to bracket count" );
297 // ordinary text follows
299 nRet
= RTF_TEXTTOKEN
;
300 bNextCh
= 0 == nNextCh
;
305 nNextCh
= GetNextChar();
307 } while( !nRet
&& SVPAR_WORKING
== eState
);
// Reads two characters from the input via GetNextChar() and adds the value
// of each hexadecimal digit to nHexVal. The numeric constants are ASCII
// offsets: 48 is '0', 87 is 'a' - 10 and 55 is 'A' - 10.
// NOTE(review): the accumulated value is presumably returned - confirm.
// NOTE(review): the 'register' storage class is deprecated in C++11 and
// removed in C++17.
312 sal_Unicode
SvRTFParser::GetHexValue()
316 register sal_Unicode nHexVal
= 0;
318 for( n
= 0; n
< 2; ++n
)
321 nNextCh
= GetNextChar();
// decimal digit
322 if( nNextCh
>= '0' && nNextCh
<= '9' )
323 nHexVal
+= (nNextCh
- 48);
// lower-case hex digit
324 else if( nNextCh
>= 'a' && nNextCh
<= 'f' )
325 nHexVal
+= (nNextCh
- 87);
// upper-case hex digit
326 else if( nNextCh
>= 'A' && nNextCh
<= 'F' )
327 nHexVal
+= (nNextCh
- 55);
// Collects running text into aStrBuffer and finally appends it to aToken.
// The loop runs while bWeiter (German for "continue") is set, the parser is
// working and the buffer is shorter than MAX_STRING_LEN. Hex-escaped bytes
// are gathered in a ByteString and converted with the current source
// encoding; the special symbols ~ - _ are mapped to U+00A0, U+00AD and
// U+2011; Unicode (RTF_U) characters are read with bRTF_InTextRead set.
// EOF inside the switch sets eState to SVPAR_ERROR.
// cBreak: character that is compared with nNextCh to end the text run.
332 void SvRTFParser::ScanText( const sal_Unicode cBreak
)
336 while( bWeiter
&& IsParserWorking() && aStrBuffer
.Len() < MAX_STRING_LEN
)
343 switch (nNextCh
= GetNextChar())
349 // #i35653 patch from cmc
350 ByteString
aByteString(static_cast<char>(GetHexValue()));
351 if (aByteString
.Len())
352 aStrBuffer
.Append(String(aByteString
, GetSrcEncoding()));
354 ByteString aByteString
;
357 aByteString
.Append((char)GetHexValue());
360 sal_Char nSlash
= '\\';
// NOTE(review): identifiers containing a double underscore are reserved for
// the implementation in C++.
363 wchar_t __next
=GetNextChar();
364 if (__next
>0xFF) // fix for #i43933# and #i35653#
366 if (aByteString
.Len())
367 aStrBuffer
.Append(String(aByteString
, GetSrcEncoding()));
368 aStrBuffer
.Append((sal_Unicode
)__next
);
373 nSlash
= (sal_Char
)__next
;
// skip carriage returns and line feeds
374 while (nSlash
== 0xD || nSlash
== 0xA)
375 nSlash
= (sal_Char
)GetNextChar();
385 aByteString
.Append(nSlash
);
390 nNextCh
= GetNextChar();
392 if (nSlash
!= '\\' || nNextCh
!= '\'')
// convert the collected bytes with the current source encoding
402 if (aByteString
.Len())
403 aStrBuffer
.Append(String(aByteString
, GetSrcEncoding()));
410 case '+': // seen in an RTF file in the wild
411 aStrBuffer
.Append(nNextCh
);
413 case '~': // nonbreaking space
414 aStrBuffer
.Append(static_cast< sal_Unicode
>(0xA0));
416 case '-': // optional hyphen
417 aStrBuffer
.Append(static_cast< sal_Unicode
>(0xAD));
419 case '_': // nonbreaking hyphen
420 aStrBuffer
.Append(static_cast< sal_Unicode
>(0x2011));
424 // read a Unicode character
426 nNextCh
= GetNextChar();
427 rInput
.SeekRel( -2 );
429 if( '-' == nNextCh
|| RTF_ISDIGIT( nNextCh
) )
431 bRTF_InTextRead
= true;
433 String
sSave( aToken
);
439 DBG_ASSERT( RTF_U
== nToken
, "doch kein UNI-Code Zeichen" );
440 // don't convert symbol chars
442 static_cast< sal_Unicode
>(nTokenValue
));
444 // skip over the next n "RTF" characters. These
445 // can also be \{, \}, \'88
446 for( BYTE m
= 0; m
< nUCharOverread
; ++m
)
448 sal_Unicode cAnsi
= nNextCh
;
449 while( 0xD == cAnsi
)
450 cAnsi
= GetNextChar();
451 while( 0xA == cAnsi
)
452 cAnsi
= GetNextChar();
455 '\'' == ( cAnsi
= GetNextChar() ))
456 // skip over the hex value
457 cAnsi
= GetHexValue();
458 nNextCh
= GetNextChar();
462 bRTF_InTextRead
= false;
467 bWeiter
= false; // stop, the string is complete
473 rInput
.SeekRel( -1 );
475 bWeiter
= false; // stop, the string is complete
481 case sal_Unicode(EOF
):
482 eState
= SVPAR_ERROR
;
494 if( nNextCh
== cBreak
|| aStrBuffer
.Len() >= MAX_STRING_LEN
)
499 // all other characters go into the text
500 aStrBuffer
.Append(nNextCh
);
502 if (sal_Unicode(EOF
) == (nNextCh
= GetNextChar()))
504 if (aStrBuffer
.Len())
505 aToken
+= aStrBuffer
;
510 (RTF_ISALPHA(nNextCh
) || RTF_ISDIGIT(nNextCh
)) &&
511 (aStrBuffer
.Len() < MAX_STRING_LEN
)
517 if( bWeiter
&& bNextCh
)
518 nNextCh
= GetNextChar();
// hand the collected text over to the token
521 if (aStrBuffer
.Len())
522 aToken
+= aStrBuffer
;
// Definition of the static member _inSkipGroup, initialised to 0.
526 short SvRTFParser::_inSkipGroup
=0;
// Skips input until the current group ends. Two variants are visible: a
// token-based one that handles RTF_BIN by seeking past nTokenValue bytes of
// the stream, and a character-based one that counts brackets and ignores a
// bracket preceded by a backslash. If the parser is not pending and the last
// character read is not '}', eState is set to SVPAR_ERROR.
528 void SvRTFParser::SkipGroup()
534 #if 1 //#i16185# special handling for the \bin keyword
549 int nToken
= _GetNextToken();
550 if (nToken
== RTF_BIN
)
// jump over the binary payload instead of tokenizing it
553 rInput
.SeekRel(nTokenValue
);
554 nNextCh
= GetNextChar();
// skip line feeds and carriage returns
556 while (nNextCh
==0xa || nNextCh
==0xd)
558 nNextCh
= GetNextChar();
560 } while (sal_Unicode(EOF
) != nNextCh
&& IsParserWorking());
562 sal_Unicode cPrev
= 0;
// only a bracket that is not preceded by a backslash lowers the count
572 if( '\\' != cPrev
&& !--nBrackets
)
582 nNextCh
= GetNextChar();
583 } while( sal_Unicode(EOF
) != nNextCh
&& IsParserWorking() );
586 if( SVPAR_PENDING
!= eState
&& '}' != nNextCh
)
587 eState
= SVPAR_ERROR
;
// Handlers for unknown, bitmap and OLE data: each one simply skips the
// current group via SkipGroup().
591 void SvRTFParser::ReadUnknownData() { SkipGroup(); }
592 void SvRTFParser::ReadBitmapData() { SkipGroup(); }
593 void SvRTFParser::ReadOLEData() { SkipGroup(); }
// Starts parsing: remembers the stream position, reads the first character,
// sets eState to SVPAR_WORKING and resets the code set, source encoding and
// eUNICodeSet to Windows-1252. The input must begin with '{' followed by the
// RTF_RTF token.
// NOTE(review): the SVPAR_ERROR assignment below is presumably the branch
// taken when that check fails, and eState is presumably the return value -
// confirm.
596 SvParserState
SvRTFParser::CallParser()
599 nNextChPos
= rInput
.Tell();
600 rInput
>> cFirstCh
; nNextCh
= cFirstCh
;
601 eState
= SVPAR_WORKING
;
603 SetSrcEncoding( eCodeSet
= RTL_TEXTENCODING_MS_1252
);
604 eUNICodeSet
= RTL_TEXTENCODING_MS_1252
; // default is the ANSI code set
606 // the first two tokens must be '{' and \\rtf !!
607 if( '{' == GetNextToken() && RTF_RTF
== GetNextToken() )
611 if( SVPAR_PENDING
!= eState
)
612 ReleaseRef(); // then the parser is no longer needed!
615 eState
= SVPAR_ERROR
;
// Main parsing loop: processes tokens for as long as IsParserWorking()
// holds. The visible branches handle an unknown group (checking for
// RTF_IGNOREFLAG followed by RTF_UNKNOWNCONTROL and rewinding with
// SkipToken() otherwise), ignore RTF_UNKNOWNCONTROL, and switch the source
// encoding to Windows-1252, Apple Roman, IBM 437, IBM 850 or the encoding
// derived from a Windows code page number in nTokenValue.
// NOTE(review): presumably each encoding switch answers a character-set
// control word of the document - confirm against the case labels.
// After the loop, an accepted parse with brackets still open becomes an
// error.
620 void SvRTFParser::Continue( int nToken
)
622 // DBG_ASSERT( SVPAR_CS_DONTKNOW == GetCharSet(),
623 // "Zeichensatz wurde geaendert." );
626 nToken
= GetNextToken();
628 while( IsParserWorking() )
636 eState
= SVPAR_ACCEPTED
;
640 // an unknown group?
642 if( RTF_IGNOREFLAG
!= GetNextToken() )
643 nToken
= SkipToken( -1 );
644 else if( RTF_UNKNOWNCONTROL
!= GetNextToken() )
645 nToken
= SkipToken( -2 );
648 // filter it out right away
650 nToken
= GetNextToken();
652 eState
= SVPAR_ERROR
;
653 break; // on to the next token!!
658 case RTF_UNKNOWNCONTROL
:
659 break; // skip unknown tokens
662 SetSrcEncoding( eCodeSet
= RTL_TEXTENCODING_MS_1252
);
665 SetSrcEncoding( eCodeSet
= RTL_TEXTENCODING_APPLE_ROMAN
);
668 SetSrcEncoding( eCodeSet
= RTL_TEXTENCODING_IBM_437
);
671 SetSrcEncoding( eCodeSet
= RTL_TEXTENCODING_IBM_850
);
// code page number taken from the token's numeric parameter
674 eCodeSet
= rtl_getTextEncodingFromWindowsCodePage(nTokenValue
);
675 SetSrcEncoding(eCodeSet
);
682 if( IsParserWorking() )
683 SaveState( 0 ); // processed up to here,
684 // continue with a new token!
685 nToken
= GetNextToken();
// accepted although brackets are still open: treat as an error
687 if( SVPAR_ACCEPTED
== eState
&& 0 < nOpenBrakets
)
688 eState
= SVPAR_ERROR
;
// Makes eEnc the current source encoding and, if aParserStates is not empty,
// records it as the eCodeSet of the last parser state.
// RTL_TEXTENCODING_DONTKNOW is special-cased first.
// NOTE(review): presumably it is replaced by a default encoding - confirm.
691 void SvRTFParser::SetEncoding( rtl_TextEncoding eEnc
)
693 if (eEnc
== RTL_TEXTENCODING_DONTKNOW
)
696 if (aParserStates
.Count())
697 aParserStates
[aParserStates
.Count() - 1].eCodeSet
= eEnc
;
698 SetSrcEncoding(eEnc
);
// Saves the parser state by forwarding nToken to SvParser::SaveState().
702 void SvRTFParser::SaveState( int nToken
)
704 SvParser::SaveState( nToken
);
// Restores the parser state by forwarding to SvParser::RestoreState().
707 void SvRTFParser::RestoreState()
709 SvParser::RestoreState();
713 /* vi:set tabstop=4 shiftwidth=4 expandtab: */