1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
3 * This file is part of the LibreOffice project.
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 * This file incorporates work covered by the following license notice:
11 * Licensed to the Apache Software Foundation (ASF) under one or more
12 * contributor license agreements. See the NOTICE file distributed
13 * with this work for additional information regarding copyright
14 * ownership. The ASF licenses this file to you under the Apache
15 * License, Version 2.0 (the "License"); you may not use this file
16 * except in compliance with the License. You may obtain a copy of
17 * the License at http://www.apache.org/licenses/LICENSE-2.0 .
21 #include <rtl/tencinfo.h>
22 #include <tools/stream.hxx>
23 #include <tools/debug.hxx>
24 #include <svtools/rtftoken.h>
25 #include <svtools/rtfkeywd.hxx>
26 #include <svtools/parrtf.hxx>
27 #include <comphelper/string.hxx>
// Upper bound that ScanText compares the length of its text buffer against.
29 const int MAX_STRING_LEN
= 1024;
// Length given to the scratch buffer in which _GetNextToken collects a control word.
30 const int MAX_TOKEN_LEN
= 128;
// Character classification helpers; both forward to the ASCII-only
// predicates in comphelper::string.
32 #define RTF_ISDIGIT( c ) comphelper::string::isdigitAscii(c)
33 #define RTF_ISALPHA( c ) comphelper::string::isalphaAscii(c)
// Constructs the RTF parser.
// rIn and nStackSize are forwarded unchanged to the SvParser base class.
// Both code set members and the source encoding start out as MS-1252.
35 SvRTFParser::SvRTFParser( SvStream
& rIn
, sal_uInt8 nStackSize
)
36 : SvParser( rIn
, nStackSize
)
38 , eCodeSet(RTL_TEXTENCODING_MS_1252
)
39 , eUNICodeSet(RTL_TEXTENCODING_MS_1252
) // default is the ANSI code set
42 // default is the ANSI code set
43 SetSrcEncoding( RTL_TEXTENCODING_MS_1252
);
// Flag starts cleared; ScanText sets it while it reads a Unicode character
// token and clears it again afterwards.
44 bRTF_InTextRead
= false;
// Destructor.
// NOTE(review): the body is not visible in this view, so nothing is claimed about it.
47 SvRTFParser::~SvRTFParser()
// Reads the next token from the input.
// Visible effects: nRet receives a token id (for example RTF_IGNOREFLAG,
// RTF_SUBENTRYINDEX, RTF_UNKNOWNCONTROL or RTF_TEXTTOKEN), aToken receives the
// collected control word or text, nTokenValue receives a numeric parameter,
// and eState is set to SVPAR_ACCEPTED at end of input.
// NOTE(review): many lines of this function (braces, case labels, the return)
// are not visible in this view; the comments below describe only the
// statements that are visible.
54 int SvRTFParser::_GetNextToken()
// Read the next character and dispatch on it.
64 switch( nNextCh
= GetNextChar() )
69 case '+': // I found it in a RTF-file
70 case '~': // nonbreaking space
71 case '-': // optional hyphen
72 case '_': // nonbreaking hyphen
73 case '\'': // HexValue
// bNextCh becomes true only when nNextCh is zero.
78 bNextCh
= 0 == nNextCh
;
81 case '*': // ignoreflag
82 nRet
= RTF_IGNOREFLAG
;
84 case ':': // subentry in an index entry
85 nRet
= RTF_SUBENTRYINDEX
;
87 case '|': // formula-character
// An ASCII letter starts a control word: collect the run of letters.
97 if( RTF_ISALPHA( nNextCh
) )
// Scratch buffer of MAX_TOKEN_LEN characters, filled by index.
101 OUStringBuffer aStrBuffer
;
102 aStrBuffer
.setLength( MAX_TOKEN_LEN
);
103 sal_Int32 nStrLen
= 0;
105 aStrBuffer
[nStrLen
++] = nNextCh
;
// Scratch buffer is full: append its content to aToken.
106 if( MAX_TOKEN_LEN
== nStrLen
)
108 aToken
+= aStrBuffer
.toString();
111 nNextCh
= GetNextChar();
112 } while( RTF_ISALPHA( nNextCh
) );
// Append what was collected to aToken and empty the scratch buffer.
115 aToken
+= aStrBuffer
.makeStringAndClear();
119 // minus before numeric parameters
120 bool bNegValue
= false;
124 nNextCh
= GetNextChar();
127 // possible numeric parameter
128 if( RTF_ISDIGIT( nNextCh
) )
// Add the value of each decimal digit to nTokenValue while digits follow.
133 nTokenValue
+= nNextCh
- '0';
134 nNextCh
= GetNextChar();
135 } while( RTF_ISDIGIT( nNextCh
) );
// Flip the sign of the collected value.
// NOTE(review): the condition guarding this statement is not visible here.
137 nTokenValue
= -nTokenValue
;
140 else if( bNegValue
) // restore minus
// Step the input stream back by one character.
143 rInput
.SeekRel( -1 );
145 if( ' ' == nNextCh
) // blank is part of token!
146 nNextCh
= GetNextChar();
148 // search for the token in the table:
149 if( 0 == (nRet
= GetRTFToken( aToken
)) )
// The lookup returned 0, so report an unknown control word.
151 nRet
= RTF_UNKNOWNCONTROL
;
153 // bug 76812 - unicode token handled as normal text
// A non-negative parameter becomes the new number of characters that are
// skipped after a Unicode character (see the loops over nUCharOverread).
158 if( 0 <= nTokenValue
)
160 nUCharOverread
= (sal_uInt8
)nTokenValue
;
161 //cmc: other ifdef breaks #i3584
163 nUCharOverread
= nUCharOverread
;
165 aToken
.clear(); // #i47831# erase token to prevent the token from being treated as text
172 // UPR - overread the group with the ansi
// Consume tokens until an opening brace token is returned.
174 while( '{' != _GetNextToken() )
177 _GetNextToken(); // overread the last bracket
// When ScanText is not currently reading text, the Unicode value itself is
// returned as a one character text token.
183 if( !bRTF_InTextRead
)
185 nRet
= RTF_TEXTTOKEN
;
186 aToken
= OUString( (sal_Unicode
)nTokenValue
);
188 // overread the next n "RTF" characters. This
189 // can be also \{, \}, \'88
190 for( sal_uInt8 m
= 0; m
< nUCharOverread
; ++m
)
192 sal_Unicode cAnsi
= nNextCh
;
// Carriage returns (0xD) and line feeds (0xA) are read past here.
193 while( 0xD == cAnsi
)
194 cAnsi
= GetNextChar();
195 while( 0xA == cAnsi
)
196 cAnsi
= GetNextChar();
199 '\'' == ( cAnsi
= GetNextChar() ))
// Read the two hex digits that follow.
201 cAnsi
= GetHexValue();
202 nNextCh
= GetNextChar();
205 bNextCh
= 0 == nNextCh
;
210 else if( SVPAR_PENDING
!= eState
)
212 // Bug 34631 - "\ " read on - Blank as character
213 // eState = SVPAR_ERROR;
// End of input: mark the parse as accepted.
221 case sal_Unicode(EOF
):
222 eState
= SVPAR_ACCEPTED
;
228 if( 0 <= nOpenBrakets
)
// Push the current skip count and source encoding onto the parser state stack.
230 RtfParserState_Impl
aState( nUCharOverread
, GetSrcEncoding() );
231 aParserStates
.push( aState
);
235 static_cast<size_t>(nOpenBrakets
) == aParserStates
.size(),
236 "ParserStateStack unequal to bracket count" );
243 if( 0 <= nOpenBrakets
)
// With a non-empty stack, take the skip count and source encoding from a
// stack entry (the expression that initialises rRPS is not visible here).
246 if( !aParserStates
.empty() )
248 const RtfParserState_Impl
& rRPS
=
250 nUCharOverread
= rRPS
.nUCharOverread
;
251 SetSrcEncoding( rRPS
.eCodeSet
);
// NOTE(review): presumably the branch taken when the stack is empty - confirm.
256 SetSrcEncoding( GetCodeSet() );
260 static_cast<size_t>(nOpenBrakets
) == aParserStates
.size(),
261 "ParserStateStack unequal to bracket count" );
270 // now normal text follows
272 nRet
= RTF_TEXTTOKEN
;
273 bNextCh
= 0 == nNextCh
;
278 nNextCh
= GetNextChar();
// Keep looping while no token was produced and the parser is still working.
280 } while( !nRet
&& SVPAR_WORKING
== eState
);
// Reads two characters from the input and adds the value of each one that is
// a hexadecimal digit to nHexVal.
// NOTE(review): the lines that scale nHexVal between the two digits and the
// return statement are not visible in this view.
285 sal_Unicode
SvRTFParser::GetHexValue()
287 // collect Hex values
289 sal_Unicode nHexVal
= 0;
291 for( n
= 0; n
< 2; ++n
)
294 nNextCh
= GetNextChar();
// 48 is the character code of the digit zero.
295 if( nNextCh
>= '0' && nNextCh
<= '9' )
296 nHexVal
+= (nNextCh
- 48);
// Subtracting 87 maps lower case a to f onto 10 to 15.
297 else if( nNextCh
>= 'a' && nNextCh
<= 'f' )
298 nHexVal
+= (nNextCh
- 87);
// Subtracting 55 maps upper case A to F onto 10 to 15.
299 else if( nNextCh
>= 'A' && nNextCh
<= 'F' )
300 nHexVal
+= (nNextCh
- 55);
// Collects a run of text from the input into a local buffer and appends it to
// aToken at the end.
// cBreak is compared against each ordinary character that is read.
// NOTE(review): presumably a match ends the scan; the body of that branch and
// many other lines (braces, case labels) are not visible in this view. The
// comments below describe only the statements that are visible.
305 void SvRTFParser::ScanText( const sal_Unicode cBreak
)
307 OUStringBuffer aStrBuffer
;
308 bool bContinue
= true;
// Loop until told to stop, the parser stops working, or the buffer reaches
// MAX_STRING_LEN characters.
309 while( bContinue
&& IsParserWorking() && aStrBuffer
.getLength() < MAX_STRING_LEN
)
// Read the next character and dispatch on it.
316 switch (nNextCh
= GetNextChar())
321 OStringBuffer aByteString
;
324 char c
= (char)GetHexValue();
326 * Note: \'00 is a valid internal character in a
327 * string in RTF. OStringBuffer supports
328 * appending nulls fine
330 aByteString
.append(c
);
333 sal_Char nSlash
= '\\';
336 wchar_t __next
=GetNextChar();
337 if (__next
>0xFF) // fix for #i43933# and #i35653#
339 if (!aByteString
.isEmpty())
340 aStrBuffer
.append( OStringToOUString(aByteString
.makeStringAndClear(), GetSrcEncoding()) );
341 aStrBuffer
.append((sal_Unicode
)__next
);
345 nSlash
= (sal_Char
)__next
;
346 while (nSlash
== 0xD || nSlash
== 0xA)
347 nSlash
= (sal_Char
)GetNextChar();
357 aByteString
.append(nSlash
);
362 nNextCh
= GetNextChar();
364 if (nSlash
!= '\\' || nNextCh
!= '\'')
374 if (!aByteString
.isEmpty())
375 aStrBuffer
.append( OStringToOUString(aByteString
.makeStringAndClear(), GetSrcEncoding()) );
381 case '+': // I found in a RTF file
382 aStrBuffer
.append(nNextCh
);
384 case '~': // nonbreaking space
385 aStrBuffer
.append(static_cast< sal_Unicode
>(0xA0));
387 case '-': // optional hyphen
388 aStrBuffer
.append(static_cast< sal_Unicode
>(0xAD));
390 case '_': // nonbreaking hyphen
391 aStrBuffer
.append(static_cast< sal_Unicode
>(0x2011));
395 // read Unicode characters
// Peek at the following character, then move the stream back two characters.
397 nNextCh
= GetNextChar();
398 rInput
.SeekRel( -2 );
// A minus sign or a digit means a numeric parameter follows.
400 if( '-' == nNextCh
|| RTF_ISDIGIT( nNextCh
) )
// Raise the flag that _GetNextToken checks before it turns a Unicode value
// into a text token of its own.
402 bRTF_InTextRead
= true;
// Keep a copy of the token text collected so far.
404 OUString
sSave( aToken
);
410 DBG_ASSERT( RTF_U
== nToken
, "doch kein UNI-Code Zeichen" );
411 // do not convert symbol chars
412 aStrBuffer
.append(static_cast< sal_Unicode
>(nTokenValue
));
414 // overread the next n "RTF" characters. This
415 // can be also \{, \}, \'88
416 for( sal_uInt8 m
= 0; m
< nUCharOverread
; ++m
)
418 sal_Unicode cAnsi
= nNextCh
;
// Carriage returns (0xD) and line feeds (0xA) are read past here.
419 while( 0xD == cAnsi
)
420 cAnsi
= GetNextChar();
421 while( 0xA == cAnsi
)
422 cAnsi
= GetNextChar();
425 '\'' == ( cAnsi
= GetNextChar() ))
426 // skip over the hex value
427 cAnsi
= GetHexValue();
428 nNextCh
= GetNextChar();
432 bRTF_InTextRead
= false;
434 else if ( 'c' == nNextCh
)
436 // Prevent text breaking into multiple tokens.
438 nNextCh
= GetNextChar();
439 if (RTF_ISDIGIT( nNextCh
))
// Collect the digits into a new skip count.
441 sal_uInt8 nNewOverread
= 0 ;
444 nNewOverread
+= nNextCh
- '0';
445 nNextCh
= GetNextChar();
446 } while ( RTF_ISDIGIT( nNextCh
) );
// Store the new skip count both in the member and in the top parser state.
447 nUCharOverread
= nNewOverread
;
448 aParserStates
.top().nUCharOverread
= nNewOverread
;
// bNextCh becomes true only when the current character is a blank (0x20).
450 bNextCh
= 0x20 == nNextCh
;
455 bContinue
= false; // abort, string together
// Step the input stream back by one character.
461 rInput
.SeekRel( -1 );
463 bContinue
= false; // abort, string together
// End of input while scanning text is treated as an error.
469 case sal_Unicode(EOF
): eState
= SVPAR_ERROR
;
481 if( nNextCh
== cBreak
|| aStrBuffer
.getLength() >= MAX_STRING_LEN
)
486 // all other characters end up in the text
487 aStrBuffer
.append(nNextCh
);
489 if (sal_Unicode(EOF
) == (nNextCh
= GetNextChar()))
// Input ended: hand over whatever text was collected so far.
491 if (!aStrBuffer
.isEmpty())
492 aToken
+= aStrBuffer
.toString();
497 (RTF_ISALPHA(nNextCh
) || RTF_ISDIGIT(nNextCh
)) &&
498 (aStrBuffer
.getLength() < MAX_STRING_LEN
)
504 if( bContinue
&& bNextCh
)
505 nNextCh
= GetNextChar();
// Append the collected text to aToken and empty the local buffer.
508 if (!aStrBuffer
.isEmpty())
509 aToken
+= aStrBuffer
.makeStringAndClear();
// Definition of the static member _inSkipGroup, initialised to zero.
// NOTE(review): presumably tracks nested SkipGroup calls - no use is visible here, confirm.
513 short SvRTFParser::_inSkipGroup
=0;
// Reads tokens until the end of input or until the parser stops working, and
// sets eState to SVPAR_ERROR if the scan did not stop on a closing brace
// (unless the state is SVPAR_PENDING).
// NOTE(review): several lines of this function are not visible in this view.
515 void SvRTFParser::SkipGroup()
521 //#i16185# special handling for the \bin keyword
536 int nToken
= _GetNextToken();
537 if (nToken
== RTF_BIN
)
// Jump over the binary payload: nTokenValue is used as the seek distance.
540 rInput
.SeekRel(nTokenValue
);
541 nNextCh
= GetNextChar();
// Read past line feeds (0xa) and carriage returns (0xd).
543 while (nNextCh
==0xa || nNextCh
==0xd)
545 nNextCh
= GetNextChar();
547 } while (sal_Unicode(EOF
) != nNextCh
&& IsParserWorking());
549 if( SVPAR_PENDING
!= eState
&& '}' != nNextCh
)
550 eState
= SVPAR_ERROR
;
// These three handlers all do the same thing here: each one only calls SkipGroup.
554 void SvRTFParser::ReadUnknownData() { SkipGroup(); }
555 void SvRTFParser::ReadBitmapData() { SkipGroup(); }
556 void SvRTFParser::ReadOLEData() { SkipGroup(); }
// Entry point of a parse run: primes the first character, resets the state
// and the encodings, then checks that the input starts like an RTF document.
// NOTE(review): the call that does the actual parsing and the return
// statement are not visible in this view.
559 SvParserState
SvRTFParser::CallParser()
// Remember the stream position, then read the first character into nNextCh.
562 nNextChPos
= rInput
.Tell();
563 rInput
.ReadChar( cFirstCh
); nNextCh
= cFirstCh
;
564 eState
= SVPAR_WORKING
;
// Reset the code set and the source encoding to MS-1252.
566 SetSrcEncoding( eCodeSet
= RTL_TEXTENCODING_MS_1252
);
567 eUNICodeSet
= RTL_TEXTENCODING_MS_1252
; // default is ANSI-CodeSet
569 // the first two tokens should be '{' and \\rtf !!
570 if( '{' == GetNextToken() && RTF_RTF
== GetNextToken() )
574 if( SVPAR_PENDING
!= eState
)
575 ReleaseRef(); // now parser is not needed anymore
// NOTE(review): presumably the branch for input that does not start with the
// expected two tokens - the else line is not visible here, confirm.
578 eState
= SVPAR_ERROR
;
// Main token loop: processes tokens for as long as the parser is working.
// nToken is the token to start with; it is overwritten with the result of
// GetNextToken as the loop advances.
// NOTE(review): most case labels and braces of the switch are not visible in
// this view; the comments below describe only the statements that are visible.
583 void SvRTFParser::Continue( int nToken
)
585 // DBG_ASSERT( SVPAR_CS_DONTKNOW == GetCharSet(),
586 // "Characterset was changed." );
589 nToken
= GetNextToken();
591 while( IsParserWorking() )
599 eState
= SVPAR_ACCEPTED
;
// Look ahead one or two tokens and step back by the same number of tokens
// when the lookahead does not match.
605 if( RTF_IGNOREFLAG
!= GetNextToken() )
606 nToken
= SkipToken( -1 );
607 else if( RTF_UNKNOWNCONTROL
!= GetNextToken() )
608 nToken
= SkipToken( -2 );
611 // filter immediately
613 nToken
= GetNextToken();
615 eState
= SVPAR_ERROR
;
616 break; // move to next token!!
621 case RTF_UNKNOWNCONTROL
:
622 break; // skip unknown token
// Each of the following statements stores an encoding in eCodeSet and applies
// it as the source encoding (the case labels selecting them are not visible).
625 SetSrcEncoding( eCodeSet
= RTL_TEXTENCODING_MS_1252
);
628 SetSrcEncoding( eCodeSet
= RTL_TEXTENCODING_APPLE_ROMAN
);
631 SetSrcEncoding( eCodeSet
= RTL_TEXTENCODING_IBM_437
);
634 SetSrcEncoding( eCodeSet
= RTL_TEXTENCODING_IBM_850
);
// Here nTokenValue is passed as a Windows code page number and converted to
// a text encoding.
637 eCodeSet
= rtl_getTextEncodingFromWindowsCodePage(nTokenValue
);
638 SetSrcEncoding(eCodeSet
);
645 if( IsParserWorking() )
646 SaveState( 0 ); // processed till here,
647 // continue with new token!
648 nToken
= GetNextToken();
// An accepted parse with groups still open is turned into an error.
650 if( SVPAR_ACCEPTED
== eState
&& 0 < nOpenBrakets
)
651 eState
= SVPAR_ERROR
;
// Makes eEnc the current encoding: stores it in the top entry of the parser
// state stack (when the stack is not empty) and applies it as the source encoding.
// NOTE(review): the statement executed when eEnc is RTL_TEXTENCODING_DONTKNOW
// is not visible in this view.
654 void SvRTFParser::SetEncoding( rtl_TextEncoding eEnc
)
656 if (eEnc
== RTL_TEXTENCODING_DONTKNOW
)
659 if (!aParserStates
.empty())
660 aParserStates
.top().eCodeSet
= eEnc
;
661 SetSrcEncoding(eEnc
);
664 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */