1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
3 * This file is part of the LibreOffice project.
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 * This file incorporates work covered by the following license notice:
11 * Licensed to the Apache Software Foundation (ASF) under one or more
12 * contributor license agreements. See the NOTICE file distributed
13 * with this work for additional information regarding copyright
14 * ownership. The ASF licenses this file to you under the Apache
15 * License, Version 2.0 (the "License"); you may not use this file
16 * except in compliance with the License. You may obtain a copy of
17 * the License at http://www.apache.org/licenses/LICENSE-2.0 .
20 #include <sal/config.h>
21 #include <sal/log.hxx>
23 #include <comphelper/scopeguard.hxx>
25 #include <rtl/character.hxx>
26 #include <rtl/strbuf.hxx>
27 #include <rtl/tencinfo.h>
28 #include <rtl/ustrbuf.hxx>
29 #include <tools/stream.hxx>
30 #include <tools/debug.hxx>
31 #include <svtools/rtftoken.h>
32 #include <svtools/parrtf.hxx>
// Upper bound on the number of characters ScanText collects into its text
// buffer before it stops and hands the text over as one token.
34 const int MAX_STRING_LEN
= 1024;
// Character classification helpers; both only accept ASCII characters
// (they forward to rtl::isAsciiDigit / rtl::isAsciiAlpha).
36 #define RTF_ISDIGIT( c ) rtl::isAsciiDigit(c)
37 #define RTF_ISALPHA( c ) rtl::isAsciiAlpha(c)
// Constructs the RTF parser on top of SvParser<int>.
//   rIn        - input stream, forwarded to the base class
//   nStackSize - stack size, forwarded to the base class
// The code set member and the source encoding both start as MS-1252, and
// the flag bRTF_InTextRead starts cleared.
39 SvRTFParser::SvRTFParser( SvStream
& rIn
, sal_uInt8 nStackSize
)
40 : SvParser
<int>( rIn
, nStackSize
)
42 , eCodeSet(RTL_TEXTENCODING_MS_1252
)
45 // default is ANSI-CodeSet
46 SetSrcEncoding( RTL_TEXTENCODING_MS_1252
);
47 bRTF_InTextRead
= false;
// Destructor. NOTE(review): the body is not visible in this excerpt.
50 SvRTFParser::~SvRTFParser()
// Tokenizer core: consumes characters through GetNextChar and yields the id
// of the next RTF token (nRet). It keeps looping while no token has been
// produced and the parser state is still Working.
//
// What the visible code handles:
//   - control symbols: + ~ - _ and the hex escape marker, plus * (ignore
//     flag), : (index subentry) and | (formula character)
//   - control words: a run of ASCII letters collected into aToken, followed
//     by an optional (possibly negative) decimal parameter stored in
//     nTokenValue; one trailing blank is consumed as part of the token
//   - lookup of the word with GetRTFToken; unknown words become
//     RTF_UNKNOWNCONTROL
//   - updating nUCharOverread (also on the top entry of aParserStates)
//   - over-reading the ANSI part of a UPR group
//   - unicode characters, which become RTF_TEXTTOKEN unless ScanText is
//     already reading text (bRTF_InTextRead)
//   - end of input, which sets the state to Accepted
//   - saving and restoring RtfParserState_Impl entries on aParserStates
//     alongside nOpenBrackets
//   - everything else is reported as RTF_TEXTTOKEN
//
// NOTE(review): the switch skeleton, most case labels and the return
// statement are missing from this excerpt; comments below that name a
// branch are therefore hedged.
55 int SvRTFParser::GetNextToken_()
65 nNextCh
= GetNextChar();
71 case '+': // I found it in a RTF-file
72 case '~': // nonbreaking space
73 case '-': // optional hyphen
74 case '_': // nonbreaking hyphen
75 case '\'': // HexValue
80 bNextCh
= 0 == nNextCh
;
83 case '*': // ignoreflag
84 nRet
= RTF_IGNOREFLAG
;
86 case ':': // subentry in an index entry
87 nRet
= RTF_SUBENTRYINDEX
;
89 case '|': // formula-character
// control word: gather the run of ASCII letters into aToken
99 if( RTF_ISALPHA( nNextCh
) )
104 aToken
.appendUtf32(nNextCh
);
105 nNextCh
= GetNextChar();
106 } while( RTF_ISALPHA( nNextCh
) );
109 // minus before numeric parameters
110 bool bNegValue
= false;
114 nNextCh
= GetNextChar();
117 // possible numeric parameter
118 if( RTF_ISDIGIT( nNextCh
) )
120 OUStringBuffer aNumber
;
122 aNumber
.append(static_cast<sal_Unicode
>(nNextCh
));
123 nNextCh
= GetNextChar();
124 } while( RTF_ISDIGIT( nNextCh
) );
125 nTokenValue
= OUString::unacquired(aNumber
).toInt32();
127 nTokenValue
= -nTokenValue
;
130 else if( bNegValue
) // restore minus
133 rInput
.SeekRel( -1 );
135 if( ' ' == nNextCh
) // blank is part of token!
136 nNextCh
= GetNextChar();
138 // search for the token in the table:
139 if( 0 == (nRet
= GetRTFToken( aToken
)) )
141 nRet
= RTF_UNKNOWNCONTROL
;
143 // bug 76812 - unicode token handled as normal text
// a non-negative parameter becomes the new overread count and is mirrored
// into the top parser state (the case label of this branch is not visible
// in this excerpt - presumably the uc keyword)
148 if( 0 <= nTokenValue
)
150 nUCharOverread
= static_cast<sal_uInt8
>(nTokenValue
);
151 if (!aParserStates
.empty())
153 //cmc: other ifdef breaks #i3584
154 aParserStates
.top().nUCharOverread
= nUCharOverread
;
157 aToken
.setLength( 0 ); // #i47831# erase token to prevent the token from being treated as text
164 // UPR - overread the group with the ansi
169 nNextToken
= GetNextToken_();
171 while (nNextToken
!= '{' && nNextToken
!= sal_Unicode(EOF
) && IsParserWorking());
174 GetNextToken_(); // overread the last bracket
180 if( !bRTF_InTextRead
)
182 nRet
= RTF_TEXTTOKEN
;
183 aToken
= OUStringChar( static_cast<sal_Unicode
>(nTokenValue
) );
185 // overread the next n "RTF" characters. This
186 // can be also \{, \}, \'88
187 for( sal_uInt8 m
= 0; m
< nUCharOverread
; ++m
)
189 sal_uInt32 cAnsi
= nNextCh
;
190 while( 0xD == cAnsi
)
191 cAnsi
= GetNextChar();
192 while( 0xA == cAnsi
)
193 cAnsi
= GetNextChar();
196 '\'' == GetNextChar() )
199 nNextCh
= GetNextChar();
202 bNextCh
= 0 == nNextCh
;
207 else if( SvParserState::Pending
!= eState
)
209 // Bug 34631 - "\ " read on - Blank as character
210 // eState = SvParserState::Error;
218 case sal_Unicode(EOF
):
219 eState
= SvParserState::Accepted
;
// snapshot the current overread count and source encoding on the state
// stack (presumably the opening-brace case; its label is not visible here)
225 if( 0 <= nOpenBrackets
)
227 RtfParserState_Impl
aState( nUCharOverread
, GetSrcEncoding() );
228 aParserStates
.push( aState
);
232 static_cast<size_t>(nOpenBrackets
) == aParserStates
.size(),
233 "ParserStateStack unequal to bracket count" );
// take overread count and encoding back from a saved parser state
// (presumably the closing-brace case; its label is not visible here)
240 if( 0 <= nOpenBrackets
)
243 if( !aParserStates
.empty() )
245 const RtfParserState_Impl
& rRPS
=
247 nUCharOverread
= rRPS
.nUCharOverread
;
248 SetSrcEncoding( rRPS
.eCodeSet
);
253 SetSrcEncoding( GetCodeSet() );
257 static_cast<size_t>(nOpenBrackets
) == aParserStates
.size(),
258 "ParserStateStack unequal to bracket count" );
267 // now normal text follows
269 nRet
= RTF_TEXTTOKEN
;
270 bNextCh
= 0 == nNextCh
;
275 nNextCh
= GetNextChar();
277 } while( !nRet
&& SvParserState::Working
== eState
);
// Reads the two characters of a hex escape with GetNextChar and adds the
// value of each hex digit to nHexVal. The subtracted constants map the
// ASCII codes onto digit values: 48 turns the digit zero into 0, 87 turns
// lowercase a into 10 and 55 turns uppercase A into 10. Characters outside
// these three ranges add nothing.
// NOTE(review): the per-digit shift of nHexVal and the return statement are
// not visible in this excerpt.
282 sal_Unicode
SvRTFParser::GetHexValue()
284 // collect Hex values
286 sal_Unicode nHexVal
= 0;
288 for( n
= 0; n
< 2; ++n
)
291 nNextCh
= GetNextChar();
292 if( nNextCh
>= '0' && nNextCh
<= '9' )
293 nHexVal
+= (nNextCh
- 48);
294 else if( nNextCh
>= 'a' && nNextCh
<= 'f' )
295 nHexVal
+= (nNextCh
- 87);
296 else if( nNextCh
>= 'A' && nNextCh
<= 'F' )
297 nHexVal
+= (nNextCh
- 55);
// Collects a run of plain text into aStrBuffer and appends it to aToken.
// The loop runs while bContinue is set, the parser is working and the
// buffer holds fewer than MAX_STRING_LEN characters.
//
// What the visible code handles:
//   - hex escapes: the bytes go into aByteString and are converted to
//     Unicode with the current source encoding before being appended
//   - the control symbols + ~ - _ which are appended as the character
//     itself, U+00A0, U+00AD and U+2011 respectively
//   - unicode characters: GetNextToken_ is re-entered with bRTF_InTextRead
//     set, nTokenValue is appended, and nUCharOverread following
//     characters are skipped
//   - a change of the overread count, applied in place so the text is not
//     broken into multiple tokens
//   - end of input, which sets the state to Error
//   - any other character, appended as is
//
// NOTE(review): the switch skeleton and most case labels are missing from
// this excerpt, so the exact branch structure cannot be confirmed here.
302 void SvRTFParser::ScanText()
304 const sal_Unicode cBreak
= 0;
305 OUStringBuffer aStrBuffer
;
306 bool bContinue
= true;
307 while( bContinue
&& IsParserWorking() && aStrBuffer
.getLength() < MAX_STRING_LEN
)
314 nNextCh
= GetNextChar();
// bytes from hex escapes are buffered here and later converted together
// using the source encoding
320 OStringBuffer aByteString
;
323 char c
= static_cast<char>(GetHexValue());
325 * Note: \'00 is a valid internal character in a
326 * string in RTF. OStringBuffer supports
327 * appending nulls fine
329 aByteString
.append(c
);
336 auto next
= GetNextChar();
337 if (sal_Unicode(EOF
) == next
)
342 if (next
>0xFF) // fix for #i43933# and #i35653#
344 if (!aByteString
.isEmpty())
346 aStrBuffer
.append( OStringToOUString(aByteString
, GetSrcEncoding()) );
347 aByteString
.setLength(0);
349 aStrBuffer
.append(static_cast<sal_Unicode
>(next
));
353 nSlash
= static_cast<char>(next
);
354 while (nSlash
== 0xD || nSlash
== 0xA)
355 nSlash
= static_cast<char>(GetNextChar());
365 aByteString
.append(nSlash
);
372 bContinue
= false; // abort, string together
376 nNextCh
= GetNextChar();
378 if (nSlash
!= '\\' || nNextCh
!= '\'')
381 nNextCh
= static_cast<unsigned char>(nSlash
);
388 if (!aByteString
.isEmpty())
390 aStrBuffer
.append( OStringToOUString(aByteString
, GetSrcEncoding()) );
391 aByteString
.setLength(0);
398 case '+': // I found in a RTF file
399 aStrBuffer
.append(sal_Unicode(nNextCh
));
401 case '~': // nonbreaking space
402 aStrBuffer
.append(u
'\x00A0');
404 case '-': // optional hyphen
405 aStrBuffer
.append(u
'\x00AD');
407 case '_': // nonbreaking hyphen
408 aStrBuffer
.append(u
'\x2011');
412 // read UNI-Code characters
414 nNextCh
= GetNextChar();
415 rInput
.SeekRel( -2 );
417 if( '-' == nNextCh
|| RTF_ISDIGIT( nNextCh
) )
// with this flag set, GetNextToken_ does not take its own text-token
// branch for the unicode character; the value is appended here instead
419 bRTF_InTextRead
= true;
421 OUString
sSave( aToken
); // GetNextToken_() overwrites this
423 int nToken
= GetNextToken_();
424 DBG_ASSERT( RTF_U
== nToken
, "still not a UNI-Code character" );
425 // don't convert symbol chars
426 aStrBuffer
.append(static_cast< sal_Unicode
>(nTokenValue
));
428 // overread the next n "RTF" characters. This
429 // can be also \{, \}, \'88
430 for( sal_uInt8 m
= 0; m
< nUCharOverread
; ++m
)
432 sal_Unicode cAnsi
= nNextCh
;
433 while( 0xD == cAnsi
)
434 cAnsi
= GetNextChar();
435 while( 0xA == cAnsi
)
436 cAnsi
= GetNextChar();
439 '\'' == GetNextChar() )
442 nNextCh
= GetNextChar();
446 bRTF_InTextRead
= false;
448 else if ( 'c' == nNextCh
)
450 // Prevent text breaking into multiple tokens.
452 nNextCh
= GetNextChar();
453 if (RTF_ISDIGIT( nNextCh
))
// the digits are folded into nNewOverread, which then replaces
// nUCharOverread and the value on the top parser state
455 sal_uInt8 nNewOverread
= 0 ;
458 nNewOverread
+= nNextCh
- '0';
459 nNextCh
= GetNextChar();
460 } while ( RTF_ISDIGIT( nNextCh
) );
461 nUCharOverread
= nNewOverread
;
462 if (!aParserStates
.empty())
463 aParserStates
.top().nUCharOverread
= nNewOverread
;
465 bNextCh
= 0x20 == nNextCh
;
470 bContinue
= false; // abort, string together
476 rInput
.SeekRel( -1 );
478 bContinue
= false; // abort, string together
484 case sal_Unicode(EOF
):
485 eState
= SvParserState::Error
;
497 if( nNextCh
== cBreak
|| aStrBuffer
.getLength() >= MAX_STRING_LEN
)
502 // all other characters end up in the text
503 aStrBuffer
.appendUtf32(nNextCh
);
505 if (sal_Unicode(EOF
) == (nNextCh
= GetNextChar()))
507 if (!aStrBuffer
.isEmpty())
508 aToken
.append( aStrBuffer
);
513 (RTF_ISALPHA(nNextCh
) || RTF_ISDIGIT(nNextCh
)) &&
514 (aStrBuffer
.getLength() < MAX_STRING_LEN
)
520 if( bContinue
&& bNextCh
)
521 nNextCh
= GetNextChar();
524 if (!aStrBuffer
.isEmpty())
525 aToken
.append( aStrBuffer
);
// Out-of-class definition of the static member _inSkipGroup, starting at 0.
// NOTE(review): how SkipGroup uses it is not visible in this excerpt.
529 short SvRTFParser::_inSkipGroup
=0;
// Skips input by fetching tokens with GetNextToken_ until end of input is
// reached or the parser stops working.
//   - On RTF_BIN the stream is advanced by nTokenValue with SeekRel and the
//     next character is re-read; a negative value is reported through
//     SAL_WARN_IF.
//   - Carriage returns and line feeds following a token are skipped.
//   - Afterwards, unless the state is Pending or the next character is a
//     closing brace, the state is set to Error.
// NOTE(review): the bracket counting and any guard around the SeekRel call
// are not visible in this excerpt.
531 void SvRTFParser::SkipGroup()
537 //#i16185# faking \bin keyword
552 int nToken
= GetNextToken_();
553 if (nToken
== RTF_BIN
)
556 SAL_WARN_IF(nTokenValue
< 0, "svtools", "negative value argument for rtf \\bin keyword");
558 rInput
.SeekRel(nTokenValue
);
559 nNextCh
= GetNextChar();
561 while (nNextCh
==0xa || nNextCh
==0xd)
563 nNextCh
= GetNextChar();
565 } while (sal_Unicode(EOF
) != nNextCh
&& IsParserWorking());
567 if( SvParserState::Pending
!= eState
&& '}' != nNextCh
)
568 eState
= SvParserState::Error
;
// Handling of unknown data in this class: delegate to SkipGroup.
572 void SvRTFParser::ReadUnknownData() { SkipGroup(); }
// Handling of bitmap data in this class: delegate to SkipGroup.
573 void SvRTFParser::ReadBitmapData() { SkipGroup(); }
// Starts a parse run. It records the current stream position in
// nNextChPos, reads the first character into nNextCh, sets the state to
// Working and resets both the code set and the source encoding to MS-1252.
// The document is accepted for parsing only if the first two tokens are an
// opening brace followed by RTF_RTF. In that path a scope guard calls
// ReleaseRef when the scope ends, unless the state is Pending.
// NOTE(review): the statements inside the guarded scope and the return are
// not visible in this excerpt; the final assignment of Error is presumably
// the branch taken when the two leading tokens do not match.
576 SvParserState
SvRTFParser::CallParser()
579 nNextChPos
= rInput
.Tell();
580 rInput
.ReadChar( cFirstCh
);
581 nNextCh
= static_cast<unsigned char>(cFirstCh
);
582 eState
= SvParserState::Working
;
584 eCodeSet
= RTL_TEXTENCODING_MS_1252
;
585 SetSrcEncoding( eCodeSet
);
587 // the first two tokens should be '{' and \\rtf !!
588 if( '{' == GetNextToken() && RTF_RTF
== GetNextToken() )
591 // call ReleaseRef at end of this scope, even in the face of exceptions
592 comphelper::ScopeGuard
g([this] {
593 if( SvParserState::Pending
!= eState
)
594 ReleaseRef(); // now parser is not needed anymore
599 eState
= SvParserState::Error
;
// Main token dispatch loop.
//   nToken - token to start with; the visible code can replace it with the
//            result of GetNextToken before the loop starts
// The loop runs while the parser is working and no stall was detected.
//
// What the visible code does per token:
//   - probes a possible unknown group by looking ahead with GetNextToken
//     and stepping back with SkipToken
//   - ignores RTF_UNKNOWNCONTROL
//   - switches eCodeSet and the source encoding to MS-1252, Apple Roman,
//     IBM 437, IBM 850 or the encoding derived from a Windows code page
//     given in nTokenValue
//   - calls SaveState( 0 ) if still working, then fetches the next token
// After the loop, a state of Accepted with brackets still open is turned
// into Error.
// NOTE(review): the switch skeleton and most case labels are missing from
// this excerpt, so which token selects which encoding cannot be confirmed.
604 void SvRTFParser::Continue( int nToken
)
606 // DBG_ASSERT( SVPAR_CS_DONTKNOW == GetCharSet(),
607 // "Characterset was changed." );
610 nToken
= GetNextToken();
612 bool bLooping
= false;
614 while (IsParserWorking() && !bLooping
)
616 auto nCurrentTokenIndex
= m_nTokenIndex
;
617 auto nCurrentToken
= nToken
;
625 eState
= SvParserState::Accepted
;
629 // an unknown group ?
631 if( RTF_IGNOREFLAG
!= GetNextToken() )
632 nToken
= SkipToken();
633 else if( RTF_UNKNOWNCONTROL
!= GetNextToken() )
634 nToken
= SkipToken( -2 );
637 // filter immediately
639 nToken
= GetNextToken();
641 eState
= SvParserState::Error
;
642 break; // move to next token!!
647 case RTF_UNKNOWNCONTROL
:
648 break; // skip unknown token
651 eCodeSet
= RTL_TEXTENCODING_MS_1252
;
652 SetSrcEncoding( eCodeSet
);
655 eCodeSet
= RTL_TEXTENCODING_APPLE_ROMAN
;
656 SetSrcEncoding( eCodeSet
);
659 eCodeSet
= RTL_TEXTENCODING_IBM_437
;
660 SetSrcEncoding( eCodeSet
);
663 eCodeSet
= RTL_TEXTENCODING_IBM_850
;
664 SetSrcEncoding( eCodeSet
);
667 eCodeSet
= rtl_getTextEncodingFromWindowsCodePage(nTokenValue
);
668 SetSrcEncoding(eCodeSet
);
675 if( IsParserWorking() )
676 SaveState( 0 ); // processed till here,
677 // continue with new token!
678 nToken
= GetNextToken();
// stall guard: an unchanged token index together with an unchanged token
// means no progress was made, so the loop condition ends the loop
679 bLooping
= nCurrentTokenIndex
== m_nTokenIndex
&& nToken
== nCurrentToken
;
681 if( SvParserState::Accepted
== eState
&& 0 < nOpenBrackets
)
682 eState
= SvParserState::Error
;
// Makes eEnc the active source encoding.
//   eEnc - requested text encoding
// The value is stored on the top entry of aParserStates when the stack is
// not empty, and is then passed to SetSrcEncoding.
// NOTE(review): the statement executed when eEnc is
// RTL_TEXTENCODING_DONTKNOW is not visible in this excerpt - presumably a
// fallback encoding is substituted; confirm against the full file.
685 void SvRTFParser::SetEncoding( rtl_TextEncoding eEnc
)
687 if (eEnc
== RTL_TEXTENCODING_DONTKNOW
)
690 if (!aParserStates
.empty())
691 aParserStates
.top().eCodeSet
= eEnc
;
692 SetSrcEncoding(eEnc
);
695 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */