1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
3 * This file is part of the LibreOffice project.
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 * This file incorporates work covered by the following license notice:
11 * Licensed to the Apache Software Foundation (ASF) under one or more
12 * contributor license agreements. See the NOTICE file distributed
13 * with this work for additional information regarding copyright
14 * ownership. The ASF licenses this file to you under the Apache
15 * License, Version 2.0 (the "License"); you may not use this file
16 * except in compliance with the License. You may obtain a copy of
17 * the License at http://www.apache.org/licenses/LICENSE-2.0 .
20 #include <sal/config.h>
21 #include <sal/log.hxx>
23 #include <comphelper/scopeguard.hxx>
25 #include <rtl/character.hxx>
26 #include <rtl/strbuf.hxx>
27 #include <rtl/tencinfo.h>
28 #include <rtl/ustrbuf.hxx>
29 #include <tools/stream.hxx>
30 #include <tools/debug.hxx>
31 #include <svtools/rtftoken.h>
32 #include <svtools/parrtf.hxx>
/// Upper bound on the length of the text buffer that ScanText() fills before
/// it stops collecting and hands the text over as one token. It is compared
/// against OUStringBuffer::getLength() in that function.
constexpr int MAX_STRING_LEN = 1024;
// Character-class helpers used by the tokenizer: each macro forwards its
// argument unchanged to rtl::isAsciiDigit / rtl::isAsciiAlpha.
36 #define RTF_ISDIGIT( c ) rtl::isAsciiDigit(c)
37 #define RTF_ISALPHA( c ) rtl::isAsciiAlpha(c)
// Constructor. Forwards rIn and nStackSize to the SvParser<int> base class,
// initializes eCodeSet to RTL_TEXTENCODING_MS_1252, sets the source encoding
// to the same value and clears the bRTF_InTextRead flag.
// NOTE(review): further member initializers may sit on lines that are not
// visible in this listing.
39 SvRTFParser::SvRTFParser( SvStream
& rIn
, sal_uInt8 nStackSize
)
40 : SvParser
<int>( rIn
, nStackSize
)
43 , eCodeSet(RTL_TEXTENCODING_MS_1252
)
46 // default is ANSI-CodeSet
47 SetSrcEncoding( RTL_TEXTENCODING_MS_1252
);
48 bRTF_InTextRead
= false;
// Destructor of SvRTFParser (its body is not visible in this listing).
51 SvRTFParser::~SvRTFParser()
// Tokenizer core: reads characters with GetNextChar() and yields the id of the
// next RTF token. The trailing while-condition shows that it keeps looping
// until nRet has been assigned a token id or eState is no longer
// SvParserState::Working.
//
// What the visible lines establish:
//  - single-character control symbols are matched by case labels
//    ('+', '~', '-', '_', '\'', '*', ':', '|'); '*' yields RTF_IGNOREFLAG and
//    ':' yields RTF_SUBENTRYINDEX;
//  - a control word is collected from ASCII letters into aToken, a following
//    run of digits is parsed into nTokenValue, one following blank is consumed,
//    and the word is resolved with GetRTFToken(), falling back to
//    RTF_UNKNOWNCONTROL;
//  - an RtfParserState_Impl built from nUCharOverread and GetSrcEncoding() is
//    pushed onto aParserStates, and elsewhere nUCharOverread and the source
//    encoding are restored from a saved state (rRPS);
//  - reaching sal_Unicode(EOF) sets eState to SvParserState::Accepted.
//
// NOTE(review): this listing omits many lines (braces, switch/case heads, the
// do-loop openers), so the exact nesting of the statements below cannot be
// confirmed from here.
56 int SvRTFParser::GetNextToken_()
66 nNextCh
= GetNextChar();
72 case '+': // I found it in a RTF-file
73 case '~': // nonbreaking space
74 case '-': // optional hyphen
75 case '_': // nonbreaking hyphen
76 case '\'': // HexValue
81 bNextCh
= 0 == nNextCh
;
84 case '*': // ignoreflag
85 nRet
= RTF_IGNOREFLAG
;
87 case ':': // subentry in an index entry
88 nRet
= RTF_SUBENTRYINDEX
;
90 case '|': // formula-character
// control word: append consecutive ASCII letters to aToken
100 if( RTF_ISALPHA( nNextCh
) )
105 aToken
.appendUtf32(nNextCh
);
106 nNextCh
= GetNextChar();
107 } while( RTF_ISALPHA( nNextCh
) );
110 // minus before numeric parameters
111 bool bNegValue
= false;
115 nNextCh
= GetNextChar();
118 // possible numeric parameter
119 if( RTF_ISDIGIT( nNextCh
) )
121 OUStringBuffer aNumber
;
123 aNumber
.append(static_cast<sal_Unicode
>(nNextCh
));
124 nNextCh
= GetNextChar();
125 } while( RTF_ISDIGIT( nNextCh
) );
// convert the collected digits into the numeric parameter
126 nTokenValue
= OUString::unacquired(aNumber
).toInt32();
128 nTokenValue
= -nTokenValue
;
131 else if( bNegValue
) // restore minus
// step the input stream back by one character
134 rInput
.SeekRel( -1 );
136 if( ' ' == nNextCh
) // blank is part of token!
137 nNextCh
= GetNextChar();
139 // search for the token in the table:
140 if( 0 == (nRet
= GetRTFToken( aToken
)) )
142 nRet
= RTF_UNKNOWNCONTROL
;
144 // bug 76812 - unicode token handled as normal text
// a non-negative parameter is stored in nUCharOverread and mirrored into the
// top parser state; the sal_uInt8 cast truncates values above 255
149 if( 0 <= nTokenValue
)
151 nUCharOverread
= static_cast<sal_uInt8
>(nTokenValue
);
152 if (!aParserStates
.empty())
154 //cmc: other ifdef breaks #i3584
155 aParserStates
.top().nUCharOverread
= nUCharOverread
;
158 aToken
.setLength( 0 ); // #i47831# erase token to prevent the token from being treated as text
// nesting guard: once nUPRLevel exceeds 256 a warning is logged and the
// parser state is set to Error
166 if (nUPRLevel
> 256) // fairly sure > 1 is probably an error, but provide some leeway
168 SAL_WARN("svtools", "urp stack too deep");
169 eState
= SvParserState::Error
;
175 // UPR - overread the group with the ansi
180 nNextToken
= GetNextToken_();
182 while (nNextToken
!= '{' && nNextToken
!= sal_Unicode(EOF
) && IsParserWorking());
185 GetNextToken_(); // overread the last bracket
// when not called from within text scanning, the numeric parameter itself
// becomes a one-character text token
193 if( !bRTF_InTextRead
)
195 nRet
= RTF_TEXTTOKEN
;
196 aToken
= OUStringChar( static_cast<sal_Unicode
>(nTokenValue
) );
198 // overread the next n "RTF" characters. This
199 // can be also \{, \}, \'88
200 for( sal_uInt8 m
= 0; m
< nUCharOverread
; ++m
)
202 sal_uInt32 cAnsi
= nNextCh
;
203 while( 0xD == cAnsi
)
204 cAnsi
= GetNextChar();
205 while( 0xA == cAnsi
)
206 cAnsi
= GetNextChar();
209 '\'' == GetNextChar() )
212 nNextCh
= GetNextChar();
215 bNextCh
= 0 == nNextCh
;
220 else if( SvParserState::Pending
!= eState
)
222 // Bug 34631 - "\ " read on - Blank as character
223 // eState = SvParserState::Error;
// end of input: mark the parse as accepted
231 case sal_Unicode(EOF
):
232 eState
= SvParserState::Accepted
;
// save the current overread count and source encoding on the state stack
238 if( 0 <= nOpenBrackets
)
240 RtfParserState_Impl
aState( nUCharOverread
, GetSrcEncoding() );
241 aParserStates
.push( aState
);
245 static_cast<size_t>(nOpenBrackets
) == aParserStates
.size(),
246 "ParserStateStack unequal to bracket count" );
253 if( 0 <= nOpenBrackets
)
256 if( !aParserStates
.empty() )
258 const RtfParserState_Impl
& rRPS
=
// restore the overread count and source encoding from the saved state
260 nUCharOverread
= rRPS
.nUCharOverread
;
261 SetSrcEncoding( rRPS
.eCodeSet
);
266 SetSrcEncoding( GetCodeSet() );
270 static_cast<size_t>(nOpenBrackets
) == aParserStates
.size(),
271 "ParserStateStack unequal to bracket count" );
280 // now normal text follows
282 nRet
= RTF_TEXTTOKEN
;
283 bNextCh
= 0 == nNextCh
;
288 nNextCh
= GetNextChar();
290 } while( !nRet
&& SvParserState::Working
== eState
);
// Reads two characters with GetNextChar() and adds the hexadecimal digit value
// of each one to nHexVal.
// NOTE(review): the step that shifts the first digit before the second one is
// added, and the return statement, are on lines not visible in this listing.
295 sal_Unicode
SvRTFParser::GetHexValue()
297 // collect Hex values
299 sal_Unicode nHexVal
= 0;
301 for( n
= 0; n
< 2; ++n
)
304 nNextCh
= GetNextChar();
305 if( nNextCh
>= '0' && nNextCh
<= '9' )
// '0'..'9' map to 0..9 (48 is ASCII '0')
306 nHexVal
+= (nNextCh
- 48);
307 else if( nNextCh
>= 'a' && nNextCh
<= 'f' )
// 'a'..'f' map to 10..15 (87 is ASCII 'a' minus 10)
308 nHexVal
+= (nNextCh
- 87);
309 else if( nNextCh
>= 'A' && nNextCh
<= 'F' )
// 'A'..'F' map to 10..15 (55 is ASCII 'A' minus 10)
310 nHexVal
+= (nNextCh
- 55);
// Collects running text into the local aStrBuffer and finally appends it to
// aToken. The main loop runs while bContinue is set, the parser is working and
// the buffer holds fewer than MAX_STRING_LEN characters.
//
// What the visible lines establish:
//  - hex-escaped bytes are gathered in aByteString and converted with
//    OStringToOUString() using the current source encoding;
//  - the symbols '~', '-' and '_' are appended as U+00A0, U+00AD and U+2011;
//  - for a Unicode escape, GetNextToken_() is called with bRTF_InTextRead set,
//    nTokenValue is appended as one sal_Unicode, and up to nUCharOverread
//    following characters are read over;
//  - a control word starting with 'c' followed by digits updates
//    nUCharOverread (and the top parser state) without ending the text;
//  - sal_Unicode(EOF) inside this path sets eState to SvParserState::Error.
//
// NOTE(review): many structural lines (switch/case heads, braces, else
// branches) are missing from this listing, so the nesting is not confirmed.
315 void SvRTFParser::ScanText()
317 const sal_Unicode cBreak
= 0;
318 OUStringBuffer aStrBuffer
;
319 bool bContinue
= true;
320 while( bContinue
&& IsParserWorking() && aStrBuffer
.getLength() < MAX_STRING_LEN
)
327 nNextCh
= GetNextChar();
333 OStringBuffer aByteString
;
336 char c
= static_cast<char>(GetHexValue());
338 * Note: \'00 is a valid internal character in a
339 * string in RTF. OStringBuffer supports
340 * appending nulls fine
342 aByteString
.append(c
);
349 auto next
= GetNextChar();
350 if (sal_Unicode(EOF
) == next
)
// a value above 0xFF is not a single byte: flush the pending bytes through the
// source encoding first, then append the value itself as one sal_Unicode
355 if (next
>0xFF) // fix for #i43933# and #i35653#
357 if (!aByteString
.isEmpty())
359 aStrBuffer
.append( OStringToOUString(aByteString
, GetSrcEncoding()) );
360 aByteString
.setLength(0);
362 aStrBuffer
.append(static_cast<sal_Unicode
>(next
));
366 nSlash
= static_cast<char>(next
);
// read past carriage returns and line feeds
367 while (nSlash
== 0xD || nSlash
== 0xA)
368 nSlash
= static_cast<char>(GetNextChar());
378 aByteString
.append(nSlash
);
385 bContinue
= false; // abort, string together
389 nNextCh
= GetNextChar();
391 if (nSlash
!= '\\' || nNextCh
!= '\'')
394 nNextCh
= static_cast<unsigned char>(nSlash
);
// convert any collected bytes with the current source encoding and reset the
// byte buffer
401 if (!aByteString
.isEmpty())
403 aStrBuffer
.append( OStringToOUString(aByteString
, GetSrcEncoding()) );
404 aByteString
.setLength(0);
411 case '+': // I found in a RTF file
412 aStrBuffer
.append(sal_Unicode(nNextCh
));
414 case '~': // nonbreaking space
415 aStrBuffer
.append(u
'\x00A0');
417 case '-': // optional hyphen
418 aStrBuffer
.append(u
'\x00AD');
420 case '_': // nonbreaking hyphen
421 aStrBuffer
.append(u
'\x2011');
425 // read UNI-Code characters
// peek at the next character, then move the stream back by two positions
427 nNextCh
= GetNextChar();
428 rInput
.SeekRel( -2 );
430 if( '-' == nNextCh
|| RTF_ISDIGIT( nNextCh
) )
// set while GetNextToken_() is invoked from here; cleared again further down
432 bRTF_InTextRead
= true;
434 OUString
sSave( aToken
); // GetNextToken_() overwrites this
436 int nToken
= GetNextToken_();
437 DBG_ASSERT( RTF_U
== nToken
, "still not a UNI-Code character" );
438 // don't convert symbol chars
439 aStrBuffer
.append(static_cast< sal_Unicode
>(nTokenValue
));
441 // overread the next n "RTF" characters. This
442 // can be also \{, \}, \'88
443 for( sal_uInt8 m
= 0; m
< nUCharOverread
; ++m
)
445 sal_Unicode cAnsi
= nNextCh
;
446 while( 0xD == cAnsi
)
447 cAnsi
= GetNextChar();
448 while( 0xA == cAnsi
)
449 cAnsi
= GetNextChar();
452 '\'' == GetNextChar() )
455 nNextCh
= GetNextChar();
459 bRTF_InTextRead
= false;
461 else if ( 'c' == nNextCh
)
463 // Prevent text breaking into multiple tokens.
465 nNextCh
= GetNextChar();
466 if (RTF_ISDIGIT( nNextCh
))
// parse the digits into nNewOverread, then store the result in nUCharOverread
// and in the top parser state
// NOTE(review): any per-digit scaling step is not visible in this listing
468 sal_uInt8 nNewOverread
= 0 ;
471 nNewOverread
+= nNextCh
- '0';
472 nNextCh
= GetNextChar();
473 } while ( RTF_ISDIGIT( nNextCh
) );
474 nUCharOverread
= nNewOverread
;
475 if (!aParserStates
.empty())
476 aParserStates
.top().nUCharOverread
= nNewOverread
;
478 bNextCh
= 0x20 == nNextCh
;
483 bContinue
= false; // abort, string together
489 rInput
.SeekRel( -1 );
491 bContinue
= false; // abort, string together
// end of input while scanning text is recorded as an error
497 case sal_Unicode(EOF
):
498 eState
= SvParserState::Error
;
510 if( nNextCh
== cBreak
|| aStrBuffer
.getLength() >= MAX_STRING_LEN
)
515 // all other characters end up in the text
516 aStrBuffer
.appendUtf32(nNextCh
);
518 if (sal_Unicode(EOF
) == (nNextCh
= GetNextChar()))
520 if (!aStrBuffer
.isEmpty())
521 aToken
.append( aStrBuffer
);
526 (RTF_ISALPHA(nNextCh
) || RTF_ISDIGIT(nNextCh
)) &&
527 (aStrBuffer
.getLength() < MAX_STRING_LEN
)
533 if( bContinue
&& bNextCh
)
534 nNextCh
= GetNextChar();
// hand the collected text over to the token
537 if (!aStrBuffer
.isEmpty())
538 aToken
.append( aStrBuffer
);
// Out-of-class definition of the static data member SvRTFParser::_inSkipGroup,
// initialised to 0.
542 short SvRTFParser::_inSkipGroup
=0;
// Reads over input token by token using GetNextToken_(). The visible loop
// runs until nNextCh is sal_Unicode(EOF) or the parser stops working.
// When the token is RTF_BIN, a warning is logged for a negative nTokenValue
// and the stream is moved by nTokenValue via SeekRel() before reading on.
// After the loop, unless eState is Pending, a next character other than '}'
// sets eState to SvParserState::Error.
// NOTE(review): it is not visible here whether the SeekRel() call is skipped
// for a negative nTokenValue. The loop head and any bracket counting are also
// on lines missing from this listing.
544 void SvRTFParser::SkipGroup()
550 //#i16185# faking \bin keyword
565 int nToken
= GetNextToken_();
566 if (nToken
== RTF_BIN
)
569 SAL_WARN_IF(nTokenValue
< 0, "svtools", "negative value argument for rtf \\bin keyword");
571 rInput
.SeekRel(nTokenValue
);
572 nNextCh
= GetNextChar();
// read past line feeds and carriage returns
574 while (nNextCh
==0xa || nNextCh
==0xd)
576 nNextCh
= GetNextChar();
578 } while (sal_Unicode(EOF
) != nNextCh
&& IsParserWorking());
580 if( SvParserState::Pending
!= eState
&& '}' != nNextCh
)
581 eState
= SvParserState::Error
;
// Handling of unknown data: delegates directly to SkipGroup().
585 void SvRTFParser::ReadUnknownData() { SkipGroup(); }
// Handling of bitmap data: delegates directly to SkipGroup().
586 void SvRTFParser::ReadBitmapData() { SkipGroup(); }
// Starts a parse run and returns an SvParserState.
// Visible steps: remember the current stream position in nNextChPos, read the
// first character into nNextCh, set eState to Working, reset eCodeSet and the
// source encoding to RTL_TEXTENCODING_MS_1252, then require the first two
// tokens to be '{' and RTF_RTF. Inside that branch a scope guard calls
// ReleaseRef() on scope exit unless eState is Pending.
// NOTE(review): the final assignment of SvParserState::Error presumably sits
// in the else branch of the '{' / RTF_RTF check. The branch structure and the
// return statement are not visible in this listing.
589 SvParserState
SvRTFParser::CallParser()
592 nNextChPos
= rInput
.Tell();
593 rInput
.ReadChar( cFirstCh
);
594 nNextCh
= static_cast<unsigned char>(cFirstCh
);
595 eState
= SvParserState::Working
;
597 eCodeSet
= RTL_TEXTENCODING_MS_1252
;
598 SetSrcEncoding( eCodeSet
);
600 // the first two tokens should be '{' and \\rtf !!
601 if( '{' == GetNextToken() && RTF_RTF
== GetNextToken() )
604 // call ReleaseRef at end of this scope, even in the face of exceptions
605 comphelper::ScopeGuard
g([this] {
606 if( SvParserState::Pending
!= eState
)
607 ReleaseRef(); // now parser is not needed anymore
612 eState
= SvParserState::Error
;
// Main token-processing loop. Runs while the parser is working and no
// stall has been detected (bLooping).
//
// What the visible lines establish:
//  - character-set handling assigns eCodeSet and calls SetSrcEncoding() with
//    MS-1252, Apple Roman, IBM 437, IBM 850, or the encoding that
//    rtl_getTextEncodingFromWindowsCodePage() returns for nTokenValue;
//  - RTF_UNKNOWNCONTROL tokens are skipped;
//  - after each token, SaveState(0) is called while the parser is still
//    working, the next token is fetched, and bLooping is set when neither
//    m_nTokenIndex nor the token value changed;
//  - after the loop, an Accepted state with nOpenBrackets still above zero is
//    turned into SvParserState::Error.
//
// NOTE(review): the switch head and most case labels are missing from this
// listing, so which token triggers which branch is not confirmed from here.
617 void SvRTFParser::Continue( int nToken
)
619 // DBG_ASSERT( SVPAR_CS_DONTKNOW == GetCharSet(),
620 // "Characterset was changed." );
623 nToken
= GetNextToken();
625 bool bLooping
= false;
627 while (IsParserWorking() && !bLooping
)
// snapshot used below to detect that no progress was made
629 auto nCurrentTokenIndex
= m_nTokenIndex
;
630 auto nCurrentToken
= nToken
;
638 eState
= SvParserState::Accepted
;
642 // an unknown group ?
644 if( RTF_IGNOREFLAG
!= GetNextToken() )
645 nToken
= SkipToken();
646 else if( RTF_UNKNOWNCONTROL
!= GetNextToken() )
647 nToken
= SkipToken( -2 );
650 // filter immediately
652 nToken
= GetNextToken();
654 eState
= SvParserState::Error
;
655 break; // move to next token!!
660 case RTF_UNKNOWNCONTROL
:
661 break; // skip unknown token
664 eCodeSet
= RTL_TEXTENCODING_MS_1252
;
665 SetSrcEncoding( eCodeSet
);
668 eCodeSet
= RTL_TEXTENCODING_APPLE_ROMAN
;
669 SetSrcEncoding( eCodeSet
);
672 eCodeSet
= RTL_TEXTENCODING_IBM_437
;
673 SetSrcEncoding( eCodeSet
);
676 eCodeSet
= RTL_TEXTENCODING_IBM_850
;
677 SetSrcEncoding( eCodeSet
);
// derive the encoding from the Windows code page number in nTokenValue
680 eCodeSet
= rtl_getTextEncodingFromWindowsCodePage(nTokenValue
);
681 SetSrcEncoding(eCodeSet
);
688 if( IsParserWorking() )
689 SaveState( 0 ); // processed till here,
690 // continue with new token!
691 nToken
= GetNextToken();
// stop when neither the token index nor the token value advanced
692 bLooping
= nCurrentTokenIndex
== m_nTokenIndex
&& nToken
== nCurrentToken
;
// input ended while groups were still open: report an error
694 if( SvParserState::Accepted
== eState
&& 0 < nOpenBrackets
)
695 eState
= SvParserState::Error
;
// Makes eEnc the source encoding: stores it in the eCodeSet field of the top
// parser state when the state stack is not empty, then calls SetSrcEncoding().
// NOTE(review): the statement executed when eEnc is RTL_TEXTENCODING_DONTKNOW
// is on a line not visible in this listing.
698 void SvRTFParser::SetEncoding( rtl_TextEncoding eEnc
)
700 if (eEnc
== RTL_TEXTENCODING_DONTKNOW
)
703 if (!aParserStates
.empty())
704 aParserStates
.top().eCodeSet
= eEnc
;
705 SetSrcEncoding(eEnc
);
708 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */