// Source: [LibreOffice.git] / svtools / source / svrtf / parrtf.cxx
// Version 7.6.3.2-android, tag libreoffice-7.6.3.2-android
// blob 42fcc211b264b7dcfbef4bf3f88ff5e4ceb27521
/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
/*
 * This file is part of the LibreOffice project.
 *
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
 *
 * This file incorporates work covered by the following license notice:
 *
 *   Licensed to the Apache Software Foundation (ASF) under one or more
 *   contributor license agreements. See the NOTICE file distributed
 *   with this work for additional information regarding copyright
 *   ownership. The ASF licenses this file to you under the Apache
 *   License, Version 2.0 (the "License"); you may not use this file
 *   except in compliance with the License. You may obtain a copy of
 *   the License at http://www.apache.org/licenses/LICENSE-2.0 .
 */
#include <sal/config.h>
#include <sal/log.hxx>

#include <comphelper/scopeguard.hxx>

#include <rtl/character.hxx>
#include <rtl/strbuf.hxx>
#include <rtl/tencinfo.h>
#include <rtl/ustrbuf.hxx>
#include <tools/stream.hxx>
#include <tools/debug.hxx>
#include <svtools/rtftoken.h>
#include <svtools/parrtf.hxx>
// Upper bound on the number of UTF-16 code units ScanText() collects before
// it hands the text back as one token.
constexpr int MAX_STRING_LEN = 1024;

// ASCII-only character classification used by the RTF tokenizer.
#define RTF_ISDIGIT( c ) rtl::isAsciiDigit(c)
#define RTF_ISALPHA( c ) rtl::isAsciiAlpha(c)
39 SvRTFParser::SvRTFParser( SvStream& rIn, sal_uInt8 nStackSize )
40 : SvParser<int>( rIn, nStackSize )
41 , nOpenBrackets(0)
42 , eCodeSet(RTL_TEXTENCODING_MS_1252)
43 , nUCharOverread(1)
45 // default is ANSI-CodeSet
46 SetSrcEncoding( RTL_TEXTENCODING_MS_1252 );
47 bRTF_InTextRead = false;
50 SvRTFParser::~SvRTFParser()
55 int SvRTFParser::GetNextToken_()
57 int nRet = 0;
58 do {
59 bool bNextCh = true;
60 switch( nNextCh )
62 case '\\':
64 // control characters
65 nNextCh = GetNextChar();
66 switch( nNextCh )
68 case '{':
69 case '}':
70 case '\\':
71 case '+': // I found it in a RTF-file
72 case '~': // nonbreaking space
73 case '-': // optional hyphen
74 case '_': // nonbreaking hyphen
75 case '\'': // HexValue
76 nNextCh = '\\';
77 rInput.SeekRel( -1 );
78 ScanText();
79 nRet = RTF_TEXTTOKEN;
80 bNextCh = 0 == nNextCh;
81 break;
83 case '*': // ignoreflag
84 nRet = RTF_IGNOREFLAG;
85 break;
86 case ':': // subentry in an index entry
87 nRet = RTF_SUBENTRYINDEX;
88 break;
89 case '|': // formula-character
90 nRet = RTF_FORMULA;
91 break;
93 case 0x0a:
94 case 0x0d:
95 nRet = RTF_PAR;
96 break;
98 default:
99 if( RTF_ISALPHA( nNextCh ) )
101 aToken = "\\";
103 do {
104 aToken.appendUtf32(nNextCh);
105 nNextCh = GetNextChar();
106 } while( RTF_ISALPHA( nNextCh ) );
109 // minus before numeric parameters
110 bool bNegValue = false;
111 if( '-' == nNextCh )
113 bNegValue = true;
114 nNextCh = GetNextChar();
117 // possible numeric parameter
118 if( RTF_ISDIGIT( nNextCh ) )
120 OUStringBuffer aNumber;
121 do {
122 aNumber.append(static_cast<sal_Unicode>(nNextCh));
123 nNextCh = GetNextChar();
124 } while( RTF_ISDIGIT( nNextCh ) );
125 nTokenValue = OUString::unacquired(aNumber).toInt32();
126 if( bNegValue )
127 nTokenValue = -nTokenValue;
128 bTokenHasValue=true;
130 else if( bNegValue ) // restore minus
132 nNextCh = '-';
133 rInput.SeekRel( -1 );
135 if( ' ' == nNextCh ) // blank is part of token!
136 nNextCh = GetNextChar();
138 // search for the token in the table:
139 if( 0 == (nRet = GetRTFToken( aToken )) )
140 // Unknown Control
141 nRet = RTF_UNKNOWNCONTROL;
143 // bug 76812 - unicode token handled as normal text
144 bNextCh = false;
145 switch( nRet )
147 case RTF_UC:
148 if( 0 <= nTokenValue )
150 nUCharOverread = static_cast<sal_uInt8>(nTokenValue);
151 if (!aParserStates.empty())
153 //cmc: other ifdef breaks #i3584
154 aParserStates.top().nUCharOverread = nUCharOverread;
157 aToken.setLength( 0 ); // #i47831# erase token to prevent the token from being treated as text
158 // read next token
159 nRet = 0;
160 break;
162 case RTF_UPR:
163 if (!_inSkipGroup) {
164 // UPR - overread the group with the ansi
165 // information
166 int nNextToken;
169 nNextToken = GetNextToken_();
171 while (nNextToken != '{' && nNextToken != sal_Unicode(EOF) && IsParserWorking());
173 SkipGroup();
174 GetNextToken_(); // overread the last bracket
175 nRet = 0;
177 break;
179 case RTF_U:
180 if( !bRTF_InTextRead )
182 nRet = RTF_TEXTTOKEN;
183 aToken = OUStringChar( static_cast<sal_Unicode>(nTokenValue) );
185 // overread the next n "RTF" characters. This
186 // can be also \{, \}, \'88
187 for( sal_uInt8 m = 0; m < nUCharOverread; ++m )
189 sal_uInt32 cAnsi = nNextCh;
190 while( 0xD == cAnsi )
191 cAnsi = GetNextChar();
192 while( 0xA == cAnsi )
193 cAnsi = GetNextChar();
195 if( '\\' == cAnsi &&
196 '\'' == GetNextChar() )
197 // skip HexValue
198 GetHexValue();
199 nNextCh = GetNextChar();
201 ScanText();
202 bNextCh = 0 == nNextCh;
204 break;
207 else if( SvParserState::Pending != eState )
209 // Bug 34631 - "\ " read on - Blank as character
210 // eState = SvParserState::Error;
211 bNextCh = false;
213 break;
216 break;
218 case sal_Unicode(EOF):
219 eState = SvParserState::Accepted;
220 nRet = nNextCh;
221 break;
223 case '{':
225 if( 0 <= nOpenBrackets )
227 RtfParserState_Impl aState( nUCharOverread, GetSrcEncoding() );
228 aParserStates.push( aState );
230 ++nOpenBrackets;
231 DBG_ASSERT(
232 static_cast<size_t>(nOpenBrackets) == aParserStates.size(),
233 "ParserStateStack unequal to bracket count" );
234 nRet = nNextCh;
236 break;
238 case '}':
239 --nOpenBrackets;
240 if( 0 <= nOpenBrackets )
242 aParserStates.pop();
243 if( !aParserStates.empty() )
245 const RtfParserState_Impl& rRPS =
246 aParserStates.top();
247 nUCharOverread = rRPS.nUCharOverread;
248 SetSrcEncoding( rRPS.eCodeSet );
250 else
252 nUCharOverread = 1;
253 SetSrcEncoding( GetCodeSet() );
256 DBG_ASSERT(
257 static_cast<size_t>(nOpenBrackets) == aParserStates.size(),
258 "ParserStateStack unequal to bracket count" );
259 nRet = nNextCh;
260 break;
262 case 0x0d:
263 case 0x0a:
264 break;
266 default:
267 // now normal text follows
268 ScanText();
269 nRet = RTF_TEXTTOKEN;
270 bNextCh = 0 == nNextCh;
271 break;
274 if( bNextCh )
275 nNextCh = GetNextChar();
277 } while( !nRet && SvParserState::Working == eState );
278 return nRet;
282 sal_Unicode SvRTFParser::GetHexValue()
284 // collect Hex values
285 int n;
286 sal_Unicode nHexVal = 0;
288 for( n = 0; n < 2; ++n )
290 nHexVal *= 16;
291 nNextCh = GetNextChar();
292 if( nNextCh >= '0' && nNextCh <= '9' )
293 nHexVal += (nNextCh - 48);
294 else if( nNextCh >= 'a' && nNextCh <= 'f' )
295 nHexVal += (nNextCh - 87);
296 else if( nNextCh >= 'A' && nNextCh <= 'F' )
297 nHexVal += (nNextCh - 55);
299 return nHexVal;
302 void SvRTFParser::ScanText()
304 const sal_Unicode cBreak = 0;
305 OUStringBuffer aStrBuffer;
306 bool bContinue = true;
307 while( bContinue && IsParserWorking() && aStrBuffer.getLength() < MAX_STRING_LEN)
309 bool bNextCh = true;
310 switch( nNextCh )
312 case '\\':
314 nNextCh = GetNextChar();
315 switch (nNextCh)
317 case '\'':
320 OStringBuffer aByteString;
321 while (true)
323 char c = static_cast<char>(GetHexValue());
325 * Note: \'00 is a valid internal character in a
326 * string in RTF. OStringBuffer supports
327 * appending nulls fine
329 aByteString.append(c);
331 bool bBreak = false;
332 bool bEOF = false;
333 char nSlash = '\\';
334 while (!bBreak)
336 auto next = GetNextChar();
337 if (sal_Unicode(EOF) == next)
339 bEOF = true;
340 break;
342 if (next>0xFF) // fix for #i43933# and #i35653#
344 if (!aByteString.isEmpty())
346 aStrBuffer.append( OStringToOUString(aByteString, GetSrcEncoding()) );
347 aByteString.setLength(0);
349 aStrBuffer.append(static_cast<sal_Unicode>(next));
351 continue;
353 nSlash = static_cast<char>(next);
354 while (nSlash == 0xD || nSlash == 0xA)
355 nSlash = static_cast<char>(GetNextChar());
357 switch (nSlash)
359 case '{':
360 case '}':
361 case '\\':
362 bBreak = true;
363 break;
364 default:
365 aByteString.append(nSlash);
366 break;
370 if (bEOF)
372 bContinue = false; // abort, string together
373 break;
376 nNextCh = GetNextChar();
378 if (nSlash != '\\' || nNextCh != '\'')
380 rInput.SeekRel(-1);
381 nNextCh = static_cast<unsigned char>(nSlash);
382 break;
386 bNextCh = false;
388 if (!aByteString.isEmpty())
390 aStrBuffer.append( OStringToOUString(aByteString, GetSrcEncoding()) );
391 aByteString.setLength(0);
394 break;
395 case '\\':
396 case '}':
397 case '{':
398 case '+': // I found in a RTF file
399 aStrBuffer.append(sal_Unicode(nNextCh));
400 break;
401 case '~': // nonbreaking space
402 aStrBuffer.append(u'\x00A0');
403 break;
404 case '-': // optional hyphen
405 aStrBuffer.append(u'\x00AD');
406 break;
407 case '_': // nonbreaking hyphen
408 aStrBuffer.append(u'\x2011');
409 break;
411 case 'u':
412 // read UNI-Code characters
414 nNextCh = GetNextChar();
415 rInput.SeekRel( -2 );
417 if( '-' == nNextCh || RTF_ISDIGIT( nNextCh ) )
419 bRTF_InTextRead = true;
421 OUString sSave( aToken ); // GetNextToken_() overwrites this
422 nNextCh = '\\';
423 int nToken = GetNextToken_();
424 DBG_ASSERT( RTF_U == nToken, "still not a UNI-Code character" );
425 // don't convert symbol chars
426 aStrBuffer.append(static_cast< sal_Unicode >(nTokenValue));
428 // overread the next n "RTF" characters. This
429 // can be also \{, \}, \'88
430 for( sal_uInt8 m = 0; m < nUCharOverread; ++m )
432 sal_Unicode cAnsi = nNextCh;
433 while( 0xD == cAnsi )
434 cAnsi = GetNextChar();
435 while( 0xA == cAnsi )
436 cAnsi = GetNextChar();
438 if( '\\' == cAnsi &&
439 '\'' == GetNextChar() )
440 // skip HexValue
441 GetHexValue();
442 nNextCh = GetNextChar();
444 bNextCh = false;
445 aToken = sSave;
446 bRTF_InTextRead = false;
448 else if ( 'c' == nNextCh )
450 // Prevent text breaking into multiple tokens.
451 rInput.SeekRel( 2 );
452 nNextCh = GetNextChar();
453 if (RTF_ISDIGIT( nNextCh ))
455 sal_uInt8 nNewOverread = 0 ;
456 do {
457 nNewOverread *= 10;
458 nNewOverread += nNextCh - '0';
459 nNextCh = GetNextChar();
460 } while ( RTF_ISDIGIT( nNextCh ) );
461 nUCharOverread = nNewOverread;
462 if (!aParserStates.empty())
463 aParserStates.top().nUCharOverread = nNewOverread;
465 bNextCh = 0x20 == nNextCh;
467 else
469 nNextCh = '\\';
470 bContinue = false; // abort, string together
473 break;
475 default:
476 rInput.SeekRel( -1 );
477 nNextCh = '\\';
478 bContinue = false; // abort, string together
479 break;
482 break;
484 case sal_Unicode(EOF):
485 eState = SvParserState::Error;
486 [[fallthrough]];
487 case '{':
488 case '}':
489 bContinue = false;
490 break;
492 case 0x0a:
493 case 0x0d:
494 break;
496 default:
497 if( nNextCh == cBreak || aStrBuffer.getLength() >= MAX_STRING_LEN)
498 bContinue = false;
499 else
501 do {
502 // all other characters end up in the text
503 aStrBuffer.appendUtf32(nNextCh);
505 if (sal_Unicode(EOF) == (nNextCh = GetNextChar()))
507 if (!aStrBuffer.isEmpty())
508 aToken.append( aStrBuffer );
509 return;
511 } while
513 (RTF_ISALPHA(nNextCh) || RTF_ISDIGIT(nNextCh)) &&
514 (aStrBuffer.getLength() < MAX_STRING_LEN)
516 bNextCh = false;
520 if( bContinue && bNextCh )
521 nNextCh = GetNextChar();
524 if (!aStrBuffer.isEmpty())
525 aToken.append( aStrBuffer );
529 short SvRTFParser::_inSkipGroup=0;
531 void SvRTFParser::SkipGroup()
533 short nBrackets=1;
534 if (_inSkipGroup>0)
535 return;
536 _inSkipGroup++;
537 //#i16185# faking \bin keyword
540 switch (nNextCh)
542 case '{':
543 ++nBrackets;
544 break;
545 case '}':
546 if (!--nBrackets) {
547 _inSkipGroup--;
548 return;
550 break;
552 int nToken = GetNextToken_();
553 if (nToken == RTF_BIN)
555 rInput.SeekRel(-1);
556 SAL_WARN_IF(nTokenValue < 0, "svtools", "negative value argument for rtf \\bin keyword");
557 if (nTokenValue > 0)
558 rInput.SeekRel(nTokenValue);
559 nNextCh = GetNextChar();
561 while (nNextCh==0xa || nNextCh==0xd)
563 nNextCh = GetNextChar();
565 } while (sal_Unicode(EOF) != nNextCh && IsParserWorking());
567 if( SvParserState::Pending != eState && '}' != nNextCh )
568 eState = SvParserState::Error;
569 _inSkipGroup--;
572 void SvRTFParser::ReadUnknownData() { SkipGroup(); }
573 void SvRTFParser::ReadBitmapData() { SkipGroup(); }
576 SvParserState SvRTFParser::CallParser()
578 char cFirstCh(0);
579 nNextChPos = rInput.Tell();
580 rInput.ReadChar( cFirstCh );
581 nNextCh = static_cast<unsigned char>(cFirstCh);
582 eState = SvParserState::Working;
583 nOpenBrackets = 0;
584 eCodeSet = RTL_TEXTENCODING_MS_1252;
585 SetSrcEncoding( eCodeSet );
587 // the first two tokens should be '{' and \\rtf !!
588 if( '{' == GetNextToken() && RTF_RTF == GetNextToken() )
590 AddFirstRef();
591 // call ReleaseRef at end of this scope, even in the face of exceptions
592 comphelper::ScopeGuard g([this] {
593 if( SvParserState::Pending != eState )
594 ReleaseRef(); // now parser is not needed anymore
596 Continue( 0 );
598 else
599 eState = SvParserState::Error;
601 return eState;
604 void SvRTFParser::Continue( int nToken )
606 // DBG_ASSERT( SVPAR_CS_DONTKNOW == GetCharSet(),
607 // "Characterset was changed." );
609 if( !nToken )
610 nToken = GetNextToken();
612 bool bLooping = false;
614 while (IsParserWorking() && !bLooping)
616 auto nCurrentTokenIndex = m_nTokenIndex;
617 auto nCurrentToken = nToken;
619 SaveState( nToken );
620 switch( nToken )
622 case '}':
623 if( nOpenBrackets )
624 goto NEXTTOKEN;
625 eState = SvParserState::Accepted;
626 break;
628 case '{':
629 // an unknown group ?
631 if( RTF_IGNOREFLAG != GetNextToken() )
632 nToken = SkipToken();
633 else if( RTF_UNKNOWNCONTROL != GetNextToken() )
634 nToken = SkipToken( -2 );
635 else
637 // filter immediately
638 ReadUnknownData();
639 nToken = GetNextToken();
640 if( '}' != nToken )
641 eState = SvParserState::Error;
642 break; // move to next token!!
645 goto NEXTTOKEN;
647 case RTF_UNKNOWNCONTROL:
648 break; // skip unknown token
649 case RTF_NEXTTYPE:
650 case RTF_ANSITYPE:
651 eCodeSet = RTL_TEXTENCODING_MS_1252;
652 SetSrcEncoding( eCodeSet );
653 break;
654 case RTF_MACTYPE:
655 eCodeSet = RTL_TEXTENCODING_APPLE_ROMAN;
656 SetSrcEncoding( eCodeSet );
657 break;
658 case RTF_PCTYPE:
659 eCodeSet = RTL_TEXTENCODING_IBM_437;
660 SetSrcEncoding( eCodeSet );
661 break;
662 case RTF_PCATYPE:
663 eCodeSet = RTL_TEXTENCODING_IBM_850;
664 SetSrcEncoding( eCodeSet );
665 break;
666 case RTF_ANSICPG:
667 eCodeSet = rtl_getTextEncodingFromWindowsCodePage(nTokenValue);
668 SetSrcEncoding(eCodeSet);
669 break;
670 default:
671 NEXTTOKEN:
672 NextToken( nToken );
673 break;
675 if( IsParserWorking() )
676 SaveState( 0 ); // processed till here,
677 // continue with new token!
678 nToken = GetNextToken();
679 bLooping = nCurrentTokenIndex == m_nTokenIndex && nToken == nCurrentToken;
681 if( SvParserState::Accepted == eState && 0 < nOpenBrackets )
682 eState = SvParserState::Error;
685 void SvRTFParser::SetEncoding( rtl_TextEncoding eEnc )
687 if (eEnc == RTL_TEXTENCODING_DONTKNOW)
688 eEnc = GetCodeSet();
690 if (!aParserStates.empty())
691 aParserStates.top().eCodeSet = eEnc;
692 SetSrcEncoding(eEnc);
/* vim:set shiftwidth=4 softtabstop=4 expandtab: */