tdf#130857 qt weld: Implement QtInstanceWidget::strip_mnemonic
[LibreOffice.git] / svtools / source / svrtf / parrtf.cxx
blob82d69f7881ac24441f0238400f552b62ab145c2e
1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 /*
3 * This file is part of the LibreOffice project.
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 * This file incorporates work covered by the following license notice:
11 * Licensed to the Apache Software Foundation (ASF) under one or more
12 * contributor license agreements. See the NOTICE file distributed
13 * with this work for additional information regarding copyright
14 * ownership. The ASF licenses this file to you under the Apache
15 * License, Version 2.0 (the "License"); you may not use this file
16 * except in compliance with the License. You may obtain a copy of
17 * the License at http://www.apache.org/licenses/LICENSE-2.0 .
20 #include <sal/config.h>
21 #include <sal/log.hxx>
23 #include <comphelper/scopeguard.hxx>
25 #include <rtl/character.hxx>
26 #include <rtl/strbuf.hxx>
27 #include <rtl/tencinfo.h>
28 #include <rtl/ustrbuf.hxx>
29 #include <tools/stream.hxx>
30 #include <tools/debug.hxx>
31 #include <svtools/rtftoken.h>
32 #include <svtools/parrtf.hxx>
// Length limit checked by SvRTFParser::ScanText() against its text buffer.
constexpr int MAX_STRING_LEN = 1024;

// ASCII-only character classification used while tokenizing.
#define RTF_ISDIGIT( c ) rtl::isAsciiDigit(c)
#define RTF_ISALPHA( c ) rtl::isAsciiAlpha(c)
39 SvRTFParser::SvRTFParser( SvStream& rIn, sal_uInt8 nStackSize )
40 : SvParser<int>( rIn, nStackSize )
41 , nOpenBrackets(0)
42 , nUPRLevel(0)
43 , eCodeSet(RTL_TEXTENCODING_MS_1252)
44 , nUCharOverread(1)
46 // default is ANSI-CodeSet
47 SetSrcEncoding( RTL_TEXTENCODING_MS_1252 );
48 bRTF_InTextRead = false;
51 SvRTFParser::~SvRTFParser()
// Reads the next RTF token, starting at the current character nNextCh.
//
// Returns the token id: the brace character for group start and end, the EOF
// character at end of input, RTF_TEXTTOKEN for a run of text (collected into
// aToken by ScanText), a fixed id for the control symbols handled below, or
// for a control word the id found by GetRTFToken (RTF_UNKNOWNCONTROL if that
// lookup yields 0).
//
// Besides nNextCh and aToken this may update nTokenValue, bTokenHasValue,
// nOpenBrackets, aParserStates, nUCharOverread, nUPRLevel and eState.
56 int SvRTFParser::GetNextToken_()
58 int nRet = 0;
// Repeat until a token id was produced or the parser left the Working state.
59 do {
// bNextCh: fetch a fresh character at the end of this pass (see the bottom of the loop).
60 bool bNextCh = true;
61 switch( nNextCh )
63 case '\\':
65 // control characters
66 nNextCh = GetNextChar();
67 switch( nNextCh )
// Escaped characters and hex values are text: make the backslash the current
// character again, step the stream back by one and let ScanText collect the run.
69 case '{':
70 case '}':
71 case '\\':
72 case '+': // I found it in a RTF-file
73 case '~': // nonbreaking space
74 case '-': // optional hyphen
75 case '_': // nonbreaking hyphen
76 case '\'': // HexValue
77 nNextCh = '\\';
78 rInput.SeekRel( -1 );
79 ScanText();
80 nRet = RTF_TEXTTOKEN;
// Only read a new character if ScanText left nNextCh at 0.
81 bNextCh = 0 == nNextCh;
82 break;
84 case '*': // ignoreflag
85 nRet = RTF_IGNOREFLAG;
86 break;
87 case ':': // subentry in an index entry
88 nRet = RTF_SUBENTRYINDEX;
89 break;
90 case '|': // formula-character
91 nRet = RTF_FORMULA;
92 break;
// A backslash directly followed by LF or CR is reported as RTF_PAR.
94 case 0x0a:
95 case 0x0d:
96 nRet = RTF_PAR;
97 break;
99 default:
// Control word: a backslash followed by ASCII letters, collected into aToken.
100 if( RTF_ISALPHA( nNextCh ) )
102 aToken = "\\";
104 do {
105 aToken.appendUtf32(nNextCh);
106 nNextCh = GetNextChar();
107 } while( RTF_ISALPHA( nNextCh ) );
110 // minus before numeric parameters
111 bool bNegValue = false;
112 if( '-' == nNextCh )
114 bNegValue = true;
115 nNextCh = GetNextChar();
118 // possible numeric parameter
119 if( RTF_ISDIGIT( nNextCh ) )
// Collect the decimal digits, convert them with toInt32 and apply the sign.
121 OUStringBuffer aNumber;
122 do {
123 aNumber.append(static_cast<sal_Unicode>(nNextCh));
124 nNextCh = GetNextChar();
125 } while( RTF_ISDIGIT( nNextCh ) );
126 nTokenValue = OUString::unacquired(aNumber).toInt32();
127 if( bNegValue )
128 nTokenValue = -nTokenValue;
129 bTokenHasValue=true;
// A minus that is not followed by a digit is not part of the control word:
// make it the current character again and step the stream back by one.
131 else if( bNegValue ) // restore minus
133 nNextCh = '-';
134 rInput.SeekRel( -1 );
136 if( ' ' == nNextCh ) // blank is part of token!
137 nNextCh = GetNextChar();
139 // search for the token in the table:
140 if( 0 == (nRet = GetRTFToken( aToken )) )
141 // Unknown Control
142 nRet = RTF_UNKNOWNCONTROL;
144 // bug 76812 - unicode token handled as normal text
// nNextCh already holds the character following the control word.
145 bNextCh = false;
146 switch( nRet )
// RTF_UC: a non-negative value becomes the new nUCharOverread, also stored in
// the topmost parser state; the token itself is swallowed (nRet = 0).
// NOTE(review): the value is narrowed by the cast to sal_uInt8 - confirm that
// larger values cannot occur or are acceptable.
148 case RTF_UC:
149 if( 0 <= nTokenValue )
151 nUCharOverread = static_cast<sal_uInt8>(nTokenValue);
152 if (!aParserStates.empty())
154 //cmc: other ifdef breaks #i3584
155 aParserStates.top().nUCharOverread = nUCharOverread;
158 aToken.setLength( 0 ); // #i47831# erase token to prevent the token from being treated as text
159 // read next token
160 nRet = 0;
161 break;
// RTF_UPR: unless a SkipGroup call is active, read tokens up to the next group
// start, skip that group and swallow the token (nRet = 0). nUPRLevel bounds the
// recursion through GetNextToken_; exceeding the limit puts the parser into the
// Error state.
// NOTE(review): the warning text below spells the keyword as urp.
163 case RTF_UPR:
164 if (!_inSkipGroup)
166 if (nUPRLevel > 256) // fairly sure > 1 is probably an error, but provide some leeway
168 SAL_WARN("svtools", "urp stack too deep");
169 eState = SvParserState::Error;
170 break;
173 ++nUPRLevel;
175 // UPR - overread the group with the ansi
176 // information
177 int nNextToken;
180 nNextToken = GetNextToken_();
182 while (nNextToken != '{' && nNextToken != sal_Unicode(EOF) && IsParserWorking());
184 SkipGroup();
185 GetNextToken_(); // overread the last bracket
186 nRet = 0;
188 --nUPRLevel;
190 break;
// RTF_U: when not called from within ScanText (bRTF_InTextRead is false) the
// character given by nTokenValue starts a text token; otherwise nRet stays as
// looked up and the caller deals with the value.
192 case RTF_U:
193 if( !bRTF_InTextRead )
195 nRet = RTF_TEXTTOKEN;
196 aToken = OUStringChar( static_cast<sal_Unicode>(nTokenValue) );
198 // overread the next n "RTF" characters. This
199 // can be also \{, \}, \'88
200 for( sal_uInt8 m = 0; m < nUCharOverread; ++m )
// CR and LF are passed over before looking at the character to be skipped.
202 sal_uInt32 cAnsi = nNextCh;
203 while( 0xD == cAnsi )
204 cAnsi = GetNextChar();
205 while( 0xA == cAnsi )
206 cAnsi = GetNextChar();
208 if( '\\' == cAnsi &&
209 '\'' == GetNextChar() )
210 // skip HexValue
211 GetHexValue();
212 nNextCh = GetNextChar();
// Continue with the text that follows; it is appended to aToken by ScanText.
214 ScanText();
215 bNextCh = 0 == nNextCh;
217 break;
// Backslash followed by something that is neither a handled symbol nor a letter.
220 else if( SvParserState::Pending != eState )
222 // Bug 34631 - "\ " read on - Blank as character
223 // eState = SvParserState::Error;
224 bNextCh = false;
226 break;
229 break;
// End of input: accept and return the EOF character as token.
231 case sal_Unicode(EOF):
232 eState = SvParserState::Accepted;
233 nRet = nNextCh;
234 break;
// Group start: remember the current overread count and source encoding on the
// state stack (only while the bracket count is not negative), then count the bracket.
236 case '{':
238 if( 0 <= nOpenBrackets )
240 RtfParserState_Impl aState( nUCharOverread, GetSrcEncoding() );
241 aParserStates.push( aState );
243 ++nOpenBrackets;
244 DBG_ASSERT(
245 static_cast<size_t>(nOpenBrackets) == aParserStates.size(),
246 "ParserStateStack unequal to bracket count" );
247 nRet = nNextCh;
249 break;
// Group end: drop the state of the closed group and restore overread count and
// encoding from the enclosing state, or the defaults if the stack is now empty.
251 case '}':
252 --nOpenBrackets;
253 if( 0 <= nOpenBrackets )
255 aParserStates.pop();
256 if( !aParserStates.empty() )
258 const RtfParserState_Impl& rRPS =
259 aParserStates.top();
260 nUCharOverread = rRPS.nUCharOverread;
261 SetSrcEncoding( rRPS.eCodeSet );
263 else
265 nUCharOverread = 1;
266 SetSrcEncoding( GetCodeSet() );
269 DBG_ASSERT(
270 static_cast<size_t>(nOpenBrackets) == aParserStates.size(),
271 "ParserStateStack unequal to bracket count" );
272 nRet = nNextCh;
273 break;
// Bare CR and LF produce no token; the loop just moves on to the next character.
275 case 0x0d:
276 case 0x0a:
277 break;
279 default:
280 // now normal text follows
281 ScanText();
282 nRet = RTF_TEXTTOKEN;
283 bNextCh = 0 == nNextCh;
284 break;
287 if( bNextCh )
288 nNextCh = GetNextChar();
290 } while( !nRet && SvParserState::Working == eState );
291 return nRet;
295 sal_Unicode SvRTFParser::GetHexValue()
297 // collect Hex values
298 int n;
299 sal_Unicode nHexVal = 0;
301 for( n = 0; n < 2; ++n )
303 nHexVal *= 16;
304 nNextCh = GetNextChar();
305 if( nNextCh >= '0' && nNextCh <= '9' )
306 nHexVal += (nNextCh - 48);
307 else if( nNextCh >= 'a' && nNextCh <= 'f' )
308 nHexVal += (nNextCh - 87);
309 else if( nNextCh >= 'A' && nNextCh <= 'F' )
310 nHexVal += (nNextCh - 55);
312 return nHexVal;
// Collects a run of text starting at the current character nNextCh and appends
// it to aToken.
//
// The text is gathered in a local buffer. Scanning ends at a group brace, at
// end of input (which also sets eState to Error), at a character equal to
// cBreak (0), at a backslash sequence that is not handled here, when the parser
// is no longer working, or once the buffer has reached MAX_STRING_LEN.
// On return nNextCh holds the character at which scanning stopped.
315 void SvRTFParser::ScanText()
317 const sal_Unicode cBreak = 0;
318 OUStringBuffer aStrBuffer;
319 bool bContinue = true;
320 while( bContinue && IsParserWorking() && aStrBuffer.getLength() < MAX_STRING_LEN)
// bNextCh: fetch a fresh character at the end of this pass.
322 bool bNextCh = true;
323 switch( nNextCh )
325 case '\\':
327 nNextCh = GetNextChar();
328 switch (nNextCh)
// Hex escape: the bytes of consecutive hex escapes and of the plain characters
// between them are gathered in aByteString and converted to Unicode in one go
// with the current source encoding.
330 case '\'':
333 OStringBuffer aByteString;
334 while (true)
336 char c = static_cast<char>(GetHexValue());
338 * Note: \'00 is a valid internal character in a
339 * string in RTF. OStringBuffer supports
340 * appending nulls fine
342 aByteString.append(c);
// Read on until a brace or a backslash shows up (or the input ends).
344 bool bBreak = false;
345 bool bEOF = false;
346 char nSlash = '\\';
347 while (!bBreak)
349 auto next = GetNextChar();
350 if (sal_Unicode(EOF) == next)
352 bEOF = true;
353 break;
// A character above 0xFF does not fit into the byte string: flush the bytes
// gathered so far and append the character to the text buffer directly.
355 if (next>0xFF) // fix for #i43933# and #i35653#
357 if (!aByteString.isEmpty())
359 aStrBuffer.append( OStringToOUString(aByteString, GetSrcEncoding()) );
360 aByteString.setLength(0);
362 aStrBuffer.append(static_cast<sal_Unicode>(next));
364 continue;
// CR and LF are passed over.
366 nSlash = static_cast<char>(next);
367 while (nSlash == 0xD || nSlash == 0xA)
368 nSlash = static_cast<char>(GetNextChar());
370 switch (nSlash)
372 case '{':
373 case '}':
374 case '\\':
375 bBreak = true;
376 break;
377 default:
378 aByteString.append(nSlash);
379 break;
383 if (bEOF)
385 bContinue = false; // abort, string together
386 break;
// Look one character ahead: only another hex escape keeps the byte run going.
// Otherwise step the stream back by one and make the terminating character
// the current one again.
389 nNextCh = GetNextChar();
391 if (nSlash != '\\' || nNextCh != '\'')
393 rInput.SeekRel(-1);
394 nNextCh = static_cast<unsigned char>(nSlash);
395 break;
// nNextCh was already set above, so no fresh character is fetched.
399 bNextCh = false;
401 if (!aByteString.isEmpty())
403 aStrBuffer.append( OStringToOUString(aByteString, GetSrcEncoding()) );
404 aByteString.setLength(0);
407 break;
// Escaped literal characters are taken over as they are.
408 case '\\':
409 case '}':
410 case '{':
411 case '+': // I found in a RTF file
412 aStrBuffer.append(sal_Unicode(nNextCh));
413 break;
414 case '~': // nonbreaking space
415 aStrBuffer.append(u'\x00A0');
416 break;
417 case '-': // optional hyphen
418 aStrBuffer.append(u'\x00AD');
419 break;
420 case '_': // nonbreaking hyphen
421 aStrBuffer.append(u'\x2011');
422 break;
424 case 'u':
425 // read UNI-Code characters
// Peek at the character after the u, then step the stream back by two.
427 nNextCh = GetNextChar();
428 rInput.SeekRel( -2 );
// u followed by a minus or a digit: let GetNextToken_ parse the control word
// (bRTF_InTextRead tells it not to build a text token of its own) and append
// the character given by nTokenValue to the text buffer.
430 if( '-' == nNextCh || RTF_ISDIGIT( nNextCh ) )
432 bRTF_InTextRead = true;
434 OUString sSave( aToken ); // GetNextToken_() overwrites this
435 nNextCh = '\\';
436 int nToken = GetNextToken_();
437 DBG_ASSERT( RTF_U == nToken, "still not a UNI-Code character" );
438 // don't convert symbol chars
439 aStrBuffer.append(static_cast< sal_Unicode >(nTokenValue));
441 // overread the next n "RTF" characters. This
442 // can be also \{, \}, \'88
443 for( sal_uInt8 m = 0; m < nUCharOverread; ++m )
445 sal_Unicode cAnsi = nNextCh;
446 while( 0xD == cAnsi )
447 cAnsi = GetNextChar();
448 while( 0xA == cAnsi )
449 cAnsi = GetNextChar();
451 if( '\\' == cAnsi &&
452 '\'' == GetNextChar() )
453 // skip HexValue
454 GetHexValue();
455 nNextCh = GetNextChar();
457 bNextCh = false;
458 aToken = sSave;
459 bRTF_InTextRead = false;
// u followed by c: the digits that follow give a new overread count, handled
// here instead of ending the text run. The count is stored in nUCharOverread
// and in the topmost parser state.
// NOTE(review): the digits are accumulated in a sal_uInt8 - confirm that long
// digit sequences cannot occur or that wrapping is acceptable.
461 else if ( 'c' == nNextCh )
463 // Prevent text breaking into multiple tokens.
464 rInput.SeekRel( 2 );
465 nNextCh = GetNextChar();
466 if (RTF_ISDIGIT( nNextCh ))
468 sal_uInt8 nNewOverread = 0 ;
469 do {
470 nNewOverread *= 10;
471 nNewOverread += nNextCh - '0';
472 nNextCh = GetNextChar();
473 } while ( RTF_ISDIGIT( nNextCh ) );
474 nUCharOverread = nNewOverread;
475 if (!aParserStates.empty())
476 aParserStates.top().nUCharOverread = nNewOverread;
// A blank terminating the control word is consumed; any other character is kept.
478 bNextCh = 0x20 == nNextCh;
// Any other control word starting with u ends the text run.
480 else
482 nNextCh = '\\';
483 bContinue = false; // abort, string together
486 break;
// Any other backslash sequence ends the text run: step back by one and make
// the backslash the current character again.
488 default:
489 rInput.SeekRel( -1 );
490 nNextCh = '\\';
491 bContinue = false; // abort, string together
492 break;
495 break;
497 case sal_Unicode(EOF):
498 eState = SvParserState::Error;
499 [[fallthrough]];
500 case '{':
501 case '}':
502 bContinue = false;
503 break;
// CR and LF are not part of the text.
505 case 0x0a:
506 case 0x0d:
507 break;
509 default:
510 if( nNextCh == cBreak || aStrBuffer.getLength() >= MAX_STRING_LEN)
511 bContinue = false;
512 else
// Append the current character, then keep appending while ASCII letters or
// digits follow and the buffer is below the limit. At end of input the
// collected text is handed over to aToken and the function returns at once.
514 do {
515 // all other characters end up in the text
516 aStrBuffer.appendUtf32(nNextCh);
518 if (sal_Unicode(EOF) == (nNextCh = GetNextChar()))
520 if (!aStrBuffer.isEmpty())
521 aToken.append( aStrBuffer );
522 return;
524 } while
526 (RTF_ISALPHA(nNextCh) || RTF_ISDIGIT(nNextCh)) &&
527 (aStrBuffer.getLength() < MAX_STRING_LEN)
529 bNextCh = false;
533 if( bContinue && bNextCh )
534 nNextCh = GetNextChar();
// Hand the collected text over to the token buffer.
537 if (!aStrBuffer.isEmpty())
538 aToken.append( aStrBuffer );
542 short SvRTFParser::_inSkipGroup=0;
544 void SvRTFParser::SkipGroup()
546 short nBrackets=1;
547 if (_inSkipGroup>0)
548 return;
549 _inSkipGroup++;
550 //#i16185# faking \bin keyword
553 switch (nNextCh)
555 case '{':
556 ++nBrackets;
557 break;
558 case '}':
559 if (!--nBrackets) {
560 _inSkipGroup--;
561 return;
563 break;
565 int nToken = GetNextToken_();
566 if (nToken == RTF_BIN)
568 rInput.SeekRel(-1);
569 SAL_WARN_IF(nTokenValue < 0, "svtools", "negative value argument for rtf \\bin keyword");
570 if (nTokenValue > 0)
571 rInput.SeekRel(nTokenValue);
572 nNextCh = GetNextChar();
574 while (nNextCh==0xa || nNextCh==0xd)
576 nNextCh = GetNextChar();
578 } while (sal_Unicode(EOF) != nNextCh && IsParserWorking());
580 if( SvParserState::Pending != eState && '}' != nNextCh )
581 eState = SvParserState::Error;
582 _inSkipGroup--;
585 void SvRTFParser::ReadUnknownData() { SkipGroup(); }
586 void SvRTFParser::ReadBitmapData() { SkipGroup(); }
589 SvParserState SvRTFParser::CallParser()
591 char cFirstCh(0);
592 nNextChPos = rInput.Tell();
593 rInput.ReadChar( cFirstCh );
594 nNextCh = static_cast<unsigned char>(cFirstCh);
595 eState = SvParserState::Working;
596 nOpenBrackets = 0;
597 eCodeSet = RTL_TEXTENCODING_MS_1252;
598 SetSrcEncoding( eCodeSet );
600 // the first two tokens should be '{' and \\rtf !!
601 if( '{' == GetNextToken() && RTF_RTF == GetNextToken() )
603 AddFirstRef();
604 // call ReleaseRef at end of this scope, even in the face of exceptions
605 comphelper::ScopeGuard g([this] {
606 if( SvParserState::Pending != eState )
607 ReleaseRef(); // now parser is not needed anymore
609 Continue( 0 );
611 else
612 eState = SvParserState::Error;
614 return eState;
// Main parse loop: processes tokens until the parser stops working.
//
// nToken is the token to start with; if it is 0 the first token is read from
// the input. Group braces, unknown groups and the character set control words
// are handled here; every other token is passed to NextToken().
//
// If the loop ends in the Accepted state while brackets are still open, the
// state is changed to Error.
617 void SvRTFParser::Continue( int nToken )
619 // DBG_ASSERT( SVPAR_CS_DONTKNOW == GetCharSet(),
620 // "Characterset was changed." );
622 if( !nToken )
623 nToken = GetNextToken();
// bLooping is set when a pass ends with the same token and the same token
// index it started with; the loop is left in that case.
625 bool bLooping = false;
627 while (IsParserWorking() && !bLooping)
629 auto nCurrentTokenIndex = m_nTokenIndex;
630 auto nCurrentToken = nToken;
632 SaveState( nToken );
633 switch( nToken )
// A closing brace is passed on to NextToken() while brackets are still open;
// otherwise it ends the document and the parser is Accepted.
635 case '}':
636 if( nOpenBrackets )
637 goto NEXTTOKEN;
638 eState = SvParserState::Accepted;
639 break;
641 case '{':
642 // an unknown group ?
// Look ahead: only a group starting with the ignore flag followed by an
// unknown control word is skipped here. In the other cases the tokens read
// ahead are handed back via SkipToken() and the token is passed to NextToken().
644 if( RTF_IGNOREFLAG != GetNextToken() )
645 nToken = SkipToken();
646 else if( RTF_UNKNOWNCONTROL != GetNextToken() )
647 nToken = SkipToken( -2 );
648 else
650 // filter immediately
651 ReadUnknownData();
652 nToken = GetNextToken();
653 if( '}' != nToken )
654 eState = SvParserState::Error;
655 break; // move to next token!!
658 goto NEXTTOKEN;
660 case RTF_UNKNOWNCONTROL:
661 break; // skip unknown token
// Character set control words select the code set used as source encoding.
662 case RTF_NEXTTYPE:
663 case RTF_ANSITYPE:
664 eCodeSet = RTL_TEXTENCODING_MS_1252;
665 SetSrcEncoding( eCodeSet );
666 break;
667 case RTF_MACTYPE:
668 eCodeSet = RTL_TEXTENCODING_APPLE_ROMAN;
669 SetSrcEncoding( eCodeSet );
670 break;
671 case RTF_PCTYPE:
672 eCodeSet = RTL_TEXTENCODING_IBM_437;
673 SetSrcEncoding( eCodeSet );
674 break;
675 case RTF_PCATYPE:
676 eCodeSet = RTL_TEXTENCODING_IBM_850;
677 SetSrcEncoding( eCodeSet );
678 break;
// The token value is taken as a Windows code page number.
679 case RTF_ANSICPG:
680 eCodeSet = rtl_getTextEncodingFromWindowsCodePage(nTokenValue);
681 SetSrcEncoding(eCodeSet);
682 break;
// Everything else, and the brace cases that jump here, goes to NextToken().
683 default:
684 NEXTTOKEN:
685 NextToken( nToken );
686 break;
688 if( IsParserWorking() )
689 SaveState( 0 ); // processed till here,
690 // continue with new token!
691 nToken = GetNextToken();
692 bLooping = nCurrentTokenIndex == m_nTokenIndex && nToken == nCurrentToken;
// Accepted with open brackets left means the input ended inside a group.
694 if( SvParserState::Accepted == eState && 0 < nOpenBrackets )
695 eState = SvParserState::Error;
698 void SvRTFParser::SetEncoding( rtl_TextEncoding eEnc )
700 if (eEnc == RTL_TEXTENCODING_DONTKNOW)
701 eEnc = GetCodeSet();
703 if (!aParserStates.empty())
704 aParserStates.top().eCodeSet = eEnc;
705 SetSrcEncoding(eEnc);
708 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */