update emoji autocorrect entries from po-files
[LibreOffice.git] / svtools / source / svrtf / parrtf.cxx
blobe7be520573c09a2667e62d0a340fa9a91586bff2
/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
/*
 * This file is part of the LibreOffice project.
 *
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
 *
 * This file incorporates work covered by the following license notice:
 *
 *   Licensed to the Apache Software Foundation (ASF) under one or more
 *   contributor license agreements. See the NOTICE file distributed
 *   with this work for additional information regarding copyright
 *   ownership. The ASF licenses this file to you under the Apache
 *   License, Version 2.0 (the "License"); you may not use this file
 *   except in compliance with the License. You may obtain a copy of
 *   the License at http://www.apache.org/licenses/LICENSE-2.0 .
 */
21 #include <rtl/tencinfo.h>
22 #include <tools/stream.hxx>
23 #include <tools/debug.hxx>
24 #include <svtools/rtftoken.h>
25 #include <svtools/rtfkeywd.hxx>
26 #include <svtools/parrtf.hxx>
27 #include <comphelper/string.hxx>
// Upper bound on the number of characters ScanText() collects before it
// stops and hands the text gathered so far over to aToken.
const int MAX_STRING_LEN = 1024;
// Chunk size used by _GetNextToken() while collecting the letters of a
// control word.
const int MAX_TOKEN_LEN = 128;

// ASCII-only character classification used by the tokenizer; both simply
// forward to the comphelper::string helpers.
#define RTF_ISDIGIT( c ) comphelper::string::isdigitAscii(c)
#define RTF_ISALPHA( c ) comphelper::string::isalphaAscii(c)
35 SvRTFParser::SvRTFParser( SvStream& rIn, sal_uInt8 nStackSize )
36 : SvParser( rIn, nStackSize )
37 , nOpenBrakets(0)
38 , eCodeSet(RTL_TEXTENCODING_MS_1252)
39 , eUNICodeSet(RTL_TEXTENCODING_MS_1252) // default ist ANSI-CodeSet
40 , nUCharOverread(1)
42 // default ist ANSI-CodeSet
43 SetSrcEncoding( RTL_TEXTENCODING_MS_1252 );
44 bRTF_InTextRead = false;
47 SvRTFParser::~SvRTFParser()
54 int SvRTFParser::_GetNextToken()
56 int nRet = 0;
57 do {
58 bool bNextCh = true;
59 switch( nNextCh )
61 case '\\':
63 // control charaters
64 switch( nNextCh = GetNextChar() )
66 case '{':
67 case '}':
68 case '\\':
69 case '+': // I found it in a RTF-file
70 case '~': // nonbreaking space
71 case '-': // optional hyphen
72 case '_': // nonbreaking hyphen
73 case '\'': // HexValue
74 nNextCh = '\\';
75 rInput.SeekRel( -1 );
76 ScanText();
77 nRet = RTF_TEXTTOKEN;
78 bNextCh = 0 == nNextCh;
79 break;
81 case '*': // ignoreflag
82 nRet = RTF_IGNOREFLAG;
83 break;
84 case ':': // subentry in an index entry
85 nRet = RTF_SUBENTRYINDEX;
86 break;
87 case '|': // formula-character
88 nRet = RTF_FORMULA;
89 break;
91 case 0x0a:
92 case 0x0d:
93 nRet = RTF_PAR;
94 break;
96 default:
97 if( RTF_ISALPHA( nNextCh ) )
99 aToken = "\\";
101 OUStringBuffer aStrBuffer;
102 aStrBuffer.setLength( MAX_TOKEN_LEN );
103 sal_Int32 nStrLen = 0;
104 do {
105 aStrBuffer[nStrLen++] = nNextCh;
106 if( MAX_TOKEN_LEN == nStrLen )
108 aToken += aStrBuffer.toString();
109 nStrLen = 0;
111 nNextCh = GetNextChar();
112 } while( RTF_ISALPHA( nNextCh ) );
113 if( nStrLen )
115 aToken += aStrBuffer.makeStringAndClear();
119 // minus before numeric parameters
120 bool bNegValue = false;
121 if( '-' == nNextCh )
123 bNegValue = true;
124 nNextCh = GetNextChar();
127 // possible numeric parameter
128 if( RTF_ISDIGIT( nNextCh ) )
130 nTokenValue = 0;
131 do {
132 nTokenValue *= 10;
133 nTokenValue += nNextCh - '0';
134 nNextCh = GetNextChar();
135 } while( RTF_ISDIGIT( nNextCh ) );
136 if( bNegValue )
137 nTokenValue = -nTokenValue;
138 bTokenHasValue=true;
140 else if( bNegValue ) // restore minus
142 nNextCh = '-';
143 rInput.SeekRel( -1 );
145 if( ' ' == nNextCh ) // blank is part of token!
146 nNextCh = GetNextChar();
148 // search for the token in the table:
149 if( 0 == (nRet = GetRTFToken( aToken )) )
150 // Unknown Control
151 nRet = RTF_UNKNOWNCONTROL;
153 // bug 76812 - unicode token handled as normal text
154 bNextCh = false;
155 switch( nRet )
157 case RTF_UC:
158 if( 0 <= nTokenValue )
160 nUCharOverread = (sal_uInt8)nTokenValue;
161 //cmc: other ifdef breaks #i3584
162 aParserStates.top().
163 nUCharOverread = nUCharOverread;
165 aToken.clear(); // #i47831# erase token to prevent the token from being treated as text
166 // read next token
167 nRet = 0;
168 break;
170 case RTF_UPR:
171 if (!_inSkipGroup) {
172 // UPR - overread the group with the ansi
173 // information
174 while( '{' != _GetNextToken() )
176 SkipGroup();
177 _GetNextToken(); // overread the last bracket
178 nRet = 0;
180 break;
182 case RTF_U:
183 if( !bRTF_InTextRead )
185 nRet = RTF_TEXTTOKEN;
186 aToken = OUString( (sal_Unicode)nTokenValue );
188 // overread the next n "RTF" characters. This
189 // can be also \{, \}, \'88
190 for( sal_uInt8 m = 0; m < nUCharOverread; ++m )
192 sal_Unicode cAnsi = nNextCh;
193 while( 0xD == cAnsi )
194 cAnsi = GetNextChar();
195 while( 0xA == cAnsi )
196 cAnsi = GetNextChar();
198 if( '\\' == cAnsi &&
199 '\'' == ( cAnsi = GetNextChar() ))
200 // read on HexValue
201 cAnsi = GetHexValue();
202 nNextCh = GetNextChar();
204 ScanText();
205 bNextCh = 0 == nNextCh;
207 break;
210 else if( SVPAR_PENDING != eState )
212 // Bug 34631 - "\ " read on - Blank as character
213 // eState = SVPAR_ERROR;
214 bNextCh = false;
216 break;
219 break;
221 case sal_Unicode(EOF):
222 eState = SVPAR_ACCEPTED;
223 nRet = nNextCh;
224 break;
226 case '{':
228 if( 0 <= nOpenBrakets )
230 RtfParserState_Impl aState( nUCharOverread, GetSrcEncoding() );
231 aParserStates.push( aState );
233 ++nOpenBrakets;
234 DBG_ASSERT(
235 static_cast<size_t>(nOpenBrakets) == aParserStates.size(),
236 "ParserStateStack unequal to bracket count" );
237 nRet = nNextCh;
239 break;
241 case '}':
242 --nOpenBrakets;
243 if( 0 <= nOpenBrakets )
245 aParserStates.pop();
246 if( !aParserStates.empty() )
248 const RtfParserState_Impl& rRPS =
249 aParserStates.top();
250 nUCharOverread = rRPS.nUCharOverread;
251 SetSrcEncoding( rRPS.eCodeSet );
253 else
255 nUCharOverread = 1;
256 SetSrcEncoding( GetCodeSet() );
259 DBG_ASSERT(
260 static_cast<size_t>(nOpenBrakets) == aParserStates.size(),
261 "ParserStateStack unequal to bracket count" );
262 nRet = nNextCh;
263 break;
265 case 0x0d:
266 case 0x0a:
267 break;
269 default:
270 // now normal text follows
271 ScanText();
272 nRet = RTF_TEXTTOKEN;
273 bNextCh = 0 == nNextCh;
274 break;
277 if( bNextCh )
278 nNextCh = GetNextChar();
280 } while( !nRet && SVPAR_WORKING == eState );
281 return nRet;
285 sal_Unicode SvRTFParser::GetHexValue()
287 // collect Hex values
288 int n;
289 sal_Unicode nHexVal = 0;
291 for( n = 0; n < 2; ++n )
293 nHexVal *= 16;
294 nNextCh = GetNextChar();
295 if( nNextCh >= '0' && nNextCh <= '9' )
296 nHexVal += (nNextCh - 48);
297 else if( nNextCh >= 'a' && nNextCh <= 'f' )
298 nHexVal += (nNextCh - 87);
299 else if( nNextCh >= 'A' && nNextCh <= 'F' )
300 nHexVal += (nNextCh - 55);
302 return nHexVal;
305 void SvRTFParser::ScanText( const sal_Unicode cBreak )
307 OUStringBuffer aStrBuffer;
308 bool bContinue = true;
309 while( bContinue && IsParserWorking() && aStrBuffer.getLength() < MAX_STRING_LEN)
311 bool bNextCh = true;
312 switch( nNextCh )
314 case '\\':
316 switch (nNextCh = GetNextChar())
318 case '\'':
321 OStringBuffer aByteString;
322 while (true)
324 char c = (char)GetHexValue();
326 * Note: \'00 is a valid internal character in a
327 * string in RTF. OStringBuffer supports
328 * appending nulls fine
330 aByteString.append(c);
332 bool bBreak = false;
333 sal_Char nSlash = '\\';
334 while (!bBreak)
336 wchar_t __next=GetNextChar();
337 if (__next>0xFF) // fix for #i43933# and #i35653#
339 if (!aByteString.isEmpty())
340 aStrBuffer.append( OStringToOUString(aByteString.makeStringAndClear(), GetSrcEncoding()) );
341 aStrBuffer.append((sal_Unicode)__next);
343 continue;
345 nSlash = (sal_Char)__next;
346 while (nSlash == 0xD || nSlash == 0xA)
347 nSlash = (sal_Char)GetNextChar();
349 switch (nSlash)
351 case '{':
352 case '}':
353 case '\\':
354 bBreak = true;
355 break;
356 default:
357 aByteString.append(nSlash);
358 break;
362 nNextCh = GetNextChar();
364 if (nSlash != '\\' || nNextCh != '\'')
366 rInput.SeekRel(-1);
367 nNextCh = nSlash;
368 break;
372 bNextCh = false;
374 if (!aByteString.isEmpty())
375 aStrBuffer.append( OStringToOUString(aByteString.makeStringAndClear(), GetSrcEncoding()) );
377 break;
378 case '\\':
379 case '}':
380 case '{':
381 case '+': // I found in a RTF file
382 aStrBuffer.append(nNextCh);
383 break;
384 case '~': // nonbreaking space
385 aStrBuffer.append(static_cast< sal_Unicode >(0xA0));
386 break;
387 case '-': // optional hyphen
388 aStrBuffer.append(static_cast< sal_Unicode >(0xAD));
389 break;
390 case '_': // nonbreaking hyphen
391 aStrBuffer.append(static_cast< sal_Unicode >(0x2011));
392 break;
394 case 'u':
395 // read UNI-Code characters
397 nNextCh = GetNextChar();
398 rInput.SeekRel( -2 );
400 if( '-' == nNextCh || RTF_ISDIGIT( nNextCh ) )
402 bRTF_InTextRead = true;
404 OUString sSave( aToken );
405 nNextCh = '\\';
406 #ifdef DBG_UTIL
407 int nToken =
408 #endif
409 _GetNextToken();
410 DBG_ASSERT( RTF_U == nToken, "doch kein UNI-Code Zeichen" );
411 // dont convert symbol chars
412 aStrBuffer.append(static_cast< sal_Unicode >(nTokenValue));
414 // overread the next n "RTF" characters. This
415 // can be also \{, \}, \'88
416 for( sal_uInt8 m = 0; m < nUCharOverread; ++m )
418 sal_Unicode cAnsi = nNextCh;
419 while( 0xD == cAnsi )
420 cAnsi = GetNextChar();
421 while( 0xA == cAnsi )
422 cAnsi = GetNextChar();
424 if( '\\' == cAnsi &&
425 '\'' == ( cAnsi = GetNextChar() ))
426 // HexValue ueberlesen
427 cAnsi = GetHexValue();
428 nNextCh = GetNextChar();
430 bNextCh = false;
431 aToken = sSave;
432 bRTF_InTextRead = false;
434 else if ( 'c' == nNextCh )
436 // Prevent text breaking into multiple tokens.
437 rInput.SeekRel( 2 );
438 nNextCh = GetNextChar();
439 if (RTF_ISDIGIT( nNextCh ))
441 sal_uInt8 nNewOverread = 0 ;
442 do {
443 nNewOverread *= 10;
444 nNewOverread += nNextCh - '0';
445 nNextCh = GetNextChar();
446 } while ( RTF_ISDIGIT( nNextCh ) );
447 nUCharOverread = nNewOverread;
448 aParserStates.top().nUCharOverread = nNewOverread;
450 bNextCh = 0x20 == nNextCh;
452 else
454 nNextCh = '\\';
455 bContinue = false; // abort, string together
458 break;
460 default:
461 rInput.SeekRel( -1 );
462 nNextCh = '\\';
463 bContinue = false; // abort, string together
464 break;
467 break;
469 case sal_Unicode(EOF): eState = SVPAR_ERROR;
470 // continue
471 case '{':
472 case '}':
473 bContinue = false;
474 break;
476 case 0x0a:
477 case 0x0d:
478 break;
480 default:
481 if( nNextCh == cBreak || aStrBuffer.getLength() >= MAX_STRING_LEN)
482 bContinue = false;
483 else
485 do {
486 // all other characters end up in the text
487 aStrBuffer.append(nNextCh);
489 if (sal_Unicode(EOF) == (nNextCh = GetNextChar()))
491 if (!aStrBuffer.isEmpty())
492 aToken += aStrBuffer.toString();
493 return;
495 } while
497 (RTF_ISALPHA(nNextCh) || RTF_ISDIGIT(nNextCh)) &&
498 (aStrBuffer.getLength() < MAX_STRING_LEN)
500 bNextCh = false;
504 if( bContinue && bNextCh )
505 nNextCh = GetNextChar();
508 if (!aStrBuffer.isEmpty())
509 aToken += aStrBuffer.makeStringAndClear();
513 short SvRTFParser::_inSkipGroup=0;
515 void SvRTFParser::SkipGroup()
517 short nBrackets=1;
518 if (_inSkipGroup>0)
519 return;
520 _inSkipGroup++;
521 //#i16185# fecking \bin keyword
524 switch (nNextCh)
526 case '{':
527 ++nBrackets;
528 break;
529 case '}':
530 if (!--nBrackets) {
531 _inSkipGroup--;
532 return;
534 break;
536 int nToken = _GetNextToken();
537 if (nToken == RTF_BIN)
539 rInput.SeekRel(-1);
540 rInput.SeekRel(nTokenValue);
541 nNextCh = GetNextChar();
543 while (nNextCh==0xa || nNextCh==0xd)
545 nNextCh = GetNextChar();
547 } while (sal_Unicode(EOF) != nNextCh && IsParserWorking());
549 if( SVPAR_PENDING != eState && '}' != nNextCh )
550 eState = SVPAR_ERROR;
551 _inSkipGroup--;
554 void SvRTFParser::ReadUnknownData() { SkipGroup(); }
555 void SvRTFParser::ReadBitmapData() { SkipGroup(); }
556 void SvRTFParser::ReadOLEData() { SkipGroup(); }
559 SvParserState SvRTFParser::CallParser()
561 sal_Char cFirstCh;
562 nNextChPos = rInput.Tell();
563 rInput.ReadChar( cFirstCh ); nNextCh = cFirstCh;
564 eState = SVPAR_WORKING;
565 nOpenBrakets = 0;
566 SetSrcEncoding( eCodeSet = RTL_TEXTENCODING_MS_1252 );
567 eUNICodeSet = RTL_TEXTENCODING_MS_1252; // default is ANSI-CodeSet
569 // the first two tokens should be '{' and \\rtf !!
570 if( '{' == GetNextToken() && RTF_RTF == GetNextToken() )
572 AddFirstRef();
573 Continue( 0 );
574 if( SVPAR_PENDING != eState )
575 ReleaseRef(); // now parser is not needed anymore
577 else
578 eState = SVPAR_ERROR;
580 return eState;
583 void SvRTFParser::Continue( int nToken )
585 // DBG_ASSERT( SVPAR_CS_DONTKNOW == GetCharSet(),
586 // "Characterset was changed." );
588 if( !nToken )
589 nToken = GetNextToken();
591 while( IsParserWorking() )
593 SaveState( nToken );
594 switch( nToken )
596 case '}':
597 if( nOpenBrakets )
598 goto NEXTTOKEN;
599 eState = SVPAR_ACCEPTED;
600 break;
602 case '{':
603 // a unknown group ?
605 if( RTF_IGNOREFLAG != GetNextToken() )
606 nToken = SkipToken( -1 );
607 else if( RTF_UNKNOWNCONTROL != GetNextToken() )
608 nToken = SkipToken( -2 );
609 else
611 // filter immediately
612 ReadUnknownData();
613 nToken = GetNextToken();
614 if( '}' != nToken )
615 eState = SVPAR_ERROR;
616 break; // move to next token!!
619 goto NEXTTOKEN;
621 case RTF_UNKNOWNCONTROL:
622 break; // skip unknown token
623 case RTF_NEXTTYPE:
624 case RTF_ANSITYPE:
625 SetSrcEncoding( eCodeSet = RTL_TEXTENCODING_MS_1252 );
626 break;
627 case RTF_MACTYPE:
628 SetSrcEncoding( eCodeSet = RTL_TEXTENCODING_APPLE_ROMAN );
629 break;
630 case RTF_PCTYPE:
631 SetSrcEncoding( eCodeSet = RTL_TEXTENCODING_IBM_437 );
632 break;
633 case RTF_PCATYPE:
634 SetSrcEncoding( eCodeSet = RTL_TEXTENCODING_IBM_850 );
635 break;
636 case RTF_ANSICPG:
637 eCodeSet = rtl_getTextEncodingFromWindowsCodePage(nTokenValue);
638 SetSrcEncoding(eCodeSet);
639 break;
640 default:
641 NEXTTOKEN:
642 NextToken( nToken );
643 break;
645 if( IsParserWorking() )
646 SaveState( 0 ); // processed till here,
647 // continue with new token!
648 nToken = GetNextToken();
650 if( SVPAR_ACCEPTED == eState && 0 < nOpenBrakets )
651 eState = SVPAR_ERROR;
654 void SvRTFParser::SetEncoding( rtl_TextEncoding eEnc )
656 if (eEnc == RTL_TEXTENCODING_DONTKNOW)
657 eEnc = GetCodeSet();
659 if (!aParserStates.empty())
660 aParserStates.top().eCodeSet = eEnc;
661 SetSrcEncoding(eEnc);
664 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */