1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
3 * This file is part of the LibreOffice project.
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 * This file incorporates work covered by the following license notice:
11 * Licensed to the Apache Software Foundation (ASF) under one or more
12 * contributor license agreements. See the NOTICE file distributed
13 * with this work for additional information regarding copyright
14 * ownership. The ASF licenses this file to you under the Apache
15 * License, Version 2.0 (the "License"); you may not use this file
16 * except in compliance with the License. You may obtain a copy of
17 * the License at http://www.apache.org/licenses/LICENSE-2.0 .
21 #include <stdio.h> // for EOF
22 #include <rtl/tencinfo.h>
23 #include <tools/stream.hxx>
24 #include <tools/debug.hxx>
25 #include <svtools/rtftoken.h>
26 #include <svtools/rtfkeywd.hxx>
27 #include <svtools/parrtf.hxx>
28 #include <comphelper/string.hxx>
// Length limit that ScanText() checks against aStrBuffer.Len() while it
// collects text.
30 const int MAX_STRING_LEN
= 1024;
// Chunk size for control-word names: once _GetNextToken() has buffered this
// many characters, the buffer is appended to aToken.
31 const int MAX_TOKEN_LEN
= 128;
// Character classification helpers; both forward to the ASCII-only tests in
// comphelper::string.
33 #define RTF_ISDIGIT( c ) comphelper::string::isdigitAscii(c)
34 #define RTF_ISALPHA( c ) comphelper::string::isalphaAscii(c)
// Constructs the RTF parser on top of SvParser, forwarding the input stream
// and the stack size. eUNICodeSet and the source encoding both start out as
// MS-1252, and bRTF_InTextRead is cleared.
36 SvRTFParser::SvRTFParser( SvStream
& rIn
, sal_uInt8 nStackSize
)
37 : SvParser( rIn
, nStackSize
),
38 eUNICodeSet( RTL_TEXTENCODING_MS_1252
), // default is the ANSI code set
41 // default is the ANSI code set
42 SetSrcEncoding( RTL_TEXTENCODING_MS_1252
);
43 bRTF_InTextRead
= false;
// Destructor of the RTF parser.
46 SvRTFParser::~SvRTFParser()
// Low-level tokenizer: reads characters with GetNextChar() until a token id
// has been collected in nRet or the parser leaves the SVPAR_WORKING state.
// Control-word names are gathered into aToken and looked up with
// GetRTFToken() (unknown names become RTF_UNKNOWNCONTROL); an optional,
// possibly negative, numeric parameter ends up in nTokenValue. The parser
// state stack (nUCharOverread plus source encoding) is kept in step with
// nOpenBrakets, the end of the input sets SVPAR_ACCEPTED, and anything else
// is reported as RTF_TEXTTOKEN.
53 int SvRTFParser::_GetNextToken()
63 switch( nNextCh
= GetNextChar() )
68 case '+': // found this one in an RTF file
69 case '~': // nonbreaking space
70 case '-': // optional hyphen
71 case '_': // nonbreaking hyphen
72 case '\'': // HexValue
77 bNextCh
= 0 == nNextCh
;
80 case '*': // ignoreflag
81 nRet
= RTF_IGNOREFLAG
;
83 case ':': // subentry in an index entry
84 nRet
= RTF_SUBENTRYINDEX
;
86 case '|': // formula character
96 if( RTF_ISALPHA( nNextCh
) )
101 sal_Unicode
* pStr
= aStrBuffer
.AllocBuffer(
103 xub_StrLen nStrLen
= 0;
105 *(pStr
+ nStrLen
++) = nNextCh
;
// buffer full: hand the collected characters over to aToken
106 if( MAX_TOKEN_LEN
== nStrLen
)
108 aToken
+= aStrBuffer
;
109 aToken
.GetBufferAccess(); // make unique string!
112 nNextCh
= GetNextChar();
113 } while( RTF_ISALPHA( nNextCh
) );
116 aStrBuffer
.ReleaseBufferAccess( nStrLen
);
117 aToken
+= aStrBuffer
;
121 // minus sign for a numeric parameter
122 int bNegValue
= false;
126 nNextCh
= GetNextChar();
129 // possibly a numeric parameter
130 if( RTF_ISDIGIT( nNextCh
) )
135 nTokenValue
+= nNextCh
- '0';
136 nNextCh
= GetNextChar();
137 } while( RTF_ISDIGIT( nNextCh
) );
139 nTokenValue
= -nTokenValue
;
142 else if( bNegValue
) // put the minus sign back
145 rInput
.SeekRel( -1 );
147 if( ' ' == nNextCh
) // the blank belongs to the token!
148 nNextCh
= GetNextChar();
150 // look the token up in the table:
151 if( 0 == (nRet
= GetRTFToken( aToken
)) )
153 nRet
= RTF_UNKNOWNCONTROL
;
155 // bug 76812 - unicode token handled as normal text
160 if( 0 <= nTokenValue
)
162 nUCharOverread
= (sal_uInt8
)nTokenValue
;
163 //cmc: other ifdef breaks #i3584
165 nUCharOverread
= nUCharOverread
;
167 aToken
.Erase(); // #i47831# erase token to prevent the token from being treated as text
174 // UPR - skip the group with the ANSI text
176 while( '{' != _GetNextToken() )
179 _GetNextToken(); // skip the last bracket
185 if( !bRTF_InTextRead
)
187 nRet
= RTF_TEXTTOKEN
;
188 aToken
= (sal_Unicode
)nTokenValue
;
190 // skip the next n "RTF" characters. These
191 // can also be \{, \}, \'88
192 for( sal_uInt8 m
= 0; m
< nUCharOverread
; ++m
)
194 sal_Unicode cAnsi
= nNextCh
;
195 while( 0xD == cAnsi
)
196 cAnsi
= GetNextChar();
197 while( 0xA == cAnsi
)
198 cAnsi
= GetNextChar();
201 '\'' == ( cAnsi
= GetNextChar() ))
202 // skip over the hex value
203 cAnsi
= GetHexValue();
204 nNextCh
= GetNextChar();
207 bNextCh
= 0 == nNextCh
;
212 else if( SVPAR_PENDING
!= eState
)
214 // Bug 34631 - skip over "\ " - blank as a character
215 // eState = SVPAR_ERROR;
223 case sal_Unicode(EOF
):
224 eState
= SVPAR_ACCEPTED
;
230 if( 0 <= nOpenBrakets
)
// remember the current overread count and source encoding
232 RtfParserState_Impl
aState( nUCharOverread
, GetSrcEncoding() );
233 aParserStates
.push( aState
);
237 static_cast<size_t>(nOpenBrakets
) == aParserStates
.size(),
238 "ParserStateStack unequal to bracket count" );
245 if( 0 <= nOpenBrakets
)
248 if( !aParserStates
.empty() )
250 const RtfParserState_Impl
& rRPS
=
// restore the overread count and source encoding from the saved state
252 nUCharOverread
= rRPS
.nUCharOverread
;
253 SetSrcEncoding( rRPS
.eCodeSet
);
258 SetSrcEncoding( GetCodeSet() );
262 static_cast<size_t>(nOpenBrakets
) == aParserStates
.size(),
263 "ParserStateStack unequal to bracket count" );
272 // normal text follows
274 nRet
= RTF_TEXTTOKEN
;
275 bNextCh
= 0 == nNextCh
;
280 nNextCh
= GetNextChar();
282 } while( !nRet
&& SVPAR_WORKING
== eState
);
// Reads two characters from the input with GetNextChar() and adds the
// hexadecimal digit value of each ('0'-'9', 'a'-'f' or 'A'-'F') to nHexVal.
287 sal_Unicode
SvRTFParser::GetHexValue()
291 register sal_Unicode nHexVal
= 0;
293 for( n
= 0; n
< 2; ++n
)
296 nNextCh
= GetNextChar();
297 if( nNextCh
>= '0' && nNextCh
<= '9' )
298 nHexVal
+= (nNextCh
- 48); // 48 == '0'
299 else if( nNextCh
>= 'a' && nNextCh
<= 'f' )
300 nHexVal
+= (nNextCh
- 87); // 87 == 'a' - 10
301 else if( nNextCh
>= 'A' && nNextCh
<= 'F' )
302 nHexVal
+= (nNextCh
- 55); // 55 == 'A' - 10
// Collects running text into aStrBuffer while the parser is working and the
// buffer stays below MAX_STRING_LEN, then appends the collected text to
// aToken. Runs of \'xx hex values are gathered as bytes and converted with
// the current source encoding; the escaped symbols +, ~, - and _ are appended
// as the characters shown below; Unicode characters are read with
// bRTF_InTextRead set, after which nUCharOverread characters are skipped.
// Hitting the end of the input here sets SVPAR_ERROR.
// NOTE(review): cBreak is compared with nNextCh together with the length
// limit, presumably to end the scan - confirm against the full source.
307 void SvRTFParser::ScanText( const sal_Unicode cBreak
)
311 while( bWeiter
&& IsParserWorking() && aStrBuffer
.Len() < MAX_STRING_LEN
)
318 switch (nNextCh
= GetNextChar())
323 OStringBuffer aByteString
;
326 char c
= (char)GetHexValue();
328 * Note: \'00 is a valid internal character in a
329 * string in RTF. OStringBuffer supports
330 * appending nulls fine
332 aByteString
.append(c
);
335 sal_Char nSlash
= '\\';
338 wchar_t __next
=GetNextChar();
339 if (__next
>0xFF) // fix for #i43933# and #i35653#
341 if (aByteString
.getLength())
342 aStrBuffer
.Append(String(OStringToOUString(aByteString
.makeStringAndClear(), GetSrcEncoding())));
343 aStrBuffer
.Append((sal_Unicode
)__next
);
347 nSlash
= (sal_Char
)__next
;
348 while (nSlash
== 0xD || nSlash
== 0xA)
349 nSlash
= (sal_Char
)GetNextChar();
359 aByteString
.append(nSlash
);
364 nNextCh
= GetNextChar();
366 if (nSlash
!= '\\' || nNextCh
!= '\'')
376 if (aByteString
.getLength())
377 aStrBuffer
.Append(String(OStringToOUString(aByteString
.makeStringAndClear(), GetSrcEncoding())));
383 case '+': // found this one in an RTF file
384 aStrBuffer
.Append(nNextCh
);
386 case '~': // nonbreaking space
387 aStrBuffer
.Append(static_cast< sal_Unicode
>(0xA0));
389 case '-': // optional hyphen
390 aStrBuffer
.Append(static_cast< sal_Unicode
>(0xAD));
392 case '_': // nonbreaking hyphen
393 aStrBuffer
.Append(static_cast< sal_Unicode
>(0x2011));
397 // read a Unicode character
399 nNextCh
= GetNextChar();
400 rInput
.SeekRel( -2 );
402 if( '-' == nNextCh
|| RTF_ISDIGIT( nNextCh
) )
404 bRTF_InTextRead
= true;
406 String
sSave( aToken
);
412 DBG_ASSERT( RTF_U
== nToken
, "doch kein UNI-Code Zeichen" );
413 // don't convert symbol chars
415 static_cast< sal_Unicode
>(nTokenValue
));
417 // skip the next n "RTF" characters. These
418 // can also be \{, \}, \'88
419 for( sal_uInt8 m
= 0; m
< nUCharOverread
; ++m
)
421 sal_Unicode cAnsi
= nNextCh
;
422 while( 0xD == cAnsi
)
423 cAnsi
= GetNextChar();
424 while( 0xA == cAnsi
)
425 cAnsi
= GetNextChar();
428 '\'' == ( cAnsi
= GetNextChar() ))
429 // skip over the hex value
430 cAnsi
= GetHexValue();
431 nNextCh
= GetNextChar();
435 bRTF_InTextRead
= false;
440 bWeiter
= false; // stop, the string is complete
446 rInput
.SeekRel( -1 );
448 bWeiter
= false; // stop, the string is complete
454 case sal_Unicode(EOF
):
455 eState
= SVPAR_ERROR
;
467 if( nNextCh
== cBreak
|| aStrBuffer
.Len() >= MAX_STRING_LEN
)
472 // all other characters go into the text
473 aStrBuffer
.Append(nNextCh
);
475 if (sal_Unicode(EOF
) == (nNextCh
= GetNextChar()))
477 if (aStrBuffer
.Len())
478 aToken
+= aStrBuffer
;
483 (RTF_ISALPHA(nNextCh
) || RTF_ISDIGIT(nNextCh
)) &&
484 (aStrBuffer
.Len() < MAX_STRING_LEN
)
490 if( bWeiter
&& bNextCh
)
491 nNextCh
= GetNextChar();
494 if (aStrBuffer
.Len())
495 aToken
+= aStrBuffer
;
// Definition of the static member _inSkipGroup, initialised to 0.
// NOTE(review): judging by the name this tracks whether SkipGroup() is
// active - confirm against its uses.
499 short SvRTFParser::_inSkipGroup
=0;
// Skips input token by token using _GetNextToken(). For an RTF_BIN token the
// stream is advanced by nTokenValue via rInput.SeekRel() before reading on,
// and line-break characters (0xa / 0xd) are stepped over. The loop runs until
// the end of the input or until the parser stops working; afterwards, unless
// the parser is pending, a current character other than '}' sets SVPAR_ERROR.
501 void SvRTFParser::SkipGroup()
507 //#i16185# special handling for the \bin keyword
522 int nToken
= _GetNextToken();
523 if (nToken
== RTF_BIN
)
526 rInput
.SeekRel(nTokenValue
);
527 nNextCh
= GetNextChar();
529 while (nNextCh
==0xa || nNextCh
==0xd)
531 nNextCh
= GetNextChar();
533 } while (sal_Unicode(EOF
) != nNextCh
&& IsParserWorking());
535 if( SVPAR_PENDING
!= eState
&& '}' != nNextCh
)
536 eState
= SVPAR_ERROR
;
// Handles unknown data by skipping the group via SkipGroup().
540 void SvRTFParser::ReadUnknownData() { SkipGroup(); }
// Handles bitmap data by skipping the group via SkipGroup().
541 void SvRTFParser::ReadBitmapData() { SkipGroup(); }
// Handles OLE data by skipping the group via SkipGroup().
542 void SvRTFParser::ReadOLEData() { SkipGroup(); }
// Starts a parse: records the current stream position, reads the first
// character, sets the state to SVPAR_WORKING and resets eCodeSet, the source
// encoding and eUNICodeSet to MS-1252. The input must then begin with '{'
// followed by the RTF_RTF token.
// NOTE(review): the final SVPAR_ERROR assignment presumably belongs to the
// failing branch of that check - confirm against the full source.
545 SvParserState
SvRTFParser::CallParser()
548 nNextChPos
= rInput
.Tell();
549 rInput
>> cFirstCh
; nNextCh
= cFirstCh
;
550 eState
= SVPAR_WORKING
;
552 SetSrcEncoding( eCodeSet
= RTL_TEXTENCODING_MS_1252
);
553 eUNICodeSet
= RTL_TEXTENCODING_MS_1252
; // default is the ANSI code set
555 // the first two tokens must be '{' and \\rtf !!
556 if( '{' == GetNextToken() && RTF_RTF
== GetNextToken() )
560 if( SVPAR_PENDING
!= eState
)
561 ReleaseRef(); // then we no longer need the parser!
564 eState
= SVPAR_ERROR
;
// Main token loop: runs while the parser is working. Among the visible
// cases, unknown controls are skipped and eCodeSet plus the source encoding
// are switched to MS-1252, Apple Roman, IBM 437, IBM 850 or the encoding that
// rtl_getTextEncodingFromWindowsCodePage() returns for nTokenValue. After
// each token the state is saved (if still working) and the next token is
// fetched. If the parse ends as SVPAR_ACCEPTED while brackets are still open
// (0 < nOpenBrakets), the state is turned into SVPAR_ERROR.
569 void SvRTFParser::Continue( int nToken
)
571 // DBG_ASSERT( SVPAR_CS_DONTKNOW == GetCharSet(),
572 // "the character set was changed." );
575 nToken
= GetNextToken();
577 while( IsParserWorking() )
585 eState
= SVPAR_ACCEPTED
;
589 // an unknown group?
591 if( RTF_IGNOREFLAG
!= GetNextToken() )
592 nToken
= SkipToken( -1 );
593 else if( RTF_UNKNOWNCONTROL
!= GetNextToken() )
594 nToken
= SkipToken( -2 );
597 // filter it out right away
599 nToken
= GetNextToken();
601 eState
= SVPAR_ERROR
;
602 break; // on to the next token!!
607 case RTF_UNKNOWNCONTROL
:
608 break; // skip unknown tokens
611 SetSrcEncoding( eCodeSet
= RTL_TEXTENCODING_MS_1252
);
614 SetSrcEncoding( eCodeSet
= RTL_TEXTENCODING_APPLE_ROMAN
);
617 SetSrcEncoding( eCodeSet
= RTL_TEXTENCODING_IBM_437
);
620 SetSrcEncoding( eCodeSet
= RTL_TEXTENCODING_IBM_850
);
623 eCodeSet
= rtl_getTextEncodingFromWindowsCodePage(nTokenValue
);
624 SetSrcEncoding(eCodeSet
);
631 if( IsParserWorking() )
632 SaveState( 0 ); // processed up to this point,
633 // continue with a new token!
634 nToken
= GetNextToken();
636 if( SVPAR_ACCEPTED
== eState
&& 0 < nOpenBrakets
)
637 eState
= SVPAR_ERROR
;
// Makes eEnc the source encoding via SetSrcEncoding() and, when the parser
// state stack is not empty, also stores it in the topmost state's eCodeSet.
// NOTE(review): RTL_TEXTENCODING_DONTKNOW is special-cased first; what that
// branch does cannot be told from these lines - confirm.
640 void SvRTFParser::SetEncoding( rtl_TextEncoding eEnc
)
642 if (eEnc
== RTL_TEXTENCODING_DONTKNOW
)
645 if (!aParserStates
.empty())
646 aParserStates
.top().eCodeSet
= eEnc
;
647 SetSrcEncoding(eEnc
);
650 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */