update dev300-m58
[ooovba.git] / svtools / source / svrtf / parrtf.cxx
blob1f72da97eec10089f72590aa6e6c5b6b4d55ef7c
1 /*************************************************************************
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * Copyright 2008 by Sun Microsystems, Inc.
7 * OpenOffice.org - a multi-platform office productivity suite
9 * $RCSfile: parrtf.cxx,v $
10 * $Revision: 1.25 $
12 * This file is part of OpenOffice.org.
14 * OpenOffice.org is free software: you can redistribute it and/or modify
15 * it under the terms of the GNU Lesser General Public License version 3
16 * only, as published by the Free Software Foundation.
18 * OpenOffice.org is distributed in the hope that it will be useful,
19 * but WITHOUT ANY WARRANTY; without even the implied warranty of
20 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 * GNU Lesser General Public License version 3 for more details
22 * (a copy is included in the LICENSE file that accompanied this code).
24 * You should have received a copy of the GNU Lesser General Public License
25 * version 3 along with OpenOffice.org. If not, see
26 * <http://www.openoffice.org/license.html>
27 * for a copy of the LGPLv3 License.
29 ************************************************************************/
31 // MARKER(update_precomp.py): autogen include statement, do not remove
32 #include "precompiled_svtools.hxx"
34 /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil -*- */
36 #include <stdio.h> // for EOF
37 #include <rtl/tencinfo.h>
38 #include <tools/stream.hxx>
39 #include <tools/debug.hxx>
40 #include "rtftoken.h"
41 #include "rtfkeywd.hxx"
42 #include <svtools/parrtf.hxx>
// Upper bound on the number of characters ScanText() collects before it
// closes the current text token.
const int MAX_STRING_LEN = 1024;
// Size of the scratch buffer _GetNextToken() uses while collecting the
// letters of a control word; longer names are flushed in chunks of this size.
const int MAX_TOKEN_LEN = 128;

// ASCII-only character classification (deliberately locale independent).
// The argument is fully parenthesized so that compound expressions such as
// `cond ? a : b` expand with the intended precedence. It is still evaluated
// more than once, so never pass an expression with side effects.
#define RTF_ISDIGIT( c ) ( (c) >= '0' && (c) <= '9' )
#define RTF_ISALPHA( c ) ( ((c) >= 'A' && (c) <= 'Z') || ((c) >= 'a' && (c) <= 'z') )
// Project macro invocation for the RtfParserStates_Impl container whose
// elements are RtfParserState_Impl. This file uses that container through
// aParserStates (Count(), Insert(), Remove(), operator[]).
// NOTE(review): presumably this expands to the out-of-line implementation of
// the array class declared in the header - confirm against the macro definition.
50 SV_IMPL_VARARR( RtfParserStates_Impl, RtfParserState_Impl )
52 SvRTFParser::SvRTFParser( SvStream& rIn, BYTE nStackSize )
53 : SvParser( rIn, nStackSize ),
54 eUNICodeSet( RTL_TEXTENCODING_MS_1252 ), // default ist ANSI-CodeSet
55 nUCharOverread( 1 )
57 // default ist ANSI-CodeSet
58 SetSrcEncoding( RTL_TEXTENCODING_MS_1252 );
59 bRTF_InTextRead = false;
// Destructor.
// NOTE(review): the body is not visible in this listing (brace-only lines are
// missing from it); presumably it is empty - confirm against the real file.
62 SvRTFParser::~SvRTFParser()
69 int SvRTFParser::_GetNextToken()
71 int nRet = 0;
72 do {
73 int bNextCh = true;
74 switch( nNextCh )
76 case '\\':
78 // Steuerzeichen
79 switch( nNextCh = GetNextChar() )
81 case '{':
82 case '}':
83 case '\\':
84 case '+': // habe ich in einem RTF-File gefunden
85 case '~': // nonbreaking space
86 case '-': // optional hyphen
87 case '_': // nonbreaking hyphen
88 case '\'': // HexValue
89 nNextCh = '\\';
90 rInput.SeekRel( -1 );
91 ScanText();
92 nRet = RTF_TEXTTOKEN;
93 bNextCh = 0 == nNextCh;
94 break;
96 case '*': // ignoreflag
97 nRet = RTF_IGNOREFLAG;
98 break;
99 case ':': // subentry in an index entry
100 nRet = RTF_SUBENTRYINDEX;
101 break;
102 case '|': // formula-charakter
103 nRet = RTF_FORMULA;
104 break;
106 case 0x0a:
107 case 0x0d:
108 nRet = RTF_PAR;
109 break;
111 default:
112 if( RTF_ISALPHA( nNextCh ) )
114 aToken = '\\';
116 String aStrBuffer;
117 sal_Unicode* pStr = aStrBuffer.AllocBuffer(
118 MAX_TOKEN_LEN );
119 xub_StrLen nStrLen = 0;
120 do {
121 *(pStr + nStrLen++) = nNextCh;
122 if( MAX_TOKEN_LEN == nStrLen )
124 aToken += aStrBuffer;
125 aToken.GetBufferAccess(); // make unique string!
126 nStrLen = 0;
128 nNextCh = GetNextChar();
129 } while( RTF_ISALPHA( nNextCh ) );
130 if( nStrLen )
132 aStrBuffer.ReleaseBufferAccess( nStrLen );
133 aToken += aStrBuffer;
137 // Minus fuer numerischen Parameter
138 int bNegValue = false;
139 if( '-' == nNextCh )
141 bNegValue = true;
142 nNextCh = GetNextChar();
145 // evt. Numerischer Parameter
146 if( RTF_ISDIGIT( nNextCh ) )
148 nTokenValue = 0;
149 do {
150 nTokenValue *= 10;
151 nTokenValue += nNextCh - '0';
152 nNextCh = GetNextChar();
153 } while( RTF_ISDIGIT( nNextCh ) );
154 if( bNegValue )
155 nTokenValue = -nTokenValue;
156 bTokenHasValue=true;
158 else if( bNegValue ) // das Minus wieder zurueck
160 nNextCh = '-';
161 rInput.SeekRel( -1 );
163 if( ' ' == nNextCh ) // Blank gehoert zum Token!
164 nNextCh = GetNextChar();
166 // suche das Token in der Tabelle:
167 if( 0 == (nRet = GetRTFToken( aToken )) )
168 // Unknown Control
169 nRet = RTF_UNKNOWNCONTROL;
171 // bug 76812 - unicode token handled as normal text
172 bNextCh = false;
173 switch( nRet )
175 case RTF_UC:
176 if( 0 <= nTokenValue )
178 nUCharOverread = (BYTE)nTokenValue;
179 #if 1
180 //cmc: other ifdef breaks #i3584
181 aParserStates[ aParserStates.Count()-1].
182 nUCharOverread = nUCharOverread;
183 #else
184 if( !nUCharOverread )
185 nUCharOverread = aParserStates[
186 aParserStates.Count()-1].nUCharOverread;
187 else
188 aParserStates[ aParserStates.Count()-1].
189 nUCharOverread = nUCharOverread;
190 #endif
192 aToken.Erase(); // #i47831# erase token to prevent the token from beeing treated as text
193 // read next token
194 nRet = 0;
195 break;
197 case RTF_UPR:
198 if (!_inSkipGroup) {
199 // UPR - overread the group with the ansi
200 // informations
201 while( '{' != _GetNextToken() )
203 SkipGroup();
204 _GetNextToken(); // overread the last bracket
205 nRet = 0;
207 break;
209 case RTF_U:
210 if( !bRTF_InTextRead )
212 nRet = RTF_TEXTTOKEN;
213 aToken = (sal_Unicode)nTokenValue;
215 // overread the next n "RTF" characters. This
216 // can be also \{, \}, \'88
217 for( BYTE m = 0; m < nUCharOverread; ++m )
219 sal_Unicode cAnsi = nNextCh;
220 while( 0xD == cAnsi )
221 cAnsi = GetNextChar();
222 while( 0xA == cAnsi )
223 cAnsi = GetNextChar();
225 if( '\\' == cAnsi &&
226 '\'' == ( cAnsi = GetNextChar() ))
227 // HexValue ueberlesen
228 cAnsi = GetHexValue();
229 nNextCh = GetNextChar();
231 ScanText();
232 bNextCh = 0 == nNextCh;
234 break;
237 else if( SVPAR_PENDING != eState )
239 // Bug 34631 - "\ " ueberlesen - Blank als Zeichen
240 // eState = SVPAR_ERROR;
241 bNextCh = false;
243 break;
246 break;
248 case sal_Unicode(EOF):
249 eState = SVPAR_ACCEPTED;
250 nRet = nNextCh;
251 break;
253 case '{':
255 if( 0 <= nOpenBrakets )
257 RtfParserState_Impl aState( nUCharOverread, GetSrcEncoding() );
258 aParserStates.Insert(
259 aState, sal::static_int_cast< USHORT >(nOpenBrakets) );
261 ++nOpenBrakets;
262 DBG_ASSERT( nOpenBrakets == aParserStates.Count(),
263 "ParserStateStack unequal to bracket count" );
264 nRet = nNextCh;
266 break;
268 case '}':
269 --nOpenBrakets;
270 if( 0 <= nOpenBrakets )
272 aParserStates.Remove(
273 sal::static_int_cast< USHORT >(nOpenBrakets) );
274 if( aParserStates.Count() )
276 const RtfParserState_Impl& rRPS =
277 aParserStates[ aParserStates.Count() - 1 ];
278 nUCharOverread = rRPS.nUCharOverread;
279 SetSrcEncoding( rRPS.eCodeSet );
281 else
283 nUCharOverread = 1;
284 SetSrcEncoding( GetCodeSet() );
287 DBG_ASSERT( nOpenBrakets == aParserStates.Count(),
288 "ParserStateStack unequal to bracket count" );
289 nRet = nNextCh;
290 break;
292 case 0x0d:
293 case 0x0a:
294 break;
296 default:
297 // es folgt normaler Text
298 ScanText();
299 nRet = RTF_TEXTTOKEN;
300 bNextCh = 0 == nNextCh;
301 break;
304 if( bNextCh )
305 nNextCh = GetNextChar();
307 } while( !nRet && SVPAR_WORKING == eState );
308 return nRet;
312 sal_Unicode SvRTFParser::GetHexValue()
314 // Hex-Wert sammeln
315 register int n;
316 register sal_Unicode nHexVal = 0;
318 for( n = 0; n < 2; ++n )
320 nHexVal *= 16;
321 nNextCh = GetNextChar();
322 if( nNextCh >= '0' && nNextCh <= '9' )
323 nHexVal += (nNextCh - 48);
324 else if( nNextCh >= 'a' && nNextCh <= 'f' )
325 nHexVal += (nNextCh - 87);
326 else if( nNextCh >= 'A' && nNextCh <= 'F' )
327 nHexVal += (nNextCh - 55);
329 return nHexVal;
332 void SvRTFParser::ScanText( const sal_Unicode cBreak )
334 String aStrBuffer;
335 int bWeiter = true;
336 while( bWeiter && IsParserWorking() && aStrBuffer.Len() < MAX_STRING_LEN)
338 int bNextCh = true;
339 switch( nNextCh )
341 case '\\':
343 switch (nNextCh = GetNextChar())
345 case '\'':
348 #if 0
349 // #i35653 patch from cmc
350 ByteString aByteString(static_cast<char>(GetHexValue()));
351 if (aByteString.Len())
352 aStrBuffer.Append(String(aByteString, GetSrcEncoding()));
353 #else
354 ByteString aByteString;
355 while (1)
357 aByteString.Append((char)GetHexValue());
359 bool bBreak = false;
360 sal_Char nSlash = '\\';
361 while (!bBreak)
363 wchar_t __next=GetNextChar();
364 if (__next>0xFF) // fix for #i43933# and #i35653#
366 if (aByteString.Len())
367 aStrBuffer.Append(String(aByteString, GetSrcEncoding()));
368 aStrBuffer.Append((sal_Unicode)__next);
370 aByteString.Erase();
371 continue;
373 nSlash = (sal_Char)__next;
374 while (nSlash == 0xD || nSlash == 0xA)
375 nSlash = (sal_Char)GetNextChar();
377 switch (nSlash)
379 case '{':
380 case '}':
381 case '\\':
382 bBreak = true;
383 break;
384 default:
385 aByteString.Append(nSlash);
386 break;
390 nNextCh = GetNextChar();
392 if (nSlash != '\\' || nNextCh != '\'')
394 rInput.SeekRel(-1);
395 nNextCh = nSlash;
396 break;
400 bNextCh = false;
402 if (aByteString.Len())
403 aStrBuffer.Append(String(aByteString, GetSrcEncoding()));
404 #endif
406 break;
407 case '\\':
408 case '}':
409 case '{':
410 case '+': // habe ich in einem RTF-File gefunden
411 aStrBuffer.Append(nNextCh);
412 break;
413 case '~': // nonbreaking space
414 aStrBuffer.Append(static_cast< sal_Unicode >(0xA0));
415 break;
416 case '-': // optional hyphen
417 aStrBuffer.Append(static_cast< sal_Unicode >(0xAD));
418 break;
419 case '_': // nonbreaking hyphen
420 aStrBuffer.Append(static_cast< sal_Unicode >(0x2011));
421 break;
423 case 'u':
424 // UNI-Code Zeichen lesen
426 nNextCh = GetNextChar();
427 rInput.SeekRel( -2 );
429 if( '-' == nNextCh || RTF_ISDIGIT( nNextCh ) )
431 bRTF_InTextRead = true;
433 String sSave( aToken );
434 nNextCh = '\\';
435 #ifdef DBG_UTIL
436 int nToken =
437 #endif
438 _GetNextToken();
439 DBG_ASSERT( RTF_U == nToken, "doch kein UNI-Code Zeichen" );
440 // dont convert symbol chars
441 aStrBuffer.Append(
442 static_cast< sal_Unicode >(nTokenValue));
444 // overread the next n "RTF" characters. This
445 // can be also \{, \}, \'88
446 for( BYTE m = 0; m < nUCharOverread; ++m )
448 sal_Unicode cAnsi = nNextCh;
449 while( 0xD == cAnsi )
450 cAnsi = GetNextChar();
451 while( 0xA == cAnsi )
452 cAnsi = GetNextChar();
454 if( '\\' == cAnsi &&
455 '\'' == ( cAnsi = GetNextChar() ))
456 // HexValue ueberlesen
457 cAnsi = GetHexValue();
458 nNextCh = GetNextChar();
460 bNextCh = false;
461 aToken = sSave;
462 bRTF_InTextRead = false;
464 else
466 nNextCh = '\\';
467 bWeiter = false; // Abbrechen, String zusammen
470 break;
472 default:
473 rInput.SeekRel( -1 );
474 nNextCh = '\\';
475 bWeiter = false; // Abbrechen, String zusammen
476 break;
479 break;
481 case sal_Unicode(EOF):
482 eState = SVPAR_ERROR;
483 // weiter
484 case '{':
485 case '}':
486 bWeiter = false;
487 break;
489 case 0x0a:
490 case 0x0d:
491 break;
493 default:
494 if( nNextCh == cBreak || aStrBuffer.Len() >= MAX_STRING_LEN)
495 bWeiter = false;
496 else
498 do {
499 // alle anderen Zeichen kommen in den Text
500 aStrBuffer.Append(nNextCh);
502 if (sal_Unicode(EOF) == (nNextCh = GetNextChar()))
504 if (aStrBuffer.Len())
505 aToken += aStrBuffer;
506 return;
508 } while
510 (RTF_ISALPHA(nNextCh) || RTF_ISDIGIT(nNextCh)) &&
511 (aStrBuffer.Len() < MAX_STRING_LEN)
513 bNextCh = false;
517 if( bWeiter && bNextCh )
518 nNextCh = GetNextChar();
521 if (aStrBuffer.Len())
522 aToken += aStrBuffer;
526 short SvRTFParser::_inSkipGroup=0;
528 void SvRTFParser::SkipGroup()
530 short nBrackets=1;
531 if (_inSkipGroup>0)
532 return;
533 _inSkipGroup++;
534 #if 1 //#i16185# fecking \bin keyword
537 switch (nNextCh)
539 case '{':
540 ++nBrackets;
541 break;
542 case '}':
543 if (!--nBrackets) {
544 _inSkipGroup--;
545 return;
547 break;
549 int nToken = _GetNextToken();
550 if (nToken == RTF_BIN)
552 rInput.SeekRel(-1);
553 rInput.SeekRel(nTokenValue);
554 nNextCh = GetNextChar();
556 while (nNextCh==0xa || nNextCh==0xd)
558 nNextCh = GetNextChar();
560 } while (sal_Unicode(EOF) != nNextCh && IsParserWorking());
561 #else
562 sal_Unicode cPrev = 0;
563 do {
564 switch( nNextCh )
566 case '{':
567 if( '\\' != cPrev )
568 ++nBrackets;
569 break;
571 case '}':
572 if( '\\' != cPrev && !--nBrackets )
573 return;
574 break;
576 case '\\':
577 if( '\\' == cPrev )
578 nNextCh = 0;
579 break;
581 cPrev = nNextCh;
582 nNextCh = GetNextChar();
583 } while( sal_Unicode(EOF) != nNextCh && IsParserWorking() );
584 #endif
586 if( SVPAR_PENDING != eState && '}' != nNextCh )
587 eState = SVPAR_ERROR;
588 _inSkipGroup--;
// The three readers below all just skip the current group via SkipGroup().
// NOTE(review): presumably these are virtual hooks that derived parsers
// override to actually consume the data - confirm against the class declaration.

// Unknown destination group: skipped.
591 void SvRTFParser::ReadUnknownData() { SkipGroup(); }
// Bitmap data group: skipped.
592 void SvRTFParser::ReadBitmapData() { SkipGroup(); }
// OLE data group: skipped.
593 void SvRTFParser::ReadOLEData() { SkipGroup(); }
596 SvParserState SvRTFParser::CallParser()
598 sal_Char cFirstCh;
599 nNextChPos = rInput.Tell();
600 rInput >> cFirstCh; nNextCh = cFirstCh;
601 eState = SVPAR_WORKING;
602 nOpenBrakets = 0;
603 SetSrcEncoding( eCodeSet = RTL_TEXTENCODING_MS_1252 );
604 eUNICodeSet = RTL_TEXTENCODING_MS_1252; // default ist ANSI-CodeSet
606 // die 1. beiden Token muessen '{' und \\rtf sein !!
607 if( '{' == GetNextToken() && RTF_RTF == GetNextToken() )
609 AddRef();
610 Continue( 0 );
611 if( SVPAR_PENDING != eState )
612 ReleaseRef(); // dann brauchen wir den Parser nicht mehr!
614 else
615 eState = SVPAR_ERROR;
617 return eState;
620 void SvRTFParser::Continue( int nToken )
622 // DBG_ASSERT( SVPAR_CS_DONTKNOW == GetCharSet(),
623 // "Zeichensatz wurde geaendert." );
625 if( !nToken )
626 nToken = GetNextToken();
628 while( IsParserWorking() )
630 SaveState( nToken );
631 switch( nToken )
633 case '}':
634 if( nOpenBrakets )
635 goto NEXTTOKEN;
636 eState = SVPAR_ACCEPTED;
637 break;
639 case '{':
640 // eine unbekannte Gruppe ?
642 if( RTF_IGNOREFLAG != GetNextToken() )
643 nToken = SkipToken( -1 );
644 else if( RTF_UNKNOWNCONTROL != GetNextToken() )
645 nToken = SkipToken( -2 );
646 else
648 // gleich herausfiltern
649 ReadUnknownData();
650 nToken = GetNextToken();
651 if( '}' != nToken )
652 eState = SVPAR_ERROR;
653 break; // auf zum naechsten Token!!
656 goto NEXTTOKEN;
658 case RTF_UNKNOWNCONTROL:
659 break; // unbekannte Token ueberspringen
660 case RTF_NEXTTYPE:
661 case RTF_ANSITYPE:
662 SetSrcEncoding( eCodeSet = RTL_TEXTENCODING_MS_1252 );
663 break;
664 case RTF_MACTYPE:
665 SetSrcEncoding( eCodeSet = RTL_TEXTENCODING_APPLE_ROMAN );
666 break;
667 case RTF_PCTYPE:
668 SetSrcEncoding( eCodeSet = RTL_TEXTENCODING_IBM_437 );
669 break;
670 case RTF_PCATYPE:
671 SetSrcEncoding( eCodeSet = RTL_TEXTENCODING_IBM_850 );
672 break;
673 case RTF_ANSICPG:
674 eCodeSet = rtl_getTextEncodingFromWindowsCodePage(nTokenValue);
675 SetSrcEncoding(eCodeSet);
676 break;
677 default:
678 NEXTTOKEN:
679 NextToken( nToken );
680 break;
682 if( IsParserWorking() )
683 SaveState( 0 ); // bis hierhin abgearbeitet,
684 // weiter mit neuem Token!
685 nToken = GetNextToken();
687 if( SVPAR_ACCEPTED == eState && 0 < nOpenBrakets )
688 eState = SVPAR_ERROR;
691 void SvRTFParser::SetEncoding( rtl_TextEncoding eEnc )
693 if (eEnc == RTL_TEXTENCODING_DONTKNOW)
694 eEnc = GetCodeSet();
696 if (aParserStates.Count())
697 aParserStates[aParserStates.Count() - 1].eCodeSet = eEnc;
698 SetSrcEncoding(eEnc);
// Compiled only when USED is defined: overrides that do nothing but forward
// to the SvParser base-class implementations.
701 #ifdef USED
// Forwards to SvParser::SaveState() with the given token.
702 void SvRTFParser::SaveState( int nToken )
704 SvParser::SaveState( nToken );
// Forwards to SvParser::RestoreState().
707 void SvRTFParser::RestoreState()
709 SvParser::RestoreState();
711 #endif
713 /* vi:set tabstop=4 shiftwidth=4 expandtab: */