Version 7.6.3.2-android, tag libreoffice-7.6.3.2-android
[LibreOffice.git] / svtools / source / svrtf / svparser.cxx
blob1a8e73d0edb6cebe7c1e35b6f6c4cff0c6730ebc
/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
/*
 * This file is part of the LibreOffice project.
 *
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
 *
 * This file incorporates work covered by the following license notice:
 *
 *   Licensed to the Apache Software Foundation (ASF) under one or more
 *   contributor license agreements. See the NOTICE file distributed
 *   with this work for additional information regarding copyright
 *   ownership. The ASF licenses this file to you under the Apache
 *   License, Version 2.0 (the "License"); you may not use this file
 *   except in compliance with the License. You may obtain a copy of
 *   the License at http://www.apache.org/licenses/LICENSE-2.0 .
 */
20 #include <svtools/svparser.hxx>
21 #include <svtools/htmltokn.h>
22 #include <tools/stream.hxx>
23 #include <tools/debug.hxx>
24 #include <rtl/textcvt.h>
25 #include <rtl/tencinfo.h>
26 #include <rtl/character.hxx>
27 #include <sal/log.hxx>
28 #include <unicode/ucsdet.h>
29 #include <unotools/configmgr.hxx>
31 #include <vector>
33 // structure to store the actual data
34 template<typename T>
35 struct SvParser_Impl
37 OUString aToken; // gescanntes Token
38 sal_uInt64 nFilePos; // actual position in stream
39 sal_uInt32 nlLineNr; // actual line number
40 sal_uInt32 nlLinePos; // actual column number
41 tools::Long nTokenValue; // extra value (RTF)
42 bool bTokenHasValue; // indicates whether nTokenValue is valid
43 T nToken; // actual Token
44 sal_uInt32 nNextCh; // actual character
45 T nSaveToken; // the token from Continue
47 rtl_TextToUnicodeConverter hConv;
48 rtl_TextToUnicodeContext hContext;
50 SvParser_Impl()
51 : nFilePos(0)
52 , nlLineNr(0)
53 , nlLinePos(0)
54 , nTokenValue(0)
55 , bTokenHasValue(false)
56 , nToken(static_cast<T>(0))
57 , nNextCh(0)
58 , nSaveToken(static_cast<T>(0))
59 , hConv( nullptr )
60 , hContext( reinterpret_cast<rtl_TextToUnicodeContext>(1) )
67 template<typename T>
68 SvParser<T>::TokenStackType::TokenStackType()
69 : nTokenValue(0)
70 , bTokenHasValue(false)
71 , nTokenId(static_cast<T>(0))
75 // Constructor
76 template<typename T>
77 SvParser<T>::SvParser( SvStream& rIn, sal_uInt8 nStackSize )
78 : rInput( rIn )
79 , nlLineNr( 1 )
80 , nlLinePos( 1 )
81 , pImplData( nullptr )
82 , m_nTokenIndex(0)
83 , nTokenValue( 0 )
84 , bTokenHasValue( false )
85 , bFuzzing(utl::ConfigManager::IsFuzzing())
86 , eState( SvParserState::NotStarted )
87 , eSrcEnc( RTL_TEXTENCODING_DONTKNOW )
88 , nNextChPos(0)
89 , nNextCh(0)
90 , bSwitchToUCS2(false)
91 , bRTF_InTextRead(false)
92 , nTokenStackSize( nStackSize )
93 , nTokenStackPos( 0 )
95 eState = SvParserState::NotStarted;
96 if( nTokenStackSize < 3 )
97 nTokenStackSize = 3;
98 pTokenStack.reset(new TokenStackType[ nTokenStackSize ]);
99 pTokenStackPos = pTokenStack.get();
102 template<typename T>
103 SvParser<T>::~SvParser()
105 if( pImplData && pImplData->hConv )
107 rtl_destroyTextToUnicodeContext( pImplData->hConv,
108 pImplData->hContext );
109 rtl_destroyTextToUnicodeConverter( pImplData->hConv );
112 pTokenStack.reset();
115 template<typename T> SvParserState SvParser<T>::GetStatus() const { return eState; }
116 template<typename T> sal_uInt32 SvParser<T>::GetLineNr() const { return nlLineNr; }
117 template<typename T> sal_uInt32 SvParser<T>::GetLinePos() const { return nlLinePos; }
118 template<typename T> void SvParser<T>::IncLineNr() { ++nlLineNr; }
119 template<typename T> sal_uInt32 SvParser<T>::IncLinePos() { return ++nlLinePos; }
120 template<typename T> void SvParser<T>::SetLineNr( sal_uInt32 nlNum ) { nlLineNr = nlNum; }
121 template<typename T> void SvParser<T>::SetLinePos( sal_uInt32 nlPos ) { nlLinePos = nlPos; }
122 template<typename T> bool SvParser<T>::IsParserWorking() const { return SvParserState::Working == eState; }
123 template<typename T> rtl_TextEncoding SvParser<T>::GetSrcEncoding() const { return eSrcEnc; }
124 template<typename T> void SvParser<T>::SetSwitchToUCS2( bool bSet ) { bSwitchToUCS2 = bSet; }
125 template<typename T> bool SvParser<T>::IsSwitchToUCS2() const { return bSwitchToUCS2; }
126 template<typename T> sal_uInt16 SvParser<T>::GetCharSize() const { return (RTL_TEXTENCODING_UCS2 == eSrcEnc) ? 2 : 1; }
127 template<typename T> Link<LinkParamNone*,void> SvParser<T>::GetAsynchCallLink() const
129 return LINK( const_cast<SvParser*>(this), SvParser, NewDataRead );
132 template<typename T>
133 void SvParser<T>::ClearTxtConvContext()
135 if( pImplData && pImplData->hConv )
136 rtl_resetTextToUnicodeContext( pImplData->hConv, pImplData->hContext );
139 template<typename T>
140 void SvParser<T>::SetSrcEncoding( rtl_TextEncoding eEnc )
142 if( eEnc == eSrcEnc )
143 return;
145 if( pImplData && pImplData->hConv )
147 rtl_destroyTextToUnicodeContext( pImplData->hConv,
148 pImplData->hContext );
149 rtl_destroyTextToUnicodeConverter( pImplData->hConv );
150 pImplData->hConv = nullptr;
151 pImplData->hContext = reinterpret_cast<rtl_TextToUnicodeContext>(1);
154 if( rtl_isOctetTextEncoding(eEnc) ||
155 RTL_TEXTENCODING_UCS2 == eEnc )
157 eSrcEnc = eEnc;
158 if( !pImplData )
159 pImplData.reset(new SvParser_Impl<T>);
160 pImplData->hConv = rtl_createTextToUnicodeConverter( eSrcEnc );
161 DBG_ASSERT( pImplData->hConv,
162 "SvParser::SetSrcEncoding: no converter for source encoding" );
163 if( !pImplData->hConv )
164 eSrcEnc = RTL_TEXTENCODING_DONTKNOW;
165 else
166 pImplData->hContext =
167 rtl_createTextToUnicodeContext( pImplData->hConv );
169 else
171 SAL_WARN( "svtools",
172 "SvParser::SetSrcEncoding: invalid source encoding" );
173 eSrcEnc = RTL_TEXTENCODING_DONTKNOW;
177 template<typename T>
178 void SvParser<T>::RereadLookahead()
180 rInput.Seek(nNextChPos);
181 nNextCh = GetNextChar();
184 template<typename T>
185 sal_uInt32 SvParser<T>::GetNextChar()
187 sal_uInt32 c = 0U;
189 // When reading multiple bytes, we don't have to care about the file
190 // position when we run into the pending state. The file position is
191 // maintained by SaveState/RestoreState.
192 if( bSwitchToUCS2 && 0 == rInput.Tell() )
194 rInput.StartReadingUnicodeText(RTL_TEXTENCODING_DONTKNOW);
195 if (rInput.good())
197 sal_uInt64 nPos = rInput.Tell();
198 if (nPos == 2)
199 eSrcEnc = RTL_TEXTENCODING_UCS2;
200 else if (nPos == 3)
201 SetSrcEncoding(RTL_TEXTENCODING_UTF8);
202 else // Try to detect encoding without BOM
204 std::vector<char> buf(65535); // Arbitrarily chosen 64KiB buffer
205 const size_t nSize = rInput.ReadBytes(buf.data(), buf.size());
206 rInput.Seek(0);
207 if (nSize > 0)
209 UErrorCode uerr = U_ZERO_ERROR;
210 UCharsetDetector* ucd = ucsdet_open(&uerr);
211 ucsdet_setText(ucd, buf.data(), nSize, &uerr);
212 if (const UCharsetMatch* match = ucsdet_detect(ucd, &uerr))
214 const char* pEncodingName = ucsdet_getName(match, &uerr);
216 if (U_SUCCESS(uerr))
218 if (strcmp("UTF-8", pEncodingName) == 0)
220 SetSrcEncoding(RTL_TEXTENCODING_UTF8);
222 else if (strcmp("UTF-16LE", pEncodingName) == 0)
224 eSrcEnc = RTL_TEXTENCODING_UCS2;
225 rInput.SetEndian(SvStreamEndian::LITTLE);
227 else if (strcmp("UTF-16BE", pEncodingName) == 0)
229 eSrcEnc = RTL_TEXTENCODING_UCS2;
230 rInput.SetEndian(SvStreamEndian::BIG);
235 ucsdet_close(ucd);
239 bSwitchToUCS2 = false;
242 bool bErr;
243 nNextChPos = rInput.Tell();
245 if( RTL_TEXTENCODING_UCS2 == eSrcEnc )
247 sal_Unicode cUC;
248 rInput.ReadUtf16(cUC);
249 bErr = !rInput.good();
250 if( !bErr )
252 c = cUC;
253 if (rtl::isHighSurrogate(cUC))
255 const sal_uInt64 nPos = rInput.Tell();
256 rInput.ReadUtf16(cUC);
257 if (rtl::isLowSurrogate(cUC)) // can only be true when ReadUtf16 succeeded
258 c = rtl::combineSurrogates(c, cUC);
259 else
260 rInput.Seek(nPos); // process lone high surrogate
264 else
266 sal_Size nChars = 0;
269 char c1; // signed, that's the text converter expects
270 rInput.ReadChar( c1 );
271 bErr = !rInput.good();
272 if( !bErr )
274 if (
275 RTL_TEXTENCODING_DONTKNOW == eSrcEnc ||
276 RTL_TEXTENCODING_SYMBOL == eSrcEnc
279 // no conversion shall take place
280 c = reinterpret_cast<unsigned char&>( c1 );
281 nChars = 1;
283 else
285 assert(pImplData && pImplData->hConv && "no text converter!");
287 sal_Unicode cUC;
288 sal_uInt32 nInfo = 0;
289 sal_Size nCvtBytes;
290 nChars = rtl_convertTextToUnicode(
291 pImplData->hConv, pImplData->hContext,
292 &c1, 1, &cUC, 1,
293 RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_ERROR|
294 RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_ERROR|
295 RTL_TEXTTOUNICODE_FLAGS_INVALID_ERROR,
296 &nInfo, &nCvtBytes);
297 if( (nInfo&RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOOSMALL) != 0 )
299 // The conversion wasn't successful because we haven't
300 // read enough characters.
301 if( pImplData->hContext != reinterpret_cast<rtl_TextToUnicodeContext>(1) )
303 sal_Unicode sCh[2];
304 while( (nInfo&RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOOSMALL) != 0 )
306 rInput.ReadChar( c1 );
307 bErr = !rInput.good();
308 if( bErr )
309 break;
311 nChars = rtl_convertTextToUnicode(
312 pImplData->hConv, pImplData->hContext,
313 &c1, 1, sCh , 2,
314 RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_ERROR|
315 RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_ERROR|
316 RTL_TEXTTOUNICODE_FLAGS_INVALID_ERROR,
317 &nInfo, &nCvtBytes);
319 if( !bErr )
321 if( 1 == nChars && 0 == nInfo )
323 c = sal_uInt32( sCh[0] );
325 else if( 2 == nChars && 0 == nInfo )
327 c = rtl::combineSurrogates( sCh[0], sCh[1] );
329 else if( 0 != nChars || 0 != nInfo )
331 DBG_ASSERT( (nInfo&RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOOSMALL) == 0,
332 "source buffer is too small" );
333 DBG_ASSERT( (nInfo&~(RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOOSMALL)) == 0,
334 "there is a conversion error" );
335 DBG_ASSERT( 0 == nChars,
336 "there is a converted character, but an error" );
337 // There are still errors, but nothing we can
338 // do
339 c = '?';
340 nChars = 1;
344 else
346 char sBuffer[10];
347 sBuffer[0] = c1;
348 sal_uInt16 nLen = 1;
349 while( (nInfo&RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOOSMALL) != 0 &&
350 nLen < 10 )
352 rInput.ReadChar( c1 );
353 bErr = !rInput.good();
354 if( bErr )
355 break;
357 sBuffer[nLen++] = c1;
358 nChars = rtl_convertTextToUnicode(
359 pImplData->hConv, nullptr, sBuffer, nLen, &cUC, 1,
360 RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_ERROR|
361 RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_ERROR|
362 RTL_TEXTTOUNICODE_FLAGS_INVALID_ERROR,
363 &nInfo, &nCvtBytes);
365 if( !bErr )
367 if( 1 == nChars && 0 == nInfo )
369 DBG_ASSERT( nCvtBytes == nLen,
370 "no all bytes have been converted!" );
371 c = cUC;
373 else
375 DBG_ASSERT( (nInfo&RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOOSMALL) == 0,
376 "source buffer is too small" );
377 DBG_ASSERT( (nInfo&~(RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOOSMALL)) == 0,
378 "there is a conversion error" );
379 DBG_ASSERT( 0 == nChars,
380 "there is a converted character, but an error" );
382 // There are still errors, so we use the first
383 // character and restart after that.
384 c = reinterpret_cast<unsigned char&>( sBuffer[0] );
385 rInput.SeekRel( -(nLen-1) );
386 nChars = 1;
391 else if( 1 == nChars && 0 == nInfo )
393 // The conversion was successful
394 DBG_ASSERT( nCvtBytes == 1,
395 "no all bytes have been converted!" );
396 c = cUC;
398 else if( 0 != nChars || 0 != nInfo )
400 DBG_ASSERT( 0 == nChars,
401 "there is a converted character, but an error" );
402 DBG_ASSERT( 0 != nInfo,
403 "there is no converted character and no error" );
404 // #73398#: If the character could not be converted,
405 // because a conversion is not available, do no conversion at all.
406 c = reinterpret_cast<unsigned char&>( c1 );
407 nChars = 1;
413 while( 0 == nChars && !bErr );
416 if ( ! rtl::isUnicodeScalarValue( c ) )
417 c = '?' ;
419 if( bErr )
421 if( ERRCODE_IO_PENDING == rInput.GetError() )
423 eState = SvParserState::Pending;
424 return c;
426 else
427 return sal_Unicode(EOF);
430 if( c == '\n' )
432 IncLineNr();
433 SetLinePos( 1 );
435 else
436 IncLinePos();
438 return c;
441 template<typename T>
442 T SvParser<T>::GetNextToken()
444 T nRet = static_cast<T>(0);
446 if( !nTokenStackPos )
448 aToken.setLength( 0 ); // empty token buffer
449 nTokenValue = -1; // marker for no value read
450 bTokenHasValue = false;
452 nRet = GetNextToken_();
453 if( SvParserState::Pending == eState )
454 return nRet;
457 ++pTokenStackPos;
458 if( pTokenStackPos == pTokenStack.get() + nTokenStackSize )
459 pTokenStackPos = pTokenStack.get();
461 // pop from stack ??
462 if( nTokenStackPos )
464 --nTokenStackPos;
465 nTokenValue = pTokenStackPos->nTokenValue;
466 bTokenHasValue = pTokenStackPos->bTokenHasValue;
467 aToken = pTokenStackPos->sToken;
468 nRet = pTokenStackPos->nTokenId;
469 ++m_nTokenIndex;
471 // no, now push actual value on stack
472 else if( SvParserState::Working == eState )
474 pTokenStackPos->sToken = aToken;
475 pTokenStackPos->nTokenValue = nTokenValue;
476 pTokenStackPos->bTokenHasValue = bTokenHasValue;
477 pTokenStackPos->nTokenId = nRet;
478 ++m_nTokenIndex;
480 else if( SvParserState::Accepted != eState && SvParserState::Pending != eState )
481 eState = SvParserState::Error; // an error occurred
483 return nRet;
486 template<typename T>
487 T SvParser<T>::SkipToken( short nCnt ) // "skip" n Tokens backward
489 pTokenStackPos = GetStackPtr( nCnt );
490 short nTmp = nTokenStackPos - nCnt;
491 if( nTmp < 0 )
492 nTmp = 0;
493 else if( nTmp > nTokenStackSize )
494 nTmp = nTokenStackSize;
495 nTokenStackPos = sal_uInt8(nTmp);
497 m_nTokenIndex -= nTmp;
499 // restore values
500 aToken = pTokenStackPos->sToken;
501 nTokenValue = pTokenStackPos->nTokenValue;
502 bTokenHasValue = pTokenStackPos->bTokenHasValue;
504 return pTokenStackPos->nTokenId;
507 template<typename T>
508 typename SvParser<T>::TokenStackType* SvParser<T>::GetStackPtr( short nCnt )
510 sal_uInt8 nCurrentPos = sal_uInt8(pTokenStackPos - pTokenStack.get());
511 if( nCnt > 0 )
513 if( nCnt >= nTokenStackSize )
514 nCnt = (nTokenStackSize-1);
515 if( nCurrentPos + nCnt < nTokenStackSize )
516 nCurrentPos = sal::static_int_cast< sal_uInt8 >(nCurrentPos + nCnt);
517 else
518 nCurrentPos = sal::static_int_cast< sal_uInt8 >(
519 nCurrentPos + (nCnt - nTokenStackSize));
521 else if( nCnt < 0 )
523 if( -nCnt >= nTokenStackSize )
524 nCnt = -nTokenStackSize+1;
525 if( -nCnt <= nCurrentPos )
526 nCurrentPos = sal::static_int_cast< sal_uInt8 >(nCurrentPos + nCnt);
527 else
528 nCurrentPos = sal::static_int_cast< sal_uInt8 >(
529 nCurrentPos + (nCnt + nTokenStackSize));
531 return pTokenStack.get() + nCurrentPos;
534 // to read asynchronous from SvStream
536 template<typename T>
537 T SvParser<T>::GetSaveToken() const
539 return pImplData ? pImplData->nSaveToken : static_cast<T>(0);
542 template<typename T>
543 void SvParser<T>::SaveState( T nToken )
545 // save actual status
546 if( !pImplData )
548 pImplData.reset(new SvParser_Impl<T>);
549 pImplData->nSaveToken = static_cast<T>(0);
552 pImplData->nFilePos = rInput.Tell();
553 pImplData->nToken = nToken;
555 pImplData->aToken = aToken;
556 pImplData->nlLineNr = nlLineNr;
557 pImplData->nlLinePos = nlLinePos;
558 pImplData->nTokenValue= nTokenValue;
559 pImplData->bTokenHasValue = bTokenHasValue;
560 pImplData->nNextCh = nNextCh;
563 template<typename T>
564 void SvParser<T>::RestoreState()
566 // restore old status
567 if( !pImplData )
568 return;
570 if( ERRCODE_IO_PENDING == rInput.GetError() )
571 rInput.ResetError();
572 aToken = pImplData->aToken;
573 nlLineNr = pImplData->nlLineNr;
574 nlLinePos = pImplData->nlLinePos;
575 nTokenValue= pImplData->nTokenValue;
576 bTokenHasValue=pImplData->bTokenHasValue;
577 nNextCh = pImplData->nNextCh;
579 pImplData->nSaveToken = pImplData->nToken;
581 rInput.Seek( pImplData->nFilePos );
584 template<typename T>
585 void SvParser<T>::Continue( T )
590 // expanded out version of
591 // IMPL_LINK_NOARG( SvParser, NewDataRead, LinkParamNone*, void )
592 // since it can't cope with template methods
593 template<typename T>
594 void SvParser<T>::LinkStubNewDataRead(void * instance, LinkParamNone* data) {
595 return static_cast<SvParser<T> *>(instance)->NewDataRead(data);
597 template<typename T>
598 void SvParser<T>::NewDataRead(SAL_UNUSED_PARAMETER LinkParamNone*)
600 switch( eState )
602 case SvParserState::Pending:
603 eState = SvParserState::Working;
604 RestoreState();
606 Continue( pImplData->nToken );
608 if( ERRCODE_IO_PENDING == rInput.GetError() )
609 rInput.ResetError();
611 if( SvParserState::Pending != eState )
612 ReleaseRef(); // ready otherwise!
613 break;
615 case SvParserState::NotStarted:
616 case SvParserState::Working:
617 break;
619 default:
620 ReleaseRef(); // ready otherwise!
621 break;
625 template class SVT_DLLPUBLIC SvParser<int>;
626 template class SVT_DLLPUBLIC SvParser<HtmlTokenId>;
/*========================================================================
 *
 * SvKeyValueIterator.
 *
 *======================================================================*/
634 typedef std::vector<SvKeyValue> SvKeyValueList_Impl;
636 struct SvKeyValueIterator::Impl
638 SvKeyValueList_Impl maList;
639 sal_uInt16 mnPos;
641 Impl() : mnPos(0) {}
644 SvKeyValueIterator::SvKeyValueIterator() : mpImpl(new Impl) {}
646 SvKeyValueIterator::~SvKeyValueIterator() = default;
648 bool SvKeyValueIterator::GetFirst (SvKeyValue &rKeyVal)
650 mpImpl->mnPos = mpImpl->maList.size();
651 return GetNext (rKeyVal);
654 bool SvKeyValueIterator::GetNext (SvKeyValue &rKeyVal)
656 if (mpImpl->mnPos > 0)
658 rKeyVal = mpImpl->maList[--mpImpl->mnPos];
659 return true;
661 else
663 // Nothing to do.
664 return false;
668 void SvKeyValueIterator::Append (const SvKeyValue &rKeyVal)
670 mpImpl->maList.push_back(rKeyVal);
/* vim:set shiftwidth=4 softtabstop=4 expandtab: */