tdf#130857 qt weld: Implement QtInstanceWidget::strip_mnemonic
[LibreOffice.git] / svtools / source / svrtf / svparser.cxx
blob419de30e9d0a07b9d5800af45c69f397c4f0fef1
1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 /*
3 * This file is part of the LibreOffice project.
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 * This file incorporates work covered by the following license notice:
11 * Licensed to the Apache Software Foundation (ASF) under one or more
12 * contributor license agreements. See the NOTICE file distributed
13 * with this work for additional information regarding copyright
14 * ownership. The ASF licenses this file to you under the Apache
15 * License, Version 2.0 (the "License"); you may not use this file
16 * except in compliance with the License. You may obtain a copy of
17 * the License at http://www.apache.org/licenses/LICENSE-2.0 .
20 #include <svtools/svparser.hxx>
21 #include <svtools/htmltokn.h>
22 #include <tools/stream.hxx>
23 #include <tools/debug.hxx>
24 #include <rtl/textcvt.h>
25 #include <rtl/tencinfo.h>
26 #include <rtl/character.hxx>
27 #include <sal/log.hxx>
28 #include <unicode/ucsdet.h>
29 #include <comphelper/configuration.hxx>
31 #include <vector>
33 // structure to store the actual data
34 template<typename T>
35 struct SvParser_Impl
37 OUString aToken; // parsed token
38 sal_uInt64 nFilePos; // actual position in stream
39 sal_uInt32 nlLineNr; // actual line number
40 sal_uInt32 nlLinePos; // actual column number
41 tools::Long nTokenValue; // extra value (RTF)
42 bool bTokenHasValue; // indicates whether nTokenValue is valid
43 T nToken; // actual Token
44 sal_uInt32 nNextCh; // actual character
45 T nSaveToken; // the token from Continue
47 rtl_TextToUnicodeConverter hConv;
48 rtl_TextToUnicodeContext hContext;
50 SvParser_Impl()
51 : nFilePos(0)
52 , nlLineNr(0)
53 , nlLinePos(0)
54 , nTokenValue(0)
55 , bTokenHasValue(false)
56 , nToken(static_cast<T>(0))
57 , nNextCh(0)
58 , nSaveToken(static_cast<T>(0))
59 , hConv( nullptr )
60 , hContext( reinterpret_cast<rtl_TextToUnicodeContext>(1) )
67 template<typename T>
68 SvParser<T>::TokenStackType::TokenStackType()
69 : nTokenValue(0)
70 , bTokenHasValue(false)
71 , nTokenId(static_cast<T>(0))
75 // Constructor
76 template<typename T>
77 SvParser<T>::SvParser( SvStream& rIn, sal_uInt8 nStackSize )
78 : rInput( rIn )
79 , nlLineNr( 1 )
80 , nlLinePos( 1 )
81 , nConversionErrors( 0 )
82 , pImplData( nullptr )
83 , m_nTokenIndex(0)
84 , nTokenValue( 0 )
85 , bTokenHasValue( false )
86 , bFuzzing(comphelper::IsFuzzing())
87 , eState( SvParserState::NotStarted )
88 , eSrcEnc( RTL_TEXTENCODING_DONTKNOW )
89 , nNextChPos(0)
90 , nNextCh(0)
91 , bSwitchToUCS2(false)
92 , bRTF_InTextRead(false)
93 , nTokenStackSize( nStackSize )
94 , nTokenStackPos( 0 )
96 eState = SvParserState::NotStarted;
97 if( nTokenStackSize < 3 )
98 nTokenStackSize = 3;
99 pTokenStack.reset(new TokenStackType[ nTokenStackSize ]);
100 pTokenStackPos = pTokenStack.get();
103 template<typename T>
104 SvParser<T>::~SvParser()
106 if( pImplData && pImplData->hConv )
108 rtl_destroyTextToUnicodeContext( pImplData->hConv,
109 pImplData->hContext );
110 rtl_destroyTextToUnicodeConverter( pImplData->hConv );
113 pTokenStack.reset();
116 template<typename T> SvParserState SvParser<T>::GetStatus() const { return eState; }
117 template<typename T> sal_uInt32 SvParser<T>::GetLineNr() const { return nlLineNr; }
118 template<typename T> sal_uInt32 SvParser<T>::GetLinePos() const { return nlLinePos; }
119 template<typename T> void SvParser<T>::IncLineNr() { ++nlLineNr; }
120 template<typename T> sal_uInt32 SvParser<T>::IncLinePos() { return ++nlLinePos; }
121 template<typename T> void SvParser<T>::SetLineNr( sal_uInt32 nlNum ) { nlLineNr = nlNum; }
122 template<typename T> void SvParser<T>::SetLinePos( sal_uInt32 nlPos ) { nlLinePos = nlPos; }
123 template<typename T> bool SvParser<T>::IsParserWorking() const { return SvParserState::Working == eState; }
124 template<typename T> rtl_TextEncoding SvParser<T>::GetSrcEncoding() const { return eSrcEnc; }
125 template<typename T> void SvParser<T>::SetSwitchToUCS2( bool bSet ) { bSwitchToUCS2 = bSet; }
126 template<typename T> bool SvParser<T>::IsSwitchToUCS2() const { return bSwitchToUCS2; }
127 template<typename T> sal_uInt16 SvParser<T>::GetCharSize() const { return (RTL_TEXTENCODING_UCS2 == eSrcEnc) ? 2 : 1; }
128 template<typename T> Link<LinkParamNone*,void> SvParser<T>::GetAsynchCallLink() const
130 return LINK( const_cast<SvParser*>(this), SvParser, NewDataRead );
133 template<typename T>
134 void SvParser<T>::ClearTxtConvContext()
136 if( pImplData && pImplData->hConv )
137 rtl_resetTextToUnicodeContext( pImplData->hConv, pImplData->hContext );
140 template<typename T>
141 void SvParser<T>::SetSrcEncoding( rtl_TextEncoding eEnc )
143 if( eEnc == eSrcEnc )
144 return;
146 if( pImplData && pImplData->hConv )
148 rtl_destroyTextToUnicodeContext( pImplData->hConv,
149 pImplData->hContext );
150 rtl_destroyTextToUnicodeConverter( pImplData->hConv );
151 pImplData->hConv = nullptr;
152 pImplData->hContext = reinterpret_cast<rtl_TextToUnicodeContext>(1);
155 if( rtl_isOctetTextEncoding(eEnc) ||
156 RTL_TEXTENCODING_UCS2 == eEnc )
158 eSrcEnc = eEnc;
159 if( !pImplData )
160 pImplData.reset(new SvParser_Impl<T>);
161 pImplData->hConv = rtl_createTextToUnicodeConverter( eSrcEnc );
162 DBG_ASSERT( pImplData->hConv,
163 "SvParser::SetSrcEncoding: no converter for source encoding" );
164 if( !pImplData->hConv )
165 eSrcEnc = RTL_TEXTENCODING_DONTKNOW;
166 else
167 pImplData->hContext =
168 rtl_createTextToUnicodeContext( pImplData->hConv );
170 else
172 SAL_WARN( "svtools",
173 "SvParser::SetSrcEncoding: invalid source encoding" );
174 eSrcEnc = RTL_TEXTENCODING_DONTKNOW;
178 template<typename T>
179 void SvParser<T>::RereadLookahead()
181 rInput.Seek(nNextChPos);
182 nNextCh = GetNextChar();
// Reads the next character from rInput and returns it as a code point.
//
// - On the first read (stream position 0) with bSwitchToUCS2 set, the stream
//   is asked to start reading unicode text; depending on the resulting stream
//   position the encoding is switched to UCS2 or UTF-8, otherwise ICU charset
//   detection is run on the leading bytes of the stream.
// - With UCS2 as source encoding, UTF-16 units are read and surrogate pairs
//   are combined.
// - For any other encoding, bytes are read and passed through the rtl text
//   converter; failed conversions are counted in nConversionErrors.
//
// Return value: the character read. If the stream is in error and reports
// ERRCODE_IO_PENDING, eState becomes Pending and the value read so far is
// returned; on any other error sal_Unicode(EOF) is returned. On success the
// line number / column bookkeeping is updated.
185 template<typename T>
186 sal_uInt32 SvParser<T>::GetNextChar()
188 sal_uInt32 c = 0U;
190 // When reading multiple bytes, we don't have to care about the file
191 // position when we run into the pending state. The file position is
192 // maintained by SaveState/RestoreState.
193 if( bSwitchToUCS2 && 0 == rInput.Tell() )
195 rInput.StartReadingUnicodeText(RTL_TEXTENCODING_DONTKNOW);
196 if (rInput.good())
198 sal_uInt64 nPos = rInput.Tell();
// position 2 selects UCS2, position 3 selects UTF-8 (presumably the
// length of the byte order mark that was consumed - confirm in SvStream)
199 if (nPos == 2)
200 eSrcEnc = RTL_TEXTENCODING_UCS2;
201 else if (nPos == 3)
202 SetSrcEncoding(RTL_TEXTENCODING_UTF8);
203 else // Try to detect encoding without BOM
205 std::vector<char> buf(65535); // Arbitrarily chosen 64KiB buffer
206 const size_t nSize = rInput.ReadBytes(buf.data(), buf.size());
// rewind: the bytes were only read for detection
207 rInput.Seek(0);
208 if (nSize > 0)
210 UErrorCode uerr = U_ZERO_ERROR;
211 UCharsetDetector* ucd = ucsdet_open(&uerr);
212 ucsdet_setText(ucd, buf.data(), nSize, &uerr);
213 if (const UCharsetMatch* match = ucsdet_detect(ucd, &uerr))
215 const char* pEncodingName = ucsdet_getName(match, &uerr);
217 if (U_SUCCESS(uerr))
// only UTF-8 and the two UTF-16 byte orders are acted upon; any other
// detected name leaves the encoding unchanged
219 if (strcmp("UTF-8", pEncodingName) == 0)
221 SetSrcEncoding(RTL_TEXTENCODING_UTF8);
223 else if (strcmp("UTF-16LE", pEncodingName) == 0)
225 eSrcEnc = RTL_TEXTENCODING_UCS2;
226 rInput.SetEndian(SvStreamEndian::LITTLE);
228 else if (strcmp("UTF-16BE", pEncodingName) == 0)
230 eSrcEnc = RTL_TEXTENCODING_UCS2;
231 rInput.SetEndian(SvStreamEndian::BIG);
236 ucsdet_close(ucd);
// clear the flag so the detection above is not attempted again
240 bSwitchToUCS2 = false;
243 bool bErr;
// remember where this character starts; RereadLookahead() seeks back to it
244 nNextChPos = rInput.Tell();
246 if( RTL_TEXTENCODING_UCS2 == eSrcEnc )
248 sal_Unicode cUC;
249 rInput.ReadUtf16(cUC);
250 bErr = !rInput.good();
251 if( !bErr )
253 c = cUC;
254 if (rtl::isHighSurrogate(cUC))
// try to complete the pair; if the next unit is not a low surrogate,
// seek back so that unit is read again by the next call
256 const sal_uInt64 nPos = rInput.Tell();
257 rInput.ReadUtf16(cUC);
258 if (rtl::isLowSurrogate(cUC)) // can only be true when ReadUtf16 succeeded
259 c = rtl::combineSurrogates(c, cUC);
260 else
261 rInput.Seek(nPos); // process lone high surrogate
265 else
// byte oriented encodings: repeat until a character was produced
// (nChars != 0) or the stream failed
267 sal_Size nChars = 0;
270 char c1; // signed, that's the text converter expects
271 rInput.ReadChar( c1 );
272 bErr = !rInput.good();
273 if( !bErr )
275 if (
276 RTL_TEXTENCODING_DONTKNOW == eSrcEnc ||
277 RTL_TEXTENCODING_SYMBOL == eSrcEnc
280 // no conversion shall take place
281 c = reinterpret_cast<unsigned char&>( c1 );
282 nChars = 1;
284 else
286 assert(pImplData && pImplData->hConv && "no text converter!");
288 sal_Unicode cUC;
289 sal_uInt32 nInfo = 0;
290 sal_Size nCvtBytes;
// first attempt: convert the single byte just read
291 nChars = rtl_convertTextToUnicode(
292 pImplData->hConv, pImplData->hContext,
293 &c1, 1, &cUC, 1,
294 RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_ERROR|
295 RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_ERROR|
296 RTL_TEXTTOUNICODE_FLAGS_INVALID_ERROR,
297 &nInfo, &nCvtBytes);
298 if( (nInfo&RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOOSMALL) != 0 )
300 // The conversion wasn't successful because we haven't
301 // read enough characters.
// a context other than the placeholder value 1 exists: keep feeding
// single bytes through that context until the converter stops asking
// for more input
302 if( pImplData->hContext != reinterpret_cast<rtl_TextToUnicodeContext>(1) )
304 sal_Unicode sCh[2];
305 while( (nInfo&RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOOSMALL) != 0 )
307 rInput.ReadChar( c1 );
308 bErr = !rInput.good();
309 if( bErr )
310 break;
312 nChars = rtl_convertTextToUnicode(
313 pImplData->hConv, pImplData->hContext,
314 &c1, 1, sCh , 2,
315 RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_ERROR|
316 RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_ERROR|
317 RTL_TEXTTOUNICODE_FLAGS_INVALID_ERROR,
318 &nInfo, &nCvtBytes);
320 if( !bErr )
// one unit: take it as is; two units: treat them as a surrogate pair
322 if( 1 == nChars && 0 == nInfo )
324 c = sal_uInt32( sCh[0] );
326 else if( 2 == nChars && 0 == nInfo )
328 c = rtl::combineSurrogates( sCh[0], sCh[1] );
330 else if( 0 != nChars || 0 != nInfo )
332 DBG_ASSERT( (nInfo&RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOOSMALL) == 0,
333 "source buffer is too small" );
334 DBG_ASSERT( (nInfo&~(RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOOSMALL)) == 0,
335 "there is a conversion error" );
336 DBG_ASSERT( 0 == nChars,
337 "there is a converted character, but an error" );
338 // There are still errors, but nothing we can
339 // do
340 c = '?';
341 nChars = 1;
342 ++nConversionErrors;
346 else
// no usable context: collect up to 10 bytes in a local buffer and
// convert the whole buffer again after each added byte, passing no
// context
348 char sBuffer[10];
349 sBuffer[0] = c1;
350 sal_uInt16 nLen = 1;
351 while( (nInfo&RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOOSMALL) != 0 &&
352 nLen < 10 )
354 rInput.ReadChar( c1 );
355 bErr = !rInput.good();
356 if( bErr )
357 break;
359 sBuffer[nLen++] = c1;
360 nChars = rtl_convertTextToUnicode(
361 pImplData->hConv, nullptr, sBuffer, nLen, &cUC, 1,
362 RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_ERROR|
363 RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_ERROR|
364 RTL_TEXTTOUNICODE_FLAGS_INVALID_ERROR,
365 &nInfo, &nCvtBytes);
367 if( !bErr )
369 if( 1 == nChars && 0 == nInfo )
371 DBG_ASSERT( nCvtBytes == nLen,
372 "no all bytes have been converted!" );
373 c = cUC;
375 else
377 DBG_ASSERT( (nInfo&RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOOSMALL) == 0,
378 "source buffer is too small" );
379 DBG_ASSERT( (nInfo&~(RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOOSMALL)) == 0,
380 "there is a conversion error" );
381 DBG_ASSERT( 0 == nChars,
382 "there is a converted character, but an error" );
384 // There are still errors, so we use the first
385 // character and restart after that.
386 c = reinterpret_cast<unsigned char&>( sBuffer[0] );
// step back over all buffered bytes except the first one
387 rInput.SeekRel( -(nLen-1) );
388 nChars = 1;
389 ++nConversionErrors;
394 else if( 1 == nChars && 0 == nInfo )
396 // The conversion was successful
397 DBG_ASSERT( nCvtBytes == 1,
398 "no all bytes have been converted!" );
399 c = cUC;
401 else if( 0 != nChars || 0 != nInfo )
403 DBG_ASSERT( 0 == nChars,
404 "there is a converted character, but an error" );
405 DBG_ASSERT( 0 != nInfo,
406 "there is no converted character and no error" );
407 // #73398#: If the character could not be converted,
408 // because a conversion is not available, do no conversion at all.
409 c = reinterpret_cast<unsigned char&>( c1 );
410 nChars = 1;
411 ++nConversionErrors;
416 while( 0 == nChars && !bErr );
// anything that is not a unicode scalar value (e.g. a lone surrogate) is
// replaced by a question mark
419 if ( ! rtl::isUnicodeScalarValue( c ) )
420 c = '?' ;
// while fuzzing, give up on input with more than 128 conversion errors
422 if (bFuzzing && nConversionErrors > 128)
424 SAL_WARN("svtools", "SvParser::GetNextChar too many conversion errors while fuzzing, abandoning for performance");
425 bErr = true;
428 if( bErr )
// pending I/O is not the end of input: flag the parser as Pending and hand
// back what was read; every other error is reported as EOF
430 if( ERRCODE_IO_PENDING == rInput.GetError() )
432 eState = SvParserState::Pending;
433 return c;
435 else
436 return sal_Unicode(EOF);
// keep line / column bookkeeping in step with the character returned
439 if( c == '\n' )
441 IncLineNr();
442 SetLinePos( 1 );
444 else
445 IncLinePos();
447 return c;
450 template<typename T>
451 T SvParser<T>::GetNextToken()
453 T nRet = static_cast<T>(0);
455 if( !nTokenStackPos )
457 aToken.setLength( 0 ); // empty token buffer
458 nTokenValue = -1; // marker for no value read
459 bTokenHasValue = false;
461 nRet = GetNextToken_();
462 if( SvParserState::Pending == eState )
463 return nRet;
466 ++pTokenStackPos;
467 if( pTokenStackPos == pTokenStack.get() + nTokenStackSize )
468 pTokenStackPos = pTokenStack.get();
470 // pop from stack ??
471 if( nTokenStackPos )
473 --nTokenStackPos;
474 nTokenValue = pTokenStackPos->nTokenValue;
475 bTokenHasValue = pTokenStackPos->bTokenHasValue;
476 aToken = pTokenStackPos->sToken;
477 nRet = pTokenStackPos->nTokenId;
478 ++m_nTokenIndex;
480 // no, now push actual value on stack
481 else if( SvParserState::Working == eState )
483 pTokenStackPos->sToken = aToken;
484 pTokenStackPos->nTokenValue = nTokenValue;
485 pTokenStackPos->bTokenHasValue = bTokenHasValue;
486 pTokenStackPos->nTokenId = nRet;
487 ++m_nTokenIndex;
489 else if( SvParserState::Accepted != eState && SvParserState::Pending != eState )
490 eState = SvParserState::Error; // an error occurred
492 return nRet;
495 template<typename T>
496 T SvParser<T>::SkipToken( short nCnt ) // "skip" n Tokens backward
498 pTokenStackPos = GetStackPtr( nCnt );
499 short nTmp = nTokenStackPos - nCnt;
500 if( nTmp < 0 )
501 nTmp = 0;
502 else if( nTmp > nTokenStackSize )
503 nTmp = nTokenStackSize;
504 nTokenStackPos = sal_uInt8(nTmp);
506 m_nTokenIndex -= nTmp;
508 // restore values
509 aToken = pTokenStackPos->sToken;
510 nTokenValue = pTokenStackPos->nTokenValue;
511 bTokenHasValue = pTokenStackPos->bTokenHasValue;
513 return pTokenStackPos->nTokenId;
516 template<typename T>
517 typename SvParser<T>::TokenStackType* SvParser<T>::GetStackPtr( short nCnt )
519 sal_uInt8 nCurrentPos = sal_uInt8(pTokenStackPos - pTokenStack.get());
520 if( nCnt > 0 )
522 if( nCnt >= nTokenStackSize )
523 nCnt = (nTokenStackSize-1);
524 if( nCurrentPos + nCnt < nTokenStackSize )
525 nCurrentPos = sal::static_int_cast< sal_uInt8 >(nCurrentPos + nCnt);
526 else
527 nCurrentPos = sal::static_int_cast< sal_uInt8 >(
528 nCurrentPos + (nCnt - nTokenStackSize));
530 else if( nCnt < 0 )
532 if( -nCnt >= nTokenStackSize )
533 nCnt = -nTokenStackSize+1;
534 if( -nCnt <= nCurrentPos )
535 nCurrentPos = sal::static_int_cast< sal_uInt8 >(nCurrentPos + nCnt);
536 else
537 nCurrentPos = sal::static_int_cast< sal_uInt8 >(
538 nCurrentPos + (nCnt + nTokenStackSize));
540 return pTokenStack.get() + nCurrentPos;
543 // support for reading asynchronously from the SvStream
545 template<typename T>
546 T SvParser<T>::GetSaveToken() const
548 return pImplData ? pImplData->nSaveToken : static_cast<T>(0);
551 template<typename T>
552 void SvParser<T>::SaveState( T nToken )
554 // save actual status
555 if( !pImplData )
557 pImplData.reset(new SvParser_Impl<T>);
558 pImplData->nSaveToken = static_cast<T>(0);
561 pImplData->nFilePos = rInput.Tell();
562 pImplData->nToken = nToken;
564 pImplData->aToken = aToken;
565 pImplData->nlLineNr = nlLineNr;
566 pImplData->nlLinePos = nlLinePos;
567 pImplData->nTokenValue= nTokenValue;
568 pImplData->bTokenHasValue = bTokenHasValue;
569 pImplData->nNextCh = nNextCh;
572 template<typename T>
573 void SvParser<T>::RestoreState()
575 // restore old status
576 if( !pImplData )
577 return;
579 if( ERRCODE_IO_PENDING == rInput.GetError() )
580 rInput.ResetError();
581 aToken = pImplData->aToken;
582 nlLineNr = pImplData->nlLineNr;
583 nlLinePos = pImplData->nlLinePos;
584 nTokenValue= pImplData->nTokenValue;
585 bTokenHasValue=pImplData->bTokenHasValue;
586 nNextCh = pImplData->nNextCh;
588 pImplData->nSaveToken = pImplData->nToken;
590 rInput.Seek( pImplData->nFilePos );
593 template<typename T>
594 void SvParser<T>::Continue( T )
599 // expanded out version of
600 // IMPL_LINK_NOARG( SvParser, NewDataRead, LinkParamNone*, void )
601 // since it can't cope with template methods
602 template<typename T>
603 void SvParser<T>::LinkStubNewDataRead(void * instance, LinkParamNone* data) {
604 return static_cast<SvParser<T> *>(instance)->NewDataRead(data);
606 template<typename T>
607 void SvParser<T>::NewDataRead(SAL_UNUSED_PARAMETER LinkParamNone*)
609 switch( eState )
611 case SvParserState::Pending:
612 eState = SvParserState::Working;
613 RestoreState();
615 Continue( pImplData->nToken );
617 if( ERRCODE_IO_PENDING == rInput.GetError() )
618 rInput.ResetError();
620 if( SvParserState::Pending != eState )
621 ReleaseRef(); // ready otherwise!
622 break;
624 case SvParserState::NotStarted:
625 case SvParserState::Working:
626 break;
628 default:
629 ReleaseRef(); // ready otherwise!
630 break;
// Explicit instantiations of the parser template for the two token types
// used: plain int and HtmlTokenId.
634 template class SVT_DLLPUBLIC SvParser<int>;
635 template class SVT_DLLPUBLIC SvParser<HtmlTokenId>;
637 /*========================================================================
639 * SvKeyValueIterator.
641 *======================================================================*/
643 typedef std::vector<SvKeyValue> SvKeyValueList_Impl;
645 struct SvKeyValueIterator::Impl
647 SvKeyValueList_Impl maList;
648 sal_uInt16 mnPos;
650 Impl() : mnPos(0) {}
653 SvKeyValueIterator::SvKeyValueIterator() : mpImpl(new Impl) {}
655 SvKeyValueIterator::~SvKeyValueIterator() = default;
657 bool SvKeyValueIterator::GetFirst (SvKeyValue &rKeyVal)
659 mpImpl->mnPos = mpImpl->maList.size();
660 return GetNext (rKeyVal);
663 bool SvKeyValueIterator::GetNext (SvKeyValue &rKeyVal)
665 if (mpImpl->mnPos > 0)
667 rKeyVal = mpImpl->maList[--mpImpl->mnPos];
668 return true;
670 else
672 // Nothing to do.
673 return false;
677 void SvKeyValueIterator::Append (const SvKeyValue &rKeyVal)
679 mpImpl->maList.push_back(rKeyVal);
682 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */