1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
3 * This file is part of the LibreOffice project.
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 * This file incorporates work covered by the following license notice:
11 * Licensed to the Apache Software Foundation (ASF) under one or more
12 * contributor license agreements. See the NOTICE file distributed
13 * with this work for additional information regarding copyright
14 * ownership. The ASF licenses this file to you under the Apache
15 * License, Version 2.0 (the "License"); you may not use this file
16 * except in compliance with the License. You may obtain a copy of
17 * the License at http://www.apache.org/licenses/LICENSE-2.0 .
20 #include <svtools/svparser.hxx>
21 #include <svtools/htmltokn.h>
22 #include <tools/stream.hxx>
23 #include <tools/debug.hxx>
24 #include <rtl/textcvt.h>
25 #include <rtl/tencinfo.h>
26 #include <rtl/character.hxx>
27 #include <sal/log.hxx>
28 #include <unicode/ucsdet.h>
29 #include <comphelper/configuration.hxx>
33 // structure to store the current parser data
// Holds the values that SaveState() copies out of the parser and that
// RestoreState() copies back, plus the handles of the text converter.
// NOTE(review): the struct header and parts of its constructor are not
// visible in this excerpt.
37 OUString aToken
; // parsed token
38 sal_uInt64 nFilePos
; // current position in stream
39 sal_uInt32 nlLineNr
; // current line number
40 sal_uInt32 nlLinePos
; // current column number
41 tools::Long nTokenValue
; // extra value (RTF)
42 bool bTokenHasValue
; // indicates whether nTokenValue is valid
43 T nToken
; // current token
44 sal_uInt32 nNextCh
; // current character
45 T nSaveToken
; // the token from Continue
47 rtl_TextToUnicodeConverter hConv
; // text converter handle; set to nullptr when the converter is destroyed
48 rtl_TextToUnicodeContext hContext
; // conversion context; the value 1 is used as "no context created" marker
// Constructor initializers (only partially visible here).
55 , bTokenHasValue(false)
56 , nToken(static_cast<T
>(0))
58 , nSaveToken(static_cast<T
>(0))
// 1 is the placeholder value that the conversion code compares against before
// it relies on the context.
60 , hContext( reinterpret_cast<rtl_TextToUnicodeContext
>(1) )
// Default constructor of a token stack entry. The visible initializers mark
// the entry as carrying no value and set its token id to T(0); any further
// initializers are not visible in this excerpt.
68 SvParser
<T
>::TokenStackType::TokenStackType()
70 , bTokenHasValue(false)
71 , nTokenId(static_cast<T
>(0))
// Creates a parser for the stream rIn with a token stack of nStackSize
// entries. The parser starts in state NotStarted with an unknown source
// encoding and no private data allocated.
// NOTE(review): several initializers and the statement guarded by the
// "nTokenStackSize < 3" check are not visible in this excerpt; presumably a
// minimum stack size of 3 is enforced there - confirm.
77 SvParser
<T
>::SvParser( SvStream
& rIn
, sal_uInt8 nStackSize
)
81 , nConversionErrors( 0 )
82 , pImplData( nullptr )
85 , bTokenHasValue( false )
86 , bFuzzing(comphelper::IsFuzzing())
87 , eState( SvParserState::NotStarted
)
88 , eSrcEnc( RTL_TEXTENCODING_DONTKNOW
)
91 , bSwitchToUCS2(false)
92 , bRTF_InTextRead(false)
93 , nTokenStackSize( nStackSize
)
96 eState
= SvParserState::NotStarted
;
97 if( nTokenStackSize
< 3 )
// allocate the token stack and start at its first entry
99 pTokenStack
.reset(new TokenStackType
[ nTokenStackSize
]);
100 pTokenStackPos
= pTokenStack
.get();
// Releases the text converter: if private data with a converter exists, the
// conversion context is destroyed first and the converter afterwards.
104 SvParser
<T
>::~SvParser()
106 if( pImplData
&& pImplData
->hConv
)
108 rtl_destroyTextToUnicodeContext( pImplData
->hConv
,
109 pImplData
->hContext
);
110 rtl_destroyTextToUnicodeConverter( pImplData
->hConv
);
116 template<typename T
> SvParserState SvParser
<T
>::GetStatus() const { return eState
; }
117 template<typename T
> sal_uInt32 SvParser
<T
>::GetLineNr() const { return nlLineNr
; }
118 template<typename T
> sal_uInt32 SvParser
<T
>::GetLinePos() const { return nlLinePos
; }
119 template<typename T
> void SvParser
<T
>::IncLineNr() { ++nlLineNr
; }
120 template<typename T
> sal_uInt32 SvParser
<T
>::IncLinePos() { return ++nlLinePos
; }
121 template<typename T
> void SvParser
<T
>::SetLineNr( sal_uInt32 nlNum
) { nlLineNr
= nlNum
; }
122 template<typename T
> void SvParser
<T
>::SetLinePos( sal_uInt32 nlPos
) { nlLinePos
= nlPos
; }
123 template<typename T
> bool SvParser
<T
>::IsParserWorking() const { return SvParserState::Working
== eState
; }
124 template<typename T
> rtl_TextEncoding SvParser
<T
>::GetSrcEncoding() const { return eSrcEnc
; }
125 template<typename T
> void SvParser
<T
>::SetSwitchToUCS2( bool bSet
) { bSwitchToUCS2
= bSet
; }
126 template<typename T
> bool SvParser
<T
>::IsSwitchToUCS2() const { return bSwitchToUCS2
; }
127 template<typename T
> sal_uInt16 SvParser
<T
>::GetCharSize() const { return (RTL_TEXTENCODING_UCS2
== eSrcEnc
) ? 2 : 1; }
// Returns a Link that refers to this parser's NewDataRead handler. The
// const_cast removes the constness `this` has inside a const member function.
128 template<typename T
> Link
<LinkParamNone
*,void> SvParser
<T
>::GetAsynchCallLink() const
130 return LINK( const_cast<SvParser
*>(this), SvParser
, NewDataRead
);
// Resets the conversion context of the text converter. Does nothing when no
// private data or no converter exists.
134 void SvParser
<T
>::ClearTxtConvContext()
136 if( pImplData
&& pImplData
->hConv
)
137 rtl_resetTextToUnicodeContext( pImplData
->hConv
, pImplData
->hContext
);
// Switches the source encoding to eEnc.
// Visible steps: an existing converter and its context are destroyed and both
// handles are reset; for an octet encoding or UCS2 a converter for eSrcEnc
// and a matching context are created, and eSrcEnc falls back to
// RTL_TEXTENCODING_DONTKNOW when no converter could be created; the last
// assignment (next to the "invalid source encoding" message) also selects
// RTL_TEXTENCODING_DONTKNOW.
// NOTE(review): the statement following the "eEnc == eSrcEnc" check, the
// assignment of eEnc to eSrcEnc and the guard around the pImplData allocation
// are on lines that are not visible in this excerpt.
141 void SvParser
<T
>::SetSrcEncoding( rtl_TextEncoding eEnc
)
143 if( eEnc
== eSrcEnc
)
// tear down the converter of the previous encoding
146 if( pImplData
&& pImplData
->hConv
)
148 rtl_destroyTextToUnicodeContext( pImplData
->hConv
,
149 pImplData
->hContext
);
150 rtl_destroyTextToUnicodeConverter( pImplData
->hConv
);
151 pImplData
->hConv
= nullptr;
// back to the "no context created" marker value
152 pImplData
->hContext
= reinterpret_cast<rtl_TextToUnicodeContext
>(1);
155 if( rtl_isOctetTextEncoding(eEnc
) ||
156 RTL_TEXTENCODING_UCS2
== eEnc
)
160 pImplData
.reset(new SvParser_Impl
<T
>);
161 pImplData
->hConv
= rtl_createTextToUnicodeConverter( eSrcEnc
);
162 DBG_ASSERT( pImplData
->hConv
,
163 "SvParser::SetSrcEncoding: no converter for source encoding" );
// without a converter the encoding is treated as unknown
164 if( !pImplData
->hConv
)
165 eSrcEnc
= RTL_TEXTENCODING_DONTKNOW
;
167 pImplData
->hContext
=
168 rtl_createTextToUnicodeContext( pImplData
->hConv
);
173 "SvParser::SetSrcEncoding: invalid source encoding" );
174 eSrcEnc
= RTL_TEXTENCODING_DONTKNOW
;
// Seeks the input stream back to nNextChPos (the position GetNextChar()
// records before it reads) and reads the lookahead character nNextCh again.
179 void SvParser
<T
>::RereadLookahead()
181 rInput
.Seek(nNextChPos
);
182 nNextCh
= GetNextChar();
// Reads the next character from rInput and returns it as a code point.
// - On the first read (bSwitchToUCS2 set and stream position 0) the encoding
//   is determined: via StartReadingUnicodeText(), or, in the "without BOM"
//   branch, by running the ICU charset detector over up to 65535 bytes of
//   input. bSwitchToUCS2 is cleared afterwards.
// - UCS2 input is read with ReadUtf16(); a high surrogate followed by a low
//   surrogate is combined into one code point, a lone high surrogate is
//   processed on its own after seeking back.
// - Other encodings are read byte by byte and passed through the text
//   converter, except for DONTKNOW/SYMBOL where the byte is used unconverted.
// - The visible tail sets eState to Pending when the stream reports
//   ERRCODE_IO_PENDING and returns sal_Unicode(EOF).
// NOTE(review): many lines of this function (braces, else branches, some
// conditions and call arguments) are not visible in this excerpt, so this
// summary only covers what can be seen here.
186 sal_uInt32 SvParser
<T
>::GetNextChar()
190 // When reading multiple bytes, we don't have to care about the file
191 // position when we run into the pending state. The file position is
192 // maintained by SaveState/RestoreState.
193 if( bSwitchToUCS2
&& 0 == rInput
.Tell() )
195 rInput
.StartReadingUnicodeText(RTL_TEXTENCODING_DONTKNOW
);
198 sal_uInt64 nPos
= rInput
.Tell();
200 eSrcEnc
= RTL_TEXTENCODING_UCS2
;
202 SetSrcEncoding(RTL_TEXTENCODING_UTF8
);
203 else // Try to detect encoding without BOM
205 std::vector
<char> buf(65535); // Arbitrarily chosen buffer size (64KiB minus one byte)
206 const size_t nSize
= rInput
.ReadBytes(buf
.data(), buf
.size());
// let ICU guess the charset from the bytes that were actually read
210 UErrorCode uerr
= U_ZERO_ERROR
;
211 UCharsetDetector
* ucd
= ucsdet_open(&uerr
);
212 ucsdet_setText(ucd
, buf
.data(), nSize
, &uerr
);
213 if (const UCharsetMatch
* match
= ucsdet_detect(ucd
, &uerr
))
215 const char* pEncodingName
= ucsdet_getName(match
, &uerr
);
// only UTF-8 and the two UTF-16 byte orders are acted upon in the visible code
219 if (strcmp("UTF-8", pEncodingName
) == 0)
221 SetSrcEncoding(RTL_TEXTENCODING_UTF8
);
223 else if (strcmp("UTF-16LE", pEncodingName
) == 0)
225 eSrcEnc
= RTL_TEXTENCODING_UCS2
;
226 rInput
.SetEndian(SvStreamEndian::LITTLE
);
228 else if (strcmp("UTF-16BE", pEncodingName
) == 0)
230 eSrcEnc
= RTL_TEXTENCODING_UCS2
;
231 rInput
.SetEndian(SvStreamEndian::BIG
);
240 bSwitchToUCS2
= false;
// remember where this character starts; RereadLookahead() seeks back to it
244 nNextChPos
= rInput
.Tell();
246 if( RTL_TEXTENCODING_UCS2
== eSrcEnc
)
249 rInput
.ReadUtf16(cUC
);
250 bErr
= !rInput
.good();
254 if (rtl::isHighSurrogate(cUC
))
256 const sal_uInt64 nPos
= rInput
.Tell();
257 rInput
.ReadUtf16(cUC
);
258 if (rtl::isLowSurrogate(cUC
)) // can only be true when ReadUtf16 succeeded
259 c
= rtl::combineSurrogates(c
, cUC
);
261 rInput
.Seek(nPos
); // process lone high surrogate
270 char c1
; // signed, that's what the text converter expects
271 rInput
.ReadChar( c1
);
272 bErr
= !rInput
.good();
276 RTL_TEXTENCODING_DONTKNOW
== eSrcEnc
||
277 RTL_TEXTENCODING_SYMBOL
== eSrcEnc
280 // no conversion shall take place
281 c
= reinterpret_cast<unsigned char&>( c1
);
286 assert(pImplData
&& pImplData
->hConv
&& "no text converter!");
289 sal_uInt32 nInfo
= 0;
291 nChars
= rtl_convertTextToUnicode(
292 pImplData
->hConv
, pImplData
->hContext
,
294 RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_ERROR
|
295 RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_ERROR
|
296 RTL_TEXTTOUNICODE_FLAGS_INVALID_ERROR
,
298 if( (nInfo
&RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOOSMALL
) != 0 )
300 // The conversion wasn't successful because we haven't
301 // read enough characters.
// a context other than the marker value 1 means a real context was created
302 if( pImplData
->hContext
!= reinterpret_cast<rtl_TextToUnicodeContext
>(1) )
// keep feeding bytes while the converter reports the input as too short
305 while( (nInfo
&RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOOSMALL
) != 0 )
307 rInput
.ReadChar( c1
);
308 bErr
= !rInput
.good();
312 nChars
= rtl_convertTextToUnicode(
313 pImplData
->hConv
, pImplData
->hContext
,
315 RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_ERROR
|
316 RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_ERROR
|
317 RTL_TEXTTOUNICODE_FLAGS_INVALID_ERROR
,
322 if( 1 == nChars
&& 0 == nInfo
)
324 c
= sal_uInt32( sCh
[0] );
// two converted UTF-16 units form a surrogate pair
326 else if( 2 == nChars
&& 0 == nInfo
)
328 c
= rtl::combineSurrogates( sCh
[0], sCh
[1] );
330 else if( 0 != nChars
|| 0 != nInfo
)
332 DBG_ASSERT( (nInfo
&RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOOSMALL
) == 0,
333 "source buffer is too small" );
334 DBG_ASSERT( (nInfo
&~(RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOOSMALL
)) == 0,
335 "there is a conversion error" );
336 DBG_ASSERT( 0 == nChars
,
337 "there is a converted character, but an error" );
338 // There are still errors, but nothing we can
// variant without a conversion context: bytes are collected in sBuffer and the
// whole buffer is converted again after each additional byte
351 while( (nInfo
&RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOOSMALL
) != 0 &&
354 rInput
.ReadChar( c1
);
355 bErr
= !rInput
.good();
359 sBuffer
[nLen
++] = c1
;
360 nChars
= rtl_convertTextToUnicode(
361 pImplData
->hConv
, nullptr, sBuffer
, nLen
, &cUC
, 1,
362 RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_ERROR
|
363 RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_ERROR
|
364 RTL_TEXTTOUNICODE_FLAGS_INVALID_ERROR
,
369 if( 1 == nChars
&& 0 == nInfo
)
371 DBG_ASSERT( nCvtBytes
== nLen
,
372 "no all bytes have been converted!" );
377 DBG_ASSERT( (nInfo
&RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOOSMALL
) == 0,
378 "source buffer is too small" );
379 DBG_ASSERT( (nInfo
&~(RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOOSMALL
)) == 0,
380 "there is a conversion error" );
381 DBG_ASSERT( 0 == nChars
,
382 "there is a converted character, but an error" );
384 // There are still errors, so we use the first
385 // character and restart after that.
386 c
= reinterpret_cast<unsigned char&>( sBuffer
[0] );
// step back over all collected bytes except the first one
387 rInput
.SeekRel( -(nLen
-1) );
394 else if( 1 == nChars
&& 0 == nInfo
)
396 // The conversion was successful
397 DBG_ASSERT( nCvtBytes
== 1,
398 "no all bytes have been converted!" );
401 else if( 0 != nChars
|| 0 != nInfo
)
403 DBG_ASSERT( 0 == nChars
,
404 "there is a converted character, but an error" );
405 DBG_ASSERT( 0 != nInfo
,
406 "there is no converted character and no error" );
407 // #73398#: If the character could not be converted,
408 // because a conversion is not available, do no conversion at all.
409 c
= reinterpret_cast<unsigned char&>( c1
);
// repeat until a character was produced or reading failed
416 while( 0 == nChars
&& !bErr
);
419 if ( ! rtl::isUnicodeScalarValue( c
) )
// when fuzzing, stop after more than 128 conversion errors
422 if (bFuzzing
&& nConversionErrors
> 128)
424 SAL_WARN("svtools", "SvParser::GetNextChar too many conversion errors while fuzzing, abandoning for performance");
// a pending stream is reported through the parser state
430 if( ERRCODE_IO_PENDING
== rInput
.GetError() )
432 eState
= SvParserState::Pending
;
436 return sal_Unicode(EOF
);
// Delivers the next token.
// With nTokenStackPos == 0 the token buffer and value are cleared and a fresh
// token is read through GetNextToken_(). The visible code then wraps
// pTokenStackPos back to the first stack entry when it has reached the end of
// the stack, and either loads token text, value and id from the stack entry
// or, while the parser is Working, stores the current token into it. Any state
// other than Working, Accepted or Pending is turned into Error.
// NOTE(review): the statements that advance the stack position, the branch
// that selects "load from stack" and the return statements are on lines not
// visible in this excerpt.
451 T SvParser
<T
>::GetNextToken()
453 T nRet
= static_cast<T
>(0);
455 if( !nTokenStackPos
)
457 aToken
.setLength( 0 ); // empty token buffer
458 nTokenValue
= -1; // marker for no value read
459 bTokenHasValue
= false;
461 nRet
= GetNextToken_();
462 if( SvParserState::Pending
== eState
)
// the token stack is used as a ring: wrap around at its end
467 if( pTokenStackPos
== pTokenStack
.get() + nTokenStackSize
)
468 pTokenStackPos
= pTokenStack
.get();
// take the token over from the stack entry
474 nTokenValue
= pTokenStackPos
->nTokenValue
;
475 bTokenHasValue
= pTokenStackPos
->bTokenHasValue
;
476 aToken
= pTokenStackPos
->sToken
;
477 nRet
= pTokenStackPos
->nTokenId
;
480 // no, now push current value on stack
481 else if( SvParserState::Working
== eState
)
483 pTokenStackPos
->sToken
= aToken
;
484 pTokenStackPos
->nTokenValue
= nTokenValue
;
485 pTokenStackPos
->bTokenHasValue
= bTokenHasValue
;
486 pTokenStackPos
->nTokenId
= nRet
;
489 else if( SvParserState::Accepted
!= eState
&& SvParserState::Pending
!= eState
)
490 eState
= SvParserState::Error
; // an error occurred
// Moves the token stack position by nCnt entries (see GetStackPtr), updates
// nTokenStackPos and m_nTokenIndex accordingly, makes the entry found there
// the current token (text, value, has-value flag) and returns its token id.
// NOTE(review): the lower-bound check on nTmp that precedes the visible
// "else if" and the condition around the m_nTokenIndex update are on lines
// not visible in this excerpt.
496 T SvParser
<T
>::SkipToken( short nCnt
) // "skip" n Tokens backward
498 pTokenStackPos
= GetStackPtr( nCnt
);
499 short nTmp
= nTokenStackPos
- nCnt
;
// never count more entries than the stack holds
502 else if( nTmp
> nTokenStackSize
)
503 nTmp
= nTokenStackSize
;
504 nTokenStackPos
= sal_uInt8(nTmp
);
506 m_nTokenIndex
-= nTmp
;
// restore the token stored in the selected stack entry
509 aToken
= pTokenStackPos
->sToken
;
510 nTokenValue
= pTokenStackPos
->nTokenValue
;
511 bTokenHasValue
= pTokenStackPos
->bTokenHasValue
;
513 return pTokenStackPos
->nTokenId
;
// Returns the token stack entry that lies nCnt positions away from the
// current one, treating the stack as a ring of nTokenStackSize entries:
// nCnt is limited to less than one full turn in either direction and the
// resulting index wraps around at both ends of the stack. The visible code
// only computes the pointer; it does not assign pTokenStackPos.
// NOTE(review): the branch that separates positive from negative nCnt and
// the else keywords are on lines not visible in this excerpt.
517 typename SvParser
<T
>::TokenStackType
* SvParser
<T
>::GetStackPtr( short nCnt
)
// index of the current entry within the stack array
519 sal_uInt8 nCurrentPos
= sal_uInt8(pTokenStackPos
- pTokenStack
.get());
// forward direction: clamp nCnt, then wrap past the end of the stack
522 if( nCnt
>= nTokenStackSize
)
523 nCnt
= (nTokenStackSize
-1);
524 if( nCurrentPos
+ nCnt
< nTokenStackSize
)
525 nCurrentPos
= sal::static_int_cast
< sal_uInt8
>(nCurrentPos
+ nCnt
);
527 nCurrentPos
= sal::static_int_cast
< sal_uInt8
>(
528 nCurrentPos
+ (nCnt
- nTokenStackSize
));
// backward direction: clamp nCnt, then wrap before the start of the stack
532 if( -nCnt
>= nTokenStackSize
)
533 nCnt
= -nTokenStackSize
+1;
534 if( -nCnt
<= nCurrentPos
)
535 nCurrentPos
= sal::static_int_cast
< sal_uInt8
>(nCurrentPos
+ nCnt
);
537 nCurrentPos
= sal::static_int_cast
< sal_uInt8
>(
538 nCurrentPos
+ (nCnt
+ nTokenStackSize
));
540 return pTokenStack
.get() + nCurrentPos
;
543 // to read asynchronously from SvStream
// Returns the token kept in pImplData->nSaveToken (set by RestoreState), or
// T(0) when no private data has been allocated.
546 T SvParser
<T
>::GetSaveToken() const
548 return pImplData
? pImplData
->nSaveToken
: static_cast<T
>(0);
// Stores the current parser state in pImplData so that RestoreState() can
// bring it back: stream position, the token passed in, token text, line
// number and position, token value with its validity flag and the lookahead
// character.
// NOTE(review): the condition guarding the allocation of pImplData is on a
// line not visible in this excerpt; presumably the allocation only happens
// when no private data exists yet - confirm.
552 void SvParser
<T
>::SaveState( T nToken
)
554 // save current status
557 pImplData
.reset(new SvParser_Impl
<T
>);
558 pImplData
->nSaveToken
= static_cast<T
>(0);
561 pImplData
->nFilePos
= rInput
.Tell();
562 pImplData
->nToken
= nToken
;
564 pImplData
->aToken
= aToken
;
565 pImplData
->nlLineNr
= nlLineNr
;
566 pImplData
->nlLinePos
= nlLinePos
;
567 pImplData
->nTokenValue
= nTokenValue
;
568 pImplData
->bTokenHasValue
= bTokenHasValue
;
569 pImplData
->nNextCh
= nNextCh
;
// Brings back the state stored by SaveState(): token text, line number and
// position, token value with its validity flag and the lookahead character.
// The saved token is copied to nSaveToken (readable through GetSaveToken())
// and the input stream is positioned at the saved file position.
// NOTE(review): the statement controlled by the ERRCODE_IO_PENDING check and
// any guard on pImplData are on lines not visible in this excerpt.
573 void SvParser
<T
>::RestoreState()
575 // restore old status
579 if( ERRCODE_IO_PENDING
== rInput
.GetError() )
581 aToken
= pImplData
->aToken
;
582 nlLineNr
= pImplData
->nlLineNr
;
583 nlLinePos
= pImplData
->nlLinePos
;
584 nTokenValue
= pImplData
->nTokenValue
;
585 bTokenHasValue
=pImplData
->bTokenHasValue
;
586 nNextCh
= pImplData
->nNextCh
;
588 pImplData
->nSaveToken
= pImplData
->nToken
;
590 rInput
.Seek( pImplData
->nFilePos
);
// Continuation hook: NewDataRead() calls it with the token stored by
// SaveState() once the parser leaves the Pending state. The parameter is
// unnamed here.
// NOTE(review): the body is not visible in this excerpt.
594 void SvParser
<T
>::Continue( T
)
599 // expanded out version of
600 // IMPL_LINK_NOARG( SvParser, NewDataRead, LinkParamNone*, void )
601 // since it can't cope with template methods
// Trampoline used by the Link: casts the untyped instance pointer back to
// SvParser<T>* and forwards the call to NewDataRead().
603 void SvParser
<T
>::LinkStubNewDataRead(void * instance
, LinkParamNone
* data
) {
604 return static_cast<SvParser
<T
> *>(instance
)->NewDataRead(data
);
// Handler behind GetAsynchCallLink(). In the Pending state the parser is set
// back to Working and Continue() is called with the saved token; afterwards
// ReleaseRef() is called unless the parser went back to Pending. The visible
// code also has case labels for NotStarted and Working and a further
// ReleaseRef() call.
// NOTE(review): the switch header, the statement controlled by the
// ERRCODE_IO_PENDING check, the bodies of the NotStarted/Working cases and
// the label belonging to the last ReleaseRef() are on lines not visible in
// this excerpt.
607 void SvParser
<T
>::NewDataRead(SAL_UNUSED_PARAMETER LinkParamNone
*)
611 case SvParserState::Pending
:
612 eState
= SvParserState::Working
;
615 Continue( pImplData
->nToken
);
617 if( ERRCODE_IO_PENDING
== rInput
.GetError() )
620 if( SvParserState::Pending
!= eState
)
621 ReleaseRef(); // ready otherwise!
624 case SvParserState::NotStarted
:
625 case SvParserState::Working
:
629 ReleaseRef(); // ready otherwise!
// Explicit instantiations of SvParser for the token types int and
// HtmlTokenId; the member templates above are defined in this file, so these
// are the instantiations it emits.
634 template class SVT_DLLPUBLIC SvParser
<int>;
635 template class SVT_DLLPUBLIC SvParser
<HtmlTokenId
>;
637 /*========================================================================
639 * SvKeyValueIterator.
641 *======================================================================*/
643 typedef std::vector
<SvKeyValue
> SvKeyValueList_Impl
;
// Private state of SvKeyValueIterator: the list of stored key/value entries.
// NOTE(review): the mnPos member used by GetFirst()/GetNext() is declared on
// a line that is not visible in this excerpt.
645 struct SvKeyValueIterator::Impl
647 SvKeyValueList_Impl maList
;
653 SvKeyValueIterator::SvKeyValueIterator() : mpImpl(new Impl
) {}
// Defaulted here, in the file that also defines Impl.
655 SvKeyValueIterator::~SvKeyValueIterator() = default;
// Starts an iteration: the position is set to the number of stored entries
// and the first result is fetched through GetNext(). Because GetNext() walks
// towards index 0, the entry appended last is delivered first.
657 bool SvKeyValueIterator::GetFirst (SvKeyValue
&rKeyVal
)
659 mpImpl
->mnPos
= mpImpl
->maList
.size();
660 return GetNext (rKeyVal
);
// While the position is greater than 0, moves it one entry towards the front
// of the list and copies that entry into rKeyVal.
// NOTE(review): the return statements are on lines not visible in this
// excerpt; presumably true is returned when an entry was delivered and false
// otherwise - confirm.
663 bool SvKeyValueIterator::GetNext (SvKeyValue
&rKeyVal
)
665 if (mpImpl
->mnPos
> 0)
667 rKeyVal
= mpImpl
->maList
[--mpImpl
->mnPos
];
// Adds a copy of rKeyVal at the end of the stored list.
677 void SvKeyValueIterator::Append (const SvKeyValue
&rKeyVal
)
679 mpImpl
->maList
.push_back(rKeyVal
);
682 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */