1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
3 * This file is part of the LibreOffice project.
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 * This file incorporates work covered by the following license notice:
11 * Licensed to the Apache Software Foundation (ASF) under one or more
12 * contributor license agreements. See the NOTICE file distributed
13 * with this work for additional information regarding copyright
14 * ownership. The ASF licenses this file to you under the Apache
15 * License, Version 2.0 (the "License"); you may not use this file
16 * except in compliance with the License. You may obtain a copy of
17 * the License at http://www.apache.org/licenses/LICENSE-2.0 .
20 #include <svtools/svparser.hxx>
21 #include <svtools/htmltokn.h>
22 #include <tools/stream.hxx>
23 #include <tools/debug.hxx>
24 #include <rtl/textcvt.h>
25 #include <rtl/tencinfo.h>
26 #include <rtl/character.hxx>
27 #include <sal/log.hxx>
28 #include <unicode/ucsdet.h>
29 #include <unotools/configmgr.hxx>
33 // structure to store the actual data
// Private per-parser data: a snapshot of the scanner state (token text, stream
// position, line/column counters, token value, lookahead character) that
// SaveState() fills in and RestoreState() reads back, plus the handles of the
// text-to-Unicode converter used while decoding the input.
// NOTE(review): the struct header and several constructor initializers are
// not visible in this excerpt.
37 OUString aToken
; // scanned token
38 sal_uInt64 nFilePos
; // current position in stream
39 sal_uInt32 nlLineNr
; // current line number
40 sal_uInt32 nlLinePos
; // current column number
41 tools::Long nTokenValue
; // extra value (RTF)
42 bool bTokenHasValue
; // indicates whether nTokenValue is valid
43 T nToken
; // current token
44 sal_uInt32 nNextCh
; // current character
45 T nSaveToken
; // the token from Continue
47 rtl_TextToUnicodeConverter hConv
;
48 rtl_TextToUnicodeContext hContext
;
55 , bTokenHasValue(false)
56 , nToken(static_cast<T
>(0))
58 , nSaveToken(static_cast<T
>(0))
// The value 1 is stored instead of a real context handle; SetSrcEncoding()
// resets hContext to the same value and GetNextChar() compares against it.
// NOTE(review): presumably a "no context created" marker - confirm.
60 , hContext( reinterpret_cast<rtl_TextToUnicodeContext
>(1) )
// Constructor of one token-stack entry: the entry starts without a valid
// token value and with token id 0.
// NOTE(review): further initializers are not visible in this excerpt.
68 SvParser
<T
>::TokenStackType::TokenStackType()
70 , bTokenHasValue(false)
71 , nTokenId(static_cast<T
>(0))
// Constructs a parser with a token stack of nStackSize entries.
//   rIn        - the stream parameter (its initializer is not visible here;
//                the other members read from a member named rInput).
//   nStackSize - requested number of token-stack entries.
// The parser starts in state NotStarted with an unknown source encoding and
// no private data (pImplData == nullptr).
// NOTE(review): some initializers and the braces are not visible in this
// excerpt.
77 SvParser
<T
>::SvParser( SvStream
& rIn
, sal_uInt8 nStackSize
)
81 , pImplData( nullptr )
84 , bTokenHasValue( false )
85 , bFuzzing(utl::ConfigManager::IsFuzzing())
86 , eState( SvParserState::NotStarted
)
87 , eSrcEnc( RTL_TEXTENCODING_DONTKNOW
)
90 , bSwitchToUCS2(false)
91 , bRTF_InTextRead(false)
92 , nTokenStackSize( nStackSize
)
// Assigns the same value the initializer list already gave eState.
95 eState
= SvParserState::NotStarted
;
// NOTE(review): the statement guarded by this check is not visible;
// presumably it raises nTokenStackSize to a minimum of 3 - confirm.
96 if( nTokenStackSize
< 3 )
// Allocate the token stack and start at its first entry.
98 pTokenStack
.reset(new TokenStackType
[ nTokenStackSize
]);
99 pTokenStackPos
= pTokenStack
.get();
// Destructor: if private data with a text converter exists, destroys the
// conversion context first and then the converter itself.
103 SvParser
<T
>::~SvParser()
105 if( pImplData
&& pImplData
->hConv
)
107 rtl_destroyTextToUnicodeContext( pImplData
->hConv
,
108 pImplData
->hContext
);
109 rtl_destroyTextToUnicodeConverter( pImplData
->hConv
);
115 template<typename T
> SvParserState SvParser
<T
>::GetStatus() const { return eState
; }
116 template<typename T
> sal_uInt32 SvParser
<T
>::GetLineNr() const { return nlLineNr
; }
117 template<typename T
> sal_uInt32 SvParser
<T
>::GetLinePos() const { return nlLinePos
; }
118 template<typename T
> void SvParser
<T
>::IncLineNr() { ++nlLineNr
; }
119 template<typename T
> sal_uInt32 SvParser
<T
>::IncLinePos() { return ++nlLinePos
; }
120 template<typename T
> void SvParser
<T
>::SetLineNr( sal_uInt32 nlNum
) { nlLineNr
= nlNum
; }
121 template<typename T
> void SvParser
<T
>::SetLinePos( sal_uInt32 nlPos
) { nlLinePos
= nlPos
; }
122 template<typename T
> bool SvParser
<T
>::IsParserWorking() const { return SvParserState::Working
== eState
; }
123 template<typename T
> rtl_TextEncoding SvParser
<T
>::GetSrcEncoding() const { return eSrcEnc
; }
124 template<typename T
> void SvParser
<T
>::SetSwitchToUCS2( bool bSet
) { bSwitchToUCS2
= bSet
; }
125 template<typename T
> bool SvParser
<T
>::IsSwitchToUCS2() const { return bSwitchToUCS2
; }
126 template<typename T
> sal_uInt16 SvParser
<T
>::GetCharSize() const { return (RTL_TEXTENCODING_UCS2
== eSrcEnc
) ? 2 : 1; }
// Returns a Link that targets this parser's NewDataRead handler.
// The method is const, so constness is cast away to hand the instance
// pointer to the LINK macro.
127 template<typename T
> Link
<LinkParamNone
*,void> SvParser
<T
>::GetAsynchCallLink() const
129 return LINK( const_cast<SvParser
*>(this), SvParser
, NewDataRead
);
// Resets the text-to-Unicode conversion context. Does nothing when no
// private data or no converter exists.
133 void SvParser
<T
>::ClearTxtConvContext()
135 if( pImplData
&& pImplData
->hConv
)
136 rtl_resetTextToUnicodeContext( pImplData
->hConv
, pImplData
->hContext
);
// Changes the source encoding used to decode the input.
//   eEnc - the requested text encoding.
// Visible behavior: an existing converter and its context are destroyed and
// the handles reset; for octet encodings and UCS2 a converter (and, if that
// succeeded, a context) is created; if no converter can be created, or the
// encoding is not acceptable, eSrcEnc ends up as RTL_TEXTENCODING_DONTKNOW.
// NOTE(review): several lines (braces, else branches, the statement guarded
// by the first check) are not visible in this excerpt.
140 void SvParser
<T
>::SetSrcEncoding( rtl_TextEncoding eEnc
)
// Requested encoding equals the current one.
// NOTE(review): the guarded statement is not visible; presumably an early
// return - confirm.
142 if( eEnc
== eSrcEnc
)
// Tear down the converter of the previous encoding, context first.
145 if( pImplData
&& pImplData
->hConv
)
147 rtl_destroyTextToUnicodeContext( pImplData
->hConv
,
148 pImplData
->hContext
);
149 rtl_destroyTextToUnicodeConverter( pImplData
->hConv
);
150 pImplData
->hConv
= nullptr;
151 pImplData
->hContext
= reinterpret_cast<rtl_TextToUnicodeContext
>(1);
// Only octet (byte-oriented) encodings and UCS2 are accepted.
154 if( rtl_isOctetTextEncoding(eEnc
) ||
155 RTL_TEXTENCODING_UCS2
== eEnc
)
// NOTE(review): lines before this statement are not visible; the reset may
// be conditional on pImplData being empty - confirm.
159 pImplData
.reset(new SvParser_Impl
<T
>);
// NOTE(review): the converter is created from eSrcEnc, not eEnc; the
// assignment of eEnc to eSrcEnc is not visible in this excerpt - confirm it
// happens before this call.
160 pImplData
->hConv
= rtl_createTextToUnicodeConverter( eSrcEnc
);
161 DBG_ASSERT( pImplData
->hConv
,
162 "SvParser::SetSrcEncoding: no converter for source encoding" );
// Without a converter the encoding is treated as unknown.
163 if( !pImplData
->hConv
)
164 eSrcEnc
= RTL_TEXTENCODING_DONTKNOW
;
// NOTE(review): presumably the else branch of the check above - confirm.
166 pImplData
->hContext
=
167 rtl_createTextToUnicodeContext( pImplData
->hConv
);
172 "SvParser::SetSrcEncoding: invalid source encoding" );
173 eSrcEnc
= RTL_TEXTENCODING_DONTKNOW
;
// Seeks the input stream to nNextChPos (the stream position GetNextChar()
// records before it reads a character) and reads the lookahead character
// nNextCh again from there.
178 void SvParser
<T
>::RereadLookahead()
180 rInput
.Seek(nNextChPos
);
181 nNextCh
= GetNextChar();
// Reads the next character from rInput, decodes it according to eSrcEnc and
// returns it as a 32-bit value.
// Visible stages:
//   1. If bSwitchToUCS2 is set and the stream is at position 0, the encoding
//      is determined first: StartReadingUnicodeText() is called, and as a
//      fallback ICU charset detection runs over the first bytes of the
//      stream. bSwitchToUCS2 is cleared afterwards.
//   2. nNextChPos records the stream position of the character about to be
//      read.
//   3. UCS2 input is read as UTF-16 code units; a high surrogate followed by
//      a low surrogate is combined into one code point.
//   4. Other input is read byte-wise and passed through the text converter,
//      reading further bytes while the converter reports
//      RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOOSMALL.
//   5. If the stream reports ERRCODE_IO_PENDING, eState becomes Pending;
//      sal_Unicode(EOF) is returned on the error path.
// NOTE(review): many lines of this function (braces, declarations, branch
// headers) are not visible in this excerpt; the comments describe only the
// visible statements.
185 sal_uInt32 SvParser
<T
>::GetNextChar()
189 // When reading multiple bytes, we don't have to care about the file
190 // position when we run into the pending state. The file position is
191 // maintained by SaveState/RestoreState.
192 if( bSwitchToUCS2
&& 0 == rInput
.Tell() )
194 rInput
.StartReadingUnicodeText(RTL_TEXTENCODING_DONTKNOW
);
// NOTE(review): the conditions that evaluate nPos are not visible; presumably
// the new stream position tells which byte order mark was consumed - confirm.
197 sal_uInt64 nPos
= rInput
.Tell();
199 eSrcEnc
= RTL_TEXTENCODING_UCS2
;
201 SetSrcEncoding(RTL_TEXTENCODING_UTF8
);
202 else // Try to detect encoding without BOM
204 std::vector
<char> buf(65535); // Arbitrarily chosen 64KiB buffer
205 const size_t nSize
= rInput
.ReadBytes(buf
.data(), buf
.size());
// Let ICU guess the charset from the bytes just read.
// NOTE(review): neither the rewind of rInput after this read nor the
// matching ucsdet_close() is visible in this excerpt - confirm both exist.
209 UErrorCode uerr
= U_ZERO_ERROR
;
210 UCharsetDetector
* ucd
= ucsdet_open(&uerr
);
211 ucsdet_setText(ucd
, buf
.data(), nSize
, &uerr
);
212 if (const UCharsetMatch
* match
= ucsdet_detect(ucd
, &uerr
))
214 const char* pEncodingName
= ucsdet_getName(match
, &uerr
);
// Only UTF-8 and the two UTF-16 byte orders are acted upon here.
218 if (strcmp("UTF-8", pEncodingName
) == 0)
220 SetSrcEncoding(RTL_TEXTENCODING_UTF8
);
222 else if (strcmp("UTF-16LE", pEncodingName
) == 0)
224 eSrcEnc
= RTL_TEXTENCODING_UCS2
;
225 rInput
.SetEndian(SvStreamEndian::LITTLE
);
227 else if (strcmp("UTF-16BE", pEncodingName
) == 0)
229 eSrcEnc
= RTL_TEXTENCODING_UCS2
;
230 rInput
.SetEndian(SvStreamEndian::BIG
);
239 bSwitchToUCS2
= false;
// Remember where the character about to be read starts (RereadLookahead()
// seeks back to this position).
243 nNextChPos
= rInput
.Tell();
// UCS2 path: read UTF-16 code units directly from the stream.
245 if( RTL_TEXTENCODING_UCS2
== eSrcEnc
)
248 rInput
.ReadUtf16(cUC
);
249 bErr
= !rInput
.good();
253 if (rtl::isHighSurrogate(cUC
))
255 const sal_uInt64 nPos
= rInput
.Tell();
256 rInput
.ReadUtf16(cUC
);
257 if (rtl::isLowSurrogate(cUC
)) // can only be true when ReadUtf16 succeeded
258 c
= rtl::combineSurrogates(c
, cUC
);
260 rInput
.Seek(nPos
); // process lone high surrogate
// Byte-oriented path: read one byte and convert it.
269 char c1
; // signed, that's what the text converter expects
270 rInput
.ReadChar( c1
);
271 bErr
= !rInput
.good();
275 RTL_TEXTENCODING_DONTKNOW
== eSrcEnc
||
276 RTL_TEXTENCODING_SYMBOL
== eSrcEnc
279 // no conversion shall take place
280 c
= reinterpret_cast<unsigned char&>( c1
);
285 assert(pImplData
&& pImplData
->hConv
&& "no text converter!");
288 sal_uInt32 nInfo
= 0;
290 nChars
= rtl_convertTextToUnicode(
291 pImplData
->hConv
, pImplData
->hContext
,
293 RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_ERROR
|
294 RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_ERROR
|
295 RTL_TEXTTOUNICODE_FLAGS_INVALID_ERROR
,
297 if( (nInfo
&RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOOSMALL
) != 0 )
299 // The conversion wasn't successful because we haven't
300 // read enough characters.
// A context handle different from the placeholder value 1 exists: keep
// feeding single bytes through the context-based conversion.
301 if( pImplData
->hContext
!= reinterpret_cast<rtl_TextToUnicodeContext
>(1) )
304 while( (nInfo
&RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOOSMALL
) != 0 )
306 rInput
.ReadChar( c1
);
307 bErr
= !rInput
.good();
311 nChars
= rtl_convertTextToUnicode(
312 pImplData
->hConv
, pImplData
->hContext
,
314 RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_ERROR
|
315 RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_ERROR
|
316 RTL_TEXTTOUNICODE_FLAGS_INVALID_ERROR
,
// One UTF-16 unit produced: use it directly; two units: a surrogate pair.
321 if( 1 == nChars
&& 0 == nInfo
)
323 c
= sal_uInt32( sCh
[0] );
325 else if( 2 == nChars
&& 0 == nInfo
)
327 c
= rtl::combineSurrogates( sCh
[0], sCh
[1] );
329 else if( 0 != nChars
|| 0 != nInfo
)
331 DBG_ASSERT( (nInfo
&RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOOSMALL
) == 0,
332 "source buffer is too small" );
333 DBG_ASSERT( (nInfo
&~(RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOOSMALL
)) == 0,
334 "there is a conversion error" );
335 DBG_ASSERT( 0 == nChars
,
336 "there is a converted character, but an error" );
337 // There are still errors, but nothing we can
// Context-less variant: collect the bytes in sBuffer and re-convert the
// whole buffer (passing nullptr as context) after each additional byte.
349 while( (nInfo
&RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOOSMALL
) != 0 &&
352 rInput
.ReadChar( c1
);
353 bErr
= !rInput
.good();
357 sBuffer
[nLen
++] = c1
;
358 nChars
= rtl_convertTextToUnicode(
359 pImplData
->hConv
, nullptr, sBuffer
, nLen
, &cUC
, 1,
360 RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_ERROR
|
361 RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_ERROR
|
362 RTL_TEXTTOUNICODE_FLAGS_INVALID_ERROR
,
367 if( 1 == nChars
&& 0 == nInfo
)
369 DBG_ASSERT( nCvtBytes
== nLen
,
370 "no all bytes have been converted!" );
375 DBG_ASSERT( (nInfo
&RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOOSMALL
) == 0,
376 "source buffer is too small" );
377 DBG_ASSERT( (nInfo
&~(RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOOSMALL
)) == 0,
378 "there is a conversion error" );
379 DBG_ASSERT( 0 == nChars
,
380 "there is a converted character, but an error" );
382 // There are still errors, so we use the first
383 // character and restart after that.
384 c
= reinterpret_cast<unsigned char&>( sBuffer
[0] );
// Move the stream back so that reading resumes right after the first
// buffered byte.
385 rInput
.SeekRel( -(nLen
-1) );
391 else if( 1 == nChars
&& 0 == nInfo
)
393 // The conversion was successful
394 DBG_ASSERT( nCvtBytes
== 1,
395 "no all bytes have been converted!" );
398 else if( 0 != nChars
|| 0 != nInfo
)
400 DBG_ASSERT( 0 == nChars
,
401 "there is a converted character, but an error" );
402 DBG_ASSERT( 0 != nInfo
,
403 "there is no converted character and no error" );
404 // #73398#: If the character could not be converted,
405 // because a conversion is not available, do no conversion at all.
406 c
= reinterpret_cast<unsigned char&>( c1
);
// Repeat as long as nothing was converted and no read error occurred.
413 while( 0 == nChars
&& !bErr
);
// NOTE(review): the statement guarded by this check is not visible in this
// excerpt.
416 if ( ! rtl::isUnicodeScalarValue( c
) )
// A pending stream error means more data may still arrive: report Pending.
421 if( ERRCODE_IO_PENDING
== rInput
.GetError() )
423 eState
= SvParserState::Pending
;
// EOF is converted to sal_Unicode before being returned.
// NOTE(review): the condition under which this return executes is not
// visible; presumably the read-error path - confirm.
427 return sal_Unicode(EOF
);
// Returns the next token.
// Visible behavior: when nTokenStackPos is 0 a fresh token is scanned via
// GetNextToken_() after the token buffer and token value have been cleared;
// pTokenStackPos wraps from the end of the token stack back to its start;
// token text/value/id are either read back from the current stack entry or,
// while the parser is Working, stored into it; any state other than Accepted
// or Pending in the final branch is turned into Error.
// NOTE(review): several lines (braces, the stack-pointer advance, the branch
// that replays stacked tokens, the return) are not visible in this excerpt.
442 T SvParser
<T
>::GetNextToken()
444 T nRet
= static_cast<T
>(0);
// Nothing stepped back on the stack: scan a new token.
446 if( !nTokenStackPos
)
448 aToken
.setLength( 0 ); // empty token buffer
449 nTokenValue
= -1; // marker for no value read
450 bTokenHasValue
= false;
452 nRet
= GetNextToken_();
// NOTE(review): the guarded statement is not visible; presumably returns
// nRet right away while more input is awaited - confirm.
453 if( SvParserState::Pending
== eState
)
// Wrap around when the position has run past the last stack entry.
458 if( pTokenStackPos
== pTokenStack
.get() + nTokenStackSize
)
459 pTokenStackPos
= pTokenStack
.get();
// Take the token data from the current stack entry.
465 nTokenValue
= pTokenStackPos
->nTokenValue
;
466 bTokenHasValue
= pTokenStackPos
->bTokenHasValue
;
467 aToken
= pTokenStackPos
->sToken
;
468 nRet
= pTokenStackPos
->nTokenId
;
471 // no, now push actual value on stack
472 else if( SvParserState::Working
== eState
)
474 pTokenStackPos
->sToken
= aToken
;
475 pTokenStackPos
->nTokenValue
= nTokenValue
;
476 pTokenStackPos
->bTokenHasValue
= bTokenHasValue
;
477 pTokenStackPos
->nTokenId
= nRet
;
480 else if( SvParserState::Accepted
!= eState
&& SvParserState::Pending
!= eState
)
481 eState
= SvParserState::Error
; // an error occurred
// Moves the current token-stack position by nCnt entries (the existing
// comment describes this as skipping tokens backward), reloads aToken,
// nTokenValue and bTokenHasValue from the entry reached and returns that
// entry's token id.
//   nCnt - offset handed to GetStackPtr(); it is also subtracted from
//          nTokenStackPos to obtain the new count.
// NOTE(review): the lower clamp that precedes the "else if" below is not
// visible in this excerpt.
487 T SvParser
<T
>::SkipToken( short nCnt
) // "skip" n Tokens backward
489 pTokenStackPos
= GetStackPtr( nCnt
);
490 short nTmp
= nTokenStackPos
- nCnt
;
// Never report more than the whole stack.
493 else if( nTmp
> nTokenStackSize
)
494 nTmp
= nTokenStackSize
;
495 nTokenStackPos
= sal_uInt8(nTmp
);
497 m_nTokenIndex
-= nTmp
;
// Make the entry reached the current token.
500 aToken
= pTokenStackPos
->sToken
;
501 nTokenValue
= pTokenStackPos
->nTokenValue
;
502 bTokenHasValue
= pTokenStackPos
->bTokenHasValue
;
504 return pTokenStackPos
->nTokenId
;
// Returns a pointer to the token-stack entry that lies nCnt entries away
// from the current position, wrapping around at both ends of the stack.
//   nCnt - signed offset; its magnitude is limited to nTokenStackSize - 1.
// NOTE(review): the branch headers that select between the two halves below
// are not visible; presumably the first half handles nCnt > 0 and the second
// nCnt < 0 - confirm.
508 typename SvParser
<T
>::TokenStackType
* SvParser
<T
>::GetStackPtr( short nCnt
)
// Index of the current entry within the stack array.
510 sal_uInt8 nCurrentPos
= sal_uInt8(pTokenStackPos
- pTokenStack
.get());
// Forward: clamp the offset, then add it, wrapping past the last entry.
513 if( nCnt
>= nTokenStackSize
)
514 nCnt
= (nTokenStackSize
-1);
515 if( nCurrentPos
+ nCnt
< nTokenStackSize
)
516 nCurrentPos
= sal::static_int_cast
< sal_uInt8
>(nCurrentPos
+ nCnt
);
518 nCurrentPos
= sal::static_int_cast
< sal_uInt8
>(
519 nCurrentPos
+ (nCnt
- nTokenStackSize
));
// Backward: clamp the offset, then subtract, wrapping below the first entry.
523 if( -nCnt
>= nTokenStackSize
)
524 nCnt
= -nTokenStackSize
+1;
525 if( -nCnt
<= nCurrentPos
)
526 nCurrentPos
= sal::static_int_cast
< sal_uInt8
>(nCurrentPos
+ nCnt
);
528 nCurrentPos
= sal::static_int_cast
< sal_uInt8
>(
529 nCurrentPos
+ (nCnt
+ nTokenStackSize
));
531 return pTokenStack
.get() + nCurrentPos
;
534 // to read asynchronously from SvStream
// Returns the token stored in pImplData->nSaveToken (set by RestoreState()),
// or token value 0 when no private data exists.
537 T SvParser
<T
>::GetSaveToken() const
539 return pImplData
? pImplData
->nSaveToken
: static_cast<T
>(0);
// Stores the current scanner state in pImplData so that RestoreState() can
// return to it later.
//   nToken - the token to remember together with the state.
// Saved: stream position, nToken, token text, line and column counters,
// token value (and its validity flag) and the lookahead character.
// NOTE(review): the lines around the allocation are not visible; presumably
// pImplData is only created when it does not exist yet - confirm.
543 void SvParser
<T
>::SaveState( T nToken
)
545 // save actual status
548 pImplData
.reset(new SvParser_Impl
<T
>);
549 pImplData
->nSaveToken
= static_cast<T
>(0);
552 pImplData
->nFilePos
= rInput
.Tell();
553 pImplData
->nToken
= nToken
;
555 pImplData
->aToken
= aToken
;
556 pImplData
->nlLineNr
= nlLineNr
;
557 pImplData
->nlLinePos
= nlLinePos
;
558 pImplData
->nTokenValue
= nTokenValue
;
559 pImplData
->bTokenHasValue
= bTokenHasValue
;
560 pImplData
->nNextCh
= nNextCh
;
// Restores the scanner state previously stored by SaveState(): token text,
// line and column counters, token value, lookahead character and the stream
// position. The remembered token is copied to nSaveToken, where
// GetSaveToken() picks it up.
// NOTE(review): the guard around this function body and the statement
// controlled by the ERRCODE_IO_PENDING check are not visible in this excerpt.
564 void SvParser
<T
>::RestoreState()
566 // restore old status
570 if( ERRCODE_IO_PENDING
== rInput
.GetError() )
572 aToken
= pImplData
->aToken
;
573 nlLineNr
= pImplData
->nlLineNr
;
574 nlLinePos
= pImplData
->nlLinePos
;
575 nTokenValue
= pImplData
->nTokenValue
;
576 bTokenHasValue
=pImplData
->bTokenHasValue
;
577 nNextCh
= pImplData
->nNextCh
;
579 pImplData
->nSaveToken
= pImplData
->nToken
;
// Continue reading where SaveState() recorded the stream position.
581 rInput
.Seek( pImplData
->nFilePos
);
// Called by NewDataRead() with the saved token when parsing resumes after a
// pending read. The parameter is unnamed, so this implementation does not
// use it.
// NOTE(review): the body is not visible in this excerpt; presumably empty
// and meant to be overridden by concrete parsers - confirm.
585 void SvParser
<T
>::Continue( T
)
590 // expanded out version of
591 // IMPL_LINK_NOARG( SvParser, NewDataRead, LinkParamNone*, void )
592 // since it can't cope with template methods
// Static trampoline used by the Link mechanism: casts the untyped instance
// pointer back to SvParser<T> and forwards the call to NewDataRead().
594 void SvParser
<T
>::LinkStubNewDataRead(void * instance
, LinkParamNone
* data
) {
595 return static_cast<SvParser
<T
> *>(instance
)->NewDataRead(data
);
// Link handler that resumes parsing, dispatching on eState.
//   Pending            - the state is switched back to Working and
//                        Continue() is called with the saved token; unless
//                        the parser is Pending again afterwards, ReleaseRef()
//                        is called.
//   NotStarted/Working - no visible action.
//   other states       - ReleaseRef() is called.
// NOTE(review): the switch statement itself, the break statements and the
// statement guarded by the ERRCODE_IO_PENDING check are not visible in this
// excerpt.
598 void SvParser
<T
>::NewDataRead(SAL_UNUSED_PARAMETER LinkParamNone
*)
602 case SvParserState::Pending
:
603 eState
= SvParserState::Working
;
606 Continue( pImplData
->nToken
);
608 if( ERRCODE_IO_PENDING
== rInput
.GetError() )
611 if( SvParserState::Pending
!= eState
)
612 ReleaseRef(); // ready otherwise!
615 case SvParserState::NotStarted
:
616 case SvParserState::Working
:
620 ReleaseRef(); // ready otherwise!
// Explicit instantiations of SvParser for the token types int and
// HtmlTokenId, so that the member definitions in this file are generated for
// both.
625 template class SVT_DLLPUBLIC SvParser
<int>;
626 template class SVT_DLLPUBLIC SvParser
<HtmlTokenId
>;
628 /*========================================================================
630 * SvKeyValueIterator.
632 *======================================================================*/
634 typedef std::vector
<SvKeyValue
> SvKeyValueList_Impl
;
// Private implementation data of SvKeyValueIterator.
// NOTE(review): only maList is visible in this excerpt; the member functions
// also use mpImpl->mnPos, whose declaration is not shown here.
636 struct SvKeyValueIterator::Impl
// Entries in the order in which Append() added them.
638 SvKeyValueList_Impl maList
;
644 SvKeyValueIterator::SvKeyValueIterator() : mpImpl(new Impl
) {}
// Defaulted destructor, defined in this file where Impl is declared.
// NOTE(review): presumably kept out of line because Impl is incomplete in
// the header - confirm.
646 SvKeyValueIterator::~SvKeyValueIterator() = default;
// Restarts the iteration and fetches the first entry into rKeyVal.
// The position is set to the number of entries and GetNext() decrements it
// before reading, so iteration starts with the most recently appended entry.
// Returns whatever GetNext() returns.
648 bool SvKeyValueIterator::GetFirst (SvKeyValue
&rKeyVal
)
650 mpImpl
->mnPos
= mpImpl
->maList
.size();
651 return GetNext (rKeyVal
);
// Copies the next entry into rKeyVal, walking the list from its end towards
// its beginning: while the position is greater than 0 it is decremented and
// the entry at the new position is assigned to rKeyVal.
// NOTE(review): the return statements are not visible in this excerpt;
// presumably true when an entry was delivered and false when the list is
// exhausted - confirm.
654 bool SvKeyValueIterator::GetNext (SvKeyValue
&rKeyVal
)
656 if (mpImpl
->mnPos
> 0)
658 rKeyVal
= mpImpl
->maList
[--mpImpl
->mnPos
];
// Appends a copy of rKeyVal to the end of the list. The current iteration
// position (mnPos) is not adjusted here.
668 void SvKeyValueIterator::Append (const SvKeyValue
&rKeyVal
)
670 mpImpl
->maList
.push_back(rKeyVal
);
673 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */