1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
3 * This file is part of the LibreOffice project.
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 * This file incorporates work covered by the following license notice:
11 * Licensed to the Apache Software Foundation (ASF) under one or more
12 * contributor license agreements. See the NOTICE file distributed
13 * with this work for additional information regarding copyright
14 * ownership. The ASF licenses this file to you under the Apache
15 * License, Version 2.0 (the "License"); you may not use this file
16 * except in compliance with the License. You may obtain a copy of
17 * the License at http://www.apache.org/licenses/LICENSE-2.0 .
20 #include <sax/fastparser.hxx>
21 #include <sax/fastattribs.hxx>
23 #include <xml2utf.hxx>
25 #include <com/sun/star/io/XSeekable.hpp>
26 #include <com/sun/star/lang/DisposedException.hpp>
27 #include <com/sun/star/lang/IllegalArgumentException.hpp>
28 #include <com/sun/star/uno/XComponentContext.hpp>
29 #include <com/sun/star/xml/sax/FastToken.hpp>
30 #include <com/sun/star/xml/sax/SAXParseException.hpp>
31 #include <com/sun/star/xml/sax/XFastContextHandler.hpp>
32 #include <cppuhelper/implbase.hxx>
33 #include <cppuhelper/supportsservice.hxx>
34 #include <cppuhelper/exc_hlp.hxx>
35 #include <osl/conditn.hxx>
36 #include <rtl/ref.hxx>
37 #include <rtl/ustrbuf.hxx>
38 #include <sal/log.hxx>
39 #include <salhelper/thread.hxx>
40 #include <comphelper/diagnose_ex.hxx>
41 #include <o3tl/string_view.hxx>
48 #include <string_view>
49 #include <unordered_map>
53 #include <libxml/parser.h>
55 // Inverse of libxml's BAD_CAST: reinterprets the given pointer as a
// `const char*`. Used below to hand xmlChar strings to code that expects
// plain C strings.
56 #define XML_CAST( str ) reinterpret_cast< const char* >( str )
58 using namespace ::osl
;
59 using namespace ::cppu
;
60 using namespace ::com::sun::star::uno
;
61 using namespace ::com::sun::star::lang
;
62 using namespace ::com::sun::star::xml::sax
;
63 using namespace ::com::sun::star::io
;
64 using namespace com::sun::star
;
65 using namespace sax_fastparser
;
// Forward declaration; the definition is not visible in this part of the file.
67 static void NormalizeURI( OUString
& rName
);
// Forward declarations of types that are defined further down.
72 class FastLocatorImpl
;
73 struct NamespaceDefine
;
// Maps a namespace URL to the namespace token registered for it
// (written by registerNamespace(), read by GetNamespaceToken()).
76 typedef std::unordered_map
< OUString
, sal_Int32
> NamespaceMap
;
// The events of one batch; accessed below as EventList::maEvents.
80 std::vector
<Event
> maEvents
;
// Set to false when a fresh list is created and to true once parseStream()
// has cleared the attribute lists of the events in a consumed batch.
81 bool mbIsAttributesEmpty
;
// Kind of callback an Event stands for; stored in Event::maType by
// Entity::getEvent(). EXCEPTION is the type requested by
// ParserThread::execute() before it force-flushes the producer.
84 enum class CallbackType
{ START_ELEMENT
, END_ELEMENT
, CHARACTERS
, PROCESSING_INSTRUCTION
, DONE
, EXCEPTION
};
// Token of the element; FastToken::DONTKNOW selects the "unknown element"
// code path in Entity::startElement().
89 sal_Int32 mnElementToken
;
// Element name handed to the unknown-element handler calls.
91 OUString msElementName
;
// Attributes passed on to the context handlers.
92 rtl::Reference
< FastAttributeList
> mxAttributes
;
// Its unknown attributes are forwarded to the namespace handler as
// (name, value) registrations in Entity::startElement().
93 rtl::Reference
< FastAttributeList
> mxDeclAttributes
;
// Stores a name (moved into msName) together with its token.
102 NameWithToken(OUString sName
, sal_Int32 nToken
) :
103 msName(std::move(sName
)), mnToken(nToken
) {}
// Handler for the element this context describes; may be empty
// (callers below check the raw pointer before using it).
108 Reference
< XFastContextHandler
> mxContext
;
// Token of the element, or FastToken::DONTKNOW.
109 sal_Int32 mnElementToken
;
// Namespace and element name; only assigned by the constructor when the
// token is FastToken::DONTKNOW. Entity::endElement() dereferences them on
// the DONTKNOW path.
110 std::optional
<OUString
> moNamespace
;
111 std::optional
<OUString
> moElementName
;
// Records the token and, for unknown elements only, the namespace and name.
113 SaxContext( sal_Int32 nElementToken
, const OUString
& aNamespace
, const OUString
& aElementName
):
114 mnElementToken(nElementToken
)
116 if (nElementToken
== FastToken::DONTKNOW
)
118 moNamespace
= aNamespace
;
119 moElementName
= aElementName
;
// Handlers configured through the FastSaxParserImpl::set*Handler() methods
// (each of those assigns the corresponding member of maData).
// Receives document-level callbacks: setDocumentLocator, startDocument,
// endDocument, processingInstruction and root-level child-context creation.
126 css::uno::Reference
< css::xml::sax::XFastDocumentHandler
> mxDocumentHandler
;
// Used by GetToken() to translate element and attribute names into tokens.
127 rtl::Reference
<FastTokenHandlerBase
> mxTokenHandler
;
// Receives fatalError() from Entity::throwException() when set.
128 css::uno::Reference
< css::xml::sax::XErrorHandler
> mxErrorHandler
;
// Receives registerNamespace() calls from Entity::startElement() when set.
129 css::uno::Reference
< css::xml::sax::XFastNamespaceHandler
>mxNamespaceHandler
;
// One prefix -> namespace binding: the prefix, the token registered for the
// namespace URL and the URL itself. Created by DefineNamespace().
134 struct NamespaceDefine
138 OUString maNamespaceURL
;
// Takes all three values by value and moves the strings into place.
140 NamespaceDefine( OString aPrefix
, sal_Int32 nToken
, OUString aNamespaceURL
)
141 : maPrefix(std::move( aPrefix
)), mnToken( nToken
), maNamespaceURL(std::move( aNamespaceURL
)) {}
// Default state: empty strings and token -1.
142 NamespaceDefine() : mnToken(-1) {}
145 // Entity binds all information needed for a single file | single call of parseStream
// It extends ParserData (the configured handlers) with the per-run state:
// event queues, the libxml2 parser context, the saved exception and the
// namespace/context stacks.
146 struct Entity
: public ParserData
148 // Amount of work producer sends to consumer in one iteration:
149 static const size_t mnEventListSize
= 1000;
151 // unique for each Entity instance:
153 // Number of valid events in mxProducedEvents:
154 size_t mnProducedEventsSize
;
// Batch currently being filled; empty until getEventList() provides one.
155 std::optional
<EventList
> mxProducedEvents
;
// Batches waiting to be consumed by parseStream().
156 std::queue
<EventList
> maPendingEvents
;
// Consumed batches kept for reuse by getEventList().
157 std::queue
<EventList
> maUsedEvents
;
// Guards maPendingEvents and maUsedEvents.
158 std::mutex maEventProtector
;
// Queue sizes at which the producer is resumed (low) or paused (high).
160 static const size_t mnEventLowWater
= 4;
161 static const size_t mnEventHighWater
= 8;
// Waited on by parseStream() before it drains maPendingEvents.
162 osl::Condition maConsumeResume
;
// Waited on by produce() while the pending queue is at the high-water mark.
163 osl::Condition maProduceResume
;
164 // Event we use to store data if threading is disabled:
167 // copied in copy constructor:
// NOTE(review): the copy constructor is deleted below, so the remark above
// looks stale.
169 // Allow to disable threading for small documents:
170 bool mbEnableThreads
;
171 css::xml::sax::InputSource maStructSource
;
// libxml2 parser context of this entity.
172 xmlParserCtxtPtr mpParser
;
173 ::sax_expatwrap::XMLFile2UTFConverter maConverter
;
175 // Exceptions cannot be thrown through the C-XmlParser (possible
176 // resource leaks), therefore any exception thrown by a UNO callback
177 // must be saved somewhere until the C-XmlParser is stopped.
178 css::uno::Any maSavedException
;
// Guards maSavedException.
179 std::mutex maSavedExceptionMutex
;
180 void saveException( const Any
& e
);
181 // Thread-safe check if maSavedException has value
183 void throwException( const ::rtl::Reference
< FastLocatorImpl
> &xDocumentLocator
,
184 bool mbDuringParse
);
186 std::stack
< NameWithToken
, std::vector
<NameWithToken
> > maNamespaceStack
;
187 /* Context for main thread consuming events.
188 * startElement() stores the data, which characters() and endElement() uses
190 std::stack
< SaxContext
, std::vector
<SaxContext
> > maContextStack
;
191 // Determines which elements of maNamespaceDefines are valid in current context
192 std::stack
< sal_uInt32
, std::vector
<sal_uInt32
> > maNamespaceCount
;
193 std::vector
< NamespaceDefine
> maNamespaceDefines
;
195 explicit Entity( const ParserData
& rData
);
// Entities are neither copyable nor copy-assignable.
196 Entity( const Entity
& rEntity
) = delete;
197 Entity
& operator=( const Entity
& rEntity
) = delete;
// Consumer-side handlers: forward one event to the context handlers.
198 void startElement( Event
const *pEvent
);
199 void characters( const OUString
& sChars
);
201 void processingInstruction( const OUString
& rTarget
, const OUString
& rData
);
// Producer-side helpers: return the batch being filled / the next event slot.
202 EventList
& getEventList();
203 Event
& getEvent( CallbackType aType
);
206 // Stuff for custom entity names
// A custom entity name with its replacement text; filled by
// FastSaxParserImpl::setCustomEntityNames(), which also sorts the pairs.
207 struct ReplacementPair
210 OUString replacement
;
// Orders two pairs by their name (the order used by std::sort in
// setCustomEntityNames()).
212 inline bool operator<(const ReplacementPair
& lhs
, const ReplacementPair
& rhs
)
214 return lhs
.name
< rhs
.name
;
// Orders a pair against a plain C string via OUString::compareToAscii().
216 inline bool operator<(const ReplacementPair
& lhs
, const char* rhs
)
218 return lhs
.name
.compareToAscii(rhs
) < 0;
223 namespace sax_fastparser
{
// Implementation of the fast SAX parser on top of libxml2: holds the
// configured handlers (maData), the registered namespaces and the per-run
// Entity, and provides the callbacks invoked from the C trampolines below.
225 class FastSaxParserImpl
228 explicit FastSaxParserImpl();
229 ~FastSaxParserImpl();
// Custom entity replacements, kept sorted by name (see setCustomEntityNames()).
232 std::vector
<ReplacementPair
> m_Replacements
;
// libxml2 entities owned by this parser; released in the destructor.
233 std::vector
<xmlEntityPtr
> m_TemporalEntities
;
237 /// @throws css::xml::sax::SAXException
238 /// @throws css::io::IOException
239 /// @throws css::uno::RuntimeException
240 void parseStream( const css::xml::sax::InputSource
& aInputSource
);
241 /// @throws css::uno::RuntimeException
242 void setFastDocumentHandler( const css::uno::Reference
< css::xml::sax::XFastDocumentHandler
>& Handler
);
243 /// @throws css::uno::RuntimeException
244 void setTokenHandler( const css::uno::Reference
< css::xml::sax::XFastTokenHandler
>& Handler
);
245 /// @throws css::lang::IllegalArgumentException
246 /// @throws css::uno::RuntimeException
247 void registerNamespace( const OUString
& NamespaceURL
, sal_Int32 NamespaceToken
);
248 /// @throws css::lang::IllegalArgumentException
249 /// @throws css::uno::RuntimeException
250 OUString
const & getNamespaceURL( std::u16string_view rPrefix
);
251 /// @throws css::uno::RuntimeException
252 void setErrorHandler( const css::uno::Reference
< css::xml::sax::XErrorHandler
>& Handler
);
253 /// @throws css::uno::RuntimeException
254 void setNamespaceHandler( const css::uno::Reference
< css::xml::sax::XFastNamespaceHandler
>& Handler
);
// Replaces the list of custom entity names and their replacement texts.
256 void setCustomEntityNames(
257 const ::css::uno::Sequence
<::css::beans::Pair
<::rtl::OUString
, ::rtl::OUString
>>& replacements
);
259 // called by the C callbacks of the libxml2 parser
260 void callbackStartElement( const xmlChar
*localName
, const xmlChar
* prefix
, const xmlChar
* URI
,
261 int numNamespaces
, const xmlChar
** namespaces
, int numAttributes
, const xmlChar
**attributes
);
262 void callbackEndElement();
263 void callbackCharacters( const xmlChar
* s
, int nLen
);
264 void callbackProcessingInstruction( const xmlChar
*target
, const xmlChar
*data
);
265 xmlEntityPtr
callbackGetEntity( const xmlChar
*name
);
267 void pushEntity(const ParserData
&, xml::sax::InputSource
const&);
// Returns the entity of the current parseStream() call (the cached top).
269 Entity
& getEntity() { return *mpTop
; }
// Hands the batch being filled over to the consumer queue.
271 void produce( bool bForceFlush
= false );
// When true, GetTokenWithPrefix() returns DONTKNOW instead of throwing for
// a prefix without a namespace declaration.
272 bool m_bIgnoreMissingNSDecl
;
// When true, parseStream() never enables the threaded parser.
273 bool m_bDisableThreadedParser
;
276 bool consume(EventList
&);
277 void deleteUsedEvents();
278 void sendPendingCharacters();
279 void addUnknownElementWithPrefix(const xmlChar
**attributes
, int i
, rtl::Reference
< FastAttributeList
> const & xAttributes
);
281 sal_Int32
GetToken( const xmlChar
* pName
);
282 /// @throws css::xml::sax::SAXException
283 sal_Int32
GetTokenWithPrefix( const xmlChar
* pPrefix
, const xmlChar
* pName
);
284 /// @throws css::xml::sax::SAXException
285 OUString
const & GetNamespaceURL( std::string_view rPrefix
);
286 sal_Int32
GetNamespaceToken( const OUString
& rNamespaceURL
);
287 sal_Int32
GetTokenWithContextNamespace( sal_Int32 nNamespaceToken
, const xmlChar
* pName
);
288 void DefineNamespace( const OString
& rPrefix
, const OUString
& namespaceURL
);
291 std::mutex maMutex
; ///< Protecting whole parseStream() execution
// Locator handed to the document handler; disposed in the destructor.
292 ::rtl::Reference
< FastLocatorImpl
> mxDocumentLocator
;
// Namespace URL -> token, filled by registerNamespace().
293 NamespaceMap maNamespaceMap
;
295 ParserData maData
; /// Cached parser configuration for next call of parseStream().
297 Entity
*mpTop
; /// std::stack::top() is amazingly slow => cache this.
298 std::stack
< Entity
> maEntities
; /// Entity stack for each call of parseStream().
299 std::vector
<char> pendingCharacters
; /// Data from characters() callback that needs to be sent.
302 } // namespace sax_fastparser
// Worker thread (named "Parser") that runs on behalf of one
// FastSaxParserImpl; parseStream() creates it when threading is enabled.
306 class ParserThread
: public salhelper::Thread
// Parser this thread works for; not owned.
308 FastSaxParserImpl
*mpParser
;
310 explicit ParserThread(FastSaxParserImpl
*pParser
): Thread("Parser"), mpParser(pParser
) {}
// Thread body. The statements visible here request an EXCEPTION event from
// the current entity and then force-flush the produced events to the consumer.
312 virtual void execute() override
320 Entity
&rEntity
= mpParser
->getEntity();
321 rEntity
.getEvent( CallbackType::EXCEPTION
);
322 mpParser
->produce( true );
329 static void call_callbackStartElement(void *userData
, const xmlChar
*localName
, const xmlChar
* prefix
, const xmlChar
* URI
,
330 int numNamespaces
, const xmlChar
** namespaces
, int numAttributes
, int /*defaultedAttributes*/, const xmlChar
**attributes
)
332 FastSaxParserImpl
* pFastParser
= static_cast<FastSaxParserImpl
*>( userData
);
333 pFastParser
->callbackStartElement( localName
, prefix
, URI
, numNamespaces
, namespaces
, numAttributes
, attributes
);
336 static void call_callbackEndElement(void *userData
, const xmlChar
* /*localName*/, const xmlChar
* /*prefix*/, const xmlChar
* /*URI*/)
338 FastSaxParserImpl
* pFastParser
= static_cast<FastSaxParserImpl
*>( userData
);
339 pFastParser
->callbackEndElement();
342 static void call_callbackCharacters( void *userData
, const xmlChar
*s
, int nLen
)
344 FastSaxParserImpl
* pFastParser
= static_cast<FastSaxParserImpl
*>( userData
);
345 pFastParser
->callbackCharacters( s
, nLen
);
348 static void call_callbackProcessingInstruction( void *userData
, const xmlChar
*target
, const xmlChar
*data
)
350 FastSaxParserImpl
* pFastParser
= static_cast<FastSaxParserImpl
*>( userData
);
351 pFastParser
->callbackProcessingInstruction( target
, data
);
354 static xmlEntityPtr
call_callbackGetEntity( void *userData
, const xmlChar
*name
)
356 FastSaxParserImpl
* pFastParser
= static_cast<FastSaxParserImpl
*>( userData
);
357 return pFastParser
->callbackGetEntity( name
);
// XLocator implementation that answers position and id queries from the
// current entity of the FastSaxParserImpl it points to.
362 class FastLocatorImpl
: public WeakImplHelper
< XLocator
>
365 explicit FastLocatorImpl(FastSaxParserImpl
*p
) : mpParser(p
) {}
// Detaches the locator from its parser; called by ~FastSaxParserImpl().
367 void dispose() { mpParser
= nullptr; }
368 /// @throws RuntimeException
// Throws DisposedException once dispose() has cleared the parser pointer.
369 void checkDispose() const { if( !mpParser
) throw DisposedException(); }
372 virtual sal_Int32 SAL_CALL
getColumnNumber() override
;
373 virtual sal_Int32 SAL_CALL
getLineNumber() override
;
374 virtual OUString SAL_CALL
getPublicId() override
;
375 virtual OUString SAL_CALL
getSystemId() override
;
// Parser being observed; not owned, nullptr after dispose().
378 FastSaxParserImpl
*mpParser
;
// Column number as reported by libxml2's xmlSAX2GetColumnNumber() for the
// parser context of the parser's current entity.
381 sal_Int32 SAL_CALL
FastLocatorImpl::getColumnNumber()
384 return xmlSAX2GetColumnNumber( mpParser
->getEntity().mpParser
);
// Line number as reported by libxml2's xmlSAX2GetLineNumber() for the
// parser context of the parser's current entity.
387 sal_Int32 SAL_CALL
FastLocatorImpl::getLineNumber()
390 return xmlSAX2GetLineNumber( mpParser
->getEntity().mpParser
);
// Public id of the input source of the parser's current entity.
393 OUString SAL_CALL
FastLocatorImpl::getPublicId()
396 return mpParser
->getEntity().maStructSource
.sPublicId
;
// System id of the input source of the parser's current entity.
399 OUString SAL_CALL
FastLocatorImpl::getSystemId()
402 return mpParser
->getEntity().maStructSource
.sSystemId
;
// Default constructor of ParserData; its body is not visible in this section.
405 ParserData::ParserData()
// Creates the entity for one parse run. The initialisers visible here start
// with no produced events and with threading disabled; parseStream() may
// enable threading afterwards.
408 Entity::Entity(const ParserData
& rData
)
410 , mnProducedEventsSize(0)
411 , mbEnableThreads(false)
// Handles a start-element event on the consumer side: pushes a SaxContext
// for the element, forwards namespace declarations to the namespace handler
// (if any), asks the parent context or the document handler for a child
// context, notifies that context, and stores it in the pushed stack entry.
// Exceptions are captured with saveException().
416 void Entity::startElement( Event
const *pEvent
)
418 const sal_Int32
& nElementToken
= pEvent
->mnElementToken
;
419 const OUString
& aNamespace
= pEvent
->msNamespace
;
420 const OUString
& aElementName
= pEvent
->msElementName
;
422 // Use un-wrapped pointers to avoid significant acquire/release overhead
423 XFastContextHandler
*pParentContext
= nullptr;
424 if( !maContextStack
.empty() )
426 pParentContext
= maContextStack
.top().mxContext
.get();
// The enclosing element has no handler: still push an entry for this
// element so the stack depth keeps matching the element nesting.
427 if( !pParentContext
)
429 maContextStack
.push( SaxContext(nElementToken
, aNamespace
, aElementName
) );
434 maContextStack
.push( SaxContext( nElementToken
, aNamespace
, aElementName
) );
438 const Reference
< XFastAttributeList
> & xAttr( pEvent
->mxAttributes
);
439 Reference
< XFastContextHandler
> xContext
;
// Report every namespace declaration of this element to the namespace handler.
441 if ( mxNamespaceHandler
.is() )
443 const Sequence
< xml::Attribute
> NSDeclAttribs
= pEvent
->mxDeclAttributes
->getUnknownAttributes();
444 for (const auto& rNSDeclAttrib
: NSDeclAttribs
)
446 mxNamespaceHandler
->registerNamespace( rNSDeclAttrib
.Name
, rNSDeclAttrib
.Value
);
// Unknown element: use the namespace/name based handler interface.
450 if( nElementToken
== FastToken::DONTKNOW
)
453 xContext
= pParentContext
->createUnknownChildContext( aNamespace
, aElementName
, xAttr
);
454 else if( mxDocumentHandler
.is() )
455 xContext
= mxDocumentHandler
->createUnknownChildContext( aNamespace
, aElementName
, xAttr
);
459 xContext
->startUnknownElement( aNamespace
, aElementName
, xAttr
);
// Known element: use the token based handler interface.
465 xContext
= pParentContext
->createFastChildContext( nElementToken
, xAttr
);
466 else if( mxDocumentHandler
.is() )
467 xContext
= mxDocumentHandler
->createFastChildContext( nElementToken
, xAttr
);
470 xContext
->startFastElement( nElementToken
, xAttr
);
472 // swap the reference we own in to avoid referencing thrash.
473 maContextStack
.top().mxContext
= std::move( xContext
);
477 saveException( ::cppu::getCaughtException() );
// Forwards character data to the context handler on top of the context
// stack. Exceptions are captured with saveException().
481 void Entity::characters( const OUString
& sChars
)
483 if (maContextStack
.empty())
485 // Malformed XML stream !?
489 XFastContextHandler
* pContext( maContextStack
.top().mxContext
.get() );
492 pContext
->characters( sChars
);
496 saveException( ::cppu::getCaughtException() );
// Handles an end-element event: notifies the context handler on top of the
// context stack (token based or namespace/name based, matching how the
// element was started) and pops the stack entry. Exceptions are captured
// with saveException().
500 void Entity::endElement()
502 if (maContextStack
.empty())
504 // Malformed XML stream !?
508 const SaxContext
& aContext
= maContextStack
.top();
509 XFastContextHandler
* pContext( aContext
.mxContext
.get() );
513 sal_Int32 nElementToken
= aContext
.mnElementToken
;
514 if( nElementToken
!= FastToken::DONTKNOW
)
515 pContext
->endFastElement( nElementToken
);
// moNamespace / moElementName are only set for DONTKNOW contexts
// (see the SaxContext constructor).
517 pContext
->endUnknownElement( *aContext
.moNamespace
, *aContext
.moElementName
);
521 saveException( ::cppu::getCaughtException() );
523 maContextStack
.pop();
// Forwards a processing instruction to the document handler, if one is set.
// Exceptions are captured with saveException().
526 void Entity::processingInstruction( const OUString
& rTarget
, const OUString
& rData
)
528 if( mxDocumentHandler
.is() ) try
530 mxDocumentHandler
->processingInstruction( rTarget
, rData
);
534 saveException( ::cppu::getCaughtException() );
// Returns the batch the producer is currently filling. If there is none, a
// used batch is taken over from maUsedEvents (under maEventProtector) when
// available; otherwise a new batch with mnEventListSize events is created.
// In both cases the produced-event counter restarts at 0.
538 EventList
& Entity::getEventList()
540 if (!mxProducedEvents
)
542 std::unique_lock
aGuard(maEventProtector
);
543 if (!maUsedEvents
.empty())
545 mxProducedEvents
= std::move(maUsedEvents
.front());
547 aGuard
.unlock(); // unlock
548 mnProducedEventsSize
= 0;
550 if (!mxProducedEvents
)
552 mxProducedEvents
.emplace();
553 mxProducedEvents
->maEvents
.resize(mnEventListSize
);
554 mxProducedEvents
->mbIsAttributesEmpty
= false;
555 mnProducedEventsSize
= 0;
558 return *mxProducedEvents
;
// Returns the event slot to fill for a callback of type aType. Without
// threading the single shared event is returned. With threading the next
// free slot of the current batch is claimed and tagged with aType; the batch
// grows by one slot when it is already full.
561 Event
& Entity::getEvent( CallbackType aType
)
563 if (!mbEnableThreads
)
564 return maSharedEvent
;
566 EventList
& rEventList
= getEventList();
567 if (mnProducedEventsSize
== rEventList
.maEvents
.size())
569 SAL_WARN_IF(!maSavedException
.hasValue(), "sax",
570 "Event vector should only exceed " << mnEventListSize
<<
571 " temporarily while an exception is pending");
572 rEventList
.maEvents
.resize(mnProducedEventsSize
+ 1);
574 Event
& rEvent
= rEventList
.maEvents
[mnProducedEventsSize
++];
575 rEvent
.maType
= aType
;
579 OUString
lclGetErrorMessage( xmlParserCtxtPtr ctxt
, std::u16string_view sSystemId
, sal_Int32 nLine
)
581 const char* pMessage
;
582 xmlErrorPtr error
= xmlCtxtGetLastError( ctxt
);
583 if( error
&& error
->message
)
584 pMessage
= error
->message
;
586 pMessage
= "unknown error";
587 return OUString::Concat("[") + sSystemId
+ " line " + OUString::number(nLine
) + "]: " +
588 OUString(pMessage
, strlen(pMessage
), RTL_TEXTENCODING_ASCII_US
);
591 // throw an exception, but avoid callback if
592 // during a threaded produce
// Builds a SAXParseException from the locator's position and ids, wrapping
// the saved exception when one is present. The error handler is only called
// when not parsing or when threading is disabled.
593 void Entity::throwException( const ::rtl::Reference
< FastLocatorImpl
> &xDocumentLocator
,
596 // Error during parsing !
// Read maSavedException under its mutex.
599 std::scoped_lock
g(maSavedExceptionMutex
);
600 if (maSavedException
.hasValue())
602 savedException
.setValue(&maSavedException
, cppu::UnoType
<decltype(maSavedException
)>::get());
605 SAXParseException
aExcept(
606 lclGetErrorMessage( mpParser
,
607 xDocumentLocator
->getSystemId(),
608 xDocumentLocator
->getLineNumber() ),
609 Reference
< XInterface
>(),
611 xDocumentLocator
->getPublicId(),
612 xDocumentLocator
->getSystemId(),
613 xDocumentLocator
->getLineNumber(),
614 xDocumentLocator
->getColumnNumber()
617 // error handler is set, it may throw the exception
618 if( !mbDuringParse
|| !mbEnableThreads
)
620 if (mxErrorHandler
.is() )
621 mxErrorHandler
->fatalError( Any( aExcept
) );
624 // error handler has not thrown, but parsing must stop => throw ourselves
628 // In the single threaded case we emit events via our C
629 // callbacks, so any exception caught must be queued up until
630 // we can safely re-throw it from our C++ parent of parse()
632 // If multi-threaded, we need to push an EXCEPTION event, at
633 // which point we transfer ownership of maSavedException to
634 // the consuming thread.
// Logs the exception and stores it in maSavedException under
// maSavedExceptionMutex.
635 void Entity::saveException( const Any
& e
)
637 // fdo#81214 - allow the parser to run on after an exception,
638 // unexpectedly some 'startElements' produce a UNO_QUERY_THROW
639 // for XComponent; and yet expect to continue parsing.
640 SAL_WARN("sax", "Unexpected exception from XML parser " << exceptionToString(e
));
641 std::scoped_lock
g(maSavedExceptionMutex
);
642 if (maSavedException
.hasValue())
644 SAL_INFO("sax.fastparser", "discarding exception, already have one");
648 maSavedException
= e
;
652 bool Entity::hasException()
654 std::scoped_lock
g(maSavedExceptionMutex
);
655 return maSavedException
.hasValue();
660 namespace sax_fastparser
{
// Starts with strict namespace checking and the threaded parser allowed,
// and creates the document locator bound to this parser.
662 FastSaxParserImpl::FastSaxParserImpl() :
663 m_bIgnoreMissingNSDecl(false),
664 m_bDisableThreadedParser(false),
667 mxDocumentLocator
.set( new FastLocatorImpl( this ) );
// Detaches the document locator from this parser, then walks the entities
// collected in m_TemporalEntities to release them.
670 FastSaxParserImpl::~FastSaxParserImpl()
672 if( mxDocumentLocator
.is() )
673 mxDocumentLocator
->dispose();
674 for (auto& entity
: m_TemporalEntities
)
// Each entity is handled through its xmlNodePtr view.
678 xmlNodePtr pPtr
= reinterpret_cast<xmlNodePtr
>(entity
);
684 void FastSaxParserImpl::DefineNamespace( const OString
& rPrefix
, const OUString
& namespaceURL
)
686 Entity
& rEntity
= getEntity();
687 assert(!rEntity
.maNamespaceCount
.empty()); // need a context!
689 sal_uInt32 nOffset
= rEntity
.maNamespaceCount
.top()++;
690 if( rEntity
.maNamespaceDefines
.size() <= nOffset
)
691 rEntity
.maNamespaceDefines
.resize( rEntity
.maNamespaceDefines
.size() + 64 );
693 rEntity
.maNamespaceDefines
[nOffset
] = NamespaceDefine( rPrefix
, GetNamespaceToken( namespaceURL
), namespaceURL
);
696 sal_Int32
FastSaxParserImpl::GetToken(const xmlChar
* pName
)
698 return FastTokenHandlerBase::getTokenFromChars( getEntity(). mxTokenHandler
.get(),
699 XML_CAST( pName
) ); // uses utf-8
702 sal_Int32
FastSaxParserImpl::GetTokenWithPrefix( const xmlChar
* pPrefix
, const xmlChar
* pName
)
704 Entity
& rEntity
= getEntity();
705 if (rEntity
.maNamespaceCount
.empty())
706 return FastToken::DONTKNOW
;
708 std::string_view
sPrefix(XML_CAST(pPrefix
));
709 sal_uInt32 nNamespace
= rEntity
.maNamespaceCount
.top();
710 while( nNamespace
-- )
712 const auto & rNamespaceDefine
= rEntity
.maNamespaceDefines
[nNamespace
];
713 if( rNamespaceDefine
.maPrefix
== sPrefix
)
714 return GetTokenWithContextNamespace(rNamespaceDefine
.mnToken
, pName
);
717 if (!m_bIgnoreMissingNSDecl
)
718 throw SAXException("No namespace defined for " + OStringToOUString(sPrefix
,
719 RTL_TEXTENCODING_UTF8
), {}, {});
721 return FastToken::DONTKNOW
;
724 sal_Int32
FastSaxParserImpl::GetNamespaceToken( const OUString
& rNamespaceURL
)
726 NamespaceMap::iterator
aIter( maNamespaceMap
.find( rNamespaceURL
) );
727 if( aIter
!= maNamespaceMap
.end() )
728 return (*aIter
).second
;
730 return FastToken::DONTKNOW
;
733 OUString
const & FastSaxParserImpl::GetNamespaceURL( std::string_view rPrefix
)
735 Entity
& rEntity
= getEntity();
736 if( !rEntity
.maNamespaceCount
.empty() )
738 sal_uInt32 nNamespace
= rEntity
.maNamespaceCount
.top();
739 while( nNamespace
-- )
740 if( rEntity
.maNamespaceDefines
[nNamespace
].maPrefix
== rPrefix
)
741 return rEntity
.maNamespaceDefines
[nNamespace
].maNamespaceURL
;
744 throw SAXException("No namespace defined for " + OUString::fromUtf8(rPrefix
),
745 Reference
< XInterface
>(), Any());
748 sal_Int32
FastSaxParserImpl::GetTokenWithContextNamespace( sal_Int32 nNamespaceToken
, const xmlChar
* pName
)
750 if( nNamespaceToken
!= FastToken::DONTKNOW
)
752 sal_Int32 nNameToken
= GetToken( pName
);
753 if( nNameToken
!= FastToken::DONTKNOW
)
754 return nNamespaceToken
| nNameToken
;
757 return FastToken::DONTKNOW
;
// Cleanup helper created by parseStream() for one parse run: it releases the
// libxml2 parser context of the entity, pops the entity from the parser and
// keeps the parser thread so that it can be joined.
// Parser whose entity is popped during cleanup.
765 FastSaxParserImpl
& m_rParser
;
// Parser thread registered through setThread(); empty if none.
767 rtl::Reference
<ParserThread
> m_xParser
;
769 ParserCleanup(FastSaxParserImpl
& rParser
, Entity
& rEntity
)
// Free the document (if libxml2 built one) and then the parser context.
776 if (m_rEntity
.mpParser
)
778 if (m_rEntity
.mpParser
->myDoc
)
779 xmlFreeDoc(m_rEntity
.mpParser
->myDoc
);
780 xmlFreeParserCtxt(m_rEntity
.mpParser
);
783 m_rParser
.popEntity();
785 void setThread(const rtl::Reference
<ParserThread
> &xParser
)
// Work on a local reference to the thread while joining.
793 rtl::Reference
<ParserThread
> xToJoin
= m_xParser
;
802 * parseStream does Parser-startup initializations. The FastSaxParser::parse() method does
803 * the file-specific initialization work. (During a parser run, external files may be opened)
// Parses one input source. Pushes a new Entity, notifies the document
// handler, decides whether to parse on a separate thread and, in the
// threaded case, consumes the event batches produced by that thread.
// A saved exception is re-thrown through Entity::throwException() and the
// document handler finally receives endDocument().
806 void FastSaxParserImpl::parseStream(const InputSource
& rStructSource
)
810 // Only one document is parsed at a time
811 std::unique_lock
guard( maMutex
);
813 pushEntity(maData
, rStructSource
);
814 Entity
& rEntity
= getEntity();
// Releases the parser context and pops the entity when this scope is left.
815 ParserCleanup
aEnsureFree(*this, rEntity
);
817 // start the document
818 if( rEntity
.mxDocumentHandler
.is() )
820 rEntity
.mxDocumentHandler
->setDocumentLocator( mxDocumentLocator
);
821 rEntity
.mxDocumentHandler
->startDocument();
825 rEntity
.mbEnableThreads
= false;
// Threading can be switched off through the SAX_DISABLE_THREADS environment
// variable or m_bDisableThreadedParser; otherwise it is enabled for inputs
// whose length (or available byte count) exceeds 10000.
827 if (!getenv("SAX_DISABLE_THREADS") && !m_bDisableThreadedParser
)
829 Reference
<css::io::XSeekable
> xSeekable(rEntity
.maStructSource
.aInputStream
, UNO_QUERY
);
830 // available() is not __really__ relevant here, but leave it in as a heuristic for non-seekable streams
831 rEntity
.mbEnableThreads
= (xSeekable
.is() && xSeekable
->getLength() > 10000)
832 || (rEntity
.maStructSource
.aInputStream
->available() > 10000);
836 if (rEntity
.mbEnableThreads
)
838 rtl::Reference
<ParserThread
> xParser
= new ParserThread(this);
840 aEnsureFree
.setThread(xParser
);
// Wait until the producer signals that batches are available.
843 rEntity
.maConsumeResume
.wait();
844 rEntity
.maConsumeResume
.reset();
846 std::unique_lock
aGuard(rEntity
.maEventProtector
);
847 while (!rEntity
.maPendingEvents
.empty())
849 if (rEntity
.maPendingEvents
.size() <= Entity::mnEventLowWater
)
850 rEntity
.maProduceResume
.set(); // start producer again
852 EventList aEventList
= std::move(rEntity
.maPendingEvents
.front());
853 rEntity
.maPendingEvents
.pop();
854 aGuard
.unlock(); // unlock
856 if (!consume(aEventList
))
859 aGuard
.lock(); // lock
// Clear the attribute lists of the consumed batch before it is pushed onto
// maUsedEvents for reuse.
861 if ( rEntity
.maPendingEvents
.size() <= Entity::mnEventLowWater
)
864 for (auto& rEvent
: aEventList
.maEvents
)
866 if (rEvent
.mxAttributes
.is())
868 rEvent
.mxAttributes
->clear();
869 if( rEntity
.mxNamespaceHandler
.is() )
870 rEvent
.mxDeclAttributes
->clear();
872 aEventList
.mbIsAttributesEmpty
= true;
877 rEntity
.maUsedEvents
.push(std::move(aEventList
));
880 aEnsureFree
.joinThread();
883 // callbacks used inside XML_Parse may have caught an exception
884 // No need to lock maSavedExceptionMutex here because parser
886 if( rEntity
.maSavedException
.hasValue() )
887 rEntity
.throwException( mxDocumentLocator
, true );
895 if( rEntity
.mxDocumentHandler
.is() )
897 rEntity
.mxDocumentHandler
->endDocument();
901 void FastSaxParserImpl::setFastDocumentHandler( const Reference
< XFastDocumentHandler
>& Handler
)
903 maData
.mxDocumentHandler
= Handler
;
906 void FastSaxParserImpl::setTokenHandler( const Reference
< XFastTokenHandler
>& xHandler
)
908 assert( dynamic_cast< FastTokenHandlerBase
*>( xHandler
.get() ) && "we expect this handler to be a subclass of FastTokenHandlerBase" );
909 maData
.mxTokenHandler
= dynamic_cast< FastTokenHandlerBase
*>( xHandler
.get() );
// Registers a token for a namespace URL in maNamespaceMap.
// Throws IllegalArgumentException when the token is below
// FastToken::NAMESPACE or when the URL already has a token.
912 void FastSaxParserImpl::registerNamespace( const OUString
& NamespaceURL
, sal_Int32 NamespaceToken
)
914 if( NamespaceToken
< FastToken::NAMESPACE
)
915 throw IllegalArgumentException("Invalid namespace token " + OUString::number(NamespaceToken
), css::uno::Reference
<css::uno::XInterface
>(), 0);
// Only a URL that is not yet known gets registered.
917 if( GetNamespaceToken( NamespaceURL
) == FastToken::DONTKNOW
)
919 maNamespaceMap
[ NamespaceURL
] = NamespaceToken
;
922 throw IllegalArgumentException("namespace URL is already registered: " + NamespaceURL
, css::uno::Reference
<css::uno::XInterface
>(), 0);
// Public lookup of the namespace URL for a prefix: converts the UTF-16
// prefix to UTF-8 and delegates to GetNamespaceURL(). A failure of that
// lookup is reported to the caller as IllegalArgumentException.
925 OUString
const & FastSaxParserImpl::getNamespaceURL( std::u16string_view rPrefix
)
929 return GetNamespaceURL( OUStringToOString( rPrefix
, RTL_TEXTENCODING_UTF8
) );
931 catch (const Exception
&)
934 throw IllegalArgumentException();
937 void FastSaxParserImpl::setErrorHandler(const Reference
< XErrorHandler
> & Handler
)
939 maData
.mxErrorHandler
= Handler
;
942 void FastSaxParserImpl::setNamespaceHandler( const Reference
< XFastNamespaceHandler
>& Handler
)
944 maData
.mxNamespaceHandler
= Handler
;
947 void FastSaxParserImpl::setCustomEntityNames(
948 const ::css::uno::Sequence
<::css::beans::Pair
<::rtl::OUString
, ::rtl::OUString
>>& replacements
)
950 m_Replacements
.resize(replacements
.size());
951 for (size_t i
= 0; i
< replacements
.size(); ++i
)
953 m_Replacements
[i
].name
= replacements
[i
].First
;
954 m_Replacements
[i
].replacement
= replacements
[i
].Second
;
956 if (m_Replacements
.size() > 1)
957 std::sort(m_Replacements
.begin(), m_Replacements
.end());
960 void FastSaxParserImpl::deleteUsedEvents()
962 Entity
& rEntity
= getEntity();
963 std::unique_lock
aGuard(rEntity
.maEventProtector
);
965 while (!rEntity
.maUsedEvents
.empty())
967 { // the block makes sure that aEventList is destructed outside the lock
968 EventList aEventList
= std::move(rEntity
.maUsedEvents
.front());
969 rEntity
.maUsedEvents
.pop();
971 aGuard
.unlock(); // unlock
974 aGuard
.lock(); // lock
978 void FastSaxParserImpl::produce( bool bForceFlush
)
980 Entity
& rEntity
= getEntity();
982 rEntity
.mnProducedEventsSize
>= Entity::mnEventListSize
))
985 std::unique_lock
aGuard(rEntity
.maEventProtector
);
987 while (rEntity
.maPendingEvents
.size() >= Entity::mnEventHighWater
)
988 { // pause parsing for a bit
989 aGuard
.unlock(); // unlock
990 rEntity
.maProduceResume
.wait();
991 rEntity
.maProduceResume
.reset();
992 aGuard
.lock(); // lock
995 rEntity
.maPendingEvents
.push(std::move(*rEntity
.mxProducedEvents
));
996 rEntity
.mxProducedEvents
.reset();
997 assert(!rEntity
.mxProducedEvents
);
999 aGuard
.unlock(); // unlock
1001 rEntity
.maConsumeResume
.set();
1004 bool FastSaxParserImpl::consume(EventList
& rEventList
)
1006 Entity
& rEntity
= getEntity();
1007 rEventList
.mbIsAttributesEmpty
= false;
1008 for (auto& rEvent
: rEventList
.maEvents
)
1010 switch (rEvent
.maType
)
1012 case CallbackType::START_ELEMENT
:
1013 rEntity
.startElement( &rEvent
);
1015 case CallbackType::END_ELEMENT
:
1016 rEntity
.endElement();
1018 case CallbackType::CHARACTERS
:
1019 rEntity
.characters( rEvent
.msChars
);
1021 case CallbackType::PROCESSING_INSTRUCTION
:
1022 rEntity
.processingInstruction(
1023 rEvent
.msNamespace
, rEvent
.msElementName
); // ( target, data )
1025 case CallbackType::DONE
:
1027 case CallbackType::EXCEPTION
:
1028 rEntity
.throwException( mxDocumentLocator
, false );
1029 [[fallthrough
]]; // avoid unreachable code warning with some compilers
1038 void FastSaxParserImpl::pushEntity(const ParserData
& rEntityData
,
1039 xml::sax::InputSource
const& rSource
)
1041 if (!rSource
.aInputStream
.is())
1042 throw SAXException("No input source", Reference
<XInterface
>(), Any());
1044 maEntities
.emplace(rEntityData
);
1045 mpTop
= &maEntities
.top();
1047 mpTop
->maStructSource
= rSource
;
1049 mpTop
->maConverter
.setInputStream(mpTop
->maStructSource
.aInputStream
);
1050 if (!mpTop
->maStructSource
.sEncoding
.isEmpty())
1052 mpTop
->maConverter
.setEncoding(OUStringToOString(mpTop
->maStructSource
.sEncoding
, RTL_TEXTENCODING_ASCII_US
));
1056 void FastSaxParserImpl::popEntity()
1059 mpTop
= !maEntities
.empty() ? &maEntities
.top() : nullptr;
1062 // starts parsing with actual parser !
1063 void FastSaxParserImpl::parse()
1065 const int BUFFER_SIZE
= 16 * 1024;
1066 Sequence
< sal_Int8
> seqOut( BUFFER_SIZE
);
1068 Entity
& rEntity
= getEntity();
1070 // set all necessary C-Callbacks
1071 static xmlSAXHandler callbacks
;
1072 callbacks
.startElementNs
= call_callbackStartElement
;
1073 callbacks
.endElementNs
= call_callbackEndElement
;
1074 callbacks
.characters
= call_callbackCharacters
;
1075 callbacks
.processingInstruction
= call_callbackProcessingInstruction
;
1076 callbacks
.getEntity
= call_callbackGetEntity
;
1077 callbacks
.initialized
= XML_SAX2_MAGIC
;
1081 nRead
= rEntity
.maConverter
.readAndConvert( seqOut
, BUFFER_SIZE
);
1084 if( rEntity
.mpParser
!= nullptr )
1086 if( xmlParseChunk( rEntity
.mpParser
, reinterpret_cast<const char*>(seqOut
.getConstArray()), 0, 1 ) != XML_ERR_OK
)
1087 rEntity
.throwException( mxDocumentLocator
, true );
1088 if (rEntity
.hasException())
1089 rEntity
.throwException(mxDocumentLocator
, true);
1094 bool bContinue
= true;
1095 if( rEntity
.mpParser
== nullptr )
1097 // create parser with proper encoding (needs the first chunk of data)
1098 rEntity
.mpParser
= xmlCreatePushParserCtxt( &callbacks
, this,
1099 reinterpret_cast<const char*>(seqOut
.getConstArray()), nRead
, nullptr );
1100 if( !rEntity
.mpParser
)
1101 throw SAXException("Couldn't create parser", Reference
< XInterface
>(), Any() );
1103 // Tell libxml2 parser to decode entities in attribute values.
1104 // Also allow XML attribute values which are larger than 10MB, because this used to work
1106 // coverity[unsafe_xml_parse_config] - entity support is required
1107 xmlCtxtUseOptions(rEntity
.mpParser
, XML_PARSE_NOENT
| XML_PARSE_HUGE
);
1111 bContinue
= xmlParseChunk( rEntity
.mpParser
, reinterpret_cast<const char*>(seqOut
.getConstArray()), nRead
, 0 )
1115 // callbacks used inside XML_Parse may have caught an exception
1118 rEntity
.throwException( mxDocumentLocator
, true );
1120 if (rEntity
.hasException())
1122 rEntity
.throwException( mxDocumentLocator
, true );
1124 } while( nRead
> 0 );
1125 rEntity
.getEvent( CallbackType::DONE
);
1126 if( rEntity
.mbEnableThreads
)
1131 void FastSaxParserImpl::callbackStartElement(const xmlChar
*localName
, const xmlChar
* prefix
, const xmlChar
* URI
,
1132 int numNamespaces
, const xmlChar
** namespaces
, int numAttributes
, const xmlChar
**attributes
)
1134 if (!pendingCharacters
.empty())
1135 sendPendingCharacters();
1136 Entity
& rEntity
= getEntity();
1137 if( rEntity
.maNamespaceCount
.empty() )
1139 rEntity
.maNamespaceCount
.push(0);
1140 DefineNamespace( "xml", "http://www.w3.org/XML/1998/namespace");
1144 rEntity
.maNamespaceCount
.push( rEntity
.maNamespaceCount
.top() );
1147 // create attribute map and process namespace instructions
1148 Event
& rEvent
= rEntity
.getEvent( CallbackType::START_ELEMENT
);
1149 bool bIsAttributesEmpty
= false;
1150 if ( rEntity
.mbEnableThreads
)
1151 bIsAttributesEmpty
= rEntity
.getEventList().mbIsAttributesEmpty
;
1153 if (rEvent
.mxAttributes
.is())
1155 if( !bIsAttributesEmpty
)
1156 rEvent
.mxAttributes
->clear();
1159 rEvent
.mxAttributes
.set(
1160 new FastAttributeList( rEntity
.mxTokenHandler
.get() ) );
1162 if( rEntity
.mxNamespaceHandler
.is() )
1164 if (rEvent
.mxDeclAttributes
.is())
1166 if( !bIsAttributesEmpty
)
1167 rEvent
.mxDeclAttributes
->clear();
1170 rEvent
.mxDeclAttributes
.set(
1171 new FastAttributeList( rEntity
.mxTokenHandler
.get() ) );
1174 OUString sNamespace
;
1175 sal_Int32 nNamespaceToken
= FastToken::DONTKNOW
;
1176 if (!rEntity
.maNamespaceStack
.empty())
1178 sNamespace
= rEntity
.maNamespaceStack
.top().msName
;
1179 nNamespaceToken
= rEntity
.maNamespaceStack
.top().mnToken
;
1184 /* #158414# Each element may define new namespaces, also for attributes.
1185 First, process all namespaces, second, process the attributes after namespaces
1186 have been initialized. */
1188 // #158414# first: get namespaces
1189 for (int i
= 0; i
< numNamespaces
* 2; i
+= 2)
1191 // namespaces[] is (prefix/URI)
1192 if( namespaces
[ i
] != nullptr )
1194 OString
aPrefix( XML_CAST( namespaces
[ i
] ));
1195 OUString
namespaceURL( XML_CAST( namespaces
[ i
+ 1 ] ), strlen( XML_CAST( namespaces
[ i
+ 1 ] )), RTL_TEXTENCODING_UTF8
);
1196 NormalizeURI( namespaceURL
);
1197 DefineNamespace(aPrefix
, namespaceURL
);
1198 if( rEntity
.mxNamespaceHandler
.is() )
1199 rEvent
.mxDeclAttributes
->addUnknown( OString( XML_CAST( namespaces
[ i
] ) ), OString( XML_CAST( namespaces
[ i
+ 1 ] ) ) );
1203 // default namespace
1204 sNamespace
= OUString( XML_CAST( namespaces
[ i
+ 1 ] ), strlen( XML_CAST( namespaces
[ i
+ 1 ] )), RTL_TEXTENCODING_UTF8
);
1205 NormalizeURI( sNamespace
);
1206 nNamespaceToken
= GetNamespaceToken( sNamespace
);
1207 if( rEntity
.mxNamespaceHandler
.is() )
1208 rEvent
.mxDeclAttributes
->addUnknown( "", OString( XML_CAST( namespaces
[ i
+ 1 ] ) ) );
1212 if ( rEntity
.mxTokenHandler
.is() )
1214 // #158414# second: fill attribute list with other attributes
1215 rEvent
.mxAttributes
->reserve( numAttributes
);
1216 for (int i
= 0; i
< numAttributes
* 5; i
+= 5)
1218 // attributes[] is ( localname / prefix / nsURI / valueBegin / valueEnd )
1219 if( attributes
[ i
+ 1 ] != nullptr )
1221 sal_Int32 nAttributeToken
= GetTokenWithPrefix(attributes
[ i
+ 1 ], attributes
[ i
]);
1222 if( nAttributeToken
!= FastToken::DONTKNOW
)
1223 rEvent
.mxAttributes
->add( nAttributeToken
, std::string_view(XML_CAST( attributes
[ i
+ 3 ] ), attributes
[ i
+ 4 ] - attributes
[ i
+ 3 ]) );
1225 addUnknownElementWithPrefix(attributes
, i
, rEvent
.mxAttributes
);
1229 sal_Int32 nAttributeToken
= GetToken(attributes
[ i
]);
1230 if( nAttributeToken
!= FastToken::DONTKNOW
)
1231 rEvent
.mxAttributes
->add( nAttributeToken
, std::string_view(XML_CAST( attributes
[ i
+ 3 ] ), attributes
[ i
+ 4 ] - attributes
[ i
+ 3 ]) );
1234 SAL_WARN("xmloff", "unknown attribute " << XML_CAST( attributes
[ i
] ) << "=" <<
1235 OString( XML_CAST( attributes
[ i
+ 3 ] ), attributes
[ i
+ 4 ] - attributes
[ i
+ 3 ] ));
1236 rEvent
.mxAttributes
->addUnknown( XML_CAST( attributes
[ i
] ),
1237 OString( XML_CAST( attributes
[ i
+ 3 ] ), attributes
[ i
+ 4 ] - attributes
[ i
+ 3 ] ));
1242 if( prefix
!= nullptr )
1243 rEvent
.mnElementToken
= GetTokenWithPrefix(prefix
, localName
);
1244 else if( !sNamespace
.isEmpty() )
1245 rEvent
.mnElementToken
= GetTokenWithContextNamespace(nNamespaceToken
, localName
);
1247 rEvent
.mnElementToken
= GetToken(localName
);
1251 for (int i
= 0; i
< numAttributes
* 5; i
+= 5)
1253 if( attributes
[ i
+ 1 ] != nullptr )
1254 addUnknownElementWithPrefix(attributes
, i
, rEvent
.mxAttributes
);
1256 rEvent
.mxAttributes
->addUnknown( XML_CAST( attributes
[ i
] ),
1257 OString( XML_CAST( attributes
[ i
+ 3 ] ), attributes
[ i
+ 4 ] - attributes
[ i
+ 3 ] ));
1260 rEvent
.mnElementToken
= FastToken::DONTKNOW
;
1263 if( rEvent
.mnElementToken
== FastToken::DONTKNOW
)
1265 OUString aElementPrefix
;
1266 if( prefix
!= nullptr )
1268 aElementPrefix
= OUString( XML_CAST( prefix
), strlen( XML_CAST( prefix
)), RTL_TEXTENCODING_UTF8
);
1269 if ( URI
!= nullptr )
1270 sNamespace
= OUString( XML_CAST( URI
), strlen( XML_CAST( URI
)), RTL_TEXTENCODING_UTF8
);
1271 else if ( m_bIgnoreMissingNSDecl
)
1274 throw SAXException("No namespace defined for " + aElementPrefix
, {}, {});
1275 nNamespaceToken
= GetNamespaceToken( sNamespace
);
1277 OUString
aElementLocalName( XML_CAST( localName
), strlen( XML_CAST( localName
)), RTL_TEXTENCODING_UTF8
);
1278 rEvent
.msNamespace
= sNamespace
;
1279 if( aElementPrefix
.isEmpty() )
1280 rEvent
.msElementName
= std::move(aElementLocalName
);
1282 rEvent
.msElementName
= aElementPrefix
+ ":" + aElementLocalName
;
1284 else // token is always preferred.
1285 rEvent
.msElementName
.clear();
1287 rEntity
.maNamespaceStack
.push( NameWithToken(sNamespace
, nNamespaceToken
) );
1288 if (rEntity
.mbEnableThreads
)
1292 SAL_INFO("sax.fastparser", " startElement line " << mxDocumentLocator
->getLineNumber() << " column " << mxDocumentLocator
->getColumnNumber() << " " << ( prefix
? XML_CAST(prefix
) : "(null)" ) << ":" << localName
);
1293 rEntity
.startElement( &rEvent
);
1298 rEntity
.saveException( ::cppu::getCaughtException() );
1302 void FastSaxParserImpl::addUnknownElementWithPrefix(const xmlChar
**attributes
, int i
, rtl::Reference
< FastAttributeList
> const & xAttributes
)
1304 OUString aNamespaceURI
;
1305 if ( !m_bIgnoreMissingNSDecl
|| attributes
[i
+ 2] != nullptr )
1306 aNamespaceURI
= OUString( XML_CAST( attributes
[ i
+ 2 ] ), strlen( XML_CAST( attributes
[ i
+ 2 ] )), RTL_TEXTENCODING_UTF8
);
1307 const OString
& rPrefix
= OString( XML_CAST( attributes
[ i
+ 1 ] ));
1308 const OString
& rLocalName
= OString( XML_CAST( attributes
[ i
] ));
1309 OString aQualifiedName
= (rPrefix
.isEmpty())? rLocalName
: rPrefix
+ ":" + rLocalName
;
1310 xAttributes
->addUnknown( aNamespaceURI
, aQualifiedName
,
1311 OString( XML_CAST( attributes
[ i
+ 3 ] ), attributes
[ i
+ 4 ] - attributes
[ i
+ 3 ] ));
1312 SAL_INFO("xmloff", "unknown element " << aQualifiedName
<< " " << aNamespaceURI
);
1315 void FastSaxParserImpl::callbackEndElement()
1317 if (!pendingCharacters
.empty())
1318 sendPendingCharacters();
1319 Entity
& rEntity
= getEntity();
1320 SAL_WARN_IF(rEntity
.maNamespaceCount
.empty(), "sax", "Empty NamespaceCount");
1321 if( !rEntity
.maNamespaceCount
.empty() )
1322 rEntity
.maNamespaceCount
.pop();
1324 SAL_WARN_IF(rEntity
.maNamespaceStack
.empty(), "sax", "Empty NamespaceStack");
1325 if( !rEntity
.maNamespaceStack
.empty() )
1326 rEntity
.maNamespaceStack
.pop();
1328 rEntity
.getEvent( CallbackType::END_ELEMENT
);
1329 if (rEntity
.mbEnableThreads
)
1332 rEntity
.endElement();
1335 void FastSaxParserImpl::callbackCharacters( const xmlChar
* s
, int nLen
)
1337 // SAX interface allows that the characters callback splits content of one XML node
1338 // (e.g. because there's an entity that needs decoding), however for consumers it's
1339 // simpler FastSaxParser's character callback provides the whole string at once,
1340 // so merge data from possible multiple calls and send them at once (before the element
1341 // ends or another one starts).
1343 // We use a std::vector<char> to avoid calling into the OUString constructor more than once when
1344 // we have multiple callbackCharacters() calls that we have to merge, which happens surprisingly
1345 // often in writer documents.
1346 int nOriginalLen
= pendingCharacters
.size();
1347 pendingCharacters
.resize(nOriginalLen
+ nLen
);
1348 memcpy(pendingCharacters
.data() + nOriginalLen
, s
, nLen
);
1351 void FastSaxParserImpl::sendPendingCharacters()
1353 Entity
& rEntity
= getEntity();
1354 OUString
sChars( pendingCharacters
.data(), pendingCharacters
.size(), RTL_TEXTENCODING_UTF8
);
1355 if (rEntity
.mbEnableThreads
)
1357 Event
& rEvent
= rEntity
.getEvent( CallbackType::CHARACTERS
);
1358 rEvent
.msChars
= std::move(sChars
);
1362 rEntity
.characters( sChars
);
1363 pendingCharacters
.resize(0);
1366 void FastSaxParserImpl::callbackProcessingInstruction( const xmlChar
*target
, const xmlChar
*data
)
1368 if (!pendingCharacters
.empty())
1369 sendPendingCharacters();
1370 Entity
& rEntity
= getEntity();
1371 Event
& rEvent
= rEntity
.getEvent( CallbackType::PROCESSING_INSTRUCTION
);
1373 // This event is very rare, so no need to waste extra space for this
1374 // Using namespace and element strings to be target and data in that order.
1375 rEvent
.msNamespace
= OUString( XML_CAST( target
), strlen( XML_CAST( target
) ), RTL_TEXTENCODING_UTF8
);
1376 if ( data
!= nullptr )
1377 rEvent
.msElementName
= OUString( XML_CAST( data
), strlen( XML_CAST( data
) ), RTL_TEXTENCODING_UTF8
);
1379 rEvent
.msElementName
.clear();
1381 if (rEntity
.mbEnableThreads
)
1384 rEntity
.processingInstruction( rEvent
.msNamespace
, rEvent
.msElementName
);
1387 xmlEntityPtr
FastSaxParserImpl::callbackGetEntity( const xmlChar
*name
)
1390 return xmlGetPredefinedEntity(name
);
1391 const char* dname
= XML_CAST(name
);
1392 int lname
= strlen(dname
);
1394 return xmlGetPredefinedEntity(name
);
1395 if (m_Replacements
.size() > 0)
1397 auto it
= std::lower_bound(m_Replacements
.begin(), m_Replacements
.end(), dname
);
1398 if (it
!= m_Replacements
.end() && it
->name
.compareToAscii(dname
) == 0)
1400 xmlEntityPtr entpt
= xmlNewEntity(
1401 nullptr, name
, XML_INTERNAL_GENERAL_ENTITY
, nullptr, nullptr,
1402 BAD_CAST(OUStringToOString(it
->replacement
, RTL_TEXTENCODING_UTF8
).getStr()));
1403 m_TemporalEntities
.push_back(entpt
);
1408 return xmlGetPredefinedEntity(name
);
1409 if ( dname
[0] == '#' )
1411 sal_uInt32 cval
= 0;
1412 if( dname
[1] == 'x' || dname
[1] == 'X' )
1415 return xmlGetPredefinedEntity(name
);
1416 cval
= static_cast<sal_uInt32
>( strtoul( dname
+ 2, nullptr, 16 ) );
1418 return xmlGetPredefinedEntity(name
);
1419 OUString
vname( &cval
, 1 );
1421 = xmlNewEntity(nullptr, name
, XML_INTERNAL_GENERAL_ENTITY
, nullptr, nullptr,
1422 BAD_CAST(OUStringToOString(vname
, RTL_TEXTENCODING_UTF8
).getStr()));
1423 m_TemporalEntities
.push_back(entpt
);
1428 cval
= static_cast<sal_uInt32
>( strtoul( dname
+ 2, nullptr, 10 ) );
1430 return xmlGetPredefinedEntity(name
);
1431 OUString
vname(&cval
, 1);
1433 = xmlNewEntity(nullptr, name
, XML_INTERNAL_GENERAL_ENTITY
, nullptr, nullptr,
1434 BAD_CAST(OUStringToOString(vname
, RTL_TEXTENCODING_UTF8
).getStr()));
1435 m_TemporalEntities
.push_back(entpt
);
1439 return xmlGetPredefinedEntity(name
);
1442 FastSaxParser::FastSaxParser() : mpImpl(new FastSaxParserImpl
) {}
1444 FastSaxParser::~FastSaxParser()
1449 FastSaxParser::initialize(css::uno::Sequence
< css::uno::Any
> const& rArguments
)
1451 if (!rArguments
.hasElements())
1455 if ( !(rArguments
[0] >>= str
) )
1456 throw IllegalArgumentException();
1458 if ( str
== "IgnoreMissingNSDecl" )
1459 mpImpl
->m_bIgnoreMissingNSDecl
= true;
1460 else if ( str
== "DoSmeplease" )
1461 ; //just ignore as this is already immune to billion laughs
1462 else if ( str
== "DisableThreadedParser" )
1463 mpImpl
->m_bDisableThreadedParser
= true;
1465 throw IllegalArgumentException();
1469 void FastSaxParser::parseStream( const xml::sax::InputSource
& aInputSource
)
1471 mpImpl
->parseStream(aInputSource
);
1474 void FastSaxParser::setFastDocumentHandler( const uno::Reference
<xml::sax::XFastDocumentHandler
>& Handler
)
1476 mpImpl
->setFastDocumentHandler(Handler
);
1479 void FastSaxParser::setTokenHandler( const uno::Reference
<xml::sax::XFastTokenHandler
>& Handler
)
1481 mpImpl
->setTokenHandler(Handler
);
1484 void FastSaxParser::registerNamespace( const OUString
& NamespaceURL
, sal_Int32 NamespaceToken
)
1486 mpImpl
->registerNamespace(NamespaceURL
, NamespaceToken
);
1489 OUString
FastSaxParser::getNamespaceURL( const OUString
& rPrefix
)
1491 return mpImpl
->getNamespaceURL(rPrefix
);
1494 void FastSaxParser::setErrorHandler( const uno::Reference
< xml::sax::XErrorHandler
>& Handler
)
1496 mpImpl
->setErrorHandler(Handler
);
1499 void FastSaxParser::setEntityResolver( const uno::Reference
< xml::sax::XEntityResolver
>& )
1504 void FastSaxParser::setLocale( const lang::Locale
& )
1509 void FastSaxParser::setNamespaceHandler( const uno::Reference
< css::xml::sax::XFastNamespaceHandler
>& Handler
)
1511 mpImpl
->setNamespaceHandler(Handler
);
1514 OUString
FastSaxParser::getImplementationName()
1516 return "com.sun.star.comp.extensions.xml.sax.FastParser";
1519 void FastSaxParser::setCustomEntityNames(
1520 const ::css::uno::Sequence
<::css::beans::Pair
<::rtl::OUString
, ::rtl::OUString
>>& replacements
)
1522 mpImpl
->setCustomEntityNames(replacements
);
1525 sal_Bool
FastSaxParser::supportsService( const OUString
& ServiceName
)
1527 return cppu::supportsService(this, ServiceName
);
1530 uno::Sequence
<OUString
> FastSaxParser::getSupportedServiceNames()
1532 return { "com.sun.star.xml.sax.FastParser" };
1535 } // namespace sax_fastparser
1537 extern "C" SAL_DLLPUBLIC_EXPORT
css::uno::XInterface
*
1538 com_sun_star_comp_extensions_xml_sax_FastParser_get_implementation(
1539 css::uno::XComponentContext
*,
1540 css::uno::Sequence
<css::uno::Any
> const &)
1542 return cppu::acquire(new FastSaxParser
);
1545 // ----------------------------------------------------------
1546 // copy of the code in xmloff/source/core/namespace.cxx, which adds namespace aliases
1547 // for various dodgy namespace decls in the wild.
1549 static bool NormalizeW3URI( OUString
& rName
);
1550 static bool NormalizeOasisURN( OUString
& rName
);
1552 static void NormalizeURI( OUString
& rName
)
1554 // try OASIS + W3 URI normalization
1555 bool bSuccess
= NormalizeOasisURN( rName
);
1557 NormalizeW3URI( rName
);
1560 constexpr OUStringLiteral
XML_URI_W3_PREFIX(u
"http://www.w3.org/");
1561 constexpr OUStringLiteral
XML_URI_XFORMS_SUFFIX(u
"/xforms");
1562 constexpr OUStringLiteral
XML_N_XFORMS_1_0(u
"http://www.w3.org/2002/xforms");
1563 constexpr OUStringLiteral
XML_N_SVG(u
"http://www.w3.org/2000/svg");
1564 constexpr OUStringLiteral
XML_N_SVG_COMPAT(u
"urn:oasis:names:tc:opendocument:xmlns:svg-compatible:1.0");
1565 constexpr OUStringLiteral
XML_N_FO(u
"http://www.w3.org/1999/XSL/Format");
1566 constexpr OUStringLiteral
XML_N_FO_COMPAT(u
"urn:oasis:names:tc:opendocument:xmlns:xsl-fo-compatible:1.0");
1567 constexpr OUStringLiteral
XML_N_SMIL(u
"http://www.w3.org/2001/SMIL20/");
1568 constexpr OUStringLiteral
XML_N_SMIL_OLD(u
"http://www.w3.org/2001/SMIL20");
1569 constexpr OUStringLiteral
XML_N_SMIL_COMPAT(u
"urn:oasis:names:tc:opendocument:xmlns:smil-compatible:1.0");
1570 constexpr OUStringLiteral
XML_URN_OASIS_NAMES_TC(u
"urn:oasis:names:tc");
1571 constexpr OUStringLiteral
XML_XMLNS(u
"xmlns");
1572 constexpr OUStringLiteral
XML_OPENDOCUMENT(u
"opendocument");
1573 constexpr OUStringLiteral
XML_1_0(u
"1.0");
1575 static bool NormalizeW3URI( OUString
& rName
)
1577 // check if URI matches:
1578 // http://www.w3.org/[0-9]*/[:letter:]*
1580 // For the following WG/standards names:
1583 bool bSuccess
= false;
1584 const OUString
& sURIPrefix
= XML_URI_W3_PREFIX
;
1585 if( rName
.startsWith( sURIPrefix
) )
1587 const OUString
& sURISuffix
= XML_URI_XFORMS_SUFFIX
;
1588 sal_Int32 nCompareFrom
= rName
.getLength() - sURISuffix
.getLength();
1589 if( rName
.subView( nCompareFrom
) == sURISuffix
)
1591 // found W3 prefix, and xforms suffix
1592 rName
= XML_N_XFORMS_1_0
;
1599 static bool NormalizeOasisURN( OUString
& rName
)
1602 // we exported the wrong namespace for smil, so we correct this here on load
1603 // for older documents
1604 if( rName
== XML_N_SVG
)
1606 rName
= XML_N_SVG_COMPAT
;
1609 else if( rName
== XML_N_FO
)
1611 rName
= XML_N_FO_COMPAT
;
1614 else if( rName
== XML_N_SMIL
|| rName
== XML_N_SMIL_OLD
)
1616 rName
= XML_N_SMIL_COMPAT
;
1621 // Check if URN matches
1622 // :urn:oasis:names:tc:[^:]*:xmlns:[^:]*:1.[^:]*
1623 // |---| |---| |-----|
1624 // TC-Id Sub-Id Version
1626 sal_Int32 nNameLen
= rName
.getLength();
1627 // :urn:oasis:names:tc.*
1628 const OUString
& rOasisURN
= XML_URN_OASIS_NAMES_TC
;
1629 if( !rName
.startsWith( rOasisURN
) )
1632 // :urn:oasis:names:tc:.*
1633 sal_Int32 nPos
= rOasisURN
.getLength();
1634 if( nPos
>= nNameLen
|| rName
[nPos
] != ':' )
1637 // :urn:oasis:names:tc:[^:]:.*
1638 sal_Int32 nTCIdStart
= nPos
+1;
1639 sal_Int32 nTCIdEnd
= rName
.indexOf( ':', nTCIdStart
);
1640 if( -1 == nTCIdEnd
)
1643 // :urn:oasis:names:tc:[^:]:xmlns.*
1644 nPos
= nTCIdEnd
+ 1;
1645 std::u16string_view
sTmp( rName
.subView( nPos
) );
1646 const OUString
& rXMLNS
= XML_XMLNS
;
1647 if( !o3tl::starts_with(sTmp
, rXMLNS
) )
1650 // :urn:oasis:names:tc:[^:]:xmlns:.*
1651 nPos
+= rXMLNS
.getLength();
1652 if( nPos
>= nNameLen
|| rName
[nPos
] != ':' )
1655 // :urn:oasis:names:tc:[^:]:xmlns:[^:]*:.*
1656 nPos
= rName
.indexOf( ':', nPos
+1 );
1660 // :urn:oasis:names:tc:[^:]:xmlns:[^:]*:[^:][^:][^:][^:]*
1661 sal_Int32 nVersionStart
= nPos
+1;
1662 if( nVersionStart
+2 >= nNameLen
||
1663 -1 != rName
.indexOf( ':', nVersionStart
) )
1666 // :urn:oasis:names:tc:[^:]:xmlns:[^:]*:1\.[^:][^:]*
1667 if( rName
[nVersionStart
] != '1' || rName
[nVersionStart
+1] != '.' )
1670 // replace [tcid] with current TCID and version with current version.
1672 rName
= rName
.subView( 0, nTCIdStart
) +
1674 rName
.subView( nTCIdEnd
, nVersionStart
-nTCIdEnd
) +
1681 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */