1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
3 * This file is part of the LibreOffice project.
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 * This file incorporates work covered by the following license notice:
11 * Licensed to the Apache Software Foundation (ASF) under one or more
12 * contributor license agreements. See the NOTICE file distributed
13 * with this work for additional information regarding copyright
14 * ownership. The ASF licenses this file to you under the Apache
15 * License, Version 2.0 (the "License"); you may not use this file
16 * except in compliance with the License. You may obtain a copy of
17 * the License at http://www.apache.org/licenses/LICENSE-2.0 .
20 #include <sax/fastparser.hxx>
21 #include <sax/fastattribs.hxx>
23 #include <xml2utf.hxx>
25 #include <com/sun/star/io/XSeekable.hpp>
26 #include <com/sun/star/lang/DisposedException.hpp>
27 #include <com/sun/star/lang/IllegalArgumentException.hpp>
28 #include <com/sun/star/uno/XComponentContext.hpp>
29 #include <com/sun/star/xml/sax/FastToken.hpp>
30 #include <com/sun/star/xml/sax/SAXParseException.hpp>
31 #include <com/sun/star/xml/sax/XFastContextHandler.hpp>
32 #include <cppuhelper/implbase.hxx>
33 #include <cppuhelper/supportsservice.hxx>
34 #include <cppuhelper/exc_hlp.hxx>
35 #include <osl/conditn.hxx>
36 #include <rtl/ref.hxx>
37 #include <sal/log.hxx>
38 #include <salhelper/thread.hxx>
39 #include <comphelper/diagnose_ex.hxx>
40 #include <o3tl/string_view.hxx>
47 #include <string_view>
48 #include <unordered_map>
52 #include <libxml/parser.h>
54 // Inverse of libxml's BAD_CAST.
55 #define XML_CAST( str ) reinterpret_cast< const char* >( str )
57 using namespace ::osl
;
58 using namespace ::cppu
;
59 using namespace ::com::sun::star::uno
;
60 using namespace ::com::sun::star::lang
;
61 using namespace ::com::sun::star::xml::sax
;
62 using namespace ::com::sun::star::io
;
63 using namespace com::sun::star
;
64 using namespace sax_fastparser
;
66 static void NormalizeURI( OUString
& rName
);
71 class FastLocatorImpl
;
72 struct NamespaceDefine
;
75 typedef std::unordered_map
< OUString
, sal_Int32
> NamespaceMap
;
79 std::vector
<Event
> maEvents
;
80 bool mbIsAttributesEmpty
;
/// Tag identifying what an Event represents; Entity::getEvent() stores it
/// in the event's maType member.
enum class CallbackType
{
    START_ELEMENT,
    END_ELEMENT,
    CHARACTERS,
    PROCESSING_INSTRUCTION,
    DONE,
    EXCEPTION
};
88 sal_Int32 mnElementToken
;
90 OUString msElementName
;
91 rtl::Reference
< FastAttributeList
> mxAttributes
;
92 rtl::Reference
< FastAttributeList
> mxDeclAttributes
;
101 NameWithToken(OUString sName
, sal_Int32 nToken
) :
102 msName(std::move(sName
)), mnToken(nToken
) {}
107 Reference
< XFastContextHandler
> mxContext
;
108 sal_Int32 mnElementToken
;
109 std::optional
<OUString
> moNamespace
;
110 std::optional
<OUString
> moElementName
;
112 SaxContext( sal_Int32 nElementToken
, const OUString
& aNamespace
, const OUString
& aElementName
):
113 mnElementToken(nElementToken
)
115 if (nElementToken
== FastToken::DONTKNOW
)
117 moNamespace
= aNamespace
;
118 moElementName
= aElementName
;
125 css::uno::Reference
< css::xml::sax::XFastDocumentHandler
> mxDocumentHandler
;
126 rtl::Reference
<FastTokenHandlerBase
> mxTokenHandler
;
127 css::uno::Reference
< css::xml::sax::XErrorHandler
> mxErrorHandler
;
128 css::uno::Reference
< css::xml::sax::XFastNamespaceHandler
>mxNamespaceHandler
;
// One namespace declaration: a prefix, the token assigned to the namespace
// and the namespace URL (see the member-initializer list below).
133 struct NamespaceDefine
137 OUString maNamespaceURL
;
// Arguments are taken by value and moved into the members.
139 NamespaceDefine( OString aPrefix
, sal_Int32 nToken
, OUString aNamespaceURL
)
140 : maPrefix(std::move( aPrefix
)), mnToken( nToken
), maNamespaceURL(std::move( aNamespaceURL
)) {}
// Default construction marks the token as -1.
141 NamespaceDefine() : mnToken(-1) {}
144 // Entity binds all information needed for a single file | single call of parseStream
145 struct Entity
: public ParserData
147 // Amount of work producer sends to consumer in one iteration:
148 static const size_t mnEventListSize
= 1000;
150 // unique for each Entity instance:
152 // Number of valid events in mxProducedEvents:
153 size_t mnProducedEventsSize
;
// List currently being filled by the producer (see getEventList()).
154 std::optional
<EventList
> mxProducedEvents
;
// Lists handed over by produce() and taken by the loop in parseStream().
155 std::queue
<EventList
> maPendingEvents
;
// Consumed lists, kept so getEventList() can reuse them.
156 std::queue
<EventList
> maUsedEvents
;
// Guards maPendingEvents and maUsedEvents.
157 std::mutex maEventProtector
;
// Queue-depth thresholds: parseStream() signals maProduceResume when at
// most mnEventLowWater lists are pending; produce() waits while at least
// mnEventHighWater lists are pending.
159 static const size_t mnEventLowWater
= 4;
160 static const size_t mnEventHighWater
= 8;
// parseStream() waits on maConsumeResume; produce() waits on maProduceResume.
161 osl::Condition maConsumeResume
;
162 osl::Condition maProduceResume
;
163 // Event we use to store data if threading is disabled:
166 // copied in copy constructor:
168 // Allow to disable threading for small documents:
169 bool mbEnableThreads
;
170 css::xml::sax::InputSource maStructSource
;
// libxml2 parser context; freed by ParserCleanup.
171 xmlParserCtxtPtr mpParser
;
172 ::sax_expatwrap::XMLFile2UTFConverter maConverter
;
174 // Exceptions cannot be thrown through the C-XmlParser (possible
175 // resource leaks), therefore any exception thrown by a UNO callback
176 // must be saved somewhere until the C-XmlParser is stopped.
177 css::uno::Any maSavedException
;
178 std::mutex maSavedExceptionMutex
;
179 void saveException( const Any
& e
);
180 // Thread-safe check if maSavedException has value
182 void throwException( const ::rtl::Reference
< FastLocatorImpl
> &xDocumentLocator
,
183 bool mbDuringParse
);
185 std::stack
< NameWithToken
, std::vector
<NameWithToken
> > maNamespaceStack
;
186 /* Context for main thread consuming events.
187 * startElement() stores the data, which characters() and endElement() uses
189 std::stack
< SaxContext
, std::vector
<SaxContext
> > maContextStack
;
190 // Determines which elements of maNamespaceDefines are valid in current context
191 std::stack
< sal_uInt32
, std::vector
<sal_uInt32
> > maNamespaceCount
;
192 std::vector
< NamespaceDefine
> maNamespaceDefines
;
194 explicit Entity( const ParserData
& rData
);
// Copying is disabled.
195 Entity( const Entity
& rEntity
) = delete;
196 Entity
& operator=( const Entity
& rEntity
) = delete;
// Handlers that dispatch one event to the context/document handlers.
197 void startElement( Event
const *pEvent
);
198 void characters( const OUString
& sChars
);
200 void processingInstruction( const OUString
& rTarget
, const OUString
& rData
);
201 EventList
& getEventList();
202 Event
& getEvent( CallbackType aType
);
205 // Stuff for custom entity names
// Pairs an entity name with its replacement text; the operator< overloads
// that follow order pairs by the name member.
206 struct ReplacementPair
209 OUString replacement
;
211 inline bool operator<(const ReplacementPair
& lhs
, const ReplacementPair
& rhs
)
213 return lhs
.name
< rhs
.name
;
215 inline bool operator<(const ReplacementPair
& lhs
, const char* rhs
)
217 return lhs
.name
.compareToAscii(rhs
) < 0;
222 namespace sax_fastparser
{
// Implementation class of the fast SAX parser: keeps the configuration for
// the next parseStream() call (maData), the registered namespace tokens
// (maNamespaceMap) and a stack of Entity objects, one per parseStream() call.
224 class FastSaxParserImpl
227 explicit FastSaxParserImpl();
228 ~FastSaxParserImpl();
// Custom entity name/replacement pairs set by setCustomEntityNames().
231 std::vector
<ReplacementPair
> m_Replacements
;
// Entities released in the destructor.
232 std::vector
<xmlEntityPtr
> m_TemporalEntities
;
236 /// @throws css::xml::sax::SAXException
237 /// @throws css::io::IOException
238 /// @throws css::uno::RuntimeException
239 void parseStream( const css::xml::sax::InputSource
& aInputSource
);
240 /// @throws css::uno::RuntimeException
241 void setFastDocumentHandler( const css::uno::Reference
< css::xml::sax::XFastDocumentHandler
>& Handler
);
242 /// @throws css::uno::RuntimeException
243 void setTokenHandler( const css::uno::Reference
< css::xml::sax::XFastTokenHandler
>& Handler
);
244 /// @throws css::lang::IllegalArgumentException
245 /// @throws css::uno::RuntimeException
246 void registerNamespace( const OUString
& NamespaceURL
, sal_Int32 NamespaceToken
);
247 /// @throws css::lang::IllegalArgumentException
248 /// @throws css::uno::RuntimeException
249 OUString
const & getNamespaceURL( std::u16string_view rPrefix
);
250 /// @throws css::uno::RuntimeException
251 void setErrorHandler( const css::uno::Reference
< css::xml::sax::XErrorHandler
>& Handler
);
252 /// @throws css::uno::RuntimeException
253 void setNamespaceHandler( const css::uno::Reference
< css::xml::sax::XFastNamespaceHandler
>& Handler
);
255 void setCustomEntityNames(
256 const ::css::uno::Sequence
<::css::beans::Pair
<::rtl::OUString
, ::rtl::OUString
>>& replacements
);
258 // called by the C callbacks of the libxml2 parser (the call_callback* functions)
259 void callbackStartElement( const xmlChar
*localName
, const xmlChar
* prefix
, const xmlChar
* URI
,
260 int numNamespaces
, const xmlChar
** namespaces
, int numAttributes
, const xmlChar
**attributes
);
261 void callbackEndElement();
262 void callbackCharacters( const xmlChar
* s
, int nLen
);
263 void callbackProcessingInstruction( const xmlChar
*target
, const xmlChar
*data
);
264 xmlEntityPtr
callbackGetEntity( const xmlChar
*name
);
266 void pushEntity(const ParserData
&, xml::sax::InputSource
const&);
// Returns the cached top of the entity stack; mpTop must be non-null.
268 Entity
& getEntity() { return *mpTop
; }
270 void produce( bool bForceFlush
= false );
271 bool m_bIgnoreMissingNSDecl
;
272 bool m_bDisableThreadedParser
;
275 bool consume(EventList
&);
276 void deleteUsedEvents();
277 void sendPendingCharacters();
278 void addUnknownElementWithPrefix(const xmlChar
**attributes
, int i
, rtl::Reference
< FastAttributeList
> const & xAttributes
);
280 sal_Int32
GetToken( const xmlChar
* pName
);
281 /// @throws css::xml::sax::SAXException
282 sal_Int32
GetTokenWithPrefix( const xmlChar
* pPrefix
, const xmlChar
* pName
);
283 /// @throws css::xml::sax::SAXException
284 OUString
const & GetNamespaceURL( std::string_view rPrefix
);
285 sal_Int32
GetNamespaceToken( const OUString
& rNamespaceURL
);
286 sal_Int32
GetTokenWithContextNamespace( sal_Int32 nNamespaceToken
, const xmlChar
* pName
);
287 void DefineNamespace( const OString
& rPrefix
, const OUString
& namespaceURL
);
290 std::mutex maMutex
; ///< Protecting whole parseStream() execution
291 ::rtl::Reference
< FastLocatorImpl
> mxDocumentLocator
;
// Namespace URL -> token, filled by registerNamespace().
292 NamespaceMap maNamespaceMap
;
294 ParserData maData
; /// Cached parser configuration for next call of parseStream().
296 Entity
*mpTop
; /// std::stack::top() is amazingly slow => cache this.
297 std::stack
< Entity
> maEntities
; /// Entity stack for each call of parseStream().
298 std::vector
<char> pendingCharacters
; /// Data from characters() callback that needs to be sent.
301 } // namespace sax_fastparser
// salhelper::Thread subclass named "Parser" that keeps a raw pointer to the
// FastSaxParserImpl it works for.
305 class ParserThread
: public salhelper::Thread
307 FastSaxParserImpl
*mpParser
;
309 explicit ParserThread(FastSaxParserImpl
*pParser
): Thread("Parser"), mpParser(pParser
) {}
// NOTE(review): the lines between this signature and the statements below
// are not visible in this excerpt. The visible statements request an
// EXCEPTION event from the current entity and then force-flush the
// produced events.
311 virtual void execute() override
319 Entity
&rEntity
= mpParser
->getEntity();
320 rEntity
.getEvent( CallbackType::EXCEPTION
);
321 mpParser
->produce( true );
328 static void call_callbackStartElement(void *userData
, const xmlChar
*localName
, const xmlChar
* prefix
, const xmlChar
* URI
,
329 int numNamespaces
, const xmlChar
** namespaces
, int numAttributes
, int /*defaultedAttributes*/, const xmlChar
**attributes
)
331 FastSaxParserImpl
* pFastParser
= static_cast<FastSaxParserImpl
*>( userData
);
332 pFastParser
->callbackStartElement( localName
, prefix
, URI
, numNamespaces
, namespaces
, numAttributes
, attributes
);
335 static void call_callbackEndElement(void *userData
, const xmlChar
* /*localName*/, const xmlChar
* /*prefix*/, const xmlChar
* /*URI*/)
337 FastSaxParserImpl
* pFastParser
= static_cast<FastSaxParserImpl
*>( userData
);
338 pFastParser
->callbackEndElement();
341 static void call_callbackCharacters( void *userData
, const xmlChar
*s
, int nLen
)
343 FastSaxParserImpl
* pFastParser
= static_cast<FastSaxParserImpl
*>( userData
);
344 pFastParser
->callbackCharacters( s
, nLen
);
347 static void call_callbackProcessingInstruction( void *userData
, const xmlChar
*target
, const xmlChar
*data
)
349 FastSaxParserImpl
* pFastParser
= static_cast<FastSaxParserImpl
*>( userData
);
350 pFastParser
->callbackProcessingInstruction( target
, data
);
353 static xmlEntityPtr
call_callbackGetEntity( void *userData
, const xmlChar
*name
)
355 FastSaxParserImpl
* pFastParser
= static_cast<FastSaxParserImpl
*>( userData
);
356 return pFastParser
->callbackGetEntity( name
);
// XLocator implementation that answers position queries from the parser's
// current entity. It holds a raw pointer to the parser, which dispose()
// clears.
361 class FastLocatorImpl
: public WeakImplHelper
< XLocator
>
364 explicit FastLocatorImpl(FastSaxParserImpl
*p
) : mpParser(p
) {}
// Detaches the locator from the parser; called from the parser's destructor.
366 void dispose() { mpParser
= nullptr; }
367 /// @throws RuntimeException
// Throws DisposedException once dispose() has cleared mpParser.
368 void checkDispose() const { if( !mpParser
) throw DisposedException(); }
371 virtual sal_Int32 SAL_CALL
getColumnNumber() override
;
372 virtual sal_Int32 SAL_CALL
getLineNumber() override
;
373 virtual OUString SAL_CALL
getPublicId() override
;
374 virtual OUString SAL_CALL
getSystemId() override
;
377 FastSaxParserImpl
*mpParser
;
// Returns the column that xmlSAX2GetColumnNumber() reports for the libxml2
// context of the parser's current entity.
// NOTE(review): the lines between the signature and the return are not
// visible in this excerpt.
380 sal_Int32 SAL_CALL
FastLocatorImpl::getColumnNumber()
383 return xmlSAX2GetColumnNumber( mpParser
->getEntity().mpParser
);
// Returns the line that xmlSAX2GetLineNumber() reports for the libxml2
// context of the parser's current entity.
// NOTE(review): the lines between the signature and the return are not
// visible in this excerpt.
386 sal_Int32 SAL_CALL
FastLocatorImpl::getLineNumber()
389 return xmlSAX2GetLineNumber( mpParser
->getEntity().mpParser
);
// Returns the public id of the input source of the parser's current entity.
// NOTE(review): the lines between the signature and the return are not
// visible in this excerpt.
392 OUString SAL_CALL
FastLocatorImpl::getPublicId()
395 return mpParser
->getEntity().maStructSource
.sPublicId
;
// Returns the system id of the input source of the parser's current entity.
// NOTE(review): the lines between the signature and the return are not
// visible in this excerpt.
398 OUString SAL_CALL
FastLocatorImpl::getSystemId()
401 return mpParser
->getEntity().maStructSource
.sSystemId
;
404 ParserData::ParserData()
// Constructs an entity from parser configuration data. The visible
// initializers start with no produced events and with threading disabled;
// further initializers are not visible in this excerpt.
407 Entity::Entity(const ParserData
& rData
)
409 , mnProducedEventsSize(0)
410 , mbEnableThreads(false)
// Dispatches a start-element event: pushes a SaxContext for the element,
// reports namespace declarations to the namespace handler, asks the parent
// context (or the document handler) for a child context and starts the
// element on it. Exceptions are recorded via saveException().
// NOTE(review): several structural lines (braces, else/try/catch) are not
// visible in this excerpt; the comments below describe only what is shown.
415 void Entity::startElement( Event
const *pEvent
)
417 const sal_Int32
& nElementToken
= pEvent
->mnElementToken
;
418 const OUString
& aNamespace
= pEvent
->msNamespace
;
419 const OUString
& aElementName
= pEvent
->msElementName
;
421 // Use un-wrapped pointers to avoid significant acquire/release overhead
422 XFastContextHandler
*pParentContext
= nullptr;
423 if( !maContextStack
.empty() )
// The parent is the context stored for the enclosing element.
425 pParentContext
= maContextStack
.top().mxContext
.get();
426 if( !pParentContext
)
428 maContextStack
.push( SaxContext(nElementToken
, aNamespace
, aElementName
) );
433 maContextStack
.push( SaxContext( nElementToken
, aNamespace
, aElementName
) );
437 const Reference
< XFastAttributeList
> xAttr( pEvent
->mxAttributes
);
438 Reference
< XFastContextHandler
> xContext
;
// Forward every namespace declaration attribute to the namespace handler.
440 if ( mxNamespaceHandler
.is() )
442 const Sequence
< xml::Attribute
> NSDeclAttribs
= pEvent
->mxDeclAttributes
->getUnknownAttributes();
443 for (const auto& rNSDeclAttrib
: NSDeclAttribs
)
445 mxNamespaceHandler
->registerNamespace( rNSDeclAttrib
.Name
, rNSDeclAttrib
.Value
);
// Elements without a known token go through the "unknown" handler API.
449 if( nElementToken
== FastToken::DONTKNOW
)
452 xContext
= pParentContext
->createUnknownChildContext( aNamespace
, aElementName
, xAttr
);
453 else if( mxDocumentHandler
.is() )
454 xContext
= mxDocumentHandler
->createUnknownChildContext( aNamespace
, aElementName
, xAttr
);
458 xContext
->startUnknownElement( aNamespace
, aElementName
, xAttr
);
// Elements with a known token use the token-based handler API.
464 xContext
= pParentContext
->createFastChildContext( nElementToken
, xAttr
);
465 else if( mxDocumentHandler
.is() )
466 xContext
= mxDocumentHandler
->createFastChildContext( nElementToken
, xAttr
);
469 xContext
->startFastElement( nElementToken
, xAttr
);
471 // swap the reference we own in to avoid referencing thrash.
472 maContextStack
.top().mxContext
= std::move( xContext
);
// Record the caught exception so it can be rethrown later.
476 saveException( ::cppu::getCaughtException() );
// Delivers character data to the context on top of the context stack; an
// empty stack is treated as malformed input. Exceptions are recorded via
// saveException().
// NOTE(review): some structural lines are not visible in this excerpt.
480 void Entity::characters( const OUString
& sChars
)
482 if (maContextStack
.empty())
484 // Malformed XML stream !?
488 XFastContextHandler
* pContext( maContextStack
.top().mxContext
.get() );
491 pContext
->characters( sChars
);
495 saveException( ::cppu::getCaughtException() );
// Ends the element on top of the context stack: calls endFastElement() for
// a known token or endUnknownElement() with the stored namespace and name,
// then pops the stack. Exceptions are recorded via saveException().
// NOTE(review): some structural lines are not visible in this excerpt.
499 void Entity::endElement()
501 if (maContextStack
.empty())
503 // Malformed XML stream !?
507 const SaxContext
& aContext
= maContextStack
.top();
508 XFastContextHandler
* pContext( aContext
.mxContext
.get() );
512 sal_Int32 nElementToken
= aContext
.mnElementToken
;
513 if( nElementToken
!= FastToken::DONTKNOW
)
514 pContext
->endFastElement( nElementToken
);
// SaxContext fills moNamespace/moElementName when the token is DONTKNOW.
516 pContext
->endUnknownElement( *aContext
.moNamespace
, *aContext
.moElementName
);
520 saveException( ::cppu::getCaughtException() );
522 maContextStack
.pop();
// Forwards a processing instruction to the document handler, if one is
// set. Exceptions are recorded via saveException().
525 void Entity::processingInstruction( const OUString
& rTarget
, const OUString
& rData
)
527 if( mxDocumentHandler
.is() ) try
529 mxDocumentHandler
->processingInstruction( rTarget
, rData
);
533 saveException( ::cppu::getCaughtException() );
// Returns the event list that is currently being filled. If none exists, a
// previously used list is taken from maUsedEvents under maEventProtector;
// failing that, a new list of mnEventListSize events is created.
// NOTE(review): some lines of this function are not visible in this excerpt.
537 EventList
& Entity::getEventList()
539 if (!mxProducedEvents
)
541 std::unique_lock
aGuard(maEventProtector
);
542 if (!maUsedEvents
.empty())
544 mxProducedEvents
= std::move(maUsedEvents
.front());
546 aGuard
.unlock(); // unlock
547 mnProducedEventsSize
= 0;
// No used list was available: allocate a fresh one.
549 if (!mxProducedEvents
)
551 mxProducedEvents
.emplace();
552 mxProducedEvents
->maEvents
.resize(mnEventListSize
);
553 mxProducedEvents
->mbIsAttributesEmpty
= false;
554 mnProducedEventsSize
= 0;
557 return *mxProducedEvents
;
// Returns the event to fill for the given callback type. Without threads
// the single shared event is returned; otherwise the next slot of the
// current event list is claimed and tagged with aType.
// NOTE(review): the final lines of this function are not visible in this
// excerpt.
560 Event
& Entity::getEvent( CallbackType aType
)
562 if (!mbEnableThreads
)
563 return maSharedEvent
;
565 EventList
& rEventList
= getEventList();
// The list is full: grow it by one slot and warn unless an exception is pending.
566 if (mnProducedEventsSize
== rEventList
.maEvents
.size())
568 SAL_WARN_IF(!maSavedException
.hasValue(), "sax",
569 "Event vector should only exceed " << mnEventListSize
<<
570 " temporarily while an exception is pending");
571 rEventList
.maEvents
.resize(mnProducedEventsSize
+ 1);
573 Event
& rEvent
= rEventList
.maEvents
[mnProducedEventsSize
++];
574 rEvent
.maType
= aType
;
// Builds an error text of the form "[<systemId> line <nLine>]: <message>".
// The message is taken from the last libxml2 error of ctxt when one with a
// message exists; "unknown error" is the alternative (presumably on an
// else branch that is not visible in this excerpt).
// The message bytes are converted as RTL_TEXTENCODING_ASCII_US.
578 OUString
lclGetErrorMessage( xmlParserCtxtPtr ctxt
, std::u16string_view sSystemId
, sal_Int32 nLine
)
580 const char* pMessage
;
581 const xmlError
* error
= xmlCtxtGetLastError( ctxt
);
582 if( error
&& error
->message
)
583 pMessage
= error
->message
;
585 pMessage
= "unknown error";
586 return OUString::Concat("[") + sSystemId
+ " line " + OUString::number(nLine
) + "]: " +
587 OUString(pMessage
, strlen(pMessage
), RTL_TEXTENCODING_ASCII_US
);
590 // throw an exception, but avoid callback if
591 // during a threaded produce
// Builds a SAXParseException from the locator's position data and the
// last parser error message, wrapping any saved exception. The error
// handler's fatalError() is called only when not parsing, or when threads
// are disabled.
// NOTE(review): several lines (including the final throw) are not visible
// in this excerpt.
592 void Entity::throwException( const ::rtl::Reference
< FastLocatorImpl
> &xDocumentLocator
,
595 // Error during parsing !
// Read maSavedException under its mutex.
598 std::scoped_lock
g(maSavedExceptionMutex
);
599 if (maSavedException
.hasValue())
601 savedException
.setValue(&maSavedException
, cppu::UnoType
<decltype(maSavedException
)>::get());
604 SAXParseException
aExcept(
605 lclGetErrorMessage( mpParser
,
606 xDocumentLocator
->getSystemId(),
607 xDocumentLocator
->getLineNumber() ),
608 Reference
< XInterface
>(),
610 xDocumentLocator
->getPublicId(),
611 xDocumentLocator
->getSystemId(),
612 xDocumentLocator
->getLineNumber(),
613 xDocumentLocator
->getColumnNumber()
616 // error handler is set, it may throw the exception
617 if( !mbDuringParse
|| !mbEnableThreads
)
619 if (mxErrorHandler
.is() )
620 mxErrorHandler
->fatalError( Any( aExcept
) );
623 // error handler has not thrown, but parsing must stop => throw ourselves
627 // In the single threaded case we emit events via our C
628 // callbacks, so any exception caught must be queued up until
629 // we can safely re-throw it from our C++ parent of parse()
631 // If multi-threaded, we need to push an EXCEPTION event, at
632 // which point we transfer ownership of maSavedException to
633 // the consuming thread.
// Logs the exception and stores it in maSavedException under
// maSavedExceptionMutex. When an exception is already stored, an info
// message about discarding is logged.
// NOTE(review): the branch structure around the assignment is not visible
// in this excerpt.
634 void Entity::saveException( const Any
& e
)
636 // fdo#81214 - allow the parser to run on after an exception,
637 // unexpectedly some 'startElements' produce a UNO_QUERY_THROW
638 // for XComponent; and yet expect to continue parsing.
639 SAL_WARN("sax", "Unexpected exception from XML parser " << exceptionToString(e
));
640 std::scoped_lock
g(maSavedExceptionMutex
);
641 if (maSavedException
.hasValue())
643 SAL_INFO("sax.fastparser", "discarding exception, already have one");
647 maSavedException
= e
;
651 bool Entity::hasException()
653 std::scoped_lock
g(maSavedExceptionMutex
);
654 return maSavedException
.hasValue();
659 namespace sax_fastparser
{
// Starts with missing-namespace errors enabled and the threaded parser
// allowed, and creates the document locator bound to this parser.
// NOTE(review): further initializers are not visible in this excerpt.
661 FastSaxParserImpl::FastSaxParserImpl() :
662 m_bIgnoreMissingNSDecl(false),
663 m_bDisableThreadedParser(false),
666 mxDocumentLocator
.set( new FastLocatorImpl( this ) );
// Detaches the document locator from this parser, then walks the entities
// collected in m_TemporalEntities.
// NOTE(review): the statements that release each entity are not visible in
// this excerpt.
669 FastSaxParserImpl::~FastSaxParserImpl()
671 if( mxDocumentLocator
.is() )
672 mxDocumentLocator
->dispose();
673 for (auto& entity
: m_TemporalEntities
)
677 xmlNodePtr pPtr
= reinterpret_cast<xmlNodePtr
>(entity
);
683 void FastSaxParserImpl::DefineNamespace( const OString
& rPrefix
, const OUString
& namespaceURL
)
685 Entity
& rEntity
= getEntity();
686 assert(!rEntity
.maNamespaceCount
.empty()); // need a context!
688 sal_uInt32 nOffset
= rEntity
.maNamespaceCount
.top()++;
689 if( rEntity
.maNamespaceDefines
.size() <= nOffset
)
690 rEntity
.maNamespaceDefines
.resize( rEntity
.maNamespaceDefines
.size() + 64 );
692 rEntity
.maNamespaceDefines
[nOffset
] = NamespaceDefine( rPrefix
, GetNamespaceToken( namespaceURL
), namespaceURL
);
695 sal_Int32
FastSaxParserImpl::GetToken(const xmlChar
* pName
)
697 return FastTokenHandlerBase::getTokenFromChars( getEntity(). mxTokenHandler
.get(),
698 XML_CAST( pName
) ); // uses utf-8
701 sal_Int32
FastSaxParserImpl::GetTokenWithPrefix( const xmlChar
* pPrefix
, const xmlChar
* pName
)
703 Entity
& rEntity
= getEntity();
704 if (rEntity
.maNamespaceCount
.empty())
705 return FastToken::DONTKNOW
;
707 std::string_view
sPrefix(XML_CAST(pPrefix
));
708 sal_uInt32 nNamespace
= rEntity
.maNamespaceCount
.top();
709 while( nNamespace
-- )
711 const auto & rNamespaceDefine
= rEntity
.maNamespaceDefines
[nNamespace
];
712 if( rNamespaceDefine
.maPrefix
== sPrefix
)
713 return GetTokenWithContextNamespace(rNamespaceDefine
.mnToken
, pName
);
716 if (!m_bIgnoreMissingNSDecl
)
717 throw SAXException("No namespace defined for " + OStringToOUString(sPrefix
,
718 RTL_TEXTENCODING_UTF8
), {}, {});
720 return FastToken::DONTKNOW
;
723 sal_Int32
FastSaxParserImpl::GetNamespaceToken( const OUString
& rNamespaceURL
)
725 NamespaceMap::iterator
aIter( maNamespaceMap
.find( rNamespaceURL
) );
726 if( aIter
!= maNamespaceMap
.end() )
727 return (*aIter
).second
;
729 return FastToken::DONTKNOW
;
732 OUString
const & FastSaxParserImpl::GetNamespaceURL( std::string_view rPrefix
)
734 Entity
& rEntity
= getEntity();
735 if( !rEntity
.maNamespaceCount
.empty() )
737 sal_uInt32 nNamespace
= rEntity
.maNamespaceCount
.top();
738 while( nNamespace
-- )
739 if( rEntity
.maNamespaceDefines
[nNamespace
].maPrefix
== rPrefix
)
740 return rEntity
.maNamespaceDefines
[nNamespace
].maNamespaceURL
;
743 throw SAXException("No namespace defined for " + OUString::fromUtf8(rPrefix
),
744 Reference
< XInterface
>(), Any());
747 sal_Int32
FastSaxParserImpl::GetTokenWithContextNamespace( sal_Int32 nNamespaceToken
, const xmlChar
* pName
)
749 if( nNamespaceToken
!= FastToken::DONTKNOW
)
751 sal_Int32 nNameToken
= GetToken( pName
);
752 if( nNameToken
!= FastToken::DONTKNOW
)
753 return nNamespaceToken
| nNameToken
;
756 return FastToken::DONTKNOW
;
// Cleanup helper used by parseStream(): frees the entity's libxml2 document
// and parser context, pops the entity from the parser and manages the
// parser thread reference.
// NOTE(review): the class header and several member lines are not visible
// in this excerpt.
764 FastSaxParserImpl
& m_rParser
;
766 rtl::Reference
<ParserThread
> m_xParser
;
768 ParserCleanup(FastSaxParserImpl
& rParser
, Entity
& rEntity
)
// Release the libxml2 document (if any) before the parser context.
775 if (m_rEntity
.mpParser
)
777 if (m_rEntity
.mpParser
->myDoc
)
778 xmlFreeDoc(m_rEntity
.mpParser
->myDoc
);
779 xmlFreeParserCtxt(m_rEntity
.mpParser
);
782 m_rParser
.popEntity();
784 void setThread(const rtl::Reference
<ParserThread
> &xParser
)
792 rtl::Reference
<ParserThread
> xToJoin
= m_xParser
;
801 * parseStream does Parser-startup initializations. The FastSaxParser::parse() method does
802 * the file-specific initialization work. (During a parser run, external files may be opened)
// Parses one input source. Holds maMutex for the whole call, pushes a new
// Entity, notifies the document handler, and either consumes events produced
// by a ParserThread or parses without threads, depending on input size.
// NOTE(review): several lines (braces, else branches, the non-threaded
// path) are not visible in this excerpt.
805 void FastSaxParserImpl::parseStream(const InputSource
& rStructSource
)
809 // Only one text at one time
810 std::unique_lock
guard( maMutex
);
812 pushEntity(maData
, rStructSource
);
813 Entity
& rEntity
= getEntity();
// Frees the parser context and pops the entity when this scope is left.
814 ParserCleanup
aEnsureFree(*this, rEntity
);
816 // start the document
817 if( rEntity
.mxDocumentHandler
.is() )
819 rEntity
.mxDocumentHandler
->setDocumentLocator( mxDocumentLocator
);
820 rEntity
.mxDocumentHandler
->startDocument();
824 rEntity
.mbEnableThreads
= false;
// Threads are used only when neither the SAX_DISABLE_THREADS environment
// variable nor m_bDisableThreadedParser is set, and the input is larger
// than 10000 bytes.
826 if (!getenv("SAX_DISABLE_THREADS") && !m_bDisableThreadedParser
)
828 Reference
<css::io::XSeekable
> xSeekable(rEntity
.maStructSource
.aInputStream
, UNO_QUERY
);
829 // available() is not __really__ relevant here, but leave it in as a heuristic for non-seekable streams
830 rEntity
.mbEnableThreads
= (xSeekable
.is() && xSeekable
->getLength() > 10000)
831 || (rEntity
.maStructSource
.aInputStream
->available() > 10000);
835 if (rEntity
.mbEnableThreads
)
837 rtl::Reference
<ParserThread
> xParser
= new ParserThread(this);
839 aEnsureFree
.setThread(xParser
);
// Consumer side: wait until the producer signals that events are available.
842 rEntity
.maConsumeResume
.wait();
843 rEntity
.maConsumeResume
.reset();
845 std::unique_lock
aGuard(rEntity
.maEventProtector
);
846 while (!rEntity
.maPendingEvents
.empty())
848 if (rEntity
.maPendingEvents
.size() <= Entity::mnEventLowWater
)
849 rEntity
.maProduceResume
.set(); // start producer again
// Take one list out of the queue and process it without holding the lock.
851 EventList aEventList
= std::move(rEntity
.maPendingEvents
.front());
852 rEntity
.maPendingEvents
.pop();
853 aGuard
.unlock(); // unlock
855 if (!consume(aEventList
))
858 aGuard
.lock(); // lock
860 if ( rEntity
.maPendingEvents
.size() <= Entity::mnEventLowWater
)
// Clear the attribute lists of the consumed events.
863 for (auto& rEvent
: aEventList
.maEvents
)
865 if (rEvent
.mxAttributes
.is())
867 rEvent
.mxAttributes
->clear();
868 if( rEntity
.mxNamespaceHandler
.is() )
869 rEvent
.mxDeclAttributes
->clear();
871 aEventList
.mbIsAttributesEmpty
= true;
// Hand the list back so Entity::getEventList() can reuse it.
876 rEntity
.maUsedEvents
.push(std::move(aEventList
));
879 aEnsureFree
.joinThread();
882 // callbacks used inside XML_Parse may have caught an exception
883 // No need to lock maSavedExceptionMutex here because parser
885 if( rEntity
.maSavedException
.hasValue() )
886 rEntity
.throwException( mxDocumentLocator
, true );
894 if( rEntity
.mxDocumentHandler
.is() )
896 rEntity
.mxDocumentHandler
->endDocument();
900 void FastSaxParserImpl::setFastDocumentHandler( const Reference
< XFastDocumentHandler
>& Handler
)
902 maData
.mxDocumentHandler
= Handler
;
905 void FastSaxParserImpl::setTokenHandler( const Reference
< XFastTokenHandler
>& xHandler
)
907 assert( dynamic_cast< FastTokenHandlerBase
*>( xHandler
.get() ) && "we expect this handler to be a subclass of FastTokenHandlerBase" );
908 maData
.mxTokenHandler
= dynamic_cast< FastTokenHandlerBase
*>( xHandler
.get() );
// Registers a token for a namespace URL in maNamespaceMap.
// Throws IllegalArgumentException when the token is below
// FastToken::NAMESPACE, and also with the message "namespace URL is already
// registered" (presumably on an else branch that is not visible in this
// excerpt).
911 void FastSaxParserImpl::registerNamespace( const OUString
& NamespaceURL
, sal_Int32 NamespaceToken
)
913 if( NamespaceToken
< FastToken::NAMESPACE
)
914 throw IllegalArgumentException("Invalid namespace token " + OUString::number(NamespaceToken
), css::uno::Reference
<css::uno::XInterface
>(), 0);
// DONTKNOW means the URL has no token yet.
916 if( GetNamespaceToken( NamespaceURL
) == FastToken::DONTKNOW
)
918 maNamespaceMap
[ NamespaceURL
] = NamespaceToken
;
921 throw IllegalArgumentException("namespace URL is already registered: " + NamespaceURL
, css::uno::Reference
<css::uno::XInterface
>(), 0);
// UTF-16 front end for GetNamespaceURL(): converts the prefix to UTF-8 and
// reports a caught Exception as IllegalArgumentException.
// NOTE(review): the try line and braces are not visible in this excerpt.
924 OUString
const & FastSaxParserImpl::getNamespaceURL( std::u16string_view rPrefix
)
928 return GetNamespaceURL( OUStringToOString( rPrefix
, RTL_TEXTENCODING_UTF8
) );
930 catch (const Exception
&)
933 throw IllegalArgumentException();
936 void FastSaxParserImpl::setErrorHandler(const Reference
< XErrorHandler
> & Handler
)
938 maData
.mxErrorHandler
= Handler
;
941 void FastSaxParserImpl::setNamespaceHandler( const Reference
< XFastNamespaceHandler
>& Handler
)
943 maData
.mxNamespaceHandler
= Handler
;
946 void FastSaxParserImpl::setCustomEntityNames(
947 const ::css::uno::Sequence
<::css::beans::Pair
<::rtl::OUString
, ::rtl::OUString
>>& replacements
)
949 m_Replacements
.resize(replacements
.size());
950 for (size_t i
= 0; i
< replacements
.size(); ++i
)
952 m_Replacements
[i
].name
= replacements
[i
].First
;
953 m_Replacements
[i
].replacement
= replacements
[i
].Second
;
955 if (m_Replacements
.size() > 1)
956 std::sort(m_Replacements
.begin(), m_Replacements
.end());
959 void FastSaxParserImpl::deleteUsedEvents()
961 Entity
& rEntity
= getEntity();
962 std::unique_lock
aGuard(rEntity
.maEventProtector
);
964 while (!rEntity
.maUsedEvents
.empty())
966 { // the block makes sure that aEventList is destructed outside the lock
967 EventList aEventList
= std::move(rEntity
.maUsedEvents
.front());
968 rEntity
.maUsedEvents
.pop();
970 aGuard
.unlock(); // unlock
973 aGuard
.lock(); // lock
977 void FastSaxParserImpl::produce( bool bForceFlush
)
979 Entity
& rEntity
= getEntity();
981 rEntity
.mnProducedEventsSize
>= Entity::mnEventListSize
))
984 std::unique_lock
aGuard(rEntity
.maEventProtector
);
986 while (rEntity
.maPendingEvents
.size() >= Entity::mnEventHighWater
)
987 { // pause parsing for a bit
988 aGuard
.unlock(); // unlock
989 rEntity
.maProduceResume
.wait();
990 rEntity
.maProduceResume
.reset();
991 aGuard
.lock(); // lock
994 rEntity
.maPendingEvents
.push(std::move(*rEntity
.mxProducedEvents
));
995 rEntity
.mxProducedEvents
.reset();
996 assert(!rEntity
.mxProducedEvents
);
998 aGuard
.unlock(); // unlock
1000 rEntity
.maConsumeResume
.set();
1003 bool FastSaxParserImpl::consume(EventList
& rEventList
)
1005 Entity
& rEntity
= getEntity();
1006 rEventList
.mbIsAttributesEmpty
= false;
1007 for (auto& rEvent
: rEventList
.maEvents
)
1009 switch (rEvent
.maType
)
1011 case CallbackType::START_ELEMENT
:
1012 rEntity
.startElement( &rEvent
);
1014 case CallbackType::END_ELEMENT
:
1015 rEntity
.endElement();
1017 case CallbackType::CHARACTERS
:
1018 rEntity
.characters( rEvent
.msChars
);
1020 case CallbackType::PROCESSING_INSTRUCTION
:
1021 rEntity
.processingInstruction(
1022 rEvent
.msNamespace
, rEvent
.msElementName
); // ( target, data )
1024 case CallbackType::DONE
:
1026 case CallbackType::EXCEPTION
:
1027 rEntity
.throwException( mxDocumentLocator
, false );
1028 [[fallthrough
]]; // avoid unreachable code warning with some compilers
1037 void FastSaxParserImpl::pushEntity(const ParserData
& rEntityData
,
1038 xml::sax::InputSource
const& rSource
)
1040 if (!rSource
.aInputStream
.is())
1041 throw SAXException(u
"No input source"_ustr
, Reference
<XInterface
>(), Any());
1043 maEntities
.emplace(rEntityData
);
1044 mpTop
= &maEntities
.top();
1046 mpTop
->maStructSource
= rSource
;
1048 mpTop
->maConverter
.setInputStream(mpTop
->maStructSource
.aInputStream
);
1049 if (!mpTop
->maStructSource
.sEncoding
.isEmpty())
1051 mpTop
->maConverter
.setEncoding(OUStringToOString(mpTop
->maStructSource
.sEncoding
, RTL_TEXTENCODING_ASCII_US
));
1055 void FastSaxParserImpl::popEntity()
1058 mpTop
= !maEntities
.empty() ? &maEntities
.top() : nullptr;
1061 // starts parsing with actual parser !
1062 void FastSaxParserImpl::parse()
1064 const int BUFFER_SIZE
= 16 * 1024;
1065 Sequence
< sal_Int8
> seqOut( BUFFER_SIZE
);
1067 Entity
& rEntity
= getEntity();
1069 // set all necessary C-Callbacks
1070 static xmlSAXHandler callbacks
;
1071 callbacks
.startElementNs
= call_callbackStartElement
;
1072 callbacks
.endElementNs
= call_callbackEndElement
;
1073 callbacks
.characters
= call_callbackCharacters
;
1074 callbacks
.processingInstruction
= call_callbackProcessingInstruction
;
1075 callbacks
.getEntity
= call_callbackGetEntity
;
1076 callbacks
.initialized
= XML_SAX2_MAGIC
;
1080 nRead
= rEntity
.maConverter
.readAndConvert( seqOut
, BUFFER_SIZE
);
1083 if( rEntity
.mpParser
!= nullptr )
1085 if( xmlParseChunk( rEntity
.mpParser
, reinterpret_cast<const char*>(seqOut
.getConstArray()), 0, 1 ) != XML_ERR_OK
)
1086 rEntity
.throwException( mxDocumentLocator
, true );
1087 if (rEntity
.hasException())
1088 rEntity
.throwException(mxDocumentLocator
, true);
1093 bool bContinue
= true;
1094 if( rEntity
.mpParser
== nullptr )
1096 // create parser with proper encoding (needs the first chunk of data)
1097 rEntity
.mpParser
= xmlCreatePushParserCtxt( &callbacks
, this,
1098 reinterpret_cast<const char*>(seqOut
.getConstArray()), nRead
, nullptr );
1099 if( !rEntity
.mpParser
)
1100 throw SAXException(u
"Couldn't create parser"_ustr
, Reference
< XInterface
>(), Any() );
1102 // Tell libxml2 parser to decode entities in attribute values.
1103 // Also allow XML attribute values which are larger than 10MB, because this used to work
1105 // coverity[unsafe_xml_parse_config] - entity support is required
1106 xmlCtxtUseOptions(rEntity
.mpParser
, XML_PARSE_NOENT
| XML_PARSE_HUGE
);
1110 bContinue
= xmlParseChunk( rEntity
.mpParser
, reinterpret_cast<const char*>(seqOut
.getConstArray()), nRead
, 0 )
1114 // callbacks used inside XML_Parse may have caught an exception
1117 rEntity
.throwException( mxDocumentLocator
, true );
1119 if (rEntity
.hasException())
1121 rEntity
.throwException( mxDocumentLocator
, true );
1123 } while( nRead
> 0 );
1124 rEntity
.getEvent( CallbackType::DONE
);
1125 if( rEntity
.mbEnableThreads
)
// libxml2 startElementNs callback.
//
// localName / prefix / URI describe the element; namespaces holds
// numNamespaces (prefix, URI) pairs and attributes holds numAttributes
// 5-tuples ( localname / prefix / nsURI / valueBegin / valueEnd ).
//
// Flushes pending character data, records the namespace declarations,
// fills the attribute list(s) of a START_ELEMENT event and resolves the
// element token. The event is then either queued for the consumer
// (threaded mode) or delivered directly. Any exception thrown while
// processing is saved on the entity instead of propagating to the caller.
void FastSaxParserImpl::callbackStartElement(const xmlChar *localName , const xmlChar* prefix, const xmlChar* URI,
    int numNamespaces, const xmlChar** namespaces, int numAttributes, const xmlChar **attributes)
{
    if (!pendingCharacters.empty())
        sendPendingCharacters();
    Entity& rEntity = getEntity();
    if( rEntity.maNamespaceCount.empty() )
    {
        // very first element: start counting and predefine the "xml" prefix
        rEntity.maNamespaceCount.push(0);
        DefineNamespace( "xml"_ostr, u"http://www.w3.org/XML/1998/namespace"_ustr);
    }
    else
    {
        // nested element: start from the parent's count
        rEntity.maNamespaceCount.push( rEntity.maNamespaceCount.top() );
    }

    // create attribute map and process namespace instructions
    Event& rEvent = rEntity.getEvent( CallbackType::START_ELEMENT );
    bool bIsAttributesEmpty = false;
    if ( rEntity.mbEnableThreads )
        bIsAttributesEmpty = rEntity.getEventList().mbIsAttributesEmpty;

    // reuse the event's attribute list if it has one; it only needs clearing
    // when the event list is not already flagged as having empty attributes
    if (rEvent.mxAttributes.is())
    {
        if( !bIsAttributesEmpty )
            rEvent.mxAttributes->clear();
    }
    else
        rEvent.mxAttributes.set(
                new FastAttributeList( rEntity.mxTokenHandler.get() ) );

    // the namespace-declaration list is only maintained with a namespace handler
    if( rEntity.mxNamespaceHandler.is() )
    {
        if (rEvent.mxDeclAttributes.is())
        {
            if( !bIsAttributesEmpty )
                rEvent.mxDeclAttributes->clear();
        }
        else
            rEvent.mxDeclAttributes.set(
                new FastAttributeList( rEntity.mxTokenHandler.get() ) );
    }

    // start from the namespace context of the enclosing element, if any
    OUString sNamespace;
    sal_Int32 nNamespaceToken = FastToken::DONTKNOW;
    if (!rEntity.maNamespaceStack.empty())
    {
        sNamespace = rEntity.maNamespaceStack.top().msName;
        nNamespaceToken = rEntity.maNamespaceStack.top().mnToken;
    }

    try
    {
        /*  #158414# Each element may define new namespaces, also for attributes.
            First, process all namespaces, second, process the attributes after namespaces
            have been initialized. */

        // #158414# first: get namespaces
        for (int i = 0; i < numNamespaces * 2; i += 2)
        {
            // namespaces[] is (prefix/URI)
            if( namespaces[ i ] != nullptr )
            {
                OString aPrefix( XML_CAST( namespaces[ i ] ));
                OUString namespaceURL( XML_CAST( namespaces[ i + 1 ] ), strlen( XML_CAST( namespaces[ i + 1 ] )), RTL_TEXTENCODING_UTF8 );
                NormalizeURI( namespaceURL );
                DefineNamespace(aPrefix, namespaceURL);
                if( rEntity.mxNamespaceHandler.is() )
                    rEvent.mxDeclAttributes->addUnknown( OString( XML_CAST( namespaces[ i ] ) ), OString( XML_CAST( namespaces[ i + 1 ] ) ) );
            }
            else
            {
                // default namespace
                sNamespace = OUString( XML_CAST( namespaces[ i + 1 ] ), strlen( XML_CAST( namespaces[ i + 1 ] )), RTL_TEXTENCODING_UTF8 );
                NormalizeURI( sNamespace );
                nNamespaceToken = GetNamespaceToken( sNamespace );
                if( rEntity.mxNamespaceHandler.is() )
                    rEvent.mxDeclAttributes->addUnknown( ""_ostr, OString( XML_CAST( namespaces[ i + 1 ] ) ) );
            }
        }

        if ( rEntity.mxTokenHandler.is() )
        {
            // #158414# second: fill attribute list with other attributes
            rEvent.mxAttributes->reserve( numAttributes );
            for (int i = 0; i < numAttributes * 5; i += 5)
            {
                // attributes[] is ( localname / prefix / nsURI / valueBegin / valueEnd )
                if( attributes[ i + 1 ] != nullptr )
                {
                    // prefixed attribute: tokenised or stored as unknown
                    sal_Int32 nAttributeToken = GetTokenWithPrefix(attributes[ i + 1 ], attributes[ i ]);
                    if( nAttributeToken != FastToken::DONTKNOW )
                        rEvent.mxAttributes->add( nAttributeToken, std::string_view(XML_CAST( attributes[ i + 3 ] ), attributes[ i + 4 ] - attributes[ i + 3 ]) );
                    else
                        addUnknownElementWithPrefix(attributes, i, rEvent.mxAttributes);
                }
                else
                {
                    // unprefixed attribute
                    sal_Int32 nAttributeToken = GetToken(attributes[ i ]);
                    if( nAttributeToken != FastToken::DONTKNOW )
                        rEvent.mxAttributes->add( nAttributeToken, std::string_view(XML_CAST( attributes[ i + 3 ] ), attributes[ i + 4 ] - attributes[ i + 3 ]) );
                    else
                    {
                        SAL_WARN("xmloff", "unknown attribute " << XML_CAST( attributes[ i ] ) << "=" <<
                            OString( XML_CAST( attributes[ i + 3 ] ), attributes[ i + 4 ] - attributes[ i + 3 ] ));
                        rEvent.mxAttributes->addUnknown( XML_CAST( attributes[ i ] ),
                            OString( XML_CAST( attributes[ i + 3 ] ), attributes[ i + 4 ] - attributes[ i + 3 ] ));
                    }
                }
            }

            // resolve the element token: explicit prefix first, then the
            // namespace in effect, then the bare local name
            if( prefix != nullptr )
                rEvent.mnElementToken = GetTokenWithPrefix(prefix, localName);
            else if( !sNamespace.isEmpty() )
                rEvent.mnElementToken = GetTokenWithContextNamespace(nNamespaceToken, localName);
            else
                rEvent.mnElementToken = GetToken(localName);
        }
        else
        {
            // no token handler: every attribute is stored as unknown
            for (int i = 0; i < numAttributes * 5; i += 5)
            {
                if( attributes[ i + 1 ] != nullptr )
                    addUnknownElementWithPrefix(attributes, i, rEvent.mxAttributes);
                else
                    rEvent.mxAttributes->addUnknown( XML_CAST( attributes[ i ] ),
                        OString( XML_CAST( attributes[ i + 3 ] ), attributes[ i + 4 ] - attributes[ i + 3 ] ));
            }

            rEvent.mnElementToken = FastToken::DONTKNOW;
        }

        if( rEvent.mnElementToken == FastToken::DONTKNOW )
        {
            // unknown element: pass namespace and qualified name as strings
            OUString aElementPrefix;
            if( prefix != nullptr )
            {
                aElementPrefix = OUString( XML_CAST( prefix ), strlen( XML_CAST( prefix )), RTL_TEXTENCODING_UTF8 );
                if ( URI != nullptr )
                    sNamespace = OUString( XML_CAST( URI ), strlen( XML_CAST( URI )), RTL_TEXTENCODING_UTF8 );
                else if ( m_bIgnoreMissingNSDecl )
                    sNamespace.clear();
                else
                    throw SAXException("No namespace defined for " + aElementPrefix, {}, {});
                nNamespaceToken = GetNamespaceToken( sNamespace );
            }
            OUString aElementLocalName( XML_CAST( localName ), strlen( XML_CAST( localName )), RTL_TEXTENCODING_UTF8 );
            rEvent.msNamespace = sNamespace;
            if( aElementPrefix.isEmpty() )
                rEvent.msElementName = std::move(aElementLocalName);
            else
                rEvent.msElementName = aElementPrefix + ":" + aElementLocalName;
        }
        else // token is always preferred.
            rEvent.msElementName.clear();

        // remember the namespace context for child elements
        rEntity.maNamespaceStack.push( NameWithToken(sNamespace, nNamespaceToken) );
        if (rEntity.mbEnableThreads)
            produce();
        else
        {
            SAL_INFO("sax.fastparser", " startElement line " << mxDocumentLocator->getLineNumber() << " column " << mxDocumentLocator->getColumnNumber() << " " << ( prefix ? XML_CAST(prefix) : "(null)" ) << ":" << localName);
            rEntity.startElement( &rEvent );
        }
    }
    catch (...)
    {
        // keep the exception for later instead of letting it propagate
        rEntity.saveException( ::cppu::getCaughtException() );
    }
}
1301 void FastSaxParserImpl::addUnknownElementWithPrefix(const xmlChar
**attributes
, int i
, rtl::Reference
< FastAttributeList
> const & xAttributes
)
1303 OUString aNamespaceURI
;
1304 if ( !m_bIgnoreMissingNSDecl
|| attributes
[i
+ 2] != nullptr )
1305 aNamespaceURI
= OUString( XML_CAST( attributes
[ i
+ 2 ] ), strlen( XML_CAST( attributes
[ i
+ 2 ] )), RTL_TEXTENCODING_UTF8
);
1306 const OString
aPrefix( XML_CAST( attributes
[ i
+ 1 ] ));
1307 const OString
aLocalName( XML_CAST( attributes
[ i
] ));
1308 OString aQualifiedName
= (aPrefix
.isEmpty())? aLocalName
: aPrefix
+ ":" + aLocalName
;
1309 xAttributes
->addUnknown( aNamespaceURI
, aQualifiedName
,
1310 OString( XML_CAST( attributes
[ i
+ 3 ] ), attributes
[ i
+ 4 ] - attributes
[ i
+ 3 ] ));
1311 SAL_INFO("xmloff", "unknown element " << aQualifiedName
<< " " << aNamespaceURI
);
1314 void FastSaxParserImpl::callbackEndElement()
1316 if (!pendingCharacters
.empty())
1317 sendPendingCharacters();
1318 Entity
& rEntity
= getEntity();
1319 SAL_WARN_IF(rEntity
.maNamespaceCount
.empty(), "sax", "Empty NamespaceCount");
1320 if( !rEntity
.maNamespaceCount
.empty() )
1321 rEntity
.maNamespaceCount
.pop();
1323 SAL_WARN_IF(rEntity
.maNamespaceStack
.empty(), "sax", "Empty NamespaceStack");
1324 if( !rEntity
.maNamespaceStack
.empty() )
1325 rEntity
.maNamespaceStack
.pop();
1327 rEntity
.getEvent( CallbackType::END_ELEMENT
);
1328 if (rEntity
.mbEnableThreads
)
1331 rEntity
.endElement();
1334 void FastSaxParserImpl::callbackCharacters( const xmlChar
* s
, int nLen
)
1336 // SAX interface allows that the characters callback splits content of one XML node
1337 // (e.g. because there's an entity that needs decoding), however for consumers it's
1338 // simpler FastSaxParser's character callback provides the whole string at once,
1339 // so merge data from possible multiple calls and send them at once (before the element
1340 // ends or another one starts).
1342 // We use a std::vector<char> to avoid calling into the OUString constructor more than once when
1343 // we have multiple callbackCharacters() calls that we have to merge, which happens surprisingly
1344 // often in writer documents.
1345 int nOriginalLen
= pendingCharacters
.size();
1346 pendingCharacters
.resize(nOriginalLen
+ nLen
);
1347 memcpy(pendingCharacters
.data() + nOriginalLen
, s
, nLen
);
1350 void FastSaxParserImpl::sendPendingCharacters()
1352 Entity
& rEntity
= getEntity();
1353 OUString
sChars( pendingCharacters
.data(), pendingCharacters
.size(), RTL_TEXTENCODING_UTF8
);
1354 if (rEntity
.mbEnableThreads
)
1356 Event
& rEvent
= rEntity
.getEvent( CallbackType::CHARACTERS
);
1357 rEvent
.msChars
= std::move(sChars
);
1361 rEntity
.characters( sChars
);
1362 pendingCharacters
.resize(0);
1365 void FastSaxParserImpl::callbackProcessingInstruction( const xmlChar
*target
, const xmlChar
*data
)
1367 if (!pendingCharacters
.empty())
1368 sendPendingCharacters();
1369 Entity
& rEntity
= getEntity();
1370 Event
& rEvent
= rEntity
.getEvent( CallbackType::PROCESSING_INSTRUCTION
);
1372 // This event is very rare, so no need to waste extra space for this
1373 // Using namespace and element strings to be target and data in that order.
1374 rEvent
.msNamespace
= OUString( XML_CAST( target
), strlen( XML_CAST( target
) ), RTL_TEXTENCODING_UTF8
);
1375 if ( data
!= nullptr )
1376 rEvent
.msElementName
= OUString( XML_CAST( data
), strlen( XML_CAST( data
) ), RTL_TEXTENCODING_UTF8
);
1378 rEvent
.msElementName
.clear();
1380 if (rEntity
.mbEnableThreads
)
1383 rEntity
.processingInstruction( rEvent
.msNamespace
, rEvent
.msElementName
);
1386 xmlEntityPtr
FastSaxParserImpl::callbackGetEntity( const xmlChar
*name
)
1389 return xmlGetPredefinedEntity(name
);
1390 const char* dname
= XML_CAST(name
);
1391 int lname
= strlen(dname
);
1393 return xmlGetPredefinedEntity(name
);
1394 if (m_Replacements
.size() > 0)
1396 auto it
= std::lower_bound(m_Replacements
.begin(), m_Replacements
.end(), dname
);
1397 if (it
!= m_Replacements
.end() && it
->name
.compareToAscii(dname
) == 0)
1399 xmlEntityPtr entpt
= xmlNewEntity(
1400 nullptr, name
, XML_INTERNAL_GENERAL_ENTITY
, nullptr, nullptr,
1401 BAD_CAST(OUStringToOString(it
->replacement
, RTL_TEXTENCODING_UTF8
).getStr()));
1402 m_TemporalEntities
.push_back(entpt
);
1407 return xmlGetPredefinedEntity(name
);
1408 if ( dname
[0] == '#' )
1410 sal_uInt32 cval
= 0;
1411 if( dname
[1] == 'x' || dname
[1] == 'X' )
1414 return xmlGetPredefinedEntity(name
);
1415 cval
= static_cast<sal_uInt32
>( strtoul( dname
+ 2, nullptr, 16 ) );
1417 return xmlGetPredefinedEntity(name
);
1418 OUString
vname( &cval
, 1 );
1420 = xmlNewEntity(nullptr, name
, XML_INTERNAL_GENERAL_ENTITY
, nullptr, nullptr,
1421 BAD_CAST(OUStringToOString(vname
, RTL_TEXTENCODING_UTF8
).getStr()));
1422 m_TemporalEntities
.push_back(entpt
);
1427 cval
= static_cast<sal_uInt32
>( strtoul( dname
+ 2, nullptr, 10 ) );
1429 return xmlGetPredefinedEntity(name
);
1430 OUString
vname(&cval
, 1);
1432 = xmlNewEntity(nullptr, name
, XML_INTERNAL_GENERAL_ENTITY
, nullptr, nullptr,
1433 BAD_CAST(OUStringToOString(vname
, RTL_TEXTENCODING_UTF8
).getStr()));
1434 m_TemporalEntities
.push_back(entpt
);
1438 return xmlGetPredefinedEntity(name
);
1441 FastSaxParser::FastSaxParser() : mpImpl(new FastSaxParserImpl
) {}
1443 FastSaxParser::~FastSaxParser()
1448 FastSaxParser::initialize(css::uno::Sequence
< css::uno::Any
> const& rArguments
)
1450 if (!rArguments
.hasElements())
1454 if ( !(rArguments
[0] >>= str
) )
1455 throw IllegalArgumentException();
1457 if ( str
== "IgnoreMissingNSDecl" )
1458 mpImpl
->m_bIgnoreMissingNSDecl
= true;
1459 else if ( str
== "DoSmeplease" )
1460 ; //just ignore as this is already immune to billion laughs
1461 else if ( str
== "DisableThreadedParser" )
1462 mpImpl
->m_bDisableThreadedParser
= true;
1464 throw IllegalArgumentException();
1468 void FastSaxParser::parseStream( const xml::sax::InputSource
& aInputSource
)
1470 mpImpl
->parseStream(aInputSource
);
1473 void FastSaxParser::setFastDocumentHandler( const uno::Reference
<xml::sax::XFastDocumentHandler
>& Handler
)
1475 mpImpl
->setFastDocumentHandler(Handler
);
1478 void FastSaxParser::setTokenHandler( const uno::Reference
<xml::sax::XFastTokenHandler
>& Handler
)
1480 mpImpl
->setTokenHandler(Handler
);
1483 void FastSaxParser::registerNamespace( const OUString
& NamespaceURL
, sal_Int32 NamespaceToken
)
1485 mpImpl
->registerNamespace(NamespaceURL
, NamespaceToken
);
1488 OUString
FastSaxParser::getNamespaceURL( const OUString
& rPrefix
)
1490 return mpImpl
->getNamespaceURL(rPrefix
);
1493 void FastSaxParser::setErrorHandler( const uno::Reference
< xml::sax::XErrorHandler
>& Handler
)
1495 mpImpl
->setErrorHandler(Handler
);
1498 void FastSaxParser::setEntityResolver( const uno::Reference
< xml::sax::XEntityResolver
>& )
1503 void FastSaxParser::setLocale( const lang::Locale
& )
1508 void FastSaxParser::setNamespaceHandler( const uno::Reference
< css::xml::sax::XFastNamespaceHandler
>& Handler
)
1510 mpImpl
->setNamespaceHandler(Handler
);
1513 OUString
FastSaxParser::getImplementationName()
1515 return u
"com.sun.star.comp.extensions.xml.sax.FastParser"_ustr
;
1518 void FastSaxParser::setCustomEntityNames(
1519 const ::css::uno::Sequence
<::css::beans::Pair
<::rtl::OUString
, ::rtl::OUString
>>& replacements
)
1521 mpImpl
->setCustomEntityNames(replacements
);
1524 sal_Bool
FastSaxParser::supportsService( const OUString
& ServiceName
)
1526 return cppu::supportsService(this, ServiceName
);
1529 uno::Sequence
<OUString
> FastSaxParser::getSupportedServiceNames()
1531 return { u
"com.sun.star.xml.sax.FastParser"_ustr
};
1534 } // namespace sax_fastparser
1536 extern "C" SAL_DLLPUBLIC_EXPORT
css::uno::XInterface
*
1537 com_sun_star_comp_extensions_xml_sax_FastParser_get_implementation(
1538 css::uno::XComponentContext
*,
1539 css::uno::Sequence
<css::uno::Any
> const &)
1541 return cppu::acquire(new FastSaxParser
);
1544 // ----------------------------------------------------------
1545 // copy of the code in xmloff/source/core/namespace.cxx, which adds namespace aliases
1546 // for various dodgy namespace decls in the wild.
// Forward declarations: both helpers are defined below NormalizeURI(), which
// calls them.
static bool NormalizeW3URI( OUString& rName );
static bool NormalizeOasisURN( OUString& rName );
1551 static void NormalizeURI( OUString
& rName
)
1553 // try OASIS + W3 URI normalization
1554 bool bSuccess
= NormalizeOasisURN( rName
);
1556 NormalizeW3URI( rName
);
// Building blocks and well-known namespace names used by NormalizeW3URI()
// and NormalizeOasisURN() below.

// W3 URI prefix and the xforms suffix checked by NormalizeW3URI(), plus the
// name such a URI is replaced with
constexpr OUStringLiteral XML_URI_W3_PREFIX(u"http://www.w3.org/");
constexpr OUStringLiteral XML_URI_XFORMS_SUFFIX(u"/xforms");
constexpr OUStringLiteral XML_N_XFORMS_1_0(u"http://www.w3.org/2002/xforms");
// namespace names that NormalizeOasisURN() maps to their *_COMPAT counterpart
constexpr OUStringLiteral XML_N_SVG(u"http://www.w3.org/2000/svg");
constexpr OUStringLiteral XML_N_SVG_COMPAT(u"urn:oasis:names:tc:opendocument:xmlns:svg-compatible:1.0");
constexpr OUStringLiteral XML_N_FO(u"http://www.w3.org/1999/XSL/Format");
constexpr OUStringLiteral XML_N_FO_COMPAT(u"urn:oasis:names:tc:opendocument:xmlns:xsl-fo-compatible:1.0");
constexpr OUStringLiteral XML_N_SMIL(u"http://www.w3.org/2001/SMIL20/");
constexpr OUStringLiteral XML_N_SMIL_OLD(u"http://www.w3.org/2001/SMIL20");
constexpr OUStringLiteral XML_N_SMIL_COMPAT(u"urn:oasis:names:tc:opendocument:xmlns:smil-compatible:1.0");
// pieces of the OASIS URN pattern matched and rebuilt by NormalizeOasisURN()
constexpr OUStringLiteral XML_URN_OASIS_NAMES_TC(u"urn:oasis:names:tc");
constexpr OUStringLiteral XML_XMLNS(u"xmlns");
constexpr OUStringLiteral XML_OPENDOCUMENT(u"opendocument");
constexpr OUStringLiteral XML_1_0(u"1.0");
1574 static bool NormalizeW3URI( OUString
& rName
)
1576 // check if URI matches:
1577 // http://www.w3.org/[0-9]*/[:letter:]*
1579 // For the following WG/standards names:
1582 bool bSuccess
= false;
1583 const OUString sURIPrefix
= XML_URI_W3_PREFIX
;
1584 if( rName
.startsWith( sURIPrefix
) )
1586 const OUString sURISuffix
= XML_URI_XFORMS_SUFFIX
;
1587 sal_Int32 nCompareFrom
= rName
.getLength() - sURISuffix
.getLength();
1588 if( rName
.subView( nCompareFrom
) == sURISuffix
)
1590 // found W3 prefix, and xforms suffix
1591 rName
= XML_N_XFORMS_1_0
;
1598 static bool NormalizeOasisURN( OUString
& rName
)
1601 // we exported the wrong namespace for smil, so we correct this here on load
1602 // for older documents
1603 if( rName
== XML_N_SVG
)
1605 rName
= XML_N_SVG_COMPAT
;
1608 else if( rName
== XML_N_FO
)
1610 rName
= XML_N_FO_COMPAT
;
1613 else if( rName
== XML_N_SMIL
|| rName
== XML_N_SMIL_OLD
)
1615 rName
= XML_N_SMIL_COMPAT
;
1620 // Check if URN matches
1621 // :urn:oasis:names:tc:[^:]*:xmlns:[^:]*:1.[^:]*
1622 // |---| |---| |-----|
1623 // TC-Id Sub-Id Version
1625 sal_Int32 nNameLen
= rName
.getLength();
1626 // :urn:oasis:names:tc.*
1627 const OUString aOasisURN
= XML_URN_OASIS_NAMES_TC
;
1628 if( !rName
.startsWith( aOasisURN
) )
1631 // :urn:oasis:names:tc:.*
1632 sal_Int32 nPos
= aOasisURN
.getLength();
1633 if( nPos
>= nNameLen
|| rName
[nPos
] != ':' )
1636 // :urn:oasis:names:tc:[^:]:.*
1637 sal_Int32 nTCIdStart
= nPos
+1;
1638 sal_Int32 nTCIdEnd
= rName
.indexOf( ':', nTCIdStart
);
1639 if( -1 == nTCIdEnd
)
1642 // :urn:oasis:names:tc:[^:]:xmlns.*
1643 nPos
= nTCIdEnd
+ 1;
1644 std::u16string_view
sTmp( rName
.subView( nPos
) );
1645 const OUString aXMLNS
= XML_XMLNS
;
1646 if( !o3tl::starts_with(sTmp
, aXMLNS
) )
1649 // :urn:oasis:names:tc:[^:]:xmlns:.*
1650 nPos
+= aXMLNS
.getLength();
1651 if( nPos
>= nNameLen
|| rName
[nPos
] != ':' )
1654 // :urn:oasis:names:tc:[^:]:xmlns:[^:]*:.*
1655 nPos
= rName
.indexOf( ':', nPos
+1 );
1659 // :urn:oasis:names:tc:[^:]:xmlns:[^:]*:[^:][^:][^:][^:]*
1660 sal_Int32 nVersionStart
= nPos
+1;
1661 if( nVersionStart
+2 >= nNameLen
||
1662 -1 != rName
.indexOf( ':', nVersionStart
) )
1665 // :urn:oasis:names:tc:[^:]:xmlns:[^:]*:1\.[^:][^:]*
1666 if( rName
[nVersionStart
] != '1' || rName
[nVersionStart
+1] != '.' )
1669 // replace [tcid] with current TCID and version with current version.
1671 rName
= rName
.subView( 0, nTCIdStart
) +
1673 rName
.subView( nTCIdEnd
, nVersionStart
-nTCIdEnd
) +
1680 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */