Version 7.6.3.2-android, tag libreoffice-7.6.3.2-android
[LibreOffice.git] / sax / source / fastparser / fastparser.cxx
blob93e661313bd3c1bda944d15c612ab808eb7a536c
/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
/*
 * This file is part of the LibreOffice project.
 *
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
 *
 * This file incorporates work covered by the following license notice:
 *
 *   Licensed to the Apache Software Foundation (ASF) under one or more
 *   contributor license agreements. See the NOTICE file distributed
 *   with this work for additional information regarding copyright
 *   ownership. The ASF licenses this file to you under the Apache
 *   License, Version 2.0 (the "License"); you may not use this file
 *   except in compliance with the License. You may obtain a copy of
 *   the License at http://www.apache.org/licenses/LICENSE-2.0 .
 */
20 #include <sax/fastparser.hxx>
21 #include <sax/fastattribs.hxx>
22 #include <utility>
23 #include <xml2utf.hxx>
25 #include <com/sun/star/io/XSeekable.hpp>
26 #include <com/sun/star/lang/DisposedException.hpp>
27 #include <com/sun/star/lang/IllegalArgumentException.hpp>
28 #include <com/sun/star/uno/XComponentContext.hpp>
29 #include <com/sun/star/xml/sax/FastToken.hpp>
30 #include <com/sun/star/xml/sax/SAXParseException.hpp>
31 #include <com/sun/star/xml/sax/XFastContextHandler.hpp>
32 #include <cppuhelper/implbase.hxx>
33 #include <cppuhelper/supportsservice.hxx>
34 #include <cppuhelper/exc_hlp.hxx>
35 #include <osl/conditn.hxx>
36 #include <rtl/ref.hxx>
37 #include <rtl/ustrbuf.hxx>
38 #include <sal/log.hxx>
39 #include <salhelper/thread.hxx>
40 #include <comphelper/diagnose_ex.hxx>
41 #include <o3tl/string_view.hxx>
43 #include <queue>
44 #include <memory>
45 #include <mutex>
46 #include <optional>
47 #include <stack>
48 #include <string_view>
49 #include <unordered_map>
50 #include <vector>
51 #include <cassert>
52 #include <cstring>
53 #include <libxml/parser.h>
55 // Inverse of libxml's BAD_CAST.
56 #define XML_CAST( str ) reinterpret_cast< const char* >( str )
58 using namespace ::osl;
59 using namespace ::cppu;
60 using namespace ::com::sun::star::uno;
61 using namespace ::com::sun::star::lang;
62 using namespace ::com::sun::star::xml::sax;
63 using namespace ::com::sun::star::io;
64 using namespace com::sun::star;
65 using namespace sax_fastparser;
67 static void NormalizeURI( OUString& rName );
69 namespace {
71 struct Event;
72 class FastLocatorImpl;
73 struct NamespaceDefine;
74 struct Entity;
76 typedef std::unordered_map< OUString, sal_Int32 > NamespaceMap;
78 struct EventList
80 std::vector<Event> maEvents;
81 bool mbIsAttributesEmpty;
/// Kind of an Event queued for the consumer.
enum class CallbackType
{
    START_ELEMENT,          ///< an element was opened
    END_ELEMENT,            ///< an element was closed
    CHARACTERS,             ///< character data
    PROCESSING_INSTRUCTION, ///< a processing instruction
    DONE,                   ///< consume() stops and reports "finished"
    EXCEPTION               ///< consume() re-throws the pending error
};
86 struct Event
88 CallbackType maType;
89 sal_Int32 mnElementToken;
90 OUString msNamespace;
91 OUString msElementName;
92 rtl::Reference< FastAttributeList > mxAttributes;
93 rtl::Reference< FastAttributeList > mxDeclAttributes;
94 OUString msChars;
97 struct NameWithToken
99 OUString msName;
100 sal_Int32 mnToken;
102 NameWithToken(OUString sName, sal_Int32 nToken) :
103 msName(std::move(sName)), mnToken(nToken) {}
106 struct SaxContext
108 Reference< XFastContextHandler > mxContext;
109 sal_Int32 mnElementToken;
110 std::optional<OUString> moNamespace;
111 std::optional<OUString> moElementName;
113 SaxContext( sal_Int32 nElementToken, const OUString& aNamespace, const OUString& aElementName ):
114 mnElementToken(nElementToken)
116 if (nElementToken == FastToken::DONTKNOW)
118 moNamespace = aNamespace;
119 moElementName = aElementName;
124 struct ParserData
126 css::uno::Reference< css::xml::sax::XFastDocumentHandler > mxDocumentHandler;
127 rtl::Reference<FastTokenHandlerBase> mxTokenHandler;
128 css::uno::Reference< css::xml::sax::XErrorHandler > mxErrorHandler;
129 css::uno::Reference< css::xml::sax::XFastNamespaceHandler >mxNamespaceHandler;
131 ParserData();
134 struct NamespaceDefine
136 OString maPrefix;
137 sal_Int32 mnToken;
138 OUString maNamespaceURL;
140 NamespaceDefine( OString aPrefix, sal_Int32 nToken, OUString aNamespaceURL )
141 : maPrefix(std::move( aPrefix )), mnToken( nToken ), maNamespaceURL(std::move( aNamespaceURL )) {}
142 NamespaceDefine() : mnToken(-1) {}
145 // Entity binds all information needed for a single file | single call of parseStream
146 struct Entity : public ParserData
148 // Amount of work producer sends to consumer in one iteration:
149 static const size_t mnEventListSize = 1000;
151 // unique for each Entity instance:
153 // Number of valid events in mxProducedEvents:
154 size_t mnProducedEventsSize;
155 std::optional<EventList> mxProducedEvents;
156 std::queue<EventList> maPendingEvents;
157 std::queue<EventList> maUsedEvents;
158 std::mutex maEventProtector;
160 static const size_t mnEventLowWater = 4;
161 static const size_t mnEventHighWater = 8;
162 osl::Condition maConsumeResume;
163 osl::Condition maProduceResume;
164 // Event we use to store data if threading is disabled:
165 Event maSharedEvent;
167 // copied in copy constructor:
169 // Allow to disable threading for small documents:
170 bool mbEnableThreads;
171 css::xml::sax::InputSource maStructSource;
172 xmlParserCtxtPtr mpParser;
173 ::sax_expatwrap::XMLFile2UTFConverter maConverter;
175 // Exceptions cannot be thrown through the C-XmlParser (possible
176 // resource leaks), therefore any exception thrown by a UNO callback
177 // must be saved somewhere until the C-XmlParser is stopped.
178 css::uno::Any maSavedException;
179 std::mutex maSavedExceptionMutex;
180 void saveException( const Any & e );
181 // Thread-safe check if maSavedException has value
182 bool hasException();
183 void throwException( const ::rtl::Reference< FastLocatorImpl > &xDocumentLocator,
184 bool mbDuringParse );
186 std::stack< NameWithToken, std::vector<NameWithToken> > maNamespaceStack;
187 /* Context for main thread consuming events.
188 * startElement() stores the data, which characters() and endElement() uses
190 std::stack< SaxContext, std::vector<SaxContext> > maContextStack;
191 // Determines which elements of maNamespaceDefines are valid in current context
192 std::stack< sal_uInt32, std::vector<sal_uInt32> > maNamespaceCount;
193 std::vector< NamespaceDefine > maNamespaceDefines;
195 explicit Entity( const ParserData& rData );
196 Entity( const Entity& rEntity ) = delete;
197 Entity& operator=( const Entity& rEntity ) = delete;
198 void startElement( Event const *pEvent );
199 void characters( const OUString& sChars );
200 void endElement();
201 void processingInstruction( const OUString& rTarget, const OUString& rData );
202 EventList& getEventList();
203 Event& getEvent( CallbackType aType );
206 // Stuff for custom entity names
207 struct ReplacementPair
209 OUString name;
210 OUString replacement;
212 inline bool operator<(const ReplacementPair& lhs, const ReplacementPair& rhs)
214 return lhs.name < rhs.name;
216 inline bool operator<(const ReplacementPair& lhs, const char* rhs)
218 return lhs.name.compareToAscii(rhs) < 0;
221 } // namespace
223 namespace sax_fastparser {
225 class FastSaxParserImpl
227 public:
228 explicit FastSaxParserImpl();
229 ~FastSaxParserImpl();
231 private:
232 std::vector<ReplacementPair> m_Replacements;
233 std::vector<xmlEntityPtr> m_TemporalEntities;
235 public:
236 // XFastParser
237 /// @throws css::xml::sax::SAXException
238 /// @throws css::io::IOException
239 /// @throws css::uno::RuntimeException
240 void parseStream( const css::xml::sax::InputSource& aInputSource );
241 /// @throws css::uno::RuntimeException
242 void setFastDocumentHandler( const css::uno::Reference< css::xml::sax::XFastDocumentHandler >& Handler );
243 /// @throws css::uno::RuntimeException
244 void setTokenHandler( const css::uno::Reference< css::xml::sax::XFastTokenHandler >& Handler );
245 /// @throws css::lang::IllegalArgumentException
246 /// @throws css::uno::RuntimeException
247 void registerNamespace( const OUString& NamespaceURL, sal_Int32 NamespaceToken );
248 /// @throws css::lang::IllegalArgumentException
249 /// @throws css::uno::RuntimeException
250 OUString const & getNamespaceURL( std::u16string_view rPrefix );
251 /// @throws css::uno::RuntimeException
252 void setErrorHandler( const css::uno::Reference< css::xml::sax::XErrorHandler >& Handler );
253 /// @throws css::uno::RuntimeException
254 void setNamespaceHandler( const css::uno::Reference< css::xml::sax::XFastNamespaceHandler >& Handler);
255 // Fake DTD file
256 void setCustomEntityNames(
257 const ::css::uno::Sequence<::css::beans::Pair<::rtl::OUString, ::rtl::OUString>>& replacements);
259 // called by the C callbacks of the expat parser
260 void callbackStartElement( const xmlChar *localName , const xmlChar* prefix, const xmlChar* URI,
261 int numNamespaces, const xmlChar** namespaces, int numAttributes, const xmlChar **attributes );
262 void callbackEndElement();
263 void callbackCharacters( const xmlChar* s, int nLen );
264 void callbackProcessingInstruction( const xmlChar *target, const xmlChar *data );
265 xmlEntityPtr callbackGetEntity( const xmlChar *name );
267 void pushEntity(const ParserData&, xml::sax::InputSource const&);
268 void popEntity();
269 Entity& getEntity() { return *mpTop; }
270 void parse();
271 void produce( bool bForceFlush = false );
272 bool m_bIgnoreMissingNSDecl;
273 bool m_bDisableThreadedParser;
275 private:
276 bool consume(EventList&);
277 void deleteUsedEvents();
278 void sendPendingCharacters();
279 void addUnknownElementWithPrefix(const xmlChar **attributes, int i, rtl::Reference< FastAttributeList > const & xAttributes);
281 sal_Int32 GetToken( const xmlChar* pName );
282 /// @throws css::xml::sax::SAXException
283 sal_Int32 GetTokenWithPrefix( const xmlChar* pPrefix, const xmlChar* pName );
284 /// @throws css::xml::sax::SAXException
285 OUString const & GetNamespaceURL( std::string_view rPrefix );
286 sal_Int32 GetNamespaceToken( const OUString& rNamespaceURL );
287 sal_Int32 GetTokenWithContextNamespace( sal_Int32 nNamespaceToken, const xmlChar* pName );
288 void DefineNamespace( const OString& rPrefix, const OUString& namespaceURL );
290 private:
291 std::mutex maMutex; ///< Protecting whole parseStream() execution
292 ::rtl::Reference< FastLocatorImpl > mxDocumentLocator;
293 NamespaceMap maNamespaceMap;
295 ParserData maData; /// Cached parser configuration for next call of parseStream().
297 Entity *mpTop; /// std::stack::top() is amazingly slow => cache this.
298 std::stack< Entity > maEntities; /// Entity stack for each call of parseStream().
299 std::vector<char> pendingCharacters; /// Data from characters() callback that needs to be sent.
302 } // namespace sax_fastparser
304 namespace {
306 class ParserThread: public salhelper::Thread
308 FastSaxParserImpl *mpParser;
309 public:
310 explicit ParserThread(FastSaxParserImpl *pParser): Thread("Parser"), mpParser(pParser) {}
311 private:
312 virtual void execute() override
316 mpParser->parse();
318 catch (...)
320 Entity &rEntity = mpParser->getEntity();
321 rEntity.getEvent( CallbackType::EXCEPTION );
322 mpParser->produce( true );
327 extern "C" {
329 static void call_callbackStartElement(void *userData, const xmlChar *localName , const xmlChar* prefix, const xmlChar* URI,
330 int numNamespaces, const xmlChar** namespaces, int numAttributes, int /*defaultedAttributes*/, const xmlChar **attributes)
332 FastSaxParserImpl* pFastParser = static_cast<FastSaxParserImpl*>( userData );
333 pFastParser->callbackStartElement( localName, prefix, URI, numNamespaces, namespaces, numAttributes, attributes );
336 static void call_callbackEndElement(void *userData, const xmlChar* /*localName*/, const xmlChar* /*prefix*/, const xmlChar* /*URI*/)
338 FastSaxParserImpl* pFastParser = static_cast<FastSaxParserImpl*>( userData );
339 pFastParser->callbackEndElement();
342 static void call_callbackCharacters( void *userData , const xmlChar *s , int nLen )
344 FastSaxParserImpl* pFastParser = static_cast<FastSaxParserImpl*>( userData );
345 pFastParser->callbackCharacters( s, nLen );
348 static void call_callbackProcessingInstruction( void *userData, const xmlChar *target, const xmlChar *data )
350 FastSaxParserImpl* pFastParser = static_cast<FastSaxParserImpl*>( userData );
351 pFastParser->callbackProcessingInstruction( target, data );
354 static xmlEntityPtr call_callbackGetEntity( void *userData, const xmlChar *name)
356 FastSaxParserImpl* pFastParser = static_cast<FastSaxParserImpl*>( userData );
357 return pFastParser->callbackGetEntity( name );
362 class FastLocatorImpl : public WeakImplHelper< XLocator >
364 public:
365 explicit FastLocatorImpl(FastSaxParserImpl *p) : mpParser(p) {}
367 void dispose() { mpParser = nullptr; }
368 /// @throws RuntimeException
369 void checkDispose() const { if( !mpParser ) throw DisposedException(); }
371 //XLocator
372 virtual sal_Int32 SAL_CALL getColumnNumber() override;
373 virtual sal_Int32 SAL_CALL getLineNumber() override;
374 virtual OUString SAL_CALL getPublicId() override;
375 virtual OUString SAL_CALL getSystemId() override;
377 private:
378 FastSaxParserImpl *mpParser;
381 sal_Int32 SAL_CALL FastLocatorImpl::getColumnNumber()
383 checkDispose();
384 return xmlSAX2GetColumnNumber( mpParser->getEntity().mpParser );
387 sal_Int32 SAL_CALL FastLocatorImpl::getLineNumber()
389 checkDispose();
390 return xmlSAX2GetLineNumber( mpParser->getEntity().mpParser );
393 OUString SAL_CALL FastLocatorImpl::getPublicId()
395 checkDispose();
396 return mpParser->getEntity().maStructSource.sPublicId;
399 OUString SAL_CALL FastLocatorImpl::getSystemId()
401 checkDispose();
402 return mpParser->getEntity().maStructSource.sSystemId;
405 ParserData::ParserData()
408 Entity::Entity(const ParserData& rData)
409 : ParserData(rData)
410 , mnProducedEventsSize(0)
411 , mbEnableThreads(false)
412 , mpParser(nullptr)
416 void Entity::startElement( Event const *pEvent )
418 const sal_Int32& nElementToken = pEvent->mnElementToken;
419 const OUString& aNamespace = pEvent->msNamespace;
420 const OUString& aElementName = pEvent->msElementName;
422 // Use un-wrapped pointers to avoid significant acquire/release overhead
423 XFastContextHandler *pParentContext = nullptr;
424 if( !maContextStack.empty() )
426 pParentContext = maContextStack.top().mxContext.get();
427 if( !pParentContext )
429 maContextStack.push( SaxContext(nElementToken, aNamespace, aElementName) );
430 return;
434 maContextStack.push( SaxContext( nElementToken, aNamespace, aElementName ) );
438 const Reference< XFastAttributeList > & xAttr( pEvent->mxAttributes );
439 Reference< XFastContextHandler > xContext;
441 if ( mxNamespaceHandler.is() )
443 const Sequence< xml::Attribute > NSDeclAttribs = pEvent->mxDeclAttributes->getUnknownAttributes();
444 for (const auto& rNSDeclAttrib : NSDeclAttribs)
446 mxNamespaceHandler->registerNamespace( rNSDeclAttrib.Name, rNSDeclAttrib.Value );
450 if( nElementToken == FastToken::DONTKNOW )
452 if( pParentContext )
453 xContext = pParentContext->createUnknownChildContext( aNamespace, aElementName, xAttr );
454 else if( mxDocumentHandler.is() )
455 xContext = mxDocumentHandler->createUnknownChildContext( aNamespace, aElementName, xAttr );
457 if( xContext.is() )
459 xContext->startUnknownElement( aNamespace, aElementName, xAttr );
462 else
464 if( pParentContext )
465 xContext = pParentContext->createFastChildContext( nElementToken, xAttr );
466 else if( mxDocumentHandler.is() )
467 xContext = mxDocumentHandler->createFastChildContext( nElementToken, xAttr );
469 if( xContext.is() )
470 xContext->startFastElement( nElementToken, xAttr );
472 // swap the reference we own in to avoid referencing thrash.
473 maContextStack.top().mxContext = std::move( xContext );
475 catch (...)
477 saveException( ::cppu::getCaughtException() );
481 void Entity::characters( const OUString& sChars )
483 if (maContextStack.empty())
485 // Malformed XML stream !?
486 return;
489 XFastContextHandler * pContext( maContextStack.top().mxContext.get() );
490 if( pContext ) try
492 pContext->characters( sChars );
494 catch (...)
496 saveException( ::cppu::getCaughtException() );
500 void Entity::endElement()
502 if (maContextStack.empty())
504 // Malformed XML stream !?
505 return;
508 const SaxContext& aContext = maContextStack.top();
509 XFastContextHandler* pContext( aContext.mxContext.get() );
510 if( pContext )
513 sal_Int32 nElementToken = aContext.mnElementToken;
514 if( nElementToken != FastToken::DONTKNOW )
515 pContext->endFastElement( nElementToken );
516 else
517 pContext->endUnknownElement( *aContext.moNamespace, *aContext.moElementName );
519 catch (...)
521 saveException( ::cppu::getCaughtException() );
523 maContextStack.pop();
526 void Entity::processingInstruction( const OUString& rTarget, const OUString& rData )
528 if( mxDocumentHandler.is() ) try
530 mxDocumentHandler->processingInstruction( rTarget, rData );
532 catch (...)
534 saveException( ::cppu::getCaughtException() );
538 EventList& Entity::getEventList()
540 if (!mxProducedEvents)
542 std::unique_lock aGuard(maEventProtector);
543 if (!maUsedEvents.empty())
545 mxProducedEvents = std::move(maUsedEvents.front());
546 maUsedEvents.pop();
547 aGuard.unlock(); // unlock
548 mnProducedEventsSize = 0;
550 if (!mxProducedEvents)
552 mxProducedEvents.emplace();
553 mxProducedEvents->maEvents.resize(mnEventListSize);
554 mxProducedEvents->mbIsAttributesEmpty = false;
555 mnProducedEventsSize = 0;
558 return *mxProducedEvents;
561 Event& Entity::getEvent( CallbackType aType )
563 if (!mbEnableThreads)
564 return maSharedEvent;
566 EventList& rEventList = getEventList();
567 if (mnProducedEventsSize == rEventList.maEvents.size())
569 SAL_WARN_IF(!maSavedException.hasValue(), "sax",
570 "Event vector should only exceed " << mnEventListSize <<
571 " temporarily while an exception is pending");
572 rEventList.maEvents.resize(mnProducedEventsSize + 1);
574 Event& rEvent = rEventList.maEvents[mnProducedEventsSize++];
575 rEvent.maType = aType;
576 return rEvent;
579 OUString lclGetErrorMessage( xmlParserCtxtPtr ctxt, std::u16string_view sSystemId, sal_Int32 nLine )
581 const char* pMessage;
582 xmlErrorPtr error = xmlCtxtGetLastError( ctxt );
583 if( error && error->message )
584 pMessage = error->message;
585 else
586 pMessage = "unknown error";
587 return OUString::Concat("[") + sSystemId + " line " + OUString::number(nLine) + "]: " +
588 OUString(pMessage, strlen(pMessage), RTL_TEXTENCODING_ASCII_US);
591 // throw an exception, but avoid callback if
592 // during a threaded produce
593 void Entity::throwException( const ::rtl::Reference< FastLocatorImpl > &xDocumentLocator,
594 bool mbDuringParse )
596 // Error during parsing !
597 Any savedException;
599 std::scoped_lock g(maSavedExceptionMutex);
600 if (maSavedException.hasValue())
602 savedException.setValue(&maSavedException, cppu::UnoType<decltype(maSavedException)>::get());
605 SAXParseException aExcept(
606 lclGetErrorMessage( mpParser,
607 xDocumentLocator->getSystemId(),
608 xDocumentLocator->getLineNumber() ),
609 Reference< XInterface >(),
610 savedException,
611 xDocumentLocator->getPublicId(),
612 xDocumentLocator->getSystemId(),
613 xDocumentLocator->getLineNumber(),
614 xDocumentLocator->getColumnNumber()
617 // error handler is set, it may throw the exception
618 if( !mbDuringParse || !mbEnableThreads )
620 if (mxErrorHandler.is() )
621 mxErrorHandler->fatalError( Any( aExcept ) );
624 // error handler has not thrown, but parsing must stop => throw ourselves
625 throw aExcept;
628 // In the single threaded case we emit events via our C
629 // callbacks, so any exception caught must be queued up until
630 // we can safely re-throw it from our C++ parent of parse()
632 // If multi-threaded, we need to push an EXCEPTION event, at
633 // which point we transfer ownership of maSavedException to
634 // the consuming thread.
635 void Entity::saveException( const Any & e )
637 // fdo#81214 - allow the parser to run on after an exception,
638 // unexpectedly some 'startElements' produce a UNO_QUERY_THROW
639 // for XComponent; and yet expect to continue parsing.
640 SAL_WARN("sax", "Unexpected exception from XML parser " << exceptionToString(e));
641 std::scoped_lock g(maSavedExceptionMutex);
642 if (maSavedException.hasValue())
644 SAL_INFO("sax.fastparser", "discarding exception, already have one");
646 else
648 maSavedException = e;
652 bool Entity::hasException()
654 std::scoped_lock g(maSavedExceptionMutex);
655 return maSavedException.hasValue();
658 } // namespace
660 namespace sax_fastparser {
662 FastSaxParserImpl::FastSaxParserImpl() :
663 m_bIgnoreMissingNSDecl(false),
664 m_bDisableThreadedParser(false),
665 mpTop(nullptr)
667 mxDocumentLocator.set( new FastLocatorImpl( this ) );
670 FastSaxParserImpl::~FastSaxParserImpl()
672 if( mxDocumentLocator.is() )
673 mxDocumentLocator->dispose();
674 for (auto& entity : m_TemporalEntities)
676 if (!entity)
677 continue;
678 xmlNodePtr pPtr = reinterpret_cast<xmlNodePtr>(entity);
679 xmlUnlinkNode(pPtr);
680 xmlFreeNode(pPtr);
684 void FastSaxParserImpl::DefineNamespace( const OString& rPrefix, const OUString& namespaceURL )
686 Entity& rEntity = getEntity();
687 assert(!rEntity.maNamespaceCount.empty()); // need a context!
689 sal_uInt32 nOffset = rEntity.maNamespaceCount.top()++;
690 if( rEntity.maNamespaceDefines.size() <= nOffset )
691 rEntity.maNamespaceDefines.resize( rEntity.maNamespaceDefines.size() + 64 );
693 rEntity.maNamespaceDefines[nOffset] = NamespaceDefine( rPrefix, GetNamespaceToken( namespaceURL ), namespaceURL );
696 sal_Int32 FastSaxParserImpl::GetToken(const xmlChar* pName)
698 return FastTokenHandlerBase::getTokenFromChars( getEntity(). mxTokenHandler.get(),
699 XML_CAST( pName ) ); // uses utf-8
702 sal_Int32 FastSaxParserImpl::GetTokenWithPrefix( const xmlChar* pPrefix, const xmlChar* pName )
704 Entity& rEntity = getEntity();
705 if (rEntity.maNamespaceCount.empty())
706 return FastToken::DONTKNOW;
708 std::string_view sPrefix(XML_CAST(pPrefix));
709 sal_uInt32 nNamespace = rEntity.maNamespaceCount.top();
710 while( nNamespace-- )
712 const auto & rNamespaceDefine = rEntity.maNamespaceDefines[nNamespace];
713 if( rNamespaceDefine.maPrefix == sPrefix )
714 return GetTokenWithContextNamespace(rNamespaceDefine.mnToken, pName);
717 if (!m_bIgnoreMissingNSDecl)
718 throw SAXException("No namespace defined for " + OStringToOUString(sPrefix,
719 RTL_TEXTENCODING_UTF8), {}, {});
721 return FastToken::DONTKNOW;
724 sal_Int32 FastSaxParserImpl::GetNamespaceToken( const OUString& rNamespaceURL )
726 NamespaceMap::iterator aIter( maNamespaceMap.find( rNamespaceURL ) );
727 if( aIter != maNamespaceMap.end() )
728 return (*aIter).second;
729 else
730 return FastToken::DONTKNOW;
733 OUString const & FastSaxParserImpl::GetNamespaceURL( std::string_view rPrefix )
735 Entity& rEntity = getEntity();
736 if( !rEntity.maNamespaceCount.empty() )
738 sal_uInt32 nNamespace = rEntity.maNamespaceCount.top();
739 while( nNamespace-- )
740 if( rEntity.maNamespaceDefines[nNamespace].maPrefix == rPrefix )
741 return rEntity.maNamespaceDefines[nNamespace].maNamespaceURL;
744 throw SAXException("No namespace defined for " + OUString::fromUtf8(rPrefix),
745 Reference< XInterface >(), Any());
748 sal_Int32 FastSaxParserImpl::GetTokenWithContextNamespace( sal_Int32 nNamespaceToken, const xmlChar* pName )
750 if( nNamespaceToken != FastToken::DONTKNOW )
752 sal_Int32 nNameToken = GetToken( pName );
753 if( nNameToken != FastToken::DONTKNOW )
754 return nNamespaceToken | nNameToken;
757 return FastToken::DONTKNOW;
760 namespace
762 class ParserCleanup
764 private:
765 FastSaxParserImpl& m_rParser;
766 Entity& m_rEntity;
767 rtl::Reference<ParserThread> m_xParser;
768 public:
769 ParserCleanup(FastSaxParserImpl& rParser, Entity& rEntity)
770 : m_rParser(rParser)
771 , m_rEntity(rEntity)
774 ~ParserCleanup()
776 if (m_rEntity.mpParser)
778 if (m_rEntity.mpParser->myDoc)
779 xmlFreeDoc(m_rEntity.mpParser->myDoc);
780 xmlFreeParserCtxt(m_rEntity.mpParser);
782 joinThread();
783 m_rParser.popEntity();
785 void setThread(const rtl::Reference<ParserThread> &xParser)
787 m_xParser = xParser;
789 void joinThread()
791 if (m_xParser.is())
793 rtl::Reference<ParserThread> xToJoin = m_xParser;
794 m_xParser.clear();
795 xToJoin->join();
800 /***************
802 * parseStream does Parser-startup initializations. The FastSaxParser::parse() method does
803 * the file-specific initialization work. (During a parser run, external files may be opened)
805 ****************/
806 void FastSaxParserImpl::parseStream(const InputSource& rStructSource)
808 xmlInitParser();
810 // Only one text at one time
811 std::unique_lock guard( maMutex );
813 pushEntity(maData, rStructSource);
814 Entity& rEntity = getEntity();
815 ParserCleanup aEnsureFree(*this, rEntity);
817 // start the document
818 if( rEntity.mxDocumentHandler.is() )
820 rEntity.mxDocumentHandler->setDocumentLocator( mxDocumentLocator );
821 rEntity.mxDocumentHandler->startDocument();
824 #ifdef EMSCRIPTEN
825 rEntity.mbEnableThreads = false;
826 #else
827 if (!getenv("SAX_DISABLE_THREADS") && !m_bDisableThreadedParser)
829 Reference<css::io::XSeekable> xSeekable(rEntity.maStructSource.aInputStream, UNO_QUERY);
830 // available() is not __really__ relevant here, but leave it in as a heuristic for non-seekable streams
831 rEntity.mbEnableThreads = (xSeekable.is() && xSeekable->getLength() > 10000)
832 || (rEntity.maStructSource.aInputStream->available() > 10000);
834 #endif
836 if (rEntity.mbEnableThreads)
838 rtl::Reference<ParserThread> xParser = new ParserThread(this);
839 xParser->launch();
840 aEnsureFree.setThread(xParser);
841 bool done = false;
842 do {
843 rEntity.maConsumeResume.wait();
844 rEntity.maConsumeResume.reset();
846 std::unique_lock aGuard(rEntity.maEventProtector);
847 while (!rEntity.maPendingEvents.empty())
849 if (rEntity.maPendingEvents.size() <= Entity::mnEventLowWater)
850 rEntity.maProduceResume.set(); // start producer again
852 EventList aEventList = std::move(rEntity.maPendingEvents.front());
853 rEntity.maPendingEvents.pop();
854 aGuard.unlock(); // unlock
856 if (!consume(aEventList))
857 done = true;
859 aGuard.lock(); // lock
861 if ( rEntity.maPendingEvents.size() <= Entity::mnEventLowWater )
863 aGuard.unlock();
864 for (auto& rEvent : aEventList.maEvents)
866 if (rEvent.mxAttributes.is())
868 rEvent.mxAttributes->clear();
869 if( rEntity.mxNamespaceHandler.is() )
870 rEvent.mxDeclAttributes->clear();
872 aEventList.mbIsAttributesEmpty = true;
874 aGuard.lock();
877 rEntity.maUsedEvents.push(std::move(aEventList));
879 } while (!done);
880 aEnsureFree.joinThread();
881 deleteUsedEvents();
883 // callbacks used inside XML_Parse may have caught an exception
884 // No need to lock maSavedExceptionMutex here because parser
885 // thread is joined.
886 if( rEntity.maSavedException.hasValue() )
887 rEntity.throwException( mxDocumentLocator, true );
889 else
891 parse();
894 // finish document
895 if( rEntity.mxDocumentHandler.is() )
897 rEntity.mxDocumentHandler->endDocument();
901 void FastSaxParserImpl::setFastDocumentHandler( const Reference< XFastDocumentHandler >& Handler )
903 maData.mxDocumentHandler = Handler;
906 void FastSaxParserImpl::setTokenHandler( const Reference< XFastTokenHandler >& xHandler )
908 assert( dynamic_cast< FastTokenHandlerBase *>( xHandler.get() ) && "we expect this handler to be a subclass of FastTokenHandlerBase" );
909 maData.mxTokenHandler = dynamic_cast< FastTokenHandlerBase *>( xHandler.get() );
912 void FastSaxParserImpl::registerNamespace( const OUString& NamespaceURL, sal_Int32 NamespaceToken )
914 if( NamespaceToken < FastToken::NAMESPACE )
915 throw IllegalArgumentException("Invalid namespace token " + OUString::number(NamespaceToken), css::uno::Reference<css::uno::XInterface >(), 0);
917 if( GetNamespaceToken( NamespaceURL ) == FastToken::DONTKNOW )
919 maNamespaceMap[ NamespaceURL ] = NamespaceToken;
920 return;
922 throw IllegalArgumentException("namespace URL is already registered: " + NamespaceURL, css::uno::Reference<css::uno::XInterface >(), 0);
925 OUString const & FastSaxParserImpl::getNamespaceURL( std::u16string_view rPrefix )
929 return GetNamespaceURL( OUStringToOString( rPrefix, RTL_TEXTENCODING_UTF8 ) );
931 catch (const Exception&)
934 throw IllegalArgumentException();
937 void FastSaxParserImpl::setErrorHandler(const Reference< XErrorHandler > & Handler)
939 maData.mxErrorHandler = Handler;
942 void FastSaxParserImpl::setNamespaceHandler( const Reference< XFastNamespaceHandler >& Handler )
944 maData.mxNamespaceHandler = Handler;
947 void FastSaxParserImpl::setCustomEntityNames(
948 const ::css::uno::Sequence<::css::beans::Pair<::rtl::OUString, ::rtl::OUString>>& replacements)
950 m_Replacements.resize(replacements.size());
951 for (size_t i = 0; i < replacements.size(); ++i)
953 m_Replacements[i].name = replacements[i].First;
954 m_Replacements[i].replacement = replacements[i].Second;
956 if (m_Replacements.size() > 1)
957 std::sort(m_Replacements.begin(), m_Replacements.end());
960 void FastSaxParserImpl::deleteUsedEvents()
962 Entity& rEntity = getEntity();
963 std::unique_lock aGuard(rEntity.maEventProtector);
965 while (!rEntity.maUsedEvents.empty())
967 { // the block makes sure that aEventList is destructed outside the lock
968 EventList aEventList = std::move(rEntity.maUsedEvents.front());
969 rEntity.maUsedEvents.pop();
971 aGuard.unlock(); // unlock
974 aGuard.lock(); // lock
978 void FastSaxParserImpl::produce( bool bForceFlush )
980 Entity& rEntity = getEntity();
981 if (!(bForceFlush ||
982 rEntity.mnProducedEventsSize >= Entity::mnEventListSize))
983 return;
985 std::unique_lock aGuard(rEntity.maEventProtector);
987 while (rEntity.maPendingEvents.size() >= Entity::mnEventHighWater)
988 { // pause parsing for a bit
989 aGuard.unlock(); // unlock
990 rEntity.maProduceResume.wait();
991 rEntity.maProduceResume.reset();
992 aGuard.lock(); // lock
995 rEntity.maPendingEvents.push(std::move(*rEntity.mxProducedEvents));
996 rEntity.mxProducedEvents.reset();
997 assert(!rEntity.mxProducedEvents);
999 aGuard.unlock(); // unlock
1001 rEntity.maConsumeResume.set();
1004 bool FastSaxParserImpl::consume(EventList& rEventList)
1006 Entity& rEntity = getEntity();
1007 rEventList.mbIsAttributesEmpty = false;
1008 for (auto& rEvent : rEventList.maEvents)
1010 switch (rEvent.maType)
1012 case CallbackType::START_ELEMENT:
1013 rEntity.startElement( &rEvent );
1014 break;
1015 case CallbackType::END_ELEMENT:
1016 rEntity.endElement();
1017 break;
1018 case CallbackType::CHARACTERS:
1019 rEntity.characters( rEvent.msChars );
1020 break;
1021 case CallbackType::PROCESSING_INSTRUCTION:
1022 rEntity.processingInstruction(
1023 rEvent.msNamespace, rEvent.msElementName ); // ( target, data )
1024 break;
1025 case CallbackType::DONE:
1026 return false;
1027 case CallbackType::EXCEPTION:
1028 rEntity.throwException( mxDocumentLocator, false );
1029 [[fallthrough]]; // avoid unreachable code warning with some compilers
1030 default:
1031 assert(false);
1032 return false;
1035 return true;
1038 void FastSaxParserImpl::pushEntity(const ParserData& rEntityData,
1039 xml::sax::InputSource const& rSource)
1041 if (!rSource.aInputStream.is())
1042 throw SAXException("No input source", Reference<XInterface>(), Any());
1044 maEntities.emplace(rEntityData);
1045 mpTop = &maEntities.top();
1047 mpTop->maStructSource = rSource;
1049 mpTop->maConverter.setInputStream(mpTop->maStructSource.aInputStream);
1050 if (!mpTop->maStructSource.sEncoding.isEmpty())
1052 mpTop->maConverter.setEncoding(OUStringToOString(mpTop->maStructSource.sEncoding, RTL_TEXTENCODING_ASCII_US));
1056 void FastSaxParserImpl::popEntity()
1058 maEntities.pop();
1059 mpTop = !maEntities.empty() ? &maEntities.top() : nullptr;
1062 // starts parsing with actual parser !
1063 void FastSaxParserImpl::parse()
1065 const int BUFFER_SIZE = 16 * 1024;
1066 Sequence< sal_Int8 > seqOut( BUFFER_SIZE );
1068 Entity& rEntity = getEntity();
1070 // set all necessary C-Callbacks
1071 static xmlSAXHandler callbacks;
1072 callbacks.startElementNs = call_callbackStartElement;
1073 callbacks.endElementNs = call_callbackEndElement;
1074 callbacks.characters = call_callbackCharacters;
1075 callbacks.processingInstruction = call_callbackProcessingInstruction;
1076 callbacks.getEntity = call_callbackGetEntity;
1077 callbacks.initialized = XML_SAX2_MAGIC;
1078 int nRead = 0;
1081 nRead = rEntity.maConverter.readAndConvert( seqOut, BUFFER_SIZE );
1082 if( nRead <= 0 )
1084 if( rEntity.mpParser != nullptr )
1086 if( xmlParseChunk( rEntity.mpParser, reinterpret_cast<const char*>(seqOut.getConstArray()), 0, 1 ) != XML_ERR_OK )
1087 rEntity.throwException( mxDocumentLocator, true );
1088 if (rEntity.hasException())
1089 rEntity.throwException(mxDocumentLocator, true);
1091 break;
1094 bool bContinue = true;
1095 if( rEntity.mpParser == nullptr )
1097 // create parser with proper encoding (needs the first chunk of data)
1098 rEntity.mpParser = xmlCreatePushParserCtxt( &callbacks, this,
1099 reinterpret_cast<const char*>(seqOut.getConstArray()), nRead, nullptr );
1100 if( !rEntity.mpParser )
1101 throw SAXException("Couldn't create parser", Reference< XInterface >(), Any() );
1103 // Tell libxml2 parser to decode entities in attribute values.
1104 // Also allow XML attribute values which are larger than 10MB, because this used to work
1105 // with expat.
1106 // coverity[unsafe_xml_parse_config] - entity support is required
1107 xmlCtxtUseOptions(rEntity.mpParser, XML_PARSE_NOENT | XML_PARSE_HUGE);
1109 else
1111 bContinue = xmlParseChunk( rEntity.mpParser, reinterpret_cast<const char*>(seqOut.getConstArray()), nRead, 0 )
1112 == XML_ERR_OK;
1115 // callbacks used inside XML_Parse may have caught an exception
1116 if (!bContinue)
1118 rEntity.throwException( mxDocumentLocator, true );
1120 if (rEntity.hasException())
1122 rEntity.throwException( mxDocumentLocator, true );
1124 } while( nRead > 0 );
1125 rEntity.getEvent( CallbackType::DONE );
1126 if( rEntity.mbEnableThreads )
1127 produce( true );
1130 // The C-Callbacks
// Start-of-element handler (presumably reached through call_callbackStartElement,
// which parse() registers as the libxml2 startElementNs callback).
//
// Steps, in order:
//   1. flush any buffered character data,
//   2. push a namespace-count entry (defining the "xml" prefix for the first element),
//   3. fetch a START_ELEMENT event and prepare its attribute list(s),
//   4. process namespace declarations, then attributes,
//   5. resolve the element token, or fall back to namespace + qualified name,
//   6. push the element's namespace on the namespace stack,
//   7. hand the event to produce() when threading is enabled, otherwise call
//      rEntity.startElement() directly.
// The catch (...) at the end stores any exception on the entity via saveException().
//
// localName / prefix / URI  element name parts; prefix and URI are checked for nullptr below
// numNamespaces, namespaces  namespace declarations as ( prefix / URI ) pairs
// numAttributes, attributes  attributes as ( localname / prefix / nsURI / valueBegin / valueEnd )
1131 void FastSaxParserImpl::callbackStartElement(const xmlChar *localName , const xmlChar* prefix, const xmlChar* URI,
1132 int numNamespaces, const xmlChar** namespaces, int numAttributes, const xmlChar **attributes)
// Character data collected so far belongs before this element, so send it first.
1134 if (!pendingCharacters.empty())
1135 sendPendingCharacters();
1136 Entity& rEntity = getEntity();
// First element of the entity: start counting at 0 and predefine the "xml" prefix.
1137 if( rEntity.maNamespaceCount.empty() )
1139 rEntity.maNamespaceCount.push(0);
1140 DefineNamespace( "xml", "http://www.w3.org/XML/1998/namespace");
1142 else
// Nested element: start from the enclosing element's count.
1144 rEntity.maNamespaceCount.push( rEntity.maNamespaceCount.top() );
1147 // create attribute map and process namespace instructions
1148 Event& rEvent = rEntity.getEvent( CallbackType::START_ELEMENT );
// bIsAttributesEmpty is only read from the event list in threaded mode; when it is
// set, an existing attribute list is reused without calling clear().
1149 bool bIsAttributesEmpty = false;
1150 if ( rEntity.mbEnableThreads )
1151 bIsAttributesEmpty = rEntity.getEventList().mbIsAttributesEmpty;
1153 if (rEvent.mxAttributes.is())
1155 if( !bIsAttributesEmpty )
1156 rEvent.mxAttributes->clear();
1158 else
1159 rEvent.mxAttributes.set(
1160 new FastAttributeList( rEntity.mxTokenHandler.get() ) );
// The declaration-attribute list is only maintained when a namespace handler is set.
1162 if( rEntity.mxNamespaceHandler.is() )
1164 if (rEvent.mxDeclAttributes.is())
1166 if( !bIsAttributesEmpty )
1167 rEvent.mxDeclAttributes->clear();
1169 else
1170 rEvent.mxDeclAttributes.set(
1171 new FastAttributeList( rEntity.mxTokenHandler.get() ) );
// Start from the namespace of the enclosing element, if there is one.
1174 OUString sNamespace;
1175 sal_Int32 nNamespaceToken = FastToken::DONTKNOW;
1176 if (!rEntity.maNamespaceStack.empty())
1178 sNamespace = rEntity.maNamespaceStack.top().msName;
1179 nNamespaceToken = rEntity.maNamespaceStack.top().mnToken;
1184 /* #158414# Each element may define new namespaces, also for attributes.
1185 First, process all namespaces, second, process the attributes after namespaces
1186 have been initialized. */
1188 // #158414# first: get namespaces
1189 for (int i = 0; i < numNamespaces * 2; i += 2)
1191 // namespaces[] is (prefix/URI)
1192 if( namespaces[ i ] != nullptr )
1194 OString aPrefix( XML_CAST( namespaces[ i ] ));
1195 OUString namespaceURL( XML_CAST( namespaces[ i + 1 ] ), strlen( XML_CAST( namespaces[ i + 1 ] )), RTL_TEXTENCODING_UTF8 );
1196 NormalizeURI( namespaceURL );
1197 DefineNamespace(aPrefix, namespaceURL);
// The namespace handler receives the declaration exactly as written (not normalized).
1198 if( rEntity.mxNamespaceHandler.is() )
1199 rEvent.mxDeclAttributes->addUnknown( OString( XML_CAST( namespaces[ i ] ) ), OString( XML_CAST( namespaces[ i + 1 ] ) ) );
1201 else
1203 // default namespace
1204 sNamespace = OUString( XML_CAST( namespaces[ i + 1 ] ), strlen( XML_CAST( namespaces[ i + 1 ] )), RTL_TEXTENCODING_UTF8 );
1205 NormalizeURI( sNamespace );
1206 nNamespaceToken = GetNamespaceToken( sNamespace );
1207 if( rEntity.mxNamespaceHandler.is() )
1208 rEvent.mxDeclAttributes->addUnknown( "", OString( XML_CAST( namespaces[ i + 1 ] ) ) );
// With a token handler, attributes and the element are mapped to tokens where possible.
1212 if ( rEntity.mxTokenHandler.is() )
1214 // #158414# second: fill attribute list with other attributes
1215 rEvent.mxAttributes->reserve( numAttributes );
1216 for (int i = 0; i < numAttributes * 5; i += 5)
1218 // attributes[] is ( localname / prefix / nsURI / valueBegin / valueEnd )
1219 if( attributes[ i + 1 ] != nullptr )
1221 sal_Int32 nAttributeToken = GetTokenWithPrefix(attributes[ i + 1 ], attributes[ i ]);
1222 if( nAttributeToken != FastToken::DONTKNOW )
1223 rEvent.mxAttributes->add( nAttributeToken, std::string_view(XML_CAST( attributes[ i + 3 ] ), attributes[ i + 4 ] - attributes[ i + 3 ]) );
1224 else
1225 addUnknownElementWithPrefix(attributes, i, rEvent.mxAttributes);
1227 else
1229 sal_Int32 nAttributeToken = GetToken(attributes[ i ]);
1230 if( nAttributeToken != FastToken::DONTKNOW )
1231 rEvent.mxAttributes->add( nAttributeToken, std::string_view(XML_CAST( attributes[ i + 3 ] ), attributes[ i + 4 ] - attributes[ i + 3 ]) );
1232 else
1234 SAL_WARN("xmloff", "unknown attribute " << XML_CAST( attributes[ i ] ) << "=" <<
1235 OString( XML_CAST( attributes[ i + 3 ] ), attributes[ i + 4 ] - attributes[ i + 3 ] ));
1236 rEvent.mxAttributes->addUnknown( XML_CAST( attributes[ i ] ),
1237 OString( XML_CAST( attributes[ i + 3 ] ), attributes[ i + 4 ] - attributes[ i + 3 ] ));
// Element token: explicit prefix first, then the context (default) namespace, then the bare name.
1242 if( prefix != nullptr )
1243 rEvent.mnElementToken = GetTokenWithPrefix(prefix, localName);
1244 else if( !sNamespace.isEmpty() )
1245 rEvent.mnElementToken = GetTokenWithContextNamespace(nNamespaceToken, localName);
1246 else
1247 rEvent.mnElementToken = GetToken(localName);
1249 else
// Without a token handler every attribute is stored as unknown and the element has no token.
1251 for (int i = 0; i < numAttributes * 5; i += 5)
1253 if( attributes[ i + 1 ] != nullptr )
1254 addUnknownElementWithPrefix(attributes, i, rEvent.mxAttributes);
1255 else
1256 rEvent.mxAttributes->addUnknown( XML_CAST( attributes[ i ] ),
1257 OString( XML_CAST( attributes[ i + 3 ] ), attributes[ i + 4 ] - attributes[ i + 3 ] ));
1260 rEvent.mnElementToken = FastToken::DONTKNOW;
// Unknown element: record namespace and (possibly prefixed) name as strings instead.
1263 if( rEvent.mnElementToken == FastToken::DONTKNOW )
1265 OUString aElementPrefix;
1266 if( prefix != nullptr )
1268 aElementPrefix = OUString( XML_CAST( prefix ), strlen( XML_CAST( prefix )), RTL_TEXTENCODING_UTF8 );
// A prefixed element without a URI is an error unless m_bIgnoreMissingNSDecl is set.
1269 if ( URI != nullptr )
1270 sNamespace = OUString( XML_CAST( URI ), strlen( XML_CAST( URI )), RTL_TEXTENCODING_UTF8 );
1271 else if ( m_bIgnoreMissingNSDecl )
1272 sNamespace.clear();
1273 else
1274 throw SAXException("No namespace defined for " + aElementPrefix, {}, {});
1275 nNamespaceToken = GetNamespaceToken( sNamespace );
1277 OUString aElementLocalName( XML_CAST( localName ), strlen( XML_CAST( localName )), RTL_TEXTENCODING_UTF8 );
1278 rEvent.msNamespace = sNamespace;
1279 if( aElementPrefix.isEmpty() )
1280 rEvent.msElementName = std::move(aElementLocalName);
1281 else
1282 rEvent.msElementName = aElementPrefix + ":" + aElementLocalName;
1284 else // token is always preferred.
1285 rEvent.msElementName.clear();
// Remember this element's namespace; callbackEndElement() pops it again.
1287 rEntity.maNamespaceStack.push( NameWithToken(sNamespace, nNamespaceToken) );
1288 if (rEntity.mbEnableThreads)
1289 produce();
1290 else
1292 SAL_INFO("sax.fastparser", " startElement line " << mxDocumentLocator->getLineNumber() << " column " << mxDocumentLocator->getColumnNumber() << " " << ( prefix ? XML_CAST(prefix) : "(null)" ) << ":" << localName);
1293 rEntity.startElement( &rEvent );
// Exceptions are not propagated from here; they are stored on the entity
// (parse() checks hasException() after each chunk).
1296 catch (...)
1298 rEntity.saveException( ::cppu::getCaughtException() );
1302 void FastSaxParserImpl::addUnknownElementWithPrefix(const xmlChar **attributes, int i, rtl::Reference< FastAttributeList > const & xAttributes)
1304 OUString aNamespaceURI;
1305 if ( !m_bIgnoreMissingNSDecl || attributes[i + 2] != nullptr )
1306 aNamespaceURI = OUString( XML_CAST( attributes[ i + 2 ] ), strlen( XML_CAST( attributes[ i + 2 ] )), RTL_TEXTENCODING_UTF8 );
1307 const OString& rPrefix = OString( XML_CAST( attributes[ i + 1 ] ));
1308 const OString& rLocalName = OString( XML_CAST( attributes[ i ] ));
1309 OString aQualifiedName = (rPrefix.isEmpty())? rLocalName : rPrefix + ":" + rLocalName;
1310 xAttributes->addUnknown( aNamespaceURI, aQualifiedName,
1311 OString( XML_CAST( attributes[ i + 3 ] ), attributes[ i + 4 ] - attributes[ i + 3 ] ));
1312 SAL_INFO("xmloff", "unknown element " << aQualifiedName << " " << aNamespaceURI);
1315 void FastSaxParserImpl::callbackEndElement()
1317 if (!pendingCharacters.empty())
1318 sendPendingCharacters();
1319 Entity& rEntity = getEntity();
1320 SAL_WARN_IF(rEntity.maNamespaceCount.empty(), "sax", "Empty NamespaceCount");
1321 if( !rEntity.maNamespaceCount.empty() )
1322 rEntity.maNamespaceCount.pop();
1324 SAL_WARN_IF(rEntity.maNamespaceStack.empty(), "sax", "Empty NamespaceStack");
1325 if( !rEntity.maNamespaceStack.empty() )
1326 rEntity.maNamespaceStack.pop();
1328 rEntity.getEvent( CallbackType::END_ELEMENT );
1329 if (rEntity.mbEnableThreads)
1330 produce();
1331 else
1332 rEntity.endElement();
1335 void FastSaxParserImpl::callbackCharacters( const xmlChar* s, int nLen )
1337 // SAX interface allows that the characters callback splits content of one XML node
1338 // (e.g. because there's an entity that needs decoding), however for consumers it's
1339 // simpler FastSaxParser's character callback provides the whole string at once,
1340 // so merge data from possible multiple calls and send them at once (before the element
1341 // ends or another one starts).
1343 // We use a std::vector<char> to avoid calling into the OUString constructor more than once when
1344 // we have multiple callbackCharacters() calls that we have to merge, which happens surprisingly
1345 // often in writer documents.
1346 int nOriginalLen = pendingCharacters.size();
1347 pendingCharacters.resize(nOriginalLen + nLen);
1348 memcpy(pendingCharacters.data() + nOriginalLen, s, nLen);
1351 void FastSaxParserImpl::sendPendingCharacters()
1353 Entity& rEntity = getEntity();
1354 OUString sChars( pendingCharacters.data(), pendingCharacters.size(), RTL_TEXTENCODING_UTF8 );
1355 if (rEntity.mbEnableThreads)
1357 Event& rEvent = rEntity.getEvent( CallbackType::CHARACTERS );
1358 rEvent.msChars = std::move(sChars);
1359 produce();
1361 else
1362 rEntity.characters( sChars );
1363 pendingCharacters.resize(0);
1366 void FastSaxParserImpl::callbackProcessingInstruction( const xmlChar *target, const xmlChar *data )
1368 if (!pendingCharacters.empty())
1369 sendPendingCharacters();
1370 Entity& rEntity = getEntity();
1371 Event& rEvent = rEntity.getEvent( CallbackType::PROCESSING_INSTRUCTION );
1373 // This event is very rare, so no need to waste extra space for this
1374 // Using namespace and element strings to be target and data in that order.
1375 rEvent.msNamespace = OUString( XML_CAST( target ), strlen( XML_CAST( target ) ), RTL_TEXTENCODING_UTF8 );
1376 if ( data != nullptr )
1377 rEvent.msElementName = OUString( XML_CAST( data ), strlen( XML_CAST( data ) ), RTL_TEXTENCODING_UTF8 );
1378 else
1379 rEvent.msElementName.clear();
1381 if (rEntity.mbEnableThreads)
1382 produce();
1383 else
1384 rEntity.processingInstruction( rEvent.msNamespace, rEvent.msElementName );
1387 xmlEntityPtr FastSaxParserImpl::callbackGetEntity( const xmlChar *name )
1389 if( !name )
1390 return xmlGetPredefinedEntity(name);
1391 const char* dname = XML_CAST(name);
1392 int lname = strlen(dname);
1393 if( lname == 0 )
1394 return xmlGetPredefinedEntity(name);
1395 if (m_Replacements.size() > 0)
1397 auto it = std::lower_bound(m_Replacements.begin(), m_Replacements.end(), dname);
1398 if (it != m_Replacements.end() && it->name.compareToAscii(dname) == 0)
1400 xmlEntityPtr entpt = xmlNewEntity(
1401 nullptr, name, XML_INTERNAL_GENERAL_ENTITY, nullptr, nullptr,
1402 BAD_CAST(OUStringToOString(it->replacement, RTL_TEXTENCODING_UTF8).getStr()));
1403 m_TemporalEntities.push_back(entpt);
1404 return entpt;
1407 if( lname < 2 )
1408 return xmlGetPredefinedEntity(name);
1409 if ( dname[0] == '#' )
1411 sal_uInt32 cval = 0;
1412 if( dname[1] == 'x' || dname[1] == 'X' )
1414 if( lname < 3 )
1415 return xmlGetPredefinedEntity(name);
1416 cval = static_cast<sal_uInt32>( strtoul( dname + 2, nullptr, 16 ) );
1417 if( cval == 0 )
1418 return xmlGetPredefinedEntity(name);
1419 OUString vname( &cval, 1 );
1420 xmlEntityPtr entpt
1421 = xmlNewEntity(nullptr, name, XML_INTERNAL_GENERAL_ENTITY, nullptr, nullptr,
1422 BAD_CAST(OUStringToOString(vname, RTL_TEXTENCODING_UTF8).getStr()));
1423 m_TemporalEntities.push_back(entpt);
1424 return entpt;
1426 else
1428 cval = static_cast<sal_uInt32>( strtoul( dname + 2, nullptr, 10 ) );
1429 if( cval == 0 )
1430 return xmlGetPredefinedEntity(name);
1431 OUString vname(&cval, 1);
1432 xmlEntityPtr entpt
1433 = xmlNewEntity(nullptr, name, XML_INTERNAL_GENERAL_ENTITY, nullptr, nullptr,
1434 BAD_CAST(OUStringToOString(vname, RTL_TEXTENCODING_UTF8).getStr()));
1435 m_TemporalEntities.push_back(entpt);
1436 return entpt;
1439 return xmlGetPredefinedEntity(name);
1442 FastSaxParser::FastSaxParser() : mpImpl(new FastSaxParserImpl) {}
1444 FastSaxParser::~FastSaxParser()
1448 void SAL_CALL
1449 FastSaxParser::initialize(css::uno::Sequence< css::uno::Any > const& rArguments)
1451 if (!rArguments.hasElements())
1452 return;
1454 OUString str;
1455 if ( !(rArguments[0] >>= str) )
1456 throw IllegalArgumentException();
1458 if ( str == "IgnoreMissingNSDecl" )
1459 mpImpl->m_bIgnoreMissingNSDecl = true;
1460 else if ( str == "DoSmeplease" )
1461 ; //just ignore as this is already immune to billion laughs
1462 else if ( str == "DisableThreadedParser" )
1463 mpImpl->m_bDisableThreadedParser = true;
1464 else
1465 throw IllegalArgumentException();
1469 void FastSaxParser::parseStream( const xml::sax::InputSource& aInputSource )
1471 mpImpl->parseStream(aInputSource);
1474 void FastSaxParser::setFastDocumentHandler( const uno::Reference<xml::sax::XFastDocumentHandler>& Handler )
1476 mpImpl->setFastDocumentHandler(Handler);
1479 void FastSaxParser::setTokenHandler( const uno::Reference<xml::sax::XFastTokenHandler>& Handler )
1481 mpImpl->setTokenHandler(Handler);
1484 void FastSaxParser::registerNamespace( const OUString& NamespaceURL, sal_Int32 NamespaceToken )
1486 mpImpl->registerNamespace(NamespaceURL, NamespaceToken);
1489 OUString FastSaxParser::getNamespaceURL( const OUString& rPrefix )
1491 return mpImpl->getNamespaceURL(rPrefix);
1494 void FastSaxParser::setErrorHandler( const uno::Reference< xml::sax::XErrorHandler >& Handler )
1496 mpImpl->setErrorHandler(Handler);
1499 void FastSaxParser::setEntityResolver( const uno::Reference< xml::sax::XEntityResolver >& )
1501 // not implemented
1504 void FastSaxParser::setLocale( const lang::Locale& )
1506 // not implemented
1509 void FastSaxParser::setNamespaceHandler( const uno::Reference< css::xml::sax::XFastNamespaceHandler >& Handler)
1511 mpImpl->setNamespaceHandler(Handler);
1514 OUString FastSaxParser::getImplementationName()
1516 return "com.sun.star.comp.extensions.xml.sax.FastParser";
1519 void FastSaxParser::setCustomEntityNames(
1520 const ::css::uno::Sequence<::css::beans::Pair<::rtl::OUString, ::rtl::OUString>>& replacements)
1522 mpImpl->setCustomEntityNames(replacements);
1525 sal_Bool FastSaxParser::supportsService( const OUString& ServiceName )
1527 return cppu::supportsService(this, ServiceName);
1530 uno::Sequence<OUString> FastSaxParser::getSupportedServiceNames()
1532 return { "com.sun.star.xml.sax.FastParser" };
1535 } // namespace sax_fastparser
1537 extern "C" SAL_DLLPUBLIC_EXPORT css::uno::XInterface *
1538 com_sun_star_comp_extensions_xml_sax_FastParser_get_implementation(
1539 css::uno::XComponentContext *,
1540 css::uno::Sequence<css::uno::Any> const &)
1542 return cppu::acquire(new FastSaxParser);
1545 // ----------------------------------------------------------
1546 // copy of the code in xmloff/source/core/namespace.cxx, which adds namespace aliases
1547 // for various dodgy namespace decls in the wild.
1549 static bool NormalizeW3URI( OUString& rName );
1550 static bool NormalizeOasisURN( OUString& rName );
1552 static void NormalizeURI( OUString& rName )
1554 // try OASIS + W3 URI normalization
1555 bool bSuccess = NormalizeOasisURN( rName );
1556 if( ! bSuccess )
1557 NormalizeW3URI( rName );
// String constants used by NormalizeW3URI() and NormalizeOasisURN() below.
// XML_URI_W3_PREFIX / XML_URI_XFORMS_SUFFIX: prefix and suffix tested by NormalizeW3URI();
// XML_N_XFORMS_1_0 is the value it writes on a match.
1560 constexpr OUStringLiteral XML_URI_W3_PREFIX(u"http://www.w3.org/");
1561 constexpr OUStringLiteral XML_URI_XFORMS_SUFFIX(u"/xforms");
1562 constexpr OUStringLiteral XML_N_XFORMS_1_0(u"http://www.w3.org/2002/xforms");
// W3 namespaces and the "-compatible" OASIS URNs that NormalizeOasisURN() maps them to.
1563 constexpr OUStringLiteral XML_N_SVG(u"http://www.w3.org/2000/svg");
1564 constexpr OUStringLiteral XML_N_SVG_COMPAT(u"urn:oasis:names:tc:opendocument:xmlns:svg-compatible:1.0");
1565 constexpr OUStringLiteral XML_N_FO(u"http://www.w3.org/1999/XSL/Format");
1566 constexpr OUStringLiteral XML_N_FO_COMPAT(u"urn:oasis:names:tc:opendocument:xmlns:xsl-fo-compatible:1.0");
// Both SMIL spellings (with and without trailing slash) map to XML_N_SMIL_COMPAT.
1567 constexpr OUStringLiteral XML_N_SMIL(u"http://www.w3.org/2001/SMIL20/");
1568 constexpr OUStringLiteral XML_N_SMIL_OLD(u"http://www.w3.org/2001/SMIL20");
1569 constexpr OUStringLiteral XML_N_SMIL_COMPAT(u"urn:oasis:names:tc:opendocument:xmlns:smil-compatible:1.0");
// Building blocks of the generic OASIS URN pattern handled by NormalizeOasisURN().
1570 constexpr OUStringLiteral XML_URN_OASIS_NAMES_TC(u"urn:oasis:names:tc");
1571 constexpr OUStringLiteral XML_XMLNS(u"xmlns");
1572 constexpr OUStringLiteral XML_OPENDOCUMENT(u"opendocument");
1573 constexpr OUStringLiteral XML_1_0(u"1.0");
1575 static bool NormalizeW3URI( OUString& rName )
1577 // check if URI matches:
1578 // http://www.w3.org/[0-9]*/[:letter:]*
1579 // (year)/(WG name)
1580 // For the following WG/standards names:
1581 // - xforms
1583 bool bSuccess = false;
1584 const OUString& sURIPrefix = XML_URI_W3_PREFIX;
1585 if( rName.startsWith( sURIPrefix ) )
1587 const OUString& sURISuffix = XML_URI_XFORMS_SUFFIX ;
1588 sal_Int32 nCompareFrom = rName.getLength() - sURISuffix.getLength();
1589 if( rName.subView( nCompareFrom ) == sURISuffix )
1591 // found W3 prefix, and xforms suffix
1592 rName = XML_N_XFORMS_1_0;
1593 bSuccess = true;
1596 return bSuccess;
1599 static bool NormalizeOasisURN( OUString& rName )
1601 // #i38644#
1602 // we exported the wrong namespace for smil, so we correct this here on load
1603 // for older documents
1604 if( rName == XML_N_SVG )
1606 rName = XML_N_SVG_COMPAT;
1607 return true;
1609 else if( rName == XML_N_FO )
1611 rName = XML_N_FO_COMPAT;
1612 return true;
1614 else if( rName == XML_N_SMIL || rName == XML_N_SMIL_OLD )
1616 rName = XML_N_SMIL_COMPAT;
1617 return true;
1621 // Check if URN matches
1622 // :urn:oasis:names:tc:[^:]*:xmlns:[^:]*:1.[^:]*
1623 // |---| |---| |-----|
1624 // TC-Id Sub-Id Version
1626 sal_Int32 nNameLen = rName.getLength();
1627 // :urn:oasis:names:tc.*
1628 const OUString& rOasisURN = XML_URN_OASIS_NAMES_TC;
1629 if( !rName.startsWith( rOasisURN ) )
1630 return false;
1632 // :urn:oasis:names:tc:.*
1633 sal_Int32 nPos = rOasisURN.getLength();
1634 if( nPos >= nNameLen || rName[nPos] != ':' )
1635 return false;
1637 // :urn:oasis:names:tc:[^:]:.*
1638 sal_Int32 nTCIdStart = nPos+1;
1639 sal_Int32 nTCIdEnd = rName.indexOf( ':', nTCIdStart );
1640 if( -1 == nTCIdEnd )
1641 return false;
1643 // :urn:oasis:names:tc:[^:]:xmlns.*
1644 nPos = nTCIdEnd + 1;
1645 std::u16string_view sTmp( rName.subView( nPos ) );
1646 const OUString& rXMLNS = XML_XMLNS;
1647 if( !o3tl::starts_with(sTmp, rXMLNS ) )
1648 return false;
1650 // :urn:oasis:names:tc:[^:]:xmlns:.*
1651 nPos += rXMLNS.getLength();
1652 if( nPos >= nNameLen || rName[nPos] != ':' )
1653 return false;
1655 // :urn:oasis:names:tc:[^:]:xmlns:[^:]*:.*
1656 nPos = rName.indexOf( ':', nPos+1 );
1657 if( -1 == nPos )
1658 return false;
1660 // :urn:oasis:names:tc:[^:]:xmlns:[^:]*:[^:][^:][^:][^:]*
1661 sal_Int32 nVersionStart = nPos+1;
1662 if( nVersionStart+2 >= nNameLen ||
1663 -1 != rName.indexOf( ':', nVersionStart ) )
1664 return false;
1666 // :urn:oasis:names:tc:[^:]:xmlns:[^:]*:1\.[^:][^:]*
1667 if( rName[nVersionStart] != '1' || rName[nVersionStart+1] != '.' )
1668 return false;
1670 // replace [tcid] with current TCID and version with current version.
1672 rName = rName.subView( 0, nTCIdStart ) +
1673 XML_OPENDOCUMENT +
1674 rName.subView( nTCIdEnd, nVersionStart-nTCIdEnd ) +
1675 XML_1_0;
1677 return true;
1681 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */