1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
3 * This file is part of the LibreOffice project.
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 * This file incorporates work covered by the following license notice:
11 * Licensed to the Apache Software Foundation (ASF) under one or more
12 * contributor license agreements. See the NOTICE file distributed
13 * with this work for additional information regarding copyright
14 * ownership. The ASF licenses this file to you under the Apache
15 * License, Version 2.0 (the "License"); you may not use this file
16 * except in compliance with the License. You may obtain a copy of
17 * the License at http://www.apache.org/licenses/LICENSE-2.0 .
20 #include <sal/config.h>
25 #include <com/sun/star/container/NoSuchElementException.hpp>
26 #include <com/sun/star/uno/RuntimeException.hpp>
28 #include <rtl/character.hxx>
29 #include <rtl/string.h>
30 #include <rtl/ustring.hxx>
31 #include <sal/log.hxx>
32 #include <sal/types.h>
34 #include <xmlreader/pad.hxx>
35 #include <xmlreader/span.hxx>
36 #include <xmlreader/xmlreader.hxx>
// Character predicate used by the reader: skipSpace() loops on isSpace(peek())
// and handleAttributeValue() uses it to trim both ends of a value.
// NOTE(review): the body is not visible in this excerpt, so the exact set of
// characters treated as space cannot be confirmed from here.
42 bool isSpace(char c
) {
// Constructs a reader over the file named by fileUrl (moved into fileUrl_).
// Visible steps:
//  - open the file with osl_openFile using osl_File_OpenFlag_Read; the
//    osl_File_E_NOENT case throws css::container::NoSuchElementException, and a
//    css::uno::RuntimeException (cannot open ...) carrying the numeric error
//    code is thrown from the same error handling;
//  - query the size with osl_getFileSize and, on success, map the contents
//    (the call that receives osl_File_MapFlag_WillNeed is not visible in this
//    excerpt; presumably osl_mapFile, TODO confirm);
//  - on failure close the handle again (a failing close only feeds a diagnostic
//    message) and throw a css::uno::RuntimeException (cannot mmap ...);
//  - initialise the namespace tables and the parse state.
56 XmlReader::XmlReader(OUString fileUrl
)
57 : fileUrl_(std::move(fileUrl
))
58 , fileHandle_(nullptr)
60 oslFileError e
= osl_openFile(
61 fileUrl_
.pData
, &fileHandle_
, osl_File_OpenFlag_Read
);
66 case osl_File_E_NOENT
:
67 throw css::container::NoSuchElementException( fileUrl_
);
69 throw css::uno::RuntimeException(
70 "cannot open " + fileUrl_
+ ": " + OUString::number(e
));
72 e
= osl_getFileSize(fileHandle_
, &fileSize_
);
73 if (e
== osl_File_E_None
) {
75 fileHandle_
, &fileAddress_
, fileSize_
, 0,
76 osl_File_MapFlag_WillNeed
);
78 if (e
!= osl_File_E_None
) {
79 oslFileError e2
= osl_closeFile(fileHandle_
);
80 if (e2
!= osl_File_E_None
) {
83 "osl_closeFile of \"" << fileUrl_
<< "\" failed with " << +e2
);
85 throw css::uno::RuntimeException(
86 "cannot mmap " + fileUrl_
+ " (" + OUString::number(e
) + ")" );
// Pre-register the xml prefix together with the XML namespace IRI.
88 namespaceIris_
.emplace_back("http://www.w3.org/XML/1998/namespace");
89 namespaces_
.emplace_back(Span("xml"), NAMESPACE_XML
);
// The parse window is the mapped byte range [pos_, end_).
90 pos_
= static_cast< char * >(fileAddress_
);
91 end_
= pos_
+ fileSize_
;
92 state_
= State::Content
;
93 firstAttribute_
= true;
// Releases the file: unmaps the contents with osl_unmapMappedFile and then
// closes the handle with osl_closeFile. In the visible code a failure of either
// call only feeds a diagnostic message; no throw appears in this fragment.
// NOTE(review): any guard around these calls is not visible in this excerpt.
96 XmlReader::~XmlReader() {
99 oslFileError e
= osl_unmapMappedFile(fileHandle_
, fileAddress_
, fileSize_
);
100 if (e
!= osl_File_E_None
) {
103 "osl_unmapMappedFile of \"" << fileUrl_
<< "\" failed with " << +e
);
105 e
= osl_closeFile(fileHandle_
);
106 if (e
!= osl_File_E_None
) {
109 "osl_closeFile of \"" << fileUrl_
<< "\" failed with " << +e
);
// Appends iri to namespaceIris_. Its id is derived (via toNamespaceId) from the
// size of the list before the append, i.e. the index the new entry receives.
// For the XMLSchema-instance IRI the prefix xsi is additionally bound to that
// id; the existing comment inside the function explains why.
// NOTE(review): the return statement is not visible in this excerpt;
// presumably the new id is returned.
113 int XmlReader::registerNamespaceIri(Span
const & iri
) {
114 int id
= toNamespaceId(namespaceIris_
.size());
115 namespaceIris_
.push_back(iri
);
116 if (iri
== "http://www.w3.org/2001/XMLSchema-instance") {
117 // Old user layer .xcu files used the xsi namespace prefix without
118 // declaring a corresponding namespace binding, see issue 77174; reading
119 // those files during migration would fail without this hack that can be
120 // removed once migration is no longer relevant (see
121 // configmgr::Components::parseModificationLayer):
122 namespaces_
.emplace_back(Span("xsi"), id
);
// Produces the next item of the document.
// The visible dispatch selects a text handler from reportText
// (handleSkippedText, handleRawText, or handleNormalizedText for the default
// Text::Normalized), routes State::StartTag to handleStartTag(nsId, data), and
// calls handleEndTag() for another state; cases for State::EmptyElementTag and
// the default State::Done also exist.
// NOTE(review): the enclosing switch and several case labels and bodies are
// not visible in this excerpt, so the exact state-to-handler mapping should be
// confirmed against the full file.
127 XmlReader::Result
XmlReader::nextItem(Text reportText
, Span
* data
, int * nsId
)
131 switch (reportText
) {
133 return handleSkippedText(data
, nsId
);
135 return handleRawText(data
);
136 default: // Text::Normalized
137 return handleNormalizedText(data
);
139 case State::StartTag
:
140 return handleStartTag(nsId
, data
);
142 return handleEndTag();
143 case State::EmptyElementTag
:
146 default: // State::Done
// Steps to the next attribute recorded in attributes_ and reports its name.
// Both out-parameters must be non-null (asserted).
// While firstAttribute_ is set the iteration starts at attributes_.begin() and
// the flag is cleared. A name without a colon gets *nsId = NAMESPACE_NONE and a
// span over the whole name; otherwise the part before the colon is resolved
// with getNamespaceId and the span covers the part after the colon.
// NOTE(review): the iterator increment, the return statements and the
// assignments to *localName are not visible in this excerpt.
151 bool XmlReader::nextAttribute(int * nsId
, Span
* localName
) {
152 assert(nsId
!= nullptr && localName
!= nullptr);
153 if (firstAttribute_
) {
154 currentAttribute_
= attributes_
.begin();
155 firstAttribute_
= false;
159 if (currentAttribute_
== attributes_
.end()) {
162 if (currentAttribute_
->nameColon
== nullptr) {
163 *nsId
= NAMESPACE_NONE
;
165 currentAttribute_
->nameBegin
,
166 currentAttribute_
->nameEnd
- currentAttribute_
->nameBegin
);
168 *nsId
= getNamespaceId(
170 currentAttribute_
->nameBegin
,
171 currentAttribute_
->nameColon
- currentAttribute_
->nameBegin
));
173 currentAttribute_
->nameColon
+ 1,
174 currentAttribute_
->nameEnd
- (currentAttribute_
->nameColon
+ 1));
// Returns the value of the attribute currentAttribute_ points at, by passing
// its valueBegin / valueEnd range to handleAttributeValue.
// NOTE(review): the last argument of the call is not visible in this excerpt;
// presumably fullyNormalize is forwarded.
179 Span
XmlReader::getAttributeValue(bool fullyNormalize
) {
180 return handleAttributeValue(
181 currentAttribute_
->valueBegin
, currentAttribute_
->valueEnd
,
// Looks up the namespace id bound to prefix.
// namespaces_ is searched from the back (crbegin to crend); as bindings are
// appended with emplace_back, the most recently added binding for a prefix is
// the one found. NAMESPACE_UNKNOWN is the fallback return value.
// NOTE(review): the statement executed when a match is found is not visible in
// this excerpt; presumably it returns the id stored in the matching entry.
185 int XmlReader::getNamespaceId(Span
const & prefix
) const {
186 auto i
= std::find_if(namespaces_
.crbegin(), namespaces_
.crend(),
187 [&prefix
](const NamespaceData
& rNamespaceData
) { return prefix
== rNamespaceData
.prefix
; });
189 if (i
!= namespaces_
.rend())
192 return NAMESPACE_UNKNOWN
;
// Processes the line ends contained in text.
// The visible code tracks the unprocessed part with pointer p and length n,
// searches it for the byte 0x0D (CR) with rtl_str_indexOfChar_WithLength, and
// tests whether the input is exhausted or the byte at p differs from 0x0A (LF).
// NOTE(review): the loop and the output calls are not visible in this excerpt;
// presumably CR and CR LF are each rewritten to a single LF -- confirm.
196 void XmlReader::normalizeLineEnds(Span
const & text
) {
197 char const * p
= text
.begin
;
198 sal_Int32 n
= text
.length
;
200 sal_Int32 i
= rtl_str_indexOfChar_WithLength(p
, n
, '\x0D');
207 if (n
== 0 || *p
!= '\x0A') {
// Loops for as long as isSpace(peek()) holds.
// NOTE(review): the loop body is not visible in this excerpt; presumably it
// advances pos_ by one character per iteration.
214 void XmlReader::skipSpace() {
215 while (isSpace(peek())) {
// Attempts to skip a comment at pos_.
// The text at pos_ is first compared against the two-character opener (two
// hyphens). Once past it, the next pair of hyphens is located and pos_ is
// moved behind it. A css::uno::RuntimeException is thrown when the file ends
// inside the comment or when an illegal pair of hyphens occurs within it.
// NOTE(review): the return statements and the check that follows the closing
// hyphens are not visible in this excerpt. Callers use the bool result to
// decide whether to try scanCdataSection() next.
220 bool XmlReader::skipComment() {
221 if (rtl_str_shortenedCompare_WithLength(
222 pos_
, end_
- pos_
, RTL_CONSTASCII_STRINGPARAM("--"),
223 RTL_CONSTASCII_LENGTH("--")) !=
228 pos_
+= RTL_CONSTASCII_LENGTH("--");
229 sal_Int32 i
= rtl_str_indexOfStr_WithLength(
230 pos_
, end_
- pos_
, RTL_CONSTASCII_STRINGPARAM("--"));
232 throw css::uno::RuntimeException(
233 "premature end (within comment) of " + fileUrl_
);
235 pos_
+= i
+ RTL_CONSTASCII_LENGTH("--");
237 throw css::uno::RuntimeException(
238 "illegal \"--\" within comment in " + fileUrl_
);
// Skips a processing instruction: searches [pos_, end_) for the terminator ?>
// and moves pos_ just behind it. A css::uno::RuntimeException is thrown from
// this function with a message about a bad opening sequence.
// NOTE(review): the condition guarding the throw is not visible in this
// excerpt; presumably it fires when the terminator is not found.
243 void XmlReader::skipProcessingInstruction() {
244 sal_Int32 i
= rtl_str_indexOfStr_WithLength(
245 pos_
, end_
- pos_
, RTL_CONSTASCII_STRINGPARAM("?>"));
247 throw css::uno::RuntimeException(
248 "bad '<?' in " + fileUrl_
);
250 pos_
+= i
+ RTL_CONSTASCII_LENGTH("?>");
// Skips a document type declaration without validating it (see the existing
// note below).
// Visible behaviour: reaching the end of the file (a NUL character, per the
// existing case comments) at any of three places throws a
// css::uno::RuntimeException; a character c is searched for in [pos_, end_) at
// two places, again throwing when the file ends first; nested processing
// instructions are delegated to skipProcessingInstruction(); a missing closing
// angle bracket is reported with its own exception.
// NOTE(review): the loops and most case labels are not visible in this
// excerpt. c is presumably a quote delimiter whose matching quote is searched.
253 void XmlReader::skipDocumentTypeDeclaration() {
254 // Neither is it checked that the doctypedecl is at the correct position in
255 // the document, nor that it is well-formed:
259 case '\0': // i.e., EOF
260 throw css::uno::RuntimeException(
261 "premature end (within DTD) of " + fileUrl_
);
265 sal_Int32 i
= rtl_str_indexOfChar_WithLength(
266 pos_
, end_
- pos_
, c
);
268 throw css::uno::RuntimeException(
269 "premature end (within DTD) of " + fileUrl_
);
280 case '\0': // i.e., EOF
281 throw css::uno::RuntimeException(
282 "premature end (within DTD) of " + fileUrl_
);
286 sal_Int32 i
= rtl_str_indexOfChar_WithLength(
287 pos_
, end_
- pos_
, c
);
289 throw css::uno::RuntimeException(
290 "premature end (within DTD) of " + fileUrl_
);
297 case '\0': // i.e., EOF
298 throw css::uno::RuntimeException(
299 "premature end (within DTD) of " + fileUrl_
);
304 skipProcessingInstruction();
313 throw css::uno::RuntimeException(
314 "missing \">\" of DTD in " + fileUrl_
);
// Scans a CDATA section at pos_.
// The text at pos_ is compared against the opener [CDATA[. Past the opener
// the terminator ]]> is searched; if the file ends first a
// css::uno::RuntimeException is thrown. On success pos_ is moved behind the
// terminator and the returned Span covers the bytes between opener and
// terminator (begin, length i).
// NOTE(review): the return taken when the opener does not match is not visible
// in this excerpt; one caller tests the result with is(), so presumably a
// Span for which is() is false is returned.
327 Span
XmlReader::scanCdataSection() {
328 if (rtl_str_shortenedCompare_WithLength(
329 pos_
, end_
- pos_
, RTL_CONSTASCII_STRINGPARAM("[CDATA["),
330 RTL_CONSTASCII_LENGTH("[CDATA[")) !=
335 pos_
+= RTL_CONSTASCII_LENGTH("[CDATA[");
336 char const * begin
= pos_
;
337 sal_Int32 i
= rtl_str_indexOfStr_WithLength(
338 pos_
, end_
- pos_
, RTL_CONSTASCII_STRINGPARAM("]]>"));
340 throw css::uno::RuntimeException(
341 "premature end (within CDATA section) of " + fileUrl_
);
343 pos_
+= i
+ RTL_CONSTASCII_LENGTH("]]>");
344 return Span(begin
, i
);
// Scans a name starting at pos_, advancing pos_ one character per iteration.
// nameColon must be non-null and must point at a null pointer on entry
// (asserted). The visible return yields true exactly when pos_ has moved past
// the starting position, i.e. at least one character was consumed.
// NOTE(review): the switch on the current character, including where
// *nameColon is set, is not visible in this excerpt.
347 bool XmlReader::scanName(char const ** nameColon
) {
348 assert(nameColon
!= nullptr && *nameColon
== nullptr);
349 for (char const * begin
= pos_
;; ++pos_
) {
351 case '\0': // i.e., EOF
359 return pos_
!= begin
;
// Maps the namespace IRI written in [begin, end) to a namespace id.
// The raw text is first run through handleAttributeValue (without full
// normalization), then compared against each entry of namespaceIris_ in index
// order; the index of the first equal entry is converted with toNamespaceId
// and returned. XmlReader::NAMESPACE_UNKNOWN is returned when nothing matches.
369 int XmlReader::scanNamespaceIri(char const * begin
, char const * end
) {
370 assert(begin
!= nullptr && begin
<= end
);
371 Span
iri(handleAttributeValue(begin
, end
, false));
372 for (NamespaceIris::size_type i
= 0; i
< namespaceIris_
.size(); ++i
) {
373 if (namespaceIris_
[i
] == iri
) {
374 return toNamespaceId(i
);
377 return XmlReader::NAMESPACE_UNKNOWN
;
// Expands the reference starting at position and appends the expansion to
// pad_. Per the assert, position must be non-null, point at an ampersand and
// lie before end.
//
// Numeric character references: a leading x selects the hexadecimal loop,
// otherwise the decimal loop is used. Each digit is folded into val, and
// rtl::isUnicodeCodePoint(val) is checked while accumulating so that val
// cannot overflow. A css::uno::RuntimeException is thrown when the value is
// too large, when position == p or the terminating semicolon is missing (p is
// presumably the start of the digits, TODO confirm), or when the value denotes
// a character below 0x20 other than 0x9 / 0xA / 0xD, a surrogate
// (0xD800-0xDFFF), 0xFFFE or 0xFFFF. The accepted code point is encoded into
// buf as one to four UTF-8 bytes and added with pad_.addEphemeral.
//
// Entity references: the text is compared against the table refs, which holds
// the five predefined entities (amp, lt, gt, apos, quot) with their one
// character replacements; on a match position is advanced by the entity length
// and the replacement is added with pad_.add. No match throws a
// css::uno::RuntimeException.
//
// NOTE(review): the return statements are not visible in this excerpt.
// Callers assign the result back to their cursor, so presumably the position
// just past the reference is returned.
380 char const * XmlReader::handleReference(char const * position
, char const * end
)
382 assert(position
!= nullptr && *position
== '&' && position
< end
);
384 if (*position
== '#') {
388 if (*position
== 'x') {
391 for (;; ++position
) {
393 if (c
>= '0' && c
<= '9') {
394 val
= 16 * val
+ (c
- '0');
395 } else if (c
>= 'A' && c
<= 'F') {
396 val
= 16 * val
+ (c
- 'A') + 10;
397 } else if (c
>= 'a' && c
<= 'f') {
398 val
= 16 * val
+ (c
- 'a') + 10;
402 if (!rtl::isUnicodeCodePoint(val
)) { // avoid overflow
403 throw css::uno::RuntimeException(
404 "'&#x...' too large in " + fileUrl_
);
409 for (;; ++position
) {
411 if (c
>= '0' && c
<= '9') {
412 val
= 10 * val
+ (c
- '0');
416 if (!rtl::isUnicodeCodePoint(val
)) { // avoid overflow
417 throw css::uno::RuntimeException(
418 "'&#...' too large in " + fileUrl_
);
422 if (position
== p
|| *position
++ != ';') {
423 throw css::uno::RuntimeException(
424 "'&#...' missing ';' in " + fileUrl_
);
426 assert(rtl::isUnicodeCodePoint(val
));
427 if ((val
< 0x20 && val
!= 0x9 && val
!= 0xA && val
!= 0xD) ||
428 (val
>= 0xD800 && val
<= 0xDFFF) || val
== 0xFFFE || val
== 0xFFFF)
430 throw css::uno::RuntimeException(
431 "character reference denoting invalid character in " + fileUrl_
);
436 buf
[0] = static_cast< char >(val
);
438 } else if (val
< 0x800) {
439 buf
[0] = static_cast< char >((val
>> 6) | 0xC0);
440 buf
[1] = static_cast< char >((val
& 0x3F) | 0x80);
442 } else if (val
< 0x10000) {
443 buf
[0] = static_cast< char >((val
>> 12) | 0xE0);
444 buf
[1] = static_cast< char >(((val
>> 6) & 0x3F) | 0x80);
445 buf
[2] = static_cast< char >((val
& 0x3F) | 0x80);
448 buf
[0] = static_cast< char >((val
>> 18) | 0xF0);
449 buf
[1] = static_cast< char >(((val
>> 12) & 0x3F) | 0x80);
450 buf
[2] = static_cast< char >(((val
>> 6) & 0x3F) | 0x80);
451 buf
[3] = static_cast< char >((val
& 0x3F) | 0x80);
454 pad_
.addEphemeral(buf
, len
);
458 char const * inBegin
;
459 sal_Int32
const inLength
;
460 char const * outBegin
;
461 sal_Int32
const outLength
;
463 static EntityRef
const refs
[] = {
464 { RTL_CONSTASCII_STRINGPARAM("amp;"),
465 RTL_CONSTASCII_STRINGPARAM("&") },
466 { RTL_CONSTASCII_STRINGPARAM("lt;"),
467 RTL_CONSTASCII_STRINGPARAM("<") },
468 { RTL_CONSTASCII_STRINGPARAM("gt;"),
469 RTL_CONSTASCII_STRINGPARAM(">") },
470 { RTL_CONSTASCII_STRINGPARAM("apos;"),
471 RTL_CONSTASCII_STRINGPARAM("'") },
472 { RTL_CONSTASCII_STRINGPARAM("quot;"),
473 RTL_CONSTASCII_STRINGPARAM("\"") } };
474 for (const auto & ref
: refs
) {
475 if (rtl_str_shortenedCompare_WithLength(
476 position
, end
- position
, ref
.inBegin
, ref
.inLength
,
480 position
+= ref
.inLength
;
481 pad_
.add(ref
.outBegin
, ref
.outLength
);
485 throw css::uno::RuntimeException(
486 "unknown entity reference in " + fileUrl_
);
// Normalizes the attribute value text in [begin, end) into pad_.
// With fullyNormalize set, isSpace characters are first trimmed from both ends
// and a Space state (SPACE_NONE / SPACE_SPAN / SPACE_BREAK) tracks whitespace
// while walking the value with p. The code following the second declaration of
// p is presumably the branch taken without full normalization (TODO confirm).
// In both parts pending text is flushed with pad_.add(begin, p - begin) and
// references are expanded through handleReference(p, end).
// NOTE(review): the case labels, the updates of begin and the final return
// are not visible in this excerpt.
490 Span
XmlReader::handleAttributeValue(
491 char const * begin
, char const * end
, bool fullyNormalize
)
494 if (fullyNormalize
) {
495 while (begin
!= end
&& isSpace(*begin
)) {
498 while (end
!= begin
&& isSpace(end
[-1])) {
501 char const * p
= begin
;
502 enum Space
{ SPACE_NONE
, SPACE_SPAN
, SPACE_BREAK
};
503 // a single true space character can go into the current span,
504 // everything else breaks the span
505 Space space
= SPACE_NONE
;
513 pad_
.add(begin
, p
- begin
);
518 pad_
.add(begin
, p
- begin
);
533 pad_
.add(begin
, p
- begin
);
543 pad_
.add(begin
, p
- begin
);
544 p
= handleReference(p
, end
);
554 pad_
.add(begin
, p
- begin
);
556 char const * p
= begin
;
561 pad_
.add(begin
, p
- begin
);
566 pad_
.add(begin
, p
- begin
);
// NOTE(review): peek() takes no argument, so it cannot be inspecting the local
// cursor p that the rest of this function advances; presumably it reads at
// pos_. Confirm that this is intended rather than a test of the byte at p.
568 if (peek() == '\x0A') {
575 pad_
.add(begin
, p
- begin
);
576 p
= handleReference(p
, end
);
584 pad_
.add(begin
, p
- begin
);
// Parses a start tag whose name begins at pos_ and reports the element through
// *nsId and *localName; the visible return value is Result::Begin.
//
// Visible steps:
//  - scan the element name with scanName and remember how many entries
//    namespaces_ held before this element (inheritedNamespaces);
//  - scan attributes as name, equals sign, quoted value; the value ends at
//    the next occurrence of the opening delimiter del, which must be a single
//    or a double quote;
//  - an unprefixed attribute named xmlns sets defaultNsId through
//    scanNamespaceIri; an attribute whose prefix equals a literal that is not
//    visible in this excerpt (presumably xmlns) appends a prefix binding to
//    namespaces_; an attributes_.emplace_back call records name and value
//    pointers;
//  - if the element declared no default namespace and elements_ is not empty,
//    the default namespace id of elements_.top() is used;
//  - firstAttribute_ is set again, and state_ becomes State::EmptyElementTag
//    or State::Content;
//  - a name without a colon is reported whole, otherwise the prefix is
//    resolved with getNamespaceId and the part after the colon is the local
//    name.
// Malformed input raises css::uno::RuntimeException with the messages below.
// NOTE(review): the loop structure, several conditions and the push onto
// elements_ are not visible in this excerpt.
589 XmlReader::Result
XmlReader::handleStartTag(int * nsId
, Span
* localName
) {
590 assert(nsId
!= nullptr && localName
);
591 char const * nameBegin
= pos_
;
592 char const * nameColon
= nullptr;
593 if (!scanName(&nameColon
)) {
594 throw css::uno::RuntimeException(
595 "bad tag name in " + fileUrl_
);
597 char const * nameEnd
= pos_
;
598 NamespaceList::size_type inheritedNamespaces
= namespaces_
.size();
599 bool hasDefaultNs
= false;
600 int defaultNsId
= NAMESPACE_NONE
;
603 char const * p
= pos_
;
605 if (peek() == '/' || peek() == '>') {
609 throw css::uno::RuntimeException(
610 "missing whitespace before attribute in " + fileUrl_
);
612 char const * attrNameBegin
= pos_
;
613 char const * attrNameColon
= nullptr;
614 if (!scanName(&attrNameColon
)) {
615 throw css::uno::RuntimeException(
616 "bad attribute name in " + fileUrl_
);
618 char const * attrNameEnd
= pos_
;
621 throw css::uno::RuntimeException(
622 "missing '=' in " + fileUrl_
);
626 if (del
!= '\'' && del
!= '"') {
627 throw css::uno::RuntimeException(
628 "bad attribute value in " + fileUrl_
);
630 char const * valueBegin
= pos_
;
631 sal_Int32 i
= rtl_str_indexOfChar_WithLength(pos_
, end_
- pos_
, del
);
633 throw css::uno::RuntimeException(
634 "unterminated attribute value in " + fileUrl_
);
636 char const * valueEnd
= pos_
+ i
;
638 if (attrNameColon
== nullptr &&
639 Span(attrNameBegin
, attrNameEnd
- attrNameBegin
) == "xmlns")
642 defaultNsId
= scanNamespaceIri(valueBegin
, valueEnd
);
643 } else if (attrNameColon
!= nullptr &&
644 Span(attrNameBegin
, attrNameColon
- attrNameBegin
) ==
647 namespaces_
.emplace_back(
648 Span(attrNameColon
+ 1, attrNameEnd
- (attrNameColon
+ 1)),
649 scanNamespaceIri(valueBegin
, valueEnd
));
651 attributes_
.emplace_back(
652 attrNameBegin
, attrNameEnd
, attrNameColon
, valueBegin
,
656 if (!hasDefaultNs
&& !elements_
.empty()) {
657 defaultNsId
= elements_
.top().defaultNamespaceId
;
659 firstAttribute_
= true;
661 state_
= State::EmptyElementTag
;
664 state_
= State::Content
;
667 throw css::uno::RuntimeException(
668 "missing '>' in " + fileUrl_
);
673 Span(nameBegin
, nameEnd
- nameBegin
), inheritedNamespaces
,
675 if (nameColon
== nullptr) {
677 *localName
= Span(nameBegin
, nameEnd
- nameBegin
);
679 *nsId
= getNamespaceId(Span(nameBegin
, nameColon
- nameBegin
));
680 *localName
= Span(nameColon
+ 1, nameEnd
- (nameColon
+ 1));
682 return Result::Begin
;
// Parses an end tag at pos_.
// Throws css::uno::RuntimeException when no element is open (elements_ is
// empty), when no name can be scanned or the scanned name differs from the
// name of the innermost open element (elements_.top().name), and when the
// closing angle bracket is missing.
// NOTE(review): the statements after the final check, including the return
// value, are not visible in this excerpt.
685 XmlReader::Result
XmlReader::handleEndTag() {
686 if (elements_
.empty()) {
687 throw css::uno::RuntimeException(
688 "spurious end tag in " + fileUrl_
);
690 char const * nameBegin
= pos_
;
691 char const * nameColon
= nullptr;
692 if (!scanName(&nameColon
) ||
693 !elements_
.top().name
.equals(nameBegin
, pos_
- nameBegin
))
695 throw css::uno::RuntimeException(
696 "tag mismatch in " + fileUrl_
);
701 throw css::uno::RuntimeException(
702 "missing '>' in " + fileUrl_
);
// Finishes the innermost open element (elements_ must not be empty, asserted).
// namespaces_ is shrunk back to the size recorded for that element as
// inheritedNamespaces, which drops the bindings added while parsing its start
// tag. state_ then becomes State::Done when no element remains open and
// State::Content otherwise.
// NOTE(review): the statement between the resize and the state update is not
// visible in this excerpt; presumably the element is popped from elements_.
708 void XmlReader::handleElementEnd() {
709 assert(!elements_
.empty());
710 auto end
= elements_
.top().inheritedNamespaces
;
711 namespaces_
.resize(end
);
713 state_
= elements_
.empty() ? State::Done
: State::Content
;
// Skips character data: searches [pos_, end_) for the next opening angle
// bracket with std::memchr and throws css::uno::RuntimeException (premature
// end) from this path. What follows the bracket is then dispatched:
// a comment or CDATA section is skipped, anything else in that branch is
// treated as a document type declaration; end tags go to handleEndTag(),
// processing instructions to skipProcessingInstruction(), and start tags to
// handleStartTag(nsId, data).
// NOTE(review): the loop and the switch labels selecting these branches are
// not visible in this excerpt.
716 XmlReader::Result
XmlReader::handleSkippedText(Span
* data
, int * nsId
) {
718 auto i
= static_cast<const char*>(std::memchr(pos_
, '<', end_
- pos_
));
720 throw css::uno::RuntimeException(
721 "premature end of " + fileUrl_
);
727 if (!skipComment() && !scanCdataSection().is()) {
728 skipDocumentTypeDeclaration();
733 return handleEndTag();
736 skipProcessingInstruction();
739 return handleStartTag(nsId
, data
);
// Collects character data starting at pos_ into pad_ without whitespace
// normalization.
// Visible behaviour: reaching the end of the file throws
// css::uno::RuntimeException; pending text is flushed with
// pad_.add(begin, pos_ - begin) before special handling; one branch tests
// whether the next character differs from 0x0A (LF); references are expanded
// with handleReference; markup that is not a comment is tried as a CDATA
// section, whose content goes through normalizeLineEnds, and otherwise skipped
// as a document type declaration; processing instructions are skipped; state_
// is set to State::EndTag or State::StartTag when a tag is reached.
// NOTE(review): the case labels, the updates of begin and the return
// statements are not visible in this excerpt.
744 XmlReader::Result
XmlReader::handleRawText(Span
* text
) {
746 for (char const * begin
= pos_
;;) {
748 case '\0': // i.e., EOF
749 throw css::uno::RuntimeException(
750 "premature end of " + fileUrl_
);
752 pad_
.add(begin
, pos_
- begin
);
754 if (peek() != '\x0A') {
760 pad_
.add(begin
, pos_
- begin
);
761 pos_
= handleReference(pos_
, end_
);
765 pad_
.add(begin
, pos_
- begin
);
770 if (!skipComment()) {
771 Span
cdata(scanCdataSection());
773 normalizeLineEnds(cdata
);
775 skipDocumentTypeDeclaration();
783 state_
= State::EndTag
;
787 skipProcessingInstruction();
792 state_
= State::StartTag
;
// Collects character data starting at pos_ into pad_ with whitespace
// normalization.
// flowBegin / flowEnd delimit the current run of text, and the Space state
// (initially SPACE_START) tracks whitespace as described by the existing
// comment below. Visible behaviour: reaching the end of the file throws
// css::uno::RuntimeException; runs are flushed with pad_.add, either up to pos_
// or up to flowEnd; references are expanded with handleReference; CDATA
// content is passed to normalizeLineEnds (see the existing note on CDATA
// below); document type declarations and processing instructions are skipped;
// state_ is set to State::EndTag or State::StartTag when a tag is reached.
// NOTE(review): the loop, the case labels, the Space transitions and the
// return statements are not visible in this excerpt.
803 XmlReader::Result
XmlReader::handleNormalizedText(Span
* text
) {
805 char const * flowBegin
= pos_
;
806 char const * flowEnd
= pos_
;
807 enum Space
{ SPACE_START
, SPACE_NONE
, SPACE_SPAN
, SPACE_BREAK
};
808 // a single true space character can go into the current flow,
809 // everything else breaks the flow
810 Space space
= SPACE_START
;
813 case '\0': // i.e., EOF
814 throw css::uno::RuntimeException(
815 "premature end of " + fileUrl_
);
850 pad_
.add(flowBegin
, pos_
- flowBegin
);
853 pad_
.add(flowBegin
, flowEnd
- flowBegin
);
857 pos_
= handleReference(pos_
, end_
);
870 Span
cdata(scanCdataSection());
872 // CDATA is not normalized (similar to character
873 // references; it keeps the code simple), but it might
874 // arguably be better to normalize it:
880 pad_
.add(flowBegin
, pos_
- flowBegin
);
883 pad_
.add(flowBegin
, flowEnd
- flowBegin
);
887 normalizeLineEnds(cdata
);
892 skipDocumentTypeDeclaration();
898 pad_
.add(flowBegin
, flowEnd
- flowBegin
);
900 state_
= State::EndTag
;
904 skipProcessingInstruction();
908 pad_
.add(flowBegin
, flowEnd
- flowBegin
);
910 state_
= State::StartTag
;
923 pad_
.add(flowBegin
, flowEnd
- flowBegin
);
// Converts an index into the namespace IRI list to the int used as namespace
// id. The index must not exceed INT_MAX (asserted), so the cast cannot
// truncate.
935 int XmlReader::toNamespaceId(NamespaceIris::size_type pos
) {
936 assert(pos
<= INT_MAX
);
937 return static_cast< int >(pos
);
942 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */