1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
3 * This file is part of the LibreOffice project.
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 * This file incorporates work covered by the following license notice:
11 * Licensed to the Apache Software Foundation (ASF) under one or more
12 * contributor license agreements. See the NOTICE file distributed
13 * with this work for additional information regarding copyright
14 * ownership. The ASF licenses this file to you under the Apache
15 * License, Version 2.0 (the "License"); you may not use this file
16 * except in compliance with the License. You may obtain a copy of
17 * the License at http://www.apache.org/licenses/LICENSE-2.0 .
20 #include <sal/config.h>
25 #include <com/sun/star/container/NoSuchElementException.hpp>
26 #include <com/sun/star/uno/RuntimeException.hpp>
28 #include <rtl/character.hxx>
29 #include <rtl/string.h>
30 #include <rtl/ustring.hxx>
31 #include <sal/log.hxx>
32 #include <sal/types.h>
33 #include <xmlreader/pad.hxx>
34 #include <xmlreader/span.hxx>
35 #include <xmlreader/xmlreader.hxx>
// Predicate over a single char. Elsewhere in this file it is applied to
// peek() and to the bytes at either end of an attribute value.
// NOTE(review): which characters it accepts is not shown here - presumably
// the XML whitespace characters; confirm against the body.
41 bool isSpace(char c
) {
// Opens the file named by fileUrl for reading and positions the reader at its
// first byte.
//
// Throws css::container::NoSuchElementException when osl_openFile reports
// osl_File_E_NOENT, and css::uno::RuntimeException with a "cannot open" or
// "cannot mmap" message for the other failures handled below.
55 XmlReader::XmlReader(OUString
const & fileUrl
)
57 , fileHandle_(nullptr)
59 oslFileError e
= osl_openFile(
60 fileUrl_
.pData
, &fileHandle_
, osl_File_OpenFlag_Read
);
65 case osl_File_E_NOENT
:
66 throw css::container::NoSuchElementException( fileUrl_
);
68 throw css::uno::RuntimeException(
69 "cannot open " + fileUrl_
+ ": " + OUString::number(e
));
// Determine the file size; it is passed on with the mapping request and is
// also used to compute end_ further down.
71 e
= osl_getFileSize(fileHandle_
, &fileSize_
);
72 if (e
== osl_File_E_None
) {
74 fileHandle_
, &fileAddress_
, fileSize_
, 0,
75 osl_File_MapFlag_WillNeed
);
77 if (e
!= osl_File_E_None
) {
// Close the handle before throwing. The close result goes into a separate
// variable so that the original error e is still available for the
// exception text; a failing close only feeds the diagnostic message.
78 oslFileError e2
= osl_closeFile(fileHandle_
);
79 if (e2
!= osl_File_E_None
) {
82 "osl_closeFile of \"" << fileUrl_
<< "\" failed with " << +e2
);
84 throw css::uno::RuntimeException(
85 "cannot mmap " + fileUrl_
+ " (" + OUString::number(e
) + ")" );
// Pre-register the "xml" prefix together with its namespace IRI.
87 namespaceIris_
.emplace_back("http://www.w3.org/XML/1998/namespace");
88 namespaces_
.emplace_back(Span("xml"), NAMESPACE_XML
);
// The parse window [pos_, end_) spans fileSize_ bytes starting at
// fileAddress_.
89 pos_
= static_cast< char * >(fileAddress_
);
90 end_
= pos_
+ fileSize_
;
91 state_
= State::Content
;
92 firstAttribute_
= true;
// Releases the file mapping first and closes the file handle afterwards. In
// the visible statements a failure of either call only feeds a diagnostic
// message that names the file and the error code.
95 XmlReader::~XmlReader() {
98 oslFileError e
= osl_unmapMappedFile(fileHandle_
, fileAddress_
, fileSize_
);
99 if (e
!= osl_File_E_None
) {
102 "osl_unmapMappedFile of \"" << fileUrl_
<< "\" failed with " << +e
);
104 e
= osl_closeFile(fileHandle_
);
105 if (e
!= osl_File_E_None
) {
108 "osl_closeFile of \"" << fileUrl_
<< "\" failed with " << +e
);
// Appends iri to namespaceIris_. The id is computed from the list size taken
// before the append, i.e. from the index the new entry receives.
// NOTE(review): presumably that id is also the return value - confirm.
112 int XmlReader::registerNamespaceIri(Span
const & iri
) {
113 int id
= toNamespaceId(namespaceIris_
.size());
114 namespaceIris_
.push_back(iri
);
115 if (iri
== "http://www.w3.org/2001/XMLSchema-instance") {
116 // Old user layer .xcu files used the xsi namespace prefix without
117 // declaring a corresponding namespace binding, see issue 77174; reading
118 // those files during migration would fail without this hack that can be
119 // removed once migration is no longer relevant (see
120 // configmgr::Components::parseModificationLayer):
121 namespaces_
.emplace_back(Span("xsi"), id
);
// Entry point for reading the next item. The visible branches dispatch to
// one of the handle* helpers:
//  - reportText selects between handleSkippedText, handleRawText and
//    handleNormalizedText (the default, Text::Normalized);
//  - State::StartTag forwards to handleStartTag, passing data as the
//    local-name output;
//  - a further branch forwards to handleEndTag.
// NOTE(review): the enclosing switch head and the bodies of the
// State::EmptyElementTag and default (State::Done) cases are not visible
// here; presumably the outer switch is on state_.
126 XmlReader::Result
XmlReader::nextItem(Text reportText
, Span
* data
, int * nsId
)
130 switch (reportText
) {
132 return handleSkippedText(data
, nsId
);
134 return handleRawText(data
);
135 default: // Text::Normalized
136 return handleNormalizedText(data
);
138 case State::StartTag
:
139 return handleStartTag(nsId
, data
);
141 return handleEndTag();
142 case State::EmptyElementTag
:
145 default: // State::Done
// Reports the namespace id and local name of an attribute through nsId and
// localName; both pointers must be non-null.
//
// While firstAttribute_ is set, iteration (re)starts at attributes_.begin()
// and the flag is cleared. An attribute name without a colon gets
// NAMESPACE_NONE; otherwise the part before the colon is resolved with
// getNamespaceId and the part after the colon is the local name.
// NOTE(review): the advance on later calls and the returned values are not
// visible here; presumably false is returned at attributes_.end().
150 bool XmlReader::nextAttribute(int * nsId
, Span
* localName
) {
151 assert(nsId
!= nullptr && localName
!= nullptr);
152 if (firstAttribute_
) {
153 currentAttribute_
= attributes_
.begin();
154 firstAttribute_
= false;
158 if (currentAttribute_
== attributes_
.end()) {
// Unprefixed name: no namespace, the whole name is the local name.
161 if (currentAttribute_
->nameColon
== nullptr) {
162 *nsId
= NAMESPACE_NONE
;
164 currentAttribute_
->nameBegin
,
165 currentAttribute_
->nameEnd
- currentAttribute_
->nameBegin
);
// Prefixed name: [nameBegin, nameColon) is the prefix, the rest after the
// colon up to nameEnd is the local name.
167 *nsId
= getNamespaceId(
169 currentAttribute_
->nameBegin
,
170 currentAttribute_
->nameColon
- currentAttribute_
->nameBegin
));
172 currentAttribute_
->nameColon
+ 1,
173 currentAttribute_
->nameEnd
- (currentAttribute_
->nameColon
+ 1));
// Returns the value of the attribute currentAttribute_ refers to, obtained
// by passing its [valueBegin, valueEnd) range to handleAttributeValue.
// NOTE(review): the last argument is not visible here - presumably
// fullyNormalize is forwarded unchanged.
178 Span
XmlReader::getAttributeValue(bool fullyNormalize
) {
179 return handleAttributeValue(
180 currentAttribute_
->valueBegin
, currentAttribute_
->valueEnd
,
// Looks up prefix in namespaces_. The search runs from the back of the list
// towards the front, so among several entries with the same prefix the one
// closest to the back is found. Without a match NAMESPACE_UNKNOWN is
// returned.
// NOTE(review): the value returned on a match is not visible here -
// presumably the entry's namespace id.
184 int XmlReader::getNamespaceId(Span
const & prefix
) const {
185 auto i
= std::find_if(namespaces_
.crbegin(), namespaces_
.crend(),
186 [&prefix
](const NamespaceData
& rNamespaceData
) { return prefix
== rNamespaceData
.prefix
; });
188 if (i
!= namespaces_
.rend())
191 return NAMESPACE_UNKNOWN
;
// Processes the bytes of text, searching for '\x0D' (carriage return) and
// testing whether a '\x0A' (line feed) is present at the current position.
// NOTE(review): the loop structure and what is emitted for each piece are
// not visible here; judging by the name, CR and CR LF are presumably
// replaced by a single LF - confirm.
195 void XmlReader::normalizeLineEnds(Span
const & text
) {
196 char const * p
= text
.begin
;
197 sal_Int32 n
= text
.length
;
199 sal_Int32 i
= rtl_str_indexOfChar_WithLength(p
, n
, '\x0D');
206 if (n
== 0 || *p
!= '\x0A') {
// Loops for as long as isSpace accepts the character returned by peek().
// NOTE(review): the loop body is not visible here - presumably it advances
// the read position by one character.
213 void XmlReader::skipSpace() {
214 while (isSpace(peek())) {
// Handles a comment at the current position: tests whether the input at pos_
// begins with "--", moves pos_ past that opener, searches for the next "--"
// and moves pos_ past it as well.
//
// Throws css::uno::RuntimeException with the "premature end (within
// comment)" and "illegal \"--\" within comment" messages below.
// NOTE(review): the conditions guarding the throws and the returned values
// are not visible here; presumably false means "no comment here" and the
// second throw fires when the closing "--" is not followed by '>'.
219 bool XmlReader::skipComment() {
220 if (rtl_str_shortenedCompare_WithLength(
221 pos_
, end_
- pos_
, RTL_CONSTASCII_STRINGPARAM("--"),
222 RTL_CONSTASCII_LENGTH("--")) !=
227 pos_
+= RTL_CONSTASCII_LENGTH("--");
228 sal_Int32 i
= rtl_str_indexOfStr_WithLength(
229 pos_
, end_
- pos_
, RTL_CONSTASCII_STRINGPARAM("--"));
231 throw css::uno::RuntimeException(
232 "premature end (within comment) of " + fileUrl_
);
234 pos_
+= i
+ RTL_CONSTASCII_LENGTH("--");
236 throw css::uno::RuntimeException(
237 "illegal \"--\" within comment in " + fileUrl_
);
// Searches for "?>" in the remaining input and moves pos_ to just behind it.
// Throws css::uno::RuntimeException ("bad '<?'").
// NOTE(review): the condition guarding the throw is not visible here -
// presumably it is the search finding no "?>".
242 void XmlReader::skipProcessingInstruction() {
243 sal_Int32 i
= rtl_str_indexOfStr_WithLength(
244 pos_
, end_
- pos_
, RTL_CONSTASCII_STRINGPARAM("?>"));
246 throw css::uno::RuntimeException(
247 "bad '<?' in " + fileUrl_
);
249 pos_
+= i
+ RTL_CONSTASCII_LENGTH("?>");
// Skips over a document type declaration without validating it.
//
// The visible statements treat '\0' as end of input, search forward for a
// character c, and hand embedded processing instructions to
// skipProcessingInstruction.
// Throws css::uno::RuntimeException with the "premature end (within DTD)"
// and "missing \">\" of DTD" messages below.
// NOTE(review): the switch heads and the value of c are not visible here;
// c is presumably the quote character of a quoted literal.
252 void XmlReader::skipDocumentTypeDeclaration() {
253 // Neither is it checked that the doctypedecl is at the correct position in
254 // the document, nor that it is well-formed:
258 case '\0': // i.e., EOF
259 throw css::uno::RuntimeException(
260 "premature end (within DTD) of " + fileUrl_
);
264 sal_Int32 i
= rtl_str_indexOfChar_WithLength(
265 pos_
, end_
- pos_
, c
);
267 throw css::uno::RuntimeException(
268 "premature end (within DTD) of " + fileUrl_
);
279 case '\0': // i.e., EOF
280 throw css::uno::RuntimeException(
281 "premature end (within DTD) of " + fileUrl_
);
285 sal_Int32 i
= rtl_str_indexOfChar_WithLength(
286 pos_
, end_
- pos_
, c
);
288 throw css::uno::RuntimeException(
289 "premature end (within DTD) of " + fileUrl_
);
296 case '\0': // i.e., EOF
297 throw css::uno::RuntimeException(
298 "premature end (within DTD) of " + fileUrl_
);
303 skipProcessingInstruction();
312 throw css::uno::RuntimeException(
313 "missing \">\" of DTD in " + fileUrl_
);
// Handles a CDATA section at the current position: tests whether the input
// at pos_ begins with "[CDATA[", moves pos_ past that opener, searches for
// "]]>" and moves pos_ past it. The returned Span covers the i bytes between
// the opener and "]]>".
//
// Throws css::uno::RuntimeException ("premature end (within CDATA
// section)").
// NOTE(review): the value returned when the opener does not match is not
// visible here; callers test the result with is(), so presumably it is a
// Span for which is() is false.
326 Span
XmlReader::scanCdataSection() {
327 if (rtl_str_shortenedCompare_WithLength(
328 pos_
, end_
- pos_
, RTL_CONSTASCII_STRINGPARAM("[CDATA["),
329 RTL_CONSTASCII_LENGTH("[CDATA[")) !=
334 pos_
+= RTL_CONSTASCII_LENGTH("[CDATA[");
335 char const * begin
= pos_
;
336 sal_Int32 i
= rtl_str_indexOfStr_WithLength(
337 pos_
, end_
- pos_
, RTL_CONSTASCII_STRINGPARAM("]]>"));
339 throw css::uno::RuntimeException(
340 "premature end (within CDATA section) of " + fileUrl_
);
342 pos_
+= i
+ RTL_CONSTASCII_LENGTH("]]>");
343 return Span(begin
, i
);
// Advances pos_ over a name, starting at the current position. nameColon
// must be non-null and must point to a null pointer on entry. The visible
// return yields true exactly when pos_ has moved, i.e. when at least one
// character was consumed.
// NOTE(review): the per-character switch is not visible here; presumably it
// records the position of a ':' in *nameColon and stops at delimiters.
346 bool XmlReader::scanName(char const ** nameColon
) {
347 assert(nameColon
!= nullptr && *nameColon
== nullptr);
348 for (char const * begin
= pos_
;; ++pos_
) {
350 case '\0': // i.e., EOF
358 return pos_
!= begin
;
// Maps the attribute value in [begin, end) to a namespace id. The value is
// first run through handleAttributeValue without full normalization; the
// result is then compared against each entry of namespaceIris_ in index
// order. A match yields toNamespaceId of its index, no match yields
// NAMESPACE_UNKNOWN.
368 int XmlReader::scanNamespaceIri(char const * begin
, char const * end
) {
369 assert(begin
!= nullptr && begin
<= end
);
370 Span
iri(handleAttributeValue(begin
, end
, false));
371 for (NamespaceIris::size_type i
= 0; i
< namespaceIris_
.size(); ++i
) {
372 if (namespaceIris_
[i
] == iri
) {
373 return toNamespaceId(i
);
376 return XmlReader::NAMESPACE_UNKNOWN
;
// Expands the reference that starts at position, which must point at '&' and
// lie before end, and appends the expansion to pad_.
//
// Two kinds are handled:
//  - numeric character references ("&#" with decimal digits or "&#x" with
//    hexadecimal digits): the value is accumulated into val, validated and
//    appended UTF-8 encoded;
//  - the named entities in the refs table (amp, lt, gt, apos, quot).
//
// Throws css::uno::RuntimeException for values that are not Unicode code
// points, a missing ';', code points rejected by the validity check, and
// unknown entity names.
// NOTE(review): the returned pointer is not visible here - presumably the
// position just behind the consumed reference.
379 char const * XmlReader::handleReference(char const * position
, char const * end
)
381 assert(position
!= nullptr && *position
== '&' && position
< end
);
383 if (*position
== '#') {
// Hexadecimal form: each accepted digit multiplies val by 16 and adds the
// digit's value; upper- and lowercase A-F are both accepted.
387 if (*position
== 'x') {
390 for (;; ++position
) {
392 if (c
>= '0' && c
<= '9') {
393 val
= 16 * val
+ (c
- '0');
394 } else if (c
>= 'A' && c
<= 'F') {
395 val
= 16 * val
+ (c
- 'A') + 10;
396 } else if (c
>= 'a' && c
<= 'f') {
397 val
= 16 * val
+ (c
- 'a') + 10;
401 if (!rtl::isUnicodeCodePoint(val
)) { // avoid overflow
402 throw css::uno::RuntimeException(
403 "'&#x...' too large in " + fileUrl_
);
// Decimal form: same accumulation with base 10.
408 for (;; ++position
) {
410 if (c
>= '0' && c
<= '9') {
411 val
= 10 * val
+ (c
- '0');
415 if (!rtl::isUnicodeCodePoint(val
)) { // avoid overflow
416 throw css::uno::RuntimeException(
417 "'&#...' too large in " + fileUrl_
);
// The reference must be terminated by ';'.
// NOTE(review): p is defined outside the visible lines; position == p
// presumably means that no digit was consumed.
421 if (position
== p
|| *position
++ != ';') {
422 throw css::uno::RuntimeException(
423 "'&#...' missing ';' in " + fileUrl_
);
425 assert(rtl::isUnicodeCodePoint(val
));
// Reject values below 0x20 other than 0x9, 0xA and 0xD, the range
// 0xD800-0xDFFF, and 0xFFFE/0xFFFF.
426 if ((val
< 0x20 && val
!= 0x9 && val
!= 0xA && val
!= 0xD) ||
427 (val
>= 0xD800 && val
<= 0xDFFF) || val
== 0xFFFE || val
== 0xFFFF)
429 throw css::uno::RuntimeException(
430 "character reference denoting invalid character in " + fileUrl_
);
// Encode val as UTF-8 into buf; the branches below produce one to four
// bytes depending on the magnitude of val.
435 buf
[0] = static_cast< char >(val
);
437 } else if (val
< 0x800) {
438 buf
[0] = static_cast< char >((val
>> 6) | 0xC0);
439 buf
[1] = static_cast< char >((val
& 0x3F) | 0x80);
441 } else if (val
< 0x10000) {
442 buf
[0] = static_cast< char >((val
>> 12) | 0xE0);
443 buf
[1] = static_cast< char >(((val
>> 6) & 0x3F) | 0x80);
444 buf
[2] = static_cast< char >((val
& 0x3F) | 0x80);
447 buf
[0] = static_cast< char >((val
>> 18) | 0xF0);
448 buf
[1] = static_cast< char >(((val
>> 12) & 0x3F) | 0x80);
449 buf
[2] = static_cast< char >(((val
>> 6) & 0x3F) | 0x80);
450 buf
[3] = static_cast< char >((val
& 0x3F) | 0x80);
453 pad_
.addEphemeral(buf
, len
);
// Entry layout of the entity table: the text to match after '&' (including
// the terminating ';') and the replacement text, each with its length.
457 char const * inBegin
;
458 sal_Int32
const inLength
;
459 char const * outBegin
;
460 sal_Int32
const outLength
;
462 static EntityRef
const refs
[] = {
463 { RTL_CONSTASCII_STRINGPARAM("amp;"),
464 RTL_CONSTASCII_STRINGPARAM("&") },
465 { RTL_CONSTASCII_STRINGPARAM("lt;"),
466 RTL_CONSTASCII_STRINGPARAM("<") },
467 { RTL_CONSTASCII_STRINGPARAM("gt;"),
468 RTL_CONSTASCII_STRINGPARAM(">") },
469 { RTL_CONSTASCII_STRINGPARAM("apos;"),
470 RTL_CONSTASCII_STRINGPARAM("'") },
471 { RTL_CONSTASCII_STRINGPARAM("quot;"),
472 RTL_CONSTASCII_STRINGPARAM("\"") } };
// Try each table entry in order; on a match skip the entity text and append
// its replacement to pad_.
473 for (const auto & ref
: refs
) {
474 if (rtl_str_shortenedCompare_WithLength(
475 position
, end
- position
, ref
.inBegin
, ref
.inLength
,
479 position
+= ref
.inLength
;
480 pad_
.add(ref
.outBegin
, ref
.outLength
);
484 throw css::uno::RuntimeException(
485 "unknown entity reference in " + fileUrl_
);
// Processes the raw attribute value in [begin, end) and collects the result
// in pad_.
//
// With fullyNormalize set, characters accepted by isSpace are first trimmed
// from both ends, and the remaining text is processed while tracking a
// Space state (see the comment at its declaration). In both modes runs of
// unchanged text are appended with pad_.add(begin, p - begin) and references
// are expanded through handleReference.
// NOTE(review): the loops, their switch heads and the returned Span are not
// visible here; the second group of statements (from the second
// "char const * p") presumably belongs to the non-normalizing branch.
489 Span
XmlReader::handleAttributeValue(
490 char const * begin
, char const * end
, bool fullyNormalize
)
493 if (fullyNormalize
) {
// Trim leading, then trailing, whitespace.
494 while (begin
!= end
&& isSpace(*begin
)) {
497 while (end
!= begin
&& isSpace(end
[-1])) {
500 char const * p
= begin
;
501 enum Space
{ SPACE_NONE
, SPACE_SPAN
, SPACE_BREAK
};
502 // a single true space character can go into the current span,
503 // everything else breaks the span
504 Space space
= SPACE_NONE
;
512 pad_
.add(begin
, p
- begin
);
517 pad_
.add(begin
, p
- begin
);
532 pad_
.add(begin
, p
- begin
);
542 pad_
.add(begin
, p
- begin
);
543 p
= handleReference(p
, end
);
553 pad_
.add(begin
, p
- begin
);
555 char const * p
= begin
;
560 pad_
.add(begin
, p
- begin
);
565 pad_
.add(begin
, p
- begin
);
567 if (peek() == '\x0A') {
574 pad_
.add(begin
, p
- begin
);
575 p
= handleReference(p
, end
);
583 pad_
.add(begin
, p
- begin
);
// Parses a start tag beginning at pos_ (the element name comes first) and
// reports the element's namespace id and local name through nsId and
// localName. Returns Result::Begin.
//
// Visible steps:
//  - scan the element name;
//  - remember the current size of namespaces_ in inheritedNamespaces;
//  - for each attribute: scan its name, locate the quoted value, and either
//    treat it as a default-namespace declaration ("xmlns"), as a prefix
//    binding appended to namespaces_, or store it in attributes_;
//  - without an own default namespace, inherit the one of the enclosing
//    element (elements_.top());
//  - set state_ to EmptyElementTag or Content;
//  - split the element name at its colon, if any, to fill the outputs.
//
// Throws css::uno::RuntimeException for the malformed-tag conditions named
// in the messages below.
588 XmlReader::Result
XmlReader::handleStartTag(int * nsId
, Span
* localName
) {
589 assert(nsId
!= nullptr && localName
);
590 char const * nameBegin
= pos_
;
591 char const * nameColon
= nullptr;
592 if (!scanName(&nameColon
)) {
593 throw css::uno::RuntimeException(
594 "bad tag name in " + fileUrl_
);
596 char const * nameEnd
= pos_
;
// Size of namespaces_ before this element adds any bindings.
597 NamespaceList::size_type inheritedNamespaces
= namespaces_
.size();
598 bool hasDefaultNs
= false;
599 int defaultNsId
= NAMESPACE_NONE
;
602 char const * p
= pos_
;
604 if (peek() == '/' || peek() == '>') {
608 throw css::uno::RuntimeException(
609 "missing whitespace before attribute in " + fileUrl_
);
611 char const * attrNameBegin
= pos_
;
612 char const * attrNameColon
= nullptr;
613 if (!scanName(&attrNameColon
)) {
614 throw css::uno::RuntimeException(
615 "bad attribute name in " + fileUrl_
);
617 char const * attrNameEnd
= pos_
;
620 throw css::uno::RuntimeException(
621 "missing '=' in " + fileUrl_
);
// The value must be delimited by single or double quotes; del is the
// delimiter that was found.
625 if (del
!= '\'' && del
!= '"') {
626 throw css::uno::RuntimeException(
627 "bad attribute value in " + fileUrl_
);
629 char const * valueBegin
= pos_
;
630 sal_Int32 i
= rtl_str_indexOfChar_WithLength(pos_
, end_
- pos_
, del
);
632 throw css::uno::RuntimeException(
633 "unterminated attribute value in " + fileUrl_
);
635 char const * valueEnd
= pos_
+ i
;
// An unprefixed attribute named "xmlns" declares the default namespace.
637 if (attrNameColon
== nullptr &&
638 Span(attrNameBegin
, attrNameEnd
- attrNameBegin
) == "xmlns")
641 defaultNsId
= scanNamespaceIri(valueBegin
, valueEnd
);
// NOTE(review): the right-hand side of this comparison is not visible here -
// presumably the "xmlns" prefix. The part after the colon becomes the bound
// prefix.
642 } else if (attrNameColon
!= nullptr &&
643 Span(attrNameBegin
, attrNameColon
- attrNameBegin
) ==
646 namespaces_
.emplace_back(
647 Span(attrNameColon
+ 1, attrNameEnd
- (attrNameColon
+ 1)),
648 scanNamespaceIri(valueBegin
, valueEnd
));
650 attributes_
.emplace_back(
651 attrNameBegin
, attrNameEnd
, attrNameColon
, valueBegin
,
655 if (!hasDefaultNs
&& !elements_
.empty()) {
656 defaultNsId
= elements_
.top().defaultNamespaceId
;
// Make the next nextAttribute call restart at the first attribute.
658 firstAttribute_
= true;
660 state_
= State::EmptyElementTag
;
663 state_
= State::Content
;
666 throw css::uno::RuntimeException(
667 "missing '>' in " + fileUrl_
);
672 Span(nameBegin
, nameEnd
- nameBegin
), inheritedNamespaces
,
674 if (nameColon
== nullptr) {
676 *localName
= Span(nameBegin
, nameEnd
- nameBegin
);
678 *nsId
= getNamespaceId(Span(nameBegin
, nameColon
- nameBegin
));
679 *localName
= Span(nameColon
+ 1, nameEnd
- (nameColon
+ 1));
681 return Result::Begin
;
// Parses an end tag at pos_ and checks it against the innermost open
// element: the scanned name must equal elements_.top().name.
//
// Throws css::uno::RuntimeException when no element is open ("spurious end
// tag"), when the name is missing or differs ("tag mismatch"), and with the
// "missing '>'" message.
// NOTE(review): the statements after the name check and the returned value
// are not visible here.
684 XmlReader::Result
XmlReader::handleEndTag() {
685 if (elements_
.empty()) {
686 throw css::uno::RuntimeException(
687 "spurious end tag in " + fileUrl_
);
689 char const * nameBegin
= pos_
;
690 char const * nameColon
= nullptr;
691 if (!scanName(&nameColon
) ||
692 !elements_
.top().name
.equals(nameBegin
, pos_
- nameBegin
))
694 throw css::uno::RuntimeException(
695 "tag mismatch in " + fileUrl_
);
700 throw css::uno::RuntimeException(
701 "missing '>' in " + fileUrl_
);
// Bookkeeping for leaving the innermost element; at least one element must
// be open. namespaces_ is resized to the size recorded for that element
// (inheritedNamespaces), and state_ becomes Done when no element remains
// open, Content otherwise.
// NOTE(review): the statement between the resize and the state update is
// not visible here - presumably it pops elements_.
707 void XmlReader::handleElementEnd() {
708 assert(!elements_
.empty());
709 auto end
= elements_
.top().inheritedNamespaces
;
710 namespaces_
.resize(end
);
712 state_
= elements_
.empty() ? State::Done
: State::Content
;
// Reads the next item while ignoring character data: searches for the next
// '<' and then dispatches on the markup found there. Something that is
// neither a comment nor a CDATA section is skipped as a document type
// declaration; end tags go to handleEndTag, processing instructions are
// skipped, and start tags go to handleStartTag with data as the local-name
// output.
//
// Throws css::uno::RuntimeException ("premature end of").
// NOTE(review): the loop and switch heads are not visible here.
715 XmlReader::Result
XmlReader::handleSkippedText(Span
* data
, int * nsId
) {
717 sal_Int32 i
= rtl_str_indexOfChar_WithLength(pos_
, end_
- pos_
, '<');
719 throw css::uno::RuntimeException(
720 "premature end of " + fileUrl_
);
726 if (!skipComment() && !scanCdataSection().is()) {
727 skipDocumentTypeDeclaration();
732 return handleEndTag();
735 skipProcessingInstruction();
738 return handleStartTag(nsId
, data
);
// Reads character data without whitespace normalization, collecting it in
// pad_. Runs of unchanged input are appended with pad_.add(begin, pos_ -
// begin), references are expanded through handleReference, and CDATA
// content goes through normalizeLineEnds. Comments, document type
// declarations and processing instructions are skipped. Reaching an end tag
// or a start tag sets state_ to EndTag or StartTag respectively.
//
// Throws css::uno::RuntimeException ("premature end of") at '\0', which the
// code treats as end of input.
// NOTE(review): the switch head, the handling around the '\x0A' check and
// the returned value are not visible here.
743 XmlReader::Result
XmlReader::handleRawText(Span
* text
) {
745 for (char const * begin
= pos_
;;) {
747 case '\0': // i.e., EOF
748 throw css::uno::RuntimeException(
749 "premature end of " + fileUrl_
);
751 pad_
.add(begin
, pos_
- begin
);
753 if (peek() != '\x0A') {
759 pad_
.add(begin
, pos_
- begin
);
760 pos_
= handleReference(pos_
, end_
);
764 pad_
.add(begin
, pos_
- begin
);
769 if (!skipComment()) {
770 Span
cdata(scanCdataSection());
772 normalizeLineEnds(cdata
);
774 skipDocumentTypeDeclaration();
782 state_
= State::EndTag
;
786 skipProcessingInstruction();
791 state_
= State::StartTag
;
// Reads character data with whitespace normalization, collecting it in
// pad_. The current run of text is tracked by flowBegin/flowEnd together
// with a Space state (see the comment at its declaration). Pending text is
// flushed to pad_ before references (expanded through handleReference),
// CDATA sections (passed to normalizeLineEnds, not normalized further) and
// tags. Reaching an end tag or a start tag sets state_ to EndTag or
// StartTag respectively.
//
// Throws css::uno::RuntimeException ("premature end of") at '\0', which the
// code treats as end of input.
// NOTE(review): the loop and switch heads, the Space transitions and the
// returned value are not visible here.
802 XmlReader::Result
XmlReader::handleNormalizedText(Span
* text
) {
804 char const * flowBegin
= pos_
;
805 char const * flowEnd
= pos_
;
806 enum Space
{ SPACE_START
, SPACE_NONE
, SPACE_SPAN
, SPACE_BREAK
};
807 // a single true space character can go into the current flow,
808 // everything else breaks the flow
809 Space space
= SPACE_START
;
812 case '\0': // i.e., EOF
813 throw css::uno::RuntimeException(
814 "premature end of " + fileUrl_
);
849 pad_
.add(flowBegin
, pos_
- flowBegin
);
852 pad_
.add(flowBegin
, flowEnd
- flowBegin
);
856 pos_
= handleReference(pos_
, end_
);
869 Span
cdata(scanCdataSection());
871 // CDATA is not normalized (similar to character
872 // references; it keeps the code simple), but it might
873 // arguably be better to normalize it:
879 pad_
.add(flowBegin
, pos_
- flowBegin
);
882 pad_
.add(flowBegin
, flowEnd
- flowBegin
);
886 normalizeLineEnds(cdata
);
891 skipDocumentTypeDeclaration();
897 pad_
.add(flowBegin
, flowEnd
- flowBegin
);
899 state_
= State::EndTag
;
903 skipProcessingInstruction();
907 pad_
.add(flowBegin
, flowEnd
- flowBegin
);
909 state_
= State::StartTag
;
922 pad_
.add(flowBegin
, flowEnd
- flowBegin
);
// Converts a position in the namespace IRI list to the int used as
// namespace id. The assertion documents that the position must not exceed
// INT_MAX, so the narrowing cast cannot change the value.
934 int XmlReader::toNamespaceId(NamespaceIris::size_type pos
) {
935 assert(pos
<= INT_MAX
);
936 return static_cast< int >(pos
);
941 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */