1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
3 * This file is part of the LibreOffice project.
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 * This file incorporates work covered by the following license notice:
11 * Licensed to the Apache Software Foundation (ASF) under one or more
12 * contributor license agreements. See the NOTICE file distributed
13 * with this work for additional information regarding copyright
14 * ownership. The ASF licenses this file to you under the Apache
15 * License, Version 2.0 (the "License"); you may not use this file
16 * except in compliance with the License. You may obtain a copy of
17 * the License at http://www.apache.org/licenses/LICENSE-2.0 .
20 #include "sal/config.h"
26 #include "com/sun/star/container/NoSuchElementException.hpp"
27 #include "com/sun/star/uno/Reference.hxx"
28 #include "com/sun/star/uno/RuntimeException.hpp"
29 #include "com/sun/star/uno/XInterface.hpp"
31 #include "rtl/string.h"
32 #include "rtl/ustring.hxx"
33 #include "sal/log.hxx"
34 #include "sal/types.h"
35 #include "xmlreader/pad.hxx"
36 #include "xmlreader/span.hxx"
37 #include "xmlreader/xmlreader.hxx"
43 bool isSpace(char c
) {
// Constructs a reader for the file at fileUrl.  The visible steps are: open
// the file read-only, query its size, and (on success) let pos_/end_ delimit
// fileSize_ bytes starting at fileAddress_.
// Throws css::container::NoSuchElementException when opening reports
// osl_File_E_NOENT, and css::uno::RuntimeException with a "cannot open" or
// "cannot mmap" message for the other failures handled below.
// NOTE(review): several lines of this function are not visible in this
// listing, among them the name of the call that receives
// osl_File_MapFlag_WillNeed and fills fileAddress_; the "cannot mmap" message
// and the unmap call in the destructor suggest a file mapping -- confirm
// against the full source.
57 XmlReader::XmlReader(OUString
const & fileUrl
)
59 css::container::NoSuchElementException
, css::uno::RuntimeException
)):
62 oslFileError e
= osl_openFile(
63 fileUrl_
.pData
, &fileHandle_
, osl_File_OpenFlag_Read
);
68 case osl_File_E_NOENT
:
69 throw css::container::NoSuchElementException(
70 fileUrl_
, css::uno::Reference
< css::uno::XInterface
>());
72 throw css::uno::RuntimeException(
73 "cannot open " + fileUrl_
+ ": " + OUString::number(e
),
74 css::uno::Reference
< css::uno::XInterface
>());
76 e
= osl_getFileSize(fileHandle_
, &fileSize_
);
77 if (e
== osl_File_E_None
) {
79 fileHandle_
, &fileAddress_
, fileSize_
, 0,
80 osl_File_MapFlag_WillNeed
);
// Failure after a successful open: close the handle again before throwing.
// A failing close only feeds the "osl_closeFile ... failed with" message
// (presumably a log statement; its first line is not visible here).
82 if (e
!= osl_File_E_None
) {
83 oslFileError e2
= osl_closeFile(fileHandle_
);
84 if (e2
!= osl_File_E_None
) {
87 "osl_closeFile of \"" << fileUrl_
<< "\" failed with " << +e2
);
89 throw css::uno::RuntimeException(
90 "cannot mmap " + fileUrl_
+ " (" + OUString::number(e
) + ")",
91 css::uno::Reference
< css::uno::XInterface
>());
// Seed the namespace tables with the predefined "xml" prefix and its IRI.
93 namespaceIris_
.push_back(Span("http://www.w3.org/XML/1998/namespace"));
94 namespaces_
.push_back(NamespaceData(Span("xml"), NAMESPACE_XML
));
// Parsing starts at the first byte of the file data, in content state.
95 pos_
= static_cast< char * >(fileAddress_
);
96 end_
= pos_
+ fileSize_
;
97 state_
= STATE_CONTENT
;
98 firstAttribute_
= true;
// Releases the file resources: first unmaps fileAddress_/fileSize_ from
// fileHandle_, then closes fileHandle_.  A failure of either call only feeds
// the corresponding "... failed with" message (presumably a log statement; the
// line that starts it is not visible in this listing); no exception is thrown
// in the visible code.
101 XmlReader::~XmlReader() {
102 oslFileError e
= osl_unmapMappedFile(fileHandle_
, fileAddress_
, fileSize_
);
103 if (e
!= osl_File_E_None
) {
106 "osl_unmapMappedFile of \"" << fileUrl_
<< "\" failed with " << +e
);
108 e
= osl_closeFile(fileHandle_
);
109 if (e
!= osl_File_E_None
) {
112 "osl_closeFile of \"" << fileUrl_
<< "\" failed with " << +e
);
// Appends iri to namespaceIris_.  Its id is derived (via toNamespaceId) from
// the size of namespaceIris_ before the push_back, i.e. from the index the IRI
// is stored at.  For the XMLSchema-instance IRI the prefix "xsi" is bound to
// that id right away (see the comment inside for the reason).
// NOTE(review): the return statement is not visible in this listing;
// presumably the function returns id.
116 int XmlReader::registerNamespaceIri(Span
const & iri
) {
117 int id
= toNamespaceId(namespaceIris_
.size());
118 namespaceIris_
.push_back(iri
);
119 if (iri
.equals("http://www.w3.org/2001/XMLSchema-instance")) {
120 // Old user layer .xcu files used the xsi namespace prefix without
121 // declaring a corresponding namespace binding, see issue 77174; reading
122 // those files during migration would fail without this hack that can be
123 // removed once migration is no longer relevant (see
124 // configmgr::Components::parseModificationLayer):
125 namespaces_
.push_back(NamespaceData(Span("xsi"), id
));
// Produces the next Result by dispatching to the handle* member functions.
// From the visible cases: the kind of text handling is selected by reportText
// (handleSkippedText, handleRawText, or handleNormalizedText for
// TEXT_NORMALIZED), a pending start tag is processed by handleStartTag and an
// end tag by handleEndTag.  data and nsId are passed through to those
// handlers as out-parameters.
// NOTE(review): the STATE_* labels presumably belong to an enclosing switch on
// state_ that is not visible in this listing; the bodies of the
// STATE_EMPTY_ELEMENT_TAG and default (STATE_DONE) cases are missing as well.
130 XmlReader::Result
XmlReader::nextItem(Text reportText
, Span
* data
, int * nsId
)
134 switch (reportText
) {
136 return handleSkippedText(data
, nsId
);
138 return handleRawText(data
);
139 case TEXT_NORMALIZED
:
140 return handleNormalizedText(data
);
142 case STATE_START_TAG
:
143 return handleStartTag(nsId
, data
);
145 return handleEndTag();
146 case STATE_EMPTY_ELEMENT_TAG
:
149 default: // STATE_DONE
// Steps currentAttribute_ through attributes_ and reports the namespace id and
// local name of the attribute it then refers to.  Both out-parameters must be
// non-null (asserted).
// While firstAttribute_ is set, iteration (re)starts at attributes_.begin()
// and the flag is cleared.
// A name without a colon (nameColon == 0) yields NAMESPACE_NONE and the range
// [nameBegin, nameEnd); otherwise the part before the colon is looked up as a
// prefix with getNamespaceId and the range after the colon is used.
// NOTE(review): the advance of currentAttribute_ on later calls, the return
// statements and the assignments that consume the two argument lists below are
// not visible in this listing; presumably false is returned at
// attributes_.end() and the ranges are stored as Spans in *localName.
154 bool XmlReader::nextAttribute(int * nsId
, Span
* localName
) {
155 assert(nsId
!= 0 && localName
!= 0);
156 if (firstAttribute_
) {
157 currentAttribute_
= attributes_
.begin();
158 firstAttribute_
= false;
162 if (currentAttribute_
== attributes_
.end()) {
165 if (currentAttribute_
->nameColon
== 0) {
166 *nsId
= NAMESPACE_NONE
;
168 currentAttribute_
->nameBegin
,
169 currentAttribute_
->nameEnd
- currentAttribute_
->nameBegin
);
171 *nsId
= getNamespaceId(
173 currentAttribute_
->nameBegin
,
174 currentAttribute_
->nameColon
- currentAttribute_
->nameBegin
));
176 currentAttribute_
->nameColon
+ 1,
177 currentAttribute_
->nameEnd
- (currentAttribute_
->nameColon
+ 1));
// Returns the result of handleAttributeValue for the value range
// [valueBegin, valueEnd) of the attribute currentAttribute_ refers to.
// NOTE(review): the last argument of the call is not visible in this listing;
// presumably fullyNormalize is forwarded unchanged.
182 Span
XmlReader::getAttributeValue(bool fullyNormalize
) {
183 return handleAttributeValue(
184 currentAttribute_
->valueBegin
, currentAttribute_
->valueEnd
,
// Looks up prefix in namespaces_, walking from the last entry towards the
// first (reverse iterators), so a later binding of the same prefix is found
// before an earlier one.  When no entry matches, NAMESPACE_UNKNOWN is
// returned.
// NOTE(review): the statement executed on a match is not visible in this
// listing; presumably it returns the namespace id stored in the entry.
188 int XmlReader::getNamespaceId(Span
const & prefix
) const {
189 for (NamespaceList::const_reverse_iterator
i(namespaces_
.rbegin());
190 i
!= namespaces_
.rend(); ++i
)
192 if (prefix
.equals(i
->prefix
)) {
196 return NAMESPACE_UNKNOWN
;
199 OUString
XmlReader::getUrl() const {
// Processes the bytes of text (text.begin, text.length) with respect to line
// ends: the visible code searches for carriage returns ('\x0D') and tests
// whether the remaining text is empty or does not continue with a line feed
// ('\x0A').
// NOTE(review): the loop around these statements and what is emitted for each
// piece are not visible in this listing; presumably CR LF and lone CR are
// replaced by a single LF while copying into pad_ -- confirm against the full
// source.
203 void XmlReader::normalizeLineEnds(Span
const & text
) {
204 char const * p
= text
.begin
;
205 sal_Int32 n
= text
.length
;
207 sal_Int32 i
= rtl_str_indexOfChar_WithLength(p
, n
, '\x0D');
214 if (n
== 0 || *p
!= '\x0A') {
// Loops for as long as isSpace(peek()) holds.
// NOTE(review): the loop body is not visible in this listing; presumably it
// consumes the character that was peeked.
221 void XmlReader::skipSpace() {
222 while (isSpace(peek())) {
// Tries to skip a comment whose leading "--" is expected at pos_.
// The text at pos_ is first compared against "--"; after that prefix has been
// stepped over, the next "--" is searched and pos_ is moved behind it.
// Throws css::uno::RuntimeException for a comment that is not terminated
// ("premature end (within comment)") and for an 'illegal "--" within comment'.
// NOTE(review): the conditions guarding the throws and the return statements
// are not visible in this listing; callers use "!skipComment()" to fall back
// to other constructs, so presumably false means "no comment at pos_".
227 bool XmlReader::skipComment() {
228 if (rtl_str_shortenedCompare_WithLength(
229 pos_
, end_
- pos_
, RTL_CONSTASCII_STRINGPARAM("--"),
230 RTL_CONSTASCII_LENGTH("--")) !=
235 pos_
+= RTL_CONSTASCII_LENGTH("--");
236 sal_Int32 i
= rtl_str_indexOfStr_WithLength(
237 pos_
, end_
- pos_
, RTL_CONSTASCII_STRINGPARAM("--"));
239 throw css::uno::RuntimeException(
240 "premature end (within comment) of " + fileUrl_
,
241 css::uno::Reference
< css::uno::XInterface
>());
243 pos_
+= i
+ RTL_CONSTASCII_LENGTH("--");
245 throw css::uno::RuntimeException(
246 "illegal \"--\" within comment in " + fileUrl_
,
247 css::uno::Reference
< css::uno::XInterface
>());
// Searches for "?>" in the remaining input [pos_, end_) and moves pos_ just
// behind it.  Throws css::uno::RuntimeException ("bad '<?' in ...").
// NOTE(review): the condition guarding the throw is not visible in this
// listing; presumably it fires when the search finds no "?>".
252 void XmlReader::skipProcessingInstruction() {
253 sal_Int32 i
= rtl_str_indexOfStr_WithLength(
254 pos_
, end_
- pos_
, RTL_CONSTASCII_STRINGPARAM("?>"));
256 throw css::uno::RuntimeException(
257 "bad '<?' in " + fileUrl_
,
258 css::uno::Reference
< css::uno::XInterface
>());
260 pos_
+= i
+ RTL_CONSTASCII_LENGTH("?>");
// Skips over a document type declaration without validating it (see the
// comment below).  Visible behaviour: a '\0' character, which the case labels
// treat as end of input, raises css::uno::RuntimeException ("premature end
// (within DTD)"); the same message is used after the searches for the
// character c; processing instructions met on the way are skipped with
// skipProcessingInstruction; a missing closing ">" is reported as an error
// too.
// NOTE(review): the switch statements, the origin of c and the conditions
// guarding the throws are not visible in this listing; c is presumably a quote
// delimiter whose matching counterpart is searched.
263 void XmlReader::skipDocumentTypeDeclaration() {
264 // Neither is it checked that the doctypedecl is at the correct position in
265 // the document, nor that it is well-formed:
269 case '\0': // i.e., EOF
270 throw css::uno::RuntimeException(
271 "premature end (within DTD) of " + fileUrl_
,
272 css::uno::Reference
< css::uno::XInterface
>());
276 sal_Int32 i
= rtl_str_indexOfChar_WithLength(
277 pos_
, end_
- pos_
, c
);
279 throw css::uno::RuntimeException(
280 "premature end (within DTD) of " + fileUrl_
,
281 css::uno::Reference
< css::uno::XInterface
>());
292 case '\0': // i.e., EOF
293 throw css::uno::RuntimeException(
294 "premature end (within DTD) of " + fileUrl_
,
295 css::uno::Reference
< css::uno::XInterface
>());
299 sal_Int32 i
= rtl_str_indexOfChar_WithLength(
300 pos_
, end_
- pos_
, c
);
302 throw css::uno::RuntimeException(
303 "premature end (within DTD) of " + fileUrl_
,
304 css::uno::Reference
< css::uno::XInterface
>());
311 case '\0': // i.e., EOF
312 throw css::uno::RuntimeException(
313 "premature end (within DTD) of " + fileUrl_
,
314 css::uno::Reference
< css::uno::XInterface
>());
319 skipProcessingInstruction();
328 throw css::uno::RuntimeException(
329 "missing \">\" of DTD in " + fileUrl_
,
330 css::uno::Reference
< css::uno::XInterface
>());
// Scans a CDATA section whose "[CDATA[" marker is expected at pos_.
// After the marker has been stepped over, the terminating "]]>" is searched;
// pos_ is moved behind the terminator and the returned Span covers exactly the
// i bytes between marker and terminator.
// Throws css::uno::RuntimeException ("premature end (within CDATA section)").
// NOTE(review): the result for input that does not start with "[CDATA[" and
// the condition guarding the throw are not visible in this listing; callers
// test the result with is(), so presumably an unset Span is returned in the
// first case and the throw fires when "]]>" is not found.
343 Span
XmlReader::scanCdataSection() {
344 if (rtl_str_shortenedCompare_WithLength(
345 pos_
, end_
- pos_
, RTL_CONSTASCII_STRINGPARAM("[CDATA["),
346 RTL_CONSTASCII_LENGTH("[CDATA[")) !=
351 pos_
+= RTL_CONSTASCII_LENGTH("[CDATA[");
352 char const * begin
= pos_
;
353 sal_Int32 i
= rtl_str_indexOfStr_WithLength(
354 pos_
, end_
- pos_
, RTL_CONSTASCII_STRINGPARAM("]]>"));
356 throw css::uno::RuntimeException(
357 "premature end (within CDATA section) of " + fileUrl_
,
358 css::uno::Reference
< css::uno::XInterface
>());
360 pos_
+= i
+ RTL_CONSTASCII_LENGTH("]]>");
361 return Span(begin
, i
);
// Advances pos_ over a name that starts at the current position and returns
// whether at least one character was consumed (pos_ != begin).
// nameColon must be non-null and must point to a null pointer on entry
// (asserted).
// NOTE(review): most of the switch inside the loop is not visible in this
// listing; presumably *nameColon receives the position of a ':' found in the
// name, and '\0' (end of input) is one of the characters that end the scan.
364 bool XmlReader::scanName(char const ** nameColon
) {
365 assert(nameColon
!= 0 && *nameColon
== 0);
366 for (char const * begin
= pos_
;; ++pos_
) {
368 case '\0': // i.e., EOF
376 return pos_
!= begin
;
// Maps the attribute value in [begin, end) to a namespace id.
// The value is first run through handleAttributeValue (without full
// normalization) and the result is compared against every entry of
// namespaceIris_ in index order; the id of the first equal entry (its index
// converted by toNamespaceId) is returned, or NAMESPACE_UNKNOWN when no entry
// is equal.  begin must be non-null and must not lie behind end (asserted).
386 int XmlReader::scanNamespaceIri(char const * begin
, char const * end
) {
387 assert(begin
!= 0 && begin
<= end
);
388 Span
iri(handleAttributeValue(begin
, end
, false));
389 for (NamespaceIris::size_type i
= 0; i
< namespaceIris_
.size(); ++i
) {
390 if (namespaceIris_
[i
].equals(iri
)) {
391 return toNamespaceId(i
);
394 return XmlReader::NAMESPACE_UNKNOWN
;
// Expands the reference that starts with '&' at position (asserted, together
// with position < end) and adds the replacement text to pad_.
//  - Character references ("&#x...;" hexadecimal, "&#...;" decimal): the
//    digits are accumulated into val, which is rejected as soon as it exceeds
//    0x10FFFF; at least one digit and a terminating ';' are required; code
//    points that fail the validity check below are rejected; the code point
//    is then UTF-8 encoded into buf and added with pad_.addEphemeral.
//  - Otherwise the text is compared against the five predefined entities
//    (amp, lt, gt, apos, quot) and the replacement of the matching one is
//    added with pad_.add.
// Every failure is reported as css::uno::RuntimeException naming fileUrl_.
// NOTE(review): the return statements are not visible in this listing;
// presumably the position just behind the reference is returned.
397 char const * XmlReader::handleReference(char const * position
, char const * end
)
399 assert(position
!= 0 && *position
== '&' && position
< end
);
401 if (*position
== '#') {
405 if (*position
== 'x') {
408 for (;; ++position
) {
410 if (c
>= '0' && c
<= '9') {
411 val
= 16 * val
+ (c
- '0');
412 } else if (c
>= 'A' && c
<= 'F') {
413 val
= 16 * val
+ (c
- 'A') + 10;
414 } else if (c
>= 'a' && c
<= 'f') {
415 val
= 16 * val
+ (c
- 'a') + 10;
419 if (val
> 0x10FFFF) { // avoid overflow
420 throw css::uno::RuntimeException(
421 "'&#x...' too large in " + fileUrl_
,
422 css::uno::Reference
< css::uno::XInterface
>());
427 for (;; ++position
) {
429 if (c
>= '0' && c
<= '9') {
430 val
= 10 * val
+ (c
- '0');
434 if (val
> 0x10FFFF) { // avoid overflow
435 throw css::uno::RuntimeException(
436 "'&#...' too large in " + fileUrl_
,
437 css::uno::Reference
< css::uno::XInterface
>());
// position == p means that no digit was consumed (p presumably marks where
// the digits started; its definition is not visible in this listing).
441 if (position
== p
|| *position
++ != ';') {
442 throw css::uno::RuntimeException(
443 "'&#...' missing ';' in " + fileUrl_
,
444 css::uno::Reference
< css::uno::XInterface
>());
446 assert(val
>= 0 && val
<= 0x10FFFF);
// Reject control characters other than tab, line feed and carriage return,
// the surrogate range 0xD800-0xDFFF, and 0xFFFE / 0xFFFF.
447 if ((val
< 0x20 && val
!= 0x9 && val
!= 0xA && val
!= 0xD) ||
448 (val
>= 0xD800 && val
<= 0xDFFF) || val
== 0xFFFE || val
== 0xFFFF)
450 throw css::uno::RuntimeException(
451 "character reference denoting invalid character in " + fileUrl_
,
452 css::uno::Reference
< css::uno::XInterface
>());
// UTF-8 encode val into buf: one byte as is, two bytes below 0x800, three
// bytes below 0x10000, four bytes otherwise.
457 buf
[0] = static_cast< char >(val
);
459 } else if (val
< 0x800) {
460 buf
[0] = static_cast< char >((val
>> 6) | 0xC0);
461 buf
[1] = static_cast< char >((val
& 0x3F) | 0x80);
463 } else if (val
< 0x10000) {
464 buf
[0] = static_cast< char >((val
>> 12) | 0xE0);
465 buf
[1] = static_cast< char >(((val
>> 6) & 0x3F) | 0x80);
466 buf
[2] = static_cast< char >((val
& 0x3F) | 0x80);
469 buf
[0] = static_cast< char >((val
>> 18) | 0xF0);
470 buf
[1] = static_cast< char >(((val
>> 12) & 0x3F) | 0x80);
471 buf
[2] = static_cast< char >(((val
>> 6) & 0x3F) | 0x80);
472 buf
[3] = static_cast< char >((val
& 0x3F) | 0x80);
475 pad_
.addEphemeral(buf
, len
);
479 char const * inBegin
;
481 char const * outBegin
;
// Table of the predefined entities: text expected after the '&' (including
// the terminating ';') and the replacement text it stands for.
484 static EntityRef
const refs
[] = {
485 { RTL_CONSTASCII_STRINGPARAM("amp;"),
486 RTL_CONSTASCII_STRINGPARAM("&") },
487 { RTL_CONSTASCII_STRINGPARAM("lt;"),
488 RTL_CONSTASCII_STRINGPARAM("<") },
489 { RTL_CONSTASCII_STRINGPARAM("gt;"),
490 RTL_CONSTASCII_STRINGPARAM(">") },
491 { RTL_CONSTASCII_STRINGPARAM("apos;"),
492 RTL_CONSTASCII_STRINGPARAM("'") },
493 { RTL_CONSTASCII_STRINGPARAM("quot;"),
494 RTL_CONSTASCII_STRINGPARAM("\"") } };
495 for (std::size_t i
= 0; i
< sizeof refs
/ sizeof refs
[0]; ++i
) {
496 if (rtl_str_shortenedCompare_WithLength(
497 position
, end
- position
, refs
[i
].inBegin
, refs
[i
].inLength
,
501 position
+= refs
[i
].inLength
;
502 pad_
.add(refs
[i
].outBegin
, refs
[i
].outLength
);
506 throw css::uno::RuntimeException(
507 "unknown entity reference in " + fileUrl_
,
508 css::uno::Reference
< css::uno::XInterface
>());
// Builds the normalized form of the attribute value in [begin, end) in pad_.
// With fullyNormalize, leading and trailing isSpace characters are trimmed
// first and a Space state tracks whitespace while the value is copied; in
// both modes references are expanded through handleReference and the
// stretches between special characters are copied with
// pad_.add(begin, p - begin).
// NOTE(review): the switch statements, the replacement text added for
// whitespace and the return statement are not visible in this listing;
// presumably the Span accumulated in pad_ is returned.
512 Span
XmlReader::handleAttributeValue(
513 char const * begin
, char const * end
, bool fullyNormalize
)
516 if (fullyNormalize
) {
517 while (begin
!= end
&& isSpace(*begin
)) {
520 while (end
!= begin
&& isSpace(end
[-1])) {
523 char const * p
= begin
;
524 enum Space
{ SPACE_NONE
, SPACE_SPAN
, SPACE_BREAK
};
525 // a single true space character can go into the current span,
526 // everything else breaks the span
527 Space space
= SPACE_NONE
;
535 pad_
.add(begin
, p
- begin
);
540 pad_
.add(begin
, p
- begin
);
555 pad_
.add(begin
, p
- begin
);
565 pad_
.add(begin
, p
- begin
);
566 p
= handleReference(p
, end
);
576 pad_
.add(begin
, p
- begin
);
578 char const * p
= begin
;
583 pad_
.add(begin
, p
- begin
);
588 pad_
.add(begin
, p
- begin
);
// NOTE(review): peek() takes no arguments, so it cannot look at the local
// cursor p that the surrounding code advances through [begin, end); every
// other visible position test in this function uses p, begin or end.  This
// looks like it should test the character at p (with a p != end guard)
// instead -- confirm against the definition of peek() and the full source.
590 if (peek() == '\x0A') {
597 pad_
.add(begin
, p
- begin
);
598 p
= handleReference(p
, end
);
606 pad_
.add(begin
, p
- begin
);
// Parses a start tag (or empty-element tag) whose name begins at pos_.
// Visible steps:
//  - scan the element name (scanName), remembering the number of namespace
//    bindings that existed before this element (inheritedNamespaces);
//  - for each attribute: scan its name, require '=' and a quoted value
//    (delimiter ' or "), and locate the closing delimiter; namespace
//    declarations update defaultNsId / namespaces_, all other attributes are
//    appended to attributes_;
//  - set state_ to STATE_EMPTY_ELEMENT_TAG or STATE_CONTENT and reset
//    firstAttribute_ so that attribute iteration starts over;
//  - report the element through *nsId and *localName, split at the colon of
//    the element name if there is one.
// nsId and localName must be non-null (asserted).  Malformed input is
// reported as css::uno::RuntimeException naming fileUrl_.
// NOTE(review): many lines (loop header, several conditions, the push onto
// elements_, the return statement) are not visible in this listing.
611 XmlReader::Result
XmlReader::handleStartTag(int * nsId
, Span
* localName
) {
612 assert(nsId
!= 0 && localName
);
613 char const * nameBegin
= pos_
;
614 char const * nameColon
= 0;
615 if (!scanName(&nameColon
)) {
616 throw css::uno::RuntimeException(
617 "bad tag name in " + fileUrl_
,
618 css::uno::Reference
< css::uno::XInterface
>());
620 char const * nameEnd
= pos_
;
621 NamespaceList::size_type inheritedNamespaces
= namespaces_
.size();
622 bool hasDefaultNs
= false;
623 int defaultNsId
= NAMESPACE_NONE
;
626 char const * p
= pos_
;
628 if (peek() == '/' || peek() == '>') {
632 throw css::uno::RuntimeException(
633 "missing whitespace before attribute in " + fileUrl_
,
634 css::uno::Reference
< css::uno::XInterface
>());
636 char const * attrNameBegin
= pos_
;
637 char const * attrNameColon
= 0;
638 if (!scanName(&attrNameColon
)) {
639 throw css::uno::RuntimeException(
640 "bad attribute name in " + fileUrl_
,
641 css::uno::Reference
< css::uno::XInterface
>());
643 char const * attrNameEnd
= pos_
;
646 throw css::uno::RuntimeException(
647 "missing '=' in " + fileUrl_
,
648 css::uno::Reference
< css::uno::XInterface
>());
652 if (del
!= '\'' && del
!= '"') {
653 throw css::uno::RuntimeException(
654 "bad attribute value in " + fileUrl_
,
655 css::uno::Reference
< css::uno::XInterface
>());
657 char const * valueBegin
= pos_
;
658 sal_Int32 i
= rtl_str_indexOfChar_WithLength(pos_
, end_
- pos_
, del
);
660 throw css::uno::RuntimeException(
661 "unterminated attribute value in " + fileUrl_
,
662 css::uno::Reference
< css::uno::XInterface
>());
664 char const * valueEnd
= pos_
+ i
;
// An unprefixed attribute named "xmlns" declares the default namespace of
// this element.
666 if (attrNameColon
== 0 &&
667 Span(attrNameBegin
, attrNameEnd
- attrNameBegin
).equals("xmlns"))
670 defaultNsId
= scanNamespaceIri(valueBegin
, valueEnd
);
// A prefixed attribute whose prefix matches (the compared text is not visible
// in this listing, presumably "xmlns") binds the part after the colon as a
// namespace prefix.
671 } else if (attrNameColon
!= 0 &&
672 Span(attrNameBegin
, attrNameColon
- attrNameBegin
).equals(
675 namespaces_
.push_back(
677 Span(attrNameColon
+ 1, attrNameEnd
- (attrNameColon
+ 1)),
678 scanNamespaceIri(valueBegin
, valueEnd
)));
// Every other attribute is recorded in attributes_.
680 attributes_
.push_back(
682 attrNameBegin
, attrNameEnd
, attrNameColon
, valueBegin
,
// Without a default namespace declaration of its own, the element inherits
// the default namespace of the enclosing element, if there is one.
686 if (!hasDefaultNs
&& !elements_
.empty()) {
687 defaultNsId
= elements_
.top().defaultNamespaceId
;
689 firstAttribute_
= true;
691 state_
= STATE_EMPTY_ELEMENT_TAG
;
694 state_
= STATE_CONTENT
;
697 throw css::uno::RuntimeException(
698 "missing '>' in " + fileUrl_
,
699 css::uno::Reference
< css::uno::XInterface
>());
704 Span(nameBegin
, nameEnd
- nameBegin
), inheritedNamespaces
,
706 if (nameColon
== 0) {
708 *localName
= Span(nameBegin
, nameEnd
- nameBegin
);
710 *nsId
= getNamespaceId(Span(nameBegin
, nameColon
- nameBegin
));
711 *localName
= Span(nameColon
+ 1, nameEnd
- (nameColon
+ 1));
// Parses an end tag whose name begins at pos_ and checks it against the
// innermost open element.
// Throws css::uno::RuntimeException when no element is open ("spurious end
// tag"), when the name cannot be scanned or differs from
// elements_.top().name ("tag mismatch"), and for a missing '>'.
// NOTE(review): the check for '>' and the code that finishes the element and
// returns the Result are not visible in this listing.
716 XmlReader::Result
XmlReader::handleEndTag() {
717 if (elements_
.empty()) {
718 throw css::uno::RuntimeException(
719 "spurious end tag in " + fileUrl_
,
720 css::uno::Reference
< css::uno::XInterface
>());
722 char const * nameBegin
= pos_
;
723 char const * nameColon
= 0;
724 if (!scanName(&nameColon
) ||
725 !elements_
.top().name
.equals(nameBegin
, pos_
- nameBegin
))
727 throw css::uno::RuntimeException(
728 "tag mismatch in " + fileUrl_
,
729 css::uno::Reference
< css::uno::XInterface
>());
734 throw css::uno::RuntimeException(
735 "missing '>' in " + fileUrl_
,
736 css::uno::Reference
< css::uno::XInterface
>());
// Finishes the innermost open element (elements_ must not be empty,
// asserted): namespaces_ is cut back to the size recorded for the element as
// inheritedNamespaces, which discards the bindings added after that point, and
// state_ becomes STATE_DONE when no element remains open, STATE_CONTENT
// otherwise.
// NOTE(review): the statement that removes the element from elements_ is not
// visible in this listing (presumably elements_.pop() between the two
// statements below).
742 void XmlReader::handleElementEnd() {
743 assert(!elements_
.empty());
744 namespaces_
.resize(elements_
.top().inheritedNamespaces
);
746 state_
= elements_
.empty() ? STATE_DONE
: STATE_CONTENT
;
// Variant of content handling that does not report text: the input is
// searched for the next '<', and the markup found there is either skipped
// (comment, CDATA section, document type declaration, processing
// instruction) or handed to handleEndTag / handleStartTag, whose Result is
// returned.  data and nsId are only forwarded to handleStartTag.
// Throws css::uno::RuntimeException ("premature end of ...").
// NOTE(review): the loop, the switch on the character after '<' and the
// condition guarding the throw are not visible in this listing; presumably the
// throw fires when no '<' is found.
749 XmlReader::Result
XmlReader::handleSkippedText(Span
* data
, int * nsId
) {
751 sal_Int32 i
= rtl_str_indexOfChar_WithLength(pos_
, end_
- pos_
, '<');
753 throw css::uno::RuntimeException(
754 "premature end of " + fileUrl_
,
755 css::uno::Reference
< css::uno::XInterface
>());
// Neither a comment nor a CDATA section: treat it as a DTD.
761 if (!skipComment() && !scanCdataSection().is()) {
762 skipDocumentTypeDeclaration();
767 return handleEndTag();
770 skipProcessingInstruction();
773 return handleStartTag(nsId
, data
);
// Collects character data starting at pos_ into pad_ without whitespace
// normalization.  Visible behaviour: unprocessed stretches are copied with
// pad_.add(begin, pos_ - begin); references are expanded with
// handleReference; comments, document type declarations and processing
// instructions are skipped; CDATA content goes through normalizeLineEnds; an
// end tag or start tag ends the text and sets state_ to STATE_END_TAG or
// STATE_START_TAG.  A '\0' character, which the case label treats as end of
// input, raises css::uno::RuntimeException ("premature end of ...").
// NOTE(review): the switch, the line-end handling around the peek() test and
// the return statements are not visible in this listing; presumably *text
// receives the Span accumulated in pad_.
778 XmlReader::Result
XmlReader::handleRawText(Span
* text
) {
780 for (char const * begin
= pos_
;;) {
782 case '\0': // i.e., EOF
783 throw css::uno::RuntimeException(
784 "premature end of " + fileUrl_
,
785 css::uno::Reference
< css::uno::XInterface
>());
787 pad_
.add(begin
, pos_
- begin
);
789 if (peek() != '\x0A') {
795 pad_
.add(begin
, pos_
- begin
);
796 pos_
= handleReference(pos_
, end_
);
800 pad_
.add(begin
, pos_
- begin
);
805 if (!skipComment()) {
806 Span
cdata(scanCdataSection());
808 normalizeLineEnds(cdata
);
810 skipDocumentTypeDeclaration();
818 state_
= STATE_END_TAG
;
822 skipProcessingInstruction();
827 state_
= STATE_START_TAG
;
// Collects character data starting at pos_ into pad_ with whitespace
// normalization.  flowBegin/flowEnd delimit the current run ("flow") of text
// that can be copied in one piece, and the Space state records how whitespace
// was last seen (see the comment at its declaration).  Visible behaviour:
// flows are copied with pad_.add, either up to pos_ or up to flowEnd;
// references are expanded with handleReference; CDATA content is added
// through normalizeLineEnds without further normalization (see the comment
// inside); document type declarations and processing instructions are
// skipped; an end tag or start tag ends the text and sets state_ to
// STATE_END_TAG or STATE_START_TAG.  A '\0' character, which the case label
// treats as end of input, raises css::uno::RuntimeException ("premature end
// of ...").
// NOTE(review): the loop, the switch, all Space transitions and the return
// statements are not visible in this listing; presumably *text receives the
// Span accumulated in pad_.
838 XmlReader::Result
XmlReader::handleNormalizedText(Span
* text
) {
840 char const * flowBegin
= pos_
;
841 char const * flowEnd
= pos_
;
842 enum Space
{ SPACE_START
, SPACE_NONE
, SPACE_SPAN
, SPACE_BREAK
};
843 // a single true space character can go into the current flow,
844 // everything else breaks the flow
845 Space space
= SPACE_START
;
848 case '\0': // i.e., EOF
849 throw css::uno::RuntimeException(
850 "premature end of " + fileUrl_
,
851 css::uno::Reference
< css::uno::XInterface
>());
886 pad_
.add(flowBegin
, pos_
- flowBegin
);
889 pad_
.add(flowBegin
, flowEnd
- flowBegin
);
893 pos_
= handleReference(pos_
, end_
);
906 Span
cdata(scanCdataSection());
908 // CDATA is not normalized (similar to character
909 // references; it keeps the code simple), but it might
910 // arguably be better to normalize it:
916 pad_
.add(flowBegin
, pos_
- flowBegin
);
919 pad_
.add(flowBegin
, flowEnd
- flowBegin
);
923 normalizeLineEnds(cdata
);
928 skipDocumentTypeDeclaration();
934 pad_
.add(flowBegin
, flowEnd
- flowBegin
);
936 state_
= STATE_END_TAG
;
940 skipProcessingInstruction();
944 pad_
.add(flowBegin
, flowEnd
- flowBegin
);
946 state_
= STATE_START_TAG
;
959 pad_
.add(flowBegin
, flowEnd
- flowBegin
);
// Converts a position in the namespace IRI list to the int used as namespace
// id.  The position must not exceed INT_MAX; this is only checked by an
// assert, the conversion itself is a plain static_cast.
971 int XmlReader::toNamespaceId(NamespaceIris::size_type pos
) {
972 assert(pos
<= INT_MAX
);
973 return static_cast< int >(pos
);
978 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */