1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
3 * This file is part of the LibreOffice project.
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 * This file incorporates work covered by the following license notice:
11 * Licensed to the Apache Software Foundation (ASF) under one or more
12 * contributor license agreements. See the NOTICE file distributed
13 * with this work for additional information regarding copyright
14 * ownership. The ASF licenses this file to you under the Apache
15 * License, Version 2.0 (the "License"); you may not use this file
16 * except in compliance with the License. You may obtain a copy of
17 * the License at http://www.apache.org/licenses/LICENSE-2.0 .
20 #include <sal/config.h>
26 #include <com/sun/star/container/NoSuchElementException.hpp>
27 #include <com/sun/star/uno/Reference.hxx>
28 #include <com/sun/star/uno/RuntimeException.hpp>
29 #include <com/sun/star/uno/XInterface.hpp>
31 #include <rtl/string.h>
32 #include <rtl/ustring.hxx>
33 #include <sal/log.hxx>
34 #include <sal/types.h>
35 #include <xmlreader/pad.hxx>
36 #include <xmlreader/span.hxx>
37 #include <xmlreader/xmlreader.hxx>
43 bool isSpace(char c
) {
57 XmlReader::XmlReader(char const *sStr
, size_t nLength
)
63 namespaceIris_
.push_back(Span("http://www.w3.org/XML/1998/namespace"));
64 namespaces_
.push_back(NamespaceData(Span("xml"), NAMESPACE_XML
));
66 end_
= pos_
+ nLength
;
67 state_
= STATE_CONTENT
;
68 firstAttribute_
= true;
71 XmlReader::XmlReader(OUString
const & fileUrl
)
75 oslFileError e
= osl_openFile(
76 fileUrl_
.pData
, &fileHandle_
, osl_File_OpenFlag_Read
);
81 case osl_File_E_NOENT
:
82 throw css::container::NoSuchElementException( fileUrl_
);
84 throw css::uno::RuntimeException(
85 "cannot open " + fileUrl_
+ ": " + OUString::number(e
));
87 e
= osl_getFileSize(fileHandle_
, &fileSize_
);
88 if (e
== osl_File_E_None
) {
90 fileHandle_
, &fileAddress_
, fileSize_
, 0,
91 osl_File_MapFlag_WillNeed
);
93 if (e
!= osl_File_E_None
) {
94 oslFileError e2
= osl_closeFile(fileHandle_
);
95 if (e2
!= osl_File_E_None
) {
98 "osl_closeFile of \"" << fileUrl_
<< "\" failed with " << +e2
);
100 throw css::uno::RuntimeException(
101 "cannot mmap " + fileUrl_
+ " (" + OUString::number(e
) + ")" );
103 namespaceIris_
.push_back(Span("http://www.w3.org/XML/1998/namespace"));
104 namespaces_
.push_back(NamespaceData(Span("xml"), NAMESPACE_XML
));
105 pos_
= static_cast< char * >(fileAddress_
);
106 end_
= pos_
+ fileSize_
;
107 state_
= STATE_CONTENT
;
108 firstAttribute_
= true;
111 XmlReader::~XmlReader() {
114 oslFileError e
= osl_unmapMappedFile(fileHandle_
, fileAddress_
, fileSize_
);
115 if (e
!= osl_File_E_None
) {
118 "osl_unmapMappedFile of \"" << fileUrl_
<< "\" failed with " << +e
);
120 e
= osl_closeFile(fileHandle_
);
121 if (e
!= osl_File_E_None
) {
124 "osl_closeFile of \"" << fileUrl_
<< "\" failed with " << +e
);
128 int XmlReader::registerNamespaceIri(Span
const & iri
) {
129 int id
= toNamespaceId(namespaceIris_
.size());
130 namespaceIris_
.push_back(iri
);
131 if (iri
.equals("http://www.w3.org/2001/XMLSchema-instance")) {
132 // Old user layer .xcu files used the xsi namespace prefix without
133 // declaring a corresponding namespace binding, see issue 77174; reading
134 // those files during migration would fail without this hack that can be
135 // removed once migration is no longer relevant (see
136 // configmgr::Components::parseModificationLayer):
137 namespaces_
.push_back(NamespaceData(Span("xsi"), id
));
142 XmlReader::Result
XmlReader::nextItem(Text reportText
, Span
* data
, int * nsId
)
146 switch (reportText
) {
148 return handleSkippedText(data
, nsId
);
150 return handleRawText(data
);
151 case TEXT_NORMALIZED
:
152 return handleNormalizedText(data
);
154 case STATE_START_TAG
:
155 return handleStartTag(nsId
, data
);
157 return handleEndTag();
158 case STATE_EMPTY_ELEMENT_TAG
:
161 default: // STATE_DONE
166 bool XmlReader::nextAttribute(int * nsId
, Span
* localName
) {
167 assert(nsId
!= 0 && localName
!= 0);
168 if (firstAttribute_
) {
169 currentAttribute_
= attributes_
.begin();
170 firstAttribute_
= false;
174 if (currentAttribute_
== attributes_
.end()) {
177 if (currentAttribute_
->nameColon
== 0) {
178 *nsId
= NAMESPACE_NONE
;
180 currentAttribute_
->nameBegin
,
181 currentAttribute_
->nameEnd
- currentAttribute_
->nameBegin
);
183 *nsId
= getNamespaceId(
185 currentAttribute_
->nameBegin
,
186 currentAttribute_
->nameColon
- currentAttribute_
->nameBegin
));
188 currentAttribute_
->nameColon
+ 1,
189 currentAttribute_
->nameEnd
- (currentAttribute_
->nameColon
+ 1));
194 Span
XmlReader::getAttributeValue(bool fullyNormalize
) {
195 return handleAttributeValue(
196 currentAttribute_
->valueBegin
, currentAttribute_
->valueEnd
,
200 int XmlReader::getNamespaceId(Span
const & prefix
) const {
201 for (NamespaceList::const_reverse_iterator
i(namespaces_
.rbegin());
202 i
!= namespaces_
.rend(); ++i
)
204 if (prefix
.equals(i
->prefix
)) {
208 return NAMESPACE_UNKNOWN
;
212 void XmlReader::normalizeLineEnds(Span
const & text
) {
213 char const * p
= text
.begin
;
214 sal_Int32 n
= text
.length
;
216 sal_Int32 i
= rtl_str_indexOfChar_WithLength(p
, n
, '\x0D');
223 if (n
== 0 || *p
!= '\x0A') {
230 void XmlReader::skipSpace() {
231 while (isSpace(peek())) {
236 bool XmlReader::skipComment() {
237 if (rtl_str_shortenedCompare_WithLength(
238 pos_
, end_
- pos_
, RTL_CONSTASCII_STRINGPARAM("--"),
239 RTL_CONSTASCII_LENGTH("--")) !=
244 pos_
+= RTL_CONSTASCII_LENGTH("--");
245 sal_Int32 i
= rtl_str_indexOfStr_WithLength(
246 pos_
, end_
- pos_
, RTL_CONSTASCII_STRINGPARAM("--"));
248 throw css::uno::RuntimeException(
249 "premature end (within comment) of " + fileUrl_
);
251 pos_
+= i
+ RTL_CONSTASCII_LENGTH("--");
253 throw css::uno::RuntimeException(
254 "illegal \"--\" within comment in " + fileUrl_
);
259 void XmlReader::skipProcessingInstruction() {
260 sal_Int32 i
= rtl_str_indexOfStr_WithLength(
261 pos_
, end_
- pos_
, RTL_CONSTASCII_STRINGPARAM("?>"));
263 throw css::uno::RuntimeException(
264 "bad '<?' in " + fileUrl_
);
266 pos_
+= i
+ RTL_CONSTASCII_LENGTH("?>");
269 void XmlReader::skipDocumentTypeDeclaration() {
270 // Neither is it checked that the doctypedecl is at the correct position in
271 // the document, nor that it is well-formed:
275 case '\0': // i.e., EOF
276 throw css::uno::RuntimeException(
277 "premature end (within DTD) of " + fileUrl_
);
281 sal_Int32 i
= rtl_str_indexOfChar_WithLength(
282 pos_
, end_
- pos_
, c
);
284 throw css::uno::RuntimeException(
285 "premature end (within DTD) of " + fileUrl_
);
296 case '\0': // i.e., EOF
297 throw css::uno::RuntimeException(
298 "premature end (within DTD) of " + fileUrl_
);
302 sal_Int32 i
= rtl_str_indexOfChar_WithLength(
303 pos_
, end_
- pos_
, c
);
305 throw css::uno::RuntimeException(
306 "premature end (within DTD) of " + fileUrl_
);
313 case '\0': // i.e., EOF
314 throw css::uno::RuntimeException(
315 "premature end (within DTD) of " + fileUrl_
);
320 skipProcessingInstruction();
329 throw css::uno::RuntimeException(
330 "missing \">\" of DTD in " + fileUrl_
);
343 Span
XmlReader::scanCdataSection() {
344 if (rtl_str_shortenedCompare_WithLength(
345 pos_
, end_
- pos_
, RTL_CONSTASCII_STRINGPARAM("[CDATA["),
346 RTL_CONSTASCII_LENGTH("[CDATA[")) !=
351 pos_
+= RTL_CONSTASCII_LENGTH("[CDATA[");
352 char const * begin
= pos_
;
353 sal_Int32 i
= rtl_str_indexOfStr_WithLength(
354 pos_
, end_
- pos_
, RTL_CONSTASCII_STRINGPARAM("]]>"));
356 throw css::uno::RuntimeException(
357 "premature end (within CDATA section) of " + fileUrl_
);
359 pos_
+= i
+ RTL_CONSTASCII_LENGTH("]]>");
360 return Span(begin
, i
);
363 bool XmlReader::scanName(char const ** nameColon
) {
364 assert(nameColon
!= 0 && *nameColon
== 0);
365 for (char const * begin
= pos_
;; ++pos_
) {
367 case '\0': // i.e., EOF
375 return pos_
!= begin
;
385 int XmlReader::scanNamespaceIri(char const * begin
, char const * end
) {
386 assert(begin
!= 0 && begin
<= end
);
387 Span
iri(handleAttributeValue(begin
, end
, false));
388 for (NamespaceIris::size_type i
= 0; i
< namespaceIris_
.size(); ++i
) {
389 if (namespaceIris_
[i
].equals(iri
)) {
390 return toNamespaceId(i
);
393 return XmlReader::NAMESPACE_UNKNOWN
;
396 char const * XmlReader::handleReference(char const * position
, char const * end
)
398 assert(position
!= 0 && *position
== '&' && position
< end
);
400 if (*position
== '#') {
404 if (*position
== 'x') {
407 for (;; ++position
) {
409 if (c
>= '0' && c
<= '9') {
410 val
= 16 * val
+ (c
- '0');
411 } else if (c
>= 'A' && c
<= 'F') {
412 val
= 16 * val
+ (c
- 'A') + 10;
413 } else if (c
>= 'a' && c
<= 'f') {
414 val
= 16 * val
+ (c
- 'a') + 10;
418 if (val
> 0x10FFFF) { // avoid overflow
419 throw css::uno::RuntimeException(
420 "'&#x...' too large in " + fileUrl_
);
425 for (;; ++position
) {
427 if (c
>= '0' && c
<= '9') {
428 val
= 10 * val
+ (c
- '0');
432 if (val
> 0x10FFFF) { // avoid overflow
433 throw css::uno::RuntimeException(
434 "'&#...' too large in " + fileUrl_
);
438 if (position
== p
|| *position
++ != ';') {
439 throw css::uno::RuntimeException(
440 "'&#...' missing ';' in " + fileUrl_
);
442 assert(val
>= 0 && val
<= 0x10FFFF);
443 if ((val
< 0x20 && val
!= 0x9 && val
!= 0xA && val
!= 0xD) ||
444 (val
>= 0xD800 && val
<= 0xDFFF) || val
== 0xFFFE || val
== 0xFFFF)
446 throw css::uno::RuntimeException(
447 "character reference denoting invalid character in " + fileUrl_
);
452 buf
[0] = static_cast< char >(val
);
454 } else if (val
< 0x800) {
455 buf
[0] = static_cast< char >((val
>> 6) | 0xC0);
456 buf
[1] = static_cast< char >((val
& 0x3F) | 0x80);
458 } else if (val
< 0x10000) {
459 buf
[0] = static_cast< char >((val
>> 12) | 0xE0);
460 buf
[1] = static_cast< char >(((val
>> 6) & 0x3F) | 0x80);
461 buf
[2] = static_cast< char >((val
& 0x3F) | 0x80);
464 buf
[0] = static_cast< char >((val
>> 18) | 0xF0);
465 buf
[1] = static_cast< char >(((val
>> 12) & 0x3F) | 0x80);
466 buf
[2] = static_cast< char >(((val
>> 6) & 0x3F) | 0x80);
467 buf
[3] = static_cast< char >((val
& 0x3F) | 0x80);
470 pad_
.addEphemeral(buf
, len
);
474 char const * inBegin
;
476 char const * outBegin
;
479 static EntityRef
const refs
[] = {
480 { RTL_CONSTASCII_STRINGPARAM("amp;"),
481 RTL_CONSTASCII_STRINGPARAM("&") },
482 { RTL_CONSTASCII_STRINGPARAM("lt;"),
483 RTL_CONSTASCII_STRINGPARAM("<") },
484 { RTL_CONSTASCII_STRINGPARAM("gt;"),
485 RTL_CONSTASCII_STRINGPARAM(">") },
486 { RTL_CONSTASCII_STRINGPARAM("apos;"),
487 RTL_CONSTASCII_STRINGPARAM("'") },
488 { RTL_CONSTASCII_STRINGPARAM("quot;"),
489 RTL_CONSTASCII_STRINGPARAM("\"") } };
490 for (std::size_t i
= 0; i
< sizeof refs
/ sizeof refs
[0]; ++i
) {
491 if (rtl_str_shortenedCompare_WithLength(
492 position
, end
- position
, refs
[i
].inBegin
, refs
[i
].inLength
,
496 position
+= refs
[i
].inLength
;
497 pad_
.add(refs
[i
].outBegin
, refs
[i
].outLength
);
501 throw css::uno::RuntimeException(
502 "unknown entity reference in " + fileUrl_
);
506 Span
XmlReader::handleAttributeValue(
507 char const * begin
, char const * end
, bool fullyNormalize
)
510 if (fullyNormalize
) {
511 while (begin
!= end
&& isSpace(*begin
)) {
514 while (end
!= begin
&& isSpace(end
[-1])) {
517 char const * p
= begin
;
518 enum Space
{ SPACE_NONE
, SPACE_SPAN
, SPACE_BREAK
};
519 // a single true space character can go into the current span,
520 // everything else breaks the span
521 Space space
= SPACE_NONE
;
529 pad_
.add(begin
, p
- begin
);
534 pad_
.add(begin
, p
- begin
);
549 pad_
.add(begin
, p
- begin
);
559 pad_
.add(begin
, p
- begin
);
560 p
= handleReference(p
, end
);
570 pad_
.add(begin
, p
- begin
);
572 char const * p
= begin
;
577 pad_
.add(begin
, p
- begin
);
582 pad_
.add(begin
, p
- begin
);
584 if (peek() == '\x0A') {
591 pad_
.add(begin
, p
- begin
);
592 p
= handleReference(p
, end
);
600 pad_
.add(begin
, p
- begin
);
605 XmlReader::Result
XmlReader::handleStartTag(int * nsId
, Span
* localName
) {
606 assert(nsId
!= 0 && localName
);
607 char const * nameBegin
= pos_
;
608 char const * nameColon
= 0;
609 if (!scanName(&nameColon
)) {
610 throw css::uno::RuntimeException(
611 "bad tag name in " + fileUrl_
);
613 char const * nameEnd
= pos_
;
614 NamespaceList::size_type inheritedNamespaces
= namespaces_
.size();
615 bool hasDefaultNs
= false;
616 int defaultNsId
= NAMESPACE_NONE
;
619 char const * p
= pos_
;
621 if (peek() == '/' || peek() == '>') {
625 throw css::uno::RuntimeException(
626 "missing whitespace before attribute in " + fileUrl_
);
628 char const * attrNameBegin
= pos_
;
629 char const * attrNameColon
= 0;
630 if (!scanName(&attrNameColon
)) {
631 throw css::uno::RuntimeException(
632 "bad attribute name in " + fileUrl_
);
634 char const * attrNameEnd
= pos_
;
637 throw css::uno::RuntimeException(
638 "missing '=' in " + fileUrl_
);
642 if (del
!= '\'' && del
!= '"') {
643 throw css::uno::RuntimeException(
644 "bad attribute value in " + fileUrl_
);
646 char const * valueBegin
= pos_
;
647 sal_Int32 i
= rtl_str_indexOfChar_WithLength(pos_
, end_
- pos_
, del
);
649 throw css::uno::RuntimeException(
650 "unterminated attribute value in " + fileUrl_
);
652 char const * valueEnd
= pos_
+ i
;
654 if (attrNameColon
== 0 &&
655 Span(attrNameBegin
, attrNameEnd
- attrNameBegin
).equals("xmlns"))
658 defaultNsId
= scanNamespaceIri(valueBegin
, valueEnd
);
659 } else if (attrNameColon
!= 0 &&
660 Span(attrNameBegin
, attrNameColon
- attrNameBegin
).equals(
663 namespaces_
.push_back(
665 Span(attrNameColon
+ 1, attrNameEnd
- (attrNameColon
+ 1)),
666 scanNamespaceIri(valueBegin
, valueEnd
)));
668 attributes_
.push_back(
670 attrNameBegin
, attrNameEnd
, attrNameColon
, valueBegin
,
674 if (!hasDefaultNs
&& !elements_
.empty()) {
675 defaultNsId
= elements_
.top().defaultNamespaceId
;
677 firstAttribute_
= true;
679 state_
= STATE_EMPTY_ELEMENT_TAG
;
682 state_
= STATE_CONTENT
;
685 throw css::uno::RuntimeException(
686 "missing '>' in " + fileUrl_
);
691 Span(nameBegin
, nameEnd
- nameBegin
), inheritedNamespaces
,
693 if (nameColon
== 0) {
695 *localName
= Span(nameBegin
, nameEnd
- nameBegin
);
697 *nsId
= getNamespaceId(Span(nameBegin
, nameColon
- nameBegin
));
698 *localName
= Span(nameColon
+ 1, nameEnd
- (nameColon
+ 1));
703 XmlReader::Result
XmlReader::handleEndTag() {
704 if (elements_
.empty()) {
705 throw css::uno::RuntimeException(
706 "spurious end tag in " + fileUrl_
);
708 char const * nameBegin
= pos_
;
709 char const * nameColon
= 0;
710 if (!scanName(&nameColon
) ||
711 !elements_
.top().name
.equals(nameBegin
, pos_
- nameBegin
))
713 throw css::uno::RuntimeException(
714 "tag mismatch in " + fileUrl_
);
719 throw css::uno::RuntimeException(
720 "missing '>' in " + fileUrl_
);
726 void XmlReader::handleElementEnd() {
727 assert(!elements_
.empty());
728 namespaces_
.resize(elements_
.top().inheritedNamespaces
);
730 state_
= elements_
.empty() ? STATE_DONE
: STATE_CONTENT
;
733 XmlReader::Result
XmlReader::handleSkippedText(Span
* data
, int * nsId
) {
735 sal_Int32 i
= rtl_str_indexOfChar_WithLength(pos_
, end_
- pos_
, '<');
737 throw css::uno::RuntimeException(
738 "premature end of " + fileUrl_
);
744 if (!skipComment() && !scanCdataSection().is()) {
745 skipDocumentTypeDeclaration();
750 return handleEndTag();
753 skipProcessingInstruction();
756 return handleStartTag(nsId
, data
);
761 XmlReader::Result
XmlReader::handleRawText(Span
* text
) {
763 for (char const * begin
= pos_
;;) {
765 case '\0': // i.e., EOF
766 throw css::uno::RuntimeException(
767 "premature end of " + fileUrl_
);
769 pad_
.add(begin
, pos_
- begin
);
771 if (peek() != '\x0A') {
777 pad_
.add(begin
, pos_
- begin
);
778 pos_
= handleReference(pos_
, end_
);
782 pad_
.add(begin
, pos_
- begin
);
787 if (!skipComment()) {
788 Span
cdata(scanCdataSection());
790 normalizeLineEnds(cdata
);
792 skipDocumentTypeDeclaration();
800 state_
= STATE_END_TAG
;
804 skipProcessingInstruction();
809 state_
= STATE_START_TAG
;
820 XmlReader::Result
XmlReader::handleNormalizedText(Span
* text
) {
822 char const * flowBegin
= pos_
;
823 char const * flowEnd
= pos_
;
824 enum Space
{ SPACE_START
, SPACE_NONE
, SPACE_SPAN
, SPACE_BREAK
};
825 // a single true space character can go into the current flow,
826 // everything else breaks the flow
827 Space space
= SPACE_START
;
830 case '\0': // i.e., EOF
831 throw css::uno::RuntimeException(
832 "premature end of " + fileUrl_
);
867 pad_
.add(flowBegin
, pos_
- flowBegin
);
870 pad_
.add(flowBegin
, flowEnd
- flowBegin
);
874 pos_
= handleReference(pos_
, end_
);
887 Span
cdata(scanCdataSection());
889 // CDATA is not normalized (similar to character
890 // references; it keeps the code simple), but it might
891 // arguably be better to normalize it:
897 pad_
.add(flowBegin
, pos_
- flowBegin
);
900 pad_
.add(flowBegin
, flowEnd
- flowBegin
);
904 normalizeLineEnds(cdata
);
909 skipDocumentTypeDeclaration();
915 pad_
.add(flowBegin
, flowEnd
- flowBegin
);
917 state_
= STATE_END_TAG
;
921 skipProcessingInstruction();
925 pad_
.add(flowBegin
, flowEnd
- flowBegin
);
927 state_
= STATE_START_TAG
;
940 pad_
.add(flowBegin
, flowEnd
- flowBegin
);
952 int XmlReader::toNamespaceId(NamespaceIris::size_type pos
) {
953 assert(pos
<= INT_MAX
);
954 return static_cast< int >(pos
);
959 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */