Version 6.4.0.3, tag libreoffice-6.4.0.3
[LibreOffice.git] / xmlreader / source / xmlreader.cxx
blob85027b66ee99a0fef29b2353eabc99b7f30e4545
1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 /*
3 * This file is part of the LibreOffice project.
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 * This file incorporates work covered by the following license notice:
11 * Licensed to the Apache Software Foundation (ASF) under one or more
12 * contributor license agreements. See the NOTICE file distributed
13 * with this work for additional information regarding copyright
14 * ownership. The ASF licenses this file to you under the Apache
15 * License, Version 2.0 (the "License"); you may not use this file
16 * except in compliance with the License. You may obtain a copy of
17 * the License at http://www.apache.org/licenses/LICENSE-2.0 .
20 #include <sal/config.h>
22 #include <cassert>
23 #include <climits>
25 #include <com/sun/star/container/NoSuchElementException.hpp>
26 #include <com/sun/star/uno/RuntimeException.hpp>
27 #include <osl/file.h>
28 #include <rtl/character.hxx>
29 #include <rtl/string.h>
30 #include <rtl/ustring.hxx>
31 #include <sal/log.hxx>
32 #include <sal/types.h>
33 #include <xmlreader/pad.hxx>
34 #include <xmlreader/span.hxx>
35 #include <xmlreader/xmlreader.hxx>
37 namespace xmlreader {
39 namespace {
// Tells whether c is one of the four characters this reader treats as XML
// white space: tab, line feed, carriage return, or space.
bool isSpace(char c) {
    return c == '\x09' || c == '\x0A' || c == '\x0D' || c == ' ';
}
55 XmlReader::XmlReader(OUString const & fileUrl)
56 : fileUrl_(fileUrl)
57 , fileHandle_(nullptr)
59 oslFileError e = osl_openFile(
60 fileUrl_.pData, &fileHandle_, osl_File_OpenFlag_Read);
61 switch (e)
63 case osl_File_E_None:
64 break;
65 case osl_File_E_NOENT:
66 throw css::container::NoSuchElementException( fileUrl_ );
67 default:
68 throw css::uno::RuntimeException(
69 "cannot open " + fileUrl_ + ": " + OUString::number(e));
71 e = osl_getFileSize(fileHandle_, &fileSize_);
72 if (e == osl_File_E_None) {
73 e = osl_mapFile(
74 fileHandle_, &fileAddress_, fileSize_, 0,
75 osl_File_MapFlag_WillNeed);
77 if (e != osl_File_E_None) {
78 oslFileError e2 = osl_closeFile(fileHandle_);
79 if (e2 != osl_File_E_None) {
80 SAL_WARN(
81 "xmlreader",
82 "osl_closeFile of \"" << fileUrl_ << "\" failed with " << +e2);
84 throw css::uno::RuntimeException(
85 "cannot mmap " + fileUrl_ + " (" + OUString::number(e) + ")" );
87 namespaceIris_.emplace_back("http://www.w3.org/XML/1998/namespace");
88 namespaces_.emplace_back(Span("xml"), NAMESPACE_XML);
89 pos_ = static_cast< char * >(fileAddress_);
90 end_ = pos_ + fileSize_;
91 state_ = State::Content;
92 firstAttribute_ = true;
95 XmlReader::~XmlReader() {
96 if (!fileHandle_)
97 return;
98 oslFileError e = osl_unmapMappedFile(fileHandle_, fileAddress_, fileSize_);
99 if (e != osl_File_E_None) {
100 SAL_WARN(
101 "xmlreader",
102 "osl_unmapMappedFile of \"" << fileUrl_ << "\" failed with " << +e);
104 e = osl_closeFile(fileHandle_);
105 if (e != osl_File_E_None) {
106 SAL_WARN(
107 "xmlreader",
108 "osl_closeFile of \"" << fileUrl_ << "\" failed with " << +e);
112 int XmlReader::registerNamespaceIri(Span const & iri) {
113 int id = toNamespaceId(namespaceIris_.size());
114 namespaceIris_.push_back(iri);
115 if (iri == "http://www.w3.org/2001/XMLSchema-instance") {
116 // Old user layer .xcu files used the xsi namespace prefix without
117 // declaring a corresponding namespace binding, see issue 77174; reading
118 // those files during migration would fail without this hack that can be
119 // removed once migration is no longer relevant (see
120 // configmgr::Components::parseModificationLayer):
121 namespaces_.emplace_back(Span("xsi"), id);
123 return id;
126 XmlReader::Result XmlReader::nextItem(Text reportText, Span * data, int * nsId)
128 switch (state_) {
129 case State::Content:
130 switch (reportText) {
131 case Text::NONE:
132 return handleSkippedText(data, nsId);
133 case Text::Raw:
134 return handleRawText(data);
135 default: // Text::Normalized
136 return handleNormalizedText(data);
138 case State::StartTag:
139 return handleStartTag(nsId, data);
140 case State::EndTag:
141 return handleEndTag();
142 case State::EmptyElementTag:
143 handleElementEnd();
144 return Result::End;
145 default: // State::Done
146 return Result::Done;
150 bool XmlReader::nextAttribute(int * nsId, Span * localName) {
151 assert(nsId != nullptr && localName != nullptr);
152 if (firstAttribute_) {
153 currentAttribute_ = attributes_.begin();
154 firstAttribute_ = false;
155 } else {
156 ++currentAttribute_;
158 if (currentAttribute_ == attributes_.end()) {
159 return false;
161 if (currentAttribute_->nameColon == nullptr) {
162 *nsId = NAMESPACE_NONE;
163 *localName = Span(
164 currentAttribute_->nameBegin,
165 currentAttribute_->nameEnd - currentAttribute_->nameBegin);
166 } else {
167 *nsId = getNamespaceId(
168 Span(
169 currentAttribute_->nameBegin,
170 currentAttribute_->nameColon - currentAttribute_->nameBegin));
171 *localName = Span(
172 currentAttribute_->nameColon + 1,
173 currentAttribute_->nameEnd - (currentAttribute_->nameColon + 1));
175 return true;
178 Span XmlReader::getAttributeValue(bool fullyNormalize) {
179 return handleAttributeValue(
180 currentAttribute_->valueBegin, currentAttribute_->valueEnd,
181 fullyNormalize);
184 int XmlReader::getNamespaceId(Span const & prefix) const {
185 auto i = std::find_if(namespaces_.crbegin(), namespaces_.crend(),
186 [&prefix](const NamespaceData& rNamespaceData) { return prefix == rNamespaceData.prefix; });
188 if (i != namespaces_.rend())
189 return i->nsId;
191 return NAMESPACE_UNKNOWN;
195 void XmlReader::normalizeLineEnds(Span const & text) {
196 char const * p = text.begin;
197 sal_Int32 n = text.length;
198 for (;;) {
199 sal_Int32 i = rtl_str_indexOfChar_WithLength(p, n, '\x0D');
200 if (i < 0) {
201 break;
203 pad_.add(p, i);
204 p += i + 1;
205 n -= i + 1;
206 if (n == 0 || *p != '\x0A') {
207 pad_.add("\x0A");
210 pad_.add(p, n);
213 void XmlReader::skipSpace() {
214 while (isSpace(peek())) {
215 ++pos_;
219 bool XmlReader::skipComment() {
220 if (rtl_str_shortenedCompare_WithLength(
221 pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("--"),
222 RTL_CONSTASCII_LENGTH("--")) !=
225 return false;
227 pos_ += RTL_CONSTASCII_LENGTH("--");
228 sal_Int32 i = rtl_str_indexOfStr_WithLength(
229 pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("--"));
230 if (i < 0) {
231 throw css::uno::RuntimeException(
232 "premature end (within comment) of " + fileUrl_ );
234 pos_ += i + RTL_CONSTASCII_LENGTH("--");
235 if (read() != '>') {
236 throw css::uno::RuntimeException(
237 "illegal \"--\" within comment in " + fileUrl_ );
239 return true;
242 void XmlReader::skipProcessingInstruction() {
243 sal_Int32 i = rtl_str_indexOfStr_WithLength(
244 pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("?>"));
245 if (i < 0) {
246 throw css::uno::RuntimeException(
247 "bad '<?' in " + fileUrl_ );
249 pos_ += i + RTL_CONSTASCII_LENGTH("?>");
252 void XmlReader::skipDocumentTypeDeclaration() {
253 // Neither is it checked that the doctypedecl is at the correct position in
254 // the document, nor that it is well-formed:
255 for (;;) {
256 char c = read();
257 switch (c) {
258 case '\0': // i.e., EOF
259 throw css::uno::RuntimeException(
260 "premature end (within DTD) of " + fileUrl_ );
261 case '"':
262 case '\'':
264 sal_Int32 i = rtl_str_indexOfChar_WithLength(
265 pos_, end_ - pos_, c);
266 if (i < 0) {
267 throw css::uno::RuntimeException(
268 "premature end (within DTD) of " + fileUrl_ );
270 pos_ += i + 1;
272 break;
273 case '>':
274 return;
275 case '[':
276 for (;;) {
277 c = read();
278 switch (c) {
279 case '\0': // i.e., EOF
280 throw css::uno::RuntimeException(
281 "premature end (within DTD) of " + fileUrl_ );
282 case '"':
283 case '\'':
285 sal_Int32 i = rtl_str_indexOfChar_WithLength(
286 pos_, end_ - pos_, c);
287 if (i < 0) {
288 throw css::uno::RuntimeException(
289 "premature end (within DTD) of " + fileUrl_ );
291 pos_ += i + 1;
293 break;
294 case '<':
295 switch (read()) {
296 case '\0': // i.e., EOF
297 throw css::uno::RuntimeException(
298 "premature end (within DTD) of " + fileUrl_ );
299 case '!':
300 skipComment();
301 break;
302 case '?':
303 skipProcessingInstruction();
304 break;
305 default:
306 break;
308 break;
309 case ']':
310 skipSpace();
311 if (read() != '>') {
312 throw css::uno::RuntimeException(
313 "missing \">\" of DTD in " + fileUrl_ );
315 return;
316 default:
317 break;
320 default:
321 break;
326 Span XmlReader::scanCdataSection() {
327 if (rtl_str_shortenedCompare_WithLength(
328 pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("[CDATA["),
329 RTL_CONSTASCII_LENGTH("[CDATA[")) !=
332 return Span();
334 pos_ += RTL_CONSTASCII_LENGTH("[CDATA[");
335 char const * begin = pos_;
336 sal_Int32 i = rtl_str_indexOfStr_WithLength(
337 pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("]]>"));
338 if (i < 0) {
339 throw css::uno::RuntimeException(
340 "premature end (within CDATA section) of " + fileUrl_ );
342 pos_ += i + RTL_CONSTASCII_LENGTH("]]>");
343 return Span(begin, i);
346 bool XmlReader::scanName(char const ** nameColon) {
347 assert(nameColon != nullptr && *nameColon == nullptr);
348 for (char const * begin = pos_;; ++pos_) {
349 switch (peek()) {
350 case '\0': // i.e., EOF
351 case '\x09':
352 case '\x0A':
353 case '\x0D':
354 case ' ':
355 case '/':
356 case '=':
357 case '>':
358 return pos_ != begin;
359 case ':':
360 *nameColon = pos_;
361 break;
362 default:
363 break;
368 int XmlReader::scanNamespaceIri(char const * begin, char const * end) {
369 assert(begin != nullptr && begin <= end);
370 Span iri(handleAttributeValue(begin, end, false));
371 for (NamespaceIris::size_type i = 0; i < namespaceIris_.size(); ++i) {
372 if (namespaceIris_[i] == iri) {
373 return toNamespaceId(i);
376 return XmlReader::NAMESPACE_UNKNOWN;
379 char const * XmlReader::handleReference(char const * position, char const * end)
381 assert(position != nullptr && *position == '&' && position < end);
382 ++position;
383 if (*position == '#') {
384 ++position;
385 sal_uInt32 val = 0;
386 char const * p;
387 if (*position == 'x') {
388 ++position;
389 p = position;
390 for (;; ++position) {
391 char c = *position;
392 if (c >= '0' && c <= '9') {
393 val = 16 * val + (c - '0');
394 } else if (c >= 'A' && c <= 'F') {
395 val = 16 * val + (c - 'A') + 10;
396 } else if (c >= 'a' && c <= 'f') {
397 val = 16 * val + (c - 'a') + 10;
398 } else {
399 break;
401 if (!rtl::isUnicodeCodePoint(val)) { // avoid overflow
402 throw css::uno::RuntimeException(
403 "'&#x...' too large in " + fileUrl_ );
406 } else {
407 p = position;
408 for (;; ++position) {
409 char c = *position;
410 if (c >= '0' && c <= '9') {
411 val = 10 * val + (c - '0');
412 } else {
413 break;
415 if (!rtl::isUnicodeCodePoint(val)) { // avoid overflow
416 throw css::uno::RuntimeException(
417 "'&#...' too large in " + fileUrl_ );
421 if (position == p || *position++ != ';') {
422 throw css::uno::RuntimeException(
423 "'&#...' missing ';' in " + fileUrl_ );
425 assert(rtl::isUnicodeCodePoint(val));
426 if ((val < 0x20 && val != 0x9 && val != 0xA && val != 0xD) ||
427 (val >= 0xD800 && val <= 0xDFFF) || val == 0xFFFE || val == 0xFFFF)
429 throw css::uno::RuntimeException(
430 "character reference denoting invalid character in " + fileUrl_ );
432 char buf[4];
433 sal_Int32 len;
434 if (val < 0x80) {
435 buf[0] = static_cast< char >(val);
436 len = 1;
437 } else if (val < 0x800) {
438 buf[0] = static_cast< char >((val >> 6) | 0xC0);
439 buf[1] = static_cast< char >((val & 0x3F) | 0x80);
440 len = 2;
441 } else if (val < 0x10000) {
442 buf[0] = static_cast< char >((val >> 12) | 0xE0);
443 buf[1] = static_cast< char >(((val >> 6) & 0x3F) | 0x80);
444 buf[2] = static_cast< char >((val & 0x3F) | 0x80);
445 len = 3;
446 } else {
447 buf[0] = static_cast< char >((val >> 18) | 0xF0);
448 buf[1] = static_cast< char >(((val >> 12) & 0x3F) | 0x80);
449 buf[2] = static_cast< char >(((val >> 6) & 0x3F) | 0x80);
450 buf[3] = static_cast< char >((val & 0x3F) | 0x80);
451 len = 4;
453 pad_.addEphemeral(buf, len);
454 return position;
455 } else {
456 struct EntityRef {
457 char const * inBegin;
458 sal_Int32 const inLength;
459 char const * outBegin;
460 sal_Int32 const outLength;
462 static EntityRef const refs[] = {
463 { RTL_CONSTASCII_STRINGPARAM("amp;"),
464 RTL_CONSTASCII_STRINGPARAM("&") },
465 { RTL_CONSTASCII_STRINGPARAM("lt;"),
466 RTL_CONSTASCII_STRINGPARAM("<") },
467 { RTL_CONSTASCII_STRINGPARAM("gt;"),
468 RTL_CONSTASCII_STRINGPARAM(">") },
469 { RTL_CONSTASCII_STRINGPARAM("apos;"),
470 RTL_CONSTASCII_STRINGPARAM("'") },
471 { RTL_CONSTASCII_STRINGPARAM("quot;"),
472 RTL_CONSTASCII_STRINGPARAM("\"") } };
473 for (const auto & ref : refs) {
474 if (rtl_str_shortenedCompare_WithLength(
475 position, end - position, ref.inBegin, ref.inLength,
476 ref.inLength) ==
479 position += ref.inLength;
480 pad_.add(ref.outBegin, ref.outLength);
481 return position;
484 throw css::uno::RuntimeException(
485 "unknown entity reference in " + fileUrl_ );
489 Span XmlReader::handleAttributeValue(
490 char const * begin, char const * end, bool fullyNormalize)
492 pad_.clear();
493 if (fullyNormalize) {
494 while (begin != end && isSpace(*begin)) {
495 ++begin;
497 while (end != begin && isSpace(end[-1])) {
498 --end;
500 char const * p = begin;
501 enum Space { SPACE_NONE, SPACE_SPAN, SPACE_BREAK };
502 // a single true space character can go into the current span,
503 // everything else breaks the span
504 Space space = SPACE_NONE;
505 while (p != end) {
506 switch (*p) {
507 case '\x09':
508 case '\x0A':
509 case '\x0D':
510 switch (space) {
511 case SPACE_NONE:
512 pad_.add(begin, p - begin);
513 pad_.add(" ");
514 space = SPACE_BREAK;
515 break;
516 case SPACE_SPAN:
517 pad_.add(begin, p - begin);
518 space = SPACE_BREAK;
519 break;
520 case SPACE_BREAK:
521 break;
523 begin = ++p;
524 break;
525 case ' ':
526 switch (space) {
527 case SPACE_NONE:
528 ++p;
529 space = SPACE_SPAN;
530 break;
531 case SPACE_SPAN:
532 pad_.add(begin, p - begin);
533 begin = ++p;
534 space = SPACE_BREAK;
535 break;
536 case SPACE_BREAK:
537 begin = ++p;
538 break;
540 break;
541 case '&':
542 pad_.add(begin, p - begin);
543 p = handleReference(p, end);
544 begin = p;
545 space = SPACE_NONE;
546 break;
547 default:
548 ++p;
549 space = SPACE_NONE;
550 break;
553 pad_.add(begin, p - begin);
554 } else {
555 char const * p = begin;
556 while (p != end) {
557 switch (*p) {
558 case '\x09':
559 case '\x0A':
560 pad_.add(begin, p - begin);
561 begin = ++p;
562 pad_.add(" ");
563 break;
564 case '\x0D':
565 pad_.add(begin, p - begin);
566 ++p;
567 if (peek() == '\x0A') {
568 ++p;
570 begin = p;
571 pad_.add(" ");
572 break;
573 case '&':
574 pad_.add(begin, p - begin);
575 p = handleReference(p, end);
576 begin = p;
577 break;
578 default:
579 ++p;
580 break;
583 pad_.add(begin, p - begin);
585 return pad_.get();
588 XmlReader::Result XmlReader::handleStartTag(int * nsId, Span * localName) {
589 assert(nsId != nullptr && localName);
590 char const * nameBegin = pos_;
591 char const * nameColon = nullptr;
592 if (!scanName(&nameColon)) {
593 throw css::uno::RuntimeException(
594 "bad tag name in " + fileUrl_ );
596 char const * nameEnd = pos_;
597 NamespaceList::size_type inheritedNamespaces = namespaces_.size();
598 bool hasDefaultNs = false;
599 int defaultNsId = NAMESPACE_NONE;
600 attributes_.clear();
601 for (;;) {
602 char const * p = pos_;
603 skipSpace();
604 if (peek() == '/' || peek() == '>') {
605 break;
607 if (pos_ == p) {
608 throw css::uno::RuntimeException(
609 "missing whitespace before attribute in " + fileUrl_ );
611 char const * attrNameBegin = pos_;
612 char const * attrNameColon = nullptr;
613 if (!scanName(&attrNameColon)) {
614 throw css::uno::RuntimeException(
615 "bad attribute name in " + fileUrl_ );
617 char const * attrNameEnd = pos_;
618 skipSpace();
619 if (read() != '=') {
620 throw css::uno::RuntimeException(
621 "missing '=' in " + fileUrl_ );
623 skipSpace();
624 char del = read();
625 if (del != '\'' && del != '"') {
626 throw css::uno::RuntimeException(
627 "bad attribute value in " + fileUrl_ );
629 char const * valueBegin = pos_;
630 sal_Int32 i = rtl_str_indexOfChar_WithLength(pos_, end_ - pos_, del);
631 if (i < 0) {
632 throw css::uno::RuntimeException(
633 "unterminated attribute value in " + fileUrl_ );
635 char const * valueEnd = pos_ + i;
636 pos_ += i + 1;
637 if (attrNameColon == nullptr &&
638 Span(attrNameBegin, attrNameEnd - attrNameBegin) == "xmlns")
640 hasDefaultNs = true;
641 defaultNsId = scanNamespaceIri(valueBegin, valueEnd);
642 } else if (attrNameColon != nullptr &&
643 Span(attrNameBegin, attrNameColon - attrNameBegin) ==
644 "xmlns")
646 namespaces_.emplace_back(
647 Span(attrNameColon + 1, attrNameEnd - (attrNameColon + 1)),
648 scanNamespaceIri(valueBegin, valueEnd));
649 } else {
650 attributes_.emplace_back(
651 attrNameBegin, attrNameEnd, attrNameColon, valueBegin,
652 valueEnd);
655 if (!hasDefaultNs && !elements_.empty()) {
656 defaultNsId = elements_.top().defaultNamespaceId;
658 firstAttribute_ = true;
659 if (peek() == '/') {
660 state_ = State::EmptyElementTag;
661 ++pos_;
662 } else {
663 state_ = State::Content;
665 if (peek() != '>') {
666 throw css::uno::RuntimeException(
667 "missing '>' in " + fileUrl_ );
669 ++pos_;
670 elements_.push(
671 ElementData(
672 Span(nameBegin, nameEnd - nameBegin), inheritedNamespaces,
673 defaultNsId));
674 if (nameColon == nullptr) {
675 *nsId = defaultNsId;
676 *localName = Span(nameBegin, nameEnd - nameBegin);
677 } else {
678 *nsId = getNamespaceId(Span(nameBegin, nameColon - nameBegin));
679 *localName = Span(nameColon + 1, nameEnd - (nameColon + 1));
681 return Result::Begin;
684 XmlReader::Result XmlReader::handleEndTag() {
685 if (elements_.empty()) {
686 throw css::uno::RuntimeException(
687 "spurious end tag in " + fileUrl_ );
689 char const * nameBegin = pos_;
690 char const * nameColon = nullptr;
691 if (!scanName(&nameColon) ||
692 !elements_.top().name.equals(nameBegin, pos_ - nameBegin))
694 throw css::uno::RuntimeException(
695 "tag mismatch in " + fileUrl_ );
697 handleElementEnd();
698 skipSpace();
699 if (peek() != '>') {
700 throw css::uno::RuntimeException(
701 "missing '>' in " + fileUrl_ );
703 ++pos_;
704 return Result::End;
707 void XmlReader::handleElementEnd() {
708 assert(!elements_.empty());
709 auto end = elements_.top().inheritedNamespaces;
710 namespaces_.resize(end);
711 elements_.pop();
712 state_ = elements_.empty() ? State::Done : State::Content;
715 XmlReader::Result XmlReader::handleSkippedText(Span * data, int * nsId) {
716 for (;;) {
717 sal_Int32 i = rtl_str_indexOfChar_WithLength(pos_, end_ - pos_, '<');
718 if (i < 0) {
719 throw css::uno::RuntimeException(
720 "premature end of " + fileUrl_ );
722 pos_ += i + 1;
723 switch (peek()) {
724 case '!':
725 ++pos_;
726 if (!skipComment() && !scanCdataSection().is()) {
727 skipDocumentTypeDeclaration();
729 break;
730 case '/':
731 ++pos_;
732 return handleEndTag();
733 case '?':
734 ++pos_;
735 skipProcessingInstruction();
736 break;
737 default:
738 return handleStartTag(nsId, data);
743 XmlReader::Result XmlReader::handleRawText(Span * text) {
744 pad_.clear();
745 for (char const * begin = pos_;;) {
746 switch (peek()) {
747 case '\0': // i.e., EOF
748 throw css::uno::RuntimeException(
749 "premature end of " + fileUrl_ );
750 case '\x0D':
751 pad_.add(begin, pos_ - begin);
752 ++pos_;
753 if (peek() != '\x0A') {
754 pad_.add("\x0A");
756 begin = pos_;
757 break;
758 case '&':
759 pad_.add(begin, pos_ - begin);
760 pos_ = handleReference(pos_, end_);
761 begin = pos_;
762 break;
763 case '<':
764 pad_.add(begin, pos_ - begin);
765 ++pos_;
766 switch (peek()) {
767 case '!':
768 ++pos_;
769 if (!skipComment()) {
770 Span cdata(scanCdataSection());
771 if (cdata.is()) {
772 normalizeLineEnds(cdata);
773 } else {
774 skipDocumentTypeDeclaration();
777 begin = pos_;
778 break;
779 case '/':
780 *text = pad_.get();
781 ++pos_;
782 state_ = State::EndTag;
783 return Result::Text;
784 case '?':
785 ++pos_;
786 skipProcessingInstruction();
787 begin = pos_;
788 break;
789 default:
790 *text = pad_.get();
791 state_ = State::StartTag;
792 return Result::Text;
794 break;
795 default:
796 ++pos_;
797 break;
802 XmlReader::Result XmlReader::handleNormalizedText(Span * text) {
803 pad_.clear();
804 char const * flowBegin = pos_;
805 char const * flowEnd = pos_;
806 enum Space { SPACE_START, SPACE_NONE, SPACE_SPAN, SPACE_BREAK };
807 // a single true space character can go into the current flow,
808 // everything else breaks the flow
809 Space space = SPACE_START;
810 for (;;) {
811 switch (peek()) {
812 case '\0': // i.e., EOF
813 throw css::uno::RuntimeException(
814 "premature end of " + fileUrl_ );
815 case '\x09':
816 case '\x0A':
817 case '\x0D':
818 switch (space) {
819 case SPACE_START:
820 case SPACE_BREAK:
821 break;
822 case SPACE_NONE:
823 case SPACE_SPAN:
824 space = SPACE_BREAK;
825 break;
827 ++pos_;
828 break;
829 case ' ':
830 switch (space) {
831 case SPACE_START:
832 case SPACE_BREAK:
833 break;
834 case SPACE_NONE:
835 space = SPACE_SPAN;
836 break;
837 case SPACE_SPAN:
838 space = SPACE_BREAK;
839 break;
841 ++pos_;
842 break;
843 case '&':
844 switch (space) {
845 case SPACE_START:
846 break;
847 case SPACE_NONE:
848 case SPACE_SPAN:
849 pad_.add(flowBegin, pos_ - flowBegin);
850 break;
851 case SPACE_BREAK:
852 pad_.add(flowBegin, flowEnd - flowBegin);
853 pad_.add(" ");
854 break;
856 pos_ = handleReference(pos_, end_);
857 flowBegin = pos_;
858 flowEnd = pos_;
859 space = SPACE_NONE;
860 break;
861 case '<':
862 ++pos_;
863 switch (peek()) {
864 case '!':
865 ++pos_;
866 if (skipComment()) {
867 space = SPACE_BREAK;
868 } else {
869 Span cdata(scanCdataSection());
870 if (cdata.is()) {
871 // CDATA is not normalized (similar to character
872 // references; it keeps the code simple), but it might
873 // arguably be better to normalize it:
874 switch (space) {
875 case SPACE_START:
876 break;
877 case SPACE_NONE:
878 case SPACE_SPAN:
879 pad_.add(flowBegin, pos_ - flowBegin);
880 break;
881 case SPACE_BREAK:
882 pad_.add(flowBegin, flowEnd - flowBegin);
883 pad_.add(" ");
884 break;
886 normalizeLineEnds(cdata);
887 flowBegin = pos_;
888 flowEnd = pos_;
889 space = SPACE_NONE;
890 } else {
891 skipDocumentTypeDeclaration();
894 break;
895 case '/':
896 ++pos_;
897 pad_.add(flowBegin, flowEnd - flowBegin);
898 *text = pad_.get();
899 state_ = State::EndTag;
900 return Result::Text;
901 case '?':
902 ++pos_;
903 skipProcessingInstruction();
904 space = SPACE_BREAK;
905 break;
906 default:
907 pad_.add(flowBegin, flowEnd - flowBegin);
908 *text = pad_.get();
909 state_ = State::StartTag;
910 return Result::Text;
912 break;
913 default:
914 switch (space) {
915 case SPACE_START:
916 flowBegin = pos_;
917 break;
918 case SPACE_NONE:
919 case SPACE_SPAN:
920 break;
921 case SPACE_BREAK:
922 pad_.add(flowBegin, flowEnd - flowBegin);
923 pad_.add(" ");
924 flowBegin = pos_;
925 break;
927 flowEnd = ++pos_;
928 space = SPACE_NONE;
929 break;
934 int XmlReader::toNamespaceId(NamespaceIris::size_type pos) {
935 assert(pos <= INT_MAX);
936 return static_cast< int >(pos);
941 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */