bump product version to 7.6.3.2-android
[LibreOffice.git] / xmlreader / source / xmlreader.cxx
blob00489caa498706f14a069c91e853fe43d71b2162
1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 /*
3 * This file is part of the LibreOffice project.
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 * This file incorporates work covered by the following license notice:
11 * Licensed to the Apache Software Foundation (ASF) under one or more
12 * contributor license agreements. See the NOTICE file distributed
13 * with this work for additional information regarding copyright
14 * ownership. The ASF licenses this file to you under the Apache
15 * License, Version 2.0 (the "License"); you may not use this file
16 * except in compliance with the License. You may obtain a copy of
17 * the License at http://www.apache.org/licenses/LICENSE-2.0 .
20 #include <sal/config.h>
22 #include <cassert>
23 #include <climits>
25 #include <com/sun/star/container/NoSuchElementException.hpp>
26 #include <com/sun/star/uno/RuntimeException.hpp>
27 #include <osl/file.h>
28 #include <rtl/character.hxx>
29 #include <rtl/string.h>
30 #include <rtl/ustring.hxx>
31 #include <sal/log.hxx>
32 #include <sal/types.h>
33 #include <utility>
34 #include <xmlreader/pad.hxx>
35 #include <xmlreader/span.hxx>
36 #include <xmlreader/xmlreader.hxx>
38 namespace xmlreader {
namespace {

/// Returns true iff c is one of the four whitespace characters recognized by
/// this reader: tab (0x09), line feed (0x0A), carriage return (0x0D) or
/// space (0x20).
bool isSpace(char c) {
    return c == '\x09' || c == '\x0A' || c == '\x0D' || c == ' ';
}

}
56 XmlReader::XmlReader(OUString fileUrl)
57 : fileUrl_(std::move(fileUrl))
58 , fileHandle_(nullptr)
60 oslFileError e = osl_openFile(
61 fileUrl_.pData, &fileHandle_, osl_File_OpenFlag_Read);
62 switch (e)
64 case osl_File_E_None:
65 break;
66 case osl_File_E_NOENT:
67 throw css::container::NoSuchElementException( fileUrl_ );
68 default:
69 throw css::uno::RuntimeException(
70 "cannot open " + fileUrl_ + ": " + OUString::number(e));
72 e = osl_getFileSize(fileHandle_, &fileSize_);
73 if (e == osl_File_E_None) {
74 e = osl_mapFile(
75 fileHandle_, &fileAddress_, fileSize_, 0,
76 osl_File_MapFlag_WillNeed);
78 if (e != osl_File_E_None) {
79 oslFileError e2 = osl_closeFile(fileHandle_);
80 if (e2 != osl_File_E_None) {
81 SAL_WARN(
82 "xmlreader",
83 "osl_closeFile of \"" << fileUrl_ << "\" failed with " << +e2);
85 throw css::uno::RuntimeException(
86 "cannot mmap " + fileUrl_ + " (" + OUString::number(e) + ")" );
88 namespaceIris_.emplace_back("http://www.w3.org/XML/1998/namespace");
89 namespaces_.emplace_back(Span("xml"), NAMESPACE_XML);
90 pos_ = static_cast< char * >(fileAddress_);
91 end_ = pos_ + fileSize_;
92 state_ = State::Content;
93 firstAttribute_ = true;
96 XmlReader::~XmlReader() {
97 if (!fileHandle_)
98 return;
99 oslFileError e = osl_unmapMappedFile(fileHandle_, fileAddress_, fileSize_);
100 if (e != osl_File_E_None) {
101 SAL_WARN(
102 "xmlreader",
103 "osl_unmapMappedFile of \"" << fileUrl_ << "\" failed with " << +e);
105 e = osl_closeFile(fileHandle_);
106 if (e != osl_File_E_None) {
107 SAL_WARN(
108 "xmlreader",
109 "osl_closeFile of \"" << fileUrl_ << "\" failed with " << +e);
113 int XmlReader::registerNamespaceIri(Span const & iri) {
114 int id = toNamespaceId(namespaceIris_.size());
115 namespaceIris_.push_back(iri);
116 if (iri == "http://www.w3.org/2001/XMLSchema-instance") {
117 // Old user layer .xcu files used the xsi namespace prefix without
118 // declaring a corresponding namespace binding, see issue 77174; reading
119 // those files during migration would fail without this hack that can be
120 // removed once migration is no longer relevant (see
121 // configmgr::Components::parseModificationLayer):
122 namespaces_.emplace_back(Span("xsi"), id);
124 return id;
127 XmlReader::Result XmlReader::nextItem(Text reportText, Span * data, int * nsId)
129 switch (state_) {
130 case State::Content:
131 switch (reportText) {
132 case Text::NONE:
133 return handleSkippedText(data, nsId);
134 case Text::Raw:
135 return handleRawText(data);
136 default: // Text::Normalized
137 return handleNormalizedText(data);
139 case State::StartTag:
140 return handleStartTag(nsId, data);
141 case State::EndTag:
142 return handleEndTag();
143 case State::EmptyElementTag:
144 handleElementEnd();
145 return Result::End;
146 default: // State::Done
147 return Result::Done;
151 bool XmlReader::nextAttribute(int * nsId, Span * localName) {
152 assert(nsId != nullptr && localName != nullptr);
153 if (firstAttribute_) {
154 currentAttribute_ = attributes_.begin();
155 firstAttribute_ = false;
156 } else {
157 ++currentAttribute_;
159 if (currentAttribute_ == attributes_.end()) {
160 return false;
162 if (currentAttribute_->nameColon == nullptr) {
163 *nsId = NAMESPACE_NONE;
164 *localName = Span(
165 currentAttribute_->nameBegin,
166 currentAttribute_->nameEnd - currentAttribute_->nameBegin);
167 } else {
168 *nsId = getNamespaceId(
169 Span(
170 currentAttribute_->nameBegin,
171 currentAttribute_->nameColon - currentAttribute_->nameBegin));
172 *localName = Span(
173 currentAttribute_->nameColon + 1,
174 currentAttribute_->nameEnd - (currentAttribute_->nameColon + 1));
176 return true;
179 Span XmlReader::getAttributeValue(bool fullyNormalize) {
180 return handleAttributeValue(
181 currentAttribute_->valueBegin, currentAttribute_->valueEnd,
182 fullyNormalize);
185 int XmlReader::getNamespaceId(Span const & prefix) const {
186 auto i = std::find_if(namespaces_.crbegin(), namespaces_.crend(),
187 [&prefix](const NamespaceData& rNamespaceData) { return prefix == rNamespaceData.prefix; });
189 if (i != namespaces_.rend())
190 return i->nsId;
192 return NAMESPACE_UNKNOWN;
196 void XmlReader::normalizeLineEnds(Span const & text) {
197 char const * p = text.begin;
198 sal_Int32 n = text.length;
199 for (;;) {
200 sal_Int32 i = rtl_str_indexOfChar_WithLength(p, n, '\x0D');
201 if (i < 0) {
202 break;
204 pad_.add(p, i);
205 p += i + 1;
206 n -= i + 1;
207 if (n == 0 || *p != '\x0A') {
208 pad_.add("\x0A");
211 pad_.add(p, n);
214 void XmlReader::skipSpace() {
215 while (isSpace(peek())) {
216 ++pos_;
220 bool XmlReader::skipComment() {
221 if (rtl_str_shortenedCompare_WithLength(
222 pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("--"),
223 RTL_CONSTASCII_LENGTH("--")) !=
226 return false;
228 pos_ += RTL_CONSTASCII_LENGTH("--");
229 sal_Int32 i = rtl_str_indexOfStr_WithLength(
230 pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("--"));
231 if (i < 0) {
232 throw css::uno::RuntimeException(
233 "premature end (within comment) of " + fileUrl_ );
235 pos_ += i + RTL_CONSTASCII_LENGTH("--");
236 if (read() != '>') {
237 throw css::uno::RuntimeException(
238 "illegal \"--\" within comment in " + fileUrl_ );
240 return true;
243 void XmlReader::skipProcessingInstruction() {
244 sal_Int32 i = rtl_str_indexOfStr_WithLength(
245 pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("?>"));
246 if (i < 0) {
247 throw css::uno::RuntimeException(
248 "bad '<?' in " + fileUrl_ );
250 pos_ += i + RTL_CONSTASCII_LENGTH("?>");
// Skips over a document type declaration by consuming characters until the
// '>' that closes it.  Quoted literals are skipped as a whole so that a '>'
// or '[' inside them is not misread, and a bracketed "[...]" section is
// scanned by the inner loop.  Throws css::uno::RuntimeException when the
// input ends first or when the closing ']' is not followed by '>'.
253 void XmlReader::skipDocumentTypeDeclaration() {
254 // Neither is it checked that the doctypedecl is at the correct position in
255 // the document, nor that it is well-formed:
256 for (;;) {
257 char c = read();
258 switch (c) {
259 case '\0': // i.e., EOF
260 throw css::uno::RuntimeException(
261 "premature end (within DTD) of " + fileUrl_ );
262 case '"':
263 case '\'':
// Quoted literal: jump to the matching quote character (the same one that
// opened it) and continue right after it.
265 sal_Int32 i = rtl_str_indexOfChar_WithLength(
266 pos_, end_ - pos_, c);
267 if (i < 0) {
268 throw css::uno::RuntimeException(
269 "premature end (within DTD) of " + fileUrl_ );
271 pos_ += i + 1;
273 break;
274 case '>':
// End of a declaration that has no bracketed section.
275 return;
276 case '[':
// Bracketed section: scan until the matching ']'.
277 for (;;) {
278 c = read();
279 switch (c) {
280 case '\0': // i.e., EOF
281 throw css::uno::RuntimeException(
282 "premature end (within DTD) of " + fileUrl_ );
283 case '"':
284 case '\'':
// Same quoted-literal skipping as in the outer loop.
286 sal_Int32 i = rtl_str_indexOfChar_WithLength(
287 pos_, end_ - pos_, c);
288 if (i < 0) {
289 throw css::uno::RuntimeException(
290 "premature end (within DTD) of " + fileUrl_ );
292 pos_ += i + 1;
294 break;
295 case '<':
// Markup inside the bracketed section: comments and processing instructions
// are skipped as a whole; for anything else only the character after '<' is
// consumed and scanning simply continues.
296 switch (read()) {
297 case '\0': // i.e., EOF
298 throw css::uno::RuntimeException(
299 "premature end (within DTD) of " + fileUrl_ );
300 case '!':
// The result of skipComment() is deliberately not checked: when it returns
// false it has not moved pos_, and the text after "<!" is then scanned
// character by character by this loop.
301 skipComment();
302 break;
303 case '?':
304 skipProcessingInstruction();
305 break;
306 default:
307 break;
309 break;
310 case ']':
// End of the bracketed section: only whitespace may precede the final '>'.
311 skipSpace();
312 if (read() != '>') {
313 throw css::uno::RuntimeException(
314 "missing \">\" of DTD in " + fileUrl_ );
316 return;
317 default:
318 break;
321 default:
322 break;
327 Span XmlReader::scanCdataSection() {
328 if (rtl_str_shortenedCompare_WithLength(
329 pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("[CDATA["),
330 RTL_CONSTASCII_LENGTH("[CDATA[")) !=
333 return Span();
335 pos_ += RTL_CONSTASCII_LENGTH("[CDATA[");
336 char const * begin = pos_;
337 sal_Int32 i = rtl_str_indexOfStr_WithLength(
338 pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("]]>"));
339 if (i < 0) {
340 throw css::uno::RuntimeException(
341 "premature end (within CDATA section) of " + fileUrl_ );
343 pos_ += i + RTL_CONSTASCII_LENGTH("]]>");
344 return Span(begin, i);
347 bool XmlReader::scanName(char const ** nameColon) {
348 assert(nameColon != nullptr && *nameColon == nullptr);
349 for (char const * begin = pos_;; ++pos_) {
350 switch (peek()) {
351 case '\0': // i.e., EOF
352 case '\x09':
353 case '\x0A':
354 case '\x0D':
355 case ' ':
356 case '/':
357 case '=':
358 case '>':
359 return pos_ != begin;
360 case ':':
361 *nameColon = pos_;
362 break;
363 default:
364 break;
369 int XmlReader::scanNamespaceIri(char const * begin, char const * end) {
370 assert(begin != nullptr && begin <= end);
371 Span iri(handleAttributeValue(begin, end, false));
372 for (NamespaceIris::size_type i = 0; i < namespaceIris_.size(); ++i) {
373 if (namespaceIris_[i] == iri) {
374 return toNamespaceId(i);
377 return XmlReader::NAMESPACE_UNKNOWN;
380 char const * XmlReader::handleReference(char const * position, char const * end)
382 assert(position != nullptr && *position == '&' && position < end);
383 ++position;
384 if (*position == '#') {
385 ++position;
386 sal_uInt32 val = 0;
387 char const * p;
388 if (*position == 'x') {
389 ++position;
390 p = position;
391 for (;; ++position) {
392 char c = *position;
393 if (c >= '0' && c <= '9') {
394 val = 16 * val + (c - '0');
395 } else if (c >= 'A' && c <= 'F') {
396 val = 16 * val + (c - 'A') + 10;
397 } else if (c >= 'a' && c <= 'f') {
398 val = 16 * val + (c - 'a') + 10;
399 } else {
400 break;
402 if (!rtl::isUnicodeCodePoint(val)) { // avoid overflow
403 throw css::uno::RuntimeException(
404 "'&#x...' too large in " + fileUrl_ );
407 } else {
408 p = position;
409 for (;; ++position) {
410 char c = *position;
411 if (c >= '0' && c <= '9') {
412 val = 10 * val + (c - '0');
413 } else {
414 break;
416 if (!rtl::isUnicodeCodePoint(val)) { // avoid overflow
417 throw css::uno::RuntimeException(
418 "'&#...' too large in " + fileUrl_ );
422 if (position == p || *position++ != ';') {
423 throw css::uno::RuntimeException(
424 "'&#...' missing ';' in " + fileUrl_ );
426 assert(rtl::isUnicodeCodePoint(val));
427 if ((val < 0x20 && val != 0x9 && val != 0xA && val != 0xD) ||
428 (val >= 0xD800 && val <= 0xDFFF) || val == 0xFFFE || val == 0xFFFF)
430 throw css::uno::RuntimeException(
431 "character reference denoting invalid character in " + fileUrl_ );
433 char buf[4];
434 sal_Int32 len;
435 if (val < 0x80) {
436 buf[0] = static_cast< char >(val);
437 len = 1;
438 } else if (val < 0x800) {
439 buf[0] = static_cast< char >((val >> 6) | 0xC0);
440 buf[1] = static_cast< char >((val & 0x3F) | 0x80);
441 len = 2;
442 } else if (val < 0x10000) {
443 buf[0] = static_cast< char >((val >> 12) | 0xE0);
444 buf[1] = static_cast< char >(((val >> 6) & 0x3F) | 0x80);
445 buf[2] = static_cast< char >((val & 0x3F) | 0x80);
446 len = 3;
447 } else {
448 buf[0] = static_cast< char >((val >> 18) | 0xF0);
449 buf[1] = static_cast< char >(((val >> 12) & 0x3F) | 0x80);
450 buf[2] = static_cast< char >(((val >> 6) & 0x3F) | 0x80);
451 buf[3] = static_cast< char >((val & 0x3F) | 0x80);
452 len = 4;
454 pad_.addEphemeral(buf, len);
455 return position;
456 } else {
457 struct EntityRef {
458 char const * inBegin;
459 sal_Int32 const inLength;
460 char const * outBegin;
461 sal_Int32 const outLength;
463 static EntityRef const refs[] = {
464 { RTL_CONSTASCII_STRINGPARAM("amp;"),
465 RTL_CONSTASCII_STRINGPARAM("&") },
466 { RTL_CONSTASCII_STRINGPARAM("lt;"),
467 RTL_CONSTASCII_STRINGPARAM("<") },
468 { RTL_CONSTASCII_STRINGPARAM("gt;"),
469 RTL_CONSTASCII_STRINGPARAM(">") },
470 { RTL_CONSTASCII_STRINGPARAM("apos;"),
471 RTL_CONSTASCII_STRINGPARAM("'") },
472 { RTL_CONSTASCII_STRINGPARAM("quot;"),
473 RTL_CONSTASCII_STRINGPARAM("\"") } };
474 for (const auto & ref : refs) {
475 if (rtl_str_shortenedCompare_WithLength(
476 position, end - position, ref.inBegin, ref.inLength,
477 ref.inLength) ==
480 position += ref.inLength;
481 pad_.add(ref.outBegin, ref.outLength);
482 return position;
485 throw css::uno::RuntimeException(
486 "unknown entity reference in " + fileUrl_ );
490 Span XmlReader::handleAttributeValue(
491 char const * begin, char const * end, bool fullyNormalize)
493 pad_.clear();
494 if (fullyNormalize) {
495 while (begin != end && isSpace(*begin)) {
496 ++begin;
498 while (end != begin && isSpace(end[-1])) {
499 --end;
501 char const * p = begin;
502 enum Space { SPACE_NONE, SPACE_SPAN, SPACE_BREAK };
503 // a single true space character can go into the current span,
504 // everything else breaks the span
505 Space space = SPACE_NONE;
506 while (p != end) {
507 switch (*p) {
508 case '\x09':
509 case '\x0A':
510 case '\x0D':
511 switch (space) {
512 case SPACE_NONE:
513 pad_.add(begin, p - begin);
514 pad_.add(" ");
515 space = SPACE_BREAK;
516 break;
517 case SPACE_SPAN:
518 pad_.add(begin, p - begin);
519 space = SPACE_BREAK;
520 break;
521 case SPACE_BREAK:
522 break;
524 begin = ++p;
525 break;
526 case ' ':
527 switch (space) {
528 case SPACE_NONE:
529 ++p;
530 space = SPACE_SPAN;
531 break;
532 case SPACE_SPAN:
533 pad_.add(begin, p - begin);
534 begin = ++p;
535 space = SPACE_BREAK;
536 break;
537 case SPACE_BREAK:
538 begin = ++p;
539 break;
541 break;
542 case '&':
543 pad_.add(begin, p - begin);
544 p = handleReference(p, end);
545 begin = p;
546 space = SPACE_NONE;
547 break;
548 default:
549 ++p;
550 space = SPACE_NONE;
551 break;
554 pad_.add(begin, p - begin);
555 } else {
556 char const * p = begin;
557 while (p != end) {
558 switch (*p) {
559 case '\x09':
560 case '\x0A':
561 pad_.add(begin, p - begin);
562 begin = ++p;
563 pad_.add(" ");
564 break;
565 case '\x0D':
566 pad_.add(begin, p - begin);
567 ++p;
568 if (peek() == '\x0A') {
569 ++p;
571 begin = p;
572 pad_.add(" ");
573 break;
574 case '&':
575 pad_.add(begin, p - begin);
576 p = handleReference(p, end);
577 begin = p;
578 break;
579 default:
580 ++p;
581 break;
584 pad_.add(begin, p - begin);
586 return pad_.get();
// Parses a start tag (or empty-element tag) beginning at the element name at
// pos_: scans the name, collects the attributes, records namespace
// declarations, pushes the element onto elements_ and reports the element's
// namespace id and local name through nsId and localName.  Always returns
// Result::Begin; throws css::uno::RuntimeException on malformed input.
589 XmlReader::Result XmlReader::handleStartTag(int * nsId, Span * localName) {
590 assert(nsId != nullptr && localName);
591 char const * nameBegin = pos_;
592 char const * nameColon = nullptr;
593 if (!scanName(&nameColon)) {
594 throw css::uno::RuntimeException(
595 "bad tag name in " + fileUrl_ );
597 char const * nameEnd = pos_;
// Remember how many namespace bindings existed before this element; the
// count is stored with the element so that handleElementEnd() can drop the
// bindings added below.
598 NamespaceList::size_type inheritedNamespaces = namespaces_.size();
599 bool hasDefaultNs = false;
600 int defaultNsId = NAMESPACE_NONE;
601 attributes_.clear();
// One iteration per attribute, until the '/' or '>' that ends the tag.
602 for (;;) {
603 char const * p = pos_;
604 skipSpace();
605 if (peek() == '/' || peek() == '>') {
606 break;
// If skipSpace() did not move, the attribute is not separated from what
// precedes it.
608 if (pos_ == p) {
609 throw css::uno::RuntimeException(
610 "missing whitespace before attribute in " + fileUrl_ );
612 char const * attrNameBegin = pos_;
613 char const * attrNameColon = nullptr;
614 if (!scanName(&attrNameColon)) {
615 throw css::uno::RuntimeException(
616 "bad attribute name in " + fileUrl_ );
618 char const * attrNameEnd = pos_;
619 skipSpace();
620 if (read() != '=') {
621 throw css::uno::RuntimeException(
622 "missing '=' in " + fileUrl_ );
624 skipSpace();
// The value is delimited by a pair of identical quote characters.
625 char del = read();
626 if (del != '\'' && del != '"') {
627 throw css::uno::RuntimeException(
628 "bad attribute value in " + fileUrl_ );
630 char const * valueBegin = pos_;
631 sal_Int32 i = rtl_str_indexOfChar_WithLength(pos_, end_ - pos_, del);
632 if (i < 0) {
633 throw css::uno::RuntimeException(
634 "unterminated attribute value in " + fileUrl_ );
636 char const * valueEnd = pos_ + i;
637 pos_ += i + 1;
// Three kinds of attribute: "xmlns" sets the default namespace, "xmlns:pfx"
// adds a prefix binding, and anything else is stored as an ordinary
// attribute (its value is kept raw and decoded only on request).
638 if (attrNameColon == nullptr &&
639 Span(attrNameBegin, attrNameEnd - attrNameBegin) == "xmlns")
641 hasDefaultNs = true;
642 defaultNsId = scanNamespaceIri(valueBegin, valueEnd);
643 } else if (attrNameColon != nullptr &&
644 Span(attrNameBegin, attrNameColon - attrNameBegin) ==
645 "xmlns")
647 namespaces_.emplace_back(
648 Span(attrNameColon + 1, attrNameEnd - (attrNameColon + 1)),
649 scanNamespaceIri(valueBegin, valueEnd));
650 } else {
651 attributes_.emplace_back(
652 attrNameBegin, attrNameEnd, attrNameColon, valueBegin,
653 valueEnd);
// Without an own "xmlns" declaration the default namespace is taken from the
// enclosing element, if there is one.
656 if (!hasDefaultNs && !elements_.empty()) {
657 defaultNsId = elements_.top().defaultNamespaceId;
659 firstAttribute_ = true;
// "/>" makes the next nextItem() call report the element's end right away.
660 if (peek() == '/') {
661 state_ = State::EmptyElementTag;
662 ++pos_;
663 } else {
664 state_ = State::Content;
666 if (peek() != '>') {
667 throw css::uno::RuntimeException(
668 "missing '>' in " + fileUrl_ );
670 ++pos_;
671 elements_.push(
672 ElementData(
673 Span(nameBegin, nameEnd - nameBegin), inheritedNamespaces,
674 defaultNsId));
// The element name is resolved only now, after all namespace declarations of
// this tag have been processed, so a prefix declared on the element itself
// is found.
675 if (nameColon == nullptr) {
676 *nsId = defaultNsId;
677 *localName = Span(nameBegin, nameEnd - nameBegin);
678 } else {
679 *nsId = getNamespaceId(Span(nameBegin, nameColon - nameBegin));
680 *localName = Span(nameColon + 1, nameEnd - (nameColon + 1));
682 return Result::Begin;
685 XmlReader::Result XmlReader::handleEndTag() {
686 if (elements_.empty()) {
687 throw css::uno::RuntimeException(
688 "spurious end tag in " + fileUrl_ );
690 char const * nameBegin = pos_;
691 char const * nameColon = nullptr;
692 if (!scanName(&nameColon) ||
693 !elements_.top().name.equals(nameBegin, pos_ - nameBegin))
695 throw css::uno::RuntimeException(
696 "tag mismatch in " + fileUrl_ );
698 handleElementEnd();
699 skipSpace();
700 if (peek() != '>') {
701 throw css::uno::RuntimeException(
702 "missing '>' in " + fileUrl_ );
704 ++pos_;
705 return Result::End;
708 void XmlReader::handleElementEnd() {
709 assert(!elements_.empty());
710 auto end = elements_.top().inheritedNamespaces;
711 namespaces_.resize(end);
712 elements_.pop();
713 state_ = elements_.empty() ? State::Done : State::Content;
716 XmlReader::Result XmlReader::handleSkippedText(Span * data, int * nsId) {
717 for (;;) {
718 auto i = static_cast<const char*>(std::memchr(pos_, '<', end_ - pos_));
719 if (!i) {
720 throw css::uno::RuntimeException(
721 "premature end of " + fileUrl_ );
723 pos_ = i + 1;
724 switch (peek()) {
725 case '!':
726 ++pos_;
727 if (!skipComment() && !scanCdataSection().is()) {
728 skipDocumentTypeDeclaration();
730 break;
731 case '/':
732 ++pos_;
733 return handleEndTag();
734 case '?':
735 ++pos_;
736 skipProcessingInstruction();
737 break;
738 default:
739 return handleStartTag(nsId, data);
// Collects the character data up to the next start or end tag into pad_ and
// reports it through text.  Whitespace is kept, but line ends are normalized
// (CR LF and lone CR become LF), references are expanded and CDATA content
// is included; comments and processing instructions are dropped.  Always
// returns Result::Text and leaves state_ at StartTag or EndTag so that the
// next nextItem() call handles the tag.  Throws css::uno::RuntimeException
// if the input ends first.
744 XmlReader::Result XmlReader::handleRawText(Span * text) {
745 pad_.clear();
// begin marks the start of the pending run of plain characters that has not
// been copied into pad_ yet.
746 for (char const * begin = pos_;;) {
747 switch (peek()) {
748 case '\0': // i.e., EOF
749 throw css::uno::RuntimeException(
750 "premature end of " + fileUrl_ );
751 case '\x0D':
// Flush the run before the CR and drop the CR.  If an LF follows it stays in
// the input and becomes part of the next run; otherwise an LF is inserted.
752 pad_.add(begin, pos_ - begin);
753 ++pos_;
754 if (peek() != '\x0A') {
755 pad_.add("\x0A");
757 begin = pos_;
758 break;
759 case '&':
760 pad_.add(begin, pos_ - begin);
761 pos_ = handleReference(pos_, end_);
762 begin = pos_;
763 break;
764 case '<':
// Markup always ends the pending run.
765 pad_.add(begin, pos_ - begin);
766 ++pos_;
767 switch (peek()) {
768 case '!':
769 ++pos_;
// "<!" is a comment, a CDATA section (whose content is appended with
// normalized line ends) or, failing both, a document type declaration.
770 if (!skipComment()) {
771 Span cdata(scanCdataSection());
772 if (cdata.is()) {
773 normalizeLineEnds(cdata);
774 } else {
775 skipDocumentTypeDeclaration();
778 begin = pos_;
779 break;
780 case '/':
// End tag: hand out the text; pos_ is left just past "</".
781 *text = pad_.get();
782 ++pos_;
783 state_ = State::EndTag;
784 return Result::Text;
785 case '?':
786 ++pos_;
787 skipProcessingInstruction();
788 begin = pos_;
789 break;
790 default:
// Start tag: hand out the text; pos_ is left at the element name.
791 *text = pad_.get();
792 state_ = State::StartTag;
793 return Result::Text;
795 break;
796 default:
797 ++pos_;
798 break;
// Collects the character data up to the next start or end tag into pad_ with
// whitespace normalized: leading and trailing whitespace is dropped and each
// internal run of whitespace is reduced to a single space.  Text produced by
// references and CDATA sections is appended without being normalized itself.
// Always returns Result::Text and leaves state_ at StartTag or EndTag.
// Throws css::uno::RuntimeException if the input ends first.
//
// [flowBegin, flowEnd) is the pending stretch of input not yet copied to
// pad_; flowEnd is only advanced past non-whitespace characters.  space
// tracks what has been seen since then:
//   SPACE_START  nothing but whitespace so far (nothing to separate)
//   SPACE_NONE   the last character was not whitespace
//   SPACE_SPAN   exactly one ' ' followed the last non-whitespace character
//   SPACE_BREAK  the flow is interrupted; a separating space is owed before
//                the next text
803 XmlReader::Result XmlReader::handleNormalizedText(Span * text) {
804 pad_.clear();
805 char const * flowBegin = pos_;
806 char const * flowEnd = pos_;
807 enum Space { SPACE_START, SPACE_NONE, SPACE_SPAN, SPACE_BREAK };
808 // a single true space character can go into the current flow,
809 // everything else breaks the flow
810 Space space = SPACE_START;
811 for (;;) {
812 switch (peek()) {
813 case '\0': // i.e., EOF
814 throw css::uno::RuntimeException(
815 "premature end of " + fileUrl_ );
816 case '\x09':
817 case '\x0A':
818 case '\x0D':
// Tab, LF and CR always break the flow (unless no text has been seen yet).
819 switch (space) {
820 case SPACE_START:
821 case SPACE_BREAK:
822 break;
823 case SPACE_NONE:
824 case SPACE_SPAN:
825 space = SPACE_BREAK;
826 break;
828 ++pos_;
829 break;
830 case ' ':
// A first ' ' may stay inside the flow; a second one in a row breaks it.
831 switch (space) {
832 case SPACE_START:
833 case SPACE_BREAK:
834 break;
835 case SPACE_NONE:
836 space = SPACE_SPAN;
837 break;
838 case SPACE_SPAN:
839 space = SPACE_BREAK;
840 break;
842 ++pos_;
843 break;
844 case '&':
// Flush what is pending before the reference's replacement text is appended.
// In SPACE_NONE/SPACE_SPAN everything up to pos_ is copied (including a
// single pending ' '); in SPACE_BREAK only the text up to flowEnd plus one
// separating space.
845 switch (space) {
846 case SPACE_START:
847 break;
848 case SPACE_NONE:
849 case SPACE_SPAN:
850 pad_.add(flowBegin, pos_ - flowBegin);
851 break;
852 case SPACE_BREAK:
853 pad_.add(flowBegin, flowEnd - flowBegin);
854 pad_.add(" ");
855 break;
857 pos_ = handleReference(pos_, end_);
858 flowBegin = pos_;
859 flowEnd = pos_;
860 space = SPACE_NONE;
861 break;
862 case '<':
863 ++pos_;
864 switch (peek()) {
865 case '!':
866 ++pos_;
// A skipped comment acts like a flow break.  Note that this assignment is
// unconditional, so it also applies when no text has been seen yet.
867 if (skipComment()) {
868 space = SPACE_BREAK;
869 } else {
870 Span cdata(scanCdataSection());
871 if (cdata.is()) {
872 // CDATA is not normalized (similar to character
873 // references; it keeps the code simple), but it might
874 // arguably be better to normalize it:
// NOTE(review): pos_ is already past the CDATA section's closing "]]>" when
// the SPACE_NONE/SPACE_SPAN case below copies up to pos_, so that copy
// appears to include the section's markup as well - confirm whether this is
// intended.
875 switch (space) {
876 case SPACE_START:
877 break;
878 case SPACE_NONE:
879 case SPACE_SPAN:
880 pad_.add(flowBegin, pos_ - flowBegin);
881 break;
882 case SPACE_BREAK:
883 pad_.add(flowBegin, flowEnd - flowBegin);
884 pad_.add(" ");
885 break;
887 normalizeLineEnds(cdata);
888 flowBegin = pos_;
889 flowEnd = pos_;
890 space = SPACE_NONE;
891 } else {
892 skipDocumentTypeDeclaration();
895 break;
896 case '/':
// End tag: flush the pending text without trailing whitespace and report it;
// pos_ is left just past "</".
897 ++pos_;
898 pad_.add(flowBegin, flowEnd - flowBegin);
899 *text = pad_.get();
900 state_ = State::EndTag;
901 return Result::Text;
902 case '?':
903 ++pos_;
904 skipProcessingInstruction();
905 space = SPACE_BREAK;
906 break;
907 default:
// Start tag: flush and report; pos_ is left at the element name.
908 pad_.add(flowBegin, flowEnd - flowBegin);
909 *text = pad_.get();
910 state_ = State::StartTag;
911 return Result::Text;
913 break;
914 default:
// Ordinary character: start a new flow if necessary (after emitting the owed
// separator) and extend the flow to include this character.
915 switch (space) {
916 case SPACE_START:
917 flowBegin = pos_;
918 break;
919 case SPACE_NONE:
920 case SPACE_SPAN:
921 break;
922 case SPACE_BREAK:
923 pad_.add(flowBegin, flowEnd - flowBegin);
924 pad_.add(" ");
925 flowBegin = pos_;
926 break;
928 flowEnd = ++pos_;
929 space = SPACE_NONE;
930 break;
935 int XmlReader::toNamespaceId(NamespaceIris::size_type pos) {
936 assert(pos <= INT_MAX);
937 return static_cast< int >(pos);
942 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */