bump product version to 5.0.4.1
[LibreOffice.git] / xmlreader / source / xmlreader.cxx
blobb384f8a670a5a94c71bd9bb3537f2c97726004c7
1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 /*
3 * This file is part of the LibreOffice project.
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 * This file incorporates work covered by the following license notice:
11 * Licensed to the Apache Software Foundation (ASF) under one or more
12 * contributor license agreements. See the NOTICE file distributed
13 * with this work for additional information regarding copyright
14 * ownership. The ASF licenses this file to you under the Apache
15 * License, Version 2.0 (the "License"); you may not use this file
16 * except in compliance with the License. You may obtain a copy of
17 * the License at http://www.apache.org/licenses/LICENSE-2.0 .
20 #include <sal/config.h>
22 #include <cassert>
23 #include <climits>
24 #include <cstddef>
26 #include <com/sun/star/container/NoSuchElementException.hpp>
27 #include <com/sun/star/uno/Reference.hxx>
28 #include <com/sun/star/uno/RuntimeException.hpp>
29 #include <com/sun/star/uno/XInterface.hpp>
30 #include <osl/file.h>
31 #include <rtl/string.h>
32 #include <rtl/ustring.hxx>
33 #include <sal/log.hxx>
34 #include <sal/types.h>
35 #include <xmlreader/pad.hxx>
36 #include <xmlreader/span.hxx>
37 #include <xmlreader/xmlreader.hxx>
39 namespace xmlreader {
namespace {

// Tells whether c is one of the four white space characters recognized by the
// reader: tab, line feed, carriage return, or space.
bool isSpace(char c) {
    return c == '\x09' || c == '\x0A' || c == '\x0D' || c == ' ';
}

}
57 XmlReader::XmlReader(char const *sStr, size_t nLength)
58 : fileUrl_("stream")
59 , fileHandle_(0)
60 , fileSize_(0)
61 , fileAddress_(0)
63 namespaceIris_.push_back(Span("http://www.w3.org/XML/1998/namespace"));
64 namespaces_.push_back(NamespaceData(Span("xml"), NAMESPACE_XML));
65 pos_ = sStr;
66 end_ = pos_ + nLength;
67 state_ = STATE_CONTENT;
68 firstAttribute_ = true;
71 XmlReader::XmlReader(OUString const & fileUrl)
72 : fileUrl_(fileUrl)
73 , fileHandle_(0)
75 oslFileError e = osl_openFile(
76 fileUrl_.pData, &fileHandle_, osl_File_OpenFlag_Read);
77 switch (e)
79 case osl_File_E_None:
80 break;
81 case osl_File_E_NOENT:
82 throw css::container::NoSuchElementException( fileUrl_ );
83 default:
84 throw css::uno::RuntimeException(
85 "cannot open " + fileUrl_ + ": " + OUString::number(e));
87 e = osl_getFileSize(fileHandle_, &fileSize_);
88 if (e == osl_File_E_None) {
89 e = osl_mapFile(
90 fileHandle_, &fileAddress_, fileSize_, 0,
91 osl_File_MapFlag_WillNeed);
93 if (e != osl_File_E_None) {
94 oslFileError e2 = osl_closeFile(fileHandle_);
95 if (e2 != osl_File_E_None) {
96 SAL_WARN(
97 "xmlreader",
98 "osl_closeFile of \"" << fileUrl_ << "\" failed with " << +e2);
100 throw css::uno::RuntimeException(
101 "cannot mmap " + fileUrl_ + " (" + OUString::number(e) + ")" );
103 namespaceIris_.push_back(Span("http://www.w3.org/XML/1998/namespace"));
104 namespaces_.push_back(NamespaceData(Span("xml"), NAMESPACE_XML));
105 pos_ = static_cast< char * >(fileAddress_);
106 end_ = pos_ + fileSize_;
107 state_ = STATE_CONTENT;
108 firstAttribute_ = true;
111 XmlReader::~XmlReader() {
112 if (!fileHandle_)
113 return;
114 oslFileError e = osl_unmapMappedFile(fileHandle_, fileAddress_, fileSize_);
115 if (e != osl_File_E_None) {
116 SAL_WARN(
117 "xmlreader",
118 "osl_unmapMappedFile of \"" << fileUrl_ << "\" failed with " << +e);
120 e = osl_closeFile(fileHandle_);
121 if (e != osl_File_E_None) {
122 SAL_WARN(
123 "xmlreader",
124 "osl_closeFile of \"" << fileUrl_ << "\" failed with " << +e);
128 int XmlReader::registerNamespaceIri(Span const & iri) {
129 int id = toNamespaceId(namespaceIris_.size());
130 namespaceIris_.push_back(iri);
131 if (iri.equals("http://www.w3.org/2001/XMLSchema-instance")) {
132 // Old user layer .xcu files used the xsi namespace prefix without
133 // declaring a corresponding namespace binding, see issue 77174; reading
134 // those files during migration would fail without this hack that can be
135 // removed once migration is no longer relevant (see
136 // configmgr::Components::parseModificationLayer):
137 namespaces_.push_back(NamespaceData(Span("xsi"), id));
139 return id;
142 XmlReader::Result XmlReader::nextItem(Text reportText, Span * data, int * nsId)
144 switch (state_) {
145 case STATE_CONTENT:
146 switch (reportText) {
147 case TEXT_NONE:
148 return handleSkippedText(data, nsId);
149 case TEXT_RAW:
150 return handleRawText(data);
151 case TEXT_NORMALIZED:
152 return handleNormalizedText(data);
154 case STATE_START_TAG:
155 return handleStartTag(nsId, data);
156 case STATE_END_TAG:
157 return handleEndTag();
158 case STATE_EMPTY_ELEMENT_TAG:
159 handleElementEnd();
160 return RESULT_END;
161 default: // STATE_DONE
162 return RESULT_DONE;
166 bool XmlReader::nextAttribute(int * nsId, Span * localName) {
167 assert(nsId != 0 && localName != 0);
168 if (firstAttribute_) {
169 currentAttribute_ = attributes_.begin();
170 firstAttribute_ = false;
171 } else {
172 ++currentAttribute_;
174 if (currentAttribute_ == attributes_.end()) {
175 return false;
177 if (currentAttribute_->nameColon == 0) {
178 *nsId = NAMESPACE_NONE;
179 *localName = Span(
180 currentAttribute_->nameBegin,
181 currentAttribute_->nameEnd - currentAttribute_->nameBegin);
182 } else {
183 *nsId = getNamespaceId(
184 Span(
185 currentAttribute_->nameBegin,
186 currentAttribute_->nameColon - currentAttribute_->nameBegin));
187 *localName = Span(
188 currentAttribute_->nameColon + 1,
189 currentAttribute_->nameEnd - (currentAttribute_->nameColon + 1));
191 return true;
194 Span XmlReader::getAttributeValue(bool fullyNormalize) {
195 return handleAttributeValue(
196 currentAttribute_->valueBegin, currentAttribute_->valueEnd,
197 fullyNormalize);
200 int XmlReader::getNamespaceId(Span const & prefix) const {
201 for (NamespaceList::const_reverse_iterator i(namespaces_.rbegin());
202 i != namespaces_.rend(); ++i)
204 if (prefix.equals(i->prefix)) {
205 return i->nsId;
208 return NAMESPACE_UNKNOWN;
212 void XmlReader::normalizeLineEnds(Span const & text) {
213 char const * p = text.begin;
214 sal_Int32 n = text.length;
215 for (;;) {
216 sal_Int32 i = rtl_str_indexOfChar_WithLength(p, n, '\x0D');
217 if (i < 0) {
218 break;
220 pad_.add(p, i);
221 p += i + 1;
222 n -= i + 1;
223 if (n == 0 || *p != '\x0A') {
224 pad_.add("\x0A");
227 pad_.add(p, n);
230 void XmlReader::skipSpace() {
231 while (isSpace(peek())) {
232 ++pos_;
236 bool XmlReader::skipComment() {
237 if (rtl_str_shortenedCompare_WithLength(
238 pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("--"),
239 RTL_CONSTASCII_LENGTH("--")) !=
242 return false;
244 pos_ += RTL_CONSTASCII_LENGTH("--");
245 sal_Int32 i = rtl_str_indexOfStr_WithLength(
246 pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("--"));
247 if (i < 0) {
248 throw css::uno::RuntimeException(
249 "premature end (within comment) of " + fileUrl_ );
251 pos_ += i + RTL_CONSTASCII_LENGTH("--");
252 if (read() != '>') {
253 throw css::uno::RuntimeException(
254 "illegal \"--\" within comment in " + fileUrl_ );
256 return true;
259 void XmlReader::skipProcessingInstruction() {
260 sal_Int32 i = rtl_str_indexOfStr_WithLength(
261 pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("?>"));
262 if (i < 0) {
263 throw css::uno::RuntimeException(
264 "bad '<?' in " + fileUrl_ );
266 pos_ += i + RTL_CONSTASCII_LENGTH("?>");
269 void XmlReader::skipDocumentTypeDeclaration() {
270 // Neither is it checked that the doctypedecl is at the correct position in
271 // the document, nor that it is well-formed:
272 for (;;) {
273 char c = read();
274 switch (c) {
275 case '\0': // i.e., EOF
276 throw css::uno::RuntimeException(
277 "premature end (within DTD) of " + fileUrl_ );
278 case '"':
279 case '\'':
281 sal_Int32 i = rtl_str_indexOfChar_WithLength(
282 pos_, end_ - pos_, c);
283 if (i < 0) {
284 throw css::uno::RuntimeException(
285 "premature end (within DTD) of " + fileUrl_ );
287 pos_ += i + 1;
289 break;
290 case '>':
291 return;
292 case '[':
293 for (;;) {
294 c = read();
295 switch (c) {
296 case '\0': // i.e., EOF
297 throw css::uno::RuntimeException(
298 "premature end (within DTD) of " + fileUrl_ );
299 case '"':
300 case '\'':
302 sal_Int32 i = rtl_str_indexOfChar_WithLength(
303 pos_, end_ - pos_, c);
304 if (i < 0) {
305 throw css::uno::RuntimeException(
306 "premature end (within DTD) of " + fileUrl_ );
308 pos_ += i + 1;
310 break;
311 case '<':
312 switch (read()) {
313 case '\0': // i.e., EOF
314 throw css::uno::RuntimeException(
315 "premature end (within DTD) of " + fileUrl_ );
316 case '!':
317 skipComment();
318 break;
319 case '?':
320 skipProcessingInstruction();
321 break;
322 default:
323 break;
325 break;
326 case ']':
327 skipSpace();
328 if (read() != '>') {
329 throw css::uno::RuntimeException(
330 "missing \">\" of DTD in " + fileUrl_ );
332 return;
333 default:
334 break;
337 default:
338 break;
343 Span XmlReader::scanCdataSection() {
344 if (rtl_str_shortenedCompare_WithLength(
345 pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("[CDATA["),
346 RTL_CONSTASCII_LENGTH("[CDATA[")) !=
349 return Span();
351 pos_ += RTL_CONSTASCII_LENGTH("[CDATA[");
352 char const * begin = pos_;
353 sal_Int32 i = rtl_str_indexOfStr_WithLength(
354 pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("]]>"));
355 if (i < 0) {
356 throw css::uno::RuntimeException(
357 "premature end (within CDATA section) of " + fileUrl_ );
359 pos_ += i + RTL_CONSTASCII_LENGTH("]]>");
360 return Span(begin, i);
363 bool XmlReader::scanName(char const ** nameColon) {
364 assert(nameColon != 0 && *nameColon == 0);
365 for (char const * begin = pos_;; ++pos_) {
366 switch (peek()) {
367 case '\0': // i.e., EOF
368 case '\x09':
369 case '\x0A':
370 case '\x0D':
371 case ' ':
372 case '/':
373 case '=':
374 case '>':
375 return pos_ != begin;
376 case ':':
377 *nameColon = pos_;
378 break;
379 default:
380 break;
385 int XmlReader::scanNamespaceIri(char const * begin, char const * end) {
386 assert(begin != 0 && begin <= end);
387 Span iri(handleAttributeValue(begin, end, false));
388 for (NamespaceIris::size_type i = 0; i < namespaceIris_.size(); ++i) {
389 if (namespaceIris_[i].equals(iri)) {
390 return toNamespaceId(i);
393 return XmlReader::NAMESPACE_UNKNOWN;
396 char const * XmlReader::handleReference(char const * position, char const * end)
398 assert(position != 0 && *position == '&' && position < end);
399 ++position;
400 if (*position == '#') {
401 ++position;
402 sal_Int32 val = 0;
403 char const * p;
404 if (*position == 'x') {
405 ++position;
406 p = position;
407 for (;; ++position) {
408 char c = *position;
409 if (c >= '0' && c <= '9') {
410 val = 16 * val + (c - '0');
411 } else if (c >= 'A' && c <= 'F') {
412 val = 16 * val + (c - 'A') + 10;
413 } else if (c >= 'a' && c <= 'f') {
414 val = 16 * val + (c - 'a') + 10;
415 } else {
416 break;
418 if (val > 0x10FFFF) { // avoid overflow
419 throw css::uno::RuntimeException(
420 "'&#x...' too large in " + fileUrl_ );
423 } else {
424 p = position;
425 for (;; ++position) {
426 char c = *position;
427 if (c >= '0' && c <= '9') {
428 val = 10 * val + (c - '0');
429 } else {
430 break;
432 if (val > 0x10FFFF) { // avoid overflow
433 throw css::uno::RuntimeException(
434 "'&#...' too large in " + fileUrl_ );
438 if (position == p || *position++ != ';') {
439 throw css::uno::RuntimeException(
440 "'&#...' missing ';' in " + fileUrl_ );
442 assert(val >= 0 && val <= 0x10FFFF);
443 if ((val < 0x20 && val != 0x9 && val != 0xA && val != 0xD) ||
444 (val >= 0xD800 && val <= 0xDFFF) || val == 0xFFFE || val == 0xFFFF)
446 throw css::uno::RuntimeException(
447 "character reference denoting invalid character in " + fileUrl_ );
449 char buf[4];
450 sal_Int32 len;
451 if (val < 0x80) {
452 buf[0] = static_cast< char >(val);
453 len = 1;
454 } else if (val < 0x800) {
455 buf[0] = static_cast< char >((val >> 6) | 0xC0);
456 buf[1] = static_cast< char >((val & 0x3F) | 0x80);
457 len = 2;
458 } else if (val < 0x10000) {
459 buf[0] = static_cast< char >((val >> 12) | 0xE0);
460 buf[1] = static_cast< char >(((val >> 6) & 0x3F) | 0x80);
461 buf[2] = static_cast< char >((val & 0x3F) | 0x80);
462 len = 3;
463 } else {
464 buf[0] = static_cast< char >((val >> 18) | 0xF0);
465 buf[1] = static_cast< char >(((val >> 12) & 0x3F) | 0x80);
466 buf[2] = static_cast< char >(((val >> 6) & 0x3F) | 0x80);
467 buf[3] = static_cast< char >((val & 0x3F) | 0x80);
468 len = 4;
470 pad_.addEphemeral(buf, len);
471 return position;
472 } else {
473 struct EntityRef {
474 char const * inBegin;
475 sal_Int32 inLength;
476 char const * outBegin;
477 sal_Int32 outLength;
479 static EntityRef const refs[] = {
480 { RTL_CONSTASCII_STRINGPARAM("amp;"),
481 RTL_CONSTASCII_STRINGPARAM("&") },
482 { RTL_CONSTASCII_STRINGPARAM("lt;"),
483 RTL_CONSTASCII_STRINGPARAM("<") },
484 { RTL_CONSTASCII_STRINGPARAM("gt;"),
485 RTL_CONSTASCII_STRINGPARAM(">") },
486 { RTL_CONSTASCII_STRINGPARAM("apos;"),
487 RTL_CONSTASCII_STRINGPARAM("'") },
488 { RTL_CONSTASCII_STRINGPARAM("quot;"),
489 RTL_CONSTASCII_STRINGPARAM("\"") } };
490 for (std::size_t i = 0; i < sizeof refs / sizeof refs[0]; ++i) {
491 if (rtl_str_shortenedCompare_WithLength(
492 position, end - position, refs[i].inBegin, refs[i].inLength,
493 refs[i].inLength) ==
496 position += refs[i].inLength;
497 pad_.add(refs[i].outBegin, refs[i].outLength);
498 return position;
501 throw css::uno::RuntimeException(
502 "unknown entity reference in " + fileUrl_ );
506 Span XmlReader::handleAttributeValue(
507 char const * begin, char const * end, bool fullyNormalize)
509 pad_.clear();
510 if (fullyNormalize) {
511 while (begin != end && isSpace(*begin)) {
512 ++begin;
514 while (end != begin && isSpace(end[-1])) {
515 --end;
517 char const * p = begin;
518 enum Space { SPACE_NONE, SPACE_SPAN, SPACE_BREAK };
519 // a single true space character can go into the current span,
520 // everything else breaks the span
521 Space space = SPACE_NONE;
522 while (p != end) {
523 switch (*p) {
524 case '\x09':
525 case '\x0A':
526 case '\x0D':
527 switch (space) {
528 case SPACE_NONE:
529 pad_.add(begin, p - begin);
530 pad_.add(" ");
531 space = SPACE_BREAK;
532 break;
533 case SPACE_SPAN:
534 pad_.add(begin, p - begin);
535 space = SPACE_BREAK;
536 break;
537 case SPACE_BREAK:
538 break;
540 begin = ++p;
541 break;
542 case ' ':
543 switch (space) {
544 case SPACE_NONE:
545 ++p;
546 space = SPACE_SPAN;
547 break;
548 case SPACE_SPAN:
549 pad_.add(begin, p - begin);
550 begin = ++p;
551 space = SPACE_BREAK;
552 break;
553 case SPACE_BREAK:
554 begin = ++p;
555 break;
557 break;
558 case '&':
559 pad_.add(begin, p - begin);
560 p = handleReference(p, end);
561 begin = p;
562 space = SPACE_NONE;
563 break;
564 default:
565 ++p;
566 space = SPACE_NONE;
567 break;
570 pad_.add(begin, p - begin);
571 } else {
572 char const * p = begin;
573 while (p != end) {
574 switch (*p) {
575 case '\x09':
576 case '\x0A':
577 pad_.add(begin, p - begin);
578 begin = ++p;
579 pad_.add(" ");
580 break;
581 case '\x0D':
582 pad_.add(begin, p - begin);
583 ++p;
584 if (peek() == '\x0A') {
585 ++p;
587 begin = p;
588 pad_.add(" ");
589 break;
590 case '&':
591 pad_.add(begin, p - begin);
592 p = handleReference(p, end);
593 begin = p;
594 break;
595 default:
596 ++p;
597 break;
600 pad_.add(begin, p - begin);
602 return pad_.get();
605 XmlReader::Result XmlReader::handleStartTag(int * nsId, Span * localName) {
606 assert(nsId != 0 && localName);
607 char const * nameBegin = pos_;
608 char const * nameColon = 0;
609 if (!scanName(&nameColon)) {
610 throw css::uno::RuntimeException(
611 "bad tag name in " + fileUrl_ );
613 char const * nameEnd = pos_;
614 NamespaceList::size_type inheritedNamespaces = namespaces_.size();
615 bool hasDefaultNs = false;
616 int defaultNsId = NAMESPACE_NONE;
617 attributes_.clear();
618 for (;;) {
619 char const * p = pos_;
620 skipSpace();
621 if (peek() == '/' || peek() == '>') {
622 break;
624 if (pos_ == p) {
625 throw css::uno::RuntimeException(
626 "missing whitespace before attribute in " + fileUrl_ );
628 char const * attrNameBegin = pos_;
629 char const * attrNameColon = 0;
630 if (!scanName(&attrNameColon)) {
631 throw css::uno::RuntimeException(
632 "bad attribute name in " + fileUrl_ );
634 char const * attrNameEnd = pos_;
635 skipSpace();
636 if (read() != '=') {
637 throw css::uno::RuntimeException(
638 "missing '=' in " + fileUrl_ );
640 skipSpace();
641 char del = read();
642 if (del != '\'' && del != '"') {
643 throw css::uno::RuntimeException(
644 "bad attribute value in " + fileUrl_ );
646 char const * valueBegin = pos_;
647 sal_Int32 i = rtl_str_indexOfChar_WithLength(pos_, end_ - pos_, del);
648 if (i < 0) {
649 throw css::uno::RuntimeException(
650 "unterminated attribute value in " + fileUrl_ );
652 char const * valueEnd = pos_ + i;
653 pos_ += i + 1;
654 if (attrNameColon == 0 &&
655 Span(attrNameBegin, attrNameEnd - attrNameBegin).equals("xmlns"))
657 hasDefaultNs = true;
658 defaultNsId = scanNamespaceIri(valueBegin, valueEnd);
659 } else if (attrNameColon != 0 &&
660 Span(attrNameBegin, attrNameColon - attrNameBegin).equals(
661 "xmlns"))
663 namespaces_.push_back(
664 NamespaceData(
665 Span(attrNameColon + 1, attrNameEnd - (attrNameColon + 1)),
666 scanNamespaceIri(valueBegin, valueEnd)));
667 } else {
668 attributes_.push_back(
669 AttributeData(
670 attrNameBegin, attrNameEnd, attrNameColon, valueBegin,
671 valueEnd));
674 if (!hasDefaultNs && !elements_.empty()) {
675 defaultNsId = elements_.top().defaultNamespaceId;
677 firstAttribute_ = true;
678 if (peek() == '/') {
679 state_ = STATE_EMPTY_ELEMENT_TAG;
680 ++pos_;
681 } else {
682 state_ = STATE_CONTENT;
684 if (peek() != '>') {
685 throw css::uno::RuntimeException(
686 "missing '>' in " + fileUrl_ );
688 ++pos_;
689 elements_.push(
690 ElementData(
691 Span(nameBegin, nameEnd - nameBegin), inheritedNamespaces,
692 defaultNsId));
693 if (nameColon == 0) {
694 *nsId = defaultNsId;
695 *localName = Span(nameBegin, nameEnd - nameBegin);
696 } else {
697 *nsId = getNamespaceId(Span(nameBegin, nameColon - nameBegin));
698 *localName = Span(nameColon + 1, nameEnd - (nameColon + 1));
700 return RESULT_BEGIN;
703 XmlReader::Result XmlReader::handleEndTag() {
704 if (elements_.empty()) {
705 throw css::uno::RuntimeException(
706 "spurious end tag in " + fileUrl_ );
708 char const * nameBegin = pos_;
709 char const * nameColon = 0;
710 if (!scanName(&nameColon) ||
711 !elements_.top().name.equals(nameBegin, pos_ - nameBegin))
713 throw css::uno::RuntimeException(
714 "tag mismatch in " + fileUrl_ );
716 handleElementEnd();
717 skipSpace();
718 if (peek() != '>') {
719 throw css::uno::RuntimeException(
720 "missing '>' in " + fileUrl_ );
722 ++pos_;
723 return RESULT_END;
726 void XmlReader::handleElementEnd() {
727 assert(!elements_.empty());
728 namespaces_.resize(elements_.top().inheritedNamespaces);
729 elements_.pop();
730 state_ = elements_.empty() ? STATE_DONE : STATE_CONTENT;
733 XmlReader::Result XmlReader::handleSkippedText(Span * data, int * nsId) {
734 for (;;) {
735 sal_Int32 i = rtl_str_indexOfChar_WithLength(pos_, end_ - pos_, '<');
736 if (i < 0) {
737 throw css::uno::RuntimeException(
738 "premature end of " + fileUrl_ );
740 pos_ += i + 1;
741 switch (peek()) {
742 case '!':
743 ++pos_;
744 if (!skipComment() && !scanCdataSection().is()) {
745 skipDocumentTypeDeclaration();
747 break;
748 case '/':
749 ++pos_;
750 return handleEndTag();
751 case '?':
752 ++pos_;
753 skipProcessingInstruction();
754 break;
755 default:
756 return handleStartTag(nsId, data);
761 XmlReader::Result XmlReader::handleRawText(Span * text) {
762 pad_.clear();
763 for (char const * begin = pos_;;) {
764 switch (peek()) {
765 case '\0': // i.e., EOF
766 throw css::uno::RuntimeException(
767 "premature end of " + fileUrl_ );
768 case '\x0D':
769 pad_.add(begin, pos_ - begin);
770 ++pos_;
771 if (peek() != '\x0A') {
772 pad_.add("\x0A");
774 begin = pos_;
775 break;
776 case '&':
777 pad_.add(begin, pos_ - begin);
778 pos_ = handleReference(pos_, end_);
779 begin = pos_;
780 break;
781 case '<':
782 pad_.add(begin, pos_ - begin);
783 ++pos_;
784 switch (peek()) {
785 case '!':
786 ++pos_;
787 if (!skipComment()) {
788 Span cdata(scanCdataSection());
789 if (cdata.is()) {
790 normalizeLineEnds(cdata);
791 } else {
792 skipDocumentTypeDeclaration();
795 begin = pos_;
796 break;
797 case '/':
798 *text = pad_.get();
799 ++pos_;
800 state_ = STATE_END_TAG;
801 return RESULT_TEXT;
802 case '?':
803 ++pos_;
804 skipProcessingInstruction();
805 begin = pos_;
806 break;
807 default:
808 *text = pad_.get();
809 state_ = STATE_START_TAG;
810 return RESULT_TEXT;
812 break;
813 default:
814 ++pos_;
815 break;
820 XmlReader::Result XmlReader::handleNormalizedText(Span * text) {
821 pad_.clear();
822 char const * flowBegin = pos_;
823 char const * flowEnd = pos_;
824 enum Space { SPACE_START, SPACE_NONE, SPACE_SPAN, SPACE_BREAK };
825 // a single true space character can go into the current flow,
826 // everything else breaks the flow
827 Space space = SPACE_START;
828 for (;;) {
829 switch (peek()) {
830 case '\0': // i.e., EOF
831 throw css::uno::RuntimeException(
832 "premature end of " + fileUrl_ );
833 case '\x09':
834 case '\x0A':
835 case '\x0D':
836 switch (space) {
837 case SPACE_START:
838 case SPACE_BREAK:
839 break;
840 case SPACE_NONE:
841 case SPACE_SPAN:
842 space = SPACE_BREAK;
843 break;
845 ++pos_;
846 break;
847 case ' ':
848 switch (space) {
849 case SPACE_START:
850 case SPACE_BREAK:
851 break;
852 case SPACE_NONE:
853 space = SPACE_SPAN;
854 break;
855 case SPACE_SPAN:
856 space = SPACE_BREAK;
857 break;
859 ++pos_;
860 break;
861 case '&':
862 switch (space) {
863 case SPACE_START:
864 break;
865 case SPACE_NONE:
866 case SPACE_SPAN:
867 pad_.add(flowBegin, pos_ - flowBegin);
868 break;
869 case SPACE_BREAK:
870 pad_.add(flowBegin, flowEnd - flowBegin);
871 pad_.add(" ");
872 break;
874 pos_ = handleReference(pos_, end_);
875 flowBegin = pos_;
876 flowEnd = pos_;
877 space = SPACE_NONE;
878 break;
879 case '<':
880 ++pos_;
881 switch (peek()) {
882 case '!':
883 ++pos_;
884 if (skipComment()) {
885 space = SPACE_BREAK;
886 } else {
887 Span cdata(scanCdataSection());
888 if (cdata.is()) {
889 // CDATA is not normalized (similar to character
890 // references; it keeps the code simple), but it might
891 // arguably be better to normalize it:
892 switch (space) {
893 case SPACE_START:
894 break;
895 case SPACE_NONE:
896 case SPACE_SPAN:
897 pad_.add(flowBegin, pos_ - flowBegin);
898 break;
899 case SPACE_BREAK:
900 pad_.add(flowBegin, flowEnd - flowBegin);
901 pad_.add(" ");
902 break;
904 normalizeLineEnds(cdata);
905 flowBegin = pos_;
906 flowEnd = pos_;
907 space = SPACE_NONE;
908 } else {
909 skipDocumentTypeDeclaration();
912 break;
913 case '/':
914 ++pos_;
915 pad_.add(flowBegin, flowEnd - flowBegin);
916 *text = pad_.get();
917 state_ = STATE_END_TAG;
918 return RESULT_TEXT;
919 case '?':
920 ++pos_;
921 skipProcessingInstruction();
922 space = SPACE_BREAK;
923 break;
924 default:
925 pad_.add(flowBegin, flowEnd - flowBegin);
926 *text = pad_.get();
927 state_ = STATE_START_TAG;
928 return RESULT_TEXT;
930 break;
931 default:
932 switch (space) {
933 case SPACE_START:
934 flowBegin = pos_;
935 break;
936 case SPACE_NONE:
937 case SPACE_SPAN:
938 break;
939 case SPACE_BREAK:
940 pad_.add(flowBegin, flowEnd - flowBegin);
941 pad_.add(" ");
942 flowBegin = pos_;
943 break;
945 flowEnd = ++pos_;
946 space = SPACE_NONE;
947 break;
952 int XmlReader::toNamespaceId(NamespaceIris::size_type pos) {
953 assert(pos <= INT_MAX);
954 return static_cast< int >(pos);
959 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */