update credits
[LibreOffice.git] / xmlreader / source / xmlreader.cxx
bloba014892590fadc3791c89b6cab7bc9c65d13cf7a
1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 /*
3 * This file is part of the LibreOffice project.
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 * This file incorporates work covered by the following license notice:
11 * Licensed to the Apache Software Foundation (ASF) under one or more
12 * contributor license agreements. See the NOTICE file distributed
13 * with this work for additional information regarding copyright
14 * ownership. The ASF licenses this file to you under the Apache
15 * License, Version 2.0 (the "License"); you may not use this file
16 * except in compliance with the License. You may obtain a copy of
17 * the License at http://www.apache.org/licenses/LICENSE-2.0 .
20 #include "sal/config.h"
22 #include <cassert>
23 #include <climits>
24 #include <cstddef>
26 #include "com/sun/star/container/NoSuchElementException.hpp"
27 #include "com/sun/star/uno/Reference.hxx"
28 #include "com/sun/star/uno/RuntimeException.hpp"
29 #include "com/sun/star/uno/XInterface.hpp"
30 #include "osl/file.h"
31 #include "rtl/string.h"
32 #include "rtl/ustring.hxx"
33 #include "sal/log.hxx"
34 #include "sal/types.h"
35 #include "xmlreader/pad.hxx"
36 #include "xmlreader/span.hxx"
37 #include "xmlreader/xmlreader.hxx"
39 namespace xmlreader {
41 namespace {
// Tells whether c is one of the four characters this reader treats as white
// space: tab (0x09), line feed (0x0A), carriage return (0x0D) or space.
bool isSpace(char c) {
    return c == '\x09' || c == '\x0A' || c == '\x0D' || c == ' ';
}
57 XmlReader::XmlReader(OUString const & fileUrl)
58 SAL_THROW((
59 css::container::NoSuchElementException, css::uno::RuntimeException)):
60 fileUrl_(fileUrl)
62 oslFileError e = osl_openFile(
63 fileUrl_.pData, &fileHandle_, osl_File_OpenFlag_Read);
64 switch (e)
66 case osl_File_E_None:
67 break;
68 case osl_File_E_NOENT:
69 throw css::container::NoSuchElementException(
70 fileUrl_, css::uno::Reference< css::uno::XInterface >());
71 default:
72 throw css::uno::RuntimeException(
73 "cannot open " + fileUrl_ + ": " + OUString::number(e),
74 css::uno::Reference< css::uno::XInterface >());
76 e = osl_getFileSize(fileHandle_, &fileSize_);
77 if (e == osl_File_E_None) {
78 e = osl_mapFile(
79 fileHandle_, &fileAddress_, fileSize_, 0,
80 osl_File_MapFlag_WillNeed);
82 if (e != osl_File_E_None) {
83 oslFileError e2 = osl_closeFile(fileHandle_);
84 if (e2 != osl_File_E_None) {
85 SAL_WARN(
86 "xmlreader",
87 "osl_closeFile of \"" << fileUrl_ << "\" failed with " << +e2);
89 throw css::uno::RuntimeException(
90 "cannot mmap " + fileUrl_ + " (" + OUString::number(e) + ")",
91 css::uno::Reference< css::uno::XInterface >());
93 namespaceIris_.push_back(Span("http://www.w3.org/XML/1998/namespace"));
94 namespaces_.push_back(NamespaceData(Span("xml"), NAMESPACE_XML));
95 pos_ = static_cast< char * >(fileAddress_);
96 end_ = pos_ + fileSize_;
97 state_ = STATE_CONTENT;
98 firstAttribute_ = true;
101 XmlReader::~XmlReader() {
102 oslFileError e = osl_unmapMappedFile(fileHandle_, fileAddress_, fileSize_);
103 if (e != osl_File_E_None) {
104 SAL_WARN(
105 "xmlreader",
106 "osl_unmapMappedFile of \"" << fileUrl_ << "\" failed with " << +e);
108 e = osl_closeFile(fileHandle_);
109 if (e != osl_File_E_None) {
110 SAL_WARN(
111 "xmlreader",
112 "osl_closeFile of \"" << fileUrl_ << "\" failed with " << +e);
116 int XmlReader::registerNamespaceIri(Span const & iri) {
117 int id = toNamespaceId(namespaceIris_.size());
118 namespaceIris_.push_back(iri);
119 if (iri.equals("http://www.w3.org/2001/XMLSchema-instance")) {
120 // Old user layer .xcu files used the xsi namespace prefix without
121 // declaring a corresponding namespace binding, see issue 77174; reading
122 // those files during migration would fail without this hack that can be
123 // removed once migration is no longer relevant (see
124 // configmgr::Components::parseModificationLayer):
125 namespaces_.push_back(NamespaceData(Span("xsi"), id));
127 return id;
130 XmlReader::Result XmlReader::nextItem(Text reportText, Span * data, int * nsId)
132 switch (state_) {
133 case STATE_CONTENT:
134 switch (reportText) {
135 case TEXT_NONE:
136 return handleSkippedText(data, nsId);
137 case TEXT_RAW:
138 return handleRawText(data);
139 case TEXT_NORMALIZED:
140 return handleNormalizedText(data);
142 case STATE_START_TAG:
143 return handleStartTag(nsId, data);
144 case STATE_END_TAG:
145 return handleEndTag();
146 case STATE_EMPTY_ELEMENT_TAG:
147 handleElementEnd();
148 return RESULT_END;
149 default: // STATE_DONE
150 return RESULT_DONE;
154 bool XmlReader::nextAttribute(int * nsId, Span * localName) {
155 assert(nsId != 0 && localName != 0);
156 if (firstAttribute_) {
157 currentAttribute_ = attributes_.begin();
158 firstAttribute_ = false;
159 } else {
160 ++currentAttribute_;
162 if (currentAttribute_ == attributes_.end()) {
163 return false;
165 if (currentAttribute_->nameColon == 0) {
166 *nsId = NAMESPACE_NONE;
167 *localName = Span(
168 currentAttribute_->nameBegin,
169 currentAttribute_->nameEnd - currentAttribute_->nameBegin);
170 } else {
171 *nsId = getNamespaceId(
172 Span(
173 currentAttribute_->nameBegin,
174 currentAttribute_->nameColon - currentAttribute_->nameBegin));
175 *localName = Span(
176 currentAttribute_->nameColon + 1,
177 currentAttribute_->nameEnd - (currentAttribute_->nameColon + 1));
179 return true;
182 Span XmlReader::getAttributeValue(bool fullyNormalize) {
183 return handleAttributeValue(
184 currentAttribute_->valueBegin, currentAttribute_->valueEnd,
185 fullyNormalize);
188 int XmlReader::getNamespaceId(Span const & prefix) const {
189 for (NamespaceList::const_reverse_iterator i(namespaces_.rbegin());
190 i != namespaces_.rend(); ++i)
192 if (prefix.equals(i->prefix)) {
193 return i->nsId;
196 return NAMESPACE_UNKNOWN;
199 OUString XmlReader::getUrl() const {
200 return fileUrl_;
203 void XmlReader::normalizeLineEnds(Span const & text) {
204 char const * p = text.begin;
205 sal_Int32 n = text.length;
206 for (;;) {
207 sal_Int32 i = rtl_str_indexOfChar_WithLength(p, n, '\x0D');
208 if (i < 0) {
209 break;
211 pad_.add(p, i);
212 p += i + 1;
213 n -= i + 1;
214 if (n == 0 || *p != '\x0A') {
215 pad_.add("\x0A");
218 pad_.add(p, n);
221 void XmlReader::skipSpace() {
222 while (isSpace(peek())) {
223 ++pos_;
227 bool XmlReader::skipComment() {
228 if (rtl_str_shortenedCompare_WithLength(
229 pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("--"),
230 RTL_CONSTASCII_LENGTH("--")) !=
233 return false;
235 pos_ += RTL_CONSTASCII_LENGTH("--");
236 sal_Int32 i = rtl_str_indexOfStr_WithLength(
237 pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("--"));
238 if (i < 0) {
239 throw css::uno::RuntimeException(
240 "premature end (within comment) of " + fileUrl_,
241 css::uno::Reference< css::uno::XInterface >());
243 pos_ += i + RTL_CONSTASCII_LENGTH("--");
244 if (read() != '>') {
245 throw css::uno::RuntimeException(
246 "illegal \"--\" within comment in " + fileUrl_,
247 css::uno::Reference< css::uno::XInterface >());
249 return true;
252 void XmlReader::skipProcessingInstruction() {
253 sal_Int32 i = rtl_str_indexOfStr_WithLength(
254 pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("?>"));
255 if (i < 0) {
256 throw css::uno::RuntimeException(
257 "bad '<?' in " + fileUrl_,
258 css::uno::Reference< css::uno::XInterface >());
260 pos_ += i + RTL_CONSTASCII_LENGTH("?>");
// Skips a document type declaration by reading characters until its closing
// '>' has been consumed. Quoted strings ("..." or '...') are skipped as a
// whole, and a '[' starts an inner loop that runs until the matching "]",
// optional white space and '>' have been read.
// Throws css::uno::RuntimeException when the input ends first, or when the
// ']' closing the bracketed part is not followed by '>'.
263 void XmlReader::skipDocumentTypeDeclaration() {
264 // Neither is it checked that the doctypedecl is at the correct position in
265 // the document, nor that it is well-formed:
266 for (;;) {
267 char c = read();
268 switch (c) {
269 case '\0': // i.e., EOF
270 throw css::uno::RuntimeException(
271 "premature end (within DTD) of " + fileUrl_,
272 css::uno::Reference< css::uno::XInterface >());
273 case '"':
274 case '\'':
// Quoted literal: jump to just behind the matching closing quote c.
276 sal_Int32 i = rtl_str_indexOfChar_WithLength(
277 pos_, end_ - pos_, c);
278 if (i < 0) {
279 throw css::uno::RuntimeException(
280 "premature end (within DTD) of " + fileUrl_,
281 css::uno::Reference< css::uno::XInterface >());
283 pos_ += i + 1;
285 break;
286 case '>':
287 return;
288 case '[':
// Bracketed part: scan until its closing ']' (followed by '>').
289 for (;;) {
290 c = read();
291 switch (c) {
292 case '\0': // i.e., EOF
293 throw css::uno::RuntimeException(
294 "premature end (within DTD) of " + fileUrl_,
295 css::uno::Reference< css::uno::XInterface >());
296 case '"':
297 case '\'':
// Quoted literal inside the bracketed part, skipped like above.
299 sal_Int32 i = rtl_str_indexOfChar_WithLength(
300 pos_, end_ - pos_, c);
301 if (i < 0) {
302 throw css::uno::RuntimeException(
303 "premature end (within DTD) of " + fileUrl_,
304 css::uno::Reference< css::uno::XInterface >());
306 pos_ += i + 1;
308 break;
309 case '<':
// Markup inside the bracketed part: comments ("<!--") and processing
// instructions ("<?") are skipped as a whole; the result of skipComment is
// ignored, so any other "<!..." is simply scanned on character by character.
310 switch (read()) {
311 case '\0': // i.e., EOF
312 throw css::uno::RuntimeException(
313 "premature end (within DTD) of " + fileUrl_,
314 css::uno::Reference< css::uno::XInterface >());
315 case '!':
316 skipComment();
317 break;
318 case '?':
319 skipProcessingInstruction();
320 break;
321 default:
322 break;
324 break;
325 case ']':
// End of the bracketed part: only white space may precede the final '>'.
326 skipSpace();
327 if (read() != '>') {
328 throw css::uno::RuntimeException(
329 "missing \">\" of DTD in " + fileUrl_,
330 css::uno::Reference< css::uno::XInterface >());
332 return;
333 default:
334 break;
337 default:
338 break;
343 Span XmlReader::scanCdataSection() {
344 if (rtl_str_shortenedCompare_WithLength(
345 pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("[CDATA["),
346 RTL_CONSTASCII_LENGTH("[CDATA[")) !=
349 return Span();
351 pos_ += RTL_CONSTASCII_LENGTH("[CDATA[");
352 char const * begin = pos_;
353 sal_Int32 i = rtl_str_indexOfStr_WithLength(
354 pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("]]>"));
355 if (i < 0) {
356 throw css::uno::RuntimeException(
357 "premature end (within CDATA section) of " + fileUrl_,
358 css::uno::Reference< css::uno::XInterface >());
360 pos_ += i + RTL_CONSTASCII_LENGTH("]]>");
361 return Span(begin, i);
364 bool XmlReader::scanName(char const ** nameColon) {
365 assert(nameColon != 0 && *nameColon == 0);
366 for (char const * begin = pos_;; ++pos_) {
367 switch (peek()) {
368 case '\0': // i.e., EOF
369 case '\x09':
370 case '\x0A':
371 case '\x0D':
372 case ' ':
373 case '/':
374 case '=':
375 case '>':
376 return pos_ != begin;
377 case ':':
378 *nameColon = pos_;
379 break;
380 default:
381 break;
386 int XmlReader::scanNamespaceIri(char const * begin, char const * end) {
387 assert(begin != 0 && begin <= end);
388 Span iri(handleAttributeValue(begin, end, false));
389 for (NamespaceIris::size_type i = 0; i < namespaceIris_.size(); ++i) {
390 if (namespaceIris_[i].equals(iri)) {
391 return toNamespaceId(i);
394 return XmlReader::NAMESPACE_UNKNOWN;
397 char const * XmlReader::handleReference(char const * position, char const * end)
399 assert(position != 0 && *position == '&' && position < end);
400 ++position;
401 if (*position == '#') {
402 ++position;
403 sal_Int32 val = 0;
404 char const * p;
405 if (*position == 'x') {
406 ++position;
407 p = position;
408 for (;; ++position) {
409 char c = *position;
410 if (c >= '0' && c <= '9') {
411 val = 16 * val + (c - '0');
412 } else if (c >= 'A' && c <= 'F') {
413 val = 16 * val + (c - 'A') + 10;
414 } else if (c >= 'a' && c <= 'f') {
415 val = 16 * val + (c - 'a') + 10;
416 } else {
417 break;
419 if (val > 0x10FFFF) { // avoid overflow
420 throw css::uno::RuntimeException(
421 "'&#x...' too large in " + fileUrl_,
422 css::uno::Reference< css::uno::XInterface >());
425 } else {
426 p = position;
427 for (;; ++position) {
428 char c = *position;
429 if (c >= '0' && c <= '9') {
430 val = 10 * val + (c - '0');
431 } else {
432 break;
434 if (val > 0x10FFFF) { // avoid overflow
435 throw css::uno::RuntimeException(
436 "'&#...' too large in " + fileUrl_,
437 css::uno::Reference< css::uno::XInterface >());
441 if (position == p || *position++ != ';') {
442 throw css::uno::RuntimeException(
443 "'&#...' missing ';' in " + fileUrl_,
444 css::uno::Reference< css::uno::XInterface >());
446 assert(val >= 0 && val <= 0x10FFFF);
447 if ((val < 0x20 && val != 0x9 && val != 0xA && val != 0xD) ||
448 (val >= 0xD800 && val <= 0xDFFF) || val == 0xFFFE || val == 0xFFFF)
450 throw css::uno::RuntimeException(
451 "character reference denoting invalid character in " + fileUrl_,
452 css::uno::Reference< css::uno::XInterface >());
454 char buf[4];
455 sal_Int32 len;
456 if (val < 0x80) {
457 buf[0] = static_cast< char >(val);
458 len = 1;
459 } else if (val < 0x800) {
460 buf[0] = static_cast< char >((val >> 6) | 0xC0);
461 buf[1] = static_cast< char >((val & 0x3F) | 0x80);
462 len = 2;
463 } else if (val < 0x10000) {
464 buf[0] = static_cast< char >((val >> 12) | 0xE0);
465 buf[1] = static_cast< char >(((val >> 6) & 0x3F) | 0x80);
466 buf[2] = static_cast< char >((val & 0x3F) | 0x80);
467 len = 3;
468 } else {
469 buf[0] = static_cast< char >((val >> 18) | 0xF0);
470 buf[1] = static_cast< char >(((val >> 12) & 0x3F) | 0x80);
471 buf[2] = static_cast< char >(((val >> 6) & 0x3F) | 0x80);
472 buf[3] = static_cast< char >((val & 0x3F) | 0x80);
473 len = 4;
475 pad_.addEphemeral(buf, len);
476 return position;
477 } else {
478 struct EntityRef {
479 char const * inBegin;
480 sal_Int32 inLength;
481 char const * outBegin;
482 sal_Int32 outLength;
484 static EntityRef const refs[] = {
485 { RTL_CONSTASCII_STRINGPARAM("amp;"),
486 RTL_CONSTASCII_STRINGPARAM("&") },
487 { RTL_CONSTASCII_STRINGPARAM("lt;"),
488 RTL_CONSTASCII_STRINGPARAM("<") },
489 { RTL_CONSTASCII_STRINGPARAM("gt;"),
490 RTL_CONSTASCII_STRINGPARAM(">") },
491 { RTL_CONSTASCII_STRINGPARAM("apos;"),
492 RTL_CONSTASCII_STRINGPARAM("'") },
493 { RTL_CONSTASCII_STRINGPARAM("quot;"),
494 RTL_CONSTASCII_STRINGPARAM("\"") } };
495 for (std::size_t i = 0; i < sizeof refs / sizeof refs[0]; ++i) {
496 if (rtl_str_shortenedCompare_WithLength(
497 position, end - position, refs[i].inBegin, refs[i].inLength,
498 refs[i].inLength) ==
501 position += refs[i].inLength;
502 pad_.add(refs[i].outBegin, refs[i].outLength);
503 return position;
506 throw css::uno::RuntimeException(
507 "unknown entity reference in " + fileUrl_,
508 css::uno::Reference< css::uno::XInterface >());
512 Span XmlReader::handleAttributeValue(
513 char const * begin, char const * end, bool fullyNormalize)
515 pad_.clear();
516 if (fullyNormalize) {
517 while (begin != end && isSpace(*begin)) {
518 ++begin;
520 while (end != begin && isSpace(end[-1])) {
521 --end;
523 char const * p = begin;
524 enum Space { SPACE_NONE, SPACE_SPAN, SPACE_BREAK };
525 // a single true space character can go into the current span,
526 // everything else breaks the span
527 Space space = SPACE_NONE;
528 while (p != end) {
529 switch (*p) {
530 case '\x09':
531 case '\x0A':
532 case '\x0D':
533 switch (space) {
534 case SPACE_NONE:
535 pad_.add(begin, p - begin);
536 pad_.add(" ");
537 space = SPACE_BREAK;
538 break;
539 case SPACE_SPAN:
540 pad_.add(begin, p - begin);
541 space = SPACE_BREAK;
542 break;
543 case SPACE_BREAK:
544 break;
546 begin = ++p;
547 break;
548 case ' ':
549 switch (space) {
550 case SPACE_NONE:
551 ++p;
552 space = SPACE_SPAN;
553 break;
554 case SPACE_SPAN:
555 pad_.add(begin, p - begin);
556 begin = ++p;
557 space = SPACE_BREAK;
558 break;
559 case SPACE_BREAK:
560 begin = ++p;
561 break;
563 break;
564 case '&':
565 pad_.add(begin, p - begin);
566 p = handleReference(p, end);
567 begin = p;
568 space = SPACE_NONE;
569 break;
570 default:
571 ++p;
572 space = SPACE_NONE;
573 break;
576 pad_.add(begin, p - begin);
577 } else {
578 char const * p = begin;
579 while (p != end) {
580 switch (*p) {
581 case '\x09':
582 case '\x0A':
583 pad_.add(begin, p - begin);
584 begin = ++p;
585 pad_.add(" ");
586 break;
587 case '\x0D':
588 pad_.add(begin, p - begin);
589 ++p;
590 if (peek() == '\x0A') {
591 ++p;
593 begin = p;
594 pad_.add(" ");
595 break;
596 case '&':
597 pad_.add(begin, p - begin);
598 p = handleReference(p, end);
599 begin = p;
600 break;
601 default:
602 ++p;
603 break;
606 pad_.add(begin, p - begin);
608 return pad_.get();
// Parses a start tag or empty-element tag, with the read position at the first
// character of the element name. Collects the tag's ordinary attributes in
// attributes_, records namespace declarations, pushes the element onto
// elements_, stores the element's namespace id and local name in *nsId and
// *localName, and returns RESULT_BEGIN. Afterwards state_ is
// STATE_EMPTY_ELEMENT_TAG for "<.../>" and STATE_CONTENT otherwise.
// Throws css::uno::RuntimeException on malformed tag syntax (see the messages
// below).
611 XmlReader::Result XmlReader::handleStartTag(int * nsId, Span * localName) {
612 assert(nsId != 0 && localName);
613 char const * nameBegin = pos_;
614 char const * nameColon = 0;
615 if (!scanName(&nameColon)) {
616 throw css::uno::RuntimeException(
617 "bad tag name in " + fileUrl_,
618 css::uno::Reference< css::uno::XInterface >());
620 char const * nameEnd = pos_;
// Remember how many bindings exist now; this count is stored with the element
// and used by handleElementEnd to shrink namespaces_ again.
621 NamespaceList::size_type inheritedNamespaces = namespaces_.size();
622 bool hasDefaultNs = false;
623 int defaultNsId = NAMESPACE_NONE;
624 attributes_.clear();
// One iteration per attribute, until '/' or '>' is seen.
625 for (;;) {
626 char const * p = pos_;
627 skipSpace();
628 if (peek() == '/' || peek() == '>') {
629 break;
// skipSpace did not move: the attribute is not separated by white space.
631 if (pos_ == p) {
632 throw css::uno::RuntimeException(
633 "missing whitespace before attribute in " + fileUrl_,
634 css::uno::Reference< css::uno::XInterface >());
636 char const * attrNameBegin = pos_;
637 char const * attrNameColon = 0;
638 if (!scanName(&attrNameColon)) {
639 throw css::uno::RuntimeException(
640 "bad attribute name in " + fileUrl_,
641 css::uno::Reference< css::uno::XInterface >());
643 char const * attrNameEnd = pos_;
644 skipSpace();
645 if (read() != '=') {
646 throw css::uno::RuntimeException(
647 "missing '=' in " + fileUrl_,
648 css::uno::Reference< css::uno::XInterface >());
650 skipSpace();
// The value is delimited by a pair of identical quote characters (' or ").
651 char del = read();
652 if (del != '\'' && del != '"') {
653 throw css::uno::RuntimeException(
654 "bad attribute value in " + fileUrl_,
655 css::uno::Reference< css::uno::XInterface >());
657 char const * valueBegin = pos_;
658 sal_Int32 i = rtl_str_indexOfChar_WithLength(pos_, end_ - pos_, del);
659 if (i < 0) {
660 throw css::uno::RuntimeException(
661 "unterminated attribute value in " + fileUrl_,
662 css::uno::Reference< css::uno::XInterface >());
664 char const * valueEnd = pos_ + i;
665 pos_ += i + 1;
// Three kinds of attributes: xmlns="..." sets the default namespace,
// xmlns:prefix="..." adds a prefix binding, and everything else is recorded
// as an ordinary attribute (raw name and value positions only).
666 if (attrNameColon == 0 &&
667 Span(attrNameBegin, attrNameEnd - attrNameBegin).equals("xmlns"))
669 hasDefaultNs = true;
670 defaultNsId = scanNamespaceIri(valueBegin, valueEnd);
671 } else if (attrNameColon != 0 &&
672 Span(attrNameBegin, attrNameColon - attrNameBegin).equals(
673 "xmlns"))
675 namespaces_.push_back(
676 NamespaceData(
677 Span(attrNameColon + 1, attrNameEnd - (attrNameColon + 1)),
678 scanNamespaceIri(valueBegin, valueEnd)));
679 } else {
680 attributes_.push_back(
681 AttributeData(
682 attrNameBegin, attrNameEnd, attrNameColon, valueBegin,
683 valueEnd));
// Without an xmlns attribute of its own, the element takes over the default
// namespace of the enclosing element, if there is one.
686 if (!hasDefaultNs && !elements_.empty()) {
687 defaultNsId = elements_.top().defaultNamespaceId;
// Make the next nextAttribute call start at the first collected attribute.
689 firstAttribute_ = true;
690 if (peek() == '/') {
691 state_ = STATE_EMPTY_ELEMENT_TAG;
692 ++pos_;
693 } else {
694 state_ = STATE_CONTENT;
696 if (peek() != '>') {
697 throw css::uno::RuntimeException(
698 "missing '>' in " + fileUrl_,
699 css::uno::Reference< css::uno::XInterface >());
701 ++pos_;
702 elements_.push(
703 ElementData(
704 Span(nameBegin, nameEnd - nameBegin), inheritedNamespaces,
705 defaultNsId));
// Unprefixed element names are in the default namespace; prefixed ones are
// resolved through the current bindings.
706 if (nameColon == 0) {
707 *nsId = defaultNsId;
708 *localName = Span(nameBegin, nameEnd - nameBegin);
709 } else {
710 *nsId = getNamespaceId(Span(nameBegin, nameColon - nameBegin));
711 *localName = Span(nameColon + 1, nameEnd - (nameColon + 1));
713 return RESULT_BEGIN;
716 XmlReader::Result XmlReader::handleEndTag() {
717 if (elements_.empty()) {
718 throw css::uno::RuntimeException(
719 "spurious end tag in " + fileUrl_,
720 css::uno::Reference< css::uno::XInterface >());
722 char const * nameBegin = pos_;
723 char const * nameColon = 0;
724 if (!scanName(&nameColon) ||
725 !elements_.top().name.equals(nameBegin, pos_ - nameBegin))
727 throw css::uno::RuntimeException(
728 "tag mismatch in " + fileUrl_,
729 css::uno::Reference< css::uno::XInterface >());
731 handleElementEnd();
732 skipSpace();
733 if (peek() != '>') {
734 throw css::uno::RuntimeException(
735 "missing '>' in " + fileUrl_,
736 css::uno::Reference< css::uno::XInterface >());
738 ++pos_;
739 return RESULT_END;
742 void XmlReader::handleElementEnd() {
743 assert(!elements_.empty());
744 namespaces_.resize(elements_.top().inheritedNamespaces);
745 elements_.pop();
746 state_ = elements_.empty() ? STATE_DONE : STATE_CONTENT;
749 XmlReader::Result XmlReader::handleSkippedText(Span * data, int * nsId) {
750 for (;;) {
751 sal_Int32 i = rtl_str_indexOfChar_WithLength(pos_, end_ - pos_, '<');
752 if (i < 0) {
753 throw css::uno::RuntimeException(
754 "premature end of " + fileUrl_,
755 css::uno::Reference< css::uno::XInterface >());
757 pos_ += i + 1;
758 switch (peek()) {
759 case '!':
760 ++pos_;
761 if (!skipComment() && !scanCdataSection().is()) {
762 skipDocumentTypeDeclaration();
764 break;
765 case '/':
766 ++pos_;
767 return handleEndTag();
768 case '?':
769 ++pos_;
770 skipProcessingInstruction();
771 break;
772 default:
773 return handleStartTag(nsId, data);
// Collects the text content up to the next start or end tag in pad_, stores it
// in *text and returns RESULT_TEXT; state_ is set to STATE_END_TAG or
// STATE_START_TAG so that the tag itself is processed by the next call.
// While collecting, carriage returns are turned into line feeds (a CR directly
// followed by LF is just dropped), references are expanded, CDATA content is
// appended via normalizeLineEnds, and comments, document type declarations and
// processing instructions are skipped.
// Throws css::uno::RuntimeException if the input ends before a tag is found.
778 XmlReader::Result XmlReader::handleRawText(Span * text) {
779 pad_.clear();
// begin marks the start of the text not yet copied to pad_.
780 for (char const * begin = pos_;;) {
781 switch (peek()) {
782 case '\0': // i.e., EOF
783 throw css::uno::RuntimeException(
784 "premature end of " + fileUrl_,
785 css::uno::Reference< css::uno::XInterface >());
786 case '\x0D':
787 pad_.add(begin, pos_ - begin);
788 ++pos_;
// A lone CR becomes LF; for CR LF the LF is copied with the following text.
789 if (peek() != '\x0A') {
790 pad_.add("\x0A");
792 begin = pos_;
793 break;
794 case '&':
795 pad_.add(begin, pos_ - begin);
796 pos_ = handleReference(pos_, end_);
797 begin = pos_;
798 break;
799 case '<':
// Flush the pending text before looking at the markup.
800 pad_.add(begin, pos_ - begin);
801 ++pos_;
802 switch (peek()) {
803 case '!':
804 ++pos_;
805 if (!skipComment()) {
806 Span cdata(scanCdataSection());
807 if (cdata.is()) {
808 normalizeLineEnds(cdata);
809 } else {
810 skipDocumentTypeDeclaration();
813 begin = pos_;
814 break;
815 case '/':
816 *text = pad_.get();
817 ++pos_;
818 state_ = STATE_END_TAG;
819 return RESULT_TEXT;
820 case '?':
821 ++pos_;
822 skipProcessingInstruction();
823 begin = pos_;
824 break;
825 default:
// Start tag: pos_ stays at the first character of the element name.
826 *text = pad_.get();
827 state_ = STATE_START_TAG;
828 return RESULT_TEXT;
830 break;
831 default:
832 ++pos_;
833 break;
// Collects the text content up to the next start or end tag in pad_ with white
// space normalized, stores it in *text and returns RESULT_TEXT; state_ is set
// to STATE_END_TAG or STATE_START_TAG so that the tag itself is processed by
// the next call.
// [flowBegin, flowEnd) is the pending stretch of text not yet copied to pad_;
// flowEnd only advances over non-white-space characters, so white space at the
// end of the text is not copied. White space before the first non-white-space
// character is dropped (SPACE_START), and when a flow is broken (SPACE_BREAK)
// a single space is inserted before the next piece of text.
// Throws css::uno::RuntimeException if the input ends before a tag is found.
838 XmlReader::Result XmlReader::handleNormalizedText(Span * text) {
839 pad_.clear();
840 char const * flowBegin = pos_;
841 char const * flowEnd = pos_;
842 enum Space { SPACE_START, SPACE_NONE, SPACE_SPAN, SPACE_BREAK };
843 // a single true space character can go into the current flow,
844 // everything else breaks the flow
845 Space space = SPACE_START;
846 for (;;) {
847 switch (peek()) {
848 case '\0': // i.e., EOF
849 throw css::uno::RuntimeException(
850 "premature end of " + fileUrl_,
851 css::uno::Reference< css::uno::XInterface >());
852 case '\x09':
853 case '\x0A':
854 case '\x0D':
// Tab, LF and CR always break a flow that has already started.
855 switch (space) {
856 case SPACE_START:
857 case SPACE_BREAK:
858 break;
859 case SPACE_NONE:
860 case SPACE_SPAN:
861 space = SPACE_BREAK;
862 break;
864 ++pos_;
865 break;
866 case ' ':
// The first space after text may stay inside the flow (SPACE_SPAN); a second
// one in a row breaks it.
867 switch (space) {
868 case SPACE_START:
869 case SPACE_BREAK:
870 break;
871 case SPACE_NONE:
872 space = SPACE_SPAN;
873 break;
874 case SPACE_SPAN:
875 space = SPACE_BREAK;
876 break;
878 ++pos_;
879 break;
880 case '&':
// Flush the pending flow (up to pos_, or up to flowEnd plus a separating
// space after a break), then append the expanded reference.
881 switch (space) {
882 case SPACE_START:
883 break;
884 case SPACE_NONE:
885 case SPACE_SPAN:
886 pad_.add(flowBegin, pos_ - flowBegin);
887 break;
888 case SPACE_BREAK:
889 pad_.add(flowBegin, flowEnd - flowBegin);
890 pad_.add(" ");
891 break;
893 pos_ = handleReference(pos_, end_);
894 flowBegin = pos_;
895 flowEnd = pos_;
896 space = SPACE_NONE;
897 break;
898 case '<':
899 ++pos_;
900 switch (peek()) {
901 case '!':
902 ++pos_;
// A skipped comment acts as a break between the surrounding pieces of text.
903 if (skipComment()) {
904 space = SPACE_BREAK;
905 } else {
906 Span cdata(scanCdataSection());
907 if (cdata.is()) {
908 // CDATA is not normalized (similar to character
909 // references; it keeps the code simple), but it might
910 // arguably be better to normalize it:
911 switch (space) {
912 case SPACE_START:
913 break;
914 case SPACE_NONE:
915 case SPACE_SPAN:
916 pad_.add(flowBegin, pos_ - flowBegin);
917 break;
918 case SPACE_BREAK:
919 pad_.add(flowBegin, flowEnd - flowBegin);
920 pad_.add(" ");
921 break;
923 normalizeLineEnds(cdata);
924 flowBegin = pos_;
925 flowEnd = pos_;
926 space = SPACE_NONE;
927 } else {
928 skipDocumentTypeDeclaration();
931 break;
932 case '/':
// End tag: flush the pending flow without its trailing white space.
933 ++pos_;
934 pad_.add(flowBegin, flowEnd - flowBegin);
935 *text = pad_.get();
936 state_ = STATE_END_TAG;
937 return RESULT_TEXT;
938 case '?':
939 ++pos_;
940 skipProcessingInstruction();
941 space = SPACE_BREAK;
942 break;
943 default:
// Start tag: pos_ stays at the first character of the element name.
944 pad_.add(flowBegin, flowEnd - flowBegin);
945 *text = pad_.get();
946 state_ = STATE_START_TAG;
947 return RESULT_TEXT;
949 break;
950 default:
// Ordinary character: start a new flow if needed (after a break, first flush
// the previous flow followed by one separating space) and extend the flow.
951 switch (space) {
952 case SPACE_START:
953 flowBegin = pos_;
954 break;
955 case SPACE_NONE:
956 case SPACE_SPAN:
957 break;
958 case SPACE_BREAK:
959 pad_.add(flowBegin, flowEnd - flowBegin);
960 pad_.add(" ");
961 flowBegin = pos_;
962 break;
964 flowEnd = ++pos_;
965 space = SPACE_NONE;
966 break;
971 int XmlReader::toNamespaceId(NamespaceIris::size_type pos) {
972 assert(pos <= INT_MAX);
973 return static_cast< int >(pos);
978 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */