2 ==============================================================================
4 This file is part of the Water library.
5 Copyright (c) 2016 ROLI Ltd.
6 Copyright (C) 2017-2022 Filipe Coelho <falktx@falktx.com>
8 Permission is granted to use this software under the terms of the ISC license
9 http://www.isc.org/downloads/software-support-policy/isc-license/
11 Permission to use, copy, modify, and/or distribute this software for any
12 purpose with or without fee is hereby granted, provided that the above
13 copyright notice and this permission notice appear in all copies.
15 THE SOFTWARE IS PROVIDED "AS IS" AND ISC DISCLAIMS ALL WARRANTIES WITH REGARD
16 TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
17 FITNESS. IN NO EVENT SHALL ISC BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT,
18 OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF
19 USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
20 TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
23 ==============================================================================
26 #include "XmlDocument.h"
27 #include "XmlElement.h"
28 #include "../containers/LinkedListPointer.h"
29 #include "../streams/FileInputSource.h"
30 #include "../streams/InputStream.h"
31 #include "../streams/MemoryOutputStream.h"
// Builds a document around XML text that is already in memory.
// The text is kept in originalText; errorOccurred and needToLoadDTD start
// out false, and ignoreEmptyTextElements starts out true.
35 XmlDocument::XmlDocument (const String
& documentText
)
36 : originalText (documentText
),
39 errorOccurred (false),
40 needToLoadDTD (false),
41 ignoreEmptyTextElements (true)
// Builds a document whose text will come from a file.
// A newly allocated FileInputSource for `file` is handed to inputSource;
// errorOccurred and needToLoadDTD start out false, and
// ignoreEmptyTextElements starts out true.
45 XmlDocument::XmlDocument (const File
& file
)
48 errorOccurred (false),
49 needToLoadDTD (false),
50 ignoreEmptyTextElements (true),
51 inputSource (new FileInputSource (file
))
// Destructor.
55 XmlDocument::~XmlDocument()
59 XmlElement
* XmlDocument::parse (const File
& file
)
61 XmlDocument
doc (file
);
62 return doc
.getDocumentElement();
65 XmlElement
* XmlDocument::parse (const String
& xmlData
)
67 XmlDocument
doc (xmlData
);
68 return doc
.getDocumentElement();
71 void XmlDocument::setInputSource (FileInputSource
* const newSource
) noexcept
73 inputSource
= newSource
;
76 void XmlDocument::setEmptyTextElementsIgnored (const bool shouldBeIgnored
) noexcept
78 ignoreEmptyTextElements
= shouldBeIgnored
;
81 namespace XmlIdentifierChars
83 static bool isIdentifierCharSlow (const water_uchar c
) noexcept
85 return CharacterFunctions::isLetterOrDigit (c
)
86 || c
== '_' || c
== '-' || c
== ':' || c
== '.';
89 static bool isIdentifierChar (const water_uchar c
) noexcept
91 static const uint32 legalChars
[] = { 0, 0x7ff6000, 0x87fffffe, 0x7fffffe, 0 };
93 return ((int) c
< (int) numElementsInArray (legalChars
) * 32) ? ((legalChars
[c
>> 5] & (1 << (c
& 31))) != 0)
94 : isIdentifierCharSlow (c
);
97 /*static void generateIdentifierCharConstants()
100 for (int i = 0; i < 256; ++i)
101 if (isIdentifierCharSlow (i))
102 n[i >> 5] |= (1 << (i & 31));
105 for (int i = 0; i < 8; ++i)
106 s << "0x" << String::toHexString ((int) n[i]) << ", ";
111 static CharPointer_UTF8
findEndOfToken (CharPointer_UTF8 p
)
113 while (isIdentifierChar (*p
))
// Parses the document and returns the result of parseDocumentElement().
//
// When originalText is empty and an inputSource is set, the text is read from
// a stream created by that source and parsed directly from the read buffer;
// otherwise originalText is parsed.
//
// onlyReadOuterDocumentElement: forwarded to parseDocumentElement(); when
// true, at most 8192 bytes are requested from the stream, otherwise -1 is
// passed as the byte limit.
120 XmlElement
* XmlDocument::getDocumentElement (const bool onlyReadOuterDocumentElement
)
122 if (originalText
.isEmpty() && inputSource
!= nullptr)
124 CarlaScopedPointer
<InputStream
> in (inputSource
->createInputStream());
128 MemoryOutputStream data
;
129 data
.writeFromInputStream (*in
, onlyReadOuterDocumentElement
? 8192 : -1);
// only buffers holding more than two bytes are parsed from here
131 if (data
.getDataSize() > 2)
134 const char* text
= static_cast<const char*> (data
.getData());
// the buffer is checked for a leading UTF-8 byte-order mark
136 if (CharPointer_UTF8::isByteOrderMark (text
))
139 // parse the input buffer directly to avoid copying it all to a string..
140 return parseDocumentElement (CharPointer_UTF8 (text
), onlyReadOuterDocumentElement
);
// fallback: parse the text supplied at construction time
145 return parseDocumentElement (originalText
.getCharPointer(), onlyReadOuterDocumentElement
);
// Returns, by const reference, the message describing the most recent parse
// error (presumably the lastError member - confirm).
148 const String
& XmlDocument::getLastParseError() const noexcept
// Reports a parse problem.
//
// desc:    description of the problem.
// carryOn: errorOccurred is overwritten with the inverse of this flag, so
//          passing true leaves errorOccurred false after the call.
153 void XmlDocument::setLastError (const String
& desc
, const bool carryOn
)
156 errorOccurred
= ! carryOn
;
// Loads the content of a file referenced from the document.
//
// When an inputSource is set, it is asked for a stream for `filename` (after
// trimming and unquoting the name) and the whole stream is returned as a
// String.
159 String
XmlDocument::getFileContents (const String
& filename
) const
161 if (inputSource
!= nullptr)
163 const CarlaScopedPointer
<InputStream
> in (inputSource
->createInputStreamFor (filename
.trim().unquoted()));
166 return in
->readEntireStreamAsString();
// Fetches the character at the current input position and moves input on to
// the next one (via getAndAdvance()).
172 water_uchar
XmlDocument::readNextChar() noexcept
174 const water_uchar c
= input
.getAndAdvance();
// Runs a complete parse over textToParse.
//
// errorOccurred is cleared and needToLoadDTD is set before parsing starts.
// The checks are chained: empty input records "not enough input", a failing
// parseHeader() records "malformed header", a failing parseDTD() records
// "malformed DTD"; otherwise readNextElement() is called and its result is
// released to the caller.
//
// onlyReadOuterDocumentElement: when true, readNextElement() is told not to
// parse sub-elements.
185 XmlElement
* XmlDocument::parseDocumentElement (CharPointer_UTF8 textToParse
,
186 const bool onlyReadOuterDocumentElement
)
189 errorOccurred
= false;
191 needToLoadDTD
= true;
193 if (textToParse
.isEmpty())
195 lastError
= "not enough input";
197 else if (! parseHeader())
199 lastError
= "malformed header";
201 else if (! parseDTD())
203 lastError
= "malformed DTD";
// held in a scoped pointer while parsing finishes, then handed to the caller
209 CarlaScopedPointer
<XmlElement
> result (readNextElement (! onlyReadOuterDocumentElement
));
212 return result
.release();
// Consumes an optional "<?xml ... ?>" declaration at the current position.
//
// After skipping whitespace, input is compared with "<?xml". When it matches,
// the terminating "?>" is located, the value of the encoding attribute is
// extracted from the header text, and the method returns false (through
// CARLA_SAFE_ASSERT_RETURN) unless that value is empty or starts with "utf-"
// (case-insensitive). input is then moved just past "?>" and any following
// whitespace is skipped.
218 bool XmlDocument::parseHeader()
220 skipNextWhiteSpace();
222 if (CharacterFunctions::compareUpTo (input
, CharPointer_UTF8 ("<?xml"), 5) == 0)
224 const CharPointer_UTF8
headerEnd (CharacterFunctions::find (input
, CharPointer_UTF8 ("?>")));
226 if (headerEnd
.isEmpty())
229 const String
encoding (String (input
, headerEnd
)
230 .fromFirstOccurrenceOf ("encoding", false, true)
231 .fromFirstOccurrenceOf ("=", false, false)
232 .fromFirstOccurrenceOf ("\"", false, false)
233 .upToFirstOccurrenceOf ("\"", false, false).trim());
235 /* If you load an XML document with a non-UTF encoding type, it may have been
236 loaded wrongly.. Since all the files are read via the normal water file streams,
237 they're treated as UTF-8, so by the time it gets to the parser, the encoding will
238 have been lost. Best plan is to stick to utf-8 or if you have specific files to
239 read, use your own code to convert them to a unicode String, and pass that to the
242 CARLA_SAFE_ASSERT_RETURN (encoding
.isEmpty() || encoding
.startsWithIgnoreCase ("utf-"), false);
244 input
= headerEnd
+ 2;
245 skipNextWhiteSpace();
// Consumes an optional "<!DOCTYPE" declaration at the current position.
//
// When input starts with "<!DOCTYPE", the start position is remembered and
// characters are read one at a time while the counter n (initially 1) stays
// above zero - presumably n tracks '<' / '>' nesting; confirm against the
// loop body. The text from the remembered start up to one character before
// the final input position is trimmed and stored in dtdText.
251 bool XmlDocument::parseDTD()
253 if (CharacterFunctions::compareUpTo (input
, CharPointer_UTF8 ("<!DOCTYPE"), 9) == 0)
256 const CharPointer_UTF8
dtdStart (input
);
258 for (int n
= 1; n
> 0;)
260 const water_uchar c
= readNextChar();
271 dtdText
= String (dtdStart
, input
- 1).trim();
// Moves input forward past whitespace.
//
// Two kinds of skippable section are also handled: one that ends with "-->"
// (input is advanced past the terminator, i.e. by its offset + 3) and one
// that ends with "?>" (offset + 2). In both cases a negative offset from
// indexOf() - terminator not found - is tested for separately.
// NOTE(review): presumably these are comments and processing instructions;
// confirm against the opening checks.
277 void XmlDocument::skipNextWhiteSpace()
281 input
= input
.findEndOfWhitespace();
296 const int closeComment
= input
.indexOf (CharPointer_UTF8 ("-->"));
298 if (closeComment
< 0)
304 input
+= closeComment
+ 3;
311 const int closeBracket
= input
.indexOf (CharPointer_UTF8 ("?>"));
313 if (closeBracket
< 0)
319 input
+= closeBracket
+ 2;
// Reads a quoted value starting at input and appends its content to result.
//
// The first character read is taken as the quote delimiter. While scanning,
// the run of literal text between `start` and the current position is
// appended to result whenever the matching quote or an '&' is reached.
// Reaching a 0 character instead reports "unmatched quotes".
328 void XmlDocument::readQuotedString (String
& result
)
330 const water_uchar quote
= readNextChar();
334 const water_uchar c
= readNextChar();
347 const CharPointer_UTF8
start (input
);
351 const water_uchar character
= *input
;
353 if (character
== quote
)
355 result
.appendCharPointer (start
, input
);
359 else if (character
== '&')
361 result
.appendCharPointer (start
, input
);
364 else if (character
== 0)
366 setLastError ("unmatched quotes", false);
// Parses a single element at the current input position.
//
// Steps visible below:
//  - the tag name is delimited with XmlIdentifierChars::findEndOfToken(); if
//    no name is found, whitespace is skipped and the search is retried once
//    before "tag name missing" is reported;
//  - a new XmlElement is created from the name, and attributes are appended
//    to node->attributes through a LinkedListPointer appender;
//  - each attribute name must be followed by '=' and a value quoted with
//    either '"' or '\''; the value is read by readQuotedString(). A missing
//    '=' reports "expected '=' after attribute '<name>'";
//  - a "/>" sequence is tested for;
//  - child content is parsed with readChildElements() only when
//    alsoParseSubElements is true;
//  - any other character reports "illegal character found in <tag>: '<c>'".
//
// `node` starts as nullptr and is only assigned once a tag name was found.
377 XmlElement
* XmlDocument::readNextElement (const bool alsoParseSubElements
)
379 XmlElement
* node
= nullptr;
381 skipNextWhiteSpace();
388 CharPointer_UTF8
endOfToken (XmlIdentifierChars::findEndOfToken (input
));
390 if (endOfToken
== input
)
392 // no tag name - but allow for a gap after the '<' before giving an error
393 skipNextWhiteSpace();
394 endOfToken
= XmlIdentifierChars::findEndOfToken (input
);
396 if (endOfToken
== input
)
398 setLastError ("tag name missing", false);
403 node
= new XmlElement (input
, endOfToken
);
405 LinkedListPointer
<XmlElement::XmlAttributeNode
>::Appender
attributeAppender (node
->attributes
);
407 // look for attributes
410 skipNextWhiteSpace();
412 const water_uchar c
= *input
;
415 if (c
== '/' && input
[1] == '>')
421 // parse the guts of the element..
426 if (alsoParseSubElements
)
427 readChildElements (*node
);
432 // get an attribute..
433 if (XmlIdentifierChars::isIdentifierChar (c
))
435 CharPointer_UTF8
attNameEnd (XmlIdentifierChars::findEndOfToken (input
));
437 if (attNameEnd
!= input
)
439 const CharPointer_UTF8
attNameStart (input
);
442 skipNextWhiteSpace();
444 if (readNextChar() == '=')
446 skipNextWhiteSpace();
448 const water_uchar nextChar
= *input
;
450 if (nextChar
== '"' || nextChar
== '\'')
452 XmlElement::XmlAttributeNode
* const newAtt
453 = new XmlElement::XmlAttributeNode (attNameStart
, attNameEnd
);
455 readQuotedString (newAtt
->value
);
456 attributeAppender
.append (newAtt
);
462 setLastError ("expected '=' after attribute '"
463 + String (attNameStart
, attNameEnd
) + "'", false);
471 setLastError ("illegal character found in " + node
->getTagName() + ": '" + c
+ "'", false);
// Parses the content of `parent` and appends each child to
// parent.firstChildElement through a LinkedListPointer appender.
//
// Kinds of content handled in the visible code:
//  - a closing tag: input is advanced past the next '>';
//  - "<![CDATA[" sections: the enclosed text becomes a text element; running
//    out of input reports "unterminated CDATA section";
//  - any other element: parsed recursively with readNextElement (true);
//  - character data: input is rolled back to preWhitespaceInput so leading
//    whitespace is kept, then text is accumulated in a MemoryOutputStream.
//    Inside character data, "<!--" comments are skipped ("unterminated
//    comment" if "-->" is absent), entities are expanded (an entity whose
//    expansion starts with '<' and has a second character is itself parsed
//    as elements, with input and outOfData saved and restored around it),
//    and '\r' is given special treatment together with a following '\n'.
//
// A text element is only added when contentShouldBeUsed is true, i.e. when
// ignoreEmptyTextElements is off or some non-whitespace content was seen.
// "unmatched tags" is reported from three places in this routine.
481 void XmlDocument::readChildElements (XmlElement
& parent
)
483 LinkedListPointer
<XmlElement
>::Appender
childAppender (parent
.firstChildElement
);
487 const CharPointer_UTF8
preWhitespaceInput (input
);
488 skipNextWhiteSpace();
492 setLastError ("unmatched tags", false);
498 const water_uchar c1
= input
[1];
503 const int closeTag
= input
.indexOf ((water_uchar
) '>');
506 input
+= closeTag
+ 1;
511 if (c1
== '!' && CharacterFunctions::compareUpTo (input
+ 2, CharPointer_UTF8 ("[CDATA["), 7) == 0)
514 const CharPointer_UTF8
inputStart (input
);
518 const water_uchar c0
= *input
;
522 setLastError ("unterminated CDATA section", false);
530 childAppender
.append (XmlElement::createTextElement (String (inputStart
, input
)));
540 // this is some other element, so parse and add it..
541 if (XmlElement
* const n
= readNextElement (true))
542 childAppender
.append (n
);
547 else // must be a character block
549 input
= preWhitespaceInput
; // roll back to include the leading whitespace
550 MemoryOutputStream textElementContent
;
551 bool contentShouldBeUsed
= ! ignoreEmptyTextElements
;
555 const water_uchar c
= *input
;
559 if (input
[1] == '!' && input
[2] == '-' && input
[3] == '-')
562 const int closeComment
= input
.indexOf (CharPointer_UTF8 ("-->"));
564 if (closeComment
< 0)
566 setLastError ("unterminated comment", false);
571 input
+= closeComment
+ 3;
580 setLastError ("unmatched tags", false);
590 if (entity
.startsWithChar ('<') && entity
[1] != 0)
592 const CharPointer_UTF8
oldInput (input
);
593 const bool oldOutOfData
= outOfData
;
595 input
= entity
.getCharPointer();
598 while (XmlElement
* n
= readNextElement (true))
599 childAppender
.append (n
);
602 outOfData
= oldOutOfData
;
606 textElementContent
<< entity
;
607 contentShouldBeUsed
= contentShouldBeUsed
|| entity
.containsNonWhitespaceChars();
614 water_uchar nextChar
= *input
;
616 if (nextChar
== '\r')
620 if (input
[1] == '\n')
624 if (nextChar
== '<' || nextChar
== '&')
629 setLastError ("unmatched tags", false);
634 textElementContent
.appendUTF8Char (nextChar
);
635 contentShouldBeUsed
= contentShouldBeUsed
|| ! CharacterFunctions::isWhitespace (nextChar
);
640 if (contentShouldBeUsed
)
641 childAppender
.append (XmlElement::createTextElement (textElementContent
.toUTF8()));
// Decodes the entity reference at input and appends its expansion to result.
//
// Branches visible below, tried in order:
//  - the five predefined names "amp;", "quot;", "apos;", "lt;" and "gt;",
//    matched case-insensitively;
//  - a numeric reference introduced by '#': hexadecimal after 'x'/'X'
//    (digits are folded in four bits at a time; a non-hex digit or more than
//    eight digits reports "illegal escape sequence"), or decimal when the
//    next character is a digit; anything else after '#' also reports
//    "illegal escape sequence". The resulting code is appended as a single
//    character;
//  - any other name: the text up to the next ';' is passed to
//    expandExternalEntity() and input is advanced past the ';'.
//
// The "illegal escape sequence" errors are raised with carryOn = true.
646 void XmlDocument::readEntity (String
& result
)
648 // skip over the ampersand
651 if (input
.compareIgnoreCaseUpTo (CharPointer_UTF8 ("amp;"), 4) == 0)
656 else if (input
.compareIgnoreCaseUpTo (CharPointer_UTF8 ("quot;"), 5) == 0)
661 else if (input
.compareIgnoreCaseUpTo (CharPointer_UTF8 ("apos;"), 5) == 0)
666 else if (input
.compareIgnoreCaseUpTo (CharPointer_UTF8 ("lt;"), 3) == 0)
671 else if (input
.compareIgnoreCaseUpTo (CharPointer_UTF8 ("gt;"), 3) == 0)
676 else if (*input
== '#')
681 if (*input
== 'x' || *input
== 'X')
686 while (input
[0] != ';')
688 const int hexValue
= CharacterFunctions::getHexDigitValue (input
[0]);
690 if (hexValue
< 0 || ++numChars
> 8)
692 setLastError ("illegal escape sequence", true);
696 charCode
= (charCode
<< 4) | hexValue
;
702 else if (input
[0] >= '0' && input
[0] <= '9')
706 while (input
[0] != ';')
710 setLastError ("illegal escape sequence", true);
714 charCode
= charCode
* 10 + ((int) input
[0] - '0');
722 setLastError ("illegal escape sequence", true);
727 result
<< (water_uchar
) charCode
;
731 const CharPointer_UTF8
entityNameStart (input
);
732 const int closingSemiColon
= input
.indexOf ((water_uchar
) ';');
734 if (closingSemiColon
< 0)
741 input
+= closingSemiColon
+ 1;
743 result
+= expandExternalEntity (String (entityNameStart
, (size_t) closingSemiColon
));
// Expands an entity name (given without the surrounding '&' and ';').
//
// The predefined names amp, quot, apos, lt and gt (matched without regard to
// case) map to their single characters. For numeric forms the second
// character decides the base: 'x'/'X' means the text from index 2 is read as
// hexadecimal, a digit means the text from index 1 is read as decimal; any
// other second character reports "illegal escape sequence" and yields "&".
// Remaining names are passed to expandExternalEntity().
748 String
XmlDocument::expandEntity (const String
& ent
)
750 if (ent
.equalsIgnoreCase ("amp")) return String::charToString ('&');
751 if (ent
.equalsIgnoreCase ("quot")) return String::charToString ('"');
752 if (ent
.equalsIgnoreCase ("apos")) return String::charToString ('\'');
753 if (ent
.equalsIgnoreCase ("lt")) return String::charToString ('<');
754 if (ent
.equalsIgnoreCase ("gt")) return String::charToString ('>');
758 const water_uchar char1
= ent
[1];
760 if (char1
== 'x' || char1
== 'X')
761 return String::charToString (static_cast<water_uchar
> (ent
.substring (2).getHexValue32()));
763 if (char1
>= '0' && char1
<= '9')
764 return String::charToString (static_cast<water_uchar
> (ent
.substring (1).getIntValue()));
766 setLastError ("illegal escape sequence", false);
767 return String::charToString ('&');
770 return expandExternalEntity (ent
);
773 String
XmlDocument::expandExternalEntity (const String
& entity
)
777 if (dtdText
.isNotEmpty())
779 dtdText
= dtdText
.trimCharactersAtEnd (">");
780 tokenisedDTD
.addTokens (dtdText
, true);
782 if (tokenisedDTD
[tokenisedDTD
.size() - 2].equalsIgnoreCase ("system")
783 && tokenisedDTD
[tokenisedDTD
.size() - 1].isQuotedString())
785 const String
fn (tokenisedDTD
[tokenisedDTD
.size() - 1]);
787 tokenisedDTD
.clear();
788 tokenisedDTD
.addTokens (getFileContents (fn
), true);
792 tokenisedDTD
.clear();
793 const int openBracket
= dtdText
.indexOfChar ('[');
797 const int closeBracket
= dtdText
.lastIndexOfChar (']');
799 if (closeBracket
> openBracket
)
800 tokenisedDTD
.addTokens (dtdText
.substring (openBracket
+ 1,
801 closeBracket
), true);
805 for (int i
= tokenisedDTD
.size(); --i
>= 0;)
807 if (tokenisedDTD
[i
].startsWithChar ('%')
808 && tokenisedDTD
[i
].endsWithChar (';'))
810 const String
parsed (getParameterEntity (tokenisedDTD
[i
].substring (1, tokenisedDTD
[i
].length() - 1)));
812 newToks
.addTokens (parsed
, true);
814 tokenisedDTD
.remove (i
);
816 for (int j
= newToks
.size(); --j
>= 0;)
817 tokenisedDTD
.insert (i
, newToks
[j
]);
822 needToLoadDTD
= false;
825 for (int i
= 0; i
< tokenisedDTD
.size(); ++i
)
827 if (tokenisedDTD
[i
] == entity
)
829 if (tokenisedDTD
[i
- 1].equalsIgnoreCase ("<!entity"))
831 String
ent (tokenisedDTD
[i
+ 1].trimCharactersAtEnd (">").trim().unquoted());
833 // check for sub-entities..
834 int ampersand
= ent
.indexOfChar ('&');
836 while (ampersand
>= 0)
838 const int semiColon
= ent
.indexOf (i
+ 1, ";");
842 setLastError ("entity without terminating semi-colon", false);
846 const String
resolved (expandEntity (ent
.substring (i
+ 1, semiColon
)));
848 ent
= ent
.substring (0, ampersand
)
850 + ent
.substring (semiColon
+ 1);
852 ampersand
= ent
.indexOfChar (semiColon
+ 1, '&');
860 setLastError ("unknown entity", true);
// Looks up a parameter entity (one declared as "<!ENTITY % name ...") in the
// tokenised DTD.
//
// A match is a token equal to `entity` whose two predecessors are "%" and
// "<!entity" (the latter compared case-insensitively). The token after the
// name, with trailing '>' characters removed, is the value: when it is the
// keyword "system", the next token names a file whose content is returned
// via getFileContents(); otherwise the value is returned trimmed and
// unquoted.
865 String
XmlDocument::getParameterEntity (const String
& entity
)
867 for (int i
= 0; i
< tokenisedDTD
.size(); ++i
)
869 if (tokenisedDTD
[i
] == entity
870 && tokenisedDTD
[i
- 1] == "%"
871 && tokenisedDTD
[i
- 2].equalsIgnoreCase ("<!entity"))
873 const String
ent (tokenisedDTD
[i
+ 1].trimCharactersAtEnd (">"));
875 if (ent
.equalsIgnoreCase ("system"))
876 return getFileContents (tokenisedDTD
[i
+ 2].trimCharactersAtEnd (">"));
878 return ent
.trim().unquoted();