2 ==============================================================================
4 This file is part of the JUCE library.
5 Copyright (c) 2022 - Raw Material Software Limited
7 JUCE is an open source library subject to commercial or open-source
10 The code included in this file is provided under the terms of the ISC license
11 http://www.isc.org/downloads/software-support-policy/isc-license. Permission
12 To use, copy, modify, and/or distribute this software for any purpose with or
13 without fee is hereby granted provided that the above copyright notice and
14 this permission notice appear in all copies.
16 JUCE IS PROVIDED "AS IS" WITHOUT ANY WARRANTY, AND ALL WARRANTIES, WHETHER
17 EXPRESSED OR IMPLIED, INCLUDING MERCHANTABILITY AND FITNESS FOR PURPOSE, ARE
20 ==============================================================================
26 XmlDocument::XmlDocument (const String
& text
) : originalText (text
) {}
27 XmlDocument::XmlDocument (const File
& file
) : inputSource (new FileInputSource (file
)) {}
29 XmlDocument::~XmlDocument() {}
31 std::unique_ptr
<XmlElement
> XmlDocument::parse (const File
& file
)
33 return XmlDocument (file
).getDocumentElement();
36 std::unique_ptr
<XmlElement
> XmlDocument::parse (const String
& textToParse
)
38 return XmlDocument (textToParse
).getDocumentElement();
41 std::unique_ptr
<XmlElement
> parseXML (const String
& textToParse
)
43 return XmlDocument (textToParse
).getDocumentElement();
46 std::unique_ptr
<XmlElement
> parseXML (const File
& file
)
48 return XmlDocument (file
).getDocumentElement();
51 std::unique_ptr
<XmlElement
> parseXMLIfTagMatches (const String
& textToParse
, StringRef requiredTag
)
53 return XmlDocument (textToParse
).getDocumentElementIfTagMatches (requiredTag
);
56 std::unique_ptr
<XmlElement
> parseXMLIfTagMatches (const File
& file
, StringRef requiredTag
)
58 return XmlDocument (file
).getDocumentElementIfTagMatches (requiredTag
);
61 void XmlDocument::setInputSource (InputSource
* newSource
) noexcept
63 inputSource
.reset (newSource
);
66 void XmlDocument::setEmptyTextElementsIgnored (bool shouldBeIgnored
) noexcept
68 ignoreEmptyTextElements
= shouldBeIgnored
;
71 namespace XmlIdentifierChars
73 static bool isIdentifierCharSlow (juce_wchar c
) noexcept
75 return CharacterFunctions::isLetterOrDigit (c
)
76 || c
== '_' || c
== '-' || c
== ':' || c
== '.';
79 static bool isIdentifierChar (juce_wchar c
) noexcept
81 static const uint32 legalChars
[] = { 0, 0x7ff6000, 0x87fffffe, 0x7fffffe, 0 };
83 return ((int) c
< (int) numElementsInArray (legalChars
) * 32) ? ((legalChars
[c
>> 5] & (uint32
) (1 << (c
& 31))) != 0)
84 : isIdentifierCharSlow (c
);
87 /*static void generateIdentifierCharConstants()
90 for (int i = 0; i < 256; ++i)
91 if (isIdentifierCharSlow (i))
92 n[i >> 5] |= (1 << (i & 31));
95 for (int i = 0; i < 8; ++i)
96 s << "0x" << String::toHexString ((int) n[i]) << ", ";
101 static String::CharPointerType
findEndOfToken (String::CharPointerType p
) noexcept
103 while (isIdentifierChar (*p
))
// Parses the document and returns the result of parseDocumentElement().
//
// When no text has been supplied (originalText is empty) but an input source
// is set, the source's stream is first read into a MemoryOutputStream. Only
// 8192 bytes are requested when onlyReadOuterDocumentElement is true;
// otherwise -1 is passed (presumably meaning "no limit" - confirm against
// MemoryOutputStream::writeFromInputStream).
110 std::unique_ptr
<XmlElement
> XmlDocument::getDocumentElement (const bool onlyReadOuterDocumentElement
)
112 if (originalText
.isEmpty() && inputSource
!= nullptr)
114 std::unique_ptr
<InputStream
> in (inputSource
->createInputStream());
118 MemoryOutputStream data
;
119 data
.writeFromInputStream (*in
, onlyReadOuterDocumentElement
? 8192 : -1);
// When strings are stored as UTF-8, the raw buffer is inspected for a
// byte-order mark before deciding how to parse it.
121 #if JUCE_STRING_UTF_TYPE == 8
122 if (data
.getDataSize() > 2)
125 auto* text
= static_cast<const char*> (data
.getData());
// A UTF-16 byte-order mark (either endianness) means the buffer is
// converted through data.toString() instead of being parsed in place.
127 if (CharPointer_UTF16::isByteOrderMarkBigEndian (text
)
128 || CharPointer_UTF16::isByteOrderMarkLittleEndian (text
))
130 originalText
= data
.toString();
134 if (CharPointer_UTF8::isByteOrderMark (text
))
137 // parse the input buffer directly to avoid copying it all to a string..
138 return parseDocumentElement (String::CharPointerType (text
), onlyReadOuterDocumentElement
);
142 originalText
= data
.toString();
// Default path: parse whatever text is held in originalText.
147 return parseDocumentElement (originalText
.getCharPointer(), onlyReadOuterDocumentElement
);
// Parses the whole document only if its outer element has the required tag.
//
// A first pass, getDocumentElement (true), reads just the outer element; if
// that succeeds and its tag name matches requiredTag, the document is parsed
// again in full with getDocumentElement (false) and that result is returned.
150 std::unique_ptr
<XmlElement
> XmlDocument::getDocumentElementIfTagMatches (StringRef requiredTag
)
152 if (auto xml
= getDocumentElement (true))
153 if (xml
->hasTagName (requiredTag
))
154 return getDocumentElement (false);
// Accessor for the most recent parse error message.
// NOTE(review): the body is not visible here; presumably it returns the
// lastError member that parseDocumentElement() assigns - confirm.
159 const String
& XmlDocument::getLastParseError() const noexcept
// Records a parse problem described by desc.
//
// errorOccurred is assigned the inverse of carryOn: passing carryOn = false
// flags a failure, while passing carryOn = true sets errorOccurred to false.
164 void XmlDocument::setLastError (const String
& desc
, const bool carryOn
)
167 errorOccurred
= ! carryOn
;
// Loads the contents of a file that the document refers to by name.
//
// Only attempted when an input source is set: the name is trimmed and
// unquoted, passed to inputSource->createInputStreamFor(), and the resulting
// stream is read completely into a String.
170 String
XmlDocument::getFileContents (const String
& filename
) const
172 if (inputSource
!= nullptr)
174 std::unique_ptr
<InputStream
> in (inputSource
->createInputStreamFor (filename
.trim().unquoted()));
177 return in
->readEntireStreamAsString();
// Reads the character at the current input position and moves input past it
// (input.getAndAdvance()).
183 juce_wchar
XmlDocument::readNextChar() noexcept
185 auto c
= input
.getAndAdvance();
// Core parse routine: resets the per-parse state, then handles the header,
// the DTD and finally the root element of textToParse.
//
// lastError receives a fixed message when the input is empty
// ("not enough input"), when parseHeader() fails ("malformed header") or when
// parseDTD() fails ("malformed DTD"). Otherwise the root element is read with
// readNextElement(), which is told to parse sub-elements only when
// onlyReadOuterDocumentElement is false.
196 std::unique_ptr
<XmlElement
> XmlDocument::parseDocumentElement (String::CharPointerType textToParse
,
197 bool onlyReadOuterDocumentElement
)
200 errorOccurred
= false;
// Marks the DTD as not yet tokenised; expandExternalEntity() clears this
// flag once it has processed dtdText.
202 needToLoadDTD
= true;
204 if (textToParse
.isEmpty())
206 lastError
= "not enough input";
208 else if (! parseHeader())
210 lastError
= "malformed header";
212 else if (! parseDTD())
214 lastError
= "malformed DTD";
// readNextElement() returns a raw pointer; it is wrapped in a unique_ptr here.
219 std::unique_ptr
<XmlElement
> result (readNextElement (! onlyReadOuterDocumentElement
));
// Consumes an optional "<?xml ... ?>" declaration at the current position.
//
// After skipping whitespace, if the input starts with "<?xml" the closing
// "?>" is located. The text of the encoding attribute (between the first pair
// of double quotes after "encoding" and "=") is extracted and asserted to be
// empty or to start with "utf-" (case-insensitive). Input is then moved past
// the "?>" and any following whitespace.
228 bool XmlDocument::parseHeader()
230 skipNextWhiteSpace();
232 if (CharacterFunctions::compareUpTo (input
, CharPointer_ASCII ("<?xml"), 5) == 0)
234 auto headerEnd
= CharacterFunctions::find (input
, CharPointer_ASCII ("?>"));
236 if (headerEnd
.isEmpty())
240 auto encoding
= String (input
, headerEnd
)
241 .fromFirstOccurrenceOf ("encoding", false, true)
242 .fromFirstOccurrenceOf ("=", false, false)
243 .fromFirstOccurrenceOf ("\"", false, false)
244 .upToFirstOccurrenceOf ("\"", false, false)
247 /* If you load an XML document with a non-UTF encoding type, it may have been
248 loaded wrongly.. Since all the files are read via the normal juce file streams,
249 they're treated as UTF-8, so by the time it gets to the parser, the encoding will
250 have been lost. Best plan is to stick to utf-8 or if you have specific files to
251 read, use your own code to convert them to a unicode String, and pass that to the
254 jassert (encoding
.isEmpty() || encoding
.startsWithIgnoreCase ("utf-"));
257 input
= headerEnd
+ 2;
258 skipNextWhiteSpace();
// Consumes an optional "<!DOCTYPE ...>" declaration at the current position.
//
// When the input starts with "<!DOCTYPE", characters are read one at a time
// in a loop controlled by the counter n (starting at 1, looping while n > 0;
// presumably tracking '<' / '>' nesting - confirm). The text from dtdStart up
// to one character before the final input position is then trimmed and kept
// in dtdText for later entity lookups.
264 bool XmlDocument::parseDTD()
266 if (CharacterFunctions::compareUpTo (input
, CharPointer_ASCII ("<!DOCTYPE"), 9) == 0)
269 auto dtdStart
= input
;
271 for (int n
= 1; n
> 0;)
273 auto c
= readNextChar();
284 dtdText
= String (dtdStart
, input
- 1).trim();
// Advances input past whitespace.
//
// The visible code also jumps input to just after the next "-->" and to just
// after the next "?>", so comments and processing instructions are
// presumably skipped here as well (the conditions selecting those branches
// are not visible in this view - confirm).
290 void XmlDocument::skipNextWhiteSpace()
294 input
.incrementToEndOfWhitespace();
// Offset of the comment terminator relative to input; negative if absent.
309 auto closeComment
= input
.indexOf (CharPointer_ASCII ("-->"));
311 if (closeComment
< 0)
// Step over the body and the 3-character "-->" itself.
317 input
+= closeComment
+ 3;
// Offset of the "?>" terminator relative to input; negative if absent.
324 auto closeBracket
= input
.indexOf (CharPointer_ASCII ("?>"));
326 if (closeBracket
< 0)
// Step over the body and the 2-character "?>" itself.
332 input
+= closeBracket
+ 2;
// Reads a quoted value starting at the current input position into result.
//
// The first character read is remembered as the quote character. Text is
// appended to result in runs (from start up to input) whenever the matching
// quote character or an '&' is reached. An "unmatched quotes" error is
// reported through setLastError() with carryOn = false.
341 void XmlDocument::readQuotedString (String
& result
)
343 auto quote
= readNextChar();
347 auto c
= readNextChar();
// Peek at the current character without consuming it.
364 auto character
= *input
;
// Closing quote: flush the pending run into result.
366 if (character
== quote
)
368 result
.appendCharPointer (start
, input
);
// Start of an entity reference: flush the run that precedes it.
373 if (character
== '&')
375 result
.appendCharPointer (start
, input
);
381 setLastError ("unmatched quotes", false);
// Parses one element at the current input position and returns it as a raw
// XmlElement pointer (node starts out as nullptr).
//
// The tag name is the identifier token found by
// XmlIdentifierChars::findEndOfToken(). Attributes written as name = "value"
// or name = 'value' are appended to node->attributes. The element's content
// is read with readChildElements() only when alsoParseSubElements is true.
// Errors are reported via setLastError() with carryOn = false.
//
// Callers visible in this file either wrap the result in a std::unique_ptr
// (parseDocumentElement) or append it to a parent's child list
// (readChildElements).
392 XmlElement
* XmlDocument::readNextElement (const bool alsoParseSubElements
)
394 XmlElement
* node
= nullptr;
395 skipNextWhiteSpace();
// The tag name is the run of identifier characters starting at input.
403 auto endOfToken
= XmlIdentifierChars::findEndOfToken (input
);
405 if (endOfToken
== input
)
407 // no tag name - but allow for a gap after the '<' before giving an error
408 skipNextWhiteSpace();
409 endOfToken
= XmlIdentifierChars::findEndOfToken (input
);
411 if (endOfToken
== input
)
413 setLastError ("tag name missing", false);
// Build the element from the characters between input and endOfToken.
418 node
= new XmlElement (input
, endOfToken
);
420 LinkedListPointer
<XmlElement::XmlAttributeNode
>::Appender
attributeAppender (node
->attributes
);
422 // look for attributes
425 skipNextWhiteSpace();
// A '/' immediately followed by '>' (presumably the self-closing form).
429 if (c
== '/' && input
[1] == '>')
435 // parse the guts of the element..
440 if (alsoParseSubElements
)
441 readChildElements (*node
);
446 // get an attribute..
447 if (XmlIdentifierChars::isIdentifierChar (c
))
449 auto attNameEnd
= XmlIdentifierChars::findEndOfToken (input
);
451 if (attNameEnd
!= input
)
453 auto attNameStart
= input
;
455 skipNextWhiteSpace();
457 if (readNextChar() == '=')
459 skipNextWhiteSpace();
460 auto nextChar
= *input
;
// Attribute values may be delimited by double or single quotes.
462 if (nextChar
== '"' || nextChar
== '\'')
// Create the attribute node from the name span, read its quoted value
// straight into the node, then append it to the element's attribute list.
464 auto* newAtt
= new XmlElement::XmlAttributeNode (attNameStart
, attNameEnd
);
465 readQuotedString (newAtt
->value
);
466 attributeAppender
.append (newAtt
);
472 setLastError ("expected '=' after attribute '"
473 + String (attNameStart
, attNameEnd
) + "'", false);
481 setLastError ("illegal character found in " + node
->getTagName() + ": '" + c
+ "'", false);
// Reads the content of an element, appending each child to
// parent.firstChildElement through childAppender.
//
// The visible code handles: skipping to the '>' of a tag, CDATA sections
// (added as text elements), nested elements (readNextElement (true)),
// comments inside character data, entity references, and plain characters.
// Character data is collected in a MemoryOutputStream and only added as a
// text element when contentShouldBeUsed is true; that flag starts as
// !ignoreEmptyTextElements and becomes true once non-whitespace content is
// seen. Errors are reported via setLastError() with carryOn = false.
491 void XmlDocument::readChildElements (XmlElement
& parent
)
493 LinkedListPointer
<XmlElement
>::Appender
childAppender (parent
.firstChildElement
);
// Remember the position before skipping whitespace so that character data
// can later be re-read with its leading whitespace (see the roll-back below).
497 auto preWhitespaceInput
= input
;
498 skipNextWhiteSpace();
502 setLastError ("unmatched tags", false);
// Offset of the next '>' relative to input; input is then moved just past it.
513 auto closeTag
= input
.indexOf ((juce_wchar
) '>');
516 input
+= closeTag
+ 1;
// "[CDATA[" found two characters after input, following a '!'.
521 if (c1
== '!' && CharacterFunctions::compareUpTo (input
+ 2, CharPointer_ASCII ("[CDATA["), 7) == 0)
524 auto inputStart
= input
;
532 setLastError ("unterminated CDATA section", false);
// "]]>" ends the section; the text from inputStart to input becomes a text element.
537 if (c0
== ']' && input
[1] == ']' && input
[2] == '>')
539 childAppender
.append (XmlElement::createTextElement (String (inputStart
, input
)));
549 // this is some other element, so parse and add it..
550 if (auto* n
= readNextElement (true))
551 childAppender
.append (n
);
556 else // must be a character block
558 input
= preWhitespaceInput
; // roll back to include the leading whitespace
559 MemoryOutputStream textElementContent
;
560 bool contentShouldBeUsed
= ! ignoreEmptyTextElements
;
// "!--" after the current character: a comment embedded in the character data.
568 if (input
[1] == '!' && input
[2] == '-' && input
[3] == '-')
571 auto closeComment
= input
.indexOf (CharPointer_ASCII ("-->"));
573 if (closeComment
< 0)
575 setLastError ("unterminated comment", false);
580 input
+= closeComment
+ 3;
589 setLastError ("unmatched tags", false);
// An expanded entity that starts with '<' and has more than one character is
// parsed as markup: input is pointed at the expanded text and elements are
// read from it. outOfData is saved beforehand and restored afterwards
// (oldInput is saved too, presumably for the same purpose).
599 if (entity
.startsWithChar ('<') && entity
[1] != 0)
601 auto oldInput
= input
;
602 auto oldOutOfData
= outOfData
;
604 input
= entity
.getCharPointer();
607 while (auto* n
= readNextElement (true))
608 childAppender
.append (n
);
611 outOfData
= oldOutOfData
;
// Otherwise the expanded entity is ordinary text content.
615 textElementContent
<< entity
;
616 contentShouldBeUsed
= contentShouldBeUsed
|| entity
.containsNonWhitespaceChars();
623 auto nextChar
= *input
;
625 if (nextChar
== '\r')
629 if (input
[1] == '\n')
633 if (nextChar
== '<' || nextChar
== '&')
638 setLastError ("unmatched tags", false);
// Plain character: add it to the pending text and note whether it is
// anything other than whitespace.
643 textElementContent
.appendUTF8Char (nextChar
);
644 contentShouldBeUsed
= contentShouldBeUsed
|| ! CharacterFunctions::isWhitespace (nextChar
);
649 if (contentShouldBeUsed
)
650 childAppender
.append (XmlElement::createTextElement (textElementContent
.toUTF8()));
// Expands the entity reference at the current input position and appends the
// expansion to result.
//
// The five predefined names (amp, quot, apos, lt, gt) are matched
// case-insensitively, including their ';'. A '#' introduces a numeric
// character reference: hexadecimal after 'x' or 'X' (at most 8 hex digits),
// otherwise decimal. Any other name, up to the next ';', is resolved through
// expandExternalEntity(). Malformed references are reported via
// setLastError() with carryOn = true.
655 void XmlDocument::readEntity (String
& result
)
657 // skip over the ampersand
660 if (input
.compareIgnoreCaseUpTo (CharPointer_ASCII ("amp;"), 4) == 0)
665 else if (input
.compareIgnoreCaseUpTo (CharPointer_ASCII ("quot;"), 5) == 0)
670 else if (input
.compareIgnoreCaseUpTo (CharPointer_ASCII ("apos;"), 5) == 0)
675 else if (input
.compareIgnoreCaseUpTo (CharPointer_ASCII ("lt;"), 3) == 0)
680 else if (input
.compareIgnoreCaseUpTo (CharPointer_ASCII ("gt;"), 3) == 0)
685 else if (*input
== '#')
// Numeric character reference: the code point is accumulated in charCode.
687 int64_t charCode
= 0;
690 if (*input
== 'x' || *input
== 'X')
695 while (input
[0] != ';')
697 auto hexValue
= CharacterFunctions::getHexDigitValue (input
[0]);
// Reject non-hex characters and references longer than 8 hex digits.
699 if (hexValue
< 0 || ++numChars
> 8)
701 setLastError ("illegal escape sequence", true);
705 charCode
= (charCode
<< 4) | hexValue
;
711 else if (input
[0] >= '0' && input
[0] <= '9')
717 const auto firstChar
= input
[0];
721 setLastError ("unexpected end of input", true);
725 if (firstChar
== ';')
730 setLastError ("illegal escape sequence", true);
734 charCode
= charCode
* 10 + ((int) firstChar
- '0');
742 setLastError ("illegal escape sequence", true);
747 result
<< (juce_wchar
) charCode
;
// Named entity: take the text up to the next ';' and resolve it via the DTD.
751 auto entityNameStart
= input
;
752 auto closingSemiColon
= input
.indexOf ((juce_wchar
) ';');
754 if (closingSemiColon
< 0)
761 input
+= closingSemiColon
+ 1;
762 result
+= expandExternalEntity (String (entityNameStart
, (size_t) closingSemiColon
));
// Returns the replacement text for an entity name given without its
// surrounding '&' and ';'.
//
// The predefined names amp, quot, apos, lt and gt (case-insensitive) map to
// their single characters. The visible numeric branch treats a second
// character of 'x' or 'X' as hexadecimal (digits from index 2) and a decimal
// digit as decimal (digits from index 1); otherwise it reports
// "illegal escape sequence" and returns "&". All other names are passed to
// expandExternalEntity().
767 String
XmlDocument::expandEntity (const String
& ent
)
769 if (ent
.equalsIgnoreCase ("amp")) return String::charToString ('&');
770 if (ent
.equalsIgnoreCase ("quot")) return String::charToString ('"');
771 if (ent
.equalsIgnoreCase ("apos")) return String::charToString ('\'');
772 if (ent
.equalsIgnoreCase ("lt")) return String::charToString ('<');
773 if (ent
.equalsIgnoreCase ("gt")) return String::charToString ('>');
779 if (char1
== 'x' || char1
== 'X')
780 return String::charToString (static_cast<juce_wchar
> (ent
.substring (2).getHexValue32()));
782 if (char1
>= '0' && char1
<= '9')
783 return String::charToString (static_cast<juce_wchar
> (ent
.substring (1).getIntValue()));
785 setLastError ("illegal escape sequence", false);
786 return String::charToString ('&');
789 return expandExternalEntity (ent
);
792 String
XmlDocument::expandExternalEntity (const String
& entity
)
796 if (dtdText
.isNotEmpty())
798 dtdText
= dtdText
.trimCharactersAtEnd (">");
799 tokenisedDTD
.addTokens (dtdText
, true);
801 if (tokenisedDTD
[tokenisedDTD
.size() - 2].equalsIgnoreCase ("system")
802 && tokenisedDTD
[tokenisedDTD
.size() - 1].isQuotedString())
804 auto fn
= tokenisedDTD
[tokenisedDTD
.size() - 1];
806 tokenisedDTD
.clear();
807 tokenisedDTD
.addTokens (getFileContents (fn
), true);
811 tokenisedDTD
.clear();
812 auto openBracket
= dtdText
.indexOfChar ('[');
816 auto closeBracket
= dtdText
.lastIndexOfChar (']');
818 if (closeBracket
> openBracket
)
819 tokenisedDTD
.addTokens (dtdText
.substring (openBracket
+ 1,
820 closeBracket
), true);
824 for (int i
= tokenisedDTD
.size(); --i
>= 0;)
826 if (tokenisedDTD
[i
].startsWithChar ('%')
827 && tokenisedDTD
[i
].endsWithChar (';'))
829 auto parsed
= getParameterEntity (tokenisedDTD
[i
].substring (1, tokenisedDTD
[i
].length() - 1));
831 newToks
.addTokens (parsed
, true);
833 tokenisedDTD
.remove (i
);
835 for (int j
= newToks
.size(); --j
>= 0;)
836 tokenisedDTD
.insert (i
, newToks
[j
]);
841 needToLoadDTD
= false;
844 for (int i
= 0; i
< tokenisedDTD
.size(); ++i
)
846 if (tokenisedDTD
[i
] == entity
)
848 if (tokenisedDTD
[i
- 1].equalsIgnoreCase ("<!entity"))
850 auto ent
= tokenisedDTD
[i
+ 1].trimCharactersAtEnd (">").trim().unquoted();
852 // check for sub-entities..
853 auto ampersand
= ent
.indexOfChar ('&');
855 while (ampersand
>= 0)
857 auto semiColon
= ent
.indexOf (i
+ 1, ";");
861 setLastError ("entity without terminating semi-colon", false);
865 auto resolved
= expandEntity (ent
.substring (i
+ 1, semiColon
));
867 ent
= ent
.substring (0, ampersand
)
869 + ent
.substring (semiColon
+ 1);
871 ampersand
= ent
.indexOfChar (semiColon
+ 1, '&');
879 setLastError ("unknown entity", true);
// Looks up a parameter entity among the DTD tokens.
//
// A match is a token equal to entity that is preceded by "%" and, before
// that, by "<!entity" (case-insensitive). The following token, with any
// trailing '>' removed, is the value: if it is "system" (case-insensitive),
// the contents of the file named by the next token are returned through
// getFileContents(); otherwise the value itself is returned, trimmed and
// unquoted.
// NOTE(review): the end of this function is not visible in this view, so the
// result for an entity that is not found is not documented here.
883 String
XmlDocument::getParameterEntity (const String
& entity
)
885 for (int i
= 0; i
< tokenisedDTD
.size(); ++i
)
887 if (tokenisedDTD
[i
] == entity
888 && tokenisedDTD
[i
- 1] == "%"
889 && tokenisedDTD
[i
- 2].equalsIgnoreCase ("<!entity"))
891 auto ent
= tokenisedDTD
[i
+ 1].trimCharactersAtEnd (">");
893 if (ent
.equalsIgnoreCase ("system"))
894 return getFileContents (tokenisedDTD
[i
+ 2].trimCharactersAtEnd (">"));
896 return ent
.trim().unquoted();