1 /* valamarkupreader.vala
3 * Copyright (C) 2008-2009 Jürg Billeter
5 * This library is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU Lesser General Public
7 * License as published by the Free Software Foundation; either
8 * version 2.1 of the License, or (at your option) any later version.
10 * This library is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * Lesser General Public License for more details.
15 * You should have received a copy of the GNU Lesser General Public
16 * License along with this library; if not, write to the Free Software
17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20 * Jürg Billeter <j@bitron.ch>
26 * Simple reader for a subset of XML.
// NOTE(review): this listing skips source lines (the numeric prefixes jump,
// e.g. 35 -> 44). The methods below use begin, current, end, line, column and
// empty_element, whose declarations are not shown here.
28 public class Vala
.MarkupReader
: Object
{
// Name of the file being read; assigned from the constructor argument.
29 public string filename
{ get; private set; }
// NOTE(review): no assignment to this property is visible in the listing;
// presumably it holds the name of the element read last - confirm.
31 public string name
{ get; private set; }
// Text of the most recent text run; read_token assigns it from
// text ('<', true).
33 public string content
{ get; private set; }
// Mapping of the input file, created by the constructor.
35 MappedFile mapped_file
;
// Attribute name -> attribute value. Filled by read_token through
// attributes.set and read back by get_attribute.
44 Map
<string,string> attributes
= new HashMap
<string,string> (str_hash
, str_equal
);
/**
 * Creates a reader for the given file.
 *
 * Stores the file name, maps the file and records where the mapped data
 * starts and ends. A FileError is not propagated; it is reported through
 * Report.error.
 *
 * NOTE(review): the line opening the try block that belongs to the catch
 * below is not part of this listing (the numeric prefixes jump 48 -> 51 and
 * 53 -> 59), so other initialisation may exist on the omitted lines.
 *
 * @param filename path of the file to read
 */
47 public MarkupReader (string filename
) {
48 this
.filename
= filename
;
// Second argument false: presumably GLib's "writable" flag, i.e. a
// read-only mapping - confirm against the MappedFile API.
51 mapped_file
= new
MappedFile (filename
, false);
// begin is the first byte of the mapped data.
52 begin
= mapped_file
.get_contents ();
// end is begin plus the mapped length, i.e. one past the last byte.
53 end
= begin
+ mapped_file
.get_length ();
59 } catch (FileError e
) {
// Report with a null first argument; the message carries the file name and
// the error text.
60 Report
.error (null, "Unable to map file `%s': %s".printf (filename
, e
.message
));
/**
 * Looks up an attribute in the attribute map.
 *
 * @param attr attribute name used as the map key
 * @return the value stored under attr; the return type is nullable,
 *         presumably null when no such key is stored (confirm against the
 *         Map implementation)
 */
64 public string?
get_attribute (string attr
) {
65 return attributes
[attr
];
// NOTE(review): the declaration line of this method is not in the listing
// (the numeric prefixes jump 65 -> 69). read_token calls read_name () and
// assigns the result to a string, which matches what this body returns, so
// this is presumably the body of read_name - confirm.
//
// Start of the name. This local has the same name as the begin that the
// constructor assigns.
69 char* begin
= current
;
70 while (current
< end
) {
// Characters that end a name: space, tab, '>', '/', '=' and newline. The
// statement executed on a match is on an omitted line.
71 if (current
[0] == ' ' || current
[0] == '\t' || current
[0] == '>'
72 || current
[0] == '/' || current
[0] == '=' || current
[0] == '\n') {
// Decode one UTF-8 character, looking at no more than the bytes left before
// end.
75 unichar u
= ((string) current
).get_char_validated ((long) (end
- current
));
76 if (u
!= (unichar
) (-1)) {
// Valid character: move the cursor by its encoded length.
77 current
+= u
.to_utf8 (null);
// NOTE(review): no cursor advance is visible next to this error report;
// confirm that invalid input cannot make the loop repeat on the same byte.
79 Report
.error (null, "invalid UTF-8 character");
// Nothing was consumed. The original comment below is all that is visible
// for this case.
82 if (current
== begin
) {
83 // syntax error: invalid name
// Copy the bytes between the start position and the cursor.
85 return ((string) begin
).ndup (current
- begin
);
/**
 * Reads the next token from the input and reports its type.
 *
 * From the visible code: EOF is chosen when the cursor has reached end;
 * inside a '<' token a '/' selects END_ELEMENT and anything other than '?',
 * '!' or '/' selects START_ELEMENT, with attributes stored in the attribute
 * map; a "--" after '!' is treated as a comment that is skipped by calling
 * read_token again; other input is read as TEXT into the content property.
 *
 * NOTE(review): this listing omits source lines (the numeric prefixes jump),
 * among them the final return statement and whatever moves the cursor
 * between the visible checks, so those parts cannot be confirmed here.
 *
 * @param token_begin receives pos, line and column at the start of the token
 * @param token_end receives pos and line at the end of the token and
 *                  column - 1 as its column
 * @return the type of the token that was read
 */
88 public MarkupTokenType
read_token (out SourceLocation token_begin
, out SourceLocation token_end
) {
// Clear the flag and report END_ELEMENT. The condition guarding these two
// statements is on an omitted line - presumably a test of empty_element,
// which is set further down when "/" precedes the closing '>'; confirm.
92 empty_element
= false;
93 return MarkupTokenType
.END_ELEMENT
;
// Default type. The '?' branch and the non-comment '!' case below show no
// assignment, so NONE is kept there.
98 MarkupTokenType type
= MarkupTokenType
.NONE
;
99 char* begin
= current
;
// Record where the token starts.
100 token_begin
.pos
= begin
;
101 token_begin
.line
= line
;
102 token_begin
.column
= column
;
104 if (current
>= end
) {
105 type
= MarkupTokenType
.EOF
;
// Markup. The checks below test the character after '<', so the cursor is
// presumably advanced on the omitted line in between.
106 } else if (current
[0] == '<') {
108 if (current
>= end
) {
110 } else if (current
[0] == '?') {
111 // processing instruction
112 } else if (current
[0] == '!') {
113 // comment or doctype
// "--" marks a comment; the bound end - 1 keeps current[1] inside the input.
115 if (current
< end
- 1 && current
[0] == '-' && current
[1] == '-') {
// Look for the closing "-->"; the bound end - 2 keeps current[2] inside the
// input.
118 while (current
< end
- 2) {
119 if (current
[0] == '-' && current
[1] == '-' && current
[2] == '>') {
// Newline inside the comment; the statements for this case are omitted.
123 } else if (current
[0] == '\n') {
130 // ignore comment, read next token
131 return read_token (out token_begin
, out token_end
);
// Closing tag.
133 } else if (current
[0] == '/') {
134 type
= MarkupTokenType
.END_ELEMENT
;
// A '>' is expected here; what happens when it is missing is not visible.
137 if (current
>= end
|| current
[0] != '>') {
// Opening tag - presumably the final else of the chain above; the else line
// itself is omitted.
142 type
= MarkupTokenType
.START_ELEMENT
;
// Attribute loop: runs until '>' or '/' is found or the input ends.
145 while (current
< end
&& current
[0] != '>' && current
[0] != '/') {
146 string attr_name
= read_name ();
// '=' is expected after the attribute name.
147 if (current
>= end
|| current
[0] != '=') {
151 // FIXME allow single quotes
// An opening double quote is expected.
152 if (current
>= end
|| current
[0] != '"') {
// Value: text up to the next double quote; false keeps trailing whitespace.
157 string attr_value
= text ('"', false);
// A closing double quote is expected.
159 if (current
>= end
|| current
[0] != '"') {
163 attributes
.set (attr_name
, attr_value
);
// NOTE(review): no current < end check is visible before this read, although
// the loop above can stop because the cursor reached end - confirm that this
// cannot read past the mapped data.
166 if (current
[0] == '/') {
// '/' before the closing '>': set the flag that the statements at the top of
// this method clear before returning END_ELEMENT.
167 empty_element
= true;
171 empty_element
= false;
// The tag has to be closed by '>'.
173 if (current
>= end
|| current
[0] != '>') {
// Character data: anything other than '<' is read as text.
181 if (current
[0] != '<') {
// Text up to the next '<'; true asks text () to strip trailing whitespace.
182 content
= text ('<', true);
// Otherwise start over with the next token.
186 return read_token (out token_begin
, out token_end
);
189 type
= MarkupTokenType
.TEXT
;
// Record where the token ends; the column reported is column - 1.
192 token_end
.pos
= current
;
193 token_end
.line
= line
;
194 token_end
.column
= column
- 1;
// Collects character data from the cursor up to end_char or the end of the
// input. The entities &amp; &quot; &apos; &lt; and &gt; are replaced by the
// characters & " ' < and >.
//
// end_char: byte that stops the scan; the loop condition shows it is not
// consumed by the loop.
// rm_trailing_whitespace: when true, whitespace at the end of the collected
// text is erased.
//
// NOTE(review): the listing omits lines of this method (the numeric prefixes
// jump), among them the cursor adjustment after each entity and the return
// statement.
199 string text (char end_char
, bool rm_trailing_whitespace
) {
200 StringBuilder content
= new
StringBuilder ();
// Start of the literal run that has not been copied into content yet.
201 char* text_begin
= current
;
// Reference position for the column update after the loop.
202 char* last_linebreak
= current
;
204 while (current
< end
&& current
[0] != end_char
) {
// Decode one UTF-8 character, looking at no more than the bytes left before
// end.
205 unichar u
= ((string) current
).get_char_validated ((long) (end
- current
));
// NOTE(review): only the error report is visible on the invalid branch
// (prefixes 207 and 208 are consecutive), so the cursor does not move there;
// confirm that the loop cannot repeat forever on the same byte.
206 if (u
== (unichar
) (-1)) {
207 Report
.error (null, "invalid UTF-8 character");
208 } else if (u
== '&') {
// Position right after the '&'.
// NOTE(review): has_prefix treats next_pos as a NUL-terminated string while
// the data is bounded by end - confirm this cannot read past end.
209 char* next_pos
= current
+ u
.to_utf8 (null);
210 if (((string) next_pos
).has_prefix ("amp;")) {
// Same pattern for every entity: copy the pending literal run, append the
// decoded character, then restart the run at the cursor. The cursor
// adjustment in between is on an omitted line.
211 content
.append (((string) text_begin
).ndup (current
- text_begin
));
212 content
.append_c ('&');
214 text_begin
= current
;
215 } else if (((string) next_pos
).has_prefix ("quot;")) {
216 content
.append (((string) text_begin
).ndup (current
- text_begin
));
217 content
.append_c ('"');
219 text_begin
= current
;
220 } else if (((string) next_pos
).has_prefix ("apos;")) {
221 content
.append (((string) text_begin
).ndup (current
- text_begin
));
222 content
.append_c ('\'');
224 text_begin
= current
;
225 } else if (((string) next_pos
).has_prefix ("lt;")) {
226 content
.append (((string) text_begin
).ndup (current
- text_begin
));
227 content
.append_c ('<');
229 text_begin
= current
;
230 } else if (((string) next_pos
).has_prefix ("gt;")) {
231 content
.append (((string) text_begin
).ndup (current
- text_begin
));
232 content
.append_c ('>');
234 text_begin
= current
;
// Step over the character by its encoded length - presumably the case of an
// '&' that starts none of the entities above; the enclosing lines are
// omitted.
236 current
+= u
.to_utf8 (null);
// Presumably executed when a line break is seen; the condition is omitted.
242 last_linebreak
= current
;
// Step over an ordinary character by its encoded length.
245 current
+= u
.to_utf8 (null);
// Copy whatever literal text is still pending.
250 if (text_begin
!= current
) {
251 content
.append (((string) text_begin
).ndup (current
- text_begin
));
// Add the number of bytes between last_linebreak and the cursor to column.
254 column
+= (int) (current
- last_linebreak
);
256 // Removes trailing whitespace
257 if (rm_trailing_whitespace
) {
// Start one past the last byte of the builder, then walk backwards over
// whitespace bytes. The '>' comparison stops the walk at the first byte
// without testing it, so a whitespace byte at index 0 is kept.
258 char* str_pos
= ((char*)content
.str
) + content
.len
;
259 for (str_pos
--; str_pos
> ((char*)content
.str
) && str_pos
[0].isspace(); str_pos
--);
// Erase from the byte after str_pos; -1 presumably means "to the end"
// (confirm against the StringBuilder.erase documentation).
260 content
.erase ((ssize_t
) (str_pos
-((char*) content
.str
) + 1), -1);
// NOTE(review): the declaration of the method containing this loop is not in
// the listing (the numeric prefixes jump 260 -> 267), and neither is the loop
// body apart from the newline test.
//
// Runs while the cursor is inside the input and points at a whitespace byte.
267 while (current
< end
&& current
[0].isspace ()) {
// Newline test; the statements it guards and the cursor advance are on
// omitted lines.
268 if (current
[0] == '\n') {
/**
 * Token types used as the return type of MarkupReader.read_token.
 *
 * NOTE(review): the member list is on lines this listing omits (the numeric
 * prefixes jump 278 -> 285). The names used elsewhere in the file are NONE,
 * START_ELEMENT, END_ELEMENT, TEXT and EOF.
 */
278 public enum Vala
.MarkupTokenType
{
/**
 * Returns a short description of the token type.
 *
 * The result is unowned; every visible branch returns a string literal.
 * The line opening the switch is omitted from the listing.
 *
 * @return "start element", "end element", "text" or "end of file" for the
 *         four cases listed, otherwise "unknown token type"
 */
285 public unowned
string to_string () {
287 case START_ELEMENT
: return "start element";
288 case END_ELEMENT
: return "end element";
289 case TEXT
: return "text";
290 case EOF
: return "end of file";
// Any value without its own case ends up here.
291 default: return "unknown token type";