vala/valamarkupreader.vala

   1 /* valamarkupreader.vala
   2  *
   3  * Copyright (C) 2008-2009  Jürg Billeter
   4  *
   5  * This library is free software; you can redistribute it and/or
   6  * modify it under the terms of the GNU Lesser General Public
   7  * License as published by the Free Software Foundation; either
   8  * version 2.1 of the License, or (at your option) any later version.
   9
  10  * This library is distributed in the hope that it will be useful,
  11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13  * Lesser General Public License for more details.
  14
  15  * You should have received a copy of the GNU Lesser General Public
  16  * License along with this library; if not, write to the Free Software
  17  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301  USA
  18  *
  19  * Author:
  20  *      Jürg Billeter <j@bitron.ch>
  21  */
  22
  23 using GLib;
  24
  25 /**
  26  * Simple reader for a subset of XML.
  27  */
  28 public class Vala.MarkupReader {
  29         public string filename { get; private set; }
  30
  31         public string name { get; private set; }
  32
  33         public string content { get; private set; }
  34
  35         MappedFile mapped_file;
  36
  37         char* begin;
  38         char* current;
  39         char* end;
  40
  41         int line;
  42         int column;
  43
  44         Map<string,string> attributes = new HashMap<string,string> (str_hash, str_equal);
  45         bool empty_element;
  46
  47         public MarkupReader (string filename) {
  48                 this.filename = filename;
  49
  50                 try {
  51                         mapped_file = new MappedFile (filename, false);
  52                         begin = mapped_file.get_contents ();
  53                         end = begin + mapped_file.get_length ();
  54
  55                         current = begin;
  56
  57                         line = 1;
  58                         column = 1;
  59                 } catch (FileError e) {
  60                         Report.error (null, "Unable to map file `%s': %s".printf (filename, e.message));
  61                 }
  62         }
  63
  64         public MarkupReader.from_string (string filename, string content) {
  65                 this.filename = filename;
  66
  67                 begin = content;
  68                 end = begin + content.length;
  69
  70                 current = begin;
  71
  72                 line = 1;
  73                 column = 1;
  74         }
  75
  76         public string? get_attribute (string attr) {
  77                 return attributes[attr];
  78         }
  79
  80         /*
  81          * Returns a copy of the current attributes.
  82          *
  83          * @return map of current attributes
  84          */
  85         public Map<string,string> get_attributes () {
  86                 var result = new HashMap<string,string> (str_hash, str_equal);
  87                 foreach (var key in attributes.get_keys ()) {
  88                         result.set (key, attributes.get (key));
  89                 }
  90                 return result;
  91         }
  92
  93         string read_name () {
  94                 char* begin = current;
  95                 while (current < end) {
  96                         if (current[0] == ' ' || current[0] == '\t' || current[0] == '>'
  97                             || current[0] == '/' || current[0] == '=' || current[0] == '\n') {
  98                                 break;
  99                         }
 100                         unichar u = ((string) current).get_char_validated ((long) (end - current));
 101                         if (u != (unichar) (-1)) {
 102                                 current += u.to_utf8 (null);
 103                         } else {
 104                                 Report.error (null, "invalid UTF-8 character");
 105                         }
 106                 }
 107                 if (current == begin) {
 108                         // syntax error: invalid name
 109                 }
 110                 return ((string) begin).substring (0, (int) (current - begin));
 111         }
 112
 113         public MarkupTokenType read_token (out SourceLocation token_begin, out SourceLocation token_end) {
 114                 attributes.clear ();
 115
 116                 if (empty_element) {
 117                         empty_element = false;
 118                         token_begin = SourceLocation (begin, line, column);
 119                         token_end = SourceLocation (begin, line, column);
 120                         return MarkupTokenType.END_ELEMENT;
 121                 }
 122
 123                 content = null;
 124                 name = null;
 125
 126                 space ();
 127
 128                 MarkupTokenType type = MarkupTokenType.NONE;
 129                 char* begin = current;
 130                 token_begin = SourceLocation (begin, line, column);
 131
 132                 if (current >= end) {
 133                         type = MarkupTokenType.EOF;
 134                 } else if (current[0] == '<') {
 135                         current++;
 136                         if (current >= end) {
 137                                 // error
 138                         } else if (current[0] == '?') {
 139                                 // processing instruction
 140                         } else if (current[0] == '!') {
 141                                 // comment or doctype
 142                                 current++;
 143                                 if (current < end - 1 && current[0] == '-' && current[1] == '-') {
 144                                         // comment
 145                                         current += 2;
 146                                         while (current < end - 2) {
 147                                                 if (current[0] == '-' && current[1] == '-' && current[2] == '>') {
 148                                                         // end of comment
 149                                                         current += 3;
 150                                                         break;
 151                                                 } else if (current[0] == '\n') {
 152                                                         line++;
 153                                                         column = 0;
 154                                                 }
 155                                                 current++;
 156                                         }
 157
 158                                         // ignore comment, read next token
 159                                         return read_token (out token_begin, out token_end);
 160                                 }
 161                         } else if (current[0] == '/') {
 162                                 type = MarkupTokenType.END_ELEMENT;
 163                                 current++;
 164                                 name = read_name ();
 165                                 if (current >= end || current[0] != '>') {
 166                                         // error
 167                                 }
 168                                 current++;
 169                         } else {
 170                                 type = MarkupTokenType.START_ELEMENT;
 171                                 name = read_name ();
 172                                 space ();
 173                                 while (current < end && current[0] != '>' && current[0] != '/') {
 174                                         string attr_name = read_name ();
 175                                         if (current >= end || current[0] != '=') {
 176                                                 // error
 177                                         }
 178                                         current++;
 179                                         if (current >= end || current[0] != '"' || current[0] != '\'') {
 180                                                 // error
 181                                         }
 182                                         char quote = current[0];
 183                                         current++;
 184
 185                                         string attr_value = text (quote, false);
 186
 187                                         if (current >= end || current[0] != quote) {
 188                                                 // error
 189                                         }
 190                                         current++;
 191                                         attributes.set (attr_name, attr_value);
 192                                         space ();
 193                                 }
 194                                 if (current[0] == '/') {
 195                                         empty_element = true;
 196                                         current++;
 197                                         space ();
 198                                 } else {
 199                                         empty_element = false;
 200                                 }
 201                                 if (current >= end || current[0] != '>') {
 202                                         // error
 203                                 }
 204                                 current++;
 205                         }
 206                 } else {
 207                         space ();
 208
 209                         if (current[0] != '<') {
 210                                 content = text ('<', true);
 211                         } else {
 212                                 // no text
 213                                 // read next token
 214                                 return read_token (out token_begin, out token_end);
 215                         }
 216
 217                         type = MarkupTokenType.TEXT;
 218                 }
 219
 220                 token_end = SourceLocation (current, line, column - 1);
 221
 222                 return type;
 223         }
 224
 225         string text (char end_char, bool rm_trailing_whitespace) {
 226                 StringBuilder content = new StringBuilder ();
 227                 char* text_begin = current;
 228                 char* last_linebreak = current;
 229
 230                 while (current < end && current[0] != end_char) {
 231                         unichar u = ((string) current).get_char_validated ((long) (end - current));
 232                         if (u == (unichar) (-1)) {
 233                                 Report.error (null, "invalid UTF-8 character");
 234                         } else if (u == '&') {
 235                                 char* next_pos = current + u.to_utf8 (null);
 236                                 if (((string) next_pos).has_prefix ("amp;")) {
 237                                         content.append (((string) text_begin).substring (0, (int) (current - text_begin)));
 238                                         content.append_c ('&');
 239                                         current += 5;
 240                                         text_begin = current;
 241                                 } else if (((string) next_pos).has_prefix ("quot;")) {
 242                                         content.append (((string) text_begin).substring (0, (int) (current - text_begin)));
 243                                         content.append_c ('"');
 244                                         current += 6;
 245                                         text_begin = current;
 246                                 } else if (((string) next_pos).has_prefix ("apos;")) {
 247                                         content.append (((string) text_begin).substring (0, (int) (current - text_begin)));
 248                                         content.append_c ('\'');
 249                                         current += 6;
 250                                         text_begin = current;
 251                                 } else if (((string) next_pos).has_prefix ("lt;")) {
 252                                         content.append (((string) text_begin).substring (0, (int) (current - text_begin)));
 253                                         content.append_c ('<');
 254                                         current += 4;
 255                                         text_begin = current;
 256                                 } else if (((string) next_pos).has_prefix ("gt;")) {
 257                                         content.append (((string) text_begin).substring (0, (int) (current - text_begin)));
 258                                         content.append_c ('>');
 259                                         current += 4;
 260                                         text_begin = current;
 261                                 } else if (((string) next_pos).has_prefix ("percnt;")) {
 262                                         content.append (((string) text_begin).substring (0, (int) (current - text_begin)));
 263                                         content.append_c ('%');
 264                                         current += 8;
 265                                         text_begin = current;
 266                                 } else {
 267                                         current += u.to_utf8 (null);
 268                                 }
 269                         } else {
 270                                 if (u == '\n') {
 271                                         line++;
 272                                         column = 0;
 273                                         last_linebreak = current;
 274                                 }
 275
 276                                 current += u.to_utf8 (null);
 277                                 column++;
 278                         }
 279                 }
 280
 281                 if (text_begin != current) {
 282                         content.append (((string) text_begin).substring (0, (int) (current - text_begin)));
 283                 }
 284
 285                 column += (int) (current - last_linebreak);
 286
 287                 // Removes trailing whitespace
 288                 if (rm_trailing_whitespace) {
 289                         char* str_pos = ((char*)content.str) + content.len;
 290                         for (str_pos--; str_pos > ((char*)content.str) && str_pos[0].isspace(); str_pos--);
 291                         content.erase ((ssize_t) (str_pos-((char*) content.str) + 1), -1);
 292                 }
 293
 294                 return content.str;
 295         }
 296
 297         void space () {
 298                 while (current < end && current[0].isspace ()) {
 299                         if (current[0] == '\n') {
 300                                 line++;
 301                                 column = 0;
 302                         }
 303                         current++;
 304                         column++;
 305                 }
 306         }
 307 }
 308
 309 public enum Vala.MarkupTokenType {
 310         NONE,
 311         START_ELEMENT,
 312         END_ELEMENT,
 313         TEXT,
 314         EOF;
 315
 316         public unowned string to_string () {
 317                 switch (this) {
 318                 case START_ELEMENT: return "start element";
 319                 case END_ELEMENT: return "end element";
 320                 case TEXT: return "text";
 321                 case EOF: return "end of file";
 322                 default: return "unknown token type";
 323                 }
 324         }
 325 }
 326