vala/valamarkupreader.vala

   1 /* valamarkupreader.vala
   2  *
   3  * Copyright (C) 2008-2009  Jürg Billeter
   4  *
   5  * This library is free software; you can redistribute it and/or
   6  * modify it under the terms of the GNU Lesser General Public
   7  * License as published by the Free Software Foundation; either
   8  * version 2.1 of the License, or (at your option) any later version.
   9
  10  * This library is distributed in the hope that it will be useful,
  11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13  * Lesser General Public License for more details.
  14
  15  * You should have received a copy of the GNU Lesser General Public
  16  * License along with this library; if not, write to the Free Software
  17  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301  USA
  18  *
  19  * Author:
  20  *      Jürg Billeter <j@bitron.ch>
  21  */
  22
  23 using GLib;
  24
  25 /**
  26  * Simple reader for a subset of XML.
  27  */
  28 public class Vala.MarkupReader : Object {
  29         public string filename { get; private set; }
  30
  31         public string name { get; private set; }
  32
  33         public string content { get; private set; }
  34
  35         MappedFile mapped_file;
  36
  37         char* begin;
  38         char* current;
  39         char* end;
  40
  41         int line;
  42         int column;
  43
  44         Map<string,string> attributes = new HashMap<string,string> (str_hash, str_equal);
  45         bool empty_element;
  46
  47         public MarkupReader (string filename) {
  48                 this.filename = filename;
  49
  50                 try {
  51                         mapped_file = new MappedFile (filename, false);
  52                         begin = mapped_file.get_contents ();
  53                         end = begin + mapped_file.get_length ();
  54
  55                         current = begin;
  56
  57                         line = 1;
  58                         column = 1;
  59                 } catch (FileError e) {
  60                         Report.error (null, "Unable to map file `%s': %s".printf (filename, e.message));
  61                 }
  62         }
  63
  64         public string? get_attribute (string attr) {
  65                 return attributes[attr];
  66         }
  67
  68         string read_name () {
  69                 char* begin = current;
  70                 while (current < end) {
  71                         if (current[0] == ' ' || current[0] == '\t' || current[0] == '>'
  72                             || current[0] == '/' || current[0] == '=' || current[0] == '\n') {
  73                                 break;
  74                         }
  75                         unichar u = ((string) current).get_char_validated ((long) (end - current));
  76                         if (u != (unichar) (-1)) {
  77                                 current += u.to_utf8 (null);
  78                         } else {
  79                                 Report.error (null, "invalid UTF-8 character");
  80                         }
  81                 }
  82                 if (current == begin) {
  83                         // syntax error: invalid name
  84                 }
  85                 return ((string) begin).substring (0, (int) (current - begin));
  86         }
  87
  88         public MarkupTokenType read_token (out SourceLocation token_begin, out SourceLocation token_end) {
  89                 attributes.clear ();
  90
  91                 if (empty_element) {
  92                         empty_element = false;
  93                         return MarkupTokenType.END_ELEMENT;
  94                 }
  95
  96                 space ();
  97
  98                 MarkupTokenType type = MarkupTokenType.NONE;
  99                 char* begin = current;
 100                 token_begin.pos = begin;
 101                 token_begin.line = line;
 102                 token_begin.column = column;
 103
 104                 if (current >= end) {
 105                         type = MarkupTokenType.EOF;
 106                 } else if (current[0] == '<') {
 107                         current++;
 108                         if (current >= end) {
 109                                 // error
 110                         } else if (current[0] == '?') {
 111                                 // processing instruction
 112                         } else if (current[0] == '!') {
 113                                 // comment or doctype
 114                                 current++;
 115                                 if (current < end - 1 && current[0] == '-' && current[1] == '-') {
 116                                         // comment
 117                                         current += 2;
 118                                         while (current < end - 2) {
 119                                                 if (current[0] == '-' && current[1] == '-' && current[2] == '>') {
 120                                                         // end of comment
 121                                                         current += 3;
 122                                                         break;
 123                                                 } else if (current[0] == '\n') {
 124                                                         line++;
 125                                                         column = 0;
 126                                                 }
 127                                                 current++;
 128                                         }
 129
 130                                         // ignore comment, read next token
 131                                         return read_token (out token_begin, out token_end);
 132                                 }
 133                         } else if (current[0] == '/') {
 134                                 type = MarkupTokenType.END_ELEMENT;
 135                                 current++;
 136                                 name = read_name ();
 137                                 if (current >= end || current[0] != '>') {
 138                                         // error
 139                                 }
 140                                 current++;
 141                         } else {
 142                                 type = MarkupTokenType.START_ELEMENT;
 143                                 name = read_name ();
 144                                 space ();
 145                                 while (current < end && current[0] != '>' && current[0] != '/') {
 146                                         string attr_name = read_name ();
 147                                         if (current >= end || current[0] != '=') {
 148                                                 // error
 149                                         }
 150                                         current++;
 151                                         // FIXME allow single quotes
 152                                         if (current >= end || current[0] != '"') {
 153                                                 // error
 154                                         }
 155                                         current++;
 156
 157                                         string attr_value = text ('"', false);
 158
 159                                         if (current >= end || current[0] != '"') {
 160                                                 // error
 161                                         }
 162                                         current++;
 163                                         attributes.set (attr_name, attr_value);
 164                                         space ();
 165                                 }
 166                                 if (current[0] == '/') {
 167                                         empty_element = true;
 168                                         current++;
 169                                         space ();
 170                                 } else {
 171                                         empty_element = false;
 172                                 }
 173                                 if (current >= end || current[0] != '>') {
 174                                         // error
 175                                 }
 176                                 current++;
 177                         }
 178                 } else {
 179                         space ();
 180
 181                         if (current[0] != '<') {
 182                                 content = text ('<', true);
 183                         } else {
 184                                 // no text
 185                                 // read next token
 186                                 return read_token (out token_begin, out token_end);
 187                         }
 188
 189                         type = MarkupTokenType.TEXT;
 190                 }
 191
 192                 token_end.pos = current;
 193                 token_end.line = line;
 194                 token_end.column = column - 1;
 195
 196                 return type;
 197         }
 198
 199         string text (char end_char, bool rm_trailing_whitespace) {
 200                 StringBuilder content = new StringBuilder ();
 201                 char* text_begin = current;
 202                 char* last_linebreak = current;
 203
 204                 while (current < end && current[0] != end_char) {
 205                         unichar u = ((string) current).get_char_validated ((long) (end - current));
 206                         if (u == (unichar) (-1)) {
 207                                 Report.error (null, "invalid UTF-8 character");
 208                         } else if (u == '&') {
 209                                 char* next_pos = current + u.to_utf8 (null);
 210                                 if (((string) next_pos).has_prefix ("amp;")) {
 211                                         content.append (((string) text_begin).substring (0, (int) (current - text_begin)));
 212                                         content.append_c ('&');
 213                                         current += 5;
 214                                         text_begin = current;
 215                                 } else if (((string) next_pos).has_prefix ("quot;")) {
 216                                         content.append (((string) text_begin).substring (0, (int) (current - text_begin)));
 217                                         content.append_c ('"');
 218                                         current += 6;
 219                                         text_begin = current;
 220                                 } else if (((string) next_pos).has_prefix ("apos;")) {
 221                                         content.append (((string) text_begin).substring (0, (int) (current - text_begin)));
 222                                         content.append_c ('\'');
 223                                         current += 6;
 224                                         text_begin = current;
 225                                 } else if (((string) next_pos).has_prefix ("lt;")) {
 226                                         content.append (((string) text_begin).substring (0, (int) (current - text_begin)));
 227                                         content.append_c ('<');
 228                                         current += 4;
 229                                         text_begin = current;
 230                                 } else if (((string) next_pos).has_prefix ("gt;")) {
 231                                         content.append (((string) text_begin).substring (0, (int) (current - text_begin)));
 232                                         content.append_c ('>');
 233                                         current += 4;
 234                                         text_begin = current;
 235                                 } else {
 236                                         current += u.to_utf8 (null);
 237                                 }
 238                         } else {
 239                                 if (u == '\n') {
 240                                         line++;
 241                                         column = 0;
 242                                         last_linebreak = current;
 243                                 }
 244
 245                                 current += u.to_utf8 (null);
 246                                 column++;
 247                         }
 248                 }
 249
 250                 if (text_begin != current) {
 251                         content.append (((string) text_begin).substring (0, (int) (current - text_begin)));
 252                 }
 253
 254                 column += (int) (current - last_linebreak);
 255
 256                 // Removes trailing whitespace
 257                 if (rm_trailing_whitespace) {
 258                         char* str_pos = ((char*)content.str) + content.len;
 259                         for (str_pos--; str_pos > ((char*)content.str) && str_pos[0].isspace(); str_pos--);
 260                         content.erase ((ssize_t) (str_pos-((char*) content.str) + 1), -1);
 261                 }
 262
 263                 return content.str;
 264         }
 265
 266         void space () {
 267                 while (current < end && current[0].isspace ()) {
 268                         if (current[0] == '\n') {
 269                                 line++;
 270                                 column = 0;
 271                         }
 272                         current++;
 273                         column++;
 274                 }
 275         }
 276 }
 277
 278 public enum Vala.MarkupTokenType {
 279         NONE,
 280         START_ELEMENT,
 281         END_ELEMENT,
 282         TEXT,
 283         EOF;
 284
 285         public unowned string to_string () {
 286                 switch (this) {
 287                 case START_ELEMENT: return "start element";
 288                 case END_ELEMENT: return "end element";
 289                 case TEXT: return "text";
 290                 case EOF: return "end of file";
 291                 default: return "unknown token type";
 292                 }
 293         }
 294 }
 295