vala/valamarkupreader.vala

   1 /* valamarkupreader.vala
   2  *
   3  * Copyright (C) 2008-2009  Jürg Billeter
   4  *
   5  * This library is free software; you can redistribute it and/or
   6  * modify it under the terms of the GNU Lesser General Public
   7  * License as published by the Free Software Foundation; either
   8  * version 2.1 of the License, or (at your option) any later version.
   9
  10  * This library is distributed in the hope that it will be useful,
  11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13  * Lesser General Public License for more details.
  14
  15  * You should have received a copy of the GNU Lesser General Public
  16  * License along with this library; if not, write to the Free Software
  17  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301  USA
  18  *
  19  * Author:
  20  *      Jürg Billeter <j@bitron.ch>
  21  */
  22
  23 using GLib;
  24
  /**
   * Simple reader for a subset of XML.
   *
   * The file named by {@link filename} is mapped into memory when the object
   * is constructed and is split into tokens on demand by {@link read_token}.
   *
   * Handled: start tags with quoted attributes, end tags, empty-element tags
   * (reported as a start element followed by an end element), comments
   * (skipped) and text.
   *
   * Not handled: processing instructions and doctype declarations are not
   * parsed; their opening characters are reported as a NONE token and the
   * remainder comes back as text. Entity references are not decoded.
   * Malformed markup is not diagnosed; only invalid UTF-8 is reported.
   */
  public class Vala.MarkupReader : Object {
          /**
           * Path of the file being read.
           *
           * This is a construct property so that it is already set when the
           * construct block maps the file.
           */
          public string filename { get; construct; }

          /**
           * Name of the element of the most recent start or end tag.
           */
          public string name { get; private set; }

          MappedFile mapped_file;

          // Start of the mapped buffer, read position, and one past the last byte.
          char* begin;
          char* current;
          char* end;

          // Position of `current`; line and column start at 1, columns count bytes.
          int line;
          int column;

          // Attributes of the start element returned by the last read_token call.
          Map<string,string> attributes = new HashMap<string,string> (str_hash, str_equal);

          // Set after `<foo/>` so that the next read_token call reports the end element.
          bool empty_element;

          /**
           * Creates a reader for the specified file.
           *
           * @param filename path of the file to read
           */
          public MarkupReader (string filename) {
                  Object (filename: filename);
          }

          construct {
                  line = 1;
                  column = 1;

                  try {
                          mapped_file = new MappedFile (filename, false);
                          begin = mapped_file.get_contents ();
                          end = begin + mapped_file.get_length ();
                          current = begin;
                  } catch (FileError e) {
                          // begin, current and end stay null, so read_token reports EOF
                          Report.error (null, "Unable to map file `%s': %s".printf (filename, e.message));
                  }
          }

          /**
           * Returns the raw (entities not decoded) value of an attribute of the
           * start element returned by the last call to {@link read_token}.
           *
           * @param attr attribute name
           * @return the attribute value; expected to be null when the element
           *         has no such attribute
           */
          public string? get_attribute (string attr) {
                  return attributes[attr];
          }

          /**
           * Advances by one byte unless the end of the buffer has been reached.
           *
           * Used where a delimiter is expected: the reader is lenient and
           * consumes whatever byte is present, but never moves past `end`.
           */
          void skip_byte () {
                  if (current < end) {
                          current++;
                  }
          }

          /**
           * Advances over one UTF-8 encoded character. Requires current < end.
           *
           * An invalid or truncated sequence is reported and a single byte is
           * skipped, so callers looping on this method always make progress.
           */
          void skip_char () {
                  unichar u = ((string) current).get_char_validated ((long) (end - current));
                  if (u == (unichar) (-1) || u == (unichar) (-2)) {
                          // -1: invalid sequence, -2: sequence cut off by the end of the buffer
                          Report.error (null, "invalid UTF-8 character");
                          current++;
                  } else {
                          current += u.to_utf8 (null);
                  }
          }

          /**
           * Advances over whitespace without touching line and column.
           */
          void skip_whitespace () {
                  while (current < end && current[0].isspace ()) {
                          current++;
                  }
          }

          /**
           * Updates line and column for the consumed bytes in [from, to).
           */
          void update_position (char* from, char* to) {
                  for (char* p = from; p < to; p++) {
                          if (p[0] == '\n') {
                                  line++;
                                  column = 1;
                          } else {
                                  column++;
                          }
                  }
          }

          /**
           * Reads an element or attribute name starting at the current position.
           *
           * The name ends at whitespace, `>`, `/`, `=` or the end of the buffer.
           *
           * @return the name, which is empty if the reader is already positioned
           *         on a terminator
           */
          string read_name () {
                  char* name_begin = current;
                  while (current < end) {
                          char c = current[0];
                          if (c.isspace () || c == '>' || c == '/' || c == '=') {
                                  break;
                          }
                          skip_char ();
                  }
                  return ((string) name_begin).ndup (current - name_begin);
          }

          /**
           * Reads one `name="value"` or `name='value'` pair and stores it in
           * the attribute map. Always consumes at least one byte unless the end
           * of the buffer has been reached.
           */
          void read_attribute () {
                  string attr_name = read_name ();

                  // '='
                  skip_byte ();

                  // opening quote; anything other than a single quote is treated as '"'
                  char quote = '"';
                  if (current < end && current[0] == '\'') {
                          quote = '\'';
                  }
                  skip_byte ();

                  char* value_begin = current;
                  while (current < end && current[0] != quote) {
                          skip_char ();
                  }
                  // TODO process &amp; &gt; &lt; &quot; &apos;
                  string attr_value = ((string) value_begin).ndup (current - value_begin);

                  // closing quote
                  skip_byte ();

                  attributes.set (attr_name, attr_value);
          }

          /**
           * Skips the body of a comment; the reader is positioned after `<!--`.
           *
           * An unterminated comment extends to the end of the buffer.
           */
          void skip_comment_body () {
                  while (current < end - 2) {
                          if (current[0] == '-' && current[1] == '-' && current[2] == '>') {
                                  current += 3;
                                  return;
                          }
                          current++;
                  }
                  current = end;
          }

          /**
           * Reads the next token.
           *
           * Leading whitespace and comments are skipped. For start and end
           * elements {@link name} is updated, and for start elements the
           * attributes become available through {@link get_attribute}. An
           * empty-element tag such as `<foo/>` is reported as START_ELEMENT,
           * followed by END_ELEMENT on the next call.
           *
           * @param token_begin set to the position of the first byte of the token
           * @param token_end   set to the end of the token: `pos` points just
           *                    past the token, `column` is one less than the
           *                    column following the token
           * @return the token type; NONE for markup that is not parsed (a `<`
           *         followed by `?`, by `!` not starting a comment, or by the
           *         end of the buffer), EOF at the end of the buffer
           */
          public MarkupTokenType read_token (out SourceLocation token_begin, out SourceLocation token_end) {
                  attributes.clear ();

                  if (empty_element) {
                          // second half of an empty-element tag: no input is consumed
                          empty_element = false;
                          token_begin.pos = current;
                          token_begin.line = line;
                          token_begin.column = column;
                          token_end.pos = current;
                          token_end.line = line;
                          token_end.column = column;
                          return MarkupTokenType.END_ELEMENT;
                  }

                  space ();

                  MarkupTokenType type = MarkupTokenType.NONE;
                  char* token_start = current;
                  token_begin.pos = token_start;
                  token_begin.line = line;
                  token_begin.column = column;

                  if (current >= end) {
                          type = MarkupTokenType.EOF;
                  } else if (current[0] == '<') {
                          current++;
                          if (current >= end) {
                                  // lone '<' at the end of the buffer, reported as NONE
                          } else if (current[0] == '?') {
                                  // processing instruction: not parsed, only '<' is consumed
                          } else if (current[0] == '!') {
                                  // comment or doctype
                                  current++;
                                  if (current < end - 1 && current[0] == '-' && current[1] == '-') {
                                          current += 2;
                                          skip_comment_body ();

                                          // ignore comment, read next token
                                          update_position (token_start, current);
                                          return read_token (out token_begin, out token_end);
                                  }
                                  // doctype: not parsed, only "<!" is consumed
                          } else if (current[0] == '/') {
                                  type = MarkupTokenType.END_ELEMENT;
                                  current++;
                                  name = read_name ();
                                  skip_whitespace ();
                                  // '>'
                                  skip_byte ();
                          } else {
                                  type = MarkupTokenType.START_ELEMENT;
                                  name = read_name ();
                                  skip_whitespace ();
                                  while (current < end && current[0] != '>' && current[0] != '/') {
                                          read_attribute ();
                                          skip_whitespace ();
                                  }
                                  if (current < end && current[0] == '/') {
                                          empty_element = true;
                                          current++;
                                          skip_whitespace ();
                                  } else {
                                          empty_element = false;
                                  }
                                  // '>'
                                  skip_byte ();
                          }
                  } else {
                          // text runs up to the next '<' or the end of the buffer
                          while (current < end && current[0] != '<') {
                                  skip_char ();
                          }
                          type = MarkupTokenType.TEXT;
                          // TODO process &amp; &gt; &lt; &quot; &apos;
                  }

                  // every byte of the token is accounted for exactly once
                  update_position (token_start, current);

                  token_end.pos = current;
                  token_end.line = line;
                  token_end.column = column - 1;

                  return type;
          }

          /**
           * Skips whitespace between tokens, keeping line and column up to date.
           */
          void space () {
                  char* space_begin = current;
                  skip_whitespace ();
                  update_position (space_begin, current);
          }
  }
 224
 /**
  * Kinds of token reported by a markup reader.
  */
 public enum Vala.MarkupTokenType {
         NONE,
         START_ELEMENT,
         END_ELEMENT,
         TEXT,
         EOF;

         /**
          * Returns a human-readable description of the token type.
          *
          * The result is a string constant and therefore unowned; NONE and
          * any value without a dedicated description yield
          * "unknown token type".
          *
          * @return a description of this token type
          */
         public unowned string to_string () {
                 switch (this) {
                 case START_ELEMENT:
                         return "start element";
                 case END_ELEMENT:
                         return "end element";
                 case TEXT:
                         return "text";
                 case EOF:
                         return "end of file";
                 default:
                         return "unknown token type";
                 }
         }
 }
 242