Don't reindex already-indexed files. Yet another bug uncovered by the DateTime fixes.
[beagle.git] / Util / SemWeb / N3Parser.cs
blobbf4b425d778b1b3143754e5c2dcaadecb038dac9
1 using System;
2 using System.Collections;
3 using System.IO;
4 using System.Text;
6 using SemWeb;
7 using SemWeb.Util;
9 namespace SemWeb {
11 public class N3Reader : RdfReader {
// Marker resources handed back by the tokenizer when it meets the "@prefix"
// and "@keywords" directives. ReadStatement recognizes them by reference
// identity, not by value.
private Resource PrefixResource = new Literal("@prefix");
private Resource KeywordsResource = new Literal("@keywords");

// The text being parsed; assigned by the constructors.
private TextReader sourcestream;

// Prefix bindings; @prefix directives add to this table and QNames are
// resolved against it.
private NamespaceManager namespaces = new NamespaceManager();

// Well-known vocabulary terms produced by keywords, list syntax, and the
// "=", "=>" and "<=" shorthands.
private Entity entRDFTYPE = "http://www.w3.org/1999/02/22-rdf-syntax-ns#type";
private Entity entRDFFIRST = "http://www.w3.org/1999/02/22-rdf-syntax-ns#first";
private Entity entRDFREST = "http://www.w3.org/1999/02/22-rdf-syntax-ns#rest";
private Entity entRDFNIL = "http://www.w3.org/1999/02/22-rdf-syntax-ns#nil";
//Entity entOWLSAMEAS = "http://www.w3.org/2002/07/owl#sameAs";
private Entity entDAMLEQUIV = "http://www.daml.org/2000/12/daml+oil#equivalentTo";
private Entity entLOGIMPLIES = "http://www.w3.org/2000/10/swap/log#implies";
// Creates a reader that parses N3 text from an already-open TextReader.
public N3Reader(TextReader source) {
    sourcestream = source;
}
// Creates a reader for the named file. The stream is obtained through
// GetReader (declared outside this class body) and the base URI becomes
// "file:<sourcefile>#".
public N3Reader(string sourcefile) {
    sourcestream = GetReader(sourcefile);
    BaseUri = string.Concat("file:", sourcefile, "#");
}
// Keyword settings established by an @keywords directive. This is a
// reference type on purpose: ParseContext is a struct that is copied on
// every call, so state that must outlive the call that sets it has to live
// behind a shared reference.
private sealed class KeywordState {
    public bool UsingKeywords;
    public Hashtable Keywords;
}

// The state threaded through the recursive-descent parsing methods.
// It is a value type: assigning it (as the "{ ... }" formula handling does
// to swap in a new meta entity) produces an independent copy of the fields
// below, while the objects those fields reference remain shared.
private struct ParseContext {
    public MyReader source;
    public StatementSink store;
    public NamespaceManager namespaces;
    public UriMap namedNode;
    public Hashtable anonymous;
    public Entity meta;

    // Shared holder behind UsingKeywords/Keywords. Select creates it once so
    // that every copy of the context sees the same keyword settings.
    public KeywordState keywordState;

    // True once an @keywords directive has been read.
    public bool UsingKeywords {
        get { return keywordState != null && keywordState.UsingKeywords; }
        set {
            // Fallback for a context that was not set up by Select; in that
            // case the holder is only visible to this copy.
            if (keywordState == null) keywordState = new KeywordState();
            keywordState.UsingKeywords = value;
        }
    }

    // The set of names declared by @keywords (name -> name), or null when no
    // directive has been read.
    public Hashtable Keywords {
        get { return keywordState == null ? null : keywordState.Keywords; }
        set {
            if (keywordState == null) keywordState = new KeywordState();
            keywordState.Keywords = value;
        }
    }

    // Current line/column of the underlying reader.
    public Location Location { get { return new Location(source.Line, source.Col); } }
}

// Parses the whole source stream, sending every statement to 'store'.
// GetDupCheckSink and Meta are declared outside this class body.
public override void Select(StatementSink store) {
    ParseContext context = new ParseContext();
    context.source = new MyReader(sourcestream);
    context.store = GetDupCheckSink(store);
    context.namespaces = namespaces;
    context.namedNode = new UriMap();
    context.anonymous = new Hashtable();
    context.meta = Meta;

    // ReadStatement receives the context by value. Without a shared holder
    // the keyword settings it records for @keywords would be discarded as
    // soon as it returned.
    context.keywordState = new KeywordState();

    while (ReadStatement(context)) { }
}
// Parses one top-level item: an @prefix directive, an @keywords directive,
// a bare entity assertion, or a subject followed by its predicates/objects.
// Returns true when the caller should keep reading, and false when no
// subject could be read (end of input) or a '}' closing the enclosing
// formula was consumed.
private bool ReadStatement(ParseContext context) {
    Location loc = context.Location;

    bool reverse;
    Resource subject = ReadResource(context, out reverse);
    if (subject == null) return false;
    if (reverse) OnError("is...of not allowed on a subject", loc);

    // @prefix name: <uri> .
    if ((object)subject == (object)PrefixResource) {
        loc = context.Location;
        string qname = ReadToken(context.source, context) as string;
        if (qname == null || !qname.EndsWith(":")) OnError("When using @prefix, the prefix identifier must end with a colon", loc);

        loc = context.Location;
        Resource uri = ReadResource(context, out reverse);
        if (uri == null) OnError("Expecting a URI", loc);
        if (reverse) OnError("is...of not allowed here", loc);
        namespaces.AddNamespace(uri.Uri, qname.Substring(0, qname.Length-1));

        loc = context.Location;
        char punc = ReadPunc(context.source);
        if (punc != '.')
            OnError("Expected a period but found '" + punc + "'", loc);
        return true;
    }

    // @keywords name, name, ... .
    if ((object)subject == (object)KeywordsResource) {
        context.UsingKeywords = true;
        context.Keywords = new Hashtable();
        while (true) {
            ReadWhitespace(context.source);
            loc = context.Location;

            int next = context.source.Peek();
            if (next == -1) {
                // ReadToken yields "" at end of input, so without this check
                // an unterminated directive would loop forever.
                OnError("End of file while reading the @keywords directive", loc);
            }
            if (next == '.') {
                context.source.Read();
                break;
            }
            if (next == ',') {
                // Keyword names are comma-separated; the comma carries no
                // information of its own.
                context.source.Read();
                continue;
            }

            string tok = ReadToken(context.source, context) as string;
            if (tok == null || tok.Length == 0)
                OnError("Expecting keyword names", loc);

            context.Keywords[tok] = tok;
        }
        return true;
    }

    // It's possible to just assert the presence of an entity
    // by following the entity with a period, or a } to end
    // a reified context.
    if (NextPunc(context.source) == '.') {
        context.source.Read();
        return true;
    }
    if (NextPunc(context.source) == '}') {
        context.source.Read();
        return false; // end of block
    }

    // Read the predicates for this subject.
    char period = ReadPredicates(subject, context);
    loc = context.Location;
    if (period != '.' && period != '}')
        OnError("Expected a period but found '" + period + "'", loc);
    if (period == '}') return false;
    return true;
}
// Reads a ';'-separated run of predicates for 'subject' and returns the
// punctuation character that ended the run.
private char ReadPredicates(Resource subject, ParseContext context) {
    char terminator;
    do {
        terminator = ReadPredicate(subject, context);
    } while (terminator == ';');
    return terminator;
}
// Reads one predicate followed by its ','-separated objects, emitting a
// statement per object, and returns the punctuation that followed the last
// object ('.', ';', ']' or '}').
private char ReadPredicate(Resource subject, ParseContext context) {
    Location loc = context.Location;

    bool reverse;
    Resource predicate = ReadResource(context, out reverse);
    if (predicate == null) OnError("Expecting a predicate", loc);
    if (predicate is Literal) OnError("Predicates cannot be literals", loc);

    Entity verb = (Entity)predicate;

    char punctuation;
    do {
        ReadObject(subject, verb, context, reverse);
        loc = context.Location;
        punctuation = ReadPunc(context.source);
    } while (punctuation == ',');

    switch (punctuation) {
        case '.':
        case ';':
        case ']':
        case '}':
            break;
        default:
            OnError("Expecting a period, semicolon, comma, or close-bracket but found '" + punctuation + "'", loc);
            break;
    }

    return punctuation;
}
// Reads a single object and adds the resulting statement to the store.
// When 'reverse' is set (an is...of predicate) the object that was read
// becomes the statement's subject and 'subject' becomes its object.
private void ReadObject(Resource subject, Entity predicate, ParseContext context, bool reverse) {
    Location loc = context.Location;

    bool objectReversed;
    Resource value = ReadResource(context, out objectReversed);
    if (value == null) OnError("Expecting a resource or literal object", loc);
    if (objectReversed) OnError("is...of not allowed on objects", loc);

    loc = context.Location;

    if (reverse) {
        if (value is Literal) OnError("A literal cannot be the object of a reverse-predicate statement", loc);
        Add(context.store, new Statement((Entity)value, predicate, subject, context.meta), loc);
        return;
    }

    if (subject is Literal) OnError("Subjects of statements cannot be literals", loc);
    Add(context.store, new Statement((Entity)subject, predicate, value, context.meta), loc);
}
// Skips any mixture of whitespace and '#' comments. A comment runs up to
// and including the next line feed (10) or carriage return (13), or to the
// end of input.
private void ReadWhitespace(MyReader source) {
    for (;;) {
        while (char.IsWhiteSpace((char)source.Peek()))
            source.Read();

        if (source.Peek() != '#')
            return;

        // Consume the comment, then go round again for whatever follows it.
        int c;
        do {
            c = source.Read();
        } while (c != -1 && c != 10 && c != 13);
    }
}
// Skips whitespace/comments, then consumes and returns the next character.
// Reports an error if the input is exhausted.
private char ReadPunc(MyReader source) {
    ReadWhitespace(source);

    int next = source.Read();
    if (next == -1)
        OnError("End of file expecting punctuation", new Location(source.Line, source.Col));

    return (char)next;
}
// Skips whitespace/comments and returns the next character without
// consuming it (-1 at end of input).
private int NextPunc(MyReader source) {
    ReadWhitespace(source);
    int upcoming = source.Peek();
    return upcoming;
}
// Appends to 'b' the character(s) denoted by the escape sequence whose
// letter (the character after the backslash) is 'c'. For \u and \U the
// hexadecimal digits are read from 'source'. 'loc' is used for error
// reporting only.
//
//   \uXXXX      one UTF-16 code unit
//   \UXXXXXXXX  a code point up to U+10FFFF; values above U+FFFF are
//               appended as a surrogate pair
//
// A backslash followed by a line break appends nothing.
private void ReadEscapedChar(char c, StringBuilder b, MyReader source, Location loc) {
    switch (c) {
        case 'n': b.Append('\n'); return;
        case 'r': b.Append('\r'); return;
        case 't': b.Append('\t'); return;
        case '\\': b.Append('\\'); return;
        case '"': b.Append('"'); return;
        case '\'': b.Append('\''); return;
        case 'a': b.Append('\a'); return;
        case 'b': b.Append('\b'); return;
        case 'f': b.Append('\f'); return;
        case 'v': b.Append('\v'); return;
        case '\n': return;
        case '\r': return;

        case 'u':
        case 'U': {
            int digits = (c == 'u') ? 4 : 8;
            int codepoint = 0;
            for (int i = 0; i < digits; i++) {
                int h = source.Read();
                int v = -1;
                if (h >= '0' && h <= '9') v = h - '0';
                else if (h >= 'a' && h <= 'f') v = h - 'a' + 10;
                else if (h >= 'A' && h <= 'F') v = h - 'A' + 10;

                // Covers both a non-hex character and end of input (-1), so
                // the failure is reported as a parser error with a location.
                if (v < 0)
                    OnError("Invalid \\" + c + " escape: expected " + digits + " hexadecimal digits", loc);

                codepoint = (codepoint << 4) | v;
            }

            // Eight hex digits can overflow into the sign bit, hence the
            // check for negative values as well.
            if (codepoint < 0 || codepoint > 0x10FFFF)
                OnError("Invalid \\" + c + " escape: code point is out of range", loc);

            if (codepoint <= 0xFFFF) {
                b.Append((char)codepoint);
            } else {
                // A char holds a single UTF-16 code unit, so code points
                // beyond the BMP must be written as a surrogate pair.
                int offset = codepoint - 0x10000;
                b.Append((char)(0xD800 | (offset >> 10)));
                b.Append((char)(0xDC00 | (offset & 0x3FF)));
            }
            return;
        }
    }

    if (char.IsDigit(c) || c == 'x')
        OnError("Octal and hex byte-value escapes are deprecated and not supported", loc);
    else
        OnError("Unrecognized escape character: " + c, loc);
}
// Scratch buffer shared by all ReadToken calls so that a new StringBuilder
// is not allocated for every token.
private StringBuilder readTokenBuffer = new StringBuilder();

// Reads the next token from 'source'.
//
// Returns a Literal for a quoted string (with its optional @lang or
// ^^datatype suffix) and a string for everything else:
//   ""            end of input
//   "<...>"       a URI, angle brackets included
//   "<=", "=", "=>", "[", "{", "(", ")"   punctuation/verbs
//   otherwise the raw text of a name, QName, variable, @directive or number
// Any other leading character is reported as an invalid token.
private object ReadToken(MyReader source, ParseContext context) {
    ReadWhitespace(source);

    Location loc = new Location(source.Line, source.Col);

    int firstchar = source.Read();
    if (firstchar == -1)
        return "";

    StringBuilder b = readTokenBuffer;
    b.Length = 0;
    b.Append((char)firstchar);

    if (firstchar == '<') {
        // This is a URI or the <= verb.  URIs can be escaped like strings, at least in the NTriples spec.
        bool escaped = false;
        while (true) {
            int c = source.Read();
            if (c == -1) OnError("Unexpected end of stream within a token beginning with <", loc);

            // The buffer holds only the '<' at this point, so an '=' as the
            // very next character is the <= verb. Testing the length
            // against 1 keeps URIs such as <a=b> intact.
            if (b.Length == 1 && !escaped && c == '=')
                return "<="; // the <= verb

            if (escaped) {
                ReadEscapedChar((char)c, b, source, loc);
                escaped = false;
            } else if (c == '\\') {
                escaped = true;
            } else {
                b.Append((char)c);
                if (c == '>') // end of the URI
                    break;
            }
        }

    } else if (firstchar == '"') {
        // A string: ("""[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*""")|("[^"\\]*(?:\\.[^"\\]*)*")
        b.Length = 0; // get rid of the open quote

        // Decide once, up front, whether this is a """long string""".
        // Re-testing inside the loop would mistake the closing quotes of an
        // empty long string, or an escaped quote at the very start, for
        // another opening delimiter.
        bool triplequoted = false;
        if (source.Peek() == '"' && source.Peek2() == '"') {
            triplequoted = true;
            source.Read();
            source.Read();
        }

        bool escaped = false;
        while (true) {
            int c = source.Read();
            if (c == -1) OnError("Unexpected end of stream within a string", loc);

            if (escaped) {
                ReadEscapedChar((char)c, b, source, loc);
                escaped = false;
            } else if (c == '\\') {
                escaped = true;
            } else {
                if (c == '"' && !triplequoted)
                    break;
                if (c == '"' && triplequoted && source.Peek() == '"' && source.Peek2() == '"')
                    break;
                b.Append((char)c);
            }
        }

        if (triplequoted) { // read the extra end quotes
            source.Read();
            source.Read();
        }

        string litvalue = b.ToString();
        string litlang = null;
        string litdt = null;

        // Strings can be suffixed with @langcode or ^^symbol (but not both?).
        if (source.Peek() == '@') {
            source.Read();
            b.Length = 0;
            while (char.IsLetterOrDigit((char)source.Peek()) || source.Peek() == (int)'-')
                b.Append((char)source.Read());
            litlang = b.ToString();
        } else if (source.Peek() == '^' && source.Peek2() == '^') {
            loc = new Location(source.Line, source.Col);
            source.Read();
            source.Read();
            // The recursive call reuses the shared buffer; litvalue has
            // already been copied out of it above.
            litdt = ReadToken(source, context).ToString(); // better be a string URI
            if (litdt.StartsWith("<") && litdt.EndsWith(">"))
                litdt = litdt.Substring(1, litdt.Length-2);
            else if (litdt.IndexOf(":") != -1) {
                Resource r = ResolveQName(litdt, context, loc);
                if (r.Uri == null)
                    OnError("A literal datatype cannot be an anonymous entity", loc);
                litdt = r.Uri;
            }
        }

        return new Literal(litvalue, litlang, litdt);

    } else if (char.IsLetter((char)firstchar) || firstchar == '?' || firstchar == '@' || firstchar == ':' || firstchar == '_') {
        // Something starting with @
        // A QName: ([a-zA-Z_][a-zA-Z0-9_]*)?:)?([a-zA-Z_][a-zA-Z0-9_]*)?
        // A variable: \?[a-zA-Z_][a-zA-Z0-9_]*
        while (true) {
            int c = source.Peek();
            if (c == -1 || (!char.IsLetterOrDigit((char)c) && c != '-' && c != '_' && c != ':')) break;
            b.Append((char)source.Read());
        }

    } else if (char.IsDigit((char)firstchar) || firstchar == '+' || firstchar == '-') {
        while (true) {
            int ci = source.Peek();
            if (ci == -1) break;

            // punctuation followed by a space (or by the end of the input)
            // means the punctuation is punctuation, and not part of this
            // token
            if (!char.IsDigit((char)ci)) {
                int after = source.Peek2();
                if (after == -1 || char.IsWhiteSpace((char)after))
                    break;
            }

            char c = (char)ci;
            if (char.IsWhiteSpace(c)) break;

            b.Append((char)source.Read());
        }

    } else if (firstchar == '=') {
        // "=" on its own, or the "=>" verb.
        if (source.Peek() == (int)'>')
            b.Append((char)source.Read());

    } else if (firstchar == '[') {
        // The start of an anonymous node; the buffer already holds "[".

    } else if (firstchar == '{') {
        return "{";

    } else if (firstchar == '(') {
        return "(";
    } else if (firstchar == ')') {
        return ")";

    } else {
        // Gather the rest of the run so the error message shows it.
        while (true) {
            int c = source.Read();
            if (c == -1) break;
            if (char.IsWhiteSpace((char)c)) break;
            b.Append((char)c);
        }
        OnError("Invalid token: " + b.ToString(), loc);
    }

    return b.ToString();
}
// Reads a resource and then any path expression attached to it
// (res!path, res^path, res.path). Each path step adds one statement that
// links the current resource to a fresh anonymous entity, which then
// becomes the current resource. 'reverse' reports whether the first
// resource was an is...of or <= form.
private Resource ReadResource(ParseContext context, out bool reverse) {
    Resource res = ReadResource2(context, out reverse);

    for (;;) {
        ReadWhitespace(context.source);

        // A path step starts with '!' or '^', or with a '.' that is
        // directly followed by a letter (otherwise '.' ends the statement).
        int next = context.source.Peek();
        bool isPathStep = next == '!' || next == '^';
        if (!isPathStep && next == '.') {
            int after = context.source.Peek2();
            isPathStep = after != -1 && char.IsLetter((char)after);
        }
        if (!isPathStep)
            break;

        int pathType = context.source.Read();

        bool reverse2;
        Location loc = context.Location;
        Resource path = ReadResource2(context, out reverse2);
        if (reverse || reverse2) OnError("is...of is not allowed in path expressions", loc);
        if (!(path is Entity)) OnError("A path expression cannot be a literal", loc);

        Entity anon = new Entity(null);

        Statement s;
        if (pathType == '^') {
            // Backward step: the anonymous node is the subject.
            s = new Statement(anon, (Entity)path, res, context.meta);
        } else {
            // Forward step ('!' or '.'): the anonymous node is the object.
            if (!(res is Entity)) OnError("A path expression cannot contain a literal: " + res, loc);
            s = new Statement((Entity)res, (Entity)path, anon, context.meta);
        }

        Add(context.store, s, loc);

        res = anon;
    }

    return res;
}
// Returns an Entity for 'uri'. When ReuseEntities (declared outside this
// class body) is set, one Entity per URI is cached in context.namedNode
// and handed out again on later requests; otherwise a new Entity is made
// for every call.
private Entity GetResource(ParseContext context, string uri) {
    if (ReuseEntities) {
        Entity cached = (Entity)context.namedNode[uri];
        if (cached != null) return cached;

        Entity created = new Entity(uri);
        context.namedNode[uri] = created;
        return created;
    }

    return new Entity(uri);
}
// Turns a "prefix:local" token into a resource.
//   "_:name"   an anonymous entity, one per distinct token within this parse
//   ":local"   BaseUri (or "" when it is null) followed by the local name
//   "p:local"  the namespace bound to p followed by the local name; an
//              unbound prefix is an error
private Resource ResolveQName(string str, ParseContext context, Location loc) {
    int colon = str.IndexOf(":");
    string prefix = str.Substring(0, colon);

    switch (prefix) {
        case "_": {
            Resource ret = (Resource)context.anonymous[str];
            if (ret == null) {
                ret = new Entity(null);
                context.anonymous[str] = ret;
            }
            return ret;
        }

        case "": {
            string baseUri = BaseUri == null ? "" : BaseUri;
            return GetResource(context, baseUri + str.Substring(colon+1));
        }

        default: {
            string ns = context.namespaces.GetNamespace(prefix);
            if (ns == null)
                OnError("Prefix is undefined: " + str, loc);
            return GetResource(context, ns + str.Substring(colon+1));
        }
    }
}
// Reads one token and interprets it as a resource.
//
// Returns null at end of input and for a ")" token (end of a list).
// Returns the PrefixResource/KeywordsResource markers for the @prefix and
// @keywords directives. 'reverse' is set for the "<=" verb and for
// "is <predicate> of". Bracketed nodes, lists and "{ ... }" formulas are
// parsed recursively, adding their statements to the store as they go.
private Resource ReadResource2(ParseContext context, out bool reverse) {
    reverse = false;

    Location loc = context.Location;

    object tok = ReadToken(context.source, context);
    if (tok is Literal)
        return (Literal)tok;

    string str = (string)tok;
    if (str == "")
        return null;

    // @ Keywords

    if (str == "@prefix")
        return PrefixResource;

    if (str == "@keywords")
        return KeywordsResource;

    // Bare words are keywords either because @keywords listed them or, when
    // no @keywords directive is in force, because they are one of the
    // built-in three.
    if (context.UsingKeywords && context.Keywords.Contains(str))
        str = "@" + str;
    if (!context.UsingKeywords &&
        ( str == "a" || str == "has" || str == "is"))
        str = "@" + str;

    // Standard Keywords
    // TODO: Turn these off with @keywords

    if (str == "@a")
        return entRDFTYPE;

    if (str == "=")
        return entDAMLEQUIV;
    if (str == "=>")
        return entLOGIMPLIES;
    if (str == "<=") {
        reverse = true;
        return entLOGIMPLIES;
    }

    if (str == "@has") // ignore this token
        return ReadResource2(context, out reverse);

    if (str == "@is") {
        // Reverse predicate
        bool reversetemp;
        Resource pred = ReadResource2(context, out reversetemp);
        reverse = true;

        string of = ReadToken(context.source, context) as string;
        if (of == null) OnError("End of stream while expecting 'of'", loc);
        if (of == "@of"
            || (!context.UsingKeywords && of == "of")
            || (context.UsingKeywords && context.Keywords.Contains("of") && of == "of"))
            return pred;
        OnError("Expecting token 'of' but found '" + of + "'", loc);
        return null; // unreachable
    }

    if (str.StartsWith("@"))
        OnError("The " + str + " directive is not supported", loc);

    // URI

    if (str.StartsWith("<") && str.EndsWith(">")) {
        // GetAbsoluteUri is declared outside this class body.
        string uri = GetAbsoluteUri(BaseUri, str.Substring(1, str.Length-2));
        return GetResource(context, uri);
    }

    // VARIABLE

    if (str[0] == '?') {
        string uri = str.Substring(1);
        if (BaseUri != null)
            uri = BaseUri + uri;
        Entity var = GetResource(context, uri);
        AddVariable(var);
        return var;
    }

    // QNAME

    if (str.IndexOf(":") != -1)
        return ResolveQName(str, context, loc);

    // ANONYMOUS

    if (str == "[") {
        Entity ret = new Entity(null);
        ReadWhitespace(context.source);
        if (context.source.Peek() != ']') {
            char bracket = ReadPredicates(ret, context);
            // Tolerate a period before the closing bracket.
            if (bracket == '.')
                bracket = ReadPunc(context.source);
            if (bracket != ']')
                OnError("Expected a close bracket but found '" + bracket + "'", loc);
        } else {
            context.source.Read();
        }
        return ret;
    }

    // LIST

    if (str == "(") {
        // A list: a chain of anonymous nodes linked by rdf:rest, each
        // carrying one item as rdf:first, ending in rdf:nil. The value
        // returned is the last node of the chain.
        Entity ent = null;
        while (true) {
            bool rev2;
            Resource res = ReadResource(context, out rev2);
            if (res == null)
                break;

            if (ent == null) {
                ent = new Entity(null);
            } else {
                Entity sub = new Entity(null);
                Add(context.store, new Statement(ent, entRDFREST, sub, context.meta), loc);
                ent = sub;
            }

            Add(context.store, new Statement(ent, entRDFFIRST, res, context.meta), loc);
        }
        if (ent == null) // No list items.
            ent = entRDFNIL; // according to Turtle spec
        else
            Add(context.store, new Statement(ent, entRDFREST, entRDFNIL, context.meta), loc);
        return ent;
    }

    if (str == ")")
        return null; // Should I use a more precise end-of-list return value?

    // REIFICATION

    if (str == "{") {
        // Embedded resource: statements inside the braces are read with a
        // copy of the context whose meta is a fresh anonymous entity, and
        // that entity is the value of the formula.
        ParseContext newcontext = context;
        newcontext.meta = new Entity(null);
        while (NextPunc(context.source) != '}' && ReadStatement(newcontext)) { }
        ReadWhitespace(context.source);
        if (context.source.Peek() == '}') context.source.Read();
        return newcontext.meta;
    }

    // NUMERIC LITERAL

    // In Turtle, numbers are restricted to [0-9]+, and are datatyped xsd:integer.
    //
    // Parse and re-serialize with the invariant culture so the result does
    // not depend on the machine's regional settings. NumberStyles.Float is
    // used rather than Any so that group separators and currency symbols
    // are not silently accepted (with Any, "1,2" would parse as 12).
    double numval;
    if (double.TryParse(str, System.Globalization.NumberStyles.Float, System.Globalization.CultureInfo.InvariantCulture, out numval))
        return new Literal(numval.ToString(System.Globalization.CultureInfo.InvariantCulture));

    // If @keywords is used, alphanumerics that aren't keywords
    // are local names in the default namespace.
    if (context.UsingKeywords && char.IsLetter(str[0])) {
        if (BaseUri == null)
            OnError("The document contains an unqualified name but no BaseUri was specified: \"" + str + "\"", loc);
        return GetResource(context, BaseUri + str);
    }

    // NOTHING MATCHED

    OnError("Invalid token: " + str, loc);
    return null;
}
// Adds 'statement' to 'store'. Any exception thrown by the sink is
// re-reported through OnError with the source position attached and the
// original exception passed along as the cause.
private void Add(StatementSink store, Statement statement, Location position) {
    try {
        store.Add(statement);
    } catch (Exception e) {
        string message = "Add failed on statement { " + statement + " }: " + e.Message;
        OnError(message, position, e);
    }
}
// Raises a ParserException whose message is 'message' followed by the line
// and column of 'position'. Never returns normally.
private void OnError(string message, Location position) {
    string located = message + ", line " + position.Line + " col " + position.Col;
    throw new ParserException(located);
}
// Raises a ParserException whose message is 'message' followed by the line
// and column of 'position', passing 'cause' to the exception's constructor.
// Never returns normally.
private void OnError(string message, Location position, Exception cause) {
    string located = message + ", line " + position.Line + " col " + position.Col;
    throw new ParserException(located, cause);
}
// Wraps a TextReader with two characters of lookahead and line/column
// tracking. Characters are returned as ints, with -1 for end of input as
// delivered by the underlying reader.
internal class MyReader {
    TextReader r;

    // Position of the most recently consumed character. Line starts at 1;
    // Col is reset to 0 whenever a line feed is consumed.
    public int Line = 1;
    public int Col = 0;

    // Characters fetched from the underlying reader but not yet consumed.
    int[] peeked = new int[2];
    int peekCount = 0;

    public MyReader(TextReader reader) {
        r = reader;
    }

    public Location Location {
        get { return new Location(Line, Col); }
    }

    // Returns the next character without consuming it.
    public int Peek() {
        if (peekCount > 0)
            return peeked[0];

        peeked[0] = r.Read();
        peekCount = 1;
        return peeked[0];
    }

    // Returns the character after the next one without consuming anything.
    public int Peek2() {
        Peek(); // guarantees at least one buffered character
        if (peekCount < 2) {
            peeked[1] = r.Read();
            peekCount = 2;
        }
        return peeked[1];
    }

    // Consumes and returns the next character, updating Line and Col.
    public int Read() {
        int c;

        if (peekCount == 0) {
            c = r.Read();
        } else {
            // Take the oldest buffered character and shift the other down.
            c = peeked[0];
            peeked[0] = peeked[1];
            peekCount--;
        }

        if (c == '\n') {
            Line++;
            Col = 0;
        } else {
            Col++;
        }

        return c;
    }
}
// An immutable line/column position within the source text.
internal struct Location {
    public readonly int Line;
    public readonly int Col;

    public Location(int line, int col) {
        Line = line;
        Col = col;
    }
}