Don't reindex already indexed files. Yet another bug uncovered by the DateTime fixes.
[beagle.git] / Util / SemWeb / XmlParser.cs
blobb63c7a86cd28dde1bf6a5ed638bf5ef5bf318e85
1 using System;
2 using System.Collections;
3 using System.IO;
4 using System.Text;
5 using System.Xml;
7 using SemWeb.Util;
9 namespace SemWeb {
10 public class RdfXmlReader : RdfReader {
11 // TODO: Make some of the errors warnings.
// The XML source being parsed; assigned by the constructors.
XmlReader xml;

// Maps an rdf:nodeID label to the blank-node Entity created for it.
Hashtable blankNodes = new Hashtable();

// Maps a URI to its Entity; consulted by GetNamedNode when ReuseEntities is set.
UriMap namedNodes = new UriMap();

// URIs that have already been introduced through rdf:ID on a description.
Hashtable seenIDs = new Hashtable();

// The sink that receives statements while Select is running.
StatementSink storage;

// Entities for the RDF vocabulary terms emitted by the parser.
static readonly Entity rdfType = "http://www.w3.org/1999/02/22-rdf-syntax-ns#type";
static readonly Entity rdfFirst = "http://www.w3.org/1999/02/22-rdf-syntax-ns#first";
static readonly Entity rdfRest = "http://www.w3.org/1999/02/22-rdf-syntax-ns#rest";
static readonly Entity rdfNil = "http://www.w3.org/1999/02/22-rdf-syntax-ns#nil";
static readonly Entity rdfSubject = "http://www.w3.org/1999/02/22-rdf-syntax-ns#subject";
static readonly Entity rdfPredicate = "http://www.w3.org/1999/02/22-rdf-syntax-ns#predicate";
static readonly Entity rdfObject = "http://www.w3.org/1999/02/22-rdf-syntax-ns#object";
static readonly Entity rdfStatement = "http://www.w3.org/1999/02/22-rdf-syntax-ns#Statement";
// Parses RDF/XML held in an in-memory DOM document.
public RdfXmlReader(XmlDocument document) {
	this.xml = new XmlNodeReader(document);
}

// Parses RDF/XML from an existing reader. The reader is wrapped in an
// XmlValidatingReader whose validation is switched off.
public RdfXmlReader(XmlReader document) {
	XmlValidatingReader wrapper = new XmlValidatingReader(document);
	wrapper.ValidationType = ValidationType.None;
	this.xml = wrapper;
}

// Parses RDF/XML text supplied by a TextReader.
public RdfXmlReader(TextReader document)
	: this(new XmlTextReader(document)) {
}

// Parses RDF/XML from a stream.
public RdfXmlReader(Stream document)
	: this(new XmlTextReader(document)) {
}

// Parses RDF/XML from whatever GetReader returns for the given name
// (GetReader is defined outside this block).
public RdfXmlReader(string file)
	: this(GetReader(file)) {
}
// Parses the document and sends every statement found to the given sink.
//
// The sink is first passed through GetDupCheckSink (defined outside this
// block) and remembered in this.storage for the parsing helpers.
//
// The reader is advanced until it reaches a node named rdf:RDF. From then
// on, each element node the reader stops on is handed to ParseDescription
// until the input is exhausted. A document in which no rdf:RDF node is
// ever reached produces no statements.
//
// NOTE(review): the RDF/XML syntax also allows a single node element as the
// document element when rdf:RDF is omitted; that case is not handled here.
//
// The reader is closed before this method returns, including when parsing
// fails with an exception, so the underlying input is never left open.
public override void Select(StatementSink storage) {
	storage = GetDupCheckSink(storage);
	this.storage = storage;

	try {
		while (xml.Read()) {
			if (xml.NamespaceURI != NS.RDF || xml.LocalName != "RDF")
				continue;

			while (xml.Read()) {
				if (xml.NodeType == XmlNodeType.Element)
					ParseDescription();
			}

			// The inner loop only ends at end of input.
			break;
		}
	} finally {
		xml.Close();
	}
}
// Returns the full name of the node the reader is on: its namespace URI
// immediately followed by its local name.
private string CurNode() {
	string ns = xml.NamespaceURI;
	string local = xml.LocalName;
	return ns + local;
}
// Returns 1 when the attribute value is present (non-null) and 0 when it
// is null, so that several attributes can be counted by addition.
private int isset(string attribute) {
	if (attribute == null)
		return 0;
	return 1;
}
// Resolves a possibly relative URI. The base is the reader's current
// BaseURI, or the BaseUri member when the reader reports an empty base.
// The resolution itself is done by GetAbsoluteUri (defined outside this
// block).
private string Unrelativize(string uri) {
	string baseUri = xml.BaseURI;
	if (baseUri == "")
		baseUri = BaseUri;
	return GetAbsoluteUri(baseUri, uri);
}
// Returns the blank-node entity associated with an rdf:nodeID label,
// creating and remembering a new one the first time a label is seen.
private Entity GetBlankNode(string nodeID) {
	// Only Entity instances are ever stored, so a null lookup means "not seen yet".
	object known = blankNodes[nodeID];
	if (known != null)
		return (Entity)known;

	Entity fresh = new Entity(null);
	blankNodes[nodeID] = fresh;
	return fresh;
}
// Returns an entity for the given URI. When ReuseEntities is set, one
// entity per URI is kept in namedNodes and handed out on every request;
// otherwise each call creates a new entity.
private Entity GetNamedNode(string uri) {
	if (ReuseEntities) {
		Entity cached = (Entity)namedNodes[uri];
		if (cached != null)
			return cached;

		cached = new Entity(uri);
		namedNodes[uri] = cached;
		return cached;
	}

	return new Entity(uri);
}
// Parses the node element the reader is currently on and returns the
// entity it describes.
//
// The entity is chosen from rdf:about, rdf:ID or rdf:nodeID (at most one
// may be given; only the namespace-qualified attributes are consulted).
// With none of them, a new anonymous entity is used. An element name other
// than rdf:Description contributes an rdf:type statement. The element's
// property attributes and child property elements are then parsed.
//
// On return the reader is wherever ParsePropertyNodes left it.
private Entity ParseDescription() {
	string nodeID = xml.GetAttribute("nodeID", NS.RDF);
	string about = xml.GetAttribute("about", NS.RDF);
	string ID = xml.GetAttribute("ID", NS.RDF);

	// Count how many of the mutually exclusive identifiers are present.
	int identifiers = 0;
	if (nodeID != null) identifiers++;
	if (about != null) identifiers++;
	if (ID != null) identifiers++;
	if (identifiers > 1)
		OnError("An entity description cannot specify more than one of rdf:nodeID, rdf:about, and rdf:ID");

	Entity described;
	if (about != null) {
		described = GetNamedNode(Unrelativize(about));
	} else if (ID != null) {
		// rdf:ID names a fragment of the base URI and may be used only once.
		described = GetNamedNode(Unrelativize("#" + ID));

		if (seenIDs.ContainsKey(described.Uri))
			OnError("Two descriptions cannot use the same rdf:ID: <" + described.Uri + ">");
		seenIDs[described.Uri] = seenIDs;
	} else if (nodeID != null) {
		described = GetBlankNode(nodeID);
	} else {
		described = new Entity(null);
	}

	// A typed node element: the element name is the rdf:type of the entity.
	string elementName = CurNode();
	if (elementName != NS.RDF + "Description") {
		if (elementName == NS.RDF + "li")
			OnError("rdf:li cannot be the type of a node");
		storage.Add(new Statement(described, rdfType, (Entity)elementName, Meta));
	}

	ParsePropertyAttributes(described);
	ParsePropertyNodes(described);

	return described;
}
// Turns the attributes of the current element into statements about the
// given entity and reports whether any statement was produced.
//
// rdf:type becomes a statement with an entity object; every other
// accepted attribute becomes a statement with a literal object carrying
// the language in scope. Attributes without a namespace, the RDF syntax
// attributes, and attributes of the xml/xmlns kind are skipped. rdf:li,
// rdf:aboutEach and rdf:aboutEachPrefix are reported as errors.
//
// When the element has attributes, the reader is moved back onto the
// element before returning.
private bool ParsePropertyAttributes(Entity entity) {
	if (!xml.MoveToFirstAttribute())
		return false;

	bool sawProperty = false;
	bool onAttribute = true;

	while (onAttribute) {
		// Property attributes in the default namespace are ignored.
		if (xml.NamespaceURI != "") {
			string name = CurNode();

			// Syntax terms that are never property attributes.
			bool syntaxTerm =
				name == NS.RDF + "RDF"
				|| name == NS.RDF + "Description"
				|| name == NS.RDF + "ID"
				|| name == NS.RDF + "about"
				|| name == NS.RDF + "parseType"
				|| name == NS.RDF + "resource"
				|| name == NS.RDF + "nodeID"
				|| name == NS.RDF + "datatype";

			if (name == NS.RDF + "type") {
				// The value of rdf:type names an entity, not a literal.
				storage.Add(new Statement(entity, rdfType, (Entity)xml.Value, Meta));
				sawProperty = true;
			} else if (!syntaxTerm) {
				// Names that are invalid as attributes.
				if (name == NS.RDF + "li")
					OnError("rdf:li is not a valid attribute");
				if (name == NS.RDF + "aboutEach" || name == NS.RDF + "aboutEachPrefix")
					OnError("rdf:aboutEach has been removed from the RDF spec");

				// Attributes belonging to XML itself are ignored.
				bool xmlReserved =
					xml.Prefix == "xml"
					|| xml.Prefix == "xmlns"
					|| name == "http://www.w3.org/2000/xmlns/xmlns";

				if (!xmlReserved) {
					// An ordinary literal-valued property attribute.
					string lang = xml.XmlLang;
					if (lang == "")
						lang = null;
					storage.Add(new Statement(entity, name,
						new Literal(xml.Value, lang, null), Meta));
					sawProperty = true;
				}
			}
		}

		onAttribute = xml.MoveToNextAttribute();
	}

	xml.MoveToElement();

	return sawProperty;
}
// Parses the child property elements of the description element the
// reader is on, adding their statements with the given subject.
//
// Nothing is read for an empty element. Otherwise nodes are read one by
// one: each element is passed to ParseProperty, other nodes are skipped,
// and reading stops on the first end-element node, on which the reader is
// left. rdf:li numbering starts at 1 for each description.
private void ParsePropertyNodes(Entity subject) {
	if (xml.IsEmptyElement)
		return;

	int liIndex = 1;

	while (xml.Read()) {
		XmlNodeType kind = xml.NodeType;

		if (kind == XmlNodeType.EndElement)
			return;

		if (kind == XmlNodeType.Element)
			ParseProperty(subject, ref liIndex);
	}
}
// Parses the property element the reader is on and adds the resulting
// statement (subject, predicate, object) to the sink.
//
// subject  - the entity the property belongs to.
// liIndex  - running counter used to turn rdf:li into rdf:_1, rdf:_2, ...;
//            incremented each time an rdf:li property is seen.
//
// The object is determined, in order of precedence, by:
//   * rdf:nodeID / rdf:resource      - an existing blank or named entity;
//   * rdf:parseType="Literal"        - the inner XML as an XML literal;
//   * rdf:parseType="Resource"       - a new anonymous entity whose
//                                      properties are the child elements;
//   * rdf:parseType="Collection"     - an rdf:first/rdf:rest list of the
//                                      child descriptions (rdf:nil if empty);
//   * rdf:datatype                   - a typed literal from the text content;
//   * otherwise                      - an anonymous entity (property
//                                      attributes present), a nested
//                                      description, or a plain literal.
//
// When rdf:ID is present the statement is additionally reified.
//
// Except for parseType="Literal" (see the note in that branch), the reader
// is left on the property's end element, or on the property element itself
// when it is empty, so that the caller's next Read() moves past it.
private void ParseProperty(Entity subject, ref int liIndex) {
	// Get all of the attributes before we move the reader forward.
	string nodeID = xml.GetAttribute("nodeID", NS.RDF);
	string resource = xml.GetAttribute("resource", NS.RDF);
	string parseType = xml.GetAttribute("parseType", NS.RDF);
	string datatype = xml.GetAttribute("datatype", NS.RDF);
	string ID = xml.GetAttribute("ID", NS.RDF);

	string lang = xml.XmlLang != "" ? xml.XmlLang : null;

	string predicate = CurNode();
	if (predicate == NS.RDF + "li")
		predicate = NS.RDF + "_" + (liIndex++);

	Resource objct = null;

	if (nodeID != null || resource != null) {
		if (isset(nodeID) + isset(resource) > 1)
			OnError("A predicate node cannot specify more than one of rdf:nodeID and rdf:resource");

		if (parseType != null || datatype != null)
			OnError("The attributes rdf:parseType and rdf:datatype are not valid on a predicate with a rdf:nodeID or rdf:resource attribute");

		// The object is the entity named by nodeID or resource.
		if (nodeID != null)
			objct = GetBlankNode(nodeID);
		else if (resource != null)
			objct = GetNamedNode(Unrelativize(resource));

		// Property attributes here describe the object entity.
		ParsePropertyAttributes((Entity)objct);

		// No children are allowed in this element.
		if (!xml.IsEmptyElement)
			while (xml.Read()) {
				if (xml.NodeType == XmlNodeType.EndElement) break;
				if (xml.NodeType == XmlNodeType.Whitespace) continue;
				if (xml.NodeType == XmlNodeType.Comment) continue;
				if (xml.NodeType == XmlNodeType.ProcessingInstruction) continue;
				OnError("Content is not allowed within a property with a rdf:nodeID or rdf:resource attribute");
			}

	} else if (parseType == "Literal") {
		if (datatype == null)
			datatype = "http://www.w3.org/1999/02/22-rdf-syntax-ns#XMLLiteral";

		if (ParsePropertyAttributes(new Entity(null)))
			OnError("Property attributes are not valid when parseType is Literal");

		// NOTE(review): ReadInnerXml leaves the reader after this
		// property's end tag, unlike every other branch. The caller then
		// calls Read() again, so the node directly following the end tag
		// is never examined. That is harmless when it is whitespace, but
		// not when the next element or the description's end tag follows
		// immediately. TODO: confirm and fix together with
		// ParsePropertyNodes.
		objct = new Literal(xml.ReadInnerXml(), null, datatype);

	} else if (parseType == "Resource") {
		objct = new Entity(null);

		ParsePropertyAttributes((Entity)objct);
		if (!xml.IsEmptyElement)
			ParsePropertyNodes((Entity)objct);

	} else if (parseType == "Collection") {
		Entity collection = new Entity(null);
		Entity lastnode = collection;
		bool empty = true;

		ParsePropertyAttributes(collection);

		if (!xml.IsEmptyElement)
			while (xml.Read()) {
				if (xml.NodeType == XmlNodeType.EndElement) break;
				if (xml.NodeType != XmlNodeType.Element) continue;

				// Every item after the first needs a new list cell
				// linked from the previous one.
				if (!empty) {
					Entity next = new Entity(null);
					storage.Add(new Statement(lastnode, rdfRest, next, Meta));
					lastnode = next;
				}

				Entity item = ParseDescription();
				storage.Add(new Statement(lastnode, rdfFirst, item, Meta));

				empty = false;
			}

		if (empty) {
			// An empty collection is just rdf:nil; there is no list
			// cell to terminate, so no rdf:rest statement is emitted.
			objct = rdfNil;
		} else {
			storage.Add(new Statement(lastnode, rdfRest, rdfNil, Meta));
			objct = collection;
		}

	} else if (datatype != null) {
		// A typed literal. Note that any xml:lang is discarded.

		if (ParsePropertyAttributes(new Entity(null)))
			OnError("Property attributes are not valid when a data type is given");

		// Collect the text content node by node. This yields the
		// unescaped character data and stops on the end element, which
		// is where the caller expects the reader to be.
		StringBuilder lexical = new StringBuilder();

		if (!xml.IsEmptyElement)
			while (xml.Read()) {
				if (xml.NodeType == XmlNodeType.EndElement) break;
				if (xml.NodeType == XmlNodeType.Comment) continue;
				if (xml.NodeType == XmlNodeType.ProcessingInstruction) continue;
				if (xml.NodeType == XmlNodeType.Element)
					OnError("Elements are not allowed within a property with a rdf:datatype attribute");
				lexical.Append(xml.Value);
			}

		objct = new Literal(lexical.ToString(), null, datatype);

	} else {
		// We don't know whether the contents of this element
		// refer to a literal or an entity. If an element is
		// a child of this node, then it must be an entity.
		// If the property has property attributes, then it
		// is an anonymous entity. Otherwise the text content
		// is the literal value.

		objct = new Entity(null);
		if (ParsePropertyAttributes((Entity)objct)) {
			// Found property attributes, so no other content is accepted.

			if (!xml.IsEmptyElement)
				while (xml.Read()) {
					if (xml.NodeType == XmlNodeType.EndElement) break;
					if (xml.NodeType == XmlNodeType.Whitespace) continue;
					if (xml.NodeType == XmlNodeType.Comment) continue;
					if (xml.NodeType == XmlNodeType.ProcessingInstruction) continue;
					OnError(xml.NodeType + " is not allowed within a property with property attributes");
				}

		} else {
			StringBuilder textcontent = new StringBuilder();
			bool hadText = false;
			bool hadElement = false;

			if (!xml.IsEmptyElement)
				while (xml.Read()) {
					if (xml.NodeType == XmlNodeType.EndElement) break;
					if (xml.NodeType == XmlNodeType.Element) {
						if (hadText)
							OnError("Both text and elements are present as a property value");
						hadElement = true;

						objct = ParseDescription();
					} else if (xml.NodeType == XmlNodeType.Text || xml.NodeType == XmlNodeType.SignificantWhitespace) {
						if (hadElement)
							OnError("Both text and elements are present as a property value");
						textcontent.Append(xml.Value);
						hadText = true;
					} else if (xml.NodeType != XmlNodeType.Comment
							&& xml.NodeType != XmlNodeType.ProcessingInstruction) {
						// Other content (e.g. insignificant whitespace)
						// is kept without counting as text. Comments and
						// processing instructions are not part of the
						// literal value.
						textcontent.Append(xml.Value);
					}
				}

			if (!hadElement)
				objct = new Literal(textcontent.ToString(), lang, null);
		}
	}

	storage.Add(new Statement(subject, predicate, objct, Meta));

	if (ID != null) {
		// In addition to adding the statement as normal, also
		// add a reified statement.
		Entity statement = GetNamedNode(Unrelativize("#" + ID));
		storage.Add(new Statement(statement, rdfType, rdfStatement, Meta));
		storage.Add(new Statement(statement, rdfSubject, subject, Meta));
		storage.Add(new Statement(statement, rdfPredicate, (Entity)predicate, Meta));
		storage.Add(new Statement(statement, rdfObject, objct, Meta));
	}
}
// Reports a parse error by throwing a ParserException. When the reader
// can supply line information, the line and column are appended to the
// message first.
private void OnError(string message) {
	IXmlLineInfo position = xml as IXmlLineInfo;
	if (position != null && position.HasLineInfo())
		message = message + ", line " + position.LineNumber + " col " + position.LinePosition;

	throw new ParserException(message);
}