strex: added `detectUrl()`
[iv.d.git] / saxy.d
blob3a211ce23cac74ebe3aa3ee0514614a7963866b3
1 /* Invisible Vector Library
2 * coded by Ketmar // Invisible Vector <ketmar@ketmar.no-ip.org>
3 * Understanding is not required. Only obedience.
5 * This software is provided 'as-is', without any express or implied
6 * warranty. In no event will the authors be held liable for any damages
7 * arising from the use of this software.
8 * Permission is granted to anyone to use this software for any purpose,
9 * including commercial applications, and to alter it and redistribute it
10 * freely, subject to the following restrictions:
11 * 1. The origin of this software must not be misrepresented; you must not
12 * claim that you wrote the original software. If you use this software
13 * in a product, an acknowledgment in the product documentation would be
14 * appreciated but is not required.
15 * 2. Altered source versions must be plainly marked as such, and must not be
16 * misrepresented as being the original software.
17 * 3. This notice may not be removed or altered from any source distribution.
19 * This program is distributed in the hope that it will be useful,
20 * but WITHOUT ANY WARRANTY; without even the implied warranty of
21 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
23 // SAX style xml parser
24 module iv.saxy /*is aliced*/;
26 import std.encoding;
27 import std.range;
29 import iv.alice;
30 import iv.strex;
31 import iv.vfs;
34 // ////////////////////////////////////////////////////////////////////////// //
35 //*WARNING*: attr keys are *NOT* strings!
// SAX-style XML parser: reads `fl` one char at a time and reports what it
// sees through the three delegates.
//
// Params:
//   fl       = a readable stream (read via `rawRead`) or an input range of `char`
//   tagStart = called for every opening tag with its name and attributes;
//              processing instructions and `<!name ...>` declarations are
//              reported too, with the leading '?' / '!' kept in the name
//   tagEnd   = called for every closing tag, and right after `tagStart` for
//              self-closed tags, processing instructions and `!` declarations
//   content  = called with a text run or a CDATA section; only invoked while
//              at least one tag is open
// Any delegate may be null, in which case the event is dropped.
//
// All slices handed to the delegates (names, attribute keys and values, text)
// point into an internal buffer that is reused for the next event, and the
// attribute AA is cleared when the tag has been processed: copy whatever you
// need to keep. Attribute keys are buffer slices cast to `string`; they are
// NOT immutable.
//
// Throws:
//   Exception("invalid xml") on malformed input or unbalanced tags,
//   Exception("out of memory in xml parser") if the buffer would exceed int.max.
void xmparse(ST) (auto ref ST fl,
  scope void delegate (char[] name, char[][string] attrs) tagStart,
  scope void delegate (char[] name) tagEnd,
  scope void delegate (char[] text) content,
) if (isReadableStream!ST || (isInputRange!ST && is(ElementEncodingType!ST == char))) {
  char[] buf; // accumulates the current text run or the current tag
  uint bufpos; // number of used bytes in `buf`
  char[][string] attrs; // attributes of the tag being parsed
  uint[] quoteEnds; // offsets in `buf` of the closing quotes of the current tag
  scope(exit) {
    attrs.destroy;
    buf.destroy;
    quoteEnds.destroy;
  }

  // chars accepted in tag and attribute names
  static bool isValidNameChar() (char ch) {
    pragma(inline, true);
    return
      (ch >= '0' && ch <= '9') ||
      (ch >= 'A' && ch <= 'Z') ||
      (ch >= 'a' && ch <= 'z') ||
      ch == '_' || ch == '-' || ch == ':';
  }

  int tagLevel = 0; // number of currently open tags

  // append chars to `buf`, growing it in 1KB steps
  void bufPut (const(char)[] chars...) {
    if (/*tagLevel &&*/ chars.length) {
      if (chars.length+bufpos > buf.length) {
        if (chars.length+bufpos >= int.max) throw new Exception("out of memory in xml parser");
        buf.assumeSafeAppend;
        buf.length = ((chars.length+bufpos)|0x3ff)+1;
      }
      buf[bufpos..bufpos+chars.length] = chars[];
      bufpos += chars.length;
    }
  }

  void clearBuf () {
    bufpos = 0;
  }

  char curCh; // current input char; 0 once `eof` is set
  bool eof;

  static if (isReadableStream!ST) {
    // streams are read in 32KB chunks
    char[] rdbuf;
    scope(exit) rdbuf.destroy;
    uint rdbufpos, rdbufused;
  }

  // advance to the next input char; sets `eof` when the input is exhausted
  void skipChar () {
    if (!eof) {
      static if (isReadableStream!ST) {
        // buffer more bytes
        if (rdbufpos >= rdbufused) {
          if (rdbuf.length == 0) rdbuf.length = 32*1024;
          auto rd = fl.rawRead(rdbuf[]);
          if (rd.length == 0) { eof = true; curCh = 0; return; }
          rdbufpos = 0;
          rdbufused = cast(uint)rd.length;
        }
        curCh = rdbuf.ptr[rdbufpos++];
      } else {
        if (fl.empty) { eof = true; curCh = 0; return; }
        curCh = fl.front;
        fl.popFront;
      }
      // NUL bytes in the input are turned into spaces
      if (curCh == 0) curCh = ' ';
    }
  }

  // curCh is '&'
  // Copies the entity text to `buf`; if it is terminated by ';' and is one of
  // the five predefined entities or a valid numeric reference, the text is
  // replaced by the decoded char. Anything else is left in `buf` verbatim.
  void parseEntity (bool inattr) {
    assert(curCh == '&');
    bufPut(curCh);
    auto xpos = bufpos; // entity name starts here (just after '&')
    skipChar();
    if (inattr) {
      // never run past the end of the tag or past a quote: an entity name
      // cannot contain quotes, and swallowing the closing quote of the value
      // (e.g. after a bare '&') would desync the attribute reader
      while (!eof && curCh != '/' && curCh != '>' && curCh != '?' && curCh != ';' &&
             curCh != '"' && curCh != '\'' && bufpos-xpos < 9)
      {
        bufPut(curCh);
        skipChar();
      }
    } else {
      while (!eof && curCh != '<' && curCh != ';' && bufpos-xpos < 9) {
        bufPut(curCh);
        skipChar();
      }
    }
    if (!eof && curCh == ';' && bufpos > xpos) {
      import std.utf : encode, UseReplacementDchar;
      char[4] ubuf = void; // utf buffer
      switch (buf[xpos..bufpos]) {
        case "lt": bufpos = xpos-1; bufPut('<'); break;
        case "gt": bufpos = xpos-1; bufPut('>'); break;
        case "amp": bufpos = xpos-1; bufPut('&'); break;
        case "quot": bufpos = xpos-1; bufPut('"'); break;
        case "apos": bufpos = xpos-1; bufPut('\''); break;
        default:
          bufPut(curCh); // first put ';'
          if (bufpos-xpos > 3 && buf.ptr[xpos] == '#' && buf.ptr[xpos+1] == 'x') {
            // should be hex code
            uint n = 0;
            auto pos = xpos+2;
            while (pos < bufpos-1) {
              char ch = buf.ptr[pos++];
              if (ch >= '0' && ch <= '9') n = n*16+ch-'0';
              else if (ch >= 'A' && ch <= 'F') n = n*16+ch-'A'+10;
              else if (ch >= 'a' && ch <= 'f') n = n*16+ch-'a'+10;
              else { n = uint.max; break; } // invalid digit
              if (n > dchar.max) break; // invalid char
            }
            if (n <= dchar.max) {
              bufpos = xpos-1;
              auto sz = encode!(UseReplacementDchar.yes)(ubuf, cast(dchar)n);
              foreach (immutable char ch; ubuf[0..sz]) bufPut(ch);
            }
          } else if (bufpos-xpos > 2 && buf.ptr[xpos] == '#') {
            // should be decimal code
            uint n = 0;
            auto pos = xpos+1;
            while (pos < bufpos-1) {
              char ch = buf.ptr[pos++];
              if (ch >= '0' && ch <= '9') n = n*10+ch-'0';
              else { n = uint.max; break; } // invalid digit
              if (n > dchar.max) break; // invalid char
            }
            if (n <= dchar.max) {
              bufpos = xpos-1;
              auto sz = encode!(UseReplacementDchar.yes)(ubuf, cast(dchar)n);
              foreach (immutable char ch; ubuf[0..sz]) bufPut(ch);
            }
          }
          break;
      }
      skipChar(); // skip ';'
    }
  }

  // called right after "<![CDATA["; collects everything up to "]]>"
  void parseCData () {
    clearBuf();
    for (;;) {
      // the terminator is checked before the eof test, so a section that ends
      // exactly at the end of input still gets its "]]>" stripped
      if (bufpos >= 3 && buf.ptr[bufpos-1] == '>' && buf.ptr[bufpos-2] == ']' && buf.ptr[bufpos-3] == ']') {
        bufpos -= 3;
        break;
      }
      if (eof) break;
      bufPut(curCh);
      skipChar();
    }
    if (tagLevel && bufpos > 0 && content !is null) content(buf[0..bufpos]);
    clearBuf();
  }

  // collects text up to the next '<' (or eof), decoding entities on the way
  void parseContent () {
    clearBuf();
    while (!eof) {
      if (curCh == '<') break;
      if (curCh != '&') {
        bufPut(curCh);
        skipChar();
      } else {
        parseEntity(false);
      }
    }
    if (tagLevel && bufpos > 0 && content !is null) content(buf[0..bufpos]);
    clearBuf();
  }

  // curCh is '<'; parses a tag, a comment, a CDATA section, a processing
  // instruction or a `<!name ...>` declaration
  void parseTag () {
    assert(!eof && curCh == '<');
    clearBuf();
    skipChar();
    if (eof) throw new Exception("invalid xml");
    bool inlineClose = false, closeTag = false;
    if (curCh == '!') {
      // either CDATA, or comment-like
      skipChar();
      if (curCh == '[') {
        // this *must* be CDATA
        foreach (immutable char expected; "CDATA[") {
          skipChar();
          if (curCh != expected) throw new Exception("invalid xml");
        }
        skipChar();
        clearBuf();
        parseCData();
        return;
      } else if (curCh == '-') {
        // comment
        skipChar();
        if (curCh != '-') throw new Exception("invalid xml");
        skipChar();
        // count the dashes immediately before the current char (capped at 2),
        // so that a terminator preceded by extra dashes ("--->") is still found
        uint dashes = 0;
        for (;;) {
          if (eof) throw new Exception("invalid xml");
          if (curCh == '-') {
            if (dashes < 2) ++dashes;
          } else if (curCh == '>' && dashes >= 2) {
            skipChar();
            break;
          } else {
            dashes = 0;
          }
          skipChar();
        }
        clearBuf();
        return;
      } else {
        // !tag
        bufPut('!');
      }
    } else {
      if (curCh == '/') { closeTag = true; skipChar(); }
      if (curCh == '?') { bufPut(curCh); skipChar(); }
    }
    if (eof || !isValidNameChar(curCh)) throw new Exception("invalid xml");
    while (isValidNameChar(curCh)) {
      bufPut(curCh);
      skipChar();
    }
    // the AA keys and values are slices of `buf`, so they must not outlive the tag
    scope(exit) attrs.clear();
    while (!eof && curCh <= ' ') skipChar();
    auto tagnameend = bufpos;
    if (!closeTag) {
      // attr=["]name["]
      // read the whole tag, so we can add AA items without anchoring stale memory
      if (eof) throw new Exception("invalid xml");
      if (curCh != '/' && curCh != '>' && curCh != '?') {
        bufPut(' ');
        auto stpos = bufpos;
        char qch = 0; // quote char we are inside of, or 0
        quoteEnds.length = 0;
        quoteEnds.assumeSafeAppend;
        for (;;) {
          if (eof) throw new Exception("invalid xml");
          if (qch) {
            if (curCh == qch) {
              // remember where the real closing quote lands in `buf`: decoded
              // entities may put quote chars into the value itself
              qch = 0;
              quoteEnds ~= bufpos;
            } else if (curCh == '&') {
              parseEntity(true);
              continue;
            }
          } else {
            if (curCh == '/' || curCh == '>' || curCh == '?') break;
            if (curCh == '"' || curCh == '\'') qch = curCh;
          }
          bufPut(curCh);
          skipChar();
        }
        // now split the collected text into attributes
        uint qidx = 0; // next unused entry in `quoteEnds`
        while (stpos < bufpos) {
          while (stpos < bufpos && buf.ptr[stpos] <= ' ') ++stpos;
          if (stpos >= bufpos) break;
          if (!isValidNameChar(buf.ptr[stpos])) throw new Exception("invalid xml");
          auto nst = stpos;
          while (stpos < bufpos && isValidNameChar(buf.ptr[stpos])) ++stpos;
          string aname = cast(string)(buf[nst..stpos]); // unsafe cast, but meh...
          while (stpos < bufpos && buf.ptr[stpos] <= ' ') ++stpos;
          if (stpos >= bufpos) { attrs[aname] = null; break; } // no value
          if (buf.ptr[stpos] != '=') { attrs[aname] = null; continue; } // no value
          ++stpos;
          if (stpos >= bufpos) { attrs[aname] = buf[bufpos..bufpos]; break; }
          if (buf.ptr[stpos] == '"' || buf.ptr[stpos] == '\'') {
            nst = ++stpos;
            // the value ends at the first recorded closing quote that is not
            // before its start; quote chars produced by entities are skipped
            while (qidx < quoteEnds.length && quoteEnds.ptr[qidx] < nst) ++qidx;
            if (qidx >= quoteEnds.length) throw new Exception("invalid xml");
            stpos = quoteEnds.ptr[qidx++];
            attrs[aname] = buf[nst..stpos];
            ++stpos;
          } else {
            nst = stpos;
            while (stpos < bufpos && buf.ptr[stpos] > ' ') ++stpos;
            attrs[aname] = buf[nst..stpos];
          }
        }
      }
    }
    if (curCh == '?') {
      // only "<?...?>" may end with '?'
      if (buf.ptr[0] != '?') throw new Exception("invalid xml");
      skipChar();
      inlineClose = true;
    } else if (buf.ptr[0] != '!') {
      if (curCh == '/') { inlineClose = true; skipChar(); }
    } else {
      // "<!name ...>" has no closing counterpart
      inlineClose = true;
    }
    if (curCh != '>') throw new Exception("invalid xml");
    skipChar();
    if (closeTag) {
      if (inlineClose) throw new Exception("invalid xml");
      if (tagEnd !is null) tagEnd(buf[0..tagnameend]);
      --tagLevel;
    } else {
      ++tagLevel;
      if (tagStart !is null) tagStart(buf[0..tagnameend], attrs);
      if (inlineClose) {
        if (tagEnd !is null) tagEnd(buf[0..tagnameend]);
        --tagLevel;
      }
    }
  }

  while (!eof) {
    parseContent();
    if (eof) break;
    if (curCh == '<') {
      parseTag();
      if (tagLevel < 0) throw new Exception("invalid xml");
    }
  }

  if (tagLevel != 0) throw new Exception("invalid xml");
}
361 // ////////////////////////////////////////////////////////////////////////// //
362 // you can use "quantifiers" in paths, like this:
363 // "/a/b/c*/d+/*"
364 // that means "any number of 'c' tags", "one or more 'd' tags", "any number of any tags"
365 // the last is useful to parse things like "bold" tag inside "p" tag, for example
// Path-based front end for `xmparse`: register callbacks for tag paths with
// `onOpen`/`onClose`/`onContent`, then call `load` or `loadStream`.
// A path is a '/'-separated list of tag names; a name may be followed by a
// '*' or '+' quantifier, and a bare "*" or "+" element stands for any tag.
// An empty (or null) path matches every tag.
final class SaxyEx {
private import std.range;
public:
  alias TagOpenCB = void delegate (char[] name, char[][string] attrs);
  alias TagOpenCBNA = void delegate (char[] name);
  alias TagCloseCB = void delegate (char[] name);
  alias TagContentCB = void delegate (char[] text);

private:
  // one element of a parsed callback path
  static struct PathElement {
    string name; // empty: any tag
    char quant = 0; // '+', '*', 0
  }

  // a registered callback together with its parsed path
  static struct TagCB {
    enum Type { Open, Close, Content }
    Type type;
    PathElement[] path;
    bool pathHasQuants; // use faster algo if there are no quantifiers
    bool openNoAttr; // open callback was registered as TagOpenCBNA (stored in `close`)
    union {
      TagOpenCB open;
      TagCloseCB close;
      TagContentCB content;
    }
  }

private:
  TagCB[] callbacksOpen;
  TagCB[] callbacksClose;
  TagCB[] callbacksContent;

public:
  this () {}

  // parse the file with the given name (opened as `VFile`)
  void load (const(char)[] filename) { loadFile(VFile(filename)); }

  // parse a readable stream or an input range of chars
  void loadStream(ST) (auto ref ST st) if (isReadableStream!ST || (isInputRange!ST && is(ElementEncodingType!ST == char))) { loadFile(st); }

  // call `cb` with the tag name and attributes when a tag matching `path` opens
  void onOpen(ST : const(char)[]) (ST path, TagOpenCB cb) {
    assert(cb !is null);
    auto tcb = newCallback!"open"(path);
    tcb.open = cb;
    tcb.openNoAttr = false;
  }

  // call `cb` with the tag name only when a tag matching `path` opens
  void onOpen(ST : const(char)[]) (ST path, TagOpenCBNA cb) {
    assert(cb !is null);
    auto tcb = newCallback!"open"(path);
    tcb.close = cb; // same delegate type as TagCloseCB, and the union shares storage
    tcb.openNoAttr = true;
  }

  // call `cb` when a tag matching `path` closes
  void onClose(ST : const(char)[]) (ST path, TagCloseCB cb) {
    assert(cb !is null);
    auto tcb = newCallback!"close"(path);
    tcb.close = cb;
  }

  // call `cb` with text found directly inside a tag matching `path`
  void onContent(ST : const(char)[]) (ST path, TagContentCB cb) {
    assert(cb !is null);
    auto tcb = newCallback!"content"(path);
    tcb.content = cb;
  }

private:
  // Parse `path` and append a new record to the callback list selected by
  // `type` ("open", "close" or "content").
  // Returns a pointer to the new record; it points into the list, so it must
  // be used before the next registration.
  // Throws Exception("invalid callback path") if a non-empty path has no elements.
  TagCB* newCallback(string type, ST : const(char)[]) (ST path) {
    static if (is(ST == typeof(null))) {
      // `type` cannot be deduced, so it has to be forwarded explicitly
      return newCallback!type("");
    } else {
      // parse path
      bool hasQuants = false;
      PathElement[] pth;
      if (path.length) {
        while (path.length != 0) {
          while (path.length != 0 && path.ptr[0] == '/') path = path[1..$];
          if (path.length == 0) break;
          // here path[0] is not '/', so the element is at least one char long
          usize e = 0;
          while (e < path.length && path.ptr[e] != '/') ++e;
          if (path.ptr[e-1] == '+' || path.ptr[e-1] == '*') {
            pth ~= PathElement(path[0..e-1].idup, path.ptr[e-1]);
            hasQuants = true;
          } else {
            pth ~= PathElement(path[0..e].idup, 0);
          }
          path = path[e..$];
        }
        if (pth.length == 0) throw new Exception("invalid callback path");
      } else {
        // empty path: any number of any tags
        hasQuants = true;
        pth ~= PathElement(null, '*');
      }
      TagCB* res;
      static if (type == "open") {
        callbacksOpen.length += 1;
        res = &callbacksOpen[$-1];
        res.type = TagCB.Type.Open;
      } else static if (type == "close") {
        callbacksClose.length += 1;
        res = &callbacksClose[$-1];
        res.type = TagCB.Type.Close;
      } else static if (type == "content") {
        callbacksContent.length += 1;
        res = &callbacksContent[$-1];
        res.type = TagCB.Type.Content;
      } else {
        static assert(0, "wtf?!");
      }
      res.path = pth;
      res.pathHasQuants = hasQuants;
      return res;
    }
  }

  // Does the stack of open tags (outermost first) match `path`?
  // `hasQuants` must be true if any element of `path` has a quantifier.
  // Quantified elements backtrack: every allowed repetition count is tried
  // against the rest of the path, so "/c*/c" matches a single "c", and an
  // anonymous "+" always consumes at least one tag.
  // yes, i can make it faster with some more preprocessing, but why should i bother?
  static bool pathHit (const(char)[][] tagStack, PathElement[] path, bool hasQuants) {
    if (!hasQuants) {
      // easy case: element-by-element comparison
      if (tagStack.length != path.length) return false;
      foreach_reverse (immutable idx, const ref PathElement pe; path) {
        if (tagStack.ptr[idx] != pe.name) return false;
      }
      return true;
    }

    static bool hasQ (PathElement[] path) {
      foreach (const ref PathElement pe; path) if (pe.quant) return true;
      return false;
    }

    while (path.length > 0) {
      auto pe = &path[0];
      path = path[1..$];
      if (pe.quant != '*' && pe.quant != '+') {
        // exactly one tag: a named one, or any tag if the name is empty
        if (tagStack.length == 0) return false;
        if (pe.name.length != 0 && pe.name != tagStack.ptr[0]) return false;
        tagStack = tagStack[1..$];
        continue;
      }
      // quantified element: find out how many leading tags it may consume
      usize maxrun = 0;
      if (pe.name.length == 0) {
        maxrun = tagStack.length;
      } else {
        while (maxrun < tagStack.length && tagStack.ptr[maxrun] == pe.name) ++maxrun;
      }
      immutable usize minrun = (pe.quant == '+' ? 1 : 0);
      if (maxrun < minrun) return false;
      // last element: it has to consume everything that is left
      if (path.length == 0) return (maxrun == tagStack.length);
      // try every allowed count against the rest of the path
      immutable bool restHasQuants = hasQ(path);
      foreach (immutable usize n; minrun..maxrun+1) {
        if (pathHit(tagStack[n..$], path, restHasQuants)) return true;
      }
      return false;
    }
    return (tagStack.length == 0);
  }

private:
  // Run `xmparse` over `fl` and dispatch its events to the registered callbacks.
  // The document must start with a single "?xml" declaration; if it names an
  // encoding other than "utf-8", attribute values and text are recoded to
  // utf-8 before they are passed to the callbacks.
  // Throws Exception on a missing or duplicate "?xml" declaration, on
  // unbalanced tags, and whatever `xmparse` throws.
  void loadFile(ST) (auto ref ST fl) if (isReadableStream!ST || (isInputRange!ST && is(ElementEncodingType!ST == char))) {
    bool seenXML;
    bool tagStackLastWasAppend = true;
    const(char)[][] tagStack; // all data is in tagStackBuf
    char[] tagStackBuf;
    scope(exit) tagStackBuf.destroy;
    uint tagStackBufPos;
    EncodingScheme efrom, eto;
    scope(exit) { efrom.destroy; eto.destroy; }
    char[] recbuf; // recode buffer
    usize rcpos; // for recode buffer
    scope(exit) recbuf.destroy;

    // copy the tag name into tagStackBuf and push it onto the stack
    void pushTag (const(char)[] s) {
      if (s.length) {
        if (tagStackBufPos+s.length >= tagStackBuf.length) {
          if (tagStackBufPos >= int.max/2) throw new Exception("too many tags");
          tagStackBuf.length = ((tagStackBufPos+s.length)|0x3ff)+1;
        }
        tagStackBuf[tagStackBufPos..tagStackBufPos+s.length] = s[];
        // the stack was shrunk by popTag: allow appending in place again
        if (!tagStackLastWasAppend) { tagStack.assumeSafeAppend; tagStackLastWasAppend = true; }
        tagStack ~= tagStackBuf[tagStackBufPos..tagStackBufPos+s.length];
        tagStackBufPos += s.length;
      } else {
        if (!tagStackLastWasAppend) { tagStack.assumeSafeAppend; tagStackLastWasAppend = true; }
        tagStack ~= "";
      }
    }

    // drop the innermost tag and release its name bytes in tagStackBuf
    void popTag () {
      immutable usize namelen = tagStack[$-1].length; // read it before shrinking
      tagStack.length -= 1;
      tagStackBufPos -= namelen;
      tagStackLastWasAppend = false;
    }

    // Recode `text` from the document encoding to utf-8.
    // Returns `text` itself if no recoding is needed, otherwise a slice of
    // `recbuf`. With `doreset` false the result is appended after the data
    // already in `recbuf` instead of overwriting it.
    char[] nrecode(bool doreset=true) (char[] text) {
      if (efrom is null) return text; // nothing to do
      static if (doreset) rcpos = 0;
      bool needRecode = false;
      foreach (char ch; text) if (ch >= 0x80) { needRecode = true; break; }
      if (!needRecode) return text;
      auto stpos = rcpos;
      ubyte[16] buf;
      auto ub = cast(const(ubyte)[])text;
      while (ub.length > 0) {
        dchar dc = efrom.safeDecode(ub);
        if (dc == INVALID_SEQUENCE) dc = '?';
        auto len = eto.encode(dc, buf);
        if (rcpos+len > recbuf.length) {
          recbuf.assumeSafeAppend; // the user is expected to copy data
          recbuf.length = ((rcpos+len)|0x3ff)+1;
        }
        recbuf[rcpos..rcpos+len] = cast(char[])buf[0..len];
        rcpos += len;
      }
      return recbuf[stpos..rcpos];
    }

    xmparse(fl,
      // tag opened
      (char[] name, char[][string] attrs) {
        if (name == "?xml") {
          if (seenXML) throw new Exception("duplicate '?xml?' tag");
          seenXML = true;
          if (auto ec = "encoding" in attrs) {
            // lowercase the encoding name in place
            foreach (ref char ch; *ec) {
              import std.ascii : toLower;
              ch = ch.toLower;
            }
            if ((*ec).length && *ec != "utf-8") {
              efrom = EncodingScheme.create(cast(string)(*ec)); // let's hope that it is safe...
              eto = EncodingScheme.create("utf-8");
            }
          }
          return;
        }
        if (!seenXML) throw new Exception("no '?xml?' tag");
        pushTag(name);
        // attributes are recoded lazily, at most once per tag
        bool attrsRecoded = (efrom is null);
        foreach (ref TagCB tcb; callbacksOpen) {
          if (tcb.type == TagCB.Type.Open && pathHit(tagStack, tcb.path, tcb.pathHasQuants)) {
            if (tcb.openNoAttr) {
              tcb.close(name);
            } else {
              // recode attrs and call the callback
              if (!attrsRecoded) {
                rcpos = 0; // reset recode
                foreach (ref v; attrs.byValue) v = nrecode!false(v);
                attrsRecoded = true;
              }
              tcb.open(name, attrs);
            }
          }
        }
      },
      // tag closed
      (char[] name) {
        if (name == "?xml") return;
        if (tagStack.length == 0 || tagStack[$-1] != name) throw new Exception("unbalanced xml tags");
        foreach (ref TagCB tcb; callbacksClose) {
          if (tcb.type == TagCB.Type.Close && pathHit(tagStack, tcb.path, tcb.pathHasQuants)) {
            // call the callback
            tcb.close(name);
          }
        }
        popTag();
      },
      // text or CDATA
      (char[] text) {
        // text is recoded lazily, at most once per run
        bool textRecoded = (efrom is null);
        foreach (ref TagCB tcb; callbacksContent) {
          if (tcb.type == TagCB.Type.Content && pathHit(tagStack, tcb.path, tcb.pathHasQuants)) {
            // recode text and call the callback
            if (!textRecoded) {
              text = nrecode(text);
              textRecoded = true;
            }
            tcb.content(text);
          }
        }
      },
    );
  }
}