1 ! Copyright (C) 2008 Doug Coleman.
2 ! See http://factorcode.org/license.txt for BSD license.
3 USING: accessors arrays hashtables sequences.parser
4 html.parser.utils kernel namespaces sequences make math
5 unicode combinators.short-circuit quoting fry ;
8 TUPLE: tag name attributes text closing? ;
21 : closing-tag? ( string -- ? )
23 [ { [ first CHAR: / = ] [ last CHAR: / = ] } 1|| ] if-empty ;
25 : <tag> ( name attributes closing? -- tag )
31 : make-tag ( string attribs -- tag )
32 [ [ closing-tag? ] keep "/" trim1 ] dip rot <tag> ;
34 : new-tag ( text name -- tag )
39 : (read-quote) ( sequence-parser ch -- string )
40 '[ [ current _ = ] take-until ] [ advance drop ] bi ;
42 : read-single-quote ( sequence-parser -- string )
43 CHAR: ' (read-quote) ;
45 : read-double-quote ( sequence-parser -- string )
46 CHAR: \" (read-quote) ;
48 : read-quote ( sequence-parser -- string )
49 dup get+increment CHAR: ' =
50 [ read-single-quote ] [ read-double-quote ] if ;
52 : read-key ( sequence-parser -- string )
54 [ current { [ CHAR: = = ] [ blank? ] } 1|| ] take-until ;
56 : read-token ( sequence-parser -- string )
57 [ current blank? ] take-until ;
59 : read-value ( sequence-parser -- string )
61 dup current quote? [ read-quote ] [ read-token ] if
64 : read-comment ( sequence-parser -- )
65 [ "-->" take-until-sequence comment new-tag push-tag ]
66 [ '[ _ advance drop ] 3 swap times ] bi ;
68 : read-dtd ( sequence-parser -- )
69 [ ">" take-until-sequence dtd new-tag push-tag ]
72 : read-bang ( sequence-parser -- )
73 advance dup { [ current CHAR: - = ] [ peek-next CHAR: - = ] } 1&&
74 [ advance advance read-comment ] [ read-dtd ] if ;
76 : read-tag ( sequence-parser -- string )
78 [ current "><" member? ] take-until
79 [ CHAR: / = ] trim-tail
80 ] [ dup current CHAR: < = [ advance ] unless drop ] bi ;
82 : read-until-< ( sequence-parser -- string )
83 [ current CHAR: < = ] take-until ;
85 : parse-text ( sequence-parser -- )
86 read-until-< [ text new-tag push-tag ] unless-empty ;
88 : parse-key/value ( sequence-parser -- key value )
90 [ skip-whitespace "=" take-sequence ]
91 [ swap [ read-value ] [ drop dup ] if ] tri ;
93 : (parse-attributes) ( sequence-parser -- )
95 dup sequence-parse-end? [
98 [ parse-key/value swap ,, ] [ (parse-attributes) ] bi
101 : parse-attributes ( sequence-parser -- hashtable )
102 [ (parse-attributes) ] H{ } make ;
104 : (parse-tag) ( string -- string' hashtable )
106 [ read-token >lower ] [ parse-attributes ] bi
109 : read-< ( sequence-parser -- string/f )
110 advance dup current [
111 CHAR: ! = [ read-bang f ] [ read-tag ] if
116 : parse-tag ( sequence-parser -- )
117 read-< [ (parse-tag) make-tag push-tag ] unless-empty ;
119 : (parse-html) ( sequence-parser -- )
121 [ parse-text ] [ parse-tag ] [ (parse-html) ] tri
124 : tag-parse ( quot -- vector )
125 V{ } clone tagstack [ parse-sequence ] with-variable ; inline
129 : parse-html ( string -- vector )
130 [ (parse-html) tagstack get ] tag-parse ;