3 -- Extract any text that has been markup up with a given tag.
5 import Text
.HTML
.TagSoup
7 import System
.Environment
(getArgs)
8 import Data
.List
.HT
(breakAfter
)
9 import Data
.List
(foldl')
12 import qualified Data
.Map
as Map
15 data Entry
= Entry
Int (Tag
String)
18 instance Num Entry
where
19 (+) (Entry i a
) (Entry j _
) = Entry
(i
+j
) a
21 instance Ord Entry
where
22 (>=) (Entry i1 a1
) (Entry i2 a2
)
23 | i1
== i2
= (>=) a1 a2
24 |
otherwise = (>=) i1 i2
25 (<=) (Entry i1 a1
) (Entry i2 a2
)
26 | i1
== i2
= (<=) a1 a2
27 |
otherwise = (<=) i1 i2
29 data State
= State
Int (Map Entry
Int)
32 stateGetMap
:: State
-> (Map Entry
Int)
33 stateGetMap
(State lev m
) = m
35 checkArgs
:: [String] -> Bool
36 checkArgs x
= any (elem '-') x
38 helpstring
= "Bad Arguments!\n"
44 then hPutStr stderr helpstring
45 else interact (unlines . report
)
47 report
:: String -> [String]
48 report
= reportLines
. scan
. canonicalizeTags
. parseTags
50 reportLines
:: (Map Entry
Int) -> [String]
51 reportLines m
= map fmt
(Map
.toAscList m
)
52 where fmt
((Entry l t
), n
) =
53 show l
++ "\t" ++ show n
++ "\t" ++ renderTags
[t
]
55 scan
:: [Tag
String] -> (Map Entry
Int)
56 scan
= stateGetMap
. foldl' update
(State
0 Map
.empty)
58 update
(State lev m
) t
59 | isStartTag t
= State
(lev
+1) (insert t lev m
)
60 | isStopTag t
= State
(lev
-1) (insert t
(lev
-1) m
)
61 | isHtmlTag t
= State lev
(insert t lev m
)
62 |
otherwise = State lev m
63 insert t l
= Map
.insertWith
(+) (Entry l t
) 1
64 -- update :: Tag String -> State -> State
67 isStartTag
:: Tag
String -> Bool
68 isStartTag
(TagOpen str _
) = elem str leveltags
71 isStopTag
:: Tag
String -> Bool
72 isStopTag
(TagClose str
) = elem str leveltags
75 isHtmlTag
:: Tag
String -> Bool
76 isHtmlTag
(TagOpen _ _
) = True
77 isHtmlTag
(TagClose _
) = True
80 leveltags
= ["html", "head", "title", "body",
81 "h1", "h2", "h3", "h4", "h5", "h6", "a",
82 "b", "strong", "i", "tt", "em", "font", "u", "small", "strike",
83 "sub", "sup", "big", "span",
84 "code", "pre", "cite", "blockquote", "center", "div",
86 -- Not sure about, "p", ...