1 from typing
import Union
, List
2 from lxml
.html
import document_fromstring
, HtmlElement
3 # from purifier.purifier import HTMLPurifier
9 # def _clear_html(body):
10 # purifier = HTMLPurifier({
11 # 'div': ['*'], 'span': ['*'],
12 # 'img': ['*'], 'a': ['*'],
13 # 'h1': ['*'], 'h2': ['*'],
14 # 'h3': ['*'], 'h4': ['*'],
15 # 'h5': ['*'], 'h6': ['*'],
17 # return purifier.feed(body)
20 def document_fromstring(body
, selector
: str = None, idx
: int = None) -> Union
[HtmlElement
, List
[HtmlElement
]]: # pragma: no cover
22 raise RuntimeError('Content has empty!')
24 result
= document_fromstring(body
) # todo
25 if isinstance(selector
, str):
26 result
= result
.cssselect(selector
)
27 if isinstance(idx
, int):
32 def _set_if_not_none(var
, key
, value
): # pragma: no cover
40 _
= _
and o
not in [42, 47, 92, 94]
def remove_not_ascii(value):
    """Return ``value`` with the characters rejected by the filter removed.

    Each item of ``value`` is kept when it equals ``'_'`` or when
    ``Static.__test_ascii`` returns a truthy result for it; every other
    item is dropped. The surviving items are joined in their original order.

    :param value: the text to filter; it is iterated item by item
        (presumably a ``str`` -- confirm against callers).
    :return: a ``str`` made of the items that passed the filter.
    """
    # NOTE(review): a double-underscore attribute is name-mangled when this
    # code sits inside a class body -- confirm ``Static`` exposes the
    # predicate under the name this lookup resolves to.
    kept = []
    for char in value:
        # Underscores are always kept and never reach the predicate.
        if char == '_':
            kept.append(char)
            continue
        if Static.__test_ascii(char):
            kept.append(char)
    return "".join(kept)