1 """Various things to do with [x]html that might be useful in more than
4 import lxml
, lxml
.html
, lxml
.etree
, lxml
.html
.clean
8 from cStringIO
import StringIO
10 from urlparse
import urlparse
, urlsplit
, urljoin
11 from urllib2
import urlopen
, HTTPError
14 import simplejson
as json
18 from zipfile
import ZipFile
, ZipInfo
, ZIP_DEFLATED
, ZIP_STORED
23 'xhtml': "application/xhtml+xml",
25 'json': "application/json",
31 'svg': 'image/svg+xml',
33 'ncx': 'application/x-dtbncx+xml',
34 'dtb': 'application/x-dtbook+xml',
35 'xml': 'application/xml',
37 'pdf': "application/pdf",
40 'epub': "application/epub+zip",
41 'booki': "application/booki+zip",
43 None: 'application/octet-stream',
47 "body", "head", "html", "title", "abbr", "acronym", "address",
48 "blockquote", "br", "cite", "code", "dfn", "div", "em", "h1", "h2",
49 "h3", "h4", "h5", "h6", "kbd", "p", "pre", "q", "samp", "span",
50 "strong", "var", "a", "dl", "dt", "dd", "ol", "ul", "li", "object",
51 "param", "b", "big", "hr", "i", "small", "sub", "sup", "tt", "del",
52 "ins", "bdo", "caption", "col", "colgroup", "table", "tbody", "td",
53 "tfoot", "th", "thead", "tr", "img", "area", "map", "meta", "style",
# Namespace constants for XHTML output (Clark notation prefix is what
# lxml uses for namespaced tag names).
XHTMLNS = '{http://www.w3.org/1999/xhtml}'
XHTML = 'http://www.w3.org/1999/xhtml'

# Doctype and XML declaration prepended to serialised XHTML output.
# NOTE(review): the closing quotes of XHTML11_DOCTYPE fall on a line
# elided from this view; reconstructed here -- confirm against full source.
XHTML11_DOCTYPE = '''<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN"
"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
'''
XML_DEC = '<?xml version="1.0" encoding="UTF-8"?>\n'

# Default locations for the on-disk image cache (see ImageCache).
IMG_CACHE = 'cache/images/'
IMG_PREFIX = 'static/'
def log(*messages, **kwargs):
    """Print each message to stderr.

    NOTE(review): the loop and exception-handling scaffolding is elided
    from this view and reconstructed minimally here; the second print is
    clearly a fallback that uses repr() for messages that cannot be
    printed directly (e.g. unencodable unicode).  Confirm against the
    full source.
    """
    for m in messages:
        try:
            print >> sys.stderr, m
        except Exception:
            print >> sys.stderr, repr(m)
    def __init__(self, name, email):
        """Record an author's name and email address.

        NOTE(review): the enclosing ``class Author`` header and the
        attribute assignments are elided from this view.  ``Author(author,
        email)`` is called by ImportedChapter, so ``self.name`` and
        ``self.email`` are the presumed attributes -- confirm against the
        full source.
        """
        self.name = name
        self.email = email
def url_to_filename(url, prefix=''):
    """Turn *url* into a flat, filesystem-safe filename.

    The path component below '/pub/' and the first label of the host
    (the language server: en, fr, translate...) are joined, every run
    of non-word characters is collapsed into a single '-', and the
    original file extension is preserved.  *prefix* is prepended
    verbatim to the result.
    """
    #XXX slightly inefficient to do urlsplit so many times, but versatile
    parts = urlsplit(url)
    stem, extension = parts.path.rsplit('.', 1)
    language = parts.netloc.split('.', 1)[0]   # en, fr, translate
    stem = stem.split('/pub/', 1)[1]           # drop /floss/pub/ or /pub/
    flat = re.sub(r'[^\w]+', '-', '%s-%s' % (stem, language))
    return '%s%s.%s' % (prefix, flat, extension)
class ImageCache(object):
    """Fetch remote files (images, css, js) once and store them under a
    local cache directory, remembering what has already been fetched.

    NOTE(review): several lines of this class are elided from this view;
    every gap is marked below and must be confirmed against the full file.
    """
    def __init__(self, cache_dir=IMG_CACHE, prefix=IMG_PREFIX):
        self.cache_dir = cache_dir
        # NOTE(review): elided here -- presumably ``self.prefix = prefix``
        # and ``self._fetched = {}``; both attributes are read below.
        if not os.path.exists(cache_dir + prefix):
            os.makedirs(cache_dir + prefix)

    def read_local_url(self, path):
        """Return the cached data stored under *path*."""
        f = open(self.cache_dir + path)
        # NOTE(review): read/close/return lines elided from this view.

    def _save_local_url(self, path, data):
        """Write *data* into the cache file for *path*."""
        f = open(self.cache_dir + path, 'w')
        # NOTE(review): write/close lines elided from this view.
        #os.chmod(path, 0444)

    def fetch_if_necessary(self, url, target=None, use_cache=True):
        """Return a local filename for *url*, downloading on first use.

        Failures are memoised as None so a missing remote file is only
        attempted once per run.
        """
        if url in self._fetched:
            return self._fetched[url]
        # NOTE(review): guard lines elided -- presumably computes a
        # default target only when none was passed in:
        target = url_to_filename(url, self.prefix)
        if use_cache and os.path.exists(self.cache_dir + target):
            log("used cache for %s" % target)
            # NOTE(review): early return of ``target`` elided here.
        # NOTE(review): the urlopen() fetch and the opening of the
        # HTTPError handler (binding ``e`` and ``data``) are elided.
            # if it is missing, assume it will be missing every time
            # after, otherwise, you can get into endless waiting
            self._fetched[url] = None
            log("Wanting '%s', got error %s" % (url, e))
            # NOTE(review): ``return None`` presumably elided here.
        self._save_local_url(target, data)
        self._fetched[url] = target
        log("got %s as %s" % (url, target))
        # NOTE(review): ``return target`` presumably elided here.
class BaseChapter(object):
    """Hold one chapter as an lxml tree and provide serialisation
    (html / twiki text / xhtml), image-link localisation through an
    ImageCache, and tag cleaning.

    NOTE(review): many lines of this class are elided from this view;
    every gap is marked and must be confirmed against the full file.
    """
    # Shared default cache (class attribute); subclasses assign a
    # per-instance ImageCache in their constructors.
    image_cache = ImageCache()

    def as_html(self):  # NOTE(review): def line reconstructed (elided)
        """Serialise the tree as html."""
        return lxml.etree.tostring(self.tree, method='html')

    def as_twikitext(self):
        """Get the twiki-style guts of the chapter from the tree"""
        text = lxml.etree.tostring(self.tree.find('body'), method='html')
        # Strip the <body ...> wrapper, keeping only its inner markup.
        text = re.sub(r'^.*?<body.*?>\s*', '', text)
        text = re.sub(r'\s*</body>.*$', '\n', text)
        return text  # NOTE(review): return line reconstructed (elided)

    def as_xhtml(self):  # NOTE(review): def line reconstructed (elided)
        """Convert to xhtml and serialise."""
        try:
            root = self.tree.getroot()
        except AttributeError:
            root = self.tree  # NOTE(review): fallback body reconstructed

        # Rebuild the document element-by-element inside the XHTML
        # default namespace.
        nsmap = {None: XHTML}
        xroot = lxml.etree.Element(XHTMLNS + "html", nsmap=nsmap)

        def xhtml_copy(el, xel):
            # NOTE(review): text/tail copying and the attribute-set /
            # child-append statements are elided from this view; only
            # the loop skeletons are visible.
            for k, v in el.items():
                pass  # (presumably xel.set(k, v) -- elided)
            for child in el.iterchildren():
                xchild = xel.makeelement(XHTMLNS + child.tag)
                # (presumably xel.append(xchild) -- elided)
                xhtml_copy(child, xchild)

        xhtml_copy(root, xroot)

        return XML_DEC + XHTML11_DOCTYPE + lxml.etree.tostring(xroot)

    def localise_links(self):
        """Find image links, convert them to local links, and fetch
        the images from the net so the local links work"""
        # NOTE(review): ``images = []`` presumably initialised on an
        # elided line; it is appended to below.
        def localise(oldlink):
            # Callback for lxml's rewrite_links: returns the replacement
            # link for each link found in the tree.
            fragments = urlsplit(oldlink)
            if '.' not in fragments.path:
                log('ignoring %s' % oldlink)
                # NOTE(review): early return elided from this view.
            base, ext = fragments.path.rsplit('.', 1)
            # Only fetch http(s) resources from this chapter's own
            # server with a known static-file extension.
            if (not fragments.scheme.startswith('http') or
                fragments.netloc != self.server or
                ext not in ('png', 'gif', 'jpg', 'jpeg', 'svg', 'css', 'js')):
                # NOTE(review): at least one further or-clause and the
                # closing of this condition are elided from this view.
                log('ignoring %s' % oldlink)
                # NOTE(review): early return elided from this view.
            newlink = self.image_cache.fetch_if_necessary(oldlink, use_cache=self.use_cache)
            if newlink is not None:
                images.append(newlink)
                # NOTE(review): ``return newlink`` presumably elided here.
            log("can't do anything for %s -- why?" % (oldlink,))
            # NOTE(review): final return elided from this view.

        self.tree.rewrite_links(localise, base_href=('http://%s/bin/view/%s/%s' %
                                                     (self.server, self.book, self.name)))
        # NOTE(review): ``return images`` presumably elided here.

    # lxml cleaner shared by all chapters; NOTE(review): several keyword
    # arguments fall on lines elided from this view.
    cleaner = lxml.html.clean.Cleaner(scripts=True,
                                      page_structure=False,
                                      processing_instructions=True,
                                      remove_unknown_tags=False,
                                      safe_attrs_only=True,
                                      )

    def remove_bad_tags(self):
        """Log every tag not in OK_TAGS, then run the cleaner over the
        whole tree."""
        for e in self.tree.iter():
            if not e.tag in OK_TAGS:
                log('found bad tag %s' % e.tag)
        self.cleaner(self.tree)
class ImportedChapter(BaseChapter):
    """Used for git import"""
    def __init__(self, lang, book, chapter_name, text, author, email, date, server=None,
                 use_cache=False, cache_dir=None):
        """Build a chapter from raw text fetched from a book server.

        NOTE(review): several assignment lines (presumably self.lang,
        self.book, self.date, self.server and a cache_dir default guard)
        are elided from this view -- confirm against the full source.
        """
        self.name = chapter_name
        self.author = Author(author, email)
        # NOTE(review): the guard around the next line is elided;
        # presumably ``if server is None:``.
        server = '%s.flossmanuals.net' % lang
        #XXX is texl html-wrapped?
        self.tree = lxml.html.document_fromstring(text)
        self.use_cache = use_cache
        # NOTE(review): a cache_dir guard line is elided here.
        self.image_cache = ImageCache(cache_dir)
class EpubChapter(BaseChapter):
    # Chapter built from already-fetched HTML, for epub output.
    def __init__(self, server, book, chapter_name, html, use_cache=False,
                 cache_dir=None):
        # NOTE(review): the tail of the signature and the assignments for
        # self.server / self.book fall on lines elided from this view;
        # ``cache_dir=None`` is reconstructed from its use below --
        # confirm against the full source.
        self.name = chapter_name
        self.use_cache = use_cache
        # NOTE(review): a cache_dir guard line is elided here.
        self.image_cache = ImageCache(cache_dir)
        self.tree = lxml.html.document_fromstring(html)
class BookiZip(object):
    # NOTE(review): only part of this class is visible in this view.

    def __init__(self, filename):
        """Start a new zip and put an uncompressed 'mimetype' file at the
        start. This idea is copied from the epub specification, and
        allows the file type to be discovered by reading the first few
        bytes of the file."""
        self.zipfile = ZipFile(filename, 'w', ZIP_DEFLATED, allowZip64=True)
        # The mimetype entry must be stored uncompressed so the magic
        # bytes are readable directly from the start of the archive.
        self.write_blob('mimetype', MEDIATYPES['booki'], ZIP_STORED)
        self.filename = filename
        # NOTE(review): ``self.manifest = {}`` is presumably initialised
        # on an elided line; it is written by add_to_package and read by
        # finish().
277 def write_blob(self
, filename
, blob
, compression
=ZIP_DEFLATED
, mode
=0644):
278 """Add something to the zip without adding to manifest"""
279 zinfo
= ZipInfo(filename
)
280 zinfo
.external_attr
= mode
<< 16L # set permissions
281 zinfo
.compress_type
= compression
282 self
.zipfile
.writestr(zinfo
, blob
)
284 def add_to_package(self
, ID
, fn
, blob
, mediatype
=None):
285 """Add an item to the zip, and save it in the manifest. If
286 mediatype is not provided, it will be guessed according to the
288 self
.write_blob(fn
, blob
)
289 if mediatype
is None:
290 ext
= fn
[fn
.rfind('.') + 1:]
291 mediatype
= MEDIATYPES
.get(ext
, MEDIATYPES
[None])
292 self
.manifest
[ID
] = (fn
, mediatype
)
298 """Finalise the metadata and write to disk"""
299 self
.info
['manifest'] = self
.manifest
300 infojson
= json
.dumps(self
.info
, indent
=2)
301 self
.add_to_package('info.json', 'info.json', infojson
, 'application/json')