Fixed create_chapter TOC creation bug and added a progress bar when publishing.
[booki.git] / lib / booki / xhtml_utils.py
blob01e516b0e928a8396f4bea279d9648cca1d3a097
1 """Various things to do with [x]html that might be useful in more than
2 one place."""
4 import lxml, lxml.html, lxml.etree, lxml.html.clean
6 import os, sys
7 import re
8 from cStringIO import StringIO
10 from urlparse import urlparse, urlsplit, urljoin
11 from urllib2 import urlopen, HTTPError
13 try:
14 import simplejson as json
15 except ImportError:
16 import json
18 from zipfile import ZipFile, ZipInfo, ZIP_DEFLATED, ZIP_STORED
# Media (MIME) type for each known file extension.  The ``None`` key holds
# the fallback type used when an extension is not listed.
MEDIATYPES = {
    # markup, styling and data
    'html': 'text/html',
    'xhtml': 'application/xhtml+xml',
    'css': 'text/css',
    'json': 'application/json',

    # images
    'png': 'image/png',
    'gif': 'image/gif',
    'jpg': 'image/jpeg',
    'jpeg': 'image/jpeg',
    'svg': 'image/svg+xml',

    # xml flavours
    'ncx': 'application/x-dtbncx+xml',
    'dtb': 'application/x-dtbook+xml',
    'xml': 'application/xml',

    # documents
    'pdf': 'application/pdf',
    'txt': 'text/plain',

    # book containers
    'epub': 'application/epub+zip',
    'booki': 'application/booki+zip',

    # fallback
    None: 'application/octet-stream',
}
# Element names that are allowed to survive cleaning (passed to the lxml
# Cleaner as ``allow_tags`` and checked by ``remove_bad_tags``).
OK_TAGS = set("""
    body head html title abbr acronym address
    blockquote br cite code dfn div em h1 h2
    h3 h4 h5 h6 kbd p pre q samp span
    strong var a dl dt dd ol ul li object
    param b big hr i small sub sup tt del
    ins bdo caption col colgroup table tbody td
    tfoot th thead tr img area map meta style
    link base
""".split())
# The XHTML namespace URI, and the same URI in the '{uri}' form that is
# prepended to tag names when building namespaced elements.
XHTML = 'http://www.w3.org/1999/xhtml'
XHTMLNS = '{%s}' % XHTML

# Prologue pieces that as_xhtml() puts in front of the serialised tree.
XHTML11_DOCTYPE = (
    '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN"\n'
    '"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">\n'
)
XML_DEC = '<?xml version="1.0" encoding="UTF-8"?>\n'

# Default directory and filename prefix used by ImageCache.
IMG_CACHE = 'cache/images/'
IMG_PREFIX = 'static/'
def log(*messages, **kwargs):
    """Write each message to standard error, one per line.

    A message that cannot be written as-is (for instance because it
    cannot be encoded for the stream) is written as its ``repr()``
    instead.  Keyword arguments are accepted but ignored.
    """
    for message in messages:
        try:
            sys.stderr.write('%s\n' % (message,))
        except Exception:
            sys.stderr.write('%r\n' % (message,))
class Author(object):
    """Simple record holding an author's name and email address."""

    def __init__(self, name, email):
        self.name, self.email = name, email
def url_to_filename(url, prefix=''):
    """Turn a URL containing ``/pub/`` into a flat local file name.

    The result is ``prefix`` + slug + ``.`` + extension, where the slug is
    the part of the path after the first ``/pub/`` (without extension)
    followed by the first label of the host name, with every run of
    non-word characters collapsed to a single ``-``.

    Raises ValueError if the URL path contains no ``.`` and IndexError if
    it contains no ``/pub/``.
    """
    parts = urlsplit(url)
    stem, extension = parts.path.rsplit('.', 1)
    # Drop everything up to and including the first '/pub/'
    # (i.e. a leading /floss/pub/ or /pub/).
    stem = stem.split('/pub/', 1)[1]
    # First host-name label, e.g. en, fr, translate.
    subdomain = parts.netloc.split('.', 1)[0]
    slug = re.sub(r'[^\w]+', '-', '-'.join((stem, subdomain)))
    return '%s%s.%s' % (prefix, slug, extension)
class ImageCache(object):
    """Fetch remote files once and keep copies in a local directory.

    Files live under ``cache_dir``; names generated for them start with
    ``prefix``.  The outcome of each download attempt is remembered per
    URL for the lifetime of the instance.
    """

    def __init__(self, cache_dir=IMG_CACHE, prefix=IMG_PREFIX):
        """Remember the cache location, creating ``cache_dir + prefix``
        if it does not exist yet."""
        # url -> local name, or None when the download failed.
        self._fetched = {}
        self.cache_dir = cache_dir
        self.prefix = prefix
        if not os.path.exists(cache_dir + prefix):
            os.makedirs(cache_dir + prefix)

    def read_local_url(self, path):
        """Return the contents of the cached file ``path`` (relative to
        the cache directory)."""
        # Cached files are images and similar: read them as raw bytes so
        # no newline translation or decoding is applied.
        with open(self.cache_dir + path, 'rb') as f:
            return f.read()

    def _save_local_url(self, path, data):
        """Write ``data`` to ``path`` (relative to the cache directory)."""
        # Binary mode: text mode would corrupt image data on platforms
        # that translate line endings.
        with open(self.cache_dir + path, 'wb') as f:
            f.write(data)

    def fetch_if_necessary(self, url, target=None, use_cache=True):
        """Make sure ``url`` has a local copy and return its local name.

        ``target`` is the local name to use; when it is None the name is
        derived from the URL and this cache's prefix.  If ``use_cache``
        is true and the file already exists on disk, no download is done.

        Returns the local name, or None if the server answered with an
        HTTP error (that failure is remembered, so the URL is not tried
        again by this instance).
        """
        if url in self._fetched:
            return self._fetched[url]

        if target is None:
            target = url_to_filename(url, self.prefix)

        if use_cache and os.path.exists(self.cache_dir + target):
            log("used cache for %s" % target)
            return target

        try:
            f = urlopen(url)
            try:
                data = f.read()
            finally:
                # Release the connection even if reading fails part-way.
                f.close()
        except HTTPError as e:
            # if it is missing, assume it will be missing every time
            # after, otherwise, you can get into endless waiting
            self._fetched[url] = None
            log("Wanting '%s', got error %s" % (url, e))
            return None

        self._save_local_url(target, data)
        self._fetched[url] = target
        log("got %s as %s" % (url, target))
        return target
class BaseChapter(object):
    """Serialisation, link-localising and clean-up shared by chapters.

    The methods read ``self.tree`` (an lxml HTML tree), ``self.server``,
    ``self.book``, ``self.name`` and ``self.use_cache``.  None of these
    is set here, so subclasses have to provide them.
    """
    # Default cache shared by all chapters.  It is instantiated when the
    # class body runs, which creates the default cache directory if it
    # is missing.
    image_cache = ImageCache()

    def as_html(self):
        """Serialise the tree as html."""
        return lxml.etree.tostring(self.tree, method='html')

    def as_twikitext(self):
        """Get the twiki-style guts of the chapter from the tree"""
        text = lxml.etree.tostring(self.tree.find('body'), method='html')
        # Strip the opening <body ...> tag and everything before it, then
        # the closing tag and whatever follows it on that line.
        text = re.sub(r'^.*?<body.*?>\s*', '', text)
        text = re.sub(r'\s*</body>.*$', '\n', text)
        return text

    def as_xhtml(self):
        """Convert to xhtml and serialise.

        The tree is copied element by element into the XHTML namespace;
        the result is the XML declaration, the XHTML 1.1 doctype and the
        serialised copy.
        """
        import copy

        # self.tree may be an ElementTree or a bare root element.
        try:
            root = self.tree.getroot()
        except AttributeError:
            root = self.tree

        nsmap = {None: XHTML}
        xroot = lxml.etree.Element(XHTMLNS + "html", nsmap=nsmap)

        def xhtml_copy(el, xel):
            # Recursively mirror el's text, attributes, children and tail
            # onto xel.
            xel.text = el.text
            for k, v in el.items():
                xel.set(k, v)
            for child in el.iterchildren():
                if callable(child.tag):
                    # Comments and processing instructions have a
                    # function, not a string, as their tag, so they
                    # cannot be renamed into the namespace; copy them
                    # over unchanged (deepcopy keeps the tail text).
                    xel.append(copy.deepcopy(child))
                    continue
                xchild = xel.makeelement(XHTMLNS + child.tag)
                xel.append(xchild)
                xhtml_copy(child, xchild)
            xel.tail = el.tail

        xhtml_copy(root, xroot)

        return XML_DEC + XHTML11_DOCTYPE + lxml.etree.tostring(xroot)

    def localise_links(self):
        """Find image links, convert them to local links, and fetch
        the images from the net so the local links work

        Returns the list of local names of the files that were fetched
        (or found in the cache).  Links that are not http(s), not on
        ``self.server``, not under ``/pub/`` or not of a known extension
        are left untouched.
        """
        images = []

        def localise(oldlink):
            fragments = urlsplit(oldlink)
            if '.' not in fragments.path:
                log('ignoring %s' % oldlink)
                return oldlink
            base, ext = fragments.path.rsplit('.', 1)
            ext = ext.lower()
            if (not fragments.scheme.startswith('http') or
                fragments.netloc != self.server or
                ext not in ('png', 'gif', 'jpg', 'jpeg', 'svg', 'css', 'js') or
                '/pub/' not in base
                ):
                log('ignoring %s' % oldlink)
                return oldlink

            newlink = self.image_cache.fetch_if_necessary(oldlink, use_cache=self.use_cache)
            if newlink is not None:
                images.append(newlink)
                return newlink
            log("can't do anything for %s -- why?" % (oldlink,))
            return oldlink

        # Relative links are resolved against the chapter's view URL
        # before being handed to localise().
        self.tree.rewrite_links(localise, base_href=('http://%s/bin/view/%s/%s' %
                                                     (self.server, self.book, self.name)))
        return images

    # Cleaner configuration applied by remove_bad_tags().
    cleaner = lxml.html.clean.Cleaner(scripts=True,
                                      javascript=True,
                                      comments=False,
                                      style=True,
                                      links=True,
                                      meta=True,
                                      page_structure=False,
                                      processing_instructions=True,
                                      embedded=True,
                                      frames=True,
                                      forms=True,
                                      annoying_tags=True,
                                      allow_tags=OK_TAGS,
                                      remove_unknown_tags=False,
                                      safe_attrs_only=True,
                                      add_nofollow=False
                                      )

    def remove_bad_tags(self):
        """Log every tag that is not in OK_TAGS, then run the cleaner
        over the tree."""
        for e in self.tree.iter():
            if e.tag not in OK_TAGS:
                log('found bad tag %s' % e.tag)
        self.cleaner(self.tree)
class ImportedChapter(BaseChapter):
    """Used for git import"""

    def __init__(self, lang, book, chapter_name, text, author, email, date, server=None,
                 use_cache=False, cache_dir=None):
        """Record the chapter's metadata and parse ``text`` into a tree.

        When ``server`` is None it defaults to ``<lang>.flossmanuals.net``.
        A true ``cache_dir`` gives this chapter its own ImageCache in
        place of the one shared through the class.
        """
        self.lang = lang
        self.book = book
        self.name = chapter_name
        self.author = Author(author, email)
        self.date = date
        self.server = ('%s.flossmanuals.net' % lang) if server is None else server
        # XXX is text html-wrapped?
        self.tree = lxml.html.document_fromstring(text)
        self.use_cache = use_cache
        if cache_dir:
            self.image_cache = ImageCache(cache_dir)
class EpubChapter(BaseChapter):
    """Chapter whose tree is parsed from an HTML string."""

    def __init__(self, server, book, chapter_name, html, use_cache=False,
                 cache_dir=None):
        """Store the chapter's origin and parse ``html`` into a tree.

        A true ``cache_dir`` gives this chapter its own ImageCache in
        place of the one shared through the class.
        """
        self.server, self.book, self.name = server, book, chapter_name
        self.use_cache = use_cache
        if cache_dir:
            self.image_cache = ImageCache(cache_dir)
        self.tree = lxml.html.document_fromstring(html)
class BookiZip(object):
    """Builds a booki zip file: content files plus an ``info.json`` that
    carries the metadata and a manifest of the packaged items.

    Callers fill ``self.info`` with metadata, add content with
    ``add_to_package`` and call ``finish`` to write everything out.
    """

    def __init__(self, filename):
        """Start a new zip and put an uncompressed 'mimetype' file at the
        start.  This idea is copied from the epub specification, and
        allows the file type to be discovered by reading the first few
        bytes."""
        self.zipfile = ZipFile(filename, 'w', ZIP_DEFLATED, allowZip64=True)
        self.write_blob('mimetype', MEDIATYPES['booki'], ZIP_STORED)
        self.filename = filename
        # ID -> (filename, mediatype) for everything added to the package.
        self.manifest = {}
        # Metadata written out by finish().  Created here so that finish()
        # works even when the caller never assigns it.
        self.info = {}

    def write_blob(self, filename, blob, compression=ZIP_DEFLATED, mode=0o644):
        """Add something to the zip without adding to manifest"""
        zinfo = ZipInfo(filename)
        # The upper 16 bits of external_attr hold the unix permissions.
        zinfo.external_attr = mode << 16
        zinfo.compress_type = compression
        self.zipfile.writestr(zinfo, blob)

    def add_to_package(self, ID, fn, blob, mediatype=None):
        """Add an item to the zip, and save it in the manifest.  If
        mediatype is not provided, it will be guessed according to the
        extension."""
        self.write_blob(fn, blob)
        if mediatype is None:
            # Text after the last '.'; a name without a dot yields the
            # whole name, which falls through to the default type.
            ext = fn[fn.rfind('.') + 1:]
            mediatype = MEDIATYPES.get(ext, MEDIATYPES[None])
        self.manifest[ID] = (fn, mediatype)

    def _close(self):
        """Close the underlying zip file."""
        self.zipfile.close()

    def finish(self):
        """Finalise the metadata and write to disk"""
        self.info['manifest'] = self.manifest
        # Serialised before info.json itself is added, so the manifest
        # written to the file does not list info.json.
        infojson = json.dumps(self.info, indent=2)
        self.add_to_package('info.json', 'info.json', infojson, 'application/json')
        self._close()