lib/booki/xhtml_utils.py

   1 """Various things to do with [x]html that might be useful in more than
   2 one place."""
   3
   4 import lxml, lxml.html, lxml.etree, lxml.html.clean
   5
   6 import os, sys
   7 import re
   8 from cStringIO import StringIO
   9
  10 from urlparse import urlparse, urlsplit, urljoin
  11 from urllib2 import urlopen, HTTPError
  12
  13 try:
  14     import simplejson as json
  15 except ImportError:
  16     import json
  17
  18 from zipfile import ZipFile, ZipInfo, ZIP_DEFLATED, ZIP_STORED
  19
  20
  21 MEDIATYPES = {
  22     'html': "text/html",
  23     'xhtml': "application/xhtml+xml",
  24     'css': 'text/css',
  25     'json': "application/json",
  26
  27     'png': 'image/png',
  28     'gif': 'image/gif',
  29     'jpg': 'image/jpeg',
  30     'jpeg': 'image/jpeg',
  31     'svg': 'image/svg+xml',
  32
  33     'ncx': 'application/x-dtbncx+xml',
  34     'dtb': 'application/x-dtbook+xml',
  35     'xml': 'application/xml',
  36
  37     'pdf': "application/pdf",
  38     'txt': 'text/plain',
  39
  40     'epub': "application/epub+zip",
  41     'booki': "application/booki+zip",
  42
  43     None: 'application/octet-stream',
  44 }
  45
  46 OK_TAGS = set([
  47     "body", "head", "html", "title", "abbr", "acronym", "address",
  48     "blockquote", "br", "cite", "code", "dfn", "div", "em", "h1", "h2",
  49     "h3", "h4", "h5", "h6", "kbd", "p", "pre", "q", "samp", "span",
  50     "strong", "var", "a", "dl", "dt", "dd", "ol", "ul", "li", "object",
  51     "param", "b", "big", "hr", "i", "small", "sub", "sup", "tt", "del",
  52     "ins", "bdo", "caption", "col", "colgroup", "table", "tbody", "td",
  53     "tfoot", "th", "thead", "tr", "img", "area", "map", "meta", "style",
  54     "link", "base"
  55     ])
  56
  57 XHTMLNS = '{http://www.w3.org/1999/xhtml}'
  58 XHTML = 'http://www.w3.org/1999/xhtml'
  59
  60 XHTML11_DOCTYPE = '''<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN"
  61     "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
  62 '''
  63 XML_DEC = '<?xml version="1.0" encoding="UTF-8"?>\n'
  64
  65 IMG_CACHE = 'cache/images/'
  66 IMG_PREFIX = 'static/'
  67
  68 def log(*messages, **kwargs):
  69     for m in messages:
  70         try:
  71             print >> sys.stderr, m
  72         except Exception:
  73             print >> sys.stderr, repr(m)
  74
  75
  76 class Author(object):
  77     def __init__(self, name, email):
  78         self.name = name
  79         self.email = email
  80
  81 def url_to_filename(url, prefix=''):
  82     #XXX slightly inefficient to do urlsplit so many times, but versatile
  83     fragments = urlsplit(url)
  84     base, ext = fragments.path.rsplit('.', 1)
  85     server = fragments.netloc.split('.', 1)[0] #en, fr, translate
  86     base = base.split('/pub/', 1)[1] #remove /floss/pub/ or /pub/
  87     base = re.sub(r'[^\w]+', '-',  '%s-%s' %(base, server))
  88     return '%s%s.%s' % (prefix, base, ext)
  89
  90
  91 class ImageCache(object):
  92     def __init__(self, cache_dir=IMG_CACHE, prefix=IMG_PREFIX):
  93         self._fetched = {}
  94         self.cache_dir = cache_dir
  95         self.prefix = prefix
  96         if not os.path.exists(cache_dir + prefix):
  97             os.makedirs(cache_dir + prefix)
  98
  99     def read_local_url(self, path):
 100         f = open(self.cache_dir + path)
 101         s = f.read()
 102         f.close()
 103         return s
 104
 105     def _save_local_url(self, path, data):
 106         f = open(self.cache_dir + path, 'w')
 107         f.write(data)
 108         f.close()
 109         #os.chmod(path, 0444)
 110
 111     def fetch_if_necessary(self, url, target=None, use_cache=True):
 112         if url in self._fetched:
 113             return self._fetched[url]
 114
 115         if target is None:
 116             target = url_to_filename(url, self.prefix)
 117
 118         if use_cache and os.path.exists(self.cache_dir + target):
 119             log("used cache for %s" % target)
 120             return target
 121
 122         try:
 123             f = urlopen(url)
 124             data = f.read()
 125             f.close()
 126         except HTTPError, e:
 127             # if it is missing, assume it will be missing every time
 128             # after, otherwise, you can get into endless waiting
 129             self._fetched[url] = None
 130             log("Wanting '%s', got error %s" %(url, e))
 131             return None
 132
 133         self._save_local_url(target, data)
 134         self._fetched[url] = target
 135         log("got %s as %s" % (url, target))
 136         return target
 137
 138
 139 class BaseChapter(object):
 140     image_cache = ImageCache()
 141
 142     def as_html(self):
 143         """Serialise the tree as html."""
 144         return lxml.etree.tostring(self.tree, method='html')
 145
 146     def as_twikitext(self):
 147         """Get the twiki-style guts of the chapter from the tree"""
 148         text = lxml.etree.tostring(self.tree.find('body'), method='html')
 149         text = re.sub(r'^.*?<body.*?>\s*', '', text)
 150         text = re.sub(r'\s*</body>.*$', '\n', text)
 151         return text
 152
 153     def as_xhtml(self):
 154         """Convert to xhtml and serialise."""
 155         try:
 156             root = self.tree.getroot()
 157         except AttributeError:
 158             root = self.tree
 159
 160         nsmap = {None: XHTML}
 161         xroot = lxml.etree.Element(XHTMLNS + "html", nsmap=nsmap)
 162
 163         def xhtml_copy(el, xel):
 164             xel.text = el.text
 165             for k, v in el.items():
 166                 xel.set(k, v)
 167             for child in el.iterchildren():
 168                 xchild = xel.makeelement(XHTMLNS + child.tag)
 169                 xel.append(xchild)
 170                 xhtml_copy(child, xchild)
 171             xel.tail = el.tail
 172
 173         xhtml_copy(root, xroot)
 174
 175         return XML_DEC + XHTML11_DOCTYPE + lxml.etree.tostring(xroot)
 176
 177     def localise_links(self):
 178         """Find image links, convert them to local links, and fetch
 179         the images from the net so the local links work"""
 180         images = []
 181         def localise(oldlink):
 182             fragments = urlsplit(oldlink)
 183             if '.' not in fragments.path:
 184                 log('ignoring %s' % oldlink)
 185                 return oldlink
 186             base, ext = fragments.path.rsplit('.', 1)
 187             ext = ext.lower()
 188             if (not fragments.scheme.startswith('http') or
 189                 fragments.netloc != self.server or
 190                 ext not in ('png', 'gif', 'jpg', 'jpeg', 'svg', 'css', 'js') or
 191                 '/pub/' not in base
 192                 ):
 193                 log('ignoring %s' % oldlink)
 194                 return oldlink
 195
 196             newlink = self.image_cache.fetch_if_necessary(oldlink, use_cache=self.use_cache)
 197             if newlink is not None:
 198                 images.append(newlink)
 199                 return newlink
 200             log("can't do anything for %s -- why?" % (oldlink,))
 201             return oldlink
 202
 203         self.tree.rewrite_links(localise, base_href=('http://%s/bin/view/%s/%s' %
 204                                                      (self.server, self.book, self.name)))
 205         return images
 206
 207     cleaner = lxml.html.clean.Cleaner(scripts=True,
 208                                       javascript=True,
 209                                       comments=False,
 210                                       style=True,
 211                                       links=True,
 212                                       meta=True,
 213                                       page_structure=False,
 214                                       processing_instructions=True,
 215                                       embedded=True,
 216                                       frames=True,
 217                                       forms=True,
 218                                       annoying_tags=True,
 219                                       allow_tags=OK_TAGS,
 220                                       remove_unknown_tags=False,
 221                                       safe_attrs_only=True,
 222                                       add_nofollow=False
 223                                       )
 224
 225     def remove_bad_tags(self):
 226         for e in self.tree.iter():
 227             if not e.tag in OK_TAGS:
 228                 log('found bad tag %s' % e.tag)
 229         self.cleaner(self.tree)
 230
 231
 232 class ImportedChapter(BaseChapter):
 233     """Used for git import"""
 234     def __init__(self, lang, book, chapter_name, text, author, email, date, server=None,
 235                  use_cache=False, cache_dir=None):
 236         self.lang = lang
 237         self.book = book
 238         self.name = chapter_name
 239         self.author = Author(author, email)
 240         self.date = date
 241         if server is None:
 242             server = '%s.flossmanuals.net' % lang
 243         self.server = server
 244         #XXX is texl html-wrapped?
 245         self.tree = lxml.html.document_fromstring(text)
 246         self.use_cache = use_cache
 247         if cache_dir:
 248             self.image_cache = ImageCache(cache_dir)
 249
 250
 251 class EpubChapter(BaseChapter):
 252     def __init__(self, server, book, chapter_name, html, use_cache=False,
 253                  cache_dir=None):
 254         self.server = server
 255         self.book = book
 256         self.name = chapter_name
 257         self.use_cache = use_cache
 258         if cache_dir:
 259             self.image_cache = ImageCache(cache_dir)
 260         self.tree = lxml.html.document_fromstring(html)
 261
 262
 263
 264
 265 class BookiZip(object):
 266
 267     def __init__(self, filename):
 268         """Start a new zip and put an uncompressed 'mimetype' file at the
 269         start.  This idea is copied from the epub specification, and
 270         allows the file type to be dscovered by reading the first few
 271         bytes."""
 272         self.zipfile = ZipFile(filename, 'w', ZIP_DEFLATED, allowZip64=True)
 273         self.write_blob('mimetype', MEDIATYPES['booki'], ZIP_STORED)
 274         self.filename = filename
 275         self.manifest = {}
 276
 277     def write_blob(self, filename, blob, compression=ZIP_DEFLATED, mode=0644):
 278         """Add something to the zip without adding to manifest"""
 279         zinfo = ZipInfo(filename)
 280         zinfo.external_attr = mode << 16L # set permissions
 281         zinfo.compress_type = compression
 282         self.zipfile.writestr(zinfo, blob)
 283
 284     def add_to_package(self, ID, fn, blob, mediatype=None):
 285         """Add an item to the zip, and save it in the manifest.  If
 286         mediatype is not provided, it will be guessed according to the
 287         extrension."""
 288         self.write_blob(fn, blob)
 289         if mediatype is None:
 290             ext = fn[fn.rfind('.') + 1:]
 291             mediatype = MEDIATYPES.get(ext, MEDIATYPES[None])
 292         self.manifest[ID] = (fn, mediatype)
 293
 294     def _close(self):
 295         self.zipfile.close()
 296
 297     def finish(self):
 298         """Finalise the metadata and write to disk"""
 299         self.info['manifest'] = self.manifest
 300         infojson = json.dumps(self.info, indent=2)
 301         self.add_to_package('info.json', 'info.json', infojson, 'application/json')
 302         self._close()
 303