storage/html.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3 #
   4 # Copyright 2004-2006 Zuza Software Foundation
   5 #
   6 # This file is part of translate.
   7 #
   8 # translate is free software; you can redistribute it and/or modify
   9 # it under the terms of the GNU General Public License as published by
  10 # the Free Software Foundation; either version 2 of the License, or
  11 # (at your option) any later version.
  12 #
  13 # translate is distributed in the hope that it will be useful,
  14 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  15 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  16 # GNU General Public License for more details.
  17 #
  18 # You should have received a copy of the GNU General Public License
  19 # along with translate; if not, write to the Free Software
  20 # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  21 #
  22
  23 """module for parsing html files for translation"""
  24
  25 import re
  26 from translate.storage import base
  27 from HTMLParser import HTMLParser
  28
  29 class htmlunit(base.TranslationUnit):
  30     """A unit of translatable/localisable HTML content"""
  31     def __init__(self, source=None):
  32         self.locations = []
  33         self.setsource(source)
  34
  35     def getsource(self):
  36         #TODO: Rethink how clever we should try to be with html entities.
  37         return self.text.replace("&amp;", "&").replace("&lt;", "<").replace("\r\n", " ").replace("\n", " ").replace("\r", " ")
  38
  39     def setsource(self, source):
  40         self.text = source.replace("&", "&amp;").replace("<", "&lt;")
  41     source = property(getsource, setsource)
  42
  43     def addlocation(self, location):
  44         self.locations.append(location)
  45
  46     def getlocations(self):
  47         return self.locations
  48
  49
  50 class htmlfile(HTMLParser, base.TranslationStore):
  51     UnitClass = htmlunit
  52     markingtags = ["p", "title", "h1", "h2", "h3", "h4", "h5", "h6", "th", "td", "div", "li", "dt", "dd", "address", "caption"]
  53     markingattrs = []
  54     includeattrs = ["alt", "summary", "standby", "abbr", "content"]
  55
  56     def __init__(self, includeuntaggeddata=None, inputfile=None):
  57         self.units = []
  58         self.filename = getattr(inputfile, 'name', None)
  59         self.currentblock = ""
  60         self.currentblocknum = 0
  61         self.currenttag = None
  62         self.includeuntaggeddata = includeuntaggeddata
  63         HTMLParser.__init__(self)
  64
  65         if inputfile is not None:
  66             htmlsrc = inputfile.read()
  67             inputfile.close()
  68             self.parse(htmlsrc)
  69
  70     def guess_encoding(self, htmlsrc):
  71         """Returns the encoding of the html text.
  72
  73         We look for 'charset=' within a meta tag to do this.
  74         """
  75
  76         pattern = '''(?i)<meta.*content.*=.*charset.*=\\s*([^\\s]*)\\s*["']'''
  77         result = re.findall(pattern, htmlsrc)
  78         encoding = None
  79         if result:
  80             encoding = result[0]
  81         return encoding
  82
  83     def do_encoding(self, htmlsrc):
  84         """Return the html text properly encoded based on a charset."""
  85         charset = self.guess_encoding(htmlsrc)
  86         if charset:
  87             return htmlsrc.decode(charset)
  88         else:
  89             return htmlsrc
  90
  91     def parse(self, htmlsrc):
  92         htmlsrc = self.do_encoding(htmlsrc)
  93         self.feed(htmlsrc)
  94
  95     def addhtmlblock(self, text):
  96         text = self.strip_html(text)
  97         if self.has_translatable_content(text):
  98             self.currentblocknum += 1
  99             unit = self.addsourceunit(text)
 100             unit.addlocation("%s:%d" % (self.filename, self.currentblocknum))
 101
 102     def strip_html(self, text):
 103         """Strip unnecessary html from the text.
 104
 105         HTML tags are deemed unnecessary if it fully encloses the translatable
 106         text, eg. '<a href="index.html">Home Page</a>'.
 107
 108         HTML tags that occurs within the normal flow of text will not be removed,
 109         eg. 'This is a link to the <a href="index.html">Home Page</a>.'
 110         """
 111         text = text.strip()
 112
 113         pattern = '(?s)^<[^>]*>(.*)</.*>$'
 114         result = re.findall(pattern, text)
 115         if len(result) == 1:
 116             text = self.strip_html(result[0])
 117         return text
 118
 119     def has_translatable_content(self, text):
 120         """Check if the supplied HTML snippet has any content that needs to be translated."""
 121
 122         text = text.strip()
 123         result = re.findall('(?i).*(charset.*=.*)', text)
 124         if len(result) == 1:
 125             return False
 126
 127         # TODO: Get a better way to find untranslatable entities.
 128         if text == '&nbsp;':
 129             return False
 130
 131         pattern = '<[^>]*>'
 132         result = re.sub(pattern, '', text).strip()
 133         if result:
 134             return True
 135         else:
 136             return False
 137
 138 #From here on below, follows the methods of the HTMLParser
 139
 140     def startblock(self, tag):
 141         self.addhtmlblock(self.currentblock)
 142         self.currentblock = ""
 143         self.currenttag = tag
 144
 145     def endblock(self):
 146         self.addhtmlblock(self.currentblock)
 147         self.currentblock = ""
 148         self.currenttag = None
 149
 150     def handle_starttag(self, tag, attrs):
 151         newblock = 0
 152         if tag in self.markingtags:
 153             newblock = 1
 154         for attrname, attrvalue in attrs:
 155             if attrname in self.markingattrs:
 156                 newblock = 1
 157             if attrname in self.includeattrs:
 158                 self.addhtmlblock(attrvalue)
 159
 160         if newblock:
 161             self.startblock(tag)
 162         elif self.currenttag is not None:
 163             self.currentblock += self.get_starttag_text()
 164
 165     def handle_startendtag(self, tag, attrs):
 166         for attrname, attrvalue in attrs:
 167             if attrname in self.includeattrs:
 168                 self.addhtmlblock(attrvalue)
 169         if self.currenttag is not None:
 170             self.currentblock += self.get_starttag_text()
 171
 172     def handle_endtag(self, tag):
 173         if tag == self.currenttag:
 174             self.endblock()
 175         elif self.currenttag is not None:
 176             self.currentblock += '</%s>' % tag
 177
 178     def handle_data(self, data):
 179         if self.currenttag is not None:
 180             self.currentblock += data
 181         elif self.includeuntaggeddata:
 182             self.startblock(None)
 183             self.currentblock += data
 184
 185     def handle_charref(self, name):
 186         self.handle_data("&#%s;" % name)
 187
 188     def handle_entityref(self, name):
 189         self.handle_data("&%s;" % name)
 190
 191     def handle_comment(self, data):
 192         # we don't do anything with comments
 193         pass
 194
 195 class POHTMLParser(htmlfile):
 196     pass
 197