fix git support for v1.5.3 (or higher) by setting "--work-tree"
[translate_toolkit.git] / storage / html.py
blob9c22d1156023c712f18ea8b825cf50e8dca53140
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
4 # Copyright 2004-2006 Zuza Software Foundation
5 #
6 # This file is part of translate.
8 # translate is free software; you can redistribute it and/or modify
9 # it under the terms of the GNU General Public License as published by
10 # the Free Software Foundation; either version 2 of the License, or
11 # (at your option) any later version.
13 # translate is distributed in the hope that it will be useful,
14 # but WITHOUT ANY WARRANTY; without even the implied warranty of
15 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 # GNU General Public License for more details.
18 # You should have received a copy of the GNU General Public License
19 # along with translate; if not, write to the Free Software
20 # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
23 """module for parsing html files for translation"""
25 import re
26 from translate.storage import base
27 from HTMLParser import HTMLParser
class htmlunit(base.TranslationUnit):
    """A unit of translatable/localisable HTML content.

    The text is kept in ``self.text`` with ``&`` and ``<`` escaped as
    ``&amp;`` and ``&lt;``.  The ``source`` property converts to and from
    that escaped form.
    """

    def __init__(self, source=None):
        """Create a unit, optionally initialised with *source*.

        A *source* of None (the default) creates a unit without any text.
        """
        self.locations = []
        self.setsource(source)

    def getsource(self):
        """Return the unescaped text with every line break turned into a space.

        Returns None when the unit has no text.
        """
        #TODO: Rethink how clever we should try to be with html entities.
        if self.text is None:
            return None
        # Undo the escaping in the reverse order of setsource(): "&lt;" has
        # to be handled before "&amp;", otherwise a literal "&lt;" in the
        # source (stored as "&amp;lt;") would wrongly come back as "<".
        text = self.text.replace("&lt;", "<").replace("&amp;", "&")
        return text.replace("\r\n", " ").replace("\n", " ").replace("\r", " ")

    def setsource(self, source):
        """Store *source*, escaping ``&`` and ``<``.

        None is stored unchanged so that a unit can be created empty.
        """
        if source is None:
            self.text = None
        else:
            # "&" must be escaped first so that the "&" of a freshly written
            # "&lt;" is not escaped a second time.
            self.text = source.replace("&", "&amp;").replace("<", "&lt;")
    source = property(getsource, setsource)

    def addlocation(self, location):
        """Append *location* to the list of locations of this unit."""
        self.locations.append(location)

    def getlocations(self):
        """Return the list of locations recorded for this unit."""
        return self.locations
class htmlfile(HTMLParser, base.TranslationStore):
    """An HTML document split into translatable units.

    The document is fed through HTMLParser.  Text between a start tag listed
    in ``markingtags`` and its matching end tag is gathered into a block, and
    every block with translatable content becomes one unit.  Values of the
    attributes listed in ``includeattrs`` become units of their own.
    """
    UnitClass = htmlunit
    # Tags whose start tag opens a new block.
    markingtags = ["p", "title", "h1", "h2", "h3", "h4", "h5", "h6", "th", "td", "div", "li", "dt", "dd", "address", "caption"]
    # Attributes whose presence on a start tag opens a new block.
    markingattrs = []
    # Attributes whose values are extracted as separate units.
    includeattrs = ["alt", "summary", "standby", "abbr", "content"]

    def __init__(self, includeuntaggeddata=None, inputfile=None):
        """Set up the parser state and, if given, parse *inputfile*.

        includeuntaggeddata -- when true, data found outside any marking tag
                               is collected as well
        inputfile           -- optional file-like object; it is read
                               completely, closed, and its contents parsed.
                               Its ``name`` attribute, if any, is used in the
                               unit locations.
        """
        self.units = []
        self.filename = getattr(inputfile, 'name', None)
        self.currentblock = ""
        self.currentblocknum = 0
        self.currenttag = None
        self.includeuntaggeddata = includeuntaggeddata
        HTMLParser.__init__(self)

        if inputfile is not None:
            # Close the file even when reading fails.
            try:
                htmlsrc = inputfile.read()
            finally:
                inputfile.close()
            self.parse(htmlsrc)

    def guess_encoding(self, htmlsrc):
        """Return the encoding of the html text, or None if none is declared.

        We look for 'charset=' within a meta tag to do this.
        """
        pattern = '''(?i)<meta.*content.*=.*charset.*=\\s*([^\\s]*)\\s*["']'''
        result = re.findall(pattern, htmlsrc)
        if result:
            # Only the first declaration found is taken into account.
            return result[0]
        return None

    def do_encoding(self, htmlsrc):
        """Return the html text decoded with the charset it declares.

        The text is returned unchanged when no charset can be found.
        """
        charset = self.guess_encoding(htmlsrc)
        if charset:
            return htmlsrc.decode(charset)
        return htmlsrc

    def parse(self, htmlsrc):
        """Decode *htmlsrc* and feed it to the HTML parser."""
        htmlsrc = self.do_encoding(htmlsrc)
        self.feed(htmlsrc)

    def addhtmlblock(self, text):
        """Add *text* as a new unit if it has translatable content.

        Enclosing tags are stripped first.  The unit's location is recorded
        as '<filename>:<block number>'.
        """
        text = self.strip_html(text)
        if self.has_translatable_content(text):
            self.currentblocknum += 1
            unit = self.addsourceunit(text)
            unit.addlocation("%s:%d" % (self.filename, self.currentblocknum))

    def strip_html(self, text):
        """Strip unnecessary html from the text.

        HTML tags are deemed unnecessary if they fully enclose the
        translatable text, eg. '<a href="index.html">Home Page</a>'.

        HTML tags that occur within the normal flow of text will not be
        removed, eg. 'This is a link to the <a href="index.html">Home Page</a>.'
        """
        text = text.strip()

        # NOTE: the pattern only requires the text to start with a tag and to
        # end with a closing tag; it does not check that the two belong to
        # the same element.
        pattern = '(?s)^<[^>]*>(.*)</.*>$'
        result = re.findall(pattern, text)
        if len(result) == 1:
            # Strip again in case the content is itself fully enclosed.
            text = self.strip_html(result[0])
        return text

    def has_translatable_content(self, text):
        """Check if the supplied HTML snippet has any content that needs to be translated."""
        text = text.strip()

        # A charset declaration is not something to translate.
        result = re.findall('(?i).*(charset.*=.*)', text)
        if len(result) == 1:
            return False

        # TODO: Get a better way to find untranslatable entities.
        if text == '&nbsp;':
            return False

        # Translatable only if something is left once all tags are removed.
        pattern = '<[^>]*>'
        result = re.sub(pattern, '', text).strip()
        return bool(result)

    def _addincludeattrs(self, attrs):
        """Add the value of every attribute listed in includeattrs as a block."""
        for attrname, attrvalue in attrs:
            # The parser reports None for an attribute written without a
            # value (eg. '<img alt>'); there is nothing to extract from it.
            if attrname in self.includeattrs and attrvalue is not None:
                self.addhtmlblock(attrvalue)

    # The methods below are the HTMLParser callbacks and their helpers.

    def startblock(self, tag):
        """Flush the block collected so far and start a new one for *tag*."""
        self.addhtmlblock(self.currentblock)
        self.currentblock = ""
        self.currenttag = tag

    def endblock(self):
        """Flush the block collected so far and leave block-collecting mode."""
        self.addhtmlblock(self.currentblock)
        self.currentblock = ""
        self.currenttag = None

    def handle_starttag(self, tag, attrs):
        """Open a new block for a marking tag, or keep the tag as inline markup."""
        newblock = tag in self.markingtags
        for attrname, _ in attrs:
            if attrname in self.markingattrs:
                newblock = True
        self._addincludeattrs(attrs)

        if newblock:
            self.startblock(tag)
        elif self.currenttag is not None:
            # Inside a block: keep the tag exactly as it was written.
            self.currentblock += self.get_starttag_text()

    def handle_startendtag(self, tag, attrs):
        """Extract included attributes; keep the tag when inside a block."""
        self._addincludeattrs(attrs)
        if self.currenttag is not None:
            self.currentblock += self.get_starttag_text()

    def handle_endtag(self, tag):
        """Close the current block, or keep the end tag as inline markup."""
        if tag == self.currenttag:
            self.endblock()
        elif self.currenttag is not None:
            self.currentblock += '</%s>' % tag

    def handle_data(self, data):
        """Collect text that is inside a block or, if enabled, untagged text."""
        if self.currenttag is not None:
            self.currentblock += data
        elif self.includeuntaggeddata:
            # startblock(None) leaves currenttag at None, so every further
            # piece of untagged data flushes the previous one first.
            self.startblock(None)
            self.currentblock += data

    def handle_charref(self, name):
        """Pass a character reference on unchanged, as ordinary data."""
        self.handle_data("&#%s;" % name)

    def handle_entityref(self, name):
        """Pass an entity reference on unchanged, as ordinary data."""
        self.handle_data("&%s;" % name)

    def handle_comment(self, data):
        """Ignore comments."""
        # we don't do anything with comments
        pass
class POHTMLParser(htmlfile):
    """An htmlfile under another name; it adds no behaviour of its own.

    NOTE(review): presumably kept so that code using this name keeps
    working -- confirm against the callers.
    """