2 # -*- coding: utf-8 -*-
4 # Copyright 2004-2006 Zuza Software Foundation
6 # This file is part of translate.
8 # translate is free software; you can redistribute it and/or modify
9 # it under the terms of the GNU General Public License as published by
10 # the Free Software Foundation; either version 2 of the License, or
11 # (at your option) any later version.
13 # translate is distributed in the hope that it will be useful,
14 # but WITHOUT ANY WARRANTY; without even the implied warranty of
15 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 # GNU General Public License for more details.
18 # You should have received a copy of the GNU General Public License
19 # along with translate; if not, write to the Free Software
20 # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
23 """module for parsing html files for translation"""
import re

from translate.storage import base
from HTMLParser import HTMLParser
class htmlunit(base.TranslationUnit):
    """A unit of translatable/localisable HTML content"""

    def __init__(self, source=None):
        """Create a unit holding *source* (``None`` is treated as empty text)."""
        # addlocation() appends to this list, so it must exist up front.
        self.locations = []
        self.setsource(source)

    def getsource(self):
        """Return the stored text unescaped, with line breaks turned into spaces."""
        #TODO: Rethink how clever we should try to be with html entities.
        # "&lt;" is unescaped before "&amp;" so that a literal "&lt;" in the
        # original source (stored as "&amp;lt;") does not collapse to "<".
        text = self.text.replace("&lt;", "<").replace("&amp;", "&")
        return text.replace("\r\n", " ").replace("\n", " ").replace("\r", " ")

    def setsource(self, source):
        """Store *source* with "&" and "<" escaped as HTML entities."""
        if source is None:
            # The constructor's default is None; calling .replace on it
            # would raise AttributeError.
            source = ""
        # "&" must be escaped first, otherwise the "&" of "&lt;" would be
        # escaped a second time.
        self.text = source.replace("&", "&amp;").replace("<", "&lt;")

    source = property(getsource, setsource)

    def addlocation(self, location):
        """Record *location* for this unit."""
        self.locations.append(location)

    def getlocations(self):
        """Return the list of locations recorded with addlocation()."""
        return self.locations
class htmlfile(HTMLParser, base.TranslationStore):
    """Store that extracts translatable blocks of text from HTML.

    The HTML is pushed through HTMLParser.  While the parser is inside one
    of the ``markingtags`` the text and inline markup are accumulated in
    ``currentblock``; when the block ends the accumulated text is stripped
    of enclosing tags and, if anything translatable is left, added as a unit.
    """

    # NOTE(review): assumes base.TranslationStore.addsourceunit() builds its
    # units through ``UnitClass`` -- confirm against the base module.
    UnitClass = htmlunit
    # Tags that start a new translatable block.
    markingtags = ["p", "title", "h1", "h2", "h3", "h4", "h5", "h6", "th", "td", "div", "li", "dt", "dd", "address", "caption"]
    # Attributes whose mere presence starts a new block.  handle_starttag()
    # reads this list, so it has to be defined even though it is empty.
    markingattrs = []
    # Attributes whose values are extracted as units of their own.
    includeattrs = ["alt", "summary", "standby", "abbr", "content"]

    def __init__(self, includeuntaggeddata=None, inputfile=None):
        """Set up the parser state and parse *inputfile* when one is given.

        includeuntaggeddata -- when true, text found outside any marking tag
                               is collected as well (see handle_data()).
        inputfile           -- optional object with a read() method; its
                               ``name`` attribute, if any, is used in unit
                               locations.
        """
        # NOTE(review): TranslationStore.__init__ is not called here, so the
        # unit list is created explicitly; presumably addsourceunit() appends
        # to ``self.units`` -- confirm against the base module.
        self.units = []
        self.filename = getattr(inputfile, 'name', None)
        self.currentblock = ""
        self.currentblocknum = 0
        self.currenttag = None
        self.includeuntaggeddata = includeuntaggeddata
        HTMLParser.__init__(self)

        if inputfile is not None:
            htmlsrc = inputfile.read()
            self.parse(htmlsrc)

    def guess_encoding(self, htmlsrc):
        """Returns the encoding of the html text.

        We look for 'charset=' within a meta tag to do this.
        Returns None when no such declaration is found.
        """
        pattern = '''(?i)<meta.*content.*=.*charset.*=\\s*([^\\s]*)\\s*["']'''
        result = re.findall(pattern, htmlsrc)
        if result:
            # The first declaration wins.
            return result[0]
        return None

    def do_encoding(self, htmlsrc):
        """Return the html text properly encoded based on a charset."""
        charset = self.guess_encoding(htmlsrc)
        if charset:
            return htmlsrc.decode(charset)
        # Without a declared charset the text is passed through untouched.
        return htmlsrc

    def parse(self, htmlsrc):
        """Decode *htmlsrc* and feed it to the HTML parser."""
        htmlsrc = self.do_encoding(htmlsrc)
        self.feed(htmlsrc)

    def addhtmlblock(self, text):
        """Add *text* as a new unit if it holds translatable content."""
        text = self.strip_html(text)
        if self.has_translatable_content(text):
            self.currentblocknum += 1
            unit = self.addsourceunit(text)
            unit.addlocation("%s:%d" % (self.filename, self.currentblocknum))

    def strip_html(self, text):
        """Strip unnecessary html from the text.

        HTML tags are deemed unnecessary if they fully enclose the
        translatable text, eg. '<a href="index.html">Home Page</a>'.

        HTML tags that occur within the normal flow of text will not be
        removed, eg. 'This is a link to the <a href="index.html">Home Page</a>.'
        """
        text = text.strip()

        pattern = '(?s)^<[^>]*>(.*)</.*>$'
        result = re.findall(pattern, text)
        if len(result) == 1:
            # One enclosing tag pair was peeled off; the remainder may be
            # wrapped again, so recurse on it.
            text = self.strip_html(result[0])
        return text

    def has_translatable_content(self, text):
        """Check if the supplied HTML snippet has any content that needs to be translated."""
        text = text.strip()

        # A charset declaration (the value of a meta "content" attribute)
        # is not translatable.
        result = re.findall('(?i).*(charset.*=.*)', text)
        if len(result) == 1:
            return False

        # TODO: Get a better way to find untranslatable entities.
        if text == '&nbsp;':
            return False

        # Whatever remains once all tags are removed is the actual text.
        pattern = '<[^>]*>'
        result = re.sub(pattern, '', text).strip()
        return bool(result)

    #From here on below, follows the methods of the HTMLParser

    def startblock(self, tag):
        """Flush the pending block and start a new one for *tag*."""
        self.addhtmlblock(self.currentblock)
        self.currentblock = ""
        self.currenttag = tag

    def endblock(self):
        """Flush the pending block and leave the current marking tag."""
        self.addhtmlblock(self.currentblock)
        self.currentblock = ""
        self.currenttag = None

    def handle_starttag(self, tag, attrs):
        """Start a new block for marking tags, else keep the tag inline."""
        newblock = False
        if tag in self.markingtags:
            newblock = True
        for attrname, attrvalue in attrs:
            if attrname in self.markingattrs:
                newblock = True
            if attrname in self.includeattrs:
                self.addhtmlblock(attrvalue)

        if newblock:
            self.startblock(tag)
        elif self.currenttag is not None:
            # Inline markup inside a block is kept verbatim.
            self.currentblock += self.get_starttag_text()

    def handle_startendtag(self, tag, attrs):
        """Extract translatable attributes and keep the tag inline."""
        for attrname, attrvalue in attrs:
            if attrname in self.includeattrs:
                self.addhtmlblock(attrvalue)
        if self.currenttag is not None:
            self.currentblock += self.get_starttag_text()

    def handle_endtag(self, tag):
        """Close the current block, or keep the end tag as inline markup."""
        if tag == self.currenttag:
            self.endblock()
        elif self.currenttag is not None:
            self.currentblock += '</%s>' % tag

    def handle_data(self, data):
        """Append text to the current block.

        Text outside any block is only collected when includeuntaggeddata
        is set; in that case a block with tag None is started for it.
        """
        if self.currenttag is not None:
            self.currentblock += data
        elif self.includeuntaggeddata:
            self.startblock(None)
            self.currentblock += data

    def handle_charref(self, name):
        """Pass a numeric character reference through unchanged."""
        self.handle_data("&#%s;" % name)

    def handle_entityref(self, name):
        """Pass a named entity reference through unchanged."""
        self.handle_data("&%s;" % name)

    def handle_comment(self, data):
        """Ignore HTML comments."""
        # we don't do anything with comments
        pass
class POHTMLParser(htmlfile):
    """Subclass of htmlfile that adds no behaviour of its own."""
    pass