fix git support for v1.5.3 (or higher) by setting "--work-tree"
[translate_toolkit.git] / convert / test_html2po.py
blob3e700e4c637cbc5cff951d9298f835733336c38c
1 #!/usr/bin/env python
3 from translate.convert import html2po
4 from translate.convert import po2html
5 from translate.convert import test_convert
6 from translate.misc import wStringIO
class TestHTML2PO:
    """Tests for extracting translatable units from HTML with html2po.

    The first methods are in-memory helpers shared by the tests.  Each
    ``test_*`` method feeds a small piece of markup through the converter
    and checks the units that come out.  The two ``wtest_*`` methods do not
    carry the ``test_`` prefix (presumably to keep the test runner from
    collecting them -- confirm before renaming).
    """

    def html2po(self, markup):
        """Helper to convert html to po without a file.

        ``markup`` is wrapped in an in-memory file object and handed to
        ``html2po.html2po().convertfile``; whatever that call returns is
        returned unchanged.
        """
        inputfile = wStringIO.StringIO(markup)
        convertor = html2po.html2po()
        # NOTE(review): the meaning of the "test", False, False arguments is
        # defined by convertfile and is not visible from this file.
        outputpo = convertor.convertfile(inputfile, "test", False, False)
        return outputpo

    def po2html(self, posource, htmltemplate):
        """Helper to convert po to html without a file.

        ``posource`` and ``htmltemplate`` are wrapped in in-memory files,
        passed to ``po2html.converthtml`` and the text written to the output
        file is returned.  The conversion result must be truthy.
        """
        inputfile = wStringIO.StringIO(posource)
        outputfile = wStringIO.StringIO()
        templatefile = wStringIO.StringIO(htmltemplate)
        assert po2html.converthtml(inputfile, outputfile, templatefile)
        return outputfile.getvalue()

    def countunits(self, pofile, expected):
        """Helper to check that we got the expected number of messages.

        A header unit in first position is not counted.
        """
        actual = len(pofile.units)
        if actual > 0 and pofile.units[0].isheader():
            actual = actual - 1
        # Single-argument call form prints the same thing under Python 2 and
        # also parses under Python 3.
        print(pofile)
        assert actual == expected

    def compareunit(self, pofile, unitnumber, expected):
        """Helper to validate a PO message.

        ``unitnumber`` is 1-based and does not count a leading header unit.
        Relies on the Python 2 ``unicode`` builtin for the comparison.
        """
        # With a header at index 0, message N sits at index N; without a
        # header it sits at index N - 1.
        if not pofile.units[0].isheader():
            unitnumber = unitnumber - 1
        print('unit source: ' + str(pofile.units[unitnumber].source) + '|')
        print('expected: ' + expected.encode('utf-8') + '|')
        assert unicode(pofile.units[unitnumber].source) == unicode(expected)

    def check_single(self, markup, itemtext):
        """Check that converting this markup produces a single element with value itemtext."""
        pofile = self.html2po(markup)
        self.countunits(pofile, 1)
        self.compareunit(pofile, 1, itemtext)

    def check_null(self, markup):
        """Check that converting this markup produces no elements."""
        pofile = self.html2po(markup)
        self.countunits(pofile, 0)

    def test_htmllang(self):
        """Test to ensure that we no longer use the lang attribute."""
        markup = '''<html lang="en"><head><title>My title</title></head><body></body></html>'''
        pofile = self.html2po(markup)
        self.countunits(pofile, 1)
        # Check that the first item is the <title> not <head>
        self.compareunit(pofile, 1, "My title")

    def test_title(self):
        """Test that we can extract the <title> tag."""
        self.check_single("<html><head><title>My title</title></head><body></body></html>", "My title")

    def test_title_with_linebreak(self):
        """Test a linebreak in the <title> tag."""
        htmltext = '''<html>
<head>
<title>My
title</title>
</head>
<body>
</body>
</html>
'''
        self.check_single(htmltext, "My title")

    def test_meta(self):
        """Test that we can extract certain <meta> info from <head>."""
        self.check_single('''<html><head><meta name="keywords" content="these are keywords"></head><body></body></html>''', "these are keywords")

    def test_tag_p(self):
        """Test that we can extract the <p> tag."""
        self.check_single("<html><head></head><body><p>A paragraph.</p></body></html>", "A paragraph.")
        markup = "<p>First line.<br>Second line.</p>"
        pofile = self.html2po(markup)
        self.compareunit(pofile, 1, "First line.<br>Second line.")

    def test_tag_p_with_linebreak(self):
        """Test newlines within the <p> tag."""
        htmltext = '''<html>
<head>
</head>
<body>
<p>
A paragraph is a section in a piece of writing, usually highlighting a
particular point or topic. It always begins on a new line and usually
with indentation, and it consists of at least one sentence.
</p>
</body>
</html>
'''
        self.check_single(htmltext, "A paragraph is a section in a piece of writing, usually highlighting a particular point or topic. It always begins on a new line and usually with indentation, and it consists of at least one sentence.")
        markup = "<p>First\nline.<br>Second\nline.</p>"
        pofile = self.html2po(markup)
        self.compareunit(pofile, 1, "First line.<br>Second line.")

    def test_tag_div(self):
        """Test that we can extract the <div> tag."""
        self.check_single("<html><head></head><body><div>A paragraph.</div></body></html>", "A paragraph.")
        markup = "<div>First line.<br>Second line.</div>"
        pofile = self.html2po(markup)
        self.compareunit(pofile, 1, "First line.<br>Second line.")

    def test_tag_div_with_linebreaks(self):
        """Test linebreaks within a <div> tag."""
        htmltext = '''<html>
<head>
</head>
<body>
<div>
A paragraph is a section in a piece of writing, usually highlighting a
particular point or topic. It always begins on a new line and usually
with indentation, and it consists of at least one sentence.
</div>
</body>
</html>
'''
        self.check_single(htmltext, "A paragraph is a section in a piece of writing, usually highlighting a particular point or topic. It always begins on a new line and usually with indentation, and it consists of at least one sentence.")
        markup = "<div>First\nline.<br>Second\nline.</div>"
        pofile = self.html2po(markup)
        self.compareunit(pofile, 1, "First line.<br>Second line.")

    def test_tag_a(self):
        """Test that we can extract the <a> tag."""
        self.check_single('<html><head></head><body><p>A paragraph with <a href="http://translate.org.za/">hyperlink</a>.</p></body></html>', 'A paragraph with <a href="http://translate.org.za/">hyperlink</a>.')

    def test_tag_a_with_linebreak(self):
        """Test that we can extract the <a> tag with newlines in it."""
        # The "and" line is required for the markup to match the expected
        # string checked below.
        htmltext = '''<html>
<head>
</head>
<body>
<p>A
paragraph
with <a
href="http://translate.org.za/">hyperlink</a>
and
newlines.</p></body></html>
'''
        self.check_single(htmltext, 'A paragraph with <a href="http://translate.org.za/">hyperlink</a> and newlines.')

    def test_tag_img(self):
        """Test that we can extract the alt attribute from the <img> tag."""
        self.check_single('''<html><head></head><body><img src="picture.png" alt="A picture"></body></html>''', "A picture")

    def test_img_empty(self):
        """Test that an <img> tag without an alt attribute produces no units."""
        htmlsource = '''<html><head></head><body><img src="images/topbar.jpg" width="750" height="80"></body></html>'''
        self.check_null(htmlsource)

    def test_tag_table_summary(self):
        """Test that we can extract the summary attribute."""
        self.check_single('''<html><head></head><body><table summary="Table summary"></table></body></html>''', "Table summary")

    def test_table_simple(self):
        """Test that we can fully extract a simple table."""
        markup = '''<html><head></head><body><table><tr><th>Heading One</th><th>Heading Two</th><tr><td>One</td><td>Two</td></tr></table></body></html>'''
        pofile = self.html2po(markup)
        self.countunits(pofile, 4)
        self.compareunit(pofile, 1, "Heading One")
        self.compareunit(pofile, 2, "Heading Two")
        self.compareunit(pofile, 3, "One")
        self.compareunit(pofile, 4, "Two")

    def test_table_complex(self):
        """Test a table with summary, caption, abbr, thead, tfoot and tbody."""
        markup = '''<table summary="This is the summary"><caption>A caption</caption><thead><tr><th abbr="Head 1">Heading One</th><th>Heading Two</th></thead><tfoot><tr><td>Foot One</td><td>Foot Two</td></tr></tfoot><tbody><tr><td>One</td><td>Two</td></tr></tbody></table>'''
        pofile = self.html2po(markup)
        self.countunits(pofile, 9)
        self.compareunit(pofile, 1, "This is the summary")
        self.compareunit(pofile, 2, "A caption")
        self.compareunit(pofile, 3, "Head 1")
        self.compareunit(pofile, 4, "Heading One")
        self.compareunit(pofile, 5, "Heading Two")
        self.compareunit(pofile, 6, "Foot One")
        self.compareunit(pofile, 7, "Foot Two")
        self.compareunit(pofile, 8, "One")
        self.compareunit(pofile, 9, "Two")

    def test_table_empty(self):
        """Test that we ignore tables that are empty.

        A table is deemed empty if it has no translatable content.
        """
        self.check_null('''<html><head></head><body><table><tr><td><img src="bob.png"></td></tr></table></body></html>''')
        self.check_null('''<html><head></head><body><table><tr><td>&nbsp;</td></tr></table></body></html>''')
        self.check_null('''<html><head></head><body><table><tr><td><strong></strong></td></tr></table></body></html>''')

    def test_address(self):
        """Test to see if the address element is extracted."""
        self.check_single("<body><address>My address</address></body>", "My address")

    def test_headings(self):
        """Test to see if the h* elements are extracted."""
        markup = "<html><head></head><body><h1>Heading One</h1><h2>Heading Two</h2><h3>Heading Three</h3><h4>Heading Four</h4><h5>Heading Five</h5><h6>Heading Six</h6></body></html>"
        pofile = self.html2po(markup)
        self.countunits(pofile, 6)
        self.compareunit(pofile, 1, "Heading One")
        self.compareunit(pofile, 2, "Heading Two")
        self.compareunit(pofile, 3, "Heading Three")
        self.compareunit(pofile, 4, "Heading Four")
        self.compareunit(pofile, 5, "Heading Five")
        self.compareunit(pofile, 6, "Heading Six")

    def test_headings_with_linebreaks(self):
        """Test to see if h* elements with newlines can be extracted."""
        markup = "<html><head></head><body><h1>Heading\nOne</h1><h2>Heading\nTwo</h2><h3>Heading\nThree</h3><h4>Heading\nFour</h4><h5>Heading\nFive</h5><h6>Heading\nSix</h6></body></html>"
        pofile = self.html2po(markup)
        self.countunits(pofile, 6)
        self.compareunit(pofile, 1, "Heading One")
        self.compareunit(pofile, 2, "Heading Two")
        self.compareunit(pofile, 3, "Heading Three")
        self.compareunit(pofile, 4, "Heading Four")
        self.compareunit(pofile, 5, "Heading Five")
        self.compareunit(pofile, 6, "Heading Six")

    def test_dt(self):
        """Test to see if the definition list title (dt) element is extracted."""
        self.check_single("<html><head></head><body><dl><dt>Definition List Item Title</dt></dl></body></html>", "Definition List Item Title")

    def test_dd(self):
        """Test to see if the definition list description (dd) element is extracted."""
        self.check_single("<html><head></head><body><dl><dd>Definition List Item Description</dd></dl></body></html>", "Definition List Item Description")

    def test_span(self):
        """Test to check that we don't double extract a span item."""
        self.check_single("<html><head></head><body><p>You are a <span>Spanish</span> sentence.</p></body></html>", "You are a <span>Spanish</span> sentence.")

    def test_ul(self):
        """Test to see if the list item <li> is extracted."""
        markup = "<html><head></head><body><ul><li>Unordered One</li><li>Unordered Two</li></ul><ol><li>Ordered One</li><li>Ordered Two</li></ol></body></html>"
        pofile = self.html2po(markup)
        self.countunits(pofile, 4)
        self.compareunit(pofile, 1, "Unordered One")
        self.compareunit(pofile, 2, "Unordered Two")
        self.compareunit(pofile, 3, "Ordered One")
        self.compareunit(pofile, 4, "Ordered Two")

    def test_duplicates(self):
        """Check that we use the default style of msgid_comments to disambiguate duplicate messages."""
        markup = "<html><head></head><body><p>Duplicate</p><p>Duplicate</p></body></html>"
        pofile = self.html2po(markup)
        self.countunits(pofile, 2)
        # FIXME change this so that we check that the KDE comment is correctly added
        self.compareunit(pofile, 1, "Duplicate")
        self.compareunit(pofile, 2, "Duplicate")

    def wtest_multiline_reflow(self):
        """Check that we reflow multiline content to make it more readable for translators."""
        self.check_single('''<td valign="middle" width="96%"><font class="headingwhite">South
Africa</font></td>''', '''<font class="headingwhite">South Africa</font>''')

    def wtest_nested_tags(self):
        """Check that we can extract items within nested tags."""
        markup = "<div><p>Extract this</p>And this</div>"
        pofile = self.html2po(markup)
        self.countunits(pofile, 2)
        self.compareunit(pofile, 1, "Extract this")
        self.compareunit(pofile, 2, "And this")

    def test_carriage_return(self):
        """Remove carriage returns from files in dos format."""
        # Every line of the literal ends in an explicit \r escape followed by
        # the literal newline, i.e. DOS (CR LF) line endings.
        htmlsource = '''<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">\r
<html><!-- InstanceBegin template="/Templates/masterpage.dwt" codeOutsideHTMLIsLocked="false" -->\r
<head>\r
<!-- InstanceBeginEditable name="doctitle" -->\r
<link href="fmfi.css" rel="stylesheet" type="text/css">\r
</head>\r
\r
<body>\r
<p>The rapid expansion of telecommunications infrastructure in recent\r
years has helped to bridge the digital divide to a limited extent.</p> \r
</body>\r
<!-- InstanceEnd --></html>\r
'''

        self.check_single(htmlsource, 'The rapid expansion of telecommunications infrastructure in recent years has helped to bridge the digital divide to a limited extent.')

    def test_encoding_latin1(self):
        """Convert HTML input in iso-8859-1 correctly to unicode."""
        htmlsource = '''<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
<html><!-- InstanceBegin template="/Templates/masterpage.dwt" codeOutsideHTMLIsLocked="false" -->
<head>
<!-- InstanceBeginEditable name="doctitle" -->
<title>FMFI - South Africa - CSIR Openphone - Overview</title>
<!-- InstanceEndEditable -->
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
<meta name="keywords" content="fmfi, first mile, first inch, wireless, rural development, access devices, mobile devices, wifi, connectivity, rural connectivty, ict, low cost, cheap, digital divide, csir, idrc, community">

<!-- InstanceBeginEditable name="head" -->
<!-- InstanceEndEditable -->
<link href="../../../fmfi.css" rel="stylesheet" type="text/css">
</head>

<body>
<p>We aim to please \x96 will you aim too, please?</p>
<p>South Africa\x92s language diversity can be challenging.</p>
</body>
</html>
'''
        pofile = self.html2po(htmlsource)

        # Four units are expected; the two paragraphs are the third and
        # fourth of them.
        self.countunits(pofile, 4)
        self.compareunit(pofile, 3, u'We aim to please \x96 will you aim too, please?')
        self.compareunit(pofile, 4, u'South Africa\x92s language diversity can be challenging.')

    def test_strip_html(self):
        """Ensure that unnecessary html is stripped from the resulting unit."""

        htmlsource = '''<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
<html>
<head>
<title>FMFI - Contact</title>
</head>
<body>
<table width="100%" border="0" cellpadding="0" cellspacing="0">
<tr align="left" valign="top">
<td width="150" height="556">
<table width="157" height="100%" border="0" cellspacing="0" id="leftmenubg-color">
<tr>
<td align="left" valign="top" height="555">
<table width="100%" border="0" cellspacing="0" cellpadding="2">
<tr align="left" valign="top" bgcolor="#660000">
<td width="4%"><strong></strong></td>
<td width="96%"><strong><font class="headingwhite">Projects</font></strong></td>
</tr>
<tr align="left" valign="top">
<td valign="middle" width="4%"><img src="images/arrow.gif" width="8" height="8"></td>
<td width="96%"><a href="index.html">Home Page</a></td>
</tr>
</table>
</td>
</tr>
</table></td>
</table>
</body>
</html>
'''
        pofile = self.html2po(htmlsource)
        self.countunits(pofile, 3)
        self.compareunit(pofile, 2, u'Projects')
        self.compareunit(pofile, 3, u'Home Page')

        # Translate and convert back:
        pofile.units[1].target = 'Projekte'
        pofile.units[2].target = 'Tuisblad'
        # Normalise whitespace in the round-tripped HTML so the snippets
        # below can be matched as single-line substrings.
        htmlresult = self.po2html(str(pofile), htmlsource).replace('\n', ' ').replace('= "', '="').replace('> <', '><')
        snippet = '<td width="96%"><strong><font class="headingwhite">Projekte</font></strong></td>'
        assert snippet in htmlresult
        snippet = '<td width="96%"><a href="index.html">Tuisblad</a></td>'
        assert snippet in htmlresult

class TestHTML2POCommand(test_convert.TestConvertCommand, TestHTML2PO):
    """Runs the html2po command itself against files.

    Combines the generic command-test base class with the in-memory tests
    of ``TestHTML2PO``.
    """
    convertmodule = html2po
    defaultoptions = dict(progress="none")

    def test_help(self):
        """Check the help output for the options specific to html2po."""
        remaining = test_convert.TestConvertCommand.test_help(self)
        # Each help_check call returns the options text to feed to the next
        # call; the final option is checked with last=True.
        for flag in ("-P, --pot", "--duplicates=DUPLICATESTYLE"):
            remaining = self.help_check(remaining, flag)
        remaining = self.help_check(remaining, "-u, --untagged", last=True)