convert/test_html2po.py

   1 #!/usr/bin/env python
   2
   3 from translate.convert import html2po
   4 from translate.convert import po2html
   5 from translate.convert import test_convert
   6 from translate.misc import wStringIO
   7
   8 class TestHTML2PO:
   9     def html2po(self, markup):
  10         """Helper to convert html to po without a file."""
  11         inputfile = wStringIO.StringIO(markup)
  12         convertor = html2po.html2po()
  13         outputpo = convertor.convertfile(inputfile, "test", False, False)
  14         return outputpo
  15
  16     def po2html(self, posource, htmltemplate):
  17         """Helper to convert po to html without a file."""
  18         inputfile = wStringIO.StringIO(posource)
  19         outputfile = wStringIO.StringIO()
  20         templatefile = wStringIO.StringIO(htmltemplate)
  21         assert po2html.converthtml(inputfile, outputfile, templatefile)
  22         return outputfile.getvalue()
  23
  24     def countunits(self, pofile, expected):
  25         """helper to check that we got the expected number of messages"""
  26         actual = len(pofile.units)
  27         if actual > 0:
  28             if pofile.units[0].isheader():
  29                 actual = actual - 1
  30         print pofile
  31         assert actual == expected
  32
  33     def compareunit(self, pofile, unitnumber, expected):
  34         """helper to validate a PO message"""
  35         if not pofile.units[0].isheader():
  36             unitnumber = unitnumber - 1
  37         print 'unit source: ' + str(pofile.units[unitnumber].source) + '|'
  38         print 'expected: ' + expected.encode('utf-8') + '|'
  39         assert unicode(pofile.units[unitnumber].source) == unicode(expected)
  40
  41     def check_single(self, markup, itemtext):
  42         """checks that converting this markup produces a single element with value itemtext"""
  43         pofile = self.html2po(markup)
  44         self.countunits(pofile, 1)
  45         self.compareunit(pofile, 1, itemtext)
  46
  47     def check_null(self, markup):
  48         """checks that converting this markup produces no elements"""
  49         pofile = self.html2po(markup)
  50         self.countunits(pofile, 0)
  51
  52     def test_htmllang(self):
  53         """test to ensure that we no longer use the lang attribure"""
  54         markup = '''<html lang="en"><head><title>My title</title></head><body></body></html>'''
  55         pofile = self.html2po(markup)
  56         self.countunits(pofile, 1)
  57         # Check that the first item is the <title> not <head>
  58         self.compareunit(pofile, 1, "My title")
  59
  60     def test_title(self):
  61         """test that we can extract the <title> tag"""
  62         self.check_single("<html><head><title>My title</title></head><body></body></html>", "My title")
  63
  64     def test_title_with_linebreak(self):
  65         """Test a linebreak in the <title> tag"""
  66         htmltext = '''<html>
  67 <head>
  68   <title>My
  69 title</title>
  70 </head>
  71 <body>
  72 </body>
  73 </html>
  74 '''
  75         self.check_single(htmltext, "My title")
  76
  77     def test_meta(self):
  78         """Test that we can extract certain <meta> info from <head>."""
  79         self.check_single('''<html><head><meta name="keywords" content="these are keywords"></head><body></body></html>''', "these are keywords")
  80
  81     def test_tag_p(self):
  82         """test that we can extract the <p> tag"""
  83         self.check_single("<html><head></head><body><p>A paragraph.</p></body></html>", "A paragraph.")
  84         markup = "<p>First line.<br>Second line.</p>"
  85         pofile = self.html2po(markup)
  86         self.compareunit(pofile, 1, "First line.<br>Second line.")
  87
  88     def test_tag_p_with_linebreak(self):
  89         """Test newlines within the <p> tag."""
  90         htmltext = '''<html>
  91 <head>
  92 </head>
  93 <body>
  94 <p>
  95 A paragraph is a section in a piece of writing, usually highlighting a
  96 particular point or topic. It always begins on a new line and usually
  97 with indentation, and it consists of at least one sentence.
  98 </p>
  99 </body>
 100 </html>
 101 '''
 102         self.check_single(htmltext, "A paragraph is a section in a piece of writing, usually highlighting a particular point or topic. It always begins on a new line and usually with indentation, and it consists of at least one sentence.")
 103         markup = "<p>First\nline.<br>Second\nline.</p>"
 104         pofile = self.html2po(markup)
 105         self.compareunit(pofile, 1, "First line.<br>Second line.")
 106
 107     def test_tag_div(self):
 108         """test that we can extract the <div> tag"""
 109         self.check_single("<html><head></head><body><div>A paragraph.</div></body></html>", "A paragraph.")
 110         markup = "<div>First line.<br>Second line.</div>"
 111         pofile = self.html2po(markup)
 112         self.compareunit(pofile, 1, "First line.<br>Second line.")
 113
 114     def test_tag_div_with_linebreaks(self):
 115         """Test linebreaks within a <div> tag."""
 116         htmltext = '''<html>
 117 <head>
 118 </head>
 119 <body>
 120 <div>
 121 A paragraph is a section in a piece of writing, usually highlighting a
 122 particular point or topic. It always begins on a new line and usually
 123 with indentation, and it consists of at least one sentence.
 124 </div>
 125 </body>
 126 </html>
 127 '''
 128         self.check_single(htmltext, "A paragraph is a section in a piece of writing, usually highlighting a particular point or topic. It always begins on a new line and usually with indentation, and it consists of at least one sentence.")
 129         markup = "<div>First\nline.<br>Second\nline.</div>"
 130         pofile = self.html2po(markup)
 131         self.compareunit(pofile, 1, "First line.<br>Second line.")
 132
 133     def test_tag_a(self):
 134         """test that we can extract the <a> tag"""
 135         self.check_single('<html><head></head><body><p>A paragraph with <a href="http://translate.org.za/">hyperlink</a>.</p></body></html>', 'A paragraph with <a href="http://translate.org.za/">hyperlink</a>.')
 136
 137     def test_tag_a_with_linebreak(self):
 138         """Test that we can extract the <a> tag with newlines in it."""
 139         htmltext = '''<html>
 140 <head>
 141 </head>
 142 <body>
 143 <p>A
 144 paragraph
 145 with <a
 146 href="http://translate.org.za/">hyperlink</a>
 147 and
 148 newlines.</p></body></html>
 149 '''
 150         self.check_single(htmltext, 'A paragraph with <a href="http://translate.org.za/">hyperlink</a> and newlines.')
 151
 152     def test_tag_img(self):
 153         """Test that we can extract the alt attribute from the <img> tag."""
 154         self.check_single('''<html><head></head><body><img src="picture.png" alt="A picture"></body></html>''', "A picture")
 155
 156     def test_img_empty(self):
 157         """Test that we can extract the alt attribute from the <img> tag."""
 158         htmlsource = '''<html><head></head><body><img src="images/topbar.jpg" width="750" height="80"></body></html>'''
 159         self.check_null(htmlsource)
 160
 161     def test_tag_table_summary(self):
 162         """Test that we can extract the summary attribute."""
 163         self.check_single( '''<html><head></head><body><table summary="Table summary"></table></body></html>''', "Table summary")
 164
 165     def test_table_simple(self):
 166         """Test that we can fully extract a simple table."""
 167         markup = '''<html><head></head><body><table><tr><th>Heading One</th><th>Heading Two</th><tr><td>One</td><td>Two</td></tr></table></body></html>'''
 168         pofile = self.html2po(markup)
 169         self.countunits(pofile, 4)
 170         self.compareunit(pofile, 1, "Heading One")
 171         self.compareunit(pofile, 2, "Heading Two")
 172         self.compareunit(pofile, 3, "One")
 173         self.compareunit(pofile, 4, "Two")
 174
 175     def test_table_complex(self):
 176         markup = '''<table summary="This is the summary"><caption>A caption</caption><thead><tr><th abbr="Head 1">Heading One</th><th>Heading Two</th></thead><tfoot><tr><td>Foot One</td><td>Foot Two</td></tr></tfoot><tbody><tr><td>One</td><td>Two</td></tr></tbody></table>'''
 177         pofile = self.html2po(markup)
 178         self.countunits(pofile, 9)
 179         self.compareunit(pofile, 1, "This is the summary")
 180         self.compareunit(pofile, 2, "A caption")
 181         self.compareunit(pofile, 3, "Head 1")
 182         self.compareunit(pofile, 4, "Heading One")
 183         self.compareunit(pofile, 5, "Heading Two")
 184         self.compareunit(pofile, 6, "Foot One")
 185         self.compareunit(pofile, 7, "Foot Two")
 186         self.compareunit(pofile, 8, "One")
 187         self.compareunit(pofile, 9, "Two")
 188
 189     def test_table_empty(self):
 190         """Test that we ignore tables that are empty.
 191
 192         A table is deemed empty if it has no translatable content.
 193         """
 194
 195         self.check_null('''<html><head></head><body><table><tr><td><img src="bob.png"></td></tr></table></body></html>''')
 196         self.check_null('''<html><head></head><body><table><tr><td>&nbsp;</td></tr></table></body></html>''')
 197         self.check_null('''<html><head></head><body><table><tr><td><strong></strong></td></tr></table></body></html>''')
 198
 199     def test_address(self):
 200         """Test to see if the address element is extracted"""
 201         self.check_single("<body><address>My address</address></body>", "My address")
 202
 203     def test_headings(self):
 204         """Test to see if the h* elements are extracted"""
 205         markup = "<html><head></head><body><h1>Heading One</h1><h2>Heading Two</h2><h3>Heading Three</h3><h4>Heading Four</h4><h5>Heading Five</h5><h6>Heading Six</h6></body></html>"
 206         pofile = self.html2po(markup)
 207         self.countunits(pofile, 6)
 208         self.compareunit(pofile, 1, "Heading One")
 209         self.compareunit(pofile, 2, "Heading Two")
 210         self.compareunit(pofile, 3, "Heading Three")
 211         self.compareunit(pofile, 4, "Heading Four")
 212         self.compareunit(pofile, 5, "Heading Five")
 213         self.compareunit(pofile, 6, "Heading Six")
 214
 215     def test_headings_with_linebreaks(self):
 216         """Test to see if h* elements with newlines can be extracted"""
 217         markup = "<html><head></head><body><h1>Heading\nOne</h1><h2>Heading\nTwo</h2><h3>Heading\nThree</h3><h4>Heading\nFour</h4><h5>Heading\nFive</h5><h6>Heading\nSix</h6></body></html>"
 218         pofile = self.html2po(markup)
 219         self.countunits(pofile, 6)
 220         self.compareunit(pofile, 1, "Heading One")
 221         self.compareunit(pofile, 2, "Heading Two")
 222         self.compareunit(pofile, 3, "Heading Three")
 223         self.compareunit(pofile, 4, "Heading Four")
 224         self.compareunit(pofile, 5, "Heading Five")
 225         self.compareunit(pofile, 6, "Heading Six")
 226
 227     def test_dt(self):
 228         """Test to see if the definition list title (dt) element is extracted"""
 229         self.check_single("<html><head></head><body><dl><dt>Definition List Item Title</dt></dl></body></html>", "Definition List Item Title")
 230
 231     def test_dd(self):
 232         """Test to see if the definition list description (dd) element is extracted"""
 233         self.check_single("<html><head></head><body><dl><dd>Definition List Item Description</dd></dl></body></html>", "Definition List Item Description")
 234
 235     def test_span(self):
 236         """test to check that we don't double extract a span item"""
 237         self.check_single("<html><head></head><body><p>You are a <span>Spanish</span> sentence.</p></body></html>", "You are a <span>Spanish</span> sentence.")
 238
 239     def test_ul(self):
 240         """Test to see if the list item <li> is exracted"""
 241         markup = "<html><head></head><body><ul><li>Unordered One</li><li>Unordered Two</li></ul><ol><li>Ordered One</li><li>Ordered Two</li></ol></body></html>"
 242         pofile = self.html2po(markup)
 243         self.countunits(pofile, 4)
 244         self.compareunit(pofile, 1, "Unordered One")
 245         self.compareunit(pofile, 2, "Unordered Two")
 246         self.compareunit(pofile, 3, "Ordered One")
 247         self.compareunit(pofile, 4, "Ordered Two")
 248
 249     def test_duplicates(self):
 250         """check that we use the default style of msgid_comments to disambiguate duplicate messages"""
 251         markup = "<html><head></head><body><p>Duplicate</p><p>Duplicate</p></body></html>"
 252         pofile = self.html2po(markup)
 253         self.countunits(pofile, 2)
 254         # FIXME change this so that we check that the KDE comment is correctly added
 255         self.compareunit(pofile, 1, "Duplicate")
 256         self.compareunit(pofile, 2, "Duplicate")
 257
 258     def wtest_multiline_reflow(self):
 259         """check that we reflow multiline content to make it more readable for translators"""
 260         self.check_single('''<td valign="middle" width="96%"><font class="headingwhite">South
 261                   Africa</font></td>''', '''<font class="headingwhite">South Africa</font>''')
 262
 263     def wtest_nested_tags(self):
 264         """check that we can extract items within nested tags"""
 265         markup = "<div><p>Extract this</p>And this</div>"
 266         pofile = self.html2po(markup)
 267         self.countunits(pofile, 2)
 268         self.compareunit(pofile, 1, "Extract this")
 269         self.compareunit(pofile, 2, "And this")
 270
 271     def test_carriage_return(self):
 272         """Remove carriage returns from files in dos format."""
 273         htmlsource = '''<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">\r
 274 <html><!-- InstanceBegin template="/Templates/masterpage.dwt" codeOutsideHTMLIsLocked="false" -->\r
 275 <head>\r
 276 <!-- InstanceBeginEditable name="doctitle" -->\r
 277 <link href="fmfi.css" rel="stylesheet" type="text/css">\r
 278 </head>\r
 279 \r
 280 <body>\r
 281 <p>The rapid expansion of telecommunications infrastructure in recent\r
 282 years has helped to bridge the digital divide to a limited extent.</p> \r
 283 </body>\r
 284 <!-- InstanceEnd --></html>\r
 285 '''
 286
 287         self.check_single(htmlsource, 'The rapid expansion of telecommunications infrastructure in recent years has helped to bridge the digital divide to a limited extent.')
 288
 289     def test_encoding_latin1(self):
 290         """Convert HTML input in iso-8859-1 correctly to unicode."""
 291         htmlsource = '''<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
 292 <html><!-- InstanceBegin template="/Templates/masterpage.dwt" codeOutsideHTMLIsLocked="false" -->
 293 <head>
 294 <!-- InstanceBeginEditable name="doctitle" -->
 295 <title>FMFI - South Africa - CSIR Openphone - Overview</title>
 296 <!-- InstanceEndEditable -->
 297 <meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
 298 <meta name="keywords" content="fmfi, first mile, first inch, wireless, rural development, access devices, mobile devices, wifi, connectivity, rural connectivty, ict, low cost, cheap, digital divide, csir, idrc, community">
 299
 300 <!-- InstanceBeginEditable name="head" -->
 301 <!-- InstanceEndEditable -->
 302 <link href="../../../fmfi.css" rel="stylesheet" type="text/css">
 303 </head>
 304
 305 <body>
 306 <p>We aim to please \x96 will you aim too, please?</p>
 307 <p>South Africa\x92s language diversity can be challenging.</p>
 308 </body>
 309 </html>
 310 '''
 311         pofile = self.html2po(htmlsource)
 312
 313         self.countunits(pofile, 4)
 314         self.compareunit(pofile, 3, u'We aim to please \x96 will you aim too, please?')
 315         self.compareunit(pofile, 4, u'South Africa\x92s language diversity can be challenging.')
 316
 317     def test_strip_html(self):
 318         """Ensure that unnecessary html is stripped from the resulting unit."""
 319
 320         htmlsource = '''<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
 321 <html>
 322 <head>
 323 <title>FMFI - Contact</title>
 324 </head>
 325 <body>
 326 <table width="100%"  border="0" cellpadding="0" cellspacing="0">
 327   <tr align="left" valign="top">
 328     <td width="150" height="556">
 329       <table width="157" height="100%" border="0" cellspacing="0" id="leftmenubg-color">
 330       <tr>
 331           <td align="left" valign="top" height="555">
 332             <table width="100%" border="0" cellspacing="0" cellpadding="2">
 333               <tr align="left" valign="top" bgcolor="#660000">
 334                 <td width="4%"><strong></strong></td>
 335                 <td width="96%"><strong><font class="headingwhite">Projects</font></strong></td>
 336               </tr>
 337               <tr align="left" valign="top">
 338                 <td valign="middle" width="4%"><img src="images/arrow.gif" width="8" height="8"></td>
 339                 <td width="96%"><a href="index.html">Home Page</a></td>
 340               </tr>
 341             </table>
 342           </td>
 343       </tr>
 344     </table></td>
 345 </table>
 346 </body>
 347 </html>
 348 '''
 349         pofile = self.html2po(htmlsource)
 350         self.countunits(pofile, 3)
 351         self.compareunit(pofile, 2, u'Projects')
 352         self.compareunit(pofile, 3, u'Home Page')
 353
 354         # Translate and convert back:
 355         pofile.units[1].target = 'Projekte'
 356         pofile.units[2].target = 'Tuisblad'
 357         htmlresult = self.po2html(str(pofile), htmlsource).replace('\n', ' ').replace('= "', '="').replace('> <', '><')
 358         snippet = '<td width="96%"><strong><font class="headingwhite">Projekte</font></strong></td>'
 359         assert snippet in htmlresult
 360         snippet = '<td width="96%"><a href="index.html">Tuisblad</a></td>'
 361         assert snippet in htmlresult
 362
 363 class TestHTML2POCommand(test_convert.TestConvertCommand, TestHTML2PO):
 364     """Tests running actual html2po commands on files"""
 365     convertmodule = html2po
 366     defaultoptions = {"progress": "none"}
 367
 368     def test_help(self):
 369         """tests getting help"""
 370         options = test_convert.TestConvertCommand.test_help(self)
 371         options = self.help_check(options, "-P, --pot")
 372         options = self.help_check(options, "--duplicates=DUPLICATESTYLE")
 373         options = self.help_check(options, "-u, --untagged", last=True)