1 use Test
::More
'no_plan';
8 use_ok
'Thrasher::XHTML_IM_Normalize', ':all';
9 use_ok
'Thrasher::HTMLNormalize', ':all';
# Upper bound of the random-input loop further down in this script.
12 my $RANDOM_TEST_COUNT = 1000;
# Input for the escaping check.
16 my $x = 'hello <>& &!';
# NOTE(review): $y is not assigned anywhere in the lines visible here;
# presumably it holds the escaped copy of $x -- confirm.
19 isnt
($x, $y, "escape doesn't modify existing strings");
20 is
($y, 'hello <>& &amp;!',
21 'escaping basically works');
24 # We test two cases: full-permission normalization, and the
25 # normalization defined by $CONSERVATIVE_PERMISSIONS. This creates a
26 # series of test cases ($input => $result). The first set is used
27 # in both. %CONSERVATIVE_TESTS defines a series of tests used only for
28 # the conservative permissions, and %FULL_PERM_TESTS defines tests for
29 # the full-permission normalization.
30 # something), but I decided to leave them in for readability.
# NOTE(review): the declaration that this list of input => expected-output
# pairs initialises is not visible here.
33 # Had trouble with this
34 '<ul><li>Test<ul><li>nested</li><li>nested2</li></ul></li><li>moo</li></ul>' =>
35 '<ul><li>Test<ul><li>nested</li><li>nested2</li></ul></li><li>moo</li></ul>',
# Bare text is expected to be wrapped in <p>; a blank line starts a new
# paragraph and a single newline inside a paragraph becomes <br />.
37 't&a' => '<p>t&a</p>',
38 "t\n\na" => "<p>t</p>\n\n<p>a</p>",
39 "This is a test.\nMore.\n\nSo there." =>
40 "<p>This is a test.<br />\nMore.</p>\n\n<p>So there.</p>",
# Input that is already marked up is expected to come back unchanged,
# except that upper-case tag names are lower-cased.
41 "<p>This is a test.<br />\nMore.</p>" =>
42 "<p>This is a test.<br />\nMore.</p>",
43 "<P>This is a test.<BR />\nMore.</P>" =>
44 "<p>This is a test.<br />\nMore.</p>",
# Mis-nested closing tags are expected to be re-emitted in properly nested
# order, and attribute values come back in single quotes.
45 '<b><i>moo</b></i>' => '<p><b><i>moo</i></b></p>',
46 '<b><i><abbr title="moo">argle</b></i></abbr>' =>
47 '<p><b><i><abbr title=\'moo\'>argle</abbr></i></b></p>',
50 # Incomplete CDATA tags get escaped no matter whether we are
51 # allowing CDATA or not
52 '<pre><![CDATA[moo' =>
53 '<pre><![CDATA[moo</pre>',
# Non-ASCII characters are expected to pass through unchanged.
54 '™' => '<p>™</p>',
55 "\x{2033}" => "<p>\x{2033}</p>",
# NOTE(review): this pair is single-quoted, so it is the literal text
# \x{e2} rather than the character U+00E2 -- confirm that is intended.
56 '\x{e2}' => '<p>\x{e2}</p>',
# Tag and attribute names are expected to be lower-cased whatever the
# case of the input (<A href=...> and <A HREF=...> give the same result).
59 '<A href="http://www.wssource.com/cgi-bin/auth/dispatch.cgi?_code=307&alert=123256652117586003&noback=1">[edit]</A>' =>
60 '<p><a href=\'http://www.wssource.com/cgi-bin/auth/dispatch.cgi?_code=307&alert=123256652117586003&noback=1\'>[edit]</a></p>',
61 '<A HREF="http://www.wssource.com/cgi-bin/auth/dispatch.cgi?_code=307&alert=123256652117586003&noback=1">[edit]</A>' =>
62 '<p><a href=\'http://www.wssource.com/cgi-bin/auth/dispatch.cgi?_code=307&alert=123256652117586003&noback=1\'>[edit]</a></p>'
# Input => expected-output pairs that are run with $CONSERVATIVE_PERMISSIONS
# as the allowed-elements argument.
65 my %CONSERVATIVE_TESTS = (
# The id attribute is expected to be stripped.
67 '<p id="moo">test</p>' => '<p>test</p>',
68 '<p><a href="http://www.jerf.org/">test</a></p>' =>
69 "<p><a href='http://www.jerf.org/'>test</a></p>",
# An href without a scheme is expected to come back prefixed with http://.
70 '<p><a href="/moo">test</a></p>' =>
71 "<p><a href='http:///moo'>test</a></p>",
72 '<ul><li>A list element.</li>\n\n<li>Another list '.
73 'element</li></ul>' =>
74 '<ul><li>A list element.</li>\n\n<li>Another list '.
# Unclosed <ul> and <li> elements are expected to be closed.
76 '<ul><li>A list element</li><li>another list element</li>' =>
77 '<ul><li>A list element</li><li>another list element</li></ul>',
78 '<ul><li>A list element</li>\n<li>another list element</li>' =>
79 '<ul><li>A list element</li>\n<li>another list '.
81 '<ul><li>A list element<li>another' =>
82 '<ul><li>A list element</li><li>another</li></ul>',
# Unclosed <p> tags and trailing bare text are each expected to end up as
# their own closed paragraph.
83 "<p>Moo.\n\n<p>More moo.\n\nForgot my p." =>
84 "<p>Moo.</p>\n\n<p>More moo.</p>\n\n<p>Forgot my p.</p>",
# Only the trailing text is expected to survive this nested-script input.
85 '<scr<script>ipt>' => '<p>ipt></p>',
86 '<a href="javascript:alert(\'moo\')">test</a>' =>
88 '<bgsound src="javascript:alert()">moo' =>
90 '<abbr title="oh & my">OM</abbr>' =>
91 "<p><abbr title='oh & my'>OM</abbr></p>"
# Input => expected-output pairs that are run with undef as the
# allowed-elements argument.
94 my %FULL_PERM_TESTS = (
# <h3> is expected to stay outside the automatically added paragraphs.
96 "Moo\n\n<h3>Header</h3>\n\nMoo" =>
97 "<p>Moo</p>\n\n<h3>Header</h3>\n\n<p>Moo</p>",
# Inside a <div>, only the text after the blank line is expected to be
# wrapped in <p>.
98 "<div class=\"NewsItemTitle\">title</div>\n<div ".
99 "class=\"NewsItemBody\">moo\n\nthere</div>" =>
100 "<div class=\'NewsItemTitle\'>title</div>\n<div ".
101 "class=\'NewsItemBody\'>moo\n\n<p>there</p></div>",
# <hr> is expected to come back self-closed as <hr />.
102 '<p>a</p> <hr><p>hello</p>' =>
103 '<p>a</p> <hr /><p>hello</p>',
104 '<table><tr><td>a</td></tr></table>' =>
105 '<table><tr><td>a</td></tr></table>',
106 # Non-quoted attributes handled as expected
# A bare attribute name is expected to get the value '1'; the purely
# numeric tokens (444, 333) are expected to be dropped.
107 '<p name=cheesy 444 poofs>meh</p>' => "<p name='cheesy' poofs='1'>meh</p>",
108 '<p name=cheesy poofs 333>meh</p>' => "<p name='cheesy' poofs='1'>meh</p>",
# NOTE(review): %CDATA_TESTS is passed to the test helper later in this
# script but its declaration is not visible here; this CDATA pair may
# belong to that table -- confirm.
113 '<pre><![CDATA[ & < > <script> & ]]></pre>' =>
114 '<pre><![CDATA[ & < > <script> & ]]></pre>' );
# Pool of fragments that the random-input loop further down picks from and
# concatenates.
# NOTE(review): the end of this list is not visible here.
116 my @RANDOM_HTML_FRAGMENTS =
117 ('<scr', '<i>', '</i>',
118 '<p id="testing">ddd',
119 '<script>alert ()</script>',
120 '<<<', '>>>', "\x{0437}",
122 '¬anentity;', " \n\r\r\n",
123 '<!-- -->', 'rc', '<!--',
125 # An excitingly Unicode-laden string
126 join('', map { chr($_) } 32..1024),
# Runs every input => expected-output pair of a test table through
# normalize() and compares the result with is().
#
# Arguments as visible here and at the call sites:
#   - $allowed_elements: forwarded to normalize() as its second argument;
#     callers pass undef or $CONSERVATIVE_PERMISSIONS.
#   - $allow_cdata: callers pass undef or 1.
# NOTE(review): the assignments of $hash and $name are not among the lines
# visible here; judging by the call sites they are the table reference
# (first argument) and a label (last argument) -- confirm.
130 sub test_data_with_allowed_elements
{
132 my $allowed_elements = shift;
133 my $allow_cdata = shift;
# each() walks the table in no particular order.
136 while (my ($input, $desired_output) = each %$hash) {
137 my $actual_output = normalize
($input, $allowed_elements,
# The label is appended so a failure shows which table it came from.
139 is
($actual_output, $desired_output, 'test with ' . $name);
# Drive the helper once per table: %FULL_PERM_TESTS with undef for both
# option arguments, %CONSERVATIVE_TESTS with $CONSERVATIVE_PERMISSIONS, and
# %CDATA_TESTS with the third argument set to 1.
# NOTE(review): the third argument presumably maps to the helper's
# $allow_cdata parameter -- confirm.
143 test_data_with_allowed_elements
(\
%FULL_PERM_TESTS, undef, undef,
145 test_data_with_allowed_elements
(\
%CONSERVATIVE_TESTS,
146 $CONSERVATIVE_PERMISSIONS, undef,
147 'conservative permissions');
148 test_data_with_allowed_elements
(\
%CDATA_TESTS,
149 undef, 1, 'cdata permissions');
151 # This is primarily a verification that there exists no input that
152 # results in invalid XML. "Sadly", this code doesn't solve the GIGO
# Runs $RANDOM_TEST_COUNT iterations; each one joins 10 fragments drawn at
# random (repeats allowed) from @RANDOM_HTML_FRAGMENTS into one input string.
156 for (my $i = 0; $i < $RANDOM_TEST_COUNT; $i++) {
157 my @random_fragments;
158 for (my $j = 0; $j < 10; $j++) {
159 push(@random_fragments,
# rand() on the array in scalar context gives a value below the element
# count; int() truncates it to a valid index.
160 $RANDOM_HTML_FRAGMENTS[int(rand(@RANDOM_HTML_FRAGMENTS))]);
162 my $random_input = join '', @random_fragments;
163 # Allow both cdata options.
164 normalize
($random_input, $CONSERVATIVE_PERMISSIONS, 0);
165 normalize
($random_input, $CONSERVATIVE_PERMISSIONS, 1);
# NOTE(review): the declaration this list initialises is not visible here;
# the loop below reads it as %allow_taglike. The list holds two extra cases
# followed by every pair from %CONSERVATIVE_TESTS.
171 ('<ping>' => '<p><ping></p>',
173 '<html><body><p>hello</body></html>' => '<p>hello</p>',
174 %CONSERVATIVE_TESTS);
175 # behaves differently
# Replaces the expectation inherited from %CONSERVATIVE_TESTS for this key:
# here the whole input is expected to survive as paragraph text.
176 $allow_taglike{'<scr<script>ipt>'} =
177 '<p><scr<script>ipt></p>';
# Every case is checked twice: normalize's third argument is 0 and then 1,
# while the fourth argument is 1 both times.
# NOTE(review): going by the test descriptions the fourth argument enables
# allow_taglike, and the third is presumably the cdata switch -- confirm.
179 while (my ($input, $desired_output) = each %allow_taglike) {
180 is
(normalize
($input, $CONSERVATIVE_PERMISSIONS,
181 0, 1), $desired_output,
182 'test with allow_taglike');
183 is
(normalize
($input, $CONSERVATIVE_PERMISSIONS,
184 1, 1), $desired_output,
185 'test with allow_taglike');
190 # The preceding is actually the generic test script for
191 # Thrasher::HTMLNormalize. This tests the XHTML modifications, at
192 # least to some degree.
# Labelled bare block holding the checks for xhtml_and_text() and xhtml().
193 ACTUAL_XHTML_TESTS
: {
194 # Make sure the automatic <p> suppression occurs for
# xhtml_and_text() is called in list context and yields two values: the
# XHTML rendering and a plain-text rendering of the same input.
196 my ($xhtml, $text) = xhtml_and_text
("hello!\nhello!\n\nhello!");
197 is
($text, "hello!\nhello!\n\nhello!",
198 'text only correctly suppresses all automatically-added tags');
199 is
($xhtml, "<p>hello!<br />\nhello!</p>\n\n<p>hello!</p>",
200 'xhtml mode still correctly adds them');
202 # Test the translations
# <b> is expected to become <strong>, and the wrapping <p> of the input is
# expected to be absent from the output.
203 $xhtml = xhtml
("<p>hello <b>there</b> buddy</p>");
204 is
($xhtml, "hello <strong>there</strong> buddy",
205 'simple tag normalization works');
# A link is expected to come through unchanged.
207 $xhtml = xhtml
("<a href='http://www.jerf.org/'>Test</a>");
208 is
($xhtml, "<a href='http://www.jerf.org/'>Test</a>",
209 "can handle links correctly.");
# <FONT> is expected to be dropped and <i> rewritten to <em>, with the
# mis-nested closing tags repaired.
211 $xhtml = xhtml
("<FONT FACE='ARIAL'><i>itali</FONT></i>c");
212 is
($xhtml, "<em>itali</em>c",
213 'correctly handles upper-case crap in the tags');