/**
 * Tests for HTMLPurifier_Encoder: UTF-8 cleaning, conversion to and from
 * UTF-8 under different Core.Encoding settings (with iconv and with the
 * Test.ForceNoIconv fallback), lossless ASCII escaping, the ASCII-support
 * probe, and the chunked iconv wrapper.
 *
 * NOTE(review): the text this class was reviewed from had lost its braces and
 * the closing operands of several assertions. Operands marked with a
 * NOTE(review) below were restored and should be confirmed against upstream.
 */
class HTMLPurifier_EncoderTest extends HTMLPurifier_Harness
{
    /**
     * Entity lookup singleton, fetched again before every test.
     */
    protected $_entity_lookup;

    /**
     * Per-test fixture: grabs the entity lookup, then lets the harness run
     * its own setup.
     */
    public function setUp()
    {
        $this->_entity_lookup = HTMLPurifier_EntityLookup::instance();
        // $this->config and $this->context are read by almost every test but
        // are never assigned in this class, so the harness setup has to run.
        parent::setUp();
    }

    /**
     * Asserts that cleanUTF8() yields $expect both when called with its
     * default arguments and when called with the second argument set to true.
     *
     * @param string      $string Input handed to cleanUTF8().
     * @param string|null $expect Expected output; null means "unchanged".
     */
    public function assertCleanUTF8($string, $expect = null)
    {
        if ($expect === null) {
            $expect = $string;
        }
        $this->assertIdentical(HTMLPurifier_Encoder::cleanUTF8($string), $expect, 'iconv: %s');
        $this->assertIdentical(HTMLPurifier_Encoder::cleanUTF8($string, true), $expect, 'PHP: %s');
    }

    /**
     * cleanUTF8() keeps ordinary text and tab/LF/CR, and strips NUL, other
     * control bytes and invalid or malformed sequences.
     */
    public function test_cleanUTF8()
    {
        $this->assertCleanUTF8('Normal string.');
        $this->assertCleanUTF8("Test\tAllowed\nControl\rCharacters");
        $this->assertCleanUTF8("null byte: \0", 'null byte: ');
        $this->assertCleanUTF8("あ(い)う(え)お\0", "あ(い)う(え)お"); // test for issue #122
        $this->assertCleanUTF8("\1\2\3\4\5\6\7", '');
        $this->assertCleanUTF8("\x7F", ''); // one byte invalid SGML char
        $this->assertCleanUTF8("\xC2\x80", ''); // two byte invalid SGML
        $this->assertCleanUTF8("\xF3\xBF\xBF\xBF"); // valid four byte
        $this->assertCleanUTF8("\xDF\xFF", ''); // malformed UTF8
        $this->assertCleanUTF8("\xED\xB0\x80", '');
    }

    /**
     * With the default configuration the input is passed through untouched,
     * even when it is not valid UTF-8.
     */
    public function test_convertToUTF8_noConvert()
    {
        // UTF-8 means that we don't touch it
        $this->assertIdentical(
            HTMLPurifier_Encoder::convertToUTF8("\xF6", $this->config, $this->context),
            "\xF6", // this is invalid
            'Expected identical [Binary: F6]'
        );
    }

    /**
     * An unknown encoding name raises "Invalid encoding utf99".
     * Only meaningful when iconv is available.
     */
    public function test_convertToUTF8_spuriousEncoding()
    {
        if (!HTMLPurifier_Encoder::iconvAvailable()) {
            return;
        }
        $this->config->set('Core.Encoding', 'utf99');
        $this->expectError('Invalid encoding utf99');
        $this->assertIdentical(
            HTMLPurifier_Encoder::convertToUTF8("\xF6", $this->config, $this->context),
            '' // NOTE(review): restored operand - confirm the failure result is an empty string
        );
    }

    /**
     * ISO-8859-1 byte F6 (o with diaeresis) becomes its UTF-8 form.
     */
    public function test_convertToUTF8_iso8859_1()
    {
        $this->config->set('Core.Encoding', 'ISO-8859-1');
        $this->assertIdentical(
            HTMLPurifier_Encoder::convertToUTF8("\xF6", $this->config, $this->context),
            "\xC3\xB6" // U+00F6 encoded as UTF-8
        );
    }

    /**
     * Same conversion as above with Test.ForceNoIconv switched on.
     */
    public function test_convertToUTF8_withoutIconv()
    {
        $this->config->set('Core.Encoding', 'ISO-8859-1');
        $this->config->set('Test.ForceNoIconv', true);
        $this->assertIdentical(
            HTMLPurifier_Encoder::convertToUTF8("\xF6", $this->config, $this->context),
            "\xC3\xB6" // U+00F6 encoded as UTF-8
        );
    }

    /**
     * Fixture: the two characters U+4E2D U+6587 in UTF-8 followed by an
     * ASCII suffix.
     *
     * @return string
     */
    public function getZhongWen()
    {
        return "\xE4\xB8\xAD\xE6\x96\x87 (Chinese)";
    }

    /**
     * With the default configuration convertFromUTF8() is a pass-through.
     */
    public function test_convertFromUTF8_utf8()
    {
        // UTF-8 means that we don't touch it
        $this->assertIdentical(
            HTMLPurifier_Encoder::convertFromUTF8("\xC3\xB6", $this->config, $this->context),
            "\xC3\xB6"
        );
    }

    /**
     * UTF-8 C3 B6 becomes the single ISO-8859-1 byte F6.
     */
    public function test_convertFromUTF8_iso8859_1()
    {
        $this->config->set('Core.Encoding', 'ISO-8859-1');
        $this->assertIdentical(
            HTMLPurifier_Encoder::convertFromUTF8("\xC3\xB6", $this->config, $this->context),
            "\xF6",
            'Expected identical [Binary: F6]'
        );
    }

    /**
     * Characters with no ISO-8859-1 equivalent, converted through iconv.
     * Only meaningful when iconv is available.
     */
    public function test_convertFromUTF8_iconvNoChars()
    {
        if (!HTMLPurifier_Encoder::iconvAvailable()) {
            return;
        }
        $this->config->set('Core.Encoding', 'ISO-8859-1');
        $this->assertIdentical(
            HTMLPurifier_Encoder::convertFromUTF8($this->getZhongWen(), $this->config, $this->context),
            ' (Chinese)' // NOTE(review): restored operand - unmappable characters assumed dropped
        );
    }

    /**
     * UTF-8 to ISO-8859-1 with Test.ForceNoIconv switched on.
     */
    public function test_convertFromUTF8_phpNormal()
    {
        // Plain PHP implementation has slightly different behavior
        $this->config->set('Core.Encoding', 'ISO-8859-1');
        $this->config->set('Test.ForceNoIconv', true);
        $this->assertIdentical(
            HTMLPurifier_Encoder::convertFromUTF8("\xC3\xB6", $this->config, $this->context),
            "\xF6",
            'Expected identical [Binary: F6]'
        );
    }

    /**
     * Characters with no ISO-8859-1 equivalent with Test.ForceNoIconv
     * switched on.
     */
    public function test_convertFromUTF8_phpNoChars()
    {
        $this->config->set('Core.Encoding', 'ISO-8859-1');
        $this->config->set('Test.ForceNoIconv', true);
        $this->assertIdentical(
            HTMLPurifier_Encoder::convertFromUTF8($this->getZhongWen(), $this->config, $this->context),
            '?? (Chinese)' // NOTE(review): restored operand - one '?' per unmappable character assumed
        );
    }

    /**
     * With Core.EscapeNonASCIICharacters enabled, non-ASCII characters are
     * kept as decimal character references instead of being lost.
     */
    public function test_convertFromUTF8_withProtection()
    {
        // Preserve the characters!
        $this->config->set('Core.Encoding', 'ISO-8859-1');
        $this->config->set('Core.EscapeNonASCIICharacters', true);
        $this->assertIdentical(
            HTMLPurifier_Encoder::convertFromUTF8($this->getZhongWen(), $this->config, $this->context),
            // U+4E2D = 20013, U+6587 = 25991; the expectation must be the
            // escaped form, not the raw characters.
            "&#20013;&#25991; (Chinese)"
        );
    }

    /**
     * Escaping also applies when the target encoding is left at its default.
     */
    public function test_convertFromUTF8_withProtectionButUtf8()
    {
        // Preserve the characters!
        $this->config->set('Core.EscapeNonASCIICharacters', true);
        $this->assertIdentical(
            HTMLPurifier_Encoder::convertFromUTF8($this->getZhongWen(), $this->config, $this->context),
            "&#20013;&#25991; (Chinese)"
        );
    }

    /**
     * convertToASCIIDumbLossless() rewrites each non-ASCII character as a
     * decimal character reference and leaves pure ASCII alone.
     */
    public function test_convertToASCIIDumbLossless()
    {
        // Uppercase thorn letter
        $this->assertIdentical(
            HTMLPurifier_Encoder::convertToASCIIDumbLossless("\xC3\x9Eorn"),
            "&#222;orn" // C3 9E decodes to U+00DE = 222
        );

        $this->assertIdentical(
            HTMLPurifier_Encoder::convertToASCIIDumbLossless("an"),
            "an"
        );

        // test up to four bytes
        $this->assertIdentical(
            HTMLPurifier_Encoder::convertToASCIIDumbLossless("\xF3\xA0\x80\xA0"),
            "&#917536;" // F3 A0 80 A0 decodes to U+E0020 = 917536
        );
    }

    /**
     * Checks testEncodingSupportsASCII() for one encoding, both with and
     * without its second argument set to true. Does nothing when the probe
     * call (second argument true) reports false.
     *
     * @param string $enc Encoding name to test.
     * @param array  $ret Expected result.
     */
    public function assertASCIISupportCheck($enc, $ret)
    {
        $test = HTMLPurifier_Encoder::testEncodingSupportsASCII($enc, true);
        if ($test === false) {
            return;
        }
        // NOTE(review): the expected operand of both assertions was restored as $ret.
        $this->assertIdentical(
            HTMLPurifier_Encoder::testEncodingSupportsASCII($enc),
            $ret
        );
        $this->assertIdentical(
            HTMLPurifier_Encoder::testEncodingSupportsASCII($enc, true),
            $ret
        );
    }

    /**
     * Shift_JIS and JOHAB are only checked when iconv is available; the
     * remaining two encodings are always checked.
     */
    public function test_testEncodingSupportsASCII()
    {
        if (HTMLPurifier_Encoder::iconvAvailable()) {
            $this->assertASCIISupportCheck('Shift_JIS', array("\xC2\xA5" => '\\', "\xE2\x80\xBE" => '~'));
            $this->assertASCIISupportCheck('JOHAB', array("\xE2\x82\xA9" => '\\'));
        }
        $this->assertASCIISupportCheck('ISO-8859-1', array());
        $this->assertASCIISupportCheck('dontexist', array()); // canary
    }

    /**
     * Backslash and tilde under Shift_JIS, converted in both directions.
     * Only meaningful when iconv is available.
     */
    public function testShiftJIS()
    {
        if (!HTMLPurifier_Encoder::iconvAvailable()) {
            return;
        }
        $this->config->set('Core.Encoding', 'Shift_JIS');
        // This actually looks like a Yen, but we're going to treat it differently
        // NOTE(review): both expected operands were restored as '\\~' - confirm.
        $this->assertIdentical(
            HTMLPurifier_Encoder::convertFromUTF8('\\~', $this->config, $this->context),
            '\\~'
        );
        $this->assertIdentical(
            HTMLPurifier_Encoder::convertToUTF8('\\~', $this->config, $this->context),
            '\\~'
        );
    }

    /**
     * A long string that starts with an unmappable character must keep all
     * 10000 trailing characters. Runs only when iconv is available and
     * testIconvTruncateBug() reports ICONV_TRUNCATES.
     */
    public function testIconvTruncateBug()
    {
        if (!HTMLPurifier_Encoder::iconvAvailable()) {
            return;
        }
        if (HTMLPurifier_Encoder::testIconvTruncateBug() !== HTMLPurifier_Encoder::ICONV_TRUNCATES) {
            return;
        }
        $this->config->set('Core.Encoding', 'ISO-8859-1');
        $this->assertIdentical(
            HTMLPurifier_Encoder::convertFromUTF8("\xE4\xB8\xAD" . str_repeat('a', 10000), $this->config, $this->context),
            str_repeat('a', 10000)
        );
    }

    /**
     * Calls the iconv() wrapper with a fourth argument of 4, placing 2-, 3-
     * and 4-byte characters at different offsets in the input.
     * Runs under the same preconditions as testIconvTruncateBug().
     */
    public function testIconvChunking()
    {
        if (!HTMLPurifier_Encoder::iconvAvailable()) {
            return;
        }
        if (HTMLPurifier_Encoder::testIconvTruncateBug() !== HTMLPurifier_Encoder::ICONV_TRUNCATES) {
            return;
        }
        $this->assertIdentical(HTMLPurifier_Encoder::iconv('utf-8', 'iso-8859-1//IGNORE', "a\xF3\xA0\x80\xA0b", 4), 'ab');
        $this->assertIdentical(HTMLPurifier_Encoder::iconv('utf-8', 'iso-8859-1//IGNORE', "aa\xE4\xB8\xADb", 4), 'aab');
        $this->assertIdentical(HTMLPurifier_Encoder::iconv('utf-8', 'iso-8859-1//IGNORE', "aaa\xCE\xB1b", 4), 'aaab');
        $this->assertIdentical(HTMLPurifier_Encoder::iconv('utf-8', 'iso-8859-1//IGNORE', "aaaa\xF3\xA0\x80\xA0b", 4), 'aaaab');
        $this->assertIdentical(HTMLPurifier_Encoder::iconv('utf-8', 'iso-8859-1//IGNORE', "aaaa\xE4\xB8\xADb", 4), 'aaaab');
        $this->assertIdentical(HTMLPurifier_Encoder::iconv('utf-8', 'iso-8859-1//IGNORE', "aaaa\xCE\xB1b", 4), 'aaaab');
    }
}
// vim: et sw=4 sts=4