Non-word characters don't terminate tag names.
[mediawiki.git] / includes / normal / UtfNormalTest2.php
blob750c00999b87bbfdddef2660fbc983439ae88984
1 #!/usr/bin/env php
2 <?php
3 /**
4 * Other tests for the unicode normalization module.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
16 * You should have received a copy of the GNU General Public License along
17 * with this program; if not, write to the Free Software Foundation, Inc.,
18 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
19 * http://www.gnu.org/copyleft/gpl.html
21 * @file
22 * @ingroup UtfNormal
// This is a command-line test script; refuse to run under any other SAPI.
if ( PHP_SAPI != 'cli' ) {
	exit( "Run me from the command line please.\n" );
}

// Anything after this character is a comment
define( 'COMMENT', '#' );

// Semicolons are used to separate the columns
define( 'SEPARATOR', ';' );

// Test data, from http://unicode.org/Public/UNIDATA/NormalizationTest.txt
// (the path is resolved relative to the current working directory)
$file = "NormalizationTest.txt";

// $f is false if the data file could not be opened; the main loop checks for that.
$f = fopen( $file, "r" );
40 /**
41 * The following section will be used for testing different normalization methods.
42 * - Pure PHP
43 ~ no assertion errors
44 ~ 6.25 minutes
46 * - php_utfnormal.so or intl extension: both are wrappers around
47 libicu so we list the version of libicu when making the
48 comparison
50 * - libicu Ubuntu 3.8.1-3ubuntu1.1 php 5.2.6-3ubuntu4.5
51 ~ 2200 assertion errors
52 ~ 5 seconds
53 ~ output: http://paste2.org/p/921566
55 * - libicu Ubuntu 4.2.1-3 php 5.3.2-1ubuntu4.2
56 ~ 1384 assertion errors
57 ~ 15 seconds
58 ~ output: http://paste2.org/p/921435
60 * - libicu Debian 4.4.1-5 php 5.3.2-1ubuntu4.2
61 ~ no assertion errors
62 ~ 13 seconds
64 * - Tests comparing pure PHP output with libicu output were added
65 later and slow down the runtime.
68 require_once './UtfNormal.php';
/**
 * Thin wrappers around UtfNormal, one per normalization form, using whatever
 * implementation UtfNormal selects by default.
 *
 * @param string $c Text to normalize
 * @return string Result of the corresponding UtfNormal::toNF* call
 */
function normalize_form_c($c) {
	$normalized = UtfNormal::toNFC($c);
	return $normalized;
}

function normalize_form_d($c) {
	$normalized = UtfNormal::toNFD($c);
	return $normalized;
}

function normalize_form_kc($c) {
	$normalized = UtfNormal::toNFKC($c);
	return $normalized;
}

function normalize_form_kd($c) {
	$normalized = UtfNormal::toNFKD($c);
	return $normalized;
}
74 /**
75 * This set of functions is only useful if you've added a param to the
76 * following functions to force pure PHP usage. I decided not to
77 * commit that code since it might produce a slowdown in the UTF
78 * normalization code just for the sake of these tests. -- hexmode
79 * @return string
// Same wrappers as normalize_form_*(), but passing "php" as a second argument
// to the UtfNormal::toNF* call.
function normalize_form_c_php($c) {
	$normalized = UtfNormal::toNFC($c, "php");
	return $normalized;
}

function normalize_form_d_php($c) {
	$normalized = UtfNormal::toNFD($c, "php");
	return $normalized;
}

function normalize_form_kc_php($c) {
	$normalized = UtfNormal::toNFKC($c, "php");
	return $normalized;
}

function normalize_form_kd_php($c) {
	$normalized = UtfNormal::toNFKD($c, "php");
	return $normalized;
}
// Assertion setup: keep assertions active, silence PHP's own warning on
// failure, and report failures through my_assert() instead.
assert_options( ASSERT_ACTIVE, 1 );
assert_options( ASSERT_WARNING, 0 );

// ASSERT_QUIET_EVAL was removed in PHP 8.0, where a bare reference to the
// constant is a fatal "Undefined constant" error. Only set it where it exists.
if ( defined( 'ASSERT_QUIET_EVAL' ) ) {
	assert_options( constant( 'ASSERT_QUIET_EVAL' ), 1 );
}

assert_options( ASSERT_CALLBACK, 'my_assert' );
/**
 * Assertion-failure callback. Prints the failed expression together with the
 * current data-file line number and the last (sixth) column of the current
 * row, both read from the globals $lineNo and $col.
 *
 * @param string $file Script containing the failed assertion (unused)
 * @param int $line Script line of the failed assertion (unused)
 * @param string $code Text of the assertion that failed
 */
function my_assert( $file, $line, $code ) {
	global $lineNo, $col;

	printf( "Assertion that '%s' failed on line %s (%s)\n", $code, $lineNo, $col[5] );
}
/*
 * Main test loop.
 *
 * For every data row (a row that getRow() splits into exactly six entries:
 * columns c1..c5 plus the trailing description) this:
 *   1. normalizes c1..c5 to NFC, NFD, NFKD and NFKC with both the default and
 *      the "php" implementation and checks that the two agree;
 *   2. checks the results against the expected columns of the row.
 *
 * The checks are done with explicit comparisons instead of assert('...'):
 * string assertions are no longer evaluated as code since PHP 8.0 (a non-empty
 * string is simply truthy), and assert() is skipped altogether when
 * zend.assertions is disabled, so the original checks could pass silently.
 * Failures are still reported through my_assert() with the same expression
 * text and in the same order as before.
 */
$count = 0;
$lineNo = 0;

// Normalization form => suffix of the normalize_form_* helper functions,
// listed in the order in which the implementations are compared.
$forms = array(
	'NFC' => 'c',
	'NFD' => 'd',
	'NFKD' => 'kd',
	'NFKC' => 'kc',
);

// Expected results: array( form, index into $col of the expected value,
// list of input columns (1-based) whose normalization must equal it ).
$invariants = array(
	# c2 == NFC(c1) == NFC(c2) == NFC(c3)
	array( 'NFC', 1, array( 1, 2, 3 ) ),
	# c4 == NFC(c4) == NFC(c5)
	array( 'NFC', 3, array( 4, 5 ) ),
	# c3 == NFD(c1) == NFD(c2) == NFD(c3)
	array( 'NFD', 2, array( 1, 2, 3 ) ),
	# c5 == NFD(c4) == NFD(c5)
	array( 'NFD', 4, array( 4, 5 ) ),
	# c4 == NFKC(c1) == NFKC(c2) == NFKC(c3) == NFKC(c4) == NFKC(c5)
	array( 'NFKC', 3, array( 1, 2, 3, 4, 5 ) ),
	# c5 == NFKD(c1) == NFKD(c2) == NFKD(c3) == NFKD(c4) == NFKD(c5)
	array( 'NFKD', 4, array( 1, 2, 3, 4, 5 ) ),
);

if ( $f !== false ) {
	while ( ( $col = getRow( $f ) ) !== false ) {
		$lineNo++;

		// Comment lines, blank lines and "@Part" headers do not yield six entries.
		if ( count( $col ) != 6 ) {
			continue;
		}

		$count++;
		if ( $count % 100 === 0 ) {
			echo "Count: $count\n";
		}

		# verify that the pure PHP version is correct
		$results = array();
		foreach ( $forms as $form => $suffix ) {
			$defaultImpl = 'normalize_form_' . $suffix;
			$phpImpl = 'normalize_form_' . $suffix . '_php';
			for ( $i = 1; $i <= 5; $i++ ) {
				$out = $defaultImpl( $col[$i - 1] );
				$outPhp = $phpImpl( $col[$i - 1] );
				$results[$form][$i] = $out;
				if ( $out !== $outPhp ) {
					// Same text the original string assertion carried, e.g. '$NFCc1 === $NFCc1p'
					my_assert( __FILE__, __LINE__,
						'$' . $form . 'c' . $i . ' === $' . $form . 'c' . $i . 'p' );
				}
			}
		}

		# verify the results against the expected columns of the row
		foreach ( $invariants as $invariant ) {
			list( $form, $expected, $inputs ) = $invariant;
			foreach ( $inputs as $i ) {
				if ( $col[$expected] !== $results[$form][$i] ) {
					// e.g. '$col[1] === $NFCc1'
					my_assert( __FILE__, __LINE__,
						'$col[' . $expected . '] === $' . $form . 'c' . $i );
				}
			}
		}
	}

	// The handle is not used after this point.
	fclose( $f );
}
echo "done.\n";
209 // Compare against http://en.wikipedia.org/wiki/UTF-8#Description
/**
 * Encode a single code point as a UTF-8 byte sequence.
 *
 * @param int $c Code point
 * @return string|bool One to four bytes, or false if $c is above 0x10FFFF
 */
function unichr($c) {
	// Single byte: the value is emitted as-is.
	if ( $c <= 0x7F ) {
		return chr( $c );
	}

	// Outside the range handled here.
	if ( $c > 0x10FFFF ) {
		return false;
	}

	// Continuation bytes carry six payload bits each, lowest bits last.
	$last = chr( 0x80 | ( $c & 0x3F ) );
	if ( $c <= 0x7FF ) {
		return chr( 0xC0 | ( $c >> 6 ) ) . $last;
	}

	$secondLast = chr( 0x80 | ( ( $c >> 6 ) & 0x3F ) );
	if ( $c <= 0xFFFF ) {
		return chr( 0xE0 | ( $c >> 12 ) ) . $secondLast . $last;
	}

	$thirdLast = chr( 0x80 | ( ( $c >> 12 ) & 0x3F ) );
	return chr( 0xF0 | ( $c >> 18 ) ) . $thirdLast . $secondLast . $last;
}
/**
 * Convert a whitespace-separated list of hexadecimal code points
 * (e.g. "0044 0307") into the string formed by passing each one to unichr().
 *
 * @param string $c Hexadecimal code points separated by whitespace
 * @return string Concatenation of the unichr() results
 */
function unistr($c) {
	// Split on runs of whitespace and drop empty tokens. A plain explode(" ")
	// turns leading, trailing or doubled spaces into empty strings, which
	// hexdec() maps to 0 and unichr() then emits as stray NUL bytes.
	$tokens = preg_split( '/\s+/', $c, -1, PREG_SPLIT_NO_EMPTY );

	return implode( "", array_map( "unichr", array_map( "hexdec", $tokens ) ) );
}
/**
 * Read and parse the next line of the test data file.
 *
 * @param resource $f Open file handle
 * @return array|bool
 *   - false when fgets() returns false (no more input);
 *   - array( $line ) when the line starts with the comment character;
 *   - otherwise each non-blank SEPARATOR-delimited field decoded with
 *     unistr(), followed by one final element holding the comment text
 *     ("" if the line has no comment).
 */
function getRow( $f ) {
	$line = fgets( $f );
	if ( $line === false ) {
		return false;
	}
	$line = rtrim( $line );

	$commentPos = strpos( $line, COMMENT );
	if ( $commentPos === 0 ) {
		// Whole line is a comment.
		return array( $line );
	}

	$description = "";
	if ( $commentPos !== false ) {
		$comment = substr( $line, $commentPos );

		// The comment looks like "# (samples; ...; ) DESCRIPTION". The sample
		// characters can themselves contain ")" (e.g. "(1)"), so cut after the
		// LAST closing parenthesis rather than the first one. Without any
		// parenthesis the whole comment is kept.
		$close = strrpos( $comment, ")" );
		if ( $close !== false ) {
			// Skip the ")" and the space after it. The cast covers older PHP
			// versions where substr() returns false past the end of the string.
			$description = (string)substr( $comment, $close + 2 );
		} else {
			$description = $comment;
		}

		$line = substr( $line, 0, $commentPos );
	}

	$ret = array();
	foreach ( explode( SEPARATOR, $line ) as $field ) {
		$field = trim( $field );
		if ( $field !== "" ) {
			// Decode the trimmed field so padding is not treated as a code point.
			$ret[] = unistr( $field );
		}
	}
	$ret[] = $description;

	return $ret;
}