Merge "De-duplicate pages in replaceInternal"
[mediawiki.git] / includes / normal / UtfNormalTest2.php
blob53e68c293f7731cb1461b574ec36d8f6cc7d3686
#!/usr/bin/env php
<?php
/**
 * Other tests for the unicode normalization module.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 * http://www.gnu.org/copyleft/gpl.html
 *
 * @file
 * @ingroup UtfNormal
 */
// This is a command-line test driver; refuse to run under a web SAPI.
if ( PHP_SAPI != 'cli' ) {
	die( "Run me from the command line please.\n" );
}

// From http://unicode.org/Public/UNIDATA/NormalizationTest.txt
$file = "NormalizationTest.txt";

// Anything after this character is a comment
define( 'COMMENT', '#' );

// Semicolons are used to separate the columns
define( 'SEPARATOR', ';' );

// Opened relative to the current working directory; $f is false on failure.
$f = fopen( $file, "r" );

/**
 * The following section will be used for testing different normalization methods.
 * - Pure PHP
 *     ~ no assertion errors
 *     ~ 6.25 minutes
 * - php_utfnormal.so or intl extension: both are wrappers around
 *   libicu so we list the version of libicu when making the
 *   comparison
 * - libicu Ubuntu 3.8.1-3ubuntu1.1 php 5.2.6-3ubuntu4.5
 *     ~ 2200 assertion errors
 *     ~ 5 seconds
 *     ~ output: http://paste2.org/p/921566
 * - libicu Ubuntu 4.2.1-3 php 5.3.2-1ubuntu4.2
 *     ~ 1384 assertion errors
 *     ~ 15 seconds
 *     ~ output: http://paste2.org/p/921435
 * - libicu Debian 4.4.1-5 php 5.3.2-1ubuntu4.2
 *     ~ no assertion errors
 *     ~ 13 seconds
 * - Tests comparing pure PHP output with libicu output were added
 *   later and slow down the runtime.
 */

require_once './UtfNormal.php';
/**
 * Normalize a string to form C via UtfNormal::toNFC().
 *
 * @param string $c Input string
 * @return string Whatever UtfNormal::toNFC() returns for $c
 */
function normalize_form_c( $c ) {
	return UtfNormal::toNFC( $c );
}
/**
 * Normalize a string to form D via UtfNormal::toNFD().
 *
 * @param string $c Input string
 * @return string Whatever UtfNormal::toNFD() returns for $c
 */
function normalize_form_d( $c ) {
	return UtfNormal::toNFD( $c );
}
/**
 * Normalize a string to form KC via UtfNormal::toNFKC().
 *
 * @param string $c Input string
 * @return string Whatever UtfNormal::toNFKC() returns for $c
 */
function normalize_form_kc( $c ) {
	return UtfNormal::toNFKC( $c );
}
/**
 * Normalize a string to form KD via UtfNormal::toNFKD().
 *
 * @param string $c Input string
 * @return string Whatever UtfNormal::toNFKD() returns for $c
 */
function normalize_form_kd( $c ) {
	return UtfNormal::toNFKD( $c );
}
/**
 * This set of functions is only useful if you've added a param to the
 * following functions to force pure PHP usage. I decided not to
 * commit that code since it might produce a slowdown in the UTF
 * normalization code just for the sake of these tests. -- hexmode
 *
 * @param string $c Input string
 * @return string
 */
function normalize_form_c_php( $c ) {
	// The second argument is the "force pure PHP" switch described above.
	return UtfNormal::toNFC( $c, "php" );
}
/**
 * Form D counterpart of normalize_form_d() that passes "php" as a second
 * argument to UtfNormal::toNFD().
 *
 * @param string $c Input string
 * @return string
 */
function normalize_form_d_php( $c ) {
	return UtfNormal::toNFD( $c, "php" );
}
/**
 * Form KC counterpart of normalize_form_kc() that passes "php" as a second
 * argument to UtfNormal::toNFKC().
 *
 * @param string $c Input string
 * @return string
 */
function normalize_form_kc_php( $c ) {
	return UtfNormal::toNFKC( $c, "php" );
}
/**
 * Form KD counterpart of normalize_form_kd() that passes "php" as a second
 * argument to UtfNormal::toNFKD().
 *
 * @param string $c Input string
 * @return string
 */
function normalize_form_kd_php( $c ) {
	return UtfNormal::toNFKD( $c, "php" );
}
// Route assertion failures to my_assert() instead of PHP warnings.
assert_options( ASSERT_ACTIVE, 1 );
assert_options( ASSERT_WARNING, 0 );
// ASSERT_QUIET_EVAL no longer exists on PHP 8, where referencing the
// undefined constant would abort the script; only set it where available.
if ( defined( 'ASSERT_QUIET_EVAL' ) ) {
	assert_options( constant( 'ASSERT_QUIET_EVAL' ), 1 );
}
assert_options( ASSERT_CALLBACK, 'my_assert' );
/**
 * Assertion callback: print the failed expression together with the
 * current data-file line number and that row's trailing comment.
 *
 * Reads the globals $lineNo and $col (index 5 is used as the row comment).
 *
 * @param string $file Script file of the failed assertion (unused)
 * @param int $line Script line of the failed assertion (unused)
 * @param string|null $code Text of the failed assertion expression
 * @param string|null $desc Optional assertion description; used in the
 *   message only when $code is empty
 */
function my_assert( $file, $line, $code, $desc = null ) {
	// @codingStandardsIgnoreStart MediaWiki.NamingConventions.ValidGlobalName.wgPrefix
	global $col, $lineNo;
	// @codingStandardsIgnoreEnd

	// Fall back to the description when no expression text was supplied
	if ( ( $code === null || $code === '' ) && $desc !== null ) {
		$code = $desc;
	}

	echo "Assertion that '$code' failed on line $lineNo ($col[5])\n";
}
$count = 0;
$lineNo = 0;

// Normalizer pairs per form: array( default implementation, "php" variant ),
// listed in the order the implementation comparison is run.
$normalizers = array(
	'NFC' => array( 'normalize_form_c', 'normalize_form_c_php' ),
	'NFD' => array( 'normalize_form_d', 'normalize_form_d_php' ),
	'NFKD' => array( 'normalize_form_kd', 'normalize_form_kd_php' ),
	'NFKC' => array( 'normalize_form_kc', 'normalize_form_kc_php' ),
);

// Conformance invariants, each as
// array( index into $col of the expected value, form, source columns cN ).
$invariants = array(
	// c2 == NFC(c1) == NFC(c2) == NFC(c3)
	array( 1, 'NFC', array( 1, 2, 3 ) ),
	// c4 == NFC(c4) == NFC(c5)
	array( 3, 'NFC', array( 4, 5 ) ),
	// c3 == NFD(c1) == NFD(c2) == NFD(c3)
	array( 2, 'NFD', array( 1, 2, 3 ) ),
	// c5 == NFD(c4) == NFD(c5)
	array( 4, 'NFD', array( 4, 5 ) ),
	// c4 == NFKC(c1) == NFKC(c2) == NFKC(c3) == NFKC(c4) == NFKC(c5)
	array( 3, 'NFKC', array( 1, 2, 3, 4, 5 ) ),
	// c5 == NFKD(c1) == NFKD(c2) == NFKD(c3) == NFKD(c4) == NFKD(c5)
	array( 4, 'NFKD', array( 1, 2, 3, 4, 5 ) ),
);

if ( $f !== false ) {
	while ( ( $col = getRow( $f ) ) !== false ) {
		$lineNo++;

		// Only rows with five data columns plus the trailing comment are test cases
		if ( count( $col ) != 6 ) {
			continue;
		}
		$count++;
		if ( $count % 100 === 0 ) {
			echo "Count: $count\n";
		}

		// The checks below are explicit comparisons rather than
		// assert( '<code string>' ): string assertions are not evaluated on
		// PHP 8 and assert() may be compiled out entirely, either of which
		// would make every check pass silently. Messages are unchanged.
		$results = array();

		# verify that the pure PHP version is correct
		foreach ( $normalizers as $form => $pair ) {
			for ( $i = 1; $i <= 5; $i++ ) {
				$default = call_user_func( $pair[0], $col[$i - 1] );
				$purePhp = call_user_func( $pair[1], $col[$i - 1] );
				$results[$form][$i] = $default;
				if ( $default !== $purePhp ) {
					$code = '$' . $form . 'c' . $i . ' === $' . $form . 'c' . $i . 'p';
					echo "Assertion that '$code' failed on line $lineNo ($col[5])\n";
				}
			}
		}

		# verify the conformance invariants of the test data
		foreach ( $invariants as $rule ) {
			list( $expected, $form, $sources ) = $rule;
			foreach ( $sources as $i ) {
				if ( $col[$expected] !== $results[$form][$i] ) {
					$code = '$col[' . $expected . '] === $' . $form . 'c' . $i;
					echo "Assertion that '$code' failed on line $lineNo ($col[5])\n";
				}
			}
		}
	}
	fclose( $f );
} else {
	// Without this the script would just print "done." having tested nothing
	fwrite( STDERR, "Could not open $file for reading.\n" );
}
echo "done.\n";
/**
 * Encode a single code point as a UTF-8 byte string.
 *
 * Compare against http://en.wikipedia.org/wiki/UTF-8#Description
 *
 * @param int $c Code point
 * @return string|bool The 1-4 byte UTF-8 sequence, or false if $c is
 *   negative or greater than 0x10FFFF
 */
function unichr( $c ) {
	if ( $c < 0 ) {
		// Would otherwise fall into the single-byte branch below
		return false;
	} elseif ( $c <= 0x7F ) {
		return chr( $c );
	} elseif ( $c <= 0x7FF ) {
		return chr( 0xC0 | ( $c >> 6 ) )
			. chr( 0x80 | ( $c & 0x3F ) );
	} elseif ( $c <= 0xFFFF ) {
		return chr( 0xE0 | ( $c >> 12 ) )
			. chr( 0x80 | ( ( $c >> 6 ) & 0x3F ) )
			. chr( 0x80 | ( $c & 0x3F ) );
	} elseif ( $c <= 0x10FFFF ) {
		return chr( 0xF0 | ( $c >> 18 ) )
			. chr( 0x80 | ( ( $c >> 12 ) & 0x3F ) )
			. chr( 0x80 | ( ( $c >> 6 ) & 0x3F ) )
			. chr( 0x80 | ( $c & 0x3F ) );
	} else {
		return false;
	}
}
/**
 * Convert a whitespace-separated list of hexadecimal code points
 * (e.g. "0044 0307") into the corresponding UTF-8 string.
 *
 * @param string $c Hex code points separated by whitespace
 * @return string Concatenation of unichr() for each code point; code points
 *   unichr() rejects contribute nothing
 */
function unistr( $c ) {
	// Drop empty tokens so stray or repeated whitespace is not turned into
	// U+0000 (hexdec( "" ) is 0).
	$tokens = preg_split( '/\s+/', $c, -1, PREG_SPLIT_NO_EMPTY );

	return implode( "", array_map( "unichr", array_map( "hexdec", $tokens ) ) );
}
/**
 * Read and parse the next line of the test data file.
 *
 * @param resource $f Open file handle
 * @return array|bool false at end of file; a one-element array holding the
 *   raw line when the line starts with COMMENT; otherwise the non-empty
 *   SEPARATOR-delimited columns decoded with unistr(), followed by the
 *   line's trailing comment text ("" if it has none)
 */
function getRow( $f ) {
	$row = fgets( $f );
	if ( $row === false ) {
		return false;
	}
	$row = rtrim( $row );

	$pos = strpos( $row, COMMENT );
	if ( $pos === 0 ) {
		// Whole line is a comment
		return array( $row );
	}

	$c = "";
	if ( $pos !== false ) {
		$comment = substr( $row, $pos );
		// Keep only the text after the closing ") " of the comment. The
		// LAST ")" is used because the text before it may itself contain
		// ")" characters.
		$close = strrpos( $comment, ")" );
		if ( $close !== false ) {
			$c = (string)substr( $comment, $close + 2 );
		} else {
			$c = $comment;
		}
		$row = substr( $row, 0, $pos );
	}

	$ret = array();
	foreach ( explode( SEPARATOR, $row ) as $ent ) {
		if ( trim( $ent ) !== "" ) {
			$ret[] = unistr( $ent );
		}
	}
	$ret[] = $c;

	return $ret;
}