// Base class for command-line maintenance scripts.
require_once( dirname( __FILE__ ) . '/../Maintenance.php' );
// Provides hexSequenceToUtf8(), used below to build the replacement pairs.
require_once( dirname( __FILE__ ) . '/../../includes/normal/UtfNormalUtil.php' );
/**
 * Generates normalizer data files for Arabic and Malayalam.
 * For NFC see includes/normal.
 *
 * Reads UnicodeData.txt and writes two serialized replacement tables,
 * normalize-ar.ser and normalize-ml.ser, into $IP/serialized.
 */
class GenerateNormalizerData extends Maintenance {
	/** @var string Path of the UnicodeData.txt file to read */
	public $dataFile;

	public function __construct() {
		parent::__construct();
		$this->addOption( 'unicode-data-file', 'The local location of the data file ' .
			'from http://unicode.org/Public/UNIDATA/UnicodeData.txt', false, true );
	}

	/**
	 * Locate the data file (either the --unicode-data-file option or
	 * UnicodeData.txt in the current directory) and generate both tables.
	 */
	public function execute() {
		if ( !$this->hasOption( 'unicode-data-file' ) ) {
			$this->dataFile = 'UnicodeData.txt';
			if ( !file_exists( $this->dataFile ) ) {
				$this->error( "Unable to find UnicodeData.txt. Please specify its location with --unicode-data-file=<FILE>" );
				exit( 1 );
			}
		} else {
			$this->dataFile = $this->getOption( 'unicode-data-file' );
			if ( !file_exists( $this->dataFile ) ) {
				$this->error( 'Unable to find the specified data file.' );
				exit( 1 );
			}
		}

		$this->generateArabic();
		$this->generateMalayalam();
	}

	/**
	 * Build the Arabic table: every code point in the Arabic presentation
	 * forms ranges that has a decomposition is mapped to that decomposition.
	 */
	function generateArabic() {
		$file = fopen( $this->dataFile, 'r' );
		if ( !$file ) {
			$this->error( 'Unable to open the data file.' );
			exit( 1 );
		}

		// For the file format, see http://www.unicode.org/reports/tr44/
		// Only 'Code' and 'Decomposition_Type_Mapping' are read below; the
		// other names just keep the positions aligned with the file's fields.
		$fieldNames = array(
			'Code',
			'Name',
			'General_Category',
			'Canonical_Combining_Class',
			'Bidi_Class',
			'Decomposition_Type_Mapping',
			'Numeric_Type_Value_6',
			'Numeric_Type_Value_7',
			'Numeric_Type_Value_8',
			'Bidi_Mirrored',
			'Unicode_1_Name',
			'ISO_Comment',
			'Simple_Uppercase_Mapping',
			'Simple_Lowercase_Mapping',
			'Simple_Titlecase_Mapping'
		);
		$fieldCount = count( $fieldNames );

		$pairs = array();

		$lineNum = 0;
		while ( false !== ( $line = fgets( $file ) ) ) {
			++$lineNum;

			# Strip comments
			$line = trim( substr( $line, 0, strcspn( $line, '#' ) ) );
			if ( $line === '' ) {
				continue;
			}

			# Split fields; pad short lines so that every name has a value
			$numberedData = array_pad( explode( ';', $line ), $fieldCount, '' );
			$data = array();
			foreach ( $fieldNames as $number => $name ) {
				$data[$name] = $numberedData[$number];
			}

			$code = hexdec( $data['Code'] );
			$isPresentationForm =
				( $code >= 0xFB50 && $code <= 0xFDFF ) # Arabic presentation forms A
				|| ( $code >= 0xFE70 && $code <= 0xFEFF ); # Arabic presentation forms B
			if ( !$isPresentationForm ) {
				continue;
			}

			if ( $data['Decomposition_Type_Mapping'] === '' ) {
				// No decomposition
				continue;
			}

			// The field looks like "<tag> XXXX XXXX ..."; $m[2] is the hex sequence.
			if ( !preg_match( '/^ *(<\w*>) +([0-9A-F ]*)$/',
				$data['Decomposition_Type_Mapping'], $m ) )
			{
				$this->error( "Can't parse Decomposition_Type/Mapping on line $lineNum" );
				$this->error( $line );
				continue;
			}

			$source = hexSequenceToUtf8( $data['Code'] );
			$dest = hexSequenceToUtf8( $m[2] );
			$pairs[$source] = $dest;
		}

		fclose( $file );

		$this->writePairs( 'ar', $pairs );
	}

	/**
	 * Build the Malayalam table from a fixed list that maps
	 * "consonant + U+0D4D + U+200D" sequences to single chillu code points.
	 */
	function generateMalayalam() {
		$hexPairs = array(
			# From http://unicode.org/versions/Unicode5.1.0/#Malayalam_Chillu_Characters
			'0D23 0D4D 200D' => '0D7A',
			'0D28 0D4D 200D' => '0D7B',
			'0D30 0D4D 200D' => '0D7C',
			'0D32 0D4D 200D' => '0D7D',
			'0D33 0D4D 200D' => '0D7E',

			# From http://permalink.gmane.org/gmane.science.linguistics.wikipedia.technical/46413
			'0D15 0D4D 200D' => '0D7F',
		);

		$pairs = array();
		foreach ( $hexPairs as $hexSource => $hexDest ) {
			$source = hexSequenceToUtf8( $hexSource );
			$dest = hexSequenceToUtf8( $hexDest );
			$pairs[$source] = $dest;
		}

		$this->writePairs( 'ml', $pairs );
	}

	/**
	 * Serialize a replacement table to $IP/serialized/normalize-<code>.ser
	 * and report how many pairs were written.
	 *
	 * @param $langCode String: language code used in the file name and report
	 * @param $pairs Array: map of source string => replacement string
	 */
	private function writePairs( $langCode, array $pairs ) {
		global $IP;

		$path = "$IP/serialized/normalize-$langCode.ser";
		if ( file_put_contents( $path, serialize( $pairs ) ) === false ) {
			// Do not claim success when nothing reached the disk.
			$this->error( "Unable to write $path" );
			exit( 1 );
		}
		echo "$langCode: " . count( $pairs ) . " pairs written.\n";
	}
}
// Name the class to run, then include the file that DO_MAINTENANCE points at.
// NOTE(review): presumably that file instantiates $maintClass and calls execute() — confirm in Maintenance.php.
$maintClass = 'GenerateNormalizerData';
require_once( DO_MAINTENANCE );