Non-word characters don't terminate tag names.
[mediawiki.git] / languages / utils / CLDRPluralRuleEvaluator.php
blob43e8aaf7d337c8b01d055a2c1172e7d64010271e
1 <?php
2 /**
3 * Parse and evaluate a plural rule.
5 * http://unicode.org/reports/tr35/#Language_Plural_Rules
7 * @author Niklas Laxstrom, Tim Starling
9 * @copyright Copyright © 2010-2012, Niklas Laxström
10 * @license http://www.gnu.org/copyleft/gpl.html GNU General Public License 2.0
11 * or later
13 * This program is free software; you can redistribute it and/or modify
14 * it under the terms of the GNU General Public License as published by
15 * the Free Software Foundation; either version 2 of the License, or
16 * (at your option) any later version.
18 * This program is distributed in the hope that it will be useful,
19 * but WITHOUT ANY WARRANTY; without even the implied warranty of
20 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 * GNU General Public License for more details.
23 * You should have received a copy of the GNU General Public License along
24 * with this program; if not, write to the Free Software Foundation, Inc.,
25 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
26 * http://www.gnu.org/copyleft/gpl.html
29 * @file
30 * @since 1.20
33 class CLDRPluralRuleEvaluator {
34 /**
35 * Evaluate a number against a set of plural rules. If a rule passes,
36 * return the index of plural rule.
38 * @param int The number to be evaluated against the rules
39 * @param array The associative array of plural rules in pluralform => rule format.
40 * @return int The index of the plural form which passed the evaluation
42 public static function evaluate( $number, array $rules ) {
43 $rules = self::compile( $rules );
44 return self::evaluateCompiled( $number, $rules );
47 /**
48 * Convert a set of rules to a compiled form which is optimised for
49 * fast evaluation. The result will be an array of strings, and may be cached.
51 * @param $rules The rules to compile
52 * @return An array of compile rules.
54 public static function compile( array $rules ) {
55 // We can't use array_map() for this because it generates a warning if
56 // there is an exception.
57 foreach ( $rules as &$rule ) {
58 $rule = CLDRPluralRuleConverter::convert( $rule );
60 return $rules;
63 /**
64 * Evaluate a compiled set of rules returned by compile(). Do not allow
65 * the user to edit the compiled form, or else PHP errors may result.
67 public static function evaluateCompiled( $number, array $rules ) {
68 // The compiled form is RPN, with tokens strictly delimited by
69 // spaces, so this is a simple RPN evaluator.
70 foreach ( $rules as $i => $rule ) {
71 $stack = array();
72 $zero = ord( '0' );
73 $nine = ord( '9' );
74 foreach ( StringUtils::explode( ' ', $rule ) as $token ) {
75 $ord = ord( $token );
76 if ( $token === 'n' ) {
77 $stack[] = $number;
78 } elseif ( $ord >= $zero && $ord <= $nine ) {
79 $stack[] = intval( $token );
80 } else {
81 $right = array_pop( $stack );
82 $left = array_pop( $stack );
83 $result = self::doOperation( $token, $left, $right );
84 $stack[] = $result;
87 if ( $stack[0] ) {
88 return $i;
91 // None of the provided rules match. The number belongs to caregory
92 // 'other' which comes last.
93 return count( $rules );
96 /**
97 * Do a single operation
99 * @param $token string The token string
100 * @param $left The left operand. If it is an object, its state may be destroyed.
101 * @param $right The right operand
102 * @throws CLDRPluralRuleError
103 * @return mixed
105 private static function doOperation( $token, $left, $right ) {
106 if ( in_array( $token, array( 'in', 'not-in', 'within', 'not-within' ) ) ) {
107 if ( !( $right instanceof CLDRPluralRuleEvaluator_Range ) ) {
108 $right = new CLDRPluralRuleEvaluator_Range( $right );
111 switch ( $token ) {
112 case 'or':
113 return $left || $right;
114 case 'and':
115 return $left && $right;
116 case 'is':
117 return $left == $right;
118 case 'is-not':
119 return $left != $right;
120 case 'in':
121 return $right->isNumberIn( $left );
122 case 'not-in':
123 return !$right->isNumberIn( $left );
124 case 'within':
125 return $right->isNumberWithin( $left );
126 case 'not-within':
127 return !$right->isNumberWithin( $left );
128 case 'mod':
129 if ( is_int( $left ) ) {
130 return (int) fmod( $left, $right );
132 return fmod( $left, $right );
133 case ',':
134 if ( $left instanceof CLDRPluralRuleEvaluator_Range ) {
135 $range = $left;
136 } else {
137 $range = new CLDRPluralRuleEvaluator_Range( $left );
139 $range->add( $right );
140 return $range;
141 case '..':
142 return new CLDRPluralRuleEvaluator_Range( $left, $right );
143 default:
144 throw new CLDRPluralRuleError( "Invalid RPN token" );
150 * Evaluator helper class representing a range list.
152 class CLDRPluralRuleEvaluator_Range {
153 public $parts = array();
155 function __construct( $start, $end = false ) {
156 if ( $end === false ) {
157 $this->parts[] = $start;
158 } else {
159 $this->parts[] = array( $start, $end );
164 * Determine if the given number is inside the range. If $integerConstraint
165 * is true, the number must additionally be an integer if it is to match
166 * any interval part.
168 function isNumberIn( $number, $integerConstraint = true ) {
169 foreach ( $this->parts as $part ) {
170 if ( is_array( $part ) ) {
171 if ( ( !$integerConstraint || floor( $number ) === (float)$number )
172 && $number >= $part[0] && $number <= $part[1] )
174 return true;
176 } else {
177 if ( $number == $part ) {
178 return true;
182 return false;
186 * Readable alias for isNumberIn( $number, false ), and the implementation
187 * of the "within" operator.
189 function isNumberWithin( $number ) {
190 return $this->isNumberIn( $number, false );
194 * Add another part to this range. The supplied new part may either be a
195 * range object itself, or a single number.
197 function add( $other ) {
198 if ( $other instanceof self ) {
199 $this->parts = array_merge( $this->parts, $other->parts );
200 } else {
201 $this->parts[] = $other;
206 * For debugging
208 function __toString() {
209 $s = 'Range(';
210 foreach ( $this->parts as $i => $part ) {
211 if ( $i ) {
212 $s .= ', ';
214 if ( is_array( $part ) ) {
215 $s .= $part[0] . '..' . $part[1];
216 } else {
217 $s .= $part;
220 $s .= ')';
221 return $s;
227 * Helper class for converting rules to reverse polish notation (RPN).
229 class CLDRPluralRuleConverter {
230 public $rule, $pos, $end;
231 public $operators = array();
232 public $operands = array();
235 * Precedence levels. Note that there's no need to worry about associativity
236 * for the level 4 operators, since they return boolean and don't accept
237 * boolean inputs.
239 static $precedence = array(
240 'or' => 2,
241 'and' => 3,
242 'is' => 4,
243 'is-not' => 4,
244 'in' => 4,
245 'not-in' => 4,
246 'within' => 4,
247 'not-within' => 4,
248 'mod' => 5,
249 ',' => 6,
250 '..' => 7,
254 * A character list defining whitespace, for use in strspn() etc.
256 const WHITESPACE_CLASS = " \t\r\n";
259 * Same for digits. Note that the grammar given in UTS #35 doesn't allow
260 * negative numbers or decimals.
262 const NUMBER_CLASS = '0123456789';
265 * An anchored regular expression which matches a word at the current offset.
267 const WORD_REGEX = '/[a-zA-Z]+/A';
270 * Convert a rule to RPN. This is the only public entry point.
272 public static function convert( $rule ) {
273 $parser = new self( $rule );
274 return $parser->doConvert();
278 * Private constructor.
280 protected function __construct( $rule ) {
281 $this->rule = $rule;
282 $this->pos = 0;
283 $this->end = strlen( $rule );
287 * Do the operation.
289 protected function doConvert() {
290 $expectOperator = true;
292 // Iterate through all tokens, saving the operators and operands to a
293 // stack per Dijkstra's shunting yard algorithm.
294 while ( false !== ( $token = $this->nextToken() ) ) {
295 // In this grammar, there are only binary operators, so every valid
296 // rule string will alternate between operator and operand tokens.
297 $expectOperator = !$expectOperator;
299 if ( $token instanceof CLDRPluralRuleConverter_Expression ) {
300 // Operand
301 if ( $expectOperator ) {
302 $token->error( 'unexpected operand' );
304 $this->operands[] = $token;
305 continue;
306 } else {
307 // Operator
308 if ( !$expectOperator ) {
309 $token->error( 'unexpected operator' );
311 // Resolve higher precedence levels
312 $lastOp = end( $this->operators );
313 while ( $lastOp && self::$precedence[$token->name] <= self::$precedence[$lastOp->name] ) {
314 $this->doOperation( $lastOp, $this->operands );
315 array_pop( $this->operators );
316 $lastOp = end( $this->operators );
318 $this->operators[] = $token;
322 // Finish off the stack
323 while ( $op = array_pop( $this->operators ) ) {
324 $this->doOperation( $op, $this->operands );
327 // Make sure the result is sane. The first case is possible for an empty
328 // string input, the second should be unreachable.
329 if ( !count( $this->operands ) ) {
330 $this->error( 'condition expected' );
331 } elseif ( count( $this->operands ) > 1 ) {
332 $this->error( 'missing operator or too many operands' );
335 $value = $this->operands[0];
336 if ( $value->type !== 'boolean' ) {
337 $this->error( 'the result must have a boolean type' );
340 return $this->operands[0]->rpn;
344 * Fetch the next token from the input string. Return it as a
345 * CLDRPluralRuleConverter_Fragment object.
347 protected function nextToken() {
348 if ( $this->pos >= $this->end ) {
349 return false;
352 // Whitespace
353 $length = strspn( $this->rule, self::WHITESPACE_CLASS, $this->pos );
354 $this->pos += $length;
356 if ( $this->pos >= $this->end ) {
357 return false;
360 // Number
361 $length = strspn( $this->rule, self::NUMBER_CLASS, $this->pos );
362 if ( $length !== 0 ) {
363 $token = $this->newNumber( substr( $this->rule, $this->pos, $length ), $this->pos );
364 $this->pos += $length;
365 return $token;
368 // Comma
369 if ( $this->rule[$this->pos] === ',' ) {
370 $token = $this->newOperator( ',', $this->pos, 1 );
371 $this->pos ++;
372 return $token;
375 // Dot dot
376 if ( substr( $this->rule, $this->pos, 2 ) === '..' ) {
377 $token = $this->newOperator( '..', $this->pos, 2 );
378 $this->pos += 2;
379 return $token;
382 // Word
383 if ( !preg_match( self::WORD_REGEX, $this->rule, $m, 0, $this->pos ) ) {
384 $this->error( 'unexpected character "' . $this->rule[$this->pos] . '"' );
386 $word1 = strtolower( $m[0] );
387 $word2 = '';
388 $nextTokenPos = $this->pos + strlen( $word1 );
389 if ( $word1 === 'not' || $word1 === 'is' ) {
390 // Look ahead one word
391 $nextTokenPos += strspn( $this->rule, self::WHITESPACE_CLASS, $nextTokenPos );
392 if ( $nextTokenPos < $this->end
393 && preg_match( self::WORD_REGEX, $this->rule, $m, 0, $nextTokenPos ) )
395 $word2 = strtolower( $m[0] );
396 $nextTokenPos += strlen( $word2 );
400 // Two-word operators like "is not" take precedence over single-word operators like "is"
401 if ( $word2 !== '' ) {
402 $bothWords = "{$word1}-{$word2}";
403 if ( isset( self::$precedence[$bothWords] ) ) {
404 $token = $this->newOperator( $bothWords, $this->pos, $nextTokenPos - $this->pos );
405 $this->pos = $nextTokenPos;
406 return $token;
410 // Single-word operators
411 if ( isset( self::$precedence[$word1] ) ) {
412 $token = $this->newOperator( $word1, $this->pos, strlen( $word1 ) );
413 $this->pos += strlen( $word1 );
414 return $token;
417 // The special numerical keyword "n"
418 if ( $word1 === 'n' ) {
419 $token = $this->newNumber( 'n', $this->pos );
420 $this->pos ++;
421 return $token;
424 $this->error( 'unrecognised word' );
428 * For the binary operator $op, pop its operands off the stack and push
429 * a fragment with rpn and type members describing the result of that
430 * operation.
432 protected function doOperation( $op ) {
433 if ( count( $this->operands ) < 2 ) {
434 $op->error( 'missing operand' );
436 $right = array_pop( $this->operands );
437 $left = array_pop( $this->operands );
438 $result = $op->operate( $left, $right );
439 $this->operands[] = $result;
443 * Create a numerical expression object
445 protected function newNumber( $text, $pos ) {
446 return new CLDRPluralRuleConverter_Expression( $this, 'number', $text, $pos, strlen( $text ) );
450 * Create a binary operator
452 protected function newOperator( $type, $pos, $length ) {
453 return new CLDRPluralRuleConverter_Operator( $this, $type, $pos, $length );
457 * Throw an error
459 protected function error( $message ) {
460 throw new CLDRPluralRuleError( $message );
465 * Helper for CLDRPluralRuleConverter.
466 * The base class for operators and expressions, describing a region of the input string.
468 class CLDRPluralRuleConverter_Fragment {
469 public $parser, $pos, $length, $end;
471 function __construct( $parser, $pos, $length ) {
472 $this->parser = $parser;
473 $this->pos = $pos;
474 $this->length = $length;
475 $this->end = $pos + $length;
478 public function error( $message ) {
479 $text = $this->getText();
480 throw new CLDRPluralRuleError( "$message at position " . ( $this->pos + 1 ) . ": \"$text\"" );
483 public function getText() {
484 return substr( $this->parser->rule, $this->pos, $this->length );
489 * Helper for CLDRPluralRuleConverter.
490 * An expression object, representing a region of the input string (for error
491 * messages), the RPN notation used to evaluate it, and the result type for
492 * validation.
494 class CLDRPluralRuleConverter_Expression extends CLDRPluralRuleConverter_Fragment {
495 public $type, $rpn;
497 function __construct( $parser, $type, $rpn, $pos, $length ) {
498 parent::__construct( $parser, $pos, $length );
499 $this->type = $type;
500 $this->rpn = $rpn;
503 public function isType( $type ) {
504 if ( $type === 'range' && ( $this->type === 'range' || $this->type === 'number' ) ) {
505 return true;
507 if ( $type === $this->type ) {
508 return true;
510 return false;
515 * Helper for CLDRPluralRuleConverter.
516 * An operator object, representing a region of the input string (for error
517 * messages), and the binary operator at that location.
519 class CLDRPluralRuleConverter_Operator extends CLDRPluralRuleConverter_Fragment {
520 public $name;
523 * Each op type has three characters: left operand type, right operand type and result type
525 * b = boolean
526 * n = number
527 * r = range
529 * A number is a kind of range.
531 static $opTypes = array(
532 'or' => 'bbb',
533 'and' => 'bbb',
534 'is' => 'nnb',
535 'is-not' => 'nnb',
536 'in' => 'nrb',
537 'not-in' => 'nrb',
538 'within' => 'nrb',
539 'not-within' => 'nrb',
540 'mod' => 'nnn',
541 ',' => 'rrr',
542 '..' => 'nnr',
546 * Map converting from the abbrevation to the full form.
548 static $typeSpecMap = array(
549 'b' => 'boolean',
550 'n' => 'number',
551 'r' => 'range',
554 function __construct( $parser, $name, $pos, $length ) {
555 parent::__construct( $parser, $pos, $length );
556 $this->name = $name;
559 public function operate( $left, $right ) {
560 $typeSpec = self::$opTypes[$this->name];
562 $leftType = self::$typeSpecMap[$typeSpec[0]];
563 $rightType = self::$typeSpecMap[$typeSpec[1]];
564 $resultType = self::$typeSpecMap[$typeSpec[2]];
566 $start = min( $this->pos, $left->pos, $right->pos );
567 $end = max( $this->end, $left->end, $right->end );
568 $length = $end - $start;
570 $newExpr = new CLDRPluralRuleConverter_Expression( $this->parser, $resultType,
571 "{$left->rpn} {$right->rpn} {$this->name}",
572 $start, $length );
574 if ( !$left->isType( $leftType ) ) {
575 $newExpr->error( "invalid type for left operand: expected $leftType, got {$left->type}" );
578 if ( !$right->isType( $rightType ) ) {
579 $newExpr->error( "invalid type for right operand: expected $rightType, got {$right->type}" );
581 return $newExpr;
586 * The exception class for all the classes in this file. This will be thrown
587 * back to the caller if there is any validation error.
589 class CLDRPluralRuleError extends MWException {
590 function __construct( $message ) {
591 parent::__construct( 'CLDR plural rule error: ' . $message );