From e7464f34818f20377ca73fab72c7b3214e0f5e1c Mon Sep 17 00:00:00 2001 From: Brian Wolff Date: Sat, 29 Oct 2016 08:29:11 +0000 Subject: [PATCH] Make NumericUppercaseCollation use localized digit transforms This will cause the numeric collation to sort localized digits for the current content language the same as how 0-9 are. This only deals with the localized digit numbers, commas and other number formatting are still not handled. Weird "numerical" unicode characters are also not handled. I was unsure if to make a "family" of numeric collations where you specify numeric-, or if it should just use $wgContLang. Given that $wgContLang effectively never changes, and also affects all other digit handling, I opted to just use $wgContLang. Any wikis currently using the 'numeric' collation will have to have updateCollation.php --force run after this change is deployed. At the moment that includes: bnwiki, bnwikisource and hewiki Bug: T148873 Change-Id: I9eda52a8a9752a91134d1118546b0a80d3980ccf --- includes/collation/Collation.php | 4 +- includes/collation/NumericUppercaseCollation.php | 57 +++++++++++++++++++++--- 2 files changed, 55 insertions(+), 6 deletions(-) diff --git a/includes/collation/Collation.php b/includes/collation/Collation.php index 881c8c23fe9..9950a11a6f6 100644 --- a/includes/collation/Collation.php +++ b/includes/collation/Collation.php @@ -46,11 +46,13 @@ abstract class Collation { * @return Collation */ public static function factory( $collationName ) { + global $wgContLang; + switch ( $collationName ) { case 'uppercase': return new UppercaseCollation; case 'numeric': - return new NumericUppercaseCollation; + return new NumericUppercaseCollation( $wgContLang ); case 'identity': return new IdentityCollation; case 'uca-default': diff --git a/includes/collation/NumericUppercaseCollation.php b/includes/collation/NumericUppercaseCollation.php index d15daec5055..2d2ca47bcf9 100644 --- a/includes/collation/NumericUppercaseCollation.php +++ 
b/includes/collation/NumericUppercaseCollation.php @@ -24,12 +24,37 @@ * Note that this only works in terms of sequences of digits, and the behavior for decimal fractions * or pretty-formatted numbers may be unexpected. * + * Digits will be based on the wiki's content language settings. If + * you change the content language of a wiki you will need to run + * updateCollation.php --force. Only English (ASCII 0-9) and the + * localized version will be counted. Localized digits from other languages + * or weird Unicode digit equivalents (e.g. 4, 𝟜, ⓸ , ⁴, etc) will not count. + * * @since 1.28 */ class NumericUppercaseCollation extends UppercaseCollation { + + /** + * @var $digitTransformLang Language How to convert digits (usually $wgContLang) + */ + private $digitTransformLang; + + /** + * Constructor + * + * @param $lang Language How to convert digits. + * For example, if given language "my" then ၇ is treated like 7. + * + * It is expected that usually this is given $wgContLang. + */ + public function __construct( Language $lang ) { + $this->digitTransformLang = $lang; + parent::__construct(); + } + public function getSortKey( $string ) { $sortkey = parent::getSortKey( $string ); - + $sortkey = $this->convertDigits( $sortkey ); // For each sequence of digits, insert the digit '0' and then the length of the sequence // (encoded in two bytes) before it. That's all folks, it sorts correctly now! The '0' ensures // correct position (where digits would normally sort), then the length will be compared putting @@ -48,11 +73,33 @@ class NumericUppercaseCollation extends UppercaseCollation { return $sortkey; } + /** + * Convert localized digits to English digits. + * + * based on Language::parseFormattedNumber but without commas. + * + * @param $string String sortkey to unlocalize digits of + * @return String Sortkey with all localized digits replaced with ASCII digits.
+ */ + private function convertDigits( $string ) { + $table = $this->digitTransformLang->digitTransformTable(); + if ( $table ) { + $table = array_filter( $table ); + $flipped = array_flip( $table ); + // Some languages seem to also have commas in this table. + $flipped = array_filter( $flipped, 'is_numeric' ); + $string = strtr( $string, $flipped ); + } + return $string; + } + public function getFirstLetter( $string ) { - if ( preg_match( '/^\d/', $string ) ) { - // Note that we pass 0 and 9 as normal params, not numParams(). This only works for 0-9 - // and not localised digits, so we don't want them to be converted. - return wfMessage( 'category-header-numerals' )->params( 0, 9 )->text(); + $convertedString = $this->convertDigits( $string ); + + if ( preg_match( '/^\d/', $convertedString ) ) { + return wfMessage( 'category-header-numerals' ) + ->numParams( 0, 9 ) + ->text(); } else { return parent::getFirstLetter( $string ); } -- 2.11.4.GIT