languages/LanguageUtf8.php

   1 <?php
   2 /**
   3   * @package MediaWiki
   4   * @subpackage Language
   5   */
   6
   7 if( defined( "MEDIAWIKI" ) ) {
   8
   9 # This file and LanguageLatin1.php may be included from within functions, so
  10 # we need to have global statements
  11
  # Everything touched below is declared global explicitly, because this file
  # may be included from inside a function body.
  global $wgDBname, $wgMemc;
  global $wgInputEncoding, $wgOutputEncoding;
  global $wikiUpperChars, $wikiLowerChars;

  # Both input and output are UTF-8 for every language built on this file.
  $wgInputEncoding = 'UTF-8';
  $wgOutputEncoding = 'UTF-8';

  if ( !function_exists( 'mb_strtoupper' ) ) {
      # No mbstring available: case conversion falls back to lookup tables.
      # Try the object cache first, since loading serialized arrays is
      # faster than parsing code.
      $key1 = $wgDBname . ':utf8:upper';
      $wikiUpperChars = $wgMemc->get( $key1 );
      $key2 = $wgDBname . ':utf8:lower';
      $wikiLowerChars = $wgMemc->get( $key2 );

      if ( !( !empty( $wikiUpperChars ) && !empty( $wikiLowerChars ) ) ) {
          # At least one table is missing from the cache. The include below
          # is presumably what fills $wikiUpperChars / $wikiLowerChars (its
          # contents are not visible here); store the result for next time.
          require_once( "includes/Utf8Case.php" );
          $wgMemc->set( $key1, $wikiUpperChars );
          $wgMemc->set( $key2, $wikiLowerChars );
      }
  } else {
      # mbstring does the case conversion; make it operate on UTF-8.
      mb_internal_encoding( 'UTF-8' );
  }
  33
  /**
   * Base stuff useful to all UTF-8 based language files
   * @package MediaWiki
   */
  class LanguageUtf8 extends Language {

      # The case functions use the mbstring library if it is loaded or
      # compiled in, and the global character mapping arrays
      # ($wikiUpperChars / $wikiLowerChars) otherwise.
      # Language-specific character mismatches should be dealt with in the
      # individual Language classes.

      /**
       * Upper-case the first character of a string.
       *
       * @param string $str
       * @return string
       */
      function ucfirst( $str ) {
          return LanguageUtf8::uc( $str, true );
      }

      /**
       * Upper-case a string, or only its first character.
       *
       * @param string $str
       * @param bool $first If true, only the first character is converted
       * @return string
       */
      function uc( $str, $first = false ) {
          if ( !LanguageUtf8::isMultibyte( $str ) ) {
              # Pure single-byte input: the byte-wise functions are enough,
              # whether or not mbstring is available.
              return $first ? ucfirst( $str ) : strtoupper( $str );
          }
          if ( function_exists( 'mb_strtoupper' ) ) {
              if ( $first ) {
                  return mb_strtoupper( mb_substr( $str, 0, 1 ) ) . mb_substr( $str, 1 );
              }
              return mb_strtoupper( $str );
          }
          # No mbstring: translate matching characters through the table.
          global $wikiUpperChars;
          $anchor = $first ? '^' : '';
          return LanguageUtf8::mapMatches(
              "/$anchor([a-z]|[\\xc0-\\xff][\\x80-\\xbf]*)/",
              $wikiUpperChars,
              $str
          );
      }

      /**
       * Lower-case the first character of a string.
       *
       * @param string $str
       * @return string
       */
      function lcfirst( $str ) {
          return LanguageUtf8::lc( $str, true );
      }

      /**
       * Lower-case a string, or only its first character.
       *
       * @param string $str
       * @param bool $first If true, only the first character is converted
       * @return string
       */
      function lc( $str, $first = false ) {
          if ( !LanguageUtf8::isMultibyte( $str ) ) {
              # Pure single-byte input: the byte-wise functions are enough,
              # whether or not mbstring is available.
              return $first
                  ? strtolower( substr( $str, 0, 1 ) ) . substr( $str, 1 )
                  : strtolower( $str );
          }
          if ( function_exists( 'mb_strtolower' ) ) {
              if ( $first ) {
                  return mb_strtolower( mb_substr( $str, 0, 1 ) ) . mb_substr( $str, 1 );
              }
              return mb_strtolower( $str );
          }
          # No mbstring: translate matching characters through the table.
          global $wikiLowerChars;
          $anchor = $first ? '^' : '';
          return LanguageUtf8::mapMatches(
              "/$anchor([A-Z]|[\\xc0-\\xff][\\x80-\\xbf]*)/",
              $wikiLowerChars,
              $str
          );
      }

      /**
       * Tell whether a string contains any byte outside the 7-bit range.
       *
       * The whole string is examined, not just its first byte, so that a
       * string such as "a" followed by a multibyte character is routed to
       * the multibyte-aware code paths in uc() and lc().
       *
       * @param string $str
       * @return bool
       */
      function isMultibyte( $str ) {
          return (bool)preg_match( '/[\x80-\xff]/', $str );
      }

      /**
       * Internal helper: run every substring captured by $pattern through
       * strtr() with $map and leave the rest of the string untouched.
       *
       * Implemented with preg_split() rather than the /e pattern modifier,
       * which newer PHP versions no longer support.
       *
       * @param string $pattern PCRE pattern with exactly one capture group
       *                        wrapping the whole match
       * @param array $map Replacement pairs as accepted by strtr()
       * @param string $str
       * @return string
       */
      function mapMatches( $pattern, $map, $str ) {
          if ( !is_array( $map ) ) {
              # No usable table; nothing can be translated.
              return $str;
          }
          $pieces = preg_split( $pattern, $str, -1, PREG_SPLIT_DELIM_CAPTURE );
          if ( !is_array( $pieces ) ) {
              return $str;
          }
          # With a single capture group the result alternates between
          # unmatched text (even indexes) and captured matches (odd indexes).
          $count = count( $pieces );
          for ( $i = 1; $i < $count; $i += 2 ) {
              $pieces[$i] = strtr( $pieces[$i], $map );
          }
          return implode( '', $pieces );
      }

      /**
       * Internal helper: replace every multibyte sequence in $str with
       * 'U8' followed by the hexadecimal form of its bytes.
       *
       * @param string $str
       * @param array|null $map If an array, each sequence is first
       *                        translated with strtr() before being encoded
       * @return string
       */
      function hexEncodeMultibyte( $str, $map = null ) {
          $pieces = preg_split( '/([\xc0-\xff][\x80-\xbf]*)/', $str, -1,
              PREG_SPLIT_DELIM_CAPTURE );
          if ( !is_array( $pieces ) ) {
              return $str;
          }
          # Odd indexes hold the captured multibyte sequences.
          $count = count( $pieces );
          for ( $i = 1; $i < $count; $i += 2 ) {
              $char = $pieces[$i];
              if ( is_array( $map ) ) {
                  $char = strtr( $char, $map );
              }
              $pieces[$i] = 'U8' . bin2hex( $char );
          }
          return implode( '', $pieces );
      }

      /**
       * Convert text into the form stored in the search index.
       *
       * @param string $string
       * @return string
       */
      function stripForSearch( $string ) {
          # MySQL fulltext index doesn't grok utf-8, so we
          # need to fold cases and convert to hex

          # In Language:: it just returns lowercase, maybe
          # all strtolower on stripped output or argument
          # should be removed and all stripForSearch
          # methods adjusted to that.

          wfProfileIn( "LanguageUtf8::stripForSearch" );
          if ( function_exists( 'mb_strtolower' ) ) {
              $out = LanguageUtf8::hexEncodeMultibyte( mb_strtolower( $string ) );
          } else {
              # NOTE(review): unlike the mbstring branch, this path only
              # lower-cases the multibyte sequences; ASCII letters keep
              # their case. Left as it was - confirm whether that is wanted.
              global $wikiLowerChars;
              $out = LanguageUtf8::hexEncodeMultibyte( $string, $wikiLowerChars );
          }
          wfProfileOut( "LanguageUtf8::stripForSearch" );
          return $out;
      }

      /**
       * Name of the 8-bit encoding assumed for input that is not UTF-8.
       *
       * @return string
       */
      function fallback8bitEncoding() {
          # Windows codepage 1252 is a superset of iso 8859-1
          # override this to use a different source encoding to
          # translate incoming 8-bit URLs.
          return "windows-1252";
      }

      /**
       * Make sure a title string is UTF-8.
       *
       * Strings that are pure ASCII or that match the UTF-8 pattern below
       * are returned unchanged; anything else is converted from
       * fallback8bitEncoding() to utf-8 through $this->iconv(), which is
       * not defined in this class.
       *
       * @param string $s
       * @return string
       */
      function checkTitleEncoding( $s ) {
          if ( is_array( $s ) ) {
              wfDebugDieBacktrace( 'Given array to checkTitleEncoding.' );
          }
          # Check for non-UTF-8 URLs
          $ishigh = preg_match( '/[\x80-\xff]/', $s );
          if ( !$ishigh ) {
              return $s;
          }

          $isutf8 = preg_match( '/^([\x00-\x7f]|[\xc0-\xdf][\x80-\xbf]|' .
              '[\xe0-\xef][\x80-\xbf]{2}|[\xf0-\xf7][\x80-\xbf]{3})+$/', $s );
          if ( $isutf8 ) {
              return $s;
          }

          return $this->iconv( $this->fallback8bitEncoding(), "utf-8", $s );
      }

      /**
       * Return the first complete character (1 to 4 bytes) of a string.
       *
       * @param string $s
       * @return string The leading character, or "" if the string does not
       *                start with a sequence matching the pattern
       */
      function firstChar( $s ) {
          preg_match( '/^([\x00-\x7f]|[\xc0-\xdf][\x80-\xbf]|' .
              '[\xe0-\xef][\x80-\xbf]{2}|[\xf0-\xf7][\x80-\xbf]{3})/', $s, $matches );

          return isset( $matches[1] ) ? $matches[1] : "";
      }

      /**
       * Crop a string from the beginning or end to a certain number of bytes.
       * (Bytes are used because our storage has limited byte lengths for some
       * columns in the database.) Only whole characters are kept: a
       * multibyte character cut by the byte limit is dropped entirely.
       *
       * @param string $string
       * @param int $length Maximum length in bytes, not counting the
       *                    ellipsis. If negative, snip from the beginning
       *                    and keep the end of the string.
       * @param string $ellipsis Appended (or prepended for negative
       *                         $length) when the string is actually cut
       * @return string
       */
      function truncate( $string, $length, $ellipsis = "" ) {
          if ( $length == 0 ) {
              return $ellipsis;
          }
          if ( strlen( $string ) <= abs( $length ) ) {
              # Already short enough; returned as is, without ellipsis.
              return $string;
          }
          if ( $length > 0 ) {
              $string = substr( $string, 0, $length );
              $char = ord( $string[strlen( $string ) - 1] );
              if ( $char >= 0xc0 ) {
                  # We got the first byte only of a multibyte char; remove it.
                  $string = substr( $string, 0, -1 );
              } elseif ( $char >= 0x80 &&
                  # The /s modifier lets (.*) span newlines; without it the
                  # pattern fails on multi-line text and the broken tail stays.
                  preg_match( '/^(.*)(?:[\xe0-\xef][\x80-\xbf]|' .
                      '[\xf0-\xf7][\x80-\xbf]{1,2})$/s', $string, $m ) ) {
                  # We chopped in the middle of a character; remove it
                  $string = $m[1];
              }
              return $string . $ellipsis;
          } else {
              $string = substr( $string, $length );
              $char = ord( $string[0] );
              if ( $char >= 0x80 && $char < 0xc0 ) {
                  # We chopped in the middle of a character; remove the whole thing
                  $string = preg_replace( '/^[\x80-\xbf]+/', '', $string );
              }
              return $ellipsis . $string;
          }
      }
  }
 196
 197 } # ifdef MEDIAWIKI
 198
 199 ?>