includes/normal/UtfNormalUtil.php

   1 <?php
   2 /**
   3  * Some of these functions are adapted from places in MediaWiki.
   4  * Should probably merge them for consistency.
   5  *
   6  * Copyright © 2004 Brion Vibber <brion@pobox.com>
   7  * http://www.mediawiki.org/
   8  *
   9  * This program is free software; you can redistribute it and/or modify
  10  * it under the terms of the GNU General Public License as published by
  11  * the Free Software Foundation; either version 2 of the License, or
  12  * (at your option) any later version.
  13  *
  14  * This program is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  17  * GNU General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU General Public License along
  20  * with this program; if not, write to the Free Software Foundation, Inc.,
  21  * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  22  * http://www.gnu.org/copyleft/gpl.html
  23  *
  24  * @file
  25  * @ingroup UtfNormal
  26  */
  27
  28 /**
  29  * Return UTF-8 sequence for a given Unicode code point.
  30  * May die if fed out of range data.
  31  *
  32  * @param $codepoint Integer:
  33  * @return String
  34  * @public
  35  */
  36 function codepointToUtf8( $codepoint ) {
  37         if($codepoint <         0x80) return chr($codepoint);
  38         if($codepoint <    0x800) return chr($codepoint >>      6 & 0x3f | 0xc0) .
  39                                                                          chr($codepoint           & 0x3f | 0x80);
  40         if($codepoint <  0x10000) return chr($codepoint >> 12 & 0x0f | 0xe0) .
  41                                                                          chr($codepoint >>      6 & 0x3f | 0x80) .
  42                                                                          chr($codepoint           & 0x3f | 0x80);
  43         if($codepoint < 0x110000) return chr($codepoint >> 18 & 0x07 | 0xf0) .
  44                                                                          chr($codepoint >> 12 & 0x3f | 0x80) .
  45                                                                          chr($codepoint >>      6 & 0x3f | 0x80) .
  46                                                                          chr($codepoint           & 0x3f | 0x80);
  47
  48         echo "Asked for code outside of range ($codepoint)\n";
  49         die( -1 );
  50 }
  51
  52 /**
  53  * Take a series of space-separated hexadecimal numbers representing
  54  * Unicode code points and return a UTF-8 string composed of those
  55  * characters. Used by UTF-8 data generation and testing routines.
  56  *
  57  * @param $sequence String
  58  * @return String
  59  * @private
  60  */
  61 function hexSequenceToUtf8( $sequence ) {
  62         $utf = '';
  63         foreach( explode( ' ', $sequence ) as $hex ) {
  64                 $n = hexdec( $hex );
  65                 $utf .= codepointToUtf8( $n );
  66         }
  67         return $utf;
  68 }
  69
  70 /**
  71  * Take a UTF-8 string and return a space-separated series of hex
  72  * numbers representing Unicode code points. For debugging.
  73  *
  74  * @param string $str UTF-8 string.
  75  * @return string
  76  * @private
  77  */
  78 function utf8ToHexSequence( $str ) {
  79         $buf = '';
  80         foreach ( preg_split( '//u', $str, -1, PREG_SPLIT_NO_EMPTY ) as $cp ) {
  81                 $buf .= sprintf( '%04x ', utf8ToCodepoint( $cp ) );
  82         }
  83         return rtrim( $buf );
  84 }
  85
  86 /**
  87  * Determine the Unicode codepoint of a single-character UTF-8 sequence.
  88  * Does not check for invalid input data.
  89  *
  90  * @param $char String
  91  * @return Integer
  92  * @public
  93  */
  94 function utf8ToCodepoint( $char ) {
  95         # Find the length
  96         $z = ord( $char[0] );
  97         if ( $z & 0x80 ) {
  98                 $length = 0;
  99                 while ( $z & 0x80 ) {
 100                         $length++;
 101                         $z <<= 1;
 102                 }
 103         } else {
 104                 $length = 1;
 105         }
 106
 107         if ( $length != strlen( $char ) ) {
 108                 return false;
 109         }
 110         if ( $length == 1 ) {
 111                 return ord( $char );
 112         }
 113
 114         # Mask off the length-determining bits and shift back to the original location
 115         $z &= 0xff;
 116         $z >>= $length;
 117
 118         # Add in the free bits from subsequent bytes
 119         for ( $i=1; $i < $length; $i++ ) {
 120                 $z <<= 6;
 121                 $z |= ord( $char[$i] ) & 0x3f;
 122         }
 123
 124         return $z;
 125 }
 126
 127 /**
 128  * Escape a string for inclusion in a PHP single-quoted string literal.
 129  *
 130  * @param string $string string to be escaped.
 131  * @return String: escaped string.
 132  * @public
 133  */
 134 function escapeSingleString( $string ) {
 135         return strtr( $string,
 136                 array(
 137                         '\\' => '\\\\',
 138                         '\'' => '\\\''
 139                 ));
 140 }