1 <?php
defined('SYSPATH') OR die('No direct access allowed.');
3 * A port of phputf8 to a unified file/class. Checks PHP status to ensure that
4 * UTF-8 support is available and normalize global variables to UTF-8. It also
5 * provides multi-byte aware replacement string functions.
7 * This file is licensed differently from the rest of Kohana. As a port of
8 * phputf8, which is LGPL software, this file is released under the LGPL.
10 * PCRE needs to be compiled with UTF-8 support (--enable-utf8).
11 * Support for Unicode properties is highly recommended (--enable-unicode-properties).
12 * @see http://php.net/manual/reference.pcre.pattern.modifiers.php
14 * UTF-8 conversion will be much more reliable if the iconv extension is loaded.
15 * @see http://php.net/iconv
17 * The mbstring extension is highly recommended, but must not be overloading PHP's native string functions.
19 * @see http://php.net/mbstring
21 * $Id: utf8.php 3917 2009-01-21 03:06:22Z zombor $
25 * @copyright (c) 2007 Kohana Team
26 * @copyright (c) 2005 Harry Fuecks
27 * @license http://www.gnu.org/licenses/old-licenses/lgpl-2.1.txt
30 if ( ! preg_match('/^.$/u', 'ñ'))
34 '<a href="http://php.net/pcre">PCRE</a> has not been compiled with UTF-8 support. '.
35 'See <a href="http://php.net/manual/reference.pcre.pattern.modifiers.php">PCRE Pattern Modifiers</a> '.
36 'for more information. This application cannot be run without UTF-8 support.',
41 if ( ! extension_loaded('iconv'))
45 'The <a href="http://php.net/iconv">iconv</a> extension is not loaded. '.
46 'Without iconv, strings cannot be properly translated to UTF-8 from user input. '.
47 'This application cannot be run without UTF-8 support.',
52 if (extension_loaded('mbstring') AND (ini_get('mbstring.func_overload') & MB_OVERLOAD_STRING
))
56 'The <a href="http://php.net/mbstring">mbstring</a> extension is overloading PHP\'s native string functions. '.
57 'Disable this by setting mbstring.func_overload to 0, 1, 4 or 5 in php.ini or a .htaccess file.'.
58 'This application cannot be run without UTF-8 support.',
// Check PCRE support for Unicode properties such as \p and \X.
// Error reporting is silenced only for the duration of the probe, because
// preg_match() complains when \pL is not supported by the PCRE build.
$ER = error_reporting(0);
define('PCRE_UNICODE_PROPERTIES', (bool) preg_match('/^\pL$/u', 'ñ'));

// Put the previous reporting level back so later errors are not hidden
error_reporting($ER);
// SERVER_UTF8 records whether the mbstring extension is loaded, i.e. whether
// mb_* functions can be used instead of the non-native replacements.
if ( ! extension_loaded('mbstring'))
{
	define('SERVER_UTF8', FALSE);
}
else
{
	// mbstring is present: make it work in UTF-8 before flagging support
	mb_internal_encoding('UTF-8');
	define('SERVER_UTF8', TRUE);
}
79 // Convert all global variables to UTF-8.
80 $_GET = utf8
::clean($_GET);
81 $_POST = utf8
::clean($_POST);
82 $_COOKIE = utf8
::clean($_COOKIE);
83 $_SERVER = utf8
::clean($_SERVER);
85 if (PHP_SAPI
== 'cli')
87 // Convert command line arguments
88 $_SERVER['argv'] = utf8
::clean($_SERVER['argv']);
/**
 * Registry of helper files that have already been required. Each lazy-loading
 * method stores its own name as a key (value TRUE) after requiring its file
 * from SYSPATH.'core/utf8/'.
 */
public static $called = array();
/**
 * Recursively cleans arrays, objects, and strings. Removes ASCII control
 * codes and converts to UTF-8 while silently discarding incompatible
 * UTF-8 characters.
 *
 * Arrays come back with both keys and values cleaned; a key that changes
 * during cleaning is removed so only its cleaned form remains. Objects are
 * written to through [] when they implement ArrayAccess and through
 * properties otherwise. Any other value (integer, boolean, NULL, empty
 * string, ...) is returned untouched.
 *
 * @param   mixed  string, array or object to clean
 * @return  mixed  the cleaned value
 */
public static function clean($str)
{
	if (is_array($str))
	{
		foreach ($str as $key => $val)
		{
			$clean_key = self::clean($key);

			if ($clean_key !== $key)
			{
				// Do not leave the uncleaned key behind next to its cleaned copy
				unset($str[$key]);
			}

			// Recursion!
			$str[$clean_key] = self::clean($val);
		}
	}
	elseif (is_object($str))
	{
		// Only ArrayAccess objects accept [] writes; anything else would be a fatal error
		$array_like = ($str instanceof ArrayAccess);

		foreach ($str as $key => $val)
		{
			if ($array_like)
			{
				$str[self::clean($key)] = self::clean($val);
			}
			else
			{
				$str->{self::clean($key)} = self::clean($val);
			}
		}
	}
	elseif (is_string($str) AND $str !== '')
	{
		// Remove control characters
		$str = self::strip_ascii_ctrl($str);

		if ( ! self::is_ascii($str))
		{
			// Mask notices only; every other level keeps its current setting
			$ER = error_reporting();
			error_reporting($ER & ~E_NOTICE);

			// iconv is expensive, so it is only used when needed
			// NOTE(review): iconv() returns FALSE on failure, which is then returned as-is
			$str = iconv('UTF-8', 'UTF-8//IGNORE', $str);

			// Turn notices back on
			error_reporting($ER);
		}
	}

	return $str;
}
/**
 * Tests whether a string contains only 7bit ASCII bytes. This is used to
 * determine when to use native functions or UTF-8 functions.
 *
 * @param   string   string to check
 * @return  boolean  TRUE when no byte outside \x00-\x7F is matched
 */
public static function is_ascii($str)
{
	// Look for the first byte outside the 7bit range
	$found_high_byte = preg_match('/[^\x00-\x7F]/S', $str);

	return ! $found_high_byte;
}
/**
 * Strips out device control codes in the ASCII range. Tab (\x09), line feed
 * (\x0A) and carriage return (\x0D) are not part of the removed set.
 *
 * @param   string  string to clean
 * @return  string  input with the control codes removed
 */
public static function strip_ascii_ctrl($str)
{
	$control_codes = '/[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]+/S';

	return preg_replace($control_codes, '', $str);
}
/**
 * Strips out all non-7bit ASCII bytes.
 *
 * @param   string  string to clean
 * @return  string  input with every byte outside \x00-\x7F removed
 */
public static function strip_non_ascii($str)
{
	$high_bytes = '/[^\x00-\x7F]+/S';

	return preg_replace($high_bytes, '', $str);
}
/**
 * Replaces special/accented UTF-8 characters by ASCII-7 'equivalents'.
 * Delegates to _transliterate_to_ascii(); the matching file under
 * SYSPATH.'core/utf8/' is required the first time this method runs.
 *
 * @author  Andreas Gohr <andi@splitbrain.org>
 *
 * @param   string   string to transliterate
 * @param   integer  -1 lowercase only, +1 uppercase only, 0 both cases
 * @return  string   result of _transliterate_to_ascii()
 */
public static function transliterate_to_ascii($str, $case = 0)
{
	$fn = __FUNCTION__;

	if ( ! isset(self::$called[$fn]))
	{
		// First call: require the helper file, then remember that it is loaded
		require SYSPATH.'core/utf8/'.$fn.EXT;
		self::$called[$fn] = TRUE;
	}

	return _transliterate_to_ascii($str, $case);
}
/**
 * Returns the length of the given string.
 * Delegates to _strlen(); the matching file under SYSPATH.'core/utf8/' is
 * required the first time this method runs.
 * @see http://php.net/strlen
 *
 * @param   string   string being measured for length
 * @return  integer  result of _strlen()
 */
public static function strlen($str)
{
	$fn = __FUNCTION__;

	if ( ! isset(self::$called[$fn]))
	{
		// First call: require the helper file, then remember that it is loaded
		require SYSPATH.'core/utf8/'.$fn.EXT;
		self::$called[$fn] = TRUE;
	}

	return _strlen($str);
}
/**
 * Finds position of first occurrence of a UTF-8 string.
 * Delegates to _strpos(); the matching file under SYSPATH.'core/utf8/' is
 * required the first time this method runs.
 * @see http://php.net/strpos
 *
 * @author  Harry Fuecks <hfuecks@gmail.com>
 *
 * @param   string   haystack
 * @param   string   needle
 * @param   integer  offset from which character in haystack to start searching
 * @return  integer  position of needle
 * @return  boolean  FALSE if the needle is not found
 */
public static function strpos($str, $search, $offset = 0)
{
	$fn = __FUNCTION__;

	if ( ! isset(self::$called[$fn]))
	{
		// First call: require the helper file, then remember that it is loaded
		require SYSPATH.'core/utf8/'.$fn.EXT;
		self::$called[$fn] = TRUE;
	}

	return _strpos($str, $search, $offset);
}
/**
 * Finds position of last occurrence of a char in a UTF-8 string.
 * Delegates to _strrpos(); the matching file under SYSPATH.'core/utf8/' is
 * required the first time this method runs.
 * @see http://php.net/strrpos
 *
 * @author  Harry Fuecks <hfuecks@gmail.com>
 *
 * @param   string   haystack
 * @param   string   needle
 * @param   integer  offset from which character in haystack to start searching
 * @return  integer  position of needle
 * @return  boolean  FALSE if the needle is not found
 */
public static function strrpos($str, $search, $offset = 0)
{
	$fn = __FUNCTION__;

	if ( ! isset(self::$called[$fn]))
	{
		// First call: require the helper file, then remember that it is loaded
		require SYSPATH.'core/utf8/'.$fn.EXT;
		self::$called[$fn] = TRUE;
	}

	return _strrpos($str, $search, $offset);
}
/**
 * Returns part of a UTF-8 string.
 * Delegates to _substr(); the matching file under SYSPATH.'core/utf8/' is
 * required the first time this method runs.
 * @see http://php.net/substr
 *
 * @author  Chris Smith <chris@jalakai.co.uk>
 *
 * @param   string   input string
 * @param   integer  offset
 * @param   integer  length limit
 * @return  string   result of _substr()
 */
public static function substr($str, $offset, $length = NULL)
{
	$fn = __FUNCTION__;

	if ( ! isset(self::$called[$fn]))
	{
		// First call: require the helper file, then remember that it is loaded
		require SYSPATH.'core/utf8/'.$fn.EXT;
		self::$called[$fn] = TRUE;
	}

	return _substr($str, $offset, $length);
}
/**
 * Replaces text within a portion of a UTF-8 string.
 * Delegates to _substr_replace(); the matching file under
 * SYSPATH.'core/utf8/' is required the first time this method runs.
 * @see http://php.net/substr_replace
 *
 * @author  Harry Fuecks <hfuecks@gmail.com>
 *
 * @param   string   input string
 * @param   string   replacement string
 * @param   integer  offset
 * @param   integer  length (optional, passed through as NULL when omitted)
 * @return  string   result of _substr_replace()
 */
public static function substr_replace($str, $replacement, $offset, $length = NULL)
{
	$fn = __FUNCTION__;

	if ( ! isset(self::$called[$fn]))
	{
		// First call: require the helper file, then remember that it is loaded
		require SYSPATH.'core/utf8/'.$fn.EXT;
		self::$called[$fn] = TRUE;
	}

	return _substr_replace($str, $replacement, $offset, $length);
}
/**
 * Makes a UTF-8 string lowercase.
 * Delegates to _strtolower(); the matching file under SYSPATH.'core/utf8/'
 * is required the first time this method runs.
 * @see http://php.net/strtolower
 *
 * @author  Andreas Gohr <andi@splitbrain.org>
 *
 * @param   string  mixed case string
 * @return  string  result of _strtolower()
 */
public static function strtolower($str)
{
	$fn = __FUNCTION__;

	if ( ! isset(self::$called[$fn]))
	{
		// First call: require the helper file, then remember that it is loaded
		require SYSPATH.'core/utf8/'.$fn.EXT;
		self::$called[$fn] = TRUE;
	}

	return _strtolower($str);
}
/**
 * Makes a UTF-8 string uppercase.
 * Delegates to _strtoupper(); the matching file under SYSPATH.'core/utf8/'
 * is required the first time this method runs.
 * @see http://php.net/strtoupper
 *
 * @author  Andreas Gohr <andi@splitbrain.org>
 *
 * @param   string  mixed case string
 * @return  string  result of _strtoupper()
 */
public static function strtoupper($str)
{
	$fn = __FUNCTION__;

	if ( ! isset(self::$called[$fn]))
	{
		// First call: require the helper file, then remember that it is loaded
		require SYSPATH.'core/utf8/'.$fn.EXT;
		self::$called[$fn] = TRUE;
	}

	return _strtoupper($str);
}
/**
 * Makes a UTF-8 string's first character uppercase.
 * Delegates to _ucfirst(); the matching file under SYSPATH.'core/utf8/' is
 * required the first time this method runs.
 * @see http://php.net/ucfirst
 *
 * @author  Harry Fuecks <hfuecks@gmail.com>
 *
 * @param   string  mixed case string
 * @return  string  result of _ucfirst()
 */
public static function ucfirst($str)
{
	$fn = __FUNCTION__;

	if ( ! isset(self::$called[$fn]))
	{
		// First call: require the helper file, then remember that it is loaded
		require SYSPATH.'core/utf8/'.$fn.EXT;
		self::$called[$fn] = TRUE;
	}

	return _ucfirst($str);
}
/**
 * Makes the first character of every word in a UTF-8 string uppercase.
 * Delegates to _ucwords(); the matching file under SYSPATH.'core/utf8/' is
 * required the first time this method runs.
 * @see http://php.net/ucwords
 *
 * @author  Harry Fuecks <hfuecks@gmail.com>
 *
 * @param   string  mixed case string
 * @return  string  result of _ucwords()
 */
public static function ucwords($str)
{
	$fn = __FUNCTION__;

	if ( ! isset(self::$called[$fn]))
	{
		// First call: require the helper file, then remember that it is loaded
		require SYSPATH.'core/utf8/'.$fn.EXT;
		self::$called[$fn] = TRUE;
	}

	return _ucwords($str);
}
/**
 * Case-insensitive UTF-8 string comparison.
 * Delegates to _strcasecmp(); the matching file under SYSPATH.'core/utf8/'
 * is required the first time this method runs.
 * @see http://php.net/strcasecmp
 *
 * @author  Harry Fuecks <hfuecks@gmail.com>
 *
 * @param   string   string to compare
 * @param   string   string to compare
 * @return  integer  less than 0 if str1 is less than str2
 * @return  integer  greater than 0 if str1 is greater than str2
 * @return  integer  0 if they are equal
 */
public static function strcasecmp($str1, $str2)
{
	$fn = __FUNCTION__;

	if ( ! isset(self::$called[$fn]))
	{
		// First call: require the helper file, then remember that it is loaded
		require SYSPATH.'core/utf8/'.$fn.EXT;
		self::$called[$fn] = TRUE;
	}

	return _strcasecmp($str1, $str2);
}
/**
 * Returns a string or an array with all occurrences of search in subject
 * (ignoring case) replaced with the given replace value.
 * Delegates to _str_ireplace(); the matching file under
 * SYSPATH.'core/utf8/' is required the first time this method runs.
 * @see http://php.net/str_ireplace
 *
 * @note    It's not fast and gets slower if $search and/or $replace are arrays.
 * @author  Harry Fuecks <hfuecks@gmail.com>
 *
 * @param   string|array  text to replace
 * @param   string|array  replacement text
 * @param   string|array  subject text
 * @param   integer       number of matched and replaced needles will be returned via this parameter which is passed by reference
 * @return  string        if the input was a string
 * @return  array         if the input was an array
 */
public static function str_ireplace($search, $replace, $str, & $count = NULL)
{
	$fn = __FUNCTION__;

	if ( ! isset(self::$called[$fn]))
	{
		// First call: require the helper file, then remember that it is loaded
		require SYSPATH.'core/utf8/'.$fn.EXT;
		self::$called[$fn] = TRUE;
	}

	return _str_ireplace($search, $replace, $str, $count);
}
/**
 * Case-insensitive UTF-8 version of strstr. Returns all of input string
 * from the first occurrence of needle to the end.
 * Delegates to _stristr(); the matching file under SYSPATH.'core/utf8/' is
 * required the first time this method runs.
 * @see http://php.net/stristr
 *
 * @author  Harry Fuecks <hfuecks@gmail.com>
 *
 * @param   string   input string
 * @param   string   needle
 * @return  string   matched substring if found
 * @return  boolean  FALSE if the substring was not found
 */
public static function stristr($str, $search)
{
	$fn = __FUNCTION__;

	if ( ! isset(self::$called[$fn]))
	{
		// First call: require the helper file, then remember that it is loaded
		require SYSPATH.'core/utf8/'.$fn.EXT;
		self::$called[$fn] = TRUE;
	}

	return _stristr($str, $search);
}
/**
 * Finds the length of the initial segment matching mask.
 * Delegates to _strspn(); the matching file under SYSPATH.'core/utf8/' is
 * required the first time this method runs.
 * @see http://php.net/strspn
 *
 * @author  Harry Fuecks <hfuecks@gmail.com>
 *
 * @param   string   input string
 * @param   string   mask for search
 * @param   integer  start position of the string to examine
 * @param   integer  length of the string to examine
 * @return  integer  length of the initial segment that contains characters in the mask
 */
public static function strspn($str, $mask, $offset = NULL, $length = NULL)
{
	$fn = __FUNCTION__;

	if ( ! isset(self::$called[$fn]))
	{
		// First call: require the helper file, then remember that it is loaded
		require SYSPATH.'core/utf8/'.$fn.EXT;
		self::$called[$fn] = TRUE;
	}

	return _strspn($str, $mask, $offset, $length);
}
/**
 * Finds the length of the initial segment not matching mask.
 * Delegates to _strcspn(); the matching file under SYSPATH.'core/utf8/' is
 * required the first time this method runs.
 * @see http://php.net/strcspn
 *
 * @author  Harry Fuecks <hfuecks@gmail.com>
 *
 * @param   string   input string
 * @param   string   mask for search
 * @param   integer  start position of the string to examine
 * @param   integer  length of the string to examine
 * @return  integer  length of the initial segment that contains characters not in the mask
 */
public static function strcspn($str, $mask, $offset = NULL, $length = NULL)
{
	$fn = __FUNCTION__;

	if ( ! isset(self::$called[$fn]))
	{
		// First call: require the helper file, then remember that it is loaded
		require SYSPATH.'core/utf8/'.$fn.EXT;
		self::$called[$fn] = TRUE;
	}

	return _strcspn($str, $mask, $offset, $length);
}
/**
 * Pads a UTF-8 string to a certain length with another string.
 * Delegates to _str_pad(); the matching file under SYSPATH.'core/utf8/' is
 * required the first time this method runs.
 * @see http://php.net/str_pad
 *
 * @author  Harry Fuecks <hfuecks@gmail.com>
 *
 * @param   string   input string
 * @param   integer  desired string length after padding
 * @param   string   string to use as padding
 * @param   string   padding type: STR_PAD_RIGHT, STR_PAD_LEFT, or STR_PAD_BOTH
 * @return  string   result of _str_pad()
 */
public static function str_pad($str, $final_str_length, $pad_str = ' ', $pad_type = STR_PAD_RIGHT)
{
	$fn = __FUNCTION__;

	if ( ! isset(self::$called[$fn]))
	{
		// First call: require the helper file, then remember that it is loaded
		require SYSPATH.'core/utf8/'.$fn.EXT;
		self::$called[$fn] = TRUE;
	}

	return _str_pad($str, $final_str_length, $pad_str, $pad_type);
}
/**
 * Converts a UTF-8 string to an array.
 * Delegates to _str_split(); the matching file under SYSPATH.'core/utf8/'
 * is required the first time this method runs.
 * @see http://php.net/str_split
 *
 * @author  Harry Fuecks <hfuecks@gmail.com>
 *
 * @param   string   input string
 * @param   integer  maximum length of each chunk
 * @return  array    result of _str_split()
 */
public static function str_split($str, $split_length = 1)
{
	$fn = __FUNCTION__;

	if ( ! isset(self::$called[$fn]))
	{
		// First call: require the helper file, then remember that it is loaded
		require SYSPATH.'core/utf8/'.$fn.EXT;
		self::$called[$fn] = TRUE;
	}

	return _str_split($str, $split_length);
}
/**
 * Reverses a UTF-8 string.
 * Delegates to _strrev(); the matching file under SYSPATH.'core/utf8/' is
 * required the first time this method runs.
 * @see http://php.net/strrev
 *
 * @author  Harry Fuecks <hfuecks@gmail.com>
 *
 * @param   string  string to be reversed
 * @return  string  result of _strrev()
 */
public static function strrev($str)
{
	$fn = __FUNCTION__;

	if ( ! isset(self::$called[$fn]))
	{
		// First call: require the helper file, then remember that it is loaded
		require SYSPATH.'core/utf8/'.$fn.EXT;
		self::$called[$fn] = TRUE;
	}

	return _strrev($str);
}
/**
 * Strips whitespace (or other UTF-8 characters) from the beginning and
 * end of a string.
 * Delegates to _trim(); the matching file under SYSPATH.'core/utf8/' is
 * required the first time this method runs.
 * @see http://php.net/trim
 *
 * @author  Andreas Gohr <andi@splitbrain.org>
 *
 * @param   string  input string
 * @param   string  string of characters to remove
 * @return  string  result of _trim()
 */
public static function trim($str, $charlist = NULL)
{
	$fn = __FUNCTION__;

	if ( ! isset(self::$called[$fn]))
	{
		// First call: require the helper file, then remember that it is loaded
		require SYSPATH.'core/utf8/'.$fn.EXT;
		self::$called[$fn] = TRUE;
	}

	return _trim($str, $charlist);
}
/**
 * Strips whitespace (or other UTF-8 characters) from the beginning of a string.
 * Delegates to _ltrim(); the matching file under SYSPATH.'core/utf8/' is
 * required the first time this method runs.
 * @see http://php.net/ltrim
 *
 * @author  Andreas Gohr <andi@splitbrain.org>
 *
 * @param   string  input string
 * @param   string  string of characters to remove
 * @return  string  result of _ltrim()
 */
public static function ltrim($str, $charlist = NULL)
{
	$fn = __FUNCTION__;

	if ( ! isset(self::$called[$fn]))
	{
		// First call: require the helper file, then remember that it is loaded
		require SYSPATH.'core/utf8/'.$fn.EXT;
		self::$called[$fn] = TRUE;
	}

	return _ltrim($str, $charlist);
}
/**
 * Strips whitespace (or other UTF-8 characters) from the end of a string.
 * Delegates to _rtrim(); the matching file under SYSPATH.'core/utf8/' is
 * required the first time this method runs.
 * @see http://php.net/rtrim
 *
 * @author  Andreas Gohr <andi@splitbrain.org>
 *
 * @param   string  input string
 * @param   string  string of characters to remove
 * @return  string  result of _rtrim()
 */
public static function rtrim($str, $charlist = NULL)
{
	$fn = __FUNCTION__;

	if ( ! isset(self::$called[$fn]))
	{
		// First call: require the helper file, then remember that it is loaded
		require SYSPATH.'core/utf8/'.$fn.EXT;
		self::$called[$fn] = TRUE;
	}

	return _rtrim($str, $charlist);
}
/**
 * Returns the unicode ordinal for a character.
 * Delegates to _ord(); the matching file under SYSPATH.'core/utf8/' is
 * required the first time this method runs.
 * @see http://php.net/ord
 *
 * @author  Harry Fuecks <hfuecks@gmail.com>
 *
 * @param   string   UTF-8 encoded character
 * @return  integer  result of _ord()
 */
public static function ord($chr)
{
	if ( ! isset(self::$called[__FUNCTION__]))
	{
		require SYSPATH.'core/utf8/'.__FUNCTION__.EXT;

		// Function has been called
		self::$called[__FUNCTION__] = TRUE;
	}

	// Hand the helper's result back, as every other lazy-loading wrapper
	// in this class does; without this the method would always yield NULL
	return _ord($chr);
}
/**
 * Takes an UTF-8 string and returns an array of ints representing the Unicode characters.
 * Astral planes are supported i.e. the ints in the output can be > 0xFFFF.
 * Occurrences of the BOM are ignored. Surrogates are not allowed.
 * Delegates to _to_unicode(); the matching file under SYSPATH.'core/utf8/'
 * is required the first time this method runs.
 *
 * The Original Code is Mozilla Communicator client code.
 * The Initial Developer of the Original Code is Netscape Communications Corporation.
 * Portions created by the Initial Developer are Copyright (C) 1998 the Initial Developer.
 * Ported to PHP by Henri Sivonen <hsivonen@iki.fi>, see http://hsivonen.iki.fi/php-utf8/.
 * Slight modifications to fit with phputf8 library by Harry Fuecks <hfuecks@gmail.com>.
 *
 * @param   string   UTF-8 encoded string
 * @return  array    unicode code points
 * @return  boolean  FALSE if the string is invalid
 */
public static function to_unicode($str)
{
	$fn = __FUNCTION__;

	if ( ! isset(self::$called[$fn]))
	{
		// First call: require the helper file, then remember that it is loaded
		require SYSPATH.'core/utf8/'.$fn.EXT;
		self::$called[$fn] = TRUE;
	}

	return _to_unicode($str);
}
/**
 * Takes an array of ints representing the Unicode characters and returns a UTF-8 string.
 * Astral planes are supported i.e. the ints in the input can be > 0xFFFF.
 * Occurrences of the BOM are ignored. Surrogates are not allowed.
 * Delegates to _from_unicode(); the matching file under
 * SYSPATH.'core/utf8/' is required the first time this method runs.
 *
 * The Original Code is Mozilla Communicator client code.
 * The Initial Developer of the Original Code is Netscape Communications Corporation.
 * Portions created by the Initial Developer are Copyright (C) 1998 the Initial Developer.
 * Ported to PHP by Henri Sivonen <hsivonen@iki.fi>, see http://hsivonen.iki.fi/php-utf8/.
 * Slight modifications to fit with phputf8 library by Harry Fuecks <hfuecks@gmail.com>.
 *
 * @param   array    unicode code points representing a string
 * @return  string   utf8 string of characters
 * @return  boolean  FALSE if a code point cannot be found
 */
public static function from_unicode($arr)
{
	$fn = __FUNCTION__;

	if ( ! isset(self::$called[$fn]))
	{
		// First call: require the helper file, then remember that it is loaded
		require SYSPATH.'core/utf8/'.$fn.EXT;
		self::$called[$fn] = TRUE;
	}

	return _from_unicode($arr);
}