MDL-11075 Now saving to temp file, then outputting using filelib's readfile_chunked...
[moodle-pu.git] / lib / typo3 / class.t3lib_cs.php
blob28f7b1f7026eb438d9739816fdb7accc8a8344ec
1 <?php
2 /***************************************************************
3 * Copyright notice
5 * (c) 2003-2006 Kasper Skaarhoj (kasperYYYY@typo3.com)
6 * All rights reserved
8 * This script is part of the Typo3 project. The Typo3 project is
9 * free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
14 * The GNU General Public License can be found at
15 * http://www.gnu.org/copyleft/gpl.html.
17 * This script is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 * GNU General Public License for more details.
22 * This copyright notice MUST APPEAR in all copies of the script!
23 ***************************************************************/
24 /**
25 * Class for conversion between charsets.
27 * Typo Id: class.t3lib_cs.php,v 1.56 2006/05/03 08:47:30 masi Exp $
28 * Moodle $Id$
30 * @author Kasper Skaarhoj <kasperYYYY@typo3.com>
31 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
33 /**
34 * [CLASS/FUNCTION INDEX of SCRIPT]
38 * 136: class t3lib_cs
39 * 488: function parse_charset($charset)
40 * 507: function get_locale_charset($locale)
42 * SECTION: Charset Conversion functions
43 * 560: function conv($str,$fromCS,$toCS,$useEntityForNoChar=0)
44 * 600: function convArray(&$array,$fromCS,$toCS,$useEntityForNoChar=0)
45 * 617: function utf8_encode($str,$charset)
46 * 663: function utf8_decode($str,$charset,$useEntityForNoChar=0)
47 * 706: function utf8_to_entities($str)
48 * 739: function entities_to_utf8($str,$alsoStdHtmlEnt=0)
49 * 773: function utf8_to_numberarray($str,$convEntities=0,$retChar=0)
50 * 823: function UnumberToChar($cbyte)
51 * 868: function utf8CharToUnumber($str,$hex=0)
53 * SECTION: Init functions
54 * 911: function initCharset($charset)
55 * 973: function initUnicodeData($mode=null)
56 * 1198: function initCaseFolding($charset)
57 * 1260: function initToASCII($charset)
59 * SECTION: String operation functions
60 * 1331: function substr($charset,$string,$start,$len=null)
61 * 1384: function strlen($charset,$string)
62 * 1414: function crop($charset,$string,$len,$crop='')
63 * 1467: function strtrunc($charset,$string,$len)
64 * 1501: function conv_case($charset,$string,$case)
65 * 1527: function specCharsToASCII($charset,$string)
67 * SECTION: Internal string operation functions
68 * 1567: function sb_char_mapping($str,$charset,$mode,$opt='')
70 * SECTION: Internal UTF-8 string operation functions
71 * 1622: function utf8_substr($str,$start,$len=null)
72 * 1655: function utf8_strlen($str)
73 * 1676: function utf8_strtrunc($str,$len)
74 * 1698: function utf8_strpos($haystack,$needle,$offset=0)
75 * 1723: function utf8_strrpos($haystack,$needle)
76 * 1745: function utf8_char2byte_pos($str,$pos)
77 * 1786: function utf8_byte2char_pos($str,$pos)
78 * 1809: function utf8_char_mapping($str,$mode,$opt='')
80 * SECTION: Internal EUC string operation functions
81 * 1885: function euc_strtrunc($str,$len,$charset)
82 * 1914: function euc_substr($str,$start,$charset,$len=null)
83 * 1939: function euc_strlen($str,$charset)
84 * 1966: function euc_char2byte_pos($str,$pos,$charset)
85 * 2007: function euc_char_mapping($str,$charset,$mode,$opt='')
87 * TOTAL FUNCTIONS: 35
88 * (This index is automatically created/updated by the extension "extdeveval")
99 /**
100 * Notes on UTF-8
102 * Functions working on UTF-8 strings:
104 * - strchr/strstr
105 * - strrchr
106 * - substr_count
107 * - implode/explode/join
109 * Functions nearly working on UTF-8 strings:
111 * - strlen: returns the length in BYTES, if you need the length in CHARACTERS use utf8_strlen
112 * - trim/ltrim/rtrim: the second parameter 'charlist' won't work for characters not contained in 7-bit ASCII
113 * - strpos/strrpos: they return the BYTE position, if you need the CHARACTER position use utf8_strpos/utf8_strrpos
114 * - htmlentities: charset support for UTF-8 only since PHP 4.3.0
116 * Functions NOT working on UTF-8 strings:
118 * - str*cmp
119 * - stristr
120 * - stripos
121 * - substr
122 * - strrev
123 * - ereg/eregi
124 * - split/spliti
125 * - preg_*
126 * - ...
130 * Class for conversion between charsets
132 * @author Kasper Skaarhoj <kasperYYYY@typo3.com>
133 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
134 * @package TYPO3
135 * @subpackage t3lib
137 class t3lib_cs {
138 var $noCharByteVal=63; // ASCII Value for chars with no equivalent.
140 // This is the array where parsed conversion tables are stored (cached)
141 var $parsedCharsets=array();
143 // An array where case folding data will be stored (cached)
144 var $caseFolding=array();
146 // An array where charset-to-ASCII mappings are stored (cached)
147 var $toASCII=array();
149 // This tells the converter which charsets have two bytes per char:
150 var $twoByteSets=array(
151 'ucs-2'=>1, // 2-byte Unicode
154 // This tells the converter which charsets have four bytes per char:
155 var $fourByteSets=array(
156 'ucs-4'=>1, // 4-byte Unicode
157 'utf-32'=>1, // 4-byte Unicode (limited to the 21-bits of UTF-16)
160 // This tells the converter which charsets use a scheme like the Extended Unix Code:
161 var $eucBasedSets=array(
162 'gb2312'=>1, // Chinese, simplified.
163 'big5'=>1, // Chinese, traditional.
164 'euc-kr'=>1, // Korean
165 'shift_jis'=>1, // Japanese - WARNING: Shift-JIS includes half-width katakana single-bytes characters above 0x80!
168 // see http://developer.apple.com/documentation/macos8/TextIntlSvcs/TextEncodingConversionManager/TEC1.5/TEC.b0.html
169 // http://czyborra.com/charsets/iso8859.html
var $synonyms=array(
		// Alias (lowercase) => canonical charset name; consulted by parse_charset().
	'us' => 'ascii',
	'us-ascii'=> 'ascii',
	'cp819' => 'iso-8859-1',
	'ibm819' => 'iso-8859-1',
	'iso-ir-100' => 'iso-8859-1',
	'iso-ir-109' => 'iso-8859-2',
	'iso-ir-148' => 'iso-8859-9',
	'iso-ir-199' => 'iso-8859-14',
	'iso-ir-203' => 'iso-8859-15',
	'csisolatin1' => 'iso-8859-1',
	'csisolatin2' => 'iso-8859-2',
	'csisolatin3' => 'iso-8859-3',
	'csisolatin5' => 'iso-8859-9',
	'csisolatin8' => 'iso-8859-14',
	'csisolatin9' => 'iso-8859-15',
	'csisolatingreek' => 'iso-8859-7',
	'iso-celtic' => 'iso-8859-14',
	'latin1' => 'iso-8859-1',
	'latin2' => 'iso-8859-2',
	'latin3' => 'iso-8859-3',
	'latin5' => 'iso-8859-9',
	'latin6' => 'iso-8859-10',
	'latin8' => 'iso-8859-14',
	'latin9' => 'iso-8859-15',
	'l1' => 'iso-8859-1',
	'l2' => 'iso-8859-2',
	'l3' => 'iso-8859-3',
	'l5' => 'iso-8859-9',
	'l6' => 'iso-8859-10',
	'l8' => 'iso-8859-14',
	'l9' => 'iso-8859-15',
	'cyrillic' => 'iso-8859-5',
	'arabic' => 'iso-8859-6',
	'tis-620' => 'iso-8859-11',
	'win874' => 'windows-874',
	'win1250' => 'windows-1250',
	'win1251' => 'windows-1251',
	'win1252' => 'windows-1252',
	'win1253' => 'windows-1253',
	'win1254' => 'windows-1254',
	'win1255' => 'windows-1255',
	'win1256' => 'windows-1256',
	'win1257' => 'windows-1257',
	'win1258' => 'windows-1258',
	'cp874' => 'windows-874',	// code page name used for Thai in the Windows script table
	'cp1250' => 'windows-1250',
	'cp1251' => 'windows-1251',
	'cp1252' => 'windows-1252',
	'ms-ee' => 'windows-1250',
	'ms-ansi' => 'windows-1252',
	'ms-greek' => 'windows-1253',
	'ms-turk' => 'windows-1254',
	'winbaltrim' => 'windows-1257',
	'koi-8ru' => 'koi-8r',
	'koi8r' => 'koi-8r',
	'cp878' => 'koi-8r',
	'mac' => 'macroman',
	'macintosh' => 'macroman',
	'euc-cn' => 'gb2312',
	'x-euc-cn' => 'gb2312',
	'euccn' => 'gb2312',
	'cp936' => 'gb2312',
	'big-5' => 'big5',
	'cp950' => 'big5',
	'eucjp' => 'euc-jp',
	'sjis' => 'shift_jis',
	'shift-jis' => 'shift_jis',
	'cp932' => 'shift_jis',
	'cp949' => 'euc-kr',
	'utf7' => 'utf-7',
	'utf8' => 'utf-8',	// was listed twice; one entry is enough
	'utf16' => 'utf-16',
	'utf32' => 'utf-32',
	'ucs2' => 'ucs-2',
	'ucs4' => 'ucs-4',
);
248 // mapping of iso-639:2 language codes to script names
var $lang_to_script=array(
		// Language identifier (lowercase) => script/family name. The family names are the
		// keys of the script-to-charset tables, so every value here must match one of them.

		// iso-639:2 language codes, see:
		// http://www.w3.org/WAI/ER/IG/ert/iso639.htm
		// http://www.loc.gov/standards/iso639-2/langcodes.html
		// http://www.unicode.org/onlinedat/languages.html
	'ar' => 'arabic',
	'bg' => 'cyrillic',		// Bulgarian
	'bs' => 'east_european',	// Bosnian
	'cs' => 'east_european',	// Czech
	'da' => 'west_european',	// Danish
	'de' => 'west_european',	// German
	'el' => 'greek',		// Greek (ISO code; 'gr' below is kept for compatibility)
	'es' => 'west_european',	// Spanish
	'et' => 'estonian',
	'eo' => 'unicode',		// Esperanto
	'eu' => 'west_european',	// Basque
	'fa' => 'arabic',		// Persian
	'fi' => 'west_european',	// Finnish
	'fo' => 'west_european',	// Faroese
	'fr' => 'west_european',	// French
	'gr' => 'greek',
	'he' => 'hebrew',		// Hebrew (since 1998)
	'hi' => 'unicode',		// Hindi
	'hr' => 'east_european',	// Croatian
	'hu' => 'east_european',	// Hungarian
	'iw' => 'hebrew',		// Hebrew (til 1998)
	'is' => 'west_european',	// Icelandic
	'it' => 'west_european',	// Italian
	'ja' => 'japanese',
	'kl' => 'west_european',	// Greenlandic
	'ko' => 'korean',
	'lt' => 'lithuanian',
	'lv' => 'west_european',	// Latvian/Lettish
	'nl' => 'west_european',	// Dutch
	'no' => 'west_european',	// Norwegian
	'pl' => 'east_european',	// Polish
	'pt' => 'west_european',	// Portuguese
	'ro' => 'east_european',	// Romanian
	'ru' => 'cyrillic',		// Russian
	'sk' => 'east_european',	// Slovak
	'sl' => 'east_european',	// Slovenian
	'sr' => 'cyrillic',		// Serbian
	'sv' => 'west_european',	// Swedish
	'th' => 'thai',
	'tr' => 'turkish',		// Turkish
	'uk' => 'cyrillic',		// Ukrainian
	'vi' => 'vietnamese',
	'zh' => 'chinese',
		// MS language codes, see http://msdn.microsoft.com/library/default.asp?url=/library/en-us/vclib/html/_crt_language_strings.asp
		// http://msdn.microsoft.com/library/default.asp?url=/library/en-us/wceinternational5/html/wce50conLanguageIdentifiersandLocales.asp
	'ara' => 'arabic',
	'bgr' => 'cyrillic',		// Bulgarian
	'cat' => 'west_european',	// Catalan
	'chs' => 'simpl_chinese',
	'cht' => 'trad_chinese',
	'csy' => 'east_european',	// Czech
	'dan' => 'west_european',	// Danish
	'deu' => 'west_european',	// German
	'dea' => 'west_european',	// German (Austrian)
	'des' => 'west_european',	// German (Swiss)
	'ena' => 'west_european',	// English (Australian)
	'enc' => 'west_european',	// English (Canadian)
	'eng' => 'west_european',	// English
	'enz' => 'west_european',	// English (New Zealand)
	'enu' => 'west_european',	// English (United States)
	'euq' => 'west_european',	// Basque
	'fos' => 'west_european',	// Faroese
	'far' => 'arabic',		// Persian
	'fin' => 'west_european',	// Finnish
	'fra' => 'west_european',	// French
	'frb' => 'west_european',	// French (Belgian)
	'frc' => 'west_european',	// French (Canadian)
	'frs' => 'west_european',	// French (Swiss)
	'ell' => 'greek',
	'heb' => 'hebrew',
	'hin' => 'unicode',		// Hindi
	'hun' => 'east_european',	// Hungarian
	'isl' => 'west_european',	// Icelandic (value was misspelled 'west_euorpean' and never matched a script)
	'ita' => 'west_european',	// Italian
	'its' => 'west_european',	// Italian (Swiss)
	'jpn' => 'japanese',
	'kor' => 'korean',
	'lth' => 'lithuanian',
	'lvi' => 'west_european',	// Latvian/Lettish
	'msl' => 'west_european',	// Malay
	'nlb' => 'west_european',	// Dutch (Belgian)
	'nld' => 'west_european',	// Dutch
	'nor' => 'west_european',	// Norwegian (bokmal)
	'non' => 'west_european',	// Norwegian (nynorsk)
	'plk' => 'east_european',	// Polish
	'ptg' => 'west_european',	// Portuguese
	'ptb' => 'west_european',	// Portuguese (Brazil)
	'rom' => 'east_european',	// Romanian
	'rus' => 'cyrillic',		// Russian
	'slv' => 'east_european',	// Slovenian
	'sky' => 'east_european',	// Slovak
	'srl' => 'east_european',	// Serbian (Latin)
	'srb' => 'cyrillic',		// Serbian (Cyrillic)
	'esp' => 'west_european',	// Spanish (trad. sort)
	'esm' => 'west_european',	// Spanish (Mexican)
	'esn' => 'west_european',	// Spanish (internat. sort)
	'sve' => 'west_european',	// Swedish
	'tha' => 'thai',
	'trk' => 'turkish',
	'ukr' => 'cyrillic',		// Ukrainian
		// English language names
	'arabic' => 'arabic',
	'basque' => 'west_european',
	'bosnian' => 'east_european',
	'bulgarian' => 'cyrillic',	// consistent with 'bg' and 'bgr' above
	'catalan' => 'west_european',
	'croatian' => 'east_european',
	'czech' => 'east_european',
	'danish' => 'west_european',
	'dutch' => 'west_european',
	'english' => 'west_european',
	'esperanto' => 'unicode',
	'estonian' => 'estonian',
	'faroese' => 'west_european',
	'farsi' => 'arabic',
	'finnish' => 'west_european',
	'french' => 'west_european',
	'galician' => 'west_european',
	'german' => 'west_european',
	'greek' => 'greek',
	'greenlandic' => 'west_european',
	'hebrew' => 'hebrew',
	'hindi' => 'unicode',
	'hungarian' => 'east_european',
	'icelandic' => 'west_european',
	'italian' => 'west_european',
	'latvian' => 'west_european',
	'lettish' => 'west_european',
	'lithuanian' => 'lithuanian',
	'malay' => 'west_european',
	'norwegian' => 'west_european',
	'persian' => 'arabic',
	'polish' => 'east_european',
	'portuguese' => 'west_european',
	'russian' => 'cyrillic',
	'romanian' => 'east_european',
	'serbian' => 'cyrillic',
	'slovak' => 'east_european',
	'slovenian' => 'east_european',
	'spanish' => 'west_european',
	'swedish' => 'west_european',	// key was misspelled 'svedish'
	'thai' => 'thai',		// key was misspelled 'that'
	'turkish' => 'turkish',
	'ukrainian' => 'cyrillic',
);
398 // mapping of language (family) names to charsets on Unix
399 var $script_to_charset_unix=array(
400 'west_european' => 'iso-8859-1',
401 'estonian' => 'iso-8859-1',
402 'east_european' => 'iso-8859-2',
403 'baltic' => 'iso-8859-4',
404 'cyrillic' => 'iso-8859-5',
405 'arabic' => 'iso-8859-6',
406 'greek' => 'iso-8859-7',
407 'hebrew' => 'iso-8859-8',
408 'turkish' => 'iso-8859-9',
409 'thai' => 'iso-8859-11', // = TIS-620
410 'lithuanian' => 'iso-8859-13',
411 'chinese' => 'gb2312', // = euc-cn
412 'japanese' => 'euc-jp',
413 'korean' => 'euc-kr',
414 'simpl_chinese' => 'gb2312',
415 'trad_chinese' => 'big5',
416 'vietnamese' => '',
417 'unicode' => 'utf-8',
420 // mapping of language (family) names to charsets on Windows
421 var $script_to_charset_windows=array(
422 'east_european' => 'windows-1250',
423 'cyrillic' => 'windows-1251',
424 'west_european' => 'windows-1252',
425 'greek' => 'windows-1253',
426 'turkish' => 'windows-1254',
427 'hebrew' => 'windows-1255',
428 'arabic' => 'windows-1256',
429 'baltic' => 'windows-1257',
430 'estonian' => 'windows-1257',
431 'lithuanian' => 'windows-1257',
432 'vietnamese' => 'windows-1258',
433 'thai' => 'cp874',
434 'korean' => 'cp949',
435 'chinese' => 'gb2312',
436 'japanese' => 'shift_jis',
437 'simpl_chinese' => 'gb2312',
438 'trad_chinese' => 'big5',
441 // mapping of locale names to charsets
var $locale_to_charset=array(
		// Exact locale name => charset. get_locale_charset() lowercases the locale before
		// looking it up here, so every key must be lowercase to be reachable.
	'japanese.euc' => 'euc-jp',
	'ja_jp.ujis' => 'euc-jp',
	'korean.euc' => 'euc-kr',
	'sr@latn' => 'iso-8859-2',	// was 'sr@Latn', which a lowercased lookup could never hit
	'zh_cn' => 'gb2312',
	'zh_hk' => 'big5',
	'zh_tw' => 'big5',
);
452 // TYPO3 specific: Array with the system charsets used for each system language in TYPO3:
453 // Empty values means "iso-8859-1"
454 var $charSetArray = array(
455 'dk' => '',
456 'de' => '',
457 'no' => '',
458 'it' => '',
459 'fr' => '',
460 'es' => '',
461 'nl' => '',
462 'cz' => 'windows-1250',
463 'pl' => 'iso-8859-2',
464 'si' => 'windows-1250',
465 'fi' => '',
466 'tr' => 'iso-8859-9',
467 'se' => '',
468 'pt' => '',
469 'ru' => 'windows-1251',
470 'ro' => 'iso-8859-2',
471 'ch' => 'gb2312',
472 'sk' => 'windows-1250',
473 'lt' => 'windows-1257',
474 'is' => 'utf-8',
475 'hr' => 'windows-1250',
476 'hu' => 'iso-8859-2',
477 'gl' => '',
478 'th' => 'iso-8859-11',
479 'gr' => 'iso-8859-7',
480 'hk' => 'big5',
481 'eu' => '',
482 'bg' => 'windows-1251',
483 'br' => '',
484 'et' => 'iso-8859-4',
485 'ar' => 'iso-8859-6',
486 'he' => 'utf-8',
487 'ua' => 'windows-1251',
488 'jp' => 'shift_jis',
489 'lv' => 'utf-8',
490 'vn' => 'utf-8',
491 'ca' => 'iso-8859-15',
492 'ba' => 'iso-8859-2',
493 'kr' => 'euc-kr',
494 'eo' => 'utf-8',
495 'my' => '',
496 'hi' => 'utf-8',
497 'fo' => 'utf-8',
498 'fa' => 'utf-8',
499 'sr' => 'utf-8'
502 // TYPO3 specific: Array with the iso names used for each system language in TYPO3:
503 // Missing keys means: same as Typo3
504 var $isoArray = array(
505 'ba' => 'bs',
506 'br' => 'pt_BR',
507 'ch' => 'zh_CN',
508 'cz' => 'cs',
509 'dk' => 'da',
510 'si' => 'sl',
511 'se' => 'sv',
512 'gl' => 'kl',
513 'gr' => 'el',
514 'hk' => 'zh_HK',
515 'kr' => 'ko',
516 'ua' => 'uk',
517 'jp' => 'ja',
518 'vn' => 'vi',
522 * Normalizes a charset name: converts it to lowercase and resolves known synonyms to the canonical name.
524 * @param string Input charset
525 * @return string Normalized charset
526 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
function parse_charset($charset) {
		// The synonym table is keyed by lowercase names, so fold the case before the lookup.
	$normalized = strtolower($charset);

		// A known alias resolves to its canonical name; anything else is returned lowercased.
	return isset($this->synonyms[$normalized]) ? $this->synonyms[$normalized] : $normalized;
}
536 * Get the charset of a locale.
538 * ln language
539 * ln_CN language / country
540 * ln_CN.cs language / country / charset
541 * ln_CN.cs@mod language / country / charset / modifier
543 * @param string Locale string
544 * @return string Charset resolved for locale string
545 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
function get_locale_charset($locale) {
		// Resolution order: exact locale match, explicit charset in the locale string,
		// the "euro" modifier, and finally the language's script mapped to an OS-specific charset.
	$locale = strtolower($locale);

		// exact locale specific charset?
	if (isset($this->locale_to_charset[$locale])) return $this->locale_to_charset[$locale];

		// get modifier (array_pad() keeps list() from reading a missing offset when there is no '@')
	list($locale,$modifier) = array_pad(explode('@',$locale),2,'');

		// locale contains charset: use it
	list($locale,$charset) = array_pad(explode('.',$locale),2,'');
	if ($charset) return $this->parse_charset($charset);

		// modifier is 'euro' (after charset check, because of xx.utf-8@euro)
	if ($modifier == 'euro') return 'iso-8859-15';

		// get language (the country part after '_' is not needed)
	list($language) = explode('_',$locale);
	$script = isset($this->lang_to_script[$language]) ? $this->lang_to_script[$language] : '';

		// Unknown scripts and scripts mapped to an empty charset fall back to the OS default.
	if (TYPO3_OS == 'WIN') {
		$cs = !empty($this->script_to_charset_windows[$script]) ? $this->script_to_charset_windows[$script] : 'windows-1252';	// fallback was misspelled 'window-1252'
	} else {
		$cs = !empty($this->script_to_charset_unix[$script]) ? $this->script_to_charset_unix[$script] : 'iso-8859-1';
	}

	return $cs;
}
584 /********************************************
586 * Charset Conversion functions
588 ********************************************/
591 * Convert from one charset to another charset.
593 * @param string Input string
594 * @param string From charset (the current charset of the string)
595 * @param string To charset (the output charset wanted)
596 * @param boolean If set, then characters that are not available in the destination character set will be encoded as numeric entities
597 * @return string Converted string
598 * @see convArray()
function conv($str,$fromCS,$toCS,$useEntityForNoChar=0) {
		// Converts $str from $fromCS to $toCS. A configured PHP extension is tried first;
		// if it is unavailable or reports failure, the built-in table conversion is used.
	if ($fromCS==$toCS) return $str;

		// PHP-libs don't support fallback to SGML entities, but UTF-8 handles everything
	if ($toCS=='utf-8' || !$useEntityForNoChar) {
			// The setting is optional: a missing entry simply means "no native library".
		$convMethod = isset($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_convMethod']) ? $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_convMethod'] : '';

			// Each branch checks that the extension is loaded, so a stale setting
			// falls through to the built-in conversion instead of a fatal error.
		switch($convMethod) {
			case 'mbstring':
				if (function_exists('mb_convert_encoding')) {
					$conv_str = mb_convert_encoding($str,$toCS,$fromCS);
					if (false !== $conv_str) return $conv_str; // returns false for unsupported charsets
				}
				break;

			case 'iconv':
				if (function_exists('iconv')) {
					$conv_str = iconv($fromCS,$toCS.'//IGNORE',$str);
					if (false !== $conv_str) return $conv_str;
				}
				break;

			case 'recode':
				if (function_exists('recode_string')) {
					$conv_str = recode_string($fromCS.'..'.$toCS,$str);
					if (false !== $conv_str) return $conv_str;
				}
				break;
		}
			// fallback to TYPO3 conversion
	}

		// Built-in conversion goes through UTF-8 as the intermediate representation.
	if ($fromCS!='utf-8') $str=$this->utf8_encode($str,$fromCS);
	if ($toCS!='utf-8') $str=$this->utf8_decode($str,$toCS,$useEntityForNoChar);
	return $str;
}
630 * Convert all elements in ARRAY from one charset to another charset.
631 * NOTICE: Array is passed by reference!
633 * @param string Input array, possibly multidimensional
634 * @param string From charset (the current charset of the string)
635 * @param string To charset (the output charset wanted)
636 * @param boolean If set, then characters that are not available in the destination character set will be encoded as numeric entities
637 * @return void
638 * @see conv()
function convArray(&$array,$fromCS,$toCS,$useEntityForNoChar=0) {
		// Walks the array in place: nested arrays are handled recursively,
		// every other element is replaced by its converted value.
	foreach($array as $idx => $unused) {
		if (!is_array($array[$idx])) {
			$array[$idx] = $this->conv($array[$idx],$fromCS,$toCS,$useEntityForNoChar);
			continue;
		}

		$this->convArray($array[$idx],$fromCS,$toCS,$useEntityForNoChar);
	}
}
651 * Converts $str from $charset to UTF-8
653 * @param string String in local charset to convert to UTF-8
654 * @param string Charset, lowercase. Must be found in csconvtbl/ folder.
655 * @return string Output string, converted to UTF-8
function utf8_encode($str,$charset) {
		// Converts $str from $charset to UTF-8 using the parsed conversion table.
		// Nothing (NULL) is returned when the table for $charset cannot be initialised.

	if ($charset === 'utf-8') return $str;

		// Charset is case-insensitive.
	if ($this->initCharset($charset)) {	// Parse conv. table if not already...
		$strLen = strlen($str);
		$outStr='';

		for ($a=0;$a<$strLen;$a++) {	// Traverse each char in string.
			$chr=substr($str,$a,1);
			$ord=ord($chr);
			if (isset($this->twoByteSets[$charset])) {	// If the charset has two bytes per char
					// Bounds-checked [] access: the former $str{...} syntax no longer parses on PHP 8,
					// and a dangling odd byte at the end now counts as 0 without reading past the string.
				$ord2 = ($a+1 < $strLen) ? ord($str[$a+1]) : 0;
				$ord = $ord<<8 | $ord2; // assume big endian

				if (isset($this->parsedCharsets[$charset]['local'][$ord])) {	// If the local char-number was found in parsed conv. table then we use that, otherwise the substitute char (noCharByteVal)
					$outStr.=$this->parsedCharsets[$charset]['local'][$ord];
				} else $outStr.=chr($this->noCharByteVal);	// No char exists
				$a++;	// skip the second byte of the pair
			} elseif ($ord>127) {	// Byte values above 127 have no ASCII meaning and must be mapped through the table
				if (isset($this->eucBasedSets[$charset])) {	// EUC uses two-bytes above 127; we get both and advance pointer and make $ord a 16bit int.
					if ($charset != 'shift_jis' || ($ord < 0xA0 || $ord > 0xDF)) {	// Shift-JIS: chars between 160 and 223 are single byte
						$a++;
						$ord2=ord(substr($str,$a,1));
						$ord = $ord*256+$ord2;
					}
				}

				if (isset($this->parsedCharsets[$charset]['local'][$ord])) {	// If the local char-number was found in parsed conv. table then we use that, otherwise the substitute char (noCharByteVal)
					$outStr.= $this->parsedCharsets[$charset]['local'][$ord];
				} else $outStr.= chr($this->noCharByteVal);	// No char exists
			} else $outStr.= $chr;	// ... otherwise it's just ASCII 0-127 and one byte. Transparent
		}
		return $outStr;
	}
}
696 * Converts $str from UTF-8 to $charset
698 * @param string String in UTF-8 to convert to local charset
699 * @param string Charset, lowercase. Must be found in csconvtbl/ folder.
700 * @param boolean If set, then characters that are not available in the destination character set will be encoded as numeric entities
701 * @return string Output string, converted to local charset
function utf8_decode($str,$charset,$useEntityForNoChar=0) {
		// Converts a UTF-8 string to $charset via the parsed conversion table.
		// Nothing (NULL) is returned when the table for $charset cannot be initialised.
	if (!$this->initCharset($charset)) return;

	$byteCount = strlen($str);
	$result = '';
	$pos = 0;

	while ($pos < $byteCount) {
		$byte = substr($str,$pos,1);
		$code = ord($byte);
		$pos++;

			// ASCII 0-127 is one byte and passes through untouched.
		if ($code <= 127) {
			$result .= $byte;
			continue;
		}

			// A byte above 127 without bit 6 set is a continuation byte with no lead byte before it.
		if (!($code & 64)) {
			$result .= chr($this->noCharByteVal);
			continue;
		}

			// Each set bit below bit 7 of the lead byte announces one more byte in the sequence.
		$sequence = $byte;
		for ($bit = 64; $bit && ($code & $bit); $bit >>= 1) {
			$sequence .= substr($str,$pos,1);
			$pos++;
		}

		if (isset($this->parsedCharsets[$charset]['utf8'][$sequence])) {
			$local = $this->parsedCharsets[$charset]['utf8'][$sequence];	// The local number
				// Local numbers above 255 are written as two bytes (16 bit word assumed).
			$result .= ($local > 255) ? chr(($local >> 8) & 255).chr($local & 255) : chr($local);
		} elseif ($useEntityForNoChar) {
				// No local equivalent: emit a hexadecimal numeric entity instead.
			$result .= '&#'.$this->utf8CharToUnumber($sequence,1).';';
		} else {
			$result .= chr($this->noCharByteVal);	// No char exists
		}
	}

	return $result;
}
741 * Converts all chars > 127 to numeric entities.
743 * @param string Input string
744 * @return string Output string
function utf8_to_entities($str) {
		// Replaces every multibyte UTF-8 sequence with a hexadecimal numeric entity;
		// ASCII bytes are copied unchanged.
	$byteCount = strlen($str);
	$result = '';
	$pos = 0;

	while ($pos < $byteCount) {
		$byte = substr($str,$pos,1);
		$code = ord($byte);
		$pos++;

		if ($code <= 127) {
				// ASCII 0-127: one byte, transparent.
			$result .= $byte;
		} elseif ($code & 64) {
				// Lead byte: each set bit below bit 7 announces one more byte in the sequence.
			$sequence = $byte;
			for ($bit = 64; $bit && ($code & $bit); $bit >>= 1) {
				$sequence .= substr($str,$pos,1);
				$pos++;
			}
			$result .= '&#'.$this->utf8CharToUnumber($sequence,1).';';
		} else {
				// Continuation byte without a lead byte: substitute character.
			$result .= chr($this->noCharByteVal);
		}
	}

	return $result;
}
773 * Converts numeric entities (UNICODE, eg. decimal (&#1234;) or hexadecimal (&#x1b;)) to UTF-8 multibyte chars
775 * @param string Input string, UTF-8
776 * @param boolean If set, then all string-HTML entities (like &amp; or &pound; will be converted as well)
777 * @return string Output string
function entities_to_utf8($str,$alsoStdHtmlEnt=0) {
		// Replaces numeric entities (decimal or hexadecimal) with UTF-8 characters and,
		// when $alsoStdHtmlEnt is set, named HTML entities as well. Unknown entities are kept as-is.
	$trans_tbl = array();
	if ($alsoStdHtmlEnt) {
			// NOTE(review): the code below assumes this table is in iso-8859-1. From PHP 5.4 on the
			// default table encoding is UTF-8 - confirm and pass the encoding explicitly if needed.
		$trans_tbl = array_flip(get_html_translation_table(HTML_ENTITIES));
	}

		// Split on "&name;" and keep the captured name: even indexes are plain text, odd indexes
		// are entity names. This replaces ereg_replace(), which no longer exists in PHP 7+.
	$parts = preg_split('/&([#[:alnum:]]*);/',$str,-1,PREG_SPLIT_DELIM_CAPTURE);
	foreach($parts as $k => $v) {
		if ($k%2) {
			if (substr($v,0,1)=='#') {	// Dec or hex entities:
				if (strtolower(substr($v,1,1))=='x') {	// "&#x..;" and "&#X..;" are both hexadecimal
					$parts[$k] = $this->UnumberToChar(hexdec(substr($v,2)));
				} else {
						// Cast so the character builder always receives an integer.
					$parts[$k] = $this->UnumberToChar((int)substr($v,1));
				}
			} elseif ($alsoStdHtmlEnt && isset($trans_tbl['&'.$v.';'])) {	// Other entities:
				$parts[$k] = $this->utf8_encode($trans_tbl['&'.$v.';'],'iso-8859-1');
			} else {	// No conversion:
				$parts[$k] ='&'.$v.';';
			}
		}
	}

	return implode('',$parts);
}
806 * Converts all chars in the input UTF-8 string into integer numbers returned in an array
808 * @param string Input string, UTF-8
809 * @param boolean If set, then all HTML entities (like &amp; or &pound; or &#123; or &#x3f5d;) will be detected as characters.
810 * @param boolean If set, then instead of integer numbers the real UTF-8 char is returned.
811 * @return array Output array with the char numbers
function utf8_to_numberarray($str,$convEntities=0,$retChar=0) {
		// Splits a UTF-8 string into one array element per character: the code number by
		// default, or the character itself when $retChar is set.

		// If entities must be registered as well, decode them to UTF-8 first.
	if ($convEntities) {
		$str = $this->entities_to_utf8($str,1);
	}

	$byteCount = strlen($str);
	$items = array();
	$pos = 0;

	while ($pos < $byteCount) {
		$byte = substr($str,$pos,1);
		$code = ord($byte);
		$pos++;

		if ($code <= 127) {
				// ASCII 0-127: one byte, transparent.
			$items[] = $retChar ? chr($code) : $code;
		} elseif ($code & 64) {
				// Lead byte: each set bit below bit 7 announces one more byte in the sequence.
			$sequence = $byte;
			for ($bit = 64; $bit && ($code & $bit); $bit >>= 1) {
				$sequence .= substr($str,$pos,1);
				$pos++;
			}
			$items[] = $retChar ? $sequence : $this->utf8CharToUnumber($sequence);
		} else {
				// Continuation byte without a lead byte: substitute character.
			$items[] = $retChar ? chr($this->noCharByteVal) : $this->noCharByteVal;
		}
	}

	return $items;
}
845 * Converts a UNICODE number to a UTF-8 multibyte character
846 * Algorithm based on script found at From: http://czyborra.com/utf/
847 * Unit-tested by Kasper
849 * The binary representation of the character's integer value is thus simply spread across the bytes and the number of high bits set in the lead byte announces the number of bytes in the multibyte sequence:
851 * bytes | bits | representation
852 * 1 | 7 | 0vvvvvvv
853 * 2 | 11 | 110vvvvv 10vvvvvv
854 * 3 | 16 | 1110vvvv 10vvvvvv 10vvvvvv
855 * 4 | 21 | 11110vvv 10vvvvvv 10vvvvvv 10vvvvvv
856 * 5 | 26 | 111110vv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
857 * 6 | 31 | 1111110v 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
859 * @param integer UNICODE integer
860 * @return string UTF-8 multibyte character string
861 * @see utf8CharToUnumber()
function UnumberToChar($cbyte) {
		// Builds the UTF-8 byte sequence for a code number. Numbers that need more than
		// 31 bits cannot be expressed and yield the substitute character.

		// 7 bit values are a single byte.
	if ($cbyte < 0x80) {
		return chr($cbyte);
	}

		// Each layout: exclusive upper bound, lead byte marker, number of continuation bytes.
	$layouts = array(
		array(0x800, 0xC0, 1),
		array(0x10000, 0xE0, 2),
		array(0x200000, 0xF0, 3),
		array(0x4000000, 0xF8, 4),
		array(0x80000000, 0xFC, 5),
	);

	foreach ($layouts as $layout) {
		list($limit,$marker,$trailing) = $layout;
		if ($cbyte < $limit) {
				// The lead byte carries the topmost bits, every following byte the next 6 bits.
			$char = chr($marker | ($cbyte >> (6 * $trailing)));
			for ($shift = 6 * ($trailing - 1); $shift >= 0; $shift -= 6) {
				$char .= chr(0x80 | (($cbyte >> $shift) & 0x3F));
			}
			return $char;
		}
	}

		// Cannot express a 32-bit character in UTF-8
	return chr($this->noCharByteVal);
}
900 * Converts a UTF-8 Multibyte character to a UNICODE number
901 * Unit-tested by Kasper
903 * @param string UTF-8 multibyte character string
904 * @param boolean If set, then a hex. number is returned.
905 * @return integer UNICODE integer
906 * @see UnumberToChar()
function utf8CharToUnumber($str,$hex=0) {
		// Decodes the first UTF-8 character of $str to its code number. With $hex set the
		// result is returned as a string of the form 'x<hex digits>', otherwise as an integer.
	$lead = ord(substr($str,0,1));	// First byte

	if (($lead & 192) == 192) {	// This verifies that it IS a multi byte string
			// Count the continuation bytes: one per set bit below bit 7 of the lead byte.
			// When the loop ends, $mask is the first clear bit (or 0 if every bit was set).
		$trailing = 0;
		for ($mask = 64; $mask && ($lead & $mask); $mask >>= 1) {
			$trailing++;
		}

			// Payload of the lead byte = the bits below that first clear bit. The invalid lead
			// bytes 0xFE/0xFF carry no payload; the former string-based code produced garbage for them.
		$int = $mask ? ($lead & ($mask - 1)) : 0;

			// Every continuation byte contributes its lower 6 bits; a missing byte counts as 0.
		for ($i = 1; $i <= $trailing; $i++) {
			$int = ($int << 6) | (ord(substr($str,$i,1)) & 0x3F);
		}
	} else $int = $lead;	// Single byte: the byte value is the number

	return $hex ? 'x'.dechex($int) : $int;
}
935 /********************************************
937 * Init functions
939 ********************************************/
/**
 * This will initialize a charset for use if it's defined in the PATH_t3lib.'csconvtbl/' folder
 * This function is automatically called by the conversion functions
 *
 * Two table layouts are recognised (detected on the first data line):
 * "whitespaced" like "0x0A 0x000A #LINE FEED" and "ms-token" like "B9 = U+00B9 : SUPERSCRIPT ONE".
 * Only local values above 127 are stored.
 *
 * PLEASE SEE: http://www.unicode.org/Public/MAPPINGS/
 *
 * @param	string		The charset to be initialized. Use lowercase charset always (the charset must match exactly with a filename in csconvtbl/ folder ([charset].tbl)
 * @return	integer		Returns '1' if already loaded. Returns FALSE if charset conversion table was not found. Returns '2' if the charset conversion table was found and loaded (from cache or by parsing).
 * @access private
 */
function initCharset($charset)	{
		// Only process if the charset is not yet loaded:
	if (isset($this->parsedCharsets[$charset]) && is_array($this->parsedCharsets[$charset]))	{
		return 1;
	}

		// Conversion table filename:
	$charsetConvTableFile = PATH_t3lib.'csconvtbl/'.$charset.'.tbl';
	if (!$charset || !t3lib_div::validPathStr($charsetConvTableFile) || !@is_file($charsetConvTableFile))	{
		return false;
	}

		// Cache file for charsets:
		// Caching brought parsing time for gb2312 down from 2400 ms to 150 ms. For other charsets we are talking 11 ms down to zero.
	$cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/charset_'.$charset.'.tbl');
	if ($cacheFile && @is_file($cacheFile))	{
		$cached = unserialize(t3lib_div::getUrl($cacheFile));
		if (is_array($cached))	{
			$this->parsedCharsets[$charset] = $cached;
			return 2;
		}
			// Unreadable cache: fall through, re-parse the table and rewrite the cache.
	}

		// Pattern of a "whitespaced" line (two 0x-prefixed numbers separated by whitespace):
	$whitespacedPattern = '/[[:space:]]*0x([[:alnum:]]*)[[:space:]]+0x([[:alnum:]]*)[[:space:]]+/';

		// Parse conversion table into lines:
	$lines = t3lib_div::trimExplode(chr(10),t3lib_div::getUrl($charsetConvTableFile),1);
		// Initialize the internal variable holding the conv. table:
	$this->parsedCharsets[$charset] = array('local'=>array(),'utf8'=>array());

	$detectedType = '';
	foreach ($lines as $value)	{
			// Comment lines and blanks are ignored.
		if (!trim($value) || substr($value,0,1)=='#')	continue;

			// Detect type if not done yet: (Done on first real line)
		if (!$detectedType)	{
			$detectedType = preg_match($whitespacedPattern,$value) ? 'whitespaced' : 'ms-token';
		}

		if ($detectedType=='ms-token')	{
			$parts = preg_split('/=|:/',$value,3);
			$hexbyte = $parts[0];
			$utf8 = isset($parts[1]) ? $parts[1] : '';
		} else {
			$regA = array();
			if (!preg_match($whitespacedPattern,$value,$regA))	continue;	// line does not follow the detected layout
			$hexbyte = $regA[1];
			$utf8 = 'U+'.$regA[2];
		}

		$decval = hexdec(trim($hexbyte));
		if ($decval>127)	{
				// strip the leading "U+" before reading the hex number
			$utf8decval = hexdec(substr(trim($utf8),2));
			$utf8Char = $this->UnumberToChar($utf8decval);
			$this->parsedCharsets[$charset]['local'][$decval] = $utf8Char;
			$this->parsedCharsets[$charset]['utf8'][$utf8Char] = $decval;
		}
	}

	if ($cacheFile)	{
		t3lib_div::writeFileToTypo3tempDir($cacheFile,serialize($this->parsedCharsets[$charset]));
	}

	return 2;
}
/**
 * This function initializes all UTF-8 character data tables.
 *
 * Reads unidata/UnicodeData.txt (mandatory) plus unidata/SpecialCasing.txt and
 * unidata/Translit.txt (both optional) and fills $this->caseFolding['utf-8'] and
 * $this->toASCII['utf-8']. Both tables are written to cache files afterwards.
 *
 * PLEASE SEE: http://www.unicode.org/Public/UNIDATA/
 *
 * @param	string		Mode ("case", "ascii", ...)
 * @return	integer		Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
 * @access private
 */
function initUnicodeData($mode=null)	{
		// cache files
	$cacheFileCase = t3lib_div::getFileAbsFileName('typo3temp/cs/cscase_utf-8.tbl');
	$cacheFileASCII = t3lib_div::getFileAbsFileName('typo3temp/cs/csascii_utf-8.tbl');

		// Only process if the tables are not yet loaded
	switch($mode)	{
		case 'case':
			if (isset($this->caseFolding['utf-8']) && is_array($this->caseFolding['utf-8']))	return 1;

				// Use cached version if possible
			if ($cacheFileCase && @is_file($cacheFileCase))	{
				$this->caseFolding['utf-8'] = unserialize(t3lib_div::getUrl($cacheFileCase));
				return 2;
			}
		break;

		case 'ascii':
			if (isset($this->toASCII['utf-8']) && is_array($this->toASCII['utf-8']))	return 1;

				// Use cached version if possible
			if ($cacheFileASCII && @is_file($cacheFileASCII))	{
				$this->toASCII['utf-8'] = unserialize(t3lib_div::getUrl($cacheFileASCII));
				return 2;
			}
		break;
	}

		// process main Unicode data file
	$unicodeDataFile = PATH_t3lib.'unidata/UnicodeData.txt';
	if (!(t3lib_div::validPathStr($unicodeDataFile) && @is_file($unicodeDataFile)))	return false;

	$fh = fopen($unicodeDataFile,'rb');
	if (!$fh)	return false;

		// key = utf8 char (single codepoint), value = utf8 string (codepoint sequence)
		// note: we use the UTF-8 characters here and not the Unicode numbers to avoid conversion roundtrip in utf8_strtolower/-upper)
	$this->caseFolding['utf-8'] = array();
	$utf8CaseFolding =& $this->caseFolding['utf-8']; // a shorthand
	$utf8CaseFolding['toUpper'] = array();
	$utf8CaseFolding['toLower'] = array();
	$utf8CaseFolding['toTitle'] = array();

	$decomposition = array();	// array of temp. decompositions
	$mark = array();		// array of chars that are marks (eg. composing accents)
	$number = array();		// array of chars that are numbers (eg. digits)
	$omit = array();		// array of chars to be omitted (eg. Russian hard sign)

		// fgets() returns FALSE at EOF; testing it directly avoids processing a bogus empty record
	while (($line = fgets($fh,4096)) !== false)	{
		$line = rtrim($line);
		if (!strlen($line))	continue;

			// Fields used (0-based): 0 code, 1 name, 2 category, 5 decomposition,
			// 8 numeric value, 12 upper, 13 lower, 14 title. Short lines are padded.
		$fields = array_pad(explode(';',$line),15,'');
		$char = $fields[0];
		$name = $fields[1];
		$cat = $fields[2];
		$decomp = $fields[5];
		$num = $fields[8];
		$upper = $fields[12];
		$lower = $fields[13];
		$title = $fields[14];

		$ord = hexdec($char);
		if ($ord > 0xFFFF)	break;	// only process the BMP

		$utf8_char = $this->UnumberToChar($ord);

		if ($upper)	$utf8CaseFolding['toUpper'][$utf8_char] = $this->UnumberToChar(hexdec($upper));
		if ($lower)	$utf8CaseFolding['toLower'][$utf8_char] = $this->UnumberToChar(hexdec($lower));
			// store "title" only when different from "upper" (only a few)
		if ($title && $title != $upper)	$utf8CaseFolding['toTitle'][$utf8_char] = $this->UnumberToChar(hexdec($title));

			// the first letter of the category tells the general class
		switch (substr($cat,0,1))	{
			case 'M':	// mark (accent, umlaut, ...)
				$mark["U+$char"] = 1;
			break;

			case 'N':	// numeric value
				if ($ord > 0x80 && $num != '')	$number["U+$char"] = $num;
			break;
		}

			// accented Latin letters without "official" decomposition
		$match = array();
		if (preg_match('/^LATIN (SMALL|CAPITAL) LETTER ([A-Z]) WITH/',$name,$match) && !$decomp)	{
			$c = ord($match[2]);
			if ($match[1] == 'SMALL')	$c += 32;	// ASCII lower case is 32 above upper case

			$decomposition["U+$char"] = array(dechex($c));
			continue;
		}

			// optional "<tag>" followed by the list of code points
		$match = array();
		if (preg_match('/(<.*>)? *(.+)/',$decomp,$match))	{
			switch($match[1])	{
				case '<circle>':	// add parenthesis as circle replacement, eg (1)
					$match[2] = '0028 '.$match[2].' 0029';
				break;

				case '<square>':	// add square brackets as square replacement, eg [1]
					$match[2] = '005B '.$match[2].' 005D';
				break;

				case '<compat>':	// ignore multi char decompositions that start with a space
					if (strncmp($match[2],'0020 ',5) == 0)	continue 2;
				break;

					// ignore Arabic and vertical layout presentation decomposition
				case '<initial>':
				case '<medial>':
				case '<final>':
				case '<isolated>':
				case '<vertical>':
					continue 2;
			}
			$decomposition["U+$char"] = explode(' ',$match[2]);
		}
	}
	fclose($fh);

		// process additional Unicode data for casing (allow folded characters to expand into a sequence)
	$specialCasingFile = PATH_t3lib.'unidata/SpecialCasing.txt';
	if (t3lib_div::validPathStr($specialCasingFile) && @is_file($specialCasingFile))	{
		$fh = fopen($specialCasingFile,'rb');
		if ($fh)	{
			while (($line = fgets($fh,4096)) !== false)	{
				if (substr($line,0,1) == '#' || trim($line) == '')	continue;

				$fields = array_pad(t3lib_div::trimExplode(';',$line),5,'');
				$char = $fields[0];
				$lower = $fields[1];
				$title = $fields[2];
				$upper = $fields[3];
				$cond = $fields[4];

					// only unconditional mappings are used (5th field empty or a comment)
				if ($cond == '' || substr($cond,0,1) == '#')	{
					$utf8_char = $this->UnumberToChar(hexdec($char));
					if ($char != $lower)	{
						$seq = '';
						foreach (explode(' ',$lower) as $hexCode)	$seq .= $this->UnumberToChar(hexdec($hexCode));
						$utf8CaseFolding['toLower'][$utf8_char] = $seq;
					}
					if ($char != $title && $title != $upper)	{
						$seq = '';
						foreach (explode(' ',$title) as $hexCode)	$seq .= $this->UnumberToChar(hexdec($hexCode));
						$utf8CaseFolding['toTitle'][$utf8_char] = $seq;
					}
					if ($char != $upper)	{
						$seq = '';
						foreach (explode(' ',$upper) as $hexCode)	$seq .= $this->UnumberToChar(hexdec($hexCode));
						$utf8CaseFolding['toUpper'][$utf8_char] = $seq;
					}
				}
			}
			fclose($fh);
		}
	}

		// process custom decompositions
	$customTranslitFile = PATH_t3lib.'unidata/Translit.txt';
	if (t3lib_div::validPathStr($customTranslitFile) && @is_file($customTranslitFile))	{
		$fh = fopen($customTranslitFile,'rb');
		if ($fh)	{
			while (($line = fgets($fh,4096)) !== false)	{
				if (substr($line,0,1) == '#' || trim($line) == '')	continue;

				$fields = array_pad(t3lib_div::trimExplode(';',$line),2,'');
				$char = $fields[0];
				$translit = $fields[1];
				if (!$translit)	$omit["U+$char"] = 1;
				$decomposition["U+$char"] = explode(' ',$translit);
			}
			fclose($fh);
		}
	}

		// decompose and remove marks; inspired by unac (Loic Dachary <loic@senga.org>)
	foreach($decomposition as $from => $to)	{
		$code_decomp = array();

		while ($code_value = array_shift($to))	{
			if (isset($decomposition["U+$code_value"]))	{	// do recursive decomposition
				foreach(array_reverse($decomposition["U+$code_value"]) as $cv)	{
					array_unshift($to,$cv);
				}
			} elseif (!isset($mark["U+$code_value"]))	{	// remove mark
				array_push($code_decomp,$code_value);
			}
		}
		if (count($code_decomp) || isset($omit[$from]))	{
			$decomposition[$from] = $code_decomp;
		} else {
			unset($decomposition[$from]);
		}
	}

		// create ascii only mapping
	$this->toASCII['utf-8'] = array();
	$ascii =& $this->toASCII['utf-8'];

	foreach($decomposition as $from => $to)	{
		$code_decomp = array();
		while ($code_value = array_shift($to))	{
			$ord = hexdec($code_value);
			if ($ord > 127)	{
				continue 2;	// skip decompositions containing non-ASCII chars
			}
			array_push($code_decomp,chr($ord));
		}
			// keys have the form "U+XXXX": strip the prefix before reading the hex number
		$ascii[$this->UnumberToChar(hexdec(substr($from,2)))] = join('',$code_decomp);
	}

		// add numeric decompositions
	foreach($number as $from => $to)	{
		$utf8_char = $this->UnumberToChar(hexdec(substr($from,2)));
		if (!isset($ascii[$utf8_char]))	{
			$ascii[$utf8_char] = $to;
		}
	}

	if ($cacheFileCase)	{
		t3lib_div::writeFileToTypo3tempDir($cacheFileCase,serialize($utf8CaseFolding));
	}

	if ($cacheFileASCII)	{
		t3lib_div::writeFileToTypo3tempDir($cacheFileASCII,serialize($ascii));
	}

	return 3;
}
/**
 * This function initializes the folding table for a charset other than UTF-8.
 * This function is automatically called by the case folding functions.
 *
 * The table is derived from the UTF-8 case folding data: every character of the
 * charset's conversion table is looked up there and the result converted back.
 *
 * @param	string		Charset for which to initialize case folding.
 * @return	integer		Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
 * @access private
 */
function initCaseFolding($charset)	{
		// Only process if the case table is not yet loaded:
	if (isset($this->caseFolding[$charset]) && is_array($this->caseFolding[$charset]))	return 1;

		// Use cached version if possible
	$cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/cscase_'.$charset.'.tbl');
	if ($cacheFile && @is_file($cacheFile))	{
		$this->caseFolding[$charset] = unserialize(t3lib_div::getUrl($cacheFile));
		return 2;
	}

		// init UTF-8 conversion for this charset
	if (!$this->initCharset($charset))	{
		return false;
	}

		// UTF-8 case folding is used as the base conversion table
	if (!$this->initUnicodeData('case'))	{
		return false;
	}

	$nochar = chr($this->noCharByteVal);
	$directions = array('toUpper','toLower','toTitle');

		// Start with all three tables present so later lookups never hit a missing key
	$this->caseFolding[$charset] = array('toUpper'=>array(),'toLower'=>array(),'toTitle'=>array());

	foreach ($this->parsedCharsets[$charset]['local'] as $utf8)	{
			// reconvert to charset (don't use chr() of numeric value, might be multi-byte)
		$c = $this->utf8_decode($utf8,$charset);

		foreach ($directions as $direction)	{
				// characters without a mapping are handled as an empty string
			$folded = isset($this->caseFolding['utf-8'][$direction][$utf8]) ? $this->caseFolding['utf-8'][$direction][$utf8] : '';
			$cc = $this->utf8_decode($folded,$charset);
			if ($cc != '' && $cc != $nochar)	$this->caseFolding[$charset][$direction][$c] = $cc;
		}
	}

		// add the ASCII case table
	for ($i=ord('a'); $i<=ord('z'); $i++)	{
		$this->caseFolding[$charset]['toUpper'][chr($i)] = chr($i-32);
	}
	for ($i=ord('A'); $i<=ord('Z'); $i++)	{
		$this->caseFolding[$charset]['toLower'][chr($i)] = chr($i+32);
	}

	if ($cacheFile)	{
		t3lib_div::writeFileToTypo3tempDir($cacheFile,serialize($this->caseFolding[$charset]));
	}

	return 3;
}
/**
 * This function initializes the to-ASCII conversion table for a charset other than UTF-8.
 * This function is automatically called by the ASCII transliteration functions.
 *
 * @param	string		Charset for which to initialize conversion.
 * @return	integer		Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
 * @access private
 */
function initToASCII($charset)	{
		// Only process if the table is not yet loaded:
	if (isset($this->toASCII[$charset]) && is_array($this->toASCII[$charset]))	return 1;

		// Use cached version if possible
	$cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/csascii_'.$charset.'.tbl');
	if ($cacheFile && @is_file($cacheFile))	{
		$this->toASCII[$charset] = unserialize(t3lib_div::getUrl($cacheFile));
		return 2;
	}

		// init UTF-8 conversion for this charset
	if (!$this->initCharset($charset))	{
		return false;
	}

		// UTF-8/ASCII transliteration is used as the base conversion table
	if (!$this->initUnicodeData('ascii'))	{
		return false;
	}

		// Always create the table, so that an empty result is cached as an array
		// and recognised as "already loaded" on the next call.
	$this->toASCII[$charset] = array();

	foreach ($this->parsedCharsets[$charset]['local'] as $utf8)	{
			// reconvert to charset (don't use chr() of numeric value, might be multi-byte)
		$c = $this->utf8_decode($utf8,$charset);

		if (isset($this->toASCII['utf-8'][$utf8]))	{
			$this->toASCII[$charset][$c] = $this->toASCII['utf-8'][$utf8];
		}
	}

	if ($cacheFile)	{
		t3lib_div::writeFileToTypo3tempDir($cacheFile,serialize($this->toASCII[$charset]));
	}

	return 3;
}
1353 /********************************************
1355 * String operation functions
1357 ********************************************/
/**
 * Returns a part of a string.
 * Unit-tested by Kasper (single byte charsets only)
 *
 * Depending on $TYPO3_CONF_VARS['SYS']['t3lib_cs_utils'] the work is done by mbstring,
 * iconv or the internal implementations of this class.
 *
 * @param	string		The character set
 * @param	string		Character string
 * @param	integer		Start position (character position)
 * @param	integer		Length (in characters); if omitted the rest of the string is returned
 * @return	string		The substring
 * @see substr(), mb_substr()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function substr($charset,$string,$start,$len=null)	{
	if ($len===0)	return '';

	$utils = isset($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils']) ? $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] : '';

	if ($utils == 'mbstring')	{
			// The charset can only be given together with a length: for "rest of string"
			// pass the full character count instead of switching the internal encoding.
		if ($len==null)	{
			return mb_substr($string,$start,mb_strlen($string,$charset),$charset);
		}
		return mb_substr($string,$start,$len,$charset);
	} elseif ($utils == 'iconv')	{
			// same approach as for mbstring (iconv_set_encoding() is deprecated)
		if ($len==null)	{
			return iconv_substr($string,$start,iconv_strlen($string,$charset),$charset);
		}
		return iconv_substr($string,$start,$len,$charset);
	} elseif ($charset == 'utf-8')	{
		return $this->utf8_substr($string,$start,$len);
	} elseif (!empty($this->eucBasedSets[$charset]))	{
		return $this->euc_substr($string,$start,$charset,$len);
	} elseif (!empty($this->twoByteSets[$charset]))	{
			// an omitted length must not be multiplied (null*2 would cut the string to zero length)
		return $len === null ? substr($string,$start*2) : substr($string,$start*2,$len*2);
	} elseif (!empty($this->fourByteSets[$charset]))	{
		return $len === null ? substr($string,$start*4) : substr($string,$start*4,$len*4);
	}

		// treat everything else as single-byte encoding
	return $len === null ? substr($string,$start) : substr($string,$start,$len);
}
/**
 * Counts the number of characters.
 * Unit-tested by Kasper (single byte charsets only)
 *
 * @param	string		The character set
 * @param	string		Character string
 * @return	integer		The number of characters
 * @see strlen()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function strlen($charset,$string)	{
	$utils = isset($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils']) ? $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] : '';

	if ($utils == 'mbstring')	{
		return mb_strlen($string,$charset);
	} elseif ($utils == 'iconv')	{
		return iconv_strlen($string,$charset);
	} elseif ($charset == 'utf-8')	{
		return $this->utf8_strlen($string);
	} elseif (!empty($this->eucBasedSets[$charset]))	{
		return $this->euc_strlen($string,$charset);
	} elseif (!empty($this->twoByteSets[$charset]))	{
			// cast: an incomplete trailing character must not yield a fractional count
		return (int)(strlen($string)/2);
	} elseif (!empty($this->fourByteSets[$charset]))	{
		return (int)(strlen($string)/4);
	}

		// treat everything else as single-byte encoding
	return strlen($string);
}
/**
 * Truncates a string and pre-/appends a string.
 * Unit tested by Kasper
 *
 * A positive length keeps the first $len characters and appends $crop, a negative
 * length keeps the last abs($len) characters and prepends $crop. The string is
 * returned unchanged if $len is 0 or if it is not longer than abs($len).
 *
 * @param	string		The character set
 * @param	string		Character string
 * @param	integer		Length (in characters)
 * @param	string		Crop signifier
 * @return	string		The shortened string
 * @see substr(), mb_strimwidth()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function crop($charset,$string,$len,$crop='')	{
	if (intval($len) == 0)	return $string;

		// Translate the character length into a byte position ($i):
	if ($charset == 'utf-8')	{
		$i = $this->utf8_char2byte_pos($string,$len);
	} elseif (!empty($this->eucBasedSets[$charset]))	{
		$i = $this->euc_char2byte_pos($string,$len,$charset);
	} else {
			// everything else is handled byte-wise
		if ($len > 0)	{
			$i = intval($len);
		} else {
			$i = strlen($string)+$len;
			if ($i<=0)	$i = false;
		}
	}

	if ($i === false)	{	// $len outside actual string length
		return $string;
	}

	if ($len > 0)	{
			// only crop if there is something behind the cut position
		if (isset($string[$i]))	{
			return substr($string,0,$i).$crop;
		}
	} else {
			// only crop if there is something in front of the cut position
		if ($i >= 1 && isset($string[$i-1]))	{
			return $crop.substr($string,$i);
		}
	}

	return $string;
}
/**
 * Cuts a string short at a given byte length.
 * The cut is moved back so that no multibyte character is split.
 *
 * @param	string		The character set
 * @param	string		Character string
 * @param	integer		The byte length
 * @return	string		The shortened string
 * @see mb_strcut()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function strtrunc($charset,$string,$len)	{
	if ($len <= 0)	return '';

	$utils = isset($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils']) ? $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] : '';

	if ($utils == 'mbstring')	{
		return mb_strcut($string,0,$len,$charset);
	} elseif ($charset == 'utf-8')	{
		return $this->utf8_strtrunc($string,$len);
	} elseif (!empty($this->eucBasedSets[$charset]))	{
			// euc_strtrunc() expects (string, byte length, charset)
		return $this->euc_strtrunc($string,$len,$charset);
	} elseif (!empty($this->twoByteSets[$charset]))	{
		if ($len % 2)	$len--;		// don't cut at odd positions
	} elseif (!empty($this->fourByteSets[$charset]))	{
		$len -= $len % 4;	// realign to position dividable by four
	}

		// fixed-width sets (with realigned length) and everything else: plain byte cut
	return substr($string,0,$len);
}
/**
 * Translates all characters of a string into their respective case values.
 * Unlike strtolower() and strtoupper() this method is locale independent.
 * Note that the string length may change!
 * eg. lower case German sharp s (U+00DF) becomes upper case "SS"
 * Unit-tested by Kasper
 * Real case folding is language dependent, this method ignores this fact.
 *
 * @param	string		Character set of string
 * @param	string		Input string to convert case for
 * @param	string		Case keyword: "toLower" means lowercase conversion, anything else is uppercase (use "toUpper" )
 * @return	string		The converted string
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 * @see strtolower(), strtoupper()
 */
function conv_case($charset,$string,$case)	{
		// mbstring does the job if it is configured and the PHP version is at least 4.3
	$useMbstring = ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring' && (float)phpversion() >= 4.3);
	if ($useMbstring)	{
		return $case == 'toLower' ? mb_strtolower($string,$charset) : mb_strtoupper($string,$charset);
	}

	if ($charset == 'utf-8')	{
		return $this->utf8_char_mapping($string,'case',$case);
	}

	if (isset($this->eucBasedSets[$charset]))	{
		return $this->euc_char_mapping($string,$charset,'case',$case);
	}

		// treat everything else as single-byte encoding
	return $this->sb_char_mapping($string,$charset,'case',$case);
}
/**
 * Converts special chars (accented letters, umlauts etc.) to ASCII equivalents
 * (usually two letters, eg. a-umlaut => "ae").
 *
 * @param	string		Character set of string
 * @param	string		Input string to convert
 * @return	string		The converted string
 */
function specCharsToASCII($charset,$string)	{
	if ($charset == 'utf-8')	{
		return $this->utf8_char_mapping($string,'ascii');
	}

	if (isset($this->eucBasedSets[$charset]))	{
		return $this->euc_char_mapping($string,$charset,'ascii');
	}

		// treat everything else as single-byte encoding
	return $this->sb_char_mapping($string,$charset,'ascii');
}
1591 /********************************************
1593 * Internal string operation functions
1595 ********************************************/
/**
 * Maps all characters of a string in a single byte charset.
 * Bytes without an entry in the mapping table are copied unchanged.
 *
 * @param	string		the string
 * @param	string		the charset
 * @param	string		mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
 * @param	string		'case': conversion 'toLower' or 'toUpper'
 * @return	string		the converted string
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function sb_char_mapping($str,$charset,$mode,$opt='')	{
	switch($mode)	{
		case 'case':
			if (!$this->initCaseFolding($charset))	return $str;	// do nothing
				// an unknown $opt simply means "no mapping"
			$map = isset($this->caseFolding[$charset][$opt]) ? $this->caseFolding[$charset][$opt] : array();
		break;

		case 'ascii':
			if (!$this->initToASCII($charset))	return $str;	// do nothing
			$map = isset($this->toASCII[$charset]) ? $this->toASCII[$charset] : array();
		break;

		default:
			return $str;
	}

	$str = (string)$str;
	$bytes = strlen($str);
	$out = '';
	for ($i=0; $i<$bytes; $i++)	{
		$c = $str[$i];
		$out .= isset($map[$c]) ? $map[$c] : $c;
	}

	return $out;
}
1645 /********************************************
1647 * Internal UTF-8 string operation functions
1649 ********************************************/
/**
 * Returns a part of a UTF-8 string.
 * Unit-tested by Kasper and works 100% like substr() / mb_substr() for full range of $start/$len
 *
 * @param	string		UTF-8 string
 * @param	integer		Start position (character position)
 * @param	integer		Length (in characters)
 * @return	string		The substring; FALSE if a positive $start lies outside the string
 * @see substr()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_substr($str,$start,$len=null)	{
		// a length of 0 (integer or string) yields an empty string
	if ($len !== null && !strcmp($len,'0'))	return '';

	$byte_start = $this->utf8_char2byte_pos($str,$start);
	if ($byte_start === false)	{
		if ($start > 0)	{
			return false;	// $start outside string length
		}
			// zero or negative start reaching beyond the beginning: start at the first byte
		$byte_start = 0;
	}

	$str = substr($str,$byte_start);

	if ($len != null)	{
		$byte_end = $this->utf8_char2byte_pos($str,$len);
		if ($byte_end === false)	{	// $len outside actual string length
			return $len<0 ? '' : $str;	// When length is less than zero and exceeds, then we return blank string.
		}
		return substr($str,0,$byte_end);
	}

	return $str;
}
/**
 * Counts the number of characters of a string in UTF-8.
 * Unit-tested by Kasper and works 100% like strlen() / mb_strlen()
 *
 * @param	string		UTF-8 multibyte character string
 * @return	integer		The number of characters
 * @see strlen()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_strlen($str)	{
	$str = (string)$str;
	$bytes = strlen($str);
	$n = 0;
	for ($i=0; $i<$bytes; $i++)	{
			// Count single bytes (0xxxxxxx) and lead bytes (11xxxxxx),
			// ie. everything except continuation bytes (10xxxxxx).
		if ((ord($str[$i]) & 0xC0) != 0x80)	{
			$n++;
		}
	}
	return $n;
}
/**
 * Truncates a string in UTF-8 short at a given byte length.
 * If the cut would split a multibyte character, that character is dropped completely.
 *
 * @param	string		UTF-8 multibyte character string
 * @param	integer		the byte length
 * @return	string		the shortened string
 * @see mb_strcut()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_strtrunc($str,$len)	{
	$str = (string)$str;
	$len = intval($len);
	if ($len <= 0)	return '';
	if ($len >= strlen($str))	return $str;	// nothing to cut

	$i = $len-1;	// last byte that would be kept
	if (ord($str[$i]) & 0x80)	{	// part of a multibyte sequence
			// walk back over continuation bytes (10xxxxxx) to the first byte of the character
		while ($i > 0 && (ord($str[$i]) & 0xC0) == 0x80)	{
			$i--;
		}

		$lead = ord($str[$i]);
		if (($lead & 0xC0) == 0xC0)	{
				// number of leading 1-bits = number of bytes of the character
			for ($bc=0, $mbs=$lead; $mbs & 0x80; $mbs = $mbs << 1)	{
				$bc++;
			}
				// character does not fit into the length: cut in front of it
			if ($bc+$i > $len)	return substr($str,0,$i);
				// fallthru: multibyte char fits into length
		} elseif ($i == 0)	{
			return '';	// sanity check: only continuation bytes, no lead byte found
		}
	}

	return substr($str,0,$len);
}
/**
 * Find position of first occurrence of a string, both arguments are in UTF-8.
 *
 * @param	string		UTF-8 string to search in
 * @param	string		UTF-8 string to search for
 * @param	integer		Position (in characters) to start the search
 * @return	integer		The character position, FALSE if the needle was not found
 * @see strpos()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_strpos($haystack,$needle,$offset=0)	{
	switch ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'])	{
		case 'mbstring':
			return mb_strpos($haystack,$needle,$offset,'utf-8');

		case 'iconv':
			return iconv_strpos($haystack,$needle,$offset,'utf-8');
	}

		// Internal implementation: search byte-wise, translating the positions
	$startByte = $this->utf8_char2byte_pos($haystack,$offset);
	if ($startByte === false)	{
		return false;	// offset beyond string length
	}

	$foundByte = strpos($haystack,$needle,$startByte);
	if ($foundByte === false)	{
		return false;	// needle not found
	}

	return $this->utf8_byte2char_pos($haystack,$foundByte);
}
/**
 * Find position of last occurrence of a char in a string, both arguments are in UTF-8.
 *
 * @param	string		UTF-8 string to search in
 * @param	string		UTF-8 character to search for (single character)
 * @return	integer		The character position, FALSE if the needle was not found
 * @see strrpos()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_strrpos($haystack,$needle)	{
	$utils = isset($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils']) ? $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] : '';

	if ($utils == 'mbstring')	{
			// Since PHP 5.2 the third parameter of mb_strrpos() is the offset and the
			// encoding comes fourth; PHP 8 rejects an encoding in the third position.
		if (version_compare(PHP_VERSION,'5.2.0','>='))	{
			return mb_strrpos($haystack,$needle,0,'utf-8');
		}
		return mb_strrpos($haystack,$needle,'utf-8');
	} elseif ($utils == 'iconv')	{
		return iconv_strrpos($haystack,$needle,'utf-8');
	}

	$byte_pos = strrpos($haystack,$needle);
	if ($byte_pos === false)	return false;	// needle not found

	return $this->utf8_byte2char_pos($haystack,$byte_pos);
}
/**
 * Translates a character position into an 'absolute' byte position.
 * Unit tested by Kasper.
 *
 * @param	string		UTF-8 string
 * @param	integer		Character position (negative values start from the end)
 * @return	integer		Byte position, FALSE if the position lies outside the string
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_char2byte_pos($str,$pos)	{
	$str = (string)$str;
	$bytes = strlen($str);
	$n = 0;			// number of characters found
	$p = abs($pos);		// number of characters wanted

	if ($pos >= 0)	{
		$i = 0;
		$d = 1;
	} else {
		$i = $bytes-1;
		$d = -1;
	}

		// Explicit bounds: a negative string offset must end the scan instead of
		// being read from the end of the string (as PHP 7.1+ does).
	for ( ; $i>=0 && $i<$bytes && $n<$p; $i+=$d)	{
			// single bytes (0xxxxxxx) and lead bytes (11xxxxxx) start a character
		if ((ord($str[$i]) & 0xC0) != 0x80)	{
			$n++;
		}
	}
	if ($i<0 || $i>=$bytes)	return false;	// offset beyond string length

	if ($pos >= 0)	{
			// skip trailing multi-byte data bytes
		while ($i<$bytes && (ord($str[$i]) & 0xC0) == 0x80)	{
			$i++;
		}
	} else {
			// correct offset
		$i++;
	}

	return $i;
}
/**
 * Translates an 'absolute' byte position into a character position.
 * Unit tested by Kasper.
 *
 * @param	string		UTF-8 string
 * @param	integer		byte position
 * @return	integer		character position, FALSE for an empty string or a position outside the string
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_byte2char_pos($str,$pos)	{
	$str = (string)$str;
	$bytes = strlen($str);

		// offset beyond string length (a position directly behind the last byte is accepted)
	if ($bytes == 0 || $pos < 0 || $pos > $bytes)	return false;

	$n = 0;	// number of characters
	if ($pos == $bytes)	{
			// position behind the last byte: counts as the start of one more character
		$n = 1;
		$pos--;
	}

		// count the character starts between byte 1 and $pos; byte 0 is character 0
	for ($i=$pos; $i>0; $i--)	{
		if ((ord($str[$i]) & 0xC0) != 0x80)	{
			$n++;
		}
	}

	return $n;
}
/**
 * Maps all characters of an UTF-8 string.
 * Characters without an entry in the mapping table are copied unchanged.
 *
 * @param	string		UTF-8 string
 * @param	string		mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
 * @param	string		'case': conversion 'toLower' or 'toUpper'
 * @return	string		the converted string
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_char_mapping($str,$mode,$opt='')	{
	if (!$this->initUnicodeData($mode))	return $str;	// do nothing

	switch($mode)	{
		case 'case':
				// an unknown $opt simply means "no mapping"
			$map = isset($this->caseFolding['utf-8'][$opt]) ? $this->caseFolding['utf-8'][$opt] : array();
		break;

		case 'ascii':
			$map = isset($this->toASCII['utf-8']) ? $this->toASCII['utf-8'] : array();
		break;

		default:
			return $str;
	}

	$str = (string)$str;
	$bytes = strlen($str);
	$out = '';
	for ($i=0; $i<$bytes; $i++)	{
		$c = ord($str[$i]);
		if (($c & 0xC0) == 0xC0)	{	// multi-byte starting byte (11xxxxxx)
			for ($bc=0; $c & 0x80; $c = $c << 1)	{ $bc++; }	// calculate number of bytes
			$mbc = substr($str,$i,$bc);
			$i += $bc-1;
		} else {
				// single byte (0xxxxxxx), or a stray continuation byte which is
				// passed through as it is instead of repeating the previous character
			$mbc = $str[$i];
		}

		$out .= isset($map[$mbc]) ? $map[$mbc] : $mbc;
	}

	return $out;
}
1903 /********************************************
1905 * Internal EUC string operation functions
1907 * Extended Unix Code:
1908 * ASCII compatible 7bit single bytes chars
1909 * 8bit two byte chars
1911 * Shift-JIS is treated as a special case.
1913 ********************************************/
/**
 * Cuts a string in the EUC charset family short at a given byte length.
 * If the cut would split a double-byte character, that character is dropped completely.
 *
 * @param	string		EUC multibyte character string
 * @param	integer		the byte length
 * @param	string		the charset
 * @return	string		the shortened string
 * @see mb_strcut()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function euc_strtrunc($str,$len,$charset)	{
	$str = (string)$str;
	if (strlen($str) <= $len)	return $str;	// string not longer than supplied length

	$sjis = ($charset == 'shift_jis');
		// Walk character by character; $i ends at $len, or at $len+1 if the last
		// character seen was a double-byte char starting at byte $len-1.
	for ($i=0; $i<$len; $i++)	{
		$c = ord($str[$i]);
		if ($sjis)	{
			if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0))	$i++;	// advance a double-byte char
		} else {
			if ($c >= 0x80)	$i++;	// advance a double-byte char
		}
	}

	if ($i>$len)	{
		return substr($str,0,$len-1);	// we ended on a first byte
	}
	return substr($str,0,$len);
}
1945 * Returns a part of a string in the EUC charset family.
1947 * @param string EUC multibyte character string
1948 * @param integer start position (character position)
1949 * @param string the charset
1950 * @param integer length (in characters)
1951 * @return string the substring
1952 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
/**
 * Returns a part of a string in the EUC charset family.
 * Character positions are translated into byte positions with
 * euc_char2byte_pos() and the cut itself is done with substr().
 *
 * @param	string		EUC multibyte character string
 * @param	integer		start position (character position)
 * @param	string		the charset
 * @param	integer		length (in characters); null means "up to the end of the string"
 * @return	string		the substring, or false if $start lies outside the string
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function euc_substr($str,$start,$charset,$len=null)	{
	$byte_start = $this->euc_char2byte_pos($str,$start,$charset);
	if ($byte_start === false)	return false;	// $start outside string length

	$str = substr($str,$byte_start);

	// No length given: return everything from $start on.
	// This is an identity check on purpose: a loose "!= null" also matches the
	// integer 0, which made a zero length return the whole remainder instead
	// of an empty string. Empty non-numeric values keep meaning "no length".
	if ($len === null || $len === '' || $len === false)	return $str;

	$byte_end = $this->euc_char2byte_pos($str,$len,$charset);
	if ($byte_end === false)	return $str;	// $len outside actual string length

	return substr($str,0,$byte_end);
}
1971 * Counts the number of characters of a string in the EUC charset family.
1973 * @param string EUC multibyte character string
1974 * @param string the charset
1975 * @return integer the number of characters
1976 * @see strlen()
1977 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
/**
 * Counts the number of characters of a string in the EUC charset family.
 * A byte recognised as the first byte of a double-byte character is counted
 * together with the byte that follows it as one character.
 *
 * @param	string		EUC multibyte character string
 * @param	string		the charset ('shift_jis' selects the Shift-JIS lead byte ranges)
 * @return	integer		the number of characters
 * @see strlen()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function euc_strlen($str,$charset)	{
	$str = (string)$str;
	$byteCount = strlen($str);	// explicit bound instead of probing offsets past the end
	$sjis = ($charset == 'shift_jis');
	$n = 0;

	for ($i=0; $i<$byteCount; $i++)	{
		$c = ord($str[$i]);
		if ($sjis)	{
			if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0))	$i++;	// advance a double-byte char
		} else {
			if ($c >= 0x80)	$i++;	// advance a double-byte char
		}

		$n++;
	}

	return $n;
}
1998 * Translates a character position into an 'absolute' byte position.
2000 * @param string EUC multibyte character string
2001 * @param integer character position (negative values start from the end)
2002 * @param string the charset
2003 * @return integer byte position
2004 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
/**
 * Translates a character position into an 'absolute' byte position.
 *
 * The string is scanned forwards for $pos >= 0 and backwards from its last
 * byte for $pos < 0. String bounds are checked explicitly: reading
 * out-of-range offsets cannot be used to detect the ends of the string,
 * because negative string offsets address bytes from the end on newer PHP
 * versions.
 *
 * NOTE: the backward scan classifies the byte it lands on with the same
 * lead byte test as the forward scan. That presumably only works when the
 * second byte of a double-byte char is >= 0x80 as well - verify before
 * relying on negative positions with Shift-JIS.
 *
 * @param	string		EUC multibyte character string
 * @param	integer		character position (negative values start from the end)
 * @param	string		the charset ('shift_jis' selects the Shift-JIS lead byte ranges)
 * @return	integer		byte position, or false if the position lies outside the string
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function euc_char2byte_pos($str,$pos,$charset)	{
	$str = (string)$str;
	$byteCount = strlen($str);
	$sjis = ($charset == 'shift_jis');
	$n = 0;	// number of characters seen
	$p = abs($pos);	// number of characters wanted

	if ($pos >= 0)	{
		$i = 0;
		$d = 1;
	} else {
		$i = $byteCount-1;
		$d = -1;
	}

	for ( ; $i>=0 && $i<$byteCount && $n<$p; $i+=$d)	{
		$c = ord($str[$i]);
		if ($sjis)	{
			if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0))	$i+=$d;	// advance a double-byte char
		} else {
			if ($c >= 0x80)	$i+=$d;	// advance a double-byte char
		}

		$n++;
	}

	if ($pos < 0)	{
		if ($n < $p)	return false;	// the string has fewer than $p characters

		// $i rests one byte before the wanted character, which is byte 0 when
		// exactly all characters were requested. The clamp only matters for a
		// dangling lead byte at the very start of the string.
		return max($i+1, 0);
	}

	if ($i >= $byteCount)	return false;	// offset beyond string length

	return $i;
}
2038 * Maps all characters of a string in the EUC charset family.
2040 * @param string EUC multibyte character string
2041 * @param string the charset
2042 * @param string mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
2043 * @param string 'case': conversion 'toLower' or 'toUpper'
2044 * @return string the converted string
2045 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
/**
 * Maps all characters of a string in the EUC charset family.
 * Each single- or double-byte character that has an entry in the selected
 * map is replaced by that entry; all other characters are copied unchanged.
 *
 * @param	string		EUC multibyte character string
 * @param	string		the charset ('shift_jis' selects the Shift-JIS lead byte ranges)
 * @param	string		mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
 * @param	string		'case': conversion 'toLower' or 'toUpper'
 * @return	string		the converted string; the input is returned untouched for an unknown mode or if the map could not be initialised
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function euc_char_mapping($str,$charset,$mode,$opt='')	{
	switch($mode)	{
		case 'case':
			if (!$this->initCaseFolding($charset))	return $str;	// do nothing
			$map =& $this->caseFolding[$charset][$opt];
		break;

		case 'ascii':
			if (!$this->initToASCII($charset))	return $str;	// do nothing
			$map =& $this->toASCII[$charset];
		break;

		default:
			return $str;
	}

	$str = (string)$str;
	$byteCount = strlen($str);	// explicit bound instead of probing offsets past the end
	$sjis = ($charset == 'shift_jis');
	$out = '';

	for ($i=0; $i<$byteCount; $i++)	{
		$mbc = $str[$i];
		$c = ord($mbc);

		if ($sjis)	{
			$isDoubleByte = (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0));
		} else {
			$isDoubleByte = ($c >= 0x80);
		}

		if ($isDoubleByte)	{
			// Look up both bytes as one key and step over the second byte
			$mbc = substr($str,$i,2);
			$i++;
		}

		$out .= isset($map[$mbc]) ? $map[$mbc] : $mbc;
	}

	return $out;
}
// XCLASS hook: when running inside TYPO3 (TYPO3_MODE defined) and a file is
// registered for this script in $TYPO3_CONF_VARS, include it once.
// !empty() tests the same truthiness as the bare array access, but does not
// raise an undefined-key warning when nothing is registered.
if (defined('TYPO3_MODE') && !empty($TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['t3lib/class.t3lib_cs.php']))	{
	include_once($TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['t3lib/class.t3lib_cs.php']);
}