Source for file uctc.php
Documentation is available at uctc.php
* UCTC - The Unicode Transcoder
* Converts between various flavours of Unicode representations like UCS-4 or UTF-8
* - UCS-4 Little Endian / Big Endian / Array (partially)
* - UTF-16 Little Endian / Big Endian (not yet)
* - UTF-7 IMAP (modified UTF-7)
* @author Matthias Sommerfeld <mso@phlyLabs.de>
* @copyright 2003-2009 phlyLabs Berlin, http://phlylabs.de
* @version 0.0.6 2009-05-10
// Names of the encodings convert() accepts for $from and $to; the commented-out
// entries are variants that are not enabled.
private static $mechs =
array('ucs4', /*'ucs4le', 'ucs4be', */'ucs4array', /*'utf16', 'utf16le', 'utf16be', */'utf8', 'utf7', 'utf7imap');
// When false, utf8_ucs4array() applies its out-of-range check; convert()
// switches this to true whenever safe mode is requested.
private static $allow_overlong =
false;
// Set by convert() on every call: true when invalid input should be
// substituted instead of raising an exception.
private static $safe_mode;
// Set by convert() on every call: the substitute value used in safe mode
// (0xFFFC unless the caller supplies another non-falsy value).
private static $safe_char;
/**
 * The actual conversion routine.
 *
 * Checks both encoding names against the list of supported mechanisms,
 * decodes $data into a UCS-4 array (unless it already is one) and then
 * encodes that array into the target representation (unless the target is
 * the UCS-4 array itself).
 *
 * @param mixed  $data      The data to convert, usually a string; an array when converting from UCS-4 array
 * @param string $from      Original encoding of the data
 * @param string $to        Target encoding of the data
 * @param bool   $safe_mode SafeMode tries to correct invalid codepoints
 * @param int    $safe_char Code point substituted for invalid input in safe mode; falsy values fall back to 0xFFFC
 * @return mixed String or array, depending on target encoding
 * @throws Exception If $from or $to is not a supported encoding name, or a converter rejects the data
 */
public static function convert($data, $from, $to, $safe_mode = false, $safe_char = 0xFFFC)
{
    self::$safe_mode = (bool) $safe_mode;
    self::$safe_char = $safe_char ? $safe_char : 0xFFFC;

    // Safe mode implies that the UTF-8 range check is skipped.
    // NOTE(review): the flag is never reset here, so it stays enabled for
    // later non-safe calls as well - kept as in the original; confirm intent.
    if (self::$safe_mode) {
        self::$allow_overlong = true;
    }

    // Strict comparison: only the exact strings from the whitelist may pass,
    // because the names are used to build method names below.
    if (!in_array($from, self::$mechs, true)) {
        throw new Exception('Invalid input format specified');
    }
    if (!in_array($to, self::$mechs, true)) {
        throw new Exception('Invalid output format specified');
    }

    // Dispatch through a variable static method call instead of eval().
    if ($from !== 'ucs4array') {
        $decoder = $from . '_ucs4array';
        $data = self::$decoder($data);
    }
    if ($to !== 'ucs4array') {
        $encoder = 'ucs4array_' . $to;
        $data = self::$encoder($data);
    }

    return $data;
}
* This converts an UTF-8 encoded string to its UCS-4 representation
* @param string $input The UTF-8 string to convert
* @return array Array of 32bit values representing each codepoint
// Walks the UTF-8 input byte by byte and assembles code points in $output.
// NOTE(review): this listing appears to have lost lines - the method braces,
// the initialisation of $output, $out_len, $inp_len, $mode, $test and
// $start_byte, and several branch bodies are not visible. The comments below
// describe only the statements that are shown.
private static function utf8_ucs4array($input)
for ($k =
0; $k <
$inp_len; ++
$k) {
$v =
ord($input{$k}); // Extract byte from input string
if ($v <
128) { // We found an ASCII char - put into string as is
// Overwrites the slot two positions before $out_len with the substitute
// value (the guarding condition is not visible in this listing).
$output[$out_len-
2] =
self::$safe_char;
throw
new Exception('Conversion from UTF-8 to UCS-4 failed: malformed input at byte '.
$k);
if ('next' ==
$mode) { // Try to find the next start byte; determine the width of the Unicode char
if ($v >>
5 ==
6) { // &110xxxxx 10xxxxxx
$next_byte =
0; // Number of further 6-bit left shifts applied to the following continuation bytes
} elseif ($v >>
4 ==
14) { // &1110xxxx 10xxxxxx 10xxxxxx
} elseif ($v >>
3 ==
30) { // &11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
} elseif (self::$safe_mode) {
// Unrecognised start byte in safe mode: store the substitute value.
$output[$out_len] =
self::$safe_char;
throw
new Exception('This might be UTF-8, but I don\'t understand it at byte '.
$k);
// Too few input bytes remain (judged from $next_byte) to finish the sequence.
if ($inp_len-
$k-
$next_byte <
2) {
$output[$out_len] =
self::$safe_char;
$output[$out_len] = (int)
$v;
// Range check, skipped when overlong forms are allowed: rejects a byte below
// 0xA0 after start byte 0xE0, below 0x90 after 0xF0, or above 0x8F after 0xF4.
if (!self::$allow_overlong &&
$test ==
'range') {
if (($v <
0xA0 &&
$start_byte ==
0xE0) ||
($v <
0x90 &&
$start_byte ==
0xF0) ||
($v >
0x8F &&
$start_byte ==
0xF4)) {
throw
new Exception('Bogus UTF-8 character detected (out of legal range) at byte '.
$k);
if ($v >>
6 ==
2) { // Bit mask must be 10xxxxxx
// Drop the continuation marker (128), shift the payload into position and
// add it to the code point being assembled in the previous output slot.
$v =
($v-
128) <<
($next_byte*
6);
$output[($out_len-
1)] +=
$v;
// NOTE(review): the other visible sites assign self::$safe_char directly,
// while this one passes it through ord(); convert() defaults it to the
// integer 0xFFFC - verify which form is intended.
$output[$out_len-
1] =
ord(self::$safe_char);
throw
new Exception('Conversion from UTF-8 to UCS-4 failed: malformed input at byte '.
$k);
* Convert UCS-4 array into UTF-8 string
* See utf8_ucs4array() for details
// Appends the UTF-8 byte sequence for each value $v to the string $output,
// choosing the sequence length from the magnitude of $v.
// NOTE(review): this listing appears to have lost lines - the method braces,
// the loop that supplies $v and $k, the initialisation of $output, the body
// of the 7-bit branch and the return are not visible.
private static function ucs4array_utf8($input)
if ($v <
128) { // 7bit are transferred literally
} elseif ($v <
(1 <<
11)) { // 2 bytes
// Lead byte 110xxxxx carries the bits above the low six, then one continuation byte.
$output .=
chr(192+
($v >>
6)).
chr(128+
($v & 63));
} elseif ($v <
(1 <<
16)) { // 3 bytes
// Lead byte 1110xxxx followed by two continuation bytes of six bits each.
$output .=
chr(224+
($v >>
12)).
chr(128+
(($v >>
6) & 63)).
chr(128+
($v & 63));
} elseif ($v <
(1 <<
21)) { // 4 bytes
// Lead byte 11110xxx followed by three continuation bytes of six bits each.
$output .=
chr(240+
($v >>
18)).
chr(128+
(($v >>
12) & 63)).
chr(128+
(($v >>
6) & 63)).
chr(128+
($v & 63));
} elseif (self::$safe_mode) {
// NOTE(review): string concatenation of self::$safe_char - if it holds an
// integer code point (convert() defaults it to 0xFFFC) this appends its
// decimal digits rather than an encoded character; confirm intent.
$output .=
self::$safe_char;
throw
new Exception('Conversion from UCS-4 to UTF-8 failed: malformed input at byte '.
$k);
// Decodes the IMAP flavour of UTF-7: every ',' in the input is turned into
// '/' and the result is handed to utf7_ucs4array() with '&' as the shift
// character.
private static function utf7imap_ucs4array($input)
{
    $withSlashes = str_replace(',', '/', $input);

    return self::utf7_ucs4array($withSlashes, '&');
}
// Walks the UTF-7 input and fills $output with integer values; $sc is the
// shift character ('+' by default, '&' when called for the IMAP variant).
// NOTE(review): this listing appears to have lost most of the body - the
// method braces, the assignments of $c, $tmp, $inp_len, $output and
// $out_len, and the conditions around the statements below are not visible.
private static function utf7_ucs4array($input, $sc =
'+')
for ($k =
0; $k <
$inp_len; ++
$k) {
if (0 ==
ord($c)) continue; // Ignore zero bytes
// Sequence got terminated
// Stores the byte value of the shift character itself in the current slot.
$output[$out_len] =
ord($sc);
// Iterates over the bytes of $tmp (presumably the decoded base64 buffer - verify).
for ($i =
0; $i <
strlen($tmp); $i++
) {
// Adds this byte's value to the current slot.
$output[$out_len] +=
ord($tmp{$i});
// Places this byte's value in bits 8-15 of the current slot.
$output[$out_len] =
ord($tmp{$i}) <<
8;
// Stores the byte value of $c in the current slot.
$output[$out_len] =
ord($c);
// Encodes to the IMAP flavour of UTF-7: runs ucs4array_utf7() with '&' as
// the shift character and then turns every '/' of its result into ','.
private static function ucs4array_utf7imap($input)
{
    $encoded = self::ucs4array_utf7($input, '&');

    return str_replace('/', ',', $encoded);
}
// Builds UTF-7 output from integer values; $sc is the shift character
// ('+' by default, '&' when called for the IMAP variant).
// NOTE(review): this listing appears to have lost lines - the method braces,
// the loop that supplies $v, the initialisation of $b64 and $mode, and most
// branch bodies are not visible.
private static function ucs4array_utf7($input, $sc =
'+')
// True when $v is the value false, or when $v lies in 0x20..0x7e and is not
// the shift character itself.
$is_direct =
(false !==
$v) ?
(0x20 <=
$v &&
$v <=
0x7e &&
$v !=
ord($sc)) :
true;
// The buffer holds exactly a zero byte followed by the shift character.
if ($b64 ==
chr(0).
$sc) {
} elseif (false !==
$v) {
// Appends $v as two bytes, high byte first; only the low 16 bits of $v are
// kept by these masks.
$b64 .=
chr(($v >>
8) & 255).
chr($v & 255);
// In mode 'd' with a real value: (re)start the buffer with $v as two bytes.
if ($mode ==
'd' &&
false !==
$v) {
$b64 =
chr(($v >>
8) & 255).
chr($v & 255);
// Leave the loop once $v is false and nothing is left in the buffer.
if (false ===
$v &&
$b64 ==
'') break;
* Convert UCS-4 array into UCS-4 string (Big Endian: most significant byte first)
// Appends four bytes for the value $v to $output, most significant byte first.
// NOTE(review): this listing appears to have lost lines - the method braces,
// the initialisation of $output, the loop that supplies $v and the return
// are not visible.
private static function ucs4array_ucs4($input)
$output .=
chr(($v >>
24) & 255).
chr(($v >>
16) & 255).
chr(($v >>
8) & 255).
chr($v & 255);
* Convert UCS-4 string (Big Endian: most significant byte first) into UCS-4 array
// Rebuilds integer values from a byte string, four input bytes per value.
// NOTE(review): this listing appears to have lost lines - the method braces,
// the initialisation of $output and $inp_len, the length check guarding the
// exception, the body of the "every 4 bytes" branch and the final return
// are not visible.
private static function ucs4_ucs4array($input)
// Input length must be divisible by 4
throw
new Exception('Input UCS4 string is broken');
// Empty input - return empty output
if (!$inp_len) return $output;
for ($i =
0, $out_len = -
1; $i <
$inp_len; ++
$i) {
if (!($i %
4)) { // Increment output position every 4 input bytes
// Shift by 24, 16, 8, 0 bits for bytes 0..3 of each group, so the first
// byte of a group becomes the most significant one.
$output[$out_len] +=
ord($input{$i}) <<
(8 *
(3 -
($i %
4) ) );
Documentation generated on Tue, 19 Nov 2013 15:16:06 +0100 by phpDocumentor 1.4.3