Source for file idna_convert.class.php
Documentation is available at idna_convert.class.php
/* vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4 foldmethod=marker: */
// +----------------------------------------------------------------------+
// | This library is free software; you can redistribute it and/or modify |
// | it under the terms of the GNU Lesser General Public License as |
// | published by the Free Software Foundation; either version 2.1 of the |
// | License, or (at your option) any later version. |
// | This library is distributed in the hope that it will be useful, but |
// | WITHOUT ANY WARRANTY; without even the implied warranty of |
// | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
// | Lesser General Public License for more details. |
// | You should have received a copy of the GNU Lesser General Public |
// | License along with this library; if not, write to the Free Software |
// | Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 |
// +----------------------------------------------------------------------+
* Encode/decode Internationalized Domain Names.
* The class converts internationalized domain names
* (see RFC 3490 for details), as they can be used with various registries worldwide,
* between their original (localized) form and their encoded form
* as it will be used in the DNS (Domain Name System).
* The class provides two public methods, encode() and decode(), which do exactly
* what you would expect them to do. You are allowed to use complete domain names,
* simple strings and complete email addresses as well. That means, that you might
* use any of the following notations:
* - xn--brse-5qa.xn--knrz-1ra.info
* Unicode input might be given as either UTF-8 string, UCS-4 string or UCS-4
* array. Unicode output is available in the same formats.
* You can select your preferred format via {@link set_parameter()}.
* ACE input and output is always expected to be ASCII.
* @author Matthias Sommerfeld <mso@phlylabs.de>
* @copyright 2004-2007 phlyLabs Berlin, http://phlylabs.de
* Holds all relevant mapping tables, loaded from a separate file on construct
* See RFC3454 for details
// Internal settings, do not mess with them
var $_ncount =
588; // _vcount * _tcount
var $_scount =
11172; // _lcount * _tcount * _vcount
// See {@link set_parameter()} for details of how to change the following
// settings from within your script / application
// If parameters are given, pass these to the respective method
* Sets a new option value. Available options and values:
* [encoding - Use either UTF-8, UCS4 as array or UCS4 as string as input ('utf8' for UTF-8,
* 'ucs4_string' and 'ucs4_array' respectively for UCS4); The output is always UTF-8]
* [overlong - Unicode does not allow unnecessarily long encodings of chars,
* to allow this, set this parameter to true, else to false;
* [strict - true: strict mode, good for registration purposes - Causes errors
* on failures; false: loose mode, ideal for "wildlife" applications
* by silently ignoring errors and returning the original input instead
* @param mixed Parameter to set (string: single parameter; array of Parameter => Value pairs)
* @param string Value to use (if parameter 1 is a string)
* @return boolean true on success, false otherwise
$option =
array($option =>
$value);
foreach ($option as $k =>
$v) {
$this->_error('Set Parameter: Unknown parameter '.
$v.
' for option '.
$k);
$this->_error('Set Parameter: Unknown option '.
$k);
* Decode a given ACE domain name
* @param string Domain name (ACE string)
* [@param string Desired output encoding, see {@link set_parameter}]
* @return string Decoded Domain name (UTF-8 or UCS-4)
function decode($input, $one_time_encoding =
false)
if ($one_time_encoding) {
switch ($one_time_encoding) {
$this->_error('Unknown encoding '.
$one_time_encoding);
// Make sure to drop any newline characters around
// Negotiate input and try to determine, whether it is a plain string,
// an email address or something like a complete URL
if (strpos($input, '@')) { // Maybe it is an email address
$this->_error('Only simple domain name parts can be handled in strict mode');
list
($email_pref, $input) =
explode('@', $input, 2);
foreach ($arr as $k =>
$v) {
if ($conv) $arr[$k] =
$conv;
$input =
join('.', $arr);
foreach ($arr as $k =>
$v) {
if ($conv) $arr[$k] =
$conv;
$email_pref =
join('.', $arr);
$return =
$email_pref .
'@' .
$input;
} elseif (preg_match('![:\./]!', $input)) { // Or a complete domain name (with or without paths / parameters)
$this->_error('Only simple domain name parts can be handled in strict mode');
if (isset
($parsed['host'])) {
$arr =
explode('.', $parsed['host']);
foreach ($arr as $k =>
$v) {
if ($conv) $arr[$k] =
$conv;
$parsed['host'] =
join('.', $arr);
(empty($parsed['scheme']) ?
'' :
$parsed['scheme'].
(strtolower($parsed['scheme']) ==
'mailto' ?
':' :
'://'))
.
(empty($parsed['user']) ?
'' :
$parsed['user'].
(empty($parsed['pass']) ?
'' :
':'.
$parsed['pass']).
'@')
.
(empty($parsed['port']) ?
'' :
':'.
$parsed['port'])
.
(empty($parsed['path']) ?
'' :
$parsed['path'])
.
(empty($parsed['query']) ?
'' :
'?'.
$parsed['query'])
.
(empty($parsed['fragment']) ?
'' :
'#'.
$parsed['fragment']);
} else { // parse_url seems to have failed, try without it
foreach ($arr as $k =>
$v) {
$arr[$k] =
($conv) ?
$conv :
$v;
$return =
join('.', $arr);
} else { // Otherwise we consider it being a pure domain name string
if (!$return) $return =
$input;
// The output is UTF-8 by default, other output formats need conversion here
// If one time encoding is given, use this, else the objects property
switch (($one_time_encoding) ?
$one_time_encoding :
$this->_api_encoding) {
$this->_error('Unsupported output format');
* Encode a given UTF-8 domain name
* @param string Domain name (UTF-8 or UCS-4)
* [@param string Desired input encoding, see {@link set_parameter}]
* @return string Encoded Domain name (ACE string)
function encode($decoded, $one_time_encoding =
false)
// Forcing conversion of input to UCS4 array
// If one time encoding is given, use this, else the objects property
switch ($one_time_encoding ?
$one_time_encoding :
$this->_api_encoding) {
$this->_error('Unsupported input format: '.
($one_time_encoding ?
$one_time_encoding :
$this->_api_encoding));
// No input, no output, what else did you expect?
if (empty($decoded)) return '';
foreach ($decoded as $k =>
$v) {
// Make sure to use just the plain dot
// Right, no break here, the above are converted to dots anyway
// Stumbling across an anchoring character
// Neither email addresses nor URLs allowed in strict mode
$this->_error('Neither email addresses nor URLs are allowed in strict mode.');
$output .=
chr($decoded[$k]);
// Catch the rest of the string
if ($output =
$this->_encode($decoded)) {
* Use this method to get the last error that occurred
* @return string The last error that occurred
* The actual decoding algorithm
// We do need to find the Punycode prefix
$this->_error('This is not a punycode string');
// If nothing left after removing the prefix, it is hopeless
$this->_error('The given encoded string was empty');
// Find last occurrence of the delimiter
$delim_pos =
strrpos($encoded, '-');
$decoded[] =
ord($encoded{$k});
$deco_len =
count($decoded);
// Wandering through the strings; init
for ($enco_idx =
($delim_pos) ?
($delim_pos +
1) :
0; $enco_idx <
$enco_len; ++
$deco_len) {
for ($old_idx =
$idx, $w =
1, $k =
$this->_base; 1 ; $k +=
$this->_base) {
$t =
($k <=
$bias) ?
$this->_tmin :
(($k >=
$bias +
$this->_tmax) ?
$this->_tmax :
($k -
$bias));
$w = (int)
($w *
($this->_base -
$t));
$bias =
$this->_adapt($idx -
$old_idx, $deco_len +
1, $is_first);
$char += (int)
($idx /
($deco_len +
1));
// Make room for the decoded char
for ($i =
$deco_len; $i >
$idx; $i--
) {
$decoded[$i] =
$decoded[($i -
1)];
$decoded[$idx++
] =
$char;
* The actual encoding algorithm
// We cannot encode a domain name containing the Punycode prefix
if ($check_pref ==
$check_deco) {
$this->_error('This is already a punycode string');
// We will not try to encode strings consisting of basic code points only
foreach ($decoded as $k =>
$v) {
$this->_error('The given string does not contain encodable chars');
if (!$decoded ||
!is_array($decoded)) return false; // NAMEPREP failed
$deco_len =
count($decoded);
if (!$deco_len) return false; // Empty array
$codecount =
0; // How many chars have been consumed
// Copy all basic code points to output
for ($i =
0; $i <
$deco_len; ++
$i) {
// Will match [-0-9a-zA-Z]
if ((0x2F <
$test &&
$test <
0x40) ||
(0x40 <
$test &&
$test <
0x5B)
||
(0x60 <
$test &&
$test <=
0x7B) ||
(0x2D ==
$test)) {
$encoded .=
chr($decoded[$i]);
if ($codecount ==
$deco_len) return $encoded; // All codepoints were basic ones
// Start with the prefix; copy it to output
// If we have basic code points in output, add a hyphen to the end
if ($codecount) $encoded .=
'-';
// Now find and encode all non-basic code points
while ($codecount <
$deco_len) {
// Find the smallest code point >= the current code point and
// remember the last occurrence of it in the input
for ($i =
0, $next_code =
$this->_max_ucs; $i <
$deco_len; $i++
) {
if ($decoded[$i] >=
$cur_code &&
$decoded[$i] <=
$next_code) {
$next_code =
$decoded[$i];
$delta +=
($next_code -
$cur_code) *
($codecount +
1);
// Scan input again and encode all characters whose code point is $cur_code
for ($i =
0; $i <
$deco_len; $i++
) {
if ($decoded[$i] <
$cur_code) {
} elseif ($decoded[$i] ==
$cur_code) {
for ($q =
$delta, $k =
$this->_base; 1; $k +=
$this->_base) {
$t =
($k <=
$bias) ?
$this->_tmin :
(($k >=
$bias +
$this->_tmax) ?
$this->_tmax :
$k -
$bias);
$encoded .=
$this->_encode_digit(intval($t +
(($q -
$t) %
($this->_base -
$t)))); //v0.4.5 Changed from ceil() to intval()
$q = (int)
(($q -
$t) /
($this->_base -
$t));
$bias =
$this->_adapt($delta, $codecount+
1, $is_first);
* Adapt the bias according to the current code point and position
function _adapt($delta, $npoints, $is_first)
$delta =
intval($is_first ?
($delta /
$this->_damp) :
($delta /
2));
$delta +=
intval($delta /
$npoints);
* Encoding a certain digit
return chr($d +
22 +
75 *
($d <
26));
return ($cp -
48 <
10) ?
$cp -
22 :
(($cp -
65 <
26) ?
$cp -
65 :
(($cp -
97 <
26) ?
$cp -
97 :
$this->_base));
* Internal error handling method
* Do Nameprep according to RFC3491 and RFC3454
* @param array Unicode Characters
* @return string Unicode Characters, Nameprep'd
// Walking through the input array, performing the required steps on each of
// the input chars and putting the result into the output array
// While mapping required chars we apply the canonical ordering
// Map to nothing == skip that code point
if (in_array($v, $this->NP['map_nothing'])) continue;
// Try to find prohibited input
$this->_error('NAMEPREP: Prohibited input U+'.
sprintf('%08X', $v));
foreach ($this->NP['prohibit_ranges'] as $range) {
if ($range[0] <=
$v &&
$v <=
$range[1]) {
$this->_error('NAMEPREP: Prohibited input U+'.
sprintf('%08X', $v));
// Hangul syllable decomposition
if (0xAC00 <=
$v &&
$v <=
0xD7AF) {
// There's a decomposition mapping for that code point
} elseif (isset
($this->NP['replacemaps'][$v])) {
// Before applying any Combining, try to rearrange any Hangul syllables
$out_len =
count($output);
for ($i =
0; $i <
$out_len; ++
$i) {
if ((!$last_class ||
$last_class >
$class) &&
$class) {
$seq_len =
$i -
$last_starter;
// On match: Replace the last starter with the composed character and remove
// the now redundant non-starter(s)
$output[$last_starter] =
$out;
if (count($out) !=
$seq_len) {
for ($j =
$i+
1; $j <
$out_len; ++
$j) {
$output[$j-
1] =
$output[$j];
unset
($output[$out_len]);
// Rewind the for loop by one, since there can be more possible compositions
// The current class is 0
if (!$class) $last_starter =
$i;
* Decomposes a Hangul syllable
* (see http://www.unicode.org/unicode/reports/tr15/#Hangul)
* @param integer 32bit UCS4 code point
* @return array Either Hangul Syllable decomposed or original 32bit value as one value array
$sindex = (int)
$char -
$this->_sbase;
if ($sindex <
0 ||
$sindex >=
$this->_scount) {
if ($T !=
$this->_tbase) $result[] =
$T;
* Composes a Hangul syllable
* (see http://www.unicode.org/unicode/reports/tr15/#Hangul)
* @param array Decomposed UCS4 sequence
* @return array UCS4 sequence with syllables composed
$inp_len =
count($input);
if (!$inp_len) return array();
$result[] =
$last; // copy first char from input to output
for ($i =
1; $i <
$inp_len; ++
$i) {
$char = (int)
$input[$i];
$sindex =
$last -
$this->_sbase;
$lindex =
$last -
$this->_lbase;
$vindex =
$char -
$this->_vbase;
$tindex =
$char -
$this->_tbase;
// Find out, whether two current characters are LV and T
if (0 <=
$sindex &&
$sindex <
$this->_scount &&
($sindex %
$this->_tcount ==
0)
&&
0 <=
$tindex &&
$tindex <=
$this->_tcount) {
// create syllable of form LVT
$result[(count($result) -
1)] =
$last; // reset last
continue; // discard char
// Find out, whether two current characters form L and V
if (0 <=
$lindex &&
$lindex <
$this->_lcount &&
0 <=
$vindex &&
$vindex <
$this->_vcount) {
// create syllable of form LV
$result[(count($result) -
1)] =
$last; // reset last
continue; // discard char
// if neither case was true, just add the character
* Returns the combining class of a certain wide char
* @param integer Wide char to check (32bit integer)
* @return integer Combining class if found, else 0
return isset
($this->NP['norm_combcls'][$char]) ?
$this->NP['norm_combcls'][$char] :
0;
* Applies the canonical ordering of a decomposed UCS4 sequence
* @param array Decomposed UCS4 sequence
* @return array Ordered UCS4 sequence
for ($i =
0; $i <
$size-
1; ++
$i) {
if ($next !=
0 &&
$last >
$next) {
// Move item leftward until it fits
for ($j =
$i +
1; $j >
0; --
$j) {
$input[$j] =
intval($input[$j-
1]);
// Reentering the loop looking at the old character again
* Do composition of a sequence of starter and non-starter
* @param array UCS4 Decomposed sequence
* @return array Ordered UCS4 sequence
$inp_len =
count($input);
foreach ($this->NP['replacemaps'] as $np_src =>
$np_target) {
if ($np_target[0] !=
$input[0]) continue;
if (count($np_target) !=
$inp_len) continue;
foreach ($input as $k2 =>
$v2) {
if ($v2 ==
$np_target[$k2]) {
if ($hit) return $np_src;
* This converts an UTF-8 encoded string to its UCS-4 representation
* By talking about UCS-4 "strings" we mean arrays of 32bit integers representing
* each of the "chars". This is due to PHP not being able to handle strings with
* bit depth different from 8. This applies to the reverse method _ucs4_to_utf8(), too.
* The following UTF-8 encodings are supported:
* bytes bits representation
* 3 16 1110xxxx 10xxxxxx 10xxxxxx
* 4 21 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
* 5 26 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
* 6 31 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
* Each x represents a bit that can be used to store character data.
* The five and six byte sequences are part of Annex D of ISO/IEC 10646-1:2000
for ($k =
0; $k <
$inp_len; ++
$k) {
$v =
ord($input{$k}); // Extract byte from input string
if ($v <
128) { // We found an ASCII char - put into stirng as is
$this->_error('Conversion from UTF-8 to UCS-4 failed: malformed input at byte '.
$k);
if ('next' ==
$mode) { // Try to find the next start byte; determine the width of the Unicode char
if ($v >>
5 ==
6) { // &110xxxxx 10xxxxx
$next_byte =
0; // Tells, how many times subsequent bitmasks must rotate 6bits to the left
} elseif ($v >>
4 ==
14) { // &1110xxxx 10xxxxxx 10xxxxxx
} elseif ($v >>
3 ==
30) { // &11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
} elseif ($v >>
2 ==
62) { // &111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
} elseif ($v >>
1 ==
126) { // &1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
$this->_error('This might be UTF-8, but I don\'t understand it at byte '.
$k);
$output[$out_len] = (int)
$v;
if (($v <
0xA0 &&
$start_byte ==
0xE0) ||
($v <
0x90 &&
$start_byte ==
0xF0) ||
($v >
0x8F &&
$start_byte ==
0xF4)) {
$this->_error('Bogus UTF-8 character detected (out of legal range) at byte '.
$k);
if ($v >>
6 ==
2) { // Bit mask must be 10xxxxxx
$v =
($v -
128) <<
($next_byte *
6);
$output[($out_len -
1)] +=
$v;
$this->_error('Conversion from UTF-8 to UCS-4 failed: malformed input at byte '.
$k);
* Convert UCS-4 string into UTF-8 string
* See _utf8_to_ucs4() for details
if ($v <
128) { // 7bit are transferred literally
} elseif ($v <
(1 <<
11)) { // 2 bytes
$output .=
chr(192 +
($v >>
6)) .
chr(128 +
($v & 63));
} elseif ($v <
(1 <<
16)) { // 3 bytes
$output .=
chr(224 +
($v >>
12)) .
chr(128 +
(($v >>
6) & 63)) .
chr(128 +
($v & 63));
} elseif ($v <
(1 <<
21)) { // 4 bytes
$output .=
chr(240 +
($v >>
18)) .
chr(128 +
(($v >>
12) & 63))
.
chr(128 +
(($v >>
6) & 63)) .
chr(128 +
($v & 63));
} elseif ($v <
(1 <<
26)) { // 5 bytes
$output .=
chr(248 +
($v >>
24)) .
chr(128 +
(($v >>
18) & 63))
.
chr(128 +
(($v >>
12) & 63)) .
chr(128 +
(($v >>
6) & 63))
} elseif ($v <
(1 <<
31)) { // 6 bytes
$output .=
chr(252 +
($v >>
30)) .
chr(128 +
(($v >>
24) & 63))
.
chr(128 +
(($v >>
18) & 63)) .
chr(128 +
(($v >>
12) & 63))
.
chr(128 +
(($v >>
6) & 63)) .
chr(128 +
($v & 63));
$this->_error('Conversion from UCS-4 to UTF-8 failed: malformed input at byte '.
$k);
* Convert UCS-4 array into UCS-4 string
// Take array values and split output to 4 bytes per value
// The bit mask is 255, which reads &11111111
$output .=
chr(($v >>
24) & 255).
chr(($v >>
16) & 255).
chr(($v >>
8) & 255).
chr($v & 255);
* Convert UCS-4 string into UCS-4 array
// Input length must be divisible by 4
$this->_error('Input UCS4 string is broken');
// Empty input - return empty output
if (!$inp_len) return $output;
for ($i =
0, $out_len = -
1; $i <
$inp_len; ++
$i) {
// Increment output position every 4 input bytes
$output[$out_len] +=
ord($input{$i}) <<
(8 *
(3 -
($i %
4) ) );
* Adapter class for aligning the API of idna_convert with that of Net_IDNA
* @author Matthias Sommerfeld <mso@phlylabs.de>
* Sets a new option value. Available options and values:
* [encoding - Use either UTF-8, UCS4 as array or UCS4 as string as input ('utf8' for UTF-8,
* 'ucs4_string' and 'ucs4_array' respectively for UCS4); The output is always UTF-8]
* [overlong - Unicode does not allow unnecessarily long encodings of chars,
* to allow this, set this parameter to true, else to false;
* [strict - true: strict mode, good for registration purposes - Causes errors
* on failures; false: loose mode, ideal for "wildlife" applications
* by silently ignoring errors and returning the original input instead
* @param mixed Parameter to set (string: single parameter; array of Parameter => Value pairs)
* @param string Value to use (if parameter 1 is a string)
* @return boolean true on success, false otherwise
return $this->IC->set_parameters($option, $param);
Documentation generated on Tue, 19 Nov 2013 15:05:13 +0100 by phpDocumentor 1.4.3