<?php

class CharsetEntity {

   /* ==========================================================================================
	* Charset to UTF8 Convertor 0.9
	* Copyright 2004 by Niels Leenheer
	* ==========================================================================================
	* This program is free software and open source software; you can redistribute
	* it and/or modify it under the terms of the GNU General Public License as
	* published by the Free Software Foundation; either version 2 of the License,
	* or (at your option) any later version.
	*
	* This program is distributed in the hope that it will be useful, but WITHOUT
	* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
	* FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
	* more details.
	*
	* You should have received a copy of the GNU General Public License along
	* with this program; if not, write to the Free Software Foundation, Inc.,
	* 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA  or visit
	* http://www.gnu.org/licenses/gpl.html
	* ==========================================================================================
	*/

	/* Directory from which 'entity_named.php' is loaded */
	public $directory;

	/* Replacement emitted by _convert_entities() for bytes that are not valid UTF-8 */
	public $undefined;

	/* Not used by any method of this class */
	public $current_matrix;
	public $current_encoding;

	/* Translation table taken from $matrix in 'entity_named.php'.
	 * NOTE(review): presumably maps numeric entities ('&#233;') to named
	 * entities ('&eacute;'), judging by how _named_entities() applies it;
	 * confirm against the data file.
	 */
	public $named_matrix = array();


	/**
	 * Create a converter.
	 *
	 * @param string $directory Directory in which the matrix files can be
	 *                          found; defaults to the directory of this file.
	 */
	public function __construct ($directory = '')
	{
		$this->undefined = '?';

		if ($directory == '')
			$directory = dirname(str_replace('\\', '/', __FILE__));

		$this->setDirectory($directory);
	}

	/**
	 * Legacy PHP 4 style constructor name, kept so existing code that calls
	 * it explicitly keeps working. New code should rely on __construct().
	 *
	 * @param string $directory See __construct().
	 */
	public function CharsetEntity ($directory = '')
	{
		$this->__construct($directory);
	}

	/**
	 * Select the directory holding the matrix files and (re)load the named
	 * entity table from '<directory>/entity_named.php'.
	 *
	 * Nothing happens when the directory is the one already in use.
	 *
	 * @param string $directory Directory in which matrixes can be found.
	 */
	public function setDirectory ($directory)
	{
		if ($directory != $this->directory)
		{
			$this->directory = $directory;

			// The include is expected to define $matrix. Loading stays
			// best-effort (hence the @), but a missing or malformed file now
			// leaves an empty table instead of null, which used to break
			// array_flip()/strtr() later on.
			$matrix = array();
			@include ($this->directory . '/entity_named.php');
			$this->named_matrix = is_array($matrix) ? $matrix : array();
		}
	}

	/**
	 * Set the replacement used for input that cannot be converted.
	 *
	 * @param string $undefined Character (or string) to emit instead.
	 */
	public function setUndefined ($undefined)
	{
		$this->undefined = $undefined;
	}

	/**
	 * Look for an XML declaration in the first 400 bytes of a document and
	 * return the encoding it names.
	 *
	 * Only bytes 0x21-0x7F are kept before matching, so whitespace, control
	 * characters, NUL bytes and bytes >= 0x80 do not get in the way.
	 *
	 * @param string $str Raw document contents.
	 * @return string|false Lower-cased encoding name, or false if no
	 *                      declaration with an encoding was found.
	 */
	public function detectDeclaration ($str)
	{
		$head = (string) substr($str, 0, 400);
		$head = (string) preg_replace('/[^\x21-\x7F]/', '', $head);

		// [^>]* keeps the match inside the declaration itself and allows other
		// pseudo-attributes (version, standalone) before or after "encoding".
		if (preg_match('/<\?xml[^>]*?encoding\s*=\s*["\']([^"\']+)["\'][^>]*\?>/i', $head, $matches))
			return strtolower($matches[1]);

		return false;
	}

	/**
	 * Convert a string from the given encoding to UTF-8.
	 *
	 * iconv is preferred, mbstring is the fallback; when neither is
	 * available the string is returned unchanged.
	 *
	 * @param string $string   Text in $encoding.
	 * @param string $encoding Name or alias of the source encoding.
	 * @return string|false Whatever the underlying conversion function
	 *                      returns (iconv() yields false on failure).
	 */
	public function convert ($string, $encoding = 'iso-8859-1')
	{
		$source = $this->_alias($encoding);

		if (is_callable('iconv'))
			return iconv($source, 'UTF-8', $string);

		if (is_callable('mb_convert_encoding'))
			return mb_convert_encoding($string, 'UTF-8', $source);

		return $string;
	}

	/**
	 * Encode special characters as entities.
	 *
	 * @param string|array $string         UTF-8 string, or a (nested) array
	 *                                     of such strings; keys are kept.
	 * @param bool         $utf8toentities When true, every non 7-bit
	 *                                     character is first turned into a
	 *                                     numeric entity, so the result is
	 *                                     pure 7-bit ASCII. When false the
	 *                                     multi-byte characters are left as
	 *                                     they are.
	 * @return string|array The input with numeric entities normalised and
	 *                      markup characters / known code points replaced by
	 *                      named entities.
	 */
	public function entities($string, $utf8toentities = true)
	{
		if (is_array($string))
		{
			// foreach always visits every element; the former each() loop
			// started from wherever the array's internal pointer was.
			foreach ($string as $key => $value)
				$string[$key] = $this->entities($value, $utf8toentities);

			return $string;
		}

		if ($utf8toentities)
			$string = $this->_convert_entities($string);

		$string = $this->_numeric_entities($string);
		$string = $this->_named_entities($string);

		return $string;
	}

	/**
	 * Report the text direction associated with an encoding.
	 *
	 * @param string $encoding Name or alias of an encoding.
	 * @return string 'rtl' for the Arabic and Hebrew encodings listed
	 *                below, 'ltr' for everything else.
	 */
	public function bidi($encoding)
	{
		$rtl = array (
			'iso-8859-6',		/* Arabic */
			'windows-1256',
			'iso-8859-8',		/* Hebrew */
			'windows-1255',
		);

		return in_array($this->_alias($encoding), $rtl, true) ? 'rtl' : 'ltr';
	}

	/**
	 * Resolve an encoding alias.
	 *
	 * @param string $encoding A name or alias of an encoding (any case).
	 * @return string The lower-cased canonical name when the alias is known,
	 *                otherwise the lower-cased input.
	 */
	public function _alias($encoding)
	{
		// Built once and reused across calls.
		static $aliases = array (

			/* big5 */
			'csbig5'			=> 'big5',

			/* cp037 */
			'ibm037'			=> 'cp037',
			'ebcdic-cp-us'		=> 'cp037',
			'ebcdic-cp-ca'		=> 'cp037',
			'ebcdic-cp-wt'		=> 'cp037',
			'ebcdic-cp-nl'		=> 'cp037',
			'csibm037'			=> 'cp037',

			/* cp1026 */
			'ibm1026'			=> 'cp1026',
			'csibm1026'			=> 'cp1026',

			/* cp424 */
			'ibm424'			=> 'cp424',
			'ebcdic-cp-he'		=> 'cp424',
			'csibm424'			=> 'cp424',

			/* cp437 */
			'ibm437'			=> 'cp437',
			'437'				=> 'cp437',
			'cspc8codepage437'	=> 'cp437',

			/* cp500 */
			'ibm500'			=> 'cp500',
			'ebcdic-cp-be'		=> 'cp500',
			'ebcdic-cp-cn'		=> 'cp500',
			'csibm500'			=> 'cp500',

			/* cp775 */
			'ibm775'			=> 'cp775',
			'cspc775baltic'		=> 'cp775',

			/* cp850 */
			'ibm850'			=> 'cp850',
			'850'				=> 'cp850',
			'cspc850multilingual'	=> 'cp850',

			/* cp851 */
			'ibm851'			=> 'cp851',
			'851'				=> 'cp851',
			'csibm851'			=> 'cp851',

			/* cp852 */
			'ibm852'			=> 'cp852',
			'852'				=> 'cp852',
			'cspcp852'			=> 'cp852',

			/* cp855 */
			'ibm855'			=> 'cp855',
			'855'				=> 'cp855',
			'csibm855'			=> 'cp855',

			/* cp857 */
			'ibm857'			=> 'cp857',
			'857'				=> 'cp857',
			'csibm857'			=> 'cp857',

			/* cp860 */
			'ibm860'			=> 'cp860',
			'860'				=> 'cp860',
			'csibm860'			=> 'cp860',

			/* cp861 */
			'ibm861'			=> 'cp861',
			'861'				=> 'cp861',
			'cp-is'				=> 'cp861',
			'csibm861'			=> 'cp861',

			/* cp862 */
			'ibm862'			=> 'cp862',
			'862'				=> 'cp862',
			'cspc862latinhebrew'=> 'cp862',

			/* cp863 */
			'ibm863'			=> 'cp863',
			'863'				=> 'cp863',
			'csibm863'			=> 'cp863',

			/* cp864 */
			'ibm864'			=> 'cp864',
			'864'				=> 'cp864',
			'csibm864'			=> 'cp864',

			/* cp865 */
			'ibm865'			=> 'cp865',
			'865'				=> 'cp865',
			'csibm865'			=> 'cp865',

			/* cp866 */
			'ibm866'			=> 'cp866',
			'866'				=> 'cp866',
			'csibm866'			=> 'cp866',

			/* cp868 */
			'ibm868'			=> 'cp868',
			'868'				=> 'cp868',
			'cp-ar'				=> 'cp868',
			'csibm868'			=> 'cp868',

			/* cp869 */
			'ibm869'			=> 'cp869',
			'869'				=> 'cp869',
			'cp-gr'				=> 'cp869',
			'csibm869'			=> 'cp869',

			/* gb2312 */
			'csgb2312'			=> 'gb2312',

			/* iso-8859-1 */
			'iso-ir-100'		=> 'iso-8859-1',
			'iso_8859-1'		=> 'iso-8859-1',
			'latin1'			=> 'iso-8859-1',
			'l1'				=> 'iso-8859-1',
			'ibm819'			=> 'iso-8859-1',
			'cp819'				=> 'iso-8859-1',
			'csisolatin1'		=> 'iso-8859-1',

			/* iso-8859-2 */
			'iso-ir-101'		=> 'iso-8859-2',
			'iso_8859-2'		=> 'iso-8859-2',
			'latin2'			=> 'iso-8859-2',
			'l2'				=> 'iso-8859-2',
			'csisolatin2'		=> 'iso-8859-2',

			/* iso-8859-3 */
			'iso-ir-109'		=> 'iso-8859-3',
			'iso_8859-3'		=> 'iso-8859-3',
			'latin3'			=> 'iso-8859-3',
			'l3'				=> 'iso-8859-3',
			'csisolatin3'		=> 'iso-8859-3',

			/* iso-8859-4 */
			'iso-ir-110'		=> 'iso-8859-4',
			'iso_8859-4'		=> 'iso-8859-4',
			'latin4'			=> 'iso-8859-4',
			'l4'				=> 'iso-8859-4',
			'csisolatin4'		=> 'iso-8859-4',

			/* iso-8859-5 */
			'iso-ir-144'		=> 'iso-8859-5',
			'iso_8859-5'		=> 'iso-8859-5',
			'cyrillic'			=> 'iso-8859-5',
			'csisolatincyrillic'=> 'iso-8859-5',

			/* iso-8859-6 */
			'iso-ir-127'		=> 'iso-8859-6',
			'iso_8859-6'		=> 'iso-8859-6',
			'ecma-114'			=> 'iso-8859-6',
			'asmo-708'			=> 'iso-8859-6',
			'arabic'			=> 'iso-8859-6',
			'csisolatinarabic'	=> 'iso-8859-6',

			/* iso-8859-7 */
			'iso-ir-126'		=> 'iso-8859-7',
			'iso_8859-7'		=> 'iso-8859-7',
			'elot_928'			=> 'iso-8859-7',
			'ecma-118'			=> 'iso-8859-7',
			'greek'				=> 'iso-8859-7',
			'greek8'			=> 'iso-8859-7',
			'csisolatingreek'	=> 'iso-8859-7',

			/* iso-8859-8 */
			'iso-ir-138'		=> 'iso-8859-8',
			'iso_8859-8'		=> 'iso-8859-8',
			'hebrew'			=> 'iso-8859-8',
			'csisolatinhebrew'	=> 'iso-8859-8',

			/* iso-8859-9 */
			'iso-ir-148'		=> 'iso-8859-9',
			'iso_8859-9'		=> 'iso-8859-9',
			'latin5'			=> 'iso-8859-9',
			'l5'				=> 'iso-8859-9',
			'csisolatin5'		=> 'iso-8859-9',

			/* iso-8859-10 */
			'iso-ir-157'		=> 'iso-8859-10',
			'l6'				=> 'iso-8859-10',
			'csisolatin6'		=> 'iso-8859-10',
			'latin6'			=> 'iso-8859-10',

			/* iso-8859-14 */
			'iso-ir-199'		=> 'iso-8859-14',
			'iso_8859-14'		=> 'iso-8859-14',
			'latin8'			=> 'iso-8859-14',
			'isoceltic'			=> 'iso-8859-14',
			'l8'				=> 'iso-8859-14',

			/* iso-8859-15 */
			'iso_8859-15'		=> 'iso-8859-15',
			'latin-9'			=> 'iso-8859-15',

			/* iso-8859-16 */
			'iso-ir-226'		=> 'iso-8859-16',
			'iso_8859-16'		=> 'iso-8859-16',
			'latin10'			=> 'iso-8859-16',
			'l10'				=> 'iso-8859-16',

			/* koi8-r */
			'cskoi8r'			=> 'koi8-r',

			/* us_ascii */
			'iso-ir-6' 		 	=> 'us_ascii',
			'ansi_x3.4-1986'	=> 'us_ascii',
			'iso_646.irv:1991'	=> 'us_ascii',
			'ascii'				=> 'us_ascii',
			'iso646-us'			=> 'us_ascii',
			'us'				=> 'us_ascii',
			'ibm367'			=> 'us_ascii',
			'cp367'				=> 'us_ascii',
			'csascii'			=> 'us_ascii',

			/* shift_jis */
			'ms_kanji'			=> 'shift_jis',
			'csshiftjis'		=> 'shift_jis',

			/* euc-jp */
			'cseucpkdfmtjapanese'	=> 'euc-jp',

			/* iso-2022-jp */
			'csiso2022jp'		=> 'iso-2022-jp',

			/* utf-8 */
			'utf-2'				=> 'utf-8',

			/* utf-16 */
			'ucs-2'				=> 'utf-16',
			'iso-10646-ucs-2'	=> 'utf-16',
			'ucs-2e'			=> 'utf-16',
			'iso-10646-ucs-2e'	=> 'utf-16',

			/* utf-32 */
			'ucs-4'				=> 'utf-32',
			'iso-10646-ucs-4'	=> 'utf-32',
		);

		$encoding = strtolower($encoding);

		if (isset($aliases[$encoding]))
			$encoding = $aliases[$encoding];

		return $encoding;
	}

	/**
	 * Normalise the numeric entities in a string.
	 *
	 * Entities for code points 0x00-0x7F are replaced by the character
	 * itself; every other numeric entity (decimal or hexadecimal, with or
	 * without leading zeros) is rewritten as a plain decimal entity.
	 *
	 * The three passes deliberately mirror the original implementation,
	 * which relied on the /e pattern modifier (removed in PHP 7).
	 *
	 * @param string $string Text that may contain numeric entities.
	 * @return string Text with normalised numeric entities.
	 */
	public function _numeric_entities($string)
	{
		// Convert all decimal entities to hexadecimal entities
		$string = preg_replace_callback(
			'/&#([0-9]+);/',
			function ($m) { return '&#x' . dechex((int) $m[1]) . ';'; },
			$string
		);

		// Convert all hexadecimal entities ranging from 0x00 - 0x7F to normal characters
		$string = preg_replace_callback(
			'/&#[Xx]([0]*[0-7]?[0-9A-Fa-f]);/',
			function ($m) { return chr(hexdec($m[1])); },
			$string
		);

		// Convert all hexadecimal entities back to decimal entities
		$string = preg_replace_callback(
			'/&#[Xx]([0-9A-Fa-f]+);/',
			function ($m) { return '&#' . hexdec($m[1]) . ';'; },
			$string
		);

		return $string;
	}

	/**
	 * Escape markup characters and apply the named entity table.
	 *
	 * @param string $string 7-bit text using decimal numeric entities for
	 *                       other characters (see _numeric_entities()).
	 * @return string The text with ", ', &, < and > escaped, existing
	 *                entities left intact, and numeric entities replaced
	 *                through $this->named_matrix where it has an entry.
	 */
	public function _named_entities($string)
	{
		$matrix = is_array($this->named_matrix) ? $this->named_matrix : array();

		// Convert pre-existing named entities to their numeric equivalents
		if (count($matrix) > 0)
			$string = strtr($string, array_flip($matrix));

		// Make numeric entities safe, so their '&' is not escaped below
		$string = preg_replace('/&#([0-9]+);/', '[[NUMERIC_ENTITY:\\1]]', $string);

		// Escape markup characters. The entity => entity pairs keep already
		// escaped markup from being escaped a second time.
		$special = array (
			"&quot;" => "&quot;",	"&amp;" => "&amp;",		"&apos;" => "&#39;",
			"&lt;" => "&lt;",		"&gt;" => "&gt;",		"\x22" => "&quot;",
			"\x27" => "&#39;",		"\x26" => "&amp;",		"\x3c" => "&lt;",
			"\x3e" => "&gt;",
		);

		$string = strtr($string, $special);

		// Recreate numeric entities
		$string = preg_replace('/\[\[NUMERIC_ENTITY:([0-9]+)\]\]/i', '&#\\1;', $string);

		// Convert some of the numeric entities to their named equivalents
		if (count($matrix) > 0)
			$string = strtr($string, $matrix);

		return $string;
	}

	/**
	 * Replace every multi-byte UTF-8 sequence by a decimal numeric entity.
	 *
	 * ASCII bytes are copied through. A decoded code point is written as
	 * '&#NNNNN;', zero-padded to at least five digits. A byte that does not
	 * start a complete, well-formed sequence (stray continuation byte,
	 * unknown lead byte, truncated sequence, or a lead byte followed by a
	 * non-continuation byte) is replaced by $this->undefined and decoding
	 * resumes at the next byte.
	 *
	 * @param string $string Text in UTF-8 encoding.
	 * @return string 7-bit text using numeric entities for other characters.
	 */
	public function _convert_entities($string)
	{
		$string = (string) $string;
		$len = strlen($string);
		$pos = 0;
		$out = '';

		while ($pos < $len)
		{
			$lead = ord($string[$pos]);

			if ($lead < 0x80)
			{
				$out .= $string[$pos];
				$pos++;
				continue;
			}

			// Number of continuation bytes and payload bits of the lead byte
			if ($lead >= 0xC0 && $lead < 0xE0)
			{
				$extra = 1;
				$char_code = $lead - 0xC0;
			}
			elseif ($lead >= 0xE0 && $lead < 0xF0)
			{
				$extra = 2;
				$char_code = $lead - 0xE0;
			}
			elseif ($lead >= 0xF0 && $lead < 0xF8)
			{
				$extra = 3;
				$char_code = $lead - 0xF0;
			}
			else
			{
				$extra = 0;
				$char_code = 0;
			}

			// The sequence must fit in the string and consist of 10xxxxxx bytes
			$valid = ($extra > 0 && $pos + $extra < $len);

			for ($i = 1; $valid && $i <= $extra; $i++)
			{
				$cont = ord($string[$pos + $i]);

				if (($cont & 0xC0) != 0x80)
					$valid = false;
				else
					$char_code = ($char_code << 6) + ($cont - 0x80);
			}

			if (!$valid)
			{
				$out .= $this->undefined;
				$pos++;
				continue;
			}

			$pos += $extra + 1;

			if ($char_code < 0x80)
				$out .= chr($char_code);
			else
				$out .= sprintf('&#%05d;', $char_code);
		}

		return $out;
	}
}

?>