<?php
require(R_P.'require/chinese.php');
require(R_P.'require/imageWater.php');


/**
 * Content-page gathering entry point (currently a stub).
 *
 * Reads up to $linkCount cached URLs, starting at the first entry. The list is
 * not processed further yet and nothing is returned.
 *
 * @param integer $linkCount number of cached URLs to read
 */
function gathercontent($linkCount)
{
	// Could later be extended to start from a recorded offset instead of 0.
	$pendingUrls = readUrl(0, $linkCount);
}

/**
 * Persist the collected links to the per-task temporary list file.
 *
 * Every link is passed through realUrl() and written on its own
 * "\n"-terminated line to data/cache/gather/cache_<g_id>_list.txt under R_P.
 *
 * @param array $links links as found in the page (relative or absolute)
 */
function saveLinks($links)
{
	global $g_id;

	$targetFile = R_P.'data/cache/gather/cache_'.$g_id.'_list.txt';

	$buffer = '';
	foreach ($links as $link) {
		$buffer .= realUrl($link)."\n";
	}

	writeover($targetFile, $buffer, 'rb+');
}

/**
 * Read a slice of the URL list stored in the per-task temporary file.
 *
 * The file is expected to hold one URL per line, each terminated by "\n";
 * the element after the final newline is discarded.
 *
 * @param integer $start   zero-based index of the first URL to return
 * @param integer $readnum number of URLs to return; values <= 0 are treated as 1
 * @return array the requested URLs (may be shorter than $readnum)
 */
function readUrl($start,$readnum)
{
	global $g_id;

	$listFile = R_P.'data/cache/gather/cache_'.$g_id.'_list.txt';
	$lines = explode("\n", file_get_contents($listFile));

	// Drop the element that follows the trailing newline.
	array_pop($lines);

	if ($readnum <= 0) {
		$readnum = 1;
	}

	return array_slice($lines, $start, $readnum);
}

/**
 * Collect every valid content-page link from a list page and cache them.
 *
 * The region described by $Reg is cut out of $data with getData(), all links
 * inside it are extracted with getAllLinks(), and only links that contain
 * every required fragment are kept. Duplicates are removed and the result is
 * stored through saveLinks(). When the global $action is 'test', at most the
 * first link is kept.
 *
 * @param string $Reg        region rule passed to getData() (contains "{DATA}")
 * @param string $data       source of the list page
 * @param string $contenturl fragments every link must contain, separated by "|";
 *                           empty means no restriction
 * @return integer number of links that were saved
 */
function getLinks($Reg,$data,$contenturl)
{
	global $action;

	$region = getData($Reg,$data); // first narrow down to the valid region
	$allLinks = getAllLinks($region); // then pull every link out of that region

	// Always split on "|": the old strpos() truthiness test missed a "|" at
	// offset 0 and then required the literal "|..." string in every link.
	$required = array();
	if ($contenturl) {
		foreach (explode('|',$contenturl) as $fragment) {
			if (!empty($fragment)) {
				$required[] = $fragment;
			}
		}
	}

	$links = array();
	foreach ($allLinks as $link) {
		if (empty($link)) continue;
		$valid = true;
		foreach ($required as $fragment) {
			if (strpos($link,$fragment) === false) { // one missing fragment invalidates the link
				$valid = false;
				break;
			}
		}
		if ($valid) {
			$links[] = $link;
		}
	}

	$links = array_values(array_unique($links)); // remove duplicates, reindex

	// In test mode keep only the first link. array_slice() leaves an empty list
	// empty, whereas array(array_shift($links)) produced array(null) and made
	// the function report and save one bogus link.
	if ($action == 'test') {
		$links = array_slice($links,0,1);
	}

	saveLinks($links);
	return count($links);
}

/**
 * Extract every link found in a chunk of HTML/JavaScript source.
 *
 * The input is processed line by line. For each line only the FIRST rule
 * whose keywords all occur in the line is applied, in this order:
 *   1. href="..." attributes (matches followed by rel=nofollow are skipped)
 *   2. frame/iframe src="..."
 *   3. window.location = "..."
 *   4. <meta http-equiv="refresh" content="N;url=...">
 *   5. window.open("...")
 * Fragments ("#...") are not part of the returned URLs.
 *
 * @param string $data source text to scan
 * @return array URLs in the order they were found (may contain duplicates)
 */
function getAllLinks($data)
{
	// Shared tail of every pattern: optional opening quote, then
	// group 2 = [scheme://host[:port]] + path/query, optional #fragment,
	// optional closing quote or space.
	$urlTail = "[\'\"]?(([[a-z]{3,5}:\/\/(([.a-zA-Z0-9-])+(:[0-9]+)*))*([+:%\/?=&;\\\(\),._a-zA-Z0-9-]*))(#[.a-zA-Z0-9-]*)?[\'\" ]?";

	// Each rule: keywords that must all appear in the lower-cased line, and the
	// PCRE pattern to apply. The former POSIX eregi() patterns were converted
	// to case-insensitive PCRE because ereg* no longer exists in current PHP.
	$rules = array(
		array(
			array('href'),
			"/(href)\s*=\s*[\'\"]?(([[a-z]{3,5}:\/\/(([.a-zA-Z0-9-])+(:[0-9]+)*))*([+:%\/\?~=&;\\\(\),._a-zA-Z0-9-]*))(#[.a-zA-Z0-9-]*)?[\'\" ]?(\s*rel\s*=\s*[\'\"]?(nofollow)[\'\"]?)?/i",
		),
		array(
			array('frame','src'),
			"/(frame[^>]*src[[:blank:]]*)=[[:blank:]]*".$urlTail."/i",
		),
		array(
			array('window','location'),
			"/(window[.]location)[[:blank:]]*=[[:blank:]]*".$urlTail."/i",
		),
		array(
			array('http-equiv'),
			"/(http-equiv=['\"]refresh['\"] *content=['\"][0-9]+;url)[[:blank:]]*=[[:blank:]]*".$urlTail."/i",
		),
		array(
			array('window','open'),
			"/(window[.]open[[:blank:]]*[(])[[:blank:]]*".$urlTail."/i",
		),
	);

	$links = array();
	$regs = array();

	// The cast keeps explode() happy when getData() handed over 0 or null.
	foreach (explode("\n",(string)$data) as $chunk) {
		$lowered = strtolower($chunk);

		foreach ($rules as $rule) {
			list($keywords,$pattern) = $rule;

			$applies = true;
			foreach ($keywords as $keyword) {
				if (strpos($lowered,$keyword) === false) {
					$applies = false;
					break;
				}
			}
			if (!$applies) continue;

			// Consume matches one at a time; removing the matched text from
			// the line guarantees progress.
			while (preg_match($pattern,$chunk,$regs)) {
				// Group 10 exists only in the href pattern and only when
				// rel=nofollow matched; such links are skipped.
				if (!isset($regs[10])) {
					$links[] = $regs[2];
				}
				$chunk = str_replace($regs[0],"",$chunk);
			}
			break; // only the first applicable rule is used for a line
		}
	}

	return $links;
}


/**
 * Cut the wanted content out of $data according to a field rule.
 *
 * A rule has the form "<start marker>{DATA}<end marker>". The text between
 * the first occurrence of the start marker and the next occurrence of the end
 * marker is returned.
 *
 * @param string $Reg  field rule containing the "{DATA}" placeholder
 * @param string $data text to search
 * @return string|integer|null the extracted content; 0 when the end marker is
 *         empty or a marker cannot be found in $data; null when the rule has
 *         no "{DATA}" placeholder or nothing precedes it
 */
function getData($Reg,$data)
{
	$Reg = (string)$Reg;
	$data = (string)$data;

	$pos = strpos($Reg,'{DATA}');
	if ($pos === false || $pos === 0) {
		// No placeholder, or no start marker in front of it.
		return null;
	}

	$start = substr($Reg,0,$pos);
	$end = (string)substr($Reg,$pos+6); // 6 == strlen('{DATA}')
	if ($end === '') {
		return 0;
	}

	// Compare against false explicitly: a start marker at offset 0 of $data is
	// a valid hit (the previous truthiness test rejected it).
	$startpos = strpos($data,$start);
	if ($startpos === false) {
		return 0;
	}
	$startpos += strlen($start);

	// Without this check a missing end marker produced a negative length and
	// substr() returned unrelated text.
	$endpos = strpos($data,$end,$startpos);
	if ($endpos === false) {
		return 0;
	}

	return (string)substr($data,$startpos,$endpos - $startpos);
}

/**
 * Connect to the gather address and fetch its content.
 *
 * Side effect: defines the constants g_port, g_scheme, g_path and g_host from
 * the URL (only on the first call - constants cannot be redefined). realUrl()
 * relies on them to resolve relative links. g_path is the request path,
 * including "?query" when the URL has a query string.
 *
 * The page is fetched with file_get_contents() when allow_url_fopen is on,
 * otherwise with a raw socket request. The script exits with
 * 'Can not connect the server' when the socket cannot be opened.
 *
 * @param string $url absolute URL to fetch
 * @return string page body after g_charset_Conversion(); empty on fetch failure
 */
function open($url){
	$data = '';

	// Keep the parsed parts in their own variable; the previous code overwrote
	// them with the path string and then read 'query'/'scheme' from that string.
	$parts = parse_url($url);
	if (!is_array($parts)) {
		$parts = array(); // parse_url() returns false for seriously malformed URLs
	}

	$host = isset($parts['host']) ? $parts['host'] : '';
	$isHttps = isset($parts['scheme']) && strtolower($parts['scheme']) == 'https';

	// An explicit port wins; otherwise use the default port of the scheme.
	if (!empty($parts['port'])) {
		$port = (int)$parts['port'];
	} else {
		$port = $isHttps ? 443 : 80;
	}

	$path = empty($parts['path']) ? '/' : $parts['path'];
	if (isset($parts['query']) && $parts['query'] !== '') {
		$path .= "?".$parts['query'];
	}
	$scheme = $isHttps ? "https://" : "http://";

	defined('g_port') || define('g_port',$port);
	defined('g_scheme') || define('g_scheme',$scheme);
	defined('g_path') || define('g_path',$path);
	defined('g_host') || define('g_host',$host);

	if(ini_get('allow_url_fopen'))
	{
		$data = @file_get_contents($url);
		if ($data === false) {
			$data = '';
		}
	}
	else
	{
		$errno = 0;
		$errstr = '';
		$timeout = 90; // connect timeout in seconds
		$target = $isHttps ? 'ssl://'.$host : $host;
		$fso_data = @fsockopen($target,$port,$errno,$errstr,$timeout);
		// Abort when the connection FAILED (the test used to be inverted).
		$fso_data || exit('Can not connect the server');

		$user_agent = isset($_SERVER['HTTP_USER_AGENT']) ? $_SERVER['HTTP_USER_AGENT'] : '';
		// HTTP/1.0 + "Connection: Close": this reader cannot decode chunked
		// bodies and depends on the server closing the socket to reach EOF.
		$http="GET $path HTTP/1.0\r\n";
		$http.="Host: $host:$port\r\n";
		$http.="Accept:*/*\r\nAccept-Encoding: identity\r\n";
		$http.="Connection: Close\r\n";
		$http.="User-Agent: $user_agent\r\n\r\n";
		fwrite($fso_data,$http);

		$response = '';
		while (!feof($fso_data))
		{
			$buffer = fread($fso_data,8192);
			if ($buffer === false) break;
			$response .= $buffer;
			// Re-read the status on every pass; a value fetched once before
			// the loop never reflects a later timeout.
			$status = stream_get_meta_data($fso_data);
			if ($status['timed_out']) break;
		}
		fclose($fso_data);

		// Return the body only, like the file_get_contents() branch does.
		$separator = strpos($response,"\r\n\r\n");
		$data = $separator === false ? $response : (string)substr($response,$separator+4);
	}
	$data = g_charset_Conversion($data);
	return $data;
}


/**
 * Detect the <base href="..."> tag of a page.
 *
 * Only tags where "href" is the first attribute after "<base" are recognised.
 *
 * @param string $data page source
 * @return string the base URL (without fragment), or '' when there is none
 */
function getBaseHref($data)
{
	$baseHref = '';
	$reg = array();
	// Case-insensitive PCRE replacement for the former eregi() pattern.
	$pattern = "/<base[[:blank:]]*(href)=[[:blank:]]*[\'\"]?(([[a-z]{3,5}:\/\/(([.a-zA-Z0-9-])+(:[0-9]+)*))*([+:%\/?=&;\\\(\),._a-zA-Z0-9-]*))(#[.a-zA-Z0-9-]*)?[\'\"]?[^>]*>/i";
	if (preg_match($pattern,(string)$data,$reg) && !empty($reg[2])) {
		$baseHref = $reg[2];
	}
	return $baseHref;
}

/**
 * Turn a relative or absolute address found on the gathered page into a
 * complete URL.
 *
 * Resolution uses the constants g_scheme, g_host and g_path defined by open().
 * Anything after "?" in g_path is ignored when the current directory is
 * determined. Note that the port is not part of the generated URLs.
 *
 *   http(s)://...   returned unchanged
 *   //host/path     prefixed with the current scheme
 *   /path           resolved against the current host
 *   ?query          appended to the current page path
 *   anything else   resolved against the current directory; "." and ".."
 *                   segments are collapsed
 *
 * @param string $url address as found in the page
 * @return string absolute URL
 * @author AileenGuan
 */
function realUrl($url)
{
	$url = (string)$url;
	$origin = g_scheme.g_host;

	// Anchored on purpose: "/go.php?u=http://x" is a relative link.
	if (preg_match('/^https?:/i',$url)) {
		return $url;
	}
	if (substr($url,0,2) === '//') { // protocol-relative
		return rtrim(g_scheme,'/').$url;
	}
	if (substr($url,0,1) === '/') { // starts with a slash: host-relative
		return $origin.$url;
	}

	$pathOnly = explode("?",(string)g_path);
	$basePath = $pathOnly[0];

	if (substr($url,0,1) === '?') { // starts with a question mark: same page
		return $origin.$basePath.$url;
	}

	// Directory of the current page, always with leading and trailing slash.
	$lastSlash = strrpos($basePath,'/');
	$baseDir = $lastSlash === false ? '/' : substr($basePath,0,$lastSlash+1);
	if (substr($baseDir,0,1) !== '/') {
		$baseDir = '/'.$baseDir;
	}

	// Keep the query string away from the dot-segment handling.
	$query = '';
	$queryPos = strpos($url,'?');
	if ($queryPos !== false) {
		$query = substr($url,$queryPos);
		$url = (string)substr($url,0,$queryPos);
	}

	$segments = array();
	$lastRaw = '';
	foreach (explode('/',$baseDir.$url) as $segment) {
		$lastRaw = $segment;
		if ($segment === '..') {
			// Never pop the leading empty segment (the root).
			if (count($segments) > 1) {
				array_pop($segments);
			}
		} elseif ($segment !== '.') {
			$segments[] = $segment;
		}
	}
	// A path ending in "." or ".." names a directory: keep the trailing slash.
	if ($lastRaw === '.' || $lastRaw === '..') {
		$segments[] = '';
	}

	return $origin.implode('/',$segments).$query;
}

/**
 * Localise a remote image.
 *
 * Computes the local path <up_dir>/img/<Y-m-d>/<10 hex chars>.<ext> for the
 * image, makes sure the day directory exists (calls Showmsg('mkdir_err') when
 * it cannot be created) and, when $local is true and the target does not exist
 * yet, copies the image there and stamps it with the template's logo.gif
 * through ImageWater.
 *
 * @param string  $imgsrc source address of the image
 * @param boolean $local  true to copy and watermark, false to only compute the path
 * @return string local image path relative to R_P; the caller replaces the
 *         original address in the content with it
 */
function imageToLocal($imgsrc,$local=true){
	global $up_dir,$tpl_path;

	$file_ext = 'jpg';
	if (strpos($imgsrc,'.')) {
		// end() needs a real variable; passing explode() directly raises a notice.
		$pieces = explode('.',$imgsrc);
		$file_ext = strtolower(end($pieces));
	}
	// Anything outside the whitelist is forced to jpg so that localising can
	// never create a file with an executable extension.
	if(!in_array($file_ext,array('jpg','jpeg','png','gif'),true)) $file_ext='jpg';

	$imgname = substr(md5($imgsrc),10,10).'.'.$file_ext;
	// Compute the date once so directory and file path cannot disagree
	// when the call straddles midnight.
	$dayDir = $up_dir.'/img/'.date("Y-m-d");
	$newImgSrc = $dayDir.'/'.$imgname;
	$TargetImg = R_P.$newImgSrc;
	$file_name = R_P.$dayDir;

	// Recursive mkdir also creates a missing <up_dir>/img parent; the is_dir()
	// re-check tolerates a concurrent request having created the directory.
	if (!file_exists($file_name) && !mkdir($file_name,0777,true) && !is_dir($file_name)) {
		Showmsg('mkdir_err');
	}

	// Watermark only a freshly copied file: previously an already cached image
	// was stamped again on every call, and a failed copy was still handed to
	// ImageWater.
	if ($local && !file_exists($TargetImg) && copy($imgsrc,$TargetImg)) {
		@chmod($TargetImg,0777);

		$ImageWater = new ImageWater($TargetImg,9);
		$ImageWater->setWaterImageInfo(R_P.'images/'.$tpl_path.'/logo.gif');
		$ImageWater->makeWater();
	}
	return $newImgSrc;
}


/**
 * Convert gathered data from the source charset to the site charset.
 *
 * The global $g_charset starts out as a flag: a truthy value selects 'utf-8',
 * anything else 'gb2312'. As before, the global is replaced by the resolved
 * charset name; a name that is already resolved is now left alone so that
 * repeated calls keep using the same charset.
 *
 * The Chinese class is constructed with ($g_charset, $charset) - presumably
 * (from, to); verify against require/chinese.php.
 *
 * @param string $data gathered data
 * @return string $data converted when the charsets differ, otherwise unchanged
 */
function g_charset_Conversion($data){
	global $g_charset,$charset;

	// Resolve the flag only once. Re-evaluating the already resolved (truthy)
	// string turned 'gb2312' into 'utf-8' on every call after the first.
	if ($g_charset !== 'utf-8' && $g_charset !== 'gb2312') {
		$g_charset = $g_charset ? 'utf-8' : 'gb2312';
	}

	// Compare case-insensitively so 'UTF-8' vs 'utf-8' does not trigger a
	// pointless conversion.
	if(strtolower((string)$charset) != $g_charset){
		$chs = new Chinese($g_charset,$charset);
		$data = $chs->Convert($data);
	}
	return $data;
}



/**
 * Send a raw HTTP request to $host on port 80 and return the parsed response.
 *
 * @param string $host   host name to connect to
 * @param string $method complete request line, e.g. "POST /login.php HTTP/1.0"
 * @param string $str    request body; only sent when the request line starts with POST
 * @param string $sessid optional PHPSESSID value sent as a cookie
 * @return array|null null when the connection fails, otherwise an array with
 *         'content' (response body) and, when present in the response headers,
 *         'sessid' (PHPSESSID cookie value) and 'location' (redirect target)
 */
function GetWebContent($host, $method, $str, $sessid = '')
{
	//$ip = gethostbyname($host);
	$fp = fsockopen($host, 80);
	if (!$fp) return;

	$isPost = substr(trim($method),0, 4) == "POST";

	fputs($fp, "$method\r\n");
	fputs($fp, "Host: $host\r\n");
	if (!empty($sessid))
	{
		fputs($fp, "Cookie: PHPSESSID=$sessid; path=/;\r\n");
	}
	if ($isPost)
	{
		fputs($fp, "Content-Length: ". strlen($str) . "\r\n"); // the length is mandatory for POST
	}
	// Ask the server to close the socket after the response; otherwise the
	// read loop below waits for a keep-alive connection to time out.
	fputs($fp, "Connection: close\r\n");
	fputs($fp, "Content-Type: application/x-www-form-urlencoded\r\n\r\n");
	if ($isPost)
	{
		fputs($fp, $str."\r\n");
	}

	$response = '';
	while(!feof($fp))
	{
		$line = fgets($fp, 1024);
		if ($line === false) break;
		$response .= $line;
	}
	fclose($fp);

	// Split headers from body; a response without the blank separator line is
	// treated as headers only.
	$hlen = strpos($response,"\r\n\r\n");
	if ($hlen === false) {
		$header = $response;
		$entity = '';
	} else {
		$header = substr($response, 0, $hlen);
		$entity = (string)substr($response, $hlen + 4);
	}

	$a = array();
	if ( preg_match('/PHPSESSID=([0-9a-z]+);/i', $header, $matches))
	{
		$a['sessid'] = $matches[1];
	}
	// Capture the whole target up to the next whitespace: the old character
	// class had no "/", ":" or "-" and so truncated or missed most redirects.
	if ( preg_match('/^Location:\s*(\S+)/im', $header, $matches))
	{
		$a['location'] = $matches[1];
	}
	$a['content'] = $entity;
	return $a;
}
?>
