<?php
/**
 * 采集模块
 * @copyright reginx.com
 * $Id$ 
 */
/**
 * Static helpers for the content-gathering ("ga") module: fetching remote
 * list/detail/player pages, extracting fields with user-supplied rule
 * templates, storing remote images and queueing follow-up gather tasks.
 *
 * The class relies on project helpers that are defined elsewhere
 * (OBJ(), core, filter, curl_lib, cat_lib, vod_lib, py_lib, image_lib) and on
 * the CACHE_PATH / UPLOAD_PATH / UPLOAD_URL / REQUEST_TIME constants.
 */
class ga_lib{

    /**
     * Build the category import mapping from a multi-line "key:name" text.
     *
     * Every non-empty line is split on ':'. The second part is looked up by
     * name in cat_tab; when no row exists a new category is created through
     * cat_lib::addnew(). Lines without a name part are skipped.
     *
     * @param string $str    mapping text, one "key:name" pair per line
     * @param int    $parent parent id handed to cat_lib::addnew() for new categories
     * @return array list of array(key, category id)
     */
    public static function cmap($str , $parent = 0){
        $ret = array();
        if(empty($str)){
            return $ret;
        }
        foreach (explode("\n" , $str) as $line){
            if(empty($line)){
                continue;
            }
            $pair = explode(':' , trim($line));
            // A blank ("\r"-only) or colon-less line has no category name to map.
            if(!isset($pair[1]) || $pair[1] === ''){
                continue;
            }
            $name = filter::text($pair[1]);
            // NOTE(review): the name is interpolated into SQL; this assumes
            // filter::text() makes it safe for that context - confirm.
            $cat  = OBJ('cat_tab')->where("name = '{$name}'")->get();
            if(empty($cat)){
                $cat = array('id' => cat_lib::addnew($name , 2 , $parent));
            }
            $ret[] = array($pair[0] , $cat['id']);
        }
        return $ret;
    }

    /**
     * Ask the remote keyword service for keywords related to a title/content.
     *
     * @param string $title
     * @param string $content
     * @return array list of keyword strings; empty when the request fails or nothing matched
     */
    public static function getkeywords($title , $content = ''){
        $ret = array();
        $url = 'http://keyword.discuz.com/related_kw.html?title=' . urlencode($title) .'&content=' . urlencode($content) . '&ics=utf-8&ocs=utf-8';
        $res = curl_lib::gethtml($url);
        if($res['code'] == '200' && $res['html'] != ''){
            // Each keyword is delivered as <kw><![CDATA[...]]></kw>.
            preg_match_all('/<kw><\!\\[CDATA\[(.+?)\]\]><\/kw>/i' , $res['html'] , $ret);
        }
        return !empty($ret[1]) ? $ret[1] : array();
    }

    /**
     * Decode a string produced by JavaScript's escape().
     *
     * "%uXXXX" sequences are converted to UTF-8 (a high/low surrogate pair is
     * combined into one code point), "%XX" sequences are passed through
     * urldecode(), and everything else - including a malformed or trailing
     * '%' - is copied unchanged.
     *
     * @param string $str
     * @return string
     */
    public static function unescape($str){
        $ret = '';
        $str = (string)$str;
        $len = strlen($str);
        for ($i = 0; $i < $len; $i++){
            $m = array();
            if ($str[$i] !== '%'){
                $ret .= $str[$i];
            }else if (preg_match('/^%u([0-9a-f]{4})/i' , substr($str , $i , 6) , $m)){
                $val = hexdec($m[1]);
                // Skip "%uXXXX" (the loop's own $i++ consumes the sixth byte).
                $i += 5;
                // A high surrogate directly followed by a low surrogate encodes
                // one code point above U+FFFF.
                if ($val >= 0xd800 && $val <= 0xdbff
                    && preg_match('/^%u(d[c-f][0-9a-f]{2})/i' , substr($str , $i + 1 , 6) , $m)){
                    $val = 0x10000 + (($val - 0xd800) << 10) + (hexdec($m[1]) - 0xdc00);
                    $i += 6;
                }
                $ret .= self::utf8chr($val);
            }else if (preg_match('/^%[0-9a-f]{2}/i' , substr($str , $i , 3))){
                $ret .= urldecode(substr($str , $i , 3));
                $i += 2;
            }else{
                $ret .= '%';
            }
        }
        return $ret;
    }

    /**
     * Encode one Unicode code point as a UTF-8 byte sequence.
     *
     * @param int $val code point
     * @return string
     */
    private static function utf8chr($val){
        if ($val < 0x80){
            return chr($val);
        }
        if ($val < 0x800){
            return chr(0xc0|($val>>6)).chr(0x80|($val&0x3f));
        }
        if ($val < 0x10000){
            return chr(0xe0|($val>>12)).chr(0x80|(($val>>6)&0x3f)).chr(0x80|($val&0x3f));
        }
        return chr(0xf0|($val>>18)).chr(0x80|(($val>>12)&0x3f)).chr(0x80|(($val>>6)&0x3f)).chr(0x80|($val&0x3f));
    }

    /**
     * Turn a link found on a gathered page into an absolute URL.
     *
     * A value that already contains an http:// or https:// scheme is returned
     * unchanged; otherwise it is appended to $host, dropping one leading '/'
     * (the callers' join logic implies $host ends with '/').
     *
     * @param string $url
     * @param string $host
     * @return string
     */
    private static function absurl($url , $host){
        if(preg_match('/https?:\/\//i' , $url)){
            return $url;
        }
        return $host . (substr($url , 0 , 1) == '/' ? substr($url , 1) : $url);
    }

    /**
     * Split a "d0;regex;d2;d3" player rule into its four parts.
     *
     * @param string $spec
     * @return array|null the parts, or null when a part is missing or one of
     *                    the three delimiters (parts 0, 2, 3) is empty
     */
    private static function urlrule($spec){
        $rule = explode(';' , (string)$spec);
        if(count($rule) < 4 || $rule[0] === '' || $rule[2] === '' || $rule[3] === ''){
            return null;
        }
        return $rule;
    }

    /**
     * Remove a single trailing '|' from a resource address.
     *
     * @param string $addr
     * @return string
     */
    private static function chopbar($addr){
        if(substr($addr , -1) == '|'){
            $addr = substr($addr , 0 , strlen($addr) - 1);
        }
        return $addr;
    }

    /**
     * Largest integer value found in a list of captured strings (0 when empty).
     *
     * @param array $nums
     * @return int
     */
    private static function maxint($nums){
        $max = 0;
        foreach ((array)$nums as $n){
            $max = max($max , intval($n));
        }
        return $max;
    }

    /**
     * Labels of the supported "play address" extraction rule types, keyed by
     * type id. fetchurl() treats types 3 and 4 specially and handles every
     * other type with the default single-regex extraction.
     *
     * @var array
     */
    public static $urlctype = array(
        1   => '播放页含有一个资源地址',
        2   => '播放页含有所有资源地址',
        3   => '播放页含有所有资源地址且以js变量的形式体现',
        4   => '播放页含有所有资源地址且以js变量的 unescape 形式体现',
        5   => '详细页含有所有的资源地址(资源站)',
        6   => '播放页含有所有资源地址的js文件',
    );

    /**
     * Fetch a player page and extract the play resource addresses from it.
     *
     * $row['sdesc'] holds "url###name". For rule types 3 and 4,
     * $ga['conf']['puregx'] is "d0;regex;d2;d3": the regex captures a JS
     * variable, which is url-decoded (type 3) or unescape()d (type 4) and then
     * split with the three delimiters. For any other type puregx is a single
     * rule template whose first capture group is the address.
     *
     * @param array  $row    task row; only 'sdesc' is read
     * @param array  $vod    video row; only 'name' is read
     * @param string $vsname passed through to vod_lib::geturl()
     * @param array  $ga     gather rule ('id', 'host' and 'conf' are read)
     * @return array array('code' => 0 on HTTP 200 / 1 otherwise, 'msg' => string, 'list' => results of vod_lib::geturl())
     */
    public static function fetchurl($row , $vod , $vsname , $ga = array() ){
        $ret = array('code' => 1 , 'msg' => '拉取数据失败了' , 'list' => array());
        $hparam = array(
            'gcookie'   => CACHE_PATH . 'ga-' . $ga['id'] . '.cookie',
            'gref'      => $ga['host']
        );
        $parts = explode('###' , $row['sdesc']);
        $url   = $parts[0];
        // The episode name is optional in sdesc.
        $sname = isset($parts[1]) ? $parts[1] : '';
        $res = curl_lib::gethtml($url , $hparam);
        if($res['code'] != '200'){
            return $ret;
        }
        $ret['code'] = 0;
        $ret['msg']  = '';
        $utype = $ga['conf']['utype'];
        if($utype == '3' || $utype == '4'){
            $rule = self::urlrule($ga['conf']['puregx']);
            $raw  = array();
            // An incomplete rule yields an empty list instead of exploding on
            // an empty delimiter.
            if($rule !== null
                && preg_match('/' . self::fregx($rule[1]) . '/is' , $res['html'] , $raw)
                && !empty($raw[1])){
                $body = $utype == '3' ? urldecode($raw[1]) : self::unescape($raw[1]);
                foreach (explode($rule[0] , $body) as $group){
                    if($utype == '3'){
                        // Every d2-separated item is "name d3 address".
                        foreach (explode($rule[2] , $group) as $item){
                            $item = explode($rule[3] , $item);
                            if(empty($item[1])){
                                continue;
                            }
                            $ret['list'][] = vod_lib::geturl(self::chopbar($item[1]) , $item[0] , $vod['name'] , $vsname);
                        }
                    }else{
                        // Only the last d2-separated segment is used; it is a
                        // flat d3-separated list: name, address, name, address ...
                        $group = explode($rule[2] , $group);
                        $group = array_pop($group);
                        if(empty($group)){
                            continue;
                        }
                        $cells = explode($rule[3] , $group);
                        $len   = count($cells);
                        for ($i = 0 ; $i < $len ; $i += 2){
                            if(empty($cells[$i+1])){
                                continue;
                            }
                            $label = $cells[$i];
                            $hash  = strpos($label , '#');
                            if($hash !== false){
                                // Keep only the text after the first '#'.
                                $label = substr($label , $hash + 1);
                            }
                            $ret['list'][] = vod_lib::geturl(self::chopbar($cells[$i+1]) , $label , $vod['name'] , $vsname);
                        }
                    }
                }
            }
        }else{
            // Default handling: every first-group match is one address.
            $mts = array();
            preg_match_all('/' . self::fregx($ga['conf']['puregx']) . '/is' , $res['html'] , $mts);
            $res = null; unset($res);
            if(!empty($mts[1])){
                foreach ($mts[1] as $v){
                    $ret['list'][] = vod_lib::geturl($v , $sname , $vod['name'] , $vsname);
                }
            }
        }
        return $ret;
    }

    /**
     * Load one gather rule and decode its stored configuration.
     *
     * @param int $id
     * @return mixed the ga_tab row with 'conf' unserialized, or the empty lookup result
     */
    public static function get($id=0){
        // The id is interpolated into SQL, so force it to an integer.
        $id  = intval($id);
        $ret = core::getobj('ga_tab')->where("id = '$id' ")->get();
        if(!empty($ret)){
            // NOTE(review): unserialize() must only ever see data this
            // application wrote itself; never feed it external input.
            $ret['conf'] = unserialize(base64_decode($ret['conf']));
        }
        return $ret;
    }


    /**
     * Download a remote image into today's upload directory and thumbnail it.
     *
     * The file is always stored with a ".jpg" suffix under
     * UPLOAD_PATH . 'Y-m-d/'. Zero width/height fall back to the
     * thumb_width / thumb_height application settings.
     *
     * @param string $url   remote image URL
     * @param int    $tw    thumbnail width, 0 = application default
     * @param int    $th    thumbnail height, 0 = application default
     * @param array  $param extra options handed to curl_lib::getimage()
     * @return string path relative to UPLOAD_PATH, or '' when the download or write failed
     */
    public static function srimg($url , $tw = 0 , $th = 0 , $param = array()){
        $sfile  = date('Y-m-d' , REQUEST_TIME) . '/';
        if(!is_dir(UPLOAD_PATH . $sfile)){
            core::makedir(UPLOAD_PATH . $sfile);
        }
        $sfile .= 'http_' . REQUEST_TIME . mt_rand(100,999) . '.jpg';
        $image  = curl_lib::getimage($url , $param);
        if(empty($image) || file_put_contents(UPLOAD_PATH . $sfile , $image  , LOCK_EX) === false){
            return '';
        }
        $image = null; unset($image);
        $tw = $tw ? $tw : $GLOBALS['_APP']['thumb_width'];
        $th = $th ? $th : $GLOBALS['_APP']['thumb_height'];
        image_lib::thumb(UPLOAD_PATH . $sfile , 'auto' , $tw , $th);
        return $sfile;
    }

    /**
     * Fetch a category by name, creating it when it does not exist yet.
     *
     * An existing row is only reused when its ctype equals $ctype; otherwise a
     * new row is saved so the category-to-URL mapping stays unique per type.
     *
     * @param string $name
     * @param int    $parent parent category id, 0 for a top-level category
     * @param int    $ctype
     * @return mixed the category row; when saving fails, the result of save() is returned instead
     */
    public static function getcat($name , $parent = 0 , $ctype = 3){
        $name = filter::text($name);
        $obj  = OBJ('cat_tab');
        $row  = $obj->where("name = '{$name}'")->get();
        if(!empty($row) && $ctype == $row['ctype']){
            return $row;
        }
        $sname = py_lib::get($name , '');
        $path = '#0#';
        if($parent > 0){
            // The id is interpolated into SQL, so force it to an integer.
            $pid  = intval($parent);
            $prow = $obj->fields('path')->where("id = '$pid' ")->get();
            if($prow){
                $path = $prow['path'] . $parent .'#';
                $prow = null;
            }
        }
        $obj->set("id" , 0)->set("name" , $name)->set("sname" , $sname);
        $obj->set("ctype" , $ctype)->set("parent" , $parent)->set("path" , $path)->set("sort" , 0)->set("key" , $name);
        $obj->set("desc" , $name)->set("tpl" , 0)->set("extra" , 0)->set("cover" , 'images/null.gif');
        $ret = $obj->save();
        if($ret && (int)$ret['code'] === 0){
            cat_lib::create($sname);
            // On success save() reports the new row id in 'msg'.
            $catid = intval($ret['msg']);
            $ret = OBJ('cat_tab')->where("id = $catid")->get();
        }
        return $ret;
    }


    /**
     * Fetch a detail page and extract every configured field from it.
     *
     * $ga['conf']['list'] holds three parallel arrays (name, regx, target).
     * For each rule the first capture group is stored in
     * $ret['data'][target]. Two targets are special:
     *  - 'player_url': every match becomes array(link or match index, cleaned text);
     *  - 'cover': the value is made absolute and, when $ga['srimg'] is set,
     *    downloaded through srimg().
     * For every other target, <img> sources inside the value are downloaded
     * and rewritten to the local copy when $ga['srimg'] is set.
     * The attempt is recorded in galog_tab.
     *
     * @param string $url
     * @param array  $ga  gather rule ('id', 'host', 'srimg' and 'conf' are read)
     * @return array 'code' (0 ok, 1 fetch failed), 'msg' and, on success, 'data'
     */
    public static function fetch($url , $ga){
        $ret = $mts = array();
        $hash = md5($url);
        $obj  = OBJ('galog_tab');
        // NOTE(review): the trailing "|| 1" makes this condition always true,
        // so already-gathered URLs are fetched again and the "skipped" branch
        // below is unreachable. Kept as-is because callers may rely on
        // re-fetching - confirm whether the duplicate check should be enabled.
        if($obj->where("hash = '$hash' and status = 1")->count() < 1 || 1){
            // Zip the three parallel config arrays into (name, regx, target) rows.
            $list = array_map(null , $ga['conf']['list']['name'] , $ga['conf']['list']['regx'] , $ga['conf']['list']['target']);
            $hparam = array(
                'gcookie'   => CACHE_PATH . 'ga-' . $ga['id'] . '.cookie',
                'gref'      => $ga['host']
            );
            $res = curl_lib::gethtml($url , $hparam);
            // Image downloads below use the fetched page as referer.
            $hparam['gref'] = $url;
            if($res['code'] == '200' ){
                foreach ($list as $v){
                    $isplayer = ($v[2] == 'player_url');
                    if($isplayer){
                        preg_match_all('/' . self::fregx($v[1]) . '/is' , $res['html'] , $mts);
                    }else{
                        preg_match('/' . self::fregx($v[1]) . '/is' , $res['html'] , $mts);
                    }
                    if(empty($mts[1])){
                        continue;
                    }
                    if($isplayer){
                        foreach ($mts[0] as $sk => $sv){
                            $tmp = array();
                            preg_match('/href=(?:\'|\")?(.+?)(?:\'|\")/is' , $sv , $tmp);
                            if(!empty($tmp[1])){
                                $tmp[1] = self::absurl($tmp[1] , $ga['host']);
                            }else{
                                // Resource sites carry no link: use the match index as episode id.
                                $tmp[1] = $sk;
                            }
                            $ret['data']['player_url'][] = array($tmp[1] , str_replace(' ' ,'' , filter::text($mts[1][$sk])));
                        }
                        continue;
                    }
                    // Drop inner links and layout tags.
                    $mts[1] = self::losetags($mts[1]);
                    if($v[2] == 'cover' && !empty($mts[1])){
                        $mts[1] = self::absurl($mts[1] , $ga['host']);
                        // Store the remote image locally when enabled.
                        if($ga['srimg']){
                            $mts[1] = self::srimg($mts[1] , $GLOBALS['_APP']['thumb_width'] , $GLOBALS['_APP']['thumb_height'] , $hparam);
                        }
                        // The first cover becomes the default image.
                        if(!isset($ret['data']['rimage'])){
                            $ret['data']['rimage'] = $mts[1];
                        }
                    }else if(!empty($ga['srimg'])){
                        // Localise images embedded in ordinary content.
                        $img = array();
                        preg_match_all('/<img[^>]*?\ssrc\s*=\s*["\']?([^"\'\s>]+)/i' , $mts[1] , $img);
                        if(!empty($img[1])){
                            foreach (array_unique($img[1]) as $sv){
                                $tmpurl = self::srimg(self::absurl($sv , $ga['host']) , $GLOBALS['_APP']['thumb_width'] , $GLOBALS['_APP']['thumb_height'] , $hparam);
                                // Keep the remote address when the download failed.
                                if($tmpurl === ''){
                                    continue;
                                }
                                if(!isset($ret['data']['rimage'])){
                                    $ret['data']['rimage'] = $tmpurl;
                                }
                                // Point the content at the local copy.
                                $mts[1] = str_replace($sv , UPLOAD_URL . $tmpurl , $mts[1]);
                            }
                        }
                    }
                    $ret['data'][$v[2]] = $mts[1];
                }
                $obj->set("status" , 1);
                $ret['code'] = 0;
                $ret['msg']  = '处理成功! ';
            }else{
                $obj->set("status" , 0);
                $ret['code'] = 1;
                $ret['msg']  = '拉取超时! ';
            }
            $obj->set("gaid" , $ga['id'])->set("url" , $url);
            $obj->set("adate" , REQUEST_TIME)->set("hash" , $hash)->replace();
        }else{
            $ret['code'] = '2';
            $ret['msg']  = '重复跳过! ';
        }
        return $ret;
    }

    /**
     * Fetch a list page and collect its content links and paging figures.
     *
     * @param string $url list page URL; rewritten in place to its absolute form
     * @param array  $ga  gather rule ('host' and conf keys pregx, tregx, iregx, uregx are read)
     * @return array empty when the fetch failed; otherwise 'maxpn' (highest page number
     *               captured by pregx), 'total' (highest value captured by tregx) and,
     *               when links were found, 'list' of array(anchor html, absolute url)
     */
    public static function getpages(&$url , $ga){
        $url = self::absurl($url , $ga['host']);
        $ret = $mas = $pmas = $tmas = array();
        $res = curl_lib::gethtml($url);
        if($res['code'] != '200'){
            return $ret;
        }
        // Last page number.
        preg_match_all('/' . self::fregx($ga['conf']['pregx']) . '/is' , $res['html'] , $pmas);
        $ret['maxpn'] = self::maxint(isset($pmas[1]) ? $pmas[1] : array());
        // Total item count.
        preg_match_all('/' . self::fregx($ga['conf']['tregx']) . '/is' , $res['html'] , $tmas);
        $ret['total'] = self::maxint(isset($tmas[1]) ? $tmas[1] : array());
        // Content link snippets.
        preg_match_all('/' . self::fregx($ga['conf']['iregx']) . '/is' , $res['html'] , $mas);
        foreach ($mas[0] as $v){
            $tmp = array();
            // The whole uregx match is taken as the link address.
            preg_match('/' . self::fregx($ga['conf']['uregx']) . '/is' , $v , $tmp);
            if(empty($tmp[0])){
                continue;
            }
            $utmp = self::absurl($tmp[0] , $ga['host']);
            $v = str_replace('href' , 'title="点击查看目标内容" target = "_blank" href' , str_replace($tmp[0] , $utmp , $v));
            // Trim the snippet down to the <a ...>...</a> element.
            $start = strpos($v , '<a');
            if($start !== false){
                $v = substr($v , $start);
            }
            $end = strpos($v , '</a>');
            if($end !== false){
                $v = substr($v , 0 , $end + 4);
            }
            $ret['list'][] = array($v , $utmp);
        }
        return $ret;
    }

    /**
     * Convert a rule template into a PCRE fragment (without delimiters).
     *
     * The template is escaped with preg_quote() and these placeholders are
     * then expanded: #数字# (digits), #内容# (non-empty text), #时间# (date,
     * optionally with time), their non-capturing "#@...#" forms, and #*内容#
     * (possibly empty text).
     *
     * preg_quote() escapes '*' and, since PHP 7.3, '#', so the placeholders
     * are searched for in their quoted form; that matches on every version.
     *
     * @param string $regx rule template
     * @return string
     */
    public static function fregx($regx){
        $date = '\d{4}(?:-|\/|年)\d{1,2}(?:-|\/|月)\d{1,2}(?:日)?(?: \d{1,2}:\d{1,2}(?:\:\d{1,2})?)?';
        $map = array(
            '#数字#'  => '(\d+?)',
            '#内容#'  => '(.+?)',
            '#时间#'  => '(' . $date . ')',
            '#@数字#' => '(?:\d+?)',
            '#@内容#' => '(?:.+?)',
            '#@时间#' => '(?:' . $date . ')',
            '#*内容#' => '(.*?)',
        );
        $search = array();
        foreach (array_keys($map) as $token){
            $search[] = preg_quote($token , '/');
        }
        return str_replace($search , array_values($map) , preg_quote($regx , '/'));
    }


    /**
     * Strip style/script/iframe blocks and div, ul, li, embed, span and a tags.
     *
     * Only the tags themselves are removed for the second group; their inner
     * text is kept. The word boundary keeps longer tag names that merely start
     * with one of these (abbr, article, link ...) intact.
     *
     * @param string $str html, possibly entity-encoded
     * @return string
     */
    public static function losetags($str){
        if(empty($str) && $str != '0'){
            return '';
        }
        $str = htmlspecialchars_decode($str);
        $str = preg_replace('/\<style(.*?)>.*?\<\/style\>/is', '', $str);
        $str = preg_replace('/\<script(.*?)>.*?\<\/script\>/is', '', $str);
        $str = preg_replace('/\<iframe(.*?)>.*?\<\/iframe\>/is', '', $str);
        $str = preg_replace('/\<\/?\s*(div|ul|li|embed|span|a)\b.*?\>/is' , '' , $str);
        return trim($str);
    }

    /**
     * Queue gather tasks in gatask_tab.
     *
     * When $desc is an array one task is stored per element with
     * extra = key + 1; for ctype 0 an array element is stored as its parts
     * joined with '###'. A scalar $desc is stored as a single task using
     * $extra.
     *
     * @param mixed        $gaid  gather rule id
     * @param mixed        $vid   video id
     * @param int          $ctype task type
     * @param array|string $desc  task description(s)
     * @param int          $extra extra value for the single-task form
     */
    public static function addtask($gaid , $vid , $ctype ,  $desc , $extra = 0){
        if(empty($desc)){
            return;
        }
        $ctype = intval($ctype);
        $obj = OBJ('gatask_tab');
        if(is_array($desc)){
            foreach ($desc as $k => $v){
                $obj->set("gaid" , $gaid)->set("ctype" , $ctype);
                if($ctype === 0 && is_array($v)){
                    $obj->set("sdesc" , join('###' , $v))->set("extra" , $k + 1);
                }else{
                    $obj->set("sdesc" ,$v)->set("extra" , $k + 1);
                }
                $obj->set("vid" , $vid)->replace();
            }
        }else{
            // $desc is a scalar here, so it is stored as-is.
            $obj->set("gaid" , $gaid)->set("ctype" , $ctype)->set("vid" , $vid);
            $obj->set("sdesc" , $desc);
            $obj->set("extra" , $extra)->replace();
        }
    }
}
?>