<?php
/**
 * Re-fetch chapters whose content is empty.
 * Only chapters from today and yesterday are processed.
 */
// Single-instance guard: count the `ps aux` lines that mention analyzeEmpty
// (lines containing "grep" or "sh" are filtered out) and stop when more than
// one is found — presumably the current process accounts for the first one.
$process = (int) shell_exec('ps aux | grep analyzeEmpty | grep -v grep | grep -v sh | wc -l');
if ($process > 1)
{
    exit;
}

require_once 'init.php';

// Optional first CLI argument; falls back to 0 when the script is started
// without one (previously this read an undefined index).
// NOTE(review): $id is not read anywhere in the visible code — confirm before removing.
$id = isset($argv[1]) ? intval($argv[1]) : 0;

// Restrict the lookup to collect-log entries dated today or yesterday.
$in = S::sqlEscape(date('Y-m-d')) . ',' . S::sqlEscape(date('Y-m-d', strtotime('-1 day')));

// Select chapters whose stored size is 0, 57 or 58 — presumably the sizes an
// empty chapter ends up with (TODO confirm the meaning of 57/58).
$sql = 'SELECT c.chapterid,c.bookid,c.chaptername,cl.spider FROM '.Chapter::table().' AS c LEFT JOIN '.CollectLogs::table().' AS cl ON c.chapterid = cl.cid WHERE cl.date IN ('.$in.') AND (c.size = 0 OR c.size = 57 OR c.size = 58)';
$result = $db->setQueryString($sql)->find();

// Nothing to repair.
if (empty($result))
{
    exit();
}

// Regroup the rows as bookid => spider => list of chapters, so that all
// empty chapters of one book are handled together.
$array = array();
foreach ($result as $row)
{
    $array[$row['bookid']][$row['spider']][] = array(
        'chapterId'   => $row['chapterid'],
        'chapterName' => $row['chaptername'],
    );
}

// Stop when the grouping produced nothing.
if (!$array)
{
    exit();
}

/**
 * A book may have many empty chapters while one of the sites simply cannot
 * find that book through search. Such (book, site) pairs are remembered here
 * so the same fruitless lookup is not repeated for every chapter.
 */
$denyList = array();

$spiderList = Spider::getMonitorList();
foreach ($array as $bookId => $bySpider)
{
    $book = getBookInfo($bookId);
    foreach ($bySpider as $oldSpider => $chapters)
    {
        // For every empty chapter, walk the monitored sites until one fills it.
        foreach ($chapters as $chapter)
        {
            foreach ($spiderList as $spider)
            {
                // Skip the site recorded in the collect log for this chapter.
                // Loose comparison on purpose: $oldSpider is an array key.
                if ($oldSpider == $spider)
                {
                    continue;
                }

                echo "$bookId...{$chapter['chapterName']}...尝试站点$spider...";
                if (!empty($denyList[$bookId]) && in_array($spider, $denyList[$bookId]))
                {
                    echo "在排除列表，直接跳过\n";
                    continue;
                }

                // TODO: book search — check first whether a local record
                // exists, otherwise performance is very poor.
                $known = emptyGetSiteUrlInfo($bookId, $spider);
                if (empty($known))
                {
                    // No stored URLs for this book on this site: search for it.
                    $listUrl = false;
                    $detailUrl = Spider::search($spider, $book['bookname'], $book['author']);
                }
                else
                {
                    $detailUrl = $known['detailurl'];
                    $listUrl = $known['listurl'];
                }

                if (false === $detailUrl)
                {
                    // The book cannot be located on this site; do not try again.
                    $denyList[$bookId][] = $spider;
                    echo "无法获取到列表地址，跳过...\n";
                    continue;
                }

                $spiderObj = Spider::factory($detailUrl, $spider);
                if (empty($listUrl))
                {
                    $spiderObj->getInfoFile();
                    $listUrl = $spiderObj->getChapterListUrl();

                    // Remember the URLs for this book/site pair.
                    createCollectList($bookId, $spider, $listUrl, $detailUrl);
                }
                $spiderObj->setChapterListUrl($listUrl);

                // Compare chapter titles to find the URL matching the empty chapter.
                if (!emptyUpdate($bookId, $chapter, $spiderObj))
                {
                    echo "失败\n";
                    continue;
                }

                // Chapter filled: stop trying further sites for it.
                echo "成功\n";
                break;
            }
        }
    }
}

/**
 * Try to fill one empty chapter from the chapter list of the given spider.
 *
 * The spider's chapter list is searched for an entry whose title matches the
 * chapter name. When the list reports volumes, the volume-merged list is
 * compared first (Updater::compare1); if that does not yield a successful
 * update, the list with volumes cleared is compared (Updater::compare2).
 * Both lists are walked in reverse order and the search stops at the first
 * entry that is successfully written, so a title occurring several times
 * neither triggers repeated downloads nor lets a later failed attempt mask an
 * earlier success.
 *
 * @param mixed  $bookId    Book id, passed through to emptyUp().
 * @param array  $chapter   Array with the keys 'chapterId' and 'chapterName'.
 * @param object $spiderObj Spider instance with its chapter list URL already set.
 *
 * @return bool True when the chapter content was written, false otherwise.
 */
function emptyUpdate($bookId, $chapter, $spiderObj)
{
    $spiderObj->getChapterList();
    $chapterList = $spiderObj->getChapterArray();
    $updater = new Updater($spiderObj);

    // First pass: only when the site's list is organised in volumes.
    if (!empty($chapterList['hasVolume']))
    {
        $merged = $updater->mergeVolume($chapterList);
        // Reverse the list to cope with identical titles at the head and the
        // tail, which would otherwise cause duplicate collection.
        $candidates = (isset($merged['chapters']) && is_array($merged['chapters']))
            ? array_reverse($merged['chapters'])
            : array();
        foreach ($candidates as $item)
        {
            if ($updater->compare1($chapter['chapterName'], $item['title'])
                && emptyUp($bookId, $chapter['chapterId'], $spiderObj, $item['url']))
            {
                return true;
            }
        }
    }

    // Second pass: same search on the list with volumes cleared.
    $chapterList = $spiderObj->clearVolume($chapterList);
    $candidates = (isset($chapterList['chapters']) && is_array($chapterList['chapters']))
        ? array_reverse($chapterList['chapters'])
        : array();
    foreach ($candidates as $item)
    {
        if ($updater->compare2($chapter['chapterName'], $item['title'])
            && emptyUp($bookId, $chapter['chapterId'], $spiderObj, $item['url']))
        {
            return true;
        }
    }

    return false;
}

/**
 * Download one chapter from the spider and store its content.
 *
 * The download is attempted up to three times while the result is empty.
 * Nothing is written, and false is returned, when every attempt came back
 * empty, when the spider reports the content as an image chapter, or when the
 * content contains the source site's "server under pressure" notice.
 *
 * @param mixed  $bookId    Book id, passed to Chapter::single()->update().
 * @param mixed  $chapterId Chapter id, passed to Chapter::single()->update().
 * @param object $spiderObj Spider instance providing getChapterContentUrl(),
 *                          getChapterContent(), filterChapterContent() and isImage().
 * @param string $url       Chapter URL as found in the chapter list.
 *
 * @return bool True when content was written, false otherwise.
 */
function emptyUp($bookId, $chapterId, $spiderObj, $url)
{
    $url = $spiderObj->getChapterContentUrl($url);

    $attempts = 0;
    do
    {
        $chapterContent = $spiderObj->getChapterContent($url);
        $attempts++;
    } while (empty($chapterContent) && $attempts < 3);

    // Every attempt failed: do not overwrite the chapter with empty content
    // and do not report success, so the caller can try another site.
    if (empty($chapterContent))
    {
        return false;
    }

    $content = $spiderObj->filterChapterContent($chapterContent);

    // Image chapters are skipped for now; images are not collected yet.
    if ($spiderObj->isImage($content))
    {
        return false;
    }
    // The source site answered with its overload notice instead of the text.
    if (false !== strpos($content, '由于服务器压力过'))
    {
        return false;
    }

    $data = array('content' => $content);
    Chapter::single()->update($chapterId, $bookId, $data);

    return true;
}

/**
 * Look up the stored list/detail URLs of a book on one site.
 *
 * Both values are neutralised before they enter the SQL text: the book id is
 * forced to an integer and the spider name goes through S::sqlEscape(), the
 * same helper this script uses for the date literals of its main query
 * (where its result is used as a complete, quoted SQL literal).
 *
 * @param int|string $bookId Book id.
 * @param string     $spider Spider (site) identifier.
 *
 * @return mixed Result of find(true) on the global $db object — presumably a
 *               single row with the keys 'listurl' and 'detailurl', or an
 *               empty value when no record exists (TODO confirm).
 */
function emptyGetSiteUrlInfo($bookId, $spider)
{
    $sql = 'SELECT listurl,detailurl FROM ' . CollectList::table()
         . ' WHERE bookid = ' . intval($bookId)
         . ' AND spider = ' . S::sqlEscape($spider);

    return $GLOBALS['db']->setQueryString($sql)->find(true);
}