﻿/*
 * 
 * 标题：采集类
 * 作者：DODO
 * 版权：www.taowaw.com
 * 日期：
 *      2010-02-12 PM 21:16
 * 
 */ 
using System;
using System.Data;
using System.Configuration;
using System.Web;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;
using System.IO;

namespace taowaw.Common
{
    /// <summary>
    /// Article collection (scraping) helpers: downloads a page, cuts fragments out of
    /// its HTML and extracts, rewrites or strips links and images.
    /// </summary>
    /// <remarks>
    /// The HTML handling is heuristic (string searches and regular expressions), not a
    /// real HTML parser. All marker and attribute searches use ordinal comparison.
    /// </remarks>
    public class Collect
    {
        // Locates the start of an href attribute value, tolerating any casing and
        // whitespace around '='.
        private static readonly Regex HrefAttributeRegex =
            new Regex(@"href\s*=\s*", RegexOptions.IgnoreCase);

        // Pattern used by GetHref; capture group 6 holds the bare URL characters.
        private static readonly Regex HrefValueRegex =
            new Regex(@"(h|H)(r|R)(e|E)(f|F) *= *('|"")?((\w|\\|\/|\.|:|-|_)+)('|""| *|>)?");

        // From the first '>' to the nearest following "</a>" (case-sensitive on purpose:
        // the caller passes lower-cased input).
        private static readonly Regex AnchorTextRegex = new Regex("(>).+?(</a>)");

        // A complete anchor element; group 1 is its (possibly empty) inner content.
        private static readonly Regex AnchorElementRegex =
            new Regex(@"<a\b[^>]*>(.*?)</a\s*>", RegexOptions.IgnoreCase | RegexOptions.Singleline);

        // An <img> tag, self-closed or not.
        private static readonly Regex ImageTagRegex =
            new Regex(@"<img\b[^>]*>", RegexOptions.IgnoreCase);

        private static readonly char[] WhitespaceChars = new char[] { ' ', '\t', '\r', '\n' };

        /// <summary>
        /// Downloads a page and decodes it to a string.
        /// </summary>
        /// <param name="GHttpUrl">Address of the page; surrounding whitespace is trimmed.</param>
        /// <param name="CharSet">
        /// true: decode with <see cref="Encoding.Default"/> (the system default encoding,
        /// which is only GB2312/GBK on a machine configured that way);
        /// false: decode as UTF-8.
        /// </param>
        /// <returns>The decoded page source.</returns>
        public string GetHttpPage(string GHttpUrl, bool CharSet)
        {
            Encoding pageEncoding = CharSet ? Encoding.Default : Encoding.UTF8;

            // WebClient is IDisposable; release it as soon as the download is done.
            using (WebClient client = new WebClient())
            {
                byte[] pageData = client.DownloadData(GHttpUrl.Trim());
                return pageEncoding.GetString(pageData);
            }
        }

        /// <summary>
        /// Returns the text between the first occurrence of <paramref name="StartStr"/>
        /// and the first occurrence of <paramref name="OverStr"/> that follows it.
        /// </summary>
        /// <param name="Constr">Text to search.</param>
        /// <param name="StartStr">Start marker.</param>
        /// <param name="OverStr">End marker; it is searched for after the start marker.</param>
        /// <param name="IncluL">true: the result includes the start marker.</param>
        /// <param name="IncluR">true: the result includes the end marker.</param>
        /// <returns>
        /// The extracted text, or an empty string when any argument is null or either
        /// marker cannot be found.
        /// </returns>
        public string GetBody(string Constr, string StartStr, string OverStr, bool IncluL, bool IncluR)
        {
            if (Constr == null || StartStr == null || OverStr == null)
            {
                return string.Empty;
            }

            int markerStart = Constr.IndexOf(StartStr, StringComparison.Ordinal);
            if (markerStart < 0)
            {
                // No start marker: there is nothing to extract (never guess an offset).
                return string.Empty;
            }

            int contentStart = markerStart + StartStr.Length;
            int markerEnd = Constr.IndexOf(OverStr, contentStart, StringComparison.Ordinal);
            if (markerEnd < 0)
            {
                return string.Empty;
            }

            int from = IncluL ? markerStart : contentStart;
            int to = IncluR ? markerEnd + OverStr.Length : markerEnd;
            return Constr.Substring(from, to - from);
        }

        #region URL and title list for matched content
        /// <summary>
        /// Finds every fragment delimited by <paramref name="start"/> and
        /// <paramref name="end"/> and returns the link and title of each one.
        /// </summary>
        /// <param name="HtmlCode">HTML source.</param>
        /// <param name="start">Start of a fragment; inserted into a regular expression as-is.</param>
        /// <param name="end">End of a fragment; inserted into a regular expression as-is.</param>
        /// <param name="AppendUrl">Site URL prepended to relative links.</param>
        /// <returns>
        /// For every fragment: link + "$Array1$" + title + "$Array2$", concatenated.
        /// The title is taken from the lower-cased fragment, so it is lower case.
        /// </returns>
        public string GetHrefAndTitle(string HtmlCode, string start, string end, string AppendUrl)
        {
            StringBuilder result = new StringBuilder();
            foreach (string fragment in GetHrefArray(HtmlCode, start, end))
            {
                if (string.IsNullOrEmpty(fragment))
                {
                    continue;
                }

                result.Append(GetHrefOnsingle(fragment, AppendUrl));
                result.Append("$Array1$");
                result.Append(GetHrefTitleOnsingle(fragment.ToLower()));
                result.Append("$Array2$");
            }

            return result.ToString();
        }
        #endregion

        #region Match the links of a page (currently unused)
        /// <summary>
        /// Collects the href values found in the HTML.
        /// </summary>
        /// <param name="HtmlCode">HTML source.</param>
        /// <returns>
        /// Every matched URL, lower-cased and followed by "||". Only word characters and
        /// \ / . : - _ are matched, so a URL stops at the first other character
        /// (for example '?').
        /// </returns>
        public string GetHref(string HtmlCode)
        {
            StringBuilder links = new StringBuilder();
            foreach (Match match in HrefValueRegex.Matches(HtmlCode))
            {
                // Group 6 is the URL itself, so quotes and any spacing around '=' never leak
                // into the result.
                links.Append(match.Groups[6].Value.ToLowerInvariant());
                links.Append("||");
            }

            return links.ToString();
        }
        #endregion

        #region Single link of a fragment
        /// <summary>
        /// Extracts the value of the first href attribute in the fragment.
        /// </summary>
        /// <param name="hrefVal">HTML fragment, typically one anchor element.</param>
        /// <returns>
        /// The href value with quotes and spaces removed, or an empty string when there
        /// is no href attribute or no '>' after it.
        /// </returns>
        public string GetonlyHref(string hrefVal)
        {
            return ExtractHrefValue(hrefVal, false);
        }
        #endregion

        #region Single link made absolute (private)
        /// <summary>
        /// Extracts the first href value of the fragment, decodes "&amp;amp;" and prepends
        /// the site URL when the link is relative.
        /// </summary>
        /// <param name="hrefVal">HTML fragment, typically one anchor element.</param>
        /// <param name="AppendUrl">Site URL; it is concatenated as-is, no separator is added.</param>
        /// <returns>The link, or an empty string when none was found.</returns>
        private string GetHrefOnsingle(string hrefVal, string AppendUrl)
        {
            string link = ExtractHrefValue(hrefVal, true);
            if (string.IsNullOrEmpty(link))
            {
                return link;
            }

            // A leading "/", "./" or "../" is dropped before the site URL is prepended.
            if (link.StartsWith("/", StringComparison.Ordinal))
            {
                return AppendUrl + link.Substring(1);
            }
            if (link.StartsWith("./", StringComparison.Ordinal))
            {
                return AppendUrl + link.Substring(2);
            }
            if (link.StartsWith("../", StringComparison.Ordinal))
            {
                return AppendUrl + link.Substring(3);
            }
            if (!link.StartsWith("http", StringComparison.OrdinalIgnoreCase))
            {
                return AppendUrl + link;
            }

            return link;
        }

        /// <summary>
        /// Reads the value of the first href attribute: a quoted value runs to the matching
        /// quote, an unquoted one to the first whitespace. The search stops at the first '>'
        /// after the attribute.
        /// </summary>
        /// <param name="html">HTML fragment to inspect.</param>
        /// <param name="decodeAmpersand">true: replace "&amp;amp;" with "&amp;" in the value.</param>
        /// <returns>The value without quotes and spaces, or an empty string.</returns>
        private static string ExtractHrefValue(string html, bool decodeAmpersand)
        {
            if (string.IsNullOrEmpty(html))
            {
                return string.Empty;
            }

            Match attribute = HrefAttributeRegex.Match(html);
            if (!attribute.Success)
            {
                return string.Empty;
            }

            int valueStart = attribute.Index + attribute.Length;
            int tagEnd = html.IndexOf('>', valueStart);
            if (tagEnd < 0)
            {
                return string.Empty;
            }

            string rest = html.Substring(valueStart, tagEnd - valueStart);
            string value;
            if (rest.Length > 0 && (rest[0] == '"' || rest[0] == '\''))
            {
                int closingQuote = rest.IndexOf(rest[0], 1);
                value = closingQuote < 0 ? rest.Substring(1) : rest.Substring(1, closingQuote - 1);
            }
            else
            {
                // Unquoted value: the next attribute, if any, starts after whitespace.
                int valueEnd = rest.IndexOfAny(WhitespaceChars);
                value = valueEnd < 0 ? rest : rest.Substring(0, valueEnd);
            }

            value = value.Replace("'", "").Replace("\"", "").Replace(" ", "");
            if (decodeAmpersand)
            {
                value = value.Replace("&amp;", "&");
            }

            return value;
        }
        #endregion

        #region Title of a single link (private)
        /// <summary>
        /// Returns the text between the first '>' and the following "&lt;/a&gt;" of each
        /// anchor in the fragment, concatenated.
        /// </summary>
        /// <param name="hrefVal">Lower-cased HTML fragment (the match is case-sensitive).</param>
        /// <returns>The title text with every '>' and "&lt;/a" removed.</returns>
        private string GetHrefTitleOnsingle(string hrefVal)
        {
            StringBuilder title = new StringBuilder();
            foreach (Match match in AnchorTextRegex.Matches(hrefVal))
            {
                title.Append(match.Value.Replace(">", "").Replace("</a", ""));
            }

            return title.ToString();
        }
        #endregion

        #region All fragments between two markers (private)
        /// <summary>
        /// Returns every fragment that begins with <paramref name="start"/> and ends with
        /// the nearest following <paramref name="end"/> on the same line.
        /// </summary>
        /// <param name="HtmlCode">HTML source.</param>
        /// <param name="start">Start pattern; used as a regular expression, not escaped.</param>
        /// <param name="end">End pattern; used as a regular expression, not escaped.</param>
        /// <returns>The matched fragments in document order.</returns>
        private string[] GetHrefArray(string HtmlCode, string start, string end)
        {
            MatchCollection matches = Regex.Matches(HtmlCode, "(" + start + ").+?(" + end + ")");
            string[] fragments = new string[matches.Count];
            for (int i = 0; i < fragments.Length; i++)
            {
                fragments[i] = matches[i].Value;
            }

            return fragments;
        }
        #endregion

        #region Remove an HTML element together with its content
        /// <summary>
        /// Removes every element with the given tag name, including its content.
        /// </summary>
        /// <param name="htmlCode">HTML source.</param>
        /// <param name="TagName">
        /// Tag name, matched case-insensitively as a whole name. It is inserted into a
        /// regular expression without escaping.
        /// </param>
        /// <returns>
        /// The HTML without the matched elements. Each match ends at the nearest closing
        /// tag, so nested elements of the same name are not balanced.
        /// </returns>
        public string ScriptHtml(string htmlCode, string TagName)
        {
            string tag = "(?:" + TagName + ")";
            // The lookahead keeps "a" from matching "<abbr>"; ".*?" with Singleline lets the
            // content contain '>' and line breaks (for example script code).
            string pattern = "<" + tag + @"(?=[\s/>])[^>]*>.*?</" + tag + @"\s*>";

            return Regex.Replace(htmlCode, pattern, "", RegexOptions.IgnoreCase | RegexOptions.Singleline);
        }
        #endregion

        #region Remove links
        /// <summary>
        /// Replaces every anchor element with its inner content.
        /// </summary>
        /// <param name="HtmlCode">HTML source.</param>
        /// <returns>The HTML with the anchor tags removed and their content kept.</returns>
        public string RemoveHref(string HtmlCode)
        {
            return AnchorElementRegex.Replace(HtmlCode, "$1");
        }
        #endregion

        #region Remove images
        /// <summary>
        /// Removes every img tag, whether or not it is self-closed.
        /// </summary>
        /// <param name="HtmlCode">HTML source.</param>
        /// <returns>The HTML without img tags.</returns>
        public string RemoveImg(string HtmlCode)
        {
            return ImageTagRegex.Replace(HtmlCode, "");
        }
        #endregion

        #region Prepend the site URL to links
        /// <summary>
        /// Prepends the site URL to every relative link of a "||"-separated list.
        /// </summary>
        /// <param name="val">Links separated by "||"; empty entries are ignored.</param>
        /// <param name="url">Site URL; it is concatenated as-is, no separator is added.</param>
        /// <returns>
        /// The links joined with "||". Links that already start with http:// or https://
        /// are kept unchanged, matching <see cref="ImgAddUrl"/>.
        /// </returns>
        public string HrefAddUrl(string val, string url)
        {
            if (string.IsNullOrEmpty(val))
            {
                return string.Empty;
            }

            string[] links = val.Split(new string[] { "||" }, StringSplitOptions.RemoveEmptyEntries);
            StringBuilder joined = new StringBuilder();
            foreach (string link in links)
            {
                if (joined.Length > 0)
                {
                    joined.Append("||");
                }

                joined.Append(IsAbsoluteHttpUrl(link) ? link : url + link);
            }

            return joined.ToString();
        }
        #endregion

        #region Prepend the site URL to images
        /// <summary>
        /// Prepends the site URL to every relative image address.
        /// </summary>
        /// <param name="img">Image addresses.</param>
        /// <param name="url">Site URL; it is concatenated as-is, no separator is added.</param>
        /// <returns>
        /// An array of the same length. Addresses starting with http:// or https://, and
        /// null or empty entries, are copied unchanged. A null input gives an empty array.
        /// </returns>
        public string[] ImgAddUrl(string[] img, string url)
        {
            if (img == null)
            {
                return new string[0];
            }

            string[] imgUrl = new string[img.Length];
            for (int i = 0; i < img.Length; i++)
            {
                string source = img[i];
                if (string.IsNullOrEmpty(source) || IsAbsoluteHttpUrl(source))
                {
                    imgUrl[i] = source;
                }
                else
                {
                    imgUrl[i] = url + source;
                }
            }

            return imgUrl;
        }

        /// <summary>
        /// Tells whether the address starts with http:// or https:// (any casing).
        /// </summary>
        private static bool IsAbsoluteHttpUrl(string address)
        {
            return address.StartsWith("http://", StringComparison.OrdinalIgnoreCase)
                || address.StartsWith("https://", StringComparison.OrdinalIgnoreCase);
        }
        #endregion

        #region Content of a follow-up page
        /// <summary>
        /// Downloads a follow-up page and returns the text between two markers.
        /// </summary>
        /// <param name="mainurl">Address of the main page.</param>
        /// <param name="url">Address of the page to fetch.</param>
        /// <param name="start_key">Start marker; not included in the result.</param>
        /// <param name="end_key">End marker; not included in the result.</param>
        /// <param name="encode">Passed to <see cref="GetHttpPage"/> as its CharSet argument.</param>
        /// <returns>
        /// The extracted text, or an empty string when <paramref name="url"/> equals
        /// <paramref name="mainurl"/> (nothing is downloaded in that case).
        /// </returns>
        public string GetPageTxt(string mainurl, string url, string start_key, string end_key, bool encode)
        {
            if (mainurl == url)
            {
                return string.Empty;
            }

            string pageHtml = GetHttpPage(url, encode);
            return GetBody(pageHtml, start_key, end_key, false, false);
        }
        #endregion
    }
}
