﻿//------------------------------------------------------------------------------
// <copyright company="Tunynet">
// Copyright (c) Tunynet Inc. All rights reserved.
// </copyright> 
//------------------------------------------------------------------------------

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using PanGu;
using Lucene.Net.Analysis;
using Lucene.Net.Index;
using Lucene.Net.Store;
using System.IO;
using Lucene.Net.Documents;
using SpaceBuilder.Common;
using Lucene.Net.Search;
using System.Diagnostics;

namespace SpaceBuilder.LuceneSearch
{
    public abstract class SearchManagerBase<T> where T : class
    {
        /// <summary>
        /// Fragment-count constant available to derived classes. Within this class it is
        /// compared against the content length in HighlighterForKeyWord.
        /// </summary>
        protected static readonly int MaxNumFragmentsRequired = 2;

        /// <summary>
        /// Slop for PhraseQuery; the appropriate value is related to the
        /// multi-dimensional word segmentation settings.
        /// </summary>
        protected static readonly int PhraseQuerySlop = 2;

        /// <summary>
        /// Lucene index file version.
        /// </summary>
        protected readonly Lucene.Net.Util.Version CurrentLuceneVersion = Lucene.Net.Util.Version.LUCENE_29;

        /// <summary>
        /// Creates the manager and records the name of its index directory.
        /// </summary>
        /// <param name="indexFileDirectory">Index directory name; stored in IndexFileDirectory and later combined with the global index directory by PhysicalIndexDirectory.</param>
        protected SearchManagerBase(string indexFileDirectory) { IndexFileDirectory = indexFileDirectory; }

        /// <summary>
        /// Name of the index file directory.
        /// </summary>
        public string IndexFileDirectory = "";

        // Lazily computed full path of the index folder; null until first read.
        private string physicalIndexDirectory = null;
        /// <summary>
        /// Full path of the index folder: the configured global index directory
        /// combined with IndexFileDirectory. Computed on first access and cached.
        /// </summary>
        /// <returns>The combined index folder path.</returns>
        public string PhysicalIndexDirectory
        {
            get
            {
                if (physicalIndexDirectory != null)
                    return physicalIndexDirectory;

                physicalIndexDirectory = System.IO.Path.Combine(
                    SearchConfiguration.Instance().GlobalIndexDirectory,
                    IndexFileDirectory);
                return physicalIndexDirectory;
            }
        }

        // Cached answer for IsIndexDirectoryExists; null means "not checked yet".
        protected bool? isIndexDirectoryExists = null;
        /// <summary>
        /// Whether the directory that holds the index files exists.
        /// The file system is probed on first access and the answer is cached.
        /// </summary>
        public bool IsIndexDirectoryExists
        {
            get
            {
                bool? cached = isIndexDirectoryExists;
                if (cached.HasValue)
                    return cached.Value;

                bool exists = System.IO.Directory.Exists(PhysicalIndexDirectory);
                isIndexDirectoryExists = exists;
                return exists;
            }
        }

        // Cached answer for IsIndexFilesExists; null means "not checked yet".
        private bool? isIndexFilesExists = null;
        /// <summary>
        /// Whether index files exist in PhysicalIndexDirectory.
        /// Checked via IsExistsIndexFiles on first access and then cached.
        /// </summary>
        public bool IsIndexFilesExists
        {
            get
            {
                bool? cached = isIndexFilesExists;
                if (cached.HasValue)
                    return cached.Value;

                bool exists = IsExistsIndexFiles(PhysicalIndexDirectory);
                isIndexFilesExists = exists;
                return exists;
            }
        }

        /// <summary>
        /// Checks whether a Lucene index exists in the given folder.
        /// </summary>
        /// <param name="indexPath">Path of the folder to inspect.</param>
        /// <returns>true when IndexReader.IndexExists reports an index in that folder; otherwise false.</returns>
        protected bool IsExistsIndexFiles(string indexPath)
        {
            Lucene.Net.Store.Directory directory = Lucene.Net.Store.FSDirectory.Open(new System.IO.DirectoryInfo(indexPath));
            try
            {
                return Lucene.Net.Index.IndexReader.IndexExists(directory);
            }
            finally
            {
                // Release the directory handle opened only for this probe.
                directory.Close();
            }
        }

        /// <summary>
        /// Creates the analyzer used for Chinese word segmentation.
        /// </summary>
        /// <returns>A new PanGuAnalyzer instance.</returns>
        protected Analyzer GetChineseAnalyzer()
        {
            Analyzer chineseAnalyzer = new Lucene.Net.Analysis.PanGu.PanGuAnalyzer();
            return chineseAnalyzer;
        }


        /// <summary>
        /// Creates the Chinese segmentation analyzer variant constructed with the
        /// flag set to true (per the original author's note: it also returns the original string).
        /// </summary>
        /// <returns>A new PanGuAnalyzer instance created with the argument true.</returns>
        protected Analyzer GetChineseAnalyzerOfUnTokenized()
        {
            Analyzer untokenizedAnalyzer = new Lucene.Net.Analysis.PanGu.PanGuAnalyzer(true);
            return untokenizedAnalyzer;
        }

        /// <summary>
        /// Splits a sentence into words with the PanGu segmenter, using a fixed set of
        /// match options and rank parameters.
        /// </summary>
        /// <param name="str">Text to segment.</param>
        /// <returns>The words produced by Segment.DoSegment for the given text.</returns>
        protected static ICollection<WordInfo> SegmentToWordInfos(String str)
        {
            PanGu.Segment segmenter = new Segment();

            PanGu.Match.MatchOptions options = new PanGu.Match.MatchOptions
            {
                // Chinese personal-name recognition
                ChineseNameIdentify = false,
                // Prefer higher word frequency
                FrequencyFirst = false,
                // Multi-dimensional segmentation
                MultiDimensionality = false,
                // English multi-dimensional segmentation; this switch separates letters from digits in English text
                EnglishMultiDimensionality = false,
                // Filter stop words
                FilterStopWords = false,
                // Ignore spaces, line breaks and tabs
                IgnoreSpace = true,
                // Force single-character segmentation
                ForceSingleWord = false,
                // Traditional Chinese switch
                TraditionalChineseEnabled = false,
                // Output both Simplified and Traditional forms
                OutputSimplifiedTraditional = false,
                // Unknown (out-of-dictionary) word recognition
                UnknownWordIdentify = false,
                // Filter English; only effective when stop-word filtering is enabled
                FilterEnglish = false,
                // Filter numbers; only effective when stop-word filtering is enabled
                FilterNumeric = false,
                // Ignore English letter case
                IgnoreCapital = true,
                // English word segmentation
                EnglishSegment = false,
                // Synonym output (generally meant for segmenting search strings, not recommended while indexing)
                SynonymOutput = false,
                // Wildcard match output
                WildcardOutput = false,
                // Segment the results of wildcard matches
                WildcardSegment = false
            };

            PanGu.Match.MatchParameter parameters = new PanGu.Match.MatchParameter
            {
                // Rank of unknown words
                UnknowRank = 1,
                // Rank of the best-matching word
                BestRank = 5,
                // Rank of the second-best match
                SecRank = 3,
                // Rank of the third-best match
                ThirdRank = 2,
                // Rank of single characters that are force-output
                SingleRank = 1,
                // Rank of numbers
                NumericRank = 1,
                // Rank of English words
                EnglishRank = 5,
                // Rank of lower-cased English words
                EnglishLowerRank = 3,
                // Rank of English word stems
                EnglishStemRank = 2,
                // Rank of symbols
                SymbolRank = 2,
                // When both scripts are force-output, rank of the characters that were not in the
                // original text (e.g. the Traditional form when the text is Simplified, and vice versa)
                SimplifiedTraditionalRank = 1,
                // Rank of synonyms
                SynonymRank = 1,
                // Rank of wildcard match results
                WildcardRank = 1,
                // With English filtering on, English longer than this is filtered
                FilterEnglishLength = 0,
                // With numeric filtering on, numbers longer than this are filtered
                FilterNumericLength = 0,
                // Assembly file name for user-defined rules
                CustomRuleAssemblyFileName = string.Empty,
                // Full (namespace-qualified) class name for user-defined rules
                CustomRuleFullClassName = string.Empty
            };

            return segmenter.DoSegment(str, options, parameters);
        }

        /// <summary>
        /// Segments the keywords and renders them as a boosted query string for QueryParser.
        /// </summary>
        /// <remarks>
        /// Each emitted word gets a boost of 3^Rank. When at least one word has a rank above 1,
        /// words with rank 1 or lower are dropped.
        /// </remarks>
        /// <param name="keywords">Raw keywords entered by the user.</param>
        /// <returns>
        /// The boosted, space-separated terms; or the original <paramref name="keywords"/> when
        /// it is null/empty or segmentation yields no usable word.
        /// </returns>
        protected static string SegmentForQueryParser(string keywords)
        {
            if (string.IsNullOrEmpty(keywords))
                return keywords;

            ICollection<WordInfo> words = SegmentToWordInfos(keywords);
            if (words == null)
                return keywords;

            // Drop null entries up front so neither Max nor the loop dereferences them.
            List<WordInfo> usableWords = words.Where(n => n != null).ToList();

            // Enumerable.Max throws on an empty sequence, so bail out before calling it.
            if (usableWords.Count == 0)
                return keywords;

            int maxRank = usableWords.Max(n => n.Rank);

            StringBuilder result = new StringBuilder();
            foreach (WordInfo word in usableWords)
            {
                // Low-rank words are noise once a higher-ranked word is present.
                if (maxRank > 1 && word.Rank <= 1)
                    continue;

                result.AppendFormat("{0}^{1}.0 ", word.Word, (int)Math.Pow(3, word.Rank));
            }

            string resultString = result.ToString().Trim();
            return string.IsNullOrEmpty(resultString) ? keywords : resultString;
        }

        /// <summary>
        /// Segments the keywords and returns the individual words
        /// (used for mini-blog topic search).
        /// </summary>
        /// <param name="keywords">Text to segment.</param>
        /// <returns>The text of every segmented word, in segmentation order.</returns>
        protected static string[] SegmentForPhraseQuery(string keywords)
        {
            ICollection<WordInfo> segments = SegmentToWordInfos(keywords);
            return (from segment in segments
                    select segment.Word).ToArray();
        }


        // Guards index writes performed by the Insert overloads.
        private readonly object insertLockObject = new object();
        /// <summary>
        /// Adds the given objects to the index stored in PhysicalIndexDirectory, creating the
        /// directory and a new index when they do not exist yet.
        /// </summary>
        /// <param name="threads">Objects to index.</param>
        /// <returns>true when the objects were indexed; false when the list is null or empty.</returns>
        /// <exception cref="ApplicationException">The index directory could not be created.</exception>
        public bool Insert(IList<T> threads)
        {
            // Decide "create vs. append" and perform the write under one lock, so two concurrent
            // first inserts cannot both choose to create the index. The lock is re-entrant, so the
            // nested lock taken by the three-argument overload is safe.
            lock (insertLockObject)
            {
                if (!IsIndexDirectoryExists)
                {
                    try
                    {
                        System.IO.Directory.CreateDirectory(PhysicalIndexDirectory);
                        isIndexDirectoryExists = true;
                    }
                    catch (Exception ex)
                    {
                        // Keep the root cause attached for diagnosis.
                        throw new ApplicationException(string.Format("create Directory '{0}' failed", PhysicalIndexDirectory), ex);
                    }
                }

                bool indexMissing = !IsIndexFilesExists;
                try
                {
                    return Insert(threads, PhysicalIndexDirectory, indexMissing);
                }
                finally
                {
                    // The cached "no index" answer may now be stale. Without a re-check the next
                    // call would pass create=true again and overwrite the index just written.
                    if (indexMissing)
                        isIndexFilesExists = null;
                }
            }
        }

        /// <summary>
        /// Adds the given objects to the index at <paramref name="indexPath"/>.
        /// </summary>
        /// <param name="objs">Objects to index; null entries and objects converted to a null Document are skipped.</param>
        /// <param name="indexPath">Folder that holds the index.</param>
        /// <param name="createIndexFile">true to build a brand-new index, false to append to the existing one.</param>
        /// <returns>true when indexing completed; false when <paramref name="objs"/> is null or empty.</returns>
        public bool Insert(IList<T> objs, string indexPath, bool createIndexFile)
        {
            if (objs == null || objs.Count == 0)
                return false;

            bool result = false;
            lock (insertLockObject)
            {
                Lucene.Net.Store.Directory indexDirectory = FSDirectory.Open(new DirectoryInfo(indexPath));
                try
                {
                    // Arguments: index storage, analyzer, create flag (true = new index, false = incremental),
                    // and the maximum field length (LIMITED truncates field text beyond the limit).
                    IndexWriter fsWriter = new IndexWriter(indexDirectory, GetChineseAnalyzer(), createIndexFile, IndexWriter.MaxFieldLength.LIMITED);
                    try
                    {
                        // Merge factor controls how often segments are merged; a larger value speeds up indexing.
                        fsWriter.SetMergeFactor(SearchConfiguration.Instance().MergeFactor);
                        // Number of documents buffered in memory before a new segment is written.
                        // NOTE(review): this is fed from the MaxMergeDocs setting - confirm that is intended.
                        fsWriter.SetMaxBufferedDocs(SearchConfiguration.Instance().MaxMergeDocs);
                        // Compound format packs the files of a segment into a single .cfs file,
                        // reducing the number of index files.
                        fsWriter.SetUseCompoundFile(true);

                        foreach (T obj in objs)
                        {
                            if (obj == null)
                                continue;

                            Document doc = ConvertObjToDocument(obj);
                            if (doc != null)
                                fsWriter.AddDocument(doc);
                        }

                        // Merge the segments into a single segment; returns once optimization is done.
                        fsWriter.Optimize();
                        result = true;
                    }
                    finally
                    {
                        // Always close the writer so its write lock is released even when
                        // configuration or indexing fails.
                        fsWriter.Close();
                    }
                }
                finally
                {
                    // Runs even if creating or closing the writer throws.
                    indexDirectory.Close();
                }
            }
            return result;
        }

        // Guards index writes performed by Delete.
        private readonly object delLockObject = new object();
        /// <summary>
        /// Deletes from the index every document whose id field matches one of the given ids.
        /// </summary>
        /// <param name="ids">Primary-key values of the documents to delete.</param>
        /// <param name="idIndexFieldName">Name of the Lucene field that stores the primary key.</param>
        /// <returns>
        /// true when the deletions were applied; false when <paramref name="ids"/> is null or empty,
        /// or when no index files exist.
        /// </returns>
        public bool Delete(IList<int> ids, string idIndexFieldName)
        {
            // "||" rather than "&&": with "&&" a null list was dereferenced and an empty list was never rejected.
            if (ids == null || ids.Count == 0)
                return false;

            if (!IsIndexFilesExists)
                return false;

            bool result = false;
            lock (delLockObject)
            {
                Lucene.Net.Store.Directory indexDirectory = FSDirectory.Open(new DirectoryInfo(PhysicalIndexDirectory));
                try
                {
                    IndexWriter iw = new IndexWriter(indexDirectory, GetChineseAnalyzer(), false, IndexWriter.MaxFieldLength.LIMITED);
                    try
                    {
                        foreach (var id in ids)
                        {
                            Term term = new Term(idIndexFieldName, id.ToString());
                            iw.DeleteDocuments(term);
                        }
                        // Per the original author's note: without optimizing, the deletion does not take effect.
                        iw.Optimize();
                        result = true;
                    }
                    finally
                    {
                        iw.Close();
                    }
                }
                finally
                {
                    // Release the directory handle even if the writer could not be created or closed.
                    indexDirectory.Close();
                }
            }
            return result;
        }

        // Serializes Update calls so the delete + insert pair is not interleaved with another update.
        private readonly object updateLockObject = new object();
        /// <summary>
        /// Updates index entries by deleting the existing documents and inserting the objects again.
        /// </summary>
        /// <param name="objs">Objects to re-index.</param>
        /// <param name="funcSelectIDs">Selects the primary-key ids out of the object collection.</param>
        /// <param name="idIndexFieldName">Name of the Lucene field that stores the primary key.</param>
        /// <returns>
        /// The result of the insert when it was attempted; false when <paramref name="objs"/> is null
        /// or empty, or when the delete failed although index files exist.
        /// </returns>
        public bool Update(IList<T> objs, Func<IList<T>, IList<int>> funcSelectIDs, string idIndexFieldName)
        {
            if (objs == null || objs.Count == 0)
                return false;

            lock (updateLockObject)
            {
                bool deleted = Delete(funcSelectIDs(objs), idIndexFieldName);

                // A failed delete is only acceptable when there is no index yet;
                // in that case the insert creates it.
                if (!deleted && IsIndexFilesExists)
                    return false;

                return Insert(objs);
            }
        }

        /// <summary>
        /// Runs a query against the index and returns one page of results.
        /// </summary>
        /// <param name="searchQuery">Query to execute.</param>
        /// <param name="filter">Filter passed through to the searcher.</param>
        /// <param name="sortFields">Sort fields; null or empty sorts by score.</param>
        /// <param name="pageIndex">1-based page number; values below 1 are treated as the first page.</param>
        /// <param name="pageSize">Number of records per page; capped at 10000.</param>
        /// <returns>The requested page together with the total hit count and the search duration in seconds.</returns>
        protected SearchResultDataSet<T> Search(Query searchQuery, Filter filter, SortField[] sortFields, int pageIndex, int pageSize)
        {
            Stopwatch stopwatch = new Stopwatch();
            stopwatch.Start();

            // Never ask the collector for a zero or negative number of hits because of a bad page number.
            if (pageIndex < 1)
                pageIndex = 1;

            // Upper bound for the page size (guards against callers passing int.MaxValue).
            int maxPageSize = 10000;
            if (pageSize > maxPageSize)
                pageSize = maxPageSize;

            Sort sort;
            if (sortFields != null && sortFields.Length > 0)
                sort = new Sort(sortFields);
            else
                sort = new Sort(SortField.FIELD_SCORE);

            SearchResultDataSet<T> pds = new SearchResultDataSet<T>();

            Lucene.Net.Store.Directory indexDirectory = FSDirectory.Open(new DirectoryInfo(PhysicalIndexDirectory));
            try
            {
                // Opened read-only.
                IndexReader reader = IndexReader.Open(indexDirectory, true);
                try
                {
                    Searcher searcher = new IndexSearcher(reader);
                    try
                    {
                        // Collect everything up to the end of the requested page, then skip the earlier pages.
                        TopFieldCollector collector = TopFieldCollector.create(sort, pageIndex * pageSize, false, true, false, true);
                        searcher.Search(searchQuery, filter, collector);
                        IEnumerable<ScoreDoc> hits = collector.TopDocs().scoreDocs.Skip((pageIndex - 1) * pageSize);

                        foreach (var hit in hits)
                        {
                            T item = ConvertDocumentToObj(searcher.Doc(hit.doc));
                            if (item != null)
                                pds.Records.Add(item);
                        }
                        pds.TotalRecords = collector.GetTotalHits();
                    }
                    finally
                    {
                        searcher.Close();
                    }
                }
                finally
                {
                    reader.Close();
                }
            }
            finally
            {
                // Released even when the query or the document conversion throws.
                indexDirectory.Close();
            }

            pds.PageIndex = pageIndex;
            pds.PageSize = pageSize;

            stopwatch.Stop();
            pds.SearchDuration = stopwatch.ElapsedMilliseconds / 1000d;
            return pds;
        }

        /// <summary>
        /// Runs a query and returns up to <paramref name="topNumber"/> results without paging.
        /// </summary>
        /// <param name="searchQuery">Query to execute.</param>
        /// <param name="filter">Filter passed through to the searcher.</param>
        /// <param name="sortFields">Sort fields; null or empty sorts by score, then by document order reversed.</param>
        /// <param name="topNumber">Maximum number of hits to request.</param>
        /// <returns>The converted hits; documents that convert to null are omitted.</returns>
        protected ICollection<T> Search(Query searchQuery, Filter filter, SortField[] sortFields, int topNumber)
        {
            // Default ordering.
            if (sortFields == null || sortFields.Length <= 0)
                sortFields = new SortField[] { SortField.FIELD_SCORE, new SortField(null, SortField.DOC, true) };

            // sortFields is never null past this point, so the sort can be built unconditionally.
            Sort sort = new Sort(sortFields);

            List<T> results = new List<T>();

            Lucene.Net.Store.Directory indexDirectory = FSDirectory.Open(new DirectoryInfo(PhysicalIndexDirectory));
            try
            {
                // Opened read-only.
                IndexReader reader = IndexReader.Open(indexDirectory, true);
                try
                {
                    Searcher searcher = new IndexSearcher(reader);
                    try
                    {
                        TopFieldDocs collector = searcher.Search(searchQuery, filter, topNumber, sort);
                        foreach (var hit in collector.scoreDocs)
                        {
                            T item = ConvertDocumentToObj(searcher.Doc(hit.doc));
                            if (item != null)
                                results.Add(item);
                        }
                    }
                    finally
                    {
                        searcher.Close();
                    }
                }
                finally
                {
                    reader.Close();
                }
            }
            finally
            {
                // Released even when the query or the document conversion throws.
                indexDirectory.Close();
            }

            return results;
        }


        // Created on first use by HighlighterForKeyWord and reused afterwards.
        private PanGu.HighLight.Highlighter highlighter = null;
        /// <summary>
        /// Highlights the keyword inside the content.
        /// </summary>
        /// <param name="content">Text to highlight.</param>
        /// <param name="keyWord">Keyword to mark up.</param>
        /// <returns>
        /// The best highlighted fragment; or the content passed through StringUtils.Trim with 256
        /// when the content is too short or no fragment is produced.
        /// </returns>
        public string HighlighterForKeyWord(string content, string keyWord)
        {
            if (highlighter == null)
            {
                highlighter = new PanGu.HighLight.Highlighter(
                    new PanGu.HighLight.SimpleHTMLFormatter("<font color=\"#c60a00\">", "</font>"),
                    new Segment());
                highlighter.FragmentSize = 256;
            }

            // Highlighting is only attempted for content longer than MaxNumFragmentsRequired characters.
            bool worthHighlighting = !string.IsNullOrEmpty(content) && content.Length > MaxNumFragmentsRequired;
            string bestContent = worthHighlighting ? highlighter.GetBestFragment(keyWord, content) : null;

            if (!string.IsNullOrEmpty(bestContent))
                return bestContent;

            return SpaceBuilder.Utils.StringUtils.Trim(content, 256);
        }

        /// <summary>
        /// Initializes the index.
        /// </summary>
        /// <param name="indexPath">Path of the index to initialize.</param>
        public abstract void InitializeIndex(string indexPath);

        /// <summary>
        /// Converts an object into a Lucene Document.
        /// </summary>
        /// <param name="obj">Object to convert.</param>
        /// <returns>The Document for the object; Insert skips the object when this is null.</returns>
        protected abstract Document ConvertObjToDocument(T obj);

        /// <summary>
        /// Converts a Lucene Document back into an object.
        /// </summary>
        /// <param name="doc">Document to convert.</param>
        /// <returns>The object for the document; the Search methods omit the hit when this is null.</returns>
        protected abstract T ConvertDocumentToObj(Document doc);
    }
}
