/*
 * Id$: zuv-cloud:z-service:cc.zuv.service.spider.lucene.SpiderParser:20181225151549
 *
 * SpiderParser.java
 * Copyright (c) 2002-2020 Luther Inc.
 * http://zuv.cc
 * All rights reserved.
 */
package cc.zuv.service.spider.lucene;

import lombok.extern.slf4j.Slf4j;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.util.Version;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

/**
 * File Description
 *
 * @author			Kama Luther
 * @version			0.1
 * @since           0.1
 * @create.date     2014-2-11 下午02:19:48
 * @modify.date     2014-2-11 下午02:19:48
 */
@Slf4j
public class SpiderParser
{

    //-----------------------------------------------------------------------------------------

	private Analyzer analyzer;

	public SpiderParser()
	{
    	//
		analyzer = new SmartChineseAnalyzer(Version.LUCENE_46, true); // new PaodingAnalyzer(); //StandardAnalyzer
	}

    //-----------------------------------------------------------------------------------------

	public Analyzer getAnalyzer()
	{
		return analyzer;
	}
	public void setAnalyzer(Analyzer analyzer)
	{
		this.analyzer = analyzer;
	}

    //-----------------------------------------------------------------------------------------

	public List<String> analyzeChinaWord(String word)
	{
	    List<String> result = new ArrayList<>();

	    try
	    {
	    	// 自定义停用词
            String[] self_stop_words = { "的", "了", "呢", "，", "0", "：", ",", "是", "流" };
            CharArraySet cas = new CharArraySet(Version.LUCENE_46, 0, true);
            for (int i = 0; i < self_stop_words.length; i++)
            {
                cas.add(self_stop_words[i]);
            }

            // 加入系统默认停用词
            Iterator<Object> itor = SmartChineseAnalyzer.getDefaultStopSet().iterator();
            while (itor.hasNext())
            {
                cas.add(itor.next());
            }

            Analyzer analyzer = new SmartChineseAnalyzer(Version.LUCENE_46, cas);

	        TokenStream tokenStream = analyzer.tokenStream("field", word);
	        CharTermAttribute term=tokenStream.addAttribute(CharTermAttribute.class);
	        tokenStream.reset();
	        while( tokenStream.incrementToken() )
	        {
	            result.add( term.toString() );
	        }
	        tokenStream.end();
	        tokenStream.close();


	        analyzer.close();
	    }
		catch (IOException e)
		{
			log.error("IO Error {}", e.getMessage());
		}
	    return result;
	}

    //-----------------------------------------------------------------------------------------

}
