package com.datastax.insight.ml.spark.ml.feature.transformer;

import com.datastax.insight.spec.DataSetOperator;
import com.datastax.insight.core.Consts;
import org.apache.spark.ml.feature.StopWordsRemover;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;

/**
 * Static helpers for building and applying a Spark ML {@link StopWordsRemover},
 * which filters stop words out of a column of token sequences.
 */
public class StopWordsRemoverWrapper implements DataSetOperator {
    /**
     * Builds a configured stop-word remover.
     *
     * @param inputCol      name of the column holding the input tokens
     * @param outputCol     name of the column that receives the filtered tokens
     * @param caseSensitive whether stop-word matching is case sensitive
     * @param stopWord      stop words joined by {@link Consts#DELIMITER}; the delimiter is handed to
     *                      {@link String#split(String)} and is therefore interpreted as a regular
     *                      expression. When {@code null}, no custom list is set and the remover keeps
     *                      Spark's built-in default stop words.
     * @return the configured remover
     */
    public static StopWordsRemover getOperator(String inputCol, String outputCol, boolean caseSensitive, String stopWord) {
        StopWordsRemover remover = new StopWordsRemover()
                .setInputCol(inputCol)
                .setOutputCol(outputCol)
                .setCaseSensitive(caseSensitive);
        // A null list used to fail with a NullPointerException on split(); treat it as
        // "no custom stop words" so the remover falls back to its defaults instead.
        if (stopWord != null) {
            remover.setStopWords(stopWord.split(Consts.DELIMITER));
        }
        return remover;
    }

    /**
     * Builds a stop-word remover from the given settings and applies it to {@code data}.
     *
     * @param data          dataset to transform
     * @param inputCol      name of the column holding the input tokens
     * @param outputCol     name of the column that receives the filtered tokens
     * @param caseSensitive whether stop-word matching is case sensitive
     * @param stopWord      stop words joined by {@link Consts#DELIMITER}, or {@code null} to keep
     *                      the remover's default list
     * @return the dataset returned by the remover's {@code transform}
     */
    public static Dataset<Row> transform(Dataset<Row> data, String inputCol, String outputCol, boolean caseSensitive, String stopWord) {
        return transform(getOperator(inputCol, outputCol, caseSensitive, stopWord), data);
    }

    /**
     * Applies an already configured stop-word remover to {@code data}.
     *
     * @param remover the remover to apply
     * @param data    dataset to transform
     * @return the dataset returned by {@code remover.transform(data)}
     */
    public static Dataset<Row> transform(StopWordsRemover remover, Dataset<Row> data) {
        return remover.transform(data);
    }
}
