package com.datastax.insight.ml.spark.ml.feature.transformer;

import com.datastax.insight.spec.DataSetOperator;
import org.apache.spark.ml.feature.Tokenizer;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;

/**
 * English word tokenization, implemented as a set of static helpers around
 * Spark ML's {@link Tokenizer}.
 */
public class TokenizerWrapper implements DataSetOperator {

    /**
     * Builds an English tokenizer bound to the given columns.
     *
     * @param inputCol  column name handed to {@code Tokenizer.setInputCol}
     * @param outputCol column name handed to {@code Tokenizer.setOutputCol}
     * @return the configured {@link Tokenizer}
     */
    public static Tokenizer getOperator(String inputCol, String outputCol) {
        return new Tokenizer()
                .setInputCol(inputCol)
                .setOutputCol(outputCol);
    }

    /**
     * Tokenizes English text using a tokenizer created on the fly for the
     * given columns.
     *
     * @param data      dataset passed to the tokenizer
     * @param inputCol  column name used as the tokenizer's input column
     * @param outputCol column name used as the tokenizer's output column
     * @return the dataset returned by {@code Tokenizer.transform}
     */
    public static Dataset<Row> transform(Dataset<Row> data, String inputCol, String outputCol) {
        // Configure a fresh tokenizer, then reuse the (tokenizer, data) overload.
        return transform(getOperator(inputCol, outputCol), data);
    }

    /**
     * Tokenizes English text with an already configured tokenizer.
     *
     * @param tokenizer tokenizer to apply
     * @param data      dataset passed to the tokenizer
     * @return the dataset returned by {@code tokenizer.transform(data)}
     */
    public static Dataset<Row> transform(Tokenizer tokenizer, Dataset<Row> data) {
        return tokenizer.transform(data);
    }
}
