package com.datastax.insight.ml.spark.mllib.feature;

import com.datastax.insight.spec.RDDOperator;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.mllib.feature.HashingTF;
import org.apache.spark.mllib.feature.IDF;
import org.apache.spark.mllib.linalg.Vector;

import java.util.List;

/**
 * RDD operator that turns tokenized documents into TF-IDF weighted vectors
 * by chaining Spark MLlib's {@link HashingTF} and {@link IDF}.
 */
public class TFIDF implements RDDOperator {
    /**
     * Builds term-frequency vectors from {@code data} with {@link HashingTF},
     * fits an {@link IDF} model on those vectors and returns them rescaled
     * by the fitted model.
     *
     * @param data        documents, each given as a list of terms
     * @param numFeatures dimension handed to the {@link HashingTF} constructor;
     *                    when zero or negative the no-argument constructor is
     *                    used instead, leaving the dimension to the library
     * @param minDocFreq  value handed to the {@link IDF} constructor as its
     *                    minimum document frequency
     * @return one TF-IDF vector per input document
     */
    public static JavaRDD<Vector> transform(JavaRDD<List<String>> data, int numFeatures, int minDocFreq) {
        // Only a strictly positive numFeatures overrides the library's own dimension.
        final HashingTF termHasher = numFeatures > 0
                ? new HashingTF(numFeatures)
                : new HashingTF();
        final JavaRDD<Vector> termFrequencies = termHasher.transform(data);

        // NOTE(review): termFrequencies is consumed twice below (once to fit the model,
        // once to be rescaled) and is not persisted here; callers who care about
        // recomputation may want to cache the input themselves.
        return new IDF(minDocFreq)
                .fit(termFrequencies)
                .transform(termFrequencies);
    }
}
