package co.cask.cdap.examples.wikipedia;

import co.cask.cdap.api.dataset.table.Table;
import co.cask.cdap.api.spark.SparkContext;
import co.cask.cdap.api.workflow.Value;
import co.cask.cdap.api.workflow.WorkflowToken;
import java.util.Map;
import org.apache.spark.Accumulator;
import org.apache.spark.AccumulatorParam$IntAccumulatorParam$;
import org.apache.spark.mllib.linalg.Vector;
import org.apache.spark.rdd.NewHadoopRDD;
import org.apache.spark.rdd.RDD;
import org.apache.spark.rdd.RDD$;
import scala.Array$;
import scala.MatchError;
import scala.Predef$;
import scala.Tuple2;
import scala.Tuple3;
import scala.collection.immutable.StringOps;
import scala.math.Numeric$LongIsIntegral$;
import scala.math.Ordering$Long$;
import scala.math.Ordering$String$;
import scala.reflect.ClassTag$;
import scala.runtime.BoxesRunTime;

/* compiled from: ClusteringUtils.scala */
/* loaded from: input_file:co/cask/cdap/examples/wikipedia/ClusteringUtils$.class */
/**
 * Java view of the Scala singleton object {@code ClusteringUtils}: helpers that prepare
 * the normalized Wikipedia dataset for clustering and persist the clustering results.
 *
 * <p>All access goes through {@link #MODULE$}; the class cannot be instantiated elsewhere.
 * The {@code ClusteringUtils$$anonfun$N} closures referenced below are defined outside this
 * file, so comments about what they compute are assumptions to be confirmed against them.
 */
public final class ClusteringUtils$ {
    /**
     * The single instance of this utility object. Initialized at the declaration because a
     * {@code static final} field cannot legally be assigned from a constructor in Java source.
     */
    public static final ClusteringUtils$ MODULE$ = new ClusteringUtils$();

    /** Private: the only instance is {@link #MODULE$}. */
    private ClusteringUtils$() {
    }

    /**
     * Reads the normalized Wikipedia dataset, derives a vocabulary from it and maps every
     * record against that vocabulary.
     *
     * <p>Runtime arguments consulted:
     * <ul>
     *   <li>{@code stopwords.file} - handed to {@code SimpleTokenizer}; defaults to {@code ""}.</li>
     *   <li>{@code vocab.size} - maximum number of vocabulary entries; defaults to {@code 1000}.
     *       {@code -1} keeps every entry.</li>
     * </ul>
     *
     * @param sparkContext CDAP Spark context used to read the dataset and the runtime arguments
     * @return a triple of (records mapped through the vocabulary, vocabulary array filled from
     *         the term-to-index map, boxed {@code Long} sum over the selected vocabulary entries)
     * @throws NumberFormatException if {@code vocab.size} is present but not an integer
     *         (NOTE(review): raised by Scala's {@code StringOps.toInt} - confirm)
     */
    public Tuple3<RDD<Tuple2<Object, Vector>>, String[], Object> preProcess(SparkContext sparkContext) {
        Map<String, String> runtimeArguments = sparkContext.getRuntimeArguments();
        NewHadoopRDD rawRecords = (NewHadoopRDD) sparkContext.readFromDataset(WikipediaPipelineApp.NORMALIZED_WIKIPEDIA_DATASET, byte[].class, byte[].class);
        org.apache.spark.SparkContext nativeContext = (org.apache.spark.SparkContext) sparkContext.getOriginalSparkContext();

        String stopwordsFile = runtimeArguments.containsKey("stopwords.file") ? runtimeArguments.get("stopwords.file") : "";
        int vocabSize = runtimeArguments.containsKey("vocab.size")
            ? new StringOps(Predef$.MODULE$.augmentString(runtimeArguments.get("vocab.size"))).toInt()
            : 1000;

        // Each record is paired with its index and run through the tokenizer-backed closure
        // (presumably yielding (id, tokens) pairs - verify against ClusteringUtils$$anonfun$3).
        // Cached because it is traversed again when the final mapping is built below.
        RDD tokenized = rawRecords.zipWithIndex().map(new ClusteringUtils$$anonfun$3(new SimpleTokenizer(sparkContext2Tokenizer(nativeContext), stopwordsFile)), ClassTag$.MODULE$.apply(Tuple2.class));
        tokenized.cache();

        // (String, Long) pairs reduced by key; presumably per-term occurrence counts.
        // Cached because it may be both counted and sorted/collected.
        RDD termCounts = RDD$.MODULE$.rddToPairRDDFunctions(
            tokenized.flatMap(new ClusteringUtils$$anonfun$4(), ClassTag$.MODULE$.apply(Tuple2.class)),
            ClassTag$.MODULE$.apply(String.class),
            ClassTag$.MODULE$.Long(),
            Ordering$String$.MODULE$).reduceByKey(new ClusteringUtils$$anonfun$1());
        termCounts.cache();

        Tuple2[] selectedTerms;
        if (vocabSize == -1 || termCounts.count() <= ((long) vocabSize)) {
            // Everything fits: collect all entries and sort them locally.
            selectedTerms = (Tuple2[]) Predef$.MODULE$.refArrayOps((Object[]) termCounts.collect()).sortBy(new ClusteringUtils$$anonfun$5(), Ordering$Long$.MODULE$);
        } else {
            // Too many entries: sort on the cluster in descending key order and keep the first vocabSize.
            selectedTerms = (Tuple2[]) termCounts.sortBy(new ClusteringUtils$$anonfun$6(), false, termCounts.sortBy$default$3(), Ordering$Long$.MODULE$, ClassTag$.MODULE$.Long()).take(vocabSize);
        }

        // Term -> position map built from the selected entries. The original wrapped this map
        // and the sum below in a temporary Tuple2 guarded by null checks that could never fail;
        // both values are now computed directly.
        Object[] termNames = (Object[]) Predef$.MODULE$.refArrayOps(selectedTerms).map(new ClusteringUtils$$anonfun$7(), Array$.MODULE$.canBuildFrom(ClassTag$.MODULE$.apply(String.class)));
        Object[] indexedTerms = (Object[]) Predef$.MODULE$.refArrayOps(termNames).zipWithIndex(Array$.MODULE$.canBuildFrom(ClassTag$.MODULE$.apply(Tuple2.class)));
        scala.collection.immutable.Map vocabulary = (scala.collection.immutable.Map) Predef$.MODULE$.refArrayOps(indexedTerms).toMap(Predef$.MODULE$.conforms());

        // Sum of the long value extracted from every selected entry
        // (presumably the total token count - verify against ClusteringUtils$$anonfun$8).
        long selectedTotal = BoxesRunTime.unboxToLong(
            Predef$.MODULE$.longArrayOps((long[]) Predef$.MODULE$.refArrayOps(selectedTerms).map(new ClusteringUtils$$anonfun$8(), Array$.MODULE$.canBuildFrom(ClassTag$.MODULE$.Long()))).sum(Numeric$LongIsIntegral$.MODULE$));

        RDD mappedRecords = tokenized.map(new ClusteringUtils$$anonfun$9(vocabulary), ClassTag$.MODULE$.apply(Tuple2.class));

        // The closure fills the array from the map entries (presumably vocabArray[index] = term).
        String[] vocabArray = new String[vocabulary.size()];
        vocabulary.foreach(new ClusteringUtils$$anonfun$preProcess$1(vocabArray));

        return new Tuple3<>(mappedRecords, vocabArray, BoxesRunTime.boxToLong(selectedTotal));
    }

    /**
     * Writes the clustering results into a table dataset and, when the program runs inside a
     * workflow, publishes summary values to the workflow token.
     *
     * <p>Token keys written: {@code num.records}, {@code highest.score.term} and
     * {@code highest.score.value}.
     *
     * @param sparkContext CDAP Spark context providing the dataset and the workflow token
     * @param tuple2Arr    one array of (String, value) pairs per result group; each group is
     *                     passed to the storing closure together with its index
     * @param str          name of the {@code Table} dataset to write to
     */
    public void storeResults(SparkContext sparkContext, Tuple2<String, Object>[][] tuple2Arr, String str) {
        Table resultsTable = (Table) sparkContext.getDataset(str);
        org.apache.spark.SparkContext nativeContext = (org.apache.spark.SparkContext) sparkContext.getOriginalSparkContext();

        Accumulator recordCount = nativeContext.accumulator(BoxesRunTime.boxToInteger(0), "num.records", AccumulatorParam$IntAccumulatorParam$.MODULE$);
        Accumulator highestScore = nativeContext.accumulator(new Term("", 0.0d), "highest.score", HighestAccumulatorParam$.MODULE$);

        // The closure receives the table and both accumulators for every (group, index) pair.
        Object[] indexedGroups = (Object[]) Predef$.MODULE$.refArrayOps(tuple2Arr).zipWithIndex(Array$.MODULE$.canBuildFrom(ClassTag$.MODULE$.apply(Tuple2.class)));
        Predef$.MODULE$.refArrayOps(indexedGroups).foreach(new ClusteringUtils$$anonfun$storeResults$1(resultsTable, recordCount, highestScore));

        // The token is null-checked before use; nothing is published when it is absent.
        WorkflowToken workflowToken = sparkContext.getWorkflowToken();
        if (workflowToken != null) {
            Term top = (Term) highestScore.value();
            workflowToken.put("num.records", Value.of(BoxesRunTime.unboxToInt(recordCount.value())));
            workflowToken.put("highest.score.term", top.name());
            workflowToken.put("highest.score.value", Value.of(top.weight()));
        }
    }

    /** Identity pass-through kept private so the tokenizer construction reads on one line. */
    private static org.apache.spark.SparkContext sparkContext2Tokenizer(org.apache.spark.SparkContext nativeContext) {
        return nativeContext;
    }
}
