package org.apache.spark.examples.mllib;

import java.util.Locale;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import org.apache.spark.SparkConf;
import org.apache.spark.SparkContext;
import org.apache.spark.examples.mllib.LDAExample;
import org.apache.spark.ml.Pipeline;
import org.apache.spark.ml.PipelineModel;
import org.apache.spark.ml.PipelineStage;
import org.apache.spark.ml.feature.CountVectorizer;
import org.apache.spark.ml.feature.RegexTokenizer;
import org.apache.spark.ml.feature.StopWordsRemover;
import org.apache.spark.mllib.clustering.DistributedLDAModel;
import org.apache.spark.mllib.clustering.EMLDAOptimizer;
import org.apache.spark.mllib.clustering.LDA;
import org.apache.spark.mllib.clustering.OnlineLDAOptimizer;
import org.apache.spark.mllib.linalg.Vector;
import org.apache.spark.mllib.linalg.Vectors$;
import org.apache.spark.rdd.RDD;
import org.apache.spark.rdd.RDD$;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row$;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.SparkSession$;
import scala.Array$;
import scala.MatchError;
import scala.Predef$;
import scala.Some;
import scala.Tuple2;
import scala.Tuple3;
import scala.collection.Seq;
import scala.collection.Seq$;
import scala.collection.SeqLike;
import scala.collection.immutable.StringOps;
import scala.collection.mutable.ArrayOps;
import scala.math.Numeric$IntIsIntegral$;
import scala.reflect.ClassTag$;
import scala.runtime.BoxedUnit;
import scala.runtime.BoxesRunTime;
import scala.runtime.ScalaRunTime$;
import scala.sys.package$;
import scopt.OptionParser;
import scopt.Read$;

/* compiled from: LDAExample.scala */
/* loaded from: input_file:org/apache/spark/examples/mllib/LDAExample$.class */
public final class LDAExample$ {
    /** Singleton instance of this class; assigned by the private constructor. */
    public static LDAExample$ MODULE$;

    // Creating one instance at class-initialization time runs the private constructor,
    // which stores the new object in MODULE$.
    static {
        new LDAExample$();
    }

    /**
     * Command-line entry point.
     *
     * <p>Builds a default {@code LDAExample.Params} from the companion's {@code apply$default$N}
     * values, parses {@code strArr} with an anonymous scopt {@code OptionParser}, and hands the
     * resulting parameters to {@link #run}. If parsing does not yield a {@code Some}, the process
     * exits with status 1.
     *
     * @param strArr raw command-line arguments
     */
    public void main(String[] strArr) {
        final LDAExample.Params params = new LDAExample.Params(LDAExample$Params$.MODULE$.apply$default$1(), LDAExample$Params$.MODULE$.apply$default$2(), LDAExample$Params$.MODULE$.apply$default$3(), LDAExample$Params$.MODULE$.apply$default$4(), LDAExample$Params$.MODULE$.apply$default$5(), LDAExample$Params$.MODULE$.apply$default$6(), LDAExample$Params$.MODULE$.apply$default$7(), LDAExample$Params$.MODULE$.apply$default$8(), LDAExample$Params$.MODULE$.apply$default$9(), LDAExample$Params$.MODULE$.apply$default$10());
        Some parse = new OptionParser<LDAExample.Params>(params) { // from class: org.apache.spark.examples.mllib.LDAExample$$anon$1
            // Synthetic helpers for the primitive-valued options: each returns a copy of params2
            // in which exactly one constructor argument is replaced by the parsed value.

            // Used by the "k" option: replaces argument 2.
            public static final /* synthetic */ LDAExample.Params $anonfun$new$1(int i, LDAExample.Params params2) {
                return params2.copy(params2.copy$default$1(), i, params2.copy$default$3(), params2.copy$default$4(), params2.copy$default$5(), params2.copy$default$6(), params2.copy$default$7(), params2.copy$default$8(), params2.copy$default$9(), params2.copy$default$10());
            }

            // Used by the "maxIterations" option: replaces argument 3.
            public static final /* synthetic */ LDAExample.Params $anonfun$new$2(int i, LDAExample.Params params2) {
                return params2.copy(params2.copy$default$1(), params2.copy$default$2(), i, params2.copy$default$4(), params2.copy$default$5(), params2.copy$default$6(), params2.copy$default$7(), params2.copy$default$8(), params2.copy$default$9(), params2.copy$default$10());
            }

            // Used by the "docConcentration" option: replaces argument 4.
            public static final /* synthetic */ LDAExample.Params $anonfun$new$3(double d, LDAExample.Params params2) {
                return params2.copy(params2.copy$default$1(), params2.copy$default$2(), params2.copy$default$3(), d, params2.copy$default$5(), params2.copy$default$6(), params2.copy$default$7(), params2.copy$default$8(), params2.copy$default$9(), params2.copy$default$10());
            }

            // Used by the "topicConcentration" option: replaces argument 5.
            public static final /* synthetic */ LDAExample.Params $anonfun$new$4(double d, LDAExample.Params params2) {
                return params2.copy(params2.copy$default$1(), params2.copy$default$2(), params2.copy$default$3(), params2.copy$default$4(), d, params2.copy$default$6(), params2.copy$default$7(), params2.copy$default$8(), params2.copy$default$9(), params2.copy$default$10());
            }

            // Used by the "vocabSize" option: replaces argument 6.
            public static final /* synthetic */ LDAExample.Params $anonfun$new$5(int i, LDAExample.Params params2) {
                return params2.copy(params2.copy$default$1(), params2.copy$default$2(), params2.copy$default$3(), params2.copy$default$4(), params2.copy$default$5(), i, params2.copy$default$7(), params2.copy$default$8(), params2.copy$default$9(), params2.copy$default$10());
            }

            // Used by the "checkpointInterval" option: replaces argument 10.
            public static final /* synthetic */ LDAExample.Params $anonfun$new$9(int i, LDAExample.Params params2) {
                return params2.copy(params2.copy$default$1(), params2.copy$default$2(), params2.copy$default$3(), params2.copy$default$4(), params2.copy$default$5(), params2.copy$default$6(), params2.copy$default$7(), params2.copy$default$8(), params2.copy$default$9(), i);
            }

            // Instance initializer: registers the help header, every named option, and the
            // positional <input> argument. Help texts embed the defaults read from `params`.
            {
                super("LDAExample");
                head(Predef$.MODULE$.wrapRefArray(new String[]{"LDAExample: an example LDA app for plain text data."}));
                opt("k", Read$.MODULE$.intRead()).text(new StringBuilder(27).append("number of topics. default: ").append(params.k()).toString()).action((obj, params2) -> {
                    return $anonfun$new$1(BoxesRunTime.unboxToInt(obj), params2);
                });
                opt("maxIterations", Read$.MODULE$.intRead()).text(new StringBuilder(43).append("number of iterations of learning. default: ").append(params.maxIterations()).toString()).action((obj2, params3) -> {
                    return $anonfun$new$2(BoxesRunTime.unboxToInt(obj2), params3);
                });
                opt("docConcentration", Read$.MODULE$.doubleRead()).text(new StringBuilder(62).append("amount of topic smoothing to use (> 1.0) (-1=auto).").append("  default: ").append(params.docConcentration()).toString()).action((obj3, params4) -> {
                    return $anonfun$new$3(BoxesRunTime.unboxToDouble(obj3), params4);
                });
                opt("topicConcentration", Read$.MODULE$.doubleRead()).text(new StringBuilder(68).append("amount of term (word) smoothing to use (> 1.0) (-1=auto).").append("  default: ").append(params.topicConcentration()).toString()).action((obj4, params5) -> {
                    return $anonfun$new$4(BoxesRunTime.unboxToDouble(obj4), params5);
                });
                opt("vocabSize", Read$.MODULE$.intRead()).text(new StringBuilder(78).append("number of distinct word types to use, chosen by frequency. (-1=all)").append("  default: ").append(params.vocabSize()).toString()).action((obj5, params6) -> {
                    return $anonfun$new$5(BoxesRunTime.unboxToInt(obj5), params6);
                });
                // String-valued options copy the parsed value straight into argument 7 / 8 / 9.
                opt("stopwordFile", Read$.MODULE$.stringRead()).text(new StringBuilder(85).append("filepath for a list of stopwords. Note: This must fit on a single machine.").append("  default: ").append(params.stopwordFile()).toString()).action((str, params7) -> {
                    return params7.copy(params7.copy$default$1(), params7.copy$default$2(), params7.copy$default$3(), params7.copy$default$4(), params7.copy$default$5(), params7.copy$default$6(), str, params7.copy$default$8(), params7.copy$default$9(), params7.copy$default$10());
                });
                opt("algorithm", Read$.MODULE$.stringRead()).text(new StringBuilder(66).append("inference algorithm to use. em and online are supported.").append(" default: ").append(params.algorithm()).toString()).action((str2, params8) -> {
                    return params8.copy(params8.copy$default$1(), params8.copy$default$2(), params8.copy$default$3(), params8.copy$default$4(), params8.copy$default$5(), params8.copy$default$6(), params8.copy$default$7(), str2, params8.copy$default$9(), params8.copy$default$10());
                });
                // The checkpoint directory is stored wrapped in Some(...).
                opt("checkpointDir", Read$.MODULE$.stringRead()).text(new StringBuilder(143).append("Directory for checkpointing intermediate results.").append("  Checkpointing helps with recovery and eliminates temporary shuffle files on disk.").append("  default: ").append(params.checkpointDir()).toString()).action((str3, params9) -> {
                    return params9.copy(params9.copy$default$1(), params9.copy$default$2(), params9.copy$default$3(), params9.copy$default$4(), params9.copy$default$5(), params9.copy$default$6(), params9.copy$default$7(), params9.copy$default$8(), new Some(str3), params9.copy$default$10());
                });
                opt("checkpointInterval", Read$.MODULE$.intRead()).text(new StringBuilder(81).append("Iterations between each checkpoint.  Only used if checkpointDir is set.").append(" default: ").append(params.checkpointInterval()).toString()).action((obj6, params10) -> {
                    return $anonfun$new$9(BoxesRunTime.unboxToInt(obj6), params10);
                });
                // Required, repeatable positional argument: each value is appended to params.input().
                arg("<input>...", Read$.MODULE$.stringRead()).text("input paths (directories) to plain text corpora.  Each text file line should hold 1 document.").unbounded().required().action((str4, params11) -> {
                    return params11.copy((Seq) params11.input().$colon$plus(str4, Seq$.MODULE$.canBuildFrom()), params11.copy$default$2(), params11.copy$default$3(), params11.copy$default$4(), params11.copy$default$5(), params11.copy$default$6(), params11.copy$default$7(), params11.copy$default$8(), params11.copy$default$9(), params11.copy$default$10());
                });
            }
        }.parse(Predef$.MODULE$.wrapRefArray(strArr), params);
        // Any result other than Some(...) is treated as a parse failure: exit with status 1.
        if (!(parse instanceof Some)) {
            throw package$.MODULE$.exit(1);
        }
        run((LDAExample.Params) parse.value());
        BoxedUnit boxedUnit = BoxedUnit.UNIT;
    }

    /**
     * Trains an LDA model on the corpus described by {@code params} and prints a summary.
     *
     * <p>Steps: create a {@code SparkContext}, preprocess the input into documents and a
     * vocabulary, print corpus statistics, configure {@code LDA} with the requested optimizer and
     * hyper-parameters, train, then print the 10 top-weighted terms of every topic.
     *
     * <p>The {@code SparkContext} is stopped in a {@code finally} block, so it is released even
     * when preprocessing, optimizer selection, or training throws.
     *
     * @param params parsed command-line parameters
     * @throws IllegalArgumentException if {@code params.algorithm()}, lower-cased with
     *     {@code Locale.ROOT}, is neither {@code "em"} nor {@code "online"}
     */
    private void run(LDAExample.Params params) {
        SparkContext sparkContext = new SparkContext(new SparkConf().setAppName(new StringBuilder(16).append("LDAExample with ").append(params).toString()));
        try {
            Logger.getRootLogger().setLevel(Level.WARN);

            // Load the documents and prepare them for LDA, timing the whole preprocessing step.
            long preprocessStart = System.nanoTime();
            Tuple3<RDD<Tuple2<Object, Vector>>, String[], Object> preprocess = preprocess(sparkContext, params.input(), params.vocabSize(), params.stopwordFile());
            if (preprocess == null) {
                throw new MatchError(preprocess);
            }
            RDD rdd = (RDD) preprocess._1();
            String[] vocabArray = (String[]) preprocess._2();
            long tokenCount = BoxesRunTime.unboxToLong(preprocess._3());
            // The corpus is counted here and then used for training, so keep it cached.
            rdd.cache();
            long count = rdd.count();
            int vocabSize = vocabArray.length;
            double preprocessSeconds = (System.nanoTime() - preprocessStart) / 1.0E9d;

            Predef$.MODULE$.println();
            Predef$.MODULE$.println("Corpus summary:");
            Predef$.MODULE$.println(new StringBuilder(31).append("\t Training set size: ").append(count).append(" documents").toString());
            Predef$.MODULE$.println(new StringBuilder(25).append("\t Vocabulary size: ").append(vocabSize).append(" terms").toString());
            Predef$.MODULE$.println(new StringBuilder(28).append("\t Training set size: ").append(tokenCount).append(" tokens").toString());
            Predef$.MODULE$.println(new StringBuilder(26).append("\t Preprocessing time: ").append(preprocessSeconds).append(" sec").toString());
            Predef$.MODULE$.println();

            // Choose the optimizer from the (case-insensitive) algorithm name.
            // NOTE(review): the declared type EMLDAOptimizer is what the decompiler emitted; the
            // "online" branch assigns an OnlineLDAOptimizer, so the intended static type is
            // presumably a common supertype of both - confirm against the Scala source.
            LDA lda = new LDA();
            EMLDAOptimizer optimizer;
            String algorithm = params.algorithm().toLowerCase(Locale.ROOT);
            if ("em".equals(algorithm)) {
                optimizer = new EMLDAOptimizer();
            } else if ("online".equals(algorithm)) {
                // The 1/count term keeps the mini-batch fraction above 0.05 for small corpora.
                optimizer = new OnlineLDAOptimizer().setMiniBatchFraction(0.05d + (1.0d / count));
            } else {
                throw new IllegalArgumentException(new StringBuilder(39).append("Only em, online are supported but got ").append(params.algorithm()).append(".").toString());
            }
            lda.setOptimizer(optimizer).setK(params.k()).setMaxIterations(params.maxIterations()).setDocConcentration(params.docConcentration()).setTopicConcentration(params.topicConcentration()).setCheckpointInterval(params.checkpointInterval());
            if (params.checkpointDir().nonEmpty()) {
                sparkContext.setCheckpointDir((String) params.checkpointDir().get());
            }

            // Train and report the wall-clock training time.
            long trainStart = System.nanoTime();
            DistributedLDAModel ldaModel = lda.run(rdd);
            double trainSeconds = (System.nanoTime() - trainStart) / 1.0E9d;
            Predef$.MODULE$.println("Finished training LDA model.  Summary:");
            Predef$.MODULE$.println(new StringBuilder(21).append("\t Training time: ").append(trainSeconds).append(" sec").toString());
            if (ldaModel instanceof DistributedLDAModel) {
                Predef$.MODULE$.println(new StringBuilder(40).append("\t Training data average log likelihood: ").append(ldaModel.logLikelihood() / count).toString());
                Predef$.MODULE$.println();
            }

            // For each topic, pair its top-10 term indices with their weights and translate the
            // indices into vocabulary words. The two lambda parameters have distinct names
            // because Java rejects a nested lambda parameter that redeclares an enclosing one.
            Tuple2[][] topics = (Tuple2[][]) new ArrayOps.ofRef(Predef$.MODULE$.refArrayOps(ldaModel.describeTopics(10))).map(topic -> {
                if (topic == null) {
                    throw new MatchError(topic);
                }
                return (Tuple2[]) new ArrayOps.ofRef(Predef$.MODULE$.refArrayOps((Object[]) new ArrayOps.ofInt(Predef$.MODULE$.intArrayOps((int[]) topic._1())).zip(Predef$.MODULE$.wrapDoubleArray((double[]) topic._2()), Array$.MODULE$.canBuildFrom(ClassTag$.MODULE$.apply(Tuple2.class))))).map(termWeight -> {
                    if (termWeight == null) {
                        throw new MatchError(termWeight);
                    }
                    return new Tuple2(vocabArray[termWeight._1$mcI$sp()], BoxesRunTime.boxToDouble(termWeight._2$mcD$sp()));
                }, Array$.MODULE$.canBuildFrom(ClassTag$.MODULE$.apply(Tuple2.class)));
            }, Array$.MODULE$.canBuildFrom(ClassTag$.MODULE$.apply(ScalaRunTime$.MODULE$.arrayClass(Tuple2.class))));

            Predef$.MODULE$.println(new StringBuilder(8).append(params.k()).append(" topics:").toString());
            new ArrayOps.ofRef(Predef$.MODULE$.refArrayOps((Object[]) new ArrayOps.ofRef(Predef$.MODULE$.refArrayOps(topics)).zipWithIndex(Array$.MODULE$.canBuildFrom(ClassTag$.MODULE$.apply(Tuple2.class))))).foreach(indexedTopic -> {
                $anonfun$run$3(indexedTopic);
                return BoxedUnit.UNIT;
            });
        } finally {
            // Always release the context, including on failure paths.
            sparkContext.stop();
        }
    }

    /**
     * Loads the text corpus and converts it into the inputs that {@link #run} feeds to LDA.
     *
     * <p>The input paths are joined with commas and read as a single-column ("docs") DataFrame.
     * A three-stage pipeline (RegexTokenizer, StopWordsRemover, CountVectorizer) is fitted on it
     * and used to produce a "features" column, which is then converted to mllib vectors.
     *
     * @param sparkContext context used to read the input and the optional stop-word file
     * @param seq input paths; joined with "," before being passed to {@code textFile}
     * @param i vocabulary size passed to {@code CountVectorizer.setVocabSize}
     * @param str path of a stop-word file; an empty string means no extra stop words
     * @return a triple of (RDD of (index, vector) pairs, the vocabulary taken from pipeline
     *     stage 2, the sum of {@code numActives()} over all document vectors as a boxed long)
     */
    private Tuple3<RDD<Tuple2<Object, Vector>>, String[], Object> preprocess(SparkContext sparkContext, Seq<String> seq, int i, String str) {
        SparkSession orCreate = SparkSession$.MODULE$.builder().sparkContext(sparkContext).getOrCreate();
        // One row per line of input text, in a single column named "docs".
        Dataset df = orCreate.implicits().rddToDatasetHolder(sparkContext.textFile(seq.mkString(","), sparkContext.textFile$default$2()), orCreate.implicits().newStringEncoder()).toDF(Predef$.MODULE$.wrapRefArray(new String[]{"docs"}));
        // Extra stop words: none when the path is empty; otherwise the file is collected to the
        // driver and every line is split on whitespace (see $anonfun$preprocess$1).
        String[] strArr = str.isEmpty() ? (String[]) Array$.MODULE$.empty(ClassTag$.MODULE$.apply(String.class)) : (String[]) new ArrayOps.ofRef(Predef$.MODULE$.refArrayOps((String[]) sparkContext.textFile(str, sparkContext.textFile$default$2()).collect())).flatMap(str2 -> {
            return new ArrayOps.ofRef($anonfun$preprocess$1(str2));
        }, Array$.MODULE$.canBuildFrom(ClassTag$.MODULE$.apply(String.class)));
        PipelineStage pipelineStage = (RegexTokenizer) new RegexTokenizer().setInputCol("docs").setOutputCol("rawTokens");
        PipelineStage outputCol = new StopWordsRemover().setInputCol("rawTokens").setOutputCol("tokens");
        // Extend the remover's current stop-word list with the words read from the file.
        outputCol.setStopWords((String[]) new ArrayOps.ofRef(Predef$.MODULE$.refArrayOps(outputCol.getStopWords())).$plus$plus(new ArrayOps.ofRef(Predef$.MODULE$.refArrayOps(strArr)), Array$.MODULE$.canBuildFrom(ClassTag$.MODULE$.apply(String.class))));
        PipelineModel fit = new Pipeline().setStages(new PipelineStage[]{pipelineStage, outputCol, new CountVectorizer().setVocabSize(i).setInputCol("tokens").setOutputCol("features")}).fit(df);
        // Each row must hold exactly one ml Vector, which is converted to an mllib Vector; any
        // other row shape raises MatchError. zipWithIndex().map(swap) then puts the index first.
        RDD map = fit.transform(df).select("features", Predef$.MODULE$.wrapRefArray(new String[0])).rdd().map(row -> {
            Some unapplySeq = Row$.MODULE$.unapplySeq(row);
            if (!unapplySeq.isEmpty() && unapplySeq.get() != null && ((SeqLike) unapplySeq.get()).lengthCompare(1) == 0) {
                Object apply = ((SeqLike) unapplySeq.get()).apply(0);
                if (apply instanceof org.apache.spark.ml.linalg.Vector) {
                    return Vectors$.MODULE$.fromML((org.apache.spark.ml.linalg.Vector) apply);
                }
            }
            throw new MatchError(row);
        }, ClassTag$.MODULE$.apply(Vector.class)).zipWithIndex().map(tuple2 -> {
            return tuple2.swap();
        }, ClassTag$.MODULE$.apply(Tuple2.class));
        // Third element: per-document numActives() summed as a double, then narrowed to long.
        // NOTE(review): stages()[2] is the fitted CountVectorizer stage; the cast that makes
        // vocabulary() resolvable appears to have been dropped by the decompiler - confirm.
        return new Tuple3<>(map, fit.stages()[2].vocabulary(), BoxesRunTime.boxToLong((long) RDD$.MODULE$.numericRDDToDoubleRDDFunctions(map.map(tuple22 -> {
            return BoxesRunTime.boxToInteger($anonfun$preprocess$4(tuple22));
        }, ClassTag$.MODULE$.Int()), Numeric$IntIsIntegral$.MODULE$).sum()));
    }

    /**
     * Prints one line of the form {@code term<TAB>weight} for a (term, weight) pair.
     *
     * @param tuple2 pair whose first element is the term and whose second is its weight
     * @throws MatchError if {@code tuple2} is null
     */
    public static final /* synthetic */ void $anonfun$run$4(Tuple2 tuple2) {
        if (tuple2 != null) {
            String term = (String) tuple2._1();
            StringBuilder line = new StringBuilder(1);
            line.append(term).append("\t").append(tuple2._2$mcD$sp());
            Predef$.MODULE$.println(line.toString());
            return;
        }
        throw new MatchError(tuple2);
    }

    /**
     * Prints one topic: a {@code TOPIC <index>} header, one line per weighted term, then a blank
     * line.
     *
     * @param tuple2 pair of (array of (term, weight) pairs, topic index)
     * @throws MatchError if {@code tuple2} is null
     */
    public static final /* synthetic */ void $anonfun$run$3(Tuple2 tuple2) {
        if (tuple2 != null) {
            Tuple2[] weightedTerms = (Tuple2[]) tuple2._1();
            int topicIndex = tuple2._2$mcI$sp();
            Predef$.MODULE$.println(new StringBuilder(6).append("TOPIC ").append(topicIndex).toString());
            // Terms are printed in array order, one per line.
            for (Tuple2 weightedTerm : weightedTerms) {
                $anonfun$run$4(weightedTerm);
            }
            Predef$.MODULE$.println();
            return;
        }
        throw new MatchError(tuple2);
    }

    /**
     * Splits one line of the stop-word file into words.
     *
     * @param str a line of text
     * @return the result of applying {@code stripMargin} to the line and splitting it on runs of
     *     whitespace
     */
    public static final /* synthetic */ Object[] $anonfun$preprocess$1(String str) {
        StringOps line = new StringOps(Predef$.MODULE$.augmentString(str));
        String[] words = line.stripMargin().split("\\s+");
        return Predef$.MODULE$.refArrayOps(words);
    }

    /**
     * Returns {@code numActives()} of the vector stored as the second element of the pair.
     *
     * @param tuple2 pair whose second element is a {@code Vector}
     * @return that vector's {@code numActives()} value
     */
    public static final /* synthetic */ int $anonfun$preprocess$4(Tuple2 tuple2) {
        Vector termCounts = (Vector) tuple2._2();
        return termCounts.numActives();
    }

    /** Private constructor: stores the newly created instance in {@code MODULE$}. */
    private LDAExample$() {
        MODULE$ = this;
    }
}
