package co.cask.cdap.examples.wikipedia;

import co.cask.cdap.api.dataset.table.Table;
import co.cask.cdap.api.spark.ScalaSparkProgram;
import co.cask.cdap.api.workflow.Value;
import co.cask.cdap.api.workflow.WorkflowToken;
import java.util.Map;
import org.apache.spark.Accumulator;
import org.apache.spark.AccumulatorParam$IntAccumulatorParam$;
import org.apache.spark.SparkContext;
import org.apache.spark.mllib.clustering.LDA;
import org.apache.spark.mllib.clustering.LDAModel;
import org.apache.spark.mllib.linalg.Vector;
import org.apache.spark.rdd.NewHadoopRDD;
import org.apache.spark.rdd.RDD;
import org.apache.spark.rdd.RDD$;
import scala.Array$;
import scala.MatchError;
import scala.Predef$;
import scala.Serializable;
import scala.Tuple2;
import scala.Tuple3;
import scala.collection.immutable.Set;
import scala.collection.immutable.StringOps;
import scala.math.Numeric$LongIsIntegral$;
import scala.math.Ordering$Long$;
import scala.math.Ordering$String$;
import scala.reflect.ClassTag$;
import scala.reflect.ScalaSignature;
import scala.runtime.BoxesRunTime;
import scala.runtime.ScalaRunTime$;
import scala.util.matching.Regex;

/* compiled from: ScalaSparkLDA.scala */
@ScalaSignature(bytes = "\u0006\u0001\u0005]d\u0001B\u0001\u0003\u00015\u0011QbU2bY\u0006\u001c\u0006/\u0019:l\u0019\u0012\u000b%BA\u0002\u0005\u0003%9\u0018n[5qK\u0012L\u0017M\u0003\u0002\u0006\r\u0005AQ\r_1na2,7O\u0003\u0002\b\u0011\u0005!1\rZ1q\u0015\tI!\"\u0001\u0003dCN\\'\"A\u0006\u0002\u0005\r|7\u0001A\n\u0004\u000191\u0002CA\b\u0015\u001b\u0005\u0001\"BA\t\u0013\u0003\u0011a\u0017M\\4\u000b\u0003M\tAA[1wC&\u0011Q\u0003\u0005\u0002\u0007\u001f\nTWm\u0019;\u0011\u0005]aR\"\u0001\r\u000b\u0005eQ\u0012!B:qCJ\\'BA\u000e\u0007\u0003\r\t\u0007/[\u0005\u0003;a\u0011\u0011cU2bY\u0006\u001c\u0006/\u0019:l!J|wM]1n\u0011\u0015y\u0002\u0001\"\u0001!\u0003\u0019a\u0014N\\5u}Q\t\u0011\u0005\u0005\u0002#\u00015\t!\u0001C\u0003%\u0001\u0011\u0005S%A\u0002sk:$\"A\n\u0017\u0011\u0005\u001dRS\"\u0001\u0015\u000b\u0003%\nQa]2bY\u0006L!a\u000b\u0015\u0003\tUs\u0017\u000e\u001e\u0005\u0006[\r\u0002\rAL\u0001\bG>tG/\u001a=u!\t9r&\u0003\u000211\ta1\u000b]1sW\u000e{g\u000e^3yi\")!\u0007\u0001C\u0005g\u0005Q\u0001O]3Qe>\u001cWm]:\u0015\tQR6m\u001b\t\u0006OU:\u0004+R\u0005\u0003m!\u0012a\u0001V;qY\u0016\u001c\u0004c\u0001\u001dA\u00056\t\u0011H\u0003\u0002;w\u0005\u0019!\u000f\u001a3\u000b\u0005ea$BA\u001f?\u0003\u0019\t\u0007/Y2iK*\tq(A\u0002pe\u001eL!!Q\u001d\u0003\u0007I#E\t\u0005\u0003(\u0007\u0016C\u0015B\u0001#)\u0005\u0019!V\u000f\u001d7feA\u0011qER\u0005\u0003\u000f\"\u0012A\u0001T8oOB\u0011\u0011JT\u0007\u0002\u0015*\u00111\nT\u0001\u0007Y&t\u0017\r\\4\u000b\u00055[\u0014!B7mY&\u0014\u0017BA(K\u0005\u00191Vm\u0019;peB\u0019q%U*\n\u0005IC#!B!se\u0006L\bC\u0001+X\u001d\t9S+\u0003\u0002WQ\u00051\u0001K]3eK\u001aL!\u0001W-\u0003\rM#(/\u001b8h\u0015\t1\u0006\u0006C\u0003\\c\u0001\u0007A,A\u000bo_Jl\u0017\r\\5{K\u0012<\u0016n[5ECR\f7/\u001a;\u0011\tajvlX\u0005\u0003=f\u0012ABT3x\u0011\u0006$wn\u001c9S\t\u0012\u00032aJ)a!\t9\u0013-\u0003\u0002cQ\t!!)\u001f;f\u0011\u0015!\u0017\u00071\u0001f\u0003%\t'oZ;nK:$8\u000f\u0005\u0003gSN\u001bV\"A4\u000b\u0005!\u0014\u0012\u0001B;uS2L!A[4\u000
3\u00075\u000b\u0007\u000fC\u0003mc\u0001\u0007Q.\u0001\u0002tGB\u0011an\\\u0007\u0002w%\u0011\u0001g\u000f\u0005\u0006c\u0002!IA]\u0001\u0007eVtG\nR!\u0015\u0007ML8\u0010\u0005\u0002uo6\tQO\u0003\u0002w\u0019\u0006Q1\r\\;ti\u0016\u0014\u0018N\\4\n\u0005a,(\u0001\u0003'E\u00036{G-\u001a7\t\u000bi\u0004\b\u0019A\u001c\u0002\r\r|'\u000f];t\u0011\u0015!\u0007\u000f1\u0001f\r\u0011i\b\u0001\u0002@\u0003\u001fMKW\u000e\u001d7f)>\\WM\\5{KJ\u001cB\u0001`@\u0002\u0006A\u0019q%!\u0001\n\u0007\u0005\r\u0001F\u0001\u0004B]f\u0014VM\u001a\t\u0004O\u0005\u001d\u0011bAA\u0005Q\ta1+\u001a:jC2L'0\u00192mK\"AA\u000e B\u0001B\u0003%Q\u000eC\u0005\u0002\u0010q\u0014\t\u0011)A\u0005'\u0006a1\u000f^8qo>\u0014HMR5mK\"1q\u0004 C\u0001\u0003'!b!!\u0006\u0002\u001a\u0005m\u0001cAA\fy6\t\u0001\u0001\u0003\u0004m\u0003#\u0001\r!\u001c\u0005\b\u0003\u001f\t\t\u00021\u0001T\u0011%\ty\u0002 b\u0001\n\u0013\t\t#A\u0005ti>\u0004xo\u001c:egV\u0011\u00111\u0005\t\u0005)\u0006\u00152+C\u0002\u0002(e\u00131aU3u\u0011!\tY\u0003 Q\u0001\n\u0005\r\u0012AC:u_B<xN\u001d3tA!I\u0011q\u0006?C\u0002\u0013%\u0011\u0011G\u0001\rC2dwk\u001c:e%\u0016<W\r_\u000b\u0003\u0003g\u0001B!!\u000e\u0002>5\u0011\u0011q\u0007\u0006\u0005\u0003s\tY$\u0001\u0005nCR\u001c\u0007.\u001b8h\u0015\tA\u0007&\u0003\u0003\u0002@\u0005]\"!\u0002*fO\u0016D\b\u0002CA\"y\u0002\u0006I!a\r\u0002\u001b\u0005dGnV8sIJ+w-\u001a=!\u0011%\t9\u0005 b\u0001\n\u0013\tI%A\u0007nS:<vN\u001d3MK:<G\u000f[\u000b\u0003\u0003\u0017\u00022aJA'\u0013\r\ty\u0005\u000b\u0002\u0004\u0013:$\b\u0002CA*y\u0002\u0006I!a\u0013\u0002\u001d5LgnV8sI2+gn\u001a;iA!9\u0011q\u000b?\u0005\u0002\u0005e\u0013\u0001C4fi^{'\u000fZ:\u0015\t\u0005m\u00131\u000f\t\u0006\u0003;\nig\u0015\b\u0005\u0003?\nIG\u0004\u0003\u0002b\u0005\u001dTBAA2\u0015\r\t)\u0007D\u0001\u0007yI|w\u000e\u001e \n\u0003%J1!a\u001b)\u0003\u001d\u0001\u0018mY6bO\u0016LA!a\u001c\u0002r\tQ\u0011J\u001c3fq\u0016$7+Z9\u000b\u0007\u0005-\u0004\u0006C\u0004\u0002v\u0005U\u0003\u0019A*\u0002\tQ,\u0007\u0010\u001e")
/* loaded from: input_file:co/cask/cdap/examples/wikipedia/ScalaSparkLDA.class */
public class ScalaSparkLDA implements ScalaSparkProgram {

    /* compiled from: ScalaSparkLDA.scala */
    /* loaded from: input_file:co/cask/cdap/examples/wikipedia/ScalaSparkLDA$SimpleTokenizer.class */
    public class SimpleTokenizer implements Serializable {
        private final Set<String> stopwords;
        private final Regex allWordRegex;
        private final int minWordLength;
        public final /* synthetic */ ScalaSparkLDA $outer;

        private Set<String> stopwords() {
            return this.stopwords;
        }

        private Regex allWordRegex() {
            return this.allWordRegex;
        }

        private int minWordLength() {
            return this.minWordLength;
        }

        /* JADX WARN: Can't wrap try/catch for region: R(9:4|(2:10|(6:14|15|16|17|19|20))|24|15|16|17|19|20|2) */
        /* JADX WARN: Code restructure failed: missing block: B:22:0x0022, code lost:
        
            r9 = -1;
         */
        /*
            Code decompiled incorrectly, please refer to instructions dump.
            To view partially-correct add '--show-bad-code' argument
        */
        public scala.collection.IndexedSeq<java.lang.String> getWords(java.lang.String r5) {
            /*
                r4 = this;
                scala.collection.mutable.ArrayBuffer r0 = new scala.collection.mutable.ArrayBuffer
                r1 = r0
                r1.<init>()
                r6 = r0
                java.text.BreakIterator r0 = java.text.BreakIterator.getWordInstance()
                r7 = r0
                r0 = r7
                r1 = r5
                r0.setText(r1)
                r0 = r7
                int r0 = r0.first()
                r8 = r0
                r0 = r7
                int r0 = r0.next()
                r9 = r0
                goto L25
            L20:
                r15 = move-exception
                r0 = -1
                r9 = r0
            L25:
                r0 = r9
                r1 = -1
                if (r0 == r1) goto Lb6
                r0 = r5
                r1 = r8
                r2 = r9
                java.lang.String r0 = r0.substring(r1, r2)
                java.lang.String r0 = r0.toLowerCase()
                r10 = r0
                r0 = r10
                r11 = r0
                r0 = r4
                scala.util.matching.Regex r0 = r0.allWordRegex()
                r1 = r11
                scala.Option r0 = r0.unapplySeq(r1)
                r12 = r0
                r0 = r12
                boolean r0 = r0.isEmpty()
                if (r0 != 0) goto La1
                r0 = r12
                java.lang.Object r0 = r0.get()
                if (r0 == 0) goto La1
                r0 = r12
                java.lang.Object r0 = r0.get()
                scala.collection.LinearSeqOptimized r0 = (scala.collection.LinearSeqOptimized) r0
                r1 = 1
                int r0 = r0.lengthCompare(r1)
                r1 = 0
                if (r0 != r1) goto La1
                r0 = r12
                java.lang.Object r0 = r0.get()
                scala.collection.LinearSeqOptimized r0 = (scala.collection.LinearSeqOptimized) r0
                r1 = 0
                java.lang.Object r0 = r0.apply(r1)
                java.lang.String r0 = (java.lang.String) r0
                r13 = r0
                r0 = r13
                int r0 = r0.length()
                r1 = r4
                int r1 = r1.minWordLength()
                if (r0 < r1) goto La1
                r0 = r4
                scala.collection.immutable.Set r0 = r0.stopwords()
                r1 = r13
                boolean r0 = r0.contains(r1)
                if (r0 != 0) goto La1
                r0 = r6
                r1 = r13
                scala.collection.mutable.ArrayBuffer r0 = r0.$plus$eq(r1)
                r14 = r0
                goto La6
            La1:
                scala.runtime.BoxedUnit r0 = scala.runtime.BoxedUnit.UNIT
                r14 = r0
            La6:
                r0 = r14
                r0 = r9
                r8 = r0
                r0 = r7
                int r0 = r0.next()     // Catch: java.lang.Exception -> L20
                r9 = r0
                goto L25
            Lb6:
                r0 = r6
                return r0
            */
            throw new UnsupportedOperationException("Method not decompiled: co.cask.cdap.examples.wikipedia.ScalaSparkLDA.SimpleTokenizer.getWords(java.lang.String):scala.collection.IndexedSeq");
        }

        public /* synthetic */ ScalaSparkLDA co$cask$cdap$examples$wikipedia$ScalaSparkLDA$SimpleTokenizer$$$outer() {
            return this.$outer;
        }

        public SimpleTokenizer(ScalaSparkLDA scalaSparkLDA, SparkContext sparkContext, String str) {
            Set<String> set;
            if (scalaSparkLDA == null) {
                throw new NullPointerException();
            }
            this.$outer = scalaSparkLDA;
            if (str.isEmpty()) {
                set = Predef$.MODULE$.Set().empty();
            } else {
                set = Predef$.MODULE$.refArrayOps((Object[]) Predef$.MODULE$.refArrayOps((String[]) sparkContext.textFile(str, sparkContext.textFile$default$2()).collect()).flatMap(new ScalaSparkLDA$SimpleTokenizer$$anonfun$11(this), Array$.MODULE$.canBuildFrom(ClassTag$.MODULE$.apply(String.class)))).toSet();
            }
            this.stopwords = set;
            this.allWordRegex = new StringOps(Predef$.MODULE$.augmentString("^(\\p{L}*)$")).r();
            this.minWordLength = 3;
        }
    }

    /**
     * Entry point of the Spark program.
     *
     * <p>Reads the normalized Wikipedia dataset, builds the corpus with {@code preProcess},
     * trains a model with {@code runLDA}, and processes the top 10 terms of every topic
     * together with the output table and two accumulators. If a workflow token is
     * available, the accumulator values are published under {@code num.records},
     * {@code highest.score.term} and {@code highest.score.value}.
     *
     * @param sparkContext CDAP Spark context giving access to datasets, runtime arguments
     *                     and the underlying Spark context
     */
    @Override // co.cask.cdap.api.spark.SparkProgram
    public void run(co.cask.cdap.api.spark.SparkContext sparkContext) {
        Map<String, String> arguments = sparkContext.getRuntimeArguments();
        NewHadoopRDD<byte[], byte[]> normalizedWiki =
            (NewHadoopRDD) sparkContext.readFromDataset(WikipediaPipelineApp.NORMALIZED_WIKIPEDIA_DATASET, byte[].class, byte[].class);
        SparkContext nativeContext = (SparkContext) sparkContext.getOriginalSparkContext();

        Tuple3<RDD<Tuple2<Object, Vector>>, String[], Object> preProcessed = preProcess(normalizedWiki, arguments, nativeContext);
        if (preProcessed == null) {
            throw new MatchError(preProcessed);
        }
        // Re-wrapping mirrors the compiled Scala tuple destructuring; the third element is
        // unboxed but not used afterwards.
        Tuple3 destructured = new Tuple3(
            (RDD) preProcessed._1(),
            (String[]) preProcessed._2(),
            BoxesRunTime.boxToLong(BoxesRunTime.unboxToLong(preProcessed._3())));
        RDD<Tuple2<Object, Vector>> corpus = (RDD) destructured._1();
        String[] vocabulary = (String[]) destructured._2();
        BoxesRunTime.unboxToLong(destructured._3());

        corpus.cache();
        LDAModel model = runLDA(corpus, arguments);
        Table outputTable = (Table) sparkContext.getDataset(WikipediaPipelineApp.SPARK_LDA_OUTPUT_DATASET);

        // anonfun$3 is not visible here; presumably it resolves term indices against the
        // vocabulary array - confirm.
        Tuple2[][] topics = (Tuple2[][]) Predef$.MODULE$.refArrayOps(model.describeTopics(10)).map(new ScalaSparkLDA$$anonfun$3(this, vocabulary), Array$.MODULE$.canBuildFrom(ClassTag$.MODULE$.apply(ScalaRunTime$.MODULE$.arrayClass(Tuple2.class))));

        Accumulator recordCounter = nativeContext.accumulator(BoxesRunTime.boxToInteger(0), "num.records", AccumulatorParam$IntAccumulatorParam$.MODULE$);
        Accumulator topTerm = nativeContext.accumulator(new Term("", 0.0d), "highest.score", HighestAccumulatorParam$.MODULE$);

        // anonfun$run$1 receives the table and both accumulators; presumably it writes each
        // topic to the table and updates the accumulators - confirm.
        Predef$.MODULE$.refArrayOps((Object[]) Predef$.MODULE$.refArrayOps(topics).zipWithIndex(Array$.MODULE$.canBuildFrom(ClassTag$.MODULE$.apply(Tuple2.class)))).foreach(new ScalaSparkLDA$$anonfun$run$1(this, outputTable, recordCounter, topTerm));

        WorkflowToken token = sparkContext.getWorkflowToken();
        if (token == null) {
            return;
        }
        token.put("num.records", Value.of(BoxesRunTime.unboxToInt(recordCounter.value())));
        token.put("highest.score.term", ((Term) topTerm.value()).name());
        token.put("highest.score.value", String.valueOf(((Term) topTerm.value()).weight()));
    }

    /**
     * Builds the LDA input from the raw dataset.
     *
     * <p>Runtime arguments read: {@code stopwords.file} (default: empty string) and
     * {@code vocab.size} (default: 1000; {@code -1} skips the size-limited branch).
     *
     * @param newHadoopRDD raw key/value records of the normalized Wikipedia dataset
     * @param map          runtime arguments
     * @param sparkContext Spark context, passed to the tokenizer for reading the stopword file
     * @return a triple of (mapped document RDD, array sized to the vocabulary map and filled by
     *         a closure not visible here, sum computed over the selected terms)
     * @throws MatchError if the intermediate (vocabulary, sum) pair cannot be destructured
     */
    private Tuple3<RDD<Tuple2<Object, Vector>>, String[], Object> preProcess(NewHadoopRDD<byte[], byte[]> newHadoopRDD, Map<String, String> map, SparkContext sparkContext) {
        String stopwordFile;
        if (map.containsKey("stopwords.file")) {
            stopwordFile = map.get("stopwords.file");
        } else {
            stopwordFile = "";
        }

        int vocabSize;
        if (map.containsKey("vocab.size")) {
            vocabSize = new StringOps(Predef$.MODULE$.augmentString(map.get("vocab.size"))).toInt();
        } else {
            vocabSize = 1000;
        }

        // anonfun$4 receives the tokenizer; presumably it tokenizes each indexed record - confirm.
        RDD tokenized = newHadoopRDD.zipWithIndex().map(new ScalaSparkLDA$$anonfun$4(this, new SimpleTokenizer(this, sparkContext, stopwordFile)), ClassTag$.MODULE$.apply(Tuple2.class));
        tokenized.cache();

        // Keyed by String with Long values and reduced per key; presumably per-word counts - confirm.
        RDD wordCounts = RDD$.MODULE$.rddToPairRDDFunctions(tokenized.flatMap(new ScalaSparkLDA$$anonfun$5(this), ClassTag$.MODULE$.apply(Tuple2.class)), ClassTag$.MODULE$.apply(String.class), ClassTag$.MODULE$.Long(), Ordering$String$.MODULE$).reduceByKey(new ScalaSparkLDA$$anonfun$1(this));
        wordCounts.cache();

        Tuple2[] selectedTerms;
        if (vocabSize == -1 || wordCounts.count() <= ((long) vocabSize)) {
            // No truncation needed: collect everything and sort locally.
            selectedTerms = (Tuple2[]) Predef$.MODULE$.refArrayOps((Object[]) wordCounts.collect()).sortBy(new ScalaSparkLDA$$anonfun$6(this), Ordering$Long$.MODULE$);
        } else {
            // Sort in descending order on the cluster and keep only the first vocabSize entries.
            selectedTerms = (Tuple2[]) wordCounts.sortBy(new ScalaSparkLDA$$anonfun$7(this), false, wordCounts.sortBy$default$3(), Ordering$Long$.MODULE$, ClassTag$.MODULE$.Long()).take(vocabSize);
        }

        // Map from each selected String to its position in the selected array.
        Object termToIndex = Predef$.MODULE$.refArrayOps((Object[]) Predef$.MODULE$.refArrayOps((Object[]) Predef$.MODULE$.refArrayOps(selectedTerms).map(new ScalaSparkLDA$$anonfun$8(this), Array$.MODULE$.canBuildFrom(ClassTag$.MODULE$.apply(String.class)))).zipWithIndex(Array$.MODULE$.canBuildFrom(ClassTag$.MODULE$.apply(Tuple2.class)))).toMap(Predef$.MODULE$.conforms());
        // Sum of the Long value extracted from each selected entry.
        Object selectedSum = Predef$.MODULE$.longArrayOps((long[]) Predef$.MODULE$.refArrayOps(selectedTerms).map(new ScalaSparkLDA$$anonfun$9(this), Array$.MODULE$.canBuildFrom(ClassTag$.MODULE$.Long()))).sum(Numeric$LongIsIntegral$.MODULE$);

        Tuple2 vocabAndSum = new Tuple2(termToIndex, selectedSum);
        if (vocabAndSum == null) {
            throw new MatchError(vocabAndSum);
        }
        scala.collection.immutable.Map matchedVocab = (scala.collection.immutable.Map) vocabAndSum._1();
        long matchedSum = vocabAndSum._2$mcJ$sp();
        if (matchedVocab == null) {
            throw new MatchError(vocabAndSum);
        }

        // Second wrap/unwrap mirrors the compiled Scala pattern-match destructuring.
        Tuple2 rebound = new Tuple2(matchedVocab, BoxesRunTime.boxToLong(matchedSum));
        scala.collection.immutable.Map vocab = (scala.collection.immutable.Map) rebound._1();
        long tokenSum = rebound._2$mcJ$sp();

        RDD documents = tokenized.map(new ScalaSparkLDA$$anonfun$10(this, vocab), ClassTag$.MODULE$.apply(Tuple2.class));
        String[] vocabArray = new String[vocab.size()];
        // The closure receives the array; presumably it stores each term at its index - confirm.
        vocab.foreach(new ScalaSparkLDA$$anonfun$preProcess$1(this, vocabArray));
        return new Tuple3<>(documents, vocabArray, BoxesRunTime.boxToLong(tokenSum));
    }

    /**
     * Configures and runs LDA on the given corpus.
     *
     * <p>Runtime arguments read: {@code num.topics} (default 10) and {@code max.iterations}
     * (default 10). Document and topic concentration are both set to {@code -1.0} and the
     * checkpoint interval to 10.
     *
     * @param rdd corpus to train on
     * @param map runtime arguments
     * @return the model returned by {@code LDA.run}
     */
    private LDAModel runLDA(RDD<Tuple2<Object, Vector>> rdd, Map<String, String> map) {
        int topicCount = 10;
        if (map.containsKey("num.topics")) {
            topicCount = new StringOps(Predef$.MODULE$.augmentString(map.get("num.topics"))).toInt();
        }

        int iterationLimit = 10;
        if (map.containsKey("max.iterations")) {
            iterationLimit = new StringOps(Predef$.MODULE$.augmentString(map.get("max.iterations"))).toInt();
        }

        LDA trainer = new LDA();
        trainer.setK(topicCount)
            .setMaxIterations(iterationLimit)
            .setDocConcentration(-1.0d)
            .setTopicConcentration(-1.0d)
            .setCheckpointInterval(10);
        return trainer.run(rdd);
    }
}
