package org.deeplearning4j.models.word2vec.actor;

import akka.actor.UntypedActor;
import akka.dispatch.Futures;
import akka.dispatch.OnFailure;
import akka.dispatch.OnSuccess;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.concurrent.Callable;
import java.util.concurrent.atomic.AtomicLong;
import org.apache.commons.compress.utils.IOUtils;
import org.deeplearning4j.models.word2vec.StreamWork;
import org.deeplearning4j.models.word2vec.VocabWord;
import org.deeplearning4j.models.word2vec.VocabWork;
import org.deeplearning4j.models.word2vec.wordstore.VocabCache;
import org.deeplearning4j.text.invertedindex.InvertedIndex;
import org.deeplearning4j.text.movingwindow.Util;
import org.deeplearning4j.text.tokenization.tokenizer.Tokenizer;
import org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.tartarus.snowball.ext.PorterStemmer;
import scala.concurrent.Future;

/* loaded from: input_file:org/deeplearning4j/models/word2vec/actor/VocabActor.class */
/**
 * Actor that tokenizes incoming text, updates the shared {@link VocabCache} with
 * word/document counts and records the resulting word sequence as a new document
 * in the {@link InvertedIndex}.
 *
 * <p>Two message types are handled:
 * <ul>
 *   <li>{@link VocabWork} - a sentence held as a string; tokenization and indexing
 *       run in a future on the actor context's dispatcher.</li>
 *   <li>{@link StreamWork} - an input stream; tokenization and indexing run
 *       inline on the calling thread and the stream is closed afterwards.</li>
 * </ul>
 * Any other message is passed to {@code unhandled}.
 *
 * <p>{@link #processToken} is {@code synchronized} because sentence futures may run
 * concurrently on dispatcher threads while sharing this actor's stemmer.
 */
public class VocabActor extends UntypedActor {
    private static final Logger log = LoggerFactory.getLogger(VocabActor.class);

    /** Placeholder token substituted for every word found in the stop word list. */
    private static final String STOP_TOKEN = "STOP";

    private final transient TokenizerFactory tokenizer;
    private final List<String> stopWords;
    private final AtomicLong lastUpdate;
    private final VocabCache cache;
    private final int minWordFrequency;
    private final AtomicLong numWordsEncountered;
    private final InvertedIndex index;
    private final PorterStemmer stemmer = new PorterStemmer();

    /**
     * @param tokenizerFactory    factory used to create a tokenizer per sentence or stream
     * @param vocabCache          cache receiving word counts, document counts and vocab entries
     * @param stopWords           words that are replaced by the {@code "STOP"} placeholder
     * @param lastUpdate          set to the current time (millis) whenever a work item completes
     * @param minWordFrequency    minimum cached frequency before a word is put into the vocab
     * @param numWordsEncountered running total of words added to documents by this actor
     * @param invertedIndex       index receiving one document per processed work item
     */
    public VocabActor(TokenizerFactory tokenizerFactory, VocabCache vocabCache, List<String> stopWords, AtomicLong lastUpdate, int minWordFrequency, AtomicLong numWordsEncountered, InvertedIndex invertedIndex) {
        this.tokenizer = tokenizerFactory;
        this.stopWords = stopWords;
        this.lastUpdate = lastUpdate;
        this.cache = vocabCache;
        this.minWordFrequency = minWordFrequency;
        this.numWordsEncountered = numWordsEncountered;
        this.index = invertedIndex;
    }

    /**
     * Dispatches a message to the matching handler; unknown message types are
     * reported through {@code unhandled}.
     *
     * @param message the received message
     * @throws Exception if stream processing fails
     */
    public void onReceive(Object message) throws Exception {
        if (message instanceof VocabWork) {
            handleVocabWork((VocabWork) message);
        } else if (message instanceof StreamWork) {
            handleStreamWork((StreamWork) message);
        } else {
            unhandled(message);
        }
    }

    /** Schedules asynchronous tokenization and indexing of a single sentence. */
    private void handleVocabWork(final VocabWork work) {
        final String sentence = work.getWork();
        if (sentence == null || sentence.isEmpty()) {
            return;
        }
        if (sentence.length() <= 2) {
            // Too short to tokenize: mark the work item as done without indexing anything.
            work.increment();
            lastUpdate.set(System.currentTimeMillis());
            return;
        }

        Future<Object> indexing = Futures.future(new Callable<Object>() {
            @Override
            public Object call() throws Exception {
                indexSentence(work, sentence);
                return null;
            }
        }, context().dispatcher());

        // NOTE(review): a failed sentence is only logged; work.increment() is not called
        // for it (unchanged behavior) - confirm callers do not wait on it.
        indexing.onFailure(new OnFailure() {
            public void onFailure(Throwable failure) throws Throwable {
                log.error("Failure on vocab actor ", failure);
            }
        }, context().dispatcher());

        indexing.onSuccess(new OnSuccess<Object>() {
            public void onSuccess(Object ignored) throws Throwable {
                work.increment();
                lastUpdate.set(System.currentTimeMillis());
            }
        }, context().dispatcher());
    }

    /** Tokenizes one sentence, indexes it as a new document and updates the word counter. */
    private void indexSentence(VocabWork work, String sentence) throws Exception {
        Set<String> seenInDocument = new HashSet<>();
        List<VocabWord> document = new ArrayList<>();

        Tokenizer tokens = tokenizer.create(sentence);
        while (tokens.hasMoreTokens()) {
            String token = tokens.nextToken();
            // A null or empty token ends tokenization, as in the stream path.
            if (token == null || token.isEmpty()) {
                break;
            }
            processToken(token, seenInDocument, document, work.isStem());
        }

        if (work.getLabel() != null) {
            index.addWordsToDoc(index.numDocuments(), document, work.getLabel());
        } else {
            index.addWordsToDoc(index.numDocuments(), document);
        }
        // Count only after tokenization so the document size is the real word count,
        // and add atomically because several futures may update the counter at once.
        numWordsEncountered.addAndGet(document.size());
    }

    /** Tokenizes and indexes the content of a stream on the calling thread, then closes it. */
    private void handleStreamWork(StreamWork work) throws Exception {
        InputStream stream = work.getIs();
        if (stream == null) {
            return;
        }
        if (!hasAvailableBytes(stream)) {
            // Nothing will be read from this stream, so release it instead of leaking it.
            // NOTE(review): countDown() is not invoked for skipped streams (unchanged
            // behavior) - confirm callers do not block on it.
            IOUtils.closeQuietly(stream);
            return;
        }

        Set<String> seenInDocument = new HashSet<>();
        List<VocabWord> document = new ArrayList<>();
        try {
            Tokenizer tokens = tokenizer.create(stream);
            while (tokens.hasMoreTokens()) {
                String token = tokens.nextToken();
                if (token == null || token.isEmpty()) {
                    break;
                }
                processToken(token, seenInDocument, document, false);
            }
            index.addWordsToDoc(index.numDocuments(), document);
            numWordsEncountered.addAndGet(document.size());
        } finally {
            // Close even when tokenizing or indexing throws.
            IOUtils.closeQuietly(stream);
        }
        work.countDown();
        lastUpdate.set(System.currentTimeMillis());
    }

    /** Returns true when the stream reports at least one readable byte; probe failures count as empty. */
    private static boolean hasAvailableBytes(InputStream stream) {
        try {
            return stream.available() > 0;
        } catch (Exception ignored) {
            // Best-effort probe: a stream that cannot report availability is skipped.
            return false;
        }
    }

    /**
     * Registers one token with the vocab cache and appends its {@link VocabWord}
     * to the current document.
     *
     * <p>Stop words are replaced by {@code "STOP"} before counting. The word count is
     * always incremented; the document count is incremented only the first time the
     * token is seen in {@code encountered}. A token is put into the vocab once it is
     * not yet contained and its cached frequency reaches the minimum word frequency,
     * unless it matches a stop word.
     *
     * @param token       the raw token; must not be {@code null}
     * @param encountered tokens already seen in the current document (updated in place)
     * @param document    word sequence of the current document (appended to)
     * @param stem        whether to apply the Porter stemmer to the token first
     */
    protected synchronized void processToken(String token, Set<String> encountered, List<VocabWord> document, boolean stem) {
        if (stopWords.contains(token)) {
            token = STOP_TOKEN;
        }
        if (token.isEmpty()) {
            return;
        }

        if (stem) {
            // The method-level lock already guards the shared stemmer instance.
            stemmer.setCurrent(token);
            if (stemmer.stem()) {
                String stemmed = stemmer.getCurrent();
                // Keep the unstemmed token when stemming yields nothing usable.
                if (stemmed != null && !stemmed.isEmpty()) {
                    token = stemmed;
                }
            }
        }

        cache.incrementWordCount(token);
        if (encountered.add(token)) {
            cache.incrementDocCount(token, 1);
        }

        VocabWord word;
        if (cache.hasToken(token)) {
            word = cache.tokenFor(token);
        } else {
            word = new VocabWord(1.0d, token);
            cache.addToken(word);
        }
        document.add(word);

        if (Util.matchesAnyStopWord(stopWords, token)) {
            return;
        }
        if (!cache.containsWord(token) && cache.wordFrequency(token) >= minWordFrequency) {
            word.setIndex(cache.numWords());
            cache.putVocabWord(token);
        }
    }
}
