package smile.nlp.collocation;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import smile.nlp.Bigram;
import smile.nlp.Corpus;
import smile.sort.HeapSelect;
import smile.stat.distribution.ChiSquareDistribution;

/* loaded from: input_file:libarx-3.7.1.jar:smile/nlp/collocation/BigramCollocationFinder.class */
/**
 * Scores the bigrams of a {@link Corpus} with a log-likelihood ratio statistic
 * and reports the highest-scoring ones as {@link BigramCollocation}s.
 *
 * <p>Only bigrams that occur strictly more often than the minimum frequency
 * given at construction time are scored.
 */
public class BigramCollocationFinder {
    /** Chi-square distribution with one degree of freedom; supplies the score cutoff for {@link #find(Corpus, double)}. */
    private final ChiSquareDistribution chisq = new ChiSquareDistribution(1);
    /** A bigram is considered only when its frequency is strictly greater than this value. */
    private final int minFreq;

    /**
     * Creates a finder.
     *
     * @param i the minimum frequency; bigrams whose frequency is less than or
     *          equal to this value are ignored
     */
    public BigramCollocationFinder(int i) {
        this.minFreq = i;
    }

    /**
     * Finds the highest-scoring bigram collocations in a corpus.
     *
     * @param corpus the corpus providing bigrams and their frequencies
     * @param i the maximum number of collocations to return
     * @return the selected collocations ordered from highest to lowest score;
     *         the array is shorter than {@code i} when fewer bigrams pass the
     *         frequency filter
     */
    public BigramCollocation[] find(Corpus corpus, int i) {
        BigramCollocation[] selected = new BigramCollocation[i];
        HeapSelect heapSelect = new HeapSelect(selected);
        Iterator<Bigram> bigrams = corpus.getBigrams();
        while (bigrams.hasNext()) {
            Bigram bigram = bigrams.next();
            int frequency = corpus.getBigramFrequency(bigram);
            if (frequency > this.minFreq) {
                // Scores are stored negated and flipped back below; presumably
                // HeapSelect retains the smallest elements, so negation makes it
                // keep the largest scores -- confirm against HeapSelect.
                heapSelect.add(new BigramCollocation(bigram.w1, bigram.w2, frequency, -score(corpus, bigram, frequency)));
            }
        }
        heapSelect.sort();

        // Slots that were never filled are still null (fewer than i candidates);
        // they must be skipped rather than dereferenced.
        int found = 0;
        for (BigramCollocation candidate : selected) {
            if (candidate != null) {
                found++;
            }
        }

        BigramCollocation[] result = new BigramCollocation[found];
        int out = 0;
        // Walk the sorted buffer backwards and restore the sign of each score.
        for (int idx = selected.length - 1; idx >= 0; idx--) {
            BigramCollocation candidate = selected[idx];
            if (candidate != null) {
                result[out++] = new BigramCollocation(candidate.w1(), candidate.w2(), candidate.frequency(), -candidate.score());
            }
        }
        return result;
    }

    /**
     * Finds all bigram collocations whose score exceeds a chi-square quantile.
     *
     * @param corpus the corpus providing bigrams and their frequencies
     * @param d the probability passed to the quantile function of the
     *          chi-square distribution with one degree of freedom; must lie
     *          strictly between 0 and 1
     * @return the collocations whose score is greater than that quantile,
     *         sorted by their natural ordering and then reversed
     * @throws IllegalArgumentException if {@code d <= 0} or {@code d >= 1}
     */
    public BigramCollocation[] find(Corpus corpus, double d) {
        if (d <= 0.0d || d >= 1.0d) {
            throw new IllegalArgumentException("Invalid p = " + d);
        }
        double cutoff = this.chisq.quantile(d);
        ArrayList<BigramCollocation> matches = new ArrayList<>();
        Iterator<Bigram> bigrams = corpus.getBigrams();
        while (bigrams.hasNext()) {
            Bigram bigram = bigrams.next();
            int frequency = corpus.getBigramFrequency(bigram);
            if (frequency > this.minFreq) {
                double score = score(corpus, bigram, frequency);
                if (score > cutoff) {
                    matches.add(new BigramCollocation(bigram.w1, bigram.w2, frequency, score));
                }
            }
        }

        BigramCollocation[] result = matches.toArray(new BigramCollocation[0]);
        Arrays.sort(result);
        // Reverse in place so the order is the opposite of the natural ordering.
        for (int lo = 0, hi = result.length - 1; lo < hi; lo++, hi--) {
            BigramCollocation tmp = result[lo];
            result[lo] = result[hi];
            result[hi] = tmp;
        }
        return result;
    }

    /** Looks up the counts for one bigram in the corpus and returns its likelihood ratio score. */
    private double score(Corpus corpus, Bigram bigram, int frequency) {
        return likelihoodRatio(corpus.getTermFrequency(bigram.w1), corpus.getTermFrequency(bigram.w2), frequency, corpus.size());
    }

    /**
     * Computes {@code -2 * log(lambda)} for a bigram from its counts.
     *
     * @param c1 frequency of the first word
     * @param c2 frequency of the second word
     * @param c12 frequency of the bigram
     * @param n corpus size
     * @return the likelihood ratio statistic
     */
    private static double likelihoodRatio(int c1, int c2, int c12, long n) {
        // Each ratio is cast to double BEFORE dividing: integer division would
        // truncate these probabilities to 0 or 1 and make the score meaningless.
        double p = (double) c2 / n;
        double p1 = (double) c12 / c1;
        double p2 = (double) (c2 - c12) / (n - c1);

        double logLambda = logL(c12, c1, p) + logL(c2 - c12, n - c1, p) - logL(c12, c1, p1) - logL(c2 - c12, n - c1, p2);
        return -2.0d * logLambda;
    }

    /**
     * Returns {@code k * log(x) + (n - k) * log(1 - x)}.
     *
     * <p>A probability of exactly 0 or 1 is replaced by 0.01 or 0.99
     * respectively so that neither logarithm is taken of zero.
     */
    private static double logL(int k, long n, double x) {
        if (x == 0.0d) {
            x = 0.01d;
        }
        if (x == 1.0d) {
            x = 0.99d;
        }
        return (k * Math.log(x)) + ((n - k) * Math.log(1.0d - x));
    }
}
