package org.dkpro.tc.features.pair.core.ngram;

import com.google.common.collect.MinMaxPriorityQueue;
import de.tudarmstadt.ukp.dkpro.core.api.frequency.util.FrequencyDistribution;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.BytesRef;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.fit.util.JCasUtil;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceInitializationException;
import org.apache.uima.resource.ResourceSpecifier;
import org.dkpro.tc.api.exception.TextClassificationException;
import org.dkpro.tc.api.features.Feature;
import org.dkpro.tc.api.features.PairFeatureExtractor;
import org.dkpro.tc.api.features.meta.MetaCollectorConfiguration;
import org.dkpro.tc.api.type.TextClassificationTarget;
import org.dkpro.tc.features.ngram.util.NGramUtils;
import org.dkpro.tc.features.ngram.util.TermFreqTuple;
import org.dkpro.tc.features.pair.core.ngram.meta.ComboUtils;
import org.dkpro.tc.features.pair.core.ngram.meta.LuceneNGramPMetaCollector;

/* loaded from: input_file:org/dkpro/tc/features/pair/core/ngram/LuceneNGramCPFE.class */
/**
 * Pair feature extractor that emits features for combinations of n-grams taken from the two
 * views of a pair.
 *
 * <p>During {@link #initialize(ResourceSpecifier, Map)} the candidate combinations are read from
 * the Lucene field {@value #LUCENE_NGRAM_FIELDCOMBO}, filtered against the top-k sets maintained
 * by the superclass and capped at {@link #ngramUseTopKCombo} entries. During
 * {@link #extract(JCas, JCas)} the combinations found in the concrete pair are matched against
 * that selection.
 */
public class LuceneNGramCPFE extends LuceneNGramPFE implements PairFeatureExtractor {
    /** Name of the configuration parameter bound to {@link #ngramMinNCombo}. */
    public static final String PARAM_NGRAM_MIN_N_COMBO = "ngramMinNCombo";

    /**
     * Lower bound (inclusive) on the combined length of a combination, i.e. the number of
     * "_"-separated parts of both halves added together. Defaults to 2.
     */
    @ConfigurationParameter(name = PARAM_NGRAM_MIN_N_COMBO, mandatory = false, defaultValue = {"2"})
    protected int ngramMinNCombo;

    /** Name of the configuration parameter bound to {@link #ngramMaxNCombo}. */
    public static final String PARAM_NGRAM_MAX_N_COMBO = "ngramMaxNCombo";

    /**
     * Upper bound (inclusive) on the combined length of a combination, counted the same way as
     * for {@link #ngramMinNCombo}. Defaults to 4.
     */
    @ConfigurationParameter(name = PARAM_NGRAM_MAX_N_COMBO, mandatory = false, defaultValue = {"4"})
    protected int ngramMaxNCombo;

    /** Name of the configuration parameter bound to {@link #ngramUseTopKCombo}. */
    public static final String PARAM_NGRAM_USE_TOP_K_COMBO = "ngramUseTopKCombo";

    /** Maximum number of combinations kept in {@link #topKSetCombo}. Defaults to 500. */
    @ConfigurationParameter(name = PARAM_NGRAM_USE_TOP_K_COMBO, mandatory = false, defaultValue = {"500"})
    protected int ngramUseTopKCombo;

    /** Name of the configuration parameter bound to {@link #ngramUseSymmetricalCombos}. */
    public static final String PARAM_NGRAM_SYMMETRY_COMBO = "ngramUseSymmetricalCombos";

    /**
     * Flag handed through to {@code ComboUtils.getCombinedNgrams}. Defaults to {@code false}.
     * NOTE(review): presumably controls whether mirrored combinations are treated as equal;
     * confirm against ComboUtils.
     */
    @ConfigurationParameter(name = PARAM_NGRAM_SYMMETRY_COMBO, mandatory = false, defaultValue = {"false"})
    protected boolean ngramUseSymmetricalCombos;

    /** Name of the Lucene field from which the n-gram combinations are read. */
    public static final String LUCENE_NGRAM_FIELDCOMBO = "ngramCombo";

    /** Combinations selected during initialization, together with their index frequencies. */
    protected FrequencyDistribution<String> topKSetCombo;

    /**
     * Runs the superclass initialization and, if it succeeds, loads the top combinations from
     * the Lucene index.
     *
     * @return {@code false} if the superclass initialization returned {@code false}, otherwise
     *         {@code true}
     * @throws ResourceInitializationException
     *             if the superclass fails or the index cannot be read
     */
    @Override
    public boolean initialize(ResourceSpecifier resourceSpecifier, Map<String, Object> map) throws ResourceInitializationException {
        if (!super.initialize(resourceSpecifier, map)) {
            return false;
        }
        this.topKSetCombo = getTopNgramsCombo(this.ngramUseTopKCombo, LUCENE_NGRAM_FIELDCOMBO);
        return true;
    }

    /**
     * Declares {@link LuceneNGramPMetaCollector} as the only meta collector of this extractor,
     * configured with the given parameters and a storage mapping of "targetLocation",
     * "sourceLocation" and "lucene".
     */
    @Override
    public List<MetaCollectorConfiguration> getMetaCollectorClasses(Map<String, Object> map) throws ResourceInitializationException {
        return Arrays.asList(
                new MetaCollectorConfiguration(LuceneNGramPMetaCollector.class, map)
                        .addStorageMapping("targetLocation", "sourceLocation", "lucene"));
    }

    /**
     * Builds the n-gram combinations of the two views and converts them into features by
     * matching them against {@link #topKSetCombo}.
     *
     * <p>Side effect: sets the inherited {@code prefix} field to "comboNG" before the features
     * are created.
     *
     * @param jCas
     *            first view; must contain exactly one {@link TextClassificationTarget}
     * @param jCas2
     *            second view; must contain exactly one {@link TextClassificationTarget}
     * @return the features produced by the inherited {@code addToFeatureArray}
     * @throws TextClassificationException
     *             if feature creation fails
     */
    @Override
    public Set<Feature> extract(JCas jCas, JCas jCas2) throws TextClassificationException {
        // The two getDocumentNgrams calls stay inline so that their evaluation order (first
        // view, then second view) is exactly the argument order.
        FrequencyDistribution<String> combinedNgrams = ComboUtils.getCombinedNgrams(
                NGramUtils.getDocumentNgrams(
                        jCas,
                        JCasUtil.selectSingle(jCas, TextClassificationTarget.class),
                        this.ngramLowerCase,
                        this.filterPartialStopwordMatches,
                        this.ngramMinN1,
                        this.ngramMaxN1,
                        this.stopwords),
                NGramUtils.getDocumentNgrams(
                        jCas2,
                        JCasUtil.selectSingle(jCas2, TextClassificationTarget.class),
                        this.ngramLowerCase,
                        this.filterPartialStopwordMatches,
                        this.ngramMinN2,
                        this.ngramMaxN2,
                        this.stopwords),
                this.ngramMinNCombo,
                this.ngramMaxNCombo,
                this.ngramUseSymmetricalCombos);
        this.prefix = "comboNG";
        return addToFeatureArray(combinedNgrams, this.topKSetCombo, new HashSet<Feature>());
    }

    /**
     * Reads all terms of the given Lucene field and keeps at most {@code i} of those that pass
     * {@link #isEligibleCombo(String)}, together with their total term frequency.
     *
     * @param i
     *            maximum number of combinations to keep
     * @param str
     *            name of the Lucene field to read
     * @return the retained combinations and their frequencies; empty if the index has no fields
     *         or the field does not exist
     * @throws ResourceInitializationException
     *             if the index cannot be opened or read, or a stored term is malformed
     */
    private FrequencyDistribution<String> getTopNgramsCombo(int i, String str) throws ResourceInitializationException {
        FrequencyDistribution<String> topCombos = new FrequencyDistribution<>();
        // Bounded queue: once it holds i elements, adding another one evicts its greatest
        // element according to the natural ordering of TermFreqTuple.
        MinMaxPriorityQueue<TermFreqTuple> candidates = MinMaxPriorityQueue.maximumSize(i).create();

        // try-with-resources guarantees that both the directory and the reader are closed,
        // whether the scan finishes normally or fails.
        try (FSDirectory directory = FSDirectory.open(this.luceneDir);
                DirectoryReader reader = DirectoryReader.open(directory)) {
            Fields fields = MultiFields.getFields(reader);
            Terms terms = (fields == null) ? null : fields.terms(str);
            if (terms != null) {
                TermsEnum termsEnum = terms.iterator((TermsEnum) null);
                BytesRef current;
                while ((current = termsEnum.next()) != null) {
                    String combo = current.utf8ToString();
                    if (isEligibleCombo(combo)) {
                        candidates.add(new TermFreqTuple(combo, termsEnum.totalTermFreq()));
                    }
                }
            }
        } catch (Exception e) {
            // Initialization boundary: every failure is reported as a
            // ResourceInitializationException with the original cause attached.
            throw new ResourceInitializationException(e);
        }

        TermFreqTuple tuple;
        while ((tuple = candidates.poll()) != null) {
            topCombos.addSample(tuple.getTerm(), tuple.getFreq());
        }
        return topCombos;
    }

    /**
     * Decides whether a combination read from the index may enter the top-k selection: its
     * first half must be contained in {@code topKSetView1} and {@code topKSet}, its second half
     * in {@code topKSetView2} and {@code topKSet}, and the combined number of "_"-separated
     * parts must lie within [{@link #ngramMinNCombo}, {@link #ngramMaxNCombo}].
     *
     * @param combo
     *            the stored term, two n-grams joined by {@code ComboUtils.JOINT}
     * @throws IllegalStateException
     *             if the term does not split into at least two halves
     */
    private boolean isEligibleCombo(String combo) {
        String[] halves = combo.split(ComboUtils.JOINT);
        if (halves.length < 2) {
            throw new IllegalStateException(
                    "Malformed n-gram combination in Lucene field (expected two joined n-grams): " + combo);
        }
        String first = halves[0];
        String second = halves[1];
        int combinedLength = first.split("_").length + second.split("_").length;
        return this.topKSetView1.contains(first)
                && this.topKSet.contains(first)
                && this.topKSetView2.contains(second)
                && this.topKSet.contains(second)
                && combinedLength <= this.ngramMaxNCombo
                && combinedLength >= this.ngramMinNCombo;
    }
}
