package cc.mallet.cluster.tui;

import cc.mallet.classify.MaxEnt;
import cc.mallet.classify.MaxEntTrainer;
import cc.mallet.classify.Trial;
import cc.mallet.cluster.Clusterer;
import cc.mallet.cluster.Clustering;
import cc.mallet.cluster.Clusterings;
import cc.mallet.cluster.GreedyAgglomerativeByDensity;
import cc.mallet.cluster.Record;
import cc.mallet.cluster.evaluate.AccuracyEvaluator;
import cc.mallet.cluster.evaluate.BCubedEvaluator;
import cc.mallet.cluster.evaluate.ClusteringEvaluator;
import cc.mallet.cluster.evaluate.ClusteringEvaluators;
import cc.mallet.cluster.evaluate.MUCEvaluator;
import cc.mallet.cluster.evaluate.PairF1Evaluator;
import cc.mallet.cluster.iterator.PairSampleIterator;
import cc.mallet.cluster.neighbor_evaluator.AgglomerativeNeighbor;
import cc.mallet.cluster.neighbor_evaluator.PairwiseEvaluator;
import cc.mallet.pipe.Pipe;
import cc.mallet.types.Alphabet;
import cc.mallet.types.FeatureVector;
import cc.mallet.types.InfoGain;
import cc.mallet.types.Instance;
import cc.mallet.types.InstanceList;
import cc.mallet.types.LabelAlphabet;
import cc.mallet.util.CommandOption;
import cc.mallet.util.MalletLogger;
import cc.mallet.util.PropertyList;
import cc.mallet.util.Randoms;
import cc.mallet.util.Strings;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.util.ArrayList;
import java.util.logging.Logger;
import org.apache.uima.pear.tools.InstallationDescriptorHandler;

/* loaded from: input_file:cc/mallet/cluster/tui/Clusterings2Clusterer.class */
public class Clusterings2Clusterer {
    /** Logger used by {@link #main(String[])} for progress and evaluation output. */
    private static final Logger logger = MalletLogger.getLogger(Clusterings2Clusterer.class.getName());

    // NOTE(review): InstallationDescriptorHandler.FILE_TAG (a UIMA constant) is used below as the
    // argument-name placeholder; this looks like a decompiler substitution for a plain string
    // literal -- confirm before relying on the UIMA dependency.

    /** {@code --load-clusterer}: serialized clusterer to read. Declared with a null default. */
    static CommandOption.File loadClusterer = new CommandOption.File(
            Clusterings2Clusterer.class,
            "load-clusterer",
            InstallationDescriptorHandler.FILE_TAG,
            false,
            null,
            "The file from which to read the clusterer.",
            null);

    /** {@code --save-clusterer}: where the clusterer is serialized; defaults to {@code clusterer.mallet}. */
    static CommandOption.File saveClusterer = new CommandOption.File(
            Clusterings2Clusterer.class,
            "save-clusterer",
            InstallationDescriptorHandler.FILE_TAG,
            false,
            new File("clusterer.mallet"),
            "The filename in which to write the clusterer after it has been trained.",
            null);

    /** {@code --output-clusterings}: file receiving the predicted clusterings; defaults to {@code predictions}. */
    static CommandOption.String outputClusterings = new CommandOption.String(
            Clusterings2Clusterer.class,
            "output-clusterings",
            "FILENAME",
            false,
            "predictions",
            "The filename in which to write the predicted clusterings.",
            null);

    /** {@code --train}: serialized training Clusterings; defaults to {@code text.clusterings.train}. */
    static CommandOption.String trainingFile = new CommandOption.String(
            Clusterings2Clusterer.class,
            "train",
            "FILENAME",
            false,
            "text.clusterings.train",
            "Read the training set Clusterings from this file. If this is specified, the input file parameter is ignored",
            null);

    /** {@code --test}: serialized test Clusterings; defaults to {@code text.clusterings.test}. */
    static CommandOption.String testingFile = new CommandOption.String(
            Clusterings2Clusterer.class,
            "test",
            "FILENAME",
            false,
            "text.clusterings.test",
            "Read the test set Clusterings from this file. If this option is specified, the training-file parameter must be specified and  the input-file parameter is ignored",
            null);

    /** {@code --clustering-evaluator}: evaluator object; {@code main} substitutes a default set when the value is null. */
    static CommandOption.Object clusteringEvaluatorOption = new CommandOption.Object(
            Clusterings2Clusterer.class,
            "clustering-evaluator",
            "CONSTRUCTOR",
            true,
            null,
            "Java code for constructing a ClusteringEvaluator object",
            null);

    /** {@code --exact-match-fields}: field names compared for exact value matches. Declared with a null default. */
    static CommandOption.SpacedStrings exactMatchFields = new CommandOption.SpacedStrings(
            Clusterings2Clusterer.class,
            "exact-match-fields",
            "STRING...",
            false,
            null,
            "The field names to be checked for exactly matching values",
            null);

    /** {@code --approx-match-fields}: field names compared for approximate value matches. Declared with a null default. */
    static CommandOption.SpacedStrings approxMatchFields = new CommandOption.SpacedStrings(
            Clusterings2Clusterer.class,
            "approx-match-fields",
            "STRING...",
            false,
            null,
            "The field names to be checked for approx matching values",
            null);

    /** {@code --substring-match-fields}: field names compared for substring value matches. Declared with a null default. */
    static CommandOption.SpacedStrings substringMatchFields = new CommandOption.SpacedStrings(
            Clusterings2Clusterer.class,
            "substring-match-fields",
            "STRING...",
            false,
            null,
            "The field names to be checked for substring matching values. Note that values fewer than 3 characters are ignored.",
            null);

    /* loaded from: input_file:cc/mallet/cluster/tui/Clusterings2Clusterer$ClusteringPipe.class */
    public static class ClusteringPipe extends Pipe {
        private static final long serialVersionUID = 1;

        /** Field indices whose values are compared for exact matches. Never null after construction. */
        int[] exactMatchFields;

        /** Field indices whose values are compared by Levenshtein distance. Never null after construction. */
        int[] approxMatchFields;

        /** Field indices whose values are compared for substring containment. Never null after construction. */
        int[] substringMatchFields;

        /**
         * Exclusive upper bound on the Levenshtein distance that counts as an approximate match.
         * The three-argument constructor leaves this at 0.0.
         */
        double approxMatchThreshold;

        /**
         * Creates a pipe with fresh data and target alphabets and an approximate-match threshold of 0.0.
         *
         * @param iArr  field indices for exact matching; null is treated as empty
         * @param iArr2 field indices for approximate matching; null is treated as empty
         * @param iArr3 field indices for substring matching; null is treated as empty
         */
        public ClusteringPipe(int[] iArr, int[] iArr2, int[] iArr3) {
            this(iArr, iArr2, iArr3, 0.0d);
        }

        /**
         * Creates a pipe with fresh data and target alphabets and an explicit approximate-match threshold.
         *
         * @param iArr                 field indices for exact matching; null is treated as empty
         * @param iArr2                field indices for approximate matching; null is treated as empty
         * @param iArr3                field indices for substring matching; null is treated as empty
         * @param approxMatchThreshold exclusive upper bound on Levenshtein distance for an approximate match
         */
        public ClusteringPipe(int[] iArr, int[] iArr2, int[] iArr3, double approxMatchThreshold) {
            super(new Alphabet(), new LabelAlphabet());
            // Normalize null to empty so the per-field feature loops simply do nothing
            // instead of failing with a NullPointerException.
            this.exactMatchFields = iArr != null ? iArr : new int[0];
            this.approxMatchFields = iArr2 != null ? iArr2 : new int[0];
            this.substringMatchFields = iArr3 != null ? iArr3 : new int[0];
            this.approxMatchThreshold = approxMatchThreshold;
        }

        /**
         * Resolves instance indices to the {@link Record} stored as each instance's data.
         *
         * @param memberIndices positions within {@code instances}
         * @param instances     list whose instances carry a {@link Record} as data
         * @return one record per index, in the same order as {@code memberIndices}
         */
        private Record[] array2Records(int[] memberIndices, InstanceList instances) {
            Record[] records = new Record[memberIndices.length];
            for (int k = 0; k < memberIndices.length; k++) {
                records[k] = (Record) instances.get(memberIndices[k]).getData();
            }
            return records;
        }

        /**
         * Converts an instance holding an {@link AgglomerativeNeighbor} into a binary training example.
         *
         * <p>The data becomes a binary {@link FeatureVector} built from the exact, approximate and
         * substring match features of the records in the neighbor's new cluster. The target becomes
         * "YES" when the first elements of the two old clusters carry the same label in the original
         * clustering, and "NO" otherwise.
         *
         * @param instance instance whose data is an {@link AgglomerativeNeighbor}; modified in place
         * @return the same instance, with data and target replaced
         */
        @Override // cc.mallet.pipe.Pipe
        public Instance pipe(Instance instance) {
            AgglomerativeNeighbor neighbor = (AgglomerativeNeighbor) instance.getData();
            Clustering original = neighbor.getOriginal();
            int[][] oldClusters = neighbor.getOldClusters();
            int[] firstCluster = oldClusters[0];
            int[] secondCluster = oldClusters[1];

            Record[] records = array2Records(neighbor.getNewCluster(), original.getInstances());
            // The first record supplies the alphabets used for every record in the cluster.
            Alphabet fieldAlphabet = records[0].fieldAlphabet();
            Alphabet valueAlphabet = records[0].valueAlphabet();

            // Each builder prepends its features to the list produced by the previous one.
            PropertyList features = addExactMatch(records, fieldAlphabet, valueAlphabet, null);
            features = addApproxMatch(records, fieldAlphabet, valueAlphabet, features);
            features = addSubstringMatch(records, fieldAlphabet, valueAlphabet, features);
            instance.setData(new FeatureVector(getDataAlphabet(), features, true));

            LabelAlphabet labels = (LabelAlphabet) getTargetAlphabet();
            String answer;
            if (original.getLabel(firstCluster[0]) == original.getLabel(secondCluster[0])) {
                answer = "YES";
            } else {
                answer = "NO";
            }
            instance.setTarget(labels.lookupLabel(answer));
            return instance;
        }

        /**
         * Adds "exact match" features for every configured exact-match field.
         *
         * <p>For each field, every pair of records that both have values for the field is one
         * comparison; the pair counts as a match when some value of the first record is contained
         * in the second record's values. {@code <field>_all_match} is emitted when all comparisons
         * so far matched and there was more than one; {@code <field>_exists_match} is emitted when
         * at least one matched.
         *
         * @param records        records of the candidate cluster
         * @param fieldAlphabet  alphabet mapping field indices to field names (used for feature names)
         * @param valueAlphabet  alphabet mapping value indices to value objects
         * @param features       feature list to extend; may be null
         * @return the extended feature list
         */
        private PropertyList addExactMatch(Record[] records, Alphabet fieldAlphabet, Alphabet valueAlphabet, PropertyList features) {
            for (int f = 0; f < this.exactMatchFields.length; f++) {
                int field = this.exactMatchFields[f];
                int matches = 0;
                int comparisons = 0;
                for (int i = 0; i < records.length; i++) {
                    FeatureVector valuesI = records[i].values(field);
                    if (valuesI != null) {
                        for (int j = i + 1; j < records.length; j++) {
                            FeatureVector valuesJ = records[j].values(field);
                            if (valuesJ == null) {
                                continue;
                            }
                            comparisons++;
                            // Bounded scan: stop at the first shared value or when the values run out.
                            // (The previous unbounded loop never terminated when no value matched.)
                            for (int loc = 0; loc < valuesI.numLocations(); loc++) {
                                if (valuesJ.contains(valueAlphabet.lookupObject(valuesI.indexAtLocation(loc)))) {
                                    matches++;
                                    break;
                                }
                            }
                        }
                    }
                    // NOTE(review): these checks run after every outer record, on the running
                    // counts, rather than once per field -- confirm that is intended.
                    if (matches == comparisons && comparisons > 1) {
                        features = PropertyList.add(fieldAlphabet.lookupObject(field) + "_all_match", 1.0d, features);
                    }
                    if (matches > 0) {
                        features = PropertyList.add(fieldAlphabet.lookupObject(field) + "_exists_match", 1.0d, features);
                    }
                }
            }
            return features;
        }

        /**
         * Adds "approximate match" features for every configured approximate-match field.
         *
         * <p>For each field, every pair of records that both have values for the field is one
         * comparison. For each value of the first record, the match count is incremented when some
         * value of the second record has a Levenshtein distance strictly below
         * {@code approxMatchThreshold}. {@code <field>_all_approx_match} is emitted when the match
         * count equals the comparison count and there was more than one comparison;
         * {@code <field>_exists_approx_match} is emitted when the match count is positive.
         *
         * @param records        records of the candidate cluster
         * @param fieldAlphabet  alphabet mapping field indices to field names (used for feature names)
         * @param valueAlphabet  alphabet mapping value indices to value strings
         * @param features       feature list to extend; may be null
         * @return the extended feature list
         */
        private PropertyList addApproxMatch(Record[] records, Alphabet fieldAlphabet, Alphabet valueAlphabet, PropertyList features) {
            for (int f = 0; f < this.approxMatchFields.length; f++) {
                int field = this.approxMatchFields[f];
                int matches = 0;
                int comparisons = 0;
                for (int i = 0; i < records.length; i++) {
                    FeatureVector valuesI = records[i].values(field);
                    if (valuesI != null) {
                        for (int j = i + 1; j < records.length; j++) {
                            FeatureVector valuesJ = records[j].values(field);
                            if (valuesJ == null) {
                                continue;
                            }
                            comparisons++;
                            // NOTE(review): matches is incremented once per value of record i, so
                            // it can exceed the number of comparisons -- confirm that is intended.
                            for (int li = 0; li < valuesI.numLocations(); li++) {
                                String valueI = (String) valueAlphabet.lookupObject(valuesI.indexAtLocation(li));
                                // Bounded scan: stop at the first close value or when the values run out.
                                // (The previous unbounded loop never terminated when nothing was close enough.)
                                for (int lj = 0; lj < valuesJ.numLocations(); lj++) {
                                    String valueJ = (String) valueAlphabet.lookupObject(valuesJ.indexAtLocation(lj));
                                    if (Strings.levenshteinDistance(valueI, valueJ) < this.approxMatchThreshold) {
                                        matches++;
                                        break;
                                    }
                                }
                            }
                        }
                    }
                    // NOTE(review): these checks run after every outer record, on the running
                    // counts, rather than once per field -- confirm that is intended.
                    if (matches == comparisons && comparisons > 1) {
                        features = PropertyList.add(fieldAlphabet.lookupObject(field) + "_all_approx_match", 1.0d, features);
                    }
                    if (matches > 0) {
                        features = PropertyList.add(fieldAlphabet.lookupObject(field) + "_exists_approx_match", 1.0d, features);
                    }
                }
            }
            return features;
        }

        /**
         * Adds "substring match" features for every configured substring-match field.
         *
         * <p>For each field, every pair of records that both have values for the field is one
         * comparison. For each value of the first record, the match count is incremented when some
         * value of the second record contains it or is contained in it. Values shorter than three
         * characters are ignored on both sides, as the {@code substring-match-fields} option text
         * states. {@code <field>_all_substring_match} is emitted when the match count equals the
         * comparison count and there was more than one comparison;
         * {@code <field>_exists_substring_match} is emitted when the match count is positive.
         *
         * @param records        records of the candidate cluster
         * @param fieldAlphabet  alphabet mapping field indices to field names (used for feature names)
         * @param valueAlphabet  alphabet mapping value indices to value strings
         * @param features       feature list to extend; may be null
         * @return the extended feature list
         */
        private PropertyList addSubstringMatch(Record[] records, Alphabet fieldAlphabet, Alphabet valueAlphabet, PropertyList features) {
            // Minimum value length taking part in substring comparison.
            final int minLength = 3;
            for (int f = 0; f < this.substringMatchFields.length; f++) {
                // Use the substring field index for both the lookup and the feature name;
                // the feature name was previously taken from exactMatchFields by mistake.
                int field = this.substringMatchFields[f];
                int matches = 0;
                int comparisons = 0;
                for (int i = 0; i < records.length; i++) {
                    FeatureVector valuesI = records[i].values(field);
                    if (valuesI != null) {
                        for (int j = i + 1; j < records.length; j++) {
                            FeatureVector valuesJ = records[j].values(field);
                            if (valuesJ == null) {
                                continue;
                            }
                            comparisons++;
                            for (int li = 0; li < valuesI.numLocations(); li++) {
                                String valueI = (String) valueAlphabet.lookupObject(valuesI.indexAtLocation(li));
                                if (valueI.length() < minLength) {
                                    // Skip only this short value; later values are still compared.
                                    continue;
                                }
                                for (int lj = 0; lj < valuesJ.numLocations(); lj++) {
                                    String valueJ = (String) valueAlphabet.lookupObject(valuesJ.indexAtLocation(lj));
                                    // Compare the two distinct values in both directions
                                    // (the old test compared valueI with itself, which is always true).
                                    if (valueJ.length() >= minLength && (valueI.contains(valueJ) || valueJ.contains(valueI))) {
                                        matches++;
                                        break;
                                    }
                                }
                            }
                        }
                    }
                    // NOTE(review): these checks run after every outer record, on the running
                    // counts, rather than once per field -- confirm that is intended.
                    if (matches == comparisons && comparisons > 1) {
                        features = PropertyList.add(fieldAlphabet.lookupObject(field) + "_all_substring_match", 1.0d, features);
                    }
                    if (matches > 0) {
                        features = PropertyList.add(fieldAlphabet.lookupObject(field) + "_exists_substring_match", 1.0d, features);
                    }
                }
            }
            return features;
        }
    }

    /**
     * Command-line entry point: obtains a clusterer (loaded from {@code --load-clusterer} when that
     * file is given and exists, otherwise trained from {@code --train}), clusters every test
     * clustering from {@code --test}, logs the evaluation, serializes the clusterer to
     * {@code --save-clusterer} and writes the predictions to {@code --output-clusterings}.
     *
     * @param strArr command-line arguments, parsed by {@link CommandOption}
     * @throws Exception on any I/O, deserialization or training failure
     */
    public static void main(String[] strArr) throws Exception {
        CommandOption.setSummary(Clusterings2Clusterer.class, "A tool to train and test a Clusterer.");
        CommandOption.process(Clusterings2Clusterer.class, strArr);
        Randoms randoms = new Randoms(123);

        // The option is declared with a null default, so guard before calling exists().
        File clustererFile = loadClusterer.value;
        Clusterer clusterer;
        if (clustererFile != null && clustererFile.exists()) {
            clusterer = readClusterer(clustererFile);
        } else {
            clusterer = trainClusterer(randoms);
        }

        Clusterings testing = readClusterings(testingFile.value);
        ClusteringEvaluator evaluator = (ClusteringEvaluator) clusteringEvaluatorOption.value;
        if (evaluator == null) {
            evaluator = new ClusteringEvaluators(new ClusteringEvaluator[]{new BCubedEvaluator(), new PairF1Evaluator(), new MUCEvaluator(), new AccuracyEvaluator()});
        }

        ArrayList<Clustering> predictions = new ArrayList<Clustering>();
        for (int t = 0; t < testing.size(); t++) {
            Clustering truth = testing.get(t);
            Clustering predicted = clusterer.cluster(truth.getInstances());
            predictions.add(predicted);
            logger.info(evaluator.evaluate(truth, predicted));
        }
        logger.info(evaluator.evaluateTotals());

        writeClusterer(clusterer, saveClusterer.value);

        if (outputClusterings.value != null) {
            BufferedWriter writer = new BufferedWriter(new FileWriter(new File(outputClusterings.value)));
            try {
                writer.write(predictions.toString());
            } finally {
                // close() also flushes; doing it in finally releases the file even if write fails.
                writer.close();
            }
        }
    }

    /**
     * Deserializes a clusterer from the given file, closing the file afterwards.
     * Uses Java-native deserialization, so the file must come from a trusted source.
     */
    private static Clusterer readClusterer(File file) throws Exception {
        FileInputStream fileIn = new FileInputStream(file);
        try {
            return (Clusterer) new ObjectInputStream(fileIn).readObject();
        } finally {
            fileIn.close();
        }
    }

    /**
     * Trains a pairwise MaxEnt classifier on pairs sampled from the training clusterings and wraps
     * it in a greedy agglomerative clusterer.
     */
    private static Clusterer trainClusterer(Randoms randoms) throws Exception {
        Clusterings training = readClusterings(trainingFile.value);
        // The field alphabet is taken from the first record of the first training clustering.
        Alphabet fieldAlphabet = ((Record) training.get(0).getInstances().get(0).getData()).fieldAlphabet();
        ClusteringPipe pipe = new ClusteringPipe(
                string2ints(exactMatchFields.value, fieldAlphabet),
                string2ints(approxMatchFields.value, fieldAlphabet),
                string2ints(substringMatchFields.value, fieldAlphabet));

        InstanceList pairs = new InstanceList(pipe);
        for (int c = 0; c < training.size(); c++) {
            Clustering truth = training.get(c);
            PairSampleIterator sampler = new PairSampleIterator(truth, randoms, 0.5d, truth.getNumInstances());
            while (sampler.hasNext()) {
                pairs.add(pipe.pipe(sampler.next()));
            }
        }
        logger.info("generated " + pairs.size() + " training instances");

        MaxEnt classifier = new MaxEntTrainer().train(pairs);
        logger.info("InfoGain:\n");
        new InfoGain(pairs).printByRank(System.out);
        logger.info("pairwise training accuracy=" + new Trial(classifier, pairs).getAccuracy());

        return new GreedyAgglomerativeByDensity(
                training.get(0).getInstances().getPipe(),
                new PairwiseEvaluator(classifier, "YES", new PairwiseEvaluator.Average(), true),
                0.5d,
                false,
                randoms);
    }

    /** Serializes the clusterer to the given file, closing the file even when writing fails. */
    private static void writeClusterer(Clusterer clusterer, File file) throws Exception {
        FileOutputStream fileOut = new FileOutputStream(file);
        try {
            ObjectOutputStream out = new ObjectOutputStream(fileOut);
            out.writeObject(clusterer);
            // Push buffered object data to the file before the underlying stream is closed.
            out.flush();
        } finally {
            fileOut.close();
        }
    }

    /**
     * Maps each name to its index in the given alphabet.
     *
     * @param strArr   names to look up; null (e.g. an option that was not supplied) yields an empty array
     * @param alphabet alphabet used for the lookup
     * @return one index per name, in input order; empty when {@code strArr} is null
     */
    public static int[] string2ints(String[] strArr, Alphabet alphabet) {
        if (strArr == null) {
            return new int[0];
        }
        int[] indices = new int[strArr.length];
        for (int i = 0; i < strArr.length; i++) {
            // NOTE(review): presumably lookupIndex adds names it has not seen before, so a
            // misspelled field name would not be rejected here -- confirm against Alphabet.
            indices[i] = alphabet.lookupIndex(strArr[i]);
        }
        return indices;
    }

    /**
     * Deserializes a {@link Clusterings} object from the named file.
     * Uses Java-native deserialization, so the file must come from a trusted source.
     *
     * @param str path of the serialized clusterings file
     * @return the deserialized clusterings
     * @throws Exception if the file cannot be opened or read, or does not deserialize cleanly
     */
    public static Clusterings readClusterings(String str) throws Exception {
        FileInputStream fileIn = new FileInputStream(new File(str));
        try {
            return (Clusterings) new ObjectInputStream(fileIn).readObject();
        } finally {
            // Closing the underlying stream releases the file handle whether or not reading succeeded.
            fileIn.close();
        }
    }
}
