package org.cleartk.examples.chunking;

import com.google.common.base.Function;
import java.io.File;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.filefilter.FileFilterUtils;
import org.apache.uima.analysis_engine.AnalysisEngine;
import org.apache.uima.analysis_engine.AnalysisEngineDescription;
import org.apache.uima.collection.CollectionReader;
import org.apache.uima.jcas.JCas;
import org.cleartk.classifier.CleartkSequenceAnnotator;
import org.cleartk.classifier.jar.DefaultSequenceDataWriterFactory;
import org.cleartk.classifier.jar.DirectoryDataWriterFactory;
import org.cleartk.classifier.jar.GenericJarClassifierFactory;
import org.cleartk.classifier.jar.Train;
import org.cleartk.classifier.mallet.MalletCRFStringOutcomeDataWriter;
import org.cleartk.eval.AnnotationStatistics;
import org.cleartk.eval.Evaluation_ImplBase;
import org.cleartk.examples.chunking.TrainNamedEntityChunker;
import org.cleartk.examples.chunking.util.MASCGoldAnnotator;
import org.cleartk.examples.documentclassification.advanced.DocumentClassificationEvaluation;
import org.cleartk.ne.type.NamedEntityMention;
import org.cleartk.syntax.opennlp.PosTaggerAnnotator;
import org.cleartk.syntax.opennlp.SentenceAnnotator;
import org.cleartk.token.tokenizer.TokenAnnotator;
import org.cleartk.util.Options_ImplBase;
import org.cleartk.util.ae.UriToDocumentTextAnnotator;
import org.cleartk.util.cr.UriCollectionReader;
import org.kohsuke.args4j.Option;
import org.uimafit.component.ViewCreatorAnnotator;
import org.uimafit.factory.AggregateBuilder;
import org.uimafit.factory.AnalysisEngineFactory;
import org.uimafit.factory.CollectionReaderFactory;
import org.uimafit.pipeline.JCasIterable;
import org.uimafit.pipeline.SimplePipeline;
import org.uimafit.util.JCasUtil;

/* loaded from: input_file:org/cleartk/examples/chunking/EvaluateNamedEntityChunker.class */
/**
 * Evaluation driver for {@link NamedEntityChunker}.
 *
 * <p>{@link #main(String[])} collects the training files, runs a cross-validation over them,
 * prints the aggregated statistics and confusion matrix to {@code System.err}, and finally
 * calls {@code trainAndTest} with all files as training data and an empty test set.
 *
 * <p>The {@code train}/{@code test}/{@code getCollectionReader} methods are the hooks that the
 * {@link Evaluation_ImplBase} superclass drives.
 */
public class EvaluateNamedEntityChunker extends Evaluation_ImplBase<File, AnnotationStatistics<String>> {

    /** Name of the CAS view that receives the gold-standard annotations during testing. */
    private static final String GOLD_VIEW_NAME = "GoldView";

    /** Number of folds used by the cross-validation run in {@link #main(String[])}. */
    private static final int CROSS_VALIDATION_FOLDS = 2;

    /*
     * java.util.logging only keeps weak references to loggers, so a level set on a logger that
     * nobody references strongly can silently disappear when the logger is garbage-collected.
     * Holding the loggers in static fields keeps the levels configured in train() in effect.
     */
    private static final Logger MALLET_LOGGER = Logger.getLogger("cc.mallet");

    private static final Logger MALLET_LIKELIHOOD_LOGGER =
            Logger.getLogger("cc.mallet.fst.CRFOptimizableByLabelLikelihood");

    /** Command-line options for {@link EvaluateNamedEntityChunker#main(String[])}. */
    public static class Options extends Options_ImplBase {

        /** Directory that is scanned for training documents. */
        @Option(name = "--train-dir", usage = "Specify the directory containing the training documents.  This is used for cross-validation and for training in a holdout set evaluator. When we run this example we point to a directory containing training data from the MASC-1.0.3 corpus - i.e. a directory called 'MASC-1.0.3/data/written'")
        public File trainDirectory = new File("src/main/resources/data/MASC-1.0.3/data/written");

        /** Directory handed to the evaluator as its base directory for model output. */
        @Option(name = "--models-dir", usage = "specify the directory in which to write out the trained model files")
        public File modelsDirectory = new File("target/chunking/ne-model");
    }

    /**
     * Runs the cross-validation, reports the results on {@code System.err}, then trains on all
     * training files.
     *
     * @param strArr command-line arguments, parsed into {@link Options}
     * @throws Exception if option parsing, training or testing fails
     */
    public static void main(String[] strArr) throws Exception {
        Options options = new Options();
        options.parseOptions(strArr);

        // The "false" directory filter means sub-directories are not descended into.
        List<File> trainFiles = new ArrayList<File>(FileUtils.listFiles(
                options.trainDirectory,
                new TrainNamedEntityChunker.MASCTextFileFilter(),
                FileFilterUtils.falseFileFilter()));

        EvaluateNamedEntityChunker evaluator = new EvaluateNamedEntityChunker(options.modelsDirectory);

        // Merge the per-fold statistics into a single summary.
        AnnotationStatistics<String> crossValidationStats =
                AnnotationStatistics.addAll(evaluator.crossValidation(trainFiles, CROSS_VALIDATION_FOLDS));

        System.err.println("Cross Validation Results:");
        System.err.print(crossValidationStats);
        System.err.println();
        System.err.println(crossValidationStats.confusions());
        System.err.println();

        // Train on every file; the test set is intentionally empty.
        evaluator.trainAndTest(trainFiles, Collections.<File>emptyList());
    }

    /**
     * Creates an evaluator.
     *
     * @param file base directory passed to the {@link Evaluation_ImplBase} constructor
     */
    public EvaluateNamedEntityChunker(File file) {
        super(file);
    }

    /**
     * Creates a reader over the given files.
     *
     * @param list files to read
     * @return a collection reader built from a {@link UriCollectionReader} description of the files
     * @throws Exception if the reader cannot be created
     */
    protected CollectionReader getCollectionReader(List<File> list) throws Exception {
        return CollectionReaderFactory.createCollectionReader(UriCollectionReader.getDescriptionFromFiles(list));
    }

    /**
     * Writes training data for the chunker and trains a model from it.
     *
     * @param collectionReader reader supplying the training documents
     * @param file directory that receives the training data and is then passed to {@link Train}
     * @throws Exception if the pipeline or the training step fails
     */
    public void train(CollectionReader collectionReader, File file) throws Exception {
        AggregateBuilder pipeline = new AggregateBuilder();
        pipeline.add(UriToDocumentTextAnnotator.getDescription());
        pipeline.add(MASCGoldAnnotator.getDescription());
        pipeline.add(PosTaggerAnnotator.getDescription());

        // The chunker runs in training mode and writes its data with the Mallet CRF data writer.
        pipeline.add(AnalysisEngineFactory.createPrimitiveDescription(
                NamedEntityChunker.class,
                CleartkSequenceAnnotator.PARAM_IS_TRAINING,
                true,
                DirectoryDataWriterFactory.PARAM_OUTPUT_DIRECTORY,
                file,
                DefaultSequenceDataWriterFactory.PARAM_DATA_WRITER_CLASS_NAME,
                MalletCRFStringOutcomeDataWriter.class));

        SimplePipeline.runPipeline(collectionReader, pipeline.createAggregateDescription());

        // Restrict Mallet logging to warnings, except for the likelihood logger which stays at INFO.
        MALLET_LOGGER.setLevel(Level.WARNING);
        MALLET_LIKELIHOOD_LOGGER.setLevel(Level.INFO);

        Train.main(file);
    }

    /**
     * Runs the trained chunker over the documents and compares its mentions with the gold ones.
     *
     * <p>Gold annotations are loaded into a separate {@code "GoldView"} view; the system pipeline
     * runs on the view named by {@code DocumentClassificationEvaluation.SYSTEM_VIEW_NAME}.
     * Mentions are matched by span and compared on their {@code mentionType} feature.
     *
     * @param collectionReader reader supplying the test documents
     * @param file directory containing {@code model.jar}
     * @return statistics accumulated over all documents
     * @throws Exception if the pipeline cannot be built or run
     */
    @Override
    protected AnnotationStatistics<String> test(CollectionReader collectionReader, File file) throws Exception {
        // NOTE(review): the decompiler resolved the system view name to this constant of an
        // unrelated example class; presumably it is the default CAS view name - confirm.
        final String systemViewName = DocumentClassificationEvaluation.SYSTEM_VIEW_NAME;

        AggregateBuilder pipeline = new AggregateBuilder();

        // Gold side: create the gold view, then map the system view name onto it for these steps.
        pipeline.add(AnalysisEngineFactory.createPrimitiveDescription(
                ViewCreatorAnnotator.class,
                ViewCreatorAnnotator.PARAM_VIEW_NAME,
                GOLD_VIEW_NAME));
        pipeline.add(UriToDocumentTextAnnotator.getDescription(), systemViewName, GOLD_VIEW_NAME);
        pipeline.add(MASCGoldAnnotator.getDescription(), systemViewName, GOLD_VIEW_NAME);

        // System side: no view mapping, ending with the chunker in classification mode.
        pipeline.add(UriToDocumentTextAnnotator.getDescription());
        pipeline.add(SentenceAnnotator.getDescription());
        pipeline.add(TokenAnnotator.getDescription());
        pipeline.add(PosTaggerAnnotator.getDescription());
        pipeline.add(AnalysisEngineFactory.createPrimitiveDescription(
                NamedEntityChunker.class,
                CleartkSequenceAnnotator.PARAM_IS_TRAINING,
                false,
                GenericJarClassifierFactory.PARAM_CLASSIFIER_JAR_PATH,
                new File(file, "model.jar")));

        AnnotationStatistics<String> stats = new AnnotationStatistics<String>();
        Function<NamedEntityMention, ?> toSpan = AnnotationStatistics.annotationToSpan();
        Function<NamedEntityMention, String> toMentionType =
                AnnotationStatistics.annotationToFeatureValue("mentionType");

        for (JCas jCas : new JCasIterable(collectionReader, pipeline.createAggregate())) {
            stats.add(
                    JCasUtil.select(jCas.getView(GOLD_VIEW_NAME), NamedEntityMention.class),
                    JCasUtil.select(jCas.getView(systemViewName), NamedEntityMention.class),
                    toSpan,
                    toMentionType);
        }
        return stats;
    }

    /**
     * Decompiler-generated alias of {@link #test(CollectionReader, File)}.
     *
     * @param collectionReader reader supplying the test documents
     * @param file directory containing {@code model.jar}
     * @return statistics accumulated over all documents
     * @throws Exception if the pipeline cannot be built or run
     * @deprecated retained only so code written against the decompiled name keeps compiling;
     *     call {@link #test(CollectionReader, File)} instead.
     */
    @Deprecated
    public AnnotationStatistics<String> m0test(CollectionReader collectionReader, File file) throws Exception {
        return test(collectionReader, file);
    }
}
