package org.cleartk.summarization;

import com.google.common.io.Files;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.Collection;
import org.apache.commons.io.FileUtils;
import org.apache.uima.UimaContext;
import org.apache.uima.analysis_engine.AnalysisEngineDescription;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.cas.impl.XmiCasDeserializer;
import org.apache.uima.cas.impl.XmiCasSerializer;
import org.apache.uima.collection.CollectionReader;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceInitializationException;
import org.apache.uima.util.XMLSerializer;
import org.cleartk.classifier.CleartkAnnotator;
import org.cleartk.classifier.jar.DefaultDataWriterFactory;
import org.cleartk.classifier.jar.DirectoryDataWriterFactory;
import org.cleartk.classifier.jar.GenericJarClassifierFactory;
import org.cleartk.classifier.jar.JarClassifierBuilder;
import org.cleartk.summarization.SumBasicAnnotator;
import org.cleartk.summarization.SumBasicModel;
import org.cleartk.summarization.classifier.SumBasicDataWriter;
import org.cleartk.syntax.opennlp.PosTaggerAnnotator;
import org.cleartk.syntax.opennlp.SentenceAnnotator;
import org.cleartk.token.stem.snowball.DefaultSnowballStemmer;
import org.cleartk.token.tokenizer.TokenAnnotator;
import org.cleartk.util.Options_ImplBase;
import org.cleartk.util.ViewURIUtil;
import org.cleartk.util.ae.UriToDocumentTextAnnotator;
import org.cleartk.util.cr.UriCollectionReader;
import org.kohsuke.args4j.Option;
import org.uimafit.component.JCasAnnotator_ImplBase;
import org.uimafit.descriptor.ConfigurationParameter;
import org.uimafit.factory.AggregateBuilder;
import org.uimafit.factory.AnalysisEngineFactory;
import org.uimafit.factory.ConfigurationParameterFactory;
import org.uimafit.pipeline.SimplePipeline;
import org.xml.sax.SAXException;

/* loaded from: input_file:org/cleartk/summarization/SumBasic.class */
/**
 * Driver for SumBasic summarization over a directory of documents.
 *
 * <p>Work is split into two passes. {@link #train()} runs the preprocessing
 * pipeline over every file found under the documents directory, writes training
 * data to the model directory, stores each processed CAS as an XMI file, and
 * then packages the model. {@link #extract()} reads those XMI files back and
 * runs {@link SumBasicAnnotator} with the packaged model, optionally followed by
 * a summary sentence writer (see {@link #setSentencesOutFile(File, boolean)}).
 */
public class SumBasic extends Summarize_ImplBase<File> {
    private static final int DEFAULT_MAX_NUM_SENTENCES = 10;
    private static final double DEFAULT_SEEN_WORDS_PROB = 1.0E-4d;
    private static final SumBasicModel.CompositionFunctionType DEFAULT_CF_TYPE = SumBasicModel.CompositionFunctionType.AVERAGE;
    private static final SumBasicAnnotator.TokenField DEFAULT_TOKEN_FIELD = SumBasicAnnotator.TokenField.COVERED_TEXT;

    // Locations fixed at construction time.
    private final File documentsDirectory;
    private final File modelDirectory;
    private final File xmiDirectory;
    private final File stopwordsFile;
    // Files found under documentsDirectory when the instance was created.
    private final Collection<File> items;
    private final SumBasicAnnotator.TokenField tokenField;

    // Values forwarded as arguments to JarClassifierBuilder.trainAndPackage in train().
    private int numSentences = DEFAULT_MAX_NUM_SENTENCES;
    private double seenWordsProbability = DEFAULT_SEEN_WORDS_PROB;
    private SumBasicModel.CompositionFunctionType cfType = DEFAULT_CF_TYPE;

    // Optional sentence output, enabled through setSentencesOutFile.
    private File sentencesOutFile;
    private boolean outputSentences = false;
    private boolean outputScores = false;

    /** Command-line options understood by {@link SumBasic#main(String[])}. */
    public static class Options extends Options_ImplBase {

        @Option(name = "--max-num-sentences", usage = "Specifies the maximum number of sentences to extract in the summary")
        public int maxNumSentences = SumBasic.DEFAULT_MAX_NUM_SENTENCES;

        @Option(name = "--seen-words-prob", usage = "Specify the probability for seen words.")
        public double seenWordsProbability = SumBasic.DEFAULT_SEEN_WORDS_PROB;

        @Option(name = "--composition-function", usage = "Specifies how word probabilities are combined (AVERAGE|SUM|PRODUCT, default=AVERAGE)")
        public SumBasicModel.CompositionFunctionType cfType = SumBasic.DEFAULT_CF_TYPE;

        @Option(name = "--token-field", usage = "Specifies what kind of token is used for summarization, (COVERED_TEXT|STEM|LEMMA, default=COVERED_TEXT)")
        public SumBasicAnnotator.TokenField tokenField = SumBasic.DEFAULT_TOKEN_FIELD;

        @Option(name = "--stopwords-file", usage = "Path to whitespace delimited stopwords text file")
        public File stopwordsFile = new File("src/main/resources/stopwords.txt");

        @Option(name = "--documents-dir", usage = "Path to documents to summarize")
        public File documentsDir = new File("src/test/resources/test_documents");

        @Option(name = "--model-dir", usage = "Path for saving model data")
        public File modelDir = new File("target/models");

        // NOTE(review): the temporary directory is created whenever Options is
        // instantiated, even if --xmi-dir is supplied afterwards.
        @Option(name = "--xmi-dir", usage = "Path for saving intermediate cas xmi files.  Leave unspecified for a temporary directory")
        public File xmiDir = Files.createTempDir();

        // The option name previously had no leading dash. args4j only treats
        // tokens starting with '-' as options, so it could never be matched.
        @Option(name = "--sentences-out-file", usage = "Path to the output file")
        public File sentencesOutFile = new File("target/sentences.out");

        // Same dash-less naming problem as above; the usage text was also a
        // copy of the file-path description although this is a boolean flag.
        @Option(name = "--output-scores", usage = "Enables score output in the summary sentence writer")
        public Boolean outputScores = false;
    }

    /**
     * Base class for annotators that map the document behind a CAS to an XMI
     * file inside a configured directory.
     */
    public abstract static class XMIAnnotator extends JCasAnnotator_ImplBase {

        /** Directory that holds the XMI files; mandatory configuration parameter. */
        @ConfigurationParameter(mandatory = true)
        protected File xmiDirectory;
        public static final String PARAM_XMI_DIRECTORY = ConfigurationParameterFactory.createConfigurationParameterName(XMIAnnotator.class, "xmiDirectory");

        /**
         * Computes the XMI file that corresponds to the document of the given CAS.
         *
         * <p>The file name of the document URI is taken, everything from its last
         * {@code '.'} onward is dropped, and {@code ".xmi"} is appended. The
         * extension is removed with plain string operations rather than a regular
         * expression so that extensions containing regex metacharacters are
         * handled correctly.
         *
         * @param jCas the CAS whose document URI determines the file name
         * @return the XMI file inside {@link #xmiDirectory}
         * @throws AnalysisEngineProcessException if the document URI cannot be obtained
         */
        protected File getFile(JCas jCas) throws AnalysisEngineProcessException {
            String documentName = new File(ViewURIUtil.getURI(jCas)).getName();
            int lastDot = documentName.lastIndexOf('.');
            String baseName = lastDot < 0 ? documentName : documentName.substring(0, lastDot);
            return new File(this.xmiDirectory, baseName + ".xmi");
        }
    }

    /** Populates the CAS from the XMI file that corresponds to its document. */
    public static class XMIReader extends XMIAnnotator {

        /**
         * Deserializes the matching XMI file into the given CAS.
         *
         * @param jCas the CAS to fill
         * @throws AnalysisEngineProcessException if the file cannot be read or parsed
         */
        public void process(JCas jCas) throws AnalysisEngineProcessException {
            // try-with-resources closes the stream on every path and keeps the
            // primary exception if close() fails as well.
            try (FileInputStream xmiIn = new FileInputStream(getFile(jCas))) {
                XmiCasDeserializer.deserialize(xmiIn, jCas.getCas());
            } catch (IOException | SAXException e) {
                throw new AnalysisEngineProcessException(e);
            }
        }
    }

    /** Serializes each processed CAS to an XMI file in the configured directory. */
    public static class XMIWriter extends XMIAnnotator {

        /**
         * Initializes the annotator and makes sure the XMI directory exists.
         *
         * @param uimaContext the UIMA context supplying the configuration
         * @throws ResourceInitializationException if base initialization fails or
         *         the XMI directory does not exist and cannot be created
         */
        public void initialize(UimaContext uimaContext) throws ResourceInitializationException {
            super.initialize(uimaContext);
            // mkdirs() reports false both on failure and when the directory was
            // created concurrently, hence the second isDirectory() check.
            if (!this.xmiDirectory.isDirectory() && !this.xmiDirectory.mkdirs() && !this.xmiDirectory.isDirectory()) {
                throw new ResourceInitializationException(new IOException("Unable to create XMI directory: " + this.xmiDirectory));
            }
        }

        /**
         * Writes the given CAS to its XMI file.
         *
         * @param jCas the CAS to serialize
         * @throws AnalysisEngineProcessException if the file cannot be written
         */
        public void process(JCas jCas) throws AnalysisEngineProcessException {
            XmiCasSerializer casSerializer = new XmiCasSerializer(jCas.getTypeSystem());
            try (FileOutputStream xmiOut = new FileOutputStream(getFile(jCas))) {
                casSerializer.serialize(jCas.getCas(), new XMLSerializer(xmiOut, false).getContentHandler());
            } catch (IOException | SAXException e) {
                throw new AnalysisEngineProcessException(e);
            }
        }
    }

    /**
     * Creates a summarizer and lists the files under the documents directory,
     * skipping entries rejected by the {@link UriCollectionReader} system
     * file and directory filters.
     *
     * @param documentsDirectory directory containing the documents to summarize
     * @param modelDirectory directory receiving training data and the packaged model
     * @param xmiDirectory directory used for the intermediate XMI files
     * @param stopwordsFile stopwords file handed to {@link SumBasicAnnotator}
     * @param tokenField token representation to use; {@code null} selects
     *        {@code COVERED_TEXT}
     */
    public SumBasic(File documentsDirectory, File modelDirectory, File xmiDirectory, File stopwordsFile, SumBasicAnnotator.TokenField tokenField) {
        this.documentsDirectory = documentsDirectory;
        this.modelDirectory = modelDirectory;
        this.xmiDirectory = xmiDirectory;
        this.stopwordsFile = stopwordsFile;
        // A null token field would otherwise surface as an NPE when the
        // pipelines are built.
        this.tokenField = tokenField != null ? tokenField : DEFAULT_TOKEN_FIELD;
        this.items = FileUtils.listFiles(this.documentsDirectory, new UriCollectionReader.RejectSystemFiles(), new UriCollectionReader.RejectSystemDirectories());
    }

    /**
     * Creates a URI collection reader over the given files.
     *
     * @param collection the files to read
     * @return a collection reader for those files
     * @throws Exception if the reader cannot be created
     */
    @Override // org.cleartk.summarization.Summarize_ImplBase
    protected CollectionReader getCollectionReader(Collection<File> collection) throws Exception {
        return UriCollectionReader.getCollectionReaderFromFiles(collection);
    }

    /**
     * Runs the training pipeline over the document files and packages the model,
     * passing the sentence limit, seen-words probability and composition
     * function as training arguments.
     *
     * @throws Exception if the pipeline or the packaging step fails
     */
    @Override // org.cleartk.summarization.Summarize_ImplBase
    protected void train() throws Exception {
        SimplePipeline.runPipeline(getCollectionReader(this.items), buildTrainingAggregate().createAggregateDescription());
        JarClassifierBuilder.trainAndPackage(
                this.modelDirectory,
                "--max-num-sentences", Integer.toString(this.numSentences),
                "--seen-words-prob", Double.toString(this.seenWordsProbability),
                "--composition-function", this.cfType.toString());
    }

    /**
     * Runs the extraction pipeline over the files currently present in the XMI
     * directory.
     *
     * @throws Exception if the pipeline fails
     */
    @Override // org.cleartk.summarization.Summarize_ImplBase
    public void extract() throws Exception {
        Collection<File> xmiFiles = FileUtils.listFiles(this.xmiDirectory, new UriCollectionReader.RejectSystemFiles(), new UriCollectionReader.RejectSystemDirectories());
        SimplePipeline.runPipeline(getCollectionReader(xmiFiles), buildExtractAggregate().createAggregateDescription());
    }

    /**
     * Assembles the training pipeline. Components are added in this order:
     * {@link UriToDocumentTextAnnotator}, {@link SentenceAnnotator},
     * {@link TokenAnnotator}, {@link PosTaggerAnnotator}, the English
     * {@link DefaultSnowballStemmer}, {@link SumBasicAnnotator} configured with
     * {@link SumBasicDataWriter} and the model directory, and finally
     * {@link XMIWriter}.
     *
     * @return the builder holding the training components
     * @throws ResourceInitializationException if a component description cannot be created
     */
    public AggregateBuilder buildTrainingAggregate() throws ResourceInitializationException {
        AggregateBuilder builder = new AggregateBuilder();
        builder.add(UriToDocumentTextAnnotator.getDescription());
        builder.add(SentenceAnnotator.getDescription());
        builder.add(TokenAnnotator.getDescription());
        builder.add(PosTaggerAnnotator.getDescription());
        builder.add(DefaultSnowballStemmer.getDescription("English"));
        builder.add(AnalysisEngineFactory.createPrimitiveDescription(
                SumBasicAnnotator.class,
                DefaultDataWriterFactory.PARAM_DATA_WRITER_CLASS_NAME, SumBasicDataWriter.class.getName(),
                DirectoryDataWriterFactory.PARAM_OUTPUT_DIRECTORY, this.modelDirectory.getPath(),
                SumBasicAnnotator.PARAM_TOKEN_FIELD, this.tokenField.name(),
                SumBasicAnnotator.PARAM_STOPWORDS_URI, this.stopwordsFile.toURI()));
        builder.add(AnalysisEngineFactory.createPrimitiveDescription(
                XMIWriter.class,
                XMIAnnotator.PARAM_XMI_DIRECTORY, this.xmiDirectory.getPath()));
        return builder;
    }

    /**
     * Assembles the extraction pipeline: {@link XMIReader}, then
     * {@link SumBasicAnnotator} in non-training mode using the model jar from
     * the model directory, and, when sentence output has been enabled with a
     * non-null file, the summary sentence writer.
     *
     * @return the builder holding the extraction components
     * @throws ResourceInitializationException if a component description cannot be created
     */
    public AggregateBuilder buildExtractAggregate() throws ResourceInitializationException {
        AggregateBuilder builder = new AggregateBuilder();
        // The directory is passed as a path string, matching buildTrainingAggregate.
        builder.add(AnalysisEngineFactory.createPrimitiveDescription(
                XMIReader.class,
                XMIAnnotator.PARAM_XMI_DIRECTORY, this.xmiDirectory.getPath()));
        builder.add(AnalysisEngineFactory.createPrimitiveDescription(
                SumBasicAnnotator.class,
                CleartkAnnotator.PARAM_IS_TRAINING, false,
                GenericJarClassifierFactory.PARAM_CLASSIFIER_JAR_PATH, JarClassifierBuilder.getModelJarFile(this.modelDirectory),
                SumBasicAnnotator.PARAM_TOKEN_FIELD, this.tokenField.name(),
                SumBasicAnnotator.PARAM_STOPWORDS_URI, this.stopwordsFile.toURI()));
        if (this.outputSentences && this.sentencesOutFile != null) {
            builder.add(SummarySentenceWriterAnnotator.getDescription(this.sentencesOutFile, this.outputScores));
        }
        return builder;
    }

    /**
     * Enables sentence output for the extraction pipeline.
     *
     * @param sentencesOutFile file handed to the summary sentence writer; when
     *        {@code null} the writer is not added to the pipeline
     * @param outputScores flag forwarded to the summary sentence writer
     */
    public void setSentencesOutFile(File sentencesOutFile, boolean outputScores) {
        this.sentencesOutFile = sentencesOutFile;
        this.outputSentences = true;
        this.outputScores = outputScores;
    }

    /**
     * Command-line entry point: parses {@link Options}, trains, then extracts.
     *
     * @param strArr command-line arguments
     * @throws Exception if training or extraction fails
     */
    public static void main(String[] strArr) throws Exception {
        Options options = new Options();
        options.parseOptions(strArr);
        // Honour every parsed option. Previously the token field was hard-coded
        // and the three training settings below were never copied over.
        SumBasic sumBasic = new SumBasic(options.documentsDir, options.modelDir, options.xmiDir, options.stopwordsFile, options.tokenField);
        sumBasic.numSentences = options.maxNumSentences;
        sumBasic.seenWordsProbability = options.seenWordsProbability;
        sumBasic.cfType = options.cfType;
        sumBasic.setSentencesOutFile(options.sentencesOutFile, Boolean.TRUE.equals(options.outputScores));
        System.out.println("Training");
        sumBasic.train();
        System.out.println("Extracting sentences");
        sumBasic.extract();
    }
}
