package org.cleartk.ne.ace2005;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.regex.Pattern;
import org.apache.uima.UimaContext;
import org.apache.uima.cas.CASException;
import org.apache.uima.collection.CollectionException;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceInitializationException;
import org.apache.uima.util.FileUtils;
import org.apache.uima.util.Progress;
import org.apache.uima.util.ProgressImpl;
import org.cleartk.ne.type.Ace2005Document;
import org.cleartk.util.ViewURIUtil;
import org.jdom2.Element;
import org.jdom2.JDOMException;
import org.jdom2.input.SAXBuilder;
import org.uimafit.component.JCasCollectionReader_ImplBase;
import org.uimafit.descriptor.ConfigurationParameter;
import org.uimafit.descriptor.SofaCapability;
import org.uimafit.factory.ConfigurationParameterFactory;
import org.xml.sax.DTDHandler;

/**
 * Collection reader for ACE 2005 data laid out as pairs of files in one directory: an
 * {@code .sgm} source file and a matching {@code .apf.xml} annotation file.
 *
 * <p>For every {@code .sgm} file the reader
 * <ul>
 *   <li>sets the text of the {@code _InitialView} view to the file content with everything
 *       matching {@link #TAG_REGEX} removed,</li>
 *   <li>records the URI of the {@code .sgm} file through {@code ViewURIUtil},</li>
 *   <li>adds an {@code Ace2005Document} annotation carrying the {@code URI}, {@code SOURCE} and
 *       {@code TYPE} attributes of the APF root element, and</li>
 *   <li>creates the {@code Ace2005Constants.ACE_2005_APF_URI_VIEW} view whose sofa data URI points
 *       at the APF file.</li>
 * </ul>
 */
@SofaCapability(outputSofas = {Ace2005Constants.ACE_2005_APF_URI_VIEW, "UriView"})
public class Ace2005GoldReader extends JCasCollectionReader_ImplBase {

    @ConfigurationParameter(mandatory = true, description = "Takes the name of directory that contains ACE data.  Typically, a folder such as \".../ACE_2005/optimization/English/all\".  The folder should contain files that come in pairs - i.e. for each .sgm file there should be a corresponding .apf.xml file.")
    private String aceDirectoryName;
    private static final String PARAM_ACE_FILE_NAMES_DESCRIPTION = "takes a file that contains the names of the files to read.   \nThe file should contain a list of the files in AceCorpusDir (one file name per line) \nthat you want read in. File names should not include the last suffix(es) (e.g. \".sgm\" or \"apf.xml\") \nIf parameter value is not given, then all files will be read in. An example file might look like this: \n\nAFP_ENG_20030304.0250\nAFP_ENG_20030305.0918\n...\n";

    @ConfigurationParameter(description = PARAM_ACE_FILE_NAMES_DESCRIPTION)
    private String aceFileNamesFile;

    /** Candidate files to read; entries whose name does not end in {@code .sgm} are skipped. */
    File[] aceFiles;

    /** Index into {@link #aceFiles} of the next entry that has not been examined yet. */
    int aceFileIndex;

    /** Reset to zero by {@link #initialize(UimaContext)}; not otherwise used in this class. */
    int aceFileCount;

    /** File located by {@link #hasNext()} and not yet consumed by {@link #getNext(JCas)}. */
    File currentSGMFile = null;

    /** Reluctant regular expression matching a single SGML tag. */
    public static final String TAG_REGEX = "<.*?>";

    /** Compiled form of {@link #TAG_REGEX}; created in {@link #initialize(UimaContext)}. */
    Pattern tagPattern;

    public static final String PARAM_ACE_DIRECTORY_NAME = ConfigurationParameterFactory.createConfigurationParameterName(Ace2005GoldReader.class, "aceDirectoryName");
    public static final String PARAM_ACE_FILE_NAMES_FILE = ConfigurationParameterFactory.createConfigurationParameterName(Ace2005GoldReader.class, "aceFileNamesFile");

    /** Length of the {@code "sgm"} extension (without the dot) stripped when deriving APF names. */
    private static final int SGM_EXTENSION_LENGTH = 3;

    /** Suffixes tried, in this order, when looking for the APF file that belongs to an .sgm file. */
    private static final String[] APF_SUFFIXES = {"apf.xml", "entities.apf.xml", "mentions.apf.xml"};

    /**
     * Resolves the set of files to read.
     *
     * <p>Without a file-names file, every entry of the ACE directory is a candidate. With one,
     * each non-blank line names one document; {@code .sgm} is appended unless already present,
     * and every resulting file must exist.
     *
     * @param uimaContext the UIMA context (not used directly here)
     * @throws ResourceInitializationException if the directory does not exist or cannot be listed,
     *         the file-names file cannot be read, or a listed document is missing
     */
    public void initialize(UimaContext uimaContext) throws ResourceInitializationException {
        File aceDirectory = new File(this.aceDirectoryName);
        if (!aceDirectory.exists()) {
            throw new ResourceInitializationException(new IOException(String.format("directory %s does not exist", this.aceDirectoryName)));
        }
        if (this.aceFileNamesFile == null || this.aceFileNamesFile.trim().equals("")) {
            File[] listed = aceDirectory.listFiles();
            // listFiles() returns null (not an empty array) when the path is not a directory or
            // an I/O error occurs; fail here instead of with a NullPointerException later on.
            if (listed == null) {
                throw new ResourceInitializationException(new IOException(String.format("%s is not a readable directory", this.aceDirectoryName)));
            }
            this.aceFiles = listed;
        } else {
            try {
                this.aceFiles = readListedFiles(aceDirectory);
            } catch (IOException e) {
                throw new ResourceInitializationException(e);
            }
            for (File listedFile : this.aceFiles) {
                if (!listedFile.exists()) {
                    throw new ResourceInitializationException("could_not_access_data", new Object[]{listedFile});
                }
            }
        }
        this.aceFileIndex = 0;
        this.aceFileCount = 0;
        this.tagPattern = Pattern.compile(TAG_REGEX, Pattern.MULTILINE | Pattern.DOTALL);
    }

    /**
     * Reads the file-names file and maps every non-blank line to an {@code .sgm} file inside the
     * given directory.
     */
    private File[] readListedFiles(File aceDirectory) throws IOException {
        ArrayList<File> listedFiles = new ArrayList<File>();
        BufferedReader reader = new BufferedReader(new FileReader(this.aceFileNamesFile));
        try {
            String line;
            while ((line = reader.readLine()) != null) {
                String name = line.trim();
                if (name.length() == 0) {
                    // A blank (e.g. trailing) line would otherwise become the bogus file ".sgm".
                    continue;
                }
                if (!name.endsWith(".sgm")) {
                    name = name + ".sgm";
                }
                listedFiles.add(new File(aceDirectory, name));
            }
        } finally {
            reader.close();
        }
        return listedFiles.toArray(new File[listedFiles.size()]);
    }

    /**
     * Returns the next {@code .sgm} file without consuming it: the result is cached in
     * {@link #currentSGMFile} until {@link #getNext(JCas)} clears it.
     *
     * @return the next {@code .sgm} file, or {@code null} when no candidate is left
     */
    private File getNextSGMFile() {
        if (this.currentSGMFile != null) {
            return this.currentSGMFile;
        }
        while (this.aceFileIndex < this.aceFiles.length) {
            File candidate = this.aceFiles[this.aceFileIndex++];
            if (candidate.getName().endsWith(".sgm")) {
                this.currentSGMFile = candidate;
                return candidate;
            }
        }
        return null;
    }

    /**
     * Finds the APF annotation file that belongs to an {@code .sgm} file by replacing the
     * trailing {@code sgm} with each of {@link #APF_SUFFIXES} in turn.
     *
     * @param file a file whose path ends in {@code sgm}
     * @return the first candidate that exists, or {@code null} if none does
     */
    private File getAPFFile(File file) {
        String path = file.getPath();
        // Keeps the trailing dot, e.g. "dir/doc.sgm" -> "dir/doc."
        String stem = path.substring(0, path.length() - SGM_EXTENSION_LENGTH);
        for (String suffix : APF_SUFFIXES) {
            File candidate = new File(stem + suffix);
            if (candidate.exists()) {
                return candidate;
            }
        }
        return null;
    }

    /**
     * Removes every match of {@link #tagPattern} from the given text.
     *
     * @param str raw content of an {@code .sgm} file
     * @return the content with all tag matches deleted
     */
    private String getDocumentText(String str) {
        return this.tagPattern.matcher(str).replaceAll("");
    }

    /**
     * Loads the next document into the given CAS.
     *
     * @param jCas the CAS to populate
     * @throws IOException if the {@code .sgm} file cannot be read or no APF file exists for it
     * @throws CollectionException if no document is left, or on CAS or XML parsing problems
     */
    public void getNext(JCas jCas) throws IOException, CollectionException {
        File sgmFile = getNextSGMFile();
        if (sgmFile == null) {
            throw new CollectionException(new IllegalStateException("getNext() called but no .sgm file is left to read"));
        }
        // Mark the file as consumed so that the next hasNext() call advances.
        this.currentSGMFile = null;

        File apfFile = getAPFFile(sgmFile);
        if (apfFile == null) {
            throw new IOException(String.format("no .apf.xml file found for %s", sgmFile.getPath()));
        }

        try {
            String sgmText = FileUtils.file2String(sgmFile);
            JCas initialView = jCas.getView("_InitialView");
            initialView.setDocumentText(getDocumentText(sgmText));

            SAXBuilder builder = new SAXBuilder();
            builder.setDTDHandler((DTDHandler) null);
            Element apfRoot = builder.build(apfFile).getRootElement();
            String aceUri = apfRoot.getAttributeValue("URI");
            String aceSource = apfRoot.getAttributeValue("SOURCE");
            String aceType = apfRoot.getAttributeValue("TYPE");

            ViewURIUtil.setURI(jCas, sgmFile.toURI());

            Ace2005Document document = new Ace2005Document(initialView);
            document.setAceUri(aceUri);
            document.setAceSource(aceSource);
            document.setAceType(aceType);
            document.addToIndexes();

            jCas.createView(Ace2005Constants.ACE_2005_APF_URI_VIEW).setSofaDataURI(apfFile.toURI().toString(), (String) null);
        } catch (CASException e) {
            throw new CollectionException(e);
        } catch (JDOMException e) {
            throw new CollectionException(e);
        }
    }

    /** Nothing to release: files are opened and closed within the methods that read them. */
    public void close() throws IOException {
    }

    /**
     * Reports progress as the number of candidate files examined so far (including skipped
     * non-{@code .sgm} entries) out of the total number of candidates.
     */
    public Progress[] getProgress() {
        return new Progress[]{new ProgressImpl(this.aceFileIndex, this.aceFiles.length, "entities")};
    }

    /**
     * @return {@code true} if another {@code .sgm} file is available
     */
    public boolean hasNext() throws IOException, CollectionException {
        return getNextSGMFile() != null;
    }

    /** Sets the directory that contains the ACE data. */
    public void setAceDirectoryName(String str) {
        this.aceDirectoryName = str;
    }

    /** Sets the path of the optional file that lists the documents to read. */
    public void setAceFileNamesFile(String str) {
        this.aceFileNamesFile = str;
    }
}
