package org.cleartk.corpus.conll2003;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.ArrayList;
import java.util.List;
import org.apache.uima.UimaContext;
import org.apache.uima.collection.CollectionException;
import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.cas.FSArray;
import org.apache.uima.jcas.tcas.Annotation;
import org.apache.uima.resource.ResourceInitializationException;
import org.apache.uima.util.Progress;
import org.apache.uima.util.ProgressImpl;
import org.cleartk.ne.type.Chunk;
import org.cleartk.ne.type.NamedEntity;
import org.cleartk.ne.type.NamedEntityMention;
import org.cleartk.token.type.Sentence;
import org.cleartk.token.type.Token;
import org.cleartk.util.ViewURIUtil;
import org.uimafit.component.JCasCollectionReader_ImplBase;
import org.uimafit.descriptor.ConfigurationParameter;
import org.uimafit.descriptor.SofaCapability;
import org.uimafit.factory.ConfigurationParameterFactory;

/**
 * Collection reader that turns a CoNLL-2003 style data file into one CAS per document.
 *
 * <p>Documents are delimited by lines starting with {@link #DOCSTART}; the line directly after
 * such a marker is skipped. Within a document, every non-blank line is split on single spaces and
 * must provide at least four columns: word, part-of-speech tag, chunk tag and named entity tag.
 * A blank line ends the current sentence.
 *
 * <p>For each document the reader builds the document text by joining the words with a single
 * space (the text therefore ends with a trailing space) and adds {@link Token}, {@link Chunk},
 * {@link Sentence} and - when {@code loadNamedEntities} is set - {@link NamedEntity} /
 * {@link NamedEntityMention} annotations.
 */
@SofaCapability(outputSofas = {"UriView"})
/* loaded from: input_file:org/cleartk/corpus/conll2003/Conll2003GoldReader.class */
public class Conll2003GoldReader extends JCasCollectionReader_ImplBase {

    /** Prefix of the line that marks the start of a new document. */
    public static final String DOCSTART = "-DOCSTART-";

    public static final String PARAM_DATA_FILE_NAME = ConfigurationParameterFactory.createConfigurationParameterName(Conll2003GoldReader.class, "dataFileName");
    public static final String PARAM_LOAD_NAMED_ENTITIES = ConfigurationParameterFactory.createConfigurationParameterName(Conll2003GoldReader.class, "loadNamedEntities");

    @ConfigurationParameter(mandatory = true, description = "Points to CoNLL data (e.g. ner/eng.train).")
    private String dataFileName;

    @ConfigurationParameter(mandatory = true, description = "determines if the named entities are loaded (i.e. named entity mention annotations are created) or if just plain text from the files is loaded.", defaultValue = {"true"})
    private boolean loadNamedEntities;

    /** Reader over the data file; opened in {@link #initialize(UimaContext)}. */
    BufferedReader reader;

    /** Whether another document can be produced by {@link #getNext(JCas)}. */
    boolean hasNext = true;

    /** Zero-based index of the next document; also used as the URI fragment. */
    int documentIndex = 0;

    /** Running counter used to generate entity ids; never reset between documents. */
    int entityIdIndex = 0;

    /** Trimmed lines of the document currently being processed. */
    List<String> documentData;

    /** Text of the current document, built up word by word. */
    StringBuffer documentText;

    /** Offset in {@link #documentText} where the current sentence starts. */
    int sentenceStart;

    /** Tokens of the sentence currently being read. */
    List<Token> sentenceTokens;

    /** Chunks created so far for the sentence currently being read. */
    List<Chunk> sentenceChunks;

    int tokenPosition;

    /** Offset in {@link #documentText} where the open chunk starts. */
    int chunkStart;

    /** Chunk tag of the open chunk, or the empty string when no chunk is open. */
    String currentChunkType;

    /** Tokens of the open chunk. */
    List<Token> chunkTokens;

    /** Offset in {@link #documentText} where the open named entity starts. */
    int namedEntityStart;

    /** Named entity tag of the open entity, or the empty string when none is open. */
    String currentNamedEntityType;

    /** Tokens of the open named entity. */
    List<Token> namedEntityTokens;

    /**
     * Opens the data file and positions the reader at the first document.
     *
     * <p>Everything up to and including the first {@link #DOCSTART} line, plus the line that
     * follows it, is skipped. If no marker is found, or nothing follows it, the reader reports
     * that it has no documents.
     *
     * @param uimaContext the UIMA context (not used by this method)
     * @throws ResourceInitializationException if the data file cannot be opened or read
     */
    public void initialize(UimaContext uimaContext) throws ResourceInitializationException {
        this.sentenceTokens = new ArrayList<Token>();
        this.sentenceChunks = new ArrayList<Chunk>();
        this.chunkTokens = new ArrayList<Token>();
        this.namedEntityTokens = new ArrayList<Token>();
        try {
            this.reader = new BufferedReader(new FileReader(new File(this.dataFileName)));
            String line;
            boolean foundDocStart = false;
            while ((line = this.reader.readLine()) != null) {
                if (line.trim().startsWith(DOCSTART)) {
                    foundDocStart = true;
                    break;
                }
            }
            // Skip the separator line after the marker; without a marker (or with nothing
            // after it) there is no document to read.
            this.hasNext = foundDocStart && this.reader.readLine() != null;
        } catch (IOException e) {
            // Do not leak the file handle when initialization fails half-way.
            if (this.reader != null) {
                try {
                    this.reader.close();
                } catch (IOException ignored) {
                    // The original failure is the one worth reporting.
                }
            }
            throw new ResourceInitializationException(e);
        }
    }

    /**
     * Reads the next document from the data file and populates the given CAS with its text,
     * annotations and URI.
     *
     * @param jCas the CAS to fill
     * @throws IOException if reading fails or a data line has fewer than four columns
     * @throws CollectionException declared by the reader contract; not thrown directly here
     */
    public void getNext(JCas jCas) throws IOException, CollectionException {
        this.documentData = new ArrayList<String>();
        String line;
        while ((line = this.reader.readLine()) != null && !line.trim().startsWith(DOCSTART)) {
            this.documentData.add(line.trim());
        }
        if (line == null) {
            this.hasNext = false;
        } else if (this.reader.readLine() == null) {
            // The marker was the last line of the file: nothing follows it.
            this.hasNext = false;
        }

        this.documentText = new StringBuffer();
        initSentence();
        this.tokenPosition = 0;
        this.chunkStart = 0;
        this.currentChunkType = "";
        this.chunkTokens.clear();
        this.namedEntityStart = 0;
        this.currentNamedEntityType = "";
        this.namedEntityTokens.clear();

        for (String row : this.documentData) {
            if (row.length() == 0) {
                endSentence(jCas);
                continue;
            }
            String[] columns = row.split(" ");
            if (columns.length < 4) {
                throw new IOException("Malformed CoNLL line (expected at least 4 space-separated columns) in document "
                        + this.documentIndex + " of " + this.dataFileName + ": '" + row + "'");
            }
            String word = columns[0];
            String pos = columns[1];
            String chunkTag = columns[2];
            String entityTag = columns[3];

            int begin = this.documentText.length();
            Token token = new Token(jCas, begin, begin + word.length());
            token.setPos(pos);
            token.addToIndexes();

            // The spans are closed before the word is appended, so their end offset
            // (documentText.length() - 1) is the end of the previous token.
            if (this.currentChunkType.equals("")) {
                initChunk(chunkTag);
            } else if (!continuesSpan(this.currentChunkType, chunkTag)) {
                createChunk(jCas);
                initChunk(chunkTag);
            }
            if (this.currentNamedEntityType.equals("")) {
                initNamedEntity(entityTag);
            } else if (!continuesSpan(this.currentNamedEntityType, entityTag)) {
                createNamedEntity(jCas);
                initNamedEntity(entityTag);
            }

            this.sentenceTokens.add(token);
            this.chunkTokens.add(token);
            this.namedEntityTokens.add(token);
            this.documentText.append(word).append(' ');
        }
        // Flush a final sentence that is not terminated by a blank line.
        endSentence(jCas);

        jCas.setDocumentText(this.documentText.toString());
        URI fileUri = new File(this.dataFileName).toURI();
        try {
            // The document index is used as the URI fragment to tell documents of one file apart.
            ViewURIUtil.setURI(jCas, new URI(fileUri.getScheme(), fileUri.getHost(), fileUri.getPath(), String.valueOf(this.documentIndex)));
            this.documentIndex++;
        } catch (URISyntaxException e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * Closes the open chunk and named entity, adds a {@link Sentence} covering the text since
     * {@link #sentenceStart} and resets the sentence state. Does nothing when the current
     * sentence has no tokens, so repeated or leading blank lines are ignored.
     */
    private void endSentence(JCas jCas) {
        if (this.sentenceTokens.isEmpty()) {
            return;
        }
        createChunk(jCas);
        this.currentChunkType = "";
        createNamedEntity(jCas);
        this.currentNamedEntityType = "";
        new Sentence(jCas, this.sentenceStart, this.documentText.length()).addToIndexes();
        initSentence();
    }

    /** Starts a new sentence at the current end of the document text. */
    private void initSentence() {
        this.sentenceStart = this.documentText.length();
        this.sentenceTokens.clear();
        this.sentenceChunks.clear();
    }

    /**
     * Adds a {@link Chunk} for the open chunk, spanning from {@link #chunkStart} to the end of
     * the last appended word. Nothing is created for the "O" tag or when no chunk is open.
     */
    private void createChunk(JCas jCas) {
        if (this.currentChunkType.equals("") || this.currentChunkType.equals("O")) {
            return;
        }
        // length() - 1 drops the space appended after the last word.
        Chunk chunk = new Chunk(jCas, this.chunkStart, this.documentText.length() - 1);
        // Strip the two-character "B-" / "I-" prefix from the tag.
        chunk.setChunkType(this.currentChunkType.substring(2));
        chunk.addToIndexes();
        this.sentenceChunks.add(chunk);
    }

    /** Opens a new chunk with the given tag at the current end of the document text. */
    private void initChunk(String str) {
        this.chunkStart = this.documentText.length();
        this.chunkTokens.clear();
        this.currentChunkType = str;
    }

    /**
     * Adds a {@link NamedEntity} with a single {@link NamedEntityMention} for the open entity,
     * spanning from {@link #namedEntityStart} to the end of the last appended word. Nothing is
     * created for the "O" tag, when no entity is open, or when named entity loading is disabled.
     */
    private void createNamedEntity(JCas jCas) {
        if (!this.loadNamedEntities
                || this.currentNamedEntityType.equals("")
                || this.currentNamedEntityType.equals("O")) {
            return;
        }
        // Strip the two-character "B-" / "I-" prefix from the tag.
        String entityType = this.currentNamedEntityType.substring(2);
        // length() - 1 drops the space appended after the last word.
        int end = this.documentText.length() - 1;

        NamedEntity namedEntity = new NamedEntity(jCas);
        namedEntity.setEntityClass("SPC");
        namedEntity.setEntityId(String.valueOf(this.entityIdIndex++));
        namedEntity.setEntityType(entityType);
        namedEntity.setEntitySubtype(entityType);
        namedEntity.addToIndexes();

        NamedEntityMention namedEntityMention = new NamedEntityMention(jCas, this.namedEntityStart, end);
        namedEntityMention.setMentionType("NAM");
        Annotation annotation = new Annotation(jCas, this.namedEntityStart, end);
        annotation.addToIndexes();
        namedEntityMention.setAnnotation(annotation);
        namedEntityMention.setHead(annotation);
        namedEntityMention.setMentionedEntity(namedEntity);
        namedEntityMention.addToIndexes();

        namedEntity.setMentions(new FSArray(jCas, 1));
        namedEntity.setMentions(0, namedEntityMention);
    }

    /** Opens a new named entity with the given tag at the current end of the document text. */
    private void initNamedEntity(String str) {
        this.namedEntityStart = this.documentText.length();
        this.namedEntityTokens.clear();
        this.currentNamedEntityType = str;
    }

    /**
     * Decides whether a token tagged {@code nextTag} belongs to the span opened with
     * {@code openTag}. A tag starting with "B-" always begins a new span; otherwise the span
     * continues when the tags are equal or when a "B" tag is followed by the matching "I" tag.
     */
    private boolean continuesSpan(String openTag, String nextTag) {
        if (nextTag.startsWith("B-")) {
            return false;
        }
        return nextTag.equals(openTag) || startsWithB(openTag, nextTag);
    }

    /**
     * Returns true when {@code str} is a "B" tag and {@code str2} is the "I" tag with the same
     * remainder (for example "B-NP" followed by "I-NP").
     */
    private boolean startsWithB(String str, String str2) {
        return str.startsWith("B") && str2.startsWith("I") && str2.substring(1).equals(str.substring(1));
    }

    /**
     * Closes the underlying file reader, if one was opened.
     *
     * @throws IOException if closing the reader fails
     */
    public void close() throws IOException {
        if (this.reader != null) {
            this.reader.close();
        }
    }

    /**
     * Reports the number of documents read so far.
     *
     * @return a single progress entry in "entities" units
     */
    public Progress[] getProgress() {
        // NOTE(review): 5000 is a hard-coded total, presumably a rough estimate - confirm.
        return new Progress[]{new ProgressImpl(this.documentIndex, 5000, "entities")};
    }

    /**
     * @return whether another document can be read with {@link #getNext(JCas)}
     */
    public boolean hasNext() throws IOException, CollectionException {
        return this.hasNext;
    }

    /** Sets the path of the CoNLL data file to read. */
    public void setDataFileName(String str) {
        this.dataFileName = str;
    }

    /** Sets whether named entity annotations are created. */
    public void setLoadNamedEntities(boolean z) {
        this.loadNamedEntities = z;
    }
}
