package banner.tagging;

import dragon.nlp.Sentence;
import dragon.nlp.Word;
import dragon.nlp.tool.Lemmatiser;
import edu.umass.cs.mallet.base.pipe.Pipe;
import edu.umass.cs.mallet.base.types.Instance;
import edu.umass.cs.mallet.base.types.LabelAlphabet;
import edu.umass.cs.mallet.base.types.LabelSequence;
import edu.umass.cs.mallet.base.types.Token;
import edu.umass.cs.mallet.base.types.TokenSequence;

/* loaded from: input_file:banner/tagging/String2TokenSequencePipe.class */
/**
 * Pipe that turns a whitespace-separated string of {@code word|label} tokens into a
 * {@link TokenSequence} (instance data) and a {@link LabelSequence} (instance target).
 *
 * <p>Each token receives the following binary features (value {@code 1.0}):
 * <ul>
 *   <li>{@code W=}: the lower-cased word;</li>
 *   <li>{@code POS=}: the part-of-speech index, only when a POS tagger is set;</li>
 *   <li>{@code LW=}: the lemma, only when a lemmatiser is set;</li>
 *   <li>{@code NC=} / {@code BNC=}: numeric normalisations, only when enabled;</li>
 *   <li>{@code WC=} / {@code BWC=}: character-class word shapes.</li>
 * </ul>
 *
 * <p>The lemmatiser and POS tagger are {@code transient}; they can be supplied again through
 * {@link #setLemmatiser(Lemmatiser)} and {@link #setPosTagger(dragon.nlp.tool.Tagger)}.
 */
public class String2TokenSequencePipe extends Pipe {
    private static final long serialVersionUID = 1;
    private transient Lemmatiser lemmatiser;
    private transient dragon.nlp.tool.Tagger posTagger;
    private boolean useNumericNormalization;

    /**
     * Creates the pipe.
     *
     * @param lemmatiser lemmatiser used for the {@code LW=} feature, or {@code null} to omit it
     * @param tagger     POS tagger used for the {@code POS=} feature, or {@code null} to omit it
     * @param z          whether to emit the {@code NC=} and {@code BNC=} numeric features
     */
    public String2TokenSequencePipe(Lemmatiser lemmatiser, dragon.nlp.tool.Tagger tagger, boolean z) {
        super((Class) null, LabelAlphabet.class);
        this.lemmatiser = lemmatiser;
        this.posTagger = tagger;
        this.useNumericNormalization = z;
    }

    /**
     * Replaces the lemmatiser.
     *
     * @param lemmatiser the lemmatiser to use, or {@code null} to disable the {@code LW=} feature
     */
    public void setLemmatiser(Lemmatiser lemmatiser) {
        this.lemmatiser = lemmatiser;
    }

    /**
     * Replaces the POS tagger.
     *
     * @param tagger the tagger to use, or {@code null} to disable the {@code POS=} feature
     */
    public void setPosTagger(dragon.nlp.tool.Tagger tagger) {
        this.posTagger = tagger;
    }

    /**
     * Converts the instance's string data into tokens with features and a label sequence.
     *
     * <p>The data is trimmed and split on whitespace; every resulting item is split on
     * {@code '|'} into the word (first part) and its label (second part). On return the
     * instance's data is the token sequence, its target is the label sequence and its source
     * is a {@link StringBuffer} holding the words, each followed by a single space.
     *
     * @param instance instance whose data is a {@code String} of {@code word|label} items
     * @return the same instance, modified in place
     * @throws ArrayIndexOutOfBoundsException if an item contains no {@code '|'}-separated label
     */
    public Instance pipe(Instance instance) {
        String[] rawTokens = ((String) instance.getData()).trim().split("\\s+");
        TokenSequence tokenSequence = new TokenSequence(rawTokens.length);
        LabelSequence labelSequence = new LabelSequence(getTargetAlphabet(), rawTokens.length);
        StringBuffer sourceText = new StringBuffer();

        // Tag the whole sentence once. The result is the same for every token, so running the
        // tagger inside the loop only repeated the identical work once per token.
        int[] posIndices = this.posTagger != null ? getPOS(rawTokens) : null;

        for (int i = 0; i < rawTokens.length; i++) {
            String[] parts = rawTokens[i].split("\\|");
            String word = parts[0];
            String label = parts[1];

            Token token = new Token(word);
            token.setFeatureValue("W=" + word.toLowerCase(), 1.0d);
            if (posIndices != null) {
                token.setFeatureValue("POS=" + posIndices[i], 1.0d);
            }
            if (this.lemmatiser != null) {
                // Pass the POS index to the lemmatiser only when one is available.
                String lemma = posIndices == null
                        ? this.lemmatiser.lemmatize(word)
                        : this.lemmatiser.lemmatize(word, posIndices[i]);
                token.setFeatureValue("LW=" + lemma, 1.0d);
            }
            if (this.useNumericNormalization) {
                token.setFeatureValue("NC=" + getNumberClass(word), 1.0d);
                token.setFeatureValue("BNC=" + getBriefNumberClass(word), 1.0d);
            }
            token.setFeatureValue("WC=" + getWordClass(word), 1.0d);
            token.setFeatureValue("BWC=" + getBriefWordClass(word), 1.0d);

            tokenSequence.add(token);
            labelSequence.add(label);
            sourceText.append(token.getText());
            sourceText.append(" ");
        }

        instance.setData(tokenSequence);
        instance.setTarget(labelSequence);
        instance.setSource(sourceText);
        return instance;
    }

    /** Replaces every ASCII digit with {@code '0'}; e.g. {@code "p53"} becomes {@code "p00"}. */
    private static String getNumberClass(String str) {
        return str.replaceAll("[0-9]", "0");
    }

    /**
     * Maps each character to its class: upper-case ASCII letter to {@code 'A'}, lower-case ASCII
     * letter to {@code 'a'}, ASCII digit to {@code '0'}, anything else to {@code 'x'}.
     */
    private static String getWordClass(String str) {
        return str.replaceAll("[A-Z]", "A").replaceAll("[a-z]", "a").replaceAll("[0-9]", "0").replaceAll("[^A-Za-z0-9]", "x");
    }

    /** Replaces every run of ASCII digits with a single {@code '0'}; e.g. {@code "p53"} becomes {@code "p0"}. */
    private static String getBriefNumberClass(String str) {
        return str.replaceAll("[0-9]+", "0");
    }

    /**
     * Same character classes as {@link #getWordClass(String)}, but each run of characters of one
     * class collapses to a single class character.
     */
    private static String getBriefWordClass(String str) {
        return str.replaceAll("[A-Z]+", "A").replaceAll("[a-z]+", "a").replaceAll("[0-9]+", "0").replaceAll("[^A-Za-z0-9]+", "x");
    }

    /**
     * Runs the POS tagger over the whole sentence.
     *
     * @param strArr the raw {@code word|label} items; only the part before the first
     *               {@code '|'} of each item is passed to the tagger
     * @return the POS index reported for each word, in input order
     */
    private int[] getPOS(String[] strArr) {
        Word[] words = new Word[strArr.length];
        Sentence sentence = new Sentence();
        for (int i = 0; i < strArr.length; i++) {
            words[i] = new Word(strArr[i].split("\\|")[0]);
            sentence.addWord(words[i]);
        }
        this.posTagger.tag(sentence);

        int[] posIndices = new int[strArr.length];
        for (int i = 0; i < strArr.length; i++) {
            posIndices[i] = words[i].getPOSIndex();
        }
        return posIndices;
    }
}
