package org.codelibs.elasticsearch.vi.analysis;

import java.io.IOException;
import java.io.InputStream;
import java.io.LineNumberReader;
import java.io.Reader;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Properties;
import java.util.concurrent.CopyOnWriteArrayList;
import java.util.regex.Matcher;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.codelibs.elasticsearch.vi.nlp.lexicon.LexiconUnmarshaller;
import org.codelibs.elasticsearch.vi.nlp.lexicon.jaxb.W;
import org.codelibs.elasticsearch.vi.nlp.tokenizer.ResultMerger;
import org.codelibs.elasticsearch.vi.nlp.tokenizer.ResultSplitter;
import org.codelibs.elasticsearch.vi.nlp.tokenizer.segmenter.Segmenter;
import org.codelibs.elasticsearch.vi.nlp.tokenizer.segmenter.UnigramResolver;
import org.codelibs.elasticsearch.vi.nlp.tokenizer.tokens.LexerRule;
import org.codelibs.elasticsearch.vi.nlp.tokenizer.tokens.TaggedWord;

/* loaded from: input_file:org/codelibs/elasticsearch/vi/analysis/TaggedWordTokenizer.class */
/**
 * Splits text into {@link TaggedWord}s using a list of {@link LexerRule}s.
 *
 * <p>The rules and the helper components ({@link Segmenter}, {@link ResultSplitter},
 * {@link ResultMerger}) are configured from the classpath resource
 * {@code /tokenizer.properties}. For every input line the rule whose pattern matches the
 * longest prefix wins; phrases containing spaces are further segmented, named entities are
 * handed to the splitter, and the collected tokens are finally passed through the merger.
 */
public class TaggedWordTokenizer {
    private static final Logger logger = LogManager.getLogger(TaggedWordTokenizer.class);

    private Segmenter segmenter;
    private ResultMerger resultMerger;
    private ResultSplitter resultSplitter;
    private final List<LexerRule> rules = new ArrayList<>();

    /**
     * Creates a tokenizer configured from {@code /tokenizer.properties}.
     *
     * <p>An {@link IOException} raised while reading the resource (or closing it) is logged at
     * WARN level and otherwise ignored; in that case the instance may be left partially
     * initialised.
     */
    public TaggedWordTokenizer() {
        // try-with-resources guarantees the stream is closed even when loading or the
        // construction of one of the components throws.
        try (InputStream in = getClass().getResourceAsStream("/tokenizer.properties")) {
            final Properties properties = new Properties();
            properties.load(in);
            loadLexerRules(properties.getProperty("lexers"));
            this.resultMerger = new ResultMerger();
            this.resultSplitter = new ResultSplitter(properties);
            this.segmenter = new Segmenter(properties, new UnigramResolver(properties.getProperty("unigramModel")));
        } catch (IOException e) {
            logger.warn(e);
        }
    }

    /**
     * Appends one {@link LexerRule} per lexicon entry to {@link #rules}.
     *
     * @param lexers value of the {@code lexers} property, handed unchanged to
     *               {@link LexiconUnmarshaller#unmarshal}
     */
    private void loadLexerRules(String lexers) {
        for (W entry : new LexiconUnmarshaller().unmarshal(lexers).getBody().getW()) {
            this.rules.add(new LexerRule(entry.getMsd(), entry.getContent()));
        }
    }

    /**
     * Tokenizes everything that can be read from {@code reader}.
     *
     * @param reader source of the text; it is wrapped in a {@link LineNumberReader} and read
     *               line by line until exhausted (it is not closed here)
     * @return the tokens after {@link ResultMerger#mergeList}, or an empty list when the input
     *         produced no tokens
     * @throws IOException if reading from {@code reader} fails
     */
    public List<TaggedWord> tokenize(Reader reader) throws IOException {
        final List<TaggedWord> tokens = new ArrayList<>();
        final LineNumberReader lineReader = new LineNumberReader(reader);
        String line = null;
        // NOTE(review): the column counter is never reset when a new line is read, and after a
        // segmented phrase it continues from the end of the phrase. Kept exactly as before
        // because callers may rely on the current values - confirm before changing.
        int column = 1;
        while (true) {
            if (line == null || line.trim().isEmpty()) {
                line = lineReader.readLine();
                if (line == null) {
                    break;
                }
            }

            // Longest prefix match wins; on equal length the earliest rule is kept.
            LexerRule matchedRule = null;
            int matchEnd = -1;
            for (final LexerRule rule : this.rules) {
                final Matcher matcher = rule.getPattern().matcher(line);
                if (matcher.lookingAt() && matcher.end() > matchEnd) {
                    matchEnd = matcher.end();
                    matchedRule = rule;
                }
            }

            final int tokenEnd;
            if (matchedRule == null) {
                // No rule matched. Previously this ran into charAt(-1) / substring(0, -1).
                if (line.isEmpty()) {
                    // Nothing to consume; the loop head fetches the next line.
                    continue;
                }
                // Recover by emitting the first code point as a "phrase" so that rule matching
                // can resume right after it.
                matchedRule = new LexerRule("phrase");
                tokenEnd = Character.charCount(line.codePointAt(0));
            } else {
                tokenEnd = backOffBeforeAt(line, matchEnd);
            }

            final int lineNumber = lineReader.getLineNumber();
            final TaggedWord token = new TaggedWord(matchedRule, line.substring(0, tokenEnd), lineNumber, column);
            column += tokenEnd;
            line = line.substring(tokenEnd).trim();

            if (token.isPhrase()) {
                final String phrase = token.getText().trim();
                if (phrase.contains(" ")) {
                    final String ruleName = token.getRule().getName();
                    final String[] words = segmentPhrase(phrase);
                    if (words == null) {
                        // The segmenter produced nothing usable: keep the phrase unsegmented
                        // instead of failing with a NullPointerException.
                        tokens.add(token);
                    } else {
                        for (final String word : words) {
                            tokens.add(new TaggedWord(new LexerRule(ruleName), word, lineNumber, column));
                            column += word.length();
                        }
                    }
                } else if (!phrase.isEmpty()) {
                    tokens.add(token);
                }
            } else if (token.isNamedEntity()) {
                final TaggedWord[] parts = this.resultSplitter.split(token);
                if (parts == null) {
                    tokens.add(token);
                } else {
                    for (final TaggedWord part : parts) {
                        tokens.add(part);
                    }
                }
            } else if (!token.getText().trim().isEmpty()) {
                tokens.add(token);
            }
        }
        return tokens.isEmpty() ? tokens : this.resultMerger.mergeList(tokens);
    }

    /**
     * Returns where the token that starts at index 0 of {@code line} should end.
     *
     * <p>Normally that is {@code matchEnd}. When the character right after the match is
     * {@code '@'}, the end is moved back to the closest preceding space; if there is no such
     * space (the scan reaches index 0) {@code matchEnd} is used after all.
     */
    private static int backOffBeforeAt(String line, int matchEnd) {
        int end = matchEnd;
        if (matchEnd < line.length() && line.charAt(matchEnd) == '@') {
            while (end > 0 && line.charAt(end) != ' ') {
                end--;
            }
        }
        return end == 0 ? matchEnd : end;
    }

    /**
     * Segments a multi-word phrase into words.
     *
     * @return the single candidate, the result of {@link Segmenter#resolveAmbiguity} when there
     *         are several candidates, or {@code null} when the segmenter returned none
     */
    private String[] segmentPhrase(String phrase) {
        // Work on a private snapshot of the segmenter's result, as the code always has.
        final CopyOnWriteArrayList<String[]> candidates = new CopyOnWriteArrayList<>(this.segmenter.segment(phrase));
        if (candidates.size() > 1) {
            return this.segmenter.resolveAmbiguity(candidates);
        }
        return candidates.isEmpty() ? null : candidates.get(0);
    }
}
