package de.datexis.preprocess;

import com.google.common.base.Optional;
import com.optimaize.langdetect.LanguageDetector;
import com.optimaize.langdetect.LanguageDetectorBuilder;
import com.optimaize.langdetect.i18n.LdLocale;
import com.optimaize.langdetect.ngram.NgramExtractors;
import com.optimaize.langdetect.profiles.LanguageProfileReader;
import com.optimaize.langdetect.text.CommonTextObjectFactories;
import com.optimaize.langdetect.text.TextObjectFactory;
import de.datexis.common.Resource;
import de.datexis.common.WordHelpers;
import de.datexis.model.Document;
import de.datexis.model.Sentence;
import de.datexis.model.Token;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Locale;
import java.util.TreeMap;
import opennlp.tools.sentdetect.SentenceDetectorME;
import opennlp.tools.sentdetect.SentenceModel;
import opennlp.tools.tokenize.TokenizerME;
import opennlp.tools.tokenize.TokenizerModel;
import opennlp.tools.util.Span;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/* loaded from: input_file:de/datexis/preprocess/DocumentFactory.class */
/**
 * Builds {@link Document}, {@link Sentence} and {@link Token} objects from raw or pre-tokenized
 * text.
 *
 * <p>Sentence splitting and tokenization use OpenNLP models that are loaded per language code
 * ("en" and "de"); any other language code falls back to the English models. The language of a
 * text is guessed with the language detector built in the constructor.
 *
 * <p>A shared instance is available through {@link #getInstance()} and backs all static
 * convenience methods of this class.
 */
public class DocumentFactory {
    protected static final Logger log = LoggerFactory.getLogger(DocumentFactory.class);
    protected static DocumentFactory instance = new DocumentFactory();
    private static final String LANG_EN = "en";
    private static final String LANG_DE = "de";

    /** Sentence detectors keyed by language code. */
    TreeMap<String, SentenceDetectorME> sentenceSplitter = new TreeMap<>();

    /** Tokenizers keyed by language code. */
    TreeMap<String, TokenizerMENL> newlineTokenizer = new TreeMap<>();

    /** Converts raw text into the input of the language detector; null if profiles failed to load. */
    TextObjectFactory textObjectFactory;

    /** Language detector; null if the built-in profiles failed to load. */
    LanguageDetector languageDetector;

    /** Controls what happens to newline tokens when a text is added to a document. */
    public enum Newlines {
        /** Newline tokens are added to the sentence like any other token. */
        KEEP,
        /** Newline tokens are not added; repeated newlines are removed from the token offsets. */
        DISCARD
    }

    /**
     * @return the shared factory instance used by the static convenience methods
     */
    public static DocumentFactory getInstance() {
        return instance;
    }

    /**
     * Loads the English and German OpenNLP models from the JAR and builds the language detector.
     *
     * <p>A failure to read the language profiles is logged and leaves the detector unset, in which
     * case {@link #detectLanguage(String)} returns an empty string.
     *
     * @throws IllegalStateException if one of the OpenNLP models cannot be loaded
     */
    public DocumentFactory() {
        loadSentenceSplitter(LANG_EN, Resource.fromJAR("openNLP/en-sent.bin"));
        loadTokenizer(LANG_EN, Resource.fromJAR("openNLP/en-token.bin"));
        loadSentenceSplitter(LANG_DE, Resource.fromJAR("openNLP/de-sent.bin"));
        loadTokenizer(LANG_DE, Resource.fromJAR("openNLP/de-token.bin"));
        try {
            this.languageDetector = LanguageDetectorBuilder.create(NgramExtractors.standard())
                    .withProfiles(new LanguageProfileReader().readAllBuiltIn())
                    .build();
            this.textObjectFactory = CommonTextObjectFactories.forDetectingOnLargeText();
        } catch (IOException e) {
            // Keep the factory usable without language detection, but record why it failed.
            log.error("Could not load language profiles", e);
        }
    }

    /**
     * Loads a sentence detection model and registers it for the given language code.
     *
     * @param str language code the model is registered under
     * @param resource location of the OpenNLP sentence model
     * @throws IllegalStateException if the model cannot be read
     */
    private void loadSentenceSplitter(String str, Resource resource) {
        // The model is read inside the constructor, so the stream can be closed right afterwards.
        try (InputStream modelStream = resource.getInputStream()) {
            this.sentenceSplitter.put(str, new SentenceDetectorMENL(new SentenceModel(modelStream)));
        } catch (IOException e) {
            throw new IllegalStateException("cannot load openNLP model '" + resource.toString() + "': " + e.toString(), e);
        }
    }

    /**
     * Loads a tokenizer model and registers it for the given language code.
     *
     * @param str language code the model is registered under
     * @param resource location of the OpenNLP tokenizer model
     * @throws IllegalStateException if the model cannot be read
     */
    private void loadTokenizer(String str, Resource resource) {
        // The model is read inside the constructor, so the stream can be closed right afterwards.
        try (InputStream modelStream = resource.getInputStream()) {
            this.newlineTokenizer.put(str, new TokenizerMENL(new TokenizerModel(modelStream)));
        } catch (IOException e) {
            throw new IllegalStateException("cannot load openNLP model '" + resource.toString() + "': " + e.toString(), e);
        }
    }

    /**
     * Returns the sentence detector for a language code, falling back to the English one when the
     * code is null or has no model registered.
     */
    private SentenceDetectorME sentenceSplitterFor(String language) {
        SentenceDetectorME fallback = this.sentenceSplitter.get(LANG_EN);
        // TreeMap rejects null keys with a NullPointerException, so null is handled up front.
        return language == null ? fallback : this.sentenceSplitter.getOrDefault(language, fallback);
    }

    /**
     * Returns the tokenizer for a language code, falling back to the English one when the code is
     * null or has no model registered.
     */
    private TokenizerME tokenizerFor(String language) {
        TokenizerMENL fallback = this.newlineTokenizer.get(LANG_EN);
        // TreeMap rejects null keys with a NullPointerException, so null is handled up front.
        return language == null ? fallback : this.newlineTokenizer.getOrDefault(language, fallback);
    }

    /** Static shortcut for {@link #createFromText(String)} on the shared instance. */
    public static Document fromText(String str) {
        return instance.createFromText(str);
    }

    /** Static shortcut for {@link #createFromText(String, Newlines)} on the shared instance. */
    public static Document fromText(String str, Newlines newlines) {
        return instance.createFromText(str, newlines);
    }

    /**
     * Static shortcut for {@link #createFromText(String, Newlines, WordHelpers.Language)} on the
     * shared instance.
     */
    public static Document fromText(String str, Newlines newlines, WordHelpers.Language language) {
        return instance.createFromText(str, newlines, language);
    }

    /**
     * Creates a document from text whose tokens are already separated by spaces.
     *
     * @param str pre-tokenized text
     * @return a new document built from the tokens, with offsets starting at 0
     */
    public static Document fromTokenizedText(String str) {
        return instance.createFromTokens(instance.createTokensFromTokenizedText(str, 0));
    }

    /** Static shortcut for {@link #createFromTokens(List)} on the shared instance. */
    public static Document fromTokens(List<Token> list) {
        return instance.createFromTokens(list);
    }

    /**
     * Tokenizes raw text with offsets starting at 0, using the detected language.
     *
     * @param str raw text
     * @return the tokens of the text
     */
    public static List<Token> createTokensFromText(String str) {
        return instance.createTokensFromText(str, 0);
    }

    /**
     * Creates tokens from space-separated text with offsets starting at 0.
     *
     * @param str pre-tokenized text
     * @return the tokens of the text
     */
    public static List<Token> createTokensFromTokenizedText(String str) {
        return instance.createTokensFromTokenizedText(str, 0);
    }

    /**
     * Creates a single sentence containing all non-empty tokens of the list, in order.
     *
     * @param list tokens to add
     * @return a new sentence
     */
    public static Sentence createSentenceFromTokens(List<Token> list) {
        Sentence sentence = new Sentence();
        for (Token token : list) {
            if (!token.isEmpty()) {
                sentence.addToken(token);
            }
        }
        return sentence;
    }

    /**
     * Tokenizes raw text with the tokenizer of the given language and wraps the tokens in one
     * sentence.
     *
     * @param str raw text
     * @param str2 language code selecting the tokenizer; unknown or null codes use English
     * @return a new sentence
     */
    public static Sentence createSentenceFromString(String str, String str2) {
        return createSentenceFromTokens(instance.createTokensFromText(str, 0, str2));
    }

    /**
     * Creates one sentence from space-separated text.
     *
     * @param str pre-tokenized text
     * @return a new sentence
     */
    public static Sentence createSentenceFromTokenizedString(String str) {
        return createSentenceFromTokens(instance.createTokensFromTokenizedText(str, 0));
    }

    /** Static shortcut for {@link #detectLanguage(String)} on the shared instance. */
    public static String getLanguage(String str) {
        return instance.detectLanguage(str);
    }

    /**
     * Creates a document from raw text, detecting its language and using {@link Newlines#DISCARD}.
     *
     * @param str raw text
     * @return a new document
     */
    public Document createFromText(String str) {
        Document document = new Document();
        addToDocumentFromText(str, document, Newlines.DISCARD);
        return document;
    }

    /**
     * Creates a document from raw text, detecting its language.
     *
     * @param str raw text
     * @param newlines how newline tokens are handled
     * @return a new document
     */
    public Document createFromText(String str, Newlines newlines) {
        Document document = new Document();
        addToDocumentFromText(str, document, newlines);
        return document;
    }

    /**
     * Creates a document from raw text using the models of an explicitly given language.
     *
     * @param str raw text
     * @param newlines how newline tokens are handled
     * @param language language whose lower-cased name selects the models
     * @return a new document
     */
    public Document createFromText(String str, Newlines newlines, WordHelpers.Language language) {
        Document document = new Document();
        // Locale.ROOT keeps the model key stable regardless of the JVM default locale.
        addToDocumentFromText(str, document, newlines, language.toString().toLowerCase(Locale.ROOT));
        return document;
    }

    /**
     * Appends raw text to a document, using the document's language or, if it has none, the
     * detected language of the text. A non-empty detected language is stored on the document.
     *
     * @param str raw text to append
     * @param document document that receives the new sentences
     * @param newlines how newline tokens are handled
     */
    public void addToDocumentFromText(String str, Document document, Newlines newlines) {
        String language = document.getLanguage();
        if (language == null) {
            language = detectLanguage(str);
            if (!language.isEmpty()) {
                document.setLanguage(language);
            }
        }
        addToDocumentFromText(str, document, newlines, language);
    }

    /**
     * Splits raw text into sentences and tokens and appends them to a document.
     *
     * <p>Newline tokens are handled as follows: with {@link Newlines#KEEP} they are added as
     * tokens. With {@link Newlines#DISCARD} they are not added, and every newline after the first
     * one in a run of consecutive newlines is removed from the offsets of the following tokens.
     * With any other value (i.e. null) they are not added and every newline is removed from the
     * offsets.
     *
     * @param str raw text to append
     * @param document document that receives the new sentences
     * @param newlines how newline tokens are handled
     * @param str2 language code selecting the models; unknown or null codes use English
     */
    public void addToDocumentFromText(String str, Document document, Newlines newlines, String str2) {
        // Offsets continue one position past the existing content, or at 0 for an empty document.
        int baseOffset = document.getEnd();
        if (baseOffset > 0) {
            baseOffset++;
        }
        TokenizerME tokenizer = tokenizerFor(str2);
        SentenceDetectorME splitter = sentenceSplitterFor(str2);
        int consecutiveNewlines = 0;
        int removedNewlines = 0;
        for (Span sentenceSpan : splitter.sentPosDetect(str)) {
            if (sentenceSpan == null) {
                continue;
            }
            String sentenceText = str.substring(sentenceSpan.getStart(), sentenceSpan.getEnd());
            List<Token> tokens = new LinkedList<>();
            for (Span tokenSpan : tokenizer.tokenizePos(sentenceText)) {
                String tokenText = sentenceText.substring(tokenSpan.getStart(), tokenSpan.getEnd());
                // Maps sentence-relative positions to document positions, minus removed newlines.
                int shift = (baseOffset - removedNewlines) + sentenceSpan.getStart();
                if (tokenText.equals("\n")) {
                    consecutiveNewlines++;
                    if (newlines == Newlines.KEEP) {
                        tokens.add(new Token(tokenText, shift + tokenSpan.getStart(), shift + tokenSpan.getEnd()));
                    } else if (newlines != Newlines.DISCARD || consecutiveNewlines > 1) {
                        removedNewlines++;
                    }
                } else {
                    tokens.add(new Token(tokenText, shift + tokenSpan.getStart(), shift + tokenSpan.getEnd()));
                    consecutiveNewlines = 0;
                }
            }
            document.addSentence(new Sentence(tokens), false);
        }
    }

    /**
     * Detects the language of a text.
     *
     * @param str text to inspect
     * @return the detected language code, or an empty string if no language could be determined,
     *     the detector is unavailable, or detection failed
     */
    public synchronized String detectLanguage(String str) {
        if (this.languageDetector == null || this.textObjectFactory == null) {
            // Profiles could not be loaded in the constructor; detection is unavailable.
            return "";
        }
        try {
            Optional<LdLocale> detected = this.languageDetector.detect(this.textObjectFactory.forText(str));
            return detected.isPresent() ? detected.get().getLanguage() : "";
        } catch (Exception e) {
            // Detection is best-effort: callers treat an empty string as "unknown language".
            log.debug("Language detection failed", e);
            return "";
        }
    }

    /**
     * Creates a document from existing tokens, splitting them into sentences.
     *
     * <p>The language is detected from the text reconstructed from the tokens and set on the
     * document, even if it is empty.
     *
     * @param list tokens of the document
     * @return a new document
     */
    public Document createFromTokens(List<Token> list) {
        String language = detectLanguage(WordHelpers.tokensToText(list, 0));
        Document document = new Document();
        document.setLanguage(language);
        for (Sentence sentence : createSentencesFromTokens(list, language)) {
            document.addSentence(sentence, false);
        }
        return document;
    }

    /**
     * Groups tokens into sentences, using the language detected from the reconstructed text.
     *
     * @param list tokens to group
     * @return the sentences, in order
     */
    public List<Sentence> createSentencesFromTokens(List<Token> list) {
        return createSentencesFromTokens(list, detectLanguage(WordHelpers.tokensToText(list, 0)));
    }

    /**
     * Groups tokens into sentences.
     *
     * <p>Sentence boundaries are detected on the text reconstructed from the tokens. Each token
     * whose begin offset lies before the end of a sentence span is assigned to that sentence;
     * newline tokens are consumed but not added. Each token is assigned to at most one sentence.
     *
     * @param list tokens to group
     * @param str language code selecting the sentence detector; unknown or null codes use English
     * @return the sentences, in order; empty if there are no tokens
     */
    public List<Sentence> createSentencesFromTokens(List<Token> list, String str) {
        List<Sentence> sentences = new ArrayList<>();
        Span[] sentenceSpans = sentenceSplitterFor(str).sentPosDetect(WordHelpers.tokensToText(list, 0));
        Iterator<Token> tokens = list.iterator();
        if (!tokens.hasNext()) {
            return sentences;
        }
        // null marks an exhausted iterator so the last token is never added to a second sentence.
        Token current = tokens.next();
        for (Span sentenceSpan : sentenceSpans) {
            if (sentenceSpan == null) {
                continue;
            }
            List<Token> sentenceTokens = new ArrayList<>();
            while (current != null && current.getBegin() < sentenceSpan.getEnd()) {
                if (!"\n".equals(current.getText())) {
                    sentenceTokens.add(current);
                }
                current = tokens.hasNext() ? tokens.next() : null;
            }
            sentences.add(new Sentence(sentenceTokens));
        }
        return sentences;
    }

    /**
     * Tokenizes raw text with the tokenizer of the detected language.
     *
     * @param str raw text
     * @param i offset added to every token position
     * @return the tokens of the text
     */
    public List<Token> createTokensFromText(String str, int i) {
        return createTokensFromText(str, i, detectLanguage(str));
    }

    /**
     * Tokenizes raw text with the tokenizer of the given language.
     *
     * @param str raw text
     * @param i offset added to every token position
     * @param str2 language code selecting the tokenizer; unknown or null codes use English
     * @return the tokens of the text
     */
    public List<Token> createTokensFromText(String str, int i, String str2) {
        List<Token> tokens = new LinkedList<>();
        for (Span tokenSpan : tokenizerFor(str2).tokenizePos(str)) {
            String tokenText = str.substring(tokenSpan.getStart(), tokenSpan.getEnd());
            tokens.add(new Token(tokenText, i + tokenSpan.getStart(), i + tokenSpan.getEnd()));
        }
        return tokens;
    }

    /**
     * Creates tokens from text that is already split by spaces and assigns consecutive offsets.
     *
     * <p>Each token starts at the end of the previous one, shifted by one position unless the
     * previous token is in {@code WordHelpers.skipSpaceAfter} or the current token is in
     * {@code WordHelpers.skipSpaceBefore}.
     *
     * <p>NOTE(review): the first token is compared against an empty previous token, so it is also
     * shifted by one unless "" is in {@code skipSpaceAfter} - confirm this is intended.
     *
     * @param str pre-tokenized text
     * @param i offset at which the first token is placed
     * @return the tokens of the text
     */
    public List<Token> createTokensFromTokenizedText(String str, int i) {
        List<Token> tokens = new ArrayList<>();
        int offset = i;
        String previous = "";
        for (String word : WordHelpers.splitSpaces(str)) {
            Token token = new Token(word, offset, offset + word.length());
            boolean spaceBefore = !WordHelpers.skipSpaceAfter.contains(previous)
                    && !WordHelpers.skipSpaceBefore.contains(word);
            if (spaceBefore) {
                token.setBegin(token.getBegin() + 1);
                token.setEnd(token.getEnd() + 1);
            }
            offset = token.getEnd();
            tokens.add(token);
            previous = word;
        }
        return tokens;
    }

    /**
     * Re-applies the document's own text through {@code setText(getText())}.
     *
     * <p>NOTE(review): presumably {@code Document.setText} rebuilds the sentences and tokens from
     * the text - confirm against the Document implementation.
     *
     * @param document document to process
     */
    public void retokenize(Document document) {
        document.setText(document.getText());
    }
}
