package de.datexis.common;

import com.google.common.collect.Lists;
import de.datexis.model.Span;
import de.datexis.model.Token;
import java.io.IOException;
import java.io.InputStream;
import java.text.Normalizer;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Set;
import java.util.TreeSet;
import java.util.regex.Pattern;
import org.apache.commons.io.IOUtils;
import org.deeplearning4j.text.tokenization.tokenizer.TokenPreProcess;
import org.nd4j.linalg.api.ndarray.INDArray;
import org.nd4j.linalg.factory.Nd4j;
import org.nd4j.linalg.ops.transforms.Transforms;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/* loaded from: input_file:de/datexis/common/WordHelpers.class */
/**
 * Text helper functions: stop word lookup, detokenization, character
 * normalization, whitespace/punctuation replacement and conversion between
 * vectors and their string form.
 *
 * <p>An instance holds the stop word list of one {@link Language}; all other
 * helpers are static.
 */
public class WordHelpers {
    protected static final Logger log = LoggerFactory.getLogger(WordHelpers.class);

    /** Token texts that are attached to the preceding token (no space is inserted before them). */
    public static HashSet<String> skipSpaceBefore = new HashSet<>(Arrays.asList(
            ",", ".", ":", ";", "?", "!", ")", "]", "'m", "'s", "'re", "'ve", "'d", "'ll", "n't"));

    /**
     * Token texts that are attached to the following token (no space is inserted after them).
     * The empty string stands for "no previous token", i.e. the start of the text.
     */
    public static HashSet<String> skipSpaceAfter = new HashSet<>(Arrays.asList("(", "[", "", "\n"));

    /** Literal (search, replacement) pairs applied in order by {@link #replaceUmlauts(String)}. */
    private static final String[][] umlautReplacements = {
        {"Ä", "Ae"}, {"Ü", "Ue"}, {"Ö", "Oe"},
        {"ä", "ae"}, {"ü", "ue"}, {"ö", "oe"},
        {"ß", "ss"}, {"–", "-"}
    };

    /** Literal (search, replacement) pairs for quote tokens; not referenced inside this class. */
    private static final String[][] tokenizationReplacements = {{"``", "\""}, {"''", "\""}};

    /** One or more characters that are neither word characters, whitespace, '-' nor '_'. */
    public static final Pattern punctPattern = Pattern.compile("[^\\w\\s\\-_]+");
    /** One or more whitespace characters. */
    public static final Pattern spacePattern = Pattern.compile("[\\s]+");
    /** One or more digits. */
    public static final Pattern numericPattern = Pattern.compile("[\\d]+");
    /** A single round bracket, square bracket or double quote. */
    public static final Pattern bracketsPattern = Pattern.compile("[\\(\\)\\[\\]\"]");

    /**
     * English abbreviations (including their trailing period).
     * "Surg" lacked its period in the original list; "Surg." was added and the
     * undotted entry is retained so that existing lookups keep their result.
     */
    public static HashSet<String> abbreviationsEN = new HashSet<>(Arrays.asList(
            "Adj.", "Adm.", "Adv.", "Asst.", "Bart.", "Bldg.", "Brig.", "Bros.", "Capt.", "Cmdr.",
            "Col.", "Comdr.", "Con.", "Corp.", "Cpl.", "DR.", "Dr.", "Drs.", "Ens.", "Gen.",
            "Gov.", "Hon.", "Hr.", "Hosp.", "Insp.", "Lt.", "MM.", "MR.", "MRS.", "MS.",
            "Maj.", "Messrs.", "Mlle.", "Mme.", "Mr.", "Mrs.", "Ms.", "Msgr.", "Op.", "Ord.",
            "Pfc.", "Ph.", "Prof.", "Pvt.", "Rep.", "Reps.", "Res.", "Rev.", "Rt.", "Sen.",
            "Sens.", "Sfc.", "Sgt.", "Sr.", "St.", "Supt.", "Surg", "Surg.", "v.", "vs.",
            "i.e.", "rev.", "e.g.", "No.", "Nr.", "pp."));

    /**
     * German abbreviations (including their trailing period), plus roman numerals
     * and the titles shared with the English list.
     * "Ca" lacked its period in the original list; "Ca." was added and the
     * undotted entry is retained so that existing lookups keep their result.
     */
    public static HashSet<String> abbreviationsDE = new HashSet<>(Arrays.asList(
            "I.", "II.", "III.", "IV.", "V.", "VI.", "VII.", "VIII.", "IX.", "X.",
            "XI.", "XII.", "XIII.", "XIV.", "XV.", "XVI.", "XVII.", "XVIII.", "XIX.", "XX.",
            "i.", "ii.", "iii.", "iv.", "v.", "vi.", "vii.", "viii.", "ix.", "x.",
            "xi.", "xii.", "xiii.", "xiv.", "xv.", "xvi.", "xvii.", "xviii.", "xix.", "xx.",
            "Adj.", "Adm.", "Adv.", "Asst.", "Bart.", "Bldg.", "Brig.", "Bros.", "Capt.", "Cmdr.",
            "Col.", "Comdr.", "Con.", "Corp.", "Cpl.", "DR.", "Dr.", "Ens.", "Gen.", "Gov.",
            "Hon.", "Hosp.", "Insp.", "Lt.", "MM.", "MR.", "MRS.", "MS.", "Maj.", "Messrs.",
            "Mlle.", "Mme.", "Mr.", "Mrs.", "Ms.", "Msgr.", "Op.", "Ord.", "Pfc.", "Ph.",
            "Prof.", "Pvt.", "Rep.", "Reps.", "Res.", "Rev.", "Rt.", "Sen.", "Sens.", "Sfc.",
            "Sgt.", "Sr.", "St.", "Supt.", "Surg.", "Mio.", "Mrd.", "bzw.", "v.", "vs.",
            "usw.", "d.h.", "z.B.", "u.a.", "etc.", "Mrd.", "MwSt.", "ggf.", "d.J.", "D.h.",
            "m.E.", "vgl.", "I.F.", "z.T.", "sogen.", "ff.", "u.E.", "g.U.", "g.g.A.", "c.-à-d.",
            "Buchst.", "u.s.w.", "sog.", "u.ä.", "Std.", "evtl.", "Zt.", "Chr.", "u.U.", "o.ä.",
            "Ltd.", "b.A.", "z.Zt.", "spp.", "sen.", "SA.", "k.o.", "jun.", "i.H.v.", "dgl.",
            "dergl.", "Co.", "zzt.", "usf.", "s.p.a.", "Dkr.", "Corp.", "bzgl.", "BSE.", "No.",
            "Nos.", "Art.", "Nr.", "pp.", "ca.", "Ca", "Ca."));

    /** Sorted stop words of the language this instance was created for. */
    private final Set<String> stopWords;

    /** Languages for which a stop word resource is looked up. */
    public enum Language {
        EN,
        DE
    }

    /**
     * Creates a helper and loads the stop word list of the given language.
     * If the list cannot be read, the helper has an empty stop word list.
     *
     * @param language language whose stop words are loaded
     */
    public WordHelpers(Language language) {
        this.stopWords = new TreeSet<>(readStopWords(language));
    }

    /**
     * Parses a language name, ignoring case and surrounding whitespace.
     *
     * @param str language name, e.g. {@code "en"} or {@code " DE "}
     * @return the matching language, or {@link Language#EN} if {@code str} is
     *         {@code null} or does not name a known language
     */
    public static Language getLanguage(String str) {
        if (str == null) {
            return Language.EN;
        }
        try {
            // Locale.ROOT keeps the conversion independent of the JVM's default locale.
            return Language.valueOf(str.trim().toUpperCase(Locale.ROOT));
        } catch (IllegalArgumentException e) {
            return Language.EN;
        }
    }

    /**
     * Reads the lines of {@code stopwords/stopwords_<language>.csv} as UTF-8.
     *
     * @param language language whose resource is read
     * @return the lines of the resource, or an empty list if reading failed
     */
    private static List<String> readStopWords(Language language) {
        Resource resource = Resource.fromJAR(
                "stopwords/stopwords_" + language.toString().toLowerCase(Locale.ROOT) + ".csv");
        // try-with-resources closes the stream, which the previous implementation leaked.
        try (InputStream in = resource.getInputStream()) {
            return IOUtils.readLines(in, "UTF-8");
        } catch (IOException e) {
            // Pass the exception as last argument so the stack trace is logged as well.
            log.error("Could not read stop words " + e.toString(), e);
            return new ArrayList<>();
        }
    }

    /**
     * @return a new mutable list containing the stop words of this instance
     */
    public List<String> getStopWords() {
        return Lists.newArrayList(this.stopWords);
    }

    /**
     * Checks whether a word is a stop word after applying a preprocessor.
     *
     * @param str word to check
     * @param tokenPreProcess preprocessor applied to {@code str} before the lookup
     * @return {@code true} if the preprocessed, lower-cased word is a stop word
     */
    public boolean isStopWord(String str, TokenPreProcess tokenPreProcess) {
        return isStopWord(tokenPreProcess.preProcess(str));
    }

    /**
     * Checks whether a word is a stop word, ignoring case.
     *
     * @param str word to check; {@code null} is never a stop word
     * @return {@code true} if the lower-cased word is in the stop word list
     */
    public boolean isStopWord(String str) {
        // Locale.ROOT avoids locale-specific case mappings (e.g. Turkish dotless i).
        return str != null && this.stopWords.contains(str.toLowerCase(Locale.ROOT));
    }

    /**
     * Joins token texts into a string using simple spacing rules: a single space
     * is inserted before a token unless the previous token text is in
     * {@link #skipSpaceAfter} or the current token text is in
     * {@link #skipSpaceBefore}. Token offsets are ignored.
     *
     * @param iterable tokens to join, in order
     * @return the joined text with leading and trailing whitespace trimmed
     */
    public static String wordsToText(Iterable<Token> iterable) {
        StringBuilder sb = new StringBuilder();
        String previous = ""; // "" is in skipSpaceAfter, so the first token gets no leading space
        for (Token token : iterable) {
            String current = token.getText();
            if (!skipSpaceAfter.contains(previous) && !skipSpaceBefore.contains(current)) {
                sb.append(" ");
            }
            sb.append(current);
            previous = current;
        }
        return sb.toString().trim();
    }

    /**
     * Rebuilds text from tokens using their offsets: gaps between the running
     * position and a token's begin are filled with spaces, and each token
     * contributes exactly {@code getLength()} characters (its text is truncated
     * or right-padded with spaces if the lengths differ). Empty tokens are skipped.
     *
     * @param iterable tokens to render, in order
     * @param i offset at which the resulting text starts
     * @return the reconstructed text
     */
    public static String tokensToText(Iterable<Token> iterable, int i) {
        StringBuilder sb = new StringBuilder();
        int cursor = i;
        for (Token token : iterable) {
            if (token.isEmpty()) {
                continue;
            }
            if (cursor > token.getBegin()) {
                // Token starts before the current position: separate with one space and resync.
                sb.append(" ");
                cursor = token.getBegin();
            }
            while (cursor < token.getBegin()) {
                sb.append(" ");
                cursor++;
            }
            String text = token.getText();
            int length = token.getLength();
            if (length == text.length()) {
                sb.append(text);
            } else if (length < text.length()) {
                sb.append(text.substring(0, length));
            } else {
                sb.append(text).append(String.join("", Collections.nCopies(length - text.length(), " ")));
            }
            cursor = token.getEnd();
        }
        return sb.toString();
    }

    /**
     * Cosine similarity that is defined for missing and zero vectors.
     *
     * @param iNDArray first vector, may be {@code null}
     * @param iNDArray2 second vector, may be {@code null}
     * @return 0 if either vector is {@code null} or has Euclidean norm 0,
     *         otherwise the result of {@link Transforms#cosineSim(INDArray, INDArray)}
     */
    public static double cosineSim(INDArray iNDArray, INDArray iNDArray2) {
        if (isNullOrZero(iNDArray) || isNullOrZero(iNDArray2)) {
            return 0.0d;
        }
        return Transforms.cosineSim(iNDArray, iNDArray2);
    }

    /**
     * Returns whether the vector is {@code null} or has Euclidean norm 0.
     * The norm is used instead of the maximum element because a non-zero vector
     * such as {@code [-1, 0]} also has a maximum of 0.
     */
    private static boolean isNullOrZero(INDArray vector) {
        return vector == null || vector.norm2Number().doubleValue() == 0.0d;
    }

    /**
     * Serializes a vector as its elements separated by single spaces.
     *
     * @param iNDArray vector to serialize
     * @return the elements in linear index order, formatted as doubles
     */
    public static String vecToString(INDArray iNDArray) {
        StringBuilder sb = new StringBuilder();
        final long length = iNDArray.length();
        for (int i = 0; i < length; i++) {
            if (i > 0) {
                sb.append(" ");
            }
            sb.append(iNDArray.getDouble(i));
        }
        return sb.toString();
    }

    /**
     * Parses a whitespace-separated list of numbers into a float vector.
     * Leading/trailing whitespace and repeated separators are tolerated.
     *
     * @param str numbers separated by whitespace, e.g. the output of {@link #vecToString(INDArray)}
     * @return a vector created from the parsed values
     * @throws NumberFormatException if an element is not a parsable float (including empty input)
     */
    public static INDArray stringToVec(String str) {
        String[] parts = spacePattern.split(str.trim());
        float[] values = new float[parts.length];
        for (int i = 0; i < parts.length; i++) {
            values[i] = Float.parseFloat(parts[i]);
        }
        return Nd4j.create(values);
    }

    /**
     * Normalizes the string to Unicode canonical decomposition (NFD), so accented
     * characters become a base character followed by combining marks.
     *
     * <p>Note: the combining marks are NOT removed; callers that need plain
     * characters have to strip them afterwards.
     *
     * @param str text to normalize
     * @return the NFD-normalized text
     */
    public static String replaceAccents(String str) {
        return Normalizer.normalize(str, Normalizer.Form.NFD);
    }

    /**
     * Replaces German umlauts and sharp s with their ASCII transcriptions
     * (e.g. "ä" becomes "ae", "ß" becomes "ss") and the en dash with a hyphen.
     *
     * @param str text to convert
     * @return the converted text
     */
    public static String replaceUmlauts(String str) {
        String result = str;
        for (String[] pair : umlautReplacements) {
            // All search strings are literals, so no regular expression is needed.
            result = result.replace(pair[0], pair[1]);
        }
        return result;
    }

    /**
     * Replaces every run matched by {@link #punctPattern} with the given replacement.
     *
     * @param str text to process
     * @param str2 replacement, interpreted as a regex replacement string
     * @return the processed text
     */
    public static String replacePunctuation(String str, String str2) {
        return punctPattern.matcher(str).replaceAll(str2);
    }

    /**
     * Replaces every run of digits with the given replacement.
     *
     * @param str text to process
     * @param str2 replacement, interpreted as a regex replacement string
     * @return the processed text
     */
    public static String replaceNumbers(String str, String str2) {
        return numericPattern.matcher(str).replaceAll(str2);
    }

    /**
     * Replaces every run of whitespace with the given replacement.
     *
     * @param str text to process
     * @param str2 replacement, interpreted as a regex replacement string
     * @return the processed text
     */
    public static String replaceSpaces(String str, String str2) {
        return spacePattern.matcher(str).replaceAll(str2);
    }

    /**
     * Splits the text at runs of whitespace.
     *
     * @param str text to split
     * @return the parts between whitespace runs; trailing empty parts are dropped
     */
    public static String[] splitSpaces(String str) {
        // Reuse the compiled pattern instead of recompiling it from its source string.
        return spacePattern.split(str);
    }

    /**
     * Computes how many positions two spans have in common.
     *
     * @param span first span
     * @param span2 second span
     * @return the length of the intersection of the two begin/end ranges, or 0 if they do not overlap
     */
    public static int getSpanOverlapLength(Span span, Span span2) {
        int overlapBegin = Math.max(span.getBegin(), span2.getBegin());
        int overlapEnd = Math.min(span.getEnd(), span2.getEnd());
        return overlapBegin < overlapEnd ? overlapEnd - overlapBegin : 0;
    }
}
