package webcorp.tokens;

import com.google.re2j.Pattern;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

/* loaded from: input_file:webcorp/tokens/JFlexTokenizer.class */
/**
 * Tokenizer that runs a generated scanner ({@code TokenScannerDE} or
 * {@code TokenScannerEN}) over the input and then post-processes the raw
 * token stream: periods and hyphens are attached to the preceding token where
 * the language-specific macro patterns make that plausible, boundary flags
 * are set and repaired around paired marks and parentheses, and some
 * apostrophe and hyphen splits are merged back together.
 *
 * <p>The macro patterns are loaded from the classpath resource
 * {@code <language>_token_macros.txt}.
 */
public class JFlexTokenizer implements TokenizerInterface {
    /** Classification result: ordinary word. */
    public static final int WORD = 0;
    /** Classification result: number. */
    public static final int NUMBER = 1;
    /** Classification result: token matching the "Abbrev" macro. */
    public static final int ABBREV = 2;
    /** Classification result: token matching the "WeakAbbrev" macro. */
    public static final int WEAK_ABBREV = 3;

    /**
     * Token type code that {@link #classifyToken(Token)} maps to {@link #NUMBER}.
     * NOTE(review): the symbolic name used by {@code Token} is not visible here.
     */
    private static final int NUMERIC_TOKEN_TYPE = 1;
    /**
     * Token type code after which the following token receives {@link #BOUNDARY_FLAG}.
     * NOTE(review): presumably sentence-final punctuation - confirm against Token.
     */
    private static final int BOUNDARY_TRIGGER_TYPE = 2;
    /**
     * Token type code handled as an alternating open/close pair in
     * {@link #fixSentBoundaries(List)}.
     * NOTE(review): presumably quotation marks - confirm against Token.
     */
    private static final int PAIRED_MARK_TYPE = 4;
    /**
     * Flag bit moved around by {@link #fixSentBoundaries(List)}.
     * NOTE(review): presumably "sentence starts here" - confirm against Token.
     */
    private static final int BOUNDARY_FLAG = 8;

    private final Pattern _abbrev_re;
    private final Pattern _ord_noun_re;
    private final Pattern _ord_gen_re;
    private final Pattern _lc_name_re;
    private final Pattern _pre_ord_re;
    private final Pattern _weak_abbrev_re;
    private final Pattern _sent_start_re;
    private final Pattern _konj_re;

    /** Language code this tokenizer was created for; selects the macro file and the scanner. */
    public final String language;

    /**
     * Reads a macro file and compiles one alternation pattern per macro.
     *
     * <p>A line starting with {@code ">>> "} or {@code ">*> "} opens a new
     * macro named by the rest of the line. Every following non-empty line
     * that does not start with {@code "##"} becomes one alternative of that
     * macro; the alternatives are joined with {@code '|'}. Lines before the
     * first header are ignored.
     *
     * @param bufferedReader source of the macro definitions; not closed here
     * @return map from macro name to compiled pattern
     * @throws IOException if reading fails or a macro has no alternatives
     */
    private static Map<String, Pattern> compilePatterns(BufferedReader bufferedReader) throws IOException {
        Map<String, Pattern> patterns = new HashMap<>();
        StringBuilder alternatives = new StringBuilder();
        String macroName = null;
        String line;
        while ((line = bufferedReader.readLine()) != null) {
            if (line.startsWith(">>> ") || line.startsWith(">*> ")) {
                if (macroName != null) {
                    patterns.put(macroName, compileMacro(macroName, alternatives));
                }
                // Also discards any lines collected before the first header.
                alternatives.setLength(0);
                macroName = line.substring(4);
            } else if (!line.isEmpty() && !line.startsWith("##")) {
                if (alternatives.length() > 0) {
                    alternatives.append('|');
                }
                alternatives.append(line);
            }
        }
        if (macroName != null) {
            patterns.put(macroName, compileMacro(macroName, alternatives));
        }
        return patterns;
    }

    /**
     * Compiles the collected alternatives of one macro.
     *
     * @throws IOException if the macro has no alternatives; reported explicitly
     *         instead of failing with an index error on the empty text
     */
    private static Pattern compileMacro(String macroName, StringBuilder alternatives) throws IOException {
        if (alternatives.length() == 0) {
            throw new IOException("Macro '" + macroName + "' has no pattern lines");
        }
        return Pattern.compile(alternatives.toString());
    }

    /**
     * Creates a tokenizer for the given language by loading
     * {@code <str>_token_macros.txt}.
     *
     * <p>Macros absent from the file leave the corresponding pattern
     * {@code null}; this is not rejected here because not every macro is used
     * for every language.
     *
     * @param str language code, used as resource prefix and stored in {@link #language}
     * @throws RuntimeException wrapping the {@link IOException} if the macros cannot be loaded
     */
    public JFlexTokenizer(String str) {
        Map<String, Pattern> macros;
        // try-with-resources: the reader was previously never closed.
        try (BufferedReader reader = Utils.openResourceIn(JFlexTokenizer.class, str + "_token_macros.txt", "UTF-8")) {
            macros = compilePatterns(reader);
        } catch (IOException e) {
            throw new RuntimeException("Cannot load patterns", e);
        }
        this._abbrev_re = macros.get("Abbrev");
        this._lc_name_re = macros.get("LCName");
        this._ord_noun_re = macros.get("OrdNoun");
        this._ord_gen_re = macros.get("OrdGen");
        this._pre_ord_re = macros.get("PreOrd");
        this._weak_abbrev_re = macros.get("WeakAbbrev");
        this._sent_start_re = macros.get("SentStart");
        this._konj_re = macros.get("Conj");
        this.language = str;
    }

    /**
     * Classifies a token as {@link #NUMBER}, {@link #ABBREV},
     * {@link #WEAK_ABBREV} or {@link #WORD}, checked in that order.
     *
     * @param token token to classify
     * @return one of the four classification constants
     */
    public int classifyToken(Token token) {
        if (token.hasType(NUMERIC_TOKEN_TYPE)) {
            return NUMBER;
        }
        if (this._abbrev_re.matches(token.value)) {
            return ABBREV;
        }
        return this._weak_abbrev_re.matches(token.value) ? WEAK_ABBREV : WORD;
    }

    /** Returns true if {@code str} starts with a lower-case character and does not match the "LCName" macro. */
    private boolean isLower(String str) {
        return Character.isLowerCase(str.charAt(0)) && !this._lc_name_re.matches(str);
    }

    /**
     * Decides whether the period at index {@code i} should be attached to the
     * number before it. Always false unless the language is {@code "de"}.
     *
     * @param i index of the period token; callers pass {@code i >= 1}
     * @param list the token list being processed
     */
    private boolean plausibleOrdinal(int i, List<Token> list) {
        if (!"de".equals(this.language)) {
            return false;
        }
        String following;
        boolean followingIsNumber;
        if (i < list.size() - 1) {
            Token next = list.get(i + 1);
            following = next.value;
            followingIsNumber = next.hasType(NUMERIC_TOKEN_TYPE);
        } else {
            following = "*END*";
            followingIsNumber = false;
        }
        String number = list.get(i - 1).value;
        int firstCodePoint = number.codePointAt(0);
        boolean startsWithAsciiDigit = firstCodePoint >= '0' && firstCodePoint <= '9';
        String beforeNumber = i >= 2 ? list.get(i - 2).value : "*BEGIN*";

        if ("?!:,/".contains(following)
                || this._ord_noun_re.matches(following)
                || this._pre_ord_re.matches(beforeNumber)) {
            return true;
        }
        if (startsWithAsciiDigit && number.length() > 2) {
            // Digit strings longer than two characters only qualify when
            // followed by something matching the "OrdGen" macro.
            return this._ord_gen_re.matches(following);
        }
        return isLower(following)
                || followingIsNumber
                || ")".equals(following)
                // beforeNumber ending in "." implies i >= 2, so get(i - 2) is safe.
                || (beforeNumber.endsWith(".")
                        && list.get(i - 2).hasType(NUMERIC_TOKEN_TYPE)
                        && !this._sent_start_re.matches(following));
    }

    /**
     * Decides whether the period at index {@code i} should be attached to a
     * weak abbreviation: true at the end of the list, or when the next token
     * is lower-case or does not match the "SentStart" macro.
     */
    private boolean plausibleAbbrev(int i, List<Token> list) {
        if (i >= list.size() - 1) {
            return true;
        }
        String following = list.get(i + 1).value;
        return isLower(following) || !this._sent_start_re.matches(following);
    }

    /**
     * Decides whether the hyphen at index {@code i} should be attached to the
     * preceding token: true only when the next token matches the "Conj" macro
     * or is a comma.
     */
    private boolean plausibleTrunc(int i, List<Token> list) {
        if (i >= list.size() - 1) {
            return false;
        }
        String following = list.get(i + 1).value;
        return this._konj_re.matches(following) || ",".equals(following);
    }

    /**
     * Repairs boundary flags around paired marks and parentheses.
     *
     * <p>A flag on the closing token of a pair of {@code PAIRED_MARK_TYPE}
     * tokens is moved to the following token. A flag on a closing parenthesis
     * is removed and, depending on the following token, either moved after
     * the parenthesis or (for short parentheticals) flags following {@code "!"}
     * inside it are cleared. Flags following {@code ":"} inside a
     * parenthetical are always cleared.
     *
     * @param list tokens to adjust in place; the list size is not changed
     */
    void fixSentBoundaries(List<Token> list) {
        int openMark = -1;
        int openParen = -1;
        for (int pos = 0; pos < list.size(); pos++) {
            Token token = list.get(pos);
            boolean hasNext = pos < list.size() - 1;
            if (token.hasType(PAIRED_MARK_TYPE)) {
                if (openMark == -1) {
                    openMark = pos;
                } else {
                    if (token.hasFlag(BOUNDARY_FLAG)) {
                        token.removeFlag(BOUNDARY_FLAG);
                        if (hasNext) {
                            list.get(pos + 1).addFlag(BOUNDARY_FLAG);
                        }
                    }
                    openMark = -1;
                }
            } else if ("(".equals(token.value)) {
                openParen = pos;
            } else if (")".equals(token.value) && openParen >= 0) {
                if (token.hasFlag(BOUNDARY_FLAG)) {
                    token.removeFlag(BOUNDARY_FLAG);
                    if (hasNext) {
                        Token next = list.get(pos + 1);
                        if (!isLower(next.value) && !",".equals(next.value)) {
                            next.addFlag(BOUNDARY_FLAG);
                        } else if (pos - openParen < 12) {
                            clearBoundaryAfter(list, openParen, pos, "!");
                        }
                    }
                }
                boolean clearedAfterColon = clearBoundaryAfter(list, openParen, pos, ":");
                if (clearedAfterColon && hasNext && this._sent_start_re.matches(list.get(pos + 1).value)) {
                    list.get(pos + 1).addFlag(BOUNDARY_FLAG);
                }
                openParen = -1;
            }
        }
    }

    /**
     * Removes the boundary flag from every token that directly follows a
     * {@code mark} token located strictly between {@code open} and {@code close}.
     *
     * @return true if at least one flag was removed
     */
    private static boolean clearBoundaryAfter(List<Token> list, int open, int close, String mark) {
        boolean cleared = false;
        for (int k = open + 1; k < close; k++) {
            Token after = list.get(k + 1);
            if (after.hasFlag(BOUNDARY_FLAG) && mark.equals(list.get(k).value)) {
                after.removeFlag(BOUNDARY_FLAG);
                cleared = true;
            }
        }
        return cleared;
    }

    /**
     * Tokenizes {@code str} with the scanner for {@link #language} and applies
     * the post-processing passes.
     *
     * @param str text to tokenize
     * @param i not used by this implementation
     * @return the processed tokens; empty if the scanner produced none
     * @throws IllegalStateException if there is no scanner for the language
     * @throws RuntimeException wrapping an {@link IOException} raised by the scanner
     */
    @Override // webcorp.tokens.TokenizerInterface
    public List<Token> tokenize(String str, int i) {
        List<Token> tokens = scan(str);
        if (tokens.isEmpty()) {
            return tokens;
        }
        attachPeriodsAndHyphens(tokens);
        for (int pos = 0; pos < tokens.size() - 1; pos++) {
            if (tokens.get(pos).hasType(BOUNDARY_TRIGGER_TYPE)) {
                tokens.get(pos + 1).flags |= BOUNDARY_FLAG;
            }
        }
        fixSentBoundaries(tokens);
        if ("de".equals(this.language)) {
            attachApostrophes(tokens);
        }
        joinHyphenated(tokens);
        return tokens;
    }

    /** Runs the language-specific scanner to exhaustion and collects its tokens. */
    private List<Token> scan(String text) {
        TokenScanner scanner;
        if ("de".equals(this.language)) {
            scanner = new TokenScannerDE(new StringReader(text));
        } else if ("en".equals(this.language)) {
            scanner = new TokenScannerEN(new StringReader(text));
        } else {
            throw new IllegalStateException("No scanner for language:" + this.language);
        }
        List<Token> tokens = new ArrayList<>();
        try {
            for (Token token = scanner.yylex(); token != null; token = scanner.yylex()) {
                tokens.add(token);
            }
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
        return tokens;
    }

    /** Appends the token at {@code index} to {@code target} and removes it from the list. */
    private static void absorb(List<Token> tokens, Token target, int index) {
        Token absorbed = tokens.remove(index);
        target.value += absorbed.value;
        target.end = absorbed.end;
    }

    /** Attaches directly adjacent {@code "."} and {@code "-"} tokens to the preceding token where plausible. */
    private void attachPeriodsAndHyphens(List<Token> tokens) {
        Token previous = tokens.get(0);
        int pos = 1;
        while (pos < tokens.size()) {
            if (attachesToPrevious(previous, pos, tokens)) {
                // The list shrinks, so the same index now holds the next candidate.
                absorb(tokens, previous, pos);
            } else {
                previous = tokens.get(pos);
                pos++;
            }
        }
    }

    /** Decides whether the token at {@code pos} is a period or hyphen that belongs to {@code previous}. */
    private boolean attachesToPrevious(Token previous, int pos, List<Token> tokens) {
        Token current = tokens.get(pos);
        if (previous.end != current.start) {
            return false;
        }
        if (".".equals(current.value)) {
            // NOTE(review): ABBREV has no case here, so a period after a strong
            // abbreviation is never attached by this pass. The decompiler
            // reported that it could not fully recover this switch, so a case
            // may have been lost - confirm against the original sources.
            switch (classifyToken(previous)) {
                case NUMBER:
                    return plausibleOrdinal(pos, tokens);
                case WEAK_ABBREV:
                    return plausibleAbbrev(pos, tokens);
                default:
                    return false;
            }
        }
        if ("-".equals(current.value)) {
            return plausibleTrunc(pos, tokens);
        }
        return false;
    }

    /**
     * Re-attaches directly adjacent apostrophe tokens: a bare {@code '} after
     * a token ending in b, t or s, and {@code 's}/{@code 'S} after a token
     * starting with an upper-case character. Only invoked for {@code "de"}.
     */
    private void attachApostrophes(List<Token> tokens) {
        Token previous = tokens.get(0);
        int pos = 1;
        while (pos < tokens.size()) {
            Token current = tokens.get(pos);
            boolean attach = false;
            if (previous.end == current.start && current.value.charAt(0) == '\'') {
                if (current.value.length() == 1) {
                    char last = previous.value.charAt(previous.value.length() - 1);
                    attach = last == 'b' || last == 't' || last == 's';
                } else if (current.value.length() == 2 && Character.toLowerCase(current.value.charAt(1)) == 's') {
                    attach = Character.isUpperCase(previous.value.charAt(0));
                }
            }
            if (attach) {
                absorb(tokens, previous, pos);
            } else {
                previous = current;
                pos++;
            }
        }
    }

    /**
     * Joins a token longer than three characters that ends in {@code "-"} with
     * the following token when that token starts with a letter and does not
     * match the "Conj" macro.
     */
    private void joinHyphenated(List<Token> tokens) {
        Token previous = tokens.get(0);
        int pos = 1;
        while (pos < tokens.size()) {
            Token current = tokens.get(pos);
            boolean join = previous.value.length() > 3
                    && previous.value.endsWith("-")
                    && Character.isLetter(current.value.charAt(0))
                    && !this._konj_re.matches(current.value);
            if (join) {
                absorb(tokens, previous, pos);
            } else {
                previous = current;
                pos++;
            }
        }
    }

    /** Demo entry point: prints the raw English scanner tokens of a sample sentence to stderr. */
    public static void main(String[] strArr) {
        TokenScannerEN scanner = new TokenScannerEN(new StringReader("He's a Ph.D. from the U.S."));
        try {
            for (Token token = scanner.yylex(); token != null; token = scanner.yylex()) {
                System.err.println("TOK:" + token.value);
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
