package com.parc.chat.tokenizer;

import com.parc.chat.tokenizer.LexicalFSA;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
import java.util.Stack;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/* loaded from: input_file:com/parc/chat/tokenizer/MicroTextTokenizer.class */
/**
 * Tokenizer for short, informal text.
 *
 * <p>The text is scanned one character at a time. {@link LexicalFSA} decides the lexical state
 * for each character, and this class reacts to state changes by accumulating characters into a
 * surface-form buffer and a normalized lexeme buffer and by pushing {@link LabeledToken}s onto
 * a stack. Before the state machine handles a character, a set of multi-character recognizers
 * (emoticons, e-mail addresses, URLs, host names, file names) gets a chance to claim the text
 * starting at that character.
 *
 * <p>Instances hold mutable scanning state; nothing in this class synchronizes access to it.
 */
public class MicroTextTokenizer {
    /** Package-visible pattern kept for compatibility; it is not referenced inside this class. */
    static Pattern p = Pattern.compile("^[a-z0]+$");

    /** Lower-case words whose directly following period is folded into the word's token. */
    public static List<String> abbreviationList = Arrays.asList("mr", "mrs", "dr", "ms", "st", "rd", "no");

    private static final String PROTOCOL_LIST = "http|https|mailto|sftp|ftp|smb|htp|htps|smtp|fax|xrxscanwebservice|mailbox|usb|webdav|webdavs";
    private static final String URL_REGEX = "^(((" + PROTOCOL_LIST + "):\\/\\/)\\S+).*";
    private static final Pattern URL_PATTERN = Pattern.compile(URL_REGEX, Pattern.CASE_INSENSITIVE);

    private static final String TOP_LEVEL_DOMAIN_LIST = "com|edu|org|net|gov|mil|co|us";
    private static final String DOMAIN_PART = "([A-Za-z0-9-]+\\.)+(" + TOP_LEVEL_DOMAIN_LIST + ")";
    private static final String FILE_PATH = "([\\w\\d\\.\\/-])+";

    // NOTE(review): the two host-name patterns carry no trailing ".*" and are applied with
    // Matcher.matches(), so they only fire when the host name (plus path) is the ENTIRE
    // remaining text. Kept as-is; confirm whether a prefix match was intended.
    private static final Pattern HOSTNAME_PATTERN =
            Pattern.compile("(" + DOMAIN_PART + ")", Pattern.CASE_INSENSITIVE);
    private static final Pattern HOSTNAME_PATH_PATTERN =
            Pattern.compile("(" + DOMAIN_PART + "\\/" + FILE_PATH + ")", Pattern.CASE_INSENSITIVE);

    private static final String LOCAL_PART = "[^\\.\\s][a-zA-Z0-9!#$%&'*+\\-/=?\\^_`{|}~]*";
    private static final String EMAIL_ADDRESS_REGEX = "^(" + LOCAL_PART + "@" + DOMAIN_PART + ").*";
    // Compiled without CASE_INSENSITIVE, unlike the other recognizers.
    private static final Pattern EMAIL_PATTERN = Pattern.compile(EMAIL_ADDRESS_REGEX);

    private static final String FILE_EXTENSION = "aiff?|au|avi|bat|bmp|class|csv|cvs|dbf|dif|docx?|eps|exe|fm3|gif|hqx|html?|java|jpeg|jpg|mac|map|mdb|mid|midi|mov|mtb|mtw|pdf|png|ppt|pptx|psd|psp|qt|qxd|ra|rtf|sit|tar|tif|txt|wav|xls|xlsx|zip";
    private static final String FILENAME_REGEX = "^(\\S+\\.(" + FILE_EXTENSION + ")).*";
    private static final Pattern FILENAME_PATTERN = Pattern.compile(FILENAME_REGEX, Pattern.CASE_INSENSITIVE);

    /** Stems made only of digits and numeric punctuation are re-labelled NUMERIC. */
    private static final Pattern NUMERIC_STEM_PATTERN = Pattern.compile("^[\\d\\-\\.\\,\\$]+$");
    /** Stems containing at least one ASCII letter stay ALPHA. */
    private static final Pattern ALPHA_STEM_PATTERN = Pattern.compile(".*[A-Za-z].*");

    private final String originalText;
    /** Tokens produced so far; {@code null} until {@link #tokenize()} has been called. */
    private Stack<LabeledToken> tokenStack = null;
    /** Characters of the token in progress exactly as they appear in the text. */
    private StringBuilder surfaceFormBuffer = new StringBuilder();
    /** Normalized (lower-cased) form of the token in progress. */
    private StringBuilder lexemeBuffer = new StringBuilder();
    private int tokenCount = 0;
    private int charPos = 0;
    private int currentTokenPos = 0;

    /**
     * Creates a tokenizer for the given text.
     *
     * @param str the text to tokenize; {@code null} and empty text yield no tokens
     */
    public MicroTextTokenizer(String str) {
        this.originalText = str;
    }

    /**
     * Scans the text and builds the token stack.
     *
     * <p>Every call starts from a clean slate: the token stack, the token counter and both
     * buffers are reset, so repeated calls produce the same result.
     *
     * @return the stack of tokens in text order (bottom = first token); empty for null/empty text
     */
    public Stack<LabeledToken> tokenize() {
        this.tokenStack = new Stack<>();
        this.tokenCount = 0;
        this.currentTokenPos = 0;
        clearBuffers();
        if (this.originalText == null || this.originalText.isEmpty()) {
            return this.tokenStack;
        }
        char previous = 0;
        char lookahead = 0;
        LexicalFSA.State current = LexicalFSA.State.START;
        this.charPos = 0;
        while (this.charPos < this.originalText.length()) {
            char ch = this.originalText.charAt(this.charPos);
            lookahead = getLookahead(this.charPos);
            LexicalFSA.State next = LexicalFSA.getNextState(current, LexicalFSA.getSymbol(ch, previous, lookahead));
            if (next != current) {
                // exitState always hands back the state it was given, so the state after a
                // transition is whatever enterState decides.
                exitState(current, next, ch, previous, lookahead);
                current = enterState(next, current, ch, previous, lookahead);
            } else {
                current = steadyState(current, ch, previous, lookahead);
            }
            previous = ch;
            // The recognizers may have moved charPos forward to the last character they consumed.
            this.charPos++;
        }
        // Flush whatever token is still being accumulated.
        exitState(current, null, (char) 0, previous, lookahead);
        return this.tokenStack;
    }

    /** Returns the text this tokenizer was created with. */
    public String getText() {
        return this.originalText;
    }

    /** Returns the text this tokenizer was created with. */
    public String toString() {
        return this.originalText;
    }

    /**
     * Returns the surface form of every token, in text order.
     *
     * @return a new list holding each token's original word
     * @throws IllegalStateException if {@link #tokenize()} has not been called
     */
    public List<String> getTokensAsList() {
        if (this.tokenStack == null) {
            throw new IllegalStateException("You must call tokenize() before requesting token information.");
        }
        List<String> words = new ArrayList<>(this.tokenStack.size());
        for (LabeledToken token : this.tokenStack) {
            words.add(token.getOriginalWord());
        }
        return words;
    }

    /**
     * Returns the character position recorded for the token at stack position {@code i}.
     *
     * @param i zero-based position of the token in the token stack
     * @return the token's character position in the original text
     * @throws IllegalStateException if {@link #tokenize()} has not been called
     */
    public int getTokenPosition(int i) {
        if (this.tokenStack == null) {
            throw new IllegalStateException("You must call tokenize() before requesting token information.");
        }
        return this.tokenStack.get(i).getCharacterPosition();
    }

    /**
     * Handles the first character of a newly entered state.
     *
     * @param state  the state being entered
     * @param state2 the state being left (unused here)
     * @param c      the current character
     * @param c2     the previous character
     * @param c3     the lookahead character, or 0 at end of text
     * @return the state the scanner is in after this character
     */
    private LexicalFSA.State enterState(LexicalFSA.State state, LexicalFSA.State state2, char c, char c2, char c3) {
        LexicalFSA.State recognized = checkTransitionNetworks(c);
        if (recognized != null) {
            return recognized;
        }
        switch (state) {
            case ON_DOT:
                // A single period right after a known abbreviation becomes part of that token.
                // Only the top of the stack is inspected, so one token is enough.
                if (c3 != '.' && !this.tokenStack.isEmpty() && isAbbreviation(this.tokenStack.peek().getStem())) {
                    LabeledToken abbreviation = this.tokenStack.peek();
                    abbreviation.setOriginalWord(abbreviation.getOriginalWord() + ".");
                    abbreviation.setStem(abbreviation.getStem() + ".");
                    break;
                }
                this.currentTokenPos = this.charPos;
                appendSymbol(state, c, c2, c3);
                break;
            case ON_PUNCT:
                // Punctuation is a complete token on its own.
                this.currentTokenPos = this.charPos;
                appendSymbol(state, c, c2, c3);
                saveToken(TokenType.PUNCT);
                state = LexicalFSA.State.BETWEEN_TOKENS;
                break;
            case ON_HYPHEN:
                // So is a hyphen.
                this.currentTokenPos = this.charPos;
                appendSymbol(state, c, c2, c3);
                saveToken(TokenType.HYPHEN);
                state = LexicalFSA.State.BETWEEN_TOKENS;
                break;
            case IN_AT_NAME:
            case IN_HASH_TAG:
            case ON_ELLIPSIS:
                // Continue the token already started; its start position is left untouched.
                appendSymbol(state, c, c2, c3);
                break;
            case BETWEEN_TOKENS:
                if (c2 == '.') {
                    resolveAbbreviationPeriods();
                }
                break;
            case ERROR:
                break;
            default:
                this.currentTokenPos = this.charPos;
                appendSymbol(state, c, c2, c3);
                break;
        }
        return state;
    }

    /**
     * Handles a character that keeps the scanner in its current state.
     *
     * @return the state after this character (BETWEEN_TOKENS if a recognizer claimed the text)
     */
    private LexicalFSA.State steadyState(LexicalFSA.State state, char c, char c2, char c3) {
        LexicalFSA.State recognized = checkTransitionNetworks(c);
        if (recognized != null) {
            return recognized;
        }
        if (state != LexicalFSA.State.BETWEEN_TOKENS) {
            appendSymbol(state, c, c2, c3);
        }
        return state;
    }

    /**
     * Finishes the token belonging to the state being left.
     *
     * @param state  the state being left
     * @param state2 the state about to be entered, or {@code null} at end of text
     * @return {@code state}, unchanged
     */
    private LexicalFSA.State exitState(LexicalFSA.State state, LexicalFSA.State state2, char c, char c2, char c3) {
        switch (state) {
            case START:
                break;
            case ON_DOT:
                // Nothing is saved when an ellipsis or a digit follows, so the buffered period
                // carries over into the next token.
                if (state2 != LexicalFSA.State.ON_ELLIPSIS && state2 != LexicalFSA.State.ON_DIGIT) {
                    saveToken(TokenType.PUNCT);
                }
                break;
            case ON_ELLIPSIS:
                saveToken(TokenType.PUNCT);
                break;
            case ON_AT:
                // A lone '@' is punctuation; otherwise it stays buffered as the start of an @name.
                if (state2 != LexicalFSA.State.IN_AT_NAME) {
                    saveToken(TokenType.PUNCT);
                }
                break;
            case ON_HASH:
                // Same for '#' and hash tags.
                if (state2 != LexicalFSA.State.IN_HASH_TAG) {
                    saveToken(TokenType.PUNCT);
                }
                break;
            case ON_HYPHEN:
                saveToken(TokenType.HYPHEN);
                break;
            case ON_DIGIT:
                saveToken(TokenType.NUMERIC);
                break;
            case IN_AT_NAME:
                saveToken(TokenType.AT_NAME);
                break;
            case IN_HASH_TAG:
                saveToken(TokenType.HASH_TAG);
                break;
            case URL:
                saveToken(TokenType.URL);
                break;
            default:
                // Includes ON_PUNCT, BETWEEN_TOKENS and ERROR; saveToken refines the ALPHA label.
                saveToken(TokenType.ALPHA);
                break;
        }
        return state;
    }

    /**
     * Tries the multi-character recognizers on the text starting at the current position, in
     * this order: emoticon, e-mail address, URL with protocol, host name with path, bare host
     * name, file name.
     *
     * <p>On success the recognized span is saved as one token and {@code charPos} is moved to
     * the span's last character, so the main loop's increment lands on the character that
     * follows it.
     *
     * @param c the character at the current position
     * @return BETWEEN_TOKENS if a recognizer claimed the text, otherwise {@code null}
     */
    private LexicalFSA.State checkTransitionNetworks(char c) {
        String remaining = this.originalText.substring(this.charPos);

        if (EmoticonFST.isInitialEmoticonChar(c)) {
            int emoticonLength = EmoticonFST.recognizeEmoticon(remaining);
            if (emoticonLength > 0) {
                // Emoticons are case-sensitive, so the lexeme is not lower-cased.
                String emoticon = remaining.substring(0, emoticonLength);
                return acceptSpan(emoticon, emoticon, TokenType.EMOTICON);
            }
        }

        Matcher email = EMAIL_PATTERN.matcher(remaining);
        if (email.matches()) {
            String address = email.group(1);
            return acceptSpan(address, address.toLowerCase(), TokenType.EMAIL_ADDR);
        }

        Matcher url = URL_PATTERN.matcher(remaining);
        if (url.matches()) {
            String address = stripTrailingDot(url.group(1));
            return acceptSpan(address, address.toLowerCase(), TokenType.URL);
        }

        Matcher hostWithPath = HOSTNAME_PATH_PATTERN.matcher(remaining);
        if (hostWithPath.matches()) {
            String address = stripTrailingDot(hostWithPath.group(1));
            return acceptSpan(address, address.toLowerCase(), TokenType.URL);
        }

        Matcher host = HOSTNAME_PATTERN.matcher(remaining);
        if (host.matches()) {
            String address = host.group(1);
            return acceptSpan(address, address.toLowerCase(), TokenType.URL);
        }

        Matcher filename = FILENAME_PATTERN.matcher(remaining);
        if (filename.matches()) {
            String name = filename.group(1);
            return acceptSpan(name, name.toLowerCase(), TokenType.FILENAME);
        }
        return null;
    }

    /** Removes one trailing period, which is taken to be sentence punctuation. */
    private static String stripTrailingDot(String text) {
        return text.endsWith(".") ? text.substring(0, text.length() - 1) : text;
    }

    /**
     * Saves a recognized span as a token and positions the scanner on the span's last character.
     *
     * @param surface the span exactly as it appears in the text (non-empty)
     * @param lexeme  the normalized form stored as the token's stem
     * @param type    the token type to assign
     * @return BETWEEN_TOKENS
     */
    private LexicalFSA.State acceptSpan(String surface, String lexeme, TokenType type) {
        this.lexemeBuffer.append(lexeme);
        this.surfaceFormBuffer.append(surface);
        this.currentTokenPos = this.charPos;
        saveToken(type);
        // The main loop adds one more, so stop on the last consumed character. Advancing by
        // the full length would silently drop the character that follows the span.
        this.charPos += surface.length() - 1;
        return LexicalFSA.State.BETWEEN_TOKENS;
    }

    /**
     * Collapses a trailing run of (single-character token, ".") pairs on the token stack, such
     * as {@code U . S .}, into one ALPHA token ({@code U.S.}).
     *
     * <p>Tokens that are not part of such a run are left on the stack untouched. The merged
     * token takes its character position and index from the first token of the run.
     */
    private void resolveAbbreviationPeriods() {
        int end = this.tokenStack.size();
        int start = end;
        // Walk down from the top, two tokens at a time, without removing anything yet.
        while (start >= 2
                && isPeriodToken(this.tokenStack.get(start - 1))
                && this.tokenStack.get(start - 2).getOriginalWord().length() == 1) {
            start -= 2;
        }
        if (start == end) {
            return;
        }

        LabeledToken first = this.tokenStack.get(start);
        StringBuilder surface = new StringBuilder();
        StringBuilder stem = new StringBuilder();
        for (int i = start; i < end; i++) {
            LabeledToken part = this.tokenStack.get(i);
            surface.append(part.getOriginalWord());
            stem.append(part.getStem());
        }

        LabeledToken merged = new LabeledToken();
        merged.setCharacterPosition(first.getCharacterPosition());
        merged.setIndex(first.getIndex());
        merged.setOriginalWord(surface.toString());
        merged.setStem(stem.toString());
        merged.setTokenType(TokenType.ALPHA);

        this.tokenStack.subList(start, end).clear();
        this.tokenStack.push(merged);
    }

    /** Tells whether the token's surface form is exactly one period. */
    private static boolean isPeriodToken(LabeledToken token) {
        return ".".equals(token.getOriginalWord());
    }

    /** Tells whether {@code str}, ignoring case, is listed in {@link #abbreviationList}. */
    private static boolean isAbbreviation(String str) {
        return abbreviationList.contains(str.toLowerCase());
    }

    /**
     * Adds the current character to the token in progress.
     *
     * <p>The surface form always receives the character unchanged. The lexeme receives its
     * lower-case form, with two exceptions inside a word: an apostrophe is dropped unless the
     * next character is {@code 's'}, and the digit {@code '0'} is stored as the letter
     * {@code 'o'}.
     */
    private void appendSymbol(LexicalFSA.State state, char c, char c2, char c3) {
        this.surfaceFormBuffer.append(c);
        boolean inWord = state == LexicalFSA.State.IN_WORD;
        if (inWord && c == '\'' && c3 != 's') {
            return;
        }
        if (inWord && c == '0') {
            this.lexemeBuffer.append('o');
        } else {
            this.lexemeBuffer.append(Character.toLowerCase(c));
        }
    }

    /** Starts both token buffers afresh. */
    private void clearBuffers() {
        this.surfaceFormBuffer = new StringBuilder();
        this.lexemeBuffer = new StringBuilder();
    }

    /** Returns the character after position {@code i}, or 0 when there is none. */
    private char getLookahead(int i) {
        int next = i + 1;
        return next < this.originalText.length() ? this.originalText.charAt(next) : (char) 0;
    }

    /**
     * Turns the buffered characters into a token and pushes it onto the stack.
     *
     * <p>Does nothing when the lexeme buffer is empty; in that case the buffers are left as
     * they are. A requested type of ALPHA is refined from the stem: purely numeric stems become
     * NUMERIC, stems with at least one ASCII letter stay ALPHA, everything else becomes PUNCT.
     * AT_NAME tokens get the fixed stem {@code "ATNAME"}.
     *
     * @param tokenType the type suggested by the state being left
     */
    private void saveToken(TokenType tokenType) {
        String lexeme = this.lexemeBuffer.toString();
        if (lexeme.isEmpty()) {
            return;
        }
        LabeledToken token = new LabeledToken();
        token.setStem(tokenType == TokenType.AT_NAME ? "ATNAME" : lexeme);
        token.setOriginalWord(this.surfaceFormBuffer.toString());
        token.setCharacterPosition(this.currentTokenPos);
        if (tokenType != TokenType.ALPHA) {
            token.setTokenType(tokenType);
        } else if (NUMERIC_STEM_PATTERN.matcher(token.getStem()).matches()) {
            token.setTokenType(TokenType.NUMERIC);
        } else if (ALPHA_STEM_PATTERN.matcher(token.getStem()).matches()) {
            token.setTokenType(TokenType.ALPHA);
        } else {
            token.setTokenType(TokenType.PUNCT);
        }
        this.tokenCount++;
        token.setIndex(this.tokenCount);
        this.tokenStack.push(token);
        clearBuffers();
    }
}
