package com.github.chen0040.data.text;

import java.io.Serializable;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/* loaded from: input_file:com/github/chen0040/data/text/BasicTokenizer.class */
/**
 * Rule-based tokenizer for English-like text.
 *
 * <p>Tokenization happens in two stages:
 * <ol>
 *   <li>The input is normalized by padding separators, commas, quotes and clitics
 *       (for example {@code 's} or {@code n't}) with spaces, then split on whitespace.</li>
 *   <li>Each resulting token that ends in a letter or digit followed by a period has
 *       that period split off as its own token, unless the token is a known
 *       abbreviation or has the shape of one (for example {@code U.S.} or {@code Mr.}).</li>
 * </ol>
 *
 * <p>Instances hold no mutable state; all regular expressions are compiled once and shared.
 */
public class BasicTokenizer implements Tokenizer, Serializable {
    private static final long serialVersionUID = -999803747111655623L;

    private static final String REGEX_LETTER_NUMBER = "[a-zA-Z0-9]";
    private static final String REGEX_NOT_LETTER_NUMBER = "[^a-zA-Z0-9]";
    private static final String REGEX_SEPARATOR = "[\\?!()\";/\\|`]";
    private static final String REGEX_CLITICS = "'|:|-|'S|'D|'M|'LL|'RE|'VE|N'T|'s|'d|'m|'ll|'re|'ve|n't";

    /** Tokens that keep their trailing period even though they look like the end of a sentence. */
    private static final List<String> ABBREVIATIONS = Arrays.asList("Co.", "Corp.", "vs.", "e.g.", "etc.", "ex.", "cf.", "eg.", "Jan.", "Feb.", "Mar.", "Apr.", "Jun.", "Jul.", "Aug.", "Sept.", "Oct.", "Nov.", "Dec.", "jan.", "feb.", "mar.", "apr.", "jun.", "jul.", "aug.", "sept.", "oct.", "nov.", "dec.", "ed.", "eds.", "repr.", "trans.", "vol.", "vols.", "rev.", "est.", "b.", "m.", "bur.", "d.", "r.", "M.", "Dept.", "MM.", "U.", "Mr.", "Jr.", "Ms.", "Mme.", "Mrs.", "Dr.", "Ph.D.");

    // Normalization patterns, applied in exactly this order by normalize().
    private static final Pattern SEPARATOR = Pattern.compile("(" + REGEX_SEPARATOR + ")");
    private static final Pattern COMMA_AFTER_TEXT = Pattern.compile("([^\\s]),");
    private static final Pattern COMMA_BEFORE_TEXT = Pattern.compile(",([^\\s])");
    private static final Pattern LEADING_QUOTE = Pattern.compile("^(')");
    private static final Pattern QUOTE_AFTER_NON_ALNUM = Pattern.compile("(" + REGEX_NOT_LETTER_NUMBER + ")'");
    private static final Pattern TRAILING_CLITIC = Pattern.compile("(" + REGEX_CLITICS + ")$");
    private static final Pattern CLITIC_BEFORE_NON_ALNUM =
            Pattern.compile("(" + REGEX_CLITICS + ")(" + REGEX_NOT_LETTER_NUMBER + ")");
    private static final Pattern WHITESPACE = Pattern.compile("\\s+");

    // Per-token patterns used to decide whether a trailing period is split off.
    private static final Pattern ENDS_WITH_PERIOD = Pattern.compile(".*" + REGEX_LETTER_NUMBER + "\\.");
    private static final Pattern ABBREVIATION_SHAPE =
            Pattern.compile("^([A-Za-z]\\.([A-Za-z]\\.)+|[A-Z][bcdfghj-nptvxz]+\\.)$");

    /** Shared instance backing the static convenience methods; created eagerly so no lazy-init race exists. */
    private static final BasicTokenizer INSTANCE = new BasicTokenizer();

    /**
     * Splits {@code str} into tokens.
     *
     * @param str the text to tokenize; may be {@code null}
     * @return a new mutable list of tokens in order of appearance; empty when {@code str}
     *         is {@code null}, empty, or contains only whitespace
     */
    @Override // com.github.chen0040.data.text.Tokenizer
    public List<String> tokenize(String str) {
        List<String> tokens = new ArrayList<String>();
        if (str == null) {
            return tokens;
        }

        String normalized = normalize(str);
        if (normalized.isEmpty()) {
            // Splitting an empty string would yield a single empty token; report no tokens instead.
            return tokens;
        }

        for (String token : WHITESPACE.split(normalized)) {
            boolean keepWhole = !ENDS_WITH_PERIOD.matcher(token).matches()
                    || ABBREVIATIONS.contains(token)
                    || ABBREVIATION_SHAPE.matcher(token).matches();
            if (keepWhole) {
                tokens.add(token);
            } else {
                // Separate the final period from the word it is attached to.
                int lastIndex = token.length() - 1;
                tokens.add(token.substring(0, lastIndex));
                tokens.add(token.substring(lastIndex));
            }
        }
        return tokens;
    }

    /**
     * Pads separators, commas, quotes and clitics with spaces so that a plain
     * whitespace split yields the individual tokens, then trims the result.
     */
    private static String normalize(String text) {
        String result = text.replace('\t', ' ');
        result = SEPARATOR.matcher(result).replaceAll(" $1 ");
        result = COMMA_AFTER_TEXT.matcher(result).replaceAll("$1 ,");
        result = COMMA_BEFORE_TEXT.matcher(result).replaceAll(" , $1");
        result = LEADING_QUOTE.matcher(result).replaceAll("$1 ");
        result = QUOTE_AFTER_NON_ALNUM.matcher(result).replaceAll("$1 '");
        result = TRAILING_CLITIC.matcher(result).replaceAll(" $1");
        result = CLITIC_BEFORE_NON_ALNUM.matcher(result).replaceAll(" $1 $2");
        return result.trim();
    }

    /** Returns the shared tokenizer instance used by the static helpers. */
    private static BasicTokenizer getTokenizer() {
        return INSTANCE;
    }

    /**
     * Tokenizes a single string with the shared tokenizer.
     *
     * @param str the text to tokenize; may be {@code null}
     * @return the tokens of {@code str}, as described by {@link #tokenize(String)}
     */
    public static List<String> doTokenize(String str) {
        return getTokenizer().tokenize(str);
    }

    /**
     * Tokenizes every string in {@code list} and concatenates the results.
     *
     * @param list the texts to tokenize, processed in iteration order; may be {@code null}
     * @return a new mutable list holding the tokens of each element in order; empty when
     *         {@code list} is {@code null} or empty
     */
    public static List<String> doTokenize(List<String> list) {
        List<String> tokens = new ArrayList<String>();
        if (list == null) {
            return tokens;
        }
        for (String text : list) {
            tokens.addAll(doTokenize(text));
        }
        return tokens;
    }
}
