package org.languagetool.tokenizers;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.StringTokenizer;
import java.util.regex.Pattern;
import org.languagetool.tools.StringTools;

/* loaded from: input_file:org/languagetool/tokenizers/WordTokenizer.class */
/**
 * Splits text into tokens at whitespace and punctuation characters, keeping
 * every delimiter as a token of its own, and afterwards glues the pieces of
 * URLs (for the protocols returned by {@link #getProtocols()}) back together
 * into single tokens.
 */
public class WordTokenizer implements Tokenizer {
    /** Protocol names that may introduce a URL (the token directly before "://"). */
    private static final List<String> PROTOCOLS = Collections.unmodifiableList(Arrays.asList("http", "https", "ftp"));

    /*
     * Used only for the very last token of the input while a URL is being collected.
     * NOTE(review): "$-_" inside the character class is parsed by java.util.regex as a
     * range from '$' (U+0024) to '_' (U+005F), so characters such as ':', ';', '<', '=',
     * '>', '@', '[' and ']' match as well. Probably not intended, but kept unchanged
     * here because narrowing it would alter which trailing tokens get joined into a URL.
     */
    private static final Pattern URL_CHARS = Pattern.compile("[a-zA-Z0-9/%$-_.+!*'(),\\?]+");

    /*
     * Characters at which the input is split. All invisible characters are written as
     * Unicode escapes so that the set survives editors and source-encoding changes;
     * only the visible punctuation is kept literal.
     */
    private static final String TOKENIZING_CHARACTERS = "\u0020\u00a0\u115f\u1160\u1680"
            + "\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007"
            + "\u2008\u2009\u200a\u200b\u200c\u200d\u200e\u200f"
            + "\u2028\u2029\u202a\u202b\u202c\u202d\u202e\u202f"
            + "\u205f\u2060\u2061\u2062\u2063\u206a\u206b\u206c\u206d"
            + "\u206e\u206f\u3000\u3164\ufeff\uffa0\ufff9\ufffa\ufffb"
            + ",.;()[]{}<>!?:/|\\\"'«»„”“`´‘’‛′…¿¡\t\n\r";

    /**
     * Returns the protocol names recognized as the start of a URL.
     *
     * @return an unmodifiable list containing "http", "https" and "ftp"
     */
    public static List<String> getProtocols() {
        return PROTOCOLS;
    }

    /**
     * Splits the given text at the tokenizing characters. Delimiters are returned
     * as tokens too, so concatenating the result yields the input again. URL parts
     * are merged by {@link #joinUrls(List)} before the list is returned.
     *
     * @param str the text to tokenize; must not be null
     * @return the tokens in input order
     */
    @Override // org.languagetool.tokenizers.Tokenizer
    public List<String> tokenize(String str) {
        List<String> tokens = new ArrayList<String>();
        StringTokenizer splitter = new StringTokenizer(str, TOKENIZING_CHARACTERS, true);
        while (splitter.hasMoreTokens()) {
            tokens.add(splitter.nextToken());
        }
        return joinUrls(tokens);
    }

    /**
     * Merges consecutive tokens that together form a URL into one token.
     * A URL begins at a protocol token followed by ":", "/", "/" and runs until
     * {@link #urlEndsAt(int, List)} reports its end; the ending token itself is
     * emitted separately. If the input ends while a URL is still being collected,
     * the collected text becomes the last token.
     *
     * @param list tokens as produced by the splitting step
     * @return a new list with URL fragments joined; all other tokens unchanged
     */
    protected List<String> joinUrls(List<String> list) {
        List<String> joined = new ArrayList<String>();
        StringBuilder url = new StringBuilder();
        boolean insideUrl = false;
        for (int i = 0; i < list.size(); i++) {
            String token = list.get(i);
            if (urlStartsAt(i, list)) {
                insideUrl = true;
                url.append(token);
            } else if (insideUrl && urlEndsAt(i, list)) {
                // Flush the collected URL first, then emit the terminating token on its own.
                insideUrl = false;
                joined.add(url.toString());
                url.setLength(0);
                joined.add(token);
            } else if (insideUrl) {
                url.append(token);
            } else {
                joined.add(token);
            }
        }
        if (url.length() > 0) {
            // Input ended in the middle of a URL.
            joined.add(url.toString());
        }
        return joined;
    }

    /**
     * Tells whether the tokens at positions i..i+3 are a protocol name followed
     * by ":", "/" and "/".
     */
    private boolean urlStartsAt(int i, List<String> list) {
        if (!isProtocol(list.get(i)) || list.size() <= i + 3) {
            return false;
        }
        return list.get(i + 1).equals(":") && list.get(i + 2).equals("/") && list.get(i + 3).equals("/");
    }

    /** Tells whether the token is one of the known protocol names (exact match). */
    private boolean isProtocol(String str) {
        return PROTOCOLS.contains(str);
    }

    /**
     * Decides whether the token at position i terminates the URL currently being
     * collected. That is the case when the token is whitespace (as judged by
     * {@code StringTools.isWhitespace}) or ")", when it is the last token and does
     * not match {@code URL_CHARS}, or when it is one of . , ; : ! ? and the next
     * token is whitespace.
     */
    private boolean urlEndsAt(int i, List<String> list) {
        String token = list.get(i);
        if (StringTools.isWhitespace(token) || token.equals(")")) {
            return true;
        }
        boolean isLastToken = list.size() <= i + 1;
        if (isLastToken) {
            return !URL_CHARS.matcher(token).matches();
        }
        if (StringTools.isWhitespace(list.get(i + 1))) {
            // Punctuation directly before whitespace belongs to the sentence, not the URL.
            return token.equals(".") || token.equals(",") || token.equals(";")
                    || token.equals(":") || token.equals("!") || token.equals("?");
        }
        return false;
    }
}
