package org.pageseeder.diffx.load.text;

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.pageseeder.diffx.config.WhiteSpaceProcessing;
import org.pageseeder.diffx.token.TextToken;
import org.pageseeder.diffx.token.impl.CharactersToken;
import org.pageseeder.diffx.token.impl.IgnorableSpaceToken;

/* loaded from: input_file:org/pageseeder/diffx/load/text/TokenizerByPunctuation.class */
/**
 * A text tokenizer that cuts the text after each run of punctuation marks.
 *
 * <p>Every token produced from the punctuated part of the text ends with one or more of the
 * characters {@code . , ? ! ;}. The segment starts where the previous segment ended, so any
 * white space following a punctuation run is kept at the start of the next token. For example,
 * {@code "Hello. World."} is split into {@code "Hello."} and {@code " World."}.
 *
 * <p>Text remaining after the last punctuation run is emitted as a final token; when that
 * remainder is white space only, the configured {@link WhiteSpaceProcessing} decides whether
 * it is dropped or reported as an {@link IgnorableSpaceToken}.
 */
public final class TokenizerByPunctuation implements TextTokenizer {

    /** The characters treated as punctuation marks (all are literal inside a regex character class). */
    private static final String PUNCTUATION_MARKS = ".,?!;";

    /**
     * Matches a run of one or more consecutive punctuation marks.
     *
     * <p>Compiled once and shared: the expression never changes, and a fresh {@link Matcher}
     * is created for every call to {@link #tokenize(CharSequence)}.
     */
    private static final Pattern PUNCTUATION_RUN = Pattern.compile("[" + PUNCTUATION_MARKS + "]+");

    /** How a trailing white-space-only segment is handled. Never {@code null}. */
    private final WhiteSpaceProcessing whitespace;

    /**
     * Creates a tokenizer using the specified white space processing.
     *
     * @param whiteSpaceProcessing how a trailing white-space-only segment is handled
     * @throws NullPointerException if {@code whiteSpaceProcessing} is {@code null}
     */
    public TokenizerByPunctuation(WhiteSpaceProcessing whiteSpaceProcessing) {
        if (whiteSpaceProcessing == null) {
            throw new NullPointerException("the white space processing must be specified.");
        }
        this.whitespace = whiteSpaceProcessing;
    }

    /**
     * Splits the specified character sequence into tokens ending on punctuation runs.
     *
     * @param charSequence the text to tokenize
     * @return the tokens in document order; an empty list when the sequence is empty
     * @throws NullPointerException if {@code charSequence} is {@code null}
     */
    @Override // org.pageseeder.diffx.load.text.TextTokenizer
    public List<TextToken> tokenize(CharSequence charSequence) {
        if (charSequence == null) {
            throw new NullPointerException("Character sequence is null");
        }
        if (charSequence.length() == 0) {
            return Collections.emptyList();
        }
        // Default capacity: the number of tokens is normally far smaller than the number of characters.
        List<TextToken> tokens = new ArrayList<>();
        Matcher matcher = PUNCTUATION_RUN.matcher(charSequence);
        int start = 0;

        // One token per punctuation run, covering everything since the previous run.
        // A match is never empty, so matcher.end() is always strictly greater than start.
        while (matcher.find()) {
            int end = matcher.end();
            tokens.add(new CharactersToken(charSequence.subSequence(start, end)));
            start = end;
        }

        // Whatever follows the last punctuation run (or the whole text if there was none).
        if (start != charSequence.length()) {
            TextToken remainder = toToken(charSequence.subSequence(start, charSequence.length()), this.whitespace);
            if (remainder != null) {
                tokens.add(remainder);
            }
        }
        return tokens;
    }

    /**
     * Converts the trailing segment of the text into a token.
     *
     * @param charSequence         the segment to convert
     * @param whiteSpaceProcessing how a white-space-only segment is handled
     * @return a {@link CharactersToken} if {@link Tokenizers#isWhitespace} reports the segment is
     *         not white space; otherwise {@code null} when white space is
     *         {@link WhiteSpaceProcessing#IGNORE ignored}, or an {@link IgnorableSpaceToken}
     *         for any other processing mode
     */
    private static TextToken toToken(CharSequence charSequence, WhiteSpaceProcessing whiteSpaceProcessing) {
        if (!Tokenizers.isWhitespace(charSequence)) {
            return new CharactersToken(charSequence);
        }
        if (whiteSpaceProcessing == WhiteSpaceProcessing.IGNORE) {
            return null;
        }
        return new IgnorableSpaceToken(charSequence);
    }

    /**
     * Convenience method that tokenizes text with a new tokenizer instance.
     *
     * @param charSequence         the text to tokenize
     * @param whiteSpaceProcessing how a trailing white-space-only segment is handled
     * @return the tokens in document order; an empty list when the sequence is empty
     * @throws NullPointerException if either argument is {@code null}
     */
    public static List<TextToken> tokenize(CharSequence charSequence, WhiteSpaceProcessing whiteSpaceProcessing) {
        return new TokenizerByPunctuation(whiteSpaceProcessing).tokenize(charSequence);
    }
}
