package org.pageseeder.diffx.load.text;

import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.pageseeder.diffx.config.WhiteSpaceProcessing;
import org.pageseeder.diffx.token.TextToken;
import org.pageseeder.diffx.token.impl.IgnorableSpaceToken;
import org.pageseeder.diffx.token.impl.SpaceToken;
import org.pageseeder.diffx.token.impl.WordToken;

/* loaded from: input_file:org/pageseeder/diffx/load/text/TokenizerBySpaceWord.class */
/**
 * Tokenizer that splits text into word tokens and the whitespace runs found between them.
 *
 * <p>Words are located with {@link #WORD_PATTERN}; a word may include one leading space
 * character. Any text found between two consecutive matches, or after the last match, is
 * treated as whitespace and handled according to the {@link WhiteSpaceProcessing} mode
 * supplied at construction.
 *
 * <p>Each instance keeps a cache of the tokens it has produced, keyed by their text, so
 * that identical strings yield the same token instance. The cache is a plain
 * {@link HashMap} and this class performs no synchronization of its own.
 */
public final class TokenizerBySpaceWord implements TextTokenizer {

    /**
     * Pattern matching a single word; alternatives are tried in this order:
     * <ol>
     *   <li>an optional space, then any run of {@code [A-Za-z0-9_'@/$.-]} ending with a
     *       character in {@code [A-Za-z0-9_%]};</li>
     *   <li>any single non-whitespace character;</li>
     *   <li>an optional space, then a {@code "} or {@code (}, one or more characters that
     *       are not a space, tab, CR, LF, form feed, quote or parenthesis, then a closing
     *       {@code "} or {@code )}.</li>
     * </ol>
     * Compiled once because the expression never changes between calls.
     */
    private static final Pattern WORD_PATTERN =
            Pattern.compile("( ?[A-Za-z0-9_'@/$.-]*[A-Za-z0-9_%])|(\\S)|( ?[\"(][^ \\t\\r\\n\\f'\"()]+[\")])");

    /** Tokens already created by this instance, keyed by their text, so they can be reused. */
    private final Map<String, TextToken> recycling = new HashMap<>();

    /** How whitespace found between words is turned into tokens. */
    private final WhiteSpaceProcessing whitespace;

    /**
     * Creates a tokenizer using the specified whitespace processing mode.
     *
     * @param whiteSpaceProcessing how whitespace must be handled
     * @throws NullPointerException if {@code whiteSpaceProcessing} is {@code null}
     */
    public TokenizerBySpaceWord(WhiteSpaceProcessing whiteSpaceProcessing) {
        if (whiteSpaceProcessing == null) {
            throw new NullPointerException("the white space processing must be specified.");
        }
        this.whitespace = whiteSpaceProcessing;
    }

    /**
     * Splits the character sequence into word and whitespace tokens, in document order.
     *
     * <p>When the mode is {@link WhiteSpaceProcessing#IGNORE}, no whitespace token is
     * emitted at all, including for whitespace that trails the last word.
     *
     * @param charSequence the text to tokenize
     * @return the tokens found; an empty list if the sequence is empty
     * @throws NullPointerException if {@code charSequence} is {@code null}
     */
    @Override
    public List<TextToken> tokenize(CharSequence charSequence) {
        if (charSequence == null) {
            throw new NullPointerException("Character sequence is null");
        }
        if (charSequence.length() == 0) {
            return Collections.emptyList();
        }
        final boolean keepSpaces = this.whitespace != WhiteSpaceProcessing.IGNORE;
        List<TextToken> tokens = new ArrayList<>(charSequence.length() / 4);
        Matcher matcher = WORD_PATTERN.matcher(charSequence);

        // End offset of the previous match; anything between it and the next match is whitespace.
        int index = 0;
        while (matcher.find()) {
            if (keepSpaces && index != matcher.start()) {
                tokens.add(getSpaceEvent(charSequence.subSequence(index, matcher.start()).toString()));
            }
            tokens.add(getWordEvent(matcher.group()));
            index = matcher.end();
        }

        // Whitespace left after the last word is subject to the same mode as inner whitespace.
        if (keepSpaces && index != charSequence.length()) {
            tokens.add(getSpaceEvent(charSequence.subSequence(index, charSequence.length()).toString()));
        }
        return tokens;
    }

    /**
     * Convenience method tokenizing the text with a new tokenizer instance.
     *
     * @param charSequence         the text to tokenize
     * @param whiteSpaceProcessing how whitespace must be handled
     * @return the tokens found; an empty list if the sequence is empty
     * @throws NullPointerException if either argument is {@code null}
     */
    public static List<TextToken> tokenize(CharSequence charSequence, WhiteSpaceProcessing whiteSpaceProcessing) {
        return new TokenizerBySpaceWord(whiteSpaceProcessing).tokenize(charSequence);
    }

    /**
     * Returns the word token for the given text, creating and caching it on first use.
     *
     * @param str the text of the word
     * @return the cached or newly created word token
     */
    private TextToken getWordEvent(String str) {
        TextToken token = this.recycling.get(str);
        if (token == null) {
            token = new WordToken(str);
            this.recycling.put(str, token);
        }
        return token;
    }

    /**
     * Returns the whitespace token for the given text, creating and caching it on first use.
     *
     * <p>An {@link IgnorableSpaceToken} is created in {@link WhiteSpaceProcessing#PRESERVE}
     * mode; otherwise the token comes from {@link SpaceToken#getInstance}.
     *
     * @param str the whitespace text
     * @return the cached or newly created whitespace token
     */
    private TextToken getSpaceEvent(String str) {
        TextToken token = this.recycling.get(str);
        if (token == null) {
            if (this.whitespace == WhiteSpaceProcessing.PRESERVE) {
                token = new IgnorableSpaceToken(str);
            } else {
                token = SpaceToken.getInstance(str);
            }
            this.recycling.put(str, token);
        }
        return token;
    }
}
