package org.deeplearning4j.text.tokenization.tokenizer;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.NavigableMap;
import java.util.concurrent.atomic.AtomicInteger;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/* loaded from: input_file:org/deeplearning4j/text/tokenization/tokenizer/BertWordPieceStreamTokenizer.class */
/**
 * {@link Tokenizer} that produces word-piece tokens incrementally from a character stream
 * instead of loading the whole input up front.
 *
 * <p>Text is pulled from the stream in chunks of at most {@code longestToken} characters
 * (the length of the longest vocabulary key). Each chunk is cut at the first match of
 * {@code BertWordPieceTokenizer.splitPattern}, and the leading piece is resolved against the
 * vocabulary via {@code BertWordPieceTokenizer.findLongestSubstring}. Whatever is left of that
 * piece after the match is kept, prefixed with {@code "##"}, and resolved on the next call.
 *
 * <p>NOTE(review): {@code splitPattern} and {@code findLongestSubstring} are defined in
 * {@code BertWordPieceTokenizer}; presumably the former matches token separators and the latter
 * returns the longest vocabulary entry that prefixes its argument - confirm against that class.
 *
 * <p>The streaming state ({@code buffer}, {@code prevRest}, {@code noSplit}, {@code more}) is
 * read and written without any synchronization.
 */
public class BertWordPieceStreamTokenizer implements Tokenizer {
    private static final Logger log = LoggerFactory.getLogger(BertWordPieceStreamTokenizer.class);

    /** Vocabulary; the constructor requires it to be sorted in reverse order. */
    private final NavigableMap<String, Integer> vocab;
    /** Source of characters still to be tokenized. */
    private final Reader reader;
    /** When {@code true}, every chunk read from the stream is lower-cased before use. */
    private final boolean lowerCaseOnly;
    /** Length of the longest vocabulary key; also the target chunk size for {@link #readMore()}. */
    private final int longestToken;
    /** Optional post-processor applied to every token produced from the stream. */
    private TokenPreProcess tokenPreProcess;
    /** {@code false} once a read has returned end-of-stream or failed with an I/O error. */
    private boolean more = true;
    /** Text read from the stream that has not been cut into pieces yet. */
    private String buffer = "";
    /** Unmatched tail of the piece currently being resolved, or {@code null} if there is none. */
    private String prevRest = null;
    /** {@code true} when the last cut consumed the whole buffer without finding a split match. */
    private boolean noSplit = false;
    /** Tokens collected by {@link #getTokens()}. */
    private final List<String> tokens = new ArrayList<>();
    /** Index of the next entry of {@link #tokens} to hand out from {@link #nextToken()}. */
    private final AtomicInteger position = new AtomicInteger(0);

    /**
     * Creates a tokenizer that decodes {@code inputStream} with the platform default charset.
     *
     * <p>Kept for backward compatibility; prefer
     * {@link #BertWordPieceStreamTokenizer(InputStream, Charset, NavigableMap, boolean)} so that
     * decoding does not depend on the machine the code runs on.
     *
     * @param inputStream   bytes of the text to tokenize
     * @param vocab         vocabulary, sorted in reverse order
     * @param lowerCaseOnly whether to lower-case the text before matching it against the vocabulary
     * @throws IllegalArgumentException if {@code vocab} has no comparator or its comparator does
     *                                  not order {@code "a"} after {@code "b"}
     */
    public BertWordPieceStreamTokenizer(InputStream inputStream, NavigableMap<String, Integer> vocab,
                                        boolean lowerCaseOnly) {
        this(inputStream, Charset.defaultCharset(), vocab, lowerCaseOnly);
    }

    /**
     * Creates a tokenizer that decodes {@code inputStream} with an explicit charset.
     *
     * @param inputStream   bytes of the text to tokenize
     * @param charset       charset used to decode {@code inputStream}
     * @param vocab         vocabulary, sorted in reverse order
     * @param lowerCaseOnly whether to lower-case the text before matching it against the vocabulary
     * @throws IllegalArgumentException if {@code vocab} has no comparator or its comparator does
     *                                  not order {@code "a"} after {@code "b"}
     */
    public BertWordPieceStreamTokenizer(InputStream inputStream, Charset charset,
                                        NavigableMap<String, Integer> vocab, boolean lowerCaseOnly) {
        this.lowerCaseOnly = lowerCaseOnly;
        // A null comparator means natural (ascending) order, which is rejected as well.
        if (vocab.comparator() == null || vocab.comparator().compare("a", "b") < 0) {
            throw new IllegalArgumentException("Vocab must use reverse sort order!");
        }
        this.reader = new BufferedReader(new InputStreamReader(inputStream, charset));
        this.vocab = vocab;

        int longest = 0;
        for (String key : vocab.keySet()) {
            if (key.length() > longest) {
                longest = key.length();
            }
        }
        this.longestToken = longest;
    }

    /**
     * Reports whether another call to the streaming path may produce a token.
     *
     * <p>End-of-stream is only detected by an actual read, so this can still return {@code true}
     * when the stream has in fact just been exhausted but no read has observed that yet.
     *
     * @return {@code true} if the stream is not known to be exhausted, or unprocessed text or a
     *         pending remainder is still held
     */
    @Override // org.deeplearning4j.text.tokenization.tokenizer.Tokenizer
    public boolean hasMoreTokens() {
        return this.more || this.buffer.length() > 0 || this.prevRest != null;
    }

    /**
     * Reads up to {@code longestToken} characters from the stream and merges them into the
     * streaming state.
     *
     * <p>In the normal case the chunk is appended to {@code buffer}. If the previous cut found no
     * split match ({@code noSplit}), the part of the chunk before the first split match is appended
     * to {@code prevRest} instead, and only the part after it goes to {@code buffer}.
     *
     * <p>An {@link IOException} is logged and treated like end-of-stream.
     */
    private void readMore() {
        StringBuilder chunk = new StringBuilder(this.longestToken);
        while (this.more && chunk.length() < this.longestToken) {
            try {
                int c = this.reader.read();
                if (c >= 0) {
                    // Reader.read() yields a single UTF-16 code unit (0..65535).
                    chunk.append((char) c);
                } else {
                    this.more = false;
                }
            } catch (IOException e) {
                // Best effort: stop reading and tokenize what has been received so far.
                this.more = false;
                log.error("Unexpected exception while reading input stream", e);
            }
        }

        String text = chunk.toString();
        if (this.lowerCaseOnly) {
            // Locale.ROOT keeps the mapping independent of the JVM default locale
            // (e.g. "I" must become "i", not the Turkish dotless variant).
            text = text.toLowerCase(Locale.ROOT);
        }

        if (!this.noSplit) {
            this.buffer += text;
            return;
        }

        String[] parts = BertWordPieceTokenizer.splitPattern.split(text, 2);
        this.prevRest = (this.prevRest == null ? "" : this.prevRest) + parts[0];
        if (parts.length > 1) {
            // A split match was found, so the carried-over piece is now complete.
            this.noSplit = false;
            this.buffer += parts[1];
        }
    }

    /**
     * Forces the remaining stream to be tokenized and returns the number of collected tokens.
     *
     * @return size of the list returned by {@link #getTokens()}
     */
    @Override // org.deeplearning4j.text.tokenization.tokenizer.Tokenizer
    public int countTokens() {
        return getTokens().size();
    }

    /**
     * Returns the next token.
     *
     * <p>Tokens already collected by {@link #getTokens()} are handed out first, in order; once
     * those are used up (or none were collected) the token is produced directly from the stream.
     *
     * @return the next token
     */
    @Override // org.deeplearning4j.text.tokenization.tokenizer.Tokenizer
    public String nextToken() {
        // position never goes below 0, so this also covers the empty-list case.
        if (this.position.get() >= this.tokens.size()) {
            return nextTokenFromStream();
        }
        return this.tokens.get(this.position.getAndIncrement());
    }

    /**
     * Produces one token from the streaming state, reading more input when needed.
     *
     * <p>If a remainder from the previous call is pending it is resolved first; otherwise the next
     * piece is cut from {@code buffer}. The vocabulary match is returned (after the optional
     * pre-processor) and the unmatched tail, prefixed with {@code "##"}, becomes the new remainder.
     *
     * @return the matched token, passed through {@code tokenPreProcess} when one is set
     */
    private String nextTokenFromStream() {
        if (this.noSplit && this.more) {
            // The last piece ran to the end of the buffer; it may continue in the stream.
            readMore();
        }

        String piece = this.prevRest;
        if (piece == null || piece.length() == 0) {
            if (this.buffer.length() < this.longestToken && this.more) {
                readMore();
            }
            String[] parts = BertWordPieceTokenizer.splitPattern.split(this.buffer, 2);
            piece = parts[0];
            if (parts.length > 1) {
                this.buffer = parts[1];
                this.noSplit = false;
            } else {
                this.buffer = "";
                this.noSplit = true;
            }
        }

        String match = BertWordPieceTokenizer.findLongestSubstring(this.vocab, piece);
        String rest = piece.substring(match.length());
        if (piece.length() > match.length()) {
            rest = "##" + rest;
        }
        if ("##".equals(rest) || rest.length() == 0) {
            // Nothing meaningful is left of this piece.
            rest = null;
        }
        this.prevRest = rest;

        if (this.tokenPreProcess != null) {
            match = this.tokenPreProcess.preProcess(match);
        }
        return match;
    }

    /**
     * Returns the collected tokens, tokenizing the rest of the stream on first use.
     *
     * <p>If the list already holds tokens it is returned as is. Otherwise tokens are pulled from
     * the stream until {@link #hasMoreTokens()} reports {@code false}. Tokens that were already
     * consumed through {@link #nextToken()} before this call are not part of the list.
     *
     * @return the internal, mutable token list
     */
    @Override // org.deeplearning4j.text.tokenization.tokenizer.Tokenizer
    public List<String> getTokens() {
        if (!this.tokens.isEmpty()) {
            return this.tokens;
        }
        log.info("Starting prebuffering...");
        while (hasMoreTokens()) {
            this.tokens.add(nextTokenFromStream());
        }
        log.info("Tokens prefetch finished. Tokens size: [{}]", this.tokens.size());
        return this.tokens;
    }

    /**
     * Sets the pre-processor applied to each token produced from the stream.
     *
     * @param tokenPreProcess pre-processor to apply, or {@code null} to return tokens unchanged
     */
    @Override // org.deeplearning4j.text.tokenization.tokenizer.Tokenizer
    public void setTokenPreProcessor(TokenPreProcess tokenPreProcess) {
        this.tokenPreProcess = tokenPreProcess;
    }
}
