package org.datavec.nlp.vectorizer;

import java.util.HashSet;
import org.datavec.api.conf.Configuration;
import org.datavec.api.records.Record;
import org.datavec.api.records.reader.RecordReader;
import org.datavec.nlp.tokenization.tokenizer.TokenPreProcess;
import org.datavec.nlp.tokenization.tokenizer.Tokenizer;
import org.datavec.nlp.tokenization.tokenizerfactory.DefaultTokenizerFactory;
import org.datavec.nlp.tokenization.tokenizerfactory.TokenizerFactory;

/* loaded from: input_file:org/datavec/nlp/vectorizer/AbstractTfidfVectorizer.class */
/**
 * Abstract text vectorizer that accumulates per-token counts and per-token
 * document counts in the inherited {@code cache}, skipping any token found
 * in the inherited {@code stopWords} collection.
 *
 * <p>Subclasses supply the actual vector construction via
 * {@link #createVector(Object[])}, {@link #fitTransform(RecordReader)} and
 * {@link #transform(Record)}.
 *
 * @param <VECTOR_TYPE> the vector representation produced by the subclass
 */
public abstract class AbstractTfidfVectorizer<VECTOR_TYPE> extends TextVectorizer<VECTOR_TYPE> {
    /**
     * Consumes every token from {@code tokenizer} and updates the cache.
     *
     * <p>For each token that is not a stop word, the token count is
     * incremented on every occurrence, while the document count is
     * incremented only the first time the token is seen within this call.
     * NOTE(review): this presumes one call corresponds to one document —
     * confirm against the caller in {@code TextVectorizer}.
     *
     * @param tokenizer source of tokens; drained until it reports no more tokens
     */
    @Override // org.datavec.nlp.vectorizer.TextVectorizer
    public void doWithTokens(Tokenizer tokenizer) {
        // Tokens already counted towards the document count during this call.
        HashSet<String> seenTokens = new HashSet<>();
        while (tokenizer.hasMoreTokens()) {
            String token = tokenizer.nextToken();
            if (this.stopWords.contains(token)) {
                continue;
            }
            this.cache.incrementCount(token);
            // add() returns true only on first insertion, so the document
            // count is bumped at most once per distinct token.
            if (seenTokens.add(token)) {
                this.cache.incrementDocCount(token);
            }
        }
    }

    /**
     * Reflectively builds the tokenizer factory named in the configuration.
     *
     * <p>The factory class name is read from {@link TextVectorizer#TOKENIZER},
     * falling back to {@link DefaultTokenizerFactory}. If
     * {@link TextVectorizer#PREPROCESSOR} is set, an instance of that class is
     * created and installed as the factory's token pre-processor. Both classes
     * are instantiated through their no-argument constructors.
     *
     * @param configuration configuration holding the class names
     * @return the newly created tokenizer factory
     * @throws RuntimeException if either class cannot be loaded, is of the
     *         wrong type, or cannot be instantiated; the original failure is
     *         preserved as the cause
     */
    @Override // org.datavec.nlp.vectorizer.TextVectorizer
    public TokenizerFactory createTokenizerFactory(Configuration configuration) {
        try {
            String factoryClassName =
                    configuration.get(TextVectorizer.TOKENIZER, DefaultTokenizerFactory.class.getName());
            TokenizerFactory tokenizerFactory = instantiate(factoryClassName, TokenizerFactory.class);

            String preProcessorClassName = configuration.get(TextVectorizer.PREPROCESSOR, (String) null);
            if (preProcessorClassName != null) {
                tokenizerFactory.setTokenPreProcessor(instantiate(preProcessorClassName, TokenPreProcess.class));
            }
            return tokenizerFactory;
        } catch (Exception e) {
            // Deliberately broad: every failure on this path is reported to
            // callers as a RuntimeException, with the cause attached.
            throw new RuntimeException("Unable to create tokenizer factory from configuration", e);
        }
    }

    /**
     * Loads {@code className} and instantiates it via its no-argument constructor.
     *
     * <p>Uses {@code getDeclaredConstructor().newInstance()} rather than the
     * deprecated {@code Class.newInstance()}, which rethrows checked
     * constructor exceptions without declaring them.
     *
     * @param className fully qualified name of the class to instantiate
     * @param type      type the loaded class must be assignable to
     * @param <T>       expected instance type
     * @return a new instance of the named class
     * @throws ReflectiveOperationException if the class cannot be found, has
     *         no accessible no-argument constructor, or its constructor throws
     * @throws ClassCastException if the loaded class is not a subtype of {@code type}
     */
    private static <T> T instantiate(String className, Class<T> type) throws ReflectiveOperationException {
        return Class.forName(className).asSubclass(type).getDeclaredConstructor().newInstance();
    }

    /**
     * Creates a vector from the supplied arguments; their meaning is defined
     * by the implementing subclass.
     *
     * @param objArr subclass-specific arguments
     * @return the created vector
     */
    public abstract VECTOR_TYPE createVector(Object[] objArr);

    /**
     * Produces a vector from the records supplied by {@code recordReader}.
     * NOTE(review): presumably fits the vocabulary and transforms in one
     * pass, as the name suggests — confirm against implementations.
     *
     * @param recordReader source of records
     * @return the resulting vector
     */
    public abstract VECTOR_TYPE fitTransform(RecordReader recordReader);

    /**
     * Produces a vector for a single record.
     *
     * @param record the record to vectorize
     * @return the resulting vector
     */
    public abstract VECTOR_TYPE transform(Record record);
}
