/*
 * Decompiled with CFR 0.152.
 */
package org.datavec.nlp.vectorizer;

import java.util.Collection;
import org.datavec.api.conf.Configuration;
import org.datavec.api.records.Record;
import org.datavec.api.records.reader.RecordReader;
import org.datavec.api.vector.Vectorizer;
import org.datavec.api.writable.Writable;
import org.datavec.nlp.metadata.DefaultVocabCache;
import org.datavec.nlp.metadata.VocabCache;
import org.datavec.nlp.stopwords.StopWords;
import org.datavec.nlp.tokenization.tokenizer.Tokenizer;
import org.datavec.nlp.tokenization.tokenizerfactory.TokenizerFactory;
import org.nd4j.linalg.primitives.Counter;

/**
 * Base class for vectorizers that operate on text records.
 *
 * <p>The class wires together a {@link TokenizerFactory}, a stop word collection and a
 * {@link VocabCache}, all of which are set up from a {@link Configuration} in
 * {@link #initialize(Configuration)}. Subclasses supply the tokenizer factory via
 * {@link #createTokenizerFactory(Configuration)} and decide what happens to the tokens of
 * each record via {@link #doWithTokens(Tokenizer)}.
 *
 * @param <VECTOR_TYPE> the vector type parameter passed through to {@link Vectorizer}
 */
public abstract class TextVectorizer<VECTOR_TYPE>
implements Vectorizer<VECTOR_TYPE> {
    /** Factory used to create a tokenizer for the text of each record. */
    protected TokenizerFactory tokenizerFactory;
    /**
     * Minimum word frequency. Holds 0 until {@link #initialize(Configuration)} runs, which
     * overwrites it with the configured value (falling back to 5 when the key is absent).
     */
    protected int minWordFrequency = 0;
    /** Configuration key for the minimum word frequency (note the {@code org.nd4j} prefix). */
    public static final String MIN_WORD_FREQUENCY = "org.nd4j.nlp.minwordfrequency";
    /** Configuration key for the stop word collection (note the {@code org.nd4j} prefix). */
    public static final String STOP_WORDS = "org.nd4j.nlp.stopwords";
    /** Configuration key constant for the tokenizer factory; not read by this class itself. */
    public static final String TOKENIZER = "org.datavec.nlp.tokenizerfactory";
    /** Configuration key constant for the preprocessor; not read by this class itself. */
    public static final String PREPROCESSOR = "org.datavec.nlp.preprocessor";
    /** Configuration key for the fully qualified class name of the {@link VocabCache} to use. */
    public static final String VOCAB_CACHE = "org.datavec.nlp.vocabcache";
    /** Stop words; populated by {@link #initialize(Configuration)}. */
    protected Collection<String> stopWords;
    /** Vocabulary cache; created reflectively by {@link #initialize(Configuration)}. */
    protected VocabCache cache;

    /**
     * Configures this vectorizer.
     *
     * <p>Creates the tokenizer factory, reads the minimum word frequency (default 5), resolves
     * the stop words (configured value if present, otherwise the result of
     * {@link StopWords#getStopWords()} when no stop words are set yet) and instantiates and
     * initializes the vocab cache named under {@link #VOCAB_CACHE}
     * (default {@link DefaultVocabCache}).
     *
     * @param conf the configuration to read settings from
     * @throws RuntimeException if the vocab cache cannot be loaded, instantiated, cast to
     *         {@link VocabCache} or initialized; the original failure is kept as the cause
     */
    public void initialize(Configuration conf) {
        this.tokenizerFactory = this.createTokenizerFactory(conf);
        this.minWordFrequency = conf.getInt(MIN_WORD_FREQUENCY, 5);
        if (conf.get(STOP_WORDS) != null) {
            this.stopWords = conf.getStringCollection(STOP_WORDS);
        }
        // Only fall back to the built-in list when nothing has been set at all.
        if (this.stopWords == null) {
            this.stopWords = StopWords.getStopWords();
        }
        String cacheClassName = conf.get(VOCAB_CACHE, DefaultVocabCache.class.getName());
        try {
            Class<?> cacheClass = Class.forName(cacheClassName);
            // Class.newInstance() is deprecated; go through the no-arg constructor instead.
            this.cache = (VocabCache)cacheClass.getDeclaredConstructor().newInstance();
            this.cache.initialize(conf);
        }
        catch (Exception e) {
            // Broad catch kept on purpose: every failure on this path (including ones raised
            // while initializing the cache) has always surfaced as a RuntimeException.
            throw new RuntimeException("Unable to create vocab cache of type " + cacheClassName, e);
        }
    }

    /**
     * Fits this vectorizer on every record of the reader without a per-record callback.
     *
     * @param reader the source of records
     */
    public void fit(RecordReader reader) {
        this.fit(reader, null);
    }

    /**
     * Fits this vectorizer on every remaining record of the reader.
     *
     * <p>For each record the writables are concatenated into one string, a tokenizer is created
     * for that string and handed to {@link #doWithTokens(Tokenizer)}, the callback (if any) is
     * notified, and the document count of the vocab cache is incremented by one.
     *
     * @param reader the source of records
     * @param callBack invoked with each record after its tokens were processed; may be
     *        {@code null}
     */
    public void fit(RecordReader reader, Vectorizer.RecordCallBack callBack) {
        while (reader.hasNext()) {
            Record record = reader.nextRecord();
            String text = this.toString(record.getRecord());
            Tokenizer tokenizer = this.tokenizerFactory.create(text);
            this.doWithTokens(tokenizer);
            if (callBack != null) {
                callBack.onRecord(record);
            }
            this.cache.incrementNumDocs(1.0);
        }
    }

    /**
     * Counts how often each token occurs in a single record.
     *
     * @param record the writables making up the record
     * @return a counter in which every token produced by the tokenizer has been incremented by
     *         1.0 per occurrence
     */
    protected Counter<String> wordFrequenciesForRecord(Collection<Writable> record) {
        String text = this.toString(record);
        Tokenizer tokenizer = this.tokenizerFactory.create(text);
        Counter<String> frequencies = new Counter<>();
        while (tokenizer.hasMoreTokens()) {
            frequencies.incrementCount(tokenizer.nextToken(), 1.0);
        }
        return frequencies;
    }

    /**
     * Concatenates the string form of every writable in the record.
     *
     * <p>No separator is inserted between consecutive writables.
     *
     * @param record the writables making up the record
     * @return the concatenated text
     */
    protected String toString(Collection<Writable> record) {
        StringBuilder sb = new StringBuilder();
        for (Writable w : record) {
            sb.append(w.toString());
        }
        return sb.toString();
    }

    /**
     * Processes the tokens of one record; called once per record from
     * {@link #fit(RecordReader, Vectorizer.RecordCallBack)}.
     *
     * @param tokenizer tokenizer created for the text of the current record
     */
    public abstract void doWithTokens(Tokenizer tokenizer);

    /**
     * Creates the tokenizer factory used by this vectorizer; called from
     * {@link #initialize(Configuration)}.
     *
     * @param conf the configuration passed to {@link #initialize(Configuration)}
     * @return the tokenizer factory to use
     */
    public abstract TokenizerFactory createTokenizerFactory(Configuration conf);

    /**
     * Returns the vocab cache.
     *
     * @return the vocab cache, or {@code null} if {@link #initialize(Configuration)} has not
     *         assigned one yet
     */
    public VocabCache getCache() {
        return this.cache;
    }
}

