package org.fbk.cit.hlt.core.lsa.io;

import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.util.Map;
import java.util.regex.Pattern;
import org.apache.log4j.Logger;
import org.fbk.cit.hlt.core.lsa.Index;
import org.fbk.cit.hlt.core.lsa.TermSet;
import org.fbk.cit.hlt.core.lsa.Vocabulary;

/* loaded from: input_file:org/fbk/cit/hlt/core/lsa/io/TermDocumentMatrixBuilder.class */
/**
 * Base class for builders that convert tokenized documents into a term-by-document matrix.
 *
 * <p>Given an output prefix {@code P}, an instance works with four files:
 * <ul>
 *   <li>{@code P-matrix} - the matrix itself, written column by column through a
 *       {@link SparseBinaryMatrixFileWriter};</li>
 *   <li>{@code P-row} - the term index, written by {@link #close()};</li>
 *   <li>{@code P-col} - the document index, written by {@link #close()};</li>
 *   <li>{@code P-df} - the corpus-level vocabulary, written by {@link #close()}.</li>
 * </ul>
 *
 * <p>Subclasses implement {@link #read(File)} and are expected to feed documents to
 * {@link #addDocument(String[])}.
 */
public abstract class TermDocumentMatrixBuilder {
    /** Maps terms to the row identifiers used in the matrix. */
    protected Index termIndex;

    /** Index of document identifiers; persisted to the {@code -col} file. */
    protected Index documentIndex;

    /** Sink that receives one column per document that has at least one accepted term. */
    protected MatrixFileWriter matrixWriter;

    /** Number of columns written to {@link #matrixWriter} so far. */
    protected int columnCount;

    /** Accumulates every distinct accepted term of every written document (once per document). */
    protected Vocabulary corpusVocabulary;

    /** Terms to exclude; only consulted when {@link #keywordSet} is empty. */
    protected TermSet stopwordSet;

    /**
     * Histogram of input array lengths maintained by {@link #addDocument(String[])}:
     * arrays of length {@code n <= 100} increment slot {@code n - 1}, longer arrays increment slot 0.
     */
    protected int[] lengthFreq;

    protected File matrixFile;
    protected File rowFile;
    protected File colFile;
    protected File dfFile;
    static Logger logger = Logger.getLogger(TermDocumentMatrixBuilder.class.getName());
    protected static Pattern spacePattern = Pattern.compile(" ");

    /** Running total of tokens (array length minus the leading identifier) seen so far. */
    protected int totalKW = 0;

    /** When non-empty, only terms contained in this set are indexed. */
    protected TermSet keywordSet = new TermSet();

    /**
     * Loads the keyword and stopword lists and opens the matrix writer.
     *
     * @param str   prefix of the output files ({@code -matrix}, {@code -row}, {@code -col},
     *              {@code -df} are appended to it)
     * @param file  file the stopword set is read from
     * @param file2 file the keyword set is read from
     * @throws IOException if one of the list files cannot be read or the matrix writer
     *                     cannot be created
     */
    public TermDocumentMatrixBuilder(String str, File file, File file2) throws IOException {
        // Readers are closed here; the original left both file handles open.
        try (FileReader keywordReader = new FileReader(file2)) {
            this.keywordSet.read(keywordReader);
        }
        logger.info("keyword to be indexed: " + this.keywordSet.size());

        this.stopwordSet = new TermSet();
        try (FileReader stopwordReader = new FileReader(file)) {
            this.stopwordSet.read(stopwordReader);
        }
        logger.info(file + "(" + this.stopwordSet.size() + ")");

        this.lengthFreq = new int[101];
        this.columnCount = 0;
        this.matrixFile = new File(str + "-matrix");
        this.rowFile = new File(str + "-row");
        this.colFile = new File(str + "-col");
        this.dfFile = new File(str + "-df");
        this.termIndex = new Index();
        this.documentIndex = new Index();
        this.matrixWriter = new SparseBinaryMatrixFileWriter(this.matrixFile);
        this.corpusVocabulary = new Vocabulary();
    }

    /**
     * Writes the term index, the document index and the corpus vocabulary to their files
     * and closes the matrix writer.
     *
     * <p>Each file writer is closed as soon as its content has been written, and the matrix
     * writer is closed even when writing one of the indexes fails.
     *
     * @throws IOException if any of the files cannot be written or closed
     */
    public void close() throws IOException {
        try {
            try (FileWriter rowWriter = new FileWriter(this.rowFile)) {
                this.termIndex.write(rowWriter);
            }
            try (FileWriter colWriter = new FileWriter(this.colFile)) {
                this.documentIndex.write(colWriter);
            }
        } finally {
            this.matrixWriter.close();
        }
        try (FileWriter dfWriter = new FileWriter(this.dfFile)) {
            this.corpusVocabulary.write(dfWriter);
        }
    }

    /**
     * Reads documents from the given file.
     *
     * @param file input to process
     * @throws IOException if the input cannot be read
     */
    public abstract void read(File file) throws IOException;

    /**
     * Adds one document as a new matrix column.
     *
     * <p>Element 0 of {@code strArr} is the document identifier, the remaining elements are
     * its tokens. Tokens are lower-cased and kept only when they pass {@link #isWord(String)}
     * and the keyword/stopword filter: with a non-empty keyword set only keywords are kept,
     * otherwise everything not in the stopword set is kept. Each distinct kept term gets the
     * weight {@code 1 + ln(f)}, where {@code f} is the value the document-local vocabulary
     * reports for the term. Documents without any kept term produce no column.
     *
     * @param strArr document identifier followed by tokens; an empty array is ignored
     * @throws IOException if the column cannot be written
     */
    protected void addDocument(String[] strArr) throws IOException {
        // An empty array (which can result from splitting a separator-only line) carries no
        // identifier; previously it decremented totalKW and then failed on lengthFreq[-1].
        if (strArr.length == 0) {
            return;
        }

        final boolean debug = logger.isDebugEnabled();
        Vocabulary vocabulary = new Vocabulary();
        this.totalKW += strArr.length - 1;
        for (int i = 1; i < strArr.length; i++) {
            String lowerCase = strArr[i].toLowerCase();
            if (!isWord(lowerCase)) {
                continue;
            }
            if (this.keywordSet.size() == 0) {
                if (this.stopwordSet.size() == 0) {
                    if (debug) {
                        logger.debug("1 adding " + lowerCase);
                    }
                    vocabulary.add(lowerCase);
                } else if (!this.stopwordSet.contains(lowerCase)) {
                    if (debug) {
                        logger.debug("2 adding " + lowerCase);
                    }
                    vocabulary.add(lowerCase);
                }
            } else if (this.keywordSet.contains(lowerCase)) {
                if (debug) {
                    logger.debug("3 adding " + lowerCase);
                }
                vocabulary.add(lowerCase);
            }
        }

        // NOTE(review): slot 0 is shared by id-only documents and by arrays longer than 100,
        // and slot 100 is never written - kept as is, confirm whether this is intended.
        if (strArr.length <= 100) {
            this.lengthFreq[strArr.length - 1]++;
        } else {
            this.lengthFreq[0]++;
        }

        if (vocabulary.size() == 0) {
            return;
        }

        // NOTE(review): the result is discarded; whether get() registers the identifier
        // depends on Index, which is not visible here - confirm (terms use add()).
        this.documentIndex.get(strArr[0]);

        int size = vocabulary.entrySet().size();
        int[] rowIndexes = new int[size];
        float[] weights = new float[size];
        int position = 0;
        for (Map.Entry entry : vocabulary.entrySet()) {
            String term = (String) entry.getKey();
            Vocabulary.TermFrequency termFrequency = (Vocabulary.TermFrequency) entry.getValue();
            rowIndexes[position] = this.termIndex.add(term);
            weights[position] = (float) (1.0d + Math.log(termFrequency.get()));
            this.corpusVocabulary.add(term);
            position++;
        }
        this.columnCount++;
        this.matrixWriter.writeColumn(rowIndexes, weights);
    }

    /**
     * Tells whether a token is indexable: at least two characters long, starting with a
     * letter and continuing with letters or digits only.
     *
     * <p>Any other character, including {@code '-'}, makes the token a non-word. The original
     * code tested for {@code '-'} explicitly, which was redundant because a hyphen is neither
     * a letter nor a digit. NOTE(review): if hyphenated terms were meant to be accepted, the
     * condition needs to change - confirm.
     *
     * @param str token to check
     * @return {@code true} if the token should be considered for indexing
     */
    private boolean isWord(String str) {
        if (str.length() < 2 || !Character.isLetter(str.charAt(0))) {
            return false;
        }
        for (int i = 1; i < str.length(); i++) {
            if (!Character.isLetterOrDigit(str.charAt(i))) {
                return false;
            }
        }
        return true;
    }
}
