package org.fbk.cit.hlt.core.lsa.io;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.LineNumberReader;
import java.io.OutputStreamWriter;
import java.util.Date;
import java.util.regex.Pattern;
import org.apache.log4j.Logger;
import org.apache.log4j.PropertyConfigurator;
import org.fbk.cit.hlt.core.analysis.stemmer.Stemmer;
import org.fbk.cit.hlt.core.lsa.Index;
import org.fbk.cit.hlt.core.lsa.TermSet;
import org.fbk.cit.hlt.core.lsa.Vocabulary;
import org.fbk.cit.hlt.thewikimachine.util.StringTable;
import org.fbk.cit.hlt.thewikimachine.xmldump.util.ParsedPageLink;

/* loaded from: input_file:org/fbk/cit/hlt/core/lsa/io/TermDocumentMatrixFileWriter.class */
/**
 * Builds a term-by-document matrix from a plain-text corpus and writes it to disk.
 *
 * <p>Every corpus line is one document: it is split on single spaces, the first field is
 * used as the document identifier and the remaining fields are the document's tokens.
 * Tokens are lower-cased, optionally stemmed, filtered (see {@link #isWord(String)}, the
 * stopword set and the keyword set) and counted. All the work happens in the constructor,
 * which writes the following files, where {@code root} is the {@code str} argument:
 * <ul>
 *   <li>{@code root-matrix} - the sparse matrix, one column per non-empty document
 *       (only when the matrix is saved);</li>
 *   <li>{@code root-row} - the term index (only when the matrix is saved);</li>
 *   <li>{@code root-col} - the document index (only when the matrix is saved);</li>
 *   <li>{@code root-df} - the corpus vocabulary (always).</li>
 * </ul>
 */
public class TermDocumentMatrixFileWriter {
    /** Largest token count that gets its own histogram bucket; longer documents share bucket 0. */
    private static final int MAX_TRACKED_LENGTH = 100;

    /** Assigns an integer id to every indexed term; the ids are the row indices of the matrix. */
    protected Index termIndex;
    /** Holds the identifier (first field of the line) of every document written as a column. */
    protected Index documentIndex;
    /** Destination of the matrix columns; stays {@code null} when the matrix is not saved. */
    protected MatrixFileWriter matrixWriter;
    /** Number of documents that contributed at least one term. */
    protected int columnCount;
    /** Receives every distinct term of every document once (presumably yielding document frequencies). */
    protected Vocabulary corpusVocabulary;
    /** Terms to discard; consulted only when the keyword set is empty. */
    protected TermSet stopwordSet;
    /** When non-empty, only these terms are indexed and the stopword set is ignored. */
    protected TermSet keywordSet;
    /** Running total of tokens seen (identifier field excluded), before any filtering. */
    protected int totalKW;
    /** Histogram of document lengths: slot k counts documents with k tokens (1..100), slot 0 those with more. */
    protected int[] lengthFreq;
    /** Never assigned by this class, so it keeps its default value {@code false}. */
    protected boolean indexAllTokens;
    /** Whether the matrix and both indexes are produced in addition to the vocabulary. */
    protected boolean saveMatrix;
    /** Optional stemmer applied to every lower-cased token; {@code null} disables stemming. */
    Stemmer stemmer;
    static Logger logger = Logger.getLogger(TermDocumentMatrixFileWriter.class.getName());
    protected static Pattern spacePattern = Pattern.compile(" ");

    /**
     * Processes the corpus without stemming.
     *
     * @see #TermDocumentMatrixFileWriter(File, String, File, File, int, boolean, Stemmer)
     */
    public TermDocumentMatrixFileWriter(File file, String str, File file2, File file3, int i, boolean z) {
        this(file, str, file2, file3, i, z, null);
    }

    /**
     * Reads the stopword and keyword lists, processes up to {@code i} corpus documents and
     * writes the output files. Failures are logged (with stack trace) rather than thrown, so
     * callers must inspect the log to learn whether the run succeeded.
     *
     * @param file    corpus file, UTF-8, one document per line
     * @param str     root path of the output files
     * @param file2   stopword file, UTF-8 (read with the stemmer)
     * @param file3   keyword file, UTF-8 (read without the stemmer)
     * @param i       maximum number of corpus lines to process
     * @param z       {@code true} to save the matrix and the indexes, {@code false} to save
     *                the vocabulary only
     * @param stemmer stemmer applied to tokens and stopwords, or {@code null} for none
     */
    public TermDocumentMatrixFileWriter(File file, String str, File file2, File file3, int i, boolean z, Stemmer stemmer) {
        this.stemmer = stemmer;
        this.saveMatrix = z;
        try {
            this.totalKW = 0;
            loadTermSets(file2, file3);
            this.lengthFreq = new int[MAX_TRACKED_LENGTH + 1];
            this.columnCount = 0;
            File matrixFile = new File(str + "-matrix");
            File rowFile = new File(str + "-row");
            File colFile = new File(str + "-col");
            File dfFile = new File(str + "-df");
            this.termIndex = new Index();
            this.documentIndex = new Index();
            if (z) {
                this.matrixWriter = new SparseBinaryMatrixFileWriter(matrixFile);
            }
            this.corpusVocabulary = new Vocabulary();

            processCorpus(file, i);

            if (z) {
                BufferedWriter rowWriter = openUtf8Writer(rowFile);
                try {
                    this.termIndex.write(rowWriter);
                } finally {
                    rowWriter.close();
                }
                BufferedWriter colWriter = openUtf8Writer(colFile);
                try {
                    this.documentIndex.write(colWriter);
                } finally {
                    colWriter.close();
                }
            }
            BufferedWriter dfWriter = openUtf8Writer(dfFile);
            try {
                this.corpusVocabulary.write(dfWriter);
            } finally {
                dfWriter.close();
            }
            logger.info("columnCount: " + this.columnCount);
        } catch (Exception e) {
            // Passing the throwable separately makes log4j print the stack trace.
            logger.error("Failed to build the term-document matrix from " + file, e);
        } finally {
            // Runs on success and on failure so the matrix file is never left open.
            closeMatrixWriter();
        }
    }

    /** Reads the keyword set (unstemmed) and the stopword set (stemmed), closing both files. */
    private void loadTermSets(File stopwordFile, File keywordFile) throws IOException {
        this.keywordSet = new TermSet();
        BufferedReader keywordReader = openUtf8Reader(keywordFile);
        try {
            this.keywordSet.read(keywordReader);
        } finally {
            keywordReader.close();
        }
        logger.info(this.keywordSet.size() + " keywords read from " + keywordFile);

        this.stopwordSet = new TermSet();
        BufferedReader stopwordReader = openUtf8Reader(stopwordFile);
        try {
            this.stopwordSet.read(stopwordReader, this.stemmer);
        } finally {
            stopwordReader.close();
        }
        logger.info(this.stopwordSet.size() + " stopwords read from " + stopwordFile);
    }

    /** Feeds at most {@code maxDocuments} corpus lines to {@link #addDocument} and logs progress. */
    private void processCorpus(File corpus, int maxDocuments) throws IOException {
        long start = System.currentTimeMillis();
        LineNumberReader reader = new LineNumberReader(new InputStreamReader(new FileInputStream(corpus), "UTF-8"));
        try {
            int docNumber = 1;
            long lastMinorMark = System.currentTimeMillis();
            long lastMajorMark = lastMinorMark;
            String line;
            while ((line = reader.readLine()) != null && docNumber <= maxDocuments) {
                addDocument(spacePattern.split(line));
                if (docNumber % 100000 == 0) {
                    long now = System.currentTimeMillis();
                    logger.info(docNumber + StringTable.HORIZONTAL_TABULATION + ((now - lastMajorMark) / 1000.0d) + " total s (" + new Date() + "), voc size:" + this.corpusVocabulary.size() + ", term index size:" + this.termIndex.size() + ", totalKW: " + this.totalKW);
                    lastMajorMark = System.currentTimeMillis();
                    lastMinorMark = lastMajorMark;
                } else if (docNumber % 10000 == 0) {
                    long now = System.currentTimeMillis();
                    logger.info(docNumber + StringTable.HORIZONTAL_TABULATION + ((now - lastMinorMark) / 1000.0d) + " total s,  voc size:" + this.corpusVocabulary.size());
                    lastMinorMark = System.currentTimeMillis();
                } else if (docNumber % 500 == 0) {
                    System.out.print(".");
                }
                if (docNumber % 1000000 == 0) {
                    logLengthStatistics(docNumber);
                }
                docNumber++;
            }
        } finally {
            reader.close();
        }
        long end = System.currentTimeMillis();
        System.out.print("\n");
        logger.info(corpus + " processed in " + (end - start) + " ms");
    }

    /** Logs the document-length histogram with the cumulative fraction of {@code documentCount}. */
    private void logLengthStatistics(int documentCount) {
        logger.info("keyword stat");
        logger.info("doc stat");
        double cumulative = 0.0d;
        for (int len = 1; len < this.lengthFreq.length; len++) {
            if (this.lengthFreq[len] > 0) {
                // Cast first: int / int would truncate every fraction to 0.
                cumulative += (double) this.lengthFreq[len] / documentCount;
                logger.info("F(" + len + ")=" + this.lengthFreq[len] + ParsedPageLink.START_SUFFIX_PATTERN + cumulative + ")");
            }
        }
        if (this.lengthFreq[0] > 0) {
            double withOverflow = cumulative + ((double) this.lengthFreq[0] / documentCount);
            logger.info("F(freq>100)=" + this.lengthFreq[0] + ParsedPageLink.START_SUFFIX_PATTERN + withOverflow + ")");
        }
    }

    /**
     * Indexes one document.
     *
     * @param strArr fields of a corpus line: identifier first, tokens after it
     * @throws IOException if the matrix column cannot be written
     */
    private void addDocument(String[] strArr) throws IOException {
        if (strArr.length == 0) {
            // A line made only of spaces splits into an empty array (trailing empty strings
            // are dropped); there is neither an identifier nor a token to index.
            return;
        }
        int tokenCount = strArr.length - 1;
        this.totalKW += tokenCount;

        Vocabulary vocabulary = new Vocabulary();
        for (int t = 1; t < strArr.length; t++) {
            String lowered = strArr[t].toLowerCase();
            String term = this.stemmer != null ? this.stemmer.stem(lowered) : lowered;
            if (isWord(term) && isIndexable(term)) {
                vocabulary.add(term);
            }
        }

        // Slot 0 is reserved for documents longer than the tracked maximum; documents
        // without tokens are left out so they are not reported as "freq>100".
        if (tokenCount > MAX_TRACKED_LENGTH) {
            this.lengthFreq[0]++;
        } else if (tokenCount > 0) {
            this.lengthFreq[tokenCount]++;
        }

        if (vocabulary.size() == 0) {
            return;
        }
        if (this.saveMatrix) {
            this.documentIndex.add(strArr[0]);
        }
        int distinctTerms = vocabulary.entrySet().size();
        int[] rows = new int[distinctTerms];
        float[] weights = new float[distinctTerms];
        int k = 0;
        for (String term : vocabulary.keySet()) {
            if (this.saveMatrix) {
                int tf = vocabulary.get(term);
                rows[k] = this.termIndex.add(term);
                // Float division: int / int would truncate the weight (usually to 0).
                weights[k] = (float) tf / distinctTerms;
            }
            this.corpusVocabulary.add(term);
            k++;
        }
        this.columnCount++;
        if (this.saveMatrix) {
            this.matrixWriter.writeColumn(rows, weights);
        }
    }

    /**
     * Applies the keyword/stopword policy: a non-empty keyword set is a whitelist; otherwise
     * everything not in the stopword set is accepted.
     */
    private boolean isIndexable(String term) {
        if (this.keywordSet.size() != 0) {
            return this.keywordSet.contains(term);
        }
        return this.stopwordSet.size() == 0 || !this.stopwordSet.contains(term);
    }

    /**
     * Tells whether {@code str} looks like a word: at least two characters, starting with a
     * letter and continuing with letters, digits or hyphens only.
     */
    private boolean isWord(String str) {
        if (str.length() < 2 || !Character.isLetter(str.charAt(0))) {
            return false;
        }
        for (int i = 1; i < str.length(); i++) {
            char charAt = str.charAt(i);
            if (!Character.isLetterOrDigit(charAt) && charAt != '-') {
                return false;
            }
        }
        return true;
    }

    /** Closes the matrix writer, if one was created, logging instead of propagating failures. */
    private void closeMatrixWriter() {
        if (this.matrixWriter == null) {
            return;
        }
        try {
            this.matrixWriter.close();
        } catch (Exception e) {
            logger.error("Failed to close the matrix file", e);
        }
    }

    /** Opens {@code f} for buffered UTF-8 reading. */
    private static BufferedReader openUtf8Reader(File f) throws IOException {
        return new BufferedReader(new InputStreamReader(new FileInputStream(f), "UTF-8"));
    }

    /** Opens {@code f} for buffered UTF-8 writing, truncating any existing content. */
    private static BufferedWriter openUtf8Writer(File f) throws IOException {
        return new BufferedWriter(new OutputStreamWriter(new FileOutputStream(f), "UTF-8"));
    }

    /**
     * Command-line entry point. Expects six arguments: input corpus, stopword file, keyword
     * file, output root, number of documents and the save-matrix flag. The log4j
     * configuration is taken from the {@code log-config} system property, defaulting to
     * {@code log-config.txt}.
     */
    public static void main(String[] strArr) throws Exception {
        String property = System.getProperty("log-config");
        if (property == null) {
            property = "log-config.txt";
        }
        long currentTimeMillis = System.currentTimeMillis();
        PropertyConfigurator.configure(property);
        if (strArr.length != 6) {
            logger.info(getHelp());
            System.exit(1);
        }
        new TermDocumentMatrixFileWriter(new File(strArr[0]), strArr[3], new File(strArr[1]), new File(strArr[2]), Integer.parseInt(strArr[4]), Boolean.parseBoolean(strArr[5]));
        logger.info("corpus readDocumentList in " + (System.currentTimeMillis() - currentTimeMillis) + " ms");
    }

    /** Builds the usage message; argument names follow the order {@link #main} reads them in. */
    private static String getHelp() {
        StringBuilder help = new StringBuilder();
        help.append("Usage: java -mx1024M org.fbk.cit.hlt.core.lsa.io.TermDocumentMatrixFileWriter input stopwords keywords output n b\n\n");
        help.append("Arguments:\n");
        help.append("\tinput\t\t-> file from which to read the input corpus (txt format)\n");
        help.append("\tstopwords\t-> file from which to read the stopwords (one stopword per line)\n");
        help.append("\tkeywords\t-> file from which to read the keywords to index (one keyword per line)\n");
        help.append("\toutput\t\t-> root of files in which to store resulting term-by-document matrix (in sparse binary format), row index, col index and document frequency\n");
        help.append("\tn\t\t-> number of documents to process\n");
        help.append("\tb\t\t-> true to save the matrix; false to save the term document frequency only\n");
        return help.toString();
    }
}
