package org.fbk.cit.hlt.thewikimachine.classifier;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.LineNumberReader;
import java.text.DecimalFormat;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.commons.cli.PosixParser;
import org.apache.log4j.Logger;
import org.apache.log4j.PropertyConfigurator;
import org.apache.xerces.impl.xs.SchemaSymbols;
import org.fbk.cit.hlt.thewikimachine.analysis.HardTokenizer;
import org.fbk.cit.hlt.thewikimachine.analysis.Tokenizer;
import org.fbk.cit.hlt.thewikimachine.util.StringTable;
import org.fbk.cit.hlt.thewikimachine.xmldump.util.ParsedPageLink;

/* loaded from: input_file:org/fbk/cit/hlt/thewikimachine/classifier/NGramModel.class */
/**
 * In-memory lookup tables for n-gram forms, loaded from tab-separated text files.
 *
 * <p>Two maps are built at construction time: one from a form to an integer id
 * (see {@link #readIndex(File, Set)}) and one from a form to a log2-based weight
 * (see {@link #readValue(File, Set)}). All files are read as UTF-8.
 *
 * <p>The configured n-gram length and size are stored and exposed through
 * {@link #getLength()} and {@link #size()}; nothing in this class uses them
 * to limit what is loaded.
 */
public class NGramModel {
    /** N-gram length used by the constructors that do not take one explicitly. */
    public static final int DEFAULT_N_GRAM_LENGTH = 3;
    /** N-gram size used by the constructors that do not take one explicitly. */
    public static final int DEFAULT_N_GRAM_SIZE = 1000000;
    /** Tokenizer created at construction time; not used by any method of this class. */
    Tokenizer tokenizer;
    private final Map<String, Integer> indexMap;
    private final Map<String, Double> valueMap;
    private final int length;
    private final int size;
    static Logger logger = Logger.getLogger(NGramModel.class.getName());
    private static final Pattern tabPattern = Pattern.compile(StringTable.HORIZONTAL_TABULATION);
    private static final DecimalFormat df = new DecimalFormat("###,###,###,###");
    /** Natural logarithm of 2, used to convert natural logarithms to base 2. */
    public static final double LOG2 = Math.log(2.0d);

    /**
     * Loads a model with the default n-gram length and size.
     *
     * @param file  form/id mapping file
     * @param file2 form/value mapping file
     * @param file3 stopword file
     * @throws IOException if any of the files cannot be read
     */
    public NGramModel(File file, File file2, File file3) throws IOException {
        this(file, file2, file3, DEFAULT_N_GRAM_LENGTH, DEFAULT_N_GRAM_SIZE);
    }

    /**
     * Loads a model with the default n-gram length and size from the given paths.
     *
     * @param str  path of the form/id mapping file
     * @param str2 path of the form/value mapping file
     * @param str3 path of the stopword file
     * @throws IOException if any of the files cannot be read
     */
    public NGramModel(String str, String str2, String str3) throws IOException {
        this(new File(str), new File(str2), new File(str3), DEFAULT_N_GRAM_LENGTH, DEFAULT_N_GRAM_SIZE);
    }

    /**
     * Loads a model.
     *
     * @param file  form/id mapping file
     * @param file2 form/value mapping file
     * @param file3 stopword file
     * @param i     n-gram length, returned by {@link #getLength()}
     * @param i2    n-gram size, returned by {@link #size()}
     * @throws IOException if any of the files cannot be read
     */
    public NGramModel(File file, File file2, File file3, int i, int i2) throws IOException {
        this.length = i;
        this.size = i2;
        this.tokenizer = new HardTokenizer();
        Set<String> stopwords = readStopwords(file3);
        this.indexMap = readIndex(file, stopwords);
        this.valueMap = readValue(file2, stopwords);
    }

    /**
     * Loads a model from the given paths.
     *
     * @param str  path of the form/id mapping file
     * @param str2 path of the form/value mapping file
     * @param str3 path of the stopword file
     * @param i    n-gram length, returned by {@link #getLength()}
     * @param i2   n-gram size, returned by {@link #size()}
     * @throws IOException if any of the files cannot be read
     */
    public NGramModel(String str, String str2, String str3, int i, int i2) throws IOException {
        this(new File(str), new File(str2), new File(str3), i, i2);
    }

    /**
     * Returns the weight loaded for a form.
     *
     * @param str the form to look up
     * @return the stored weight, or {@code null} if the form has none
     */
    public Double getValue(String str) {
        return this.valueMap.get(str);
    }

    /**
     * Returns the id loaded for a form.
     *
     * @param str the form to look up
     * @return the stored id, or {@code null} if the form has none
     */
    public Integer getIndex(String str) {
        return this.indexMap.get(str);
    }

    /**
     * Returns the n-gram size given at construction time.
     * This is the configured value, not the number of loaded entries.
     *
     * @return the configured n-gram size
     */
    public int size() {
        return this.size;
    }

    /**
     * Returns the n-gram length given at construction time.
     *
     * @return the configured n-gram length
     */
    public int getLength() {
        return this.length;
    }

    /**
     * Reads a stopword list, one entry per line. Lines are stored verbatim
     * (no trimming, no case folding).
     *
     * @param file the stopword file
     * @return the set of lines read
     * @throws IOException if the file cannot be read
     */
    public Set<String> readStopwords(File file) throws IOException {
        long start = System.currentTimeMillis();
        logger.info("reading " + file + "...");
        Set<String> stopwords = new HashSet<String>();
        LineNumberReader reader = openUtf8(file);
        try {
            String line;
            while ((line = reader.readLine()) != null) {
                stopwords.add(line);
            }
        } finally {
            // Close even when reading fails so the file handle is not leaked.
            reader.close();
        }
        logLoaded(stopwords.size(), " stopwords read in ", start);
        return stopwords;
    }

    /**
     * Reads the form/id mapping. Every line that splits into exactly two fields
     * is stored as {@code form -> id}, where the id is the first field and the
     * form is the second; all other lines are ignored.
     *
     * @param file the mapping file
     * @param set  stopword set; accepted for symmetry with the constructor but
     *             currently not consulted
     * @return the form to id map
     * @throws IOException           if the file cannot be read
     * @throws NumberFormatException if an id field is not a valid integer
     */
    public Map<String, Integer> readIndex(File file, Set<String> set) throws IOException {
        long start = System.currentTimeMillis();
        logger.info("reading " + file + "...");
        Map<String, Integer> index = new HashMap<String, Integer>();
        LineNumberReader reader = openUtf8(file);
        try {
            String line;
            while ((line = reader.readLine()) != null) {
                String[] fields = tabPattern.split(line);
                if (fields.length == 2) {
                    index.put(fields[1], Integer.valueOf(fields[0]));
                }
            }
        } finally {
            reader.close();
        }
        logLoaded(index.size(), " forms/id read in ", start);
        return index;
    }

    /**
     * Reads the form/value mapping and converts it to log2 weights.
     *
     * <p>The first line is treated as a header: when it has three fields, its
     * first field is taken as the numerator {@code total}. Each following line
     * with three fields {@code (a, b, form)} is kept only when {@code b > 2}
     * and is stored as {@code form -> log2(total / a)}.
     * NOTE(review): {@code total} and {@code a} look like a document count and
     * a per-form document frequency (the CLI calls this file "form idf
     * mapping") - confirm against the file producer.
     *
     * <p>The ratio is computed in floating point. Lines whose first field is
     * not positive are skipped with a warning, because the ratio is undefined
     * for them. If the header does not supply a total, the numerator stays 0
     * and every stored weight is negative infinity.
     *
     * @param file the mapping file
     * @param set  stopword set; accepted for symmetry with the constructor but
     *             currently not consulted
     * @return the form to weight map
     * @throws IOException           if the file cannot be read
     * @throws NumberFormatException if a numeric field is not a valid integer
     */
    public Map<String, Double> readValue(File file, Set<String> set) throws IOException {
        long start = System.currentTimeMillis();
        logger.info("reading " + file + "...");
        Map<String, Double> values = new HashMap<String, Double>();
        LineNumberReader reader = openUtf8(file);
        try {
            int total = 0;
            String header = reader.readLine();
            if (header != null) {
                String[] headerFields = tabPattern.split(header);
                if (headerFields.length == 3) {
                    total = Integer.parseInt(headerFields[0]);
                }
            }
            String line;
            while ((line = reader.readLine()) != null) {
                String[] fields = tabPattern.split(line);
                if (fields.length != 3) {
                    continue;
                }
                int divisor = Integer.parseInt(fields[0]);
                int count = Integer.parseInt(fields[1]);
                if (count <= 2) {
                    continue;
                }
                if (divisor <= 0) {
                    logger.warn("skipping line " + reader.getLineNumber() + " of " + file + ": non-positive divisor " + divisor);
                    continue;
                }
                // Divide as double: int / int would truncate the ratio before the logarithm.
                values.put(fields[2], Double.valueOf(log2((double) total / divisor)));
            }
        } finally {
            reader.close();
        }
        logLoaded(values.size(), " forms/idf read in ", start);
        return values;
    }

    /**
     * Computes the base-2 logarithm.
     *
     * @param d the argument
     * @return {@code Math.log(d) / LOG2}
     */
    public double log2(double d) {
        return Math.log(d) / LOG2;
    }

    /**
     * Command-line entry point: loads a model from the files named by
     * {@code --form-id}, {@code --form-idf} and {@code --stopwords}, honouring
     * the optional {@code --ngram-length} and {@code --ngram-size}.
     * The log4j configuration is read from the path in the {@code log-config}
     * system property, falling back to {@code configuration/log-config.txt}.
     *
     * @param strArr command-line arguments
     */
    public static void main(String[] strArr) {
        String logConfig = System.getProperty("log-config");
        if (logConfig == null) {
            logConfig = "configuration/log-config.txt";
        }
        PropertyConfigurator.configure(logConfig);
        Options options = new Options();
        try {
            addOptions(options);
            CommandLine commandLine = new PosixParser().parse(options, strArr);
            logger.debug(options);
            String formId = commandLine.getOptionValue("form-id");
            String formIdf = commandLine.getOptionValue("form-idf");
            String stopwords = commandLine.getOptionValue("stopwords");
            logger.debug(formId + StringTable.HORIZONTAL_TABULATION + formIdf + StringTable.HORIZONTAL_TABULATION + stopwords);
            int ngramSize = DEFAULT_N_GRAM_SIZE;
            if (commandLine.hasOption("ngram-size")) {
                ngramSize = Integer.parseInt(commandLine.getOptionValue("ngram-size"));
            }
            int ngramLength = DEFAULT_N_GRAM_LENGTH;
            if (commandLine.hasOption("ngram-length")) {
                ngramLength = Integer.parseInt(commandLine.getOptionValue("ngram-length"));
            }
            // Pass the parsed values on; previously they were parsed and then dropped.
            new NGramModel(formId, formIdf, stopwords, ngramLength, ngramSize);
        } catch (IOException e) {
            logger.error("unable to load the n-gram model", e);
        } catch (ParseException e2) {
            printUsage(options, e2.getMessage());
        } catch (NumberFormatException e3) {
            printUsage(options, "invalid integer argument: " + e3.getMessage());
        }
    }

    /** Opens a file for line-by-line reading, decoding it as UTF-8. */
    private static LineNumberReader openUtf8(File file) throws IOException {
        return new LineNumberReader(new InputStreamReader(new FileInputStream(file), "UTF-8"));
    }

    /** Logs how many entries were loaded and the elapsed time since {@code start}. */
    private static void logLoaded(int count, String label, long start) {
        logger.info(df.format(count) + label + df.format(System.currentTimeMillis() - start) + ParsedPageLink.START_SUFFIX_PATTERN + new Date() + ")");
    }

    /** Registers every command-line option understood by {@link #main(String[])}. */
    private static void addOptions(Options options) {
        OptionBuilder.withArgName("file");
        OptionBuilder.hasArg();
        OptionBuilder.withDescription("form id mapping");
        OptionBuilder.isRequired();
        OptionBuilder.withLongOpt("form-id");
        Option formIdOption = OptionBuilder.create();
        OptionBuilder.withArgName("file");
        OptionBuilder.hasArg();
        OptionBuilder.withDescription("form idf mapping");
        OptionBuilder.isRequired();
        OptionBuilder.withLongOpt("form-idf");
        Option formIdfOption = OptionBuilder.create();
        OptionBuilder.withArgName("file");
        OptionBuilder.hasArg();
        OptionBuilder.withDescription("stopwords");
        OptionBuilder.isRequired();
        OptionBuilder.withLongOpt("stopwords");
        Option stopwordsOption = OptionBuilder.create();
        OptionBuilder.withArgName(SchemaSymbols.ATTVAL_INT);
        OptionBuilder.hasArg();
        OptionBuilder.withDescription("n-gram length (default is 3)");
        OptionBuilder.withLongOpt("ngram-length");
        Option lengthOption = OptionBuilder.create("l");
        OptionBuilder.withArgName(SchemaSymbols.ATTVAL_INT);
        OptionBuilder.hasArg();
        OptionBuilder.withDescription("n-gram size (default is 1000000)");
        OptionBuilder.withLongOpt("ngram-size");
        Option sizeOption = OptionBuilder.create("s");
        options.addOption("h", "help", false, "print this message");
        options.addOption("v", "version", false, "output version information and exit");
        options.addOption(formIdOption);
        options.addOption(formIdfOption);
        options.addOption(lengthOption);
        options.addOption(sizeOption);
        options.addOption(stopwordsOption);
    }

    /** Reports a command-line problem on stderr and prints the usage summary. */
    private static void printUsage(Options options, String problem) {
        System.err.println("Parsing failed: " + problem + "\n");
        new HelpFormatter().printHelp(200, "java -cp dist/thewikimachine.jar org.fbk.cit.hlt.thewikimachine.classifier.NGramModel", "\n", options, "\n", true);
    }
}
