package org.fbk.cit.hlt.thewikimachine.csv;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.LineNumberReader;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.text.DecimalFormat;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;
import opennlp.tools.parser.AbstractBottomUpParser;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.commons.cli.PosixParser;
import org.apache.log4j.Logger;
import org.apache.log4j.PropertyConfigurator;
import org.fbk.cit.hlt.thewikimachine.util.FreqSet;
import org.fbk.cit.hlt.thewikimachine.util.StringTable;

/* loaded from: input_file:org/fbk/cit/hlt/thewikimachine/csv/NGramCaseVariantBuilder.class */
/**
 * Groups the forms found in an n-gram file by their lowercase spelling and
 * writes one {@code lowercase<TAB>original form} line per distinct variant.
 *
 * <p>Input lines are split on {@link StringTable#HORIZONTAL_TABULATION}; only
 * lines that yield exactly three fields are used, and the third field is taken
 * as the form. All other lines are ignored.
 *
 * <p>The whole conversion runs inside the constructor. Both files are handled
 * as UTF-8.
 */
public class NGramCaseVariantBuilder {
    static Logger logger = Logger.getLogger(NGramCaseVariantBuilder.class.getName());

    /** Field separator of the input file, compiled once. */
    private static final Pattern tabPattern = Pattern.compile(StringTable.HORIZONTAL_TABULATION);

    /** Formatter for the counters in log messages (not thread-safe; this class is single-threaded). */
    private static final DecimalFormat df = new DecimalFormat("###,###,###,###");

    /** Number of input lines between two progress messages. */
    public static final int DEFAULT_NOTIFICATION_POINT = 1000000;

    /** Number of fields a usable input line must have. */
    private static final int EXPECTED_FIELDS = 3;

    /** Zero-based index of the field that holds the form. */
    private static final int FORM_FIELD = 2;

    /**
     * Reads {@code file}, collects the case variants and writes them to {@code file2}.
     *
     * @param file  n-gram input file
     * @param file2 output file; it is created or overwritten
     * @throws IOException if reading or writing fails
     */
    public NGramCaseVariantBuilder(File file, File file2) throws IOException {
        writeSortedMap(read(file), file2);
    }

    /**
     * Reads the n-gram file and groups the forms by their lowercase spelling.
     *
     * @param file n-gram input file, read as UTF-8
     * @return a map from each lowercase form to the set of distinct original
     *         spellings seen for it; empty if no line had three fields
     * @throws IOException if the file cannot be opened or read
     */
    public Map<String, Set<String>> read(File file) throws IOException {
        logger.info("reading " + file + "...");
        Map<String, Set<String>> variantsByLowerCase = new HashMap<String, Set<String>>();
        long intervalStart = System.currentTimeMillis();
        // long, not int: n-gram files can exceed Integer.MAX_VALUE lines.
        long linesRead = 0;
        LineNumberReader reader = new LineNumberReader(new InputStreamReader(new FileInputStream(file), "UTF-8"));
        try {
            String line;
            while ((line = reader.readLine()) != null) {
                linesRead++;
                String[] fields = tabPattern.split(line);
                if (fields.length == EXPECTED_FIELDS) {
                    String form = fields[FORM_FIELD];
                    // NOTE(review): toLowerCase() uses the JVM default locale, so the keys can
                    // differ between machines (e.g. under a Turkish locale). Consider
                    // Locale.ROOT if the output must be machine-independent.
                    String lowerCase = form.toLowerCase();
                    Set<String> variants = variantsByLowerCase.get(lowerCase);
                    if (variants == null) {
                        variants = new HashSet<String>();
                        variantsByLowerCase.put(lowerCase, variants);
                    }
                    variants.add(form);
                }
                if (linesRead % DEFAULT_NOTIFICATION_POINT == 0) {
                    logProgress(linesRead, variantsByLowerCase.size(), intervalStart);
                    // The elapsed time in each message covers only the last interval.
                    intervalStart = System.currentTimeMillis();
                }
            }
        } finally {
            // Close the stream even when reading fails half-way through.
            reader.close();
        }
        logProgress(linesRead, variantsByLowerCase.size(), intervalStart);
        return variantsByLowerCase;
    }

    /**
     * Logs one tab-separated progress record: lines read, distinct lowercase
     * forms collected, milliseconds since {@code intervalStart}, current date.
     */
    private static void logProgress(long linesRead, int distinctLowerCase, long intervalStart) {
        long elapsed = System.currentTimeMillis() - intervalStart;
        logger.info(df.format(linesRead) + StringTable.HORIZONTAL_TABULATION + df.format(distinctLowerCase) + StringTable.HORIZONTAL_TABULATION + df.format(elapsed) + StringTable.HORIZONTAL_TABULATION + new Date());
    }

    /**
     * Writes one {@code key<TAB>variant} line per variant, encoded as UTF-8.
     *
     * <p>Despite its name this method does not sort anything: lines come out in
     * the iteration order of {@code map} and of each variant set.
     *
     * @param map  lowercase form to its original spellings
     * @param file output file; it is created or overwritten
     * @throws IOException if the file cannot be opened or a write fails
     */
    private static void writeSortedMap(Map<String, Set<String>> map, File file) throws IOException {
        logger.info("writing " + file + "...");
        long start = System.currentTimeMillis();
        long variantsWritten = 0;
        PrintWriter writer = new PrintWriter(new BufferedWriter(new OutputStreamWriter(new FileOutputStream(file), "UTF-8")));
        try {
            for (Map.Entry<String, Set<String>> entry : map.entrySet()) {
                String lowerCase = entry.getKey();
                for (String variant : entry.getValue()) {
                    writer.print(lowerCase);
                    writer.print('\t');
                    writer.println(variant);
                    variantsWritten++;
                }
            }
            // PrintWriter never throws on write; checkError() flushes and reports
            // any failure that happened so far, so it is not mistaken for success.
            if (writer.checkError()) {
                throw new IOException("error writing " + file);
            }
        } finally {
            writer.close();
        }
        logger.info(df.format(map.size()) + " of " + df.format(variantsWritten) + " distinct forms saved in " + df.format(System.currentTimeMillis() - start) + " ms " + new Date());
    }

    /**
     * Command-line entry point.
     *
     * <p>Options: {@code -i/--input <file>} (n-gram file) and
     * {@code -o/--output <file>} (result file) are both required;
     * {@code -h/--help} and {@code -v/--version} print the usage text.
     * The log4j configuration is read from the path in the system property
     * {@code log-config}, or from {@code configuration/log-config.txt} if unset.
     *
     * @param strArr command-line arguments
     * @throws Exception if the conversion itself fails
     */
    public static void main(String[] strArr) throws Exception {
        String logConfig = System.getProperty("log-config");
        if (logConfig == null) {
            logConfig = "configuration/log-config.txt";
        }
        PropertyConfigurator.configure(logConfig);
        Options options = new Options();
        try {
            // OptionBuilder is a static accumulator: every call configures the
            // option that the next create(...) returns and then resets.
            OptionBuilder.withArgName("file");
            OptionBuilder.hasArg();
            OptionBuilder.withDescription("ngram file");
            OptionBuilder.isRequired();
            OptionBuilder.withLongOpt("input");
            // Plain literal instead of an unrelated library constant with the same value.
            Option inputOption = OptionBuilder.create("i");

            OptionBuilder.withArgName("file");
            OptionBuilder.hasArg();
            OptionBuilder.withDescription("lowercase ngram file");
            // Required: without it new File(null) below fails with a NullPointerException.
            OptionBuilder.isRequired();
            OptionBuilder.withLongOpt("output");
            Option outputOption = OptionBuilder.create("o");

            options.addOption("h", "help", false, "print this message");
            options.addOption("v", "version", false, "output version information and exit");
            options.addOption(inputOption);
            options.addOption(outputOption);

            CommandLine commandLine = new PosixParser().parse(options, strArr);
            if (commandLine.hasOption("help") || commandLine.hasOption("version")) {
                // An empty message means: show the usage text without an error line.
                throw new ParseException("");
            }
            new NGramCaseVariantBuilder(new File(commandLine.getOptionValue("input")), new File(commandLine.getOptionValue("output")));
        } catch (ParseException e) {
            String message = e.getMessage();
            if (message != null && message.length() > 0) {
                System.out.println("Parsing failed: " + message + "\n");
            }
            new HelpFormatter().printHelp(400, "java -cp dist/thewikimachine.jar org.fbk.cit.hlt.thewikimachine.csv.NGramCaseVariantBuilder", "\n", options, "\n", true);
        }
    }
}
