package org.fbk.cit.hlt.thewikimachine.csv;

import java.io.BufferedWriter;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.text.DecimalFormat;
import java.util.Date;
import java.util.List;
import java.util.Map;
import java.util.SortedMap;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.regex.Pattern;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.commons.cli.PosixParser;
import org.apache.log4j.PropertyConfigurator;
import org.apache.xerces.impl.xs.SchemaSymbols;
import org.fbk.cit.hlt.thewikimachine.ExtractorParameters;
import org.fbk.cit.hlt.thewikimachine.util.StringTable;
import org.fbk.cit.hlt.thewikimachine.util.SynchronizedCounter;

/* loaded from: input_file:org/fbk/cit/hlt/thewikimachine/csv/UnigramExtractor.class */
public class UnigramExtractor extends CSVExtractor {
    private SynchronizedCounter synchronizedCounter;
    private PrintWriter unigramWriter;
    private static Pattern tabPattern = Pattern.compile(StringTable.HORIZONTAL_TABULATION);
    private static Pattern spacePattern = Pattern.compile(" ");
    private static DecimalFormat df = new DecimalFormat("###,###,###,###");

    public UnigramExtractor(int i, int i2) {
        super(i, i2);
        this.synchronizedCounter = new SynchronizedCounter();
    }

    public UnigramExtractor(int i) {
        super(i);
        this.synchronizedCounter = new SynchronizedCounter();
    }

    @Override // org.fbk.cit.hlt.thewikimachine.csv.CSVExtractor
    public void processLine(String str) {
        String[] split = spacePattern.split(str.toLowerCase());
        for (int i = 1; i < split.length; i++) {
            if (isWord(split[i])) {
                this.synchronizedCounter.add(split[i]);
            }
        }
    }

    private boolean isWord(String str) {
        for (int i = 0; i < str.length(); i++) {
            if (!Character.isLetter(str.charAt(i))) {
                return false;
            }
        }
        return true;
    }

    @Override // org.fbk.cit.hlt.thewikimachine.csv.CSVExtractor
    public void start(ExtractorParameters extractorParameters) {
        try {
            logger.info("writing unigrams in " + extractorParameters.getWikipediaUnigramFileName() + "...");
            this.unigramWriter = new PrintWriter(new BufferedWriter(new OutputStreamWriter(new FileOutputStream(extractorParameters.getWikipediaUnigramFileName()), "UTF-8")));
            read(extractorParameters.getWikipediaTextFileName());
        } catch (IOException e) {
            logger.error(e);
        }
    }

    @Override // org.fbk.cit.hlt.thewikimachine.csv.CSVExtractor
    public void end() {
        logger.info("sorting...");
        long currentTimeMillis = System.currentTimeMillis();
        SortedMap<AtomicInteger, List<String>> sortedMap = this.synchronizedCounter.getSortedMap();
        try {
            writeSortedMap(sortedMap);
            this.unigramWriter.close();
        } catch (IOException e) {
            logger.error(e);
        }
        logger.info(sortedMap.size() + " lines sorted in " + df.format(System.currentTimeMillis() - currentTimeMillis) + " ms " + new Date());
    }

    private void writeSortedMap(SortedMap<AtomicInteger, List<String>> sortedMap) throws IOException {
        logger.info("writing...");
        long currentTimeMillis = System.currentTimeMillis();
        int i = 0;
        for (AtomicInteger atomicInteger : sortedMap.keySet()) {
            List<String> list = sortedMap.get(atomicInteger);
            StringBuilder sb = new StringBuilder();
            for (int i2 = 0; i2 < list.size(); i2++) {
                String str = list.get(i2);
                sb.append(atomicInteger);
                sb.append('\t');
                sb.append(str);
                sb.append('\n');
            }
            this.unigramWriter.print(sb.toString());
            i++;
        }
        logger.info(df.format(sortedMap.size()) + " lines wrote in " + df.format(System.currentTimeMillis() - currentTimeMillis) + StringTable.HORIZONTAL_TABULATION + new Date());
    }

    public static void main(String[] strArr) throws IOException {
        String property = System.getProperty("log-config");
        if (property == null) {
            property = "configuration/log-config.txt";
        }
        PropertyConfigurator.configure(property);
        Options options = new Options();
        try {
            try {
                OptionBuilder.withArgName("file");
                OptionBuilder.hasArg();
                OptionBuilder.withDescription("wikipedia xml dump file");
                OptionBuilder.isRequired();
                OptionBuilder.withLongOpt("wikipedia-dump");
                Option create = OptionBuilder.create("d");
                OptionBuilder.withArgName("dir");
                OptionBuilder.hasArg();
                OptionBuilder.withDescription("output directory in which to store output files");
                OptionBuilder.isRequired();
                OptionBuilder.withLongOpt("output-dir");
                Option create2 = OptionBuilder.create("o");
                OptionBuilder.withArgName(SchemaSymbols.ATTVAL_INT);
                OptionBuilder.hasArg();
                OptionBuilder.withDescription("number of threads (default 1)");
                OptionBuilder.withLongOpt("num-threads");
                Option create3 = OptionBuilder.create("t");
                OptionBuilder.withArgName(SchemaSymbols.ATTVAL_INT);
                OptionBuilder.hasArg();
                OptionBuilder.withDescription("number of pages to process (default all)");
                OptionBuilder.withLongOpt("num-pages");
                Option create4 = OptionBuilder.create("p");
                OptionBuilder.withArgName(SchemaSymbols.ATTVAL_INT);
                OptionBuilder.hasArg();
                OptionBuilder.withDescription("receive notification every n pages (default 10000)");
                OptionBuilder.withLongOpt("notification-point");
                Option create5 = OptionBuilder.create("b");
                options.addOption("h", "help", false, "print this message");
                options.addOption("v", "version", false, "output version information and exit");
                options.addOption(create);
                options.addOption(create2);
                options.addOption(create3);
                options.addOption(create4);
                options.addOption(create5);
                CommandLine parse = new PosixParser().parse(options, strArr);
                logger.debug(parse);
                if (parse.hasOption("help") || parse.hasOption("version")) {
                    throw new ParseException("");
                }
                int i = 1;
                if (parse.hasOption("num-threads")) {
                    i = Integer.parseInt(parse.getOptionValue("num-threads"));
                }
                if (parse.hasOption("num-pages")) {
                    Integer.parseInt(parse.getOptionValue("num-pages"));
                }
                int i2 = 10000;
                if (parse.hasOption("notification-point")) {
                    i2 = Integer.parseInt(parse.getOptionValue("notification-point"));
                }
                ExtractorParameters extractorParameters = new ExtractorParameters(parse.getOptionValue("wikipedia-dump"), parse.getOptionValue("output-dir"));
                logger.debug(extractorParameters);
                UnigramExtractor unigramExtractor = new UnigramExtractor(i);
                unigramExtractor.setNotificationPoint(i2);
                unigramExtractor.start(extractorParameters);
                logger.info("extraction ended " + new Date());
            } catch (ParseException e) {
                logger.error("Parsing failed: " + e.getMessage() + "\n");
                new HelpFormatter().printHelp(200, "java -cp dist/thewikimachine.jar org.fbk.cit.hlt.thewikimachine.csv.UnigramExtractor", "\n", options, "\n", true);
                logger.info("extraction ended " + new Date());
            }
        } catch (Throwable th) {
            logger.info("extraction ended " + new Date());
            throw th;
        }
    }
}
