package org.fbk.cit.hlt.thewikimachine.experiments.mt;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FilenameFilter;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.util.Date;
import java.util.concurrent.atomic.AtomicInteger;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.commons.cli.PosixParser;
import org.apache.log4j.Logger;
import org.apache.log4j.PropertyConfigurator;
import org.apache.xerces.impl.xs.SchemaSymbols;
import org.fbk.cit.hlt.thewikimachine.ExtractorParameters;
import org.fbk.cit.hlt.thewikimachine.csv.CSVExtractor;
import org.fbk.cit.hlt.thewikimachine.index.PageFormSearcher;
import org.fbk.cit.hlt.thewikimachine.index.util.FreqSetSearcher;
import org.fbk.cit.hlt.thewikimachine.util.StringTable;
import org.fbk.cit.hlt.thewikimachine.xmldump.util.ParsedPageTitle;

/* loaded from: input_file:org/fbk/cit/hlt/thewikimachine/experiments/mt/MT.class */
/**
 * Derives a term rule for each line of a tab-separated annotated file.
 *
 * <p>For every input line with more than six columns, columns 1, 2 and 6 are used
 * (presumably the source term, the source page title and the target page title --
 * TODO confirm against the producer of the annotated file). The forms found for
 * column 6 in the page-form index are looked up and a single rule is chosen by
 * {@link #rules(String, String, String, FreqSetSearcher.Entry[])}.
 *
 * <p>Two files are written next to the annotated file:
 * <ul>
 *   <li>{@code <file>.term}: the input line followed by the chosen rule;</li>
 *   <li>{@code <file>.terms}: the input line followed by all candidate forms with
 *       their frequencies.</li>
 * </ul>
 */
public class MT extends CSVExtractor {
    static Logger logger = Logger.getLogger(MT.class.getName());

    /** Replacement for the spaced hyphen {@code " - "} inside candidate forms. */
    private static final String HYPHEN = "-";

    private String annotatedFileName;
    PrintWriter termWriter;
    PrintWriter termsWriter;
    AtomicInteger count;
    PageFormSearcher pageFormSearcher;

    /**
     * Opens the two output files {@code str + ".term"} and {@code str + ".terms"}.
     *
     * @param str path of the annotated file to process
     * @param i   value forwarded to the {@link CSVExtractor} constructor (the caller in
     *            {@link #main(String[])} passes the number of threads)
     * @throws IOException if one of the output files cannot be created
     */
    public MT(String str, int i) throws IOException {
        super(i);
        this.annotatedFileName = str;
        PrintWriter term = openUtf8Writer(str + ".term");
        PrintWriter terms;
        try {
            terms = openUtf8Writer(str + ".terms");
        } catch (IOException e) {
            // Do not leak the first stream when the second file cannot be opened.
            term.close();
            throw e;
        }
        this.termWriter = term;
        this.termsWriter = terms;
        this.count = new AtomicInteger();
    }

    /** Opens a buffered UTF-8 writer on the given path. */
    private static PrintWriter openUtf8Writer(String path) throws IOException {
        return new PrintWriter(new BufferedWriter(new OutputStreamWriter(new FileOutputStream(path), "UTF-8")));
    }

    /** Closes both output files. */
    @Override // org.fbk.cit.hlt.thewikimachine.csv.CSVExtractor
    public void end() {
        this.termsWriter.close();
        this.termWriter.close();
    }

    /**
     * Processes one tab-separated line: looks up the candidate forms for column 6,
     * selects a rule and, if there is one, writes the line to both output files.
     * Lines with six columns or fewer, and lines for which no rule can be derived,
     * are skipped (they still consume a line number).
     *
     * @param str the raw input line
     */
    @Override // org.fbk.cit.hlt.thewikimachine.csv.CSVExtractor
    public void processLine(String str) {
        String[] columns = str.split(StringTable.HORIZONTAL_TABULATION);
        int lineNumber = this.count.incrementAndGet();
        if (columns.length <= 6) {
            return;
        }
        FreqSetSearcher.Entry[] candidates = this.pageFormSearcher.search(columns[6]);
        String rule = rules(columns[1], columns[2], columns[6], candidates);
        if (rule == null) {
            return;
        }
        // One lock for all three outputs so that a record is never interleaved with
        // another one written by a concurrent call.
        synchronized (this) {
            System.out.println(lineNumber + StringTable.HORIZONTAL_TABULATION + columns[1] + StringTable.HORIZONTAL_TABULATION + columns[2] + StringTable.HORIZONTAL_TABULATION + columns[6] + StringTable.HORIZONTAL_TABULATION + rule);
            this.termWriter.println(str + StringTable.HORIZONTAL_TABULATION + rule);
            this.termsWriter.print(str);
            if (candidates.length > 0) {
                this.termsWriter.print(StringTable.HORIZONTAL_TABULATION);
                for (int i = 0; i < candidates.length; i++) {
                    if (i > 0) {
                        this.termsWriter.print(",");
                    }
                    this.termsWriter.print(formatEntry(candidates[i]));
                }
            }
            // Always terminate the record: a rule can exist without any candidate
            // (title match), and the next record must not be appended to this line.
            this.termsWriter.print("\n");
        }
    }

    /**
     * Chooses the rule for a source term, tried in this order:
     * <ol>
     *   <li>the source term equals (ignoring case) the form of the title {@code str2}:
     *       the form of the title {@code str3} with weight 1, with its first letter
     *       lower-cased when the source term does not start with an upper-case letter;</li>
     *   <li>the source term equals (ignoring case) one of the candidates: the source
     *       term itself with weight 1;</li>
     *   <li>otherwise the first candidate with its frequency.</li>
     * </ol>
     *
     * @param str      source term
     * @param str2     page title whose form is compared with the source term
     * @param str3     page title whose form is emitted on a title match
     * @param entryArr candidate forms returned by the page-form index
     * @return the rule as {@code form=weight}, or {@code null} if none applies
     */
    String rules(String str, String str2, String str3, FreqSetSearcher.Entry[] entryArr) {
        ParsedPageTitle sourceTitle = new ParsedPageTitle(str2);
        String form = new ParsedPageTitle(str3).getForm();
        if (str.equalsIgnoreCase(sourceTitle.getForm())) {
            // Case can only be adapted when both strings have a first character;
            // the emptiness checks prevent a StringIndexOutOfBoundsException.
            if (str.isEmpty() || form.isEmpty() || Character.isUpperCase(str.charAt(0))) {
                return form + "=1";
            }
            return Character.toLowerCase(form.charAt(0)) + form.substring(1) + "=1";
        }
        if (sourceInTarget(str, entryArr)) {
            return str + "=1";
        }
        if (entryArr.length > 0) {
            return formatEntry(entryArr[0]);
        }
        return null;
    }

    /** Renders a candidate as {@code value=freq}, collapsing {@code " - "} into {@code "-"}. */
    private static String formatEntry(FreqSetSearcher.Entry entry) {
        return entry.getValue().replace(" - ", HYPHEN) + "=" + entry.getFreq();
    }

    /**
     * Tells whether the source term equals, ignoring case, one of the candidates.
     *
     * @param str      source term
     * @param entryArr candidate forms
     * @return {@code true} if at least one candidate value matches
     */
    boolean sourceInTarget(String str, FreqSetSearcher.Entry[] entryArr) {
        for (FreqSetSearcher.Entry entry : entryArr) {
            if (str.equalsIgnoreCase(entry.getValue())) {
                return true;
            }
        }
        return false;
    }

    /**
     * Opens the page-form index named by the parameters and reads the annotated file.
     * An {@link IOException} is logged rather than propagated.
     *
     * @param extractorParameters supplies the name of the page-form index
     */
    @Override // org.fbk.cit.hlt.thewikimachine.csv.CSVExtractor
    public void start(ExtractorParameters extractorParameters) {
        try {
            this.pageFormSearcher = new PageFormSearcher(extractorParameters.getWikipediaPageFormIndexName());
            read(this.annotatedFileName);
        } catch (IOException e) {
            // Pass the exception as Throwable so that log4j prints the stack trace.
            logger.error("Failed to process " + this.annotatedFileName, e);
            // The output files were opened by the constructor; release them on
            // failure (closing a PrintWriter twice is harmless).
            end();
        }
    }

    /**
     * Command-line entry point. Processes the file given with {@code -a}, or every
     * {@code *.ml} file of the directory given with {@code -a}.
     *
     * @param strArr command-line arguments
     * @throws IOException if the output files of a single annotated file cannot be created
     */
    public static void main(String[] strArr) throws IOException {
        String logConfig = System.getProperty("log-config");
        if (logConfig == null) {
            logConfig = "configuration/log-config.txt";
        }
        PropertyConfigurator.configure(logConfig);
        Options options = new Options();
        try {
            addOptions(options);
            CommandLine line = new PosixParser().parse(options, strArr);
            logger.debug(line);
            if (line.hasOption("help") || line.hasOption("version")) {
                // Reuse the usage output of the parse-failure path.
                throw new ParseException("");
            }
            int numThreads = line.hasOption("num-threads") ? Integer.parseInt(line.getOptionValue("num-threads")) : 1;
            if (line.hasOption("num-pages")) {
                // The value is only validated; it is not used by this tool.
                Integer.parseInt(line.getOptionValue("num-pages"));
            }
            int notificationPoint = line.hasOption("notification-point") ? Integer.parseInt(line.getOptionValue("notification-point")) : 10000;
            ExtractorParameters extractorParameters = new ExtractorParameters(line.getOptionValue("wikipedia-dump"), line.getOptionValue("model-dir"));
            logger.debug(extractorParameters);
            String annotatedPath = line.getOptionValue("annotated-file");
            File file = new File(annotatedPath);
            if (file.isFile()) {
                extract(annotatedPath, numThreads, notificationPoint, extractorParameters);
            } else {
                File[] annotatedFiles = file.listFiles(new FilenameFilter() {
                    @Override // java.io.FilenameFilter
                    public boolean accept(File dir, String name) {
                        return name.endsWith(".ml");
                    }
                });
                if (annotatedFiles == null) {
                    // listFiles returns null when the path is not a directory or
                    // cannot be read.
                    logger.error("Cannot list annotated files in " + file);
                } else {
                    for (File annotatedFile : annotatedFiles) {
                        try {
                            extract(annotatedFile.getAbsolutePath(), numThreads, notificationPoint, extractorParameters);
                        } catch (Exception e) {
                            // A failure on one file must not stop the remaining ones.
                            logger.error("Error at " + annotatedFile, e);
                        }
                    }
                }
            }
        } catch (ParseException e) {
            logger.error("Parsing failed: " + e.getMessage() + "\n");
            new HelpFormatter().printHelp(200, "java -cp dist/thewikimachine.jar org.fbk.cit.hlt.thewikimachine.experiments.mt.MT", "\n", options, "\n", true);
        } finally {
            logger.info("extraction ended " + new Date());
        }
    }

    /** Creates an extractor for one annotated file and runs it. */
    private static void extract(String path, int numThreads, int notificationPoint, ExtractorParameters extractorParameters) throws IOException {
        MT mt = new MT(path, numThreads);
        mt.setNotificationPoint(notificationPoint);
        mt.start(extractorParameters);
    }

    /** Registers the command-line options of this tool on {@code options}. */
    private static void addOptions(Options options) {
        OptionBuilder.withArgName("file");
        OptionBuilder.hasArg();
        OptionBuilder.withDescription("wikipedia xml dump file");
        OptionBuilder.isRequired();
        OptionBuilder.withLongOpt("wikipedia-dump");
        Option wikipediaDump = OptionBuilder.create("d");
        OptionBuilder.withArgName("dir");
        OptionBuilder.hasArg();
        OptionBuilder.withDescription("directory from which to read the model files");
        OptionBuilder.isRequired();
        OptionBuilder.withLongOpt("model-dir");
        Option modelDir = OptionBuilder.create("m");
        OptionBuilder.withArgName("dir");
        OptionBuilder.hasArg();
        OptionBuilder.withDescription("file from which to read the annotated text");
        OptionBuilder.isRequired();
        OptionBuilder.withLongOpt("annotated-file");
        Option annotatedFile = OptionBuilder.create("a");
        OptionBuilder.withArgName(SchemaSymbols.ATTVAL_INT);
        OptionBuilder.hasArg();
        OptionBuilder.withDescription("number of threads (default 1)");
        OptionBuilder.withLongOpt("num-threads");
        Option numThreads = OptionBuilder.create("t");
        OptionBuilder.withArgName(SchemaSymbols.ATTVAL_INT);
        OptionBuilder.hasArg();
        OptionBuilder.withDescription("number of pages to process (default all)");
        OptionBuilder.withLongOpt("num-pages");
        Option numPages = OptionBuilder.create("p");
        OptionBuilder.withArgName(SchemaSymbols.ATTVAL_INT);
        OptionBuilder.hasArg();
        OptionBuilder.withDescription("receive notification every n pages (default 10000)");
        OptionBuilder.withLongOpt("notification-point");
        Option notificationPoint = OptionBuilder.create("n");
        options.addOption("h", "help", false, "print this message");
        options.addOption("v", "version", false, "output version information and exit");
        options.addOption(wikipediaDump);
        options.addOption(annotatedFile);
        options.addOption(modelDir);
        options.addOption(numThreads);
        options.addOption(numPages);
        options.addOption(notificationPoint);
    }
}
