package org.fbk.cit.hlt.thewikimachine.xmldump;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.util.Date;
import java.util.Locale;
import java.util.MissingResourceException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.commons.cli.PosixParser;
import org.apache.log4j.Logger;
import org.apache.log4j.PropertyConfigurator;
import org.apache.xerces.impl.xs.SchemaSymbols;
import org.fbk.cit.hlt.thewikimachine.ExtractorParameters;
import org.fbk.cit.hlt.thewikimachine.util.StringTable;

/* loaded from: input_file:org/fbk/cit/hlt/thewikimachine/xmldump/WikipediaPersonInfoExtractor.class */
/**
 * Extracts personal data (name, surname, birth date, death date) from the
 * content pages of a Wikipedia XML dump and writes one tab-separated line
 * per matching page to the person-info output file.
 *
 * <p>The regular expressions used to locate the personal-data template and
 * its fields are read from the locale-specific resources loaded by
 * {@code loadResources()}. Each expression is expected to expose the value
 * of interest as capturing group 1.
 */
public class WikipediaPersonInfoExtractor extends AbstractWikipediaExtractor implements WikipediaExtractor {
    static Logger logger = Logger.getLogger(WikipediaPersonInfoExtractor.class.getName());

    /** Destination of the extracted records; opened in {@link #start} and closed in {@link #endProcess}. */
    private PrintWriter personInfoWriter;

    /** Locates the personal-data template inside the page text (group 1 = template body). */
    protected Pattern templatePattern;

    /** Optional: extracts the birth date from the template body; {@code null} when not configured. */
    protected Pattern birthDatePattern;

    /** Optional: extracts the death date from the template body; {@code null} when not configured. */
    protected Pattern deathDatePattern;

    /** Extracts the first name from the template body. */
    protected Pattern namePattern;

    /** Extracts the surname from the template body. */
    protected Pattern surnamePattern;

    /**
     * Creates the extractor and compiles the patterns found in the resources.
     *
     * @param i      forwarded to the superclass; {@link #main} passes the number of threads
     * @param i2     forwarded to the superclass; {@link #main} passes the number of pages to process
     * @param locale locale forwarded to the superclass (presumably selects the resources to load)
     * @throws MissingResourceException if the resources cannot be loaded or a pattern key is missing
     */
    public WikipediaPersonInfoExtractor(int i, int i2, Locale locale) throws MissingResourceException {
        super(i, i2, locale);
        loadResources();
        // Template, name and surname are compiled whenever the key is present;
        // the date patterns are optional and are skipped when configured as "".
        this.templatePattern = compileResourcePattern("PERSONAL_DATA_TEMPLATE_PATTERN", true);
        this.namePattern = compileResourcePattern("NAME_PATTERN", true);
        this.surnamePattern = compileResourcePattern("SURNAME_PATTERN", true);
        this.birthDatePattern = compileResourcePattern("BIRTH_DATE_PATTERN", false);
        this.deathDatePattern = compileResourcePattern("DEATH_DATE_PATTERN", false);
        logger.info("templatePattern: " + this.templatePattern);
        logger.info("namePattern: " + this.namePattern);
        logger.info("surnamePattern: " + this.surnamePattern);
        logger.info("birthDatePattern: " + this.birthDatePattern);
        logger.info("deathDatePattern: " + this.deathDatePattern);
    }

    /**
     * Reads the regular expression stored under {@code key} and compiles it.
     *
     * @param key        resource key of the regular expression
     * @param allowEmpty whether an empty expression is compiled ({@code true}) or treated as absent
     * @return the compiled pattern, or {@code null} when the value is absent
     */
    private Pattern compileResourcePattern(String key, boolean allowEmpty) {
        String regex = this.resources.getString(key);
        if (regex == null || (!allowEmpty && regex.length() == 0)) {
            return null;
        }
        return Pattern.compile(regex);
    }

    /**
     * Returns the trimmed content of capturing group 1 of the first match of
     * {@code pattern} in {@code text}.
     *
     * @return the trimmed group, or {@code null} when the pattern is not
     *         configured, does not match, declares no capturing group, or
     *         group 1 did not take part in the match
     */
    private static String firstGroup(Pattern pattern, String text) {
        if (pattern == null) {
            return null;
        }
        Matcher matcher = pattern.matcher(text);
        if (!matcher.find() || matcher.groupCount() < 1) {
            return null;
        }
        String value = matcher.group(1);
        return value == null ? null : value.trim();
    }

    /**
     * Opens the person-info output file (UTF-8) and starts processing the dump.
     * If the output file cannot be opened the error is logged and the dump is
     * not processed at all, since no record could be written.
     *
     * @param extractorParameters supplies the output file name and the dump file name
     */
    @Override // org.fbk.cit.hlt.thewikimachine.xmldump.WikipediaExtractor
    public void start(ExtractorParameters extractorParameters) {
        String outputFileName = extractorParameters.getWikipediaPersonInfoFileName();
        try {
            this.personInfoWriter = new PrintWriter(new BufferedWriter(new OutputStreamWriter(new FileOutputStream(outputFileName), "UTF-8")));
        } catch (IOException e) {
            // Going on without a writer would only fail later with a NullPointerException.
            logger.error("cannot open person info file " + outputFileName, e);
            return;
        }
        startProcess(extractorParameters.getWikipediaXmlFileName());
    }

    /** Not used by this extractor. */
    @Override // org.fbk.cit.hlt.thewikimachine.xmldump.WikipediaExtractor
    public void filePage(String str, String str2, int i) {
    }

    /**
     * Looks for the personal-data template in a content page and, when both a
     * name and a surname are found in it, appends the line
     * {@code str2 TAB name TAB surname TAB birthDate TAB deathDate} to the
     * output file. Missing dates are written as empty columns.
     *
     * @param str  text searched for the personal-data template
     * @param str2 value written as the first column (presumably the page title)
     * @param i    unused here
     */
    @Override // org.fbk.cit.hlt.thewikimachine.xmldump.WikipediaExtractor
    public void contentPage(String str, String str2, int i) {
        logger.debug(str2);
        String template = firstGroup(this.templatePattern, str);
        if (template == null) {
            return;
        }
        String name = firstGroup(this.namePattern, template);
        String surname = firstGroup(this.surnamePattern, template);
        if (name == null || surname == null) {
            return;
        }
        String birthDate = firstGroup(this.birthDatePattern, template);
        String deathDate = firstGroup(this.deathDatePattern, template);
        // The writer is shared; the lock keeps the pieces of one record together
        // when this method is invoked concurrently.
        synchronized (this) {
            this.personInfoWriter.print(str2.trim());
            this.personInfoWriter.print(StringTable.HORIZONTAL_TABULATION);
            this.personInfoWriter.print(name);
            this.personInfoWriter.print(StringTable.HORIZONTAL_TABULATION);
            this.personInfoWriter.print(surname);
            this.personInfoWriter.print(StringTable.HORIZONTAL_TABULATION);
            if (birthDate != null) {
                this.personInfoWriter.print(birthDate);
            }
            this.personInfoWriter.print(StringTable.HORIZONTAL_TABULATION);
            if (deathDate != null) {
                this.personInfoWriter.print(deathDate);
            }
            this.personInfoWriter.print("\n");
        }
    }

    /** Not used by this extractor. */
    @Override // org.fbk.cit.hlt.thewikimachine.xmldump.WikipediaExtractor
    public void disambiguationPage(String str, String str2, int i) {
    }

    /** Not used by this extractor. */
    @Override // org.fbk.cit.hlt.thewikimachine.xmldump.WikipediaExtractor
    public void categoryPage(String str, String str2, int i) {
    }

    /** Not used by this extractor. */
    @Override // org.fbk.cit.hlt.thewikimachine.xmldump.WikipediaExtractor
    public void templatePage(String str, String str2, int i) {
    }

    /** Not used by this extractor. */
    @Override // org.fbk.cit.hlt.thewikimachine.xmldump.WikipediaExtractor
    public void redirectPage(String str, String str2, int i) {
    }

    /** Not used by this extractor. */
    @Override // org.fbk.cit.hlt.thewikimachine.xmldump.WikipediaExtractor
    public void portalPage(String str, String str2, int i) {
    }

    /** Not used by this extractor. */
    @Override // org.fbk.cit.hlt.thewikimachine.xmldump.WikipediaExtractor
    public void projectPage(String str, String str2, int i) {
    }

    /**
     * Runs the superclass end-of-processing step, then flushes and closes the
     * output file. Write failures, which {@link PrintWriter} never throws, are
     * reported through the log.
     */
    @Override // org.fbk.cit.hlt.thewikimachine.xmldump.AbstractWikipediaExtractor, org.fbk.cit.hlt.thewikimachine.xmldump.AbstractWikipediaXmlDumpParser
    public void endProcess() {
        super.endProcess();
        if (this.personInfoWriter == null) {
            return;
        }
        // checkError() flushes the stream and tells whether any write has failed so far.
        if (this.personInfoWriter.checkError()) {
            logger.error("an error occurred while writing the person info file");
        }
        this.personInfoWriter.close();
    }

    /**
     * Returns the integer value of a command line option.
     *
     * @param commandLine  parsed command line
     * @param longOpt      long name of the option
     * @param defaultValue value returned when the option is not set
     * @return the option value, or {@code defaultValue} when the option is absent
     * @throws ParseException if the option value is not a valid integer
     */
    private static int parseIntOption(CommandLine commandLine, String longOpt, int defaultValue) throws ParseException {
        if (!commandLine.hasOption(longOpt)) {
            return defaultValue;
        }
        String rawValue = commandLine.getOptionValue(longOpt);
        try {
            return Integer.parseInt(rawValue);
        } catch (NumberFormatException e) {
            throw new ParseException("Invalid integer value for --" + longOpt + ": " + rawValue);
        }
    }

    /**
     * Command line entry point: parses the options, creates the output
     * directory and runs the extraction. On invalid arguments the error and
     * the usage help are printed.
     *
     * @param strArr command line arguments
     * @throws IOException declared for backward compatibility
     */
    public static void main(String[] strArr) throws IOException {
        String logConfig = System.getProperty("log-config");
        if (logConfig == null) {
            logConfig = "configuration/log-config.txt";
        }
        PropertyConfigurator.configure(logConfig);
        Options options = new Options();
        try {
            OptionBuilder.withArgName("file");
            OptionBuilder.hasArg();
            OptionBuilder.withDescription("wikipedia xml dump file");
            OptionBuilder.isRequired();
            OptionBuilder.withLongOpt("wikipedia-dump");
            Option dumpOption = OptionBuilder.create("d");
            OptionBuilder.withArgName("dir");
            OptionBuilder.hasArg();
            OptionBuilder.withDescription("output directory in which to store output files");
            OptionBuilder.isRequired();
            OptionBuilder.withLongOpt("output-dir");
            Option outputDirOption = OptionBuilder.create("o");
            OptionBuilder.withArgName(SchemaSymbols.ATTVAL_INT);
            OptionBuilder.hasArg();
            OptionBuilder.withDescription("number of threads (default 1)");
            OptionBuilder.withLongOpt("num-threads");
            Option numThreadsOption = OptionBuilder.create("t");
            OptionBuilder.withArgName(SchemaSymbols.ATTVAL_INT);
            OptionBuilder.hasArg();
            OptionBuilder.withDescription("number of pages to process (default all)");
            OptionBuilder.withLongOpt("num-pages");
            Option numPagesOption = OptionBuilder.create("p");
            OptionBuilder.withArgName(SchemaSymbols.ATTVAL_INT);
            OptionBuilder.hasArg();
            OptionBuilder.withDescription("receive notification every n pages (default 10000)");
            OptionBuilder.withLongOpt("notification-point");
            Option notificationPointOption = OptionBuilder.create("n");
            options.addOption("h", "help", false, "print this message");
            options.addOption("v", "version", false, "output version information and exit");
            OptionBuilder.withDescription("if set, use the output folder as base dir");
            OptionBuilder.withLongOpt("base-dir");
            Option baseDirOption = OptionBuilder.create();
            options.addOption(dumpOption);
            options.addOption(outputDirOption);
            options.addOption(numThreadsOption);
            options.addOption(numPagesOption);
            options.addOption(notificationPointOption);
            options.addOption(baseDirOption);
            CommandLine commandLine = new PosixParser().parse(options, strArr);

            // A malformed number is reported like any other argument error (usage help below).
            int numThreads = parseIntOption(commandLine, "num-threads", 1);
            int numPages = parseIntOption(commandLine, "num-pages", Integer.MAX_VALUE);
            int notificationPoint = parseIntOption(commandLine, "notification-point", 10000);

            String dumpFileName = commandLine.getOptionValue("wikipedia-dump");
            String outputDirName = commandLine.getOptionValue("output-dir");
            ExtractorParameters extractorParameters;
            if (commandLine.hasOption("base-dir")) {
                extractorParameters = new ExtractorParameters(dumpFileName, outputDirName, true);
            } else {
                extractorParameters = new ExtractorParameters(dumpFileName, outputDirName);
            }

            File extractionDir = new File(extractorParameters.getExtractionOutputDirName());
            // mkdirs() also returns false when the directory already exists, hence the second check.
            if (!extractionDir.mkdirs() && !extractionDir.isDirectory()) {
                logger.error("cannot create output directory " + extractionDir);
            }
            logger.debug(extractorParameters);
            logger.debug("extracting person info (" + extractorParameters.getWikipediaPersonInfoFileName() + ")...");
            WikipediaPersonInfoExtractor extractor = new WikipediaPersonInfoExtractor(numThreads, numPages, extractorParameters.getLocale());
            extractor.setNotificationPoint(notificationPoint);
            extractor.start(extractorParameters);
            logger.info("extraction ended " + new Date());
        } catch (ParseException e) {
            System.out.println("Parsing failed: " + e.getMessage() + "\n");
            new HelpFormatter().printHelp(400, "java -cp dist/thewikimachine.jar org.fbk.cit.hlt.thewikimachine.xmldump.WikipediaPersonInfoExtractor", "\n", options, "\n", true);
        }
    }
}
