package org.fbk.cit.hlt.thewikimachine.xmldump;

import java.io.BufferedWriter;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.util.Date;
import java.util.Locale;
import java.util.regex.Matcher;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.commons.cli.PosixParser;
import org.apache.log4j.Logger;
import org.apache.log4j.PropertyConfigurator;
import org.apache.xerces.impl.xs.SchemaSymbols;
import org.fbk.cit.hlt.thewikimachine.ExtractorParameters;

/* loaded from: input_file:org/fbk/cit/hlt/thewikimachine/xmldump/WikipediaPageCategoryExtractor.class */
/**
 * Extracts page-to-category pairs from a Wikipedia XML dump.
 *
 * <p>For every content page reported by the parser, the page text is scanned with the
 * inherited {@code categoryPattern}; each match produces one tab-separated line
 * ({@code <page>\t<category>}) in the file named by
 * {@link ExtractorParameters#getWikipediaPageCategoryFileName()}. All other page kinds
 * are ignored.
 */
public class WikipediaPageCategoryExtractor extends AbstractWikipediaExtractor implements WikipediaExtractor {
    static Logger logger = Logger.getLogger(WikipediaPageCategoryExtractor.class.getName());

    /** Command line shown in the usage message printed by {@link #main(String[])}. */
    private static final String USAGE_COMMAND = "java -cp dist/thewikimachine.jar org.fbk.cit.hlt.thewikimachine.xmldump.WikipediaPageCategoryExtractor";

    /** Destination of the extracted pairs; stays {@code null} until {@link #start} opens it. */
    private PrintWriter pageCategoryWriter;

    /** When {@code true} capture group 2 of the category pattern is emitted, otherwise group 1. */
    private boolean delCatLabel;

    /**
     * Creates an extractor.
     *
     * @param i      forwarded to the superclass; {@link #main(String[])} passes the number of threads
     * @param i2     forwarded to the superclass; {@link #main(String[])} passes the number of pages to process
     * @param locale forwarded to the superclass
     * @param z      selects which capture group of the category pattern is written
     *               ({@code true}: group 2, {@code false}: group 1) — presumably group 2 is
     *               the category name without its namespace label; confirm against the pattern
     */
    public WikipediaPageCategoryExtractor(int i, int i2, Locale locale, boolean z) {
        super(i, i2, locale);
        this.delCatLabel = z;
    }

    /**
     * Creates an extractor that emits capture group 2 of the category pattern
     * (equivalent to passing {@code true} as the last argument of the four-argument constructor).
     */
    public WikipediaPageCategoryExtractor(int i, int i2, Locale locale) {
        this(i, i2, locale, true);
    }

    /**
     * Opens the UTF-8 output file and starts processing the XML dump.
     *
     * <p>If the output file cannot be opened the error is logged and the dump is NOT
     * processed: without a writer every extracted page would fail with a
     * {@link NullPointerException}.
     *
     * @param extractorParameters supplies the output file name and the dump file name
     */
    @Override // org.fbk.cit.hlt.thewikimachine.xmldump.WikipediaExtractor
    public void start(ExtractorParameters extractorParameters) {
        try {
            this.pageCategoryWriter = new PrintWriter(new BufferedWriter(new OutputStreamWriter(new FileOutputStream(extractorParameters.getWikipediaPageCategoryFileName()), "UTF-8")));
        } catch (IOException e) {
            // Pass the exception as the throwable so the stack trace is kept in the log.
            logger.error("cannot open " + extractorParameters.getWikipediaPageCategoryFileName() + " for writing, extraction aborted", e);
            return;
        }
        startProcess(extractorParameters.getWikipediaXmlFileName());
    }

    /** Ignored: only content pages are processed by this extractor. */
    @Override // org.fbk.cit.hlt.thewikimachine.xmldump.WikipediaExtractor
    public void filePage(String str, String str2, int i) {
    }

    /** Ignored: only content pages are processed by this extractor. */
    @Override // org.fbk.cit.hlt.thewikimachine.xmldump.WikipediaExtractor
    public void disambiguationPage(String str, String str2, int i) {
    }

    /** Ignored: only content pages are processed by this extractor. */
    @Override // org.fbk.cit.hlt.thewikimachine.xmldump.WikipediaExtractor
    public void categoryPage(String str, String str2, int i) {
    }

    /** Ignored: only content pages are processed by this extractor. */
    @Override // org.fbk.cit.hlt.thewikimachine.xmldump.WikipediaExtractor
    public void templatePage(String str, String str2, int i) {
    }

    /** Ignored: only content pages are processed by this extractor. */
    @Override // org.fbk.cit.hlt.thewikimachine.xmldump.WikipediaExtractor
    public void redirectPage(String str, String str2, int i) {
    }

    /** Ignored: only content pages are processed by this extractor. */
    @Override // org.fbk.cit.hlt.thewikimachine.xmldump.WikipediaExtractor
    public void portalPage(String str, String str2, int i) {
    }

    /** Ignored: only content pages are processed by this extractor. */
    @Override // org.fbk.cit.hlt.thewikimachine.xmldump.WikipediaExtractor
    public void projectPage(String str, String str2, int i) {
    }

    /**
     * Extracts the categories of a content page.
     *
     * @param str  text scanned for category links
     * @param str2 value written as the first column of every output line
     * @param i    unused here
     */
    @Override // org.fbk.cit.hlt.thewikimachine.xmldump.WikipediaExtractor
    public void contentPage(String str, String str2, int i) {
        pageCategory(str, str2);
    }

    /**
     * Writes one {@code str2 \t category} line for every category link found in {@code str}.
     *
     * <p>Spaces in the captured category are replaced with underscores, anything from the
     * first {@code '|'} on is dropped, and the result goes through {@code normalizePageName}.
     *
     * @param str  text matched against the inherited {@code categoryPattern}
     * @param str2 value written before the tab on each line
     */
    void pageCategory(String str, String str2) {
        Matcher matcher = this.categoryPattern.matcher(str);
        StringBuilder rows = new StringBuilder();
        int group = this.delCatLabel ? 2 : 1;
        while (matcher.find()) {
            String category = matcher.group(group);
            if (category == null) {
                // The selected group did not take part in this match; nothing to emit.
                continue;
            }
            category = category.replace(' ', '_');
            int pipe = category.indexOf('|');
            if (pipe != -1) {
                category = category.substring(0, pipe);
            }
            rows.append(str2);
            rows.append('\t');
            rows.append(normalizePageName(category));
            rows.append('\n');
        }
        if (rows.length() == 0) {
            return;
        }
        // All rows of a page are written in one call under the lock so that they are not
        // interleaved with rows written through another call on this instance.
        synchronized (this) {
            this.pageCategoryWriter.print(rows);
        }
    }

    /**
     * Finishes the superclass processing, then flushes and closes the output file.
     * The writer is closed even when the superclass call fails, and is skipped when it
     * was never opened.
     */
    @Override // org.fbk.cit.hlt.thewikimachine.xmldump.AbstractWikipediaExtractor, org.fbk.cit.hlt.thewikimachine.xmldump.AbstractWikipediaXmlDumpParser
    public void endProcess() {
        try {
            super.endProcess();
        } finally {
            if (this.pageCategoryWriter != null) {
                // PrintWriter never throws on write failures; checkError() flushes and
                // reports whether any write went wrong.
                if (this.pageCategoryWriter.checkError()) {
                    logger.error("an I/O error occurred while writing the page-category file, output may be incomplete");
                }
                this.pageCategoryWriter.close();
            }
        }
    }

    /**
     * Command-line entry point.
     *
     * <p>Required options: {@code -d/--wikipedia-dump} and {@code -o/--output-dir}.
     * Optional: {@code -t/--num-threads} (default 1), {@code -p/--num-pages} (default all),
     * {@code -n/--notification-point} (default 10000). The log4j configuration file is
     * taken from the {@code log-config} system property, falling back to
     * {@code configuration/log-config.txt}.
     *
     * <p>Invalid arguments (including non-numeric values for the numeric options) print
     * the reason and the usage message instead of running the extraction.
     *
     * @param strArr command-line arguments
     * @throws IOException declared for compatibility with existing callers
     */
    public static void main(String[] strArr) throws IOException {
        String logConfig = System.getProperty("log-config");
        if (logConfig == null) {
            logConfig = "configuration/log-config.txt";
        }
        PropertyConfigurator.configure(logConfig);
        Options options = new Options();
        try {
            // OptionBuilder accumulates settings in static state; each create() call
            // builds an Option from them and resets the builder.
            OptionBuilder.withArgName("file");
            OptionBuilder.hasArg();
            OptionBuilder.withDescription("wikipedia xml dump file");
            OptionBuilder.isRequired();
            OptionBuilder.withLongOpt("wikipedia-dump");
            Option dumpOption = OptionBuilder.create("d");
            OptionBuilder.withArgName("dir");
            OptionBuilder.hasArg();
            OptionBuilder.withDescription("output directory in which to store output files");
            OptionBuilder.isRequired();
            OptionBuilder.withLongOpt("output-dir");
            Option outputDirOption = OptionBuilder.create("o");
            OptionBuilder.withArgName(SchemaSymbols.ATTVAL_INT);
            OptionBuilder.hasArg();
            OptionBuilder.withDescription("number of threads (default 1)");
            OptionBuilder.withLongOpt("num-threads");
            Option numThreadsOption = OptionBuilder.create("t");
            OptionBuilder.withArgName(SchemaSymbols.ATTVAL_INT);
            OptionBuilder.hasArg();
            OptionBuilder.withDescription("number of pages to process (default all)");
            OptionBuilder.withLongOpt("num-pages");
            Option numPagesOption = OptionBuilder.create("p");
            OptionBuilder.withArgName(SchemaSymbols.ATTVAL_INT);
            OptionBuilder.hasArg();
            OptionBuilder.withDescription("receive notification every n pages (default 10000)");
            OptionBuilder.withLongOpt("notification-point");
            Option notificationOption = OptionBuilder.create("n");
            options.addOption("h", "help", false, "print this message");
            options.addOption("v", "version", false, "output version information and exit");
            options.addOption(dumpOption);
            options.addOption(outputDirOption);
            options.addOption(numThreadsOption);
            options.addOption(numPagesOption);
            options.addOption(notificationOption);
            CommandLine commandLine = new PosixParser().parse(options, strArr);
            int numThreads = 1;
            if (commandLine.hasOption("num-threads")) {
                numThreads = Integer.parseInt(commandLine.getOptionValue("num-threads"));
            }
            int numPages = Integer.MAX_VALUE;
            if (commandLine.hasOption("num-pages")) {
                numPages = Integer.parseInt(commandLine.getOptionValue("num-pages"));
            }
            int notificationPoint = 10000;
            if (commandLine.hasOption("notification-point")) {
                notificationPoint = Integer.parseInt(commandLine.getOptionValue("notification-point"));
            }
            ExtractorParameters extractorParameters = new ExtractorParameters(commandLine.getOptionValue("wikipedia-dump"), commandLine.getOptionValue("output-dir"));
            logger.debug(extractorParameters);
            WikipediaPageCategoryExtractor extractor = new WikipediaPageCategoryExtractor(numThreads, numPages, extractorParameters.getLocale());
            extractor.setNotificationPoint(notificationPoint);
            extractor.start(extractorParameters);
            logger.info("extraction ended " + new Date());
        } catch (ParseException e) {
            printUsage(options, e.getMessage());
        } catch (NumberFormatException e) {
            // A non-numeric value for -t, -p or -n is a usage error, not a crash.
            printUsage(options, e.getMessage());
        }
    }

    /** Prints why argument parsing failed, followed by the generated usage message. */
    private static void printUsage(Options options, String reason) {
        System.out.println("Parsing failed: " + reason + "\n");
        new HelpFormatter().printHelp(400, USAGE_COMMAND, "\n", options, "\n", true);
    }
}
