package org.fbk.cit.hlt.thewikimachine.xmldump;

import de.tudarmstadt.ukp.wikipedia.parser.ParsedPage;
import de.tudarmstadt.ukp.wikipedia.parser.Section;
import java.io.BufferedWriter;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Locale;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.commons.cli.PosixParser;
import org.apache.log4j.Logger;
import org.apache.log4j.PropertyConfigurator;
import org.apache.xerces.impl.xs.SchemaSymbols;
import org.fbk.cit.hlt.thewikimachine.ExtractorParameters;
import org.fbk.cit.hlt.thewikimachine.index.FirstNameIndexer;
import org.fbk.cit.hlt.thewikimachine.util.StringTable;
import org.fbk.cit.hlt.thewikimachine.xmldump.util.ParsedPageLink;
import org.fbk.cit.hlt.thewikimachine.xmldump.util.WikiMarkupParser;
import org.fbk.cit.hlt.thewikimachine.xmldump.util.WikiTemplate;
import org.fbk.cit.hlt.thewikimachine.xmldump.util.WikiTemplateParser;

/* loaded from: input_file:org/fbk/cit/hlt/thewikimachine/xmldump/WikipediaPreprocessing.class */
/**
 * Extractor that walks a Wikipedia XML dump and writes a set of tab-separated
 * text files (title/id pairs, redirects, categories, templates, section
 * titles, person information, ...) plus a small analysis report.
 *
 * <p>The page callbacks ({@code contentPage}, {@code categoryPage}, ...) are
 * invoked by the superclass while it parses the dump. Every write to an output
 * file is done while holding this object's monitor.
 *
 * <p>Typical usage (see {@link #main(String[])}): construct, call
 * {@link #start(ExtractorParameters)}; the superclass is expected to call
 * {@link #endProcess()} once parsing is finished.
 */
public class WikipediaPreprocessing extends AbstractWikipediaExtractor implements WikipediaExtractor {
    static Logger logger = Logger.getLogger(WikipediaPreprocessing.class.getName());

    /** Character encoding used for every output file. */
    private static final String OUTPUT_ENCODING = "UTF-8";

    private PrintWriter disambiguationWriter;
    private PrintWriter titleIdWriter;
    /** Matches two-letter-prefixed wiki links such as {@code [[xx:Title]]}; compiled in {@link #start}. */
    protected Pattern crossLanguagePattern;
    private PrintWriter personInfoWriter;
    /** Optional pattern whose group 1 is the personal-data template body. */
    protected Pattern templatePattern;
    /** Optional pattern whose group 1 is the birth date inside the personal-data template. */
    protected Pattern birthDatePattern;
    /** Optional pattern whose group 1 is the death date inside the personal-data template. */
    protected Pattern deathDatePattern;
    /** Optional pattern whose group 1 is the first name inside the personal-data template. */
    protected Pattern namePattern;
    /** Optional pattern whose group 1 is the surname inside the personal-data template. */
    protected Pattern surnamePattern;
    private PrintWriter redirectWriter;
    private PrintWriter fileWriter;
    private PrintWriter pageCategoryWriter;
    private PrintWriter superCategoryWriter;
    private PrintWriter categoryWriter;
    private PrintWriter analysisWriter;
    private PrintWriter contentPageTitleWriter;
    private PrintWriter templateNameWriter;
    private PrintWriter templateMapWriter;
    private PrintWriter templateMapWriterWithRepetitions;
    private PrintWriter templateMapWriterProp;
    private PrintWriter sectionTitleWriter;
    private PrintWriter templateNavigationWriter;
    /** Optional case-insensitive pattern; section titles it matches are not written. */
    private Pattern sectionTitleSkipPattern;
    /** When true, group 2 of the category pattern is used and the prefix up to ':' is cut from category titles. */
    private boolean delCatLabel;

    /**
     * Same as the four-argument constructor with {@code null} as last argument.
     *
     * @param i      forwarded to the superclass ({@code main} passes the number of threads)
     * @param i2     forwarded to the superclass ({@code main} passes the number of pages to process)
     * @param locale forwarded to the superclass
     */
    public WikipediaPreprocessing(int i, int i2, Locale locale) {
        this(i, i2, locale, null);
    }

    /**
     * Creates the extractor and enables category-label removal.
     *
     * @param i      forwarded to the superclass ({@code main} passes the number of threads)
     * @param i2     forwarded to the superclass ({@code main} passes the number of pages to process)
     * @param locale forwarded to the superclass
     * @param str    forwarded unchanged to the superclass; may be {@code null}
     */
    public WikipediaPreprocessing(int i, int i2, Locale locale, String str) {
        super(i, i2, locale, str);
        this.delCatLabel = true;
    }

    /**
     * Compiles the configurable patterns, opens all output files and starts
     * processing the dump.
     *
     * @param extractorParameters supplies the output file names and the dump file name
     * @throws IllegalStateException if any output file cannot be opened; writers
     *         opened so far are closed before the exception is thrown
     */
    @Override // org.fbk.cit.hlt.thewikimachine.xmldump.WikipediaExtractor
    public void start(ExtractorParameters extractorParameters) {
        this.templatePattern = configuredPattern("PERSONAL_DATA_TEMPLATE_PATTERN", 0, false, this.templatePattern);
        this.sectionTitleSkipPattern = configuredPattern("SECTION_TITLE_SKIP_PATTERN", Pattern.CASE_INSENSITIVE, false, this.sectionTitleSkipPattern);
        this.namePattern = configuredPattern("NAME_PATTERN", 0, false, this.namePattern);
        this.surnamePattern = configuredPattern("SURNAME_PATTERN", 0, false, this.surnamePattern);
        // The date patterns are additionally skipped when configured as an empty string.
        this.birthDatePattern = configuredPattern("BIRTH_DATE_PATTERN", 0, true, this.birthDatePattern);
        this.deathDatePattern = configuredPattern("DEATH_DATE_PATTERN", 0, true, this.deathDatePattern);
        this.crossLanguagePattern = Pattern.compile("\\[\\[(\\w\\w:[^\\]]+)\\]\\]");

        logger.info("templatePattern: " + this.templatePattern);
        logger.info("namePattern: " + this.namePattern);
        logger.info("surnamePattern: " + this.surnamePattern);
        logger.info("birthDatePattern: " + this.birthDatePattern);
        logger.info("deathDatePattern: " + this.deathDatePattern);
        logger.info("crossLanguagePattern: " + this.crossLanguagePattern);

        try {
            this.analysisWriter = utf8Writer(new FileOutputStream(extractorParameters.getWikipediaAnalysisFileName()));
            this.disambiguationWriter = utf8Writer(new FileOutputStream(extractorParameters.getWikipediaDisambiguationFileName()));
            this.titleIdWriter = utf8Writer(new FileOutputStream(extractorParameters.getWikipediaTitleIdFileName()));
            this.contentPageTitleWriter = utf8Writer(new FileOutputStream(extractorParameters.getWikipediaContentPageFileName()));
            this.personInfoWriter = utf8Writer(new FileOutputStream(extractorParameters.getWikipediaPersonInfoFileName()));
            this.redirectWriter = utf8Writer(new FileOutputStream(extractorParameters.getWikipediaRedirFileName()));
            this.pageCategoryWriter = utf8Writer(new FileOutputStream(extractorParameters.getWikipediaPageCategoryFileName()));
            this.superCategoryWriter = utf8Writer(new FileOutputStream(extractorParameters.getWikipediaCategorySuperCategoryFileName()));
            this.categoryWriter = utf8Writer(new FileOutputStream(extractorParameters.getWikipediaCategoryFileName()));
            this.sectionTitleWriter = utf8Writer(new FileOutputStream(extractorParameters.getWikipediaSectionTitleFilePrefixName()));
            this.fileWriter = utf8Writer(new FileOutputStream(extractorParameters.getWikipediaFileName()));
            this.templateNameWriter = utf8Writer(new FileOutputStream(extractorParameters.getWikipediaTemplateFileNames().get(FirstNameIndexer.NAME_FIELD_NAME)));
            this.templateMapWriter = utf8Writer(new FileOutputStream(extractorParameters.getWikipediaTemplateFileNames().get("map")));
            this.templateMapWriterWithRepetitions = utf8Writer(new FileOutputStream(extractorParameters.getWikipediaTemplateFileNames().get("map-rep")));
            this.templateMapWriterProp = utf8Writer(new FileOutputStream(extractorParameters.getWikipediaTemplateFileNames().get("map-prop")));
            this.templateNavigationWriter = utf8Writer(new FileOutputStream(extractorParameters.getWikipediaTemplateFileNames().get("navigation")));
        } catch (IOException e) {
            // Processing without output files would only fail later with a
            // NullPointerException, so release what was opened and fail now.
            logger.error("Unable to open the output files", e);
            closeWriters();
            throw new IllegalStateException("Unable to open the output files", e);
        } catch (RuntimeException e) {
            // e.g. a missing template file name; do not leak the writers already opened.
            closeWriters();
            throw e;
        }
        startProcess(extractorParameters.getWikipediaXmlFileName());
    }

    /**
     * Compiles the regular expression stored in the resources under {@code key}.
     *
     * @param key         resource key of the regular expression
     * @param flags       flags passed to {@link Pattern#compile(String, int)}
     * @param ignoreEmpty when true an empty resource value is treated as not configured
     * @param current     value returned when the resource is not configured
     * @return the compiled pattern, or {@code current} when nothing is configured
     */
    private Pattern configuredPattern(String key, int flags, boolean ignoreEmpty, Pattern current) {
        String regex = this.resources.getString(key);
        if (regex == null || (ignoreEmpty && regex.length() == 0)) {
            return current;
        }
        return Pattern.compile(regex, flags);
    }

    /** Wraps {@code stream} in a buffered UTF-8 {@link PrintWriter}. */
    private static PrintWriter utf8Writer(FileOutputStream stream) throws IOException {
        return new PrintWriter(new BufferedWriter(new OutputStreamWriter(stream, OUTPUT_ENCODING)));
    }

    /** Closes {@code writer} if it has been opened. */
    private static void closeIfOpen(PrintWriter writer) {
        if (writer != null) {
            writer.close();
        }
    }

    /** Closes every output writer that has been opened; closing flushes the buffered data. */
    private void closeWriters() {
        closeIfOpen(this.analysisWriter);
        closeIfOpen(this.titleIdWriter);
        closeIfOpen(this.disambiguationWriter);
        closeIfOpen(this.personInfoWriter);
        closeIfOpen(this.redirectWriter);
        closeIfOpen(this.fileWriter);
        closeIfOpen(this.pageCategoryWriter);
        closeIfOpen(this.superCategoryWriter);
        closeIfOpen(this.categoryWriter);
        closeIfOpen(this.contentPageTitleWriter);
        closeIfOpen(this.sectionTitleWriter);
        closeIfOpen(this.templateNameWriter);
        closeIfOpen(this.templateMapWriter);
        closeIfOpen(this.templateMapWriterWithRepetitions);
        closeIfOpen(this.templateMapWriterProp);
        closeIfOpen(this.templateNavigationWriter);
    }

    /**
     * Records a file page: its title/id pair and its title in the file list.
     *
     * @param str  page text (not used here)
     * @param str2 page title
     * @param i    page id
     */
    @Override // org.fbk.cit.hlt.thewikimachine.xmldump.WikipediaExtractor
    public void filePage(String str, String str2, int i) {
        writeTitlePage(str2, i);
        synchronized (this) {
            this.fileWriter.println(str2);
        }
    }

    /**
     * Records a category page: its title/id pair, one row per category link
     * found in its text (category TAB linked category) and one row with the
     * category and the number of such links.
     *
     * @param str  page text, scanned with the inherited category pattern
     * @param str2 page title; the part up to the first ':' is dropped when label removal is on
     * @param i    page id
     */
    @Override // org.fbk.cit.hlt.thewikimachine.xmldump.WikipediaExtractor
    public void categoryPage(String str, String str2, int i) {
        writeTitlePage(str2, i);
        String categoryName = str2;
        if (this.delCatLabel) {
            // indexOf returns -1 when there is no ':', which keeps the whole title.
            categoryName = str2.substring(str2.indexOf(':') + 1);
        }
        StringBuilder rows = new StringBuilder();
        int linkCount = appendCategoryRows(str, categoryName, rows);
        synchronized (this) {
            this.superCategoryWriter.print(rows);
            this.categoryWriter.println(categoryName + StringTable.HORIZONTAL_TABULATION + linkCount);
        }
    }

    /**
     * Appends one {@code firstColumn TAB category NEWLINE} row to {@code rows}
     * for every match of the inherited category pattern in {@code text}.
     * Spaces in the matched category are replaced by underscores, anything
     * from the first '|' on is removed, and the result is normalized.
     *
     * @return the number of rows appended
     */
    private int appendCategoryRows(String text, String firstColumn, StringBuilder rows) {
        int group = this.delCatLabel ? 2 : 1;
        Matcher matcher = this.categoryPattern.matcher(text);
        int count = 0;
        while (matcher.find()) {
            String category = matcher.group(group).replace(' ', '_');
            int pipe = category.indexOf('|');
            if (pipe != -1) {
                category = category.substring(0, pipe);
            }
            rows.append(firstColumn).append('\t').append(normalizePageName(category)).append('\n');
            count++;
        }
        return count;
    }

    /**
     * Records a template page: its title/id pair and its title in the template
     * list. For every category link in the text that matches the inherited
     * navigation-template pattern, group 1 of the inherited simple-template
     * pattern applied to the title is written to the navigation file (so the
     * same name is written once per matching category).
     *
     * @param str  page text
     * @param str2 page title
     * @param i    page id
     */
    @Override // org.fbk.cit.hlt.thewikimachine.xmldump.WikipediaExtractor
    public void templatePage(String str, String str2, int i) {
        writeTitlePage(str2, i);
        synchronized (this) {
            this.templateNameWriter.println(str2);
        }
        Matcher categoryMatcher = this.categoryPattern.matcher(str);
        while (categoryMatcher.find()) {
            String category = categoryMatcher.group(2).replace(' ', '_');
            boolean navigationCategory = this.navigationTemplatePattern != null
                    && this.navigationTemplatePattern.matcher(category).find()
                    && this.simpleTemplatePattern != null;
            if (navigationCategory) {
                Matcher titleMatcher = this.simpleTemplatePattern.matcher(str2);
                if (titleMatcher.find()) {
                    String templateName = titleMatcher.group(1);
                    synchronized (this) {
                        this.templateNavigationWriter.append(templateName);
                        this.templateNavigationWriter.append('\n');
                    }
                }
            }
        }
    }

    /**
     * Records a redirect page: its title/id pair and a row
     * {@code title TAB normalizePageName(str)}.
     *
     * @param str  value normalized and written as second column (presumably the redirect target)
     * @param str2 page title
     * @param i    page id
     */
    @Override // org.fbk.cit.hlt.thewikimachine.xmldump.WikipediaExtractor
    public void redirectPage(String str, String str2, int i) {
        writeTitlePage(str2, i);
        String row = str2 + '\t' + normalizePageName(str);
        synchronized (this) {
            this.redirectWriter.println(row);
        }
    }

    /**
     * Records a content page: title/id pairs, person information, section
     * titles, categories and templates.
     *
     * @param str  page text
     * @param str2 page title
     * @param i    page id
     */
    @Override // org.fbk.cit.hlt.thewikimachine.xmldump.WikipediaExtractor
    public void contentPage(String str, String str2, int i) {
        writeTitleContentPage(str2, i);
        writeTitlePage(str2, i);
        personInfo(str, str2);
        textAndSections(str, str2, i);
        pageCategory(str, str2);
        pageTemplate(str, str2, i);
    }

    /**
     * Writes the templates used in a page to three files: one row per distinct
     * template (with its order of first appearance and the page id), one row
     * per template occurrence (with the counts reported by the template), and
     * one row per distinct template/part-name pair. Templates without a name or
     * whose name starts with '#' are skipped.
     *
     * @param str  page text, parsed with {@link WikiTemplateParser}
     * @param str2 page title
     * @param i    page id
     */
    void pageTemplate(String str, String str2, int i) {
        Set<String> seenTemplates = new HashSet<>();
        Set<String> seenProperties = new HashSet<>();
        StringBuilder distinctRows = new StringBuilder();
        StringBuilder occurrenceRows = new StringBuilder();
        StringBuilder propertyRows = new StringBuilder();
        int position = 0;
        for (WikiTemplate template : WikiTemplateParser.parse(str, false)) {
            String firstPart = template.getFirstPart();
            if (firstPart == null || firstPart.length() == 0 || firstPart.startsWith("#")) {
                continue;
            }
            String templateName = normalizePageName(firstPart.trim()).replace(' ', '_');
            // Set.add returns false for a name already seen on this page.
            if (seenTemplates.add(templateName)) {
                distinctRows.append(str2 + StringTable.HORIZONTAL_TABULATION + templateName + StringTable.HORIZONTAL_TABULATION + position + StringTable.HORIZONTAL_TABULATION + i).append("\n");
                position++;
            }
            occurrenceRows.append(str2 + StringTable.HORIZONTAL_TABULATION + templateName + StringTable.HORIZONTAL_TABULATION + template.getPartsCount() + StringTable.HORIZONTAL_TABULATION + template.getNlCount() + StringTable.HORIZONTAL_TABULATION + template.getKeyValueParts()).append("\n");
            Set<String> partNames = template.getHashMapOfParts().keySet();
            for (String partName : partNames) {
                if (seenProperties.add(templateName + ";" + partName)) {
                    propertyRows.append(str2 + StringTable.HORIZONTAL_TABULATION + templateName + StringTable.HORIZONTAL_TABULATION + partName).append("\n");
                }
            }
        }
        synchronized (this) {
            this.templateMapWriter.print(distinctRows);
            this.templateMapWriterWithRepetitions.print(occurrenceRows);
            this.templateMapWriterProp.print(propertyRows);
        }
    }

    /**
     * Writes one {@code title TAB category} row for every category link found
     * in the page text.
     *
     * @param str  page text
     * @param str2 page title
     */
    void pageCategory(String str, String str2) {
        StringBuilder rows = new StringBuilder();
        appendCategoryRows(str, str2, rows);
        synchronized (this) {
            this.pageCategoryWriter.print(rows);
        }
    }

    /**
     * Parses the page markup and writes one {@code title TAB sectionTitle} row
     * for every non-blank section title that is not excluded by the section
     * title skip pattern. Any exception raised while parsing or writing is
     * logged and the page is skipped.
     *
     * @param str  page text
     * @param str2 page title
     * @param i    page id, used only in the error message
     */
    void textAndSections(String str, String str2, int i) {
        try {
            ParsedPage parsedPage = WikiMarkupParser.getInstance().parsePage(str, new String[]{this.filePrefix, this.imagePrefix});
            StringBuilder rows = new StringBuilder();
            for (Section section : parsedPage.getSections()) {
                String sectionTitle = section.getTitle();
                if (sectionTitle == null || sectionTitle.trim().length() == 0) {
                    continue;
                }
                if (this.sectionTitleSkipPattern != null && this.sectionTitleSkipPattern.matcher(sectionTitle).find()) {
                    continue;
                }
                rows.append(str2).append('\t').append(sectionTitle).append('\n');
            }
            synchronized (this) {
                this.sectionTitleWriter.print(rows.toString());
            }
        } catch (Exception e) {
            // Deliberately best-effort: one unparsable page must not stop the
            // extraction, but keep the cause in the log.
            logger.error("Error processing page " + str2 + ParsedPageLink.START_SUFFIX_PATTERN + i + ")", e);
        }
    }

    /**
     * Extracts name, surname, birth date and death date from the personal-data
     * template of a page and writes them as
     * {@code title TAB name TAB surname TAB birth TAB death}. Nothing is
     * written when the template, the name or the surname cannot be found, or
     * when the corresponding patterns are not configured. Missing dates are
     * written as empty columns.
     *
     * @param str  page text
     * @param str2 page title
     */
    void personInfo(String str, String str2) {
        String template = firstGroup(this.templatePattern, str);
        if (template == null) {
            return;
        }
        String name = firstGroup(this.namePattern, template);
        String surname = firstGroup(this.surnamePattern, template);
        if (name == null || surname == null) {
            return;
        }
        String birthDate = firstGroup(this.birthDatePattern, template);
        String deathDate = firstGroup(this.deathDatePattern, template);

        StringBuilder row = new StringBuilder();
        row.append(str2).append('\t').append(name).append('\t').append(surname).append('\t');
        if (birthDate != null) {
            row.append(birthDate);
        }
        row.append('\t');
        if (deathDate != null) {
            row.append(deathDate);
        }
        synchronized (this) {
            this.personInfoWriter.println(row.toString());
        }
    }

    /**
     * Returns the trimmed group 1 of the first match of {@code pattern} in
     * {@code text}, or {@code null} when the pattern is not configured, does
     * not match, or its group 1 did not take part in the match.
     */
    private static String firstGroup(Pattern pattern, String text) {
        if (pattern == null) {
            return null;
        }
        Matcher matcher = pattern.matcher(text);
        if (!matcher.find()) {
            return null;
        }
        String group = matcher.group(1);
        return group == null ? null : group.trim();
    }

    /** Writes a {@code title TAB id} row to the title/id file. */
    private void writeTitlePage(String str, int i) {
        synchronized (this) {
            this.titleIdWriter.println(str + '\t' + i);
        }
    }

    /** Writes a {@code title TAB id} row to the content-page title file. */
    void writeTitleContentPage(String str, int i) {
        synchronized (this) {
            this.contentPageTitleWriter.println(str + '\t' + i);
        }
    }

    /**
     * Records the title/id pair of a portal page.
     *
     * @param str  page text (not used here)
     * @param str2 page title
     * @param i    page id
     */
    @Override // org.fbk.cit.hlt.thewikimachine.xmldump.WikipediaExtractor
    public void portalPage(String str, String str2, int i) {
        writeTitlePage(str2, i);
    }

    /**
     * Records the title/id pair of a project page.
     *
     * @param str  page text (not used here)
     * @param str2 page title
     * @param i    page id
     */
    @Override // org.fbk.cit.hlt.thewikimachine.xmldump.WikipediaExtractor
    public void projectPage(String str, String str2, int i) {
        writeTitlePage(str2, i);
    }

    /**
     * Records a disambiguation page: its title/id pair and its title in the
     * disambiguation list.
     *
     * @param str  page text (not used here)
     * @param str2 page title
     * @param i    page id
     */
    @Override // org.fbk.cit.hlt.thewikimachine.xmldump.WikipediaExtractor
    public void disambiguationPage(String str, String str2, int i) {
        writeTitlePage(str2, i);
        synchronized (this) {
            this.disambiguationWriter.println(str2);
        }
    }

    /** Writes the current date and the inherited page counters as {@code key=value} lines. */
    void analysis() {
        synchronized (this) {
            this.analysisWriter.println("date=" + new Date());
            this.analysisWriter.println("total=" + this.generalCount);
            this.analysisWriter.println("content=" + this.countPageCounter);
            this.analysisWriter.println("disambiguation=" + this.disambiguationPageCounter);
            this.analysisWriter.println("category=" + this.categoryPageCounter);
            this.analysisWriter.println("redirect=" + this.redirectPageCounter);
            this.analysisWriter.println("template=" + this.templatePageCounter);
            this.analysisWriter.println("mediawiki=" + this.mediawikiPageCounter);
            this.analysisWriter.println("wikipedia=" + this.wikipediaPageCounter);
            this.analysisWriter.println("file=" + this.filePageCounter);
            this.analysisWriter.println("special=" + this.specialPageCounter);
            this.analysisWriter.println("image=" + this.imagePageCounter);
            this.analysisWriter.println("project=" + this.projectPageCounter);
            this.analysisWriter.println("other=" + this.otherPageCounter);
        }
    }

    /**
     * Finishes the extraction: runs the superclass hook, writes the analysis
     * report and closes every output file, including the file, category and
     * section-title writers, so that no buffered output is lost. The writers
     * are closed even if the report cannot be written.
     */
    @Override // org.fbk.cit.hlt.thewikimachine.xmldump.AbstractWikipediaExtractor, org.fbk.cit.hlt.thewikimachine.xmldump.AbstractWikipediaXmlDumpParser
    public void endProcess() {
        try {
            super.endProcess();
            analysis();
        } finally {
            closeWriters();
        }
    }

    /**
     * Command-line entry point. Requires {@code -d/--wikipedia-dump} and
     * {@code -o/--output-dir}; accepts {@code -t/--num-threads},
     * {@code -p/--num-pages} and {@code -n/--notification-point}. On invalid
     * arguments the usage text is printed instead of running the extraction.
     *
     * @param strArr command-line arguments
     */
    public static void main(String[] strArr) throws IOException {
        String logConfig = System.getProperty("log-config");
        if (logConfig == null) {
            logConfig = "configuration/log-config.txt";
        }
        PropertyConfigurator.configure(logConfig);
        Options options = new Options();
        try {
            options.addOption("h", "help", false, "print this message");
            options.addOption("v", "version", false, "output version information and exit");
            options.addOption(argumentOption("d", "wikipedia-dump", "file", "wikipedia xml dump file", true));
            options.addOption(argumentOption("o", "output-dir", "dir", "output directory in which to store output files", true));
            options.addOption(argumentOption("t", "num-threads", SchemaSymbols.ATTVAL_INT, "number of threads (default 1)", false));
            options.addOption(argumentOption("p", "num-pages", SchemaSymbols.ATTVAL_INT, "number of pages to process (default all)", false));
            options.addOption(argumentOption("n", "notification-point", SchemaSymbols.ATTVAL_INT, "receive notification every n pages (default 10000)", false));

            CommandLine commandLine = new PosixParser().parse(options, strArr);
            logger.debug(commandLine);
            int numThreads = intOption(commandLine, "num-threads", 1);
            int numPages = intOption(commandLine, "num-pages", Integer.MAX_VALUE);
            int notificationPoint = intOption(commandLine, "notification-point", 10000);

            ExtractorParameters extractorParameters = new ExtractorParameters(commandLine.getOptionValue("wikipedia-dump"), commandLine.getOptionValue("output-dir"));
            logger.debug(extractorParameters);
            WikipediaPreprocessing wikipediaPreprocessing = new WikipediaPreprocessing(numThreads, numPages, extractorParameters.getLocale());
            wikipediaPreprocessing.setNotificationPoint(notificationPoint);
            wikipediaPreprocessing.start(extractorParameters);
        } catch (ParseException e) {
            logger.error("Parsing failed: " + e.getMessage() + "\n");
            new HelpFormatter().printHelp(200, "java -cp properties:dist/thewikimachine.jar org.fbk.cit.hlt.thewikimachine.xmldump.WikipediaPreprocessing", "\n", options, "\n", true);
        } finally {
            // Logged on success, on a usage error and when an exception escapes.
            logger.info("extraction ended " + new Date());
        }
    }

    /**
     * Builds a command-line option that takes one argument.
     * {@code OptionBuilder} keeps its state in static fields and resets it in
     * {@code create}, hence the sequence of static calls.
     */
    private static Option argumentOption(String shortName, String longName, String argName, String description, boolean required) {
        OptionBuilder.withArgName(argName);
        OptionBuilder.hasArg();
        OptionBuilder.withDescription(description);
        if (required) {
            OptionBuilder.isRequired();
        }
        OptionBuilder.withLongOpt(longName);
        return OptionBuilder.create(shortName);
    }

    /**
     * Reads an integer option.
     *
     * @return the parsed value, or {@code defaultValue} when the option is absent
     * @throws ParseException if the value is not a valid integer, so that it is
     *         reported like any other usage error
     */
    private static int intOption(CommandLine commandLine, String longName, int defaultValue) throws ParseException {
        if (!commandLine.hasOption(longName)) {
            return defaultValue;
        }
        String value = commandLine.getOptionValue(longName);
        try {
            return Integer.parseInt(value);
        } catch (NumberFormatException e) {
            ParseException invalid = new ParseException("invalid integer value for --" + longName + ": " + value);
            invalid.initCause(e);
            throw invalid;
        }
    }
}
