package org.fbk.cit.hlt.thewikimachine.xmldump.util;

import de.tudarmstadt.ukp.wikipedia.parser.ParsedPage;
import de.tudarmstadt.ukp.wikipedia.parser.Section;
import de.tudarmstadt.ukp.wikipedia.parser.mediawiki.FlushTemplates;
import de.tudarmstadt.ukp.wikipedia.parser.mediawiki.MediaWikiParser;
import de.tudarmstadt.ukp.wikipedia.parser.mediawiki.MediaWikiParserFactory;
import java.io.File;
import java.io.IOException;
import java.util.Iterator;
import java.util.regex.Pattern;
import opennlp.tools.parser.AbstractBottomUpParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.commons.cli.PosixParser;
import org.apache.log4j.Logger;
import org.apache.log4j.PropertyConfigurator;
import org.fbk.cit.hlt.core.io.FileUtils;
import org.fbk.cit.hlt.thewikimachine.xmldump.util.clean.CleanWikipedia;

/* loaded from: input_file:org/fbk/cit/hlt/thewikimachine/xmldump/util/WikiMarkupParser.class */
/**
 * Lazily created singleton that wraps a {@link MediaWikiParser} and applies a
 * few textual clean-up steps (reference removal, {@code &nbsp;} replacement)
 * to raw wiki markup before handing it to the parser.
 *
 * <p>Obtain the shared instance through {@link #getInstance()}.
 */
public class WikiMarkupParser {
    /** HTML entity for a non-breaking space; replaced by a plain space before parsing. */
    public static final String NBSP = "&nbsp;";

    /** The empty string, used as the replacement when stripping markup. */
    public static final String EMPTY_STRING = "";

    /**
     * Matches a reference to strip: either a self-closing tag such as
     * {@code <ref name="x"/>}, or an opening/closing pair whose body contains
     * no further tag (the body may be empty).
     *
     * <p>The word boundary keeps {@code <references>} from being mistaken for
     * {@code <ref>}. The self-closing alternative is listed first so that such
     * a tag is consumed on its own and never treated as the opening tag of a
     * pair. The body is deliberately limited to {@code [^<]*} so that an
     * unclosed {@code <ref>} cannot swallow text up to some distant
     * {@code </ref>}.
     */
    protected Pattern refPattern = Pattern.compile("<ref\\b[^>]*?/>|<ref\\b[^>]*>[^<]*</ref>");

    /**
     * Matches an HTML comment, including empty, multi-line, and
     * hyphen-containing ones. The reluctant quantifier stops at the first
     * {@code -->}; DOTALL lets the comment span line breaks.
     */
    protected Pattern commentPattern = Pattern.compile("<!--.*?-->", Pattern.DOTALL);

    /** Underlying parser, created once in the constructor. */
    protected MediaWikiParser parser;

    static Logger logger = Logger.getLogger(WikiMarkupParser.class.getName());

    private static WikiMarkupParser ourInstance = null;

    /**
     * Returns the shared instance, creating it on first use.
     *
     * @return the single {@code WikiMarkupParser} instance
     */
    public static synchronized WikiMarkupParser getInstance() {
        if (ourInstance == null) {
            ourInstance = new WikiMarkupParser();
        }
        return ourInstance;
    }

    /**
     * Configures a {@link MediaWikiParserFactory} to use {@link FlushTemplates}
     * as its template parser class, logs the factory settings once each, and
     * creates the parser.
     */
    private WikiMarkupParser() {
        logger.info("WikiMarkupParser.WikiMarkupParser");
        MediaWikiParserFactory factory = new MediaWikiParserFactory();
        // A class literal avoids instantiating FlushTemplates just to ask for its class.
        factory.setTemplateParserClass(FlushTemplates.class);
        logger.info("getShowImageText: " + factory.getShowImageText());
        logger.info("getDeleteTemplates: " + factory.getDeleteTemplates());
        logger.info("getLineSeparator: " + factory.getLineSeparator());
        logger.info("getParseTemplates: " + factory.getParseTemplates());
        logger.info("getLanguageIdentifers: " + factory.getLanguageIdentifers());
        logger.info("getCategoryIdentifers: " + factory.getCategoryIdentifers());
        logger.info("getImageIdentifers: " + factory.getImageIdentifers());
        logger.info("getDeleteTags: " + factory.getDeleteTags());
        logger.info("getShowMathTagContent: " + factory.getShowMathTagContent());
        logger.info("getCalculateSrcSpans: " + factory.getCalculateSrcSpans());
        this.parser = factory.createParser();
    }

    /**
     * Upper-cases the first character of a page name unless it is already
     * upper case.
     *
     * @param str the page name; must not be {@code null}
     * @return {@code str} with its first character upper-cased (using the
     *         default locale), or {@code str} itself when it is empty or
     *         already starts with an upper-case character
     */
    protected String normalizePageName(String str) {
        if (str.isEmpty() || Character.isUpperCase(str.charAt(0))) {
            return str;
        }
        return str.substring(0, 1).toUpperCase() + str.substring(1);
    }

    /**
     * Removes every match of {@link #refPattern} from the given markup.
     *
     * @param str raw wiki markup; must not be {@code null}
     * @return the markup with matched references removed
     */
    protected String removeRef(String str) {
        return this.refPattern.matcher(str).replaceAll(EMPTY_STRING);
    }

    /**
     * Removes every match of {@link #commentPattern} from the given markup.
     *
     * @param str raw wiki markup; must not be {@code null}
     * @return the markup with HTML comments removed
     */
    protected String removeHtmlComments(String str) {
        return this.commentPattern.matcher(str).replaceAll(EMPTY_STRING);
    }

    /**
     * Replaces every literal {@code &nbsp;} entity with a single space.
     *
     * @param str text to process; must not be {@code null}
     * @return the text with {@link #NBSP} occurrences replaced by {@code " "}
     */
    protected String replaceNBSP(String str) {
        return str.replace(NBSP, " ");
    }

    /**
     * Strips references, replaces {@code &nbsp;}, runs the result through
     * {@link CleanWikipedia#clean} and parses it.
     *
     * @param str    raw wiki markup; must not be {@code null}
     * @param strArr passed unchanged as the second argument of
     *               {@code CleanWikipedia.clean}; its meaning is defined there
     * @return the page produced by the underlying parser
     * @throws IOException declared by the original signature
     */
    public ParsedPage parsePage(String str, String[] strArr) throws IOException {
        String withoutRefs = replaceNBSP(removeRef(str));
        String cleaned = CleanWikipedia.clean(withoutRefs, strArr, true, true);
        return this.parser.parse(cleaned);
    }

    /**
     * Strips references, replaces {@code &nbsp;} and parses the result.
     *
     * @param str raw wiki markup; must not be {@code null}
     * @return the page produced by the underlying parser
     * @throws IOException declared by the original signature
     */
    public ParsedPage parsePage(String str) throws IOException {
        String withoutRefs = replaceNBSP(removeRef(str));
        return this.parser.parse(withoutRefs);
    }

    /**
     * Command-line entry point: reads the file given by {@code --input},
     * parses it and prints the text of each section to standard output.
     *
     * <p>Logging is configured from the file named by the {@code log-config}
     * system property, defaulting to {@code configuration/log-config.txt}.
     * When the command line cannot be parsed, the error and a usage message
     * are printed instead.
     *
     * @param strArr command-line arguments
     * @throws IOException if reading or parsing the input file fails
     */
    public static void main(String[] strArr) throws IOException {
        PropertyConfigurator.configure(System.getProperty("log-config", "configuration/log-config.txt"));
        Options options = new Options();
        try {
            // OptionBuilder accumulates these settings statically until create() is called.
            OptionBuilder.withArgName("file");
            OptionBuilder.hasArg();
            OptionBuilder.withDescription("wikipedia file");
            OptionBuilder.isRequired();
            OptionBuilder.withLongOpt("input");
            // NOTE(review): the short option name comes from an OpenNLP constant, which
            // looks like a decompiler artifact - confirm the intended literal and inline it.
            options.addOption(OptionBuilder.create(AbstractBottomUpParser.INCOMPLETE));

            String inputPath = new PosixParser().parse(options, strArr).getOptionValue("input");
            ParsedPage page = getInstance().parsePage(FileUtils.read(new File(inputPath)));
            for (Section section : page.getSections()) {
                System.out.println(section.getText());
            }
        } catch (ParseException e) {
            System.out.println("Parsing failed: " + e.getMessage() + "\n");
            new HelpFormatter().printHelp(400, "java -cp dist/thewikimachine.jar org.fbk.cit.hlt.thewikimachine.xmldump.util.WikiMarkupParser", "\n", options, "\n", true);
        }
    }
}
