package org.fbk.cit.hlt.thewikimachine.xmldump;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Reader;
import java.nio.charset.StandardCharsets;
import java.util.Date;
import java.util.Locale;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.regex.Pattern;
import org.apache.commons.configuration.PropertiesConfiguration;
import org.apache.commons.lang.StringEscapeUtils;
import org.apache.log4j.Logger;
import org.fbk.cit.hlt.thewikimachine.util.StringTable;

/* loaded from: input_file:org/fbk/cit/hlt/thewikimachine/xmldump/AbstractWikipediaExtractor.class */
/**
 * Base class for extractors that classify the pages delivered by the XML dump
 * parser and dispatch each one to a page-type specific handler.
 *
 * <p>On construction the class loads a per-locale properties file
 * ({@code <configurationFolder><languageTag>.properties}) that supplies the
 * namespace labels (category, file, template, ...) and a few regular
 * expressions. {@link #getPage(String, String, int, String)} then uses the
 * resulting prefixes/suffix to decide which handler to call and which counter
 * to increment.
 */
public abstract class AbstractWikipediaExtractor extends AbstractWikipediaXmlDumpParser implements WikipediaExtractor {
    static Logger logger = Logger.getLogger(AbstractWikipediaExtractor.class.getName());

    /** Default page limit: effectively "no limit". */
    public static final int DEFAULT_NUM_PAGES = Integer.MAX_VALUE;

    /** Default number of pages between two progress log lines. */
    public static final int DEFAULT_NOTIFICATION_POINT = 10000;

    /** Configuration loaded by {@link #loadResources()}. */
    protected PropertiesConfiguration resources;

    // Title prefixes ("<label>:") and the disambiguation suffix ("(<label>)").
    // Each one stays null when its key is absent from the properties file.
    protected String categoryPrefix;
    protected String filePrefix;
    protected String specialPrefix;
    protected String imagePrefix;
    protected String wikipediaPrefix;
    protected String portalPrefix;
    protected String templatePrefix;
    protected String mediawikiPrefix;
    protected String disambiguationSuffix;
    protected String helpPrefix;
    protected String projectPrefix;
    protected String infoboxRootCategory;

    // Patterns compiled by loadResources(); each stays null when its key is absent.
    protected Pattern disambiguationPattern;
    protected Pattern categoryPattern;
    protected Pattern categoryMainPattern;
    protected Pattern officialSitePattern;
    protected Pattern filePattern;
    protected Pattern simpleCategoryPattern;
    protected Pattern templatePattern;
    protected Pattern simpleTemplatePattern;
    protected Pattern navigationTemplatePattern;

    // Per page-type counters updated by getPage().
    protected AtomicInteger disambiguationPageCounter = new AtomicInteger();
    protected AtomicInteger redirectPageCounter = new AtomicInteger();
    protected AtomicInteger categoryPageCounter = new AtomicInteger();
    protected AtomicInteger specialPageCounter = new AtomicInteger();
    protected AtomicInteger filePageCounter = new AtomicInteger();
    protected AtomicInteger otherPageCounter = new AtomicInteger();
    protected AtomicInteger imagePageCounter = new AtomicInteger();
    protected AtomicInteger templatePageCounter = new AtomicInteger();
    protected AtomicInteger mediawikiPageCounter = new AtomicInteger();
    protected AtomicInteger wikipediaPageCounter = new AtomicInteger();
    protected AtomicInteger portalPageCounter = new AtomicInteger();
    protected AtomicInteger helpPageCounter = new AtomicInteger();
    protected AtomicInteger projectPageCounter = new AtomicInteger();
    protected AtomicInteger countPageCounter = new AtomicInteger();

    private int numPages;
    private String configurationFolder = "configuration/";
    private Locale locale;
    private int notificationPoint;
    boolean compress;
    boolean printHeader;

    /**
     * Creates an extractor that reads its configuration from the default
     * {@code configuration/} folder.
     *
     * @param i      value forwarded unchanged to the superclass constructor
     * @param i2     maximum number of pages to process before the JVM is exited
     * @param locale locale whose language tag selects the properties file
     */
    public AbstractWikipediaExtractor(int i, int i2, Locale locale) {
        this(i, i2, locale, null);
    }

    /**
     * Creates an extractor and immediately loads its locale resources.
     *
     * @param i      value forwarded unchanged to the superclass constructor
     * @param i2     maximum number of pages to process before the JVM is exited
     * @param locale locale whose language tag selects the properties file
     * @param str    configuration folder, or {@code null} to use
     *               {@code configuration/}; a trailing file separator is added
     *               when missing
     */
    public AbstractWikipediaExtractor(int i, int i2, Locale locale, String str) {
        super(i);
        this.numPages = i2;
        this.locale = locale;
        this.notificationPoint = DEFAULT_NOTIFICATION_POINT;
        if (str != null) {
            this.configurationFolder = str.endsWith(File.separator) ? str : str + File.separator;
        }
        loadResources();
        this.printHeader = true;
        this.compress = false;
    }

    /** @return the current value of the {@code compress} flag */
    public boolean isCompress() {
        return this.compress;
    }

    /** @param z new value of the {@code compress} flag */
    public void setCompress(boolean z) {
        this.compress = z;
    }

    /**
     * Sets how many pages are processed between two progress log lines.
     * A value of {@code 0} disables the periodic log.
     */
    @Override // org.fbk.cit.hlt.thewikimachine.xmldump.WikipediaExtractor
    public void setNotificationPoint(int i) {
        this.notificationPoint = i;
    }

    /** @return the number of pages between two progress log lines */
    @Override // org.fbk.cit.hlt.thewikimachine.xmldump.WikipediaExtractor
    public int getNotificationPoint() {
        return this.notificationPoint;
    }

    /** @return the page limit given at construction time */
    public int getNumPages() {
        return this.numPages;
    }

    /** @return the locale given at construction time */
    public Locale getLocale() {
        return this.locale;
    }

    /**
     * Loads {@code <configurationFolder><languageTag>.properties} and derives
     * the title prefixes, the disambiguation suffix and the regular expressions
     * from it. Keys that are absent leave the corresponding field untouched.
     *
     * <p>The file is decoded as UTF-8. An {@link IOException} (including a
     * missing file) is logged and terminates the JVM; any other exception is
     * logged and leaves the fields processed so far in place.
     */
    public void loadResources() {
        String languageTag = this.locale.toLanguageTag();
        logger.info("loading " + languageTag + " resources...");
        try {
            this.resources = new PropertiesConfiguration();
            this.resources.setEncoding("UTF-8");
            this.resources.setListDelimiter('\t');
            String path = this.configurationFolder + languageTag + ".properties";
            logger.debug("Configuration file: " + path);
            File file = new File(path);
            if (!file.exists()) {
                throw new IOException("File " + file.getAbsolutePath() + " does not exist");
            }
            // Decode explicitly as UTF-8 (a plain FileReader would use the
            // platform charset) and make sure the stream is closed.
            try (Reader reader = new InputStreamReader(new FileInputStream(file), StandardCharsets.UTF_8)) {
                this.resources.load(reader);
            }
            logger.debug(this.resources);

            String navboxTemplate = this.resources.getString("NAVBOX_TEMPLATE");
            if (navboxTemplate != null) {
                // A broken navbox expression is tolerated: warn and carry on.
                try {
                    this.navigationTemplatePattern = Pattern.compile(navboxTemplate);
                    logger.debug("NAVBOX_TEMPLATE " + navboxTemplate);
                } catch (Exception e) {
                    logger.warn(e.getMessage(), e);
                }
            }

            String infoboxLabel = this.resources.getString("INFOBOX_LABEL");
            if (infoboxLabel != null) {
                this.infoboxRootCategory = infoboxLabel;
                logger.debug("INFOBOX_LABEL " + this.infoboxRootCategory);
            }

            this.categoryPrefix = readPrefix("CATEGORY_LABEL", this.categoryPrefix);
            if (this.resources.getString("FILE_LABEL") != null) {
                this.filePattern = Pattern.compile("[:=]\\s?([^\\.\\|\\[\\]:=]+)\\.(svg|jpg|png|gif|jpeg)", Pattern.CASE_INSENSITIVE);
            }
            this.filePrefix = readPrefix("FILE_LABEL", this.filePrefix);
            this.specialPrefix = readPrefix("SPECIAL_LABEL", this.specialPrefix);
            this.imagePrefix = readPrefix("IMAGE_LABEL", this.imagePrefix);
            this.wikipediaPrefix = readPrefix("WIKIPEDIA_LABEL", this.wikipediaPrefix);
            this.portalPrefix = readPrefix("PORTAL_LABEL", this.portalPrefix);
            this.templatePrefix = readPrefix("TEMPLATE_LABEL", this.templatePrefix);
            this.mediawikiPrefix = readPrefix("MEDIAWIKI_LABEL", this.mediawikiPrefix);
            this.helpPrefix = readPrefix("HELP_LABEL", this.helpPrefix);
            this.projectPrefix = readPrefix("PROJECT_LABEL", this.projectPrefix);

            String disambiguationLabel = this.resources.getString("DISAMBIGUATION_LABEL");
            if (disambiguationLabel != null) {
                this.disambiguationSuffix = "(" + disambiguationLabel + ")";
                logger.debug("DISAMBIGUATION_LABEL " + this.disambiguationSuffix);
            }

            String disambiguationRegex = this.resources.getString("DISAMBIGUATION_PATTERN");
            if (disambiguationRegex != null) {
                this.disambiguationPattern = Pattern.compile(disambiguationRegex, Pattern.CASE_INSENSITIVE);
                logger.debug("DISAMBIGUATION_PATTERN " + this.disambiguationPattern);
            }

            String categoryLabel = this.resources.getString("CATEGORY_LABEL");
            if (categoryLabel != null) {
                this.categoryPattern = Pattern.compile("\\[\\[(" + categoryLabel + ":([^\\]]+))\\]\\]", Pattern.CASE_INSENSITIVE);
                this.categoryMainPattern = Pattern.compile("\\[\\[(" + categoryLabel + ":([^\\]\\|]+))\\|\\s*\\]\\]", Pattern.CASE_INSENSITIVE);
                this.simpleCategoryPattern = Pattern.compile(categoryLabel + ":([^\\]]+)", Pattern.CASE_INSENSITIVE);
                logger.debug("CATEGORY_PATTERN " + this.categoryPattern);
                logger.debug("SIMPLE_CATEGORY_PATTERN " + this.simpleCategoryPattern);
            }

            String templateLabel = this.resources.getString("TEMPLATE_LABEL");
            if (templateLabel != null) {
                this.templatePattern = Pattern.compile("\\[\\[(" + templateLabel + ":([^\\]]+))\\]\\]");
                this.simpleTemplatePattern = Pattern.compile(templateLabel + ":([^\\]/]+)");
                logger.debug("TEMPLATE_PATTERN " + this.templatePattern);
                logger.debug("SIMPLE_TEMPLATE_PATTERN " + this.simpleTemplatePattern);
            }
        } catch (IOException e2) {
            logger.error("Cannot read " + languageTag + " resources: " + e2, e2);
            // NOTE(review): exit status 0 on a fatal configuration error is kept
            // for backward compatibility; a non-zero status would be more usual.
            System.exit(0);
        } catch (Exception e3) {
            logger.error("Error while loading " + languageTag + " resources: " + e3, e3);
        }
    }

    /**
     * Reads a namespace label and turns it into a title prefix ("label:").
     *
     * @param key     properties key holding the label
     * @param current value to keep when the key is absent
     * @return the new prefix, or {@code current} when the key is absent
     */
    private String readPrefix(String key, String current) {
        String label = this.resources.getString(key);
        if (label == null) {
            return current;
        }
        String prefix = label + ":";
        logger.debug(key + " " + prefix);
        return prefix;
    }

    /**
     * Logs one tab-separated progress line (total, content, redirect,
     * disambiguation, category and file page counts, elapsed time, current
     * date), preceded by a column header the first time it is called.
     */
    @Override // org.fbk.cit.hlt.thewikimachine.xmldump.AbstractWikipediaXmlDumpParser
    public void printLog() {
        if (this.printHeader) {
            logger.info("pages\tcontent\tredirect\tdisambig\tcategory\tfile\ttime\tdate");
            this.printHeader = false;
        }
        try {
            logger.info(this.decimalFormat.format(this.generalCount.intValue())
                    + StringTable.HORIZONTAL_TABULATION + this.decimalFormat.format(this.countPageCounter)
                    + StringTable.HORIZONTAL_TABULATION + this.decimalFormat.format(this.redirectPageCounter)
                    + StringTable.HORIZONTAL_TABULATION + this.decimalFormat.format(this.disambiguationPageCounter)
                    + StringTable.HORIZONTAL_TABULATION + this.decimalFormat.format(this.categoryPageCounter)
                    + StringTable.HORIZONTAL_TABULATION + this.decimalFormat.format(this.filePageCounter)
                    + StringTable.HORIZONTAL_TABULATION + this.decimalFormat.format(this.genEnd.longValue() - this.genBegin.longValue())
                    + StringTable.HORIZONTAL_TABULATION + new Date());
        } catch (Exception e) {
            // Progress logging must never abort the extraction.
            logger.warn(e.getMessage(), e);
        }
    }

    /** Records the end time and prints a final progress line. */
    @Override // org.fbk.cit.hlt.thewikimachine.xmldump.AbstractWikipediaXmlDumpParser
    public void endProcess() {
        this.genEnd.set(System.currentTimeMillis());
        printLog();
    }

    /**
     * Classifies one page and dispatches it to the matching handler.
     *
     * <p>Order of the checks: redirect, disambiguation suffix, then the
     * category, file, special, image, wikipedia, portal, template, mediawiki,
     * help and project prefixes; anything else with non-empty text is treated
     * as a content page (or as a disambiguation page when the text matches
     * {@code disambiguationPattern}). A prefix or suffix that was not
     * configured never matches.
     *
     * <p>When the number of pages seen exceeds {@link #getNumPages()} the
     * method calls {@link #endProcess()} and exits the JVM.
     *
     * @param str  page text; XML-unescaped before it reaches the content and
     *             pattern-based disambiguation handlers
     * @param str2 page title, tested against the configured prefixes/suffix
     * @param i    page identifier, passed through to the handlers unchanged
     * @param str3 redirect target, or {@code null} when the page is not a redirect
     */
    @Override // org.fbk.cit.hlt.thewikimachine.xmldump.AbstractWikipediaXmlDumpParser
    public void getPage(String str, String str2, int i, String str3) {
        int pagesSeen = this.generalCount.incrementAndGet();
        if (pagesSeen > this.numPages) {
            logger.info("Exit after " + pagesSeen + " content pages (" + this.numPages + ")");
            endProcess();
            System.exit(0);
        }
        // A notification point of 0 would otherwise raise ArithmeticException.
        int point = getNotificationPoint();
        if (point != 0 && pagesSeen % point == 0) {
            this.genEnd.set(System.currentTimeMillis());
            printLog();
            this.genBegin.set(System.currentTimeMillis());
        }

        if (str3 != null) {
            redirectPage(str3, str2, i);
            this.redirectPageCounter.incrementAndGet();
        } else if (endsWithSuffix(str2, this.disambiguationSuffix)) {
            disambiguationPage(str, str2, i);
            this.disambiguationPageCounter.incrementAndGet();
        } else if (startsWithPrefix(str2, this.categoryPrefix)) {
            categoryPage(str, str2, i);
            this.categoryPageCounter.incrementAndGet();
        } else if (startsWithPrefix(str2, this.filePrefix)) {
            filePage(str, str2, i);
            this.filePageCounter.incrementAndGet();
        } else if (startsWithPrefix(str2, this.specialPrefix)) {
            this.specialPageCounter.incrementAndGet();
        } else if (startsWithPrefix(str2, this.imagePrefix)) {
            this.imagePageCounter.incrementAndGet();
        } else if (startsWithPrefix(str2, this.wikipediaPrefix)) {
            this.wikipediaPageCounter.incrementAndGet();
            this.otherPageCounter.incrementAndGet();
        } else if (startsWithPrefix(str2, this.portalPrefix)) {
            portalPage(str, str2, i);
            this.portalPageCounter.incrementAndGet();
        } else if (startsWithPrefix(str2, this.templatePrefix)) {
            templatePage(str, str2, i);
            this.templatePageCounter.incrementAndGet();
        } else if (startsWithPrefix(str2, this.mediawikiPrefix)) {
            this.mediawikiPageCounter.incrementAndGet();
            this.otherPageCounter.incrementAndGet();
        } else if (startsWithPrefix(str2, this.helpPrefix)) {
            this.helpPageCounter.incrementAndGet();
            this.otherPageCounter.incrementAndGet();
        } else if (startsWithPrefix(str2, this.projectPrefix)) {
            projectPage(str, str2, i);
            this.projectPageCounter.incrementAndGet();
            this.otherPageCounter.incrementAndGet();
        } else if (str.length() > 0) {
            String unescaped = StringEscapeUtils.unescapeXml(str);
            boolean disambiguation = this.disambiguationPattern != null
                    && this.disambiguationPattern.matcher(unescaped).find();
            if (disambiguation) {
                disambiguationPage(unescaped, str2, i);
                this.disambiguationPageCounter.incrementAndGet();
            } else {
                contentPage(unescaped, str2, i);
                this.countPageCounter.incrementAndGet();
            }
        }
    }

    /** Null-safe {@code startsWith}: an unconfigured (null) prefix never matches. */
    private static boolean startsWithPrefix(String title, String prefix) {
        return prefix != null && title.startsWith(prefix);
    }

    /** Null-safe {@code endsWith}: an unconfigured (null) suffix never matches. */
    private static boolean endsWithSuffix(String title, String suffix) {
        return suffix != null && title.endsWith(suffix);
    }

    /**
     * Returns the page name with its first character upper-cased.
     *
     * <p>The empty string and names that already start with an upper-case
     * character are returned unchanged. Upper-casing uses the JVM default
     * locale.
     *
     * @param str page name; must not be {@code null}
     * @return the normalized page name
     */
    public static String normalizePageName(String str) {
        if (str.isEmpty() || Character.isUpperCase(str.charAt(0))) {
            return str;
        }
        return str.substring(0, 1).toUpperCase() + str.substring(1);
    }
}
