package org.fbk.cit.hlt.thewikimachine.xmldump.util;

import de.tudarmstadt.ukp.wikipedia.parser.ParsedPage;
import java.io.File;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;
import org.apache.log4j.Logger;
import org.apache.log4j.PropertyConfigurator;
import org.fbk.cit.hlt.core.io.FileUtils;
import org.fbk.cit.hlt.thewikimachine.util.StringTable;

/* loaded from: input_file:org/fbk/cit/hlt/thewikimachine/xmldump/util/PageTypeExtractor.class */
/**
 * Classifies a page title form as "nominal" or not, using either the page text or the
 * capitalization of the title form itself.
 *
 * <p>The decision procedure is:
 * <ul>
 *   <li>single-token forms, and two-token forms whose second token contains a digit, are
 *       classified from the page text: the form is nominal when more than
 *       {@link #DEFAULT_THRESHOLD} of its occurrences in the text start with a lower-case
 *       letter;</li>
 *   <li>every other form is classified from the form alone: it is nominal when no token after
 *       the first one contains an upper-case letter.</li>
 * </ul>
 */
public class PageTypeExtractor {
    static Logger logger = Logger.getLogger(PageTypeExtractor.class.getName());

    /** Result of the classification, computed once in the constructor. */
    private final boolean nominal;

    /** Separator used to split the title form into tokens. */
    Pattern spacePattern = Pattern.compile(" ");

    /** Minimum fraction of lower-case occurrences (exclusive) for a form to be nominal. */
    public static final double DEFAULT_THRESHOLD = 0.1d;

    /**
     * Classifies the given title form.
     *
     * @param str  the plain text of the page
     * @param str2 the title form to classify; tokens are separated by single spaces
     */
    public PageTypeExtractor(String str, String str2) {
        String[] tokens = this.spacePattern.split(str2);
        boolean decideFromText =
                tokens.length == 1 || (tokens.length == 2 && containsDigits(tokens[1]));
        // A leading space is prepended so that an occurrence at the very beginning of the text
        // still has a delimiter character in front of it for the occurrence regex to match.
        this.nominal = decideFromText ? fromPage(" " + str, str2) : fromForm(tokens);
    }

    /**
     * Returns {@code true} if any character of {@code str} is a digit.
     */
    private static boolean containsDigits(String str) {
        for (int i = 0; i < str.length(); i++) {
            // Inspect every position, not only the first character.
            if (Character.isDigit(str.charAt(i))) {
                return true;
            }
        }
        return false;
    }

    /**
     * Returns {@code true} if any character of {@code str} is an upper-case letter.
     */
    private static boolean containsUpperCase(String str) {
        for (int i = 0; i < str.length(); i++) {
            // Inspect every position, not only the first character.
            if (Character.isUpperCase(str.charAt(i))) {
                return true;
            }
        }
        return false;
    }

    /**
     * Returns the classification computed at construction time.
     *
     * @return {@code true} if the form was classified as nominal
     */
    public boolean isNominal() {
        return this.nominal;
    }

    /**
     * Classifies a form by how often it appears in the text starting with a lower-case letter.
     *
     * @param str  the text to search; an occurrence must be preceded and followed by one of the
     *             delimiter characters listed in the regex
     * @param str2 the form to look for, matched literally and case-insensitively
     * @return {@code true} if the form occurs at least once and the fraction of occurrences
     *         whose first character is lower-case exceeds {@link #DEFAULT_THRESHOLD}
     */
    private static boolean fromPage(String str, String str2) {
        // The form is quoted so that regex metacharacters in a title (e.g. "C++", "U.S.") are
        // matched literally instead of being interpreted or raising PatternSyntaxException.
        // NOTE(review): CASE_INSENSITIVE alone only folds ASCII letters; consider adding
        // UNICODE_CASE if non-ASCII titles must match their lower-case occurrences.
        Pattern occurrence = Pattern.compile(
                "[\\s'\"«‛“¿](" + Pattern.quote(str2) + ")[\\s\\.,!\\?'\"»…:;‟’]",
                Pattern.CASE_INSENSITIVE);
        Matcher matcher = occurrence.matcher(str);
        int lowerCaseCount = 0;
        int totalCount = 0;
        while (matcher.find()) {
            if (Character.isLowerCase(str.charAt(matcher.start(1)))) {
                lowerCaseCount++;
            }
            totalCount++;
        }
        if (totalCount == 0) {
            // No occurrence at all: there is no evidence of lower-case usage.
            return false;
        }
        return ((double) lowerCaseCount) / ((double) totalCount) > DEFAULT_THRESHOLD;
    }

    /**
     * Classifies a multi-token form from its capitalization alone.
     *
     * @param strArr the tokens of the form; the first token is ignored
     * @return {@code true} if no token after the first contains an upper-case letter
     */
    private static boolean fromForm(String[] strArr) {
        for (int i = 1; i < strArr.length; i++) {
            if (containsUpperCase(strArr[i])) {
                return false;
            }
        }
        return true;
    }

    /**
     * Command-line entry point: parses a wiki markup file and logs whether the given title is
     * nominal.
     *
     * @param strArr two arguments: the path of the wiki file and the page title
     * @throws Exception if reading or parsing the file fails
     */
    public static void main(String[] strArr) throws Exception {
        String property = System.getProperty("log-config");
        if (property == null) {
            property = "configuration/log-config.txt";
        }
        PropertyConfigurator.configure(property);
        if (strArr.length != 2) {
            logger.info("java -cp dist/thewikimachine.jar org.fbk.cit.hlt.thewikimachine.xmldump.util.PageTypeExtractor wiki-file title");
            System.exit(1);
        }
        ParsedPage parsePage = WikiMarkupParser.getInstance().parsePage(FileUtils.read(new File(strArr[0])));
        String str = strArr[1];
        ParsedPageTitle parsedPageTitle = new ParsedPageTitle(str);
        logger.debug(str + StringTable.HORIZONTAL_TABULATION + parsedPageTitle);
        PageTypeExtractor pageTypeExtractor = new PageTypeExtractor(parsePage.getText(), parsedPageTitle.getForm());
        logger.debug(parsePage.getText());
        logger.info(str + " is " + (pageTypeExtractor.isNominal() ? "nominal" : "not nominal"));
    }
}
