package org.wikibrain.parser.xml;

import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.lang3.StringEscapeUtils;
import org.wikibrain.core.lang.LanguageInfo;
import org.wikibrain.core.model.NameSpace;
import org.wikibrain.core.model.RawPage;
import org.wikibrain.core.model.Title;
import org.wikibrain.parser.WpParseException;

/* loaded from: input_file:org/wikibrain/parser/xml/PageXmlParser.class */
/**
 * Extracts the fields of a single page from its dump XML using regular expressions and
 * packages them as a {@link RawPage}.
 *
 * <p>Fields are located in the <em>raw</em> (still XML-escaped) text and only the extracted
 * values are unescaped. Unescaping the whole document first would turn escaped markup inside
 * the page body (for example {@code &lt;/text&gt;}) into real-looking tags that the patterns
 * below would then match by mistake.
 *
 * <p>NOTE(review): instances hold a {@link SimpleDateFormat}, which is documented as not
 * thread-safe; sharing one parser across threads looks unsafe - confirm how callers use it.
 */
public class PageXmlParser {
    private static final Logger LOG = Logger.getLogger(PageXmlParser.class.getName());
    private static final Pattern TITLE_PATTERN = Pattern.compile("<title>(.*?)</title>");
    private static final Pattern ID_PATTERN = Pattern.compile("<id>(.*?)</id>");
    private static final Pattern TIMESTAMP_PATTERN = Pattern.compile("<timestamp>(.*?)</timestamp>");
    // DOTALL: the page body spans multiple lines, so '.' must also match line terminators.
    private static final Pattern CONTENT_PATTERN = Pattern.compile("<text xml:space=\"preserve\">(.*?)</text>", Pattern.DOTALL);
    private static final Pattern SELF_CLOSING_CONTENT_PATTERN = Pattern.compile("<text xml:space=\"preserve\"\\s*/>", Pattern.DOTALL);
    private static final Pattern REDIRECT_PATTERN = Pattern.compile("<redirect title=\"(.*?)\" />");
    private static final Pattern MODEL_PATTERN = Pattern.compile("<model>(.*?)</model>");
    private static final Pattern FORMAT_PATTERN = Pattern.compile("<format>(.*?)</format>");

    // NOTE(review): no time zone is set, so timestamps are interpreted in the JVM default zone,
    // and any text after the seconds (e.g. a trailing "Z") is ignored by parse() - confirm intended.
    private final SimpleDateFormat xmlDumpDateFormat = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss");
    private final LanguageInfo language;

    /**
     * @param languageInfo language used to resolve each page's namespace and stored on the
     *                     resulting {@link RawPage}
     */
    public PageXmlParser(LanguageInfo languageInfo) {
        this.language = languageInfo;
    }

    /**
     * Parses a page's XML; equivalent to {@code parse(str, -1L, -1L)}.
     *
     * @param str XML of a single page
     * @return the parsed page
     * @throws WpParseException if the title, page id or revision id is missing or an id is not numeric
     */
    public RawPage parse(String str) throws WpParseException {
        return parse(str, -1L, -1L);
    }

    /**
     * Parses a page's XML into a {@link RawPage}.
     *
     * <p>The first {@code <id>} element is taken as the page id and the second as the revision
     * id. A missing or unparsable timestamp is logged and yields a {@code null} date; a missing
     * body is logged and yields an empty body.
     *
     * @param str XML of a single page
     * @param j   not used by this implementation
     * @param j2  not used by this implementation
     * @return the parsed page
     * @throws WpParseException if the title, page id or revision id is missing or an id is not numeric
     */
    public RawPage parse(String str, long j, long j2) throws WpParseException {
        String title = extractField(TITLE_PATTERN, str, 1);
        String pageIdText = extractField(ID_PATTERN, str, 1);
        String timestamp = extractField(TIMESTAMP_PATTERN, str, 1);
        String revisionIdText = extractField(ID_PATTERN, str, 2);
        String format = extractField(FORMAT_PATTERN, str, 1);
        String model = extractField(MODEL_PATTERN, str, 1);
        if (title == null) {
            throw new WpParseException("no title for article");
        }
        if (pageIdText == null) {
            throw new WpParseException("no id for article");
        }
        if (revisionIdText == null) {
            throw new WpParseException("no revision id for article");
        }
        int pageId = parseId(pageIdText, "id");
        int revisionId = parseId(revisionIdText, "revision id");

        String body = extractBody(str);
        Date lastEdit = parseTimestamp(timestamp);
        String trimmedTitle = title.trim();
        String redirect = getRedirect(str);

        // The second boolean argument is always false here; its meaning is defined by RawPage.
        RawPage rawPage = new RawPage(pageId, revisionId, trimmedTitle, body, lastEdit, this.language.getLanguage(), getNameSpace(trimmedTitle), redirect != null, false, redirect);
        if (format != null) {
            rawPage.setFormat(format);
        }
        if (model != null) {
            rawPage.setModel(model);
        }
        return rawPage;
    }

    /** Returns the unescaped page body, or an empty string when the body is empty or cannot be found. */
    private static String extractBody(String xml) {
        String body = extractField(CONTENT_PATTERN, xml, 1);
        if (body != null) {
            return body;
        }
        // A self-closing <text/> element is a legitimately empty body; anything else is malformed.
        if (xml == null || !SELF_CLOSING_CONTENT_PATTERN.matcher(xml).find()) {
            LOG.warning("invalid body: " + xml);
        }
        return "";
    }

    /** Parses the dump timestamp, returning {@code null} (after logging) when it is absent or malformed. */
    private Date parseTimestamp(String timestamp) {
        if (timestamp == null) {
            // SimpleDateFormat.parse(null) would throw an unchecked NullPointerException.
            LOG.warning("Could not parse last edited date: " + timestamp);
            return null;
        }
        try {
            return this.xmlDumpDateFormat.parse(timestamp);
        } catch (ParseException e) {
            LOG.warning("Could not parse last edited date: " + timestamp);
            return null;
        }
    }

    /** Converts an id string to an int, reporting malformed input as a {@link WpParseException}. */
    private static int parseId(String value, String description) throws WpParseException {
        try {
            return Integer.parseInt(value.trim());
        } catch (NumberFormatException e) {
            throw new WpParseException("invalid " + description + " for article: " + value);
        }
    }

    /** Resolves the namespace of a title by constructing a {@link Title} in this parser's language. */
    private NameSpace getNameSpace(String str) {
        return new Title(str, this.language).getNamespace();
    }

    /** Returns the unescaped target of the page's {@code <redirect title="..." />} element, or {@code null} if none. */
    private String getRedirect(String str) {
        return extractField(REDIRECT_PATTERN, str, 1);
    }

    /** Extracts the {@code i}-th match from the raw XML and HTML-unescapes just that value. */
    private static String extractField(Pattern pattern, String rawXml, int i) {
        String raw = extractSingleString(pattern, rawXml, i);
        return raw == null ? null : StringEscapeUtils.unescapeHtml4(raw);
    }

    /**
     * Returns capture group 1 of the {@code i}-th (1-based) match of {@code pattern} in
     * {@code str}.
     *
     * @return the captured text, or {@code null} if either argument is null, {@code i < 1},
     *         or fewer than {@code i} matches exist
     */
    private static String extractSingleString(Pattern pattern, String str, int i) {
        if (pattern == null || str == null || i < 1) {
            return null;
        }
        Matcher matcher = pattern.matcher(str);
        for (int seen = 1; matcher.find(); seen++) {
            if (seen == i) {
                return matcher.group(1);
            }
        }
        // Fewer than i matches: report absence instead of silently returning an earlier match.
        return null;
    }
}
