package org.archive.crawler.extractor;

import java.io.IOException;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import org.apache.commons.httpclient.URIException;
import org.archive.crawler.datamodel.CoreAttributeConstants;
import org.archive.crawler.datamodel.CrawlURI;
import org.archive.crawler.framework.CrawlController;
import org.archive.io.ReplayCharSequence;
import org.archive.io.warc.WARCConstants;
import org.archive.util.TextUtils;

/* loaded from: input_file:site-search/heritrix/heritrix-1.12.1.jar:org/archive/crawler/extractor/ExtractorXML.class */
/**
 * Link extractor for XML documents (including RSS feeds).
 *
 * <p>A document is handled when its content type contains {@code xml}, or when
 * the URI ends in {@code .rss} or {@code .xml} (all compared case-insensitively).
 * The content is not parsed as XML; it is scanned with the regular expression
 * {@link #XML_URI_EXTRACTOR}, and each match is added to the {@link CrawlURI}
 * as a speculative link.
 *
 * <p>The two counters reported by {@link #report()} are plain {@code long}
 * fields updated without synchronization.
 */
public class ExtractorXML extends Extractor implements CoreAttributeConstants {
    private static final long serialVersionUID = 3101230586822401584L;

    private static final Logger logger = Logger.getLogger(ExtractorXML.class.getName());

    /**
     * Regex matching the XML-escaped ampersand entity. The trailing semicolon is
     * part of the entity and must be consumed; otherwise {@code a=1&amp;b=2}
     * would be rewritten to {@code a=1&;b=2}.
     */
    private static final String ESCAPED_AMP = "&amp;";

    /**
     * Pattern used to find candidate URIs. Group 1 captures a run of characters
     * starting with {@code http:} (case-insensitive) that is preceded by a quote
     * or {@code >} and followed by a quote or {@code <}, with optional
     * surrounding whitespace. Note that only the {@code http:} scheme is matched.
     */
    static final String XML_URI_EXTRACTOR = "(?i)[\"'>]\\s*(http:[^\\s\"'<>]+)\\s*[\"'<]";

    /** Number of CrawlURIs whose content was selected for XML link extraction. */
    private long numberOfCURIsHandled;

    /** Running total of the match counts returned by {@link #processXml}. */
    private long numberOfLinksExtracted;

    /**
     * Creates the extractor.
     *
     * @param str name passed through to the superclass constructor
     */
    public ExtractorXML(String str) {
        super(str, "XML Extractor. Extracts links from XML/RSS.");
        // Both counters start at their default value of zero.
    }

    /**
     * Extracts links from the given URI's recorded content if it looks like XML.
     *
     * <p>Nothing is done when {@code isHttpTransactionContentToProcess} rejects
     * the URI, when the content type is {@code null}, or when neither the
     * content type nor the URI suffix indicates XML/RSS. If the replay character
     * sequence cannot be obtained the failure is logged and the method returns
     * without marking link extraction as finished.
     *
     * @param crawlURI the URI whose fetched content should be scanned
     */
    @Override // org.archive.crawler.extractor.Extractor
    public void extract(CrawlURI crawlURI) {
        if (!isHttpTransactionContentToProcess(crawlURI)) {
            return;
        }
        String contentType = crawlURI.getContentType();
        if (contentType == null) {
            return;
        }
        if (contentType.toLowerCase().indexOf("xml") < 0) {
            // Content type does not say XML; fall back to the URI suffix.
            String lowerCaseUri = crawlURI.toString().toLowerCase();
            if (!lowerCaseUri.endsWith(".rss") && !lowerCaseUri.endsWith(".xml")) {
                return;
            }
        }

        this.numberOfCURIsHandled++;

        ReplayCharSequence replayCharSequence = null;
        try {
            replayCharSequence = crawlURI.getHttpRecorder().getReplayCharSequence();
        } catch (IOException e) {
            logger.severe("Failed getting ReplayCharSequence: " + e.getMessage());
        }
        if (replayCharSequence == null) {
            logger.severe("Failed getting ReplayCharSequence: " + crawlURI.toString());
            return;
        }

        try {
            this.numberOfLinksExtracted += processXml(crawlURI, replayCharSequence, getController());
            crawlURI.linkExtractorFinished();
        } finally {
            // Always release the replay sequence, whether or not extraction
            // succeeded; a failure to close is logged but never masks the
            // exception (if any) propagating out of the try block.
            try {
                replayCharSequence.close();
            } catch (IOException e2) {
                logger.warning(TextUtils.exceptionToString("Failed close of ReplayCharSequence.", e2));
            }
        }
    }

    /**
     * Scans a character sequence for URIs and adds each one to the CrawlURI as a
     * speculative link.
     *
     * <p>Every occurrence of the {@code &amp;} entity in a captured URI is
     * replaced with a literal {@code &} before the link is created. A
     * {@link URIException} raised while creating a link is reported through the
     * controller when one is supplied, otherwise it is logged at INFO level;
     * in both cases scanning continues with the next match.
     *
     * @param crawlURI        the URI the links are added to
     * @param charSequence    the content to scan
     * @param crawlController controller used for URI error logging; may be
     *                        {@code null}
     * @return the number of pattern matches found, including matches whose link
     *         creation failed with a {@link URIException}
     */
    public static long processXml(CrawlURI crawlURI, CharSequence charSequence, CrawlController crawlController) {
        long foundLinks = 0;
        Matcher matcher = TextUtils.getMatcher(XML_URI_EXTRACTOR, charSequence);
        try {
            while (matcher.find()) {
                String xmlUri = TextUtils.replaceAll(ESCAPED_AMP, matcher.group(1), "&");
                foundLinks++;
                try {
                    // 'X' is the hop-type character passed for these links
                    // (presumably the speculative hop constant; confirm against Link).
                    crawlURI.createAndAddLink(xmlUri, Link.SPECULATIVE_MISC, 'X');
                } catch (URIException e) {
                    if (crawlController != null) {
                        crawlController.logUriError(e, crawlURI.getUURI(), xmlUri);
                    } else {
                        logger.info(crawlURI + ", " + xmlUri + WARCConstants.COLON_SPACE + e.getMessage());
                    }
                }
            }
        } finally {
            // Hand the matcher back even if an unchecked exception escapes the loop.
            TextUtils.recycleMatcher(matcher);
        }
        return foundLinks;
    }

    /**
     * Builds a short plain-text report with the handled-URI and extracted-link
     * counters.
     *
     * @return the multi-line report text
     */
    @Override // org.archive.crawler.framework.Processor
    public String report() {
        StringBuilder reportText = new StringBuilder();
        reportText.append("Processor: org.archive.crawler.extractor.ExtractorXML\n");
        reportText.append("  Function:          Link extraction on XML/RSS\n");
        reportText.append("  CrawlURIs handled: ").append(this.numberOfCURIsHandled).append("\n");
        reportText.append("  Links extracted:   ").append(this.numberOfLinksExtracted).append("\n\n");
        return reportText.toString();
    }
}
