package org.archive.crawler.extractor;

import java.util.Collection;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import org.apache.commons.httpclient.URIException;
import org.archive.crawler.datamodel.CoreAttributeConstants;
import org.archive.crawler.datamodel.CrawlURI;
import org.archive.crawler.settings.SimpleType;
import org.archive.util.TextUtils;

/* loaded from: input_file:site-search/heritrix/heritrix-1.12.1.jar:org/archive/crawler/extractor/ExtractorImpliedURI.class */
/**
 * Implied URI extractor: finds URIs implied by other, already-discovered
 * URIs according to a regex/replacement pattern pair.
 *
 * <p>For every outlink of the processed {@link CrawlURI} whose destination
 * fully matches the configured {@value #ATTR_TRIGGER_REGEXP}, a new link is
 * built from {@value #ATTR_BUILD_PATTERN} (which may reference the trigger's
 * capturing groups) and added to the outlinks. Optionally, the triggering
 * link itself is removed (see {@value #ATTR_REMOVE_TRIGGER_URIS}).
 *
 * <p>Should appear after most other extractors, since it works on the
 * outlinks they have already collected.
 */
public class ExtractorImpliedURI extends Extractor implements CoreAttributeConstants {
    private static final long serialVersionUID = 8579045413127769497L;
    private static final Logger LOGGER = Logger.getLogger(ExtractorImpliedURI.class.getName());

    /** Setting name: regular expression a discovered URI must match to trigger a build. */
    public static final String ATTR_TRIGGER_REGEXP = "trigger-regexp";
    /** Setting name: replacement pattern used to build the implied URI. */
    public static final String ATTR_BUILD_PATTERN = "build-pattern";
    /** Setting name: whether URIs matching the trigger are removed from the outlinks. */
    public static final String ATTR_REMOVE_TRIGGER_URIS = "remove-trigger-uris";

    /** Number of CrawlURIs passed to {@link #extract(CrawlURI)}. */
    private long numberOfCURIsHandled;
    /** Implied links added, minus trigger links successfully removed. */
    private long numberOfLinksExtracted;

    /**
     * Creates the extractor and registers its three settings with their
     * defaults (empty trigger, empty build pattern, removal disabled).
     *
     * @param str name of this processor, passed to the superclass
     */
    public ExtractorImpliedURI(String str) {
        super(str, "Implied URI Extractor. Finds URIs implied by other URIs according to regex/replacement patterns. Should appear after most other extractors.");
        addElementToDefinition(new SimpleType(ATTR_TRIGGER_REGEXP, "Triggering regular expression. When a discovered URI matches this pattern, the 'implied' URI will be built. The capturing groups of this expression are available for the build replacement pattern.", ""));
        addElementToDefinition(new SimpleType(ATTR_BUILD_PATTERN, "Replacement pattern to build 'implied' URI, using captured groups of trigger expression.", ""));
        addElementToDefinition(new SimpleType(ATTR_REMOVE_TRIGGER_URIS, "If true, all URIs that match trigger regular expression are removed from the list of extracted URIs. Default is false.", Boolean.FALSE));
    }

    /**
     * Scans the current outlinks of {@code crawlURI} and adds an implied link
     * for each one whose destination matches the trigger expression.
     *
     * @param crawlURI the URI being processed; its outlinks are read and modified
     */
    @Override
    public void extract(CrawlURI crawlURI) {
        this.numberOfCURIsHandled++;

        // The settings are looked up with the same context for every link,
        // so read them once instead of once per outlink.
        final String triggerRegexp = (String) getUncheckedAttribute(crawlURI, ATTR_TRIGGER_REGEXP);
        final String buildPattern = (String) getUncheckedAttribute(crawlURI, ATTR_BUILD_PATTERN);
        if (triggerRegexp == null || triggerRegexp.length() == 0 || buildPattern == null) {
            // No usable trigger/build pair: nothing can be implied.
            return;
        }
        // Boolean.TRUE.equals tolerates a missing (null) setting value.
        final boolean removeTriggerUris =
                Boolean.TRUE.equals(getUncheckedAttribute(crawlURI, ATTR_REMOVE_TRIGGER_URIS));

        // Iterate over a snapshot: the loop body adds to and removes from the
        // live outlinks collection.
        final Collection<Link> outLinks = crawlURI.getOutLinks();
        final Link[] snapshot = outLinks.toArray(new Link[outLinks.size()]);
        for (Link link : snapshot) {
            final String implied = extractImplied(link.getDestination(), triggerRegexp, buildPattern);
            if (implied == null) {
                continue;
            }
            try {
                // NOTE(review): 'X' is the hop-type character handed to
                // createAndAddLink; presumably the "speculative" hop - confirm against Link.
                crawlURI.createAndAddLink(implied, Link.SPECULATIVE_MISC, 'X');
                this.numberOfLinksExtracted++;
                if (removeTriggerUris) {
                    removeTriggerLink(crawlURI, link);
                }
            } catch (URIException e) {
                LOGGER.log(Level.FINE, "bad URI", (Throwable) e);
            }
        }
    }

    /**
     * Removes the triggering link from the outlinks of {@code crawlURI},
     * logging the outcome at FINE level. A successful removal decrements the
     * extracted-links counter.
     */
    private void removeTriggerLink(CrawlURI crawlURI, Link link) {
        final boolean removed = crawlURI.getOutLinks().remove(link);
        if (removed) {
            this.numberOfLinksExtracted--;
        }
        // Only build the message when it will actually be emitted.
        if (LOGGER.isLoggable(Level.FINE)) {
            if (removed) {
                LOGGER.log(Level.FINE, link.getDestination() + " has been removed from " + link.getSource() + " outlinks list.");
            } else {
                LOGGER.log(Level.FINE, "Failed to remove " + link.getDestination() + " from " + link.getSource() + " outlinks list.");
            }
        }
    }

    /**
     * Builds the URI implied by {@code charSequence}, if any.
     *
     * @param charSequence the discovered URI to test
     * @param str          the trigger regular expression; it must match the
     *                     whole of {@code charSequence}
     * @param str2         the replacement pattern, which may reference the
     *                     trigger's capturing groups
     * @return the built URI string, or {@code null} when the trigger is empty,
     *         any argument is {@code null}, or the input does not match
     */
    public static String extractImplied(CharSequence charSequence, String str, String str2) {
        if (charSequence == null || str == null || str2 == null || str.length() == 0) {
            return null;
        }
        final Matcher matcher = TextUtils.getMatcher(str, charSequence);
        try {
            return matcher.matches() ? matcher.replaceFirst(str2) : null;
        } finally {
            // Always hand the matcher back to TextUtils, including when the
            // input does not match or replaceFirst throws.
            TextUtils.recycleMatcher(matcher);
        }
    }

    /**
     * Produces a short plain-text report with this processor's counters.
     *
     * @return the report text
     */
    @Override
    public String report() {
        final StringBuilder out = new StringBuilder();
        out.append("Processor: ").append(ExtractorImpliedURI.class.getName()).append("\n");
        out.append("  Function:          Extracts links inside other URIs\n");
        out.append("  CrawlURIs handled: ").append(this.numberOfCURIsHandled).append("\n");
        out.append("  Links extracted:   ").append(this.numberOfLinksExtracted).append("\n\n");
        return out.toString();
    }
}
