package org.archive.crawler.postprocessor;

import java.util.HashSet;
import java.util.logging.Level;
import java.util.logging.Logger;
import javax.management.AttributeNotFoundException;
import org.apache.commons.httpclient.URIException;
import org.archive.crawler.datamodel.CandidateURI;
import org.archive.crawler.datamodel.CrawlURI;
import org.archive.crawler.datamodel.FetchStatusCodes;
import org.archive.crawler.deciderules.DecideRule;
import org.archive.crawler.deciderules.DecideRuleSequence;
import org.archive.crawler.extractor.Link;
import org.archive.crawler.framework.Scoper;
import org.archive.crawler.settings.MapType;
import org.archive.crawler.settings.SimpleType;

/* loaded from: input_file:site-search/heritrix/heritrix-1.12.1.jar:org/archive/crawler/postprocessor/LinksScoper.class */
/**
 * Processor that decides which of a {@link CrawlURI}'s discovered outlinks
 * (and its prerequisite, if any) are within the configured crawl scope.
 *
 * <p>Each outlink that is still a raw {@link Link} is turned into a
 * {@link CandidateURI} carrying a scheduling directive and a seed flag; the
 * URI's outlink collection is then replaced with only the in-scope
 * candidates.
 */
public class LinksScoper extends Scoper implements FetchStatusCodes {
    private static final long serialVersionUID = -4074442117992496793L;

    private static final Logger LOGGER = Logger.getLogger(LinksScoper.class.getName());

    /** Setting name: whether a redirect target of a seed is itself treated as a seed. */
    private static final String ATTR_SEED_REDIRECTS_NEW_SEEDS = "seed-redirects-new-seed";

    /** Setting name: decide rules selecting which scope-rejected URIs get logged. */
    public static final String ATTR_REJECTLOG_DECIDE_RULES = "scope-rejected-url-rules";

    /** Setting name: hop distance from a seed up to which links are scheduled preferentially. */
    public static final String ATTR_PREFERENCE_DEPTH_HOPS = "preference-depth-hops";

    // Use the cached boxed instances instead of the deprecated boxing constructors.
    private static final Boolean DEFAULT_SEED_REDIRECTS_NEW_SEEDS = Boolean.TRUE;
    private static final Integer DEFAULT_PREFERENCE_DEPTH_HOPS = Integer.valueOf(-1);

    // Never read inside this class; kept so the set of instance fields is unchanged.
    private MapType rejectLogFilters;

    /**
     * Creates the processor and registers its three settings.
     *
     * @param str name handed to the superclass constructor
     */
    public LinksScoper(String str) {
        super(str, "LinksScoper. Rules on which extracted links are within configured scope.");
        this.rejectLogFilters = null;
        addElementToDefinition(new SimpleType(ATTR_SEED_REDIRECTS_NEW_SEEDS, "If enabled, any URL found because a seed redirected to it (original seed returned 301 or 302), will also be treated as a seed.", DEFAULT_SEED_REDIRECTS_NEW_SEEDS)).setExpertSetting(true);
        addElementToDefinition(new SimpleType(ATTR_PREFERENCE_DEPTH_HOPS, "Number of hops (of any sort) from a seed up to which a URI has higher priority scheduling than any remaining seed. For example, if set to 1 items one hop (link, embed, redirect, etc.) away from a seed will be scheduled with HIGH priority. If set to -1, no preferencing will occur, and a breadth-first search with seeds processed before discovered links will proceed. If set to zero, a purely depth-first search will proceed, with all discovered links processed before remaining seeds.  Seed redirects are treated as one hop from a seed.", DEFAULT_PREFERENCE_DEPTH_HOPS)).setExpertSetting(true);
        addElementToDefinition(new DecideRuleSequence(ATTR_REJECTLOG_DECIDE_RULES, "DecideRules which, if their final decision on a link is not REJECT, cause the otherwise scope-rejected links to be logged"));
    }

    /**
     * Scopes the prerequisite or the outlinks of the given URI.
     *
     * <ul>
     *   <li>If the URI has a prerequisite, only the prerequisite is handled.</li>
     *   <li>If the fetch status is below 200 or at least 400, all outlinks are
     *       cleared.</li>
     *   <li>Otherwise every outlink is scope-tested and the outlink collection
     *       is replaced by the in-scope {@link CandidateURI}s.</li>
     * </ul>
     *
     * @param crawlURI the URI whose outlinks are to be scoped
     */
    @Override
    protected void innerProcess(CrawlURI crawlURI) {
        if (LOGGER.isLoggable(Level.FINEST)) {
            LOGGER.finest(getName() + " processing " + crawlURI);
        }
        if (crawlURI.hasPrerequisiteUri()) {
            handlePrerequisite(crawlURI);
            return;
        }
        int fetchStatus = crawlURI.getFetchStatus();
        if (fetchStatus < 200 || fetchStatus >= 400) {
            // Not a 2xx/3xx result: nothing discovered here should be followed.
            crawlURI.clearOutlinks();
            return;
        }
        if (crawlURI.outlinksSize() <= 0) {
            return;
        }

        boolean seedRedirectsNewSeeds =
                ((Boolean) getUncheckedAttribute(crawlURI, ATTR_SEED_REDIRECTS_NEW_SEEDS)).booleanValue();
        int preferenceDepthHops =
                ((Integer) getUncheckedAttribute(crawlURI, ATTR_PREFERENCE_DEPTH_HOPS)).intValue();

        HashSet<CandidateURI> inScope = new HashSet<CandidateURI>();
        for (Object outlink : crawlURI.getOutObjects()) {
            if (outlink instanceof Link) {
                Link link = (Link) outlink;
                try {
                    CandidateURI candidate = crawlURI.createCandidateURI(
                            crawlURI.getBaseURI(),
                            link,
                            getSchedulingFor(crawlURI, link, preferenceDepthHops),
                            considerAsSeed(crawlURI, link, seedRedirectsNewSeeds));
                    if (isInScope(candidate)) {
                        inScope.add(candidate);
                    }
                } catch (URIException e) {
                    // A malformed link is reported and skipped; the remaining links are still processed.
                    getController().logUriError(e, crawlURI.getUURI(), link.getDestination().toString());
                }
            } else if (outlink instanceof CandidateURI) {
                CandidateURI candidate = (CandidateURI) outlink;
                if (isInScope(candidate)) {
                    inScope.add(candidate);
                }
            } else {
                LOGGER.severe("Unexpected type: " + outlink);
            }
        }
        crawlURI.replaceOutlinks(inScope);
    }

    /**
     * Converts the URI's prerequisite link into a force-fetched
     * {@link CandidateURI} scheduled one directive level below the URI's own
     * directive value (floored at 0), and scope-tests it.
     *
     * <p>An in-scope prerequisite replaces the original prerequisite on the
     * URI. An out-of-scope prerequisite sets the URI's fetch status to -63
     * (presumably the {@link FetchStatusCodes} value for an unschedulable
     * prerequisite; confirm against that interface).
     *
     * @param crawlURI the URI whose prerequisite is to be scoped
     */
    protected void handlePrerequisite(CrawlURI crawlURI) {
        try {
            CandidateURI prerequisite = crawlURI.createCandidateURI(crawlURI.getBaseURI(), (Link) crawlURI.getPrerequisiteUri());
            int directive = crawlURI.getSchedulingDirective() - 1;
            if (directive < 0) {
                directive = 0;
                LOGGER.severe("Unable to promote prerequisite " + prerequisite + " above " + crawlURI);
            }
            prerequisite.setSchedulingDirective(directive);
            prerequisite.setForceFetch(true);
            if (isInScope(prerequisite)) {
                crawlURI.setPrerequisiteUri(prerequisite);
            } else {
                crawlURI.setFetchStatus(-63);
            }
        } catch (NumberFormatException e) {
            logPrerequisiteError(crawlURI, e);
        } catch (URIException e) {
            logPrerequisiteError(crawlURI, e);
        }
    }

    /** Records a failure to build the prerequisite candidate in the controller's URI-error log. */
    private void logPrerequisiteError(CrawlURI crawlURI, Exception cause) {
        getController().uriErrors.log(Level.INFO, cause.getMessage(), new Object[]{crawlURI, crawlURI.getPrerequisiteUri()});
    }

    /**
     * Runs the superclass handling, then, when INFO logging is enabled, logs
     * the rejected URI if the reject-log decide rules accept it.
     *
     * <p>NOTE(review): the decompiler reports that this method's access
     * modifier was changed from {@code protected}; it is kept {@code public}
     * here to match the class as shipped.
     *
     * @param candidateURI the URI that was found to be out of scope
     */
    @Override
    public void outOfScope(CandidateURI candidateURI) {
        super.outOfScope(candidateURI);
        if (!LOGGER.isLoggable(Level.INFO)) {
            return;
        }
        // The rules are evaluated against a CrawlURI, so wrap plain candidates.
        CrawlURI crawlURI = candidateURI instanceof CrawlURI
                ? (CrawlURI) candidateURI
                : new CrawlURI(candidateURI.getUURI());
        if (rulesAccept(getRejectLogRules(crawlURI), crawlURI)) {
            LOGGER.info(crawlURI.getUURI().toString());
        }
    }

    /**
     * Looks up the reject-log decide rules for the given settings context.
     *
     * @param obj context object passed through to {@code getAttribute}
     * @return the configured rule
     * @throws RuntimeException wrapping the {@link AttributeNotFoundException}
     *         if the attribute cannot be found
     */
    protected DecideRule getRejectLogRules(Object obj) {
        try {
            return (DecideRule) getAttribute(obj, ATTR_REJECTLOG_DECIDE_RULES);
        } catch (AttributeNotFoundException e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * Tells whether a link should be flagged as a seed: true only when the
     * source URI is a seed whose fetch status is 301 or 302, the link's hop
     * type is {@code 'R'} and the seed-redirect setting is enabled.
     */
    private boolean considerAsSeed(CrawlURI crawlURI, Link link, boolean z) {
        if (!crawlURI.isSeed()) {
            return false;
        }
        boolean redirected = crawlURI.getFetchStatus() == 301 || crawlURI.getFetchStatus() == 302;
        return redirected && link.getHopType() == 'R' && z;
    }

    /**
     * Chooses the scheduling directive for a link discovered on a URI.
     *
     * <ul>
     *   <li>Hop type {@code 'R'}: 1 when {@code i >= 0}, otherwise 2.</li>
     *   <li>Any other hop type: 1 when {@code i == 0}, or when {@code i > 0}
     *       and the link would lie at most {@code i} hops from the seed;
     *       otherwise 3.</li>
     * </ul>
     *
     * <p>The setting's description associates the value 1 with HIGH priority;
     * 2 and 3 are presumably lower-priority directive constants declared on
     * {@link CandidateURI} (confirm there).
     *
     * @param crawlURI the URI the link was found on
     * @param link the discovered link
     * @param i the configured preference depth in hops
     * @return the scheduling directive value for the link
     */
    protected int getSchedulingFor(CrawlURI crawlURI, Link link, int i) {
        char hopType = link.getHopType();
        if (LOGGER.isLoggable(Level.FINEST)) {
            LOGGER.finest(crawlURI + " with path=" + crawlURI.getPathFromSeed() + " isSeed=" + crawlURI.isSeed() + " with fetchStatus=" + crawlURI.getFetchStatus() + " -> " + link.getDestination() + " type " + hopType + " with context=" + link.getContext());
        }
        if (hopType == 'R') {
            return i >= 0 ? 1 : 2;
        }
        if (i == 0) {
            return 1;
        }
        // The path from the seed is only consulted for a positive depth; +1 counts the hop to this link.
        boolean withinPreferredDepth = i > 0 && crawlURI.getPathFromSeed().length() + 1 <= i;
        return withinPreferredDepth ? 1 : 3;
    }
}
