package org.archive.crawler.processor;

import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.PrintWriter;
import java.util.HashMap;
import java.util.Iterator;
import javax.management.AttributeNotFoundException;
import org.apache.commons.cli.HelpFormatter;
import org.archive.crawler.datamodel.CandidateURI;
import org.archive.crawler.datamodel.CrawlURI;
import org.archive.crawler.datamodel.FetchStatusCodes;
import org.archive.crawler.deciderules.DecideRule;
import org.archive.crawler.deciderules.DecideRuleSequence;
import org.archive.crawler.framework.Processor;
import org.archive.crawler.settings.SimpleType;
import org.archive.io.ArchiveFileConstants;
import org.archive.util.ArchiveUtils;
import org.archive.util.fingerprint.ArrayLongFPCache;
import st.ata.util.FPGenerator;

/* loaded from: input_file:site-search/heritrix/heritrix-1.12.1.jar:org/archive/crawler/processor/CrawlMapper.class */
public abstract class CrawlMapper extends Processor implements FetchStatusCodes {
    /** Setting name: whether the URI being processed is itself subject to mapping. */
    public static final String ATTR_CHECK_URI = "check-uri";

    /** Setting name: whether outlink candidates of the processed URI are subject to mapping. */
    public static final String ATTR_CHECK_OUTLINKS = "check-outlinks";

    /** Setting name of the decide-rule sequence consulted before an outlink is mapped. */
    public static final String ATTR_MAP_OUTLINK_DECIDE_RULES = "decide-rules";

    /** Setting name: the name of the local crawler node. */
    public static final String ATTR_LOCAL_NAME = "local-name";

    /** Default value of {@link #ATTR_LOCAL_NAME}. */
    public static final String DEFAULT_LOCAL_NAME = ".";

    /** Setting name: directory into which diversion logs are written. */
    public static final String ATTR_DIVERSION_DIR = "diversion-dir";

    /** Default value of {@link #ATTR_DIVERSION_DIR}. */
    public static final String DEFAULT_DIVERSION_DIR = "diversions";

    /** Setting name: number of leading timestamp digits used as the log-name prefix. */
    public static final String ATTR_ROTATION_DIGITS = "rotation-digits";

    /** Currently open diversion logs, keyed by the target name they divert to. */
    HashMap<String, PrintWriter> diversionLogs;

    /** Timestamp prefix shared by the names of all currently open diversion logs. */
    String logGeneration;

    /** Name of the local node; loaded from {@link #ATTR_LOCAL_NAME} in {@code initialTasks()}. */
    protected String localName;

    /** Fingerprint cache used to avoid logging the same diverted URI repeatedly. */
    protected ArrayLongFPCache cache;

    /** Default value of {@link #ATTR_CHECK_URI}. */
    public static final Boolean DEFAULT_CHECK_URI = Boolean.TRUE;

    /** Default value of {@link #ATTR_CHECK_OUTLINKS}. */
    public static final Boolean DEFAULT_CHECK_OUTLINKS = Boolean.TRUE;

    /**
     * Default value of {@link #ATTR_ROTATION_DIGITS}. Obtained through
     * {@code Integer.valueOf} rather than the deprecated {@code Integer(int)}
     * constructor.
     */
    public static final Integer DEFAULT_ROTATION_DIGITS = Integer.valueOf(10);

    /* JADX INFO: Access modifiers changed from: private */
    /* loaded from: input_file:site-search/heritrix/heritrix-1.12.1.jar:org/archive/crawler/processor/CrawlMapper$FilePrintWriter.class */
    /**
     * A {@link PrintWriter} that writes, through a buffered stream, to a file
     * and keeps a reference to that file so it can be retrieved later.
     */
    public class FilePrintWriter extends PrintWriter {
        /** The file this writer was opened on. */
        File file;

        /**
         * Opens a buffered writer on the given file.
         *
         * @param target file to write to
         * @throws FileNotFoundException if the file cannot be opened for writing
         */
        public FilePrintWriter(File target) throws FileNotFoundException {
            super(new BufferedOutputStream(new FileOutputStream(target)));
            file = target;
        }

        /**
         * @return the file passed to the constructor
         */
        public File getFile() {
            return file;
        }
    }

    public CrawlMapper(String str, String str2) {
        super(str, str2);
        this.diversionLogs = new HashMap<>();
        this.logGeneration = "";
        addElementToDefinition(new SimpleType(ATTR_LOCAL_NAME, "Name of local crawler node; mappings to this name result in normal processing (no diversion).", "."));
        addElementToDefinition(new SimpleType(ATTR_DIVERSION_DIR, "Directory to write diversion logs.", DEFAULT_DIVERSION_DIR));
        addElementToDefinition(new SimpleType(ATTR_CHECK_URI, "Whether to apply the mapping to a URI being processed itself, for example early in processing (while its status is still 'unattempted').", DEFAULT_CHECK_URI));
        addElementToDefinition(new SimpleType(ATTR_CHECK_OUTLINKS, "Whether to apply the mapping to discovered outlinks, for example after extraction has occurred. ", DEFAULT_CHECK_OUTLINKS));
        addElementToDefinition(new DecideRuleSequence("decide-rules"));
        addElementToDefinition(new SimpleType(ATTR_ROTATION_DIGITS, "Number of timestamp digits to use as prefix of log names (grouping all diversions from that period in a single log). Default is 10 (hourly log rotation).", DEFAULT_ROTATION_DIGITS));
    }

    /**
     * Applies the mapping to the given URI and/or its outlink candidates.
     *
     * <p>First rotates the diversion logs if the current timestamp prefix
     * differs from the active log generation. Then, if the URI's fetch status
     * is 0 and {@link #ATTR_CHECK_URI} is enabled, a URI mapping to a name
     * other than {@link #localName} is marked as blocked, annotated with
     * {@code "to:" + target}, skipped to the postprocessor chain and written
     * to the diversion log. Finally, if {@link #ATTR_CHECK_OUTLINKS} is
     * enabled, each outlink candidate accepted by
     * {@link #decideToMapOutlink} that maps elsewhere is removed from the
     * URI and written to the diversion log.
     *
     * @param crawlURI the URI currently being processed
     */
    @Override // org.archive.crawler.framework.Processor
    protected void innerProcess(CrawlURI crawlURI) {
        String generation = currentLogGeneration();
        if (!generation.equals(this.logGeneration)) {
            updateGeneration(generation);
        }

        // A status of 0 is what the 'check-uri' setting text calls 'unattempted'.
        if (crawlURI.getFetchStatus() == 0 && ((Boolean) getUncheckedAttribute(null, ATTR_CHECK_URI)).booleanValue()) {
            String target = map(crawlURI);
            if (!this.localName.equals(target)) {
                crawlURI.setFetchStatus(FetchStatusCodes.S_BLOCKED_BY_CUSTOM_PROCESSOR);
                crawlURI.addAnnotation("to:" + target);
                crawlURI.skipToProcessorChain(getController().getPostprocessorChain());
                divertLog(crawlURI, target);
            }
        }

        if (!((Boolean) getUncheckedAttribute(null, ATTR_CHECK_OUTLINKS)).booleanValue()) {
            return;
        }
        // The loop is no longer gated on getOutLinks().size(): that tests a
        // different collection than the one iterated here, and iterating an
        // empty candidate collection is already a no-op.
        // NOTE(review): presumably getOutLinks() and getOutCandidates() are
        // disjoint views, in which case the old guard skipped candidates once
        // links had been promoted - confirm against CrawlURI.
        Iterator<CandidateURI> candidates = crawlURI.getOutCandidates().iterator();
        while (candidates.hasNext()) {
            CandidateURI candidate = candidates.next();
            if (!decideToMapOutlink(candidate)) {
                continue;
            }
            String target = map(candidate);
            if (!this.localName.equals(target)) {
                candidates.remove();
                divertLog(candidate, target);
            }
        }
    }

    /**
     * Returns the leading {@link #ATTR_ROTATION_DIGITS} characters of the
     * current 14-digit timestamp. The configured count is clamped to the
     * timestamp's length (and to zero) so an out-of-range setting cannot
     * raise {@code StringIndexOutOfBoundsException} for every processed URI.
     */
    private String currentLogGeneration() {
        String timestamp = ArchiveUtils.get14DigitDate();
        int configuredDigits = ((Integer) getUncheckedAttribute(null, ATTR_ROTATION_DIGITS)).intValue();
        int digits = Math.max(0, Math.min(configuredDigits, timestamp.length()));
        return timestamp.substring(0, digits);
    }

    /**
     * Decides whether an outlink candidate should be run through the mapping.
     *
     * @param candidateURI the outlink candidate under consideration
     * @return {@code false} only when the configured decide rule answers
     *         {@code DecideRule.REJECT} for the candidate; {@code true} otherwise
     */
    protected boolean decideToMapOutlink(CandidateURI candidateURI) {
        DecideRule rule = getMapOutlinkDecideRule(candidateURI);
        Object decision = rule.decisionFor(candidateURI);
        if (decision.equals(DecideRule.REJECT)) {
            return false;
        }
        return true;
    }

    /**
     * Looks up the decide rule stored under the {@code "decide-rules"} setting.
     *
     * @param obj context object handed to the attribute lookup
     * @return the configured decide rule
     * @throws RuntimeException wrapping the {@link AttributeNotFoundException}
     *         if the setting is not defined
     */
    protected DecideRule getMapOutlinkDecideRule(Object obj) {
        Object configured;
        try {
            configured = getAttribute(obj, ATTR_MAP_OUTLINK_DECIDE_RULES);
        } catch (AttributeNotFoundException missing) {
            throw new RuntimeException(missing);
        }
        return (DecideRule) configured;
    }

    /**
     * Closes every open diversion log, renames each from its {@code .open}
     * name to the matching {@code .divert} name, forgets them, and records
     * the new log generation.
     *
     * <p>A failed rename is reported as a warning instead of being silently
     * ignored; the log is still dropped from the open set, because its writer
     * has already been closed.
     *
     * @param str the timestamp prefix of the generation that starts now
     */
    protected synchronized void updateGeneration(String str) {
        for (PrintWriter log : this.diversionLogs.values()) {
            log.close();
            if (!(log instanceof FilePrintWriter)) {
                // Without a backing file there is nothing to rename; closing
                // is all that can be done, and the remaining logs still get
                // processed instead of the loop dying on a ClassCastException.
                continue;
            }
            File openFile = ((FilePrintWriter) log).getFile();
            File finishedFile = new File(openFile.getAbsolutePath().replaceFirst("\\.open$", ".divert"));
            if (!openFile.renameTo(finishedFile)) {
                java.util.logging.Logger.getLogger(CrawlMapper.class.getName())
                        .warning("Unable to rename diversion log " + openFile + " to " + finishedFile);
            }
        }
        this.diversionLogs.clear();
        this.logGeneration = str;
    }

    /**
     * Maps a URI to the name of the node it belongs to.
     *
     * <p>{@code innerProcess} compares the result with {@code localName}: a
     * result equal to it leaves the URI untouched, while any other result
     * causes the URI to be diverted, with the result used as the diversion
     * target (it becomes the key of the diversion log and part of the log's
     * file name).
     *
     * @param candidateURI the URI (or outlink candidate) to map
     * @return the name the URI maps to
     */
    protected abstract String map(CandidateURI candidateURI);

    /**
     * Writes a one-line report of a diverted URI to the diversion log of its
     * target, unless the URI was recently logged already.
     *
     * @param candidateURI the URI being diverted
     * @param str          the target name the URI is diverted to
     */
    protected synchronized void divertLog(CandidateURI candidateURI, String str) {
        if (!recentlySeen(candidateURI)) {
            PrintWriter targetLog = getDiversionLog(str);
            candidateURI.singleLineReportTo(targetLog);
            targetLog.println();
        }
    }

    /**
     * Records the fingerprint of the URI's string form in the cache and
     * reports whether the cache declined to add it.
     *
     * @param candidateURI the URI to check
     * @return the negation of what {@code cache.add} returned (presumably
     *         {@code true} when the fingerprint was already cached; confirm
     *         against ArrayLongFPCache)
     */
    private boolean recentlySeen(CandidateURI candidateURI) {
        String uriText = candidateURI.toString();
        long fingerprint = FPGenerator.std64.fp(uriText);
        boolean added = this.cache.add(fingerprint);
        return !added;
    }

    /**
     * Returns the open diversion log for the given target, creating it on
     * first use.
     *
     * <p>A new log is placed in the directory named by
     * {@link #ATTR_DIVERSION_DIR} (resolved against the controller's disk
     * directory when relative) and is named
     * {@code <logGeneration>-<localName>-to-<target>} followed by
     * {@code ArchiveFileConstants.OCCUPIED_SUFFIX}.
     *
     * @param str the target name whose log is wanted
     * @return the writer for that target's current log
     * @throws RuntimeException wrapping the {@link FileNotFoundException} if
     *         the log file cannot be opened
     */
    protected PrintWriter getDiversionLog(String str) {
        FilePrintWriter existing = (FilePrintWriter) this.diversionLogs.get(str);
        if (existing != null) {
            return existing;
        }

        String configuredDir = (String) getUncheckedAttribute(null, ATTR_DIVERSION_DIR);
        File diversionDir = new File(configuredDir);
        if (!diversionDir.isAbsolute()) {
            diversionDir = new File(getSettingsHandler().getOrder().getController().getDisk(), configuredDir);
        }
        // The result is deliberately not checked: if the directory could not
        // be created, opening the file below fails and is reported there.
        diversionDir.mkdirs();

        // A plain "-" separates the name parts; the previous reference to the
        // command-line help formatter's option-prefix constant was an
        // unrelated coupling.
        File logFile = new File(diversionDir,
                this.logGeneration + "-" + this.localName + "-to-" + str + ArchiveFileConstants.OCCUPIED_SUFFIX);
        try {
            FilePrintWriter created = new FilePrintWriter(logFile);
            this.diversionLogs.put(str, created);
            return created;
        } catch (FileNotFoundException e) {
            // Rethrow once, with context and cause, rather than also dumping
            // the stack trace to stderr.
            throw new RuntimeException("Unable to open diversion log " + logFile, e);
        }
    }

    /* JADX INFO: Access modifiers changed from: protected */
    /**
     * Runs the superclass initialisation, then loads the local node name from
     * the {@code "local-name"} setting and creates a fresh fingerprint cache.
     */
    @Override // org.archive.crawler.framework.Processor
    public void initialTasks() {
        super.initialTasks();
        Object configuredName = getUncheckedAttribute(null, ATTR_LOCAL_NAME);
        this.localName = (String) configuredName;
        this.cache = new ArrayLongFPCache();
    }
}
