package org.archive.crawler.extractor;

import java.io.IOException;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import javax.management.AttributeNotFoundException;
import org.archive.crawler.datamodel.CrawlURI;
import org.archive.crawler.framework.Processor;
import org.archive.crawler.settings.SimpleType;
import org.archive.io.ReplayCharSequence;
import org.archive.net.UURIFactory;
import org.archive.util.Base32;
import org.archive.util.TextUtils;

/* loaded from: input_file:site-search/heritrix/heritrix-1.12.1.jar:org/archive/crawler/extractor/HTTPContentDigest.class */
/**
 * Processor that recalculates the SHA1 content digest of text documents
 * fetched over HTTP, optionally replacing every region that matches a
 * configurable regular expression with a blank before digesting.
 *
 * <p>Only HTTP transactions whose content type starts with {@code text} are
 * considered, and only when the content size does not exceed the configured
 * {@value #ATTR_MAX_SIZE_BYTES} setting (a negative setting disables the size
 * check). The stripping affects the digest input only; the recorded document
 * itself is not modified by this class.
 */
public class HTTPContentDigest extends Processor {
    private static final long serialVersionUID = 8055532198737384358L;

    /** Name of the setting holding the regular expression whose matches are blanked out. */
    public static final String ATTR_STRIP_REG_EXPR = "strip-reg-expr";

    /** Default strip expression: empty, meaning the document is digested unmodified. */
    protected static final String DEFAULT_STRIP_REG_EXPR = "";

    /** Name of the setting holding the maximum document size (bytes) to process. */
    public static final String ATTR_MAX_SIZE_BYTES = "max-size-bytes";

    /** Default maximum document size: 1048576 bytes. */
    protected static final Long DEFAULT_MAX_SIZE_BYTES = Long.valueOf(1048576L);

    /** Digest algorithm name, used both for the JCA lookup and when storing the digest. */
    private static final String SHA1 = "SHA1";

    /** Content types eligible for digest recalculation. */
    private static final String TEXT_CONTENT_TYPE_REGEX = "^text.*$";

    private static final Logger logger = Logger.getLogger(HTTPContentDigest.class.getName());

    /**
     * Creates the processor and registers its two settings.
     *
     * @param str the processor name handed to the {@link Processor} constructor
     */
    public HTTPContentDigest(String str) {
        super(str, "Calculate custom - stripped - content digests. "
                + "A processor for calculating custom HTTP content digests in place of the default (if any) computed by the HTTP fetcher processors. "
                + "This processor enables you to specify a regular expression called strip-reg-expr. "
                + "Any segment of a document (text only, binary files will be skipped) that matches this regular expression will be rewritten with the blank character (character 32 in the ANSI character set) FOR THE PURPOSE OF THE DIGEST, this has no effect on the document for subsequent processing or archiving. "
                + "You can also specify a maximum length for documents being evaluated by this processor. "
                + "Documents exceeding that length will be ignored. "
                + "To further discriminate by file type or URL, you should use the override and refinement options (the processor can be disabled by default and only enabled as needed in overrides and refinements. "
                + "It is generally recommended that this recalculation only be performed when absolutely needed (because of stripping data that changes automatically each time the URL is fetched) as this is an expensive operation.");
        addElementToDefinition(new SimpleType(ATTR_STRIP_REG_EXPR, "A regular expression that matches those portions of downloaded documents that need to be ignored when calculating the content digest. Segments matching this expression will be rewritten with the blank character for the content digest.", DEFAULT_STRIP_REG_EXPR));
        addElementToDefinition(new SimpleType(ATTR_MAX_SIZE_BYTES, "Maximum size of of documents to recalculate the digest for. Documents that exceed this value (bytes) will be ignored. Defaults to 1048576 bytes, or 1 MB. -1 denotes unlimited size. A setting of 0 will effectively disable the processor.", DEFAULT_MAX_SIZE_BYTES));
    }

    /**
     * Recalculates and stores the content digest of {@code crawlURI} when it is
     * an HTTP transaction with a {@code text*} content type that is within the
     * configured size limit. Does nothing otherwise.
     *
     * @param crawlURI the URI whose recorded content should be re-digested
     * @throws InterruptedException declared by the overridden method; nothing
     *         in this implementation throws it explicitly
     */
    @Override // org.archive.crawler.framework.Processor
    protected void innerProcess(CrawlURI crawlURI) throws InterruptedException {
        if (!crawlURI.isHttpTransaction()) {
            return;
        }

        // A missing content type cannot be classified as text, so skip it
        // rather than handing null to the regex matcher.
        String contentType = crawlURI.getContentType();
        if (contentType == null || !TextUtils.matches(TEXT_CONTENT_TYPE_REGEX, contentType)) {
            return;
        }

        // Any negative limit switches the size check off.
        long maxSizeBytes = lookupMaxSizeBytes(crawlURI);
        if (maxSizeBytes >= 0 && maxSizeBytes < crawlURI.getContentSize()) {
            return;
        }

        String stripRegex;
        try {
            stripRegex = (String) getAttribute(crawlURI, ATTR_STRIP_REG_EXPR);
        } catch (AttributeNotFoundException e) {
            logger.severe("Missing strip-reg-exp when processing " + crawlURI.toString());
            return;
        }

        recalculateDigest(crawlURI, stripRegex);
    }

    /**
     * Reads the size limit setting for {@code crawlURI}, logging and falling
     * back to {@link #DEFAULT_MAX_SIZE_BYTES} when the attribute is missing.
     */
    private long lookupMaxSizeBytes(CrawlURI crawlURI) {
        try {
            return ((Long) getAttribute(crawlURI, ATTR_MAX_SIZE_BYTES)).longValue();
        } catch (AttributeNotFoundException e) {
            logger.severe("Missing max-size-bytes attribute when processing " + crawlURI.toString());
            return DEFAULT_MAX_SIZE_BYTES.longValue();
        }
    }

    /**
     * Digests the (optionally stripped) recorded content of {@code crawlURI}
     * and stores the result on it. The replay sequence is always closed, and
     * no exception is allowed to escape: failures are logged and, except for
     * a missing SHA1 algorithm, also recorded on the URI as a localized error.
     */
    private void recalculateDigest(CrawlURI crawlURI, String stripRegex) {
        try {
            ReplayCharSequence replay = crawlURI.getHttpRecorder().getReplayCharSequence();
            try {
                MessageDigest sha1 = MessageDigest.getInstance(SHA1);
                // NOTE(review): getBytes() uses the platform default charset, so
                // digests may differ between differently configured JVMs. Left
                // as-is because switching charset would change digest values.
                sha1.update(textToDigest(replay, stripRegex).getBytes());
                byte[] newDigest = sha1.digest();

                if (logger.isLoggable(Level.FINEST)) {
                    // There may be no previous digest; guard so that merely
                    // enabling FINEST logging cannot prevent the new digest
                    // from being stored below.
                    byte[] oldDigest = (byte[]) crawlURI.getContentDigest();
                    String oldDigestText = (oldDigest == null) ? "null" : Base32.encode(oldDigest);
                    logger.finest("Recalculated content digest for " + crawlURI.toString() + " old: " + oldDigestText + ", new: " + Base32.encode(newDigest));
                }
                crawlURI.setContentDigest(SHA1, newDigest);
            } catch (NoSuchAlgorithmException e) {
                logger.log(Level.SEVERE, "SHA1 message digest algorithm unavailable; content digest not recalculated for " + crawlURI.toString(), e);
            } finally {
                closeQuietly(replay);
            }
        } catch (Exception e) {
            // NOTE(review): this handler also receives failures raised while
            // stripping/digesting, not only while opening the replay sequence,
            // so the message below can be misleading. Scope and text are kept
            // unchanged so that no new exception type escapes innerProcess.
            crawlURI.addLocalizedError(getName(), e, "Failed get of replay char sequence " + crawlURI.toString() + UURIFactory.SPACE + e.getMessage());
            logger.warning("Failed get of replay char sequence " + crawlURI.toString() + UURIFactory.SPACE + e.getMessage() + UURIFactory.SPACE + Thread.currentThread().getName());
        }
    }

    /**
     * Returns the text to feed into the digest: the replayed content as-is
     * when {@code stripRegex} is empty, otherwise the content with every match
     * of {@code stripRegex} replaced by {@code UURIFactory.SPACE}.
     */
    private static String textToDigest(ReplayCharSequence replay, String stripRegex) {
        if (stripRegex.length() == 0) {
            return replay.toString();
        }
        Matcher matcher = TextUtils.getMatcher(stripRegex, replay);
        try {
            return matcher.replaceAll(UURIFactory.SPACE);
        } finally {
            // Hand the matcher back even when replaceAll fails.
            TextUtils.recycleMatcher(matcher);
        }
    }

    /** Closes {@code replay} if non-null, logging (not propagating) an {@link IOException}. */
    private static void closeQuietly(ReplayCharSequence replay) {
        if (replay == null) {
            return;
        }
        try {
            replay.close();
        } catch (IOException e) {
            logger.warning(TextUtils.exceptionToString("Failed close of ReplayCharSequence.", e));
        }
    }
}
