package org.archive.crawler.writer;

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.Collection;
import java.util.HashMap;
import java.util.Iterator;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.apache.commons.httpclient.Header;
import org.apache.commons.httpclient.HttpMethodBase;
import org.archive.crawler.datamodel.CoreAttributeConstants;
import org.archive.crawler.datamodel.CrawlURI;
import org.archive.crawler.datamodel.FetchStatusCodes;
import org.archive.crawler.deciderules.recrawl.IdenticalDigestDecideRule;
import org.archive.crawler.event.CrawlStatusListener;
import org.archive.crawler.extractor.Link;
import org.archive.crawler.framework.WriterPoolProcessor;
import org.archive.crawler.settings.SimpleType;
import org.archive.crawler.settings.Type;
import org.archive.io.ReplayInputStream;
import org.archive.io.WriterPoolMember;
import org.archive.io.WriterPoolSettings;
import org.archive.io.warc.ExperimentalWARCWriter;
import org.archive.io.warc.WARCConstants;
import org.archive.io.warc.WARCWriterPool;
import org.archive.uid.GeneratorFactory;
import org.archive.util.ArchiveUtils;
import org.archive.util.anvl.ANVLRecord;

/* loaded from: input_file:site-search/heritrix/heritrix-1.12.1.jar:org/archive/crawler/writer/ExperimentalWARCWriterProcessor.class */
/**
 * Processor that writes crawled URIs out as WARC records through a pooled
 * {@link ExperimentalWARCWriter}.
 *
 * <p>For URIs whose scheme starts with {@code http} it writes a response (or
 * revisit) record and, depending on settings, accompanying request and
 * metadata records. For {@code dns} URIs it writes a single response record.
 * Other schemes are logged and skipped.
 */
public class ExperimentalWARCWriterProcessor extends WriterPoolProcessor implements CoreAttributeConstants, CrawlStatusListener, WriterPoolSettings, FetchStatusCodes, WARCConstants {
    private static final long serialVersionUID = 6182850087635847443L;

    // NOTE(review): java.util.logging.Logger is not Serializable; if instances
    // of this class are ever Java-serialized this field will need attention.
    private final Logger logger;

    /** Setting name: whether 'request' records are written. */
    public static final String ATTR_WRITE_REQUESTS = "write-requests";
    /** Setting name: whether 'metadata' records are written. */
    public static final String ATTR_WRITE_METADATA = "write-metadata";
    /** Setting name: whether 'revisit' records are written for identical digests. */
    public static final String ATTR_WRITE_REVISIT_FOR_IDENTICAL_DIGESTS = "write-revisit-for-identical-digests";
    /** Setting name: whether 'revisit' records are written for 304 responses. */
    public static final String ATTR_WRITE_REVISIT_FOR_NOT_MODIFIED = "write-revisit-for-not-modified";

    private static final String[] DEFAULT_PATH = {"warcs"};

    /**
     * @return the default output path element(s) for this writer ({@code "warcs"}).
     */
    @Override // org.archive.crawler.framework.WriterPoolProcessor
    protected String[] getDefaultPath() {
        return DEFAULT_PATH;
    }

    /**
     * Creates the processor and registers its four boolean expert settings,
     * each overrideable and defaulting to {@code true}.
     *
     * @param str name of this processor
     */
    public ExperimentalWARCWriterProcessor(String str) {
        super(str, "Experimental WARCWriter processor (Version 0.12)");
        this.logger = Logger.getLogger(getClass().getName());
        addExpertBooleanSetting(ATTR_WRITE_REQUESTS, "Whether to write 'request' type records. Default is true.");
        addExpertBooleanSetting(ATTR_WRITE_METADATA, "Whether to write 'metadata' type records. Default is true.");
        addExpertBooleanSetting(ATTR_WRITE_REVISIT_FOR_IDENTICAL_DIGESTS, "Whether to write 'revisit' type records when a URI's history indicates the previous fetch had an identical content digest. Default is true.");
        addExpertBooleanSetting(ATTR_WRITE_REVISIT_FOR_NOT_MODIFIED, "Whether to write 'revisit' type records when a 304-Not Modified response is received. Default is true.");
    }

    /** Registers one overrideable, expert-level boolean setting that defaults to true. */
    private void addExpertBooleanSetting(String name, String description) {
        // Boolean.TRUE instead of the deprecated, allocating Boolean constructor.
        Type setting = addElementToDefinition(new SimpleType(name, description, Boolean.TRUE));
        setting.setOverrideable(true);
        setting.setExpertSetting(true);
    }

    /** Reads a boolean setting for the given URI via the unchecked attribute accessor. */
    private boolean readBooleanAttribute(CrawlURI crawlURI, String attributeName) {
        return ((Boolean) getUncheckedAttribute(crawlURI, attributeName)).booleanValue();
    }

    /**
     * Installs a {@link WARCWriterPool} as this processor's writer pool.
     *
     * @param atomicInteger serial-number counter handed to the pool
     */
    @Override // org.archive.crawler.framework.WriterPoolProcessor
    protected void setupPool(AtomicInteger atomicInteger) {
        setPool(new WARCWriterPool(atomicInteger, this, getPoolMaximumActive(), getPoolMaximumWait()));
    }

    /**
     * Writes the given URI if it has a positive fetch status and non-empty
     * content and {@code shouldWrite} accepts it. I/O failures are recorded on
     * the URI and logged at SEVERE rather than propagated.
     *
     * @param crawlURI the URI being processed
     */
    @Override // org.archive.crawler.framework.WriterPoolProcessor, org.archive.crawler.framework.Processor
    protected void innerProcess(CrawlURI crawlURI) {
        if (crawlURI.getFetchStatus() <= 0 || crawlURI.getContentSize() <= 0) {
            // Nothing was successfully fetched, so there is nothing to write.
            return;
        }
        String scheme = crawlURI.getUURI().getScheme().toLowerCase();
        try {
            if (shouldWrite(crawlURI)) {
                write(scheme, crawlURI);
            } else {
                this.logger.info("This writer does not write out scheme " + scheme + " content");
            }
        } catch (IOException e) {
            crawlURI.addLocalizedError(getName(), e, "WriteRecord: " + crawlURI.toString());
            this.logger.log(Level.SEVERE, "Failed write of Record: " + crawlURI.toString(), e);
        }
    }

    /**
     * Borrows a writer from the pool and writes the record(s) for one URI.
     *
     * <p>On success the writer is returned to the pool and the running byte
     * total is updated. On {@link IOException} the writer is invalidated and
     * is NOT returned to the pool.
     *
     * @param str lower-cased URI scheme; {@code http*} and {@code dns} are handled
     * @param crawlURI the URI whose recorded content is written
     * @throws IOException if borrowing the writer or writing any record fails
     */
    protected void write(String str, CrawlURI crawlURI) throws IOException {
        WriterPoolMember borrowFile = getPool().borrowFile();
        long position = borrowFile.getPosition();
        // checkSize() may move the position (the code below accounts for that);
        // any bytes it wrote are added to the total before record writing starts.
        // NOTE(review): checkSize() runs outside the try below, so a failure here
        // leaves the borrowed file neither returned nor invalidated.
        borrowFile.checkSize();
        if (borrowFile.getPosition() != position) {
            setTotalBytesWritten(getTotalBytesWritten() + (borrowFile.getPosition() - position));
            position = borrowFile.getPosition();
        }
        ExperimentalWARCWriter warcWriter = (ExperimentalWARCWriter) borrowFile;
        try {
            URI recordID = getRecordID();
            String timestamp = ArchiveUtils.getLog14Date(crawlURI.getLong(CoreAttributeConstants.A_FETCH_BEGAN_TIME));
            if (str.startsWith("http")) {
                writeHttpRecords(warcWriter, timestamp, recordID, crawlURI);
            } else if (str.equals("dns")) {
                writeDnsRecord(warcWriter, timestamp, recordID, crawlURI);
            } else {
                this.logger.warning("No handler for scheme " + str);
            }
            checkBytesWritten();
        } catch (IOException e) {
            getPool().invalidateFile(borrowFile);
            // Clear the reference so the finally block does not ALSO return the
            // invalidated file to the pool; doing both would hand the same
            // borrowed object back twice (presumably skewing pool accounting).
            borrowFile = null;
            throw e;
        } finally {
            if (borrowFile != null) {
                setTotalBytesWritten(getTotalBytesWritten() + (borrowFile.getPosition() - position));
                getPool().returnFile(borrowFile);
            }
        }
    }

    /** Writes the response-or-revisit record plus optional request and metadata records for an HTTP URI. */
    private void writeHttpRecords(ExperimentalWARCWriter warcWriter, String timestamp, URI recordID, CrawlURI crawlURI) throws IOException {
        ANVLRecord headers = new ANVLRecord(5);
        if (crawlURI.getContentDigest() != null) {
            headers.addLabelValue(WARCConstants.HEADER_KEY_CHECKSUM, crawlURI.getContentDigestSchemeString());
        }
        headers.addLabelValue(WARCConstants.HEADER_KEY_IP, getHostAddress(crawlURI));

        URI primaryRecordID;
        if (IdenticalDigestDecideRule.hasIdenticalDigest(crawlURI)
                && readBooleanAttribute(crawlURI, ATTR_WRITE_REVISIT_FOR_IDENTICAL_DIGESTS)) {
            primaryRecordID = writeRevisitDigest(warcWriter, timestamp, WARCConstants.HTTP_RESPONSE_MIMETYPE, recordID, crawlURI, headers);
        } else if (crawlURI.getFetchStatus() == 304
                && readBooleanAttribute(crawlURI, ATTR_WRITE_REVISIT_FOR_NOT_MODIFIED)) {
            primaryRecordID = writeRevisitNotModified(warcWriter, timestamp, recordID, crawlURI, headers);
        } else {
            if (crawlURI.isTruncatedFetch()) {
                headers.addLabelValue(WARCConstants.HEADER_KEY_TRUNCATED, truncationReason(crawlURI));
            }
            primaryRecordID = writeResponse(warcWriter, timestamp, WARCConstants.HTTP_RESPONSE_MIMETYPE, recordID, crawlURI, headers);
        }

        // Request and metadata records point back at the primary record.
        ANVLRecord relatedHeaders = new ANVLRecord(1);
        relatedHeaders.addLabelValue(WARCConstants.HEADER_KEY_CONCURRENT_TO, '<' + primaryRecordID.toString() + '>');
        if (readBooleanAttribute(crawlURI, ATTR_WRITE_REQUESTS)) {
            writeRequest(warcWriter, timestamp, WARCConstants.HTTP_REQUEST_MIMETYPE, recordID, crawlURI, relatedHeaders);
        }
        if (readBooleanAttribute(crawlURI, ATTR_WRITE_METADATA)) {
            writeMetadata(warcWriter, timestamp, recordID, crawlURI, relatedHeaders);
        }
    }

    /** Writes the single response record for a DNS URI, adding the DNS server IP header when known. */
    private void writeDnsRecord(ExperimentalWARCWriter warcWriter, String timestamp, URI recordID, CrawlURI crawlURI) throws IOException {
        ANVLRecord headers = null;
        String dnsServerIp = crawlURI.getString(CoreAttributeConstants.A_DNS_SERVER_IP_LABEL);
        if (dnsServerIp != null && dnsServerIp.length() > 0) {
            headers = new ANVLRecord(1);
            headers.addLabelValue(WARCConstants.HEADER_KEY_IP, dnsServerIp);
        }
        writeResponse(warcWriter, timestamp, crawlURI.getContentType(), recordID, crawlURI, headers);
    }

    /** Maps the URI's truncation flags to the truncated-header value; time wins over length, length over header. */
    private static String truncationReason(CrawlURI crawlURI) {
        if (crawlURI.isTimeTruncatedFetch()) {
            return WARCConstants.NAMED_FIELD_TRUNCATED_VALUE_TIME;
        }
        if (crawlURI.isLengthTruncatedFetch()) {
            return "length";
        }
        if (crawlURI.isHeaderTruncatedFetch()) {
            return WARCConstants.NAMED_FIELD_TRUNCATED_VALUE_HEAD;
        }
        return WARCConstants.TRUNCATED_VALUE_UNSPECIFIED;
    }

    /**
     * Writes a 'request' record from the recorded output of the fetch.
     *
     * @param experimentalWARCWriter writer to emit the record to
     * @param str record timestamp
     * @param str2 record mimetype
     * @param uri base record id; it is qualified with {@code type=request}
     * @param crawlURI URI whose recorded request is written
     * @param aNVLRecord extra named header fields for the record
     * @return the qualified record id that was written
     * @throws IOException if writing or closing the replay stream fails
     */
    protected URI writeRequest(ExperimentalWARCWriter experimentalWARCWriter, String str, String str2, URI uri, CrawlURI crawlURI, ANVLRecord aNVLRecord) throws IOException {
        URI qualifiedID = qualifyRecordID(uri, "type", WARCConstants.REQUEST);
        ReplayInputStream replay = crawlURI.getHttpRecorder().getRecordedOutput().getReplayInputStream();
        try {
            experimentalWARCWriter.writeRequestRecord(crawlURI.toString(), str, str2, qualifiedID, aNVLRecord, replay, crawlURI.getHttpRecorder().getRecordedOutput().getSize());
        } finally {
            if (replay != null) {
                replay.close();
            }
        }
        return qualifiedID;
    }

    /**
     * Writes a 'response' record from the full recorded input of the fetch.
     *
     * @param experimentalWARCWriter writer to emit the record to
     * @param str record timestamp
     * @param str2 record mimetype
     * @param uri record id, used as-is
     * @param crawlURI URI whose recorded response is written
     * @param aNVLRecord extra named header fields; callers in this class may pass {@code null}
     * @return {@code uri}
     * @throws IOException if writing or closing the replay stream fails
     */
    protected URI writeResponse(ExperimentalWARCWriter experimentalWARCWriter, String str, String str2, URI uri, CrawlURI crawlURI, ANVLRecord aNVLRecord) throws IOException {
        ReplayInputStream replay = crawlURI.getHttpRecorder().getRecordedInput().getReplayInputStream();
        try {
            experimentalWARCWriter.writeResponseRecord(crawlURI.toString(), str, str2, uri, aNVLRecord, replay, crawlURI.getHttpRecorder().getRecordedInput().getSize());
        } finally {
            if (replay != null) {
                replay.close();
            }
        }
        return uri;
    }

    /**
     * Writes a 'revisit' record using the identical-digest profile. The record
     * is marked as length-truncated and its length is the recorded content-begin
     * offset when that is positive, otherwise the full recorded size.
     *
     * @param experimentalWARCWriter writer to emit the record to
     * @param str record timestamp
     * @param str2 record mimetype
     * @param uri record id, used as-is
     * @param crawlURI URI whose recorded response is written
     * @param aNVLRecord named header fields; profile and truncated labels are added to it
     * @return {@code uri}
     * @throws IOException if writing or closing the replay stream fails
     */
    protected URI writeRevisitDigest(ExperimentalWARCWriter experimentalWARCWriter, String str, String str2, URI uri, CrawlURI crawlURI, ANVLRecord aNVLRecord) throws IOException {
        long contentBegin = crawlURI.getHttpRecorder().getRecordedInput().getContentBegin();
        // presumably contentBegin is where the body starts, so only the headers are written — confirm
        long length = contentBegin > 0 ? contentBegin : crawlURI.getHttpRecorder().getRecordedInput().getSize();
        aNVLRecord.addLabelValue(WARCConstants.HEADER_KEY_PROFILE, WARCConstants.PROFILE_REVISIT_IDENTICAL_DIGEST);
        aNVLRecord.addLabelValue(WARCConstants.HEADER_KEY_TRUNCATED, "length");
        ReplayInputStream replay = crawlURI.getHttpRecorder().getRecordedInput().getReplayInputStream();
        try {
            experimentalWARCWriter.writeRevisitRecord(crawlURI.toString(), str, str2, uri, aNVLRecord, replay, length);
        } finally {
            if (replay != null) {
                replay.close();
            }
        }
        return uri;
    }

    /**
     * Writes a 'revisit' record using the not-modified profile. When the HTTP
     * transaction is available, its ETag and Last-Modified response headers are
     * copied into the record's named fields. The record is written with a
     * {@code null} mimetype and zero length.
     *
     * @param experimentalWARCWriter writer to emit the record to
     * @param str record timestamp
     * @param uri record id, used as-is
     * @param crawlURI URI that received the not-modified response
     * @param aNVLRecord named header fields; profile, header and truncated labels are added to it
     * @return {@code uri}
     * @throws IOException if writing or closing the replay stream fails
     */
    protected URI writeRevisitNotModified(ExperimentalWARCWriter experimentalWARCWriter, String str, URI uri, CrawlURI crawlURI, ANVLRecord aNVLRecord) throws IOException {
        aNVLRecord.addLabelValue(WARCConstants.HEADER_KEY_PROFILE, WARCConstants.PROFILE_REVISIT_NOT_MODIFIED);
        if (crawlURI.containsKey(CoreAttributeConstants.A_HTTP_TRANSACTION)) {
            HttpMethodBase method = (HttpMethodBase) crawlURI.getObject(CoreAttributeConstants.A_HTTP_TRANSACTION);
            saveHeader(CoreAttributeConstants.A_ETAG_HEADER, method, aNVLRecord, WARCConstants.HEADER_KEY_ETAG);
            saveHeader(CoreAttributeConstants.A_LAST_MODIFIED_HEADER, method, aNVLRecord, WARCConstants.HEADER_KEY_LAST_MODIFIED);
        }
        aNVLRecord.addLabelValue(WARCConstants.HEADER_KEY_TRUNCATED, "length");
        ReplayInputStream replay = crawlURI.getHttpRecorder().getRecordedInput().getReplayInputStream();
        try {
            experimentalWARCWriter.writeRevisitRecord(crawlURI.toString(), str, null, uri, aNVLRecord, replay, 0L);
        } finally {
            if (replay != null) {
                replay.close();
            }
        }
        return uri;
    }

    /**
     * Copies one HTTP response header into the given ANVL record, if present.
     *
     * @param str name of the response header to look up
     * @param httpMethodBase HTTP transaction to read the header from
     * @param aNVLRecord record that receives the value
     * @param str2 label under which the value is stored
     */
    protected void saveHeader(String str, HttpMethodBase httpMethodBase, ANVLRecord aNVLRecord, String str2) {
        Header responseHeader = httpMethodBase.getResponseHeader(str);
        if (responseHeader != null) {
            aNVLRecord.addLabelValue(str2, responseHeader.getValue());
        }
    }

    /**
     * Writes a 'metadata' record describing how the URI was reached (seed flag,
     * or force-fetch / via / pathFromSeed / sourceTag) and listing its outlinks.
     *
     * @param experimentalWARCWriter writer to emit the record to
     * @param str record timestamp
     * @param uri base record id; it is qualified with {@code type=metadata}
     * @param crawlURI URI being described
     * @param aNVLRecord extra named header fields for the record
     * @return the qualified record id that was written
     * @throws IOException if qualifying the id or writing the record fails
     */
    protected URI writeMetadata(ExperimentalWARCWriter experimentalWARCWriter, String str, URI uri, CrawlURI crawlURI, ANVLRecord aNVLRecord) throws IOException {
        URI qualifiedID = qualifyRecordID(uri, "type", WARCConstants.METADATA);
        ANVLRecord body = new ANVLRecord();
        if (crawlURI.isSeed()) {
            body.addLabel("seed");
        } else {
            if (crawlURI.forceFetch()) {
                body.addLabel("force-fetch");
            }
            body.addLabelValue("via", crawlURI.flattenVia());
            body.addLabelValue("pathFromSeed", crawlURI.getPathFromSeed());
            if (crawlURI.containsKey(CoreAttributeConstants.A_SOURCE_TAG)) {
                body.addLabelValue("sourceTag", crawlURI.getString(CoreAttributeConstants.A_SOURCE_TAG));
            }
        }
        Collection<Link> outLinks = crawlURI.getOutLinks();
        if (outLinks != null) {
            for (Link outLink : outLinks) {
                body.addLabelValue("outlink", outLink.toString());
            }
        }
        // Serialize once so the stream and the declared record length agree.
        byte[] bodyBytes = body.getUTF8Bytes();
        experimentalWARCWriter.writeMetadataRecord(crawlURI.toString(), str, ANVLRecord.MIMETYPE, qualifiedID, aNVLRecord, new ByteArrayInputStream(bodyBytes), bodyBytes.length);
        return qualifiedID;
    }

    /**
     * Obtains a fresh record id from the generator factory.
     *
     * @return a new record id
     * @throws IOException wrapping any {@link URISyntaxException} from the generator
     */
    protected URI getRecordID() throws IOException {
        try {
            return GeneratorFactory.getFactory().getRecordID();
        } catch (URISyntaxException e) {
            // Keep the original exception as the cause so the stack trace survives.
            throw new IOException(e.toString(), e);
        }
    }

    /**
     * Qualifies a record id with a single key/value pair.
     *
     * @param uri record id to qualify
     * @param str qualifier key
     * @param str2 qualifier value
     * @return the qualified record id
     * @throws IOException wrapping any {@link URISyntaxException} from the generator
     */
    protected URI qualifyRecordID(URI uri, String str, String str2) throws IOException {
        HashMap<String, String> qualifiers = new HashMap<String, String>(1);
        qualifiers.put(str, str2);
        try {
            return GeneratorFactory.getFactory().qualifyRecordID(uri, qualifiers);
        } catch (URISyntaxException e) {
            // Keep the original exception as the cause so the stack trace survives.
            throw new IOException(e.toString(), e);
        }
    }

    /**
     * @return resource path of the stylesheet used for the first record's body.
     */
    @Override // org.archive.crawler.framework.WriterPoolProcessor
    protected String getFirstrecordStylesheet() {
        return "/warcinfobody.xsl";
    }
}
