package org.archive.crawler.extractor;

import au.id.jericho.lib.html.HTMLElementName;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.logging.Logger;
import org.apache.commons.httpclient.URIException;
import org.archive.crawler.datamodel.CoreAttributeConstants;
import org.archive.crawler.datamodel.CrawlURI;
import org.archive.crawler.framework.ToeThread;
import org.archive.io.warc.WARCConstants;

/* loaded from: input_file:site-search/heritrix/heritrix-1.12.1.jar:org/archive/crawler/extractor/ExtractorPDF.class */
/**
 * Extractor that pulls links out of fetched PDF documents.
 *
 * <p>For each HTTP-fetched URI whose content type matches {@code application/pdf}
 * and whose recorded size does not exceed {@link #maxSizeToParse}, the content
 * body is copied to a per-thread scratch file, handed to {@link PDFParser}, and
 * every URI the parser returns is added to the {@link CrawlURI} as a link.
 */
public class ExtractorPDF extends Extractor implements CoreAttributeConstants {
    private static final long serialVersionUID = -6040669467531928494L;
    private static final Logger LOGGER = Logger.getLogger(ExtractorPDF.class.getName());

    /** Default upper bound on the recorded size that will be parsed: 5 * 1024 * 1024. */
    private static final int DEFAULT_MAX_SIZE_TO_PARSE = 5242880;

    /** Documents whose recorded input size exceeds this value are counted but not parsed. */
    private long maxSizeToParse;

    /** Number of PDF CrawlURIs seen by {@link #extract(CrawlURI)}, including ones skipped for size. */
    protected long numberOfCURIsHandled;

    /** Total number of URIs returned by the parser across all processed documents. */
    protected long numberOfLinksExtracted;

    /**
     * Creates the extractor with the default size limit and zeroed counters.
     *
     * @param str name passed through to the superclass constructor
     */
    public ExtractorPDF(String str) {
        super(str, "PDF extractor. Link extraction on PDF documents.");
        this.maxSizeToParse = DEFAULT_MAX_SIZE_TO_PARSE;
        this.numberOfCURIsHandled = 0L;
        this.numberOfLinksExtracted = 0L;
    }

    /**
     * Extracts links from the PDF content recorded for the given URI.
     *
     * <p>I/O and runtime failures raised while copying, parsing or registering
     * links are recorded on the URI as localized errors rather than propagated.
     * The scratch file is removed on every exit path.
     *
     * @param crawlURI the URI whose recorded content should be examined
     */
    @Override // org.archive.crawler.extractor.Extractor
    protected void extract(CrawlURI crawlURI) {
        if (!isHttpTransactionContentToProcess(crawlURI)
                || !isExpectedMimeType(crawlURI.getContentType(), "application/pdf")) {
            return;
        }
        this.numberOfCURIsHandled++;
        if (crawlURI.getHttpRecorder().getRecordedInput().getSize() > this.maxSizeToParse) {
            // Too large to parse; still counted as handled above.
            return;
        }

        // The serial number of the current ToeThread is part of the file name,
        // so each worker thread writes to its own scratch file.
        File file = new File(getController().getScratchDisk(), HTMLElementName.TT + ((ToeThread) Thread.currentThread()).getSerialNumber() + "tmp.pdf");
        try {
            crawlURI.getHttpRecorder().getRecordedInput().copyContentBodyTo(file);
            ArrayList extractURIs = new PDFParser(file.getAbsolutePath()).extractURIs();

            // Treat a null result as "no links" instead of dereferencing it
            // when logging, which previously surfaced as a spurious
            // RuntimeException error and skipped linkExtractorFinished().
            int linkCount = (extractURIs == null) ? 0 : extractURIs.size();
            if (linkCount > 0) {
                addLinks(crawlURI, extractURIs);
                this.numberOfLinksExtracted += linkCount;
            }
            LOGGER.fine(crawlURI + " has " + linkCount + " links.");
            crawlURI.linkExtractorFinished();
        } catch (IOException e) {
            crawlURI.addLocalizedError(getName(), e, "ExtractorPDF IOException");
        } catch (RuntimeException e) {
            crawlURI.addLocalizedError(getName(), e, "ExtractorPDF RuntimeException");
        } finally {
            // Single cleanup point: runs on success, on handled exceptions,
            // and when an Error propagates out of this method.
            file.delete();
        }
    }

    /**
     * Adds each extracted URI string to the CrawlURI as a link, reporting
     * individual URI failures without aborting the remaining links.
     */
    private void addLinks(CrawlURI crawlURI, ArrayList uris) {
        for (Iterator it = uris.iterator(); it.hasNext();) {
            String uri = (String) it.next();
            try {
                crawlURI.createAndAddLink(uri, Link.NAVLINK_MISC, 'L');
            } catch (URIException e) {
                if (getController() != null) {
                    getController().logUriError(e, crawlURI.getUURI(), uri);
                } else {
                    LOGGER.info(crawlURI + ", " + uri + WARCConstants.COLON_SPACE + e.getMessage());
                }
            }
        }
    }

    /**
     * Builds a short plain-text summary of this processor's counters.
     *
     * @return report text listing CrawlURIs handled and links extracted
     */
    @Override // org.archive.crawler.framework.Processor
    public String report() {
        // Local, single-threaded buffer: no need for StringBuffer's synchronization.
        StringBuilder text = new StringBuilder();
        text.append("Processor: org.archive.crawler.extractor.ExtractorPDF\n");
        text.append("  Function:          Link extraction on PDF documents\n");
        text.append("  CrawlURIs handled: " + this.numberOfCURIsHandled + "\n");
        text.append("  Links extracted:   " + this.numberOfLinksExtracted + "\n\n");
        return text.toString();
    }
}
