package org.archive.crawler.extractor;

import java.io.IOException;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.httpclient.URIException;
import org.archive.crawler.datamodel.CoreAttributeConstants;
import org.archive.crawler.datamodel.CrawlURI;
import org.archive.io.ReplayInputStream;
import org.archive.io.SeekReader;
import org.archive.io.SeekReaderCharSequence;
import org.archive.io.warc.WARCConstants;
import org.archive.util.ms.Doc;

/* loaded from: input_file:site-search/heritrix/heritrix-1.12.1.jar:org/archive/crawler/extractor/ExtractorDOC.class */
/**
 * MS-Word document extractor: scans the text of '.doc' documents for
 * {@code HYPERLINK "..."} fields and adds each quoted target as a link on the
 * {@link CrawlURI} being processed.
 *
 * <p>Two counters are kept for {@link #report()}: the number of CrawlURIs
 * handled and the number of link candidates passed to {@code addLink}.
 */
public class ExtractorDOC extends Extractor implements CoreAttributeConstants {
    private static final long serialVersionUID = 1896822554981116303L;

    /**
     * Matches the literal {@code HYPERLINK}, lazily skips to the next double
     * quote, and captures everything up to the following double quote as
     * group 1 (the link target).
     */
    private static final Pattern PATTERN = Pattern.compile("HYPERLINK.*?\"(.*?)\"");

    private static final Logger logger = Logger.getLogger("org.archive.crawler.extractor.ExtractorDOC");

    /** Number of CrawlURIs that passed the content/MIME-type checks in {@code extract}. */
    private long numberOfCURIsHandled = 0L;

    /** Number of calls to {@code addLink}, whether or not the link could be created. */
    private long numberOfLinksExtracted = 0L;

    /**
     * @param str name passed through to the {@link Extractor} constructor
     */
    public ExtractorDOC(String str) {
        super(str, "MS-Word document Extractor. Extracts links from MS-Word '.doc' documents.");
    }

    /**
     * Extracts hyperlinks from the recorded content of {@code crawlURI}.
     *
     * <p>Does nothing unless {@code isHttpTransactionContentToProcess} accepts
     * the URI and its content type is the expected {@code application/msword}.
     * If no replay stream is available the method returns without extracting.
     * Any {@link Exception} raised while reading or matching is recorded on
     * the URI via {@code addLocalizedError} instead of being propagated.
     *
     * @param crawlURI the URI whose recorded content is scanned for links
     */
    @Override // org.archive.crawler.extractor.Extractor
    protected void extract(CrawlURI crawlURI) {
        if (!isHttpTransactionContentToProcess(crawlURI)
                || !isExpectedMimeType(crawlURI.getContentType(), "application/msword")) {
            return;
        }

        this.numberOfCURIsHandled++;
        int linkCount = 0;
        ReplayInputStream replayInputStream = null;
        try {
            replayInputStream = crawlURI.getHttpRecorder().getRecordedInput().getContentReplayInputStream();
            if (replayInputStream == null) {
                // Nothing recorded to read; there is also nothing to close.
                return;
            }

            SeekReader text = Doc.getText(replayInputStream);
            // Release the stream as soon as the text reader has been obtained,
            // and clear the reference so the finally block does not close twice.
            closeQuietly(replayInputStream);
            replayInputStream = null;

            Matcher matcher = PATTERN.matcher(new SeekReaderCharSequence(text, 0));
            while (matcher.find()) {
                linkCount++;
                addLink(crawlURI, matcher.group(1));
            }
            crawlURI.linkExtractorFinished();
            logger.fine(crawlURI + " has " + linkCount + " links.");
        } catch (Exception e) {
            crawlURI.addLocalizedError(getName(), e, "ExtractorDOC Exception");
        } finally {
            // Covers the paths where Doc.getText (or anything before the early
            // close) threw; a no-op when the stream is null or already closed.
            closeQuietly(replayInputStream);
        }
    }

    /**
     * Closes {@code stream} if it is non-null, ignoring any {@link IOException}.
     *
     * @param stream the stream to close; may be {@code null}
     */
    private static void closeQuietly(ReplayInputStream stream) {
        if (stream == null) {
            return;
        }
        try {
            stream.close();
        } catch (IOException ignored) {
            // Best-effort close: a failure here must not abort link extraction.
        }
    }

    /**
     * Adds {@code str} as a link on {@code crawlURI}.
     *
     * <p>If the link cannot be created because of a {@link URIException}, the
     * error is logged through the controller when one is available, otherwise
     * through this class's logger. The extracted-links counter is incremented
     * in either case.
     *
     * @param crawlURI the URI the link was found in
     * @param str      the raw link text captured from the document
     */
    private void addLink(CrawlURI crawlURI, String str) {
        try {
            crawlURI.createAndAddLink(str, Link.NAVLINK_MISC, 'L');
        } catch (URIException e) {
            // Check for a controller before using it; without one, fall back
            // to the local logger rather than failing with a null dereference.
            if (getController() != null) {
                getController().logUriError(e, crawlURI.getUURI(), str);
            } else {
                logger.info(crawlURI + ", " + str + WARCConstants.COLON_SPACE + e.getMessage());
            }
        }
        this.numberOfLinksExtracted++;
    }

    /**
     * Builds a short plain-text report with the processor name, its function,
     * and the two counters maintained by this extractor.
     *
     * @return the report text
     */
    @Override // org.archive.crawler.framework.Processor
    public String report() {
        StringBuilder out = new StringBuilder();
        out.append("Processor: org.archive.crawler.extractor.ExtractorDOC\n");
        out.append("  Function:          Link extraction on MS Word documents (.doc)\n");
        out.append("  CrawlURIs handled: ").append(this.numberOfCURIsHandled).append("\n");
        out.append("  Links extracted:   ").append(this.numberOfLinksExtracted).append("\n\n");
        return out.toString();
    }
}
