package org.archive.crawler.writer;

import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;
import java.io.StringBufferInputStream;
import java.util.logging.Logger;
import javax.management.AttributeNotFoundException;
import javax.management.MBeanException;
import javax.management.ReflectionException;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.httpclient.methods.multipart.StringPart;
import org.archive.crawler.datamodel.CoreAttributeConstants;
import org.archive.crawler.datamodel.CrawlHost;
import org.archive.crawler.datamodel.CrawlURI;
import org.archive.crawler.framework.Processor;
import org.archive.crawler.settings.SimpleType;
import org.archive.io.RecordingInputStream;
import org.archive.io.ReplayCharSequence;
import org.archive.io.ReplayInputStream;
import org.archive.util.ArchiveUtils;
import org.archive.util.MimetypeUtils;
import org.jets3t.service.utils.Mimetypes;
import org.pdfbox.pdmodel.PDDocument;
import org.pdfbox.util.PDFTextStripper;
import org.textmining.text.extraction.WordExtractor;

/* loaded from: input_file:site-search/heritrix/heritrix-1.12.1.jar:org/archive/crawler/writer/TrecWebWriterProcessor.class */
/**
 * Processor that appends every successfully fetched HTTP(S) document to a
 * "trecweb" formatted corpus file (for indexing by indri).
 *
 * <p>Each document is written as a {@code <DOC>} element that holds the URI
 * as {@code <DOCNO>}, a {@code <DOCHDR>} section (one metadata line followed
 * by the recorded response headers) and the document text. PDF and MS Word
 * responses are converted to plain text first; HTML, XML and plain text are
 * written as recorded. Other content types are skipped.
 *
 * <p>Output goes to {@code <basename>-<n>.dat} files inside the configured
 * directory; a new file is started once the current one exceeds
 * {@link #MAX_LENGTH} bytes. All access to the current corpus file is
 * serialised on a single static lock.
 */
public class TrecWebWriterProcessor extends Processor implements CoreAttributeConstants {
    /** Name of the setting that holds the output directory. */
    public static final String ATTR_CORPUS_PATH = "path";
    /** Name of the setting that holds the base name of the corpus files. */
    public static final String ATTR_BASENAME = "basename";

    private static final String DEFAULT_CORPUS_PATH = "corpus";
    private static final String DEFAULT_BASENAME = "crawl";
    /** Corpus file size in bytes (100 MiB) above which a new file is started. */
    private static final int MAX_LENGTH = 104857600;
    /** Largest record in bytes (30 MiB) that is written; larger ones are skipped. */
    private static final long MAX_RECORD_LENGTH = 31457280L;
    private static final String HEADER_FIELD_SEPARATOR = " ";
    private static final String LINE_SEPARATOR = "\n";
    /** Tag passed along with every error recorded on a CrawlURI. */
    private static final String ERROR_TAG = "TrecWeb";
    private static final Logger logger = Logger.getLogger(TrecWebWriterProcessor.class.getName());
    /** Guards corpusFile, corpusStream and fileCount once crawling has started. */
    private static final Object streamLock = new Object();

    private String corpusPath;
    private String basename;
    private File corpusDir;
    private volatile File corpusFile;
    /** Current output; {@code null} when no corpus file is open. */
    private volatile PrintWriter corpusStream;
    private int fileCount;

    /**
     * Creates the processor and registers its two settings
     * ({@link #ATTR_CORPUS_PATH} and {@link #ATTR_BASENAME}).
     *
     * @param str name of this processor
     */
    public TrecWebWriterProcessor(String str) {
        super(str, "TrecWebWriter processor. A writer that writes each URL to a trecweb format file for indexing by indri.");
        addElementToDefinition(new SimpleType(ATTR_CORPUS_PATH, "Path to output directory.", DEFAULT_CORPUS_PATH));
        addElementToDefinition(new SimpleType(ATTR_BASENAME, "basename to use for corpus files.", DEFAULT_BASENAME));
    }

    /**
     * Reads a setting without forcing the caller to handle the checked
     * exceptions of {@code getAttribute}.
     *
     * @param str name of the attribute
     * @return the attribute value, or {@code null} when the lookup failed
     *         (the failure is logged as a warning)
     */
    public Object getAttributeUnchecked(String str) {
        Object value = null;
        try {
            value = super.getAttribute(str);
        } catch (MBeanException e) {
            logger.warning("Could not read attribute '" + str + "': " + e);
        } catch (ReflectionException e) {
            logger.warning("Could not read attribute '" + str + "': " + e);
        } catch (AttributeNotFoundException e) {
            logger.warning("Could not read attribute '" + str + "': " + e);
        }
        return value;
    }

    /** Returns the named setting as a String, or the fallback when it is unavailable. */
    private String stringSetting(String name, String fallback) {
        Object value = getAttributeUnchecked(name);
        return value == null ? fallback : (String) value;
    }

    /** Builds the next corpus file name and advances the file counter. */
    private String makeFilename() {
        int index = this.fileCount;
        this.fileCount = index + 1;
        return this.basename + "-" + index + ".dat";
    }

    /** Opens the next corpus file in corpusDir and makes it the current output. */
    private void openNextCorpusFile() throws IOException {
        File next = new File(this.corpusDir, makeFilename());
        // NOTE: FileWriter encodes with the platform default charset.
        this.corpusStream = new PrintWriter(new FileWriter(next));
        this.corpusFile = next;
    }

    /**
     * Reads the settings, creates the output directory if necessary and
     * opens the first corpus file. A relative path is resolved against the
     * controller's disk directory.
     *
     * <p>If the directory or file cannot be created the failure is logged
     * and no corpus file is open afterwards; {@link #innerProcess} then
     * records an error on every URI instead of writing it.
     *
     * <p>(The decompiler notes that this method's access was widened from
     * protected to public.)
     */
    @Override // org.archive.crawler.framework.Processor
    public synchronized void initialTasks() {
        this.corpusPath = stringSetting(ATTR_CORPUS_PATH, DEFAULT_CORPUS_PATH);
        this.basename = stringSetting(ATTR_BASENAME, DEFAULT_BASENAME);
        this.fileCount = 1;
        try {
            File dir = new File(this.corpusPath);
            if (!dir.isAbsolute()) {
                dir = new File(getController().getDisk(), this.corpusPath);
            }
            this.corpusDir = dir;
            if (!dir.exists() && !dir.mkdirs()) {
                throw new IOException("Can not mkdir " + dir.getAbsolutePath());
            }
            synchronized (streamLock) {
                openNextCorpusFile();
            }
        } catch (IOException e) {
            // Cannot propagate a checked exception from here; make the failure visible.
            logger.severe("Could not open trecweb corpus file under '" + this.corpusPath + "': " + e);
        }
    }

    /**
     * Closes the current corpus file, if one is open.
     *
     * <p>(The decompiler notes that this method's access was widened from
     * protected to public.)
     */
    @Override // org.archive.crawler.framework.Processor
    public void finalTasks() {
        synchronized (streamLock) {
            if (this.corpusStream != null) {
                this.corpusStream.close();
                this.corpusStream = null;
            }
        }
    }

    /** @return always the empty string; this processor produces no report */
    @Override // org.archive.crawler.framework.Processor
    public String report() {
        return "";
    }

    /**
     * Maps a MIME type to the document class used to pick a conversion.
     *
     * @param str MIME type without parameters; compared case-insensitively
     * @return one of "html", "txt", "xml", "pdf", "word", or {@code null}
     *         when the type is {@code null} or not supported
     */
    private String mime2FileClass(String str) {
        if (str == null) {
            return null;
        }
        if (str.equalsIgnoreCase("text/html")) {
            return "html";
        }
        if (str.equalsIgnoreCase("text/plain")) {
            return "txt";
        }
        if (str.equalsIgnoreCase("application/pdf") || str.equalsIgnoreCase("application/x-pdf")) {
            return "pdf";
        }
        if (str.equalsIgnoreCase("text/xml")
                || str.equalsIgnoreCase("application/xml")
                || str.equalsIgnoreCase("application/xhtml+xml")) {
            return "xml";
        }
        if (str.equalsIgnoreCase("application/msword")) {
            return "word";
        }
        return null;
    }

    /**
     * Extracts the text of an MS Word document.
     *
     * @param replayInputStream stream over the raw document bytes
     * @return the extracted text, or {@code null} when extraction failed
     */
    private String extractWord(ReplayInputStream replayInputStream) {
        try {
            return new WordExtractor().extractText(replayInputStream);
        } catch (Exception e) {
            logger.warning("Could not extract MS Word text: " + e);
            return null;
        }
    }

    /**
     * Extracts the text of all pages of a PDF document.
     *
     * <p>The document is parsed from the raw bytes; going through a decoded
     * character sequence would corrupt binary content whenever the decoding
     * is not a 1:1 byte-to-char mapping.
     *
     * @param content stream over the raw PDF bytes
     * @return the extracted text, or {@code null} when the document could
     *         not be parsed or extraction ran out of memory
     */
    private String extractPDF(ReplayInputStream content) {
        PDDocument document = null;
        try {
            document = PDDocument.load(content);
            PDFTextStripper stripper = new PDFTextStripper();
            stripper.setSortByPosition(false);
            stripper.setStartPage(1);
            stripper.setEndPage(Integer.MAX_VALUE);
            return stripper.getText(document);
        } catch (Exception e) {
            logger.warning("Could not extract PDF text: " + e);
            return null;
        } catch (OutOfMemoryError e) {
            // Deliberately survived: one oversized PDF must not stop the crawl.
            logger.warning("Out of memory while extracting PDF text; document skipped");
            return null;
        } finally {
            if (document != null) {
                try {
                    document.close();
                } catch (IOException e) {
                    logger.warning("Could not close PDF document: " + e);
                }
            }
        }
    }

    /**
     * Looks up the IP address the URI's host resolved to.
     *
     * @param crawlURI the URI being processed
     * @return the textual IP address of the host
     * @throws NullPointerException when no host entry or no IP address is
     *         known for the URI
     */
    private String getHostAddress(CrawlURI crawlURI) {
        CrawlHost host = getController().getServerCache().getHostFor(crawlURI);
        if (host == null) {
            throw new NullPointerException("Crawlhost is null for " + crawlURI + " " + ((Object) crawlURI.getVia()));
        }
        if (host.getIP() == null) {
            long fetched = host.getIpFetched();
            // -2 is treated as the "never looked up" marker.
            String age = fetched == -2 ? "was never looked up." : (System.currentTimeMillis() - fetched) + " ms ago.";
            throw new NullPointerException("Address is null for " + crawlURI + " " + ((Object) crawlURI.getVia()) + ". Address " + age);
        }
        return host.getIP().getHostAddress();
    }

    /** Joins the five metadata fields with single spaces and terminates the line. */
    private String makeMetaline(String str, String str2, String str3, String str4, String str5) {
        return str + HEADER_FIELD_SEPARATOR + str2 + HEADER_FIELD_SEPARATOR + str3
                + HEADER_FIELD_SEPARATOR + str4 + HEADER_FIELD_SEPARATOR + str5 + LINE_SEPARATOR;
    }

    /**
     * Reads the first {@code i} bytes of the recording (the response
     * headers) and returns them as text.
     *
     * @param replayInputStream stream positioned at the start of the recording
     * @param i number of header bytes; values {@code <= 0} yield ""
     * @return the bytes actually read, decoded as ISO-8859-1
     * @throws IOException when reading fails
     */
    private String getHeaders(ReplayInputStream replayInputStream, int i) throws IOException {
        if (i <= 0) {
            return "";
        }
        byte[] buffer = new byte[i];
        int filled = 0;
        // A single read() may return fewer bytes than requested; loop until done or EOF.
        while (filled < i) {
            int n = replayInputStream.read(buffer, filled, i - filled);
            if (n < 0) {
                break;
            }
            filled += n;
        }
        // ISO-8859-1 maps every byte to one char, independent of the platform default.
        return new String(buffer, 0, filled, "ISO-8859-1");
    }

    /**
     * Builds the opening part of a document entry.
     *
     * @param str the recorded response headers
     * @param str2 the document number (the URI)
     * @param str3 the metadata line
     * @return {@code <DOC>}, {@code <DOCNO>} and the complete {@code <DOCHDR>} section
     */
    private String makeDocHeader(String str, String str2, String str3) {
        return new StringBuilder("<DOC>\n<DOCNO>")
                .append(str2)
                .append("</DOCNO>\n<DOCHDR>\n")
                .append(str3)
                .append(str)
                .append("</DOCHDR>\n")
                .toString();
    }

    /**
     * Appends one document entry to the current corpus file, starting a new
     * file first when the current one has grown beyond {@link #MAX_LENGTH}.
     *
     * @throws IOException when no corpus file is open, the next file cannot
     *         be opened, or the write failed
     */
    private void writeDocument(String docHeader, String text) throws IOException {
        synchronized (streamLock) {
            if (this.corpusStream == null) {
                throw new IOException("No corpus file is open; see earlier log messages");
            }
            if (this.corpusFile.length() > MAX_LENGTH) {
                this.corpusStream.close();
                // Cleared so a failed reopen is reported instead of writing to a closed stream.
                this.corpusStream = null;
                openNextCorpusFile();
            }
            PrintWriter out = this.corpusStream;
            out.print(docHeader);
            out.print(text);
            out.print("\n</DOC>\n");
            // PrintWriter never throws; checkError() flushes and reports any failure.
            if (out.checkError()) {
                throw new IOException("Failed writing to " + this.corpusFile.getAbsolutePath());
            }
        }
    }

    /** Closes whatever was opened for one URI; close failures are recorded on the URI. */
    private void closeAll(CrawlURI crawlURI, ReplayInputStream headerStream, ReplayCharSequence body,
            ReplayInputStream contentStream, RecordingInputStream recordedInput) {
        if (headerStream != null) {
            try {
                headerStream.close();
            } catch (IOException e) {
                crawlURI.addLocalizedError(getName(), e, ERROR_TAG);
            }
        }
        if (body != null) {
            try {
                body.close();
            } catch (IOException e) {
                crawlURI.addLocalizedError(getName(), e, ERROR_TAG);
            }
        }
        if (contentStream != null) {
            try {
                contentStream.close();
            } catch (IOException e) {
                crawlURI.addLocalizedError(getName(), e, ERROR_TAG);
            }
        }
        try {
            recordedInput.close();
        } catch (IOException e) {
            crawlURI.addLocalizedError(getName(), e, ERROR_TAG);
        }
    }

    /**
     * Writes the fetched document of {@code crawlURI} to the corpus.
     *
     * <p>Only successful http/https fetches with a 2xx status, a non-empty
     * body, a supported content type and a record size of at most
     * {@link #MAX_RECORD_LENGTH} bytes are written. Any failure while
     * processing is recorded on the URI as a localized error.
     *
     * @param crawlURI the URI that was just fetched
     */
    @Override // org.archive.crawler.framework.Processor
    protected void innerProcess(CrawlURI crawlURI) {
        if (!crawlURI.isSuccess()) {
            return;
        }
        String scheme = crawlURI.getUURI().getScheme();
        if (!"http".equalsIgnoreCase(scheme) && !"https".equalsIgnoreCase(scheme)) {
            return;
        }
        int fetchStatus = crawlURI.getFetchStatus();
        if (fetchStatus < 200 || fetchStatus > 299) {
            return;
        }

        RecordingInputStream recordedInput = crawlURI.getHttpRecorder().getRecordedInput();
        ReplayInputStream headerStream = null;
        ReplayCharSequence body = null;
        ReplayInputStream contentStream = null;
        try {
            long bodyLength = recordedInput.getResponseContentLength();
            if (bodyLength == 0) {
                return;
            }
            String mimeType = MimetypeUtils.truncate(crawlURI.getContentType());
            String fileClass = mime2FileClass(mimeType);
            if (fileClass == null) {
                return;
            }
            String uri = crawlURI.toString();
            long fetchBegan = crawlURI.getLong(CoreAttributeConstants.A_FETCH_BEGAN_TIME);
            // Compare as long: narrowing to int first could wrap a huge size below the limit.
            long recordLength = crawlURI.getContentSize();
            if (recordLength > MAX_RECORD_LENGTH) {
                logger.warning("Oversize URI: " + uri + " ContentType: " + crawlURI.getContentType() + " Record length: " + recordLength);
                return;
            }
            // The header length is derived as record size minus body size.
            long headerLength = Math.max(0L, recordLength - bodyLength);

            headerStream = recordedInput.getReplayInputStream();
            String metaline = makeMetaline(uri, getHostAddress(crawlURI), ArchiveUtils.get14DigitDate(fetchBegan),
                    mimeType, Long.toString(recordLength));
            String docHeader = makeDocHeader(getHeaders(headerStream, (int) headerLength), uri, metaline);

            String text;
            if ("pdf".equals(fileClass)) {
                contentStream = recordedInput.getContentReplayInputStream();
                text = extractPDF(contentStream);
            } else if ("word".equals(fileClass)) {
                contentStream = recordedInput.getContentReplayInputStream();
                text = extractWord(contentStream);
            } else {
                body = crawlURI.getHttpRecorder().getReplayCharSequence();
                text = body.toString();
            }
            if (text == null) {
                // Conversion failed (already logged); nothing to write.
                return;
            }
            writeDocument(docHeader, text);
        } catch (Exception e) {
            crawlURI.addLocalizedError(getName(), e, ERROR_TAG);
        } finally {
            closeAll(crawlURI, headerStream, body, contentStream, recordedInput);
        }
    }
}
