package org.archive.crawler.extractor;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.logging.ConsoleHandler;
import java.util.logging.Handler;
import java.util.logging.Logger;
import javax.management.Attribute;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.PosixParser;
import org.apache.commons.httpclient.Header;
import org.apache.commons.httpclient.HttpMethodBase;
import org.apache.commons.httpclient.URIException;
import org.archive.crawler.datamodel.CoreAttributeConstants;
import org.archive.crawler.datamodel.CrawlOrder;
import org.archive.crawler.datamodel.CrawlURI;
import org.archive.crawler.framework.Processor;
import org.archive.crawler.settings.CrawlerSettings;
import org.archive.crawler.settings.MapType;
import org.archive.crawler.settings.XMLSettingsHandler;
import org.archive.io.ArchiveRecord;
import org.archive.io.arc.ARCReaderFactory;
import org.archive.io.arc.ARCRecord;
import org.archive.net.UURIFactory;
import org.archive.util.HttpRecorder;
import org.archive.util.OneLineSimpleLogger;

/* loaded from: input_file:site-search/heritrix/heritrix-1.12.1.jar:org/archive/crawler/extractor/ExtractorTool.class */
/**
 * Command-line tool that runs a configurable chain of extractor
 * {@link Processor}s over every record of one or more ARC files and prints
 * the out-links found for each record to standard output.
 *
 * <p>Loading this class installs a {@link OneLineSimpleLogger} formatter on
 * every {@link ConsoleHandler} attached to the root logger.
 */
public class ExtractorTool {
    /** Extractor classes that are run, in this order, when none are named explicitly. */
    private static final String[] DEFAULT_EXTRACTORS = {
        "org.archive.crawler.extractor.ExtractorHTTP",
        "org.archive.crawler.extractor.ExtractorHTML"
    };

    /** Scratch directory used when the caller does not supply one. */
    private static final String DEFAULT_SCRATCH = "/tmp";

    /** Extractors to run against each record, in execution order. */
    private final List<Processor> extractors;

    /** Directory handed to the settings handler and the HTTP recorder for their files. */
    private final File scratchDir;

    static {
        // Every root-logger handler is inspected. The previous index loop
        // always looked at handlers[0], so a ConsoleHandler registered at any
        // other position never received the one-line formatter.
        for (Handler handler : Logger.getLogger("").getHandlers()) {
            if (handler instanceof ConsoleHandler) {
                handler.setFormatter(new OneLineSimpleLogger());
            }
        }
    }

    /**
     * Creates a tool that runs the default extractors and uses the default
     * scratch directory.
     *
     * @throws Exception if the settings or an extractor cannot be set up
     */
    public ExtractorTool() throws Exception {
        this(DEFAULT_EXTRACTORS, DEFAULT_SCRATCH);
    }

    /**
     * Creates a tool that runs the named extractors.
     *
     * @param extractorClassNames fully qualified class names of the
     *        processors to run, in order; each class must have a public
     *        constructor taking a single {@code String}. {@code null} selects
     *        the default extractors.
     * @param scratchDirectory directory for scratch files; {@code null}
     *        selects the default scratch directory. The directory is created
     *        if it does not exist.
     * @throws Exception if the settings cannot be initialized or an
     *         extractor class cannot be loaded, instantiated or registered
     */
    public ExtractorTool(String[] extractorClassNames, String scratchDirectory) throws Exception {
        // A null name list is treated like a null scratch directory: fall
        // back to the default instead of failing with a NullPointerException.
        String[] classNames = extractorClassNames == null ? DEFAULT_EXTRACTORS : extractorClassNames;

        this.scratchDir = new File(scratchDirectory == null ? DEFAULT_SCRATCH : scratchDirectory);
        if (!this.scratchDir.exists()) {
            this.scratchDir.mkdirs();
        }

        // The extractors are registered in a settings order that is backed
        // by a file inside the scratch directory.
        String scratchPath = this.scratchDir.getAbsolutePath();
        File orderFile = new File(scratchPath, ExtractorTool.class.getName() + "_order.xml");
        XMLSettingsHandler settingsHandler = new XMLSettingsHandler(orderFile);
        settingsHandler.initialize();
        settingsHandler.getOrder().setAttribute(
            new Attribute(CrawlOrder.ATTR_SETTINGS_DIRECTORY, scratchPath));
        CrawlerSettings globalSettings = settingsHandler.getSettingsObject(null);
        MapType extractProcessors =
            (MapType) settingsHandler.getOrder().getAttribute(CrawlOrder.ATTR_EXTRACT_PROCESSORS);

        this.extractors = new ArrayList<Processor>(classNames.length);
        for (int i = 0; i < classNames.length; i++) {
            // The position in the list doubles as the processor's name.
            Processor extractor = (Processor) Class.forName(classNames[i])
                .getConstructor(String.class)
                .newInstance(Integer.toString(i));
            extractProcessors.addElement(globalSettings, extractor);
            extractor.setAttribute(new Attribute("enabled", Boolean.TRUE));
            this.extractors.add(extractor);
        }
    }

    /**
     * Runs every configured extractor over each record of the given ARC file
     * and reports the out-links of each record via
     * {@link #outlinks(CrawlURI)}.
     *
     * @param str path of the ARC file to read
     * @throws IOException if the file or one of its records cannot be read
     * @throws URIException if a record's URL cannot be turned into a URI
     * @throws InterruptedException if an extractor is interrupted
     */
    public void extract(String str) throws IOException, URIException, InterruptedException {
        // NOTE(review): the reader obtained from ARCReaderFactory is never
        // explicitly closed here; consider closing it once iteration is done.
        Iterator<ArchiveRecord> records = ARCReaderFactory.get(new File(str)).iterator();
        while (records.hasNext()) {
            ARCRecord record = (ARCRecord) records.next();
            HttpRecorder recorder = HttpRecorder.wrapInputStreamWithHttpRecord(
                this.scratchDir, getClass().getName(), record, null);
            CrawlURI curi = getCrawlURI(record, recorder);
            for (Processor extractor : this.extractors) {
                extractor.process(curi);
            }
            outlinks(curi);
        }
    }

    /**
     * Prints the URI of the given crawl URI on one line, followed by one
     * line per out-link giving its destination, hop type and context.
     *
     * @param crawlURI processed URI whose out-links are printed
     */
    protected void outlinks(CrawlURI crawlURI) {
        System.out.println(crawlURI.getUURI().toString());
        for (Link link : crawlURI.getOutLinks()) {
            System.out.println(UURIFactory.SPACE + link.getDestination()
                + UURIFactory.SPACE + link.getHopType()
                + UURIFactory.SPACE + link.getContext());
        }
    }

    /**
     * Builds a {@link CrawlURI} for an ARC record, copying URL, length and
     * mimetype from the record's metadata and attaching the given recorder.
     *
     * <p>For every scheme other than {@code filedesc} (presumably the ARC
     * file's own description record) a stand-in HTTP method is attached that
     * answers response-header lookups from the record's metadata, and the
     * fetch status is taken from the record's status code, defaulting to 200
     * when the record has none.
     *
     * @param aRCRecord record to describe
     * @param httpRecorder recorder wrapping the record's content
     * @return the populated crawl URI
     * @throws URIException if the record's URL is not a valid URI
     */
    protected CrawlURI getCrawlURI(final ARCRecord aRCRecord, HttpRecorder httpRecorder) throws URIException {
        CrawlURI curi = new CrawlURI(UURIFactory.getInstance(aRCRecord.getMetaData().getUrl()));
        curi.setContentSize(aRCRecord.getMetaData().getLength());
        curi.setContentType(aRCRecord.getMetaData().getMimetype());
        curi.setHttpRecorder(httpRecorder);
        if (curi.getUURI().getScheme().equals("filedesc")) {
            return curi;
        }

        curi.putObject(CoreAttributeConstants.A_HTTP_TRANSACTION, new HttpMethodBase() {
            @Override
            public String getName() {
                return getClass().getName() + "_method";
            }

            @Override
            public Header getResponseHeader(String headerName) {
                // A missing or empty value is reported as "no such header".
                String headerValue = (String) aRCRecord.getMetaData().getHeaderValue(headerName);
                if (headerValue == null || headerValue.length() == 0) {
                    return null;
                }
                return new Header(headerName, headerValue);
            }
        });
        String statusCode = aRCRecord.getMetaData().getStatusCode();
        curi.setFetchStatus(statusCode == null ? 200 : Integer.parseInt(statusCode));
        return curi;
    }

    /**
     * Prints the command-line help and terminates the JVM.
     *
     * @param helpFormatter formatter used to render the help text
     * @param options options to describe
     * @param i process exit code
     */
    private static void usage(HelpFormatter helpFormatter, Options options, int i) {
        helpFormatter.printHelp("java " + ExtractorTool.class.getName() + " \\\n[--scratch=DIR] [--extractor=EXTRACTOR1,EXTRACTOR2,...] ARC", options);
        System.exit(i);
    }

    /**
     * Command-line entry point: parses the options, builds the tool and runs
     * it over every ARC file given as an argument. Prints usage and exits
     * when no ARC file is given or help is requested.
     *
     * @param strArr command-line arguments
     * @throws Exception if option parsing, tool setup or extraction fails
     */
    public static void main(String[] strArr) throws Exception {
        Options options = new Options();
        options.addOption(new Option("h", "help", false, "Prints this message and exits."));

        StringBuilder defaultNames = new StringBuilder();
        for (String name : DEFAULT_EXTRACTORS) {
            if (defaultNames.length() > 0) {
                defaultNames.append(", ");
            }
            defaultNames.append(name);
        }
        options.addOption(new Option("e", "extractor", true, "List of comma-separated extractor class names. Run in order listed. If no extractors listed, runs following: " + defaultNames + "."));
        options.addOption(new Option("s", "scratch", true, "Directory to write scratch files to. Default: '" + DEFAULT_SCRATCH + "'."));

        CommandLine commandLine = new PosixParser().parse(options, strArr, false);
        List arcFiles = commandLine.getArgList();
        HelpFormatter helpFormatter = new HelpFormatter();
        if (arcFiles.size() <= 0) {
            usage(helpFormatter, options, 0);
        }

        String[] extractorNames = DEFAULT_EXTRACTORS;
        String scratch = null;
        for (Option option : commandLine.getOptions()) {
            // The switch value is the option's id, matched against the short
            // option letters registered above.
            switch (option.getId()) {
                case 'e':
                    String value = option.getValue();
                    if (value == null || value.length() <= 0) {
                        extractorNames = new String[0];
                    } else {
                        extractorNames = value.split(",");
                        // Tolerate "A, B" style lists (the help text prints
                        // the defaults that way); untrimmed names would fail
                        // in Class.forName.
                        for (int k = 0; k < extractorNames.length; k++) {
                            extractorNames[k] = extractorNames[k].trim();
                        }
                    }
                    break;
                case 'h':
                    usage(helpFormatter, options, 0);
                    break;
                case 's':
                    scratch = option.getValue();
                    break;
                default:
                    throw new RuntimeException("Unexpected option: " + option.getId());
            }
        }

        ExtractorTool tool = new ExtractorTool(extractorNames, scratch);
        for (Object arcFile : arcFiles) {
            tool.extract((String) arcFile);
        }
    }
}
