package org.archive.crawler.extractor;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.net.URL;
import java.util.Collection;
import java.util.Iterator;
import javax.management.AttributeNotFoundException;
import javax.management.InvalidAttributeValueException;
import javax.management.MBeanException;
import javax.management.ReflectionException;
import org.apache.commons.collections.CollectionUtils;
import org.apache.commons.collections.Predicate;
import org.apache.commons.httpclient.URIException;
import org.apache.tools.ant.taskdefs.XSLTLiaison;
import org.archive.crawler.datamodel.CoreAttributeConstants;
import org.archive.crawler.datamodel.CrawlOrder;
import org.archive.crawler.datamodel.CrawlURI;
import org.archive.crawler.settings.MapType;
import org.archive.crawler.settings.XMLSettingsHandler;
import org.archive.net.UURI;
import org.archive.net.UURIFactory;
import org.archive.util.HttpRecorder;
import org.archive.util.TmpDirTestCase;

/* loaded from: input_file:site-search/heritrix/heritrix-1.12.1.jar:org/archive/crawler/extractor/ExtractorHTMLTest.class */
/**
 * Tests for {@link ExtractorHTML}.
 *
 * <p>{@link #setUp()} writes a small HTML page containing a single anchor to
 * the temporary directory and records it through an {@link HttpRecorder};
 * the test methods then run the extractor and inspect the out-links of the
 * resulting {@link CrawlURI}.
 *
 * <p>The class can also be run from the command line (see
 * {@link #main(String[])}) to dump the links extracted from an arbitrary
 * URL or local path.
 */
public class ExtractorHTMLTest extends TmpDirTestCase implements CoreAttributeConstants {
    /** Host name used for the fixture file name and for the URI under test. */
    private static final String ARCHIVE_DOT_ORG = "archive.org";

    /** The only link in the fixture page; the extractor is expected to report it. */
    private static final String LINK_TO_FIND = "http://www.hewlett.org/";

    private HttpRecorder recorder = null;
    private ExtractorHTML extractor = null;

    /**
     * Builds an {@link ExtractorHTML} registered in the rules map of a fresh
     * crawl order backed by an order file in the temporary directory.
     *
     * @return the extractor instance returned by the settings map
     */
    protected ExtractorHTML createExtractor() throws InvalidAttributeValueException, AttributeNotFoundException, MBeanException, ReflectionException {
        final String name = getClass().getName();
        final File orderFile = new File(getTmpDir(), name + ".order.xml");
        final XMLSettingsHandler handler = new XMLSettingsHandler(orderFile);
        handler.initialize();
        final MapType rules = (MapType) handler.getOrder().getAttribute(CrawlOrder.ATTR_RULES);
        return (ExtractorHTML) rules.addElement(handler.getSettingsObject(null), new ExtractorHTML(name));
    }

    /**
     * Creates the extractor, writes the fixture page to the temporary
     * directory and records it so tests can replay it.
     */
    @Override // org.archive.util.TmpDirTestCase, junit.framework.TestCase
    public void setUp() throws Exception {
        super.setUp();
        this.extractor = createExtractor();
        final File file = new File(getTmpDir(), ARCHIVE_DOT_ORG + ".html");
        final URL url = new URL(XSLTLiaison.FILE_PROTOCOL_PREFIX + file.getAbsolutePath());
        final String html = "<html><head><title>test</title><body><a href="
                + LINK_TO_FIND + ">Hewlett Foundation</a>" + "</body></html>";
        final FileOutputStream out = new FileOutputStream(file);
        try {
            out.write(html.getBytes());
            out.flush();
        } finally {
            // Close even when the write fails so the file handle is not leaked.
            out.close();
        }
        this.recorder = HttpRecorder.wrapInputStreamWithHttpRecord(getTmpDir(), getClass().getName(), url.openStream(), null);
    }

    /** Delegates to the superclass tear-down. */
    @Override // org.archive.util.TmpDirTestCase, junit.framework.TestCase
    public void tearDown() throws Exception {
        super.tearDown();
    }

    /**
     * Runs the extractor over the recorded fixture page and checks that the
     * anchor target {@link #LINK_TO_FIND} shows up among the out-links.
     */
    public void testInnerProcess() throws IOException {
        final String uri = UURIFactory.getInstance("http://" + ARCHIVE_DOT_ORG).toString();
        final CrawlURI curi = setupCrawlURI(this.recorder, uri);
        this.extractor.innerProcess(curi);
        boolean found = false;
        for (Link link : curi.getOutLinks()) {
            if (link.getDestination().toString().equals(LINK_TO_FIND)) {
                found = true;
                break;
            }
        }
        assertTrue("Did not find link " + LINK_TO_FIND, found);
    }

    /**
     * Creates a {@link CrawlURI} that looks like a successfully fetched HTML
     * document backed by the given recorder.
     *
     * @param httpRecorder recorder holding the recorded response
     * @param str URI string of the document
     * @return the prepared crawl URI
     */
    private CrawlURI setupCrawlURI(HttpRecorder httpRecorder, String str) throws URIException {
        final CrawlURI curi = new CrawlURI(UURIFactory.getInstance(str));
        // Size comes from the recorder attached below, not from the field.
        curi.setContentSize(httpRecorder.getRecordedInput().getSize());
        curi.setContentType("text/html");
        curi.setFetchStatus(200);
        curi.setHttpRecorder(httpRecorder);
        curi.putObject(CoreAttributeConstants.A_HTTP_TRANSACTION, new Object());
        return curi;
    }

    /**
     * Manual hook: point {@code uuri} at a page to dump its extracted links.
     * With the default {@code null} value this test does nothing.
     */
    public void testPageParse() throws InvalidAttributeValueException, AttributeNotFoundException, MBeanException, ReflectionException, IOException {
        final UURI uuri = null;
        if (uuri != null) {
            runExtractor(uuri);
        }
    }

    /**
     * Converts a URL or local path into a {@link UURI}.
     *
     * @param str a URL, or a path that gets the file protocol prefix prepended
     *            when it has no {@code "://"} after its first character
     * @return the parsed URI
     */
    protected UURI getUURI(String str) throws URIException {
        final boolean hasScheme = str.indexOf("://") > 0;
        return UURIFactory.getInstance(hasScheme ? str : XSLTLiaison.FILE_PROTOCOL_PREFIX + str);
    }

    /**
     * Runs the extractor over {@code uuri} without specifying an encoding.
     *
     * @param uuri document to fetch; {@code null} is a no-op
     */
    protected void runExtractor(UURI uuri) throws InvalidAttributeValueException, AttributeNotFoundException, MBeanException, ReflectionException, IOException {
        runExtractor(uuri, null);
    }

    /**
     * Fetches {@code uuri}, runs a fresh extractor over it and prints the
     * extractor report followed by the out-links grouped by hop type.
     *
     * @param uuri document to fetch; {@code null} is a no-op
     * @param str value passed to the recorder as its last argument
     *            (supplied as ENCODING by {@link #main(String[])}); may be {@code null}
     */
    protected void runExtractor(UURI uuri, String str) throws IOException, InvalidAttributeValueException, AttributeNotFoundException, MBeanException, ReflectionException {
        if (uuri == null) {
            return;
        }
        this.extractor = createExtractor();
        final URL url = new URL(uuri.toString());
        this.recorder = HttpRecorder.wrapInputStreamWithHttpRecord(getTmpDir(), getClass().getName(), url.openStream(), str);
        final CrawlURI curi = setupCrawlURI(this.recorder, url.toString());
        this.extractor.innerProcess(curi);
        System.out.println("+" + this.extractor.report());

        final Collection<Link> outLinks = curi.getOutLinks();
        int total = 0;
        System.out.println("+HTML Links (hopType=L):");
        total += printLinksWithHopType(outLinks, 'L');
        System.out.println("+HTML Embeds (hopType=E):");
        total += printLinksWithHopType(outLinks, 'E');
        System.out.println("+HTML Speculative Embeds (hopType=X):");
        total += printLinksWithHopType(outLinks, 'X');
        System.out.println("+HTML Other (all other hopTypes):");
        if (outLinks != null) {
            for (Link link : outLinks) {
                final char hop = link.getHopType();
                if (hop != 'X' && hop != 'L' && hop != 'E') {
                    total++;
                    System.out.println(hop + UURIFactory.SPACE + link.getDestination());
                }
            }
        }
        System.out.println("TOTAL URIS EXTRACTED: " + total);
    }

    /**
     * Prints the destination of every link with the given hop type.
     *
     * @param links links to scan; may be {@code null}
     * @param hopType hop type to select
     * @return number of links printed
     */
    private int printLinksWithHopType(Collection<Link> links, char hopType) {
        int count = 0;
        if (links != null) {
            for (Link link : links) {
                if (link.getHopType() == hopType) {
                    count++;
                    System.out.println(link.getDestination());
                }
            }
        }
        return count;
    }

    /**
     * Builds a predicate matching any {@link Link} whose destination contains
     * the given fragment.
     */
    private static Predicate destinationContains(final String fragment) {
        return new Predicate() {
            @Override // org.apache.commons.collections.Predicate
            public boolean evaluate(Object obj) {
                return ((Link) obj).getDestination().toString().indexOf(fragment) >= 0;
            }
        };
    }

    /** Checks that the {@code src} attribute of an {@code <embed>} tag is extracted. */
    public void testEmbedSrc() throws URIException {
        final CrawlURI curi = new CrawlURI(UURIFactory.getInstance("http://www.example.org"));
        this.extractor.extract(curi, "<embed src=\"/documents/prem/18/1/graphics/qtvr/hall.mov\" width=\"320\" height=\"212\" controller=\"true\" CORRECTION=\"FULL\" pluginspage=\"http://www.apple.com/quicktime/download/\" /> ");
        assertTrue(CollectionUtils.exists(curi.getOutLinks(),
                destinationContains("/documents/prem/18/1/graphics/qtvr/hall.mov")));
    }

    /** Checks that newlines inside an {@code href} value do not prevent extraction of the link. */
    public void testHrefWhitespace() throws URIException {
        final CrawlURI curi = new CrawlURI(UURIFactory.getInstance("http://www.carsound.dk"));
        this.extractor.extract(curi, "<a href=\"http://www.carsound.dk\n\n\n\"\ntarget=\"_blank\">C.A.R. Sound\n\n\n\n</a>");
        assertTrue("Not stripping new lines", CollectionUtils.exists(curi.getOutLinks(),
                destinationContains("http://www.carsound.dk/")));
    }

    /**
     * Command-line entry point: extracts and prints the links of one document.
     *
     * @param strArr {@code URL|PATH [ENCODING]}; exits with status 1 on any
     *               other argument count
     */
    public static void main(String[] strArr) throws Exception {
        if (strArr.length != 1 && strArr.length != 2) {
            System.err.println("Usage: " + ExtractorHTMLTest.class.getName() + " URL|PATH [ENCODING]");
            System.exit(1);
        }
        final ExtractorHTMLTest test = new ExtractorHTMLTest();
        test.setUp();
        try {
            final String encoding = strArr.length == 2 ? strArr[1] : null;
            test.runExtractor(test.getUURI(strArr[0]), encoding);
        } finally {
            // Always clean up, whether or not the extraction succeeded.
            test.tearDown();
        }
    }
}
