package org.archive.crawler.extractor;

import au.id.jericho.lib.html.Attribute;
import au.id.jericho.lib.html.Attributes;
import au.id.jericho.lib.html.Element;
import au.id.jericho.lib.html.FormControl;
import au.id.jericho.lib.html.FormControlType;
import au.id.jericho.lib.html.FormField;
import au.id.jericho.lib.html.HTMLElementName;
import au.id.jericho.lib.html.Source;
import au.id.jericho.lib.html.StartTagType;
import it.unimi.dsi.mg4j.document.DispatchingDocumentFactory;
import java.util.Collection;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.logging.Logger;
import org.apache.commons.httpclient.URIException;
import org.archive.crawler.admin.ui.JobConfigureUtils;
import org.archive.crawler.datamodel.CoreAttributeConstants;
import org.archive.crawler.datamodel.CrawlURI;
import org.archive.crawler.datamodel.RobotsHonoringPolicy;
import org.archive.io.warc.WARCConstants;

/* loaded from: input_file:site-search/heritrix/heritrix-1.12.1.jar:org/archive/crawler/extractor/JerichoExtractorHTML.class */
/**
 * HTML link extractor backed by the Jericho HTML parser.
 *
 * <p>Walks every normal start tag of a document and dispatches on the element
 * name: {@code meta}, {@code script}, {@code style} and {@code form} elements
 * have dedicated handlers, every other element that carries attributes goes to
 * {@link #processGeneralTag}. The parser reads the whole document into memory
 * (see the description string passed to the superclass constructor).
 *
 * <p>NOTE(review): this source is decompiler output. The body of
 * {@link #processGeneralTag} was not recovered and throws
 * {@link UnsupportedOperationException}.
 */
public class JerichoExtractorHTML extends ExtractorHTML implements CoreAttributeConstants {
    private static final long serialVersionUID = 1684681316546343615L;

    /** Logger named after the runtime class; assigned in the constructor. */
    private Logger logger;

    /** Number of {@code form} elements handled by {@link #processForm}; shown in {@link #report()}. */
    protected long numberOfFormsProcessed;

    /**
     * Creates the extractor with the default description text.
     *
     * @param str first argument forwarded to the superclass constructor
     */
    public JerichoExtractorHTML(String str) {
        this(str, "Jericho-HTML extractor. Extracts links from HTML documents using Jericho HTML Parser. Offers same basic functionality as ExtractorHTML but better handles broken HTML and extraction of default values from HTML forms. A word of warning: the used parser, the Jericho HTML Parser, reads the whole document into memory for parsing - thus this extractor has an inherent OOME risk. This OOME risk can be reduced/eleminated by limiting the size of documents to be parsed (i.e. using NotExceedsDocumentLengthTresholdDecideRule). ");
    }

    /**
     * Creates the extractor, forwarding both arguments to the superclass and
     * initialising the logger and the form counter.
     *
     * @param str  first argument forwarded to the superclass constructor
     * @param str2 second argument forwarded to the superclass constructor
     *             (the one-argument constructor passes the description text here)
     */
    public JerichoExtractorHTML(String str, String str2) {
        super(str, str2);
        this.logger = Logger.getLogger(getClass().getName());
        this.numberOfFormsProcessed = 0L;
    }

    /**
     * Collects the attributes whose key starts with {@code "on"}.
     *
     * @param attributes attributes of one element
     * @return a new list with the matching attributes, in iteration order
     */
    private static List<Attribute> findOnAttributes(Attributes attributes) {
        List<Attribute> handlers = new LinkedList<Attribute>();
        // Explicit iterator + cast: works whether or not the parser API is generified.
        for (Iterator it = attributes.iterator(); it.hasNext();) {
            Attribute candidate = (Attribute) it.next();
            if (candidate.getKey().startsWith("on")) {
                handlers.add(candidate);
            }
        }
        return handlers;
    }

    /* JADX WARN: Code restructure failed: missing block: B:82:0x01b5, code lost:
    
        if (r0 != null) goto L52;
     */
    /* JADX WARN: Code restructure failed: missing block: B:91:0x024c, code lost:
    
        if (r0 != null) goto L72;
     */
    /*
        Code decompiled incorrectly, please refer to instructions dump.
        To view partially-correct add '--show-bad-code' argument
    */
    /**
     * Handler for elements without a dedicated handler; also invoked by
     * {@link #processScript} and {@link #processStyle}.
     *
     * <p>NOTE(review): the decompiler could not reconstruct this method. As it
     * stands it always throws, so every caller in this class fails at runtime
     * until the real body is restored from the original sources or bytecode.
     *
     * @throws UnsupportedOperationException always, in this decompiled form
     */
    protected void processGeneralTag(org.archive.crawler.datamodel.CrawlURI r8, au.id.jericho.lib.html.Element r9, au.id.jericho.lib.html.Attributes r10) {
        /*
            Method dump skipped, instructions count: 1090
            To view this dump add '--comments-level debug' option
        */
        throw new UnsupportedOperationException("Method not decompiled: org.archive.crawler.extractor.JerichoExtractorHTML.processGeneralTag(org.archive.crawler.datamodel.CrawlURI, au.id.jericho.lib.html.Element, au.id.jericho.lib.html.Attributes):void");
    }

    /**
     * Handles a {@code meta} element: records a robots directive on the URI
     * and queues the target of a refresh directive as a link.
     *
     * <p>The {@code name} and {@code http-equiv} values are compared
     * case-insensitively, so markup such as {@code HTTP-EQUIV="Refresh"} is
     * recognised.
     *
     * @param crawlURI URI being processed
     * @param element  the {@code meta} element
     * @return {@code true} if extraction of the rest of the document should be
     *         abandoned because of a robots {@code nofollow}/{@code none}
     *         directive; {@code false} otherwise
     */
    protected boolean processMeta(CrawlURI crawlURI, Element element) {
        String metaName = element.getAttributeValue("name");
        String httpEquiv = element.getAttributeValue("http-equiv");
        String content = element.getAttributeValue("content");
        if (content == null) {
            // Neither directive can be acted on without a content value.
            return false;
        }

        if ("robots".equalsIgnoreCase(metaName)) {
            crawlURI.putString(CoreAttributeConstants.A_META_ROBOTS, content);
            RobotsHonoringPolicy policy = getSettingsHandler().getOrder().getRobotsHonoringPolicy();
            // NOTE(review): 1 and 2 are constants inlined by the compiler; presumably
            // RobotsHonoringPolicy.IGNORE and CUSTOM - confirm against that class.
            boolean directiveApplies = policy == null
                    || (!policy.isType(crawlURI, 1) && !policy.isType(crawlURI, 2));
            String lowered = content.toLowerCase();
            boolean forbidsLinks = lowered.indexOf("nofollow") >= 0 || lowered.indexOf("none") >= 0;
            if (directiveApplies && forbidsLinks) {
                this.logger.fine("HTML extraction skipped due to robots meta-tag for: " + crawlURI.toString());
                return true;
            }
        }

        if (!"refresh".equalsIgnoreCase(httpEquiv)) {
            return false;
        }
        int equalsAt = content.indexOf('=');
        if (equalsAt < 0) {
            // No '=' in the content (e.g. content="30"): there is no target to
            // take, and using the whole value would queue a bogus relative link.
            return false;
        }
        String refreshTarget = content.substring(equalsAt + 1);
        try {
            crawlURI.createAndAddLinkRelativeToBase(refreshTarget, "meta", 'R');
        } catch (URIException e) {
            if (getController() != null) {
                getController().logUriError(e, crawlURI.getUURI(), refreshTarget);
            } else {
                this.logger.info("Failed createAndAddLinkRelativeToBase " + crawlURI + ", " + element.toString() + ", " + refreshTarget + WARCConstants.COLON_SPACE + e);
            }
        }
        return false;
    }

    /**
     * Handles a {@code script} element: its attributes go through
     * {@link #processGeneralTag}, its content through {@code processScriptCode}.
     *
     * @param crawlURI URI being processed
     * @param element  the {@code script} element
     */
    protected void processScript(CrawlURI crawlURI, Element element) {
        processGeneralTag(crawlURI, element, element.getAttributes());
        processScriptCode(crawlURI, element.getContent());
    }

    /**
     * Handles a {@code style} element: its attributes go through
     * {@link #processGeneralTag}, its content through
     * {@code ExtractorCSS.processStyleCode}, whose return value is added to
     * the extracted-links counter.
     *
     * @param crawlURI URI being processed
     * @param element  the {@code style} element
     */
    protected void processStyle(CrawlURI crawlURI, Element element) {
        processGeneralTag(crawlURI, element, element.getAttributes());
        this.numberOfLinksExtracted += ExtractorCSS.processStyleCode(crawlURI, element.getContent(), getController());
    }

    /**
     * Handles a {@code form} element by building a link from the form action
     * plus one {@code name=value} pair per control value, and passing it to
     * {@code processLink}. Does nothing when the ignore-form-action-URLs
     * setting is enabled for this URI.
     *
     * <p>For submit controls the predefined values are used, for all other
     * controls the current values; a control without values contributes an
     * empty {@code name=} pair. Names and values are appended as-is.
     *
     * @param crawlURI URI being processed
     * @param element  the {@code form} element
     */
    protected void processForm(CrawlURI crawlURI, Element element) {
        if (((Boolean) getUncheckedAttribute(crawlURI, ExtractorHTML.ATTR_IGNORE_FORM_ACTION_URLS)).booleanValue()) {
            return;
        }
        // NOTE(review): JobConfigureUtils.ACTION and OTHERWISE_IN_RULE look like
        // constants substituted by the decompiler for plain literals (presumably
        // "action" and "?") - confirm before cleaning up these references.
        String action = element.getAttributeValue(JobConfigureUtils.ACTION);
        String formName = element.getAttributeValue("name");
        String queryStart = DispatchingDocumentFactory.OTHERWISE_IN_RULE;
        this.numberOfFormsProcessed++;

        // Every pair is appended with a leading '&', so a non-empty query
        // always starts with '&' at index 0.
        StringBuilder query = new StringBuilder();
        for (Iterator fields = element.findFormFields().iterator(); fields.hasNext();) {
            FormField field = (FormField) fields.next();
            for (Iterator controls = field.getFormControls().iterator(); controls.hasNext();) {
                FormControl control = (FormControl) controls.next();
                String controlName = control.getName();
                Collection values = control.getFormControlType() != FormControlType.SUBMIT
                        ? control.getValues()
                        : control.getPredefinedValues();
                if (values.isEmpty()) {
                    query.append('&').append(controlName).append('=');
                } else {
                    for (Iterator vals = values.iterator(); vals.hasNext();) {
                        query.append('&').append(controlName).append('=').append((String) vals.next());
                    }
                }
            }
        }

        // Unless the action already contains the query marker, the leading '&'
        // becomes the marker itself.
        boolean actionHasQuery = action != null && action.contains(queryStart);
        if (!actionHasQuery && query.length() > 0) {
            query.replace(0, 1, queryStart);
        }
        String target = action == null ? query.toString() : action + query;
        processLink(crawlURI, target, Link.elementContext(element.getName(), "name=" + formName));
    }

    /**
     * Parses the document and dispatches each normal start-tag element to the
     * matching handler. Stops early when {@link #processMeta} returns
     * {@code true}. Elements without a dedicated handler are only processed
     * when they have at least one attribute.
     *
     * @param crawlURI     URI being processed
     * @param charSequence the document text
     */
    /* JADX INFO: Access modifiers changed from: package-private */
    @Override // org.archive.crawler.extractor.ExtractorHTML
    public void extract(CrawlURI crawlURI, CharSequence charSequence) {
        List elements = new Source(charSequence).findAllElements(StartTagType.NORMAL);
        for (Iterator it = elements.iterator(); it.hasNext();) {
            Element element = (Element) it.next();
            String tag = element.getName();
            if ("meta".equals(tag)) {
                if (processMeta(crawlURI, element)) {
                    return;
                }
            } else if ("script".equals(tag)) {
                processScript(crawlURI, element);
            } else if ("style".equals(tag)) {
                processStyle(crawlURI, element);
            } else if (HTMLElementName.FORM.equals(tag)) {
                processForm(crawlURI, element);
            } else {
                Attributes attributes = element.getAttributes();
                if (!attributes.isEmpty()) {
                    processGeneralTag(crawlURI, element, attributes);
                }
            }
        }
    }

    /**
     * Builds the plain-text status report for this processor: handled URIs,
     * processed forms and extracted links.
     *
     * @return the report text, terminated by a blank line
     */
    @Override // org.archive.crawler.extractor.ExtractorHTML, org.archive.crawler.framework.Processor
    public String report() {
        StringBuilder out = new StringBuilder();
        out.append("Processor: org.archive.crawler.extractor.JerichoExtractorHTML\n");
        out.append("  Function:          Link extraction on HTML documents\n");
        out.append("  CrawlURIs handled: ").append(this.numberOfCURIsHandled).append("\n");
        out.append("  Forms processed:   ").append(this.numberOfFormsProcessed).append("\n");
        out.append("  Links extracted:   ").append(this.numberOfLinksExtracted).append("\n\n");
        return out.toString();
    }
}
