package org.archive.extractor;

import java.util.ArrayList;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import org.apache.commons.httpclient.URIException;
import org.archive.crawler.extractor.Link;
import org.archive.net.UURI;
import org.archive.net.UURIFactory;
import org.archive.util.DevUtils;
import org.archive.util.TextUtils;

/* loaded from: input_file:site-search/heritrix/heritrix-1.12.1.jar:org/archive/extractor/RegexpHTMLLinkExtractor.class */
/**
 * Regular-expression based link extractor for HTML content.
 *
 * <p>The inherited {@code sourceContent} is scanned tag by tag with
 * {@link #RELEVANT_TAG_EXTRACTOR}. Each matched tag is dispatched to a handler
 * (general tag, meta, script or style) which appends any discovered
 * {@link Link}s to {@link #next}. {@link #findNextLink()} returns as soon as
 * at least one link is queued.
 */
public class RegexpHTMLLinkExtractor extends CharSequenceLinkExtractor {
    private static final Logger logger = Logger.getLogger(RegexpHTMLLinkExtractor.class.getName());

    /** When true, a robots meta tag containing "nofollow" or "none" cancels further extraction. */
    boolean honorRobots = true;
    /** Not read anywhere in this class. */
    boolean extractInlineCss = true;
    /** Not read anywhere in this class. */
    boolean extractInlineJs = true;

    /** Links discovered so far but not yet handed out. */
    protected LinkedList<Link> next = new LinkedList<>();
    /** Matcher over the source content; created lazily by {@link #findNextLink()}. */
    protected Matcher tags;

    /*
     * Capture groups of RELEVANT_TAG_EXTRACTOR:
     *   1: whole script block        2: script open tag
     *   3: whole style block         4: style open tag
     *   5: any other tag with attributes
     *   6: element name of (5)       7: set when the element is "meta"
     *   8: HTML comment
     */
    static final String RELEVANT_TAG_EXTRACTOR = "(?is)<(?:((script[^>]*+)>.*?</script)|((style[^>]*+)>[^<]*+</style)|(((meta)|(?:\\w+))\\s+[^>]*+)|(!--.*?--))>";

    /*
     * Capture groups of EACH_ATTRIBUTE_EXTRACTOR:
     *   1: attribute name (any)      2: href        3: action     4: on* handlers
     *   5: src-like attributes       6: codebase    7: classid/data
     *   8: archive                   9: code        10: value     11: any other name
     *   12: double-quoted value      13: single-quoted value      14: unquoted value
     */
    static final String EACH_ATTRIBUTE_EXTRACTOR = "(?is)\\s((href)|(action)|(on\\w*)|((?:src)|(?:lowsrc)|(?:background)|(?:cite)|(?:longdesc)|(?:usemap)|(?:profile)|(?:datasrc)|(?:for))|(codebase)|((?:classid)|(?:data))|(archive)|(code)|(value)|([-\\w]+))\\s*=\\s*(?:(?:\"(.*?)(?:\"|$))|(?:'(.*?)(?:'|$))|(\\S+))";
    static final String LIKELY_URI_PATH = "(\\.{0,2}[^\\.\\n\\r\\s\"']*(\\.[^\\.\\n\\r\\s\"']+)+)";
    static final String ESCAPED_AMP = "&amp;";
    static final String AMP = "&";
    static final String WHITESPACE = "\\s";
    static final String CLASSEXT = ".class";
    static final String APPLET = "applet";
    static final String BASE = "base";
    static final String LINK = "link";
    static final String JAVASCRIPT = "(?i)^javascript:.*";
    static final String NON_HTML_PATH_EXTENSION = "(?i)(gif)|(jp(e)?g)|(png)|(tif(f)?)|(bmp)|(avi)|(mov)|(mp(e)?g)|(mp3)|(mp4)|(swf)|(wav)|(au)|(aiff)|(mid)";

    /**
     * Advances through the source content until at least one link has been
     * queued in {@link #next}.
     *
     * <p>The loop also stops when the current thread is interrupted; note that
     * {@link Thread#interrupted()} clears the interrupt flag.
     *
     * @return {@code true} if {@link #next} is non-empty after processing a tag,
     *         {@code false} once no further tags match or the thread was interrupted
     */
    @Override // org.archive.extractor.CharSequenceLinkExtractor
    protected boolean findNextLink() {
        if (this.tags == null) {
            this.tags = TextUtils.getMatcher(RELEVANT_TAG_EXTRACTOR, this.sourceContent);
        }
        while (this.tags.find() && !Thread.interrupted()) {
            processCurrentTag();
            if (!this.next.isEmpty()) {
                return true;
            }
        }
        return false;
    }

    /** Dispatches the tag most recently matched by {@link #tags} to its handler. */
    private void processCurrentTag() {
        // Every group sits after the leading '<', so a matched group starts at index >= 1.
        if (this.tags.start(8) > 0) {
            // HTML comment: nothing to extract.
            return;
        }
        if (this.tags.start(7) > 0) {
            processMeta(this.sourceContent.subSequence(this.tags.start(5), this.tags.end(5)));
        } else if (this.tags.start(5) > 0) {
            CharSequence element = this.sourceContent.subSequence(this.tags.start(6), this.tags.end(6));
            CharSequence tag = this.sourceContent.subSequence(this.tags.start(5), this.tags.end(5));
            processGeneralTag(element, tag);
        } else if (this.tags.start(1) > 0) {
            int blockStart = this.tags.start(1);
            processScript(this.sourceContent.subSequence(blockStart, this.tags.end(1)),
                    this.tags.end(2) - blockStart);
        } else if (this.tags.start(3) > 0) {
            int blockStart = this.tags.start(3);
            processStyle(this.sourceContent.subSequence(blockStart, this.tags.end(3)),
                    this.tags.end(4) - blockStart);
        }
    }

    /**
     * Returns the value of the attribute currently matched by an
     * {@link #EACH_ATTRIBUTE_EXTRACTOR} matcher, without surrounding quotes.
     *
     * @param attributes matcher positioned on an attribute
     * @param tag the sequence the matcher is running over
     * @return the double-quoted, single-quoted or unquoted value, whichever matched
     */
    private static CharSequence attributeValue(Matcher attributes, CharSequence tag) {
        int group = attributes.start(12) > -1 ? 12 : attributes.start(13) > -1 ? 13 : 14;
        return tag.subSequence(attributes.start(group), attributes.end(group));
    }

    /**
     * Extracts links from the attributes of a single tag.
     *
     * <p>A {@code base} element's href also replaces the inherited {@code base}.
     * Resource attributes (classid/data, archive, code) are collected first and
     * then embedded, resolved against the tag's codebase when one is present.
     *
     * @param element the element name
     * @param tag the tag text (element name plus attributes) to scan
     * @return {@code true} if this call added at least one link to {@link #next}
     */
    protected boolean processGeneralTag(CharSequence element, CharSequence tag) {
        Matcher attributes = TextUtils.getMatcher(EACH_ATTRIBUTE_EXTRACTOR, tag);
        String codebase = null;
        ArrayList<String> resources = null;
        final int linksBefore = this.next.size();
        while (attributes.find()) {
            CharSequence value = attributeValue(attributes, tag);
            if (attributes.start(2) > -1) {
                // href
                CharSequence context = Link.elementContext(element, attributes.group(2));
                if (element.toString().equalsIgnoreCase(LINK)) {
                    // <link href=...> is treated as an embed rather than a navigation link.
                    processEmbed(value, context);
                } else {
                    if (element.toString().equalsIgnoreCase(BASE)) {
                        try {
                            this.base = UURIFactory.getInstance(value.toString());
                        } catch (URIException e) {
                            this.extractErrorListener.noteExtractError(e, this.source, value);
                        }
                    }
                    processLink(value, context);
                }
            } else if (attributes.start(3) > -1) {
                // action
                processLink(value, Link.elementContext(element, attributes.group(3)));
            } else if (attributes.start(4) > -1) {
                // on* event handler: the value is script code
                processScriptCode(value);
            } else if (attributes.start(5) > -1) {
                // src and similar
                processEmbed(value, Link.elementContext(element, attributes.group(5)));
            } else if (attributes.start(6) > -1) {
                // codebase: embedded itself and remembered for resolving resources below
                codebase = TextUtils.replaceAll(ESCAPED_AMP, value, AMP);
                processEmbed(codebase, Link.elementContext(element, attributes.group(6)));
            } else if (attributes.start(7) > -1) {
                // classid / data
                if (resources == null) {
                    resources = new ArrayList<>();
                }
                resources.add(value.toString());
            } else if (attributes.start(8) > -1) {
                // archive: whitespace-separated list of resources
                if (resources == null) {
                    resources = new ArrayList<>();
                }
                for (String archive : TextUtils.split(WHITESPACE, value)) {
                    resources.add(archive);
                }
            } else if (attributes.start(9) > -1) {
                // code: for applets, make sure the name ends in ".class"
                if (resources == null) {
                    resources = new ArrayList<>();
                }
                String code = value.toString();
                boolean needsClassExt = element.toString().equalsIgnoreCase(APPLET)
                        && !code.toLowerCase().endsWith(CLASSEXT);
                resources.add(needsClassExt ? code + CLASSEXT : code);
            } else if (attributes.start(10) > -1) {
                // value: only followed when it looks like a URI path
                if (TextUtils.matches(LIKELY_URI_PATH, value)) {
                    processLink(value, Link.elementContext(element, attributes.group(10)));
                }
            }
            // Group 11 (any other attribute) is intentionally ignored.
        }
        TextUtils.recycleMatcher(attributes);

        if (resources != null) {
            UURI codebaseUri = null;
            String resource = null;
            // The try covers the whole loop so that failures while resolving a
            // resource are handled here and the "res=" value in the log is meaningful.
            try {
                if (codebase != null) {
                    codebaseUri = UURIFactory.getInstance(this.base, codebase);
                }
                for (String rawResource : resources) {
                    resource = TextUtils.replaceAll(ESCAPED_AMP, rawResource, AMP);
                    if (codebaseUri != null) {
                        resource = codebaseUri.resolve(resource).toString();
                    }
                    processEmbed(resource, element);
                }
            } catch (IllegalArgumentException e) {
                DevUtils.logger.log(Level.WARNING, "processGeneralTag()\ncodebase=" + codebase + " res=" + resource + "\n" + DevUtils.extraInfo(), e);
            } catch (URIException e) {
                this.extractErrorListener.noteExtractError(e, this.source, codebase);
            }
        }
        return this.next.size() > linksBefore;
    }

    /**
     * Hands script code to {@link RegexpJSLinkExtractor}, passing it {@link #next}.
     *
     * @param code the script text to scan
     */
    protected void processScriptCode(CharSequence code) {
        RegexpJSLinkExtractor.extract(code, this.source, this.base, this.next, this.extractErrorListener);
    }

    /**
     * Queues a link of type {@code 'L'}, or scans the value as script code when
     * it is a {@code javascript:} pseudo-URI.
     *
     * @param value the raw attribute value
     * @param context the context to record on the link
     */
    protected void processLink(CharSequence value, CharSequence context) {
        String unescaped = TextUtils.replaceAll(ESCAPED_AMP, value, AMP);
        if (TextUtils.matches(JAVASCRIPT, unescaped)) {
            // 11 == "javascript:".length(); skip the scheme and scan the code.
            processScriptCode(value.subSequence(11, value.length()));
        } else {
            addLinkFromString(unescaped, context, 'L');
        }
    }

    /**
     * Resolves {@code uri} against the current base and appends the resulting
     * link to {@link #next}; URI errors are reported to the error listener.
     */
    private void addLinkFromString(String uri, CharSequence context, char type) {
        try {
            this.next.addLast(new Link(this.source, UURIFactory.getInstance(this.base, uri), context, type));
        } catch (URIException e) {
            this.extractErrorListener.noteExtractError(e, this.source, uri);
        }
    }

    /**
     * Queues a link of type {@code 'E'} after replacing {@code &amp;} with {@code &}.
     *
     * @param value the raw URI text
     * @param context the context to record on the link
     * @return always {@code 1}
     */
    protected long processEmbed(CharSequence value, CharSequence context) {
        addLinkFromString(TextUtils.replaceAll(ESCAPED_AMP, value, AMP), context, 'E');
        return 1L;
    }

    /**
     * Processes a script block: the open tag's attributes are handled like any
     * other tag, then the remainder is scanned as script code.
     *
     * @param sequence the block starting at the element name "script"
     * @param endOfOpenTag offset within {@code sequence} where the open tag ends
     */
    protected void processScript(CharSequence sequence, int endOfOpenTag) {
        // 6 == "script".length()
        processGeneralTag(sequence.subSequence(0, 6), sequence.subSequence(0, endOfOpenTag));
        processScriptCode(sequence.subSequence(endOfOpenTag, sequence.length()));
    }

    /**
     * Processes a meta tag.
     *
     * <p>A robots meta whose content contains "nofollow" or "none" cancels
     * further extraction when robots are honoured. A refresh meta queues a link
     * of type {@code 'R'} built from the text after the first {@code '='} in its
     * content; a refresh without {@code '='} carries no URI and is ignored.
     *
     * @param metaTag the meta tag text (element name plus attributes)
     */
    protected void processMeta(CharSequence metaTag) {
        Matcher attributes = TextUtils.getMatcher(EACH_ATTRIBUTE_EXTRACTOR, metaTag);
        String name = null;
        String httpEquiv = null;
        String content = null;
        while (attributes.find()) {
            CharSequence value = attributeValue(attributes, metaTag);
            String attributeName = attributes.group(1);
            if ("name".equalsIgnoreCase(attributeName)) {
                name = value.toString();
            } else if ("http-equiv".equalsIgnoreCase(attributeName)) {
                httpEquiv = value.toString();
            } else if ("content".equalsIgnoreCase(attributeName)) {
                content = value.toString();
            }
        }
        TextUtils.recycleMatcher(attributes);

        if (content == null) {
            return;
        }
        if ("robots".equalsIgnoreCase(name)) {
            if (getHonorRobots()) {
                String directives = content.toLowerCase();
                if (directives.indexOf("nofollow") >= 0 || directives.indexOf("none") >= 0) {
                    logger.fine("HTML extraction skipped due to robots meta-tag for: " + this.source);
                    cancelFurtherExtraction();
                }
            }
        } else if ("refresh".equalsIgnoreCase(httpEquiv)) {
            int uriStart = content.indexOf("=") + 1;
            if (uriStart <= 0) {
                // No '=' means the content is only a delay (e.g. "5"), not a URI.
                return;
            }
            String refreshUri = content.substring(uriStart);
            try {
                this.next.addLast(new Link(this.source, UURIFactory.getInstance(this.base, refreshUri), Link.elementContext("meta", httpEquiv), 'R'));
            } catch (URIException e) {
                this.extractErrorListener.noteExtractError(e, this.source, refreshUri);
            }
        }
    }

    private boolean getHonorRobots() {
        return this.honorRobots;
    }

    /** Stops extraction by pointing the tag matcher at empty input, so no further tag matches. */
    private void cancelFurtherExtraction() {
        this.tags.reset("");
    }

    /**
     * Processes a style block: the open tag's attributes are handled like any
     * other tag, then the remainder is scanned by {@link RegexpCSSLinkExtractor}.
     *
     * @param sequence the block starting at the element name "style"
     * @param endOfOpenTag offset within {@code sequence} where the open tag ends
     */
    protected void processStyle(CharSequence sequence, int endOfOpenTag) {
        // 5 == "style".length(); using 6 would include the character after the name.
        processGeneralTag(sequence.subSequence(0, 5), sequence.subSequence(0, endOfOpenTag));
        RegexpCSSLinkExtractor.extract(sequence.subSequence(endOfOpenTag, sequence.length()), this.source, this.base, this.next, this.extractErrorListener);
    }

    /**
     * Resets the superclass state and releases the tag matcher, if one was created.
     */
    @Override // org.archive.extractor.CharSequenceLinkExtractor, org.archive.extractor.LinkExtractor
    public void reset() {
        super.reset();
        // tags is only created on the first findNextLink() call.
        if (this.tags != null) {
            TextUtils.recycleMatcher(this.tags);
            this.tags = null;
        }
    }

    /**
     * @return a new {@link RegexpHTMLLinkExtractor}
     */
    protected static CharSequenceLinkExtractor newDefaultInstance() {
        return new RegexpHTMLLinkExtractor();
    }
}
