package org.archive.crawler.extractor;

import java.io.IOException;
import java.util.regex.Matcher;
import javax.management.AttributeNotFoundException;
import org.archive.crawler.datamodel.CoreAttributeConstants;
import org.archive.crawler.datamodel.CrawlURI;
import org.archive.crawler.settings.SimpleType;
import org.archive.io.ReplayInputStream;
import org.archive.util.TextUtils;
import org.archive.util.anvl.ANVLRecord;

/* loaded from: input_file:site-search/heritrix/heritrix-1.12.1.jar:org/archive/crawler/extractor/ExtractorUniversal.class */
public class ExtractorUniversal extends Extractor implements CoreAttributeConstants {
    private static final long serialVersionUID = -7593380118857156939L;

    /** Settings key: how many bytes of the recorded content are scanned for URI strings. */
    private static final String ATTR_MAX_DEPTH_BYTES = "max-depth-bytes";

    /**
     * Default for {@link #ATTR_MAX_DEPTH_BYTES}.
     * NOTE(review): the value is borrowed from ANVLRecord.MAXIMUM_SIZE; the coupling to the ANVL
     * record size looks incidental (possibly a decompiler constant match) - confirm.
     */
    private static final long DEFAULT_MAX_DEPTH_BYTES = ANVLRecord.MAXIMUM_SIZE;

    /** Settings key: longest candidate string that is still considered as a possible URI. */
    private static final String ATTR_MAX_URL_LENGTH = "max-url-length";

    /** Default for {@link #ATTR_MAX_URL_LENGTH}. */
    private static final long DEFAULT_MAX_URL_LENGTH = 2083;
    static final String IP_ADDRESS = "((http://)|(https://))(\\d(\\d)?(\\d)?\\.\\d(\\d)?(\\d)?\\.\\d(\\d)?(\\d)?\\.\\d(\\d)?(\\d)?)";
    public static final String TLDs = "(ac(/.*)?)|(ad(/.*)?)|(ae(/.*)?)|(af(/.*)?)|(ag(/.*)?)|(ai(/.*)?)|(al(/.*)?)|(am(/.*)?)|(an(/.*)?)|(ao(/.*)?)|(aero(/.*)?)|(aq(/.*)?)|(ar(/.*)?)|(as(/.*)?)|(at(/.*)?)|(au(/.*)?)|(aw(/.*)?)|(az(/.*)?)|(ba(/.*)?)|(bb(/.*)?)|(bd(/.*)?)|(be(/.*)?)|(bf(/.*)?)|(bg(/.*)?)|(bh(/.*)?)|(bi(/.*)?)|(biz(/.*)?)|(bj(/.*)?)|(bm(/.*)?)|(bn(/.*)?)|(bo(/.*)?)|(br(/.*)?)|(bs(/.*)?)|(bt(/.*)?)|(bv(/.*)?)|(bw(/.*)?)|(by(/.*)?)|(bz(/.*)?)|(ca(/.*)?)|(cc(/.*)?)|(cd(/.*)?)|(cf(/.*)?)|(cg(/.*)?)|(ch(/.*)?)|(ci(/.*)?)|(ck(/.*)?)|(cl(/.*)?)|(cm(/.*)?)|(cn(/.*)?)|(co(/.*)?)|(com(/.*)?)|(coop(/.*)?)|(cr(/.*)?)|(cs(/.*)?)|(cu(/.*)?)|(cv(/.*)?)|(cx(/.*)?)|(cy(/.*)?)|(cz(/.*)?)|(de(/.*)?)|(dj(/.*)?)|(dk(/.*)?)|(dm(/.*)?)|(do(/.*)?)|(dz(/.*)?)|(ec(/.*)?)|(edu(/.*)?)|(ee(/.*)?)|(eg(/.*)?)|(eh(/.*)?)|(er(/.*)?)|(es(/.*)?)|(et(/.*)?)|(fi(/.*)?)|(fj(/.*)?)|(fk(/.*)?)|(fm(/.*)?)|(fo(/.*)?)|(fr(/.*)?)|(ga(/.*)?)|(gd(/.*)?)|(ge(/.*)?)|(gf(/.*)?)|(gg(/.*)?)|(gh(/.*)?)|(gi(/.*)?)|(gl(/.*)?)|(gm(/.*)?)|(gn(/.*)?)|(gov(/.*)?)|(gp(/.*)?)|(gq(/.*)?)|(gr(/.*)?)|(gs(/.*)?)|(gt(/.*)?)|(gu(/.*)?)|(gw(/.*)?)|(gy(/.*)?)|(hk(/.*)?)|(hm(/.*)?)|(hn(/.*)?)|(hr(/.*)?)|(ht(/.*)?)|(hu(/.*)?)|(id(/.*)?)|(ie(/.*)?)|(il(/.*)?)|(im(/.*)?)|(in(/.*)?)|(info(/.*)?)|(int(/.*)?)|(io(/.*)?)|(iq(/.*)?)|(ir(/.*)?)|(is(/.*)?)|(it(/.*)?)|(je(/.*)?)|(jm(/.*)?)|(jo(/.*)?)|(jp(/.*)?)|(ke(/.*)?)|(kg(/.*)?)|(kh(/.*)?)|(ki(/.*)?)|(km(/.*)?)|(kn(/.*)?)|(kp(/.*)?)|(kr(/.*)?)|(kw(/.*)?)|(ky(/.*)?)|(kz(/.*)?)|(la(/.*)?)|(lb(/.*)?)|(lc(/.*)?)|(li(/.*)?)|(lk(/.*)?)|(lr(/.*)?)|(ls(/.*)?)|(lt(/.*)?)|(lu(/.*)?)|(lv(/.*)?)|(ly(/.*)?)|(ma(/.*)?)|(mc(/.*)?)|(md(/.*)?)|(mg(/.*)?)|(mh(/.*)?)|(mil(/.*)?)|(mk(/.*)?)|(ml(/.*)?)|(mm(/.*)?)|(mn(/.*)?)|(mo(/.*)?)|(mp(/.*)?)|(mq(/.*)?)|(mr(/.*)?)|(ms(/.*)?)|(mt(/.*)?)|(mu(/.*)?)|(museum(/.*)?)|(mv(/.*)?)|(mw(/.*)?)|(mx(/.*)?)|(my(/.*)?)|(mz(/.*)?)|(na(/.*)?)|(name(/.*)?)|(nc(/.*)?)|(ne(/.*)?)|(net(/.*)?)|(nf(/.*)?)|(ng(/.*)?)|(ni(/.*)?)|(nl(/.*)?)|(no(/.*)?)|(np(/.*)?)|(nr(/.
*)?)|(nt(/.*)?)|(nu(/.*)?)|(nz(/.*)?)|(om(/.*)?)|(org(/.*)?)|(pa(/.*)?)|(pe(/.*)?)|(pf(/.*)?)|(pg(/.*)?)|(ph(/.*)?)|(pk(/.*)?)|(pl(/.*)?)|(pm(/.*)?)|(pn(/.*)?)|(pr(/.*)?)|(pro(/.*)?)|(ps(/.*)?)|(pt(/.*)?)|(pw(/.*)?)|(py(/.*)?)|(qa(/.*)?)|(re(/.*)?)|(ro(/.*)?)|(ru(/.*)?)|(rw(/.*)?)|(sa(/.*)?)|(sb(/.*)?)|(sc(/.*)?)|(sd(/.*)?)|(se(/.*)?)|(sg(/.*)?)|(sh(/.*)?)|(si(/.*)?)|(sj(/.*)?)|(sk(/.*)?)|(sl(/.*)?)|(sm(/.*)?)|(sn(/.*)?)|(so(/.*)?)|(sr(/.*)?)|(sv(/.*)?)|(st(/.*)?)|(sy(/.*)?)|(sz(/.*)?)|(tc(/.*)?)|(td(/.*)?)|(tf(/.*)?)|(tg(/.*)?)|(th(/.*)?)|(tj(/.*)?)|(tk(/.*)?)|(tm(/.*)?)|(tn(/.*)?)|(to(/.*)?)|(tp(/.*)?)|(tr(/.*)?)|(tt(/.*)?)|(tv(/.*)?)|(tw(/.*)?)|(tz(/.*)?)|(ua(/.*)?)|(ug(/.*)?)|(uk(/.*)?)|(um(/.*)?)|(us(/.*)?)|(uy(/.*)?)|(uz(/.*)?)|(va(/.*)?)|(vc(/.*)?)|(ve(/.*)?)|(vg(/.*)?)|(vi(/.*)?)|(vn(/.*)?)|(vu(/.*)?)|(wf(/.*)?)|(ws(/.*)?)|(ye(/.*)?)|(yt(/.*)?)|(yu(/.*)?)|(za(/.*)?)|(zm(/.*)?)|(zw(/.*)?)";
    /**
     * Number of CrawlURIs whose content this extractor has scanned; incremented once per
     * processed URI in extract() and printed by report().
     */
    protected long numberOfCURIsHandled;
    /**
     * Number of candidate links this extractor has handed to CrawlURI.createAndAddLink;
     * incremented in extract() and printed by report().
     * NOTE(review): both counters are plain long fields updated without synchronization in this
     * class - confirm whether extract() can run concurrently on one instance.
     */
    protected long numberOfLinksExtracted;

    /**
     * Creates the extractor and registers its two expert settings.
     *
     * <p>Registers {@code max-depth-bytes} (how far into the content to scan) and
     * {@code max-url-length} (longest candidate string to consider), each with its default
     * value and flagged as an expert setting.
     *
     * @param str the name handed to the superclass constructor
     */
    public ExtractorUniversal(String str) {
        super(str, "Link extraction on unknown file types. A best effort extractor that looks at the raw byte code of any file that has not been handled by another extractor and tries to find URIs. Will only match absolute URIs.");
        this.numberOfCURIsHandled = 0L;
        this.numberOfLinksExtracted = 0L;
        // Long.valueOf replaces the deprecated new Long(long) boxing constructor.
        addElementToDefinition(new SimpleType(ATTR_MAX_DEPTH_BYTES, "How deep to look into files for URI strings, in bytes", Long.valueOf(DEFAULT_MAX_DEPTH_BYTES))).setExpertSetting(true);
        addElementToDefinition(new SimpleType(ATTR_MAX_URL_LENGTH, "Max length of URIs in bytes", Long.valueOf(DEFAULT_MAX_URL_LENGTH))).setExpertSetting(true);
    }

    /**
     * Scans the recorded response content byte by byte for runs of URL-legal characters and adds
     * every run that looks like a URL to the CrawlURI as a speculative link.
     *
     * <p>A run is evaluated when it is terminated by a non-URL byte or by the end of the stream,
     * and only if it is longer than three characters and contains a dot. Scanning stops after
     * {@code max-depth-bytes} bytes (a value of zero or less means no limit); a run cut off by
     * that limit is not evaluated because it may be incomplete. A run that grows beyond
     * {@code max-url-length} is discarded, together with the byte that was being read when the
     * overflow was noticed.
     *
     * <p>Settings and I/O failures are logged and end the scan; in every case where the URI was
     * eligible for processing the content stream is closed and
     * {@code linkExtractorFinished()} is called.
     *
     * @param crawlURI the URI whose recorded content is scanned
     */
    @Override // org.archive.crawler.extractor.Extractor
    protected void extract(CrawlURI crawlURI) {
        if (!isHttpTransactionContentToProcess(crawlURI)) {
            return;
        }
        this.numberOfCURIsHandled++;
        ReplayInputStream content = null;
        try {
            content = crawlURI.getHttpRecorder().getRecordedInput().getContentReplayInputStream();
            long maxDepthBytes = ((Long) getAttribute(ATTR_MAX_DEPTH_BYTES, crawlURI)).longValue();
            if (maxDepthBytes <= 0) {
                maxDepthBytes = Long.MAX_VALUE;
            }
            long maxUrlLength = ((Long) getAttribute(ATTR_MAX_URL_LENGTH, crawlURI)).longValue();

            StringBuilder candidate = new StringBuilder();
            boolean sawDot = false;
            boolean depthLimitHit = false;
            long bytesRead = 0;
            for (int ch = content.read(); ch != -1; ch = content.read()) {
                bytesRead++;
                if (bytesRead > maxDepthBytes) {
                    depthLimitHit = true;
                    break;
                }
                if (candidate.length() > maxUrlLength) {
                    // Too long to be a URL: drop the run (and the current byte).
                    candidate.setLength(0);
                    sawDot = false;
                } else if (isURLableChar(ch)) {
                    if (ch == '.') {
                        sawDot = true;
                    }
                    candidate.append((char) ch);
                } else if (candidate.length() > 0) {
                    // A non-URL byte terminates the current run.
                    if (candidate.length() > 3 && sawDot) {
                        addLinkIfUrl(crawlURI, candidate.toString());
                    }
                    candidate.setLength(0);
                    sawDot = false;
                }
            }
            // A run that reaches the end of the content has no terminating byte; evaluate it
            // here so a trailing URL is not lost.
            if (!depthLimitHit && sawDot && candidate.length() > 3 && candidate.length() <= maxUrlLength) {
                addLinkIfUrl(crawlURI, candidate.toString());
            }
        } catch (AttributeNotFoundException e) {
            logExtractionFailure(crawlURI, e);
        } catch (IOException e) {
            logExtractionFailure(crawlURI, e);
        } finally {
            closeContentStream(content);
        }
        crawlURI.linkExtractorFinished();
    }

    /**
     * Adds the candidate to the CrawlURI as a speculative link if it looks like a URL, after
     * cutting off anything before the first "http" and any trailing dots.
     *
     * @throws IOException declared so that any checked I/O-type failure raised while creating
     *         the link reaches extract()'s handler, as it did when this code was inline
     */
    private void addLinkIfUrl(CrawlURI crawlURI, String candidate) throws IOException {
        if (!looksLikeAnURL(candidate)) {
            return;
        }
        int schemeStart = candidate.toLowerCase().indexOf("http");
        if (schemeStart > 0) {
            candidate = candidate.substring(schemeStart);
        }
        while (candidate.endsWith(".")) {
            candidate = candidate.substring(0, candidate.length() - 1);
        }
        this.numberOfLinksExtracted++;
        crawlURI.createAndAddLink(candidate, Link.SPECULATIVE_MISC, 'X');
    }

    /** Logs a failure that ended the scan of one URI's content; extraction is best effort. */
    private static void logExtractionFailure(CrawlURI crawlURI, Exception cause) {
        java.util.logging.Logger.getLogger(ExtractorUniversal.class.getName()).log(
                java.util.logging.Level.WARNING, "Universal link extraction failed for " + crawlURI, cause);
    }

    /** Closes the content stream if one was opened; a failure to close is only logged. */
    private static void closeContentStream(java.io.InputStream stream) {
        if (stream == null) {
            return;
        }
        try {
            stream.close();
        } catch (IOException e) {
            java.util.logging.Logger.getLogger(ExtractorUniversal.class.getName()).log(
                    java.util.logging.Level.FINE, "Failed to close content replay stream", e);
        }
    }

    /**
     * Decides whether a run of URL-legal characters is worth treating as a URL.
     *
     * <p>The string is accepted if it is, in full, an http/https URL with a dotted-quad host, or
     * if the text following any of its dots starts with something isTLD accepts (only the first
     * six characters after the dot are examined). A string that begins with a dot is rejected.
     *
     * @param str the candidate string
     * @return true if the candidate looks like a URL
     */
    private boolean looksLikeAnURL(String str) {
        if (str.startsWith("http://") || str.startsWith("https://")) {
            Matcher ipMatcher = TextUtils.getMatcher(IP_ADDRESS, str);
            boolean isIpUrl = ipMatcher.matches();
            TextUtils.recycleMatcher(ipMatcher);
            if (isIpUrl) {
                return true;
            }
        }

        int dotAt = str.indexOf(".");
        if (dotAt == 0) {
            return false;
        }

        String remainder = str;
        while (dotAt != -1 && dotAt < remainder.length()) {
            // Step past the dot and test at most six characters of what follows.
            remainder = remainder.substring(dotAt + 1);
            String head = remainder.substring(0, Math.min(remainder.length(), 6));
            if (isTLD(head)) {
                return true;
            }
            dotAt = remainder.indexOf(".");
        }
        return false;
    }

    /**
     * Tests whether the given text is a recognised top-level domain, optionally followed by a
     * slash and further characters, ignoring letter case.
     *
     * @param str the text that follows a dot in a candidate URL
     * @return true if the lower-cased text fully matches the TLDs pattern; false for text shorter
     *         than two characters
     */
    private boolean isTLD(String str) {
        if (str.length() < 2) {
            return false;
        }
        // The TLDs pattern is all lower case, so the lower-cased copy must be the one matched
        // (Strings are immutable; calling toLowerCase() without using its result does nothing).
        // A fixed locale keeps e.g. "INFO" -> "info" independent of the JVM default locale.
        String lowered = str.toLowerCase(java.util.Locale.ENGLISH);
        Matcher matcher = TextUtils.getMatcher(TLDs, lowered);
        try {
            return matcher.matches();
        } finally {
            TextUtils.recycleMatcher(matcher);
        }
    }

    /**
     * Tells whether a byte value is one of the ASCII characters this extractor accepts inside a
     * URL candidate.
     *
     * @param i the value returned by a stream read
     * @return true for letters, digits and the punctuation {@code # $ % & + , - . / : ; = ? @ _ ~}
     */
    private boolean isURLableChar(int i) {
        if (i >= '#' && i <= '&') {
            return true; // # $ % &
        }
        if (i >= '+' && i <= ';') {
            return true; // + , - . / 0-9 : ;
        }
        if (i == '=') {
            return true;
        }
        if (i >= '?' && i <= 'Z') {
            return true; // ? @ A-Z
        }
        if (i == '_') {
            return true;
        }
        if (i >= 'a' && i <= 'z') {
            return true;
        }
        return i == '~';
    }

    /**
     * Builds this processor's plain-text report: its class name, a one-line description, and
     * the two counters.
     *
     * @return the report text, ending with a blank line
     */
    @Override // org.archive.crawler.framework.Processor
    public String report() {
        String header = "Processor: org.archive.crawler.extractor.ExtractorUniversal\n";
        String function = "  Function:          Link extraction on unknown file types.\n";
        String handled = "  CrawlURIs handled: " + this.numberOfCURIsHandled + "\n";
        String extracted = "  Links extracted:   " + this.numberOfLinksExtracted + "\n\n";
        return header + function + handled + extracted;
    }
}
