package ai.platon.pulsar.examples.common;

import ai.platon.pulsar.PulsarSession;
import ai.platon.pulsar.common.NetUtil;
import ai.platon.pulsar.common.config.VolatileConfig;
import ai.platon.pulsar.common.options.LoadOptions;
import ai.platon.pulsar.common.urls.NormUrl;
import ai.platon.pulsar.common.urls.Urls;
import ai.platon.pulsar.context.PulsarContext;
import ai.platon.pulsar.crawl.WebPageBatchHandler;
import ai.platon.pulsar.dom.FeaturedDocument;
import ai.platon.pulsar.persist.WebPage;
import java.net.MalformedURLException;
import java.net.URL;
import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Set;
import kotlin.Metadata;
import kotlin.Pair;
import kotlin.TuplesKt;
import kotlin.Unit;
import kotlin.collections.CollectionsKt;
import kotlin.jvm.functions.Function1;
import kotlin.jvm.internal.DefaultConstructorMarker;
import kotlin.jvm.internal.Intrinsics;
import kotlin.sequences.Sequence;
import kotlin.sequences.SequencesKt;
import kotlin.text.StringsKt;
import org.jetbrains.annotations.NotNull;
import org.jsoup.nodes.Element;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/* compiled from: Crawler.kt */
@Metadata(mv = {1, 5, 1}, k = 1, xi = 48, d1 = {"��J\n\u0002\u0018\u0002\n\u0002\u0010��\n��\n\u0002\u0018\u0002\n��\n\u0002\u0018\u0002\n\u0002\b\u0005\n\u0002\u0018\u0002\n\u0002\b\u0003\n\u0002\u0018\u0002\n\u0002\b\u0002\n\u0002\u0010\u0002\n��\n\u0002\u0018\u0002\n\u0002\u0018\u0002\n\u0002\u0018\u0002\n��\n\u0002\u0010\u000e\n��\n\u0002\u0018\u0002\n\u0002\b\u0007\b\u0016\u0018��2\u00020\u0001B!\u0012\u0006\u0010\u0002\u001a\u00020\u0003\u0012\b\b\u0002\u0010\u0004\u001a\u00020\u0005\u0012\b\b\u0002\u0010\u0006\u001a\u00020\u0005¢\u0006\u0002\u0010\u0007J\u0006\u0010\u0011\u001a\u00020\u0012J\"\u0010\u0013\u001a\u000e\u0012\u0004\u0012\u00020\u0015\u0012\u0004\u0012\u00020\u00160\u00142\u0006\u0010\u0017\u001a\u00020\u00182\u0006\u0010\u0019\u001a\u00020\u001aJ\"\u0010\u0013\u001a\u000e\u0012\u0004\u0012\u00020\u0015\u0012\u0004\u0012\u00020\u00160\u00142\u0006\u0010\u0017\u001a\u00020\u00182\u0006\u0010\u001b\u001a\u00020\u0018J\u0016\u0010\u001c\u001a\u00020\u00122\u0006\u0010\u001d\u001a\u00020\u00182\u0006\u0010\u0019\u001a\u00020\u001aJ\u0016\u0010\u001e\u001a\u00020\u00122\u0006\u0010\u001d\u001a\u00020\u00182\u0006\u0010\u0019\u001a\u00020\u001aJ\u0016\u0010\u001e\u001a\u00020\u00122\u0006\u0010\u001d\u001a\u00020\u00182\u0006\u0010\u001b\u001a\u00020\u0018J\u000e\u0010\u001f\u001a\u00020\u00122\u0006\u0010 \u001a\u00020\u0018R\u000e\u0010\u0006\u001a\u00020\u0005X\u0082\u000e¢\u0006\u0002\n��R\u000e\u0010\u0004\u001a\u00020\u0005X\u0082\u000e¢\u0006\u0002\n��R\u0011\u0010\u0002\u001a\u00020\u0003¢\u0006\b\n��\u001a\u0004\b\b\u0010\tR\u0011\u0010\n\u001a\u00020\u000b¢\u0006\b\n��\u001a\u0004\b\f\u0010\rR\u0016\u0010\u000e\u001a\n \u0010*\u0004\u0018\u00010\u000f0\u000fX\u0082\u0004¢\u0006\u0002\n��¨\u0006!"}, d2 = {"Lai/platon/pulsar/examples/common/Crawler;", "", "context", "Lai/platon/pulsar/context/PulsarContext;", "beforeBatchHandler", "Lai/platon/pulsar/crawl/WebPageBatchHandler;", "afterBatchHandler", 
"(Lai/platon/pulsar/context/PulsarContext;Lai/platon/pulsar/crawl/WebPageBatchHandler;Lai/platon/pulsar/crawl/WebPageBatchHandler;)V", "getContext", "()Lai/platon/pulsar/context/PulsarContext;", "i", "Lai/platon/pulsar/PulsarSession;", "getI", "()Lai/platon/pulsar/PulsarSession;", "logger", "Lorg/slf4j/Logger;", "kotlin.jvm.PlatformType", "extractAds", "", "load", "Lkotlin/Pair;", "Lai/platon/pulsar/persist/WebPage;", "Lai/platon/pulsar/dom/FeaturedDocument;", "url", "", "options", "Lai/platon/pulsar/common/options/LoadOptions;", "args", "loadAllNews", "portalUrl", "loadOutPages", "scan", "baseUri", "pulsar-examples"})
/* loaded from: input_file:ai/platon/pulsar/examples/common/Crawler.class */
public class Crawler {

    @NotNull
    private final PulsarContext context;

    @NotNull
    private WebPageBatchHandler beforeBatchHandler;

    @NotNull
    private WebPageBatchHandler afterBatchHandler;
    private final Logger logger;

    @NotNull
    private final PulsarSession i;

    public Crawler(@NotNull PulsarContext pulsarContext, @NotNull WebPageBatchHandler webPageBatchHandler, @NotNull WebPageBatchHandler webPageBatchHandler2) {
        Intrinsics.checkNotNullParameter(pulsarContext, "context");
        Intrinsics.checkNotNullParameter(webPageBatchHandler, "beforeBatchHandler");
        Intrinsics.checkNotNullParameter(webPageBatchHandler2, "afterBatchHandler");
        this.context = pulsarContext;
        this.beforeBatchHandler = webPageBatchHandler;
        this.afterBatchHandler = webPageBatchHandler2;
        this.logger = LoggerFactory.getLogger(Crawler.class);
        this.i = this.context.createSession();
    }

    public /* synthetic */ Crawler(PulsarContext pulsarContext, WebPageBatchHandler webPageBatchHandler, WebPageBatchHandler webPageBatchHandler2, int i, DefaultConstructorMarker defaultConstructorMarker) {
        this(pulsarContext, (i & 2) != 0 ? new BeforeWebPageBatchHandler() : webPageBatchHandler, (i & 4) != 0 ? new AfterWebPageBatchHandler() : webPageBatchHandler2);
    }

    @NotNull
    public final PulsarContext getContext() {
        return this.context;
    }

    @NotNull
    public final PulsarSession getI() {
        return this.i;
    }

    @NotNull
    public final Pair<WebPage, FeaturedDocument> load(@NotNull String str, @NotNull String str2) {
        Intrinsics.checkNotNullParameter(str, "url");
        Intrinsics.checkNotNullParameter(str2, "args");
        return load(str, this.i.options(str2));
    }

    @NotNull
    public final Pair<WebPage, FeaturedDocument> load(@NotNull String str, @NotNull LoadOptions loadOptions) {
        Intrinsics.checkNotNullParameter(str, "url");
        Intrinsics.checkNotNullParameter(loadOptions, "options");
        WebPage load = this.i.load(str, loadOptions);
        FeaturedDocument parse$default = PulsarSession.DefaultImpls.parse$default(this.i, load, false, 2, (Object) null);
        parse$default.absoluteLinks();
        parse$default.stripScripts();
        if (!StringsKt.isBlank(loadOptions.getCorrectedOutLinkSelector())) {
            Sequence<String> filter = SequencesKt.filter(CollectionsKt.asSequence(FeaturedDocument.select$default(parse$default, loadOptions.getCorrectedOutLinkSelector(), 0, 0, new Function1<Element, String>() { // from class: ai.platon.pulsar.examples.common.Crawler$load$1
                public final String invoke(@NotNull Element element) {
                    Intrinsics.checkNotNullParameter(element, "it");
                    return element.attr("abs:href");
                }
            }, 6, (Object) null)), new Function1<String, Boolean>() { // from class: ai.platon.pulsar.examples.common.Crawler$load$2
                @NotNull
                public final Boolean invoke(String str2) {
                    return Boolean.valueOf(Urls.isValidUrl(str2));
                }
            });
            HashSet hashSet = new HashSet();
            for (String str2 : filter) {
                Intrinsics.checkNotNullExpressionValue(str2, "it");
                hashSet.add(StringsKt.substringBefore$default(str2, ".com", (String) null, 2, (Object) null));
            }
            Sequence filter2 = SequencesKt.filter(CollectionsKt.asSequence(hashSet), new Function1<String, Boolean>() { // from class: ai.platon.pulsar.examples.common.Crawler$load$4
                @NotNull
                public final Boolean invoke(@NotNull String str3) {
                    Intrinsics.checkNotNullParameter(str3, "it");
                    return Boolean.valueOf(!StringsKt.isBlank(str3));
                }
            });
            HashSet hashSet2 = new HashSet();
            Iterator it = filter2.iterator();
            while (it.hasNext()) {
                hashSet2.add(((String) it.next()) + ".com");
            }
            HashSet hashSet3 = hashSet2;
            ArrayList arrayList = new ArrayList();
            for (Object obj : hashSet3) {
                if (NetUtil.testHttpNetwork(new URL((String) obj))) {
                    arrayList.add(obj);
                }
            }
            System.out.println((Object) CollectionsKt.joinToString$default(CollectionsKt.take(arrayList, 10), "\n", (CharSequence) null, (CharSequence) null, 0, (CharSequence) null, new Function1<String, CharSequence>() { // from class: ai.platon.pulsar.examples.common.Crawler$load$7
                @NotNull
                public final CharSequence invoke(@NotNull String str3) {
                    Intrinsics.checkNotNullParameter(str3, "it");
                    return str3;
                }
            }, 30, (Object) null));
        }
        this.logger.info("Export to: file://{}", PulsarSession.DefaultImpls.export$default(this.i, parse$default, (String) null, 2, (Object) null));
        return TuplesKt.to(load, parse$default);
    }

    public final void loadOutPages(@NotNull String str, @NotNull String str2) {
        Intrinsics.checkNotNullParameter(str, "portalUrl");
        Intrinsics.checkNotNullParameter(str2, "args");
        loadOutPages(str, this.i.options(str2));
    }

    public final void loadOutPages(@NotNull String str, @NotNull LoadOptions loadOptions) {
        Intrinsics.checkNotNullParameter(str, "portalUrl");
        Intrinsics.checkNotNullParameter(loadOptions, "options");
        FeaturedDocument parse$default = PulsarSession.DefaultImpls.parse$default(this.i, this.i.load(str, loadOptions), false, 2, (Object) null);
        parse$default.absoluteLinks();
        parse$default.stripScripts();
        this.logger.info("Portal page is exported to: file://" + PulsarSession.DefaultImpls.export$default(this.i, parse$default, (String) null, 2, (Object) null));
        List select$default = FeaturedDocument.select$default(parse$default, loadOptions.getCorrectedOutLinkSelector(), 0, 0, new Function1<Element, String>() { // from class: ai.platon.pulsar.examples.common.Crawler$loadOutPages$links$1
            public final String invoke(@NotNull Element element) {
                Intrinsics.checkNotNullParameter(element, "it");
                return element.attr("abs:href");
            }
        }, 6, (Object) null);
        LinkedHashSet linkedHashSet = new LinkedHashSet();
        Iterator it = select$default.iterator();
        while (it.hasNext()) {
            NormUrl normalizeOrNull$default = PulsarSession.DefaultImpls.normalizeOrNull$default(getI(), (String) it.next(), (LoadOptions) null, false, 6, (Object) null);
            String spec = normalizeOrNull$default == null ? null : normalizeOrNull$default.getSpec();
            if (spec != null) {
                linkedHashSet.add(spec);
            }
        }
        List take = CollectionsKt.take(linkedHashSet, loadOptions.getTopLinks());
        this.logger.info("Total " + take.size() + " items to load");
        Iterator it2 = PulsarSession.DefaultImpls.loadAll$default(this.i, take, LoadOptions.createItemOptions$default(loadOptions, (VolatileConfig) null, 1, (Object) null), false, 4, (Object) null).iterator();
        while (it2.hasNext()) {
            System.out.println((Object) ((WebPage) it2.next()).getUrl());
        }
    }

    public final void loadAllNews(@NotNull String str, @NotNull LoadOptions loadOptions) {
        Intrinsics.checkNotNullParameter(str, "portalUrl");
        Intrinsics.checkNotNullParameter(loadOptions, "options");
        Collection simpleLiveLinks = this.i.load(str, loadOptions).getSimpleLiveLinks();
        Intrinsics.checkNotNullExpressionValue(simpleLiveLinks, "portal.simpleLiveLinks");
        Collection collection = simpleLiveLinks;
        ArrayList arrayList = new ArrayList();
        for (Object obj : collection) {
            String str2 = (String) obj;
            Intrinsics.checkNotNullExpressionValue(str2, "it");
            if (StringsKt.contains$default(str2, "jinrong", false, 2, (Object) null)) {
                arrayList.add(obj);
            }
        }
        for (WebPage webPage : PulsarSession.DefaultImpls.loadAll$default(this.i, arrayList, this.i.options("--parse"), false, 4, (Object) null)) {
            System.out.println((Object) (webPage.getUrl() + " " + webPage.getContentTitle()));
        }
    }

    public final void extractAds() {
        Iterable<Element> select$default = FeaturedDocument.select$default(PulsarSession.DefaultImpls.loadDocument$default(this.i, "https://wuhan.baixing.com/xianhualipin/a1100414743.html", (LoadOptions) null, 2, (Object) null), "a[href~=mssp.baidu]", 0, 0, 6, (Object) null);
        ArrayList arrayList = new ArrayList(CollectionsKt.collectionSizeOrDefault(select$default, 10));
        for (Element element : select$default) {
            arrayList.add(Unit.INSTANCE);
        }
    }

    public final void scan(@NotNull String str) {
        Intrinsics.checkNotNullParameter(str, "baseUri");
        this.i.getContext().scan(str).forEachRemaining(Crawler::m8scan$lambda9);
    }

    /* renamed from: scan$lambda-9, reason: not valid java name */
    private static final void m8scan$lambda9(WebPage webPage) {
        int length;
        Intrinsics.checkNotNullParameter(webPage, "it");
        ByteBuffer content = webPage.getContent();
        if (content == null) {
            length = 0;
        } else {
            byte[] array = content.array();
            length = array == null ? 0 : array.length;
        }
        System.out.println(length);
    }
}
