/*
 * Decompiled with CFR 0.152.
 */
package ai.platon.pulsar.parse.tika;

import ai.platon.pulsar.common.ReflectionUtils;
import ai.platon.pulsar.common.config.ImmutableConfig;
import ai.platon.pulsar.crawl.filter.CrawlFilters;
import ai.platon.pulsar.crawl.parse.ParseFilters;
import ai.platon.pulsar.crawl.parse.ParseResult;
import ai.platon.pulsar.crawl.parse.Parser;
import ai.platon.pulsar.crawl.parse.html.HTMLMetaTags;
import ai.platon.pulsar.crawl.parse.html.PrimerParser;
import ai.platon.pulsar.parse.tika.DOMBuilder;
import ai.platon.pulsar.persist.WebPage;
import java.io.ByteArrayInputStream;
import java.io.InputStream;
import java.net.MalformedURLException;
import java.net.URL;
import java.nio.ByteBuffer;
import java.util.LinkedHashSet;
import java.util.Map;
import java.util.Set;
import kotlin.Metadata;
import kotlin.jvm.internal.DefaultConstructorMarker;
import kotlin.jvm.internal.Intrinsics;
import kotlin.text.StringsKt;
import org.apache.html.dom.HTMLDocumentImpl;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.html.HtmlMapper;
import org.jetbrains.annotations.NotNull;
import org.jetbrains.annotations.Nullable;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.Document;
import org.w3c.dom.DocumentFragment;
import org.w3c.dom.Node;
import org.xml.sax.ContentHandler;

@Metadata(mv={1, 5, 1}, k=1, xi=48, d1={"\u0000P\n\u0002\u0018\u0002\n\u0002\u0018\u0002\n\u0000\n\u0002\u0018\u0002\n\u0002\b\u0002\n\u0002\u0018\u0002\n\u0000\n\u0002\u0018\u0002\n\u0002\b\u0002\n\u0002\u0018\u0002\n\u0002\b\u0002\n\u0002\u0010\u000e\n\u0002\b\u0005\n\u0002\u0018\u0002\n\u0002\b\u0003\n\u0002\u0018\u0002\n\u0000\n\u0002\u0018\u0002\n\u0000\n\u0002\u0018\u0002\n\u0000\n\u0002\u0018\u0002\n\u0000\u0018\u00002\u00020\u0001B\u000f\b\u0016\u0012\u0006\u0010\u0002\u001a\u00020\u0003\u00a2\u0006\u0002\u0010\u0004B%\u0012\n\b\u0002\u0010\u0005\u001a\u0004\u0018\u00010\u0006\u0012\n\b\u0002\u0010\u0007\u001a\u0004\u0018\u00010\b\u0012\u0006\u0010\u0002\u001a\u00020\u0003\u00a2\u0006\u0002\u0010\tJ\u0010\u0010\u001b\u001a\u00020\u001c2\u0006\u0010\u001d\u001a\u00020\u001eH\u0016R\u0016\u0010\n\u001a\n \f*\u0004\u0018\u00010\u000b0\u000bX\u0082\u0004\u00a2\u0006\u0002\n\u0000R\u000e\u0010\r\u001a\u00020\u000eX\u0082\u0004\u00a2\u0006\u0002\n\u0000R\u0011\u0010\u0002\u001a\u00020\u0003\u00a2\u0006\b\n\u0000\u001a\u0004\b\u000f\u0010\u0010R\u0013\u0010\u0005\u001a\u0004\u0018\u00010\u0006\u00a2\u0006\b\n\u0000\u001a\u0004\b\u0011\u0010\u0012R\u0010\u0010\u0013\u001a\u0004\u0018\u00010\u0014X\u0082\u000e\u00a2\u0006\u0002\n\u0000R\u0013\u0010\u0007\u001a\u0004\u0018\u00010\b\u00a2\u0006\b\n\u0000\u001a\u0004\b\u0015\u0010\u0016R\u000e\u0010\u0017\u001a\u00020\u0018X\u0082\u0004\u00a2\u0006\u0002\n\u0000R\u0016\u0010\u0019\u001a\n \f*\u0004\u0018\u00010\u001a0\u001aX\u0082\u0004\u00a2\u0006\u0002\n\u0000\u00a8\u0006\u001f"}, d2={"Lai/platon/pulsar/parse/tika/TikaParser;", "Lai/platon/pulsar/crawl/parse/Parser;", "conf", "Lai/platon/pulsar/common/config/ImmutableConfig;", "(Lai/platon/pulsar/common/config/ImmutableConfig;)V", "crawlFilters", "Lai/platon/pulsar/crawl/filter/CrawlFilters;", "parseFilters", "Lai/platon/pulsar/crawl/parse/ParseFilters;", "(Lai/platon/pulsar/crawl/filter/CrawlFilters;Lai/platon/pulsar/crawl/parse/ParseFilters;Lai/platon/pulsar/common/config/ImmutableConfig;)V", "LOG", "Lorg/slf4j/Logger;", "kotlin.jvm.PlatformType", "cachingPolicy", "", "getConf", "()Lai/platon/pulsar/common/config/ImmutableConfig;", "getCrawlFilters", "()Lai/platon/pulsar/crawl/filter/CrawlFilters;", "htmlMapper", "Lorg/apache/tika/parser/html/HtmlMapper;", "getParseFilters", "()Lai/platon/pulsar/crawl/parse/ParseFilters;", "primerParser", "Lai/platon/pulsar/crawl/parse/html/PrimerParser;", "tikaConfig", "Lorg/apache/tika/config/TikaConfig;", "parse", "Lai/platon/pulsar/crawl/parse/ParseResult;", "page", "Lai/platon/pulsar/persist/WebPage;", "pulsar-parse"})
public final class TikaParser
implements Parser {
    @Nullable
    private final CrawlFilters crawlFilters;
    @Nullable
    private final ParseFilters parseFilters;
    @NotNull
    private final ImmutableConfig conf;
    private final Logger LOG;
    @NotNull
    private final PrimerParser primerParser;
    private final TikaConfig tikaConfig;
    @NotNull
    private final String cachingPolicy;
    @Nullable
    private HtmlMapper htmlMapper;

    /*
     * WARNING - void declaration
     */
    public TikaParser(@Nullable CrawlFilters crawlFilters, @Nullable ParseFilters parseFilters, @NotNull ImmutableConfig conf) {
        HtmlMapper htmlMapper;
        Intrinsics.checkNotNullParameter((Object)conf, (String)"conf");
        this.crawlFilters = crawlFilters;
        this.parseFilters = parseFilters;
        this.conf = conf;
        this.LOG = LoggerFactory.getLogger(TikaParser.class);
        this.primerParser = new PrimerParser(this.conf);
        this.tikaConfig = TikaConfig.getDefaultConfig();
        this.cachingPolicy = this.conf.get("parser.caching.forbidden.policy", "content");
        TikaParser tikaParser = this;
        String string = this.conf.get("tika.htmlmapper.classname");
        if (string == null) {
            htmlMapper = null;
        } else {
            void it;
            String string2 = string;
            boolean bl = false;
            boolean bl2 = false;
            String string3 = string2;
            TikaParser tikaParser2 = tikaParser;
            boolean bl3 = false;
            HtmlMapper htmlMapper2 = (HtmlMapper)ReflectionUtils.forNameOrNull((String)it);
            tikaParser = tikaParser2;
            htmlMapper = htmlMapper2;
        }
        tikaParser.htmlMapper = htmlMapper;
    }

    public /* synthetic */ TikaParser(CrawlFilters crawlFilters, ParseFilters parseFilters, ImmutableConfig immutableConfig, int n, DefaultConstructorMarker defaultConstructorMarker) {
        if ((n & 1) != 0) {
            crawlFilters = null;
        }
        if ((n & 2) != 0) {
            parseFilters = null;
        }
        this(crawlFilters, parseFilters, immutableConfig);
    }

    @Nullable
    public final CrawlFilters getCrawlFilters() {
        return this.crawlFilters;
    }

    @Nullable
    public final ParseFilters getParseFilters() {
        return this.parseFilters;
    }

    @NotNull
    public final ImmutableConfig getConf() {
        return this.conf;
    }

    public TikaParser(@NotNull ImmutableConfig conf) {
        Intrinsics.checkNotNullParameter((Object)conf, (String)"conf");
        this(null, null, conf);
    }

    @NotNull
    public ParseResult parse(@NotNull WebPage page) {
        ParseFilters parseFilters;
        URL uRL;
        Object object;
        Intrinsics.checkNotNullParameter((Object)page, (String)"page");
        String baseUrl = page.getLocation();
        try {
            object = new URL(baseUrl);
        }
        catch (MalformedURLException e) {
            return ParseResult.Companion.failed((Throwable)e);
        }
        Object base = object;
        String e = page.getContentType();
        Intrinsics.checkNotNullExpressionValue((Object)e, (String)"page.contentType");
        String mimeType = e;
        org.apache.tika.metadata.Metadata tikamd = new org.apache.tika.metadata.Metadata();
        HTMLDocumentImpl doc = new HTMLDocumentImpl();
        doc.setErrorChecking(false);
        DocumentFragment root = doc.createDocumentFragment();
        DOMBuilder domhandler = new DOMBuilder((Document)doc, root);
        ParseContext context = new ParseContext();
        if (this.htmlMapper != null) {
            context.set(HtmlMapper.class, (Object)this.htmlMapper);
        }
        tikamd.set("Content-Type", mimeType);
        try {
            ByteBuffer raw = page.getContent();
            if (raw != null) {
                this.tikaConfig.getParser().parse((InputStream)new ByteArrayInputStream(raw.array(), raw.arrayOffset() + raw.position(), raw.remaining()), (ContentHandler)domhandler, tikamd, context);
            }
        }
        catch (Exception e2) {
            this.LOG.error("Error parsing " + page.getUrl(), (Throwable)e2);
            return ParseResult.Companion.failed((Throwable)e2);
        }
        String pageTitle = "";
        String pageText = "";
        boolean bl = false;
        Set hypeLinks = new LinkedHashSet();
        Intrinsics.checkNotNullExpressionValue((Object)root, (String)"root");
        HTMLMetaTags metaTags = new HTMLMetaTags((Node)root, (URL)base);
        if (!metaTags.getNoIndex()) {
            pageText = this.primerParser.getPageText((Node)root);
            pageTitle = this.primerParser.getPageTitle((Node)root);
        }
        if (!metaTags.getNoFollow()) {
            URL baseTag = this.primerParser.getBaseURLFromTag((Node)root);
            uRL = baseTag;
            this.primerParser.collectLinks((URL)(uRL == null ? base : uRL), hypeLinks, (Node)root, null);
        }
        page.setPageTitle(pageTitle);
        page.setPageText(pageText);
        uRL = tikamd.names();
        Intrinsics.checkNotNullExpressionValue((Object)uRL, (String)"tikamd.names()");
        for (String name : uRL) {
            if (StringsKt.equals((String)name, (String)TikaCoreProperties.TITLE.toString(), (boolean)true)) continue;
            page.getMetadata().set(name, tikamd.get(name));
        }
        ParseResult parseResult = new ParseResult(1, 0, null, 4, null);
        if (metaTags.getRefresh()) {
            parseResult.setMinorCode(100);
            Map map = parseResult.getArgs();
            Intrinsics.checkNotNullExpressionValue((Object)map, (String)"parseResult.args");
            String string = "refreshHref";
            String string2 = String.valueOf(metaTags.getRefreshHref());
            boolean bl2 = false;
            map.put(string, string2);
            map = parseResult.getArgs();
            Intrinsics.checkNotNullExpressionValue((Object)map, (String)"parseResult.args");
            string = "refreshTime";
            string2 = Integer.toString(metaTags.getRefreshTime());
            bl2 = false;
            map.put(string, string2);
        }
        if ((parseFilters = this.parseFilters) != null) {
            parseFilters.filter(new ai.platon.pulsar.crawl.parse.html.ParseContext(page, parseResult));
        }
        if (metaTags.getNoCache()) {
            page.getMetadata().set("caching.forbidden", this.cachingPolicy);
        }
        return parseResult;
    }
}

