/*
 * Decompiled with CFR 0.152.
 */
package ai.platon.pulsar.parse.html.filters;

import ai.platon.pulsar.boilerpipe.document.TextDocument;
import ai.platon.pulsar.boilerpipe.extractors.ChineseNewsExtractor;
import ai.platon.pulsar.boilerpipe.sax.SAXInput;
import ai.platon.pulsar.boilerpipe.utils.ProcessingException;
import ai.platon.pulsar.common.config.ImmutableConfig;
import ai.platon.pulsar.crawl.parse.AbstractParseFilter;
import ai.platon.pulsar.crawl.parse.FilterResult;
import ai.platon.pulsar.crawl.parse.html.ParseContext;
import ai.platon.pulsar.crawl.parse.html.PrimerParser;
import ai.platon.pulsar.persist.WebPage;
import ai.platon.pulsar.persist.WebPageExt;
import ai.platon.pulsar.persist.metadata.PageCategory;
import ai.platon.pulsar.persist.model.PageModel;
import java.time.Instant;
import java.util.Map;
import kotlin.Metadata;
import kotlin.jvm.internal.Intrinsics;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.jetbrains.annotations.NotNull;
import org.jetbrains.annotations.Nullable;
import org.xml.sax.InputSource;

/**
 * Parse filter that runs the boilerpipe {@link ChineseNewsExtractor} over a page's
 * content and copies the extracted title, text, category, timestamps and fields
 * back onto the {@link WebPage}.
 *
 * <p>NOTE(review): this source was produced by a decompiler from a Kotlin class; the
 * {@code @Metadata} annotation and the {@code Intrinsics} null checks are kept
 * verbatim so the observable behavior stays the same.
 */
@Metadata(mv={1, 5, 1}, k=1, xi=48, d1={"\u0000>\n\u0002\u0018\u0002\n\u0002\u0018\u0002\n\u0000\n\u0002\u0018\u0002\n\u0002\b\u0004\n\u0002\u0018\u0002\n\u0002\b\u0002\n\u0002\u0018\u0002\n\u0000\n\u0002\u0018\u0002\n\u0000\n\u0002\u0018\u0002\n\u0000\n\u0002\u0018\u0002\n\u0000\n\u0002\u0018\u0002\n\u0000\n\u0002\u0010\u000e\n\u0000\u0018\u00002\u00020\u0001B\r\u0012\u0006\u0010\u0002\u001a\u00020\u0003\u00a2\u0006\u0002\u0010\u0004J\u0010\u0010\f\u001a\u00020\r2\u0006\u0010\u000e\u001a\u00020\u000fH\u0014J\u0012\u0010\u0010\u001a\u0004\u0018\u00010\u00112\u0006\u0010\u0012\u001a\u00020\u0013H\u0002J\u0018\u0010\u0010\u001a\u0004\u0018\u00010\u00112\u0006\u0010\u0012\u001a\u00020\u00132\u0006\u0010\u0014\u001a\u00020\u0015R\u0011\u0010\u0002\u001a\u00020\u0003\u00a2\u0006\b\n\u0000\u001a\u0004\b\u0005\u0010\u0006R\u0016\u0010\u0007\u001a\n \t*\u0004\u0018\u00010\b0\bX\u0082\u0004\u00a2\u0006\u0002\n\u0000R\u000e\u0010\n\u001a\u00020\u000bX\u0082\u0004\u00a2\u0006\u0002\n\u0000\u00a8\u0006\u0016"}, d2={"Lai/platon/pulsar/parse/html/filters/BoilerpipeExtractor;", "Lai/platon/pulsar/crawl/parse/AbstractParseFilter;", "conf", "Lai/platon/pulsar/common/config/ImmutableConfig;", "(Lai/platon/pulsar/common/config/ImmutableConfig;)V", "getConf", "()Lai/platon/pulsar/common/config/ImmutableConfig;", "log", "Lorg/apache/commons/logging/Log;", "kotlin.jvm.PlatformType", "primerParser", "Lai/platon/pulsar/crawl/parse/html/PrimerParser;", "doFilter", "Lai/platon/pulsar/crawl/parse/FilterResult;", "parseContext", "Lai/platon/pulsar/crawl/parse/html/ParseContext;", "extract", "Lai/platon/pulsar/boilerpipe/document/TextDocument;", "page", "Lai/platon/pulsar/persist/WebPage;", "encoding", "", "pulsar-parse"})
public final class BoilerpipeExtractor
extends AbstractParseFilter {
    /** Configuration supplied at construction; also handed to the {@link PrimerParser}. */
    @NotNull
    private final ImmutableConfig conf;
    /** Logger named after this class. */
    private final Log log;
    /** Used to detect the page encoding when the page does not carry one yet. */
    @NotNull
    private final PrimerParser primerParser;

    /**
     * Creates the filter and a {@link PrimerParser} built from the same configuration.
     *
     * @param conf configuration for this filter; must not be null
     */
    public BoilerpipeExtractor(@NotNull ImmutableConfig conf) {
        // NOTE(review): the null check ahead of super(...) and the four-argument super
        // call look like the decompiler's rendering of the Kotlin constructor; confirm
        // against the original Kotlin source before changing either statement.
        Intrinsics.checkNotNullParameter((Object)conf, (String)"conf");
        super(0, null, 3, null);
        this.conf = conf;
        this.log = LogFactory.getLog((String)BoilerpipeExtractor.class.getName());
        this.primerParser = new PrimerParser(this.conf);
    }

    /**
     * Returns the configuration this filter was created with.
     *
     * @return the configuration passed to the constructor
     */
    @NotNull
    public final ImmutableConfig getConf() {
        return this.conf;
    }

    /**
     * Runs boilerpipe extraction on the page held by the parse context.
     *
     * <p>The result of the extraction is not inspected here: a success result is
     * returned even when extraction produced no document.
     *
     * @param parseContext the context providing the page to process; must not be null
     * @return a success {@link FilterResult}
     */
    @NotNull
    protected FilterResult doFilter(@NotNull ParseContext parseContext) {
        Intrinsics.checkNotNullParameter((Object)parseContext, (String)"parseContext");
        WebPage page = parseContext.getPage();
        String pageEncoding = page.getEncoding();
        // Fall back to UTF-8 when the page does not report an encoding.
        this.extract(page, pageEncoding == null ? "UTF-8" : pageEncoding);
        return FilterResult.Companion.success$default((FilterResult.Companion)FilterResult.Companion, (int)0, (int)1, null);
    }

    /**
     * Extracts the main content of {@code page} and writes the results onto the page:
     * content title, content text, page category, publish/modified times, and the
     * extracted fields stored in the page model under the name {@code "boilerpipe"}.
     *
     * @param page     the page to extract from and update; must not be null
     * @param encoding must not be null; it is only null-checked and is otherwise unused
     *                 by this method
     * @return the extracted document, or {@code null} when the page has no content or
     *         boilerpipe processing failed
     */
    @Nullable
    public final TextDocument extract(@NotNull WebPage page, @NotNull String encoding) {
        Intrinsics.checkNotNullParameter((Object)page, (String)"page");
        Intrinsics.checkNotNullParameter((Object)encoding, (String)"encoding");
        TextDocument doc = this.extract(page);
        if (doc == null) {
            return null;
        }
        WebPageExt pageExt = new WebPageExt(page);
        page.setContentTitle(doc.getContentTitle());
        page.setContentText(doc.getTextContent());
        page.setPageCategory(PageCategory.parse((String)doc.getPageCategoryAsString()));
        Instant publishTime = doc.getPublishTime();
        Intrinsics.checkNotNullExpressionValue((Object)publishTime, (String)"doc.publishTime");
        pageExt.updateContentPublishTime(publishTime);
        Instant modifiedTime = doc.getModifiedTime();
        Intrinsics.checkNotNullExpressionValue((Object)modifiedTime, (String)"doc.modifiedTime");
        pageExt.updateContentModifiedTime(modifiedTime);
        PageModel pageModel = page.ensurePageModel();
        Map fields = doc.getFields();
        Intrinsics.checkNotNullExpressionValue((Object)fields, (String)"doc.fields");
        // NOTE(review): 1000 and 0 are passed through unchanged; presumably a group id
        // and a parent id for the "boilerpipe" field group — confirm against PageModel.
        pageModel.emplace(1000, 0, "boilerpipe", fields);
        return doc;
    }

    /**
     * Parses the page content with {@link SAXInput} and processes it with a
     * {@link ChineseNewsExtractor}.
     *
     * @param page the page whose content is parsed
     * @return the processed document, or {@code null} when the page has no content or a
     *         {@link ProcessingException} was raised
     */
    private final TextDocument extract(WebPage page) {
        if (page.getContent() == null) {
            this.log.warn((Object)("Can not extract page without content, url : " + page.getUrl()));
            return null;
        }
        try {
            // Detect the encoding only when the page does not already have one.
            if (page.getEncoding() == null) {
                this.primerParser.detectEncoding(page);
            }
            InputSource inputSource = page.getContentAsSaxInputSource();
            Intrinsics.checkNotNullExpressionValue((Object)inputSource, (String)"page.contentAsSaxInputSource");
            TextDocument doc = new SAXInput().parse(page.getLocation(), inputSource);
            ChineseNewsExtractor extractor = new ChineseNewsExtractor();
            extractor.process(doc);
            return doc;
        }
        catch (ProcessingException e) {
            // Hand the exception to the logger so the stack trace and cause are kept;
            // the message alone may be null or too terse to diagnose the failure.
            this.log.warn((Object)("Failed to extract text content by boilerpipe, " + e.getMessage()), (Throwable)e);
            return null;
        }
    }
}

