/*
 * Decompiled with CFR 0.152.
 */
package ai.platon.pulsar.boilerpipe.extractors;

import ai.platon.pulsar.boilerpipe.document.TextDocument;
import ai.platon.pulsar.boilerpipe.extractors.TextExtractor;
import ai.platon.pulsar.boilerpipe.filters.heuristics.ArticleMetadataFilter;
import ai.platon.pulsar.boilerpipe.filters.heuristics.BlockProximityFusion;
import ai.platon.pulsar.boilerpipe.filters.heuristics.ContentDateStringNumberFilter;
import ai.platon.pulsar.boilerpipe.filters.heuristics.DocumentTitleMatchClassifier;
import ai.platon.pulsar.boilerpipe.filters.heuristics.ExpandTitleToContentFilter;
import ai.platon.pulsar.boilerpipe.filters.heuristics.IgnoreBlocksAfterContentFilter;
import ai.platon.pulsar.boilerpipe.filters.heuristics.IgnoreBlocksAfterContentFromEndFilter;
import ai.platon.pulsar.boilerpipe.filters.heuristics.KeepLargestBlockFilter;
import ai.platon.pulsar.boilerpipe.filters.heuristics.LargeBlockSameTagLevelToContentFilter;
import ai.platon.pulsar.boilerpipe.filters.heuristics.ListAtEndFilter;
import ai.platon.pulsar.boilerpipe.filters.heuristics.TerminatingBlocksFinder;
import ai.platon.pulsar.boilerpipe.filters.heuristics.TrailingHeadlineToBoilerplateFilter;
import ai.platon.pulsar.boilerpipe.filters.simple.BoilerplateBlockFilter;
import ai.platon.pulsar.boilerpipe.filters.simple.LabeledFieldExtractorFilter;
import ai.platon.pulsar.boilerpipe.filters.simple.RegexFieldExtractorFilter;
import ai.platon.pulsar.boilerpipe.filters.statistics.NumWordsRulesClassifier;
import ai.platon.pulsar.boilerpipe.utils.BoiConstants;
import ai.platon.pulsar.boilerpipe.utils.ProcessingException;
import ai.platon.pulsar.common.DateTimes;
import com.google.common.collect.LinkedListMultimap;
import com.google.common.collect.ListMultimap;
import java.time.Instant;
import java.time.ZoneId;
import java.util.LinkedHashSet;
import java.util.Set;

/**
 * A {@link TextExtractor} that runs a fixed sequence of boilerpipe filters over a
 * {@link TextDocument} and then publishes the results as {@code auto_*} fields on it.
 *
 * <p>The rule collections initially reference the shared defaults declared in
 * {@link BoiConstants}. The collection-taking {@code set*} methods are additive (they
 * merge the argument into the current rules rather than replacing them) and work on a
 * private copy, so the shared defaults are never modified through this class.
 *
 * <p>NOTE(review): {@link #INSTANCE} is shared and the setters are not synchronized;
 * confirm that callers configure it before using it from several threads.
 */
public final class ChineseNewsExtractor
implements TextExtractor {
    /** Shared, pre-built instance; also returned by {@link #getInstance()}. */
    public static final ChineseNewsExtractor INSTANCE = new ChineseNewsExtractor();

    /** Zone handed to {@link ArticleMetadataFilter} on every {@link #process} call. */
    private ZoneId zoneId = ZoneId.systemDefault();

    // The four rule fields start as aliases of the BoiConstants defaults. They are only
    // ever replaced by private copies (see the setters), never mutated in place.
    private ListMultimap<String, String> labeledFieldRules = BoiConstants.LABELED_FIELD_RULES;
    private ListMultimap<String, String> regexFieldRules = BoiConstants.REGEX_FIELD_RULES;
    private Set<String> terminatingBlocksContains = BoiConstants.TERMINATING_BLOCKS_CONTAINS;
    private Set<String> terminatingBlocksStartsWith = BoiConstants.TERMINATING_BLOCKS_STARTS_WITH;

    /**
     * Sets the zone passed to {@link ArticleMetadataFilter} during {@link #process}.
     *
     * @param zoneId the zone to use from now on
     */
    public void setZoneId(ZoneId zoneId) {
        this.zoneId = zoneId;
    }

    /**
     * Returns the zone currently passed to {@link ArticleMetadataFilter}.
     *
     * @return the configured zone; {@link ZoneId#systemDefault()} unless changed
     */
    public ZoneId getZoneId() {
        return this.zoneId;
    }

    /**
     * Adds the given rules to the rules passed to {@link LabeledFieldExtractorFilter}.
     * Existing rules are kept; despite the name this does not replace them.
     *
     * @param labeledFieldRules rules to merge into the current ones
     */
    public void setLabeledFieldRules(ListMultimap<String, String> labeledFieldRules) {
        // Copy first: the field may still alias BoiConstants.LABELED_FIELD_RULES, and
        // adding to it in place would change that shared collection for every user.
        ListMultimap<String, String> merged = LinkedListMultimap.create(this.labeledFieldRules);
        merged.putAll(labeledFieldRules);
        this.labeledFieldRules = merged;
    }

    /**
     * Adds the given rules to the rules passed to {@link RegexFieldExtractorFilter}.
     * Existing rules are kept; despite the name this does not replace them.
     *
     * @param regexFieldRules rules to merge into the current ones
     */
    public void setRegexFieldRules(ListMultimap<String, String> regexFieldRules) {
        // Copy first so BoiConstants.REGEX_FIELD_RULES is never modified through this class.
        ListMultimap<String, String> merged = LinkedListMultimap.create(this.regexFieldRules);
        merged.putAll(regexFieldRules);
        this.regexFieldRules = merged;
    }

    /**
     * Adds the given strings to the first set passed to {@link TerminatingBlocksFinder}.
     * Existing entries are kept; despite the name this does not replace them.
     *
     * @param terminatingBlocksContains entries to merge into the current ones
     */
    public void setTerminatingBlocksContains(Set<String> terminatingBlocksContains) {
        // Copy first so BoiConstants.TERMINATING_BLOCKS_CONTAINS is never modified here.
        Set<String> merged = new LinkedHashSet<>(this.terminatingBlocksContains);
        merged.addAll(terminatingBlocksContains);
        this.terminatingBlocksContains = merged;
    }

    /**
     * Adds the given strings to the second set passed to {@link TerminatingBlocksFinder}.
     * Existing entries are kept; despite the name this does not replace them.
     *
     * @param terminatingBlocksStartsWith entries to merge into the current ones
     */
    public void setTerminatingBlocksStartsWith(Set<String> terminatingBlocksStartsWith) {
        // Copy first so BoiConstants.TERMINATING_BLOCKS_STARTS_WITH is never modified here.
        Set<String> merged = new LinkedHashSet<>(this.terminatingBlocksStartsWith);
        merged.addAll(terminatingBlocksStartsWith);
        this.terminatingBlocksStartsWith = merged;
    }

    /**
     * Returns the shared extractor.
     *
     * @return {@link #INSTANCE}
     */
    public static ChineseNewsExtractor getInstance() {
        return INSTANCE;
    }

    /**
     * Runs the filter chain on {@code doc} and then writes the {@code auto_*} fields.
     *
     * @param doc the document to process; it is modified in place
     * @return always {@code true}
     * @throws ProcessingException declared by {@link TextExtractor}; propagated from the
     *     filters if any of them raises it
     */
    @Override
    public boolean process(TextDocument doc) throws ProcessingException {
        runFilters(doc);
        exportFields(doc);
        return true;
    }

    /** Applies every filter to {@code doc}; the order of the calls is significant and fixed. */
    private void runFilters(TextDocument doc) throws ProcessingException {
        new TerminatingBlocksFinder(this.terminatingBlocksContains, this.terminatingBlocksStartsWith).process(doc);
        new DocumentTitleMatchClassifier(doc.getPageTitle()).process(doc);
        NumWordsRulesClassifier.INSTANCE.process(doc);
        IgnoreBlocksAfterContentFilter.DEFAULT_INSTANCE.process(doc);
        IgnoreBlocksAfterContentFromEndFilter.INSTANCE.process(doc);
        TrailingHeadlineToBoilerplateFilter.INSTANCE.process(doc);
        BlockProximityFusion.MAX_DISTANCE_1.process(doc);
        new ArticleMetadataFilter(this.zoneId).process(doc);
        BoilerplateBlockFilter.INSTANCE_KEEP_TITLE.process(doc);
        BlockProximityFusion.MAX_DISTANCE_1_CONTENT_ONLY_SAME_TAGLEVEL.process(doc);
        KeepLargestBlockFilter.INSTANCE_EXPAND_TO_SAME_TAGLEVEL_MIN_WORDS.process(doc);
        ExpandTitleToContentFilter.INSTANCE.process(doc);
        LargeBlockSameTagLevelToContentFilter.INSTANCE.process(doc);
        ListAtEndFilter.INSTANCE.process(doc);
        ContentDateStringNumberFilter.INSTANCE.process(doc);
        // NOTE(review): the meaning of the literal 200 is defined by
        // RegexFieldExtractorFilter and is not visible from this file.
        new RegexFieldExtractorFilter(this.regexFieldRules, 200).process(doc);
        new LabeledFieldExtractorFilter(this.labeledFieldRules).process(doc);
    }

    /** Copies the extraction results held by {@code doc} into its {@code auto_*} fields. */
    private void exportFields(TextDocument doc) throws ProcessingException {
        // The content title comes from whatever the filters stored under
        // "auto_article_title", falling back to the empty string.
        doc.setContentTitle(doc.getFieldOrDefault("auto_article_title", ""));
        doc.setField("auto_text_content_length", String.valueOf(doc.getTextContent().length()));
        doc.setField("auto_html_content_length", String.valueOf(doc.getHtmlContent().length()));
        // The explicit Instant casts are kept exactly as found in the decompiled source.
        doc.setField("auto_publish_time", DateTimes.isoInstantFormat((Instant)doc.getPublishTime()));
        doc.setField("auto_modified_time", DateTimes.isoInstantFormat((Instant)doc.getModifiedTime()));
        doc.setField("auto_page_category", doc.getPageCategoryAsString());
        doc.setField("auto_page_title", doc.getPageTitle());
        doc.setField("auto_article_title", doc.getContentTitle());
        doc.setField("auto_html_content", doc.getHtmlContent());
        doc.setField("auto_text_content", doc.getTextContent());
    }
}

