package ai.platon.pulsar.boilerpipe.filters.heuristics;

import ai.platon.pulsar.boilerpipe.document.BlockLabels;
import ai.platon.pulsar.boilerpipe.document.TextBlock;
import ai.platon.pulsar.boilerpipe.document.TextDocument;
import ai.platon.pulsar.boilerpipe.filters.TextBlockFilter;
import ai.platon.pulsar.boilerpipe.utils.ManualRules;
import ai.platon.pulsar.boilerpipe.utils.ProcessingException;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;

/* loaded from: input_file:ai/platon/pulsar/boilerpipe/filters/heuristics/DocumentTitleMatchClassifier.class */
/**
 * Labels the text block that carries the document's title with
 * {@link BlockLabels#CONTENT_TITLE}.
 *
 * <p>Two strategies are tried in order by {@link #process(TextDocument)}:
 * <ol>
 *   <li>a manual rule from {@link ManualRules#TITLE_RULES} that maps a base-URL regex to the
 *       CSS selector of the title block;</li>
 *   <li>a comparison of every block's normalized text against a set of candidate titles derived
 *       from the page title passed to the constructor.</li>
 * </ol>
 */
public final class DocumentTitleMatchClassifier implements TextBlockFilter {
    /** A candidate title must be strictly longer than this many characters. */
    public static final int MinTitleSize = 6;
    /** A candidate title must be strictly shorter than this many characters. */
    public static final int MaxTitleSize = 200;
    /** Normalized candidate titles, or {@code null} when the page title was null or blank. */
    private final Set<String> potentialTitles;
    public static final Pattern PAT_REMOVE_CHARACTERS = Pattern.compile("[\\?？\\!！\\.。\\-\\:：]+");
    public static final Pattern[] PotentialTitlePatterns = {Pattern.compile("[ ]*[\\|»|-][ ]*"), Pattern.compile("[ ]*[\\|»|:][ ]*"), Pattern.compile("[ ]*[\\|»|:\\(\\)][ ]*"), Pattern.compile("[ ]*[\\|»|:\\(\\)\\-][ ]*"), Pattern.compile("[ ]*[\\|»|,|:\\(\\)\\-][ ]*"), Pattern.compile("[ ]*[\\|»|,|:\\(\\)\\- ][ ]*"), Pattern.compile("[ ]*[»,，:：_（）【】\\|\\-\\(\\)][ ]*"), Pattern.compile("[ ]*[»,，:：_（）【】\\|\\(\\)][ ]*"), Pattern.compile("[ ]*[»,，_（）【】\\|\\(\\)][ ]*"), Pattern.compile("[ ]*[»,，（）【】\\|\\(\\)][ ]*"), Pattern.compile("[ ]*[»【】][ ]*")};

    /**
     * Separator used to count the words of a title fragment.
     *
     * <p>NOTE: inside a Java string literal {@code \b} is the backspace character (U+0008), not
     * the regex word boundary, so this matches runs of backspace and/or space characters. The
     * expression is kept as-is to preserve the existing word counts.
     */
    private static final Pattern WORD_SEPARATOR = Pattern.compile("[\b ]+");
    /** A pipe surrounded by at least one space on each side. */
    private static final Pattern SPACED_PIPE = Pattern.compile("[ ]+[\\|][ ]+");
    /** A dash surrounded by at least one space on each side. */
    private static final Pattern SPACED_DASH = Pattern.compile("[ ]+[\\-][ ]+");
    /** The last {@code " - "} and the dash-free text after it, up to the end of the title. */
    private static final Pattern TRAILING_DASH_PART = Pattern.compile(" - [^\\-]+$");
    /** The dash-free text at the start of the title, up to and including the first {@code " - "}. */
    private static final Pattern LEADING_DASH_PART = Pattern.compile("^[^\\-]+ - ");

    /** Minimum number of words a fragment split on a spaced pipe/dash needs to become a candidate. */
    private static final int MIN_WORDS_IN_FRAGMENT = 4;

    /**
     * Builds the set of candidate titles from the page title.
     *
     * @param str the page title; when {@code null} or blank after normalization no candidates are
     *            created and {@link #getPotentialTitles()} returns {@code null}
     */
    public DocumentTitleMatchClassifier(String str) {
        if (str == null) {
            this.potentialTitles = null;
            return;
        }
        String title = normalize(str);
        if (title.isEmpty()) {
            this.potentialTitles = null;
            return;
        }

        Set<String> titles = new HashSet<>();
        titles.add(title);

        // Longest fragment for each known separator set ("Site | Article", "Article: Site", ...).
        for (Pattern separator : PotentialTitlePatterns) {
            String longestPart = getLongestPart(title, separator);
            if (validatePotentialTitle(longestPart)) {
                titles.add(longestPart);
            }
        }

        // Every sufficiently long fragment around " | " and " - ".
        addPotentialTitles(titles, title, SPACED_PIPE, MIN_WORDS_IN_FRAGMENT);
        addPotentialTitles(titles, title, SPACED_DASH, MIN_WORDS_IN_FRAGMENT);

        // The title without its last / first " - " delimited part. When nothing matches these add
        // the full title again, which the set ignores.
        titles.add(TRAILING_DASH_PART.matcher(title).replaceFirst(""));
        titles.add(LEADING_DASH_PART.matcher(title).replaceFirst(""));

        this.potentialTitles = titles;
    }

    /**
     * Tries the manual rules first and falls back to matching against the page title.
     *
     * @param textDocument the document whose blocks are inspected and labeled
     * @return {@code true} when the rule-based lookup yielded a non-null title text, otherwise the
     *         result of {@link #extractContentTitleByPageTitle(TextDocument)}
     */
    @Override
    public boolean process(TextDocument textDocument) throws ProcessingException {
        if (extractContentTitleByRule(textDocument) != null) {
            return true;
        }
        return extractContentTitleByPageTitle(textDocument);
    }

    /**
     * Locates the title block through {@link ManualRules#TITLE_RULES}.
     *
     * <p>The first rule whose key matches the document's base URL supplies a CSS selector. The
     * first block whose CSS selector equals it (ignoring case) is marked as content and labeled
     * {@link BlockLabels#CONTENT_TITLE}.
     *
     * @param textDocument the document to inspect
     * @return the text of the labeled block, or {@code null} when no rule or no block matched
     */
    public String extractContentTitleByRule(TextDocument textDocument) {
        String selector = null;
        for (Map.Entry<String, String> rule : ManualRules.TITLE_RULES.entrySet()) {
            if (textDocument.getBaseUrl().matches(rule.getKey())) {
                selector = rule.getValue();
                break;
            }
        }
        if (selector == null) {
            return null;
        }

        for (TextBlock block : textDocument.getTextBlocks()) {
            // Compare from the known non-null side so a block without a selector is skipped
            // instead of raising a NullPointerException.
            if (selector.equalsIgnoreCase(block.getCssSelector())) {
                String titleText = block.getText();
                block.setIsContent(true);
                block.addLabel(BlockLabels.CONTENT_TITLE);
                return titleText;
            }
        }
        return null;
    }

    /**
     * Labels the first block whose normalized text is one of the candidate titles.
     *
     * <p>A block also matches when its text is at most {@link #MaxTitleSize} characters long and
     * equals a candidate after the characters of {@link #PAT_REMOVE_CHARACTERS} are removed.
     *
     * @param textDocument the document to inspect
     * @return {@code true} when a block was labeled {@link BlockLabels#CONTENT_TITLE}
     */
    public boolean extractContentTitleByPageTitle(TextDocument textDocument) {
        if (this.potentialTitles == null) {
            return false;
        }
        for (TextBlock block : textDocument.getTextBlocks()) {
            String text = normalize(block.getText());
            if (this.potentialTitles.contains(text) || matchesWithoutPunctuation(text)) {
                block.addLabel(BlockLabels.CONTENT_TITLE);
                return true;
            }
        }
        return false;
    }

    /**
     * Checks whether a candidate has an acceptable length.
     *
     * @param str the candidate, may be {@code null}
     * @return {@code true} when {@code str} is non-null and its length lies strictly between
     *         {@link #MinTitleSize} and {@link #MaxTitleSize}
     */
    public boolean validatePotentialTitle(String str) {
        return str != null && str.length() > MinTitleSize && str.length() < MaxTitleSize;
    }

    /**
     * Returns the candidate titles.
     *
     * @return the internal (not copied) set of candidates, or {@code null} when the page title was
     *         null or blank
     */
    public Set<String> getPotentialTitles() {
        return this.potentialTitles;
    }

    /**
     * Normalizes text for comparison: non-breaking spaces become spaces, apostrophes are dropped,
     * surrounding whitespace is trimmed and the result is lower-cased independently of the JVM's
     * default locale.
     */
    private static String normalize(String text) {
        return text.replace('\u00A0', ' ').replace("'", "").trim().toLowerCase(Locale.ROOT);
    }

    /** Tests a short enough, already normalized text against the candidates after stripping punctuation. */
    private boolean matchesWithoutPunctuation(String text) {
        if (text.length() > MaxTitleSize) {
            return false;
        }
        String stripped = PAT_REMOVE_CHARACTERS.matcher(text).replaceAll("").trim();
        return this.potentialTitles.contains(stripped);
    }

    /**
     * Adds every fragment of {@code title} that contains no {@code ".com"} and has at least
     * {@code minWords} words. Nothing is added when the separator does not split the title.
     */
    private static void addPotentialTitles(Set<String> titles, String title, Pattern separator, int minWords) {
        String[] parts = separator.split(title);
        if (parts.length == 1) {
            return;
        }
        for (String part : parts) {
            if (!part.contains(".com") && WORD_SEPARATOR.split(part).length >= minWords) {
                titles.add(part);
            }
        }
    }

    /**
     * Picks a dominant fragment of {@code title} when split by {@code separator}.
     *
     * <p>Fragments containing {@code ".com"} are ignored. A fragment replaces the current pick
     * when it has more words than the pick or is longer in characters.
     *
     * @return the trimmed fragment, or {@code null} when the title was not split or no fragment
     *         qualified
     */
    private static String getLongestPart(String title, Pattern separator) {
        String[] parts = separator.split(title);
        if (parts.length == 1) {
            return null;
        }
        int pickedWords = 0;
        String picked = "";
        for (String part : parts) {
            if (part.contains(".com")) {
                continue;
            }
            int words = WORD_SEPARATOR.split(part).length;
            if (words > pickedWords || part.length() > picked.length()) {
                pickedWords = words;
                picked = part;
            }
        }
        if (picked.isEmpty()) {
            return null;
        }
        return picked.trim();
    }
}
