/*
 * Decompiled with CFR 0.152.
 */
package ai.platon.pulsar.boilerpipe.filters.heuristics;

import ai.platon.pulsar.boilerpipe.document.TextBlock;
import ai.platon.pulsar.boilerpipe.document.TextDocument;
import ai.platon.pulsar.boilerpipe.filters.TextBlockFilter;
import ai.platon.pulsar.boilerpipe.utils.ManualRules;
import ai.platon.pulsar.boilerpipe.utils.ProcessingException;
import java.util.HashSet;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;

/**
 * Labels the text block of a document that carries the content title.
 *
 * <p>Two strategies are tried in order by {@link #process(TextDocument)}:
 * <ol>
 *   <li>a manual rule from {@code ManualRules.TITLE_RULES} whose key (a regular
 *       expression) matches the document's base URL and whose value is compared,
 *       ignoring case, with each block's CSS selector;</li>
 *   <li>a comparison of each block's normalised text against a set of candidate
 *       titles derived from the page title passed to the constructor.</li>
 * </ol>
 *
 * <p>Text is normalised the same way on both sides of the comparison: U+00A0 is
 * replaced by a plain space, apostrophes are removed, the result is trimmed and
 * lower-cased with {@link Locale#ROOT} so matching does not depend on the JVM's
 * default locale.
 */
public final class DocumentTitleMatchClassifier
implements TextBlockFilter {
    /** Punctuation runs removed from a block's text before the second, looser title lookup. */
    public static final Pattern PAT_REMOVE_CHARACTERS = Pattern.compile("[\\?\uff1f\\!\uff01\\.\u3002\\-\\:\uff1a]+");
    /** A title fragment must be strictly longer than this to be accepted by {@link #validatePotentialTitle(String)}. */
    public static final int MinTitleSize = 6;
    /** A title fragment must be strictly shorter than this; longer block texts also skip the looser lookup. */
    public static final int MaxTitleSize = 200;
    /** Separator patterns used to split the page title; the longest part of each split becomes a candidate. */
    public static final Pattern[] PotentialTitlePatterns = new Pattern[]{
        Pattern.compile("[ ]*[\\|\u00bb|-][ ]*"),
        Pattern.compile("[ ]*[\\|\u00bb|:][ ]*"),
        Pattern.compile("[ ]*[\\|\u00bb|:\\(\\)][ ]*"),
        Pattern.compile("[ ]*[\\|\u00bb|:\\(\\)\\-][ ]*"),
        Pattern.compile("[ ]*[\\|\u00bb|,|:\\(\\)\\-][ ]*"),
        Pattern.compile("[ ]*[\\|\u00bb|,|:\\(\\)\\-\u00a0][ ]*"),
        Pattern.compile("[ ]*[\u00bb,\uff0c:\uff1a_\uff08\uff09\u3010\u3011\\|\\-\\(\\)][ ]*"),
        Pattern.compile("[ ]*[\u00bb,\uff0c:\uff1a_\uff08\uff09\u3010\u3011\\|\\(\\)][ ]*"),
        Pattern.compile("[ ]*[\u00bb,\uff0c_\uff08\uff09\u3010\u3011\\|\\(\\)][ ]*"),
        Pattern.compile("[ ]*[\u00bb,\uff0c\uff08\uff09\u3010\u3011\\|\\(\\)][ ]*"),
        Pattern.compile("[ ]*[\u00bb\u3010\u3011][ ]*")
    };

    /** Label attached to the block identified as the content title. */
    private static final String CONTENT_TITLE_LABEL = "pulsar.text/CONTENT_TITLE";
    /** Minimum number of words a pipe/dash separated part needs to become a candidate. */
    private static final int MIN_WORDS_PER_PART = 4;
    /**
     * Word separator used for word counting. The first character of the class is a literal
     * backspace character (not a regex word boundary); it is kept as-is so word counts do not change.
     */
    private static final Pattern WORD_SEPARATOR = Pattern.compile("[\b ]+");
    private static final Pattern PIPE_SEPARATOR = Pattern.compile("[ ]+[\\|][ ]+");
    private static final Pattern DASH_SEPARATOR = Pattern.compile("[ ]+[\\-][ ]+");
    private static final Pattern TRAILING_DASH_SEGMENT = Pattern.compile(" - [^\\-]+$");
    private static final Pattern LEADING_DASH_SEGMENT = Pattern.compile("^[^\\-]+ - ");

    /** Candidate titles in normalised form, or {@code null} when no usable page title was supplied. */
    private final Set<String> potentialTitles;

    /**
     * Builds the set of candidate titles from the page title.
     *
     * @param title the page title; {@code null} or a title that is empty after
     *              normalisation leaves the classifier without candidates
     */
    public DocumentTitleMatchClassifier(String title) {
        this.potentialTitles = DocumentTitleMatchClassifier.buildPotentialTitles(title);
    }

    /**
     * Tries the manual rule first and falls back to page-title matching.
     *
     * @param doc the document whose text blocks are inspected
     * @return {@code true} if the rule-based lookup returned a non-null title text,
     *         otherwise the result of {@link #extractContentTitleByPageTitle(TextDocument)}
     */
    @Override
    public boolean process(TextDocument doc) throws ProcessingException {
        if (this.extractContentTitleByRule(doc) != null) {
            return true;
        }
        return this.extractContentTitleByPageTitle(doc);
    }

    /**
     * Looks up a CSS selector for the document's base URL in {@code ManualRules.TITLE_RULES}
     * and marks the first block with that selector as content and as the content title.
     *
     * @param doc the document whose text blocks are inspected
     * @return the text of the marked block, or {@code null} if the base URL is {@code null},
     *         no rule matches the base URL, or no block has the selected CSS selector
     */
    public String extractContentTitleByRule(TextDocument doc) {
        String selector = null;
        String baseUrl = null;
        boolean baseUrlLoaded = false;
        for (Map.Entry<String, String> rule : ManualRules.TITLE_RULES.entrySet()) {
            if (!baseUrlLoaded) {
                // Read the URL once, and only when there is at least one rule to test.
                baseUrl = doc.getBaseUrl();
                baseUrlLoaded = true;
                if (baseUrl == null) {
                    return null;
                }
            }
            if (baseUrl.matches(rule.getKey())) {
                selector = rule.getValue();
                break;
            }
        }
        if (selector == null) {
            return null;
        }
        for (TextBlock tb : doc.getTextBlocks()) {
            // Receiver is the non-null selector so a block without a selector simply does not match.
            if (selector.equalsIgnoreCase(tb.getCssSelector())) {
                String contentTitle = tb.getText();
                tb.setIsContent(true);
                tb.addLabel(CONTENT_TITLE_LABEL);
                return contentTitle;
            }
        }
        return null;
    }

    /**
     * Labels the first block whose normalised text equals one of the candidate titles,
     * either directly or after removing the characters of {@link #PAT_REMOVE_CHARACTERS}.
     *
     * @param doc the document whose text blocks are inspected
     * @return {@code true} if a block was labelled, {@code false} otherwise
     *         (including when the classifier has no candidate titles)
     */
    public boolean extractContentTitleByPageTitle(TextDocument doc) {
        if (this.potentialTitles == null) {
            return false;
        }
        for (TextBlock tb : doc.getTextBlocks()) {
            String raw = tb.getText();
            if (raw == null) {
                continue;
            }
            if (this.matchesPotentialTitle(DocumentTitleMatchClassifier.normalize(raw))) {
                tb.addLabel(CONTENT_TITLE_LABEL);
                return true;
            }
        }
        return false;
    }

    /**
     * Checks whether a title fragment is usable as a candidate.
     *
     * @param title the fragment to check, may be {@code null}
     * @return {@code true} if the fragment is non-null and its length lies strictly
     *         between {@link #MinTitleSize} and {@link #MaxTitleSize}
     */
    public boolean validatePotentialTitle(String title) {
        return DocumentTitleMatchClassifier.isValidPotentialTitle(title);
    }

    /**
     * Returns the candidate titles.
     *
     * @return the internal, mutable candidate set, or {@code null} if the classifier
     *         was created without a usable title
     */
    public Set<String> getPotentialTitles() {
        return this.potentialTitles;
    }

    /** Normalises text for comparison: U+00A0 to space, apostrophes dropped, trimmed, lower-cased locale-independently. */
    private static String normalize(String text) {
        return text.replace('\u00a0', ' ').replace("'", "").trim().toLowerCase(Locale.ROOT);
    }

    /** Length check shared by the constructor path and the public validator. */
    private static boolean isValidPotentialTitle(String title) {
        return title != null && title.length() > MinTitleSize && title.length() < MaxTitleSize;
    }

    /** Derives every candidate title from the raw page title; returns null when there is nothing to match against. */
    private static Set<String> buildPotentialTitles(String title) {
        if (title == null) {
            return null;
        }
        String normalized = DocumentTitleMatchClassifier.normalize(title);
        if (normalized.length() == 0) {
            return null;
        }
        Set<String> candidates = new HashSet<String>();
        candidates.add(normalized);
        for (Pattern separator : PotentialTitlePatterns) {
            String longest = DocumentTitleMatchClassifier.getLongestPart(normalized, separator);
            if (DocumentTitleMatchClassifier.isValidPotentialTitle(longest)) {
                candidates.add(longest);
            }
        }
        DocumentTitleMatchClassifier.addPotentialTitles(candidates, normalized, PIPE_SEPARATOR, MIN_WORDS_PER_PART);
        DocumentTitleMatchClassifier.addPotentialTitles(candidates, normalized, DASH_SEPARATOR, MIN_WORDS_PER_PART);
        // Title with its last, respectively first, " - "-delimited segment removed.
        candidates.add(TRAILING_DASH_SEGMENT.matcher(normalized).replaceFirst(""));
        candidates.add(LEADING_DASH_SEGMENT.matcher(normalized).replaceFirst(""));
        return candidates;
    }

    /** Tests already-normalised block text against the candidates, first as-is and then with punctuation removed. */
    private boolean matchesPotentialTitle(String text) {
        if (this.potentialTitles.contains(text)) {
            return true;
        }
        if (text.length() > MaxTitleSize) {
            return false;
        }
        String stripped = PAT_REMOVE_CHARACTERS.matcher(text).replaceAll("").trim();
        return this.potentialTitles.contains(stripped);
    }

    /** Adds every part of the split title that has at least minWords words and does not contain ".com". */
    private static void addPotentialTitles(Set<String> potentialTitles, String title, Pattern separator, int minWords) {
        String[] parts = separator.split(title);
        if (parts.length == 1) {
            // The separator does not occur; the whole title is already a candidate.
            return;
        }
        for (String part : parts) {
            if (part.contains(".com")) {
                continue;
            }
            if (WORD_SEPARATOR.split(part).length >= minWords) {
                potentialTitles.add(part);
            }
        }
    }

    /**
     * Splits the title on the separator and returns the trimmed part that wins the
     * "more words or more characters than the current best" comparison, skipping parts
     * that contain ".com". Returns null if the separator does not occur or no part qualifies.
     */
    private static String getLongestPart(String title, Pattern separator) {
        String[] parts = separator.split(title);
        if (parts.length == 1) {
            return null;
        }
        int longestNumWords = 0;
        String longestPart = "";
        for (String part : parts) {
            if (part.contains(".com")) {
                continue;
            }
            int numWords = WORD_SEPARATOR.split(part).length;
            if (numWords > longestNumWords || part.length() > longestPart.length()) {
                longestNumWords = numWords;
                longestPart = part;
            }
        }
        if (longestPart.length() == 0) {
            return null;
        }
        return longestPart.trim();
    }
}

