package com.eshore.kg.qa.extract.html;

import com.eshore.framework.OnEnable;
import com.eshore.framework.StandardComponent;
import com.eshore.framework.StandardProperty;
import com.eshore.framework.impl.CacheableNodeWorker;
import com.eshore.kg.qa.extract.Paragraph;
import com.eshore.utils.Accumulation;
import com.eshore.utils.Pair;
import com.eshore.utils.StringUtils;
import com.eshore.writable.WritableDouble;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * Finds the element of an HTML document whose direct children look like rendered Markdown
 * (headings, paragraphs, lists, ...) and copies its content into the document as a tree of
 * {@link Paragraph}s nested by heading level.
 *
 * <p>Every element of the body is scored by {@link #isMarkdownBody(Element)}; the best element
 * scoring above {@code propabilityMinimum} wins. The CSS selector of each winner is recorded in
 * {@code selectors}, and {@link #retry} tries the recorded selectors whose value has reached
 * {@code selectorReliability} before a full search is run.
 */
@StandardComponent("提取类似Markdown内容")
public class MarkdownContentExtraction extends CacheableNodeWorker<JsoupDocument, ExtractionContext> {

    @StandardProperty(name = "selectorReliability", description = "selectorReliability", defaultValue = "1.2")
    private double selectorReliability;

    @StandardProperty(name = "propabilityMinimum", description = "propabilityMinimum", defaultValue = "1.9")
    private float propabilityMinimum;

    @StandardProperty(name = "tags of children", description = "tags of children", defaultValue = "['a','blockquote','br','code','div','em','h1','h2','h3','h4','h5','h6','hr','ol','p','pre','strong','table','ul']")
    private Set<String> childTags;

    @StandardProperty(name = "tags of exclude", description = "tags of exclude", defaultValue = "['li','td','thead','script','style']")
    private Set<String> excludeTags;

    @StandardProperty(name = "tags of exclude whole tree", description = "tags of exclude whole tree", defaultValue = "['header']")
    private Set<String> excludeTreeTags;

    @StandardProperty(name = "childrenLengthPropabilities", description = "propabilities of children length ", defaultValue = "[0, 0, 0.2, 0.5, 0.8, 0.9, 0.9, 0.9, 0.95, 0.95, 0.95, 0.95, 0.95]")
    private float[] childrenLengthPropabilities;

    @StandardProperty(name = "textMinimum", description = "text length minimum", defaultValue = "100")
    private int textMinimum;

    /** Values recorded per CSS selector of previously found content elements. */
    private Accumulation<String> selectors = new Accumulation<>();

    private static final Pattern SPACE = Pattern.compile("\\s+");

    /** Factor applied when an element has neither id nor class, and when it has no heading child. */
    private static final double PENALTY = 0.25d;

    /** Scores below this value after the cheap checks are treated as zero. */
    private static final double NEGLIGIBLE_SCORE = 0.01d;

    /** Offset (approximately e squared) added to the distinct-tag count before taking the logarithm. */
    private static final double TAG_VARIETY_OFFSET = 7.3890560989306495d;

    /** Mutable holder for the best score seen so far and the element that produced it. */
    public static class MaxPropability {
        float propability;
        Element element = null;

        MaxPropability(float f) {
            this.propability = f;
        }
    }

    @OnEnable
    private void onEnable() {
    }

    /**
     * Extracts the Markdown-like content of the document.
     *
     * @param jsoupDocument document to analyse; also receives the extracted paragraphs
     * @param extractionContext context that is only forwarded to {@link #retry}
     * @return {@code true} if a content element was found and parsed, {@code false} otherwise
     */
    public boolean go(JsoupDocument jsoupDocument, ExtractionContext extractionContext) {
        if (retry(jsoupDocument, extractionContext)) {
            return true;
        }
        Element contentRoot = searchMarkdownBody(jsoupDocument.getDocument().body());
        if (contentRoot == null) {
            return false;
        }
        parseContent(contentRoot, jsoupDocument);
        return true;
    }

    /**
     * Tries the recorded selectors whose value is at least {@code selectorReliability} and parses
     * the first element any of them matches.
     *
     * @param jsoupDocument document to query and fill
     * @param extractionContext not used by this implementation
     * @return {@code true} if a recorded selector matched and its element was parsed
     */
    public boolean retry(JsoupDocument jsoupDocument, ExtractionContext extractionContext) {
        for (Map.Entry<?, ?> entry : this.selectors.entrySet()) {
            double reliability = ((WritableDouble) entry.getValue()).get();
            if (reliability < this.selectorReliability) {
                continue;
            }
            Element match = jsoupDocument.getDocument().selectFirst((String) entry.getKey());
            if (match != null) {
                parseContent(match, jsoupDocument);
                return true;
            }
        }
        return false;
    }

    /**
     * Walks the direct children of {@code element} and builds a heading hierarchy under the
     * document: each heading opens a new {@link Paragraph} nested below the closest preceding
     * heading of a lower level, and every other child contributes text to the current paragraph.
     */
    private void parseContent(Element element, JsoupDocument jsoupDocument) {
        // Stack of (paragraph, heading level); the document itself is the root at level 0.
        LinkedList<Pair> open = new LinkedList<>();
        open.add(new Pair(jsoupDocument, 0));
        for (Element child : element.children()) {
            int level = isHeader(child.tagName());
            if (level == 0) {
                appendText((Paragraph) open.getLast().getKey(), child);
                continue;
            }
            // Close every open section at the same or a deeper level. The root (level 0) is
            // never popped because isHeader only reports levels >= 1.
            while (((Integer) open.getLast().getValue()).intValue() >= level) {
                open.pollLast();
            }
            Paragraph section = new Paragraph().setTitle(child.text());
            ((Paragraph) open.getLast().getKey()).addChild(section);
            open.add(new Pair(section, Integer.valueOf(level)));
        }
    }

    /**
     * Appends the trimmed text of {@code element} to {@code paragraph}. {@code pre} and
     * {@code code} elements are skipped, and a {@code ul} is expanded so that each of its
     * children is appended separately.
     */
    private void appendText(Paragraph paragraph, Element element) {
        String tag = element.tagName();
        if (tag.equalsIgnoreCase("pre") || tag.equalsIgnoreCase("code")) {
            return;
        }
        // NOTE(review): only <ul> is expanded item by item; <ol> falls through to the
        // flattened-text branch below - confirm this is intended.
        if (tag.equalsIgnoreCase("ul")) {
            for (Element item : element.children()) {
                appendText(paragraph, item);
            }
            return;
        }
        String text = element.text().trim();
        if (text.length() > 0) {
            paragraph.appendText(text);
        }
    }

    /**
     * Returns the heading level encoded in a tag name ({@code "h2"} gives 2), or 0 when the tag
     * is not an {@code h<number>} tag with a positive number.
     */
    private int isHeader(String str) {
        String tag = str.toLowerCase(Locale.ROOT);
        if (!tag.startsWith("h")) {
            return 0;
        }
        try {
            int level = Integer.parseInt(tag.substring(1));
            // A level <= 0 would make parseContent pop its root entry, so treat it as "no heading".
            return level > 0 ? level : 0;
        } catch (NumberFormatException ignored) {
            // Tags such as "hr" or "header" merely start with 'h'; they are not headings.
            return 0;
        }
    }

    /**
     * Searches the tree below {@code element} for the best-scoring content element and records
     * its CSS selector.
     *
     * @return the best element scoring above {@code propabilityMinimum}, or {@code null} if
     *         there is none (or {@code element} itself is {@code null})
     */
    private Element searchMarkdownBody(Element element) {
        if (element == null) {
            return null;
        }
        MaxPropability best = new MaxPropability(this.propabilityMinimum);
        searchMarkdownBody(element, best);
        if (best.element != null) {
            String cssSelector = best.element.cssSelector();
            // Stop adding to a selector once its value has reached the reliability threshold.
            if (this.selectors.get(cssSelector) < this.selectorReliability) {
                this.selectors.add(cssSelector, 1.0d - (1.0f / best.propability));
            }
        }
        return best.element;
    }

    /**
     * Depth-first search that updates {@code maxPropability} whenever an element scores strictly
     * higher than the current best. Subtrees rooted at a tag in {@code excludeTreeTags} are
     * skipped entirely.
     */
    private void searchMarkdownBody(Element element, MaxPropability maxPropability) {
        if (this.excludeTreeTags.contains(normalizedTag(element))) {
            return;
        }
        float score = isMarkdownBody(element);
        if (score > maxPropability.propability) {
            maxPropability.propability = score;
            maxPropability.element = element;
        }
        for (Element child : element.children()) {
            searchMarkdownBody(child, maxPropability);
        }
    }

    /**
     * Scores how much {@code element} looks like a container of rendered Markdown.
     *
     * <p>The score is 0 when the element's tag is excluded, when any direct child has a tag
     * outside {@code childTags}, or when the whitespace-free text is shorter than
     * {@code textMinimum}. Otherwise it grows with the number of distinct child tags and with
     * the text length, and is reduced when the element has neither id nor class, when it has
     * few children, and when none of its children is a heading.
     */
    private float isMarkdownBody(Element element) {
        if (this.excludeTags.contains(normalizedTag(element))) {
            return 0.0f;
        }
        float score = 1.0f;
        if (StringUtils.isEmpty(element.id()) && StringUtils.isEmpty(element.className())) {
            score = (float) (score * PENALTY);
        }
        Elements children = element.children();
        int childCount = children.size();
        // Child counts beyond the configured table keep the score unchanged.
        if (childCount < this.childrenLengthPropabilities.length) {
            score *= this.childrenLengthPropabilities[childCount];
        }
        if (score < NEGLIGIBLE_SCORE) {
            return 0.0f;
        }

        Set<String> distinctTags = new HashSet<>();
        boolean hasHeading = false;
        for (Element child : children) {
            String tag = normalizedTag(child);
            if (!this.childTags.contains(tag)) {
                return 0.0f;
            }
            // Only real h<number> tags count; a plain startsWith("h") would also accept "hr".
            if (isHeader(tag) > 0) {
                hasHeading = true;
            }
            distinctTags.add(tag);
        }
        if (!hasHeading) {
            score = (float) (score * PENALTY);
        }
        float weighted = (float) (score * (1.0d + Math.log(distinctTags.size() + TAG_VARIETY_OFFSET)));

        int length = SPACE.matcher(element.text()).replaceAll("").length();
        // Clamp to 1 so a non-positive configuration cannot cause a division by zero.
        int minimum = Math.max(this.textMinimum, 1);
        if (length < minimum) {
            return 0.0f;
        }
        // Floating-point division: integer division would truncate the ratio and discard most
        // of the length bonus.
        return (float) (weighted * (1.0d + Math.log((double) length / minimum)));
    }

    /** Returns the element's tag name lower-cased independently of the default locale. */
    private static String normalizedTag(Element element) {
        return element.tagName().toLowerCase(Locale.ROOT);
    }
}
