package xin.manong.weapon.base.html;

import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.PriorityQueue;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.swing.text.html.HTML;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Comment;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.nodes.TextNode;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import xin.manong.weapon.base.util.CommonUtil;

/* loaded from: input_file:xin/manong/weapon/base/html/HTMLExtractor.class */
public class HTMLExtractor {
    private static final Logger logger = LoggerFactory.getLogger(HTMLExtractor.class);
    private static final Pattern DATE_TIME_PATTERN1 = Pattern.compile("([1-2][0-9]{3})[^0-9]{1,5}?([0-1]?[0-9])[^0-9]{1,5}?([0-9]{1,2})[^0-9]{1,5}?([0-2]?[0-9])[^0-9]{1,5}?([0-9]{1,2})[^0-9]{1,5}?([0-9]{1,2})");
    private static final Pattern DATE_TIME_PATTERN2 = Pattern.compile("([1-2][0-9]{3})[^0-9]{1,5}?([0-1]?[0-9])[^0-9]{1,5}?([0-9]{1,2})[^0-9]{1,5}?([0-2]?[0-9])[^0-9]{1,5}?([0-9]{1,2})");
    private static final Set<String> EXCLUDE_NODES = new HashSet<String>() { // from class: xin.manong.weapon.base.html.HTMLExtractor.1
        {
            add("script");
            add("noscript");
            add("style");
            add("iframe");
            add("select");
            add("input");
            add("button");
        }
    };

    /* JADX INFO: Access modifiers changed from: private */
    /* loaded from: input_file:xin/manong/weapon/base/html/HTMLExtractor$NodeStat.class */
    /**
     * Mutable pair of counters. Within this file it is only instantiated by
     * {@code selectMainHTMLNode}, which uses it to tally how many top-scoring
     * candidates belong to the same parent and how much text they carry.
     */
    public static class NodeStat {
        /** Number of candidate nodes counted so far. */
        public int nodeCount;
        /** Text count accumulated for the counted nodes. */
        public int textCount;

        // Private constructor: instances are created only from within HTMLExtractor.
        private NodeStat() {
        }
    }

    /**
     * Parses a page, drops the tags listed in {@code EXCLUDE_NODES}, scores the
     * body subtree and returns the element selected as the main content.
     *
     * @param str  raw page HTML
     * @param str2 base URI passed to the parser; when empty the HTML is parsed without one
     * @return the selected element, or {@code null} when the HTML is empty or the
     *         parsed document has no body
     */
    public static Element extractMainElement(String str, String str2) {
        if (StringUtils.isEmpty(str)) {
            logger.error("page HTML is empty");
            return null;
        }

        Document document;
        if (StringUtils.isEmpty(str2)) {
            document = Jsoup.parse(str);
        } else {
            document = Jsoup.parse(str, str2);
        }

        // Strip non-content tags up front so they never contribute to the scores.
        String excludedSelector = String.join(",", EXCLUDE_NODES);
        document.select(excludedSelector).remove();

        Element bodyElement = document.body();
        if (bodyElement == null) {
            logger.warn("page body is not found");
            return null;
        }

        HTMLNode root = new HTMLNode(bodyElement);
        computeScore(root);
        return selectMainElement(root);
    }

    /**
     * Searches the given element and then its ancestors for a date-time string
     * and converts the first hit into a timestamp.
     *
     * <p>At most six elements are inspected. For each one the outer HTML is
     * matched against {@code DATE_TIME_PATTERN1} (with seconds) and then
     * {@code DATE_TIME_PATTERN2} (without seconds). If neither matches, the search
     * moves to the nearest ancestor that is missing or has a child count other
     * than one; ancestors with exactly one child are skipped. The search stops
     * once a {@code body} element has been reached.
     *
     * @param element element to start from, typically the extracted main element; may be {@code null}
     * @return the value produced by {@code CommonUtil.stringToTime} for the first
     *         match, or {@code null} when nothing matched
     */
    public static Long extractPublishTime(Element element) {
        Element current = element;
        for (int i = 0; i < 6 && current != null; i++) {
            String html = current.outerHtml();

            Matcher withSeconds = DATE_TIME_PATTERN1.matcher(html);
            if (withSeconds.find()) {
                String dateTime = String.format("%s-%s-%s %s:%s:%s",
                        withSeconds.group(1), withSeconds.group(2), withSeconds.group(3),
                        withSeconds.group(4), withSeconds.group(5), withSeconds.group(6));
                return CommonUtil.stringToTime(dateTime, null);
            }

            Matcher withoutSeconds = DATE_TIME_PATTERN2.matcher(html);
            if (withoutSeconds.find()) {
                String dateTime = String.format("%s-%s-%s %s:%s",
                        withoutSeconds.group(1), withoutSeconds.group(2), withoutSeconds.group(3),
                        withoutSeconds.group(4), withoutSeconds.group(5));
                return CommonUtil.stringToTime(dateTime, "yyyy-MM-dd HH:mm");
            }

            // Climb to the next ancestor worth scanning. The previous code compared the
            // jsoup tag with javax.swing's HTML.Tag.BODY (never equal) and kept climbing
            // past the root, dereferencing a null parent.
            while (true) {
                if ("body".equalsIgnoreCase(current.tagName())) {
                    return null;
                }
                current = current.parent();
                if (current == null || current.childNodeSize() != 1) {
                    break;
                }
            }
        }
        return null;
    }

    /**
     * Renders the simplified HTML for a main-content element.
     *
     * <p>Every child node of {@code element} is converted through
     * {@code buildHTMLElements}; the outer HTML of the resulting elements is
     * concatenated, with a newline inserted before an element whenever the output
     * built so far is non-empty.
     *
     * @param element main-content element; may be {@code null}
     * @return the concatenated HTML, or an empty string when {@code element} is {@code null}
     */
    public static String buildMainHTML(Element element) {
        if (element == null) {
            return "";
        }

        List<Element> rebuilt = new ArrayList<>();
        for (Node child : element.childNodes()) {
            rebuilt.addAll(buildHTMLElements(child));
        }

        StringBuilder html = new StringBuilder();
        for (Element piece : rebuilt) {
            if (html.length() > 0) {
                html.append("\n");
            }
            html.append(piece.outerHtml());
        }
        return html.toString();
    }

    /**
     * Converts one DOM node into a list of simplified elements.
     *
     * <ul>
     *   <li>Blank text nodes and nodes that are neither text nor element yield an empty list.</li>
     *   <li>A non-blank text node is cloned into a {@code p} when it is the only
     *       child of a block element, otherwise into a {@code span}.</li>
     *   <li>Elements hidden by {@code isVisible} yield an empty list.</li>
     *   <li>{@code br} yields a fresh {@code br}; {@code img} and {@code video} are
     *       rebuilt by their dedicated helpers and omitted when those return {@code null}.</li>
     *   <li>Any other element is replaced by the converted results of its children;
     *       when the element is a block and at least one result is inline, the
     *       results are wrapped in a single {@code p}.</li>
     * </ul>
     *
     * @param node node to convert
     * @return the converted elements, possibly empty, never {@code null}
     */
    private static List<Element> buildHTMLElements(Node node) {
        List<Element> result = new ArrayList<>();

        if (node instanceof TextNode) {
            if (((TextNode) node).text().trim().isEmpty()) {
                return result;
            }
            // Held as Node and cast after the instanceof test, so this compiles
            // whichever type parent() is declared to return.
            Node parent = node.parent();
            boolean onlyChildOfBlock = parent instanceof Element
                    && ((Element) parent).isBlock()
                    && parent.childNodeSize() == 1;
            Element wrapper = new Element(onlyChildOfBlock ? "p" : "span");
            wrapper.appendChild(node.clone());
            result.add(wrapper);
            return result;
        }

        if (!(node instanceof Element)) {
            return result;
        }

        Element element = (Element) node;
        String tagName = element.tagName();
        if (!isVisible(element)) {
            return result;
        }
        if (tagName.equals("br")) {
            result.add(new Element("br"));
            return result;
        }
        if (tagName.equals("img") || tagName.equals("video")) {
            Element media = tagName.equals("img") ? buildImageElement(element) : buildVideoElement(element);
            if (media != null) {
                result.add(media);
            }
            return result;
        }

        List<Element> converted = new ArrayList<>();
        for (Node child : element.childNodes()) {
            converted.addAll(buildHTMLElements(child));
        }
        if (converted.isEmpty()) {
            return result;
        }

        // Inline containers, and blocks holding only block results, are flattened away.
        if (!element.isBlock() || !containsInline(converted)) {
            result.addAll(converted);
            return result;
        }

        Element paragraph = new Element("p");
        if (converted.size() == 1) {
            Element only = converted.get(0);
            if (only.tagName().equals("p")) {
                // Avoid nesting a paragraph in a paragraph: adopt its children instead.
                paragraph.appendChildren(only.childNodes());
            } else {
                paragraph.appendChild(only);
            }
        } else {
            paragraph.appendChildren(converted);
        }
        result.add(paragraph);
        return result;
    }

    /**
     * Builds a clean {@code img} element from a source image element.
     *
     * <p>The URL is read from {@code abs:src}, falling back to {@code abs:data-src}.
     * A URL starting with {@code //} is prefixed with {@code http:}. Non-empty
     * {@code width} and {@code height} attributes are copied; nothing else is.
     *
     * @param element original image element
     * @return the new element, or {@code null} when no URL could be read
     */
    private static Element buildImageElement(Element element) {
        String source = element.attr("abs:src");
        if (StringUtils.isEmpty(source)) {
            source = element.attr("abs:data-src");
        }
        if (StringUtils.isEmpty(source)) {
            return null;
        }
        if (source.startsWith("//")) {
            source = String.format("http:%s", source);
        }

        Element image = new Element("img");
        image.attr("src", source);
        for (String dimension : new String[] {"width", "height"}) {
            String value = element.attr(dimension);
            if (!StringUtils.isEmpty(value)) {
                image.attr(dimension, value);
            }
        }
        return image;
    }

    /**
     * Builds a clean {@code video} element from a source video element.
     *
     * <p>The URL is read from the element's {@code abs:src}; when that is empty,
     * from the {@code abs:src} of its first {@code source} child. A URL starting
     * with {@code //} is prefixed with {@code http:}. Non-empty {@code width} and
     * {@code height} attributes are copied; nothing else is.
     *
     * @param element original video element
     * @return the new element, or {@code null} when no URL could be read
     */
    private static Element buildVideoElement(Element element) {
        String source = element.attr("abs:src");
        if (StringUtils.isEmpty(source)) {
            Element sourceChild = findFirstChildElement(element, "source");
            if (sourceChild == null) {
                return null;
            }
            source = sourceChild.attr("abs:src");
            if (StringUtils.isEmpty(source)) {
                return null;
            }
        }
        if (source.startsWith("//")) {
            source = String.format("http:%s", source);
        }

        Element video = new Element("video");
        video.attr("src", source);
        for (String dimension : new String[] {"width", "height"}) {
            String value = element.attr(dimension);
            if (!StringUtils.isEmpty(value)) {
                video.attr(dimension, value);
            }
        }
        return video;
    }

    /**
     * Returns the first direct child element whose tag name equals {@code str},
     * ignoring case.
     *
     * @param element parent element; may be {@code null}
     * @param str     tag name to look for
     * @return the first matching child, or {@code null} when {@code element} is
     *         {@code null} or no child matches
     */
    private static Element findFirstChildElement(Element element, String str) {
        if (element == null) {
            return null;
        }
        for (Element child : element.children()) {
            if (child.tagName().equalsIgnoreCase(str)) {
                return child;
            }
        }
        return null;
    }

    /**
     * Tells whether at least one element of the list is not a block element.
     *
     * @param list elements to inspect
     * @return {@code true} if any element reports {@code isBlock() == false};
     *         {@code false} otherwise, including for an empty list
     */
    private static boolean containsInline(List<Element> list) {
        return list.stream().anyMatch(candidate -> !candidate.isBlock());
    }

    /**
     * Recursively fills in the statistics and score of {@code hTMLNode}.
     *
     * <p>For a text node, only the trimmed text length is recorded (and added to
     * the segment list when positive). For an element, every non-comment child is
     * wrapped in a new {@code HTMLNode}, scored, and folded into this node via
     * {@code accumulateChildNode}; the node's own counters, density and score are
     * then derived. Elements hidden according to {@code isVisible} and {@code br}
     * elements are left untouched, as are nodes that are neither text nor element.
     *
     * @param hTMLNode node to score; updated in place
     */
    private static void computeScore(HTMLNode hTMLNode) {
        if (hTMLNode.node instanceof TextNode) {
            // Explicit downcast: the node field is tested against both TextNode and
            // Element, so its declared type cannot expose text() directly.
            int length = ((TextNode) hTMLNode.node).text().trim().length();
            hTMLNode.textCount = length;
            if (length > 0) {
                hTMLNode.segmentTextCounts.add(length);
            }
            return;
        }
        if (!(hTMLNode.node instanceof Element)) {
            return;
        }

        Element element = (Element) hTMLNode.node;
        String tagName = element.tagName();
        if (!isVisible(element) || tagName.equals("br")) {
            return;
        }

        for (Node child : element.childNodes()) {
            if (child instanceof Comment) {
                continue;
            }
            HTMLNode childNode = new HTMLNode(child);
            childNode.parentNode = hTMLNode;
            computeScore(childNode);
            accumulateChildNode(hTMLNode, childNode);
        }

        // Count this element itself on top of what the children contributed.
        hTMLNode.nodeCount++;
        if (tagName.equals("p") || tagName.equals("section")) {
            hTMLNode.paragraphNodeCount++;
        } else if (tagName.equals("a")) {
            // All text below an anchor counts as anchor text.
            hTMLNode.anchorNodeCount++;
            hTMLNode.anchorTextCount = hTMLNode.textCount;
        }

        int plainTextCount = hTMLNode.textCount - hTMLNode.anchorTextCount;
        int plainNodeCount = hTMLNode.nodeCount - hTMLNode.anchorNodeCount;
        hTMLNode.density = (plainNodeCount == 0 || plainTextCount == 0)
                ? 0.0d
                : (plainTextCount * 1.0d) / plainNodeCount;
        // The product can evaluate to NaN (e.g. log(0) * 0); selectMainElement skips NaN scores.
        hTMLNode.score = Math.log(computeVariance(hTMLNode.segmentTextCounts))
                * hTMLNode.sumDensity
                * Math.log((hTMLNode.textCount - hTMLNode.anchorTextCount) + 1)
                * Math.log10(hTMLNode.paragraphNodeCount + 2);
    }

    /**
     * Checks the element's inline {@code style} attribute for {@code display:none}.
     * Whitespace inside the attribute value is ignored; the comparison is
     * case-sensitive. Only the element's own attribute is examined.
     *
     * @param element element to test
     * @return {@code false} when the inline style contains {@code display:none}, {@code true} otherwise
     */
    private static boolean isVisible(Element element) {
        String style = element.attr("style");
        if (style == null) {
            return true;
        }
        String compactStyle = style.replaceAll("\\s", "");
        return !compactStyle.contains("display:none");
    }

    /**
     * Computes the dispersion figure used in the node score.
     *
     * <p>Despite the name, the result is not a plain variance:
     * <ul>
     *   <li>{@code null} or empty list: {@code 0.0}</li>
     *   <li>single value {@code v}: {@code v / 2.0}</li>
     *   <li>otherwise: {@code sqrt(populationVariance + 1)}</li>
     * </ul>
     *
     * @param list text lengths of the segments below a node; may be {@code null}
     * @return the dispersion figure described above
     */
    private static double computeVariance(List<Integer> list) {
        if (list == null || list.isEmpty()) {
            return 0.0d;
        }
        if (list.size() == 1) {
            return (list.get(0).intValue() * 1.0d) / 2.0d;
        }

        // Sum with a single iteration; the previous loop requested a new iterator
        // on every test and read from an undeclared variable.
        double sum = 0.0d;
        for (Integer value : list) {
            sum += value.intValue();
        }
        double mean = sum / list.size();

        double squaredDeviations = 0.0d;
        for (Integer value : list) {
            double deviation = value.intValue() - mean;
            squaredDeviations += deviation * deviation;
        }
        return Math.sqrt((squaredDeviations / list.size()) + 1.0d);
    }

    /**
     * Folds the statistics of an already-scored child into its parent and
     * registers the child in the parent's child list.
     *
     * @param hTMLNode  parent node, updated in place
     * @param hTMLNode2 child node whose figures are added
     */
    private static void accumulateChildNode(HTMLNode hTMLNode, HTMLNode hTMLNode2) {
        // Node counters.
        hTMLNode.nodeCount += hTMLNode2.nodeCount;
        hTMLNode.anchorNodeCount += hTMLNode2.anchorNodeCount;
        hTMLNode.paragraphNodeCount += hTMLNode2.paragraphNodeCount;

        // Text counters.
        hTMLNode.textCount += hTMLNode2.textCount;
        hTMLNode.anchorTextCount += hTMLNode2.anchorTextCount;

        // The parent sums the densities of its direct children.
        hTMLNode.sumDensity += hTMLNode2.density;

        // Collections last: segment lengths bubble up, then the child is attached.
        hTMLNode.segmentTextCounts.addAll(hTMLNode2.segmentTextCounts);
        hTMLNode.childNodes.add(hTMLNode2);
    }

    /**
     * Walks the scored tree breadth-first, keeps the three highest-scoring element
     * nodes, and lets {@code selectMainHTMLNode} choose among them.
     *
     * <p>Nodes whose score is NaN or that do not wrap an {@code Element} are
     * skipped together with their subtrees.
     *
     * @param hTMLNode scored root node
     * @return the element of the chosen node, or the root's element when no
     *         candidate qualified
     */
    private static Element selectMainElement(HTMLNode hTMLNode) {
        LinkedList<HTMLNode> pending = new LinkedList<>();
        // Min-heap on score, so the weakest of the current top three sits at the head.
        // Plain </> comparison is kept on purpose; Double.compare would order -0.0 below 0.0.
        PriorityQueue<HTMLNode> topCandidates = new PriorityQueue<>(3, (HTMLNode left, HTMLNode right) -> {
            if (left.score > right.score) {
                return 1;
            }
            return left.score < right.score ? -1 : 0;
        });

        pending.add(hTMLNode);
        while (!pending.isEmpty()) {
            HTMLNode current = pending.removeFirst();
            if (Double.isNaN(current.score) || !(current.node instanceof Element)) {
                continue;
            }
            if (topCandidates.size() < 3) {
                topCandidates.offer(current);
            } else if (topCandidates.peek().score < current.score) {
                topCandidates.poll();
                topCandidates.offer(current);
            }
            if (current.childNodes != null) {
                pending.addAll(current.childNodes);
            }
        }

        List<HTMLNode> ranked = new ArrayList<>(topCandidates);
        // Highest score first.
        ranked.sort((HTMLNode left, HTMLNode right) -> {
            if (left.score > right.score) {
                return -1;
            }
            return left.score < right.score ? 1 : 0;
        });

        HTMLNode chosen = ranked.isEmpty() ? hTMLNode : selectMainHTMLNode(ranked);
        // The candidates were filtered to Element nodes, the ancestor selectMainHTMLNode
        // may substitute has child nodes, and extractMainElement builds the root from the body element.
        return (Element) chosen.node;
    }

    /**
     * Decides between the best-scoring candidate and the node returned for it by
     * {@code findParentHTMLNode}.
     *
     * <p>The remaining candidates are examined in order. If one of them is that
     * parent itself, it is counted with the parent's full text count and the scan
     * stops. A candidate that resolves to the same parent is counted when it has
     * at least 300 characters of text or at least 40% of the best candidate's
     * text. The parent wins when more than one node was counted and the counted
     * text is not below 80% of the parent's text; otherwise the best candidate wins.
     *
     * @param list candidates ordered by descending score; must not be empty
     * @return the best candidate or its parent node
     */
    private static HTMLNode selectMainHTMLNode(List<HTMLNode> list) {
        HTMLNode best = list.get(0);
        HTMLNode sharedParent = findParentHTMLNode(best);
        if (sharedParent == null) {
            return best;
        }

        NodeStat stat = new NodeStat();
        stat.nodeCount = 1;
        stat.textCount = best.textCount;

        for (int index = 1; index < list.size(); index++) {
            HTMLNode candidate = list.get(index);
            if (candidate == sharedParent) {
                stat.nodeCount++;
                stat.textCount = sharedParent.textCount;
                break;
            }
            if (findParentHTMLNode(candidate) != sharedParent) {
                continue;
            }
            boolean substantial = candidate.textCount >= 300
                    || (candidate.textCount * 1.0d) / best.textCount >= 0.4d;
            if (substantial) {
                stat.nodeCount++;
                stat.textCount += candidate.textCount;
            }
        }

        if (stat.nodeCount <= 1) {
            return best;
        }
        double coverage = (stat.textCount * 1.0d) / sharedParent.textCount;
        return coverage < 0.8d ? best : sharedParent;
    }

    /**
     * Finds the nearest meaningful ancestor of a node.
     *
     * <p>Starting at the direct parent, ancestors whose DOM node has exactly one
     * child are skipped as long as they have a parent of their own. The first
     * ancestor that has a different child count, or that has no parent, is returned.
     *
     * @param hTMLNode node whose ancestor is wanted; may be {@code null}
     * @return the ancestor described above, or {@code null} when {@code hTMLNode}
     *         is {@code null} or has no parent
     */
    private static HTMLNode findParentHTMLNode(HTMLNode hTMLNode) {
        if (hTMLNode == null) {
            return null;
        }
        HTMLNode ancestor = hTMLNode.parentNode;
        // The previous loop returned on its first pass, so the climb past
        // single-child wrappers never took effect.
        while (ancestor != null
                && ancestor.node.childNodeSize() == 1
                && ancestor.parentNode != null) {
            ancestor = ancestor.parentNode;
        }
        return ancestor;
    }
}
