package com.qwazr.extractor.parser;

import com.fasterxml.jackson.annotation.JsonIgnore;
import com.qwazr.extractor.ParserAbstract;
import com.qwazr.extractor.ParserDocument;
import com.qwazr.extractor.ParserField;
import com.qwazr.utils.DomUtils;
import com.qwazr.utils.HtmlUtils;
import com.qwazr.utils.StringUtils;
import com.qwazr.utils.XPathParser;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import javax.xml.xpath.XPathExpressionException;
import org.apache.xerces.parsers.DOMParser;
import org.cyberneko.html.HTMLConfiguration;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.InputSource;
import se.fishtank.css.selectors.Selectors;
import se.fishtank.css.selectors.dom.W3CNode;

/* loaded from: input_file:com/qwazr/extractor/parser/Html.class */
/**
 * HTML parser.
 *
 * <p>The input stream is parsed into a W3C DOM using NekoHTML's {@link HTMLConfiguration} driven by a
 * Xerces {@link DOMParser}. If at least one XPath or CSS selector parameter yields an entry, only the
 * {@code selectors} field is populated; otherwise the title, headers, anchors, images, text content
 * (with language detection) and meta tags are extracted.
 */
public class Html extends ParserAbstract {
    public static final String[] DEFAULT_MIMETYPES = {"text/html"};
    public static final String[] DEFAULT_EXTENSIONS = {"htm", "html"};
    protected static final ParserField TITLE = ParserField.newString("title", "The title of the document");
    protected static final ParserField CONTENT = ParserField.newString("content", "The text content of the document. One item per paragraph");
    protected static final ParserField H1 = ParserField.newString("h1", "H1 header contents");
    protected static final ParserField H2 = ParserField.newString("h2", "H2 header contents");
    protected static final ParserField H3 = ParserField.newString("h3", "H3 header contents");
    protected static final ParserField H4 = ParserField.newString("h4", "H4 header contents");
    protected static final ParserField H5 = ParserField.newString("h5", "H5 header contents");
    protected static final ParserField H6 = ParserField.newString("h6", "H6 header contents");
    protected static final ParserField ANCHORS = ParserField.newString("anchors", "Anchors");
    protected static final ParserField IMAGES = ParserField.newMap("images", "Image tags");
    protected static final ParserField METAS = ParserField.newMap("metas", "Meta tags");
    protected static final ParserField SELECTORS = ParserField.newMap("selectors", "Selector results");
    protected static final ParserField LANG_DETECTION = ParserField.newString("lang_detection", "Detection of the language");
    protected static final ParserField[] FIELDS = {TITLE, CONTENT, H1, H2, H3, H4, H5, H6, ANCHORS, IMAGES, METAS, LANG_DETECTION, SELECTORS};
    protected static final ParserField XPATH_PARAM = ParserField.newString("xpath", "Any XPATH selector");
    protected static final ParserField XPATH_NAME_PARAM = ParserField.newString("xpath_name", "The name of the XPATH selector");
    protected static final ParserField CSS_PARAM = ParserField.newString("css", "Any CSS selector");
    protected static final ParserField CSS_NAME_PARAM = ParserField.newString("css_name", "The name of the CSS selector");
    protected static final ParserField[] PARAMETERS = {XPATH_PARAM, XPATH_NAME_PARAM, CSS_PARAM, CSS_NAME_PARAM};

    /**
     * List that accumulates the values produced by a selector evaluation.
     *
     * <p>Nodes contribute their text content, strings are trimmed and {@code null} strings are skipped;
     * booleans and numbers are stored as received.
     *
     * <p>Declared {@code static} because it never touches the enclosing {@link Html} instance.
     * (Decompiler note: the class was originally private; its visibility was widened by the decompiler.)
     */
    public static class ListConsumer extends ArrayList<Object> implements XPathParser.Consumer {
        private ListConsumer() {
        }

        /**
         * Adds the text content of the given node, following the same rules as {@link #accept(String)}.
         *
         * @param node the matched node
         */
        @JsonIgnore
        public void accept(Node node) {
            accept(node.getTextContent());
        }

        /**
         * Adds a boolean result as-is.
         *
         * @param bool the value to store
         */
        @JsonIgnore
        public void accept(Boolean bool) {
            add(bool);
        }

        /**
         * Adds the trimmed string; a {@code null} string is ignored.
         *
         * @param str the value to store, may be {@code null}
         */
        @JsonIgnore
        public void accept(String str) {
            if (str != null) {
                add(str.trim());
            }
        }

        /**
         * Adds a numeric result as-is.
         *
         * @param number the value to store
         */
        @JsonIgnore
        public void accept(Number number) {
            add(number);
        }
    }

    /**
     * Returns the parameters accepted by this parser (XPath/CSS selectors and their names).
     * (Decompiler note: access was widened from protected.)
     *
     * @return the shared {@link #PARAMETERS} array
     */
    @Override // com.qwazr.extractor.ParserAbstract
    public ParserField[] getParameters() {
        return PARAMETERS;
    }

    /**
     * Returns the fields this parser can produce.
     * (Decompiler note: access was widened from protected.)
     *
     * @return the shared {@link #FIELDS} array
     */
    @Override // com.qwazr.extractor.ParserAbstract
    public ParserField[] getFields() {
        return FIELDS;
    }

    /**
     * Sets the {@code title} field from the {@code /html/head/title} XPath, when it evaluates to a
     * non-null string.
     */
    private void extractTitle(XPathParser xPathParser, Document document, ParserDocument parserDocument) throws XPathExpressionException {
        String title = xPathParser.evaluateString(document, "/html/head/title");
        if (title != null) {
            parserDocument.set(TITLE, title);
        }
    }

    /** Adds the text content of every {@code h1}..{@code h6} element to the matching header field. */
    private void extractHeaders(Document document, ParserDocument parserDocument) {
        addToField(parserDocument, H1, document.getElementsByTagName("h1"));
        addToField(parserDocument, H2, document.getElementsByTagName("h2"));
        addToField(parserDocument, H3, document.getElementsByTagName("h3"));
        addToField(parserDocument, H4, document.getElementsByTagName("h4"));
        addToField(parserDocument, H5, document.getElementsByTagName("h5"));
        addToField(parserDocument, H6, document.getElementsByTagName("h6"));
    }

    /**
     * Adds one {@code anchors} entry per node matched by {@code //a/@href}.
     *
     * <p>NOTE(review): the XPath selects the {@code href} attribute nodes themselves, and each of them is
     * then passed to {@code DomUtils.getAttributeString(node, "href")} — confirm the helper returns the
     * expected value when given an attribute node rather than the {@code a} element.
     */
    private void extractAnchors(XPathParser xPathParser, Document document, ParserDocument parserDocument) throws XPathExpressionException {
        xPathParser.evaluateNodes(document, "//a/@href").forEach(node -> {
            parserDocument.add(ANCHORS, DomUtils.getAttributeString(node, "href"));
        });
    }

    /**
     * Adds one {@code images} entry per {@code img} element that has a non-empty {@code src} or
     * {@code alt} attribute; elements with neither are skipped.
     */
    private void extractImgTags(Document document, ParserDocument parserDocument) {
        DomUtils.iterator(document.getElementsByTagName("img")).forEach(node -> {
            LinkedHashMap<String, String> image = new LinkedHashMap<>();
            addToMap(image, "src", DomUtils.getAttributeString(node, "src"));
            addToMap(image, "alt", DomUtils.getAttributeString(node, "alt"));
            if (!image.isEmpty()) {
                parserDocument.add(IMAGES, image);
            }
        });
    }

    /**
     * Adds every text fragment reported by {@code HtmlUtils.domTextExtractor} to the {@code content}
     * field, then adds the result of {@code languageDetection(CONTENT, 10000)} to {@code lang_detection}.
     */
    private void extractTextContent(Document document, ParserDocument parserDocument) throws IOException {
        HtmlUtils.domTextExtractor(document, text -> {
            parserDocument.add(CONTENT, text);
        });
        // languageDetection is not defined in this class; presumably inherited from ParserAbstract.
        parserDocument.add(LANG_DETECTION, languageDetection(CONTENT, 10000));
    }

    /**
     * Collects the {@code name}/{@code content} pairs of the {@code meta} elements found under the first
     * {@code head} element and adds them as a single {@code metas} map.
     *
     * <p>Nothing is added when there is no {@code head} element, or when no {@code meta} element has both
     * attributes non-empty. A later {@code meta} with the same name replaces an earlier one.
     */
    private void extractMeta(Document document, ParserDocument parserDocument) {
        NodeList heads = document.getElementsByTagName("head");
        if (heads == null || heads.getLength() == 0) {
            return;
        }
        Node head = heads.item(0);
        // Guard the cast to Element below.
        if (head.getNodeType() != Node.ELEMENT_NODE) {
            return;
        }
        LinkedHashMap<String, String> metas = new LinkedHashMap<>();
        DomUtils.iterator(((Element) head).getElementsByTagName("meta")).forEach(node -> {
            String name = DomUtils.getAttributeString(node, "name");
            String content = DomUtils.getAttributeString(node, "content");
            if (!StringUtils.isEmpty(name) && !StringUtils.isEmpty(content)) {
                metas.put(name, content);
            }
        });
        if (!metas.isEmpty()) {
            parserDocument.add(METAS, metas);
        }
    }

    /**
     * Evaluates every {@code xpath} parameter (index 0, 1, ... until the first missing value) against
     * the given node and stores each result list in the map.
     *
     * <p>The map key is the {@code xpath_name} parameter at the same index, or the index itself when no
     * name is given.
     *
     * <p>NOTE(review): {@link #extractCss} uses the same default key scheme on the same map, so an unnamed
     * CSS selector replaces the result of an unnamed XPath selector with the same index — confirm this is
     * intended.
     *
     * @return the number of XPath expressions evaluated
     */
    private int extractXPath(XPathParser xPathParser, Node node, LinkedHashMap<String, Object> linkedHashMap) throws XPathExpressionException {
        int index = 0;
        String xpath;
        while ((xpath = getParameterValue(XPATH_PARAM, index)) != null) {
            String name = getParameterValue(XPATH_NAME_PARAM, index);
            ListConsumer results = new ListConsumer();
            xPathParser.evaluate(node, xpath, results);
            linkedHashMap.put(name == null ? Integer.toString(index) : name, results);
            index++;
        }
        return index;
    }

    /**
     * Evaluates every {@code css} parameter (index 0, 1, ... until the first missing value) against the
     * given node and stores the text content of the matched nodes in the map.
     *
     * <p>The map key is the {@code css_name} parameter at the same index, or the index itself when no
     * name is given.
     *
     * <p>NOTE(review): {@link #extractXPath} uses the same default key scheme on the same map, so an
     * unnamed CSS selector replaces the result of an unnamed XPath selector with the same index — confirm
     * this is intended.
     *
     * @return the number of CSS selectors evaluated
     */
    private int extractCss(Node node, LinkedHashMap<String, Object> linkedHashMap) {
        // Typed selectors: with raw types the method reference below cannot be resolved to accept(Node).
        Selectors<Node, W3CNode> selectors = new Selectors<>(new W3CNode(node));
        int index = 0;
        String css;
        while ((css = getParameterValue(CSS_PARAM, index)) != null) {
            String name = getParameterValue(CSS_NAME_PARAM, index);
            ListConsumer results = new ListConsumer();
            List<Node> matches = selectors.querySelectorAll(css);
            matches.forEach(results::accept);
            linkedHashMap.put(name == null ? Integer.toString(index) : name, results);
            index++;
        }
        return index;
    }

    /** Puts the value under the given key unless {@code StringUtils.isEmpty} reports it as empty. */
    private void addToMap(Map<String, String> map, String str, String str2) {
        if (!StringUtils.isEmpty(str2)) {
            map.put(str, str2);
        }
    }

    /** Adds the text content of each node in the list to the given field. */
    private void addToField(ParserDocument parserDocument, ParserField parserField, NodeList nodeList) {
        DomUtils.iterator(nodeList).forEach(node -> {
            parserDocument.add(parserField, node.getTextContent());
        });
    }

    /**
     * Parses the HTML stream and fills a new parser document.
     *
     * <p>Selectors are evaluated first. If they produced at least one entry, only the {@code selectors}
     * field is set and the standard extraction (title, headers, anchors, images, text, metas) is skipped.
     *
     * @param inputStream the HTML content; it is not closed by this method
     * @param str         not used by this implementation
     * @param str2        not used by this implementation
     * @throws Exception if the parser configuration, the parsing or an XPath evaluation fails
     */
    @Override // com.qwazr.extractor.ParserAbstract
    protected void parseContent(InputStream inputStream, String str, String str2) throws Exception {
        HTMLConfiguration config = new HTMLConfiguration();
        config.setFeature("http://xml.org/sax/features/namespaces", true);
        config.setFeature("http://cyberneko.org/html/features/balance-tags/ignore-outside-content", false);
        config.setFeature("http://cyberneko.org/html/features/balance-tags", true);
        config.setFeature("http://cyberneko.org/html/features/report-errors", false);
        // Element and attribute names are lower-cased, matching the lower-case names queried below.
        config.setProperty("http://cyberneko.org/html/properties/names/elems", "lower");
        config.setProperty("http://cyberneko.org/html/properties/names/attrs", "lower");
        DOMParser domParser = new DOMParser(config);
        domParser.parse(new InputSource(inputStream));
        ParserDocument result = getNewParserDocument();
        Document document = domParser.getDocument();
        XPathParser xPathParser = new XPathParser();

        LinkedHashMap<String, Object> selectorResults = new LinkedHashMap<>();
        extractXPath(xPathParser, document, selectorResults);
        extractCss(document, selectorResults);
        if (!selectorResults.isEmpty()) {
            result.set(SELECTORS, selectorResults);
            return;
        }

        extractTitle(xPathParser, document, result);
        extractHeaders(document, result);
        extractAnchors(xPathParser, document, result);
        extractImgTags(document, result);
        extractTextContent(document, result);
        extractMeta(document, result);
    }

    /**
     * Returns the file extensions handled by default.
     * (Decompiler note: access was widened from protected.)
     *
     * @return the shared {@link #DEFAULT_EXTENSIONS} array
     */
    @Override // com.qwazr.extractor.ParserAbstract
    public String[] getDefaultExtensions() {
        return DEFAULT_EXTENSIONS;
    }

    /**
     * Returns the MIME types handled by default.
     * (Decompiler note: access was widened from protected.)
     *
     * @return the shared {@link #DEFAULT_MIMETYPES} array
     */
    @Override // com.qwazr.extractor.ParserAbstract
    public String[] getDefaultMimeTypes() {
        return DEFAULT_MIMETYPES;
    }
}
