package com.qwazr.library.html;

import com.fasterxml.jackson.annotation.JsonIgnore;
import com.qwazr.extractor.ParserAbstract;
import com.qwazr.extractor.ParserField;
import com.qwazr.extractor.ParserFieldsBuilder;
import com.qwazr.extractor.ParserResultBuilder;
import com.qwazr.utils.DomUtils;
import com.qwazr.utils.HtmlUtils;
import com.qwazr.utils.IOUtils;
import com.qwazr.utils.StringUtils;
import com.qwazr.utils.XPathParser;
import java.io.IOException;
import java.io.InputStream;
import java.io.StringReader;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.ws.rs.core.MultivaluedMap;
import javax.xml.xpath.XPathExpressionException;
import org.apache.xerces.parsers.DOMParser;
import org.cyberneko.html.HTMLConfiguration;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.InputSource;
import se.fishtank.css.selectors.Selectors;
import se.fishtank.css.selectors.dom.W3CNode;

/* loaded from: input_file:com/qwazr/library/html/HtmlParser.class */
/**
 * Parser for HTML documents.
 *
 * <p>The input is parsed into a W3C DOM using a NekoHTML configuration driving a Xerces
 * {@link DOMParser}. Two modes are supported:</p>
 * <ul>
 * <li><b>Selector mode</b>: when at least one {@code xpath}, {@code css} or {@code regexp}
 * parameter produces a result list, only the {@code selectors} field is filled.</li>
 * <li><b>Default mode</b>: otherwise the title, headers (h1..h6), anchors, images, text content,
 * detected language and meta tags are extracted.</li>
 * </ul>
 */
public class HtmlParser extends ParserAbstract {
    private static final String[] DEFAULT_MIMETYPES = {"text/html"};
    private static final String[] DEFAULT_EXTENSIONS = {"htm", "html"};

    // Fields produced by the parser.
    private static final ParserField TITLE = ParserField.newString("title", "The title of the document");
    private static final ParserField CONTENT = ParserField.newString("content", "The text content of the document. One item per paragraph");
    private static final ParserField H1 = ParserField.newString("h1", "H1 header contents");
    private static final ParserField H2 = ParserField.newString("h2", "H2 header contents");
    private static final ParserField H3 = ParserField.newString("h3", "H3 header contents");
    private static final ParserField H4 = ParserField.newString("h4", "H4 header contents");
    private static final ParserField H5 = ParserField.newString("h5", "H5 header contents");
    private static final ParserField H6 = ParserField.newString("h6", "H6 header contents");
    private static final ParserField ANCHORS = ParserField.newString("anchors", "Anchors");
    private static final ParserField IMAGES = ParserField.newMap("images", "Image tags");
    private static final ParserField METAS = ParserField.newMap("metas", "Meta tags");
    private static final ParserField SELECTORS = ParserField.newMap("selectors", "Selector results");
    private static final ParserField LANG_DETECTION = ParserField.newString("lang_detection", "Detection of the language");
    private static final ParserField[] FIELDS = {TITLE, CONTENT, H1, H2, H3, H4, H5, H6, ANCHORS, IMAGES, METAS, LANG_DETECTION, SELECTORS};

    // Parameters accepted by the parser. Each selector parameter may be repeated; the matching
    // "*_name" parameter at the same position names the result entry.
    private static final ParserField XPATH_PARAM = ParserField.newString("xpath", "Any XPATH selector");
    private static final ParserField XPATH_NAME_PARAM = ParserField.newString("xpath_name", "The name of the XPATH selector");
    private static final ParserField CSS_PARAM = ParserField.newString("css", "Any CSS selector");
    private static final ParserField CSS_NAME_PARAM = ParserField.newString("css_name", "The name of the CSS selector");
    private static final ParserField REGEXP_PARAM = ParserField.newString("regexp", "Any regular expression");
    private static final ParserField REGEXP_NAME_PARAM = ParserField.newString("regexp_name", "The name of the regular expression");
    private static final ParserField[] PARAMETERS = {XPATH_PARAM, XPATH_NAME_PARAM, CSS_PARAM, CSS_NAME_PARAM, REGEXP_PARAM, REGEXP_NAME_PARAM};

    /**
     * List that collects the values produced by a selector (XPath, CSS or regular expression).
     *
     * <p>Declared as a static nested class: it does not use any state of the enclosing parser,
     * so result lists stored in the output do not keep a reference to the parser instance.</p>
     */
    public static class ListConsumer extends ArrayList<Object> implements XPathParser.Consumer {
        private ListConsumer() {
        }

        /** Stores the text content of the given node (trimmed; skipped when {@code null}). */
        @JsonIgnore
        public void accept(Node node) {
            accept(node.getTextContent());
        }

        /** Stores the boolean value as is. */
        @JsonIgnore
        public void accept(Boolean bool) {
            add(bool);
        }

        /** Stores the trimmed string; {@code null} values are ignored. */
        @JsonIgnore
        public void accept(String str) {
            if (str != null) {
                add(str.trim());
            }
        }

        /** Stores the numeric value as is. */
        @JsonIgnore
        public void accept(Number number) {
            add(number);
        }
    }

    /** @return the parameters supported by this parser */
    public ParserField[] getParameters() {
        return PARAMETERS;
    }

    /** @return the fields this parser may produce */
    public ParserField[] getFields() {
        return FIELDS;
    }

    /** Sets the {@code title} field from {@code /html/head/title} when the evaluation is not {@code null}. */
    private void extractTitle(XPathParser xPathParser, Document document, ParserFieldsBuilder parserFieldsBuilder) throws XPathExpressionException {
        final String title = xPathParser.evaluateString(document, "/html/head/title");
        if (title != null) {
            parserFieldsBuilder.set(TITLE, title);
        }
    }

    /** Adds the text content of every h1..h6 element to the matching header field. */
    private void extractHeaders(Document document, ParserFieldsBuilder parserFieldsBuilder) {
        addToField(parserFieldsBuilder, H1, document.getElementsByTagName("h1"));
        addToField(parserFieldsBuilder, H2, document.getElementsByTagName("h2"));
        addToField(parserFieldsBuilder, H3, document.getElementsByTagName("h3"));
        addToField(parserFieldsBuilder, H4, document.getElementsByTagName("h4"));
        addToField(parserFieldsBuilder, H5, document.getElementsByTagName("h5"));
        addToField(parserFieldsBuilder, H6, document.getElementsByTagName("h6"));
    }

    /** Adds one {@code anchors} value per node selected by {@code //a/@href}. */
    private void extractAnchors(XPathParser xPathParser, Document document, ParserFieldsBuilder parserFieldsBuilder) throws XPathExpressionException {
        // NOTE(review): the XPath selects attribute nodes, yet the value is read through
        // DomUtils.getAttributeString(node, "href") - confirm it handles attribute nodes as expected.
        xPathParser.evaluateNodes(document, "//a/@href").forEach(hrefNode -> {
            parserFieldsBuilder.add(ANCHORS, DomUtils.getAttributeString(hrefNode, "href"));
        });
    }

    /** Adds one {@code images} map (non-empty {@code src} / {@code alt} attributes) per img element. */
    private void extractImgTags(Document document, ParserFieldsBuilder parserFieldsBuilder) {
        DomUtils.iterator(document.getElementsByTagName("img")).forEach(imgNode -> {
            final LinkedHashMap<String, String> attributes = new LinkedHashMap<>();
            addToMap(attributes, "src", DomUtils.getAttributeString(imgNode, "src"));
            addToMap(attributes, "alt", DomUtils.getAttributeString(imgNode, "alt"));
            // Images without any usable attribute are not reported.
            if (!attributes.isEmpty()) {
                parserFieldsBuilder.add(IMAGES, attributes);
            }
        });
    }

    /** Fills the {@code content} field, then adds the language detected from it. */
    private void extractTextContent(Document document, ParserFieldsBuilder parserFieldsBuilder) throws IOException {
        HtmlUtils.domTextExtractor(document, text -> {
            parserFieldsBuilder.add(CONTENT, text);
        });
        parserFieldsBuilder.add(LANG_DETECTION, languageDetection(parserFieldsBuilder, CONTENT, 10000));
    }

    /**
     * Adds a {@code metas} map built from the meta elements of the first head element.
     * Only meta tags with both a non-empty {@code name} and a non-empty {@code content} are kept.
     */
    private void extractMeta(Document document, ParserFieldsBuilder parserFieldsBuilder) {
        final NodeList heads = document.getElementsByTagName("head");
        if (heads == null || heads.getLength() == 0) {
            return;
        }
        final Node head = heads.item(0);
        if (head.getNodeType() != Node.ELEMENT_NODE) {
            return;
        }
        final LinkedHashMap<String, String> metas = new LinkedHashMap<>();
        DomUtils.iterator(((Element) head).getElementsByTagName("meta")).forEach(metaNode -> {
            final String name = DomUtils.getAttributeString(metaNode, "name");
            final String content = DomUtils.getAttributeString(metaNode, "content");
            if (!StringUtils.isEmpty(name) && !StringUtils.isEmpty(content)) {
                metas.put(name, content);
            }
        });
        if (!metas.isEmpty()) {
            parserFieldsBuilder.add(METAS, metas);
        }
    }

    /** Returns the key of a selector result: the given name, or the position when no name was provided. */
    private static String resultKey(String name, int position) {
        return name == null ? Integer.toString(position) : name;
    }

    /**
     * Evaluates every {@code xpath} parameter against the node and stores each result list.
     *
     * @return the number of XPath expressions evaluated
     */
    private int extractXPath(MultivaluedMap<String, String> multivaluedMap, XPathParser xPathParser, Node node, LinkedHashMap<String, Object> linkedHashMap) throws XPathExpressionException {
        int position = 0;
        String expression;
        while ((expression = getParameterValue(multivaluedMap, XPATH_PARAM, position)) != null) {
            final String name = getParameterValue(multivaluedMap, XPATH_NAME_PARAM, position);
            final ListConsumer results = new ListConsumer();
            xPathParser.evaluate(node, expression, results);
            linkedHashMap.put(resultKey(name, position), results);
            position++;
        }
        return position;
    }

    /**
     * Evaluates every {@code css} parameter against the node and stores the text content of the
     * matching nodes.
     *
     * @return the number of CSS selectors evaluated
     */
    private int extractCss(MultivaluedMap<String, String> multivaluedMap, Node node, LinkedHashMap<String, Object> linkedHashMap) {
        final Selectors<Node, W3CNode> selectors = new Selectors<>(new W3CNode(node));
        int position = 0;
        String css;
        while ((css = getParameterValue(multivaluedMap, CSS_PARAM, position)) != null) {
            final String name = getParameterValue(multivaluedMap, CSS_NAME_PARAM, position);
            final ListConsumer results = new ListConsumer();
            final List<Node> matches = selectors.querySelectorAll(css);
            matches.forEach(results::accept);
            linkedHashMap.put(resultKey(name, position), results);
            position++;
        }
        return position;
    }

    /**
     * Applies every {@code regexp} parameter to the raw text and stores the captured groups.
     * Only capturing groups (1..n) are collected, so a pattern without any group yields an
     * empty list. Patterns are compiled with {@link Pattern#DOTALL}.
     *
     * @return the number of regular expressions evaluated
     */
    private int extractRegExp(MultivaluedMap<String, String> multivaluedMap, String str, LinkedHashMap<String, Object> linkedHashMap) {
        int position = 0;
        String regexp;
        while ((regexp = getParameterValue(multivaluedMap, REGEXP_PARAM, position)) != null) {
            final String name = getParameterValue(multivaluedMap, REGEXP_NAME_PARAM, position);
            final ListConsumer results = new ListConsumer();
            final Matcher matcher = Pattern.compile(regexp, Pattern.DOTALL).matcher(str);
            final int groupCount = matcher.groupCount();
            while (matcher.find()) {
                for (int group = 1; group <= groupCount; group++) {
                    // Groups that did not participate in the match are null and skipped by accept(String).
                    results.accept(matcher.group(group));
                }
            }
            linkedHashMap.put(resultKey(name, position), results);
            position++;
        }
        return position;
    }

    /** Puts the value in the map unless it is empty. */
    private void addToMap(Map<String, String> map, String str, String str2) {
        if (!StringUtils.isEmpty(str2)) {
            map.put(str, str2);
        }
    }

    /** Adds the text content of every node of the list to the given field. */
    private void addToField(ParserFieldsBuilder parserFieldsBuilder, ParserField parserField, NodeList nodeList) {
        DomUtils.iterator(nodeList).forEach(node -> {
            parserFieldsBuilder.add(parserField, node.getTextContent());
        });
    }

    /**
     * Parses the HTML stream and fills a new document of the result builder.
     *
     * <p>When selector parameters ({@code xpath}, {@code css}, {@code regexp}) produce at least
     * one result entry, only the {@code selectors} field is set. Otherwise the default extraction
     * (title, headers, anchors, images, text content, language, metas) is performed.</p>
     *
     * @param multivaluedMap      the parser parameters, may be {@code null}
     * @param inputStream         the HTML content; it is not closed by this method
     * @param str                 not used by this implementation
     * @param str2                not used by this implementation
     * @param parserResultBuilder the builder receiving the extracted document
     * @throws Exception if the HTML parsing or a selector evaluation fails
     */
    public void parseContent(MultivaluedMap<String, String> multivaluedMap, InputStream inputStream, String str, String str2, ParserResultBuilder parserResultBuilder) throws Exception {
        final boolean hasXPath = multivaluedMap != null && multivaluedMap.containsKey(XPATH_PARAM.name);
        final boolean hasCss = multivaluedMap != null && multivaluedMap.containsKey(CSS_PARAM.name);
        final boolean hasRegExp = multivaluedMap != null && multivaluedMap.containsKey(REGEXP_PARAM.name);

        final HTMLConfiguration htmlConfiguration = new HTMLConfiguration();
        htmlConfiguration.setFeature("http://xml.org/sax/features/namespaces", true);
        htmlConfiguration.setFeature("http://cyberneko.org/html/features/balance-tags/ignore-outside-content", false);
        htmlConfiguration.setFeature("http://cyberneko.org/html/features/balance-tags", true);
        htmlConfiguration.setFeature("http://cyberneko.org/html/features/report-errors", false);
        htmlConfiguration.setProperty("http://cyberneko.org/html/properties/names/elems", "lower");
        htmlConfiguration.setProperty("http://cyberneko.org/html/properties/names/attrs", "lower");
        final DOMParser domParser = new DOMParser(htmlConfiguration);

        final String rawText;
        if (hasRegExp) {
            // Regular expressions run on the raw text, so the stream is fully read first.
            // NOTE(review): the content is decoded as UTF-8 on this path, whereas the other
            // path hands the raw stream to the parser - confirm this is the intended charset handling.
            rawText = IOUtils.toString(inputStream, StandardCharsets.UTF_8);
            domParser.parse(new InputSource(new StringReader(rawText)));
        } else {
            rawText = null;
            domParser.parse(new InputSource(inputStream));
        }

        final ParserFieldsBuilder newDocument = parserResultBuilder.newDocument();
        final LinkedHashMap<String, Object> selectorResults = new LinkedHashMap<>();
        final Document document = domParser.getDocument();

        XPathParser xPathParser = null;
        if (hasXPath) {
            xPathParser = new XPathParser();
            extractXPath(multivaluedMap, xPathParser, document, selectorResults);
        }
        if (hasCss) {
            extractCss(multivaluedMap, document, selectorResults);
        }
        if (hasRegExp) {
            extractRegExp(multivaluedMap, rawText, selectorResults);
        }
        if (!selectorResults.isEmpty()) {
            newDocument.set(SELECTORS, selectorResults);
            return;
        }

        // Default extraction. The XPath parser is created here when it was not needed before:
        // a css/regexp parameter key may be present without yielding any selector entry, and
        // the default extraction must not run with a null parser in that case.
        if (xPathParser == null) {
            xPathParser = new XPathParser();
        }
        extractTitle(xPathParser, document, newDocument);
        extractHeaders(document, newDocument);
        extractAnchors(xPathParser, document, newDocument);
        extractImgTags(document, newDocument);
        extractTextContent(document, newDocument);
        extractMeta(document, newDocument);
    }

    /** @return the file extensions handled by default: {@code htm} and {@code html} */
    public String[] getDefaultExtensions() {
        return DEFAULT_EXTENSIONS;
    }

    /** @return the MIME types handled by default: {@code text/html} */
    public String[] getDefaultMimeTypes() {
        return DEFAULT_MIMETYPES;
    }
}
