package com.qwazr.library.html;

import com.fasterxml.jackson.annotation.JsonIgnore;
import com.qwazr.extractor.ParserFactory;
import com.qwazr.extractor.ParserField;
import com.qwazr.extractor.ParserInterface;
import com.qwazr.extractor.ParserResult;
import com.qwazr.extractor.ParserUtils;
import com.qwazr.utils.DomUtils;
import com.qwazr.utils.HtmlUtils;
import com.qwazr.utils.IOUtils;
import com.qwazr.utils.StringUtils;
import com.qwazr.utils.XPathParser;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.StringReader;
import java.nio.charset.StandardCharsets;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Collection;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.ws.rs.InternalServerErrorException;
import javax.ws.rs.NotAcceptableException;
import javax.ws.rs.core.MediaType;
import javax.ws.rs.core.MultivaluedMap;
import javax.xml.xpath.XPathExpressionException;
import org.apache.xerces.parsers.DOMParser;
import org.cyberneko.html.HTMLConfiguration;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import se.fishtank.css.selectors.Selectors;
import se.fishtank.css.selectors.dom.W3CNode;

/**
 * Parser (and its own factory) registered under the name {@code "html"}.
 *
 * <p>The input is parsed into a W3C DOM with a Xerces {@link DOMParser} driven by a NekoHTML
 * {@link HTMLConfiguration}. Two modes are supported by {@link #extract(MultivaluedMap, InputStream, MediaType)}:
 * <ul>
 *   <li>no {@code xpath} / {@code css} / {@code regexp} parameter: title, headers, anchors,
 *       images, text content (plus language detection) and metas are all extracted;</li>
 *   <li>at least one selector parameter: the selector results are stored in the
 *       {@code selectors} field and the other fields are only extracted when their name is
 *       present as a key in the parameters.</li>
 * </ul>
 */
public class HtmlParser implements ParserFactory, ParserInterface {
    private static final ThreadLocal<DOMParser> DOM_PARSER_THREAD_LOCAL = ThreadLocal.withInitial(HtmlParser::getNewDomParser);
    private static final MediaType DEFAULT_MIMETYPE = MediaType.valueOf("text/html");
    private static final Collection<MediaType> DEFAULT_MIMETYPES = List.of(DEFAULT_MIMETYPE);
    private static final String NAME = "html";
    private static final Collection<String> DEFAULT_EXTENSIONS = List.of("htm", NAME);
    private static final ParserField HEADERS = ParserField.newString("headers", "Extract headers (h1, h2, h3, h4, h5, h6)");
    private static final ParserField H1 = ParserField.newString("h1", "H1 header contents");
    private static final ParserField H2 = ParserField.newString("h2", "H2 header contents");
    private static final ParserField H3 = ParserField.newString("h3", "H3 header contents");
    private static final ParserField H4 = ParserField.newString("h4", "H4 header contents");
    private static final ParserField H5 = ParserField.newString("h5", "H5 header contents");
    private static final ParserField H6 = ParserField.newString("h6", "H6 header contents");
    private static final ParserField ANCHORS = ParserField.newString("anchors", "Anchors");
    private static final ParserField IMAGES = ParserField.newMap("images", "Image tags");
    private static final ParserField METAS = ParserField.newMap("metas", "Meta tags");
    private static final ParserField SELECTORS = ParserField.newMap("selectors", "Selector results");
    // Fields this parser may produce. No (Object[]) cast: it would make List.of infer List<Object>.
    private static final Collection<ParserField> FIELDS = List.of(TITLE, CONTENT, H1, H2, H3, H4, H5, H6, ANCHORS, IMAGES, METAS, LANG_DETECTION, SELECTORS);
    private static final ParserField XPATH_PARAM = ParserField.newString("xpath", "Any XPATH selector");
    private static final ParserField XPATH_NAME_PARAM = ParserField.newString("xpath_name", "The name of the XPATH selector");
    private static final ParserField CSS_PARAM = ParserField.newString("css", "Any CSS selector");
    private static final ParserField CSS_NAME_PARAM = ParserField.newString("css_name", "The name of the CSS selector");
    private static final ParserField REGEXP_PARAM = ParserField.newString("regexp", "Any regular expression");
    private static final ParserField REGEXP_NAME_PARAM = ParserField.newString("regexp_name", "The name of the regular expression");
    // Parameters accepted by extract(...).
    private static final List<ParserField> PARAMETERS = List.of(TITLE, CONTENT, HEADERS, ANCHORS, IMAGES, METAS, XPATH_PARAM, XPATH_NAME_PARAM, CSS_PARAM, CSS_NAME_PARAM, REGEXP_PARAM, REGEXP_NAME_PARAM);

    /**
     * List that accumulates the values produced by one selector (XPath, CSS or regular
     * expression). Nodes are stored as their trimmed text content, strings are trimmed,
     * {@code null} strings are skipped, booleans and numbers are stored unchanged.
     */
    public static class ListConsumer extends ArrayList<Object> implements XPathParser.Consumer {
        private ListConsumer() {
        }

        /** Stores the text content of {@code node} (trimmed; nothing is stored when it is {@code null}). */
        @JsonIgnore
        public void accept(Node node) {
            accept(node.getTextContent());
        }

        /** Stores {@code bool} as is. */
        @JsonIgnore
        public void accept(Boolean bool) {
            add(bool);
        }

        /** Stores {@code str} trimmed; a {@code null} value is ignored. */
        @JsonIgnore
        public void accept(String str) {
            if (str != null) {
                add(str.trim());
            }
        }

        /** Stores {@code number} as is. */
        @JsonIgnore
        public void accept(Number number) {
            add(number);
        }
    }

    /**
     * Builds a new NekoHTML configuration: namespaces and tag balancing enabled,
     * "ignore outside content" and error reporting disabled, element and attribute
     * names converted to lower case.
     *
     * @return a freshly created configuration
     */
    public static HTMLConfiguration getNewHtmlConfiguration() {
        final HTMLConfiguration configuration = new HTMLConfiguration();
        configuration.setFeature("http://xml.org/sax/features/namespaces", true);
        configuration.setFeature("http://cyberneko.org/html/features/balance-tags/ignore-outside-content", false);
        configuration.setFeature("http://cyberneko.org/html/features/balance-tags", true);
        configuration.setFeature("http://cyberneko.org/html/features/report-errors", false);
        configuration.setProperty("http://cyberneko.org/html/properties/names/elems", "lower");
        configuration.setProperty("http://cyberneko.org/html/properties/names/attrs", "lower");
        return configuration;
    }

    /**
     * @return a new DOM parser backed by {@link #getNewHtmlConfiguration()}
     */
    public static DOMParser getNewDomParser() {
        return new DOMParser(getNewHtmlConfiguration());
    }

    /**
     * @return the DOM parser bound to the calling thread; it is created on first use and the
     *         same instance is returned by every later call made on that thread
     */
    public static DOMParser getThreadLocalDomParser() {
        return DOM_PARSER_THREAD_LOCAL.get();
    }

    /** @return the parameters accepted by this parser */
    public Collection<ParserField> getParameters() {
        return PARAMETERS;
    }

    /** @return the fields this parser may produce */
    public Collection<ParserField> getFields() {
        return FIELDS;
    }

    /** Sets the TITLE field from the text found under {@code /html/head/title}, when the XPath yields a value. */
    private void extractTitle(XPathParser xPath, Document document, ParserResult.FieldsBuilder fields) throws XPathExpressionException {
        final String title = xPath.evaluateString(document, "/html/head/title//text()");
        if (title != null) {
            fields.set(TITLE, title);
        }
    }

    /** Adds the text content of every h1..h6 element to the matching H1..H6 field. */
    private void extractHeaders(Document document, ParserResult.FieldsBuilder fields) {
        addToField(fields, H1, document.getElementsByTagName("h1"));
        addToField(fields, H2, document.getElementsByTagName("h2"));
        addToField(fields, H3, document.getElementsByTagName("h3"));
        addToField(fields, H4, document.getElementsByTagName("h4"));
        addToField(fields, H5, document.getElementsByTagName("h5"));
        addToField(fields, H6, document.getElementsByTagName("h6"));
    }

    /** Adds one ANCHORS value per node matched by {@code //a/@href}. */
    private void extractAnchors(XPathParser xPath, Document document, ParserResult.FieldsBuilder fields) throws XPathExpressionException {
        // NOTE(review): the XPath selects the href attribute nodes themselves, which are then
        // handed to DomUtils.getAttributeString(node, "href") - confirm that helper copes with
        // attribute nodes (as opposed to elements).
        DomUtils.forEach(xPath.evaluateNodes(document, "//a/@href"),
                node -> fields.add(ANCHORS, DomUtils.getAttributeString(node, "href")));
    }

    /** Adds one IMAGES map (non-empty {@code src} / {@code alt} attributes) per img element having at least one of them. */
    private void extractImgTags(Document document, ParserResult.FieldsBuilder fields) {
        DomUtils.forEach(document.getElementsByTagName("img"), node -> {
            final Map<String, String> attributes = new LinkedHashMap<>();
            addToMap(attributes, "src", DomUtils.getAttributeString(node, "src"));
            addToMap(attributes, "alt", DomUtils.getAttributeString(node, "alt"));
            if (!attributes.isEmpty()) {
                fields.add(IMAGES, attributes);
            }
        });
    }

    /** Adds the text pieces reported by {@code HtmlUtils.domTextExtractor} to CONTENT, then the language detection result. */
    private void extractTextContent(Document document, ParserResult.FieldsBuilder fields) {
        HtmlUtils.domTextExtractor(document, text -> fields.add(CONTENT, text));
        // 10000 is passed through to ParserUtils.languageDetection unchanged.
        fields.add(LANG_DETECTION, ParserUtils.languageDetection(fields, CONTENT, 10000));
    }

    /**
     * Adds to METAS a single map of {@code name -> content} built from the meta elements of
     * the first head element. Metas with an empty name or content are skipped; nothing is
     * added when no usable meta is found.
     */
    private void extractMeta(Document document, ParserResult.FieldsBuilder fields) {
        final NodeList heads = document.getElementsByTagName("head");
        if (heads == null || heads.getLength() == 0) {
            return;
        }
        final Node head = heads.item(0);
        if (head.getNodeType() != Node.ELEMENT_NODE) {
            return;
        }
        final Map<String, String> metas = new LinkedHashMap<>();
        DomUtils.forEach(((Element) head).getElementsByTagName("meta"), node -> {
            final String name = DomUtils.getAttributeString(node, "name");
            final String content = DomUtils.getAttributeString(node, "content");
            if (!StringUtils.isEmpty(name) && !StringUtils.isEmpty(content)) {
                metas.put(name, content);
            }
        });
        if (!metas.isEmpty()) {
            fields.add(METAS, metas);
        }
    }

    /**
     * Collects the indexed values of a selector parameter, in order, until the first missing
     * index. Each value is keyed by the value of the name parameter at the same index, or by
     * the index itself when no name was given.
     *
     * @return an insertion-ordered {@code name -> selector} map, possibly empty
     */
    private Map<String, String> extractPrefixParameters(MultivaluedMap<String, String> parameters, ParserField valueParam, ParserField nameParam) {
        final Map<String, String> selectors = new LinkedHashMap<>();
        for (int i = 0; ; i++) {
            final String selector = ParserUtils.getParameterValue(parameters, valueParam, i);
            if (selector == null) {
                return selectors;
            }
            final String name = ParserUtils.getParameterValue(parameters, nameParam, i);
            selectors.put(name == null ? Integer.toString(i) : name, selector);
        }
    }

    /** Evaluates every XPath expression against {@code root} and stores the collected values under the selector name. */
    private void extractXPath(Map<String, String> xpathSelectors, XPathParser xPath, Node root, Map<String, Object> results) throws XPathExpressionException {
        for (Map.Entry<String, String> selector : xpathSelectors.entrySet()) {
            final ListConsumer values = new ListConsumer();
            xPath.evaluate(root, selector.getValue(), values);
            results.put(selector.getKey(), values);
        }
    }

    /** Runs every CSS selector against {@code root} and stores the text content of the matched nodes under the selector name. */
    private void extractCss(Map<String, String> cssSelectors, Node root, Map<String, Object> results) {
        // Typed selectors: the element type must be known so that the Node overload of
        // ListConsumer.accept is the one being called.
        final Selectors<Node, W3CNode> selectors = new Selectors<>(new W3CNode(root));
        for (Map.Entry<String, String> selector : cssSelectors.entrySet()) {
            final ListConsumer values = new ListConsumer();
            for (Node match : selectors.querySelectorAll(selector.getValue())) {
                values.accept(match);
            }
            results.put(selector.getKey(), values);
        }
    }

    /**
     * Applies every regular expression (compiled with {@link Pattern#DOTALL}) to {@code html}
     * and stores, for each match, the content of its capturing groups under the selector
     * name. An expression without capturing group therefore yields an empty list.
     */
    private void extractRegExp(Map<String, String> regexpSelectors, String html, Map<String, Object> results) {
        for (Map.Entry<String, String> selector : regexpSelectors.entrySet()) {
            final ListConsumer values = new ListConsumer();
            final Matcher matcher = Pattern.compile(selector.getValue(), Pattern.DOTALL).matcher(html);
            final int groupCount = matcher.groupCount();
            while (matcher.find()) {
                for (int group = 1; group <= groupCount; group++) {
                    values.accept(matcher.group(group));
                }
            }
            results.put(selector.getKey(), values);
        }
    }

    /** Puts {@code value} under {@code key} unless it is empty. */
    private void addToMap(Map<String, String> map, String key, String value) {
        if (!StringUtils.isEmpty(value)) {
            map.put(key, value);
        }
    }

    /** Adds the text content of every node of {@code nodes} to {@code field}. */
    private void addToField(ParserResult.FieldsBuilder fields, ParserField field, NodeList nodes) {
        DomUtils.forEach(nodes, node -> fields.add(field, node.getTextContent()));
    }

    /** Tells whether {@code field} must be extracted: always in "extract all" mode, else only when its name is a parameter key. */
    private static boolean isRequested(MultivaluedMap<String, String> parameters, ParserField field, boolean extractAll) {
        return extractAll || (parameters != null && parameters.containsKey(field.name));
    }

    /**
     * Parses the HTML read from {@code inputStream} (decoded as UTF-8) and extracts the
     * requested fields.
     *
     * @param multivaluedMap the extraction parameters, may be {@code null}
     * @param inputStream    the HTML content
     * @param mediaType      when not {@code null}, recorded in the MIME_TYPE meta of the result
     * @return the extraction result
     * @throws IOException                  if the content cannot be read
     * @throws NotAcceptableException       if an XPath expression is invalid
     * @throws InternalServerErrorException if the DOM parser reports a SAX error
     */
    public ParserResult extract(MultivaluedMap<String, String> multivaluedMap, InputStream inputStream, MediaType mediaType) throws IOException {
        // The whole extraction sits in the try block: parsing throws SAXException and the
        // XPath based steps throw XPathExpressionException.
        try {
            final ParserResult.Builder resultBuilder = ParserResult.of(NAME);
            if (mediaType != null) {
                resultBuilder.metas().set(MIME_TYPE, mediaType.toString());
            }

            final Map<String, String> xpathSelectors = extractPrefixParameters(multivaluedMap, XPATH_PARAM, XPATH_NAME_PARAM);
            final Map<String, String> cssSelectors = extractPrefixParameters(multivaluedMap, CSS_PARAM, CSS_NAME_PARAM);
            final Map<String, String> regexpSelectors = extractPrefixParameters(multivaluedMap, REGEXP_PARAM, REGEXP_NAME_PARAM);
            // Without any selector every field is extracted. Each selector puts one entry in
            // the selector results, so this is equivalent to "the selector results are empty".
            final boolean extractAll = xpathSelectors.isEmpty() && cssSelectors.isEmpty() && regexpSelectors.isEmpty();

            final DOMParser domParser = getThreadLocalDomParser();
            final String html;
            if (regexpSelectors.isEmpty()) {
                html = null;
                domParser.parse(new InputSource(new InputStreamReader(inputStream, StandardCharsets.UTF_8)));
            } else {
                // Regular expressions run on the raw text, so the content is kept as a string.
                html = IOUtils.toString(inputStream, StandardCharsets.UTF_8);
                domParser.parse(new InputSource(new StringReader(html)));
            }

            final ParserResult.FieldsBuilder fields = resultBuilder.newDocument();
            final LinkedHashMap<String, Object> selectorResults = new LinkedHashMap<>();
            final Document document = domParser.getDocument();

            final boolean titleRequested = isRequested(multivaluedMap, TITLE, extractAll);
            final boolean anchorsRequested = isRequested(multivaluedMap, ANCHORS, extractAll);
            // The XPath parser is needed by the XPath selectors AND by the title / anchors
            // extraction, even when only CSS or regexp selectors were supplied.
            final XPathParser xPath = (!xpathSelectors.isEmpty() || titleRequested || anchorsRequested) ? new XPathParser() : null;

            if (!xpathSelectors.isEmpty()) {
                extractXPath(xpathSelectors, xPath, document, selectorResults);
            }
            if (!cssSelectors.isEmpty()) {
                extractCss(cssSelectors, document, selectorResults);
            }
            if (!regexpSelectors.isEmpty()) {
                extractRegExp(regexpSelectors, html, selectorResults);
            }
            if (!selectorResults.isEmpty()) {
                fields.set(SELECTORS, selectorResults);
            }

            if (titleRequested) {
                extractTitle(xPath, document, fields);
            }
            if (isRequested(multivaluedMap, HEADERS, extractAll)) {
                extractHeaders(document, fields);
            }
            if (anchorsRequested) {
                extractAnchors(xPath, document, fields);
            }
            if (isRequested(multivaluedMap, IMAGES, extractAll)) {
                extractImgTags(document, fields);
            }
            if (isRequested(multivaluedMap, CONTENT, extractAll)) {
                extractTextContent(document, fields);
            }
            if (isRequested(multivaluedMap, METAS, extractAll)) {
                extractMeta(document, fields);
            }
            return resultBuilder.build();
        } catch (XPathExpressionException e) {
            throw new NotAcceptableException("Error in the XPATH expression: " + e.getMessage(), e);
        } catch (SAXException e) {
            throw new InternalServerErrorException(e);
        }
    }

    /**
     * Extracts the content of the file at {@code path}, handled with the default
     * {@code text/html} media type.
     *
     * @param multivaluedMap the extraction parameters, may be {@code null}
     * @param path           the file to parse
     * @return the extraction result
     * @throws IOException if the file cannot be read
     */
    public ParserResult extract(MultivaluedMap<String, String> multivaluedMap, Path path) throws IOException {
        return ParserUtils.toBufferedStream(path, inputStream -> extract(multivaluedMap, inputStream, DEFAULT_MIMETYPE));
    }

    /** @return the parser name, {@code "html"} */
    public String getName() {
        return NAME;
    }

    /** @return this instance, which acts as both factory and parser */
    public ParserInterface createParser() {
        return this;
    }

    /** @return the supported file extensions ({@code htm}, {@code html}) */
    public Collection<String> getSupportedFileExtensions() {
        return DEFAULT_EXTENSIONS;
    }

    /** @return the supported media types ({@code text/html}) */
    public Collection<MediaType> getSupportedMimeTypes() {
        return DEFAULT_MIMETYPES;
    }
}
