package com.qwazr.extractor.parser;

import com.gargoylesoftware.htmlunit.Page;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.WebClientOptions;
import com.gargoylesoftware.htmlunit.html.DomElement;
import com.gargoylesoftware.htmlunit.html.DomNodeList;
import com.gargoylesoftware.htmlunit.html.HtmlAnchor;
import com.gargoylesoftware.htmlunit.html.HtmlElement;
import com.gargoylesoftware.htmlunit.html.HtmlPage;
import com.qwazr.extractor.ParserAbstract;
import com.qwazr.extractor.ParserDocument;
import com.qwazr.extractor.ParserField;
import com.qwazr.utils.StringUtils;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;

/* loaded from: input_file:com/qwazr/extractor/parser/Html.class */
public class Html extends ParserAbstract {
    /** MIME types returned by {@link #getDefaultMimeTypes()}. */
    public static final String[] DEFAULT_MIMETYPES = {"text/html"};

    /** File extensions returned by {@link #getDefaultExtensions()}. */
    public static final String[] DEFAULT_EXTENSIONS = {"htm", "html"};

    // ---- Output fields -------------------------------------------------

    protected static final ParserField TITLE = ParserField.newString("title", "The title of the document");
    protected static final ParserField CONTENT = ParserField.newString("content", "The text content of the document. One item per paragraph");
    protected static final ParserField H1 = ParserField.newString("h1", "H1 header contents");
    protected static final ParserField H2 = ParserField.newString("h2", "H2 header contents");
    protected static final ParserField H3 = ParserField.newString("h3", "H3 header contents");
    protected static final ParserField H4 = ParserField.newString("h4", "H4 header contents");
    protected static final ParserField H5 = ParserField.newString("h5", "H5 header contents");
    protected static final ParserField H6 = ParserField.newString("h6", "H6 header contents");
    protected static final ParserField ANCHORS = ParserField.newString("anchors", "Anchors");
    protected static final ParserField IMAGES = ParserField.newMap("images", "Image tags");
    protected static final ParserField METAS = ParserField.newMap("metas", "Meta tags");
    protected static final ParserField XPATH = ParserField.newMap("xpath", "XPath selector results");
    protected static final ParserField CSS = ParserField.newMap("css", "CSS selector results");
    protected static final ParserField LANG_DETECTION = ParserField.newString("lang_detection", "Detection of the language");

    /**
     * Every field this parser may add to a document, as returned by {@link #getFields()}.
     * CSS is listed because extractCss populates it; it is appended last so the
     * positions of the previously listed fields do not change.
     */
    protected static final ParserField[] FIELDS = {TITLE, CONTENT, H1, H2, H3, H4, H5, H6, ANCHORS, IMAGES, METAS, LANG_DETECTION, XPATH, CSS};

    // ---- Input parameters ----------------------------------------------

    protected static final ParserField XPATH_PARAM = ParserField.newString("xpath", "Any XPATH selector");
    protected static final ParserField XPATH_NAME_PARAM = ParserField.newString("xpath_name", "The name of the XPATH selector");
    protected static final ParserField CSS_PARAM = ParserField.newString("css", "Any CSS selector");
    protected static final ParserField CSS_NAME_PARAM = ParserField.newString("css_name", "The name of the CSS selector");

    /** Parameters accepted by this parser, as returned by {@link #getParameters()}. */
    protected static final ParserField[] PARAMETERS = {XPATH_PARAM, XPATH_NAME_PARAM, CSS_PARAM, CSS_NAME_PARAM};

    /* JADX INFO: Access modifiers changed from: protected */
    /**
     * Lists the parameters understood by this parser: the XPath and CSS
     * selectors together with their name parameters.
     *
     * @return the shared {@code PARAMETERS} array (not a copy)
     */
    @Override
    public ParserField[] getParameters() {
        return PARAMETERS;
    }

    /* JADX INFO: Access modifiers changed from: protected */
    /**
     * Lists the fields this parser declares as its output.
     *
     * @return the shared {@code FIELDS} array (not a copy)
     */
    @Override
    public ParserField[] getFields() {
        return FIELDS;
    }

    /**
     * Adds the page title to the {@code TITLE} field.
     * Nothing is added when the page reports a null title text.
     */
    private void extractTitle(HtmlPage htmlPage, ParserDocument parserDocument) {
        final String pageTitle = htmlPage.getTitleText();
        if (pageTitle == null) {
            return;
        }
        parserDocument.add(TITLE, pageTitle);
    }

    /**
     * Collects the text of every h1..h6 element below {@code htmlElement}
     * into the matching {@code H1}..{@code H6} field, processing levels in
     * ascending order.
     */
    private void extractHeaders(HtmlElement htmlElement, ParserDocument parserDocument) {
        // Parallel tables: headerTags[i] is the tag whose text goes into headerFields[i].
        final ParserField[] headerFields = {H1, H2, H3, H4, H5, H6};
        final String[] headerTags = {"h1", "h2", "h3", "h4", "h5", "h6"};
        for (int level = 0; level < headerFields.length; level++) {
            addToField(parserDocument, headerFields[level], htmlElement.getElementsByTagName(headerTags[level]));
        }
    }

    /**
     * Adds the {@code href} attribute of every anchor on the page to the
     * {@code ANCHORS} field. Does nothing when the page returns no anchor list.
     */
    private void extractAnchors(HtmlPage htmlPage, ParserDocument parserDocument) {
        final List<HtmlAnchor> links = htmlPage.getAnchors();
        if (links != null) {
            for (HtmlAnchor link : links) {
                parserDocument.add(ANCHORS, link.getHrefAttribute());
            }
        }
    }

    /**
     * Adds one map per {@code img} element to the {@code IMAGES} field.
     * Each map holds the non-empty {@code src} and {@code alt} attributes, in
     * that order; images with neither attribute are skipped.
     */
    private void extractImgTags(HtmlPage htmlPage, ParserDocument parserDocument) {
        final DomNodeList<? extends DomElement> images = htmlPage.getElementsByTagName("img");
        if (images == null) {
            return;
        }
        for (DomElement image : images) {
            final Map<String, String> attributes = new LinkedHashMap<>();
            addToMap(attributes, "src", image.getAttribute("src"));
            addToMap(attributes, "alt", image.getAttribute("alt"));
            if (attributes.isEmpty()) {
                continue;
            }
            parserDocument.add(IMAGES, attributes);
        }
    }

    /**
     * Splits the textual rendering of the page into lines and adds every
     * non-empty trimmed line to the {@code CONTENT} field, then records the
     * result of language detection in {@code LANG_DETECTION}.
     *
     * @throws IOException declared for the line-collecting helper
     */
    private void extractTextContent(HtmlPage htmlPage, ParserDocument parserDocument) throws IOException {
        final String pageText = htmlPage.asText();
        if (pageText == null) {
            return;
        }
        final List<String> lines = new ArrayList<>();
        StringUtils.linesCollector(pageText, false, lines);
        for (String line : lines) {
            final String trimmed = line.trim();
            if (StringUtils.isEmpty(trimmed)) {
                continue;
            }
            parserDocument.add(CONTENT, trimmed);
        }
        // NOTE(review): 10000 is presumably a size limit for detection - confirm in ParserAbstract.
        parserDocument.add(LANG_DETECTION, languageDetection(CONTENT, 10000));
    }

    /**
     * Gathers the {@code meta} elements of the page head into a single
     * name-to-content map and adds it to the {@code METAS} field.
     * Metas missing a name or a content are ignored; nothing is added when
     * the page has no head, no meta list, or no usable meta.
     */
    private void extractMeta(HtmlPage htmlPage, ParserDocument parserDocument) {
        final HtmlElement head = htmlPage.getHead();
        if (head == null) {
            return;
        }
        final DomNodeList<? extends DomElement> metaTags = head.getElementsByTagName("meta");
        if (metaTags == null) {
            return;
        }
        final Map<String, String> metas = new LinkedHashMap<>();
        for (DomElement meta : metaTags) {
            final String name = meta.getAttribute("name");
            final String content = meta.getAttribute("content");
            if (StringUtils.isEmpty(name) || StringUtils.isEmpty(content)) {
                continue;
            }
            // A later meta with the same name replaces the earlier value.
            metas.put(name, content);
        }
        if (!metas.isEmpty()) {
            parserDocument.add(METAS, metas);
        }
    }

    /**
     * Converts selector results to strings: {@link HtmlElement}s through
     * {@code asText()}, anything else through {@code toString()}.
     *
     * @param list the selector results, may be null
     * @return a new list with one string per result; empty when {@code list} is null
     */
    private final List<String> dumpSelectors(List<?> list) {
        final List<String> texts = new ArrayList<>();
        if (list != null) {
            for (Object node : list) {
                if (node instanceof HtmlElement) {
                    texts.add(((HtmlElement) node).asText());
                } else {
                    texts.add(node.toString());
                }
            }
        }
        return texts;
    }

    /**
     * Evaluates every XPath selector passed as parameter, in index order,
     * and adds one single-entry map per selector to the {@code XPATH} field.
     * The entry key is the matching selector name, or the selector index
     * when no name was supplied.
     *
     * @return the number of XPath selectors evaluated
     */
    private final int extractXPath(HtmlPage htmlPage, ParserDocument parserDocument) {
        int count = 0;
        String selector;
        // The first missing selector index ends the scan.
        while ((selector = getParameterValue(XPATH_PARAM, count)) != null) {
            final String name = getParameterValue(XPATH_NAME_PARAM, count);
            final String key = name != null ? name : Integer.toString(count);
            final Map<String, List<String>> result = new LinkedHashMap<>();
            result.put(key, dumpSelectors(htmlPage.getByXPath(selector)));
            parserDocument.add(XPATH, result);
            count++;
        }
        return count;
    }

    /**
     * Evaluates every CSS selector passed as parameter, in index order,
     * and adds one single-entry map per selector to the {@code CSS} field.
     * The entry key is the matching selector name, or the selector index
     * when no name was supplied.
     *
     * @return the number of CSS selectors evaluated
     */
    private final int extractCss(HtmlPage htmlPage, ParserDocument parserDocument) {
        int count = 0;
        String selector;
        // The first missing selector index ends the scan.
        while ((selector = getParameterValue(CSS_PARAM, count)) != null) {
            final String name = getParameterValue(CSS_NAME_PARAM, count);
            final String key = name != null ? name : Integer.toString(count);
            final Map<String, List<String>> result = new LinkedHashMap<>();
            result.put(key, dumpSelectors(htmlPage.querySelectorAll(selector)));
            parserDocument.add(CSS, result);
            count++;
        }
        return count;
    }

    /**
     * Loads {@code file} with an HtmlUnit client and fills a new parser document.
     * <p>
     * Redirects, JavaScript and CSS processing are disabled, and failing
     * status codes or script errors do not raise exceptions. If the loaded
     * page is not an HTML page nothing is extracted. When at least one XPath
     * or CSS selector parameter is present only the selector results are
     * produced; otherwise the title, headers, anchors, images, text content
     * and metas are extracted.
     *
     * @param file the HTML file to parse
     * @param str  not used by this method
     * @param str2 not used by this method
     * @throws Exception if the page cannot be loaded or an extraction step fails
     */
    @Override
    protected void parseContent(File file, String str, String str2) throws Exception {
        // try-with-resources closes the client exactly once on every exit path.
        try (WebClient webClient = new WebClient()) {
            final WebClientOptions options = webClient.getOptions();
            options.setRedirectEnabled(false);
            options.setJavaScriptEnabled(false);
            options.setCssEnabled(false);
            options.setThrowExceptionOnFailingStatusCode(false);
            options.setThrowExceptionOnScriptError(false);

            final Page page = webClient.getPage(file.toURI().toURL());
            if (!page.isHtmlPage()) {
                return;
            }
            final HtmlPage htmlPage = (HtmlPage) page;
            final ParserDocument newParserDocument = getNewParserDocument();
            final HtmlElement documentElement = htmlPage.getDocumentElement();

            // Selectors are always evaluated (XPath first, then CSS); the default
            // extraction only runs when no selector of either kind was supplied.
            final int selectorCount = extractXPath(htmlPage, newParserDocument) + extractCss(htmlPage, newParserDocument);
            if (selectorCount == 0) {
                extractTitle(htmlPage, newParserDocument);
                extractHeaders(documentElement, newParserDocument);
                extractAnchors(htmlPage, newParserDocument);
                extractImgTags(htmlPage, newParserDocument);
                extractTextContent(htmlPage, newParserDocument);
                extractMeta(htmlPage, newParserDocument);
            }
        }
    }

    /**
     * Puts {@code str2} into {@code map} under the key {@code str}, unless
     * {@code StringUtils.isEmpty} reports the value as empty.
     */
    private void addToMap(Map<String, String> map, String str, String str2) {
        if (!StringUtils.isEmpty(str2)) {
            map.put(str, str2);
        }
    }

    /**
     * Adds the text of each element in {@code domNodeList} to
     * {@code parserField}. A null list is ignored.
     */
    private void addToField(ParserDocument parserDocument, ParserField parserField, DomNodeList<HtmlElement> domNodeList) {
        if (domNodeList != null) {
            for (HtmlElement element : domNodeList) {
                parserDocument.add(parserField, element.asText());
            }
        }
    }

    /**
     * Parses HTML from a stream by copying it to a temporary file and
     * delegating to the file-based {@code parseContent}. The temporary file
     * is deleted afterwards, whether or not parsing succeeded.
     *
     * @param inputStream the HTML content
     * @param str         used, prefixed with a dot, as the temporary file suffix;
     *                    {@code "page.html"} is used when it is null
     * @param str2        passed through unchanged to the file-based overload
     * @throws Exception if the temporary file cannot be created or parsing fails
     */
    @Override
    protected void parseContent(InputStream inputStream, String str, String str2) throws Exception {
        final String suffix = str == null ? "page.html" : "." + str;
        final File tempFile = ParserAbstract.createTempFile(inputStream, suffix);
        try {
            parseContent(tempFile, str, str2);
        } finally {
            // Best effort: the result of delete() is deliberately not checked.
            tempFile.delete();
        }
    }

    /* JADX INFO: Access modifiers changed from: protected */
    /**
     * Lists the file extensions this parser declares by default.
     *
     * @return the shared {@code DEFAULT_EXTENSIONS} array (not a copy)
     */
    @Override
    public String[] getDefaultExtensions() {
        return DEFAULT_EXTENSIONS;
    }

    /* JADX INFO: Access modifiers changed from: protected */
    /**
     * Lists the MIME types this parser declares by default.
     *
     * @return the shared {@code DEFAULT_MIMETYPES} array (not a copy)
     */
    @Override
    public String[] getDefaultMimeTypes() {
        return DEFAULT_MIMETYPES;
    }
}
