package com.bytegriffin.get4j.net.http;

import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONException;
import com.alibaba.fastjson.JSONObject;
import com.bytegriffin.get4j.conf.DefaultConfig;
import com.bytegriffin.get4j.core.ExceptionCatcher;
import com.bytegriffin.get4j.core.Globals;
import com.bytegriffin.get4j.core.Page;
import com.bytegriffin.get4j.core.UrlQueue;
import com.bytegriffin.get4j.fetch.FetchResourceSelector;
import com.bytegriffin.get4j.send.EmailSender;
import com.google.common.base.Splitter;
import com.google.common.base.Strings;
import com.google.common.collect.Maps;
import com.google.common.collect.Sets;
import com.jayway.jsonpath.JsonPath;
import com.jayway.jsonpath.PathNotFoundException;
import com.jayway.jsonpath.Predicate;
import java.io.UnsupportedEncodingException;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URLDecoder;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import org.apache.commons.net.SocketClient;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.apache.lucene.analysis.fa.PersianAnalyzer;
import org.apache.lucene.analysis.shingle.ShingleFilter;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Attribute;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.parser.Parser;
import org.jsoup.select.Elements;

/* loaded from: input_file:com/bytegriffin/get4j/net/http/UrlAnalyzer.class */
public final class UrlAnalyzer {
    /** Page the instance methods operate on; assigned through {@link #setPage(Page)}. */
    private Page page;
    private static final Logger logger = LogManager.getLogger((Class<?>) UrlAnalyzer.class);
    /**
     * Eagerly created shared instance of this class.
     * NOTE(review): the instance carries the mutable {@code page} field, so any code that
     * reuses it from several threads shares that state.
     */
    private static final UrlAnalyzer singleton = new UrlAnalyzer();

    /** Not publicly constructible; {@link #custom(Page)} is the public entry point. */
    private UrlAnalyzer() {
    }

    /**
     * Stores the page that the instance methods will analyze.
     *
     * @param page the page to bind to this analyzer
     * @return this analyzer, so the call can be chained
     */
    private UrlAnalyzer setPage(Page page) {
        this.page = page;
        return this;
    }

    /**
     * Creates an analyzer bound to the given page.
     *
     * <p>A fresh instance is returned on every call. Binding the page to one shared instance
     * would let concurrent callers overwrite each other's page between this call and the
     * sniffing/mapping call that follows it.
     *
     * @param page the page to analyze
     * @return an analyzer whose instance methods operate on {@code page}
     */
    public static UrlAnalyzer custom(Page page) {
        return new UrlAnalyzer().setPage(page);
    }

    /**
     * Parses the given markup with jsoup and returns the document title.
     *
     * @param str HTML source to parse
     * @return the value of {@code Document.title()} for the parsed markup
     */
    public static String getTitle(String str) {
        Document parsed = Jsoup.parse(str);
        return parsed.title();
    }

    /**
     * Removes every occurrence of the configured left and right list-url markers
     * ({@code DefaultConfig.fetch_list_url_left} / {@code fetch_list_url_right}) and trims the result.
     *
     * @param str the url text containing the markers
     * @return {@code str} without the markers and without surrounding whitespace
     */
    public static String formatListDetailUrl(String str) {
        String withoutLeft = str.replace(DefaultConfig.fetch_list_url_left, "");
        String withoutMarkers = withoutLeft.replace(DefaultConfig.fetch_list_url_right, "");
        return withoutMarkers.trim();
    }

    /**
     * Extracts text from a page using a selector whose syntax depends on the page content type.
     *
     * <ul>
     *   <li>HTML: {@code str} is a CSS selector; the {@code src} attribute is returned when the
     *       selector contains {@code [src]}, the {@code href} attribute when it contains
     *       {@code [href]}, otherwise the combined text of the matches.</li>
     *   <li>JSON: {@code str} is a JsonPath expression; the match is returned as a string.</li>
     *   <li>XML: {@code str} is a CSS selector. When it contains a {@code [...]} section, the text
     *       between the first {@code [} and the last {@code ]} is used as an attribute name and the
     *       first match that has that attribute (or, failing that, non-blank text) supplies the
     *       value; otherwise the combined text of the matches is returned.</li>
     * </ul>
     *
     * @param page the page to read from
     * @param str  the selector / JsonPath expression
     * @return the extracted text, or {@code null} when {@code str} is empty, nothing usable was
     *         found, the JsonPath did not resolve, or the page is none of the three content types
     */
    public static String selectPageText(Page page, String str) {
        if (Strings.isNullOrEmpty(str)) {
            return null;
        }
        if (page.isHtmlContent()) {
            Elements matches = Jsoup.parse(page.getHtmlContent(), page.getUrl()).select(str);
            if (str.contains("[src]")) {
                return matches.attr("src");
            }
            if (str.contains("[href]")) {
                return matches.attr("href");
            }
            return matches.text();
        }
        if (page.isJsonContent()) {
            try {
                // The path may resolve to a number, boolean or collection; a blind (String) cast
                // would throw ClassCastException for those, so convert explicitly instead.
                Object value = JsonPath.read(page.getJsonContent(), str, new Predicate[0]);
                return value == null ? null : value.toString();
            } catch (PathNotFoundException e) {
                EmailSender.sendMail(e);
                ExceptionCatcher.addException(page.getSeedName(), e);
                logger.error("种子[" + page.getSeedName() + "]在使用Jsonpath[" + str + "]定位解析Json字符串时出错，", e);
                return null;
            }
        }
        if (page.isXmlContent()) {
            Document xml = Jsoup.parse(page.getXmlContent(), "", Parser.xmlParser());
            int open = str.indexOf('[');
            int close = str.lastIndexOf(']');
            // Only slice when the brackets are well ordered; "a]b[c" must not throw.
            String attributeName = (open >= 0 && close > open) ? str.substring(open + 1, close) : "";
            Elements matches = xml.select(str);
            if (attributeName.isEmpty()) {
                return matches.text();
            }
            for (Element match : matches) {
                if (match.hasAttr(attributeName)) {
                    return match.attr(attributeName).trim();
                }
                String text = match.text().trim();
                if (!text.isEmpty()) {
                    return text;
                }
            }
        }
        return null;
    }

    /**
     * Extracts the markup (not just the text) matched by a selector.
     *
     * <p>For HTML and XML pages {@code str} is a CSS selector and the string form of the matched
     * elements is returned. For JSON pages {@code str} is a JsonPath expression and the match is
     * returned as a string.
     *
     * @param page the page to read from
     * @param str  the selector / JsonPath expression
     * @return the matched content, or {@code null} when {@code str} is empty, the JsonPath did not
     *         resolve, or the page is none of the three content types
     */
    public static String selectPageContent(Page page, String str) {
        if (Strings.isNullOrEmpty(str)) {
            return null;
        }
        if (page.isHtmlContent()) {
            return Jsoup.parse(page.getHtmlContent(), page.getUrl()).select(str).toString();
        }
        if (page.isJsonContent()) {
            try {
                // The path may resolve to a non-string value; convert instead of casting so a
                // numeric or structured match does not raise ClassCastException.
                Object value = JsonPath.read(page.getJsonContent(), str, new Predicate[0]);
                return value == null ? null : value.toString();
            } catch (PathNotFoundException e) {
                EmailSender.sendMail(e);
                ExceptionCatcher.addException(page.getSeedName(), e);
                logger.error("种子[" + page.getSeedName() + "]在使用Jsonpath[" + str + "]定位解析Json字符串时出错，", e);
                return null;
            }
        }
        if (page.isXmlContent()) {
            return Jsoup.parse(page.getXmlContent(), "", Parser.xmlParser()).select(str).toString();
        }
        return null;
    }

    /**
     * Adds the absolute form of every {@code <option value="...">} that is an http(s) url.
     *
     * @param document parsed page to scan
     * @param hashSet  set that receives the urls
     */
    private void addOptionUrl(Document document, HashSet<String> hashSet) {
        for (Element option : document.select("option[value]")) {
            String target = option.absUrl("value");
            if (!isStartHttpUrl(target)) {
                continue;
            }
            hashSet.add(target);
        }
    }

    /**
     * Collects every link found in the current page, regardless of the site it points to.
     *
     * <p>For HTML pages, links taken from anchors, frames, iframes and image-map areas are split
     * in two: urls that {@code FetchResourceSelector.isFindResources} accepts are added to the
     * page's resource set, the rest are returned. Urls in {@code <option value>} are returned as
     * well. For JSON and XML pages the non-resource urls found in the content are returned.
     *
     * @return the set of sniffed links (never {@code null})
     */
    public final HashSet<String> sniffAllLinks() {
        HashSet<String> links = Sets.newHashSet();
        if (this.page.isHtmlContent()) {
            Document doc = Jsoup.parse(this.page.getHtmlContent(), this.page.getUrl());
            // <area> elements carry their target in href (they have no src attribute), so the
            // selector must be area[href] for image-map links to be picked up at all.
            Elements linkElements = doc.select("a[href], frame[src], iframe[src], area[href]");
            HashSet<String> resources = this.page.getResources();
            for (String candidate : getAllUrlByElement(linkElements)) {
                if (FetchResourceSelector.isFindResources(candidate)) {
                    resources.add(candidate);
                } else {
                    links.add(candidate);
                }
            }
            addOptionUrl(doc, links);
        } else if (this.page.isJsonContent()) {
            links.addAll(sniffUrlFromJson(false));
        } else if (this.page.isXmlContent()) {
            links.addAll(sniffUrlFromXml(false));
        }
        return links;
    }

    /**
     * Collects the links of the current page that stay inside the page's own site.
     *
     * <p>The site prefix is the page url's authority followed by its path up to and including the
     * last {@code /}. For HTML pages a link is returned when it contains that prefix and is not a
     * resource according to {@code FetchResourceSelector.isFindResources}; every other link is
     * added to the page's resource set. Urls in {@code <option value>} are returned as well.
     * For JSON and XML pages the non-resource urls found in the content are returned.
     *
     * <p>NOTE(review): off-site links that are not resources also end up in the page's resource
     * set; this mirrors the existing behavior — confirm it is intended.
     *
     * @return the sniffed links, or {@code null} when the page is HTML but has no content
     */
    public final HashSet<String> sniffSiteLinks() {
        HashSet<String> siteLinks = Sets.newHashSet();
        String pageUrl = this.page.getUrl();
        // Stays empty when the url cannot be parsed; an empty prefix is contained in every link.
        String sitePrefix = "";
        try {
            URI uri = new URI(pageUrl);
            // getAuthority()/getPath() return null for some URIs; treat that as empty rather
            // than failing with an NPE or baking the text "null" into the prefix.
            String authority = uri.getAuthority() == null ? "" : uri.getAuthority();
            String path = uri.getPath() == null ? "" : uri.getPath();
            sitePrefix = authority + path.substring(0, path.lastIndexOf("/") + 1);
        } catch (URISyntaxException e) {
            EmailSender.sendMail(e);
            ExceptionCatcher.addException(this.page.getSeedName(), e);
            logger.error("线程[" + Thread.currentThread().getName() + "]嗅探种子[" + this.page.getSeedName() + "]在嗅探整站链接提取url前缀时出错：", e);
        }
        if (this.page.isHtmlContent()) {
            String html = this.page.getHtmlContent();
            if (Strings.isNullOrEmpty(html)) {
                return null;
            }
            Document doc = Jsoup.parse(html, pageUrl);
            // <area> elements carry their target in href (they have no src attribute).
            Elements linkElements = doc.select("a[href], frame[src], iframe[src], area[href]");
            HashSet<String> resources = this.page.getResources();
            for (String candidate : getAllUrlByElement(linkElements)) {
                if (candidate.contains(sitePrefix) && !FetchResourceSelector.isFindResources(candidate)) {
                    siteLinks.add(candidate);
                } else {
                    resources.add(candidate);
                }
            }
            addOptionUrl(doc, siteLinks);
        } else if (this.page.isJsonContent()) {
            siteLinks.addAll(sniffUrlFromJson(false));
        } else if (this.page.isXmlContent()) {
            siteLinks.addAll(sniffUrlFromXml(false));
        }
        return siteLinks;
    }

    /**
     * Collects detail-page links from the current (list) page using the detail selector cached
     * for the page's seed in {@code Globals.FETCH_DETAIL_SELECT_CACHE}.
     *
     * <p>How the selector is interpreted depends on the content type:
     * <ul>
     *   <li>HTML: a selector starting with {@code DefaultConfig.fetch_detail_json_prefix} is split
     *       on {@code fetch_detail_json_suffix} into a url prefix and an inner selector; each value
     *       selected from the content gets the prefix prepended. Any other selector is a CSS
     *       selector whose matches are converted to absolute urls.</li>
     *   <li>JSON: a selector starting with {@code DefaultConfig.json_path_prefix} is either a plain
     *       JsonPath or {@code jsonPath|cssSelector}, where the CSS selector is applied to each
     *       string the JsonPath yields. A selector that merely contains the prefix is split into
     *       {@code urlPrefix} and JsonPath.</li>
     *   <li>XML: the selector is handed to {@code FetchResourceSelector.xmlSelect}.</li>
     * </ul>
     *
     * @return the detail links; {@code null} when no selector is configured or the HTML content is
     *         empty (the helper methods' own results are returned unchanged and may also be null)
     */
    public final HashSet<String> sniffDetailLinks() {
        String selector = Globals.FETCH_DETAIL_SELECT_CACHE.get(this.page.getSeedName());
        if (Strings.isNullOrEmpty(selector)) {
            return null;
        }
        HashSet<String> links = Sets.newHashSet();
        if (this.page.isHtmlContent()) {
            String html = this.page.getHtmlContent();
            if (Strings.isNullOrEmpty(html)) {
                return null;
            }
            if (selector.startsWith(DefaultConfig.fetch_detail_json_prefix)) {
                List<String> parts = Splitter.on(DefaultConfig.fetch_detail_json_suffix).trimResults().omitEmptyStrings().splitToList(selector);
                // Fall back to an empty prefix: concatenating a null reference would silently
                // produce urls that start with the text "null".
                String urlPrefix = parts.isEmpty() ? "" : parts.get(0).replace(DefaultConfig.fetch_detail_json_prefix, "");
                String innerSelector = parts.size() > 1 ? parts.get(1) : "";
                for (String found : FetchResourceSelector.xmlSelect(html, innerSelector)) {
                    links.add(urlPrefix + found);
                }
            } else {
                links.addAll(getAllUrlByElement(Jsoup.parse(html, this.page.getUrl()).select(selector)));
            }
        } else if (this.page.isJsonContent()) {
            String json = this.page.getJsonContent();
            if (selector.startsWith(DefaultConfig.json_path_prefix)) {
                if (selector.contains("|")) {
                    List<String> parts = Splitter.on("|").trimResults().omitEmptyStrings().splitToList(selector);
                    String jsonPath = parts.size() > 0 ? parts.get(0) : null;
                    String cssSelector = parts.size() > 1 ? parts.get(1) : "";
                    for (String fragment : FetchResourceSelector.jsonPath2List(json, jsonPath, "")) {
                        if (fragment != null) {
                            links.addAll(getAllUrlByElement(Jsoup.parse(fragment, this.page.getUrl()).select(cssSelector)));
                        }
                    }
                } else {
                    links = FetchResourceSelector.jsonPath(json, selector, "");
                }
            } else if (selector.contains(DefaultConfig.json_path_prefix)) {
                // parts[0] is the url prefix, parts[1] the JsonPath without its "$." marker.
                String[] parts = selector.split("\\$.");
                if (parts.length > 1) {
                    links = FetchResourceSelector.jsonPath(json, DefaultConfig.json_path_prefix + parts[1], parts[0]);
                } else {
                    // e.g. a selector that ends with the marker: nothing to evaluate.
                    logger.warn("Detail selector [" + selector + "] of seed [" + this.page.getSeedName() + "] has no JsonPath after its prefix; no detail links sniffed.");
                }
            }
        } else if (this.page.isXmlContent()) {
            links = FetchResourceSelector.xmlSelect(this.page.getXmlContent(), selector);
        }
        return links;
    }

    /**
     * Finds the resource urls of the current page and stores them with {@code page.setResources}.
     *
     * <p>When no resource selector is cached for the seed, or the selector is "config all":
     * HTML pages contribute link/script/img/embed/video/audio/track targets plus anchors whose
     * href matches {@code FetchResourceSelector.BINARY_FILTERS}; JSON and XML pages contribute the
     * urls in their content that count as resources; any other page contributes its own url.
     * When the selector is "config none" the resource set is left empty. Otherwise each configured
     * selector is evaluated against the page (CSS for HTML, {@code urlPrefix$.jsonPath} for JSON,
     * XML select for XML).
     */
    public final void sniffAndSetResources() {
        HashSet<String> resources = Sets.newHashSet();
        FetchResourceSelector resourceSelector = Globals.FETCH_RESOURCE_SELECTOR_CACHE.get(this.page.getSeedName());
        if (resourceSelector == null || resourceSelector.isConfigAll()) {
            if (this.page.isHtmlContent()) {
                Document doc = Jsoup.parse(this.page.getHtmlContent(), this.page.getUrl());
                Elements embedded = doc.select("link[href], script[src], img[src], embed[src], video[src], audio[src], track[src]");
                Elements binaryAnchors = doc.select("a[href~=(?i)." + FetchResourceSelector.BINARY_FILTERS + "]");
                resources.addAll(getAllUrlByElement(embedded));
                resources.addAll(getAllUrlByElement(binaryAnchors));
            } else if (this.page.isJsonContent()) {
                resources.addAll(sniffUrlFromJson(true));
            } else if (this.page.isXmlContent()) {
                resources.addAll(sniffUrlFromXml(true));
            } else {
                // Neither HTML, JSON nor XML: the page itself is treated as the resource.
                resources.add(this.page.getUrl());
            }
        } else if (!resourceSelector.isConfigNone()) {
            List<String> selectors = resourceSelector.getSelectors();
            if (this.page.isHtmlContent()) {
                for (String cssSelector : selectors) {
                    if (!Strings.isNullOrEmpty(cssSelector)) {
                        resources.addAll(resourceSelector.cssSelect(this.page, cssSelector));
                    }
                }
            } else if (this.page.isJsonContent()) {
                for (String jsonSelector : selectors) {
                    if (Strings.isNullOrEmpty(jsonSelector)) {
                        continue;
                    }
                    // parts[0] is the url prefix, parts[1] the JsonPath without its "$." marker.
                    String[] parts = jsonSelector.split("\\$.");
                    if (parts.length < 2) {
                        // A selector without the marker used to abort the whole method with
                        // ArrayIndexOutOfBoundsException; skip it and keep the other selectors.
                        logger.warn("Resource selector [" + jsonSelector + "] of seed [" + this.page.getSeedName() + "] is not of the form prefix$.jsonPath; skipped.");
                        continue;
                    }
                    resources.addAll(FetchResourceSelector.jsonPath(this.page.getJsonContent(), DefaultConfig.json_path_prefix + parts[1], parts[0]));
                }
            } else if (this.page.isXmlContent()) {
                for (String xmlSelector : selectors) {
                    if (!Strings.isNullOrEmpty(xmlSelector)) {
                        resources.addAll(FetchResourceSelector.xmlSelect(this.page.getXmlContent(), xmlSelector));
                    }
                }
            }
        }
        this.page.setResources(resources);
    }

    /**
     * Builds a map from detail-page url to the avatar (image) url that belongs to it.
     *
     * <p>The detail selector comes from {@code Globals.FETCH_DETAIL_SELECT_CACHE} and the
     * container/avatar selectors from {@code Globals.FETCH_RESOURCE_SELECTOR_CACHE}.
     * <ul>
     *   <li>HTML: every element matched by a resource selector is treated as a container; inside it
     *       the detail selector yields the link and {@code img[src]} yields the avatar. When the
     *       first candidate is unusable (empty, {@code #...}, {@code "null"}, javascript/mailto/
     *       about:blank for links) the value from {@code Elements.next()} is tried instead.</li>
     *   <li>JSON / XML: detail links and avatars are selected as two lists and paired by index;
     *       when several resource selectors are configured only the last one's result is used.</li>
     * </ul>
     *
     * @return the mapping; empty when either selector is not configured or nothing was found
     */
    public Map<String, String> mappingDetailLinkAndAvatar() {
        String seedName = this.page.getSeedName();
        String detailSelector = Globals.FETCH_DETAIL_SELECT_CACHE.get(seedName);
        Map<String, String> mapping = Maps.newHashMap();
        FetchResourceSelector resourceSelector = Globals.FETCH_RESOURCE_SELECTOR_CACHE.get(seedName);
        // Without both selectors there is nothing to pair up; dereferencing a missing cache
        // entry would only fail with a NullPointerException.
        if (resourceSelector == null || Strings.isNullOrEmpty(detailSelector)) {
            return mapping;
        }
        List<String> selectors = resourceSelector.getSelectors();
        if (selectors == null) {
            return mapping;
        }
        String pageUrl = this.page.getUrl();
        if (this.page.isHtmlContent()) {
            for (String containerSelector : selectors) {
                if (Strings.isNullOrEmpty(containerSelector)) {
                    continue;
                }
                for (Element container : Jsoup.parse(this.page.getHtmlContent(), pageUrl).select(containerSelector)) {
                    Document fragment = Jsoup.parse(container.childNodes().toString(), pageUrl);
                    Elements linkNodes = fragment.select(detailSelector);
                    Elements imageNodes = fragment.select("img[src]");

                    String href = linkNodes.attr("href");
                    boolean unusableHref = Strings.isNullOrEmpty(href) || href.startsWith("#") || href.equalsIgnoreCase("null")
                            || href.contains("javascript:") || href.contains("mailto:") || href.contains("about:blank");
                    if (unusableHref) {
                        href = linkNodes.next().attr("href");
                    }
                    // Dot-relative references ("./a.html", "../a.html") are passed through as they
                    // are: URI resolution handles them, whereas deleting every '.' from the href
                    // would also destroy file extensions and host names.
                    String detailUrl = getAbsoluteURL(pageUrl, href);

                    String avatar = imageNodes.attr("src");
                    if (Strings.isNullOrEmpty(avatar) || avatar.startsWith("#") || avatar.equalsIgnoreCase("null")) {
                        avatar = imageNodes.next().attr("src");
                    }
                    if (!Strings.isNullOrEmpty(avatar)) {
                        avatar = getAbsoluteURL(pageUrl, avatar);
                    }
                    mapping.put(detailUrl, avatar);
                }
            }
        } else if (this.page.isJsonContent()) {
            String json = this.page.getJsonContent();
            List<String> detailLinks = FetchResourceSelector.jsonPath2List(json, detailSelector, "");
            if (detailLinks == null || detailLinks.isEmpty()) {
                return mapping;
            }
            List<String> avatars = new ArrayList<>();
            for (String avatarSelector : selectors) {
                if (Strings.isNullOrEmpty(avatarSelector)) {
                    continue;
                }
                // parts[0] is the url prefix, parts[1] the JsonPath without its "$." marker.
                String[] parts = avatarSelector.split("\\$.");
                if (parts.length < 2) {
                    logger.warn("Resource selector [" + avatarSelector + "] of seed [" + seedName + "] is not of the form prefix$.jsonPath; skipped.");
                    continue;
                }
                avatars = FetchResourceSelector.jsonPath2List(json, DefaultConfig.json_path_prefix + parts[1], parts[0]);
            }
            setDetailLinkAvatarMapping(detailLinks, avatars, mapping);
        } else if (this.page.isXmlContent()) {
            String xml = this.page.getXmlContent();
            List<String> detailLinks = FetchResourceSelector.xmlSelect2List(xml, detailSelector);
            if (detailLinks == null || detailLinks.isEmpty()) {
                return mapping;
            }
            List<String> avatars = new ArrayList<>();
            for (String avatarSelector : selectors) {
                avatars = FetchResourceSelector.xmlSelect2List(xml, avatarSelector);
            }
            setDetailLinkAvatarMapping(detailLinks, avatars, mapping);
        }
        return mapping;
    }

    /**
     * Pairs detail links with avatars by index and stores the absolute urls in {@code map}.
     *
     * <p>Nothing is stored when {@code list2} is null/empty or the two lists differ in size.
     * Unusable links (empty, starting with {@code #}, {@code "null"}, javascript/mailto/
     * about:blank) are skipped; an unusable avatar is stored as {@code null}.
     *
     * @param list  detail links as found in the page
     * @param list2 avatar urls as found in the page, index-aligned with {@code list}
     * @param map   receives absolute detail url -> absolute avatar url (or null)
     */
    private void setDetailLinkAvatarMapping(List<String> list, List<String> list2, Map<String, String> map) {
        if (list2 == null || list2.isEmpty() || list.size() != list2.size()) {
            return;
        }
        String pageUrl = this.page.getUrl();
        for (int i = 0; i < list.size(); i++) {
            String link = list.get(i);
            boolean unusableLink = Strings.isNullOrEmpty(link) || link.startsWith("#") || link.equalsIgnoreCase("null")
                    || link.contains("javascript:") || link.contains("mailto:") || link.contains("about:blank");
            if (unusableLink) {
                continue;
            }
            // Dot-relative references ("./a.html", "../a.html") are resolved as they are: URI
            // resolution handles them, whereas deleting every '.' from the link would also
            // destroy file extensions and host names.
            String detailUrl = getAbsoluteURL(pageUrl, link);

            String avatar = list2.get(i);
            boolean unusableAvatar = Strings.isNullOrEmpty(avatar) || avatar.startsWith("#") || avatar.equalsIgnoreCase("null");
            map.put(detailUrl, unusableAvatar ? null : getAbsoluteURL(pageUrl, avatar));
        }
    }

    /**
     * Collects the http(s) urls contained in the page's JSON content.
     *
     * <p>The content is parsed as a JSON object; when that fails it is parsed as a JSON array and
     * wrapped in an object under the key {@code root} so the same traversal can be used.
     *
     * @param z {@code true} to keep only urls for which {@code FetchResourceSelector.isFindResources}
     *          is true, {@code false} to keep only the others
     * @return the matching urls; empty when the content is neither a JSON object nor a JSON array
     */
    private HashSet<String> sniffUrlFromJson(boolean z) {
        HashSet<String> matched = Sets.newHashSet();
        String json = this.page.getJsonContent();
        JSONObject root;
        try {
            root = JSONObject.parseObject(json);
        } catch (JSONException | ClassCastException notAnObject) {
            try {
                // Close the wrapper with a literal brace; it must not depend on an unrelated
                // url-format configuration constant.
                root = JSONArray.parseObject("{ 'root':" + JSONArray.parseArray(json).toJSONString() + "}");
            } catch (JSONException | ClassCastException notAnArray) {
                EmailSender.sendMail(notAnArray);
                ExceptionCatcher.addException(this.page.getSeedName(), notAnArray);
                logger.error("Content of seed [" + this.page.getSeedName() + "] url [" + this.page.getUrl() + "] is neither a JSON object nor a JSON array.", notAnArray);
                return matched;
            }
        }
        for (String url : travelJson(root, Sets.<String>newHashSet())) {
            if (FetchResourceSelector.isFindResources(url) == z) {
                matched.add(url);
            }
        }
        return matched;
    }

    /**
     * Recursively walks a JSON object and adds every string value that starts with
     * {@code http://} or {@code https://} to {@code hashSet}.
     *
     * <p>Nested objects and arrays (including arrays of strings and arrays of arrays) are
     * traversed; numbers, booleans and nulls are ignored.
     *
     * @param jSONObject object to walk; {@code null} is accepted and adds nothing
     * @param hashSet    accumulator that receives the urls
     * @return {@code hashSet}
     */
    private HashSet<String> travelJson(JSONObject jSONObject, HashSet<String> hashSet) {
        if (jSONObject == null) {
            return hashSet;
        }
        for (Object value : jSONObject.values()) {
            collectJsonUrls(value, hashSet);
        }
        return hashSet;
    }

    /** Adds the urls reachable from a single JSON value (object, array or scalar) to {@code sink}. */
    private void collectJsonUrls(Object value, HashSet<String> sink) {
        if (value instanceof JSONObject) {
            travelJson((JSONObject) value, sink);
        } else if (value instanceof JSONArray) {
            // Elements are dispatched by their runtime type: casting every non-string element to
            // JSONObject fails with ClassCastException for arrays of numbers or booleans.
            for (Object element : (JSONArray) value) {
                collectJsonUrls(element, sink);
            }
        } else if (value instanceof String && isStartHttpUrl((String) value)) {
            sink.add((String) value);
        }
    }

    /**
     * Collects the http(s) urls contained in the page's XML content.
     *
     * @param z {@code true} to keep only urls for which {@code FetchResourceSelector.isFindResources}
     *          is true, {@code false} to keep only the others
     * @return the matching urls
     */
    private HashSet<String> sniffUrlFromXml(boolean z) {
        Document xml = Jsoup.parse(this.page.getXmlContent(), "", Parser.xmlParser());
        HashSet<String> discovered = travelXml(xml.childNodes(), Sets.newHashSet());
        HashSet<String> matched = Sets.newHashSet();
        for (String url : discovered) {
            if (FetchResourceSelector.isFindResources(url) != z) {
                continue;
            }
            matched.add(url);
        }
        return matched;
    }

    /**
     * Recursively walks XML nodes and collects http(s) urls from attribute values and from the
     * string form of leaf nodes. Nodes named {@code #declaration} are skipped.
     *
     * @param list    nodes to walk; {@code null} or empty adds nothing
     * @param hashSet accumulator that receives the urls
     * @return {@code hashSet}
     */
    private HashSet<String> travelXml(List<Node> list, HashSet<String> hashSet) {
        if (list == null || list.size() == 0) {
            return hashSet;
        }
        for (Node current : list) {
            if ("#declaration".equalsIgnoreCase(current.nodeName())) {
                continue;
            }
            // Attributes are inspected for every node, whether or not it has children.
            travelXmlAttributes(current, hashSet);
            if (current.childNodeSize() > 0) {
                travelXml(current.childNodes(), hashSet);
                continue;
            }
            String rendered = current.toString().trim();
            if (isStartHttpUrl(rendered)) {
                hashSet.add(rendered);
            }
        }
        return hashSet;
    }

    /**
     * Adds the trimmed value of every attribute of {@code node} whose value starts with
     * {@code http://} or {@code https://}; attributes whose key starts with {@code xmlns}
     * are ignored.
     *
     * @param node    node whose attributes are inspected
     * @param hashSet accumulator that receives the urls
     */
    private void travelXmlAttributes(Node node, HashSet<String> hashSet) {
        for (Attribute attr : node.attributes().asList()) {
            String name = attr.getKey();
            if (Strings.isNullOrEmpty(name) || name.startsWith("xmlns")) {
                continue;
            }
            String candidate = attr.getValue();
            if (Strings.isNullOrEmpty(candidate) || !isStartHttpUrl(candidate)) {
                continue;
            }
            hashSet.add(candidate.trim());
        }
    }

    /**
     * Converts elements to the set of absolute urls they point to.
     *
     * <p>The url is read from the {@code href} attribute when the element has one, otherwise from
     * {@code src}. Urls are dropped when they are empty, equal to the current page url, start with
     * {@code #}, equal {@code "null"} (ignoring case) or contain {@code javascript:},
     * {@code mailto:} or {@code about:blank}. Every {@code |} is percent-encoded as {@code %7C}.
     *
     * @param elements elements carrying an {@code href} or {@code src} attribute
     * @return the absolute urls; empty when the current page has no url
     */
    public final HashSet<String> getAllUrlByElement(Elements elements) {
        String pageUrl = this.page.getUrl();
        HashSet<String> urls = Sets.newHashSet();
        for (Element element : elements) {
            // Ask the element itself: searching its rendered HTML for the text "href" also hits
            // attribute values and child content, e.g. <img src="/href/a.png">.
            String absUrl = element.absUrl(element.hasAttr("href") ? "href" : "src");
            if (Strings.isNullOrEmpty(absUrl) || Strings.isNullOrEmpty(pageUrl) || pageUrl.equals(absUrl)) {
                continue;
            }
            if (absUrl.startsWith("#") || absUrl.equalsIgnoreCase("null") || absUrl.contains("javascript:")
                    || absUrl.contains("mailto:") || absUrl.contains("about:blank")) {
                continue;
            }
            // "%7C" is the UTF-8 percent-encoding of '|'; a literal avoids a checked exception
            // that can never occur for the UTF-8 charset.
            urls.add(absUrl.replace("|", "%7C"));
        }
        return urls;
    }

    /**
     * Resolves a (possibly relative) reference against a base url.
     *
     * <p>The reference is url-decoded as UTF-8, cut at the first CR-LF, stripped of spaces and
     * then resolved with {@link URI#resolve(String)}. On any failure the reference is recorded
     * through {@code UrlQueue.newFailVisitedUrl}, reported, and {@code null} is returned.
     *
     * @param str  base url
     * @param str2 reference to resolve
     * @return the absolute url, or {@code null} when it could not be built
     */
    private String getAbsoluteURL(String str, String str2) {
        String resolved = null;
        try {
            // Plain literals instead of constants borrowed from unrelated libraries
            // (commons-net "\r\n", Lucene " "); the values are the same.
            str2 = URLDecoder.decode(str2, "UTF-8").split("\r\n")[0];
            resolved = new URI(str.trim()).resolve(str2.replace(" ", "")).toURL().toString();
        } catch (Exception e) {
            // Broad catch on purpose: decoding, URI syntax and URL conversion can all fail here.
            logger.warn("转换相对路径时，发现了非法的url格式：[" + str2 + "]。");
            UrlQueue.newFailVisitedUrl(this.page.getSeedName(), str2);
            EmailSender.sendMail(e);
            ExceptionCatcher.addException(this.page.getSeedName(), e);
        }
        return resolved;
    }

    /**
     * Tells whether the text starts with the {@code http://} or {@code https://} scheme.
     *
     * <p>The comparison ignores case because URI schemes are case-insensitive.
     *
     * @param str text to test; may be {@code null}
     * @return {@code true} when {@code str} starts with an http(s) scheme, {@code false} otherwise
     *         (including for {@code null})
     */
    public static boolean isStartHttpUrl(String str) {
        if (str == null) {
            return false;
        }
        return str.regionMatches(true, 0, "http://", 0, 7)
                || str.regionMatches(true, 0, "https://", 0, 8);
    }

    /**
     * Makes sure a url carries an http(s) scheme, prepending {@code http://} when it has none.
     *
     * <p>Surrounding whitespace is removed before the scheme is checked, so input such as
     * {@code " http://host"} is not prefixed a second time.
     *
     * @param str url text; may be {@code null} or empty
     * @return {@code str} itself when it is null or empty, otherwise the trimmed url with a scheme
     */
    public static String addUrlSchema(String str) {
        if (Strings.isNullOrEmpty(str)) {
            return str;
        }
        String trimmed = str.trim();
        return isStartHttpUrl(trimmed) ? trimmed : "http://" + trimmed;
    }

    /**
     * Removes every trailing {@code #} from a url.
     *
     * @param str url text; may be {@code null}
     * @return {@code str} without trailing {@code #} characters, or {@code null} for {@code null}
     */
    public static String filterUrlPound(String str) {
        if (str == null) {
            return null;
        }
        int end = str.length();
        while (end > 0 && str.charAt(end - 1) == '#') {
            end--;
        }
        return end == str.length() ? str : str.substring(0, end);
    }

    /**
     * Tells whether the page's url is one of the list urls cached for the page's seed in
     * {@code Globals.LIST_URLS_CACHE}.
     *
     * @param page the page to check
     * @return {@code true} when the seed has cached list urls and they contain the page url;
     *         {@code false} otherwise, including when nothing is cached for the seed
     */
    public static boolean isAcessListUrl(Page page) {
        // A seed without cached list urls simply has no list page; do not fail with an NPE.
        return Globals.LIST_URLS_CACHE.get(page.getSeedName()) != null
                && Globals.LIST_URLS_CACHE.get(page.getSeedName()).contains(page.getUrl());
    }
}
