package lt.tokenmill.crawling.parser;

import com.google.common.base.Strings;
import com.google.common.collect.Lists;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.stream.Collectors;
import lt.tokenmill.crawling.data.HttpArticle;
import lt.tokenmill.crawling.data.HttpArticleParseResult;
import lt.tokenmill.crawling.data.HttpSource;
import lt.tokenmill.crawling.parser.data.MatchedDate;
import lt.tokenmill.crawling.parser.data.MatchedString;
import lt.tokenmill.crawling.parser.utils.HttpSourceTester;
import lt.tokenmill.crawling.parser.utils.JsonLdParser;
import lt.tokenmill.crawling.parser.utils.TextFilters;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

/* loaded from: input_file:lt/tokenmill/crawling/parser/ArticleExtractor.class */
/**
 * Extracts article fields (title, text, publication date) from an HTML page
 * using the selectors configured on an {@link HttpSource}, falling back to
 * JSON-LD data, meta tags and generic HTML elements when no selector matches.
 */
public class ArticleExtractor {

    /** Match label recorded for values taken from the page's JSON-LD data. */
    private static final String LD_JSON_MATCH = "LD+JSON";

    /** Match label recorded for a publication date supplied by the caller. */
    private static final String HINT_MATCH = "HINT";

    /** Selector used for the article body when no text selector matches. */
    private static final String ARTICLE_BODY_SELECTOR = "[itemprop*=articleBody] p";

    /**
     * Extracts an article and discards the matching details.
     *
     * @param str        HTML of the page
     * @param str2       URL of the page; also used as the base URI when parsing
     * @param httpSource source configuration providing selectors and metadata
     * @param str3       optional publication date hint, may be {@code null}
     * @return the article held by the result of
     *         {@link #extractArticleWithDetails(String, String, HttpSource, String)}
     */
    public static HttpArticle extractArticle(String str, String str2, HttpSource httpSource, String str3) {
        return extractArticleWithDetails(str, str2, httpSource, str3).getArticle();
    }

    /**
     * Extracts an article together with the details of what matched each field.
     *
     * @param str        HTML of the page
     * @param str2       URL of the page; also used as the base URI when parsing
     * @param httpSource source configuration providing selectors and metadata
     * @param str3       optional publication date hint, may be {@code null}
     * @return parse result holding the article plus the title, text and
     *         publication date matches
     */
    public static HttpArticleParseResult extractArticleWithDetails(String str, String str2, HttpSource httpSource, String str3) {
        Document document = Jsoup.parse(str, str2);
        HttpArticleParseResult result = new HttpArticleParseResult();

        HttpArticle article = new HttpArticle();
        article.setUrl(str2);
        article.setSource(httpSource.getUrl());
        article.setAppIds(httpSource.getAppIds());
        article.setCategories(httpSource.getCategories());

        JsonLdParser.JsonLdArticle jsonLdArticle = JsonLdParser.parse(JsonLdParser.extractJsonLdParts(document));

        List<MatchedString> titles = extractTitlesWithJsoup(document, jsonLdArticle, httpSource);
        article.setTitle(titles.stream()
                .map(MatchedString::getValue)
                .collect(Collectors.joining("\n")));
        result.setTitleMatches(titles.stream()
                .map(MatchedString::getMatch)
                .collect(Collectors.toList()));

        List<MatchedString> texts = extractTextsWithJsoup(document, httpSource);
        article.setText(texts.stream()
                .map(MatchedString::getValue)
                .map(value -> TextFilters.normalizeText(value, httpSource.getTextNormalizers()))
                .collect(Collectors.joining("\n")));
        // Several elements can match the same selector; report each selector once.
        result.setTextMatches(texts.stream()
                .map(MatchedString::getMatch)
                .distinct()
                .collect(Collectors.toList()));

        List<MatchedDate> dates = extractPublicationDates(document, jsonLdArticle, httpSource, str3);
        // The first candidate that carries a date wins.
        MatchedDate published = dates.stream()
                .filter(candidate -> candidate.getDate() != null)
                .findFirst()
                .orElse(null);
        article.setPublished(published != null ? published.getDate() : null);
        result.setPublishedPattern(published != null ? published.getPattern() : null);
        // When no candidate carries a date, report every candidate that was tried.
        result.setPublishedTexts(published != null
                ? Lists.newArrayList(published.getValue())
                : dates.stream().map(MatchedDate::getValue).collect(Collectors.toList()));
        result.setPublishedMatches(published != null
                ? Lists.newArrayList(published.getMatch())
                : dates.stream().map(MatchedDate::getMatch).collect(Collectors.toList()));

        result.setArticle(article);
        return result;
    }

    /**
     * Collects publication date candidates in priority order: caller hint,
     * configured date selectors, JSON-LD, meta tags, then properties. Each
     * candidate is passed through {@link DateParser#parse}.
     *
     * @param document      parsed page
     * @param jsonLdArticle JSON-LD data of the page, may be {@code null}
     * @param httpSource    source configuration providing the date selectors
     * @param publishedHint optional date supplied by the caller, may be {@code null}
     * @return results of {@code DateParser.parse}, in priority order, with
     *         {@code null} results dropped
     */
    private static List<MatchedDate> extractPublicationDates(Document document, JsonLdParser.JsonLdArticle jsonLdArticle, HttpSource httpSource, String publishedHint) {
        List<MatchedDate> candidates = Lists.newArrayList();
        if (publishedHint != null) {
            candidates.add(new MatchedDate(publishedHint, HINT_MATCH));
        }
        for (String selector : httpSource.getDateSelectors()) {
            document.select(selector)
                    .forEach(element -> candidates.add(new MatchedDate(element.text(), selector)));
        }
        if (jsonLdArticle != null && !Strings.isNullOrEmpty(jsonLdArticle.getDatePublished())) {
            candidates.add(new MatchedDate(jsonLdArticle.getDatePublished(), LD_JSON_MATCH));
        }
        candidates.addAll(DateParser.extractFromMeta(document));
        candidates.addAll(DateParser.extractFromProperties(document));
        return candidates.stream()
                .map(candidate -> DateParser.parse(candidate, httpSource))
                .filter(parsed -> parsed != null)
                .collect(Collectors.toList());
    }

    /**
     * Extracts the article text. Uses the configured text selectors first;
     * if none match, falls back to the {@code articleBody} paragraphs and
     * finally to every {@code <p>} element.
     *
     * @param document   parsed page
     * @param httpSource source configuration providing the text selectors
     * @return matched text fragments with the selector that produced each
     */
    private static List<MatchedString> extractTextsWithJsoup(Document document, HttpSource httpSource) {
        List<MatchedString> matches = Lists.newArrayList();
        for (String selector : httpSource.getTextSelectors()) {
            document.select(selector)
                    .forEach(element -> matches.add(new MatchedString(element.text(), selector)));
        }
        if (!matches.isEmpty()) {
            return matches;
        }

        String articleBody = document.select(ARTICLE_BODY_SELECTOR).text();
        if (articleBody != null && !articleBody.trim().isEmpty()) {
            return Lists.newArrayList(new MatchedString(articleBody, ARTICLE_BODY_SELECTOR));
        }
        return document.select("p").stream()
                .map(element -> new MatchedString(element.text(), "p"))
                .collect(Collectors.toList());
    }

    /**
     * Extracts the article title. When title selectors are configured only
     * they are consulted; otherwise the JSON-LD headline and meta tags are
     * used. If nothing was found, {@code h1} elements and then the
     * {@link HttpSourceTester#TITLE} selector are used.
     *
     * @param document      parsed page
     * @param jsonLdArticle JSON-LD data of the page, may be {@code null}
     * @param httpSource    source configuration providing the title selectors
     * @return a list with at most one cleaned-up title candidate
     */
    private static List<MatchedString> extractTitlesWithJsoup(Document document, JsonLdParser.JsonLdArticle jsonLdArticle, HttpSource httpSource) {
        List<MatchedString> candidates = Lists.newArrayList();
        if (!httpSource.getTitleSelectors().isEmpty()) {
            for (String selector : httpSource.getTitleSelectors()) {
                document.select(selector)
                        .forEach(element -> candidates.add(new MatchedString(element.text(), selector)));
            }
        } else {
            // Only a non-empty headline is a usable title; a null or empty one
            // would otherwise win over the meta tag candidates below.
            if (jsonLdArticle != null && !Strings.isNullOrEmpty(jsonLdArticle.getHeadline())) {
                candidates.add(new MatchedString(jsonLdArticle.getHeadline(), LD_JSON_MATCH));
            }
            candidates.addAll(TitleParser.extractFromMeta(document));
        }
        if (candidates.isEmpty()) {
            document.select("h1")
                    .forEach(element -> candidates.add(new MatchedString(element.text(), "h1")));
            document.select(HttpSourceTester.TITLE)
                    .forEach(element -> candidates.add(new MatchedString(element.text(), HttpSourceTester.TITLE)));
        }
        return candidates.stream()
                // Skip candidates without a value instead of failing on them.
                .filter(candidate -> candidate.getValue() != null)
                .map(ArticleExtractor::stripTitleSuffix)
                // NOTE(review): "${" presumably marks an unrendered template placeholder - confirm.
                .filter(candidate -> !candidate.getValue().contains("${"))
                .distinct()
                .limit(1L)
                .collect(Collectors.toList());
    }

    /**
     * Removes a {@code |}-separated suffix from the candidate's value in place:
     * everything from the whitespace before a {@code |} to the end of the line.
     *
     * @param candidate title candidate with a non-null value
     * @return the same candidate instance, for use in a stream pipeline
     */
    private static MatchedString stripTitleSuffix(MatchedString candidate) {
        candidate.setValue(candidate.getValue().replaceAll("\\s*\\|.+", ""));
        return candidate;
    }
}
