package lt.tokenmill.crawling.crawler.bolt;

import com.digitalpebble.stormcrawler.Metadata;
import com.digitalpebble.stormcrawler.persistence.Status;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.Locale;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import lt.tokenmill.crawling.crawler.CrawlerConstants;
import lt.tokenmill.crawling.crawler.ServiceProvider;
import lt.tokenmill.crawling.crawler.utils.UrlFiltersCache;
import lt.tokenmill.crawling.data.DataUtils;
import lt.tokenmill.crawling.data.HttpArticle;
import lt.tokenmill.crawling.data.HttpSource;
import lt.tokenmill.crawling.es.EsDocumentOperations;
import lt.tokenmill.crawling.es.EsHttpSourceOperations;
import lt.tokenmill.crawling.es.EsHttpSourcesCache;
import lt.tokenmill.crawling.parser.ArticleExtractor;
import org.apache.storm.metric.api.MultiCountMetric;
import org.apache.storm.shade.com.google.common.base.Strings;
import org.apache.storm.shade.org.apache.http.entity.ContentType;
import org.apache.storm.task.OutputCollector;
import org.apache.storm.task.TopologyContext;
import org.apache.storm.topology.OutputFieldsDeclarer;
import org.apache.storm.topology.base.BaseRichBolt;
import org.apache.storm.tuple.Fields;
import org.apache.storm.tuple.Tuple;
import org.apache.storm.tuple.Values;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/* loaded from: input_file:lt/tokenmill/crawling/crawler/bolt/ArticleIndexerBolt.class */
public class ArticleIndexerBolt extends BaseRichBolt {
    private static final Logger LOG = LoggerFactory.getLogger(ArticleIndexerBolt.class);
    private ServiceProvider serviceProvider;
    private OutputCollector collector;
    private MultiCountMetric eventCounter;
    private EsHttpSourceOperations esHttpSourceOperations;
    private EsDocumentOperations esDocumentOperations;
    private Pattern dateInUrl = Pattern.compile(".*(\\d{4}/\\d{2}/\\d{2}).*");

    public void execute(Tuple tuple) {
        String stringByField = tuple.getStringByField("url");
        Metadata metadata = (Metadata) tuple.getValueByField("metadata");
        String firstValue = metadata.getFirstValue(CrawlerConstants.META_SOURCE);
        Boolean valueOf = Boolean.valueOf(Boolean.parseBoolean(metadata.getFirstValue(CrawlerConstants.META_IS_SEED)));
        HttpSource httpSource = EsHttpSourcesCache.get(this.esHttpSourceOperations, firstValue);
        String filter = UrlFiltersCache.get(httpSource).filter(stringByField);
        if (filter == null) {
            LOG.info("Skipping analysis of '{}' because it is rejected by filters", stringByField);
            this.eventCounter.scope("analysis_skipped").incr();
            this.collector.emit(tuple, new Values(new Object[]{stringByField, metadata}));
            this.collector.ack(tuple);
            return;
        }
        if (valueOf.booleanValue()) {
            this.eventCounter.scope("analysis_skipped_seed").incr();
            this.collector.emit(tuple, new Values(new Object[]{stringByField, metadata}));
            this.collector.ack(tuple);
            return;
        }
        byte[] binaryByField = tuple.getBinaryByField("content");
        String str = null;
        String str2 = null;
        try {
            String firstValue2 = metadata.getFirstValue("content-type");
            if (firstValue2 != null) {
                ContentType parse = ContentType.parse(firstValue2);
                str2 = parse.getMimeType();
                str = parse.getCharset().displayName();
            }
        } catch (Exception e) {
            LOG.warn("Failed to get charset and mime type for '{}'", stringByField);
        }
        if (str == null) {
            str = "UTF-8";
        }
        if (binaryByField == null || binaryByField.length == 0) {
            LOG.warn("Skipping url '{}' because it has no content", stringByField);
            this.eventCounter.scope("analysis_empty").incr();
            this.collector.emit("status", tuple, new Values(new Object[]{stringByField, metadata, Status.ERROR}));
        } else if (str2 == null || !str2.toLowerCase().contains("html")) {
            LOG.warn("Ignoring url '{}' because mime-type is '{}'", stringByField, str2);
            this.eventCounter.scope("analysis_wrong_type").incr();
            this.collector.emit("status", tuple, new Values(new Object[]{stringByField, metadata, Status.ERROR}));
        } else {
            try {
                LOG.info("Analyzing url '{}'", stringByField);
                HttpArticle analyze = analyze(stringByField, filter, httpSource, new String(binaryByField, str), metadata);
                if (Strings.isNullOrEmpty(analyze.getTitle()) || Strings.isNullOrEmpty(analyze.getText()) || analyze.getPublished() == null) {
                    LOG.warn("Url '{}' analysis returned incomplete data", stringByField);
                    this.eventCounter.scope("analysis_incomplete").incr();
                    this.collector.emit("status", tuple, new Values(new Object[]{stringByField, metadata, Status.ERROR}));
                } else {
                    this.esDocumentOperations.store(analyze);
                    LOG.info("Stored article '{}'", stringByField);
                    this.eventCounter.scope("analysis_success").incr();
                    this.collector.emit("status", tuple, new Values(new Object[]{stringByField, metadata, Status.FETCHED}));
                }
                this.collector.emit(tuple, new Values(new Object[]{stringByField, metadata}));
            } catch (Exception e2) {
                LOG.error("Failed to analyze '{}'", stringByField, e2);
                this.eventCounter.scope("analysis_error").incr();
                this.collector.emit("status", tuple, new Values(new Object[]{stringByField, metadata, Status.ERROR}));
            }
        }
        this.collector.ack(tuple);
    }

    private HttpArticle analyze(String str, String str2, HttpSource httpSource, String str3, Metadata metadata) throws Exception {
        String firstValue = metadata.getFirstValue(CrawlerConstants.META_PUBLISHED);
        if (firstValue == null) {
            firstValue = metadata.getFirstValue(CrawlerConstants.META_FEED_PUBLISHED);
        }
        if (firstValue == null) {
            Matcher matcher = this.dateInUrl.matcher(str);
            if (matcher.find()) {
                firstValue = matcher.group(1);
            }
        }
        HttpArticle extractArticle = ArticleExtractor.extractArticle(str3, str2, httpSource, firstValue);
        extractArticle.setDiscovered(DataUtils.parseFromUTC(metadata.getFirstValue(CrawlerConstants.META_DISCOVERED)));
        return extractArticle;
    }

    public void prepare(Map map, TopologyContext topologyContext, OutputCollector outputCollector) {
        this.collector = outputCollector;
        this.eventCounter = topologyContext.registerMetric(getClass().getSimpleName(), new MultiCountMetric(), 10);
        this.serviceProvider = new ServiceProvider();
        this.esHttpSourceOperations = this.serviceProvider.createEsHttpSourceOperations(map);
        this.esDocumentOperations = this.serviceProvider.creatEsDocumentOperations(map);
    }

    public void cleanup() {
        super.cleanup();
    }

    public void declareOutputFields(OutputFieldsDeclarer outputFieldsDeclarer) {
        outputFieldsDeclarer.declare(new Fields(new String[]{"url", "metadata"}));
        outputFieldsDeclarer.declareStream("status", new Fields(new String[]{"url", "metadata", "status"}));
    }
}
