package cc.twittertools.download;

import cc.twittertools.corpus.data.HTMLStatusExtractor;
import com.google.common.base.Preconditions;
import com.google.gson.Gson;
import com.google.gson.JsonObject;
import com.google.gson.JsonParser;
import com.google.gson.JsonSyntaxException;
import com.ning.http.client.AsyncCompletionHandler;
import com.ning.http.client.AsyncHandler;
import com.ning.http.client.AsyncHttpClient;
import com.ning.http.client.AsyncHttpClientConfig;
import com.ning.http.client.HttpResponseHeaders;
import com.ning.http.client.HttpResponseStatus;
import com.ning.http.client.Response;
import com.ning.http.client.extra.ThrottleRequestFilter;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.util.Iterator;
import java.util.Map;
import java.util.Timer;
import java.util.TimerTask;
import java.util.concurrent.ConcurrentSkipListMap;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.zip.GZIPOutputStream;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.GnuParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.log4j.Logger;

/* loaded from: input_file:cc/twittertools/download/AsyncHTMLStatusBlockCrawler.class */
public class AsyncHTMLStatusBlockCrawler {
    /** fetch() logs a progress message each time the number of submitted requests is a multiple of this. */
    private static final int TWEET_BLOCK_SIZE = 500;
    /** Argument given to the ThrottleRequestFilter installed on the HTTP client in the constructor. */
    private static final int MAX_CONNECTIONS = 100;
    /** Same value (10000) the constructor passes to setConnectionTimeoutInMs. */
    private static final int CONNECTION_TIMEOUT = 10000;
    /** Same value (10000) the constructor passes to setIdleConnectionInPoolTimeoutInMs. */
    private static final int IDLE_CONNECTION_TIMEOUT = 10000;
    /** Same value (10000) the constructor passes to setRequestTimeoutInMs. */
    private static final int REQUEST_TIMEOUT = 10000;
    /** A failed request is rescheduled only while its retry count is below this value. */
    private static final int MAX_RETRY_ATTEMPTS = 2;
    /** Same value (1000) used as the Timer delay, in milliseconds, when a retry is scheduled. */
    private static final int WAIT_BEFORE_RETRY = 1000;
    /** Input data file; each line is split on tabs into a tweet id and an optional username. */
    private final File file;
    /** Destination for the fetched statuses; written through a GZIPOutputStream. */
    private final File output;
    /** Destination for the input lines of abandoned requests, or null when no repair file was requested. */
    private final File repair;
    /** Client used for every request; closed by fetch() once waiting for outstanding requests is over. */
    private final AsyncHttpClient asyncHttpClient;
    /** When true, handlers are created with followRedirects = false. */
    private final boolean noFollow;
    /** Requested tweet id -> serialized JSON of the fetched status. */
    private final ConcurrentSkipListMap<Long, String> crawl = new ConcurrentSkipListMap<>();
    /** Requested tweet id -> original input line, for requests abandoned after an error or exhausted retries. */
    private final ConcurrentSkipListMap<Long, String> crawl_repair = new ConcurrentSkipListMap<>();
    /** Incremented for every submitted line; decremented when a request succeeds or is abandoned. */
    private final AtomicInteger connections = new AtomicInteger(0);
    /** Command-line option names. */
    private static final String DATA_OPTION = "data";
    private static final String OUTPUT_OPTION = "output";
    private static final String REPAIR_OPTION = "repair";
    private static final String NOFOLLOW_OPTION = "noFollow";
    private static final Logger LOG = Logger.getLogger(AsyncHTMLStatusBlockCrawler.class);
    /** Daemon timer on which retry tasks are scheduled. */
    private static final Timer timer = new Timer(true);
    /** Not referenced anywhere in the visible code. */
    private static final JsonParser JSON_PARSER = new JsonParser();
    /** Serializes extracted statuses to the JSON strings stored in {@link #crawl}. */
    private static final Gson GSON = new Gson();

    /* JADX INFO: Access modifiers changed from: private */
    /* loaded from: input_file:cc/twittertools/download/AsyncHTMLStatusBlockCrawler$TweetFetcherHandler.class */
    public class TweetFetcherHandler extends AsyncCompletionHandler<Response> {
        private final long id;
        private final String username;
        private final String url;
        private final int numRetries;
        private final boolean followRedirects;
        private final String line;
        private int httpStatus = -1;
        private HTMLStatusExtractor extractor = new HTMLStatusExtractor();

        /* JADX INFO: Access modifiers changed from: private */
        /* loaded from: input_file:cc/twittertools/download/AsyncHTMLStatusBlockCrawler$TweetFetcherHandler$RetryTask.class */
        public class RetryTask extends TimerTask {
            private final long id;
            private final String username;
            private final String url;
            private final int numRetries;
            private final boolean followRedirects;

            public RetryTask(long j, String str, String str2, int i, boolean z) {
                this.id = j;
                this.username = str;
                this.url = str2;
                this.numRetries = i;
                this.followRedirects = z;
            }

            @Override // java.util.TimerTask, java.lang.Runnable
            public void run() {
                AsyncHTMLStatusBlockCrawler.this.crawlURL(this.url, new TweetFetcherHandler(this.id, this.username, this.url, this.numRetries, this.followRedirects, TweetFetcherHandler.this.line));
            }
        }

        public TweetFetcherHandler(long j, String str, String str2, int i, boolean z, String str3) {
            this.id = j;
            this.username = str;
            this.url = str2;
            this.numRetries = i;
            this.followRedirects = z;
            this.line = str3;
        }

        public long getId() {
            return this.id;
        }

        public String getLine() {
            return this.line;
        }

        public AsyncHandler.STATE onStatusReceived(HttpResponseStatus httpResponseStatus) throws Exception {
            this.httpStatus = httpResponseStatus.getStatusCode();
            switch (this.httpStatus) {
                case 404:
                    AsyncHTMLStatusBlockCrawler.LOG.warn("Abandoning missing page: " + this.url);
                    AsyncHTMLStatusBlockCrawler.this.connections.decrementAndGet();
                    return AsyncHandler.STATE.ABORT;
                case AsyncHTMLStatusBlockCrawler.TWEET_BLOCK_SIZE /* 500 */:
                    retry();
                    return AsyncHandler.STATE.ABORT;
                default:
                    return super.onStatusReceived(httpResponseStatus);
            }
        }

        public AsyncHandler.STATE onHeadersReceived(HttpResponseHeaders httpResponseHeaders) throws Exception {
            switch (this.httpStatus) {
                case 301:
                case 302:
                    String firstValue = httpResponseHeaders.getHeaders().getFirstValue("Location");
                    if (firstValue.contains("protected_redirect=true")) {
                        AsyncHTMLStatusBlockCrawler.LOG.warn("Abandoning protected account: " + this.url);
                        AsyncHTMLStatusBlockCrawler.this.connections.decrementAndGet();
                    } else if (firstValue.contains("account/suspended")) {
                        AsyncHTMLStatusBlockCrawler.LOG.warn("Abandoning suspended account: " + this.url);
                        AsyncHTMLStatusBlockCrawler.this.connections.decrementAndGet();
                    } else if (firstValue.contains("//status") || firstValue.contains("login?redirect_after_login")) {
                        AsyncHTMLStatusBlockCrawler.LOG.warn("Abandoning deleted account: " + this.url);
                        AsyncHTMLStatusBlockCrawler.this.connections.decrementAndGet();
                    } else if (this.followRedirects) {
                        AsyncHTMLStatusBlockCrawler.this.crawlURL(firstValue, new TweetFetcherHandler(this.id, this.username, firstValue, this.numRetries, this.followRedirects, this.line));
                    } else {
                        AsyncHTMLStatusBlockCrawler.LOG.warn("Abandoning redirect: " + this.url);
                        AsyncHTMLStatusBlockCrawler.this.connections.decrementAndGet();
                    }
                    return AsyncHandler.STATE.ABORT;
                default:
                    return super.onHeadersReceived(httpResponseHeaders);
            }
        }

        /* renamed from: onCompleted, reason: merged with bridge method [inline-methods] */
        public Response m5onCompleted(Response response) {
            switch (this.httpStatus) {
                case -1:
                case 301:
                case 302:
                case 404:
                case AsyncHTMLStatusBlockCrawler.TWEET_BLOCK_SIZE /* 500 */:
                    return response;
                default:
                    try {
                        JsonObject extractTweet = this.extractor.extractTweet(response.getResponseBody("UTF-8"));
                        extractTweet.addProperty("requested_id", new Long(this.id));
                        AsyncHTMLStatusBlockCrawler.this.crawl.put(Long.valueOf(this.id), AsyncHTMLStatusBlockCrawler.GSON.toJson(extractTweet));
                        AsyncHTMLStatusBlockCrawler.this.connections.decrementAndGet();
                        return response;
                    } catch (IOException e) {
                        AsyncHTMLStatusBlockCrawler.LOG.warn("Error (" + e + "): " + this.url);
                        retry();
                        return response;
                    } catch (NullPointerException e2) {
                        AsyncHTMLStatusBlockCrawler.LOG.warn("Unexpected format for embedded JSON: " + this.url);
                        retry();
                        return response;
                    } catch (JsonSyntaxException e3) {
                        AsyncHTMLStatusBlockCrawler.LOG.warn("Unable to parse embedded JSON: " + this.url);
                        retry();
                        return response;
                    }
            }
        }

        public void onThrowable(Throwable th) {
            retry();
        }

        private void retry() {
            if (this.numRetries < AsyncHTMLStatusBlockCrawler.MAX_RETRY_ATTEMPTS) {
                AsyncHTMLStatusBlockCrawler.timer.schedule(new RetryTask(this.id, this.username, this.url, this.numRetries + 1, this.followRedirects), 1000L);
                return;
            }
            AsyncHTMLStatusBlockCrawler.LOG.warn("Abandoning after max retry attempts: " + this.url);
            AsyncHTMLStatusBlockCrawler.this.crawl_repair.put(Long.valueOf(this.id), this.line);
            AsyncHTMLStatusBlockCrawler.this.connections.decrementAndGet();
        }
    }

    /**
     * Validates the input and output locations and builds the throttled HTTP client.
     *
     * @param file data file with one tweet per line; must exist
     * @param str  path of the output file; its parent directory, if any, must exist
     * @param str2 path of the repair file, or null to write none; its parent directory,
     *             if any, must exist
     * @param z    true to disable following of redirects
     * @throws IOException if the data file or a required parent directory is missing
     */
    public AsyncHTMLStatusBlockCrawler(File file, String str, String str2, boolean z) throws IOException {
        this.file = Preconditions.checkNotNull(file);
        this.noFollow = z;
        if (!file.exists()) {
            throw new IOException(file + " does not exist!");
        }

        this.output = new File(Preconditions.checkNotNull(str));
        requireExistingParent(this.output, str);

        this.repair = (str2 == null) ? null : new File(str2);
        if (this.repair != null) {
            requireExistingParent(this.repair, str2);
        }

        AsyncHttpClientConfig config = new AsyncHttpClientConfig.Builder()
                .addRequestFilter(new ThrottleRequestFilter(MAX_CONNECTIONS))
                .setConnectionTimeoutInMs(CONNECTION_TIMEOUT)
                .setIdleConnectionInPoolTimeoutInMs(IDLE_CONNECTION_TIMEOUT)
                .setRequestTimeoutInMs(REQUEST_TIMEOUT)
                .setMaxRequestRetry(0)
                .build();
        this.asyncHttpClient = new AsyncHttpClient(config);
    }

    /** Fails when {@code target} names a parent directory that does not exist. */
    private static void requireExistingParent(File target, String path) throws IOException {
        File parent = target.getParentFile();
        if (parent == null || parent.exists()) {
            return;
        }
        throw new IOException(path + "'s parent directory does not exist!");
    }

    /**
     * Builds the status-page URL for a tweet.
     *
     * @param j   tweet id
     * @param str username placed in the URL path; must not be null
     * @return {@code http://twitter.com/<username>/status/<id>}
     * @throws NullPointerException if {@code str} is null
     */
    public static String getUrl(long j, String str) {
        Preconditions.checkNotNull(str);
        // Plain concatenation rather than String.format("%d"): the formatter localizes
        // digits for the default locale, which could put non-ASCII digits into the URL.
        return "http://twitter.com/" + str + "/status/" + j;
    }

    /**
     * Runs the crawl: submits one asynchronous request per usable line of the data
     * file, waits a bounded time for outstanding requests, closes the HTTP client,
     * then writes the fetched statuses (gzip, UTF-8, one JSON document per line)
     * and, when a repair file was configured, the input lines of abandoned requests.
     *
     * <p>An error while reading the data file is logged and the crawl continues
     * with whatever was already submitted.
     *
     * @throws IOException if the data file cannot be opened or an output file
     *                     cannot be written
     */
    public void fetch() throws IOException {
        final long startMillis = System.currentTimeMillis();
        LOG.info("Processing " + this.file);

        final int submitted = submitRequests();
        awaitOutstandingRequests();
        this.asyncHttpClient.close();

        final long elapsedMillis = System.currentTimeMillis() - startMillis;
        LOG.info("Total request submitted: " + submitted);
        LOG.info(this.crawl.size() + " tweets fetched in " + elapsedMillis + "ms");

        LOG.info("Writing tweets...");
        int written;
        try (FileOutputStream fileOut = new FileOutputStream(this.output);
             GZIPOutputStream gzipOut = new GZIPOutputStream(fileOut);
             OutputStreamWriter writer = new OutputStreamWriter(gzipOut, "UTF-8")) {
            written = writeLines(writer, this.crawl.values());
        }
        LOG.info(written + " statuses written.");

        if (this.repair != null) {
            LOG.info("Writing repair data file...");
            int needRepair;
            try (FileOutputStream fileOut = new FileOutputStream(this.repair);
                 OutputStreamWriter writer = new OutputStreamWriter(fileOut, "UTF-8")) {
                needRepair = writeLines(writer, this.crawl_repair.values());
            }
            LOG.info(needRepair + " statuses need repair.");
        }
        LOG.info("Done!");
    }

    /** Reads the data file line by line, submitting a request per usable line; returns the number submitted. */
    private int submitRequests() throws IOException {
        int submitted = 0;
        // Read as UTF-8 so lines round-trip unchanged into the UTF-8 repair file.
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(new FileInputStream(this.file), "UTF-8"))) {
            try {
                String line;
                while ((line = reader.readLine()) != null) {
                    if (!submitLine(line)) {
                        continue;
                    }
                    submitted++;
                    if (submitted % TWEET_BLOCK_SIZE == 0) {
                        LOG.info(submitted + " requests submitted");
                    }
                }
            } catch (IOException e) {
                // Best effort: keep what has been submitted so far and still write the outputs.
                LOG.error("Error reading " + this.file + " after " + submitted + " submitted requests", e);
            }
        }
        return submitted;
    }

    /** Parses one "id[TAB]username" line and submits its request; returns false if the line is unusable. */
    private boolean submitLine(String line) {
        String[] fields = line.split("\t");
        // split() drops trailing empty strings, so a line made only of tabs yields no fields at all.
        if (fields.length == 0) {
            LOG.warn("Skipping line without a numeric tweet id: " + line);
            return false;
        }
        long tweetId;
        try {
            tweetId = Long.parseLong(fields[0]);
        } catch (NumberFormatException e) {
            LOG.warn("Skipping line without a numeric tweet id: " + line);
            return false;
        }
        // "a" is the placeholder username used when the line has no username column.
        String username = fields.length > 1 ? fields[1] : "a";
        String url = getUrl(tweetId, username);
        this.connections.incrementAndGet();
        crawlURL(url, new TweetFetcherHandler(tweetId, username, url, 0, !this.noFollow, line));
        return true;
    }

    /** Polls the outstanding-request counter once per second, for at most ten seconds. */
    private void awaitOutstandingRequests() {
        LOG.info("Waiting for remaining requests (" + this.connections.get() + ") to finish!");
        for (int attempt = 0; attempt < 10 && this.connections.get() != 0; attempt++) {
            try {
                Thread.sleep(1000L);
            } catch (InterruptedException e) {
                // Preserve the interrupt for the caller and stop waiting.
                Thread.currentThread().interrupt();
                LOG.warn("Interrupted while waiting for remaining requests");
                return;
            }
        }
    }

    /** Writes each value followed by a newline; returns how many were written. */
    private static int writeLines(OutputStreamWriter writer, Iterable<String> lines) throws IOException {
        int count = 0;
        for (String value : lines) {
            writer.write(value + "\n");
            count++;
        }
        return count;
    }

    /**
     * Issues an asynchronous GET for {@code str}, delivering the response to the
     * given handler. If the request cannot be issued, the handler's input line is
     * recorded for repair and the outstanding-request counter is decremented.
     */
    public void crawlURL(String str, TweetFetcherHandler tweetFetcherHandler) {
        try {
            this.asyncHttpClient
                    .prepareGet(str)
                    .addHeader("Accept-Charset", "utf-8")
                    .addHeader("Accept-Language", "en-US")
                    .execute(tweetFetcherHandler);
        } catch (IOException e) {
            LOG.warn("Abandoning due to error (" + e + "): " + str);
            final Long failedId = Long.valueOf(tweetFetcherHandler.getId());
            final String sourceLine = tweetFetcherHandler.getLine();
            this.crawl_repair.put(failedId, sourceLine);
            this.connections.decrementAndGet();
        }
    }

    /**
     * Command-line entry point. Requires the {@code data} and {@code output}
     * options; {@code repair} and {@code noFollow} are optional. Prints usage and
     * exits with status -1 when the arguments cannot be parsed or a required
     * option is missing.
     */
    public static void main(String[] strArr) throws Exception {
        Options options = new Options();
        addPathOption(options, DATA_OPTION, "data file with tweet ids");
        addPathOption(options, OUTPUT_OPTION, "output file (*.gz)");
        addPathOption(options, REPAIR_OPTION, "output repair file (can be used later as a data file)");
        options.addOption(NOFOLLOW_OPTION, NOFOLLOW_OPTION, false, "don't follow 301 redirects");

        CommandLine commandLine = null;
        try {
            commandLine = new GnuParser().parse(options, strArr);
        } catch (ParseException e) {
            System.err.println("Error parsing command line: " + e.getMessage());
            System.exit(-1);
        }

        boolean hasRequired = commandLine.hasOption(DATA_OPTION) && commandLine.hasOption(OUTPUT_OPTION);
        if (!hasRequired) {
            new HelpFormatter().printHelp(AsyncHTMLStatusBlockCrawler.class.getName(), options);
            System.exit(-1);
        }

        File dataFile = new File(commandLine.getOptionValue(DATA_OPTION));
        String outputPath = commandLine.getOptionValue(OUTPUT_OPTION);
        String repairPath = commandLine.getOptionValue(REPAIR_OPTION);
        boolean noFollow = commandLine.hasOption(NOFOLLOW_OPTION);
        AsyncHTMLStatusBlockCrawler crawler =
                new AsyncHTMLStatusBlockCrawler(dataFile, outputPath, repairPath, noFollow);
        crawler.fetch();
    }

    /** Registers an option named {@code name} that takes a single "path" argument. */
    private static void addPathOption(Options options, String name, String description) {
        OptionBuilder.withArgName("path");
        OptionBuilder.hasArg();
        OptionBuilder.withDescription(description);
        options.addOption(OptionBuilder.create(name));
    }
}
