package org.dstadler.commoncrawl;

import com.google.common.base.Preconditions;
import it.unimi.dsi.fastutil.io.FastBufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.nio.charset.StandardCharsets;
import java.util.Locale;
import java.util.logging.Logger;
import java.util.zip.GZIPInputStream;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.FilenameUtils;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.http.HttpException;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.archive.io.warc.WARCRecord;
import org.archive.util.LaxHttpParser;
import org.commoncrawl.hadoop.mapred.ArcRecord;
import org.dstadler.commons.http.HttpClientWrapper;
import org.dstadler.commons.logging.jdk.LoggerFactory;

/* loaded from: input_file:org/dstadler/commoncrawl/Utils.class */
/**
 * Static helpers for the CommonCrawl download tools: converting index entries
 * (reversed host names) into URLs, deriving local file names for downloads,
 * fetching single records via HTTP range requests and logging progress.
 */
public class Utils {
    /** NOTE(review): presumably the number of blocks in the index at {@link #INDEX_URL} - confirm against callers. */
    public static final int INDEX_BLOCK_COUNT = 2644;
    /** NOTE(review): presumably the size in bytes of one index block - confirm against callers. */
    public static final int BLOCK_SIZE = 65536;
    /** Base URL of the CommonCrawl data server. */
    public static final String COMMON_CRAWL_URL = "https://data.commoncrawl.org/";
    /** Location of the URL index file. */
    public static final String INDEX_URL = "https://commoncrawl.s3.amazonaws.com/projects/url-index/url-index.1356128792";
    /** NOTE(review): presumably the size in bytes of the header in front of the index blocks - confirm against callers. */
    public static final int HEADER_BLOCK_SIZE = 8;

    /** File names longer than this are cut off by {@link #computeDownloadFileName(File, String, String)}. */
    private static final int MAX_FILE_NAME_LENGTH = 240;
    /** Extensions longer than this are not re-attached to a cut-off file name. */
    private static final int MAX_EXTENSION_LENGTH = 10;
    /** Marker in the exception message which triggers a single retry of a download. */
    private static final String HTTP_500_MARKER = "HTTP StatusCode 500";

    private static final Logger log = LoggerFactory.make();

    /** Directory which receives downloaded files; intentionally non-final so it can be re-assigned. */
    public static File DOWNLOAD_DIR = new File("../download");
    /** Second directory which is checked for already downloaded files; intentionally non-final. */
    public static File BACKUP_DIR = new File("../backup");
    /** File with the list of common URLs, relative to the working directory. */
    public static final File COMMONURLS_PATH = new File("commonurls.txt");

    /**
     * Reverses the order of the dot-separated parts of a host name,
     * e.g. {@code com.example.www} becomes {@code www.example.com}.
     *
     * @param host the host name, may be {@code null} or empty
     * @return the host name with its parts in reverse order; {@code null} or
     *         empty input is returned unchanged
     * @throws IllegalStateException if the host consists only of dots
     */
    public static String reverseDomain(String host) {
        if (StringUtils.isEmpty(host)) {
            return host;
        }

        String[] parts = host.split("\\.");
        Preconditions.checkState(parts.length > 0, "Should have some parts, but did not found any for %s", host);

        StringBuilder reversed = new StringBuilder(host.length());
        for (int i = parts.length - 1; i >= 0; i--) {
            reversed.append(parts[i]);
            if (i > 0) {
                reversed.append('.');
            }
        }
        return reversed.toString();
    }

    /**
     * Converts an index entry into a URI with the host name in normal order.
     *
     * <p>A trailing {@code :http} or {@code :https} selects the scheme and is
     * removed, anything else is treated as plain http. Square brackets are
     * stripped from the whole string before parsing and the host part is run
     * through {@link #reverseDomain(String)}.
     *
     * @param urlStr the entry to convert, must not be {@code null}
     * @return the resulting URI
     * @throws URISyntaxException if the entry cannot be parsed as a URI
     */
    public static URI convertUrl(String urlStr) throws URISyntaxException {
        final String withScheme;
        if (urlStr.endsWith(":http")) {
            withScheme = "http://" + StringUtils.removeEnd(urlStr, ":http");
        } else if (urlStr.endsWith(":https")) {
            withScheme = "https://" + StringUtils.removeEnd(urlStr, ":https");
        } else {
            withScheme = "http://" + urlStr;
        }

        URI parsed = new URI(withScheme.replace("[", "").replace("]", ""));
        return new URI(parsed.getScheme(), parsed.getUserInfo(), reverseDomain(parsed.getHost()),
                parsed.getPort(), parsed.getPath(), parsed.getQuery(), parsed.getFragment());
    }

    /**
     * Writes a progress line to the log whenever {@code blockIndex} is a
     * multiple of {@code step}.
     *
     * @param startPos   byte position at which reading started
     * @param blockSize  size of one block in bytes
     * @param startBlock index of the block at which reading started
     * @param startTs    start time as returned by {@link System#currentTimeMillis()}
     * @param blockIndex index of the block which is currently read
     * @param step       a line is only logged for every {@code step}-th block
     * @param fileLength overall number of bytes, a percentage is added to the
     *                   log line if this is larger than zero
     * @throws ArithmeticException if {@code step} is zero
     */
    public static void logProgress(long startPos, int blockSize, int startBlock, long startTs, long blockIndex, int step, long fileLength) {
        if (blockIndex % step != 0) {
            return;
        }

        double elapsedSeconds = (System.currentTimeMillis() - startTs) / 1000.0d;
        long readBlocks = blockIndex - startBlock;
        long readBytes = startPos + (readBlocks * blockSize);

        String percentDone = "";
        if (fileLength > 0) {
            // divide as double, an integer division would report 0% until the very end
            percentDone = ", " + String.format("%.2f", ((double) readBytes / fileLength) * 100.0d)
                    + "% of " + fileLength + " bytes done";
        }

        log.info("Reading block " + blockIndex + " at position " + readBytes
                + " fetched " + readBlocks + " blocks in " + String.format("%.2f", elapsedSeconds)
                + " s, with " + String.format("%.2f", readBlocks / elapsedSeconds) + " per second"
                + percentDone);
    }

    /**
     * Checks whether a downloaded file actually contains an HTML page, which
     * is treated as a sign of a failed download.
     *
     * <p>Only the first 100 bytes are inspected; the check is case-insensitive
     * and ignores leading whitespace.
     *
     * @param file the file to inspect
     * @return {@code true} if the content starts with {@code <!doctype html},
     *         {@code <html} or {@code <!--[if ie}
     * @throws IOException if the file cannot be read
     */
    public static boolean isCorruptDownload(File file) throws IOException {
        byte[] head = new byte[100];
        final int length;
        try (FileInputStream in = new FileInputStream(file)) {
            length = IOUtils.read(in, head);
        }

        // Locale.ROOT keeps the comparison stable, e.g. a Turkish default locale lower-cases 'I' differently
        String start = new String(head, 0, length, StandardCharsets.UTF_8).trim().toLowerCase(Locale.ROOT);
        return start.startsWith("<!doctype html")
                || start.startsWith("<html")
                || start.startsWith("<!--[if ie");
    }

    /**
     * Computes the target file for a URL inside {@link #DOWNLOAD_DIR}.
     *
     * @param url    the URL of the document
     * @param postfix the extension which the file name should end with
     * @return the file in the download directory
     * @see #computeDownloadFileName(File, String, String)
     */
    public static File computeDownloadFileName(String url, String postfix) {
        return computeDownloadFileName(DOWNLOAD_DIR, url, postfix);
    }

    /**
     * Computes the target file for a URL inside the given directory.
     *
     * <p>A leading {@code http://} or {@code https://} and a trailing
     * {@code :http} are removed. Square brackets are turned into parentheses
     * and {@code / ? : % + *} into underscores. A trailing {@code :https} is
     * not removed, so its colon is replaced like any other. Names longer than
     * 240 characters are cut off and {@code ...} plus the original extension
     * (if it has at most 10 characters) is appended. Finally the postfix is
     * added unless the name already ends with it.
     *
     * @param dir     the directory in which the file should be located
     * @param url     the URL of the document, must not be {@code null}
     * @param postfix the extension which the file name should end with
     * @return the file in the given directory
     */
    public static File computeDownloadFileName(File dir, String url, String postfix) {
        String stripped = StringUtils.removeStart(url, "http://");
        stripped = StringUtils.removeStart(stripped, "https://");
        stripped = StringUtils.removeEnd(stripped, ":http");

        String name = stripped
                .replace("/", "_")
                .replace("[", "(")
                .replace("]", ")")
                .replace("?", "_")
                .replace(":", "_")
                .replace("%", "_")
                .replace("+", "_")
                .replace("*", "_");

        if (name.length() > MAX_FILE_NAME_LENGTH) {
            String extension = FilenameUtils.getExtension(name);
            if (extension.length() > MAX_EXTENSION_LENGTH) {
                // most likely not a real extension, just a dot somewhere in a long name
                extension = "";
            }
            name = name.substring(0, MAX_FILE_NAME_LENGTH) + "..." + extension;
        }

        if (!name.endsWith(postfix)) {
            name = name + postfix;
        }
        return new File(dir, name);
    }

    /**
     * Downloads one document into {@link #DOWNLOAD_DIR} unless a file with the
     * computed name already exists there or in {@link #BACKUP_DIR}.
     *
     * <p>The download is attempted a second time if the first attempt fails
     * with an {@link IOException} whose message contains
     * {@code HTTP StatusCode 500}.
     *
     * @param httpClient the client used for the request
     * @param url        the URL of the document, used for naming the file
     * @param header     location of the record (URL, range and mime type)
     * @param useWARC    {@code true} to parse the response as WARC record,
     *                   {@code false} to parse it as ARC record
     * @return the downloaded file or {@code null} if it was already available
     * @throws IOException if the download fails
     */
    public static File downloadFileFromCommonCrawl(CloseableHttpClient httpClient, String url, DocumentLocation header, boolean useWARC) throws IOException {
        String extension = MimeTypes.toExtension(header.getMime());
        File destFile = computeDownloadFileName(url, extension);
        File backupFile = computeDownloadFileName(BACKUP_DIR, url, extension);

        boolean inDownloadDir = destFile.exists();
        boolean inBackupDir = backupFile.exists();
        if (inDownloadDir || inBackupDir) {
            log.info("File " + destFile + " already downloaded: " + inDownloadDir + "/" + inBackupDir);
            return null;
        }

        try {
            downloadFileFromCommonCrawl(httpClient, url, header, useWARC, destFile);
        } catch (IOException e) {
            // exceptions may carry no message at all, these are never retried
            String message = e.getMessage();
            if (message == null || !message.contains(HTTP_500_MARKER)) {
                throw e;
            }

            // retry exactly once on a server-side error
            downloadFileFromCommonCrawl(httpClient, url, header, useWARC, destFile);
        }
        return destFile;
    }

    /**
     * Fetches one gzip-compressed record via an HTTP range request and writes
     * its payload to the given file.
     *
     * <p>The response, the gzip stream and the WARC record are closed in all
     * cases, including failures while reading or writing.
     *
     * @param httpClient the client used for the request
     * @param url        the URL of the document, only used for logging
     * @param header     location of the record (URL and range)
     * @param useWARC    {@code true} to parse the response as WARC record,
     *                   {@code false} to parse it as ARC record
     * @param destFile   the file to which the payload is written
     * @throws IOException if the request fails or the record cannot be read
     *                     or written; an {@link IllegalStateException} or
     *                     {@link HttpException} raised while copying is
     *                     wrapped into an {@link IOException}
     */
    public static void downloadFileFromCommonCrawl(CloseableHttpClient httpClient, String url, DocumentLocation header, boolean useWARC, File destFile) throws IOException {
        log.info("Reading file for " + url + " at " + header.getRangeHeader() + " from " + header.getUrl() + " to " + destFile);

        HttpGet httpGet = new HttpGet(header.getUrl());
        httpGet.addHeader("Range", header.getRangeHeader());

        try (CloseableHttpResponse response = httpClient.execute(httpGet);
                GZIPInputStream stream = new GZIPInputStream(
                        HttpClientWrapper.checkAndFetch(response, header.getUrl()).getContent())) {
            if (useWARC) {
                try (WARCRecord record = new WARCRecord(new FastBufferedInputStream(stream), destFile.getName(), 0L)) {
                    // consume the HTTP headers so only the payload is left in the record
                    LaxHttpParser.parseHeaders(record, "UTF-8");
                    try {
                        FileUtils.copyInputStreamToFile(record, destFile);
                    } catch (IllegalStateException e) {
                        throw new IOException(e);
                    }
                }
            } else {
                ArcRecord record = new ArcRecord();
                record.readFrom(stream);
                try {
                    FileUtils.copyInputStreamToFile(record.getHttpResponse().getEntity().getContent(), destFile);
                } catch (IllegalStateException | HttpException e) {
                    throw new IOException(e);
                }
            }
        }
    }

    /**
     * Creates {@link #DOWNLOAD_DIR} including missing parent directories if it
     * does not exist yet.
     *
     * @throws IllegalStateException if the directory cannot be created
     */
    public static void ensureDownloadDir() {
        if (DOWNLOAD_DIR.exists()) {
            return;
        }
        if (!DOWNLOAD_DIR.mkdirs()) {
            throw new IllegalStateException("Could not create directory " + DOWNLOAD_DIR);
        }
    }
}
