package org.dstadler.commoncrawl;

import com.google.common.collect.ImmutableSet;
import com.google.common.collect.TreeMultimap;
import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.nio.file.FileVisitOption;
import java.nio.file.FileVisitResult;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.SimpleFileVisitor;
import java.nio.file.attribute.BasicFileAttributes;
import java.util.EnumSet;
import java.util.HashMap;
import java.util.NavigableSet;
import java.util.Set;
import java.util.concurrent.atomic.AtomicLong;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.apache.commons.codec.digest.DigestUtils;
import org.apache.commons.io.FileUtils;
import org.apache.commons.lang3.StringUtils;
import org.dstadler.commons.logging.jdk.LoggerFactory;

/* loaded from: input_file:org/dstadler/commoncrawl/Deduplicate.class */
/**
 * Command-line tool which finds files with identical content below {@code Utils.DOWNLOAD_DIR}
 * and moves every duplicate to the same relative location below {@code Utils.BACKUP_DIR}.
 *
 * <p>Files are grouped by size first; only files which share their size with at least one
 * other file are hashed (MD5) and compared, so files with a unique size are never read.
 */
public class Deduplicate {
    private static final Logger log = LoggerFactory.make();

    /**
     * Names of directories which are not descended into while scanning. The entries are compared
     * against the plain directory name, so they have to be names, not glob patterns.
     */
    private static final Set<String> SCAN_EXCLUDES = ImmutableSet.of(".svn", "lost+found", ".git");

    /** Upper bound for the read buffer used while hashing (1 MiB). */
    private static final int MAX_BUFFER_SIZE = 1024 * 1024;

    /** Read buffer size used for empty files, a buffer size of zero is not allowed. */
    private static final int EMPTY_FILE_BUFFER_SIZE = 1024;

    /** A progress message is logged once per this many visited files. */
    private static final long PROGRESS_INTERVAL = 10000;

    /**
     * Scans the download directory, hashes all files which share their size with another file
     * and moves every file whose hash was already seen for that size to the backup directory.
     *
     * @param strArr command line arguments, not used
     * @throws IOException if scanning fails, or if hashing or moving a file fails for any reason
     *         other than the file not being found (which is only logged as a warning)
     */
    public static void main(String[] strArr) throws IOException {
        LoggerFactory.initLogging();

        TreeMultimap<Long, String> filesBySize = scanAndSortFiles();
        NavigableSet<Long> sizes = filesBySize.keySet();
        if (sizes.isEmpty()) {
            // first()/last() below would throw NoSuchElementException on an empty set
            log.info("Found no files in " + Utils.DOWNLOAD_DIR + ", nothing to deduplicate");
            return;
        }
        log.info("Having " + sizes.size() + " different sizes between " + sizes.first() + " and " + sizes.last());

        // the multimap is never modified below, so the total is constant
        int totalFiles = filesBySize.size();
        int duplicates = 0;
        int processed = 0;
        for (Long size : sizes) {
            NavigableSet<String> candidates = filesBySize.get(size);
            if (candidates.size() <= 1) {
                // a file with a unique size cannot have a duplicate, no need to read it
                log.info("Only having " + candidates.size() + " files with size " + size + ", " + (totalFiles - processed) + " files left");
                processed += candidates.size();
                continue;
            }

            log.info("Looking at " + candidates.size() + " files with size " + size + ", " + (totalFiles - processed) + " files left");

            // hash -> first file seen with this hash, only valid for the current size
            HashMap<String, String> firstFileByHash = new HashMap<>();
            for (String name : candidates) {
                processed++;
                File file = new File(Utils.DOWNLOAD_DIR, name);
                try {
                    String hash = hash(file);
                    String original = firstFileByHash.get(hash);
                    if (original == null) {
                        firstFileByHash.put(hash, name);
                    } else {
                        duplicates++;
                        log.info("Dups: " + duplicates + ", Count: " + processed + ", SizeKey: " + size + ": File " + name + " is the same as " + original);
                        FileUtils.moveFile(file, new File(Utils.BACKUP_DIR, name));
                    }
                } catch (FileNotFoundException e) {
                    // best effort: report the file and continue with the next one
                    log.log(Level.WARNING, "Could not read file '" + file.getAbsolutePath() + "' for size " + size + ", probably the filename contains unexpected characters", e);
                } catch (IOException | RuntimeException e) {
                    // add the affected file to the exception before giving up
                    throw new IOException("Failed for file '" + file.getAbsolutePath() + "' for size " + size, e);
                }
            }
        }

        log.info("Found " + duplicates + " duplicate files");
    }

    /**
     * Walks the download directory (following symbolic links) and collects all files found,
     * keyed by their size. Directories named in {@link #SCAN_EXCLUDES} are skipped together
     * with everything below them.
     *
     * @return a sorted multimap from file size to the file names relative to the download directory
     * @throws IOException if walking the directory tree fails
     */
    private static TreeMultimap<Long, String> scanAndSortFiles() throws IOException {
        log.info("Scanning for files in " + Utils.DOWNLOAD_DIR);

        final AtomicLong fileCount = new AtomicLong();
        final TreeMultimap<Long, String> filesBySize = TreeMultimap.create();
        // prefix which is stripped from each visited file to get the relative name
        final String basePrefix = Utils.DOWNLOAD_DIR.toString() + File.separator;

        Files.walkFileTree(Utils.DOWNLOAD_DIR.toPath(), EnumSet.of(FileVisitOption.FOLLOW_LINKS), Integer.MAX_VALUE, new SimpleFileVisitor<Path>() {
            @Override
            public FileVisitResult preVisitDirectory(Path path, BasicFileAttributes basicFileAttributes) {
                if (SCAN_EXCLUDES.contains(path.toFile().getName())) {
                    log.info("Skipping directory " + path);
                    return FileVisitResult.SKIP_SUBTREE;
                }

                log.info("Entering directory " + path);
                return FileVisitResult.CONTINUE;
            }

            @Override
            public FileVisitResult visitFile(Path path, BasicFileAttributes basicFileAttributes) {
                long index = fileCount.getAndIncrement();
                if (index % PROGRESS_INTERVAL == 0) {
                    log.info("Handling file " + index + ": " + path);
                }

                File file = path.toFile();
                filesBySize.put(file.length(), StringUtils.removeStart(file.toString(), basePrefix));
                return FileVisitResult.CONTINUE;
            }
        });

        log.info("Found " + filesBySize.values().size() + " files");
        return filesBySize;
    }

    /**
     * Computes the MD5 checksum of the content of the given file.
     *
     * @param file the file to read
     * @return the MD5 checksum as hex string
     * @throws IOException if the file cannot be opened or read
     */
    private static String hash(File file) throws IOException {
        // size the read buffer to the file, capped at MAX_BUFFER_SIZE
        long length = file.length();
        int bufferSize;
        if (length > MAX_BUFFER_SIZE) {
            bufferSize = MAX_BUFFER_SIZE;
        } else if (length == 0) {
            bufferSize = EMPTY_FILE_BUFFER_SIZE;
        } else {
            bufferSize = (int) length;
        }

        try (BufferedInputStream stream = new BufferedInputStream(new FileInputStream(file), bufferSize)) {
            return DigestUtils.md5Hex(stream);
        }
    }
}
