package org.fbk.cit.hlt.thewikimachine.csv;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.LineNumberReader;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.text.DecimalFormat;
import java.util.Date;
import java.util.List;
import java.util.SortedMap;
import java.util.regex.Pattern;
import org.apache.log4j.Logger;
import org.fbk.cit.hlt.thewikimachine.analysis.HardTokenizer;
import org.fbk.cit.hlt.thewikimachine.util.FreqSet;
import org.fbk.cit.hlt.thewikimachine.util.StringTable;
import org.xerial.snappy.SnappyInputStream;
import org.xerial.snappy.SnappyOutputStream;

/* loaded from: input_file:org/fbk/cit/hlt/thewikimachine/csv/FileUtils.class */
/**
 * Static helpers for line-oriented, tab-separated text files: sorting (delegated to
 * {@code Sort} / {@code CSort}), counting the distinct values of a column, counting
 * token frequencies, and filtering lines by the frequency of one of their columns.
 *
 * <p>All files are read and written as UTF-8. Lines are split with a pattern compiled
 * from {@code StringTable.HORIZONTAL_TABULATION}.
 */
public class FileUtils {
    static Logger logger = Logger.getLogger(FileUtils.class.getName());
    private static final Pattern tabPattern = Pattern.compile(StringTable.HORIZONTAL_TABULATION);
    private static final DecimalFormat df = new DecimalFormat("###,###,###,###");

    /** Number of processed lines between two progress log messages. */
    public static final int DEFAULT_NOTIFICATION_POINT = 1000000;

    /**
     * Path-based convenience overload of {@link #csort(File, File, int, int)}.
     *
     * @param str  path wrapped in a {@link File} and passed as the first argument
     * @param str2 path wrapped in a {@link File} and passed as the second argument
     * @param i    forwarded unchanged
     * @param i2   forwarded unchanged
     * @throws IOException propagated from the delegate
     */
    public static void csort(String str, String str2, int i, int i2) throws IOException {
        csort(new File(str), new File(str2), i, i2);
    }

    /**
     * Builds a {@code CSort} from the four arguments, in the given order, and runs it.
     * NOTE(review): the meaning of each argument is defined by {@code CSort}, which is
     * not visible here; presumably input file, output file and two numeric settings.
     *
     * @throws IOException declared by this method; presumably raised by {@code CSort}
     */
    public static void csort(File file, File file2, int i, int i2) throws IOException {
        new CSort(file, file2, i, i2).run();
    }

    /**
     * Path-based convenience overload of {@link #sort(File, File, int, int, boolean)}.
     *
     * @throws IOException propagated from the delegate
     */
    public static void sort(String str, String str2, int i, int i2, boolean z) throws IOException {
        sort(new File(str), new File(str2), i, i2, z);
    }

    /**
     * Path-based convenience overload of {@link #sort(File, File, int, int)}.
     *
     * @throws IOException propagated from the delegate
     */
    public static void sort(String str, String str2, int i, int i2) throws IOException {
        sort(new File(str), new File(str2), i, i2);
    }

    /**
     * Builds a {@code Sort} from the four arguments, in the given order, and runs it.
     * NOTE(review): argument semantics are defined by {@code Sort}, not visible here.
     *
     * @throws IOException declared by this method; presumably raised by {@code Sort}
     */
    public static void sort(File file, File file2, int i, int i2) throws IOException {
        new Sort(file, file2, i, i2).run();
    }

    /**
     * Builds a {@code Sort} from the five arguments, in the given order, and runs it.
     * NOTE(review): argument semantics are defined by {@code Sort}, not visible here.
     *
     * @throws IOException declared by this method; presumably raised by {@code Sort}
     */
    public static void sort(File file, File file2, int i, int i2, boolean z) throws IOException {
        new Sort(file, file2, i, i2, z).run();
    }

    /**
     * Path-based convenience overload of {@link #uniq(File, File, int)}.
     *
     * @param inputPath  path of the file to read
     * @param outputPath path of the file to write
     * @param column     zero-based index of the column to count
     * @throws IOException if reading or writing fails
     */
    public static void uniq(String inputPath, String outputPath, int column) throws IOException {
        uniq(new File(inputPath), new File(outputPath), column);
    }

    /**
     * Counts how often each value of one column occurs and writes the counts out.
     *
     * <p>Every input line is split on tabs; when the line has more than {@code column}
     * fields, that field is added to a {@code FreqSet}. Lines with too few fields are
     * skipped. The result of {@code FreqSet.toSortedMap()} is then written as
     * {@code frequency<TAB>value} lines.
     *
     * @param input  file to read (UTF-8)
     * @param output file to write (UTF-8)
     * @param column zero-based index of the column to count
     * @throws IOException if reading or writing fails
     */
    public static void uniq(File input, File output, int column) throws IOException {
        logger.info("reading " + input + "...");
        long intervalStart = System.currentTimeMillis();
        FreqSet freqSet = new FreqSet();
        int lineCount = 0;
        try (LineNumberReader reader = new LineNumberReader(new InputStreamReader(new FileInputStream(input), "UTF-8"))) {
            String line;
            while ((line = reader.readLine()) != null) {
                try {
                    String[] fields = tabPattern.split(line);
                    if (fields.length > column) {
                        freqSet.add(fields[column]);
                    }
                } catch (Exception e) {
                    // Best effort: a single bad line is reported and skipped, not fatal.
                    logger.error("Error at line " + lineCount, e);
                }
                lineCount++;
                if (lineCount % DEFAULT_NOTIFICATION_POINT == 0) {
                    logger.info(progressLine(df.format(lineCount), df.format(freqSet.size()), intervalStart));
                    // The elapsed time reported is per notification interval.
                    intervalStart = System.currentTimeMillis();
                }
            }
        }
        logger.info(progressLine(df.format(lineCount), df.format(freqSet.size()), intervalStart));

        logger.info("sorting " + input + "...");
        long sortStart = System.currentTimeMillis();
        SortedMap<Integer, List<String>> sortedMap = freqSet.toSortedMap();
        writeSortedMap(sortedMap, output);
        logger.info(sortedMap.size() + " lines sorted in " + df.format(System.currentTimeMillis() - sortStart) + " ms " + new Date());
    }

    /**
     * Counts token frequencies over the first column of a tab-separated file.
     *
     * <p>The first field of every line is tokenized with {@code HardTokenizer} and each
     * resulting token is added to a {@code FreqSet}. The result of
     * {@code FreqSet.toSortedMap()} is then written as {@code frequency<TAB>token} lines.
     *
     * @param input  file to read (UTF-8)
     * @param output file to write (UTF-8)
     * @throws IOException if reading or writing fails
     */
    public static void df(File input, File output) throws IOException {
        logger.info("reading " + input + "...");
        long intervalStart = System.currentTimeMillis();
        FreqSet freqSet = new FreqSet();
        HardTokenizer hardTokenizer = HardTokenizer.getInstance();
        int lineCount = 0;
        try (LineNumberReader reader = new LineNumberReader(new InputStreamReader(new FileInputStream(input), "UTF-8"))) {
            String line;
            while ((line = reader.readLine()) != null) {
                try {
                    String[] fields = tabPattern.split(line);
                    if (fields.length > 0) {
                        for (String token : hardTokenizer.stringArray(fields[0])) {
                            freqSet.add(token);
                        }
                    }
                } catch (Exception e) {
                    // Best effort: a single bad line is reported and skipped, not fatal.
                    logger.error("Error at line " + lineCount, e);
                }
                lineCount++;
                if (lineCount % DEFAULT_NOTIFICATION_POINT == 0) {
                    logger.info(progressLine(df.format(lineCount), df.format(freqSet.size()), intervalStart));
                    intervalStart = System.currentTimeMillis();
                }
            }
        }
        logger.info(progressLine(df.format(lineCount), df.format(freqSet.size()), intervalStart));

        logger.info("sorting " + input + "...");
        long sortStart = System.currentTimeMillis();
        // Keep a reference to the map: its size is reported after it has been written.
        SortedMap<Integer, List<String>> sortedMap = freqSet.toSortedMap();
        writeSortedMap(sortedMap, output);
        logger.info(df.format(sortedMap.size()) + " distinct frequencies found in " + df.format(System.currentTimeMillis() - sortStart) + " ms " + new Date());
    }

    /**
     * Writes one {@code key<TAB>value} line per value, iterating the keys in the map's
     * own order and the values of each key in list order.
     *
     * @param sortedMap frequency-to-values map to dump
     * @param file      destination file (UTF-8), created or overwritten
     * @throws IOException if the file cannot be opened or a write fails
     */
    private static void writeSortedMap(SortedMap<Integer, List<String>> sortedMap, File file) throws IOException {
        logger.info("writing " + file + "...");
        long start = System.currentTimeMillis();
        try (PrintWriter writer = new PrintWriter(new BufferedWriter(new OutputStreamWriter(new FileOutputStream(file), "UTF-8")))) {
            for (Integer frequency : sortedMap.keySet()) {
                for (String value : sortedMap.get(frequency)) {
                    writer.print(frequency);
                    writer.print('\t');
                    writer.println(value);
                }
            }
            // PrintWriter never throws on write; surface a failure instead of
            // silently leaving a truncated file behind.
            if (writer.checkError()) {
                throw new IOException("error while writing " + file);
            }
        }
        logger.info(df.format(sortedMap.size()) + " distinct frequencies in " + df.format(System.currentTimeMillis() - start) + " ms " + new Date());
    }

    /**
     * Path-based overload of {@link #filter(File, File, FreqSet, int, int, boolean)}
     * for uncompressed files.
     *
     * @throws IOException if reading or writing fails
     */
    public static void filter(String inputPath, String outputPath, FreqSet freqSet, int column, int minFrequency) throws IOException {
        filter(new File(inputPath), new File(outputPath), freqSet, column, minFrequency, false);
    }

    /**
     * Path-based overload of {@link #filter(File, File, FreqSet, int, int, boolean)}.
     *
     * @throws IOException if reading or writing fails
     */
    public static void filter(String inputPath, String outputPath, FreqSet freqSet, int column, int minFrequency, boolean compressed) throws IOException {
        filter(new File(inputPath), new File(outputPath), freqSet, column, minFrequency, compressed);
    }

    /**
     * Overload of {@link #filter(File, File, FreqSet, int, int, boolean)} for
     * uncompressed files.
     *
     * @throws IOException if reading or writing fails
     */
    public static void filter(File input, File output, FreqSet freqSet, int column, int minFrequency) throws IOException {
        filter(input, output, freqSet, column, minFrequency, false);
    }

    /**
     * Copies to {@code output} every line of {@code input} whose {@code column}-th
     * tab-separated field has a value of at least {@code minFrequency} in
     * {@code freqSet}. Lines that do not have that column are dropped.
     *
     * @param input        file to read (UTF-8)
     * @param output       file to write (UTF-8), created or overwritten
     * @param freqSet      lookup queried with {@code get(field)} for each line
     * @param column       zero-based index of the field to look up
     * @param minFrequency minimum value of {@code freqSet.get(field)} for a line to be kept
     * @param compressed   when {@code true}, both files are wrapped in Snappy streams
     * @throws IOException if reading or writing fails
     */
    public static void filter(File input, File output, FreqSet freqSet, int column, int minFrequency, boolean compressed) throws IOException {
        long intervalStart = System.currentTimeMillis();
        int total = 0;
        int kept = 0;
        // The input is opened first so that a missing input does not truncate the output.
        // The raw file streams are separate resources so they are closed even when a
        // wrapping constructor (e.g. the Snappy header check) fails.
        try (FileInputStream rawIn = new FileInputStream(input);
             LineNumberReader reader = new LineNumberReader(new InputStreamReader(compressed ? new SnappyInputStream(rawIn) : rawIn, "UTF-8"));
             FileOutputStream rawOut = new FileOutputStream(output);
             PrintWriter writer = new PrintWriter(new BufferedWriter(new OutputStreamWriter(compressed ? new SnappyOutputStream(rawOut) : rawOut, "UTF-8")))) {
            logger.info("total\tcount\ttime (ms)\tdate");
            String line;
            while ((line = reader.readLine()) != null) {
                String[] fields = tabPattern.split(line);
                // Strictly greater: fields[column] exists only when length > column.
                if (fields.length > column && freqSet.get(fields[column]) >= minFrequency) {
                    writer.println(line);
                    kept++;
                }
                total++;
                if (total % DEFAULT_NOTIFICATION_POINT == 0) {
                    logger.info(progressLine(df.format(total), df.format(kept), intervalStart));
                    intervalStart = System.currentTimeMillis();
                }
            }
            // PrintWriter never throws on write; surface a failure explicitly.
            if (writer.checkError()) {
                throw new IOException("error while writing " + output);
            }
        }
        logger.info(progressLine(df.format(total), df.format(kept), intervalStart));
    }

    /**
     * Formats one tab-separated progress record: the two pre-formatted counters, the
     * milliseconds elapsed since {@code sinceMillis}, and the current date.
     */
    private static String progressLine(String first, String second, long sinceMillis) {
        return first + StringTable.HORIZONTAL_TABULATION
                + second + StringTable.HORIZONTAL_TABULATION
                + df.format(System.currentTimeMillis() - sinceMillis) + StringTable.HORIZONTAL_TABULATION
                + new Date();
    }
}
