package eu.monnetproject.bliss.experiments;

import eu.monnetproject.bliss.CLIOpts;
import eu.monnetproject.bliss.WordMap;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.EOFException;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.zip.GZIPInputStream;
import java.util.zip.GZIPOutputStream;
import org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream;
import org.apache.commons.compress.compressors.bzip2.BZip2CompressorOutputStream;

/* loaded from: input_file:eu/monnetproject/bliss/experiments/CleanCorpus.class */
/**
 * Command-line tool that filters an integerized corpus by token frequency and
 * by minimum sequence length.
 *
 * <p>The corpus is read as a stream of 4-byte integers (as written by
 * {@link DataOutputStream#writeInt(int)}). The value {@code 0} acts as a
 * separator: a record consists of two zero-terminated token sequences
 * (presumably the two sides of an aligned document pair; confirm against the
 * tool that produces the corpus). A record is copied to the output only if
 * both of its sequences retain at least {@code lenMin} tokens after filtering.
 */
public class CleanCorpus {
    /** Initial capacity, in tokens, of the buffer that holds one record; the buffer grows on demand. */
    public static final int BUF_SIZE = 1048576;

    /**
     * Parses the command line, loads the frequency table and runs the filter.
     *
     * @param strArr the raw command-line arguments, interpreted by {@link CLIOpts}
     * @throws Exception if any input cannot be read or the output cannot be written
     */
    public static void main(String[] strArr) throws Exception {
        // The order of these calls is significant: CLIOpts consumes arguments in call order.
        CLIOpts opts = new CLIOpts(strArr);
        boolean useUnk = opts.flag("unk", "Output low-frequency tokens as <UNK>");
        boolean keepStopWords = opts.flag("stopWords", "Retain stop words");
        File corpusFile = opts.roFile("corpus[.gz|.b2]", "The integerized corpus");
        File wordMapFile = opts.roFile("wordMap", "The word map");
        File freqFile = opts.roFile("freqs", "The file of token frequencies");
        int freqMin = opts.intValue("freqMin", "The minimum frequency to accept");
        int lenMin = opts.intValue("lenMin", "The minimum document length to accept");
        File outFile = opts.woFile("out[.gz|.bz2]", "The file to write to");
        if (!opts.verify(CleanCorpus.class)) {
            return;
        }

        int vocabularySize = WordMap.calcW(wordMapFile);
        int[] freqs = readFreqs(freqFile, vocabularySize);
        // Unless stop words are retained, the frequency stored at index 0 (the
        // separator's slot) is the exclusive upper bound on accepted frequencies.
        int freqMax = keepStopWords ? Integer.MAX_VALUE : freqs[0];
        // Identifier written in place of rejected tokens; -1 disables the substitution.
        int unkId = useUnk ? vocabularySize + 1 : -1;

        DataInputStream in = openInput(corpusFile);
        DataOutputStream out;
        try {
            out = openOutput(outFile);
        } catch (IOException e) {
            // Do not leak the already-open input when the output cannot be created.
            in.close();
            throw e;
        }
        System.err.println("Remaining: " + countRetained(freqs, freqMin, freqMax));
        cleanCorpus(in, out, freqs, freqMin, freqMax, lenMin, unkId);
    }

    /**
     * Reads a table of token frequencies stored as consecutive 4-byte integers.
     *
     * @param file the frequency file
     * @param i the number of integers to read
     * @return an array of length {@code i} holding the values in file order
     * @throws FileNotFoundException if {@code file} cannot be opened
     * @throws IOException if reading fails; an {@link EOFException} is thrown
     *         when the file holds fewer than {@code i} integers
     */
    public static int[] readFreqs(File file, int i) throws FileNotFoundException, IOException {
        DataInputStream in = new DataInputStream(new FileInputStream(file));
        try {
            int[] freqs = new int[i];
            for (int idx = 0; idx < i; idx++) {
                freqs[idx] = in.readInt();
            }
            return freqs;
        } finally {
            // Close even when the file is truncated or unreadable.
            in.close();
        }
    }

    /** Opens the corpus for reading, decompressing according to a {@code .gz} or {@code .bz2} suffix. */
    private static DataInputStream openInput(File file) throws IOException {
        String name = file.getName();
        if (name.endsWith(".gz")) {
            return new DataInputStream(new GZIPInputStream(new FileInputStream(file)));
        }
        if (name.endsWith(".bz2")) {
            return new DataInputStream(new BZip2CompressorInputStream(new FileInputStream(file)));
        }
        return new DataInputStream(new FileInputStream(file));
    }

    /**
     * Opens the output for writing, compressing according to a {@code .gz} or
     * {@code .bz2} suffix of the output file's own name.
     */
    private static DataOutputStream openOutput(File file) throws IOException {
        String name = file.getName();
        if (name.endsWith(".gz")) {
            return new DataOutputStream(new GZIPOutputStream(new FileOutputStream(file)));
        }
        if (name.endsWith(".bz2")) {
            return new DataOutputStream(new BZip2CompressorOutputStream(new FileOutputStream(file)));
        }
        return new DataOutputStream(new FileOutputStream(file));
    }

    /** Counts the entries, excluding index 0, whose frequency lies in {@code [freqMin, freqMax)}. */
    private static int countRetained(int[] freqs, int freqMin, int freqMax) {
        int retained = 0;
        for (int id = 1; id < freqs.length; id++) {
            if (freqs[id] >= freqMin && freqs[id] < freqMax) {
                retained++;
            }
        }
        return retained;
    }

    /** Returns a buffer with room for at least one more element after {@code used}, doubling it if full. */
    private static int[] ensureRoom(int[] buffer, int used) {
        if (used < buffer.length) {
            return buffer;
        }
        int[] grown = new int[buffer.length * 2];
        System.arraycopy(buffer, 0, grown, 0, used);
        return grown;
    }

    /**
     * Copies the corpus from {@code in} to {@code out}, dropping or replacing
     * tokens whose frequency is outside {@code [freqMin, freqMax)} and dropping
     * records in which either sequence ends up shorter than {@code lenMin}.
     * Both streams are closed before this method returns, whether it completes
     * normally or throws.
     *
     * @param in the integerized corpus
     * @param out the destination for the filtered corpus
     * @param freqs frequency of each token, indexed by token identifier
     * @param freqMin inclusive lower bound on accepted frequencies
     * @param freqMax exclusive upper bound on accepted frequencies
     * @param lenMin minimum number of emitted tokens required in each sequence
     * @param unkId identifier substituted for rejected tokens when positive;
     *        otherwise rejected tokens are dropped
     * @throws IOException if reading or writing fails
     */
    private static void cleanCorpus(DataInputStream in, DataOutputStream out, int[] freqs, int freqMin, int freqMax, int lenMin, int unkId) throws IOException {
        int[] buffer = new int[BUF_SIZE];
        int used = 0;
        int firstLength = 0;
        int secondLength = 0;
        long valuesRead = 0;
        boolean inFirstSequence = true;
        try {
            while (true) {
                int token;
                try {
                    token = in.readInt();
                } catch (EOFException endOfCorpus) {
                    // End of stream is the normal exit. available() is not used
                    // because it is only an estimate and is not a reliable
                    // end-of-stream test, notably on decompressing streams.
                    break;
                }
                if (token == 0) {
                    if (inFirstSequence) {
                        // Keep the separator between the two sequences of the record.
                        buffer = ensureRoom(buffer, used);
                        buffer[used++] = 0;
                        inFirstSequence = false;
                    } else {
                        if (firstLength >= lenMin && secondLength >= lenMin) {
                            for (int k = 0; k < used; k++) {
                                out.writeInt(buffer[k]);
                            }
                            out.writeInt(0);
                        }
                        used = 0;
                        firstLength = 0;
                        secondLength = 0;
                        inFirstSequence = true;
                    }
                } else {
                    int freq = freqs[token];
                    boolean accepted = freq >= freqMin && freq < freqMax;
                    if (accepted || unkId > 0) {
                        buffer = ensureRoom(buffer, used);
                        buffer[used++] = accepted ? token : unkId;
                        if (inFirstSequence) {
                            firstLength++;
                        } else {
                            secondLength++;
                        }
                    }
                }
                valuesRead++;
                if (valuesRead % 100000 == 0) {
                    // Progress indicator: one dot per 100000 values read.
                    System.err.print(".");
                }
            }
            System.err.println();
        } finally {
            try {
                in.close();
            } finally {
                out.close();
            }
        }
    }
}
