package eu.monnetproject.bliss.experiments;

import eu.monnetproject.bliss.PTBTokenizer;
import eu.monnetproject.bliss.Tokenizer;
import eu.monnetproject.bliss.WordMap;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Scanner;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.zip.GZIPInputStream;
import java.util.zip.GZIPOutputStream;
import org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream;
import org.apache.commons.compress.compressors.bzip2.BZip2CompressorOutputStream;

/* loaded from: input_file:eu/monnetproject/bliss/experiments/IntegerizeCorpus.class */
/**
 * Command-line tool that converts a tokenised corpus into a sequence of integers.
 *
 * <p>The corpus is read line by line. Lines matching {@code <doc ... title="...">} start a
 * document, lines matching {@code </doc>} end it, and every other line is tokenised; each kept
 * token is lower-cased, offered to a {@link WordMap} and the value returned by
 * {@code WordMap.offer} is written to the output. Input and output may be gzip or bzip2
 * compressed, selected by the {@code .gz} / {@code .bz2} file name suffix.
 */
public class IntegerizeCorpus {
    private static final Tokenizer TOKENIZER = new PTBTokenizer();
    private static final Pattern DOC_START = Pattern.compile("<doc.*title=\"(.*)\">.*");
    private static final Pattern DOC_END = Pattern.compile("</doc>");
    private static final String GZIP_SUFFIX = ".gz";
    private static final String BZIP2_SUFFIX = ".bz2";

    /**
     * Normalises a token before it is looked up in the word map.
     *
     * @param str the raw token
     * @return the token lower-cased with locale-independent rules, so the same corpus yields the
     *         same words regardless of the JVM's default locale
     */
    public static String normalize(String str) {
        return str.toLowerCase(java.util.Locale.ROOT);
    }

    /**
     * Prints the given message followed by the usage text to standard error and terminates the
     * JVM with exit status -1.
     *
     * @param str the error message to report
     */
    private static void fail(String str) {
        System.err.println(str);
        System.err.println("\nUsage:\n\tmvn exec:java -Dexec.mainClass=" + IntegerizeCorpus.class.getName()
                + " -Dexec.args=\"[-s SAMPLING_RATE] corpus[.gz|.bz2] wordMap output[.gz|.bz2]\""
                + "\n\t\tcorpus and output may use suffix .gz or .bz2 to enable compression"
                + "\n\t\twordMap is always appended to (if it exists)");
        System.exit(-1);
    }

    /**
     * Entry point: {@code [-s SAMPLING_RATE] corpus[.gz|.bz2] wordMap output[.gz|.bz2]}.
     *
     * @param strArr the command-line arguments
     * @throws IOException if the corpus cannot be read or the output or word map cannot be written
     */
    public static void main(String[] strArr) throws IOException {
        ArrayList<String> args = new ArrayList<String>(Arrays.asList(strArr));
        int sampleRate = 1;
        int idx = 0;
        while (idx < args.size()) {
            if ("-s".equals(args.get(idx)) && idx + 1 < args.size()) {
                sampleRate = parseSampleRate(args.get(idx + 1));
                // Drop the flag and its value; the next argument slides into idx, so do not advance.
                args.remove(idx);
                args.remove(idx);
            } else {
                idx++;
            }
        }
        if (args.size() != 3) {
            fail("Wrong number of arguments");
        }

        File corpusFile = new File(args.get(0));
        if (!corpusFile.exists() || !corpusFile.canRead()) {
            fail("Could not access corpus file");
        }
        File wordMapFile = new File(args.get(1));
        // An existing map must be both readable (to extend it) and writable (to save it);
        // otherwise we would silently replace it with a map built from scratch.
        if (wordMapFile.exists() && (!wordMapFile.canRead() || !wordMapFile.canWrite())) {
            fail("Cannot access word map file");
        }
        File outFile = new File(args.get(2));
        if (outFile.exists() && !outFile.canWrite()) {
            fail("Could not access out file");
        }

        WordMap wordMap = wordMapFile.exists() ? WordMap.fromFile(wordMapFile) : new WordMap();

        InputStream in = openInput(corpusFile);
        try {
            PrintWriter out = openOutput(outFile);
            try {
                integerize(in, wordMap, out, sampleRate);
            } finally {
                out.close();
            }
            // PrintWriter never throws on write; surface any swallowed I/O failure here.
            if (out.checkError()) {
                throw new IOException("Error while writing " + outFile);
            }
        } finally {
            in.close();
        }
        wordMap.write(wordMapFile);
    }

    /**
     * Parses the value of the {@code -s} option, aborting with the usage text when it is not a
     * positive integer.
     */
    private static int parseSampleRate(String value) {
        int rate = 0;
        try {
            rate = Integer.parseInt(value);
        } catch (NumberFormatException e) {
            fail("Invalid sample rate: " + value);
        }
        if (rate <= 0) {
            fail("Non-positive sample rate!");
        }
        return rate;
    }

    /** Opens the corpus for reading, decompressing it according to its file name suffix. */
    private static InputStream openInput(File file) throws IOException {
        FileInputStream raw = new FileInputStream(file);
        try {
            String name = file.getName();
            if (name.endsWith(GZIP_SUFFIX)) {
                return new GZIPInputStream(raw);
            }
            if (name.endsWith(BZIP2_SUFFIX)) {
                return new BZip2CompressorInputStream(raw);
            }
            return raw;
        } catch (IOException e) {
            // The decompressor rejected the header; do not leak the underlying file handle.
            raw.close();
            throw e;
        }
    }

    /** Opens the output for writing, compressing it according to its own file name suffix. */
    private static PrintWriter openOutput(File file) throws IOException {
        FileOutputStream raw = new FileOutputStream(file);
        try {
            String name = file.getName();
            OutputStream stream = raw;
            if (name.endsWith(GZIP_SUFFIX)) {
                stream = new GZIPOutputStream(raw);
            } else if (name.endsWith(BZIP2_SUFFIX)) {
                stream = new BZip2CompressorOutputStream(raw);
            }
            return new PrintWriter(stream);
        } catch (IOException e) {
            raw.close();
            throw e;
        }
    }

    /**
     * Reads the corpus from {@code inputStream} and writes its integerised form to
     * {@code printWriter}.
     *
     * <p>For each document start line the captured title followed by {@code ":"} is written; for
     * each kept token the value returned by {@code wordMap.offer(normalize(token))} followed by a
     * space is written; a document end line terminates the output line. Neither stream is closed
     * by this method.
     *
     * @param inputStream the corpus text, split into lines on {@code \n} or {@code \r\n}
     * @param wordMap     the map each normalised token is offered to
     * @param printWriter the destination of the integerised corpus
     * @param i           the sampling rate: within each input line only the non-empty tokens at
     *                    positions 0, i, 2i, ... are written; must not be zero
     */
    public static void integerize(InputStream inputStream, WordMap wordMap, PrintWriter printWriter, int i) {
        Scanner lines = new Scanner(inputStream).useDelimiter("\r?\n");
        // True while no token has been written since the last document end (or start of input).
        boolean noTokensSinceDocEnd = true;
        while (lines.hasNext()) {
            String line = lines.next();
            Matcher startMatcher = DOC_START.matcher(line);
            if (startMatcher.matches()) {
                // Tokens were written without a closing </doc>: finish that line first.
                if (!noTokensSinceDocEnd) {
                    printWriter.println();
                }
                printWriter.print(startMatcher.group(1));
                printWriter.print(":");
            } else if (DOC_END.matcher(line).matches()) {
                printWriter.println();
                noTokensSinceDocEnd = true;
            } else {
                // The position counter restarts on every line, so a line's first token is always kept.
                int position = 0;
                for (String token : TOKENIZER.tokenize(line)) {
                    if (token.length() == 0) {
                        continue;
                    }
                    if (position % i == 0) {
                        printWriter.print(wordMap.offer(normalize(token)));
                        printWriter.print(" ");
                        noTokensSinceDocEnd = false;
                    }
                    position++;
                }
            }
        }
    }
}
