package eu.monnetproject.bliss.experiments;

import eu.monnetproject.bliss.CLIOpts;
import eu.monnetproject.bliss.PTBTokenizer;
import eu.monnetproject.bliss.WordMap;
import it.unimi.dsi.fastutil.ints.IntArrayList;
import it.unimi.dsi.fastutil.ints.IntListIterator;
import java.io.DataOutputStream;
import java.io.File;
import java.io.IOException;
import java.util.Iterator;
import java.util.Scanner;
import java.util.regex.Pattern;

/* loaded from: input_file:eu/monnetproject/bliss/experiments/IntegerizeAcquis.class */
public class IntegerizeAcquis {
    public static void main(String[] strArr) throws Exception {
        CLIOpts cLIOpts = new CLIOpts(strArr);
        File roFile = cLIOpts.roFile("corpus[.gz|.bz2]", "The corpus in Acquis format");
        File woFile = cLIOpts.woFile("wordMap", "The file to write the word map to");
        File woFile2 = cLIOpts.woFile("corpusOut[.gz|.bz2]", "The file to write the integerized corpus to");
        if (cLIOpts.verify(IntegerizeAcquis.class)) {
            WordMap wordMap = new WordMap();
            PTBTokenizer pTBTokenizer = new PTBTokenizer();
            Scanner scanner = new Scanner(CLIOpts.openInputAsMaybeZipped(roFile));
            DataOutputStream dataOutputStream = new DataOutputStream(CLIOpts.openOutputAsMaybeZipped(woFile2));
            IntArrayList[] intArrayListArr = {new IntArrayList(), new IntArrayList()};
            int i = 0;
            while (scanner.hasNextLine()) {
                i++;
                if (i % 100000 == 0) {
                    System.err.print(".");
                }
                String nextLine = scanner.nextLine();
                if (nextLine.contains("<s1>") || nextLine.contains("<s2>")) {
                    boolean z = !nextLine.contains("<s1>");
                    Iterator it = pTBTokenizer.tokenize(nextLine.replaceAll("<[^>]+>", "")).iterator();
                    while (it.hasNext()) {
                        intArrayListArr[z ? 1 : 0].add(wordMap.offer((String) it.next()));
                    }
                } else if (nextLine.contains("</linkGrp>")) {
                    IntListIterator it2 = intArrayListArr[0].iterator();
                    while (it2.hasNext()) {
                        dataOutputStream.writeInt(((Integer) it2.next()).intValue());
                    }
                    dataOutputStream.writeInt(0);
                    intArrayListArr[0].clear();
                    IntListIterator it3 = intArrayListArr[1].iterator();
                    while (it3.hasNext()) {
                        dataOutputStream.writeInt(((Integer) it3.next()).intValue());
                    }
                    intArrayListArr[1].clear();
                    dataOutputStream.writeInt(0);
                }
            }
            System.err.println();
            dataOutputStream.flush();
            dataOutputStream.close();
            wordMap.write(woFile);
        }
    }
}
