package it.unimi.dsi.law.warc.tool;

import com.martiansoftware.jsap.FlaggedOption;
import com.martiansoftware.jsap.JSAP;
import com.martiansoftware.jsap.JSAPResult;
import com.martiansoftware.jsap.Parameter;
import com.martiansoftware.jsap.SimpleJSAP;
import com.martiansoftware.jsap.Switch;
import com.martiansoftware.jsap.UnflaggedOption;
import it.unimi.dsi.fastutil.io.BinIO;
import it.unimi.dsi.fastutil.io.FastBufferedInputStream;
import it.unimi.dsi.fastutil.io.FastBufferedOutputStream;
import it.unimi.dsi.fastutil.longs.LongArrays;
import it.unimi.dsi.io.FileLinesCollection;
import it.unimi.dsi.law.warc.io.GZWarcRecord;
import it.unimi.dsi.law.warc.io.WarcRecord;
import it.unimi.dsi.logging.ProgressLogger;
import it.unimi.dsi.util.StringMap;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.io.RandomAccessFile;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/* loaded from: input_file:it/unimi/dsi/law/warc/tool/CutWarc.class */
/**
 * Command-line tool that extracts ("cuts") selected records from a WARC file.
 *
 * <p>Records are selected by ordinal number or, when a URL map is supplied, by URL.
 * Each ordinal is used to look up a byte offset in a companion index file
 * ({@code <basename>.warc[.gz].idx}); the record found at that offset in the WARC
 * file is then copied to standard output.
 */
public class CutWarc {
    private static final Logger LOGGER = LoggerFactory.getLogger(CutWarc.class);

    /** Size, in bytes, of the buffers wrapped around the WARC input stream and standard output. */
    static final int IO_BUFFER_SIZE = 65536;

    /**
     * Copies the selected records from a WARC stream to an output stream.
     *
     * <p>For each of the first {@code i} entries of {@code jArr}, the method seeks the index
     * file to {@code ordinal * 8}, reads one {@code long} from it, positions the WARC stream at
     * that value, reads one record and writes a copy of it to {@code outputStream}.
     * Records are processed in the order in which they appear in {@code jArr}; no sorting or
     * de-duplication is performed here.
     *
     * @param fastBufferedInputStream the (repositionable) stream over the WARC file.
     * @param randomAccessFile the index file; entry {@code k} is the {@code long} stored at byte {@code k * 8}.
     * @param z whether records are read as {@link GZWarcRecord} (otherwise as plain {@link WarcRecord}).
     * @param z2 whether records are written as {@link GZWarcRecord} (otherwise as plain {@link WarcRecord}).
     * @param jArr the ordinal numbers of the records to extract.
     * @param i the number of leading entries of {@code jArr} to process.
     * @param outputStream where the extracted records are written; it is neither flushed nor closed here.
     * @throws IOException if reading the index or the WARC stream, or writing the output, fails.
     * @throws WarcRecord.FormatException if a record cannot be parsed.
     */
    public static void run(FastBufferedInputStream fastBufferedInputStream, RandomAccessFile randomAccessFile, boolean z, boolean z2, long[] jArr, int i, OutputStream outputStream) throws IOException, WarcRecord.FormatException {
        // Two reusable record instances: one matching the input encoding, one matching the output encoding.
        final WarcRecord inRecord = z ? new GZWarcRecord() : new WarcRecord();
        final WarcRecord outRecord = z2 ? new GZWarcRecord() : new WarcRecord();
        final ProgressLogger progressLogger = new ProgressLogger(LOGGER, "documents");
        progressLogger.start("Cutting documents");
        for (int k = 0; k < i; k++) {
            // Each index entry is one long (8 bytes) holding the offset of the record in the WARC file.
            randomAccessFile.seek(jArr[k] * 8);
            fastBufferedInputStream.position(randomAccessFile.readLong());
            inRecord.resetRead();
            inRecord.read(fastBufferedInputStream);
            outRecord.copy(inRecord);
            outRecord.write(outputStream);
            progressLogger.lightUpdate();
        }
        progressLogger.stop();
    }

    /**
     * Turns textual record specifications into record ordinals.
     *
     * <p>A spec that parses as a {@code long} is taken as an ordinal; any other spec is treated
     * as a URL and looked up in {@code urlMap}. Only non-negative results are stored, packed at
     * the beginning of {@code ordinals}. Negative numeric specs are skipped silently; a URL whose
     * lookup yields a negative value is skipped only when {@code permissive} is set.
     *
     * @param specs the record specifications (ordinal numbers or URLs).
     * @param urlMap the map from URL to record number, or {@code null} if none was provided.
     * @param permissive whether URLs that resolve to a negative value are ignored instead of raising an error.
     * @param ordinals the array receiving the resolved ordinals; must be at least as long as {@code specs}.
     * @return the number of ordinals stored in {@code ordinals}.
     * @throws RuntimeException if a URL is given without a map, or a URL cannot be resolved in non-permissive mode.
     */
    private static int resolveRecords(final CharSequence[] specs, final StringMap urlMap, final boolean permissive, final long[] ordinals) {
        int found = 0;
        for (final CharSequence spec : specs) {
            long ordinal;
            try {
                ordinal = Long.parseLong(spec.toString());
            } catch (NumberFormatException ignored) {
                // Not a number: the spec must be a URL, which can only be resolved through the map.
                if (urlMap == null) {
                    throw new RuntimeException("URLs cannot be specified if a map is not provided");
                }
                ordinal = urlMap.getLong(spec);
                if (ordinal < 0 && !permissive) {
                    throw new RuntimeException("URL " + spec + " cannot be resolved");
                }
            }
            if (ordinal >= 0) {
                ordinals[found++] = ordinal;
            }
        }
        return found;
    }

    /**
     * Command-line entry point.
     *
     * <p>Parses the options, resolves the requested records to ordinals, sorts them in
     * increasing order and writes the corresponding records to standard output. Exactly one of
     * the {@code recordFile} option and the {@code recordSpec} arguments must be given.
     *
     * @param strArr the command-line arguments.
     * @throws IllegalArgumentException if neither or both of {@code recordFile} and {@code recordSpec} are given.
     * @throws Exception if option handling, map loading, or record extraction fails.
     */
    public static void main(String[] strArr) throws Exception {
        final SimpleJSAP jsap = new SimpleJSAP(
                CutWarc.class.getName(),
                "Cuts (that is, extracts record) from a warc file. It requires an index.",
                new Parameter[]{
                    new Switch("gzip", 'z', "gzip", "Tells if the input warc is compressed."),
                    new Switch("outzip", 'Z', "outzip", "Tells if the output warc must be compressed."),
                    new Switch("permissive", 'p', "permissive", "Ignore unknown urls instead of throwing an exception"),
                    new FlaggedOption("recordFile", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, false, 'r', "recordFile", "A file containing, one per line, the ordinal numbers or URL of records to be output."),
                    new FlaggedOption("urlMap", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, false, 'm', "url-map", "The term map from URL to record number."),
                    new UnflaggedOption("warcFile", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, true, false, "The Warc file basename."),
                    new UnflaggedOption("recordSpec", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, false, true, "The spec (ordinal number or URL) of records to be output.")
                });
        final JSAPResult parse = jsap.parse(strArr);
        if (jsap.messagePrinted()) {
            System.exit(1);
        }

        final boolean hasRecordFile = parse.userSpecified("recordFile");
        final boolean hasRecordSpec = parse.userSpecified("recordSpec");
        if (!hasRecordFile && !hasRecordSpec) {
            throw new IllegalArgumentException("One of the two options recordFile and recordSpec must be set.");
        }
        if (hasRecordSpec && hasRecordFile) {
            throw new IllegalArgumentException("You cannot specify both recordFile and recordSpec options");
        }

        // Specs come either directly from the command line or, one per line, from a UTF-8 file.
        final CharSequence[] specs;
        if (hasRecordSpec) {
            specs = parse.getStringArray("recordSpec");
        } else {
            specs = (CharSequence[]) new FileLinesCollection(parse.getString("recordFile"), "UTF-8").allLines().toArray(new CharSequence[0]);
        }

        final String basename = parse.getString("warcFile");
        final boolean z = parse.getBoolean("gzip");
        final boolean z2 = parse.getBoolean("outzip");
        final boolean permissive = parse.getBoolean("permissive");

        final String urlMapFile = parse.getString("urlMap");
        final StringMap urlMap = urlMapFile == null ? null : (StringMap) BinIO.loadObject(urlMapFile);

        final long[] ordinals = new long[specs.length];
        final int found = resolveRecords(specs, urlMap, permissive, ordinals);
        // Only the first `found` slots hold valid ordinals; sort them so records are emitted in increasing order.
        LongArrays.quickSort(ordinals, 0, found);

        final String warcPath = basename + ".warc" + (z ? ".gz" : "");
        // try-with-resources guarantees that every opened handle is closed (and the output
        // flushed) even when the index file is missing or extraction fails midway.
        try (FastBufferedInputStream warcStream = new FastBufferedInputStream(new FileInputStream(new File(warcPath)), IO_BUFFER_SIZE);
                RandomAccessFile index = new RandomAccessFile(new File(warcPath + ".idx"), "r");
                FastBufferedOutputStream out = new FastBufferedOutputStream(System.out, IO_BUFFER_SIZE)) {
            run(warcStream, index, z, z2, ordinals, found, out);
        }
    }
}
