package weka.distributed.hadoop;

import distributed.core.DistributedJob;
import distributed.core.DistributedJobConfig;
import distributed.hadoop.HDFSUtils;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Enumeration;
import java.util.Iterator;
import java.util.Vector;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import weka.core.Attribute;
import weka.core.CommandlineRunnable;
import weka.core.Environment;
import weka.core.Instances;
import weka.core.Option;
import weka.core.Utils;
import weka.core.stats.ArffSummaryNumericMetric;
import weka.core.stats.NominalStats;
import weka.core.stats.NumericStats;
import weka.distributed.CSVToARFFHeaderReduceTask;
import weka.distributed.DistributedWekaException;

/* loaded from: input_file:weka/distributed/hadoop/RandomizedDataChunkHadoopJob.class */
public class RandomizedDataChunkHadoopJob extends HadoopJob implements CommandlineRunnable {
    /** Serialization version identifier. */
    private static final long serialVersionUID = 3559718941696900951L;
    /** Sub-directory appended to the configured output path; chunk files are looked up/written under it. */
    protected static final String OUTPUT_SUBDIR = "/randomized";
    /** Joined option string handed to the CSV-to-ARFF header map task (derived from the ARFF header job's options in setOptions). */
    protected String m_wekaCsvToArffMapTaskOpts;
    /** ARFF header job that is run (if needed) before the chunk job to obtain the final header. */
    protected ArffHeaderHadoopJob m_arffHeaderJob;
    /** Number of chunks to create; kept as a String and parsed with Integer.parseInt in runJob. */
    protected String m_numDataChunks;
    /** Number of instances per chunk; only consulted in runJob when the number of chunks is empty. */
    protected String m_numInstancesPerDataChunk;
    /** Name or index of the class attribute (may be empty). */
    protected String m_classIndex;
    /** Random seed, kept as a String and forwarded to the map task via -seed. */
    protected String m_randomSeed;
    /** When true, "-dont-default-class-to-last" is forwarded to the map task and the negated value is passed to setClassIndex. */
    protected boolean m_dontDefaultToLastAttIfClassNotSet;
    /** When false, runJob returns early if the first chunk file already exists in the output directory. */
    protected boolean m_cleanOutputDir;

    public RandomizedDataChunkHadoopJob() {
        super("Randomly shuffled (and stratified) data chunk job", "Create a set of input files where the rows are randomly shuffled (and stratified if the class is set and nominal). One of numRandomizedDataChunks or numInstancesPerRandomizedDataChunk must be set in conjunction with this option.");
        this.m_wekaCsvToArffMapTaskOpts = "";
        this.m_arffHeaderJob = new ArffHeaderHadoopJob();
        this.m_numDataChunks = "";
        this.m_numInstancesPerDataChunk = "";
        this.m_classIndex = "";
        this.m_randomSeed = "1";
        this.m_mrConfig.setMapperClass(RandomizedDataChunkHadoopMapper.class.getName());
        this.m_mrConfig.setReducerClass(RandomizedDataChunkHadoopReducer.class.getName());
        this.m_mrConfig.setMapOutputValueClass(Text.class.getName());
    }

    /**
     * Returns a short description of this job.
     *
     * @return the description text
     */
    public String globalInfo() {
        final String info = "Produces randomized and stratified data chunks";
        return info;
    }

    /**
     * Sets the option string that runJob forwards to the CSV-to-ARFF header map task.
     *
     * @param str the joined option string
     */
    public void setCSVMapTaskOptions(String str) {
        m_wekaCsvToArffMapTaskOpts = str;
    }

    /**
     * Returns the option string that runJob forwards to the CSV-to-ARFF header map task.
     *
     * @return the joined option string
     */
    public String getCSVMapTaskOptions() {
        return m_wekaCsvToArffMapTaskOpts;
    }

    /**
     * Returns the tip text for the "number of randomized data chunks" property.
     *
     * @return the tip text
     */
    public String numRandomizedDataChunksTipText() {
        final String tip = "The number of randomly shuffled data chunks to create. "
            + "Use in conjunction with createRandomizedDataChunks";
        return tip;
    }

    /**
     * Sets the flag that runJob forwards to the map task as
     * "-dont-default-class-to-last" and passes (negated) when setting the class index.
     *
     * @param z the new flag value
     */
    public void setDontDefaultToLastAttIfClassNotSpecified(boolean z) {
        m_dontDefaultToLastAttIfClassNotSet = z;
    }

    /**
     * Returns the "don't default the class to the last attribute" flag.
     *
     * @return the current flag value
     */
    public boolean getDontDefaultToLastAttIfClassNotSpecified() {
        return m_dontDefaultToLastAttIfClassNotSet;
    }

    /**
     * Sets the number of randomized data chunks to create.
     *
     * @param str the number of chunks, as a string
     */
    public void setNumRandomizedDataChunks(String str) {
        m_numDataChunks = str;
    }

    /**
     * Returns the number of randomized data chunks to create.
     *
     * @return the number of chunks, as a string
     */
    public String getNumRandomizedDataChunks() {
        return m_numDataChunks;
    }

    /**
     * Returns the tip text for the "instances per randomized data chunk" property.
     *
     * @return the tip text
     */
    public String numInstancesPerRandomizedDataChunkTipText() {
        final String tip = "The number of instances that each randomly shuffled data chunk should contain. "
            + "Use in conjunction with createRandomizedDataChunks.";
        return tip;
    }

    /**
     * Sets the number of instances each randomized data chunk should contain.
     *
     * @param str the number of instances per chunk, as a string
     */
    public void setNumInstancesPerRandomizedDataChunk(String str) {
        m_numInstancesPerDataChunk = str;
    }

    /**
     * Returns the number of instances each randomized data chunk should contain.
     *
     * @return the number of instances per chunk, as a string
     */
    public String getNumInstancesPerRandomizedDataChunk() {
        return m_numInstancesPerDataChunk;
    }

    /**
     * Returns the tip text for the class attribute property.
     *
     * @return the tip text
     */
    public String classAttributeTipText() {
        final String tip = "The name or index of the class attribute. 'first' and 'last' may also be used. "
            + "If set, and the class is nominal, then output chunk files will be stratified.";
        return tip;
    }

    /**
     * Sets the name or index of the class attribute.
     *
     * @param str the class attribute name or index
     */
    public void setClassAttribute(String str) {
        m_classIndex = str;
    }

    /**
     * Returns the name or index of the class attribute.
     *
     * @return the class attribute name or index
     */
    public String getClassAttribute() {
        return m_classIndex;
    }

    /**
     * Returns the tip text for the random seed property.
     *
     * @return the tip text
     */
    public String randomSeedTipText() {
        final String tip = "The random seed to use when randomly shuffling the data";
        return tip;
    }

    /**
     * Sets the random seed used when shuffling the data.
     *
     * @param str the seed, as a string
     */
    public void setRandomSeed(String str) {
        m_randomSeed = str;
    }

    /**
     * Returns the random seed used when shuffling the data.
     *
     * @return the seed, as a string
     */
    public String getRandomSeed() {
        return m_randomSeed;
    }

    /**
     * Returns the tip text for the "clean output directory" property.
     *
     * @return the tip text
     */
    public String cleanOutputDirectoryTipText() {
        final String tip = "Set to true to delete any existing output directory and force this job to run";
        return tip;
    }

    /**
     * Sets whether the job should run even if the output directory already
     * holds chunk files.
     *
     * @param z true to force the job to run
     */
    public void setCleanOutputDirectory(boolean z) {
        m_cleanOutputDir = z;
    }

    /**
     * Returns whether the job is forced to run even if the output directory
     * already holds chunk files.
     *
     * @return true if the job is forced to run
     */
    public boolean getCleanOutputDirectory() {
        return m_cleanOutputDir;
    }

    /**
     * Lists the command-line options of this job followed by the options of a
     * freshly constructed ARFF header job.
     *
     * @return an enumeration of all available options
     */
    @Override // weka.distributed.hadoop.HadoopJob
    public Enumeration<Option> listOptions() {
        Vector<Option> options = new Vector<Option>();

        options.addElement(new Option(
            "\tClean output directory. Forces the job to run in the case where the output directory\n"
                + "\talready exists and is populated with chunk files",
            "clean", 0, "-clean"));
        options.addElement(new Option(
            "\tNumber of randomized data chunks. Use in conjunction with\n"
                + "\t-randomized-chunks",
            "num-chunks", 1, "-num-chunks <integer>"));
        options.addElement(new Option(
            "\tNumber of instances per randomized data chunk.\n"
                + "\tUse in conjunction with -randomized-chunks",
            "num-instances-per-chunk", 1, "-num-instances-per-chunk <integer>"));
        options.addElement(new Option(
            "\tClass index (1-based) or class attribute name (default = last attribute).",
            "class", 1, "-class <index or name>"));
        options.addElement(new Option("\tRandom seed (default = 1)", "seed", 1, "-seed <integer>"));

        // Section header, then everything the ARFF header job understands.
        options.addElement(new Option("", "", 0, "\nOptions specific to ARFF training header creation:"));
        for (Enumeration<Option> headerOpts = new ArffHeaderHadoopJob().listOptions(); headerOpts.hasMoreElements();) {
            options.addElement(headerOpts.nextElement());
        }

        return options.elements();
    }

    /**
     * Parses this job's own options, then hands the option array to the
     * superclass and an untouched copy of it to the ARFF header job. If the
     * ARFF header job reports a non-empty option string afterwards, that
     * string becomes the CSV map task options.
     *
     * @param strArr the command-line options
     * @throws Exception if option parsing fails
     */
    @Override // weka.distributed.hadoop.HadoopJob
    public void setOptions(String[] strArr) throws Exception {
        String numChunks = Utils.getOption("num-chunks", strArr);
        setNumRandomizedDataChunks(numChunks);

        String instancesPerChunk = Utils.getOption("num-instances-per-chunk", strArr);
        setNumInstancesPerRandomizedDataChunk(instancesPerChunk);

        String classAttribute = Utils.getOption("class", strArr);
        setClassAttribute(classAttribute);

        String seed = Utils.getOption("seed", strArr);
        setRandomSeed(seed);

        boolean clean = Utils.getFlag("clean", strArr);
        setCleanOutputDirectory(clean);

        // Copy taken before the superclass sees the array, so the ARFF header
        // job gets the same remaining options.
        String[] headerJobOptions = strArr.clone();
        super.setOptions(strArr);
        m_arffHeaderJob.setOptions(headerJobOptions);

        String csvTaskOptions = Utils.joinOptions(m_arffHeaderJob.getOptions());
        if (!DistributedJobConfig.isEmpty(csvTaskOptions)) {
            setCSVMapTaskOptions(csvTaskOptions);
        }
    }

    /**
     * Returns the current option settings: this job's own options followed by
     * the individual tokens of the CSV map task option string.
     *
     * @return the options as an array of strings
     */
    @Override // weka.distributed.hadoop.HadoopJob
    public String[] getOptions() {
        ArrayList<String> opts = new ArrayList<String>();

        if (getCleanOutputDirectory()) {
            opts.add("-clean");
        }

        String numChunks = getNumRandomizedDataChunks();
        if (!DistributedJobConfig.isEmpty(numChunks)) {
            opts.add("-num-chunks");
            opts.add(numChunks);
        }

        String instancesPerChunk = getNumInstancesPerRandomizedDataChunk();
        if (!DistributedJobConfig.isEmpty(instancesPerChunk)) {
            opts.add("-num-instances-per-chunk");
            opts.add(instancesPerChunk);
        }

        String classAttribute = getClassAttribute();
        if (!DistributedJobConfig.isEmpty(classAttribute)) {
            opts.add("-class");
            opts.add(classAttribute);
        }

        String seed = getRandomSeed();
        if (!DistributedJobConfig.isEmpty(seed)) {
            opts.add("-seed");
            opts.add(seed);
        }

        String csvTaskOptions = getCSVMapTaskOptions();
        if (!DistributedJobConfig.isEmpty(csvTaskOptions)) {
            try {
                String[] csvTokens = Utils.splitOptions(csvTaskOptions);
                for (int i = 0; i < csvTokens.length; i++) {
                    opts.add(csvTokens[i]);
                }
            } catch (Exception e) {
                // Best effort: the job's own options are still returned.
                e.printStackTrace();
            }
        }

        return opts.toArray(new String[opts.size()]);
    }

    /**
     * Returns only the options specific to this job (clean flag, number of
     * chunks, instances per chunk, class attribute and seed), leaving out the
     * CSV map task options.
     *
     * @return the job-specific options as an array of strings
     */
    public String[] getJobOptionsOnly() {
        ArrayList<String> opts = new ArrayList<String>();

        if (getCleanOutputDirectory()) {
            opts.add("-clean");
        }

        String numChunks = getNumRandomizedDataChunks();
        if (!DistributedJobConfig.isEmpty(numChunks)) {
            opts.add("-num-chunks");
            opts.add(numChunks);
        }

        String instancesPerChunk = getNumInstancesPerRandomizedDataChunk();
        if (!DistributedJobConfig.isEmpty(instancesPerChunk)) {
            opts.add("-num-instances-per-chunk");
            opts.add(instancesPerChunk);
        }

        String classAttribute = getClassAttribute();
        if (!DistributedJobConfig.isEmpty(classAttribute)) {
            opts.add("-class");
            opts.add(classAttribute);
        }

        String seed = getRandomSeed();
        if (!DistributedJobConfig.isEmpty(seed)) {
            opts.add("-seed");
            opts.add(seed);
        }

        return opts.toArray(new String[opts.size()]);
    }

    /**
     * Makes sure a final ARFF header is available. If the ARFF header job
     * already holds one, nothing is run; otherwise the header job is given
     * this job's environment, log and status prefix and is executed.
     *
     * @return true if a header was already available or the header job succeeded
     * @throws DistributedWekaException if the header job reports a problem
     * @throws IOException declared by the signature
     */
    protected boolean initializeAndRunArffJob() throws DistributedWekaException, IOException {
        boolean headerAlreadyAvailable = m_arffHeaderJob.getFinalHeader() != null;
        if (headerAlreadyAvailable) {
            return true;
        }

        // Fall back to the system-wide environment when none has been supplied.
        if (m_env == null) {
            m_env = Environment.getSystemWide();
        }

        logMessage("Checking to see if ARFF job is needed....");
        statusMessage("Checking to see if ARFF job is needed...");

        m_arffHeaderJob.setEnvironment(m_env);
        m_arffHeaderJob.setLog(getLog());
        m_arffHeaderJob.setStatusMessagePrefix(m_statusMessagePrefix);

        boolean headerJobSucceeded = m_arffHeaderJob.runJob();
        if (!headerJobSucceeded) {
            statusMessage("Unable to continue - creating the ARFF header failed!");
            logMessage("Unable to continue - creating the ARFF header failed!");
        }
        return headerJobSucceeded;
    }

    /**
     * Runs the randomized data chunk job.
     *
     * <p>Steps, in order:
     * <ol>
     * <li>Unless the clean flag is set, return true immediately if the first
     * chunk file ("chunk0-r-00000") already exists under the output
     * sub-directory.</li>
     * <li>Run the ARFF header job (if no header is available yet).</li>
     * <li>Read the total number of instances from the summary attribute of the
     * first numeric or nominal attribute in the header.</li>
     * <li>Work out the number of chunks and configure and submit the Hadoop
     * job with one named output per chunk and a single reducer.</li>
     * </ol>
     *
     * <p>The thread's context class loader is switched to this class's loader
     * for the duration of the call and always restored afterwards.
     *
     * @return true if the chunks already exist or the job succeeded; false if
     *         the ARFF header job or the chunk job failed
     * @throws DistributedWekaException if configuration is invalid or any
     *         step throws; the job status is set to FAILED in that case
     */
    public boolean runJob() throws DistributedWekaException {
        ClassLoader originalLoader = Thread.currentThread().getContextClassLoader();
        try {
            Thread.currentThread().setContextClassLoader(getClass().getClassLoader());

            if (!this.m_cleanOutputDir) {
                // NOTE(review): unlike the output path used for the job further down, this
                // path is not passed through environmentSubstitute - confirm that is intended.
                String existingOutputDir = this.m_mrConfig.getOutputPath() + OUTPUT_SUBDIR;
                Configuration fsConf = new Configuration();
                this.m_mrConfig.getHDFSConfig().configureForHadoop(fsConf, this.m_env);
                if (FileSystem.get(fsConf).exists(new Path(existingOutputDir + "/chunk0-r-00000"))) {
                    String skipMessage = "Output directory is populated with chunk files - no need to execute";
                    if (this.m_log != null) {
                        statusMessage(skipMessage);
                        logMessage(skipMessage);
                    } else {
                        System.err.println(skipMessage);
                    }
                    return true;
                }
            }

            setJobStatus(DistributedJob.JobStatus.RUNNING);
            this.m_arffHeaderJob.setGenerateCharts(false);
            if (!initializeAndRunArffJob()) {
                return false;
            }

            Instances headerWithSummary = this.m_arffHeaderJob.getFinalHeader();
            Instances headerNoSummary = CSVToARFFHeaderReduceTask.stripSummaryAtts(headerWithSummary);
            WekaClassifierHadoopMapper.setClassIndex(getClassAttribute(), headerNoSummary,
                !this.m_dontDefaultToLastAttIfClassNotSet);

            // The summary attribute of any numeric or nominal attribute carries the
            // instance count, so the first one found is used.
            Attribute countSource = null;
            for (int i = 0; i < headerNoSummary.numAttributes(); i++) {
                Attribute candidate = headerNoSummary.attribute(i);
                if (candidate.isNumeric() || candidate.isNominal()) {
                    countSource = candidate;
                    break;
                }
            }
            if (countSource == null) {
                // Previously this surfaced as a bare NullPointerException.
                throw new DistributedWekaException(
                    "Unable to determine the number of instances: the header contains no numeric or nominal attribute");
            }
            String countSourceName = countSource.name();
            Attribute summaryAtt = headerWithSummary.attribute("arff_summary_" + countSourceName);
            if (summaryAtt == null) {
                throw new DistributedWekaException("Was unable to find the summary attribute for attribute: " + countSourceName);
            }

            int totalInstances = 0;
            if (countSource.isNominal()) {
                NominalStats nominalStats = NominalStats.attributeToStats(summaryAtt);
                Iterator<?> labels = nominalStats.getLabels().iterator();
                while (labels.hasNext()) {
                    totalInstances = (int) (totalInstances + nominalStats.getCount((String) labels.next()));
                }
            } else {
                totalInstances = (int) NumericStats.attributeToStats(summaryAtt).getStats()[ArffSummaryNumericMetric.COUNT.ordinal()];
            }

            Configuration jobConf = new Configuration();
            String headerPath = environmentSubstitute(this.m_arffHeaderJob.getAggregatedHeaderPath());
            HDFSUtils.addFileToDistributedCache(this.m_mrConfig.getHDFSConfig(), jobConf, headerPath, this.m_env);
            // The map task receives only the file name of the header.
            String headerFileName = headerPath.substring(headerPath.lastIndexOf("/") + 1);

            ArrayList<String> mapTaskOptions = new ArrayList<String>();
            mapTaskOptions.add("-arff-header");
            mapTaskOptions.add(headerFileName);
            if (!DistributedJobConfig.isEmpty(getClassAttribute())) {
                mapTaskOptions.add("-class");
                mapTaskOptions.add(environmentSubstitute(getClassAttribute()));
            }
            if (!DistributedJobConfig.isEmpty(getRandomSeed())) {
                mapTaskOptions.add("-seed");
                mapTaskOptions.add(environmentSubstitute(getRandomSeed()));
            }
            if (this.m_dontDefaultToLastAttIfClassNotSet) {
                mapTaskOptions.add("-dont-default-class-to-last");
            }
            this.m_mrConfig.setUserSuppliedProperty(
                RandomizedDataChunkHadoopMapper.RANDOMIZED_DATA_CHUNK_MAP_TASK_OPTIONS,
                environmentSubstitute(Utils.joinOptions(mapTaskOptions.toArray(new String[mapTaskOptions.size()]))));
            this.m_mrConfig.setUserSuppliedProperty(
                CSVToArffHeaderHadoopMapper.CSV_TO_ARFF_HEADER_MAP_TASK_OPTIONS,
                environmentSubstitute(getCSVMapTaskOptions()));

            int numChunks = resolveNumChunks(totalInstances);
            this.m_mrConfig.setUserSuppliedProperty(RandomizedDataChunkHadoopReducer.NUM_DATA_CHUNKS, "" + numChunks);

            // NOTE(review): this rewrites the configured output path to include the
            // sub-directory, so a second runJob() call on the same instance would append it again.
            String chunkOutputDir = environmentSubstitute(this.m_mrConfig.getOutputPath() + OUTPUT_SUBDIR);
            this.m_mrConfig.setOutputPath(chunkOutputDir);
            this.m_mrConfig.setNumberOfReducers("1");
            installWekaLibrariesInHDFS(jobConf);

            Job job = this.m_mrConfig.configureForHadoop(
                "Create randomly shuffled input data chunk job - num chunks: " + numChunks, jobConf, this.m_env);
            for (int chunk = 0; chunk < numChunks; chunk++) {
                MultipleOutputs.addNamedOutput(job, "chunk" + chunk, TextOutputFormat.class, Text.class, Text.class);
            }
            this.m_mrConfig.deleteOutputDirectory(job, this.m_env);

            statusMessage("Submitting randomized data chunk job ");
            logMessage("Submitting randomized data chunk job ");
            boolean succeeded = runJob(job);
            setJobStatus(succeeded ? DistributedJob.JobStatus.FINISHED : DistributedJob.JobStatus.FAILED);
            if (succeeded) {
                // Remove the reducer's default output file; the data lives in the named chunk outputs.
                HDFSUtils.deleteFile(this.m_mrConfig.getHDFSConfig(), jobConf, chunkOutputDir + "/part-r-00000", this.m_env);
            } else {
                String failureMessage = "Create randomly shuffled input data chunk job failed - check logs on Hadoop";
                statusMessage(failureMessage);
                logMessage(failureMessage);
            }
            return succeeded;
        } catch (DistributedWekaException e) {
            // Already the declared type: rethrow as-is so its message is not buried in nested wrappers.
            setJobStatus(DistributedJob.JobStatus.FAILED);
            throw e;
        } catch (Exception e) {
            setJobStatus(DistributedJob.JobStatus.FAILED);
            throw new DistributedWekaException(e);
        } finally {
            Thread.currentThread().setContextClassLoader(originalLoader);
        }
    }

    /**
     * Determines how many chunks to create: the explicit number of chunks if
     * set, otherwise the total number of instances divided by the requested
     * instances per chunk, rounded up.
     *
     * @param totalInstances total number of instances in the dataset
     * @return the number of chunks (always greater than 1)
     * @throws DistributedWekaException if neither setting is supplied, a value
     *         is not a valid integer, the instances per chunk is out of range,
     *         or the resulting number of chunks is 1 or less
     */
    private int resolveNumChunks(int totalInstances) throws DistributedWekaException {
        boolean numChunksUnset = DistributedJobConfig.isEmpty(getNumRandomizedDataChunks());
        if (numChunksUnset && DistributedJobConfig.isEmpty(getNumInstancesPerRandomizedDataChunk())) {
            throw new DistributedWekaException("Must specify either the number of chunks to create or the number of instances per chunk");
        }

        int numChunks;
        try {
            if (numChunksUnset) {
                int instancesPerChunk = Integer.parseInt(environmentSubstitute(getNumInstancesPerRandomizedDataChunk()));
                if (instancesPerChunk <= 0) {
                    throw new DistributedWekaException("Number of instances per chunk must be > 0");
                }
                if (instancesPerChunk > totalInstances) {
                    throw new DistributedWekaException("Can't have more instances per chunk than there are instances in the dataset!");
                }
                // Divide in floating point: integer division would truncate before
                // Math.ceil and under-count the chunks (e.g. 10 / 6 -> 1 instead of 2).
                numChunks = (int) Math.ceil((double) totalInstances / instancesPerChunk);
            } else {
                numChunks = Integer.parseInt(environmentSubstitute(getNumRandomizedDataChunks()));
            }
        } catch (NumberFormatException e) {
            throw new DistributedWekaException(e);
        }

        if (numChunks <= 1) {
            throw new DistributedWekaException("Can't randomize because number of data chunks <= 1");
        }
        return numChunks;
    }

    /**
     * Returns the directory holding the randomized chunk files: the configured
     * output path, with "/randomized" appended unless the path already ends
     * with it.
     *
     * @return the chunk output path
     */
    public String getRandomizedChunkOutputPath() {
        String configuredPath = m_mrConfig.getOutputPath();
        String suffix;
        if (m_mrConfig.getOutputPath().endsWith(OUTPUT_SUBDIR)) {
            suffix = "";
        } else {
            suffix = OUTPUT_SUBDIR;
        }
        return configuredPath + suffix;
    }

    /**
     * Command-line entry point: prints the option summary and exits with
     * status 1 when the 'h' flag is present, otherwise applies the options to
     * the supplied job and runs it. Exceptions raised while doing so are
     * printed to standard error rather than propagated.
     *
     * @param obj the job to run; must be a RandomizedDataChunkHadoopJob
     * @param strArr the command-line options
     * @throws IllegalArgumentException if obj is not a RandomizedDataChunkHadoopJob
     */
    public void run(Object obj, String[] strArr) throws IllegalArgumentException {
        boolean isChunkJob = obj instanceof RandomizedDataChunkHadoopJob;
        if (!isChunkJob) {
            throw new IllegalArgumentException("Object to run is not a RandomizedDataChunkHadoopJob!");
        }

        try {
            RandomizedDataChunkHadoopJob job = (RandomizedDataChunkHadoopJob) obj;
            boolean helpRequested = Utils.getFlag('h', strArr);
            if (helpRequested) {
                String usage = DistributedJob.makeOptionsStr(job);
                System.err.println(usage);
                System.exit(1);
            }
            job.setOptions(strArr);
            job.runJob();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Creates a new job and runs it with the given command-line options.
     *
     * @param strArr the command-line options
     */
    public static void main(String[] strArr) {
        RandomizedDataChunkHadoopJob job = new RandomizedDataChunkHadoopJob();
        job.run(job, strArr);
    }
}
