/*
 * Decompiled with CFR 0.152.
 */
package org.forester.application;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.math.RoundingMode;
import java.text.DecimalFormat;
import java.text.NumberFormat;
import java.util.ArrayList;
import java.util.List;
import org.forester.io.parsers.FastaParser;
import org.forester.io.parsers.GeneralMsaParser;
import org.forester.msa.DeleteableMsa;
import org.forester.msa.Msa;
import org.forester.msa.MsaInferrer;
import org.forester.msa.MsaMethods;
import org.forester.msa_compactor.Chart;
import org.forester.msa_compactor.MsaCompactor;
import org.forester.msa_compactor.MsaProperties;
import org.forester.util.CommandLineArguments;
import org.forester.util.DescriptiveStatistics;
import org.forester.util.ForesterUtil;

/**
 * Command-line driver for {@link MsaCompactor}.
 *
 * <p>Reads a multiple sequence alignment (MSA) from a file, validates the
 * command-line options and then runs exactly one of the following modes:
 * <ul>
 *   <li>{@code -i}: print basic information about the MSA and exit;</li>
 *   <li>{@code -r}: remove a given number of "worst offender" sequences;</li>
 *   <li>{@code -g}: remove sequences until a target gap-ratio is reached;</li>
 *   <li>{@code -l}: remove sequences until a target MSA length is reached;</li>
 *   <li>{@code -ml}: remove sequences shorter than a minimal effective length;</li>
 *   <li>none of {@code -r}, {@code -g}, {@code -l}, {@code -ml}: "chart only"
 *       mode, in which no output files are written.</li>
 * </ul>
 *
 * <p>Validation failures are reported through {@code ForesterUtil.fatalError}
 * (presumably terminating the JVM -- the code below relies on it not
 * returning; confirm against ForesterUtil).
 */
public class msa_compactor {
    /** Formatter with at most one fractional digit, rounding half up (see static initializer). */
    private static final NumberFormat NF_1 = new DecimalFormat("0.#");
    /** Formatter with at most four fractional digits, rounding half up (see static initializer). */
    private static final NumberFormat NF_4 = new DecimalFormat("0.####");

    static {
        NF_1.setRoundingMode(RoundingMode.HALF_UP);
        NF_4.setRoundingMode(RoundingMode.HALF_UP);
    }

    // Command-line option names (used without the leading '-').
    private static final String HELP_OPTION_1 = "help";
    private static final String HELP_OPTION_2 = "h";
    private static final String REMOVE_WORST_OFFENDERS_OPTION = "r";
    private static final String AV_GAPINESS_OPTION = "g";
    private static final String STEP_OPTION = "s";
    private static final String LENGTH_OPTION = "l";
    private static final String REALIGN_OPTION = "a";
    private static final String INFO_ONLY_OPTION = "i";
    private static final String STEP_FOR_DIAGNOSTICS_OPTION = "sd";
    private static final String MIN_LENGTH_OPTION = "ml";
    private static final String GAP_RATIO_LENGTH_OPTION = "gr";
    private static final String REPORT_ENTROPY = "e";
    private static final String OUTPUT_FORMAT_OPTION = "f";
    private static final String OUTPUT_REMOVED_SEQS_OPTION = "ro";
    private static final String MAFFT_OPTIONS = "mo";
    private static final String PERFORM_PHYLOGENETIC_INFERENCE = "t";
    private static final String PATH_TO_MAFFT_OPTION = "mafft";
    private static final String DO_NOT_NORMALIZE_FOR_EFF_LENGTH_OPTION = "nn";

    // Program identification, printed in the banner.
    private static final String PRG_NAME = "msa_compactor";
    private static final String PRG_DESC = "multiple sequence alignment compactor";
    private static final String PRG_VERSION = "0.3";
    private static final String PRG_DATE = "140508";
    private static final String E_MAIL = "czmasek@sanfordburham.org";
    private static final String WWW = "https://sites.google.com/site/cmzmasek/home/software/forester";

    /**
     * Program entry point: parses and validates the options, prints the run
     * configuration and dispatches to the selected {@link MsaCompactor} operation.
     *
     * @param args command-line arguments: options, the MSA input file and,
     *             optionally, an output file base
     */
    public static void main(String[] args) {
        try {
            CommandLineArguments cla = new CommandLineArguments(args);
            if (cla.isOptionSet(HELP_OPTION_1) || cla.isOptionSet(HELP_OPTION_2)
                    || cla.getNumberOfNames() < 1 || cla.getNumberOfNames() > 2) {
                printHelp();
                System.exit(0);
            }
            File in = cla.getFile(0);
            File out = null;
            if (cla.getNumberOfNames() > 1) {
                out = cla.getFile(1);
            }

            // Defaults; -1 / null mean "not set on the command line".
            int worst_remove = -1;
            double av_gap = -1.0;
            int length = -1;
            int step = 1;
            boolean realign = false;
            boolean normalize_for_effective_seq_length = true;
            String path_to_mafft = null;
            int step_for_diagnostics = 1;
            int min_length = -1;
            double gap_ratio = -1.0;
            boolean report_entropy = false;
            Msa.MSA_FORMAT output_format = Msa.MSA_FORMAT.FASTA;
            File removed_seqs_out_base = null;
            String mafft_options = "--auto";
            boolean perform_phylogenetic_inference = false;

            // Reject any option that is not in this list.
            List<String> allowed_options = new ArrayList<String>();
            allowed_options.add(REMOVE_WORST_OFFENDERS_OPTION);
            allowed_options.add(AV_GAPINESS_OPTION);
            allowed_options.add(LENGTH_OPTION);
            allowed_options.add(REALIGN_OPTION);
            allowed_options.add(DO_NOT_NORMALIZE_FOR_EFF_LENGTH_OPTION);
            allowed_options.add(STEP_OPTION);
            allowed_options.add(PATH_TO_MAFFT_OPTION);
            allowed_options.add(STEP_FOR_DIAGNOSTICS_OPTION);
            allowed_options.add(MIN_LENGTH_OPTION);
            allowed_options.add(GAP_RATIO_LENGTH_OPTION);
            allowed_options.add(REPORT_ENTROPY);
            allowed_options.add(OUTPUT_FORMAT_OPTION);
            allowed_options.add(OUTPUT_REMOVED_SEQS_OPTION);
            allowed_options.add(MAFFT_OPTIONS);
            allowed_options.add(PERFORM_PHYLOGENETIC_INFERENCE);
            allowed_options.add(INFO_ONLY_OPTION);
            String unknown_options = cla.validateAllowedOptionsAsString(allowed_options);
            if (unknown_options.length() > 0) {
                ForesterUtil.fatalError(PRG_NAME, "unknown option(s): " + unknown_options);
            }

            // Parse the input MSA; the stream is closed as soon as parsing is
            // done (or has failed) so the file handle is not leaked.
            DeleteableMsa msa;
            FileInputStream is = new FileInputStream(in);
            try {
                msa = FastaParser.isLikelyFasta(in)
                        ? DeleteableMsa.createInstance(FastaParser.parseMsa(is))
                        : DeleteableMsa.createInstance(GeneralMsaParser.parse(is));
            }
            finally {
                is.close();
            }
            DescriptiveStatistics initial_msa_stats = MsaMethods.calculateEffectiveLengthStatistics(msa);

            if (cla.isOptionSet(INFO_ONLY_OPTION)) {
                printInfo(in, msa, initial_msa_stats);
                System.exit(0);
            }

            // "Chart only": none of the options that actually remove sequences is set.
            final boolean chart_only = !cla.isOptionSet(LENGTH_OPTION)
                    && !cla.isOptionSet(REMOVE_WORST_OFFENDERS_OPTION)
                    && !cla.isOptionSet(AV_GAPINESS_OPTION)
                    && !cla.isOptionSet(MIN_LENGTH_OPTION);
            if (!chart_only && out == null) {
                ForesterUtil.fatalError(PRG_NAME, "outfile file missing");
            }

            if (cla.isOptionSet(REMOVE_WORST_OFFENDERS_OPTION)) {
                worst_remove = cla.getOptionValueAsInt(REMOVE_WORST_OFFENDERS_OPTION);
                if (worst_remove < 1 || worst_remove >= msa.getNumberOfSequences() - 1) {
                    ForesterUtil.fatalError(PRG_NAME, "number of worst offender sequences to remove is out of range: " + worst_remove);
                }
            }
            if (cla.isOptionSet(AV_GAPINESS_OPTION)) {
                // -g is mutually exclusive with -r.
                if (cla.isOptionSet(REMOVE_WORST_OFFENDERS_OPTION)) {
                    printHelp();
                    System.exit(0);
                }
                av_gap = cla.getOptionValueAsDouble(AV_GAPINESS_OPTION);
                // NOTE(review): 0.0 passes this check, but every later test is
                // "av_gap > 0.0", so "-g=0.0" ends up in the chart branch below
                // instead of removeViaGapAverage -- confirm whether 0.0 should be rejected.
                if (av_gap < 0.0 || av_gap >= 1.0) {
                    ForesterUtil.fatalError(PRG_NAME, "target gap-ratio is out of range: " + av_gap);
                }
            }
            if (cla.isOptionSet(LENGTH_OPTION)) {
                // -l is mutually exclusive with -r and -g.
                if (cla.isOptionSet(REMOVE_WORST_OFFENDERS_OPTION) || cla.isOptionSet(AV_GAPINESS_OPTION)) {
                    printHelp();
                    System.exit(0);
                }
                length = cla.getOptionValueAsInt(LENGTH_OPTION);
                if (length >= msa.getLength()) {
                    ForesterUtil.fatalError(PRG_NAME, "target length is out of range [longer than MSA (" + msa.getLength() + ")]: " + length);
                } else if ((double)length < initial_msa_stats.getMin()) {
                    ForesterUtil.fatalError(PRG_NAME, "target length is out of range [shorter than the shortest sequence (" + initial_msa_stats.getMin() + ") ]: " + length);
                }
            }
            if (cla.isOptionSet(MIN_LENGTH_OPTION)) {
                // -ml is a stand-alone mode and cannot be combined with any of these options.
                if (cla.isOptionSet(LENGTH_OPTION)
                        || cla.isOptionSet(REMOVE_WORST_OFFENDERS_OPTION)
                        || cla.isOptionSet(AV_GAPINESS_OPTION)
                        || cla.isOptionSet(STEP_OPTION)
                        || cla.isOptionSet(REALIGN_OPTION)
                        || cla.isOptionSet(PATH_TO_MAFFT_OPTION)
                        || cla.isOptionSet(STEP_FOR_DIAGNOSTICS_OPTION)
                        || cla.isOptionSet(REPORT_ENTROPY)
                        || cla.isOptionSet(OUTPUT_REMOVED_SEQS_OPTION)
                        || cla.isOptionSet(PERFORM_PHYLOGENETIC_INFERENCE)) {
                    printHelp();
                    System.exit(0);
                }
                min_length = cla.getOptionValueAsInt(MIN_LENGTH_OPTION);
                if (min_length < 2 || (double)min_length > initial_msa_stats.getMax()) {
                    ForesterUtil.fatalError(PRG_NAME, "value for minimal sequence length is out of range: " + min_length);
                }
            }
            if (cla.isOptionSet(STEP_OPTION)) {
                step = cla.getOptionValueAsInt(STEP_OPTION);
                if (step < 1 || step > msa.getNumberOfSequences() || (worst_remove > 0 && step > worst_remove)) {
                    ForesterUtil.fatalError(PRG_NAME, "value for step is out of range: " + step);
                }
            }
            if (cla.isOptionSet(REALIGN_OPTION)) {
                realign = true;
            }
            if (cla.isOptionSet(PATH_TO_MAFFT_OPTION)) {
                if (!realign) {
                    ForesterUtil.fatalError(PRG_NAME, "no need to indicate path to MAFFT without realigning");
                }
                path_to_mafft = cla.getOptionValueAsCleanString(PATH_TO_MAFFT_OPTION);
            }
            if (cla.isOptionSet(DO_NOT_NORMALIZE_FOR_EFF_LENGTH_OPTION)) {
                normalize_for_effective_seq_length = false;
            }
            if (cla.isOptionSet(STEP_FOR_DIAGNOSTICS_OPTION)) {
                step_for_diagnostics = cla.getOptionValueAsInt(STEP_FOR_DIAGNOSTICS_OPTION);
                if (step_for_diagnostics < 1 || step_for_diagnostics > msa.getNumberOfSequences()
                        || (worst_remove > 0 && step_for_diagnostics > worst_remove)) {
                    ForesterUtil.fatalError(PRG_NAME, "value for diagnostic step is out of range: " + step_for_diagnostics);
                }
            }
            if (cla.isOptionSet(GAP_RATIO_LENGTH_OPTION)) {
                // NOTE(review): gap_ratio is validated and echoed below, but it is never
                // handed to MsaCompactor in this method -- confirm whether -gr is
                // meant to have an effect.
                gap_ratio = cla.getOptionValueAsDouble(GAP_RATIO_LENGTH_OPTION);
                if (gap_ratio < 0.0 || gap_ratio > 1.0) {
                    ForesterUtil.fatalError(PRG_NAME, "gap ratio is out of range: " + gap_ratio);
                }
            }
            if (cla.isOptionSet(REPORT_ENTROPY)) {
                report_entropy = true;
            }
            if (cla.isOptionSet(OUTPUT_FORMAT_OPTION)) {
                // Format codes: p = phylip, f = fasta, n = nexus (case-insensitive).
                String fs = cla.getOptionValueAsCleanString(OUTPUT_FORMAT_OPTION);
                if (fs.equalsIgnoreCase("p")) {
                    output_format = Msa.MSA_FORMAT.PHYLIP;
                } else if (fs.equalsIgnoreCase("f")) {
                    output_format = Msa.MSA_FORMAT.FASTA;
                } else if (fs.equalsIgnoreCase("n")) {
                    output_format = Msa.MSA_FORMAT.NEXUS;
                } else {
                    ForesterUtil.fatalError(PRG_NAME, "illegal or empty output format option: " + fs);
                }
            }
            if (cla.isOptionSet(OUTPUT_REMOVED_SEQS_OPTION)) {
                removed_seqs_out_base = new File(cla.getOptionValueAsCleanString(OUTPUT_REMOVED_SEQS_OPTION));
            }
            if (realign) {
                // Fall back to a guessed MAFFT location when none was given.
                if (ForesterUtil.isEmpty(path_to_mafft)) {
                    path_to_mafft = MsaCompactor.guessPathToMafft();
                }
                checkPathToMafft(path_to_mafft);
                if (cla.isOptionSet(MAFFT_OPTIONS)) {
                    mafft_options = cla.getOptionValueAsCleanString(MAFFT_OPTIONS);
                    if (ForesterUtil.isEmpty(mafft_options) || mafft_options.length() < 3) {
                        ForesterUtil.fatalError(PRG_NAME, "illegal or empty MAFFT options: " + mafft_options);
                    }
                }
            } else if (cla.isOptionSet(MAFFT_OPTIONS)) {
                ForesterUtil.fatalError(PRG_NAME, "no need to indicate MAFFT options without realigning");
            }
            if (cla.isOptionSet(PERFORM_PHYLOGENETIC_INFERENCE)) {
                perform_phylogenetic_inference = true;
            }
            if (chart_only) {
                if (out != null || removed_seqs_out_base != null) {
                    ForesterUtil.fatalError(PRG_NAME, "chart only, no outfile(s) produced, thus no need to indicate output file(s)");
                }
                if (!realign && cla.isOptionSet(STEP_OPTION)) {
                    ForesterUtil.fatalError(PRG_NAME, "chart only, no re-aligning, thus no need to use step for output and re-aligning; use -sd instead");
                }
            }
            if (perform_phylogenetic_inference && step_for_diagnostics != 1) {
                ForesterUtil.fatalError(PRG_NAME, "step for diagnostics reports needs to be set to 1 for tree calculation");
            }

            // Echo the effective configuration.
            printInfo(in, msa, initial_msa_stats);
            if (!chart_only) {
                System.out.println("Output                               : " + out);
            }
            if (removed_seqs_out_base != null) {
                System.out.println("Write removed sequences to           : " + removed_seqs_out_base);
            }
            if (worst_remove > 0) {
                System.out.println("Number of worst offenders to remove  : " + worst_remove);
            }
            if (av_gap > 0.0) {
                System.out.println("Target gap-ratio                     : " + av_gap);
            }
            if (length > 0) {
                System.out.println("Target MSA length                    : " + length);
            }
            if (min_length > 1) {
                System.out.println("Minimal effective sequence length    : " + min_length);
            }
            if (gap_ratio > -1.0) {
                System.out.println("Maximum allowed gap ratio per column : " + gap_ratio);
            }
            if (out != null || removed_seqs_out_base != null) {
                System.out.print("Output format                        : ");
                if (output_format == Msa.MSA_FORMAT.FASTA) {
                    System.out.println("fasta");
                } else if (output_format == Msa.MSA_FORMAT.PHYLIP) {
                    System.out.println("phylip");
                } else if (output_format == Msa.MSA_FORMAT.NEXUS) {
                    System.out.println("nexus");
                }
            }
            if (min_length == -1) {
                // These settings are only used when not running in -ml mode.
                if (chart_only && !realign) {
                    System.out.println("Step for output and re-aligning      : n/a");
                } else if (chart_only) {
                    System.out.println("Step for re-aligning                 : " + step);
                } else {
                    System.out.println("Step for output and re-aligning      : " + step);
                }
                System.out.println("Step for diagnostics reports         : " + step_for_diagnostics);
                System.out.println("Calculate normalized Shannon Entropy : " + report_entropy);
                if (normalize_for_effective_seq_length) {
                    System.out.println("Normalize                            : with individual, effective sequence lengths");
                } else {
                    System.out.println("Normalize                            : with MSA length");
                }
                System.out.println("Realign with MAFFT                   : " + realign);
                if (realign) {
                    System.out.println("MAFFT options                        : " + mafft_options);
                }
                System.out.println("Simple tree (Kimura distances, NJ)   : " + perform_phylogenetic_inference);
            }
            System.out.println();

            // Configure the compactor and run the selected operation.
            int initial_number_of_seqs = msa.getNumberOfSequences();
            MsaCompactor mc = new MsaCompactor(msa);
            mc.setInfileName(in.getName());
            if (worst_remove > 0 || av_gap > 0.0 || length > 0 || min_length != -1) {
                mc.setOutputFormat(output_format);
                mc.setOutFileBase(out);
            }
            if (min_length != -1) {
                mc.removeSequencesByMinimalLength(min_length);
            } else {
                mc.setPeformPhylogenticInference(perform_phylogenetic_inference);
                if (removed_seqs_out_base != null) {
                    mc.setRemovedSeqsOutBase(removed_seqs_out_base);
                }
                mc.setNorm(normalize_for_effective_seq_length);
                mc.setRealign(realign);
                if (realign) {
                    mc.setPathToMafft(path_to_mafft);
                    mc.setMafftOptions(mafft_options);
                }
                mc.setStep(step);
                mc.setStepForDiagnostics(step_for_diagnostics);
                mc.setCalculateNormalizedShannonEntropy(report_entropy);

                // Precedence: -r, then -g, then -l; otherwise only chart.
                List<MsaProperties> msa_props;
                if (worst_remove > 0) {
                    msa_props = mc.removeWorstOffenders(worst_remove);
                } else if (av_gap > 0.0) {
                    msa_props = mc.removeViaGapAverage(av_gap);
                } else if (length > 0) {
                    msa_props = mc.removeViaLength(length);
                } else {
                    msa_props = mc.chart(step, realign, normalize_for_effective_seq_length);
                }
                Chart.display(msa_props, initial_number_of_seqs, report_entropy, in.getName());
                System.out.println();
                System.out.println("Final MSA properties");
                printMsaInfo(msa, MsaMethods.calculateEffectiveLengthStatistics(msa));
            }
        }
        catch (IllegalArgumentException iae) {
            ForesterUtil.fatalError(PRG_NAME, iae.getMessage());
        }
        catch (IOException ioe) {
            ForesterUtil.fatalError(PRG_NAME, ioe.getMessage());
        }
        catch (Exception e) {
            ForesterUtil.unexpectedFatalError(PRG_NAME, e);
        }
    }

    /**
     * Prints the program banner, the input file and the MSA summary.
     *
     * @param in                the MSA input file (printed as is)
     * @param msa               the alignment to summarize
     * @param initial_msa_stats effective-length statistics of {@code msa}
     */
    private static void printInfo(File in, DeleteableMsa msa, DescriptiveStatistics initial_msa_stats) {
        ForesterUtil.printProgramInformation(PRG_NAME, PRG_DESC, PRG_VERSION, PRG_DATE, E_MAIL, WWW, ForesterUtil.getForesterLibraryInformation());
        System.out.println("Input MSA                            : " + in);
        printMsaInfo(msa, initial_msa_stats);
    }

    /**
     * Prints summary properties of an MSA to standard output: length, number
     * of sequences, sequence length statistics, gap statistics and normalized
     * Shannon entropy (computed with the arguments 7 and 21).
     *
     * @param msa       the alignment to summarize
     * @param msa_stats statistics supplying median, mean, max and min sequence length
     */
    private static void printMsaInfo(DeleteableMsa msa, DescriptiveStatistics msa_stats) {
        System.out.println("MSA length                           : " + msa.getLength());
        System.out.println("Number of sequences                  : " + msa.getNumberOfSequences());
        System.out.println("Median sequence length               : " + NF_1.format(msa_stats.median()));
        System.out.println("Mean sequence length                 : " + NF_1.format(msa_stats.arithmeticMean()));
        System.out.println("Max sequence length                  : " + (int)msa_stats.getMax());
        System.out.println("Min sequence length                  : " + (int)msa_stats.getMin());
        System.out.println("Gap ratio                            : " + NF_4.format(MsaMethods.calcGapRatio(msa)));
        System.out.println("Mean gap count per sequence          : " + NF_1.format(MsaMethods.calcNumberOfGapsStats(msa).arithmeticMean()));
        System.out.println("Normalized Shannon Entropy (entn7)   : " + NF_4.format(MsaMethods.calcNormalizedShannonsEntropy(7, msa)));
        System.out.println("Normalized Shannon Entropy (entn21)  : " + NF_4.format(MsaMethods.calcNormalizedShannonsEntropy(21, msa)));
    }

    /**
     * Reports a fatal error if {@code path_to_mafft} is empty or if
     * {@code MsaInferrer.isInstalled} rejects it.
     *
     * @param path_to_mafft path to the MAFFT executable, may be empty or {@code null}
     */
    private static void checkPathToMafft(String path_to_mafft) {
        if (ForesterUtil.isEmpty(path_to_mafft)) {
            ForesterUtil.fatalError(PRG_NAME, "no MAFFT executable found, use -\"mafft=<path to MAFFT>\" option");
        } else if (!MsaInferrer.isInstalled(path_to_mafft)) {
            ForesterUtil.fatalError(PRG_NAME, "no MAFFT executable at \"" + path_to_mafft + "\"");
        }
    }

    /**
     * Prints the program banner and the usage text to standard output. The
     * line for {@code -a} mentions the guessed MAFFT path, or states that
     * none was found.
     */
    private static void printHelp() {
        ForesterUtil.printProgramInformation(PRG_NAME, PRG_DESC, PRG_VERSION, PRG_DATE, E_MAIL, WWW, ForesterUtil.getForesterLibraryInformation());
        String path_to_mafft = MsaCompactor.guessPathToMafft();
        String mafft_comment;
        if (!ForesterUtil.isEmpty(path_to_mafft)) {
            mafft_comment = " (using " + path_to_mafft + ")";
        } else {
            mafft_comment = " (no path to MAFFT found, use -\"mafft=<path to MAFFT>\" option)";
        }
        System.out.println("Usage:");
        System.out.println();
        System.out.println("msa_compactor [options] <msa input file> [output file base]");
        System.out.println();
        System.out.println(" options: ");
        System.out.println();
        System.out.println("   -i             to only display some basic information about the MSA");
        System.out.println("   -r=<integer>   number of worst offender sequences to remove");
        System.out.println("   -l=<integer>   target MSA length");
        System.out.println("   -g=<decimal>   target gap-ratio (0.0-1.0)");
        System.out.println("   -a             to realign using MAFFT" + mafft_comment);
        System.out.println("   -mo=<string>   options for MAFFT (default: --auto)");
        System.out.println("   -s=<integer>   step for output and re-aligning (default: 1)");
        System.out.println("   -sd=<integer>  step for diagnostics reports (default: 1)");
        System.out.println("   -e             to calculate normalized Shannon Entropy (not recommended for very large alignments)");
        System.out.println("   -f=<f|p|n>     format for output alignments: f for fasta (default), p for phylip, or n for nexus");
        System.out.println("   -ro=<file>     to output the removed sequences");
        System.out.println("   -ml=<integer>  minimal effective sequence length (for deleting of shorter sequences)");
        System.out.println("   -gr=<decimal>  maximal allowed gap ratio per column (for deleting of columns) (0.0-1.0)");
        System.out.println("   -t             to calculate a simple phylogenetic tree (Kimura distances, NJ)");
        System.out.println("   -nn            to normalize gap-contributions with MSA length, instead of individual effective sequence lengths");
        System.out.println();
        System.out.println();
        System.out.println();
    }
}

