package bio.ferlab.datalake.spark3.utils;

import bio.ferlab.datalake.spark3.transformation.CamelToSnake$;
import bio.ferlab.datalake.spark3.transformation.NormalizeColumnName$;
import org.apache.spark.SparkConf;
import org.apache.spark.sql.Column;
import org.apache.spark.sql.DataFrameReader;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.SparkSession$;
import org.apache.spark.sql.SparkSession$implicits$;
import org.apache.spark.sql.functions$;
import org.apache.spark.sql.types.DataType;
import org.apache.spark.sql.types.StringType$;
import org.apache.spark.sql.types.StructField;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import scala.Array$;
import scala.MatchError;
import scala.Predef$;
import scala.Predef$ArrowAssoc$;
import scala.Tuple2;
import scala.Tuple7;
import scala.collection.TraversableOnce;
import scala.collection.immutable.$colon;
import scala.collection.immutable.List;
import scala.collection.immutable.List$;
import scala.collection.immutable.Nil$;
import scala.collection.immutable.StringOps;
import scala.collection.mutable.ArrayOps;
import scala.reflect.ClassTag$;
import scala.reflect.api.Mirror;
import scala.reflect.api.TypeCreator;
import scala.reflect.api.TypeTags;
import scala.reflect.api.Types;
import scala.reflect.api.Universe;
import scala.reflect.runtime.package$;
import scala.runtime.BoxedUnit;
import scala.runtime.BoxesRunTime;

/* compiled from: DataProfiling.scala */
/* loaded from: input_file:bio/ferlab/datalake/spark3/utils/DataProfiling$.class */
public final class DataProfiling$ {
    public static DataProfiling$ MODULE$;
    private final Logger log;

    // Instantiates the singleton once when the class is initialised; the private
    // constructor is what publishes the new instance through MODULE$.
    static {
        new DataProfiling$();
    }

    /**
     * Exposes the logger created in the constructor.
     *
     * @return the SLF4J logger held by this singleton
     */
    public Logger log() {
        return log;
    }

    /**
     * Stops the given session and builds a new Hive-enabled one named "SparkApp" whose
     * configuration is the old session's configuration plus a fixed set of S3A / Delta /
     * datetime settings.
     *
     * @param str          value stored under {@code fs.s3a.access.key}
     * @param str2         value stored under {@code fs.s3a.secret.key}
     * @param str3         value stored under {@code spark.hadoop.fs.s3a.endpoint}
     * @param sparkSession session whose configuration is reused; it is stopped by this call
     * @return the session returned by {@code getOrCreate()} on the new builder
     */
    public SparkSession initSparkSession(String str, String str2, String str3, SparkSession sparkSession) {
        // Read the configuration before stopping the session it belongs to.
        SparkConf conf = sparkSession.sparkContext().getConf();
        sparkSession.stop();

        // All keys are distinct, so the order in which they are applied does not affect the result.
        // Note: the two credential keys carry no "spark.hadoop." prefix, unlike the other s3a
        // settings; they are kept exactly as in the original.
        final String[][] overrides = {
            {"spark.sql.legacy.timeParserPolicy", "CORRECTED"},
            {"spark.sql.legacy.parquet.datetimeRebaseModeInWrite", "CORRECTED"},
            {"fs.s3a.access.key", str},
            {"fs.s3a.secret.key", str2},
            {"spark.hadoop.fs.s3a.endpoint", str3},
            {"spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem"},
            {"spark.hadoop.fs.s3a.aws.credentials.provider", "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider"},
            {"spark.hadoop.fs.s3a.path.style.access", "true"},
            {"spark.hadoop.fs.s3a.connection.ssl.enabled", "true"},
            {"spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension"},
            {"spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog"},
            {"spark.databricks.delta.retentionDurationCheck.enabled", "false"},
            {"spark.delta.merge.repartitionBeforeWrite", "true"},
        };

        // Thread the value returned by set(...) through the loop, as the original fold did.
        SparkConf merged = conf;
        for (String[] setting : overrides) {
            merged = merged.set(setting[0], setting[1]);
        }

        return SparkSession$.MODULE$.builder()
                .config(merged)
                .enableHiveSupport()
                .appName("SparkApp")
                .getOrCreate();
    }

    /**
     * Lists the tables of one schema from {@code information_schema.tables}, ordered by
     * table name, prints the listing and returns it.
     *
     * @param str             schema name compared against {@code table_schema}
     * @param i               row count passed to {@code show}
     * @param sparkSession    not used by this method body
     * @param dataFrameReader reader that receives the statement through its "query" option
     * @return the ordered table listing
     */
    public Dataset<Row> showSchema(String str, int i, SparkSession sparkSession, DataFrameReader dataFrameReader) {
        // Double embedded single quotes so the schema name cannot terminate the SQL literal.
        // String.valueOf keeps the original rendering of a null argument ("null").
        final String schemaLiteral = String.valueOf(str).replace("'", "''");
        final String query = "SELECT table_catalog, table_schema, table_name, table_type FROM information_schema.tables WHERE table_schema='" + schemaLiteral + "'";

        final Dataset<Row> tables = dataFrameReader.option("query", query).load().orderBy("table_name", Predef$.MODULE$.wrapRefArray(new String[0]));
        tables.show(i, false);
        return tables;
    }

    /**
     * Default-argument accessor; by its {@code $default$2} name it presumably supplies
     * the second parameter of {@code showSchema}.
     *
     * @return 50
     */
    public int showSchema$default$2() {
        final int defaultRows = 50;
        return defaultRows;
    }

    /**
     * Counts the rows of {@code information_schema.tables} per {@code table_schema},
     * orders the counts by schema name, prints them and returns them.
     *
     * @param i               row count passed to {@code show}
     * @param sparkSession    not used by this method body
     * @param dataFrameReader reader that receives the statement through its "query" option
     * @return one row per schema with its count
     */
    public Dataset<Row> showSchemas(int i, SparkSession sparkSession, DataFrameReader dataFrameReader) {
        final Dataset<Row> allTables = dataFrameReader
                .option("query", "SELECT table_catalog, table_schema, table_name, table_type FROM information_schema.tables")
                .load();

        final Dataset<Row> perSchema = allTables
                .groupBy(Predef$.MODULE$.wrapRefArray(new Column[]{functions$.MODULE$.col("table_schema")}))
                .count()
                .orderBy("table_schema", Predef$.MODULE$.wrapRefArray(new String[0]));

        perSchema.show(i, false);
        return perSchema;
    }

    /**
     * Default-argument accessor; by its {@code $default$1} name it presumably supplies
     * the first parameter of {@code showSchemas}.
     *
     * @return 50
     */
    public int showSchemas$default$1() {
        final int defaultRows = 50;
        return defaultRows;
    }

    /**
     * Loads the result of an arbitrary statement through the supplied reader, prints it
     * and returns it.
     *
     * @param str             statement handed to the reader's "query" option
     * @param i               row count passed to {@code show}
     * @param sparkSession    not used by this method body
     * @param dataFrameReader reader used to load the result
     * @return the loaded result
     */
    public Dataset<Row> sql(String str, int i, SparkSession sparkSession, DataFrameReader dataFrameReader) {
        final DataFrameReader queryReader = dataFrameReader.option("query", str);
        final Dataset<Row> result = queryReader.load();
        result.show(i, false);
        return result;
    }

    /**
     * Default-argument accessor; by its {@code $default$2} name it presumably supplies
     * the second parameter of {@code sql}.
     *
     * @return 50
     */
    public int sql$default$2() {
        final int defaultRows = 50;
        return defaultRows;
    }

    /**
     * Prints a comma-separated header line for a table, then one line per field of the
     * dataset's schema (each produced by {@code $anonfun$printIngestionSpec$1}).
     *
     * <p>Header layout:
     * {@code str.str2,str5,str4/str3/<snake(str2)>,<lower(str)>_<snake(str2)>,str6}.
     *
     * @param dataset dataset whose schema fields are listed
     * @param str     first part of the source identifier; also lower-cased into the target name
     * @param str2    second part of the source identifier; also converted with camel2Snake
     * @param str3    middle path segment
     * @param str4    leading path segment
     * @param str5    second header column
     * @param str6    last header column
     */
    public void printIngestionSpec(Dataset<Row> dataset, String str, String str2, String str3, String str4, String str5, String str6) {
        // Locale.ROOT keeps the generated identifier stable regardless of the JVM default
        // locale (a Turkish default would otherwise lower-case 'I' to a dotless 'ı').
        Predef$.MODULE$.println(str + "." + str2 + "," + str5 + "," + str4 + "/" + str3 + "/"
                + CamelToSnake$.MODULE$.camel2Snake(str2) + ","
                + str.toLowerCase(java.util.Locale.ROOT) + "_" + CamelToSnake$.MODULE$.camel2Snake(str2) + ","
                + str6);

        for (StructField field : dataset.schema().fields()) {
            $anonfun$printIngestionSpec$1(field);
        }
    }

    /**
     * Default-argument accessor; by its {@code $default$5} name it presumably supplies
     * the fifth parameter of {@code printIngestionSpec}.
     *
     * @return {@code "s3a://red-prd/raw"}
     */
    public String printIngestionSpec$default$5() {
        final String defaultValue = "s3a://red-prd/raw";
        return defaultValue;
    }

    /**
     * Default-argument accessor; by its {@code $default$6} name it presumably supplies
     * the sixth parameter of {@code printIngestionSpec}.
     *
     * @return {@code "DELTA"}
     */
    public String printIngestionSpec$default$6() {
        final String defaultValue = "DELTA";
        return defaultValue;
    }

    /**
     * Default-argument accessor; by its {@code $default$7} name it presumably supplies
     * the seventh parameter of {@code printIngestionSpec}.
     *
     * @return {@code "INSERT"}
     */
    public String printIngestionSpec$default$7() {
        final String defaultValue = "INSERT";
        return defaultValue;
    }

    /**
     * Builds the quoted identifier {@code "str.str2"} and right-pads it with spaces.
     *
     * @param str  first part of the identifier
     * @param str2 second part of the identifier
     * @param i    minimum length of the result; a longer identifier is returned unpadded
     * @return the quoted identifier, space-padded on the right up to {@code i} characters
     */
    public String externalDsId(String str, String str2, int i) {
        // Plain Java padding replaces the raw-typed Scala padTo/mkString round trip;
        // the output is the same, and null parts still render as "null".
        final StringBuilder id = new StringBuilder().append("\"").append(str).append(".").append(str2).append("\"");
        while (id.length() < i) {
            id.append(' ');
        }
        return id.toString();
    }

    /**
     * Default-argument accessor; by its {@code $default$3} name it presumably supplies
     * the third parameter of {@code externalDsId}.
     *
     * @return 55
     */
    public int externalDsId$default$3() {
        final int defaultWidth = 55;
        return defaultWidth;
    }

    /**
     * Builds the quoted identifier {@code "raw_<str>_<str2>"} from the lower-cased
     * arguments and right-pads it with spaces.
     *
     * @param str  first part of the identifier (lower-cased)
     * @param str2 second part of the identifier (lower-cased)
     * @param i    minimum length of the result; a longer identifier is returned unpadded
     * @return the quoted identifier, space-padded on the right up to {@code i} characters
     */
    public String rawDsId(String str, String str2, int i) {
        // Locale.ROOT keeps the identifier independent of the JVM default locale
        // (a Turkish default would otherwise map 'I' to a dotless 'ı').
        final StringBuilder id = new StringBuilder()
                .append("\"raw_")
                .append(str.toLowerCase(java.util.Locale.ROOT))
                .append("_")
                .append(str2.toLowerCase(java.util.Locale.ROOT))
                .append("\"");
        while (id.length() < i) {
            id.append(' ');
        }
        return id.toString();
    }

    /**
     * Default-argument accessor; by its {@code $default$3} name it presumably supplies
     * the third parameter of {@code rawDsId}.
     *
     * @return 55
     */
    public int rawDsId$default$3() {
        final int defaultWidth = 55;
        return defaultWidth;
    }

    /**
     * Prints one {@code DatasetConf(...)} source line for a table.
     *
     * @param str  first identifier part; also printed as the TableConf's first argument
     * @param str2 second identifier part; also printed, quoted and padded, as the TableConf's second argument
     * @param str3 printed as the fourth DatasetConf argument
     * @param str4 printed as the second DatasetConf argument and as the prefix of {@code _options}
     * @param i    padding width passed to {@code externalDsId}
     * @param i2   padding width of the quoted {@code str2} cell
     */
    public void printSourceDataset(String str, String str2, String str3, String str4, int i, int i2) {
        final String datasetId = externalDsId(str, str2, i);

        // Quoted table name, right-padded with spaces up to i2 characters.
        final StringBuilder tableCell = new StringBuilder().append("\"").append(str2).append("\"");
        while (tableCell.length() < i2) {
            tableCell.append(' ');
        }

        final String line = "DatasetConf(" + datasetId + ", " + str4 + ",\"\", " + str3
                + ", Read, Some(TableConf(\"" + str + "\", " + tableCell
                + ")), readoptions = " + str4 + "_options),";
        Predef$.MODULE$.println(line);
    }

    /**
     * Default-argument accessor; by its {@code $default$5} name it presumably supplies
     * the fifth parameter of {@code printSourceDataset}.
     *
     * @return 55
     */
    public int printSourceDataset$default$5() {
        final int defaultWidth = 55;
        return defaultWidth;
    }

    /**
     * Default-argument accessor; by its {@code $default$6} name it presumably supplies
     * the sixth parameter of {@code printSourceDataset}.
     *
     * @return 35
     */
    public int printSourceDataset$default$6() {
        final int defaultWidth = 35;
        return defaultWidth;
    }

    /**
     * Prints one {@code DatasetConf(...)} line describing the raw copy of a table.
     *
     * @param str  lower-cased into the path and the table name; also passed to {@code rawDsId}
     * @param str2 converted with camel2Snake for the path and table name; also passed to {@code rawDsId}
     * @param str3 printed as the second DatasetConf argument
     * @param str4 printed as the fourth DatasetConf argument
     * @param str5 printed as the fifth DatasetConf argument
     * @param i    padding width passed to {@code rawDsId}
     * @param i2   padding width of the quoted path cell
     * @param i3   padding width of the quoted table-name cell
     */
    public void printRawDataset(String str, String str2, String str3, String str4, String str5, int i, int i2, int i3) {
        // Locale.ROOT keeps generated paths and table names independent of the JVM default
        // locale (a Turkish default would otherwise map 'I' to a dotless 'ı').
        final String schemaLower = str.toLowerCase(java.util.Locale.ROOT);
        final String datasetId = rawDsId(str, str2, i);

        // Quoted "/schema/table" path, right-padded with spaces up to i2 characters.
        final StringBuilder pathCell = new StringBuilder()
                .append("\"/").append(schemaLower).append("/").append(CamelToSnake$.MODULE$.camel2Snake(str2)).append("\"");
        while (pathCell.length() < i2) {
            pathCell.append(' ');
        }

        // Quoted "schema_table" name, right-padded with spaces up to i3 characters.
        final StringBuilder tableCell = new StringBuilder()
                .append("\"").append(schemaLower).append("_").append(CamelToSnake$.MODULE$.camel2Snake(str2)).append("\"");
        while (tableCell.length() < i3) {
            tableCell.append(' ');
        }

        Predef$.MODULE$.println("DatasetConf(" + datasetId + ", " + str3 + ", " + pathCell + ", " + str4 + ", " + str5
                + ", Some(TableConf(\"raw\", " + tableCell + "))),");
    }

    /**
     * Default-argument accessor; by its {@code $default$3} name it presumably supplies
     * the third parameter of {@code printRawDataset}.
     *
     * @return {@code "red_raw"}
     */
    public String printRawDataset$default$3() {
        final String defaultValue = "red_raw";
        return defaultValue;
    }

    /**
     * Default-argument accessor; by its {@code $default$4} name it presumably supplies
     * the fourth parameter of {@code printRawDataset}.
     *
     * @return {@code "DELTA"}
     */
    public String printRawDataset$default$4() {
        final String defaultValue = "DELTA";
        return defaultValue;
    }

    /**
     * Default-argument accessor; by its {@code $default$5} name it presumably supplies
     * the fifth parameter of {@code printRawDataset}.
     *
     * @return {@code "Insert"}
     */
    public String printRawDataset$default$5() {
        final String defaultValue = "Insert";
        return defaultValue;
    }

    /**
     * Default-argument accessor; by its {@code $default$6} name it presumably supplies
     * the sixth parameter of {@code printRawDataset}.
     *
     * @return 55
     */
    public int printRawDataset$default$6() {
        final int defaultWidth = 55;
        return defaultWidth;
    }

    /**
     * Default-argument accessor; by its {@code $default$7} name it presumably supplies
     * the seventh parameter of {@code printRawDataset}.
     *
     * @return 55
     */
    public int printRawDataset$default$7() {
        final int defaultWidth = 55;
        return defaultWidth;
    }

    /**
     * Default-argument accessor; by its {@code $default$8} name it presumably supplies
     * the eighth parameter of {@code printRawDataset}.
     *
     * @return 35
     */
    public int printRawDataset$default$8() {
        final int defaultWidth = 35;
        return defaultWidth;
    }

    /**
     * Prints one mapping line of the form {@code <externalDsId>-> <rawDsId>,} preceded by a space.
     *
     * @param str  first identifier part, passed to both id builders
     * @param str2 second identifier part, passed to both id builders
     * @param i    padding width passed to both id builders
     */
    public void externalToRawMap(String str, String str2, int i) {
        final String sourceId = externalDsId(str, str2, i);
        final String targetId = rawDsId(str, str2, i);
        Predef$.MODULE$.println(" " + sourceId + "-> " + targetId + ",");
    }

    /**
     * Default-argument accessor; by its {@code $default$3} name it presumably supplies
     * the third parameter of {@code externalToRawMap}.
     *
     * @return 55
     */
    public int externalToRawMap$default$3() {
        final int defaultWidth = 55;
        return defaultWidth;
    }

    /**
     * Profiles a single column and returns a summary dataset with these columns:
     * {@code column_name}, {@code %_top5_values} (list of name/percentage structs for the
     * five most frequent values), {@code distinct_values}, {@code total_values},
     * {@code %_null}, {@code %_non_null} and {@code interpretation}.
     *
     * <p>{@code interpretation} is "PK" when the distinct count equals the total count,
     * "BOOLEAN" when the column has exactly two distinct values and no nulls, or exactly
     * three distinct values and at least one null, and "" otherwise.
     *
     * <p>Three counts are executed eagerly (total, distinct, null) before the summary
     * plan is built.
     *
     * @param dataset dataset containing the column
     * @param str     name of the column to profile
     * @return the summary dataset described above
     */
    public Dataset<Row> analyseColumn(Dataset<Row> dataset, String str) {
        final functions$ fn = functions$.MODULE$;
        final Dataset<Row> values = dataset.select(str, Predef$.MODULE$.wrapRefArray(new String[0]));

        final long totalCount = values.count();
        final long distinctCount = values.distinct().count();
        final long nullCount = values.where(fn.col(str).isNull()).count();

        // Five most frequent values, with nulls rendered as the string "null" and each
        // value's share of the total expressed as a percentage rounded to 3 decimals.
        final Dataset<Row> topValues = values
                .groupBy(str, Predef$.MODULE$.wrapRefArray(new String[0]))
                .count()
                .orderBy(Predef$.MODULE$.wrapRefArray(new Column[]{fn.col("count").desc()}))
                .limit(5)
                .withColumn(str, fn.when(fn.col(str).isNull(), fn.lit("null")).otherwise(fn.col(str).cast(StringType$.MODULE$)))
                .withColumn("%", fn.bround(fn.col("count").$div(fn.lit(BoxesRunTime.boxToLong(totalCount))).$times(BoxesRunTime.boxToInteger(100)), 3))
                .withColumn("column_name", fn.lit(str));

        // Collapse the top values into one row and attach the scalar statistics.
        final Dataset<Row> summary = topValues
                .groupBy("column_name", Predef$.MODULE$.wrapRefArray(new String[0]))
                .agg(fn.collect_list(fn.struct(Predef$.MODULE$.wrapRefArray(new Column[]{fn.col(str).as("name"), fn.col("%")}))).as("%_top5_values"), Predef$.MODULE$.wrapRefArray(new Column[0]))
                .withColumn("distinct_values", fn.lit(BoxesRunTime.boxToLong(distinctCount)))
                .withColumn("total_values", fn.lit(BoxesRunTime.boxToLong(totalCount)))
                .withColumn("%_null", fn.bround(fn.lit(BoxesRunTime.boxToLong(nullCount)).$div(fn.lit(BoxesRunTime.boxToLong(totalCount))).$times(BoxesRunTime.boxToInteger(100)), 3))
                .withColumn("%_non_null", fn.bround(fn.lit(BoxesRunTime.boxToLong(totalCount - nullCount)).$div(fn.lit(BoxesRunTime.boxToLong(totalCount))).$times(BoxesRunTime.boxToInteger(100)), 3));

        final Column looksLikePrimaryKey = fn.col("distinct_values").$eq$eq$eq(fn.col("total_values"));

        // The original chained these as ((A and B) or C) and D, so the "two values, no nulls"
        // case could never match (D requires nulls > 0). Group each alternative explicitly.
        final Column twoValuesNoNulls = fn.lit(BoxesRunTime.boxToLong(distinctCount)).$eq$eq$eq(BoxesRunTime.boxToInteger(2))
                .and(fn.lit(BoxesRunTime.boxToLong(nullCount)).$eq$eq$eq(BoxesRunTime.boxToInteger(0)));
        final Column twoValuesPlusNull = fn.lit(BoxesRunTime.boxToLong(distinctCount)).$eq$eq$eq(BoxesRunTime.boxToInteger(3))
                .and(fn.lit(BoxesRunTime.boxToLong(nullCount)).$greater(BoxesRunTime.boxToInteger(0)));

        return summary.withColumn("interpretation",
                fn.when(looksLikePrimaryKey, fn.lit("PK"))
                        .when(twoValuesNoNulls.or(twoValuesPlusNull), fn.lit("BOOLEAN"))
                        .otherwise(""));
    }

    /**
     * Profiles every column of the dataset with {@code analyseColumn}, unions the
     * per-column results by name, and renders them as ';'-separated text.
     *
     * <p>The first output line is the result's column names joined with ';'; each
     * following line is one collected row, with the top-values list joined by ','.
     * The assembled text is passed through {@code stripMargin}.
     *
     * @param dataset dataset to profile
     * @return the rendered profile text
     */
    public String analyseDf(Dataset<Row> dataset) {
        // The first column seeds the fold below.
        // NOTE(review): head() is called without an emptiness check — presumably fails for a
        // dataset with no columns; confirm.
        String str = (String) new ArrayOps.ofRef(Predef$.MODULE$.refArrayOps(dataset.columns())).head();
        // Fold over the remaining columns (each paired with its index within the tail),
        // unioning every column's analyseColumn result into the accumulator by name.
        Dataset dataset2 = (Dataset) new ArrayOps.ofRef(Predef$.MODULE$.refArrayOps((Object[]) new ArrayOps.ofRef(Predef$.MODULE$.refArrayOps((String[]) new ArrayOps.ofRef(Predef$.MODULE$.refArrayOps(dataset.columns())).tail())).zipWithIndex(Array$.MODULE$.canBuildFrom(ClassTag$.MODULE$.apply(Tuple2.class))))).foldLeft(analyseColumn(dataset, str), (dataset3, tuple2) -> {
            Tuple2 tuple2 = new Tuple2(dataset3, tuple2);
            if (tuple2 != null) {
                Dataset dataset3 = (Dataset) tuple2._1();
                Tuple2 tuple22 = (Tuple2) tuple2._2();
                if (tuple22 != null) {
                    String str2 = (String) tuple22._1();
                    // NOTE(review): the logged position is the index within the tail (starting
                    // at 0, first column excluded) while the total is the full column count —
                    // confirm whether this offset is intended.
                    MODULE$.log().info(new StringBuilder(38).append("ANALYSING [").append(((TraversableOnce) new StringOps(Predef$.MODULE$.augmentString(str2)).padTo(50, " ", Predef$.MODULE$.fallbackStringCanBuildFrom())).mkString()).append("] \t COLUMN ").append(tuple22._2$mcI$sp()).append(" out of ").append(dataset.columns().length).append(" COLUMNS").toString());
                    return dataset3.unionByName(MODULE$.analyseColumn(dataset, str2));
                }
            }
            throw new MatchError(tuple2);
        });
        // Convert the untyped result to Tuple7 rows through a product encoder. The TypeCreator
        // below spells out the element type:
        // Tuple7[String, List[Tuple2[String, Double]], Long, Long, Double, Double, String].
        SparkSession$implicits$ implicits = dataset.sparkSession().implicits();
        TypeTags universe = package$.MODULE$.universe();
        Dataset as = dataset2.as(implicits.newProductEncoder(universe.TypeTag().apply(package$.MODULE$.universe().runtimeMirror(getClass().getClassLoader()), new TypeCreator() { // from class: bio.ferlab.datalake.spark3.utils.DataProfiling$$typecreator4$1
            public <U extends Universe> Types.TypeApi apply(Mirror<U> mirror) {
                Universe universe2 = mirror.universe();
                return universe2.internal().reificationSupport().TypeRef(universe2.internal().reificationSupport().ThisType(mirror.staticPackage("scala").asModule().moduleClass()), mirror.staticClass("scala.Tuple7"), new $colon.colon(universe2.internal().reificationSupport().TypeRef(universe2.internal().reificationSupport().SingleType(mirror.staticPackage("scala").asModule().moduleClass().asType().toTypeConstructor(), mirror.staticModule("scala.Predef")), universe2.internal().reificationSupport().selectType(mirror.staticModule("scala.Predef").asModule().moduleClass(), "String"), Nil$.MODULE$), new $colon.colon(universe2.internal().reificationSupport().TypeRef(universe2.internal().reificationSupport().SingleType(universe2.internal().reificationSupport().SingleType(universe2.internal().reificationSupport().thisPrefix(mirror.RootClass()), mirror.staticPackage("scala")), mirror.staticModule("scala.package")), universe2.internal().reificationSupport().selectType(mirror.staticModule("scala.package").asModule().moduleClass(), "List"), List$.MODULE$.apply(Predef$.MODULE$.wrapRefArray(new Types.TypeApi[]{universe2.internal().reificationSupport().TypeRef(universe2.internal().reificationSupport().ThisType(mirror.staticPackage("scala").asModule().moduleClass()), mirror.staticClass("scala.Tuple2"), List$.MODULE$.apply(Predef$.MODULE$.wrapRefArray(new Types.TypeApi[]{universe2.internal().reificationSupport().TypeRef(universe2.internal().reificationSupport().SingleType(mirror.staticPackage("scala").asModule().moduleClass().asType().toTypeConstructor(), mirror.staticModule("scala.Predef")), universe2.internal().reificationSupport().selectType(mirror.staticModule("scala.Predef").asModule().moduleClass(), "String"), Nil$.MODULE$), mirror.staticClass("scala.Double").asType().toTypeConstructor()})))}))), new $colon.colon(mirror.staticClass("scala.Long").asType().toTypeConstructor(), new $colon.colon(mirror.staticClass("scala.Long").asType().toTypeConstructor(), new 
$colon.colon(mirror.staticClass("scala.Double").asType().toTypeConstructor(), new $colon.colon(mirror.staticClass("scala.Double").asType().toTypeConstructor(), new $colon.colon(universe2.internal().reificationSupport().TypeRef(universe2.internal().reificationSupport().SingleType(mirror.staticPackage("scala").asModule().moduleClass().asType().toTypeConstructor(), mirror.staticModule("scala.Predef")), universe2.internal().reificationSupport().selectType(mirror.staticModule("scala.Predef").asModule().moduleClass(), "String"), Nil$.MODULE$), Nil$.MODULE$))))))));
            }
        })));
        // Render: header of column names joined with ';', then one ';'-joined line per collected
        // row (the list in position 2 is first flattened with ','), lines separated by "\n".
        return new StringOps(Predef$.MODULE$.augmentString(new StringBuilder(18).append(new ArrayOps.ofRef(Predef$.MODULE$.refArrayOps(as.columns())).mkString(";")).append("\n       |").append(new ArrayOps.ofRef(Predef$.MODULE$.refArrayOps((Object[]) new ArrayOps.ofRef(Predef$.MODULE$.refArrayOps((Object[]) new ArrayOps.ofRef(Predef$.MODULE$.refArrayOps((Object[]) as.collect())).map(tuple7 -> {
            if (tuple7 == null) {
                throw new MatchError(tuple7);
            }
            String str2 = (String) tuple7._1();
            List list = (List) tuple7._2();
            long unboxToLong = BoxesRunTime.unboxToLong(tuple7._3());
            long unboxToLong2 = BoxesRunTime.unboxToLong(tuple7._4());
            double unboxToDouble = BoxesRunTime.unboxToDouble(tuple7._5());
            double unboxToDouble2 = BoxesRunTime.unboxToDouble(tuple7._6());
            return new Tuple7(str2, list.mkString(","), BoxesRunTime.boxToLong(unboxToLong), BoxesRunTime.boxToLong(unboxToLong2), BoxesRunTime.boxToDouble(unboxToDouble), BoxesRunTime.boxToDouble(unboxToDouble2), (String) tuple7._7());
        }, Array$.MODULE$.canBuildFrom(ClassTag$.MODULE$.apply(Tuple7.class))))).map(tuple72 -> {
            return tuple72.productIterator().mkString(";");
        }, Array$.MODULE$.canBuildFrom(ClassTag$.MODULE$.apply(String.class))))).mkString("\n")).append("\n       |").toString())).stripMargin();
    }

    /**
     * Prints one comma-separated line for a schema field:
     * {@code name,type,-,normalizedName,type}, followed by {@code ,NormalizeColumnName}
     * when the normalized name differs from the original name.
     *
     * @param structField field to describe
     * @throws MatchError if {@code structField} is null
     */
    public static final /* synthetic */ void $anonfun$printIngestionSpec$1(StructField structField) {
        if (structField == null) {
            throw new MatchError(structField);
        }

        final String rawName = structField.name();
        final DataType fieldType = structField.dataType();
        final String normalizedName = (String) NormalizeColumnName$.MODULE$.normalize().apply(rawName);

        // The type is looked up twice, once per occurrence in the line, as in the original.
        final String baseLine = new StringBuilder(5)
                .append(rawName)
                .append(",")
                .append(ClassGenerator$.MODULE$.getType().apply(fieldType))
                .append(",-,")
                .append(normalizedName)
                .append(",")
                .append(ClassGenerator$.MODULE$.getType().apply(fieldType))
                .toString();

        final boolean nameUnchanged = (normalizedName == null) ? rawName == null : normalizedName.equals(rawName);
        final String line = nameUnchanged
                ? baseLine
                : new StringBuilder(20).append(baseLine).append(",NormalizeColumnName").toString();
        Predef$.MODULE$.println(line);
    }

    private DataProfiling$() {
        MODULE$ = this;
        this.log = LoggerFactory.getLogger(getClass().getCanonicalName());
    }
}
