package com.datastax.data.prepare.spark.dataset;

import com.datastax.insight.annonation.InsightComponent;
import com.datastax.insight.annonation.InsightComponentArg;
import com.datastax.insight.spec.Operator;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/* loaded from: input_file:com/datastax/data/prepare/spark/dataset/OutlierDetectionOperator.class */
/**
 * Insight components that flag outliers in a Spark {@link Dataset}.
 *
 * <p>Each public method validates its arguments, resolves the usable columns through the
 * package-local helper {@code j}, and then delegates the actual detection to that helper.
 * The detection algorithms themselves live in {@code j} and are not visible from this class.
 *
 * <p>All three detectors return the input dataset unchanged (after logging) when none of the
 * selected columns is usable or when the dataset has {@value #MAX_SUPPORTED_ROWS} rows or more.
 */
public class OutlierDetectionOperator implements Operator {
    private static final Logger logger = LoggerFactory.getLogger(OutlierDetectionOperator.class);

    /** Datasets with this many rows or more are returned unchanged instead of being analysed. */
    private static final long MAX_SUPPORTED_ROWS = 10000L;

    /** Separator between column names in the {@code str} argument of every detector. */
    private static final String COLUMN_SEPARATOR = ";";

    /**
     * Distance-based outlier detection (per the component description: points are ranked by
     * their k-nearest-neighbour distance and the top n are reported as outliers).
     *
     * @param dataset input dataset; must not be {@code null}
     * @param str     semicolon-separated names of the columns to use; must not be empty
     * @param i       number of neighbours k; must be positive
     * @param i2      number of outliers n to report; must be positive
     * @param str2    name of the distance method, passed through to the helper unchanged
     * @return the helper's result, or {@code dataset} itself when no selected column is usable
     *         or the dataset is too large
     * @throws com.datastax.data.prepare.util.a if any argument fails validation
     */
    @InsightComponent(name = "离群点检测(距离)", type = "com.datastax.insight.dataprprocess.detectOutlier.distance", description = "通过第k邻近距离判断数据点，取k邻近距离的所有点的平均距离的前n个点作为离群点")
    public static <T> Dataset<T> distanceOutlier(@InsightComponentArg(externalInput = true, name = "数据集", description = "数据集") Dataset<T> dataset, @InsightComponentArg(name = "列名", description = "选择离群点的列名，用分号隔开") String str, @InsightComponentArg(name = "近邻", description = "通过设置近邻点的数量来得到k邻近距离") int i, @InsightComponentArg(name = "离群点数量", description = "设置离群点数量, 取距离前n个点") int i2, @InsightComponentArg(name = "距离方法", description = "距离计算方法", defaultValue = "欧式距离", items = "欧式距离;平方距离;余弦距离;反余弦距离") String str2) {
        // The first two messages previously carried the "densities" prefix copied from the
        // density detector; they now match the rest of this method.
        if (dataset == null) {
            throw new com.datastax.data.prepare.util.a("distances的离群点检测--数据集为空");
        }
        if (i <= 0) {
            throw new com.datastax.data.prepare.util.a("distances的离群点检测--距离某点的第k点距离的k小于或者等于0");
        }
        if (i2 <= 0) {
            throw new com.datastax.data.prepare.util.a("distances的离群点检测--离群点数量n小于或者等于0");
        }
        if (str == null || str.length() == 0) {
            throw new com.datastax.data.prepare.util.a("distances的离群点检测--选择的列名为空");
        }
        String[] columns = j.a(dataset.schema(), str.split(COLUMN_SEPARATOR));
        if (columns.length == 0) {
            logger.info("distances的离群点检测--选中的列中没有可用于离群点计算, 返回原数据集");
            return dataset;
        }
        if (dataset.count() < MAX_SUPPORTED_ROWS) {
            return (Dataset<T>) j.b(dataset.toDF(), columns, i, i2, str2);
        }
        logger.info("数据集大于10000行，暂时不支持于离群点检测，返回原数据集");
        return dataset;
    }

    /**
     * Density-based outlier detection (per the component description: a point's density is the
     * share of all points lying within distance {@code d}; points whose density is below
     * {@code d2} are reported as outliers).
     *
     * @param dataset input dataset; must not be {@code null}
     * @param str     semicolon-separated names of the columns to use; must not be empty
     * @param d       neighbourhood distance; must be strictly positive
     * @param d2      density threshold; must lie strictly between 0 and 1
     * @param str2    name of the distance method, passed through to the helper unchanged
     * @return the helper's result, or {@code dataset} itself when no selected column is usable
     *         or the dataset is too large
     * @throws com.datastax.data.prepare.util.a if any argument fails validation
     */
    @InsightComponent(name = "离群点检测(密度)", type = "com.datastax.insight.dataprprocess.detectOutlier.densities", description = "通过某一点在距离d范围内的点和所有点的比例得出点的密度, 再和设置的概率相比得出离群点")
    public static <T> Dataset<T> densitiesOutlier(@InsightComponentArg(externalInput = true, name = "数据集", description = "数据集") Dataset<T> dataset, @InsightComponentArg(name = "列名", description = "选择离群点的列名，用分号隔开") String str, @InsightComponentArg(name = "距离", description = "距离") double d, @InsightComponentArg(name = "比例", description = "小于该比例的点将设为离群点") double d2, @InsightComponentArg(name = "距离方法", description = "距离计算方法", defaultValue = "欧式距离", items = "欧式距离;平方距离;余弦距离;反余弦距离") String str2) {
        if (dataset == null) {
            throw new com.datastax.data.prepare.util.a("densities离群点检测--数据集为空");
        }
        if (d <= 0.0d) {
            throw new com.datastax.data.prepare.util.a("densities离群点检测--distance值小于等于0");
        }
        if (d2 <= 0.0d || d2 >= 1.0d) {
            // This is a range check, so the message states the accepted range rather than
            // claiming the value is empty.
            throw new com.datastax.data.prepare.util.a("densities离群点检测--proportion值必须大于0且小于1");
        }
        if (str == null || str.length() == 0) {
            throw new com.datastax.data.prepare.util.a("densities离群点检测--选择的列名为空");
        }
        String[] columns = j.a(dataset.schema(), str.split(COLUMN_SEPARATOR));
        if (columns.length == 0) {
            logger.info("densities离群点检测--选中的列中没有可用于离群点计算, 返回原数据集");
            return dataset;
        }
        if (dataset.count() < MAX_SUPPORTED_ROWS) {
            return (Dataset<T>) j.a((Dataset<Row>) dataset.toDF(), columns, d, d2, str2);
        }
        logger.info("数据集大于10000行，暂时不支持于离群点检测，返回原数据集");
        return dataset;
    }

    /**
     * LOF-based outlier detection over a range of neighbourhood sizes k.
     *
     * <p>The two bounds may be given in either order; they are normalised so that the smaller
     * one is used as the lower bound. The upper bound is capped at the number of rows.
     *
     * @param dataset input dataset; must not be {@code null}
     * @param str     semicolon-separated names of the columns to use; must not be empty
     * @param i       lower bound for k; must be positive
     * @param i2      upper bound for k; must be positive
     * @param str2    name of the distance method, passed through to the helper unchanged
     * @return the helper's result, or {@code dataset} itself when no selected column is usable
     *         or the dataset is too large
     * @throws com.datastax.data.prepare.util.a if any argument fails validation or the lower
     *         bound exceeds the number of rows
     */
    @InsightComponent(name = "离群点检测(LOF)", type = "com.datastax.insight.dataprprocess.detectOutlier.lof", description = "通过LOF判断离群点")
    public static <T> Dataset<T> LOFOutlier(@InsightComponentArg(externalInput = true, name = "数据集", description = "数据集") Dataset<T> dataset, @InsightComponentArg(name = "列名", description = "选择离群点的列名，用分号隔开") String str, @InsightComponentArg(name = "下限", description = "第k邻近点的k的下限") int i, @InsightComponentArg(name = "上限", description = "第k邻近点的k的上限") int i2, @InsightComponentArg(name = "距离方法", description = "距离计算方法", defaultValue = "欧式距离", items = "欧式距离;平方距离;余弦距离;反余弦距离") String str2) {
        if (dataset == null) {
            throw new com.datastax.data.prepare.util.a("LOF离群点检测--数据集为空");
        }
        if (i <= 0 || i2 <= 0) {
            throw new com.datastax.data.prepare.util.a("LOF离群点检测--上限或者下限小于或等于0");
        }
        if (str == null || str.length() == 0) {
            throw new com.datastax.data.prepare.util.a("LOF离群点检测--选择的列名为空");
        }
        String[] columns = j.a(dataset.schema(), str.split(COLUMN_SEPARATOR));
        if (columns.length == 0) {
            logger.info("LOF离群点检测--选中的列中没有可用于离群点计算, 返回原数据集");
            return dataset;
        }
        // Normalise the bounds. The previous "swap" assigned i2 = i and then i = i2, which
        // left both equal to the original lower value and discarded the other bound.
        int lower = Math.min(i, i2);
        int upper = Math.max(i, i2);

        // count() triggers a Spark job, so run it once and reuse the result.
        long rowCount = dataset.count();
        if (rowCount < lower) {
            throw new com.datastax.data.prepare.util.a("LOF离群点检测--下限大于数据集的行数");
        }
        // Cap the upper bound at the row count. The previous comparison was inverted and
        // raised the bound to the row count instead. The narrowing cast is safe here because
        // rowCount < upper <= Integer.MAX_VALUE on this path.
        if (upper > rowCount) {
            upper = (int) rowCount;
        }
        if (rowCount < MAX_SUPPORTED_ROWS) {
            return (Dataset<T>) j.a((Dataset<Row>) dataset.toDF(), columns, lower, upper, str2);
        }
        logger.info("数据集大于10000行，暂时不支持于离群点检测，返回原数据集");
        return dataset;
    }

    /**
     * Placeholder hook: returns {@code dataset} unchanged and ignores every other argument.
     *
     * @param dataset dataset to hand back
     * @param str     unused
     * @param i       unused
     * @param i2      unused
     * @return {@code dataset}, untouched
     */
    protected static <T> Dataset<T> a(Dataset<T> dataset, String str, int i, int i2) {
        return dataset;
    }
}
