package com.datastax.data.prepare.spark.math

import com.datastax.data.prepare.spark.dataset.params.OutlierObject
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.linalg.DenseVector
import org.apache.spark.sql.{Dataset, Row, functions}

/**
  * Helpers for converting Spark rows into [[OutlierObject]]s and for simple
  * element-wise arithmetic over `Array[Double]` vectors.
  */
object VectorCorrelation {
  // Name of the column that receives the assembled feature vector.
  final private val vectorResult = "vectorResult"
  // Name of the temporary column holding the generated row id.
  final private val columnId = "columnID"
  // Prefix of the first constructor argument of every OutlierObject; the collected row index is appended.
  final private val labelPrefix = "vector-"

  /**
    * Assembles the given columns of every row into a vector and wraps each
    * row in an [[OutlierObject]].
    *
    * A row id is generated with `monotonically_increasing_id` before rows
    * containing nulls in `cols` are dropped, so ids are assigned over the
    * unfiltered data set. The result is collected to the driver.
    *
    * @param data input data set
    * @param cols names of the columns to assemble into the vector
    * @return one object per remaining row, built from `"vector-" + index`
    *         (index in the collected result), the generated row id and the
    *         assembled values
    */
  def row2OutlierObject(data: Dataset[Row], cols: Array[String]): Seq[OutlierObject] = {
    // Imported locally (and renamed) so the file-level scope is left untouched.
    import org.apache.spark.ml.linalg.{Vector => MLVector}

    val assembler = new VectorAssembler()
      .setInputCols(cols)
      .setOutputCol(vectorResult)
    val prepared = data
      .withColumn(columnId, functions.monotonically_increasing_id())
      .na.drop(cols)
    val rows = assembler.transform(prepared).select(columnId, vectorResult).collect()
    for (i <- rows.indices) yield {
      val row = rows(i)
      // VectorAssembler may emit a SparseVector (e.g. for mostly-zero rows), so read
      // the column as the general Vector type instead of casting to DenseVector.
      new OutlierObject(labelPrefix + i.toString, row.getLong(0), row.getAs[MLVector](1).toArray)
    }
  }

  /**
    * Dot product: the sum of `xs(i) * ys(i)` over the indices of `xs`.
    *
    * @param xs first vector; its length determines the number of terms
    * @param ys second vector; must be at least as long as `xs`
    */
  def dot(xs: Array[Double], ys: Array[Double]): Double = vectorFunc(xs, ys)(_ * _)

  /**
    * Product of the Euclidean norms of `xs` and `ys`.
    *
    * Note that, despite the name, this is not a distance between the two vectors.
    */
  def vectorDist(xs: Array[Double], ys: Array[Double]): Double = vectorSqrtDist(xs) * vectorSqrtDist(ys)

  /** Euclidean norm of `xs`: the square root of the sum of its squared elements. */
  def vectorSqrtDist(xs: Array[Double]): Double = Math.sqrt(dot(xs, xs))

  /**
    * Applies `f` to each pair `(xs(i), ys(i))` and sums the results.
    *
    * Iteration runs over the indices of `xs` only: surplus elements of `ys`
    * are ignored, and a shorter `ys` results in an
    * `ArrayIndexOutOfBoundsException`.
    *
    * @param xs first vector; its length determines the number of terms
    * @param ys second vector
    * @param f  function combining one element of `xs` with the matching element of `ys`
    * @return the sum of all `f(xs(i), ys(i))`, `0.0` when `xs` is empty
    */
  def vectorFunc(xs: Array[Double], ys: Array[Double])(f: (Double, Double) => Double): Double = {
    val n = xs.length
    var i = 0
    var sum = 0.0
    // Plain while loop over primitive arrays: no boxing and no intermediate collections.
    while (i < n) {
      sum += f(xs(i), ys(i))
      i += 1
    }
    sum
  }

  /** Ad-hoc check: prints the Euclidean distance between two hard-coded 2-D points. */
  def main(args: Array[String]): Unit = {
    val xs = Array(3.9127551591, 7.6612839183)
    val ys = Array(3.7488127414, 7.8150589371)
    val result = vectorFunc(xs, ys)((x: Double, y: Double) => Math.pow(y - x, 2))
    println(Math.sqrt(result))
  }

}
