package com.omega.engine.ad.op.gpu;

import com.omega.common.data.Tensor;
import com.omega.engine.ad.op.TensorOP;
import com.omega.engine.gpu.CUDAMemoryManager;
import com.omega.engine.gpu.CUDAModules;
import java.io.Serializable;
import jcuda.NativePointerObject;
import jcuda.Pointer;
import jcuda.driver.CUfunction;
import jcuda.driver.CUstream;
import jcuda.driver.JCudaDriver;
import jcuda.runtime.JCuda;
import jcuda.runtime.cudaError;

/* loaded from: input_file:com/omega/engine/ad/op/gpu/OPKernel.class */
public class OPKernel implements Serializable {
    private static final long serialVersionUID = 3345793649705471080L;
    // Shared instance, created on first call to getInstance().
    private static OPKernel kernel = null;
    public int N = 0;
    // Passed as blockDimX (threads per block) to the kernel launches in this class.
    private int CAFFE_CUDA_NUM_THREADS = 1024;
    // Kernel handles. Each one is looked up by kernel name in the "OPKernel.cu" module
    // while the instance's field initializers run.
    private CUfunction fill_gpu_function = CUDAModules.getLocalFunctionByModule("OPKernel.cu", "fill_kernel");
    private CUfunction axpy_gpu_function = CUDAModules.getLocalFunctionByModule("OPKernel.cu", "axpy_kernel");
    private CUfunction copy_gpu_function = CUDAModules.getLocalFunctionByModule("OPKernel.cu", "copy_kernel");
    private CUfunction copy_number_gpu_function = CUDAModules.getLocalFunctionByModule("OPKernel.cu", "copy_number_kernel");
    private CUfunction copy_channel_gpu_function = CUDAModules.getLocalFunctionByModule("OPKernel.cu", "copy_channel_kernel");
    private CUfunction add_gpu_function = CUDAModules.getLocalFunctionByModule("OPKernel.cu", "add_kernel");
    private CUfunction add_scalar_gpu_function = CUDAModules.getLocalFunctionByModule("OPKernel.cu", "add_scalar_kernel");
    private CUfunction add_number_gpu_function = CUDAModules.getLocalFunctionByModule("OPKernel.cu", "add_number_kernel");
    private CUfunction add_channel_gpu_function = CUDAModules.getLocalFunctionByModule("OPKernel.cu", "add_channel_kernel");
    private CUfunction sub_gpu_function = CUDAModules.getLocalFunctionByModule("OPKernel.cu", "sub_kernel");
    private CUfunction sub_axis_gpu_function = CUDAModules.getLocalFunctionByModule("OPKernel.cu", "sub_axis_kernel");
    private CUfunction sub_scalar_gpu_function = CUDAModules.getLocalFunctionByModule("OPKernel.cu", "sub_scalar_kernel");
    private CUfunction scalar_sub_gpu_function = CUDAModules.getLocalFunctionByModule("OPKernel.cu", "scalar_sub_kernel");
    private CUfunction mul_gpu_function = CUDAModules.getLocalFunctionByModule("OPKernel.cu", "mul_kernel");
    private CUfunction mul_scalar_gpu_function = CUDAModules.getLocalFunctionByModule("OPKernel.cu", "mul_scalar_kernel");
    private CUfunction mul_plus_gpu_function = CUDAModules.getLocalFunctionByModule("OPKernel.cu", "mul_plus_kernel");
    private CUfunction mul_plus_scalar_gpu_function = CUDAModules.getLocalFunctionByModule("OPKernel.cu", "mul_plus_scalar_kernel");
    private CUfunction mul_plus_scalar_axis_gpu_function = CUDAModules.getLocalFunctionByModule("OPKernel.cu", "mul_plus_scalar_axis_kernel");
    private CUfunction div_gpu_function = CUDAModules.getLocalFunctionByModule("OPKernel.cu", "div_kernel");
    private CUfunction div_axis_gpu_function = CUDAModules.getLocalFunctionByModule("OPKernel.cu", "div_axis_kernel");
    private CUfunction div_scalar_gpu_function = CUDAModules.getLocalFunctionByModule("OPKernel.cu", "div_scalar_kernel");
    private CUfunction scalar_div_gpu_function = CUDAModules.getLocalFunctionByModule("OPKernel.cu", "scalar_div_kernel");
    private CUfunction div_plus_gpu_function = CUDAModules.getLocalFunctionByModule("OPKernel.cu", "div_plus_kernel");
    private CUfunction div_plus_axis_gpu_function = CUDAModules.getLocalFunctionByModule("OPKernel.cu", "div_plus_axis_kernel");
    private CUfunction div_plus_scalar_gpu_function = CUDAModules.getLocalFunctionByModule("OPKernel.cu", "div_plus_scalar_kernel");
    private CUfunction scalar_plus_div_gpu_function = CUDAModules.getLocalFunctionByModule("OPKernel.cu", "scalar_plus_div_kernel");
    private CUfunction div_bGrad_gpu_function = CUDAModules.getLocalFunctionByModule("OPKernel.cu", "div_bGrad_kernel");
    private CUfunction div_bGrad_axis_gpu_function = CUDAModules.getLocalFunctionByModule("OPKernel.cu", "div_bGrad_axis_kernel");
    private CUfunction div_scalar_bGrad_gpu_function = CUDAModules.getLocalFunctionByModule("OPKernel.cu", "div_scalar_bGrad_kernel");
    private CUfunction pow_gpu_function = CUDAModules.getLocalFunctionByModule("OPKernel.cu", "pow_kernel");
    private CUfunction log_gpu_function = CUDAModules.getLocalFunctionByModule("OPKernel.cu", "log_kernel");
    private CUfunction exp_gpu_function = CUDAModules.getLocalFunctionByModule("OPKernel.cu", "exp_kernel");
    private CUfunction sin_gpu_function = CUDAModules.getLocalFunctionByModule("OPKernel.cu", "sin_kernel");
    private CUfunction cos_gpu_function = CUDAModules.getLocalFunctionByModule("OPKernel.cu", "cos_kernel");
    private CUfunction tan_gpu_function = CUDAModules.getLocalFunctionByModule("OPKernel.cu", "tan_kernel");
    private CUfunction atan_gpu_function = CUDAModules.getLocalFunctionByModule("OPKernel.cu", "atan_kernel");
    private CUfunction tan_back_gpu_function = CUDAModules.getLocalFunctionByModule("OPKernel.cu", "tan_back_kernel");
    private CUfunction atan_back_gpu_function = CUDAModules.getLocalFunctionByModule("OPKernel.cu", "atan_back_kernel");
    private CUfunction sum_gpu_function = CUDAModules.getLocalFunctionByModule("OPKernel.cu", "sum_kernel");
    private CUfunction sum_channel_gpu_function = CUDAModules.getLocalFunctionByModule("OPKernel.cu", "sum_channel_kernel");
    private CUfunction max_gpu_function = CUDAModules.getLocalFunctionByModule("OPKernel.cu", "max_kernel");
    private CUfunction max_channel_gpu_function = CUDAModules.getLocalFunctionByModule("OPKernel.cu", "max_channel_kernel");
    private CUfunction max_backward_gpu_function = CUDAModules.getLocalFunctionByModule("OPKernel.cu", "max_backward_kernel");
    private CUfunction max_channel_backward_gpu_function = CUDAModules.getLocalFunctionByModule("OPKernel.cu", "max_channel_backward_kernel");
    private CUfunction broadcast_gpu_function = CUDAModules.getLocalFunctionByModule("OPKernel.cu", "broadcast_kernel");
    // NOTE(review): field is named "channel" but binds "broadcast_number_kernel" - confirm this is intended.
    private CUfunction broadcast_channel_gpu_function = CUDAModules.getLocalFunctionByModule("OPKernel.cu", "broadcast_number_kernel");
    private CUfunction broadcast_plus_gpu_function = CUDAModules.getLocalFunctionByModule("OPKernel.cu", "broadcast_plus_kernel");
    // NOTE(review): field is named "channel" but binds "broadcast_number_plus_kernel" - confirm this is intended.
    private CUfunction broadcast_channel_plus_gpu_function = CUDAModules.getLocalFunctionByModule("OPKernel.cu", "broadcast_number_plus_kernel");
    private CUfunction clamp_gpu_function = CUDAModules.getLocalFunctionByModule("OPKernel.cu", "clamp_kernel");
    private CUfunction clamp_back_gpu_function = CUDAModules.getLocalFunctionByModule("OPKernel.cu", "clamp_back_kernel");
    private CUfunction maximum_gpu_function = CUDAModules.getLocalFunctionByModule("OPKernel.cu", "maximum_kernel");
    private CUfunction minimum_gpu_function = CUDAModules.getLocalFunctionByModule("OPKernel.cu", "minimum_kernel");
    private CUfunction maximum_back_gpu_function = CUDAModules.getLocalFunctionByModule("OPKernel.cu", "maximum_back_kernel");
    private CUfunction minimum_back_gpu_function = CUDAModules.getLocalFunctionByModule("OPKernel.cu", "minimum_back_kernel");
    private CUfunction transpose_gpu_function = CUDAModules.getLocalFunctionByModule("OPKernel.cu", "transpose_kernel");
    private CUfunction permute_gpu_function = CUDAModules.getLocalFunctionByModule("OPKernel.cu", "permute_kernel");
    private CUfunction sqrt_gpu_function = CUDAModules.getLocalFunctionByModule("OPKernel.cu", "sqrt_kernel");
    private CUfunction bool_gpu_function = CUDAModules.getLocalFunctionByModule("OPKernel.cu", "bool_kernel");

    /**
     * Returns the shared {@code OPKernel}, creating it on the first call.
     *
     * <p>No synchronization is performed here.
     *
     * @return the shared instance held in the static {@code kernel} field
     */
    public static OPKernel getInstance() {
        if (kernel != null) {
            return kernel;
        }
        kernel = new OPKernel();
        return kernel;
    }

    /**
     * Launches {@code fill_kernel} with (element count, {@code f}, device pointer of {@code tensor}).
     *
     * <p>The grid is sized by {@code CAFFE_GET_BLOCKS(tensor.getDataLength())}. Any exception is
     * caught and its stack trace printed; nothing is rethrown.
     *
     * @param tensor tensor whose device buffer is passed to the kernel
     * @param f      scalar passed to the kernel
     */
    public void fill_gpu(Tensor tensor, float f) {
        try {
            int count = tensor.getDataLength();
            int blocks = CAFFE_GET_BLOCKS(count);
            Pointer args = Pointer.to(
                    Pointer.to(new int[]{count}),
                    Pointer.to(new float[]{f}),
                    Pointer.to(new NativePointerObject[]{tensor.getGpuData()}));
            checkCUDA(JCudaDriver.cuLaunchKernel(
                    this.fill_gpu_function,
                    blocks, 1, 1,
                    this.CAFFE_CUDA_NUM_THREADS, 1, 1,
                    0, (CUstream) null,
                    args, (Pointer) null));
        } catch (Exception ex) {
            ex.printStackTrace();
        }
    }

    /**
     * Launches {@code copy_kernel} with (count, {@code tensor} pointer, {@code i},
     * {@code tensor2} pointer, 0).
     *
     * <p>Both the count and the grid size come from {@code tensor2.getDataLength()}. Exceptions
     * are caught and printed, not rethrown.
     *
     * @param tensor  tensor passed as the first pointer argument
     * @param tensor2 tensor passed as the second pointer argument
     * @param i       integer passed after the first pointer
     */
    public void copy_gpu(Tensor tensor, Tensor tensor2, int i) {
        try {
            int count = tensor2.getDataLength();
            int blocks = CAFFE_GET_BLOCKS(count);
            Pointer args = Pointer.to(
                    Pointer.to(new int[]{count}),
                    Pointer.to(new NativePointerObject[]{tensor.getGpuData()}),
                    Pointer.to(new int[]{i}),
                    Pointer.to(new NativePointerObject[]{tensor2.getGpuData()}),
                    Pointer.to(new int[]{0}));
            checkCUDA(JCudaDriver.cuLaunchKernel(
                    this.copy_gpu_function,
                    blocks, 1, 1,
                    this.CAFFE_CUDA_NUM_THREADS, 1, 1,
                    0, (CUstream) null,
                    args, (Pointer) null));
        } catch (Exception ex) {
            ex.printStackTrace();
        }
    }

    /**
     * Launches {@code copy_kernel} with (count, {@code tensor} pointer, {@code i},
     * {@code tensor2} pointer, {@code i2}).
     *
     * <p>The count passed to the kernel is {@code tensor.getDataLength()}, while the grid is
     * sized from {@code tensor2.getDataLength()}. Exceptions are caught and printed, not rethrown.
     *
     * @param tensor  tensor passed as the first pointer argument
     * @param tensor2 tensor passed as the second pointer argument
     * @param i       integer passed after the first pointer
     * @param i2      integer passed after the second pointer
     */
    public void copy_gpu(Tensor tensor, Tensor tensor2, int i, int i2) {
        try {
            int blocks = CAFFE_GET_BLOCKS(tensor2.getDataLength());
            int count = tensor.getDataLength();
            Pointer args = Pointer.to(
                    Pointer.to(new int[]{count}),
                    Pointer.to(new NativePointerObject[]{tensor.getGpuData()}),
                    Pointer.to(new int[]{i}),
                    Pointer.to(new NativePointerObject[]{tensor2.getGpuData()}),
                    Pointer.to(new int[]{i2}));
            checkCUDA(JCudaDriver.cuLaunchKernel(
                    this.copy_gpu_function,
                    blocks, 1, 1,
                    this.CAFFE_CUDA_NUM_THREADS, 1, 1,
                    0, (CUstream) null,
                    args, (Pointer) null));
        } catch (Exception ex) {
            ex.printStackTrace();
        }
    }

    /**
     * Launches {@code copy_number_kernel} with (count, {@code tensor} pointer, {@code tensor2}
     * pointer, {@code tensor}'s number/channel/height/width, {@code i}, {@code i2}).
     *
     * <p>Count and grid size both come from {@code tensor2.getDataLength()}. Exceptions are
     * caught and printed, not rethrown.
     *
     * @param tensor  tensor supplying the first pointer and the four dimension values
     * @param tensor2 tensor supplying the second pointer and the element count
     * @param i       first trailing integer argument
     * @param i2      second trailing integer argument
     */
    public void copy_number_gpu(Tensor tensor, Tensor tensor2, int i, int i2) {
        try {
            int count = tensor2.getDataLength();
            int blocks = CAFFE_GET_BLOCKS(count);
            Pointer args = Pointer.to(
                    Pointer.to(new int[]{count}),
                    Pointer.to(new NativePointerObject[]{tensor.getGpuData()}),
                    Pointer.to(new NativePointerObject[]{tensor2.getGpuData()}),
                    Pointer.to(new int[]{tensor.number}),
                    Pointer.to(new int[]{tensor.channel}),
                    Pointer.to(new int[]{tensor.height}),
                    Pointer.to(new int[]{tensor.width}),
                    Pointer.to(new int[]{i}),
                    Pointer.to(new int[]{i2}));
            checkCUDA(JCudaDriver.cuLaunchKernel(
                    this.copy_number_gpu_function,
                    blocks, 1, 1,
                    this.CAFFE_CUDA_NUM_THREADS, 1, 1,
                    0, (CUstream) null,
                    args, (Pointer) null));
        } catch (Exception ex) {
            ex.printStackTrace();
        }
    }

    /**
     * Launches {@code copy_channel_kernel} with (count, {@code tensor} pointer, {@code tensor2}
     * pointer, {@code tensor}'s number/channel/height/width, {@code i}, {@code i2}).
     *
     * <p>Count and grid size both come from {@code tensor2.getDataLength()}. Exceptions are
     * caught and printed, not rethrown.
     *
     * @param tensor  tensor supplying the first pointer and the four dimension values
     * @param tensor2 tensor supplying the second pointer and the element count
     * @param i       first trailing integer argument
     * @param i2      second trailing integer argument
     */
    public void copy_channel_gpu(Tensor tensor, Tensor tensor2, int i, int i2) {
        try {
            int count = tensor2.getDataLength();
            int blocks = CAFFE_GET_BLOCKS(count);
            Pointer args = Pointer.to(
                    Pointer.to(new int[]{count}),
                    Pointer.to(new NativePointerObject[]{tensor.getGpuData()}),
                    Pointer.to(new NativePointerObject[]{tensor2.getGpuData()}),
                    Pointer.to(new int[]{tensor.number}),
                    Pointer.to(new int[]{tensor.channel}),
                    Pointer.to(new int[]{tensor.height}),
                    Pointer.to(new int[]{tensor.width}),
                    Pointer.to(new int[]{i}),
                    Pointer.to(new int[]{i2}));
            checkCUDA(JCudaDriver.cuLaunchKernel(
                    this.copy_channel_gpu_function,
                    blocks, 1, 1,
                    this.CAFFE_CUDA_NUM_THREADS, 1, 1,
                    0, (CUstream) null,
                    args, (Pointer) null));
        } catch (Exception ex) {
            ex.printStackTrace();
        }
    }

    /**
     * Launches {@code add_kernel} with (count, {@code tensor} pointer, {@code tensor2} pointer,
     * {@code tensor3} pointer).
     *
     * <p>Count and grid size come from {@code tensor3.getDataLength()}. Exceptions are caught
     * and printed, not rethrown.
     *
     * @param tensor  first pointer argument
     * @param tensor2 second pointer argument
     * @param tensor3 third pointer argument; also supplies the element count
     */
    public void add_gpu(Tensor tensor, Tensor tensor2, Tensor tensor3) {
        try {
            int count = tensor3.getDataLength();
            int blocks = CAFFE_GET_BLOCKS(count);
            Pointer args = Pointer.to(
                    Pointer.to(new int[]{count}),
                    Pointer.to(new NativePointerObject[]{tensor.getGpuData()}),
                    Pointer.to(new NativePointerObject[]{tensor2.getGpuData()}),
                    Pointer.to(new NativePointerObject[]{tensor3.getGpuData()}));
            checkCUDA(JCudaDriver.cuLaunchKernel(
                    this.add_gpu_function,
                    blocks, 1, 1,
                    this.CAFFE_CUDA_NUM_THREADS, 1, 1,
                    0, (CUstream) null,
                    args, (Pointer) null));
        } catch (Exception ex) {
            ex.printStackTrace();
        }
    }

    /**
     * Launches {@code add_kernel} with (count, {@code tensor} pointer, {@code tensor2} pointer,
     * {@code tensor3} pointer).
     *
     * <p>Count and grid size come from {@code tensor3.getDataLength()}. The {@code i} argument
     * is accepted but not read by this method. Exceptions are caught and printed, not rethrown.
     *
     * @param tensor  first pointer argument
     * @param tensor2 second pointer argument
     * @param tensor3 third pointer argument; also supplies the element count
     * @param i       unused here
     */
    public void add_gpu(Tensor tensor, Tensor tensor2, Tensor tensor3, int i) {
        try {
            int count = tensor3.getDataLength();
            int blocks = CAFFE_GET_BLOCKS(count);
            Pointer args = Pointer.to(
                    Pointer.to(new int[]{count}),
                    Pointer.to(new NativePointerObject[]{tensor.getGpuData()}),
                    Pointer.to(new NativePointerObject[]{tensor2.getGpuData()}),
                    Pointer.to(new NativePointerObject[]{tensor3.getGpuData()}));
            checkCUDA(JCudaDriver.cuLaunchKernel(
                    this.add_gpu_function,
                    blocks, 1, 1,
                    this.CAFFE_CUDA_NUM_THREADS, 1, 1,
                    0, (CUstream) null,
                    args, (Pointer) null));
        } catch (Exception ex) {
            ex.printStackTrace();
        }
    }

    /**
     * Launches {@code add_kernel} over {@code i2} elements, with all three device pointers
     * advanced by {@code i} elements (4 bytes each).
     *
     * <p>The byte offset is computed in {@code long} arithmetic so that element offsets of
     * 2^29 or more do not wrap around before being handed to {@code withByteOffset}.
     * Exceptions are caught and printed, not rethrown.
     *
     * @param tensor  first pointer argument
     * @param tensor2 second pointer argument
     * @param tensor3 third pointer argument
     * @param i       element offset applied to all three pointers
     * @param i2      element count passed to the kernel and used to size the grid
     */
    public void add_gpu(Tensor tensor, Tensor tensor2, Tensor tensor3, int i, int i2) {
        try {
            int blocks = CAFFE_GET_BLOCKS(i2);
            // Widen before multiplying: "i * 4" in int arithmetic overflows for large offsets.
            long byteOffset = i * 4L;
            Pointer args = Pointer.to(
                    Pointer.to(new int[]{i2}),
                    Pointer.to(new NativePointerObject[]{tensor.getGpuData().withByteOffset(byteOffset)}),
                    Pointer.to(new NativePointerObject[]{tensor2.getGpuData().withByteOffset(byteOffset)}),
                    Pointer.to(new NativePointerObject[]{tensor3.getGpuData().withByteOffset(byteOffset)}));
            checkCUDA(JCudaDriver.cuLaunchKernel(
                    this.add_gpu_function,
                    blocks, 1, 1,
                    this.CAFFE_CUDA_NUM_THREADS, 1, 1,
                    0, (CUstream) null,
                    args, (Pointer) null));
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Launches {@code add_kernel} over {@code i4} elements, with each device pointer advanced
     * by its own element offset (4 bytes per element).
     *
     * <p>Byte offsets are computed in {@code long} arithmetic so that element offsets of 2^29
     * or more do not wrap around before being handed to {@code withByteOffset}. Exceptions are
     * caught and printed, not rethrown.
     *
     * @param tensor  first pointer argument
     * @param tensor2 second pointer argument
     * @param tensor3 third pointer argument
     * @param i       element offset into {@code tensor}
     * @param i2      element offset into {@code tensor2}
     * @param i3      element offset into {@code tensor3}
     * @param i4      element count passed to the kernel and used to size the grid
     */
    public void add_gpu(Tensor tensor, Tensor tensor2, Tensor tensor3, int i, int i2, int i3, int i4) {
        try {
            int blocks = CAFFE_GET_BLOCKS(i4);
            // Widen before multiplying: "offset * 4" in int arithmetic overflows for large offsets.
            Pointer args = Pointer.to(
                    Pointer.to(new int[]{i4}),
                    Pointer.to(new NativePointerObject[]{tensor.getGpuData().withByteOffset(i * 4L)}),
                    Pointer.to(new NativePointerObject[]{tensor2.getGpuData().withByteOffset(i2 * 4L)}),
                    Pointer.to(new NativePointerObject[]{tensor3.getGpuData().withByteOffset(i3 * 4L)}));
            checkCUDA(JCudaDriver.cuLaunchKernel(
                    this.add_gpu_function,
                    blocks, 1, 1,
                    this.CAFFE_CUDA_NUM_THREADS, 1, 1,
                    0, (CUstream) null,
                    args, (Pointer) null));
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Launches {@code axpy_kernel} with (count, {@code tensor} pointer, {@code i},
     * {@code tensor2} pointer, {@code i2}).
     *
     * <p>The count passed to the kernel is {@code tensor.getDataLength()}, while the grid is
     * sized from {@code tensor2.getDataLength()}. Exceptions are caught and printed, not rethrown.
     *
     * @param tensor  tensor passed as the first pointer argument
     * @param tensor2 tensor passed as the second pointer argument
     * @param i       integer passed after the first pointer
     * @param i2      integer passed after the second pointer
     */
    public void axpy_gpu(Tensor tensor, Tensor tensor2, int i, int i2) {
        try {
            int blocks = CAFFE_GET_BLOCKS(tensor2.getDataLength());
            int count = tensor.getDataLength();
            Pointer args = Pointer.to(
                    Pointer.to(new int[]{count}),
                    Pointer.to(new NativePointerObject[]{tensor.getGpuData()}),
                    Pointer.to(new int[]{i}),
                    Pointer.to(new NativePointerObject[]{tensor2.getGpuData()}),
                    Pointer.to(new int[]{i2}));
            checkCUDA(JCudaDriver.cuLaunchKernel(
                    this.axpy_gpu_function,
                    blocks, 1, 1,
                    this.CAFFE_CUDA_NUM_THREADS, 1, 1,
                    0, (CUstream) null,
                    args, (Pointer) null));
        } catch (Exception ex) {
            ex.printStackTrace();
        }
    }

    /**
     * Launches {@code add_scalar_kernel} with (count, {@code tensor} pointer, {@code f},
     * {@code tensor2} pointer).
     *
     * <p>Count and grid size come from {@code tensor2.getDataLength()}. Exceptions are caught
     * and printed, not rethrown.
     *
     * @param tensor  first pointer argument
     * @param f       scalar argument
     * @param tensor2 second pointer argument; also supplies the element count
     */
    public void add_scalar_gpu(Tensor tensor, float f, Tensor tensor2) {
        try {
            int count = tensor2.getDataLength();
            int blocks = CAFFE_GET_BLOCKS(count);
            Pointer args = Pointer.to(
                    Pointer.to(new int[]{count}),
                    Pointer.to(new NativePointerObject[]{tensor.getGpuData()}),
                    Pointer.to(new float[]{f}),
                    Pointer.to(new NativePointerObject[]{tensor2.getGpuData()}));
            checkCUDA(JCudaDriver.cuLaunchKernel(
                    this.add_scalar_gpu_function,
                    blocks, 1, 1,
                    this.CAFFE_CUDA_NUM_THREADS, 1, 1,
                    0, (CUstream) null,
                    args, (Pointer) null));
        } catch (Exception ex) {
            ex.printStackTrace();
        }
    }

    /**
     * Launches {@code add_number_kernel} with (count, {@code tensor} pointer, {@code tensor2}
     * pointer, {@code tensor}'s number/channel/height/width, {@code i}).
     *
     * <p>Count and grid size come from {@code tensor2.getDataLength()}. Exceptions are caught
     * and printed, not rethrown.
     *
     * @param tensor  tensor supplying the first pointer and the four dimension values
     * @param tensor2 tensor supplying the second pointer and the element count
     * @param i       trailing integer argument
     */
    public void add_number_gpu(Tensor tensor, Tensor tensor2, int i) {
        try {
            int count = tensor2.getDataLength();
            int blocks = CAFFE_GET_BLOCKS(count);
            Pointer args = Pointer.to(
                    Pointer.to(new int[]{count}),
                    Pointer.to(new NativePointerObject[]{tensor.getGpuData()}),
                    Pointer.to(new NativePointerObject[]{tensor2.getGpuData()}),
                    Pointer.to(new int[]{tensor.number}),
                    Pointer.to(new int[]{tensor.channel}),
                    Pointer.to(new int[]{tensor.height}),
                    Pointer.to(new int[]{tensor.width}),
                    Pointer.to(new int[]{i}));
            checkCUDA(JCudaDriver.cuLaunchKernel(
                    this.add_number_gpu_function,
                    blocks, 1, 1,
                    this.CAFFE_CUDA_NUM_THREADS, 1, 1,
                    0, (CUstream) null,
                    args, (Pointer) null));
        } catch (Exception ex) {
            ex.printStackTrace();
        }
    }

    /**
     * Launches {@code add_channel_kernel} with (count, {@code tensor} pointer, {@code tensor2}
     * pointer, {@code tensor}'s number/channel/height/width, {@code i}).
     *
     * <p>Count and grid size come from {@code tensor2.getDataLength()}. Exceptions are caught
     * and printed, not rethrown.
     *
     * @param tensor  tensor supplying the first pointer and the four dimension values
     * @param tensor2 tensor supplying the second pointer and the element count
     * @param i       trailing integer argument
     */
    public void add_channel_gpu(Tensor tensor, Tensor tensor2, int i) {
        try {
            int count = tensor2.getDataLength();
            int blocks = CAFFE_GET_BLOCKS(count);
            Pointer args = Pointer.to(
                    Pointer.to(new int[]{count}),
                    Pointer.to(new NativePointerObject[]{tensor.getGpuData()}),
                    Pointer.to(new NativePointerObject[]{tensor2.getGpuData()}),
                    Pointer.to(new int[]{tensor.number}),
                    Pointer.to(new int[]{tensor.channel}),
                    Pointer.to(new int[]{tensor.height}),
                    Pointer.to(new int[]{tensor.width}),
                    Pointer.to(new int[]{i}));
            checkCUDA(JCudaDriver.cuLaunchKernel(
                    this.add_channel_gpu_function,
                    blocks, 1, 1,
                    this.CAFFE_CUDA_NUM_THREADS, 1, 1,
                    0, (CUstream) null,
                    args, (Pointer) null));
        } catch (Exception ex) {
            ex.printStackTrace();
        }
    }

    /**
     * Launches {@code sub_kernel} with (count, {@code tensor} pointer, {@code tensor2} pointer,
     * {@code tensor3} pointer).
     *
     * <p>Count and grid size come from {@code tensor3.getDataLength()}. Exceptions are caught
     * and printed, not rethrown.
     *
     * @param tensor  first pointer argument
     * @param tensor2 second pointer argument
     * @param tensor3 third pointer argument; also supplies the element count
     */
    public void sub_gpu(Tensor tensor, Tensor tensor2, Tensor tensor3) {
        try {
            int count = tensor3.getDataLength();
            int blocks = CAFFE_GET_BLOCKS(count);
            Pointer args = Pointer.to(
                    Pointer.to(new int[]{count}),
                    Pointer.to(new NativePointerObject[]{tensor.getGpuData()}),
                    Pointer.to(new NativePointerObject[]{tensor2.getGpuData()}),
                    Pointer.to(new NativePointerObject[]{tensor3.getGpuData()}));
            checkCUDA(JCudaDriver.cuLaunchKernel(
                    this.sub_gpu_function,
                    blocks, 1, 1,
                    this.CAFFE_CUDA_NUM_THREADS, 1, 1,
                    0, (CUstream) null,
                    args, (Pointer) null));
        } catch (Exception ex) {
            ex.printStackTrace();
        }
    }

    /**
     * Launches {@code sub_axis_kernel} with (count, {@code tensor} pointer, {@code tensor2}
     * pointer, {@code tensor3} pointer, stride).
     *
     * <p>The stride is {@code channel * height * width} of {@code tensor} when {@code i == 0},
     * {@code height * width} when {@code i == 1}, and 0 for any other value. Count and grid
     * size come from {@code tensor.getDataLength()}. Exceptions are caught and printed, not
     * rethrown.
     *
     * @param tensor  first pointer argument; supplies the count and the stride dimensions
     * @param tensor2 second pointer argument
     * @param tensor3 third pointer argument
     * @param i       axis selector (0 or 1)
     */
    public void sub_gpu(Tensor tensor, Tensor tensor2, Tensor tensor3, int i) {
        try {
            int stride = 0;
            if (i == 0) {
                stride = tensor.channel * tensor.height * tensor.width;
            } else if (i == 1) {
                stride = tensor.height * tensor.width;
            }
            int count = tensor.getDataLength();
            int blocks = CAFFE_GET_BLOCKS(count);
            Pointer args = Pointer.to(
                    Pointer.to(new int[]{count}),
                    Pointer.to(new NativePointerObject[]{tensor.getGpuData()}),
                    Pointer.to(new NativePointerObject[]{tensor2.getGpuData()}),
                    Pointer.to(new NativePointerObject[]{tensor3.getGpuData()}),
                    Pointer.to(new int[]{stride}));
            checkCUDA(JCudaDriver.cuLaunchKernel(
                    this.sub_axis_gpu_function,
                    blocks, 1, 1,
                    this.CAFFE_CUDA_NUM_THREADS, 1, 1,
                    0, (CUstream) null,
                    args, (Pointer) null));
        } catch (Exception ex) {
            ex.printStackTrace();
        }
    }

    /**
     * Launches {@code sub_kernel} over {@code i2} elements, with all three device pointers
     * advanced by {@code i} elements (4 bytes each).
     *
     * <p>The byte offset is computed in {@code long} arithmetic so that element offsets of
     * 2^29 or more do not wrap around before being handed to {@code withByteOffset}.
     * Exceptions are caught and printed, not rethrown.
     *
     * @param tensor  first pointer argument
     * @param tensor2 second pointer argument
     * @param tensor3 third pointer argument
     * @param i       element offset applied to all three pointers
     * @param i2      element count passed to the kernel and used to size the grid
     */
    public void sub_gpu(Tensor tensor, Tensor tensor2, Tensor tensor3, int i, int i2) {
        try {
            int blocks = CAFFE_GET_BLOCKS(i2);
            // Widen before multiplying: "i * 4" in int arithmetic overflows for large offsets.
            long byteOffset = i * 4L;
            Pointer args = Pointer.to(
                    Pointer.to(new int[]{i2}),
                    Pointer.to(new NativePointerObject[]{tensor.getGpuData().withByteOffset(byteOffset)}),
                    Pointer.to(new NativePointerObject[]{tensor2.getGpuData().withByteOffset(byteOffset)}),
                    Pointer.to(new NativePointerObject[]{tensor3.getGpuData().withByteOffset(byteOffset)}));
            checkCUDA(JCudaDriver.cuLaunchKernel(
                    this.sub_gpu_function,
                    blocks, 1, 1,
                    this.CAFFE_CUDA_NUM_THREADS, 1, 1,
                    0, (CUstream) null,
                    args, (Pointer) null));
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Launches {@code sub_scalar_kernel} with (count, {@code tensor} pointer, {@code f},
     * {@code tensor2} pointer).
     *
     * <p>Count and grid size come from {@code tensor2.getDataLength()}. Exceptions are caught
     * and printed, not rethrown.
     *
     * @param tensor  first pointer argument
     * @param f       scalar argument
     * @param tensor2 second pointer argument; also supplies the element count
     */
    public void sub_scalar_gpu(Tensor tensor, float f, Tensor tensor2) {
        try {
            int count = tensor2.getDataLength();
            int blocks = CAFFE_GET_BLOCKS(count);
            Pointer args = Pointer.to(
                    Pointer.to(new int[]{count}),
                    Pointer.to(new NativePointerObject[]{tensor.getGpuData()}),
                    Pointer.to(new float[]{f}),
                    Pointer.to(new NativePointerObject[]{tensor2.getGpuData()}));
            checkCUDA(JCudaDriver.cuLaunchKernel(
                    this.sub_scalar_gpu_function,
                    blocks, 1, 1,
                    this.CAFFE_CUDA_NUM_THREADS, 1, 1,
                    0, (CUstream) null,
                    args, (Pointer) null));
        } catch (Exception ex) {
            ex.printStackTrace();
        }
    }

    /**
     * Launches {@code scalar_sub_kernel} with (count, {@code f}, {@code tensor} pointer,
     * {@code tensor2} pointer).
     *
     * <p>Count and grid size come from {@code tensor2.getDataLength()}. Exceptions are caught
     * and printed, not rethrown.
     *
     * @param f       scalar argument, passed before the pointers
     * @param tensor  first pointer argument
     * @param tensor2 second pointer argument; also supplies the element count
     */
    public void scalar_sub_gpu(float f, Tensor tensor, Tensor tensor2) {
        try {
            int count = tensor2.getDataLength();
            int blocks = CAFFE_GET_BLOCKS(count);
            Pointer args = Pointer.to(
                    Pointer.to(new int[]{count}),
                    Pointer.to(new float[]{f}),
                    Pointer.to(new NativePointerObject[]{tensor.getGpuData()}),
                    Pointer.to(new NativePointerObject[]{tensor2.getGpuData()}));
            checkCUDA(JCudaDriver.cuLaunchKernel(
                    this.scalar_sub_gpu_function,
                    blocks, 1, 1,
                    this.CAFFE_CUDA_NUM_THREADS, 1, 1,
                    0, (CUstream) null,
                    args, (Pointer) null));
        } catch (Exception ex) {
            ex.printStackTrace();
        }
    }

    /**
     * Launches {@code scalar_sub_kernel} over {@code i2} elements, with both device pointers
     * advanced by {@code i} elements (4 bytes each).
     *
     * <p>The byte offset is computed in {@code long} arithmetic so that element offsets of
     * 2^29 or more do not wrap around before being handed to {@code withByteOffset}.
     * Exceptions are caught and printed, not rethrown.
     *
     * @param f       scalar argument, passed before the pointers
     * @param tensor  first pointer argument
     * @param tensor2 second pointer argument
     * @param i       element offset applied to both pointers
     * @param i2      element count passed to the kernel and used to size the grid
     */
    public void scalar_sub_gpu(float f, Tensor tensor, Tensor tensor2, int i, int i2) {
        try {
            int blocks = CAFFE_GET_BLOCKS(i2);
            // Widen before multiplying: "i * 4" in int arithmetic overflows for large offsets.
            long byteOffset = i * 4L;
            Pointer args = Pointer.to(
                    Pointer.to(new int[]{i2}),
                    Pointer.to(new float[]{f}),
                    Pointer.to(new NativePointerObject[]{tensor.getGpuData().withByteOffset(byteOffset)}),
                    Pointer.to(new NativePointerObject[]{tensor2.getGpuData().withByteOffset(byteOffset)}));
            checkCUDA(JCudaDriver.cuLaunchKernel(
                    this.scalar_sub_gpu_function,
                    blocks, 1, 1,
                    this.CAFFE_CUDA_NUM_THREADS, 1, 1,
                    0, (CUstream) null,
                    args, (Pointer) null));
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Launches {@code mul_kernel} with (count, {@code tensor} pointer, {@code tensor2} pointer,
     * {@code tensor3} pointer).
     *
     * <p>Count and grid size come from {@code tensor3.getDataLength()}. Exceptions are caught
     * and printed, not rethrown.
     *
     * @param tensor  first pointer argument
     * @param tensor2 second pointer argument
     * @param tensor3 third pointer argument; also supplies the element count
     */
    public void mul_gpu(Tensor tensor, Tensor tensor2, Tensor tensor3) {
        try {
            int count = tensor3.getDataLength();
            int blocks = CAFFE_GET_BLOCKS(count);
            Pointer args = Pointer.to(
                    Pointer.to(new int[]{count}),
                    Pointer.to(new NativePointerObject[]{tensor.getGpuData()}),
                    Pointer.to(new NativePointerObject[]{tensor2.getGpuData()}),
                    Pointer.to(new NativePointerObject[]{tensor3.getGpuData()}));
            checkCUDA(JCudaDriver.cuLaunchKernel(
                    this.mul_gpu_function,
                    blocks, 1, 1,
                    this.CAFFE_CUDA_NUM_THREADS, 1, 1,
                    0, (CUstream) null,
                    args, (Pointer) null));
        } catch (Exception ex) {
            ex.printStackTrace();
        }
    }

    /**
     * Launches {@code bool_kernel} with (count, {@code tensor} pointer, {@code tensor2} pointer,
     * {@code tensor3} pointer, {@code f}).
     *
     * <p>Count and grid size come from {@code tensor3.getDataLength()}. Exceptions are caught
     * and printed, not rethrown.
     *
     * @param tensor  first pointer argument
     * @param tensor2 second pointer argument
     * @param tensor3 third pointer argument; also supplies the element count
     * @param f       trailing scalar argument
     */
    public void bool_gpu(Tensor tensor, Tensor tensor2, Tensor tensor3, float f) {
        try {
            int count = tensor3.getDataLength();
            int blocks = CAFFE_GET_BLOCKS(count);
            Pointer args = Pointer.to(
                    Pointer.to(new int[]{count}),
                    Pointer.to(new NativePointerObject[]{tensor.getGpuData()}),
                    Pointer.to(new NativePointerObject[]{tensor2.getGpuData()}),
                    Pointer.to(new NativePointerObject[]{tensor3.getGpuData()}),
                    Pointer.to(new float[]{f}));
            checkCUDA(JCudaDriver.cuLaunchKernel(
                    this.bool_gpu_function,
                    blocks, 1, 1,
                    this.CAFFE_CUDA_NUM_THREADS, 1, 1,
                    0, (CUstream) null,
                    args, (Pointer) null));
        } catch (Exception ex) {
            ex.printStackTrace();
        }
    }

    /**
     * Launches {@code mul_kernel} over {@code i2} elements, with all three device pointers
     * advanced by {@code i} elements (4 bytes each).
     *
     * <p>The byte offset is computed in {@code long} arithmetic so that element offsets of
     * 2^29 or more do not wrap around before being handed to {@code withByteOffset}.
     * Exceptions are caught and printed, not rethrown.
     *
     * @param tensor  first pointer argument
     * @param tensor2 second pointer argument
     * @param tensor3 third pointer argument
     * @param i       element offset applied to all three pointers
     * @param i2      element count passed to the kernel and used to size the grid
     */
    public void mul_gpu(Tensor tensor, Tensor tensor2, Tensor tensor3, int i, int i2) {
        try {
            int blocks = CAFFE_GET_BLOCKS(i2);
            // Widen before multiplying: "i * 4" in int arithmetic overflows for large offsets.
            long byteOffset = i * 4L;
            Pointer args = Pointer.to(
                    Pointer.to(new int[]{i2}),
                    Pointer.to(new NativePointerObject[]{tensor.getGpuData().withByteOffset(byteOffset)}),
                    Pointer.to(new NativePointerObject[]{tensor2.getGpuData().withByteOffset(byteOffset)}),
                    Pointer.to(new NativePointerObject[]{tensor3.getGpuData().withByteOffset(byteOffset)}));
            checkCUDA(JCudaDriver.cuLaunchKernel(
                    this.mul_gpu_function,
                    blocks, 1, 1,
                    this.CAFFE_CUDA_NUM_THREADS, 1, 1,
                    0, (CUstream) null,
                    args, (Pointer) null));
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Launches {@code mul_kernel} over {@code i4} elements, with each device pointer advanced
     * by its own element offset (4 bytes per element).
     *
     * <p>Byte offsets are computed in {@code long} arithmetic so that element offsets of 2^29
     * or more do not wrap around before being handed to {@code withByteOffset}. Exceptions are
     * caught and printed, not rethrown.
     *
     * @param tensor  first pointer argument
     * @param tensor2 second pointer argument
     * @param tensor3 third pointer argument
     * @param i       element offset into {@code tensor}
     * @param i2      element offset into {@code tensor2}
     * @param i3      element offset into {@code tensor3}
     * @param i4      element count passed to the kernel and used to size the grid
     */
    public void mul_gpu(Tensor tensor, Tensor tensor2, Tensor tensor3, int i, int i2, int i3, int i4) {
        try {
            int blocks = CAFFE_GET_BLOCKS(i4);
            // Widen before multiplying: "offset * 4" in int arithmetic overflows for large offsets.
            Pointer args = Pointer.to(
                    Pointer.to(new int[]{i4}),
                    Pointer.to(new NativePointerObject[]{tensor.getGpuData().withByteOffset(i * 4L)}),
                    Pointer.to(new NativePointerObject[]{tensor2.getGpuData().withByteOffset(i2 * 4L)}),
                    Pointer.to(new NativePointerObject[]{tensor3.getGpuData().withByteOffset(i3 * 4L)}));
            checkCUDA(JCudaDriver.cuLaunchKernel(
                    this.mul_gpu_function,
                    blocks, 1, 1,
                    this.CAFFE_CUDA_NUM_THREADS, 1, 1,
                    0, (CUstream) null,
                    args, (Pointer) null));
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Launches {@code mul_scalar_kernel} with (count, {@code tensor} pointer, {@code f},
     * {@code tensor2} pointer).
     *
     * <p>Count and grid size come from {@code tensor2.getDataLength()}. Exceptions are caught
     * and printed, not rethrown.
     *
     * @param tensor  first pointer argument
     * @param f       scalar argument
     * @param tensor2 second pointer argument; also supplies the element count
     */
    public void mul_scalar_gpu(Tensor tensor, float f, Tensor tensor2) {
        try {
            int count = tensor2.getDataLength();
            int blocks = CAFFE_GET_BLOCKS(count);
            Pointer args = Pointer.to(
                    Pointer.to(new int[]{count}),
                    Pointer.to(new NativePointerObject[]{tensor.getGpuData()}),
                    Pointer.to(new float[]{f}),
                    Pointer.to(new NativePointerObject[]{tensor2.getGpuData()}));
            checkCUDA(JCudaDriver.cuLaunchKernel(
                    this.mul_scalar_gpu_function,
                    blocks, 1, 1,
                    this.CAFFE_CUDA_NUM_THREADS, 1, 1,
                    0, (CUstream) null,
                    args, (Pointer) null));
        } catch (Exception ex) {
            ex.printStackTrace();
        }
    }

    /**
     * Launches {@code mul_plus_kernel} with (count, {@code tensor} pointer, {@code tensor2}
     * pointer, {@code tensor3} pointer).
     *
     * <p>Count and grid size come from {@code tensor3.getDataLength()}. Exceptions are caught
     * and printed, not rethrown.
     *
     * @param tensor  first pointer argument
     * @param tensor2 second pointer argument
     * @param tensor3 third pointer argument; also supplies the element count
     */
    public void mul_plus_gpu(Tensor tensor, Tensor tensor2, Tensor tensor3) {
        try {
            int count = tensor3.getDataLength();
            int blocks = CAFFE_GET_BLOCKS(count);
            Pointer args = Pointer.to(
                    Pointer.to(new int[]{count}),
                    Pointer.to(new NativePointerObject[]{tensor.getGpuData()}),
                    Pointer.to(new NativePointerObject[]{tensor2.getGpuData()}),
                    Pointer.to(new NativePointerObject[]{tensor3.getGpuData()}));
            checkCUDA(JCudaDriver.cuLaunchKernel(
                    this.mul_plus_gpu_function,
                    blocks, 1, 1,
                    this.CAFFE_CUDA_NUM_THREADS, 1, 1,
                    0, (CUstream) null,
                    args, (Pointer) null));
        } catch (Exception ex) {
            ex.printStackTrace();
        }
    }

    /**
     * Launches {@code mul_plus_scalar_kernel} with (count, {@code tensor} pointer, {@code f},
     * {@code tensor2} pointer).
     *
     * <p>Count and grid size come from {@code tensor2.getDataLength()}. Exceptions are caught
     * and printed, not rethrown.
     *
     * @param tensor  first pointer argument
     * @param f       scalar argument
     * @param tensor2 second pointer argument; also supplies the element count
     */
    public void mul_plus_scalar_gpu(Tensor tensor, float f, Tensor tensor2) {
        try {
            int count = tensor2.getDataLength();
            int blocks = CAFFE_GET_BLOCKS(count);
            Pointer args = Pointer.to(
                    Pointer.to(new int[]{count}),
                    Pointer.to(new NativePointerObject[]{tensor.getGpuData()}),
                    Pointer.to(new float[]{f}),
                    Pointer.to(new NativePointerObject[]{tensor2.getGpuData()}));
            checkCUDA(JCudaDriver.cuLaunchKernel(
                    this.mul_plus_scalar_gpu_function,
                    blocks, 1, 1,
                    this.CAFFE_CUDA_NUM_THREADS, 1, 1,
                    0, (CUstream) null,
                    args, (Pointer) null));
        } catch (Exception ex) {
            ex.printStackTrace();
        }
    }

    /**
     * Launches {@code mul_plus_scalar_axis_kernel} with (count, {@code tensor} pointer,
     * {@code f}, {@code tensor2} pointer, {@code i}).
     *
     * <p>Count and grid size come from {@code tensor2.getDataLength()}. Exceptions are caught
     * and printed, not rethrown.
     *
     * @param tensor  first pointer argument
     * @param f       scalar argument
     * @param tensor2 second pointer argument; also supplies the element count
     * @param i       trailing integer argument, forwarded unchanged
     */
    public void mul_plus_scalar_gpu(Tensor tensor, float f, Tensor tensor2, int i) {
        try {
            int count = tensor2.getDataLength();
            int blocks = CAFFE_GET_BLOCKS(count);
            Pointer args = Pointer.to(
                    Pointer.to(new int[]{count}),
                    Pointer.to(new NativePointerObject[]{tensor.getGpuData()}),
                    Pointer.to(new float[]{f}),
                    Pointer.to(new NativePointerObject[]{tensor2.getGpuData()}),
                    Pointer.to(new int[]{i}));
            checkCUDA(JCudaDriver.cuLaunchKernel(
                    this.mul_plus_scalar_axis_gpu_function,
                    blocks, 1, 1,
                    this.CAFFE_CUDA_NUM_THREADS, 1, 1,
                    0, (CUstream) null,
                    args, (Pointer) null));
        } catch (Exception ex) {
            ex.printStackTrace();
        }
    }

    /**
     * Launches {@code div_kernel} with (count, {@code tensor} pointer, {@code tensor2} pointer,
     * {@code tensor3} pointer).
     *
     * <p>Count and grid size come from {@code tensor3.getDataLength()}. Exceptions are caught
     * and printed, not rethrown.
     *
     * @param tensor  first pointer argument
     * @param tensor2 second pointer argument
     * @param tensor3 third pointer argument; also supplies the element count
     */
    public void div_gpu(Tensor tensor, Tensor tensor2, Tensor tensor3) {
        try {
            int count = tensor3.getDataLength();
            int blocks = CAFFE_GET_BLOCKS(count);
            Pointer args = Pointer.to(
                    Pointer.to(new int[]{count}),
                    Pointer.to(new NativePointerObject[]{tensor.getGpuData()}),
                    Pointer.to(new NativePointerObject[]{tensor2.getGpuData()}),
                    Pointer.to(new NativePointerObject[]{tensor3.getGpuData()}));
            checkCUDA(JCudaDriver.cuLaunchKernel(
                    this.div_gpu_function,
                    blocks, 1, 1,
                    this.CAFFE_CUDA_NUM_THREADS, 1, 1,
                    0, (CUstream) null,
                    args, (Pointer) null));
        } catch (Exception ex) {
            ex.printStackTrace();
        }
    }

    /**
     * Launches {@code div_axis_kernel} with (count, {@code tensor} pointer, {@code tensor2}
     * pointer, {@code tensor3} pointer, stride).
     *
     * <p>The stride is {@code channel * height * width} of {@code tensor} when {@code i == 0},
     * {@code height * width} when {@code i == 1}, and 0 for any other value. Count and grid
     * size come from {@code tensor3.getDataLength()}. Exceptions are caught and printed, not
     * rethrown.
     *
     * @param tensor  first pointer argument; supplies the stride dimensions
     * @param tensor2 second pointer argument
     * @param tensor3 third pointer argument; supplies the element count
     * @param i       axis selector (0 or 1)
     */
    public void div_gpu(Tensor tensor, Tensor tensor2, Tensor tensor3, int i) {
        try {
            int stride = 0;
            if (i == 0) {
                stride = tensor.channel * tensor.height * tensor.width;
            } else if (i == 1) {
                stride = tensor.height * tensor.width;
            }
            int count = tensor3.getDataLength();
            int blocks = CAFFE_GET_BLOCKS(count);
            Pointer args = Pointer.to(
                    Pointer.to(new int[]{count}),
                    Pointer.to(new NativePointerObject[]{tensor.getGpuData()}),
                    Pointer.to(new NativePointerObject[]{tensor2.getGpuData()}),
                    Pointer.to(new NativePointerObject[]{tensor3.getGpuData()}),
                    Pointer.to(new int[]{stride}));
            checkCUDA(JCudaDriver.cuLaunchKernel(
                    this.div_axis_gpu_function,
                    blocks, 1, 1,
                    this.CAFFE_CUDA_NUM_THREADS, 1, 1,
                    0, (CUstream) null,
                    args, (Pointer) null));
        } catch (Exception ex) {
            ex.printStackTrace();
        }
    }

    /**
     * Launches {@code div_scalar_kernel} with (count, {@code tensor} pointer, {@code f},
     * {@code tensor2} pointer).
     *
     * <p>Count and grid size come from {@code tensor2.getDataLength()}. Exceptions are caught
     * and printed, not rethrown.
     *
     * @param tensor  first pointer argument
     * @param f       scalar argument
     * @param tensor2 second pointer argument; also supplies the element count
     */
    public void div_scalar_gpu(Tensor tensor, float f, Tensor tensor2) {
        try {
            int count = tensor2.getDataLength();
            int blocks = CAFFE_GET_BLOCKS(count);
            Pointer args = Pointer.to(
                    Pointer.to(new int[]{count}),
                    Pointer.to(new NativePointerObject[]{tensor.getGpuData()}),
                    Pointer.to(new float[]{f}),
                    Pointer.to(new NativePointerObject[]{tensor2.getGpuData()}));
            checkCUDA(JCudaDriver.cuLaunchKernel(
                    this.div_scalar_gpu_function,
                    blocks, 1, 1,
                    this.CAFFE_CUDA_NUM_THREADS, 1, 1,
                    0, (CUstream) null,
                    args, (Pointer) null));
        } catch (Exception ex) {
            ex.printStackTrace();
        }
    }

    /**
     * Launches {@code scalar_div_kernel} with (count, {@code tensor} pointer, {@code f},
     * {@code tensor2} pointer).
     *
     * <p>Count and grid size come from {@code tensor2.getDataLength()}. Exceptions are caught
     * and printed, not rethrown.
     *
     * @param tensor  first pointer argument
     * @param f       scalar argument
     * @param tensor2 second pointer argument; also supplies the element count
     */
    public void scalar_div_gpu(Tensor tensor, float f, Tensor tensor2) {
        try {
            int count = tensor2.getDataLength();
            int blocks = CAFFE_GET_BLOCKS(count);
            Pointer args = Pointer.to(
                    Pointer.to(new int[]{count}),
                    Pointer.to(new NativePointerObject[]{tensor.getGpuData()}),
                    Pointer.to(new float[]{f}),
                    Pointer.to(new NativePointerObject[]{tensor2.getGpuData()}));
            checkCUDA(JCudaDriver.cuLaunchKernel(
                    this.scalar_div_gpu_function,
                    blocks, 1, 1,
                    this.CAFFE_CUDA_NUM_THREADS, 1, 1,
                    0, (CUstream) null,
                    args, (Pointer) null));
        } catch (Exception ex) {
            ex.printStackTrace();
        }
    }

    /**
     * Launches {@code div_plus_kernel} with (count, {@code tensor} pointer, {@code tensor2}
     * pointer, {@code tensor3} pointer).
     *
     * <p>Count and grid size come from {@code tensor3.getDataLength()}. Exceptions are caught
     * and printed, not rethrown.
     *
     * @param tensor  first pointer argument
     * @param tensor2 second pointer argument
     * @param tensor3 third pointer argument; also supplies the element count
     */
    public void div_plus_gpu(Tensor tensor, Tensor tensor2, Tensor tensor3) {
        try {
            int count = tensor3.getDataLength();
            int blocks = CAFFE_GET_BLOCKS(count);
            Pointer args = Pointer.to(
                    Pointer.to(new int[]{count}),
                    Pointer.to(new NativePointerObject[]{tensor.getGpuData()}),
                    Pointer.to(new NativePointerObject[]{tensor2.getGpuData()}),
                    Pointer.to(new NativePointerObject[]{tensor3.getGpuData()}));
            checkCUDA(JCudaDriver.cuLaunchKernel(
                    this.div_plus_gpu_function,
                    blocks, 1, 1,
                    this.CAFFE_CUDA_NUM_THREADS, 1, 1,
                    0, (CUstream) null,
                    args, (Pointer) null));
        } catch (Exception ex) {
            ex.printStackTrace();
        }
    }

    /**
     * Launches {@code div_plus_axis_kernel} with (count, {@code tensor} pointer, {@code tensor2}
     * pointer, {@code tensor3} pointer, stride).
     *
     * <p>The stride is {@code channel * height * width} of {@code tensor} when {@code i == 0},
     * {@code height * width} when {@code i == 1}, and 0 for any other value. Count and grid
     * size come from {@code tensor3.getDataLength()}. Exceptions are caught and printed, not
     * rethrown.
     *
     * @param tensor  first pointer argument; supplies the stride dimensions
     * @param tensor2 second pointer argument
     * @param tensor3 third pointer argument; supplies the element count
     * @param i       axis selector (0 or 1)
     */
    public void div_plus_gpu(Tensor tensor, Tensor tensor2, Tensor tensor3, int i) {
        try {
            int stride = 0;
            if (i == 0) {
                stride = tensor.channel * tensor.height * tensor.width;
            } else if (i == 1) {
                stride = tensor.height * tensor.width;
            }
            int count = tensor3.getDataLength();
            int blocks = CAFFE_GET_BLOCKS(count);
            Pointer args = Pointer.to(
                    Pointer.to(new int[]{count}),
                    Pointer.to(new NativePointerObject[]{tensor.getGpuData()}),
                    Pointer.to(new NativePointerObject[]{tensor2.getGpuData()}),
                    Pointer.to(new NativePointerObject[]{tensor3.getGpuData()}),
                    Pointer.to(new int[]{stride}));
            checkCUDA(JCudaDriver.cuLaunchKernel(
                    this.div_plus_axis_gpu_function,
                    blocks, 1, 1,
                    this.CAFFE_CUDA_NUM_THREADS, 1, 1,
                    0, (CUstream) null,
                    args, (Pointer) null));
        } catch (Exception ex) {
            ex.printStackTrace();
        }
    }

    /**
     * Launches {@code div_plus_scalar_kernel} with (count, {@code tensor} pointer, {@code f},
     * {@code tensor2} pointer).
     *
     * <p>Count and grid size come from {@code tensor2.getDataLength()}. Exceptions are caught
     * and printed, not rethrown.
     *
     * @param tensor  first pointer argument
     * @param f       scalar argument
     * @param tensor2 second pointer argument; also supplies the element count
     */
    public void div_plus_scalar_gpu(Tensor tensor, float f, Tensor tensor2) {
        try {
            int count = tensor2.getDataLength();
            int blocks = CAFFE_GET_BLOCKS(count);
            Pointer args = Pointer.to(
                    Pointer.to(new int[]{count}),
                    Pointer.to(new NativePointerObject[]{tensor.getGpuData()}),
                    Pointer.to(new float[]{f}),
                    Pointer.to(new NativePointerObject[]{tensor2.getGpuData()}));
            checkCUDA(JCudaDriver.cuLaunchKernel(
                    this.div_plus_scalar_gpu_function,
                    blocks, 1, 1,
                    this.CAFFE_CUDA_NUM_THREADS, 1, 1,
                    0, (CUstream) null,
                    args, (Pointer) null));
        } catch (Exception ex) {
            ex.printStackTrace();
        }
    }

    /**
     * Launches {@code scalar_plus_div_kernel} with (count, {@code tensor} pointer, {@code f},
     * {@code tensor2} pointer).
     *
     * <p>Count and grid size come from {@code tensor2.getDataLength()}. Exceptions are caught
     * and printed, not rethrown.
     *
     * @param tensor  first pointer argument
     * @param f       scalar argument
     * @param tensor2 second pointer argument; also supplies the element count
     */
    public void scalar_plus_div_gpu(Tensor tensor, float f, Tensor tensor2) {
        try {
            int count = tensor2.getDataLength();
            int blocks = CAFFE_GET_BLOCKS(count);
            Pointer args = Pointer.to(
                    Pointer.to(new int[]{count}),
                    Pointer.to(new NativePointerObject[]{tensor.getGpuData()}),
                    Pointer.to(new float[]{f}),
                    Pointer.to(new NativePointerObject[]{tensor2.getGpuData()}));
            checkCUDA(JCudaDriver.cuLaunchKernel(
                    this.scalar_plus_div_gpu_function,
                    blocks, 1, 1,
                    this.CAFFE_CUDA_NUM_THREADS, 1, 1,
                    0, (CUstream) null,
                    args, (Pointer) null));
        } catch (Exception ex) {
            ex.printStackTrace();
        }
    }

    /**
     * Dispatches on {@code TensorOP.getAxis(tensor, tensor4)}.
     *
     * <p>When the axis is non-negative, delegates to the five-argument overload with that axis.
     * Otherwise launches {@code div_bGrad_kernel} with (count, {@code tensor}, {@code tensor2},
     * {@code tensor3}, {@code tensor4} pointers), with count and grid size taken from
     * {@code tensor4.getDataLength()}. Exceptions raised by the launch path are caught and
     * printed, not rethrown; the axis lookup itself is outside that handler.
     *
     * @param tensor  first pointer argument
     * @param tensor2 second pointer argument
     * @param tensor3 third pointer argument
     * @param tensor4 fourth pointer argument; also supplies the element count
     */
    public void div_bGrad_gpu(Tensor tensor, Tensor tensor2, Tensor tensor3, Tensor tensor4) {
        int axis = TensorOP.getAxis(tensor, tensor4);
        if (axis < 0) {
            try {
                int count = tensor4.getDataLength();
                int blocks = CAFFE_GET_BLOCKS(count);
                Pointer args = Pointer.to(
                        Pointer.to(new int[]{count}),
                        Pointer.to(new NativePointerObject[]{tensor.getGpuData()}),
                        Pointer.to(new NativePointerObject[]{tensor2.getGpuData()}),
                        Pointer.to(new NativePointerObject[]{tensor3.getGpuData()}),
                        Pointer.to(new NativePointerObject[]{tensor4.getGpuData()}));
                checkCUDA(JCudaDriver.cuLaunchKernel(
                        this.div_bGrad_gpu_function,
                        blocks, 1, 1,
                        this.CAFFE_CUDA_NUM_THREADS, 1, 1,
                        0, (CUstream) null,
                        args, (Pointer) null));
            } catch (Exception ex) {
                ex.printStackTrace();
            }
            return;
        }
        div_bGrad_gpu(tensor, tensor2, tensor3, tensor4, axis);
    }

    /**
     * Launches the kernel held in {@code div_bGrad_axis_gpu_function} over
     * {@code tensor4.getDataLength()} elements.
     *
     * <p>The trailing integer kernel argument depends on {@code i}:
     * {@code channel * height * width} of {@code tensor} for axis 0, {@code height * width}
     * for axis 1, and 0 for any other value.
     * The preceding arguments are the element count of {@code tensor4} and the device data of
     * {@code tensor}, {@code tensor2}, {@code tensor3} and {@code tensor4}.
     * A non-zero launch status is reported by {@link #checkCUDA(int)}; any exception is
     * caught and its stack trace printed.
     *
     * @param tensor  first device buffer; also supplies the dimensions used for the axis size
     * @param tensor2 second device buffer passed to the kernel
     * @param tensor3 third device buffer passed to the kernel
     * @param tensor4 tensor that sizes the launch
     * @param i       axis selector (0 or 1 are the only values given a non-zero size)
     */
    public void div_bGrad_gpu(Tensor tensor, Tensor tensor2, Tensor tensor3, Tensor tensor4, int i) {
        try {
            final int axisSize;
            if (i == 0) {
                axisSize = tensor.channel * tensor.height * tensor.width;
            } else if (i == 1) {
                axisSize = tensor.height * tensor.width;
            } else {
                axisSize = 0;
            }
            final int count = tensor4.getDataLength();
            final Pointer kernelArgs = Pointer.to(
                    Pointer.to(new int[]{count}),
                    Pointer.to(tensor.getGpuData()),
                    Pointer.to(tensor2.getGpuData()),
                    Pointer.to(tensor3.getGpuData()),
                    Pointer.to(tensor4.getGpuData()),
                    Pointer.to(new int[]{axisSize}));
            final int status = JCudaDriver.cuLaunchKernel(this.div_bGrad_axis_gpu_function,
                    CAFFE_GET_BLOCKS(count), 1, 1,
                    this.CAFFE_CUDA_NUM_THREADS, 1, 1,
                    0, null, kernelArgs, null);
            checkCUDA(status);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Launches the kernel held in {@code div_scalar_bGrad_gpu_function} over
     * {@code tensor3.getDataLength()} elements.
     *
     * <p>Kernel arguments, in order: element count of {@code tensor3}, device data of
     * {@code tensor}, the scalar {@code f}, device data of {@code tensor2}, device data of
     * {@code tensor3}.
     * A non-zero launch status is reported by {@link #checkCUDA(int)}; any exception is
     * caught and its stack trace printed.
     *
     * @param tensor  first device buffer passed to the kernel
     * @param f       scalar passed to the kernel by value
     * @param tensor2 second device buffer passed to the kernel
     * @param tensor3 tensor that sizes the launch; its device data is the last kernel argument
     */
    public void div_scalar_bGrad_gpu(Tensor tensor, float f, Tensor tensor2, Tensor tensor3) {
        try {
            final int count = tensor3.getDataLength();
            final Pointer kernelArgs = Pointer.to(
                    Pointer.to(new int[]{count}),
                    Pointer.to(tensor.getGpuData()),
                    Pointer.to(new float[]{f}),
                    Pointer.to(tensor2.getGpuData()),
                    Pointer.to(tensor3.getGpuData()));
            final int status = JCudaDriver.cuLaunchKernel(this.div_scalar_bGrad_gpu_function,
                    CAFFE_GET_BLOCKS(count), 1, 1,
                    this.CAFFE_CUDA_NUM_THREADS, 1, 1,
                    0, null, kernelArgs, null);
            checkCUDA(status);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Launches the kernel held in {@code pow_gpu_function} over
     * {@code tensor2.getDataLength()} elements.
     *
     * <p>Kernel arguments, in order: element count of {@code tensor2}, device data of
     * {@code tensor}, the scalar {@code f} (presumably the exponent; kernel source not visible
     * here), device data of {@code tensor2}.
     * A non-zero launch status is reported by {@link #checkCUDA(int)}; any exception is
     * caught and its stack trace printed.
     *
     * @param tensor  tensor whose device data is the second kernel argument
     * @param f       scalar passed to the kernel by value
     * @param tensor2 tensor that sizes the launch; its device data is the last kernel argument
     */
    public void pow_gpu(Tensor tensor, float f, Tensor tensor2) {
        try {
            final int count = tensor2.getDataLength();
            final Pointer kernelArgs = Pointer.to(
                    Pointer.to(new int[]{count}),
                    Pointer.to(tensor.getGpuData()),
                    Pointer.to(new float[]{f}),
                    Pointer.to(tensor2.getGpuData()));
            final int status = JCudaDriver.cuLaunchKernel(this.pow_gpu_function,
                    CAFFE_GET_BLOCKS(count), 1, 1,
                    this.CAFFE_CUDA_NUM_THREADS, 1, 1,
                    0, null, kernelArgs, null);
            checkCUDA(status);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Launches the kernel held in {@code log_gpu_function} over
     * {@code tensor2.getDataLength()} elements.
     *
     * <p>Kernel arguments, in order: element count of {@code tensor2}, device data of
     * {@code tensor}, device data of {@code tensor2}.
     * A non-zero launch status is reported by {@link #checkCUDA(int)}; any exception is
     * caught and its stack trace printed.
     *
     * @param tensor  tensor whose device data is the second kernel argument
     * @param tensor2 tensor that sizes the launch; its device data is the last kernel argument
     */
    public void log_gpu(Tensor tensor, Tensor tensor2) {
        try {
            final int count = tensor2.getDataLength();
            final Pointer kernelArgs = Pointer.to(
                    Pointer.to(new int[]{count}),
                    Pointer.to(tensor.getGpuData()),
                    Pointer.to(tensor2.getGpuData()));
            final int status = JCudaDriver.cuLaunchKernel(this.log_gpu_function,
                    CAFFE_GET_BLOCKS(count), 1, 1,
                    this.CAFFE_CUDA_NUM_THREADS, 1, 1,
                    0, null, kernelArgs, null);
            checkCUDA(status);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Launches the kernel held in {@code exp_gpu_function} over
     * {@code tensor2.getDataLength()} elements.
     *
     * <p>Kernel arguments, in order: element count of {@code tensor2}, device data of
     * {@code tensor}, device data of {@code tensor2}.
     * A non-zero launch status is reported by {@link #checkCUDA(int)}; any exception is
     * caught and its stack trace printed.
     *
     * @param tensor  tensor whose device data is the second kernel argument
     * @param tensor2 tensor that sizes the launch; its device data is the last kernel argument
     */
    public void exp_gpu(Tensor tensor, Tensor tensor2) {
        try {
            final int count = tensor2.getDataLength();
            final Pointer kernelArgs = Pointer.to(
                    Pointer.to(new int[]{count}),
                    Pointer.to(tensor.getGpuData()),
                    Pointer.to(tensor2.getGpuData()));
            final int status = JCudaDriver.cuLaunchKernel(this.exp_gpu_function,
                    CAFFE_GET_BLOCKS(count), 1, 1,
                    this.CAFFE_CUDA_NUM_THREADS, 1, 1,
                    0, null, kernelArgs, null);
            checkCUDA(status);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Launches one of two sum kernels depending on {@code i}.
     *
     * <p>For {@code i == 0} the kernel held in {@code sum_gpu_function} is launched with a
     * single block of a single thread, receiving {@code tensor.dataLength} and the device data
     * of {@code tensor} and {@code tensor2}. For any other {@code i} the kernel held in
     * {@code sum_channel_gpu_function} is launched over {@code tensor2.getDataLength()}
     * elements, additionally receiving {@code tensor}'s channel, height and width.
     * A non-zero launch status is reported by {@link #checkCUDA(int)}; any exception is
     * caught and its stack trace printed.
     *
     * @param tensor  source tensor
     * @param tensor2 tensor whose device data is passed after the source's (presumably the output)
     * @param i       mode selector; 0 selects the single-thread kernel
     */
    public void sum_gpu(Tensor tensor, Tensor tensor2, int i) {
        try {
            if (i != 0) {
                final int outCount = tensor2.getDataLength();
                final Pointer channelArgs = Pointer.to(
                        Pointer.to(new int[]{outCount}),
                        Pointer.to(tensor.getGpuData()),
                        Pointer.to(tensor2.getGpuData()),
                        Pointer.to(new int[]{tensor.channel}),
                        Pointer.to(new int[]{tensor.height}),
                        Pointer.to(new int[]{tensor.width}));
                checkCUDA(JCudaDriver.cuLaunchKernel(this.sum_channel_gpu_function,
                        CAFFE_GET_BLOCKS(outCount), 1, 1,
                        this.CAFFE_CUDA_NUM_THREADS, 1, 1,
                        0, null, channelArgs, null));
                return;
            }
            final Pointer fullArgs = Pointer.to(
                    Pointer.to(new int[]{tensor.dataLength}),
                    Pointer.to(tensor.getGpuData()),
                    Pointer.to(tensor2.getGpuData()));
            // Grid and block are both 1x1x1: the kernel runs on a single thread.
            checkCUDA(JCudaDriver.cuLaunchKernel(this.sum_gpu_function,
                    1, 1, 1,
                    1, 1, 1,
                    0, null, fullArgs, null));
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Launches one of two max kernels depending on {@code i}.
     *
     * <p>For {@code i == 0} the kernel held in {@code max_gpu_function} is launched with a
     * single block of a single thread, receiving {@code tensor.dataLength} and the device data
     * of {@code tensor} and {@code tensor2}. For any other {@code i} the kernel held in
     * {@code max_channel_gpu_function} is launched over {@code tensor2.getDataLength()}
     * elements, additionally receiving {@code tensor}'s channel, height and width.
     * A non-zero launch status is reported by {@link #checkCUDA(int)}; any exception is
     * caught and its stack trace printed.
     *
     * @param tensor  source tensor
     * @param tensor2 tensor whose device data is passed after the source's (presumably the output)
     * @param i       mode selector; 0 selects the single-thread kernel
     */
    public void max_gpu(Tensor tensor, Tensor tensor2, int i) {
        try {
            if (i != 0) {
                final int outCount = tensor2.getDataLength();
                final Pointer channelArgs = Pointer.to(
                        Pointer.to(new int[]{outCount}),
                        Pointer.to(tensor.getGpuData()),
                        Pointer.to(tensor2.getGpuData()),
                        Pointer.to(new int[]{tensor.channel}),
                        Pointer.to(new int[]{tensor.height}),
                        Pointer.to(new int[]{tensor.width}));
                checkCUDA(JCudaDriver.cuLaunchKernel(this.max_channel_gpu_function,
                        CAFFE_GET_BLOCKS(outCount), 1, 1,
                        this.CAFFE_CUDA_NUM_THREADS, 1, 1,
                        0, null, channelArgs, null));
                return;
            }
            final Pointer fullArgs = Pointer.to(
                    Pointer.to(new int[]{tensor.dataLength}),
                    Pointer.to(tensor.getGpuData()),
                    Pointer.to(tensor2.getGpuData()));
            // Grid and block are both 1x1x1: the kernel runs on a single thread.
            checkCUDA(JCudaDriver.cuLaunchKernel(this.max_gpu_function,
                    1, 1, 1,
                    1, 1, 1,
                    0, null, fullArgs, null));
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Launches one of two max-backward kernels depending on {@code i}.
     *
     * <p>For {@code i == 0} the kernel held in {@code max_backward_gpu_function} is launched
     * with a single block of a single thread, receiving {@code tensor2.dataLength} and the
     * device data of {@code tensor}, {@code tensor2} and {@code tensor3}. For any other
     * {@code i} the kernel held in {@code max_channel_backward_gpu_function} is launched over
     * {@code tensor3.getDataLength()} elements, additionally receiving {@code tensor2}'s
     * channel, height and width.
     * A non-zero launch status is reported by {@link #checkCUDA(int)}; any exception is
     * caught and its stack trace printed.
     *
     * @param tensor  first device buffer passed to the kernel
     * @param tensor2 second device buffer; supplies the length or dimensions passed to the kernel
     * @param tensor3 third device buffer; sizes the launch in the non-zero mode
     * @param i       mode selector; 0 selects the single-thread kernel
     */
    public void max_backward_gpu(Tensor tensor, Tensor tensor2, Tensor tensor3, int i) {
        try {
            if (i != 0) {
                final int count = tensor3.getDataLength();
                final Pointer channelArgs = Pointer.to(
                        Pointer.to(new int[]{count}),
                        Pointer.to(tensor.getGpuData()),
                        Pointer.to(tensor2.getGpuData()),
                        Pointer.to(tensor3.getGpuData()),
                        Pointer.to(new int[]{tensor2.channel}),
                        Pointer.to(new int[]{tensor2.height}),
                        Pointer.to(new int[]{tensor2.width}));
                checkCUDA(JCudaDriver.cuLaunchKernel(this.max_channel_backward_gpu_function,
                        CAFFE_GET_BLOCKS(count), 1, 1,
                        this.CAFFE_CUDA_NUM_THREADS, 1, 1,
                        0, null, channelArgs, null));
                return;
            }
            final Pointer fullArgs = Pointer.to(
                    Pointer.to(new int[]{tensor2.dataLength}),
                    Pointer.to(tensor.getGpuData()),
                    Pointer.to(tensor2.getGpuData()),
                    Pointer.to(tensor3.getGpuData()));
            // Grid and block are both 1x1x1: the kernel runs on a single thread.
            checkCUDA(JCudaDriver.cuLaunchKernel(this.max_backward_gpu_function,
                    1, 1, 1,
                    1, 1, 1,
                    0, null, fullArgs, null));
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Launches the kernel held in {@code sqrt_gpu_function} over
     * {@code tensor2.getDataLength()} elements.
     *
     * <p>Kernel arguments, in order: element count of {@code tensor2}, device data of
     * {@code tensor}, device data of {@code tensor2}.
     * A non-zero launch status is reported by {@link #checkCUDA(int)}; any exception is
     * caught and its stack trace printed.
     *
     * @param tensor  tensor whose device data is the second kernel argument
     * @param tensor2 tensor that sizes the launch; its device data is the last kernel argument
     */
    public void sqrt_gpu(Tensor tensor, Tensor tensor2) {
        try {
            final int count = tensor2.getDataLength();
            final Pointer kernelArgs = Pointer.to(
                    Pointer.to(new int[]{count}),
                    Pointer.to(tensor.getGpuData()),
                    Pointer.to(tensor2.getGpuData()));
            final int status = JCudaDriver.cuLaunchKernel(this.sqrt_gpu_function,
                    CAFFE_GET_BLOCKS(count), 1, 1,
                    this.CAFFE_CUDA_NUM_THREADS, 1, 1,
                    0, null, kernelArgs, null);
            checkCUDA(status);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Launches the kernel held in {@code sin_gpu_function} over
     * {@code tensor2.getDataLength()} elements.
     *
     * <p>Kernel arguments, in order: element count of {@code tensor2}, device data of
     * {@code tensor}, device data of {@code tensor2}.
     * A non-zero launch status is reported by {@link #checkCUDA(int)}; any exception is
     * caught and its stack trace printed.
     *
     * @param tensor  tensor whose device data is the second kernel argument
     * @param tensor2 tensor that sizes the launch; its device data is the last kernel argument
     */
    public void sin_gpu(Tensor tensor, Tensor tensor2) {
        try {
            final int count = tensor2.getDataLength();
            final Pointer kernelArgs = Pointer.to(
                    Pointer.to(new int[]{count}),
                    Pointer.to(tensor.getGpuData()),
                    Pointer.to(tensor2.getGpuData()));
            final int status = JCudaDriver.cuLaunchKernel(this.sin_gpu_function,
                    CAFFE_GET_BLOCKS(count), 1, 1,
                    this.CAFFE_CUDA_NUM_THREADS, 1, 1,
                    0, null, kernelArgs, null);
            checkCUDA(status);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Launches the kernel held in {@code cos_gpu_function} over
     * {@code tensor2.getDataLength()} elements.
     *
     * <p>Kernel arguments, in order: element count of {@code tensor2}, device data of
     * {@code tensor}, device data of {@code tensor2}.
     * A non-zero launch status is reported by {@link #checkCUDA(int)}; any exception is
     * caught and its stack trace printed.
     *
     * @param tensor  tensor whose device data is the second kernel argument
     * @param tensor2 tensor that sizes the launch; its device data is the last kernel argument
     */
    public void cos_gpu(Tensor tensor, Tensor tensor2) {
        try {
            final int count = tensor2.getDataLength();
            final Pointer kernelArgs = Pointer.to(
                    Pointer.to(new int[]{count}),
                    Pointer.to(tensor.getGpuData()),
                    Pointer.to(tensor2.getGpuData()));
            final int status = JCudaDriver.cuLaunchKernel(this.cos_gpu_function,
                    CAFFE_GET_BLOCKS(count), 1, 1,
                    this.CAFFE_CUDA_NUM_THREADS, 1, 1,
                    0, null, kernelArgs, null);
            checkCUDA(status);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Launches the kernel held in {@code tan_gpu_function} over
     * {@code tensor2.getDataLength()} elements.
     *
     * <p>Kernel arguments, in order: element count of {@code tensor2}, device data of
     * {@code tensor}, device data of {@code tensor2}.
     * A non-zero launch status is reported by {@link #checkCUDA(int)}; any exception is
     * caught and its stack trace printed.
     *
     * @param tensor  tensor whose device data is the second kernel argument
     * @param tensor2 tensor that sizes the launch; its device data is the last kernel argument
     */
    public void tan_gpu(Tensor tensor, Tensor tensor2) {
        try {
            final int count = tensor2.getDataLength();
            final Pointer kernelArgs = Pointer.to(
                    Pointer.to(new int[]{count}),
                    Pointer.to(tensor.getGpuData()),
                    Pointer.to(tensor2.getGpuData()));
            final int status = JCudaDriver.cuLaunchKernel(this.tan_gpu_function,
                    CAFFE_GET_BLOCKS(count), 1, 1,
                    this.CAFFE_CUDA_NUM_THREADS, 1, 1,
                    0, null, kernelArgs, null);
            checkCUDA(status);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Launches the kernel held in {@code tan_back_gpu_function} over
     * {@code tensor2.getDataLength()} elements.
     *
     * <p>Kernel arguments, in order: element count of {@code tensor2}, device data of
     * {@code tensor}, device data of {@code tensor2}.
     * A non-zero launch status is reported by {@link #checkCUDA(int)}; any exception is
     * caught and its stack trace printed.
     *
     * @param tensor  tensor whose device data is the second kernel argument
     * @param tensor2 tensor that sizes the launch; its device data is the last kernel argument
     */
    public void tan_back_gpu(Tensor tensor, Tensor tensor2) {
        try {
            final int count = tensor2.getDataLength();
            final Pointer kernelArgs = Pointer.to(
                    Pointer.to(new int[]{count}),
                    Pointer.to(tensor.getGpuData()),
                    Pointer.to(tensor2.getGpuData()));
            final int status = JCudaDriver.cuLaunchKernel(this.tan_back_gpu_function,
                    CAFFE_GET_BLOCKS(count), 1, 1,
                    this.CAFFE_CUDA_NUM_THREADS, 1, 1,
                    0, null, kernelArgs, null);
            checkCUDA(status);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Launches the kernel held in {@code atan_gpu_function} over
     * {@code tensor2.getDataLength()} elements.
     *
     * <p>Kernel arguments, in order: element count of {@code tensor2}, device data of
     * {@code tensor}, device data of {@code tensor2}.
     * A non-zero launch status is reported by {@link #checkCUDA(int)}; any exception is
     * caught and its stack trace printed.
     *
     * @param tensor  tensor whose device data is the second kernel argument
     * @param tensor2 tensor that sizes the launch; its device data is the last kernel argument
     */
    public void atan_gpu(Tensor tensor, Tensor tensor2) {
        try {
            final int count = tensor2.getDataLength();
            final Pointer kernelArgs = Pointer.to(
                    Pointer.to(new int[]{count}),
                    Pointer.to(tensor.getGpuData()),
                    Pointer.to(tensor2.getGpuData()));
            final int status = JCudaDriver.cuLaunchKernel(this.atan_gpu_function,
                    CAFFE_GET_BLOCKS(count), 1, 1,
                    this.CAFFE_CUDA_NUM_THREADS, 1, 1,
                    0, null, kernelArgs, null);
            checkCUDA(status);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Launches the kernel held in {@code atan_back_gpu_function} over
     * {@code tensor2.getDataLength()} elements.
     *
     * <p>Kernel arguments, in order: element count of {@code tensor2}, device data of
     * {@code tensor}, device data of {@code tensor2}.
     * A non-zero launch status is reported by {@link #checkCUDA(int)}; any exception is
     * caught and its stack trace printed.
     *
     * @param tensor  tensor whose device data is the second kernel argument
     * @param tensor2 tensor that sizes the launch; its device data is the last kernel argument
     */
    public void atan_back_gpu(Tensor tensor, Tensor tensor2) {
        try {
            final int count = tensor2.getDataLength();
            final Pointer kernelArgs = Pointer.to(
                    Pointer.to(new int[]{count}),
                    Pointer.to(tensor.getGpuData()),
                    Pointer.to(tensor2.getGpuData()));
            final int status = JCudaDriver.cuLaunchKernel(this.atan_back_gpu_function,
                    CAFFE_GET_BLOCKS(count), 1, 1,
                    this.CAFFE_CUDA_NUM_THREADS, 1, 1,
                    0, null, kernelArgs, null);
            checkCUDA(status);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Launches one of two broadcast kernels, both sized by {@code tensor2.getDataLength()}.
     *
     * <p>For {@code i == 0} the kernel held in {@code broadcast_gpu_function} receives the
     * element count and the device data of {@code tensor} and {@code tensor2}. For any other
     * {@code i} the kernel held in {@code broadcast_channel_gpu_function} additionally receives
     * {@code tensor2}'s channel, height and width.
     * A non-zero launch status is reported by {@link #checkCUDA(int)}; any exception is
     * caught and its stack trace printed.
     *
     * @param tensor  tensor whose device data is the second kernel argument
     * @param tensor2 tensor that sizes the launch and supplies the dimensions
     * @param i       mode selector; 0 selects the plain kernel, anything else the channel kernel
     */
    public void broadcast_gpu(Tensor tensor, Tensor tensor2, int i) {
        try {
            final int count = tensor2.getDataLength();
            if (i != 0) {
                final Pointer channelArgs = Pointer.to(
                        Pointer.to(new int[]{count}),
                        Pointer.to(tensor.getGpuData()),
                        Pointer.to(tensor2.getGpuData()),
                        Pointer.to(new int[]{tensor2.channel}),
                        Pointer.to(new int[]{tensor2.height}),
                        Pointer.to(new int[]{tensor2.width}));
                checkCUDA(JCudaDriver.cuLaunchKernel(this.broadcast_channel_gpu_function,
                        CAFFE_GET_BLOCKS(count), 1, 1,
                        this.CAFFE_CUDA_NUM_THREADS, 1, 1,
                        0, null, channelArgs, null));
                return;
            }
            final Pointer plainArgs = Pointer.to(
                    Pointer.to(new int[]{count}),
                    Pointer.to(tensor.getGpuData()),
                    Pointer.to(tensor2.getGpuData()));
            checkCUDA(JCudaDriver.cuLaunchKernel(this.broadcast_gpu_function,
                    CAFFE_GET_BLOCKS(count), 1, 1,
                    this.CAFFE_CUDA_NUM_THREADS, 1, 1,
                    0, null, plainArgs, null));
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Launches one of two "broadcast plus" kernels, both sized by
     * {@code tensor2.getDataLength()}.
     *
     * <p>For {@code i == 0} the kernel held in {@code broadcast_plus_gpu_function} receives the
     * element count and the device data of {@code tensor} and {@code tensor2}. For any other
     * {@code i} the kernel held in {@code broadcast_channel_plus_gpu_function} additionally
     * receives {@code tensor2}'s channel, height and width.
     * A non-zero launch status is reported by {@link #checkCUDA(int)}; any exception is
     * caught and its stack trace printed.
     *
     * @param tensor  tensor whose device data is the second kernel argument
     * @param tensor2 tensor that sizes the launch and supplies the dimensions
     * @param i       mode selector; 0 selects the plain kernel, anything else the channel kernel
     */
    public void broadcast_plus_gpu(Tensor tensor, Tensor tensor2, int i) {
        try {
            final int count = tensor2.getDataLength();
            if (i != 0) {
                final Pointer channelArgs = Pointer.to(
                        Pointer.to(new int[]{count}),
                        Pointer.to(tensor.getGpuData()),
                        Pointer.to(tensor2.getGpuData()),
                        Pointer.to(new int[]{tensor2.channel}),
                        Pointer.to(new int[]{tensor2.height}),
                        Pointer.to(new int[]{tensor2.width}));
                checkCUDA(JCudaDriver.cuLaunchKernel(this.broadcast_channel_plus_gpu_function,
                        CAFFE_GET_BLOCKS(count), 1, 1,
                        this.CAFFE_CUDA_NUM_THREADS, 1, 1,
                        0, null, channelArgs, null));
                return;
            }
            final Pointer plainArgs = Pointer.to(
                    Pointer.to(new int[]{count}),
                    Pointer.to(tensor.getGpuData()),
                    Pointer.to(tensor2.getGpuData()));
            checkCUDA(JCudaDriver.cuLaunchKernel(this.broadcast_plus_gpu_function,
                    CAFFE_GET_BLOCKS(count), 1, 1,
                    this.CAFFE_CUDA_NUM_THREADS, 1, 1,
                    0, null, plainArgs, null));
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Launches the kernel held in {@code clamp_gpu_function} over
     * {@code tensor2.getDataLength()} elements.
     *
     * <p>Kernel arguments, in order: element count of {@code tensor2}, device data of
     * {@code tensor}, scalar {@code f}, scalar {@code f2}, device data of {@code tensor2}.
     * The two scalars are presumably the lower and upper clamp bounds; the kernel source is not
     * visible here.
     * A non-zero launch status is reported by {@link #checkCUDA(int)}; any exception is
     * caught and its stack trace printed.
     *
     * @param tensor  tensor whose device data is the second kernel argument
     * @param f       first scalar passed by value
     * @param f2      second scalar passed by value
     * @param tensor2 tensor that sizes the launch; its device data is the last kernel argument
     */
    public void clamp_gpu(Tensor tensor, float f, float f2, Tensor tensor2) {
        try {
            final int count = tensor2.getDataLength();
            final Pointer kernelArgs = Pointer.to(
                    Pointer.to(new int[]{count}),
                    Pointer.to(tensor.getGpuData()),
                    Pointer.to(new float[]{f}),
                    Pointer.to(new float[]{f2}),
                    Pointer.to(tensor2.getGpuData()));
            final int status = JCudaDriver.cuLaunchKernel(this.clamp_gpu_function,
                    CAFFE_GET_BLOCKS(count), 1, 1,
                    this.CAFFE_CUDA_NUM_THREADS, 1, 1,
                    0, null, kernelArgs, null);
            checkCUDA(status);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Launches the kernel held in {@code clamp_back_gpu_function} over
     * {@code tensor2.getDataLength()} elements.
     *
     * <p>Kernel arguments, in order: element count of {@code tensor2}, device data of
     * {@code tensor}, scalar {@code f}, scalar {@code f2}, device data of {@code tensor2}.
     * A non-zero launch status is reported by {@link #checkCUDA(int)}; any exception is
     * caught and its stack trace printed.
     *
     * @param tensor  tensor whose device data is the second kernel argument
     * @param f       first scalar passed by value
     * @param f2      second scalar passed by value
     * @param tensor2 tensor that sizes the launch; its device data is the last kernel argument
     */
    public void clamp_back_gpu(Tensor tensor, float f, float f2, Tensor tensor2) {
        try {
            final int count = tensor2.getDataLength();
            final Pointer kernelArgs = Pointer.to(
                    Pointer.to(new int[]{count}),
                    Pointer.to(tensor.getGpuData()),
                    Pointer.to(new float[]{f}),
                    Pointer.to(new float[]{f2}),
                    Pointer.to(tensor2.getGpuData()));
            final int status = JCudaDriver.cuLaunchKernel(this.clamp_back_gpu_function,
                    CAFFE_GET_BLOCKS(count), 1, 1,
                    this.CAFFE_CUDA_NUM_THREADS, 1, 1,
                    0, null, kernelArgs, null);
            checkCUDA(status);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Launches the kernel held in {@code maximum_gpu_function} over
     * {@code tensor3.getDataLength()} elements.
     *
     * <p>Kernel arguments, in order: element count of {@code tensor3}, then the device data of
     * {@code tensor}, {@code tensor2} and {@code tensor3}.
     * A non-zero launch status is reported by {@link #checkCUDA(int)}; any exception is
     * caught and its stack trace printed.
     *
     * @param tensor  first device buffer passed to the kernel
     * @param tensor2 second device buffer passed to the kernel
     * @param tensor3 tensor that sizes the launch; its device data is the last kernel argument
     */
    public void maximum_gpu(Tensor tensor, Tensor tensor2, Tensor tensor3) {
        try {
            final int count = tensor3.getDataLength();
            final Pointer kernelArgs = Pointer.to(
                    Pointer.to(new int[]{count}),
                    Pointer.to(tensor.getGpuData()),
                    Pointer.to(tensor2.getGpuData()),
                    Pointer.to(tensor3.getGpuData()));
            final int status = JCudaDriver.cuLaunchKernel(this.maximum_gpu_function,
                    CAFFE_GET_BLOCKS(count), 1, 1,
                    this.CAFFE_CUDA_NUM_THREADS, 1, 1,
                    0, null, kernelArgs, null);
            checkCUDA(status);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Launches the kernel held in {@code minimum_gpu_function} over
     * {@code tensor3.getDataLength()} elements.
     *
     * <p>Kernel arguments, in order: element count of {@code tensor3}, then the device data of
     * {@code tensor}, {@code tensor2} and {@code tensor3}.
     * A non-zero launch status is reported by {@link #checkCUDA(int)}; any exception is
     * caught and its stack trace printed.
     *
     * @param tensor  first device buffer passed to the kernel
     * @param tensor2 second device buffer passed to the kernel
     * @param tensor3 tensor that sizes the launch; its device data is the last kernel argument
     */
    public void minimum_gpu(Tensor tensor, Tensor tensor2, Tensor tensor3) {
        try {
            final int count = tensor3.getDataLength();
            final Pointer kernelArgs = Pointer.to(
                    Pointer.to(new int[]{count}),
                    Pointer.to(tensor.getGpuData()),
                    Pointer.to(tensor2.getGpuData()),
                    Pointer.to(tensor3.getGpuData()));
            final int status = JCudaDriver.cuLaunchKernel(this.minimum_gpu_function,
                    CAFFE_GET_BLOCKS(count), 1, 1,
                    this.CAFFE_CUDA_NUM_THREADS, 1, 1,
                    0, null, kernelArgs, null);
            checkCUDA(status);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Launches the kernel held in {@code maximum_back_gpu_function} over
     * {@code tensor3.getDataLength()} elements.
     *
     * <p>Kernel arguments, in order: element count of {@code tensor3}, then the device data of
     * {@code tensor}, {@code tensor2} and {@code tensor3}.
     * A non-zero launch status is reported by {@link #checkCUDA(int)}; any exception is
     * caught and its stack trace printed.
     *
     * @param tensor  first device buffer passed to the kernel
     * @param tensor2 second device buffer passed to the kernel
     * @param tensor3 tensor that sizes the launch; its device data is the last kernel argument
     */
    public void maximum_back_gpu(Tensor tensor, Tensor tensor2, Tensor tensor3) {
        try {
            final int count = tensor3.getDataLength();
            final Pointer kernelArgs = Pointer.to(
                    Pointer.to(new int[]{count}),
                    Pointer.to(tensor.getGpuData()),
                    Pointer.to(tensor2.getGpuData()),
                    Pointer.to(tensor3.getGpuData()));
            final int status = JCudaDriver.cuLaunchKernel(this.maximum_back_gpu_function,
                    CAFFE_GET_BLOCKS(count), 1, 1,
                    this.CAFFE_CUDA_NUM_THREADS, 1, 1,
                    0, null, kernelArgs, null);
            checkCUDA(status);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Launches the kernel held in {@code transpose_gpu_function} over
     * {@code tensor2.getDataLength()} elements.
     *
     * <p>Kernel arguments, in order: element count of {@code tensor2}, device data of
     * {@code tensor}, device data of {@code tensor2}, {@code tensor.number},
     * {@code tensor.width}. The last two are presumably the row and column counts of the
     * source matrix; the kernel source is not visible here.
     * A non-zero launch status is reported by {@link #checkCUDA(int)}; any exception is
     * caught and its stack trace printed.
     *
     * @param tensor  source tensor; supplies {@code number} and {@code width}
     * @param tensor2 tensor that sizes the launch; its device data is the third kernel argument
     */
    public void transpose_gpu(Tensor tensor, Tensor tensor2) {
        try {
            final int count = tensor2.getDataLength();
            final Pointer kernelArgs = Pointer.to(
                    Pointer.to(new int[]{count}),
                    Pointer.to(tensor.getGpuData()),
                    Pointer.to(tensor2.getGpuData()),
                    Pointer.to(new int[]{tensor.number}),
                    Pointer.to(new int[]{tensor.width}));
            final int status = JCudaDriver.cuLaunchKernel(this.transpose_gpu_function,
                    CAFFE_GET_BLOCKS(count), 1, 1,
                    this.CAFFE_CUDA_NUM_THREADS, 1, 1,
                    0, null, kernelArgs, null);
            checkCUDA(status);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Launches the kernel held in {@code permute_gpu_function} over
     * {@code tensor.getDataLength()} elements.
     *
     * <p>Three temporary device buffers are obtained from {@code CUDAMemoryManager} and filled
     * from host arrays: the permutation {@code iArr}, the strides of {@code tensor.shape()} and
     * the strides of {@code tensor2.shape()} (both computed by {@link #getStrides(int[])}).
     * Each copy transfers {@code iArr.length * 4} bytes.
     * Kernel arguments, in order: element count of {@code tensor}, device data of
     * {@code tensor}, device data of {@code tensor2}, the three temporary buffers, and
     * {@code iArr.length}.
     *
     * <p>The temporary buffers are released in a {@code finally} block so they are not leaked
     * when a copy or the launch throws. Non-zero status codes from the copies and from the
     * launch are reported by {@link #checkCUDA(int)}; any exception is caught and its stack
     * trace printed.
     *
     * @param tensor  source tensor; sizes the launch
     * @param tensor2 destination tensor (presumably already shaped as the permuted result)
     * @param iArr    axis permutation; its length is passed to the kernel as the rank
     */
    public void permute_gpu(Tensor tensor, Tensor tensor2, int[] iArr) {
        try {
            NativePointerObject permuteDev = null;
            NativePointerObject inStridesDev = null;
            NativePointerObject outStridesDev = null;
            try {
                int[] strides = getStrides(tensor.shape());
                int[] strides2 = getStrides(tensor2.shape());
                final int rank = iArr.length;
                final int byteCount = rank * 4;
                // Copy kind 1 is cudaMemcpyKind.cudaMemcpyHostToDevice.
                permuteDev = CUDAMemoryManager.getPointer(rank);
                checkCUDA(JCuda.cudaMemcpy(permuteDev, Pointer.to(iArr), byteCount, 1));
                inStridesDev = CUDAMemoryManager.getPointer(rank);
                checkCUDA(JCuda.cudaMemcpy(inStridesDev, Pointer.to(strides), byteCount, 1));
                outStridesDev = CUDAMemoryManager.getPointer(rank);
                checkCUDA(JCuda.cudaMemcpy(outStridesDev, Pointer.to(strides2), byteCount, 1));
                checkCUDA(JCudaDriver.cuLaunchKernel(this.permute_gpu_function, CAFFE_GET_BLOCKS(tensor.getDataLength()), 1, 1, this.CAFFE_CUDA_NUM_THREADS, 1, 1, 0, (CUstream) null, Pointer.to(new NativePointerObject[]{Pointer.to(new int[]{tensor.getDataLength()}), Pointer.to(new NativePointerObject[]{tensor.getGpuData()}), Pointer.to(new NativePointerObject[]{tensor2.getGpuData()}), Pointer.to(new NativePointerObject[]{permuteDev}), Pointer.to(new NativePointerObject[]{inStridesDev}), Pointer.to(new NativePointerObject[]{outStridesDev}), Pointer.to(new int[]{rank})}), (Pointer) null));
            } finally {
                // Release whatever was allocated, even when a step above failed part-way.
                if (permuteDev != null) {
                    CUDAMemoryManager.free(permuteDev);
                }
                if (inStridesDev != null) {
                    CUDAMemoryManager.free(inStridesDev);
                }
                if (outStridesDev != null) {
                    CUDAMemoryManager.free(outStridesDev);
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Reorders the entries of {@code iArr} according to the index list {@code iArr2}.
     *
     * <p>The result has the same length as {@code iArr}; position {@code k} receives
     * {@code iArr[iArr2[k]]} for every {@code k} below {@code iArr2.length}. Positions beyond
     * {@code iArr2.length} keep the default value 0.
     *
     * @param iArr  source values (for example a shape)
     * @param iArr2 indices into {@code iArr}, one per output position
     * @return a new array holding the reordered values
     * @throws ArrayIndexOutOfBoundsException if {@code iArr2} is longer than {@code iArr} or
     *         contains an index outside {@code iArr}
     */
    public int[] dim_out(int[] iArr, int[] iArr2) {
        final int[] reordered = new int[iArr.length];
        int slot = 0;
        for (int sourceIndex : iArr2) {
            reordered[slot++] = iArr[sourceIndex];
        }
        return reordered;
    }

    /**
     * Computes strides for the given shape, with the last dimension varying fastest.
     *
     * <p>The last entry is 1 and each earlier entry is the product of all later dimension
     * sizes, e.g. shape {@code {2, 3, 4}} yields {@code {12, 4, 1}}. An empty shape yields an
     * empty array.
     *
     * @param iArr dimension sizes
     * @return a new array of strides, same length as {@code iArr}
     */
    public int[] getStrides(int[] iArr) {
        final int rank = iArr.length;
        final int[] strides = new int[rank];
        // Walk from the innermost dimension outwards, carrying the product of sizes seen so far.
        int running = 1;
        for (int dim = rank - 1; dim >= 0; dim--) {
            strides[dim] = running;
            running *= iArr[dim];
        }
        return strides;
    }

    /**
     * Launches the kernel held in {@code minimum_back_gpu_function} over
     * {@code tensor3.getDataLength()} elements.
     *
     * <p>Kernel arguments, in order: element count of {@code tensor3}, then the device data of
     * {@code tensor}, {@code tensor2} and {@code tensor3}.
     * A non-zero launch status is reported by {@link #checkCUDA(int)}; any exception is
     * caught and its stack trace printed.
     *
     * @param tensor  first device buffer passed to the kernel
     * @param tensor2 second device buffer passed to the kernel
     * @param tensor3 tensor that sizes the launch; its device data is the last kernel argument
     */
    public void minimum_back_gpu(Tensor tensor, Tensor tensor2, Tensor tensor3) {
        try {
            final int count = tensor3.getDataLength();
            final Pointer kernelArgs = Pointer.to(
                    Pointer.to(new int[]{count}),
                    Pointer.to(tensor.getGpuData()),
                    Pointer.to(tensor2.getGpuData()),
                    Pointer.to(tensor3.getGpuData()));
            final int status = JCudaDriver.cuLaunchKernel(this.minimum_back_gpu_function,
                    CAFFE_GET_BLOCKS(count), 1, 1,
                    this.CAFFE_CUDA_NUM_THREADS, 1, 1,
                    0, null, kernelArgs, null);
            checkCUDA(status);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Computes a mean as a sum followed by an in-place scalar division.
     *
     * <p>{@link #sum_gpu(Tensor, Tensor, int)} writes into {@code tensor2}, which is then
     * divided in place via {@code div_scalar_gpu}. The divisor is {@code tensor.channel} when
     * {@code i == 1} and {@code tensor.number} for every other value of {@code i}.
     * NOTE(review): for {@code i == 0} the sum kernel is handed {@code tensor.dataLength}
     * elements while the divisor is {@code tensor.number}; confirm this is the intended mean.
     * Any exception is caught and its stack trace printed.
     *
     * @param tensor  source tensor
     * @param i       mode selector forwarded to {@code sum_gpu}
     * @param tensor2 receives the sum and then the divided result
     */
    public void mean_gpu(Tensor tensor, int i, Tensor tensor2) {
        try {
            final int divisor = (i == 1) ? tensor.channel : tensor.number;
            sum_gpu(tensor, tensor2, i);
            div_scalar_gpu(tensor2, divisor, tensor2);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Returns how many blocks of {@code CAFFE_CUDA_NUM_THREADS} threads are needed to cover
     * {@code i} elements, i.e. {@code i / CAFFE_CUDA_NUM_THREADS} rounded up for positive
     * {@code i}.
     *
     * @param i number of elements to cover
     * @return the block count for a one-dimensional grid
     */
    public int CAFFE_GET_BLOCKS(int i) {
        final int threadsPerBlock = this.CAFFE_CUDA_NUM_THREADS;
        final int roundedUp = (i + threadsPerBlock) - 1;
        return roundedUp / threadsPerBlock;
    }

    /**
     * Reports a non-zero CUDA status code on standard error.
     *
     * <p>A status of 0 is treated as success and produces no output. Any other value is
     * printed with the text returned by {@code cudaError.stringFor}. Nothing is thrown, so
     * callers continue after a failed call.
     *
     * @param i status code returned by a CUDA call
     */
    public void checkCUDA(int i) {
        if (i == 0) {
            return;
        }
        final String description = cudaError.stringFor(i);
        System.err.println("Error code " + i + ":" + description);
    }
}
