| static void | LibMatrixCUDA. abs(ExecutionContext ec,
   GPUContext gCtx,
   String instName,
   MatrixObject in1,
   String outputName) | Performs an "abs" operation on a matrix on the GPU | 
| static void | LibMatrixCUDA. acos(ExecutionContext ec,
    GPUContext gCtx,
    String instName,
    MatrixObject in1,
    String outputName) | Performs an "acos" operation on a matrix on the GPU | 
| static void | LibMatrixCUDA. asin(ExecutionContext ec,
    GPUContext gCtx,
    String instName,
    MatrixObject in1,
    String outputName) | Performs an "asin" operation on a matrix on the GPU | 
| static void | LibMatrixCUDA. atan(ExecutionContext ec,
    GPUContext gCtx,
    String instName,
    MatrixObject in1,
    String outputName) | Performs an "atan" operation on a matrix on the GPU | 
| static void | LibMatrixCUDA. axpy(ExecutionContext ec,
    GPUContext gCtx,
    String instName,
    MatrixObject in1,
    MatrixObject in2,
    String outputName,
    double constant) | Performs daxpy operation | 
| static void | LibMatrixCuDNN. batchNormalizationBackward(GPUContext gCtx,
                          String instName,
                          MatrixObject image,
                          MatrixObject dout,
                          MatrixObject scale,
                          MatrixObject dX,
                          MatrixObject dScale,
                          MatrixObject dBias,
                          double epsilon,
                          MatrixObject resultSaveMean,
                          MatrixObject resultSaveInvVariance) | This method computes the backpropagation errors for image, scale and bias of batch normalization layer | 
| static void | LibMatrixCuDNN. batchNormalizationForwardInference(GPUContext gCtx,
                                  String instName,
                                  MatrixObject image,
                                  MatrixObject scale,
                                  MatrixObject bias,
                                  MatrixObject runningMean,
                                  MatrixObject runningVar,
                                  MatrixObject ret,
                                  double epsilon) | Performs the forward BatchNormalization layer computation for inference | 
| static void | LibMatrixCuDNN. batchNormalizationForwardTraining(GPUContext gCtx,
                                 String instName,
                                 MatrixObject image,
                                 MatrixObject scale,
                                 MatrixObject bias,
                                 MatrixObject runningMean,
                                 MatrixObject runningVar,
                                 MatrixObject ret,
                                 MatrixObject retRunningMean,
                                 MatrixObject retRunningVar,
                                 double epsilon,
                                 double exponentialAverageFactor,
                                 MatrixObject resultSaveMean,
                                 MatrixObject resultSaveInvVariance) | Performs the forward BatchNormalization layer computation for training | 
| static void | LibMatrixCUDA. biasAdd(GPUContext gCtx,
       String instName,
       MatrixObject input,
       MatrixObject bias,
       MatrixObject outputBlock) | Performs the operation corresponding to the DML script:
 ones = matrix(1, rows=1, cols=Hout*Wout)
 output = input + matrix(bias %*% ones, rows=1, cols=F*Hout*Wout)
 This operation often follows conv2d, and hence we have introduced the bias_add(input, bias) built-in function | 
| static void | LibMatrixCUDA. biasMultiply(GPUContext gCtx,
            String instName,
            MatrixObject input,
            MatrixObject bias,
            MatrixObject outputBlock) | Performs the operation corresponding to the DML script:
 ones = matrix(1, rows=1, cols=Hout*Wout)
 output = input * matrix(bias %*% ones, rows=1, cols=F*Hout*Wout)
 This operation is often followed by conv2d and hence we have introduced the bias_multiply(input, bias) built-in function | 
| static void | LibMatrixCUDA. cbind(ExecutionContext ec,
     GPUContext gCtx,
     String instName,
     MatrixObject in1,
     MatrixObject in2,
     String outputName) |  | 
| static void | LibMatrixCUDA. ceil(ExecutionContext ec,
    GPUContext gCtx,
    String instName,
    MatrixObject in1,
    String outputName) | Performs a "ceil" operation on a matrix on the GPU | 
| static void | LibMatrixCUDA. channelSums(GPUContext gCtx,
           String instName,
           MatrixObject input,
           MatrixObject outputBlock,
           long C,
           long HW) | Perform channel_sums operations: out = rowSums(matrix(colSums(A), rows=C, cols=HW)) | 
| static int | LibMatrixCUDA. computeNNZ(GPUContext gCtx,
          jcuda.Pointer densePtr,
          int length) | Utility to compute number of non-zeroes on the GPU | 
| static void | LibMatrixCuDNN. conv2d(GPUContext gCtx,
      String instName,
      MatrixObject image,
      MatrixObject filter,
      MatrixObject outputBlock,
      int N,
      int C,
      int H,
      int W,
      int K,
      int R,
      int S,
      int pad_h,
      int pad_w,
      int stride_h,
      int stride_w,
      int P,
      int Q,
      double intermediateMemoryBudget) | Performs a 2D convolution | 
| static void | LibMatrixCuDNN. conv2dBackwardData(GPUContext gCtx,
                  String instName,
                  MatrixObject filter,
                  MatrixObject dout,
                  MatrixObject output,
                  int N,
                  int C,
                  int H,
                  int W,
                  int K,
                  int R,
                  int S,
                  int pad_h,
                  int pad_w,
                  int stride_h,
                  int stride_w,
                  int P,
                  int Q,
                  double intermediateMemoryBudget) | This method computes the backpropagation errors for previous layer of convolution operation | 
| static void | LibMatrixCuDNN. conv2dBackwardFilter(GPUContext gCtx,
                    String instName,
                    MatrixObject image,
                    MatrixObject dout,
                    MatrixObject outputBlock,
                    int N,
                    int C,
                    int H,
                    int W,
                    int K,
                    int R,
                    int S,
                    int pad_h,
                    int pad_w,
                    int stride_h,
                    int stride_w,
                    int P,
                    int Q,
                    double intermediateMemoryBudget) | This method computes the backpropagation errors for filter of convolution operation | 
| static void | LibMatrixCuDNN. conv2dBiasAdd(GPUContext gCtx,
             String instName,
             MatrixObject image,
             MatrixObject bias,
             MatrixObject filter,
             MatrixObject output,
             int N,
             int C,
             int H,
             int W,
             int K,
             int R,
             int S,
             int pad_h,
             int pad_w,
             int stride_h,
             int stride_w,
             int P,
             int Q,
             double intermediateMemoryBudget) | Does a 2D convolution followed by a bias_add | 
| static void | LibMatrixCUDA. cos(ExecutionContext ec,
   GPUContext gCtx,
   String instName,
   MatrixObject in1,
   String outputName) | Performs a "cos" operation on a matrix on the GPU | 
| static void | LibMatrixCUDA. cosh(ExecutionContext ec,
    GPUContext gCtx,
    String instName,
    MatrixObject in1,
    String outputName) | Performs a "cosh" operation on a matrix on the GPU | 
| static LibMatrixCuDNNConvolutionAlgorithm | LibMatrixCuDNNConvolutionAlgorithm. cudnnGetConvolutionBackwardDataAlgorithm(GPUContext gCtx,
                                        String instName,
                                        int N,
                                        int C,
                                        int H,
                                        int W,
                                        int K,
                                        int R,
                                        int S,
                                        int pad_h,
                                        int pad_w,
                                        int stride_h,
                                        int stride_w,
                                        int P,
                                        int Q,
                                        long workspaceLimit) | Factory method to get the algorithm wrapper for convolution backward data | 
| static LibMatrixCuDNNConvolutionAlgorithm | LibMatrixCuDNNConvolutionAlgorithm. cudnnGetConvolutionBackwardFilterAlgorithm(GPUContext gCtx,
                                          String instName,
                                          int N,
                                          int C,
                                          int H,
                                          int W,
                                          int K,
                                          int R,
                                          int S,
                                          int pad_h,
                                          int pad_w,
                                          int stride_h,
                                          int stride_w,
                                          int P,
                                          int Q,
                                          long workspaceLimit) | Factory method to get the algorithm wrapper for convolution backward filter | 
| static LibMatrixCuDNNConvolutionAlgorithm | LibMatrixCuDNNConvolutionAlgorithm. cudnnGetConvolutionForwardAlgorithm(GPUContext gCtx,
                                   String instName,
                                   int N,
                                   int C,
                                   int H,
                                   int W,
                                   int K,
                                   int R,
                                   int S,
                                   int pad_h,
                                   int pad_w,
                                   int stride_h,
                                   int stride_w,
                                   int P,
                                   int Q,
                                   long workspaceLimit) | Factory method to get the algorithm wrapper for convolution forward | 
| static LibMatrixCuDNNPoolingDescriptors | LibMatrixCuDNNPoolingDescriptors. cudnnPoolingBackwardDescriptors(GPUContext gCtx,
                               String instName,
                               int N,
                               int C,
                               int H,
                               int W,
                               int K,
                               int R,
                               int S,
                               int pad_h,
                               int pad_w,
                               int stride_h,
                               int stride_w,
                               int P,
                               int Q,
                               LibMatrixDNN.PoolingType poolingType) | Get descriptors for maxpooling backward operation | 
| static LibMatrixCuDNNPoolingDescriptors | LibMatrixCuDNNPoolingDescriptors. cudnnPoolingDescriptors(GPUContext gCtx,
                       String instName,
                       int N,
                       int C,
                       int H,
                       int W,
                       int K,
                       int R,
                       int S,
                       int pad_h,
                       int pad_w,
                       int stride_h,
                       int stride_w,
                       int P,
                       int Q,
                       LibMatrixDNN.PoolingType poolingType) | Get descriptors for maxpooling operation | 
| static void | LibMatrixCUDA. cumulativeScan(ExecutionContext ec,
              GPUContext gCtx,
              String instName,
              String kernelFunction,
              MatrixObject in,
              String outputName) | Cumulative scan | 
| static void | LibMatrixCUDA. cumulativeSumProduct(ExecutionContext ec,
                    GPUContext gCtx,
                    String instName,
                    String kernelFunction,
                    MatrixObject in,
                    String outputName) | Cumulative sum-product kernel cascade invocation | 
| static void | LibMatrixCUDA. denseTranspose(ExecutionContext ec,
              GPUContext gCtx,
              String instName,
              jcuda.Pointer A,
              jcuda.Pointer C,
              long numRowsA,
              long numColsA) | Computes C = t(A) | 
| void | CudaSupportFunctions. deviceToHost(GPUContext gCtx,
            jcuda.Pointer src,
            double[] dest,
            String instName,
            boolean isEviction) |  | 
| void | DoublePrecisionCudaSupportFunctions. deviceToHost(GPUContext gCtx,
            jcuda.Pointer src,
            double[] dest,
            String instName,
            boolean isEviction) |  | 
| void | SinglePrecisionCudaSupportFunctions. deviceToHost(GPUContext gCtx,
            jcuda.Pointer src,
            double[] dest,
            String instName,
            boolean isEviction) |  | 
| static jcuda.Pointer | LibMatrixCUDA. double2float(GPUContext gCtx,
            jcuda.Pointer A,
            jcuda.Pointer ret,
            int numElems) |  | 
| static void | LibMatrixCUDA. exp(ExecutionContext ec,
   GPUContext gCtx,
   String instName,
   MatrixObject in1,
   String outputName) | Performs an "exp" operation on a matrix on the GPU | 
| static jcuda.Pointer | LibMatrixCUDA. float2double(GPUContext gCtx,
            jcuda.Pointer A,
            jcuda.Pointer ret,
            int numElems) |  | 
| static void | LibMatrixCUDA. floor(ExecutionContext ec,
     GPUContext gCtx,
     String instName,
     MatrixObject in1,
     String outputName) | Performs a "floor" operation on a matrix on the GPU | 
| static JCudaKernels | LibMatrixCUDA. getCudaKernels(GPUContext gCtx) |  | 
| static jcuda.Pointer | LibMatrixCUDA. getDensePointer(GPUContext gCtx,
               MatrixObject input,
               String instName) | Convenience method to get jcudaDenseMatrixPtr. | 
| static jcuda.Pointer | LibMatrixCuDNN. getDensePointerForCuDNN(GPUContext gCtx,
                       MatrixObject image,
                       String instName,
                       int numRows,
                       int numCols) | Convenience method to get jcudaDenseMatrixPtr. | 
| static long | LibMatrixCUDA. getNnz(GPUContext gCtx,
      String instName,
      MatrixObject mo,
      boolean recomputeDenseNNZ) | Note: if the matrix is in dense format, it explicitly re-computes the number of nonzeros. | 
| void | CudaSupportFunctions. hostToDevice(GPUContext gCtx,
            double[] src,
            jcuda.Pointer dest,
            String instName) |  | 
| void | DoublePrecisionCudaSupportFunctions. hostToDevice(GPUContext gCtx,
            double[] src,
            jcuda.Pointer dest,
            String instName) |  | 
| void | SinglePrecisionCudaSupportFunctions. hostToDevice(GPUContext gCtx,
            double[] src,
            jcuda.Pointer dest,
            String instName) |  | 
| static boolean | LibMatrixCUDA. isInSparseFormat(GPUContext gCtx,
                MatrixObject mo) |  | 
| static void | LibMatrixCUDA. log(ExecutionContext ec,
   GPUContext gCtx,
   String instName,
   MatrixObject in1,
   String outputName) | Performs a "log" operation on a matrix on the GPU | 
| static void | LibMatrixCuDNN. lstm(ExecutionContext ec,
    GPUContext gCtx,
    String instName,
    jcuda.Pointer X,
    jcuda.Pointer wPointer,
    jcuda.Pointer out0,
    jcuda.Pointer c0,
    boolean return_sequences,
    String outputName,
    String cyName,
    int N,
    int M,
    int D,
    int T) | Computes the forward pass for an LSTM layer with M neurons. | 
| static void | LibMatrixCuDNN. lstmBackward(ExecutionContext ec,
            GPUContext gCtx,
            String instName,
            jcuda.Pointer x,
            jcuda.Pointer hx,
            jcuda.Pointer cx,
            jcuda.Pointer wPointer,
            String doutName,
            String dcyName,
            String dxName,
            String dwName,
            String dbName,
            String dhxName,
            String dcxName,
            boolean return_sequences,
            int N,
            int M,
            int D,
            int T) |  | 
| static MatrixObject | LibMatrixCuMatMult. matmult(ExecutionContext ec,
       GPUContext gCtx,
       String instName,
       MatrixObject left,
       MatrixObject right,
       String outputName,
       boolean isLeftTransposed,
       boolean isRightTransposed) | Matrix multiply on GPU. Examines sparsity and shapes and routes the call to
 the appropriate method from cuBLAS or cuSparse: C = op(A) x op(B).
 The user is expected to call
 ec.releaseMatrixOutputForGPUInstruction(outputName); | 
| static void | LibMatrixCUDA. matmultTSMM(ExecutionContext ec,
           GPUContext gCtx,
           String instName,
           MatrixObject left,
           String outputName,
           boolean isLeftTransposed) | Performs tsmm, A %*% A' or A' %*% A, on GPU by exploiting cublasDsyrk(...) | 
| static void | LibMatrixCUDA. matrixMatrixArithmetic(ExecutionContext ec,
                      GPUContext gCtx,
                      String instName,
                      MatrixObject in1,
                      MatrixObject in2,
                      String outputName,
                      boolean isLeftTransposed,
                      boolean isRightTransposed,
                      BinaryOperator op) | Performs elementwise arithmetic operation specified by op of two input matrices in1 and in2 | 
| static void | LibMatrixCUDA. matrixMatrixRelational(ExecutionContext ec,
                      GPUContext gCtx,
                      String instName,
                      MatrixObject in1,
                      MatrixObject in2,
                      String outputName,
                      BinaryOperator op) | Performs elementwise relational operation specified by op of two input matrices in1 and in2 | 
| static void | LibMatrixCUDA. matrixScalarArithmetic(ExecutionContext ec,
                      GPUContext gCtx,
                      String instName,
                      MatrixObject in,
                      String outputName,
                      boolean isInputTransposed,
                      ScalarOperator op) | Entry point to perform elementwise matrix-scalar arithmetic operation specified by op | 
| static void | LibMatrixCUDA. matrixScalarOp(ExecutionContext ec,
              GPUContext gCtx,
              String instName,
              MatrixObject in,
              String outputName,
              boolean isInputTransposed,
              ScalarOperator op) | Utility to do matrix-scalar operation kernel | 
| static void | LibMatrixCUDA. matrixScalarRelational(ExecutionContext ec,
                      GPUContext gCtx,
                      String instName,
                      MatrixObject in,
                      String outputName,
                      ScalarOperator op) | Entry point to perform elementwise matrix-scalar relational operation specified by op | 
| static void | LibMatrixCuDNN. pooling(GPUContext gCtx,
       String instName,
       MatrixObject image,
       MatrixObject outputBlock,
       int N,
       int C,
       int H,
       int W,
       int K,
       int R,
       int S,
       int pad_h,
       int pad_w,
       int stride_h,
       int stride_w,
       int P,
       int Q,
       LibMatrixDNN.PoolingType poolingType,
       double intermediateMemoryBudget) | Performs maxpooling on GPU by exploiting cudnnPoolingForward(...) | 
| static void | LibMatrixCuDNN. poolingBackward(GPUContext gCtx,
               String instName,
               MatrixObject image,
               MatrixObject dout,
               MatrixObject maxpoolOutput,
               MatrixObject outputBlock,
               int N,
               int C,
               int H,
               int W,
               int K,
               int R,
               int S,
               int pad_h,
               int pad_w,
               int stride_h,
               int stride_w,
               int P,
               int Q,
               LibMatrixDNN.PoolingType poolingType,
               double intermediateMemoryBudget) | Performs maxpoolingBackward on GPU by exploiting cudnnPoolingBackward(...)
 This method computes the backpropagation errors for previous layer of maxpooling operation | 
| static void | LibMatrixCUDA. rbind(ExecutionContext ec,
     GPUContext gCtx,
     String instName,
     MatrixObject in1,
     MatrixObject in2,
     String outputName) |  | 
| static void | LibMatrixCuDNN. relu(ExecutionContext ec,
    GPUContext gCtx,
    String instName,
    MatrixObject in,
    String outputName) | Performs the relu operation on the GPU. | 
| static void | LibMatrixCUDA. reluBackward(GPUContext gCtx,
            String instName,
            MatrixObject input,
            MatrixObject dout,
            MatrixObject outputBlock) | This method computes the backpropagation errors for previous layer of relu operation | 
| static void | LibMatrixCUDA. round(ExecutionContext ec,
     GPUContext gCtx,
     String instName,
     MatrixObject in1,
     String outputName) | Performs a "round" operation on a matrix on the GPU | 
| static void | LibMatrixCUDA. sigmoid(ExecutionContext ec,
       GPUContext gCtx,
       String instName,
       MatrixObject in1,
       String outputName) | Performs a "sigmoid" operation on a matrix on the GPU | 
| static void | LibMatrixCUDA. sign(ExecutionContext ec,
    GPUContext gCtx,
    String instName,
    MatrixObject in1,
    String outputName) | Performs a "sign" operation on a matrix on the GPU | 
| static void | LibMatrixCUDA. sin(ExecutionContext ec,
   GPUContext gCtx,
   String instName,
   MatrixObject in1,
   String outputName) | Performs a "sin" operation on a matrix on the GPU | 
| static void | LibMatrixCUDA. sinh(ExecutionContext ec,
    GPUContext gCtx,
    String instName,
    MatrixObject in1,
    String outputName) | Performs a "sinh" operation on a matrix on the GPU | 
| static void | LibMatrixCUDA. sliceOperations(ExecutionContext ec,
               GPUContext gCtx,
               String instName,
               MatrixObject in1,
               IndexRange ixrange,
               String outputName) | Method to perform rightIndex operation for a given lower and upper bounds in row and column dimensions. | 
| static void | LibMatrixCuDNN. softmax(ExecutionContext ec,
       GPUContext gCtx,
       String instName,
       MatrixObject in1,
       String outputName) | Performs a "softmax" operation on a matrix on the GPU | 
| static void | LibMatrixCUDA. solve(ExecutionContext ec,
     GPUContext gCtx,
     String instName,
     MatrixObject in1,
     MatrixObject in2,
     String outputName) | Implements the "solve" function for systemds Ax = B (A is of size m*n, B is of size m*1, x is of size n*1) | 
| static void | LibMatrixCUDA. sqrt(ExecutionContext ec,
    GPUContext gCtx,
    String instName,
    MatrixObject in1,
    String outputName) | Performs a "sqrt" operation on a matrix on the GPU | 
| static void | LibMatrixCUDA. tan(ExecutionContext ec,
   GPUContext gCtx,
   String instName,
   MatrixObject in1,
   String outputName) | Performs a "tan" operation on a matrix on the GPU | 
| static void | LibMatrixCUDA. tanh(ExecutionContext ec,
    GPUContext gCtx,
    String instName,
    MatrixObject in1,
    String outputName) | Performs a "tanh" operation on a matrix on the GPU | 
| static void | LibMatrixCUDA. transpose(ExecutionContext ec,
         GPUContext gCtx,
         String instName,
         MatrixObject in,
         String outputName) | Transposes the input matrix using cublasDgeam | 
| static void | LibMatrixCUDA. unaryAggregate(ExecutionContext ec,
              GPUContext gCtx,
              String instName,
              MatrixObject in1,
              String output,
              AggregateUnaryOperator op) | Entry point to perform Unary aggregate operations on the GPU. |