#ifndef GINCSVD_KERNELS_DEF_CUH_
#define GINCSVD_KERNELS_DEF_CUH_

#include <assert.h>
#include <stddef.h>
#include "gincsvd_kernels_decl.hpp"

namespace GINCSVD {
  namespace details {
    // Offset of element (row, col) in a matrix whose consecutive columns are
    // `ld` elements apart, i.e. col*ld + row. The arithmetic is carried out in
    // size_t so that col*ld cannot overflow a 32-bit int on large matrices.
    __device__ __forceinline__ size_t elementOffset(int row, int col, int ld) {
      return static_cast<size_t>(col) * static_cast<size_t>(ld)
           + static_cast<size_t>(row);
    }

    // In-place row scaling: A(i,j) = A(i,j) * diag[i] for 0 <= i,j < N.
    //
    // Expects a 2D launch in which x indexes i (rows) and y indexes j
    // (columns); threads that fall outside the N-by-N range do nothing.
    //   N    - order of the square region of A that is scaled
    //   A    - matrix, element (i,j) stored at A[j*LDA + i]
    //   LDA  - distance, in elements, between consecutive columns of A
    //   diag - scaling factors; entries 0..N-1 are read
    template <class T>
    __global__ void leftScaleKernel(int N, T *A, int LDA, const T* diag) {
      const int i = blockIdx.x*blockDim.x + threadIdx.x;
      const int j = blockIdx.y*blockDim.y + threadIdx.y;
      if (i < N && j < N) {
        // Offset is formed only for in-range threads and in 64-bit arithmetic.
        const size_t index = elementOffset(i, j, LDA);
        A[index] = A[index] * diag[i];
      }
    }

    // In-place column scaling: A(i,j) = A(i,j) * diag[j] for 0 <= i,j < N.
    //
    // Expects a 2D launch in which x indexes i (rows) and y indexes j
    // (columns); threads that fall outside the N-by-N range do nothing.
    //   N    - order of the square region of A that is scaled
    //   A    - matrix, element (i,j) stored at A[j*LDA + i]
    //   LDA  - distance, in elements, between consecutive columns of A
    //   diag - scaling factors; entries 0..N-1 are read
    template <class T>
    __global__ void rightScaleKernel(int N, T *A, int LDA, const T* diag) {
      const int i = blockIdx.x*blockDim.x + threadIdx.x;
      const int j = blockIdx.y*blockDim.y + threadIdx.y;
      if (i < N && j < N) {
        const size_t index = elementOffset(i, j, LDA);
        A[index] = A[index] * diag[j];
      }
    }

    // Writes the N-by-N region of B: entries with i <= j are copied from A,
    // entries with i > j are set to zero.
    //
    // Expects a 2D launch in which x indexes i and y indexes j; threads that
    // fall outside the N-by-N range do nothing. Each thread reads and writes
    // only its own (i,j) element.
    //   N   - order of the region that is written
    //   A   - source, element (i,j) stored at A[j*LDA + i]
    //   LDA - distance, in elements, between consecutive columns of A
    //   B   - destination, element (i,j) stored at B[j*LDB + i]
    //   LDB - distance, in elements, between consecutive columns of B
    template <class T>
    __global__ void copyUpperTriKernel(int N, const T *A, int LDA, T *B, int LDB) {
      const int i = blockIdx.x*blockDim.x + threadIdx.x;
      const int j = blockIdx.y*blockDim.y + threadIdx.y;
      if (i < N && j < N) {
        const size_t indexB = elementOffset(i, j, LDB);
        if (i <= j) B[indexB] = A[elementOffset(i, j, LDA)];
        else        B[indexB] = (T)0.0;
      }
    }

    // Overwrites the M-by-N region of A: A(i,i) = d[i] on the diagonal and
    // zero everywhere else.
    //
    // Expects a 2D launch in which x indexes i (0..M-1) and y indexes j
    // (0..N-1); threads that fall outside that range do nothing.
    //   M, N - extent of the region that is overwritten
    //   A    - matrix, element (i,j) stored at A[j*LDA + i]
    //   LDA  - distance, in elements, between consecutive columns of A
    //   d    - diagonal values; d[i] is read for every i < min(M, N)
    template <class T>
    __global__ void copyToDiagKernel(int M, int N, T *A, int LDA, const T *d) {
      const int i = blockIdx.x*blockDim.x + threadIdx.x;
      const int j = blockIdx.y*blockDim.y + threadIdx.y;
      if (i < M && j < N) {
        const size_t indexA = elementOffset(i, j, LDA);
        if (i == j) A[indexA] = d[i];
        else        A[indexA] = (T)0.0;
      }
    }
  }
}

// Launches leftScaleKernel over the N-by-N region of A, which multiplies
// every element A[j*LDA + i] by diag[i].
//
//   N    - order of the region to scale; N <= 0 is a no-op
//   A    - matrix pointer handed to the kernel
//   LDA  - distance, in elements, between consecutive columns of A
//   diag - scaling factors handed to the kernel (entries 0..N-1 are read)
//
// The kernel is launched without an explicit stream argument. No error check
// is performed here; callers that need one should query cudaGetLastError().
template <class T>
void GINCSVD::details::leftScale(int N, T *A, int LDA, const T* diag) {
  // A grid with a zero dimension is an invalid launch configuration, so an
  // empty problem must not reach the launch at all.
  if (N <= 0) return;

  const int blockSize = 16;
  // Integer ceil-div: exact for every positive int, unlike a float ceil().
  const unsigned int numBlocks = static_cast<unsigned int>((N - 1) / blockSize + 1);
  const dim3 dimBlock( blockSize, blockSize );
  const dim3 dimGrid( numBlocks, numBlocks );
  leftScaleKernel<<<dimGrid, dimBlock>>>(N,A,LDA,diag);
}

// Launches rightScaleKernel over the N-by-N region of A, which multiplies
// every element A[j*LDA + i] by diag[j].
//
//   N    - order of the region to scale; N <= 0 is a no-op
//   A    - matrix pointer handed to the kernel
//   LDA  - distance, in elements, between consecutive columns of A
//   diag - scaling factors handed to the kernel (entries 0..N-1 are read)
//
// The kernel is launched without an explicit stream argument. No error check
// is performed here; callers that need one should query cudaGetLastError().
template <class T>
void GINCSVD::details::rightScale(int N, T *A, int LDA, const T* diag) {
  // A grid with a zero dimension is an invalid launch configuration, so an
  // empty problem must not reach the launch at all.
  if (N <= 0) return;

  const int blockSize = 16;
  // Integer ceil-div: exact for every positive int, unlike a float ceil().
  const unsigned int numBlocks = static_cast<unsigned int>((N - 1) / blockSize + 1);
  const dim3 dimBlock( blockSize, blockSize );
  const dim3 dimGrid( numBlocks, numBlocks );
  rightScaleKernel<<<dimGrid, dimBlock>>>(N,A,LDA,diag);
}

// Enqueues copyUpperTriKernel on `stream`. Over the N-by-N region the kernel
// sets B[j*LDB + i] to A[j*LDA + i] where i <= j and to zero where i > j.
//
//   N      - order of the region to write; N <= 0 is a no-op
//   A, LDA - source pointer and the distance, in elements, between its columns
//   B, LDB - destination pointer and the distance, in elements, between its columns
//   stream - CUDA stream the kernel is launched on
//
// No error check or synchronization is performed here; callers that need one
// should query cudaGetLastError() and/or synchronize `stream`.
template <class T>
void GINCSVD::details::copyUpperTri(int N, const T *A, int LDA, T *B, int LDB, cudaStream_t stream) {
  // A grid with a zero dimension is an invalid launch configuration, so an
  // empty problem must not reach the launch at all.
  if (N <= 0) return;

  const int blockSize = 16;
  // Integer ceil-div: exact for every positive int, unlike a float ceil().
  const unsigned int numBlocks = static_cast<unsigned int>((N - 1) / blockSize + 1);
  const dim3 dimBlock( blockSize, blockSize );
  const dim3 dimGrid( numBlocks, numBlocks );
  copyUpperTriKernel<<<dimGrid, dimBlock, 0, stream>>>(N,A,LDA,B,LDB);
}

// Enqueues copyToDiagKernel on `stream`. Over the M-by-N region the kernel
// sets A[j*LDA + i] to D[i] where i == j and to zero everywhere else.
//
//   M, N   - extent of the region to overwrite (x covers M, y covers N);
//            the call is a no-op when either is <= 0
//   A, LDA - matrix pointer and the distance, in elements, between its columns
//   D      - diagonal values handed to the kernel (read for i < min(M, N))
//   stream - CUDA stream the kernel is launched on
//
// No error check or synchronization is performed here; callers that need one
// should query cudaGetLastError() and/or synchronize `stream`.
template <class T>
void GINCSVD::details::copyToDiag(int M, int N, T *A, int LDA, const T *D, cudaStream_t stream) {
  // A grid with a zero dimension is an invalid launch configuration, so an
  // empty problem must not reach the launch at all.
  if (M <= 0 || N <= 0) return;

  const int blockSize = 16;
  // Integer ceil-div: exact for every positive int. A float ceil() rounds
  // extents above 2^24 and could leave the last rows without a thread.
  const unsigned int blocksX = static_cast<unsigned int>((M - 1) / blockSize + 1);
  const unsigned int blocksY = static_cast<unsigned int>((N - 1) / blockSize + 1);
  const dim3 dimBlock( blockSize, blockSize );
  const dim3 dimGrid( blocksX, blocksY );
  copyToDiagKernel<<<dimGrid, dimBlock, 0, stream>>>(M,N,A,LDA,D);
}

#endif // GINCSVD_KERNELS_DEF_CUH_
