#include <iostream>
#include <gincsvd.hpp>

int main() {
  using std::cout;
  using std::endl;
  typedef double T;

  cout << "------------------------\n"
       << "----- CUBLAS Test ------\n"
       << "------------------------\n\n";

  const int M = 100000;
  const int N = 100;
  double *devA,*devB,*devC;
  double *hstA;
  cudaStream_t      stream1, stream2;
  GINCSVD_CUDA_CHECK( cudaStreamCreate(&stream1) );
  GINCSVD_CUDA_CHECK( cudaStreamCreate(&stream2) );
  cublasHandle_t blas_handle;
  GINCSVD_BLAS_CHECK( cublasCreate(&blas_handle) );
  GINCSVD_BLAS_CHECK( cublasSetStream(blas_handle, stream1) );
  GINCSVD_CUDA_CHECK( cudaHostAlloc(&hstA, sizeof(T)*M*N, cudaHostAllocWriteCombined) );
  GINCSVD_CUDA_CHECK( cudaMalloc(&devA, sizeof(T)*M*N) );
  GINCSVD_CUDA_CHECK( cudaMalloc(&devB, sizeof(T)*M*N) );
  GINCSVD_CUDA_CHECK( cudaMalloc(&devC, sizeof(T)*N*N) );

  GINCSVD_CUDA_CHECK( cudaDeviceSynchronize() );

  {
    cudaEvent_t copy1, copy2, gemm1, gemm2;
    GINCSVD_CUDA_CHECK( cudaEventCreate(&copy1) );
    GINCSVD_CUDA_CHECK( cudaEventCreate(&copy2) );
    GINCSVD_CUDA_CHECK( cudaEventCreate(&gemm1) );
    GINCSVD_CUDA_CHECK( cudaEventCreate(&gemm2) );

    GINCSVD_CUDA_CHECK( cudaEventRecord(gemm1,stream1) );
    GINCSVD_CUDA_CHECK( cudaEventRecord(copy1,stream2) );
    GINCSVD_BLAS_CHECK( GINCSVD::details::GEMM(blas_handle, CUBLAS_OP_T, CUBLAS_OP_N,
                                               N,N,M,
                                               (T)1.0, devB,M, devB,M,
                                               (T)0.0, devC,N) );
    GINCSVD_CUDA_CHECK( cudaMemcpyAsync(devA,hstA,M*N,cudaMemcpyHostToDevice,stream2) );
    GINCSVD_CUDA_CHECK( cudaEventRecord(copy2,stream2) );
    GINCSVD_CUDA_CHECK( cudaEventRecord(gemm2,stream1) );

    float t;
    GINCSVD_CUDA_CHECK( cudaEventSynchronize(copy2) );
    GINCSVD_CUDA_CHECK( cudaEventElapsedTime(&t, copy1, copy2) );
    cout << "copy time: " << t << endl;
    GINCSVD_CUDA_CHECK( cudaEventSynchronize(gemm2) );
    GINCSVD_CUDA_CHECK( cudaEventElapsedTime(&t, gemm1, gemm2) );
    cout << "gemm time: " << t << endl;

    GINCSVD_CUDA_CHECK( cudaEventDestroy(copy1) );
    GINCSVD_CUDA_CHECK( cudaEventDestroy(copy2) );
    GINCSVD_CUDA_CHECK( cudaEventDestroy(gemm1) );
    GINCSVD_CUDA_CHECK( cudaEventDestroy(gemm2) );
  }

  GINCSVD_CUDA_CHECK( cudaDeviceSynchronize() );

  GINCSVD_CUDA_CHECK( cudaFree(devA) );
  GINCSVD_CUDA_CHECK( cudaFree(devB) );
  GINCSVD_CUDA_CHECK( cudaFree(devC) );
  GINCSVD_CUDA_CHECK( cudaFreeHost(hstA) );
  GINCSVD_CUDA_CHECK( cudaStreamDestroy(stream1) );
  GINCSVD_CUDA_CHECK( cudaStreamDestroy(stream2) );

  return 0;
}
