#ifndef GINCSVD_HPP_
#define GINCSVD_HPP_

#include <cstring>
#include <cstdlib>
#include <algorithm>
#include <iterator>
#include <assert.h>
#include <iostream>
#include <iomanip>
#include <vector>

#include <cuda.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <cula_lapack_device.hpp>

#include "gincsvd_config.h"
#include "gincsvd_kernels_decl.hpp"

// "namespace" these macros as well as we can
// Wrap an expression yielding a CUDA runtime status; forwards it together with
// the call site's file and line to GINCSVD::details::cudaCheck.
#define GINCSVD_CUDA_CHECK(status) \
    GINCSVD::details::cudaCheck( status , __FILE__ , __LINE__ )
// Same, for an expression yielding a cublasStatus_t (GINCSVD::details::cublasCheck).
#define GINCSVD_BLAS_CHECK(status) \
    GINCSVD::details::cublasCheck( status , __FILE__ , __LINE__ )
// Same, for an expression yielding a culaStatus (GINCSVD::details::culaCheck).
#define GINCSVD_CULA_CHECK(status) \
    GINCSVD::details::culaCheck( status , __FILE__ , __LINE__ )

namespace GINCSVD {

  /// Timing accumulators filled in by the drivers and the update routine.
  /// All float fields receive values reported by cudaEventElapsedTime
  /// (i.e. milliseconds); everything starts at zero.
  struct timing_info {
    float total_time;  ///< elapsed time of a whole driver run
    float t_push;      ///< summed elapsed time of host-to-device column pushes
    int   numPush;     ///< number of pushes accumulated into t_push
    float t_expand;    ///< summed elapsed time of the basis-expansion stage of the update
    float t_pullUV;    ///< elapsed time of the final copy of U and VT back to the host

    timing_info()
      : total_time(0.0f),
        t_push(0.0f),
        numPush(0),
        t_expand(0.0f),
        t_pullUV(0.0f)
    {
    }
  };

  namespace details {
    /// No-op hook called by the *Check helpers (in GINCSVD_DEBUG builds) just
    /// before they terminate the process; a convenient place for a breakpoint.
    /// Declared inline because this header defines it: a non-inline definition
    /// would be emitted in every translation unit that includes the header and
    /// fail at link time with a multiple-definition error.
    inline void breakForDebug(int a) {
      int b;
      b = a;
      (void)b;
    }
    /// Validate a CUDA runtime status code.
    /// Returns silently on cudaSuccess; otherwise prints the failing call site
    /// (file, line), the numeric code and its cudaGetErrorString() text to
    /// std::cerr and terminates the process with exit(-1).
    inline void cudaCheck(cudaError status, const char *file, const int line )
    {
      if (status == cudaSuccess) {
        return;
      }
      std::cerr << "File " << file << ", line " << line << " returned CUDA error code " << status << ":\n* "
                << cudaGetErrorString(status) << std::endl;
#ifdef GINCSVD_DEBUG
      breakForDebug(0);
#endif
      exit(-1);
    }

    /// Validate a cuBLAS status code.
    /// Returns silently on CUBLAS_STATUS_SUCCESS; otherwise prints the failing
    /// call site (file, line) and the numeric status to std::cerr and
    /// terminates the process with exit(-1).
    inline void cublasCheck(cublasStatus_t status, const char *file, const int line )
    {
      const bool succeeded = (status == CUBLAS_STATUS_SUCCESS);
      if (succeeded) {
        return;
      }
      std::cerr << "File " << file << ", line " << line << " returned CUBLAS error code " << status << "\n*" << std::endl;
#ifdef GINCSVD_DEBUG
      breakForDebug(0);
#endif
      exit(-1);
    }

    /// Validate a CULA status code.
    /// Returns silently on culaNoError; otherwise queries the CULA error info,
    /// prints the failing call site (file, line), the status, the info code and
    /// a textual description to std::cerr, and terminates with exit(-1).
    inline void culaCheck(culaStatus status, const char *file, const int line )
    {
      if (culaNoError != status)
      {
        culaInfo info = culaGetErrorInfo();
        const int errsz = 1024;
        char errstr[errsz];
        // Start with an empty string so the buffer is never read uninitialized.
        errstr[0] = '\0';
        const char *description = errstr;
        if( culaGetErrorInfoString(status, info, errstr, errsz) != culaNoError ) {
          std::cerr << "error calling culaGetErrorInfoString(); this is likely an internal GINCSVD error." << std::endl;
          // The buffer contents cannot be trusted after a failed lookup;
          // print a fixed placeholder instead of streaming errstr.
          description = "(no error description available)";
        }
        std::cerr << "File " << file << ", line " << line << " returned CULA error code " << status << ", info " << info << ":\n* "
                  << description << std::endl;
#ifdef GINCSVD_DEBUG
        breakForDebug(1);
#endif
        exit(-1);
      }
    }
  }

  namespace debug {
    /// Debug helper: dump an M x N array to std::cout.
    /// `dev` is handed to cudaMemcpy as the source of a device-to-host copy of
    /// ldd*N elements; element (row, col) is read from offset col*ldd + row of
    /// the host copy. Compiles to an empty function unless GINCSVD_DEBUG is
    /// defined.
    template <class T>
    inline void printDeviceArray(const char *name, int M, int N, const T* dev, int ldd)
    {
#ifdef GINCSVD_DEBUG
      std::cout << "print " << name << ", size " << M << "," << N << "(" << ldd << ")\n";
      // host mirror of the full ldd x N storage, zeroed before the copy
      T* mirror = new T[ldd*N];
      for (T* p = mirror; p != mirror+ldd*N; ++p) {
        *p = 0.0;
      }
      GINCSVD_CUDA_CHECK( cudaMemcpy(mirror,dev,sizeof(T)*ldd*N,cudaMemcpyDeviceToHost) );
      // one output line per row; entries separated by tabs
      for (int row=0; row<M; ++row) {
        std::cout << "    ";
        for (int col=0; col<N; ++col) {
          const T &entry = mirror[row + col*ldd];
          std::cout << entry << '\t';
        }
        std::cout << '\n';
      }
      std::cout << std::flush;
      delete [] mirror;
#endif
    }
  }


  //////////////////////////////////////////////////////////////////////////////
  ///  "Which" enum
  //////////////////////////////////////////////////////////////////////////////
  /// Selects which end of the spectrum is kept when the update truncates.
  enum EWhich {
    LARGEST,   ///< default path in the update: no re-selection of singular values/vectors
    SMALLEST   ///< the update keeps the singular values/vectors at the trailing end instead
  };


  //////////////////////////////////////////////////////////////////////////////
  ///  Generators
  //////////////////////////////////////////////////////////////////////////////
  /// Generator that streams consecutive blocks out of an existing host array.
  /// Each generate() call copies the next L*M elements (L columns of M entries,
  /// stored contiguously) into the staging buffer and advances the read
  /// position; reset() rewinds to the start of the array.
  template <class T>
  class MatrixGenerator {
    public:
      typedef T scalar_type;

      /// \param A  first element of the source array (not owned)
      /// \param M  number of entries per column
      MatrixGenerator(const T* A, int M) : A0_(A), A_(A), M_(M) {}

      /// Copy the next L columns into staging and step past them.
      void generate(T *staging, int L) {
        const T* const blockEnd = A_ + L*M_;
        std::copy(A_, blockEnd, staging);
        A_ = blockEnd;
      }

      /// Rewind to the first column.
      void reset() {
        A_ = A0_;
      }

    private:
      const T* A0_;  // start of the source array
      const T* A_;   // current read position
      int M_;        // entries per column
  };

  /// Generator that fills each requested block with successive rand() values.
  template <class T>
  class RandomMatrixGenerator {
    public:
      typedef T scalar_type;

      /// \param M  number of entries per column
      RandomMatrixGenerator(int M) : M_(M) {}
      virtual ~RandomMatrixGenerator() {}

      /// Write M*L values, each the result of one rand() call, into staging.
      void generate(T *staging, int L)
      {
        T* const stop = staging + M_*L;
        for (T* out = staging; out != stop; ++out) {
          *out = rand();
        }
      }

    private:
      int M_;  // entries per column
  };

  /// Generator that fills each requested block with ones.
  template <class T>
  class OnesMatrixGenerator {
    public:
      typedef T scalar_type;

      /// \param M  number of entries per column
      OnesMatrixGenerator(int M) : M_(M) {}
      virtual ~OnesMatrixGenerator() {}

      /// Set the first M*L entries of staging to 1.
      void generate(T *staging, int L)
      {
        T* const stop = staging + M_*L;
        for (T* out = staging; out != stop; ++out) {
          *out = 1.0;
        }
      }

    private:
      int M_;  // entries per column
  };


  //////////////////////////////////////////////////////////////////////////////
  ///  init
  //////////////////////////////////////////////////////////////////////////////
  /// Select the CUDA device to use, print a short report about it to
  /// std::cout (name, compute capability, free/total memory, and a warning
  /// when the device reports no async copy engine), then initialize CULA.
  /// Any failing CUDA/CULA call terminates the process via the check macros.
  ///
  /// Declared inline: this is a non-template function defined in a header, so
  /// without inline every including translation unit would emit its own
  /// definition and the program would fail to link.
  ///
  /// \param device  CUDA device ordinal passed to cudaSetDevice
  inline void init(int device)
  {
    using std::cout;
    using std::endl;
    //
    GINCSVD_CUDA_CHECK( cudaSetDevice(device) );
    cudaDeviceProp deviceProp;
    GINCSVD_CUDA_CHECK( cudaGetDeviceProperties(&deviceProp, device) );
    cout << "Using device " << device << ", \"" << deviceProp.name << "\""
         << ", of compute capability " << deviceProp.major << "." << deviceProp.minor
         << endl;
    if (deviceProp.asyncEngineCount == 0) {
      cout << "*** WARNING: asyncEngineCount == 0; cannot overlap communication with computation." << endl;
    }
    // named so as not to shadow ::free from <cstdlib>
    size_t totalBytes, freeBytes;
    GINCSVD_CUDA_CHECK( cudaMemGetInfo(&freeBytes,&totalBytes) );
    // >> 20 converts bytes to MB
    cout << "Free memory : " << std::setw(5) << (freeBytes  >> 20) << " MB" << endl;
    cout << "Total memory: " << std::setw(5) << (totalBytes >> 20) << " MB" << endl;
    GINCSVD_CULA_CHECK( culaInitialize() );
  }


  //////////////////////////////////////////////////////////////////////////////
  ///  update routine
  //////////////////////////////////////////////////////////////////////////////
  namespace details {
    /// One step of the incremental SVD; defined at the bottom of this header.
    ///
    /// \param which          LARGEST or SMALLEST: which singular triplets are kept on truncation
    /// \param M              leading dimension (row count) used for U, Ap and tmp
    /// \param N              total column count (not referenced by the definition in this file)
    /// \param K              maximum rank kept; also the leading dimension of VT
    /// \param I              number of columns already consumed; I == 0 selects the first-pass path
    /// \param curK           current rank of the factorization
    /// \param L              number of new columns in Ap
    /// \param Ap             new columns, M x L (passed to cudaMemcpyAsync/cuBLAS as device memory)
    /// \param tmp            scratch buffer for the products formed while updating U and VT
    /// \param U,VT,B,S       factorization and workspace arrays, updated in place
    /// \param blas_handle    cuBLAS handle used for the GEMM/AXPY calls
    /// \param computestream  stream used for the copies issued alongside the BLAS work
    /// \param nextL          number of columns to generate with g and push to deviceRecv
    ///                       through hostStaging on sendstream (0: no push)
    /// \param hostS          if non-NULL, S is copied back here on recvstream
    /// \param t              timing accumulators (t_expand, t_push, numPush are updated)
    template <class T, class Generator>
    void gincsvd_update(const EWhich which, const int M, const int N, const int K,
                        const int I, const int curK, const int L,
                        const T* Ap, T* tmp,
                        T* U, T* VT, T* B, T* S,
                        cublasHandle_t blas_handle, cudaStream_t computestream,
                        const int nextL, Generator &g, T* deviceRecv, T* hostStaging, cudaStream_t sendstream,
                        T* hostS, cudaStream_t recvstream,
                        timing_info &t);

    /// Type-generic front end for cuBLAS GEMM; arguments follow cublas<t>gemm,
    /// except that alpha and beta are taken by value and forwarded by address.
    /// Only the float and double specializations below are defined.
    template <class T>
    cublasStatus_t GEMM(cublasHandle_t handle,
                        cublasOperation_t transa,
                        cublasOperation_t transb,
                        int m,
                        int n,
                        int k,
                        const T alpha,
                        const T *A, int lda,
                        const T *B, int ldb,
                        const T beta,
                        T *C, int ldc);

    /// float: forwards to cublasSgemm.
    /// inline is required: an explicit full specialization is an ordinary
    /// function, and defining it non-inline in a header yields duplicate
    /// symbols when the header is included from more than one source file.
    template <>
    inline cublasStatus_t GEMM<float>(cublasHandle_t handle,
                                      cublasOperation_t transa,
                                      cublasOperation_t transb,
                                      int m,
                                      int n,
                                      int k,
                                      const float alpha,
                                      const float *A, int lda,
                                      const float *B, int ldb,
                                      const float beta,
                                      float *C, int ldc)
    { return cublasSgemm(handle,transa,transb,m,n,k,
                         &alpha,A,lda,B,ldb,
                         &beta,C,ldc); }

    /// double: forwards to cublasDgemm (inline for the same reason as above).
    template <>
    inline cublasStatus_t GEMM<double>(cublasHandle_t handle,
                                       cublasOperation_t transa,
                                       cublasOperation_t transb,
                                       int m,
                                       int n,
                                       int k,
                                       const double alpha,
                                       const double *A, int lda,
                                       const double *B, int ldb,
                                       const double beta,
                                       double *C, int ldc)
    { return cublasDgemm(handle,transa,transb,m,n,k,
                         &alpha,A,lda,B,ldb,
                         &beta,C,ldc); }

    /// Type-generic front end for cuBLAS AXPY (y += alpha*x); arguments follow
    /// cublas<t>axpy, except that alpha is taken by value and forwarded by
    /// address. Only the float and double specializations below are defined.
    template <class T>
    cublasStatus_t AXPY(cublasHandle_t handle,
                        int n,
                        const T alpha,
                        const T *x, int incx,
                              T *y, int incy);

    /// float: forwards to cublasSaxpy.
    /// inline is required: an explicit full specialization is an ordinary
    /// function, and defining it non-inline in a header yields duplicate
    /// symbols when the header is included from more than one source file.
    template <>
    inline cublasStatus_t AXPY<float>(cublasHandle_t handle,
                                      int n,
                                      const float alpha,
                                      const float *x, int incx,
                                            float *y, int incy)
    { return cublasSaxpy(handle,n,&alpha,x,incx,y,incy); }

    /// double: forwards to cublasDaxpy (inline for the same reason as above).
    template <>
    inline cublasStatus_t AXPY<double>(cublasHandle_t handle,
                                       int n,
                                       const double alpha,
                                       const double *x, int incx,
                                             double *y, int incy)
    { return cublasDaxpy(handle,n,&alpha,x,incx,y,incy); }

  }


  //////////////////////////////////////////////////////////////////////////////
  ///  drivers for sync and async updates
  //////////////////////////////////////////////////////////////////////////////
  /// Driver that overlaps pushing the next block of columns with the update of
  /// the current one (separate transfer and compute streams); defined later in
  /// this header. Columns are obtained from g.generate(staging, count).
  ///
  /// \param which        which singular triplets to keep (see EWhich)
  /// \param M,N          row count and total column count of the streamed matrix
  /// \param K            maximum rank kept
  /// \param L            block size: at most L columns are pushed per step
  /// \param g            column generator, taken by value; must expose scalar_type and generate()
  /// \param U            host output; receives M*curK elements on completion
  /// \param V            host output (named VT in the definition); receives N*curK elements
  /// \param S            host output for the singular values
  /// \param printSigmas  accepted for interface parity; the printing code in this
  ///                     driver's definition is currently commented out
  /// \return accumulated timings
  template <class Generator>
  timing_info gincsvd_gen(EWhich which, int M, int N, int K, int L,
                    Generator g,
                    typename Generator::scalar_type* U,
                    typename Generator::scalar_type* V,
                    typename Generator::scalar_type* S,
                    bool printSigmas = false);

  /// Driver that does all transfers and updates in the default stream, one
  /// block after another; defined later in this header. Columns are obtained
  /// from g.generate(staging, count).
  ///
  /// \param which        which singular triplets to keep (see EWhich)
  /// \param M,N          row count and total column count of the streamed matrix
  /// \param K            maximum rank kept
  /// \param L            block size: at most L columns are pushed per step
  /// \param g            column generator, taken by value; must expose scalar_type and generate()
  /// \param U            host output; receives M*curK elements on completion
  /// \param V            host output (named VT in the definition); receives N*curK elements
  /// \param S            host output; receives the current singular values after every block
  /// \param printSigmas  when true, the current singular values are printed after every block
  /// \return accumulated timings
  template <class Generator>
  timing_info gincsvd_gen_sync(EWhich which, int M, int N, int K, int L,
                    Generator g,
                    typename Generator::scalar_type* U,
                    typename Generator::scalar_type* V,
                    typename Generator::scalar_type* S,
                    bool printSigmas = false);


  //////////////////////////////////////////////////////////////////////////////
  ///  matrix-based drivers
  //////////////////////////////////////////////////////////////////////////////
  /// Convenience entry point for a matrix already held in a host array:
  /// wraps A (M entries per column) in a MatrixGenerator and runs gincsvd_gen.
  /// All other arguments are forwarded unchanged.
  template <class T>
  timing_info gincsvd(EWhich which, int M, int N, int K, int L,
                      const T* A, T* U, T* VT, T* S, bool printSigmas = false)
  {
    typedef GINCSVD::MatrixGenerator<T> generator_type;
    generator_type columns(A,M);
    return GINCSVD::gincsvd_gen(which,M,N,K,L,columns,U,VT,S,printSigmas);
  }

  /// Convenience entry point for a matrix already held in a host array:
  /// wraps A (M entries per column) in a MatrixGenerator and runs
  /// gincsvd_gen_sync. All other arguments are forwarded unchanged.
  template <class T>
  timing_info gincsvd_sync(EWhich which, int M, int N, int K, int L,
                           const T* A, T* U, T* VT, T* S, bool printSigmas = false)
  {
    typedef GINCSVD::MatrixGenerator<T> generator_type;
    generator_type columns(A,M);
    return GINCSVD::gincsvd_gen_sync(which,M,N,K,L,columns,U,VT,S,printSigmas);
  }

} // end of namespace GINCSVD


/// Synchronous incremental-SVD driver: every transfer and every update is
/// issued in the default stream, one block of columns at a time.
///
/// Per iteration: generate up to L columns on the host, copy them to the
/// device, run details::gincsvd_update on them, then copy the current singular
/// values back into S (and print them if printValues is set). After the last
/// block, M*curK elements of U and N*curK elements of VT are copied to the
/// host outputs.
///
/// \param which        which singular triplets to keep (see EWhich)
/// \param M,N          row count and total column count of the streamed matrix
/// \param K            maximum rank kept
/// \param L            block size
/// \param g            column generator (by value)
/// \param U,VT,S       host output arrays
/// \param printValues  print the singular values after every block
/// \return timings: total_time, t_push/numPush, t_pullUV (plus whatever the update accumulates)
template <class Generator>
GINCSVD::timing_info GINCSVD::gincsvd_gen_sync(const EWhich which, const int M, const int N, const int K, const int L,
                                      Generator g,
                                      typename Generator::scalar_type* U,
                                      typename Generator::scalar_type* VT,
                                      typename Generator::scalar_type* S,
                                      bool printValues)
{
  using std::min;
  using std::max;
  timing_info timing;
  typedef typename Generator::scalar_type T;
  /* U is M * K+L
     VT is only ever K * N
     B is (K+L)*(K+L), but we'll allocate double so there is space for B's right singular vectors
     S is K+L, but we need another K+L for temp storage used in update
     tmp is scratch for the update; it must hold both the M x newK product
       formed for U and the newK x I product formed for VT (newK <= K, I < N),
       hence K*max(M,N) elements rather than just M*K
   */
  T *hostStaging = new T[M*L];
  T *devA;
  T *devU;
  T *devVT;
  T *devB;
  T *devS;
  T *tmp;
  GINCSVD_CUDA_CHECK( cudaMalloc(&tmp,     sizeof(T)*K*max(M,N))   );
  GINCSVD_CUDA_CHECK( cudaMalloc(&devA,    sizeof(T)*M*L)          );
  GINCSVD_CUDA_CHECK( cudaMalloc(&devU ,   sizeof(T)*M*(K+L))      );
  GINCSVD_CUDA_CHECK( cudaMalloc(&devVT,   sizeof(T)*N*K)          );
  GINCSVD_CUDA_CHECK( cudaMalloc(&devB , 2*sizeof(T)*(K+L)*(K+L))  );
  GINCSVD_CUDA_CHECK( cudaMalloc(&devS , 2*sizeof(T)*(K+L))        );

  cublasHandle_t blas_handle;
  GINCSVD_BLAS_CHECK( cublasCreate(&blas_handle) );
  // single stream for all work, the default stream
  cudaStream_t defstream = NULL;
  GINCSVD_BLAS_CHECK( cublasSetStream(blas_handle, defstream) );

  /* events to time the component operations */
  cudaEvent_t eXfer[2];
  GINCSVD_CUDA_CHECK( cudaEventCreate(&eXfer[0]) );
  GINCSVD_CUDA_CHECK( cudaEventCreate(&eXfer[1]) );
  cudaEvent_t eOverall[3];
  GINCSVD_CUDA_CHECK( cudaEventCreate(&eOverall[0]) );
  GINCSVD_CUDA_CHECK( cudaEventCreate(&eOverall[1]) );
  GINCSVD_CUDA_CHECK( cudaEventCreate(&eOverall[2]) );

  /*
      column number of last column produced (sent),  0 <= colProd <= N
      column number of last column consumed (svd'd), 0 <= colCons <= colProd
  */
  int colProd=0;
  int colCons=0;

  /* curK starts at 0 and grows to K after K columns */
  int curK = 0;

  ///////////////////////////////////////////////////////////////////////////////////
  // begin of loop over blocks
  GINCSVD_CUDA_CHECK( cudaEventRecord(eOverall[0], defstream) );

  while (colCons < colProd || colProd < N)
  {
    assert(0 <= curK);
    assert(curK <= colCons);
    assert(curK <= K);
    assert(0 <= colCons);
    assert(0 <= colProd);
    assert(colProd <= N);
    assert(colCons <= colProd);

    /////////////////////////////////////////////////////////////////////////////////
    // do the copy if there are remaining columns
    // can't update with more than L or K or more than we have
    int numToSend = min( L, N-colProd );
    if (colProd == 0 && K > 0) {
      // The first update factors the block directly and stores its right
      // singular vectors in VT, whose leading dimension is K; that requires the
      // first block to have at most K columns (same clamp as in gincsvd_gen).
      numToSend = min( numToSend, K );
    }
    if (numToSend > 0) {
#ifdef GINCSVD_DEBUG
      std::cout << "-------- Transferring columns " << colProd << " through " << colProd + numToSend - 1 << std::endl;
#endif
      g.generate(hostStaging,numToSend);
      GINCSVD_CUDA_CHECK( cudaEventRecord( eXfer[0], defstream ) );
      GINCSVD_CUDA_CHECK( cudaMemcpyAsync( devA, hostStaging, sizeof(T)*M*numToSend, cudaMemcpyHostToDevice, defstream) );
      GINCSVD_CUDA_CHECK( cudaEventRecord( eXfer[1], defstream ) );
      float t;
      // wait for the copy so its duration can be read back
      GINCSVD_CUDA_CHECK( cudaEventSynchronize(eXfer[1]) );
      GINCSVD_CUDA_CHECK( cudaEventElapsedTime(&t, eXfer[0], eXfer[1]) );
      timing.t_push += t;
      timing.numPush++;
      colProd += numToSend;
    }

    /////////////////////////////////////////////////////////////////////////////////
    // if columns have been produced/xferred, then we have work to do
    // i.e., we're not on the first iteration
    // do the update: updating with columns [colCons+1,colProd]
    if (colProd > 0) {
      const int numLastSent = colProd - colCons;
      assert(numLastSent > 0);
#ifdef GINCSVD_DEBUG
      std::cout << "-------- Updating (current rank: " << curK << ") with columns " << colCons << " to " << colCons + numLastSent - 1 << std::endl;
#endif
      // no overlapped push and no S copy-back inside the update: toSend == 0
      // and the receive/staging/hostS pointers are NULL
      details::gincsvd_update(which,M,N,K,
                              colCons,curK,numLastSent,
                              devA,tmp,
                              devU,devVT,devB,devS,
                              blas_handle, defstream,
                              0,g,(T*)NULL,(T*)NULL,defstream,
                              (T*)NULL,defstream,
                              timing);
      // inc this now, it will be updated momentarily
      colCons = colProd;
      curK = min(colCons,K);
    }

    /////////////////////////////////////////////////////////////////////////////////
    // print current singular values
    if (curK > 0)
    {
      GINCSVD_CUDA_CHECK( cudaMemcpyAsync( S, devS, sizeof(T)*curK, cudaMemcpyDeviceToHost, defstream) );
      // eXfer[1] is reused here purely as a completion marker for the copy
      GINCSVD_CUDA_CHECK( cudaEventRecord( eXfer[1], defstream ) );
      GINCSVD_CUDA_CHECK( cudaEventSynchronize( eXfer[1] ) );
      if (printValues) {
        std::cout << "Current singular values:\n";
        std::copy(S,S+curK,std::ostream_iterator<T>(std::cout,"\n"));
      }
    }

  }
  // end of loop over blocks
  ///////////////////////////////////////////////////////////////////////////////////

  // copy back U and V
  GINCSVD_CUDA_CHECK( cudaEventRecord(eOverall[1], defstream) );
  GINCSVD_CUDA_CHECK( cudaMemcpyAsync( U,  devU,  sizeof(T)*M*curK, cudaMemcpyDeviceToHost, defstream) );
  GINCSVD_CUDA_CHECK( cudaMemcpyAsync( VT, devVT, sizeof(T)*N*curK, cudaMemcpyDeviceToHost, defstream) );
  GINCSVD_CUDA_CHECK( cudaEventRecord(eOverall[2], defstream) );

  // stop and record times
  GINCSVD_CUDA_CHECK( cudaEventSynchronize(eOverall[2]) );
  GINCSVD_CUDA_CHECK( cudaEventElapsedTime(&timing.total_time,  eOverall[0], eOverall[2]) );
  GINCSVD_CUDA_CHECK( cudaEventElapsedTime(&timing.t_pullUV, eOverall[1], eOverall[2]) );

  // clean up
  GINCSVD_CUDA_CHECK( cudaEventDestroy(eXfer[0]) );
  GINCSVD_CUDA_CHECK( cudaEventDestroy(eXfer[1]) );
  GINCSVD_CUDA_CHECK( cudaEventDestroy(eOverall[0]) );
  GINCSVD_CUDA_CHECK( cudaEventDestroy(eOverall[1]) );
  GINCSVD_CUDA_CHECK( cudaEventDestroy(eOverall[2]) );
  //
  GINCSVD_BLAS_CHECK( cublasDestroy(blas_handle) );
  //
  GINCSVD_CUDA_CHECK( cudaFree(devA ) );
  GINCSVD_CUDA_CHECK( cudaFree(devU ) );
  GINCSVD_CUDA_CHECK( cudaFree(devVT) );
  GINCSVD_CUDA_CHECK( cudaFree(devB ) );
  GINCSVD_CUDA_CHECK( cudaFree(devS ) );
  GINCSVD_CUDA_CHECK( cudaFree(tmp ) );
  delete [] hostStaging;

  return timing;
} // gincsvd_gen_sync


/////////////////////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////////////////////////////////////////////
/// Asynchronous incremental-SVD driver: while one block of columns is being
/// folded into the factorization on the compute stream, the next block is
/// generated on the host and pushed to the device on a separate transfer
/// stream (the push itself is issued from inside details::gincsvd_update).
///
/// The first block (at most min(K,L) columns) is sent up front in the compute
/// stream. After the last block, M*curK elements of U and N*curK elements of
/// VT are copied to the host outputs.
///
/// \param which        which singular triplets to keep (see EWhich)
/// \param M,N          row count and total column count of the streamed matrix
/// \param K            maximum rank kept
/// \param L            block size
/// \param g            column generator (by value)
/// \param U,VT         host output arrays
/// \param S            host array handed to the update for the copy-back of singular values
/// \param printValues  currently unused here (the printing code below is commented out)
/// \return timings: total_time, t_push/numPush, t_pullUV (plus whatever the update accumulates)
template <class Generator>
GINCSVD::timing_info GINCSVD::gincsvd_gen(const EWhich which, const int M, const int N, const int K, const int L,
                                 Generator g,
                                 typename Generator::scalar_type* U,
                                 typename Generator::scalar_type* VT,
                                 typename Generator::scalar_type* S,
                                 bool printValues)
{
  using std::min;
  using std::max;
  timing_info timing;
  typedef typename Generator::scalar_type T;
  T* hostStaging;
  T* wholeBuffer;
  T* inoutBufPtr[2];
  T* cmputBufPtr[2];
  // buffer space is M * (L+W), W >= max(K,L), so that it is big enough for both:
  // - L input vectors (xfer)
  // and
  // - L input vectors (compute)
  // - K vectors of the updated left singular basis U*Q, they must be formed ex situ by GEMM
  //
  // The compute region doubles as the update's scratch buffer, which also
  // receives the newK x I product formed for VT (newK <= K, I < N). W is
  // therefore widened, when necessary, so that M*W >= K*N; for K*N <= M*max(K,L)
  // this is exactly W = max(K,L).
  const size_t vtScratchCols = (M > 0) ? ((size_t)K*(size_t)N + (size_t)M - 1) / (size_t)M : 0;
  const size_t scratchCols   = std::max<size_t>( (size_t)max(K,L), vtScratchCols );
  GINCSVD_CUDA_CHECK( cudaHostAlloc((void**)&hostStaging,sizeof(T)*M*L, cudaHostAllocWriteCombined) );
  GINCSVD_CUDA_CHECK( cudaMalloc((void**)&wholeBuffer,sizeof(T)*M*((size_t)L + scratchCols)) );
  /* layout of buffers is (shown for W == max(K,L)):
    if L<K:     L      K-L   L
             |------------|-------|
             | input |****| xfer  | bufComp=0 during read
             |------------|-------|
             |  temp U*Q  | xfer  | bufComp=0 during compute
             |------------|-------|

             |-------|------------|
             |  xfer |****| input | bufComp=1 during read
             |-------|------------|
             |  xfer |  temp U*Q  | bufComp=1 during compute
             |-------|------------|
    we need three markers, will use four
    * xfer positions 1 and 2, resp. columns 0 and W
    * compute space for U*Q , resp. columns 0 and L

    if L>=K, then 2*L is the minimum needed (for two data buffers)
             |-------|-------|
             | input |  xfer | bufComp=0 during read
             |-------|-------|
             | U*Q |*|  xfer | bufComp=0 during compute
             |-------|-------|
             |-------|-------|
             |  xfer | input | bufComp=1 during read
             |-------|-------|
             |  xfer | U*Q |*| bufComp=1 during compute
             |-------|-------|
    again, need three markers, will use four
    * xfer positions 1 and 2, resp. columns 0 and W
    * compute space for U*Q , resp. columns 0 and L
    note, these are the same offsets as above, for the L<K case
  */
  inoutBufPtr[0] = wholeBuffer;
  cmputBufPtr[0] = wholeBuffer;
  inoutBufPtr[1] = wholeBuffer + (size_t)M*scratchCols;
  cmputBufPtr[1] = wholeBuffer + M*L;
  /* U is M * K+L
     VT is only ever K * N
     B is (K+L)*(K+L), but we'll allocate double so there is space for B's right singular vectors
     S is K+L, but we need another K+L for temp storage used in update
   */
  T *devU;
  T *devVT;
  T *devB;
  T *devS;
  GINCSVD_CUDA_CHECK( cudaMalloc(&devU ,sizeof(T)*M*(K+L))        );
  GINCSVD_CUDA_CHECK( cudaMalloc(&devVT,sizeof(T)*N*K)            );
  GINCSVD_CUDA_CHECK( cudaMalloc(&devB ,2*sizeof(T)*(K+L)*(K+L))  );
  GINCSVD_CUDA_CHECK( cudaMalloc(&devS ,2*sizeof(T)*(K+L))        );
  /* make new streams
     - transfer stream for transferring next batch of columns to device
     - compute stream for doing BLAS work
     CULA, unfortunately, doesn't use streams, and induces a device synchronize on calls
  */
  cublasHandle_t blas_handle;
  GINCSVD_BLAS_CHECK( cublasCreate(&blas_handle) );
  cudaStream_t      cmptstream, xferstream;
  GINCSVD_CUDA_CHECK( cudaStreamCreate(&cmptstream) );
  GINCSVD_CUDA_CHECK( cudaStreamCreate(&xferstream) );
  GINCSVD_BLAS_CHECK( cublasSetStream(blas_handle, cmptstream) );

  /* events to time the component operations */
  cudaEvent_t eInitSend[2];
  GINCSVD_CUDA_CHECK( cudaEventCreate(&eInitSend[0]) );
  GINCSVD_CUDA_CHECK( cudaEventCreate(&eInitSend[1]) );
  cudaEvent_t eOverall[3];
  GINCSVD_CUDA_CHECK( cudaEventCreate(&eOverall[0]) );
  GINCSVD_CUDA_CHECK( cudaEventCreate(&eOverall[1]) );
  GINCSVD_CUDA_CHECK( cudaEventCreate(&eOverall[2]) );

  /*
      indices into buffers
      bufComp==0 means compute is in the first part of the buffers, async xfer in the second
      bufComp==1 means the opposite
  */
  int bufComp=0;

  /*
      column number of last column produced (sent),  0 <= colProd <= N
      column number of last column consumed (svd'd), 0 <= colCons <= colProd
  */
  int colProd=0;
  int colCons=0;

  /* curK starts at 0 and grows to K after K columns */
  int curK = 0;

  ///////////////////////////////////////////////////////////////////////////////////
  // begin of loop over blocks
  GINCSVD_CUDA_CHECK( cudaEventRecord(eOverall[0], cmptstream) );

  /////////////////////////////////////////////////////////////////////////////////
  // do initial copy in the compute stream
  {
    const int initSend = min( min(K,L),N-colProd );
#ifdef GINCSVD_DEBUG
    std::cout << "-------- Initial transfer of columns 0 through " << initSend-1 << std::endl;
#endif
    g.generate(hostStaging,initSend);
    GINCSVD_CUDA_CHECK( cudaEventRecord(eInitSend[0],cmptstream) );
    GINCSVD_CUDA_CHECK( cudaMemcpyAsync( inoutBufPtr[bufComp], hostStaging, sizeof(T)*M*initSend, cudaMemcpyHostToDevice, cmptstream) );
    GINCSVD_CUDA_CHECK( cudaEventRecord(eInitSend[1],cmptstream) );
    // hostStaging is pinned, so the copy above really is asynchronous, and the
    // first update immediately refills hostStaging with the next block. Wait
    // for the copy to finish so the host does not overwrite data that is still
    // being transferred. No overlap is lost: the first update needs these
    // columns before it can start anyway.
    GINCSVD_CUDA_CHECK( cudaEventSynchronize(eInitSend[1]) );
    // these are sent in the cmptstream, so will be available for the first iteration
    colProd += initSend;
  }

  ///////////////////////////////////////////////////////////////////////////////////
  // begin of loop over blocks
  while (colCons < N)
  {
    assert(0 <= curK);
    assert(curK <= colCons);
    assert(curK <= K);
    assert(0 <= colCons);
    assert(0 <= colProd);
    assert(colProd <= N);
    assert(colCons <= colProd);

    /////////////////////////////////////////////////////////////////////////////////
    // update our buffer indices
    const int bufXfer = bufComp ^ 1;
    assert(bufXfer + bufComp == 1);

    /////////////////////////////////////////////////////////////////////////////////
    // print current singular values
    // don't know how to do this yet; might need to print these inside.
    // however, ultimate utility of this code requires giving the user a handle for checking whether these are available
    // also probably want to have an option for transferring U,V back during the first stage
    //if (curK > 0)
    //{
    //  float t;
    //  GINCSVD_CUDA_CHECK( cudaEventSynchronize(eTransfer[PULLS_STOP]) );
    //  GINCSVD_CUDA_CHECK( cudaEventElapsedTime(&t, eTransfer[PULLS_START], eTransfer[PULLS_STOP]) );
    //  timing.t_pullS += t;
    //  if (printValues) {
    //    std::cout << "Current singular values:\n";
    //    std::copy(S,S+curK,std::ostream_iterator<T>(std::cout,"\n"));
    //  }
    //}

    // figure out how big the next update is; this may be zero, if colProd == N
    const int thisL = min(L, N-colProd);

    // do the update: updating with columns [colCons+1,colProd]
    const int lastL = colProd - colCons;
    assert(lastL > 0);
#ifdef GINCSVD_DEBUG
    std::cout << "-------- Updating (current rank: " << curK << ") with columns " << colCons << " to " << colCons + lastL - 1 << std::endl;
#endif
    // the update consumes the block in the compute half and, at the same time,
    // pushes the next thisL columns into the transfer half
    details::gincsvd_update(which,M,N,K,
                            colCons,curK,lastL,
                            inoutBufPtr[bufComp],cmputBufPtr[bufComp],
                            devU,devVT,devB,devS,
                            blas_handle, cmptstream,
                            thisL,g,inoutBufPtr[bufXfer],hostStaging,xferstream,
                            S,xferstream,
                            timing);

    // these have been sent and are available for the next iteration
    colProd += thisL;
    // after returning from update(), we've called CULA, which means we've device-synced, which means xfer to is done if we did one above
    colCons += lastL;
    curK = min(colCons,K);
    // swap xfer/comp buffers
    bufComp = bufXfer;
  }
  // end of loop over blocks
  ///////////////////////////////////////////////////////////////////////////////////

  // copy back U and V
  GINCSVD_CUDA_CHECK( cudaEventRecord(eOverall[1], cmptstream) );
  GINCSVD_CUDA_CHECK( cudaMemcpyAsync( U,  devU,  sizeof(T)*M*curK, cudaMemcpyDeviceToHost, cmptstream) );
  GINCSVD_CUDA_CHECK( cudaMemcpyAsync( VT, devVT, sizeof(T)*N*curK, cudaMemcpyDeviceToHost, cmptstream) );
  GINCSVD_CUDA_CHECK( cudaEventRecord(eOverall[2], cmptstream) );

  // stop and record times
  GINCSVD_CUDA_CHECK( cudaEventSynchronize(eOverall[2]) );
  GINCSVD_CUDA_CHECK( cudaEventElapsedTime(&timing.total_time,  eOverall[0], eOverall[2]) );
  GINCSVD_CUDA_CHECK( cudaEventElapsedTime(&timing.t_pullUV, eOverall[1], eOverall[2]) );

  {
    // didn't record this before; get it now
    float t;
    GINCSVD_CUDA_CHECK( cudaEventElapsedTime(&t,eInitSend[0],eInitSend[1]) );
    timing.t_push += t;
    timing.numPush++;
  }

  // clean up
  GINCSVD_CUDA_CHECK( cudaEventDestroy(eInitSend[0]) );
  GINCSVD_CUDA_CHECK( cudaEventDestroy(eInitSend[1]) );
  GINCSVD_CUDA_CHECK( cudaEventDestroy(eOverall[0]) );
  GINCSVD_CUDA_CHECK( cudaEventDestroy(eOverall[1]) );
  GINCSVD_CUDA_CHECK( cudaEventDestroy(eOverall[2]) );
  //
  GINCSVD_CUDA_CHECK( cudaStreamDestroy(xferstream) );
  GINCSVD_CUDA_CHECK( cudaStreamDestroy(cmptstream) );
  //
  GINCSVD_BLAS_CHECK( cublasDestroy(blas_handle) );
  //
  GINCSVD_CUDA_CHECK( cudaFree(wholeBuffer) );
  GINCSVD_CUDA_CHECK( cudaFree(devU ) );
  GINCSVD_CUDA_CHECK( cudaFree(devVT) );
  GINCSVD_CUDA_CHECK( cudaFree(devB ) );
  GINCSVD_CUDA_CHECK( cudaFree(devS ) );
  GINCSVD_CUDA_CHECK( cudaFreeHost(hostStaging) );

  return timing;
}

/////////////////////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////////////////////////////////////////////
/// One step of the incremental SVD: fold the L new columns in Ap into the
/// current rank-curK factorization held in U, S, VT.
///
/// First pass (I == 0): copy Ap into U and factor it directly with
/// culaDeviceGesvd. Later passes: orthogonalize Ap against the current U
/// (GEMM/GEMM/AXPY), QR-factor the remainder, assemble the small matrix B,
/// take its SVD and use the result to update U, S and VT, keeping at most K
/// triplets. If toSend > 0, the next toSend columns are generated into
/// hostStaging and pushed to deviceRecv on sendStream while the BLAS work is
/// queued on computeStream. If hostS is non-NULL, S is copied back to it on
/// recvStream.
///
/// \param which          SMALLEST keeps the trailing triplets on truncation, otherwise the leading ones
/// \param M              row count; leading dimension of U, Ap and tmp
/// \param N              total column count (not used in this routine)
/// \param K              maximum rank kept; leading dimension of VT
/// \param I              number of columns already consumed (0 on the first pass)
/// \param curK           current rank, i.e. number of valid columns in U / entries in S
/// \param L              number of columns in Ap
/// \param Ap             new columns, M x L, tightly packed
/// \param tmp            scratch; must hold max(M*newK, newK*I) elements, newK = min(K,curK+L)
/// \param U,VT,B,S       factorization and workspace arrays (see the allocation notes in the drivers)
/// \param timing         t_expand and t_push/numPush are accumulated here
template <class T, class Generator>
void GINCSVD::details::gincsvd_update(const EWhich which, const int M, const int N, const int K,
                                      int I, int curK, int L,
                                      const T* Ap, T* tmp,
                                      T* U, T* VT, T* B, T* S,
                                      cublasHandle_t blas_handle, cudaStream_t computeStream,
                                      const int toSend, Generator &g, T* deviceRecv, T* hostStaging, cudaStream_t sendStream,
                                      T* hostS, cudaStream_t recvStream,
                                      timing_info &timing)
{
  cudaEvent_t expandStart, expandStop, sendStart, sendStop;
  GINCSVD_CUDA_CHECK( cudaEventCreate(&expandStart) );
  GINCSVD_CUDA_CHECK( cudaEventCreate(&expandStop) );
  GINCSVD_CUDA_CHECK( cudaEventCreate(&sendStart) );
  GINCSVD_CUDA_CHECK( cudaEventCreate(&sendStop) );
  //
  const int LDU = M;
  T *U1 = U;
  T *U2 = U + curK*LDU;     // first column after the current basis
  const int LDVT = K;
  T *VT1 = VT;
  T *VT2 = VT + I*LDVT;     // column I of VT, where the new columns go
  // debug::printDeviceArray("Aplus ",M,L,Ap,M);
  //////////////////////////////////////////////////////////////////////////////////////////
  // first iteration: push and GESVD, no overlapping
  if (I==0) {
    if (toSend > 0) {
      g.generate(hostStaging,toSend);
#ifdef GINCSVD_DEBUG
      std::cout << "-------- Transferring next " << toSend << " columns" << std::endl;
#endif
      GINCSVD_CUDA_CHECK( cudaEventRecord(sendStart, sendStream) );
      GINCSVD_CUDA_CHECK( cudaMemcpyAsync(deviceRecv,hostStaging,sizeof(T)*M*toSend,cudaMemcpyHostToDevice,sendStream) );
      GINCSVD_CUDA_CHECK( cudaEventRecord(sendStop, sendStream) );
    }
    // FIRST ITERATION: just an orthogonal factorization, via SVD
    GINCSVD_CUDA_CHECK( cudaMemcpyAsync(U1,Ap,sizeof(T)*M*L,cudaMemcpyDeviceToDevice,computeStream) );
    GINCSVD_CULA_CHECK( culaDeviceGesvd('O','A',M,L,U1,LDU,S,U1,LDU,VT,LDVT) );
    debug::printDeviceArray("S after first pass",1,L,S,1);
    if (hostS != NULL) {
      // start copy-back of S
      GINCSVD_CUDA_CHECK( cudaMemcpyAsync(hostS,S,sizeof(T)*L,cudaMemcpyDeviceToHost,recvStream) );
    }
  }
  //////////////////////////////////////////////////////////////////////////////////////////
  // later iterations: full update sequence: push and Gram-Schmidt expansion overlap
  else {
    /* set up pointers:
        B was allocated at 2*(K+Lmax)*(K+Lmax), room for a (K+Lmax) SVD and its right singular vectors
        curK <= K and L <= Lmax, so there is room for two KpL x KpL matrices
    */
    const int KpL = curK+L;
    const int LDB = KpL;
    T *B11 = B;
    T *B12 = B + curK*LDB;          // == &B[0,   curK]
    T *B22 = B + curK*LDB + curK;   // == &B[curK,curK]
    const int LDWT = KpL;
    T *WT = B + LDB*LDB;
    // S was allocated at 2*(K+Lmax), so there is room for KpL * 2; put tau after the first KpL
    T *tau  = S + KpL;
    // need gram-schmidt; only way to get work done before calling CULA
    // B = [S C]
    //     [0 Z]
    // copy Ap to Up  (stream cmptstream)
    // C = U'*Up      (stream cmptstream)
    // Up = Up - U*C  (stream cmptstream)
    //
    // [U A] [S 0] [VT 0] = [U Ap] [S C] [VT 0]
    //       [0 I] [ 0 I]          [0 I] [0  I]
    //                    = [U Up] [S C] [VT 0]
    //                             [0 Z] [0  I]
    // 1d copy, as they both have the same tight stride
    debug::printDeviceArray("S entering update",1,curK,S,1);
    GINCSVD_CUDA_CHECK( cudaEventRecord(expandStart, computeStream) );
    GINCSVD_CUDA_CHECK( cudaEventRecord(sendStart,   sendStream) );
    // puts U'*A in part B12 of B
    // Only the first curK columns of U are valid, and B12 has curK rows, so
    // the inner dimensions are curK (not K): while the rank is still growing
    // (curK < K) using K would read unset columns of U, write past B12, and
    // make the second GEMM read the very columns (U2) it is writing.
    GINCSVD_BLAS_CHECK( details::GEMM(blas_handle, CUBLAS_OP_T, CUBLAS_OP_N,
                                      curK,L,M,
                                      (T)1.0, U,LDU, Ap,LDU,
                                      (T)0.0, B12,LDB) );
    // U2 = -U*C, then U2 += Ap  ==>  U2 = (I-UU')*Ap
    GINCSVD_BLAS_CHECK( details::GEMM(blas_handle, CUBLAS_OP_N, CUBLAS_OP_N,
                                      M,L,curK,
                                      (T)-1.0, U,LDU, B12,LDB,
                                      (T) 0.0, U2,LDU) );
    GINCSVD_BLAS_CHECK( details::AXPY(blas_handle, M*L, (T)1.0, Ap,1, U2,1) );
    if (toSend > 0) {
      g.generate(hostStaging,toSend);
#ifdef GINCSVD_DEBUG
      std::cout << "-------- Transferring next " << toSend << " columns" << std::endl;
#endif
      GINCSVD_CUDA_CHECK( cudaMemcpyAsync(deviceRecv,hostStaging,sizeof(T)*M*toSend,cudaMemcpyHostToDevice,sendStream) );
    }
    GINCSVD_CUDA_CHECK( cudaEventRecord(sendStop,   sendStream) );
    GINCSVD_CUDA_CHECK( cudaEventRecord(expandStop, computeStream) );
    // compute QR factorization of (I-UU')*Ap
    GINCSVD_CULA_CHECK( culaDeviceGeqrf(M,L,U2,LDU,tau) );
    // copy R to B22
    copyUpperTri(L, U2,LDU,B22,LDB);
    // set [S] in first curK columns of B (KpL rows, matching LDB)
    //     [0]
    // NOTE(review): copyToDiag is declared elsewhere; the (rows, cols, ...)
    // argument meaning is assumed from the original call -- confirm.
    copyToDiag(KpL,curK,B11,LDB, S);
    debug::printDeviceArray("B",KpL,KpL,B,LDB);
    // put Q in U2
    GINCSVD_CULA_CHECK( culaDeviceOrgqr(M,L,L,U2,LDU,tau) );
    // compute SVD of B
    GINCSVD_CULA_CHECK( culaDeviceGesvd('O','A',KpL,KpL,B,LDB,S,NULL,LDB,WT,LDWT) );
    debug::printDeviceArray("S",1,KpL,S,1);
    const int newK = std::min( K, curK+L );
    if (which == SMALLEST) {
      // below, use left and right singular vectors corresponding to smallest singular values
      // it's okay to move B and WT, because we don't use them anymore after this
      const int trunc = curK+L-newK;
      B  += LDB*trunc;
      WT += trunc;
      // must do two copies (because copies can't overlap space); stage in G, then back
      GINCSVD_CUDA_CHECK( cudaMemcpyAsync(tau,S+trunc,sizeof(T)*newK,cudaMemcpyDeviceToDevice,computeStream) );
      GINCSVD_CUDA_CHECK( cudaMemcpyAsync(S,tau,      sizeof(T)*newK,cudaMemcpyDeviceToDevice,computeStream) );
    }
    // update U
    GINCSVD_BLAS_CHECK( details::GEMM(blas_handle, CUBLAS_OP_N, CUBLAS_OP_N,
                                      M,newK,KpL,
                                      (T)1.0, U,LDU, B,LDB,
                                      (T)0.0, tmp,LDU) );
    GINCSVD_CUDA_CHECK( cudaMemcpyAsync(U,tmp,sizeof(T)*M*newK,cudaMemcpyDeviceToDevice,computeStream)  );
    /* update VT
                  curK   L     I  L                    I      L
            newK [WT1   WT2] [VT  0 ] curK  = newK [ WT1*VT  WT2 ]
                             [ 0  I ]  L
     */
    GINCSVD_BLAS_CHECK( details::GEMM(blas_handle, CUBLAS_OP_N, CUBLAS_OP_N,
                                      newK,I,curK,
                                      (T)1.0, WT,LDWT, VT,LDVT,
                                      (T)0.0, tmp,newK) );
    // width and height are reversed here, because this copy uses C/row-major (transposed) layout
    const int stridevt  = sizeof(T)*LDVT;
    const int stridewt  = sizeof(T)*LDWT;
    const int stridetmp = sizeof(T)*newK;
    const int width     = sizeof(T)*newK;
    const T *WT2 = WT+LDWT*curK;
    GINCSVD_CUDA_CHECK( cudaMemcpy2DAsync(VT1,stridevt,tmp,stridetmp,width,I,cudaMemcpyDeviceToDevice,computeStream) );
    GINCSVD_CUDA_CHECK( cudaMemcpy2DAsync(VT2,stridevt,WT2, stridewt,width,L,cudaMemcpyDeviceToDevice,computeStream) );
    if (hostS != NULL) {
      // start copy-back of S
      GINCSVD_CUDA_CHECK( cudaMemcpyAsync(hostS,S,sizeof(T)*newK,cudaMemcpyDeviceToHost,recvStream) );
    }
  }
  // now that other work is registered, check the timings from before the CULA calls
  // cudaEventElapsedTime fails with cudaErrorNotReady if an event has not
  // completed yet, so wait on each stop event explicitly rather than relying
  // on the CULA calls above having synchronized the device.
  if (I != 0) {
    float t;
    GINCSVD_CUDA_CHECK( cudaEventSynchronize(expandStop) );
    GINCSVD_CUDA_CHECK( cudaEventElapsedTime(&t, expandStart, expandStop) );
    timing.t_expand += t;
  }
  if (toSend > 0) {
    float t;
    GINCSVD_CUDA_CHECK( cudaEventSynchronize(sendStop) );
    GINCSVD_CUDA_CHECK( cudaEventElapsedTime(&t, sendStart,   sendStop) );
    timing.t_push += t;
    timing.numPush++;
  }
  GINCSVD_CUDA_CHECK( cudaEventDestroy(expandStart) );
  GINCSVD_CUDA_CHECK( cudaEventDestroy(expandStop) );
  GINCSVD_CUDA_CHECK( cudaEventDestroy(sendStart) );
  GINCSVD_CUDA_CHECK( cudaEventDestroy(sendStop) );
}


#endif
