diff --git a/cuda/debug/Makefile b/cuda/debug/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..8ae7c9f28ba7baf815b58391441b1fad9ac670dc
--- /dev/null
+++ b/cuda/debug/Makefile
@@ -0,0 +1,20 @@
+NVCC = nvcc
+NVCC_FLAGS = -arch=sm_80 -lineinfo --ptxas-options=--verbose
+
+SRCS := $(wildcard *.cu)
+EXES  = $(patsubst %.cu,%.x,$(SRCS))
+
+all: exe
+
+exe : $(EXES)
+
+help:
+	@echo -e "make exe  \t compile: $(SRCS)"
+
+%.x: %.cu
+	@echo -e "\nCOMPILE: $<"
+	$(NVCC) $(NVCC_FLAGS) -o $@ $<
+
+clean:
+	$(RM) $(EXES) 
+
diff --git a/cuda/debug/jacobi.cu b/cuda/debug/jacobi.cu
new file mode 100644
index 0000000000000000000000000000000000000000..7812c65db3a53fcc6189169a6c46469ada556dc2
--- /dev/null
+++ b/cuda/debug/jacobi.cu
@@ -0,0 +1,168 @@
+#include <math.h>
+#include <string.h>
+#include <float.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/time.h>
+
+#include <cuda_runtime.h>
+
+#include <cub/device/device_reduce.cuh>
+using namespace cub;
+
+#define NUM_THREADS 32
+
+
+__global__ void jacobi(float * curr, float * next, float * errs, int n, int stride){
+
+    // INSERT CODE: index definitions
+    int i = ;
+    int j = ;
+
+    int idx = ; 
+
+    // INSERT CODE: 
+    if ( ) {
+
+    // INSERT CODE: 
+        next[idx] = 0.25 * ( ) ;
+
+        errs[idx] = fabs(next[idx] - curr[idx]);
+
+    }
+
+}
+
+
+int main(int argc, char *argv[]) {
+
+    int n = 4096;
+    int stride;
+    int iter_max = 1000;
+    double tol = 1.0e-6;
+
+    if (argc > 1) {
+        n = std::stoi(argv[1]);
+    }
+
+    if (argc > 2) {
+        iter_max = std::stoi(argv[2]);
+    }
+
+    stride = n;
+
+    std::cout << "Dim: " << n << std::endl;
+    std::cout << "Stride: " << stride << std::endl;
+
+    size_t nbytes = sizeof(float) * n * stride;
+
+    float * h_curr = (float *) malloc(nbytes);
+
+    float h_err = 1.;
+
+    float * d_curr;
+    float * d_next;
+    float * d_errs;
+
+    float * d_err;
+
+    cudaMalloc((void **) &d_curr, nbytes);
+    cudaMalloc((void **) &d_next, nbytes);
+    cudaMalloc((void **) &d_errs, nbytes);
+
+    cudaMalloc((void **) &d_err, sizeof(float));
+
+    cudaMemset(d_curr, 0, nbytes);
+    cudaMemset(d_next, 0, nbytes);
+
+    dim3 nThreads(NUM_THREADS, NUM_THREADS);
+    dim3 nBlocks((n-1)/nThreads.x+1, (n-1)/nThreads.y+1);
+
+    // Determine temporary device storage requirements
+    void     *d_temp_storage = NULL;
+    size_t   temp_storage_bytes = 0;
+    cub::DeviceReduce::Max(d_temp_storage, temp_storage_bytes, d_errs, d_err, n*stride);
+    // Allocate temporary storage
+    cudaMalloc(&d_temp_storage, temp_storage_bytes);
+
+    // timing
+    cudaEvent_t start, stop;
+    float etime, time_fd, time_max;
+
+    time_fd = 0.0;
+    time_max = 0.0;
+
+    // create events
+    // INSERT CODE: 
+
+    int iter = 0;
+    struct timeval temp_1, temp_2;
+    double ttime=0.;
+
+    for ( int i = 0 ; i < n ; i++ ) {
+        h_curr[i * stride] = 1.;
+    }
+
+    cudaMemcpy(d_curr, h_curr, nbytes, cudaMemcpyHostToDevice);
+
+    printf("Jacobi relaxation Calculation: %d x %d mesh\n", n, n);
+
+    gettimeofday(&temp_1, (struct timezone*)0);
+
+    while ( h_err > tol && iter < iter_max )
+    {
+
+        cudaEventRecord(start);
+
+        jacobi<<<nBlocks, nThreads>>>(d_curr, d_next, d_errs, n, stride);
+
+        cudaEventRecord(stop);
+        cudaEventSynchronize(stop);
+
+        cudaEventElapsedTime(&etime, start, stop);
+
+        time_fd += etime;
+
+        cudaEventRecord(start);
+
+        cub::DeviceReduce::Max(d_temp_storage, temp_storage_bytes, d_errs, d_err, n*stride);
+
+        cudaEventRecord(stop);
+        cudaEventSynchronize(stop);
+
+        cudaEventElapsedTime(&etime, start, stop);
+
+        time_max += etime;
+
+        cudaMemcpy(&h_err, d_err, sizeof(float), cudaMemcpyDeviceToHost);
+
+        //if(iter % 10 == 0) printf("%5d, %0.8lf\n", iter, h_err);
+
+        std::swap(d_curr, d_next);
+
+        iter++;
+    }
+
+    gettimeofday(&temp_2, (struct timezone*)0);
+    ttime = 0.000001*((temp_2.tv_sec-temp_1.tv_sec)*1.e6+(temp_2.tv_usec-temp_1 .tv_usec));
+
+    printf("Elapsed time (s) = %.2lf\n", ttime);
+    printf("Stopped at iteration: %u\n", iter);
+
+    time_fd /= iter;
+    time_max /= iter;
+
+    std::cout << "Time FD  / ms " << time_fd << std::endl;
+    std::cout << "Time MAX / ms " << time_max << std::endl;
+
+    free(h_curr);
+
+    cudaFree(d_curr);
+    cudaFree(d_next);
+
+    // release event resources
+    // INSERT CODE: 
+
+    return 0;
+
+}
diff --git a/cuda/debug/kernel_copy.cu b/cuda/debug/kernel_copy.cu
new file mode 100644
index 0000000000000000000000000000000000000000..66ff1ab06a4cfbdcbfcbcd905545c4c937b94677
--- /dev/null
+++ b/cuda/debug/kernel_copy.cu
@@ -0,0 +1,142 @@
+#include <stdio.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+#define BLOCK_DIM 256
+
+#define REPEAT 1
+
+
+__global__ void strideCopy(float *odata, float* idata, int stride)
+{
+    // INSERT CODE: index definitions
+    int xid = ;
+    odata[xid] = idata[xid];
+}
+
+__global__ void offsetCopy(float *odata, float* idata, int offset)
+{
+    // INSERT CODE: index definitions
+    int xid = ;
+    odata[xid] = idata[xid];
+}
+
+void init_data(float * buffer, int size) {
+   for ( int i = 0 ; i < size ; i++ ) {
+      buffer[i] = (float) i;
+   }
+}
+
+int main (int argc, char *argv[]) {
+  
+   int buffer_size;
+
+   float * h_buffer;
+
+   float * d_inp;
+   float * d_out;
+
+   cudaEvent_t start, end;
+   float eventEtime, eventBandwidth;
+
+   int size = 65535;
+   size *= BLOCK_DIM;
+
+   int stride = 1;
+   int offset = 0;
+
+   if (argc > 1) {
+      stride = atoi(argv[1]);
+   }
+
+   if (argc > 2) {
+      offset = atoi(argv[2]);
+   }
+
+   buffer_size = (size+offset)*stride;
+
+   dim3 block (BLOCK_DIM);
+   dim3 grid  ( (size-1)/block.x+1 );
+
+   printf("Number of Elemets = %d\n", size);
+   printf("Stride            = %d\n", stride);
+   printf("Offset            = %d\n", offset);
+   printf("Buffer size       = %d MB\n", sizeof(float)*buffer_size/(1<<20));
+
+   printf("Block size        = %d\n", block.x);
+   printf("Grid size         = %d\n", grid.x);
+
+   printf("Repeat            = %d\n", REPEAT);
+
+//-- insert C code -------------
+// allocate memory on host
+
+//------------------------------
+
+   init_data (h_buffer, buffer_size);
+
+//-- insert CUDA code ----------
+// creat cuda events (start, end) for timing
+
+//------------------------------
+
+//-- insert CUDA code ----------
+// allocate memory buffers on selected GPU
+
+//------------------------------
+
+   cudaMemcpy(d_inp, h_buffer, sizeof(float)*buffer_size, cudaMemcpyHostToDevice);
+
+   printf ("\nStride Copy : Stride = %d\n", stride);
+
+   cudaEventRecord(start);
+
+//-- insert CUDA code ----------
+// launch stride copy kernel
+
+//------------------------------
+
+   cudaEventRecord(end);
+   cudaEventSynchronize(end);
+   cudaEventElapsedTime(&eventEtime, start, end);
+
+   eventEtime /= REPEAT;
+
+   eventBandwidth = (float) ( 2.0F*size*sizeof(float) ) / ( eventEtime );
+   printf("Time / ms        = %.1f\n", eventEtime);
+   printf("Bandwidth / GB/s = %.2f\n", eventBandwidth / 1.e6);
+
+   printf ("\nOffset Copy : Offset = %d\n", offset);
+
+   cudaEventRecord(start);
+
+//-- insert CUDA code ----------
+// launch offset copy kernel
+
+//------------------------------
+
+   cudaEventRecord(end);
+   cudaEventSynchronize(end);
+   cudaEventElapsedTime(&eventEtime, start, end);
+
+   eventEtime /= REPEAT;
+
+   eventBandwidth = (float) ( 2.0F*size*sizeof(float) ) / ( eventEtime );
+   printf("Time / ms        = %.1f\n", eventEtime);
+   printf("Bandwidth / GB/s = %.2f\n", eventBandwidth / 1.e6);
+
+   cudaMemcpy(h_buffer, d_out, sizeof(float)*buffer_size, cudaMemcpyDeviceToHost);
+
+//-- insert CUDA code ----------
+// free resources on device (memory buffers and events)
+
+//----------------------------
+
+//-- insert C coda -----------
+// free resources on host
+
+//------------------------------
+
+   return 0;
+}
+
diff --git a/cuda/debug/matrix_mul.cu b/cuda/debug/matrix_mul.cu
new file mode 100644
index 0000000000000000000000000000000000000000..157e70179e0413cc89f724d157dda1578b5bb67c
--- /dev/null
+++ b/cuda/debug/matrix_mul.cu
@@ -0,0 +1,139 @@
+#include <stdio.h>
+#include <assert.h>
+#define epsilon (float)1e-5
+
+#define NUM_THREADS 16
+
+void matrixMulHost(float* A, float* B, float* C, int n) {
+  for (int i = 0; i < n; ++i) {
+    for (int j = 0; j < n; ++j) {
+      float pvalue = 0;
+      for (int k = 0; k < n; ++k) {
+        float a = A[i * n + k];
+        float b = B[k * n + j];
+        pvalue += a * b;
+      }
+      C[i * n + j] = pvalue;
+    }
+  }
+}
+
+__global__ void MatrixMulKernel(float* A, float* B, float* C, int n) {
+
+   // INSERT CODE: index definitions
+   int j = ;
+   int i = ;
+
+   // INSERT CODE: 
+   if ( ) {
+
+   }
+}
+
+int main(int argc, char** argv) {
+
+  int n;
+
+  float *h_A, *h_B, *h_C;
+  float *h_ref;
+
+  if(argc<2) {
+    fprintf(stderr,"Usage: %s Width\n",argv[0]);
+    exit(1);
+  }
+
+  n=atoi(argv[1]);
+
+  if(n<1) {
+    fprintf(stderr,"Error Width=%d, must be > 0\n",n);
+    exit(1);
+  }
+
+  size_t nbytes = n * n * sizeof(float);
+
+  h_A = (float *) malloc(nbytes);
+  h_B = (float *) malloc(nbytes);
+  h_C = (float *) malloc(nbytes);
+
+  h_ref = (float *) malloc(nbytes);
+
+  memset(h_C, 0, nbytes);
+  memset(h_ref, 0, nbytes);
+
+  for(int y = 0; y < n; y++){
+    for(int x = 0; x < n; x++) {
+      h_A[y * n + x]=(float)(((y + 1) * n + x + 1)/n);
+      h_B[y * n + x]=(float)(((y + 1) * n + x + 1)/n);
+    }
+  }
+
+  float flops;
+  float *d_A, *d_B, *d_C;
+
+  // CUDA grid management
+  dim3 threads(NUM_THREADS, NUM_THREADS);
+  dim3 blocks(n/NUM_THREADS,n/NUM_THREADS);
+
+  cudaMalloc(&d_A, nbytes);
+  cudaMalloc(&d_B, nbytes);
+  cudaMalloc(&d_C, nbytes);
+
+  cudaMemcpy(d_A, h_A, nbytes, cudaMemcpyHostToDevice);
+  cudaMemcpy(d_B, h_B, nbytes, cudaMemcpyHostToDevice);
+
+  // cudaGetLastError call to reset previous CUDA errors
+  // INSERT CODE
+
+  // Create start and stop CUDA events 
+  // INSERT CODE
+
+
+
+  // kernel launch
+  MatrixMulKernel<<<blocks, threads>>>(d_A, d_B, d_C, n);
+
+  // event record, synchronization, elapsed time and destruction
+  // INSERT CODE
+
+  // device synchronization and cudaGetLastError call
+  // INSERT CODE
+
+  float elapsed;
+  cudaEventElapsedTime(&elapsed, start, stop);
+  elapsed = elapsed/1000.f; // convert to seconds
+
+  // calculate Mflops
+  // INSERT CODE
+  printf("Kernel elapsed time %fs \n", elapsed);
+  printf("Gflops: %f\n", flops);
+
+  // copy back results from device
+  cudaMemcpy(h_C, d_C, nbytes, cudaMemcpyDeviceToHost);
+
+  // free memory on device
+  cudaFree(d_A);
+  cudaFree(d_B);
+  cudaFree(d_C);
+
+  cudaEventDestroy(start);
+  cudaEventDestroy(stop);
+
+  matrixMulHost(h_A, h_B, h_ref, n);
+
+  int errCnt = 0;
+  for(int y = 0; y < n; y++){
+    for(int x = 0; x < n; x++) {
+      float it = h_ref[y * n + x];
+      if(fabs(it - h_C[y * n + x]) > epsilon * it) {
+        printf("failing x=%d, y=%d: %f!=%f \n", x, y, it, h_C[y * n + x]);
+        errCnt++;
+      }
+    }
+  }
+
+  if(errCnt==0)
+    printf("\nTEST PASSED\n");
+  else
+    printf("\n\nTEST FAILED: number of errors:  %d\n", errCnt);
+
+}
diff --git a/cuda/debug/matrix_mul_shared.cu b/cuda/debug/matrix_mul_shared.cu
new file mode 100644
index 0000000000000000000000000000000000000000..691efda508a2ae15236b86fc48f85588bc0bf1db
--- /dev/null
+++ b/cuda/debug/matrix_mul_shared.cu
@@ -0,0 +1,200 @@
+#include <stdio.h>
+#include <assert.h>
+#define epsilon (float)1e-5
+
+#define NUM_THREADS 16
+
+void matrixMulHost(float* A, float* B, float* C, int n) {
+  for (int i = 0; i < n; ++i) {
+    for (int j = 0; j < n; ++j) {
+      float pvalue = 0;
+      for (int k = 0; k < n; ++k) {
+        float a = A[i * n + k];
+        float b = B[k * n + j];
+        pvalue += a * b;
+      }
+      C[i * n + j] = pvalue;
+    }
+  }
+}
+
+__device__ int get_offset (int idx_i, int idx_j, int N) {
+   return idx_i * N * NUM_THREADS + idx_j * NUM_THREADS;
+}
+
+__global__ void MatrixMulKernelShared(float *A, float *B, float *C, int N) {
+
+   // Shared memory used to store Asub and Bsub respectively
+   __shared__ float As[NUM_THREADS][NUM_THREADS];
+   __shared__ float Bs[NUM_THREADS][NUM_THREADS];
+ 
+   // Block row and column 
+   int ib = blockIdx.y;
+   int jb = blockIdx.x;
+
+   // Thread row and column within Csub 
+   int it = threadIdx.y;
+   int jt = threadIdx.x;
+
+   int a_offset, b_offset, c_offset;
+
+   // Each thread computes one element of Csub
+   // by accumulating results into Cvalue
+   float Cvalue = 0.0f;
+
+   // Loop over all the sub-matrices of A and B that are 
+   // required to compute Csub. 
+   // Multiply each pair of sub-matrices together
+   // and accumulate the results. 
+
+   for (int kb = 0; kb < (N / NUM_THREADS); ++kb) {
+
+      // Get the starting address (a_offset) of Asub
+      // (sub-matrix of A of dimension NB x NB)
+      // Asub is located i_block sub-matrices to the right and
+      // k_block sub-matrices down from the upper-left corner of A
+      a_offset = get_offset (ib, kb, N);
+      // Get the starting address (b_offset) of Bsub
+      b_offset = get_offset (kb, jb, N);
+
+      // Load Asub and Bsub from device memory to shared memory
+      // Each thread loads one element of each sub-matrix
+      // ---------------- //
+      // INSERT CUDA CODE //
+      // ---------------- //
+
+      // Synchronize to make sure the sub-matrices are loaded
+      // before starting the computation
+      // ---------------- //
+      // INSERT CUDA CODE //
+      // ---------------- //
+
+      // Multiply As and Bs together
+      for (int k = 0; k < NUM_THREADS; ++k) {
+         // ---------------- //
+         // INSERT CUDA CODE //
+         // ---------------- //
+
+      }
+      // Synchronize to make sure that the preceding
+      // computation is done before loading two new
+      // sub-matrices of A and B in the next iteration
+      // ---------------- //
+      // INSERT CUDA CODE //
+      // ---------------- //
+
+   }
+   
+   c_offset = get_offset (ib, jb, N);
+   // Each thread block computes one sub-matrix Csub of C
+   // ---------------- //
+   // INSERT CUDA CODE //
+   // ---------------- //
+
+}
+
+int main(int argc, char** argv) {
+
+  int n;
+
+  float *h_A, *h_B, *h_C;
+  float *h_ref;
+
+  if(argc<2) {
+    fprintf(stderr,"Usage: %s Width\n",argv[0]);
+    exit(1);
+  }
+
+  n=atoi(argv[1]);
+
+  if(n<1) {
+    fprintf(stderr,"Error Width=%d, must be > 0\n",n);
+    exit(1);
+  }
+
+  size_t nbytes = n * n * sizeof(float);
+
+  h_A = (float *) malloc(nbytes);
+  h_B = (float *) malloc(nbytes);
+  h_C = (float *) malloc(nbytes);
+
+  h_ref = (float *) malloc(nbytes);
+
+  memset(h_C, 0, nbytes);
+  memset(h_ref, 0, nbytes);
+
+  for(int y = 0; y < n; y++){
+    for(int x = 0; x < n; x++) {
+      h_A[y * n + x]=(float)(((y + 1) * n + x + 1)/n);
+      h_B[y * n + x]=(float)(((y + 1) * n + x + 1)/n);
+    }
+  }
+
+  float flops;
+  float *d_A, *d_B, *d_C;
+
+  // CUDA grid management
+  dim3 threads(NUM_THREADS, NUM_THREADS);
+  dim3 blocks(n/NUM_THREADS,n/NUM_THREADS);
+
+  cudaMalloc(&d_A, nbytes);
+  cudaMalloc(&d_B, nbytes);
+  cudaMalloc(&d_C, nbytes);
+
+  cudaMemcpy(d_A, h_A, nbytes, cudaMemcpyHostToDevice);
+  cudaMemcpy(d_B, h_B, nbytes, cudaMemcpyHostToDevice);
+
+  // cudaGetLastError call to reset previous CUDA errors
+  // INSERT CODE
+
+  // Create start and stop CUDA events 
+  // INSERT CODE
+
+  // kernel launch
+  MatrixMulKernelShared<<<blocks, threads>>>(d_A, d_B, d_C, n);
+
+  // event record, synchronization, elapsed time and destruction
+  // INSERT CODE
+
+  // device synchronization and cudaGetLastError call
+  // INSERT CODE
+
+  float elapsed;
+  cudaEventElapsedTime(&elapsed, start, stop);
+  elapsed = elapsed/1000.f; // convert to seconds
+
+  // calculate Mflops
+  // INSERT CODE
+  printf("Kernel elapsed time %fs \n", elapsed);
+  printf("Gflops: %f\n", flops);
+
+  // copy back results from device
+  cudaMemcpy(h_C, d_C, nbytes, cudaMemcpyDeviceToHost);
+
+  // free memory on device
+  cudaFree(d_A);
+  cudaFree(d_B);
+  cudaFree(d_C);
+
+  cudaEventDestroy(start);
+  cudaEventDestroy(stop);
+
+  matrixMulHost(h_A, h_B, h_ref, n);
+
+  int errCnt = 0;
+  for(int y = 0; y < n; y++){
+    for(int x = 0; x < n; x++) {
+      float it = h_ref[y * n + x];
+      if(fabs(it - h_C[y * n + x]) > epsilon * it) {
+        printf("failing x=%d, y=%d: %f!=%f \n", x, y, it, h_C[y * n + x]);
+        errCnt++;
+      }
+    }
+  }
+
+  if(errCnt==0)
+    printf("\nTEST PASSED\n");
+  else
+    printf("\n\nTEST FAILED: number of errors:  %d\n", errCnt);
+
+}
diff --git a/cuda/debug/matrix_transpose.cu b/cuda/debug/matrix_transpose.cu
new file mode 100644
index 0000000000000000000000000000000000000000..f49fe4a7262a75aa29978f6f3ac43b29fe1fd55c
--- /dev/null
+++ b/cuda/debug/matrix_transpose.cu
@@ -0,0 +1,262 @@
+
+#define SIZE         512
+#define NUM_THREADS   32
+#define NUM_REPS       3
+
+#include <stdio.h>
+
+
+__global__ void copy(float *odata, float* idata, int width, int height)
+{
+  // INSERT CODE: index definitions
+  int i = ;
+  int j = ;
+
+  int index  = ;
+
+  // INSERT CODE: 
+
+}
+
+
+// transpose naive
+
+__global__ void transposeNaive(float *odata, float* idata, int width, int height)
+{
+  // INSERT CODE: index definitions
+  int xIndex = ;
+  int yIndex = ;
+
+  int index_in  = ;
+  int index_out = ;
+
+  // INSERT CODE: 
+
+}
+
+// transpose in shared memory
+
+__global__ void transposeShared(float *odata, float *idata, int width, int height)
+{
+  // INSERT CODE: 
+
+  // INSERT CODE: index definitions
+  int xIndex = ;
+  int yIndex = ;  
+
+  int index_in = ;
+
+  xIndex = ;
+  yIndex = ;
+
+  int index_out = ;
+
+  // INSERT CODE: load to shared memory
+
+  // INSERT CODE: Synchronize before start computation
+  __syncthreads();
+
+  // INSERT CODE: 
+
+}
+
+// transpose in shared memory with no bank conflicts
+
+__global__ void transposeNoBankConflicts(float *odata, float *idata, int width, int height)
+{
+  // INSERT CODE: 
+
+  // INSERT CODE: index definitions
+  int xIndex = ;
+  int yIndex = ;  
+
+  int index_in = ;
+
+  xIndex = ;
+  yIndex = ;
+
+  int index_out = ;
+
+  // INSERT CODE: load to shared memory
+
+  // INSERT CODE: Synchronize before start computation
+  __syncthreads();
+
+  // INSERT CODE: 
+
+}
+
+
+// transpose host
+
+void transposeHost(float* gold, float* idata, const  int m, const  int n)
+{
+  for(  int y = 0; y < n; ++y) {
+    for(  int x = 0; x < m; ++x) {
+      gold[(x * n) + y] = idata[(y * m) + x];
+    }
+  }
+}
+
+int main( int argc, char** argv) 
+{
+  int m = SIZE;
+  int n = SIZE;
+
+  if (m%NUM_THREADS != 0 || n%NUM_THREADS != 0) {
+      printf("\nMatrix size must be integral multiple of tile size\nExiting...\n\n");
+      printf("FAILED\n\n");
+      return 1;
+  }
+
+  const  int mem_size = sizeof(float) * m*n;
+
+  // allocate host memory
+  float *h_idata = (float*) malloc(mem_size);
+  float *h_odata = (float*) malloc(mem_size);
+  float *transposeGold = (float *) malloc(mem_size);  
+
+  // allocate device memory
+  float *d_idata, *d_odata;
+  cudaMalloc( (void**) &d_idata, mem_size);
+  cudaMalloc( (void**) &d_odata, mem_size);
+
+  // initalize host data
+  for(  int i = 0; i < (m*n); ++i)
+    h_idata[i] = (float) i;
+  
+  // copy host data to device
+  cudaMemcpy(d_idata, h_idata, mem_size, cudaMemcpyHostToDevice);
+
+  // Compute reference transpose solution
+  transposeHost(transposeGold, h_idata, m, n);
+
+  // print out common data for all kernels
+  printf("\nMatrix size: %dx%d (%dx%d tiles), tile size: %dx%d, block size: %dx%d\n\n", 
+	 m, n, m/NUM_THREADS, n/NUM_THREADS, NUM_THREADS, NUM_THREADS, NUM_THREADS, NUM_THREADS);
+    printf ("mem_size %d \n", mem_size);
+
+  // execution configuration parameters
+  dim3 threads(NUM_THREADS, NUM_THREADS);
+  dim3 grid(m/NUM_THREADS, n/NUM_THREADS);
+
+  // timing
+  float etime, bw;
+
+  // CUDA events
+  cudaEvent_t start, stop;
+
+  // initialize events
+  cudaEventCreate(&start);
+  cudaEventCreate(&stop);
+
+  cudaError_t err_cuda;
+  err_cuda = cudaGetLastError()  ;
+
+
+  // copy kernel
+
+  cudaEventRecord(start);
+  for (int i=0; i < NUM_REPS; i++) {
+      copy<<<grid, threads>>>(d_odata, d_idata, m, n);
+  }
+  cudaEventRecord(stop);
+  cudaEventSynchronize(stop);
+
+  err_cuda = cudaGetLastError() ;
+
+  if(err_cuda != cudaSuccess)  {
+      fprintf(stderr,"%s\n",cudaGetErrorString(err_cuda)) ;
+  }
+
+  cudaEventElapsedTime(&etime, start, stop);
+
+  bw = 2.0f * 1000.0f * mem_size/(1024*1024*1024)/(etime/NUM_REPS);
+
+  printf("%-20s time[ms] %.10e BW[GB/s] %.5f\n", "copy kernel", etime/NUM_REPS, bw);
+
+
+  // transpose naive
+
+  cudaEventRecord(start);
+  for (int i=0; i < NUM_REPS; i++) {
+      transposeNaive<<<grid, threads>>>(d_odata, d_idata, m, n);
+  }
+  cudaEventRecord(stop);
+  cudaEventSynchronize(stop);
+
+  err_cuda = cudaGetLastError() ;
+
+  if(err_cuda != cudaSuccess)  {
+      fprintf(stderr,"%s\n",cudaGetErrorString(err_cuda)) ;
+  }
+
+  cudaEventElapsedTime(&etime, start, stop);
+
+  bw = 2.0f * 1000.0f * mem_size/(1024*1024*1024)/(etime/NUM_REPS);
+
+  printf("%-20s time[ms] %.10e BW[GB/s] %.5f\n", "transpose naive", etime/NUM_REPS, bw);
+
+  cudaMemcpy(h_odata, d_odata, mem_size, cudaMemcpyDeviceToHost);
+
+
+  // transpose shared memory
+
+  cudaEventRecord(start);
+  for (int i=0; i < NUM_REPS; i++) {
+      transposeShared<<<grid, threads>>>(d_odata, d_idata, m, n);
+  }
+  cudaEventRecord(stop);
+  cudaEventSynchronize(stop);
+
+  err_cuda = cudaGetLastError() ;
+
+  if(err_cuda != cudaSuccess)  {
+      fprintf(stderr,"%s\n",cudaGetErrorString(err_cuda)) ;
+  }
+
+  cudaEventElapsedTime(&etime, start, stop);
+
+  bw = 2.0f * 1000.0f * mem_size/(1024*1024*1024)/(etime/NUM_REPS);
+
+  printf("%-20s time[ms] %.10e BW[GB/s] %.5f\n", "transpose shared", etime/NUM_REPS, bw);
+
+  cudaMemcpy(h_odata, d_odata, mem_size, cudaMemcpyDeviceToHost);
+
+
+  // transpose shared memory no bank conflicts
+
+  cudaEventRecord(start);
+  for (int i=0; i < NUM_REPS; i++) {
+      transposeNoBankConflicts<<<grid, threads>>>(d_odata, d_idata, m, n);
+  }
+  cudaEventRecord(stop);
+  cudaEventSynchronize(stop);
+
+  err_cuda = cudaGetLastError() ;
+
+  if(err_cuda != cudaSuccess)  {
+      fprintf(stderr,"%s\n",cudaGetErrorString(err_cuda)) ;
+  }
+
+  cudaEventElapsedTime(&etime, start, stop);
+
+  bw = 2.0f * 1000.0f * mem_size/(1024*1024*1024)/(etime/NUM_REPS);
+
+  printf("%-20s time[ms] %.10e BW[GB/s] %.5f\n", "SM no bank conflicts", etime/NUM_REPS, bw);
+
+  cudaMemcpy(h_odata, d_odata, mem_size, cudaMemcpyDeviceToHost);
+
+  // cleanup
+  free(h_idata);
+  free(h_odata);
+  free(transposeGold);
+  cudaFree(d_idata);
+  cudaFree(d_odata);
+
+  cudaEventDestroy(start);
+  cudaEventDestroy(stop);
+  
+  return 0;
+
+}
diff --git a/cuda/debug/solutions/Makefile b/cuda/debug/solutions/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..3408e7b43707c7b5d7508b840ee4d3a5ba22625c
--- /dev/null
+++ b/cuda/debug/solutions/Makefile
@@ -0,0 +1,66 @@
+PGCC  = nvc -O3 -gpu=cc80 -Minfo=all -acc=noautopar
+PGF90 = nvfortran -O3 -gpu=cc80 -Minfo=all -acc=noautopar
+
+NVCC = nvcc
+NVCC_FLAGS = -arch=sm_80 -lineinfo --ptxas-options=--verbose -Xptxas -dlcm=ca
+# -Xptxas -dlcm=ca
+# -Xptxas -dlcm=cg # L2 only
+
+DIMS = 555
+ITER = 2
+
+NAME := $(sort $(wildcard *.cu))
+NAME := $(filter-out max.cu,$(NAME))
+NAME := $(patsubst %.cu,%,$(NAME))
+
+SRCS  = $(patsubst %,%.cu,$(NAME))
+EXES  = $(patsubst %,%.x,$(NAME))
+NCUS  = $(patsubst %,%.ncu-rep,$(NAME))
+RUNS  = $(patsubst %,%.run,$(NAME))
+METS  = $(patsubst %,%.metrics,$(NAME))
+SANIT = $(patsubst %,%.sanitizer,$(NAME))
+
+.PHONY: all clean cleanall run exe prof # $(NCUS)
+
+all: exe
+
+exe : $(EXES)
+run : $(RUNS)
+ncu : $(NCUS)
+prof: $(METS)
+sanitizer: $(SANIT)
+
+
+help:
+	@echo -e "make exe  \t compile: $(SRCS)"
+	@echo -e "make run  \t run: $(EXES)"
+	@echo -e "make ncu  \t profile: $(EXES)"
+	@echo -e "make prof \t metrics: $(EXES)"
+
+%.x: %.cu
+	@echo -e "\nCOMPILE: $<"
+	$(NVCC) $(NVCC_FLAGS) -o $@ $<
+
+%.run: %.x
+	@echo -e "\nRUN: $<"
+	./$< $(DIMS)
+
+%.ncu-rep: %.x
+	@echo -e "\nNCU: $<"
+	ncu --set full -f -k jacobi -o $(@:.ncu-rep=) ./$< $(DIMS) $(ITER)
+	#ncu --cache-control all --set full -f -k jacobi -o $(@:.ncu-rep=) ./$< $(DIMS) $(ITER)
+
+%.metrics: %.x
+	@echo -e "\nMETRICS: $<"
+	ncu --cache-control all -k jacobi --metrics $(METRICS) ./$< $(DIMS) 1
+
+%.sanitizer: %.x
+	@echo -e "\nSANITIZER: $<"
+	compute-sanitizer ./$< $(DIMS) 1
+
+clean:
+	$(RM) $(EXES) 
+
+cleanall: clean
+	$(RM) $(NCUS)
+
diff --git a/cuda/debug/solutions/jacobi_01.cu b/cuda/debug/solutions/jacobi_01.cu
new file mode 100644
index 0000000000000000000000000000000000000000..fb074ffa50a02b64f3189b46ff166dc8babe72d9
--- /dev/null
+++ b/cuda/debug/solutions/jacobi_01.cu
@@ -0,0 +1,174 @@
+#include <math.h>
+#include <string.h>
+#include <float.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/time.h>
+
+#include <cuda_runtime.h>
+
+#include <cub/device/device_reduce.cuh>
+using namespace cub;
+
+#define NUM_THREADS 32
+
+
+__global__ void jacobi(float * curr, float * next, float * errs, int n, int stride){
+
+    int i = threadIdx.x + blockIdx.x * blockDim.x;
+    int j = threadIdx.y + blockIdx.y * blockDim.y;
+
+    int idx = i + j * stride;
+
+    if ( (i > 0) && (i < n-1) && (j > 0) && (j < n-1) ) {
+
+        next[idx] = 0.25 * (
+                curr[(i-1) + j * stride] +
+                curr[(i+1) + j * stride] +
+                curr[i + (j-1) * stride] +
+                curr[i + (j+1) * stride]
+                ) ;
+
+        errs[idx] = fabs(next[idx] - curr[idx]);
+
+    }
+
+}
+
+
+int main(int argc, char *argv[]) {
+
+    int n = 4096;
+    int stride;
+    int iter_max = 1000;
+    double tol = 1.0e-6;
+
+    if (argc > 1) {
+        n = std::stoi(argv[1]);
+    }
+
+    if (argc > 2) {
+        iter_max = std::stoi(argv[2]);
+    }
+
+    stride = n;
+
+    std::cout << "Dim: " << n << std::endl;
+    std::cout << "Stride: " << stride << std::endl;
+
+    size_t nbytes = sizeof(float) * n * stride;
+
+    float * h_curr = (float *) malloc(nbytes);
+
+    float h_err = 1.;
+
+    float * d_curr;
+    float * d_next;
+    float * d_errs;
+
+    float * d_err;
+
+    cudaMalloc((void **) &d_curr, nbytes);
+    cudaMalloc((void **) &d_next, nbytes);
+    cudaMalloc((void **) &d_errs, nbytes);
+
+    cudaMalloc((void **) &d_err, sizeof(float));
+
+    cudaMemset(d_curr, 0, nbytes);
+    cudaMemset(d_next, 0, nbytes);
+
+    dim3 nThreads(NUM_THREADS, NUM_THREADS);
+    dim3 nBlocks((n-1)/nThreads.x+1, (n-1)/nThreads.y+1);
+
+    // Determine temporary device storage requirements
+    void     *d_temp_storage = NULL;
+    size_t   temp_storage_bytes = 0;
+    cub::DeviceReduce::Max(d_temp_storage, temp_storage_bytes, d_errs, d_err, n*stride);
+    // Allocate temporary storage
+    cudaMalloc(&d_temp_storage, temp_storage_bytes);
+
+    // timing
+    cudaEvent_t start, stop;
+    float etime, time_fd, time_max;
+
+    time_fd = 0.0;
+    time_max = 0.0;
+
+    // create events
+    cudaEventCreate(&start);
+    cudaEventCreate(&stop);
+
+    int iter = 0;
+    struct timeval temp_1, temp_2;
+    double ttime=0.;
+
+    for ( int i = 0 ; i < n ; i++ ) {
+        h_curr[i * stride] = 1.;
+    }
+
+    cudaMemcpy(d_curr, h_curr, nbytes, cudaMemcpyHostToDevice);
+
+    printf("Jacobi relaxation Calculation: %d x %d mesh\n", n, n);
+
+    gettimeofday(&temp_1, (struct timezone*)0);
+
+    while ( h_err > tol && iter < iter_max )
+    {
+
+        cudaEventRecord(start);
+
+        jacobi<<<nBlocks, nThreads>>>(d_curr, d_next, d_errs, n, stride);
+
+        cudaEventRecord(stop);
+        cudaEventSynchronize(stop);
+
+        cudaEventElapsedTime(&etime, start, stop);
+
+        time_fd += etime;
+
+        cudaEventRecord(start);
+
+        cub::DeviceReduce::Max(d_temp_storage, temp_storage_bytes, d_errs, d_err, n*stride);
+
+        cudaEventRecord(stop);
+        cudaEventSynchronize(stop);
+
+        cudaEventElapsedTime(&etime, start, stop);
+
+        time_max += etime;
+
+        cudaMemcpy(&h_err, d_err, sizeof(float), cudaMemcpyDeviceToHost);
+
+        //h_err = 1.e6;
+
+        //if(iter % 10 == 0) printf("%5d, %0.8lf\n", iter, h_err);
+
+        std::swap(d_curr, d_next);
+
+        iter++;
+    }
+
+    gettimeofday(&temp_2, (struct timezone*)0);
+    ttime = 0.000001*((temp_2.tv_sec-temp_1.tv_sec)*1.e6+(temp_2.tv_usec-temp_1 .tv_usec));
+
+    printf("Elapsed time (s) = %.2lf\n", ttime);
+    printf("Stopped at iteration: %u\n", iter);
+
+    time_fd /= iter;
+    time_max /= iter;
+
+    std::cout << "Time FD  / ms " << time_fd << std::endl;
+    std::cout << "Time MAX / ms " << time_max << std::endl;
+
+    free(h_curr);
+
+    cudaFree(d_curr);
+    cudaFree(d_next);
+
+    // release event resources
+    cudaEventDestroy(start);
+    cudaEventDestroy(stop);
+
+    return 0;
+
+}
diff --git a/cuda/debug/solutions/jacobi_02.cu b/cuda/debug/solutions/jacobi_02.cu
new file mode 100644
index 0000000000000000000000000000000000000000..38fc601b752d45389280b5aa4b3af226ff21d57f
--- /dev/null
+++ b/cuda/debug/solutions/jacobi_02.cu
@@ -0,0 +1,178 @@
+#include <math.h>
+#include <string.h>
+#include <float.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/time.h>
+
+#include <cuda_runtime.h>
+
+#include <cub/device/device_reduce.cuh>
+using namespace cub;
+
+#define NUM_THREADS 32
+
+
+__global__ void jacobi(float * curr, float * next, float * errs, int n, int stride){
+
+    int i = threadIdx.x + blockIdx.x * blockDim.x;
+    int j = threadIdx.y + blockIdx.y * blockDim.y;
+
+    int idx = i + j * stride;
+
+    if ( (i > 0) && (i < n-1) && (j > 0) && (j < n-1) ) {
+
+        next[idx] = 0.25 * (
+                curr[(i-1) + j * stride] +
+                curr[(i+1) + j * stride] +
+                curr[i + (j-1) * stride] +
+                curr[i + (j+1) * stride]
+                ) ;
+
+        errs[idx] = fabs(next[idx] - curr[idx]);
+
+    }
+
+}
+
+
+int main(int argc, char *argv[]) {
+
+    int n = 4096;
+    int stride;
+    int iter_max = 1000;
+    double tol = 1.0e-6;
+
+    if (argc > 1) {
+        n = std::stoi(argv[1]);
+    }
+
+    if (argc > 2) {
+        iter_max = std::stoi(argv[2]);
+    }
+
+    stride = n;
+
+    float h_err = 1.;
+
+    float * d_curr;
+    float * d_next;
+    float * d_errs;
+
+    float * d_err;
+
+    size_t pitch;
+
+    cudaMallocPitch(&d_curr, &pitch, n * sizeof(float), n);
+    cudaMallocPitch(&d_next, &pitch, n * sizeof(float), n);
+    cudaMallocPitch(&d_errs, &pitch, n * sizeof(float), n);
+
+    stride = pitch/sizeof(float);
+
+    cudaMalloc((void **) &d_err, sizeof(float));
+
+    size_t nbytes = sizeof(float) * n * stride;
+
+    float * h_curr = (float *) malloc(nbytes);
+
+    cudaMemset(d_curr, 0, nbytes);
+    cudaMemset(d_next, 0, nbytes);
+
+    dim3 nThreads(NUM_THREADS, NUM_THREADS);
+    dim3 nBlocks((n-1)/nThreads.x+1, (n-1)/nThreads.y+1);
+
+    // Determine temporary device storage requirements
+    void     *d_temp_storage = NULL;
+    size_t   temp_storage_bytes = 0;
+    cub::DeviceReduce::Max(d_temp_storage, temp_storage_bytes, d_errs, d_err, n*stride);
+    // Allocate temporary storage
+    cudaMalloc(&d_temp_storage, temp_storage_bytes);
+
+    // timing
+    cudaEvent_t start, stop;
+    float etime, time_fd, time_max;
+
+    time_fd = 0.0;
+    time_max = 0.0;
+
+    // create events
+    cudaEventCreate(&start);
+    cudaEventCreate(&stop);
+
+    int iter = 0;
+    struct timeval temp_1, temp_2;
+    double ttime=0.;
+
+    for ( int i = 0 ; i < n ; i++ ) {
+        h_curr[i * stride] = 1.;
+    }
+
+    cudaMemcpy(d_curr, h_curr, nbytes, cudaMemcpyHostToDevice);
+
+    std::cout << "Dim: " << n << std::endl;
+    std::cout << "Stride: " << stride << std::endl;
+
+    printf("Jacobi relaxation Calculation: %d x %d mesh\n", n, n);
+
+    gettimeofday(&temp_1, (struct timezone*)0);
+
+    while ( h_err > tol && iter < iter_max )
+    {
+
+        cudaEventRecord(start);
+
+        jacobi<<<nBlocks, nThreads>>>(d_curr, d_next, d_errs, n, stride);
+
+        cudaEventRecord(stop);
+        cudaEventSynchronize(stop);
+
+        cudaEventElapsedTime(&etime, start, stop);
+
+        time_fd += etime;
+
+        cudaEventRecord(start);
+
+        cub::DeviceReduce::Max(d_temp_storage, temp_storage_bytes, d_errs, d_err, n*stride);
+
+        cudaEventRecord(stop);
+        cudaEventSynchronize(stop);
+
+        cudaEventElapsedTime(&etime, start, stop);
+
+        time_max += etime;
+
+        cudaMemcpy(&h_err, d_err, sizeof(float), cudaMemcpyDeviceToHost);
+
+        //h_err = 1.e6;
+
+        //if(iter % 10 == 0) printf("%5d, %0.8lf\n", iter, h_err);
+
+        std::swap(d_curr, d_next);
+
+        iter++;
+    }
+
+    gettimeofday(&temp_2, (struct timezone*)0);
+    ttime = 0.000001*((temp_2.tv_sec-temp_1.tv_sec)*1.e6+(temp_2.tv_usec-temp_1 .tv_usec));
+
+    printf("Elapsed time (s) = %.2lf\n", ttime);
+    printf("Stopped at iteration: %u\n", iter);
+
+    time_fd /= iter;
+    time_max /= iter;
+
+    std::cout << "Time FD  / ms " << time_fd << std::endl;
+    std::cout << "Time MAX / ms " << time_max << std::endl;
+
+    free(h_curr);
+
+    cudaFree(d_curr);
+    cudaFree(d_next);
+
+    // release event resources
+    cudaEventDestroy(start);
+    cudaEventDestroy(stop);
+
+    return 0;
+
+}
diff --git a/cuda/debug/solutions/jacobi_03.cu b/cuda/debug/solutions/jacobi_03.cu
new file mode 100644
index 0000000000000000000000000000000000000000..48c9795ad7fd4100c2685b3d4558c969493c5568
--- /dev/null
+++ b/cuda/debug/solutions/jacobi_03.cu
@@ -0,0 +1,178 @@
+#include <math.h>
+#include <string.h>
+#include <float.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/time.h>
+
+#include <cuda_runtime.h>
+
+#include <cub/device/device_reduce.cuh>
+using namespace cub;
+
+#define NUM_THREADS 32
+
+
+__global__ void jacobi(float * curr, float * next, float * errs, int n, int stride){
+
+    int i = threadIdx.x + blockIdx.x * blockDim.x;
+    int j = threadIdx.y + blockIdx.y * blockDim.y;
+
+    int idx = i + j * stride;
+
+    if ( (i > 0) && (i < n-1) && (j > 0) && (j < n-1) ) {
+
+        next[idx] = 0.25 * (
+                curr[(i-1) + j * stride] +
+                curr[(i+1) + j * stride] +
+                curr[i + (j-1) * stride] +
+                curr[i + (j+1) * stride]
+                ) ;
+
+        errs[idx] = fabs(next[idx] - curr[idx]);
+
+    }
+
+}
+
+
+int main(int argc, char *argv[]) {
+
+    int n = 4096;
+    int stride;
+    int iter_max = 1000;
+    double tol = 1.0e-6;
+
+    if (argc > 1) {
+        n = std::stoi(argv[1]);
+    }
+
+    if (argc > 2) {
+        iter_max = std::stoi(argv[2]);
+    }
+
+    stride = n;
+
+    float h_err = 1.;
+
+    float * d_curr;
+    float * d_next;
+    float * d_errs;
+
+    float * d_err;
+
+    size_t cache_line = 128;
+
+    stride = n + (cache_line - (n * sizeof(float)) % cache_line) / sizeof(float);
+
+    size_t nbytes = sizeof(float) * n * stride;
+
+    float * h_curr = (float *) malloc(nbytes);
+
+    cudaMalloc((void **) &d_curr, nbytes);
+    cudaMalloc((void **) &d_next, nbytes);
+    cudaMalloc((void **) &d_errs, nbytes);
+
+    cudaMalloc((void **) &d_err, sizeof(float));
+
+    cudaMemset(d_curr, 0, nbytes);
+    cudaMemset(d_next, 0, nbytes);
+
+    dim3 nThreads(NUM_THREADS, NUM_THREADS);
+    dim3 nBlocks((n-1)/nThreads.x+1, (n-1)/nThreads.y+1);
+
+    // Determine temporary device storage requirements
+    void     *d_temp_storage = NULL;
+    size_t   temp_storage_bytes = 0;
+    cub::DeviceReduce::Max(d_temp_storage, temp_storage_bytes, d_errs, d_err, n*stride);
+    // Allocate temporary storage
+    cudaMalloc(&d_temp_storage, temp_storage_bytes);
+
+    // timing
+    cudaEvent_t start, stop;
+    float etime, time_fd, time_max;
+
+    time_fd = 0.0;
+    time_max = 0.0;
+
+    // create events
+    cudaEventCreate(&start);
+    cudaEventCreate(&stop);
+
+    int iter = 0;
+    struct timeval temp_1, temp_2;
+    double ttime=0.;
+
+    for ( int i = 0 ; i < n ; i++ ) {
+        h_curr[i * stride] = 1.;
+    }
+
+    cudaMemcpy(d_curr, h_curr, nbytes, cudaMemcpyHostToDevice);
+
+    std::cout << "Dim: " << n << std::endl;
+    std::cout << "Stride: " << stride << std::endl;
+
+    printf("Jacobi relaxation Calculation: %d x %d mesh\n", n, n);
+
+    gettimeofday(&temp_1, (struct timezone*)0);
+
+    while ( h_err > tol && iter < iter_max )
+    {
+
+        cudaEventRecord(start);
+
+        jacobi<<<nBlocks, nThreads>>>(d_curr, d_next, d_errs, n, stride);
+
+        cudaEventRecord(stop);
+        cudaEventSynchronize(stop);
+
+        cudaEventElapsedTime(&etime, start, stop);
+
+        time_fd += etime;
+
+        cudaEventRecord(start);
+
+        cub::DeviceReduce::Max(d_temp_storage, temp_storage_bytes, d_errs, d_err, n*stride);
+
+        cudaEventRecord(stop);
+        cudaEventSynchronize(stop);
+
+        cudaEventElapsedTime(&etime, start, stop);
+
+        time_max += etime;
+
+        cudaMemcpy(&h_err, d_err, sizeof(float), cudaMemcpyDeviceToHost);
+
+        //h_err = 1.e6;
+
+        //if(iter % 10 == 0) printf("%5d, %0.8lf\n", iter, h_err);
+
+        std::swap(d_curr, d_next);
+
+        iter++;
+    }
+
+    gettimeofday(&temp_2, (struct timezone*)0);
+    ttime = 0.000001*((temp_2.tv_sec-temp_1.tv_sec)*1.e6+(temp_2.tv_usec-temp_1 .tv_usec));
+
+    printf("Elapsed time (s) = %.2lf\n", ttime);
+    printf("Stopped at iteration: %u\n", iter);
+
+    time_fd /= iter;
+    time_max /= iter;
+
+    std::cout << "Time FD  / ms " << time_fd << std::endl;
+    std::cout << "Time MAX / ms " << time_max << std::endl;
+
+    free(h_curr);
+
+    cudaFree(d_curr);
+    cudaFree(d_next);
+
+    // release event resources
+    cudaEventDestroy(start);
+    cudaEventDestroy(stop);
+
+    return 0;
+
+}
diff --git a/cuda/debug/solutions/kernel_copy.cu b/cuda/debug/solutions/kernel_copy.cu
new file mode 100644
index 0000000000000000000000000000000000000000..ab5729db47de5087aebaf245ae68dcef0d5a8b3a
--- /dev/null
+++ b/cuda/debug/solutions/kernel_copy.cu
@@ -0,0 +1,150 @@
+#include <stdio.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+#define BLOCK_DIM 256
+
+#define REPEAT 1
+
+
+__global__ void strideCopy(float *odata, float* idata, int stride)
+{
+    int xid = (blockIdx.x * blockDim.x + threadIdx.x) * stride;
+    odata[xid] = idata[xid];
+}
+
+__global__ void offsetCopy(float *odata, float* idata, int offset)
+{
+    int xid = blockIdx.x * blockDim.x + threadIdx.x + offset;
+    odata[xid] = idata[xid];
+}
+
+void init_data(float * buffer, int size) {
+   for ( int i = 0 ; i < size ; i++ ) {
+      buffer[i] = (float) i;
+   }
+}
+
+int main (int argc, char *argv[]) {
+  
+   int buffer_size;
+
+   float * h_buffer;
+
+   float * d_inp;
+   float * d_out;
+
+   cudaEvent_t start, end;
+   float eventEtime, eventBandwidth;
+
+   int size = 65535;
+   size *= BLOCK_DIM;
+
+   int stride = 1;
+   int offset = 0;
+
+   if (argc > 1) {
+      stride = atoi(argv[1]);
+   }
+
+   if (argc > 2) {
+      offset = atoi(argv[2]);
+   }
+
+   buffer_size = (size+offset)*stride;
+
+   dim3 block (BLOCK_DIM);
+   dim3 grid  ( (size-1)/block.x+1 );
+
+   printf("Number of Elemets = %d\n", size);
+   printf("Stride            = %d\n", stride);
+   printf("Offset            = %d\n", offset);
+   printf("Buffer size       = %d MB\n", sizeof(float)*buffer_size/(1<<20));
+
+   printf("Block size        = %d\n", block.x);
+   printf("Grid size         = %d\n", grid.x);
+
+   printf("Repeat            = %d\n", REPEAT);
+
+//-- insert C code -------------
+// allocate memory on host
+   h_buffer = (float *) malloc (sizeof(float)*buffer_size);
+//------------------------------
+
+   init_data (h_buffer, buffer_size);
+
+//-- insert CUDA code ----------
+// creat cuda events (start, end) for timing
+   cudaEventCreate(&start);
+   cudaEventCreate(&end);
+//------------------------------
+
+//-- insert CUDA code ----------
+// allocate memory buffers on selected GPU
+   cudaMalloc((void**)&d_inp, sizeof(float)*buffer_size);
+   cudaMalloc((void**)&d_out, sizeof(float)*buffer_size);
+//------------------------------
+
+   cudaMemcpy(d_inp, h_buffer, sizeof(float)*buffer_size, cudaMemcpyHostToDevice);
+
+   printf ("\nStride Copy : Stride = %d\n", stride);
+
+   cudaEventRecord(start);
+
+//-- insert CUDA code ----------
+// launch stride copy kernel
+   for ( int i=0 ; i < REPEAT ; i++ ) {
+      strideCopy <<<grid, block>>> (d_out, d_inp, stride);
+   }
+//------------------------------
+
+   cudaEventRecord(end);
+   cudaEventSynchronize(end);
+   cudaEventElapsedTime(&eventEtime, start, end);
+
+   eventEtime /= REPEAT;
+
+   eventBandwidth = (float) ( 2.0F*size*sizeof(float) ) / ( eventEtime );
+   printf("Time / ms        = %.1f\n", eventEtime);
+   printf("Bandwidth / GB/s = %.2f\n", eventBandwidth / 1.e6);
+
+   printf ("\nOffset Copy : Offset = %d\n", offset);
+
+   cudaEventRecord(start);
+
+//-- insert CUDA code ----------
+// launch offset copy kernel
+   for ( int i=0 ; i < REPEAT ; i++ ) {
+      offsetCopy <<<grid, block>>> (d_out, d_inp, offset);
+   }
+//------------------------------
+
+   cudaEventRecord(end);
+   cudaEventSynchronize(end);
+   cudaEventElapsedTime(&eventEtime, start, end);
+
+   eventEtime /= REPEAT;
+
+   eventBandwidth = (float) ( 2.0F*size*sizeof(float) ) / ( eventEtime );
+   printf("Time / ms        = %.1f\n", eventEtime);
+   printf("Bandwidth / GB/s = %.2f\n", eventBandwidth / 1.e6);
+
+   cudaMemcpy(h_buffer, d_out, sizeof(float)*buffer_size, cudaMemcpyDeviceToHost);
+
+//-- insert CUDA code ----------
+// free resources on device (memory buffers and events)
+   cudaEventDestroy(start);
+   cudaEventDestroy(end);
+
+   cudaFree(d_inp);
+   cudaFree(d_out);
+//----------------------------
+
+//-- insert C coda -----------
+// free resources on host
+   free(h_buffer);
+//------------------------------
+
+   return 0;
+}
+
diff --git a/cuda/debug/solutions/matrix_mul.cu b/cuda/debug/solutions/matrix_mul.cu
new file mode 100644
index 0000000000000000000000000000000000000000..c99c63bbdd1e701a1b65e3017a767d42c508c526
--- /dev/null
+++ b/cuda/debug/solutions/matrix_mul.cu
@@ -0,0 +1,160 @@
+#include <stdio.h>
+#include <assert.h>
+#define epsilon (float)1e-5
+
+#define NUM_THREADS 16
+
+void matrixMulHost(float* A, float* B, float* C, int n) {
+  for (int i = 0; i < n; ++i) {
+    for (int j = 0; j < n; ++j) {
+      float pvalue = 0;
+      for (int k = 0; k < n; ++k) {
+        float a = A[i * n + k];
+        float b = B[k * n + j];
+        pvalue += a * b;
+      }
+      C[i * n + j] = pvalue;
+    }
+  }
+}
+
+__global__ void MatrixMulKernel(float* A, float* B, float* C, int n) {
+
+   // INSERT CODE: index definitions
+   int j = blockDim.x * blockIdx.x + threadIdx.x;
+   int i = blockDim.y * blockIdx.y + threadIdx.y;
+
+   // INSERT CODE: fastest index to threads along x direction
+   if (i < n && j < n) {
+      float pvalue = 0;
+      for (int k = 0; k < n; ++k) {
+        float a = A[i * n + k];
+        float b = B[k * n + j];
+        pvalue += a * b;
+      }
+      C[i * n + j] = pvalue;
+   }
+}
+
+int main(int argc, char** argv) {
+
+  int n;
+
+  float *h_A, *h_B, *h_C;
+  float *h_ref;
+
+  if(argc<2) {
+    fprintf(stderr,"Usage: %s Width\n",argv[0]);
+    exit(1);
+  }
+
+  n=atoi(argv[1]);
+
+  if(n<1) {
+    fprintf(stderr,"Error Width=%d, must be > 0\n",n);
+    exit(1);
+  }
+
+  size_t nbytes = n * n * sizeof(float);
+
+  h_A = (float *) malloc(nbytes);
+  h_B = (float *) malloc(nbytes);
+  h_C = (float *) malloc(nbytes);
+
+  h_ref = (float *) malloc(nbytes);
+
+  memset(h_C, 0, nbytes);
+  memset(h_ref, 0, nbytes);
+
+  for(int y = 0; y < n; y++){
+    for(int x = 0; x < n; x++) {
+      h_A[y * n + x]=(float)(((y + 1) * n + x + 1)/n);
+      h_B[y * n + x]=(float)(((y + 1) * n + x + 1)/n);
+    }
+  }
+
+  float flops;
+  float *d_A, *d_B, *d_C;
+
+  // CUDA grid management
+  dim3 threads(NUM_THREADS, NUM_THREADS);
+  dim3 blocks(n/NUM_THREADS,n/NUM_THREADS);
+
+  cudaMalloc(&d_A, nbytes);
+  cudaMalloc(&d_B, nbytes);
+  cudaMalloc(&d_C, nbytes);
+
+  cudaMemcpy(d_A, h_A, nbytes, cudaMemcpyHostToDevice);
+  cudaMemcpy(d_B, h_B, nbytes, cudaMemcpyHostToDevice);
+
+  // cudaGetLastError call to reset previous CUDA errors
+  // INSERT CODE
+  cudaError_t mycudaerror  ;
+  mycudaerror = cudaGetLastError()  ;
+
+  // Create start and stop CUDA events 
+  // INSERT CODE
+  cudaEvent_t start, stop;
+
+  cudaEventCreate(&start);
+  cudaEventCreate(&stop);
+
+  cudaEventRecord(start);
+
+  // kernel launch
+  MatrixMulKernel<<<blocks, threads>>>(d_A, d_B, d_C, n);
+
+  // event record, synchronization, elapsed time and destruction
+  // INSERT CODE
+  cudaEventRecord(stop);
+  cudaEventSynchronize(stop);
+
+  // device synchronization and cudaGetLastError call
+  // INSERT CODE
+  mycudaerror = cudaGetLastError() ;
+  if(mycudaerror != cudaSuccess)  {
+    fprintf(stderr,"%s\n",cudaGetErrorString(mycudaerror)) ;
+    exit(1);
+  }
+
+  float elapsed;
+  cudaEventElapsedTime(&elapsed, start, stop);
+  elapsed = elapsed/1000.f; // convert to seconds
+
+  // calculate Mflops
+  // INSERT CODE
+  printf("Kernel elapsed time %fs \n", elapsed);
+  flops = 2.*n*n*n ;
+  flops = flops/(1.e9*elapsed) ;
+  printf("Gflops: %f\n", flops);
+
+  // copy back results from device
+  cudaMemcpy(h_C, d_C, nbytes, cudaMemcpyDeviceToHost);
+
+  // free memory on device
+  cudaFree(d_A);
+  cudaFree(d_B);
+  cudaFree(d_C);
+
+  cudaEventDestroy(start);
+  cudaEventDestroy(stop);
+
+  matrixMulHost(h_A, h_B, h_ref, n);
+
+  int errCnt = 0;
+  for(int y = 0; y < n; y++){
+    for(int x = 0; x < n; x++) {
+      float it = h_ref[y * n + x];
+      if(fabs(it - h_C[y * n + x]) > epsilon * it) {
+        printf("failing x=%d, y=%d: %f!=%f \n", x, y, it, h_C[y * n + x]);
+        errCnt++;
+      }
+    }
+  }
+
+  if(errCnt==0)
+    printf("\nTEST PASSED\n");
+  else
+    printf("\n\nTEST FAILED: number of errors:  %d\n", errCnt);
+
+}
diff --git a/cuda/debug/solutions/matrix_mul_shared.cu b/cuda/debug/solutions/matrix_mul_shared.cu
new file mode 100644
index 0000000000000000000000000000000000000000..129dde71bf4673ed6eb90a17ed66877eb9d8f195
--- /dev/null
+++ b/cuda/debug/solutions/matrix_mul_shared.cu
@@ -0,0 +1,239 @@
+#include <stdio.h>
+#include <assert.h>
+#define epsilon (float)1e-5
+
+#define NUM_THREADS 16
+
+void matrixMulHost(float* A, float* B, float* C, int n) {
+  for (int i = 0; i < n; ++i) {
+    for (int j = 0; j < n; ++j) {
+      float pvalue = 0;
+      for (int k = 0; k < n; ++k) {
+        float a = A[i * n + k];
+        float b = B[k * n + j];
+        pvalue += a * b;
+      }
+      C[i * n + j] = pvalue;
+    }
+  }
+}
+
+__global__ void MatrixMulKernel(float* A, float* B, float* C, int n) {
+
+   // INSERT CODE: index definitions
+   int j = blockDim.x * blockIdx.x + threadIdx.x;
+   int i = blockDim.y * blockIdx.y + threadIdx.y;
+
+   // INSERT CODE: fastest index to threads along x direction
+   if (i < n && j < n) {
+      float pvalue = 0;
+      for (int k = 0; k < n; ++k) {
+        float a = A[i * n + k];
+        float b = B[k * n + j];
+        pvalue += a * b;
+      }
+      C[i * n + j] = pvalue;
+   }
+}
+
+__device__ int get_offset (int idx_i, int idx_j, int N) {
+   return idx_i * N * NUM_THREADS + idx_j * NUM_THREADS;
+}
+
+__global__ void MatrixMulKernelShared(float *A, float *B, float *C, int N) {
+
+   // Shared memory used to store Asub and Bsub respectively
+   __shared__ float As[NUM_THREADS][NUM_THREADS];
+   __shared__ float Bs[NUM_THREADS][NUM_THREADS];
+ 
+   // Block row and column 
+   int ib = blockIdx.y;
+   int jb = blockIdx.x;
+
+   // Thread row and column within Csub 
+   int it = threadIdx.y;
+   int jt = threadIdx.x;
+
+   int a_offset, b_offset, c_offset;
+
+   // Each thread computes one element of Csub
+   // by accumulating results into Cvalue
+   float Cvalue = 0.0f;
+
+   // Loop over all the sub-matrices of A and B that are 
+   // required to compute Csub. 
+   // Multiply each pair of sub-matrices together
+   // and accumulate the results. 
+
+   for (int kb = 0; kb < (N / NUM_THREADS); ++kb) {
+
+      // Get the starting address (a_offset) of Asub
+      // (sub-matrix of A of dimension NB x NB)
+      // Asub is located i_block sub-matrices to the right and
+      // k_block sub-matrices down from the upper-left corner of A
+      a_offset = get_offset (ib, kb, N);
+      // Get the starting address (b_offset) of Bsub
+      b_offset = get_offset (kb, jb, N);
+
+      // Load Asub and Bsub from device memory to shared memory
+      // Each thread loads one element of each sub-matrix
+      // ---------------- //
+      // INSERT CUDA CODE //
+      // ---------------- //
+      As[it][jt] = A[a_offset + it*N + jt];
+      Bs[it][jt] = B[b_offset + it*N + jt];
+
+      // Synchronize to make sure the sub-matrices are loaded
+      // before starting the computation
+      // ---------------- //
+      // INSERT CUDA CODE //
+      // ---------------- //
+      __syncthreads();
+
+      // Multiply As and Bs together
+      for (int k = 0; k < NUM_THREADS; ++k) {
+         // ---------------- //
+         // INSERT CUDA CODE //
+         // ---------------- //
+         Cvalue += As[it][k] * Bs[k][jt]; 
+      }
+      // Synchronize to make sure that the preceding
+      // computation is done before loading two new
+      // sub-matrices of A and B in the next iteration
+      // ---------------- //
+      // INSERT CUDA CODE //
+      // ---------------- //
+      __syncthreads();
+   }
+   
+   c_offset = get_offset (ib, jb, N);
+   // Each thread block computes one sub-matrix Csub of C
+   // ---------------- //
+   // INSERT CUDA CODE //
+   // ---------------- //
+   C[c_offset + it * N + jt] = Cvalue;
+
+}
+
+int main(int argc, char** argv) {
+
+  int n;
+
+  float *h_A, *h_B, *h_C;
+  float *h_ref;
+
+  if(argc<2) {
+    fprintf(stderr,"Usage: %s Width\n",argv[0]);
+    exit(1);
+  }
+
+  n=atoi(argv[1]);
+
+  if(n<1) {
+    fprintf(stderr,"Error Width=%d, must be > 0\n",n);
+    exit(1);
+  }
+
+  size_t nbytes = n * n * sizeof(float);
+
+  h_A = (float *) malloc(nbytes);
+  h_B = (float *) malloc(nbytes);
+  h_C = (float *) malloc(nbytes);
+
+  h_ref = (float *) malloc(nbytes);
+
+  memset(h_C, 0, nbytes);
+  memset(h_ref, 0, nbytes);
+
+  for(int y = 0; y < n; y++){
+    for(int x = 0; x < n; x++) {
+      h_A[y * n + x]=(float)(((y + 1) * n + x + 1)/n);
+      h_B[y * n + x]=(float)(((y + 1) * n + x + 1)/n);
+    }
+  }
+
+  float flops;
+  float *d_A, *d_B, *d_C;
+
+  // CUDA grid management
+  dim3 threads(NUM_THREADS, NUM_THREADS);
+  dim3 blocks(n/NUM_THREADS,n/NUM_THREADS);
+
+  cudaMalloc(&d_A, nbytes);
+  cudaMalloc(&d_B, nbytes);
+  cudaMalloc(&d_C, nbytes);
+
+  cudaMemcpy(d_A, h_A, nbytes, cudaMemcpyHostToDevice);
+  cudaMemcpy(d_B, h_B, nbytes, cudaMemcpyHostToDevice);
+
+  // cudaGetLastError call to reset previous CUDA errors
+  // INSERT CODE
+  cudaError_t mycudaerror  ;
+  mycudaerror = cudaGetLastError()  ;
+
+  // Create start and stop CUDA events 
+  // INSERT CODE
+  cudaEvent_t start, stop;
+
+  cudaEventCreate(&start);
+  cudaEventCreate(&stop);
+
+  cudaEventRecord(start);
+
+  // kernel launch
+  MatrixMulKernelShared<<<blocks, threads>>>(d_A, d_B, d_C, n);
+
+  // event record, synchronization, elapsed time and destruction
+  // INSERT CODE
+  cudaEventRecord(stop);
+  cudaEventSynchronize(stop);
+
+  // device synchronization and cudaGetLastError call
+  // INSERT CODE
+  mycudaerror = cudaGetLastError() ;
+  if(mycudaerror != cudaSuccess)  {
+    fprintf(stderr,"%s\n",cudaGetErrorString(mycudaerror)) ;
+    exit(1);
+  }
+
+  float elapsed;
+  cudaEventElapsedTime(&elapsed, start, stop);
+  elapsed = elapsed/1000.f; // convert to seconds
+
+  // calculate Mflops
+  // INSERT CODE
+  printf("Kernel elapsed time %fs \n", elapsed);
+  flops = 2.*n*n*n ;
+  flops = flops/(1.e9*elapsed) ;
+  printf("Gflops: %f\n", flops);
+
+  // copy back results from device
+  cudaMemcpy(h_C, d_C, nbytes, cudaMemcpyDeviceToHost);
+
+  // free memory on device
+  cudaFree(d_A);
+  cudaFree(d_B);
+  cudaFree(d_C);
+
+  cudaEventDestroy(start);
+  cudaEventDestroy(stop);
+
+  matrixMulHost(h_A, h_B, h_ref, n);
+
+  int errCnt = 0;
+  for(int y = 0; y < n; y++){
+    for(int x = 0; x < n; x++) {
+      float it = h_ref[y * n + x];
+      if(fabs(it - h_C[y * n + x]) > epsilon * it) {
+        printf("failing x=%d, y=%d: %f!=%f \n", x, y, it, h_C[y * n + x]);
+        errCnt++;
+      }
+    }
+  }
+
+  if(errCnt==0)
+    printf("\nTEST PASSED\n");
+  else
+    printf("\n\nTEST FAILED: number of errors:  %d\n", errCnt);
+
+}
diff --git a/cuda/debug/solutions/matrix_transpose.cu b/cuda/debug/solutions/matrix_transpose.cu
new file mode 100644
index 0000000000000000000000000000000000000000..c3ed67f58133fe5b63bd97d0aa31b73a576afba3
--- /dev/null
+++ b/cuda/debug/solutions/matrix_transpose.cu
@@ -0,0 +1,252 @@
+
+#define SIZE         512
+#define NUM_THREADS   32
+#define NUM_REPS       3
+
+#include <stdio.h>
+
+
+__global__ void copy(float *odata, float* idata, int width, int height)
+{
+  int i = blockIdx.x * blockDim.x + threadIdx.x;
+  int j = blockIdx.y * blockDim.y + threadIdx.y;
+
+  int index  = i + j * width;
+
+  odata[index] = idata[index];
+}
+
+
+// transpose naive
+
+__global__ void transposeNaive(float *odata, float* idata, int width, int height)
+{
+  int xIndex = blockIdx.x * NUM_THREADS + threadIdx.x;
+  int yIndex = blockIdx.y * NUM_THREADS + threadIdx.y;
+
+  int index_in  = xIndex + width * yIndex;
+  int index_out = yIndex + height * xIndex;
+
+  odata[index_out] = idata[index_in];
+}
+
+// transpose in shared memory
+
+__global__ void transposeShared(float *odata, float *idata, int width, int height)
+{
+  __shared__ float tile[NUM_THREADS][NUM_THREADS];
+
+  int xIndex = blockIdx.x * NUM_THREADS + threadIdx.x;
+  int yIndex = blockIdx.y * NUM_THREADS + threadIdx.y;  
+
+  int index_in = xIndex + (yIndex)*width;
+
+  xIndex = blockIdx.y * NUM_THREADS + threadIdx.x;
+  yIndex = blockIdx.x * NUM_THREADS + threadIdx.y;
+
+  int index_out = xIndex + (yIndex)*height;
+
+  tile[threadIdx.y][threadIdx.x] = idata[index_in];
+
+  __syncthreads();
+
+  odata[index_out] = tile[threadIdx.x][threadIdx.y];
+}
+
+// transpose in shared memory with no bank conflicts
+
+__global__ void transposeNoBankConflicts(float *odata, float *idata, int width, int height)
+{
+  __shared__ float tile[NUM_THREADS][NUM_THREADS+1];
+
+  int xIndex = blockIdx.x * NUM_THREADS + threadIdx.x;
+  int yIndex = blockIdx.y * NUM_THREADS + threadIdx.y;  
+
+  int index_in = xIndex + (yIndex)*width;
+
+  xIndex = blockIdx.y * NUM_THREADS + threadIdx.x;
+  yIndex = blockIdx.x * NUM_THREADS + threadIdx.y;
+
+  int index_out = xIndex + (yIndex)*height;
+
+  tile[threadIdx.y][threadIdx.x] = idata[index_in];
+
+  __syncthreads();
+
+  odata[index_out] = tile[threadIdx.x][threadIdx.y];
+}
+
+
+// transpose host
+
+void transposeHost(float* gold, float* idata, const  int m, const  int n)
+{
+  for(  int y = 0; y < n; ++y) {
+    for(  int x = 0; x < m; ++x) {
+      gold[(x * n) + y] = idata[(y * m) + x];
+    }
+  }
+}
+
+int main( int argc, char** argv) 
+{
+  int m = SIZE;
+  int n = SIZE;
+
+  if (m%NUM_THREADS != 0 || n%NUM_THREADS != 0) {
+      printf("\nMatrix size must be integral multiple of tile size\nExiting...\n\n");
+      printf("FAILED\n\n");
+      return 1;
+  }
+
+  const  int mem_size = sizeof(float) * m*n;
+
+  // allocate host memory
+  float *h_idata = (float*) malloc(mem_size);
+  float *h_odata = (float*) malloc(mem_size);
+  float *transposeGold = (float *) malloc(mem_size);  
+
+  // allocate device memory
+  float *d_idata, *d_odata;
+  cudaMalloc( (void**) &d_idata, mem_size);
+  cudaMalloc( (void**) &d_odata, mem_size);
+
+  // initalize host data
+  for(  int i = 0; i < (m*n); ++i)
+    h_idata[i] = (float) i;
+  
+  // copy host data to device
+  cudaMemcpy(d_idata, h_idata, mem_size, cudaMemcpyHostToDevice);
+
+  // Compute reference transpose solution
+  transposeHost(transposeGold, h_idata, m, n);
+
+  // print out common data for all kernels
+  printf("\nMatrix size: %dx%d (%dx%d tiles), tile size: %dx%d, block size: %dx%d\n\n", 
+	 m, n, m/NUM_THREADS, n/NUM_THREADS, NUM_THREADS, NUM_THREADS, NUM_THREADS, NUM_THREADS);
+    printf ("mem_size %d \n", mem_size);
+
+  // execution configuration parameters
+  dim3 threads(NUM_THREADS, NUM_THREADS);
+  dim3 grid(m/NUM_THREADS, n/NUM_THREADS);
+
+  // timing
+  float etime, bw;
+
+  // CUDA events
+  cudaEvent_t start, stop;
+
+  // initialize events
+  cudaEventCreate(&start);
+  cudaEventCreate(&stop);
+
+  cudaError_t err_cuda;
+  err_cuda = cudaGetLastError()  ;
+
+
+  // copy kernel
+
+  cudaEventRecord(start);
+  for (int i=0; i < NUM_REPS; i++) {
+      copy<<<grid, threads>>>(d_odata, d_idata, m, n);
+  }
+  cudaEventRecord(stop);
+  cudaEventSynchronize(stop);
+
+  err_cuda = cudaGetLastError() ;
+
+  if(err_cuda != cudaSuccess)  {
+      fprintf(stderr,"%s\n",cudaGetErrorString(err_cuda)) ;
+  }
+
+  cudaEventElapsedTime(&etime, start, stop);
+
+  bw = 2.0f * 1000.0f * mem_size/(1024*1024*1024)/(etime/NUM_REPS);
+
+  printf("%-20s time[ms] %.10e BW[GB/s] %.5f\n", "copy kernel", etime/NUM_REPS, bw);
+
+
+  // transpose naive
+
+  cudaEventRecord(start);
+  for (int i=0; i < NUM_REPS; i++) {
+      transposeNaive<<<grid, threads>>>(d_odata, d_idata, m, n);
+  }
+  cudaEventRecord(stop);
+  cudaEventSynchronize(stop);
+
+  err_cuda = cudaGetLastError() ;
+
+  if(err_cuda != cudaSuccess)  {
+      fprintf(stderr,"%s\n",cudaGetErrorString(err_cuda)) ;
+  }
+
+  cudaEventElapsedTime(&etime, start, stop);
+
+  bw = 2.0f * 1000.0f * mem_size/(1024*1024*1024)/(etime/NUM_REPS);
+
+  printf("%-20s time[ms] %.10e BW[GB/s] %.5f\n", "transpose naive", etime/NUM_REPS, bw);
+
+  cudaMemcpy(h_odata, d_odata, mem_size, cudaMemcpyDeviceToHost);
+
+
+  // transpose shared memory
+
+  cudaEventRecord(start);
+  for (int i=0; i < NUM_REPS; i++) {
+      transposeShared<<<grid, threads>>>(d_odata, d_idata, m, n);
+  }
+  cudaEventRecord(stop);
+  cudaEventSynchronize(stop);
+
+  err_cuda = cudaGetLastError() ;
+
+  if(err_cuda != cudaSuccess)  {
+      fprintf(stderr,"%s\n",cudaGetErrorString(err_cuda)) ;
+  }
+
+  cudaEventElapsedTime(&etime, start, stop);
+
+  bw = 2.0f * 1000.0f * mem_size/(1024*1024*1024)/(etime/NUM_REPS);
+
+  printf("%-20s time[ms] %.10e BW[GB/s] %.5f\n", "transpose shared", etime/NUM_REPS, bw);
+
+  cudaMemcpy(h_odata, d_odata, mem_size, cudaMemcpyDeviceToHost);
+
+
+  // transpose shared memory no bank conflicts
+
+  cudaEventRecord(start);
+  for (int i=0; i < NUM_REPS; i++) {
+      transposeNoBankConflicts<<<grid, threads>>>(d_odata, d_idata, m, n);
+  }
+  cudaEventRecord(stop);
+  cudaEventSynchronize(stop);
+
+  err_cuda = cudaGetLastError() ;
+
+  if(err_cuda != cudaSuccess)  {
+      fprintf(stderr,"%s\n",cudaGetErrorString(err_cuda)) ;
+  }
+
+  cudaEventElapsedTime(&etime, start, stop);
+
+  bw = 2.0f * 1000.0f * mem_size/(1024*1024*1024)/(etime/NUM_REPS);
+
+  printf("%-20s time[ms] %.10e BW[GB/s] %.5f\n", "SM no bank conflicts", etime/NUM_REPS, bw);
+
+  cudaMemcpy(h_odata, d_odata, mem_size, cudaMemcpyDeviceToHost);
+
+  // cleanup
+  free(h_idata);
+  free(h_odata);
+  free(transposeGold);
+  cudaFree(d_idata);
+  cudaFree(d_odata);
+
+  cudaEventDestroy(start);
+  cudaEventDestroy(stop);
+  
+  return 0;
+
+}