diff --git a/cuda/debug/Makefile b/cuda/debug/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..8ae7c9f28ba7baf815b58391441b1fad9ac670dc --- /dev/null +++ b/cuda/debug/Makefile @@ -0,0 +1,20 @@ +NVCC = nvcc +NVCC_FLAGS = -arch=sm_80 -lineinfo --ptxas-options=--verbose + +SRCS := $(wildcard *.cu) +EXES = $(patsubst %.cu,%.x,$(SRCS)) + +all: exe + +exe : $(EXES) + +help: + @echo -e "make exe \t compile: $(SRCS)" + +%.x: %.cu + @echo -e "\nCOMPILE: $<" + $(NVCC) $(NVCC_FLAGS) -o $@ $< + +clean: + $(RM) $(EXES) + diff --git a/cuda/debug/jacobi.cu b/cuda/debug/jacobi.cu new file mode 100644 index 0000000000000000000000000000000000000000..7812c65db3a53fcc6189169a6c46469ada556dc2 --- /dev/null +++ b/cuda/debug/jacobi.cu @@ -0,0 +1,168 @@ +#include <math.h> +#include <string.h> +#include <float.h> +#include <stdio.h> +#include <stdlib.h> +#include <sys/time.h> + +#include <cuda_runtime.h> + +#include <cub/device/device_reduce.cuh> +using namespace cub; + +#define NUM_THREADS 32 + + +__global__ void jacobi(float * curr, float * next, float * errs, int n, int stride){ + + // INSERT CODE: index definitions + int i = ; + int j = ; + + int idx = ; + + // INSERT CODE: + if ( ) { + + // INSERT CODE: + next[idx] = 0.25 * ( ) ; + + errs[idx] = fabs(next[idx] - curr[idx]); + + } + +} + + +int main(int argc, char *argv[]) { + + int n = 4096; + int stride; + int iter_max = 1000; + double tol = 1.0e-6; + + if (argc > 1) { + n = std::stoi(argv[1]); + } + + if (argc > 2) { + iter_max = std::stoi(argv[2]); + } + + stride = n; + + std::cout << "Dim: " << n << std::endl; + std::cout << "Stride: " << stride << std::endl; + + size_t nbytes = sizeof(float) * n * stride; + + float * h_curr = (float *) malloc(nbytes); + + float h_err = 1.; + + float * d_curr; + float * d_next; + float * d_errs; + + float * d_err; + + cudaMalloc((void **) &d_curr, nbytes); + cudaMalloc((void **) &d_next, nbytes); + cudaMalloc((void **) &d_errs, nbytes); + + cudaMalloc((void **) &d_err, sizeof(float)); + + cudaMemset(d_curr, 0, nbytes); + cudaMemset(d_next, 0, nbytes); + + dim3 nThreads(NUM_THREADS, NUM_THREADS); + dim3 nBlocks((n-1)/nThreads.x+1, (n-1)/nThreads.y+1); + + // Determine temporary device storage requirements + void *d_temp_storage = NULL; + size_t temp_storage_bytes = 0; + cub::DeviceReduce::Max(d_temp_storage, temp_storage_bytes, d_errs, d_err, n*stride); + // Allocate temporary storage + cudaMalloc(&d_temp_storage, temp_storage_bytes); + + // timing + cudaEvent_t start, stop; + float etime, time_fd, time_max; + + time_fd = 0.0; + time_max = 0.0; + + // create events + // INSERT CODE: + + int iter = 0; + struct timeval temp_1, temp_2; + double ttime=0.; + + for ( int i = 0 ; i < n ; i++ ) { + h_curr[i * stride] = 1.; + } + + cudaMemcpy(d_curr, h_curr, nbytes, cudaMemcpyHostToDevice); + + printf("Jacobi relaxation Calculation: %d x %d mesh\n", n, n); + + gettimeofday(&temp_1, (struct timezone*)0); + + while ( h_err > tol && iter < iter_max ) + { + + cudaEventRecord(start); + + jacobi<<<nBlocks, nThreads>>>(d_curr, d_next, d_errs, n, stride); + + cudaEventRecord(stop); + cudaEventSynchronize(stop); + + cudaEventElapsedTime(&etime, start, stop); + + time_fd += etime; + + cudaEventRecord(start); + + cub::DeviceReduce::Max(d_temp_storage, temp_storage_bytes, d_errs, d_err, n*stride); + + cudaEventRecord(stop); + cudaEventSynchronize(stop); + + cudaEventElapsedTime(&etime, start, stop); + + time_max += etime; + + cudaMemcpy(&h_err, d_err, sizeof(float), cudaMemcpyDeviceToHost); + + //if(iter % 10 == 0) printf("%5d, %0.8lf\n", iter, h_err); + + std::swap(d_curr, d_next); + + iter++; + } + + gettimeofday(&temp_2, (struct timezone*)0); + ttime = 0.000001*((temp_2.tv_sec-temp_1.tv_sec)*1.e6+(temp_2.tv_usec-temp_1 .tv_usec)); + + printf("Elapsed time (s) = %.2lf\n", ttime); + printf("Stopped at iteration: %u\n", iter); + + time_fd /= iter; + time_max /= iter; + + std::cout << "Time FD / ms " << time_fd << std::endl; + std::cout << "Time MAX / ms " << time_max << std::endl; + + free(h_curr); + + cudaFree(d_curr); + cudaFree(d_next); + + // release event resources + // INSERT CODE: + + return 0; + +} diff --git a/cuda/debug/kernel_copy.cu b/cuda/debug/kernel_copy.cu new file mode 100644 index 0000000000000000000000000000000000000000..66ff1ab06a4cfbdcbfcbcd905545c4c937b94677 --- /dev/null +++ b/cuda/debug/kernel_copy.cu @@ -0,0 +1,142 @@ +#include <stdio.h> +#include <cuda.h> +#include <cuda_runtime.h> + +#define BLOCK_DIM 256 + +#define REPEAT 1 + + +__global__ void strideCopy(float *odata, float* idata, int stride) +{ + // INSERT CODE: index definitions + int xid = ; + odata[xid] = idata[xid]; +} + +__global__ void offsetCopy(float *odata, float* idata, int offset) +{ + // INSERT CODE: index definitions + int xid = ; + odata[xid] = idata[xid]; +} + +void init_data(float * buffer, int size) { + for ( int i = 0 ; i < size ; i++ ) { + buffer[i] = (float) i; + } +} + +int main (int argc, char *argv[]) { + + int buffer_size; + + float * h_buffer; + + float * d_inp; + float * d_out; + + cudaEvent_t start, end; + float eventEtime, eventBandwidth; + + int size = 65535; + size *= BLOCK_DIM; + + int stride = 1; + int offset = 0; + + if (argc > 1) { + stride = atoi(argv[1]); + } + + if (argc > 2) { + offset = atoi(argv[2]); + } + + buffer_size = (size+offset)*stride; + + dim3 block (BLOCK_DIM); + dim3 grid ( (size-1)/block.x+1 ); + + printf("Number of Elemets = %d\n", size); + printf("Stride = %d\n", stride); + printf("Offset = %d\n", offset); + printf("Buffer size = %d MB\n", sizeof(float)*buffer_size/(1<<20)); + + printf("Block size = %d\n", block.x); + printf("Grid size = %d\n", grid.x); + + printf("Repeat = %d\n", REPEAT); + +//-- insert C code ------------- +// allocate memory on host + +//------------------------------ + + init_data (h_buffer, buffer_size); + +//-- insert CUDA code ---------- +// creat cuda events (start, end) for timing + +//------------------------------ + +//-- insert CUDA code ---------- +// allocate memory buffers on selected GPU + +//------------------------------ + + cudaMemcpy(d_inp, h_buffer, sizeof(float)*buffer_size, cudaMemcpyHostToDevice); + + printf ("\nStride Copy : Stride = %d\n", stride); + + cudaEventRecord(start); + +//-- insert CUDA code ---------- +// launch stride copy kernel + +//------------------------------ + + cudaEventRecord(end); + cudaEventSynchronize(end); + cudaEventElapsedTime(&eventEtime, start, end); + + eventEtime /= REPEAT; + + eventBandwidth = (float) ( 2.0F*size*sizeof(float) ) / ( eventEtime ); + printf("Time / ms = %.1f\n", eventEtime); + printf("Bandwidth / GB/s = %.2f\n", eventBandwidth / 1.e6); + + printf ("\nOffset Copy : Offset = %d\n", offset); + + cudaEventRecord(start); + +//-- insert CUDA code ---------- +// launch offset copy kernel + +//------------------------------ + + cudaEventRecord(end); + cudaEventSynchronize(end); + cudaEventElapsedTime(&eventEtime, start, end); + + eventEtime /= REPEAT; + + eventBandwidth = (float) ( 2.0F*size*sizeof(float) ) / ( eventEtime ); + printf("Time / ms = %.1f\n", eventEtime); + printf("Bandwidth / GB/s = %.2f\n", eventBandwidth / 1.e6); + + cudaMemcpy(h_buffer, d_out, sizeof(float)*buffer_size, cudaMemcpyDeviceToHost); + +//-- insert CUDA code ---------- +// free resources on device (memory buffers and events) + +//---------------------------- + +//-- insert C coda ----------- +// free resources on host + +//------------------------------ + + return 0; +} + diff --git a/cuda/debug/matrix_mul.cu b/cuda/debug/matrix_mul.cu new file mode 100644 index 0000000000000000000000000000000000000000..157e70179e0413cc89f724d157dda1578b5bb67c --- /dev/null +++ b/cuda/debug/matrix_mul.cu @@ -0,0 +1,139 @@ +#include <stdio.h> +#include <assert.h> +#define epsilon (float)1e-5 + +#define NUM_THREADS 16 + +void matrixMulHost(float* A, float* B, float* C, int n) { + for (int i = 0; i < n; ++i) { + for (int j = 0; j < n; ++j) { + float pvalue = 0; + for (int k = 0; k < n; ++k) { + float a = A[i * n + k]; + float b = B[k * n + j]; + pvalue += a * b; + } + C[i * n + j] = pvalue; + } + } +} + +__global__ void MatrixMulKernel(float* A, float* B, float* C, int n) { + + // INSERT CODE: index definitions + int j = ; + int i = ; + + // INSERT CODE: + if ( ) { + + } +} + +int main(int argc, char** argv) { + + int n; + + float *h_A, *h_B, *h_C; + float *h_ref; + + if(argc<2) { + fprintf(stderr,"Usage: %s Width\n",argv[0]); + exit(1); + } + + n=atoi(argv[1]); + + if(n<1) { + fprintf(stderr,"Error Width=%d, must be > 0\n",n); + exit(1); + } + + size_t nbytes = n * n * sizeof(float); + + h_A = (float *) malloc(nbytes); + h_B = (float *) malloc(nbytes); + h_C = (float *) malloc(nbytes); + + h_ref = (float *) malloc(nbytes); + + memset(h_C, 0, nbytes); + memset(h_ref, 0, nbytes); + + for(int y = 0; y < n; y++){ + for(int x = 0; x < n; x++) { + h_A[y * n + x]=(float)(((y + 1) * n + x + 1)/n); + h_B[y * n + x]=(float)(((y + 1) * n + x + 1)/n); + } + } + + float flops; + float *d_A, *d_B, *d_C; + + // CUDA grid management + dim3 threads(NUM_THREADS, NUM_THREADS); + dim3 blocks(n/NUM_THREADS,n/NUM_THREADS); + + cudaMalloc(&d_A, nbytes); + cudaMalloc(&d_B, nbytes); + cudaMalloc(&d_C, nbytes); + + cudaMemcpy(d_A, h_A, nbytes, cudaMemcpyHostToDevice); + cudaMemcpy(d_B, h_B, nbytes, cudaMemcpyHostToDevice); + + // cudaGetLastError call to reset previous CUDA errors + // INSERT CODE + + // Create start and stop CUDA events + // INSERT CODE + + + + // kernel launch + MatrixMulKernel<<<blocks, threads>>>(d_A, d_B, d_C, n); + + // event record, synchronization, elapsed time and destruction + // INSERT CODE + + // device synchronization and cudaGetLastError call + // INSERT CODE + + float elapsed; + cudaEventElapsedTime(&elapsed, start, stop); + elapsed = elapsed/1000.f; // convert to seconds + + // calculate Mflops + // INSERT CODE + printf("Kernel elapsed time %fs \n", elapsed); + printf("Gflops: %f\n", flops); + + // copy back results from device + cudaMemcpy(h_C, d_C, nbytes, cudaMemcpyDeviceToHost); + + // free memory on device + cudaFree(d_A); + cudaFree(d_B); + cudaFree(d_C); + + cudaEventDestroy(start); + cudaEventDestroy(stop); + + matrixMulHost(h_A, h_B, h_ref, n); + + int errCnt = 0; + for(int y = 0; y < n; y++){ + for(int x = 0; x < n; x++) { + float it = h_ref[y * n + x]; + if(fabs(it - h_C[y * n + x]) > epsilon * it) { + printf("failing x=%d, y=%d: %f!=%f \n", x, y, it, h_C[y * n + x]); + errCnt++; + } + } + } + + if(errCnt==0) + printf("\nTEST PASSED\n"); + else + printf("\n\nTEST FAILED: number of errors: %d\n", errCnt); + +} diff --git a/cuda/debug/matrix_mul_shared.cu b/cuda/debug/matrix_mul_shared.cu new file mode 100644 index 0000000000000000000000000000000000000000..691efda508a2ae15236b86fc48f85588bc0bf1db --- /dev/null +++ b/cuda/debug/matrix_mul_shared.cu @@ -0,0 +1,200 @@ +#include <stdio.h> +#include <assert.h> +#define epsilon (float)1e-5 + +#define NUM_THREADS 16 + +void matrixMulHost(float* A, float* B, float* C, int n) { + for (int i = 0; i < n; ++i) { + for (int j = 0; j < n; ++j) { + float pvalue = 0; + for (int k = 0; k < n; ++k) { + float a = A[i * n + k]; + float b = B[k * n + j]; + pvalue += a * b; + } + C[i * n + j] = pvalue; + } + } +} + +__device__ int get_offset (int idx_i, int idx_j, int N) { + return idx_i * N * NUM_THREADS + idx_j * NUM_THREADS; +} + +__global__ void MatrixMulKernelShared(float *A, float *B, float *C, int N) { + + // Shared memory used to store Asub and Bsub respectively + __shared__ float As[NUM_THREADS][NUM_THREADS]; + __shared__ float Bs[NUM_THREADS][NUM_THREADS]; + + // Block row and column + int ib = blockIdx.y; + int jb = blockIdx.x; + + // Thread row and column within Csub + int it = threadIdx.y; + int jt = threadIdx.x; + + int a_offset, b_offset, c_offset; + + // Each thread computes one element of Csub + // by accumulating results into Cvalue + float Cvalue = 0.0f; + + // Loop over all the sub-matrices of A and B that are + // required to compute Csub. + // Multiply each pair of sub-matrices together + // and accumulate the results. + + for (int kb = 0; kb < (N / NUM_THREADS); ++kb) { + + // Get the starting address (a_offset) of Asub + // (sub-matrix of A of dimension NB x NB) + // Asub is located i_block sub-matrices to the right and + // k_block sub-matrices down from the upper-left corner of A + a_offset = get_offset (ib, kb, N); + // Get the starting address (b_offset) of Bsub + b_offset = get_offset (kb, jb, N); + + // Load Asub and Bsub from device memory to shared memory + // Each thread loads one element of each sub-matrix + // ---------------- // + // INSERT CUDA CODE // + // ---------------- // + + // Synchronize to make sure the sub-matrices are loaded + // before starting the computation + // ---------------- // + // INSERT CUDA CODE // + // ---------------- // + + // Multiply As and Bs together + for (int k = 0; k < NUM_THREADS; ++k) { + // ---------------- // + // INSERT CUDA CODE // + // ---------------- // + + } + // Synchronize to make sure that the preceding + // computation is done before loading two new + // sub-matrices of A and B in the next iteration + // ---------------- // + // INSERT CUDA CODE // + // ---------------- // + + } + + c_offset = get_offset (ib, jb, N); + // Each thread block computes one sub-matrix Csub of C + // ---------------- // + // INSERT CUDA CODE // + // ---------------- // + +} + +int main(int argc, char** argv) { + + int n; + + float *h_A, *h_B, *h_C; + float *h_ref; + + if(argc<2) { + fprintf(stderr,"Usage: %s Width\n",argv[0]); + exit(1); + } + + n=atoi(argv[1]); + + if(n<1) { + fprintf(stderr,"Error Width=%d, must be > 0\n",n); + exit(1); + } + + size_t nbytes = n * n * sizeof(float); + + h_A = (float *) malloc(nbytes); + h_B = (float *) malloc(nbytes); + h_C = (float *) malloc(nbytes); + + h_ref = (float *) malloc(nbytes); + + memset(h_C, 0, nbytes); + memset(h_ref, 0, nbytes); + + for(int y = 0; y < n; y++){ + for(int x = 0; x < n; x++) { + h_A[y * n + x]=(float)(((y + 1) * n + x + 1)/n); + h_B[y * n + x]=(float)(((y + 1) * n + x + 1)/n); + } + } + + float flops; + float *d_A, *d_B, *d_C; + + // CUDA grid management + dim3 threads(NUM_THREADS, NUM_THREADS); + dim3 blocks(n/NUM_THREADS,n/NUM_THREADS); + + cudaMalloc(&d_A, nbytes); + cudaMalloc(&d_B, nbytes); + cudaMalloc(&d_C, nbytes); + + cudaMemcpy(d_A, h_A, nbytes, cudaMemcpyHostToDevice); + cudaMemcpy(d_B, h_B, nbytes, cudaMemcpyHostToDevice); + + // cudaGetLastError call to reset previous CUDA errors + // INSERT CODE + + // Create start and stop CUDA events + // INSERT CODE + + // kernel launch + MatrixMulKernelShared<<<blocks, threads>>>(d_A, d_B, d_C, n); + + // event record, synchronization, elapsed time and destruction + // INSERT CODE + + // device synchronization and cudaGetLastError call + // INSERT CODE + + float elapsed; + cudaEventElapsedTime(&elapsed, start, stop); + elapsed = elapsed/1000.f; // convert to seconds + + // calculate Mflops + // INSERT CODE + printf("Kernel elapsed time %fs \n", elapsed); + printf("Gflops: %f\n", flops); + + // copy back results from device + cudaMemcpy(h_C, d_C, nbytes, cudaMemcpyDeviceToHost); + + // free memory on device + cudaFree(d_A); + cudaFree(d_B); + cudaFree(d_C); + + cudaEventDestroy(start); + cudaEventDestroy(stop); + + matrixMulHost(h_A, h_B, h_ref, n); + + int errCnt = 0; + for(int y = 0; y < n; y++){ + for(int x = 0; x < n; x++) { + float it = h_ref[y * n + x]; + if(fabs(it - h_C[y * n + x]) > epsilon * it) { + printf("failing x=%d, y=%d: %f!=%f \n", x, y, it, h_C[y * n + x]); + errCnt++; + } + } + } + + if(errCnt==0) + printf("\nTEST PASSED\n"); + else + printf("\n\nTEST FAILED: number of errors: %d\n", errCnt); + +} diff --git a/cuda/debug/matrix_transpose.cu b/cuda/debug/matrix_transpose.cu new file mode 100644 index 0000000000000000000000000000000000000000..f49fe4a7262a75aa29978f6f3ac43b29fe1fd55c --- /dev/null +++ b/cuda/debug/matrix_transpose.cu @@ -0,0 +1,262 @@ + +#define SIZE 512 +#define NUM_THREADS 32 +#define NUM_REPS 3 + +#include <stdio.h> + + +__global__ void copy(float *odata, float* idata, int width, int height) +{ + // INSERT CODE: index definitions + int i = ; + int j = ; + + int index = ; + + // INSERT CODE: + +} + + +// transpose naive + +__global__ void transposeNaive(float *odata, float* idata, int width, int height) +{ + // INSERT CODE: index definitions + int xIndex = ; + int yIndex = ; + + int index_in = ; + int index_out = ; + + // INSERT CODE: + +} + +// transpose in shared memory + +__global__ void transposeShared(float *odata, float *idata, int width, int height) +{ + // INSERT CODE: + + // INSERT CODE: index definitions + int xIndex = ; + int yIndex = ; + + int index_in = ; + + xIndex = ; + yIndex = ; + + int index_out = ; + + // INSERT CODE: load to shared memory + + // INSERT CODE: Synchronize before start computation + __syncthreads(); + + // INSERT CODE: + +} + +// transpose in shared memory with no bank conflicts + +__global__ void transposeNoBankConflicts(float *odata, float *idata, int width, int height) +{ + // INSERT CODE: + + // INSERT CODE: index definitions + int xIndex = ; + int yIndex = ; + + int index_in = ; + + xIndex = ; + yIndex = ; + + int index_out = ; + + // INSERT CODE: load to shared memory + + // INSERT CODE: Synchronize before start computation + __syncthreads(); + + // INSERT CODE: + +} + + +// transpose host + +void transposeHost(float* gold, float* idata, const int m, const int n) +{ + for( int y = 0; y < n; ++y) { + for( int x = 0; x < m; ++x) { + gold[(x * n) + y] = idata[(y * m) + x]; + } + } +} + +int main( int argc, char** argv) +{ + int m = SIZE; + int n = SIZE; + + if (m%NUM_THREADS != 0 || n%NUM_THREADS != 0) { + printf("\nMatrix size must be integral multiple of tile size\nExiting...\n\n"); + printf("FAILED\n\n"); + return 1; + } + + const int mem_size = sizeof(float) * m*n; + + // allocate host memory + float *h_idata = (float*) malloc(mem_size); + float *h_odata = (float*) malloc(mem_size); + float *transposeGold = (float *) malloc(mem_size); + + // allocate device memory + float *d_idata, *d_odata; + cudaMalloc( (void**) &d_idata, mem_size); + cudaMalloc( (void**) &d_odata, mem_size); + + // initalize host data + for( int i = 0; i < (m*n); ++i) + h_idata[i] = (float) i; + + // copy host data to device + cudaMemcpy(d_idata, h_idata, mem_size, cudaMemcpyHostToDevice); + + // Compute reference transpose solution + transposeHost(transposeGold, h_idata, m, n); + + // print out common data for all kernels + printf("\nMatrix size: %dx%d (%dx%d tiles), tile size: %dx%d, block size: %dx%d\n\n", + m, n, m/NUM_THREADS, n/NUM_THREADS, NUM_THREADS, NUM_THREADS, NUM_THREADS, NUM_THREADS); + printf ("mem_size %d \n", mem_size); + + // execution configuration parameters + dim3 threads(NUM_THREADS, NUM_THREADS); + dim3 grid(m/NUM_THREADS, n/NUM_THREADS); + + // timing + float etime, bw; + + // CUDA events + cudaEvent_t start, stop; + + // initialize events + cudaEventCreate(&start); + cudaEventCreate(&stop); + + cudaError_t err_cuda; + err_cuda = cudaGetLastError() ; + + + // copy kernel + + cudaEventRecord(start); + for (int i=0; i < NUM_REPS; i++) { + copy<<<grid, threads>>>(d_odata, d_idata, m, n); + } + cudaEventRecord(stop); + cudaEventSynchronize(stop); + + err_cuda = cudaGetLastError() ; + + if(err_cuda != cudaSuccess) { + fprintf(stderr,"%s\n",cudaGetErrorString(err_cuda)) ; + } + + cudaEventElapsedTime(&etime, start, stop); + + bw = 2.0f * 1000.0f * mem_size/(1024*1024*1024)/(etime/NUM_REPS); + + printf("%-20s time[ms] %.10e BW[GB/s] %.5f\n", "copy kernel", etime/NUM_REPS, bw); + + + // transpose naive + + cudaEventRecord(start); + for (int i=0; i < NUM_REPS; i++) { + transposeNaive<<<grid, threads>>>(d_odata, d_idata, m, n); + } + cudaEventRecord(stop); + cudaEventSynchronize(stop); + + err_cuda = cudaGetLastError() ; + + if(err_cuda != cudaSuccess) { + fprintf(stderr,"%s\n",cudaGetErrorString(err_cuda)) ; + } + + cudaEventElapsedTime(&etime, start, stop); + + bw = 2.0f * 1000.0f * mem_size/(1024*1024*1024)/(etime/NUM_REPS); + + printf("%-20s time[ms] %.10e BW[GB/s] %.5f\n", "transpose naive", etime/NUM_REPS, bw); + + cudaMemcpy(h_odata, d_odata, mem_size, cudaMemcpyDeviceToHost); + + + // transpose shared memory + + cudaEventRecord(start); + for (int i=0; i < NUM_REPS; i++) { + transposeShared<<<grid, threads>>>(d_odata, d_idata, m, n); + } + cudaEventRecord(stop); + cudaEventSynchronize(stop); + + err_cuda = cudaGetLastError() ; + + if(err_cuda != cudaSuccess) { + fprintf(stderr,"%s\n",cudaGetErrorString(err_cuda)) ; + } + + cudaEventElapsedTime(&etime, start, stop); + + bw = 2.0f * 1000.0f * mem_size/(1024*1024*1024)/(etime/NUM_REPS); + + printf("%-20s time[ms] %.10e BW[GB/s] %.5f\n", "transpose shared", etime/NUM_REPS, bw); + + cudaMemcpy(h_odata, d_odata, mem_size, cudaMemcpyDeviceToHost); + + + // transpose shared memory no bank conflicts + + cudaEventRecord(start); + for (int i=0; i < NUM_REPS; i++) { + transposeNoBankConflicts<<<grid, threads>>>(d_odata, d_idata, m, n); + } + cudaEventRecord(stop); + cudaEventSynchronize(stop); + + err_cuda = cudaGetLastError() ; + + if(err_cuda != cudaSuccess) { + fprintf(stderr,"%s\n",cudaGetErrorString(err_cuda)) ; + } + + cudaEventElapsedTime(&etime, start, stop); + + bw = 2.0f * 1000.0f * mem_size/(1024*1024*1024)/(etime/NUM_REPS); + + printf("%-20s time[ms] %.10e BW[GB/s] %.5f\n", "SM no bank conflicts", etime/NUM_REPS, bw); + + cudaMemcpy(h_odata, d_odata, mem_size, cudaMemcpyDeviceToHost); + + // cleanup + free(h_idata); + free(h_odata); + free(transposeGold); + cudaFree(d_idata); + cudaFree(d_odata); + + cudaEventDestroy(start); + cudaEventDestroy(stop); + + return 0; + +} diff --git a/cuda/debug/solutions/Makefile b/cuda/debug/solutions/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..3408e7b43707c7b5d7508b840ee4d3a5ba22625c --- /dev/null +++ b/cuda/debug/solutions/Makefile @@ -0,0 +1,66 @@ +PGCC = nvc -O3 -gpu=cc80 -Minfo=all -acc=noautopar +PGF90 = nvfortran -O3 -gpu=cc80 -Minfo=all -acc=noautopar + +NVCC = nvcc +NVCC_FLAGS = -arch=sm_80 -lineinfo --ptxas-options=--verbose -Xptxas -dlcm=ca +# -Xptxas -dlcm=ca +# -Xptxas -dlcm=cg # L2 only + +DIMS = 555 +ITER = 2 + +NAME := $(sort $(wildcard *.cu)) +NAME := $(filter-out max.cu,$(NAME)) +NAME := $(patsubst %.cu,%,$(NAME)) + +SRCS = $(patsubst %,%.cu,$(NAME)) +EXES = $(patsubst %,%.x,$(NAME)) +NCUS = $(patsubst %,%.ncu-rep,$(NAME)) +RUNS = $(patsubst %,%.run,$(NAME)) +METS = $(patsubst %,%.metrics,$(NAME)) +SANIT = $(patsubst %,%.sanitizer,$(NAME)) + +.PHONY: all clean cleanall run exe prof # $(NCUS) + +all: exe + +exe : $(EXES) +run : $(RUNS) +ncu : $(NCUS) +prof: $(METS) +sanitizer: $(SANIT) + + +help: + @echo -e "make exe \t compile: $(SRCS)" + @echo -e "make run \t run: $(EXES)" + @echo -e "make ncu \t profile: $(EXES)" + @echo -e "make prof \t metrics: $(EXES)" + +%.x: %.cu + @echo -e "\nCOMPILE: $<" + $(NVCC) $(NVCC_FLAGS) -o $@ $< + +%.run: %.x + @echo -e "\nRUN: $<" + ./$< $(DIMS) + +%.ncu-rep: %.x + @echo -e "\nNCU: $<" + ncu --set full -f -k jacobi -o $(@:.ncu-rep=) ./$< $(DIMS) $(ITER) + #ncu --cache-control all --set full -f -k jacobi -o $(@:.ncu-rep=) ./$< $(DIMS) $(ITER) + +%.metrics: %.x + @echo -e "\nMETRICS: $<" + ncu --cache-control all -k jacobi --metrics $(METRICS) ./$< $(DIMS) 1 + +%.sanitizer: %.x + @echo -e "\nSANITIZER: $<" + compute-sanitizer ./$< $(DIMS) 1 + +clean: + $(RM) $(EXES) + +cleanall: clean + $(RM) $(NCUS) + diff --git a/cuda/debug/solutions/jacobi_01.cu b/cuda/debug/solutions/jacobi_01.cu new file mode 100644 index 0000000000000000000000000000000000000000..fb074ffa50a02b64f3189b46ff166dc8babe72d9 --- /dev/null +++ b/cuda/debug/solutions/jacobi_01.cu @@ -0,0 +1,174 @@ +#include <math.h> +#include <string.h> +#include <float.h> +#include <stdio.h> +#include <stdlib.h> +#include <sys/time.h> + +#include <cuda_runtime.h> + +#include <cub/device/device_reduce.cuh> +using namespace cub; + +#define NUM_THREADS 32 + + +__global__ void jacobi(float * curr, float * next, float * errs, int n, int stride){ + + int i = threadIdx.x + blockIdx.x * blockDim.x; + int j = threadIdx.y + blockIdx.y * blockDim.y; + + int idx = i + j * stride; + + if ( (i > 0) && (i < n-1) && (j > 0) && (j < n-1) ) { + + next[idx] = 0.25 * ( + curr[(i-1) + j * stride] + + curr[(i+1) + j * stride] + + curr[i + (j-1) * stride] + + curr[i + (j+1) * stride] + ) ; + + errs[idx] = fabs(next[idx] - curr[idx]); + + } + +} + + +int main(int argc, char *argv[]) { + + int n = 4096; + int stride; + int iter_max = 1000; + double tol = 1.0e-6; + + if (argc > 1) { + n = std::stoi(argv[1]); + } + + if (argc > 2) { + iter_max = std::stoi(argv[2]); + } + + stride = n; + + std::cout << "Dim: " << n << std::endl; + std::cout << "Stride: " << stride << std::endl; + + size_t nbytes = sizeof(float) * n * stride; + + float * h_curr = (float *) malloc(nbytes); + + float h_err = 1.; + + float * d_curr; + float * d_next; + float * d_errs; + + float * d_err; + + cudaMalloc((void **) &d_curr, nbytes); + cudaMalloc((void **) &d_next, nbytes); + cudaMalloc((void **) &d_errs, nbytes); + + cudaMalloc((void **) &d_err, sizeof(float)); + + cudaMemset(d_curr, 0, nbytes); + cudaMemset(d_next, 0, nbytes); + + dim3 nThreads(NUM_THREADS, NUM_THREADS); + dim3 nBlocks((n-1)/nThreads.x+1, (n-1)/nThreads.y+1); + + // Determine temporary device storage requirements + void *d_temp_storage = NULL; + size_t temp_storage_bytes = 0; + cub::DeviceReduce::Max(d_temp_storage, temp_storage_bytes, d_errs, d_err, n*stride); + // Allocate temporary storage + cudaMalloc(&d_temp_storage, temp_storage_bytes); + + // timing + cudaEvent_t start, stop; + float etime, time_fd, time_max; + + time_fd = 0.0; + time_max = 0.0; + + // create events + cudaEventCreate(&start); + cudaEventCreate(&stop); + + int iter = 0; + struct timeval temp_1, temp_2; + double ttime=0.; + + for ( int i = 0 ; i < n ; i++ ) { + h_curr[i * stride] = 1.; + } + + cudaMemcpy(d_curr, h_curr, nbytes, cudaMemcpyHostToDevice); + + printf("Jacobi relaxation Calculation: %d x %d mesh\n", n, n); + + gettimeofday(&temp_1, (struct timezone*)0); + + while ( h_err > tol && iter < iter_max ) + { + + cudaEventRecord(start); + + jacobi<<<nBlocks, nThreads>>>(d_curr, d_next, d_errs, n, stride); + + cudaEventRecord(stop); + cudaEventSynchronize(stop); + + cudaEventElapsedTime(&etime, start, stop); + + time_fd += etime; + + cudaEventRecord(start); + + cub::DeviceReduce::Max(d_temp_storage, temp_storage_bytes, d_errs, d_err, n*stride); + + cudaEventRecord(stop); + cudaEventSynchronize(stop); + + cudaEventElapsedTime(&etime, start, stop); + + time_max += etime; + + cudaMemcpy(&h_err, d_err, sizeof(float), cudaMemcpyDeviceToHost); + + //h_err = 1.e6; + + //if(iter % 10 == 0) printf("%5d, %0.8lf\n", iter, h_err); + + std::swap(d_curr, d_next); + + iter++; + } + + gettimeofday(&temp_2, (struct timezone*)0); + ttime = 0.000001*((temp_2.tv_sec-temp_1.tv_sec)*1.e6+(temp_2.tv_usec-temp_1 .tv_usec)); + + printf("Elapsed time (s) = %.2lf\n", ttime); + printf("Stopped at iteration: %u\n", iter); + + time_fd /= iter; + time_max /= iter; + + std::cout << "Time FD / ms " << time_fd << std::endl; + std::cout << "Time MAX / ms " << time_max << std::endl; + + free(h_curr); + + cudaFree(d_curr); + cudaFree(d_next); + + // release event resources + cudaEventDestroy(start); + cudaEventDestroy(stop); + + return 0; + +} diff --git a/cuda/debug/solutions/jacobi_02.cu b/cuda/debug/solutions/jacobi_02.cu new file mode 100644 index 0000000000000000000000000000000000000000..38fc601b752d45389280b5aa4b3af226ff21d57f --- /dev/null +++ b/cuda/debug/solutions/jacobi_02.cu @@ -0,0 +1,178 @@ +#include <math.h> +#include <string.h> +#include <float.h> +#include <stdio.h> +#include <stdlib.h> +#include <sys/time.h> + +#include <cuda_runtime.h> + +#include <cub/device/device_reduce.cuh> +using namespace cub; + +#define NUM_THREADS 32 + + +__global__ void jacobi(float * curr, float * next, float * errs, int n, int stride){ + + int i = threadIdx.x + blockIdx.x * blockDim.x; + int j = threadIdx.y + blockIdx.y * blockDim.y; + + int idx = i + j * stride; + + if ( (i > 0) && (i < n-1) && (j > 0) && (j < n-1) ) { + + next[idx] = 0.25 * ( + curr[(i-1) + j * stride] + + curr[(i+1) + j * stride] + + curr[i + (j-1) * stride] + + curr[i + (j+1) * stride] + ) ; + + errs[idx] = fabs(next[idx] - curr[idx]); + + } + +} + + +int main(int argc, char *argv[]) { + + int n = 4096; + int stride; + int iter_max = 1000; + double tol = 1.0e-6; + + if (argc > 1) { + n = std::stoi(argv[1]); + } + + if (argc > 2) { + iter_max = std::stoi(argv[2]); + } + + stride = n; + + float h_err = 1.; + + float * d_curr; + float * d_next; + float * d_errs; + + float * d_err; + + size_t pitch; + + cudaMallocPitch(&d_curr, &pitch, n * sizeof(float), n); + cudaMallocPitch(&d_next, &pitch, n * sizeof(float), n); + cudaMallocPitch(&d_errs, &pitch, n * sizeof(float), n); + + stride = pitch/sizeof(float); + + cudaMalloc((void **) &d_err, sizeof(float)); + + size_t nbytes = sizeof(float) * n * stride; + + float * h_curr = (float *) malloc(nbytes); + + cudaMemset(d_curr, 0, nbytes); + cudaMemset(d_next, 0, nbytes); + + dim3 nThreads(NUM_THREADS, NUM_THREADS); + dim3 nBlocks((n-1)/nThreads.x+1, (n-1)/nThreads.y+1); + + // Determine temporary device storage requirements + void *d_temp_storage = NULL; + size_t temp_storage_bytes = 0; + cub::DeviceReduce::Max(d_temp_storage, temp_storage_bytes, d_errs, d_err, n*stride); + // Allocate temporary storage + cudaMalloc(&d_temp_storage, temp_storage_bytes); + + // timing + cudaEvent_t start, stop; + float etime, time_fd, time_max; + + time_fd = 0.0; + time_max = 0.0; + + // create events + cudaEventCreate(&start); + cudaEventCreate(&stop); + + int iter = 0; + struct timeval temp_1, temp_2; + double ttime=0.; + + for ( int i = 0 ; i < n ; i++ ) { + h_curr[i * stride] = 1.; + } + + cudaMemcpy(d_curr, h_curr, nbytes, cudaMemcpyHostToDevice); + + std::cout << "Dim: " << n << std::endl; + std::cout << "Stride: " << stride << std::endl; + + printf("Jacobi relaxation Calculation: %d x %d mesh\n", n, n); + + gettimeofday(&temp_1, (struct timezone*)0); + + while ( h_err > tol && iter < iter_max ) + { + + cudaEventRecord(start); + + jacobi<<<nBlocks, nThreads>>>(d_curr, d_next, d_errs, n, stride); + + cudaEventRecord(stop); + cudaEventSynchronize(stop); + + cudaEventElapsedTime(&etime, start, stop); + + time_fd += etime; + + cudaEventRecord(start); + + cub::DeviceReduce::Max(d_temp_storage, temp_storage_bytes, d_errs, d_err, n*stride); + + cudaEventRecord(stop); + cudaEventSynchronize(stop); + + cudaEventElapsedTime(&etime, start, stop); + + time_max += etime; + + cudaMemcpy(&h_err, d_err, sizeof(float), cudaMemcpyDeviceToHost); + + //h_err = 1.e6; + + //if(iter % 10 == 0) printf("%5d, %0.8lf\n", iter, h_err); + + std::swap(d_curr, d_next); + + iter++; + } + + gettimeofday(&temp_2, (struct timezone*)0); + ttime = 0.000001*((temp_2.tv_sec-temp_1.tv_sec)*1.e6+(temp_2.tv_usec-temp_1 .tv_usec)); + + printf("Elapsed time (s) = %.2lf\n", ttime); + printf("Stopped at iteration: %u\n", iter); + + time_fd /= iter; + time_max /= iter; + + std::cout << "Time FD / ms " << time_fd << std::endl; + std::cout << "Time MAX / ms " << time_max << std::endl; + + free(h_curr); + + cudaFree(d_curr); + cudaFree(d_next); + + // release event resources + cudaEventDestroy(start); + cudaEventDestroy(stop); + + return 0; + +} diff --git a/cuda/debug/solutions/jacobi_03.cu b/cuda/debug/solutions/jacobi_03.cu new file mode 100644 index 0000000000000000000000000000000000000000..48c9795ad7fd4100c2685b3d4558c969493c5568 --- /dev/null +++ b/cuda/debug/solutions/jacobi_03.cu @@ -0,0 +1,178 @@ +#include <math.h> +#include <string.h> +#include <float.h> +#include <stdio.h> +#include <stdlib.h> +#include <sys/time.h> + +#include <cuda_runtime.h> + +#include <cub/device/device_reduce.cuh> +using namespace cub; + +#define NUM_THREADS 32 + + +__global__ void jacobi(float * curr, float * next, float * errs, int n, int stride){ + + int i = threadIdx.x + blockIdx.x * blockDim.x; + int j = threadIdx.y + blockIdx.y * blockDim.y; + + int idx = i + j * stride; + + if ( (i > 0) && (i < n-1) && (j > 0) && (j < n-1) ) { + + next[idx] = 0.25 * ( + curr[(i-1) + j * stride] + + curr[(i+1) + j * stride] + + curr[i + (j-1) * stride] + + curr[i + (j+1) * stride] + ) ; + + errs[idx] = fabs(next[idx] - curr[idx]); + + } + +} + + +int main(int argc, char *argv[]) { + + int n = 4096; + int stride; + int iter_max = 1000; + double tol = 1.0e-6; + + if (argc > 1) { + n = std::stoi(argv[1]); + } + + if (argc > 2) { + iter_max = std::stoi(argv[2]); + } + + stride = n; + + float h_err = 1.; + + float * d_curr; + float * d_next; + float * d_errs; + + float * d_err; + + size_t cache_line = 128; + + stride = n + (cache_line - (n * sizeof(float)) % cache_line) / sizeof(float); + + size_t nbytes = sizeof(float) * n * stride; + + float * h_curr = (float *) malloc(nbytes); + + cudaMalloc((void **) &d_curr, nbytes); + cudaMalloc((void **) &d_next, nbytes); + cudaMalloc((void **) &d_errs, nbytes); + + cudaMalloc((void **) &d_err, sizeof(float)); + + cudaMemset(d_curr, 0, nbytes); + cudaMemset(d_next, 0, nbytes); + + dim3 nThreads(NUM_THREADS, NUM_THREADS); + dim3 nBlocks((n-1)/nThreads.x+1, (n-1)/nThreads.y+1); + + // Determine temporary device storage requirements + void *d_temp_storage = NULL; + size_t temp_storage_bytes = 0; + cub::DeviceReduce::Max(d_temp_storage, temp_storage_bytes, d_errs, d_err, n*stride); + // Allocate temporary storage + cudaMalloc(&d_temp_storage, temp_storage_bytes); + + // timing + cudaEvent_t start, stop; + float etime, time_fd, time_max; + + time_fd = 0.0; + time_max = 0.0; + + // create events + cudaEventCreate(&start); + cudaEventCreate(&stop); + + int iter = 0; + struct timeval temp_1, temp_2; + double ttime=0.; + + for ( int i = 0 ; i < n ; i++ ) { + h_curr[i * stride] = 1.; + } + + cudaMemcpy(d_curr, h_curr, nbytes, cudaMemcpyHostToDevice); + + std::cout << "Dim: " << n << std::endl; + std::cout << "Stride: " << stride << std::endl; + + printf("Jacobi relaxation Calculation: %d x %d mesh\n", n, n); + + gettimeofday(&temp_1, (struct timezone*)0); + + while ( h_err > tol && iter < iter_max ) + { + + cudaEventRecord(start); + + jacobi<<<nBlocks, nThreads>>>(d_curr, d_next, d_errs, n, stride); + + cudaEventRecord(stop); + cudaEventSynchronize(stop); + + cudaEventElapsedTime(&etime, start, stop); + + time_fd += etime; + + cudaEventRecord(start); + + cub::DeviceReduce::Max(d_temp_storage, temp_storage_bytes, d_errs, d_err, n*stride); + + cudaEventRecord(stop); + cudaEventSynchronize(stop); + + cudaEventElapsedTime(&etime, start, stop); + + time_max += etime; + + cudaMemcpy(&h_err, d_err, sizeof(float), cudaMemcpyDeviceToHost); + + //h_err = 1.e6; + + //if(iter % 10 == 0) printf("%5d, %0.8lf\n", iter, h_err); + + std::swap(d_curr, d_next); + + iter++; + } + + gettimeofday(&temp_2, (struct timezone*)0); + ttime = 0.000001*((temp_2.tv_sec-temp_1.tv_sec)*1.e6+(temp_2.tv_usec-temp_1 .tv_usec)); + + printf("Elapsed time (s) = %.2lf\n", ttime); + printf("Stopped at iteration: %u\n", iter); + + time_fd /= iter; + time_max /= iter; + + std::cout << "Time FD / ms " << time_fd << std::endl; + std::cout << "Time MAX / ms " << time_max << std::endl; + + free(h_curr); + + cudaFree(d_curr); + cudaFree(d_next); + + // release event resources + cudaEventDestroy(start); + cudaEventDestroy(stop); + + return 0; + +} diff --git a/cuda/debug/solutions/kernel_copy.cu b/cuda/debug/solutions/kernel_copy.cu new file mode 100644 index 0000000000000000000000000000000000000000..ab5729db47de5087aebaf245ae68dcef0d5a8b3a --- /dev/null +++ b/cuda/debug/solutions/kernel_copy.cu @@ -0,0 +1,150 @@ +#include <stdio.h> +#include <cuda.h> +#include <cuda_runtime.h> + +#define BLOCK_DIM 256 + +#define REPEAT 1 + + +__global__ void strideCopy(float *odata, float* idata, int stride) +{ + int xid = (blockIdx.x * blockDim.x + threadIdx.x) * stride; + odata[xid] = idata[xid]; +} + +__global__ void offsetCopy(float *odata, float* idata, int offset) +{ + int xid = blockIdx.x * blockDim.x + threadIdx.x + offset; + odata[xid] = idata[xid]; +} + +void init_data(float * buffer, int size) { + for ( int i = 0 ; i < size ; i++ ) { + buffer[i] = (float) i; + } +} + +int main (int argc, char *argv[]) { + + int buffer_size; + + float * h_buffer; + + float * d_inp; + float * d_out; + + cudaEvent_t start, end; + float eventEtime, eventBandwidth; + + int size = 65535; + size *= BLOCK_DIM; + + int stride = 1; + int offset = 0; + + if (argc > 1) { + stride = atoi(argv[1]); + } + + if (argc > 2) { + offset = atoi(argv[2]); + } + + buffer_size = (size+offset)*stride; + + dim3 block (BLOCK_DIM); + dim3 grid ( (size-1)/block.x+1 ); + + printf("Number of Elemets = %d\n", size); + printf("Stride = %d\n", stride); + printf("Offset = %d\n", offset); + printf("Buffer size = %d MB\n", sizeof(float)*buffer_size/(1<<20)); + + printf("Block size = %d\n", block.x); + printf("Grid size = %d\n", grid.x); + + printf("Repeat = %d\n", REPEAT); + +//-- insert C code ------------- +// allocate memory on host + h_buffer = (float *) malloc (sizeof(float)*buffer_size); +//------------------------------ + + init_data (h_buffer, buffer_size); + +//-- insert CUDA code ---------- +// creat cuda events (start, end) for timing + cudaEventCreate(&start); + cudaEventCreate(&end); +//------------------------------ + +//-- insert CUDA code ---------- +// allocate memory buffers on selected GPU + cudaMalloc((void**)&d_inp, sizeof(float)*buffer_size); + cudaMalloc((void**)&d_out, sizeof(float)*buffer_size); +//------------------------------ + + cudaMemcpy(d_inp, h_buffer, sizeof(float)*buffer_size, cudaMemcpyHostToDevice); + + printf ("\nStride Copy : Stride = %d\n", stride); + + cudaEventRecord(start); + +//-- insert CUDA code ---------- +// launch stride copy kernel + for ( int i=0 ; i < REPEAT ; i++ ) { + strideCopy <<<grid, block>>> (d_out, d_inp, stride); + } +//------------------------------ + + cudaEventRecord(end); + cudaEventSynchronize(end); + cudaEventElapsedTime(&eventEtime, start, end); + + eventEtime /= REPEAT; + + eventBandwidth = (float) ( 2.0F*size*sizeof(float) ) / ( eventEtime ); + printf("Time / ms = %.1f\n", eventEtime); + printf("Bandwidth / GB/s = %.2f\n", eventBandwidth / 1.e6); + + printf ("\nOffset Copy : Offset = %d\n", offset); + + cudaEventRecord(start); + +//-- insert CUDA code ---------- +// launch offset copy kernel + for ( int i=0 ; i < REPEAT ; i++ ) { + offsetCopy <<<grid, block>>> (d_out, d_inp, offset); + } +//------------------------------ + + cudaEventRecord(end); + cudaEventSynchronize(end); + cudaEventElapsedTime(&eventEtime, start, end); + + eventEtime /= REPEAT; + + eventBandwidth = (float) ( 2.0F*size*sizeof(float) ) / ( eventEtime ); + printf("Time / ms = %.1f\n", eventEtime); + printf("Bandwidth / GB/s = %.2f\n", eventBandwidth / 1.e6); + + cudaMemcpy(h_buffer, d_out, sizeof(float)*buffer_size, cudaMemcpyDeviceToHost); + +//-- insert CUDA code ---------- +// free resources on device (memory buffers and events) + cudaEventDestroy(start); + cudaEventDestroy(end); + + cudaFree(d_inp); + cudaFree(d_out); +//---------------------------- + +//-- insert C coda ----------- +// free resources on host + free(h_buffer); +//------------------------------ + + return 0; +} + diff --git a/cuda/debug/solutions/matrix_mul.cu b/cuda/debug/solutions/matrix_mul.cu new file mode 100644 index 0000000000000000000000000000000000000000..c99c63bbdd1e701a1b65e3017a767d42c508c526 --- /dev/null +++ b/cuda/debug/solutions/matrix_mul.cu @@ -0,0 +1,160 @@ +#include <stdio.h> +#include <assert.h> +#define epsilon (float)1e-5 + +#define NUM_THREADS 16 + +void matrixMulHost(float* A, float* B, float* C, int n) { + for (int i = 0; i < n; ++i) { + for (int j = 0; j < n; ++j) { + float pvalue = 0; + for (int k = 0; k < n; ++k) { + float a = A[i * n + k]; + float b = B[k * n + j]; + pvalue += a * b; + } + C[i * n + j] = pvalue; + } + } +} + +__global__ void MatrixMulKernel(float* A, float* B, float* C, int n) { + + // INSERT CODE: index definitions + int j = blockDim.x * blockIdx.x + threadIdx.x; + int i = blockDim.y * blockIdx.y + threadIdx.y; + + // INSERT CODE: fastest index to threads along x direction + if (i < n && j < n) { + float pvalue = 0; + for (int k = 0; k < n; ++k) { + float a = A[i * n + k]; + float b = B[k * n + j]; + pvalue += a * b; + } + C[i * n + j] = pvalue; + } +} + +int main(int argc, char** argv) { + + int n; + + float *h_A, *h_B, *h_C; + float *h_ref; + + if(argc<2) { + fprintf(stderr,"Usage: %s Width\n",argv[0]); + exit(1); + } + + n=atoi(argv[1]); + + if(n<1) { + fprintf(stderr,"Error Width=%d, must be > 0\n",n); + exit(1); + } + + size_t nbytes = n * n * sizeof(float); + + h_A = (float *) malloc(nbytes); + h_B = (float *) malloc(nbytes); + h_C = (float *) malloc(nbytes); + + h_ref = (float *) malloc(nbytes); + + memset(h_C, 0, nbytes); + memset(h_ref, 0, nbytes); + + for(int y = 0; y < n; y++){ + for(int x = 0; x < n; x++) { + h_A[y * n + x]=(float)(((y + 1) * n + x + 1)/n); + h_B[y * n + x]=(float)(((y + 1) * n + x + 1)/n); + } + } + + float flops; + float *d_A, *d_B, *d_C; + + // CUDA grid management + dim3 threads(NUM_THREADS, NUM_THREADS); + dim3 blocks(n/NUM_THREADS,n/NUM_THREADS); + + cudaMalloc(&d_A, nbytes); + cudaMalloc(&d_B, nbytes); + cudaMalloc(&d_C, nbytes); + + cudaMemcpy(d_A, h_A, nbytes, cudaMemcpyHostToDevice); + cudaMemcpy(d_B, h_B, nbytes, cudaMemcpyHostToDevice); + + // cudaGetLastError call to reset previous CUDA errors + // INSERT CODE + cudaError_t mycudaerror ; + mycudaerror = cudaGetLastError() ; + + // Create start and stop CUDA events + // INSERT CODE + cudaEvent_t start, stop; + + cudaEventCreate(&start); + cudaEventCreate(&stop); + + cudaEventRecord(start); + + // kernel launch + MatrixMulKernel<<<blocks, threads>>>(d_A, d_B, d_C, n); + + // event record, synchronization, elapsed time and destruction + // INSERT CODE + cudaEventRecord(stop); + cudaEventSynchronize(stop); + + // device synchronization and cudaGetLastError call + // INSERT CODE + mycudaerror = cudaGetLastError() ; + if(mycudaerror != cudaSuccess) { + fprintf(stderr,"%s\n",cudaGetErrorString(mycudaerror)) ; + exit(1); + } + + float elapsed; + cudaEventElapsedTime(&elapsed, start, stop); + elapsed = elapsed/1000.f; // convert to seconds + + // calculate Mflops + // INSERT CODE + printf("Kernel elapsed time %fs \n", elapsed); + flops = 2.*n*n*n ; + flops = flops/(1.e9*elapsed) ; + printf("Gflops: %f\n", flops); + + // copy back results from device + cudaMemcpy(h_C, d_C, nbytes, cudaMemcpyDeviceToHost); + + // free memory on device + cudaFree(d_A); + cudaFree(d_B); + cudaFree(d_C); + + cudaEventDestroy(start); + cudaEventDestroy(stop); + + matrixMulHost(h_A, h_B, h_ref, n); + + int errCnt = 0; + for(int y = 0; y < n; y++){ + for(int x = 0; x < n; x++) { + float it = h_ref[y * n + x]; + if(fabs(it - h_C[y * n + x]) > epsilon * it) { + printf("failing x=%d, y=%d: %f!=%f \n", x, y, it, h_C[y * n + x]); + errCnt++; + } + } + } + + if(errCnt==0) + printf("\nTEST PASSED\n"); + else + printf("\n\nTEST FAILED: number of errors: %d\n", errCnt); + +} diff --git a/cuda/debug/solutions/matrix_mul_shared.cu b/cuda/debug/solutions/matrix_mul_shared.cu new file mode 100644 index 0000000000000000000000000000000000000000..129dde71bf4673ed6eb90a17ed66877eb9d8f195 --- /dev/null +++ b/cuda/debug/solutions/matrix_mul_shared.cu @@ -0,0 +1,239 @@ +#include <stdio.h> +#include <assert.h> +#define epsilon (float)1e-5 + +#define NUM_THREADS 16 + +void matrixMulHost(float* A, float* B, float* C, int n) { + for (int i = 0; i < n; ++i) { + for (int j = 0; j < n; ++j) { + float pvalue = 0; + for (int k = 0; k < n; ++k) { + float a = A[i * n + k]; + float b = B[k * n + j]; + pvalue += a * b; + } + C[i * n + j] = pvalue; + } + } +} + +__global__ void MatrixMulKernel(float* A, float* B, float* C, int n) { + + // INSERT CODE: index definitions + int j = blockDim.x * blockIdx.x + threadIdx.x; + int i = blockDim.y * blockIdx.y + threadIdx.y; + + // INSERT CODE: fastest index to threads along x direction + if (i < n && j < n) { + float pvalue = 0; + for (int k = 0; k < n; ++k) { + float a = A[i * n + k]; + float b = B[k * n + j]; + pvalue += a * b; + } + C[i * n + j] = pvalue; + } +} + +__device__ int get_offset (int idx_i, int idx_j, int N) { + return idx_i * N * NUM_THREADS + idx_j * NUM_THREADS; +} + +__global__ void MatrixMulKernelShared(float *A, float *B, float *C, int N) { + + // Shared memory used to store Asub and Bsub respectively + __shared__ float As[NUM_THREADS][NUM_THREADS]; + __shared__ float Bs[NUM_THREADS][NUM_THREADS]; + + // Block row and column + int ib = blockIdx.y; + int jb = blockIdx.x; + + // Thread row and column within Csub + int it = threadIdx.y; + int jt = threadIdx.x; + + int a_offset, b_offset, c_offset; + + // Each thread computes one element of Csub + // by accumulating results into Cvalue + float Cvalue = 0.0f; + + // Loop over all the sub-matrices of A and B that are + // required to compute Csub. + // Multiply each pair of sub-matrices together + // and accumulate the results. + + for (int kb = 0; kb < (N / NUM_THREADS); ++kb) { + + // Get the starting address (a_offset) of Asub + // (sub-matrix of A of dimension NB x NB) + // Asub is located i_block sub-matrices to the right and + // k_block sub-matrices down from the upper-left corner of A + a_offset = get_offset (ib, kb, N); + // Get the starting address (b_offset) of Bsub + b_offset = get_offset (kb, jb, N); + + // Load Asub and Bsub from device memory to shared memory + // Each thread loads one element of each sub-matrix + // ---------------- // + // INSERT CUDA CODE // + // ---------------- // + As[it][jt] = A[a_offset + it*N + jt]; + Bs[it][jt] = B[b_offset + it*N + jt]; + + // Synchronize to make sure the sub-matrices are loaded + // before starting the computation + // ---------------- // + // INSERT CUDA CODE // + // ---------------- // + __syncthreads(); + + // Multiply As and Bs together + for (int k = 0; k < NUM_THREADS; ++k) { + // ---------------- // + // INSERT CUDA CODE // + // ---------------- // + Cvalue += As[it][k] * Bs[k][jt]; + } + // Synchronize to make sure that the preceding + // computation is done before loading two new + // sub-matrices of A and B in the next iteration + // ---------------- // + // INSERT CUDA CODE // + // ---------------- // + __syncthreads(); + } + + c_offset = get_offset (ib, jb, N); + // Each thread block computes one sub-matrix Csub of C + // ---------------- // + // INSERT CUDA CODE // + // ---------------- // + C[c_offset + it * N + jt] = Cvalue; + +} + +int main(int argc, char** argv) { + + int n; + + float *h_A, *h_B, *h_C; + float *h_ref; + + if(argc<2) { + fprintf(stderr,"Usage: %s Width\n",argv[0]); + exit(1); + } + + n=atoi(argv[1]); + + if(n<1) { + fprintf(stderr,"Error Width=%d, must be > 0\n",n); + exit(1); + } + + size_t nbytes = n * n * sizeof(float); + + h_A = (float *) malloc(nbytes); + h_B = (float *) malloc(nbytes); + h_C = (float *) malloc(nbytes); + + h_ref = (float *) malloc(nbytes); + + memset(h_C, 0, nbytes); + memset(h_ref, 0, nbytes); + + for(int y = 0; y < n; y++){ + for(int x = 0; x < n; x++) { + h_A[y * n + x]=(float)(((y + 1) * n + x + 1)/n); + h_B[y * n + x]=(float)(((y + 1) * n + x + 1)/n); + } + } + + float flops; + float *d_A, *d_B, *d_C; + + // CUDA grid management + dim3 threads(NUM_THREADS, NUM_THREADS); + dim3 blocks(n/NUM_THREADS,n/NUM_THREADS); + + cudaMalloc(&d_A, nbytes); + cudaMalloc(&d_B, nbytes); + cudaMalloc(&d_C, nbytes); + + cudaMemcpy(d_A, h_A, nbytes, cudaMemcpyHostToDevice); + cudaMemcpy(d_B, h_B, nbytes, cudaMemcpyHostToDevice); + + // cudaGetLastError call to reset previous CUDA errors + // INSERT CODE + cudaError_t mycudaerror ; + mycudaerror = cudaGetLastError() ; + + // Create start and stop CUDA events + // INSERT CODE + cudaEvent_t start, stop; + + cudaEventCreate(&start); + cudaEventCreate(&stop); + + cudaEventRecord(start); + + // kernel launch + MatrixMulKernelShared<<<blocks, threads>>>(d_A, d_B, d_C, n); + + // event record, synchronization, elapsed time and destruction + // INSERT CODE + cudaEventRecord(stop); + cudaEventSynchronize(stop); + + // device synchronization and cudaGetLastError call + // INSERT CODE + mycudaerror = cudaGetLastError() ; + if(mycudaerror != cudaSuccess) { + fprintf(stderr,"%s\n",cudaGetErrorString(mycudaerror)) ; + exit(1); + } + + float elapsed; + cudaEventElapsedTime(&elapsed, start, stop); + elapsed = elapsed/1000.f; // convert to seconds + + // calculate Mflops + // INSERT CODE + printf("Kernel elapsed time %fs \n", elapsed); + flops = 2.*n*n*n ; + flops = flops/(1.e9*elapsed) ; + printf("Gflops: %f\n", flops); + + // copy back results from device + cudaMemcpy(h_C, d_C, nbytes, cudaMemcpyDeviceToHost); + + // free memory on device + cudaFree(d_A); + cudaFree(d_B); + cudaFree(d_C); + + cudaEventDestroy(start); + cudaEventDestroy(stop); + + matrixMulHost(h_A, h_B, h_ref, n); + + int errCnt = 0; + for(int y = 0; y < n; y++){ + for(int x = 0; x < n; x++) { + float it = h_ref[y * n + x]; + if(fabs(it - h_C[y * n + x]) > epsilon * it) { + printf("failing x=%d, y=%d: %f!=%f \n", x, y, it, h_C[y * n + x]); + errCnt++; + } + } + } + + if(errCnt==0) + printf("\nTEST PASSED\n"); + else + printf("\n\nTEST FAILED: number of errors: %d\n", errCnt); + +} diff --git a/cuda/debug/solutions/matrix_transpose.cu b/cuda/debug/solutions/matrix_transpose.cu new file mode 100644 index 0000000000000000000000000000000000000000..c3ed67f58133fe5b63bd97d0aa31b73a576afba3 --- /dev/null +++ b/cuda/debug/solutions/matrix_transpose.cu @@ -0,0 +1,252 @@ + +#define SIZE 512 +#define NUM_THREADS 32 +#define NUM_REPS 3 + +#include <stdio.h> + + +__global__ void copy(float *odata, float* idata, int width, int height) +{ + int i = blockIdx.x * blockDim.x + threadIdx.x; + int j = blockIdx.y * blockDim.y + threadIdx.y; + + int index = i + j * width; + + odata[index] = idata[index]; +} + + +// transpose naive + +__global__ void transposeNaive(float *odata, float* idata, int width, int height) +{ + int xIndex = blockIdx.x * NUM_THREADS + threadIdx.x; + int yIndex = blockIdx.y * NUM_THREADS + threadIdx.y; + + int index_in = xIndex + width * yIndex; + int index_out = yIndex + height * xIndex; + + odata[index_out] = idata[index_in]; +} + +// transpose in shared memory + +__global__ void transposeShared(float *odata, float *idata, int width, int height) +{ + __shared__ float tile[NUM_THREADS][NUM_THREADS]; + + int xIndex = blockIdx.x * NUM_THREADS + threadIdx.x; + int yIndex = blockIdx.y * NUM_THREADS + threadIdx.y; + + int index_in = xIndex + (yIndex)*width; + + xIndex = blockIdx.y * NUM_THREADS + threadIdx.x; + yIndex = blockIdx.x * NUM_THREADS + threadIdx.y; + + int index_out = xIndex + (yIndex)*height; + + tile[threadIdx.y][threadIdx.x] = idata[index_in]; + + __syncthreads(); + + odata[index_out] = tile[threadIdx.x][threadIdx.y]; +} + +// transpose in shared memory with no bank conflicts + +__global__ void transposeNoBankConflicts(float *odata, float *idata, int width, int height) +{ + __shared__ float tile[NUM_THREADS][NUM_THREADS+1]; + + int xIndex = blockIdx.x * NUM_THREADS + threadIdx.x; + int yIndex = blockIdx.y * NUM_THREADS + threadIdx.y; + + int index_in = xIndex + (yIndex)*width; + + xIndex = blockIdx.y * NUM_THREADS + threadIdx.x; + yIndex = blockIdx.x * NUM_THREADS + threadIdx.y; + + int index_out = xIndex + (yIndex)*height; + + tile[threadIdx.y][threadIdx.x] = idata[index_in]; + + __syncthreads(); + + odata[index_out] = tile[threadIdx.x][threadIdx.y]; +} + + +// transpose host + +void transposeHost(float* gold, float* idata, const int m, const int n) +{ + for( int y = 0; y < n; ++y) { + for( int x = 0; x < m; ++x) { + gold[(x * n) + y] = idata[(y * m) + x]; + } + } +} + +int main( int argc, char** argv) +{ + int m = SIZE; + int n = SIZE; + + if (m%NUM_THREADS != 0 || n%NUM_THREADS != 0) { + printf("\nMatrix size must be integral multiple of tile size\nExiting...\n\n"); + printf("FAILED\n\n"); + return 1; + } + + const int mem_size = sizeof(float) * m*n; + + // allocate host memory + float *h_idata = (float*) malloc(mem_size); + float *h_odata = (float*) malloc(mem_size); + float *transposeGold = (float *) malloc(mem_size); + + // allocate device memory + float *d_idata, *d_odata; + cudaMalloc( (void**) &d_idata, mem_size); + cudaMalloc( (void**) &d_odata, mem_size); + + // initalize host data + for( int i = 0; i < (m*n); ++i) + h_idata[i] = (float) i; + + // copy host data to device + cudaMemcpy(d_idata, h_idata, mem_size, cudaMemcpyHostToDevice); + + // Compute reference transpose solution + transposeHost(transposeGold, h_idata, m, n); + + // print out common data for all kernels + printf("\nMatrix size: %dx%d (%dx%d tiles), tile size: %dx%d, block size: %dx%d\n\n", + m, n, m/NUM_THREADS, n/NUM_THREADS, NUM_THREADS, NUM_THREADS, NUM_THREADS, NUM_THREADS); + printf ("mem_size %d \n", mem_size); + + // execution configuration parameters + dim3 threads(NUM_THREADS, NUM_THREADS); + dim3 grid(m/NUM_THREADS, n/NUM_THREADS); + + // timing + float etime, bw; + + // CUDA events + cudaEvent_t start, stop; + + // initialize events + cudaEventCreate(&start); + cudaEventCreate(&stop); + + cudaError_t err_cuda; + err_cuda = cudaGetLastError() ; + + + // copy kernel + + cudaEventRecord(start); + for (int i=0; i < NUM_REPS; i++) { + copy<<<grid, threads>>>(d_odata, d_idata, m, n); + } + cudaEventRecord(stop); + cudaEventSynchronize(stop); + + err_cuda = cudaGetLastError() ; + + if(err_cuda != cudaSuccess) { + fprintf(stderr,"%s\n",cudaGetErrorString(err_cuda)) ; + } + + cudaEventElapsedTime(&etime, start, stop); + + bw = 2.0f * 1000.0f * mem_size/(1024*1024*1024)/(etime/NUM_REPS); + + printf("%-20s time[ms] %.10e BW[GB/s] %.5f\n", "copy kernel", etime/NUM_REPS, bw); + + + // transpose naive + + cudaEventRecord(start); + for (int i=0; i < NUM_REPS; i++) { + transposeNaive<<<grid, threads>>>(d_odata, d_idata, m, n); + } + cudaEventRecord(stop); + cudaEventSynchronize(stop); + + err_cuda = cudaGetLastError() ; + + if(err_cuda != cudaSuccess) { + fprintf(stderr,"%s\n",cudaGetErrorString(err_cuda)) ; + } + + cudaEventElapsedTime(&etime, start, stop); + + bw = 2.0f * 1000.0f * mem_size/(1024*1024*1024)/(etime/NUM_REPS); + + printf("%-20s time[ms] %.10e BW[GB/s] %.5f\n", "transpose naive", etime/NUM_REPS, bw); + + cudaMemcpy(h_odata, d_odata, mem_size, cudaMemcpyDeviceToHost); + + + // transpose shared memory + + cudaEventRecord(start); + for (int i=0; i < NUM_REPS; i++) { + transposeShared<<<grid, threads>>>(d_odata, d_idata, m, n); + } + cudaEventRecord(stop); + cudaEventSynchronize(stop); + + err_cuda = cudaGetLastError() ; + + if(err_cuda != cudaSuccess) { + fprintf(stderr,"%s\n",cudaGetErrorString(err_cuda)) ; + } + + cudaEventElapsedTime(&etime, start, stop); + + bw = 2.0f * 1000.0f * mem_size/(1024*1024*1024)/(etime/NUM_REPS); + + printf("%-20s time[ms] %.10e BW[GB/s] %.5f\n", "transpose shared", etime/NUM_REPS, bw); + + cudaMemcpy(h_odata, d_odata, mem_size, cudaMemcpyDeviceToHost); + + + // transpose shared memory no bank conflicts + + cudaEventRecord(start); + for (int i=0; i < NUM_REPS; i++) { + transposeNoBankConflicts<<<grid, threads>>>(d_odata, d_idata, m, n); + } + cudaEventRecord(stop); + cudaEventSynchronize(stop); + + err_cuda = cudaGetLastError() ; + + if(err_cuda != cudaSuccess) { + fprintf(stderr,"%s\n",cudaGetErrorString(err_cuda)) ; + } + + cudaEventElapsedTime(&etime, start, stop); + + bw = 2.0f * 1000.0f * mem_size/(1024*1024*1024)/(etime/NUM_REPS); + + printf("%-20s time[ms] %.10e BW[GB/s] %.5f\n", "SM no bank conflicts", etime/NUM_REPS, bw); + + cudaMemcpy(h_odata, d_odata, mem_size, cudaMemcpyDeviceToHost); + + // cleanup + free(h_idata); + free(h_odata); + free(transposeGold); + cudaFree(d_idata); + cudaFree(d_odata); + + cudaEventDestroy(start); + cudaEventDestroy(stop); + + return 0; + +}