From 7119202ed302e93d0b3600ae738fe16ab81162eb Mon Sep 17 00:00:00 2001 From: Simone Bna <s.bn@cineca.it> Date: Thu, 19 Nov 2020 20:00:05 +0100 Subject: [PATCH] ENH: switched from struct to class, modified some APIs in AmgXCSRMatrix --- src/AmgXCSRMatrix.H | 105 +++++++++++++++++++++++--------------- src/AmgXCSRMatrix.cu | 119 ++++++++++++++++++++----------------------- 2 files changed, 120 insertions(+), 104 deletions(-) diff --git a/src/AmgXCSRMatrix.H b/src/AmgXCSRMatrix.H index 82268d8..f015567 100644 --- a/src/AmgXCSRMatrix.H +++ b/src/AmgXCSRMatrix.H @@ -22,52 +22,75 @@ #pragma once -struct AmgXCSRMatrix +class AmgXCSRMatrix { - // CSR device data for AmgX matrix - int *colIndices; - int *rowOffsets; - double *values; + public: - // Temporary storage for the permutation - double *valuesTmp; + // Perform the conversion between an LDU matrix and a CSR matrix, + // possibly distributed + void setValuesLDU + ( + int nrows, + int nInternalFaces, + int diagIndexGlobal, + int lowOffGlobal, + int uppOffGlobal, + const int *upperAddr, + const int *lowerAddr, + const int extNnz, + const int *extRow, + const int *extCol, + const double *diagVals, + const double *upperVals, + const double *lowerVals, + const double *extVals + ); - // Permutation array to convert between LDU and CSR - // Also possibly encodes sorting of columns - int *ldu2csrPerm; + // Updates the CSR matrix values based on LDU matrix values, permuted with + // the previously determined permutation + void updateValues + ( + const int nrows, + const int nInternalFaces, + const int extNnz, + const double *diagVal, + const double *uppVal, + const double *lowVal, + const double *extVals + ); - // Perform the conversion between an LDU matrix and a CSR matrix, - // possibly distributed - void convertLDU2CSR( - int nrows, - int nInternalFaces, - int diagIndexGlobal, - int lowOffGlobal, - int uppOffGlobal, - const int *upperAddr, - const int *lowerAddr, - const int extNnz, - const int *extRow, - const int *extCol, - const double *diagVals, - const double *upperVals, - const double *lowerVals, - const double *extVals); + const int* getColIndices() const + { + return colIndices; + } - // Updates the CSR matrix values based on LDU matrix values, permuted with - // the previously determined permutation - void updateValues( - const int nrows, - const int nInternalFaces, - const int extNnz, - const double *diagVal, - const double *uppVal, - const double *lowVal, - const double *extVals); + const int* getRowOffsets() const + { + return rowOffsets; + } - // Discard elements of the matrix structure - void discardStructure(); + const double* getValues() const + { + return values; + } - // Finalise all data - void finalise(); + // Discard elements of the matrix structure + void discardStructure(); + + // Finalise all data + void finalise(); + + private: + // CSR device data for AmgX matrix + int *colIndices; + int *rowOffsets; + double *values; + + // Temporary storage for the permutation + double *valuesTmp; + + // Permutation array to convert between LDU and CSR + // Also possibly encodes sorting of columns + int *ldu2csrPerm; }; + diff --git a/src/AmgXCSRMatrix.cu b/src/AmgXCSRMatrix.cu index 6db5131..066a1cb 100644 --- a/src/AmgXCSRMatrix.cu +++ b/src/AmgXCSRMatrix.cu @@ -39,14 +39,16 @@ } // Offset the column indices to transform from local to global -__global__ void localToGlobalColIndices( +__global__ void localToGlobalColIndices +( const int nnz, const int nrows, const int nInternalFaces, const int diagIndexGlobal, const int lowOffGlobal, const int uppOffGlobal, - int *colIndices) + int *colIndices +) { // Offset by global offset for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < nnz; i += blockDim.x * gridDim.x) @@ -72,94 +74,51 @@ __global__ void localToGlobalColIndices( } // Apply the pre-existing permutation to the values [and columns] -__global__ void applyPermutation( +__global__ void applyPermutation +( const int totalNnz, const int *perm, const int *colIndicesTmp, const double *valuesTmp, int *colIndices, double *values, - bool valuesOnly) + bool valuesOnly +) { // Permute col indices and values for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < totalNnz; i += blockDim.x * gridDim.x) { - // if ( i<nnz ) - { - int p = perm[i]; + int p = perm[i]; - // In the values only case the column indices and row offsets remain fixed so no update - if (!valuesOnly) - { + // In the values only case the column indices and row offsets remain fixed so no update + if (!valuesOnly) + { colIndices[i] = colIndicesTmp[p]; - } - - values[i] = valuesTmp[p]; } + + values[i] = valuesTmp[p]; } } // Flatten the row indices into the row offsets -__global__ void createRowOffsets( +__global__ void createRowOffsets +( int nnz, int nrows, int *rowIndices, - int *rowOffsets) + int *rowOffsets +) { for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < nnz; i += blockDim.x * gridDim.x) { - if ( i<nnz ) - { int j = rowIndices[i]; - if ( j<nrows+1 ) - { - atomicAdd(&rowOffsets[j], 1); - } - else - { - printf( " - Loop interno :: %d \n", j ); - } - } - else - { - printf( " - Loop esterno :: %d \n", i ); - } + atomicAdd(&rowOffsets[j], 1); } } -// Updates the values based on the previously determined permutation -void AmgXCSRMatrix::updateValues( - const int nrows, - const int nInternalFaces, - const int extNnz, - const double *diagVal, - const double *uppVal, - const double *lowVal, - const double *extVal) -{ - // Determine the local non-zeros from the internal faces - int localNnz = nrows + 2 * nInternalFaces; - - // Add external non-zeros (communicated halo entries) - int totalNnz = localNnz + extNnz; - - // Copy the values in [ diag, upper, lower, (external) ] - CHECK(cudaMemcpy(valuesTmp, diagVal, sizeof(double) * nrows, cudaMemcpyDefault)); - CHECK(cudaMemcpy(valuesTmp + nrows, uppVal, sizeof(double) * nInternalFaces, cudaMemcpyDefault)); - CHECK(cudaMemcpy(valuesTmp + nrows + nInternalFaces, lowVal, sizeof(double) * nInternalFaces, cudaMemcpyDefault)); - if (extNnz > 0) - { - CHECK(cudaMemcpy(valuesTmp + localNnz, extVal, sizeof(double) * extNnz, cudaMemcpyDefault)); - } - - constexpr int nthreads = 128; - int nblocks = totalNnz / nthreads + 1; - applyPermutation<<<nblocks, nthreads>>>(totalNnz, ldu2csrPerm, nullptr, valuesTmp, nullptr, values, true); - CHECK(cudaStreamSynchronize(0)); -} - // Perform the conversion between an LDU matrix and a CSR matrix, possibly distributed -void AmgXCSRMatrix::convertLDU2CSR( +void AmgXCSRMatrix::setValuesLDU +( int nrows, int nInternalFaces, int diagIndexGlobal, @@ -173,7 +132,8 @@ void AmgXCSRMatrix::convertLDU2CSR( const double *diagVals, const double *upperVals, const double *lowerVals, - const double *extVals) + const double *extVals +) { // Determine the local non-zeros from the internal faces int localNnz = nrows + 2 * nInternalFaces; @@ -264,6 +224,39 @@ void AmgXCSRMatrix::convertLDU2CSR( CHECK(cudaFree(colIndicesTmp)); } +// Updates the values based on the previously determined permutation +void AmgXCSRMatrix::updateValues +( + const int nrows, + const int nInternalFaces, + const int extNnz, + const double *diagVal, + const double *uppVal, + const double *lowVal, + const double *extVal +) +{ + // Determine the local non-zeros from the internal faces + int localNnz = nrows + 2 * nInternalFaces; + + // Add external non-zeros (communicated halo entries) + int totalNnz = localNnz + extNnz; + + // Copy the values in [ diag, upper, lower, (external) ] + CHECK(cudaMemcpy(valuesTmp, diagVal, sizeof(double) * nrows, cudaMemcpyDefault)); + CHECK(cudaMemcpy(valuesTmp + nrows, uppVal, sizeof(double) * nInternalFaces, cudaMemcpyDefault)); + CHECK(cudaMemcpy(valuesTmp + nrows + nInternalFaces, lowVal, sizeof(double) * nInternalFaces, cudaMemcpyDefault)); + if (extNnz > 0) + { + CHECK(cudaMemcpy(valuesTmp + localNnz, extVal, sizeof(double) * extNnz, cudaMemcpyDefault)); + } + + constexpr int nthreads = 128; + int nblocks = totalNnz / nthreads + 1; + applyPermutation<<<nblocks, nthreads>>>(totalNnz, ldu2csrPerm, nullptr, valuesTmp, nullptr, values, true); + CHECK(cudaStreamSynchronize(0)); +} + // XXX Should implement an early abandonment of the // unnecessary data for capacity optimisation void AmgXCSRMatrix::discardStructure() -- GitLab