From 7119202ed302e93d0b3600ae738fe16ab81162eb Mon Sep 17 00:00:00 2001
From: Simone Bna <s.bn@cineca.it>
Date: Thu, 19 Nov 2020 20:00:05 +0100
Subject: [PATCH] ENH: switched from struct to class, modified some APIs in
 AmgXCSRMatrix

---
 src/AmgXCSRMatrix.H  | 105 +++++++++++++++++++++++---------------
 src/AmgXCSRMatrix.cu | 119 ++++++++++++++++++++-----------------------
 2 files changed, 120 insertions(+), 104 deletions(-)

diff --git a/src/AmgXCSRMatrix.H b/src/AmgXCSRMatrix.H
index 82268d8..f015567 100644
--- a/src/AmgXCSRMatrix.H
+++ b/src/AmgXCSRMatrix.H
@@ -22,52 +22,75 @@
 
 #pragma once
 
-struct AmgXCSRMatrix
+class AmgXCSRMatrix
 {
-    // CSR device data for AmgX matrix
-    int *colIndices;
-    int *rowOffsets;
-    double *values;
+    public:
 
-    // Temporary storage for the permutation
-    double *valuesTmp;
+        // Perform the conversion between an LDU matrix and a CSR matrix,
+        // possibly distributed
+        void setValuesLDU
+        (
+            int nrows,
+            int nInternalFaces,
+            int diagIndexGlobal,
+            int lowOffGlobal,
+            int uppOffGlobal,
+            const int *upperAddr,
+            const int *lowerAddr,
+            const int extNnz,
+            const int *extRow,
+            const int *extCol,
+            const double *diagVals,
+            const double *upperVals,
+            const double *lowerVals,
+            const double *extVals
+        );
 
-    // Permutation array to convert between LDU and CSR
-    // Also possibly encodes sorting of columns
-    int *ldu2csrPerm;
+        // Updates the CSR matrix values based on LDU matrix values, permuted with
+        // the previously determined permutation
+        void updateValues
+        (
+            const int nrows,
+            const int nInternalFaces,
+            const int extNnz,
+            const double *diagVal,
+            const double *uppVal,
+            const double *lowVal,
+            const double *extVals
+        );
 
-    // Perform the conversion between an LDU matrix and a CSR matrix,
-    // possibly distributed
-    void convertLDU2CSR(
-        int nrows,
-        int nInternalFaces,
-        int diagIndexGlobal,
-        int lowOffGlobal,
-        int uppOffGlobal,
-        const int *upperAddr,
-        const int *lowerAddr,
-        const int extNnz,
-        const int *extRow,
-        const int *extCol,
-        const double *diagVals,
-        const double *upperVals,
-        const double *lowerVals,
-        const double *extVals);
+        const int* getColIndices() const
+        {
+            return colIndices;
+        }
 
-    // Updates the CSR matrix values based on LDU matrix values, permuted with
-    // the previously determined permutation
-    void updateValues(
-        const int nrows,
-        const int nInternalFaces,
-        const int extNnz,
-        const double *diagVal,
-        const double *uppVal,
-        const double *lowVal,
-        const double *extVals);
+        const int* getRowOffsets() const
+        {
+            return rowOffsets;
+        }
 
-    // Discard elements of the matrix structure
-    void discardStructure();
+        const double* getValues() const
+        {
+            return values;
+        }
 
-    // Finalise all data
-    void finalise();
+        // Discard elements of the matrix structure
+        void discardStructure();
+
+        // Finalise all data
+        void finalise();
+
+    private:
+        // CSR device data for AmgX matrix
+        int *colIndices;
+        int *rowOffsets;
+        double *values;
+
+        // Temporary storage for the permutation
+        double *valuesTmp;
+        
+        // Permutation array to convert between LDU and CSR
+        // Also possibly encodes sorting of columns
+        int *ldu2csrPerm;
 };
+
diff --git a/src/AmgXCSRMatrix.cu b/src/AmgXCSRMatrix.cu
index 6db5131..066a1cb 100644
--- a/src/AmgXCSRMatrix.cu
+++ b/src/AmgXCSRMatrix.cu
@@ -39,14 +39,16 @@
     }
 
 // Offset the column indices to transform from local to global
-__global__ void localToGlobalColIndices(
+__global__ void localToGlobalColIndices
+(
     const int nnz,
     const int nrows,
     const int nInternalFaces,
     const int diagIndexGlobal,
     const int lowOffGlobal,
     const int uppOffGlobal,
-    int *colIndices)
+    int *colIndices
+)
 {
     // Offset by global offset
     for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < nnz; i += blockDim.x * gridDim.x)
@@ -72,94 +74,51 @@ __global__ void localToGlobalColIndices(
 }
 
 // Apply the pre-existing permutation to the values [and columns]
-__global__ void applyPermutation(
+__global__ void applyPermutation
+(
     const int totalNnz,
     const int *perm,
     const int *colIndicesTmp,
     const double *valuesTmp,
     int *colIndices,
     double *values,
-    bool valuesOnly)
+    bool valuesOnly
+)
 {
     // Permute col indices and values
     for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < totalNnz; i += blockDim.x * gridDim.x)
     {
-        // if ( i<nnz )
-        {
-          int p = perm[i];
+        int p = perm[i];
 
-          // In the values only case the column indices and row offsets remain fixed so no update
-          if (!valuesOnly)
-          {
+        // In the values only case the column indices and row offsets remain fixed so no update
+        if (!valuesOnly)
+        {
             colIndices[i] = colIndicesTmp[p];
-          }
-
-          values[i] = valuesTmp[p];
         }
+
+        values[i] = valuesTmp[p];
     }
 }
 
 // Flatten the row indices into the row offsets
-__global__ void createRowOffsets(
+__global__ void createRowOffsets
+(
     int nnz,
     int nrows, 
     int *rowIndices,
-    int *rowOffsets)
+    int *rowOffsets
+)
 {
     for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < nnz; i += blockDim.x * gridDim.x)
     {
-      if ( i<nnz )  
-      {
         int j = rowIndices[i];
-        if ( j<nrows+1 )
-        {
-          atomicAdd(&rowOffsets[j], 1);
-        }
-        else
-        {
-          printf( " - Loop interno :: %d \n", j  );
-        }
-      }
-      else
-      {
-        printf( " - Loop esterno :: %d \n", i  );
-      }
+        atomicAdd(&rowOffsets[j], 1);
     }
 }
 
-// Updates the values based on the previously determined permutation
-void AmgXCSRMatrix::updateValues(
-    const int nrows,
-    const int nInternalFaces,
-    const int extNnz,
-    const double *diagVal,
-    const double *uppVal,
-    const double *lowVal,
-    const double *extVal)
-{
-    // Determine the local non-zeros from the internal faces
-    int localNnz = nrows + 2 * nInternalFaces;
-
-    // Add external non-zeros (communicated halo entries)
-    int totalNnz = localNnz + extNnz;
-
-    // Copy the values in [ diag, upper, lower, (external) ]
-    CHECK(cudaMemcpy(valuesTmp, diagVal, sizeof(double) * nrows, cudaMemcpyDefault));
-    CHECK(cudaMemcpy(valuesTmp + nrows, uppVal, sizeof(double) * nInternalFaces, cudaMemcpyDefault));
-    CHECK(cudaMemcpy(valuesTmp + nrows + nInternalFaces, lowVal, sizeof(double) * nInternalFaces, cudaMemcpyDefault));
-    if (extNnz > 0)
-    {
-        CHECK(cudaMemcpy(valuesTmp + localNnz, extVal, sizeof(double) * extNnz, cudaMemcpyDefault));
-    }
-
-    constexpr int nthreads = 128;
-    int nblocks = totalNnz / nthreads + 1;
-    applyPermutation<<<nblocks, nthreads>>>(totalNnz, ldu2csrPerm, nullptr, valuesTmp, nullptr, values, true);
-    CHECK(cudaStreamSynchronize(0));
-}
-
 // Perform the conversion between an LDU matrix and a CSR matrix, possibly distributed
-void AmgXCSRMatrix::convertLDU2CSR(
+void AmgXCSRMatrix::setValuesLDU
+(
     int nrows,
     int nInternalFaces,
     int diagIndexGlobal,
@@ -173,7 +132,8 @@ void AmgXCSRMatrix::convertLDU2CSR(
     const double *diagVals,
     const double *upperVals,
     const double *lowerVals,
-    const double *extVals)
+    const double *extVals
+)
 {
     // Determine the local non-zeros from the internal faces
     int localNnz = nrows + 2 * nInternalFaces;
@@ -264,6 +224,39 @@ void AmgXCSRMatrix::convertLDU2CSR(
     CHECK(cudaFree(colIndicesTmp));
 }
 
+// Updates the values based on the previously determined permutation
+void AmgXCSRMatrix::updateValues
+(
+    const int nrows,
+    const int nInternalFaces,
+    const int extNnz,
+    const double *diagVal,
+    const double *uppVal,
+    const double *lowVal,
+    const double *extVal
+)
+{
+    // Determine the local non-zeros from the internal faces
+    int localNnz = nrows + 2 * nInternalFaces;
+
+    // Add external non-zeros (communicated halo entries)
+    int totalNnz = localNnz + extNnz;
+
+    // Copy the values in [ diag, upper, lower, (external) ]
+    CHECK(cudaMemcpy(valuesTmp, diagVal, sizeof(double) * nrows, cudaMemcpyDefault));
+    CHECK(cudaMemcpy(valuesTmp + nrows, uppVal, sizeof(double) * nInternalFaces, cudaMemcpyDefault));
+    CHECK(cudaMemcpy(valuesTmp + nrows + nInternalFaces, lowVal, sizeof(double) * nInternalFaces, cudaMemcpyDefault));
+    if (extNnz > 0)
+    {
+        CHECK(cudaMemcpy(valuesTmp + localNnz, extVal, sizeof(double) * extNnz, cudaMemcpyDefault));
+    }
+
+    constexpr int nthreads = 128;
+    int nblocks = totalNnz / nthreads + 1;
+    applyPermutation<<<nblocks, nthreads>>>(totalNnz, ldu2csrPerm, nullptr, valuesTmp, nullptr, values, true);
+    CHECK(cudaStreamSynchronize(0));
+}
+
 // XXX Should implement an early abandonment of the
 // unnecessary data for capacity optimisation
 void AmgXCSRMatrix::discardStructure()
-- 
GitLab