From 132c69dbe60fd70ab1de6de8e6140204cbe73d47 Mon Sep 17 00:00:00 2001 From: Simone Bna <s.bn@cineca.it> Date: Wed, 26 May 2021 16:59:39 +0200 Subject: [PATCH] deleted old file --- src/FOAM2CSR.mirco.cu | 515 ------------------------------------------ 1 file changed, 515 deletions(-) delete mode 100644 src/FOAM2CSR.mirco.cu diff --git a/src/FOAM2CSR.mirco.cu b/src/FOAM2CSR.mirco.cu deleted file mode 100644 index d0ac8f2..0000000 --- a/src/FOAM2CSR.mirco.cu +++ /dev/null @@ -1,515 +0,0 @@ -/* - * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER - * DEALINGS IN THE SOFTWARE. - */ - -#include "FOAM2CSR.H" -#include <cuda.h> -#include <cub/cub.cuh> -#include <thrust/scan.h> -#include <thrust/sequence.h> -#include <thrust/execution_policy.h> -#include <unistd.h> - -#define CHECK(call) \ - { \ - cudaError_t e = call; \ - if (e != cudaSuccess) \ - { \ - printf("Cuda failure: '%s %d %s'", \ - __FILE__, __LINE__, cudaGetErrorString(e)); \ - } \ - } - -__global__ void printPerm( - const int nrows, - const int nnz, - int* rowsIdx, - int* colsIdx, - int* colsPerm ) -{ - - if ( threadIdx.x < nrows ) - { - printf( "rows(%d) = %d\n", threadIdx.x, rowsIdx[threadIdx.x] ); - } - - if ( threadIdx.x < nnz ) - { - printf( "colsPerm(%d) = %d\n", threadIdx.x, colsIdx[colsPerm[threadIdx.x]] ); - } - -} - -__global__ void printData( - const int nrows, - const int nnz, - int* rowsIdx, - int* colsIdx, - double* values ) -{ - - if ( threadIdx.x < nrows ) - { - printf( "rows(%d) = %d\n", threadIdx.x, rowsIdx[threadIdx.x] ); - } - - if ( threadIdx.x < nnz ) - { - printf( "cols(%d) = %d\n", threadIdx.x, colsIdx[threadIdx.x] ); - } - - if ( threadIdx.x < nnz ) - { - printf( "values(%d) = %d, %f\n", threadIdx.x, colsIdx[threadIdx.x], (float)values[threadIdx.x] ); - } - -} - -// Offset the column indices to transform from local to global -__global__ void localToGlobalColIndices( - const int nnz, - const int nrows, - const int nInternalFaces, - const int diagIndexGlobal, - const int lowOffGlobal, - const int uppOffGlobal, - int *colIndices) -{ - // Offset by global offset - for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < nnz; i += blockDim.x * gridDim.x) - { - int offset; - if ( i < nnz ) - { - // Depending upon the row, different offsets must be applied - if (i < nrows) - { - offset = diagIndexGlobal; - } - else if (i < nrows + nInternalFaces) - { - offset = uppOffGlobal; - } - else - { - offset = lowOffGlobal; - } - colIndices[i] += offset; - } - } -} - - - -// Offset the column indices to transform from local to global -__global__ void localToGlobalRowIndices( - const int nnz, - const int nrows, - const int nInternalFaces, - const int diagIndexGlobal, - const int lowOffGlobal, - const int uppOffGlobal, - int *rowIndices) -{ - // Offset by global offset - for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < nnz; i += blockDim.x * gridDim.x) - { - int offset; - if ( i < nnz ) - { - // Depending upon the row, different offsets must be applied - if (i < nrows) - { - offset = diagIndexGlobal; - } - else if (i < nrows + nInternalFaces) - { - offset = uppOffGlobal; - } - else - { - offset = lowOffGlobal; - } - - rowIndices[i] += offset - diagIndexGlobal; - } - } -} - - - -// Apply the pre-existing permutation to the values [and columns] -__global__ void applyPermutation( - const int totalNnz, - const int *perm, - const int *colIndicesTmp, - const double *valuesTmp, - int *colIndices, - double *values, - bool valuesOnly) -{ - // Permute col indices and values - for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < totalNnz; i += blockDim.x * gridDim.x) - { - - if ( i < totalNnz ) - { - int p = perm[i]; - - // In the values only case the column indices and row offsets remain fixed so no update - if ( p< totalNnz ) - { - if (!valuesOnly) - { - colIndices[i] = colIndicesTmp[p]; - } - if ( p < totalNnz ) - { - values[i] = valuesTmp[p]; - } - } - } - } -} - -// Flatten the row indices into the row offsets -// >>> constexpr int nthreads = 128; -// >>> int nblocks = totalNnz / nthreads + 1; -// >>> createRowOffsets<<<nblocks, nthreads>>>(totalNnz, nrows+1, rowIndices, rowOffsets); -__global__ void createRowOffsets( - int nnz, - int nrows, - int *rowIndices, - int *rowOffsets) -{ - -/* - for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < nnz; i += blockDim.x * gridDim.x) - { - int k = rowIndices[i]; - if ( k >= nrows ) { - printf( "Sforo il numero di linee\n" ); - printf( "Memory Access :: %d, %d, %d, %d,%d, %d, %d, %d, %d\n", i, k, nnz, nrows, threadIdx.x, blockIdx.x, blockDim.x, gridDim.x, rowIndices[i] ); - } - - if ( i >= nnz || i == 0 ) { - printf( "Memory Access :: %d, %d, %d, %d,%d, %d, %d, %d, %d\n", i, k, nnz, nrows, threadIdx.x, blockIdx.x, blockDim.x, gridDim.x, rowIndices[i] ); - } - atomicAdd(&rowOffsets[k], 1); - } -*/ - - for ( int i = threadIdx.x + blockIdx.x * blockDim.x; i<nnz; i += blockDim.x * gridDim.x) - { - if ( i < nnz ) - { - int k = rowIndices[i]; - if ( k < nrows+1 ) - { - atomicAdd(&rowOffsets[k], 1); - } - } - } - - -} - -// Updates the values based on the previously determined permutation -void FOAM2CSR::updateValues( - const int nrows, - const int nInternalFaces, - const int extNnz, - const double *diagVal, - const double *uppVal, - const double *lowVal, - const double *extVal) -{ - - // Determine the local non-zeros from the internal faces - int localNnz = nrows + 2 * nInternalFaces; - - // Add external non-zeros (communicated halo entries) - int totalNnz = localNnz + extNnz; - - // Copy the values in [ diag, upper, lower, (external) ] - CHECK(cudaMemcpy(valuesTmp, diagVal, sizeof(double) * nrows, cudaMemcpyDefault)); - CHECK(cudaMemcpy(valuesTmp + nrows, uppVal, sizeof(double) * nInternalFaces, cudaMemcpyDefault)); - CHECK(cudaMemcpy(valuesTmp + nrows + nInternalFaces, lowVal, sizeof(double) * nInternalFaces, cudaMemcpyDefault)); - if (extNnz > 0) - { - CHECK(cudaMemcpy(valuesTmp + localNnz, extVal, sizeof(double) * extNnz, cudaMemcpyDefault)); - } - - constexpr int nthreads = 128; - int nblocks = totalNnz / nthreads + 1; - applyPermutation<<<nblocks, nthreads>>>(totalNnz, ldu2csrPerm, nullptr, valuesTmp, nullptr, values, true); - CHECK(cudaStreamSynchronize(0)); -} - -// Perform the conversion between an LDU matrix and a CSR matrix, possibly distributed -void FOAM2CSR::convertLDU2CSR( - int nrows, - int nInternalFaces, - int diagIndexGlobal, - int lowOffGlobal, - int uppOffGlobal, - const int *upperAddr, - const int *lowerAddr, - const int extNnz, - const int *extRow, - const int *extCol, - const double *diagVals, - const double *upperVals, - const double *lowerVals, - const double *extVals) -{ - - - // Determine the local non-zeros from the internal faces - int localNnz = nrows + 2 * nInternalFaces; - - // Add external non-zeros (communicated halo entries) - int totalNnz = localNnz + extNnz; - - - if ( diagIndexGlobal > 0 ) - { - sleep(2); - std::cout << " ============================================================== " << std::endl; - std::cout << "P1 :: nrows :: " << nrows << std::endl; - std::cout << "P1 :: nInternalFaces :: " << nInternalFaces << std::endl; - std::cout << "P1 :: extNnz :: " << extNnz << std::endl; - std::cout << "P1 :: diag index global :: " << diagIndexGlobal << std::endl; - std::cout << "P1 :: low offset global :: " << lowOffGlobal << std::endl; - std::cout << "P1 :: upp offset global :: " << uppOffGlobal << std::endl; - std::cout << "P1 :: upperaddr0 :: " << upperAddr[0] << std::endl; - std::cout << "P1 :: loweraddr0 :: " << lowerAddr[0] << std::endl; - if ( extNnz > 0 ){ - std::cout << "P1 :: extRow :: " << extRow[0] << std::endl; - std::cout << "P1 :: extCol :: " << extCol[0] << std::endl; - } - FILE* fid = fopen( "P1_descr.txt", "w" ); - fprintf( fid, "%d, %d, %d, %d, %d, %d\n", nrows, nInternalFaces, diagIndexGlobal, lowOffGlobal, uppOffGlobal, extNnz ); - fclose(fid); - - fid = fopen( "P1_LDU_diag.txt", "w" ); - for ( int i =0; i<nrows; i++ ) fprintf( fid, "%g\n", diagVals[i] ); - fclose(fid); - fid = fopen( "P1_LDU_sym.txt", "w" ); - for ( int i =0; i<nInternalFaces; i++ ) fprintf( fid, "%d, %d, %g, %g\n", - upperAddr[i], lowerAddr[i], upperVals[i], lowerVals[i] ); - fclose(fid); - if ( extNnz > 0 ){ - fid = fopen( "P1_LDU_ext.txt", "w" ); - for ( int i =0; i<extNnz; i++ ) fprintf( fid, "%d, %d, %g\n", - extRow[i], extCol[i], extVals[i] ); - fclose(fid); - } - std::cout << " ============================================================== " << std::endl; - } - else - { - std::cout << " ============================================================== " << std::endl; - std::cout << "P0 :: nrows :: " << nrows << std::endl; - std::cout << "P0 :: nInternalFaces :: " << nInternalFaces << std::endl; - std::cout << "P0 :: extNnz :: " << extNnz << std::endl; - std::cout << "P0 :: diag index global :: " << diagIndexGlobal << std::endl; - std::cout << "P0 :: low offset global :: " << lowOffGlobal << std::endl; - std::cout << "P0 :: upp offset global :: " << uppOffGlobal << std::endl; - std::cout << "P0 :: upperaddr0 :: " << upperAddr[0] << std::endl; - std::cout << "P0 :: loweraddr0 :: " << lowerAddr[0] << std::endl; - if ( extNnz > 0 ) - { - std::cout << "P0 :: extRow :: " << extRow[0] << std::endl; - std::cout << "P0 :: extCol :: " << extCol[0] << std::endl; - } - FILE* fid = fopen( "P0_descr.txt", "w" ); - fprintf( fid, "%d, %d, %d, %d, %d, %d\n", nrows, nInternalFaces, diagIndexGlobal, lowOffGlobal, uppOffGlobal, extNnz ); - fclose(fid); - fid = fopen( "P0_LDU_diag.txt", "w" ); - for ( int i =0; i<nrows; i++ ) fprintf( fid, "%g\n", diagVals[i] ); - fclose(fid); - fid = fopen( "P0_LDU_sym.txt", "w" ); - for ( int i =0; i<nInternalFaces; i++ ) fprintf( fid, "%d, %d, %g, %g\n", - upperAddr[i], lowerAddr[i], upperVals[i], lowerVals[i] ); - fclose(fid); - if ( extNnz > 0 ){ - fid = fopen( "P0_LDU_ext.txt", "w" ); - for ( int i =0; i<extNnz; i++ ) fprintf( fid, "%d, %d, %g\n", - extRow[i], extCol[i], extVals[i] ); - fclose(fid); - } - sleep(3); - } - - - // Generate unpermuted index list [0, ..., totalNnz-1] - int GuardSize=1000000; - int *permTmp; - CHECK(cudaMalloc(&permTmp, sizeof(int) * (totalNnz+GuardSize))); - thrust::sequence(thrust::device, permTmp, permTmp + totalNnz, 0); - - // Fill rowIndicesTmp with [0, ..., n-1], lowerAddr, upperAddr, (extAddr) - int *rowIndicesTmp; - CHECK(cudaMalloc(&rowIndicesTmp, sizeof(int) * (totalNnz+GuardSize))); - CHECK(cudaMemcpy(rowIndicesTmp, permTmp, nrows * sizeof(int), cudaMemcpyDefault)); - CHECK(cudaMemcpy(rowIndicesTmp + nrows, lowerAddr, nInternalFaces * sizeof(int), cudaMemcpyDefault)); - CHECK(cudaMemcpy(rowIndicesTmp + nrows + nInternalFaces, upperAddr, nInternalFaces * sizeof(int), cudaMemcpyDefault)); - if (extNnz > 0) - { - CHECK(cudaMemcpy(rowIndicesTmp + localNnz, extRow, extNnz * sizeof(int), cudaMemcpyDefault)); - } - constexpr int nthreads = 128; - int nblocks = localNnz / nthreads + 1; - // localToGlobalRowIndices<<<nblocks, nthreads>>>(localNnz, nrows, nInternalFaces, diagIndexGlobal, lowOffGlobal, uppOffGlobal, rowIndicesTmp); - - // Make space for the row indices and stored permutation - int *rowIndices; - CHECK(cudaMalloc(&rowIndices, sizeof(int) * (totalNnz+GuardSize))); - CHECK(cudaMalloc(&ldu2csrPerm, sizeof(int) * (totalNnz+GuardSize))); - - // Sort the row indices and store results in the permutation - void *tempStorage = NULL; - size_t tempStorageBytes = 0; - cub::DeviceRadixSort::SortPairs(tempStorage, tempStorageBytes, rowIndicesTmp, rowIndices, permTmp, ldu2csrPerm, totalNnz); - CHECK(cudaMalloc(&tempStorage, tempStorageBytes)); - - - cub::DeviceRadixSort::SortPairs(tempStorage, tempStorageBytes, rowIndicesTmp, rowIndices, permTmp, ldu2csrPerm, totalNnz); - CHECK(cudaFree(permTmp)); - CHECK(cudaFree(tempStorage)); - - // Make space for the row offsets - CHECK(cudaMalloc(&rowOffsets, sizeof(int) * (nrows + 1 + GuardSize))); - CHECK(cudaMemset(rowOffsets, 0, sizeof(int) * (nrows + 1 + GuardSize))); - - // XXX Taking the non zero per row data from the host could be more - // efficient, experiment with this in the future - //cudaMemcpy(rowOffsets, nz_per_row, sizeof(int) * nrows, cudaMemcpyDefault); - //thrust::exclusive_scan(thrust::device, rowOffsets, rowOffsets + nrows + 1, rowOffsets); - - // Convert the row indices into offsets - nblocks = totalNnz / nthreads + 1; - createRowOffsets<<<nblocks, nthreads>>>(totalNnz, nrows+1, rowIndices, rowOffsets); - thrust::exclusive_scan(thrust::device, rowOffsets, rowOffsets + nrows + 1, rowOffsets); - // CHECK(cudaFree(rowIndices)); - // printf( "%d - %d \n", rowOffsets[nrows], totalNnz ); - - - - // Fill valuesTmp with diagVals, upperVals, lowerVals, (extVals) - CHECK(cudaMalloc(&valuesTmp, (totalNnz+GuardSize) * sizeof(double))); - CHECK(cudaMemcpy(valuesTmp, diagVals, nrows * sizeof(double), cudaMemcpyDefault)); - CHECK(cudaMemcpy(valuesTmp + nrows, upperVals, nInternalFaces * sizeof(double), cudaMemcpyDefault)); - CHECK(cudaMemcpy(valuesTmp + nrows + nInternalFaces, lowerVals, nInternalFaces * sizeof(double), cudaMemcpyDefault)); - if (extNnz > 0) - { - CHECK(cudaMemcpy(valuesTmp + localNnz, extVals, extNnz * sizeof(double), cudaMemcpyDefault)); - } - - // Concat [0, ..., n-1], upperAddr, lowerAddr (note switched) into column indices - int *colIndicesTmp; - CHECK(cudaMalloc(&colIndicesTmp, (totalNnz+GuardSize) * sizeof(int))); - CHECK(cudaMemcpy(colIndicesTmp, rowIndicesTmp, nrows * sizeof(int), cudaMemcpyDefault)); - CHECK(cudaMemcpy(colIndicesTmp + nrows, rowIndicesTmp + nrows + nInternalFaces, nInternalFaces * sizeof(int), cudaMemcpyDefault)); - CHECK(cudaMemcpy(colIndicesTmp + nrows + nInternalFaces, rowIndicesTmp + nrows, nInternalFaces * sizeof(int), cudaMemcpyDefault)); - if (extNnz > 0) - { - CHECK(cudaMemcpy(colIndicesTmp + localNnz, extCol, extNnz * sizeof(int), cudaMemcpyDefault)); - } - - CHECK(cudaFree(rowIndicesTmp)); - - - - // Construct the global column indices - nblocks = localNnz / nthreads + 1; - localToGlobalColIndices<<<nblocks, nthreads>>>(localNnz, nrows, nInternalFaces, diagIndexGlobal, lowOffGlobal, uppOffGlobal, colIndicesTmp); - - - // std::cout << "Print" << std::endl; - // nblocks = 1; - // printPerm<<<nblocks, nthreads>>>( 9, 33, rowOffsets, colIndicesTmp, ldu2csrPerm ); - // std::cout << "Exit" << std::endl; - - - // Allocate space to store the permuted column indices and values - CHECK(cudaMalloc(&colIndices, sizeof(int) * (totalNnz+GuardSize))); - CHECK(cudaMalloc(&values, sizeof(double) * (totalNnz+GuardSize))); - - // Swap column indices based on the pre-determined permutation - nblocks = totalNnz / nthreads + 1; - applyPermutation<<<nblocks, nthreads>>>(totalNnz, ldu2csrPerm, colIndicesTmp, valuesTmp, colIndices, values, false); - CHECK(cudaFree(colIndicesTmp)); - - - int* csr_rowOfs = (int*) malloc( sizeof(int)* (nrows+1) ); - int* csr_rowIdx = (int*) malloc( sizeof(int)* totalNnz ); - int* csr_colIdx = (int*) malloc( sizeof(int)* totalNnz ); - double* csr_values = (double*) malloc( sizeof(double)*totalNnz ); - - CHECK(cudaMemcpy( csr_rowOfs, rowOffsets, (nrows+1) * sizeof(int), cudaMemcpyDeviceToHost)); - CHECK(cudaMemcpy( csr_rowIdx, rowIndices, totalNnz * sizeof(int), cudaMemcpyDeviceToHost)); - CHECK(cudaMemcpy( csr_colIdx, colIndices, totalNnz * sizeof(int), cudaMemcpyDeviceToHost)); - CHECK(cudaMemcpy( csr_values, values, totalNnz * sizeof(double), cudaMemcpyDeviceToHost)); - - if ( diagIndexGlobal > 0 ) - { - FILE* fid = fopen( "P1_CSR_rowOffs.txt", "w" ); - for ( int i =0; i<nrows+1; i++ ) fprintf( fid, "%d\n", csr_rowOfs[i] ); - fclose(fid); - fid = fopen( "P1_CSR_col.txt", "w" ); - for ( int i =0; i<totalNnz; i++ ) fprintf( fid, "%d, %d, %g\n", csr_rowIdx[i], csr_colIdx[i], csr_values[i] ); - fclose(fid); - } - else - { - FILE* fid = fopen( "P0_CSR_rowOffs.txt", "w" ); - for ( int i =0; i<nrows+1; i++ ) fprintf( fid, "%d\n", csr_rowOfs[i] ); - fclose(fid); - fid = fopen( "P0_CSR_col.txt", "w" ); - for ( int i =0; i<totalNnz; i++ ) fprintf( fid, "%d, %d, %g\n", csr_rowIdx[i], csr_colIdx[i], csr_values[i] ); - fclose(fid); - } - - -} - -// XXX Should implement an early abandonment of the -// unnecessary data for capacity optimisation -void FOAM2CSR::discardStructure() -{ -} - -// Deallocate remaining storage -void FOAM2CSR::finalise() -{ - if(rowOffsets != nullptr) - CHECK(cudaFree(rowOffsets)); - if(colIndices != nullptr) - CHECK(cudaFree(colIndices)); - if(values != nullptr) - CHECK(cudaFree(values)); - if(valuesTmp != nullptr) - CHECK(cudaFree(valuesTmp)); - if(ldu2csrPerm != nullptr) - CHECK(cudaFree(ldu2csrPerm)); -} -- GitLab