diff --git a/Makefile b/Makefile index 6c74edcf2fa524a7d36a303e272e5b30b9aba565..5a77bea191ba64617e3837654a2177210ecd7dad 100644 --- a/Makefile +++ b/Makefile @@ -21,11 +21,70 @@ # # CUB is released in cuda 11. Using c++11 or c++14 fixed the ieee128 issue. +.PHONY: clean + +SRCDIR=./src +INCDIR=./inc +OBJDIR=./obj +LIBDIR=./lib NVARCH ?= 70 +RM=rm -rf + +NVCL=nvcc +NVCC=nvcc -c +NVCCINCFLAGS=-I$(INCDIR) -I${CUBROOT} -I${SPECTRUM_MPI_HOME}/include -I${PETSC_INC} -I${AMGX_INC} +NVCCFLAGS=-O0 -g -std=c++14 --compiler-options='-fPIC -std=c++14' -arch=sm_$(NVARCH) + + + +OBJECTS= \ +$(OBJDIR)/AmgXSolver.o \ +$(OBJDIR)/consolidate.o \ +$(OBJDIR)/FOAM2CSR.o \ +$(OBJDIR)/init.o \ +$(OBJDIR)/misc.o \ +$(OBJDIR)/setA.o \ +$(OBJDIR)/solve.o + + + +$(LIBDIR)/libFOAM2CSR.so: $(OBJECTS) + $(NVCL) $(NVCCFLAGS) -L${AMGX_LIB} -lamgx --shared $^ -o $@ + + +$(OBJDIR)/AmgXSolver.o: $(SRCDIR)/AmgXSolver.cpp $(INCDIR)/AmgXSolver.H + $(NVCC) $(NVCCFLAGS) $(NVCCINCFLAGS) $< -o $@ + + +$(OBJDIR)/consolidate.o: $(SRCDIR)/consolidate.cu $(INCDIR)/AmgXSolver.H + $(NVCC) $(NVCCFLAGS) $(NVCCINCFLAGS) $< -o $@ + + +$(OBJDIR)/FOAM2CSR.o: $(SRCDIR)/FOAM2CSR.cu $(INCDIR)/FOAM2CSR.H + $(NVCC) $(NVCCFLAGS) $(NVCCINCFLAGS) $< -o $@ + + +$(OBJDIR)/init.o: $(SRCDIR)/init.cpp $(INCDIR)/AmgXSolver.H + $(NVCC) $(NVCCFLAGS) $(NVCCINCFLAGS) $< -o $@ + + +$(OBJDIR)/misc.o: $(SRCDIR)/misc.cpp $(INCDIR)/AmgXSolver.H + $(NVCC) $(NVCCFLAGS) $(NVCCINCFLAGS) $< -o $@ + + +$(OBJDIR)/setA.o: $(SRCDIR)/setA.cpp $(INCDIR)/AmgXSolver.H + $(NVCC) $(NVCCFLAGS) $(NVCCINCFLAGS) $< -o $@ + + +$(OBJDIR)/solve.o: $(SRCDIR)/solve.cpp $(INCDIR)/AmgXSolver.H + $(NVCC) $(NVCCFLAGS) $(NVCCINCFLAGS) $< -o $@ -all: - nvcc -O3 -I. -I${CUBROOT} -std=c++14 --compiler-options='-fPIC -std=c++14' -arch=sm_${NVARCH} --shared FOAM2CSR.cu -o libFOAM2CSR.so clean: - rm -rf libFOAM2CSR.so + $(RM) $(LIBDIR)/*.so + $(RM) $(OBJDIR)/*.o + +# +# nvcc -O3 -I. -I${CUBROOT} -std=c++14 --compiler-options='-fPIC -std=c++14' -arch=sm_${NVARCH} --shared FOAM2CSR.cu -o libFOAM2CSR.so +# diff --git a/Makefile.orig b/Makefile.orig new file mode 100644 index 0000000000000000000000000000000000000000..6c74edcf2fa524a7d36a303e272e5b30b9aba565 --- /dev/null +++ b/Makefile.orig @@ -0,0 +1,31 @@ +# +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. +# + +# CUB is released in cuda 11. Using c++11 or c++14 fixed the ieee128 issue. + +NVARCH ?= 70 + +all: + nvcc -O3 -I. 
-I${CUBROOT} -std=c++14 --compiler-options='-fPIC -std=c++14' -arch=sm_${NVARCH} --shared FOAM2CSR.cu -o libFOAM2CSR.so + +clean: + rm -rf libFOAM2CSR.so diff --git a/inc/AmgXSolver.H b/inc/AmgXSolver.H new file mode 100644 index 0000000000000000000000000000000000000000..7ac8a56d0a91ac1ebccc75707e18fbffc182536d --- /dev/null +++ b/inc/AmgXSolver.H @@ -0,0 +1,617 @@ +/** + * \file AmgXSolver.H + * \brief Definition of class AmgXSolver. + * \author Pi-Yueh Chuang (pychuang@gwu.edu) + * \date 2015-09-01 + * \copyright Copyright (c) 2015-2019 Pi-Yueh Chuang, Lorena A. Barba. + * \copyright Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * This project is released under MIT License. + */ + + +#ifndef __AMGX_SOLVER_H__ +#define __AMGX_SOLVER_H__ + +// CUDA +#include <cuda_runtime.h> + +// STL +# include <string> +# include <vector> + +// AmgX +# include <amgx_c.h> + +// PETSc +# include <petscmat.h> +# include <petscvec.h> + + +/** \brief A macro to check the returned CUDA error code. + * + * \param call [in] Function call to CUDA API. + */ +# define CHECK(call) \ +{ \ + const cudaError_t error = call; \ + if (error != cudaSuccess) \ + { \ + SETERRQ4(PETSC_COMM_WORLD, PETSC_ERR_SIG, \ + "Error: %s:%d, code:%d, reason: %s\n", \ + __FILE__, __LINE__, error, cudaGetErrorString(error)); \ + } \ +} + + +/** \brief A shorter macro for checking PETSc returned error code. */ +# define CHK CHKERRQ(ierr) + + +/** \brief A wrapper class for coupling PETSc and AmgX. + * + * This class is a wrapper of the AmgX library for PETSc. PETSc users only need to + * pass a PETSc matrix and vectors into an AmgXSolver instance to solve their + * linear systems. The class is designed specifically for the situation where + * the number of MPI processes is more than the number of GPU devices. + * + * Example usage: + * \code + * int main(int argc, char **argv) + * { + * // initialize matrix A, RHS, etc using PETSc + * ... + * + * // create an instance of the solver wrapper + * AmgXSolver solver; + * // initialize the instance with communicator, execution mode, and config file + * solver.initialize(comm, mode, file); + * // set matrix A. Currently it only accepts PETSc AIJ matrices + * solver.setA(A); + * // solve. unks and rhs are PETSc vectors; unks holds the final solution + * solver.solve(unks, rhs); + * // get number of iterations + * int iters; + * solver.getIters(iters); + * // get residual at the last iteration + * double res; + * solver.getResidual(iters, res); + * // finalization + * solver.finalize(); + * + * // other code + * .... + * + * return 0; + * } + * \endcode + */ +class AmgXSolver +{ + public: + + /** \brief Default constructor. */ + AmgXSolver() = default; + + + /** \brief Construct an AmgXSolver instance. + * + * \param comm [in] MPI communicator. + * \param modeStr [in] A string; target mode of AmgX (e.g., dDDI). + * \param cfgFile [in] A string; the path to the AmgX configuration file. + */ + AmgXSolver(const MPI_Comm &comm, + const std::string &modeStr, const std::string &cfgFile); + + + /** \brief Destructor. */ + ~AmgXSolver(); + + + /** \brief Initialize an AmgXSolver instance. + * + * \param comm [in] MPI communicator. + * \param modeStr [in] A string; target mode of AmgX (e.g., dDDI). + * \param cfgFile [in] A string; the path to the AmgX configuration file. + * + * \return PetscErrorCode. + */ + PetscErrorCode initialize(const MPI_Comm &comm, + const std::string &modeStr, const std::string &cfgFile); + + + /** \brief Finalize this instance. 
+ * + * This function destroys AmgX data. When there is more than one + * AmgXSolver instance, the last one destroyed is also in charge of + * destroying the shared resource object and finalizing AmgX. + * + * \return PetscErrorCode. + */ + PetscErrorCode finalize(); + + + /** \brief Set up the matrix used by AmgX. + * + * This function will automatically convert the PETSc matrix to an AmgX matrix. + * If the number of MPI processes is higher than the number of available + * GPU devices, we also redistribute the matrix in this function and + * upload the redistributed one to the GPUs. + * + * Note: currently we can only handle the AIJ/MPIAIJ format. + * + * \param A [in] A PETSc Mat. + * + * \return PetscErrorCode. + */ + PetscErrorCode setA(const Mat &A); + + + /** \brief Set up the matrix used by AmgX. + * + * This function sets up the AmgX matrix from the provided CSR data + * structures and partition data. + * + * \param nGlobalRows [in] The number of global rows. + * \param nLocalRows [in] The number of local rows on this rank. + * \param nLocalNz [in] The total number of non-zero entries locally. + * \param rowOffsets [in] The local CSR matrix row offsets. + * \param colIndicesGlobal [in] The global CSR matrix column indices. + * \param values [in] The local CSR matrix values. + * \param partData [in] Array of length nGlobalRows containing the rank + * id of the owning rank for each row. + * + * \return PetscErrorCode. + */ + PetscErrorCode setA( + const PetscInt nGlobalRows, + const PetscInt nLocalRows, + const PetscInt nLocalNz, + const PetscInt* rowOffsets, + const PetscInt* colIndicesGlobal, + const PetscScalar* values, + const PetscInt* partData); + + + /** \brief Re-sets up an existing AmgX matrix. + * + * Replaces the matrix coefficients with the provided values and performs + * a resetup for the AmgX matrix. + * + * \param nLocalRows [in] The number of local rows on this rank. + * \param nLocalNz [in] The total number of non-zero entries locally. + * \param values [in] The local CSR matrix values. + * + * \return PetscErrorCode. + */ + PetscErrorCode updateA( + const PetscInt nLocalRows, + const PetscInt nLocalNz, + const PetscScalar* values); + + + /** \brief Solve the linear system. + * + * \p p vector will be used as an initial guess and will be updated to the + * solution by the end of solving. + * + * For cases that use more MPI processes than the number of GPUs, this + * function will do data gathering before solving and data scattering + * after solving. + * + * \param p [in, out] A PETSc Vec object representing unknowns. + * \param b [in] A PETSc Vec representing right-hand-side. + * + * \return PetscErrorCode. + */ + PetscErrorCode solve(Vec &p, Vec &b); + + + /** \brief Solve the linear system. + * + * \p p vector will be used as an initial guess and will be updated to the + * solution by the end of solving. + * + * For cases that use more MPI processes than the number of GPUs, this + * function will do data gathering before solving and data scattering + * after solving. + * + * \param p [in, out] The unknown array. + * \param b [in] The RHS array. + * \param nRows [in] The number of rows on this rank. + * + * \return PetscErrorCode. + */ + PetscErrorCode solve(PetscScalar *p, const PetscScalar *b, const int nRows); + + + /** \brief Get the number of iterations of the last solve. + * + * \param iter [out] Number of iterations. + * + * \return PetscErrorCode. 
+ */ + PetscErrorCode getIters(int &iter); + + + /** \brief Get the residual at a specific iteration during the last solve. + * + * \param iter [in] Target iteration. + * \param res [out] Returned residual. + * + * \return PetscErrorCode. + */ + PetscErrorCode getResidual(const int &iter, double &res); + + + private: + + /** \brief Current count of AmgXSolver instances. + * + * This static variable is used to count the number of instances. The + * first instance is responsible for initializing the AmgX library and the + * resource instance. + */ + static int count; + + /** \brief A flag indicating if this instance has been initialized. */ + bool isInitialized = false; + + /** \brief The name of the node that this MPI process belongs to. */ + std::string nodeName; + + + + + /** \brief Number of local GPU devices used by AmgX.*/ + PetscMPIInt nDevs; + + /** \brief The ID of the corresponding GPU device used by this MPI process. */ + PetscMPIInt devID; + + /** \brief A flag indicating if this process will talk to a GPU. */ + PetscMPIInt gpuProc = MPI_UNDEFINED; + + /** \brief A communicator for global world. */ + MPI_Comm globalCpuWorld; + + /** \brief A communicator for local world (i.e., in-node). */ + MPI_Comm localCpuWorld; + + /** \brief A communicator for MPI processes that can talk to GPUs. */ + MPI_Comm gpuWorld; + + /** \brief A communicator for processes sharing the same devices. */ + MPI_Comm devWorld; + + /** \brief Size of \ref AmgXSolver::globalCpuWorld "globalCpuWorld". */ + PetscMPIInt globalSize; + + /** \brief Size of \ref AmgXSolver::localCpuWorld "localCpuWorld". */ + PetscMPIInt localSize; + + /** \brief Size of \ref AmgXSolver::gpuWorld "gpuWorld". */ + PetscMPIInt gpuWorldSize; + + /** \brief Size of \ref AmgXSolver::devWorld "devWorld". */ + PetscMPIInt devWorldSize; + + /** \brief Rank in \ref AmgXSolver::globalCpuWorld "globalCpuWorld". */ + PetscMPIInt myGlobalRank; + + /** \brief Rank in \ref AmgXSolver::localCpuWorld "localCpuWorld". */ + PetscMPIInt myLocalRank; + + /** \brief Rank in \ref AmgXSolver::gpuWorld "gpuWorld". */ + PetscMPIInt myGpuWorldRank; + + /** \brief Rank in \ref AmgXSolver::devWorld "devWorld". */ + PetscMPIInt myDevWorldRank; + + + + + /** \brief A parameter used by AmgX. */ + int ring; + + /** \brief AmgX solver mode. */ + AMGX_Mode mode; + + /** \brief AmgX config object. */ + AMGX_config_handle cfg = nullptr; + + /** \brief AmgX matrix object. */ + AMGX_matrix_handle AmgXA = nullptr; + + /** \brief AmgX vector object representing unknowns. */ + AMGX_vector_handle AmgXP = nullptr; + + /** \brief AmgX vector object representing RHS. */ + AMGX_vector_handle AmgXRHS = nullptr; + + /** \brief AmgX solver object. */ + AMGX_solver_handle solver = nullptr; + + /** \brief AmgX resource object. + * + * Due to the design of the AmgX library, using more than one resource + * instance may cause problems, so we make the resource instance + * a static member to keep only one instance. + */ + static AMGX_resources_handle rsrc; + + /** \brief Initialize consolidation, if required. Allocates space to hold + * consolidated CSR matrix and solution/RHS vectors on the root rank and, + * in the case of device pointer consolidation, allocates and opens IPC memory handles. + * Calculates and stores consolidated sizes and displacements. + * + * \param nLocalRows [in] The number of rows owned by this rank. + * \param nLocalNz [in] The number of non-zeros owned by this rank. + * \param values [in] The values of the CSR matrix A. + * + * \return PetscErrorCode. 
+ */ + PetscErrorCode initializeConsolidation( + const PetscInt nLocalRows, + const PetscInt nLocalNz, + const PetscScalar *values); + + /** \brief Realise consolidation, if required. This copies data from multiple ranks + * that share a single GPU, so a single consolidated CSR matrix can be passed to + * AmgX. As such, users can efficiently execute with more MPI ranks than GPUs, + * potentially improving performance where un-accelerated components of an application + * require CPU resources. CUDA IPC is leveraged to avoid buffering data onto the host, + * rather enabling fast intra-GPU copies if the local CSR matrices are already on the GPU. + * + * \param nLocalRows [in] The number of rows owned by this rank. + * \param nLocalNz [in] The number of non-zeros owned by this rank. + * \param rowOffsets [in] The row offsets of the CSR matrix A. + * \param colIndicesGlobal [in] The global column indices of the CSR matrix A. + * \param values [in] The values of the CSR matrix A. + * + * \return PetscErrorCode. + */ + PetscErrorCode consolidateMatrix( + const PetscInt nLocalRows, + const PetscInt nLocalNz, + const PetscInt *rowOffsets, + const PetscInt *colIndicesGlobal, + const PetscScalar *values); + + /** \brief Re-consolidates the values of the CSR matrix from multiple ranks. + * + * \param nLocalNz [in] The number of non-zeros owned by this rank. + * \param values [in] The values of the CSR matrix A. + * + * \return PetscErrorCode. + */ + PetscErrorCode reconsolidateValues( + const PetscInt nLocalNz, + const PetscScalar *values); + + /** \brief De-allocates consolidated data structures, if any, after the AmgX matrix + * has been constructed and the duplicate data becomes redundant. + * + * \return PetscErrorCode. + */ + PetscErrorCode freeConsStructure(); + + /** \brief De-allocates all consolidated matrix structures. + * + * \return PetscErrorCode. 
+ */ + PetscErrorCode finalizeConsolidation(); + + /** \brief Enumeration for the status of matrix consolidation for the solver.*/ + enum class ConsolidationStatus { + Uninitialized, + None, + Host, + Device + } consolidationStatus; + + /** \brief The number of non-zeros consolidated from multiple ranks to a device.*/ + int nConsNz = 0; + + /** \brief The number of rows consolidated from multiple ranks to a device.*/ + int nConsRows = 0; + + /** \brief The row offsets consolidated onto a single device.*/ + PetscInt *rowOffsetsCons = nullptr; + + /** \brief The global column indices consolidated onto a single device.*/ + PetscInt *colIndicesGlobalCons = nullptr; + + /** \brief The values consolidated onto a single device.*/ + PetscScalar* valuesCons = nullptr; + + /** \brief The solution vector consolidated onto a single device.*/ + PetscScalar* pCons = nullptr; + + /** \brief The RHS vector consolidated onto a single device.*/ + PetscScalar* rhsCons = nullptr; + + /** \brief The number of rows per rank associated with a single device.*/ + std::vector<int> nRowsInDevWorld; + + /** \brief The number of non-zeros per rank associated with a single device.*/ + std::vector<int> nnzInDevWorld; + + /** \brief The row displacements per rank associated with a single device.*/ + std::vector<int> rowDispls; + + /** \brief The non-zero displacements per rank associated with a single device.*/ + std::vector<int> nzDispls; + + /** \brief A VecScatter for gathering/scattering between original PETSc + * Vec and temporary redistributed PETSc Vec.*/ + VecScatter scatterLhs = nullptr; + + /** \brief A VecScatter for gathering/scattering between original PETSc + * Vec and temporary redistributed PETSc Vec.*/ + VecScatter scatterRhs = nullptr; + + /** \brief A temporary PETSc Vec holding redistributed unknowns. */ + Vec redistLhs = nullptr; + + /** \brief A temporary PETSc Vec holding redistributed RHS. */ + Vec redistRhs = nullptr; + + + + + /** \brief Set AmgX solver mode based on the user-provided string. + * + * Available modes are: dDDI, dDFI, dFFI, hDDI, hDFI, hFFI. + * + * \param modeStr [in] a std::string. + * \return PetscErrorCode. + */ + PetscErrorCode setMode(const std::string &modeStr); + + + /** \brief Get the number of GPU devices on this computing node. + * + * \return PetscErrorCode. + */ + PetscErrorCode setDeviceCount(); + + + /** \brief Set the ID of the corresponding GPU used by this process. + * + * \return PetscErrorCode. + */ + PetscErrorCode setDeviceIDs(); + + + /** \brief Initialize all MPI communicators. + * + * The \p comm provided will be duplicated and saved to the + * \ref AmgXSolver::globalCpuWorld "globalCpuWorld". + * + * \param comm [in] Global communicator. + * \return PetscErrorCode. + */ + PetscErrorCode initMPIcomms(const MPI_Comm &comm); + + + /** \brief Perform necessary initialization of AmgX. + * + * This function initializes AmgX for the current instance. Based on + * \ref AmgXSolver::count "count", only the instance initialized first + * is in charge of initializing AmgX and the resource instance. + * + * \param cfgFile [in] Path to AmgX solver configuration file. + * \return PetscErrorCode. + */ + PetscErrorCode initAmgX(const std::string &cfgFile); + + + /** \brief Get IS for the row indices that processes in + * \ref AmgXSolver::gpuWorld "gpuWorld" will hold. + * + * \param A [in] PETSc matrix. + * \param devIS [out] PETSc IS. + * \return PetscErrorCode. 
+ */ + PetscErrorCode getDevIS(const Mat &A, IS &devIS); + + + /** \brief Get local sequential PETSc Mat of the redistributed matrix. + * + * \param A [in] Original PETSc Mat. + * \param devIS [in] PETSc IS representing redistributed row indices. + * \param localA [out] Local sequential redistributed matrix. + * \return PetscErrorCode. + */ + PetscErrorCode getLocalA(const Mat &A, const IS &devIS, Mat &localA); + + + /** \brief Redistribute matrix. + * + * \param A [in] Original PETSc Mat object. + * \param devIS [in] PETSc IS representing redistributed rows. + * \param newA [out] Redistributed matrix. + * \return PetscErrorCode. + */ + PetscErrorCode redistMat(const Mat &A, const IS &devIS, Mat &newA); + + + /** \brief Get \ref AmgXSolver::scatterLhs "scatterLhs" and + * \ref AmgXSolver::scatterRhs "scatterRhs". + * + * \param A1 [in] Original PETSc Mat object. + * \param A2 [in] Redistributed PETSc Mat object. + * \param devIS [in] PETSc IS representing redistributed row indices. + * \return PetscErrorCode. + */ + PetscErrorCode getVecScatter(const Mat &A1, const Mat &A2, const IS &devIS); + + + /** \brief Get data of compressed row layout of local sparse matrix. + * + * \param localA [in] Sequential local redistributed PETSc Mat. + * \param localN [out] Number of local rows. + * \param row [out] Row vector in compressed row layout. + * \param col [out] Column vector in compressed row layout. + * \param data [out] Data vector in compressed row layout. + * \return PetscErrorCode. + */ + PetscErrorCode getLocalMatRawData(const Mat &localA, + PetscInt &localN, std::vector<PetscInt> &row, + std::vector<PetscInt64> &col, std::vector<PetscScalar> &data); + + + /** \brief Destroy the sequential local redistributed matrix. + * + * \param A [in] Original PETSc Mat. + * \param localA [in, out] Local matrix, returning null pointer. + * \return PetscErrorCode. + */ + PetscErrorCode destroyLocalA(const Mat &A, Mat &localA); + + /** \brief Check whether the global matrix distribution is contiguous + * + * If the global matrix is distributed such that contiguous chunks of rows are + * distributed over the individual ranks in ascending rank order, the partition vector + * has trivial structure (i.e. [0, ..., 0, 1, ..., 1, ..., R-1, ..., R-1] for R ranks) and + * its calculation can be skipped since all information is available to AmgX through + * the number of ranks and the partition *offsets* (i.e. how many rows are on each rank). + * + * \param devIS [in] PETSc IS representing redistributed row indices. + * \param isContiguous [out] Whether the global matrix is contiguously distributed. + * \param partOffsets [out] If contiguous, holds the partition offsets for all R ranks + * \return PetscErrorCode. + */ + PetscErrorCode checkForContiguousPartitioning( + const IS &devIS, PetscBool &isContiguous, std::vector<PetscInt> &partOffsets); + + /** \brief Get partition data required by AmgX. + * + * \param devIS [in] PETSc IS representing redistributed row indices. + * \param N [in] Total number of rows in global matrix. + * \param partData [out] Partition data, either explicit vector or offsets. + * \param usesOffsets [out] If PETSC_TRUE, partitioning is contiguous and partData contains + * partition offsets, see checkForContiguousPartitioning(). Otherwise, contains explicit + * partition vector. + * \return PetscErrorCode. 
+ */ + PetscErrorCode getPartData(const IS &devIS, + const PetscInt &N, std::vector<PetscInt> &partData, PetscBool &usesOffsets); + + + /** \brief Function that actually solves the system. + * + * This function won't check if the process is involved in the AmgX solver. + * So if this function is called from processes not in the + * \ref AmgXSolver::gpuWorld "gpuWorld", something bad will happen. + * This function hence won't do data gathering/scattering, either. + * + * \param p [in, out] PETSc Vec for unknowns. + * \param b [in] PETSc Vec for RHS. + * \return PetscErrorCode. + */ + PetscErrorCode solve_real(Vec &p, Vec &b); +}; + +#endif + diff --git a/FOAM2CSR.hpp b/inc/FOAM2CSR.H similarity index 100% rename from FOAM2CSR.hpp rename to inc/FOAM2CSR.H diff --git a/lib/libFOAM2CSR.so b/lib/libFOAM2CSR.so new file mode 100755 index 0000000000000000000000000000000000000000..c656be86c52ef3ca95eaa2da5fd44058ad09a2ec Binary files /dev/null and b/lib/libFOAM2CSR.so differ diff --git a/loadENV.sh b/loadENV.sh new file mode 100644 index 0000000000000000000000000000000000000000..0ba20e18e90dfb6ad5b01aee11e58c7d3164464f --- /dev/null +++ b/loadENV.sh @@ -0,0 +1,24 @@ +#!/bin/bash + +# Load the environment needed to run the job +module purge +module use /m100_scratch/userinternal/mvalent3/modules/opt/modulefiles/profiles +module load local_profile/global + +# Load needed modules +module load gnu/8.4.0 +module load spectrum_mpi/10.3.1--binary +module load cuda/10.2 +module load hpc-sdk/2020--binary +module load openblas/0.3.9--gnu--8.4.0 +module load lapack/3.9.0--gnu--8.4.0 +module load AMGx/2.1.x--spectrum_mpi--10.3.1--binary +module load petsc-sytstem-openblas-hypre-viennacl-ml-kokkos/3.14.0--spectrum_mpi--10.3.1--binary +module load spectrum_mpi/10.3.1--binary + + + +export CUBROOT="${HPC_SDK_HOME}/Linux_ppc64le/2020/cuda/11.0/include" + + + diff --git a/obj/AmgXSolver.o b/obj/AmgXSolver.o new file mode 100644 index 0000000000000000000000000000000000000000..e0be2885222906a06ff32184b0e2037c5866a88d Binary files /dev/null and b/obj/AmgXSolver.o differ diff --git a/obj/FOAM2CSR.o b/obj/FOAM2CSR.o new file mode 100644 index 0000000000000000000000000000000000000000..1b32218a53f19bdc5ff7d13225349f37c934b531 Binary files /dev/null and b/obj/FOAM2CSR.o differ diff --git a/obj/consolidate.o b/obj/consolidate.o new file mode 100644 index 0000000000000000000000000000000000000000..f5903af2347b8fafbc93b758fe86989617e2b1e1 Binary files /dev/null and b/obj/consolidate.o differ diff --git a/obj/init.o b/obj/init.o new file mode 100644 index 0000000000000000000000000000000000000000..5f1e110602f995a95415a394a40e71fafb65e1de Binary files /dev/null and b/obj/init.o differ diff --git a/obj/misc.o b/obj/misc.o new file mode 100644 index 0000000000000000000000000000000000000000..f32ef13cdd1b650d6df20eca979d95c9ae0a25e2 Binary files /dev/null and b/obj/misc.o differ diff --git a/obj/setA.o b/obj/setA.o new file mode 100644 index 0000000000000000000000000000000000000000..d5f6dcf0d0a25de17fe8eb25faea1d8855af1fab Binary files /dev/null and b/obj/setA.o differ diff --git a/obj/solve.o b/obj/solve.o new file mode 100644 index 0000000000000000000000000000000000000000..4761a1be53412ff49bc53dd1534ec2b76dde103d Binary files /dev/null and b/obj/solve.o differ diff --git a/src/.FOAM2CSR.cu.swp b/src/.FOAM2CSR.cu.swp new file mode 100644 index 0000000000000000000000000000000000000000..a0de4277822bfeb80c1defbb320aba7e59822125 Binary files /dev/null and b/src/.FOAM2CSR.cu.swp differ diff --git a/src/AmgXSolver.cpp b/src/AmgXSolver.cpp new 
file mode 100644 index 0000000000000000000000000000000000000000..c437c35390a9838bc7079974628f007570cc7be8 --- /dev/null +++ b/src/AmgXSolver.cpp @@ -0,0 +1,19 @@ +/** + * \file AmgXSolver.cpp + * \brief Definition of member functions of the class AmgXSolver. + * \author Pi-Yueh Chuang (pychuang@gwu.edu) + * \date 2015-09-01 + * \copyright Copyright (c) 2015-2019 Pi-Yueh Chuang, Lorena A. Barba. + * This project is released under MIT License. + */ + + +// AmgXWrapper +# include "AmgXSolver.H" + + +// initialize AmgXSolver::count to 0 +int AmgXSolver::count = 0; + +// initialize AmgXSolver::rsrc to nullptr +AMGX_resources_handle AmgXSolver::rsrc = nullptr; diff --git a/FOAM2CSR.cu b/src/FOAM2CSR.cu similarity index 93% rename from FOAM2CSR.cu rename to src/FOAM2CSR.cu index 5ed889906d7865267c2098d5c05113857a63ba87..dd7b5a65cd67ed355b04e55843889acb1ceac02b 100644 --- a/FOAM2CSR.cu +++ b/src/FOAM2CSR.cu @@ -20,7 +20,7 @@ * DEALINGS IN THE SOFTWARE. */ -#include <FOAM2CSR.hpp> +#include <FOAM2CSR.H> #include <cuda.h> #include <cub/cub.cuh> #include <thrust/scan.h> @@ -83,27 +83,46 @@ __global__ void applyPermutation( // Permute col indices and values for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < totalNnz; i += blockDim.x * gridDim.x) { - int p = perm[i]; - - // In the values only case the column indices and row offsets remain fixed so no update - if (!valuesOnly) + // if ( i<nnz ) { + int p = perm[i]; + + // In the values only case the column indices and row offsets remain fixed so no update + if (!valuesOnly) + { colIndices[i] = colIndicesTmp[p]; - } + } - values[i] = valuesTmp[p]; + values[i] = valuesTmp[p]; + } } } // Flatten the row indices into the row offsets __global__ void createRowOffsets( int nnz, + int nrows, int *rowIndices, int *rowOffsets) { for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < nnz; i += blockDim.x * gridDim.x) { - atomicAdd(&rowOffsets[rowIndices[i]], 1); + if ( i<nnz ) + { + int j = rowIndices[i]; + if ( j<nrows+1 ) + { + atomicAdd(&rowOffsets[j], 1); + } + else + { + printf( " - Inner loop :: %d \n", j ); + } + } + else + { + printf( " - Outer loop :: %d \n", i ); + } } } @@ -203,7 +222,7 @@ void FOAM2CSR::convertLDU2CSR( // Convert the row indices into offsets constexpr int nthreads = 128; int nblocks = totalNnz / nthreads + 1; - createRowOffsets<<<nblocks, nthreads>>>(totalNnz, rowIndices, rowOffsets); + createRowOffsets<<<nblocks, nthreads>>>(totalNnz, nrows, rowIndices, rowOffsets); thrust::exclusive_scan(thrust::device, rowOffsets, rowOffsets + nrows + 1, rowOffsets); CHECK(cudaFree(rowIndices)); diff --git a/src/FOAM2CSR.mirco.cu b/src/FOAM2CSR.mirco.cu new file mode 100644 index 0000000000000000000000000000000000000000..d0ac8f2bd67720ecc2ab6c4b73f7ed446a0bfb85 --- /dev/null +++ b/src/FOAM2CSR.mirco.cu @@ -0,0 +1,515 @@ +/* + * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +#include "FOAM2CSR.H" +#include <cuda.h> +#include <cub/cub.cuh> +#include <thrust/scan.h> +#include <thrust/sequence.h> +#include <thrust/execution_policy.h> +#include <unistd.h> + +#define CHECK(call) \ + { \ + cudaError_t e = call; \ + if (e != cudaSuccess) \ + { \ + printf("Cuda failure: '%s %d %s'", \ + __FILE__, __LINE__, cudaGetErrorString(e)); \ + } \ + } + +__global__ void printPerm( + const int nrows, + const int nnz, + int* rowsIdx, + int* colsIdx, + int* colsPerm ) +{ + + if ( threadIdx.x < nrows ) + { + printf( "rows(%d) = %d\n", threadIdx.x, rowsIdx[threadIdx.x] ); + } + + if ( threadIdx.x < nnz ) + { + printf( "colsPerm(%d) = %d\n", threadIdx.x, colsIdx[colsPerm[threadIdx.x]] ); + } + +} + +__global__ void printData( + const int nrows, + const int nnz, + int* rowsIdx, + int* colsIdx, + double* values ) +{ + + if ( threadIdx.x < nrows ) + { + printf( "rows(%d) = %d\n", threadIdx.x, rowsIdx[threadIdx.x] ); + } + + if ( threadIdx.x < nnz ) + { + printf( "cols(%d) = %d\n", threadIdx.x, colsIdx[threadIdx.x] ); + } + + if ( threadIdx.x < nnz ) + { + printf( "values(%d) = %d, %f\n", threadIdx.x, colsIdx[threadIdx.x], (float)values[threadIdx.x] ); + } + +} + +// Offset the column indices to transform from local to global +__global__ void localToGlobalColIndices( + const int nnz, + const int nrows, + const int nInternalFaces, + const int diagIndexGlobal, + const int lowOffGlobal, + const int uppOffGlobal, + int *colIndices) +{ + // Offset by global offset + for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < nnz; i += blockDim.x * gridDim.x) + { + int offset; + if ( i < nnz ) + { + // Depending upon the row, different offsets must be applied + if (i < nrows) + { + offset = diagIndexGlobal; + } + else if (i < nrows + nInternalFaces) + { + offset = uppOffGlobal; + } + else + { + offset = lowOffGlobal; + } + colIndices[i] += offset; + } + } +} + + + +// Offset the row indices to transform from local to global +__global__ void localToGlobalRowIndices( + const int nnz, + const int nrows, + const int nInternalFaces, + const int diagIndexGlobal, + const int lowOffGlobal, + const int uppOffGlobal, + int *rowIndices) +{ + // Offset by global offset + for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < nnz; i += blockDim.x * gridDim.x) + { + int offset; + if ( i < nnz ) + { + // Depending upon the row, different offsets must be applied + if (i < nrows) + { + offset = diagIndexGlobal; + } + else if (i < nrows + nInternalFaces) + { + offset = uppOffGlobal; + } + else + { + offset = lowOffGlobal; + } + + rowIndices[i] += offset - diagIndexGlobal; + } + } +} + + + +// Apply the pre-existing permutation to the values [and columns] +__global__ void applyPermutation( + const int totalNnz, + const int *perm, + const int *colIndicesTmp, + const double *valuesTmp, + int *colIndices, + double *values, + bool valuesOnly) +{ + // Permute col indices and values + for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < totalNnz; i += blockDim.x * gridDim.x) + { + + if ( i < totalNnz ) + { + int p = 
perm[i]; + + // In the values only case the column indices and row offsets remain fixed so no update + if ( p< totalNnz ) + { + if (!valuesOnly) + { + colIndices[i] = colIndicesTmp[p]; + } + if ( p < totalNnz ) + { + values[i] = valuesTmp[p]; + } + } + } + } +} + +// Flatten the row indices into the row offsets +// >>> constexpr int nthreads = 128; +// >>> int nblocks = totalNnz / nthreads + 1; +// >>> createRowOffsets<<<nblocks, nthreads>>>(totalNnz, nrows+1, rowIndices, rowOffsets); +__global__ void createRowOffsets( + int nnz, + int nrows, + int *rowIndices, + int *rowOffsets) +{ + +/* + for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < nnz; i += blockDim.x * gridDim.x) + { + int k = rowIndices[i]; + if ( k >= nrows ) { + printf( "Row index exceeds the number of rows\n" ); + printf( "Memory Access :: %d, %d, %d, %d,%d, %d, %d, %d, %d\n", i, k, nnz, nrows, threadIdx.x, blockIdx.x, blockDim.x, gridDim.x, rowIndices[i] ); + } + + if ( i >= nnz || i == 0 ) { + printf( "Memory Access :: %d, %d, %d, %d,%d, %d, %d, %d, %d\n", i, k, nnz, nrows, threadIdx.x, blockIdx.x, blockDim.x, gridDim.x, rowIndices[i] ); + } + atomicAdd(&rowOffsets[k], 1); + } +*/ + + for ( int i = threadIdx.x + blockIdx.x * blockDim.x; i<nnz; i += blockDim.x * gridDim.x) + { + if ( i < nnz ) + { + int k = rowIndices[i]; + if ( k < nrows+1 ) + { + atomicAdd(&rowOffsets[k], 1); + } + } + } + + +} + +// Updates the values based on the previously determined permutation +void FOAM2CSR::updateValues( + const int nrows, + const int nInternalFaces, + const int extNnz, + const double *diagVal, + const double *uppVal, + const double *lowVal, + const double *extVal) +{ + + // Determine the local non-zeros from the internal faces + int localNnz = nrows + 2 * nInternalFaces; + + // Add external non-zeros (communicated halo entries) + int totalNnz = localNnz + extNnz; + + // Copy the values in [ diag, upper, lower, (external) ] + CHECK(cudaMemcpy(valuesTmp, diagVal, sizeof(double) * nrows, cudaMemcpyDefault)); + CHECK(cudaMemcpy(valuesTmp + nrows, uppVal, sizeof(double) * nInternalFaces, cudaMemcpyDefault)); + CHECK(cudaMemcpy(valuesTmp + nrows + nInternalFaces, lowVal, sizeof(double) * nInternalFaces, cudaMemcpyDefault)); + if (extNnz > 0) + { + CHECK(cudaMemcpy(valuesTmp + localNnz, extVal, sizeof(double) * extNnz, cudaMemcpyDefault)); + } + + constexpr int nthreads = 128; + int nblocks = totalNnz / nthreads + 1; + applyPermutation<<<nblocks, nthreads>>>(totalNnz, ldu2csrPerm, nullptr, valuesTmp, nullptr, values, true); + CHECK(cudaStreamSynchronize(0)); +} + +// Perform the conversion between an LDU matrix and a CSR matrix, possibly distributed +void FOAM2CSR::convertLDU2CSR( + int nrows, + int nInternalFaces, + int diagIndexGlobal, + int lowOffGlobal, + int uppOffGlobal, + const int *upperAddr, + const int *lowerAddr, + const int extNnz, + const int *extRow, + const int *extCol, + const double *diagVals, + const double *upperVals, + const double *lowerVals, + const double *extVals) +{ + + + // Determine the local non-zeros from the internal faces + int localNnz = nrows + 2 * nInternalFaces; + + // Add external non-zeros (communicated halo entries) + int totalNnz = localNnz + extNnz; + + + if ( diagIndexGlobal > 0 ) + { + sleep(2); + std::cout << " ============================================================== " << std::endl; + std::cout << "P1 :: nrows :: " << nrows << std::endl; + std::cout << "P1 :: nInternalFaces :: " << nInternalFaces << std::endl; + std::cout << "P1 :: extNnz :: " << extNnz << std::endl; + std::cout << "P1 :: diag 
index global :: " << diagIndexGlobal << std::endl; + std::cout << "P1 :: low offset global :: " << lowOffGlobal << std::endl; + std::cout << "P1 :: upp offset global :: " << uppOffGlobal << std::endl; + std::cout << "P1 :: upperaddr0 :: " << upperAddr[0] << std::endl; + std::cout << "P1 :: loweraddr0 :: " << lowerAddr[0] << std::endl; + if ( extNnz > 0 ){ + std::cout << "P1 :: extRow :: " << extRow[0] << std::endl; + std::cout << "P1 :: extCol :: " << extCol[0] << std::endl; + } + FILE* fid = fopen( "P1_descr.txt", "w" ); + fprintf( fid, "%d, %d, %d, %d, %d, %d\n", nrows, nInternalFaces, diagIndexGlobal, lowOffGlobal, uppOffGlobal, extNnz ); + fclose(fid); + + fid = fopen( "P1_LDU_diag.txt", "w" ); + for ( int i =0; i<nrows; i++ ) fprintf( fid, "%g\n", diagVals[i] ); + fclose(fid); + fid = fopen( "P1_LDU_sym.txt", "w" ); + for ( int i =0; i<nInternalFaces; i++ ) fprintf( fid, "%d, %d, %g, %g\n", + upperAddr[i], lowerAddr[i], upperVals[i], lowerVals[i] ); + fclose(fid); + if ( extNnz > 0 ){ + fid = fopen( "P1_LDU_ext.txt", "w" ); + for ( int i =0; i<extNnz; i++ ) fprintf( fid, "%d, %d, %g\n", + extRow[i], extCol[i], extVals[i] ); + fclose(fid); + } + std::cout << " ============================================================== " << std::endl; + } + else + { + std::cout << " ============================================================== " << std::endl; + std::cout << "P0 :: nrows :: " << nrows << std::endl; + std::cout << "P0 :: nInternalFaces :: " << nInternalFaces << std::endl; + std::cout << "P0 :: extNnz :: " << extNnz << std::endl; + std::cout << "P0 :: diag index global :: " << diagIndexGlobal << std::endl; + std::cout << "P0 :: low offset global :: " << lowOffGlobal << std::endl; + std::cout << "P0 :: upp offset global :: " << uppOffGlobal << std::endl; + std::cout << "P0 :: upperaddr0 :: " << upperAddr[0] << std::endl; + std::cout << "P0 :: loweraddr0 :: " << lowerAddr[0] << std::endl; + if ( extNnz > 0 ) + { + std::cout << "P0 :: extRow :: " << extRow[0] << std::endl; + std::cout << "P0 :: extCol :: " << extCol[0] << std::endl; + } + FILE* fid = fopen( "P0_descr.txt", "w" ); + fprintf( fid, "%d, %d, %d, %d, %d, %d\n", nrows, nInternalFaces, diagIndexGlobal, lowOffGlobal, uppOffGlobal, extNnz ); + fclose(fid); + fid = fopen( "P0_LDU_diag.txt", "w" ); + for ( int i =0; i<nrows; i++ ) fprintf( fid, "%g\n", diagVals[i] ); + fclose(fid); + fid = fopen( "P0_LDU_sym.txt", "w" ); + for ( int i =0; i<nInternalFaces; i++ ) fprintf( fid, "%d, %d, %g, %g\n", + upperAddr[i], lowerAddr[i], upperVals[i], lowerVals[i] ); + fclose(fid); + if ( extNnz > 0 ){ + fid = fopen( "P0_LDU_ext.txt", "w" ); + for ( int i =0; i<extNnz; i++ ) fprintf( fid, "%d, %d, %g\n", + extRow[i], extCol[i], extVals[i] ); + fclose(fid); + } + sleep(3); + } + + + // Generate unpermuted index list [0, ..., totalNnz-1] + int GuardSize=1000000; + int *permTmp; + CHECK(cudaMalloc(&permTmp, sizeof(int) * (totalNnz+GuardSize))); + thrust::sequence(thrust::device, permTmp, permTmp + totalNnz, 0); + + // Fill rowIndicesTmp with [0, ..., n-1], lowerAddr, upperAddr, (extAddr) + int *rowIndicesTmp; + CHECK(cudaMalloc(&rowIndicesTmp, sizeof(int) * (totalNnz+GuardSize))); + CHECK(cudaMemcpy(rowIndicesTmp, permTmp, nrows * sizeof(int), cudaMemcpyDefault)); + CHECK(cudaMemcpy(rowIndicesTmp + nrows, lowerAddr, nInternalFaces * sizeof(int), cudaMemcpyDefault)); + CHECK(cudaMemcpy(rowIndicesTmp + nrows + nInternalFaces, upperAddr, nInternalFaces * sizeof(int), cudaMemcpyDefault)); + if (extNnz > 0) + { + CHECK(cudaMemcpy(rowIndicesTmp + 
localNnz, extRow, extNnz * sizeof(int), cudaMemcpyDefault)); + } + constexpr int nthreads = 128; + int nblocks = localNnz / nthreads + 1; + // localToGlobalRowIndices<<<nblocks, nthreads>>>(localNnz, nrows, nInternalFaces, diagIndexGlobal, lowOffGlobal, uppOffGlobal, rowIndicesTmp); + + // Make space for the row indices and stored permutation + int *rowIndices; + CHECK(cudaMalloc(&rowIndices, sizeof(int) * (totalNnz+GuardSize))); + CHECK(cudaMalloc(&ldu2csrPerm, sizeof(int) * (totalNnz+GuardSize))); + + // Sort the row indices and store results in the permutation + void *tempStorage = NULL; + size_t tempStorageBytes = 0; + cub::DeviceRadixSort::SortPairs(tempStorage, tempStorageBytes, rowIndicesTmp, rowIndices, permTmp, ldu2csrPerm, totalNnz); + CHECK(cudaMalloc(&tempStorage, tempStorageBytes)); + + + cub::DeviceRadixSort::SortPairs(tempStorage, tempStorageBytes, rowIndicesTmp, rowIndices, permTmp, ldu2csrPerm, totalNnz); + CHECK(cudaFree(permTmp)); + CHECK(cudaFree(tempStorage)); + + // Make space for the row offsets + CHECK(cudaMalloc(&rowOffsets, sizeof(int) * (nrows + 1 + GuardSize))); + CHECK(cudaMemset(rowOffsets, 0, sizeof(int) * (nrows + 1 + GuardSize))); + + // XXX Taking the non zero per row data from the host could be more + // efficient, experiment with this in the future + //cudaMemcpy(rowOffsets, nz_per_row, sizeof(int) * nrows, cudaMemcpyDefault); + //thrust::exclusive_scan(thrust::device, rowOffsets, rowOffsets + nrows + 1, rowOffsets); + + // Convert the row indices into offsets + nblocks = totalNnz / nthreads + 1; + createRowOffsets<<<nblocks, nthreads>>>(totalNnz, nrows+1, rowIndices, rowOffsets); + thrust::exclusive_scan(thrust::device, rowOffsets, rowOffsets + nrows + 1, rowOffsets); + // CHECK(cudaFree(rowIndices)); + // printf( "%d - %d \n", rowOffsets[nrows], totalNnz ); + + + + // Fill valuesTmp with diagVals, upperVals, lowerVals, (extVals) + CHECK(cudaMalloc(&valuesTmp, (totalNnz+GuardSize) * sizeof(double))); + CHECK(cudaMemcpy(valuesTmp, diagVals, nrows * sizeof(double), cudaMemcpyDefault)); + CHECK(cudaMemcpy(valuesTmp + nrows, upperVals, nInternalFaces * sizeof(double), cudaMemcpyDefault)); + CHECK(cudaMemcpy(valuesTmp + nrows + nInternalFaces, lowerVals, nInternalFaces * sizeof(double), cudaMemcpyDefault)); + if (extNnz > 0) + { + CHECK(cudaMemcpy(valuesTmp + localNnz, extVals, extNnz * sizeof(double), cudaMemcpyDefault)); + } + + // Concat [0, ..., n-1], upperAddr, lowerAddr (note switched) into column indices + int *colIndicesTmp; + CHECK(cudaMalloc(&colIndicesTmp, (totalNnz+GuardSize) * sizeof(int))); + CHECK(cudaMemcpy(colIndicesTmp, rowIndicesTmp, nrows * sizeof(int), cudaMemcpyDefault)); + CHECK(cudaMemcpy(colIndicesTmp + nrows, rowIndicesTmp + nrows + nInternalFaces, nInternalFaces * sizeof(int), cudaMemcpyDefault)); + CHECK(cudaMemcpy(colIndicesTmp + nrows + nInternalFaces, rowIndicesTmp + nrows, nInternalFaces * sizeof(int), cudaMemcpyDefault)); + if (extNnz > 0) + { + CHECK(cudaMemcpy(colIndicesTmp + localNnz, extCol, extNnz * sizeof(int), cudaMemcpyDefault)); + } + + CHECK(cudaFree(rowIndicesTmp)); + + + + // Construct the global column indices + nblocks = localNnz / nthreads + 1; + localToGlobalColIndices<<<nblocks, nthreads>>>(localNnz, nrows, nInternalFaces, diagIndexGlobal, lowOffGlobal, uppOffGlobal, colIndicesTmp); + + + // std::cout << "Print" << std::endl; + // nblocks = 1; + // printPerm<<<nblocks, nthreads>>>( 9, 33, rowOffsets, colIndicesTmp, ldu2csrPerm ); + // std::cout << "Exit" << std::endl; + + + // Allocate space to store the 
permuted column indices and values + CHECK(cudaMalloc(&colIndices, sizeof(int) * (totalNnz+GuardSize))); + CHECK(cudaMalloc(&values, sizeof(double) * (totalNnz+GuardSize))); + + // Swap column indices based on the pre-determined permutation + nblocks = totalNnz / nthreads + 1; + applyPermutation<<<nblocks, nthreads>>>(totalNnz, ldu2csrPerm, colIndicesTmp, valuesTmp, colIndices, values, false); + CHECK(cudaFree(colIndicesTmp)); + + + int* csr_rowOfs = (int*) malloc( sizeof(int)* (nrows+1) ); + int* csr_rowIdx = (int*) malloc( sizeof(int)* totalNnz ); + int* csr_colIdx = (int*) malloc( sizeof(int)* totalNnz ); + double* csr_values = (double*) malloc( sizeof(double)*totalNnz ); + + CHECK(cudaMemcpy( csr_rowOfs, rowOffsets, (nrows+1) * sizeof(int), cudaMemcpyDeviceToHost)); + CHECK(cudaMemcpy( csr_rowIdx, rowIndices, totalNnz * sizeof(int), cudaMemcpyDeviceToHost)); + CHECK(cudaMemcpy( csr_colIdx, colIndices, totalNnz * sizeof(int), cudaMemcpyDeviceToHost)); + CHECK(cudaMemcpy( csr_values, values, totalNnz * sizeof(double), cudaMemcpyDeviceToHost)); + + if ( diagIndexGlobal > 0 ) + { + FILE* fid = fopen( "P1_CSR_rowOffs.txt", "w" ); + for ( int i =0; i<nrows+1; i++ ) fprintf( fid, "%d\n", csr_rowOfs[i] ); + fclose(fid); + fid = fopen( "P1_CSR_col.txt", "w" ); + for ( int i =0; i<totalNnz; i++ ) fprintf( fid, "%d, %d, %g\n", csr_rowIdx[i], csr_colIdx[i], csr_values[i] ); + fclose(fid); + } + else + { + FILE* fid = fopen( "P0_CSR_rowOffs.txt", "w" ); + for ( int i =0; i<nrows+1; i++ ) fprintf( fid, "%d\n", csr_rowOfs[i] ); + fclose(fid); + fid = fopen( "P0_CSR_col.txt", "w" ); + for ( int i =0; i<totalNnz; i++ ) fprintf( fid, "%d, %d, %g\n", csr_rowIdx[i], csr_colIdx[i], csr_values[i] ); + fclose(fid); + } + + +} + +// XXX Should implement an early abandonment of the +// unnecessary data for capacity optimisation +void FOAM2CSR::discardStructure() +{ +} + +// Deallocate remaining storage +void FOAM2CSR::finalise() +{ + if(rowOffsets != nullptr) + CHECK(cudaFree(rowOffsets)); + if(colIndices != nullptr) + CHECK(cudaFree(colIndices)); + if(values != nullptr) + CHECK(cudaFree(values)); + if(valuesTmp != nullptr) + CHECK(cudaFree(valuesTmp)); + if(ldu2csrPerm != nullptr) + CHECK(cudaFree(ldu2csrPerm)); +} diff --git a/src/consolidate.cu b/src/consolidate.cu new file mode 100644 index 0000000000000000000000000000000000000000..853a965d02329d5e9a3009bdfa686356002e23dc --- /dev/null +++ b/src/consolidate.cu @@ -0,0 +1,428 @@ +/* + * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + * \file consolidate.cu + * \brief Definition of member functions related to matrix consolidation. + * \author Matt Martineau (mmartineau@nvidia.com) + * \date 2020-07-31 + */ + +#include <AmgXSolver.H> + +#include <numeric> + +/* + Changes local row offsets to describe the consolidated row space on + the root rank. +*/ +__global__ void fixConsolidatedRowOffsets(int nLocalRows, int offset, int* rowOffsets) +{ + for(int i = threadIdx.x + blockIdx.x*blockDim.x; i < nLocalRows; i += blockDim.x*gridDim.x) + { + rowOffsets[i] += offset; + } +} + +// A set of handles to the device data storing a consolidated CSR matrix +struct ConsolidationHandles +{ + cudaIpcMemHandle_t rhsConsHandle; + cudaIpcMemHandle_t solConsHandle; + cudaIpcMemHandle_t rowOffsetsConsHandle; + cudaIpcMemHandle_t colIndicesConsHandle; + cudaIpcMemHandle_t valuesConsHandle; +}; + +/* \implements AmgXSolver::initializeConsolidation */ +PetscErrorCode AmgXSolver::initializeConsolidation( + const PetscInt nLocalRows, + const PetscInt nLocalNz, + const PetscScalar* values) +{ + PetscFunctionBeginUser; + + // Check if multiple ranks are associated with a device + if (devWorldSize == 1) + { + consolidationStatus = ConsolidationStatus::None; + PetscFunctionReturn(0); + } + + nRowsInDevWorld.resize(devWorldSize); + nnzInDevWorld.resize(devWorldSize); + rowDispls.resize(devWorldSize+1, 0); + nzDispls.resize(devWorldSize+1, 0); + + // Gather to all ranks the number of local rows on each rank + MPI_Request req[2]; + int ierr = MPI_Iallgather(&nLocalRows, 1, MPI_INT, nRowsInDevWorld.data(), 1, MPI_INT, devWorld, &req[0]); CHK; + + // Gather to all ranks the number of non-zeros on each rank + ierr = MPI_Iallgather(&nLocalNz, 1, MPI_INT, nnzInDevWorld.data(), 1, MPI_INT, devWorld, &req[1]); CHK; + MPI_Waitall(2, req, MPI_STATUSES_IGNORE); + + // Calculate the consolidated number of rows and non-zeros, and the row and non-zero displacements + nConsNz = std::accumulate(nnzInDevWorld.begin(), nnzInDevWorld.end(), 0); + nConsRows = std::accumulate(nRowsInDevWorld.begin(), nRowsInDevWorld.end(), 0); + std::partial_sum(nRowsInDevWorld.begin(), nRowsInDevWorld.end(), rowDispls.begin()+1); + std::partial_sum(nnzInDevWorld.begin(), nnzInDevWorld.end(), nzDispls.begin()+1); + + // Consolidate the CSR matrix data from multiple ranks sharing a single GPU to a + // root rank, allowing multiple ranks per GPU. This allows overdecomposing the problem + // when there are more CPU cores than GPUs, without the inefficiencies of performing + // the linear solve on multiple separate domains. + // If the data is a device pointer then use IPC handles to perform the intra-GPU + // copies from the allocations of different processes operating on the same GPU. + // Opening the handles as an initialization step means it is not necessary to + // repeatedly call cudaIpcOpenMemHandle, which can be expensive. 
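+ // Editorial note, a hedged reading of the check below (not upstream text): + // cudaPointerGetAttributes reports where `values` was allocated. The branch + // treats it as a device pointer only when the call does not fail with + // cudaErrorInvalidValue and the reported type is cudaMemoryTypeDevice; + // otherwise the host-side (MPI gather) consolidation path further down is taken. 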
+ cudaPointerAttributes att; + cudaError_t err = cudaPointerGetAttributes(&att, values); + if (err != cudaErrorInvalidValue && att.type == cudaMemoryTypeDevice) + { + ConsolidationHandles handles; + // The data is already on the GPU so consolidate there + if (gpuProc == 0) + { + // We are consolidating data that already exists on the GPU + CHECK(cudaMalloc((void**)&rhsCons, sizeof(PetscScalar) * nConsRows)); + CHECK(cudaMalloc((void**)&pCons, sizeof(PetscScalar) * nConsRows)); + CHECK(cudaMalloc((void**)&rowOffsetsCons, sizeof(PetscInt) * (nConsRows+1))); + CHECK(cudaMalloc((void**)&colIndicesGlobalCons, sizeof(PetscInt) * nConsNz)); + CHECK(cudaMalloc((void**)&valuesCons, sizeof(PetscScalar) * nConsNz)); + + CHECK(cudaIpcGetMemHandle(&handles.rhsConsHandle, rhsCons)); + CHECK(cudaIpcGetMemHandle(&handles.solConsHandle, pCons)); + CHECK(cudaIpcGetMemHandle(&handles.rowOffsetsConsHandle, rowOffsetsCons)); + CHECK(cudaIpcGetMemHandle(&handles.colIndicesConsHandle, colIndicesGlobalCons)); + CHECK(cudaIpcGetMemHandle(&handles.valuesConsHandle, valuesCons)); + } + + MPI_Bcast(&handles, sizeof(ConsolidationHandles), MPI_BYTE, 0, devWorld); + + if(gpuProc == MPI_UNDEFINED) + { + CHECK(cudaIpcOpenMemHandle((void**)&rhsCons, handles.rhsConsHandle, cudaIpcMemLazyEnablePeerAccess)); + CHECK(cudaIpcOpenMemHandle((void**)&pCons, handles.solConsHandle, cudaIpcMemLazyEnablePeerAccess)); + CHECK(cudaIpcOpenMemHandle((void**)&rowOffsetsCons, handles.rowOffsetsConsHandle, cudaIpcMemLazyEnablePeerAccess)); + CHECK(cudaIpcOpenMemHandle((void**)&colIndicesGlobalCons, handles.colIndicesConsHandle, cudaIpcMemLazyEnablePeerAccess)); + CHECK(cudaIpcOpenMemHandle((void**)&valuesCons, handles.valuesConsHandle, cudaIpcMemLazyEnablePeerAccess)); + } + + consolidationStatus = ConsolidationStatus::Device; + } + else + { + if (gpuProc == 0) + { + // The data is already on the CPU so consolidate there + rowOffsetsCons = new PetscInt[nConsRows+1]; + colIndicesGlobalCons = new PetscInt[nConsNz]; + valuesCons = new PetscScalar[nConsNz]; + rhsCons = new PetscScalar[nConsRows]; + pCons = new PetscScalar[nConsRows]; + } + + consolidationStatus = ConsolidationStatus::Host; + } + + PetscFunctionReturn(0); +} + +/* \implements AmgXSolver::consolidateMatrix */ +PetscErrorCode AmgXSolver::consolidateMatrix( + const PetscInt nLocalRows, + const PetscInt nLocalNz, + const PetscInt* rowOffsets, + const PetscInt* colIndicesGlobal, + const PetscScalar* values) +{ + PetscFunctionBeginUser; + + // Consolidation has been previously used, must deallocate the structures + if (consolidationStatus != ConsolidationStatus::Uninitialized) + { + // XXX Would the maintainers be happy to include a warning message here, + // that makes it clear updateA should be preferentially adopted by developers + // if the sparsity pattern does not change? This would avoid many costs, + // including re-consolidation costs. 
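+ // A minimal sketch of the warning suggested above (editorial illustration, + // hypothetical and not part of the upstream API, hence left commented out): + // PetscPrintf(globalCpuWorld, + // "Warning: re-consolidating the matrix; prefer updateA() when the " + // "sparsity pattern is unchanged to avoid re-consolidation costs.\n"); 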
+ + finalizeConsolidation(); + } + + // Allocate space for the structures required to consolidate + initializeConsolidation(nLocalRows, nLocalNz, values); + + switch(consolidationStatus) + { + + case ConsolidationStatus::None: + { + // Consolidation is not required + PetscFunctionReturn(0); + } + case ConsolidationStatus::Uninitialized: + { + SETERRQ(MPI_COMM_WORLD, PETSC_ERR_SUP_SYS, + "Attempting to consolidate before consolidation is initialized.\n"); + break; + } + case ConsolidationStatus::Device: + { + // Copy the data to the consolidation buffer + CHECK(cudaMemcpy(&rowOffsetsCons[rowDispls[myDevWorldRank]], rowOffsets, sizeof(PetscInt) * nLocalRows, cudaMemcpyDefault)); + CHECK(cudaMemcpy(&colIndicesGlobalCons[nzDispls[myDevWorldRank]], colIndicesGlobal, sizeof(PetscInt) * nLocalNz, cudaMemcpyDefault)); + CHECK(cudaMemcpy(&valuesCons[nzDispls[myDevWorldRank]], values, sizeof(PetscScalar) * nLocalNz, cudaMemcpyDefault)); + + // cudaMemcpy does not block the host for the device-to-device copies above, + // so synchronize with the device to ensure the operation is complete. Barrier on all devWorld + // ranks to ensure full arrays are populated before the root process uses the data. + CHECK(cudaDeviceSynchronize()); + int ierr = MPI_Barrier(devWorld); CHK; + + if (gpuProc == 0) + { + // Adjust merged row offsets so that they are correct for the consolidated matrix + for (int i = 1; i < devWorldSize; ++i) + { + int nthreads = 128; + int nblocks = nRowsInDevWorld[i] / nthreads + 1; + fixConsolidatedRowOffsets<<<nblocks, nthreads>>>(nRowsInDevWorld[i], nzDispls[i], &rowOffsetsCons[rowDispls[i]]); + } + + // Manually add the last entry of the rowOffsets list, which is the + // number of non-zeros in the CSR matrix + CHECK(cudaMemcpy(&rowOffsetsCons[nConsRows], &nConsNz, sizeof(int), cudaMemcpyDefault)); + } + else + { + // Close IPC handles and deallocate for consolidation + CHECK(cudaIpcCloseMemHandle(rowOffsetsCons)); + CHECK(cudaIpcCloseMemHandle(colIndicesGlobalCons)); + } + + CHECK(cudaDeviceSynchronize()); + break; + } + case ConsolidationStatus::Host: + { + // Gather the matrix data to the root rank for consolidation + MPI_Request req[3]; + int ierr = MPI_Igatherv(rowOffsets, nLocalRows, MPI_INT, rowOffsetsCons, nRowsInDevWorld.data(), rowDispls.data(), MPI_INT, 0, devWorld, &req[0]); CHK; + ierr = MPI_Igatherv(colIndicesGlobal, nLocalNz, MPI_INT, colIndicesGlobalCons, nnzInDevWorld.data(), nzDispls.data(), MPI_INT, 0, devWorld, &req[1]); CHK; + ierr = MPI_Igatherv(values, nLocalNz, MPI_DOUBLE, valuesCons, nnzInDevWorld.data(), nzDispls.data(), MPI_DOUBLE, 0, devWorld, &req[2]); CHK; + MPI_Waitall(3, req, MPI_STATUSES_IGNORE); + + if (gpuProc == 0) + { + // Adjust merged row offsets so that they are correct for the consolidated matrix + for (int j = 1; j < devWorldSize; ++j) + { + for(int i = 0; i < nRowsInDevWorld[j]; ++i) + { + rowOffsetsCons[rowDispls[j] + i] += nzDispls[j]; + } + } + + // Manually add the last entry of the rowOffsets list, which is the + // number of non-zeros in the CSR matrix + rowOffsetsCons[nConsRows] = nConsNz; + } + + break; + } + default: + { + SETERRQ(MPI_COMM_WORLD, PETSC_ERR_SUP_SYS, + "Incorrect consolidation status set.\n"); + break; + } + + } + + PetscFunctionReturn(0); +} + +/* \implements AmgXSolver::reconsolidateValues */ +PetscErrorCode AmgXSolver::reconsolidateValues( + const PetscInt nLocalNz, + const PetscScalar* values) +{ + PetscFunctionBeginUser; + + switch (consolidationStatus) + { + + case ConsolidationStatus::None: + { + // Consolidation is 
not required + PetscFunctionReturn(0); + } + case ConsolidationStatus::Uninitialized: + { + SETERRQ(MPI_COMM_WORLD, PETSC_ERR_SUP_SYS, + "Attempting to re-consolidate before consolidation is initialized.\n"); + break; + } + case ConsolidationStatus::Device: + { + CHECK(cudaDeviceSynchronize()); + int ierr = MPI_Barrier(devWorld); CHK; + + // The data is already on the GPU so consolidate there + CHECK(cudaMemcpy(&valuesCons[nzDispls[myDevWorldRank]], values, sizeof(PetscScalar) * nLocalNz, cudaMemcpyDefault)); + + CHECK(cudaDeviceSynchronize()); + ierr = MPI_Barrier(devWorld); CHK; + + break; + } + case ConsolidationStatus::Host: + { + // Gather the matrix values to the root rank for consolidation + int ierr = MPI_Gatherv(values, nLocalNz, MPI_DOUBLE, valuesCons, nnzInDevWorld.data(), nzDispls.data(), MPI_DOUBLE, 0, devWorld); CHK; + break; + } + default: + { + SETERRQ(MPI_COMM_WORLD, PETSC_ERR_SUP_SYS, + "Incorrect consolidation status set.\n"); + break; + } + + } + + PetscFunctionReturn(0); +} + +/* \implements AmgXSolver::freeConsStructure */ +PetscErrorCode AmgXSolver::freeConsStructure() +{ + PetscFunctionBeginUser; + + // Only the root rank maintains a consolidated structure + if(gpuProc == MPI_UNDEFINED) + { + PetscFunctionReturn(0); + } + + switch(consolidationStatus) + { + + case ConsolidationStatus::None: + { + // Consolidation is not required + PetscFunctionReturn(0); + } + case ConsolidationStatus::Uninitialized: + { + SETERRQ(MPI_COMM_WORLD, PETSC_ERR_SUP_SYS, + "Attempting to free consolidation structures before consolidation is initialized.\n"); + break; + } + case ConsolidationStatus::Device: + { + // Free the device allocated consolidated CSR matrix structure + CHECK(cudaFree(rowOffsetsCons)); + CHECK(cudaFree(colIndicesGlobalCons)); + break; + } + case ConsolidationStatus::Host: + { + // Free the host allocated consolidated CSR matrix structure + delete[] rowOffsetsCons; + delete[] colIndicesGlobalCons; + break; + } + default: + { + SETERRQ(MPI_COMM_WORLD, PETSC_ERR_SUP_SYS, + "Incorrect consolidation status set.\n"); + break; + } + + } + + PetscFunctionReturn(0); +} + +/* \implements AmgXSolver::finalizeConsolidation */ +PetscErrorCode AmgXSolver::finalizeConsolidation() +{ + PetscFunctionBeginUser; + + switch(consolidationStatus) + { + + case ConsolidationStatus::None: + case ConsolidationStatus::Uninitialized: + { + // Consolidation is not required or uninitialized + PetscFunctionReturn(0); + } + case ConsolidationStatus::Device: + { + if (gpuProc == 0) + { + // Deallocate the CSR matrix values, solution and RHS + CHECK(cudaFree(valuesCons)); + CHECK(cudaFree(pCons)); + CHECK(cudaFree(rhsCons)); + } + else + { + // Close the remaining IPC memory handles + CHECK(cudaIpcCloseMemHandle(valuesCons)); + CHECK(cudaIpcCloseMemHandle(pCons)); + CHECK(cudaIpcCloseMemHandle(rhsCons)); + } + break; + } + case ConsolidationStatus::Host: + { + if(gpuProc == 0) + { + delete[] valuesCons; + delete[] pCons; + delete[] rhsCons; + } + break; + } + default: + { + SETERRQ(MPI_COMM_WORLD, PETSC_ERR_SUP_SYS, + "Incorrect consolidation status set.\n"); + break; + } + + } + + // Free the local GPU partitioning structures + if(consolidationStatus == ConsolidationStatus::Device || consolidationStatus == ConsolidationStatus::Host) + { + nRowsInDevWorld.clear(); + nnzInDevWorld.clear(); + rowDispls.clear(); + nzDispls.clear(); + } + + consolidationStatus = ConsolidationStatus::Uninitialized; + + PetscFunctionReturn(0); +} diff --git a/src/init.cpp b/src/init.cpp new file mode 100644 index 
diff --git a/src/init.cpp b/src/init.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..d6afcd7578c7db434e907a9a2343c2d47b5f1352
--- /dev/null
+++ b/src/init.cpp
@@ -0,0 +1,336 @@
+/**
+ * \file init.cpp
+ * \brief Definition of some member functions of the class AmgXSolver.
+ * \author Pi-Yueh Chuang (pychuang@gwu.edu)
+ * \date 2016-01-08
+ * \copyright Copyright (c) 2015-2019 Pi-Yueh Chuang, Lorena A. Barba.
+ * This project is released under MIT License.
+ */
+
+
+// CUDA
+# include <cuda_runtime.h>
+
+// AmgXSolver
+# include "AmgXSolver.H"
+# include <iostream>
+
+/* \implements AmgXSolver::AmgXSolver */
+AmgXSolver::AmgXSolver(const MPI_Comm &comm,
+        const std::string &modeStr, const std::string &cfgFile)
+{
+    initialize(comm, modeStr, cfgFile);
+}
+
+
+/* \implements AmgXSolver::~AmgXSolver */
+AmgXSolver::~AmgXSolver()
+{
+    if (isInitialized) finalize();
+}
+
+
+/* \implements AmgXSolver::initialize */
+PetscErrorCode AmgXSolver::initialize(const MPI_Comm &comm,
+        const std::string &modeStr, const std::string &cfgFile)
+{
+    PetscErrorCode ierr;
+
+    PetscFunctionBeginUser;
+
+    // if this instance has already been initialized, skip
+    if (isInitialized) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE,
+            "This AmgXSolver instance has been initialized on this process.");
+
+    // increase the number of AmgXSolver instances
+    count += 1;
+
+    // get the name of this node
+    int len;
+    char name[MPI_MAX_PROCESSOR_NAME];
+    ierr = MPI_Get_processor_name(name, &len); CHK;
+    nodeName = name;
+
+    // get the mode of the AmgX solver
+    ierr = setMode(modeStr); CHK;
+
+    // initialize communicators and corresponding information
+    ierr = initMPIcomms(comm); CHK;
+
+    // only processes in gpuWorld are required to initialize AmgX
+    if (gpuProc == 0)
+    {
+        ierr = initAmgX(cfgFile); CHK;
+    }
+
+    // a bool indicating if this instance is initialized
+    isInitialized = true;
+
+    consolidationStatus = ConsolidationStatus::Uninitialized;
+
+    PetscFunctionReturn(0);
+}
+
+
+/* \implements AmgXSolver::initMPIcomms */
+PetscErrorCode AmgXSolver::initMPIcomms(const MPI_Comm &comm)
+{
+    PetscErrorCode ierr;
+
+    PetscFunctionBeginUser;
+
+    // duplicate the global communicator
+    ierr = MPI_Comm_dup(comm, &globalCpuWorld); CHK;
+    ierr = MPI_Comm_set_name(globalCpuWorld, "globalCpuWorld"); CHK;
+
+    // get size and rank for the global communicator
+    ierr = MPI_Comm_size(globalCpuWorld, &globalSize); CHK;
+    ierr = MPI_Comm_rank(globalCpuWorld, &myGlobalRank); CHK;
+
+
+    // get the communicator for processors on the same node (local world)
+    ierr = MPI_Comm_split_type(globalCpuWorld,
+            MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &localCpuWorld); CHK;
+    ierr = MPI_Comm_set_name(localCpuWorld, "localCpuWorld"); CHK;
+
+    // get size and rank for the local communicator
+    ierr = MPI_Comm_size(localCpuWorld, &localSize); CHK;
+    ierr = MPI_Comm_rank(localCpuWorld, &myLocalRank); CHK;
+
+
+    // set up the variable nDevs
+    ierr = setDeviceCount(); CHK;
+
+
+    // set up the corresponding ID of the device used by each local process
+    ierr = setDeviceIDs(); CHK;
+    ierr = MPI_Barrier(globalCpuWorld); CHK;
+
+
+    // split the global world into a world involved in AmgX and a null world
+    ierr = MPI_Comm_split(globalCpuWorld, gpuProc, 0, &gpuWorld); CHK;
+
+    // get size and rank for the communicator corresponding to gpuWorld
+    if (gpuWorld != MPI_COMM_NULL)
+    {
+        ierr = MPI_Comm_set_name(gpuWorld, "gpuWorld"); CHK;
+        ierr = MPI_Comm_size(gpuWorld, &gpuWorldSize); CHK;
+        ierr = MPI_Comm_rank(gpuWorld, &myGpuWorldRank); CHK;
+    }
+    else // for ranks that cannot communicate with GPU devices
+    {
+        gpuWorldSize = MPI_UNDEFINED;
+        myGpuWorldRank = MPI_UNDEFINED;
+    }
+
+    // split the local world into worlds corresponding to each CUDA device
+    ierr = MPI_Comm_split(localCpuWorld, devID, 0, &devWorld); CHK;
+    ierr = MPI_Comm_set_name(devWorld, "devWorld"); CHK;
+
+    // get size and rank for the communicator corresponding to devWorld
+    ierr = MPI_Comm_size(devWorld, &devWorldSize); CHK;
+    ierr = MPI_Comm_rank(devWorld, &myDevWorldRank); CHK;
+
+    ierr = MPI_Barrier(globalCpuWorld); CHK;
+
+    PetscFunctionReturn(0);
+}
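For orientation, the communicator hierarchy built above can be pictured for a hypothetical run with two nodes, four ranks per node, and two GPUs per node (illustrative numbers, not part of the patch):

    globalCpuWorld : ranks 0..7 (duplicate of the user-supplied communicator)
    localCpuWorld  : {0,1,2,3} on node 0, {4,5,6,7} on node 1 (shared-memory split)
    devWorld       : {0,1} sharing GPU 0 and {2,3} sharing GPU 1 on each node
    gpuWorld       : {0,2,4,6}, i.e. the leading rank (gpuProc == 0) of each devWorld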
+
+
+/* \implements AmgXSolver::setDeviceCount */
+PetscErrorCode AmgXSolver::setDeviceCount()
+{
+    PetscFunctionBeginUser;
+
+    // get the number of devices that AmgX solvers can use
+    switch (mode)
+    {
+        case AMGX_mode_dDDI: // for GPU cases, nDevs is the # of local GPUs
+        case AMGX_mode_dDFI:
+        case AMGX_mode_dFFI:
+            // get the total number of CUDA devices
+            CHECK(cudaGetDeviceCount(&nDevs));
+            std::cout << "Number of GPU devices :: " << nDevs << std::endl;
+
+            // check whether there is at least one CUDA device on this node
+            if (nDevs == 0) SETERRQ1(MPI_COMM_WORLD, PETSC_ERR_SUP_SYS,
+                    "There is no CUDA device on the node %s !\n", nodeName.c_str());
+            break;
+        case AMGX_mode_hDDI: // for CPU cases, nDevs is the # of local processes
+        case AMGX_mode_hDFI:
+        case AMGX_mode_hFFI:
+        default:
+            nDevs = localSize;
+            break;
+    }
+
+    PetscFunctionReturn(0);
+}
+
+
+/* \implements AmgXSolver::setDeviceIDs */
+PetscErrorCode AmgXSolver::setDeviceIDs()
+{
+    PetscFunctionBeginUser;
+
+    PetscErrorCode ierr;
+
+    // set the ID of the device that each local process will use
+    if (nDevs == localSize) // the numbers of devices and local processes are equal
+    {
+        devID = myLocalRank;
+        gpuProc = 0;
+    }
+    else if (nDevs > localSize) // there are more devices than processes
+    {
+        ierr = PetscPrintf(localCpuWorld, "CUDA devices on the node %s "
+                "are more than the MPI processes launched. Only %d CUDA "
+                "devices will be used.\n", nodeName.c_str(), localSize); CHK;
+
+        devID = myLocalRank;
+        gpuProc = 0;
+    }
+    else // there are more processes than devices
+    {
+        int nBasic = localSize / nDevs,
+            nRemain = localSize % nDevs;
+
+        if (myLocalRank < (nBasic+1)*nRemain)
+        {
+            devID = myLocalRank / (nBasic + 1);
+            if (myLocalRank % (nBasic + 1) == 0) gpuProc = 0;
+        }
+        else
+        {
+            devID = (myLocalRank - (nBasic+1)*nRemain) / nBasic + nRemain;
+            if ((myLocalRank - (nBasic+1)*nRemain) % nBasic == 0) gpuProc = 0;
+        }
+    }
+
+    // set the device for each rank
+    CHECK(cudaSetDevice(devID));
+
+    PetscFunctionReturn(0);
+}
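A worked example of the oversubscribed branch above (illustrative numbers): with localSize = 7 ranks and nDevs = 2 GPUs, nBasic = 3 and nRemain = 1, so the first device takes one extra rank:

    ranks 0..3 (below (nBasic+1)*nRemain = 4): devID = rank / 4 = 0; rank 0 becomes the devWorld leader
    ranks 4..6: devID = (rank - 4) / 3 + 1 = 1; rank 4 becomes the devWorld leader

Leaders receive gpuProc = 0 and later join gpuWorld; all other ranks keep gpuProc = MPI_UNDEFINED.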
" + "Please initialize it before finalization.\n"); CHK; + + PetscFunctionReturn(0); + } + + // only processes using GPU are required to destroy AmgX content + if (gpuProc == 0) + { + // destroy solver instance + AMGX_solver_destroy(solver); + + // destroy matrix instance + AMGX_matrix_destroy(AmgXA); + + // destroy RHS and unknown vectors + AMGX_vector_destroy(AmgXP); + AMGX_vector_destroy(AmgXRHS); + + // only the last instance need to destroy resource and finalizing AmgX + if (count == 1) + { + AMGX_resources_destroy(rsrc); + AMGX_SAFE_CALL(AMGX_config_destroy(cfg)); + + AMGX_SAFE_CALL(AMGX_finalize_plugins()); + AMGX_SAFE_CALL(AMGX_finalize()); + } + else + { + AMGX_config_destroy(cfg); + } + + // destroy gpuWorld + ierr = MPI_Comm_free(&gpuWorld); CHK; + } + + finalizeConsolidation(); + + // destroy PETSc objects + ierr = VecScatterDestroy(&scatterLhs); CHK; + ierr = VecScatterDestroy(&scatterRhs); CHK; + ierr = VecDestroy(&redistLhs); CHK; + ierr = VecDestroy(&redistRhs); CHK; + + // re-set necessary variables in case users want to reuse + // the variable of this instance for a new instance + gpuProc = MPI_UNDEFINED; + ierr = MPI_Comm_free(&globalCpuWorld); CHK; + ierr = MPI_Comm_free(&localCpuWorld); CHK; + ierr = MPI_Comm_free(&devWorld); CHK; + + // decrease the number of instances + count -= 1; + + // change status + isInitialized = false; + + PetscFunctionReturn(0); +} diff --git a/src/misc.cpp b/src/misc.cpp new file mode 100644 index 0000000000000000000000000000000000000000..86a7fa1157914abfbff9d0f25eb8950f6421b228 --- /dev/null +++ b/src/misc.cpp @@ -0,0 +1,62 @@ +/** + * \file misc.cpp + * \brief Definition of some member functions of the class AmgXSolver. + * \author Pi-Yueh Chuang (pychuang@gwu.edu) + * \date 2016-01-08 + * \copyright Copyright (c) 2015-2019 Pi-Yueh Chuang, Lorena A. Barba. + * This project is released under MIT License. + */ + + +// AmgXWrapper +# include "AmgXSolver.H" + + +/* \implements AmgXSolver::setMode */ +PetscErrorCode AmgXSolver::setMode(const std::string &modeStr) +{ + PetscFunctionBeginUser; + + if (modeStr == "dDDI") + mode = AMGX_mode_dDDI; + else if (modeStr == "dDFI") + mode = AMGX_mode_dDFI; + else if (modeStr == "dFFI") + mode = AMGX_mode_dFFI; + else if (modeStr[0] == 'h') + SETERRQ1(MPI_COMM_WORLD, PETSC_ERR_ARG_WRONG, + "CPU mode, %s, is not supported in this wrapper!", + modeStr.c_str()); + else + SETERRQ1(MPI_COMM_WORLD, PETSC_ERR_ARG_WRONG, + "%s is not an available mode! Available modes are: " + "dDDI, dDFI, dFFI.\n", modeStr.c_str()); + + PetscFunctionReturn(0); +} + + +/* \implements AmgXSolver::getIters */ +PetscErrorCode AmgXSolver::getIters(int &iter) +{ + PetscFunctionBeginUser; + + // only processes using AmgX will try to get # of iterations + if (gpuProc == 0) + AMGX_solver_get_iterations_number(solver, &iter); + + PetscFunctionReturn(0); +} + + +/* \implements AmgXSolver::getResidual */ +PetscErrorCode AmgXSolver::getResidual(const int &iter, double &res) +{ + PetscFunctionBeginUser; + + // only processes using AmgX will try to get residual + if (gpuProc == 0) + AMGX_solver_get_iteration_residual(solver, iter, 0, &res); + + PetscFunctionReturn(0); +} diff --git a/src/setA.cpp b/src/setA.cpp new file mode 100644 index 0000000000000000000000000000000000000000..39d514a5bededd7888f0d1d73c7d6f673b6a13c3 --- /dev/null +++ b/src/setA.cpp @@ -0,0 +1,525 @@ +/** + * \file setA.cpp + * \brief Definition of member functions regarding setting A in AmgXSolver. 
diff --git a/src/setA.cpp b/src/setA.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..39d514a5bededd7888f0d1d73c7d6f673b6a13c3
--- /dev/null
+++ b/src/setA.cpp
@@ -0,0 +1,525 @@
+/**
+ * \file setA.cpp
+ * \brief Definition of member functions regarding setting A in AmgXSolver.
+ * \author Pi-Yueh Chuang (pychuang@gwu.edu)
+ * \date 2016-01-08
+ * \copyright Copyright (c) 2015-2019 Pi-Yueh Chuang, Lorena A. Barba.
+ * \copyright Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+ * This project is released under MIT License.
+ */
+
+
+// STD
+# include <cstring>
+# include <algorithm>
+# include <iostream>
+
+// AmgXSolver
+# include "AmgXSolver.H"
+
+
+/* \implements AmgXSolver::setA */
+PetscErrorCode AmgXSolver::setA(const Mat &A)
+{
+    PetscFunctionBeginUser;
+
+    PetscErrorCode ierr;
+
+    Mat localA;
+
+    IS devIS;
+
+    PetscInt nGlobalRows,
+             nLocalRows;
+    PetscBool usesOffsets;
+
+    std::vector<PetscInt> row;
+    std::vector<PetscInt64> col;
+    std::vector<PetscScalar> data;
+    std::vector<PetscInt> partData;
+
+
+    // get number of rows in global matrix
+    ierr = MatGetSize(A, &nGlobalRows, nullptr); CHK;
+
+    // get the row indices of redistributed matrix owned by processes in gpuWorld
+    ierr = getDevIS(A, devIS); CHK;
+
+    // get sequential local portion of redistributed matrix
+    ierr = getLocalA(A, devIS, localA); CHK;
+
+    // get compressed row layout of the local Mat
+    ierr = getLocalMatRawData(localA, nLocalRows, row, col, data); CHK;
+
+    // destroy local matrix
+    ierr = destroyLocalA(A, localA); CHK;
+
+    // get a partition vector required by AmgX
+    ierr = getPartData(devIS, nGlobalRows, partData, usesOffsets); CHK;
+
+
+    // upload matrix A to AmgX
+    if (gpuWorld != MPI_COMM_NULL)
+    {
+        ierr = MPI_Barrier(gpuWorld); CHK;
+
+        // offsets need to be 64 bit, since we use 64 bit column indices
+        std::vector<PetscInt64> offsets;
+
+        AMGX_distribution_handle dist;
+        AMGX_distribution_create(&dist, cfg);
+        if (usesOffsets) {
+            offsets.assign(partData.begin(), partData.end());
+            AMGX_distribution_set_partition_data(dist, AMGX_DIST_PARTITION_OFFSETS, offsets.data());
+        } else {
+            AMGX_distribution_set_partition_data(dist, AMGX_DIST_PARTITION_VECTOR, partData.data());
+        }
+
+        AMGX_matrix_upload_distributed(
+                AmgXA, nGlobalRows, nLocalRows, row[nLocalRows],
+                1, 1, row.data(), col.data(), data.data(),
+                nullptr, dist);
+        AMGX_distribution_destroy(dist);
+
+        // bind the matrix A to the solver
+        ierr = MPI_Barrier(gpuWorld); CHK;
+        AMGX_solver_setup(solver, AmgXA);
+
+        // connect (bind) vectors to the matrix
+        AMGX_vector_bind(AmgXP, AmgXA);
+        AMGX_vector_bind(AmgXRHS, AmgXA);
+    }
+    ierr = MPI_Barrier(globalCpuWorld); CHK;
+
+    // destroy temporary PETSc objects
+    ierr = ISDestroy(&devIS); CHK;
+
+    PetscFunctionReturn(0);
+}
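The two partition encodings accepted above differ in size: offsets store one entry per gpuWorld rank plus one, while the partition vector stores one owning-rank id per global row. For instance (illustrative values, not from the patch), six global rows split evenly across two ranks would be described as:

    // Offsets encoding (usesOffsets == PETSC_TRUE):
    // rank r owns the half-open row range [offsets[r], offsets[r+1])
    std::vector<PetscInt64> offsets  = {0, 3, 6};
    // Partition-vector encoding: partData[i] = owning rank of global row i
    std::vector<PetscInt>   partData = {0, 0, 0, 1, 1, 1};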
+
+
+/* \implements AmgXSolver::getDevIS */
+PetscErrorCode AmgXSolver::getDevIS(const Mat &A, IS &devIS)
+{
+    PetscFunctionBeginUser;
+
+    PetscErrorCode ierr;
+    IS tempIS;
+
+    // get index sets of A locally owned by each process
+    // note that devIS is now a serial IS on each process
+    ierr = MatGetOwnershipIS(A, &devIS, nullptr); CHK;
+
+    // concatenate index sets that belong to the same devWorld
+    // note that devIS is now a parallel IS on the communicator devWorld
+    ierr = ISOnComm(devIS, devWorld, PETSC_USE_POINTER, &tempIS); CHK;
+    ierr = ISDestroy(&devIS); CHK;
+
+    // gather all indices that belong to a devWorld onto the leading rank of
+    // that devWorld. THIS IS NOT EFFICIENT!!
+    // note that devIS is now again a serial IS on each process
+    ierr = ISAllGather(tempIS, &devIS); CHK;
+    ierr = ISDestroy(&tempIS); CHK;
+
+    // empty devIS on ranks other than the leading ranks in each devWorld
+    if (myDevWorldRank != 0)
+        ierr = ISGeneralSetIndices(devIS, 0, nullptr, PETSC_COPY_VALUES); CHK;
+
+    // devIS is not guaranteed to be sorted. We sort it here.
+    ierr = ISSort(devIS); CHK;
+
+    PetscFunctionReturn(0);
+}
+
+
+/* \implements AmgXSolver::getLocalA */
+PetscErrorCode AmgXSolver::getLocalA(const Mat &A, const IS &devIS, Mat &localA)
+{
+    PetscFunctionBeginUser;
+
+    PetscErrorCode ierr;
+    MatType type;
+
+    // get the Mat type
+    ierr = MatGetType(A, &type); CHK;
+
+    // check whether the Mat type is supported
+    if (std::strcmp(type, MATSEQAIJ) == 0) // sequential AIJ
+    {
+        // make localA point to the same memory space as A does
+        localA = A;
+    }
+    else if (std::strcmp(type, MATMPIAIJ) == 0)
+    {
+        Mat tempA;
+
+        // redistribute the matrix and also get the corresponding scatters
+        ierr = redistMat(A, devIS, tempA); CHK;
+
+        // get the local matrix from the redistributed matrix
+        ierr = MatMPIAIJGetLocalMat(tempA, MAT_INITIAL_MATRIX, &localA); CHK;
+
+        // destroy the redistributed matrix
+        if (tempA == A)
+        {
+            tempA = nullptr;
+        }
+        else
+        {
+            ierr = MatDestroy(&tempA); CHK;
+        }
+    }
+    else
+    {
+        SETERRQ1(globalCpuWorld, PETSC_ERR_ARG_WRONG,
+                "Mat type %s is not supported!\n", type);
+    }
+
+    PetscFunctionReturn(0);
+}
+
+
+/* \implements AmgXSolver::redistMat */
+PetscErrorCode AmgXSolver::redistMat(const Mat &A, const IS &devIS, Mat &newA)
+{
+    PetscFunctionBeginUser;
+
+    PetscErrorCode ierr;
+
+    if (gpuWorldSize == globalSize) // no redistribution required
+    {
+        newA = A;
+    }
+    else
+    {
+        IS is;
+
+        // re-set the communicator of devIS to globalCpuWorld
+        ierr = ISOnComm(devIS, globalCpuWorld, PETSC_USE_POINTER, &is); CHK;
+
+        // redistribute the matrix A to newA
+        ierr = MatGetSubMatrix(A, is, is, MAT_INITIAL_MATRIX, &newA); CHK;
+
+        // get VecScatters between the original data layout and the new one
+        ierr = getVecScatter(A, newA, is); CHK;
+
+        // destroy the temporary IS
+        ierr = ISDestroy(&is); CHK;
+    }
+
+    PetscFunctionReturn(0);
+}
+
+
+/* \implements AmgXSolver::getVecScatter */
+PetscErrorCode AmgXSolver::getVecScatter(
+        const Mat &A1, const Mat &A2, const IS &devIS)
+{
+    PetscFunctionBeginUser;
+
+    PetscErrorCode ierr;
+
+    Vec tempLhs;
+    Vec tempRhs;
+
+    ierr = MatCreateVecs(A1, &tempLhs, &tempRhs); CHK;
+    ierr = MatCreateVecs(A2, &redistLhs, &redistRhs); CHK;
+
+    ierr = VecScatterCreate(tempLhs, devIS, redistLhs, devIS, &scatterLhs); CHK;
+    ierr = VecScatterCreate(tempRhs, devIS, redistRhs, devIS, &scatterRhs); CHK;
+
+    ierr = VecDestroy(&tempRhs); CHK;
+    ierr = VecDestroy(&tempLhs); CHK;
+
+    PetscFunctionReturn(0);
+}
+
+
+/* \implements AmgXSolver::getLocalMatRawData */
+PetscErrorCode AmgXSolver::getLocalMatRawData(const Mat &localA,
+        PetscInt &localN, std::vector<PetscInt> &row,
+        std::vector<PetscInt64> &col, std::vector<PetscScalar> &data)
+{
+    PetscFunctionBeginUser;
+
+    PetscErrorCode ierr;
+
+    const PetscInt *rawCol,
+                   *rawRow;
+
+    PetscScalar *rawData;
+
+    PetscInt rawN;
+
+    PetscBool done;
+
+    // get row and column indices in compressed row format
+    ierr = MatGetRowIJ(localA, 0, PETSC_FALSE, PETSC_FALSE,
+            &rawN, &rawRow, &rawCol, &done); CHK;
+
+    // rawN will be returned by MatRestoreRowIJ, so we have to copy it
+    localN = rawN;
+
+    // check whether the function worked
+    if (! done)
+        SETERRQ(globalCpuWorld, PETSC_ERR_SIG, "MatGetRowIJ did not work!");
+
+    // get the data array
+    ierr = MatSeqAIJGetArray(localA, &rawData); CHK;
+
+    // copy values to STL vectors. Note: there is an implicit conversion from
+    // PetscInt to PetscInt64 for the column vector
+    col.assign(rawCol, rawCol+rawRow[localN]);
+    row.assign(rawRow, rawRow+localN+1);
+    data.assign(rawData, rawData+rawRow[localN]);
+
+
+    // return ownership of the memory space to PETSc
+    ierr = MatRestoreRowIJ(localA, 0, PETSC_FALSE, PETSC_FALSE,
+            &rawN, &rawRow, &rawCol, &done); CHK;
+
+    // check whether the function worked
+    if (! done)
+        SETERRQ(globalCpuWorld, PETSC_ERR_SIG, "MatRestoreRowIJ did not work!");
+
+    // return ownership of the memory space to PETSc
+    ierr = MatSeqAIJRestoreArray(localA, &rawData); CHK;
+
+    PetscFunctionReturn(0);
+}
+
+
+/* \implements AmgXSolver::destroyLocalA */
+PetscErrorCode AmgXSolver::destroyLocalA(const Mat &A, Mat &localA)
+{
+    PetscFunctionBeginUser;
+
+    PetscErrorCode ierr;
+
+    MatType type;
+
+    // get the Mat type
+    ierr = MatGetType(A, &type); CHK;
+
+    // when A is sequential, we cannot destroy the memory space
+    if (std::strcmp(type, MATSEQAIJ) == 0)
+    {
+        localA = nullptr;
+    }
+    // for the parallel case, localA can be safely destroyed
+    else if (std::strcmp(type, MATMPIAIJ) == 0)
+    {
+        ierr = MatDestroy(&localA); CHK;
+    }
+
+    PetscFunctionReturn(0);
+}
+
+/* \implements AmgXSolver::checkForContiguousPartitioning */
+PetscErrorCode AmgXSolver::checkForContiguousPartitioning(
+        const IS &devIS, PetscBool &isContiguous, std::vector<PetscInt> &partOffsets)
+{
+    PetscFunctionBeginUser;
+    PetscErrorCode ierr;
+    PetscBool sorted;
+    PetscInt ismax = -2; // marker for "unsorted"; allows a check after the global gather
+
+    ierr = ISSorted(devIS, &sorted); CHK;
+    if (sorted)
+    {
+        ierr = ISGetMinMax(devIS, NULL, &ismax); CHK;
+    }
+    partOffsets.resize(gpuWorldSize);
+    ++ismax; // add 1 to allow reusing the gathered ismax values as partition offsets for AmgX
+    MPI_Allgather(&ismax, 1, MPIU_INT, &partOffsets[0], 1, MPIU_INT, gpuWorld);
+    bool all_sorted = std::is_sorted(partOffsets.begin(), partOffsets.end())
+                        && partOffsets[0] != -1;
+    if (all_sorted)
+    {
+        partOffsets.insert(partOffsets.begin(), 0); // partition 0 always starts at 0
+        isContiguous = PETSC_TRUE;
+    }
+    else
+    {
+        isContiguous = PETSC_FALSE;
+    }
+    PetscFunctionReturn(0);
+}
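A small worked example of the gather logic above, assuming two gpuWorld ranks where rank 0 owns rows 0..3 and rank 1 owns rows 4..7:

    // rank 0: rows {0,1,2,3}, sorted -> ismax = 3, contributes 3 + 1 = 4
    // rank 1: rows {4,5,6,7}, sorted -> ismax = 7, contributes 7 + 1 = 8
    // gathered {4, 8}: sorted, and no -1 sentinel in front
    //   -> prepend 0: partOffsets = {0, 4, 8}, isContiguous = PETSC_TRUE

If either IS were unsorted, its rank would contribute -1 (from the -2 marker plus one), which either breaks the sortedness of the gathered vector or trips the partOffsets[0] != -1 test, and the partition-vector path is used instead.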
+
+
+/* \implements AmgXSolver::getPartData */
+PetscErrorCode AmgXSolver::getPartData(
+        const IS &devIS, const PetscInt &N, std::vector<PetscInt> &partData, PetscBool &usesOffsets)
+{
+    PetscFunctionBeginUser;
+
+    PetscErrorCode ierr;
+
+    VecScatter scatter;
+    Vec tempMPI,
+        tempSEQ;
+
+    PetscInt n;
+    PetscScalar *tempPartVec;
+
+    ierr = ISGetLocalSize(devIS, &n); CHK;
+
+    if (gpuWorld != MPI_COMM_NULL)
+    {
+        // check if the partitioning is sorted and contiguous; then we can skip the expensive scatters
+        checkForContiguousPartitioning(devIS, usesOffsets, partData);
+        if (!usesOffsets)
+        {
+            ierr = VecCreateMPI(gpuWorld, n, N, &tempMPI); CHK;
+
+            IS is;
+            ierr = ISOnComm(devIS, gpuWorld, PETSC_USE_POINTER, &is); CHK;
+            ierr = VecISSet(tempMPI, is, (PetscScalar) myGpuWorldRank); CHK;
+            ierr = ISDestroy(&is); CHK;
+
+            ierr = VecScatterCreateToAll(tempMPI, &scatter, &tempSEQ); CHK;
+            ierr = VecScatterBegin(scatter,
+                    tempMPI, tempSEQ, INSERT_VALUES, SCATTER_FORWARD); CHK;
+            ierr = VecScatterEnd(scatter,
+                    tempMPI, tempSEQ, INSERT_VALUES, SCATTER_FORWARD); CHK;
+            ierr = VecScatterDestroy(&scatter); CHK;
+            ierr = VecDestroy(&tempMPI); CHK;
+
+            ierr = VecGetArray(tempSEQ, &tempPartVec); CHK;
+
+            partData.assign(tempPartVec, tempPartVec+N);
+
+            ierr = VecRestoreArray(tempSEQ, &tempPartVec); CHK;
+
+            ierr = VecDestroy(&tempSEQ); CHK;
+        }
+    }
+    ierr = MPI_Barrier(globalCpuWorld); CHK;
+
+    PetscFunctionReturn(0);
+}
+
+
+/* \implements AmgXSolver::setA */
+PetscErrorCode AmgXSolver::setA(
+    const PetscInt nGlobalRows,
+    const PetscInt nLocalRows,
+    const PetscInt nLocalNz,
+    const PetscInt* rowOffsets,
+    const PetscInt* colIndicesGlobal,
+    const PetscScalar* values,
+    const PetscInt* partData)
+{
+    PetscFunctionBeginUser;
+
+    // Merge the distributed matrix for MPI processes sharing a GPU
+    // consolidateMatrix(nLocalRows, nLocalNz, rowOffsets, colIndicesGlobal, values);
+
+    int ierr;
+
+    // upload matrix A to AmgX
+    if (gpuWorld != MPI_COMM_NULL)
+    {
+        ierr = MPI_Barrier(gpuWorld); CHK;
+
+        //if (consolidationStatus == ConsolidationStatus::None)
+        //{
+
+        std::cout << "Modified version :: " << ring << std::endl;
+        AMGX_matrix_upload_all_global_32(
+                AmgXA, nGlobalRows, nLocalRows, nLocalNz,
+                1, 1, rowOffsets, colIndicesGlobal, values,
+                nullptr, 1, 1, nullptr);
+
+/*
+        int* csr_rowOfs = (int*) malloc( sizeof(int) * (nrows+1) );
+        int* csr_rowIdx = (int*) malloc( sizeof(int) * totalNnz );
+        int* csr_colIdx = (int*) malloc( sizeof(int) * totalNnz );
+        double* csr_values = (double*) malloc( sizeof(double) * totalNnz );
+
+        CHECK(cudaMemcpy( csr_rowOfs, rowOffsets, (nrows+1) * sizeof(int), cudaMemcpyDeviceToHost));
+        CHECK(cudaMemcpy( csr_rowIdx, rowIndices, totalNnz * sizeof(int), cudaMemcpyDeviceToHost));
+        CHECK(cudaMemcpy( csr_colIdx, colIndices, totalNnz * sizeof(int), cudaMemcpyDeviceToHost));
+        CHECK(cudaMemcpy( csr_values, values, totalNnz * sizeof(double), cudaMemcpyDeviceToHost));
+
+        if ( diagIndexGlobal > 0 )
+        {
+            FILE* fid = fopen( "P1_CSR_rowOffs.txt", "w" );
+            for ( int i = 0; i < nrows+1; i++ ) fprintf( fid, "%d\n", csr_rowOfs[i] );
+            fclose(fid);
+            fid = fopen( "P1_CSR_col.txt", "w" );
+            for ( int i = 0; i < totalNnz; i++ ) fprintf( fid, "%d, %d, %g\n", csr_rowIdx[i], csr_colIdx[i], csr_values[i] );
+            fclose(fid);
+        }
+        else
+        {
+            FILE* fid = fopen( "P0_CSR_rowOffs.txt", "w" );
+            for ( int i = 0; i < nrows+1; i++ ) fprintf( fid, "%d\n", csr_rowOfs[i] );
+            fclose(fid);
+            fid = fopen( "P0_CSR_col.txt", "w" );
+            for ( int i = 0; i < totalNnz; i++ ) fprintf( fid, "%d, %d, %g\n", csr_rowIdx[i], csr_colIdx[i], csr_values[i] );
+            fclose(fid);
+        }
+*/
+
+        //}
+        //else
+        //{
+        //    AMGX_matrix_upload_all_global_32(
+        //            AmgXA, nGlobalRows, nConsRows, nConsNz,
+        //            1, 1, rowOffsetsCons, colIndicesGlobalCons, valuesCons,
+        //            nullptr, ring, ring, partData);
+        //
+        //    // The rowOffsets and colIndices are no longer needed
+        //    freeConsStructure();
+        //}
+
+        // bind the matrix A to the solver
+        ierr = MPI_Barrier(gpuWorld); CHK;
+        AMGX_solver_setup(solver, AmgXA);
+
+        // connect (bind) vectors to the matrix
+        AMGX_vector_bind(AmgXP, AmgXA);
+        AMGX_vector_bind(AmgXRHS, AmgXA);
+    }
+    ierr = MPI_Barrier(globalCpuWorld); CHK;
+
+    PetscFunctionReturn(0);
+}
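For reference, the CSR triplet expected by the raw setA overload above can be illustrated with a tiny single-rank case (values invented for the example; with one rank, global and local sizes coincide):

    //     | 2 -1  0 |     rowOffsets       = {0, 2, 5, 7}
    // A = |-1  2 -1 |     colIndicesGlobal = {0,1,  0,1,2,  1,2}
    //     | 0 -1  2 |     values           = {2,-1, -1,2,-1, -1,2}
    //
    // solver.setA(3, 3, 7, rowOffsets, colIndicesGlobal, values, nullptr);

Row i owns the entries in the half-open range [rowOffsets[i], rowOffsets[i+1]), and rowOffsets[nLocalRows] equals nLocalNz.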
+
+/* \implements AmgXSolver::updateA */
+PetscErrorCode AmgXSolver::updateA(
+    const PetscInt nLocalRows,
+    const PetscInt nLocalNz,
+    const PetscScalar* values)
+{
+    PetscFunctionBeginUser;
+
+    // Merges the values from multiple MPI processes sharing a single GPU
+    // reconsolidateValues(nLocalNz, values);
+
+    int ierr;
+
+    // Replace the coefficients of the CSR matrix A within AmgX
+    if (gpuWorld != MPI_COMM_NULL)
+    {
+        ierr = MPI_Barrier(gpuWorld); CHK;
+
+        //if (consolidationStatus == ConsolidationStatus::None)
+        //{
+        AMGX_matrix_replace_coefficients(AmgXA, nLocalRows, nLocalNz, values, nullptr);
+        //}
+        //else
+        //{
+        //    AMGX_matrix_replace_coefficients(AmgXA, nConsRows, nConsNz, valuesCons, nullptr);
+        //}
+
+        ierr = MPI_Barrier(gpuWorld); CHK;
+
+        // Re-setup the solver (a reduced-overhead setup that exploits the unchanged matrix structure)
+        AMGX_solver_resetup(solver, AmgXA);
+    }
+
+    ierr = MPI_Barrier(globalCpuWorld); CHK;
+
+    PetscFunctionReturn(0);
+}
diff --git a/src/solve.cpp b/src/solve.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..55a33f885f673983de3f480cb9a1aabdf33c27f3
--- /dev/null
+++ b/src/solve.cpp
@@ -0,0 +1,195 @@
+/**
+ * @file solve.cpp
+ * @brief Definition of member functions regarding solving in AmgXSolver.
+ * @author Pi-Yueh Chuang (pychuang@gwu.edu)
+ * @date 2016-01-08
+ * \copyright Copyright (c) 2015-2019 Pi-Yueh Chuang, Lorena A. Barba.
+ * This project is released under MIT License.
+ */
+
+
+// AmgXWrapper
+# include "AmgXSolver.H"
+
+
+/* \implements AmgXSolver::solve */
+PetscErrorCode AmgXSolver::solve(Vec &p, Vec &b)
+{
+    PetscFunctionBeginUser;
+
+    PetscErrorCode ierr;
+
+    if (globalSize != gpuWorldSize)
+    {
+        ierr = VecScatterBegin(scatterRhs,
+                b, redistRhs, INSERT_VALUES, SCATTER_FORWARD); CHK;
+        ierr = VecScatterBegin(scatterLhs,
+                p, redistLhs, INSERT_VALUES, SCATTER_FORWARD); CHK;
+
+        ierr = VecScatterEnd(scatterRhs,
+                b, redistRhs, INSERT_VALUES, SCATTER_FORWARD); CHK;
+        ierr = VecScatterEnd(scatterLhs,
+                p, redistLhs, INSERT_VALUES, SCATTER_FORWARD); CHK;
+
+        if (gpuWorld != MPI_COMM_NULL)
+        {
+            ierr = solve_real(redistLhs, redistRhs); CHK;
+        }
+        ierr = MPI_Barrier(globalCpuWorld); CHK;
+
+        ierr = VecScatterBegin(scatterLhs,
+                redistLhs, p, INSERT_VALUES, SCATTER_REVERSE); CHK;
+        ierr = VecScatterEnd(scatterLhs,
+                redistLhs, p, INSERT_VALUES, SCATTER_REVERSE); CHK;
+    }
+    else
+    {
+        if (gpuWorld != MPI_COMM_NULL)
+        {
+            ierr = solve_real(p, b); CHK;
+        }
+        ierr = MPI_Barrier(globalCpuWorld); CHK;
+    }
+
+    PetscFunctionReturn(0);
+}
" + "The error code is %d.\n", status); + + // download data from device + AMGX_vector_download(AmgXP, unks); + + // restore PETSc vectors + ierr = VecRestoreArray(p, &unks); CHK; + ierr = VecRestoreArray(b, &rhs); CHK; + + PetscFunctionReturn(0); +} + + +/* \implements AmgXSolver::solve */ +PetscErrorCode AmgXSolver::solve(PetscScalar* p, const PetscScalar* b, const int nRows) +{ + PetscFunctionBeginUser; + + int ierr; + + if (consolidationStatus == ConsolidationStatus::Device) + { + CHECK(cudaMemcpy((void**)&pCons[rowDispls[myDevWorldRank]], p, sizeof(PetscScalar) * nRows, cudaMemcpyDefault)); + CHECK(cudaMemcpy((void**)&rhsCons[rowDispls[myDevWorldRank]], b, sizeof(PetscScalar) * nRows, cudaMemcpyDefault)); + + // Must synchronize here as device to device copies are non-blocking w.r.t host + CHECK(cudaDeviceSynchronize()); + ierr = MPI_Barrier(devWorld); CHK; + } + else if (consolidationStatus == ConsolidationStatus::Host) + { + MPI_Request req[2]; + ierr = MPI_Igatherv(p, nRows, MPI_DOUBLE, &pCons[rowDispls[myDevWorldRank]], nRowsInDevWorld.data(), rowDispls.data(), MPI_DOUBLE, 0, devWorld, &req[0]); CHK; + ierr = MPI_Igatherv(b, nRows, MPI_DOUBLE, &rhsCons[rowDispls[myDevWorldRank]], nRowsInDevWorld.data(), rowDispls.data(), MPI_DOUBLE, 0, devWorld, &req[1]); CHK; + MPI_Waitall(2, req, MPI_STATUSES_IGNORE); + } + + if (gpuWorld != MPI_COMM_NULL) + { + // Upload potentially consolidated vectors to AmgX + if (consolidationStatus == ConsolidationStatus::None) + { + AMGX_vector_upload(AmgXP, nRows, 1, p); + AMGX_vector_upload(AmgXRHS, nRows, 1, b); + } + else + { + AMGX_vector_upload(AmgXP, nConsRows, 1, pCons); + AMGX_vector_upload(AmgXRHS, nConsRows, 1, rhsCons); + } + + ierr = MPI_Barrier(gpuWorld); CHK; + + // Solve + AMGX_solver_solve(solver, AmgXRHS, AmgXP); + + // Get the status of the solver + AMGX_SOLVE_STATUS status; + AMGX_solver_get_status(solver, &status); + + // Check whether the solver successfully solved the problem + if (status != AMGX_SOLVE_SUCCESS) + SETERRQ1(globalCpuWorld, + PETSC_ERR_CONV_FAILED, "AmgX solver failed to solve the system! " + "The error code is %d.\n", + status); + + // Download data from device + if (consolidationStatus == ConsolidationStatus::None) + { + AMGX_vector_download(AmgXP, p); + } + else + { + AMGX_vector_download(AmgXP, pCons); + + // AMGX_vector_download invokes a device to device copy here, so it is essential that + // the root rank blocks the host before other ranks copy from the consolidated solution + CHECK(cudaDeviceSynchronize()); + } + } + + // If the matrix is consolidated, scatter the + if (consolidationStatus == ConsolidationStatus::Device) + { + // Must synchronise before each rank attempts to read from the consolidated solution + ierr = MPI_Barrier(devWorld); CHK; + + CHECK(cudaMemcpy((void **)p, &pCons[rowDispls[myDevWorldRank]], sizeof(PetscScalar) * nRows, cudaMemcpyDefault)); + CHECK(cudaDeviceSynchronize()); + } + else if (consolidationStatus == ConsolidationStatus::Host) + { + // Must synchronise before each rank attempts to read from the consolidated solution + ierr = MPI_Barrier(devWorld); CHK; + + ierr = MPI_Scatterv(&pCons[rowDispls[myDevWorldRank]], nRowsInDevWorld.data(), rowDispls.data(), MPI_DOUBLE, p, nRows, MPI_DOUBLE, 0, devWorld); CHK; + } + + ierr = MPI_Barrier(globalCpuWorld); CHK; + + PetscFunctionReturn(0); +}