diff --git a/Makefile b/Makefile
index 6c74edcf2fa524a7d36a303e272e5b30b9aba565..5a77bea191ba64617e3837654a2177210ecd7dad 100644
--- a/Makefile
+++ b/Makefile
@@ -21,11 +21,70 @@
 #
 
 # CUB is released in cuda 11. Using c++11 or c++14 fixed the ieee128 issue.
+.PHONY: clean
+
+SRCDIR=./src
+INCDIR=./inc
+OBJDIR=./obj
+LIBDIR=./lib
 
 NVARCH ?= 70
+RM=rm -rf
+
+NVCL=nvcc
+NVCC=nvcc -c
+NVCCINCFLAGS=-I$(INCDIR) -I${CUBROOT} -I${SPECTRUM_MPI_HOME}/include  -I${PETSC_INC} -I${AMGX_INC}
+NVCCFLAGS=-O0 -g -std=c++14 --compiler-options='-fPIC -std=c++14' -arch=sm_$(NVARCH)
+
+
+
+OBJECTS= \
+$(OBJDIR)/AmgXSolver.o \
+$(OBJDIR)/consolidate.o \
+$(OBJDIR)/FOAM2CSR.o \
+$(OBJDIR)/init.o \
+$(OBJDIR)/misc.o \
+$(OBJDIR)/setA.o \
+$(OBJDIR)/solve.o
+
+
+
+$(LIBDIR)/libFOAM2CSR.so: $(OBJECTS)
+	$(NVCL) $(NVCCFLAGS) -L${AMGX_LIB} -lamgx --shared $^ -o $@
+
+
+$(OBJDIR)/AmgXSolver.o: $(SRCDIR)/AmgXSolver.cpp $(INCDIR)/AmgXSolver.H
+	$(NVCC) $(NVCCFLAGS) $(NVCCINCFLAGS) $< -o $@
+
+
+$(OBJDIR)/consolidate.o: $(SRCDIR)/consolidate.cu $(INCDIR)/AmgXSolver.H
+	$(NVCC) $(NVCCFLAGS) $(NVCCINCFLAGS) $< -o $@
+
+
+$(OBJDIR)/FOAM2CSR.o: $(SRCDIR)/FOAM2CSR.cu $(INCDIR)/FOAM2CSR.H
+	$(NVCC) $(NVCCFLAGS) $(NVCCINCFLAGS) $< -o $@
+
+
+$(OBJDIR)/init.o: $(SRCDIR)/init.cpp $(INCDIR)/AmgXSolver.H
+	$(NVCC) $(NVCCFLAGS) $(NVCCINCFLAGS) $< -o $@
+
+
+$(OBJDIR)/misc.o: $(SRCDIR)/misc.cpp $(INCDIR)/AmgXSolver.H
+	$(NVCC) $(NVCCFLAGS) $(NVCCINCFLAGS) $< -o $@
+
+
+$(OBJDIR)/setA.o: $(SRCDIR)/setA.cpp $(INCDIR)/AmgXSolver.H
+	$(NVCC) $(NVCCFLAGS) $(NVCCINCFLAGS) $< -o $@
+
+
+$(OBJDIR)/solve.o: $(SRCDIR)/solve.cpp $(INCDIR)/AmgXSolver.H
+	$(NVCC) $(NVCCFLAGS) $(NVCCINCFLAGS) $< -o $@
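+
+# The seven object rules above share one recipe; a pattern rule could replace
+# them (untested sketch, assumes no per-file flags or extra prerequisites):
+#
+#   $(OBJDIR)/%.o: $(SRCDIR)/%.cpp $(INCDIR)/AmgXSolver.H
+#   	$(NVCC) $(NVCCFLAGS) $(NVCCINCFLAGS) $< -o $@
+#
+#   $(OBJDIR)/%.o: $(SRCDIR)/%.cu
+#   	$(NVCC) $(NVCCFLAGS) $(NVCCINCFLAGS) $< -o $@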
 
-all:
-	nvcc -O3 -I. -I${CUBROOT} -std=c++14 --compiler-options='-fPIC -std=c++14' -arch=sm_${NVARCH} --shared FOAM2CSR.cu -o libFOAM2CSR.so
 
 clean:
-	rm -rf libFOAM2CSR.so
+	$(RM) $(LIBDIR)/*.so
+	$(RM) $(OBJDIR)/*.o
+
+#
+#	nvcc -O3 -I. -I${CUBROOT} -std=c++14 --compiler-options='-fPIC -std=c++14' -arch=sm_${NVARCH} --shared FOAM2CSR.cu -o libFOAM2CSR.so
+#
diff --git a/Makefile.orig b/Makefile.orig
new file mode 100644
index 0000000000000000000000000000000000000000..6c74edcf2fa524a7d36a303e272e5b30b9aba565
--- /dev/null
+++ b/Makefile.orig
@@ -0,0 +1,31 @@
+#
+# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+# DEALINGS IN THE SOFTWARE.
+#
+
+# CUB is released in cuda 11. Using c++11 or c++14 fixed the ieee128 issue.
+
+NVARCH ?= 70
+
+all:
+	nvcc -O3 -I. -I${CUBROOT} -std=c++14 --compiler-options='-fPIC -std=c++14' -arch=sm_${NVARCH} --shared FOAM2CSR.cu -o libFOAM2CSR.so
+
+clean:
+	rm -rf libFOAM2CSR.so
diff --git a/inc/AmgXSolver.H b/inc/AmgXSolver.H
new file mode 100644
index 0000000000000000000000000000000000000000..7ac8a56d0a91ac1ebccc75707e18fbffc182536d
--- /dev/null
+++ b/inc/AmgXSolver.H
@@ -0,0 +1,617 @@
+/**
+ * \file AmgXSolver.H
+ * \brief Definition of class AmgXSolver.
+ * \author Pi-Yueh Chuang (pychuang@gwu.edu)
+ * \date 2015-09-01
+ * \copyright Copyright (c) 2015-2019 Pi-Yueh Chuang, Lorena A. Barba.
+ * \copyright Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+ *            This project is released under MIT License.
+ */
+
+
+#ifndef __AMGX_SOLVER_H__
+#define __AMGX_SOLVER_H__
+
+// CUDA
+#include <cuda_runtime.h>
+
+// STL
+# include <string>
+# include <vector>
+
+// AmgX
+# include <amgx_c.h>
+
+// PETSc
+# include <petscmat.h>
+# include <petscvec.h>
+
+
+/** \brief A macro to check the returned CUDA error code.
+ *
+ * \param call [in] Function call to CUDA API.
+ */
+# define CHECK(call)                                                        \
+{                                                                           \
+    const cudaError_t       error = call;                                   \
+    if (error != cudaSuccess)                                               \
+    {                                                                       \
+        SETERRQ4(PETSC_COMM_WORLD, PETSC_ERR_SIG,                           \
+            "Error: %s:%d, code:%d, reason: %s\n",                          \
+            __FILE__, __LINE__, error, cudaGetErrorString(error));          \
+    }                                                                       \
+}
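+
+// Example usage (illustrative; must appear in a function returning
+// PetscErrorCode, since the macro expands to SETERRQ4):
+//   CHECK(cudaMemcpy(dst, src, nbytes, cudaMemcpyDefault));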
+
+
+/** \brief A shorter macro for checking PETSc returned error code. */
+# define CHK CHKERRQ(ierr)
+
+
+/** \brief A wrapper class for coupling PETSc and AmgX.
+ *
+ * This class is a wrapper of the AmgX library for PETSc. PETSc users only need
+ * to pass a PETSc matrix and vectors into an AmgXSolver instance to solve their
+ * linear systems. The class is designed specifically for the situation where
+ * the number of MPI processes is greater than the number of GPU devices.
+ *
+ * Example usage:
+ * \code
+ * int main(int argc, char **argv)
+ * {
+ *     // initialize matrix A, RHS, etc using PETSc
+ *     ...
+ *
+ *     // create an instance of the solver wrapper
+ *     AmgXSolver    solver;
+ *     // initialize the instance with communicator, execution mode, and config file
+ *     solver.initialize(comm, mode, file);
+ *     // set matrix A. Currently only PETSc AIJ matrices are accepted
+ *     solver.setA(A);
+ *     // solve. unks and rhs are PETSc vectors; unks holds the solution on return
+ *     solver.solve(unks, rhs);
+ *     // get number of iterations
+ *     int         iters;
+ *     solver.getIters(iters);
+ *     // get residual at the last iteration
+ *     double      res;
+ *     solver.getResidual(iters, res);
+ *     // finalization
+ *     solver.finalize();
+ *
+ *     // other codes
+ *     ....
+ *
+ *     return 0;
+ * }
+ * \endcode
+ */
+class AmgXSolver
+{
+    public:
+
+        /** \brief Default constructor. */
+        AmgXSolver() = default;
+
+
+        /** \brief Construct an AmgXSolver instance.
+         *
+         * \param comm [in] MPI communicator.
+         * \param modeStr [in] A string; target mode of AmgX (e.g., dDDI).
+         * \param cfgFile [in] A string; the path to AmgX configuration file.
+         */
+        AmgXSolver(const MPI_Comm &comm,
+                const std::string &modeStr, const std::string &cfgFile);
+
+
+        /** \brief Destructor. */
+        ~AmgXSolver();
+
+
+        /** \brief Initialize an AmgXSolver instance.
+         *
+         * \param comm [in] MPI communicator.
+         * \param modeStr [in] A string; target mode of AmgX (e.g., dDDI).
+         * \param cfgFile [in] A string; the path to AmgX configuration file.
+         *
+         * \return PetscErrorCode.
+         */
+        PetscErrorCode initialize(const MPI_Comm &comm,
+                const std::string &modeStr, const std::string &cfgFile);
+
+
+        /** \brief Finalize this instance.
+         *
+         * This function destroys AmgX data. When more than one AmgXSolver
+         * instance exists, the last one destroyed is also in charge of
+         * destroying the shared resource object and finalizing AmgX.
+         *
+         * \return PetscErrorCode.
+         */
+        PetscErrorCode finalize();
+
+
+        /** \brief Set up the matrix used by AmgX.
+         *
+         * This function automatically converts a PETSc matrix to an AmgX matrix.
+         * If the number of MPI processes is greater than the number of available
+         * GPU devices, it also redistributes the matrix and uploads the
+         * redistributed matrix to the GPUs.
+         *
+         * Note: currently we can only handle AIJ/MPIAIJ format.
+         *
+         * \param A [in] A PETSc Mat.
+         *
+         * \return PetscErrorCode.
+         */
+        PetscErrorCode setA(const Mat &A);
+
+
+        /** \brief Set up the matrix used by AmgX.
+         *
+         * This function sets up the AmgX matrix from the provided CSR data
+         * structures and partition data.
+         *
+         * \param nGlobalRows [in] The number of global rows.
+         * \param nLocalRows [in] The number of local rows on this rank.
+         * \param nLocalNz [in] The total number of non-zero entries locally.
+         * \param rowOffsets [in] The local CSR matrix row offsets.
+         * \param colIndicesGlobal [in] The global CSR matrix column indices.
+         * \param values [in] The local CSR matrix values.
+         * \param partData [in] Array of length nGlobalRows containing the rank
+         * id of the owning rank for each row.
+         *
+         * \return PetscErrorCode.
+         */
+        PetscErrorCode setA(
+            const PetscInt nGlobalRows,
+            const PetscInt nLocalRows,
+            const PetscInt nLocalNz,
+            const PetscInt* rowOffsets,
+            const PetscInt* colIndicesGlobal,
+            const PetscScalar* values,
+            const PetscInt* partData);
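+
+        /* Illustrative CSR layout for the interface above (not normative): a
+         * local 2x3 block [[4 0 1], [0 2 0]] whose first global column is 0
+         * gives
+         *   rowOffsets       = {0, 2, 3}
+         *   colIndicesGlobal = {0, 2, 1}
+         *   values           = {4.0, 1.0, 2.0}
+         */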
+
+
+        /** \brief Re-set up an existing AmgX matrix.
+         *
+         * Replaces the matrix coefficients with the provided values and performs
+         * a resetup of the AmgX matrix.
+         *
+         * \param nLocalRows [in] The number of local rows on this rank.
+         * \param nLocalNz [in] The total number of non-zero entries locally.
+         * \param values [in] The local CSR matrix values.
+         *
+         * \return PetscErrorCode.
+         */
+        PetscErrorCode updateA(
+            const PetscInt nLocalRows,
+            const PetscInt nLocalNz,
+            const PetscScalar* values);
+
+
+        /** \brief Solve the linear system.
+         *
+         * The \p p vector is used as the initial guess and is overwritten with
+         * the solution by the end of the solve.
+         *
+         * For cases that use more MPI processes than GPUs, this function
+         * gathers data before the solve and scatters the result back afterwards.
+         *
+         * \param p [in, out] A PETSc Vec object representing unknowns.
+         * \param b [in] A PETSc Vec representing right-hand-side.
+         *
+         * \return PetscErrorCode.
+         */
+        PetscErrorCode solve(Vec &p, Vec &b);
+
+
+        /** \brief Solve the linear system.
+         *
+         * The \p p vector is used as the initial guess and is overwritten with
+         * the solution by the end of the solve.
+         *
+         * For cases that use more MPI processes than GPUs, this function
+         * gathers data before the solve and scatters the result back afterwards.
+         *
+         * \param p [in, out] The unknown array.
+         * \param b [in] The RHS array.
+         * \param nRows [in] The number of rows in this rank.
+         *
+         * \return PetscErrorCode.
+         */
+        PetscErrorCode solve(PetscScalar *p, const PetscScalar *b, const int nRows);
+
+
+        /** \brief Get the number of iterations of the last solve.
+         *
+         * \param iter [out] Number of iterations.
+         *
+         * \return PetscErrorCode.
+         */
+        PetscErrorCode getIters(int &iter);
+
+
+        /** \brief Get the residual at a specific iteration of the last solve.
+         *
+         * \param iter [in] Target iteration.
+         * \param res [out] Returned residual.
+         *
+         * \return PetscErrorCode.
+         */
+        PetscErrorCode getResidual(const int &iter, double &res);
+
+
+    private:
+
+        /** \brief Current count of AmgXSolver instances.
+         *
+         * This static variable counts the number of instances. The first
+         * instance is responsible for initializing the AmgX library and the
+         * resource instance.
+         */
+        static int              count;
+
+        /** \brief A flag indicating if this instance has been initialized. */
+        bool                    isInitialized = false;
+
+        /** \brief The name of the node that this MPI process belongs to. */
+        std::string             nodeName;
+
+
+
+
+        /** \brief Number of local GPU devices used by AmgX.*/
+        PetscMPIInt             nDevs;
+
+        /** \brief The ID of corresponding GPU device used by this MPI process. */
+        PetscMPIInt             devID;
+
+        /** \brief A flag indicating if this process will talk to a GPU. */
+        PetscMPIInt             gpuProc = MPI_UNDEFINED;
+
+        /** \brief A communicator for global world. */
+        MPI_Comm                globalCpuWorld;
+
+        /** \brief A communicator for local world (i.e., in-node). */
+        MPI_Comm                localCpuWorld;
+
+        /** \brief A communicator for MPI processes that can talk to GPUs. */
+        MPI_Comm                gpuWorld;
+
+        /** \brief A communicator for processes sharing the same devices. */
+        MPI_Comm                devWorld;
+
+        /** \brief Size of \ref AmgXSolver::globalCpuWorld "globalCpuWorld". */
+        PetscMPIInt             globalSize;
+
+        /** \brief Size of \ref AmgXSolver::localCpuWorld "localCpuWorld". */
+        PetscMPIInt             localSize;
+
+        /** \brief Size of \ref AmgXSolver::gpuWorld "gpuWorld". */
+        PetscMPIInt             gpuWorldSize;
+
+        /** \brief Size of \ref AmgXSolver::devWorld "devWorld". */
+        PetscMPIInt             devWorldSize;
+
+        /** \brief Rank in \ref AmgXSolver::globalCpuWorld "globalCpuWorld". */
+        PetscMPIInt             myGlobalRank;
+
+        /** \brief Rank in \ref AmgXSolver::localCpuWorld "localCpuWorld". */
+        PetscMPIInt             myLocalRank;
+
+        /** \brief Rank in \ref AmgXSolver::gpuWorld "gpuWorld". */
+        PetscMPIInt             myGpuWorldRank;
+
+        /** \brief Rank in \ref AmgXSolver::devWorld "devWorld". */
+        PetscMPIInt             myDevWorldRank;
+
+
+
+
+        /** \brief A parameter used by AmgX. */
+        int                     ring;
+
+        /** \brief AmgX solver mode. */
+        AMGX_Mode               mode;
+
+        /** \brief AmgX config object. */
+        AMGX_config_handle      cfg = nullptr;
+
+        /** \brief AmgX matrix object. */
+        AMGX_matrix_handle      AmgXA = nullptr;
+
+        /** \brief AmgX vector object representing unknowns. */
+        AMGX_vector_handle      AmgXP = nullptr;
+
+        /** \brief AmgX vector object representing RHS. */
+        AMGX_vector_handle      AmgXRHS = nullptr;
+
+        /** \brief AmgX solver object. */
+        AMGX_solver_handle      solver = nullptr;
+
+        /** \brief AmgX resource object.
+         *
+         * Due to the design of the AmgX library, using more than one resource
+         * instance may cause problems, so the resource instance is kept as a
+         * static member to ensure only one exists.
+         */
+        static AMGX_resources_handle   rsrc;
+
+        /** \brief Initialize consolidation, if required. Allocates space to hold
+         * the consolidated CSR matrix and solution/RHS vectors on the root rank
+         * and, if consolidating device pointers, allocates and opens IPC memory
+         * handles. Calculates and stores consolidated sizes and displacements.
+         *
+         * \param nLocalRows [in] The number of rows owned by this rank.
+         * \param nLocalNz [in] The number of non-zeros owned by this rank.
+         * \param values [in] The values of the CSR matrix A.
+         *
+         * \return PetscErrorCode.
+         */
+        PetscErrorCode initializeConsolidation(
+            const PetscInt nLocalRows,
+            const PetscInt nLocalNz,
+            const PetscScalar *values);
+
+        /** \brief Realise consolidation, if required. Copies data from multiple ranks
+         * that share a single GPU so that a single consolidated CSR matrix can be
+         * passed to AmgX. This lets users run efficiently with more MPI ranks than
+         * GPUs, potentially improving performance where un-accelerated components of
+         * an application require CPU resources. CUDA IPC is leveraged to avoid staging
+         * data through the host, instead enabling fast intra-GPU copies when the local
+         * CSR matrices are already on the GPU.
+         *
+         * \param nLocalRows [in] The number of rows owned by this rank.
+         * \param nLocalNz [in] The number of non-zeros owned by this rank.
+         * \param rowOffsets [in] The row offsets of the CSR matrix A.
+         * \param colIndicesGlobal [in] The global column indices of the CSR matrix A.
+         * \param values [in] The values of the CSR matrix A.
+         *
+         * \return PetscErrorCode.
+         */
+        PetscErrorCode consolidateMatrix(
+            const PetscInt nLocalRows,
+            const PetscInt nLocalNz,
+            const PetscInt *rowOffsets,
+            const PetscInt *colIndicesGlobal,
+            const PetscScalar *values);
+
+        /** \brief Re-consolidates the values of the CSR matrix from multiple ranks.
+         *
+         * \param nLocalNz [in] The number of non-zeros owned by this rank.
+         * \param values [in] The values of the CSR matrix A.
+         *
+         * \return PetscErrorCode.
+         */
+        PetscErrorCode reconsolidateValues(
+            const PetscInt nLocalNz,
+            const PetscScalar *values);
+
+        /** \brief De-allocates consolidated data structures, if any, after the AmgX matrix
+         * has been constructed and the duplicate data becomes redundant.
+         *
+         * \return PetscErrorCode.
+         */
+        PetscErrorCode freeConsStructure();
+
+        /** \brief De-allocates all consolidated matrix structures.
+         *
+         * \return PetscErrorCode.
+         */
+        PetscErrorCode finalizeConsolidation();
+
+        /** \brief Enumeration for the status of matrix consolidation for the solver.*/
+        enum class ConsolidationStatus {
+            Uninitialized,
+            None,
+            Host,
+            Device
+        } consolidationStatus;
+
+        /** \brief The number of non-zeros consolidated from multiple ranks to a device.*/
+        int nConsNz = 0;
+
+        /** \brief The number of rows consolidated from multiple ranks to a device.*/
+        int nConsRows = 0;
+
+        /** \brief The row offsets consolidated onto a single device.*/
+        PetscInt *rowOffsetsCons = nullptr;
+
+        /** \brief The global column indices consolidated onto a single device.*/
+        PetscInt *colIndicesGlobalCons = nullptr;
+
+        /** \brief The values consolidated onto a single device.*/
+        PetscScalar* valuesCons = nullptr;
+
+        /** \brief The solution vector consolidated onto a single device.*/
+        PetscScalar* pCons = nullptr;
+
+        /** \brief The RHS vector consolidated onto a single device.*/
+        PetscScalar* rhsCons = nullptr;
+
+        /** \brief The number of rows per rank associated with a single device.*/
+        std::vector<int> nRowsInDevWorld;
+
+        /** \brief The number of non-zeros per rank associated with a single device.*/
+        std::vector<int> nnzInDevWorld;
+
+        /** \brief The row displacements per rank associated with a single device.*/
+        std::vector<int> rowDispls;
+
+        /** \brief The non-zero displacements per rank associated with a single device.*/
+        std::vector<int> nzDispls;
+
+        /** \brief A VecScatter for gathering/scattering between original PETSc
+         *         Vec and temporary redistributed PETSc Vec.*/
+        VecScatter              scatterLhs = nullptr;
+
+        /** \brief A VecScatter for gathering/scattering between original PETSc
+         *         Vec and temporary redistributed PETSc Vec.*/
+        VecScatter              scatterRhs = nullptr;
+
+        /** \brief A temporary PETSc Vec holding redistributed unknowns. */
+        Vec                     redistLhs = nullptr;
+
+        /** \brief A temporary PETSc Vec holding redistributed RHS. */
+        Vec                     redistRhs = nullptr;
+
+
+
+
+        /** \brief Set AmgX solver mode based on the user-provided string.
+         *
+         * Available modes are: dDDI, dDFI, dFFI, hDDI, hDFI, hFFI.
+         *
+         * \param modeStr [in] a std::string.
+         * \return PetscErrorCode.
+         */
+        PetscErrorCode setMode(const std::string &modeStr);
+
+
+        /** \brief Get the number of GPU devices on this computing node.
+         *
+         * \return PetscErrorCode.
+         */
+        PetscErrorCode setDeviceCount();
+
+
+        /** \brief Set the ID of the corresponding GPU used by this process.
+         *
+         * \return PetscErrorCode.
+         */
+        PetscErrorCode setDeviceIDs();
+
+
+        /** \brief Initialize all MPI communicators.
+         *
+         * The \p comm provided will be duplicated and saved to the
+         * \ref AmgXSolver::globalCpuWorld "globalCpuWorld".
+         *
+         * \param comm [in] Global communicator.
+         * \return PetscErrorCode.
+         */
+        PetscErrorCode initMPIcomms(const MPI_Comm &comm);
+
+
+        /** \brief Perform necessary initialization of AmgX.
+         *
+         * This function initializes AmgX for the current instance. Based on
+         * \ref AmgXSolver::count "count", only the instance initialized first
+         * is in charge of initializing AmgX and the resource instance.
+         *
+         * \param cfgFile [in] Path to AmgX solver configuration file.
+         * \return PetscErrorCode.
+         */
+        PetscErrorCode initAmgX(const std::string &cfgFile);
+
+
+        /** \brief Get IS for the row indices that processes in
+         *      \ref AmgXSolver::gpuWorld "gpuWorld" will hold.
+         *
+         * \param A [in] PETSc matrix.
+         * \param devIS [out] PETSc IS.
+         * \return PetscErrorCode.
+         */
+        PetscErrorCode getDevIS(const Mat &A, IS &devIS);
+
+
+        /** \brief Get local sequential PETSc Mat of the redistributed matrix.
+         *
+         * \param A [in] Original PETSc Mat.
+         * \param devIS [in] PETSc IS representing redistributed row indices.
+         * \param localA [out] Local sequential redistributed matrix.
+         * \return PetscErrorCode.
+         */
+        PetscErrorCode getLocalA(const Mat &A, const IS &devIS, Mat &localA);
+
+
+        /** \brief Redistribute matrix.
+         *
+         * \param A [in] Original PETSc Mat object.
+         * \param devIS [in] PETSc IS representing redistributed rows.
+         * \param newA [out] Redistributed matrix.
+         * \return PetscErrorCode.
+         */
+        PetscErrorCode redistMat(const Mat &A, const IS &devIS, Mat &newA);
+
+
+        /** \brief Get \ref AmgXSolver::scatterLhs "scatterLhs" and
+         *      \ref AmgXSolver::scatterRhs "scatterRhs".
+         *
+         * \param A1 [in] Original PETSc Mat object.
+         * \param A2 [in] Redistributed PETSc Mat object.
+         * \param devIS [in] PETSc IS representing redistributed row indices.
+         * \return PetscErrorCode.
+         */
+        PetscErrorCode getVecScatter(const Mat &A1, const Mat &A2, const IS &devIS);
+
+
+        /** \brief Get data of compressed row layout of local sparse matrix.
+         *
+         * \param localA [in] Sequential local redistributed PETSc Mat.
+         * \param localN [out] Number of local rows.
+         * \param row [out] Row vector in compressed row layout.
+         * \param col [out] Column vector in compressed row layout.
+         * \param data [out] Data vector in compressed row layout.
+         * \return PetscErrorCode.
+         */
+        PetscErrorCode getLocalMatRawData(const Mat &localA,
+                PetscInt &localN, std::vector<PetscInt> &row,
+                std::vector<PetscInt64> &col, std::vector<PetscScalar> &data);
+
+
+        /** \brief Destroy the sequential local redistributed matrix.
+         *
+         * \param A [in] Original PETSc Mat.
+         * \param localA [in, out] Local matrix, returning null pointer.
+         * \return PetscErrorCode.
+         */
+        PetscErrorCode destroyLocalA(const Mat &A, Mat &localA);
+
+        /** \brief Check whether the global matrix distribution is contiguous.
+         *
+         * If the global matrix is distributed such that contiguous chunks of rows are
+         * distributed over the individual ranks in ascending rank order, the partition vector
+         * has trivial structure (i.e. [0, ..., 0, 1, ..., 1, ..., R-1, ..., R-1] for R ranks) and
+         * its calculation can be skipped since all information is available to AmgX through
+         * the number of ranks and the partition *offsets* (i.e. how many rows are on each rank).
+         *
+         * \param devIS [in] PETSc IS representing redistributed row indices.
+         * \param isContiguous [out] Whether the global matrix is contiguously distributed.
+         * \param partOffsets [out] If contiguous, holds the partition offsets for all R ranks.
+         * \return PetscErrorCode.
+         */
+        PetscErrorCode checkForContiguousPartitioning(
+            const IS &devIS, PetscBool &isContiguous, std::vector<PetscInt> &partOffsets);
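+
+        // Example (illustrative): with R = 3 ranks owning 4, 2 and 3 contiguous
+        // rows, the explicit partition vector is [0,0,0,0,1,1,2,2,2] and the
+        // equivalent partition offsets are [0, 4, 6, 9].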
+
+        /** \brief Get partition data required by AmgX.
+         *
+         * \param devIS [in] PETSc IS representing redistributed row indices.
+         * \param N [in] Total number of rows in global matrix.
+         * \param partData [out] Partition data, either explicit vector or offsets.
+         * \param usesOffsets [out] If PETSC_TRUE, partitioning is contiguous and partData contains
+         *      partition offsets, see checkForContiguousPartitioning(). Otherwise, contains explicit
+         *      partition vector.
+         * \return PetscErrorCode.
+         */
+        PetscErrorCode getPartData(const IS &devIS,
+                const PetscInt &N, std::vector<PetscInt> &partData, PetscBool &usesOffsets);
+
+
+        /** \brief Function that actually solves the system.
+         *
+         * This function does not check whether the calling process is part of
+         * the AmgX solver, so calling it from a process outside
+         * \ref AmgXSolver::gpuWorld "gpuWorld" leads to undefined behavior.
+         * It also performs no data gathering or scattering.
+         *
+         * \param p [in, out] PETSc Vec for unknowns.
+         * \param b [in] PETSc Vec for RHS.
+         * \return PetscErrorCode.
+         */
+        PetscErrorCode solve_real(Vec &p, Vec &b);
+};
+
+#endif
+
diff --git a/FOAM2CSR.hpp b/inc/FOAM2CSR.H
similarity index 100%
rename from FOAM2CSR.hpp
rename to inc/FOAM2CSR.H
diff --git a/lib/libFOAM2CSR.so b/lib/libFOAM2CSR.so
new file mode 100755
index 0000000000000000000000000000000000000000..c656be86c52ef3ca95eaa2da5fd44058ad09a2ec
Binary files /dev/null and b/lib/libFOAM2CSR.so differ
diff --git a/loadENV.sh b/loadENV.sh
new file mode 100644
index 0000000000000000000000000000000000000000..0ba20e18e90dfb6ad5b01aee11e58c7d3164464f
--- /dev/null
+++ b/loadENV.sh
@@ -0,0 +1,24 @@
+#!/bin/bash
+ 
+# Load the environment needed to run the job
+module purge
+module use /m100_scratch/userinternal/mvalent3/modules/opt/modulefiles/profiles
+module load local_profile/global
+ 
+# Load needed modules
+module load gnu/8.4.0
+module load spectrum_mpi/10.3.1--binary
+module load cuda/10.2
+module load hpc-sdk/2020--binary
+module load openblas/0.3.9--gnu--8.4.0
+module load lapack/3.9.0--gnu--8.4.0
+module load AMGx/2.1.x--spectrum_mpi--10.3.1--binary
+module load petsc-sytstem-openblas-hypre-viennacl-ml-kokkos/3.14.0--spectrum_mpi--10.3.1--binary
+module load spectrum_mpi/10.3.1--binary
+
+
+ 
+export CUBROOT="${HPC_SDK_HOME}/Linux_ppc64le/2020/cuda/11.0/include"
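+
+# Typical usage (assumed): source this file in the job shell before building:
+#   source ./loadENV.sh && make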
+
+
+
diff --git a/obj/AmgXSolver.o b/obj/AmgXSolver.o
new file mode 100644
index 0000000000000000000000000000000000000000..e0be2885222906a06ff32184b0e2037c5866a88d
Binary files /dev/null and b/obj/AmgXSolver.o differ
diff --git a/obj/FOAM2CSR.o b/obj/FOAM2CSR.o
new file mode 100644
index 0000000000000000000000000000000000000000..1b32218a53f19bdc5ff7d13225349f37c934b531
Binary files /dev/null and b/obj/FOAM2CSR.o differ
diff --git a/obj/consolidate.o b/obj/consolidate.o
new file mode 100644
index 0000000000000000000000000000000000000000..f5903af2347b8fafbc93b758fe86989617e2b1e1
Binary files /dev/null and b/obj/consolidate.o differ
diff --git a/obj/init.o b/obj/init.o
new file mode 100644
index 0000000000000000000000000000000000000000..5f1e110602f995a95415a394a40e71fafb65e1de
Binary files /dev/null and b/obj/init.o differ
diff --git a/obj/misc.o b/obj/misc.o
new file mode 100644
index 0000000000000000000000000000000000000000..f32ef13cdd1b650d6df20eca979d95c9ae0a25e2
Binary files /dev/null and b/obj/misc.o differ
diff --git a/obj/setA.o b/obj/setA.o
new file mode 100644
index 0000000000000000000000000000000000000000..d5f6dcf0d0a25de17fe8eb25faea1d8855af1fab
Binary files /dev/null and b/obj/setA.o differ
diff --git a/obj/solve.o b/obj/solve.o
new file mode 100644
index 0000000000000000000000000000000000000000..4761a1be53412ff49bc53dd1534ec2b76dde103d
Binary files /dev/null and b/obj/solve.o differ
diff --git a/src/.FOAM2CSR.cu.swp b/src/.FOAM2CSR.cu.swp
new file mode 100644
index 0000000000000000000000000000000000000000..a0de4277822bfeb80c1defbb320aba7e59822125
Binary files /dev/null and b/src/.FOAM2CSR.cu.swp differ
diff --git a/src/AmgXSolver.cpp b/src/AmgXSolver.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..c437c35390a9838bc7079974628f007570cc7be8
--- /dev/null
+++ b/src/AmgXSolver.cpp
@@ -0,0 +1,19 @@
+/**
+ * \file AmgXSolver.cpp
+ * \brief Definition of member functions of the class AmgXSolver.
+ * \author Pi-Yueh Chuang (pychuang@gwu.edu)
+ * \date 2015-09-01
+ * \copyright Copyright (c) 2015-2019 Pi-Yueh Chuang, Lorena A. Barba.
+ *            This project is released under MIT License.
+ */
+
+
+// AmgXWrapper
+# include "AmgXSolver.H"
+
+
+// initialize AmgXSolver::count to 0
+int AmgXSolver::count = 0;
+
+// initialize AmgXSolver::rsrc to nullptr;
+AMGX_resources_handle AmgXSolver::rsrc = nullptr;
diff --git a/FOAM2CSR.cu b/src/FOAM2CSR.cu
similarity index 93%
rename from FOAM2CSR.cu
rename to src/FOAM2CSR.cu
index 5ed889906d7865267c2098d5c05113857a63ba87..dd7b5a65cd67ed355b04e55843889acb1ceac02b 100644
--- a/FOAM2CSR.cu
+++ b/src/FOAM2CSR.cu
@@ -20,7 +20,7 @@
  * DEALINGS IN THE SOFTWARE.
  */
 
-#include <FOAM2CSR.hpp>
+#include <FOAM2CSR.H>
 #include <cuda.h>
 #include <cub/cub.cuh>
 #include <thrust/scan.h>
@@ -83,27 +83,46 @@ __global__ void applyPermutation(
     // Permute col indices and values
     for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < totalNnz; i += blockDim.x * gridDim.x)
     {
-        int p = perm[i];
-
-        // In the values only case the column indices and row offsets remain fixed so no update
-        if (!valuesOnly)
+        // if ( i<nnz )
         {
+          int p = perm[i];
+
+          // In the values only case the column indices and row offsets remain fixed so no update
+          if (!valuesOnly)
+          {
             colIndices[i] = colIndicesTmp[p];
-        }
+          }
 
-        values[i] = valuesTmp[p];
+          values[i] = valuesTmp[p];
+        }
     }
 }
 
 // Flatten the row indices into the row offsets
 __global__ void createRowOffsets(
     int nnz,
+    int nrows, 
     int *rowIndices,
     int *rowOffsets)
 {
     for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < nnz; i += blockDim.x * gridDim.x)
     {
-        atomicAdd(&rowOffsets[rowIndices[i]], 1);
+      if ( i<nnz )  
+      {
+        int j = rowIndices[i];
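+        // rowOffsets has nrows+1 entries, so any index of nrows+1 or larger
+        // signals corrupted input; it is reported below instead of being
+        // counted into rowOffsets.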
+        if ( j<nrows+1 )
+        {
+          atomicAdd(&rowOffsets[j], 1);
+        }
+        else
+        {
+          printf( " - Loop interno :: %d \n", j  );
+        }
+      }
+      else
+      {
+        printf( " - Loop esterno :: %d \n", i  );
+      }
     }
 }
 
@@ -203,7 +222,7 @@ void FOAM2CSR::convertLDU2CSR(
     // Convert the row indices into offsets
     constexpr int nthreads = 128;
     int nblocks = totalNnz / nthreads + 1;
-    createRowOffsets<<<nblocks, nthreads>>>(totalNnz, rowIndices, rowOffsets);
+    createRowOffsets<<<nblocks, nthreads>>>(totalNnz, nrows, rowIndices, rowOffsets);
     thrust::exclusive_scan(thrust::device, rowOffsets, rowOffsets + nrows + 1, rowOffsets);
     CHECK(cudaFree(rowIndices));
 
diff --git a/src/FOAM2CSR.mirco.cu b/src/FOAM2CSR.mirco.cu
new file mode 100644
index 0000000000000000000000000000000000000000..d0ac8f2bd67720ecc2ab6c4b73f7ed446a0bfb85
--- /dev/null
+++ b/src/FOAM2CSR.mirco.cu
@@ -0,0 +1,515 @@
+/*
+ * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#include "FOAM2CSR.H"
+#include <cuda.h>
+#include <cub/cub.cuh>
+#include <thrust/scan.h>
+#include <thrust/sequence.h>
+#include <thrust/execution_policy.h>
+#include <unistd.h>
+
+#define CHECK(call)                                              \
+    {                                                            \
+        cudaError_t e = call;                                    \
+        if (e != cudaSuccess)                                    \
+        {                                                        \
+            printf("Cuda failure: '%s %d %s'",                   \
+                __FILE__, __LINE__, cudaGetErrorString(e));      \
+        }                                                        \
+    }
+
+__global__ void printPerm(
+  const int nrows,
+  const int nnz,
+  int* rowsIdx,
+  int* colsIdx,
+  int* colsPerm )
+{
+
+  if ( threadIdx.x < nrows )
+  {
+    printf( "rows(%d) = %d\n", threadIdx.x, rowsIdx[threadIdx.x] );
+  }
+
+  if ( threadIdx.x < nnz )
+  {
+    printf( "colsPerm(%d) = %d\n", threadIdx.x, colsIdx[colsPerm[threadIdx.x]] );
+  }
+
+}
+
+__global__ void printData(
+  const int nrows,
+  const int nnz,
+  int* rowsIdx,
+  int* colsIdx,
+  double* values )
+{
+
+  if ( threadIdx.x < nrows )
+  {
+    printf( "rows(%d) = %d\n", threadIdx.x, rowsIdx[threadIdx.x] );
+  }
+
+  if ( threadIdx.x < nnz )
+  {
+    printf( "cols(%d) = %d\n", threadIdx.x, colsIdx[threadIdx.x] );
+  }
+
+  if ( threadIdx.x < nnz )
+  {
+    printf( "values(%d) = %d, %f\n", threadIdx.x, colsIdx[threadIdx.x], (float)values[threadIdx.x] );
+  }
+
+}
+
+// Offset the column indices to transform from local to global
+__global__ void localToGlobalColIndices(
+    const int nnz,
+    const int nrows,
+    const int nInternalFaces,
+    const int diagIndexGlobal,
+    const int lowOffGlobal,
+    const int uppOffGlobal,
+    int *colIndices)
+{
+    // Offset by global offset
+    for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < nnz; i += blockDim.x * gridDim.x)
+    {
+        int offset;
+        if ( i < nnz )
+        {
+          // Depending upon the row, different offsets must be applied
+          if (i < nrows)
+          {
+              offset = diagIndexGlobal;
+          }
+          else if (i < nrows + nInternalFaces)
+          {
+              offset = uppOffGlobal;
+          }
+          else
+          {
+              offset = lowOffGlobal;
+          }
+          colIndices[i] += offset;
+        }
+    }
+}
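+
+// The three offset regions above mirror the concatenation order used by
+// convertLDU2CSR when filling the temporary arrays, i.e.
+// [ diagonal (nrows) | upper (nInternalFaces) | lower (nInternalFaces) ];
+// external entries are untouched because the kernel is launched with
+// nnz = localNnz.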
+
+
+
+// Offset the column indices to transform from local to global
+__global__ void localToGlobalRowIndices(
+    const int nnz,
+    const int nrows,
+    const int nInternalFaces,
+    const int diagIndexGlobal,
+    const int lowOffGlobal,
+    const int uppOffGlobal,
+    int *rowIndices)
+{
+    // Offset by global offset
+    for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < nnz; i += blockDim.x * gridDim.x)
+    {
+        int offset;
+        if ( i < nnz )
+        {
+          // Depending upon the row, different offsets must be applied
+          if (i < nrows)
+          {
+              offset = diagIndexGlobal;
+          }
+          else if (i < nrows + nInternalFaces)
+          {
+              offset = uppOffGlobal;
+          }
+          else
+          {
+              offset = lowOffGlobal;
+          }
+
+          rowIndices[i] += offset - diagIndexGlobal;
+        }
+    }
+}
+
+
+
+// Apply the pre-existing permutation to the values [and columns]
+__global__ void applyPermutation(
+    const int totalNnz,
+    const int *perm,
+    const int *colIndicesTmp,
+    const double *valuesTmp,
+    int *colIndices,
+    double *values,
+    bool valuesOnly)
+{
+    // Permute col indices and values
+    for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < totalNnz; i += blockDim.x * gridDim.x)
+    {
+
+      if ( i < totalNnz )
+      {
+        int p = perm[i];
+
+        // In the values only case the column indices and row offsets remain fixed so no update
+        if ( p< totalNnz )
+        {
+          if (!valuesOnly)
+          {
+              colIndices[i] = colIndicesTmp[p];
+          }
+          if ( p < totalNnz )
+          {
+            values[i] = valuesTmp[p];
+          }
+        }
+      }
+    }
+}
+
+// Flatten the row indices into the row offsets
+// >>>  constexpr int nthreads = 128;
+// >>>  int nblocks = totalNnz / nthreads + 1;
+// >>>  createRowOffsets<<<nblocks, nthreads>>>(totalNnz, nrows+1, rowIndices, rowOffsets);
+__global__ void createRowOffsets(
+    int nnz,
+    int nrows,
+    int *rowIndices,
+    int *rowOffsets)
+{
+
+/*
+    for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < nnz; i += blockDim.x * gridDim.x)
+    {
+        int k = rowIndices[i];
+        if ( k >= nrows ) {
+          printf( "Sforo il numero di linee\n" );
+          printf( "Memory Access :: %d, %d, %d, %d,%d, %d, %d, %d, %d\n", i, k, nnz, nrows, threadIdx.x, blockIdx.x, blockDim.x, gridDim.x, rowIndices[i] );
+        }
+
+        if ( i >= nnz || i == 0 ) {
+          printf( "Memory Access :: %d, %d, %d, %d,%d, %d, %d, %d, %d\n", i, k, nnz, nrows, threadIdx.x, blockIdx.x, blockDim.x, gridDim.x, rowIndices[i] );
+        }
+        atomicAdd(&rowOffsets[k], 1);
+    }
+*/
+
+  for ( int i = threadIdx.x + blockIdx.x * blockDim.x; i<nnz; i += blockDim.x * gridDim.x)
+  {
+    if ( i < nnz )
+    {
+      int k = rowIndices[i];
+      if ( k < nrows+1 ) 
+      {
+        atomicAdd(&rowOffsets[k], 1);
+      }
+    }
+  }
+
+
+}
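+
+// Worked example (illustrative only): for rowIndices = [0,0,1,2,2,2] (nnz = 6)
+// accumulating into a zero-initialized rowOffsets of length 4, the loop above
+// leaves rowOffsets = [2,1,3,0]; the caller's exclusive scan then yields
+// [0,2,3,6], a valid CSR row-offset array whose last entry equals nnz.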
+
+// Updates the values based on the previously determined permutation
+void FOAM2CSR::updateValues(
+    const int nrows,
+    const int nInternalFaces,
+    const int extNnz,
+    const double *diagVal,
+    const double *uppVal,
+    const double *lowVal,
+    const double *extVal)
+{
+
+    // Determine the local non-zeros from the internal faces
+    int localNnz = nrows + 2 * nInternalFaces;
+
+    // Add external non-zeros (communicated halo entries)
+    int totalNnz = localNnz + extNnz;
+
+    // Copy the values in [ diag, upper, lower, (external) ]
+    CHECK(cudaMemcpy(valuesTmp, diagVal, sizeof(double) * nrows, cudaMemcpyDefault));
+    CHECK(cudaMemcpy(valuesTmp + nrows, uppVal, sizeof(double) * nInternalFaces, cudaMemcpyDefault));
+    CHECK(cudaMemcpy(valuesTmp + nrows + nInternalFaces, lowVal, sizeof(double) * nInternalFaces, cudaMemcpyDefault));
+    if (extNnz > 0)
+    {
+        CHECK(cudaMemcpy(valuesTmp + localNnz, extVal, sizeof(double) * extNnz, cudaMemcpyDefault));
+    }
+
+    constexpr int nthreads = 128;
+    int nblocks = totalNnz / nthreads + 1;
+    applyPermutation<<<nblocks, nthreads>>>(totalNnz, ldu2csrPerm, nullptr, valuesTmp, nullptr, values, true);
+    CHECK(cudaStreamSynchronize(0));
+}
+
+// Perform the conversion between an LDU matrix and a CSR matrix, possibly distributed
+void FOAM2CSR::convertLDU2CSR(
+    int nrows,
+    int nInternalFaces,
+    int diagIndexGlobal,
+    int lowOffGlobal,
+    int uppOffGlobal,
+    const int *upperAddr,
+    const int *lowerAddr,
+    const int extNnz,
+    const int *extRow,
+    const int *extCol,
+    const double *diagVals,
+    const double *upperVals,
+    const double *lowerVals,
+    const double *extVals)
+{
+
+
+    // Determine the local non-zeros from the internal faces
+    int localNnz = nrows + 2 * nInternalFaces;
+
+    // Add external non-zeros (communicated halo entries)
+    int totalNnz = localNnz + extNnz;
+
+
+    if ( diagIndexGlobal > 0 ) 
+    {
+      sleep(2);
+      std::cout << " ============================================================== " << std::endl;
+      std::cout << "P1 :: nrows             :: " << nrows           << std::endl;
+      std::cout << "P1 :: nInternalFaces    :: " << nInternalFaces  << std::endl;
+      std::cout << "P1 :: extNnz            :: " << extNnz          << std::endl;
+      std::cout << "P1 :: diag index global :: " << diagIndexGlobal << std::endl;
+      std::cout << "P1 :: low offset global :: " << lowOffGlobal    << std::endl;
+      std::cout << "P1 :: upp offset global :: " << uppOffGlobal    << std::endl;
+      std::cout << "P1 :: upperaddr0        :: " << upperAddr[0]    << std::endl;
+      std::cout << "P1 :: loweraddr0        :: " << lowerAddr[0]    << std::endl;
+      if ( extNnz > 0 ){
+        std::cout << "P1 :: extRow            :: " << extRow[0]       << std::endl;
+        std::cout << "P1 :: extCol            :: " << extCol[0]       << std::endl;
+      }
+        FILE* fid = fopen( "P1_descr.txt", "w" );
+        fprintf( fid, "%d, %d, %d, %d, %d, %d\n", nrows, nInternalFaces, diagIndexGlobal, lowOffGlobal, uppOffGlobal, extNnz );
+        fclose(fid);
+      
+      fid = fopen( "P1_LDU_diag.txt", "w" );
+      for ( int i =0; i<nrows; i++ ) fprintf( fid, "%g\n", diagVals[i] );
+      fclose(fid);
+      fid = fopen( "P1_LDU_sym.txt", "w" );
+      for ( int i =0; i<nInternalFaces; i++ ) fprintf( fid, "%d, %d, %g, %g\n", 
+                     upperAddr[i], lowerAddr[i], upperVals[i], lowerVals[i] );
+      fclose(fid);
+      if ( extNnz > 0 ){
+        fid = fopen( "P1_LDU_ext.txt", "w" );
+        for ( int i =0; i<extNnz; i++ ) fprintf( fid, "%d, %d, %g\n", 
+                     extRow[i], extCol[i], extVals[i] );
+        fclose(fid);
+      }
+      std::cout << " ============================================================== " << std::endl;
+    }
+    else 
+    {
+      std::cout << " ============================================================== " << std::endl;
+      std::cout << "P0 :: nrows             :: " << nrows           << std::endl;
+      std::cout << "P0 :: nInternalFaces    :: " << nInternalFaces  << std::endl;
+      std::cout << "P0 :: extNnz            :: " << extNnz          << std::endl;
+      std::cout << "P0 :: diag index global :: " << diagIndexGlobal << std::endl;
+      std::cout << "P0 :: low offset global :: " << lowOffGlobal    << std::endl;
+      std::cout << "P0 :: upp offset global :: " << uppOffGlobal    << std::endl;
+      std::cout << "P0 :: upperaddr0        :: " << upperAddr[0]    << std::endl;
+      std::cout << "P0 :: loweraddr0        :: " << lowerAddr[0]    << std::endl;
+      if ( extNnz > 0 )
+      {
+        std::cout << "P0 :: extRow            :: " << extRow[0]       << std::endl;
+        std::cout << "P0 :: extCol            :: " << extCol[0]       << std::endl;
+      }
+      FILE* fid = fopen( "P0_descr.txt", "w" );
+      fprintf( fid, "%d, %d, %d, %d, %d, %d\n", nrows, nInternalFaces, diagIndexGlobal, lowOffGlobal, uppOffGlobal, extNnz );
+      fclose(fid);
+      fid = fopen( "P0_LDU_diag.txt", "w" );
+      for ( int i =0; i<nrows; i++ ) fprintf( fid, "%g\n", diagVals[i] );
+      fclose(fid);
+      fid = fopen( "P0_LDU_sym.txt", "w" );
+      for ( int i =0; i<nInternalFaces; i++ ) fprintf( fid, "%d, %d, %g, %g\n", 
+                     upperAddr[i], lowerAddr[i], upperVals[i], lowerVals[i] );
+      fclose(fid);
+      if ( extNnz > 0 ){
+        fid = fopen( "P0_LDU_ext.txt", "w" );
+        for ( int i =0; i<extNnz; i++ ) fprintf( fid, "%d, %d, %g\n", 
+                       extRow[i], extCol[i], extVals[i] );
+        fclose(fid);
+      }
+      sleep(3);
+    }
+
+
+    // Generate unpermuted index list [0, ..., totalNnz-1]
+    int GuardSize=1000000;
+    int *permTmp;
+    CHECK(cudaMalloc(&permTmp, sizeof(int) * (totalNnz+GuardSize)));
+    thrust::sequence(thrust::device, permTmp, permTmp + totalNnz, 0);
+
+    // Fill rowIndicesTmp with [0, ..., n-1], lowerAddr, upperAddr, (extAddr)
+    int *rowIndicesTmp;
+    CHECK(cudaMalloc(&rowIndicesTmp, sizeof(int) * (totalNnz+GuardSize)));
+    CHECK(cudaMemcpy(rowIndicesTmp, permTmp, nrows * sizeof(int), cudaMemcpyDefault));
+    CHECK(cudaMemcpy(rowIndicesTmp + nrows, lowerAddr, nInternalFaces * sizeof(int), cudaMemcpyDefault));
+    CHECK(cudaMemcpy(rowIndicesTmp + nrows + nInternalFaces, upperAddr, nInternalFaces * sizeof(int), cudaMemcpyDefault));
+    if (extNnz > 0)
+    {
+        CHECK(cudaMemcpy(rowIndicesTmp + localNnz, extRow, extNnz * sizeof(int), cudaMemcpyDefault));
+    }
+    constexpr int nthreads = 128;
+    int nblocks = localNnz / nthreads + 1;
+    // localToGlobalRowIndices<<<nblocks, nthreads>>>(localNnz, nrows, nInternalFaces, diagIndexGlobal, lowOffGlobal, uppOffGlobal, rowIndicesTmp);
+
+    // Make space for the row indices and stored permutation
+    int *rowIndices;
+    CHECK(cudaMalloc(&rowIndices, sizeof(int) * (totalNnz+GuardSize)));
+    CHECK(cudaMalloc(&ldu2csrPerm, sizeof(int) * (totalNnz+GuardSize)));
+
+    // Sort the row indices and store results in the permutation
+    void *tempStorage = NULL;
+    size_t tempStorageBytes = 0;
+    cub::DeviceRadixSort::SortPairs(tempStorage, tempStorageBytes, rowIndicesTmp, rowIndices, permTmp, ldu2csrPerm, totalNnz);
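+    // With tempStorage == NULL this first call only computes the required
+    // scratch size into tempStorageBytes (the standard CUB two-phase pattern);
+    // the second call below performs the actual sort.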
+    CHECK(cudaMalloc(&tempStorage, tempStorageBytes));
+
+
+    cub::DeviceRadixSort::SortPairs(tempStorage, tempStorageBytes, rowIndicesTmp, rowIndices, permTmp, ldu2csrPerm, totalNnz);
+    CHECK(cudaFree(permTmp));
+    CHECK(cudaFree(tempStorage));
+
+    // Make space for the row offsets
+    CHECK(cudaMalloc(&rowOffsets, sizeof(int) * (nrows + 1 + GuardSize)));
+    CHECK(cudaMemset(rowOffsets, 0, sizeof(int) * (nrows + 1 + GuardSize)));
+
+    // XXX Taking the non-zero-per-row data from the host could be more
+    // efficient, experiment with this in the future
+    //cudaMemcpy(rowOffsets, nz_per_row, sizeof(int) * nrows, cudaMemcpyDefault);
+    //thrust::exclusive_scan(thrust::device, rowOffsets, rowOffsets + nrows + 1, rowOffsets);
+
+    // Convert the row indices into offsets
+    nblocks = totalNnz / nthreads + 1;
+    createRowOffsets<<<nblocks, nthreads>>>(totalNnz, nrows+1, rowIndices, rowOffsets);
+    thrust::exclusive_scan(thrust::device, rowOffsets, rowOffsets + nrows + 1, rowOffsets);
+    // CHECK(cudaFree(rowIndices));
+    // printf( "%d - %d \n", rowOffsets[nrows], totalNnz );
+
+
+
+    // Fill valuesTmp with diagVals, upperVals, lowerVals, (extVals)
+    CHECK(cudaMalloc(&valuesTmp, (totalNnz+GuardSize) * sizeof(double)));
+    CHECK(cudaMemcpy(valuesTmp, diagVals, nrows * sizeof(double), cudaMemcpyDefault));
+    CHECK(cudaMemcpy(valuesTmp + nrows, upperVals, nInternalFaces * sizeof(double), cudaMemcpyDefault));
+    CHECK(cudaMemcpy(valuesTmp + nrows + nInternalFaces, lowerVals, nInternalFaces * sizeof(double), cudaMemcpyDefault));
+    if (extNnz > 0)
+    {
+        CHECK(cudaMemcpy(valuesTmp + localNnz, extVals, extNnz * sizeof(double), cudaMemcpyDefault));
+    }
+
+    // Concat [0, ..., n-1], upperAddr, lowerAddr (note switched) into column indices
+    int *colIndicesTmp;
+    CHECK(cudaMalloc(&colIndicesTmp, (totalNnz+GuardSize) * sizeof(int)));
+    CHECK(cudaMemcpy(colIndicesTmp, rowIndicesTmp, nrows * sizeof(int), cudaMemcpyDefault));
+    CHECK(cudaMemcpy(colIndicesTmp + nrows, rowIndicesTmp + nrows + nInternalFaces, nInternalFaces * sizeof(int), cudaMemcpyDefault));
+    CHECK(cudaMemcpy(colIndicesTmp + nrows + nInternalFaces, rowIndicesTmp + nrows, nInternalFaces * sizeof(int), cudaMemcpyDefault));
+    if (extNnz > 0)
+    {
+        CHECK(cudaMemcpy(colIndicesTmp + localNnz, extCol, extNnz * sizeof(int), cudaMemcpyDefault));
+    }
+
+    CHECK(cudaFree(rowIndicesTmp));
+
+
+
+    // Construct the global column indices
+    nblocks = localNnz / nthreads + 1;
+    localToGlobalColIndices<<<nblocks, nthreads>>>(localNnz, nrows, nInternalFaces, diagIndexGlobal, lowOffGlobal, uppOffGlobal, colIndicesTmp);
+
+
+    // std::cout << "Print" << std::endl;
+    // nblocks = 1;
+    // printPerm<<<nblocks, nthreads>>>( 9, 33, rowOffsets, colIndicesTmp, ldu2csrPerm );
+    // std::cout << "Exit" << std::endl;
+
+
+    // Allocate space to store the permuted column indices and values
+    CHECK(cudaMalloc(&colIndices, sizeof(int) * (totalNnz+GuardSize)));
+    CHECK(cudaMalloc(&values, sizeof(double) * (totalNnz+GuardSize)));
+
+    // Swap column indices based on the pre-determined permutation
+    nblocks = totalNnz / nthreads + 1;
+    applyPermutation<<<nblocks, nthreads>>>(totalNnz, ldu2csrPerm, colIndicesTmp, valuesTmp, colIndices, values, false);
+    CHECK(cudaFree(colIndicesTmp));
+
+
+    int*    csr_rowOfs = (int*)    malloc( sizeof(int)*   (nrows+1) );
+    int*    csr_rowIdx = (int*)    malloc( sizeof(int)*   totalNnz );
+    int*    csr_colIdx = (int*)    malloc( sizeof(int)*   totalNnz );
+    double* csr_values = (double*) malloc( sizeof(double)*totalNnz );
+
+    CHECK(cudaMemcpy( csr_rowOfs, rowOffsets, (nrows+1) * sizeof(int),    cudaMemcpyDeviceToHost));
+    CHECK(cudaMemcpy( csr_rowIdx, rowIndices, totalNnz * sizeof(int),     cudaMemcpyDeviceToHost));
+    CHECK(cudaMemcpy( csr_colIdx, colIndices, totalNnz  * sizeof(int),    cudaMemcpyDeviceToHost));
+    CHECK(cudaMemcpy( csr_values, values,     totalNnz  * sizeof(double), cudaMemcpyDeviceToHost));
+
+    if ( diagIndexGlobal > 0 ) 
+    {
+      FILE* fid = fopen( "P1_CSR_rowOffs.txt", "w" );
+      for ( int i =0; i<nrows+1; i++ ) fprintf( fid, "%d\n", csr_rowOfs[i] );
+      fclose(fid);
+      fid = fopen( "P1_CSR_col.txt", "w" );
+      for ( int i =0; i<totalNnz; i++ ) fprintf( fid, "%d, %d, %g\n", csr_rowIdx[i], csr_colIdx[i], csr_values[i] );
+      fclose(fid);
+   }
+   else
+   {
+     FILE* fid = fopen( "P0_CSR_rowOffs.txt", "w" );
+     for ( int i =0; i<nrows+1; i++ ) fprintf( fid, "%d\n", csr_rowOfs[i] );
+     fclose(fid);
+     fid = fopen( "P0_CSR_col.txt", "w" );
+     for ( int i =0; i<totalNnz; i++ ) fprintf( fid, "%d, %d, %g\n", csr_rowIdx[i], csr_colIdx[i], csr_values[i] );
+     fclose(fid);
+   }
+
+
+}
+
+// XXX Should implement an early abandonment of the
+// unnecessary data for capacity optimisation
+void FOAM2CSR::discardStructure()
+{
+}
+
+// Deallocate remaining storage
+void FOAM2CSR::finalise()
+{
+    if(rowOffsets != nullptr)
+        CHECK(cudaFree(rowOffsets));
+    if(colIndices != nullptr)
+        CHECK(cudaFree(colIndices));
+    if(values != nullptr)
+        CHECK(cudaFree(values));
+    if(valuesTmp != nullptr)
+        CHECK(cudaFree(valuesTmp));
+    if(ldu2csrPerm != nullptr)
+        CHECK(cudaFree(ldu2csrPerm));
+}
diff --git a/src/consolidate.cu b/src/consolidate.cu
new file mode 100644
index 0000000000000000000000000000000000000000..853a965d02329d5e9a3009bdfa686356002e23dc
--- /dev/null
+++ b/src/consolidate.cu
@@ -0,0 +1,428 @@
+/*
+ * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ * \file consolidate.cu
+ * \brief Definition of member functions related to matrix consolidation.
+ * \author Matt Martineau (mmartineau@nvidia.com)
+ * \date 2020-07-31
+ */
+
+#include <AmgXSolver.H>
+
+#include <numeric>
+
+/*
+    Changes local row offsets to describe the consolidated row space on
+    the root rank.
+*/
+__global__ void fixConsolidatedRowOffsets(int nLocalRows, int offset, int* rowOffsets)
+{
+    for(int i = threadIdx.x + blockIdx.x*blockDim.x; i < nLocalRows; i += blockDim.x*gridDim.x)
+    {
+        rowOffsets[i] += offset;
+    }
+}
+
+// A set of handles to the device data storing a consolidated CSR matrix
+struct ConsolidationHandles
+{
+    cudaIpcMemHandle_t rhsConsHandle;
+    cudaIpcMemHandle_t solConsHandle;
+    cudaIpcMemHandle_t rowOffsetsConsHandle;
+    cudaIpcMemHandle_t colIndicesConsHandle;
+    cudaIpcMemHandle_t valuesConsHandle;
+};
+
+/* \implements AmgXSolver::initializeConsolidation */
+PetscErrorCode AmgXSolver::initializeConsolidation(
+    const PetscInt nLocalRows,
+    const PetscInt nLocalNz,
+    const PetscScalar* values)
+{
+    PetscFunctionBeginUser;
+
+    // Check if multiple ranks are associated with a device
+    if (devWorldSize == 1)
+    {
+        consolidationStatus = ConsolidationStatus::None;
+        PetscFunctionReturn(0);
+    }
+
+    nRowsInDevWorld.resize(devWorldSize);
+    nnzInDevWorld.resize(devWorldSize);
+    rowDispls.resize(devWorldSize+1, 0);
+    nzDispls.resize(devWorldSize+1, 0);
+
+    // All-gather the number of local rows on each rank
+    MPI_Request req[2];
+    int ierr = MPI_Iallgather(&nLocalRows, 1, MPI_INT, nRowsInDevWorld.data(), 1, MPI_INT, devWorld, &req[0]); CHK;
+
+    // All-gather the number of non-zeros on each rank
+    ierr = MPI_Iallgather(&nLocalNz, 1, MPI_INT, nnzInDevWorld.data(), 1, MPI_INT, devWorld, &req[1]); CHK;
+    MPI_Waitall(2, req, MPI_STATUSES_IGNORE);
+
+    // Calculate the consolidated number of rows and non-zeros, and the row and non-zero displacements
+    nConsNz = std::accumulate(nnzInDevWorld.begin(), nnzInDevWorld.end(), 0);
+    nConsRows = std::accumulate(nRowsInDevWorld.begin(), nRowsInDevWorld.end(), 0);
+    std::partial_sum(nRowsInDevWorld.begin(), nRowsInDevWorld.end(), rowDispls.begin()+1);
+    std::partial_sum(nnzInDevWorld.begin(), nnzInDevWorld.end(), nzDispls.begin()+1);
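+
+    // Example (illustrative): two ranks sharing a GPU with 3 and 4 local rows
+    // give nRowsInDevWorld = {3, 4}, nConsRows = 7 and rowDispls = {0, 3, 7}.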
+
+    // Consolidate the CSR matrix data from multiple ranks sharing a single GPU
+    // onto a root rank, allowing multiple ranks per GPU. This allows
+    // overdecomposing the problem when there are more CPU cores than GPUs,
+    // without the inefficiencies of performing the linear solve on multiple
+    // separate domains.
+    // If the data is a device pointer, use IPC handles to perform the intra-GPU
+    // copies between the allocations of the different processes operating on
+    // the same GPU. Opening the handles once here, as an initialization step,
+    // avoids repeated calls to cudaIpcOpenMemHandle, which can be expensive.
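+    // Note (assumed behavior): for a plain host pointer,
+    // cudaPointerGetAttributes returns cudaErrorInvalidValue on older CUDA
+    // releases but cudaSuccess with a non-device type on CUDA 11+; both
+    // cases fall through to the host consolidation branch below.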
+    cudaPointerAttributes att;
+    cudaError_t err = cudaPointerGetAttributes(&att, values);
+    if (err != cudaErrorInvalidValue && att.type == cudaMemoryTypeDevice)
+    {
+        ConsolidationHandles handles;
+        // The data is already on the GPU so consolidate there
+        if (gpuProc == 0)
+        {
+            // We are consolidating data that already exists on the GPU
+            CHECK(cudaMalloc((void**)&rhsCons, sizeof(PetscScalar) * nConsRows));
+            CHECK(cudaMalloc((void**)&pCons, sizeof(PetscScalar) * nConsRows));
+            CHECK(cudaMalloc((void**)&rowOffsetsCons, sizeof(PetscInt) * (nConsRows+1)));
+            CHECK(cudaMalloc((void**)&colIndicesGlobalCons, sizeof(PetscInt) * nConsNz));
+            CHECK(cudaMalloc((void**)&valuesCons, sizeof(PetscScalar) * nConsNz));
+
+            CHECK(cudaIpcGetMemHandle(&handles.rhsConsHandle, rhsCons));
+            CHECK(cudaIpcGetMemHandle(&handles.solConsHandle, pCons));
+            CHECK(cudaIpcGetMemHandle(&handles.rowOffsetsConsHandle, rowOffsetsCons));
+            CHECK(cudaIpcGetMemHandle(&handles.colIndicesConsHandle, colIndicesGlobalCons));
+            CHECK(cudaIpcGetMemHandle(&handles.valuesConsHandle, valuesCons));
+        }
+
+        MPI_Bcast(&handles, sizeof(ConsolidationHandles), MPI_BYTE, 0, devWorld);
+
+        if(gpuProc == MPI_UNDEFINED)
+        {
+            CHECK(cudaIpcOpenMemHandle((void**)&rhsCons, handles.rhsConsHandle, cudaIpcMemLazyEnablePeerAccess));
+            CHECK(cudaIpcOpenMemHandle((void**)&pCons, handles.solConsHandle, cudaIpcMemLazyEnablePeerAccess));
+            CHECK(cudaIpcOpenMemHandle((void**)&rowOffsetsCons, handles.rowOffsetsConsHandle, cudaIpcMemLazyEnablePeerAccess));
+            CHECK(cudaIpcOpenMemHandle((void**)&colIndicesGlobalCons, handles.colIndicesConsHandle, cudaIpcMemLazyEnablePeerAccess));
+            CHECK(cudaIpcOpenMemHandle((void**)&valuesCons, handles.valuesConsHandle, cudaIpcMemLazyEnablePeerAccess));
+        }
+
+        consolidationStatus = ConsolidationStatus::Device;
+    }
+    else
+    {
+        if (gpuProc == 0)
+        {
+            // The data is already on the CPU so consolidate there
+            rowOffsetsCons = new PetscInt[nConsRows+1];
+            colIndicesGlobalCons = new PetscInt[nConsNz];
+            valuesCons = new PetscScalar[nConsNz];
+            rhsCons = new PetscScalar[nConsRows];
+            pCons = new PetscScalar[nConsRows];
+        }
+
+        consolidationStatus = ConsolidationStatus::Host;
+    }
+
+    PetscFunctionReturn(0);
+}
+
+/* \implements AmgXSolver::consolidateMatrix */
+PetscErrorCode AmgXSolver::consolidateMatrix(
+    const PetscInt nLocalRows,
+    const PetscInt nLocalNz,
+    const PetscInt* rowOffsets,
+    const PetscInt* colIndicesGlobal,
+    const PetscScalar* values)
+{
+    PetscFunctionBeginUser;
+
+    // Consolidation has been used previously, so the structures must be
+    // deallocated before they are rebuilt
+    if (consolidationStatus != ConsolidationStatus::Uninitialized)
+    {
+        // XXX Would the maintainers be happy to include a warning message here
+        // making it clear that updateA should be preferred by developers when
+        // the sparsity pattern does not change? That would avoid many costs,
+        // including re-consolidation costs.
+
+        finalizeConsolidation();
+    }
+
+    // Allocate space for the structures required to consolidate
+    initializeConsolidation(nLocalRows, nLocalNz, values);
+
+    switch(consolidationStatus)
+    {
+
+    case ConsolidationStatus::None:
+    {
+        // Consolidation is not required
+        PetscFunctionReturn(0);
+    }
+    case ConsolidationStatus::Uninitialized:
+    {
+        SETERRQ(MPI_COMM_WORLD, PETSC_ERR_SUP_SYS,
+                "Attempting to consolidate before consolidation is initialized.\n");
+        break;
+    }
+    case ConsolidationStatus::Device:
+    {
+        // Copy the data to the consolidation buffer
+        CHECK(cudaMemcpy(&rowOffsetsCons[rowDispls[myDevWorldRank]], rowOffsets, sizeof(PetscInt) * nLocalRows, cudaMemcpyDefault));
+        CHECK(cudaMemcpy(&colIndicesGlobalCons[nzDispls[myDevWorldRank]], colIndicesGlobal, sizeof(PetscInt) * nLocalNz, cudaMemcpyDefault));
+        CHECK(cudaMemcpy(&valuesCons[nzDispls[myDevWorldRank]], values, sizeof(PetscScalar) * nLocalNz, cudaMemcpyDefault));
+
+        // The device-to-device cudaMemcpy calls above do not block the host,
+        // so synchronize with the device to ensure the copies are complete.
+        // Barrier on all devWorld ranks to ensure the full arrays are
+        // populated before the root process uses the data.
+        CHECK(cudaDeviceSynchronize());
+        int ierr = MPI_Barrier(devWorld); CHK;
+
+        if (gpuProc == 0)
+        {
+            // Adjust merged row offsets so that they are correct for the consolidated matrix
+            for (int i = 1; i < devWorldSize; ++i)
+            {
+                int nthreads = 128;
+                int nblocks = nRowsInDevWorld[i] / nthreads + 1;
+                fixConsolidatedRowOffsets<<<nblocks, nthreads>>>(nRowsInDevWorld[i], nzDispls[i], &rowOffsetsCons[rowDispls[i]]);
+            }
+
+            // Manually add the last entry of the rowOffsets list, which is the
+            // number of non-zeros in the CSR matrix
+            CHECK(cudaMemcpy(&rowOffsetsCons[nConsRows], &nConsNz, sizeof(int), cudaMemcpyDefault));
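+            // Worked example (illustrative): continuing the example above,
+            // if rank 0's two rows hold 2 non-zeros each its copied offsets
+            // are {0, 2}; rank 1's local offsets {0, 3} become {4, 7} after
+            // adding nzDispls[1] = 4, and the final entry is set to
+            // nConsNz = 10, giving consolidated offsets {0, 2, 4, 7, 10}.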
+        }
+        else
+        {
+            // Non-root ranks close their IPC handles to the structure buffers,
+            // which are no longer needed once the matrix is consolidated
+            CHECK(cudaIpcCloseMemHandle(rowOffsetsCons));
+            CHECK(cudaIpcCloseMemHandle(colIndicesGlobalCons));
+        }
+
+        CHECK(cudaDeviceSynchronize());
+        break;
+    }
+    case ConsolidationStatus::Host:
+    {
+        // Gather the matrix data to the root rank for consolidation
+        MPI_Request req[3];
+        int ierr = MPI_Igatherv(rowOffsets, nLocalRows, MPI_INT, rowOffsetsCons, nRowsInDevWorld.data(), rowDispls.data(), MPI_INT, 0, devWorld, &req[0]); CHK;
+        ierr = MPI_Igatherv(colIndicesGlobal, nLocalNz, MPI_INT, colIndicesGlobalCons, nnzInDevWorld.data(), nzDispls.data(), MPI_INT, 0, devWorld, &req[1]); CHK;
+        ierr = MPI_Igatherv(values, nLocalNz, MPI_DOUBLE, valuesCons, nnzInDevWorld.data(), nzDispls.data(), MPI_DOUBLE, 0, devWorld, &req[2]); CHK;
+        MPI_Waitall(3, req, MPI_STATUSES_IGNORE);
+
+        if (gpuProc == 0)
+        {
+            // Adjust merged row offsets so that they are correct for the consolidated matrix
+            for (int j = 1; j < devWorldSize; ++j)
+            {
+                for(int i = 0; i < nRowsInDevWorld[j]; ++i)
+                {
+                    rowOffsetsCons[rowDispls[j] + i] += nzDispls[j];
+                }
+            }
+
+            // Manually add the last entry of the rowOffsets list, which is the
+            // number of non-zeros in the CSR matrix
+            rowOffsetsCons[nConsRows] = nConsNz;
+        }
+
+        break;
+    }
+    default:
+    {
+        SETERRQ(MPI_COMM_WORLD, PETSC_ERR_SUP_SYS,
+                "Incorrect consolidation status set.\n");
+        break;
+    }
+
+    }
+
+    PetscFunctionReturn(0);
+}
+
+/* \implements AmgXSolver::reconsolidateValues */
+PetscErrorCode AmgXSolver::reconsolidateValues(
+    const PetscInt nLocalNz,
+    const PetscScalar* values)
+{
+    PetscFunctionBeginUser;
+
+    switch (consolidationStatus)
+    {
+
+    case ConsolidationStatus::None:
+    {
+        // Consolidation is not required
+        PetscFunctionReturn(0);
+    }
+    case ConsolidationStatus::Uninitialized:
+    {
+        SETERRQ(MPI_COMM_WORLD, PETSC_ERR_SUP_SYS,
+                "Attempting to re-consolidate before consolidation is initialized.\n");
+        break;
+    }
+    case ConsolidationStatus::Device:
+    {
+        CHECK(cudaDeviceSynchronize());
+        int ierr = MPI_Barrier(devWorld); CHK;
+
+        // The data is already on the GPU so consolidate there
+        CHECK(cudaMemcpy(&valuesCons[nzDispls[myDevWorldRank]], values, sizeof(PetscScalar) * nLocalNz, cudaMemcpyDefault));
+
+        CHECK(cudaDeviceSynchronize());
+        ierr = MPI_Barrier(devWorld); CHK;
+
+        break;
+    }
+    case ConsolidationStatus::Host:
+    {
+        // Gather the matrix values to the root rank for consolidation
+        int ierr = MPI_Gatherv(values, nLocalNz, MPI_DOUBLE, valuesCons, nnzInDevWorld.data(), nzDispls.data(), MPI_DOUBLE, 0, devWorld); CHK;
+        break;
+    }
+    default:
+    {
+        SETERRQ(MPI_COMM_WORLD, PETSC_ERR_SUP_SYS,
+                "Incorrect consolidation status set.\n");
+        break;
+    }
+
+    }
+
+    PetscFunctionReturn(0);
+}
+
+/* \implements AmgXSolver::freeConsStructure */
+PetscErrorCode AmgXSolver::freeConsStructure()
+{
+    PetscFunctionBeginUser;
+
+    // Only the root rank maintains a consolidated structure
+    if(gpuProc == MPI_UNDEFINED)
+    {
+        PetscFunctionReturn(0);
+    }
+
+    switch(consolidationStatus)
+    {
+
+    case ConsolidationStatus::None:
+    {
+        // Consolidation is not required
+        PetscFunctionReturn(0);
+    }
+    case ConsolidationStatus::Uninitialized:
+    {
+        SETERRQ(MPI_COMM_WORLD, PETSC_ERR_SUP_SYS,
+                 "Attempting to free consolidation structures before consolidation is initialized.\n");
+        break;
+    }
+    case ConsolidationStatus::Device:
+    {
+        // Free the device allocated consolidated CSR matrix structure
+        CHECK(cudaFree(rowOffsetsCons));
+        CHECK(cudaFree(colIndicesGlobalCons));
+        break;
+    }
+    case ConsolidationStatus::Host:
+    {
+        // Free the host allocated consolidated CSR matrix structure
+        delete[] rowOffsetsCons;
+        delete[] colIndicesGlobalCons;
+        break;
+    }
+    default:
+    {
+        SETERRQ(MPI_COMM_WORLD, PETSC_ERR_SUP_SYS,
+                "Incorrect consolidation status set.\n");
+        break;
+    }
+
+    }
+
+    PetscFunctionReturn(0);
+}
+
+/* \implements AmgXSolver::finalizeConsolidation */
+PetscErrorCode AmgXSolver::finalizeConsolidation()
+{
+    PetscFunctionBeginUser;
+
+    switch(consolidationStatus)
+    {
+
+    case ConsolidationStatus::None:
+    case ConsolidationStatus::Uninitialized:
+    {
+        // Consolidation is not required or uninitialized
+        PetscFunctionReturn(0);
+    }
+    case ConsolidationStatus::Device:
+    {
+        if (gpuProc == 0)
+        {
+            // Deallocate the CSR matrix values, solution and RHS
+            CHECK(cudaFree(valuesCons));
+            CHECK(cudaFree(pCons));
+            CHECK(cudaFree(rhsCons));
+        }
+        else
+        {
+            // Close the remaining IPC memory handles
+            CHECK(cudaIpcCloseMemHandle(valuesCons));
+            CHECK(cudaIpcCloseMemHandle(pCons));
+            CHECK(cudaIpcCloseMemHandle(rhsCons));
+        }
+        break;
+    }
+    case ConsolidationStatus::Host:
+    {
+        if(gpuProc == 0)
+        {
+            delete[] valuesCons;
+            delete[] pCons;
+            delete[] rhsCons;
+        }
+        break;
+    }
+    default:
+    {
+        SETERRQ(MPI_COMM_WORLD, PETSC_ERR_SUP_SYS,
+                "Incorrect consolidation status set.\n");
+        break;
+    }
+
+    }
+
+    // Free the local GPU partitioning structures
+    if(consolidationStatus == ConsolidationStatus::Device || consolidationStatus == ConsolidationStatus::Host)
+    {
+        nRowsInDevWorld.clear();
+        nnzInDevWorld.clear();
+        rowDispls.clear();
+        nzDispls.clear();
+    }
+
+    consolidationStatus = ConsolidationStatus::Uninitialized;
+
+    PetscFunctionReturn(0);
+}
diff --git a/src/init.cpp b/src/init.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..d6afcd7578c7db434e907a9a2343c2d47b5f1352
--- /dev/null
+++ b/src/init.cpp
@@ -0,0 +1,336 @@
+/**
+ * \file init.cpp
+ * \brief Definition of some member functions of the class AmgXSolver.
+ * \author Pi-Yueh Chuang (pychuang@gwu.edu)
+ * \date 2016-01-08
+ * \copyright Copyright (c) 2015-2019 Pi-Yueh Chuang, Lorena A. Barba.
+ *            This project is released under MIT License.
+ */
+
+
+// CUDA
+# include <cuda_runtime.h>
+
+// AmgXSolver
+# include "AmgXSolver.H"
+# include <iostream>
+
+/* \implements AmgXSolver::AmgXSolver */
+AmgXSolver::AmgXSolver(const MPI_Comm &comm,
+        const std::string &modeStr, const std::string &cfgFile)
+{
+    initialize(comm, modeStr, cfgFile);
+}
+
+
+/* \implements AmgXSolver::~AmgXSolver */
+AmgXSolver::~AmgXSolver()
+{
+    if (isInitialized) finalize();
+}
+
+
+/* \implements AmgXSolver::initialize */
+PetscErrorCode AmgXSolver::initialize(const MPI_Comm &comm,
+        const std::string &modeStr, const std::string &cfgFile)
+{
+    PetscErrorCode      ierr;
+
+    PetscFunctionBeginUser;
+
+    // raise an error if this instance has already been initialized
+    if (isInitialized) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE,
+            "This AmgXSolver instance has been initialized on this process.");
+
+    // increase the number of AmgXSolver instances
+    count += 1;
+
+    // get the name of this node
+    int     len;
+    char    name[MPI_MAX_PROCESSOR_NAME];
+    ierr = MPI_Get_processor_name(name, &len); CHK;
+    nodeName = name;
+
+    // get the mode of AmgX solver
+    ierr = setMode(modeStr); CHK;
+
+    // initialize communicators and corresponding information
+    ierr = initMPIcomms(comm); CHK;
+
+    // only processes in gpuWorld are required to initialize AmgX
+    if (gpuProc == 0)
+    {
+        ierr = initAmgX(cfgFile); CHK;
+    }
+
+    // mark this instance as initialized
+    isInitialized = true;
+
+    consolidationStatus = ConsolidationStatus::Uninitialized;
+
+    PetscFunctionReturn(0);
+}
+
+
+/* \implements AmgXSolver::initMPIcomms */
+PetscErrorCode AmgXSolver::initMPIcomms(const MPI_Comm &comm)
+{
+    PetscErrorCode      ierr;
+
+    PetscFunctionBeginUser;
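+
+    // Communicator hierarchy built below (summary): globalCpuWorld spans all
+    // ranks; localCpuWorld groups the ranks on one node; devWorld groups the
+    // ranks sharing one CUDA device; and gpuWorld holds the single leading
+    // rank per device that actually drives AmgX.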
+
+    // duplicate the global communicator
+    ierr = MPI_Comm_dup(comm, &globalCpuWorld); CHK;
+    ierr = MPI_Comm_set_name(globalCpuWorld, "globalCpuWorld"); CHK;
+
+    // get size and rank for global communicator
+    ierr = MPI_Comm_size(globalCpuWorld, &globalSize); CHK;
+    ierr = MPI_Comm_rank(globalCpuWorld, &myGlobalRank); CHK;
+
+
+    // Get the communicator for processors on the same node (local world)
+    ierr = MPI_Comm_split_type(globalCpuWorld,
+            MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &localCpuWorld); CHK;
+    ierr = MPI_Comm_set_name(localCpuWorld, "localCpuWorld"); CHK;
+
+    // get size and rank for local communicator
+    ierr = MPI_Comm_size(localCpuWorld, &localSize); CHK;
+    ierr = MPI_Comm_rank(localCpuWorld, &myLocalRank); CHK;
+
+
+    // set up the variable nDevs
+    ierr = setDeviceCount(); CHK;
+
+
+    // set up corresponding ID of the device used by each local process
+    ierr = setDeviceIDs(); CHK;
+    ierr = MPI_Barrier(globalCpuWorld); CHK;
+
+
+    // split the global world into a world involved in AmgX and a null world
+    ierr = MPI_Comm_split(globalCpuWorld, gpuProc, 0, &gpuWorld); CHK;
+
+    // get size and rank for the communicator corresponding to gpuWorld
+    if (gpuWorld != MPI_COMM_NULL)
+    {
+        ierr = MPI_Comm_set_name(gpuWorld, "gpuWorld"); CHK;
+        ierr = MPI_Comm_size(gpuWorld, &gpuWorldSize); CHK;
+        ierr = MPI_Comm_rank(gpuWorld, &myGpuWorldRank); CHK;
+    }
+    else // for the ranks that cannot communicate with GPU devices
+    {
+        gpuWorldSize = MPI_UNDEFINED;
+        myGpuWorldRank = MPI_UNDEFINED;
+    }
+
+    // split local world into worlds corresponding to each CUDA device
+    ierr = MPI_Comm_split(localCpuWorld, devID, 0, &devWorld); CHK;
+    ierr = MPI_Comm_set_name(devWorld, "devWorld"); CHK;
+
+    // get size and rank for the communicator corresponding to devWorld
+    ierr = MPI_Comm_size(devWorld, &devWorldSize); CHK;
+    ierr = MPI_Comm_rank(devWorld, &myDevWorldRank); CHK;
+
+    ierr = MPI_Barrier(globalCpuWorld); CHK;
+
+    PetscFunctionReturn(0);
+}
+
+
+/* \implements AmgXSolver::setDeviceCount */
+PetscErrorCode AmgXSolver::setDeviceCount()
+{
+    PetscFunctionBeginUser;
+
+    // get the number of devices that AmgX solvers can use
+    switch (mode)
+    {
+        case AMGX_mode_dDDI: // for GPU cases, nDevs is the # of local GPUs
+        case AMGX_mode_dDFI: // for GPU cases, nDevs is the # of local GPUs
+        case AMGX_mode_dFFI: // for GPU cases, nDevs is the # of local GPUs
+            // get the number of total cuda devices
+            CHECK(cudaGetDeviceCount(&nDevs));
+            std::cout << "Number of GPU devices  :: " << nDevs << std::endl;
+
+            // Check whether there is at least one CUDA device on this node
+            if (nDevs == 0) SETERRQ1(MPI_COMM_WORLD, PETSC_ERR_SUP_SYS,
+                    "There is no CUDA device on the node %s!\n", nodeName.c_str());
+            break;
+        case AMGX_mode_hDDI: // for CPU cases, nDevs is the # of local processes
+        case AMGX_mode_hDFI: // for CPU cases, nDevs is the # of local processes
+        case AMGX_mode_hFFI: // for CPU cases, nDevs is the # of local processes
+        default:
+            nDevs = localSize;
+            break;
+    }
+
+    PetscFunctionReturn(0);
+}
+
+
+/* \implements AmgXSolver::setDeviceIDs */
+PetscErrorCode AmgXSolver::setDeviceIDs()
+{
+    PetscFunctionBeginUser;
+
+    PetscErrorCode      ierr;
+
+    // set the ID of device that each local process will use
+    if (nDevs == localSize) // the numbers of devices and local processes are the same
+    {
+        devID = myLocalRank;
+        gpuProc = 0;
+    }
+    else if (nDevs > localSize) // there are more devices than processes
+    {
+        ierr = PetscPrintf(localCpuWorld, "CUDA devices on the node %s "
+                "are more than the MPI processes launched. Only %d CUDA "
+                "devices will be used.\n", nodeName.c_str(), localSize); CHK;
+
+        devID = myLocalRank;
+        gpuProc = 0;
+    }
+    else // there are more processes than devices
+    {
+        int     nBasic = localSize / nDevs,
+                nRemain = localSize % nDevs;
+
+        if (myLocalRank < (nBasic+1)*nRemain)
+        {
+            devID = myLocalRank / (nBasic + 1);
+            if (myLocalRank % (nBasic + 1) == 0)  gpuProc = 0;
+        }
+        else
+        {
+            devID = (myLocalRank - (nBasic+1)*nRemain) / nBasic + nRemain;
+            if ((myLocalRank - (nBasic+1)*nRemain) % nBasic == 0) gpuProc = 0;
+        }
+    }
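+
+    // Worked example (illustrative): with localSize = 7 and nDevs = 2,
+    // nBasic = 3 and nRemain = 1, so local ranks 0-3 share device 0 (rank 0
+    // becomes the gpuProc) and ranks 4-6 share device 1 (rank 4 becomes the
+    // gpuProc).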
+
+    // Set the device for each rank
+    CHECK(cudaSetDevice(devID));
+
+    PetscFunctionReturn(0);
+}
+
+
+/* \implements AmgXSolver::initAmgX */
+PetscErrorCode AmgXSolver::initAmgX(const std::string &cfgFile)
+{
+    PetscFunctionBeginUser;
+
+    // only the first instance (AmgX solver) is in charge of initializing AmgX
+    if (count == 1)
+    {
+        // initialize AmgX
+        AMGX_SAFE_CALL(AMGX_initialize());
+
+        // initialize AmgX plugins
+        AMGX_SAFE_CALL(AMGX_initialize_plugins());
+
+        // only the master process can output something on the screen
+        AMGX_SAFE_CALL(AMGX_register_print_callback(
+                    [](const char *msg, int length)->void
+                    {PetscPrintf(PETSC_COMM_WORLD, "%s", msg);}));
+
+        // let AmgX handle the errors returned
+        AMGX_SAFE_CALL(AMGX_install_signal_handler());
+    }
+
+    // create an AmgX configuration object
+    AMGX_SAFE_CALL(AMGX_config_create_from_file(&cfg, cfgFile.c_str()));
+
+    // let AmgX handle returned error codes internally
+    AMGX_SAFE_CALL(AMGX_config_add_parameters(&cfg, "exception_handling=1"));
+
+    // create an AmgX resource object, only the first instance is in charge
+    if (count == 1) AMGX_resources_create(&rsrc, cfg, &gpuWorld, 1, &devID);
+
+    // create AmgX vector object for unknowns and RHS
+    AMGX_vector_create(&AmgXP, rsrc, mode);
+    AMGX_vector_create(&AmgXRHS, rsrc, mode);
+
+    // create AmgX matrix object for unknowns and RHS
+    AMGX_matrix_create(&AmgXA, rsrc, mode);
+
+    // create an AmgX solver object
+    AMGX_solver_create(&solver, rsrc, mode, cfg);
+
+    // obtain the default number of rings based on current configuration
+    AMGX_config_get_default_number_of_rings(cfg, &ring);
+
+    PetscFunctionReturn(0);
+}
+
+
+/* \implements AmgXSolver::finalize */
+PetscErrorCode AmgXSolver::finalize()
+{
+    PetscErrorCode      ierr;
+
+    PetscFunctionBeginUser;
+
+    // skip if this instance has not been initialized
+    if (! isInitialized)
+    {
+        ierr = PetscPrintf(PETSC_COMM_WORLD,
+                "This AmgXWrapper has not been initialized. "
+                "Please initialize it before finalization.\n"); CHK;
+
+        PetscFunctionReturn(0);
+    }
+
+    // only processes using GPU are required to destroy AmgX content
+    if (gpuProc == 0)
+    {
+        // destroy solver instance
+        AMGX_solver_destroy(solver);
+
+        // destroy matrix instance
+        AMGX_matrix_destroy(AmgXA);
+
+        // destroy RHS and unknown vectors
+        AMGX_vector_destroy(AmgXP);
+        AMGX_vector_destroy(AmgXRHS);
+
+        // only the last instance needs to destroy the resource and finalize AmgX
+        if (count == 1)
+        {
+            AMGX_resources_destroy(rsrc);
+            AMGX_SAFE_CALL(AMGX_config_destroy(cfg));
+
+            AMGX_SAFE_CALL(AMGX_finalize_plugins());
+            AMGX_SAFE_CALL(AMGX_finalize());
+        }
+        else
+        {
+            AMGX_config_destroy(cfg);
+        }
+
+        // destroy gpuWorld
+        ierr = MPI_Comm_free(&gpuWorld); CHK;
+    }
+
+    finalizeConsolidation();
+
+    // destroy PETSc objects
+    ierr = VecScatterDestroy(&scatterLhs); CHK;
+    ierr = VecScatterDestroy(&scatterRhs); CHK;
+    ierr = VecDestroy(&redistLhs); CHK;
+    ierr = VecDestroy(&redistRhs); CHK;
+
+    // reset necessary variables in case users want to reuse
+    // this instance for a new solver
+    gpuProc = MPI_UNDEFINED;
+    ierr = MPI_Comm_free(&globalCpuWorld); CHK;
+    ierr = MPI_Comm_free(&localCpuWorld); CHK;
+    ierr = MPI_Comm_free(&devWorld); CHK;
+
+    // decrease the number of instances
+    count -= 1;
+
+    // change status
+    isInitialized = false;
+
+    PetscFunctionReturn(0);
+}
diff --git a/src/misc.cpp b/src/misc.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..86a7fa1157914abfbff9d0f25eb8950f6421b228
--- /dev/null
+++ b/src/misc.cpp
@@ -0,0 +1,62 @@
+/**
+ * \file misc.cpp
+ * \brief Definition of some member functions of the class AmgXSolver.
+ * \author Pi-Yueh Chuang (pychuang@gwu.edu)
+ * \date 2016-01-08
+ * \copyright Copyright (c) 2015-2019 Pi-Yueh Chuang, Lorena A. Barba.
+ *            This project is released under MIT License.
+ */
+
+
+// AmgXWrapper
+# include "AmgXSolver.H"
+
+
+/* \implements AmgXSolver::setMode */
+PetscErrorCode AmgXSolver::setMode(const std::string &modeStr)
+{
+    PetscFunctionBeginUser;
+
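+    // AmgX mode strings encode, in order: where the solve runs (d = device,
+    // h = host), the matrix precision (D = double, F = float), the vector
+    // precision, and the index type (I = 32-bit int); e.g. dDDI solves on
+    // the device with double-precision matrix and vector values.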
+    if (modeStr == "dDDI")
+        mode = AMGX_mode_dDDI;
+    else if (modeStr == "dDFI")
+        mode = AMGX_mode_dDFI;
+    else if (modeStr == "dFFI")
+        mode = AMGX_mode_dFFI;
+    else if (modeStr[0] == 'h')
+        SETERRQ1(MPI_COMM_WORLD, PETSC_ERR_ARG_WRONG,
+                "CPU mode, %s, is not supported in this wrapper!",
+                modeStr.c_str());
+    else
+        SETERRQ1(MPI_COMM_WORLD, PETSC_ERR_ARG_WRONG,
+                "%s is not an available mode! Available modes are: "
+                "dDDI, dDFI, dFFI.\n", modeStr.c_str());
+
+    PetscFunctionReturn(0);
+}
+
+
+/* \implements AmgXSolver::getIters */
+PetscErrorCode AmgXSolver::getIters(int &iter)
+{
+    PetscFunctionBeginUser;
+
+    // only processes using AmgX will try to get # of iterations
+    if (gpuProc == 0)
+        AMGX_solver_get_iterations_number(solver, &iter);
+
+    PetscFunctionReturn(0);
+}
+
+
+/* \implements AmgXSolver::getResidual */
+PetscErrorCode AmgXSolver::getResidual(const int &iter, double &res)
+{
+    PetscFunctionBeginUser;
+
+    // only processes using AmgX will try to get residual
+    if (gpuProc == 0)
+        AMGX_solver_get_iteration_residual(solver, iter, 0, &res);
+
+    PetscFunctionReturn(0);
+}
diff --git a/src/setA.cpp b/src/setA.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..39d514a5bededd7888f0d1d73c7d6f673b6a13c3
--- /dev/null
+++ b/src/setA.cpp
@@ -0,0 +1,525 @@
+/**
+ * \file setA.cpp
+ * \brief Definition of member functions regarding setting A in AmgXSolver.
+ * \author Pi-Yueh Chuang (pychuang@gwu.edu)
+ * \date 2016-01-08
+ * \copyright Copyright (c) 2015-2019 Pi-Yueh Chuang, Lorena A. Barba.
+ * \copyright Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+ *            This project is released under MIT License.
+ */
+
+
+// STD
+# include <cstring>
+# include <algorithm>
+# include <iostream>
+
+// AmgXSolver
+# include "AmgXSolver.H"
+
+
+/* \implements AmgXSolver::setA */
+PetscErrorCode AmgXSolver::setA(const Mat &A)
+{
+    PetscFunctionBeginUser;
+
+    PetscErrorCode      ierr;
+
+    Mat                 localA;
+
+    IS                  devIS;
+
+    PetscInt            nGlobalRows,
+                        nLocalRows;
+    PetscBool           usesOffsets;
+
+    std::vector<PetscInt>       row;
+    std::vector<PetscInt64>     col;
+    std::vector<PetscScalar>    data;
+    std::vector<PetscInt>       partData;
+
+
+    // get number of rows in global matrix
+    ierr = MatGetSize(A, &nGlobalRows, nullptr); CHK;
+
+    // get the row indices of redistributed matrix owned by processes in gpuWorld
+    ierr = getDevIS(A, devIS); CHK;
+
+    // get sequential local portion of redistributed matrix
+    ierr = getLocalA(A, devIS, localA); CHK;
+
+    // get compressed row layout of the local Mat
+    ierr = getLocalMatRawData(localA, nLocalRows, row, col, data); CHK;
+
+    // destroy local matrix
+    ierr = destroyLocalA(A, localA); CHK;
+
+    // get a partition vector required by AmgX
+    ierr = getPartData(devIS, nGlobalRows, partData, usesOffsets); CHK;
+
+
+    // upload matrix A to AmgX
+    if (gpuWorld != MPI_COMM_NULL)
+    {
+        ierr = MPI_Barrier(gpuWorld); CHK;
+        // offsets need to be 64 bit, since we use 64 bit column indices
+        std::vector<PetscInt64> offsets;
+
+        AMGX_distribution_handle dist;
+        AMGX_distribution_create(&dist, cfg);
+        if (usesOffsets) {
+            offsets.assign(partData.begin(), partData.end());
+            AMGX_distribution_set_partition_data(dist, AMGX_DIST_PARTITION_OFFSETS, offsets.data());
+        } else {
+            AMGX_distribution_set_partition_data(dist, AMGX_DIST_PARTITION_VECTOR, partData.data());
+        }
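+        // Illustrative contrast between the two inputs: for 4 global rows
+        // split across 2 ranks, a partition vector has one entry per row,
+        // e.g. {0, 0, 1, 1}, while partition offsets have one entry per rank
+        // boundary, e.g. {0, 2, 4}; offsets avoid O(nGlobalRows) storage when
+        // each rank owns a contiguous row range.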
+
+        AMGX_matrix_upload_distributed(
+                AmgXA, nGlobalRows, nLocalRows, row[nLocalRows],
+                1, 1, row.data(), col.data(), data.data(),
+                nullptr, dist);
+        AMGX_distribution_destroy(dist);
+
+        // bind the matrix A to the solver
+        ierr = MPI_Barrier(gpuWorld); CHK;
+        AMGX_solver_setup(solver, AmgXA);
+
+        // connect (bind) vectors to the matrix
+        AMGX_vector_bind(AmgXP, AmgXA);
+        AMGX_vector_bind(AmgXRHS, AmgXA);
+    }
+    ierr = MPI_Barrier(globalCpuWorld); CHK;
+
+    // destroy temporary PETSc objects
+    ierr = ISDestroy(&devIS); CHK;
+
+    PetscFunctionReturn(0);
+}
+
+
+/* \implements AmgXSolver::getDevIS */
+PetscErrorCode AmgXSolver::getDevIS(const Mat &A, IS &devIS)
+{
+    PetscFunctionBeginUser;
+
+    PetscErrorCode      ierr;
+    IS                  tempIS;
+
+    // get index sets of A locally owned by each process
+    // note that devIS is now a serial IS on each process
+    ierr = MatGetOwnershipIS(A, &devIS, nullptr); CHK;
+
+    // concatenate index sets that belong to the same devWorld
+    // note that now devIS is a parallel IS of communicator devWorld
+    ierr = ISOnComm(devIS, devWorld, PETSC_USE_POINTER, &tempIS); CHK;
+    ierr = ISDestroy(&devIS); CHK;
+
+    // allgather so that all indices belonging to a devWorld reside on the
+    // leading rank of that devWorld. THIS IS NOT EFFICIENT!!
+    // note that now devIS is again a serial IS on each process
+    ierr = ISAllGather(tempIS, &devIS); CHK;
+    ierr = ISDestroy(&tempIS); CHK;
+
+    // empty devIS on ranks other than the leading ranks in each devWorld
+    if (myDevWorldRank != 0)
+        ierr = ISGeneralSetIndices(devIS, 0, nullptr, PETSC_COPY_VALUES); CHK;
+
+    // devIS is not guaranteed to be sorted. We sort it here.
+    ierr = ISSort(devIS); CHK;
+
+    PetscFunctionReturn(0);
+}
+
+
+/* \implements AmgXSolver::getLocalA */
+PetscErrorCode AmgXSolver::getLocalA(const Mat &A, const IS &devIS, Mat &localA)
+{
+    PetscFunctionBeginUser;
+
+    PetscErrorCode      ierr;
+    MatType             type;
+
+    // get the Mat type
+    ierr = MatGetType(A, &type); CHK;
+
+    // check whether the Mat type is supported
+    if (std::strcmp(type, MATSEQAIJ) == 0) // sequential AIJ
+    {
+        // make localA point to the same memory space as A does
+        localA = A;
+    }
+    else if (std::strcmp(type, MATMPIAIJ) == 0)
+    {
+        Mat                 tempA;
+
+        // redistribute matrix and also get corresponding scatters.
+        ierr = redistMat(A, devIS, tempA); CHK;
+
+        // get local matrix from redistributed matrix
+        ierr = MatMPIAIJGetLocalMat(tempA, MAT_INITIAL_MATRIX, &localA); CHK;
+
+        // destroy redistributed matrix
+        if (tempA == A)
+        {
+            tempA = nullptr;
+        }
+        else
+        {
+            ierr = MatDestroy(&tempA); CHK;
+        }
+    }
+    else
+    {
+        SETERRQ1(globalCpuWorld, PETSC_ERR_ARG_WRONG,
+                "Mat type %s is not supported!\n", type);
+    }
+
+    PetscFunctionReturn(0);
+}
+
+
+/* \implements AmgXSolver::redistMat */
+PetscErrorCode AmgXSolver::redistMat(const Mat &A, const IS &devIS, Mat &newA)
+{
+    PetscFunctionBeginUser;
+
+    PetscErrorCode      ierr;
+
+    if (gpuWorldSize == globalSize) // no redistribution required
+    {
+        newA = A;
+    }
+    else
+    {
+        IS      is;
+
+        // re-set the communicator of devIS to globalCpuWorld
+        ierr = ISOnComm(devIS, globalCpuWorld, PETSC_USE_POINTER, &is); CHK;
+
+        // redistribute the matrix A to newA
+        ierr = MatGetSubMatrix(A, is, is, MAT_INITIAL_MATRIX, &newA); CHK;
+
+        // get VecScatters between original data layout and the new one
+        ierr = getVecScatter(A, newA, is); CHK;
+
+        // destroy the temporary IS
+        ierr = ISDestroy(&is); CHK;
+    }
+
+    PetscFunctionReturn(0);
+}
+
+
+/* \implements AmgXSolver::getVecScatter */
+PetscErrorCode AmgXSolver::getVecScatter(
+        const Mat &A1, const Mat &A2, const IS &devIS)
+{
+    PetscFunctionBeginUser;
+
+    PetscErrorCode      ierr;
+
+    Vec                 tempLhs;
+    Vec                 tempRhs;
+
+    ierr = MatCreateVecs(A1, &tempLhs, &tempRhs); CHK;
+    ierr = MatCreateVecs(A2, &redistLhs, &redistRhs); CHK;
+
+    ierr = VecScatterCreate(tempLhs, devIS, redistLhs, devIS, &scatterLhs); CHK;
+    ierr = VecScatterCreate(tempRhs, devIS, redistRhs, devIS, &scatterRhs); CHK;
+
+    ierr = VecDestroy(&tempRhs); CHK;
+    ierr = VecDestroy(&tempLhs); CHK;
+
+    PetscFunctionReturn(0);
+}
+
+
+/* \implements AmgXSolver::getLocalMatRawData */
+PetscErrorCode AmgXSolver::getLocalMatRawData(const Mat &localA,
+        PetscInt &localN, std::vector<PetscInt> &row,
+        std::vector<PetscInt64> &col, std::vector<PetscScalar> &data)
+{
+    PetscFunctionBeginUser;
+
+    PetscErrorCode      ierr;
+
+    const PetscInt      *rawCol,
+                        *rawRow;
+
+    PetscScalar         *rawData;
+
+    PetscInt            rawN;
+
+    PetscBool           done;
+
+    // get row and column indices in compressed row format
+    ierr = MatGetRowIJ(localA, 0, PETSC_FALSE, PETSC_FALSE,
+            &rawN, &rawRow, &rawCol, &done); CHK;
+
+    // rawN will be reclaimed by MatRestoreRowIJ, so we have to copy it
+    localN = rawN;
+
+    // check if the function worked
+    if (! done)
+        SETERRQ(globalCpuWorld, PETSC_ERR_SIG, "MatGetRowIJ did not work!");
+
+    // get data
+    ierr = MatSeqAIJGetArray(localA, &rawData); CHK;
+
+    // copy values to STL vector. Note: there is an implicit conversion from
+    // PetscInt to PetscInt64 for the column vector
+    col.assign(rawCol, rawCol+rawRow[localN]);
+    row.assign(rawRow, rawRow+localN+1);
+    data.assign(rawData, rawData+rawRow[localN]);
+
+
+    // return ownership of memory space to PETSc
+    ierr = MatRestoreRowIJ(localA, 0, PETSC_FALSE, PETSC_FALSE,
+            &rawN, &rawRow, &rawCol, &done); CHK;
+
+    // check if the function worked
+    if (! done)
+        SETERRQ(globalCpuWorld, PETSC_ERR_SIG, "MatRestoreRowIJ did not work!");
+
+    // return ownership of memory space to PETSc
+    ierr = MatSeqAIJRestoreArray(localA, &rawData); CHK;
+
+    PetscFunctionReturn(0);
+}
+
+
+/* \implements AmgXSolver::destroyLocalA */
+PetscErrorCode AmgXSolver::destroyLocalA(const Mat &A, Mat &localA)
+{
+    PetscFunctionBeginUser;
+
+    PetscErrorCode      ierr;
+
+    MatType             type;
+
+    // Get the Mat type
+    ierr = MatGetType(A, &type); CHK;
+
+    // when A is sequential, localA aliases A, so we must not destroy the memory space
+    if (std::strcmp(type, MATSEQAIJ) == 0)
+    {
+        localA = nullptr;
+    }
+    // for parallel case, localA can be safely destroyed
+    else if (std::strcmp(type, MATMPIAIJ) == 0)
+    {
+        ierr = MatDestroy(&localA); CHK;
+    }
+
+    PetscFunctionReturn(0);
+}
+
+/* \implements AmgXSolver::checkForContiguousPartitioning */
+PetscErrorCode AmgXSolver::checkForContiguousPartitioning(
+    const IS &devIS, PetscBool &isContiguous, std::vector<PetscInt> &partOffsets)
+{
+    PetscFunctionBeginUser;
+    PetscErrorCode      ierr;
+    PetscBool sorted;
+    PetscInt ismax = -2; // marker for "unsorted"; lets us check for it after the Allgather below
+
+    ierr = ISSorted(devIS, &sorted); CHK;
+    if (sorted)
+    {
+        ierr = ISGetMinMax(devIS, NULL, &ismax); CHK;
+    }
+    partOffsets.resize(gpuWorldSize);
+    ++ismax; // add 1 to allow reusing gathered ismax values as partition offsets for AMGX
+    MPI_Allgather(&ismax, 1, MPIU_INT, &partOffsets[0], 1, MPIU_INT, gpuWorld);
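+    // Example (illustrative): two GPU ranks owning rows [0,49] and [50,99]
+    // gather ismax+1 values {50, 100}; prepending 0 below yields the
+    // contiguous partition offsets {0, 50, 100}.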
+    bool all_sorted = std::is_sorted(partOffsets.begin(), partOffsets.end())
+                        && partOffsets[0] != -1;
+    if (all_sorted)
+    {
+        partOffsets.insert(partOffsets.begin(), 0); // partition 0 always starts at 0
+        isContiguous = PETSC_TRUE;
+    }
+    else
+    {
+        isContiguous = PETSC_FALSE;
+    }
+    PetscFunctionReturn(0);
+}
+
+
+/* \implements AmgXSolver::getPartData */
+PetscErrorCode AmgXSolver::getPartData(
+        const IS &devIS, const PetscInt &N, std::vector<PetscInt> &partData, PetscBool &usesOffsets)
+{
+    PetscFunctionBeginUser;
+
+    PetscErrorCode      ierr;
+
+    VecScatter          scatter;
+    Vec                 tempMPI,
+                        tempSEQ;
+
+    PetscInt            n;
+    PetscScalar         *tempPartVec;
+
+    ierr = ISGetLocalSize(devIS, &n); CHK;
+
+    if (gpuWorld != MPI_COMM_NULL)
+    {
+        // check if sorted/contiguous, then we can skip expensive scatters
+        ierr = checkForContiguousPartitioning(devIS, usesOffsets, partData); CHK;
+        if (!usesOffsets)
+        {
+            ierr = VecCreateMPI(gpuWorld, n, N, &tempMPI); CHK;
+
+            IS      is;
+            ierr = ISOnComm(devIS, gpuWorld, PETSC_USE_POINTER, &is); CHK;
+            ierr = VecISSet(tempMPI, is, (PetscScalar) myGpuWorldRank); CHK;
+            ierr = ISDestroy(&is); CHK;
+
+            ierr = VecScatterCreateToAll(tempMPI, &scatter, &tempSEQ); CHK;
+            ierr = VecScatterBegin(scatter,
+                    tempMPI, tempSEQ, INSERT_VALUES, SCATTER_FORWARD); CHK;
+            ierr = VecScatterEnd(scatter,
+                    tempMPI, tempSEQ, INSERT_VALUES, SCATTER_FORWARD); CHK;
+            ierr = VecScatterDestroy(&scatter); CHK;
+            ierr = VecDestroy(&tempMPI); CHK;
+
+            ierr = VecGetArray(tempSEQ, &tempPartVec); CHK;
+
+            partData.assign(tempPartVec, tempPartVec+N);
+
+            ierr = VecRestoreArray(tempSEQ, &tempPartVec); CHK;
+
+            ierr = VecDestroy(&tempSEQ); CHK;
+        }
+    }
+    ierr = MPI_Barrier(globalCpuWorld); CHK;
+
+    PetscFunctionReturn(0);
+}
+
+
+/* \implements AmgXSolver::setA */
+PetscErrorCode AmgXSolver::setA(
+    const PetscInt nGlobalRows,
+    const PetscInt nLocalRows,
+    const PetscInt nLocalNz,
+    const PetscInt* rowOffsets,
+    const PetscInt* colIndicesGlobal,
+    const PetscScalar* values,
+    const PetscInt* partData)
+{
+    PetscFunctionBeginUser;
+
+    // Merge the distributed matrix for MPI processes sharing a GPU
+    // consolidateMatrix(nLocalRows, nLocalNz, rowOffsets, colIndicesGlobal, values);
+
+    int ierr;
+
+    // upload matrix A to AmgX
+    if (gpuWorld != MPI_COMM_NULL)
+    {
+        ierr = MPI_Barrier(gpuWorld); CHK;
+
+        //if (consolidationStatus == ConsolidationStatus::None)
+        //{
+            std::cout << "Modified version :: " << ring << std::endl;
+            AMGX_matrix_upload_all_global_32(
+                AmgXA, nGlobalRows, nLocalRows, nLocalNz,
+                1, 1, rowOffsets, colIndicesGlobal, values,
+                nullptr, 1, 1, nullptr);
+
+        /*
+            Commented-out debugging aid: dumps the CSR structure received by
+            each rank to text files for offline comparison. Note it references
+            variables (nrows, totalNnz, rowIndices, colIndices, diagIndexGlobal)
+            from the context it was originally written in, not this function.
+
+            int*    csr_rowOfs = (int*)    malloc( sizeof(int)    * (nrows+1) );
+            int*    csr_rowIdx = (int*)    malloc( sizeof(int)    * totalNnz );
+            int*    csr_colIdx = (int*)    malloc( sizeof(int)    * totalNnz );
+            double* csr_values = (double*) malloc( sizeof(double) * totalNnz );
+
+            CHECK(cudaMemcpy( csr_rowOfs, rowOffsets, (nrows+1) * sizeof(int),    cudaMemcpyDeviceToHost));
+            CHECK(cudaMemcpy( csr_rowIdx, rowIndices, totalNnz  * sizeof(int),    cudaMemcpyDeviceToHost));
+            CHECK(cudaMemcpy( csr_colIdx, colIndices, totalNnz  * sizeof(int),    cudaMemcpyDeviceToHost));
+            CHECK(cudaMemcpy( csr_values, values,     totalNnz  * sizeof(double), cudaMemcpyDeviceToHost));
+
+            if ( diagIndexGlobal > 0 )
+            {
+                FILE* fid = fopen( "P1_CSR_rowOffs.txt", "w" );
+                for ( int i = 0; i < nrows+1; i++ ) fprintf( fid, "%d\n", csr_rowOfs[i] );
+                fclose(fid);
+                fid = fopen( "P1_CSR_col.txt", "w" );
+                for ( int i = 0; i < totalNnz; i++ ) fprintf( fid, "%d, %d, %g\n", csr_rowIdx[i], csr_colIdx[i], csr_values[i] );
+                fclose(fid);
+            }
+            else
+            {
+                FILE* fid = fopen( "P0_CSR_rowOffs.txt", "w" );
+                for ( int i = 0; i < nrows+1; i++ ) fprintf( fid, "%d\n", csr_rowOfs[i] );
+                fclose(fid);
+                fid = fopen( "P0_CSR_col.txt", "w" );
+                for ( int i = 0; i < totalNnz; i++ ) fprintf( fid, "%d, %d, %g\n", csr_rowIdx[i], csr_colIdx[i], csr_values[i] );
+                fclose(fid);
+            }
+        */
+
+        //}
+        //else
+        //{
+        //    AMGX_matrix_upload_all_global_32(
+        //        AmgXA, nGlobalRows, nConsRows, nConsNz,
+        //        1, 1, rowOffsetsCons, colIndicesGlobalCons, valuesCons,
+        //        nullptr, ring, ring, partData);
+        //
+        //    // The rowOffsets and colIndices are no longer needed
+        //    freeConsStructure();
+        //}
+
+        // bind the matrix A to the solver
+        ierr = MPI_Barrier(gpuWorld); CHK;
+        AMGX_solver_setup(solver, AmgXA);
+
+        // connect (bind) vectors to the matrix
+        AMGX_vector_bind(AmgXP, AmgXA);
+        AMGX_vector_bind(AmgXRHS, AmgXA);
+    }
+    ierr = MPI_Barrier(globalCpuWorld); CHK;
+
+    PetscFunctionReturn(0);
+}
+
+/* \implements AmgXSolver::updateA */
+PetscErrorCode AmgXSolver::updateA(
+    const PetscInt nLocalRows,
+    const PetscInt nLocalNz,
+    const PetscScalar* values)
+{
+    PetscFunctionBeginUser;
+
+    // Merges the values from multiple MPI processes sharing a single GPU
+    // reconsolidateValues(nLocalNz, values);
+
+    int ierr;
+    // Replace the coefficients for the CSR matrix A within AmgX
+    if (gpuWorld != MPI_COMM_NULL)
+    {
+        ierr = MPI_Barrier(gpuWorld); CHK;
+
+        //if (consolidationStatus == ConsolidationStatus::None)
+        //{
+            AMGX_matrix_replace_coefficients(AmgXA, nLocalRows, nLocalNz, values, nullptr);
+        //}
+        //else
+        //{
+        //    AMGX_matrix_replace_coefficients(AmgXA, nConsRows, nConsNz, valuesCons, nullptr);
+        //}
+
+        ierr = MPI_Barrier(gpuWorld); CHK;
+
+        // Re-setup the solver (a reduced overhead setup that accounts for consistent matrix structure)
+        AMGX_solver_resetup(solver, AmgXA);
+    }
+
+    ierr = MPI_Barrier(globalCpuWorld); CHK;
+
+    PetscFunctionReturn(0);
+}
diff --git a/src/solve.cpp b/src/solve.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..55a33f885f673983de3f480cb9a1aabdf33c27f3
--- /dev/null
+++ b/src/solve.cpp
@@ -0,0 +1,195 @@
+/**
+ * \file solve.cpp
+ * \brief Definition of member functions regarding solving in AmgXSolver.
+ * \author Pi-Yueh Chuang (pychuang@gwu.edu)
+ * \date 2016-01-08
+ * \copyright Copyright (c) 2015-2019 Pi-Yueh Chuang, Lorena A. Barba.
+ *            This project is released under MIT License.
+ */
+
+
+// AmgXWrapper
+# include "AmgXSolver.H"
+
+
+/* \implements AmgXSolver::solve */
+PetscErrorCode AmgXSolver::solve(Vec &p, Vec &b)
+{
+    PetscFunctionBeginUser;
+
+    PetscErrorCode      ierr;
+
+    if (globalSize != gpuWorldSize)
+    {
+        ierr = VecScatterBegin(scatterRhs,
+                b, redistRhs, INSERT_VALUES, SCATTER_FORWARD); CHK;
+        ierr = VecScatterBegin(scatterLhs,
+                p, redistLhs, INSERT_VALUES, SCATTER_FORWARD); CHK;
+
+        ierr = VecScatterEnd(scatterRhs,
+                b, redistRhs, INSERT_VALUES, SCATTER_FORWARD); CHK;
+        ierr = VecScatterEnd(scatterLhs,
+                p, redistLhs, INSERT_VALUES, SCATTER_FORWARD); CHK;
+
+        if (gpuWorld != MPI_COMM_NULL)
+        {
+            ierr = solve_real(redistLhs, redistRhs); CHK;
+        }
+        ierr = MPI_Barrier(globalCpuWorld); CHK;
+
+        ierr = VecScatterBegin(scatterLhs,
+                redistLhs, p, INSERT_VALUES, SCATTER_REVERSE); CHK;
+        ierr = VecScatterEnd(scatterLhs,
+                redistLhs, p, INSERT_VALUES, SCATTER_REVERSE); CHK;
+    }
+    else
+    {
+        if (gpuWorld != MPI_COMM_NULL)
+        {
+            ierr = solve_real(p, b); CHK;
+        }
+        ierr = MPI_Barrier(globalCpuWorld); CHK;
+    }
+
+    PetscFunctionReturn(0);
+}
+
+
+/* \implements AmgXSolver::solve_real */
+PetscErrorCode AmgXSolver::solve_real(Vec &p, Vec &b)
+{
+    PetscFunctionBeginUser;
+
+    PetscErrorCode      ierr;
+
+    double              *unks,
+                        *rhs;
+
+    int                 size;
+
+    AMGX_SOLVE_STATUS   status;
+
+    // get size of local vector (p and b should have the same local size)
+    ierr = VecGetLocalSize(p, &size); CHK;
+
+    // get pointers to the raw data of local vectors
+    ierr = VecGetArray(p, &unks); CHK;
+    ierr = VecGetArray(b, &rhs); CHK;
+
+    // upload vectors to AmgX
+    AMGX_vector_upload(AmgXP, size, 1, unks);
+    AMGX_vector_upload(AmgXRHS, size, 1, rhs);
+
+    // solve
+    ierr = MPI_Barrier(gpuWorld); CHK;
+    AMGX_solver_solve(solver, AmgXRHS, AmgXP);
+
+    // get the status of the solver
+    AMGX_solver_get_status(solver, &status);
+
+    // check whether the solver successfully solved the problem
+    if (status != AMGX_SOLVE_SUCCESS) SETERRQ1(globalCpuWorld,
+            PETSC_ERR_CONV_FAILED, "AmgX solver failed to solve the system! "
+            "The error code is %d.\n", status);
+
+    // download data from device
+    AMGX_vector_download(AmgXP, unks);
+
+    // restore PETSc vectors
+    ierr = VecRestoreArray(p, &unks); CHK;
+    ierr = VecRestoreArray(b, &rhs); CHK;
+
+    PetscFunctionReturn(0);
+}
+
+
+/* \implements AmgXSolver::solve */
+PetscErrorCode AmgXSolver::solve(PetscScalar* p, const PetscScalar* b, const int nRows)
+{
+    PetscFunctionBeginUser;
+
+    int ierr;
+
+    if (consolidationStatus == ConsolidationStatus::Device)
+    {
+        CHECK(cudaMemcpy(&pCons[rowDispls[myDevWorldRank]], p, sizeof(PetscScalar) * nRows, cudaMemcpyDefault));
+        CHECK(cudaMemcpy(&rhsCons[rowDispls[myDevWorldRank]], b, sizeof(PetscScalar) * nRows, cudaMemcpyDefault));
+
+        // Must synchronize here as device to device copies are non-blocking w.r.t host
+        CHECK(cudaDeviceSynchronize());
+        ierr = MPI_Barrier(devWorld); CHK;
+    }
+    else if (consolidationStatus == ConsolidationStatus::Host)
+    {
+        MPI_Request req[2];
+        // The receive buffers are only significant on the root rank, whose rowDispls entry is 0
+        ierr = MPI_Igatherv(p, nRows, MPI_DOUBLE, pCons, nRowsInDevWorld.data(), rowDispls.data(), MPI_DOUBLE, 0, devWorld, &req[0]); CHK;
+        ierr = MPI_Igatherv(b, nRows, MPI_DOUBLE, rhsCons, nRowsInDevWorld.data(), rowDispls.data(), MPI_DOUBLE, 0, devWorld, &req[1]); CHK;
+        MPI_Waitall(2, req, MPI_STATUSES_IGNORE);
+    }
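+
+    // From here the flow is: upload the (possibly consolidated) vectors to
+    // AmgX, solve on the gpuWorld ranks, download the solution, and finally
+    // redistribute it to the ranks that contributed rows.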
+
+    if (gpuWorld != MPI_COMM_NULL)
+    {
+        // Upload potentially consolidated vectors to AmgX
+        if (consolidationStatus == ConsolidationStatus::None)
+        {
+            AMGX_vector_upload(AmgXP, nRows, 1, p);
+            AMGX_vector_upload(AmgXRHS, nRows, 1, b);
+        }
+        else
+        {
+            AMGX_vector_upload(AmgXP, nConsRows, 1, pCons);
+            AMGX_vector_upload(AmgXRHS, nConsRows, 1, rhsCons);
+        }
+
+        ierr = MPI_Barrier(gpuWorld); CHK;
+
+        // Solve
+        AMGX_solver_solve(solver, AmgXRHS, AmgXP);
+
+        // Get the status of the solver
+        AMGX_SOLVE_STATUS   status;
+        AMGX_solver_get_status(solver, &status);
+
+        // Check whether the solver successfully solved the problem
+        if (status != AMGX_SOLVE_SUCCESS)
+                SETERRQ1(globalCpuWorld,
+                        PETSC_ERR_CONV_FAILED, "AmgX solver failed to solve the system! "
+                                                "The error code is %d.\n",
+                        status);
+
+        // Download data from device
+        if (consolidationStatus == ConsolidationStatus::None)
+        {
+            AMGX_vector_download(AmgXP, p);
+        }
+        else
+        {
+            AMGX_vector_download(AmgXP, pCons);
+
+            // AMGX_vector_download invokes a device to device copy here, so it is essential that
+            // the root rank blocks the host before other ranks copy from the consolidated solution
+            CHECK(cudaDeviceSynchronize());
+        }
+    }
+
+    // If the matrix is consolidated, scatter the solution back to all ranks sharing the device
+    if (consolidationStatus == ConsolidationStatus::Device)
+    {
+        // Must synchronize before each rank attempts to read from the consolidated solution
+        ierr = MPI_Barrier(devWorld); CHK;
+
+        CHECK(cudaMemcpy(p, &pCons[rowDispls[myDevWorldRank]], sizeof(PetscScalar) * nRows, cudaMemcpyDefault));
+        CHECK(cudaDeviceSynchronize());
+    }
+    else if (consolidationStatus == ConsolidationStatus::Host)
+    {
+        // Must synchronize before each rank attempts to read from the consolidated solution
+        ierr = MPI_Barrier(devWorld); CHK;
+
+        // The send buffer is only significant on the root rank, whose rowDispls entry is 0
+        ierr = MPI_Scatterv(pCons, nRowsInDevWorld.data(), rowDispls.data(), MPI_DOUBLE, p, nRows, MPI_DOUBLE, 0, devWorld); CHK;
+    }
+
+    ierr = MPI_Barrier(globalCpuWorld); CHK;
+
+    PetscFunctionReturn(0);
+}