diff --git a/src/FOAM2CSR.mirco.cu b/src/FOAM2CSR.mirco.cu
deleted file mode 100644
index d0ac8f2bd67720ecc2ab6c4b73f7ed446a0bfb85..0000000000000000000000000000000000000000
--- a/src/FOAM2CSR.mirco.cu
+++ /dev/null
@@ -1,515 +0,0 @@
-/*
- * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
- * DEALINGS IN THE SOFTWARE.
- */
-
-#include "FOAM2CSR.H"
-#include <cuda.h>
-#include <cub/cub.cuh>
-#include <thrust/scan.h>
-#include <thrust/sequence.h>
-#include <thrust/execution_policy.h>
-#include <unistd.h>
-
-#define CHECK(call)                                              \
-    {                                                            \
-        cudaError_t e = call;                                    \
-        if (e != cudaSuccess)                                    \
-        {                                                        \
-            printf("Cuda failure: '%s %d %s'",                   \
-                __FILE__, __LINE__, cudaGetErrorString(e));      \
-        }                                                        \
-    }
-
-__global__ void printPerm(
-  const int nrows,
-  const int nnz,
-  int* rowsIdx,
-  int* colsIdx,
-  int* colsPerm )
-{
-
-  if ( threadIdx.x < nrows )
-  {
-    printf( "rows(%d) = %d\n", threadIdx.x, rowsIdx[threadIdx.x] );
-  }
-
-  if ( threadIdx.x < nnz )
-  {
-    printf( "colsPerm(%d) = %d\n", threadIdx.x, colsIdx[colsPerm[threadIdx.x]] );
-  }
-
-}
-
-__global__ void printData(
-  const int nrows,
-  const int nnz,
-  int* rowsIdx,
-  int* colsIdx,
-  double* values )
-{
-
-  if ( threadIdx.x < nrows )
-  {
-    printf( "rows(%d) = %d\n", threadIdx.x, rowsIdx[threadIdx.x] );
-  }
-
-  if ( threadIdx.x < nnz )
-  {
-    printf( "cols(%d) = %d\n", threadIdx.x, colsIdx[threadIdx.x] );
-  }
-
-  if ( threadIdx.x < nnz )
-  {
-    printf( "values(%d) = %d, %f\n", threadIdx.x, colsIdx[threadIdx.x], (float)values[threadIdx.x] );
-  }
-
-}
-
-// Offset the column indices to transform from local to global
-__global__ void localToGlobalColIndices(
-    const int nnz,
-    const int nrows,
-    const int nInternalFaces,
-    const int diagIndexGlobal,
-    const int lowOffGlobal,
-    const int uppOffGlobal,
-    int *colIndices)
-{
-    // Offset by global offset
-    for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < nnz; i += blockDim.x * gridDim.x)
-    {
-        int offset;
-        if ( i < nnz )
-        {
-          // Depending upon the row, different offsets must be applied
-          if (i < nrows)
-          {
-              offset = diagIndexGlobal;
-          }
-          else if (i < nrows + nInternalFaces)
-          {
-              offset = uppOffGlobal;
-          }
-          else
-          {
-              offset = lowOffGlobal;
-          }
-          colIndices[i] += offset;
-        }
-    }
-}
-
-
-
-// Offset the column indices to transform from local to global
-__global__ void localToGlobalRowIndices(
-    const int nnz,
-    const int nrows,
-    const int nInternalFaces,
-    const int diagIndexGlobal,
-    const int lowOffGlobal,
-    const int uppOffGlobal,
-    int *rowIndices)
-{
-    // Offset by global offset
-    for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < nnz; i += blockDim.x * gridDim.x)
-    {
-        int offset;
-        if ( i < nnz )
-        {
-          // Depending upon the row, different offsets must be applied
-          if (i < nrows)
-          {
-              offset = diagIndexGlobal;
-          }
-          else if (i < nrows + nInternalFaces)
-          {
-              offset = uppOffGlobal;
-          }
-          else
-          {
-              offset = lowOffGlobal;
-          }
-
-          rowIndices[i] += offset - diagIndexGlobal;
-        }
-    }
-}
-
-
-
-// Apply the pre-existing permutation to the values [and columns]
-__global__ void applyPermutation(
-    const int totalNnz,
-    const int *perm,
-    const int *colIndicesTmp,
-    const double *valuesTmp,
-    int *colIndices,
-    double *values,
-    bool valuesOnly)
-{
-    // Permute col indices and values
-    for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < totalNnz; i += blockDim.x * gridDim.x)
-    {
-
-      if ( i < totalNnz )
-      {
-        int p = perm[i];
-
-        // In the values only case the column indices and row offsets remain fixed so no update
-        if ( p< totalNnz )
-        {
-          if (!valuesOnly)
-          {
-              colIndices[i] = colIndicesTmp[p];
-          }
-          if ( p < totalNnz )
-          {
-            values[i] = valuesTmp[p];
-          }
-        }
-      }
-    }
-}
-
-// Flatten the row indices into the row offsets
-// >>>  constexpr int nthreads = 128;
-// >>>  int nblocks = totalNnz / nthreads + 1;
-// >>>  createRowOffsets<<<nblocks, nthreads>>>(totalNnz, nrows+1, rowIndices, rowOffsets);
-__global__ void createRowOffsets(
-    int nnz,
-    int nrows,
-    int *rowIndices,
-    int *rowOffsets)
-{
-
-/*
-    for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < nnz; i += blockDim.x * gridDim.x)
-    {
-        int k = rowIndices[i];
-        if ( k >= nrows ) {
-          printf( "Sforo il numero di linee\n" );
-          printf( "Memory Access :: %d, %d, %d, %d,%d, %d, %d, %d, %d\n", i, k, nnz, nrows, threadIdx.x, blockIdx.x, blockDim.x, gridDim.x, rowIndices[i] );
-        }
-
-        if ( i >= nnz || i == 0 ) {
-          printf( "Memory Access :: %d, %d, %d, %d,%d, %d, %d, %d, %d\n", i, k, nnz, nrows, threadIdx.x, blockIdx.x, blockDim.x, gridDim.x, rowIndices[i] );
-        }
-        atomicAdd(&rowOffsets[k], 1);
-    }
-*/
-
-  for ( int i = threadIdx.x + blockIdx.x * blockDim.x; i<nnz; i += blockDim.x * gridDim.x)
-  {
-    if ( i < nnz )
-    {
-      int k = rowIndices[i];
-      if ( k < nrows+1 ) 
-      {
-        atomicAdd(&rowOffsets[k], 1);
-      }
-    }
-  }
-
-
-}
-
-// Updates the values based on the previously determined permutation
-void FOAM2CSR::updateValues(
-    const int nrows,
-    const int nInternalFaces,
-    const int extNnz,
-    const double *diagVal,
-    const double *uppVal,
-    const double *lowVal,
-    const double *extVal)
-{
-
-    // Determine the local non-zeros from the internal faces
-    int localNnz = nrows + 2 * nInternalFaces;
-
-    // Add external non-zeros (communicated halo entries)
-    int totalNnz = localNnz + extNnz;
-
-    // Copy the values in [ diag, upper, lower, (external) ]
-    CHECK(cudaMemcpy(valuesTmp, diagVal, sizeof(double) * nrows, cudaMemcpyDefault));
-    CHECK(cudaMemcpy(valuesTmp + nrows, uppVal, sizeof(double) * nInternalFaces, cudaMemcpyDefault));
-    CHECK(cudaMemcpy(valuesTmp + nrows + nInternalFaces, lowVal, sizeof(double) * nInternalFaces, cudaMemcpyDefault));
-    if (extNnz > 0)
-    {
-        CHECK(cudaMemcpy(valuesTmp + localNnz, extVal, sizeof(double) * extNnz, cudaMemcpyDefault));
-    }
-
-    constexpr int nthreads = 128;
-    int nblocks = totalNnz / nthreads + 1;
-    applyPermutation<<<nblocks, nthreads>>>(totalNnz, ldu2csrPerm, nullptr, valuesTmp, nullptr, values, true);
-    CHECK(cudaStreamSynchronize(0));
-}
-
-// Perform the conversion between an LDU matrix and a CSR matrix, possibly distributed
-void FOAM2CSR::convertLDU2CSR(
-    int nrows,
-    int nInternalFaces,
-    int diagIndexGlobal,
-    int lowOffGlobal,
-    int uppOffGlobal,
-    const int *upperAddr,
-    const int *lowerAddr,
-    const int extNnz,
-    const int *extRow,
-    const int *extCol,
-    const double *diagVals,
-    const double *upperVals,
-    const double *lowerVals,
-    const double *extVals)
-{
-
-
-    // Determine the local non-zeros from the internal faces
-    int localNnz = nrows + 2 * nInternalFaces;
-
-    // Add external non-zeros (communicated halo entries)
-    int totalNnz = localNnz + extNnz;
-
-
-    if ( diagIndexGlobal > 0 ) 
-    {
-      sleep(2);
-      std::cout << " ============================================================== " << std::endl;
-      std::cout << "P1 :: nrows             :: " << nrows           << std::endl;
-      std::cout << "P1 :: nInternalFaces    :: " << nInternalFaces  << std::endl;
-      std::cout << "P1 :: extNnz            :: " << extNnz          << std::endl;
-      std::cout << "P1 :: diag index global :: " << diagIndexGlobal << std::endl;
-      std::cout << "P1 :: low offset global :: " << lowOffGlobal    << std::endl;
-      std::cout << "P1 :: upp offset global :: " << uppOffGlobal    << std::endl;
-      std::cout << "P1 :: upperaddr0        :: " << upperAddr[0]    << std::endl;
-      std::cout << "P1 :: loweraddr0        :: " << lowerAddr[0]    << std::endl;
-      if ( extNnz > 0 ){
-        std::cout << "P1 :: extRow            :: " << extRow[0]       << std::endl;
-        std::cout << "P1 :: extCol            :: " << extCol[0]       << std::endl;
-      }
-        FILE* fid = fopen( "P1_descr.txt", "w" );
-        fprintf( fid, "%d, %d, %d, %d, %d, %d\n", nrows, nInternalFaces, diagIndexGlobal, lowOffGlobal, uppOffGlobal, extNnz );
-        fclose(fid);
-      
-      fid = fopen( "P1_LDU_diag.txt", "w" );
-      for ( int i =0; i<nrows; i++ ) fprintf( fid, "%g\n", diagVals[i] );
-      fclose(fid);
-      fid = fopen( "P1_LDU_sym.txt", "w" );
-      for ( int i =0; i<nInternalFaces; i++ ) fprintf( fid, "%d, %d, %g, %g\n", 
-                     upperAddr[i], lowerAddr[i], upperVals[i], lowerVals[i] );
-      fclose(fid);
-      if ( extNnz > 0 ){
-        fid = fopen( "P1_LDU_ext.txt", "w" );
-        for ( int i =0; i<extNnz; i++ ) fprintf( fid, "%d, %d, %g\n", 
-                     extRow[i], extCol[i], extVals[i] );
-        fclose(fid);
-      }
-      std::cout << " ============================================================== " << std::endl;
-    }
-    else 
-    {
-      std::cout << " ============================================================== " << std::endl;
-      std::cout << "P0 :: nrows             :: " << nrows           << std::endl;
-      std::cout << "P0 :: nInternalFaces    :: " << nInternalFaces  << std::endl;
-      std::cout << "P0 :: extNnz            :: " << extNnz          << std::endl;
-      std::cout << "P0 :: diag index global :: " << diagIndexGlobal << std::endl;
-      std::cout << "P0 :: low offset global :: " << lowOffGlobal    << std::endl;
-      std::cout << "P0 :: upp offset global :: " << uppOffGlobal    << std::endl;
-      std::cout << "P0 :: upperaddr0        :: " << upperAddr[0]    << std::endl;
-      std::cout << "P0 :: loweraddr0        :: " << lowerAddr[0]    << std::endl;
-      if ( extNnz > 0 )
-      {
-        std::cout << "P0 :: extRow            :: " << extRow[0]       << std::endl;
-        std::cout << "P0 :: extCol            :: " << extCol[0]       << std::endl;
-      }
-      FILE* fid = fopen( "P0_descr.txt", "w" );
-      fprintf( fid, "%d, %d, %d, %d, %d, %d\n", nrows, nInternalFaces, diagIndexGlobal, lowOffGlobal, uppOffGlobal, extNnz );
-      fclose(fid);
-      fid = fopen( "P0_LDU_diag.txt", "w" );
-      for ( int i =0; i<nrows; i++ ) fprintf( fid, "%g\n", diagVals[i] );
-      fclose(fid);
-      fid = fopen( "P0_LDU_sym.txt", "w" );
-      for ( int i =0; i<nInternalFaces; i++ ) fprintf( fid, "%d, %d, %g, %g\n", 
-                     upperAddr[i], lowerAddr[i], upperVals[i], lowerVals[i] );
-      fclose(fid);
-      if ( extNnz > 0 ){
-        fid = fopen( "P0_LDU_ext.txt", "w" );
-        for ( int i =0; i<extNnz; i++ ) fprintf( fid, "%d, %d, %g\n", 
-                       extRow[i], extCol[i], extVals[i] );
-        fclose(fid);
-      }
-      sleep(3);
-    }
-
-
-    // Generate unpermuted index list [0, ..., totalNnz-1]
-    int GuardSize=1000000;
-    int *permTmp;
-    CHECK(cudaMalloc(&permTmp, sizeof(int) * (totalNnz+GuardSize)));
-    thrust::sequence(thrust::device, permTmp, permTmp + totalNnz, 0);
-
-    // Fill rowIndicesTmp with [0, ..., n-1], lowerAddr, upperAddr, (extAddr)
-    int *rowIndicesTmp;
-    CHECK(cudaMalloc(&rowIndicesTmp, sizeof(int) * (totalNnz+GuardSize)));
-    CHECK(cudaMemcpy(rowIndicesTmp, permTmp, nrows * sizeof(int), cudaMemcpyDefault));
-    CHECK(cudaMemcpy(rowIndicesTmp + nrows, lowerAddr, nInternalFaces * sizeof(int), cudaMemcpyDefault));
-    CHECK(cudaMemcpy(rowIndicesTmp + nrows + nInternalFaces, upperAddr, nInternalFaces * sizeof(int), cudaMemcpyDefault));
-    if (extNnz > 0)
-    {
-        CHECK(cudaMemcpy(rowIndicesTmp + localNnz, extRow, extNnz * sizeof(int), cudaMemcpyDefault));
-    }
-    constexpr int nthreads = 128;
-    int nblocks = localNnz / nthreads + 1;
-    // localToGlobalRowIndices<<<nblocks, nthreads>>>(localNnz, nrows, nInternalFaces, diagIndexGlobal, lowOffGlobal, uppOffGlobal, rowIndicesTmp);
-
-    // Make space for the row indices and stored permutation
-    int *rowIndices;
-    CHECK(cudaMalloc(&rowIndices, sizeof(int) * (totalNnz+GuardSize)));
-    CHECK(cudaMalloc(&ldu2csrPerm, sizeof(int) * (totalNnz+GuardSize)));
-
-    // Sort the row indices and store results in the permutation
-    void *tempStorage = NULL;
-    size_t tempStorageBytes = 0;
-    cub::DeviceRadixSort::SortPairs(tempStorage, tempStorageBytes, rowIndicesTmp, rowIndices, permTmp, ldu2csrPerm, totalNnz);
-    CHECK(cudaMalloc(&tempStorage, tempStorageBytes));
-
-
-    cub::DeviceRadixSort::SortPairs(tempStorage, tempStorageBytes, rowIndicesTmp, rowIndices, permTmp, ldu2csrPerm, totalNnz);
-    CHECK(cudaFree(permTmp));
-    CHECK(cudaFree(tempStorage));
-
-    // Make space for the row offsets
-    CHECK(cudaMalloc(&rowOffsets, sizeof(int) * (nrows + 1 + GuardSize)));
-    CHECK(cudaMemset(rowOffsets, 0, sizeof(int) * (nrows + 1 + GuardSize)));
-
-    // XXX Taking the non zero per row data from the host could be more
-    // efficient, experiment with this in the future
-    //cudaMemcpy(rowOffsets, nz_per_row, sizeof(int) * nrows, cudaMemcpyDefault);
-    //thrust::exclusive_scan(thrust::device, rowOffsets, rowOffsets + nrows + 1, rowOffsets);
-
-    // Convert the row indices into offsets
-    nblocks = totalNnz / nthreads + 1;
-    createRowOffsets<<<nblocks, nthreads>>>(totalNnz, nrows+1, rowIndices, rowOffsets);
-    thrust::exclusive_scan(thrust::device, rowOffsets, rowOffsets + nrows + 1, rowOffsets);
-    // CHECK(cudaFree(rowIndices));
-    // printf( "%d - %d \n", rowOffsets[nrows], totalNnz );
-
-
-
-    // Fill valuesTmp with diagVals, upperVals, lowerVals, (extVals)
-    CHECK(cudaMalloc(&valuesTmp, (totalNnz+GuardSize) * sizeof(double)));
-    CHECK(cudaMemcpy(valuesTmp, diagVals, nrows * sizeof(double), cudaMemcpyDefault));
-    CHECK(cudaMemcpy(valuesTmp + nrows, upperVals, nInternalFaces * sizeof(double), cudaMemcpyDefault));
-    CHECK(cudaMemcpy(valuesTmp + nrows + nInternalFaces, lowerVals, nInternalFaces * sizeof(double), cudaMemcpyDefault));
-    if (extNnz > 0)
-    {
-        CHECK(cudaMemcpy(valuesTmp + localNnz, extVals, extNnz * sizeof(double), cudaMemcpyDefault));
-    }
-
-    // Concat [0, ..., n-1], upperAddr, lowerAddr (note switched) into column indices
-    int *colIndicesTmp;
-    CHECK(cudaMalloc(&colIndicesTmp, (totalNnz+GuardSize) * sizeof(int)));
-    CHECK(cudaMemcpy(colIndicesTmp, rowIndicesTmp, nrows * sizeof(int), cudaMemcpyDefault));
-    CHECK(cudaMemcpy(colIndicesTmp + nrows, rowIndicesTmp + nrows + nInternalFaces, nInternalFaces * sizeof(int), cudaMemcpyDefault));
-    CHECK(cudaMemcpy(colIndicesTmp + nrows + nInternalFaces, rowIndicesTmp + nrows, nInternalFaces * sizeof(int), cudaMemcpyDefault));
-    if (extNnz > 0)
-    {
-        CHECK(cudaMemcpy(colIndicesTmp + localNnz, extCol, extNnz * sizeof(int), cudaMemcpyDefault));
-    }
-
-    CHECK(cudaFree(rowIndicesTmp));
-
-
-
-    // Construct the global column indices
-    nblocks = localNnz / nthreads + 1;
-    localToGlobalColIndices<<<nblocks, nthreads>>>(localNnz, nrows, nInternalFaces, diagIndexGlobal, lowOffGlobal, uppOffGlobal, colIndicesTmp);
-
-
-    // std::cout << "Print" << std::endl;
-    // nblocks = 1;
-    // printPerm<<<nblocks, nthreads>>>( 9, 33, rowOffsets, colIndicesTmp, ldu2csrPerm );
-    // std::cout << "Exit" << std::endl;
-
-
-    // Allocate space to store the permuted column indices and values
-    CHECK(cudaMalloc(&colIndices, sizeof(int) * (totalNnz+GuardSize)));
-    CHECK(cudaMalloc(&values, sizeof(double) * (totalNnz+GuardSize)));
-
-    // Swap column indices based on the pre-determined permutation
-    nblocks = totalNnz / nthreads + 1;
-    applyPermutation<<<nblocks, nthreads>>>(totalNnz, ldu2csrPerm, colIndicesTmp, valuesTmp, colIndices, values, false);
-    CHECK(cudaFree(colIndicesTmp));
-
-
-    int*    csr_rowOfs = (int*)    malloc( sizeof(int)*   (nrows+1) );
-    int*    csr_rowIdx = (int*)    malloc( sizeof(int)*   totalNnz );
-    int*    csr_colIdx = (int*)    malloc( sizeof(int)*   totalNnz );
-    double* csr_values = (double*) malloc( sizeof(double)*totalNnz );
-
-    CHECK(cudaMemcpy( csr_rowOfs, rowOffsets, (nrows+1) * sizeof(int),    cudaMemcpyDeviceToHost));
-    CHECK(cudaMemcpy( csr_rowIdx, rowIndices, totalNnz * sizeof(int),     cudaMemcpyDeviceToHost));
-    CHECK(cudaMemcpy( csr_colIdx, colIndices, totalNnz  * sizeof(int),    cudaMemcpyDeviceToHost));
-    CHECK(cudaMemcpy( csr_values, values,     totalNnz  * sizeof(double), cudaMemcpyDeviceToHost));
-
-    if ( diagIndexGlobal > 0 ) 
-    {
-      FILE* fid = fopen( "P1_CSR_rowOffs.txt", "w" );
-      for ( int i =0; i<nrows+1; i++ ) fprintf( fid, "%d\n", csr_rowOfs[i] );
-      fclose(fid);
-      fid = fopen( "P1_CSR_col.txt", "w" );
-      for ( int i =0; i<totalNnz; i++ ) fprintf( fid, "%d, %d, %g\n", csr_rowIdx[i], csr_colIdx[i], csr_values[i] );
-      fclose(fid);
-   }
-   else
-   {
-     FILE* fid = fopen( "P0_CSR_rowOffs.txt", "w" );
-     for ( int i =0; i<nrows+1; i++ ) fprintf( fid, "%d\n", csr_rowOfs[i] );
-     fclose(fid);
-     fid = fopen( "P0_CSR_col.txt", "w" );
-     for ( int i =0; i<totalNnz; i++ ) fprintf( fid, "%d, %d, %g\n", csr_rowIdx[i], csr_colIdx[i], csr_values[i] );
-     fclose(fid);
-   }
-
-
-}
-
-// XXX Should implement an early abandonment of the
-// unnecessary data for capacity optimisation
-void FOAM2CSR::discardStructure()
-{
-}
-
-// Deallocate remaining storage
-void FOAM2CSR::finalise()
-{
-    if(rowOffsets != nullptr)
-        CHECK(cudaFree(rowOffsets));
-    if(colIndices != nullptr)
-        CHECK(cudaFree(colIndices));
-    if(values != nullptr)
-        CHECK(cudaFree(values));
-    if(valuesTmp != nullptr)
-        CHECK(cudaFree(valuesTmp));
-    if(ldu2csrPerm != nullptr)
-        CHECK(cudaFree(ldu2csrPerm));
-}