Major commit: graph partitioning algorithms, level scheduling method,…

… GPU Preconditioner framework
su2code · digvijay-y · Jun 10, 2025 · Jun 10, 2025 · Jun 25, 2025 · Jun 25, 2025
commit 6f51f456a2e6904c3cad0d9bc7e8ff1d11a9978b
diff --git a/Common/include/CConfig.hpp b/Common/include/CConfig.hpp
@@ -521,6 +521,7 @@ class CConfig {
   Kind_Gradient_Method_Recon,      /*!< \brief Numerical method for computation of spatial gradients used for upwind reconstruction. */
   Kind_Deform_Linear_Solver,             /*!< Numerical method to deform the grid */
   Kind_Deform_Linear_Solver_Prec,        /*!< \brief Preconditioner of the linear solver. */
+  Kind_Graph_Part_Algo,    /*!< \brief Algorithm for parallel partitioning of the matrix graph. */
   Kind_Linear_Solver,                    /*!< \brief Numerical solver for the implicit scheme. */
   Kind_Linear_Solver_Prec,               /*!< \brief Preconditioner of the linear solver. */
   Kind_DiscAdj_Linear_Solver,            /*!< \brief Linear solver for the discrete adjoint system. */
@@ -4136,6 +4137,12 @@ class CConfig {
    */
   bool GetLeastSquaresRequired(void) const { return LeastSquaresRequired; }
 
+    /*!
+   * \brief Get the type of algorithm used for partitioning the matrix graph.
+   * \return Kind of graph partitioning algorithm, as stored in Kind_Graph_Part_Algo.
+   */
+  unsigned short GetKind_Graph_Part_Algo(void) const { return Kind_Graph_Part_Algo; }
+
   /*!
    * \brief Get the kind of solver for the implicit solver.
    * \return Numerical solver for implicit formulation (solving the linear system).

diff --git a/Common/include/geometry/CGeometry.hpp b/Common/include/geometry/CGeometry.hpp
@@ -260,6 +260,12 @@ class CGeometry {
   unsigned long* nPointCumulative{nullptr}; /*!< \brief Cumulative storage array containing the total number of points
                                                on all prior ranks in the linear partitioning. */
 
+  unsigned long nPartition{0}; /*!< \brief Number of divisions of the matrix graph during execution of parallel
+                                  partitioning algorithms (0 until the graph has been partitioned). */
+  unsigned long maxPartitionSize{0}; /*!< \brief Size of the level with the maximum number of elements. */
+  vector<unsigned long>
+      partitionOffsets; /*!< \brief Vector array containing the indices at which different parallel partitions begin. */
+
   /*--- Data structures for point-to-point MPI communications. ---*/
 
   int maxCountPerPoint{0}; /*!< \brief Maximum number of pieces of data sent per vertex in point-to-point comms. */

diff --git a/Common/include/geometry/CPhysicalGeometry.hpp b/Common/include/geometry/CPhysicalGeometry.hpp
@@ -148,6 +148,17 @@ class CPhysicalGeometry final : public CGeometry {
    */
   void DistributeColoring(const CConfig* config, CGeometry* geometry);
 
+  /*!
+   * \brief Divide the graph produced by the matrix into parallel partitions.
+   * \param[in] config - Definition of the particular problem.
+   * \param[in,out] pointList - Ordered list of points in the mesh; taken by non-const reference and handed to
+   *                            the selected partitioning algorithm (presumably reordered in place - to confirm).
+   * \note The number of partitions, the largest partition size and the partition start indices are not
+   *       arguments: they are written to the nPartition, maxPartitionSize and partitionOffsets members.
+   */
+  template <class ScalarType>
+  void PartitionGraph(const CConfig* config, vector<ScalarType>& pointList);
+
   /*!
    * \brief Distribute the grid points, including ghost points, across all ranks based on a ParMETIS coloring.
    * \param[in] config - Definition of the particular problem.

diff --git a/Common/include/linear_algebra/CMatrixVectorProduct.hpp b/Common/include/linear_algebra/CMatrixVectorProduct.hpp
@@ -101,14 +101,17 @@ class CSysMatrixVectorProduct final : public CMatrixVectorProduct<ScalarType> {
    * \param[out] v - CSysVector that is the result of the product
    */
   inline void operator()(const CSysVector<ScalarType>& u, CSysVector<ScalarType>& v) const override {
-#ifdef HAVE_CUDA
     if (config->GetCUDA()) {
+#ifdef HAVE_CUDA
       matrix.GPUMatrixVectorProduct(u, v, geometry, config);
+#else
+      SU2_MPI::Error(
+          "\nError in launching Matrix-Vector Product Function\nENABLE_CUDA is set to YES\nPlease compile with CUDA "
+          "options enabled in Meson to access GPU Functions",
+          CURRENT_FUNCTION);
+#endif
     } else {
       matrix.MatrixVectorProduct(u, v, geometry, config);
     }
-#else
-    matrix.MatrixVectorProduct(u, v, geometry, config);
-#endif
   }
 };
diff --git a/Common/include/linear_algebra/CPreconditioner.hpp b/Common/include/linear_algebra/CPreconditioner.hpp
@@ -205,17 +205,15 @@ class CLU_SGSPreconditioner final : public CPreconditioner<ScalarType> {
    * \param[out] v - CSysVector that is the result of the preconditioning.
    */
   inline void operator()(const CSysVector<ScalarType>& u, CSysVector<ScalarType>& v) const override {
-    #ifdef HAVE_CUDA
-    if(config->GetCUDA()) 
-    {
+#ifdef HAVE_CUDA
+    if (config->GetCUDA()) {
       sparse_matrix.GPUComputeLU_SGSPreconditioner(u, v, geometry, config);
-    }
-    else {
+    } else {
       sparse_matrix.ComputeLU_SGSPreconditioner(u, v, geometry, config);
     }
-    #else
+#else
     sparse_matrix.ComputeLU_SGSPreconditioner(u, v, geometry, config);
-    #endif
+#endif
   }
 };
 

diff --git a/Common/include/linear_algebra/CSysMatrix.hpp b/Common/include/linear_algebra/CSysMatrix.hpp
@@ -149,6 +149,7 @@ class CSysMatrix {
   const unsigned long* d_row_ptr; /*!< \brief Device Pointers to the first element in each row. */
   const unsigned long* d_col_ind; /*!< \brief Device Column index for each of the elements in val(). */
   const unsigned long* d_dia_ptr; /*!< \brief Device Column index for each of the elements in val(). */
+  unsigned long* d_partition_offsets = nullptr; /*!< \brief Device copy of the partition offsets of the matrix graph. */
 
   ScalarType* ILU_matrix;           /*!< \brief Entries of the ILU sparse matrix. */
   unsigned long nnz_ilu;            /*!< \brief Number of possible nonzero entries in the matrix (ILU). */
@@ -865,8 +866,8 @@ class CSysMatrix {
    * \param[in] vec - CSysVector to be multiplied by the preconditioner.
    * \param[out] prod - Result of the product A*vec.
    */
-  void GPUComputeLU_SGSPreconditioner(const CSysVector<ScalarType>& vec, CSysVector<ScalarType>& prod, CGeometry* geometry,
-    const CConfig* config) const;
+  void GPUComputeLU_SGSPreconditioner(const CSysVector<ScalarType>& vec, CSysVector<ScalarType>& prod,
+                                      CGeometry* geometry, const CConfig* config) const;
 
   /*!
    * \brief Build the Jacobi preconditioner.

diff --git a/Common/include/linear_algebra/GPUComms.cuh b/Common/include/linear_algebra/GPUComms.cuh
@@ -28,17 +28,51 @@
 #include<cuda_runtime.h>
 #include"iostream"
 
-namespace KernelParameters{
+namespace kernelParameters{
 
   /*Returns the rounded up value of the decimal quotient to the next integer (in all cases)*/
-  inline constexpr int rounded_up_division(const int divisor, int dividend) { return ((dividend + divisor - 1) / divisor); }   
+  inline constexpr int rounded_up_division(const int divisor, int dividend) { return ((dividend + divisor - 1) / divisor); }
 
   /*Returns the rounded down value of the decimal quotient to the previous integer (in all cases)*/
-  inline constexpr int rounded_down_division(const int divisor, int dividend) { return ((dividend - divisor + 1) / divisor); }   
+  inline constexpr int rounded_down_division(const int divisor, int dividend) { return dividend / divisor - ((dividend % divisor != 0) && ((dividend < 0) != (divisor < 0))); }
+
+  const unsigned int MVP_BLOCK_SIZE = 1024;
+  const unsigned int MVP_WARP_SIZE = 32;
+
+};
+
+struct matrixParameters{
+  /*--- Plain bundle of matrix sizes; each field is filled from the constructor argument named next to it. ---*/
+  public:
+    unsigned long totalRows;     /*!< \brief Copy of nPointDomain. */
+    unsigned long blockRowSize;  /*!< \brief Copy of nEqn. */
+    unsigned long blockColSize;  /*!< \brief Copy of nVar. */
+    unsigned long nPartition;    /*!< \brief Copy of nPartitions. */
+    unsigned long blockSize;     /*!< \brief Product nVar * nEqn. */
+
+    matrixParameters(unsigned long nPointDomain, unsigned long nEqn, unsigned long nVar, unsigned long nPartitions)
+    {
+      totalRows = nPointDomain;
+      blockRowSize = nEqn;
+      blockColSize = nVar;
+      nPartition = nPartitions;
+      blockSize = nVar * nEqn;
+    }
+};
+struct precondParameters{
+  /*--- Block and grid dimensions (dim3) derived from a matrixParameters object. ---*/
+  public:
+    dim3 gaussElimBlockDim;  /*!< \brief Set to {matrixParam.blockSize, 1, 1} by the constructor. */
+    dim3 gaussElimGridDim;   /*!< \brief Set to {1, 1, 1} by the constructor. */
+
+    precondParameters(matrixParameters matrixParam)
+    {
+      unsigned int geBlockx = matrixParam.blockSize;  // unsigned long value converted to unsigned int
+      gaussElimBlockDim = {geBlockx, 1, 1};
+      gaussElimGridDim = {1,1,1};
+    }
+};
 
-  const int MVP_BLOCK_SIZE = 1024;
-  const int MVP_WARP_SIZE = 32;
-}
 /*!
   * \brief assert style function that reads return codes after intercepting CUDA API calls.
   *        It returns the result code and its location if the call is unsuccessful.

diff --git a/Common/include/option_structure.hpp b/Common/include/option_structure.hpp
@@ -2345,6 +2345,16 @@ static const MapType<std::string, ENUM_FFD_BLENDING> Blending_Map = {
   MakePair("BEZIER", BEZIER)
 };
 
+/*!
+ * \brief Types of graph partitioning algorithms for parallel computing
+ */
+enum ENUM_GRAPH_PART_ALGORITHM {
+  LEVEL_SCHEDULING,   /*!< \brief  Partitions the graph according to the level scheduling algorithm. */
+};
+static const MapType<std::string, ENUM_GRAPH_PART_ALGORITHM> Graph_Part_Map = {
+  MakePair("LEVEL_SCHEDULING", LEVEL_SCHEDULING)
+};
+
 /*!
  * \brief Types of solvers for solving linear systems
  */

diff --git a/Common/src/CConfig.cpp b/Common/src/CConfig.cpp
@@ -1849,6 +1849,9 @@ void CConfig::SetConfig_Options() {
   /*!\par CONFIG_CATEGORY: Linear solver definition \ingroup Config*/
   /*--- Options related to the linear solvers ---*/
 
+  /*!\brief GRAPH_PART_ALGORITHM
+   *  \n DESCRIPTION: Algorithm for partitioning the matrix graph to facilitate parallel execution of linear algebra subroutines\n OPTIONS: see \link Graph_Part_Map \endlink \n DEFAULT: LEVEL_SCHEDULING \ingroup Config*/
+  addEnumOption("GRAPH_PART_ALGORITHM", Kind_Graph_Part_Algo, Graph_Part_Map, LEVEL_SCHEDULING);
   /*!\brief LINEAR_SOLVER
    *  \n DESCRIPTION: Linear solver for the implicit, mesh deformation, or discrete adjoint systems \n OPTIONS: see \link Linear_Solver_Map \endlink \n DEFAULT: FGMRES \ingroup Config*/
   addEnumOption("LINEAR_SOLVER", Kind_Linear_Solver, Linear_Solver_Map, FGMRES);

diff --git a/Common/src/geometry/CPhysicalGeometry.cpp b/Common/src/geometry/CPhysicalGeometry.cpp
@@ -26,6 +26,7 @@
  */
 
 #include "../../include/geometry/CPhysicalGeometry.hpp"
+#include "../../include/linear_algebra/CGraphPartitioning.hpp"
 #include "../../include/adt/CADTPointsOnlyClass.hpp"
 #include "../../include/toolboxes/printing_toolbox.hpp"
 #include "../../include/toolboxes/CLinearPartitioner.hpp"
@@ -49,6 +50,8 @@
 #include "../../include/geometry/primal_grid/CPyramid.hpp"
 #include "../../include/geometry/primal_grid/CPrism.hpp"
 #include "../../include/geometry/primal_grid/CVertexMPI.hpp"
+#include "boost/integer_fwd.hpp"
+#include "cgnslib.h"
 
 #include <sys/types.h>
 #include <sys/stat.h>
@@ -699,6 +702,21 @@ void CPhysicalGeometry::DistributeColoring(const CConfig* config, CGeometry* geo
   delete[] nPoint_Flag;
 }
 
+/*--- Partition the matrix graph with the configured algorithm; sets partitionOffsets, nPartition, maxPartitionSize. ---*/
+template <class ScalarType>
+void CPhysicalGeometry::PartitionGraph(const CConfig* config, vector<ScalarType>& pointList) {
+  partitionOffsets.reserve(nPointDomain);
+  switch (config->GetKind_Graph_Part_Algo()) {
+    case LEVEL_SCHEDULING: {  // braces scope levelSchedule so that further cases can be added legally
+      auto levelSchedule = CLevelScheduling<ScalarType>(nPointDomain, nodes);
+      levelSchedule.Partition(pointList, partitionOffsets);
+      nPartition = levelSchedule.nLevels;
+      maxPartitionSize = levelSchedule.maxLevelWidth;
+      break;
+    }
+  }
+}
+
 void CPhysicalGeometry::DistributeVolumeConnectivity(const CConfig* config, CGeometry* geometry,
                                                      unsigned short Elem_Type) {
   unsigned short NODES_PER_ELEMENT = 0;
@@ -4542,6 +4560,8 @@ void CPhysicalGeometry::SetRCM_Ordering(CConfig* config) {
     if (!status) SU2_MPI::Error("RCM ordering failed", CURRENT_FUNCTION);
   }
 
+  if (config->GetCUDA()) PartitionGraph(config, Result);
+
   /*--- Add the MPI points ---*/
   for (auto iPoint = nPointDomain; iPoint < nPoint; iPoint++) {
     Result.push_back(iPoint);

diff --git a/Common/src/linear_algebra/CSysMatrix.cpp b/Common/src/linear_algebra/CSysMatrix.cpp
@@ -71,6 +71,7 @@ CSysMatrix<ScalarType>::~CSysMatrix() {
   GPUMemoryAllocation::gpu_free(d_matrix);
   GPUMemoryAllocation::gpu_free(d_row_ptr);
   GPUMemoryAllocation::gpu_free(d_col_ind);
+  GPUMemoryAllocation::gpu_free(d_partition_offsets);
 
 #ifdef USE_MKL
   mkl_jit_destroy(MatrixMatrixProductJitter);
@@ -150,11 +151,16 @@ void CSysMatrix<ScalarType>::Initialize(unsigned long npoint, unsigned long npoi
     ptr = GPUMemoryAllocation::gpu_alloc_cpy<const unsigned long>(src_ptr, num * sizeof(const unsigned long));
   };
 
+  auto GPUVectorAllocAndCopy = [](unsigned long*& ptr, vector<unsigned long>& src_ptr, unsigned long num) {
+    ptr = GPUMemoryAllocation::gpu_alloc_cpy<unsigned long>(src_ptr.data(), num * sizeof(unsigned long));  // data() is well-defined even for an empty vector, unlike &src_ptr[0]
+  };
+
   GPUAllocAndInit(d_matrix, nnz * nVar * nEqn);
   GPUAllocAndCopy(d_row_ptr, row_ptr, (nPointDomain + 1.0));
   GPUAllocAndCopy(d_col_ind, col_ind, nnz);
   GPUAllocAndCopy(d_dia_ptr, dia_ptr, nPointDomain);
-
+  GPUVectorAllocAndCopy(d_partition_offsets, geometry->partitionOffsets, geometry->nPartition);
+
   if (needTranspPtr) col_ptr = geometry->GetTransposeSparsePatternMap(type).data();
 
   if (type == ConnectivityType::FiniteVolume) {