Addressed changes in PR 2539

digvijay-y · digvijay-y · commit c3b806209ce5 · 2026-03-08T13:55:47.000+05:30
diff --git a/Common/include/CConfig.hpp b/Common/include/CConfig.hpp
@@ -521,7 +521,6 @@ class CConfig {
   Kind_Gradient_Method_Recon,      /*!< \brief Numerical method for computation of spatial gradients used for upwind reconstruction. */
   Kind_Deform_Linear_Solver,             /*!< Numerical method to deform the grid */
   Kind_Deform_Linear_Solver_Prec,        /*!< \brief Preconditioner of the linear solver. */
-  Kind_Graph_Part_Algo,    /*!< \brief Algorithm for parallel partitioning of the matrix graph. */
   Kind_Linear_Solver,                    /*!< \brief Numerical solver for the implicit scheme. */
   Kind_Linear_Solver_Prec,               /*!< \brief Preconditioner of the linear solver. */
   Kind_DiscAdj_Linear_Solver,            /*!< \brief Linear solver for the discrete adjoint system. */
@@ -538,6 +537,8 @@ class CConfig {
   Kind_TimeStep_Heat,           /*!< \brief Time stepping method for the (fvm) heat equation. */
   n_Datadriven_files;
 
+  ENUM_GRAPH_PART_ALGORITHM Kind_Graph_Part_Algo; /*!< \brief Algorithm for parallel partitioning of the matrix graph. */
+
   DataDrivenFluid_ParsedOptions datadriven_ParsedOptions; /*!< \brief Options for data-driven fluid analysis. */
 
   STRUCT_TIME_INT Kind_TimeIntScheme_FEA;    /*!< \brief Time integration for the FEA equations. */
@@ -4167,7 +4168,7 @@ class CConfig {
    * \brief Get the type of algorithm used for partitioning the matrix graph.
    * \return Algorithm that divides the matrix into partitions that are executed parallely.
    */
-  unsigned short GetKind_Graph_Part_Algo(void) const { return Kind_Graph_Part_Algo; }
+  ENUM_GRAPH_PART_ALGORITHM GetKind_Graph_Part_Algo(void) const { return Kind_Graph_Part_Algo; }
 
   /*!
    * \brief Get the kind of solver for the implicit solver.
@@ -4236,12 +4237,6 @@ class CConfig {
    */
   unsigned short GetCuda_Block_Size(void) const { return Cuda_Block_Size; }
 
-  /*!
-   * \brief Get the number of matrix rows assigned per CUDA Block.
-   * \return The number of matrix rows assigned per CUDA Block.
-   */
-  unsigned short GetRows_Per_Cuda_Block(void) const { return cudaKernelParameters::rounded_up_division(cudaKernelParameters::CUDA_WARP_SIZE, Cuda_Block_Size); }
-
   /*!
    * \brief Get the relaxation factor for solution updates of adjoint solvers.
    */
diff --git a/Common/include/geometry/CGeometry.hpp b/Common/include/geometry/CGeometry.hpp
@@ -260,7 +260,7 @@ class CGeometry {
   unsigned long* nPointCumulative{nullptr}; /*!< \brief Cumulative storage array containing the total number of points
                                                on all prior ranks in the linear partitioning. */
 
-  unsigned long nPartition;       /*!< \brief Number of divisions of the matrix graph during execution of parallel
+  unsigned long nColor;       /*!< \brief Number of divisions of the matrix graph during execution of parallel
                                      partitioning algorithms. */
   unsigned long maxPartitionSize; /*!< \brief Size of the level with the maximum number of elements. */
   vector<unsigned long>
diff --git a/Common/include/linear_algebra/CGraphPartitioning.hpp b/Common/include/linear_algebra/CGraphPartitioning.hpp
@@ -59,7 +59,6 @@ template <class ScalarType>
 CGraphPartitioning<ScalarType>::~CGraphPartitioning() {}
 
 template <class ScalarType>
-
 class CLevelScheduling final : public CGraphPartitioning<ScalarType> {
  private:
   ScalarType nPointDomain;
@@ -81,15 +80,13 @@ class CLevelScheduling final : public CGraphPartitioning<ScalarType> {
     maxLevelWidth = 0ul;
   }
 
-  CLevelScheduling() = delete;  // Removing default constructor
-
   /*!
    * \brief Divides the levels into groups of chains depending on the preset GPU block and warp size.
    * \param[in] levelOffsets - Represents the vector array containing the ordered list of starting rows of each level.
    * \param[in] chainPtr - Represents the vector array containing the ordered list of starting levels of each chain.
    * \param[in] rowsPerBlock - Represents the maximum number of rows that can be accomodated per CUDA block.
    */
-  void CalculateChain(vector<ScalarType> levelOffsets, vector<ScalarType>& chainPtr, unsigned short rowsPerBlock) {
+  void CalculateChain(const vector<ScalarType>& levelOffsets, vector<ScalarType>& chainPtr, unsigned short rowsPerBlock) {
     ScalarType levelWidth = 0;
 
     /*This is not a magic number. We are simply initializing
@@ -115,34 +112,33 @@ class CLevelScheduling final : public CGraphPartitioning<ScalarType> {
   /*!
    * \brief Reorders the points according to the levels
    * \param[in] pointList - Ordered array that contains the list of all mesh points.
-   * \param[in] inversePointList - Array utilized to access the index of each point in pointList.
    * \param[in] levelOffsets - Vector array containing the ordered list of starting rows of each level.
+   * \param[out] reorderedPointList - Reordered list of points after applying level scheduling.
    */
-  void Reorder(vector<ScalarType>& pointList, vector<ScalarType>& inversePointList, vector<ScalarType> levelOffsets) {
+  void Reorder(const vector<ScalarType>& pointList, const vector<ScalarType>& levelOffsets,
+               vector<ScalarType>& reorderedPointList) {
+    auto levelOffsetsCursor = levelOffsets;
+
     for (auto localPoint = 0ul; localPoint < nPointDomain; ++localPoint) {
       const auto globalPoint = pointList[localPoint];
-      inversePointList[levelOffsets[levels[localPoint]]++] = globalPoint;
+      reorderedPointList[levelOffsetsCursor[levels[localPoint]]++] = globalPoint;
     }
-
-    pointList = std::move(inversePointList);
   }
 
   /*!
    * \brief Reorders the points according to the levels
-   * \param[in] pointList - Ordered array that contains the list  of all mesh points.
+   * \param[in,out] pointList - Ordered array that contains the list of all mesh points.
    * \param[in] levelOffsets - Vector array containing the ordered list of starting rows of each level.
    * \param[in] chainPtr - Represents the vector array containing the ordered list of starting levels of each chain.
    * \param[in] rowsPerBlock - Represents the maximum number of rows that can be accomodated per CUDA block.
    */
   void Partition(vector<ScalarType>& pointList, vector<ScalarType>& levelOffsets, vector<ScalarType>& chainPtr,
                  unsigned short rowsPerBlock) override {
-    vector<ScalarType> inversePointList;
-    inversePointList.reserve(nPointDomain);
-    levels.reserve(nPointDomain);
+    vector<ScalarType> inversePointList(nPointDomain);
+    levels.resize(nPointDomain, 0ul);
 
     for (auto point = 0ul; point < nPointDomain; point++) {
       inversePointList[pointList[point]] = point;
-      levels[point] = 0;
     }
 
     //  Local Point - Ordering of the points post the RCM ordering
@@ -175,7 +171,8 @@ class CLevelScheduling final : public CGraphPartitioning<ScalarType> {
       levelOffsets[iLevel] += levelOffsets[iLevel - 1];
     }
 
-    Reorder(pointList, inversePointList, levelOffsets);
+    Reorder(pointList, levelOffsets, inversePointList);
+    pointList = std::move(inversePointList);
 
     CalculateChain(levelOffsets, chainPtr, rowsPerBlock);
   }
diff --git a/Common/include/linear_algebra/CLinearAlgebraUtils.hpp b/Common/include/linear_algebra/CLinearAlgebraUtils.hpp
@@ -0,0 +1,40 @@
+/*! 
+ * \file CLinearAlgebraUtils.hpp
+ * \brief Utility helpers for linear algebra modules.
+ * \author SU2 Contributors
+ * \version 8.2.0 "Harrier"
+ *
+ * SU2 Project Website: https://su2code.github.io
+ *
+ * The SU2 Project is maintained by the SU2 Foundation
+ * (http://su2foundation.org)
+ *
+ * Copyright 2012-2025, SU2 Contributors (cf. AUTHORS.md)
+ *
+ * SU2 is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * SU2 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with SU2. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "../parallelization/omp_structure.hpp"
+#include "../option_structure.hpp"
+
+namespace LinearAlgebraUtils {
+
+/*! \brief Rows handled per CUDA block, i.e. warps per block: ceil(cudaBlockSize / CUDA_WARP_SIZE). */
+inline unsigned short ComputeRowsPerCudaBlock(unsigned short cudaBlockSize) {
+  return static_cast<unsigned short>(roundUpDiv(size_t{cudaBlockSize}, size_t{cudaKernelParameters::CUDA_WARP_SIZE}));
+}
+
+}  // namespace LinearAlgebraUtils
diff --git a/Common/include/linear_algebra/CPreconditioner.hpp b/Common/include/linear_algebra/CPreconditioner.hpp
@@ -208,12 +208,10 @@ class CLU_SGSPreconditioner final : public CPreconditioner<ScalarType> {
 #ifdef HAVE_CUDA
     if (config->GetCUDA()) {
       sparse_matrix.GPUComputeLU_SGSPreconditioner(u, v, geometry, config);
-    } else {
-      sparse_matrix.ComputeLU_SGSPreconditioner(u, v, geometry, config);
+      return;
     }
-#else
-    sparse_matrix.ComputeLU_SGSPreconditioner(u, v, geometry, config);
 #endif
+    sparse_matrix.ComputeLU_SGSPreconditioner(u, v, geometry, config);
   }
 };
 
diff --git a/Common/include/linear_algebra/GPUComms.cuh b/Common/include/linear_algebra/GPUComms.cuh
@@ -31,16 +31,16 @@
 #include<iostream>
 #include "../option_structure.hpp"
 
-/*!
- * \struct matrixParameters
+/*! 
+ * \struct MatrixParameters
  * \brief Structure containing information related to the Jacobian Matrix which is utilized by any launched Kernel.
  *
  *  This implementation alleviates the need to pass an excessive number of arguments
  *  to a Kernel and, instead, packages it into a single structure. While this leads
  *  to data duplication for a short period of time, this is a much cleaner and resuable approach.
  * \author A. Raj
  */
-struct matrixParameters{
+struct MatrixParameters {
 
   public:
     unsigned long totalRows;        /*!< \brief Contains the total number of rows of the Jacbian Matrix. */
@@ -54,7 +54,7 @@ struct matrixParameters{
     unsigned short rowsPerBlock;     /*!< \brief Number of rows being processed by each thread block. This is equal to the number
                                         of warps present in the block as each row gets assigned a warp. */
 
-    matrixParameters(unsigned long nPointDomain, unsigned long nEqn, unsigned long nVar, unsigned long nPartitions, unsigned short rowsPrBlck){
+    MatrixParameters(unsigned long nPointDomain, unsigned long nEqn, unsigned long nVar, unsigned long nPartitions, unsigned short rowsPrBlck){
       totalRows = nPointDomain;
       blockRowSize = nEqn;
       blockColSize = nVar;
diff --git a/Common/include/option_structure.hpp b/Common/include/option_structure.hpp
@@ -78,16 +78,6 @@ enum class SU2_COMPONENT {
  */
 namespace cudaKernelParameters{
 
-  /*!
-   * \brief Returns the rounded up value of the decimal quotient to the next integer (in all cases).
-   */
-  inline unsigned int rounded_up_division(int divisor, int dividend) { return ((dividend + divisor - 1) / divisor); }
-
-  /*!
-   * \brief Returns the rounded down value of the decimal quotient to the previous integer (in all cases).
-   */
-  inline unsigned int rounded_down_division(int divisor, int dividend) { return ((dividend - divisor + 1) / divisor); }
-
   static constexpr short CUDA_WARP_SIZE = 32;  /*!< \brief Outlines the numbers of threads per warp for a CUDA GPU. */
 }
 
@@ -2368,11 +2358,11 @@ static const MapType<std::string, ENUM_FFD_BLENDING> Blending_Map = {
 /*!
  * \brief Types of graph partitioning algorithms for parallel computing
  */
-enum ENUM_GRAPH_PART_ALGORITHM {
+enum class ENUM_GRAPH_PART_ALGORITHM {
   LEVEL_SCHEDULING,   /*!< \brief  Partitions the graphs according to level-set algorithm. */
 };
 static const MapType<std::string, ENUM_GRAPH_PART_ALGORITHM> Graph_Part_Map = {
-  MakePair("LEVEL_SCHEDULING", LEVEL_SCHEDULING)
+  MakePair("LEVEL_SCHEDULING", ENUM_GRAPH_PART_ALGORITHM::LEVEL_SCHEDULING)
 };
 
 /*!
diff --git a/Common/src/CConfig.cpp b/Common/src/CConfig.cpp
@@ -1862,7 +1862,8 @@ void CConfig::SetConfig_Options() {
 
   /*!\brief GRAPH_PARTIONING
    *  \n DESCRIPTION: Algorithm for partioning the matrix graph to facilitate parallel execution of linear algebra subroutines\n OPTIONS: see \link Graph_Part_Map \endlink \n DEFAULT: LEVEL_SCHEDULING \ingroup Config*/
-  addEnumOption("GRAPH_PART_ALGORITHM", Kind_Graph_Part_Algo, Graph_Part_Map, LEVEL_SCHEDULING);
+  addEnumOption("GRAPH_PART_ALGORITHM", Kind_Graph_Part_Algo, Graph_Part_Map,
+                ENUM_GRAPH_PART_ALGORITHM::LEVEL_SCHEDULING);
   /*!\brief LINEAR_SOLVER
    *  \n DESCRIPTION: Linear solver for the implicit, mesh deformation, or discrete adjoint systems \n OPTIONS: see \link Linear_Solver_Map \endlink \n DEFAULT: FGMRES \ingroup Config*/
   addEnumOption("LINEAR_SOLVER", Kind_Linear_Solver, Linear_Solver_Map, FGMRES);
diff --git a/Common/src/geometry/CPhysicalGeometry.cpp b/Common/src/geometry/CPhysicalGeometry.cpp
@@ -27,6 +27,7 @@
 
 #include "../../include/geometry/CPhysicalGeometry.hpp"
 #include "../../include/linear_algebra/CGraphPartitioning.hpp"
+#include "../../include/linear_algebra/CLinearAlgebraUtils.hpp"
 #include "../../include/adt/CADTPointsOnlyClass.hpp"
 #include "../../include/toolboxes/printing_toolbox.hpp"
 #include "../../include/toolboxes/CLinearPartitioner.hpp"
@@ -702,14 +703,15 @@ void CPhysicalGeometry::DistributeColoring(const CConfig* config, CGeometry* geo
 
 template <class ScalarType>
 void CPhysicalGeometry::PartitionGraph(const CConfig* config, vector<ScalarType>& pointList) {
-  unsigned short KindAlgorithm = config->GetKind_Graph_Part_Algo();
+  auto KindAlgorithm = config->GetKind_Graph_Part_Algo();
   partitionOffsets.reserve(nPointDomain);
 
   switch (KindAlgorithm) {
-    case LEVEL_SCHEDULING:
+    case ENUM_GRAPH_PART_ALGORITHM::LEVEL_SCHEDULING:
       auto levelSchedule = CLevelScheduling<ScalarType>(nPointDomain, nodes);
-      levelSchedule.Partition(pointList, partitionOffsets, chainPtr, config->GetRows_Per_Cuda_Block());
-      nPartition = levelSchedule.nLevels;
+      levelSchedule.Partition(pointList, partitionOffsets, chainPtr,
+                              LinearAlgebraUtils::ComputeRowsPerCudaBlock(config->GetCuda_Block_Size()));
+      nColor = levelSchedule.nLevels;
       maxPartitionSize = levelSchedule.maxLevelWidth;
       break;
   }
diff --git a/Common/src/linear_algebra/CSysMatrix.cpp b/Common/src/linear_algebra/CSysMatrix.cpp
@@ -165,7 +165,7 @@ void CSysMatrix<ScalarType>::Initialize(unsigned long npoint, unsigned long npoi
     GPUAllocAndCopy(d_row_ptr, row_ptr, (nPointDomain + 1.0));
     GPUAllocAndCopy(d_col_ind, col_ind, nnz);
     GPUAllocAndCopy(d_dia_ptr, dia_ptr, nPointDomain);
-    GPUVectorAllocAndCopy(d_partition_offsets, geometry->partitionOffsets, geometry->nPartition + 1);
+    GPUVectorAllocAndCopy(d_partition_offsets, geometry->partitionOffsets, geometry->nColor + 1);
   }
 
   if (needTranspPtr) col_ptr = geometry->GetTransposeSparsePatternMap(type).data();
diff --git a/Common/src/linear_algebra/CSysMatrixGPU.cu b/Common/src/linear_algebra/CSysMatrixGPU.cu

Original file line number	Diff line number	Diff line change
`@@ -208,12 +208,10 @@ class CLU_SGSPreconditioner final : public CPreconditioner<ScalarType> {`
`208`	`208`	`#ifdef HAVE_CUDA`
`209`	`209`	`if (config->GetCUDA()) {`
`210`	`210`	`sparse_matrix.GPUComputeLU_SGSPreconditioner(u, v, geometry, config);`
`211`		`- } else {`
`212`		`- sparse_matrix.ComputeLU_SGSPreconditioner(u, v, geometry, config);`
	`211`	`+ return;`
`213`	`212`	`}`
`214`		`-#else`
`215`		`- sparse_matrix.ComputeLU_SGSPreconditioner(u, v, geometry, config);`
`216`	`213`	`#endif`
	`214`	`+ sparse_matrix.ComputeLU_SGSPreconditioner(u, v, geometry, config);`
`217`	`215`	`}`
`218`	`216`	`};`
`219`	`217`
Original file line number	Diff line number	Diff line change
`@@ -165,7 +165,7 @@ void CSysMatrix<ScalarType>::Initialize(unsigned long npoint, unsigned long npoi`
`165`	`165`	`GPUAllocAndCopy(d_row_ptr, row_ptr, (nPointDomain + 1.0));`
`166`	`166`	`GPUAllocAndCopy(d_col_ind, col_ind, nnz);`
`167`	`167`	`GPUAllocAndCopy(d_dia_ptr, dia_ptr, nPointDomain);`
`168`		`- GPUVectorAllocAndCopy(d_partition_offsets, geometry->partitionOffsets, geometry->nPartition + 1);`
	`168`	`+ GPUVectorAllocAndCopy(d_partition_offsets, geometry->partitionOffsets, geometry->nColor + 1);`
`169`	`169`	`}`
`170`	`170`
`171`	`171`	`if (needTranspPtr) col_ptr = geometry->GetTransposeSparsePatternMap(type).data();`