Skip to content

Commit c3b8062

Browse files
committed
Addressed changes in PR 2539
1 parent 352e148 commit c3b8062

11 files changed

Lines changed: 138 additions & 81 deletions

File tree

Common/include/CConfig.hpp

Lines changed: 3 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -521,7 +521,6 @@ class CConfig {
521521
Kind_Gradient_Method_Recon, /*!< \brief Numerical method for computation of spatial gradients used for upwind reconstruction. */
522522
Kind_Deform_Linear_Solver, /*!< Numerical method to deform the grid */
523523
Kind_Deform_Linear_Solver_Prec, /*!< \brief Preconditioner of the linear solver. */
524-
Kind_Graph_Part_Algo, /*!< \brief Algorithm for parallel partitioning of the matrix graph. */
525524
Kind_Linear_Solver, /*!< \brief Numerical solver for the implicit scheme. */
526525
Kind_Linear_Solver_Prec, /*!< \brief Preconditioner of the linear solver. */
527526
Kind_DiscAdj_Linear_Solver, /*!< \brief Linear solver for the discrete adjoint system. */
@@ -538,6 +537,8 @@ class CConfig {
538537
Kind_TimeStep_Heat, /*!< \brief Time stepping method for the (fvm) heat equation. */
539538
n_Datadriven_files;
540539

540+
ENUM_GRAPH_PART_ALGORITHM Kind_Graph_Part_Algo; /*!< \brief Algorithm for parallel partitioning of the matrix graph. */
541+
541542
DataDrivenFluid_ParsedOptions datadriven_ParsedOptions; /*!< \brief Options for data-driven fluid analysis. */
542543

543544
STRUCT_TIME_INT Kind_TimeIntScheme_FEA; /*!< \brief Time integration for the FEA equations. */
@@ -4167,7 +4168,7 @@ class CConfig {
41674168
* \brief Get the type of algorithm used for partitioning the matrix graph.
41684169
* \return Algorithm that divides the matrix into partitions that are executed in parallel.
41694170
*/
4170-
unsigned short GetKind_Graph_Part_Algo(void) const { return Kind_Graph_Part_Algo; }
4171+
ENUM_GRAPH_PART_ALGORITHM GetKind_Graph_Part_Algo(void) const { return Kind_Graph_Part_Algo; }
41714172

41724173
/*!
41734174
* \brief Get the kind of solver for the implicit solver.
@@ -4236,12 +4237,6 @@ class CConfig {
42364237
*/
42374238
unsigned short GetCuda_Block_Size(void) const { return Cuda_Block_Size; }
42384239

4239-
/*!
4240-
* \brief Get the number of matrix rows assigned per CUDA Block.
4241-
* \return The number of matrix rows assigned per CUDA Block.
4242-
*/
4243-
unsigned short GetRows_Per_Cuda_Block(void) const { return cudaKernelParameters::rounded_up_division(cudaKernelParameters::CUDA_WARP_SIZE, Cuda_Block_Size); }
4244-
42454240
/*!
42464241
* \brief Get the relaxation factor for solution updates of adjoint solvers.
42474242
*/

Common/include/geometry/CGeometry.hpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -260,7 +260,7 @@ class CGeometry {
260260
unsigned long* nPointCumulative{nullptr}; /*!< \brief Cumulative storage array containing the total number of points
261261
on all prior ranks in the linear partitioning. */
262262

263-
unsigned long nPartition; /*!< \brief Number of divisions of the matrix graph during execution of parallel
263+
unsigned long nColor; /*!< \brief Number of divisions of the matrix graph during execution of parallel
264264
partitioning algorithms. */
265265
unsigned long maxPartitionSize; /*!< \brief Size of the level with the maximum number of elements. */
266266
vector<unsigned long>

Common/include/linear_algebra/CGraphPartitioning.hpp

Lines changed: 12 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,6 @@ template <class ScalarType>
5959
CGraphPartitioning<ScalarType>::~CGraphPartitioning() {}
6060

6161
template <class ScalarType>
62-
6362
class CLevelScheduling final : public CGraphPartitioning<ScalarType> {
6463
private:
6564
ScalarType nPointDomain;
@@ -81,15 +80,13 @@ class CLevelScheduling final : public CGraphPartitioning<ScalarType> {
8180
maxLevelWidth = 0ul;
8281
}
8382

84-
CLevelScheduling() = delete; // Removing default constructor
85-
8683
/*!
8784
* \brief Divides the levels into groups of chains depending on the preset GPU block and warp size.
8885
* \param[in] levelOffsets - Represents the vector array containing the ordered list of starting rows of each level.
8986
* \param[in] chainPtr - Represents the vector array containing the ordered list of starting levels of each chain.
9087
* \param[in] rowsPerBlock - Represents the maximum number of rows that can be accommodated per CUDA block.
9188
*/
92-
void CalculateChain(vector<ScalarType> levelOffsets, vector<ScalarType>& chainPtr, unsigned short rowsPerBlock) {
89+
void CalculateChain(const vector<ScalarType>& levelOffsets, vector<ScalarType>& chainPtr, unsigned short rowsPerBlock) {
9390
ScalarType levelWidth = 0;
9491

9592
/*This is not a magic number. We are simply initializing
@@ -115,34 +112,33 @@ class CLevelScheduling final : public CGraphPartitioning<ScalarType> {
115112
/*!
116113
* \brief Reorders the points according to the levels
117114
* \param[in] pointList - Ordered array that contains the list of all mesh points.
118-
* \param[in] inversePointList - Array utilized to access the index of each point in pointList.
119115
* \param[in] levelOffsets - Vector array containing the ordered list of starting rows of each level.
116+
* \param[out] reorderedPointList - Reordered list of points after applying level scheduling.
120117
*/
121-
void Reorder(vector<ScalarType>& pointList, vector<ScalarType>& inversePointList, vector<ScalarType> levelOffsets) {
118+
void Reorder(const vector<ScalarType>& pointList, const vector<ScalarType>& levelOffsets,
119+
vector<ScalarType>& reorderedPointList) {
120+
auto levelOffsetsCursor = levelOffsets;
121+
122122
for (auto localPoint = 0ul; localPoint < nPointDomain; ++localPoint) {
123123
const auto globalPoint = pointList[localPoint];
124-
inversePointList[levelOffsets[levels[localPoint]]++] = globalPoint;
124+
reorderedPointList[levelOffsetsCursor[levels[localPoint]]++] = globalPoint;
125125
}
126-
127-
pointList = std::move(inversePointList);
128126
}
129127

130128
/*!
131129
* \brief Reorders the points according to the levels
132-
* \param[in] pointList - Ordered array that contains the list of all mesh points.
130+
* \param[in,out] pointList - Ordered array that contains the list of all mesh points.
133131
* \param[in] levelOffsets - Vector array containing the ordered list of starting rows of each level.
134132
* \param[in] chainPtr - Represents the vector array containing the ordered list of starting levels of each chain.
135133
* \param[in] rowsPerBlock - Represents the maximum number of rows that can be accommodated per CUDA block.
136134
*/
137135
void Partition(vector<ScalarType>& pointList, vector<ScalarType>& levelOffsets, vector<ScalarType>& chainPtr,
138136
unsigned short rowsPerBlock) override {
139-
vector<ScalarType> inversePointList;
140-
inversePointList.reserve(nPointDomain);
141-
levels.reserve(nPointDomain);
137+
vector<ScalarType> inversePointList(nPointDomain);
138+
levels.resize(nPointDomain, 0ul);
142139

143140
for (auto point = 0ul; point < nPointDomain; point++) {
144141
inversePointList[pointList[point]] = point;
145-
levels[point] = 0;
146142
}
147143

148144
// Local Point - Ordering of the points post the RCM ordering
@@ -175,7 +171,8 @@ class CLevelScheduling final : public CGraphPartitioning<ScalarType> {
175171
levelOffsets[iLevel] += levelOffsets[iLevel - 1];
176172
}
177173

178-
Reorder(pointList, inversePointList, levelOffsets);
174+
Reorder(pointList, levelOffsets, inversePointList);
175+
pointList = std::move(inversePointList);
179176

180177
CalculateChain(levelOffsets, chainPtr, rowsPerBlock);
181178
}
Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
/*!
2+
* \file CLinearAlgebraUtils.hpp
3+
* \brief Utility helpers for linear algebra modules.
4+
* \author SU2 Contributors
5+
* \version 8.2.0 "Harrier"
6+
*
7+
* SU2 Project Website: https://su2code.github.io
8+
*
9+
* The SU2 Project is maintained by the SU2 Foundation
10+
* (http://su2foundation.org)
11+
*
12+
* Copyright 2012-2025, SU2 Contributors (cf. AUTHORS.md)
13+
*
14+
* SU2 is free software; you can redistribute it and/or
15+
* modify it under the terms of the GNU Lesser General Public
16+
* License as published by the Free Software Foundation; either
17+
* version 2.1 of the License, or (at your option) any later version.
18+
*
19+
* SU2 is distributed in the hope that it will be useful,
20+
* but WITHOUT ANY WARRANTY; without even the implied warranty of
21+
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
22+
* Lesser General Public License for more details.
23+
*
24+
* You should have received a copy of the GNU Lesser General Public
25+
* License along with SU2. If not, see <http://www.gnu.org/licenses/>.
26+
*/
27+
28+
#pragma once
29+
30+
#include "../parallelization/omp_structure.hpp"
31+
#include "../option_structure.hpp"
32+
33+
namespace LinearAlgebraUtils {
34+
35+
inline unsigned short ComputeRowsPerCudaBlock(unsigned short cudaBlockSize) {
36+
return static_cast<unsigned short>(
37+
roundUpDiv(static_cast<size_t>(cudaKernelParameters::CUDA_WARP_SIZE), static_cast<size_t>(cudaBlockSize)));
38+
}
39+
40+
} // namespace LinearAlgebraUtils

Common/include/linear_algebra/CPreconditioner.hpp

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -208,12 +208,10 @@ class CLU_SGSPreconditioner final : public CPreconditioner<ScalarType> {
208208
#ifdef HAVE_CUDA
209209
if (config->GetCUDA()) {
210210
sparse_matrix.GPUComputeLU_SGSPreconditioner(u, v, geometry, config);
211-
} else {
212-
sparse_matrix.ComputeLU_SGSPreconditioner(u, v, geometry, config);
211+
return;
213212
}
214-
#else
215-
sparse_matrix.ComputeLU_SGSPreconditioner(u, v, geometry, config);
216213
#endif
214+
sparse_matrix.ComputeLU_SGSPreconditioner(u, v, geometry, config);
217215
}
218216
};
219217

Common/include/linear_algebra/GPUComms.cuh

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -31,16 +31,16 @@
3131
#include<iostream>
3232
#include "../option_structure.hpp"
3333

34-
/*!
35-
* \struct matrixParameters
34+
/*!
35+
* \struct MatrixParameters
3636
* \brief Structure containing information related to the Jacobian Matrix which is utilized by any launched Kernel.
3737
*
3838
* This implementation alleviates the need to pass an excessive number of arguments
3939
* to a Kernel and, instead, packages it into a single structure. While this leads
4040
* to data duplication for a short period of time, this is a much cleaner and reusable approach.
4141
* \author A. Raj
4242
*/
43-
struct matrixParameters{
43+
struct MatrixParameters {
4444

4545
public:
4646
unsigned long totalRows; /*!< \brief Contains the total number of rows of the Jacobian Matrix. */
@@ -54,7 +54,7 @@ struct matrixParameters{
5454
unsigned short rowsPerBlock; /*!< \brief Number of rows being processed by each thread block. This is equal to the number
5555
of warps present in the block as each row gets assigned a warp. */
5656

57-
matrixParameters(unsigned long nPointDomain, unsigned long nEqn, unsigned long nVar, unsigned long nPartitions, unsigned short rowsPrBlck){
57+
MatrixParameters(unsigned long nPointDomain, unsigned long nEqn, unsigned long nVar, unsigned long nPartitions, unsigned short rowsPrBlck){
5858
totalRows = nPointDomain;
5959
blockRowSize = nEqn;
6060
blockColSize = nVar;

Common/include/option_structure.hpp

Lines changed: 2 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -78,16 +78,6 @@ enum class SU2_COMPONENT {
7878
*/
7979
namespace cudaKernelParameters{
8080

81-
/*!
82-
* \brief Returns the rounded up value of the decimal quotient to the next integer (in all cases).
83-
*/
84-
inline unsigned int rounded_up_division(int divisor, int dividend) { return ((dividend + divisor - 1) / divisor); }
85-
86-
/*!
87-
* \brief Returns the rounded down value of the decimal quotient to the previous integer (in all cases).
88-
*/
89-
inline unsigned int rounded_down_division(int divisor, int dividend) { return ((dividend - divisor + 1) / divisor); }
90-
9181
static constexpr short CUDA_WARP_SIZE = 32; /*!< \brief Outlines the numbers of threads per warp for a CUDA GPU. */
9282
}
9383

@@ -2368,11 +2358,11 @@ static const MapType<std::string, ENUM_FFD_BLENDING> Blending_Map = {
23682358
/*!
23692359
* \brief Types of graph partitioning algorithms for parallel computing
23702360
*/
2371-
enum ENUM_GRAPH_PART_ALGORITHM {
2361+
enum class ENUM_GRAPH_PART_ALGORITHM {
23722362
LEVEL_SCHEDULING, /*!< \brief Partitions the graphs according to level-set algorithm. */
23732363
};
23742364
static const MapType<std::string, ENUM_GRAPH_PART_ALGORITHM> Graph_Part_Map = {
2375-
MakePair("LEVEL_SCHEDULING", LEVEL_SCHEDULING)
2365+
MakePair("LEVEL_SCHEDULING", ENUM_GRAPH_PART_ALGORITHM::LEVEL_SCHEDULING)
23762366
};
23772367

23782368
/*!

Common/src/CConfig.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1862,7 +1862,8 @@ void CConfig::SetConfig_Options() {
18621862

18631863
/*!\brief GRAPH_PART_ALGORITHM
18641864
* \n DESCRIPTION: Algorithm for partitioning the matrix graph to facilitate parallel execution of linear algebra subroutines\n OPTIONS: see \link Graph_Part_Map \endlink \n DEFAULT: LEVEL_SCHEDULING \ingroup Config*/
1865-
addEnumOption("GRAPH_PART_ALGORITHM", Kind_Graph_Part_Algo, Graph_Part_Map, LEVEL_SCHEDULING);
1865+
addEnumOption("GRAPH_PART_ALGORITHM", Kind_Graph_Part_Algo, Graph_Part_Map,
1866+
ENUM_GRAPH_PART_ALGORITHM::LEVEL_SCHEDULING);
18661867
/*!\brief LINEAR_SOLVER
18671868
* \n DESCRIPTION: Linear solver for the implicit, mesh deformation, or discrete adjoint systems \n OPTIONS: see \link Linear_Solver_Map \endlink \n DEFAULT: FGMRES \ingroup Config*/
18681869
addEnumOption("LINEAR_SOLVER", Kind_Linear_Solver, Linear_Solver_Map, FGMRES);

Common/src/geometry/CPhysicalGeometry.cpp

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727

2828
#include "../../include/geometry/CPhysicalGeometry.hpp"
2929
#include "../../include/linear_algebra/CGraphPartitioning.hpp"
30+
#include "../../include/linear_algebra/CLinearAlgebraUtils.hpp"
3031
#include "../../include/adt/CADTPointsOnlyClass.hpp"
3132
#include "../../include/toolboxes/printing_toolbox.hpp"
3233
#include "../../include/toolboxes/CLinearPartitioner.hpp"
@@ -702,14 +703,15 @@ void CPhysicalGeometry::DistributeColoring(const CConfig* config, CGeometry* geo
702703

703704
template <class ScalarType>
704705
void CPhysicalGeometry::PartitionGraph(const CConfig* config, vector<ScalarType>& pointList) {
705-
unsigned short KindAlgorithm = config->GetKind_Graph_Part_Algo();
706+
auto KindAlgorithm = config->GetKind_Graph_Part_Algo();
706707
partitionOffsets.reserve(nPointDomain);
707708

708709
switch (KindAlgorithm) {
709-
case LEVEL_SCHEDULING:
710+
case ENUM_GRAPH_PART_ALGORITHM::LEVEL_SCHEDULING:
710711
auto levelSchedule = CLevelScheduling<ScalarType>(nPointDomain, nodes);
711-
levelSchedule.Partition(pointList, partitionOffsets, chainPtr, config->GetRows_Per_Cuda_Block());
712-
nPartition = levelSchedule.nLevels;
712+
levelSchedule.Partition(pointList, partitionOffsets, chainPtr,
713+
LinearAlgebraUtils::ComputeRowsPerCudaBlock(config->GetCuda_Block_Size()));
714+
nColor = levelSchedule.nLevels;
713715
maxPartitionSize = levelSchedule.maxLevelWidth;
714716
break;
715717
}

Common/src/linear_algebra/CSysMatrix.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -165,7 +165,7 @@ void CSysMatrix<ScalarType>::Initialize(unsigned long npoint, unsigned long npoi
165165
GPUAllocAndCopy(d_row_ptr, row_ptr, (nPointDomain + 1.0));
166166
GPUAllocAndCopy(d_col_ind, col_ind, nnz);
167167
GPUAllocAndCopy(d_dia_ptr, dia_ptr, nPointDomain);
168-
GPUVectorAllocAndCopy(d_partition_offsets, geometry->partitionOffsets, geometry->nPartition + 1);
168+
GPUVectorAllocAndCopy(d_partition_offsets, geometry->partitionOffsets, geometry->nColor + 1);
169169
}
170170

171171
if (needTranspPtr) col_ptr = geometry->GetTransposeSparsePatternMap(type).data();

0 commit comments

Comments
 (0)