Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
79c9e52
updating branch
areenraj Jun 10, 2025
6686171
updating branch
areenraj Jun 10, 2025
b7591f4
Major commit: graph partitioning algorithms, level scheduling method,…
areenraj Jun 25, 2025
6f51f45
Major commit: graph partitioning algorithms, level scheduling method,…
areenraj Jun 25, 2025
cdf0225
resolving conflicts
areenraj Jun 25, 2025
9dd3028
resolving conflicts
areenraj Jun 25, 2025
2495edd
resolving some more conflicts
areenraj Jun 25, 2025
f68358e
cleaning up
areenraj Jun 25, 2025
bf561b6
apologies for repeated commits, just cleaning
areenraj Jun 25, 2025
9121e25
coalesced memory access for MVP, shared memory addition and lamda fun…
areenraj Jun 29, 2025
991e29f
bug fixes
areenraj Jul 1, 2025
9bfeff9
Merge remote-tracking branch 'upstream/master'
areenraj Jul 3, 2025
0372099
Working GPU LU_SGS Preconditioner Port
areenraj Jul 15, 2025
52b90b6
Fixed the issue with the visibility of the rowsPerBlock variable. Als…
areenraj Jul 17, 2025
d367627
Working LU_SGS Preconditioner with graph partitioned algorithms, upda…
areenraj Jul 17, 2025
661c9b8
LU_SGS Preconditioner Port
areenraj Jul 17, 2025
b5cf7dd
Merge branch 'master' of https://github.com/areenraj/SU2_GSoC_GPU
areenraj Jul 17, 2025
c4dbe5c
Fixing warnings
areenraj Jul 17, 2025
b3d2fbf
Merge branch 'develop' of https://github.com/su2code/SU2
areenraj Jul 17, 2025
1be1e2f
Syncing repo to develop
areenraj Jul 17, 2025
7472bd1
updating submodule versions
areenraj Jul 17, 2025
352e148
Fixing some more warnings
areenraj Jul 17, 2025
c3b8062
Addresed changes in PR 2539
digvijay-y Mar 8, 2026
64df92e
Merge branch 'develop' into gpu-lusgs
digvijay-y Mar 8, 2026
340d2f9
WIP: local changes before merge
digvijay-y Mar 8, 2026
791a03e
Merge branch 'gpu-lusgs' of https://github.com/digvijay-y/SU2 into gp…
digvijay-y Mar 8, 2026
4ac7e97
ncolor -> nGraphPartition
digvijay-y Mar 8, 2026
b808fdc
Merge branch 'develop' into gpu-lusgs
digvijay-y Mar 9, 2026
b0bde3f
Merge branch 'develop' into gpu-lusgs
digvijay-y Mar 11, 2026
8cfb381
Merge branch 'develop' into gpu-lusgs
digvijay-y Apr 7, 2026
6001d61
Merge branch 'develop' into gpu-lusgs
digvijay-y Apr 11, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Major commit: graph partitioning algorithms, level scheduling method,…
… GPU Preconditioner framework
  • Loading branch information
areenraj committed Jun 25, 2025
commit 6f51f456a2e6904c3cad0d9bc7e8ff1d11a9978b
7 changes: 7 additions & 0 deletions Common/include/CConfig.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -521,6 +521,7 @@ class CConfig {
Kind_Gradient_Method_Recon, /*!< \brief Numerical method for computation of spatial gradients used for upwind reconstruction. */
Kind_Deform_Linear_Solver, /*!< Numerical method to deform the grid */
Kind_Deform_Linear_Solver_Prec, /*!< \brief Preconditioner of the linear solver. */
Kind_Graph_Part_Algo, /*!< \brief Algorithm for parallel partitioning of the matrix graph. */
Kind_Linear_Solver, /*!< \brief Numerical solver for the implicit scheme. */
Kind_Linear_Solver_Prec, /*!< \brief Preconditioner of the linear solver. */
Kind_DiscAdj_Linear_Solver, /*!< \brief Linear solver for the discrete adjoint system. */
Expand Down Expand Up @@ -4136,6 +4137,12 @@ class CConfig {
*/
bool GetLeastSquaresRequired(void) const { return LeastSquaresRequired; }

/*!
 * \brief Get the algorithm used to partition the matrix graph.
 * \return Kind of algorithm that splits the matrix into partitions which are executed in parallel.
 */
unsigned short GetKind_Graph_Part_Algo() const {
  return Kind_Graph_Part_Algo;
}

/*!
* \brief Get the kind of solver for the implicit solver.
* \return Numerical solver for implicit formulation (solving the linear system).
Expand Down
6 changes: 6 additions & 0 deletions Common/include/geometry/CGeometry.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -260,6 +260,12 @@ class CGeometry {
unsigned long* nPointCumulative{nullptr}; /*!< \brief Cumulative storage array containing the total number of points
on all prior ranks in the linear partitioning. */

unsigned long nPartition{0};       /*!< \brief Number of divisions of the matrix graph during execution of parallel
                                        partitioning algorithms. Zero until PartitionGraph has run. */
unsigned long maxPartitionSize{0}; /*!< \brief Size of the level with the maximum number of elements. Zero until
                                        PartitionGraph has run. */
vector<unsigned long>
    partitionOffsets; /*!< \brief Vector array containing the indices at which different parallel partitions begin. */

/*--- Data structures for point-to-point MPI communications. ---*/

int maxCountPerPoint{0}; /*!< \brief Maximum number of pieces of data sent per vertex in point-to-point comms. */
Expand Down
11 changes: 11 additions & 0 deletions Common/include/geometry/CPhysicalGeometry.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -148,6 +148,17 @@ class CPhysicalGeometry final : public CGeometry {
*/
void DistributeColoring(const CConfig* config, CGeometry* geometry);

/*!
* \brief Divide the graph produced by the matrix into parallel partitions.
* \param[in] config - Definition of the particular problem (selects the partitioning algorithm).
* \param[in,out] pointList - Ordered list of points in the mesh; taken by non-const reference and handed to the
*                partitioning algorithm, which may reorder it.
* \note The number of partitions, the size of the largest partition and the starting index of each partition are
*       not returned through parameters; they are stored in the geometry members nPartition, maxPartitionSize and
*       partitionOffsets.
*/
template <class ScalarType>
void PartitionGraph(const CConfig* config, vector<ScalarType>& pointList);

/*!
* \brief Distribute the grid points, including ghost points, across all ranks based on a ParMETIS coloring.
* \param[in] config - Definition of the particular problem.
Expand Down
11 changes: 7 additions & 4 deletions Common/include/linear_algebra/CMatrixVectorProduct.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -101,14 +101,17 @@ class CSysMatrixVectorProduct final : public CMatrixVectorProduct<ScalarType> {
* \param[out] v - CSysVector that is the result of the product
*/
inline void operator()(const CSysVector<ScalarType>& u, CSysVector<ScalarType>& v) const override {
  /*--- The run-time switch is checked first so that a GPU request made with a
        binary built without CUDA is reported instead of being silently ignored. ---*/
  if (config->GetCUDA()) {
#ifdef HAVE_CUDA
    matrix.GPUMatrixVectorProduct(u, v, geometry, config);
#else
    SU2_MPI::Error(
        "\nError in launching Matrix-Vector Product Function\nENABLE_CUDA is set to YES\nPlease compile with CUDA "
        "options enabled in Meson to access GPU Functions",
        CURRENT_FUNCTION);
#endif
  } else {
    /*--- CPU path. ---*/
    matrix.MatrixVectorProduct(u, v, geometry, config);
  }
}
};
12 changes: 5 additions & 7 deletions Common/include/linear_algebra/CPreconditioner.hpp
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This looks good. Did you test this once with the flag on or off, just to make sure?

Original file line number Diff line number Diff line change
Expand Up @@ -205,17 +205,15 @@ class CLU_SGSPreconditioner final : public CPreconditioner<ScalarType> {
* \param[out] v - CSysVector that is the result of the preconditioning.
*/
inline void operator()(const CSysVector<ScalarType>& u, CSysVector<ScalarType>& v) const override {
#ifdef HAVE_CUDA
  /*--- Built with CUDA: the config option selects the GPU or the CPU implementation. ---*/
  if (config->GetCUDA()) {
    sparse_matrix.GPUComputeLU_SGSPreconditioner(u, v, geometry, config);
  } else {
    sparse_matrix.ComputeLU_SGSPreconditioner(u, v, geometry, config);
  }
#else
  /*--- Built without CUDA: only the CPU implementation is available. ---*/
  sparse_matrix.ComputeLU_SGSPreconditioner(u, v, geometry, config);
#endif
}
};

Expand Down
5 changes: 3 additions & 2 deletions Common/include/linear_algebra/CSysMatrix.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,7 @@ class CSysMatrix {
const unsigned long* d_row_ptr; /*!< \brief Device Pointers to the first element in each row. */
const unsigned long* d_col_ind; /*!< \brief Device Column index for each of the elements in val(). */
const unsigned long* d_dia_ptr; /*!< \brief Device Column index for each of the elements in val(). */
unsigned long* d_partition_offsets = nullptr; /*!< \brief Device copy of the geometry's partition offsets; null until allocated so that freeing it is always safe. */

ScalarType* ILU_matrix; /*!< \brief Entries of the ILU sparse matrix. */
unsigned long nnz_ilu; /*!< \brief Number of possible nonzero entries in the matrix (ILU). */
Expand Down Expand Up @@ -865,8 +866,8 @@ class CSysMatrix {
* \param[in] vec - CSysVector to be multiplied by the preconditioner.
* \param[out] prod - Result of the product A*vec.
*/
void GPUComputeLU_SGSPreconditioner(const CSysVector<ScalarType>& vec, CSysVector<ScalarType>& prod, CGeometry* geometry,
const CConfig* config) const;
void GPUComputeLU_SGSPreconditioner(const CSysVector<ScalarType>& vec, CSysVector<ScalarType>& prod,
CGeometry* geometry, const CConfig* config) const;

/*!
* \brief Build the Jacobi preconditioner.
Expand Down
46 changes: 40 additions & 6 deletions Common/include/linear_algebra/GPUComms.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -28,17 +28,51 @@
#include<cuda_runtime.h>
#include"iostream"

namespace KernelParameters{
namespace kernelParameters{

/*Returns the quotient dividend / divisor rounded up to the next integer when the division is not exact.
  Note the argument order: the divisor comes first. Intended for a positive divisor and a non-negative dividend.*/
inline constexpr int rounded_up_division(const int divisor, int dividend) {
  return (dividend + divisor - 1) / divisor;
}

/*Returns the quotient dividend / divisor rounded down to the previous integer when the division is not exact.
  Note the argument order: the divisor comes first. Intended for a positive divisor and a non-negative dividend,
  for which integer division already truncates downwards (exact multiples return the exact quotient).*/
inline constexpr int rounded_down_division(const int divisor, int dividend) {
  return dividend / divisor;
}

const unsigned int MVP_BLOCK_SIZE = 1024;
const unsigned int MVP_WARP_SIZE = 32;

};

/*!
 * \brief Plain bundle of the sizes that describe the block matrix.
 */
struct matrixParameters {
  unsigned long totalRows;    /*!< \brief Number of rows, taken from nPointDomain. */
  unsigned long blockRowSize; /*!< \brief Rows per block, taken from nEqn. */
  unsigned long blockColSize; /*!< \brief Columns per block, taken from nVar. */
  unsigned long nPartition;   /*!< \brief Number of partitions, taken from nPartitions. */
  unsigned long blockSize;    /*!< \brief Entries per block, nVar * nEqn. */

  /*!
   * \brief Store the sizes and derive the number of entries per block.
   * \param[in] nPointDomain - Stored as totalRows.
   * \param[in] nEqn - Stored as blockRowSize.
   * \param[in] nVar - Stored as blockColSize.
   * \param[in] nPartitions - Stored as nPartition.
   */
  matrixParameters(unsigned long nPointDomain, unsigned long nEqn, unsigned long nVar, unsigned long nPartitions)
      : totalRows(nPointDomain),
        blockRowSize(nEqn),
        blockColSize(nVar),
        nPartition(nPartitions),
        blockSize(nVar * nEqn) {}
};
/*!
 * \brief Kernel launch dimensions derived from the matrix sizes.
 */
struct precondParameters {
  dim3 gaussElimBlockDim; /*!< \brief Block dimensions: (blockSize, 1, 1). */
  dim3 gaussElimGridDim;  /*!< \brief Grid dimensions: (1, 1, 1). */

  /*!
   * \brief Build the launch dimensions from the matrix parameters.
   * \param[in] matrixParam - Matrix sizes; blockSize (narrowed to unsigned int) becomes the x block dimension.
   */
  precondParameters(matrixParameters matrixParam)
      : gaussElimBlockDim(static_cast<unsigned int>(matrixParam.blockSize), 1, 1), gaussElimGridDim(1, 1, 1) {}
};

const int MVP_BLOCK_SIZE = 1024;
const int MVP_WARP_SIZE = 32;
}
/*!
* \brief assert style function that reads return codes after intercepting CUDA API calls.
* It returns the result code and its location if the call is unsuccessful.
Expand Down
10 changes: 10 additions & 0 deletions Common/include/option_structure.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -2345,6 +2345,16 @@ static const MapType<std::string, ENUM_FFD_BLENDING> Blending_Map = {
MakePair("BEZIER", BEZIER)
};

/*!
* \brief Types of graph partitioning algorithms for parallel computing
*/
enum ENUM_GRAPH_PART_ALGORITHM {
  LEVEL_SCHEDULING = 0, /*!< \brief Partitions the graph with the level scheduling algorithm. */
};
/*!
 * \brief Maps the config-file keyword to the corresponding graph partitioning algorithm.
 */
static const MapType<std::string, ENUM_GRAPH_PART_ALGORITHM> Graph_Part_Map = {
  MakePair("LEVEL_SCHEDULING", LEVEL_SCHEDULING)
};

/*!
* \brief Types of solvers for solving linear systems
*/
Expand Down
3 changes: 3 additions & 0 deletions Common/src/CConfig.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1849,6 +1849,9 @@ void CConfig::SetConfig_Options() {
/*!\par CONFIG_CATEGORY: Linear solver definition \ingroup Config*/
/*--- Options related to the linear solvers ---*/

/*!\brief GRAPH_PART_ALGORITHM
* \n DESCRIPTION: Algorithm for partitioning the matrix graph to facilitate parallel execution of linear algebra subroutines \n OPTIONS: see \link Graph_Part_Map \endlink \n DEFAULT: LEVEL_SCHEDULING \ingroup Config*/
addEnumOption("GRAPH_PART_ALGORITHM", Kind_Graph_Part_Algo, Graph_Part_Map, LEVEL_SCHEDULING);
/*!\brief LINEAR_SOLVER
* \n DESCRIPTION: Linear solver for the implicit, mesh deformation, or discrete adjoint systems \n OPTIONS: see \link Linear_Solver_Map \endlink \n DEFAULT: FGMRES \ingroup Config*/
addEnumOption("LINEAR_SOLVER", Kind_Linear_Solver, Linear_Solver_Map, FGMRES);
Expand Down
20 changes: 20 additions & 0 deletions Common/src/geometry/CPhysicalGeometry.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
*/

#include "../../include/geometry/CPhysicalGeometry.hpp"
#include "../../include/linear_algebra/CGraphPartitioning.hpp"
#include "../../include/adt/CADTPointsOnlyClass.hpp"
#include "../../include/toolboxes/printing_toolbox.hpp"
#include "../../include/toolboxes/CLinearPartitioner.hpp"
Expand All @@ -49,6 +50,8 @@
#include "../../include/geometry/primal_grid/CPyramid.hpp"
#include "../../include/geometry/primal_grid/CPrism.hpp"
#include "../../include/geometry/primal_grid/CVertexMPI.hpp"
#include "boost/integer_fwd.hpp"
#include "cgnslib.h"

#include <sys/types.h>
#include <sys/stat.h>
Expand Down Expand Up @@ -699,6 +702,21 @@ void CPhysicalGeometry::DistributeColoring(const CConfig* config, CGeometry* geo
delete[] nPoint_Flag;
}

template <class ScalarType>
void CPhysicalGeometry::PartitionGraph(const CConfig* config, vector<ScalarType>& pointList) {
unsigned short KindAlgorithm = config->GetKind_Graph_Part_Algo();
partitionOffsets.reserve(nPointDomain);

switch (KindAlgorithm) {
case LEVEL_SCHEDULING:
auto levelSchedule = CLevelScheduling<ScalarType>(nPointDomain, nodes);
levelSchedule.Partition(pointList, partitionOffsets);
nPartition = levelSchedule.nLevels;
maxPartitionSize = levelSchedule.maxLevelWidth;
break;
}
Comment on lines +708 to +716

Check notice

Code scanning / CodeQL

No trivial switch statements Note

This switch statement should either handle more cases, or be rewritten as an if statement.
}

void CPhysicalGeometry::DistributeVolumeConnectivity(const CConfig* config, CGeometry* geometry,
unsigned short Elem_Type) {
unsigned short NODES_PER_ELEMENT = 0;
Expand Down Expand Up @@ -4542,6 +4560,8 @@ void CPhysicalGeometry::SetRCM_Ordering(CConfig* config) {
if (!status) SU2_MPI::Error("RCM ordering failed", CURRENT_FUNCTION);
}

if (config->GetCUDA()) PartitionGraph(config, Result);

/*--- Add the MPI points ---*/
for (auto iPoint = nPointDomain; iPoint < nPoint; iPoint++) {
Result.push_back(iPoint);
Expand Down
8 changes: 7 additions & 1 deletion Common/src/linear_algebra/CSysMatrix.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,7 @@ CSysMatrix<ScalarType>::~CSysMatrix() {
GPUMemoryAllocation::gpu_free(d_matrix);
GPUMemoryAllocation::gpu_free(d_row_ptr);
GPUMemoryAllocation::gpu_free(d_col_ind);
GPUMemoryAllocation::gpu_free(d_partition_offsets);

#ifdef USE_MKL
mkl_jit_destroy(MatrixMatrixProductJitter);
Expand Down Expand Up @@ -150,11 +151,16 @@ void CSysMatrix<ScalarType>::Initialize(unsigned long npoint, unsigned long npoi
ptr = GPUMemoryAllocation::gpu_alloc_cpy<const unsigned long>(src_ptr, num * sizeof(const unsigned long));
};

/*--- Allocate device memory for `num` entries and copy them from the start of a host vector. ---*/
auto GPUVectorAllocAndCopy = [](unsigned long*& ptr, vector<unsigned long>& src_ptr, unsigned long num) {
  /*--- data() is well defined for an empty vector, whereas &src_ptr[0] is not. ---*/
  ptr = GPUMemoryAllocation::gpu_alloc_cpy<unsigned long>(src_ptr.data(), num * sizeof(unsigned long));
};

GPUAllocAndInit(d_matrix, nnz * nVar * nEqn);
GPUAllocAndCopy(d_row_ptr, row_ptr, (nPointDomain + 1.0));
GPUAllocAndCopy(d_col_ind, col_ind, nnz);
GPUAllocAndCopy(d_dia_ptr, dia_ptr, nPointDomain);

GPUVectorAllocAndCopy(d_partition_offsets, geometry->partitionOffsets, geometry->nPartition);

if (needTranspPtr) col_ptr = geometry->GetTransposeSparsePatternMap(type).data();

if (type == ConnectivityType::FiniteVolume) {
Expand Down
Loading