Skip to content
Open
Show file tree
Hide file tree
Changes from 12 commits
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
79c9e52
updating branch
areenraj Jun 10, 2025
6686171
updating branch
areenraj Jun 10, 2025
b7591f4
Major commit: graph partitioning algorithms, level scheduling method,…
areenraj Jun 25, 2025
6f51f45
Major commit: graph partitioning algorithms, level scheduling method,…
areenraj Jun 25, 2025
cdf0225
resolving conflicts
areenraj Jun 25, 2025
9dd3028
resolving conflicts
areenraj Jun 25, 2025
2495edd
resolving some more conflicts
areenraj Jun 25, 2025
f68358e
cleaning up
areenraj Jun 25, 2025
bf561b6
apologies for repeated commits, just cleaning
areenraj Jun 25, 2025
9121e25
coalesced memory access for MVP, shared memory addition and lamda fun…
areenraj Jun 29, 2025
991e29f
bug fixes
areenraj Jul 1, 2025
9bfeff9
Merge remote-tracking branch 'upstream/master'
areenraj Jul 3, 2025
0372099
Working GPU LU_SGS Preconditioner Port
areenraj Jul 15, 2025
52b90b6
Fixed the issue with the visibility of the rowsPerBlock variable. Als…
areenraj Jul 17, 2025
d367627
Working LU_SGS Preconditioner with graph partitioned algorithms, upda…
areenraj Jul 17, 2025
661c9b8
LU_SGS Preconditioner Port
areenraj Jul 17, 2025
b5cf7dd
Merge branch 'master' of https://github.com/areenraj/SU2_GSoC_GPU
areenraj Jul 17, 2025
c4dbe5c
Fixing warnings
areenraj Jul 17, 2025
b3d2fbf
Merge branch 'develop' of https://github.com/su2code/SU2
areenraj Jul 17, 2025
1be1e2f
Syncing repo to develop
areenraj Jul 17, 2025
7472bd1
updating submodule versions
areenraj Jul 17, 2025
352e148
Fixing some more warnings
areenraj Jul 17, 2025
c3b8062
Addresed changes in PR 2539
digvijay-y Mar 8, 2026
64df92e
Merge branch 'develop' into gpu-lusgs
digvijay-y Mar 8, 2026
340d2f9
WIP: local changes before merge
digvijay-y Mar 8, 2026
791a03e
Merge branch 'gpu-lusgs' of https://github.com/digvijay-y/SU2 into gp…
digvijay-y Mar 8, 2026
4ac7e97
ncolor -> nGraphPartition
digvijay-y Mar 8, 2026
b808fdc
Merge branch 'develop' into gpu-lusgs
digvijay-y Mar 9, 2026
b0bde3f
Merge branch 'develop' into gpu-lusgs
digvijay-y Mar 11, 2026
8cfb381
Merge branch 'develop' into gpu-lusgs
digvijay-y Apr 7, 2026
6001d61
Merge branch 'develop' into gpu-lusgs
digvijay-y Apr 11, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 21 additions & 0 deletions Common/include/CConfig.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -521,6 +521,7 @@ class CConfig {
Kind_Gradient_Method_Recon, /*!< \brief Numerical method for computation of spatial gradients used for upwind reconstruction. */
Kind_Deform_Linear_Solver, /*!< Numerical method to deform the grid */
Kind_Deform_Linear_Solver_Prec, /*!< \brief Preconditioner of the linear solver. */
Kind_Graph_Part_Algo, /*!< \brief Algorithm for parallel partitioning of the matrix graph. */
Kind_Linear_Solver, /*!< \brief Numerical solver for the implicit scheme. */
Kind_Linear_Solver_Prec, /*!< \brief Preconditioner of the linear solver. */
Kind_DiscAdj_Linear_Solver, /*!< \brief Linear solver for the discrete adjoint system. */
Expand Down Expand Up @@ -631,6 +632,8 @@ class CConfig {
unsigned long Linear_Solver_Restart_Frequency; /*!< \brief Restart frequency of the linear solver for the implicit formulation. */
unsigned long Linear_Solver_Prec_Threads; /*!< \brief Number of threads per rank for ILU and LU_SGS preconditioners. */
unsigned short Linear_Solver_ILU_n; /*!< \brief ILU fill=in level. */
unsigned short Cuda_Block_Size; /*!< \brief User-specified value for the X-Axis dimension of thread blocks
that are deployed by the CUDA Kernels. */
su2double SemiSpan; /*!< \brief Wing Semi span. */
su2double Roe_Kappa; /*!< \brief Relaxation of the Roe scheme. */
su2double Relaxation_Factor_Adjoint; /*!< \brief Relaxation coefficient for variable updates of adjoint solvers. */
Expand Down Expand Up @@ -4136,6 +4139,12 @@ class CConfig {
*/
bool GetLeastSquaresRequired(void) const { return LeastSquaresRequired; }

/*!
* \brief Get the type of algorithm used for partitioning the matrix graph.
* \return Algorithm that divides the matrix into partitions that are executed parallely.
*/
unsigned short GetKind_Graph_Part_Algo(void) const { return Kind_Graph_Part_Algo; }

/*!
* \brief Get the kind of solver for the implicit solver.
* \return Numerical solver for implicit formulation (solving the linear system).
Expand Down Expand Up @@ -4197,6 +4206,18 @@ class CConfig {
*/
su2double GetLinear_Solver_Smoother_Relaxation(void) const { return Linear_Solver_Smoother_Relaxation; }

/*!
* \brief Get thread block dimensions (X-axis) being used by the CUDA Kernels.
* \return Thread block dimensions (X-axis) being used by the CUDA Kernels.
*/
unsigned short GetCuda_Block_Size(void) const { return Cuda_Block_Size; }

/*!
* \brief Get the number of matrix rows assigned per CUDA Block.
* \return The number of matrix rows assigned per CUDA Block.
*/
unsigned short GetRows_Per_Cuda_Block(void) const { return cudaKernelParameters::rounded_up_division(cudaKernelParameters::CUDA_WARP_SIZE, Cuda_Block_Size); }

/*!
* \brief Get the relaxation factor for solution updates of adjoint solvers.
*/
Expand Down
7 changes: 7 additions & 0 deletions Common/include/geometry/CGeometry.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -260,6 +260,13 @@ class CGeometry {
unsigned long* nPointCumulative{nullptr}; /*!< \brief Cumulative storage array containing the total number of points
on all prior ranks in the linear partitioning. */

unsigned long nPartition; /*!< \brief Number of divisions of the matrix graph during execution of parallel
partitioning algorithms. */
unsigned long maxPartitionSize; /*!< \brief Size of the level with the maximum number of elements. */
vector<unsigned long>
partitionOffsets; /*!< \brief Vector array containing the indices at which different parallel partitions begin. */
vector<unsigned long> chainPtr; /*!< \brief Vector array containing distribution of levels into chains. */

/*--- Data structures for point-to-point MPI communications. ---*/

int maxCountPerPoint{0}; /*!< \brief Maximum number of pieces of data sent per vertex in point-to-point comms. */
Expand Down
8 changes: 8 additions & 0 deletions Common/include/geometry/CPhysicalGeometry.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -148,6 +148,14 @@ class CPhysicalGeometry final : public CGeometry {
*/
void DistributeColoring(const CConfig* config, CGeometry* geometry);

/*!
* \brief Divide the graph produced by the matrix into parallel partitions.
* \param[in] config - Definition of the particular problem.
* \param[in] pointList - Ordered list of points in the mesh.
*/
template <class ScalarType>
void PartitionGraph(const CConfig* config, vector<ScalarType>& pointList);

/*!
* \brief Distribute the grid points, including ghost points, across all ranks based on a ParMETIS coloring.
* \param[in] config - Definition of the particular problem.
Expand Down
183 changes: 183 additions & 0 deletions Common/include/linear_algebra/CGraphPartitioning.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,183 @@
/*!
* \file CGraphPartitioning.hpp
* \brief Headers for the classes realted to the algorithms that are used
to divide the matrix acyclic graph into parallel partitions.
* \author A. Raj
* \version 8.2.0 "Harrier"
*
* SU2 Project Website: https://su2code.github.io
*
* The SU2 Project is maintained by the SU2 Foundation
* (http://su2foundation.org)
*
* Copyright 2012-2025, SU2 Contributors (cf. AUTHORS.md)
*
* SU2 is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* SU2 is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with SU2. If not, see <http://www.gnu.org/licenses/>.
*/

#pragma once

#include "../geometry/CGeometry.hpp"

/*!
* \class CGraphPartitioning
* \brief Abstract base class for defining graph partitioning algorithms.
* \author A. Raj
*
* In order to use certain parallel algorithms in the solution process -
* whether with linear solvers or preconditioners - we require the matrix
* to be partitioned into certain parallel divisions. These maybe in the form
* of levels, blocks, colors and so on. Since a number of different algorithms
* can be used to split the graph, we've introduced a base class containing the
* "Partition" member function from which child classes of the specific
* algorithm can be derived. Currently, we are only using direct declarations
* of the derived classes in the code. However, this method was chosen as it
* allows us to pass different child class algorithms to a single implementation
* of the function that requires it - similar to the CMatrixVectorProduct class.
*/

template <class ScalarType>

class CGraphPartitioning {
public:
virtual ~CGraphPartitioning() = 0;
virtual void Partition(vector<ScalarType>& pointList, vector<ScalarType>& partitionOffsets,
vector<ScalarType>& chainPtr, unsigned short chainLimit) = 0;
};
template <class ScalarType>
CGraphPartitioning<ScalarType>::~CGraphPartitioning() {}

template <class ScalarType>

class CLevelScheduling final : public CGraphPartitioning<ScalarType> {
private:
ScalarType nPointDomain;
CPoint* nodes;

public:
ScalarType nLevels;
ScalarType maxLevelWidth;
vector<ScalarType> levels;

/*!
* \brief constructor of the class
* \param[in] nPointDomain_ref - number of points associated with the problem
* \param[in] nodes_ref - represents the relationships between the points
*/
inline CLevelScheduling<ScalarType>(ScalarType nPointDomain_ref, CPoint* nodes_ref)
: nPointDomain(nPointDomain_ref), nodes(nodes_ref) {
nLevels = 0ul;
maxLevelWidth = 0ul;
}

CLevelScheduling() = delete; // Removing default constructor

/*!
* \brief Divides the levels into groups of chains depending on the preset GPU block and warp size.
* \param[in] levelOffsets - Represents the vector array containing the ordered list of starting rows of each level.
* \param[in] chainPtr - Represents the vector array containing the ordered list of starting levels of each chain.
* \param[in] rowsPerBlock - Represents the maximum number of rows that can be accomodated per CUDA block.
*/
void CalculateChain(vector<ScalarType> levelOffsets, vector<ScalarType>& chainPtr, unsigned short rowsPerBlock) {
ScalarType levelWidth = 0;
unsigned short chainLength = chainPtr.capacity();

/*This is not a magic number. We are simply initializing
the point array with its first element that is always zero.*/
chainPtr.push_back(0);

for (ScalarType iLevel = 0ul; iLevel < nLevels; iLevel++) {
levelWidth = levelOffsets[iLevel + 1] - levelOffsets[iLevel];
maxLevelWidth = std::max(levelWidth, maxLevelWidth);

if (levelWidth > rowsPerBlock) {
if (chainPtr.back() != iLevel) {
chainPtr.push_back(iLevel);
}

chainPtr.push_back(iLevel + 1);
}
}

chainPtr.push_back(nLevels);
}

/*!
* \brief Reorders the points according to the levels
* \param[in] pointList - Ordered array that contains the list of all mesh points.
* \param[in] inversePointList - Array utilized to access the index of each point in pointList.
* \param[in] levelOffsets - Vector array containing the ordered list of starting rows of each level.
*/
void Reorder(vector<ScalarType>& pointList, vector<ScalarType>& inversePointList, vector<ScalarType> levelOffsets) {
for (auto localPoint = 0ul; localPoint < nPointDomain; ++localPoint) {
const auto globalPoint = pointList[localPoint];
inversePointList[levelOffsets[levels[localPoint]]++] = globalPoint;
}

pointList = std::move(inversePointList);
}

/*!
* \brief Reorders the points according to the levels
* \param[in] pointList - Ordered array that contains the list of all mesh points.
* \param[in] levelOffsets - Vector array containing the ordered list of starting rows of each level.
* \param[in] chainPtr - Represents the vector array containing the ordered list of starting levels of each chain.
* \param[in] rowsPerBlock - Represents the maximum number of rows that can be accomodated per CUDA block.
*/
void Partition(vector<ScalarType>& pointList, vector<ScalarType>& levelOffsets, vector<ScalarType>& chainPtr,
unsigned short rowsPerBlock) override {
vector<ScalarType> inversePointList;
inversePointList.reserve(nPointDomain);
levels.reserve(nPointDomain);

for (auto point = 0ul; point < nPointDomain; point++) {
inversePointList[pointList[point]] = point;
levels[point] = 0;
}

// Local Point - Ordering of the points post the RCM ordering
// Global Point - Original order of the points before the RCM ordering

for (auto localPoint = 0ul; localPoint < nPointDomain; ++localPoint) {
const auto globalPoint = pointList[localPoint];

for (auto adjPoints = 0u; adjPoints < nodes->GetnPoint(globalPoint); adjPoints++) {
const auto adjGlobalPoint = nodes->GetPoint(globalPoint, adjPoints);

if (adjGlobalPoint < nPointDomain) {
const auto adjLocalPoint = inversePointList[adjGlobalPoint];

if (adjLocalPoint < localPoint) {
levels[localPoint] = std::max(levels[localPoint], levels[adjLocalPoint] + 1);
}
}
}

nLevels = std::max(nLevels, levels[localPoint] + 1);
}

levelOffsets.resize(nLevels + 1);
for (auto iPoint = 0ul; iPoint < nPointDomain; iPoint++) {
++levelOffsets[levels[iPoint] + 1];
}

for (auto iLevel = 2ul; iLevel <= nLevels; ++iLevel) {
levelOffsets[iLevel] += levelOffsets[iLevel - 1];
}

Reorder(pointList, inversePointList, levelOffsets);

CalculateChain(levelOffsets, chainPtr, rowsPerBlock);
}
};
12 changes: 5 additions & 7 deletions Common/include/linear_algebra/CPreconditioner.hpp
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This looks good. Did you test this once with the flag on or off, just to make sure?

Original file line number Diff line number Diff line change
Expand Up @@ -205,17 +205,15 @@ class CLU_SGSPreconditioner final : public CPreconditioner<ScalarType> {
* \param[out] v - CSysVector that is the result of the preconditioning.
*/
inline void operator()(const CSysVector<ScalarType>& u, CSysVector<ScalarType>& v) const override {
#ifdef HAVE_CUDA
if(config->GetCUDA())
{
#ifdef HAVE_CUDA
if (config->GetCUDA()) {
sparse_matrix.GPUComputeLU_SGSPreconditioner(u, v, geometry, config);
}
else {
} else {
sparse_matrix.ComputeLU_SGSPreconditioner(u, v, geometry, config);
}
#else
#else
sparse_matrix.ComputeLU_SGSPreconditioner(u, v, geometry, config);
#endif
#endif
}
};

Expand Down
17 changes: 15 additions & 2 deletions Common/include/linear_algebra/CSysMatrix.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -148,8 +148,15 @@ class CSysMatrix {
ScalarType* d_matrix; /*!< \brief Device Pointer to store the matrix values on the GPU. */
const unsigned long* d_row_ptr; /*!< \brief Device Pointers to the first element in each row. */
const unsigned long* d_col_ind; /*!< \brief Device Column index for each of the elements in val(). */
<<<<<<< HEAD
bool useCuda; /*!< \brief Boolean that indicates whether user has enabled CUDA or not.
Mainly used to conditionally free GPU memory in the class destructor. */
=======
const unsigned long* d_dia_ptr; /*!< \brief Device Column index for each of the elements in val(). */
unsigned long* d_partition_offsets;
bool useCuda; /*!< \brief Boolean that indicates whether user has enabled CUDA or not.
Mainly used to conditionally free GPU memory in the class destructor. */
>>>>>>> precond_port

ScalarType* ILU_matrix; /*!< \brief Entries of the ILU sparse matrix. */
unsigned long nnz_ilu; /*!< \brief Number of possible nonzero entries in the matrix (ILU). */
Expand Down Expand Up @@ -862,7 +869,10 @@ class CSysMatrix {

/*!
<<<<<<< HEAD
<<<<<<< HEAD
=======
=======
>>>>>>> precond_port
* \brief Performs first step of the LU_SGS Preconditioner building
* \param[in] vec - CSysVector to be multiplied by the sparse matrix A.
* \param[in] geometry - Geometrical definition of the problem.
Expand All @@ -888,13 +898,16 @@ class CSysMatrix {
void GPUGaussElimination(ScalarType& prod, CGeometry* geometry, const CConfig* config) const;

/*!
<<<<<<< HEAD
>>>>>>> upstream/develop
=======
>>>>>>> precond_port
* \brief Multiply CSysVector by the preconditioner all of which are stored on the device
* \param[in] vec - CSysVector to be multiplied by the preconditioner.
* \param[out] prod - Result of the product A*vec.
*/
void GPUComputeLU_SGSPreconditioner(const CSysVector<ScalarType>& vec, CSysVector<ScalarType>& prod, CGeometry* geometry,
const CConfig* config) const;
void GPUComputeLU_SGSPreconditioner(const CSysVector<ScalarType>& vec, CSysVector<ScalarType>& prod,
CGeometry* geometry, const CConfig* config) const;

/*!
* \brief Build the Jacobi preconditioner.
Expand Down
Loading