Skip to content

Commit 65bbf24

Browse files
committed
extract prefix sum computations
* introduce a new `prefix_sum` kernel that is used instead * remove a few unnecessary `Array::clear` invocations * fix reference ParILU initialization for zero diagonal entries.
1 parent adddb06 commit 65bbf24

28 files changed

Lines changed: 490 additions & 328 deletions

core/matrix/common_kernels.hpp

Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,92 @@
1+
/*******************************<GINKGO LICENSE>******************************
2+
Copyright (c) 2017-2019, the Ginkgo authors
3+
All rights reserved.
4+
5+
Redistribution and use in source and binary forms, with or without
6+
modification, are permitted provided that the following conditions
7+
are met:
8+
9+
1. Redistributions of source code must retain the above copyright
10+
notice, this list of conditions and the following disclaimer.
11+
12+
2. Redistributions in binary form must reproduce the above copyright
13+
notice, this list of conditions and the following disclaimer in the
14+
documentation and/or other materials provided with the distribution.
15+
16+
3. Neither the name of the copyright holder nor the names of its
17+
contributors may be used to endorse or promote products derived from
18+
this software without specific prior written permission.
19+
20+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
21+
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
22+
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
23+
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
24+
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
25+
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
26+
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
27+
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
28+
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
29+
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
30+
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31+
******************************<GINKGO LICENSE>*******************************/
32+
33+
#ifndef GKO_CORE_MATRIX_COMMON_KERNELS_HPP_
34+
#define GKO_CORE_MATRIX_COMMON_KERNELS_HPP_
35+
36+
37+
#include <memory>
38+
39+
40+
#include <ginkgo/core/base/executor.hpp>
41+
#include <ginkgo/core/base/types.hpp>
42+
43+
44+
namespace gko {
45+
namespace kernels {
46+
47+
48+
#define GKO_DECLARE_PREFIX_SUM_KERNEL(IndexType) \
49+
void prefix_sum(std::shared_ptr<const DefaultExecutor> exec, \
50+
IndexType *counts, size_type num_entries)
51+
52+
53+
#define GKO_DECLARE_ALL_AS_TEMPLATES \
54+
template <typename IndexType> \
55+
GKO_DECLARE_PREFIX_SUM_KERNEL(IndexType)
56+
57+
58+
namespace omp {
59+
60+
GKO_DECLARE_ALL_AS_TEMPLATES;
61+
62+
} // namespace omp
63+
64+
65+
namespace cuda {
66+
67+
GKO_DECLARE_ALL_AS_TEMPLATES;
68+
69+
} // namespace cuda
70+
71+
72+
namespace reference {
73+
74+
GKO_DECLARE_ALL_AS_TEMPLATES;
75+
76+
} // namespace reference
77+
78+
79+
namespace hip {
80+
81+
GKO_DECLARE_ALL_AS_TEMPLATES;
82+
83+
} // namespace hip
84+
85+
86+
#undef GKO_DECLARE_ALL_AS_TEMPLATES
87+
88+
89+
} // namespace kernels
90+
} // namespace gko
91+
92+
#endif // GKO_CORE_MATRIX_COMMON_KERNELS_HPP_

cuda/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,7 @@ target_sources(ginkgo_cuda
6464
base/version.cpp
6565
components/zero_array.cu
6666
factorization/par_ilu_kernels.cu
67+
matrix/common_kernels.cu
6768
matrix/coo_kernels.cu
6869
matrix/csr_kernels.cu
6970
matrix/dense_kernels.cu

cuda/factorization/par_ilu_kernels.cu

Lines changed: 3 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -38,9 +38,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
3838
#include <ginkgo/core/matrix/coo.hpp>
3939

4040

41+
#include "core/matrix/common_kernels.hpp"
4142
#include "cuda/base/math.hpp"
4243
#include "cuda/base/types.hpp"
43-
#include "cuda/components/prefix_sum.cuh"
4444

4545

4646
namespace gko {
@@ -67,7 +67,6 @@ void initialize_row_ptrs_l_u(
6767
IndexType *l_row_ptrs, IndexType *u_row_ptrs)
6868
{
6969
const size_type num_rows{system_matrix->get_size()[0]};
70-
const size_type num_row_ptrs{num_rows + 1};
7170

7271
const dim3 block_size{default_block_size, 1, 1};
7372
const uint32 number_blocks =
@@ -80,18 +79,8 @@ void initialize_row_ptrs_l_u(
8079
as_cuda_type(system_matrix->get_const_values()),
8180
as_cuda_type(l_row_ptrs), as_cuda_type(u_row_ptrs));
8281

83-
Array<IndexType> block_sum(exec, grid_dim.x);
84-
auto block_sum_ptr = block_sum.get_data();
85-
86-
start_prefix_sum<default_block_size><<<grid_dim, block_size>>>(
87-
num_row_ptrs, as_cuda_type(l_row_ptrs), as_cuda_type(block_sum_ptr));
88-
finalize_prefix_sum<default_block_size><<<grid_dim, block_size>>>(
89-
num_row_ptrs, as_cuda_type(l_row_ptrs), as_cuda_type(block_sum_ptr));
90-
91-
start_prefix_sum<default_block_size><<<grid_dim, block_size>>>(
92-
num_row_ptrs, as_cuda_type(u_row_ptrs), as_cuda_type(block_sum_ptr));
93-
finalize_prefix_sum<default_block_size><<<grid_dim, block_size>>>(
94-
num_row_ptrs, as_cuda_type(u_row_ptrs), as_cuda_type(block_sum_ptr));
82+
prefix_sum(exec, l_row_ptrs, num_rows + 1);
83+
prefix_sum(exec, u_row_ptrs, num_rows + 1);
9584
}
9685

9786
GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(

cuda/matrix/common_kernels.cu

Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
/*******************************<GINKGO LICENSE>******************************
2+
Copyright (c) 2017-2019, the Ginkgo authors
3+
All rights reserved.
4+
5+
Redistribution and use in source and binary forms, with or without
6+
modification, are permitted provided that the following conditions
7+
are met:
8+
9+
1. Redistributions of source code must retain the above copyright
10+
notice, this list of conditions and the following disclaimer.
11+
12+
2. Redistributions in binary form must reproduce the above copyright
13+
notice, this list of conditions and the following disclaimer in the
14+
documentation and/or other materials provided with the distribution.
15+
16+
3. Neither the name of the copyright holder nor the names of its
17+
contributors may be used to endorse or promote products derived from
18+
this software without specific prior written permission.
19+
20+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
21+
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
22+
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
23+
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
24+
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
25+
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
26+
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
27+
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
28+
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
29+
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
30+
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31+
******************************<GINKGO LICENSE>*******************************/
32+
33+
34+
#include "core/matrix/common_kernels.hpp"
35+
36+
37+
#include "cuda/components/prefix_sum.cuh"
38+
39+
40+
namespace gko {
41+
namespace kernels {
42+
namespace cuda {
43+
44+
45+
constexpr int prefix_sum_block_size = 512;
46+
47+
48+
template <typename IndexType>
49+
void prefix_sum(std::shared_ptr<const CudaExecutor> exec, IndexType *counts,
50+
size_type num_entries)
51+
{
52+
auto num_blocks = ceildiv(num_entries, prefix_sum_block_size);
53+
Array<IndexType> block_sum_array(exec, num_blocks);
54+
auto block_sums = block_sum_array.get_data();
55+
start_prefix_sum<prefix_sum_block_size>
56+
<<<num_blocks, prefix_sum_block_size>>>(num_entries, counts,
57+
block_sums);
58+
finalize_prefix_sum<prefix_sum_block_size>
59+
<<<num_blocks, prefix_sum_block_size>>>(num_entries, counts,
60+
block_sums);
61+
}
62+
63+
64+
GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_PREFIX_SUM_KERNEL);
65+
66+
// explicitly instantiate for size_type as well, as this is used in the SellP
67+
// format
68+
template void prefix_sum<size_type>(std::shared_ptr<const CudaExecutor> exec,
69+
size_type *counts, size_type num_entries);
70+
71+
72+
} // namespace cuda
73+
} // namespace kernels
74+
} // namespace gko

cuda/matrix/csr_kernels.cu

Lines changed: 3 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
4646
#include <ginkgo/core/matrix/sellp.hpp>
4747

4848

49+
#include "core/matrix/common_kernels.hpp"
4950
#include "core/matrix/dense_kernels.hpp"
5051
#include "core/synthesizer/implementation_selection.hpp"
5152
#include "cuda/base/config.hpp"
@@ -55,7 +56,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
5556
#include "cuda/base/types.hpp"
5657
#include "cuda/components/atomic.cuh"
5758
#include "cuda/components/cooperative_groups.cuh"
58-
#include "cuda/components/prefix_sum.cuh"
5959
#include "cuda/components/reduction.cuh"
6060
#include "cuda/components/segment_scan.cuh"
6161
#include "cuda/components/uninitialized_array.hpp"
@@ -675,27 +675,14 @@ void convert_to_sellp(std::shared_ptr<const CudaExecutor> exec,
675675
as_cuda_type(nnz_per_row.get_const_data()), as_cuda_type(slice_lengths),
676676
as_cuda_type(slice_sets));
677677

678-
auto add_values =
679-
Array<size_type>(exec, ceildiv(slice_num + 1, default_block_size));
680-
grid_dim = ceildiv(slice_num + 1, default_block_size);
681-
682-
start_prefix_sum<default_block_size><<<grid_dim, default_block_size>>>(
683-
slice_num + 1, as_cuda_type(slice_sets),
684-
as_cuda_type(add_values.get_data()));
685-
686-
finalize_prefix_sum<default_block_size><<<grid_dim, default_block_size>>>(
687-
slice_num + 1, as_cuda_type(slice_sets),
688-
as_cuda_type(add_values.get_const_data()));
678+
prefix_sum(exec, slice_sets, slice_num + 1);
689679

690680
grid_dim = ceildiv(num_rows, default_block_size);
691681
kernel::fill_in_sellp<<<grid_dim, default_block_size>>>(
692682
num_rows, slice_size, as_cuda_type(source_values),
693683
as_cuda_type(source_row_ptrs), as_cuda_type(source_col_idxs),
694684
as_cuda_type(slice_lengths), as_cuda_type(slice_sets),
695685
as_cuda_type(result_col_idxs), as_cuda_type(result_values));
696-
697-
nnz_per_row.clear();
698-
add_values.clear();
699686
}
700687

701688
GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
@@ -777,11 +764,6 @@ void calculate_total_cols(std::shared_ptr<const CudaExecutor> exec,
777764

778765
exec->get_master()->copy_from(exec.get(), 1, d_result.get_const_data(),
779766
result);
780-
781-
block_results.clear();
782-
nnz_per_row.clear();
783-
max_nnz_per_slice.clear();
784-
d_result.clear();
785767
}
786768

787769
GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
@@ -915,10 +897,6 @@ void calculate_max_nnz_per_row(std::shared_ptr<const CudaExecutor> exec,
915897

916898
exec->get_master()->copy_from(exec.get(), 1, d_result.get_const_data(),
917899
result);
918-
919-
nnz_per_row.clear();
920-
block_results.clear();
921-
d_result.clear();
922900
}
923901

924902
GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
@@ -950,15 +928,7 @@ void convert_to_hybrid(std::shared_ptr<const CudaExecutor> exec,
950928
num_rows, max_nnz_per_row, as_cuda_type(source->get_const_row_ptrs()),
951929
as_cuda_type(coo_offset.get_data()));
952930

953-
auto add_values =
954-
Array<size_type>(exec, ceildiv(num_rows, default_block_size));
955-
grid_dim = ceildiv(num_rows, default_block_size);
956-
start_prefix_sum<default_block_size><<<grid_dim, default_block_size>>>(
957-
num_rows, as_cuda_type(coo_offset.get_data()),
958-
as_cuda_type(add_values.get_data()));
959-
finalize_prefix_sum<default_block_size><<<grid_dim, default_block_size>>>(
960-
num_rows, as_cuda_type(coo_offset.get_data()),
961-
as_cuda_type(add_values.get_const_data()));
931+
prefix_sum(exec, coo_offset.get_data(), num_rows);
962932

963933
grid_dim = ceildiv(num_rows * config::warp_size, default_block_size);
964934
kernel::fill_in_hybrid<<<grid_dim, default_block_size>>>(

0 commit comments

Comments
 (0)