Skip to content

Commit b2f7d96

Browse files
committed
change KCFG -> DCFG and add desc for list
1 parent 6514704 commit b2f7d96

8 files changed

Lines changed: 88 additions & 74 deletions

File tree

dpcpp/base/config.hpp

Lines changed: 31 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -77,21 +77,21 @@ struct config {
7777

7878

7979
/**
80-
* KCFG_1D provides the usual way to embed information from workgroup size and
80+
* DCFG_1D provides the usual way to embed information from workgroup size and
8181
* sub_group size. We consider the workgroup size up to 4096 which requires 13
82-
* bits to store it and sub_group size up to 64 which requires 7 bits to store
83-
* it.
82+
* bits, and sub_group size up to 64 which requires 7 bits.
8483
*/
85-
using KCFG_1D = ConfigSet<13, 7>;
84+
using DCFG_1D = ConfigSet<13, 7>;
8685

8786

8887
template <uint32 block, uint32 subgroup>
8988
struct device_config {
9089
static constexpr uint32 block_size = block;
9190
static constexpr uint32 subgroup_size = subgroup;
92-
static constexpr uint32 encode = KCFG_1D::encode(block_size, subgroup_size);
91+
static constexpr uint32 encode = DCFG_1D::encode(block_size, subgroup_size);
9392
};
9493

94+
9595
/**
9696
* encode_list base type
9797
*
@@ -112,30 +112,44 @@ struct encode_list<syn::type_list<Types...>> {
112112
};
113113

114114

115-
using block_cfg_type_list_t =
115+
// dcfg_block_type_list_t is the type list for different workgroup size.
116+
using dcfg_block_type_list_t =
116117
syn::type_list<device_config<512, 16>, device_config<256, 16>,
117118
device_config<128, 16>>;
118119

119-
using block_cfg_list_t = encode_list<block_cfg_type_list_t>::type;
120+
// dcfg_block_list_t is the value list variant of dcfg_block_type_list_t
121+
using dcfg_block_list_t = encode_list<dcfg_block_type_list_t>::type;
122+
120123

124+
// dcfg_1d_type_list_t is the type list for different workgroup and sub_group
125+
// size.
126+
using dcfg_1d_type_list_t =
127+
syn::type_list<device_config<512, 64>, device_config<512, 32>,
128+
device_config<512, 16>, device_config<256, 32>,
129+
device_config<256, 16>, device_config<256, 8>>;
121130

122-
using kcfg_1d_list_t =
123-
syn::value_list<uint32, KCFG_1D::encode(512, 64), KCFG_1D::encode(512, 32),
124-
KCFG_1D::encode(512, 16), KCFG_1D::encode(256, 32),
125-
KCFG_1D::encode(256, 16), KCFG_1D::encode(256, 8)>;
131+
// dcfg_1d_type_list_t is the value list variant of dcfg_1d_type_type_list_t
132+
using dcfg_1d_list_t = encode_list<dcfg_1d_type_list_t>::type;
126133

127134

128-
using kcfg_sq_type_list_t =
135+
// dcfg_sq_type_list_t is the type list for different sub_group size and its
136+
// workgroup size is square of sub_group.
137+
using dcfg_sq_type_list_t =
129138
syn::type_list<device_config<4096, 64>, device_config<1024, 32>,
130139
device_config<256, 16>, device_config<64, 8>>;
131140

132-
using kcfg_sq_list_t = encode_list<kcfg_sq_type_list_t>::type;
141+
// dcfg_sq_list_t is the value list variant of dcfg_sq_type_list_t
142+
using dcfg_sq_list_t = encode_list<dcfg_sq_type_list_t>::type;
143+
133144

145+
// dcfg_1sg_list_t is the type list for only one sub_group in a workgroup.
146+
using dcfg_1sg_type_list_t =
147+
syn::type_list<device_config<64, 64>, device_config<32, 32>,
148+
device_config<16, 16>, device_config<8, 8>,
149+
device_config<4, 4>>;
134150

135-
using kcfg_1sg_list_t =
136-
syn::value_list<uint32, KCFG_1D::encode(64, 64), KCFG_1D::encode(32, 32),
137-
KCFG_1D::encode(16, 16), KCFG_1D::encode(8, 8),
138-
KCFG_1D::encode(4, 4)>;
151+
// dcfg_1sg_list_t is the value list variant of dcfg_1sg_type_list_t
152+
using dcfg_1sg_list_t = encode_list<dcfg_1sg_type_list_t>::type;
139153

140154

141155
} // namespace dpcpp

dpcpp/base/helper.hpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -86,11 +86,11 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
8686
sycl::queue* queue, InferredArgs... args) \
8787
{ \
8888
queue->submit([&](sycl::handler& cgh) { \
89-
if constexpr (KCFG_1D::decode<1>(encoded) > 1) { \
89+
if constexpr (DCFG_1D::decode<1>(encoded) > 1) { \
9090
cgh.parallel_for( \
9191
sycl_nd_range(grid, block), [= \
9292
](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size( \
93-
KCFG_1D::decode<1>( \
93+
DCFG_1D::decode<1>( \
9494
encoded))]] { \
9595
kernel_<encoded>(args..., item_ct1); \
9696
}); \

dpcpp/base/kernel_launch_reduction.dp.hpp

Lines changed: 20 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,7 @@ namespace kernels {
5454
namespace dpcpp {
5555

5656

57-
static constexpr auto kcfg_1d_list_simple_reduction = kcfg_1d_list_t();
57+
static constexpr auto dcfg_1d_list_simple_reduction = dcfg_1d_list_t();
5858

5959

6060
template <typename DeviceConfig, typename ValueType, typename KernelFunction,
@@ -249,7 +249,7 @@ void run_kernel_reduction_impl(std::shared_ptr<const DpcppExecutor> exec,
249249

250250
GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION_TOTYPE(select_run_kernel_reduction,
251251
run_kernel_reduction_impl,
252-
KCFG_1D)
252+
DCFG_1D)
253253

254254

255255
template <typename ValueType, typename KernelFunction, typename ReductionOp,
@@ -261,12 +261,12 @@ void run_kernel_reduction_cached(std::shared_ptr<const DpcppExecutor> exec,
261261
array<char>& tmp, KernelArgs&&... args)
262262
{
263263
const auto desired_cfg = get_first_cfg(
264-
as_array(kcfg_1d_list_simple_reduction), [&](std::uint32_t cfg) {
265-
return validate(exec->get_queue(), KCFG_1D::decode<0>(cfg),
266-
KCFG_1D::decode<1>(cfg));
264+
as_array(dcfg_1d_list_simple_reduction), [&](std::uint32_t cfg) {
265+
return validate(exec->get_queue(), DCFG_1D::decode<0>(cfg),
266+
DCFG_1D::decode<1>(cfg));
267267
});
268268
select_run_kernel_reduction(
269-
kcfg_1d_list_simple_reduction,
269+
dcfg_1d_list_simple_reduction,
270270
[&](std::uint32_t cfg) { return cfg == desired_cfg; },
271271
syn::value_list<bool>(), syn::value_list<int>(),
272272
syn::value_list<size_type>(), syn::type_list<>(), exec, fn, op,
@@ -283,12 +283,12 @@ void run_kernel_reduction_cached(std::shared_ptr<const DpcppExecutor> exec,
283283
array<char>& tmp, KernelArgs&&... args)
284284
{
285285
const auto desired_cfg = get_first_cfg(
286-
as_array(kcfg_1d_list_simple_reduction), [&](std::uint32_t cfg) {
287-
return validate(exec->get_queue(), KCFG_1D::decode<0>(cfg),
288-
KCFG_1D::decode<1>(cfg));
286+
as_array(dcfg_1d_list_simple_reduction), [&](std::uint32_t cfg) {
287+
return validate(exec->get_queue(), DCFG_1D::decode<0>(cfg),
288+
DCFG_1D::decode<1>(cfg));
289289
});
290290
select_run_kernel_reduction(
291-
kcfg_1d_list_simple_reduction,
291+
dcfg_1d_list_simple_reduction,
292292
[&](std::uint32_t cfg) { return cfg == desired_cfg; },
293293
syn::value_list<bool>(), syn::value_list<int>(),
294294
syn::value_list<size_type>(), syn::type_list<>(), exec, fn, op,
@@ -600,7 +600,7 @@ void run_kernel_row_reduction_stage1(std::shared_ptr<const DpcppExecutor> exec,
600600

601601
GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION_TOTYPE(
602602
select_kernel_row_reduction_stage1, run_kernel_row_reduction_stage1,
603-
KCFG_1D);
603+
DCFG_1D);
604604

605605

606606
template <typename cfg, typename ValueType, typename KernelFunction,
@@ -666,7 +666,7 @@ void run_kernel_col_reduction_stage1(std::shared_ptr<const DpcppExecutor> exec,
666666

667667
GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION_TOTYPE(
668668
select_kernel_col_reduction_stage1, run_kernel_col_reduction_stage1,
669-
KCFG_1D);
669+
DCFG_1D);
670670

671671

672672
} // namespace
@@ -682,12 +682,12 @@ void run_kernel_row_reduction_cached(std::shared_ptr<const DpcppExecutor> exec,
682682
KernelArgs&&... args)
683683
{
684684
const auto desired_cfg = get_first_cfg(
685-
as_array(kcfg_1d_list_simple_reduction), [&](std::uint32_t cfg) {
686-
return validate(exec->get_queue(), KCFG_1D::decode<0>(cfg),
687-
KCFG_1D::decode<1>(cfg));
685+
as_array(dcfg_1d_list_simple_reduction), [&](std::uint32_t cfg) {
686+
return validate(exec->get_queue(), DCFG_1D::decode<0>(cfg),
687+
DCFG_1D::decode<1>(cfg));
688688
});
689689
select_kernel_row_reduction_stage1(
690-
kcfg_1d_list_simple_reduction,
690+
dcfg_1d_list_simple_reduction,
691691
[&](std::uint32_t cfg) { return cfg == desired_cfg; },
692692
syn::value_list<bool>(), syn::value_list<int>(),
693693
syn::value_list<size_type>(), syn::type_list<>(), exec, fn, op,
@@ -705,12 +705,12 @@ void run_kernel_col_reduction_cached(std::shared_ptr<const DpcppExecutor> exec,
705705
array<char>& tmp, KernelArgs&&... args)
706706
{
707707
const auto desired_cfg = get_first_cfg(
708-
as_array(kcfg_1d_list_simple_reduction), [&](std::uint32_t cfg) {
709-
return validate(exec->get_queue(), KCFG_1D::decode<0>(cfg),
710-
KCFG_1D::decode<1>(cfg));
708+
as_array(dcfg_1d_list_simple_reduction), [&](std::uint32_t cfg) {
709+
return validate(exec->get_queue(), DCFG_1D::decode<0>(cfg),
710+
DCFG_1D::decode<1>(cfg));
711711
});
712712
select_kernel_col_reduction_stage1(
713-
kcfg_1d_list_simple_reduction,
713+
dcfg_1d_list_simple_reduction,
714714
[&](std::uint32_t cfg) { return cfg == desired_cfg; },
715715
syn::value_list<bool>(), syn::value_list<int>(),
716716
syn::value_list<size_type>(), syn::type_list<>(), exec, fn, op,

dpcpp/components/prefix_sum_kernels.dp.cpp

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -51,16 +51,16 @@ namespace dpcpp {
5151
namespace components {
5252

5353

54-
static constexpr auto block_cfg_list = block_cfg_list_t();
54+
static constexpr auto block_cfg_list = dcfg_block_list_t();
5555

5656

5757
GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION_TOTYPE(start_prefix_sum,
58-
start_prefix_sum, KCFG_1D);
58+
start_prefix_sum, DCFG_1D);
5959
GKO_ENABLE_DEFAULT_CONFIG_CALL(start_prefix_sum_call, start_prefix_sum,
6060
block_cfg_list)
6161

6262
GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION_TOTYPE(finalize_prefix_sum,
63-
finalize_prefix_sum, KCFG_1D);
63+
finalize_prefix_sum, DCFG_1D);
6464
GKO_ENABLE_DEFAULT_CONFIG_CALL(finalize_prefix_sum_call, finalize_prefix_sum,
6565
block_cfg_list)
6666

@@ -75,10 +75,10 @@ void prefix_sum(std::shared_ptr<const DpcppExecutor> exec, IndexType* counts,
7575
constexpr auto block_cfg_array = as_array(block_cfg_list);
7676
const std::uint32_t cfg =
7777
get_first_cfg(block_cfg_array, [&queue](std::uint32_t cfg) {
78-
return validate(queue, KCFG_1D::decode<0>(cfg),
79-
KCFG_1D::decode<1>(cfg));
78+
return validate(queue, DCFG_1D::decode<0>(cfg),
79+
DCFG_1D::decode<1>(cfg));
8080
});
81-
const auto wg_size = KCFG_1D::decode<0>(cfg);
81+
const auto wg_size = DCFG_1D::decode<0>(cfg);
8282
auto num_blocks = ceildiv(num_entries, wg_size);
8383
array<IndexType> block_sum_array(exec, num_blocks - 1);
8484
auto block_sums = block_sum_array.get_data();

dpcpp/components/reduction.dp.hpp

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -63,8 +63,8 @@ namespace dpcpp {
6363

6464

6565
static constexpr int default_block_size = 256;
66-
static constexpr auto kcfg_1d_list = kcfg_1d_list_t();
67-
static constexpr auto kcfg_1d_array = as_array(kcfg_1d_list);
66+
static constexpr auto dcfg_1d_list = dcfg_1d_list_t();
67+
static constexpr auto dcfg_1d_array = as_array(dcfg_1d_list);
6868

6969
/**
7070
* @internal
@@ -236,10 +236,10 @@ void reduce_add_array(dim3 grid, dim3 block, size_type dynamic_shared_memory,
236236
}
237237

238238
GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION_TOTYPE(reduce_add_array_config,
239-
reduce_add_array, KCFG_1D);
239+
reduce_add_array, DCFG_1D);
240240

241241
GKO_ENABLE_DEFAULT_CONFIG_CALL(reduce_add_array_call, reduce_add_array_config,
242-
kcfg_1d_list);
242+
dcfg_1d_list);
243243

244244

245245
/**
@@ -260,14 +260,14 @@ ValueType reduce_add_array(std::shared_ptr<const DpcppExecutor> exec,
260260
auto block_results = array<ValueType>(exec);
261261
ValueType answer = zero<ValueType>();
262262
auto queue = exec->get_queue();
263-
constexpr auto kcfg_1d_array = as_array(kcfg_1d_list);
263+
constexpr auto dcfg_1d_array = as_array(dcfg_1d_list);
264264
const std::uint32_t cfg =
265-
get_first_cfg(kcfg_1d_array, [&queue](std::uint32_t cfg) {
266-
return validate(queue, KCFG_1D::decode<0>(cfg),
267-
KCFG_1D::decode<1>(cfg));
265+
get_first_cfg(dcfg_1d_array, [&queue](std::uint32_t cfg) {
266+
return validate(queue, DCFG_1D::decode<0>(cfg),
267+
DCFG_1D::decode<1>(cfg));
268268
});
269-
const auto wg_size = KCFG_1D::decode<0>(cfg);
270-
const auto sg_size = KCFG_1D::decode<1>(cfg);
269+
const auto wg_size = DCFG_1D::decode<0>(cfg);
270+
const auto sg_size = DCFG_1D::decode<1>(cfg);
271271

272272
if (size > wg_size) {
273273
const auto n = ceildiv(size, wg_size);

dpcpp/matrix/dense_kernels.dp.cpp

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -73,10 +73,10 @@ namespace dense {
7373

7474
// Disable the 64 subgroup. CPU supports 64 now, but conj_transpose will
7575
// lead CL_OUT_OF_RESOURCES. TODO: investigate this issue.
76-
constexpr auto kcfg_1d_list = kcfg_1d_list_t();
77-
constexpr auto subgroup_list = kcfg_1sg_list_t();
78-
constexpr auto kcfg_sq_list = kcfg_sq_list_t();
79-
constexpr auto kcfg_1d_array = syn::as_array(kcfg_1d_list);
76+
constexpr auto dcfg_1d_list = dcfg_1d_list_t();
77+
constexpr auto subgroup_list = dcfg_1sg_list_t();
78+
constexpr auto dcfg_sq_list = dcfg_sq_list_t();
79+
constexpr auto dcfg_1d_array = syn::as_array(dcfg_1d_list);
8080
constexpr int default_block_size = 256;
8181

8282

@@ -193,9 +193,9 @@ void conj_transpose(dim3 grid, dim3 block, size_type dynamic_shared_memory,
193193

194194

195195
GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION_TOTYPE(conj_transpose,
196-
conj_transpose, KCFG_1D);
196+
conj_transpose, DCFG_1D);
197197
GKO_ENABLE_DEFAULT_CONFIG_CALL(conj_transpose_call, conj_transpose,
198-
kcfg_sq_list);
198+
dcfg_sq_list);
199199

200200

201201
} // namespace kernel
@@ -552,7 +552,7 @@ void transpose(std::shared_ptr<const DefaultExecutor> exec,
552552
{
553553
auto queue = exec->get_queue();
554554
kernel::transpose_call(
555-
kcfg_sq_type_list_t(),
555+
dcfg_sq_type_list_t(),
556556
[&queue](auto cfg) {
557557
const auto sg_size = cfg.subgroup_size;
558558
return validate(queue, cfg.block_size, sg_size) &&
@@ -572,17 +572,17 @@ void conj_transpose(std::shared_ptr<const DefaultExecutor> exec,
572572
matrix::Dense<ValueType>* trans)
573573
{
574574
auto size = orig->get_size();
575-
auto sq_array = syn::as_array(kcfg_sq_list);
575+
auto sq_array = syn::as_array(dcfg_sq_list);
576576
auto queue = exec->get_queue();
577577
const std::uint32_t cfg =
578578
get_first_cfg(sq_array, [&queue](std::uint32_t cfg) {
579-
const auto sg_size = KCFG_1D::decode<1>(cfg);
580-
return validate(queue, KCFG_1D::decode<0>(cfg), sg_size) &&
579+
const auto sg_size = DCFG_1D::decode<1>(cfg);
580+
return validate(queue, DCFG_1D::decode<0>(cfg), sg_size) &&
581581
sg_size * (sg_size + 1) * sizeof(ValueType) <=
582582
queue->get_device()
583583
.get_info<sycl::info::device::local_mem_size>();
584584
});
585-
const auto sg_size = KCFG_1D::decode<1>(cfg);
585+
const auto sg_size = DCFG_1D::decode<1>(cfg);
586586
dim3 grid(ceildiv(size[1], sg_size), ceildiv(size[0], sg_size));
587587
dim3 block(sg_size, sg_size);
588588
kernel::conj_transpose_call(cfg, grid, block, 0, queue, size[0], size[1],

dpcpp/synthesizer/implementation_selection.hpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -114,7 +114,7 @@ namespace syn {
114114

115115

116116
#define GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION_TOTYPE(_name, _callable, \
117-
_kcfg) \
117+
_dcfg) \
118118
template <typename Predicate, bool... BoolArgs, int... IntArgs, \
119119
gko::size_type... SizeTArgs, typename... TArgs, \
120120
typename... InferredArgs> \
@@ -139,7 +139,7 @@ namespace syn {
139139
if (is_eligible(K)) { \
140140
_callable<BoolArgs..., IntArgs..., SizeTArgs..., TArgs..., \
141141
::gko::kernels::dpcpp::device_config< \
142-
_kcfg::decode<0>(K), _kcfg::decode<1>(K)>>( \
142+
_dcfg::decode<0>(K), _dcfg::decode<1>(K)>>( \
143143
std::forward<InferredArgs>(args)...); \
144144
} else { \
145145
_name(::gko::syn::value_list<std::uint32_t, Rest...>(), \

0 commit comments

Comments
 (0)