Skip to content

Commit a37c101

Browse files
authored
Merge pull request #791 from ginkgo-project/benchmarks-auto-repetitions
Merge Benchmarks auto repetitions. This PR adds the option to automatically deduce the number of repetitions a benchmark should use. Especially for small working sizes, this can lead to more consistent results. The number is chosen such that the benchmark runs at least `min_repetitions` times and until either the total runtime surpasses `min_runtime` or the number of repetitions surpasses `max_repetitions`. Additionally, the timing overhead is reduced by increasing the number of iterations between consecutive timings. These intervals increase with the factor `repetition_growth_factor`. All mentioned parameters can be adjusted through command-line flags. This behavior is NOT enabled by default; the flag `-repetitions auto` has to be used. The PR also changes the internal repetition loop in the benchmarks' implementations, using a range-based for-loop similar to the one in Google Benchmark. Related PR: #791
2 parents 3112263 + 33ff686 commit a37c101

7 files changed

Lines changed: 356 additions & 73 deletions

File tree

benchmark/blas/blas.cpp

Lines changed: 12 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -437,32 +437,34 @@ void apply_blas(const char *operation_name, std::shared_ptr<gko::Executor> exec,
437437

438438
auto op = operation_map[operation_name](exec, parse_dims(test_case));
439439

440+
auto timer = get_timer(exec, FLAGS_gpu_timer);
441+
IterationControl ic(timer);
442+
440443
// warm run
441-
for (unsigned int i = 0; i < FLAGS_warmup; i++) {
444+
for (auto _ : ic.warmup_run()) {
442445
op->prepare();
443446
exec->synchronize();
444447
op->run();
445448
exec->synchronize();
446449
}
447450

448451
// timed run
449-
auto timer = get_timer(exec, FLAGS_gpu_timer);
450-
for (unsigned int i = 0; i < FLAGS_repetitions; i++) {
451-
op->prepare();
452-
exec->synchronize();
453-
timer->tic();
452+
op->prepare();
453+
for (auto _ : ic.run()) {
454454
op->run();
455-
timer->toc();
456455
}
457-
auto runtime = timer->compute_average_time();
458-
auto flops = static_cast<double>(op->get_flops());
459-
auto mem = static_cast<double>(op->get_memory());
456+
const auto runtime = ic.compute_average_time();
457+
const auto flops = static_cast<double>(op->get_flops());
458+
const auto mem = static_cast<double>(op->get_memory());
459+
const auto repetitions = ic.get_num_repetitions();
460460
add_or_set_member(blas_case[operation_name], "time", runtime,
461461
allocator);
462462
add_or_set_member(blas_case[operation_name], "flops", flops / runtime,
463463
allocator);
464464
add_or_set_member(blas_case[operation_name], "bandwidth", mem / runtime,
465465
allocator);
466+
add_or_set_member(blas_case[operation_name], "repetitions", repetitions,
467+
allocator);
466468

467469
// compute and write benchmark data
468470
add_or_set_member(blas_case[operation_name], "completed", true,

benchmark/conversions/conversions.cpp

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -72,24 +72,25 @@ void convert_matrix(const gko::LinOp *matrix_from, const char *format_to,
7272
gko::matrix_data<etype> data{gko::dim<2>{1, 1}, 1};
7373
auto matrix_to =
7474
share(formats::matrix_factory.at(format_to)(exec, data));
75+
76+
auto timer = get_timer(exec, FLAGS_gpu_timer);
77+
IterationControl ic{timer};
78+
7579
// warm run
76-
for (unsigned int i = 0; i < FLAGS_warmup; i++) {
80+
for (auto _ : ic.warmup_run()) {
7781
exec->synchronize();
7882
matrix_to->copy_from(matrix_from);
7983
exec->synchronize();
8084
matrix_to->clear();
8185
}
82-
auto timer = get_timer(exec, FLAGS_gpu_timer);
8386
// timed run
84-
for (unsigned int i = 0; i < FLAGS_repetitions; i++) {
85-
exec->synchronize();
86-
timer->tic();
87+
for (auto _ : ic.run()) {
8788
matrix_to->copy_from(matrix_from);
88-
timer->toc();
89-
matrix_to->clear();
9089
}
9190
add_or_set_member(conversion_case[conversion_name], "time",
9291
timer->compute_average_time(), allocator);
92+
add_or_set_member(conversion_case[conversion_name], "repetitions",
93+
timer->get_num_repetitions(), allocator);
9394

9495
// compute and write benchmark data
9596
add_or_set_member(conversion_case[conversion_name], "completed", true,

benchmark/preconditioner/preconditioner.cpp

Lines changed: 18 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -159,45 +159,38 @@ void run_preconditioner(const char *precond_name,
159159
allocator);
160160
}
161161

162+
IterationControl ic_gen{get_timer(exec, FLAGS_gpu_timer)};
163+
IterationControl ic_apply{get_timer(exec, FLAGS_gpu_timer)};
164+
162165
{
163166
// fast run, gets total time
164167
auto x_clone = clone(x);
165168

166169
auto precond = precond_factory.at(precond_name)(exec);
167170

168-
for (auto i = 0u; i < FLAGS_warmup; ++i) {
171+
172+
for (auto _ : ic_apply.warmup_run()) {
169173
precond->generate(system_matrix)->apply(lend(b), lend(x_clone));
170174
}
171-
auto generate_timer = get_timer(exec, FLAGS_gpu_timer);
172-
auto apply_timer = get_timer(exec, FLAGS_gpu_timer);
173175

174-
exec->synchronize();
175-
generate_timer->tic();
176176
std::unique_ptr<gko::LinOp> precond_op;
177-
for (auto i = 0u; i < FLAGS_repetitions; ++i) {
177+
for (auto _ : ic_gen.run()) {
178178
precond_op = precond->generate(system_matrix);
179179
}
180-
generate_timer->toc();
181180

182-
// the timer is out of the loops to reduce calling synchronize
183-
// overhead, so the timer does not know the number of repetitions.
184-
auto generate_time =
185-
generate_timer->get_total_time() / FLAGS_repetitions;
186181
add_or_set_member(this_precond_data["generate"], "time",
187-
generate_time, allocator);
182+
ic_gen.compute_average_time(), allocator);
183+
add_or_set_member(this_precond_data["generate"], "repetitions",
184+
ic_gen.get_num_repetitions(), allocator);
188185

189-
exec->synchronize();
190-
apply_timer->tic();
191-
for (auto i = 0u; i < FLAGS_repetitions; ++i) {
186+
for (auto _ : ic_apply.run()) {
192187
precond_op->apply(lend(b), lend(x_clone));
193188
}
194-
apply_timer->toc();
195189

196-
// the timer is out of the loops to reduce calling synchronize
197-
// overhead, so the timer does not know the number of repetitions.
198-
auto apply_time = apply_timer->get_total_time() / FLAGS_repetitions;
199-
add_or_set_member(this_precond_data["apply"], "time", apply_time,
200-
allocator);
190+
add_or_set_member(this_precond_data["apply"], "time",
191+
ic_apply.compute_average_time(), allocator);
192+
add_or_set_member(this_precond_data["apply"], "repetitions",
193+
ic_apply.get_num_repetitions(), allocator);
201194
}
202195

203196
if (FLAGS_detailed) {
@@ -209,24 +202,24 @@ void run_preconditioner(const char *precond_name,
209202
std::make_shared<OperationLogger>(exec, FLAGS_nested_names);
210203
exec->add_logger(gen_logger);
211204
std::unique_ptr<gko::LinOp> precond_op;
212-
for (auto i = 0u; i < FLAGS_repetitions; ++i) {
205+
for (auto i = 0u; i < ic_gen.get_num_repetitions(); ++i) {
213206
precond_op = precond->generate(system_matrix);
214207
}
215208
exec->remove_logger(gko::lend(gen_logger));
216209

217210
gen_logger->write_data(this_precond_data["generate"]["components"],
218-
allocator, FLAGS_repetitions);
211+
allocator, ic_gen.get_num_repetitions());
219212

220213
auto apply_logger =
221214
std::make_shared<OperationLogger>(exec, FLAGS_nested_names);
222215
exec->add_logger(apply_logger);
223-
for (auto i = 0u; i < FLAGS_repetitions; ++i) {
216+
for (auto i = 0u; i < ic_apply.get_num_repetitions(); ++i) {
224217
precond_op->apply(lend(b), lend(x_clone));
225218
}
226219
exec->remove_logger(gko::lend(apply_logger));
227220

228221
apply_logger->write_data(this_precond_data["apply"]["components"],
229-
allocator, FLAGS_repetitions);
222+
allocator, ic_apply.get_num_repetitions());
230223
}
231224

232225
add_or_set_member(this_precond_data, "completed", true, allocator);

benchmark/solver/solver.cpp

Lines changed: 17 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -399,9 +399,11 @@ void solve_system(const std::string &solver_name,
399399
allocator);
400400
}
401401

402+
IterationControl ic{get_timer(exec, FLAGS_gpu_timer)};
403+
402404
// warm run
403405
auto it_logger = std::make_shared<IterationLogger>(exec);
404-
for (unsigned int i = 0; i < FLAGS_warmup; i++) {
406+
for (auto _ : ic.warmup_run()) {
405407
auto x_clone = clone(x);
406408
auto precond = precond_factory.at(precond_name)(exec);
407409
auto solver = generate_solver(exec, give(precond), solver_name)
@@ -472,9 +474,10 @@ void solve_system(const std::string &solver_name,
472474

473475
// timed run
474476
auto generate_timer = get_timer(exec, FLAGS_gpu_timer);
475-
auto apply_timer = get_timer(exec, FLAGS_gpu_timer);
476-
for (unsigned int i = 0; i < FLAGS_repetitions; i++) {
477-
auto x_clone = clone(x);
477+
auto apply_timer = ic.get_timer();
478+
auto x_clone = clone(x);
479+
for (auto status : ic.run(false)) {
480+
x_clone = clone(x);
478481

479482
exec->synchronize();
480483
generate_timer->tic();
@@ -487,19 +490,19 @@ void solve_system(const std::string &solver_name,
487490
apply_timer->tic();
488491
solver->apply(lend(b), lend(x_clone));
489492
apply_timer->toc();
490-
491-
if (b->get_size()[1] == 1 && i == FLAGS_repetitions - 1 &&
492-
!FLAGS_overhead) {
493-
auto residual = compute_residual_norm(lend(system_matrix),
494-
lend(b), lend(x_clone));
495-
add_or_set_member(solver_json, "residual_norm", residual,
496-
allocator);
497-
}
493+
}
494+
if (b->get_size()[1] == 1 && !FLAGS_overhead) {
495+
auto residual = compute_residual_norm(lend(system_matrix), lend(b),
496+
lend(x_clone));
497+
add_or_set_member(solver_json, "residual_norm", residual,
498+
allocator);
498499
}
499500
add_or_set_member(solver_json["generate"], "time",
500501
generate_timer->compute_average_time(), allocator);
501502
add_or_set_member(solver_json["apply"], "time",
502503
apply_timer->compute_average_time(), allocator);
504+
add_or_set_member(solver_json, "repetitions",
505+
apply_timer->get_num_repetitions(), allocator);
503506

504507
// compute and write benchmark data
505508
add_or_set_member(solver_json, "completed", true, allocator);
@@ -515,7 +518,8 @@ void solve_system(const std::string &solver_name,
515518
int main(int argc, char *argv[])
516519
{
517520
// Set the default repetitions = 1.
518-
FLAGS_repetitions = 1;
521+
FLAGS_repetitions = "1";
522+
FLAGS_min_repetitions = 1;
519523
std::string header =
520524
"A benchmark for measuring performance of Ginkgo's solvers.\n";
521525
std::string format =

benchmark/spmv/spmv.cpp

Lines changed: 11 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -91,8 +91,10 @@ void apply_spmv(const char *format_name, std::shared_ptr<gko::Executor> exec,
9191
add_or_set_member(spmv_case[format_name], "max_relative_norm2",
9292
max_relative_norm2, allocator);
9393
}
94+
95+
IterationControl ic{get_timer(exec, FLAGS_gpu_timer)};
9496
// warm run
95-
for (unsigned int i = 0; i < FLAGS_warmup; i++) {
97+
for (auto _ : ic.warmup_run()) {
9698
auto x_clone = clone(x);
9799
exec->synchronize();
98100
system_matrix->apply(lend(b), lend(x_clone));
@@ -123,12 +125,10 @@ void apply_spmv(const char *format_name, std::shared_ptr<gko::Executor> exec,
123125
// variable is used.
124126
gko::_tuned_value = val;
125127
auto tuning_timer = get_timer(exec, FLAGS_gpu_timer);
126-
for (unsigned int i = 0; i < FLAGS_repetitions; i++) {
127-
auto x_clone = clone(x);
128-
exec->synchronize();
129-
tuning_timer->tic();
128+
IterationControl ic_tuning{tuning_timer};
129+
auto x_clone = clone(x);
130+
for (auto _ : ic_tuning.run()) {
130131
system_matrix->apply(lend(b), lend(x_clone));
131-
tuning_timer->toc();
132132
}
133133
tuning_case["time"].PushBack(tuning_timer->compute_average_time(),
134134
allocator);
@@ -140,16 +140,14 @@ void apply_spmv(const char *format_name, std::shared_ptr<gko::Executor> exec,
140140
#endif // GINKGO_BENCHMARK_ENABLE_TUNING
141141

142142
// timed run
143-
auto timer = get_timer(exec, FLAGS_gpu_timer);
144-
for (unsigned int i = 0; i < FLAGS_repetitions; i++) {
145-
auto x_clone = clone(x);
146-
exec->synchronize();
147-
timer->tic();
143+
auto x_clone = clone(x);
144+
for (auto _ : ic.run()) {
148145
system_matrix->apply(lend(b), lend(x_clone));
149-
timer->toc();
150146
}
151147
add_or_set_member(spmv_case[format_name], "time",
152-
timer->compute_average_time(), allocator);
148+
ic.compute_average_time(), allocator);
149+
add_or_set_member(spmv_case[format_name], "repetitions",
150+
ic.get_num_repetitions(), allocator);
153151

154152
// compute and write benchmark data
155153
add_or_set_member(spmv_case[format_name], "completed", true, allocator);

0 commit comments

Comments
 (0)