Skip to content

Commit 3ecb031

Browse files
yhmtsaiSlaedrpratikvnTerry CojeanThomas Grützmacher
committed
Review update
- fix gpu_timer in script - use int64_t - add const to function - update documentation - add get_latest_time Co-authored-by: Aditya Kashi <aditya.kashi@kit.edu> Co-authored-by: Pratik Nayak <pratikvn@protonmail.com> Co-authored-by: Terry Cojean <terry.cojean@kit.edu> Co-authored-by: Thomas Grützmacher <thomas.gruetzmacher@kit.edu>
1 parent d8a3775 commit 3ecb031

3 files changed

Lines changed: 129 additions & 24 deletions

File tree

BENCHMARKING.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -305,3 +305,5 @@ The supported environment variables are described in the following list:
305305
values as the right-hand side in solver benchmarks. Default is `unit`.
306306
* `DETAILED={0,1}` - selects whether detailed benchmarks should be run for the
307307
solver benchmarks, can be either `0` (off) or `1` (on). The default is `0`.
308+
* `GPU_TIMER={true, false}` - If set to `true`, use the gpu timer, which is
309+
valid only for the cuda/hip executors, to measure the timing. Default is `false`.

benchmark/run_all_benchmarks.sh

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -149,7 +149,7 @@ run_conversion_benchmarks() {
149149
cp "$1" "$1.imd" # make sure we're not losing the original input
150150
./conversions/conversions --backup="$1.bkp" --double_buffer="$1.bkp2" \
151151
--executor="${EXECUTOR}" --formats="${FORMATS}" \
152-
--device_id="${DEVICE_ID}" --gpu_time=${GPU_TIMER} \
152+
--device_id="${DEVICE_ID}" --gpu_timer=${GPU_TIMER} \
153153
<"$1.imd" 2>&1 >"$1"
154154
keep_latest "$1" "$1.bkp" "$1.bkp2" "$1.imd"
155155
}
@@ -165,7 +165,7 @@ run_spmv_benchmarks() {
165165
cp "$1" "$1.imd" # make sure we're not losing the original input
166166
./spmv/spmv --backup="$1.bkp" --double_buffer="$1.bkp2" \
167167
--executor="${EXECUTOR}" --formats="${FORMATS}" \
168-
--device_id="${DEVICE_ID}" --gpu_time=${GPU_TIMER} \
168+
--device_id="${DEVICE_ID}" --gpu_timer=${GPU_TIMER} \
169169
<"$1.imd" 2>&1 >"$1"
170170
keep_latest "$1" "$1.bkp" "$1.bkp2" "$1.imd"
171171
}
@@ -183,7 +183,7 @@ run_solver_benchmarks() {
183183
--executor="${EXECUTOR}" --solvers="${SOLVERS}" \
184184
--preconditioners="${PRECONDS}" \
185185
--max_iters=${SOLVERS_MAX_ITERATIONS} --rel_res_goal=${SOLVERS_PRECISION} \
186-
${SOLVERS_RHS_FLAG} ${DETAILED_STR} --device_id="${DEVICE_ID}" --gpu_time=${GPU_TIMER} \
186+
${SOLVERS_RHS_FLAG} ${DETAILED_STR} --device_id="${DEVICE_ID}" --gpu_timer=${GPU_TIMER} \
187187
<"$1.imd" 2>&1 >"$1"
188188
keep_latest "$1" "$1.bkp" "$1.bkp2" "$1.imd"
189189
}
@@ -209,7 +209,7 @@ run_preconditioner_benchmarks() {
209209
--executor="${EXECUTOR}" --preconditioners="jacobi" \
210210
--jacobi_max_block_size="${bsize}" \
211211
--jacobi_storage="${prec}" \
212-
--device_id="${DEVICE_ID}" --gpu_time=${GPU_TIMER} \
212+
--device_id="${DEVICE_ID}" --gpu_timer=${GPU_TIMER} \
213213
<"$1.imd" 2>&1 >"$1"
214214
keep_latest "$1" "$1.bkp" "$1.bkp2" "$1.imd"
215215
done

benchmark/utils/timer.hpp

Lines changed: 123 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -62,72 +62,142 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
6262
#include "hip/base/device_guard.hip.hpp"
6363

6464

65-
#endif // HAS_CUDA
65+
#endif // HAS_HIP
6666

6767

68+
// Command-line arguments
6869
DEFINE_bool(gpu_timer, false,
6970
"use gpu timer based on event. It is valid only when "
7071
"executor is cuda or hip");
7172

7273

74+
/**
75+
* Timer stores the timing information
76+
*/
7377
class Timer {
7478
public:
79+
/**
80+
* Starts the timer
81+
*/
7582
void tic()
7683
{
7784
assert(tic_called_ == false);
7885
this->tic_impl();
7986
tic_called_ = true;
8087
}
8188

82-
std::size_t toc()
89+
/**
90+
* Finishes the timer
91+
*/
92+
void toc()
8393
{
8494
assert(tic_called_ == true);
8595
auto ns = this->toc_impl();
8696
tic_called_ = false;
8797
this->add_record(ns);
88-
return ns;
8998
}
9099

91-
std::size_t get_total_time() { return total_duration_ns_; }
92-
93-
std::size_t get_tictoc_num() { return duration_ns_.size(); }
94-
95-
double get_average_time()
100+
/**
101+
* Get the sum of all recorded times in nanoseconds.
102+
*
103+
* @return the total time in nanoseconds
104+
*/
105+
std::int64_t get_total_time() const { return total_duration_ns_; }
106+
107+
/**
108+
* Get the number of repetitions.
109+
*
110+
* @return the number of repetitions
111+
*/
112+
std::int64_t get_num_repetitions() const { return duration_ns_.size(); }
113+
114+
/**
115+
* Get the average time of repetitions in nanoseconds
116+
*
117+
* @return the average time in nanoseconds
118+
*/
119+
double get_average_time() const
96120
{
97121
return static_cast<double>(this->get_total_time()) /
98-
this->get_tictoc_num();
122+
this->get_num_repetitions();
123+
}
124+
125+
/**
126+
* Get the vector containing each individual time result in nanoseconds.
127+
*
128+
* @return the vector of individual time results in nanoseconds
129+
*/
130+
std::vector<std::int64_t> get_time_detail() const { return duration_ns_; }
131+
132+
/**
133+
* Get the latest result in nanoseconds. If there is no result yet, return 0.
134+
*
135+
* @return the latest result in nanoseconds
136+
*/
137+
std::int64_t get_latest_time() const {
138+
if (duration_ns_.size() >= 1) {
139+
return duration_ns_.back();
140+
} else {
141+
return 0;
142+
}
99143
}
100144

145+
/**
146+
* Clear the results of timer
147+
*/
101148
void clear()
102149
{
103150
duration_ns_.clear();
104151
tic_called_ = false;
105152
total_duration_ns_ = 0;
106153
}
107154

155+
/**
156+
* Create a timer
157+
*/
108158
Timer() : tic_called_(false), total_duration_ns_(0) {}
109159

110160
protected:
111-
void add_record(std::size_t ns)
161+
/**
162+
* Append the nanosecond result to the vector and add it to the total time
163+
*/
164+
void add_record(std::int64_t ns)
112165
{
113166
// add the result;
114167
duration_ns_.emplace_back(ns);
115168
total_duration_ns_ += ns;
116169
}
117170

171+
/**
172+
* The implementation of tic.
173+
*/
118174
virtual void tic_impl() = 0;
119175

120-
virtual std::size_t toc_impl() = 0;
176+
/**
177+
* The implementation of toc. Return the nanoseconds result.
178+
*
179+
* @return the nanoseconds result
180+
*/
181+
virtual std::int64_t toc_impl() = 0;
121182

122183
private:
123-
std::vector<std::size_t> duration_ns_;
184+
std::vector<std::int64_t> duration_ns_;
124185
bool tic_called_;
125-
std::size_t total_duration_ns_;
186+
std::int64_t total_duration_ns_;
126187
};
127188

128189

190+
/**
191+
* CpuTimer uses the executor's synchronize and std::chrono to measure the
192+
* timing.
193+
*/
129194
class CpuTimer : public Timer {
130195
public:
196+
/**
197+
* Create a CpuTimer
198+
*
199+
* @param exec Executor associated to the timer
200+
*/
131201
CpuTimer(std::shared_ptr<const gko::Executor> exec) : Timer(), exec_(exec)
132202
{}
133203

@@ -138,14 +208,14 @@ class CpuTimer : public Timer {
138208
start_ = std::chrono::steady_clock::now();
139209
}
140210

141-
std::size_t toc_impl() override
211+
std::int64_t toc_impl() override
142212
{
143213
exec_->synchronize();
144214
auto stop = std::chrono::steady_clock::now();
145215
auto duration_time =
146216
std::chrono::duration_cast<std::chrono::nanoseconds>(stop - start_)
147217
.count();
148-
return static_cast<std::size_t>(duration_time);
218+
return duration_time;
149219
}
150220

151221
private:
@@ -157,12 +227,25 @@ class CpuTimer : public Timer {
157227
#ifdef HAS_CUDA
158228

159229

230+
/**
231+
* CudaTimer uses cuda executor and cudaEvent to measure the timing.
232+
*/
160233
class CudaTimer : public Timer {
161234
public:
235+
/**
236+
* Create a CudaTimer.
237+
*
238+
* @param exec Executor which must actually be a CudaExecutor
239+
*/
162240
CudaTimer(std::shared_ptr<const gko::Executor> exec)
163241
: CudaTimer(std::dynamic_pointer_cast<const gko::CudaExecutor>(exec))
164242
{}
165243

244+
/**
245+
* Create a CudaTimer.
246+
*
247+
* @param exec CudaExecutor associated to the timer
248+
*/
166249
CudaTimer(std::shared_ptr<const gko::CudaExecutor> exec) : Timer()
167250
{
168251
assert(exec != nullptr);
@@ -181,7 +264,7 @@ class CudaTimer : public Timer {
181264
GKO_ASSERT_NO_CUDA_ERRORS(cudaEventRecord(start_));
182265
}
183266

184-
std::size_t toc_impl() override
267+
std::int64_t toc_impl() override
185268
{
186269
gko::cuda::device_guard g{id_};
187270
// Currently, gko::CudaExecutor always uses the default stream.
@@ -192,7 +275,7 @@ class CudaTimer : public Timer {
192275
// resolution of around 0.5 microseconds
193276
GKO_ASSERT_NO_CUDA_ERRORS(
194277
cudaEventElapsedTime(&duration_time, start_, stop_));
195-
return static_cast<std::size_t>(duration_time * 1e6);
278+
return static_cast<std::int64_t>(duration_time * 1e6);
196279
}
197280

198281
private:
@@ -209,12 +292,25 @@ class CudaTimer : public Timer {
209292
#ifdef HAS_HIP
210293

211294

295+
/**
296+
* HipTimer uses hip executor and hipEvent to measure the timing.
297+
*/
212298
class HipTimer : public Timer {
213299
public:
300+
/**
301+
* Create a HipTimer.
302+
*
303+
* @param exec Executor which must actually be a HipExecutor
304+
*/
214305
HipTimer(std::shared_ptr<const gko::Executor> exec)
215306
: HipTimer(std::dynamic_pointer_cast<const gko::HipExecutor>(exec))
216307
{}
217308

309+
/**
310+
* Create a HipTimer.
311+
*
312+
* @param exec HipExecutor associated to the timer
313+
*/
218314
HipTimer(std::shared_ptr<const gko::HipExecutor> exec) : Timer()
219315
{
220316
assert(exec != nullptr);
@@ -233,7 +329,7 @@ class HipTimer : public Timer {
233329
GKO_ASSERT_NO_HIP_ERRORS(hipEventRecord(start_));
234330
}
235331

236-
std::size_t toc_impl() override
332+
std::int64_t toc_impl() override
237333
{
238334
gko::hip::device_guard g{id_};
239335
// Currently, gko::HipExecutor always uses the default stream.
@@ -244,7 +340,7 @@ class HipTimer : public Timer {
244340
// resolution of around 0.5 microseconds
245341
GKO_ASSERT_NO_HIP_ERRORS(
246342
hipEventElapsedTime(&duration_time, start_, stop_));
247-
return static_cast<std::size_t>(duration_time * 1e6);
343+
return static_cast<std::int64_t>(duration_time * 1e6);
248344
}
249345

250346
private:
@@ -258,6 +354,13 @@ class HipTimer : public Timer {
258354
#endif // HAS_HIP
259355

260356

357+
/**
358+
* Get the timer. If the executor does not support the gpu timer, fall back to the
359+
* cpu timer.
360+
*
361+
* @param exec Executor associated to the timer
362+
* @param use_gpu_timer whether to use the gpu timer
363+
*/
261364
std::shared_ptr<Timer> get_timer(std::shared_ptr<const gko::Executor> exec,
262365
bool use_gpu_timer)
263366
{
@@ -276,6 +379,6 @@ std::shared_ptr<Timer> get_timer(std::shared_ptr<const gko::Executor> exec,
276379
}
277380
#endif // HAS_HIP
278381
}
279-
// Not use gpu_timer or not cuda/hip executor
382+
// No cuda/hip executor available or no gpu_timer used
280383
return std::make_shared<CpuTimer>(exec);
281384
}

0 commit comments

Comments
 (0)