@@ -62,72 +62,142 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
6262#include " hip/base/device_guard.hip.hpp"
6363
6464
65- #endif // HAS_CUDA
65+ #endif // HAS_HIP
6666
6767
68+ // Command-line arguments
6869DEFINE_bool (gpu_timer, false ,
6970 " use gpu timer based on event. It is valid only when "
7071 " executor is cuda or hip" );
7172
7273
74+ /* *
75+ * Timer stores the timing information of repeated tic/toc measurements
76+ */
7377class Timer {
7478public:
79+ /* *
80+ * Starts the timer. Asserts that no measurement is already running.
81+ */
7582 void tic ()
7683 {
7784 assert (tic_called_ == false );
7885 this ->tic_impl ();
7986 tic_called_ = true ;
8087 }
8188
82- std::size_t toc ()
89+ /* *
90+ * Finishes the timer and records the elapsed nanoseconds
91+ */
92+ void toc ()
8393 {
8494 assert (tic_called_ == true );
8595 auto ns = this ->toc_impl ();
8696 tic_called_ = false ;
8797 this ->add_record (ns);
88- return ns;
8998 }
9099
91- std::size_t get_total_time () { return total_duration_ns_; }
92-
93- std::size_t get_tictoc_num () { return duration_ns_.size (); }
94-
95- double get_average_time ()
100+ /* *
101+ * Get the sum of all recorded times in nanoseconds.
102+ *
103+ * @return the total time in nanoseconds
104+ */
105+ std::int64_t get_total_time () const { return total_duration_ns_; }
106+
107+ /* *
108+ * Get the number of recorded repetitions.
109+ *
110+ * @return the number of repetitions
111+ */
112+ std::int64_t get_num_repetitions () const { return duration_ns_.size (); }
113+
114+ /* *
115+ * Get the average time per repetition in nanoseconds
116+ *
117+ * @return the average time in nanoseconds (not meaningful if nothing was recorded)
118+ */
119+ double get_average_time () const
96120 {
97121 return static_cast <double >(this ->get_total_time ()) /
98- this ->get_tictoc_num ();
122+ this ->get_num_repetitions ();
123+ }
124+
125+ /* *
126+ * Get a copy of the vector containing each individual result in nanoseconds.
127+ *
128+ * @return the vector of the individual results in nanoseconds
129+ */
130+ std::vector<std::int64_t > get_time_detail () const { return duration_ns_; }
131+
132+ /* *
133+ * Get the latest result in nanoseconds. If there is no result yet, return 0.
134+ *
135+ * @return the latest result in nanoseconds
136+ */
137+ std::int64_t get_latest_time () const {
138+ if (duration_ns_.size () >= 1 ) {
139+ return duration_ns_.back ();
140+ } else {
141+ return 0 ;
142+ }
99143 }
100144
145+ /* *
146+ * Clear the results of the timer and reset the tic state
147+ */
101148 void clear ()
102149 {
103150 duration_ns_.clear ();
104151 tic_called_ = false ;
105152 total_duration_ns_ = 0 ;
106153 }
107154
155+ /* *
156+ * Create a timer with no recorded results
157+ */
108158 Timer () : tic_called_(false ), total_duration_ns_(0 ) {}
109159
110160protected:
111- void add_record (std::size_t ns)
161+ /* *
162+ * Append the nanosecond result to the vector and add it to the total
163+ */
164+ void add_record (std::int64_t ns)
112165 {
113166 // add the result;
114167 duration_ns_.emplace_back (ns);
115168 total_duration_ns_ += ns;
116169 }
117170
171+ /* *
172+ * The implementation of tic.
173+ */
118174 virtual void tic_impl () = 0;
119175
120- virtual std::size_t toc_impl () = 0;
176+ /* *
177+ * The implementation of toc. Returns the elapsed time in nanoseconds.
178+ *
179+ * @return the elapsed time in nanoseconds
180+ */
181+ virtual std::int64_t toc_impl () = 0;
121182
122183private:
123- std::vector<std::size_t > duration_ns_;
184+ std::vector<std::int64_t > duration_ns_;
124185 bool tic_called_;
125- std::size_t total_duration_ns_;
186+ std::int64_t total_duration_ns_;
126187};
127188
128189
190+ /* *
191+ * CpuTimer uses the executor's synchronize and std::chrono to measure the
192+ * timing.
193+ */
129194class CpuTimer : public Timer {
130195public:
196+ /* *
197+ * Create a CpuTimer
198+ *
199+ * @param exec Executor associated to the timer
200+ */
131201 CpuTimer (std::shared_ptr<const gko::Executor> exec) : Timer(), exec_(exec)
132202 {}
133203
@@ -138,14 +208,14 @@ class CpuTimer : public Timer {
138208 start_ = std::chrono::steady_clock::now ();
139209 }
140210
141- std::size_t toc_impl () override
211+ std::int64_t toc_impl () override
142212 {
143213 exec_->synchronize ();
144214 auto stop = std::chrono::steady_clock::now ();
145215 auto duration_time =
146216 std::chrono::duration_cast<std::chrono::nanoseconds>(stop - start_)
147217 .count ();
148- return static_cast <std:: size_t >( duration_time) ;
218+ return duration_time;
149219 }
150220
151221private:
@@ -157,12 +227,25 @@ class CpuTimer : public Timer {
157227#ifdef HAS_CUDA
158228
159229
230+ /* *
231+ * CudaTimer uses cuda executor and cudaEvent to measure the timing.
232+ */
160233class CudaTimer : public Timer {
161234public:
235+ /* *
236+ * Create a CudaTimer.
237+ *
238+ * @param exec Executor that must actually be a CudaExecutor
239+ */
162240 CudaTimer (std::shared_ptr<const gko::Executor> exec)
163241 : CudaTimer(std::dynamic_pointer_cast<const gko::CudaExecutor>(exec))
164242 {}
165243
244+ /* *
245+ * Create a CudaTimer.
246+ *
247+ * @param exec CudaExecutor associated to the timer
248+ */
166249 CudaTimer (std::shared_ptr<const gko::CudaExecutor> exec) : Timer()
167250 {
168251 assert (exec != nullptr );
@@ -181,7 +264,7 @@ class CudaTimer : public Timer {
181264 GKO_ASSERT_NO_CUDA_ERRORS (cudaEventRecord (start_));
182265 }
183266
184- std::size_t toc_impl () override
267+ std::int64_t toc_impl () override
185268 {
186269 gko::cuda::device_guard g{id_};
187270 // Currently, gko::CudaExecutor always use default stream.
@@ -192,7 +275,7 @@ class CudaTimer : public Timer {
192275 // resolution of around 0.5 microseconds
193276 GKO_ASSERT_NO_CUDA_ERRORS (
194277 cudaEventElapsedTime (&duration_time, start_, stop_));
195- return static_cast <std::size_t >(duration_time * 1e6 );
278+ return static_cast <std::int64_t >(duration_time * 1e6 );
196279 }
197280
198281private:
@@ -209,12 +292,25 @@ class CudaTimer : public Timer {
209292#ifdef HAS_HIP
210293
211294
295+ /* *
296+ * HipTimer uses hip executor and hipEvent to measure the timing.
297+ */
212298class HipTimer : public Timer {
213299public:
300+ /* *
301+ * Create a HipTimer.
302+ *
303+ * @param exec Executor that must actually be a HipExecutor
304+ */
214305 HipTimer (std::shared_ptr<const gko::Executor> exec)
215306 : HipTimer(std::dynamic_pointer_cast<const gko::HipExecutor>(exec))
216307 {}
217308
309+ /* *
310+ * Create a HipTimer.
311+ *
312+ * @param exec HipExecutor associated to the timer
313+ */
218314 HipTimer (std::shared_ptr<const gko::HipExecutor> exec) : Timer()
219315 {
220316 assert (exec != nullptr );
@@ -233,7 +329,7 @@ class HipTimer : public Timer {
233329 GKO_ASSERT_NO_HIP_ERRORS (hipEventRecord (start_));
234330 }
235331
236- std::size_t toc_impl () override
332+ std::int64_t toc_impl () override
237333 {
238334 gko::hip::device_guard g{id_};
239335 // Currently, gko::HipExecutor always use default stream.
@@ -244,7 +340,7 @@ class HipTimer : public Timer {
244340 // resolution of around 0.5 microseconds
245341 GKO_ASSERT_NO_HIP_ERRORS (
246342 hipEventElapsedTime (&duration_time, start_, stop_));
247- return static_cast <std::size_t >(duration_time * 1e6 );
343+ return static_cast <std::int64_t >(duration_time * 1e6 );
248344 }
249345
250346private:
@@ -258,6 +354,13 @@ class HipTimer : public Timer {
258354#endif // HAS_HIP
259355
260356
357+ /* *
358+ * Get the timer. If the executor does not support the gpu timer, the cpu
359+ * timer is returned instead.
360+ *
361+ * @param exec Executor associated to the timer
362+ * @param use_gpu_timer whether to use the gpu timer
363+ */
261364std::shared_ptr<Timer> get_timer (std::shared_ptr<const gko::Executor> exec,
262365 bool use_gpu_timer)
263366{
@@ -276,6 +379,6 @@ std::shared_ptr<Timer> get_timer(std::shared_ptr<const gko::Executor> exec,
276379 }
277380#endif // HAS_HIP
278381 }
279- // Not use gpu_timer or not cuda/hip executor
382+ // No cuda/hip executor available or no gpu_timer used
280383 return std::make_shared<CpuTimer>(exec);
281384}
0 commit comments