Definition of the CGS2 version of finish_arnoldi method, for omp and …

…cuda executors. For omp, the OpenMP parallelization is being moved to the outer loop. For cuda, the loop of kernel launches is changed into a single kernel that contains the loop. * The main routines (loop of dots and loop of axpy) are still too expensive.
ginkgo-project · thoasm · Feb 21, 2021 · Apr 27, 2020 · Apr 27, 2020 · Apr 29, 2020
commit f2b5a3ae2bdc6657b382a7ffe5c0d2bec5995980
diff --git a/common/solver/gmres_mixed_kernels.hpp.inc b/common/solver/gmres_mixed_kernels.hpp.inc
@@ -252,6 +252,75 @@ __global__ __launch_bounds__(default_dot_size) void multidot_kernel_num_iters(
 }
 
 
+// For every `k < num_iters` and every column handled by this block, computes
+// the partial dot product (over this block's row range) between column `col`
+// of `next_krylov_basis` and column `k * num_cols + col` of `krylov_bases`,
+// and atomically adds it to `hessenberg_iter[k * stride_hessenberg + col]`.
+// Columns whose `stop_status` reports `has_stopped()` contribute nothing and
+// are never written.
+//
+// Indexing uses `threadIdx.x` / `threadIdx.y` together with `default_dot_dim`,
+// `blockIdx.x` to select a tile of `default_dot_dim` columns and `blockIdx.y`
+// to select a chunk of `ceildiv(num_rows, gridDim.y)` rows.
+// NOTE(review): results are accumulated with `atomic_add`, so the touched
+// entries of `hessenberg_iter` presumably have to be zeroed by the caller -
+// confirm against the launch site.
+template <typename ValueType, typename ValueTypeKrylovBases>
+__global__
+    __launch_bounds__(default_dot_size) void multidot_kernel_num_iters_new(
+        size_type num_iters, size_type num_rows, size_type num_cols,
+        const ValueType *__restrict__ next_krylov_basis,
+        size_type stride_next_krylov,
+        const ValueTypeKrylovBases *__restrict__ krylov_bases,
+        size_type stride_krylov, ValueType *__restrict__ hessenberg_iter,
+        size_type stride_hessenberg,
+        const stopping_status *__restrict__ stop_status)
+{
+    const auto tidx = threadIdx.x;
+    const auto tidy = threadIdx.y;
+    const auto col_idx = blockIdx.x * default_dot_dim + tidx;
+    const auto num = ceildiv(num_rows, gridDim.y);
+    const auto start_row = blockIdx.y * num;
+    const auto end_row =
+        ((blockIdx.y + 1) * num > num_rows) ? num_rows : (blockIdx.y + 1) * num;
+    // Used that way to get around dynamic initialization warning and
+    // template error when using `reduction_helper_array` directly in `reduce`
+    __shared__
+        UninitializedArray<ValueType, default_dot_dim *(default_dot_dim + 1)>
+            reduction_helper_array;
+    ValueType *__restrict__ reduction_helper = reduction_helper_array;
+
+    // Column this thread accumulates partial sums for.
+    const bool col_active =
+        col_idx < num_cols && !stop_status[col_idx].has_stopped();
+    // Column whose partial sums this thread's tile reduces after the
+    // transposed read from shared memory; only the thread with `tidx == 0`
+    // publishes the reduced value.
+    const auto new_col_idx = blockIdx.x * default_dot_dim + tidy;
+    const bool writes_result = tidx == 0 && new_col_idx < num_cols &&
+                               !stop_status[new_col_idx].has_stopped();
+    const auto tile_block = group::tiled_partition<default_dot_dim>(
+        group::this_thread_block());
+
+    for (size_type k = 0; k < num_iters; ++k) {
+        ValueType local_res = zero<ValueType>();
+        if (col_active) {
+            const auto krylov_col = k * num_cols + col_idx;
+            for (size_type i = start_row + tidy; i < end_row;
+                 i += default_dot_dim) {
+                local_res +=
+                    next_krylov_basis[i * stride_next_krylov + col_idx] *
+                    krylov_bases[i * stride_krylov + krylov_col];
+            }
+        }
+
+        // Everything below is executed by ALL threads of the block, including
+        // those of inactive columns (which contribute zero): the barrier must
+        // not be placed in divergent control flow, and the writer thread of a
+        // column must not depend on the status of a different column.
+        reduction_helper[tidx * (default_dot_dim + 1) + tidy] = local_res;
+        __syncthreads();
+        local_res = reduction_helper[tidy * (default_dot_dim + 1) + tidx];
+        const auto sum = reduce(
+            tile_block, local_res,
+            [](const ValueType &a, const ValueType &b) { return a + b; });
+        if (writes_result) {
+            const auto hessenberg_idx = k * stride_hessenberg + new_col_idx;
+            atomic_add(hessenberg_iter + hessenberg_idx, sum);
+        }
+        // The shared buffer is reused by the next iteration: wait until every
+        // thread has finished its transposed read before it is overwritten.
+        __syncthreads();
+    }
+}
+
+
 // Must be called with at least `num_rows * stride_next_krylov` threads in
 // total.
 template <int block_size, typename ValueType, typename ValueTypeKrylovBases>
@@ -280,6 +349,37 @@ __global__ __launch_bounds__(block_size) void update_next_krylov_kernel(
 }
 
 
+// Updates `next_krylov_basis` in place: for each entry (row, col) and each
+// `k` in `[0, num_iters)` it subtracts
+//   hessenberg_iter[k * stride_hessenberg + col] *
+//   krylov_bases[row * stride_krylov + k * num_cols + col].
+// Columns whose `stop_status` reports `has_stopped()` are left untouched.
+// Each thread handles one (row, col) pair derived from its flat thread id.
+// Must be called with at least `num_rows * stride_next_krylov` threads in
+// total.
+template <int block_size, typename ValueType, typename ValueTypeKrylovBases>
+__global__
+    __launch_bounds__(block_size) void update_next_krylov_kernel_num_iters(
+        size_type num_iters, size_type num_rows, size_type num_cols,
+        ValueType *__restrict__ next_krylov_basis, size_type stride_next_krylov,
+        const ValueTypeKrylovBases *__restrict__ krylov_bases,
+        size_type stride_krylov, const ValueType *__restrict__ hessenberg_iter,
+        size_type stride_hessenberg,
+        const stopping_status *__restrict__ stop_status)
+{
+    const auto tid = thread::get_thread_id_flat();
+    const auto row = tid / stride_next_krylov;
+    const auto col = tid % stride_next_krylov;
+
+    // Nothing to do for threads outside the matrix bounds or on a column
+    // that has already stopped.
+    if (row >= num_rows || col >= num_cols ||
+        stop_status[col].has_stopped()) {
+        return;
+    }
+
+    // Offsets that do not depend on the iteration index.
+    const auto target_idx = row * stride_next_krylov + col;
+    const auto krylov_base_idx = row * stride_krylov + col;
+
+    for (size_type iter = 0; iter < num_iters; ++iter) {
+        next_krylov_basis[target_idx] -=
+            hessenberg_iter[iter * stride_hessenberg + col] *
+            krylov_bases[krylov_base_idx + iter * num_cols];
+    }
+}
+
+
 // Must be called with at least `num_rows * stride_next_krylov` threads in
 // total.
 template <int block_size, typename ValueType, typename ValueTypeKrylovBases>
@@ -356,6 +456,43 @@ __global__ __launch_bounds__(block_size) void update_next_krylov_kernel_and_add(
 }
 
 
+// Updates `next_krylov_basis` in place using the coefficients in
+// `buffer_iter` and accumulates those coefficients into `hessenberg_iter`:
+// for each entry (row, col) and each `k` in `[0, num_iters)` it subtracts
+//   buffer_iter[k * stride_buffer + col] *
+//   krylov_bases[row * stride_krylov + k * num_cols + col]
+// from `next_krylov_basis[row * stride_next_krylov + col]`; in addition, the
+// thread handling row 0 adds `buffer_iter[k * stride_buffer + col]` to
+// `hessenberg_iter[k * stride_hessenberg + col]`.
+// Columns for which either `stop_status` or `reorth_status` reports
+// `has_stopped()` are skipped entirely.
+// Must be called with at least `num_rows * stride_next_krylov` threads in
+// total.
+template <int block_size, typename ValueType, typename ValueTypeKrylovBases>
+__global__
+    __launch_bounds__(block_size) void update_next_krylov_kernel_num_iters_and_add(
+        size_type num_iters, size_type num_rows, size_type num_cols,
+        ValueType *__restrict__ next_krylov_basis, size_type stride_next_krylov,
+        const ValueTypeKrylovBases *__restrict__ krylov_bases,
+        size_type stride_krylov, ValueType *__restrict__ hessenberg_iter,
+        size_type stride_hessenberg, const ValueType *__restrict__ buffer_iter,
+        size_type stride_buffer,
+        const stopping_status *__restrict__ stop_status,
+        const stopping_status *__restrict__ reorth_status)
+{
+    const auto tid = thread::get_thread_id_flat();
+    const auto row = tid / stride_next_krylov;
+    const auto col = tid % stride_next_krylov;
+
+    // Nothing to do for threads outside the matrix bounds or on a column
+    // flagged as stopped by either status array.
+    if (row >= num_rows || col >= num_cols ||
+        stop_status[col].has_stopped() ||
+        reorth_status[col].has_stopped()) {
+        return;
+    }
+
+    // Offsets that do not depend on the iteration index.
+    const auto target_idx = row * stride_next_krylov + col;
+    const auto krylov_base_idx = row * stride_krylov + col;
+    // Only one thread per column accumulates into `hessenberg_iter`.
+    const bool updates_hessenberg = (row == 0);
+
+    for (size_type iter = 0; iter < num_iters; ++iter) {
+        const auto coefficient = buffer_iter[iter * stride_buffer + col];
+        next_krylov_basis[target_idx] -=
+            coefficient * krylov_bases[krylov_base_idx + iter * num_cols];
+        if (updates_hessenberg) {
+            hessenberg_iter[iter * stride_hessenberg + col] += coefficient;
+        }
+    }
+}
+
+
 // Must be called with at least `num_cols` blocks, each with `block_size`
 // threads. `block_size` must be a power of 2.
 template <int block_size, typename ValueType>
@@ -396,7 +533,8 @@ __global__ __launch_bounds__(block_size) void update_hessenberg_2_kernel(
     const auto col_idx = blockIdx.x;
 
     // Used that way to get around dynamic initialization warning and
-    // template error when using `reduction_helper_array` directly in `reduce`
+    // template error when using `reduction_helper_array` directly in
+    // `reduce`
     __shared__ UninitializedArray<ValueType, block_size> reduction_helper_array;
     ValueType *__restrict__ reduction_helper = reduction_helper_array;
 
@@ -608,8 +746,8 @@ __global__ __launch_bounds__(block_size) void solve_upper_triangular_kernel(
 }
 
 
-// Must be called with at least `stride_preconditioner * num_rows` threads in
-// total.
+// Must be called with at least `stride_preconditioner * num_rows` threads
+// in total.
 template <size_type block_size, typename ValueType,
           typename ValueTypeKrylovBases>
 __global__ __launch_bounds__(block_size) void calculate_Qy_kernel(

diff --git a/core/solver/gmres_mixed.cpp b/core/solver/gmres_mixed.cpp
@@ -50,7 +50,12 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "core/solver/gmres_mixed_kernels.hpp"
 
 
+// #define TIMING 1
+
+
+#ifdef TIMING
 using double_seconds = std::chrono::duration<double>;
+#endif
 
 
 namespace gko {
@@ -161,7 +166,11 @@ void GmresMixed<ValueType, ValueTypeKrylovBases>::apply_impl(const LinOp *b,
     auto after_preconditioner =
         matrix::Dense<ValueType>::create_with_config_of(dense_x);
 
+#ifdef TIMING
     auto start = std::chrono::steady_clock::now();
+    auto time_SPMV = start - start;
+    auto time_STEP1 = start - start;
+#endif
 
     while (true) {
         ++total_iter;
@@ -220,17 +229,29 @@ void GmresMixed<ValueType, ValueTypeKrylovBases>::apply_impl(const LinOp *b,
         auto buffer_iter = buffer->create_submatrix(
             span{0, restart_iter + 2}, span{0, dense_b->get_size()[1]});
 
+#ifdef TIMING
+        auto t_aux_1 = std::chrono::steady_clock::now();
+#endif
         // Start of arnoldi
         system_matrix_->apply(preconditioned_vector.get(),
                               next_krylov_basis.get());
         // next_krylov_basis = A * preconditioned_vector
+#ifdef TIMING
+        time_SPMV += std::chrono::steady_clock::now() - t_aux_1;
+#endif
 
+#ifdef TIMING
+        auto t_aux_2 = std::chrono::steady_clock::now();
+#endif
         exec->run(gmres_mixed::make_step_1(
             next_krylov_basis.get(), givens_sin.get(), givens_cos.get(),
             residual_norm.get(), residual_norm_collection.get(),
             krylov_bases.get(), hessenberg_iter.get(), buffer_iter.get(),
             b_norm.get(), arnoldi_norm.get(), restart_iter, &final_iter_nums,
             &stop_status, &reorth_status, &num_reorth));
+#ifdef TIMING
+        time_STEP1 += std::chrono::steady_clock::now() - t_aux_2;
+#endif
         // for i in 0:restart_iter
         //     hessenberg(restart_iter, i) = next_krylov_basis' *
         //     krylov_bases(:, i) next_krylov_basis  -= hessenberg(restart_iter,
@@ -259,6 +280,9 @@ void GmresMixed<ValueType, ValueTypeKrylovBases>::apply_impl(const LinOp *b,
     }
 
     // Solve x
+#ifdef TIMING
+    auto t_aux_3 = std::chrono::steady_clock::now();
+#endif
     auto krylov_bases_small = krylov_bases->create_submatrix(
         span{0, system_matrix_->get_size()[0]},
         span{0, dense_b->get_size()[1] * (restart_iter + 1)});
@@ -270,21 +294,44 @@ void GmresMixed<ValueType, ValueTypeKrylovBases>::apply_impl(const LinOp *b,
         residual_norm_collection.get(), krylov_bases_small.get(),
         hessenberg_small.get(), y.get(), before_preconditioner.get(),
         &final_iter_nums));
+#ifdef TIMING
+    auto time_STEP2 = std::chrono::steady_clock::now() - t_aux_3;
+#endif
     // Solve upper triangular.
     // y = hessenberg \ residual_norm_collection
 
+#ifdef TIMING
+    auto t_aux_4 = std::chrono::steady_clock::now();
+#endif
     get_preconditioner()->apply(before_preconditioner.get(),
                                 after_preconditioner.get());
     dense_x->add_scaled(one_op.get(), after_preconditioner.get());
+#ifdef TIMING
+    auto time_SOLVEX = std::chrono::steady_clock::now() - t_aux_4;
+#endif
     // Solve x
     // x = x + get_preconditioner() * krylov_bases * y
 
+#ifdef TIMING
     auto time = std::chrono::steady_clock::now() - start;
     std::cout << "total_iter = " << total_iter << std::endl;
     std::cout << "time = "
               << std::chrono::duration_cast<double_seconds>(time).count()
               << std::endl;
+    std::cout << "time_SPMV = "
+              << std::chrono::duration_cast<double_seconds>(time_SPMV).count()
+              << std::endl;
+    std::cout << "time_STEP1 = "
+              << std::chrono::duration_cast<double_seconds>(time_STEP1).count()
+              << std::endl;
+    std::cout << "time_STEP2 = "
+              << std::chrono::duration_cast<double_seconds>(time_STEP2).count()
+              << std::endl;
+    std::cout << "time_SOLVEX = "
+              << std::chrono::duration_cast<double_seconds>(time_SOLVEX).count()
+              << std::endl;
     write(std::cout, lend(residual_norm));
+#endif
 }