ginkgo-project
diff --git a/benchmark/utils/formats.hpp b/benchmark/utils/formats.hpp
Lines changed: 26 additions & 4 deletions
diff --git a/common/matrix/ell_kernels.hpp.inc b/common/matrix/ell_kernels.hpp.inc
Lines changed: 36 additions & 26 deletions
diff --git a/core/base/precision_dispatch.hpp b/core/base/precision_dispatch.hpp
Lines changed: 51 additions & 0 deletions
diff --git a/core/device_hooks/common_kernels.inc.cpp b/core/device_hooks/common_kernels.inc.cpp
Lines changed: 11 additions & 6 deletions
diff --git a/core/matrix/ell.cpp b/core/matrix/ell.cpp
Lines changed: 11 additions & 6 deletions
diff --git a/core/matrix/ell_kernels.hpp b/core/matrix/ell_kernels.hpp
Lines changed: 23 additions & 16 deletions
@@ -59,7 +59,8 @@ namespace formats {
 
 
 std::string available_format =
-    "coo, csr, ell, sellp, hybrid, hybrid0, hybrid25, hybrid33, hybrid40, "
+    "coo, csr, ell, fell, sellp, hybrid, hybrid0, hybrid25, hybrid33, "
+    "hybrid40, "
     "hybrid60, hybrid80, hybridlimit0, hybridlimit25, hybridlimit33, "
     "hybridminstorage"
 #ifdef HAS_CUDA
@@ -90,6 +91,9 @@ std::string format_description =
     "csrm: Ginkgo's CSR implementation with merge_path strategy.\n"
     "ell: Ellpack format according to Bell and Garland: Efficient Sparse "
     "Matrix-Vector Multiplication on CUDA.\n"
+    "fell: float Ellpack format according to Bell and Garland: Efficient "
+    "Sparse "
+    "Matrix-Vector Multiplication on CUDA.\n"
     "sellp: Sliced Ellpack uses a default block size of 32.\n"
     "hybrid: Hybrid uses ell and coo to represent the matrix.\n"
     "hybrid0, hybrid25, hybrid33, hybrid40, hybrid60, hybrid80: Hybrid uses "
@@ -204,6 +208,23 @@ const std::map<std::string, std::function<std::unique_ptr<gko::LinOp>(
         {"csrc", READ_MATRIX(csr, std::make_shared<csr::classical>())},
         {"coo", read_matrix_from_data<gko::matrix::Coo<etype>>},
         {"ell", read_matrix_from_data<gko::matrix::Ell<etype>>},
+        // "fell": ELL storage with float values. Every entry of the incoming
+        // matrix_data is copied into a float-valued matrix_data, which is then
+        // read into an Ell<float> created on the given executor.
+        {"fell",
+         [](std::shared_ptr<const gko::Executor> exec,
+            const gko::matrix_data<> &data) {
+             gko::matrix_data<float> float_data;
+             float_data.size = data.size;
+             float_data.nonzeros.resize(data.nonzeros.size());
+             // Walk source and destination in lockstep; both hold the same
+             // number of entries after the resize above.
+             auto src = data.nonzeros.begin();
+             for (auto &dst : float_data.nonzeros) {
+                 dst.row = src->row;
+                 dst.column = src->column;
+                 // explicit conversion to the float value type
+                 dst.value = static_cast<float>(src->value);
+                 ++src;
+             }
+             auto ell = gko::matrix::Ell<float>::create(std::move(exec));
+             ell->read(float_data);
+             return ell;
+         }},
 #ifdef HAS_CUDA
 #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000)
         {"cusp_csr", read_matrix_from_data<cusp_csr>},
@@ -212,8 +233,8 @@ const std::map<std::string, std::function<std::unique_ptr<gko::LinOp>(
         {"cusp_hybrid", read_matrix_from_data<cusp_hybrid>},
         {"cusp_coo", read_matrix_from_data<cusp_coo>},
         {"cusp_ell", read_matrix_from_data<cusp_ell>},
-#else // CUDA_VERSION >= 11000
-        // cusp_csr, cusp_coo use the generic ones from CUDA 11
+#else  // CUDA_VERSION >= 11000
+       // cusp_csr, cusp_coo use the generic ones from CUDA 11
         {"cusp_csr", read_matrix_from_data<cusp_gcsr>},
         {"cusp_coo", read_matrix_from_data<cusp_gcoo>},
 #endif
@@ -260,7 +281,8 @@ const std::map<std::string, std::function<std::unique_ptr<gko::LinOp>(
         {"hybridminstorage",
          READ_MATRIX(hybrid,
                      std::make_shared<hybrid::minimal_storage_limit>())},
-        {"sellp", read_matrix_from_data<gko::matrix::Sellp<etype>>}};
+        {"sellp", read_matrix_from_data<gko::matrix::Sellp<etype>>}
+};
 // clang-format on
 
 
 
@@ -34,22 +34,25 @@ namespace kernel {
 namespace {
 
 
-template <int num_thread_per_worker, bool atomic, typename ValueType,
+template <int num_thread_per_worker, bool atomic, typename InputValueType,
+          typename MatrixValueType, typename OutputValueType,
           typename IndexType, typename Closure>
 __device__ void spmv_kernel(
     const size_type num_rows, const int num_worker_per_row,
-    const ValueType *__restrict__ val, const IndexType *__restrict__ col,
+    const MatrixValueType *__restrict__ val, const IndexType *__restrict__ col,
     const size_type stride, const size_type num_stored_elements_per_row,
-    const ValueType *__restrict__ b, const size_type b_stride,
-    ValueType *__restrict__ c, const size_type c_stride, Closure op)
+    const InputValueType *__restrict__ b, const size_type b_stride,
+    OutputValueType *__restrict__ c, const size_type c_stride, Closure op)
 {
     const auto tidx = thread::get_thread_id_flat();
     const auto column_id = blockIdx.y;
+    using compute_type =
+        decltype(InputValueType{} + MatrixValueType{} + OutputValueType{});
     if (num_thread_per_worker == 1) {
         // Specialize the num_thread_per_worker = 1. It doesn't need the shared
         // memory, __syncthreads, and atomic_add
         if (tidx < num_rows) {
-            ValueType temp = zero<ValueType>();
+            auto temp = zero<compute_type>();
             for (size_type idx = 0; idx < num_stored_elements_per_row; idx++) {
                 const auto ind = tidx + idx * stride;
                 const auto col_idx = col[ind];
@@ -68,14 +71,14 @@ __device__ void spmv_kernel(
             const auto x = tidx % num_rows;
             const auto worker_id = tidx / num_rows;
             const auto step_size = num_worker_per_row * num_thread_per_worker;
-            __shared__ UninitializedArray<ValueType, default_block_size /
-                                                         num_thread_per_worker>
+            __shared__ UninitializedArray<
+                compute_type, default_block_size / num_thread_per_worker>
                 storage;
             if (idx_in_worker == 0) {
                 storage[threadIdx.x] = 0;
             }
             __syncthreads();
-            ValueType temp = zero<ValueType>();
+            auto temp = zero<compute_type>();
             for (size_type idx =
                      worker_id * num_thread_per_worker + idx_in_worker;
                  idx < num_stored_elements_per_row; idx += step_size) {
@@ -102,35 +105,41 @@ __device__ void spmv_kernel(
 }
 
 
-template <int num_thread_per_worker, bool atomic = false, typename ValueType,
-          typename IndexType>
+template <int num_thread_per_worker, bool atomic = false,
+          typename InputValueType, typename MatrixValueType,
+          typename OutputValueType, typename IndexType>
 __global__ __launch_bounds__(default_block_size) void spmv(
     const size_type num_rows, const int num_worker_per_row,
-    const ValueType *__restrict__ val, const IndexType *__restrict__ col,
+    const MatrixValueType *__restrict__ val, const IndexType *__restrict__ col,
     const size_type stride, const size_type num_stored_elements_per_row,
-    const ValueType *__restrict__ b, const size_type b_stride,
-    ValueType *__restrict__ c, const size_type c_stride)
+    const InputValueType *__restrict__ b, const size_type b_stride,
+    OutputValueType *__restrict__ c, const size_type c_stride)
 {
+    using compute_type =
+        decltype(InputValueType{} + MatrixValueType{} + OutputValueType{});
     spmv_kernel<num_thread_per_worker, atomic>(
         num_rows, num_worker_per_row, val, col, stride,
         num_stored_elements_per_row, b, b_stride, c, c_stride,
-        [](const ValueType &x, const ValueType &y) { return x; });
+        [](const compute_type &x, const OutputValueType &y) { return x; });
 }
 
 
-template <int num_thread_per_worker, bool atomic = false, typename ValueType,
-          typename IndexType>
+template <int num_thread_per_worker, bool atomic = false,
+          typename InputValueType, typename MatrixValueType,
+          typename OutputValueType, typename IndexType>
 __global__ __launch_bounds__(default_block_size) void spmv(
     const size_type num_rows, const int num_worker_per_row,
-    const ValueType *__restrict__ alpha, const ValueType *__restrict__ val,
-    const IndexType *__restrict__ col, const size_type stride,
-    const size_type num_stored_elements_per_row,
-    const ValueType *__restrict__ b, const size_type b_stride,
-    const ValueType *__restrict__ beta, ValueType *__restrict__ c,
+    const MatrixValueType *__restrict__ alpha,
+    const MatrixValueType *__restrict__ val, const IndexType *__restrict__ col,
+    const size_type stride, const size_type num_stored_elements_per_row,
+    const InputValueType *__restrict__ b, const size_type b_stride,
+    const OutputValueType *__restrict__ beta, OutputValueType *__restrict__ c,
     const size_type c_stride)
 {
-    const ValueType alpha_val = alpha[0];
-    const ValueType beta_val = beta[0];
+    using compute_type =
+        decltype(InputValueType{} + MatrixValueType{} + OutputValueType{});
+    const compute_type alpha_val = alpha[0];
+    const compute_type beta_val = beta[0];
     // Because the atomic operation changes the values of c during computation,
     // it can not do the right alpha * a * b + beta * c operation.
     // Thus, the cuda kernel only computes alpha * a * b when it uses atomic
@@ -139,15 +148,16 @@ __global__ __launch_bounds__(default_block_size) void spmv(
         spmv_kernel<num_thread_per_worker, atomic>(
             num_rows, num_worker_per_row, val, col, stride,
             num_stored_elements_per_row, b, b_stride, c, c_stride,
-            [&alpha_val](const ValueType &x, const ValueType &y) {
+            [&alpha_val](const compute_type &x, const OutputValueType &y) {
                 return alpha_val * x;
             });
     } else {
         spmv_kernel<num_thread_per_worker, atomic>(
             num_rows, num_worker_per_row, val, col, stride,
             num_stored_elements_per_row, b, b_stride, c, c_stride,
-            [&alpha_val, &beta_val](const ValueType &x, const ValueType &y) {
-                return alpha_val * x + beta_val * y;
+            [&alpha_val, &beta_val](const compute_type &x,
+                                    const OutputValueType &y) {
+                return alpha_val * x + beta_val * compute_type{y};
             });
     }
 }
 
@@ -147,6 +147,57 @@ void precision_dispatch_spmv(Function fn, const LinOp *alpha, const LinOp *in,
     }
 }
 
+
+/**
+ * Dispatches on the dynamic types of an input/output operand pair, accepting
+ * matrix::Dense in either ValueType or next_precision<ValueType> for each of
+ * the two operands independently.
+ *
+ * @tparam ValueType  the value type that is tried first for both operands
+ * @tparam Function  callable invoked as fn(dense_in, dense_out) with the
+ *                   successfully cast pointers
+ *
+ * @param fn  the callable to run on the cast operands
+ * @param in  the input operand; passed to GKO_NOT_SUPPORTED if it is neither
+ *            of the two Dense types
+ * @param out  the output operand; passed to GKO_NOT_SUPPORTED if it is
+ *             neither of the two Dense types
+ */
+template <typename ValueType, typename Function>
+void mixed_precision_dispatch(Function fn, const LinOp *in, LinOp *out)
+{
+    using same_dense = matrix::Dense<ValueType>;
+    using next_dense = matrix::Dense<next_precision<ValueType>>;
+    // The input type is resolved first, so an unsupported input is reported
+    // before the output is ever inspected.
+    if (auto in_same = dynamic_cast<const same_dense *>(in)) {
+        if (auto out_same = dynamic_cast<same_dense *>(out)) {
+            fn(in_same, out_same);
+        } else if (auto out_next = dynamic_cast<next_dense *>(out)) {
+            fn(in_same, out_next);
+        } else {
+            GKO_NOT_SUPPORTED(out);
+        }
+    } else if (auto in_next = dynamic_cast<const next_dense *>(in)) {
+        if (auto out_same = dynamic_cast<same_dense *>(out)) {
+            fn(in_next, out_same);
+        } else if (auto out_next = dynamic_cast<next_dense *>(out)) {
+            fn(in_next, out_next);
+        } else {
+            GKO_NOT_SUPPORTED(out);
+        }
+    } else {
+        GKO_NOT_SUPPORTED(in);
+    }
+}
+
+/**
+ * Variant of mixed_precision_dispatch for SpMV-like calls that takes a
+ * separate path when ValueType is not complex and `in` cannot be cast to
+ * ConvertibleTo<matrix::Dense<>>: both operands are then converted to
+ * Dense<to_complex<ValueType>> and handed to fn through their real views.
+ *
+ * @tparam ValueType  the value type used for dispatching
+ * @tparam Function  callable invoked as fn(dense_in, dense_out)
+ *
+ * @param fn  the callable to run on the cast operands
+ * @param in  the input operand
+ * @param out  the output operand
+ */
+template <typename ValueType, typename Function>
+void mixed_precision_dispatch_spmv(Function fn, const LinOp *in, LinOp *out)
+{
+    // do we need to convert complex Dense to real Dense?
+    const bool complex_to_real =
+        !is_complex<ValueType>() &&
+        dynamic_cast<const ConvertibleTo<matrix::Dense<>> *>(in) == nullptr;
+    if (!complex_to_real) {
+        // common case: plain mixed-precision dispatch on both operands
+        mixed_precision_dispatch<ValueType>(fn, in, out);
+        return;
+    }
+    using Dense = matrix::Dense<ValueType>;
+    auto complex_in = make_temporary_conversion<to_complex<ValueType>>(in);
+    auto complex_out = make_temporary_conversion<to_complex<ValueType>>(out);
+    // The dynamic_casts exist only so that this path compiles for every
+    // ValueType: with a complex ValueType it is never reached, and with a
+    // real ValueType the cast is a no-op.
+    fn(dynamic_cast<const Dense *>(complex_in->create_real_view().get()),
+       dynamic_cast<Dense *>(complex_out->create_real_view().get()));
+}
+
 }  // namespace gko
 
 
 
@@ -849,15 +849,20 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 namespace ell {
 
 
-template <typename ValueType, typename IndexType>
-GKO_DECLARE_ELL_SPMV_KERNEL(ValueType, IndexType)
+template <typename InputValueType, typename MatrixValueType,
+          typename OutputValueType, typename IndexType>
+GKO_DECLARE_ELL_SPMV_KERNEL(InputValueType, MatrixValueType, OutputValueType,
+                            IndexType)
 GKO_NOT_COMPILED(GKO_HOOK_MODULE);
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_ELL_SPMV_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_ELL_SPMV_KERNEL);
 
-template <typename ValueType, typename IndexType>
-GKO_DECLARE_ELL_ADVANCED_SPMV_KERNEL(ValueType, IndexType)
+template <typename InputValueType, typename MatrixValueType,
+          typename OutputValueType, typename IndexType>
+GKO_DECLARE_ELL_ADVANCED_SPMV_KERNEL(InputValueType, MatrixValueType,
+                                     OutputValueType, IndexType)
 GKO_NOT_COMPILED(GKO_HOOK_MODULE);
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_ELL_ADVANCED_SPMV_KERNEL);
 
 template <typename ValueType, typename IndexType>
 
@@ -102,7 +102,7 @@ size_type calculate_max_nnz_per_row(
 template <typename ValueType, typename IndexType>
 void Ell<ValueType, IndexType>::apply_impl(const LinOp *b, LinOp *x) const
 {
-    precision_dispatch_spmv<ValueType>(
+    mixed_precision_dispatch_spmv<ValueType>(
         [&](auto dense_b, auto dense_x) {
             this->get_executor()->run(ell::make_spmv(this, dense_b, dense_x));
         },
@@ -114,12 +114,17 @@ template <typename ValueType, typename IndexType>
 void Ell<ValueType, IndexType>::apply_impl(const LinOp *alpha, const LinOp *b,
                                            const LinOp *beta, LinOp *x) const
 {
-    precision_dispatch_spmv<ValueType>(
-        [&](auto dense_alpha, auto dense_b, auto dense_beta, auto dense_x) {
-            this->get_executor()->run(ell::make_advanced_spmv(
-                dense_alpha, this, dense_b, dense_beta, dense_x));
+    mixed_precision_dispatch_spmv<ValueType>(
+        [&](auto dense_b, auto dense_x) {
+            auto converted_alpha = make_temporary_conversion<ValueType>(alpha);
+            auto converted_beta =
+                make_temporary_conversion<typename std::remove_reference_t<
+                    decltype(*dense_x)>::value_type>(beta);
+            this->get_executor()->run(
+                ell::make_advanced_spmv(converted_alpha.get(), this, dense_b,
+                                        converted_beta.get(), dense_x));
         },
-        alpha, b, beta, x);
+        b, x);
 }
 
 
 
@@ -46,18 +46,21 @@ namespace gko {
 namespace kernels {
 
 
-#define GKO_DECLARE_ELL_SPMV_KERNEL(ValueType, IndexType)  \
-    void spmv(std::shared_ptr<const DefaultExecutor> exec, \
-              const matrix::Ell<ValueType, IndexType> *a,  \
-              const matrix::Dense<ValueType> *b, matrix::Dense<ValueType> *c)
-
-#define GKO_DECLARE_ELL_ADVANCED_SPMV_KERNEL(ValueType, IndexType)  \
-    void advanced_spmv(std::shared_ptr<const DefaultExecutor> exec, \
-                       const matrix::Dense<ValueType> *alpha,       \
-                       const matrix::Ell<ValueType, IndexType> *a,  \
-                       const matrix::Dense<ValueType> *b,           \
-                       const matrix::Dense<ValueType> *beta,        \
-                       matrix::Dense<ValueType> *c)
+#define GKO_DECLARE_ELL_SPMV_KERNEL(InputValueType, MatrixValueType, \
+                                    OutputValueType, IndexType)      \
+    void spmv(std::shared_ptr<const DefaultExecutor> exec,           \
+              const matrix::Ell<MatrixValueType, IndexType> *a,      \
+              const matrix::Dense<InputValueType> *b,                \
+              matrix::Dense<OutputValueType> *c)
+
+#define GKO_DECLARE_ELL_ADVANCED_SPMV_KERNEL(InputValueType, MatrixValueType, \
+                                             OutputValueType, IndexType)      \
+    void advanced_spmv(std::shared_ptr<const DefaultExecutor> exec,           \
+                       const matrix::Dense<MatrixValueType> *alpha,           \
+                       const matrix::Ell<MatrixValueType, IndexType> *a,      \
+                       const matrix::Dense<InputValueType> *b,                \
+                       const matrix::Dense<OutputValueType> *beta,            \
+                       matrix::Dense<OutputValueType> *c)
 
 #define GKO_DECLARE_ELL_CONVERT_TO_DENSE_KERNEL(ValueType, IndexType)      \
     void convert_to_dense(std::shared_ptr<const DefaultExecutor> exec,     \
@@ -87,10 +90,14 @@ namespace kernels {
                           matrix::Diagonal<ValueType> *diag)
 
 #define GKO_DECLARE_ALL_AS_TEMPLATES                                         \
-    template <typename ValueType, typename IndexType>                        \
-    GKO_DECLARE_ELL_SPMV_KERNEL(ValueType, IndexType);                       \
-    template <typename ValueType, typename IndexType>                        \
-    GKO_DECLARE_ELL_ADVANCED_SPMV_KERNEL(ValueType, IndexType);              \
+    template <typename InputValueType, typename MatrixValueType,             \
+              typename OutputValueType, typename IndexType>                  \
+    GKO_DECLARE_ELL_SPMV_KERNEL(InputValueType, MatrixValueType,             \
+                                OutputValueType, IndexType);                 \
+    template <typename InputValueType, typename MatrixValueType,             \
+              typename OutputValueType, typename IndexType>                  \
+    GKO_DECLARE_ELL_ADVANCED_SPMV_KERNEL(InputValueType, MatrixValueType,    \
+                                         OutputValueType, IndexType);        \
     template <typename ValueType, typename IndexType>                        \
     GKO_DECLARE_ELL_CONVERT_TO_DENSE_KERNEL(ValueType, IndexType);           \
     template <typename ValueType, typename IndexType>                        \