diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt
index 5f8d59a8017..f587bf1d3fa 100644
--- a/benchmark/CMakeLists.txt
+++ b/benchmark/CMakeLists.txt
@@ -11,7 +11,7 @@ function(ginkgo_benchmark_add_tuning_maybe name)
 endfunction()
 
 function(ginkgo_benchmark_cusparse_linops type def)
-    add_library(cusparse_linops_${type} utils/cuda_linops.cu)
+    add_library(cusparse_linops_${type} utils/cuda_linops.cpp)
     if(CMAKE_CUDA_COMPILER_ID STREQUAL "NVIDIA")
         # remove false positive CUDA warnings when calling one<T>() and zero<T>()
         target_compile_options(cusparse_linops_${type}
@@ -117,7 +117,7 @@ if (GINKGO_BUILD_CUDA)
     ginkgo_benchmark_cusparse_linops(s GKO_BENCHMARK_USE_SINGLE_PRECISION)
     ginkgo_benchmark_cusparse_linops(z GKO_BENCHMARK_USE_DOUBLE_COMPLEX_PRECISION)
     ginkgo_benchmark_cusparse_linops(c GKO_BENCHMARK_USE_SINGLE_COMPLEX_PRECISION)
-    add_library(cuda_timer utils/cuda_timer.cu)
+    add_library(cuda_timer utils/cuda_timer.cpp)
     target_link_libraries(cuda_timer ginkgo ${CUDA_RUNTIME_LIBS})
     target_include_directories(cuda_timer SYSTEM PRIVATE ${CUDA_INCLUDE_DIRS})
 endif()
diff --git a/benchmark/blas/CMakeLists.txt b/benchmark/blas/CMakeLists.txt
index c3e40e80bc2..2aab2331b19 100644
--- a/benchmark/blas/CMakeLists.txt
+++ b/benchmark/blas/CMakeLists.txt
@@ -1 +1,4 @@
 ginkgo_add_typed_benchmark_executables(blas "NO" blas.cpp)
+if(GINKGO_BUILD_MPI)
+    add_subdirectory(distributed)
+endif()
diff --git a/benchmark/blas/blas.cpp b/benchmark/blas/blas.cpp
index 4ed42debbbf..58e241e92db 100644
--- a/benchmark/blas/blas.cpp
+++ b/benchmark/blas/blas.cpp
@@ -33,345 +33,18 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/ginkgo.hpp>
 
 
-#include <algorithm>
-#include <chrono>
 #include <cstdlib>
-#include <exception>
 #include <iomanip>
 #include <iostream>
-#include <typeinfo>
 
 
+#include "benchmark/blas/blas_common.hpp"
 #include "benchmark/utils/general.hpp"
-#include "benchmark/utils/loggers.hpp"
-#include "benchmark/utils/timer.hpp"
+#include "benchmark/utils/generator.hpp"
 #include "benchmark/utils/types.hpp"
 
 
-// Command-line arguments
-DEFINE_string(
-    operations, "copy,axpy,scal",
-    "A comma-separated list of BLAS operations to benchmark.\nCandidates are"
-    "   copy (y = x),\n"
-    "   axpy (y = y + a * x),\n"
-    "   multiaxpy (like axpy, but a has one entry per column),\n"
-    "   scal (y = a * y),\n"
-    "   multiscal (like scal, but a has one entry per column),\n"
-    "   dot (a = x' * y),"
-    "   norm (a = sqrt(x' * x)),\n"
-    "   mm (C = A * B),\n"
-    "   gemm (C = a * A * B + b * C)\n"
-    "where A has dimensions n x k, B has dimensions k x m,\n"
-    "C has dimensions n x m and x and y have dimensions n x r");
-
-
-class BenchmarkOperation {
-public:
-    virtual ~BenchmarkOperation() = default;
-
-    virtual gko::size_type get_flops() const = 0;
-    virtual gko::size_type get_memory() const = 0;
-    virtual void prepare(){};
-    virtual void run() = 0;
-};
-
-
-class CopyOperation : public BenchmarkOperation {
-public:
-    CopyOperation(std::shared_ptr<const gko::Executor> exec,
-                  gko::size_type rows, gko::size_type cols,
-                  gko::size_type istride, gko::size_type ostride)
-    {
-        in_ = gko::matrix::Dense<etype>::create(exec, gko::dim<2>{rows, cols},
-                                                istride);
-        out_ = gko::matrix::Dense<etype>::create(exec, gko::dim<2>{rows, cols},
-                                                 ostride);
-        in_->fill(1);
-    }
-
-    gko::size_type get_flops() const override
-    {
-        return in_->get_size()[0] * in_->get_size()[1];
-    }
-
-    gko::size_type get_memory() const override
-    {
-        return in_->get_size()[0] * in_->get_size()[1] * sizeof(etype) * 2;
-    }
-
-    void run() override { in_->convert_to(lend(out_)); }
-
-private:
-    std::unique_ptr<gko::matrix::Dense<etype>> in_;
-    std::unique_ptr<gko::matrix::Dense<etype>> out_;
-};
-
-
-class AxpyOperation : public BenchmarkOperation {
-public:
-    AxpyOperation(std::shared_ptr<const gko::Executor> exec,
-                  gko::size_type rows, gko::size_type cols,
-                  gko::size_type stride_in, gko::size_type stride_out,
-                  bool multi)
-    {
-        alpha_ = gko::matrix::Dense<etype>::create(
-            exec, gko::dim<2>{1, multi ? cols : 1});
-        x_ = gko::matrix::Dense<etype>::create(exec, gko::dim<2>{rows, cols},
-                                               stride_in);
-        y_ = gko::matrix::Dense<etype>::create(exec, gko::dim<2>{rows, cols},
-                                               stride_out);
-        alpha_->fill(1);
-        x_->fill(1);
-    }
-
-    gko::size_type get_flops() const override
-    {
-        return y_->get_size()[0] * y_->get_size()[1] * 2;
-    }
-
-    gko::size_type get_memory() const override
-    {
-        return y_->get_size()[0] * y_->get_size()[1] * sizeof(etype) * 3;
-    }
-
-    void prepare() override { y_->fill(1); }
-
-    void run() override { y_->add_scaled(lend(alpha_), lend(x_)); }
-
-private:
-    std::unique_ptr<gko::matrix::Dense<etype>> alpha_;
-    std::unique_ptr<gko::matrix::Dense<etype>> x_;
-    std::unique_ptr<gko::matrix::Dense<etype>> y_;
-};
-
-
-class ScalOperation : public BenchmarkOperation {
-public:
-    ScalOperation(std::shared_ptr<const gko::Executor> exec,
-                  gko::size_type rows, gko::size_type cols,
-                  gko::size_type stride, bool multi)
-    {
-        alpha_ = gko::matrix::Dense<etype>::create(
-            exec, gko::dim<2>{1, multi ? cols : 1});
-        y_ = gko::matrix::Dense<etype>::create(exec, gko::dim<2>{rows, cols},
-                                               stride);
-        alpha_->fill(1);
-    }
-
-    gko::size_type get_flops() const override
-    {
-        return y_->get_size()[0] * y_->get_size()[1];
-    }
-
-    gko::size_type get_memory() const override
-    {
-        return y_->get_size()[0] * y_->get_size()[1] * sizeof(etype) * 2;
-    }
-
-    void prepare() override { y_->fill(1); }
-
-    void run() override { y_->scale(lend(alpha_)); }
-
-private:
-    std::unique_ptr<gko::matrix::Dense<etype>> alpha_;
-    std::unique_ptr<gko::matrix::Dense<etype>> y_;
-};
-
-
-class DotOperation : public BenchmarkOperation {
-public:
-    DotOperation(std::shared_ptr<const gko::Executor> exec, gko::size_type rows,
-                 gko::size_type cols, gko::size_type stride_x,
-                 gko::size_type stride_y)
-    {
-        alpha_ = gko::matrix::Dense<etype>::create(exec, gko::dim<2>{1, cols});
-        x_ = gko::matrix::Dense<etype>::create(exec, gko::dim<2>{rows, cols},
-                                               stride_x);
-        y_ = gko::matrix::Dense<etype>::create(exec, gko::dim<2>{rows, cols},
-                                               stride_y);
-        x_->fill(1);
-        y_->fill(1);
-    }
-
-    gko::size_type get_flops() const override
-    {
-        return y_->get_size()[0] * y_->get_size()[1] * 2;
-    }
-
-    gko::size_type get_memory() const override
-    {
-        return y_->get_size()[0] * y_->get_size()[1] * sizeof(etype) * 2;
-    }
-
-    void run() override { x_->compute_dot(lend(y_), lend(alpha_)); }
-
-private:
-    std::unique_ptr<gko::matrix::Dense<etype>> alpha_;
-    std::unique_ptr<gko::matrix::Dense<etype>> x_;
-    std::unique_ptr<gko::matrix::Dense<etype>> y_;
-};
-
-
-class NormOperation : public BenchmarkOperation {
-public:
-    NormOperation(std::shared_ptr<const gko::Executor> exec,
-                  gko::size_type rows, gko::size_type cols,
-                  gko::size_type stride)
-    {
-        alpha_ = gko::matrix::Dense<etype>::create(exec, gko::dim<2>{1, cols});
-        y_ = gko::matrix::Dense<etype>::create(exec, gko::dim<2>{rows, cols},
-                                               stride);
-        y_->fill(1);
-    }
-
-    gko::size_type get_flops() const override
-    {
-        return y_->get_size()[0] * y_->get_size()[1] * 2;
-    }
-
-    gko::size_type get_memory() const override
-    {
-        return y_->get_size()[0] * y_->get_size()[1] * sizeof(etype);
-    }
-
-    void run() override { y_->compute_norm2(lend(alpha_)); }
-
-private:
-    std::unique_ptr<gko::matrix::Dense<etype>> alpha_;
-    std::unique_ptr<gko::matrix::Dense<etype>> y_;
-};
-
-
-class ApplyOperation : public BenchmarkOperation {
-public:
-    ApplyOperation(std::shared_ptr<const gko::Executor> exec, gko::size_type n,
-                   gko::size_type k, gko::size_type m, gko::size_type stride_A,
-                   gko::size_type stride_B, gko::size_type stride_C)
-    {
-        A_ = gko::matrix::Dense<etype>::create(exec, gko::dim<2>{n, k},
-                                               stride_A);
-        B_ = gko::matrix::Dense<etype>::create(exec, gko::dim<2>{k, m},
-                                               stride_B);
-        C_ = gko::matrix::Dense<etype>::create(exec, gko::dim<2>{n, m},
-                                               stride_C);
-        A_->fill(1);
-        B_->fill(1);
-    }
-
-    gko::size_type get_flops() const override
-    {
-        return A_->get_size()[0] * A_->get_size()[1] * B_->get_size()[1] * 2;
-    }
-
-    gko::size_type get_memory() const override
-    {
-        return (A_->get_size()[0] * A_->get_size()[1] +
-                B_->get_size()[0] * B_->get_size()[1] +
-                C_->get_size()[0] * C_->get_size()[1]) *
-               sizeof(etype);
-    }
-
-    void run() override { A_->apply(lend(B_), lend(C_)); }
-
-private:
-    std::unique_ptr<gko::matrix::Dense<etype>> A_;
-    std::unique_ptr<gko::matrix::Dense<etype>> B_;
-    std::unique_ptr<gko::matrix::Dense<etype>> C_;
-};
-
-
-class AdvancedApplyOperation : public BenchmarkOperation {
-public:
-    AdvancedApplyOperation(std::shared_ptr<const gko::Executor> exec,
-                           gko::size_type n, gko::size_type k, gko::size_type m,
-                           gko::size_type stride_A, gko::size_type stride_B,
-                           gko::size_type stride_C)
-    {
-        A_ = gko::matrix::Dense<etype>::create(exec, gko::dim<2>{n, k},
-                                               stride_A);
-        B_ = gko::matrix::Dense<etype>::create(exec, gko::dim<2>{k, m},
-                                               stride_B);
-        C_ = gko::matrix::Dense<etype>::create(exec, gko::dim<2>{n, m},
-                                               stride_C);
-        alpha_ = gko::matrix::Dense<etype>::create(exec, gko::dim<2>{1, 1});
-        beta_ = gko::matrix::Dense<etype>::create(exec, gko::dim<2>{1, 1});
-        A_->fill(1);
-        B_->fill(1);
-        alpha_->fill(1);
-        beta_->fill(1);
-    }
-
-    gko::size_type get_flops() const override
-    {
-        return A_->get_size()[0] * A_->get_size()[1] * B_->get_size()[1] * 2 +
-               C_->get_size()[0] * C_->get_size()[1] * 3;
-    }
-
-    gko::size_type get_memory() const override
-    {
-        return (A_->get_size()[0] * A_->get_size()[1] +
-                B_->get_size()[0] * B_->get_size()[1] +
-                C_->get_size()[0] * C_->get_size()[1] * 2) *
-               sizeof(etype);
-    }
-
-    void run() override
-    {
-        A_->apply(lend(alpha_), lend(B_), lend(beta_), lend(C_));
-    }
-
-private:
-    std::unique_ptr<gko::matrix::Dense<etype>> alpha_;
-    std::unique_ptr<gko::matrix::Dense<etype>> beta_;
-    std::unique_ptr<gko::matrix::Dense<etype>> A_;
-    std::unique_ptr<gko::matrix::Dense<etype>> B_;
-    std::unique_ptr<gko::matrix::Dense<etype>> C_;
-};
-
-
-struct dimensions {
-    gko::size_type n;
-    gko::size_type k;
-    gko::size_type m;
-    gko::size_type r;
-    gko::size_type stride_x;
-    gko::size_type stride_y;
-    gko::size_type stride_A;
-    gko::size_type stride_B;
-    gko::size_type stride_C;
-};
-
-
-gko::size_type get_optional(rapidjson::Value& obj, const char* name,
-                            gko::size_type default_value)
-{
-    if (obj.HasMember(name)) {
-        return obj[name].GetUint64();
-    } else {
-        return default_value;
-    }
-}
-
-
-dimensions parse_dims(rapidjson::Value& test_case)
-{
-    dimensions result;
-    result.n = test_case["n"].GetInt64();
-    result.k = get_optional(test_case, "k", result.n);
-    result.m = get_optional(test_case, "m", result.n);
-    result.r = get_optional(test_case, "r", 1);
-    if (test_case.HasMember("stride")) {
-        result.stride_x = test_case["stride"].GetInt64();
-        result.stride_y = result.stride_x;
-    } else {
-        result.stride_x = get_optional(test_case, "stride_x", result.r);
-        result.stride_y = get_optional(test_case, "stride_y", result.r);
-    }
-    result.stride_A = get_optional(test_case, "stride_A", result.k);
-    result.stride_B = get_optional(test_case, "stride_B", result.m);
-    result.stride_C = get_optional(test_case, "stride_C", result.m);
-    return result;
-}
+using Generator = DefaultSystemGenerator<>;
 
 
 std::map<std::string, std::function<std::unique_ptr<BenchmarkOperation>(
@@ -379,136 +52,80 @@ std::map<std::string, std::function<std::unique_ptr<BenchmarkOperation>(
     operation_map{
         {"copy",
          [](std::shared_ptr<const gko::Executor> exec, dimensions dims) {
-             return std::make_unique<CopyOperation>(
-                 exec, dims.n, dims.r, dims.stride_x, dims.stride_y);
+             return std::make_unique<CopyOperation<Generator>>(
+                 exec, Generator{}, dims.n, dims.r, dims.stride_x,
+                 dims.stride_y);
          }},
         {"axpy",
          [](std::shared_ptr<const gko::Executor> exec, dimensions dims) {
-             return std::make_unique<AxpyOperation>(
-                 exec, dims.n, dims.r, dims.stride_x, dims.stride_y, false);
+             return std::make_unique<AxpyOperation<Generator>>(
+                 exec, Generator{}, dims.n, dims.r, dims.stride_x,
+                 dims.stride_y, false);
          }},
         {"multiaxpy",
          [](std::shared_ptr<const gko::Executor> exec, dimensions dims) {
-             return std::make_unique<AxpyOperation>(
-                 exec, dims.n, dims.r, dims.stride_x, dims.stride_y, true);
+             return std::make_unique<AxpyOperation<Generator>>(
+                 exec, Generator{}, dims.n, dims.r, dims.stride_x,
+                 dims.stride_y, true);
          }},
         {"scal",
          [](std::shared_ptr<const gko::Executor> exec, dimensions dims) {
-             return std::make_unique<ScalOperation>(exec, dims.n, dims.r,
-                                                    dims.stride_y, false);
+             return std::make_unique<ScalOperation<Generator>>(
+                 exec, Generator{}, dims.n, dims.r, dims.stride_y, false);
          }},
         {"multiscal",
          [](std::shared_ptr<const gko::Executor> exec, dimensions dims) {
-             return std::make_unique<ScalOperation>(exec, dims.n, dims.r,
-                                                    dims.stride_y, true);
+             return std::make_unique<ScalOperation<Generator>>(
+                 exec, Generator{}, dims.n, dims.r, dims.stride_y, true);
          }},
         {"dot",
          [](std::shared_ptr<const gko::Executor> exec, dimensions dims) {
-             return std::make_unique<DotOperation>(
-                 exec, dims.n, dims.r, dims.stride_x, dims.stride_y);
+             return std::make_unique<DotOperation<Generator>>(
+                 exec, Generator{}, dims.n, dims.r, dims.stride_x,
+                 dims.stride_y);
          }},
         {"norm",
          [](std::shared_ptr<const gko::Executor> exec, dimensions dims) {
-             return std::make_unique<NormOperation>(exec, dims.n, dims.r,
-                                                    dims.stride_y);
+             return std::make_unique<NormOperation<Generator>>(
+                 exec, Generator{}, dims.n, dims.r, dims.stride_y);
          }},
         {"mm",
          [](std::shared_ptr<const gko::Executor> exec, dimensions dims) {
-             return std::make_unique<ApplyOperation>(
-                 exec, dims.n, dims.k, dims.m, dims.stride_A, dims.stride_B,
-                 dims.stride_C);
+             return std::make_unique<ApplyOperation<Generator>>(
+                 exec, Generator{}, dims.n, dims.k, dims.m, dims.stride_A,
+                 dims.stride_B, dims.stride_C);
          }},
         {"gemm",
          [](std::shared_ptr<const gko::Executor> exec, dimensions dims) {
-             return std::make_unique<AdvancedApplyOperation>(
-                 exec, dims.n, dims.k, dims.m, dims.stride_A, dims.stride_B,
-                 dims.stride_C);
+             return std::make_unique<AdvancedApplyOperation<Generator>>(
+                 exec, Generator{}, dims.n, dims.k, dims.m, dims.stride_A,
+                 dims.stride_B, dims.stride_C);
          }}};
 
 
-void apply_blas(const char* operation_name, std::shared_ptr<gko::Executor> exec,
-                rapidjson::Value& test_case,
-                rapidjson::MemoryPoolAllocator<>& allocator)
-{
-    try {
-        auto& blas_case = test_case["blas"];
-        add_or_set_member(blas_case, operation_name,
-                          rapidjson::Value(rapidjson::kObjectType), allocator);
-
-        auto op = operation_map[operation_name](exec, parse_dims(test_case));
-
-        auto timer = get_timer(exec, FLAGS_gpu_timer);
-        IterationControl ic(timer);
-
-        // warm run
-        for (auto _ : ic.warmup_run()) {
-            op->prepare();
-            exec->synchronize();
-            op->run();
-            exec->synchronize();
-        }
-
-        // timed run
-        op->prepare();
-        for (auto _ : ic.run()) {
-            op->run();
-        }
-        const auto runtime = ic.compute_average_time();
-        const auto flops = static_cast<double>(op->get_flops());
-        const auto mem = static_cast<double>(op->get_memory());
-        const auto repetitions = ic.get_num_repetitions();
-        add_or_set_member(blas_case[operation_name], "time", runtime,
-                          allocator);
-        add_or_set_member(blas_case[operation_name], "flops", flops / runtime,
-                          allocator);
-        add_or_set_member(blas_case[operation_name], "bandwidth", mem / runtime,
-                          allocator);
-        add_or_set_member(blas_case[operation_name], "repetitions", repetitions,
-                          allocator);
-
-        // compute and write benchmark data
-        add_or_set_member(blas_case[operation_name], "completed", true,
-                          allocator);
-    } catch (const std::exception& e) {
-        add_or_set_member(test_case["blas"][operation_name], "completed", false,
-                          allocator);
-        if (FLAGS_keep_errors) {
-            rapidjson::Value msg_value;
-            msg_value.SetString(e.what(), allocator);
-            add_or_set_member(test_case["blas"][operation_name], "error",
-                              msg_value, allocator);
-        }
-        std::cerr << "Error when processing test case " << test_case << "\n"
-                  << "what(): " << e.what() << std::endl;
-    }
-}
-
-
 int main(int argc, char* argv[])
 {
-    std::string header =
-        "A benchmark for measuring performance of Ginkgo's BLAS-like "
-        "operations.\nParameters for a benchmark case are:\n"
-        "    n: number of rows for vectors and gemm output (required)\n"
-        "    r: number of columns for vectors (optional, default 1)\n"
-        "    m: number of columns for gemm output (optional, default n)\n"
-        "    k: inner dimension of the gemm (optional, default n)\n"
-        "    stride: storage stride for both vectors (optional, default r)\n"
-        "    stride_x: stride for input vector x (optional, default r)\n"
-        "    stride_y: stride for in/out vector y (optional, default r)\n"
-        "    stride_A: stride for A matrix in gemm (optional, default k)\n"
-        "    stride_B: stride for B matrix in gemm (optional, default m)\n"
-        "    stride_C: stride for C matrix in gemm (optional, default m)\n";
-    std::string format = std::string() + "  [\n    { \"n\": 100 },\n" +
-                         "    { \"n\": 200, \"m\": 200, \"k\": 200 }\n" +
-                         "  ]\n\n";
+    std::string header = R"("
+A benchmark for measuring performance of Ginkgo's BLAS-like "
+operations.
+Parameters for a benchmark case are:
+    n: number of rows for vectors and gemm output (required)
+    r: number of columns for vectors (optional, default 1)
+    m: number of columns for gemm output (optional, default n)
+    k: inner dimension of the gemm (optional, default n)
+    stride: storage stride for both vectors (optional, default r)
+    stride_x: stride for input vector x (optional, default r)
+    stride_y: stride for in/out vector y (optional, default r)
+    stride_A: stride for A matrix in gemm (optional, default k)
+    stride_B: stride for B matrix in gemm (optional, default m)
+    stride_C: stride for C matrix in gemm (optional, default m)
+)";
+    std::string format = example_config;
     initialize_argument_parsing(&argc, &argv, header, format);
 
     std::string extra_information = "The operations are " + FLAGS_operations;
     print_general_information(extra_information);
     auto exec = executor_factory.at(FLAGS_executor)(FLAGS_gpu_timer);
-    auto engine = get_engine();
-    auto operations = split(FLAGS_operations, ',');
 
     rapidjson::IStreamWrapper jcin(std::cin);
     rapidjson::Document test_cases;
@@ -520,37 +137,7 @@ int main(int argc, char* argv[])
         std::exit(1);
     }
 
-    auto& allocator = test_cases.GetAllocator();
-
-    for (auto& test_case : test_cases.GetArray()) {
-        try {
-            // set up benchmark
-            if (!test_case.HasMember("blas")) {
-                test_case.AddMember("blas",
-                                    rapidjson::Value(rapidjson::kObjectType),
-                                    allocator);
-            }
-            auto& blas_case = test_case["blas"];
-            if (!FLAGS_overwrite &&
-                all_of(begin(operations), end(operations),
-                       [&blas_case](const std::string& s) {
-                           return blas_case.HasMember(s.c_str());
-                       })) {
-                continue;
-            }
-            std::clog << "Running test case: " << test_case << std::endl;
-
-            for (const auto& operation_name : operations) {
-                apply_blas(operation_name.c_str(), exec, test_case, allocator);
-                std::clog << "Current state:" << std::endl
-                          << test_cases << std::endl;
-                backup_results(test_cases);
-            }
-        } catch (const std::exception& e) {
-            std::cerr << "Error setting up benchmark, what(): " << e.what()
-                      << std::endl;
-        }
-    }
+    run_blas_benchmarks(exec, operation_map, test_cases, true);
 
     std::cout << test_cases << std::endl;
 }
diff --git a/benchmark/blas/blas_common.hpp b/benchmark/blas/blas_common.hpp
new file mode 100644
index 00000000000..b6533aa820c
--- /dev/null
+++ b/benchmark/blas/blas_common.hpp
@@ -0,0 +1,515 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include <ginkgo/ginkgo.hpp>
+
+
+#include <algorithm>
+#include <chrono>
+#include <cstdlib>
+#include <exception>
+#include <iomanip>
+#include <iostream>
+#include <typeinfo>
+
+
+#include "benchmark/utils/general.hpp"
+#include "benchmark/utils/loggers.hpp"
+#include "benchmark/utils/timer.hpp"
+#include "benchmark/utils/types.hpp"
+
+
+// Command-line arguments
+DEFINE_string(
+    operations, "copy,axpy,scal",
+    "A comma-separated list of BLAS operations to benchmark.\nCandidates are"
+    "   copy (y = x),\n"
+    "   axpy (y = y + a * x),\n"
+    "   multiaxpy (like axpy, but a has one entry per column),\n"
+    "   scal (y = a * y),\n"
+    "   multiscal (like scal, but a has one entry per column),\n"
+    "   dot (a = x' * y),"
+    "   norm (a = sqrt(x' * x)),\n"
+    "   mm (C = A * B),\n"
+    "   gemm (C = a * A * B + b * C)\n"
+    "where A has dimensions n x k, B has dimensions k x m,\n"
+    "C has dimensions n x m and x and y have dimensions n x r");
+
+
+std::string example_config = R"(
+  [
+    { "n": 100 },
+    { "n": 200, "m": 200, "k": 200 }
+  ]
+)";
+
+
+class BenchmarkOperation {
+public:
+    virtual ~BenchmarkOperation() = default;
+
+    virtual gko::size_type get_flops() const = 0;
+    virtual gko::size_type get_memory() const = 0;
+    virtual void prepare(){};
+    virtual void run() = 0;
+};
+
+template <typename Generator, typename T>
+auto as_vector(T& p)
+{
+    return gko::as<typename Generator::Vec>(gko::lend(p));
+}
+
+
+template <typename Generator>
+class CopyOperation : public BenchmarkOperation {
+public:
+    CopyOperation(std::shared_ptr<const gko::Executor> exec,
+                  const Generator& generator, gko::size_type rows,
+                  gko::size_type cols, gko::size_type stride_in,
+                  gko::size_type stride_out)
+    {
+        in_ = generator.create_multi_vector_strided(
+            exec, gko::dim<2>{rows, cols}, stride_in);
+        out_ = generator.create_multi_vector_strided(
+            exec, gko::dim<2>{rows, cols}, stride_out);
+        as_vector<Generator>(in_)->fill(1);
+    }
+
+    gko::size_type get_flops() const override
+    {
+        return in_->get_size()[0] * in_->get_size()[1];
+    }
+
+    gko::size_type get_memory() const override
+    {
+        return in_->get_size()[0] * in_->get_size()[1] * sizeof(etype) * 2;
+    }
+
+    void run() override
+    {
+        as_vector<Generator>(in_)->convert_to(as_vector<Generator>(out_));
+    }
+
+private:
+    std::unique_ptr<gko::LinOp> in_;
+    std::unique_ptr<gko::LinOp> out_;
+};
+
+
+template <typename Generator>
+class AxpyOperation : public BenchmarkOperation {
+public:
+    AxpyOperation(std::shared_ptr<const gko::Executor> exec,
+                  const Generator& generator, gko::size_type rows,
+                  gko::size_type cols, gko::size_type stride_in,
+                  gko::size_type stride_out, bool multi)
+    {
+        alpha_ = gko::matrix::Dense<etype>::create(
+            exec, gko::dim<2>{1, multi ? cols : 1});
+        x_ = generator.create_multi_vector_strided(
+            exec, gko::dim<2>{rows, cols}, stride_in);
+        y_ = generator.create_multi_vector_strided(
+            exec, gko::dim<2>{rows, cols}, stride_out);
+        alpha_->fill(1);
+        as_vector<Generator>(x_)->fill(1);
+    }
+
+    gko::size_type get_flops() const override
+    {
+        return y_->get_size()[0] * y_->get_size()[1] * 2;
+    }
+
+    gko::size_type get_memory() const override
+    {
+        return y_->get_size()[0] * y_->get_size()[1] * sizeof(etype) * 3;
+    }
+
+    void prepare() override { as_vector<Generator>(y_)->fill(1); }
+
+    void run() override
+    {
+        as_vector<Generator>(y_)->add_scaled(lend(alpha_), lend(x_));
+    }
+
+private:
+    std::unique_ptr<gko::matrix::Dense<etype>> alpha_;
+    std::unique_ptr<gko::LinOp> x_;
+    std::unique_ptr<gko::LinOp> y_;
+};
+
+
+template <typename Generator>
+class ScalOperation : public BenchmarkOperation {
+public:
+    ScalOperation(std::shared_ptr<const gko::Executor> exec,
+                  const Generator& generator, gko::size_type rows,
+                  gko::size_type cols, gko::size_type stride, bool multi)
+    {
+        alpha_ = gko::matrix::Dense<etype>::create(
+            exec, gko::dim<2>{1, multi ? cols : 1});
+        y_ = generator.create_multi_vector_strided(
+            exec, gko::dim<2>{rows, cols}, stride);
+        alpha_->fill(1);
+    }
+
+    gko::size_type get_flops() const override
+    {
+        return y_->get_size()[0] * y_->get_size()[1];
+    }
+
+    gko::size_type get_memory() const override
+    {
+        return y_->get_size()[0] * y_->get_size()[1] * sizeof(etype) * 2;
+    }
+
+    void prepare() override { as_vector<Generator>(y_)->fill(1); }
+
+    void run() override { as_vector<Generator>(y_)->scale(lend(alpha_)); }
+
+private:
+    std::unique_ptr<gko::matrix::Dense<etype>> alpha_;
+    std::unique_ptr<gko::LinOp> y_;
+};
+
+
+template <typename Generator>
+class DotOperation : public BenchmarkOperation {
+public:
+    DotOperation(std::shared_ptr<const gko::Executor> exec,
+                 const Generator& generator, gko::size_type rows,
+                 gko::size_type cols, gko::size_type stride_x,
+                 gko::size_type stride_y)
+    {
+        alpha_ = gko::matrix::Dense<etype>::create(exec, gko::dim<2>{1, cols});
+        x_ = generator.create_multi_vector_strided(
+            exec, gko::dim<2>{rows, cols}, stride_x);
+        y_ = generator.create_multi_vector_strided(
+            exec, gko::dim<2>{rows, cols}, stride_y);
+        as_vector<Generator>(x_)->fill(1);
+        as_vector<Generator>(y_)->fill(1);
+    }
+
+    gko::size_type get_flops() const override
+    {
+        return y_->get_size()[0] * y_->get_size()[1] * 2;
+    }
+
+    gko::size_type get_memory() const override
+    {
+        return y_->get_size()[0] * y_->get_size()[1] * sizeof(etype) * 2;
+    }
+
+    void run() override
+    {
+        as_vector<Generator>(x_)->compute_dot(lend(y_), lend(alpha_));
+    }
+
+private:
+    std::unique_ptr<gko::matrix::Dense<etype>> alpha_;
+    std::unique_ptr<gko::LinOp> x_;
+    std::unique_ptr<gko::LinOp> y_;
+};
+
+
+template <typename Generator>
+class NormOperation : public BenchmarkOperation {
+public:
+    NormOperation(std::shared_ptr<const gko::Executor> exec,
+                  const Generator& generator, gko::size_type rows,
+                  gko::size_type cols, gko::size_type stride)
+    {
+        alpha_ = gko::matrix::Dense<etype>::create(exec, gko::dim<2>{1, cols});
+        y_ = generator.create_multi_vector_strided(
+            exec, gko::dim<2>{rows, cols}, stride);
+        as_vector<Generator>(y_)->fill(1);
+    }
+
+    gko::size_type get_flops() const override
+    {
+        return y_->get_size()[0] * y_->get_size()[1] * 2;
+    }
+
+    gko::size_type get_memory() const override
+    {
+        return y_->get_size()[0] * y_->get_size()[1] * sizeof(etype);
+    }
+
+    void run() override
+    {
+        as_vector<Generator>(y_)->compute_norm2(lend(alpha_));
+    }
+
+private:
+    std::unique_ptr<gko::matrix::Dense<etype>> alpha_;
+    std::unique_ptr<gko::LinOp> y_;
+};
+
+
+template <typename Generator>
+class ApplyOperation : public BenchmarkOperation {
+public:
+    ApplyOperation(std::shared_ptr<const gko::Executor> exec,
+                   const Generator& generator, gko::size_type n,
+                   gko::size_type k, gko::size_type m, gko::size_type stride_A,
+                   gko::size_type stride_B, gko::size_type stride_C)
+    {
+        A_ = generator.create_multi_vector_strided(exec, gko::dim<2>{n, k},
+                                                   stride_A);
+        B_ = generator.create_multi_vector_strided(exec, gko::dim<2>{k, m},
+                                                   stride_B);
+        C_ = generator.create_multi_vector_strided(exec, gko::dim<2>{n, m},
+                                                   stride_C);
+        as_vector<Generator>(A_)->fill(1);
+        as_vector<Generator>(B_)->fill(1);
+    }
+
+    gko::size_type get_flops() const override
+    {
+        return A_->get_size()[0] * A_->get_size()[1] * B_->get_size()[1] * 2;
+    }
+
+    gko::size_type get_memory() const override
+    {
+        return (A_->get_size()[0] * A_->get_size()[1] +
+                B_->get_size()[0] * B_->get_size()[1] +
+                C_->get_size()[0] * C_->get_size()[1]) *
+               sizeof(etype);
+    }
+
+    void run() override { A_->apply(lend(B_), lend(C_)); }
+
+private:
+    std::unique_ptr<gko::LinOp> A_;
+    std::unique_ptr<gko::LinOp> B_;
+    std::unique_ptr<gko::LinOp> C_;
+};
+
+
+template <typename Generator>
+class AdvancedApplyOperation : public BenchmarkOperation {
+public:
+    AdvancedApplyOperation(std::shared_ptr<const gko::Executor> exec,
+                           const Generator& generator, gko::size_type n,
+                           gko::size_type k, gko::size_type m,
+                           gko::size_type stride_A, gko::size_type stride_B,
+                           gko::size_type stride_C)
+    {
+        A_ = generator.create_multi_vector_strided(exec, gko::dim<2>{n, k},
+                                                   stride_A);
+        B_ = generator.create_multi_vector_strided(exec, gko::dim<2>{k, m},
+                                                   stride_B);
+        C_ = generator.create_multi_vector_strided(exec, gko::dim<2>{n, m},
+                                                   stride_C);
+        alpha_ = gko::matrix::Dense<etype>::create(exec, gko::dim<2>{1, 1});
+        beta_ = gko::matrix::Dense<etype>::create(exec, gko::dim<2>{1, 1});
+        as_vector<Generator>(A_)->fill(1);
+        as_vector<Generator>(B_)->fill(1);
+        alpha_->fill(1);
+        beta_->fill(1);
+    }
+
+    gko::size_type get_flops() const override
+    {
+        return A_->get_size()[0] * A_->get_size()[1] * B_->get_size()[1] * 2 +
+               C_->get_size()[0] * C_->get_size()[1] * 3;
+    }
+
+    gko::size_type get_memory() const override
+    {
+        return (A_->get_size()[0] * A_->get_size()[1] +
+                B_->get_size()[0] * B_->get_size()[1] +
+                C_->get_size()[0] * C_->get_size()[1] * 2) *
+               sizeof(etype);
+    }
+
+    void run() override
+    {
+        A_->apply(lend(alpha_), lend(B_), lend(beta_), lend(C_));
+    }
+
+private:
+    std::unique_ptr<gko::matrix::Dense<etype>> alpha_;
+    std::unique_ptr<gko::matrix::Dense<etype>> beta_;
+    std::unique_ptr<gko::LinOp> A_;
+    std::unique_ptr<gko::LinOp> B_;
+    std::unique_ptr<gko::LinOp> C_;
+};
+
+
+struct dimensions {
+    gko::size_type n;
+    gko::size_type k;
+    gko::size_type m;
+    gko::size_type r;
+    gko::size_type stride_x;
+    gko::size_type stride_y;
+    gko::size_type stride_A;
+    gko::size_type stride_B;
+    gko::size_type stride_C;
+};
+
+
+dimensions parse_dims(rapidjson::Value& test_case)
+{
+    auto get_optional = [](rapidjson::Value& obj, const char* name,
+                           gko::size_type default_value) -> gko::size_type {
+        if (obj.HasMember(name)) {
+            return obj[name].GetUint64();
+        } else {
+            return default_value;
+        }
+    };
+
+    dimensions result;
+    result.n = test_case["n"].GetInt64();
+    result.k = get_optional(test_case, "k", result.n);
+    result.m = get_optional(test_case, "m", result.n);
+    result.r = get_optional(test_case, "r", 1);
+    if (test_case.HasMember("stride")) {
+        result.stride_x = test_case["stride"].GetInt64();
+        result.stride_y = result.stride_x;
+    } else {
+        result.stride_x = get_optional(test_case, "stride_x", result.r);
+        result.stride_y = get_optional(test_case, "stride_y", result.r);
+    }
+    result.stride_A = get_optional(test_case, "stride_A", result.k);
+    result.stride_B = get_optional(test_case, "stride_B", result.m);
+    result.stride_C = get_optional(test_case, "stride_C", result.m);
+    return result;
+}
+
+
+template <typename OpMap>
+void apply_blas(const char* operation_name, std::shared_ptr<gko::Executor> exec,
+                const OpMap& operation_map, rapidjson::Value& test_case,
+                rapidjson::MemoryPoolAllocator<>& allocator)
+{
+    try {
+        auto& blas_case = test_case["blas"];
+        add_or_set_member(blas_case, operation_name,
+                          rapidjson::Value(rapidjson::kObjectType), allocator);
+
+        auto op = operation_map.at(operation_name)(exec, parse_dims(test_case));
+
+        auto timer = get_timer(exec, FLAGS_gpu_timer);
+        IterationControl ic(timer);
+
+        // warm run
+        for (auto _ : ic.warmup_run()) {
+            op->prepare();
+            exec->synchronize();
+            op->run();
+            exec->synchronize();
+        }
+
+        // timed run
+        op->prepare();
+        for (auto _ : ic.run()) {
+            op->run();
+        }
+        const auto runtime = ic.compute_average_time();
+        const auto flops = static_cast<double>(op->get_flops());
+        const auto mem = static_cast<double>(op->get_memory());
+        const auto repetitions = ic.get_num_repetitions();
+        add_or_set_member(blas_case[operation_name], "time", runtime,
+                          allocator);
+        add_or_set_member(blas_case[operation_name], "flops", flops / runtime,
+                          allocator);
+        add_or_set_member(blas_case[operation_name], "bandwidth", mem / runtime,
+                          allocator);
+        add_or_set_member(blas_case[operation_name], "repetitions", repetitions,
+                          allocator);
+
+        // compute and write benchmark data
+        add_or_set_member(blas_case[operation_name], "completed", true,
+                          allocator);
+    } catch (const std::exception& e) {
+        add_or_set_member(test_case["blas"][operation_name], "completed", false,
+                          allocator);
+        if (FLAGS_keep_errors) {
+            rapidjson::Value msg_value;
+            msg_value.SetString(e.what(), allocator);
+            add_or_set_member(test_case["blas"][operation_name], "error",
+                              msg_value, allocator);
+        }
+        std::cerr << "Error when processing test case " << test_case << "\n"
+                  << "what(): " << e.what() << std::endl;
+    }
+}
+
+
+template <typename OpMap>
+void run_blas_benchmarks(std::shared_ptr<gko::Executor> exec,
+                         const OpMap& operation_map,
+                         rapidjson::Document& test_cases, bool do_print)
+{
+    auto operations = split(FLAGS_operations, ',');
+    auto& allocator = test_cases.GetAllocator();
+
+    for (auto& test_case : test_cases.GetArray()) {
+        try {
+            // set up benchmark
+            if (!test_case.HasMember("blas")) {
+                test_case.AddMember("blas",
+                                    rapidjson::Value(rapidjson::kObjectType),
+                                    allocator);
+            }
+            auto& blas_case = test_case["blas"];
+            if (!FLAGS_overwrite &&
+                all_of(begin(operations), end(operations),
+                       [&blas_case](const std::string& s) {
+                           return blas_case.HasMember(s.c_str());
+                       })) {
+                continue;
+            }
+            if (do_print) {
+                std::clog << "Running test case: " << test_case << std::endl;
+            }
+
+            for (const auto& operation_name : operations) {
+                apply_blas(operation_name.c_str(), exec, operation_map,
+                           test_case, allocator);
+
+                if (do_print) {
+                    std::clog << "Current state:" << std::endl
+                              << test_cases << std::endl;
+
+                    backup_results(test_cases);
+                }
+            }
+        } catch (const std::exception& e) {
+            std::cerr << "Error setting up benchmark, what(): " << e.what()
+                      << std::endl;
+        }
+    }
+}
diff --git a/benchmark/blas/distributed/CMakeLists.txt b/benchmark/blas/distributed/CMakeLists.txt
new file mode 100644
index 00000000000..1371294efb8
--- /dev/null
+++ b/benchmark/blas/distributed/CMakeLists.txt
@@ -0,0 +1 @@
+ginkgo_add_typed_benchmark_executables(multi-vector-distributed "NO" multi_vector.cpp)
diff --git a/benchmark/blas/distributed/multi_vector.cpp b/benchmark/blas/distributed/multi_vector.cpp
new file mode 100644
index 00000000000..147bcc649d0
--- /dev/null
+++ b/benchmark/blas/distributed/multi_vector.cpp
@@ -0,0 +1,135 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include <ginkgo/ginkgo.hpp>
+
+
+#include <cstdlib>
+#include <iomanip>
+#include <iostream>
+
+
+#include "benchmark/blas/blas_common.hpp"
+#include "benchmark/utils/general.hpp"
+#include "benchmark/utils/generator.hpp"
+#include "benchmark/utils/types.hpp"
+
+using Generator = DistributedDefaultSystemGenerator<DefaultSystemGenerator<>>;
+
+
+int main(int argc, char* argv[])
+{
+    gko::experimental::mpi::environment mpi_env{argc, argv};
+
+    std::string header = R"("
+A benchmark for measuring performance of Ginkgo's BLAS-like "
+operations.
+Parameters for a benchmark case are:
+    n: number of rows for vectors output (required)
+    r: number of columns for vectors (optional, default 1)
+    stride: storage stride for both vectors (optional, default r)
+    stride_x: stride for input vector x (optional, default r)
+    stride_y: stride for in/out vector y (optional, default r)
+)";
+    std::string format = example_config;
+    initialize_argument_parsing(&argc, &argv, header, format);
+
+    std::string extra_information = "The operations are " + FLAGS_operations;
+    print_general_information(extra_information);
+
+    const auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD);
+    const auto rank = comm.rank();
+
+    auto exec = executor_factory_mpi.at(FLAGS_executor)(comm.get());
+
+    std::string json_input = broadcast_json_input(std::cin, comm);
+    rapidjson::Document test_cases;
+    test_cases.Parse(json_input.c_str());
+    if (!test_cases.IsArray()) {
+        std::cerr
+            << "Input has to be a JSON array of benchmark configurations:\n"
+            << format;
+        std::exit(1);
+    }
+
+    std::map<std::string,
+             std::function<std::unique_ptr<BenchmarkOperation>(
+                 std::shared_ptr<const gko::Executor>, dimensions)>>
+        operation_map{
+            {"copy",
+             [&](std::shared_ptr<const gko::Executor> exec, dimensions dims) {
+                 return std::make_unique<CopyOperation<Generator>>(
+                     exec, Generator{comm, {}}, dims.n, dims.r, dims.stride_x,
+                     dims.stride_y);
+             }},
+            {"axpy",
+             [&](std::shared_ptr<const gko::Executor> exec, dimensions dims) {
+                 return std::make_unique<AxpyOperation<Generator>>(
+                     exec, Generator{comm, {}}, dims.n, dims.r, dims.stride_x,
+                     dims.stride_y, false);
+             }},
+            {"multiaxpy",
+             [&](std::shared_ptr<const gko::Executor> exec, dimensions dims) {
+                 return std::make_unique<AxpyOperation<Generator>>(
+                     exec, Generator{comm, {}}, dims.n, dims.r, dims.stride_x,
+                     dims.stride_y, true);
+             }},
+            {"scal",
+             [&](std::shared_ptr<const gko::Executor> exec, dimensions dims) {
+                 return std::make_unique<ScalOperation<Generator>>(
+                     exec, Generator{comm, {}}, dims.n, dims.r, dims.stride_y,
+                     false);
+             }},
+            {"multiscal",
+             [&](std::shared_ptr<const gko::Executor> exec, dimensions dims) {
+                 return std::make_unique<ScalOperation<Generator>>(
+                     exec, Generator{comm, {}}, dims.n, dims.r, dims.stride_y,
+                     true);
+             }},
+            {"dot",
+             [&](std::shared_ptr<const gko::Executor> exec, dimensions dims) {
+                 return std::make_unique<DotOperation<Generator>>(
+                     exec, Generator{comm, {}}, dims.n, dims.r, dims.stride_x,
+                     dims.stride_y);
+             }},
+            {"norm",
+             [&](std::shared_ptr<const gko::Executor> exec, dimensions dims) {
+                 return std::make_unique<NormOperation<Generator>>(
+                     exec, Generator{comm, {}}, dims.n, dims.r, dims.stride_y);
+             }}};
+
+    run_blas_benchmarks(exec, operation_map, test_cases, rank == 0);
+
+    if (rank == 0) {
+        std::cout << test_cases << std::endl;
+    }
+}
diff --git a/benchmark/conversions/conversions.cpp b/benchmark/conversions/conversions.cpp
index 46212f8a591..8516f4a6bc3 100644
--- a/benchmark/conversions/conversions.cpp
+++ b/benchmark/conversions/conversions.cpp
@@ -45,8 +45,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #include "benchmark/utils/formats.hpp"
 #include "benchmark/utils/general.hpp"
-#include "benchmark/utils/loggers.hpp"
-#include "benchmark/utils/spmv_common.hpp"
+#include "benchmark/utils/generator.hpp"
+#include "benchmark/utils/spmv_validation.hpp"
 #include "benchmark/utils/timer.hpp"
 #include "benchmark/utils/types.hpp"
 
@@ -70,8 +70,7 @@ void convert_matrix(const gko::LinOp* matrix_from, const char* format_to,
                           rapidjson::Value(rapidjson::kObjectType), allocator);
 
         gko::matrix_data<etype, itype> data{gko::dim<2>{1, 1}, 1};
-        auto matrix_to =
-            share(formats::matrix_factory.at(format_to)(exec, data));
+        auto matrix_to = share(formats::matrix_factory(format_to, exec, data));
 
         auto timer = get_timer(exec, FLAGS_gpu_timer);
         IterationControl ic{timer};
@@ -114,9 +113,7 @@ int main(int argc, char* argv[])
 {
     std::string header =
         "A benchmark for measuring performance of Ginkgo's conversions.\n";
-    std::string format_str =
-        std::string() + "  [\n" + "    { \"filename\": \"my_file.mtx\"},\n" +
-        "    { \"filename\": \"my_file2.mtx\"}\n" + "  ]\n\n";
+    std::string format_str = example_config;
     initialize_argument_parsing(&argc, &argv, header, format_str);
 
     std::string extra_information =
@@ -147,10 +144,9 @@ int main(int argc, char* argv[])
         auto& conversion_case = test_case["conversions"];
 
         std::clog << "Running test case: " << test_case << std::endl;
-        std::ifstream mtx_fd(test_case["filename"].GetString());
         gko::matrix_data<etype, itype> data;
         try {
-            data = gko::read_generic_raw<etype, itype>(mtx_fd);
+            data = DefaultSystemGenerator<>::generate_matrix_data(test_case);
         } catch (std::exception& e) {
             std::cerr << "Error setting up matrix data, what(): " << e.what()
                       << std::endl;
@@ -158,10 +154,11 @@ int main(int argc, char* argv[])
         }
         std::clog << "Matrix is of size (" << data.size[0] << ", "
                   << data.size[1] << ")" << std::endl;
+        add_or_set_member(test_case, "size", data.size[0], allocator);
         for (const auto& format_from : formats) {
             try {
                 auto matrix_from =
-                    share(formats::matrix_factory.at(format_from)(exec, data));
+                    share(formats::matrix_factory(format_from, exec, data));
                 for (const auto& format_to : formats) {
                     if (format_from == format_to) {
                         continue;
@@ -182,7 +179,7 @@ int main(int argc, char* argv[])
                 }
                 backup_results(test_cases);
             } catch (const gko::AllocationError& e) {
-                for (const auto& format : formats::matrix_factory) {
+                for (const auto& format : formats::matrix_type_factory) {
                     const auto format_to = std::get<0>(format);
                     auto conversion_name =
                         std::string(format_from) + "-" + format_to;
diff --git a/benchmark/matrix_statistics/matrix_statistics.cpp b/benchmark/matrix_statistics/matrix_statistics.cpp
index 0b71ac8ccb8..a62c8c10637 100644
--- a/benchmark/matrix_statistics/matrix_statistics.cpp
+++ b/benchmark/matrix_statistics/matrix_statistics.cpp
@@ -34,14 +34,13 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 
 #include <cmath>
-#include <cstdlib>
 #include <exception>
-#include <fstream>
 #include <iostream>
 
 
 #include "benchmark/utils/general.hpp"
-#include "benchmark/utils/spmv_common.hpp"
+#include "benchmark/utils/generator.hpp"
+#include "benchmark/utils/spmv_validation.hpp"
 #include "benchmark/utils/types.hpp"
 
 
@@ -173,9 +172,7 @@ int main(int argc, char* argv[])
     std::string header =
         "A utility that collects additional statistical properties of the "
         "matrix.\n";
-    std::string format = std::string() + "  [\n" +
-                         "    { \"filename\": \"my_file.mtx\"},\n" +
-                         "    { \"filename\": \"my_file2.mtx\"}\n" + "  ]\n\n";
+    std::string format = example_config;
     initialize_argument_parsing(&argc, &argv, header, format);
 
     std::clog << gko::version_info::get() << std::endl;
@@ -202,12 +199,13 @@ int main(int argc, char* argv[])
 
             std::clog << "Running test case: " << test_case << std::endl;
 
-            std::ifstream ifs(test_case["filename"].GetString());
-            auto matrix = gko::read_generic_raw<etype, gko::int64>(ifs);
-            ifs.close();
+            auto matrix =
+                DefaultSystemGenerator<etype, gko::int64>::generate_matrix_data(
+                    test_case);
 
             std::clog << "Matrix is of size (" << matrix.size[0] << ", "
                       << matrix.size[1] << ")" << std::endl;
+            add_or_set_member(test_case, "size", matrix.size[0], allocator);
 
             extract_matrix_statistics(matrix, test_case["problem"], allocator);
 
diff --git a/benchmark/preconditioner/preconditioner.cpp b/benchmark/preconditioner/preconditioner.cpp
index cbf81ed699a..8a947ac8df8 100644
--- a/benchmark/preconditioner/preconditioner.cpp
+++ b/benchmark/preconditioner/preconditioner.cpp
@@ -34,19 +34,18 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 
 #include <algorithm>
-#include <chrono>
 #include <cstdlib>
 #include <exception>
-#include <fstream>
 #include <iomanip>
 #include <iostream>
 
 
 #include "benchmark/utils/formats.hpp"
 #include "benchmark/utils/general.hpp"
+#include "benchmark/utils/generator.hpp"
 #include "benchmark/utils/loggers.hpp"
 #include "benchmark/utils/preconditioners.hpp"
-#include "benchmark/utils/spmv_common.hpp"
+#include "benchmark/utils/spmv_validation.hpp"
 #include "benchmark/utils/timer.hpp"
 #include "benchmark/utils/types.hpp"
 
@@ -259,9 +258,7 @@ int main(int argc, char* argv[])
     FLAGS_formats = "csr";
     std::string header =
         "A benchmark for measuring preconditioner performance.\n";
-    std::string format = std::string() + "  [\n" +
-                         "    { \"filename\": \"my_file.mtx\"},\n" +
-                         "    { \"filename\": \"my_file2.mtx\"}\n" + "  ]\n\n";
+    std::string format = example_config;
     initialize_argument_parsing(&argc, &argv, header, format);
 
     std::string extra_information =
@@ -288,6 +285,8 @@ int main(int argc, char* argv[])
 
     auto& allocator = test_cases.GetAllocator();
 
+    DefaultSystemGenerator<> generator{};
+
     for (auto& test_case : test_cases.GetArray()) {
         try {
             // set up benchmark
@@ -307,18 +306,19 @@ int main(int argc, char* argv[])
             }
             std::clog << "Running test case: " << test_case << std::endl;
 
-            std::ifstream mtx_fd(test_case["filename"].GetString());
-            auto data = gko::read_generic_raw<etype, itype>(mtx_fd);
+            auto data = generator.generate_matrix_data(test_case);
 
             auto system_matrix =
-                share(formats::matrix_factory.at(FLAGS_formats)(exec, data));
-            auto b = create_vector<etype>(exec, system_matrix->get_size()[0],
-                                          engine);
-            auto x = create_vector<etype>(exec, system_matrix->get_size()[0]);
+                share(formats::matrix_factory(FLAGS_formats, exec, data));
+            auto b = generator.create_multi_vector_random(
+                exec, system_matrix->get_size()[0]);
+            auto x = generator.create_multi_vector(
+                exec, system_matrix->get_size()[0], gko::zero<etype>());
 
             std::clog << "Matrix is of size (" << system_matrix->get_size()[0]
                       << ", " << system_matrix->get_size()[1] << ")"
                       << std::endl;
+            add_or_set_member(test_case, "size", data.size[0], allocator);
             for (const auto& precond_name : preconditioners) {
                 run_preconditioner(precond_name.c_str(), exec, system_matrix,
                                    lend(b), lend(x), test_case, allocator);
diff --git a/benchmark/solver/CMakeLists.txt b/benchmark/solver/CMakeLists.txt
index 11c08bdf6d2..76edd10b2a9 100644
--- a/benchmark/solver/CMakeLists.txt
+++ b/benchmark/solver/CMakeLists.txt
@@ -1 +1,4 @@
 ginkgo_add_typed_benchmark_executables(solver "YES" solver.cpp)
+if(GINKGO_BUILD_MPI)
+    add_subdirectory(distributed)
+endif()
diff --git a/benchmark/solver/distributed/CMakeLists.txt b/benchmark/solver/distributed/CMakeLists.txt
new file mode 100644
index 00000000000..ca6586f1acf
--- /dev/null
+++ b/benchmark/solver/distributed/CMakeLists.txt
@@ -0,0 +1 @@
+ginkgo_add_typed_benchmark_executables(solver-distributed "YES" solver.cpp)
diff --git a/benchmark/solver/distributed/solver.cpp b/benchmark/solver/distributed/solver.cpp
new file mode 100644
index 00000000000..85ac23670d1
--- /dev/null
+++ b/benchmark/solver/distributed/solver.cpp
@@ -0,0 +1,155 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include <ginkgo/ginkgo.hpp>
+
+
+#include <cstdlib>
+#include <exception>
+#include <iostream>
+#include <set>
+
+
+#include "benchmark/solver/solver_common.hpp"
+#include "benchmark/utils/general.hpp"
+#include "benchmark/utils/generator.hpp"
+
+
+struct Generator : public DistributedDefaultSystemGenerator<SolverGenerator> {
+    Generator(gko::experimental::mpi::communicator comm)
+        : DistributedDefaultSystemGenerator<SolverGenerator>{std::move(comm),
+                                                             {}}
+    {}
+
+    std::unique_ptr<Vec> generate_rhs(std::shared_ptr<const gko::Executor> exec,
+                                      const gko::LinOp* system_matrix,
+                                      rapidjson::Value& config) const
+    {
+        return Vec::create(
+            exec, comm, gko::dim<2>{system_matrix->get_size()[0], FLAGS_nrhs},
+            gko::as<typename LocalGenerator::Vec>(
+                local_generator.generate_rhs(
+                    exec, gko::as<Mtx>(system_matrix)->get_local_matrix().get(),
+                    config))
+                .get());
+    }
+
+    std::unique_ptr<Vec> generate_initial_guess(
+        std::shared_ptr<const gko::Executor> exec,
+        const gko::LinOp* system_matrix, const Vec* rhs) const
+    {
+        return Vec::create(
+            exec, comm, gko::dim<2>{rhs->get_size()[0], FLAGS_nrhs},
+            gko::as<typename LocalGenerator::Vec>(
+                local_generator.generate_initial_guess(
+                    exec, gko::as<Mtx>(system_matrix)->get_local_matrix().get(),
+                    rhs->get_local_vector()))
+                .get());
+    }
+};
+
+
+int main(int argc, char* argv[])
+{
+    gko::experimental::mpi::environment mpi_env{argc, argv};
+
+    // Set the default repetitions = 1.
+    FLAGS_repetitions = "1";
+    FLAGS_min_repetitions = 1;
+
+    std::string header =
+        "A benchmark for measuring Ginkgo's distributed solvers\n";
+    std::string format = example_config + R"(
+  The matrix will either be read from an input file if the filename parameter
+  is given, or generated as a stencil matrix.
+  If the filename parameter is given, all processes will read the file and
+  then the matrix is distributed row-block-wise.
+  In the other case, a size and stencil parameter have to be provided.
+  The size parameter denotes the size per process. It might be adjusted to
+  fit the dimensionality of the stencil more easily.
+  Possible values for "stencil" are:  5pt (2D), 7pt (3D), 9pt (2D), 27pt (3D).
+  Optional values for "comm_pattern" are: stencil, optimal (default).
+  Possible values for "optimal[spmv]" follow the pattern
+  "<local_format>-<non_local_format>", where both "local_format" and
+  "non_local_format" can be any of the recognized spmv formats.
+)";
+    initialize_argument_parsing(&argc, &argv, header, format);
+
+    const auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD);
+    const auto rank = comm.rank();
+
+    auto exec = executor_factory_mpi.at(FLAGS_executor)(comm.get());
+
+    std::string extra_information;
+    if (FLAGS_repetitions == "auto") {
+        extra_information =
+            "WARNING: repetitions = 'auto' not supported for MPI "
+            "benchmarks, setting repetitions to the default value.";
+        FLAGS_repetitions = "10";
+    }
+    if (rank == 0) {
+        print_general_information(extra_information);
+    }
+
+    std::set<std::string> supported_solvers = {"cg", "fcg", "cgs", "bicgstab",
+                                               "gmres"};
+    auto solvers = split(FLAGS_solvers, ',');
+    for (const auto& solver : solvers) {
+        if (supported_solvers.find(solver) == supported_solvers.end()) {
+            throw std::range_error(
+                std::string("The requested solvers <") + FLAGS_solvers +
+                "> contain the unsupported solver <" + solver + ">!");
+        }
+    }
+
+    std::stringstream ss_rel_res_goal;
+    ss_rel_res_goal << std::scientific << FLAGS_rel_res_goal;
+
+    std::string json_input = FLAGS_overhead
+                                 ? R"(
+[{"filename": "overhead.mtx",
+  "optimal": {"spmv": "csr-csr"}]
+)"
+                                 : broadcast_json_input(std::cin, comm);
+    rapidjson::Document test_cases;
+    test_cases.Parse(json_input.c_str());
+
+    if (!test_cases.IsArray()) {
+        print_config_error_and_exit();
+    }
+
+    run_solver_benchmarks(exec, test_cases, Generator(comm), rank == 0);
+
+    if (rank == 0) {
+        std::cout << test_cases << std::endl;
+    }
+}
diff --git a/benchmark/solver/solver.cpp b/benchmark/solver/solver.cpp
index 7dcdc516c15..e134a8d4a4f 100644
--- a/benchmark/solver/solver.cpp
+++ b/benchmark/solver/solver.cpp
@@ -46,520 +46,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <vector>
 
 
-#include "benchmark/utils/formats.hpp"
+#include "benchmark/solver/solver_common.hpp"
 #include "benchmark/utils/general.hpp"
-#include "benchmark/utils/loggers.hpp"
-#include "benchmark/utils/overhead_linop.hpp"
-#include "benchmark/utils/preconditioners.hpp"
-#include "benchmark/utils/timer.hpp"
-#include "benchmark/utils/types.hpp"
-
-
-#ifdef GINKGO_BENCHMARK_ENABLE_TUNING
-#include "benchmark/utils/tuning_variables.hpp"
-#endif  // GINKGO_BENCHMARK_ENABLE_TUNING
-
-
-// Command-line arguments
-DEFINE_uint32(max_iters, 1000,
-              "Maximal number of iterations the solver will be run for");
-
-DEFINE_uint32(warmup_max_iters, 100,
-              "Maximal number of warmup iterations the solver will be run for");
-
-DEFINE_double(rel_res_goal, 1e-6, "The relative residual goal of the solver");
-
-DEFINE_bool(
-    rel_residual, false,
-    "Use relative residual instead of residual reduction stopping criterion");
-
-DEFINE_string(solvers, "cg",
-              "A comma-separated list of solvers to run. "
-              "Supported values are: bicgstab, bicg, cb_gmres_keep, "
-              "cb_gmres_reduce1, cb_gmres_reduce2, cb_gmres_integer, "
-              "cb_gmres_ireduce1, cb_gmres_ireduce2, cg, cgs, fcg, gmres, idr, "
-              "lower_trs, upper_trs, symm_direct, overhead");
-
-DEFINE_uint32(
-    nrhs, 1,
-    "The number of right hand sides. Record the residual only when nrhs == 1.");
-
-DEFINE_uint32(gmres_restart, 100,
-              "What maximum dimension of the Krylov space to use in GMRES");
-
-DEFINE_uint32(idr_subspace_dim, 2,
-              "What dimension of the subspace to use in IDR");
-
-DEFINE_double(
-    idr_kappa, 0.7,
-    "the number to check whether Av_n and v_n are too close or not in IDR");
-
-DEFINE_string(
-    rhs_generation, "1",
-    "Method used to generate the right hand side. Supported values are:"
-    "`1`, `random`, `sinus`. `1` sets all values of the right hand side to 1, "
-    "`random` assigns the values to a uniformly distributed random number "
-    "in [-1, 1), and `sinus` assigns b = A * (s / |s|) with A := system matrix,"
-    " s := vector with s(idx) = sin(idx) for non-complex types, and "
-    "s(idx) = sin(2*idx) + i * sin(2*idx+1).");
-
-DEFINE_string(
-    initial_guess_generation, "rhs",
-    "Method used to generate the initial guess. Supported values are: "
-    "`random`, `rhs`, `0`. `random` uses a random vector, `rhs` uses the right "
-    "hand side, and `0 uses a zero vector as the initial guess.");
-
-// This allows to benchmark the overhead of a solver by using the following
-// data: A=[1.0], x=[0.0], b=[nan]. This data can be used to benchmark normal
-// solvers or using the argument --solvers=overhead, a minimal solver will be
-// launched which contains only a few kernel calls.
-DEFINE_bool(overhead, false,
-            "If set, uses dummy data to benchmark Ginkgo overhead");
-
-
-// input validation
-[[noreturn]] void print_config_error_and_exit()
-{
-    std::cerr << "Input has to be a JSON array of matrix configurations:\n"
-              << "  [\n"
-              << "    { \"filename\": \"my_file.mtx\",  \"optimal\": { "
-                 "\"spmv\": \"<matrix format>\" },\n"
-                 "      \"rhs\": \"my_file_rhs.mtx\" },\n"
-              << "    { \"filename\": \"my_file2.mtx\", \"optimal\": { "
-                 "\"spmv\": \"<matrix format>\" } }\n"
-              << "  ]" << std::endl;
-    std::exit(1);
-}
-
-
-template <typename Engine>
-std::unique_ptr<vec<etype>> generate_rhs(
-    std::shared_ptr<const gko::Executor> exec,
-    std::shared_ptr<const gko::LinOp> system_matrix, Engine engine)
-{
-    gko::dim<2> vec_size{system_matrix->get_size()[0], FLAGS_nrhs};
-    if (FLAGS_rhs_generation == "1") {
-        return create_matrix<etype>(exec, vec_size, gko::one<etype>());
-    } else if (FLAGS_rhs_generation == "random") {
-        return create_matrix<etype>(exec, vec_size, engine);
-    } else if (FLAGS_rhs_generation == "sinus") {
-        auto rhs = vec<etype>::create(exec, vec_size);
-
-        auto tmp = create_matrix_sin<etype>(exec, vec_size);
-        auto scalar = gko::matrix::Dense<rc_etype>::create(
-            exec->get_master(), gko::dim<2>{1, vec_size[1]});
-        tmp->compute_norm2(scalar.get());
-        for (gko::size_type i = 0; i < vec_size[1]; ++i) {
-            scalar->at(0, i) = gko::one<rc_etype>() / scalar->at(0, i);
-        }
-        // normalize sin-vector
-        if (gko::is_complex_s<etype>::value) {
-            tmp->scale(scalar->make_complex().get());
-        } else {
-            tmp->scale(scalar.get());
-        }
-        system_matrix->apply(tmp.get(), rhs.get());
-        return rhs;
-    }
-    throw std::invalid_argument(std::string("\"rhs_generation\" = ") +
-                                FLAGS_rhs_generation + " is not supported!");
-}
-
-
-template <typename Engine>
-std::unique_ptr<vec<etype>> generate_initial_guess(
-    std::shared_ptr<const gko::Executor> exec,
-    std::shared_ptr<const gko::LinOp> system_matrix, const vec<etype>* rhs,
-    Engine engine)
-{
-    gko::dim<2> vec_size{system_matrix->get_size()[1], FLAGS_nrhs};
-    if (FLAGS_initial_guess_generation == "0") {
-        return create_matrix<etype>(exec, vec_size, gko::zero<etype>());
-    } else if (FLAGS_initial_guess_generation == "random") {
-        return create_matrix<etype>(exec, vec_size, engine);
-    } else if (FLAGS_initial_guess_generation == "rhs") {
-        return rhs->clone();
-    }
-    throw std::invalid_argument(std::string("\"initial_guess_generation\" = ") +
-                                FLAGS_initial_guess_generation +
-                                " is not supported!");
-}
-
-
-void validate_option_object(const rapidjson::Value& value)
-{
-    if (!value.IsObject() || !value.HasMember("optimal") ||
-        !value["optimal"].HasMember("spmv") ||
-        !value["optimal"]["spmv"].IsString() || !value.HasMember("filename") ||
-        !value["filename"].IsString() ||
-        (value.HasMember("rhs") && !value["rhs"].IsString())) {
-        print_config_error_and_exit();
-    }
-}
-
-
-std::shared_ptr<const gko::stop::CriterionFactory> create_criterion(
-    std::shared_ptr<const gko::Executor> exec, std::uint32_t max_iters)
-{
-    std::shared_ptr<const gko::stop::CriterionFactory> residual_stop;
-    if (FLAGS_rel_residual) {
-        residual_stop =
-            gko::share(gko::stop::ResidualNorm<rc_etype>::build()
-                           .with_baseline(gko::stop::mode::initial_resnorm)
-                           .with_reduction_factor(
-                               static_cast<rc_etype>(FLAGS_rel_res_goal))
-                           .on(exec));
-    } else {
-        residual_stop =
-            gko::share(gko::stop::ResidualNorm<rc_etype>::build()
-                           .with_baseline(gko::stop::mode::rhs_norm)
-                           .with_reduction_factor(
-                               static_cast<rc_etype>(FLAGS_rel_res_goal))
-                           .on(exec));
-    }
-    auto iteration_stop = gko::share(
-        gko::stop::Iteration::build().with_max_iters(max_iters).on(exec));
-    std::vector<std::shared_ptr<const gko::stop::CriterionFactory>>
-        criterion_vector{residual_stop, iteration_stop};
-    return gko::stop::combine(criterion_vector);
-}
-
-
-template <typename SolverIntermediate>
-std::unique_ptr<gko::LinOpFactory> add_criteria_precond_finalize(
-    SolverIntermediate inter, const std::shared_ptr<const gko::Executor>& exec,
-    std::shared_ptr<const gko::LinOpFactory> precond, std::uint32_t max_iters)
-{
-    return inter.with_criteria(create_criterion(exec, max_iters))
-        .with_preconditioner(give(precond))
-        .on(exec);
-}
-
-
-template <typename Solver>
-std::unique_ptr<gko::LinOpFactory> add_criteria_precond_finalize(
-    const std::shared_ptr<const gko::Executor>& exec,
-    std::shared_ptr<const gko::LinOpFactory> precond, std::uint32_t max_iters)
-{
-    return add_criteria_precond_finalize(Solver::build(), exec, precond,
-                                         max_iters);
-}
-
-
-std::unique_ptr<gko::LinOpFactory> generate_solver(
-    const std::shared_ptr<const gko::Executor>& exec,
-    std::shared_ptr<const gko::LinOpFactory> precond,
-    const std::string& description, std::uint32_t max_iters)
-{
-    std::string cb_gmres_prefix("cb_gmres_");
-    if (description.find(cb_gmres_prefix) == 0) {
-        auto s_prec = gko::solver::cb_gmres::storage_precision::keep;
-        const auto spec = description.substr(cb_gmres_prefix.length());
-        if (spec == "keep") {
-            s_prec = gko::solver::cb_gmres::storage_precision::keep;
-        } else if (spec == "reduce1") {
-            s_prec = gko::solver::cb_gmres::storage_precision::reduce1;
-        } else if (spec == "reduce2") {
-            s_prec = gko::solver::cb_gmres::storage_precision::reduce2;
-        } else if (spec == "integer") {
-            s_prec = gko::solver::cb_gmres::storage_precision::integer;
-        } else if (spec == "ireduce1") {
-            s_prec = gko::solver::cb_gmres::storage_precision::ireduce1;
-        } else if (spec == "ireduce2") {
-            s_prec = gko::solver::cb_gmres::storage_precision::ireduce2;
-        } else {
-            throw std::range_error(
-                std::string(
-                    "CB-GMRES does not have a corresponding solver to <") +
-                description + ">!");
-        }
-        return add_criteria_precond_finalize(
-            gko::solver::CbGmres<etype>::build()
-                .with_krylov_dim(FLAGS_gmres_restart)
-                .with_storage_precision(s_prec),
-            exec, precond, max_iters);
-    } else if (description == "bicgstab") {
-        return add_criteria_precond_finalize<gko::solver::Bicgstab<etype>>(
-            exec, precond, max_iters);
-    } else if (description == "bicg") {
-        return add_criteria_precond_finalize<gko::solver::Bicg<etype>>(
-            exec, precond, max_iters);
-    } else if (description == "cg") {
-        return add_criteria_precond_finalize<gko::solver::Cg<etype>>(
-            exec, precond, max_iters);
-    } else if (description == "cgs") {
-        return add_criteria_precond_finalize<gko::solver::Cgs<etype>>(
-            exec, precond, max_iters);
-    } else if (description == "fcg") {
-        return add_criteria_precond_finalize<gko::solver::Fcg<etype>>(
-            exec, precond, max_iters);
-    } else if (description == "idr") {
-        return add_criteria_precond_finalize(
-            gko::solver::Idr<etype>::build()
-                .with_subspace_dim(FLAGS_idr_subspace_dim)
-                .with_kappa(static_cast<rc_etype>(FLAGS_idr_kappa)),
-            exec, precond, max_iters);
-    } else if (description == "gmres") {
-        return add_criteria_precond_finalize(
-            gko::solver::Gmres<etype>::build().with_krylov_dim(
-                FLAGS_gmres_restart),
-            exec, precond, max_iters);
-    } else if (description == "lower_trs") {
-        return gko::solver::LowerTrs<etype>::build()
-            .with_num_rhs(FLAGS_nrhs)
-            .on(exec);
-    } else if (description == "upper_trs") {
-        return gko::solver::UpperTrs<etype>::build()
-            .with_num_rhs(FLAGS_nrhs)
-            .on(exec);
-    } else if (description == "symm_direct") {
-        return gko::experimental::solver::Direct<etype, itype>::build()
-            .with_factorization(
-                gko::experimental::factorization::Lu<etype, itype>::build()
-                    .with_symmetric_sparsity(true)
-                    .on(exec))
-            .on(exec);
-    } else if (description == "overhead") {
-        return add_criteria_precond_finalize<gko::Overhead<etype>>(
-            exec, precond, max_iters);
-    }
-    throw std::range_error(std::string("The provided string <") + description +
-                           "> does not match any solver!");
-}
-
-
-void write_precond_info(const gko::LinOp* precond,
-                        rapidjson::Value& precond_info,
-                        rapidjson::MemoryPoolAllocator<>& allocator)
-{
-    if (const auto jacobi =
-            dynamic_cast<const gko::preconditioner::Jacobi<etype>*>(precond)) {
-        // extract block sizes
-        const auto bdata =
-            jacobi->get_parameters().block_pointers.get_const_data();
-        add_or_set_member(precond_info, "block_sizes",
-                          rapidjson::Value(rapidjson::kArrayType), allocator);
-        const auto nblocks = jacobi->get_num_blocks();
-        for (auto i = decltype(nblocks){0}; i < nblocks; ++i) {
-            precond_info["block_sizes"].PushBack(bdata[i + 1] - bdata[i],
-                                                 allocator);
-        }
-
-        // extract block precisions
-        const auto pdata =
-            jacobi->get_parameters()
-                .storage_optimization.block_wise.get_const_data();
-        if (pdata) {
-            add_or_set_member(precond_info, "block_precisions",
-                              rapidjson::Value(rapidjson::kArrayType),
-                              allocator);
-            for (auto i = decltype(nblocks){0}; i < nblocks; ++i) {
-                precond_info["block_precisions"].PushBack(
-                    static_cast<int>(pdata[i]), allocator);
-            }
-        }
-
-        // extract condition numbers
-        const auto cdata = jacobi->get_conditioning();
-        if (cdata) {
-            add_or_set_member(precond_info, "block_conditioning",
-                              rapidjson::Value(rapidjson::kArrayType),
-                              allocator);
-            for (auto i = decltype(nblocks){0}; i < nblocks; ++i) {
-                precond_info["block_conditioning"].PushBack(cdata[i],
-                                                            allocator);
-            }
-        }
-    }
-}
-
-
-void solve_system(const std::string& solver_name,
-                  const std::string& precond_name,
-                  const char* precond_solver_name,
-                  std::shared_ptr<gko::Executor> exec,
-                  std::shared_ptr<const gko::LinOp> system_matrix,
-                  const vec<etype>* b, const vec<etype>* x,
-                  rapidjson::Value& test_case,
-                  rapidjson::MemoryPoolAllocator<>& allocator)
-{
-    try {
-        auto& solver_case = test_case["solver"];
-        if (!FLAGS_overwrite && solver_case.HasMember(precond_solver_name)) {
-            return;
-        }
-
-        add_or_set_member(solver_case, precond_solver_name,
-                          rapidjson::Value(rapidjson::kObjectType), allocator);
-        auto& solver_json = solver_case[precond_solver_name];
-        add_or_set_member(solver_json, "recurrent_residuals",
-                          rapidjson::Value(rapidjson::kArrayType), allocator);
-        add_or_set_member(solver_json, "true_residuals",
-                          rapidjson::Value(rapidjson::kArrayType), allocator);
-        add_or_set_member(solver_json, "implicit_residuals",
-                          rapidjson::Value(rapidjson::kArrayType), allocator);
-        add_or_set_member(solver_json, "iteration_timestamps",
-                          rapidjson::Value(rapidjson::kArrayType), allocator);
-        if (b->get_size()[1] == 1 && !FLAGS_overhead) {
-            auto rhs_norm = compute_norm2(lend(b));
-            add_or_set_member(solver_json, "rhs_norm", rhs_norm, allocator);
-        }
-        for (auto stage : {"generate", "apply"}) {
-            add_or_set_member(solver_json, stage,
-                              rapidjson::Value(rapidjson::kObjectType),
-                              allocator);
-            add_or_set_member(solver_json[stage], "components",
-                              rapidjson::Value(rapidjson::kObjectType),
-                              allocator);
-        }
-
-        IterationControl ic{get_timer(exec, FLAGS_gpu_timer)};
-
-        // warm run
-        std::shared_ptr<gko::LinOp> solver;
-        for (auto _ : ic.warmup_run()) {
-            auto x_clone = clone(x);
-            auto precond = precond_factory.at(precond_name)(exec);
-            solver = generate_solver(exec, give(precond), solver_name,
-                                     FLAGS_warmup_max_iters)
-                         ->generate(system_matrix);
-            solver->apply(lend(b), lend(x_clone));
-            exec->synchronize();
-        }
-
-        // detail run
-        if (FLAGS_detailed && !FLAGS_overhead) {
-            // slow run, get the time of each functions
-            auto x_clone = clone(x);
-
-            auto gen_logger =
-                std::make_shared<OperationLogger>(FLAGS_nested_names);
-            exec->add_logger(gen_logger);
-            if (exec->get_master() != exec) {
-                exec->get_master()->add_logger(gen_logger);
-            }
-
-            auto precond = precond_factory.at(precond_name)(exec);
-            solver = generate_solver(exec, give(precond), solver_name,
-                                     FLAGS_max_iters)
-                         ->generate(system_matrix);
-
-            if (exec->get_master() != exec) {
-                exec->get_master()->remove_logger(gko::lend(gen_logger));
-            }
-            exec->remove_logger(gko::lend(gen_logger));
-            gen_logger->write_data(solver_json["generate"]["components"],
-                                   allocator, 1);
-
-            if (auto prec =
-                    dynamic_cast<const gko::Preconditionable*>(lend(solver))) {
-                add_or_set_member(solver_json, "preconditioner",
-                                  rapidjson::Value(rapidjson::kObjectType),
-                                  allocator);
-                write_precond_info(
-                    lend(clone(get_executor(FLAGS_gpu_timer)->get_master(),
-                               prec->get_preconditioner())),
-                    solver_json["preconditioner"], allocator);
-            }
-
-            auto apply_logger =
-                std::make_shared<OperationLogger>(FLAGS_nested_names);
-            exec->add_logger(apply_logger);
-            if (exec->get_master() != exec) {
-                exec->get_master()->add_logger(apply_logger);
-            }
-
-            solver->apply(lend(b), lend(x_clone));
-
-            if (exec->get_master() != exec) {
-                exec->get_master()->remove_logger(gko::lend(apply_logger));
-            }
-            exec->remove_logger(gko::lend(apply_logger));
-            apply_logger->write_data(solver_json["apply"]["components"],
-                                     allocator, 1);
-
-            // slow run, gets the recurrent and true residuals of each iteration
-            if (b->get_size()[1] == 1) {
-                x_clone = clone(x);
-                auto res_logger = std::make_shared<ResidualLogger<etype>>(
-                    lend(system_matrix), b, solver_json["recurrent_residuals"],
-                    solver_json["true_residuals"],
-                    solver_json["implicit_residuals"],
-                    solver_json["iteration_timestamps"], allocator);
-                solver->add_logger(res_logger);
-                solver->apply(lend(b), lend(x_clone));
-                if (!res_logger->has_implicit_res_norms()) {
-                    solver_json.RemoveMember("implicit_residuals");
-                }
-            }
-            exec->synchronize();
-        }
-
-        // timed run
-        auto it_logger = std::make_shared<IterationLogger>();
-        auto generate_timer = get_timer(exec, FLAGS_gpu_timer);
-        auto apply_timer = ic.get_timer();
-        auto x_clone = clone(x);
-        for (auto status : ic.run(false)) {
-            x_clone = clone(x);
-
-            exec->synchronize();
-            generate_timer->tic();
-            auto precond = precond_factory.at(precond_name)(exec);
-            solver = generate_solver(exec, give(precond), solver_name,
-                                     FLAGS_max_iters)
-                         ->generate(system_matrix);
-            generate_timer->toc();
-
-            exec->synchronize();
-            if (ic.get_num_repetitions() == 0) {
-                solver->add_logger(it_logger);
-            }
-            apply_timer->tic();
-            solver->apply(lend(b), lend(x_clone));
-            apply_timer->toc();
-            if (ic.get_num_repetitions() == 0) {
-                solver->remove_logger(gko::lend(it_logger));
-            }
-        }
-        it_logger->write_data(solver_json["apply"], allocator);
-
-        if (b->get_size()[1] == 1 && !FLAGS_overhead) {
-            // a solver is considered direct if it didn't log any iterations
-            if (solver_json["apply"].HasMember("iterations") &&
-                solver_json["apply"]["iterations"].GetInt() == 0) {
-                auto error =
-                    compute_direct_error(lend(solver), lend(b), lend(x_clone));
-                add_or_set_member(solver_json, "forward_error", error,
-                                  allocator);
-            }
-            auto residual = compute_residual_norm(lend(system_matrix), lend(b),
-                                                  lend(x_clone));
-            add_or_set_member(solver_json, "residual_norm", residual,
-                              allocator);
-        }
-        add_or_set_member(solver_json["generate"], "time",
-                          generate_timer->compute_average_time(), allocator);
-        add_or_set_member(solver_json["apply"], "time",
-                          apply_timer->compute_average_time(), allocator);
-        add_or_set_member(solver_json, "repetitions",
-                          apply_timer->get_num_repetitions(), allocator);
-
-        // compute and write benchmark data
-        add_or_set_member(solver_json, "completed", true, allocator);
-    } catch (const std::exception& e) {
-        add_or_set_member(test_case["solver"][precond_solver_name], "completed",
-                          false, allocator);
-        if (FLAGS_keep_errors) {
-            rapidjson::Value msg_value;
-            msg_value.SetString(e.what(), allocator);
-            add_or_set_member(test_case["solver"][precond_solver_name], "error",
-                              msg_value, allocator);
-        }
-        std::cerr << "Error when processing test case " << test_case << "\n"
-                  << "what(): " << e.what() << std::endl;
-    }
-}
+#include "benchmark/utils/generator.hpp"
 
 
 int main(int argc, char* argv[])
@@ -569,16 +58,9 @@ int main(int argc, char* argv[])
     FLAGS_min_repetitions = 1;
     std::string header =
         "A benchmark for measuring performance of Ginkgo's solvers.\n";
-    std::string format =
-        std::string() + "  [\n" +
-        "    { \"filename\": \"my_file.mtx\",  \"optimal\": { "
-        "\"spmv\": \"<matrix format>\" },\n"
-        "      \"rhs\": \"my_file_rhs.mtx\" },\n" +
-        "    { \"filename\": \"my_file2.mtx\", \"optimal\": { "
-        "\"spmv\": \"<matrix format>\" } }\n" +
-        "  ]\n\n" +
-        "  \"optimal_format\" can be one of the recognized spmv "
-        "format\n\n";
+    std::string format = example_config + R"(
+  "optimal":"spmv" can be one of the recognized spmv formats
+)";
     initialize_argument_parsing(&argc, &argv, header, format);
 
     std::stringstream ss_rel_res_goal;
@@ -592,14 +74,6 @@ int main(int argc, char* argv[])
     print_general_information(extra_information);
 
     auto exec = get_executor(FLAGS_gpu_timer);
-    auto solvers = split(FLAGS_solvers, ',');
-    auto preconds = split(FLAGS_preconditioners, ',');
-    std::vector<std::string> precond_solvers;
-    for (const auto& s : solvers) {
-        for (const auto& p : preconds) {
-            precond_solvers.push_back(s + (p == "none" ? "" : "-" + p));
-        }
-    }
 
     rapidjson::Document test_cases;
     if (!FLAGS_overhead) {
@@ -617,73 +91,7 @@ int main(int argc, char* argv[])
         print_config_error_and_exit();
     }
 
-    auto engine = get_engine();
-    auto& allocator = test_cases.GetAllocator();
-
-    for (auto& test_case : test_cases.GetArray()) {
-        try {
-            // set up benchmark
-            validate_option_object(test_case);
-            if (!test_case.HasMember("solver")) {
-                test_case.AddMember("solver",
-                                    rapidjson::Value(rapidjson::kObjectType),
-                                    allocator);
-            }
-            auto& solver_case = test_case["solver"];
-            if (!FLAGS_overwrite &&
-                all_of(begin(precond_solvers), end(precond_solvers),
-                       [&solver_case](const std::string& s) {
-                           return solver_case.HasMember(s.c_str());
-                       })) {
-                continue;
-            }
-            std::clog << "Running test case: " << test_case << std::endl;
-            std::ifstream mtx_fd(test_case["filename"].GetString());
-
-            using Vec = gko::matrix::Dense<etype>;
-            std::shared_ptr<gko::LinOp> system_matrix;
-            std::unique_ptr<Vec> b;
-            std::unique_ptr<Vec> x;
-            if (FLAGS_overhead) {
-                system_matrix = gko::initialize<Vec>({1.0}, exec);
-                b = gko::initialize<Vec>(
-                    {std::numeric_limits<rc_etype>::quiet_NaN()}, exec);
-                x = gko::initialize<Vec>({0.0}, exec);
-            } else {
-                auto data = gko::read_generic_raw<etype, itype>(mtx_fd);
-                system_matrix = share(formats::matrix_factory.at(
-                    test_case["optimal"]["spmv"].GetString())(exec, data));
-                if (test_case.HasMember("rhs")) {
-                    std::ifstream rhs_fd{test_case["rhs"].GetString()};
-                    b = gko::read<Vec>(rhs_fd, exec);
-                } else {
-                    b = generate_rhs(exec, system_matrix, engine);
-                }
-                x = generate_initial_guess(exec, system_matrix, b.get(),
-                                           engine);
-            }
-
-            std::clog << "Matrix is of size (" << system_matrix->get_size()[0]
-                      << ", " << system_matrix->get_size()[1] << ")"
-                      << std::endl;
-            auto precond_solver_name = begin(precond_solvers);
-            for (const auto& solver_name : solvers) {
-                for (const auto& precond_name : preconds) {
-                    std::clog << "\tRunning solver: " << *precond_solver_name
-                              << std::endl;
-                    solve_system(solver_name, precond_name,
-                                 precond_solver_name->c_str(), exec,
-                                 system_matrix, lend(b), lend(x), test_case,
-                                 allocator);
-                    backup_results(test_cases);
-                    ++precond_solver_name;
-                }
-            }
-        } catch (const std::exception& e) {
-            std::cerr << "Error setting up solver, what(): " << e.what()
-                      << std::endl;
-        }
-    }
+    run_solver_benchmarks(exec, test_cases, SolverGenerator{}, true);
 
     std::cout << test_cases << std::endl;
 }
diff --git a/benchmark/solver/solver_common.hpp b/benchmark/solver/solver_common.hpp
new file mode 100644
index 00000000000..845fccb2fe5
--- /dev/null
+++ b/benchmark/solver/solver_common.hpp
@@ -0,0 +1,657 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#ifndef GINKGO_BENCHMARK_SOLVER_SOLVER_COMMON_HPP
+#define GINKGO_BENCHMARK_SOLVER_SOLVER_COMMON_HPP
+
+
+#include "benchmark/utils/formats.hpp"
+#include "benchmark/utils/general.hpp"
+#include "benchmark/utils/generator.hpp"
+#include "benchmark/utils/loggers.hpp"
+#include "benchmark/utils/preconditioners.hpp"
+
+
+#ifdef GINKGO_BENCHMARK_ENABLE_TUNING
+#include "benchmark/utils/tuning_variables.hpp"
+#endif  // GINKGO_BENCHMARK_ENABLE_TUNING
+
+
+// Command-line arguments
+DEFINE_uint32(max_iters, 1000,
+              "Maximal number of iterations the solver will be run for");
+
+DEFINE_uint32(warmup_max_iters, 100,
+              "Maximal number of warmup iterations the solver will be run for");
+
+DEFINE_double(rel_res_goal, 1e-6, "The relative residual goal of the solver");
+
+DEFINE_bool(
+    rel_residual, false,
+    "Use relative residual instead of residual reduction stopping criterion");
+
+DEFINE_string(solvers, "cg",
+              "A comma-separated list of solvers to run. "
+              "Supported values are: bicgstab, bicg, cb_gmres_keep, "
+              "cb_gmres_reduce1, cb_gmres_reduce2, cb_gmres_integer, "
+              "cb_gmres_ireduce1, cb_gmres_ireduce2, cg, cgs, fcg, gmres, idr, "
+              "lower_trs, upper_trs, overhead");
+
+DEFINE_uint32(
+    nrhs, 1,
+    "The number of right hand sides. Record the residual only when nrhs == 1.");
+
+DEFINE_uint32(gmres_restart, 100,
+              "What maximum dimension of the Krylov space to use in GMRES");
+
+DEFINE_uint32(idr_subspace_dim, 2,
+              "What dimension of the subspace to use in IDR");
+
+DEFINE_double(
+    idr_kappa, 0.7,
+    "the number to check whether Av_n and v_n are too close or not in IDR");
+
+DEFINE_string(
+    rhs_generation, "1",
+    "Method used to generate the right hand side. Supported values are:"
+    "`1`, `random`, `sinus`. `1` sets all values of the right hand side to 1, "
+    "`random` assigns the values to a uniformly distributed random number "
+    "in [-1, 1), and `sinus` assigns b = A * (s / |s|) with A := system matrix,"
+    " s := vector with s(idx) = sin(idx) for non-complex types, and "
+    "s(idx) = sin(2*idx) + i * sin(2*idx+1).");
+
+DEFINE_string(
+    initial_guess_generation, "rhs",
+    "Method used to generate the initial guess. Supported values are: "
+    "`random`, `rhs`, `0`. `random` uses a random vector, `rhs` uses the right "
+    "hand side, and `0 uses a zero vector as the initial guess.");
+
+// This allows to benchmark the overhead of a solver by using the following
+// data: A=[1.0], x=[0.0], b=[nan]. This data can be used to benchmark normal
+// solvers or using the argument --solvers=overhead, a minimal solver will be
+// launched which contains only a few kernel calls.
+DEFINE_bool(overhead, false,
+            "If set, uses dummy data to benchmark Ginkgo overhead");
+
+
+std::string example_config = R"(
+  [
+    {"filename": "my_file.mtx", "optimal": {"spmv": "ell-csr"},
+     "rhs": "my_file_rhs.mtx"},
+    {"filename": "my_file2.mtx", "optimal": {"spmv": "coo-coo"},
+     "rhs": "my_file_rhs.mtx"},
+    {"size": 100, "stencil": "7pt", "comm_pattern": "stencil",
+     "optimal": {"spmv": "csr-coo"}}
+  ]
+)";
+
+
+// input validation
+[[noreturn]] void print_config_error_and_exit()
+{
+    std::cerr << "Input has to be a JSON array of solver configurations:\n"
+              << example_config << std::endl;
+    std::exit(1);
+}
+
+
+void validate_option_object(const rapidjson::Value& value)
+{
+    if (!value.IsObject() ||
+        !((value.HasMember("size") && value.HasMember("stencil") &&
+           value["size"].IsInt64() && value["stencil"].IsString()) ||
+          (value.HasMember("filename") && value["filename"].IsString())) ||
+        (!value.HasMember("optimal") && !value["optimal"].HasMember("spmv") &&
+         !value["optimal"]["spmv"].IsString())) {
+        print_config_error_and_exit();
+    }
+}
+
+
+std::shared_ptr<const gko::stop::CriterionFactory> create_criterion(
+    std::shared_ptr<const gko::Executor> exec, std::uint32_t max_iters)
+{
+    std::shared_ptr<const gko::stop::CriterionFactory> residual_stop;
+    if (FLAGS_rel_residual) {
+        residual_stop =
+            gko::share(gko::stop::ResidualNorm<rc_etype>::build()
+                           .with_baseline(gko::stop::mode::initial_resnorm)
+                           .with_reduction_factor(
+                               static_cast<rc_etype>(FLAGS_rel_res_goal))
+                           .on(exec));
+    } else {
+        residual_stop =
+            gko::share(gko::stop::ResidualNorm<rc_etype>::build()
+                           .with_baseline(gko::stop::mode::rhs_norm)
+                           .with_reduction_factor(
+                               static_cast<rc_etype>(FLAGS_rel_res_goal))
+                           .on(exec));
+    }
+    auto iteration_stop = gko::share(
+        gko::stop::Iteration::build().with_max_iters(max_iters).on(exec));
+    std::vector<std::shared_ptr<const gko::stop::CriterionFactory>>
+        criterion_vector{residual_stop, iteration_stop};
+    return gko::stop::combine(criterion_vector);
+}
+
+
+template <typename SolverIntermediate>
+std::unique_ptr<gko::LinOpFactory> add_criteria_precond_finalize(
+    SolverIntermediate inter, const std::shared_ptr<const gko::Executor>& exec,
+    std::shared_ptr<const gko::LinOpFactory> precond, std::uint32_t max_iters)
+{
+    return inter.with_criteria(create_criterion(exec, max_iters))
+        .with_preconditioner(give(precond))
+        .on(exec);
+}
+
+
+template <typename Solver>
+std::unique_ptr<gko::LinOpFactory> add_criteria_precond_finalize(
+    const std::shared_ptr<const gko::Executor>& exec,
+    std::shared_ptr<const gko::LinOpFactory> precond, std::uint32_t max_iters)
+{
+    return add_criteria_precond_finalize(Solver::build(), exec, precond,
+                                         max_iters);
+}
+
+
+std::unique_ptr<gko::LinOpFactory> generate_solver(
+    const std::shared_ptr<const gko::Executor>& exec,
+    std::shared_ptr<const gko::LinOpFactory> precond,
+    const std::string& description, std::uint32_t max_iters)
+{
+    std::string cb_gmres_prefix("cb_gmres_");
+    if (description.find(cb_gmres_prefix) == 0) {
+        auto s_prec = gko::solver::cb_gmres::storage_precision::keep;
+        const auto spec = description.substr(cb_gmres_prefix.length());
+        if (spec == "keep") {
+            s_prec = gko::solver::cb_gmres::storage_precision::keep;
+        } else if (spec == "reduce1") {
+            s_prec = gko::solver::cb_gmres::storage_precision::reduce1;
+        } else if (spec == "reduce2") {
+            s_prec = gko::solver::cb_gmres::storage_precision::reduce2;
+        } else if (spec == "integer") {
+            s_prec = gko::solver::cb_gmres::storage_precision::integer;
+        } else if (spec == "ireduce1") {
+            s_prec = gko::solver::cb_gmres::storage_precision::ireduce1;
+        } else if (spec == "ireduce2") {
+            s_prec = gko::solver::cb_gmres::storage_precision::ireduce2;
+        } else {
+            throw std::range_error(
+                std::string(
+                    "CB-GMRES does not have a corresponding solver to <") +
+                description + ">!");
+        }
+        return add_criteria_precond_finalize(
+            gko::solver::CbGmres<etype>::build()
+                .with_krylov_dim(FLAGS_gmres_restart)
+                .with_storage_precision(s_prec),
+            exec, precond, max_iters);
+    } else if (description == "bicgstab") {
+        return add_criteria_precond_finalize<gko::solver::Bicgstab<etype>>(
+            exec, precond, max_iters);
+    } else if (description == "bicg") {
+        return add_criteria_precond_finalize<gko::solver::Bicg<etype>>(
+            exec, precond, max_iters);
+    } else if (description == "cg") {
+        return add_criteria_precond_finalize<gko::solver::Cg<etype>>(
+            exec, precond, max_iters);
+    } else if (description == "cgs") {
+        return add_criteria_precond_finalize<gko::solver::Cgs<etype>>(
+            exec, precond, max_iters);
+    } else if (description == "fcg") {
+        return add_criteria_precond_finalize<gko::solver::Fcg<etype>>(
+            exec, precond, max_iters);
+    } else if (description == "idr") {
+        return add_criteria_precond_finalize(
+            gko::solver::Idr<etype>::build()
+                .with_subspace_dim(FLAGS_idr_subspace_dim)
+                .with_kappa(static_cast<rc_etype>(FLAGS_idr_kappa)),
+            exec, precond, max_iters);
+    } else if (description == "gmres") {
+        return add_criteria_precond_finalize(
+            gko::solver::Gmres<etype>::build().with_krylov_dim(
+                FLAGS_gmres_restart),
+            exec, precond, max_iters);
+    } else if (description == "lower_trs") {
+        return gko::solver::LowerTrs<etype>::build()
+            .with_num_rhs(FLAGS_nrhs)
+            .on(exec);
+    } else if (description == "upper_trs") {
+        return gko::solver::UpperTrs<etype>::build()
+            .with_num_rhs(FLAGS_nrhs)
+            .on(exec);
+    } else if (description == "overhead") {
+        return add_criteria_precond_finalize<gko::Overhead<etype>>(
+            exec, precond, max_iters);
+    }
+    throw std::range_error(std::string("The provided string <") + description +
+                           "> does not match any solver!");
+}
+
+
+void write_precond_info(const gko::LinOp* precond,
+                        rapidjson::Value& precond_info,
+                        rapidjson::MemoryPoolAllocator<>& allocator)
+{
+    if (const auto jacobi =
+            dynamic_cast<const gko::preconditioner::Jacobi<etype>*>(precond)) {
+        // extract block sizes
+        const auto bdata =
+            jacobi->get_parameters().block_pointers.get_const_data();
+        add_or_set_member(precond_info, "block_sizes",
+                          rapidjson::Value(rapidjson::kArrayType), allocator);
+        const auto nblocks = jacobi->get_num_blocks();
+        for (auto i = decltype(nblocks){0}; i < nblocks; ++i) {
+            precond_info["block_sizes"].PushBack(bdata[i + 1] - bdata[i],
+                                                 allocator);
+        }
+
+        // extract block precisions
+        const auto pdata =
+            jacobi->get_parameters()
+                .storage_optimization.block_wise.get_const_data();
+        if (pdata) {
+            add_or_set_member(precond_info, "block_precisions",
+                              rapidjson::Value(rapidjson::kArrayType),
+                              allocator);
+            for (auto i = decltype(nblocks){0}; i < nblocks; ++i) {
+                precond_info["block_precisions"].PushBack(
+                    static_cast<int>(pdata[i]), allocator);
+            }
+        }
+
+        // extract condition numbers
+        const auto cdata = jacobi->get_conditioning();
+        if (cdata) {
+            add_or_set_member(precond_info, "block_conditioning",
+                              rapidjson::Value(rapidjson::kArrayType),
+                              allocator);
+            for (auto i = decltype(nblocks){0}; i < nblocks; ++i) {
+                precond_info["block_conditioning"].PushBack(cdata[i],
+                                                            allocator);
+            }
+        }
+    }
+}
+
+
+struct SolverGenerator : DefaultSystemGenerator<> {
+    using Vec = typename DefaultSystemGenerator::Vec;
+
+    std::unique_ptr<Vec> generate_rhs(std::shared_ptr<const gko::Executor> exec,
+                                      const gko::LinOp* system_matrix,
+                                      rapidjson::Value& config) const
+    {
+        if (config.HasMember("rhs")) {
+            std::ifstream rhs_fd{config["rhs"].GetString()};
+            return gko::read<Vec>(rhs_fd, std::move(exec));
+        } else {
+            gko::dim<2> vec_size{system_matrix->get_size()[0], FLAGS_nrhs};
+            if (FLAGS_rhs_generation == "1") {
+                return create_multi_vector(exec, vec_size, gko::one<etype>());
+            } else if (FLAGS_rhs_generation == "random") {
+                return create_multi_vector_random(exec, vec_size);
+            } else if (FLAGS_rhs_generation == "sinus") {
+                auto rhs = vec<etype>::create(exec, vec_size);
+
+                auto tmp = create_matrix_sin<etype>(exec, vec_size);
+                auto scalar = gko::matrix::Dense<rc_etype>::create(
+                    exec->get_master(), gko::dim<2>{1, vec_size[1]});
+                tmp->compute_norm2(scalar.get());
+                for (gko::size_type i = 0; i < vec_size[1]; ++i) {
+                    scalar->at(0, i) = gko::one<rc_etype>() / scalar->at(0, i);
+                }
+                // normalize sin-vector
+                if (gko::is_complex_s<etype>::value) {
+                    tmp->scale(scalar->make_complex().get());
+                } else {
+                    tmp->scale(scalar.get());
+                }
+                system_matrix->apply(tmp.get(), rhs.get());
+                return rhs;
+            }
+            throw std::invalid_argument(std::string("\"rhs_generation\" = ") +
+                                        FLAGS_rhs_generation +
+                                        " is not supported!");
+        }
+    }
+
+    std::unique_ptr<Vec> generate_initial_guess(
+        std::shared_ptr<const gko::Executor> exec,
+        const gko::LinOp* system_matrix, const Vec* rhs) const
+    {
+        gko::dim<2> vec_size{system_matrix->get_size()[1], FLAGS_nrhs};
+        if (FLAGS_initial_guess_generation == "0") {
+            return create_multi_vector(exec, vec_size, gko::zero<etype>());
+        } else if (FLAGS_initial_guess_generation == "random") {
+            return create_multi_vector_random(exec, vec_size);
+        } else if (FLAGS_initial_guess_generation == "rhs") {
+            return rhs->clone();
+        }
+        throw std::invalid_argument(
+            std::string("\"initial_guess_generation\" = ") +
+            FLAGS_initial_guess_generation + " is not supported!");
+    }
+
+    std::unique_ptr<Vec> initialize(
+        std::initializer_list<etype> il,
+        std::shared_ptr<const gko::Executor> exec) const
+    {
+        return gko::initialize<Vec>(std::move(il), std::move(exec));
+    }
+
+    std::default_random_engine engine = get_engine();
+};
+
+
+template <typename VectorType>
+void solve_system(const std::string& solver_name,
+                  const std::string& precond_name,
+                  const char* precond_solver_name,
+                  std::shared_ptr<gko::Executor> exec,
+                  std::shared_ptr<const gko::LinOp> system_matrix,
+                  const VectorType* b, const VectorType* x,
+                  rapidjson::Value& test_case,
+                  rapidjson::MemoryPoolAllocator<>& allocator)
+{
+    try {
+        auto& solver_case = test_case["solver"];
+        if (!FLAGS_overwrite && solver_case.HasMember(precond_solver_name)) {
+            return;
+        }
+
+        add_or_set_member(solver_case, precond_solver_name,
+                          rapidjson::Value(rapidjson::kObjectType), allocator);
+        auto& solver_json = solver_case[precond_solver_name];
+        add_or_set_member(solver_json, "recurrent_residuals",
+                          rapidjson::Value(rapidjson::kArrayType), allocator);
+        add_or_set_member(solver_json, "true_residuals",
+                          rapidjson::Value(rapidjson::kArrayType), allocator);
+        add_or_set_member(solver_json, "implicit_residuals",
+                          rapidjson::Value(rapidjson::kArrayType), allocator);
+        add_or_set_member(solver_json, "iteration_timestamps",
+                          rapidjson::Value(rapidjson::kArrayType), allocator);
+        if (b->get_size()[1] == 1 && !FLAGS_overhead) {
+            auto rhs_norm = compute_norm2(lend(b));
+            add_or_set_member(solver_json, "rhs_norm", rhs_norm, allocator);
+        }
+        for (auto stage : {"generate", "apply"}) {
+            add_or_set_member(solver_json, stage,
+                              rapidjson::Value(rapidjson::kObjectType),
+                              allocator);
+            add_or_set_member(solver_json[stage], "components",
+                              rapidjson::Value(rapidjson::kObjectType),
+                              allocator);
+        }
+
+        IterationControl ic{get_timer(exec, FLAGS_gpu_timer)};
+
+        // warm run
+        std::shared_ptr<gko::LinOp> solver;
+        for (auto _ : ic.warmup_run()) {
+            auto x_clone = clone(x);
+            auto precond = precond_factory.at(precond_name)(exec);
+            solver = generate_solver(exec, give(precond), solver_name,
+                                     FLAGS_warmup_max_iters)
+                         ->generate(system_matrix);
+            solver->apply(lend(b), lend(x_clone));
+            exec->synchronize();
+        }
+
+        // detail run
+        if (FLAGS_detailed && !FLAGS_overhead) {
+            // slow run, get the time of each functions
+            auto x_clone = clone(x);
+
+            auto gen_logger =
+                std::make_shared<OperationLogger>(FLAGS_nested_names);
+            exec->add_logger(gen_logger);
+            if (exec != exec->get_master()) {
+                exec->get_master()->add_logger(gen_logger);
+            }
+
+            auto precond = precond_factory.at(precond_name)(exec);
+            solver = generate_solver(exec, give(precond), solver_name,
+                                     FLAGS_max_iters)
+                         ->generate(system_matrix);
+
+            exec->remove_logger(gko::lend(gen_logger));
+            if (exec != exec->get_master()) {
+                exec->get_master()->remove_logger(gko::lend(gen_logger));
+            }
+            gen_logger->write_data(solver_json["generate"]["components"],
+                                   allocator, 1);
+
+            if (auto prec =
+                    dynamic_cast<const gko::Preconditionable*>(lend(solver))) {
+                add_or_set_member(solver_json, "preconditioner",
+                                  rapidjson::Value(rapidjson::kObjectType),
+                                  allocator);
+                write_precond_info(
+                    lend(clone(exec->get_master(), prec->get_preconditioner())),
+                    solver_json["preconditioner"], allocator);
+            }
+
+            auto apply_logger =
+                std::make_shared<OperationLogger>(FLAGS_nested_names);
+            exec->add_logger(apply_logger);
+            if (exec != exec->get_master()) {
+                exec->get_master()->add_logger(apply_logger);
+            }
+
+
+            solver->apply(lend(b), lend(x_clone));
+
+            exec->remove_logger(gko::lend(apply_logger));
+            if (exec != exec->get_master()) {
+                exec->get_master()->remove_logger(gko::lend(apply_logger));
+            }
+            apply_logger->write_data(solver_json["apply"]["components"],
+                                     allocator, 1);
+
+            // slow run, gets the recurrent and true residuals of each iteration
+            if (b->get_size()[1] == 1) {
+                x_clone = clone(x);
+                auto res_logger = std::make_shared<ResidualLogger<etype>>(
+                    lend(system_matrix), b, solver_json["recurrent_residuals"],
+                    solver_json["true_residuals"],
+                    solver_json["implicit_residuals"],
+                    solver_json["iteration_timestamps"], allocator);
+                solver->add_logger(res_logger);
+                solver->apply(lend(b), lend(x_clone));
+                if (!res_logger->has_implicit_res_norms()) {
+                    solver_json.RemoveMember("implicit_residuals");
+                }
+            }
+            exec->synchronize();
+        }
+
+        // timed run
+        auto it_logger = std::make_shared<IterationLogger>();
+        auto generate_timer = get_timer(exec, FLAGS_gpu_timer);
+        auto apply_timer = ic.get_timer();
+        auto x_clone = clone(x);
+        for (auto status : ic.run(false)) {
+            x_clone = clone(x);
+
+            exec->synchronize();
+            generate_timer->tic();
+            auto precond = precond_factory.at(precond_name)(exec);
+            solver = generate_solver(exec, give(precond), solver_name,
+                                     FLAGS_max_iters)
+                         ->generate(system_matrix);
+            generate_timer->toc();
+
+            exec->synchronize();
+            if (ic.get_num_repetitions() == 0) {
+                solver->add_logger(it_logger);
+            }
+            apply_timer->tic();
+            solver->apply(lend(b), lend(x_clone));
+            apply_timer->toc();
+            if (ic.get_num_repetitions() == 0) {
+                solver->remove_logger(gko::lend(it_logger));
+            }
+        }
+        it_logger->write_data(solver_json["apply"], allocator);
+
+        if (b->get_size()[1] == 1 && !FLAGS_overhead) {
+            // a solver is considered direct if it didn't log any iterations
+            if (solver_json["apply"].HasMember("iterations") &&
+                solver_json["apply"]["iterations"].GetInt() == 0) {
+                auto error =
+                    compute_direct_error(lend(solver), lend(b), lend(x_clone));
+                add_or_set_member(solver_json, "forward_error", error,
+                                  allocator);
+            }
+            auto residual = compute_residual_norm(lend(system_matrix), lend(b),
+                                                  lend(x_clone));
+            add_or_set_member(solver_json, "residual_norm", residual,
+                              allocator);
+        }
+        add_or_set_member(solver_json["generate"], "time",
+                          generate_timer->compute_average_time(), allocator);
+        add_or_set_member(solver_json["apply"], "time",
+                          apply_timer->compute_average_time(), allocator);
+        add_or_set_member(solver_json, "repetitions",
+                          apply_timer->get_num_repetitions(), allocator);
+
+        // compute and write benchmark data
+        add_or_set_member(solver_json, "completed", true, allocator);
+    } catch (const std::exception& e) {
+        add_or_set_member(test_case["solver"][precond_solver_name], "completed",
+                          false, allocator);
+        if (FLAGS_keep_errors) {
+            rapidjson::Value msg_value;
+            msg_value.SetString(e.what(), allocator);
+            add_or_set_member(test_case["solver"][precond_solver_name], "error",
+                              msg_value, allocator);
+        }
+        std::cerr << "Error when processing test case " << test_case << "\n"
+                  << "what(): " << e.what() << std::endl;
+    }
+}
+
+
+template <typename SystemGenerator>
+void run_solver_benchmarks(std::shared_ptr<gko::Executor> exec,
+                           rapidjson::Document& test_cases,
+                           const SystemGenerator& system_generator,
+                           bool do_print)
+{
+    auto solvers = split(FLAGS_solvers, ',');
+    auto preconds = split(FLAGS_preconditioners, ',');
+    std::vector<std::string> precond_solvers;
+    for (const auto& s : solvers) {
+        for (const auto& p : preconds) {
+            precond_solvers.push_back(s + (p == "none" ? "" : "-" + p));
+        }
+    }
+
+    auto& allocator = test_cases.GetAllocator();
+
+    for (auto& test_case : test_cases.GetArray()) {
+        try {
+            // set up benchmark
+            validate_option_object(test_case);
+            if (!test_case.HasMember("solver")) {
+                test_case.AddMember("solver",
+                                    rapidjson::Value(rapidjson::kObjectType),
+                                    allocator);
+            }
+            auto& solver_case = test_case["solver"];
+            if (!FLAGS_overwrite &&
+                all_of(begin(precond_solvers), end(precond_solvers),
+                       [&solver_case](const std::string& s) {
+                           return solver_case.HasMember(s.c_str());
+                       })) {
+                continue;
+            }
+            if (do_print) {
+                std::clog << "Running test case: " << test_case << std::endl;
+            }
+
+            using Vec = typename SystemGenerator::Vec;
+            std::shared_ptr<gko::LinOp> system_matrix;
+            std::unique_ptr<Vec> b;
+            std::unique_ptr<Vec> x;
+            if (FLAGS_overhead) {
+                system_matrix = system_generator.initialize({1.0}, exec);
+                b = system_generator.initialize(
+                    {std::numeric_limits<rc_etype>::quiet_NaN()}, exec);
+                x = system_generator.initialize({0.0}, exec);
+            } else {
+                system_matrix =
+                    system_generator.generate_matrix_with_optimal_format(
+                        exec, test_case);
+                b = system_generator.generate_rhs(exec, system_matrix.get(),
+                                                  test_case);
+                x = system_generator.generate_initial_guess(
+                    exec, system_matrix.get(), b.get());
+            }
+
+            if (do_print) {
+                std::clog << "Matrix is of size ("
+                          << system_matrix->get_size()[0] << ", "
+                          << system_matrix->get_size()[1] << ")" << std::endl;
+            }
+            add_or_set_member(test_case, "size", system_matrix->get_size()[0],
+                              allocator);
+            auto precond_solver_name = begin(precond_solvers);
+            for (const auto& solver_name : solvers) {
+                for (const auto& precond_name : preconds) {
+                    if (do_print) {
+                        std::clog
+                            << "\tRunning solver: " << *precond_solver_name
+                            << std::endl;
+                    }
+                    solve_system(solver_name, precond_name,
+                                 precond_solver_name->c_str(), exec,
+                                 system_matrix, lend(b), lend(x), test_case,
+                                 allocator);
+                    if (do_print) {
+                        backup_results(test_cases);
+                    }
+                    ++precond_solver_name;
+                }
+            }
+        } catch (const std::exception& e) {
+            std::cerr << "Error setting up solver, what(): " << e.what()
+                      << std::endl;
+        }
+    }
+}
+
+
+#endif  // GINKGO_BENCHMARK_SOLVER_SOLVER_COMMON_HPP
diff --git a/benchmark/sparse_blas/sparse_blas.cpp b/benchmark/sparse_blas/sparse_blas.cpp
index adc75e03ae5..3adde1b348c 100644
--- a/benchmark/sparse_blas/sparse_blas.cpp
+++ b/benchmark/sparse_blas/sparse_blas.cpp
@@ -34,10 +34,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 
 #include <algorithm>
-#include <chrono>
-#include <cstdlib>
 #include <exception>
-#include <fstream>
 #include <functional>
 #include <iomanip>
 #include <iostream>
@@ -45,12 +42,11 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <numeric>
 #include <random>
 #include <typeinfo>
-#include <unordered_set>
 
 
 #include "benchmark/utils/general.hpp"
-#include "benchmark/utils/loggers.hpp"
-#include "benchmark/utils/spmv_common.hpp"
+#include "benchmark/utils/generator.hpp"
+#include "benchmark/utils/spmv_validation.hpp"
 #include "benchmark/utils/types.hpp"
 #include "core/test/utils/matrix_generator.hpp"
 
@@ -58,7 +54,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 const auto benchmark_name = "sparse_blas";
 
 
-using itype = gko::int32;
 using Mtx = gko::matrix::Csr<etype, itype>;
 using mat_data = gko::matrix_data<etype, itype>;
 
@@ -454,9 +449,7 @@ int main(int argc, char* argv[])
     std::string header =
         "A benchmark for measuring performance of Ginkgo's sparse BLAS "
         "operations.\n";
-    std::string format = std::string() + "  [\n" +
-                         "    { \"filename\": \"my_file.mtx\"},\n" +
-                         "    { \"filename\": \"my_file2.mtx\"}\n" + "  ]\n\n";
+    std::string format = example_config;
     initialize_argument_parsing(&argc, &argv, header, format);
 
     auto exec = executor_factory.at(FLAGS_executor)(FLAGS_gpu_timer);
@@ -487,12 +480,13 @@ int main(int argc, char* argv[])
             }
             auto& sp_blas_case = test_case[benchmark_name];
             std::clog << "Running test case: " << test_case << std::endl;
-            std::ifstream mtx_fd(test_case["filename"].GetString());
-            auto data = gko::read_generic_raw<etype, itype>(mtx_fd);
+            auto data =
+                DefaultSystemGenerator<>::generate_matrix_data(test_case);
             data.ensure_row_major_order();
             std::clog << "Matrix is of size (" << data.size[0] << ", "
                       << data.size[1] << "), " << data.nonzeros.size()
                       << std::endl;
+            add_or_set_member(test_case, "size", data.size[0], allocator);
 
             for (const auto& strategy_name : strategies) {
                 auto mtx = Mtx::create(exec, data.size, data.nonzeros.size(),
diff --git a/benchmark/spmv/CMakeLists.txt b/benchmark/spmv/CMakeLists.txt
index 6c5d10517a8..1e3bab1c884 100644
--- a/benchmark/spmv/CMakeLists.txt
+++ b/benchmark/spmv/CMakeLists.txt
@@ -1 +1,4 @@
 ginkgo_add_typed_benchmark_executables(spmv "YES" spmv.cpp)
+if(GINKGO_BUILD_MPI)
+    add_subdirectory(distributed)
+endif()
diff --git a/benchmark/spmv/distributed/CMakeLists.txt b/benchmark/spmv/distributed/CMakeLists.txt
new file mode 100644
index 00000000000..cadde3eea34
--- /dev/null
+++ b/benchmark/spmv/distributed/CMakeLists.txt
@@ -0,0 +1 @@
+ginkgo_add_typed_benchmark_executables(spmv-distributed "YES" spmv.cpp)
diff --git a/benchmark/spmv/distributed/spmv.cpp b/benchmark/spmv/distributed/spmv.cpp
new file mode 100644
index 00000000000..03530b139db
--- /dev/null
+++ b/benchmark/spmv/distributed/spmv.cpp
@@ -0,0 +1,139 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include <ginkgo/ginkgo.hpp>
+
+
+#include <algorithm>
+#include <chrono>
+#include <cstdlib>
+#include <exception>
+#include <fstream>
+#include <iomanip>
+#include <iostream>
+#include <typeinfo>
+
+
+#include "benchmark/spmv/spmv_common.hpp"
+#include "benchmark/utils/general.hpp"
+#include "benchmark/utils/generator.hpp"
+#include "benchmark/utils/timer.hpp"
+#include "benchmark/utils/types.hpp"
+
+
+DEFINE_string(local_formats, "csr",
+              "A comma-separated list of formats for the local matrix to run. "
+              "See the 'formats' option for a list of supported versions");
+DEFINE_string(non_local_formats, "csr",
+              "A comma-separated list of formats for the non-local matrix to "
+              "run. See the 'formats' option for a list of supported versions");
+
+
+std::string example_config = R"(
+  [
+    {"size": 100, "stencil": "7pt", "comm_pattern": "stencil"},
+    {"filename": "my_file.mtx"}
+  ]
+)";
+
+
+[[noreturn]] void print_config_error_and_exit()
+{
+    std::cerr << "Input has to be a JSON array of matrix configurations:\n"
+              << example_config << std::endl;
+    std::exit(1);
+}
+
+
+struct Generator : DistributedDefaultSystemGenerator<DefaultSystemGenerator<>> {
+    Generator(gko::experimental::mpi::communicator comm)
+        : DistributedDefaultSystemGenerator<DefaultSystemGenerator<>>{
+              std::move(comm), {}}
+    {}
+
+    void validate_options(const rapidjson::Value& options) const
+    {
+        if (!options.IsObject() ||
+            !((options.HasMember("size") && options.HasMember("stencil") &&
+               options.HasMember("comm_pattern")) ||
+              options.HasMember("filename"))) {
+            print_config_error_and_exit();
+        }
+    }
+};
+
+
+int main(int argc, char* argv[])
+{
+    gko::experimental::mpi::environment mpi_env{argc, argv};
+
+    const auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD);
+    const auto rank = comm.rank();
+
+    std::string header =
+        "A benchmark for measuring performance of Ginkgo's spmv.\n";
+    std::string format = example_config;
+    initialize_argument_parsing(&argc, &argv, header, format);
+
+    if (rank == 0) {
+        std::string extra_information = "The formats are [" +
+                                        FLAGS_local_formats + "]x[" +
+                                        FLAGS_non_local_formats + "]\n" +
+                                        "The number of right hand sides is " +
+                                        std::to_string(FLAGS_nrhs) + "\n";
+        print_general_information(extra_information);
+    }
+
+    auto exec = executor_factory_mpi.at(FLAGS_executor)(comm.get());
+
+    auto local_formats = split(FLAGS_local_formats, ',');
+    auto non_local_formats = split(FLAGS_non_local_formats, ',');
+    std::vector<std::string> formats;
+    for (const auto& local_fmt : local_formats) {
+        for (const auto& non_local_fmt : non_local_formats) {
+            formats.push_back(local_fmt + "-" + non_local_fmt);
+        }
+    }
+
+    std::string json_input = broadcast_json_input(std::cin, comm);
+    rapidjson::Document test_cases;
+    test_cases.Parse(json_input.c_str());
+    if (!test_cases.IsArray()) {
+        print_config_error_and_exit();
+    }
+
+    run_spmv_benchmark(exec, test_cases, formats, Generator{comm}, rank == 0);
+
+    if (rank == 0) {
+        std::cout << test_cases << std::endl;
+    }
+}
diff --git a/benchmark/spmv/spmv.cpp b/benchmark/spmv/spmv.cpp
index 9178cbd2a08..7090e978956 100644
--- a/benchmark/spmv/spmv.cpp
+++ b/benchmark/spmv/spmv.cpp
@@ -33,146 +33,37 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/ginkgo.hpp>
 
 
-#include <algorithm>
-#include <chrono>
 #include <cstdlib>
-#include <exception>
-#include <fstream>
-#include <iomanip>
 #include <iostream>
-#include <typeinfo>
 
 
+#include "benchmark/spmv/spmv_common.hpp"
 #include "benchmark/utils/formats.hpp"
 #include "benchmark/utils/general.hpp"
-#include "benchmark/utils/loggers.hpp"
-#include "benchmark/utils/spmv_common.hpp"
-#include "benchmark/utils/timer.hpp"
-#include "benchmark/utils/types.hpp"
-
-
-#ifdef GINKGO_BENCHMARK_ENABLE_TUNING
-#include "benchmark/utils/tuning_variables.hpp"
-#endif  // GINKGO_BENCHMARK_ENABLE_TUNING
-
-
-// Command-line arguments
-DEFINE_uint32(nrhs, 1, "The number of right hand sides");
-
-
-// This function supposes that management of `FLAGS_overwrite` is done before
-// calling it
-void apply_spmv(const char* format_name, std::shared_ptr<gko::Executor> exec,
-                const gko::matrix_data<etype, itype>& data, const vec<etype>* b,
-                const vec<etype>* x, const vec<etype>* answer,
-                rapidjson::Value& test_case,
-                rapidjson::MemoryPoolAllocator<>& allocator)
-{
-    try {
-        auto& spmv_case = test_case["spmv"];
-        add_or_set_member(spmv_case, format_name,
-                          rapidjson::Value(rapidjson::kObjectType), allocator);
-
-        auto storage_logger = std::make_shared<StorageLogger>();
-        exec->add_logger(storage_logger);
-        auto system_matrix =
-            share(formats::matrix_factory.at(format_name)(exec, data));
-
-        exec->remove_logger(gko::lend(storage_logger));
-        storage_logger->write_data(spmv_case[format_name], allocator);
-        // check the residual
-        if (FLAGS_detailed) {
-            auto x_clone = clone(x);
-            exec->synchronize();
-            system_matrix->apply(lend(b), lend(x_clone));
-            exec->synchronize();
-            auto max_relative_norm2 =
-                compute_max_relative_norm2(lend(x_clone), lend(answer));
-            add_or_set_member(spmv_case[format_name], "max_relative_norm2",
-                              max_relative_norm2, allocator);
-        }
-
-        IterationControl ic{get_timer(exec, FLAGS_gpu_timer)};
-        // warm run
-        for (auto _ : ic.warmup_run()) {
-            auto x_clone = clone(x);
-            exec->synchronize();
-            system_matrix->apply(lend(b), lend(x_clone));
-            exec->synchronize();
-        }
-
-        // tuning run
-#ifdef GINKGO_BENCHMARK_ENABLE_TUNING
-        auto& format_case = spmv_case[format_name];
-        if (!format_case.HasMember("tuning")) {
-            format_case.AddMember(
-                "tuning", rapidjson::Value(rapidjson::kObjectType), allocator);
+#include "benchmark/utils/generator.hpp"
+#include "benchmark/utils/spmv_validation.hpp"
+
+
+struct Generator : DefaultSystemGenerator<> {
+    void validate_options(const rapidjson::Value& options) const
+    {
+        if (!options.IsObject() ||
+            !((options.HasMember("size") && options.HasMember("stencil")) ||
+              options.HasMember("filename"))) {
+            std::cerr
+                << "Input has to be a JSON array of matrix configurations:\n"
+                << example_config << std::endl;
+            std::exit(1);
         }
-        auto& tuning_case = format_case["tuning"];
-        add_or_set_member(tuning_case, "time",
-                          rapidjson::Value(rapidjson::kArrayType), allocator);
-        add_or_set_member(tuning_case, "values",
-                          rapidjson::Value(rapidjson::kArrayType), allocator);
-
-        // Enable tuning for this portion of code
-        gko::_tuning_flag = true;
-        // Select some values we want to tune.
-        std::vector<gko::size_type> tuning_values{
-            1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096};
-        for (auto val : tuning_values) {
-            // Actually set the value that will be tuned. See
-            // cuda/components/format_conversion.cuh for an example of how this
-            // variable is used.
-            gko::_tuned_value = val;
-            auto tuning_timer = get_timer(exec, FLAGS_gpu_timer);
-            IterationControl ic_tuning{tuning_timer};
-            auto x_clone = clone(x);
-            for (auto _ : ic_tuning.run()) {
-                system_matrix->apply(lend(b), lend(x_clone));
-            }
-            tuning_case["time"].PushBack(ic_tuning.compute_average_time(),
-                                         allocator);
-            tuning_case["values"].PushBack(val, allocator);
-        }
-        // We put back the flag to false to use the default (non-tuned) values
-        // for the following
-        gko::_tuning_flag = false;
-#endif  // GINKGO_BENCHMARK_ENABLE_TUNING
-
-        // timed run
-        auto x_clone = clone(x);
-        for (auto _ : ic.run()) {
-            system_matrix->apply(lend(b), lend(x_clone));
-        }
-        add_or_set_member(spmv_case[format_name], "time",
-                          ic.compute_average_time(), allocator);
-        add_or_set_member(spmv_case[format_name], "repetitions",
-                          ic.get_num_repetitions(), allocator);
-
-        // compute and write benchmark data
-        add_or_set_member(spmv_case[format_name], "completed", true, allocator);
-    } catch (const std::exception& e) {
-        add_or_set_member(test_case["spmv"][format_name], "completed", false,
-                          allocator);
-        if (FLAGS_keep_errors) {
-            rapidjson::Value msg_value;
-            msg_value.SetString(e.what(), allocator);
-            add_or_set_member(test_case["spmv"][format_name], "error",
-                              msg_value, allocator);
-        }
-        std::cerr << "Error when processing test case " << test_case << "\n"
-                  << "what(): " << e.what() << std::endl;
     }
-}
+};
 
 
 int main(int argc, char* argv[])
 {
     std::string header =
         "A benchmark for measuring performance of Ginkgo's spmv.\n";
-    std::string format = std::string() + "  [\n" +
-                         "    { \"filename\": \"my_file.mtx\"},\n" +
-                         "    { \"filename\": \"my_file2.mtx\"}\n" + "  ]\n\n";
+    std::string format = example_config;
     initialize_argument_parsing(&argc, &argv, header, format);
 
     std::string extra_information = "The formats are " + FLAGS_formats +
@@ -181,7 +72,6 @@ int main(int argc, char* argv[])
     print_general_information(extra_information);
 
     auto exec = executor_factory.at(FLAGS_executor)(FLAGS_gpu_timer);
-    auto engine = get_engine();
     auto formats = split(FLAGS_formats, ',');
 
     rapidjson::IStreamWrapper jcin(std::cin);
@@ -191,80 +81,7 @@ int main(int argc, char* argv[])
         print_config_error_and_exit();
     }
 
-    auto& allocator = test_cases.GetAllocator();
-
-    for (auto& test_case : test_cases.GetArray()) {
-        try {
-            // set up benchmark
-            validate_option_object(test_case);
-            if (!test_case.HasMember("spmv")) {
-                test_case.AddMember("spmv",
-                                    rapidjson::Value(rapidjson::kObjectType),
-                                    allocator);
-            }
-            auto& spmv_case = test_case["spmv"];
-            if (!FLAGS_overwrite &&
-                all_of(begin(formats), end(formats),
-                       [&spmv_case](const std::string& s) {
-                           return spmv_case.HasMember(s.c_str());
-                       })) {
-                continue;
-            }
-            std::clog << "Running test case: " << test_case << std::endl;
-            std::ifstream mtx_fd(test_case["filename"].GetString());
-            auto data = gko::read_generic_raw<etype, itype>(mtx_fd);
-
-            auto nrhs = FLAGS_nrhs;
-            auto b = create_matrix<etype>(exec, gko::dim<2>{data.size[1], nrhs},
-                                          engine);
-            auto x = create_matrix<etype>(exec, gko::dim<2>{data.size[0], nrhs},
-                                          engine);
-            std::clog << "Matrix is of size (" << data.size[0] << ", "
-                      << data.size[1] << ")" << std::endl;
-            std::string best_format("none");
-            auto best_performance = 0.0;
-            if (!test_case.HasMember("optimal")) {
-                test_case.AddMember("optimal",
-                                    rapidjson::Value(rapidjson::kObjectType),
-                                    allocator);
-            }
-
-            // Compute the result from ginkgo::coo as the correct answer
-            auto answer = vec<etype>::create(exec);
-            if (FLAGS_detailed) {
-                auto system_matrix =
-                    share(formats::matrix_factory.at("coo")(exec, data));
-                answer->copy_from(lend(x));
-                exec->synchronize();
-                system_matrix->apply(lend(b), lend(answer));
-                exec->synchronize();
-            }
-            for (const auto& format_name : formats) {
-                apply_spmv(format_name.c_str(), exec, data, lend(b), lend(x),
-                           lend(answer), test_case, allocator);
-                std::clog << "Current state:" << std::endl
-                          << test_cases << std::endl;
-                if (spmv_case[format_name.c_str()]["completed"].GetBool()) {
-                    auto performance =
-                        spmv_case[format_name.c_str()]["time"].GetDouble();
-                    if (best_format == "none" ||
-                        performance < best_performance) {
-                        best_format = format_name;
-                        best_performance = performance;
-                        add_or_set_member(
-                            test_case["optimal"], "spmv",
-                            rapidjson::Value(best_format.c_str(), allocator)
-                                .Move(),
-                            allocator);
-                    }
-                }
-                backup_results(test_cases);
-            }
-        } catch (const std::exception& e) {
-            std::cerr << "Error setting up matrix data, what(): " << e.what()
-                      << std::endl;
-        }
-    }
+    run_spmv_benchmark(exec, test_cases, formats, Generator{}, true);
 
     std::cout << test_cases << std::endl;
 }
diff --git a/benchmark/spmv/spmv_common.hpp b/benchmark/spmv/spmv_common.hpp
new file mode 100644
index 00000000000..7ed9854832a
--- /dev/null
+++ b/benchmark/spmv/spmv_common.hpp
@@ -0,0 +1,245 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#ifndef GINKGO_BENCHMARK_SPMV_SPMV_COMMON_HPP
+#define GINKGO_BENCHMARK_SPMV_SPMV_COMMON_HPP
+
+
+#include "benchmark/utils/formats.hpp"
+#include "benchmark/utils/general.hpp"
+#include "benchmark/utils/loggers.hpp"
+#include "benchmark/utils/timer.hpp"
+#include "benchmark/utils/types.hpp"
+#ifdef GINKGO_BENCHMARK_ENABLE_TUNING
+#include "benchmark/utils/tuning_variables.hpp"
+#endif  // GINKGO_BENCHMARK_ENABLE_TUNING
+
+
+// Command-line arguments
+DEFINE_uint32(nrhs, 1, "The number of right hand sides");
+
+
+// This function supposes that management of `FLAGS_overwrite` is done before
+// calling it
+template <typename Generator, typename VectorType, typename IndexType>
+void apply_spmv(const char* format_name, std::shared_ptr<gko::Executor> exec,
+                const Generator& generator,
+                const gko::matrix_data<etype, IndexType>& data,
+                const VectorType* b, const VectorType* x,
+                const VectorType* answer, rapidjson::Value& test_case,
+                rapidjson::MemoryPoolAllocator<>& allocator)
+{
+    try {
+        auto& spmv_case = test_case["spmv"];
+        add_or_set_member(spmv_case, format_name,
+                          rapidjson::Value(rapidjson::kObjectType), allocator);
+
+        auto system_matrix = generator.generate_matrix_with_format(
+            exec, format_name, data, &spmv_case[format_name], &allocator);
+
+        // check the residual
+        if (FLAGS_detailed) {
+            auto x_clone = clone(x);
+            exec->synchronize();
+            system_matrix->apply(lend(b), lend(x_clone));
+            exec->synchronize();
+            auto max_relative_norm2 =
+                compute_max_relative_norm2(lend(x_clone), lend(answer));
+            add_or_set_member(spmv_case[format_name], "max_relative_norm2",
+                              max_relative_norm2, allocator);
+        }
+
+        IterationControl ic{get_timer(exec, FLAGS_gpu_timer)};
+        // warm run
+        for (auto _ : ic.warmup_run()) {
+            auto x_clone = clone(x);
+            exec->synchronize();
+            system_matrix->apply(lend(b), lend(x_clone));
+            exec->synchronize();
+        }
+
+        // tuning run
+#ifdef GINKGO_BENCHMARK_ENABLE_TUNING
+        auto& format_case = spmv_case[format_name];
+        if (!format_case.HasMember("tuning")) {
+            format_case.AddMember(
+                "tuning", rapidjson::Value(rapidjson::kObjectType), allocator);
+        }
+        auto& tuning_case = format_case["tuning"];
+        add_or_set_member(tuning_case, "time",
+                          rapidjson::Value(rapidjson::kArrayType), allocator);
+        add_or_set_member(tuning_case, "values",
+                          rapidjson::Value(rapidjson::kArrayType), allocator);
+
+        // Enable tuning for this portion of code
+        gko::_tuning_flag = true;
+        // Select some values we want to tune.
+        std::vector<gko::size_type> tuning_values{
+            1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096};
+        for (auto val : tuning_values) {
+            // Actually set the value that will be tuned. See
+            // cuda/components/format_conversion.cuh for an example of how this
+            // variable is used.
+            gko::_tuned_value = val;
+            auto tuning_timer = get_timer(exec, FLAGS_gpu_timer);
+            IterationControl ic_tuning{tuning_timer};
+            auto x_clone = clone(x);
+            for (auto _ : ic_tuning.run()) {
+                system_matrix->apply(lend(b), lend(x_clone));
+            }
+            tuning_case["time"].PushBack(ic_tuning.compute_average_time(),
+                                         allocator);
+            tuning_case["values"].PushBack(val, allocator);
+        }
+        // We put back the flag to false to use the default (non-tuned) values
+        // for the following
+        gko::_tuning_flag = false;
+#endif  // GINKGO_BENCHMARK_ENABLE_TUNING
+
+        // timed run
+        auto x_clone = clone(x);
+        for (auto _ : ic.run()) {
+            system_matrix->apply(lend(b), lend(x_clone));
+        }
+        add_or_set_member(spmv_case[format_name], "time",
+                          ic.compute_average_time(), allocator);
+        add_or_set_member(spmv_case[format_name], "repetitions",
+                          ic.get_num_repetitions(), allocator);
+
+        // compute and write benchmark data
+        add_or_set_member(spmv_case[format_name], "completed", true, allocator);
+    } catch (const std::exception& e) {
+        add_or_set_member(test_case["spmv"][format_name], "completed", false,
+                          allocator);
+        if (FLAGS_keep_errors) {
+            rapidjson::Value msg_value;
+            msg_value.SetString(e.what(), allocator);
+            add_or_set_member(test_case["spmv"][format_name], "error",
+                              msg_value, allocator);
+        }
+        std::cerr << "Error when processing test case " << test_case << "\n"
+                  << "what(): " << e.what() << std::endl;
+    }
+}
+
+
+template <typename SystemGenerator>
+void run_spmv_benchmark(std::shared_ptr<gko::Executor> exec,
+                        rapidjson::Document& test_cases,
+                        const std::vector<std::string> formats,
+                        const SystemGenerator& system_generator, bool do_print)
+{
+    auto& allocator = test_cases.GetAllocator();
+
+    for (auto& test_case : test_cases.GetArray()) {
+        try {
+            // set up benchmark
+            system_generator.validate_options(test_case);
+            if (!test_case.HasMember("spmv")) {
+                test_case.AddMember("spmv",
+                                    rapidjson::Value(rapidjson::kObjectType),
+                                    allocator);
+            }
+            auto& spmv_case = test_case["spmv"];
+            if (!FLAGS_overwrite &&
+                all_of(begin(formats), end(formats),
+                       [&spmv_case](const std::string& s) {
+                           return spmv_case.HasMember(s.c_str());
+                       })) {
+                continue;
+            }
+            if (do_print) {
+                std::clog << "Running test case: " << test_case << std::endl;
+            }
+            auto data = system_generator.generate_matrix_data(test_case);
+
+            auto nrhs = FLAGS_nrhs;
+            auto b = system_generator.create_multi_vector_random(
+                exec, gko::dim<2>{data.size[1], nrhs});
+            auto x = system_generator.create_multi_vector_random(
+                exec, gko::dim<2>{data.size[0], nrhs});
+            if (do_print) {
+                std::clog << "Matrix is of size (" << data.size[0] << ", "
+                          << data.size[1] << ")" << std::endl;
+            }
+            add_or_set_member(test_case, "size", data.size[0], allocator);
+            add_or_set_member(test_case, "nnz", data.nonzeros.size(),
+                              allocator);
+            auto best_performance = std::numeric_limits<double>::max();
+            if (!test_case.HasMember("optimal")) {
+                test_case.AddMember("optimal",
+                                    rapidjson::Value(rapidjson::kObjectType),
+                                    allocator);
+            }
+
+            // Compute the result from ginkgo::coo as the correct answer
+            auto answer = gko::clone(lend(x));
+            if (FLAGS_detailed) {
+                auto system_matrix =
+                    system_generator.generate_matrix_with_default_format(exec,
+                                                                         data);
+                exec->synchronize();
+                system_matrix->apply(lend(b), lend(answer));
+                exec->synchronize();
+            }
+            for (const auto& format_name : formats) {
+                apply_spmv(format_name.c_str(), exec, system_generator, data,
+                           lend(b), lend(x), lend(answer), test_case,
+                           allocator);
+                if (do_print) {
+                    std::clog << "Current state:" << std::endl
+                              << test_cases << std::endl;
+                }
+                if (spmv_case[format_name.c_str()]["completed"].GetBool()) {
+                    auto performance =
+                        spmv_case[format_name.c_str()]["time"].GetDouble();
+                    if (performance < best_performance) {
+                        best_performance = performance;
+                        add_or_set_member(
+                            test_case["optimal"], "spmv",
+                            rapidjson::Value(format_name.c_str(), allocator)
+                                .Move(),
+                            allocator);
+                    }
+                }
+                if (do_print) {
+                    backup_results(test_cases);
+                }
+            }
+        } catch (const std::exception& e) {
+            std::cerr << "Error setting up matrix data, what(): " << e.what()
+                      << std::endl;
+        }
+    }
+}
+
+#endif  // GINKGO_BENCHMARK_SPMV_SPMV_COMMON_HPP
diff --git a/benchmark/utils/cuda_linops.cu b/benchmark/utils/cuda_linops.cpp
similarity index 100%
rename from benchmark/utils/cuda_linops.cu
rename to benchmark/utils/cuda_linops.cpp
diff --git a/benchmark/utils/cuda_timer.cu b/benchmark/utils/cuda_timer.cpp
similarity index 100%
rename from benchmark/utils/cuda_timer.cu
rename to benchmark/utils/cuda_timer.cpp
diff --git a/benchmark/utils/distributed_helpers.hpp b/benchmark/utils/distributed_helpers.hpp
new file mode 100644
index 00000000000..edbad2a9427
--- /dev/null
+++ b/benchmark/utils/distributed_helpers.hpp
@@ -0,0 +1,77 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#ifndef GINKGO_BENCHMARK_UTILS_DISTRIBUTED_HELPERS_HPP
+#define GINKGO_BENCHMARK_UTILS_DISTRIBUTED_HELPERS_HPP
+
+
+#include "benchmark/utils/formats.hpp"
+#include "benchmark/utils/general.hpp"
+#include "benchmark/utils/loggers.hpp"
+#include "benchmark/utils/stencil_matrix.hpp"
+
+
+using global_itype = gko::int64;
+
+
+template <typename ValueType>
+using dist_vec = gko::experimental::distributed::Vector<ValueType>;
+template <typename ValueType, typename LocalIndexType, typename GlobalIndexType>
+using dist_mtx =
+    gko::experimental::distributed::Matrix<ValueType, LocalIndexType,
+                                           GlobalIndexType>;
+
+
+std::string broadcast_json_input(std::istream& is,
+                                 gko::experimental::mpi::communicator comm)
+{
+    auto exec = gko::ReferenceExecutor::create();
+
+    std::string json_input;
+    if (comm.rank() == 0) {
+        std::string line;
+        while (is >> line) {
+            json_input += line;
+        }
+    }
+
+    auto input_size = json_input.size();
+    comm.broadcast(exec->get_master(), &input_size, 1, 0);
+    json_input.resize(input_size);
+    comm.broadcast(exec->get_master(), &json_input[0],
+                   static_cast<int>(input_size), 0);
+
+    return json_input;
+}
+
+
+#endif  // GINKGO_BENCHMARK_UTILS_DISTRIBUTED_HELPERS_HPP
diff --git a/benchmark/utils/formats.hpp b/benchmark/utils/formats.hpp
index 00d0aced282..87c8eac208e 100644
--- a/benchmark/utils/formats.hpp
+++ b/benchmark/utils/formats.hpp
@@ -53,7 +53,7 @@ namespace formats {
 
 
 std::string available_format =
-    "coo, csr, ell, ell-mixed, sellp, hybrid, hybrid0, hybrid25, hybrid33, "
+    "coo, csr, ell, ell_mixed, sellp, hybrid, hybrid0, hybrid25, hybrid33, "
     "hybrid40, "
     "hybrid60, hybrid80, hybridlimit0, hybridlimit25, hybridlimit33, "
     "hybridminstorage"
@@ -84,7 +84,7 @@ std::string format_description =
     "csrs: Ginkgo's CSR implementation with sparselib strategy.\n"
     "ell: Ellpack format according to Bell and Garland: Efficient Sparse\n"
     "     Matrix-Vector Multiplication on CUDA.\n"
-    "ell-mixed: Mixed Precision Ellpack format according to Bell and Garland:\n"
+    "ell_mixed: Mixed Precision Ellpack format according to Bell and Garland:\n"
     "           Efficient Sparse Matrix-Vector Multiplication on CUDA.\n"
     "sellp: Sliced Ellpack uses a default block size of 32.\n"
     "hybrid: Hybrid uses ELL and COO to represent the matrix.\n"
@@ -158,50 +158,9 @@ namespace formats {
 // some shortcuts
 using hybrid = gko::matrix::Hybrid<etype, itype>;
 using csr = gko::matrix::Csr<etype, itype>;
-
-/**
- * Creates a Ginkgo matrix from the intermediate data representation format
- * gko::matrix_data.
- *
- * @param exec  the executor where the matrix will be put
- * @param data  the data represented in the intermediate representation format
- *
- * @tparam MatrixType  the Ginkgo matrix type (such as `gko::matrix::Csr<>`)
- *
- * @return a `unique_pointer` to the created matrix
- */
-template <typename MatrixType>
-std::unique_ptr<MatrixType> read_matrix_from_data(
-    std::shared_ptr<const gko::Executor> exec,
-    const gko::matrix_data<etype, itype>& data)
-{
-    auto mat = MatrixType::create(std::move(exec));
-    mat->read(data);
-    return mat;
-}
-
-
-/**
- * Creates a Ginkgo sparselib matrix from the intermediate data representation
- * format gko::matrix_data.
- *
- * @param exec  the executor where the matrix will be put
- * @param data  the data represented in the intermediate representation format
- *
- * @tparam MatrixTagType  the tag type for the matrix format, see
- *                        sparselib_linops.hpp
- *
- * @return a `unique_pointer` to the created matrix
- */
-template <typename MatrixTagType>
-std::unique_ptr<gko::LinOp> read_splib_matrix_from_data(
-    std::shared_ptr<const gko::Executor> exec,
-    const gko::matrix_data<etype, itype>& data)
-{
-    auto mat = create_sparselib_linop<MatrixTagType>(std::move(exec));
-    gko::as<gko::ReadableFromMatrixData<etype, itype>>(mat.get())->read(data);
-    return mat;
-}
+using coo = gko::matrix::Coo<etype, itype>;
+using ell = gko::matrix::Ell<etype, itype>;
+using ell_mixed = gko::matrix::Ell<gko::next_precision<etype>, itype>;
 
 
 /**
@@ -218,6 +177,9 @@ std::shared_ptr<csr::strategy_type> create_gpu_strategy(
         return std::make_shared<Strategy>(cuda->shared_from_this());
     } else if (auto hip = dynamic_cast<const gko::HipExecutor*>(exec.get())) {
         return std::make_shared<Strategy>(hip->shared_from_this());
+    } else if (auto dpcpp =
+                   dynamic_cast<const gko::DpcppExecutor*>(exec.get())) {
+        return std::make_shared<Strategy>(dpcpp->shared_from_this());
     } else {
         return std::make_shared<csr::classical>();
     }
@@ -248,129 +210,122 @@ void check_ell_admissibility(const gko::matrix_data<etype, itype>& data)
 }
 
 
-/**
- * Creates a Ginkgo matrix from the intermediate data representation format
- * gko::matrix_data with support for variable arguments.
- *
- * @param MATRIX_TYPE  the Ginkgo matrix type (such as `gko::matrix::Csr<>`)
- */
-#define READ_MATRIX(MATRIX_TYPE, ...)                                 \
-    [](std::shared_ptr<const gko::Executor> exec,                     \
-       const gko::matrix_data<etype, itype>& data)                    \
-        -> std::unique_ptr<MATRIX_TYPE> {                             \
-        auto mat = MATRIX_TYPE::create(std::move(exec), __VA_ARGS__); \
-        mat->read(data);                                              \
-        return mat;                                                   \
-    }
+template <typename MatrixType, typename... Args>
+auto create_matrix_type(Args&&... args)
+{
+    return [&](std::shared_ptr<const gko::Executor> exec)
+               -> std::unique_ptr<MatrixType> {
+        return MatrixType::create(std::move(exec), std::forward<Args>(args)...);
+    };
+}
+
+
+template <typename MatrixType, typename Strategy>
+auto create_matrix_type_with_gpu_strategy()
+{
+    return [&](std::shared_ptr<const gko::Executor> exec)
+               -> std::unique_ptr<MatrixType> {
+        return MatrixType::create(exec, create_gpu_strategy<Strategy>(exec));
+    };
+}
 
 
 // clang-format off
 const std::map<std::string, std::function<std::unique_ptr<gko::LinOp>(
-                                std::shared_ptr<const gko::Executor>,
-                                const gko::matrix_data<etype, itype> &)>>
-    matrix_factory{
-        {"csr",
-         [](std::shared_ptr<const gko::Executor> exec,
-            const gko::matrix_data<etype, itype> &data) -> std::unique_ptr<csr> {
-            auto mat =
-                csr::create(exec, create_gpu_strategy<csr::automatical>(exec));
-            mat->read(data);
-            return mat;
-         }},
-        {"csri",
-         [](std::shared_ptr<const gko::Executor> exec,
-            const gko::matrix_data<etype, itype> &data) -> std::unique_ptr<csr> {
-             auto mat = csr::create(
-                 exec, create_gpu_strategy<csr::load_balance>(exec));
-             mat->read(data);
-             return mat;
-         }},
-        {"csrm", READ_MATRIX(csr, std::make_shared<csr::merge_path>())},
-        {"csrc", READ_MATRIX(csr, std::make_shared<csr::classical>())},
-        {"csrs", READ_MATRIX(csr, std::make_shared<csr::sparselib>())},
-        {"coo", read_matrix_from_data<gko::matrix::Coo<etype, itype>>},
-        {"ell", [](std::shared_ptr<const gko::Executor> exec,
-            const gko::matrix_data<etype, itype> &data) {
-             check_ell_admissibility(data);
-             auto mat = gko::matrix::Ell<etype, itype>::create(exec);
-             mat->read(data);
-             return mat;
-         }},
-        {"ell-mixed",
-         [](std::shared_ptr<const gko::Executor> exec,
-            const gko::matrix_data<etype, itype> &data) {
-             check_ell_admissibility(data);
-             gko::matrix_data<gko::next_precision<etype>, itype> conv_data;
-             conv_data.size = data.size;
-             conv_data.nonzeros.resize(data.nonzeros.size());
-             auto it = conv_data.nonzeros.begin();
-             for (auto &el : data.nonzeros) {
-                 it->row = el.row;
-                 it->column = el.column;
-                 it->value = el.value;
-                 ++it;
-             }
-             auto mat = gko::matrix::Ell<gko::next_precision<etype>, itype>::create(
-                 std::move(exec));
-             mat->read(conv_data);
-             return mat;
-         }},
+                                std::shared_ptr<const gko::Executor>)>>
+    matrix_type_factory{
+        {"csr", create_matrix_type_with_gpu_strategy<csr, csr::automatical>()},
+        {"csri", create_matrix_type_with_gpu_strategy<csr, csr::load_balance>()},
+        {"csrm", create_matrix_type<csr>(std::make_shared<csr::merge_path>())},
+        {"csrc", create_matrix_type<csr>(std::make_shared<csr::classical>())},
+        {"csrs", create_matrix_type<csr>(std::make_shared<csr::sparselib>())},
+        {"coo", create_matrix_type<coo>()},
+        {"ell", create_matrix_type<ell>()},
+        {"ell_mixed", create_matrix_type<ell_mixed>()},
 #ifdef HAS_CUDA
-        {"cusparse_csr", read_splib_matrix_from_data<cusparse_csr>},
-        {"cusparse_csrmp", read_splib_matrix_from_data<cusparse_csrmp>},
-        {"cusparse_csrmm", read_splib_matrix_from_data<cusparse_csrmm>},
-        {"cusparse_hybrid", read_splib_matrix_from_data<cusparse_hybrid>},
-        {"cusparse_coo", read_splib_matrix_from_data<cusparse_coo>},
-        {"cusparse_ell", read_splib_matrix_from_data<cusparse_ell>},
-        {"cusparse_csr", read_splib_matrix_from_data<cusparse_gcsr>},
-        {"cusparse_coo", read_splib_matrix_from_data<cusparse_gcoo>},
-        {"cusparse_csrex", read_splib_matrix_from_data<cusparse_csrex>},
-        {"cusparse_gcsr", read_splib_matrix_from_data<cusparse_gcsr>},
-        {"cusparse_gcsr2", read_splib_matrix_from_data<cusparse_gcsr2>},
-        {"cusparse_gcoo", read_splib_matrix_from_data<cusparse_gcoo>},
+        {"cusparse_csr", create_sparselib_linop<cusparse_csr>},
+        {"cusparse_csrmp", create_sparselib_linop<cusparse_csrmp>},
+        {"cusparse_csrmm", create_sparselib_linop<cusparse_csrmm>},
+        {"cusparse_hybrid", create_sparselib_linop<cusparse_hybrid>},
+        {"cusparse_coo", create_sparselib_linop<cusparse_coo>},
+        {"cusparse_ell", create_sparselib_linop<cusparse_ell>},
+        {"cusparse_csr", create_sparselib_linop<cusparse_gcsr>},
+        {"cusparse_coo", create_sparselib_linop<cusparse_gcoo>},
+        {"cusparse_csrex", create_sparselib_linop<cusparse_csrex>},
+        {"cusparse_gcsr", create_sparselib_linop<cusparse_gcsr>},
+        {"cusparse_gcsr2", create_sparselib_linop<cusparse_gcsr2>},
+        {"cusparse_gcoo", create_sparselib_linop<cusparse_gcoo>},
 #endif  // HAS_CUDA
 #ifdef HAS_HIP
-        {"hipsparse_csr", read_splib_matrix_from_data<hipsparse_csr>},
-        {"hipsparse_csrmm", read_splib_matrix_from_data<hipsparse_csrmm>},
-        {"hipsparse_hybrid", read_splib_matrix_from_data<hipsparse_hybrid>},
-        {"hipsparse_coo", read_splib_matrix_from_data<hipsparse_coo>},
-        {"hipsparse_ell", read_splib_matrix_from_data<hipsparse_ell>},
+        {"hipsparse_csr", create_sparselib_linop<hipsparse_csr>},
+        {"hipsparse_csrmm", create_sparselib_linop<hipsparse_csrmm>},
+        {"hipsparse_hybrid", create_sparselib_linop<hipsparse_hybrid>},
+        {"hipsparse_coo", create_sparselib_linop<hipsparse_coo>},
+        {"hipsparse_ell", create_sparselib_linop<hipsparse_ell>},
 #endif  // HAS_HIP
 #ifdef HAS_DPCPP
-        {"onemkl_csr", read_splib_matrix_from_data<onemkl_csr>},
-        {"onemkl_optimized_csr", read_splib_matrix_from_data<onemkl_optimized_csr>},
+        {"onemkl_csr", create_sparselib_linop<onemkl_csr>},
+        {"onemkl_optimized_csr", create_sparselib_linop<onemkl_optimized_csr>},
 #endif  // HAS_DPCPP
-        {"hybrid", read_matrix_from_data<hybrid>},
-        {"hybrid0",
-         READ_MATRIX(hybrid, std::make_shared<hybrid::imbalance_limit>(0))},
-        {"hybrid25",
-         READ_MATRIX(hybrid, std::make_shared<hybrid::imbalance_limit>(0.25))},
+        {"hybrid", create_matrix_type<hybrid>()},
+        {"hybrid0",create_matrix_type<hybrid>( std::make_shared<hybrid::imbalance_limit>(0))},
+        {"hybrid25",create_matrix_type<hybrid>( std::make_shared<hybrid::imbalance_limit>(0.25))},
         {"hybrid33",
-         READ_MATRIX(hybrid,
+         create_matrix_type<hybrid>(
                      std::make_shared<hybrid::imbalance_limit>(1.0 / 3.0))},
         {"hybrid40",
-         READ_MATRIX(hybrid, std::make_shared<hybrid::imbalance_limit>(0.4))},
+         create_matrix_type<hybrid>( std::make_shared<hybrid::imbalance_limit>(0.4))},
         {"hybrid60",
-         READ_MATRIX(hybrid, std::make_shared<hybrid::imbalance_limit>(0.6))},
+         create_matrix_type<hybrid>( std::make_shared<hybrid::imbalance_limit>(0.6))},
         {"hybrid80",
-         READ_MATRIX(hybrid, std::make_shared<hybrid::imbalance_limit>(0.8))},
+         create_matrix_type<hybrid>( std::make_shared<hybrid::imbalance_limit>(0.8))},
         {"hybridlimit0",
-         READ_MATRIX(hybrid,
+         create_matrix_type<hybrid>(
                      std::make_shared<hybrid::imbalance_bounded_limit>(0))},
         {"hybridlimit25",
-         READ_MATRIX(hybrid,
+         create_matrix_type<hybrid>(
                      std::make_shared<hybrid::imbalance_bounded_limit>(0.25))},
         {"hybridlimit33",
-         READ_MATRIX(hybrid, std::make_shared<hybrid::imbalance_bounded_limit>(
+         create_matrix_type<hybrid>( std::make_shared<hybrid::imbalance_bounded_limit>(
                                  1.0 / 3.0))},
         {"hybridminstorage",
-         READ_MATRIX(hybrid,
+         create_matrix_type<hybrid>(
                      std::make_shared<hybrid::minimal_storage_limit>())},
-        {"sellp", read_matrix_from_data<gko::matrix::Sellp<etype, itype>>}
+        {"sellp", create_matrix_type<gko::matrix::Sellp<etype, itype>>()}
 };
 // clang-format on
 
 
+std::unique_ptr<gko::LinOp> matrix_factory(
+    const std::string& format, std::shared_ptr<const gko::Executor> exec,
+    const gko::matrix_data<etype, itype>& data)
+{
+    auto mat = matrix_type_factory.at(format)(exec);
+    if (format == "ell" || format == "ell_mixed") {
+        check_ell_admissibility(data);
+    }
+    if (format == "ell_mixed") {
+        gko::matrix_data<gko::next_precision<etype>, itype> conv_data;
+        conv_data.size = data.size;
+        conv_data.nonzeros.resize(data.nonzeros.size());
+        auto it = conv_data.nonzeros.begin();
+        for (auto& el : data.nonzeros) {
+            it->row = el.row;
+            it->column = el.column;
+            it->value = el.value;
+            ++it;
+        }
+        gko::as<gko::ReadableFromMatrixData<gko::next_precision<etype>, itype>>(
+            mat.get())
+            ->read(conv_data);
+    } else {
+        gko::as<gko::ReadableFromMatrixData<etype, itype>>(mat.get())->read(
+            data);
+    }
+    return mat;
+}
+
+
 }  // namespace formats
 
 #endif  // GKO_BENCHMARK_UTILS_FORMATS_HPP_
diff --git a/benchmark/utils/general.hpp b/benchmark/utils/general.hpp
index 5de8a097e4e..8ad2fdc09c8 100644
--- a/benchmark/utils/general.hpp
+++ b/benchmark/utils/general.hpp
@@ -60,6 +60,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #include "benchmark/utils/timer.hpp"
 #include "benchmark/utils/types.hpp"
+#include "core/distributed/helpers.hpp"
 
 
 // Global command-line arguments
@@ -137,12 +138,12 @@ void initialize_argument_parsing(int* argc, char** argv[], std::string& header,
            "format,\n"
         << "  but with test cases extended to include an additional member "
            "\n"
-        << "  object for each solver run in the benchmark.\n"
+        << "  object for each benchmark run.\n"
         << "  If run with a --backup flag, an intermediate result is "
            "written \n"
         << "  to a file in the same format. The backup file can be used as "
            "\n"
-        << "  input \n to this test suite, and the benchmarking will \n"
+        << "  input to this test suite, and the benchmarking will \n"
         << "  continue from the point where the backup file was created.";
 
     gflags::SetUsageMessage(doc.str());
@@ -178,23 +179,6 @@ void print_general_information(const std::string& extra)
 }
 
 
-/**
- * Creates a Ginkgo matrix from an input file.
- *
- * @param exec  the executor where the matrix will be put
- * @param options  should contain a `filename` option with the input file string
- *
- * @tparam MatrixType  the Ginkgo matrix type (such as `gko::matrix::Csr<>`)
- */
-template <typename MatrixType>
-std::unique_ptr<gko::LinOp> read_matrix(
-    std::shared_ptr<const gko::Executor> exec, const rapidjson::Value& options)
-{
-    return gko::read<MatrixType>(std::ifstream(options["filename"].GetString()),
-                                 std::move(exec));
-}
-
-
 // Returns a random number engine
 std::default_random_engine& get_engine()
 {
@@ -315,6 +299,50 @@ const std::map<std::string, std::function<std::shared_ptr<gko::Executor>(bool)>>
          }}};
 
 
+#if GINKGO_BUILD_MPI
+
+
+const std::map<std::string,
+               std::function<std::shared_ptr<gko::Executor>(MPI_Comm)>>
+    executor_factory_mpi{
+        {"reference",
+         [](MPI_Comm) { return gko::ReferenceExecutor::create(); }},
+        {"omp", [](MPI_Comm) { return gko::OmpExecutor::create(); }},
+        {"cuda",
+         [](MPI_Comm comm) {
+             FLAGS_device_id = gko::experimental::mpi::map_rank_to_device_id(
+                 comm, gko::CudaExecutor::get_num_devices());
+             return gko::CudaExecutor::create(
+                 FLAGS_device_id, gko::ReferenceExecutor::create(), false,
+                 gko::allocation_mode::device);
+         }},
+        {"hip",
+         [](MPI_Comm comm) {
+             FLAGS_device_id = gko::experimental::mpi::map_rank_to_device_id(
+                 comm, gko::HipExecutor::get_num_devices());
+             return gko::HipExecutor::create(
+                 FLAGS_device_id, gko::ReferenceExecutor::create(), true);
+         }},
+        {"dpcpp", [](MPI_Comm comm) {
+             if (gko::DpcppExecutor::get_num_devices("gpu")) {
+                 FLAGS_device_id =
+                     gko::experimental::mpi::map_rank_to_device_id(
+                         comm, gko::DpcppExecutor::get_num_devices("gpu"));
+             } else if (gko::DpcppExecutor::get_num_devices("cpu")) {
+                 FLAGS_device_id =
+                     gko::experimental::mpi::map_rank_to_device_id(
+                         comm, gko::DpcppExecutor::get_num_devices("cpu"));
+             } else {
+                 GKO_NOT_IMPLEMENTED;
+             }
+             return gko::DpcppExecutor::create(
+                 FLAGS_device_id, gko::ReferenceExecutor::create());
+         }}};
+
+
+#endif
+
+
 // returns the appropriate executor, as set by the executor flag
 std::shared_ptr<gko::Executor> get_executor(bool use_gpu_timer)
 {
@@ -365,55 +393,6 @@ create_matrix_sin(std::shared_ptr<const gko::Executor> exec, gko::dim<2> size)
     return res;
 }
 
-
-template <typename ValueType>
-std::unique_ptr<vec<ValueType>> create_matrix(
-    std::shared_ptr<const gko::Executor> exec, gko::dim<2> size,
-    ValueType value)
-{
-    auto res = vec<ValueType>::create(exec);
-    res->read(gko::matrix_data<ValueType, itype>(size, value));
-    return res;
-}
-
-
-// creates a random matrix
-template <typename ValueType, typename RandomEngine>
-std::unique_ptr<vec<ValueType>> create_matrix(
-    std::shared_ptr<const gko::Executor> exec, gko::dim<2> size,
-    RandomEngine& engine)
-{
-    auto res = vec<ValueType>::create(exec);
-    res->read(gko::matrix_data<ValueType, itype>(
-        size,
-        std::uniform_real_distribution<gko::remove_complex<ValueType>>(-1.0,
-                                                                       1.0),
-        engine));
-    return res;
-}
-
-
-// creates a zero vector
-template <typename ValueType>
-std::unique_ptr<vec<ValueType>> create_vector(
-    std::shared_ptr<const gko::Executor> exec, gko::size_type size)
-{
-    auto res = vec<ValueType>::create(exec);
-    res->read(gko::matrix_data<ValueType, itype>(gko::dim<2>{size, 1}));
-    return res;
-}
-
-
-// creates a random vector
-template <typename ValueType, typename RandomEngine>
-std::unique_ptr<vec<ValueType>> create_vector(
-    std::shared_ptr<const gko::Executor> exec, gko::size_type size,
-    RandomEngine& engine)
-{
-    return create_matrix<ValueType>(exec, gko::dim<2>{size, 1}, engine);
-}
-
-
 // utilities for computing norms and residuals
 template <typename ValueType>
 ValueType get_norm(const vec<ValueType>* norm)
@@ -422,8 +401,9 @@ ValueType get_norm(const vec<ValueType>* norm)
 }
 
 
-template <typename ValueType>
-gko::remove_complex<ValueType> compute_norm2(const vec<ValueType>* b)
+template <typename VectorType,
+          typename ValueType = typename VectorType::value_type>
+gko::remove_complex<ValueType> compute_norm2(const VectorType* b)
 {
     auto exec = b->get_executor();
     auto b_norm =
@@ -433,10 +413,11 @@ gko::remove_complex<ValueType> compute_norm2(const vec<ValueType>* b)
 }
 
 
-template <typename ValueType>
+template <typename VectorType,
+          typename ValueType = typename VectorType::value_type>
 gko::remove_complex<ValueType> compute_direct_error(const gko::LinOp* solver,
-                                                    const vec<ValueType>* b,
-                                                    const vec<ValueType>* x)
+                                                    const VectorType* b,
+                                                    const VectorType* x)
 {
     auto ref_exec = gko::ReferenceExecutor::create();
     auto exec = solver->get_executor();
@@ -449,10 +430,10 @@ gko::remove_complex<ValueType> compute_direct_error(const gko::LinOp* solver,
 }
 
 
-template <typename ValueType>
+template <typename VectorType,
+          typename ValueType = typename VectorType::value_type>
 gko::remove_complex<ValueType> compute_residual_norm(
-    const gko::LinOp* system_matrix, const vec<ValueType>* b,
-    const vec<ValueType>* x)
+    const gko::LinOp* system_matrix, const VectorType* b, const VectorType* x)
 {
     auto exec = system_matrix->get_executor();
     auto one = gko::initialize<vec<ValueType>>({1.0}, exec);
@@ -463,9 +444,10 @@ gko::remove_complex<ValueType> compute_residual_norm(
 }
 
 
-template <typename ValueType>
+template <typename VectorType,
+          typename ValueType = typename VectorType::value_type>
 gko::remove_complex<ValueType> compute_max_relative_norm2(
-    vec<ValueType>* result, const vec<ValueType>* answer)
+    VectorType* result, const VectorType* answer)
 {
     using rc_vtype = gko::remove_complex<ValueType>;
     auto exec = answer->get_executor();
@@ -483,9 +465,10 @@ gko::remove_complex<ValueType> compute_max_relative_norm2(
         clone(absolute_norm->get_executor()->get_master(), absolute_norm);
     rc_vtype max_relative_norm2 = 0;
     for (gko::size_type i = 0; i < host_answer_norm->get_size()[1]; i++) {
-        max_relative_norm2 =
-            std::max(host_absolute_norm->at(0, i) / host_answer_norm->at(0, i),
-                     max_relative_norm2);
+        max_relative_norm2 = std::max(
+            gko::detail::get_local(host_absolute_norm.get())->at(0, i) /
+                gko::detail::get_local(host_answer_norm.get())->at(0, i),
+            max_relative_norm2);
     }
     return max_relative_norm2;
 }
diff --git a/benchmark/utils/generator.hpp b/benchmark/utils/generator.hpp
new file mode 100644
index 00000000000..beb146ea853
--- /dev/null
+++ b/benchmark/utils/generator.hpp
@@ -0,0 +1,299 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#ifndef GINKGO_BENCHMARK_UTILS_GENERATOR_HPP
+#define GINKGO_BENCHMARK_UTILS_GENERATOR_HPP
+
+
+#include <string>
+
+
+#include "benchmark/utils/formats.hpp"
+#include "benchmark/utils/general.hpp"
+#include "benchmark/utils/loggers.hpp"
+#include "benchmark/utils/stencil_matrix.hpp"
+#if GINKGO_BUILD_MPI
+#include "benchmark/utils/distributed_helpers.hpp"
+#endif
+
+
+template <typename ValueType = etype, typename IndexType = itype>
+struct DefaultSystemGenerator {
+    using index_type = IndexType;
+    using value_type = ValueType;
+    using Vec = vec<ValueType>;
+
+    static gko::matrix_data<ValueType, IndexType> generate_matrix_data(
+        rapidjson::Value& config)
+    {
+        if (config.HasMember("filename")) {
+            std::ifstream in(config["filename"].GetString());
+            return gko::read_generic_raw<ValueType, IndexType>(in);
+        } else if (config.HasMember("stencil")) {
+            return generate_stencil<ValueType, IndexType>(
+                config["stencil"].GetString(), config["size"].GetInt64());
+        } else {
+            throw std::runtime_error(
+                "No known way to generate matrix data found.");
+        }
+    }
+
+    static std::shared_ptr<gko::LinOp> generate_matrix_with_optimal_format(
+        std::shared_ptr<gko::Executor> exec, rapidjson::Value& config)
+    {
+        auto data = generate_matrix_data(config);
+        return generate_matrix_with_format(
+            std::move(exec), config["optimal"]["spmv"].GetString(), data);
+    }
+
+    static std::shared_ptr<gko::LinOp> generate_matrix_with_format(
+        std::shared_ptr<gko::Executor> exec, const std::string& format_name,
+        const gko::matrix_data<ValueType, itype>& data,
+        rapidjson::Value* spmv_case = nullptr,
+        rapidjson::MemoryPoolAllocator<>* allocator = nullptr)
+    {
+        auto storage_logger = std::make_shared<StorageLogger>();
+        if (spmv_case && allocator) {
+            exec->add_logger(storage_logger);
+        }
+
+        auto mtx =
+            gko::share(::formats::matrix_factory(format_name, exec, data));
+
+        if (spmv_case && allocator) {
+            exec->remove_logger(gko::lend(storage_logger));
+            storage_logger->write_data(*spmv_case, *allocator);
+        }
+
+        return mtx;
+    }
+
+    static std::shared_ptr<gko::LinOp> generate_matrix_with_default_format(
+        std::shared_ptr<gko::Executor> exec,
+        const gko::matrix_data<ValueType, itype>& data)
+    {
+        return generate_matrix_with_format(std::move(exec), "coo", data);
+    }
+
+    static std::unique_ptr<Vec> create_multi_vector(
+        std::shared_ptr<const gko::Executor> exec, gko::dim<2> size,
+        ValueType value)
+    {
+        auto res = Vec::create(exec);
+        res->read(gko::matrix_data<ValueType, itype>(size, value));
+        return res;
+    }
+
+    static std::unique_ptr<Vec> create_multi_vector_strided(
+        std::shared_ptr<const gko::Executor> exec, gko::dim<2> size,
+        gko::size_type stride)
+    {
+        auto res = Vec::create(exec, size, stride);
+        return res;
+    }
+
+    // creates a random multi_vector
+    static std::unique_ptr<Vec> create_multi_vector_random(
+        std::shared_ptr<const gko::Executor> exec, gko::dim<2> size)
+    {
+        auto res = Vec::create(exec);
+        res->read(gko::matrix_data<ValueType, itype>(
+            size,
+            std::uniform_real_distribution<gko::remove_complex<ValueType>>(-1.0,
+                                                                           1.0),
+            get_engine()));
+        return res;
+    }
+
+    static std::unique_ptr<Vec> initialize(
+        std::initializer_list<ValueType> il,
+        std::shared_ptr<const gko::Executor> exec)
+    {
+        return gko::initialize<Vec>(std::move(il), std::move(exec));
+    }
+};
+
+
+#if GINKGO_BUILD_MPI
+
+
+template <typename LocalGeneratorType>
+struct DistributedDefaultSystemGenerator {
+    using LocalGenerator = LocalGeneratorType;
+
+    using index_type = global_itype;
+    using local_index_type = typename LocalGenerator::index_type;
+    using value_type = typename LocalGenerator::value_type;
+
+    using Mtx = dist_mtx<value_type, local_index_type, index_type>;
+    using Vec = dist_vec<value_type>;
+
+    gko::matrix_data<value_type, index_type> generate_matrix_data(
+        rapidjson::Value& config) const
+    {
+        if (config.HasMember("filename")) {
+            std::ifstream in(config["filename"].GetString());
+            return gko::read_generic_raw<value_type, index_type>(in);
+        } else if (config.HasMember("stencil")) {
+            auto local_size = static_cast<global_itype>(
+                config["size"].GetInt64() / comm.size());
+            return generate_stencil<value_type, index_type>(
+                config["stencil"].GetString(), comm, local_size,
+                config["comm_pattern"].GetString() == std::string("optimal"));
+        } else {
+            throw std::runtime_error(
+                "No known way to generate matrix data found.");
+        }
+    }
+
+    std::shared_ptr<gko::LinOp> generate_matrix_with_optimal_format(
+        std::shared_ptr<gko::Executor> exec, rapidjson::Value& config) const
+    {
+        auto data = generate_matrix_data(config);
+        return generate_matrix_with_format(
+            std::move(exec), config["optimal"]["spmv"].GetString(), data);
+    }
+
+    std::shared_ptr<gko::LinOp> generate_matrix_with_format(
+        std::shared_ptr<gko::Executor> exec, const std::string& format_name,
+        const gko::matrix_data<value_type, index_type>& data,
+        rapidjson::Value* spmv_case = nullptr,
+        rapidjson::MemoryPoolAllocator<>* allocator = nullptr) const
+    {
+        auto part = gko::experimental::distributed::
+            Partition<itype, global_itype>::build_from_global_size_uniform(
+                exec, comm.size(), static_cast<global_itype>(data.size[0]));
+        auto formats = split(format_name, '-');
+
+        auto local_mat = formats::matrix_type_factory.at(formats[0])(exec);
+        auto non_local_mat = formats::matrix_type_factory.at(formats[1])(exec);
+
+        auto storage_logger = std::make_shared<StorageLogger>();
+        if (spmv_case && allocator) {
+            exec->add_logger(storage_logger);
+        }
+
+        auto dist_mat = dist_mtx<etype, itype, global_itype>::create(
+            exec, comm, local_mat.get(), non_local_mat.get());
+        dist_mat->read_distributed(data, part.get());
+
+        if (spmv_case && allocator) {
+            exec->remove_logger(gko::lend(storage_logger));
+            storage_logger->write_data(comm, *spmv_case, *allocator);
+        }
+
+        return dist_mat;
+    }
+
+    std::shared_ptr<gko::LinOp> generate_matrix_with_default_format(
+        std::shared_ptr<gko::Executor> exec,
+        const gko::matrix_data<value_type, global_itype>& data) const
+    {
+        return generate_matrix_with_format(std::move(exec), "coo-coo", data);
+    }
+
+    std::unique_ptr<Vec> create_multi_vector(
+        std::shared_ptr<const gko::Executor> exec, gko::dim<2> size,
+        value_type value) const
+    {
+        auto part = gko::experimental::distributed::
+            Partition<itype, global_itype>::build_from_global_size_uniform(
+                exec, comm.size(), static_cast<global_itype>(size[0]));
+        return Vec::create(
+            exec, comm, size,
+            local_generator
+                .create_multi_vector(
+                    exec,
+                    gko::dim<2>{static_cast<gko::size_type>(
+                                    part->get_part_size(comm.rank())),
+                                size[1]},
+                    value)
+                .get());
+    }
+
+    std::unique_ptr<Vec> create_multi_vector_strided(
+        std::shared_ptr<const gko::Executor> exec, gko::dim<2> size,
+        gko::size_type stride) const
+    {
+        auto part = gko::experimental::distributed::
+            Partition<itype, global_itype>::build_from_global_size_uniform(
+                exec, comm.size(), static_cast<global_itype>(size[0]));
+        return Vec::create(
+            exec, comm, size,
+            local_generator
+                .create_multi_vector_strided(
+                    exec,
+                    gko::dim<2>{static_cast<gko::size_type>(
+                                    part->get_part_size(comm.rank())),
+                                size[1]},
+                    stride)
+                .get());
+    }
+
+    // creates a random multi_vector
+    std::unique_ptr<Vec> create_multi_vector_random(
+        std::shared_ptr<const gko::Executor> exec, gko::dim<2> size) const
+    {
+        auto part = gko::experimental::distributed::
+            Partition<itype, global_itype>::build_from_global_size_uniform(
+                exec, comm.size(), static_cast<global_itype>(size[0]));
+        return Vec::create(
+            exec, comm, size,
+            local_generator
+                .create_multi_vector_random(
+                    exec, gko::dim<2>{static_cast<gko::size_type>(
+                                          part->get_part_size(comm.rank())),
+                                      size[1]})
+                .get());
+    }
+
+    std::unique_ptr<Vec> initialize(
+        std::initializer_list<value_type> il,
+        std::shared_ptr<const gko::Executor> exec) const
+    {
+        auto local = gko::initialize<typename LocalGenerator::Vec>(
+            std::move(il), std::move(exec));
+        auto global_rows = local->get_size()[0];
+        comm.all_reduce(gko::ReferenceExecutor::create(), &global_rows, 1,
+                        MPI_SUM);
+        return Vec::create(exec, comm,
+                           gko::dim<2>{global_rows, local->get_size()[1]},
+                           local.get());
+    }
+
+    gko::experimental::mpi::communicator comm;
+    LocalGenerator local_generator{};
+};
+
+
+#endif
+#endif  // GINKGO_BENCHMARK_UTILS_GENERATOR_HPP
diff --git a/benchmark/utils/loggers.hpp b/benchmark/utils/loggers.hpp
index 4b9a2ae47a5..0d726cc1662 100644
--- a/benchmark/utils/loggers.hpp
+++ b/benchmark/utils/loggers.hpp
@@ -45,6 +45,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 
 #include "benchmark/utils/general.hpp"
+#include "core/distributed/helpers.hpp"
 
 
 // A logger that accumulates the time of all operations
@@ -186,6 +187,25 @@ struct StorageLogger : gko::log::Logger {
         add_or_set_member(output, "storage", total, allocator);
     }
 
+#if GINKGO_BUILD_MPI
+    void write_data(gko::experimental::mpi::communicator comm,
+                    rapidjson::Value& output,
+                    rapidjson::MemoryPoolAllocator<>& allocator)
+    {
+        const std::lock_guard<std::mutex> lock(mutex);
+        gko::size_type total{};
+        for (const auto& e : storage) {
+            total += e.second;
+        }
+        comm.reduce(gko::ReferenceExecutor::create(),
+                    comm.rank() == 0
+                        ? static_cast<gko::size_type*>(MPI_IN_PLACE)
+                        : &total,
+                    &total, 1, MPI_SUM, 0);
+        add_or_set_member(output, "storage", total, allocator);
+    }
+#endif
+
 private:
     mutable std::mutex mutex;
     mutable std::unordered_map<gko::uintptr, gko::size_type> storage;
@@ -221,14 +241,21 @@ struct ResidualLogger : gko::log::Logger {
             rec_res_norms.PushBack(
                 get_norm(gko::as<vec<rc_vtype>>(residual_norm)), alloc);
         } else {
-            rec_res_norms.PushBack(
-                compute_norm2(gko::as<vec<ValueType>>(residual)), alloc);
+            gko::detail::vector_dispatch<rc_vtype>(
+                residual, [&](const auto v_residual) {
+                    rec_res_norms.PushBack(compute_norm2(v_residual), alloc);
+                });
         }
         if (solution) {
-            true_res_norms.PushBack(
-                compute_residual_norm(matrix, b,
-                                      gko::as<vec<ValueType>>(solution)),
-                alloc);
+            gko::detail::vector_dispatch<
+                rc_vtype>(solution, [&](auto v_solution) {
+                using concrete_type =
+                    std::remove_pointer_t<std::decay_t<decltype(v_solution)>>;
+                true_res_norms.PushBack(
+                    compute_residual_norm(matrix, gko::as<concrete_type>(b),
+                                          v_solution),
+                    alloc);
+            });
         } else {
             true_res_norms.PushBack(-1.0, alloc);
         }
@@ -243,7 +270,7 @@ struct ResidualLogger : gko::log::Logger {
         }
     }
 
-    ResidualLogger(const gko::LinOp* matrix, const vec<ValueType>* b,
+    ResidualLogger(const gko::LinOp* matrix, const gko::LinOp* b,
                    rapidjson::Value& rec_res_norms,
                    rapidjson::Value& true_res_norms,
                    rapidjson::Value& implicit_res_norms,
@@ -265,7 +292,7 @@ struct ResidualLogger : gko::log::Logger {
 
 private:
     const gko::LinOp* matrix;
-    const vec<ValueType>* b;
+    const gko::LinOp* b;
     std::chrono::steady_clock::time_point start;
     rapidjson::Value& rec_res_norms;
     rapidjson::Value& true_res_norms;
diff --git a/benchmark/utils/spmv_common.hpp b/benchmark/utils/spmv_validation.hpp
similarity index 79%
rename from benchmark/utils/spmv_common.hpp
rename to benchmark/utils/spmv_validation.hpp
index bfb57c319c8..83ea2085ec2 100644
--- a/benchmark/utils/spmv_common.hpp
+++ b/benchmark/utils/spmv_validation.hpp
@@ -30,8 +30,8 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 ******************************<GINKGO LICENSE>*******************************/
 
-#ifndef GKO_BENCHMARK_UTILS_SPMV_COMMON_HPP_
-#define GKO_BENCHMARK_UTILS_SPMV_COMMON_HPP_
+#ifndef GKO_BENCHMARK_UTILS_SPMV_VALIDATION_HPP_
+#define GKO_BENCHMARK_UTILS_SPMV_VALIDATION_HPP_
 
 
 #include <ginkgo/ginkgo.hpp>
@@ -44,16 +44,22 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <rapidjson/document.h>
 
 
+std::string example_config = R"(
+  [
+    {"filename": "my_file.mtx"},
+    {"filename": "my_file2.mtx"},
+    {"size": 100, "stencil": "7pt"},
+  ]
+)";
+
+
 /**
  * Function which outputs the input format for benchmarks similar to the spmv.
  */
 [[noreturn]] void print_config_error_and_exit()
 {
     std::cerr << "Input has to be a JSON array of matrix configurations:\n"
-              << "  [\n"
-              << "    { \"filename\": \"my_file.mtx\" },\n"
-              << "    { \"filename\": \"my_file2.mtx\" }\n"
-              << "  ]" << std::endl;
+              << example_config << std::endl;
     std::exit(1);
 }
 
@@ -65,11 +71,13 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 void validate_option_object(const rapidjson::Value& value)
 {
-    if (!value.IsObject() || !value.HasMember("filename") ||
-        !value["filename"].IsString()) {
+    if (!value.IsObject() ||
+        !((value.HasMember("size") && value.HasMember("stencil") &&
+           value["size"].IsInt64() && value["stencil"].IsString()) ||
+          (value.HasMember("filename") && value["filename"].IsString()))) {
         print_config_error_and_exit();
     }
 }
 
 
-#endif  // GKO_BENCHMARK_UTILS_SPMV_COMMON_HPP_
+#endif  // GKO_BENCHMARK_UTILS_SPMV_VALIDATION_HPP_
diff --git a/benchmark/utils/stencil_matrix.hpp b/benchmark/utils/stencil_matrix.hpp
new file mode 100644
index 00000000000..ca89aa86079
--- /dev/null
+++ b/benchmark/utils/stencil_matrix.hpp
@@ -0,0 +1,433 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#ifndef GINKGO_BENCHMARK_UTILS_STENCIL_MATRIX_HPP
+#define GINKGO_BENCHMARK_UTILS_STENCIL_MATRIX_HPP
+
+
+#include <ginkgo/core/base/matrix_data.hpp>
+#if GINKGO_BUILD_MPI
+#include <ginkgo/core/base/mpi.hpp>
+#endif
+
+
+template <typename T>
+double closest_nth_root(T v, int n)
+{
+    auto root = std::pow(v, 1. / static_cast<double>(n));
+    auto root_floor = std::floor(root);
+    auto root_ceil = std::ceil(root);
+    if (root - root_floor > root_ceil - root) {
+        return root_ceil;
+    } else {
+        return root_floor;
+    }
+}
+
+
+template <typename IndexType>
+bool is_in_box(const IndexType i, const IndexType bound)
+{
+    return 0 <= i && i < bound;
+}
+
+
+/**
+ * Generates matrix data for a 2D stencil matrix. If restricted is set to true,
+ * creates a 5-pt stencil, if it is false creates a 9-pt stencil.
+ *
+ * If `dim != [1 1]` then the matrix data is a subset of a larger matrix.
+ * The total matrix is a discretization of `[0, dims[0]] x [0, dims[1]]`, and
+ * each box is square. The position of the box defines the subset of the matrix.
+ * The degrees of freedom are ordered box-wise and the boxes themselves are
+ * ordered lexicographical. This means that the indices are with respect to the
+ * larger matrix, i.e. they might not start with 0.
+ *
+ * @param dims  The number of boxes in each dimension.
+ * @param positions  The position of this box with respect to each dimension.
+ * @param target_local_size  The desired size of the boxes. The actual size can
+ *                           deviate from this to accommodate the square size of
+ *                           the boxes.
+ * @param restricted  If true, a 5-pt stencil is used, else a 9-pt stencil.
+ *
+ * @return  matrix data of a box using either 5-pt or 9-pt stencil.
+ */
+template <typename ValueType, typename IndexType>
+gko::matrix_data<ValueType, IndexType> generate_2d_stencil_box(
+    std::array<int, 2> dims, std::array<int, 2> positions,
+    const gko::size_type target_local_size, bool restricted)
+{
+    auto num_boxes =
+        std::accumulate(dims.begin(), dims.end(), 1, std::multiplies<>{});
+
+    const auto discretization_points =
+        static_cast<IndexType>(closest_nth_root(target_local_size, 2));
+    const auto local_size = static_cast<gko::size_type>(discretization_points *
+                                                        discretization_points);
+    const auto global_size = local_size * num_boxes;
+    auto A_data = gko::matrix_data<ValueType, IndexType>(
+        gko::dim<2>{static_cast<gko::size_type>(global_size),
+                    static_cast<gko::size_type>(global_size)});
+
+    auto global_offset = [&](const int position_x, const int position_y) {
+        return static_cast<int>(local_size) * dims[0] * position_x +
+               static_cast<int>(local_size) * position_y;
+    };
+
+    auto target_position = [&](const IndexType i, const int position) {
+        return is_in_box(i, discretization_points)
+                   ? position
+                   : (i < 0 ? position - 1 : position + 1);
+    };
+
+    auto target_local_idx = [&](const IndexType i) {
+        return is_in_box(i, discretization_points)
+                   ? i
+                   : (i < 0 ? discretization_points + i
+                            : discretization_points - i);
+    };
+
+    auto flat_idx = [&](const IndexType ix, const IndexType iy) {
+        auto tpx = target_position(ix, positions[0]);
+        auto tpy = target_position(iy, positions[1]);
+        if (is_in_box(tpx, dims[0]) && is_in_box(tpy, dims[1])) {
+            return global_offset(tpx, tpy) +
+                   target_local_idx(ix) * discretization_points +
+                   target_local_idx(iy);
+        } else {
+            return static_cast<IndexType>(-1);
+        }
+    };
+
+    auto is_valid_neighbor = [&](const IndexType d_i, const IndexType d_j) {
+        return !restricted || d_i == 0 || d_j == 0;
+    };
+
+    auto nnz_in_row = [&]() {
+        int num_neighbors = 0;
+        for (IndexType d_i : {-1, 0, 1}) {
+            for (IndexType d_j : {-1, 0, 1}) {
+                if (is_valid_neighbor(d_i, d_j)) {
+                    num_neighbors++;
+                }
+            }
+        }
+        return num_neighbors;
+    };
+    const auto diag_value = static_cast<ValueType>(nnz_in_row() - 1);
+
+    for (IndexType i = 0; i < discretization_points; ++i) {
+        for (IndexType j = 0; j < discretization_points; ++j) {
+            auto row = flat_idx(i, j);
+            for (IndexType d_i : {-1, 0, 1}) {
+                for (IndexType d_j : {-1, 0, 1}) {
+                    if (is_valid_neighbor(d_i, d_j)) {
+                        auto col = flat_idx(i + d_i, j + d_j);
+                        if (is_in_box(col,
+                                      static_cast<IndexType>(global_size))) {
+                            if (col != row) {
+                                A_data.nonzeros.emplace_back(
+                                    row, col, -gko::one<ValueType>());
+                            } else {
+                                A_data.nonzeros.emplace_back(row, col,
+                                                             diag_value);
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+
+    return A_data;
+}
+
+
+/**
+ * Generates matrix data for a 3D stencil matrix. If restricted is set to true,
+ * creates a 7-pt stencil, if it is false creates a 27-pt stencil.
+ *
+ * If `dim != [1 1 1]` then the matrix data is a subset of a larger matrix.
+ * The total matrix is a discretization of `[0, dims[0]] x [0, dims[1]] x [0,
+ * dims[2]]`, and each box is a cube. The position of the box defines the subset
+ * of the matrix. The degrees of freedom are ordered box-wise and the boxes
+ * themselves are ordered lexicographical. This means that the indices are with
+ * respect to the larger matrix, i.e. they might not start with 0.
+ *
+ * @param dims  The number of boxes in each dimension.
+ * @param positions  The position of this box with respect to each dimension.
+ * @param target_local_size  The desired size of the boxes. The actual size can
+ *                           deviate from this to accommodate the uniform size
+ *                           of the boxes.
+ * @param restricted  If true, a 7-pt stencil is used, else a 27-pt stencil.
+ *
+ * @return  matrix data of a box using either 7-pt or 27-pt stencil.
+ */
+template <typename ValueType, typename IndexType>
+gko::matrix_data<ValueType, IndexType> generate_3d_stencil_box(
+    std::array<int, 3> dims, std::array<int, 3> positions,
+    const gko::size_type target_local_size, bool restricted)
+{
+    auto num_boxes =
+        std::accumulate(dims.begin(), dims.end(), 1, std::multiplies<>{});
+
+    const auto discretization_points =
+        static_cast<IndexType>(closest_nth_root(target_local_size, 3));
+    const auto local_size = static_cast<gko::size_type>(
+        discretization_points * discretization_points * discretization_points);
+    const auto global_size = local_size * num_boxes;
+    auto A_data = gko::matrix_data<ValueType, IndexType>(
+        gko::dim<2>{static_cast<gko::size_type>(global_size),
+                    static_cast<gko::size_type>(global_size)});
+
+    auto global_offset = [&](const int position_x, const int position_y,
+                             const int position_z) {
+        return position_x * static_cast<int>(local_size) * dims[0] * dims[1] +
+               position_y * static_cast<int>(local_size) * dims[0] +
+               position_z * static_cast<int>(local_size);
+    };
+
+    auto target_position = [&](const IndexType i, const int position) {
+        return is_in_box(i, discretization_points)
+                   ? position
+                   : (i < 0 ? position - 1 : position + 1);
+    };
+
+    auto target_local_idx = [&](const IndexType i) {
+        return is_in_box(i, discretization_points)
+                   ? i
+                   : (i < 0 ? discretization_points + i
+                            : discretization_points - i);
+    };
+
+    auto flat_idx = [&](const IndexType ix, const IndexType iy,
+                        const IndexType iz) {
+        auto tpx = target_position(ix, positions[0]);
+        auto tpy = target_position(iy, positions[1]);
+        auto tpz = target_position(iz, positions[2]);
+        if (is_in_box(tpx, dims[0]) && is_in_box(tpy, dims[1]) &&
+            is_in_box(tpz, dims[2])) {
+            return global_offset(tpx, tpy, tpz) +
+                   target_local_idx(ix) * discretization_points *
+                       discretization_points +
+                   target_local_idx(iy) * discretization_points +
+                   target_local_idx(iz);
+        } else {
+            return static_cast<IndexType>(-1);
+        }
+    };
+
+    auto is_valid_neighbor = [&](const IndexType d_i, const IndexType d_j,
+                                 const IndexType d_k) {
+        return !restricted || ((d_i == 0) + (d_j == 0) + (d_k == 0) >= 2);
+    };
+
+    auto nnz_in_row = [&]() {
+        int num_neighbors = 0;
+        for (IndexType d_i : {-1, 0, 1}) {
+            for (IndexType d_j : {-1, 0, 1}) {
+                for (IndexType d_k : {-1, 0, 1}) {
+                    if (is_valid_neighbor(d_i, d_j, d_k)) {
+                        num_neighbors++;
+                    }
+                }
+            }
+        }
+        return num_neighbors;
+    };
+    const auto diag_value = static_cast<ValueType>(nnz_in_row() - 1);
+
+    for (IndexType i = 0; i < discretization_points; ++i) {
+        for (IndexType j = 0; j < discretization_points; ++j) {
+            for (IndexType k = 0; k < discretization_points; ++k) {
+                auto row = flat_idx(i, j, k);
+                for (IndexType d_i : {-1, 0, 1}) {
+                    for (IndexType d_j : {-1, 0, 1}) {
+                        for (IndexType d_k : {-1, 0, 1}) {
+                            if (is_valid_neighbor(d_i, d_j, d_k)) {
+                                auto col = flat_idx(i + d_i, j + d_j, k + d_k);
+                                if (is_in_box(col, static_cast<IndexType>(
+                                                       global_size))) {
+                                    if (col != row) {
+                                        A_data.nonzeros.emplace_back(
+                                            row, col, -gko::one<ValueType>());
+                                    } else {
+                                        A_data.nonzeros.emplace_back(
+                                            row, col, diag_value);
+                                    }
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+
+    return A_data;
+}
+
+
+/**
+ * Generates matrix data for the requested stencil.
+ *
+ * @see generate_2d_stencil_box, generate_3d_stencil_box
+ *
+ * @param stencil_name  The name of the stencil.
+ * @param target_local_size  The desired size of the matrix. The actual size can
+ *                           deviate from this to accommodate the uniform size
+ *                           of the discretization.
+ * @return  matrix data using the requested stencil.
+ */
+template <typename ValueType, typename IndexType>
+gko::matrix_data<ValueType, IndexType> generate_stencil(
+    std::string stencil_name, const gko::size_type target_local_size)
+{
+    if (stencil_name == "5pt") {
+        return generate_2d_stencil_box<ValueType, IndexType>(
+            {1, 1}, {0, 0}, target_local_size, true);
+    } else if (stencil_name == "9pt") {
+        return generate_2d_stencil_box<ValueType, IndexType>(
+            {1, 1}, {0, 0}, target_local_size, false);
+    } else if (stencil_name == "7pt") {
+        return generate_3d_stencil_box<ValueType, IndexType>(
+            {1, 1, 1}, {0, 0, 0}, target_local_size, true);
+    } else if (stencil_name == "27pt") {
+        return generate_3d_stencil_box<ValueType, IndexType>(
+            {1, 1, 1}, {0, 0, 0}, target_local_size, false);
+    } else {
+        throw std::runtime_error("Stencil " + stencil_name +
+                                 " not implemented");
+    }
+}
+
+
+#if GINKGO_BUILD_MPI
+
+
+/**
+ * Generates matrix data for a given 2D stencil, where the position of this
+ * block is given by it's MPI rank.
+ *
+ * @see generate_2d_stencil_box
+ */
+template <typename ValueType, typename IndexType>
+gko::matrix_data<ValueType, IndexType> generate_2d_stencil(
+    gko::experimental::mpi::communicator comm,
+    const gko::size_type target_local_size, bool restricted, bool optimal_comm)
+{
+    if (optimal_comm) {
+        return generate_2d_stencil_box<ValueType, IndexType>(
+            {comm.size(), 1}, {comm.rank(), 0}, target_local_size, restricted);
+    } else {
+        std::array<int, 2> dims{};
+        MPI_Dims_create(comm.size(), dims.size(), dims.data());
+
+        std::array<int, 2> coords{};
+        coords[0] = comm.rank() % dims[0];
+        coords[1] = comm.rank() / dims[0];
+
+        return generate_2d_stencil_box<ValueType, IndexType>(
+            dims, coords, target_local_size, restricted);
+    }
+}
+
+
+/**
+ * Generates matrix data for a given 23 stencil, where the position of this
+ * block is given by it's MPI rank.
+ *
+ * @see generate_3d_stencil_box
+ */
+template <typename ValueType, typename IndexType>
+gko::matrix_data<ValueType, IndexType> generate_3d_stencil(
+    gko::experimental::mpi::communicator comm,
+    const gko::size_type target_local_size, bool restricted, bool optimal_comm)
+{
+    if (optimal_comm) {
+        return generate_3d_stencil_box<ValueType, IndexType>(
+            {comm.size(), 1, 1}, {comm.rank(), 0, 0}, target_local_size,
+            restricted);
+    } else {
+        std::array<int, 3> dims{};
+
+        MPI_Dims_create(comm.size(), dims.size(), dims.data());
+
+        std::array<int, 3> coords{};
+        coords[0] = comm.rank() % dims[0];
+        coords[1] = (comm.rank() / dims[0]) % dims[1];
+        coords[2] = comm.rank() / (dims[0] * dims[1]);
+
+        return generate_3d_stencil_box<ValueType, IndexType>(
+            dims, coords, target_local_size, restricted);
+    }
+}
+
+
+/**
+ * Generates matrix data for the requested stencil.
+ *
+ * @copydoc  generate_stencil(const gko::size_type, bool)
+ *
+ * @param comm  The MPI communicator to determine the rank.
+ * @param optimal_comm  If true, a  1D domain decomposition is used which leads
+ *                      to each processor having at most two neighbors. This
+ *                      also changes the domain shape to an elongated channel.
+ *                      If false, a mostly uniform 2D or 3D decomposition is
+ *                      used, and the domain shape is mostly cubic.
+ */
+template <typename ValueType, typename IndexType>
+gko::matrix_data<ValueType, IndexType> generate_stencil(
+    std::string stencil_name, gko::experimental::mpi::communicator comm,
+    const gko::size_type target_local_size, bool optimal_comm)
+{
+    if (stencil_name == "5pt") {
+        return generate_2d_stencil<ValueType, IndexType>(
+            std::move(comm), target_local_size, true, optimal_comm);
+    } else if (stencil_name == "9pt") {
+        return generate_2d_stencil<ValueType, IndexType>(
+            std::move(comm), target_local_size, false, optimal_comm);
+    } else if (stencil_name == "7pt") {
+        return generate_3d_stencil<ValueType, IndexType>(
+            std::move(comm), target_local_size, true, optimal_comm);
+    } else if (stencil_name == "27pt") {
+        return generate_3d_stencil<ValueType, IndexType>(
+            std::move(comm), target_local_size, false, optimal_comm);
+    } else {
+        throw std::runtime_error("Stencil " + stencil_name +
+                                 " not implemented");
+    }
+}
+
+
+#endif
+#endif  // GINKGO_BENCHMARK_UTILS_STENCIL_MATRIX_HPP