diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt index 5f8d59a8017..f587bf1d3fa 100644 --- a/benchmark/CMakeLists.txt +++ b/benchmark/CMakeLists.txt @@ -11,7 +11,7 @@ function(ginkgo_benchmark_add_tuning_maybe name) endfunction() function(ginkgo_benchmark_cusparse_linops type def) - add_library(cusparse_linops_${type} utils/cuda_linops.cu) + add_library(cusparse_linops_${type} utils/cuda_linops.cpp) if(CMAKE_CUDA_COMPILER_ID STREQUAL "NVIDIA") # remove false positive CUDA warnings when calling one() and zero() target_compile_options(cusparse_linops_${type} @@ -117,7 +117,7 @@ if (GINKGO_BUILD_CUDA) ginkgo_benchmark_cusparse_linops(s GKO_BENCHMARK_USE_SINGLE_PRECISION) ginkgo_benchmark_cusparse_linops(z GKO_BENCHMARK_USE_DOUBLE_COMPLEX_PRECISION) ginkgo_benchmark_cusparse_linops(c GKO_BENCHMARK_USE_SINGLE_COMPLEX_PRECISION) - add_library(cuda_timer utils/cuda_timer.cu) + add_library(cuda_timer utils/cuda_timer.cpp) target_link_libraries(cuda_timer ginkgo ${CUDA_RUNTIME_LIBS}) target_include_directories(cuda_timer SYSTEM PRIVATE ${CUDA_INCLUDE_DIRS}) endif() diff --git a/benchmark/blas/CMakeLists.txt b/benchmark/blas/CMakeLists.txt index c3e40e80bc2..2aab2331b19 100644 --- a/benchmark/blas/CMakeLists.txt +++ b/benchmark/blas/CMakeLists.txt @@ -1 +1,4 @@ ginkgo_add_typed_benchmark_executables(blas "NO" blas.cpp) +if(GINKGO_BUILD_MPI) + add_subdirectory(distributed) +endif() diff --git a/benchmark/blas/blas.cpp b/benchmark/blas/blas.cpp index 4ed42debbbf..58e241e92db 100644 --- a/benchmark/blas/blas.cpp +++ b/benchmark/blas/blas.cpp @@ -33,345 +33,18 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include -#include -#include #include -#include #include #include -#include +#include "benchmark/blas/blas_common.hpp" #include "benchmark/utils/general.hpp" -#include "benchmark/utils/loggers.hpp" -#include "benchmark/utils/timer.hpp" +#include "benchmark/utils/generator.hpp" #include "benchmark/utils/types.hpp" -// Command-line arguments -DEFINE_string( - operations, "copy,axpy,scal", - "A comma-separated list of BLAS operations to benchmark.\nCandidates are" - " copy (y = x),\n" - " axpy (y = y + a * x),\n" - " multiaxpy (like axpy, but a has one entry per column),\n" - " scal (y = a * y),\n" - " multiscal (like scal, but a has one entry per column),\n" - " dot (a = x' * y)," - " norm (a = sqrt(x' * x)),\n" - " mm (C = A * B),\n" - " gemm (C = a * A * B + b * C)\n" - "where A has dimensions n x k, B has dimensions k x m,\n" - "C has dimensions n x m and x and y have dimensions n x r"); - - -class BenchmarkOperation { -public: - virtual ~BenchmarkOperation() = default; - - virtual gko::size_type get_flops() const = 0; - virtual gko::size_type get_memory() const = 0; - virtual void prepare(){}; - virtual void run() = 0; -}; - - -class CopyOperation : public BenchmarkOperation { -public: - CopyOperation(std::shared_ptr exec, - gko::size_type rows, gko::size_type cols, - gko::size_type istride, gko::size_type ostride) - { - in_ = gko::matrix::Dense::create(exec, gko::dim<2>{rows, cols}, - istride); - out_ = gko::matrix::Dense::create(exec, gko::dim<2>{rows, cols}, - ostride); - in_->fill(1); - } - - gko::size_type get_flops() const override - { - return in_->get_size()[0] * in_->get_size()[1]; - } - - gko::size_type get_memory() const override - { - return in_->get_size()[0] * in_->get_size()[1] * sizeof(etype) * 2; - } - - void run() override { in_->convert_to(lend(out_)); } - -private: - std::unique_ptr> in_; - std::unique_ptr> out_; -}; - - -class AxpyOperation : public BenchmarkOperation { -public: - AxpyOperation(std::shared_ptr exec, - gko::size_type rows, gko::size_type cols, - gko::size_type stride_in, gko::size_type stride_out, - bool multi) - { - alpha_ = gko::matrix::Dense::create( - exec, gko::dim<2>{1, multi ? cols : 1}); - x_ = gko::matrix::Dense::create(exec, gko::dim<2>{rows, cols}, - stride_in); - y_ = gko::matrix::Dense::create(exec, gko::dim<2>{rows, cols}, - stride_out); - alpha_->fill(1); - x_->fill(1); - } - - gko::size_type get_flops() const override - { - return y_->get_size()[0] * y_->get_size()[1] * 2; - } - - gko::size_type get_memory() const override - { - return y_->get_size()[0] * y_->get_size()[1] * sizeof(etype) * 3; - } - - void prepare() override { y_->fill(1); } - - void run() override { y_->add_scaled(lend(alpha_), lend(x_)); } - -private: - std::unique_ptr> alpha_; - std::unique_ptr> x_; - std::unique_ptr> y_; -}; - - -class ScalOperation : public BenchmarkOperation { -public: - ScalOperation(std::shared_ptr exec, - gko::size_type rows, gko::size_type cols, - gko::size_type stride, bool multi) - { - alpha_ = gko::matrix::Dense::create( - exec, gko::dim<2>{1, multi ? cols : 1}); - y_ = gko::matrix::Dense::create(exec, gko::dim<2>{rows, cols}, - stride); - alpha_->fill(1); - } - - gko::size_type get_flops() const override - { - return y_->get_size()[0] * y_->get_size()[1]; - } - - gko::size_type get_memory() const override - { - return y_->get_size()[0] * y_->get_size()[1] * sizeof(etype) * 2; - } - - void prepare() override { y_->fill(1); } - - void run() override { y_->scale(lend(alpha_)); } - -private: - std::unique_ptr> alpha_; - std::unique_ptr> y_; -}; - - -class DotOperation : public BenchmarkOperation { -public: - DotOperation(std::shared_ptr exec, gko::size_type rows, - gko::size_type cols, gko::size_type stride_x, - gko::size_type stride_y) - { - alpha_ = gko::matrix::Dense::create(exec, gko::dim<2>{1, cols}); - x_ = gko::matrix::Dense::create(exec, gko::dim<2>{rows, cols}, - stride_x); - y_ = gko::matrix::Dense::create(exec, gko::dim<2>{rows, cols}, - stride_y); - x_->fill(1); - y_->fill(1); - } - - gko::size_type get_flops() const override - { - return y_->get_size()[0] * y_->get_size()[1] * 2; - } - - gko::size_type get_memory() const override - { - return y_->get_size()[0] * y_->get_size()[1] * sizeof(etype) * 2; - } - - void run() override { x_->compute_dot(lend(y_), lend(alpha_)); } - -private: - std::unique_ptr> alpha_; - std::unique_ptr> x_; - std::unique_ptr> y_; -}; - - -class NormOperation : public BenchmarkOperation { -public: - NormOperation(std::shared_ptr exec, - gko::size_type rows, gko::size_type cols, - gko::size_type stride) - { - alpha_ = gko::matrix::Dense::create(exec, gko::dim<2>{1, cols}); - y_ = gko::matrix::Dense::create(exec, gko::dim<2>{rows, cols}, - stride); - y_->fill(1); - } - - gko::size_type get_flops() const override - { - return y_->get_size()[0] * y_->get_size()[1] * 2; - } - - gko::size_type get_memory() const override - { - return y_->get_size()[0] * y_->get_size()[1] * sizeof(etype); - } - - void run() override { y_->compute_norm2(lend(alpha_)); } - -private: - std::unique_ptr> alpha_; - std::unique_ptr> y_; -}; - - -class ApplyOperation : public BenchmarkOperation { -public: - ApplyOperation(std::shared_ptr exec, gko::size_type n, - gko::size_type k, gko::size_type m, gko::size_type stride_A, - gko::size_type stride_B, gko::size_type stride_C) - { - A_ = gko::matrix::Dense::create(exec, gko::dim<2>{n, k}, - stride_A); - B_ = gko::matrix::Dense::create(exec, gko::dim<2>{k, m}, - stride_B); - C_ = gko::matrix::Dense::create(exec, gko::dim<2>{n, m}, - stride_C); - A_->fill(1); - B_->fill(1); - } - - gko::size_type get_flops() const override - { - return A_->get_size()[0] * A_->get_size()[1] * B_->get_size()[1] * 2; - } - - gko::size_type get_memory() const override - { - return (A_->get_size()[0] * A_->get_size()[1] + - B_->get_size()[0] * B_->get_size()[1] + - C_->get_size()[0] * C_->get_size()[1]) * - sizeof(etype); - } - - void run() override { A_->apply(lend(B_), lend(C_)); } - -private: - std::unique_ptr> A_; - std::unique_ptr> B_; - std::unique_ptr> C_; -}; - - -class AdvancedApplyOperation : public BenchmarkOperation { -public: - AdvancedApplyOperation(std::shared_ptr exec, - gko::size_type n, gko::size_type k, gko::size_type m, - gko::size_type stride_A, gko::size_type stride_B, - gko::size_type stride_C) - { - A_ = gko::matrix::Dense::create(exec, gko::dim<2>{n, k}, - stride_A); - B_ = gko::matrix::Dense::create(exec, gko::dim<2>{k, m}, - stride_B); - C_ = gko::matrix::Dense::create(exec, gko::dim<2>{n, m}, - stride_C); - alpha_ = gko::matrix::Dense::create(exec, gko::dim<2>{1, 1}); - beta_ = gko::matrix::Dense::create(exec, gko::dim<2>{1, 1}); - A_->fill(1); - B_->fill(1); - alpha_->fill(1); - beta_->fill(1); - } - - gko::size_type get_flops() const override - { - return A_->get_size()[0] * A_->get_size()[1] * B_->get_size()[1] * 2 + - C_->get_size()[0] * C_->get_size()[1] * 3; - } - - gko::size_type get_memory() const override - { - return (A_->get_size()[0] * A_->get_size()[1] + - B_->get_size()[0] * B_->get_size()[1] + - C_->get_size()[0] * C_->get_size()[1] * 2) * - sizeof(etype); - } - - void run() override - { - A_->apply(lend(alpha_), lend(B_), lend(beta_), lend(C_)); - } - -private: - std::unique_ptr> alpha_; - std::unique_ptr> beta_; - std::unique_ptr> A_; - std::unique_ptr> B_; - std::unique_ptr> C_; -}; - - -struct dimensions { - gko::size_type n; - gko::size_type k; - gko::size_type m; - gko::size_type r; - gko::size_type stride_x; - gko::size_type stride_y; - gko::size_type stride_A; - gko::size_type stride_B; - gko::size_type stride_C; -}; - - -gko::size_type get_optional(rapidjson::Value& obj, const char* name, - gko::size_type default_value) -{ - if (obj.HasMember(name)) { - return obj[name].GetUint64(); - } else { - return default_value; - } -} - - -dimensions parse_dims(rapidjson::Value& test_case) -{ - dimensions result; - result.n = test_case["n"].GetInt64(); - result.k = get_optional(test_case, "k", result.n); - result.m = get_optional(test_case, "m", result.n); - result.r = get_optional(test_case, "r", 1); - if (test_case.HasMember("stride")) { - result.stride_x = test_case["stride"].GetInt64(); - result.stride_y = result.stride_x; - } else { - result.stride_x = get_optional(test_case, "stride_x", result.r); - result.stride_y = get_optional(test_case, "stride_y", result.r); - } - result.stride_A = get_optional(test_case, "stride_A", result.k); - result.stride_B = get_optional(test_case, "stride_B", result.m); - result.stride_C = get_optional(test_case, "stride_C", result.m); - return result; -} +using Generator = DefaultSystemGenerator<>; std::map( @@ -379,136 +52,80 @@ std::map( operation_map{ {"copy", [](std::shared_ptr exec, dimensions dims) { - return std::make_unique( - exec, dims.n, dims.r, dims.stride_x, dims.stride_y); + return std::make_unique>( + exec, Generator{}, dims.n, dims.r, dims.stride_x, + dims.stride_y); }}, {"axpy", [](std::shared_ptr exec, dimensions dims) { - return std::make_unique( - exec, dims.n, dims.r, dims.stride_x, dims.stride_y, false); + return std::make_unique>( + exec, Generator{}, dims.n, dims.r, dims.stride_x, + dims.stride_y, false); }}, {"multiaxpy", [](std::shared_ptr exec, dimensions dims) { - return std::make_unique( - exec, dims.n, dims.r, dims.stride_x, dims.stride_y, true); + return std::make_unique>( + exec, Generator{}, dims.n, dims.r, dims.stride_x, + dims.stride_y, true); }}, {"scal", [](std::shared_ptr exec, dimensions dims) { - return std::make_unique(exec, dims.n, dims.r, - dims.stride_y, false); + return std::make_unique>( + exec, Generator{}, dims.n, dims.r, dims.stride_y, false); }}, {"multiscal", [](std::shared_ptr exec, dimensions dims) { - return std::make_unique(exec, dims.n, dims.r, - dims.stride_y, true); + return std::make_unique>( + exec, Generator{}, dims.n, dims.r, dims.stride_y, true); }}, {"dot", [](std::shared_ptr exec, dimensions dims) { - return std::make_unique( - exec, dims.n, dims.r, dims.stride_x, dims.stride_y); + return std::make_unique>( + exec, Generator{}, dims.n, dims.r, dims.stride_x, + dims.stride_y); }}, {"norm", [](std::shared_ptr exec, dimensions dims) { - return std::make_unique(exec, dims.n, dims.r, - dims.stride_y); + return std::make_unique>( + exec, Generator{}, dims.n, dims.r, dims.stride_y); }}, {"mm", [](std::shared_ptr exec, dimensions dims) { - return std::make_unique( - exec, dims.n, dims.k, dims.m, dims.stride_A, dims.stride_B, - dims.stride_C); + return std::make_unique>( + exec, Generator{}, dims.n, dims.k, dims.m, dims.stride_A, + dims.stride_B, dims.stride_C); }}, {"gemm", [](std::shared_ptr exec, dimensions dims) { - return std::make_unique( - exec, dims.n, dims.k, dims.m, dims.stride_A, dims.stride_B, - dims.stride_C); + return std::make_unique>( + exec, Generator{}, dims.n, dims.k, dims.m, dims.stride_A, + dims.stride_B, dims.stride_C); }}}; -void apply_blas(const char* operation_name, std::shared_ptr exec, - rapidjson::Value& test_case, - rapidjson::MemoryPoolAllocator<>& allocator) -{ - try { - auto& blas_case = test_case["blas"]; - add_or_set_member(blas_case, operation_name, - rapidjson::Value(rapidjson::kObjectType), allocator); - - auto op = operation_map[operation_name](exec, parse_dims(test_case)); - - auto timer = get_timer(exec, FLAGS_gpu_timer); - IterationControl ic(timer); - - // warm run - for (auto _ : ic.warmup_run()) { - op->prepare(); - exec->synchronize(); - op->run(); - exec->synchronize(); - } - - // timed run - op->prepare(); - for (auto _ : ic.run()) { - op->run(); - } - const auto runtime = ic.compute_average_time(); - const auto flops = static_cast(op->get_flops()); - const auto mem = static_cast(op->get_memory()); - const auto repetitions = ic.get_num_repetitions(); - add_or_set_member(blas_case[operation_name], "time", runtime, - allocator); - add_or_set_member(blas_case[operation_name], "flops", flops / runtime, - allocator); - add_or_set_member(blas_case[operation_name], "bandwidth", mem / runtime, - allocator); - add_or_set_member(blas_case[operation_name], "repetitions", repetitions, - allocator); - - // compute and write benchmark data - add_or_set_member(blas_case[operation_name], "completed", true, - allocator); - } catch (const std::exception& e) { - add_or_set_member(test_case["blas"][operation_name], "completed", false, - allocator); - if (FLAGS_keep_errors) { - rapidjson::Value msg_value; - msg_value.SetString(e.what(), allocator); - add_or_set_member(test_case["blas"][operation_name], "error", - msg_value, allocator); - } - std::cerr << "Error when processing test case " << test_case << "\n" - << "what(): " << e.what() << std::endl; - } -} - - int main(int argc, char* argv[]) { - std::string header = - "A benchmark for measuring performance of Ginkgo's BLAS-like " - "operations.\nParameters for a benchmark case are:\n" - " n: number of rows for vectors and gemm output (required)\n" - " r: number of columns for vectors (optional, default 1)\n" - " m: number of columns for gemm output (optional, default n)\n" - " k: inner dimension of the gemm (optional, default n)\n" - " stride: storage stride for both vectors (optional, default r)\n" - " stride_x: stride for input vector x (optional, default r)\n" - " stride_y: stride for in/out vector y (optional, default r)\n" - " stride_A: stride for A matrix in gemm (optional, default k)\n" - " stride_B: stride for B matrix in gemm (optional, default m)\n" - " stride_C: stride for C matrix in gemm (optional, default m)\n"; - std::string format = std::string() + " [\n { \"n\": 100 },\n" + - " { \"n\": 200, \"m\": 200, \"k\": 200 }\n" + - " ]\n\n"; + std::string header = R"(" +A benchmark for measuring performance of Ginkgo's BLAS-like " +operations. +Parameters for a benchmark case are: + n: number of rows for vectors and gemm output (required) + r: number of columns for vectors (optional, default 1) + m: number of columns for gemm output (optional, default n) + k: inner dimension of the gemm (optional, default n) + stride: storage stride for both vectors (optional, default r) + stride_x: stride for input vector x (optional, default r) + stride_y: stride for in/out vector y (optional, default r) + stride_A: stride for A matrix in gemm (optional, default k) + stride_B: stride for B matrix in gemm (optional, default m) + stride_C: stride for C matrix in gemm (optional, default m) +)"; + std::string format = example_config; initialize_argument_parsing(&argc, &argv, header, format); std::string extra_information = "The operations are " + FLAGS_operations; print_general_information(extra_information); auto exec = executor_factory.at(FLAGS_executor)(FLAGS_gpu_timer); - auto engine = get_engine(); - auto operations = split(FLAGS_operations, ','); rapidjson::IStreamWrapper jcin(std::cin); rapidjson::Document test_cases; @@ -520,37 +137,7 @@ int main(int argc, char* argv[]) std::exit(1); } - auto& allocator = test_cases.GetAllocator(); - - for (auto& test_case : test_cases.GetArray()) { - try { - // set up benchmark - if (!test_case.HasMember("blas")) { - test_case.AddMember("blas", - rapidjson::Value(rapidjson::kObjectType), - allocator); - } - auto& blas_case = test_case["blas"]; - if (!FLAGS_overwrite && - all_of(begin(operations), end(operations), - [&blas_case](const std::string& s) { - return blas_case.HasMember(s.c_str()); - })) { - continue; - } - std::clog << "Running test case: " << test_case << std::endl; - - for (const auto& operation_name : operations) { - apply_blas(operation_name.c_str(), exec, test_case, allocator); - std::clog << "Current state:" << std::endl - << test_cases << std::endl; - backup_results(test_cases); - } - } catch (const std::exception& e) { - std::cerr << "Error setting up benchmark, what(): " << e.what() - << std::endl; - } - } + run_blas_benchmarks(exec, operation_map, test_cases, true); std::cout << test_cases << std::endl; } diff --git a/benchmark/blas/blas_common.hpp b/benchmark/blas/blas_common.hpp new file mode 100644 index 00000000000..b6533aa820c --- /dev/null +++ b/benchmark/blas/blas_common.hpp @@ -0,0 +1,515 @@ +/************************************************************* +Copyright (c) 2017-2023, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include + + +#include +#include +#include +#include +#include +#include +#include + + +#include "benchmark/utils/general.hpp" +#include "benchmark/utils/loggers.hpp" +#include "benchmark/utils/timer.hpp" +#include "benchmark/utils/types.hpp" + + +// Command-line arguments +DEFINE_string( + operations, "copy,axpy,scal", + "A comma-separated list of BLAS operations to benchmark.\nCandidates are" + " copy (y = x),\n" + " axpy (y = y + a * x),\n" + " multiaxpy (like axpy, but a has one entry per column),\n" + " scal (y = a * y),\n" + " multiscal (like scal, but a has one entry per column),\n" + " dot (a = x' * y)," + " norm (a = sqrt(x' * x)),\n" + " mm (C = A * B),\n" + " gemm (C = a * A * B + b * C)\n" + "where A has dimensions n x k, B has dimensions k x m,\n" + "C has dimensions n x m and x and y have dimensions n x r"); + + +std::string example_config = R"( + [ + { "n": 100 }, + { "n": 200, "m": 200, "k": 200 } + ] +)"; + + +class BenchmarkOperation { +public: + virtual ~BenchmarkOperation() = default; + + virtual gko::size_type get_flops() const = 0; + virtual gko::size_type get_memory() const = 0; + virtual void prepare(){}; + virtual void run() = 0; +}; + +template +auto as_vector(T& p) +{ + return gko::as(gko::lend(p)); +} + + +template +class CopyOperation : public BenchmarkOperation { +public: + CopyOperation(std::shared_ptr exec, + const Generator& generator, gko::size_type rows, + gko::size_type cols, gko::size_type stride_in, + gko::size_type stride_out) + { + in_ = generator.create_multi_vector_strided( + exec, gko::dim<2>{rows, cols}, stride_in); + out_ = generator.create_multi_vector_strided( + exec, gko::dim<2>{rows, cols}, stride_out); + as_vector(in_)->fill(1); + } + + gko::size_type get_flops() const override + { + return in_->get_size()[0] * in_->get_size()[1]; + } + + gko::size_type get_memory() const override + { + return in_->get_size()[0] * in_->get_size()[1] * sizeof(etype) * 2; + } + + void run() override + { + as_vector(in_)->convert_to(as_vector(out_)); + } + +private: + std::unique_ptr in_; + std::unique_ptr out_; +}; + + +template +class AxpyOperation : public BenchmarkOperation { +public: + AxpyOperation(std::shared_ptr exec, + const Generator& generator, gko::size_type rows, + gko::size_type cols, gko::size_type stride_in, + gko::size_type stride_out, bool multi) + { + alpha_ = gko::matrix::Dense::create( + exec, gko::dim<2>{1, multi ? cols : 1}); + x_ = generator.create_multi_vector_strided( + exec, gko::dim<2>{rows, cols}, stride_in); + y_ = generator.create_multi_vector_strided( + exec, gko::dim<2>{rows, cols}, stride_out); + alpha_->fill(1); + as_vector(x_)->fill(1); + } + + gko::size_type get_flops() const override + { + return y_->get_size()[0] * y_->get_size()[1] * 2; + } + + gko::size_type get_memory() const override + { + return y_->get_size()[0] * y_->get_size()[1] * sizeof(etype) * 3; + } + + void prepare() override { as_vector(y_)->fill(1); } + + void run() override + { + as_vector(y_)->add_scaled(lend(alpha_), lend(x_)); + } + +private: + std::unique_ptr> alpha_; + std::unique_ptr x_; + std::unique_ptr y_; +}; + + +template +class ScalOperation : public BenchmarkOperation { +public: + ScalOperation(std::shared_ptr exec, + const Generator& generator, gko::size_type rows, + gko::size_type cols, gko::size_type stride, bool multi) + { + alpha_ = gko::matrix::Dense::create( + exec, gko::dim<2>{1, multi ? cols : 1}); + y_ = generator.create_multi_vector_strided( + exec, gko::dim<2>{rows, cols}, stride); + alpha_->fill(1); + } + + gko::size_type get_flops() const override + { + return y_->get_size()[0] * y_->get_size()[1]; + } + + gko::size_type get_memory() const override + { + return y_->get_size()[0] * y_->get_size()[1] * sizeof(etype) * 2; + } + + void prepare() override { as_vector(y_)->fill(1); } + + void run() override { as_vector(y_)->scale(lend(alpha_)); } + +private: + std::unique_ptr> alpha_; + std::unique_ptr y_; +}; + + +template +class DotOperation : public BenchmarkOperation { +public: + DotOperation(std::shared_ptr exec, + const Generator& generator, gko::size_type rows, + gko::size_type cols, gko::size_type stride_x, + gko::size_type stride_y) + { + alpha_ = gko::matrix::Dense::create(exec, gko::dim<2>{1, cols}); + x_ = generator.create_multi_vector_strided( + exec, gko::dim<2>{rows, cols}, stride_x); + y_ = generator.create_multi_vector_strided( + exec, gko::dim<2>{rows, cols}, stride_y); + as_vector(x_)->fill(1); + as_vector(y_)->fill(1); + } + + gko::size_type get_flops() const override + { + return y_->get_size()[0] * y_->get_size()[1] * 2; + } + + gko::size_type get_memory() const override + { + return y_->get_size()[0] * y_->get_size()[1] * sizeof(etype) * 2; + } + + void run() override + { + as_vector(x_)->compute_dot(lend(y_), lend(alpha_)); + } + +private: + std::unique_ptr> alpha_; + std::unique_ptr x_; + std::unique_ptr y_; +}; + + +template +class NormOperation : public BenchmarkOperation { +public: + NormOperation(std::shared_ptr exec, + const Generator& generator, gko::size_type rows, + gko::size_type cols, gko::size_type stride) + { + alpha_ = gko::matrix::Dense::create(exec, gko::dim<2>{1, cols}); + y_ = generator.create_multi_vector_strided( + exec, gko::dim<2>{rows, cols}, stride); + as_vector(y_)->fill(1); + } + + gko::size_type get_flops() const override + { + return y_->get_size()[0] * y_->get_size()[1] * 2; + } + + gko::size_type get_memory() const override + { + return y_->get_size()[0] * y_->get_size()[1] * sizeof(etype); + } + + void run() override + { + as_vector(y_)->compute_norm2(lend(alpha_)); + } + +private: + std::unique_ptr> alpha_; + std::unique_ptr y_; +}; + + +template +class ApplyOperation : public BenchmarkOperation { +public: + ApplyOperation(std::shared_ptr exec, + const Generator& generator, gko::size_type n, + gko::size_type k, gko::size_type m, gko::size_type stride_A, + gko::size_type stride_B, gko::size_type stride_C) + { + A_ = generator.create_multi_vector_strided(exec, gko::dim<2>{n, k}, + stride_A); + B_ = generator.create_multi_vector_strided(exec, gko::dim<2>{k, m}, + stride_B); + C_ = generator.create_multi_vector_strided(exec, gko::dim<2>{n, m}, + stride_C); + as_vector(A_)->fill(1); + as_vector(B_)->fill(1); + } + + gko::size_type get_flops() const override + { + return A_->get_size()[0] * A_->get_size()[1] * B_->get_size()[1] * 2; + } + + gko::size_type get_memory() const override + { + return (A_->get_size()[0] * A_->get_size()[1] + + B_->get_size()[0] * B_->get_size()[1] + + C_->get_size()[0] * C_->get_size()[1]) * + sizeof(etype); + } + + void run() override { A_->apply(lend(B_), lend(C_)); } + +private: + std::unique_ptr A_; + std::unique_ptr B_; + std::unique_ptr C_; +}; + + +template +class AdvancedApplyOperation : public BenchmarkOperation { +public: + AdvancedApplyOperation(std::shared_ptr exec, + const Generator& generator, gko::size_type n, + gko::size_type k, gko::size_type m, + gko::size_type stride_A, gko::size_type stride_B, + gko::size_type stride_C) + { + A_ = generator.create_multi_vector_strided(exec, gko::dim<2>{n, k}, + stride_A); + B_ = generator.create_multi_vector_strided(exec, gko::dim<2>{k, m}, + stride_B); + C_ = generator.create_multi_vector_strided(exec, gko::dim<2>{n, m}, + stride_C); + alpha_ = gko::matrix::Dense::create(exec, gko::dim<2>{1, 1}); + beta_ = gko::matrix::Dense::create(exec, gko::dim<2>{1, 1}); + as_vector(A_)->fill(1); + as_vector(B_)->fill(1); + alpha_->fill(1); + beta_->fill(1); + } + + gko::size_type get_flops() const override + { + return A_->get_size()[0] * A_->get_size()[1] * B_->get_size()[1] * 2 + + C_->get_size()[0] * C_->get_size()[1] * 3; + } + + gko::size_type get_memory() const override + { + return (A_->get_size()[0] * A_->get_size()[1] + + B_->get_size()[0] * B_->get_size()[1] + + C_->get_size()[0] * C_->get_size()[1] * 2) * + sizeof(etype); + } + + void run() override + { + A_->apply(lend(alpha_), lend(B_), lend(beta_), lend(C_)); + } + +private: + std::unique_ptr> alpha_; + std::unique_ptr> beta_; + std::unique_ptr A_; + std::unique_ptr B_; + std::unique_ptr C_; +}; + + +struct dimensions { + gko::size_type n; + gko::size_type k; + gko::size_type m; + gko::size_type r; + gko::size_type stride_x; + gko::size_type stride_y; + gko::size_type stride_A; + gko::size_type stride_B; + gko::size_type stride_C; +}; + + +dimensions parse_dims(rapidjson::Value& test_case) +{ + auto get_optional = [](rapidjson::Value& obj, const char* name, + gko::size_type default_value) -> gko::size_type { + if (obj.HasMember(name)) { + return obj[name].GetUint64(); + } else { + return default_value; + } + }; + + dimensions result; + result.n = test_case["n"].GetInt64(); + result.k = get_optional(test_case, "k", result.n); + result.m = get_optional(test_case, "m", result.n); + result.r = get_optional(test_case, "r", 1); + if (test_case.HasMember("stride")) { + result.stride_x = test_case["stride"].GetInt64(); + result.stride_y = result.stride_x; + } else { + result.stride_x = get_optional(test_case, "stride_x", result.r); + result.stride_y = get_optional(test_case, "stride_y", result.r); + } + result.stride_A = get_optional(test_case, "stride_A", result.k); + result.stride_B = get_optional(test_case, "stride_B", result.m); + result.stride_C = get_optional(test_case, "stride_C", result.m); + return result; +} + + +template +void apply_blas(const char* operation_name, std::shared_ptr exec, + const OpMap& operation_map, rapidjson::Value& test_case, + rapidjson::MemoryPoolAllocator<>& allocator) +{ + try { + auto& blas_case = test_case["blas"]; + add_or_set_member(blas_case, operation_name, + rapidjson::Value(rapidjson::kObjectType), allocator); + + auto op = operation_map.at(operation_name)(exec, parse_dims(test_case)); + + auto timer = get_timer(exec, FLAGS_gpu_timer); + IterationControl ic(timer); + + // warm run + for (auto _ : ic.warmup_run()) { + op->prepare(); + exec->synchronize(); + op->run(); + exec->synchronize(); + } + + // timed run + op->prepare(); + for (auto _ : ic.run()) { + op->run(); + } + const auto runtime = ic.compute_average_time(); + const auto flops = static_cast(op->get_flops()); + const auto mem = static_cast(op->get_memory()); + const auto repetitions = ic.get_num_repetitions(); + add_or_set_member(blas_case[operation_name], "time", runtime, + allocator); + add_or_set_member(blas_case[operation_name], "flops", flops / runtime, + allocator); + add_or_set_member(blas_case[operation_name], "bandwidth", mem / runtime, + allocator); + add_or_set_member(blas_case[operation_name], "repetitions", repetitions, + allocator); + + // compute and write benchmark data + add_or_set_member(blas_case[operation_name], "completed", true, + allocator); + } catch (const std::exception& e) { + add_or_set_member(test_case["blas"][operation_name], "completed", false, + allocator); + if (FLAGS_keep_errors) { + rapidjson::Value msg_value; + msg_value.SetString(e.what(), allocator); + add_or_set_member(test_case["blas"][operation_name], "error", + msg_value, allocator); + } + std::cerr << "Error when processing test case " << test_case << "\n" + << "what(): " << e.what() << std::endl; + } +} + + +template +void run_blas_benchmarks(std::shared_ptr exec, + const OpMap& operation_map, + rapidjson::Document& test_cases, bool do_print) +{ + auto operations = split(FLAGS_operations, ','); + auto& allocator = test_cases.GetAllocator(); + + for (auto& test_case : test_cases.GetArray()) { + try { + // set up benchmark + if (!test_case.HasMember("blas")) { + test_case.AddMember("blas", + rapidjson::Value(rapidjson::kObjectType), + allocator); + } + auto& blas_case = test_case["blas"]; + if (!FLAGS_overwrite && + all_of(begin(operations), end(operations), + [&blas_case](const std::string& s) { + return blas_case.HasMember(s.c_str()); + })) { + continue; + } + if (do_print) { + std::clog << "Running test case: " << test_case << std::endl; + } + + for (const auto& operation_name : operations) { + apply_blas(operation_name.c_str(), exec, operation_map, + test_case, allocator); + + if (do_print) { + std::clog << "Current state:" << std::endl + << test_cases << std::endl; + + backup_results(test_cases); + } + } + } catch (const std::exception& e) { + std::cerr << "Error setting up benchmark, what(): " << e.what() + << std::endl; + } + } +} diff --git a/benchmark/blas/distributed/CMakeLists.txt b/benchmark/blas/distributed/CMakeLists.txt new file mode 100644 index 00000000000..1371294efb8 --- /dev/null +++ b/benchmark/blas/distributed/CMakeLists.txt @@ -0,0 +1 @@ +ginkgo_add_typed_benchmark_executables(multi-vector-distributed "NO" multi_vector.cpp) diff --git a/benchmark/blas/distributed/multi_vector.cpp b/benchmark/blas/distributed/multi_vector.cpp new file mode 100644 index 00000000000..147bcc649d0 --- /dev/null +++ b/benchmark/blas/distributed/multi_vector.cpp @@ -0,0 +1,135 @@ +/************************************************************* +Copyright (c) 2017-2023, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include + + +#include +#include +#include + + +#include "benchmark/blas/blas_common.hpp" +#include "benchmark/utils/general.hpp" +#include "benchmark/utils/generator.hpp" +#include "benchmark/utils/types.hpp" + +using Generator = DistributedDefaultSystemGenerator>; + + +int main(int argc, char* argv[]) +{ + gko::experimental::mpi::environment mpi_env{argc, argv}; + + std::string header = R"(" +A benchmark for measuring performance of Ginkgo's BLAS-like " +operations. +Parameters for a benchmark case are: + n: number of rows for vectors output (required) + r: number of columns for vectors (optional, default 1) + stride: storage stride for both vectors (optional, default r) + stride_x: stride for input vector x (optional, default r) + stride_y: stride for in/out vector y (optional, default r) +)"; + std::string format = example_config; + initialize_argument_parsing(&argc, &argv, header, format); + + std::string extra_information = "The operations are " + FLAGS_operations; + print_general_information(extra_information); + + const auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); + const auto rank = comm.rank(); + + auto exec = executor_factory_mpi.at(FLAGS_executor)(comm.get()); + + std::string json_input = broadcast_json_input(std::cin, comm); + rapidjson::Document test_cases; + test_cases.Parse(json_input.c_str()); + if (!test_cases.IsArray()) { + std::cerr + << "Input has to be a JSON array of benchmark configurations:\n" + << format; + std::exit(1); + } + + std::map( + std::shared_ptr, dimensions)>> + operation_map{ + {"copy", + [&](std::shared_ptr exec, dimensions dims) { + return std::make_unique>( + exec, Generator{comm, {}}, dims.n, dims.r, dims.stride_x, + dims.stride_y); + }}, + {"axpy", + [&](std::shared_ptr exec, dimensions dims) { + return std::make_unique>( + exec, Generator{comm, {}}, dims.n, dims.r, dims.stride_x, + dims.stride_y, false); + }}, + {"multiaxpy", + [&](std::shared_ptr exec, dimensions dims) { + return std::make_unique>( + exec, Generator{comm, {}}, dims.n, dims.r, dims.stride_x, + dims.stride_y, true); + }}, + {"scal", + [&](std::shared_ptr exec, dimensions dims) { + return std::make_unique>( + exec, Generator{comm, {}}, dims.n, dims.r, dims.stride_y, + false); + }}, + {"multiscal", + [&](std::shared_ptr exec, dimensions dims) { + return std::make_unique>( + exec, Generator{comm, {}}, dims.n, dims.r, dims.stride_y, + true); + }}, + {"dot", + [&](std::shared_ptr exec, dimensions dims) { + return std::make_unique>( + exec, Generator{comm, {}}, dims.n, dims.r, dims.stride_x, + dims.stride_y); + }}, + {"norm", + [&](std::shared_ptr exec, dimensions dims) { + return std::make_unique>( + exec, Generator{comm, {}}, dims.n, dims.r, dims.stride_y); + }}}; + + run_blas_benchmarks(exec, operation_map, test_cases, rank == 0); + + if (rank == 0) { + std::cout << test_cases << std::endl; + } +} diff --git a/benchmark/conversions/conversions.cpp b/benchmark/conversions/conversions.cpp index 46212f8a591..8516f4a6bc3 100644 --- a/benchmark/conversions/conversions.cpp +++ b/benchmark/conversions/conversions.cpp @@ -45,8 +45,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "benchmark/utils/formats.hpp" #include "benchmark/utils/general.hpp" -#include "benchmark/utils/loggers.hpp" -#include "benchmark/utils/spmv_common.hpp" +#include "benchmark/utils/generator.hpp" +#include "benchmark/utils/spmv_validation.hpp" #include "benchmark/utils/timer.hpp" #include "benchmark/utils/types.hpp" @@ -70,8 +70,7 @@ void convert_matrix(const gko::LinOp* matrix_from, const char* format_to, rapidjson::Value(rapidjson::kObjectType), allocator); gko::matrix_data data{gko::dim<2>{1, 1}, 1}; - auto matrix_to = - share(formats::matrix_factory.at(format_to)(exec, data)); + auto matrix_to = share(formats::matrix_factory(format_to, exec, data)); auto timer = get_timer(exec, FLAGS_gpu_timer); IterationControl ic{timer}; @@ -114,9 +113,7 @@ int main(int argc, char* argv[]) { std::string header = "A benchmark for measuring performance of Ginkgo's conversions.\n"; - std::string format_str = - std::string() + " [\n" + " { \"filename\": \"my_file.mtx\"},\n" + - " { \"filename\": \"my_file2.mtx\"}\n" + " ]\n\n"; + std::string format_str = example_config; initialize_argument_parsing(&argc, &argv, header, format_str); std::string extra_information = @@ -147,10 +144,9 @@ int main(int argc, char* argv[]) auto& conversion_case = test_case["conversions"]; std::clog << "Running test case: " << test_case << std::endl; - std::ifstream mtx_fd(test_case["filename"].GetString()); gko::matrix_data data; try { - data = gko::read_generic_raw(mtx_fd); + data = DefaultSystemGenerator<>::generate_matrix_data(test_case); } catch (std::exception& e) { std::cerr << "Error setting up matrix data, what(): " << e.what() << std::endl; @@ -158,10 +154,11 @@ int main(int argc, char* argv[]) } std::clog << "Matrix is of size (" << data.size[0] << ", " << data.size[1] << ")" << std::endl; + add_or_set_member(test_case, "size", data.size[0], allocator); for (const auto& format_from : formats) { try { auto matrix_from = - share(formats::matrix_factory.at(format_from)(exec, data)); + share(formats::matrix_factory(format_from, exec, data)); for (const auto& format_to : formats) { if (format_from == format_to) { continue; @@ -182,7 +179,7 @@ int main(int argc, char* argv[]) } backup_results(test_cases); } catch (const gko::AllocationError& e) { - for (const auto& format : formats::matrix_factory) { + for (const auto& format : formats::matrix_type_factory) { const auto format_to = std::get<0>(format); auto conversion_name = std::string(format_from) + "-" + format_to; diff --git a/benchmark/matrix_statistics/matrix_statistics.cpp b/benchmark/matrix_statistics/matrix_statistics.cpp index 0b71ac8ccb8..a62c8c10637 100644 --- a/benchmark/matrix_statistics/matrix_statistics.cpp +++ b/benchmark/matrix_statistics/matrix_statistics.cpp @@ -34,14 +34,13 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include -#include #include -#include #include #include "benchmark/utils/general.hpp" -#include "benchmark/utils/spmv_common.hpp" +#include "benchmark/utils/generator.hpp" +#include "benchmark/utils/spmv_validation.hpp" #include "benchmark/utils/types.hpp" @@ -173,9 +172,7 @@ int main(int argc, char* argv[]) std::string header = "A utility that collects additional statistical properties of the " "matrix.\n"; - std::string format = std::string() + " [\n" + - " { \"filename\": \"my_file.mtx\"},\n" + - " { \"filename\": \"my_file2.mtx\"}\n" + " ]\n\n"; + std::string format = example_config; initialize_argument_parsing(&argc, &argv, header, format); std::clog << gko::version_info::get() << std::endl; @@ -202,12 +199,13 @@ int main(int argc, char* argv[]) std::clog << "Running test case: " << test_case << std::endl; - std::ifstream ifs(test_case["filename"].GetString()); - auto matrix = gko::read_generic_raw(ifs); - ifs.close(); + auto matrix = + DefaultSystemGenerator::generate_matrix_data( + test_case); std::clog << "Matrix is of size (" << matrix.size[0] << ", " << matrix.size[1] << ")" << std::endl; + add_or_set_member(test_case, "size", matrix.size[0], allocator); extract_matrix_statistics(matrix, test_case["problem"], allocator); diff --git a/benchmark/preconditioner/preconditioner.cpp b/benchmark/preconditioner/preconditioner.cpp index cbf81ed699a..8a947ac8df8 100644 --- a/benchmark/preconditioner/preconditioner.cpp +++ b/benchmark/preconditioner/preconditioner.cpp @@ -34,19 +34,18 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include -#include #include #include -#include #include #include #include "benchmark/utils/formats.hpp" #include "benchmark/utils/general.hpp" +#include "benchmark/utils/generator.hpp" #include "benchmark/utils/loggers.hpp" #include "benchmark/utils/preconditioners.hpp" -#include "benchmark/utils/spmv_common.hpp" +#include "benchmark/utils/spmv_validation.hpp" #include "benchmark/utils/timer.hpp" #include "benchmark/utils/types.hpp" @@ -259,9 +258,7 @@ int main(int argc, char* argv[]) FLAGS_formats = "csr"; std::string header = "A benchmark for measuring preconditioner performance.\n"; - std::string format = std::string() + " [\n" + - " { \"filename\": \"my_file.mtx\"},\n" + - " { \"filename\": \"my_file2.mtx\"}\n" + " ]\n\n"; + std::string format = example_config; initialize_argument_parsing(&argc, &argv, header, format); std::string extra_information = @@ -288,6 +285,8 @@ int main(int argc, char* argv[]) auto& allocator = test_cases.GetAllocator(); + DefaultSystemGenerator<> generator{}; + for (auto& test_case : test_cases.GetArray()) { try { // set up benchmark @@ -307,18 +306,19 @@ int main(int argc, char* argv[]) } std::clog << "Running test case: " << test_case << std::endl; - std::ifstream mtx_fd(test_case["filename"].GetString()); - auto data = gko::read_generic_raw(mtx_fd); + auto data = generator.generate_matrix_data(test_case); auto system_matrix = - share(formats::matrix_factory.at(FLAGS_formats)(exec, data)); - auto b = create_vector(exec, system_matrix->get_size()[0], - engine); - auto x = create_vector(exec, system_matrix->get_size()[0]); + share(formats::matrix_factory(FLAGS_formats, exec, data)); + auto b = generator.create_multi_vector_random( + exec, system_matrix->get_size()[0]); + auto x = generator.create_multi_vector( + exec, system_matrix->get_size()[0], gko::zero()); std::clog << "Matrix is of size (" << system_matrix->get_size()[0] << ", " << system_matrix->get_size()[1] << ")" << std::endl; + add_or_set_member(test_case, "size", data.size[0], allocator); for (const auto& precond_name : preconditioners) { run_preconditioner(precond_name.c_str(), exec, system_matrix, lend(b), lend(x), test_case, allocator); diff --git a/benchmark/solver/CMakeLists.txt b/benchmark/solver/CMakeLists.txt index 11c08bdf6d2..76edd10b2a9 100644 --- a/benchmark/solver/CMakeLists.txt +++ b/benchmark/solver/CMakeLists.txt @@ -1 +1,4 @@ ginkgo_add_typed_benchmark_executables(solver "YES" solver.cpp) +if(GINKGO_BUILD_MPI) + add_subdirectory(distributed) +endif() diff --git a/benchmark/solver/distributed/CMakeLists.txt b/benchmark/solver/distributed/CMakeLists.txt new file mode 100644 index 00000000000..ca6586f1acf --- /dev/null +++ b/benchmark/solver/distributed/CMakeLists.txt @@ -0,0 +1 @@ +ginkgo_add_typed_benchmark_executables(solver-distributed "YES" solver.cpp) diff --git a/benchmark/solver/distributed/solver.cpp b/benchmark/solver/distributed/solver.cpp new file mode 100644 index 00000000000..85ac23670d1 --- /dev/null +++ b/benchmark/solver/distributed/solver.cpp @@ -0,0 +1,155 @@ +/************************************************************* +Copyright (c) 2017-2023, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include + + +#include +#include +#include +#include + + +#include "benchmark/solver/solver_common.hpp" +#include "benchmark/utils/general.hpp" +#include "benchmark/utils/generator.hpp" + + +struct Generator : public DistributedDefaultSystemGenerator { + Generator(gko::experimental::mpi::communicator comm) + : DistributedDefaultSystemGenerator{std::move(comm), + {}} + {} + + std::unique_ptr generate_rhs(std::shared_ptr exec, + const gko::LinOp* system_matrix, + rapidjson::Value& config) const + { + return Vec::create( + exec, comm, gko::dim<2>{system_matrix->get_size()[0], FLAGS_nrhs}, + gko::as( + local_generator.generate_rhs( + exec, gko::as(system_matrix)->get_local_matrix().get(), + config)) + .get()); + } + + std::unique_ptr generate_initial_guess( + std::shared_ptr exec, + const gko::LinOp* system_matrix, const Vec* rhs) const + { + return Vec::create( + exec, comm, gko::dim<2>{rhs->get_size()[0], FLAGS_nrhs}, + gko::as( + local_generator.generate_initial_guess( + exec, gko::as(system_matrix)->get_local_matrix().get(), + rhs->get_local_vector())) + .get()); + } +}; + + +int main(int argc, char* argv[]) +{ + gko::experimental::mpi::environment mpi_env{argc, argv}; + + // Set the default repetitions = 1. + FLAGS_repetitions = "1"; + FLAGS_min_repetitions = 1; + + std::string header = + "A benchmark for measuring Ginkgo's distributed solvers\n"; + std::string format = example_config + R"( + The matrix will either be read from an input file if the filename parameter + is given, or generated as a stencil matrix. + If the filename parameter is given, all processes will read the file and + then the matrix is distributed row-block-wise. + In the other case, a size and stencil parameter have to be provided. + The size parameter denotes the size per process. It might be adjusted to + fit the dimensionality of the stencil more easily. + Possible values for "stencil" are: 5pt (2D), 7pt (3D), 9pt (2D), 27pt (3D). + Optional values for "comm_pattern" are: stencil, optimal (default). + Possible values for "optimal[spmv]" follow the pattern + "-", where both "local_format" and + "non_local_format" can be any of the recognized spmv formats. +)"; + initialize_argument_parsing(&argc, &argv, header, format); + + const auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); + const auto rank = comm.rank(); + + auto exec = executor_factory_mpi.at(FLAGS_executor)(comm.get()); + + std::string extra_information; + if (FLAGS_repetitions == "auto") { + extra_information = + "WARNING: repetitions = 'auto' not supported for MPI " + "benchmarks, setting repetitions to the default value."; + FLAGS_repetitions = "10"; + } + if (rank == 0) { + print_general_information(extra_information); + } + + std::set supported_solvers = {"cg", "fcg", "cgs", "bicgstab", + "gmres"}; + auto solvers = split(FLAGS_solvers, ','); + for (const auto& solver : solvers) { + if (supported_solvers.find(solver) == supported_solvers.end()) { + throw std::range_error( + std::string("The requested solvers <") + FLAGS_solvers + + "> contain the unsupported solver <" + solver + ">!"); + } + } + + std::stringstream ss_rel_res_goal; + ss_rel_res_goal << std::scientific << FLAGS_rel_res_goal; + + std::string json_input = FLAGS_overhead + ? R"( +[{"filename": "overhead.mtx", + "optimal": {"spmv": "csr-csr"}] +)" + : broadcast_json_input(std::cin, comm); + rapidjson::Document test_cases; + test_cases.Parse(json_input.c_str()); + + if (!test_cases.IsArray()) { + print_config_error_and_exit(); + } + + run_solver_benchmarks(exec, test_cases, Generator(comm), rank == 0); + + if (rank == 0) { + std::cout << test_cases << std::endl; + } +} diff --git a/benchmark/solver/solver.cpp b/benchmark/solver/solver.cpp index 7dcdc516c15..e134a8d4a4f 100644 --- a/benchmark/solver/solver.cpp +++ b/benchmark/solver/solver.cpp @@ -46,520 +46,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include -#include "benchmark/utils/formats.hpp" +#include "benchmark/solver/solver_common.hpp" #include "benchmark/utils/general.hpp" -#include "benchmark/utils/loggers.hpp" -#include "benchmark/utils/overhead_linop.hpp" -#include "benchmark/utils/preconditioners.hpp" -#include "benchmark/utils/timer.hpp" -#include "benchmark/utils/types.hpp" - - -#ifdef GINKGO_BENCHMARK_ENABLE_TUNING -#include "benchmark/utils/tuning_variables.hpp" -#endif // GINKGO_BENCHMARK_ENABLE_TUNING - - -// Command-line arguments -DEFINE_uint32(max_iters, 1000, - "Maximal number of iterations the solver will be run for"); - -DEFINE_uint32(warmup_max_iters, 100, - "Maximal number of warmup iterations the solver will be run for"); - -DEFINE_double(rel_res_goal, 1e-6, "The relative residual goal of the solver"); - -DEFINE_bool( - rel_residual, false, - "Use relative residual instead of residual reduction stopping criterion"); - -DEFINE_string(solvers, "cg", - "A comma-separated list of solvers to run. " - "Supported values are: bicgstab, bicg, cb_gmres_keep, " - "cb_gmres_reduce1, cb_gmres_reduce2, cb_gmres_integer, " - "cb_gmres_ireduce1, cb_gmres_ireduce2, cg, cgs, fcg, gmres, idr, " - "lower_trs, upper_trs, symm_direct, overhead"); - -DEFINE_uint32( - nrhs, 1, - "The number of right hand sides. Record the residual only when nrhs == 1."); - -DEFINE_uint32(gmres_restart, 100, - "What maximum dimension of the Krylov space to use in GMRES"); - -DEFINE_uint32(idr_subspace_dim, 2, - "What dimension of the subspace to use in IDR"); - -DEFINE_double( - idr_kappa, 0.7, - "the number to check whether Av_n and v_n are too close or not in IDR"); - -DEFINE_string( - rhs_generation, "1", - "Method used to generate the right hand side. Supported values are:" - "`1`, `random`, `sinus`. `1` sets all values of the right hand side to 1, " - "`random` assigns the values to a uniformly distributed random number " - "in [-1, 1), and `sinus` assigns b = A * (s / |s|) with A := system matrix," - " s := vector with s(idx) = sin(idx) for non-complex types, and " - "s(idx) = sin(2*idx) + i * sin(2*idx+1)."); - -DEFINE_string( - initial_guess_generation, "rhs", - "Method used to generate the initial guess. Supported values are: " - "`random`, `rhs`, `0`. `random` uses a random vector, `rhs` uses the right " - "hand side, and `0 uses a zero vector as the initial guess."); - -// This allows to benchmark the overhead of a solver by using the following -// data: A=[1.0], x=[0.0], b=[nan]. This data can be used to benchmark normal -// solvers or using the argument --solvers=overhead, a minimal solver will be -// launched which contains only a few kernel calls. -DEFINE_bool(overhead, false, - "If set, uses dummy data to benchmark Ginkgo overhead"); - - -// input validation -[[noreturn]] void print_config_error_and_exit() -{ - std::cerr << "Input has to be a JSON array of matrix configurations:\n" - << " [\n" - << " { \"filename\": \"my_file.mtx\", \"optimal\": { " - "\"spmv\": \"\" },\n" - " \"rhs\": \"my_file_rhs.mtx\" },\n" - << " { \"filename\": \"my_file2.mtx\", \"optimal\": { " - "\"spmv\": \"\" } }\n" - << " ]" << std::endl; - std::exit(1); -} - - -template -std::unique_ptr> generate_rhs( - std::shared_ptr exec, - std::shared_ptr system_matrix, Engine engine) -{ - gko::dim<2> vec_size{system_matrix->get_size()[0], FLAGS_nrhs}; - if (FLAGS_rhs_generation == "1") { - return create_matrix(exec, vec_size, gko::one()); - } else if (FLAGS_rhs_generation == "random") { - return create_matrix(exec, vec_size, engine); - } else if (FLAGS_rhs_generation == "sinus") { - auto rhs = vec::create(exec, vec_size); - - auto tmp = create_matrix_sin(exec, vec_size); - auto scalar = gko::matrix::Dense::create( - exec->get_master(), gko::dim<2>{1, vec_size[1]}); - tmp->compute_norm2(scalar.get()); - for (gko::size_type i = 0; i < vec_size[1]; ++i) { - scalar->at(0, i) = gko::one() / scalar->at(0, i); - } - // normalize sin-vector - if (gko::is_complex_s::value) { - tmp->scale(scalar->make_complex().get()); - } else { - tmp->scale(scalar.get()); - } - system_matrix->apply(tmp.get(), rhs.get()); - return rhs; - } - throw std::invalid_argument(std::string("\"rhs_generation\" = ") + - FLAGS_rhs_generation + " is not supported!"); -} - - -template -std::unique_ptr> generate_initial_guess( - std::shared_ptr exec, - std::shared_ptr system_matrix, const vec* rhs, - Engine engine) -{ - gko::dim<2> vec_size{system_matrix->get_size()[1], FLAGS_nrhs}; - if (FLAGS_initial_guess_generation == "0") { - return create_matrix(exec, vec_size, gko::zero()); - } else if (FLAGS_initial_guess_generation == "random") { - return create_matrix(exec, vec_size, engine); - } else if (FLAGS_initial_guess_generation == "rhs") { - return rhs->clone(); - } - throw std::invalid_argument(std::string("\"initial_guess_generation\" = ") + - FLAGS_initial_guess_generation + - " is not supported!"); -} - - -void validate_option_object(const rapidjson::Value& value) -{ - if (!value.IsObject() || !value.HasMember("optimal") || - !value["optimal"].HasMember("spmv") || - !value["optimal"]["spmv"].IsString() || !value.HasMember("filename") || - !value["filename"].IsString() || - (value.HasMember("rhs") && !value["rhs"].IsString())) { - print_config_error_and_exit(); - } -} - - -std::shared_ptr create_criterion( - std::shared_ptr exec, std::uint32_t max_iters) -{ - std::shared_ptr residual_stop; - if (FLAGS_rel_residual) { - residual_stop = - gko::share(gko::stop::ResidualNorm::build() - .with_baseline(gko::stop::mode::initial_resnorm) - .with_reduction_factor( - static_cast(FLAGS_rel_res_goal)) - .on(exec)); - } else { - residual_stop = - gko::share(gko::stop::ResidualNorm::build() - .with_baseline(gko::stop::mode::rhs_norm) - .with_reduction_factor( - static_cast(FLAGS_rel_res_goal)) - .on(exec)); - } - auto iteration_stop = gko::share( - gko::stop::Iteration::build().with_max_iters(max_iters).on(exec)); - std::vector> - criterion_vector{residual_stop, iteration_stop}; - return gko::stop::combine(criterion_vector); -} - - -template -std::unique_ptr add_criteria_precond_finalize( - SolverIntermediate inter, const std::shared_ptr& exec, - std::shared_ptr precond, std::uint32_t max_iters) -{ - return inter.with_criteria(create_criterion(exec, max_iters)) - .with_preconditioner(give(precond)) - .on(exec); -} - - -template -std::unique_ptr add_criteria_precond_finalize( - const std::shared_ptr& exec, - std::shared_ptr precond, std::uint32_t max_iters) -{ - return add_criteria_precond_finalize(Solver::build(), exec, precond, - max_iters); -} - - -std::unique_ptr generate_solver( - const std::shared_ptr& exec, - std::shared_ptr precond, - const std::string& description, std::uint32_t max_iters) -{ - std::string cb_gmres_prefix("cb_gmres_"); - if (description.find(cb_gmres_prefix) == 0) { - auto s_prec = gko::solver::cb_gmres::storage_precision::keep; - const auto spec = description.substr(cb_gmres_prefix.length()); - if (spec == "keep") { - s_prec = gko::solver::cb_gmres::storage_precision::keep; - } else if (spec == "reduce1") { - s_prec = gko::solver::cb_gmres::storage_precision::reduce1; - } else if (spec == "reduce2") { - s_prec = gko::solver::cb_gmres::storage_precision::reduce2; - } else if (spec == "integer") { - s_prec = gko::solver::cb_gmres::storage_precision::integer; - } else if (spec == "ireduce1") { - s_prec = gko::solver::cb_gmres::storage_precision::ireduce1; - } else if (spec == "ireduce2") { - s_prec = gko::solver::cb_gmres::storage_precision::ireduce2; - } else { - throw std::range_error( - std::string( - "CB-GMRES does not have a corresponding solver to <") + - description + ">!"); - } - return add_criteria_precond_finalize( - gko::solver::CbGmres::build() - .with_krylov_dim(FLAGS_gmres_restart) - .with_storage_precision(s_prec), - exec, precond, max_iters); - } else if (description == "bicgstab") { - return add_criteria_precond_finalize>( - exec, precond, max_iters); - } else if (description == "bicg") { - return add_criteria_precond_finalize>( - exec, precond, max_iters); - } else if (description == "cg") { - return add_criteria_precond_finalize>( - exec, precond, max_iters); - } else if (description == "cgs") { - return add_criteria_precond_finalize>( - exec, precond, max_iters); - } else if (description == "fcg") { - return add_criteria_precond_finalize>( - exec, precond, max_iters); - } else if (description == "idr") { - return add_criteria_precond_finalize( - gko::solver::Idr::build() - .with_subspace_dim(FLAGS_idr_subspace_dim) - .with_kappa(static_cast(FLAGS_idr_kappa)), - exec, precond, max_iters); - } else if (description == "gmres") { - return add_criteria_precond_finalize( - gko::solver::Gmres::build().with_krylov_dim( - FLAGS_gmres_restart), - exec, precond, max_iters); - } else if (description == "lower_trs") { - return gko::solver::LowerTrs::build() - .with_num_rhs(FLAGS_nrhs) - .on(exec); - } else if (description == "upper_trs") { - return gko::solver::UpperTrs::build() - .with_num_rhs(FLAGS_nrhs) - .on(exec); - } else if (description == "symm_direct") { - return gko::experimental::solver::Direct::build() - .with_factorization( - gko::experimental::factorization::Lu::build() - .with_symmetric_sparsity(true) - .on(exec)) - .on(exec); - } else if (description == "overhead") { - return add_criteria_precond_finalize>( - exec, precond, max_iters); - } - throw std::range_error(std::string("The provided string <") + description + - "> does not match any solver!"); -} - - -void write_precond_info(const gko::LinOp* precond, - rapidjson::Value& precond_info, - rapidjson::MemoryPoolAllocator<>& allocator) -{ - if (const auto jacobi = - dynamic_cast*>(precond)) { - // extract block sizes - const auto bdata = - jacobi->get_parameters().block_pointers.get_const_data(); - add_or_set_member(precond_info, "block_sizes", - rapidjson::Value(rapidjson::kArrayType), allocator); - const auto nblocks = jacobi->get_num_blocks(); - for (auto i = decltype(nblocks){0}; i < nblocks; ++i) { - precond_info["block_sizes"].PushBack(bdata[i + 1] - bdata[i], - allocator); - } - - // extract block precisions - const auto pdata = - jacobi->get_parameters() - .storage_optimization.block_wise.get_const_data(); - if (pdata) { - add_or_set_member(precond_info, "block_precisions", - rapidjson::Value(rapidjson::kArrayType), - allocator); - for (auto i = decltype(nblocks){0}; i < nblocks; ++i) { - precond_info["block_precisions"].PushBack( - static_cast(pdata[i]), allocator); - } - } - - // extract condition numbers - const auto cdata = jacobi->get_conditioning(); - if (cdata) { - add_or_set_member(precond_info, "block_conditioning", - rapidjson::Value(rapidjson::kArrayType), - allocator); - for (auto i = decltype(nblocks){0}; i < nblocks; ++i) { - precond_info["block_conditioning"].PushBack(cdata[i], - allocator); - } - } - } -} - - -void solve_system(const std::string& solver_name, - const std::string& precond_name, - const char* precond_solver_name, - std::shared_ptr exec, - std::shared_ptr system_matrix, - const vec* b, const vec* x, - rapidjson::Value& test_case, - rapidjson::MemoryPoolAllocator<>& allocator) -{ - try { - auto& solver_case = test_case["solver"]; - if (!FLAGS_overwrite && solver_case.HasMember(precond_solver_name)) { - return; - } - - add_or_set_member(solver_case, precond_solver_name, - rapidjson::Value(rapidjson::kObjectType), allocator); - auto& solver_json = solver_case[precond_solver_name]; - add_or_set_member(solver_json, "recurrent_residuals", - rapidjson::Value(rapidjson::kArrayType), allocator); - add_or_set_member(solver_json, "true_residuals", - rapidjson::Value(rapidjson::kArrayType), allocator); - add_or_set_member(solver_json, "implicit_residuals", - rapidjson::Value(rapidjson::kArrayType), allocator); - add_or_set_member(solver_json, "iteration_timestamps", - rapidjson::Value(rapidjson::kArrayType), allocator); - if (b->get_size()[1] == 1 && !FLAGS_overhead) { - auto rhs_norm = compute_norm2(lend(b)); - add_or_set_member(solver_json, "rhs_norm", rhs_norm, allocator); - } - for (auto stage : {"generate", "apply"}) { - add_or_set_member(solver_json, stage, - rapidjson::Value(rapidjson::kObjectType), - allocator); - add_or_set_member(solver_json[stage], "components", - rapidjson::Value(rapidjson::kObjectType), - allocator); - } - - IterationControl ic{get_timer(exec, FLAGS_gpu_timer)}; - - // warm run - std::shared_ptr solver; - for (auto _ : ic.warmup_run()) { - auto x_clone = clone(x); - auto precond = precond_factory.at(precond_name)(exec); - solver = generate_solver(exec, give(precond), solver_name, - FLAGS_warmup_max_iters) - ->generate(system_matrix); - solver->apply(lend(b), lend(x_clone)); - exec->synchronize(); - } - - // detail run - if (FLAGS_detailed && !FLAGS_overhead) { - // slow run, get the time of each functions - auto x_clone = clone(x); - - auto gen_logger = - std::make_shared(FLAGS_nested_names); - exec->add_logger(gen_logger); - if (exec->get_master() != exec) { - exec->get_master()->add_logger(gen_logger); - } - - auto precond = precond_factory.at(precond_name)(exec); - solver = generate_solver(exec, give(precond), solver_name, - FLAGS_max_iters) - ->generate(system_matrix); - - if (exec->get_master() != exec) { - exec->get_master()->remove_logger(gko::lend(gen_logger)); - } - exec->remove_logger(gko::lend(gen_logger)); - gen_logger->write_data(solver_json["generate"]["components"], - allocator, 1); - - if (auto prec = - dynamic_cast(lend(solver))) { - add_or_set_member(solver_json, "preconditioner", - rapidjson::Value(rapidjson::kObjectType), - allocator); - write_precond_info( - lend(clone(get_executor(FLAGS_gpu_timer)->get_master(), - prec->get_preconditioner())), - solver_json["preconditioner"], allocator); - } - - auto apply_logger = - std::make_shared(FLAGS_nested_names); - exec->add_logger(apply_logger); - if (exec->get_master() != exec) { - exec->get_master()->add_logger(apply_logger); - } - - solver->apply(lend(b), lend(x_clone)); - - if (exec->get_master() != exec) { - exec->get_master()->remove_logger(gko::lend(apply_logger)); - } - exec->remove_logger(gko::lend(apply_logger)); - apply_logger->write_data(solver_json["apply"]["components"], - allocator, 1); - - // slow run, gets the recurrent and true residuals of each iteration - if (b->get_size()[1] == 1) { - x_clone = clone(x); - auto res_logger = std::make_shared>( - lend(system_matrix), b, solver_json["recurrent_residuals"], - solver_json["true_residuals"], - solver_json["implicit_residuals"], - solver_json["iteration_timestamps"], allocator); - solver->add_logger(res_logger); - solver->apply(lend(b), lend(x_clone)); - if (!res_logger->has_implicit_res_norms()) { - solver_json.RemoveMember("implicit_residuals"); - } - } - exec->synchronize(); - } - - // timed run - auto it_logger = std::make_shared(); - auto generate_timer = get_timer(exec, FLAGS_gpu_timer); - auto apply_timer = ic.get_timer(); - auto x_clone = clone(x); - for (auto status : ic.run(false)) { - x_clone = clone(x); - - exec->synchronize(); - generate_timer->tic(); - auto precond = precond_factory.at(precond_name)(exec); - solver = generate_solver(exec, give(precond), solver_name, - FLAGS_max_iters) - ->generate(system_matrix); - generate_timer->toc(); - - exec->synchronize(); - if (ic.get_num_repetitions() == 0) { - solver->add_logger(it_logger); - } - apply_timer->tic(); - solver->apply(lend(b), lend(x_clone)); - apply_timer->toc(); - if (ic.get_num_repetitions() == 0) { - solver->remove_logger(gko::lend(it_logger)); - } - } - it_logger->write_data(solver_json["apply"], allocator); - - if (b->get_size()[1] == 1 && !FLAGS_overhead) { - // a solver is considered direct if it didn't log any iterations - if (solver_json["apply"].HasMember("iterations") && - solver_json["apply"]["iterations"].GetInt() == 0) { - auto error = - compute_direct_error(lend(solver), lend(b), lend(x_clone)); - add_or_set_member(solver_json, "forward_error", error, - allocator); - } - auto residual = compute_residual_norm(lend(system_matrix), lend(b), - lend(x_clone)); - add_or_set_member(solver_json, "residual_norm", residual, - allocator); - } - add_or_set_member(solver_json["generate"], "time", - generate_timer->compute_average_time(), allocator); - add_or_set_member(solver_json["apply"], "time", - apply_timer->compute_average_time(), allocator); - add_or_set_member(solver_json, "repetitions", - apply_timer->get_num_repetitions(), allocator); - - // compute and write benchmark data - add_or_set_member(solver_json, "completed", true, allocator); - } catch (const std::exception& e) { - add_or_set_member(test_case["solver"][precond_solver_name], "completed", - false, allocator); - if (FLAGS_keep_errors) { - rapidjson::Value msg_value; - msg_value.SetString(e.what(), allocator); - add_or_set_member(test_case["solver"][precond_solver_name], "error", - msg_value, allocator); - } - std::cerr << "Error when processing test case " << test_case << "\n" - << "what(): " << e.what() << std::endl; - } -} +#include "benchmark/utils/generator.hpp" int main(int argc, char* argv[]) @@ -569,16 +58,9 @@ int main(int argc, char* argv[]) FLAGS_min_repetitions = 1; std::string header = "A benchmark for measuring performance of Ginkgo's solvers.\n"; - std::string format = - std::string() + " [\n" + - " { \"filename\": \"my_file.mtx\", \"optimal\": { " - "\"spmv\": \"\" },\n" - " \"rhs\": \"my_file_rhs.mtx\" },\n" + - " { \"filename\": \"my_file2.mtx\", \"optimal\": { " - "\"spmv\": \"\" } }\n" + - " ]\n\n" + - " \"optimal_format\" can be one of the recognized spmv " - "format\n\n"; + std::string format = example_config + R"( + "optimal":"spmv" can be one of the recognized spmv formats +)"; initialize_argument_parsing(&argc, &argv, header, format); std::stringstream ss_rel_res_goal; @@ -592,14 +74,6 @@ int main(int argc, char* argv[]) print_general_information(extra_information); auto exec = get_executor(FLAGS_gpu_timer); - auto solvers = split(FLAGS_solvers, ','); - auto preconds = split(FLAGS_preconditioners, ','); - std::vector precond_solvers; - for (const auto& s : solvers) { - for (const auto& p : preconds) { - precond_solvers.push_back(s + (p == "none" ? "" : "-" + p)); - } - } rapidjson::Document test_cases; if (!FLAGS_overhead) { @@ -617,73 +91,7 @@ int main(int argc, char* argv[]) print_config_error_and_exit(); } - auto engine = get_engine(); - auto& allocator = test_cases.GetAllocator(); - - for (auto& test_case : test_cases.GetArray()) { - try { - // set up benchmark - validate_option_object(test_case); - if (!test_case.HasMember("solver")) { - test_case.AddMember("solver", - rapidjson::Value(rapidjson::kObjectType), - allocator); - } - auto& solver_case = test_case["solver"]; - if (!FLAGS_overwrite && - all_of(begin(precond_solvers), end(precond_solvers), - [&solver_case](const std::string& s) { - return solver_case.HasMember(s.c_str()); - })) { - continue; - } - std::clog << "Running test case: " << test_case << std::endl; - std::ifstream mtx_fd(test_case["filename"].GetString()); - - using Vec = gko::matrix::Dense; - std::shared_ptr system_matrix; - std::unique_ptr b; - std::unique_ptr x; - if (FLAGS_overhead) { - system_matrix = gko::initialize({1.0}, exec); - b = gko::initialize( - {std::numeric_limits::quiet_NaN()}, exec); - x = gko::initialize({0.0}, exec); - } else { - auto data = gko::read_generic_raw(mtx_fd); - system_matrix = share(formats::matrix_factory.at( - test_case["optimal"]["spmv"].GetString())(exec, data)); - if (test_case.HasMember("rhs")) { - std::ifstream rhs_fd{test_case["rhs"].GetString()}; - b = gko::read(rhs_fd, exec); - } else { - b = generate_rhs(exec, system_matrix, engine); - } - x = generate_initial_guess(exec, system_matrix, b.get(), - engine); - } - - std::clog << "Matrix is of size (" << system_matrix->get_size()[0] - << ", " << system_matrix->get_size()[1] << ")" - << std::endl; - auto precond_solver_name = begin(precond_solvers); - for (const auto& solver_name : solvers) { - for (const auto& precond_name : preconds) { - std::clog << "\tRunning solver: " << *precond_solver_name - << std::endl; - solve_system(solver_name, precond_name, - precond_solver_name->c_str(), exec, - system_matrix, lend(b), lend(x), test_case, - allocator); - backup_results(test_cases); - ++precond_solver_name; - } - } - } catch (const std::exception& e) { - std::cerr << "Error setting up solver, what(): " << e.what() - << std::endl; - } - } + run_solver_benchmarks(exec, test_cases, SolverGenerator{}, true); std::cout << test_cases << std::endl; } diff --git a/benchmark/solver/solver_common.hpp b/benchmark/solver/solver_common.hpp new file mode 100644 index 00000000000..845fccb2fe5 --- /dev/null +++ b/benchmark/solver/solver_common.hpp @@ -0,0 +1,657 @@ +/************************************************************* +Copyright (c) 2017-2023, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#ifndef GINKGO_BENCHMARK_SOLVER_SOLVER_COMMON_HPP +#define GINKGO_BENCHMARK_SOLVER_SOLVER_COMMON_HPP + + +#include "benchmark/utils/formats.hpp" +#include "benchmark/utils/general.hpp" +#include "benchmark/utils/generator.hpp" +#include "benchmark/utils/loggers.hpp" +#include "benchmark/utils/preconditioners.hpp" + + +#ifdef GINKGO_BENCHMARK_ENABLE_TUNING +#include "benchmark/utils/tuning_variables.hpp" +#endif // GINKGO_BENCHMARK_ENABLE_TUNING + + +// Command-line arguments +DEFINE_uint32(max_iters, 1000, + "Maximal number of iterations the solver will be run for"); + +DEFINE_uint32(warmup_max_iters, 100, + "Maximal number of warmup iterations the solver will be run for"); + +DEFINE_double(rel_res_goal, 1e-6, "The relative residual goal of the solver"); + +DEFINE_bool( + rel_residual, false, + "Use relative residual instead of residual reduction stopping criterion"); + +DEFINE_string(solvers, "cg", + "A comma-separated list of solvers to run. " + "Supported values are: bicgstab, bicg, cb_gmres_keep, " + "cb_gmres_reduce1, cb_gmres_reduce2, cb_gmres_integer, " + "cb_gmres_ireduce1, cb_gmres_ireduce2, cg, cgs, fcg, gmres, idr, " + "lower_trs, upper_trs, overhead"); + +DEFINE_uint32( + nrhs, 1, + "The number of right hand sides. Record the residual only when nrhs == 1."); + +DEFINE_uint32(gmres_restart, 100, + "What maximum dimension of the Krylov space to use in GMRES"); + +DEFINE_uint32(idr_subspace_dim, 2, + "What dimension of the subspace to use in IDR"); + +DEFINE_double( + idr_kappa, 0.7, + "the number to check whether Av_n and v_n are too close or not in IDR"); + +DEFINE_string( + rhs_generation, "1", + "Method used to generate the right hand side. Supported values are:" + "`1`, `random`, `sinus`. `1` sets all values of the right hand side to 1, " + "`random` assigns the values to a uniformly distributed random number " + "in [-1, 1), and `sinus` assigns b = A * (s / |s|) with A := system matrix," + " s := vector with s(idx) = sin(idx) for non-complex types, and " + "s(idx) = sin(2*idx) + i * sin(2*idx+1)."); + +DEFINE_string( + initial_guess_generation, "rhs", + "Method used to generate the initial guess. Supported values are: " + "`random`, `rhs`, `0`. `random` uses a random vector, `rhs` uses the right " + "hand side, and `0 uses a zero vector as the initial guess."); + +// This allows to benchmark the overhead of a solver by using the following +// data: A=[1.0], x=[0.0], b=[nan]. This data can be used to benchmark normal +// solvers or using the argument --solvers=overhead, a minimal solver will be +// launched which contains only a few kernel calls. +DEFINE_bool(overhead, false, + "If set, uses dummy data to benchmark Ginkgo overhead"); + + +std::string example_config = R"( + [ + {"filename": "my_file.mtx", "optimal": {"spmv": "ell-csr"}, + "rhs": "my_file_rhs.mtx"}, + {"filename": "my_file2.mtx", "optimal": {"spmv": "coo-coo"}, + "rhs": "my_file_rhs.mtx"}, + {"size": 100, "stencil": "7pt", "comm_pattern": "stencil", + "optimal": {"spmv": "csr-coo"}} + ] +)"; + + +// input validation +[[noreturn]] void print_config_error_and_exit() +{ + std::cerr << "Input has to be a JSON array of solver configurations:\n" + << example_config << std::endl; + std::exit(1); +} + + +void validate_option_object(const rapidjson::Value& value) +{ + if (!value.IsObject() || + !((value.HasMember("size") && value.HasMember("stencil") && + value["size"].IsInt64() && value["stencil"].IsString()) || + (value.HasMember("filename") && value["filename"].IsString())) || + (!value.HasMember("optimal") && !value["optimal"].HasMember("spmv") && + !value["optimal"]["spmv"].IsString())) { + print_config_error_and_exit(); + } +} + + +std::shared_ptr create_criterion( + std::shared_ptr exec, std::uint32_t max_iters) +{ + std::shared_ptr residual_stop; + if (FLAGS_rel_residual) { + residual_stop = + gko::share(gko::stop::ResidualNorm::build() + .with_baseline(gko::stop::mode::initial_resnorm) + .with_reduction_factor( + static_cast(FLAGS_rel_res_goal)) + .on(exec)); + } else { + residual_stop = + gko::share(gko::stop::ResidualNorm::build() + .with_baseline(gko::stop::mode::rhs_norm) + .with_reduction_factor( + static_cast(FLAGS_rel_res_goal)) + .on(exec)); + } + auto iteration_stop = gko::share( + gko::stop::Iteration::build().with_max_iters(max_iters).on(exec)); + std::vector> + criterion_vector{residual_stop, iteration_stop}; + return gko::stop::combine(criterion_vector); +} + + +template +std::unique_ptr add_criteria_precond_finalize( + SolverIntermediate inter, const std::shared_ptr& exec, + std::shared_ptr precond, std::uint32_t max_iters) +{ + return inter.with_criteria(create_criterion(exec, max_iters)) + .with_preconditioner(give(precond)) + .on(exec); +} + + +template +std::unique_ptr add_criteria_precond_finalize( + const std::shared_ptr& exec, + std::shared_ptr precond, std::uint32_t max_iters) +{ + return add_criteria_precond_finalize(Solver::build(), exec, precond, + max_iters); +} + + +std::unique_ptr generate_solver( + const std::shared_ptr& exec, + std::shared_ptr precond, + const std::string& description, std::uint32_t max_iters) +{ + std::string cb_gmres_prefix("cb_gmres_"); + if (description.find(cb_gmres_prefix) == 0) { + auto s_prec = gko::solver::cb_gmres::storage_precision::keep; + const auto spec = description.substr(cb_gmres_prefix.length()); + if (spec == "keep") { + s_prec = gko::solver::cb_gmres::storage_precision::keep; + } else if (spec == "reduce1") { + s_prec = gko::solver::cb_gmres::storage_precision::reduce1; + } else if (spec == "reduce2") { + s_prec = gko::solver::cb_gmres::storage_precision::reduce2; + } else if (spec == "integer") { + s_prec = gko::solver::cb_gmres::storage_precision::integer; + } else if (spec == "ireduce1") { + s_prec = gko::solver::cb_gmres::storage_precision::ireduce1; + } else if (spec == "ireduce2") { + s_prec = gko::solver::cb_gmres::storage_precision::ireduce2; + } else { + throw std::range_error( + std::string( + "CB-GMRES does not have a corresponding solver to <") + + description + ">!"); + } + return add_criteria_precond_finalize( + gko::solver::CbGmres::build() + .with_krylov_dim(FLAGS_gmres_restart) + .with_storage_precision(s_prec), + exec, precond, max_iters); + } else if (description == "bicgstab") { + return add_criteria_precond_finalize>( + exec, precond, max_iters); + } else if (description == "bicg") { + return add_criteria_precond_finalize>( + exec, precond, max_iters); + } else if (description == "cg") { + return add_criteria_precond_finalize>( + exec, precond, max_iters); + } else if (description == "cgs") { + return add_criteria_precond_finalize>( + exec, precond, max_iters); + } else if (description == "fcg") { + return add_criteria_precond_finalize>( + exec, precond, max_iters); + } else if (description == "idr") { + return add_criteria_precond_finalize( + gko::solver::Idr::build() + .with_subspace_dim(FLAGS_idr_subspace_dim) + .with_kappa(static_cast(FLAGS_idr_kappa)), + exec, precond, max_iters); + } else if (description == "gmres") { + return add_criteria_precond_finalize( + gko::solver::Gmres::build().with_krylov_dim( + FLAGS_gmres_restart), + exec, precond, max_iters); + } else if (description == "lower_trs") { + return gko::solver::LowerTrs::build() + .with_num_rhs(FLAGS_nrhs) + .on(exec); + } else if (description == "upper_trs") { + return gko::solver::UpperTrs::build() + .with_num_rhs(FLAGS_nrhs) + .on(exec); + } else if (description == "overhead") { + return add_criteria_precond_finalize>( + exec, precond, max_iters); + } + throw std::range_error(std::string("The provided string <") + description + + "> does not match any solver!"); +} + + +void write_precond_info(const gko::LinOp* precond, + rapidjson::Value& precond_info, + rapidjson::MemoryPoolAllocator<>& allocator) +{ + if (const auto jacobi = + dynamic_cast*>(precond)) { + // extract block sizes + const auto bdata = + jacobi->get_parameters().block_pointers.get_const_data(); + add_or_set_member(precond_info, "block_sizes", + rapidjson::Value(rapidjson::kArrayType), allocator); + const auto nblocks = jacobi->get_num_blocks(); + for (auto i = decltype(nblocks){0}; i < nblocks; ++i) { + precond_info["block_sizes"].PushBack(bdata[i + 1] - bdata[i], + allocator); + } + + // extract block precisions + const auto pdata = + jacobi->get_parameters() + .storage_optimization.block_wise.get_const_data(); + if (pdata) { + add_or_set_member(precond_info, "block_precisions", + rapidjson::Value(rapidjson::kArrayType), + allocator); + for (auto i = decltype(nblocks){0}; i < nblocks; ++i) { + precond_info["block_precisions"].PushBack( + static_cast(pdata[i]), allocator); + } + } + + // extract condition numbers + const auto cdata = jacobi->get_conditioning(); + if (cdata) { + add_or_set_member(precond_info, "block_conditioning", + rapidjson::Value(rapidjson::kArrayType), + allocator); + for (auto i = decltype(nblocks){0}; i < nblocks; ++i) { + precond_info["block_conditioning"].PushBack(cdata[i], + allocator); + } + } + } +} + + +struct SolverGenerator : DefaultSystemGenerator<> { + using Vec = typename DefaultSystemGenerator::Vec; + + std::unique_ptr generate_rhs(std::shared_ptr exec, + const gko::LinOp* system_matrix, + rapidjson::Value& config) const + { + if (config.HasMember("rhs")) { + std::ifstream rhs_fd{config["rhs"].GetString()}; + return gko::read(rhs_fd, std::move(exec)); + } else { + gko::dim<2> vec_size{system_matrix->get_size()[0], FLAGS_nrhs}; + if (FLAGS_rhs_generation == "1") { + return create_multi_vector(exec, vec_size, gko::one()); + } else if (FLAGS_rhs_generation == "random") { + return create_multi_vector_random(exec, vec_size); + } else if (FLAGS_rhs_generation == "sinus") { + auto rhs = vec::create(exec, vec_size); + + auto tmp = create_matrix_sin(exec, vec_size); + auto scalar = gko::matrix::Dense::create( + exec->get_master(), gko::dim<2>{1, vec_size[1]}); + tmp->compute_norm2(scalar.get()); + for (gko::size_type i = 0; i < vec_size[1]; ++i) { + scalar->at(0, i) = gko::one() / scalar->at(0, i); + } + // normalize sin-vector + if (gko::is_complex_s::value) { + tmp->scale(scalar->make_complex().get()); + } else { + tmp->scale(scalar.get()); + } + system_matrix->apply(tmp.get(), rhs.get()); + return rhs; + } + throw std::invalid_argument(std::string("\"rhs_generation\" = ") + + FLAGS_rhs_generation + + " is not supported!"); + } + } + + std::unique_ptr generate_initial_guess( + std::shared_ptr exec, + const gko::LinOp* system_matrix, const Vec* rhs) const + { + gko::dim<2> vec_size{system_matrix->get_size()[1], FLAGS_nrhs}; + if (FLAGS_initial_guess_generation == "0") { + return create_multi_vector(exec, vec_size, gko::zero()); + } else if (FLAGS_initial_guess_generation == "random") { + return create_multi_vector_random(exec, vec_size); + } else if (FLAGS_initial_guess_generation == "rhs") { + return rhs->clone(); + } + throw std::invalid_argument( + std::string("\"initial_guess_generation\" = ") + + FLAGS_initial_guess_generation + " is not supported!"); + } + + std::unique_ptr initialize( + std::initializer_list il, + std::shared_ptr exec) const + { + return gko::initialize(std::move(il), std::move(exec)); + } + + std::default_random_engine engine = get_engine(); +}; + + +template +void solve_system(const std::string& solver_name, + const std::string& precond_name, + const char* precond_solver_name, + std::shared_ptr exec, + std::shared_ptr system_matrix, + const VectorType* b, const VectorType* x, + rapidjson::Value& test_case, + rapidjson::MemoryPoolAllocator<>& allocator) +{ + try { + auto& solver_case = test_case["solver"]; + if (!FLAGS_overwrite && solver_case.HasMember(precond_solver_name)) { + return; + } + + add_or_set_member(solver_case, precond_solver_name, + rapidjson::Value(rapidjson::kObjectType), allocator); + auto& solver_json = solver_case[precond_solver_name]; + add_or_set_member(solver_json, "recurrent_residuals", + rapidjson::Value(rapidjson::kArrayType), allocator); + add_or_set_member(solver_json, "true_residuals", + rapidjson::Value(rapidjson::kArrayType), allocator); + add_or_set_member(solver_json, "implicit_residuals", + rapidjson::Value(rapidjson::kArrayType), allocator); + add_or_set_member(solver_json, "iteration_timestamps", + rapidjson::Value(rapidjson::kArrayType), allocator); + if (b->get_size()[1] == 1 && !FLAGS_overhead) { + auto rhs_norm = compute_norm2(lend(b)); + add_or_set_member(solver_json, "rhs_norm", rhs_norm, allocator); + } + for (auto stage : {"generate", "apply"}) { + add_or_set_member(solver_json, stage, + rapidjson::Value(rapidjson::kObjectType), + allocator); + add_or_set_member(solver_json[stage], "components", + rapidjson::Value(rapidjson::kObjectType), + allocator); + } + + IterationControl ic{get_timer(exec, FLAGS_gpu_timer)}; + + // warm run + std::shared_ptr solver; + for (auto _ : ic.warmup_run()) { + auto x_clone = clone(x); + auto precond = precond_factory.at(precond_name)(exec); + solver = generate_solver(exec, give(precond), solver_name, + FLAGS_warmup_max_iters) + ->generate(system_matrix); + solver->apply(lend(b), lend(x_clone)); + exec->synchronize(); + } + + // detail run + if (FLAGS_detailed && !FLAGS_overhead) { + // slow run, get the time of each functions + auto x_clone = clone(x); + + auto gen_logger = + std::make_shared(FLAGS_nested_names); + exec->add_logger(gen_logger); + if (exec != exec->get_master()) { + exec->get_master()->add_logger(gen_logger); + } + + auto precond = precond_factory.at(precond_name)(exec); + solver = generate_solver(exec, give(precond), solver_name, + FLAGS_max_iters) + ->generate(system_matrix); + + exec->remove_logger(gko::lend(gen_logger)); + if (exec != exec->get_master()) { + exec->get_master()->remove_logger(gko::lend(gen_logger)); + } + gen_logger->write_data(solver_json["generate"]["components"], + allocator, 1); + + if (auto prec = + dynamic_cast(lend(solver))) { + add_or_set_member(solver_json, "preconditioner", + rapidjson::Value(rapidjson::kObjectType), + allocator); + write_precond_info( + lend(clone(exec->get_master(), prec->get_preconditioner())), + solver_json["preconditioner"], allocator); + } + + auto apply_logger = + std::make_shared(FLAGS_nested_names); + exec->add_logger(apply_logger); + if (exec != exec->get_master()) { + exec->get_master()->add_logger(apply_logger); + } + + + solver->apply(lend(b), lend(x_clone)); + + exec->remove_logger(gko::lend(apply_logger)); + if (exec != exec->get_master()) { + exec->get_master()->remove_logger(gko::lend(apply_logger)); + } + apply_logger->write_data(solver_json["apply"]["components"], + allocator, 1); + + // slow run, gets the recurrent and true residuals of each iteration + if (b->get_size()[1] == 1) { + x_clone = clone(x); + auto res_logger = std::make_shared>( + lend(system_matrix), b, solver_json["recurrent_residuals"], + solver_json["true_residuals"], + solver_json["implicit_residuals"], + solver_json["iteration_timestamps"], allocator); + solver->add_logger(res_logger); + solver->apply(lend(b), lend(x_clone)); + if (!res_logger->has_implicit_res_norms()) { + solver_json.RemoveMember("implicit_residuals"); + } + } + exec->synchronize(); + } + + // timed run + auto it_logger = std::make_shared(); + auto generate_timer = get_timer(exec, FLAGS_gpu_timer); + auto apply_timer = ic.get_timer(); + auto x_clone = clone(x); + for (auto status : ic.run(false)) { + x_clone = clone(x); + + exec->synchronize(); + generate_timer->tic(); + auto precond = precond_factory.at(precond_name)(exec); + solver = generate_solver(exec, give(precond), solver_name, + FLAGS_max_iters) + ->generate(system_matrix); + generate_timer->toc(); + + exec->synchronize(); + if (ic.get_num_repetitions() == 0) { + solver->add_logger(it_logger); + } + apply_timer->tic(); + solver->apply(lend(b), lend(x_clone)); + apply_timer->toc(); + if (ic.get_num_repetitions() == 0) { + solver->remove_logger(gko::lend(it_logger)); + } + } + it_logger->write_data(solver_json["apply"], allocator); + + if (b->get_size()[1] == 1 && !FLAGS_overhead) { + // a solver is considered direct if it didn't log any iterations + if (solver_json["apply"].HasMember("iterations") && + solver_json["apply"]["iterations"].GetInt() == 0) { + auto error = + compute_direct_error(lend(solver), lend(b), lend(x_clone)); + add_or_set_member(solver_json, "forward_error", error, + allocator); + } + auto residual = compute_residual_norm(lend(system_matrix), lend(b), + lend(x_clone)); + add_or_set_member(solver_json, "residual_norm", residual, + allocator); + } + add_or_set_member(solver_json["generate"], "time", + generate_timer->compute_average_time(), allocator); + add_or_set_member(solver_json["apply"], "time", + apply_timer->compute_average_time(), allocator); + add_or_set_member(solver_json, "repetitions", + apply_timer->get_num_repetitions(), allocator); + + // compute and write benchmark data + add_or_set_member(solver_json, "completed", true, allocator); + } catch (const std::exception& e) { + add_or_set_member(test_case["solver"][precond_solver_name], "completed", + false, allocator); + if (FLAGS_keep_errors) { + rapidjson::Value msg_value; + msg_value.SetString(e.what(), allocator); + add_or_set_member(test_case["solver"][precond_solver_name], "error", + msg_value, allocator); + } + std::cerr << "Error when processing test case " << test_case << "\n" + << "what(): " << e.what() << std::endl; + } +} + + +template +void run_solver_benchmarks(std::shared_ptr exec, + rapidjson::Document& test_cases, + const SystemGenerator& system_generator, + bool do_print) +{ + auto solvers = split(FLAGS_solvers, ','); + auto preconds = split(FLAGS_preconditioners, ','); + std::vector precond_solvers; + for (const auto& s : solvers) { + for (const auto& p : preconds) { + precond_solvers.push_back(s + (p == "none" ? "" : "-" + p)); + } + } + + auto& allocator = test_cases.GetAllocator(); + + for (auto& test_case : test_cases.GetArray()) { + try { + // set up benchmark + validate_option_object(test_case); + if (!test_case.HasMember("solver")) { + test_case.AddMember("solver", + rapidjson::Value(rapidjson::kObjectType), + allocator); + } + auto& solver_case = test_case["solver"]; + if (!FLAGS_overwrite && + all_of(begin(precond_solvers), end(precond_solvers), + [&solver_case](const std::string& s) { + return solver_case.HasMember(s.c_str()); + })) { + continue; + } + if (do_print) { + std::clog << "Running test case: " << test_case << std::endl; + } + + using Vec = typename SystemGenerator::Vec; + std::shared_ptr system_matrix; + std::unique_ptr b; + std::unique_ptr x; + if (FLAGS_overhead) { + system_matrix = system_generator.initialize({1.0}, exec); + b = system_generator.initialize( + {std::numeric_limits::quiet_NaN()}, exec); + x = system_generator.initialize({0.0}, exec); + } else { + system_matrix = + system_generator.generate_matrix_with_optimal_format( + exec, test_case); + b = system_generator.generate_rhs(exec, system_matrix.get(), + test_case); + x = system_generator.generate_initial_guess( + exec, system_matrix.get(), b.get()); + } + + if (do_print) { + std::clog << "Matrix is of size (" + << system_matrix->get_size()[0] << ", " + << system_matrix->get_size()[1] << ")" << std::endl; + } + add_or_set_member(test_case, "size", system_matrix->get_size()[0], + allocator); + auto precond_solver_name = begin(precond_solvers); + for (const auto& solver_name : solvers) { + for (const auto& precond_name : preconds) { + if (do_print) { + std::clog + << "\tRunning solver: " << *precond_solver_name + << std::endl; + } + solve_system(solver_name, precond_name, + precond_solver_name->c_str(), exec, + system_matrix, lend(b), lend(x), test_case, + allocator); + if (do_print) { + backup_results(test_cases); + } + ++precond_solver_name; + } + } + } catch (const std::exception& e) { + std::cerr << "Error setting up solver, what(): " << e.what() + << std::endl; + } + } +} + + +#endif // GINKGO_BENCHMARK_SOLVER_SOLVER_COMMON_HPP diff --git a/benchmark/sparse_blas/sparse_blas.cpp b/benchmark/sparse_blas/sparse_blas.cpp index adc75e03ae5..3adde1b348c 100644 --- a/benchmark/sparse_blas/sparse_blas.cpp +++ b/benchmark/sparse_blas/sparse_blas.cpp @@ -34,10 +34,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include -#include -#include #include -#include #include #include #include @@ -45,12 +42,11 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include #include -#include #include "benchmark/utils/general.hpp" -#include "benchmark/utils/loggers.hpp" -#include "benchmark/utils/spmv_common.hpp" +#include "benchmark/utils/generator.hpp" +#include "benchmark/utils/spmv_validation.hpp" #include "benchmark/utils/types.hpp" #include "core/test/utils/matrix_generator.hpp" @@ -58,7 +54,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. const auto benchmark_name = "sparse_blas"; -using itype = gko::int32; using Mtx = gko::matrix::Csr; using mat_data = gko::matrix_data; @@ -454,9 +449,7 @@ int main(int argc, char* argv[]) std::string header = "A benchmark for measuring performance of Ginkgo's sparse BLAS " "operations.\n"; - std::string format = std::string() + " [\n" + - " { \"filename\": \"my_file.mtx\"},\n" + - " { \"filename\": \"my_file2.mtx\"}\n" + " ]\n\n"; + std::string format = example_config; initialize_argument_parsing(&argc, &argv, header, format); auto exec = executor_factory.at(FLAGS_executor)(FLAGS_gpu_timer); @@ -487,12 +480,13 @@ int main(int argc, char* argv[]) } auto& sp_blas_case = test_case[benchmark_name]; std::clog << "Running test case: " << test_case << std::endl; - std::ifstream mtx_fd(test_case["filename"].GetString()); - auto data = gko::read_generic_raw(mtx_fd); + auto data = + DefaultSystemGenerator<>::generate_matrix_data(test_case); data.ensure_row_major_order(); std::clog << "Matrix is of size (" << data.size[0] << ", " << data.size[1] << "), " << data.nonzeros.size() << std::endl; + add_or_set_member(test_case, "size", data.size[0], allocator); for (const auto& strategy_name : strategies) { auto mtx = Mtx::create(exec, data.size, data.nonzeros.size(), diff --git a/benchmark/spmv/CMakeLists.txt b/benchmark/spmv/CMakeLists.txt index 6c5d10517a8..1e3bab1c884 100644 --- a/benchmark/spmv/CMakeLists.txt +++ b/benchmark/spmv/CMakeLists.txt @@ -1 +1,4 @@ ginkgo_add_typed_benchmark_executables(spmv "YES" spmv.cpp) +if(GINKGO_BUILD_MPI) + add_subdirectory(distributed) +endif() diff --git a/benchmark/spmv/distributed/CMakeLists.txt b/benchmark/spmv/distributed/CMakeLists.txt new file mode 100644 index 00000000000..cadde3eea34 --- /dev/null +++ b/benchmark/spmv/distributed/CMakeLists.txt @@ -0,0 +1 @@ +ginkgo_add_typed_benchmark_executables(spmv-distributed "YES" spmv.cpp) diff --git a/benchmark/spmv/distributed/spmv.cpp b/benchmark/spmv/distributed/spmv.cpp new file mode 100644 index 00000000000..03530b139db --- /dev/null +++ b/benchmark/spmv/distributed/spmv.cpp @@ -0,0 +1,139 @@ +/************************************************************* +Copyright (c) 2017-2023, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include + + +#include +#include +#include +#include +#include +#include +#include +#include + + +#include "benchmark/spmv/spmv_common.hpp" +#include "benchmark/utils/general.hpp" +#include "benchmark/utils/generator.hpp" +#include "benchmark/utils/timer.hpp" +#include "benchmark/utils/types.hpp" + + +DEFINE_string(local_formats, "csr", + "A comma-separated list of formats for the local matrix to run. " + "See the 'formats' option for a list of supported versions"); +DEFINE_string(non_local_formats, "csr", + "A comma-separated list of formats for the non-local matrix to " + "run. See the 'formats' option for a list of supported versions"); + + +std::string example_config = R"( + [ + {"size": 100, "stencil": "7pt", "comm_pattern": "stencil"}, + {"filename": "my_file.mtx"} + ] +)"; + + +[[noreturn]] void print_config_error_and_exit() +{ + std::cerr << "Input has to be a JSON array of matrix configurations:\n" + << example_config << std::endl; + std::exit(1); +} + + +struct Generator : DistributedDefaultSystemGenerator> { + Generator(gko::experimental::mpi::communicator comm) + : DistributedDefaultSystemGenerator>{ + std::move(comm), {}} + {} + + void validate_options(const rapidjson::Value& options) const + { + if (!options.IsObject() || + !((options.HasMember("size") && options.HasMember("stencil") && + options.HasMember("comm_pattern")) || + options.HasMember("filename"))) { + print_config_error_and_exit(); + } + } +}; + + +int main(int argc, char* argv[]) +{ + gko::experimental::mpi::environment mpi_env{argc, argv}; + + const auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); + const auto rank = comm.rank(); + + std::string header = + "A benchmark for measuring performance of Ginkgo's spmv.\n"; + std::string format = example_config; + initialize_argument_parsing(&argc, &argv, header, format); + + if (rank == 0) { + std::string extra_information = "The formats are [" + + FLAGS_local_formats + "]x[" + + FLAGS_non_local_formats + "]\n" + + "The number of right hand sides is " + + std::to_string(FLAGS_nrhs) + "\n"; + print_general_information(extra_information); + } + + auto exec = executor_factory_mpi.at(FLAGS_executor)(comm.get()); + + auto local_formats = split(FLAGS_local_formats, ','); + auto non_local_formats = split(FLAGS_non_local_formats, ','); + std::vector formats; + for (const auto& local_fmt : local_formats) { + for (const auto& non_local_fmt : non_local_formats) { + formats.push_back(local_fmt + "-" + non_local_fmt); + } + } + + std::string json_input = broadcast_json_input(std::cin, comm); + rapidjson::Document test_cases; + test_cases.Parse(json_input.c_str()); + if (!test_cases.IsArray()) { + print_config_error_and_exit(); + } + + run_spmv_benchmark(exec, test_cases, formats, Generator{comm}, rank == 0); + + if (rank == 0) { + std::cout << test_cases << std::endl; + } +} diff --git a/benchmark/spmv/spmv.cpp b/benchmark/spmv/spmv.cpp index 9178cbd2a08..7090e978956 100644 --- a/benchmark/spmv/spmv.cpp +++ b/benchmark/spmv/spmv.cpp @@ -33,146 +33,37 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include -#include -#include #include -#include -#include -#include #include -#include +#include "benchmark/spmv/spmv_common.hpp" #include "benchmark/utils/formats.hpp" #include "benchmark/utils/general.hpp" -#include "benchmark/utils/loggers.hpp" -#include "benchmark/utils/spmv_common.hpp" -#include "benchmark/utils/timer.hpp" -#include "benchmark/utils/types.hpp" - - -#ifdef GINKGO_BENCHMARK_ENABLE_TUNING -#include "benchmark/utils/tuning_variables.hpp" -#endif // GINKGO_BENCHMARK_ENABLE_TUNING - - -// Command-line arguments -DEFINE_uint32(nrhs, 1, "The number of right hand sides"); - - -// This function supposes that management of `FLAGS_overwrite` is done before -// calling it -void apply_spmv(const char* format_name, std::shared_ptr exec, - const gko::matrix_data& data, const vec* b, - const vec* x, const vec* answer, - rapidjson::Value& test_case, - rapidjson::MemoryPoolAllocator<>& allocator) -{ - try { - auto& spmv_case = test_case["spmv"]; - add_or_set_member(spmv_case, format_name, - rapidjson::Value(rapidjson::kObjectType), allocator); - - auto storage_logger = std::make_shared(); - exec->add_logger(storage_logger); - auto system_matrix = - share(formats::matrix_factory.at(format_name)(exec, data)); - - exec->remove_logger(gko::lend(storage_logger)); - storage_logger->write_data(spmv_case[format_name], allocator); - // check the residual - if (FLAGS_detailed) { - auto x_clone = clone(x); - exec->synchronize(); - system_matrix->apply(lend(b), lend(x_clone)); - exec->synchronize(); - auto max_relative_norm2 = - compute_max_relative_norm2(lend(x_clone), lend(answer)); - add_or_set_member(spmv_case[format_name], "max_relative_norm2", - max_relative_norm2, allocator); - } - - IterationControl ic{get_timer(exec, FLAGS_gpu_timer)}; - // warm run - for (auto _ : ic.warmup_run()) { - auto x_clone = clone(x); - exec->synchronize(); - system_matrix->apply(lend(b), lend(x_clone)); - exec->synchronize(); - } - - // tuning run -#ifdef GINKGO_BENCHMARK_ENABLE_TUNING - auto& format_case = spmv_case[format_name]; - if (!format_case.HasMember("tuning")) { - format_case.AddMember( - "tuning", rapidjson::Value(rapidjson::kObjectType), allocator); +#include "benchmark/utils/generator.hpp" +#include "benchmark/utils/spmv_validation.hpp" + + +struct Generator : DefaultSystemGenerator<> { + void validate_options(const rapidjson::Value& options) const + { + if (!options.IsObject() || + !((options.HasMember("size") && options.HasMember("stencil")) || + options.HasMember("filename"))) { + std::cerr + << "Input has to be a JSON array of matrix configurations:\n" + << example_config << std::endl; + std::exit(1); } - auto& tuning_case = format_case["tuning"]; - add_or_set_member(tuning_case, "time", - rapidjson::Value(rapidjson::kArrayType), allocator); - add_or_set_member(tuning_case, "values", - rapidjson::Value(rapidjson::kArrayType), allocator); - - // Enable tuning for this portion of code - gko::_tuning_flag = true; - // Select some values we want to tune. - std::vector tuning_values{ - 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096}; - for (auto val : tuning_values) { - // Actually set the value that will be tuned. See - // cuda/components/format_conversion.cuh for an example of how this - // variable is used. - gko::_tuned_value = val; - auto tuning_timer = get_timer(exec, FLAGS_gpu_timer); - IterationControl ic_tuning{tuning_timer}; - auto x_clone = clone(x); - for (auto _ : ic_tuning.run()) { - system_matrix->apply(lend(b), lend(x_clone)); - } - tuning_case["time"].PushBack(ic_tuning.compute_average_time(), - allocator); - tuning_case["values"].PushBack(val, allocator); - } - // We put back the flag to false to use the default (non-tuned) values - // for the following - gko::_tuning_flag = false; -#endif // GINKGO_BENCHMARK_ENABLE_TUNING - - // timed run - auto x_clone = clone(x); - for (auto _ : ic.run()) { - system_matrix->apply(lend(b), lend(x_clone)); - } - add_or_set_member(spmv_case[format_name], "time", - ic.compute_average_time(), allocator); - add_or_set_member(spmv_case[format_name], "repetitions", - ic.get_num_repetitions(), allocator); - - // compute and write benchmark data - add_or_set_member(spmv_case[format_name], "completed", true, allocator); - } catch (const std::exception& e) { - add_or_set_member(test_case["spmv"][format_name], "completed", false, - allocator); - if (FLAGS_keep_errors) { - rapidjson::Value msg_value; - msg_value.SetString(e.what(), allocator); - add_or_set_member(test_case["spmv"][format_name], "error", - msg_value, allocator); - } - std::cerr << "Error when processing test case " << test_case << "\n" - << "what(): " << e.what() << std::endl; } -} +}; int main(int argc, char* argv[]) { std::string header = "A benchmark for measuring performance of Ginkgo's spmv.\n"; - std::string format = std::string() + " [\n" + - " { \"filename\": \"my_file.mtx\"},\n" + - " { \"filename\": \"my_file2.mtx\"}\n" + " ]\n\n"; + std::string format = example_config; initialize_argument_parsing(&argc, &argv, header, format); std::string extra_information = "The formats are " + FLAGS_formats + @@ -181,7 +72,6 @@ int main(int argc, char* argv[]) print_general_information(extra_information); auto exec = executor_factory.at(FLAGS_executor)(FLAGS_gpu_timer); - auto engine = get_engine(); auto formats = split(FLAGS_formats, ','); rapidjson::IStreamWrapper jcin(std::cin); @@ -191,80 +81,7 @@ int main(int argc, char* argv[]) print_config_error_and_exit(); } - auto& allocator = test_cases.GetAllocator(); - - for (auto& test_case : test_cases.GetArray()) { - try { - // set up benchmark - validate_option_object(test_case); - if (!test_case.HasMember("spmv")) { - test_case.AddMember("spmv", - rapidjson::Value(rapidjson::kObjectType), - allocator); - } - auto& spmv_case = test_case["spmv"]; - if (!FLAGS_overwrite && - all_of(begin(formats), end(formats), - [&spmv_case](const std::string& s) { - return spmv_case.HasMember(s.c_str()); - })) { - continue; - } - std::clog << "Running test case: " << test_case << std::endl; - std::ifstream mtx_fd(test_case["filename"].GetString()); - auto data = gko::read_generic_raw(mtx_fd); - - auto nrhs = FLAGS_nrhs; - auto b = create_matrix(exec, gko::dim<2>{data.size[1], nrhs}, - engine); - auto x = create_matrix(exec, gko::dim<2>{data.size[0], nrhs}, - engine); - std::clog << "Matrix is of size (" << data.size[0] << ", " - << data.size[1] << ")" << std::endl; - std::string best_format("none"); - auto best_performance = 0.0; - if (!test_case.HasMember("optimal")) { - test_case.AddMember("optimal", - rapidjson::Value(rapidjson::kObjectType), - allocator); - } - - // Compute the result from ginkgo::coo as the correct answer - auto answer = vec::create(exec); - if (FLAGS_detailed) { - auto system_matrix = - share(formats::matrix_factory.at("coo")(exec, data)); - answer->copy_from(lend(x)); - exec->synchronize(); - system_matrix->apply(lend(b), lend(answer)); - exec->synchronize(); - } - for (const auto& format_name : formats) { - apply_spmv(format_name.c_str(), exec, data, lend(b), lend(x), - lend(answer), test_case, allocator); - std::clog << "Current state:" << std::endl - << test_cases << std::endl; - if (spmv_case[format_name.c_str()]["completed"].GetBool()) { - auto performance = - spmv_case[format_name.c_str()]["time"].GetDouble(); - if (best_format == "none" || - performance < best_performance) { - best_format = format_name; - best_performance = performance; - add_or_set_member( - test_case["optimal"], "spmv", - rapidjson::Value(best_format.c_str(), allocator) - .Move(), - allocator); - } - } - backup_results(test_cases); - } - } catch (const std::exception& e) { - std::cerr << "Error setting up matrix data, what(): " << e.what() - << std::endl; - } - } + run_spmv_benchmark(exec, test_cases, formats, Generator{}, true); std::cout << test_cases << std::endl; } diff --git a/benchmark/spmv/spmv_common.hpp b/benchmark/spmv/spmv_common.hpp new file mode 100644 index 00000000000..7ed9854832a --- /dev/null +++ b/benchmark/spmv/spmv_common.hpp @@ -0,0 +1,245 @@ +/************************************************************* +Copyright (c) 2017-2023, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#ifndef GINKGO_BENCHMARK_SPMV_SPMV_COMMON_HPP +#define GINKGO_BENCHMARK_SPMV_SPMV_COMMON_HPP + + +#include "benchmark/utils/formats.hpp" +#include "benchmark/utils/general.hpp" +#include "benchmark/utils/loggers.hpp" +#include "benchmark/utils/timer.hpp" +#include "benchmark/utils/types.hpp" +#ifdef GINKGO_BENCHMARK_ENABLE_TUNING +#include "benchmark/utils/tuning_variables.hpp" +#endif // GINKGO_BENCHMARK_ENABLE_TUNING + + +// Command-line arguments +DEFINE_uint32(nrhs, 1, "The number of right hand sides"); + + +// This function supposes that management of `FLAGS_overwrite` is done before +// calling it +template +void apply_spmv(const char* format_name, std::shared_ptr exec, + const Generator& generator, + const gko::matrix_data& data, + const VectorType* b, const VectorType* x, + const VectorType* answer, rapidjson::Value& test_case, + rapidjson::MemoryPoolAllocator<>& allocator) +{ + try { + auto& spmv_case = test_case["spmv"]; + add_or_set_member(spmv_case, format_name, + rapidjson::Value(rapidjson::kObjectType), allocator); + + auto system_matrix = generator.generate_matrix_with_format( + exec, format_name, data, &spmv_case[format_name], &allocator); + + // check the residual + if (FLAGS_detailed) { + auto x_clone = clone(x); + exec->synchronize(); + system_matrix->apply(lend(b), lend(x_clone)); + exec->synchronize(); + auto max_relative_norm2 = + compute_max_relative_norm2(lend(x_clone), lend(answer)); + add_or_set_member(spmv_case[format_name], "max_relative_norm2", + max_relative_norm2, allocator); + } + + IterationControl ic{get_timer(exec, FLAGS_gpu_timer)}; + // warm run + for (auto _ : ic.warmup_run()) { + auto x_clone = clone(x); + exec->synchronize(); + system_matrix->apply(lend(b), lend(x_clone)); + exec->synchronize(); + } + + // tuning run +#ifdef GINKGO_BENCHMARK_ENABLE_TUNING + auto& format_case = spmv_case[format_name]; + if (!format_case.HasMember("tuning")) { + format_case.AddMember( + "tuning", rapidjson::Value(rapidjson::kObjectType), allocator); + } + auto& tuning_case = format_case["tuning"]; + add_or_set_member(tuning_case, "time", + rapidjson::Value(rapidjson::kArrayType), allocator); + add_or_set_member(tuning_case, "values", + rapidjson::Value(rapidjson::kArrayType), allocator); + + // Enable tuning for this portion of code + gko::_tuning_flag = true; + // Select some values we want to tune. + std::vector tuning_values{ + 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096}; + for (auto val : tuning_values) { + // Actually set the value that will be tuned. See + // cuda/components/format_conversion.cuh for an example of how this + // variable is used. + gko::_tuned_value = val; + auto tuning_timer = get_timer(exec, FLAGS_gpu_timer); + IterationControl ic_tuning{tuning_timer}; + auto x_clone = clone(x); + for (auto _ : ic_tuning.run()) { + system_matrix->apply(lend(b), lend(x_clone)); + } + tuning_case["time"].PushBack(ic_tuning.compute_average_time(), + allocator); + tuning_case["values"].PushBack(val, allocator); + } + // We put back the flag to false to use the default (non-tuned) values + // for the following + gko::_tuning_flag = false; +#endif // GINKGO_BENCHMARK_ENABLE_TUNING + + // timed run + auto x_clone = clone(x); + for (auto _ : ic.run()) { + system_matrix->apply(lend(b), lend(x_clone)); + } + add_or_set_member(spmv_case[format_name], "time", + ic.compute_average_time(), allocator); + add_or_set_member(spmv_case[format_name], "repetitions", + ic.get_num_repetitions(), allocator); + + // compute and write benchmark data + add_or_set_member(spmv_case[format_name], "completed", true, allocator); + } catch (const std::exception& e) { + add_or_set_member(test_case["spmv"][format_name], "completed", false, + allocator); + if (FLAGS_keep_errors) { + rapidjson::Value msg_value; + msg_value.SetString(e.what(), allocator); + add_or_set_member(test_case["spmv"][format_name], "error", + msg_value, allocator); + } + std::cerr << "Error when processing test case " << test_case << "\n" + << "what(): " << e.what() << std::endl; + } +} + + +template +void run_spmv_benchmark(std::shared_ptr exec, + rapidjson::Document& test_cases, + const std::vector formats, + const SystemGenerator& system_generator, bool do_print) +{ + auto& allocator = test_cases.GetAllocator(); + + for (auto& test_case : test_cases.GetArray()) { + try { + // set up benchmark + system_generator.validate_options(test_case); + if (!test_case.HasMember("spmv")) { + test_case.AddMember("spmv", + rapidjson::Value(rapidjson::kObjectType), + allocator); + } + auto& spmv_case = test_case["spmv"]; + if (!FLAGS_overwrite && + all_of(begin(formats), end(formats), + [&spmv_case](const std::string& s) { + return spmv_case.HasMember(s.c_str()); + })) { + continue; + } + if (do_print) { + std::clog << "Running test case: " << test_case << std::endl; + } + auto data = system_generator.generate_matrix_data(test_case); + + auto nrhs = FLAGS_nrhs; + auto b = system_generator.create_multi_vector_random( + exec, gko::dim<2>{data.size[1], nrhs}); + auto x = system_generator.create_multi_vector_random( + exec, gko::dim<2>{data.size[0], nrhs}); + if (do_print) { + std::clog << "Matrix is of size (" << data.size[0] << ", " + << data.size[1] << ")" << std::endl; + } + add_or_set_member(test_case, "size", data.size[0], allocator); + add_or_set_member(test_case, "nnz", data.nonzeros.size(), + allocator); + auto best_performance = std::numeric_limits::max(); + if (!test_case.HasMember("optimal")) { + test_case.AddMember("optimal", + rapidjson::Value(rapidjson::kObjectType), + allocator); + } + + // Compute the result from ginkgo::coo as the correct answer + auto answer = gko::clone(lend(x)); + if (FLAGS_detailed) { + auto system_matrix = + system_generator.generate_matrix_with_default_format(exec, + data); + exec->synchronize(); + system_matrix->apply(lend(b), lend(answer)); + exec->synchronize(); + } + for (const auto& format_name : formats) { + apply_spmv(format_name.c_str(), exec, system_generator, data, + lend(b), lend(x), lend(answer), test_case, + allocator); + if (do_print) { + std::clog << "Current state:" << std::endl + << test_cases << std::endl; + } + if (spmv_case[format_name.c_str()]["completed"].GetBool()) { + auto performance = + spmv_case[format_name.c_str()]["time"].GetDouble(); + if (performance < best_performance) { + best_performance = performance; + add_or_set_member( + test_case["optimal"], "spmv", + rapidjson::Value(format_name.c_str(), allocator) + .Move(), + allocator); + } + } + if (do_print) { + backup_results(test_cases); + } + } + } catch (const std::exception& e) { + std::cerr << "Error setting up matrix data, what(): " << e.what() + << std::endl; + } + } +} + +#endif // GINKGO_BENCHMARK_SPMV_SPMV_COMMON_HPP diff --git a/benchmark/utils/cuda_linops.cu b/benchmark/utils/cuda_linops.cpp similarity index 100% rename from benchmark/utils/cuda_linops.cu rename to benchmark/utils/cuda_linops.cpp diff --git a/benchmark/utils/cuda_timer.cu b/benchmark/utils/cuda_timer.cpp similarity index 100% rename from benchmark/utils/cuda_timer.cu rename to benchmark/utils/cuda_timer.cpp diff --git a/benchmark/utils/distributed_helpers.hpp b/benchmark/utils/distributed_helpers.hpp new file mode 100644 index 00000000000..edbad2a9427 --- /dev/null +++ b/benchmark/utils/distributed_helpers.hpp @@ -0,0 +1,77 @@ +/************************************************************* +Copyright (c) 2017-2023, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#ifndef GINKGO_BENCHMARK_UTILS_DISTRIBUTED_HELPERS_HPP +#define GINKGO_BENCHMARK_UTILS_DISTRIBUTED_HELPERS_HPP + + +#include "benchmark/utils/formats.hpp" +#include "benchmark/utils/general.hpp" +#include "benchmark/utils/loggers.hpp" +#include "benchmark/utils/stencil_matrix.hpp" + + +using global_itype = gko::int64; + + +template +using dist_vec = gko::experimental::distributed::Vector; +template +using dist_mtx = + gko::experimental::distributed::Matrix; + + +std::string broadcast_json_input(std::istream& is, + gko::experimental::mpi::communicator comm) +{ + auto exec = gko::ReferenceExecutor::create(); + + std::string json_input; + if (comm.rank() == 0) { + std::string line; + while (is >> line) { + json_input += line; + } + } + + auto input_size = json_input.size(); + comm.broadcast(exec->get_master(), &input_size, 1, 0); + json_input.resize(input_size); + comm.broadcast(exec->get_master(), &json_input[0], + static_cast(input_size), 0); + + return json_input; +} + + +#endif // GINKGO_BENCHMARK_UTILS_DISTRIBUTED_HELPERS_HPP diff --git a/benchmark/utils/formats.hpp b/benchmark/utils/formats.hpp index 00d0aced282..87c8eac208e 100644 --- a/benchmark/utils/formats.hpp +++ b/benchmark/utils/formats.hpp @@ -53,7 +53,7 @@ namespace formats { std::string available_format = - "coo, csr, ell, ell-mixed, sellp, hybrid, hybrid0, hybrid25, hybrid33, " + "coo, csr, ell, ell_mixed, sellp, hybrid, hybrid0, hybrid25, hybrid33, " "hybrid40, " "hybrid60, hybrid80, hybridlimit0, hybridlimit25, hybridlimit33, " "hybridminstorage" @@ -84,7 +84,7 @@ std::string format_description = "csrs: Ginkgo's CSR implementation with sparselib strategy.\n" "ell: Ellpack format according to Bell and Garland: Efficient Sparse\n" " Matrix-Vector Multiplication on CUDA.\n" - "ell-mixed: Mixed Precision Ellpack format according to Bell and Garland:\n" + "ell_mixed: Mixed Precision Ellpack format according to Bell and Garland:\n" " Efficient Sparse Matrix-Vector Multiplication on CUDA.\n" "sellp: Sliced Ellpack uses a default block size of 32.\n" "hybrid: Hybrid uses ELL and COO to represent the matrix.\n" @@ -158,50 +158,9 @@ namespace formats { // some shortcuts using hybrid = gko::matrix::Hybrid; using csr = gko::matrix::Csr; - -/** - * Creates a Ginkgo matrix from the intermediate data representation format - * gko::matrix_data. - * - * @param exec the executor where the matrix will be put - * @param data the data represented in the intermediate representation format - * - * @tparam MatrixType the Ginkgo matrix type (such as `gko::matrix::Csr<>`) - * - * @return a `unique_pointer` to the created matrix - */ -template -std::unique_ptr read_matrix_from_data( - std::shared_ptr exec, - const gko::matrix_data& data) -{ - auto mat = MatrixType::create(std::move(exec)); - mat->read(data); - return mat; -} - - -/** - * Creates a Ginkgo sparselib matrix from the intermediate data representation - * format gko::matrix_data. - * - * @param exec the executor where the matrix will be put - * @param data the data represented in the intermediate representation format - * - * @tparam MatrixTagType the tag type for the matrix format, see - * sparselib_linops.hpp - * - * @return a `unique_pointer` to the created matrix - */ -template -std::unique_ptr read_splib_matrix_from_data( - std::shared_ptr exec, - const gko::matrix_data& data) -{ - auto mat = create_sparselib_linop(std::move(exec)); - gko::as>(mat.get())->read(data); - return mat; -} +using coo = gko::matrix::Coo; +using ell = gko::matrix::Ell; +using ell_mixed = gko::matrix::Ell, itype>; /** @@ -218,6 +177,9 @@ std::shared_ptr create_gpu_strategy( return std::make_shared(cuda->shared_from_this()); } else if (auto hip = dynamic_cast(exec.get())) { return std::make_shared(hip->shared_from_this()); + } else if (auto dpcpp = + dynamic_cast(exec.get())) { + return std::make_shared(dpcpp->shared_from_this()); } else { return std::make_shared(); } @@ -248,129 +210,122 @@ void check_ell_admissibility(const gko::matrix_data& data) } -/** - * Creates a Ginkgo matrix from the intermediate data representation format - * gko::matrix_data with support for variable arguments. - * - * @param MATRIX_TYPE the Ginkgo matrix type (such as `gko::matrix::Csr<>`) - */ -#define READ_MATRIX(MATRIX_TYPE, ...) \ - [](std::shared_ptr exec, \ - const gko::matrix_data& data) \ - -> std::unique_ptr { \ - auto mat = MATRIX_TYPE::create(std::move(exec), __VA_ARGS__); \ - mat->read(data); \ - return mat; \ - } +template +auto create_matrix_type(Args&&... args) +{ + return [&](std::shared_ptr exec) + -> std::unique_ptr { + return MatrixType::create(std::move(exec), std::forward(args)...); + }; +} + + +template +auto create_matrix_type_with_gpu_strategy() +{ + return [&](std::shared_ptr exec) + -> std::unique_ptr { + return MatrixType::create(exec, create_gpu_strategy(exec)); + }; +} // clang-format off const std::map( - std::shared_ptr, - const gko::matrix_data &)>> - matrix_factory{ - {"csr", - [](std::shared_ptr exec, - const gko::matrix_data &data) -> std::unique_ptr { - auto mat = - csr::create(exec, create_gpu_strategy(exec)); - mat->read(data); - return mat; - }}, - {"csri", - [](std::shared_ptr exec, - const gko::matrix_data &data) -> std::unique_ptr { - auto mat = csr::create( - exec, create_gpu_strategy(exec)); - mat->read(data); - return mat; - }}, - {"csrm", READ_MATRIX(csr, std::make_shared())}, - {"csrc", READ_MATRIX(csr, std::make_shared())}, - {"csrs", READ_MATRIX(csr, std::make_shared())}, - {"coo", read_matrix_from_data>}, - {"ell", [](std::shared_ptr exec, - const gko::matrix_data &data) { - check_ell_admissibility(data); - auto mat = gko::matrix::Ell::create(exec); - mat->read(data); - return mat; - }}, - {"ell-mixed", - [](std::shared_ptr exec, - const gko::matrix_data &data) { - check_ell_admissibility(data); - gko::matrix_data, itype> conv_data; - conv_data.size = data.size; - conv_data.nonzeros.resize(data.nonzeros.size()); - auto it = conv_data.nonzeros.begin(); - for (auto &el : data.nonzeros) { - it->row = el.row; - it->column = el.column; - it->value = el.value; - ++it; - } - auto mat = gko::matrix::Ell, itype>::create( - std::move(exec)); - mat->read(conv_data); - return mat; - }}, + std::shared_ptr)>> + matrix_type_factory{ + {"csr", create_matrix_type_with_gpu_strategy()}, + {"csri", create_matrix_type_with_gpu_strategy()}, + {"csrm", create_matrix_type(std::make_shared())}, + {"csrc", create_matrix_type(std::make_shared())}, + {"csrs", create_matrix_type(std::make_shared())}, + {"coo", create_matrix_type()}, + {"ell", create_matrix_type()}, + {"ell_mixed", create_matrix_type()}, #ifdef HAS_CUDA - {"cusparse_csr", read_splib_matrix_from_data}, - {"cusparse_csrmp", read_splib_matrix_from_data}, - {"cusparse_csrmm", read_splib_matrix_from_data}, - {"cusparse_hybrid", read_splib_matrix_from_data}, - {"cusparse_coo", read_splib_matrix_from_data}, - {"cusparse_ell", read_splib_matrix_from_data}, - {"cusparse_csr", read_splib_matrix_from_data}, - {"cusparse_coo", read_splib_matrix_from_data}, - {"cusparse_csrex", read_splib_matrix_from_data}, - {"cusparse_gcsr", read_splib_matrix_from_data}, - {"cusparse_gcsr2", read_splib_matrix_from_data}, - {"cusparse_gcoo", read_splib_matrix_from_data}, + {"cusparse_csr", create_sparselib_linop}, + {"cusparse_csrmp", create_sparselib_linop}, + {"cusparse_csrmm", create_sparselib_linop}, + {"cusparse_hybrid", create_sparselib_linop}, + {"cusparse_coo", create_sparselib_linop}, + {"cusparse_ell", create_sparselib_linop}, + {"cusparse_csr", create_sparselib_linop}, + {"cusparse_coo", create_sparselib_linop}, + {"cusparse_csrex", create_sparselib_linop}, + {"cusparse_gcsr", create_sparselib_linop}, + {"cusparse_gcsr2", create_sparselib_linop}, + {"cusparse_gcoo", create_sparselib_linop}, #endif // HAS_CUDA #ifdef HAS_HIP - {"hipsparse_csr", read_splib_matrix_from_data}, - {"hipsparse_csrmm", read_splib_matrix_from_data}, - {"hipsparse_hybrid", read_splib_matrix_from_data}, - {"hipsparse_coo", read_splib_matrix_from_data}, - {"hipsparse_ell", read_splib_matrix_from_data}, + {"hipsparse_csr", create_sparselib_linop}, + {"hipsparse_csrmm", create_sparselib_linop}, + {"hipsparse_hybrid", create_sparselib_linop}, + {"hipsparse_coo", create_sparselib_linop}, + {"hipsparse_ell", create_sparselib_linop}, #endif // HAS_HIP #ifdef HAS_DPCPP - {"onemkl_csr", read_splib_matrix_from_data}, - {"onemkl_optimized_csr", read_splib_matrix_from_data}, + {"onemkl_csr", create_sparselib_linop}, + {"onemkl_optimized_csr", create_sparselib_linop}, #endif // HAS_DPCPP - {"hybrid", read_matrix_from_data}, - {"hybrid0", - READ_MATRIX(hybrid, std::make_shared(0))}, - {"hybrid25", - READ_MATRIX(hybrid, std::make_shared(0.25))}, + {"hybrid", create_matrix_type()}, + {"hybrid0",create_matrix_type( std::make_shared(0))}, + {"hybrid25",create_matrix_type( std::make_shared(0.25))}, {"hybrid33", - READ_MATRIX(hybrid, + create_matrix_type( std::make_shared(1.0 / 3.0))}, {"hybrid40", - READ_MATRIX(hybrid, std::make_shared(0.4))}, + create_matrix_type( std::make_shared(0.4))}, {"hybrid60", - READ_MATRIX(hybrid, std::make_shared(0.6))}, + create_matrix_type( std::make_shared(0.6))}, {"hybrid80", - READ_MATRIX(hybrid, std::make_shared(0.8))}, + create_matrix_type( std::make_shared(0.8))}, {"hybridlimit0", - READ_MATRIX(hybrid, + create_matrix_type( std::make_shared(0))}, {"hybridlimit25", - READ_MATRIX(hybrid, + create_matrix_type( std::make_shared(0.25))}, {"hybridlimit33", - READ_MATRIX(hybrid, std::make_shared( + create_matrix_type( std::make_shared( 1.0 / 3.0))}, {"hybridminstorage", - READ_MATRIX(hybrid, + create_matrix_type( std::make_shared())}, - {"sellp", read_matrix_from_data>} + {"sellp", create_matrix_type>()} }; // clang-format on +std::unique_ptr matrix_factory( + const std::string& format, std::shared_ptr exec, + const gko::matrix_data& data) +{ + auto mat = matrix_type_factory.at(format)(exec); + if (format == "ell" || format == "ell_mixed") { + check_ell_admissibility(data); + } + if (format == "ell_mixed") { + gko::matrix_data, itype> conv_data; + conv_data.size = data.size; + conv_data.nonzeros.resize(data.nonzeros.size()); + auto it = conv_data.nonzeros.begin(); + for (auto& el : data.nonzeros) { + it->row = el.row; + it->column = el.column; + it->value = el.value; + ++it; + } + gko::as, itype>>( + mat.get()) + ->read(conv_data); + } else { + gko::as>(mat.get())->read( + data); + } + return mat; +} + + } // namespace formats #endif // GKO_BENCHMARK_UTILS_FORMATS_HPP_ diff --git a/benchmark/utils/general.hpp b/benchmark/utils/general.hpp index 5de8a097e4e..8ad2fdc09c8 100644 --- a/benchmark/utils/general.hpp +++ b/benchmark/utils/general.hpp @@ -60,6 +60,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "benchmark/utils/timer.hpp" #include "benchmark/utils/types.hpp" +#include "core/distributed/helpers.hpp" // Global command-line arguments @@ -137,12 +138,12 @@ void initialize_argument_parsing(int* argc, char** argv[], std::string& header, "format,\n" << " but with test cases extended to include an additional member " "\n" - << " object for each solver run in the benchmark.\n" + << " object for each benchmark run.\n" << " If run with a --backup flag, an intermediate result is " "written \n" << " to a file in the same format. The backup file can be used as " "\n" - << " input \n to this test suite, and the benchmarking will \n" + << " input to this test suite, and the benchmarking will \n" << " continue from the point where the backup file was created."; gflags::SetUsageMessage(doc.str()); @@ -178,23 +179,6 @@ void print_general_information(const std::string& extra) } -/** - * Creates a Ginkgo matrix from an input file. - * - * @param exec the executor where the matrix will be put - * @param options should contain a `filename` option with the input file string - * - * @tparam MatrixType the Ginkgo matrix type (such as `gko::matrix::Csr<>`) - */ -template -std::unique_ptr read_matrix( - std::shared_ptr exec, const rapidjson::Value& options) -{ - return gko::read(std::ifstream(options["filename"].GetString()), - std::move(exec)); -} - - // Returns a random number engine std::default_random_engine& get_engine() { @@ -315,6 +299,50 @@ const std::map(bool)>> }}}; +#if GINKGO_BUILD_MPI + + +const std::map(MPI_Comm)>> + executor_factory_mpi{ + {"reference", + [](MPI_Comm) { return gko::ReferenceExecutor::create(); }}, + {"omp", [](MPI_Comm) { return gko::OmpExecutor::create(); }}, + {"cuda", + [](MPI_Comm comm) { + FLAGS_device_id = gko::experimental::mpi::map_rank_to_device_id( + comm, gko::CudaExecutor::get_num_devices()); + return gko::CudaExecutor::create( + FLAGS_device_id, gko::ReferenceExecutor::create(), false, + gko::allocation_mode::device); + }}, + {"hip", + [](MPI_Comm comm) { + FLAGS_device_id = gko::experimental::mpi::map_rank_to_device_id( + comm, gko::HipExecutor::get_num_devices()); + return gko::HipExecutor::create( + FLAGS_device_id, gko::ReferenceExecutor::create(), true); + }}, + {"dpcpp", [](MPI_Comm comm) { + if (gko::DpcppExecutor::get_num_devices("gpu")) { + FLAGS_device_id = + gko::experimental::mpi::map_rank_to_device_id( + comm, gko::DpcppExecutor::get_num_devices("gpu")); + } else if (gko::DpcppExecutor::get_num_devices("cpu")) { + FLAGS_device_id = + gko::experimental::mpi::map_rank_to_device_id( + comm, gko::DpcppExecutor::get_num_devices("cpu")); + } else { + GKO_NOT_IMPLEMENTED; + } + return gko::DpcppExecutor::create( + FLAGS_device_id, gko::ReferenceExecutor::create()); + }}}; + + +#endif + + // returns the appropriate executor, as set by the executor flag std::shared_ptr get_executor(bool use_gpu_timer) { @@ -365,55 +393,6 @@ create_matrix_sin(std::shared_ptr exec, gko::dim<2> size) return res; } - -template -std::unique_ptr> create_matrix( - std::shared_ptr exec, gko::dim<2> size, - ValueType value) -{ - auto res = vec::create(exec); - res->read(gko::matrix_data(size, value)); - return res; -} - - -// creates a random matrix -template -std::unique_ptr> create_matrix( - std::shared_ptr exec, gko::dim<2> size, - RandomEngine& engine) -{ - auto res = vec::create(exec); - res->read(gko::matrix_data( - size, - std::uniform_real_distribution>(-1.0, - 1.0), - engine)); - return res; -} - - -// creates a zero vector -template -std::unique_ptr> create_vector( - std::shared_ptr exec, gko::size_type size) -{ - auto res = vec::create(exec); - res->read(gko::matrix_data(gko::dim<2>{size, 1})); - return res; -} - - -// creates a random vector -template -std::unique_ptr> create_vector( - std::shared_ptr exec, gko::size_type size, - RandomEngine& engine) -{ - return create_matrix(exec, gko::dim<2>{size, 1}, engine); -} - - // utilities for computing norms and residuals template ValueType get_norm(const vec* norm) @@ -422,8 +401,9 @@ ValueType get_norm(const vec* norm) } -template -gko::remove_complex compute_norm2(const vec* b) +template +gko::remove_complex compute_norm2(const VectorType* b) { auto exec = b->get_executor(); auto b_norm = @@ -433,10 +413,11 @@ gko::remove_complex compute_norm2(const vec* b) } -template +template gko::remove_complex compute_direct_error(const gko::LinOp* solver, - const vec* b, - const vec* x) + const VectorType* b, + const VectorType* x) { auto ref_exec = gko::ReferenceExecutor::create(); auto exec = solver->get_executor(); @@ -449,10 +430,10 @@ gko::remove_complex compute_direct_error(const gko::LinOp* solver, } -template +template gko::remove_complex compute_residual_norm( - const gko::LinOp* system_matrix, const vec* b, - const vec* x) + const gko::LinOp* system_matrix, const VectorType* b, const VectorType* x) { auto exec = system_matrix->get_executor(); auto one = gko::initialize>({1.0}, exec); @@ -463,9 +444,10 @@ gko::remove_complex compute_residual_norm( } -template +template gko::remove_complex compute_max_relative_norm2( - vec* result, const vec* answer) + VectorType* result, const VectorType* answer) { using rc_vtype = gko::remove_complex; auto exec = answer->get_executor(); @@ -483,9 +465,10 @@ gko::remove_complex compute_max_relative_norm2( clone(absolute_norm->get_executor()->get_master(), absolute_norm); rc_vtype max_relative_norm2 = 0; for (gko::size_type i = 0; i < host_answer_norm->get_size()[1]; i++) { - max_relative_norm2 = - std::max(host_absolute_norm->at(0, i) / host_answer_norm->at(0, i), - max_relative_norm2); + max_relative_norm2 = std::max( + gko::detail::get_local(host_absolute_norm.get())->at(0, i) / + gko::detail::get_local(host_answer_norm.get())->at(0, i), + max_relative_norm2); } return max_relative_norm2; } diff --git a/benchmark/utils/generator.hpp b/benchmark/utils/generator.hpp new file mode 100644 index 00000000000..beb146ea853 --- /dev/null +++ b/benchmark/utils/generator.hpp @@ -0,0 +1,299 @@ +/************************************************************* +Copyright (c) 2017-2023, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#ifndef GINKGO_BENCHMARK_UTILS_GENERATOR_HPP +#define GINKGO_BENCHMARK_UTILS_GENERATOR_HPP + + +#include + + +#include "benchmark/utils/formats.hpp" +#include "benchmark/utils/general.hpp" +#include "benchmark/utils/loggers.hpp" +#include "benchmark/utils/stencil_matrix.hpp" +#if GINKGO_BUILD_MPI +#include "benchmark/utils/distributed_helpers.hpp" +#endif + + +template +struct DefaultSystemGenerator { + using index_type = IndexType; + using value_type = ValueType; + using Vec = vec; + + static gko::matrix_data generate_matrix_data( + rapidjson::Value& config) + { + if (config.HasMember("filename")) { + std::ifstream in(config["filename"].GetString()); + return gko::read_generic_raw(in); + } else if (config.HasMember("stencil")) { + return generate_stencil( + config["stencil"].GetString(), config["size"].GetInt64()); + } else { + throw std::runtime_error( + "No known way to generate matrix data found."); + } + } + + static std::shared_ptr generate_matrix_with_optimal_format( + std::shared_ptr exec, rapidjson::Value& config) + { + auto data = generate_matrix_data(config); + return generate_matrix_with_format( + std::move(exec), config["optimal"]["spmv"].GetString(), data); + } + + static std::shared_ptr generate_matrix_with_format( + std::shared_ptr exec, const std::string& format_name, + const gko::matrix_data& data, + rapidjson::Value* spmv_case = nullptr, + rapidjson::MemoryPoolAllocator<>* allocator = nullptr) + { + auto storage_logger = std::make_shared(); + if (spmv_case && allocator) { + exec->add_logger(storage_logger); + } + + auto mtx = + gko::share(::formats::matrix_factory(format_name, exec, data)); + + if (spmv_case && allocator) { + exec->remove_logger(gko::lend(storage_logger)); + storage_logger->write_data(*spmv_case, *allocator); + } + + return mtx; + } + + static std::shared_ptr generate_matrix_with_default_format( + std::shared_ptr exec, + const gko::matrix_data& data) + { + return generate_matrix_with_format(std::move(exec), "coo", data); + } + + static std::unique_ptr create_multi_vector( + std::shared_ptr exec, gko::dim<2> size, + ValueType value) + { + auto res = Vec::create(exec); + res->read(gko::matrix_data(size, value)); + return res; + } + + static std::unique_ptr create_multi_vector_strided( + std::shared_ptr exec, gko::dim<2> size, + gko::size_type stride) + { + auto res = Vec::create(exec, size, stride); + return res; + } + + // creates a random multi_vector + static std::unique_ptr create_multi_vector_random( + std::shared_ptr exec, gko::dim<2> size) + { + auto res = Vec::create(exec); + res->read(gko::matrix_data( + size, + std::uniform_real_distribution>(-1.0, + 1.0), + get_engine())); + return res; + } + + static std::unique_ptr initialize( + std::initializer_list il, + std::shared_ptr exec) + { + return gko::initialize(std::move(il), std::move(exec)); + } +}; + + +#if GINKGO_BUILD_MPI + + +template +struct DistributedDefaultSystemGenerator { + using LocalGenerator = LocalGeneratorType; + + using index_type = global_itype; + using local_index_type = typename LocalGenerator::index_type; + using value_type = typename LocalGenerator::value_type; + + using Mtx = dist_mtx; + using Vec = dist_vec; + + gko::matrix_data generate_matrix_data( + rapidjson::Value& config) const + { + if (config.HasMember("filename")) { + std::ifstream in(config["filename"].GetString()); + return gko::read_generic_raw(in); + } else if (config.HasMember("stencil")) { + auto local_size = static_cast( + config["size"].GetInt64() / comm.size()); + return generate_stencil( + config["stencil"].GetString(), comm, local_size, + config["comm_pattern"].GetString() == std::string("optimal")); + } else { + throw std::runtime_error( + "No known way to generate matrix data found."); + } + } + + std::shared_ptr generate_matrix_with_optimal_format( + std::shared_ptr exec, rapidjson::Value& config) const + { + auto data = generate_matrix_data(config); + return generate_matrix_with_format( + std::move(exec), config["optimal"]["spmv"].GetString(), data); + } + + std::shared_ptr generate_matrix_with_format( + std::shared_ptr exec, const std::string& format_name, + const gko::matrix_data& data, + rapidjson::Value* spmv_case = nullptr, + rapidjson::MemoryPoolAllocator<>* allocator = nullptr) const + { + auto part = gko::experimental::distributed:: + Partition::build_from_global_size_uniform( + exec, comm.size(), static_cast(data.size[0])); + auto formats = split(format_name, '-'); + + auto local_mat = formats::matrix_type_factory.at(formats[0])(exec); + auto non_local_mat = formats::matrix_type_factory.at(formats[1])(exec); + + auto storage_logger = std::make_shared(); + if (spmv_case && allocator) { + exec->add_logger(storage_logger); + } + + auto dist_mat = dist_mtx::create( + exec, comm, local_mat.get(), non_local_mat.get()); + dist_mat->read_distributed(data, part.get()); + + if (spmv_case && allocator) { + exec->remove_logger(gko::lend(storage_logger)); + storage_logger->write_data(comm, *spmv_case, *allocator); + } + + return dist_mat; + } + + std::shared_ptr generate_matrix_with_default_format( + std::shared_ptr exec, + const gko::matrix_data& data) const + { + return generate_matrix_with_format(std::move(exec), "coo-coo", data); + } + + std::unique_ptr create_multi_vector( + std::shared_ptr exec, gko::dim<2> size, + value_type value) const + { + auto part = gko::experimental::distributed:: + Partition::build_from_global_size_uniform( + exec, comm.size(), static_cast(size[0])); + return Vec::create( + exec, comm, size, + local_generator + .create_multi_vector( + exec, + gko::dim<2>{static_cast( + part->get_part_size(comm.rank())), + size[1]}, + value) + .get()); + } + + std::unique_ptr create_multi_vector_strided( + std::shared_ptr exec, gko::dim<2> size, + gko::size_type stride) const + { + auto part = gko::experimental::distributed:: + Partition::build_from_global_size_uniform( + exec, comm.size(), static_cast(size[0])); + return Vec::create( + exec, comm, size, + local_generator + .create_multi_vector_strided( + exec, + gko::dim<2>{static_cast( + part->get_part_size(comm.rank())), + size[1]}, + stride) + .get()); + } + + // creates a random multi_vector + std::unique_ptr create_multi_vector_random( + std::shared_ptr exec, gko::dim<2> size) const + { + auto part = gko::experimental::distributed:: + Partition::build_from_global_size_uniform( + exec, comm.size(), static_cast(size[0])); + return Vec::create( + exec, comm, size, + local_generator + .create_multi_vector_random( + exec, gko::dim<2>{static_cast( + part->get_part_size(comm.rank())), + size[1]}) + .get()); + } + + std::unique_ptr initialize( + std::initializer_list il, + std::shared_ptr exec) const + { + auto local = gko::initialize( + std::move(il), std::move(exec)); + auto global_rows = local->get_size()[0]; + comm.all_reduce(gko::ReferenceExecutor::create(), &global_rows, 1, + MPI_SUM); + return Vec::create(exec, comm, + gko::dim<2>{global_rows, local->get_size()[1]}, + local.get()); + } + + gko::experimental::mpi::communicator comm; + LocalGenerator local_generator{}; +}; + + +#endif +#endif // GINKGO_BENCHMARK_UTILS_GENERATOR_HPP diff --git a/benchmark/utils/loggers.hpp b/benchmark/utils/loggers.hpp index 4b9a2ae47a5..0d726cc1662 100644 --- a/benchmark/utils/loggers.hpp +++ b/benchmark/utils/loggers.hpp @@ -45,6 +45,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "benchmark/utils/general.hpp" +#include "core/distributed/helpers.hpp" // A logger that accumulates the time of all operations @@ -186,6 +187,25 @@ struct StorageLogger : gko::log::Logger { add_or_set_member(output, "storage", total, allocator); } +#if GINKGO_BUILD_MPI + void write_data(gko::experimental::mpi::communicator comm, + rapidjson::Value& output, + rapidjson::MemoryPoolAllocator<>& allocator) + { + const std::lock_guard lock(mutex); + gko::size_type total{}; + for (const auto& e : storage) { + total += e.second; + } + comm.reduce(gko::ReferenceExecutor::create(), + comm.rank() == 0 + ? static_cast(MPI_IN_PLACE) + : &total, + &total, 1, MPI_SUM, 0); + add_or_set_member(output, "storage", total, allocator); + } +#endif + private: mutable std::mutex mutex; mutable std::unordered_map storage; @@ -221,14 +241,21 @@ struct ResidualLogger : gko::log::Logger { rec_res_norms.PushBack( get_norm(gko::as>(residual_norm)), alloc); } else { - rec_res_norms.PushBack( - compute_norm2(gko::as>(residual)), alloc); + gko::detail::vector_dispatch( + residual, [&](const auto v_residual) { + rec_res_norms.PushBack(compute_norm2(v_residual), alloc); + }); } if (solution) { - true_res_norms.PushBack( - compute_residual_norm(matrix, b, - gko::as>(solution)), - alloc); + gko::detail::vector_dispatch< + rc_vtype>(solution, [&](auto v_solution) { + using concrete_type = + std::remove_pointer_t>; + true_res_norms.PushBack( + compute_residual_norm(matrix, gko::as(b), + v_solution), + alloc); + }); } else { true_res_norms.PushBack(-1.0, alloc); } @@ -243,7 +270,7 @@ struct ResidualLogger : gko::log::Logger { } } - ResidualLogger(const gko::LinOp* matrix, const vec* b, + ResidualLogger(const gko::LinOp* matrix, const gko::LinOp* b, rapidjson::Value& rec_res_norms, rapidjson::Value& true_res_norms, rapidjson::Value& implicit_res_norms, @@ -265,7 +292,7 @@ struct ResidualLogger : gko::log::Logger { private: const gko::LinOp* matrix; - const vec* b; + const gko::LinOp* b; std::chrono::steady_clock::time_point start; rapidjson::Value& rec_res_norms; rapidjson::Value& true_res_norms; diff --git a/benchmark/utils/spmv_common.hpp b/benchmark/utils/spmv_validation.hpp similarity index 79% rename from benchmark/utils/spmv_common.hpp rename to benchmark/utils/spmv_validation.hpp index bfb57c319c8..83ea2085ec2 100644 --- a/benchmark/utils/spmv_common.hpp +++ b/benchmark/utils/spmv_validation.hpp @@ -30,8 +30,8 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *************************************************************/ -#ifndef GKO_BENCHMARK_UTILS_SPMV_COMMON_HPP_ -#define GKO_BENCHMARK_UTILS_SPMV_COMMON_HPP_ +#ifndef GKO_BENCHMARK_UTILS_SPMV_VALIDATION_HPP_ +#define GKO_BENCHMARK_UTILS_SPMV_VALIDATION_HPP_ #include @@ -44,16 +44,22 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +std::string example_config = R"( + [ + {"filename": "my_file.mtx"}, + {"filename": "my_file2.mtx"}, + {"size": 100, "stencil": "7pt"}, + ] +)"; + + /** * Function which outputs the input format for benchmarks similar to the spmv. */ [[noreturn]] void print_config_error_and_exit() { std::cerr << "Input has to be a JSON array of matrix configurations:\n" - << " [\n" - << " { \"filename\": \"my_file.mtx\" },\n" - << " { \"filename\": \"my_file2.mtx\" }\n" - << " ]" << std::endl; + << example_config << std::endl; std::exit(1); } @@ -65,11 +71,13 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ void validate_option_object(const rapidjson::Value& value) { - if (!value.IsObject() || !value.HasMember("filename") || - !value["filename"].IsString()) { + if (!value.IsObject() || + !((value.HasMember("size") && value.HasMember("stencil") && + value["size"].IsInt64() && value["stencil"].IsString()) || + (value.HasMember("filename") && value["filename"].IsString()))) { print_config_error_and_exit(); } } -#endif // GKO_BENCHMARK_UTILS_SPMV_COMMON_HPP_ +#endif // GKO_BENCHMARK_UTILS_SPMV_VALIDATION_HPP_ diff --git a/benchmark/utils/stencil_matrix.hpp b/benchmark/utils/stencil_matrix.hpp new file mode 100644 index 00000000000..ca89aa86079 --- /dev/null +++ b/benchmark/utils/stencil_matrix.hpp @@ -0,0 +1,433 @@ +/************************************************************* +Copyright (c) 2017-2023, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#ifndef GINKGO_BENCHMARK_UTILS_STENCIL_MATRIX_HPP +#define GINKGO_BENCHMARK_UTILS_STENCIL_MATRIX_HPP + + +#include +#if GINKGO_BUILD_MPI +#include +#endif + + +template +double closest_nth_root(T v, int n) +{ + auto root = std::pow(v, 1. / static_cast(n)); + auto root_floor = std::floor(root); + auto root_ceil = std::ceil(root); + if (root - root_floor > root_ceil - root) { + return root_ceil; + } else { + return root_floor; + } +} + + +template +bool is_in_box(const IndexType i, const IndexType bound) +{ + return 0 <= i && i < bound; +} + + +/** + * Generates matrix data for a 2D stencil matrix. If restricted is set to true, + * creates a 5-pt stencil, if it is false creates a 9-pt stencil. + * + * If `dim != [1 1]` then the matrix data is a subset of a larger matrix. + * The total matrix is a discretization of `[0, dims[0]] x [0, dims[1]]`, and + * each box is square. The position of the box defines the subset of the matrix. + * The degrees of freedom are ordered box-wise and the boxes themselves are + * ordered lexicographical. This means that the indices are with respect to the + * larger matrix, i.e. they might not start with 0. + * + * @param dims The number of boxes in each dimension. + * @param positions The position of this box with respect to each dimension. + * @param target_local_size The desired size of the boxes. The actual size can + * deviate from this to accommodate the square size of + * the boxes. + * @param restricted If true, a 5-pt stencil is used, else a 9-pt stencil. + * + * @return matrix data of a box using either 5-pt or 9-pt stencil. + */ +template +gko::matrix_data generate_2d_stencil_box( + std::array dims, std::array positions, + const gko::size_type target_local_size, bool restricted) +{ + auto num_boxes = + std::accumulate(dims.begin(), dims.end(), 1, std::multiplies<>{}); + + const auto discretization_points = + static_cast(closest_nth_root(target_local_size, 2)); + const auto local_size = static_cast(discretization_points * + discretization_points); + const auto global_size = local_size * num_boxes; + auto A_data = gko::matrix_data( + gko::dim<2>{static_cast(global_size), + static_cast(global_size)}); + + auto global_offset = [&](const int position_x, const int position_y) { + return static_cast(local_size) * dims[0] * position_x + + static_cast(local_size) * position_y; + }; + + auto target_position = [&](const IndexType i, const int position) { + return is_in_box(i, discretization_points) + ? position + : (i < 0 ? position - 1 : position + 1); + }; + + auto target_local_idx = [&](const IndexType i) { + return is_in_box(i, discretization_points) + ? i + : (i < 0 ? discretization_points + i + : discretization_points - i); + }; + + auto flat_idx = [&](const IndexType ix, const IndexType iy) { + auto tpx = target_position(ix, positions[0]); + auto tpy = target_position(iy, positions[1]); + if (is_in_box(tpx, dims[0]) && is_in_box(tpy, dims[1])) { + return global_offset(tpx, tpy) + + target_local_idx(ix) * discretization_points + + target_local_idx(iy); + } else { + return static_cast(-1); + } + }; + + auto is_valid_neighbor = [&](const IndexType d_i, const IndexType d_j) { + return !restricted || d_i == 0 || d_j == 0; + }; + + auto nnz_in_row = [&]() { + int num_neighbors = 0; + for (IndexType d_i : {-1, 0, 1}) { + for (IndexType d_j : {-1, 0, 1}) { + if (is_valid_neighbor(d_i, d_j)) { + num_neighbors++; + } + } + } + return num_neighbors; + }; + const auto diag_value = static_cast(nnz_in_row() - 1); + + for (IndexType i = 0; i < discretization_points; ++i) { + for (IndexType j = 0; j < discretization_points; ++j) { + auto row = flat_idx(i, j); + for (IndexType d_i : {-1, 0, 1}) { + for (IndexType d_j : {-1, 0, 1}) { + if (is_valid_neighbor(d_i, d_j)) { + auto col = flat_idx(i + d_i, j + d_j); + if (is_in_box(col, + static_cast(global_size))) { + if (col != row) { + A_data.nonzeros.emplace_back( + row, col, -gko::one()); + } else { + A_data.nonzeros.emplace_back(row, col, + diag_value); + } + } + } + } + } + } + } + + return A_data; +} + + +/** + * Generates matrix data for a 3D stencil matrix. If restricted is set to true, + * creates a 7-pt stencil, if it is false creates a 27-pt stencil. + * + * If `dim != [1 1 1]` then the matrix data is a subset of a larger matrix. + * The total matrix is a discretization of `[0, dims[0]] x [0, dims[1]] x [0, + * dims[2]]`, and each box is a cube. The position of the box defines the subset + * of the matrix. The degrees of freedom are ordered box-wise and the boxes + * themselves are ordered lexicographical. This means that the indices are with + * respect to the larger matrix, i.e. they might not start with 0. + * + * @param dims The number of boxes in each dimension. + * @param positions The position of this box with respect to each dimension. + * @param target_local_size The desired size of the boxes. The actual size can + * deviate from this to accommodate the uniform size + * of the boxes. + * @param restricted If true, a 7-pt stencil is used, else a 27-pt stencil. + * + * @return matrix data of a box using either 7-pt or 27-pt stencil. + */ +template +gko::matrix_data generate_3d_stencil_box( + std::array dims, std::array positions, + const gko::size_type target_local_size, bool restricted) +{ + auto num_boxes = + std::accumulate(dims.begin(), dims.end(), 1, std::multiplies<>{}); + + const auto discretization_points = + static_cast(closest_nth_root(target_local_size, 3)); + const auto local_size = static_cast( + discretization_points * discretization_points * discretization_points); + const auto global_size = local_size * num_boxes; + auto A_data = gko::matrix_data( + gko::dim<2>{static_cast(global_size), + static_cast(global_size)}); + + auto global_offset = [&](const int position_x, const int position_y, + const int position_z) { + return position_x * static_cast(local_size) * dims[0] * dims[1] + + position_y * static_cast(local_size) * dims[0] + + position_z * static_cast(local_size); + }; + + auto target_position = [&](const IndexType i, const int position) { + return is_in_box(i, discretization_points) + ? position + : (i < 0 ? position - 1 : position + 1); + }; + + auto target_local_idx = [&](const IndexType i) { + return is_in_box(i, discretization_points) + ? i + : (i < 0 ? discretization_points + i + : discretization_points - i); + }; + + auto flat_idx = [&](const IndexType ix, const IndexType iy, + const IndexType iz) { + auto tpx = target_position(ix, positions[0]); + auto tpy = target_position(iy, positions[1]); + auto tpz = target_position(iz, positions[2]); + if (is_in_box(tpx, dims[0]) && is_in_box(tpy, dims[1]) && + is_in_box(tpz, dims[2])) { + return global_offset(tpx, tpy, tpz) + + target_local_idx(ix) * discretization_points * + discretization_points + + target_local_idx(iy) * discretization_points + + target_local_idx(iz); + } else { + return static_cast(-1); + } + }; + + auto is_valid_neighbor = [&](const IndexType d_i, const IndexType d_j, + const IndexType d_k) { + return !restricted || ((d_i == 0) + (d_j == 0) + (d_k == 0) >= 2); + }; + + auto nnz_in_row = [&]() { + int num_neighbors = 0; + for (IndexType d_i : {-1, 0, 1}) { + for (IndexType d_j : {-1, 0, 1}) { + for (IndexType d_k : {-1, 0, 1}) { + if (is_valid_neighbor(d_i, d_j, d_k)) { + num_neighbors++; + } + } + } + } + return num_neighbors; + }; + const auto diag_value = static_cast(nnz_in_row() - 1); + + for (IndexType i = 0; i < discretization_points; ++i) { + for (IndexType j = 0; j < discretization_points; ++j) { + for (IndexType k = 0; k < discretization_points; ++k) { + auto row = flat_idx(i, j, k); + for (IndexType d_i : {-1, 0, 1}) { + for (IndexType d_j : {-1, 0, 1}) { + for (IndexType d_k : {-1, 0, 1}) { + if (is_valid_neighbor(d_i, d_j, d_k)) { + auto col = flat_idx(i + d_i, j + d_j, k + d_k); + if (is_in_box(col, static_cast( + global_size))) { + if (col != row) { + A_data.nonzeros.emplace_back( + row, col, -gko::one()); + } else { + A_data.nonzeros.emplace_back( + row, col, diag_value); + } + } + } + } + } + } + } + } + } + + return A_data; +} + + +/** + * Generates matrix data for the requested stencil. + * + * @see generate_2d_stencil_box, generate_3d_stencil_box + * + * @param stencil_name The name of the stencil. + * @param target_local_size The desired size of the matrix. The actual size can + * deviate from this to accommodate the uniform size + * of the discretization. + * @return matrix data using the requested stencil. + */ +template +gko::matrix_data generate_stencil( + std::string stencil_name, const gko::size_type target_local_size) +{ + if (stencil_name == "5pt") { + return generate_2d_stencil_box( + {1, 1}, {0, 0}, target_local_size, true); + } else if (stencil_name == "9pt") { + return generate_2d_stencil_box( + {1, 1}, {0, 0}, target_local_size, false); + } else if (stencil_name == "7pt") { + return generate_3d_stencil_box( + {1, 1, 1}, {0, 0, 0}, target_local_size, true); + } else if (stencil_name == "27pt") { + return generate_3d_stencil_box( + {1, 1, 1}, {0, 0, 0}, target_local_size, false); + } else { + throw std::runtime_error("Stencil " + stencil_name + + " not implemented"); + } +} + + +#if GINKGO_BUILD_MPI + + +/** + * Generates matrix data for a given 2D stencil, where the position of this + * block is given by it's MPI rank. + * + * @see generate_2d_stencil_box + */ +template +gko::matrix_data generate_2d_stencil( + gko::experimental::mpi::communicator comm, + const gko::size_type target_local_size, bool restricted, bool optimal_comm) +{ + if (optimal_comm) { + return generate_2d_stencil_box( + {comm.size(), 1}, {comm.rank(), 0}, target_local_size, restricted); + } else { + std::array dims{}; + MPI_Dims_create(comm.size(), dims.size(), dims.data()); + + std::array coords{}; + coords[0] = comm.rank() % dims[0]; + coords[1] = comm.rank() / dims[0]; + + return generate_2d_stencil_box( + dims, coords, target_local_size, restricted); + } +} + + +/** + * Generates matrix data for a given 23 stencil, where the position of this + * block is given by it's MPI rank. + * + * @see generate_3d_stencil_box + */ +template +gko::matrix_data generate_3d_stencil( + gko::experimental::mpi::communicator comm, + const gko::size_type target_local_size, bool restricted, bool optimal_comm) +{ + if (optimal_comm) { + return generate_3d_stencil_box( + {comm.size(), 1, 1}, {comm.rank(), 0, 0}, target_local_size, + restricted); + } else { + std::array dims{}; + + MPI_Dims_create(comm.size(), dims.size(), dims.data()); + + std::array coords{}; + coords[0] = comm.rank() % dims[0]; + coords[1] = (comm.rank() / dims[0]) % dims[1]; + coords[2] = comm.rank() / (dims[0] * dims[1]); + + return generate_3d_stencil_box( + dims, coords, target_local_size, restricted); + } +} + + +/** + * Generates matrix data for the requested stencil. + * + * @copydoc generate_stencil(const gko::size_type, bool) + * + * @param comm The MPI communicator to determine the rank. + * @param optimal_comm If true, a 1D domain decomposition is used which leads + * to each processor having at most two neighbors. This + * also changes the domain shape to an elongated channel. + * If false, a mostly uniform 2D or 3D decomposition is + * used, and the domain shape is mostly cubic. + */ +template +gko::matrix_data generate_stencil( + std::string stencil_name, gko::experimental::mpi::communicator comm, + const gko::size_type target_local_size, bool optimal_comm) +{ + if (stencil_name == "5pt") { + return generate_2d_stencil( + std::move(comm), target_local_size, true, optimal_comm); + } else if (stencil_name == "9pt") { + return generate_2d_stencil( + std::move(comm), target_local_size, false, optimal_comm); + } else if (stencil_name == "7pt") { + return generate_3d_stencil( + std::move(comm), target_local_size, true, optimal_comm); + } else if (stencil_name == "27pt") { + return generate_3d_stencil( + std::move(comm), target_local_size, false, optimal_comm); + } else { + throw std::runtime_error("Stencil " + stencil_name + + " not implemented"); + } +} + + +#endif +#endif // GINKGO_BENCHMARK_UTILS_STENCIL_MATRIX_HPP