From 31035ba201e904444e2f3555c32cc14522f6dcd8 Mon Sep 17 00:00:00 2001 From: Fred Suter Date: Tue, 13 Jan 2026 21:57:44 -0500 Subject: [PATCH 01/92] prepare for release --- ChangeLog | 39 +++++++++++++++++++++++++++++++-------- 1 file changed, 31 insertions(+), 8 deletions(-) diff --git a/ChangeLog b/ChangeLog index b69a143..2b65354 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,13 +1,36 @@ ---------------------------------------------------------------------------- -DTLMod (0.3) not released yet (target: spring 2026) - - Thorough code review and important refactoring of the internals - - API changes: - - DTL::get_stream_by_name_or_null is now DTL::get_stream_by_name. The - corresponding python call is now dtl.get_stream_by_name - - Metadata management transferred from Engine to Stream. The - get_metadata_file_name method (or meta_file_name property in Python) - must be called by a Stream instance. +DTLMod (0.3) January 19, 2026 + +Major improvements: + - Enhanced code quality with comprehensive refactoring + - Improved modern C++17 usage throughout the codebase + - Reduced code complexity and technical debt + - Memory safety and robustness improvements + - Fixed critical memory safety issue in DTL connection management + - Fixed race condition in Engine creation + - Plugged memory leaks + - Major refactoring with improved memory management, encapsulation, and move-only semantics + - Improved CI/CD infrastructure + - Added Valgrind and sanitizer checks (AddressSanitizer, UndefinedBehaviorSanitizer) + - Enhanced test coverage and validation + - Comprehensive proofread of the documentation + +API Changes: + - DTL::get_stream_by_name_or_null() renamed to DTL::get_stream_by_name() + Returns std::optional> instead of raw pointer + Python: dtl.get_stream_by_name() now returns None if stream not found + - Metadata management transferred from Engine to Stream + Stream::get_metadata_file_name() replaces Engine::get_metadata_file_name() + Python: stream.metadata_file_name 
replaces engine.metadata_file_name + - Stream method chaining improvements + set_engine_type() and set_transport_method() now return Stream& instead of Stream* + Enables cleaner fluent interface: stream.set_engine_type(...).set_transport_method(...) + - DTL::create() now accepts std::string_view and has empty default parameter + - String parameters changed to std::string_view for better performance: + - DTL::add_stream(), DTL::get_stream_by_name() + - Stream and Variable constructors and methods + - get_engine_type_str() and get_transport_method_str() now return std::optional ---------------------------------------------------------------------------- From bd1087be43c4f9544c4fbf1dc50c8bab1ff36684 Mon Sep 17 00:00:00 2001 From: Fred Suter Date: Mon, 19 Jan 2026 10:46:51 -0500 Subject: [PATCH 02/92] fix ODR violation --- CMakeLists.txt | 6 ++---- test/test_util.hpp | 21 +++++++++++---------- 2 files changed, 13 insertions(+), 14 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index c34e778..4ae0f6e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -251,12 +251,10 @@ if(GTEST_LIBRARY) test/dtl_staging_engine.cpp test/dtl_stream.cpp test/dtl_variable.cpp - test/main.cpp - test/test_util.hpp - include/dtlmod.hpp) + test/main.cpp) add_definitions(-DGTEST_USED) - add_executable(unit_tests EXCLUDE_FROM_ALL ${SOURCE_FILES} ${HEADER_FILES} ${TEST_FILES}) + add_executable(unit_tests EXCLUDE_FROM_ALL ${TEST_FILES}) target_include_directories(unit_tests PRIVATE ${CMAKE_SOURCE_DIR}/include ${CMAKE_BINARY_DIR}/include diff --git a/test/test_util.hpp b/test/test_util.hpp index 06b592c..54a47cf 100644 --- a/test/test_util.hpp +++ b/test/test_util.hpp @@ -7,16 +7,17 @@ #define __DTLMOD_TEST_UTIL_HPP__ #include -static void DO_TEST_WITH_FORK(const std::function &lambda) { - pid_t pid = fork(); - if (pid) { - int exit_code; - waitpid(pid, &exit_code, 0); - ASSERT_EQ(exit_code, 0); - } else { - lambda(); - exit(::testing::Test::HasFailure() ? 
255 : 0); - } +inline void DO_TEST_WITH_FORK(const std::function& lambda) +{ + pid_t pid = fork(); + if (pid) { + int exit_code; + waitpid(pid, &exit_code, 0); + ASSERT_EQ(exit_code, 0); + } else { + lambda(); + exit(::testing::Test::HasFailure() ? 255 : 0); + } } #endif // __DTLMOD_TEST_UTIL_HPP__ \ No newline at end of file From 28ec119391d32ea9e81f2c68b3d349fd3831a9ce Mon Sep 17 00:00:00 2001 From: Fred Suter Date: Mon, 19 Jan 2026 10:51:56 -0500 Subject: [PATCH 03/92] ignore __asan_handle_no_return from SimGrid --- .github/workflows/weekly-checks.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/weekly-checks.yml b/.github/workflows/weekly-checks.yml index 8fe1049..46da30e 100644 --- a/.github/workflows/weekly-checks.yml +++ b/.github/workflows/weekly-checks.yml @@ -126,7 +126,8 @@ jobs: - name: Check for sanitizer errors run: | - if grep -E "ERROR:|WARNING:" build/sanitizer-output.txt; then + # Filter out known benign warnings from SimGrid's context switching + if grep -E "ERROR:|WARNING:" build/sanitizer-output.txt | grep -v "ASan is ignoring requested __asan_handle_no_return"; then echo "::error::${{ matrix.sanitizer.name }} detected issues!" 
exit 1 fi From 854ea72745d8140f6c17272a51ab407f7c7fdc7a Mon Sep 17 00:00:00 2001 From: Fred Suter Date: Mon, 19 Jan 2026 11:09:06 -0500 Subject: [PATCH 04/92] fix issues in weekly reports --- .github/workflows/weekly-checks.yml | 13 +++++++++++-- test/valgrind.supp | 28 ++++++++++++++++++++++++++++ 2 files changed, 39 insertions(+), 2 deletions(-) diff --git a/.github/workflows/weekly-checks.yml b/.github/workflows/weekly-checks.yml index 46da30e..180cc3f 100644 --- a/.github/workflows/weekly-checks.yml +++ b/.github/workflows/weekly-checks.yml @@ -107,9 +107,17 @@ jobs: - name: Run tests with ${{ matrix.sanitizer.name }} env: LD_LIBRARY_PATH: /opt/simgrid/lib:/opt/fsmod/lib:/usr/local/lib - PYTHONPATH: /opt/simgrid/lib/python3.12/site-packages:/usr/local/lib/python3.12/dist-packages run: | export ${{ matrix.sanitizer.env_options }} + # Dynamically find Python paths for SimGrid, FSMod, and DTLMod + SIMGRID_PYTHON_PATH=$(find /opt/simgrid/lib -type d -path "*/python*/site-packages" 2>/dev/null | head -1) + [ -z "$SIMGRID_PYTHON_PATH" ] && SIMGRID_PYTHON_PATH=$(find /opt/simgrid/lib -type d -path "*/python*/dist-packages" 2>/dev/null | head -1) + FSMOD_PYTHON_PATH=$(find /opt/fsmod/lib -type d -path "*/python*/site-packages" 2>/dev/null | head -1) + [ -z "$FSMOD_PYTHON_PATH" ] && FSMOD_PYTHON_PATH=$(find /opt/fsmod/lib -type d -path "*/python*/dist-packages" 2>/dev/null | head -1) + DTLMOD_PYTHON_PATH=$(find /usr/local/lib -type d -path "*/python*/dist-packages" 2>/dev/null | head -1) + [ -z "$DTLMOD_PYTHON_PATH" ] && DTLMOD_PYTHON_PATH=$(find /usr/local/lib -type d -path "*/python*/site-packages" 2>/dev/null | head -1) + export PYTHONPATH="$SIMGRID_PYTHON_PATH:$FSMOD_PYTHON_PATH:$DTLMOD_PYTHON_PATH" + echo "PYTHONPATH=$PYTHONPATH" cd build ./unit_tests 2>&1 | tee sanitizer-output.txt cd test/python @@ -298,7 +306,8 @@ jobs: for report in reports/*-report/*.txt; do if [ -f "$report" ]; then echo "### $(basename $(dirname $report))" >> weekly-report.md - if grep -q 
"ERROR:\|WARNING:" "$report" 2>/dev/null; then + # Filter out benign warnings from SimGrid context switching and Valgrind redirections + if grep -E "ERROR:|WARNING:" "$report" 2>/dev/null | grep -v "ASan is ignoring requested __asan_handle_no_return" | grep -v "new redirection conflicts" | grep -q .; then echo "❌ Issues detected" >> weekly-report.md else echo "✅ No issues detected" >> weekly-report.md diff --git a/test/valgrind.supp b/test/valgrind.supp index 45de954..3e026d4 100644 --- a/test/valgrind.supp +++ b/test/valgrind.supp @@ -8,6 +8,13 @@ obj:*/libsimgrid.so* } +{ + simgrid_boost_context + Memcheck:Leak + ... + obj:*/libboost_context.so* +} + { boost_filesystem Memcheck:Leak @@ -15,6 +22,13 @@ obj:*/libboost_filesystem.so* } +{ + fsmod_library + Memcheck:Leak + ... + obj:*/libfsmod.so* +} + { python_interpreter Memcheck:Leak @@ -28,3 +42,17 @@ ... obj:*/libgtest.so* } + +{ + libstdc++_allocations + Memcheck:Leak + ... + obj:*/libstdc++.so* +} + +{ + graphviz_cgraph + Memcheck:Leak + ... 
+ obj:*/libcgraph.so* +} From d0f2c171c647ff61a78f9c3d10488d2aba0c2e28 Mon Sep 17 00:00:00 2001 From: Fred Suter Date: Mon, 19 Jan 2026 11:17:14 -0500 Subject: [PATCH 05/92] skip python tests with Sanitizers, which don't work well with Python --- .github/workflows/weekly-checks.yml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/workflows/weekly-checks.yml b/.github/workflows/weekly-checks.yml index 180cc3f..9b3b6b3 100644 --- a/.github/workflows/weekly-checks.yml +++ b/.github/workflows/weekly-checks.yml @@ -120,8 +120,9 @@ jobs: echo "PYTHONPATH=$PYTHONPATH" cd build ./unit_tests 2>&1 | tee sanitizer-output.txt - cd test/python - python3 ./unit_tests_python.py 2>&1 | tee -a ../../sanitizer-output.txt + # Skip Python tests when running with sanitizers - ASan/UBSan don't work well with Python + # The C++ unit tests already cover the core functionality + echo "Skipping Python tests with sanitizers (ASan runtime incompatible with Python interpreter)" | tee -a sanitizer-output.txt cd ../.. 
- name: Upload sanitizer report From 837d3e4f785b29314a513a10038ef07422d0f660 Mon Sep 17 00:00:00 2001 From: Fred Suter Date: Wed, 21 Jan 2026 10:05:24 -0500 Subject: [PATCH 06/92] tell codefactor to ignore code duplication in tests --- .codefactor.yml | 6 ++++++ 1 file changed, 6 insertions(+) create mode 100644 .codefactor.yml diff --git a/.codefactor.yml b/.codefactor.yml new file mode 100644 index 0000000..6c728ea --- /dev/null +++ b/.codefactor.yml @@ -0,0 +1,6 @@ +version: "1.0" +checks: + duplication: + exclude_paths: + - "test/**/*.cpp" + - "test/**/*.py" From 7cf1bf54c7130b960f6537361a1c36810f92d8e1 Mon Sep 17 00:00:00 2001 From: Fred Suter Date: Wed, 21 Jan 2026 10:10:32 -0500 Subject: [PATCH 07/92] fix (again) coverage computation for sonar --- .github/workflows/build.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 588ac33..0b04b0c 100755 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -121,7 +121,7 @@ jobs: lcov --keep-going --directory . --capture --output-file coverage.info lcov --remove coverage.info '*/test/*' -o coverage.info cd .. - gcovr -e test -e examples --sonarqube -u -o coverage.xml --exclude-throw-branches \ + gcovr --root . build -e test -e examples --sonarqube -u -o coverage.xml --exclude-throw-branches \ --gcov-ignore-parse-errors --exclude-unreachable-branches - name: Upload coverage to Codecov From 13a16f99aee1f219b3a2104ed858a6f7d00babcf Mon Sep 17 00:00:00 2001 From: Fred Suter Date: Wed, 21 Jan 2026 10:19:16 -0500 Subject: [PATCH 08/92] fix (again) coverage computation for sonar --- .github/workflows/build.yml | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 0b04b0c..0b53b25 100755 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -120,8 +120,7 @@ jobs: cd build lcov --keep-going --directory . 
--capture --output-file coverage.info lcov --remove coverage.info '*/test/*' -o coverage.info - cd .. - gcovr --root . build -e test -e examples --sonarqube -u -o coverage.xml --exclude-throw-branches \ + gcovr --root .. -e test -e examples --sonarqube -u -o coverage.xml --exclude-throw-branches \ --gcov-ignore-parse-errors --exclude-unreachable-branches - name: Upload coverage to Codecov @@ -141,7 +140,7 @@ jobs: -Dsonar.projectKey=simgrid_dtlmod -Dsonar.organization=simgrid -Dsonar.cfamily.compile-commands=build/compile_commands.json - -Dsonar.coverageReportPaths=coverage.xml + -Dsonar.coverageReportPaths=build/coverage.xml - name: Build and deploy documentation if: github.ref == 'refs/heads/main' && env.TOKEN_GITHUB != '' From dbd8e5f123db91170c5269e585db3d8df6768a57 Mon Sep 17 00:00:00 2001 From: Fred Suter Date: Wed, 21 Jan 2026 10:24:07 -0500 Subject: [PATCH 09/92] fix (third time the charm) coverage computation for sonar --- .github/workflows/build.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 0b53b25..82938ee 100755 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -118,8 +118,8 @@ jobs: - name: Generate coverage report run: | cd build - lcov --keep-going --directory . --capture --output-file coverage.info - lcov --remove coverage.info '*/test/*' -o coverage.info + lcov --keep-going --ignore-errors mismatch --directory . --capture --output-file coverage.info + lcov --ignore-errors mismatch --remove coverage.info '*/test/*' -o coverage.info gcovr --root .. 
-e test -e examples --sonarqube -u -o coverage.xml --exclude-throw-branches \ --gcov-ignore-parse-errors --exclude-unreachable-branches From 4affbc5b90b9ed2f35421c25bb46f6bc58dc3a25 Mon Sep 17 00:00:00 2001 From: Fred Suter Date: Wed, 21 Jan 2026 10:30:45 -0500 Subject: [PATCH 10/92] use the right config file for codefactor --- .cfduplication.yml | 6 ++++++ .codefactor.yml | 6 ------ 2 files changed, 6 insertions(+), 6 deletions(-) create mode 100644 .cfduplication.yml delete mode 100644 .codefactor.yml diff --git a/.cfduplication.yml b/.cfduplication.yml new file mode 100644 index 0000000..a7ba620 --- /dev/null +++ b/.cfduplication.yml @@ -0,0 +1,6 @@ +# duplication threshold [20 - 100] +MinLineCount: 20 + +# files/patterns to exclude +IgnorePatterns: + - test/** diff --git a/.codefactor.yml b/.codefactor.yml deleted file mode 100644 index 6c728ea..0000000 --- a/.codefactor.yml +++ /dev/null @@ -1,6 +0,0 @@ -version: "1.0" -checks: - duplication: - exclude_paths: - - "test/**/*.cpp" - - "test/**/*.py" From 3b3475b0be14cf872c45a61f9c29b0bcfaf1e8ca Mon Sep 17 00:00:00 2001 From: Fred Suter Date: Wed, 21 Jan 2026 10:40:15 -0500 Subject: [PATCH 11/92] fix (fourth time the rrrrrhhhh) coverage computation for sonar --- .github/workflows/build.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 82938ee..0deb7e3 100755 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -120,7 +120,7 @@ jobs: cd build lcov --keep-going --ignore-errors mismatch --directory . --capture --output-file coverage.info lcov --ignore-errors mismatch --remove coverage.info '*/test/*' -o coverage.info - gcovr --root .. -e test -e examples --sonarqube -u -o coverage.xml --exclude-throw-branches \ + gcovr --root .. 
-f '../src/' -f '../include/' --sonarqube -u -o coverage.xml --exclude-throw-branches \ --gcov-ignore-parse-errors --exclude-unreachable-branches - name: Upload coverage to Codecov From c4e200e1137b19159eeaf09615e5edce023474d9 Mon Sep 17 00:00:00 2001 From: Fred Suter Date: Wed, 21 Jan 2026 10:47:18 -0500 Subject: [PATCH 12/92] restore a former config for coverage --- .github/workflows/build.yml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 0deb7e3..35f35dc 100755 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -120,7 +120,8 @@ jobs: cd build lcov --keep-going --ignore-errors mismatch --directory . --capture --output-file coverage.info lcov --ignore-errors mismatch --remove coverage.info '*/test/*' -o coverage.info - gcovr --root .. -f '../src/' -f '../include/' --sonarqube -u -o coverage.xml --exclude-throw-branches \ + cd .. + gcovr -e test -e examples --sonarqube -u -o coverage.xml --exclude-throw-branches \ --gcov-ignore-parse-errors --exclude-unreachable-branches - name: Upload coverage to Codecov @@ -140,7 +141,7 @@ jobs: -Dsonar.projectKey=simgrid_dtlmod -Dsonar.organization=simgrid -Dsonar.cfamily.compile-commands=build/compile_commands.json - -Dsonar.coverageReportPaths=build/coverage.xml + -Dsonar.coverageReportPaths=coverage.xml - name: Build and deploy documentation if: github.ref == 'refs/heads/main' && env.TOKEN_GITHUB != '' From b619a58f7cf9e4979a742b5048aaadcc196645f1 Mon Sep 17 00:00:00 2001 From: Fred Suter Date: Wed, 21 Jan 2026 11:30:22 -0500 Subject: [PATCH 13/92] coverage --- .github/workflows/build.yml | 5 ++--- CMakeLists.txt | 6 ++++++ 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 35f35dc..2191eca 100755 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -120,8 +120,7 @@ jobs: cd build lcov --keep-going --ignore-errors mismatch 
--directory . --capture --output-file coverage.info lcov --ignore-errors mismatch --remove coverage.info '*/test/*' -o coverage.info - cd .. - gcovr -e test -e examples --sonarqube -u -o coverage.xml --exclude-throw-branches \ + gcovr --root .. -e '../test' -e '../examples' --sonarqube -o coverage.xml --exclude-throw-branches \ --gcov-ignore-parse-errors --exclude-unreachable-branches - name: Upload coverage to Codecov @@ -141,7 +140,7 @@ jobs: -Dsonar.projectKey=simgrid_dtlmod -Dsonar.organization=simgrid -Dsonar.cfamily.compile-commands=build/compile_commands.json - -Dsonar.coverageReportPaths=coverage.xml + -Dsonar.coverageReportPaths=build/coverage.xml - name: Build and deploy documentation if: github.ref == 'refs/heads/main' && env.TOKEN_GITHUB != '' diff --git a/CMakeLists.txt b/CMakeLists.txt index 4ae0f6e..67dae7d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -217,6 +217,12 @@ set_target_properties(dtlmod PROPERTIES LINKER_LANGUAGE CXX PUBLIC_HEADER "${HEADER_FILES}") +# Enable coverage for Debug builds +if(CMAKE_BUILD_TYPE STREQUAL "Debug") + target_compile_options(dtlmod PRIVATE -g -O0 --coverage) + target_link_options(dtlmod PRIVATE --coverage) +endif() + target_include_directories(dtlmod PUBLIC $ From b3be4e71fd51eb974e62d11a5ef6e4c99baac573 Mon Sep 17 00:00:00 2001 From: Fred Suter Date: Mon, 26 Jan 2026 16:09:47 -0500 Subject: [PATCH 14/92] fix a code snippet --- doc/source/app_Actors.rst | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/doc/source/app_Actors.rst b/doc/source/app_Actors.rst index 2a9a2b4..493a55f 100644 --- a/doc/source/app_Actors.rst +++ b/doc/source/app_Actors.rst @@ -22,7 +22,7 @@ Data publisher .. 
code-block:: cpp - void distributed_publisher() { + void distributed_publisher(int num_ranks, int rank) { // Connect from the DTL auto dtl = DTL::connect() // Add a ``Data'' stream using a ``File'' engine @@ -31,7 +31,11 @@ Data publisher ->set_transport_method("File"); // Define a 2D array of int distributed over multiple ranks - auto V = s->define_variable("V", {size, size}, {size*rank, size*rank}, {l_size, l_size}, sizeof(int)); + // Each rank owns a 100 x 100 block + auto V = s->define_variable("V", {l_size * num_ranks, l_size}, // global shape + {l_size * rank, 0}, // local offset + {l_size, l_size}, // local count + sizeof(int)); // element size // Open the stream in ``Publish'' mode auto e = s->open("cluster:file_system:/working_dir/", Stream::Mode::Publish); @@ -40,7 +44,6 @@ Data publisher // Compute 1e3 floating point operations per element sg4::this_actor::execute(V->get_local_size() * 1e3); // Then publish ``V'' to the DTL - e->begin_transaction(); e->put(V); e->end_transaction(); From 05901b4d80967d148f3fdd42e8180e6b4ad874d5 Mon Sep 17 00:00:00 2001 From: Fred Suter Date: Wed, 28 Jan 2026 08:16:12 -0500 Subject: [PATCH 15/92] tiny revision of doc example --- doc/source/app_Actors.rst | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/doc/source/app_Actors.rst b/doc/source/app_Actors.rst index 493a55f..6d628a8 100644 --- a/doc/source/app_Actors.rst +++ b/doc/source/app_Actors.rst @@ -23,7 +23,7 @@ Data publisher .. code-block:: cpp void distributed_publisher(int num_ranks, int rank) { - // Connect from the DTL + // Connect to the DTL auto dtl = DTL::connect() // Add a ``Data'' stream using a ``File'' engine auto s = dtl->add_stream("Data") @@ -61,20 +61,19 @@ Data subscriber .. 
code-block:: cpp void subscriber() { - // Connect from the DTL auto dtl = DTL::connect() - // Add a stream + // Add the already defined ``Data'' stream auto s = dtl->add_stream("Data"); - // Obtain metadata for variable ``V'' + // Obtain information on variable ``V'' auto V = s->inquire_variable("V"); // Open the stream in ``Subscribe'' mode auto e = s->open("cluster:file_system:/working_dir/", Stream::Mode::Subscribe); for (int i = 0; i < 10 ; i++) { - // Get the latest transaction for variable ``V'' + // Get variable ``V'' from the DTL e->begin_transaction(); e->get(V); e->end_transaction(); @@ -82,9 +81,7 @@ Data subscriber sg4::this_actor::execute(V->get_local_size() * 1e3); } - // Close the engine e->close(); - // Disconnect from the DTL DTL::disconnect(); } From e0b85720589ea7be05d203ba8a864a5b29cc3c3a Mon Sep 17 00:00:00 2001 From: Fred Suter Date: Wed, 11 Feb 2026 11:17:21 -0500 Subject: [PATCH 16/92] add barrier for subscribers in FileEngine too (at least to synchronize on closing) --- src/FileEngine.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/FileEngine.cpp b/src/FileEngine.cpp index 4a83b01..ce27613 100644 --- a/src/FileEngine.cpp +++ b/src/FileEngine.cpp @@ -219,6 +219,11 @@ void FileEngine::end_sub_transaction() transport->clear_to_read_in_transaction(self); XBT_DEBUG("All on-flight subscribe activities are completed."); + + // This is the end of the first transaction, create a barrier + if (auto sub_barrier = get_subscribers().get_or_create_barrier()) + XBT_DEBUG("Barrier created for %zu subscribers", get_subscribers().count()); + // Mark this transaction as over sub_transaction_in_progress_ = false; } From e9d4669720a9da1d5a417710af35a8a4d6bec3b6 Mon Sep 17 00:00:00 2001 From: Fred Suter Date: Wed, 11 Feb 2026 12:17:27 -0500 Subject: [PATCH 17/92] exclude some defensive guards from coverage reports --- src/FileEngine.cpp | 5 ++--- src/StagingEngine.cpp | 4 ++-- src/Stream.cpp | 4 ++-- 3 files changed, 6 insertions(+), 7 
deletions(-) diff --git a/src/FileEngine.cpp b/src/FileEngine.cpp index ce27613..3f709b8 100644 --- a/src/FileEngine.cpp +++ b/src/FileEngine.cpp @@ -74,9 +74,8 @@ void FileEngine::create_transport(const Transport::Method& /*transport_method*/) std::shared_ptr FileEngine::get_file_transport() const { auto transport = std::dynamic_pointer_cast(get_transport()); - if (!transport) { - throw TransportEngineMismatchException(XBT_THROW_POINT, "Transport is not a FileTransport"); - } + if (!transport) // LCOV_EXCL_LINE + throw TransportEngineMismatchException(XBT_THROW_POINT, "Transport is not a FileTransport"); // LCOV_EXCL_LINE return transport; } diff --git a/src/StagingEngine.cpp b/src/StagingEngine.cpp index 447477b..8b02c14 100644 --- a/src/StagingEngine.cpp +++ b/src/StagingEngine.cpp @@ -36,8 +36,8 @@ void StagingEngine::create_transport(const Transport::Method& transport_method) std::shared_ptr StagingEngine::get_staging_transport() const { auto transport = std::dynamic_pointer_cast(get_transport()); - if (!transport) - throw TransportEngineMismatchException(XBT_THROW_POINT, "Transport is not a StagingTransport"); + if (!transport) // LCOV_EXCL_LINE + throw TransportEngineMismatchException(XBT_THROW_POINT, "Transport is not a StagingTransport"); // LCOV_EXCL_LINE return transport; } diff --git a/src/Stream.cpp b/src/Stream.cpp index 2874616..568c102 100644 --- a/src/Stream.cpp +++ b/src/Stream.cpp @@ -58,7 +58,7 @@ std::optional Stream::get_engine_type_str() const noexcept if (type == engine_type_) return str; } - return std::nullopt; + return std::nullopt; // LCOV_EXCL_LINE } Stream& Stream::set_engine_type(const Engine::Type& engine_type) @@ -98,7 +98,7 @@ std::optional Stream::get_transport_method_str() const noexcept if (method == transport_method_) return str; } - return std::nullopt; + return std::nullopt; // LCOV_EXCL_LINE } Stream& Stream::set_transport_method(const Transport::Method& transport_method) From 216745dd1dc094bb34f092f4486776fe166b7856 Mon Sep 
17 00:00:00 2001 From: Fred Suter Date: Wed, 11 Feb 2026 12:33:24 -0500 Subject: [PATCH 18/92] more tests and coverage exclusion --- src/Variable.cpp | 6 +++--- test/dtl_variable.cpp | 23 +++++++++++++++++++++++ test/python/dtl_variable.py | 25 +++++++++++++++++++++++++ 3 files changed, 51 insertions(+), 3 deletions(-) diff --git a/src/Variable.cpp b/src/Variable.cpp index e3a7194..5972ff3 100644 --- a/src/Variable.cpp +++ b/src/Variable.cpp @@ -97,9 +97,9 @@ Variable::get_sizes_to_get_per_block(unsigned int transaction_id, const std::vec "Internal error: dimension mismatch in get_sizes_to_get_per_block"); std::vector> get_sizes_per_block; - // Validate transaction_id is within valid range - if (transaction_id > metadata_->get_current_transaction()) - throw InvalidTransactionIdException(XBT_THROW_POINT, std::to_string(transaction_id)); + // Validate transaction_id is within valid range (defense-in-depth: Transport also checks this) + if (transaction_id > metadata_->get_current_transaction()) // LCOV_EXCL_LINE + throw InvalidTransactionIdException(XBT_THROW_POINT, std::to_string(transaction_id)); // LCOV_EXCL_LINE auto blocks = metadata_->get_blocks_for_transaction(transaction_id); XBT_DEBUG("%zu block(s) to check for transaction %u", blocks.size(), transaction_id); diff --git a/test/dtl_variable.cpp b/test/dtl_variable.cpp index ea4b6d4..7cc192c 100644 --- a/test/dtl_variable.cpp +++ b/test/dtl_variable.cpp @@ -126,6 +126,29 @@ TEST_F(DTLVariableTest, InconsistentVariableDefinition) }); } +TEST_F(DTLVariableTest, OverflowVariableSize) +{ + DO_TEST_WITH_FORK([this]() { + this->setup_platform(); + host_->add_actor("TestActor", [this]() { + XBT_INFO("Connect to the DTL"); + auto dtl = dtlmod::DTL::connect(); + XBT_INFO("Create a stream"); + auto stream = dtl->add_stream("Stream"); + XBT_INFO("Define a variable whose dimensions overflow size_t when computing global size"); + auto var = + stream->define_variable("huge", {std::numeric_limits::max() / 2, 3}, {0, 0}, {1, 
1}, sizeof(double)); + XBT_INFO("Calling get_global_size() should trigger an overflow exception"); + ASSERT_THROW(var->get_global_size(), std::overflow_error); + XBT_INFO("Disconnect the actor from the DTL"); + ASSERT_NO_THROW(dtlmod::DTL::disconnect()); + }); + + // Run the simulation + ASSERT_NO_THROW(sg4::Engine::get_instance()->run()); + }); +} + TEST_F(DTLVariableTest, MultiDefineVariable) { DO_TEST_WITH_FORK([this]() { diff --git a/test/python/dtl_variable.py b/test/python/dtl_variable.py index d5348af..ce5e23c 100644 --- a/test/python/dtl_variable.py +++ b/test/python/dtl_variable.py @@ -131,6 +131,30 @@ def inconsistent_variable_definition(): host.add_actor("TestActor", inconsistent_variable_definition) e.run() +def run_test_overflow_variable_size(): + e, host = setup_platform() + + def overflow_variable_size(): + this_actor.info("Connect to the DTL") + dtl = DTL.connect() + this_actor.info("Create a stream") + stream = dtl.add_stream("Stream") + this_actor.info("Define a variable whose dimensions overflow size_t when computing global size") + max_size_t = ctypes.c_size_t(-1).value + var = stream.define_variable("huge", (max_size_t // 2, 3), (0, 0), (1, 1), + ctypes.sizeof(ctypes.c_double)) + this_actor.info("Calling global_size should trigger an overflow exception") + try: + _ = var.global_size + assert False, "Expected overflow exception was not raised" + except OverflowError: + pass # Test passes + this_actor.info("Disconnect the actor from the DTL") + DTL.disconnect() + + host.add_actor("TestActor", overflow_variable_size) + e.run() + def run_test_multi_define_variable(): e, host = setup_platform() @@ -337,6 +361,7 @@ def consumer(): tests = [ run_test_define_variable, run_test_inconsistent_variable_definition, + run_test_overflow_variable_size, run_test_multi_define_variable, run_test_distributed_variable, run_test_remove_variable, From 542eea229b4969fb46121ddbdbd68511c3d49d45 Mon Sep 17 00:00:00 2001 From: Fred Suter Date: Wed, 11 Feb 2026 12:50:43 
-0500 Subject: [PATCH 19/92] prevent nodiscard warning --- test/dtl_variable.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/dtl_variable.cpp b/test/dtl_variable.cpp index 7cc192c..8ea03d8 100644 --- a/test/dtl_variable.cpp +++ b/test/dtl_variable.cpp @@ -139,7 +139,7 @@ TEST_F(DTLVariableTest, OverflowVariableSize) auto var = stream->define_variable("huge", {std::numeric_limits::max() / 2, 3}, {0, 0}, {1, 1}, sizeof(double)); XBT_INFO("Calling get_global_size() should trigger an overflow exception"); - ASSERT_THROW(var->get_global_size(), std::overflow_error); + ASSERT_THROW((void)var->get_global_size(), std::overflow_error); XBT_INFO("Disconnect the actor from the DTL"); ASSERT_NO_THROW(dtlmod::DTL::disconnect()); }); From a6d1c74f4743c468f11039492bab892bd68be5ec Mon Sep 17 00:00:00 2001 From: Fred Suter Date: Wed, 11 Feb 2026 12:51:47 -0500 Subject: [PATCH 20/92] new test where subscribers arrive firsts and wait for publishers --- test/dtl_stream.cpp | 39 +++++++++++++++++++++++++++++++++++++++ test/python/dtl_stream.py | 34 ++++++++++++++++++++++++++++++++++ 2 files changed, 73 insertions(+) diff --git a/test/dtl_stream.cpp b/test/dtl_stream.cpp index eea102e..c2886d4 100644 --- a/test/dtl_stream.cpp +++ b/test/dtl_stream.cpp @@ -164,6 +164,45 @@ TEST_F(DTLStreamTest, PublishFileStreamOpenClose) }); } +TEST_F(DTLStreamTest, SubscriberWaitsForEngine) +{ + DO_TEST_WITH_FORK([this]() { + this->setup_platform(); + // Publisher sleeps before opening, so subscriber arrives first and waits for engine creation + prod_host_->add_actor("TestProducerActor", [this]() { + auto dtl = dtlmod::DTL::connect(); + auto stream = dtl->add_stream("Stream"); + stream->set_transport_method(dtlmod::Transport::Method::File); + stream->set_engine_type(dtlmod::Engine::Type::File); + XBT_INFO("Publisher sleeps before opening the stream"); + sg4::this_actor::sleep_for(1); + XBT_INFO("Publisher opens the stream, creating the engine"); + auto engine = 
stream->open("zone:fs:/pfs/file", dtlmod::Stream::Mode::Publish); + ASSERT_EQ(stream->get_num_publishers(), 1U); + sg4::this_actor::sleep_for(1); + engine->close(); + dtlmod::DTL::disconnect(); + }); + + // Subscriber opens immediately, will block in wait_for_engine_creation() + cons_host_->add_actor("TestSubscriberActor", [this]() { + auto dtl = dtlmod::DTL::connect(); + auto stream = dtl->add_stream("Stream"); + stream->set_transport_method(dtlmod::Transport::Method::File); + stream->set_engine_type(dtlmod::Engine::Type::File); + XBT_INFO("Subscriber opens the stream before the publisher, should wait for engine creation"); + auto engine = stream->open("zone:fs:/pfs/file", dtlmod::Stream::Mode::Subscribe); + XBT_INFO("Subscriber unblocked, engine is now available"); + ASSERT_TRUE(engine != nullptr); + engine->close(); + dtlmod::DTL::disconnect(); + }); + + // Run the simulation + ASSERT_NO_THROW(sg4::Engine::get_instance()->run()); + }); +} + TEST_F(DTLStreamTest, PublishFileMultipleOpen) { DO_TEST_WITH_FORK([this]() { diff --git a/test/python/dtl_stream.py b/test/python/dtl_stream.py index 6b98b73..1888f00 100644 --- a/test/python/dtl_stream.py +++ b/test/python/dtl_stream.py @@ -162,6 +162,39 @@ def test_producer_actor(): e.run() +def run_test_subscriber_waits_for_engine(): + e, prod_host, cons_host = setup_platform() + + def publisher(): + dtl = DTL.connect() + stream = dtl.add_stream("Stream") + stream.set_transport_method(Transport.Method.File) + stream.set_engine_type(DTLEngine.Type.File) + this_actor.info("Publisher sleeps before opening the stream") + this_actor.sleep_for(1) + this_actor.info("Publisher opens the stream, creating the engine") + engine = stream.open("zone:fs:/pfs/file", Stream.Mode.Publish) + assert stream.num_publishers == 1 + this_actor.sleep_for(1) + engine.close() + DTL.disconnect() + + def subscriber(): + dtl = DTL.connect() + stream = dtl.add_stream("Stream") + stream.set_transport_method(Transport.Method.File) + 
stream.set_engine_type(DTLEngine.Type.File) + this_actor.info("Subscriber opens the stream before the publisher, should wait for engine creation") + engine = stream.open("zone:fs:/pfs/file", Stream.Mode.Subscribe) + this_actor.info("Subscriber unblocked, engine is now available") + assert engine is not None + engine.close() + DTL.disconnect() + + prod_host.add_actor("TestProducerActor", publisher) + cons_host.add_actor("TestSubscriberActor", subscriber) + e.run() + def run_test_publish_file_muliple_open(): e, prod_host, cons_host = setup_platform() def test_producer_actor(): @@ -216,6 +249,7 @@ def test_consumer_actor(): tests = [ run_test_incorrect_stream_settings, run_test_publish_file_stream_open_close, + run_test_subscriber_waits_for_engine, run_test_publish_file_muliple_open ] From 434cd3ef376a097fe099d3b74b3712fa4456bcdb Mon Sep 17 00:00:00 2001 From: Fred Suter Date: Wed, 11 Feb 2026 13:24:57 -0500 Subject: [PATCH 21/92] do not guard against race conditions that cannot happen thanks to SimGrid's maestro orchestration --- include/dtlmod/Stream.hpp | 5 +---- src/Stream.cpp | 16 ---------------- test/dtl_stream.cpp | 39 --------------------------------------- test/python/dtl_stream.py | 34 ---------------------------------- 4 files changed, 1 insertion(+), 93 deletions(-) diff --git a/include/dtlmod/Stream.hpp b/include/dtlmod/Stream.hpp index 6632e70..6f92ee0 100644 --- a/include/dtlmod/Stream.hpp +++ b/include/dtlmod/Stream.hpp @@ -7,7 +7,6 @@ #define __DTLMOD_STREAM_HPP__ #include -#include #include "dtlmod/Engine.hpp" @@ -47,8 +46,7 @@ class Stream : public std::enable_shared_from_this { Transport::Method transport_method_ = Transport::Method::Undefined; bool metadata_export_ = false; std::string metadata_file_; - sg4::MutexPtr mutex_ = sg4::Mutex::create(); - sg4::ConditionVariablePtr engine_created_ = sg4::ConditionVariable::create(); + sg4::MutexPtr mutex_ = sg4::Mutex::create(); Mode access_mode_; std::unordered_map> variables_; @@ -68,7 +66,6 @@ class 
Stream : public std::enable_shared_from_this { // Helper methods for Stream::open void validate_open_parameters(std::string_view name, Mode mode) const; void create_engine_if_needed(std::string_view name, Mode mode); - void wait_for_engine_creation(); void register_actor_with_engine(Mode mode) const; // Helper method for Stream::define_variable diff --git a/src/Stream.cpp b/src/Stream.cpp index 568c102..7203301 100644 --- a/src/Stream.cpp +++ b/src/Stream.cpp @@ -190,27 +190,12 @@ void Stream::create_engine_if_needed(std::string_view name, Mode mode) if (metadata_export_) metadata_file_ = boost::replace_all_copy(engine_->get_name(), "/", "#") + "#md." + std::to_string(std::chrono::system_clock::now().time_since_epoch().count()); - engine_created_->notify_all(); } catch (...) { - // Notify waiting threads even on failure so they don't deadlock - engine_created_->notify_all(); throw; // Re-throw the exception } } } -/// Wait for the Engine to be created by another actor if needed. -void Stream::wait_for_engine_creation() -{ - if (not engine_) { - std::unique_lock lock(*mutex_); - while (not engine_) { - XBT_DEBUG("%s waits for the creation of the engine", sg4::this_actor::get_cname()); - engine_created_->wait(lock); - } - } -} - /// Register the current actor as a publisher or subscriber with the Engine. void Stream::register_actor_with_engine(Mode mode) const { @@ -232,7 +217,6 @@ std::shared_ptr Stream::open(std::string_view name, Mode mode) { validate_open_parameters(name, mode); create_engine_if_needed(name, mode); - wait_for_engine_creation(); register_actor_with_engine(mode); XBT_DEBUG("Stream '%s' uses engine '%s' and transport '%s' (%zu Pub. 
/ %zu Sub.)", get_cname(), get_engine_type_str().value_or("Unknown"), get_transport_method_str().value_or("Unknown"), diff --git a/test/dtl_stream.cpp b/test/dtl_stream.cpp index c2886d4..eea102e 100644 --- a/test/dtl_stream.cpp +++ b/test/dtl_stream.cpp @@ -164,45 +164,6 @@ TEST_F(DTLStreamTest, PublishFileStreamOpenClose) }); } -TEST_F(DTLStreamTest, SubscriberWaitsForEngine) -{ - DO_TEST_WITH_FORK([this]() { - this->setup_platform(); - // Publisher sleeps before opening, so subscriber arrives first and waits for engine creation - prod_host_->add_actor("TestProducerActor", [this]() { - auto dtl = dtlmod::DTL::connect(); - auto stream = dtl->add_stream("Stream"); - stream->set_transport_method(dtlmod::Transport::Method::File); - stream->set_engine_type(dtlmod::Engine::Type::File); - XBT_INFO("Publisher sleeps before opening the stream"); - sg4::this_actor::sleep_for(1); - XBT_INFO("Publisher opens the stream, creating the engine"); - auto engine = stream->open("zone:fs:/pfs/file", dtlmod::Stream::Mode::Publish); - ASSERT_EQ(stream->get_num_publishers(), 1U); - sg4::this_actor::sleep_for(1); - engine->close(); - dtlmod::DTL::disconnect(); - }); - - // Subscriber opens immediately, will block in wait_for_engine_creation() - cons_host_->add_actor("TestSubscriberActor", [this]() { - auto dtl = dtlmod::DTL::connect(); - auto stream = dtl->add_stream("Stream"); - stream->set_transport_method(dtlmod::Transport::Method::File); - stream->set_engine_type(dtlmod::Engine::Type::File); - XBT_INFO("Subscriber opens the stream before the publisher, should wait for engine creation"); - auto engine = stream->open("zone:fs:/pfs/file", dtlmod::Stream::Mode::Subscribe); - XBT_INFO("Subscriber unblocked, engine is now available"); - ASSERT_TRUE(engine != nullptr); - engine->close(); - dtlmod::DTL::disconnect(); - }); - - // Run the simulation - ASSERT_NO_THROW(sg4::Engine::get_instance()->run()); - }); -} - TEST_F(DTLStreamTest, PublishFileMultipleOpen) { DO_TEST_WITH_FORK([this]() { 
diff --git a/test/python/dtl_stream.py b/test/python/dtl_stream.py index 1888f00..6b98b73 100644 --- a/test/python/dtl_stream.py +++ b/test/python/dtl_stream.py @@ -162,39 +162,6 @@ def test_producer_actor(): e.run() -def run_test_subscriber_waits_for_engine(): - e, prod_host, cons_host = setup_platform() - - def publisher(): - dtl = DTL.connect() - stream = dtl.add_stream("Stream") - stream.set_transport_method(Transport.Method.File) - stream.set_engine_type(DTLEngine.Type.File) - this_actor.info("Publisher sleeps before opening the stream") - this_actor.sleep_for(1) - this_actor.info("Publisher opens the stream, creating the engine") - engine = stream.open("zone:fs:/pfs/file", Stream.Mode.Publish) - assert stream.num_publishers == 1 - this_actor.sleep_for(1) - engine.close() - DTL.disconnect() - - def subscriber(): - dtl = DTL.connect() - stream = dtl.add_stream("Stream") - stream.set_transport_method(Transport.Method.File) - stream.set_engine_type(DTLEngine.Type.File) - this_actor.info("Subscriber opens the stream before the publisher, should wait for engine creation") - engine = stream.open("zone:fs:/pfs/file", Stream.Mode.Subscribe) - this_actor.info("Subscriber unblocked, engine is now available") - assert engine is not None - engine.close() - DTL.disconnect() - - prod_host.add_actor("TestProducerActor", publisher) - cons_host.add_actor("TestSubscriberActor", subscriber) - e.run() - def run_test_publish_file_muliple_open(): e, prod_host, cons_host = setup_platform() def test_producer_actor(): @@ -249,7 +216,6 @@ def test_consumer_actor(): tests = [ run_test_incorrect_stream_settings, run_test_publish_file_stream_open_close, - run_test_subscriber_waits_for_engine, run_test_publish_file_muliple_open ] From 9a76dd6db9c0fd8486cc8dfa08a1f346186d5156 Mon Sep 17 00:00:00 2001 From: Fred Suter Date: Wed, 11 Feb 2026 13:49:03 -0500 Subject: [PATCH 22/92] strip branch attributes so SonarQube uses line coverage only --- .github/workflows/build.yml | 3 +++ 1 file 
changed, 3 insertions(+) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 2191eca..d67422b 100755 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -122,6 +122,9 @@ jobs: lcov --ignore-errors mismatch --remove coverage.info '*/test/*' -o coverage.info gcovr --root .. -e '../test' -e '../examples' --sonarqube -o coverage.xml --exclude-throw-branches \ --gcov-ignore-parse-errors --exclude-unreachable-branches + # Strip branch attributes so SonarQube uses line coverage only + # (gcovr's sonarqube format includes branch data that penalizes macro-generated conditionals like XBT_DEBUG) + sed -i 's/ branchesToCover="[^"]*" coveredBranches="[^"]*"//g' coverage.xml - name: Upload coverage to Codecov if: env.CODECOV_TOKEN != '' From 7cd289b36f4b8ada2016e755c6a4569435ecef5a Mon Sep 17 00:00:00 2001 From: Fred Suter Date: Wed, 11 Feb 2026 14:11:09 -0500 Subject: [PATCH 23/92] remove now useless try/catch --- src/Stream.cpp | 30 +++++++++++++----------------- 1 file changed, 13 insertions(+), 17 deletions(-) diff --git a/src/Stream.cpp b/src/Stream.cpp index 7203301..cc6707e 100644 --- a/src/Stream.cpp +++ b/src/Stream.cpp @@ -175,24 +175,20 @@ void Stream::create_engine_if_needed(std::string_view name, Mode mode) if (not engine_) { std::shared_ptr temp_engine; - try { - if (engine_type_ == Engine::Type::Staging) { - temp_engine = std::make_shared(name, shared_from_this()); - temp_engine->create_transport(transport_method_); - } else if (engine_type_ == Engine::Type::File) { - temp_engine = std::make_shared(name, shared_from_this()); - temp_engine->create_transport(transport_method_); - } - - // Only commit if fully initialized - engine_ = std::move(temp_engine); - access_mode_ = mode; - if (metadata_export_) - metadata_file_ = boost::replace_all_copy(engine_->get_name(), "/", "#") + "#md." + - std::to_string(std::chrono::system_clock::now().time_since_epoch().count()); - } catch (...) 
{ - throw; // Re-throw the exception + if (engine_type_ == Engine::Type::Staging) { + temp_engine = std::make_shared(name, shared_from_this()); + temp_engine->create_transport(transport_method_); + } else if (engine_type_ == Engine::Type::File) { + temp_engine = std::make_shared(name, shared_from_this()); + temp_engine->create_transport(transport_method_); } + + // Only commit if fully initialized + engine_ = std::move(temp_engine); + access_mode_ = mode; + if (metadata_export_) + metadata_file_ = boost::replace_all_copy(engine_->get_name(), "/", "#") + "#md." + + std::to_string(std::chrono::system_clock::now().time_since_epoch().count()); } } From 4fbeb8ed821fb276bca048cbacf2d854cb89f40e Mon Sep 17 00:00:00 2001 From: Fred Suter Date: Thu, 12 Feb 2026 12:55:58 -0500 Subject: [PATCH 24/92] use python tests in coverage reports --- CMakeLists.txt | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 67dae7d..b1fd6a9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -157,6 +157,12 @@ if(enable_python) set_property(TARGET python-bindings APPEND PROPERTY INCLUDE_DIRECTORIES "${INTERNAL_INCLUDES}") + # Enable coverage for Debug builds + if(CMAKE_BUILD_TYPE STREQUAL "Debug") + target_compile_options(python-bindings PRIVATE -g -O0 --coverage) + target_link_options(python-bindings PRIVATE --coverage) + endif() + if("${DTLMOD_PYTHON_LIBDIR}" STREQUAL "") # value not manually set if("${CMAKE_INSTALL_PREFIX}" STREQUAL "/usr") set(DTLMOD_PYTHON_LIBDIR ${Python3_SITEARCH}) From 52708d86ecf0db8cfba460a749c7cc9e631d14fc Mon Sep 17 00:00:00 2001 From: Fred Suter Date: Thu, 12 Feb 2026 18:47:29 -0500 Subject: [PATCH 25/92] test DTL::get_all_streams() --- test/dtl_config.cpp | 7 +++++++ test/python/dtl_config.py | 7 +++++++ 2 files changed, 14 insertions(+) diff --git a/test/dtl_config.cpp b/test/dtl_config.cpp index 39f4677..1f1b1bd 100644 --- a/test/dtl_config.cpp +++ b/test/dtl_config.cpp @@ -97,6 +97,13 @@ TEST_F(DTLConfigTest, 
ConfigFile) XBT_INFO("Close the engine"); ASSERT_NO_THROW(engine->close()); + XBT_INFO("Check get_all_streams returns both configured streams"); + const auto& all_streams = dtl->get_all_streams(); + ASSERT_EQ(all_streams.size(), 3U); + ASSERT_TRUE(all_streams.find("Stream1") != all_streams.end()); + ASSERT_TRUE(all_streams.find("Stream2") != all_streams.end()); + ASSERT_TRUE(all_streams.find("Stream3") != all_streams.end()); + XBT_INFO("Disconnect the actor from the DTL"); ASSERT_NO_THROW(dtlmod::DTL::disconnect()); }); diff --git a/test/python/dtl_config.py b/test/python/dtl_config.py index bc8d42b..648eca9 100644 --- a/test/python/dtl_config.py +++ b/test/python/dtl_config.py @@ -64,6 +64,13 @@ def test_config_file(): this_actor.info("Close the engine") engine.close() + this_actor.info("Check all_streams returns both configured streams") + all_streams = dtl.all_streams + assert len(all_streams) == 3 + assert "Stream1" in all_streams + assert "Stream2" in all_streams + assert "Stream3" in all_streams + this_actor.info("Disconnect from the DTL") DTL.disconnect() From e345cf6a9c02497534648987699152717ee12495 Mon Sep 17 00:00:00 2001 From: Fred Suter Date: Thu, 12 Feb 2026 18:48:53 -0500 Subject: [PATCH 26/92] improve coverage of python bindings using named free functions instead of lambdas --- src/bindings/python/dtlmod_python.cpp | 95 ++++++++++++++++----------- 1 file changed, 55 insertions(+), 40 deletions(-) diff --git a/src/bindings/python/dtlmod_python.cpp b/src/bindings/python/dtlmod_python.cpp index 79a62b5..5ec2920 100644 --- a/src/bindings/python/dtlmod_python.cpp +++ b/src/bindings/python/dtlmod_python.cpp @@ -43,6 +43,46 @@ std::string get_dtlmod_version() dtlmod_version_get(&major, &minor, &patch); return simgrid::xbt::string_printf("%i.%i.%i", major, minor, patch); } + +py::object stream_engine_type_str(const Stream& self) +{ + auto result = self.get_engine_type_str(); + return result ? 
py::cast(*result) : py::cast(Py_None); +} + +py::object stream_transport_method_str(const Stream& self) +{ + auto result = self.get_transport_method_str(); + return result ? py::cast(*result) : py::cast(Py_None); +} + +std::shared_ptr dtl_stream_by_name(const DTL& self, std::string_view name) +{ + return self.get_stream_by_name(name).value_or(nullptr); +} + +std::shared_ptr stream_define_scalar_variable(Stream& self, std::string_view name, size_t element_size) +{ + return self.define_variable(name, element_size); +} + +std::shared_ptr stream_define_variable(Stream& self, std::string_view name, const std::vector& shape, + const std::vector& start, const std::vector& count, + size_t element_size) +{ + return self.define_variable(name, shape, start, count, element_size); +} + +void variable_set_transaction_selection_single(Variable& self, unsigned int transaction_id) +{ + self.set_transaction_selection(transaction_id); +} + +void variable_set_transaction_selection_range(Variable& self, unsigned int begin, unsigned int count) +{ + self.set_transaction_selection(begin, count); +} + } // namespace PYBIND11_MODULE(dtlmod, m) @@ -86,29 +126,17 @@ PYBIND11_MODULE(dtlmod, m) "Add a data stream to the DTL") .def_property_readonly("all_streams", &DTL::get_all_streams, "Retrieve all streams declared in the DTL (read-only)") - .def( - "stream_by_name", - [](const DTL& self, std::string_view name) { return self.get_stream_by_name(name).value_or(nullptr); }, - py::arg("name"), "Retrieve a data stream from the DTL by its name (returns None if not found)"); + .def("stream_by_name", &dtl_stream_by_name, py::arg("name"), + "Retrieve a data stream from the DTL by its name (returns None if not found)"); /* Class Stream */ py::class_> stream( m, "Stream", "A Stream defines the connection between the applications that produce or consume data and the DTL"); stream.def_property_readonly("name", &Stream::get_name, "The name of the Stream (read-only)") - .def_property_readonly( - 
"engine_type", - [](const Stream& self) { - auto result = self.get_engine_type_str(); - return result ? py::cast(*result) : py::cast(Py_None); - }, - "Print out the engine type of this Stream (read-only, returns None if invalid)") - .def_property_readonly( - "transport_method", - [](const Stream& self) { - auto result = self.get_transport_method_str(); - return result ? py::cast(*result) : py::cast(Py_None); - }, - "Print out the transport method of this Stream (read-only, returns None if invalid)") + .def_property_readonly("engine_type", &stream_engine_type_str, + "Print out the engine type of this Stream (read-only, returns None if invalid)") + .def_property_readonly("transport_method", &stream_transport_method_str, + "Print out the transport method of this Stream (read-only, returns None if invalid)") .def_property_readonly("access_mode", &Stream::get_access_mode_str, "Print out the access mode of this Stream (read-only)") .def_property_readonly("metadata_export", &Stream::does_export_metadata, @@ -129,20 +157,11 @@ PYBIND11_MODULE(dtlmod, m) .def_property_readonly("num_subscribers", &Stream::get_num_subscribers, "The number of actors connected to this Stream in Mode::Subscribe (read-only)") // Variable factory - .def( - "define_variable", - [](Stream& self, std::string_view name, size_t element_size) { - return self.define_variable(name, element_size); - }, - py::call_guard(), py::arg("name"), py::arg("element_size"), - "Define a scalar variable for this Stream") - .def( - "define_variable", - [](Stream& self, std::string_view name, const std::vector& shape, const std::vector& start, - const std::vector& count, - size_t element_size) { return self.define_variable(name, shape, start, count, element_size); }, - py::call_guard(), py::arg("name"), py::arg("shape"), py::arg("start"), - py::arg("count"), py::arg("element_size"), "Define a variable for this Stream") + .def("define_variable", &stream_define_scalar_variable, py::call_guard(), py::arg("name"), + 
py::arg("element_size"), "Define a scalar variable for this Stream") + .def("define_variable", &stream_define_variable, py::call_guard(), py::arg("name"), + py::arg("shape"), py::arg("start"), py::arg("count"), py::arg("element_size"), + "Define a variable for this Stream") .def_property_readonly("all_variables", &Stream::get_all_variables, "Retrieve the list of Variables by names") .def_property_readonly("metadata_file_name", &Stream::get_metadata_file_name, "The name of the file in which the stream stores metadata (read-only)") @@ -163,14 +182,10 @@ PYBIND11_MODULE(dtlmod, m) .def_property_readonly("local_size", &Variable::get_local_size, "The local size of the Variable for the current actor (read-only)") .def_property_readonly("global_size", &Variable::get_global_size, "The global size of the Variable (read-only)") - .def( - "set_transaction_selection", - [](Variable& self, unsigned int transaction_id) { self.set_transaction_selection(transaction_id); }, - py::arg("transaction_id"), "Set the selection of transactions to consider for this Variable") - .def( - "set_transaction_selection", - [](Variable& self, unsigned int begin, unsigned int count) { self.set_transaction_selection(begin, count); }, - py::arg("begin"), py::arg("count"), "Set the selection of transactions to consider for this Variable") + .def("set_transaction_selection", &variable_set_transaction_selection_single, py::arg("transaction_id"), + "Set the selection of transactions to consider for this Variable") + .def("set_transaction_selection", &variable_set_transaction_selection_range, py::arg("begin"), py::arg("count"), + "Set the selection of transactions to consider for this Variable") .def("set_selection", &Variable::set_selection, py::arg("start"), py::arg("count"), "Set the selection of elements to consider for this Variable"); From b23f68d70e720fc0325f1c62efce2c193ca6101c Mon Sep 17 00:00:00 2001 From: Fred Suter Date: Thu, 12 Feb 2026 19:00:51 -0500 Subject: [PATCH 27/92] Revert "improve 
coverage of python bindings using named free functions instead of lambdas" This reverts commit e345cf6a9c02497534648987699152717ee12495. --- src/bindings/python/dtlmod_python.cpp | 95 +++++++++++---------------- 1 file changed, 40 insertions(+), 55 deletions(-) diff --git a/src/bindings/python/dtlmod_python.cpp b/src/bindings/python/dtlmod_python.cpp index 5ec2920..79a62b5 100644 --- a/src/bindings/python/dtlmod_python.cpp +++ b/src/bindings/python/dtlmod_python.cpp @@ -43,46 +43,6 @@ std::string get_dtlmod_version() dtlmod_version_get(&major, &minor, &patch); return simgrid::xbt::string_printf("%i.%i.%i", major, minor, patch); } - -py::object stream_engine_type_str(const Stream& self) -{ - auto result = self.get_engine_type_str(); - return result ? py::cast(*result) : py::cast(Py_None); -} - -py::object stream_transport_method_str(const Stream& self) -{ - auto result = self.get_transport_method_str(); - return result ? py::cast(*result) : py::cast(Py_None); -} - -std::shared_ptr dtl_stream_by_name(const DTL& self, std::string_view name) -{ - return self.get_stream_by_name(name).value_or(nullptr); -} - -std::shared_ptr stream_define_scalar_variable(Stream& self, std::string_view name, size_t element_size) -{ - return self.define_variable(name, element_size); -} - -std::shared_ptr stream_define_variable(Stream& self, std::string_view name, const std::vector& shape, - const std::vector& start, const std::vector& count, - size_t element_size) -{ - return self.define_variable(name, shape, start, count, element_size); -} - -void variable_set_transaction_selection_single(Variable& self, unsigned int transaction_id) -{ - self.set_transaction_selection(transaction_id); -} - -void variable_set_transaction_selection_range(Variable& self, unsigned int begin, unsigned int count) -{ - self.set_transaction_selection(begin, count); -} - } // namespace PYBIND11_MODULE(dtlmod, m) @@ -126,17 +86,29 @@ PYBIND11_MODULE(dtlmod, m) "Add a data stream to the DTL") 
.def_property_readonly("all_streams", &DTL::get_all_streams, "Retrieve all streams declared in the DTL (read-only)") - .def("stream_by_name", &dtl_stream_by_name, py::arg("name"), - "Retrieve a data stream from the DTL by its name (returns None if not found)"); + .def( + "stream_by_name", + [](const DTL& self, std::string_view name) { return self.get_stream_by_name(name).value_or(nullptr); }, + py::arg("name"), "Retrieve a data stream from the DTL by its name (returns None if not found)"); /* Class Stream */ py::class_> stream( m, "Stream", "A Stream defines the connection between the applications that produce or consume data and the DTL"); stream.def_property_readonly("name", &Stream::get_name, "The name of the Stream (read-only)") - .def_property_readonly("engine_type", &stream_engine_type_str, - "Print out the engine type of this Stream (read-only, returns None if invalid)") - .def_property_readonly("transport_method", &stream_transport_method_str, - "Print out the transport method of this Stream (read-only, returns None if invalid)") + .def_property_readonly( + "engine_type", + [](const Stream& self) { + auto result = self.get_engine_type_str(); + return result ? py::cast(*result) : py::cast(Py_None); + }, + "Print out the engine type of this Stream (read-only, returns None if invalid)") + .def_property_readonly( + "transport_method", + [](const Stream& self) { + auto result = self.get_transport_method_str(); + return result ? 
py::cast(*result) : py::cast(Py_None); + }, + "Print out the transport method of this Stream (read-only, returns None if invalid)") .def_property_readonly("access_mode", &Stream::get_access_mode_str, "Print out the access mode of this Stream (read-only)") .def_property_readonly("metadata_export", &Stream::does_export_metadata, @@ -157,11 +129,20 @@ PYBIND11_MODULE(dtlmod, m) .def_property_readonly("num_subscribers", &Stream::get_num_subscribers, "The number of actors connected to this Stream in Mode::Subscribe (read-only)") // Variable factory - .def("define_variable", &stream_define_scalar_variable, py::call_guard(), py::arg("name"), - py::arg("element_size"), "Define a scalar variable for this Stream") - .def("define_variable", &stream_define_variable, py::call_guard(), py::arg("name"), - py::arg("shape"), py::arg("start"), py::arg("count"), py::arg("element_size"), - "Define a variable for this Stream") + .def( + "define_variable", + [](Stream& self, std::string_view name, size_t element_size) { + return self.define_variable(name, element_size); + }, + py::call_guard(), py::arg("name"), py::arg("element_size"), + "Define a scalar variable for this Stream") + .def( + "define_variable", + [](Stream& self, std::string_view name, const std::vector& shape, const std::vector& start, + const std::vector& count, + size_t element_size) { return self.define_variable(name, shape, start, count, element_size); }, + py::call_guard(), py::arg("name"), py::arg("shape"), py::arg("start"), + py::arg("count"), py::arg("element_size"), "Define a variable for this Stream") .def_property_readonly("all_variables", &Stream::get_all_variables, "Retrieve the list of Variables by names") .def_property_readonly("metadata_file_name", &Stream::get_metadata_file_name, "The name of the file in which the stream stores metadata (read-only)") @@ -182,10 +163,14 @@ PYBIND11_MODULE(dtlmod, m) .def_property_readonly("local_size", &Variable::get_local_size, "The local size of the Variable for the 
current actor (read-only)") .def_property_readonly("global_size", &Variable::get_global_size, "The global size of the Variable (read-only)") - .def("set_transaction_selection", &variable_set_transaction_selection_single, py::arg("transaction_id"), - "Set the selection of transactions to consider for this Variable") - .def("set_transaction_selection", &variable_set_transaction_selection_range, py::arg("begin"), py::arg("count"), - "Set the selection of transactions to consider for this Variable") + .def( + "set_transaction_selection", + [](Variable& self, unsigned int transaction_id) { self.set_transaction_selection(transaction_id); }, + py::arg("transaction_id"), "Set the selection of transactions to consider for this Variable") + .def( + "set_transaction_selection", + [](Variable& self, unsigned int begin, unsigned int count) { self.set_transaction_selection(begin, count); }, + py::arg("begin"), py::arg("count"), "Set the selection of transactions to consider for this Variable") .def("set_selection", &Variable::set_selection, py::arg("start"), py::arg("count"), "Set the selection of elements to consider for this Variable"); From a81418e3ebeff0e8513d9588d4e5f133ef2082fd Mon Sep 17 00:00:00 2001 From: Fred Suter Date: Thu, 12 Feb 2026 19:03:09 -0500 Subject: [PATCH 28/92] exclude bindings definition from coverage report, it's glue code, not application logic --- .github/workflows/build.yml | 4 ++-- sonar-project.properties | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index d67422b..8aec1c2 100755 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -119,8 +119,8 @@ jobs: run: | cd build lcov --keep-going --ignore-errors mismatch --directory . --capture --output-file coverage.info - lcov --ignore-errors mismatch --remove coverage.info '*/test/*' -o coverage.info - gcovr --root .. 
-e '../test' -e '../examples' --sonarqube -o coverage.xml --exclude-throw-branches \ + lcov --ignore-errors mismatch --remove coverage.info '*/test/*' '*/bindings/*' -o coverage.info + gcovr --root .. -e '../test' -e '../examples' -e '../src/bindings' --sonarqube -o coverage.xml --exclude-throw-branches \ --gcov-ignore-parse-errors --exclude-unreachable-branches # Strip branch attributes so SonarQube uses line coverage only # (gcovr's sonarqube format includes branch data that penalizes macro-generated conditionals like XBT_DEBUG) diff --git a/sonar-project.properties b/sonar-project.properties index 5bf04b8..3c44fe4 100644 --- a/sonar-project.properties +++ b/sonar-project.properties @@ -10,7 +10,7 @@ sonar.links.scm=https://github.com/simgrid/DTLMod/ # Comma-separated paths to directories with sources (required) sonar.sources=src,include sonar.tests=test -sonar.coverage.exclusions=test/** # do not consider test directory for coverage +sonar.coverage.exclusions=test/**,src/bindings/** # do not consider test directory and Python bindings glue for coverage sonar.cfamily.reportingCppStandardOverride=c++17 From a989425534739742a9e69853a420a024ff5f3ebc Mon Sep 17 00:00:00 2001 From: Fred Suter Date: Thu, 12 Feb 2026 19:10:37 -0500 Subject: [PATCH 29/92] fix coverage reporting --- .github/workflows/build.yml | 2 +- sonar-project.properties | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 8aec1c2..78568f6 100755 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -120,7 +120,7 @@ jobs: cd build lcov --keep-going --ignore-errors mismatch --directory . --capture --output-file coverage.info lcov --ignore-errors mismatch --remove coverage.info '*/test/*' '*/bindings/*' -o coverage.info - gcovr --root .. -e '../test' -e '../examples' -e '../src/bindings' --sonarqube -o coverage.xml --exclude-throw-branches \ + gcovr --root .. 
-e '../test' -e '../src/bindings' --sonarqube -o coverage.xml --exclude-throw-branches \ --gcov-ignore-parse-errors --exclude-unreachable-branches # Strip branch attributes so SonarQube uses line coverage only # (gcovr's sonarqube format includes branch data that penalizes macro-generated conditionals like XBT_DEBUG) diff --git a/sonar-project.properties b/sonar-project.properties index 3c44fe4..f30ce31 100644 --- a/sonar-project.properties +++ b/sonar-project.properties @@ -9,8 +9,9 @@ sonar.projectVersion=0.2 sonar.links.scm=https://github.com/simgrid/DTLMod/ # Comma-separated paths to directories with sources (required) sonar.sources=src,include +sonar.exclusions=src/bindings/** sonar.tests=test -sonar.coverage.exclusions=test/**,src/bindings/** # do not consider test directory and Python bindings glue for coverage +sonar.coverage.exclusions=test/** # do not consider test directory for coverage sonar.cfamily.reportingCppStandardOverride=c++17 From 021b490ec89e25eb96b7dfce974d9f75af8a5567 Mon Sep 17 00:00:00 2001 From: Fred Suter Date: Thu, 12 Feb 2026 19:23:08 -0500 Subject: [PATCH 30/92] coverage false negative: NVRO artifacts and defensive guards --- include/dtlmod/ActorRegistry.hpp | 4 ++-- src/FileEngine.cpp | 2 +- src/StagingEngine.cpp | 2 +- src/Stream.cpp | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/include/dtlmod/ActorRegistry.hpp b/include/dtlmod/ActorRegistry.hpp index 38cbec2..e323606 100644 --- a/include/dtlmod/ActorRegistry.hpp +++ b/include/dtlmod/ActorRegistry.hpp @@ -46,8 +46,8 @@ class ActorRegistry { [[nodiscard]] bool contains(sg4::ActorPtr actor) const noexcept { - if (!actor) - return false; // Safe handling for noexcept + if (!actor) // LCOV_EXCL_LINE + return false; // LCOV_EXCL_LINE return actors_.find(actor) != actors_.end(); } diff --git a/src/FileEngine.cpp b/src/FileEngine.cpp index 3f709b8..dbffb02 100644 --- a/src/FileEngine.cpp +++ b/src/FileEngine.cpp @@ -77,7 +77,7 @@ std::shared_ptr 
FileEngine::get_file_transport() const if (!transport) // LCOV_EXCL_LINE throw TransportEngineMismatchException(XBT_THROW_POINT, "Transport is not a FileTransport"); // LCOV_EXCL_LINE return transport; -} +} // LCOV_EXCL_LINE std::string FileEngine::get_path_to_dataset() const { diff --git a/src/StagingEngine.cpp b/src/StagingEngine.cpp index 8b02c14..443c7fd 100644 --- a/src/StagingEngine.cpp +++ b/src/StagingEngine.cpp @@ -39,7 +39,7 @@ std::shared_ptr StagingEngine::get_staging_transport() const if (!transport) // LCOV_EXCL_LINE throw TransportEngineMismatchException(XBT_THROW_POINT, "Transport is not a StagingTransport"); // LCOV_EXCL_LINE return transport; -} +} // LCOV_EXCL_LINE void StagingEngine::begin_pub_transaction() { diff --git a/src/Stream.cpp b/src/Stream.cpp index cc6707e..5aedece 100644 --- a/src/Stream.cpp +++ b/src/Stream.cpp @@ -334,7 +334,7 @@ std::vector Stream::get_all_variables() const for (const auto& [name, var] : variables_) variable_names.push_back(name); return variable_names; -} +} // LCOV_EXCL_LINE std::shared_ptr Stream::inquire_variable(std::string_view name) const { From e2975ceb35d2b9a73d70f6305aa5afc026f16b05 Mon Sep 17 00:00:00 2001 From: Fred Suter Date: Thu, 12 Feb 2026 21:11:31 -0500 Subject: [PATCH 31/92] Improvements after code review --- Brainstorm_and_TODOs.md | 2 +- include/dtlmod/CompressionReductionMethod.hpp | 23 ++++++++++++------- include/dtlmod/DTLException.hpp | 8 +++---- include/dtlmod/DecimationReductionMethod.hpp | 23 ++++++++++--------- include/dtlmod/FileTransport.hpp | 2 +- include/dtlmod/Metadata.hpp | 2 +- include/dtlmod/ReductionMethod.hpp | 18 +++++++-------- include/dtlmod/Variable.hpp | 5 ++++ src/CompressionReductionMethod.cpp | 2 +- src/DecimationReductionMethod.cpp | 8 +++---- src/Engine.cpp | 4 ++-- src/Metadata.cpp | 8 ++++--- src/Stream.cpp | 2 +- src/Variable.cpp | 7 ++++-- 14 files changed, 66 insertions(+), 48 deletions(-) diff --git a/Brainstorm_and_TODOs.md b/Brainstorm_and_TODOs.md index 
dcabb80..5c4051c 100644 --- a/Brainstorm_and_TODOs.md +++ b/Brainstorm_and_TODOs.md @@ -128,4 +128,4 @@ - [ ] add tests in `test/dtl_reduction.cpp` - [ ] SimpleDecimationFileEngine: - [ ] add python binding -- [ ] add documentation \ No newline at end of file +- [ ] add documentation diff --git a/include/dtlmod/CompressionReductionMethod.hpp b/include/dtlmod/CompressionReductionMethod.hpp index 83d5875..a97f650 100644 --- a/include/dtlmod/CompressionReductionMethod.hpp +++ b/include/dtlmod/CompressionReductionMethod.hpp @@ -40,26 +40,33 @@ class CompressionReductionMethod : public ReductionMethod { public: CompressionReductionMethod(const std::string& name) : ReductionMethod(name) {} - void parameterize_for_variable(std::shared_ptr var, + void parameterize_for_variable(const std::shared_ptr& var, const std::map& parameters) override; - void reduce_variable(std::shared_ptr /* var*/) override {} - [[nodiscard]] size_t get_reduced_variable_global_size(std::shared_ptr /*var*/) const override { return 0; } - [[nodiscard]] size_t get_reduced_variable_local_size(std::shared_ptr /*var*/) const override { return 0; } - [[nodiscard]] const std::vector& get_reduced_variable_shape(std::shared_ptr var) const override + void reduce_variable(const std::shared_ptr& /* var*/) override {} + [[nodiscard]] size_t get_reduced_variable_global_size(const std::shared_ptr& /*var*/) const override + { + return 0; + } + [[nodiscard]] size_t get_reduced_variable_local_size(const std::shared_ptr& /*var*/) const override + { + return 0; + } + [[nodiscard]] const std::vector& + get_reduced_variable_shape(const std::shared_ptr& var) const override { return var->get_shape(); } [[nodiscard]] const std::pair, std::vector>& - get_reduced_start_and_count_for(std::shared_ptr /*var*/, sg4::ActorPtr /*publisher*/) const override + get_reduced_start_and_count_for(const std::shared_ptr& /*var*/, sg4::ActorPtr /*publisher*/) const override { throw std::runtime_error("not implemented"); // return;// 
std::make_pair(std::vector(), std::vector()); } - [[nodiscard]] double get_flop_amount_to_reduce_variable(std::shared_ptr /*var*/) const override + [[nodiscard]] double get_flop_amount_to_reduce_variable(const std::shared_ptr& /*var*/) const override { return 0.0; } }; /// \endcond } // namespace dtlmod -#endif //__DTLMOD_COMPRESSION_REDUCTION_METHOD_HPP__ \ No newline at end of file +#endif //__DTLMOD_COMPRESSION_REDUCTION_METHOD_HPP__ diff --git a/include/dtlmod/DTLException.hpp b/include/dtlmod/DTLException.hpp index 0a954fc..14ee7c5 100644 --- a/include/dtlmod/DTLException.hpp +++ b/include/dtlmod/DTLException.hpp @@ -49,7 +49,7 @@ DECLARE_DTLMOD_EXCEPTION(UnknownOpenModeException, "Unknown open mode. Should be DECLARE_DTLMOD_EXCEPTION(UnknownVariableException, "Unknown Variable"); DECLARE_DTLMOD_EXCEPTION(MultipleVariableDefinitionException, "Multiple Variable Definition"); -DECLARE_DTLMOD_EXCEPTION(InconsistentVariableDefinitionException, "Insconsistent Variable Definition"); +DECLARE_DTLMOD_EXCEPTION(InconsistentVariableDefinitionException, "Inconsistent Variable Definition"); DECLARE_DTLMOD_EXCEPTION(IncorrectPathDefinitionException, "Fullpath must be structured as follows: " "netzone_name:file_system_name:/path/to/file_name"); @@ -58,10 +58,10 @@ DECLARE_DTLMOD_EXCEPTION(InvalidTransactionIdException, DECLARE_DTLMOD_EXCEPTION(GetWhenNoTransactionException, "Impossible to get. No transaction exists for variable"); DECLARE_DTLMOD_EXCEPTION(UnknownReductionMethodException, - "Unkown Reduction Method. Options are 'decimation' and 'compression'"); + "Unknown Reduction Method. 
Options are 'decimation' and 'compression'"); DECLARE_DTLMOD_EXCEPTION(InconsistentDecimationInterpolationException, - "Insconsistent Decimation Interpolation definition"); -DECLARE_DTLMOD_EXCEPTION(InconsistentDecimationStrideException, "Insconsistent Decimation Stride definition"); + "Inconsistent Decimation Interpolation definition"); +DECLARE_DTLMOD_EXCEPTION(InconsistentDecimationStrideException, "Inconsistent Decimation Stride definition"); DECLARE_DTLMOD_EXCEPTION(UnknownDecimationOptionException, "Unknown Decimation option"); DECLARE_DTLMOD_EXCEPTION(UnknownDecimationInterpolationException, "Unknown Decimation interpolation method"); DECLARE_DTLMOD_EXCEPTION(DoubleReductionException, "Double reduction is forbidden"); diff --git a/include/dtlmod/DecimationReductionMethod.hpp b/include/dtlmod/DecimationReductionMethod.hpp index edd6633..6804489 100644 --- a/include/dtlmod/DecimationReductionMethod.hpp +++ b/include/dtlmod/DecimationReductionMethod.hpp @@ -19,7 +19,7 @@ class ParameterizedDecimation { std::shared_ptr var_; // The variable to which this parameterized decimation is applied std::vector stride_; - std::string interpolation_method_ = ""; + std::string interpolation_method_; double cost_per_element_; std::vector reduced_shape_; @@ -49,8 +49,8 @@ class ParameterizedDecimation { [[nodiscard]] double get_flop_amount_to_decimate() const; public: - ParameterizedDecimation(std::shared_ptr var, const std::vector stride, - const std::string interpolation_method, double cost_per_element) + ParameterizedDecimation(const std::shared_ptr& var, const std::vector& stride, + const std::string& interpolation_method, double cost_per_element) : var_(var), stride_(stride), interpolation_method_(interpolation_method), cost_per_element_(cost_per_element) { } @@ -60,33 +60,34 @@ class DecimationReductionMethod : public ReductionMethod { std::map, std::shared_ptr> per_variable_parameterizations_; protected: - void parameterize_for_variable(std::shared_ptr var, + void 
parameterize_for_variable(const std::shared_ptr& var, const std::map& parameters) override; - void reduce_variable(std::shared_ptr var); + void reduce_variable(const std::shared_ptr& var) override; - [[nodiscard]] size_t get_reduced_variable_global_size(std::shared_ptr var) const override + [[nodiscard]] size_t get_reduced_variable_global_size(const std::shared_ptr& var) const override { return per_variable_parameterizations_.at(var)->get_global_reduced_size(); } - [[nodiscard]] size_t get_reduced_variable_local_size(std::shared_ptr var) const override + [[nodiscard]] size_t get_reduced_variable_local_size(const std::shared_ptr& var) const override { return per_variable_parameterizations_.at(var)->get_local_reduced_size(); } - [[nodiscard]] double get_flop_amount_to_reduce_variable(std::shared_ptr var) const override + [[nodiscard]] double get_flop_amount_to_reduce_variable(const std::shared_ptr& var) const override { return per_variable_parameterizations_.at(var)->get_flop_amount_to_decimate(); } - [[nodiscard]] const std::vector& get_reduced_variable_shape(std::shared_ptr var) const override + [[nodiscard]] const std::vector& + get_reduced_variable_shape(const std::shared_ptr& var) const override { return per_variable_parameterizations_.at(var)->get_reduced_shape(); } [[nodiscard]] const std::pair, std::vector>& - get_reduced_start_and_count_for(std::shared_ptr var, sg4::ActorPtr publisher) const override + get_reduced_start_and_count_for(const std::shared_ptr& var, sg4::ActorPtr publisher) const override { return per_variable_parameterizations_.at(var)->get_reduced_start_and_count_for(publisher); } @@ -96,4 +97,4 @@ class DecimationReductionMethod : public ReductionMethod { }; ///\endcond } // namespace dtlmod -#endif //__DTLMOD_DECIMATION_REDUCTION_METHOD_HPP__ \ No newline at end of file +#endif //__DTLMOD_DECIMATION_REDUCTION_METHOD_HPP__ diff --git a/include/dtlmod/FileTransport.hpp b/include/dtlmod/FileTransport.hpp index e0b79ec..ff06ca8 100644 --- 
a/include/dtlmod/FileTransport.hpp +++ b/include/dtlmod/FileTransport.hpp @@ -31,7 +31,7 @@ class FileTransport : public Transport { void close_pub_files() const; void close_sub_files(sg4::ActorPtr self); const std::vector, sg_size_t>>& - get_to_write_in_transaction_by_actor(sg4::ActorPtr actor) noexcept + get_to_write_in_transaction_by_actor(sg4::ActorPtr actor) { return to_write_in_transaction_[actor]; } diff --git a/include/dtlmod/Metadata.hpp b/include/dtlmod/Metadata.hpp index 643448c..24e10bb 100644 --- a/include/dtlmod/Metadata.hpp +++ b/include/dtlmod/Metadata.hpp @@ -20,7 +20,7 @@ class Variable; /// \cond EXCLUDE_FROM_DOCUMENTATION class Metadata { friend Variable; - std::shared_ptr variable_; + std::weak_ptr variable_; std::map, std::vector>, // starts and counts diff --git a/include/dtlmod/ReductionMethod.hpp b/include/dtlmod/ReductionMethod.hpp index 2ae0814..45f2fbe 100644 --- a/include/dtlmod/ReductionMethod.hpp +++ b/include/dtlmod/ReductionMethod.hpp @@ -27,15 +27,15 @@ class ReductionMethod { public: ReductionMethod(const std::string& name) : name_(name) {} - virtual void parameterize_for_variable(std::shared_ptr var, - const std::map& parameters) = 0; - virtual void reduce_variable(std::shared_ptr var) = 0; - virtual size_t get_reduced_variable_global_size(std::shared_ptr var) const = 0; - virtual size_t get_reduced_variable_local_size(std::shared_ptr var) const = 0; - virtual const std::vector& get_reduced_variable_shape(std::shared_ptr var) const = 0; + virtual void parameterize_for_variable(const std::shared_ptr& var, + const std::map& parameters) = 0; + virtual void reduce_variable(const std::shared_ptr& var) = 0; + virtual size_t get_reduced_variable_global_size(const std::shared_ptr& var) const = 0; + virtual size_t get_reduced_variable_local_size(const std::shared_ptr& var) const = 0; + virtual const std::vector& get_reduced_variable_shape(const std::shared_ptr& var) const = 0; virtual const std::pair, std::vector>& - 
get_reduced_start_and_count_for(std::shared_ptr var, simgrid::s4u::ActorPtr publisher) const = 0; - virtual double get_flop_amount_to_reduce_variable(std::shared_ptr var) const = 0; + get_reduced_start_and_count_for(const std::shared_ptr& var, simgrid::s4u::ActorPtr publisher) const = 0; + virtual double get_flop_amount_to_reduce_variable(const std::shared_ptr& var) const = 0; /// @brief Helper function to print out the name of the ReductionMethod. /// @return The corresponding string @@ -46,4 +46,4 @@ class ReductionMethod { }; ///\endcond } // namespace dtlmod -#endif //__DTLMOD_REDUCTION_METHOD_HPP__ \ No newline at end of file +#endif //__DTLMOD_REDUCTION_METHOD_HPP__ diff --git a/include/dtlmod/Variable.hpp b/include/dtlmod/Variable.hpp index 71c25c7..7c36612 100644 --- a/include/dtlmod/Variable.hpp +++ b/include/dtlmod/Variable.hpp @@ -83,6 +83,11 @@ class Variable : public std::enable_shared_from_this { { } + + Variable(const Variable&) = delete; + Variable& operator=(const Variable&) = delete; + Variable(Variable&&) = delete; + Variable& operator=(Variable&&) = delete; /// \endcond /// @brief Helper function to print out the name of the Variable. 
diff --git a/src/CompressionReductionMethod.cpp b/src/CompressionReductionMethod.cpp index 6b9a279..d9259ba 100644 --- a/src/CompressionReductionMethod.cpp +++ b/src/CompressionReductionMethod.cpp @@ -13,7 +13,7 @@ XBT_LOG_NEW_DEFAULT_SUBCATEGORY(dtlmod_compression_reduction, dtlmod, "DTL loggi namespace dtlmod { -void CompressionReductionMethod::parameterize_for_variable(std::shared_ptr var, +void CompressionReductionMethod::parameterize_for_variable(const std::shared_ptr& var, const std::map& parameters) { double new_accuracy = 1.0; diff --git a/src/DecimationReductionMethod.cpp b/src/DecimationReductionMethod.cpp index 6886402..bc82cf9 100644 --- a/src/DecimationReductionMethod.cpp +++ b/src/DecimationReductionMethod.cpp @@ -49,11 +49,11 @@ double ParameterizedDecimation::get_flop_amount_to_decimate() const return amount; } -void DecimationReductionMethod::parameterize_for_variable(std::shared_ptr var, +void DecimationReductionMethod::parameterize_for_variable(const std::shared_ptr& var, const std::map& parameters) { std::vector new_stride; - std::string new_interpolation_method = ""; + std::string new_interpolation_method; double new_cost_per_element = 1.0; // Detect existing parameterization (if any). 
@@ -126,7 +126,7 @@ void DecimationReductionMethod::parameterize_for_variable(std::shared_ptrset_cost_per_element(new_cost_per_element); } -void DecimationReductionMethod::reduce_variable(std::shared_ptr var) +void DecimationReductionMethod::reduce_variable(const std::shared_ptr& var) { auto parameterization = per_variable_parameterizations_[var]; auto original_shape = var->get_shape(); @@ -148,7 +148,7 @@ void DecimationReductionMethod::reduce_variable(std::shared_ptr var) size_t r_start = std::ceil(start[i] / (stride[i] * 1.0)); size_t r_next_start = std::min(original_shape[i], static_cast(std::ceil((start[i] + count[i]) / (stride[i] * 1.0)))); - XBT_DEBUG("Dim %lu: stride = %lu, Start = %lu, r_start = %lu, Count = %lu, r_count = %lu", i, stride[i], start[i], + XBT_DEBUG("Dim %zu: stride = %zu, Start = %zu, r_start = %zu, Count = %zu, r_count = %zu", i, stride[i], start[i], r_start, count[i], r_next_start - r_start); reduced_start.push_back(r_start); reduced_count.push_back(r_next_start - r_start); diff --git a/src/Engine.cpp b/src/Engine.cpp index 42caecf..55cfd0f 100644 --- a/src/Engine.cpp +++ b/src/Engine.cpp @@ -38,7 +38,7 @@ void Engine::put(const std::shared_ptr& var) const sg4::this_actor::execute(var->get_reduction_method()->get_flop_amount_to_reduce_variable(var)); XBT_DEBUG("Variable %s has been reduced!", var->get_cname()); // Now put the reduced version of the variable into the DTL, i.e., using its reduced local size. 
- XBT_DEBUG("Put this reduced version of %s (initial size = %lu, reduced size = %lu)", var->get_cname(), + XBT_DEBUG("Put this reduced version of %s (initial size = %zu, reduced size = %zu)", var->get_cname(), var->get_local_size(), var->get_reduction_method()->get_reduced_variable_local_size(var)); transport_->put(var, var->get_reduction_method()->get_reduced_variable_local_size(var)); } else @@ -54,7 +54,7 @@ void Engine::put(const std::shared_ptr& var, size_t simulated_size_in_ void Engine::get(const std::shared_ptr& var) const { if (var->is_reduced() && var->is_reduced_by_subscriber()) { - var->is_reduced_with_->reduce_variable(var); + var->get_reduction_method()->reduce_variable(var); // Perform an Exec activity before putting the variable into the DTL to account for the time needed to reduce it. sg4::this_actor::execute(var->get_reduction_method()->get_flop_amount_to_reduce_variable(var)); } diff --git a/src/Metadata.cpp b/src/Metadata.cpp index 28b5978..e78ca8c 100644 --- a/src/Metadata.cpp +++ b/src/Metadata.cpp @@ -20,10 +20,12 @@ void Metadata::add_transaction(unsigned int id, void Metadata::export_to_file(std::ofstream& ostream) const { - XBT_DEBUG("Variable %s:", variable_->get_cname()); - ostream << variable_->get_element_size() << "\t" << variable_->get_cname() << "\t" << transaction_infos_.size(); + auto var = variable_.lock(); + xbt_assert(var, "Metadata::export_to_file called after its Variable has been destroyed"); + XBT_DEBUG("Variable %s:", var->get_cname()); + ostream << var->get_element_size() << "\t" << var->get_cname() << "\t" << transaction_infos_.size(); ostream << "*{"; - auto shape = variable_->get_shape(); + auto shape = var->get_shape(); const auto last_index = shape.size() - 1; for (unsigned int i = 0; i < last_index; i++) ostream << shape[i] << ","; diff --git a/src/Stream.cpp b/src/Stream.cpp index 18dd959..d8c9f76 100644 --- a/src/Stream.cpp +++ b/src/Stream.cpp @@ -9,7 +9,6 @@ #include #include 
"dtlmod/CompressionReductionMethod.hpp" -#include "dtlmod/DecimationReductionMethod.hpp" #include "dtlmod/DTL.hpp" #include "dtlmod/DTLException.hpp" #include "dtlmod/DecimationReductionMethod.hpp" @@ -332,6 +331,7 @@ std::shared_ptr Stream::define_variable(std::string_view name, const s // Validate parameters validate_variable_parameters(shape, start, count, element_size); + std::unique_lock lock(*mutex_); auto publisher = sg4::Actor::self(); std::string name_str(name); auto var = variables_.find(name_str); diff --git a/src/Variable.cpp b/src/Variable.cpp index 22b4dd2..0a6305c 100644 --- a/src/Variable.cpp +++ b/src/Variable.cpp @@ -61,8 +61,11 @@ void Variable::set_transaction_selection(unsigned int begin, unsigned int count) void Variable::set_reduction_operation(std::shared_ptr method, std::map parameters) { + auto stream = defined_in_stream_.lock(); + xbt_assert(stream, "Variable::set_reduction_operation called after its Stream has been destroyed"); + if (is_reduced_with_ && reduction_origin_ == ReductionOrigin::Publisher && - defined_in_stream_.lock()->get_access_mode() == Stream::Mode::Subscribe) { + stream->get_access_mode() == Stream::Mode::Subscribe) { XBT_ERROR("Subscriber %s attempted to re-reduce Variable %s, but it was already reduced on publisher side.", sg4::Actor::self()->get_cname(), this->get_cname()); throw DoubleReductionException( @@ -73,7 +76,7 @@ void Variable::set_reduction_operation(std::shared_ptr method, method->parameterize_for_variable(shared_from_this(), parameters); method->reduce_variable(shared_from_this()); is_reduced_with_ = method; - if (defined_in_stream_.lock()->get_access_mode() == Stream::Mode::Publish) + if (stream->get_access_mode() == Stream::Mode::Publish) reduction_origin_ = ReductionOrigin::Publisher; else reduction_origin_ = ReductionOrigin::Subscriber; From c707b5a557ac7fb157d9e1479d802d2ee257aef1 Mon Sep 17 00:00:00 2001 From: Fred Suter Date: Thu, 12 Feb 2026 23:30:37 -0500 Subject: [PATCH 32/92] Improvements in 
design after code review --- include/dtlmod/CompressionReductionMethod.hpp | 73 +++++------- include/dtlmod/DecimationReductionMethod.hpp | 111 +++++++++--------- include/dtlmod/ReductionMethod.hpp | 16 ++- src/CompressionReductionMethod.cpp | 8 +- src/DecimationReductionMethod.cpp | 28 ++--- src/Engine.cpp | 10 +- src/Transport.cpp | 2 +- src/Variable.cpp | 16 +-- 8 files changed, 125 insertions(+), 139 deletions(-) diff --git a/include/dtlmod/CompressionReductionMethod.hpp b/include/dtlmod/CompressionReductionMethod.hpp index a97f650..c4b396b 100644 --- a/include/dtlmod/CompressionReductionMethod.hpp +++ b/include/dtlmod/CompressionReductionMethod.hpp @@ -12,60 +12,49 @@ namespace dtlmod { /// \cond EXCLUDE_FROM_DOCUMENTATION -class ParameterizedCompression { - friend class CompressionReductionMethod; - double accuracy_; - double compression_cost_per_element_; - double decompression_cost_per_element_; - -protected: - [[nodiscard]] double get_accuracy() const { return accuracy_; } - void set_accuracy(double accuracy) { accuracy_ = accuracy; } - [[nodiscard]] double get_compression_cost_per_element() const { return compression_cost_per_element_; } - void set_compression_cost_per_element(double cost) { compression_cost_per_element_ = cost; } - [[nodiscard]] double get_decompression_cost_per_element() const { return decompression_cost_per_element_; } - void set_decompression_cost_per_element(double cost) { decompression_cost_per_element_ = cost; } - -public: - ParameterizedCompression(double accuracy, double compression_cost_per_element, double decompression_cost_per_element) - : accuracy_(accuracy) - , compression_cost_per_element_(compression_cost_per_element) - , decompression_cost_per_element_(decompression_cost_per_element) - { - } -}; class CompressionReductionMethod : public ReductionMethod { - std::map, std::shared_ptr> per_variable_parameterizations_; + class ParameterizedCompression { + double accuracy_; + double compression_cost_per_element_; + double 
decompression_cost_per_element_; + + public: + ParameterizedCompression(double accuracy, double compression_cost_per_element, + double decompression_cost_per_element) + : accuracy_(accuracy) + , compression_cost_per_element_(compression_cost_per_element) + , decompression_cost_per_element_(decompression_cost_per_element) + { + } + + [[nodiscard]] double get_accuracy() const { return accuracy_; } + void set_accuracy(double accuracy) { accuracy_ = accuracy; } + [[nodiscard]] double get_compression_cost_per_element() const { return compression_cost_per_element_; } + void set_compression_cost_per_element(double cost) { compression_cost_per_element_ = cost; } + [[nodiscard]] double get_decompression_cost_per_element() const { return decompression_cost_per_element_; } + void set_decompression_cost_per_element(double cost) { decompression_cost_per_element_ = cost; } + }; + + std::map> per_variable_parameterizations_; public: CompressionReductionMethod(const std::string& name) : ReductionMethod(name) {} - void parameterize_for_variable(const std::shared_ptr& var, - const std::map& parameters) override; - void reduce_variable(const std::shared_ptr& /* var*/) override {} - [[nodiscard]] size_t get_reduced_variable_global_size(const std::shared_ptr& /*var*/) const override + void parameterize_for_variable(const Variable& var, const std::map& parameters) override; + void reduce_variable(const Variable& /* var*/) override {} + [[nodiscard]] size_t get_reduced_variable_global_size(const Variable& /*var*/) const override { return 0; } + [[nodiscard]] size_t get_reduced_variable_local_size(const Variable& /*var*/) const override { return 0; } + [[nodiscard]] const std::vector& get_reduced_variable_shape(const Variable& var) const override { - return 0; - } - [[nodiscard]] size_t get_reduced_variable_local_size(const std::shared_ptr& /*var*/) const override - { - return 0; - } - [[nodiscard]] const std::vector& - get_reduced_variable_shape(const std::shared_ptr& var) const override 
- { - return var->get_shape(); + return var.get_shape(); } [[nodiscard]] const std::pair, std::vector>& - get_reduced_start_and_count_for(const std::shared_ptr& /*var*/, sg4::ActorPtr /*publisher*/) const override + get_reduced_start_and_count_for(const Variable& /*var*/, sg4::ActorPtr /*publisher*/) const override { throw std::runtime_error("not implemented"); // return;// std::make_pair(std::vector(), std::vector()); } - [[nodiscard]] double get_flop_amount_to_reduce_variable(const std::shared_ptr& /*var*/) const override - { - return 0.0; - } + [[nodiscard]] double get_flop_amount_to_reduce_variable(const Variable& /*var*/) const override { return 0.0; } }; /// \endcond } // namespace dtlmod diff --git a/include/dtlmod/DecimationReductionMethod.hpp b/include/dtlmod/DecimationReductionMethod.hpp index 6804489..e2f6f52 100644 --- a/include/dtlmod/DecimationReductionMethod.hpp +++ b/include/dtlmod/DecimationReductionMethod.hpp @@ -14,82 +14,79 @@ namespace dtlmod { /// \cond EXCLUDE_FROM_DOCUMENTATION -class ParameterizedDecimation { - friend class DecimationReductionMethod; - std::shared_ptr var_; // The variable to which this parameterized decimation is applied - - std::vector stride_; - std::string interpolation_method_; - double cost_per_element_; - - std::vector reduced_shape_; - std::unordered_map, std::vector>> reduced_local_start_and_count_; - -protected: - void set_reduced_shape(const std::vector& reduced_shape) { reduced_shape_ = reduced_shape; } - void set_reduced_local_start_and_count(sg4::ActorPtr actor, const std::vector& reduced_local_start, - const std::vector& reduced_local_count) - { - reduced_local_start_and_count_.try_emplace(actor, std::make_pair(reduced_local_start, reduced_local_count)); - } - - [[nodiscard]] const std::vector& get_stride() const { return stride_; } - void set_stride(const std::vector& stride) { stride_ = stride; } - [[nodiscard]] const std::string& get_interpolation_method() const { return interpolation_method_; } - void 
set_interpolation_method(const std::string& method) { interpolation_method_ = method; } - [[nodiscard]] double get_cost_per_element() const { return cost_per_element_; } - void set_cost_per_element(double cost) { cost_per_element_ = cost; } - - [[nodiscard]] const std::vector& get_reduced_shape() const { return reduced_shape_; } - - [[nodiscard]] size_t get_global_reduced_size() const; - [[nodiscard]] size_t get_local_reduced_size() const; - [[nodiscard]] const std::pair, std::vector>& - get_reduced_start_and_count_for(sg4::ActorPtr publisher) const; - [[nodiscard]] double get_flop_amount_to_decimate() const; - -public: - ParameterizedDecimation(const std::shared_ptr& var, const std::vector& stride, - const std::string& interpolation_method, double cost_per_element) - : var_(var), stride_(stride), interpolation_method_(interpolation_method), cost_per_element_(cost_per_element) - { - } -}; - class DecimationReductionMethod : public ReductionMethod { - std::map, std::shared_ptr> per_variable_parameterizations_; + class ParameterizedDecimation { + const Variable* var_; // non-owning: the Variable outlives the parameterization (both owned by Stream) + + std::vector stride_; + std::string interpolation_method_; + double cost_per_element_; + + std::vector reduced_shape_; + std::unordered_map, std::vector>> + reduced_local_start_and_count_; + + public: + ParameterizedDecimation(const Variable& var, const std::vector& stride, + const std::string& interpolation_method, double cost_per_element) + : var_(&var), stride_(stride), interpolation_method_(interpolation_method), cost_per_element_(cost_per_element) + { + } + + void set_reduced_shape(const std::vector& reduced_shape) { reduced_shape_ = reduced_shape; } + void set_reduced_local_start_and_count(sg4::ActorPtr actor, const std::vector& reduced_local_start, + const std::vector& reduced_local_count) + { + reduced_local_start_and_count_.try_emplace(actor, std::make_pair(reduced_local_start, reduced_local_count)); + } + + 
[[nodiscard]] const std::vector& get_stride() const { return stride_; } + void set_stride(const std::vector& stride) { stride_ = stride; } + [[nodiscard]] const std::string& get_interpolation_method() const { return interpolation_method_; } + void set_interpolation_method(const std::string& method) { interpolation_method_ = method; } + [[nodiscard]] double get_cost_per_element() const { return cost_per_element_; } + void set_cost_per_element(double cost) { cost_per_element_ = cost; } + + [[nodiscard]] const std::vector& get_reduced_shape() const { return reduced_shape_; } + + [[nodiscard]] size_t get_global_reduced_size() const; + [[nodiscard]] size_t get_local_reduced_size() const; + [[nodiscard]] const std::pair, std::vector>& + get_reduced_start_and_count_for(sg4::ActorPtr publisher) const; + [[nodiscard]] double get_flop_amount_to_decimate() const; + }; + + std::map> per_variable_parameterizations_; protected: - void parameterize_for_variable(const std::shared_ptr& var, - const std::map& parameters) override; + void parameterize_for_variable(const Variable& var, const std::map& parameters) override; - void reduce_variable(const std::shared_ptr& var) override; + void reduce_variable(const Variable& var) override; - [[nodiscard]] size_t get_reduced_variable_global_size(const std::shared_ptr& var) const override + [[nodiscard]] size_t get_reduced_variable_global_size(const Variable& var) const override { - return per_variable_parameterizations_.at(var)->get_global_reduced_size(); + return per_variable_parameterizations_.at(&var)->get_global_reduced_size(); } - [[nodiscard]] size_t get_reduced_variable_local_size(const std::shared_ptr& var) const override + [[nodiscard]] size_t get_reduced_variable_local_size(const Variable& var) const override { - return per_variable_parameterizations_.at(var)->get_local_reduced_size(); + return per_variable_parameterizations_.at(&var)->get_local_reduced_size(); } - [[nodiscard]] double get_flop_amount_to_reduce_variable(const 
std::shared_ptr& var) const override + [[nodiscard]] double get_flop_amount_to_reduce_variable(const Variable& var) const override { - return per_variable_parameterizations_.at(var)->get_flop_amount_to_decimate(); + return per_variable_parameterizations_.at(&var)->get_flop_amount_to_decimate(); } - [[nodiscard]] const std::vector& - get_reduced_variable_shape(const std::shared_ptr& var) const override + [[nodiscard]] const std::vector& get_reduced_variable_shape(const Variable& var) const override { - return per_variable_parameterizations_.at(var)->get_reduced_shape(); + return per_variable_parameterizations_.at(&var)->get_reduced_shape(); } [[nodiscard]] const std::pair, std::vector>& - get_reduced_start_and_count_for(const std::shared_ptr& var, sg4::ActorPtr publisher) const override + get_reduced_start_and_count_for(const Variable& var, sg4::ActorPtr publisher) const override { - return per_variable_parameterizations_.at(var)->get_reduced_start_and_count_for(publisher); + return per_variable_parameterizations_.at(&var)->get_reduced_start_and_count_for(publisher); } public: diff --git a/include/dtlmod/ReductionMethod.hpp b/include/dtlmod/ReductionMethod.hpp index 45f2fbe..90f8c0a 100644 --- a/include/dtlmod/ReductionMethod.hpp +++ b/include/dtlmod/ReductionMethod.hpp @@ -7,7 +7,6 @@ #define __DTLMOD_REDUCTION_METHOD_HPP__ #include -#include #include #include @@ -27,15 +26,14 @@ class ReductionMethod { public: ReductionMethod(const std::string& name) : name_(name) {} - virtual void parameterize_for_variable(const std::shared_ptr& var, - const std::map& parameters) = 0; - virtual void reduce_variable(const std::shared_ptr& var) = 0; - virtual size_t get_reduced_variable_global_size(const std::shared_ptr& var) const = 0; - virtual size_t get_reduced_variable_local_size(const std::shared_ptr& var) const = 0; - virtual const std::vector& get_reduced_variable_shape(const std::shared_ptr& var) const = 0; + virtual void parameterize_for_variable(const Variable& var, 
const std::map& parameters) = 0; + virtual void reduce_variable(const Variable& var) = 0; + virtual size_t get_reduced_variable_global_size(const Variable& var) const = 0; + virtual size_t get_reduced_variable_local_size(const Variable& var) const = 0; + virtual const std::vector& get_reduced_variable_shape(const Variable& var) const = 0; virtual const std::pair, std::vector>& - get_reduced_start_and_count_for(const std::shared_ptr& var, simgrid::s4u::ActorPtr publisher) const = 0; - virtual double get_flop_amount_to_reduce_variable(const std::shared_ptr& var) const = 0; + get_reduced_start_and_count_for(const Variable& var, simgrid::s4u::ActorPtr publisher) const = 0; + virtual double get_flop_amount_to_reduce_variable(const Variable& var) const = 0; /// @brief Helper function to print out the name of the ReductionMethod. /// @return The corresponding string diff --git a/src/CompressionReductionMethod.cpp b/src/CompressionReductionMethod.cpp index d9259ba..171e7b8 100644 --- a/src/CompressionReductionMethod.cpp +++ b/src/CompressionReductionMethod.cpp @@ -13,7 +13,7 @@ XBT_LOG_NEW_DEFAULT_SUBCATEGORY(dtlmod_compression_reduction, dtlmod, "DTL loggi namespace dtlmod { -void CompressionReductionMethod::parameterize_for_variable(const std::shared_ptr& var, +void CompressionReductionMethod::parameterize_for_variable(const Variable& var, const std::map& parameters) { double new_accuracy = 1.0; @@ -21,7 +21,7 @@ void CompressionReductionMethod::parameterize_for_variable(const std::shared_ptr double new_decompression_cost_per_element = 1.0; // Detect existing parameterization (if any). - auto it = per_variable_parameterizations_.find(var); + auto it = per_variable_parameterizations_.find(&var); const bool exists = (it != per_variable_parameterizations_.end()); // Initialize from existing values (if present) to support partial updates. 
@@ -47,8 +47,8 @@ void CompressionReductionMethod::parameterize_for_variable(const std::shared_ptr if (!exists) { // First-time parameterization per_variable_parameterizations_.try_emplace( - var, std::make_shared(new_accuracy, new_compression_cost_per_element, - new_decompression_cost_per_element)); + &var, std::make_shared(new_accuracy, new_compression_cost_per_element, + new_decompression_cost_per_element)); return; } diff --git a/src/DecimationReductionMethod.cpp b/src/DecimationReductionMethod.cpp index bc82cf9..d82d555 100644 --- a/src/DecimationReductionMethod.cpp +++ b/src/DecimationReductionMethod.cpp @@ -15,24 +15,24 @@ XBT_LOG_NEW_DEFAULT_SUBCATEGORY(dtlmod_decimation_reduction, dtlmod, "DTL loggin namespace dtlmod { -size_t ParameterizedDecimation::get_global_reduced_size() const +size_t DecimationReductionMethod::ParameterizedDecimation::get_global_reduced_size() const { return std::accumulate(reduced_shape_.begin(), reduced_shape_.end(), var_->get_element_size(), std::multiplies<>{}); } -size_t ParameterizedDecimation::get_local_reduced_size() const +size_t DecimationReductionMethod::ParameterizedDecimation::get_local_reduced_size() const { auto start_and_count = reduced_local_start_and_count_.at(sg4::Actor::self()).second; return std::accumulate(start_and_count.begin(), start_and_count.end(), var_->get_element_size(), std::multiplies<>{}); } const std::pair, std::vector>& -ParameterizedDecimation::get_reduced_start_and_count_for(sg4::ActorPtr publisher) const +DecimationReductionMethod::ParameterizedDecimation::get_reduced_start_and_count_for(sg4::ActorPtr publisher) const { return reduced_local_start_and_count_.at(publisher); } -double ParameterizedDecimation::get_flop_amount_to_decimate() const +double DecimationReductionMethod::ParameterizedDecimation::get_flop_amount_to_decimate() const { XBT_DEBUG("Compute decimation cost with: cost_per_element = %.2f and interpolation_method = %s", cost_per_element_, interpolation_method_.c_str()); @@ -49,7 
+49,7 @@ double ParameterizedDecimation::get_flop_amount_to_decimate() const return amount; } -void DecimationReductionMethod::parameterize_for_variable(const std::shared_ptr& var, +void DecimationReductionMethod::parameterize_for_variable(const Variable& var, const std::map& parameters) { std::vector new_stride; @@ -57,7 +57,7 @@ void DecimationReductionMethod::parameterize_for_variable(const std::shared_ptr< double new_cost_per_element = 1.0; // Detect existing parameterization (if any). - auto it = per_variable_parameterizations_.find(var); + auto it = per_variable_parameterizations_.find(&var); const bool exists = (it != per_variable_parameterizations_.end()); // Initialize from existing values (if present) to support partial updates. @@ -74,10 +74,10 @@ void DecimationReductionMethod::parameterize_for_variable(const std::shared_ptr< std::vector tokens; boost::split(tokens, value, boost::is_any_of(","), boost::token_compress_on); - if (var->get_shape().size() != tokens.size()) + if (var.get_shape().size() != tokens.size()) throw InconsistentDecimationStrideException( XBT_THROW_POINT, "Decimation Stride and Variable Shape vectors must have the same size. 
Stride: " + - std::to_string(tokens.size()) + ", Shape: " + std::to_string(var->get_shape().size())); + std::to_string(tokens.size()) + ", Shape: " + std::to_string(var.get_shape().size())); std::vector parsed_stride; parsed_stride.reserve(tokens.size()); @@ -97,7 +97,7 @@ void DecimationReductionMethod::parameterize_for_variable(const std::shared_ptr< std::string("Unknown interpolation method: ") + value + " (options are: linear, cubic, or quadratic)."); - if ((value == "quadratic" && var->get_shape().size() < 2) || (value == "cubic" && var->get_shape().size() < 3)) + if ((value == "quadratic" && var.get_shape().size() < 2) || (value == "cubic" && var.get_shape().size() < 3)) throw InconsistentDecimationInterpolationException( XBT_THROW_POINT, "Variable has not enough dimensions to apply this interpolation method"); } else if (key == "cost_per_element") @@ -109,7 +109,7 @@ void DecimationReductionMethod::parameterize_for_variable(const std::shared_ptr< if (!exists) { // First-time parameterization per_variable_parameterizations_.try_emplace( - var, + &var, std::make_shared(var, new_stride, new_interpolation_method, new_cost_per_element)); return; } @@ -126,10 +126,10 @@ void DecimationReductionMethod::parameterize_for_variable(const std::shared_ptr< existing->set_cost_per_element(new_cost_per_element); } -void DecimationReductionMethod::reduce_variable(const std::shared_ptr& var) +void DecimationReductionMethod::reduce_variable(const Variable& var) { - auto parameterization = per_variable_parameterizations_[var]; - auto original_shape = var->get_shape(); + auto parameterization = per_variable_parameterizations_[&var]; + auto original_shape = var.get_shape(); auto stride = parameterization->get_stride(); std::vector reduced_shape; @@ -139,7 +139,7 @@ void DecimationReductionMethod::reduce_variable(const std::shared_ptr& parameterization->set_reduced_shape(reduced_shape); auto self = sg4::Actor::self(); - auto [start, count] = var->get_local_start_and_count(self); + 
auto [start, count] = var.get_local_start_and_count(self); std::vector reduced_start; std::vector reduced_count; diff --git a/src/Engine.cpp b/src/Engine.cpp index 55cfd0f..71951a0 100644 --- a/src/Engine.cpp +++ b/src/Engine.cpp @@ -35,12 +35,12 @@ void Engine::put(const std::shared_ptr& var) const { if (var->is_reduced()) { // Perform an Exec activity before putting the variable into the DTL to account for the time needed to reduce it. - sg4::this_actor::execute(var->get_reduction_method()->get_flop_amount_to_reduce_variable(var)); + sg4::this_actor::execute(var->get_reduction_method()->get_flop_amount_to_reduce_variable(*var)); XBT_DEBUG("Variable %s has been reduced!", var->get_cname()); // Now put the reduced version of the variable into the DTL, i.e., using its reduced local size. XBT_DEBUG("Put this reduced version of %s (initial size = %zu, reduced size = %zu)", var->get_cname(), - var->get_local_size(), var->get_reduction_method()->get_reduced_variable_local_size(var)); - transport_->put(var, var->get_reduction_method()->get_reduced_variable_local_size(var)); + var->get_local_size(), var->get_reduction_method()->get_reduced_variable_local_size(*var)); + transport_->put(var, var->get_reduction_method()->get_reduced_variable_local_size(*var)); } else transport_->put(var, var->get_local_size()); } @@ -54,9 +54,9 @@ void Engine::put(const std::shared_ptr& var, size_t simulated_size_in_ void Engine::get(const std::shared_ptr& var) const { if (var->is_reduced() && var->is_reduced_by_subscriber()) { - var->get_reduction_method()->reduce_variable(var); + var->get_reduction_method()->reduce_variable(*var); // Perform an Exec activity before putting the variable into the DTL to account for the time needed to reduce it. 
- sg4::this_actor::execute(var->get_reduction_method()->get_flop_amount_to_reduce_variable(var)); + sg4::this_actor::execute(var->get_reduction_method()->get_flop_amount_to_reduce_variable(*var)); } transport_->get(var); diff --git a/src/Transport.cpp b/src/Transport.cpp index 09011c8..8e4be26 100644 --- a/src/Transport.cpp +++ b/src/Transport.cpp @@ -24,7 +24,7 @@ Transport::check_selection_and_get_blocks_to_get(std::shared_ptr var) std::vector count; if (var->is_reduced_by_subscriber()) - count = var->get_reduction_method()->get_reduced_variable_shape(var); + count = var->get_reduction_method()->get_reduced_variable_shape(*var); else count = var->get_shape(); diff --git a/src/Variable.cpp b/src/Variable.cpp index 0a6305c..036504c 100644 --- a/src/Variable.cpp +++ b/src/Variable.cpp @@ -73,13 +73,15 @@ void Variable::set_reduction_operation(std::shared_ptr method, "Variable has already been reduced by its producer; subscriber-side reduction is not allowed."); } - method->parameterize_for_variable(shared_from_this(), parameters); - method->reduce_variable(shared_from_this()); + method->parameterize_for_variable(*this, parameters); + method->reduce_variable(*this); is_reduced_with_ = method; - if (stream->get_access_mode() == Stream::Mode::Publish) - reduction_origin_ = ReductionOrigin::Publisher; - else - reduction_origin_ = ReductionOrigin::Subscriber; + if (reduction_origin_ == ReductionOrigin::None) { + if (stream->get_access_mode() == Stream::Mode::Publish) + reduction_origin_ = ReductionOrigin::Publisher; + else + reduction_origin_ = ReductionOrigin::Subscriber; + } } //////////////////////////////////////////// @@ -111,7 +113,7 @@ void Variable::add_transaction_metadata(unsigned int transaction_id, sg4::ActorP const std::string& location) { if (is_reduced_with_) { - auto start_and_count = is_reduced_with_->get_reduced_start_and_count_for(shared_from_this(), publisher); + auto start_and_count = is_reduced_with_->get_reduced_start_and_count_for(*this, 
publisher); metadata_->add_transaction(transaction_id, start_and_count, location, publisher); } else metadata_->add_transaction(transaction_id, local_start_and_count_[publisher], location, publisher); From 2062fa2b83ff0e437eaccca3edca62ccd90c646c Mon Sep 17 00:00:00 2001 From: Fred Suter Date: Fri, 13 Feb 2026 11:10:32 -0500 Subject: [PATCH 33/92] implementation of the Compression Reduction Method --- include/dtlmod/CompressionReductionMethod.hpp | 47 +++++-- include/dtlmod/DTLException.hpp | 2 + include/dtlmod/ReductionMethod.hpp | 3 + src/CompressionReductionMethod.cpp | 119 +++++++++++++++++- src/Engine.cpp | 7 ++ src/Variable.cpp | 6 + 6 files changed, 169 insertions(+), 15 deletions(-) diff --git a/include/dtlmod/CompressionReductionMethod.hpp b/include/dtlmod/CompressionReductionMethod.hpp index c4b396b..68376b9 100644 --- a/include/dtlmod/CompressionReductionMethod.hpp +++ b/include/dtlmod/CompressionReductionMethod.hpp @@ -15,16 +15,27 @@ namespace dtlmod { class CompressionReductionMethod : public ReductionMethod { class ParameterizedCompression { + const Variable* var_; // non-owning: the Variable outlives the parameterization (both owned by Stream) double accuracy_; double compression_cost_per_element_; double decompression_cost_per_element_; + double compression_ratio_; + std::string compressor_profile_; // "fixed", "sz", "zfp" + double data_smoothness_; // hint in [0,1], shifts the model curve + double ratio_variability_; // per-transaction noise amplitude in [0,1] public: - ParameterizedCompression(double accuracy, double compression_cost_per_element, - double decompression_cost_per_element) - : accuracy_(accuracy) + ParameterizedCompression(const Variable& var, double accuracy, double compression_cost_per_element, + double decompression_cost_per_element, double compression_ratio, + const std::string& compressor_profile, double data_smoothness, double ratio_variability) + : var_(&var) + , accuracy_(accuracy) , 
compression_cost_per_element_(compression_cost_per_element) , decompression_cost_per_element_(decompression_cost_per_element) + , compression_ratio_(compression_ratio) + , compressor_profile_(compressor_profile) + , data_smoothness_(data_smoothness) + , ratio_variability_(ratio_variability) { } @@ -34,27 +45,45 @@ class CompressionReductionMethod : public ReductionMethod { void set_compression_cost_per_element(double cost) { compression_cost_per_element_ = cost; } [[nodiscard]] double get_decompression_cost_per_element() const { return decompression_cost_per_element_; } void set_decompression_cost_per_element(double cost) { decompression_cost_per_element_ = cost; } + [[nodiscard]] double get_compression_ratio() const { return compression_ratio_; } + void set_compression_ratio(double ratio) { compression_ratio_ = ratio; } + [[nodiscard]] const std::string& get_compressor_profile() const { return compressor_profile_; } + void set_compressor_profile(const std::string& profile) { compressor_profile_ = profile; } + [[nodiscard]] double get_data_smoothness() const { return data_smoothness_; } + void set_data_smoothness(double smoothness) { data_smoothness_ = smoothness; } + [[nodiscard]] double get_ratio_variability() const { return ratio_variability_; } + void set_ratio_variability(double variability) { ratio_variability_ = variability; } + + /// @brief Get the effective compression ratio, optionally perturbed by per-transaction noise. + [[nodiscard]] double get_effective_ratio(unsigned int transaction_id = 0) const; }; std::map> per_variable_parameterizations_; + /// @brief Derive the compression ratio from accuracy and compressor profile. 
+ static double derive_compression_ratio(double accuracy, const std::string& profile, double data_smoothness); + public: CompressionReductionMethod(const std::string& name) : ReductionMethod(name) {} void parameterize_for_variable(const Variable& var, const std::map& parameters) override; void reduce_variable(const Variable& /* var*/) override {} - [[nodiscard]] size_t get_reduced_variable_global_size(const Variable& /*var*/) const override { return 0; } - [[nodiscard]] size_t get_reduced_variable_local_size(const Variable& /*var*/) const override { return 0; } + + [[nodiscard]] size_t get_reduced_variable_global_size(const Variable& var) const override; + [[nodiscard]] size_t get_reduced_variable_local_size(const Variable& var) const override; + [[nodiscard]] const std::vector& get_reduced_variable_shape(const Variable& var) const override { return var.get_shape(); } + [[nodiscard]] const std::pair, std::vector>& - get_reduced_start_and_count_for(const Variable& /*var*/, sg4::ActorPtr /*publisher*/) const override + get_reduced_start_and_count_for(const Variable& var, sg4::ActorPtr publisher) const override { - throw std::runtime_error("not implemented"); - // return;// std::make_pair(std::vector(), std::vector()); + return var.get_local_start_and_count(publisher); } - [[nodiscard]] double get_flop_amount_to_reduce_variable(const Variable& /*var*/) const override { return 0.0; } + + [[nodiscard]] double get_flop_amount_to_reduce_variable(const Variable& var) const override; + [[nodiscard]] double get_flop_amount_to_decompress_variable(const Variable& var) const override; }; /// \endcond } // namespace dtlmod diff --git a/include/dtlmod/DTLException.hpp b/include/dtlmod/DTLException.hpp index 14ee7c5..240a41a 100644 --- a/include/dtlmod/DTLException.hpp +++ b/include/dtlmod/DTLException.hpp @@ -67,6 +67,8 @@ DECLARE_DTLMOD_EXCEPTION(UnknownDecimationInterpolationException, "Unknown Decim DECLARE_DTLMOD_EXCEPTION(DoubleReductionException, "Double reduction is 
forbidden"); DECLARE_DTLMOD_EXCEPTION(UnknownCompressionOptionException, "Unknown Compression option"); +DECLARE_DTLMOD_EXCEPTION(InconsistentCompressionRatioException, "Inconsistent Compression ratio"); +DECLARE_DTLMOD_EXCEPTION(SubscriberSideCompressionException, "Compression can only be applied on the publisher side"); } // namespace dtlmod diff --git a/include/dtlmod/ReductionMethod.hpp b/include/dtlmod/ReductionMethod.hpp index 90f8c0a..480adb6 100644 --- a/include/dtlmod/ReductionMethod.hpp +++ b/include/dtlmod/ReductionMethod.hpp @@ -26,6 +26,8 @@ class ReductionMethod { public: ReductionMethod(const std::string& name) : name_(name) {} + virtual ~ReductionMethod() = default; + virtual void parameterize_for_variable(const Variable& var, const std::map& parameters) = 0; virtual void reduce_variable(const Variable& var) = 0; virtual size_t get_reduced_variable_global_size(const Variable& var) const = 0; @@ -34,6 +36,7 @@ class ReductionMethod { virtual const std::pair, std::vector>& get_reduced_start_and_count_for(const Variable& var, simgrid::s4u::ActorPtr publisher) const = 0; virtual double get_flop_amount_to_reduce_variable(const Variable& var) const = 0; + virtual double get_flop_amount_to_decompress_variable(const Variable& /*var*/) const { return 0.0; } /// @brief Helper function to print out the name of the ReductionMethod. /// @return The corresponding string diff --git a/src/CompressionReductionMethod.cpp b/src/CompressionReductionMethod.cpp index 171e7b8..a87b455 100644 --- a/src/CompressionReductionMethod.cpp +++ b/src/CompressionReductionMethod.cpp @@ -4,7 +4,7 @@ * under the terms of the license (GNU LGPL) which comes with this package. 
*/ #include -#include // for std::accumulate +#include #include "dtlmod/CompressionReductionMethod.hpp" #include "dtlmod/DTLException.hpp" @@ -13,12 +13,72 @@ XBT_LOG_NEW_DEFAULT_SUBCATEGORY(dtlmod_compression_reduction, dtlmod, "DTL loggi namespace dtlmod { +double CompressionReductionMethod::ParameterizedCompression::get_effective_ratio(unsigned int transaction_id) const +{ + if (ratio_variability_ <= 0.0) + return compression_ratio_; + // Deterministic noise from hash of (variable_name, transaction_id) + size_t seed = std::hash{}(var_->get_name()) ^ (std::hash{}(transaction_id) << 1); + // Map to [1 - variability, 1 + variability] + double noise = 1.0 + ratio_variability_ * (2.0 * (seed % 10001) / 10000.0 - 1.0); + return std::max(1.0, compression_ratio_ * noise); +} + +size_t CompressionReductionMethod::get_reduced_variable_global_size(const Variable& var) const +{ + auto ratio = per_variable_parameterizations_.at(&var)->get_compression_ratio(); + return static_cast(std::ceil(var.get_global_size() / ratio)); +} + +size_t CompressionReductionMethod::get_reduced_variable_local_size(const Variable& var) const +{ + auto ratio = per_variable_parameterizations_.at(&var)->get_compression_ratio(); + return static_cast(std::ceil(var.get_local_size() / ratio)); +} + +double CompressionReductionMethod::get_flop_amount_to_reduce_variable(const Variable& var) const +{ + auto param = per_variable_parameterizations_.at(&var); + auto num_elements = var.get_local_size() / var.get_element_size(); + return param->get_compression_cost_per_element() * num_elements; +} + +double CompressionReductionMethod::get_flop_amount_to_decompress_variable(const Variable& var) const +{ + auto param = per_variable_parameterizations_.at(&var); + auto num_elements = var.get_local_size() / var.get_element_size(); + return param->get_decompression_cost_per_element() * num_elements; +} + +double CompressionReductionMethod::derive_compression_ratio(double accuracy, const std::string& profile, + double 
data_smoothness) +{ + if (profile == "sz") { + // SZ-like prediction-based compressor: empirical fit from published benchmarks on scientific data. + // Higher smoothness → better prediction → higher ratio. + double alpha = 3.0; + double beta = 0.8; + return std::max(1.0, alpha * std::pow(-std::log10(accuracy), beta) * (0.5 + data_smoothness)); + } else if (profile == "zfp") { + // ZFP-like transform-based compressor: rate = bits-per-value derived from accuracy. + // 64 bits (double) / rate gives the compression ratio. + double rate = std::max(1.0, -std::log2(accuracy) + 1.0); + return std::max(1.0, 64.0 / rate); + } + // "fixed" profile: ratio must be user-specified (handled by the caller) + return 1.0; +} + void CompressionReductionMethod::parameterize_for_variable(const Variable& var, const std::map& parameters) { - double new_accuracy = 1.0; + double new_accuracy = 1e-3; double new_compression_cost_per_element = 1.0; double new_decompression_cost_per_element = 1.0; + double new_compression_ratio = 0.0; // 0 means "not specified, must be derived" + std::string new_compressor_profile = "fixed"; + double new_data_smoothness = 0.5; + double new_ratio_variability = 0.0; // Detect existing parameterization (if any). 
auto it = per_variable_parameterizations_.find(&var); @@ -30,8 +90,14 @@ void CompressionReductionMethod::parameterize_for_variable(const Variable& var, new_accuracy = existing->get_accuracy(); new_compression_cost_per_element = existing->get_compression_cost_per_element(); new_decompression_cost_per_element = existing->get_decompression_cost_per_element(); + new_compression_ratio = existing->get_compression_ratio(); + new_compressor_profile = existing->get_compressor_profile(); + new_data_smoothness = existing->get_data_smoothness(); + new_ratio_variability = existing->get_ratio_variability(); } + bool ratio_explicitly_set = false; + for (const auto& [key, value] : parameters) { if (key == "accuracy") { new_accuracy = std::stod(value); @@ -39,28 +105,69 @@ void CompressionReductionMethod::parameterize_for_variable(const Variable& var, new_compression_cost_per_element = std::stod(value); } else if (key == "decompression_cost_per_element") { new_decompression_cost_per_element = std::stod(value); + } else if (key == "compression_ratio") { + new_compression_ratio = std::stod(value); + ratio_explicitly_set = true; + } else if (key == "compressor") { + if (value == "fixed" || value == "sz" || value == "zfp") + new_compressor_profile = value; + else + throw UnknownCompressionOptionException(XBT_THROW_POINT, "Unknown compressor profile: " + value + + " (options are: fixed, sz, or zfp)."); + } else if (key == "data_smoothness") { + new_data_smoothness = std::stod(value); + } else if (key == "ratio_variability") { + new_ratio_variability = std::stod(value); } else { throw UnknownCompressionOptionException(XBT_THROW_POINT, key.c_str()); } } + // Derive compression ratio if not explicitly specified + if (!ratio_explicitly_set && !exists) { + if (new_compressor_profile == "fixed") + throw InconsistentCompressionRatioException( + XBT_THROW_POINT, "Compressor profile 'fixed' requires an explicit 'compression_ratio' parameter."); + new_compression_ratio = 
derive_compression_ratio(new_accuracy, new_compressor_profile, new_data_smoothness); + } else if (ratio_explicitly_set) { + if (new_compression_ratio < 1.0) + throw InconsistentCompressionRatioException(XBT_THROW_POINT, "Compression ratio must be >= 1.0"); + } + + XBT_DEBUG("Compression parameterization for Variable %s: profile=%s, accuracy=%.2e, ratio=%.2f, " + "compression_cost=%.2f, decompression_cost=%.2f, smoothness=%.2f, variability=%.2f", + var.get_cname(), new_compressor_profile.c_str(), new_accuracy, new_compression_ratio, + new_compression_cost_per_element, new_decompression_cost_per_element, new_data_smoothness, + new_ratio_variability); + if (!exists) { - // First-time parameterization per_variable_parameterizations_.try_emplace( - &var, std::make_shared(new_accuracy, new_compression_cost_per_element, - new_decompression_cost_per_element)); + &var, std::make_shared( + var, new_accuracy, new_compression_cost_per_element, new_decompression_cost_per_element, + new_compression_ratio, new_compressor_profile, new_data_smoothness, new_ratio_variability)); return; } // If already exists, update only if changed. const auto& existing = it->second; - // Compare with existing to avoid unnecessary churn if (existing->get_accuracy() != new_accuracy) existing->set_accuracy(new_accuracy); if (existing->get_compression_cost_per_element() != new_compression_cost_per_element) existing->set_compression_cost_per_element(new_compression_cost_per_element); if (existing->get_decompression_cost_per_element() != new_decompression_cost_per_element) existing->set_decompression_cost_per_element(new_decompression_cost_per_element); + if (ratio_explicitly_set || new_compressor_profile != existing->get_compressor_profile()) { + double updated_ratio = ratio_explicitly_set + ? 
new_compression_ratio + : derive_compression_ratio(new_accuracy, new_compressor_profile, new_data_smoothness); + existing->set_compression_ratio(updated_ratio); + } + if (existing->get_compressor_profile() != new_compressor_profile) + existing->set_compressor_profile(new_compressor_profile); + if (existing->get_data_smoothness() != new_data_smoothness) + existing->set_data_smoothness(new_data_smoothness); + if (existing->get_ratio_variability() != new_ratio_variability) + existing->set_ratio_variability(new_ratio_variability); } } // namespace dtlmod diff --git a/src/Engine.cpp b/src/Engine.cpp index 71951a0..1c1e578 100644 --- a/src/Engine.cpp +++ b/src/Engine.cpp @@ -60,6 +60,13 @@ void Engine::get(const std::shared_ptr& var) const } transport_->get(var); + + // Decompression cost after receiving compressed data (e.g., publisher-side compression) + if (var->is_reduced()) { + double decompression_flops = var->get_reduction_method()->get_flop_amount_to_decompress_variable(*var); + if (decompression_flops > 0) + sg4::this_actor::execute(decompression_flops); + } } /// This function first synchronizes all the subscribers thanks to the internal barrier. When the last subscriber diff --git a/src/Variable.cpp b/src/Variable.cpp index 036504c..5fb5f34 100644 --- a/src/Variable.cpp +++ b/src/Variable.cpp @@ -4,6 +4,7 @@ * under the terms of the license (GNU LGPL) which comes with this package. 
*/ #include "dtlmod/Variable.hpp" +#include "dtlmod/CompressionReductionMethod.hpp" #include "dtlmod/DTLException.hpp" #include "dtlmod/Stream.hpp" #include @@ -73,6 +74,11 @@ void Variable::set_reduction_operation(std::shared_ptr method, "Variable has already been reduced by its producer; subscriber-side reduction is not allowed."); } + // Compression is publisher-side only + if (dynamic_cast(method.get()) && stream->get_access_mode() == Stream::Mode::Subscribe) { + throw SubscriberSideCompressionException(XBT_THROW_POINT); + } + method->parameterize_for_variable(*this, parameters); method->reduce_variable(*this); is_reduced_with_ = method; From 5399459bf97860b466e36f26586fd57b91e5551e Mon Sep 17 00:00:00 2001 From: Fred Suter Date: Fri, 13 Feb 2026 11:52:18 -0500 Subject: [PATCH 34/92] Propagate reduction state when inquiring on a Variable --- src/Stream.cpp | 6 ++++++ src/Variable.cpp | 10 +++++----- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/src/Stream.cpp b/src/Stream.cpp index d8c9f76..80d4159 100644 --- a/src/Stream.cpp +++ b/src/Stream.cpp @@ -377,6 +377,12 @@ std::shared_ptr Stream::inquire_variable(std::string_view name) const std::vector(var->second->get_shape().size(), 0))); new_var->set_metadata(var->second->get_metadata()); + // Propagate reduction state so subscribers can detect publisher-side reduction + if (var->second->is_reduced()) { + new_var->is_reduced_with_ = var->second->get_reduction_method(); + new_var->reduction_origin_ = var->second->reduction_origin_; + } + return new_var; } } diff --git a/src/Variable.cpp b/src/Variable.cpp index 5fb5f34..8dc0bfb 100644 --- a/src/Variable.cpp +++ b/src/Variable.cpp @@ -65,6 +65,11 @@ void Variable::set_reduction_operation(std::shared_ptr method, auto stream = defined_in_stream_.lock(); xbt_assert(stream, "Variable::set_reduction_operation called after its Stream has been destroyed"); + // Compression is publisher-side only (check before double-reduction to give a more specific 
error) + if (dynamic_cast(method.get()) && stream->get_access_mode() == Stream::Mode::Subscribe) { + throw SubscriberSideCompressionException(XBT_THROW_POINT); + } + if (is_reduced_with_ && reduction_origin_ == ReductionOrigin::Publisher && stream->get_access_mode() == Stream::Mode::Subscribe) { XBT_ERROR("Subscriber %s attempted to re-reduce Variable %s, but it was already reduced on publisher side.", @@ -74,11 +79,6 @@ void Variable::set_reduction_operation(std::shared_ptr method, "Variable has already been reduced by its producer; subscriber-side reduction is not allowed."); } - // Compression is publisher-side only - if (dynamic_cast(method.get()) && stream->get_access_mode() == Stream::Mode::Subscribe) { - throw SubscriberSideCompressionException(XBT_THROW_POINT); - } - method->parameterize_for_variable(*this, parameters); method->reduce_variable(*this); is_reduced_with_ = method; From 5e893ee11995d793f1d5eb10d308861e533e367d Mon Sep 17 00:00:00 2001 From: Fred Suter Date: Fri, 13 Feb 2026 11:52:49 -0500 Subject: [PATCH 35/92] more tests on reduction --- test/dtl_reduction.cpp | 203 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 203 insertions(+) diff --git a/test/dtl_reduction.cpp b/test/dtl_reduction.cpp index d86d6c3..031093d 100644 --- a/test/dtl_reduction.cpp +++ b/test/dtl_reduction.cpp @@ -306,3 +306,206 @@ TEST_F(DTLReductionTest, SinglePubSingleSubDecimationOnRead) ASSERT_NO_THROW(sg4::Engine::get_instance()->run()); }); } + +TEST_F(DTLReductionTest, BogusCompressionSetting) +{ + DO_TEST_WITH_FORK([this]() { + this->setup_platform(); + host_->add_actor("TestActor", [this]() { + std::shared_ptr compressor; + XBT_INFO("Connect to the DTL"); + auto dtl = dtlmod::DTL::connect(); + XBT_INFO("Create a stream"); + auto stream = dtl->add_stream("my-output"); + stream->set_transport_method(dtlmod::Transport::Method::File); + stream->set_engine_type(dtlmod::Engine::Type::File); + XBT_INFO("Create a 3D variable"); + auto var = 
stream->define_variable("var3D", {640, 640, 640}, {0, 0, 0}, {640, 640, 640}, sizeof(double)); + XBT_INFO("Define a Compression Reduction Method"); + ASSERT_NO_THROW(compressor = stream->define_reduction_method("compression")); + XBT_INFO("Assign the compression method with a bogus option, should fail"); + ASSERT_THROW(var->set_reduction_operation(compressor, {{"bogus", "1"}}), + dtlmod::UnknownCompressionOptionException); + XBT_INFO("Assign the compression method with 'fixed' profile but no ratio, should fail"); + ASSERT_THROW(var->set_reduction_operation(compressor, {{"compressor", "fixed"}}), + dtlmod::InconsistentCompressionRatioException); + XBT_INFO("Assign the compression method with ratio < 1, should fail"); + ASSERT_THROW(var->set_reduction_operation(compressor, {{"compression_ratio", "0.5"}}), + dtlmod::InconsistentCompressionRatioException); + XBT_INFO("Assign the compression method with unknown compressor profile, should fail"); + ASSERT_THROW(var->set_reduction_operation(compressor, {{"compressor", "bogus"}}), + dtlmod::UnknownCompressionOptionException); + + XBT_INFO("Disconnect the actor from the DTL"); + dtlmod::DTL::disconnect(); + }); + + // Run the simulation + ASSERT_NO_THROW(sg4::Engine::get_instance()->run()); + }); +} + +TEST_F(DTLReductionTest, SimpleCompressionFileEngine) +{ + DO_TEST_WITH_FORK([this]() { + this->setup_platform(); + host_->add_actor("Publisher", [this]() { + XBT_INFO("Connect to the DTL"); + auto dtl = dtlmod::DTL::connect(); + XBT_INFO("Create a stream"); + auto stream = dtl->add_stream("my-output"); + stream->set_transport_method(dtlmod::Transport::Method::File); + stream->set_engine_type(dtlmod::Engine::Type::File); + XBT_INFO("Create a 2D variable with 1000x1000 doubles"); + auto var = stream->define_variable("var2D", {1000, 1000}, {0, 0}, {1000, 1000}, sizeof(double)); + XBT_INFO("Define a Compression Reduction Method"); + auto compressor = stream->define_reduction_method("compression"); + + XBT_INFO("Open the stream 
in Publish mode"); + auto engine = stream->open("zone:my_fs:/host/scratch/my-working-dir/my-output", dtlmod::Stream::Mode::Publish); + sg4::this_actor::sleep_for(1); + + XBT_INFO("Assign compression with fixed ratio of 10"); + ASSERT_NO_THROW(var->set_reduction_operation(compressor, {{"compression_ratio", "10"}, + {"compression_cost_per_element", "5"}, + {"decompression_cost_per_element", "2"}})); + ASSERT_TRUE(var->is_reduced()); + ASSERT_TRUE(var->is_reduced_by_publisher()); + XBT_INFO("Verify reduced sizes"); + size_t original_global_size = sizeof(double) * 1000 * 1000; + size_t expected_reduced = static_cast(std::ceil(original_global_size / 10.0)); + ASSERT_EQ(compressor->get_reduced_variable_global_size(*var), expected_reduced); + ASSERT_EQ(compressor->get_reduced_variable_local_size(*var), expected_reduced); + XBT_INFO("Verify that shape is unchanged"); + auto reduced_shape = compressor->get_reduced_variable_shape(*var); + ASSERT_EQ(reduced_shape.size(), 2u); + ASSERT_EQ(reduced_shape[0], 1000u); + ASSERT_EQ(reduced_shape[1], 1000u); + XBT_INFO("Verify compression flop cost"); + double expected_flops = 5.0 * 1000 * 1000; // cost_per_element * num_elements + ASSERT_DOUBLE_EQ(compressor->get_flop_amount_to_reduce_variable(*var), expected_flops); + XBT_INFO("Verify decompression flop cost"); + double expected_decomp_flops = 2.0 * 1000 * 1000; + ASSERT_DOUBLE_EQ(compressor->get_flop_amount_to_decompress_variable(*var), expected_decomp_flops); + engine->begin_transaction(); + ASSERT_NO_THROW(engine->put(var)); + engine->end_transaction(); + sg4::this_actor::sleep_for(1); + engine->close(); + + XBT_INFO("Disconnect the actor from the DTL"); + dtlmod::DTL::disconnect(); + }); + + // Run the simulation + ASSERT_NO_THROW(sg4::Engine::get_instance()->run()); + }); +} + +TEST_F(DTLReductionTest, CompressionWithDerivedRatio) +{ + DO_TEST_WITH_FORK([this]() { + this->setup_platform(); + host_->add_actor("TestActor", [this]() { + XBT_INFO("Connect to the DTL"); + auto dtl 
= dtlmod::DTL::connect(); + auto stream = dtl->add_stream("my-output"); + stream->set_transport_method(dtlmod::Transport::Method::File); + stream->set_engine_type(dtlmod::Engine::Type::File); + auto var = stream->define_variable("var2D", {1000, 1000}, {0, 0}, {1000, 1000}, sizeof(double)); + size_t orig_size = sizeof(double) * 1000 * 1000; + + XBT_INFO("Test SZ profile: accuracy=1e-3, data_smoothness=0.5"); + auto sz_compressor = stream->define_reduction_method("compression"); + ASSERT_NO_THROW(var->set_reduction_operation( + sz_compressor, {{"compressor", "sz"}, {"accuracy", "1e-3"}, {"data_smoothness", "0.5"}})); + ASSERT_TRUE(var->is_reduced()); + // SZ model: ratio = 3.0 * pow(3, 0.8) * 1.0 ≈ 7.22 + size_t sz_reduced = sz_compressor->get_reduced_variable_global_size(*var); + ASSERT_GT(sz_reduced, 0u); + ASSERT_LT(sz_reduced, orig_size); + XBT_INFO("SZ reduced size: %zu (original: %zu, ratio: %.2f)", sz_reduced, orig_size, + static_cast(orig_size) / sz_reduced); + + XBT_INFO("Test ZFP profile: accuracy=1e-6"); + auto stream2 = dtl->add_stream("my-output-2"); + stream2->set_transport_method(dtlmod::Transport::Method::File); + stream2->set_engine_type(dtlmod::Engine::Type::File); + auto var2 = stream2->define_variable("var2D", {1000, 1000}, {0, 0}, {1000, 1000}, sizeof(double)); + auto zfp_compressor = stream2->define_reduction_method("compression"); + ASSERT_NO_THROW(var2->set_reduction_operation(zfp_compressor, {{"compressor", "zfp"}, {"accuracy", "1e-6"}})); + // ZFP model: rate = max(1.0, -log2(1e-6) + 1.0) ≈ 20.93, ratio = 64.0 / 20.93 ≈ 3.06 + size_t zfp_reduced = zfp_compressor->get_reduced_variable_global_size(*var2); + ASSERT_GT(zfp_reduced, 0u); + ASSERT_LT(zfp_reduced, orig_size); + XBT_INFO("ZFP reduced size: %zu (original: %zu, ratio: %.2f)", zfp_reduced, orig_size, + static_cast(orig_size) / zfp_reduced); + + XBT_INFO("Verify SZ gives higher compression than ZFP at these settings"); + ASSERT_LT(sz_reduced, zfp_reduced); + + XBT_INFO("Disconnect the 
actor from the DTL"); + dtlmod::DTL::disconnect(); + }); + + // Run the simulation + ASSERT_NO_THROW(sg4::Engine::get_instance()->run()); + }); +} + +TEST_F(DTLReductionTest, DoubleReductionForbidden) +{ + DO_TEST_WITH_FORK([this]() { + this->setup_platform(); + host_->add_actor("TestActor", [this]() { + XBT_INFO("Connect to the DTL"); + auto dtl = dtlmod::DTL::connect(); + auto stream = dtl->add_stream("my-output"); + stream->set_transport_method(dtlmod::Transport::Method::File); + stream->set_engine_type(dtlmod::Engine::Type::File); + auto var = stream->define_variable("var", {20000, 20000}, {0, 0}, {20000, 20000}, sizeof(double)); + auto compressor = stream->define_reduction_method("compression"); + auto engine = stream->open("zone:my_fs:/host/scratch/my-working-dir/my-output", dtlmod::Stream::Mode::Publish); + sg4::this_actor::sleep_for(1); + XBT_INFO("Apply publisher-side compression"); + ASSERT_NO_THROW(var->set_reduction_operation(compressor, {{"compression_ratio", "5"}})); + ASSERT_TRUE(var->is_reduced_by_publisher()); + + XBT_INFO("Re-parameterize the same reduction method (allowed — updates parameters)"); + ASSERT_NO_THROW(var->set_reduction_operation(compressor, {{"compression_ratio", "10"}})); + ASSERT_TRUE(var->is_reduced_by_publisher()); + + engine->begin_transaction(); + ASSERT_NO_THROW(engine->put(var)); + engine->end_transaction(); + sg4::this_actor::sleep_for(1); + engine->close(); + dtlmod::DTL::disconnect(); + + XBT_INFO("Wait and reconnect as subscriber"); + sg4::this_actor::sleep_until(10); + dtl = dtlmod::DTL::connect(); + engine = stream->open("zone:my_fs:/host/scratch/my-working-dir/my-output", dtlmod::Stream::Mode::Subscribe); + auto var_sub = stream->inquire_variable("var"); + + XBT_INFO("Verify that var_sub carries publisher reduction state"); + ASSERT_TRUE(var_sub->is_reduced()); + ASSERT_TRUE(var_sub->is_reduced_by_publisher()); + + XBT_INFO("Attempt subscriber-side compression, should fail (compression is publisher-side only)"); + 
auto sub_compressor = stream->define_reduction_method("compression"); + ASSERT_THROW(var_sub->set_reduction_operation(sub_compressor, {{"compression_ratio", "2"}}), + dtlmod::SubscriberSideCompressionException); + + XBT_INFO("Attempt subscriber-side decimation on a publisher-reduced variable, should fail (double reduction)"); + auto decimator = stream->define_reduction_method("decimation"); + ASSERT_THROW(var_sub->set_reduction_operation(decimator, {{"stride", "2,2"}}), dtlmod::DoubleReductionException); + + engine->close(); + dtlmod::DTL::disconnect(); + }); + + // Run the simulation + ASSERT_NO_THROW(sg4::Engine::get_instance()->run()); + }); +} From d54814f35eafbe8558e691cd2ab3030d0b59e68f Mon Sep 17 00:00:00 2001 From: Fred Suter Date: Fri, 13 Feb 2026 15:02:35 -0500 Subject: [PATCH 36/92] extend python bindings with reduction --- src/bindings/python/dtlmod_python.cpp | 46 +++++++++++++++++++++++++-- 1 file changed, 44 insertions(+), 2 deletions(-) diff --git a/src/bindings/python/dtlmod_python.cpp b/src/bindings/python/dtlmod_python.cpp index 79a62b5..9f96a02 100644 --- a/src/bindings/python/dtlmod_python.cpp +++ b/src/bindings/python/dtlmod_python.cpp @@ -9,12 +9,14 @@ #include #include +#include #include #include #include #include #include #include +#include #include #include #include @@ -29,6 +31,7 @@ namespace py = pybind11; using dtlmod::DTL; using dtlmod::Engine; +using dtlmod::ReductionMethod; using dtlmod::Stream; using dtlmod::Transport; using dtlmod::Variable; @@ -73,6 +76,18 @@ PYBIND11_MODULE(dtlmod, m) py::register_exception(m, "GetWhenNoTransactionException"); + py::register_exception(m, "UnknownReductionMethodException"); + py::register_exception(m, "InconsistentDecimationStrideException"); + py::register_exception( + m, "InconsistentDecimationInterpolationException"); + py::register_exception(m, "UnknownDecimationOptionException"); + py::register_exception(m, "UnknownDecimationInterpolationException"); + py::register_exception(m, 
"DoubleReductionException"); + + py::register_exception(m, "UnknownCompressionOptionException"); + py::register_exception(m, "InconsistentCompressionRatioException"); + py::register_exception(m, "SubscriberSideCompressionException"); + /* Class DTL */ py::class_>(m, "DTL", "Data Transport Layer") .def_static("create", py::overload_cast(&DTL::create), py::call_guard(), @@ -147,7 +162,9 @@ PYBIND11_MODULE(dtlmod, m) .def_property_readonly("metadata_file_name", &Stream::get_metadata_file_name, "The name of the file in which the stream stores metadata (read-only)") .def("inquire_variable", &Stream::inquire_variable, py::arg("name"), "Retrieve a Variable information by name") - .def("remove_variable", &Stream::remove_variable, py::arg("name"), "Remove a Variable from this Stream"); + .def("remove_variable", &Stream::remove_variable, py::arg("name"), "Remove a Variable from this Stream") + .def("define_reduction_method", &Stream::define_reduction_method, py::arg("name"), + "Define a reduction method for this Stream (e.g. 
'decimation' or 'compression')"); py::enum_(stream, "Mode", "The access mode for a Stream") .value("Publish", Stream::Mode::Publish) @@ -172,7 +189,32 @@ PYBIND11_MODULE(dtlmod, m) [](Variable& self, unsigned int begin, unsigned int count) { self.set_transaction_selection(begin, count); }, py::arg("begin"), py::arg("count"), "Set the selection of transactions to consider for this Variable") .def("set_selection", &Variable::set_selection, py::arg("start"), py::arg("count"), - "Set the selection of elements to consider for this Variable"); + "Set the selection of elements to consider for this Variable") + .def("set_reduction_operation", &Variable::set_reduction_operation, py::arg("method"), py::arg("parameters"), + "Set a reduction operation on this Variable with the given method and parameters") + .def_property_readonly("is_reduced", &Variable::is_reduced, + "Whether this Variable has a reduction method applied (read-only)") + .def_property_readonly("is_reduced_by_publisher", &Variable::is_reduced_by_publisher, + "Whether this Variable was reduced on the publisher side (read-only)") + .def_property_readonly("is_reduced_by_subscriber", &Variable::is_reduced_by_subscriber, + "Whether this Variable was reduced on the subscriber side (read-only)") + .def_property_readonly("reduction_method", &Variable::get_reduction_method, + "The reduction method applied to this Variable, or None (read-only)"); + + /* Class ReductionMethod */ + py::class_>(m, "ReductionMethod", + "A reduction method applied to Variables in a Stream") + .def_property_readonly("name", &ReductionMethod::get_name, "The name of the ReductionMethod (read-only)") + .def("get_reduced_variable_global_size", &ReductionMethod::get_reduced_variable_global_size, py::arg("var"), + "Get the reduced global size of a Variable") + .def("get_reduced_variable_local_size", &ReductionMethod::get_reduced_variable_local_size, py::arg("var"), + "Get the reduced local size of a Variable") + .def("get_reduced_variable_shape", 
&ReductionMethod::get_reduced_variable_shape, py::arg("var"), + "Get the reduced shape of a Variable") + .def("get_flop_amount_to_reduce_variable", &ReductionMethod::get_flop_amount_to_reduce_variable, py::arg("var"), + "Get the flop cost to reduce a Variable") + .def("get_flop_amount_to_decompress_variable", &ReductionMethod::get_flop_amount_to_decompress_variable, + py::arg("var"), "Get the flop cost to decompress a Variable"); /* Class Engine */ py::class_> engine( From 948c53b253c5d397ad9fda35c05a8747c7d3b546 Mon Sep 17 00:00:00 2001 From: Fred Suter Date: Fri, 13 Feb 2026 15:29:14 -0500 Subject: [PATCH 37/92] add python tests for reduction --- test/python/dtl_reduction.py | 452 +++++++++++++++++++++++++++++++ test/python/unit_tests_python.py | 3 +- 2 files changed, 454 insertions(+), 1 deletion(-) create mode 100644 test/python/dtl_reduction.py diff --git a/test/python/dtl_reduction.py b/test/python/dtl_reduction.py new file mode 100644 index 0000000..4dc8c5c --- /dev/null +++ b/test/python/dtl_reduction.py @@ -0,0 +1,452 @@ +# Copyright (c) 2025-2026. The SWAT Team. All rights reserved. +# +# This program is free software you can redistribute it and/or modify it +# under the terms of the license (GNU LGPL) which comes with this package. 
+ +import ctypes +import math +import sys +import multiprocessing + +from simgrid import Engine, this_actor +from fsmod import FileSystem, OneDiskStorage +from dtlmod import (DTL, Engine as DTLEngine, Stream, Transport, + UnknownReductionMethodException, + UnknownDecimationOptionException, + InconsistentDecimationStrideException, + UnknownDecimationInterpolationException, + UnknownCompressionOptionException, + InconsistentCompressionRatioException, + SubscriberSideCompressionException, + DoubleReductionException) + + +def setup_platform(): + e = Engine(sys.argv) + e.set_log_control("no_loc") + e.set_log_control("root.thresh:critical") + + zone = e.netzone_root.add_netzone_empty("zone") + host = zone.add_host("host", "6Gf") + disk = host.add_disk("disk", "560MBps", "510MBps") + zone.seal() + + my_fs = FileSystem.create("my_fs") + FileSystem.register_file_system(zone, my_fs) + local_storage = OneDiskStorage.create("local_storage", disk) + my_fs.mount_partition("/host/scratch/", local_storage, "100GB") + + DTL.create() + return e, host + + +def run_test_bogus_decimation_setting(): + e, host = setup_platform() + + def test_actor(): + this_actor.info("Connect to the DTL") + dtl = DTL.connect() + this_actor.info("Create a stream") + stream = dtl.add_stream("my-output") + stream.set_transport_method(Transport.Method.File) + stream.set_engine_type(DTLEngine.Type.File) + this_actor.info("Create a 3D variable") + var = stream.define_variable("var3D", (640, 640, 640), (0, 0, 0), (640, 640, 640), + ctypes.sizeof(ctypes.c_double)) + this_actor.info("Define an unknown Reduction Method, should fail") + try: + stream.define_reduction_method("reduction") + assert False, "Expected UnknownReductionMethodException was not raised" + except UnknownReductionMethodException: + pass + this_actor.info("Define a Decimation Reduction Method") + decimator = stream.define_reduction_method("decimation") + this_actor.info("Assign the decimation method with a bogus option, should fail") + try: + 
var.set_reduction_operation(decimator, {"bogus": "-1"}) + assert False, "Expected UnknownDecimationOptionException was not raised" + except UnknownDecimationOptionException: + pass + this_actor.info("Assign the decimation method with only a 2D stride, should fail") + try: + var.set_reduction_operation(decimator, {"stride": "1,2"}) + assert False, "Expected InconsistentDecimationStrideException was not raised" + except InconsistentDecimationStrideException: + pass + this_actor.info("Assign the decimation method with a negative stride value, should fail") + try: + var.set_reduction_operation(decimator, {"stride": "1,2,-1"}) + assert False, "Expected InconsistentDecimationStrideException was not raised" + except InconsistentDecimationStrideException: + pass + this_actor.info("Assign the decimation method with a stride value of 0, should fail") + try: + var.set_reduction_operation(decimator, {"stride": "1,0,1"}) + assert False, "Expected InconsistentDecimationStrideException was not raised" + except InconsistentDecimationStrideException: + pass + this_actor.info("Assign the decimation method with an unknown interpolation method, should fail") + try: + var.set_reduction_operation(decimator, {"stride": "1,2,4", "interpolation": "bogus"}) + assert False, "Expected UnknownDecimationInterpolationException was not raised" + except UnknownDecimationInterpolationException: + pass + + this_actor.info("Disconnect the actor from the DTL") + DTL.disconnect() + + host.add_actor("TestActor", test_actor) + e.run() + + +def run_test_simple_decimation_file_engine(): + e, host = setup_platform() + + def test_actor(): + this_actor.info("Connect to the DTL") + dtl = DTL.connect() + this_actor.info("Create a stream") + stream = dtl.add_stream("my-output") + stream.set_transport_method(Transport.Method.File) + stream.set_engine_type(DTLEngine.Type.File) + this_actor.info("Create a 3D variable") + var = stream.define_variable("var3D", (640, 640, 640), (0, 0, 0), (640, 640, 640), + 
ctypes.sizeof(ctypes.c_double)) + this_actor.info("Define a Decimation Reduction Method") + decimator = stream.define_reduction_method("decimation") + this_actor.info("Open the stream in Publish mode") + engine = stream.open("zone:my_fs:/host/scratch/my-working-dir/my-output", Stream.Mode.Publish) + this_actor.sleep_for(1) + + this_actor.info("Start a Transaction (no reduction)") + engine.begin_transaction() + engine.put(var) + engine.end_transaction() + this_actor.sleep_until(6) + + this_actor.info("Assign the decimation method to var3D") + var.set_reduction_operation(decimator, {"stride": "1,2,4"}) + assert var.is_reduced + this_actor.info("Start a Transaction (with reduction)") + engine.begin_transaction() + engine.put(var) + engine.end_transaction() + this_actor.sleep_until(8) + + this_actor.info("Triple the cost per element") + var.set_reduction_operation(decimator, {"cost_per_element": "3"}) + engine.begin_transaction() + engine.put(var) + engine.end_transaction() + this_actor.sleep_until(10) + + this_actor.info("Create a second 3D variable with different decimation") + var2 = stream.define_variable("var3D_2", (640, 640, 640), (0, 0, 0), (640, 640, 640), + ctypes.sizeof(ctypes.c_double)) + var2.set_reduction_operation(decimator, {"stride": "2,2,2", "interpolation": "quadratic"}) + engine.begin_transaction() + engine.put(var2) + engine.end_transaction() + + this_actor.info("Close the engine") + engine.close() + this_actor.info("Disconnect the actor from the DTL") + DTL.disconnect() + + host.add_actor("TestActor", test_actor) + e.run() + + +def run_test_single_pub_single_sub_decimation_on_read(): + e, host = setup_platform() + + def test_actor(): + dtl = DTL.connect() + stream = dtl.add_stream("my-output") + stream.set_transport_method(Transport.Method.File) + stream.set_engine_type(DTLEngine.Type.File) + this_actor.info("Create a 2D variable with 20kx20k doubles") + var = stream.define_variable("var", (20000, 20000), (0, 0), (20000, 20000), 
ctypes.sizeof(ctypes.c_double)) + engine = stream.open("zone:my_fs:/host/scratch/my-working-dir/my-output", Stream.Mode.Publish) + this_actor.sleep_for(1) + + this_actor.info("Put the variable") + engine.begin_transaction() + engine.put(var) + engine.end_transaction() + + this_actor.info("Close the engine") + engine.close() + DTL.disconnect() + + assert dtl.has_active_connections == False + + this_actor.info("Wait until 10s before becoming a Subscriber") + this_actor.sleep_until(10) + dtl = DTL.connect() + + this_actor.info("Define a Decimation Reduction Method on Subscriber side") + decimator = stream.define_reduction_method("decimation") + engine = stream.open("zone:my_fs:/host/scratch/my-working-dir/my-output", Stream.Mode.Subscribe) + var_sub = stream.inquire_variable("var") + assert var_sub.name == "var" + assert var_sub.global_size == 8 * 20000 * 20000 + + this_actor.info("Get the entire variable (no reduction)") + engine.begin_transaction() + engine.get(var_sub) + engine.end_transaction() + + this_actor.info("Assign the decimation method to var_sub") + var_sub.set_reduction_operation(decimator, {"stride": "2,2"}) + + this_actor.info("Get a decimated version of the variable") + engine.begin_transaction() + engine.get(var_sub) + engine.end_transaction() + + this_actor.info("Check local size of var_sub. 
Should be 800,000,000 bytes") + assert var_sub.local_size == 8 * 10000 * 10000 + + this_actor.info("Close the engine") + engine.close() + this_actor.info("Disconnect the actor") + DTL.disconnect() + + host.add_actor("TestActor", test_actor) + e.run() + + +def run_test_bogus_compression_setting(): + e, host = setup_platform() + + def test_actor(): + this_actor.info("Connect to the DTL") + dtl = DTL.connect() + this_actor.info("Create a stream") + stream = dtl.add_stream("my-output") + stream.set_transport_method(Transport.Method.File) + stream.set_engine_type(DTLEngine.Type.File) + this_actor.info("Create a 3D variable") + var = stream.define_variable("var3D", (640, 640, 640), (0, 0, 0), (640, 640, 640), + ctypes.sizeof(ctypes.c_double)) + this_actor.info("Define a Compression Reduction Method") + compressor = stream.define_reduction_method("compression") + + this_actor.info("Assign the compression method with a bogus option, should fail") + try: + var.set_reduction_operation(compressor, {"bogus": "1"}) + assert False, "Expected UnknownCompressionOptionException was not raised" + except UnknownCompressionOptionException: + pass + this_actor.info("Assign with 'fixed' profile but no ratio, should fail") + try: + var.set_reduction_operation(compressor, {"compressor": "fixed"}) + assert False, "Expected InconsistentCompressionRatioException was not raised" + except InconsistentCompressionRatioException: + pass + this_actor.info("Assign with ratio < 1, should fail") + try: + var.set_reduction_operation(compressor, {"compression_ratio": "0.5"}) + assert False, "Expected InconsistentCompressionRatioException was not raised" + except InconsistentCompressionRatioException: + pass + this_actor.info("Assign with unknown compressor profile, should fail") + try: + var.set_reduction_operation(compressor, {"compressor": "bogus"}) + assert False, "Expected UnknownCompressionOptionException was not raised" + except UnknownCompressionOptionException: + pass + + 
this_actor.info("Disconnect the actor from the DTL") + DTL.disconnect() + + host.add_actor("TestActor", test_actor) + e.run() + + +def run_test_simple_compression_file_engine(): + e, host = setup_platform() + + def test_actor(): + this_actor.info("Connect to the DTL") + dtl = DTL.connect() + this_actor.info("Create a stream") + stream = dtl.add_stream("my-output") + stream.set_transport_method(Transport.Method.File) + stream.set_engine_type(DTLEngine.Type.File) + this_actor.info("Create a 2D variable with 1000x1000 doubles") + var = stream.define_variable("var2D", (1000, 1000), (0, 0), (1000, 1000), ctypes.sizeof(ctypes.c_double)) + this_actor.info("Define a Compression Reduction Method") + compressor = stream.define_reduction_method("compression") + + this_actor.info("Open the stream in Publish mode") + engine = stream.open("zone:my_fs:/host/scratch/my-working-dir/my-output", Stream.Mode.Publish) + this_actor.sleep_for(1) + + this_actor.info("Assign compression with fixed ratio of 10") + var.set_reduction_operation(compressor, {"compression_ratio": "10", + "compression_cost_per_element": "5", + "decompression_cost_per_element": "2"}) + assert var.is_reduced + assert var.is_reduced_by_publisher + + this_actor.info("Verify reduced sizes") + original_global_size = ctypes.sizeof(ctypes.c_double) * 1000 * 1000 + expected_reduced = math.ceil(original_global_size / 10.0) + assert compressor.get_reduced_variable_global_size(var) == expected_reduced + assert compressor.get_reduced_variable_local_size(var) == expected_reduced + + this_actor.info("Verify that shape is unchanged") + reduced_shape = compressor.get_reduced_variable_shape(var) + assert len(reduced_shape) == 2 + assert reduced_shape[0] == 1000 + assert reduced_shape[1] == 1000 + + this_actor.info("Verify compression flop cost") + expected_flops = 5.0 * 1000 * 1000 + assert compressor.get_flop_amount_to_reduce_variable(var) == expected_flops + + this_actor.info("Verify decompression flop cost") + 
expected_decomp_flops = 2.0 * 1000 * 1000 + assert compressor.get_flop_amount_to_decompress_variable(var) == expected_decomp_flops + + engine.begin_transaction() + engine.put(var) + engine.end_transaction() + this_actor.sleep_for(1) + engine.close() + + this_actor.info("Disconnect the actor from the DTL") + DTL.disconnect() + + host.add_actor("Publisher", test_actor) + e.run() + + +def run_test_compression_with_derived_ratio(): + e, host = setup_platform() + + def test_actor(): + this_actor.info("Connect to the DTL") + dtl = DTL.connect() + stream = dtl.add_stream("my-output") + stream.set_transport_method(Transport.Method.File) + stream.set_engine_type(DTLEngine.Type.File) + var = stream.define_variable("var2D", (1000, 1000), (0, 0), (1000, 1000), ctypes.sizeof(ctypes.c_double)) + orig_size = ctypes.sizeof(ctypes.c_double) * 1000 * 1000 + + this_actor.info("Test SZ profile: accuracy=1e-3, data_smoothness=0.5") + sz_compressor = stream.define_reduction_method("compression") + var.set_reduction_operation(sz_compressor, {"compressor": "sz", "accuracy": "1e-3", "data_smoothness": "0.5"}) + assert var.is_reduced + sz_reduced = sz_compressor.get_reduced_variable_global_size(var) + assert sz_reduced > 0 + assert sz_reduced < orig_size + this_actor.info(f"SZ reduced size: {sz_reduced} (original: {orig_size}, ratio: {orig_size / sz_reduced:.2f})") + + this_actor.info("Test ZFP profile: accuracy=1e-6") + stream2 = dtl.add_stream("my-output-2") + stream2.set_transport_method(Transport.Method.File) + stream2.set_engine_type(DTLEngine.Type.File) + var2 = stream2.define_variable("var2D", (1000, 1000), (0, 0), (1000, 1000), ctypes.sizeof(ctypes.c_double)) + zfp_compressor = stream2.define_reduction_method("compression") + var2.set_reduction_operation(zfp_compressor, {"compressor": "zfp", "accuracy": "1e-6"}) + zfp_reduced = zfp_compressor.get_reduced_variable_global_size(var2) + assert zfp_reduced > 0 + assert zfp_reduced < orig_size + this_actor.info(f"ZFP reduced size: 
{zfp_reduced} (original: {orig_size}, ratio: {orig_size / zfp_reduced:.2f})") + + this_actor.info("Verify SZ gives higher compression than ZFP at these settings") + assert sz_reduced < zfp_reduced + + this_actor.info("Disconnect the actor from the DTL") + DTL.disconnect() + + host.add_actor("TestActor", test_actor) + e.run() + + +def run_test_double_reduction_forbidden(): + e, host = setup_platform() + + def test_actor(): + this_actor.info("Connect to the DTL") + dtl = DTL.connect() + stream = dtl.add_stream("my-output") + stream.set_transport_method(Transport.Method.File) + stream.set_engine_type(DTLEngine.Type.File) + var = stream.define_variable("var", (20000, 20000), (0, 0), (20000, 20000), ctypes.sizeof(ctypes.c_double)) + compressor = stream.define_reduction_method("compression") + engine = stream.open("zone:my_fs:/host/scratch/my-working-dir/my-output", Stream.Mode.Publish) + this_actor.sleep_for(1) + + this_actor.info("Apply publisher-side compression") + var.set_reduction_operation(compressor, {"compression_ratio": "5"}) + assert var.is_reduced_by_publisher + + this_actor.info("Re-parameterize the same reduction method (allowed)") + var.set_reduction_operation(compressor, {"compression_ratio": "10"}) + assert var.is_reduced_by_publisher + + engine.begin_transaction() + engine.put(var) + engine.end_transaction() + this_actor.sleep_for(1) + engine.close() + DTL.disconnect() + + this_actor.info("Wait and reconnect as subscriber") + this_actor.sleep_until(10) + dtl = DTL.connect() + engine = stream.open("zone:my_fs:/host/scratch/my-working-dir/my-output", Stream.Mode.Subscribe) + var_sub = stream.inquire_variable("var") + + this_actor.info("Verify that var_sub carries publisher reduction state") + assert var_sub.is_reduced + assert var_sub.is_reduced_by_publisher + + this_actor.info("Attempt subscriber-side compression, should fail") + sub_compressor = stream.define_reduction_method("compression") + try: + var_sub.set_reduction_operation(sub_compressor, 
{"compression_ratio": "2"}) + assert False, "Expected SubscriberSideCompressionException was not raised" + except SubscriberSideCompressionException: + pass + + this_actor.info("Attempt subscriber-side decimation on publisher-reduced variable, should fail") + decimator = stream.define_reduction_method("decimation") + try: + var_sub.set_reduction_operation(decimator, {"stride": "2,2"}) + assert False, "Expected DoubleReductionException was not raised" + except DoubleReductionException: + pass + + engine.close() + DTL.disconnect() + + host.add_actor("TestActor", test_actor) + e.run() + + +if __name__ == '__main__': + tests = [ + run_test_bogus_decimation_setting, + run_test_simple_decimation_file_engine, + run_test_single_pub_single_sub_decimation_on_read, + run_test_bogus_compression_setting, + run_test_simple_compression_file_engine, + run_test_compression_with_derived_ratio, + run_test_double_reduction_forbidden, + ] + + for test in tests: + print(f"\n🔧 Run {test.__name__} ...") + p = multiprocessing.Process(target=test) + p.start() + p.join() + + if p.exitcode != 0: + print(f"❌ {test.__name__} failed with exit code {p.exitcode}") + else: + print(f"✅ {test.__name__} passed") diff --git a/test/python/unit_tests_python.py b/test/python/unit_tests_python.py index c0e7980..9e9f7be 100644 --- a/test/python/unit_tests_python.py +++ b/test/python/unit_tests_python.py @@ -8,7 +8,8 @@ "dtl_file_engine.py", "dtl_staging_engine.py", "dtl_stream.py", - "dtl_variable.py" + "dtl_variable.py", + "dtl_reduction.py" ] def run_script(script): From e024e80eeaf0bd0ff37c154855a2bd483af23fc8 Mon Sep 17 00:00:00 2001 From: Fred Suter Date: Sat, 14 Feb 2026 11:40:21 -0500 Subject: [PATCH 38/92] add tests of reduction with staging engine --- test/dtl_reduction.cpp | 133 ++++++++++++++++++++++++++++++++++ test/python/dtl_reduction.py | 135 ++++++++++++++++++++++++++++++++++- 2 files changed, 267 insertions(+), 1 deletion(-) diff --git a/test/dtl_reduction.cpp b/test/dtl_reduction.cpp index 
031093d..be2252e 100644 --- a/test/dtl_reduction.cpp +++ b/test/dtl_reduction.cpp @@ -453,6 +453,139 @@ TEST_F(DTLReductionTest, CompressionWithDerivedRatio) }); } +TEST_F(DTLReductionTest, DecimationStagingEngine) +{ + DO_TEST_WITH_FORK([this]() { + // Build a two-host platform with a network link (required for staging transport) + auto* zone = sg4::Engine::get_instance()->get_netzone_root()->add_netzone_star("zone"); + auto* pub_host = zone->add_host("pub_host", "6Gf"); + auto* sub_host = zone->add_host("sub_host", "6Gf"); + auto* backbone = zone->add_link("backbone", "10Gbps")->set_latency("10us"); + auto* link_pub = zone->add_link("link_pub", "10Gbps")->set_latency("10us"); + auto* link_sub = zone->add_link("link_sub", "10Gbps")->set_latency("10us"); + zone->add_route(pub_host, nullptr, std::vector{link_pub, backbone}); + zone->add_route(sub_host, nullptr, std::vector{link_sub, backbone}); + zone->seal(); + dtlmod::DTL::create(); + + pub_host->add_actor("Publisher", []() { + auto dtl = dtlmod::DTL::connect(); + auto stream = dtl->add_stream("my-output"); + stream->set_engine_type(dtlmod::Engine::Type::Staging); + stream->set_transport_method(dtlmod::Transport::Method::MQ); + XBT_INFO("Create a 2D variable with 10kx10k doubles"); + auto var = stream->define_variable("var", {10000, 10000}, {0, 0}, {10000, 10000}, sizeof(double)); + auto decimator = stream->define_reduction_method("decimation"); + auto engine = stream->open("my-output", dtlmod::Stream::Mode::Publish); + sg4::this_actor::sleep_for(0.5); + + XBT_INFO("Assign decimation with stride 2,2"); + ASSERT_NO_THROW(var->set_reduction_operation(decimator, {{"stride", "2,2"}})); + ASSERT_TRUE(var->is_reduced()); + ASSERT_TRUE(var->is_reduced_by_publisher()); + + XBT_INFO("Verify reduced shape: 5000x5000"); + auto shape = decimator->get_reduced_variable_shape(*var); + ASSERT_EQ(shape[0], 5000u); + ASSERT_EQ(shape[1], 5000u); + + engine->begin_transaction(); + ASSERT_NO_THROW(engine->put(var)); + 
engine->end_transaction(); + sg4::this_actor::sleep_for(1); + engine->close(); + dtlmod::DTL::disconnect(); + }); + + sub_host->add_actor("Subscriber", []() { + auto dtl = dtlmod::DTL::connect(); + auto stream = dtl->add_stream("my-output"); + auto engine = stream->open("my-output", dtlmod::Stream::Mode::Subscribe); + auto var = stream->inquire_variable("var"); + + XBT_INFO("Get the decimated variable"); + engine->begin_transaction(); + ASSERT_NO_THROW(engine->get(var)); + engine->end_transaction(); + + engine->close(); + dtlmod::DTL::disconnect(); + }); + + ASSERT_NO_THROW(sg4::Engine::get_instance()->run()); + }); +} + +TEST_F(DTLReductionTest, CompressionStagingEngine) +{ + DO_TEST_WITH_FORK([this]() { + // Build a two-host platform with a network link (required for staging transport) + auto* zone = sg4::Engine::get_instance()->get_netzone_root()->add_netzone_star("zone"); + auto* pub_host = zone->add_host("pub_host", "6Gf"); + auto* sub_host = zone->add_host("sub_host", "6Gf"); + auto* backbone = zone->add_link("backbone", "10Gbps")->set_latency("10us"); + auto* link_pub = zone->add_link("link_pub", "10Gbps")->set_latency("10us"); + auto* link_sub = zone->add_link("link_sub", "10Gbps")->set_latency("10us"); + zone->add_route(pub_host, nullptr, std::vector{link_pub, backbone}); + zone->add_route(sub_host, nullptr, std::vector{link_sub, backbone}); + zone->seal(); + dtlmod::DTL::create(); + + pub_host->add_actor("Publisher", []() { + auto dtl = dtlmod::DTL::connect(); + auto stream = dtl->add_stream("my-output"); + stream->set_engine_type(dtlmod::Engine::Type::Staging); + stream->set_transport_method(dtlmod::Transport::Method::MQ); + XBT_INFO("Create a 2D variable with 10kx10k doubles"); + auto var = stream->define_variable("var", {10000, 10000}, {0, 0}, {10000, 10000}, sizeof(double)); + auto compressor = stream->define_reduction_method("compression"); + auto engine = stream->open("my-output", dtlmod::Stream::Mode::Publish); + sg4::this_actor::sleep_for(0.5); + 
+ XBT_INFO("Assign compression with ratio 5 and explicit costs"); + ASSERT_NO_THROW(var->set_reduction_operation(compressor, {{"compression_ratio", "5"}, + {"compression_cost_per_element", "3"}, + {"decompression_cost_per_element", "1"}})); + ASSERT_TRUE(var->is_reduced()); + ASSERT_TRUE(var->is_reduced_by_publisher()); + + XBT_INFO("Verify compressed sizes"); + size_t expected_reduced = static_cast(std::ceil(sizeof(double) * 10000.0 * 10000.0 / 5.0)); + ASSERT_EQ(compressor->get_reduced_variable_global_size(*var), expected_reduced); + ASSERT_EQ(compressor->get_reduced_variable_local_size(*var), expected_reduced); + + XBT_INFO("Verify shape is unchanged (compression preserves shape)"); + auto shape = compressor->get_reduced_variable_shape(*var); + ASSERT_EQ(shape[0], 10000u); + ASSERT_EQ(shape[1], 10000u); + + engine->begin_transaction(); + ASSERT_NO_THROW(engine->put(var)); + engine->end_transaction(); + sg4::this_actor::sleep_for(1); + engine->close(); + dtlmod::DTL::disconnect(); + }); + + sub_host->add_actor("Subscriber", []() { + auto dtl = dtlmod::DTL::connect(); + auto stream = dtl->add_stream("my-output"); + auto engine = stream->open("my-output", dtlmod::Stream::Mode::Subscribe); + auto var = stream->inquire_variable("var"); + + XBT_INFO("Get the compressed variable (decompression cost should be applied on subscriber)"); + engine->begin_transaction(); + ASSERT_NO_THROW(engine->get(var)); + engine->end_transaction(); + + engine->close(); + dtlmod::DTL::disconnect(); + }); + + ASSERT_NO_THROW(sg4::Engine::get_instance()->run()); + }); +} + TEST_F(DTLReductionTest, DoubleReductionForbidden) { DO_TEST_WITH_FORK([this]() { diff --git a/test/python/dtl_reduction.py b/test/python/dtl_reduction.py index 4dc8c5c..4958a83 100644 --- a/test/python/dtl_reduction.py +++ b/test/python/dtl_reduction.py @@ -8,7 +8,7 @@ import sys import multiprocessing -from simgrid import Engine, this_actor +from simgrid import Engine, Host, this_actor from fsmod import FileSystem, 
OneDiskStorage from dtlmod import (DTL, Engine as DTLEngine, Stream, Transport, UnknownReductionMethodException, @@ -429,6 +429,137 @@ def test_actor(): e.run() +def setup_staging_platform(): + """Set up a two-host platform with network links for staging engine tests.""" + e = Engine(sys.argv) + e.set_log_control("no_loc") + e.set_log_control("root.thresh:critical") + + zone = e.netzone_root.add_netzone_star("zone") + pub_host = zone.add_host("pub_host", "6Gf") + sub_host = zone.add_host("sub_host", "6Gf") + backbone = zone.add_link("backbone", "10Gbps").set_latency("10us") + link_pub = zone.add_link("link_pub", "10Gbps").set_latency("10us") + link_sub = zone.add_link("link_sub", "10Gbps").set_latency("10us") + zone.add_route(pub_host, None, [link_pub, backbone]) + zone.add_route(sub_host, None, [link_sub, backbone]) + zone.seal() + + DTL.create() + return e + + +def run_test_decimation_staging_engine(): + e = setup_staging_platform() + pub_host = Host.by_name("pub_host") + sub_host = Host.by_name("sub_host") + + def publisher(): + dtl = DTL.connect() + stream = dtl.add_stream("my-output") + stream.set_engine_type(DTLEngine.Type.Staging) + stream.set_transport_method(Transport.Method.MQ) + this_actor.info("Create a 2D variable with 10kx10k doubles") + var = stream.define_variable("var", (10000, 10000), (0, 0), (10000, 10000), ctypes.sizeof(ctypes.c_double)) + decimator = stream.define_reduction_method("decimation") + engine = stream.open("my-output", Stream.Mode.Publish) + this_actor.sleep_for(0.5) + + this_actor.info("Assign decimation with stride 2,2") + var.set_reduction_operation(decimator, {"stride": "2,2"}) + assert var.is_reduced + assert var.is_reduced_by_publisher + + this_actor.info("Verify reduced shape: 5000x5000") + shape = decimator.get_reduced_variable_shape(var) + assert shape[0] == 5000 + assert shape[1] == 5000 + + engine.begin_transaction() + engine.put(var) + engine.end_transaction() + this_actor.sleep_for(1) + engine.close() + DTL.disconnect() 
+ + def subscriber(): + dtl = DTL.connect() + stream = dtl.add_stream("my-output") + engine = stream.open("my-output", Stream.Mode.Subscribe) + var = stream.inquire_variable("var") + + this_actor.info("Get the decimated variable") + engine.begin_transaction() + engine.get(var) + engine.end_transaction() + + engine.close() + DTL.disconnect() + + pub_host.add_actor("Publisher", publisher) + sub_host.add_actor("Subscriber", subscriber) + e.run() + + +def run_test_compression_staging_engine(): + e = setup_staging_platform() + pub_host = Host.by_name("pub_host") + sub_host = Host.by_name("sub_host") + + def publisher(): + dtl = DTL.connect() + stream = dtl.add_stream("my-output") + stream.set_engine_type(DTLEngine.Type.Staging) + stream.set_transport_method(Transport.Method.MQ) + this_actor.info("Create a 2D variable with 10kx10k doubles") + var = stream.define_variable("var", (10000, 10000), (0, 0), (10000, 10000), ctypes.sizeof(ctypes.c_double)) + compressor = stream.define_reduction_method("compression") + engine = stream.open("my-output", Stream.Mode.Publish) + this_actor.sleep_for(0.5) + + this_actor.info("Assign compression with ratio 5 and explicit costs") + var.set_reduction_operation(compressor, {"compression_ratio": "5", + "compression_cost_per_element": "3", + "decompression_cost_per_element": "1"}) + assert var.is_reduced + assert var.is_reduced_by_publisher + + this_actor.info("Verify compressed sizes") + expected_reduced = math.ceil(ctypes.sizeof(ctypes.c_double) * 10000.0 * 10000.0 / 5.0) + assert compressor.get_reduced_variable_global_size(var) == expected_reduced + assert compressor.get_reduced_variable_local_size(var) == expected_reduced + + this_actor.info("Verify shape is unchanged") + shape = compressor.get_reduced_variable_shape(var) + assert shape[0] == 10000 + assert shape[1] == 10000 + + engine.begin_transaction() + engine.put(var) + engine.end_transaction() + this_actor.sleep_for(1) + engine.close() + DTL.disconnect() + + def subscriber(): + 
dtl = DTL.connect() + stream = dtl.add_stream("my-output") + engine = stream.open("my-output", Stream.Mode.Subscribe) + var = stream.inquire_variable("var") + + this_actor.info("Get the compressed variable (decompression cost should be applied)") + engine.begin_transaction() + engine.get(var) + engine.end_transaction() + + engine.close() + DTL.disconnect() + + pub_host.add_actor("Publisher", publisher) + sub_host.add_actor("Subscriber", subscriber) + e.run() + + if __name__ == '__main__': tests = [ run_test_bogus_decimation_setting, @@ -438,6 +569,8 @@ def test_actor(): run_test_simple_compression_file_engine, run_test_compression_with_derived_ratio, run_test_double_reduction_forbidden, + run_test_decimation_staging_engine, + run_test_compression_staging_engine, ] for test in tests: From b77133763092c8c8fe374c30a8a4b26cc8e90fe9 Mon Sep 17 00:00:00 2001 From: Fred Suter Date: Sat, 14 Feb 2026 11:40:59 -0500 Subject: [PATCH 39/92] document reduction features --- doc/source/Compression.rst | 109 +++++++++++++++++++++++++++++++++++++ doc/source/Decimation.rst | 85 +++++++++++++++++++++++++++++ doc/source/Reduction.rst | 67 +++++++++++++++++++++++ doc/source/app_API.rst | 23 ++++++++ doc/source/index.rst | 5 +- 5 files changed, 288 insertions(+), 1 deletion(-) create mode 100644 doc/source/Compression.rst create mode 100644 doc/source/Decimation.rst create mode 100644 doc/source/Reduction.rst diff --git a/doc/source/Compression.rst b/doc/source/Compression.rst new file mode 100644 index 0000000..f677e04 --- /dev/null +++ b/doc/source/Compression.rst @@ -0,0 +1,109 @@ +.. Copyright 2025-2026 + +.. _Compression: + +Compression +########### + +Lossy compression is a widely used data reduction technique in scientific computing. By accepting a controlled loss +of precision, compressors such as SZ and ZFP can achieve significant reductions in data volume while preserving the +features that matter for downstream analysis. 
DTLMod models the performance impact of compression on in situ +workflows without actually compressing any data: it simulates the computational cost of compression and +decompression and adjusts the volume of data transported through the DTL according to a compression ratio. + +How compression works in DTLMod +------------------------------- + +Unlike decimation, compression does not change the **shape** of a variable. A :math:`1000 \times 1000` array remains +a :math:`1000 \times 1000` array after compression. What changes is the **byte-size** of the variable: the number +of bytes transported through the DTL is divided by the compression ratio. This reflects the fact that real-world +lossy compressors produce a bitstream that is smaller than the original data but still represents all the elements +of the array. + +Compression is a **publisher-side only** operation. Applying compression on the subscriber side is not meaningful +because compression aims to reduce the volume of data that needs to be transported---which requires intervention +before the data leaves the publisher. + +Compressor profiles +------------------- + +DTLMod provides three ways to determine the compression ratio for a variable: + +**Fixed ratio.** The simplest option: you directly specify the desired compression ratio. This is useful when you +already know, from experiments or from the literature, the compression ratio achieved by a particular compressor +on data similar to yours. The ratio must be at least 1.0 (a ratio of 1 means no size reduction). + +**SZ profile.** This profile is inspired by the `SZ lossy compressor <https://szcompressor.org/>`_, a +prediction-based algorithm. SZ achieves high compression ratios on smooth scientific data because it can accurately +predict neighboring values and only store the (small) prediction errors. The compression ratio is derived from two +user-specified parameters: + +- **accuracy** (or error bound): the maximum acceptable pointwise error.
Tighter accuracy requirements reduce the + compression ratio because more bits are needed to represent the prediction residuals. + +- **data smoothness**: a value between 0 and 1 that characterizes how regular the data is. Smooth data + (e.g., temperature fields) yields higher compression ratios because predictions are more accurate. Noisy or + turbulent data yields lower ratios. + +The model computes the ratio as: + +.. math:: + + r = \max\!\Big(1,\;\alpha \cdot \left(-\log_{10} \varepsilon\right)^{\beta} \cdot (0.5 + \sigma)\Big) + +where :math:`\varepsilon` is the accuracy, :math:`\sigma` is the data smoothness, and :math:`\alpha = 3.0`, +:math:`\beta = 0.8` are empirical parameters fitted from published benchmarks on scientific datasets. + +**ZFP profile.** This profile is inspired by the `ZFP compressor <https://computing.llnl.gov/projects/zfp>`_, +a transform-based algorithm. ZFP organizes data into small blocks, applies a near-orthogonal transform, and +encodes the resulting coefficients with a fixed number of bits per value. The compression ratio depends primarily +on the requested accuracy: + +.. math:: + + \text{rate} = \max(1,\;-\log_2 \varepsilon + 1) \quad;\quad r = \frac{64}{\text{rate}} + +where the rate represents the number of bits per double-precision value after compression. Higher accuracy +requirements increase the rate and therefore decrease the compression ratio. + +Compression and decompression costs +------------------------------------ + +Two independent cost parameters control the simulated computational overhead of compression: + +- **compression cost per element**: the number of floating-point operations incurred per array element when + compressing the data on the publisher side. + +- **decompression cost per element**: the number of floating-point operations incurred per array element when + decompressing the data on the subscriber side, after it has been received. + +Both parameters default to 1.0. The compression and decompression costs for a variable are computed as: + +..
math:: + + C_{\text{compress}} = c_{\text{comp}} \times \frac{N_{\text{local}}}{\text{element\_size}} + +.. math:: + + C_{\text{decompress}} = c_{\text{decomp}} \times \frac{N_{\text{local}}}{\text{element\_size}} + +where :math:`N_{\text{local}}` is the local size of the variable in bytes and :math:`\text{element\_size}` is the +size of one array element. The compression cost is incurred by the publisher right before putting the variable into +the DTL, and the decompression cost is incurred by the subscriber right after receiving it. + +Per-transaction variability +--------------------------- + +In practice, the compression ratio achieved on a given variable varies from one time step to the next as the data +evolves. DTLMod can model this variability through an optional **ratio variability** parameter that introduces a +bounded, deterministic perturbation around the nominal compression ratio at each transaction. This enables the +simulation of realistic scenarios in which the effectiveness of compression fluctuates over the course of a run. + +Re-parameterization +------------------- + +As with decimation, compression parameters can be updated between transactions. You can change the compression +ratio, switch compressor profiles, adjust accuracy or smoothness, or modify the cost parameters for a variable that +is already being compressed. Only the parameters that are explicitly provided in the update are modified; the +others retain their previous values. This supports the simulation of adaptive compression strategies that adjust +their settings in response to changes in the data. diff --git a/doc/source/Decimation.rst b/doc/source/Decimation.rst new file mode 100644 index 0000000..e4d695f --- /dev/null +++ b/doc/source/Decimation.rst @@ -0,0 +1,85 @@ +.. Copyright 2025-2026 + +.. 
_Decimation: + +Decimation +########## + +Decimation is a spatial subsampling technique that reduces the size of a multidimensional array by keeping only every +*n*-th element along each dimension. It is the method of choice when a workflow component does not need the full +resolution of the data produced upstream---a common situation in visualization or coarse-grained analysis pipelines. + +How decimation works +-------------------- + +A decimation operation is controlled by a **stride vector** that specifies, for each dimension of a +:ref:`Concept_Variable`, the step, in number of elements, between two consecutive retained samples. For a variable of shape +:math:`(D_1, D_2, \ldots, D_k)` and a stride :math:`(s_1, s_2, \ldots, s_k)`, the shape of the reduced variable +becomes :math:`(\lceil D_1/s_1 \rceil, \lceil D_2/s_2 \rceil, \ldots, \lceil D_k/s_k \rceil)`. + +For instance, applying a stride of :math:`(1, 2, 4)` to a :math:`640 \times 640 \times 640` variable produces a +:math:`640 \times 320 \times 160` reduced variable---an 8x reduction in data volume. + +The stride vector must have the same number of dimensions as the variable it applies to and all stride values must +be strictly positive. A stride of 1 along a given dimension means no subsampling in that dimension. + +Decimation is applied **per variable**: within the same :ref:`Concept_Stream`, different variables can be decimated +with different strides, or not be decimated at all. + +Publisher-side and subscriber-side decimation +--------------------------------------------- + +Unlike compression, decimation can be applied on **both sides** of the data flow: + +- When applied by a **publisher**, decimation reduces the volume of data that leaves the publisher. The simulated + cost of the decimation kernel is incurred before the data is transported. Only the decimated version of the variable + is put into the DTL, which directly reduces I/O or network costs.
+ +- When applied by a **subscriber**, decimation reduces the volume of data that the subscriber has to process after + receiving it. The subscriber first retrieves the full variable and then applies decimation locally. This can be + useful when the subscriber only needs a coarse view of the data, but the full-resolution version must still be + transported because other subscribers or a checkpoint mechanism may need it. + +Interpolation +------------- + +In some workflows, the subsampled data must be smoothed or reconstructed to better approximate the original field. +DTLMod models this by allowing an optional **interpolation method** to be specified alongside the stride. The +supported interpolation methods are: + +- **linear**: suitable for piecewise-linear fields (variables with at least 1 dimension). +- **quadratic**: suitable for smoother fields (variables with at least 2 dimensions). +- **cubic**: suitable for highly smooth fields (variables with at least 3 dimensions). + +The choice of interpolation method does not affect the size of the reduced variable: it only affects the +**computational cost** of the decimation operation. Higher-order interpolation is more expensive: the cost +multiplier is 2x for linear, 4x for quadratic, and 8x for cubic interpolation relative to simple subsampling +without interpolation. This allows you to study the tradeoff between the quality of the reconstructed data and the +computational overhead introduced by the interpolation step. + +Computational cost model +------------------------ + +The simulated cost of a decimation operation, in floating-point operations, is determined by: + +.. math:: + + C = m \times c \times N + +where :math:`N` is the number of elements in the **local** (non-decimated) portion of the variable, +:math:`c` is a configurable **cost per element** (defaulting to 1.0), and :math:`m` is the interpolation +multiplier (1 for no interpolation, 2 for linear, 4 for quadratic, 8 for cubic). 
+ +The cost per element can be adjusted to match the observed or estimated computational cost of a specific decimation +implementation in the real-world application being simulated. + +Re-parameterization +------------------- + +A decimation operation can be re-parameterized between transactions. For instance, you can change the stride, the +interpolation method, or the cost per element of a variable that has already been decimated. This enables the +simulation of adaptive workflows in which the level of subsampling changes over time in response to features detected +in the data. + +When re-parameterizing, only the parameters that are explicitly provided are updated; the others retain their +previous values. diff --git a/doc/source/Reduction.rst b/doc/source/Reduction.rst new file mode 100644 index 0000000..3736fc7 --- /dev/null +++ b/doc/source/Reduction.rst @@ -0,0 +1,67 @@ +.. Copyright 2025-2026 + +.. _Reduction: + +Data Reduction Operations +========================= + +Scientific simulations and in situ processing workflows produce ever-increasing volumes of data. Even with fast +networks and storage systems, the sheer amount of data transported through the DTL can become a bottleneck. +**Data reduction** techniques alleviate this pressure by decreasing the volume of data that must be moved between +publishers and subscribers, at the cost of some additional computation and, depending on the method, a controlled +loss of information. + +DTLMod allows you to study the impact of data reduction on the performance of in situ workflows by attaching a +**reduction method** to a :ref:`Concept_Stream` and then applying it, with specific parameters, to individual +:ref:`Concept_Variable` objects. When a publisher puts a reduced variable into the DTL, the simulation accounts for +the computational cost of the reduction operation and transports a smaller volume of data. 
On the subscriber side, +the simulation may account for a corresponding decompression or reconstruction cost when retrieving the variable. + +DTLMod currently exposes two families of reduction methods: + +**Decimation** selectively retains a subset of elements from a multidimensional array by applying a per-dimension +stride. The result is a smaller array whose shape reflects the subsampling factor in each dimension. This approach +is common in visualization pipelines where only every *n*-th data point is needed. Since decimation preserves the +original values of the retained elements, it is by nature a lossless operation on the selected subset. Optionally, +an interpolation step can be used to reconstruct missing values. More details are given in the +:ref:`Decimation` section. + +**Compression** reduces the byte-size of a variable without altering its shape. The compressed variable retains the +same number of elements but each element occupies fewer bytes, according to a **compression ratio** that can be +specified directly or derived from a compressor model. DTLMod provides built-in models inspired by the SZ and ZFP +lossy compressors to derive realistic compression ratios from data characteristics. More details are given in the +:ref:`Compression` section. + +Where and when reduction is applied +------------------------------------ + +Reduction methods can be applied on either side of the data flow: + +- **Publisher-side reduction** is the most common scenario. The publisher compresses or decimates data before putting + it into the DTL, reducing the volume of data that has to be transported and stored. Both decimation and compression + support this mode. + +- **Subscriber-side reduction** is only available for decimation. A subscriber can choose to retrieve a subsampled + version of a variable it receives from the DTL, reducing the volume of data it has to process. 
Compression on the + subscriber side is not supported because its purpose is precisely to reduce what has to be transported, which + requires intervention before the data leaves the publisher. + +When a publisher applies a reduction, the information is propagated to subscribers: any variable obtained through +``inquire_variable`` on the subscriber side carries the reduction state set by the publisher. This allows DTLMod to +prevent conflicting reduction operations. In particular, a subscriber cannot apply a second reduction to a variable +that was already reduced by its publisher. + +Simulated costs +--------------- + +A reduction operation in DTLMod introduces two potential costs: + +1. A **reduction cost** (in simulated floating-point operations) is incurred by the actor that applies the reduction, + right before it puts or gets the variable. This cost models the computational overhead of running a compressor or + a decimation kernel. + +2. A **decompression cost** (for compression only) is incurred on the subscriber side after it receives compressed + data. This cost models the time needed to decompress the data before it can be used by the analysis component. + +These costs are fully configurable through the parameters of each reduction method, enabling you to explore tradeoffs +between data movement savings and computational overhead for different reduction strategies. diff --git a/doc/source/app_API.rst b/doc/source/app_API.rst index 0191014..45b3bbd 100644 --- a/doc/source/app_API.rst +++ b/doc/source/app_API.rst @@ -130,6 +130,29 @@ selecting the :ref:`Concept_Transport` **method** of the Stream to either ``Tran :ref:`Inside_staging_engine` section of the documentation. +.. |Concept_Reduction| replace:: **Reduction** +.. _Concept_Reduction: + +Data Reduction +^^^^^^^^^^^^^^ + +In situ workflows that produce large volumes of data can benefit from **data reduction** to decrease the amount of +data transported through the DTL. 
DTLMod exposes reduction as an optional operation that can be applied to individual +|Concept_Variable| objects within a |Concept_Stream|_. + +A reduction method is first created on a |Concept_Stream|_ by specifying its type (``"decimation"`` or +``"compression"``). It is then applied to a |Concept_Variable|_ with a set of parameters that control the reduction +behavior---for instance, a stride vector for decimation or a compression ratio and compressor profile for compression. + +When a publisher puts a reduced variable into the DTL, the simulation accounts for the computational overhead of the +reduction and transports only the reduced volume. On the subscriber side, a decompression cost may be incurred when +the data is retrieved. The reduction state of a variable is automatically propagated to subscribers: when a subscriber +inquires a variable that has been reduced by its publisher, this information is preserved and prevents conflicting +double reductions. + +A complete description of the reduction mechanisms, their parameters, and their internal cost models is given in the +:ref:`Reduction` section. + .. |Concept_Variable| replace:: **Variable** .. _Concept_Variable: diff --git a/doc/source/index.rst b/doc/source/index.rst index ea9fcf0..771eb44 100755 --- a/doc/source/index.rst +++ b/doc/source/index.rst @@ -47,7 +47,10 @@ effects of resource allocation strategies. Engines    Inside the File engine    Inside the Staging engine - + Data Reduction Operations +    Decimation +    Compression + .. Cheat Sheet on the sublevels .. .. 
# with overline, for parts From 0c199c3ee58fc184a6f55bf526f78b6144d8ddfd Mon Sep 17 00:00:00 2001 From: Fred Suter Date: Sat, 14 Feb 2026 11:41:19 -0500 Subject: [PATCH 40/92] prepare for release --- ChangeLog | 52 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) diff --git a/ChangeLog b/ChangeLog index 2b65354..9b980c0 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,5 +1,57 @@ ---------------------------------------------------------------------------- +DTLMod (0.4) February 16, 2026 + +Major improvements: + - Data Reduction framework + - New abstract ReductionMethod interface for extensible data reduction strategies + - Decimation method: spatial subsampling with per-dimension stride, optional + interpolation (linear, quadratic, cubic), configurable computational cost, + and support for both publisher-side and subscriber-side application + - Compression method: size reduction with preserved shape, supporting three + compressor profiles (fixed ratio, SZ-inspired, and ZFP-inspired models), + separate compression and decompression costs, and per-transaction ratio + variability + - Publisher-side reduction state is propagated to subscribers through + inquire_variable, enabling detection and prevention of conflicting + double reductions + - Reduction operations work with both File and Staging engines + - New documentation pages for the reduction feature (Reduction, Decimation, + and Compression) and updated Main Concepts page + - Improved test coverage + - Comprehensive C++ and Python tests for both reduction methods + - Coverage of error handling, parameter validation, re-parameterization, + and publisher-subscriber workflows for each reduction method + - New test for subscriber-first arrival pattern in File engine + - Code quality and CI improvements + - Improved coverage reporting with SonarQube and CodeFactor integration + - Added subscriber synchronization barrier in File engine + - Removed unnecessary defensive guards and race 
condition checks that + cannot occur under SimGrid's maestro orchestration + +API Changes: + - New Stream method: + - Stream::define_reduction_method(name) creates a named reduction method + ("decimation" or "compression") for the stream + - New Variable methods and properties: + - Variable::set_reduction_operation(method, parameters) applies a reduction + with key-value parameters to a variable + - Variable::is_reduced(), is_reduced_by_publisher(), is_reduced_by_subscriber() + query the reduction state + - Variable::get_reduction_method() retrieves the applied ReductionMethod + - New ReductionMethod class with query methods: + - get_reduced_variable_global_size(), get_reduced_variable_local_size() + - get_reduced_variable_shape() + - get_flop_amount_to_reduce_variable(), get_flop_amount_to_decompress_variable() + - Full Python bindings for all reduction operations, including nine new + exception types for parameter validation errors + - Engine::put() now automatically simulates reduction cost and transports the + reduced data size when the variable is reduced + - Engine::get() now automatically simulates decompression cost after receiving + compressed data + +---------------------------------------------------------------------------- + DTLMod (0.3) January 19, 2026 Major improvements: From 560c4443a4846e4a9da90f76911c65a2efbbb41b Mon Sep 17 00:00:00 2001 From: Fred Suter Date: Sat, 14 Feb 2026 12:00:49 -0500 Subject: [PATCH 41/92] remove brainstorm --- .gitignore | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 0436400..1f5af00 100644 --- a/.gitignore +++ b/.gitignore @@ -48,4 +48,4 @@ dtlmod.egg-info/ doc/source/_ext/__pycache__/ .sonarlint/ -.vscode/ \ No newline at end of file +.vscode/Brainstorm_and_TODOs.md From a872cb99bd35baf9edcc87030bc3960bc1440b22 Mon Sep 17 00:00:00 2001 From: Fred Suter Date: Sat, 14 Feb 2026 12:01:03 -0500 Subject: [PATCH 42/92] update version number --- CMakeLists.txt | 6 +++--- 
pyproject.toml | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index b652c33..213e705 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -7,7 +7,7 @@ if(POLICY CMP0167) cmake_policy(SET CMP0167 NEW) endif() -project(dtlmod VERSION 0.2 DESCRIPTION "Data Transport Layer Module") +project(dtlmod VERSION 0.4 DESCRIPTION "Data Transport Layer Module") include(GNUInstallDirs) find_package(Boost 1.48) @@ -66,9 +66,9 @@ endif() # build the version number set(DTLMOD_VERSION_MAJOR "0") -set(DTLMOD_VERSION_MINOR "2") +set(DTLMOD_VERSION_MINOR "4") set(DTLMOD_VERSION_PATCH "0") -set(DTLMOD_VERSION_EXTRA "dev") +set(DTLMOD_VERSION_EXTRA "") ## GIT version check #################### diff --git a/pyproject.toml b/pyproject.toml index 8bb2bf6..418e16e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "dtlmod" -version = "0.2" +version = "0.4" description = "A versatile simulated data transport layer SimGrid module" authors = [ { name = "The SWAT Team", email = "simgrid-community@inria.fr" } From c2c69093768914cfb1d04f3de48bd8371932b11a Mon Sep 17 00:00:00 2001 From: Fred Suter Date: Sat, 14 Feb 2026 12:25:35 -0500 Subject: [PATCH 43/92] [Sonar] fix a first bunch of issues --- include/dtlmod/CompressionReductionMethod.hpp | 10 +++++--- include/dtlmod/DecimationReductionMethod.hpp | 9 ++++--- include/dtlmod/ReductionMethod.hpp | 3 ++- include/dtlmod/Variable.hpp | 3 ++- src/CompressionReductionMethod.cpp | 19 +++++++------- src/DecimationReductionMethod.cpp | 25 +++++++++++-------- src/Stream.cpp | 3 +-- src/Variable.cpp | 2 +- 8 files changed, 40 insertions(+), 34 deletions(-) diff --git a/include/dtlmod/CompressionReductionMethod.hpp b/include/dtlmod/CompressionReductionMethod.hpp index 68376b9..257e24b 100644 --- a/include/dtlmod/CompressionReductionMethod.hpp +++ b/include/dtlmod/CompressionReductionMethod.hpp @@ -48,7 +48,7 @@ class 
CompressionReductionMethod : public ReductionMethod { [[nodiscard]] double get_compression_ratio() const { return compression_ratio_; } void set_compression_ratio(double ratio) { compression_ratio_ = ratio; } [[nodiscard]] const std::string& get_compressor_profile() const { return compressor_profile_; } - void set_compressor_profile(const std::string& profile) { compressor_profile_ = profile; } + void set_compressor_profile(std::string_view profile) { compressor_profile_ = profile; } [[nodiscard]] double get_data_smoothness() const { return data_smoothness_; } void set_data_smoothness(double smoothness) { data_smoothness_ = smoothness; } [[nodiscard]] double get_ratio_variability() const { return ratio_variability_; } @@ -64,9 +64,11 @@ class CompressionReductionMethod : public ReductionMethod { static double derive_compression_ratio(double accuracy, const std::string& profile, double data_smoothness); public: - CompressionReductionMethod(const std::string& name) : ReductionMethod(name) {} - void parameterize_for_variable(const Variable& var, const std::map& parameters) override; - void reduce_variable(const Variable& /* var*/) override {} + using ReductionMethod::ReductionMethod; + void parameterize_for_variable(const Variable& var, + const std::map>& parameters) override; + void reduce_variable(const Variable& /* var*/) override + { /* Variable metadata are not modified when using compression */ } [[nodiscard]] size_t get_reduced_variable_global_size(const Variable& var) const override; [[nodiscard]] size_t get_reduced_variable_local_size(const Variable& var) const override; diff --git a/include/dtlmod/DecimationReductionMethod.hpp b/include/dtlmod/DecimationReductionMethod.hpp index e2f6f52..b8f515a 100644 --- a/include/dtlmod/DecimationReductionMethod.hpp +++ b/include/dtlmod/DecimationReductionMethod.hpp @@ -37,13 +37,13 @@ class DecimationReductionMethod : public ReductionMethod { void set_reduced_local_start_and_count(sg4::ActorPtr actor, const std::vector&
reduced_local_start, const std::vector& reduced_local_count) { - reduced_local_start_and_count_.try_emplace(actor, std::make_pair(reduced_local_start, reduced_local_count)); + reduced_local_start_and_count_.try_emplace(actor, reduced_local_start, reduced_local_count); } [[nodiscard]] const std::vector& get_stride() const { return stride_; } void set_stride(const std::vector& stride) { stride_ = stride; } [[nodiscard]] const std::string& get_interpolation_method() const { return interpolation_method_; } - void set_interpolation_method(const std::string& method) { interpolation_method_ = method; } + void set_interpolation_method(std::string_view method) { interpolation_method_ = method; } [[nodiscard]] double get_cost_per_element() const { return cost_per_element_; } void set_cost_per_element(double cost) { cost_per_element_ = cost; } @@ -59,7 +59,8 @@ class DecimationReductionMethod : public ReductionMethod { std::map> per_variable_parameterizations_; protected: - void parameterize_for_variable(const Variable& var, const std::map& parameters) override; + void parameterize_for_variable(const Variable& var, + const std::map>& parameters) override; void reduce_variable(const Variable& var) override; @@ -90,7 +91,7 @@ class DecimationReductionMethod : public ReductionMethod { } public: - DecimationReductionMethod(const std::string& name) : ReductionMethod(name) {} + using ReductionMethod::ReductionMethod; }; ///\endcond } // namespace dtlmod diff --git a/include/dtlmod/ReductionMethod.hpp b/include/dtlmod/ReductionMethod.hpp index 480adb6..feaf233 100644 --- a/include/dtlmod/ReductionMethod.hpp +++ b/include/dtlmod/ReductionMethod.hpp @@ -28,7 +28,8 @@ class ReductionMethod { ReductionMethod(const std::string& name) : name_(name) {} virtual ~ReductionMethod() = default; - virtual void parameterize_for_variable(const Variable& var, const std::map& parameters) = 0; + virtual void parameterize_for_variable(const Variable& var, + const std::map>& parameters) = 0; virtual 
void reduce_variable(const Variable& var) = 0; virtual size_t get_reduced_variable_global_size(const Variable& var) const = 0; virtual size_t get_reduced_variable_local_size(const Variable& var) const = 0; diff --git a/include/dtlmod/Variable.hpp b/include/dtlmod/Variable.hpp index 7c36612..94f018e 100644 --- a/include/dtlmod/Variable.hpp +++ b/include/dtlmod/Variable.hpp @@ -126,7 +126,8 @@ class Variable : public std::enable_shared_from_this { /// @brief Assign a parameterized reduction method to the Variable. /// @param method a ReductionMethod (already defined). /// @param paramaters specific parameters in key-value form to apply the reduction method to the Variable. - void set_reduction_operation(std::shared_ptr method, std::map parameters); + void set_reduction_operation(std::shared_ptr method, + std::map> parameters); [[nodiscard]] bool is_reduced() const { return is_reduced_with_ != nullptr; } diff --git a/src/CompressionReductionMethod.cpp b/src/CompressionReductionMethod.cpp index a87b455..dd75aac 100644 --- a/src/CompressionReductionMethod.cpp +++ b/src/CompressionReductionMethod.cpp @@ -27,26 +27,26 @@ double CompressionReductionMethod::ParameterizedCompression::get_effective_ratio size_t CompressionReductionMethod::get_reduced_variable_global_size(const Variable& var) const { auto ratio = per_variable_parameterizations_.at(&var)->get_compression_ratio(); - return static_cast(std::ceil(var.get_global_size() / ratio)); + return static_cast(std::ceil(static_cast(var.get_global_size()) / ratio)); } size_t CompressionReductionMethod::get_reduced_variable_local_size(const Variable& var) const { auto ratio = per_variable_parameterizations_.at(&var)->get_compression_ratio(); - return static_cast(std::ceil(var.get_local_size() / ratio)); + return static_cast(std::ceil(static_cast(var.get_local_size()) / ratio)); } double CompressionReductionMethod::get_flop_amount_to_reduce_variable(const Variable& var) const { auto param = 
per_variable_parameterizations_.at(&var); - auto num_elements = var.get_local_size() / var.get_element_size(); + auto num_elements = static_cast(var.get_local_size() / var.get_element_size()); return param->get_compression_cost_per_element() * num_elements; } double CompressionReductionMethod::get_flop_amount_to_decompress_variable(const Variable& var) const { auto param = per_variable_parameterizations_.at(&var); - auto num_elements = var.get_local_size() / var.get_element_size(); + auto num_elements = static_cast(var.get_local_size() / var.get_element_size()); return param->get_decompression_cost_per_element() * num_elements; } @@ -69,8 +69,8 @@ double CompressionReductionMethod::derive_compression_ratio(double accuracy, con return 1.0; } -void CompressionReductionMethod::parameterize_for_variable(const Variable& var, - const std::map& parameters) +void CompressionReductionMethod::parameterize_for_variable( + const Variable& var, const std::map>& parameters) { double new_accuracy = 1e-3; double new_compression_cost_per_element = 1.0; @@ -119,7 +119,7 @@ void CompressionReductionMethod::parameterize_for_variable(const Variable& var, } else if (key == "ratio_variability") { new_ratio_variability = std::stod(value); } else { - throw UnknownCompressionOptionException(XBT_THROW_POINT, key.c_str()); + throw UnknownCompressionOptionException(XBT_THROW_POINT, key); } } @@ -129,9 +129,8 @@ void CompressionReductionMethod::parameterize_for_variable(const Variable& var, throw InconsistentCompressionRatioException( XBT_THROW_POINT, "Compressor profile 'fixed' requires an explicit 'compression_ratio' parameter."); new_compression_ratio = derive_compression_ratio(new_accuracy, new_compressor_profile, new_data_smoothness); - } else if (ratio_explicitly_set) { - if (new_compression_ratio < 1.0) - throw InconsistentCompressionRatioException(XBT_THROW_POINT, "Compression ratio must be >= 1.0"); + } else if (ratio_explicitly_set && new_compression_ratio < 1.0) { + throw 
InconsistentCompressionRatioException(XBT_THROW_POINT, "Compression ratio must be >= 1.0"); } XBT_DEBUG("Compression parameterization for Variable %s: profile=%s, accuracy=%.2e, ratio=%.2f, " diff --git a/src/DecimationReductionMethod.cpp b/src/DecimationReductionMethod.cpp index d82d555..690fc69 100644 --- a/src/DecimationReductionMethod.cpp +++ b/src/DecimationReductionMethod.cpp @@ -37,20 +37,21 @@ double DecimationReductionMethod::ParameterizedDecimation::get_flop_amount_to_de XBT_DEBUG("Compute decimation cost with: cost_per_element = %.2f and interpolation_method = %s", cost_per_element_, interpolation_method_.c_str()); double amount = cost_per_element_; + auto local_size = static_cast(var_->get_local_size()); if (interpolation_method_.empty()) { - amount *= var_->get_local_size(); + amount *= local_size; } else if (interpolation_method_ == "linear") { - amount = 2 * amount * var_->get_local_size(); + amount = 2 * amount * local_size; } else if (interpolation_method_ == "quadratic") { - amount = 4 * amount * var_->get_local_size(); + amount = 4 * amount * local_size; } else if (interpolation_method_ == "cubic") { - amount = 8 * amount * var_->get_local_size(); + amount = 8 * amount * local_size; } // Sanity check done when parameterizing the reduction method for this variable return amount; } -void DecimationReductionMethod::parameterize_for_variable(const Variable& var, - const std::map& parameters) +void DecimationReductionMethod::parameterize_for_variable( + const Variable& var, const std::map>& parameters) { std::vector new_stride; std::string new_interpolation_method; @@ -103,7 +104,7 @@ void DecimationReductionMethod::parameterize_for_variable(const Variable& var, } else if (key == "cost_per_element") new_cost_per_element = std::stod(value); else - throw UnknownDecimationOptionException(XBT_THROW_POINT, key.c_str()); + throw UnknownDecimationOptionException(XBT_THROW_POINT, key); } if (!exists) { @@ -135,7 +136,8 @@ void 
DecimationReductionMethod::reduce_variable(const Variable& var) std::vector reduced_shape; size_t i = 0; for (auto dim_size : original_shape) - reduced_shape.push_back(std::ceil(dim_size / (stride[i++] * 1.0))); + reduced_shape.push_back( + static_cast(std::ceil(static_cast(dim_size) / static_cast(stride[i++])))); parameterization->set_reduced_shape(reduced_shape); auto self = sg4::Actor::self(); @@ -145,9 +147,10 @@ void DecimationReductionMethod::reduce_variable(const Variable& var) for (size_t i = 0; i < original_shape.size(); i++) { // Sanity checks that shape, start, and count have the same size have already been done - size_t r_start = std::ceil(start[i] / (stride[i] * 1.0)); - size_t r_next_start = - std::min(original_shape[i], static_cast(std::ceil((start[i] + count[i]) / (stride[i] * 1.0)))); + size_t r_start = static_cast(std::ceil(static_cast(start[i]) / static_cast(stride[i]))); + size_t r_next_start = std::min( + original_shape[i], + static_cast(std::ceil(static_cast(start[i] + count[i]) / static_cast(stride[i])))); XBT_DEBUG("Dim %zu: stride = %zu, Start = %zu, r_start = %zu, Count = %zu, r_count = %zu", i, stride[i], start[i], r_start, count[i], r_next_start - r_start); reduced_start.push_back(r_start); diff --git a/src/Stream.cpp b/src/Stream.cpp index 80d4159..b436cd6 100644 --- a/src/Stream.cpp +++ b/src/Stream.cpp @@ -158,8 +158,7 @@ void Stream::export_metadata_to_file() const std::shared_ptr Stream::define_reduction_method(const std::string& name) { - auto it = reduction_methods_.find(name); - if (it != reduction_methods_.end()) + if (auto it = reduction_methods_.find(name); it != reduction_methods_.end()) return it->second; std::shared_ptr reduction_method; diff --git a/src/Variable.cpp b/src/Variable.cpp index 8dc0bfb..1339d5d 100644 --- a/src/Variable.cpp +++ b/src/Variable.cpp @@ -60,7 +60,7 @@ void Variable::set_transaction_selection(unsigned int begin, unsigned int count) } void Variable::set_reduction_operation(std::shared_ptr method, - 
std::map parameters) + std::map> parameters) { auto stream = defined_in_stream_.lock(); xbt_assert(stream, "Variable::set_reduction_operation called after its Stream has been destroyed"); From a570653f995a223d3aec7098ec80c0577258e85c Mon Sep 17 00:00:00 2001 From: Fred Suter Date: Sat, 14 Feb 2026 12:36:25 -0500 Subject: [PATCH 44/92] [Sonar] more issues fixed --- include/dtlmod/CompressionReductionMethod.hpp | 2 +- include/dtlmod/Variable.hpp | 2 +- src/CompressionReductionMethod.cpp | 2 +- src/DecimationReductionMethod.cpp | 10 ++++++---- src/Variable.cpp | 2 +- 5 files changed, 10 insertions(+), 8 deletions(-) diff --git a/include/dtlmod/CompressionReductionMethod.hpp b/include/dtlmod/CompressionReductionMethod.hpp index 257e24b..c37a4d9 100644 --- a/include/dtlmod/CompressionReductionMethod.hpp +++ b/include/dtlmod/CompressionReductionMethod.hpp @@ -61,7 +61,7 @@ class CompressionReductionMethod : public ReductionMethod { std::map> per_variable_parameterizations_; /// @brief Derive the compression ratio from accuracy and compressor profile. - static double derive_compression_ratio(double accuracy, const std::string& profile, double data_smoothness); + static double derive_compression_ratio(double accuracy, std::string_view profile, double data_smoothness); public: using ReductionMethod::ReductionMethod; diff --git a/include/dtlmod/Variable.hpp b/include/dtlmod/Variable.hpp index 94f018e..244fea1 100644 --- a/include/dtlmod/Variable.hpp +++ b/include/dtlmod/Variable.hpp @@ -127,7 +127,7 @@ class Variable : public std::enable_shared_from_this { /// @param method a ReductionMethod (already defined). /// @param paramaters specific parameters in key-value form to apply the reduction method to the Variable. 
void set_reduction_operation(std::shared_ptr method, - std::map> parameters); + const std::map>& parameters); [[nodiscard]] bool is_reduced() const { return is_reduced_with_ != nullptr; } diff --git a/src/CompressionReductionMethod.cpp b/src/CompressionReductionMethod.cpp index dd75aac..b706447 100644 --- a/src/CompressionReductionMethod.cpp +++ b/src/CompressionReductionMethod.cpp @@ -50,7 +50,7 @@ double CompressionReductionMethod::get_flop_amount_to_decompress_variable(const return param->get_decompression_cost_per_element() * num_elements; } -double CompressionReductionMethod::derive_compression_ratio(double accuracy, const std::string& profile, +double CompressionReductionMethod::derive_compression_ratio(double accuracy, std::string_view profile, double data_smoothness) { if (profile == "sz") { diff --git a/src/DecimationReductionMethod.cpp b/src/DecimationReductionMethod.cpp index 690fc69..54c4547 100644 --- a/src/DecimationReductionMethod.cpp +++ b/src/DecimationReductionMethod.cpp @@ -134,10 +134,12 @@ void DecimationReductionMethod::reduce_variable(const Variable& var) auto stride = parameterization->get_stride(); std::vector reduced_shape; - size_t i = 0; - for (auto dim_size : original_shape) + size_t idx = 0; + for (auto dim_size : original_shape) { reduced_shape.push_back( - static_cast(std::ceil(static_cast(dim_size) / static_cast(stride[i++])))); + static_cast(std::ceil(static_cast(dim_size) / static_cast(stride[idx])))); + idx++; + } parameterization->set_reduced_shape(reduced_shape); auto self = sg4::Actor::self(); @@ -147,7 +149,7 @@ void DecimationReductionMethod::reduce_variable(const Variable& var) for (size_t i = 0; i < original_shape.size(); i++) { // Sanity checks that shape, start, and count have the same size have already been done - size_t r_start = static_cast(std::ceil(static_cast(start[i]) / static_cast(stride[i]))); + auto r_start = static_cast(std::ceil(static_cast(start[i]) / static_cast(stride[i]))); size_t r_next_start = std::min( 
original_shape[i], static_cast(std::ceil(static_cast(start[i] + count[i]) / static_cast(stride[i])))); diff --git a/src/Variable.cpp b/src/Variable.cpp index 1339d5d..71663e3 100644 --- a/src/Variable.cpp +++ b/src/Variable.cpp @@ -60,7 +60,7 @@ void Variable::set_transaction_selection(unsigned int begin, unsigned int count) } void Variable::set_reduction_operation(std::shared_ptr method, - std::map> parameters) + const std::map>& parameters) { auto stream = defined_in_stream_.lock(); xbt_assert(stream, "Variable::set_reduction_operation called after its Stream has been destroyed"); From 34a1e7aaf8ccde1d0ca55c857f06b666511ebe98 Mon Sep 17 00:00:00 2001 From: Fred Suter Date: Sat, 14 Feb 2026 12:45:26 -0500 Subject: [PATCH 45/92] reduce method complexity --- include/dtlmod/CompressionReductionMethod.hpp | 7 + include/dtlmod/DecimationReductionMethod.hpp | 6 + src/CompressionReductionMethod.cpp | 156 ++++++++---------- src/DecimationReductionMethod.cpp | 140 ++++++++-------- 4 files changed, 150 insertions(+), 159 deletions(-) diff --git a/include/dtlmod/CompressionReductionMethod.hpp b/include/dtlmod/CompressionReductionMethod.hpp index c37a4d9..c2f77ab 100644 --- a/include/dtlmod/CompressionReductionMethod.hpp +++ b/include/dtlmod/CompressionReductionMethod.hpp @@ -63,6 +63,13 @@ class CompressionReductionMethod : public ReductionMethod { /// @brief Derive the compression ratio from accuracy and compressor profile. static double derive_compression_ratio(double accuracy, std::string_view profile, double data_smoothness); + /// @brief Validate compressor profile string. Throws if invalid. + static void validate_compressor_profile(std::string_view profile); + + /// @brief Validate and resolve the compression ratio from parsed parameters. 
+ static double resolve_compression_ratio(double ratio, bool ratio_explicitly_set, bool is_new, + std::string_view profile, double accuracy, double data_smoothness); + public: using ReductionMethod::ReductionMethod; void parameterize_for_variable(const Variable& var, diff --git a/include/dtlmod/DecimationReductionMethod.hpp b/include/dtlmod/DecimationReductionMethod.hpp index b8f515a..a924bc2 100644 --- a/include/dtlmod/DecimationReductionMethod.hpp +++ b/include/dtlmod/DecimationReductionMethod.hpp @@ -58,6 +58,12 @@ class DecimationReductionMethod : public ReductionMethod { std::map> per_variable_parameterizations_; + /// @brief Parse and validate a comma-separated stride string against a variable's shape. + static std::vector parse_stride(std::string_view value, const Variable& var); + + /// @brief Validate that an interpolation method is compatible with the variable's dimensionality. + static void validate_interpolation(std::string_view method, const Variable& var); + protected: void parameterize_for_variable(const Variable& var, const std::map>& parameters) override; diff --git a/src/CompressionReductionMethod.cpp b/src/CompressionReductionMethod.cpp index b706447..e9efd93 100644 --- a/src/CompressionReductionMethod.cpp +++ b/src/CompressionReductionMethod.cpp @@ -55,11 +55,12 @@ double CompressionReductionMethod::derive_compression_ratio(double accuracy, std { if (profile == "sz") { // SZ-like prediction-based compressor: empirical fit from published benchmarks on scientific data. - // Higher smoothness → better prediction → higher ratio. + // Higher smoothness -> better prediction -> higher ratio. double alpha = 3.0; double beta = 0.8; return std::max(1.0, alpha * std::pow(-std::log10(accuracy), beta) * (0.5 + data_smoothness)); - } else if (profile == "zfp") { + } + if (profile == "zfp") { // ZFP-like transform-based compressor: rate = bits-per-value derived from accuracy. // 64 bits (double) / rate gives the compression ratio. 
double rate = std::max(1.0, -std::log2(accuracy) + 1.0); @@ -69,104 +70,91 @@ double CompressionReductionMethod::derive_compression_ratio(double accuracy, std return 1.0; } +void CompressionReductionMethod::validate_compressor_profile(std::string_view profile) +{ + if (profile != "fixed" && profile != "sz" && profile != "zfp") + throw UnknownCompressionOptionException(XBT_THROW_POINT, "Unknown compressor profile: " + std::string(profile) + + " (options are: fixed, sz, or zfp)."); +} + +double CompressionReductionMethod::resolve_compression_ratio(double ratio, bool ratio_explicitly_set, bool is_new, + std::string_view profile, double accuracy, + double data_smoothness) +{ + if (ratio_explicitly_set) { + if (ratio < 1.0) + throw InconsistentCompressionRatioException(XBT_THROW_POINT, "Compression ratio must be >= 1.0"); + return ratio; + } + if (is_new) { + if (profile == "fixed") + throw InconsistentCompressionRatioException( + XBT_THROW_POINT, "Compressor profile 'fixed' requires an explicit 'compression_ratio' parameter."); + return derive_compression_ratio(accuracy, profile, data_smoothness); + } + return ratio; // Keep existing ratio for partial updates without explicit ratio +} + void CompressionReductionMethod::parameterize_for_variable( const Variable& var, const std::map>& parameters) { - double new_accuracy = 1e-3; - double new_compression_cost_per_element = 1.0; - double new_decompression_cost_per_element = 1.0; - double new_compression_ratio = 0.0; // 0 means "not specified, must be derived" - std::string new_compressor_profile = "fixed"; - double new_data_smoothness = 0.5; - double new_ratio_variability = 0.0; - - // Detect existing parameterization (if any). - auto it = per_variable_parameterizations_.find(&var); - const bool exists = (it != per_variable_parameterizations_.end()); - - // Initialize from existing values (if present) to support partial updates. - if (exists) { + // Start from existing values (if any) to support partial updates. 
+ auto it = per_variable_parameterizations_.find(&var); + bool is_new = (it == per_variable_parameterizations_.end()); + + double accuracy = 1e-3; + double compression_cost_per_element = 1.0; + double decompression_cost_per_element = 1.0; + double compression_ratio = 0.0; + std::string compressor_profile = "fixed"; + double data_smoothness = 0.5; + double ratio_variability = 0.0; + + if (!is_new) { const auto& existing = it->second; - new_accuracy = existing->get_accuracy(); - new_compression_cost_per_element = existing->get_compression_cost_per_element(); - new_decompression_cost_per_element = existing->get_decompression_cost_per_element(); - new_compression_ratio = existing->get_compression_ratio(); - new_compressor_profile = existing->get_compressor_profile(); - new_data_smoothness = existing->get_data_smoothness(); - new_ratio_variability = existing->get_ratio_variability(); + accuracy = existing->get_accuracy(); + compression_cost_per_element = existing->get_compression_cost_per_element(); + decompression_cost_per_element = existing->get_decompression_cost_per_element(); + compression_ratio = existing->get_compression_ratio(); + compressor_profile = existing->get_compressor_profile(); + data_smoothness = existing->get_data_smoothness(); + ratio_variability = existing->get_ratio_variability(); } bool ratio_explicitly_set = false; for (const auto& [key, value] : parameters) { - if (key == "accuracy") { - new_accuracy = std::stod(value); - } else if (key == "compression_cost_per_element") { - new_compression_cost_per_element = std::stod(value); - } else if (key == "decompression_cost_per_element") { - new_decompression_cost_per_element = std::stod(value); - } else if (key == "compression_ratio") { - new_compression_ratio = std::stod(value); - ratio_explicitly_set = true; + if (key == "accuracy") + accuracy = std::stod(value); + else if (key == "compression_cost_per_element") + compression_cost_per_element = std::stod(value); + else if (key == 
"decompression_cost_per_element") + decompression_cost_per_element = std::stod(value); + else if (key == "compression_ratio") { + compression_ratio = std::stod(value); + ratio_explicitly_set = true; } else if (key == "compressor") { - if (value == "fixed" || value == "sz" || value == "zfp") - new_compressor_profile = value; - else - throw UnknownCompressionOptionException(XBT_THROW_POINT, "Unknown compressor profile: " + value + - " (options are: fixed, sz, or zfp)."); - } else if (key == "data_smoothness") { - new_data_smoothness = std::stod(value); - } else if (key == "ratio_variability") { - new_ratio_variability = std::stod(value); - } else { + validate_compressor_profile(value); + compressor_profile = value; + } else if (key == "data_smoothness") + data_smoothness = std::stod(value); + else if (key == "ratio_variability") + ratio_variability = std::stod(value); + else throw UnknownCompressionOptionException(XBT_THROW_POINT, key); - } } - // Derive compression ratio if not explicitly specified - if (!ratio_explicitly_set && !exists) { - if (new_compressor_profile == "fixed") - throw InconsistentCompressionRatioException( - XBT_THROW_POINT, "Compressor profile 'fixed' requires an explicit 'compression_ratio' parameter."); - new_compression_ratio = derive_compression_ratio(new_accuracy, new_compressor_profile, new_data_smoothness); - } else if (ratio_explicitly_set && new_compression_ratio < 1.0) { - throw InconsistentCompressionRatioException(XBT_THROW_POINT, "Compression ratio must be >= 1.0"); - } + compression_ratio = resolve_compression_ratio(compression_ratio, ratio_explicitly_set, is_new, compressor_profile, + accuracy, data_smoothness); XBT_DEBUG("Compression parameterization for Variable %s: profile=%s, accuracy=%.2e, ratio=%.2f, " "compression_cost=%.2f, decompression_cost=%.2f, smoothness=%.2f, variability=%.2f", - var.get_cname(), new_compressor_profile.c_str(), new_accuracy, new_compression_ratio, - new_compression_cost_per_element, 
new_decompression_cost_per_element, new_data_smoothness, - new_ratio_variability); - - if (!exists) { - per_variable_parameterizations_.try_emplace( - &var, std::make_shared( - var, new_accuracy, new_compression_cost_per_element, new_decompression_cost_per_element, - new_compression_ratio, new_compressor_profile, new_data_smoothness, new_ratio_variability)); - return; - } + var.get_cname(), compressor_profile.c_str(), accuracy, compression_ratio, compression_cost_per_element, + decompression_cost_per_element, data_smoothness, ratio_variability); - // If already exists, update only if changed. - const auto& existing = it->second; - - if (existing->get_accuracy() != new_accuracy) - existing->set_accuracy(new_accuracy); - if (existing->get_compression_cost_per_element() != new_compression_cost_per_element) - existing->set_compression_cost_per_element(new_compression_cost_per_element); - if (existing->get_decompression_cost_per_element() != new_decompression_cost_per_element) - existing->set_decompression_cost_per_element(new_decompression_cost_per_element); - if (ratio_explicitly_set || new_compressor_profile != existing->get_compressor_profile()) { - double updated_ratio = ratio_explicitly_set - ? new_compression_ratio - : derive_compression_ratio(new_accuracy, new_compressor_profile, new_data_smoothness); - existing->set_compression_ratio(updated_ratio); - } - if (existing->get_compressor_profile() != new_compressor_profile) - existing->set_compressor_profile(new_compressor_profile); - if (existing->get_data_smoothness() != new_data_smoothness) - existing->set_data_smoothness(new_data_smoothness); - if (existing->get_ratio_variability() != new_ratio_variability) - existing->set_ratio_variability(new_ratio_variability); + // Always (re)create the parameterization — avoids field-by-field update complexity. 
+ per_variable_parameterizations_[&var] = std::make_shared( + var, accuracy, compression_cost_per_element, decompression_cost_per_element, compression_ratio, + compressor_profile, data_smoothness, ratio_variability); } } // namespace dtlmod diff --git a/src/DecimationReductionMethod.cpp b/src/DecimationReductionMethod.cpp index 54c4547..3777873 100644 --- a/src/DecimationReductionMethod.cpp +++ b/src/DecimationReductionMethod.cpp @@ -36,95 +36,85 @@ double DecimationReductionMethod::ParameterizedDecimation::get_flop_amount_to_de { XBT_DEBUG("Compute decimation cost with: cost_per_element = %.2f and interpolation_method = %s", cost_per_element_, interpolation_method_.c_str()); - double amount = cost_per_element_; + double amount = cost_per_element_; auto local_size = static_cast(var_->get_local_size()); - if (interpolation_method_.empty()) { - amount *= local_size; - } else if (interpolation_method_ == "linear") { - amount = 2 * amount * local_size; - } else if (interpolation_method_ == "quadratic") { - amount = 4 * amount * local_size; - } else if (interpolation_method_ == "cubic") { - amount = 8 * amount * local_size; - } // Sanity check done when parameterizing the reduction method for this variable - return amount; + int multiplier = 1; + + if (interpolation_method_ == "linear") + multiplier = 2; + else if (interpolation_method_ == "quadratic") + multiplier = 4; + else if (interpolation_method_ == "cubic") + multiplier = 8; + + return multiplier * amount * local_size; +} + +std::vector DecimationReductionMethod::parse_stride(std::string_view value, const Variable& var) +{ + std::vector tokens; + std::string value_str(value); + boost::split(tokens, value_str, boost::is_any_of(","), boost::token_compress_on); + + if (var.get_shape().size() != tokens.size()) + throw InconsistentDecimationStrideException( + XBT_THROW_POINT, "Decimation Stride and Variable Shape vectors must have the same size. 
Stride: " + + std::to_string(tokens.size()) + ", Shape: " + std::to_string(var.get_shape().size())); + + std::vector stride; + stride.reserve(tokens.size()); + for (const auto& t : tokens) { + auto dim_stride = std::stoul(t); + if (t[0] == '-' || dim_stride == 0) + throw InconsistentDecimationStrideException(XBT_THROW_POINT, "Stride values must be strictly positive"); + stride.push_back(dim_stride); + } + return stride; +} + +void DecimationReductionMethod::validate_interpolation(std::string_view method, const Variable& var) +{ + if (method != "linear" && method != "quadratic" && method != "cubic") + throw UnknownDecimationInterpolationException(XBT_THROW_POINT, std::string("Unknown interpolation method: ") + + std::string(method) + + " (options are: linear, cubic, or quadratic)."); + + if ((method == "quadratic" && var.get_shape().size() < 2) || (method == "cubic" && var.get_shape().size() < 3)) + throw InconsistentDecimationInterpolationException( + XBT_THROW_POINT, "Variable has not enough dimensions to apply this interpolation method"); } void DecimationReductionMethod::parameterize_for_variable( const Variable& var, const std::map>& parameters) { - std::vector new_stride; - std::string new_interpolation_method; - double new_cost_per_element = 1.0; - - // Detect existing parameterization (if any). - auto it = per_variable_parameterizations_.find(&var); - const bool exists = (it != per_variable_parameterizations_.end()); - - // Initialize from existing values (if present) to support partial updates. - if (exists) { - const auto& existing = it->second; - // Replace these getters with your actual API: - new_stride = existing->get_stride(); - new_interpolation_method = existing->get_interpolation_method(); - new_cost_per_element = existing->get_cost_per_element(); + // Start from existing values (if any) to support partial updates. 
+ auto it = per_variable_parameterizations_.find(&var); + + std::vector stride; + std::string interpolation_method; + double cost_per_element = 1.0; + + if (it != per_variable_parameterizations_.end()) { + stride = it->second->get_stride(); + interpolation_method = it->second->get_interpolation_method(); + cost_per_element = it->second->get_cost_per_element(); } for (const auto& [key, value] : parameters) { - if (key == "stride") { - std::vector tokens; - boost::split(tokens, value, boost::is_any_of(","), boost::token_compress_on); - - if (var.get_shape().size() != tokens.size()) - throw InconsistentDecimationStrideException( - XBT_THROW_POINT, "Decimation Stride and Variable Shape vectors must have the same size. Stride: " + - std::to_string(tokens.size()) + ", Shape: " + std::to_string(var.get_shape().size())); - - std::vector parsed_stride; - parsed_stride.reserve(tokens.size()); - for (const auto& t : tokens) { - auto dim_stride = std::stoul(t); - if (t[0] == '-' || dim_stride == 0) - throw InconsistentDecimationStrideException(XBT_THROW_POINT, "Stride values must be strictly positive"); - parsed_stride.push_back(dim_stride); - } - new_stride = std::move(parsed_stride); - - } else if (key == "interpolation") { - if (value == "linear" || value == "quadratic" || value == "cubic") - new_interpolation_method = value; - else - throw UnknownDecimationInterpolationException(XBT_THROW_POINT, - std::string("Unknown interpolation method: ") + value + - " (options are: linear, cubic, or quadratic)."); - - if ((value == "quadratic" && var.get_shape().size() < 2) || (value == "cubic" && var.get_shape().size() < 3)) - throw InconsistentDecimationInterpolationException( - XBT_THROW_POINT, "Variable has not enough dimensions to apply this interpolation method"); + if (key == "stride") + stride = parse_stride(value, var); + else if (key == "interpolation") { + validate_interpolation(value, var); + interpolation_method = value; } else if (key == "cost_per_element") - 
new_cost_per_element = std::stod(value); + cost_per_element = std::stod(value); else throw UnknownDecimationOptionException(XBT_THROW_POINT, key); } - if (!exists) { - // First-time parameterization - per_variable_parameterizations_.try_emplace( - &var, - std::make_shared(var, new_stride, new_interpolation_method, new_cost_per_element)); - return; - } - - // If already exists, update only if changed. - const auto& existing = it->second; - - // Compare with existing to avoid unnecessary churn - if (existing->get_stride() != new_stride) - existing->set_stride(new_stride); - if (existing->get_interpolation_method() != new_interpolation_method) - existing->set_interpolation_method(new_interpolation_method); - if (existing->get_cost_per_element() != new_cost_per_element) - existing->set_cost_per_element(new_cost_per_element); + // Always (re)create the parameterization — avoids field-by-field update complexity. + per_variable_parameterizations_[&var] = + std::make_shared(var, stride, interpolation_method, cost_per_element); } void DecimationReductionMethod::reduce_variable(const Variable& var) From e42988bd657a6e62e6e06b312a2c32c415e43902 Mon Sep 17 00:00:00 2001 From: Fred Suter Date: Sat, 14 Feb 2026 12:56:09 -0500 Subject: [PATCH 46/92] fix broken test --- src/DecimationReductionMethod.cpp | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/src/DecimationReductionMethod.cpp b/src/DecimationReductionMethod.cpp index 3777873..f7e4165 100644 --- a/src/DecimationReductionMethod.cpp +++ b/src/DecimationReductionMethod.cpp @@ -112,9 +112,15 @@ void DecimationReductionMethod::parameterize_for_variable( throw UnknownDecimationOptionException(XBT_THROW_POINT, key); } - // Always (re)create the parameterization — avoids field-by-field update complexity. 
- per_variable_parameterizations_[&var] = - std::make_shared(var, stride, interpolation_method, cost_per_element); + if (it == per_variable_parameterizations_.end()) { + per_variable_parameterizations_.try_emplace( + &var, std::make_shared(var, stride, interpolation_method, cost_per_element)); + } else { + // Update in-place to preserve per-actor state accumulated by reduce_variable(). + it->second->set_stride(stride); + it->second->set_interpolation_method(interpolation_method); + it->second->set_cost_per_element(cost_per_element); + } } void DecimationReductionMethod::reduce_variable(const Variable& var) From f4579eb30bdc9be7b553206bd5606acfe96b104b Mon Sep 17 00:00:00 2001 From: Fred Suter Date: Sat, 14 Feb 2026 13:21:56 -0500 Subject: [PATCH 47/92] [sonar] use a struct to configure compression --- include/dtlmod/CompressionReductionMethod.hpp | 56 +++++++------------ include/dtlmod/ReductionMethod.hpp | 2 +- src/CompressionReductionMethod.cpp | 56 +++++++------------ 3 files changed, 41 insertions(+), 73 deletions(-) diff --git a/include/dtlmod/CompressionReductionMethod.hpp b/include/dtlmod/CompressionReductionMethod.hpp index c2f77ab..e705dda 100644 --- a/include/dtlmod/CompressionReductionMethod.hpp +++ b/include/dtlmod/CompressionReductionMethod.hpp @@ -14,45 +14,31 @@ namespace dtlmod { /// \cond EXCLUDE_FROM_DOCUMENTATION class CompressionReductionMethod : public ReductionMethod { + struct CompressionConfig { + double accuracy = 1e-3; + double compression_cost_per_element = 1.0; + double decompression_cost_per_element = 1.0; + double compression_ratio = 0.0; + std::string compressor_profile = "fixed"; + double data_smoothness = 0.5; + double ratio_variability = 0.0; + }; + class ParameterizedCompression { const Variable* var_; // non-owning: the Variable outlives the parameterization (both owned by Stream) - double accuracy_; - double compression_cost_per_element_; - double decompression_cost_per_element_; - double compression_ratio_; - std::string 
compressor_profile_; // "fixed", "sz", "zfp" - double data_smoothness_; // hint in [0,1], shifts the model curve - double ratio_variability_; // per-transaction noise amplitude in [0,1] + CompressionConfig cfg_; public: - ParameterizedCompression(const Variable& var, double accuracy, double compression_cost_per_element, - double decompression_cost_per_element, double compression_ratio, - const std::string& compressor_profile, double data_smoothness, double ratio_variability) - : var_(&var) - , accuracy_(accuracy) - , compression_cost_per_element_(compression_cost_per_element) - , decompression_cost_per_element_(decompression_cost_per_element) - , compression_ratio_(compression_ratio) - , compressor_profile_(compressor_profile) - , data_smoothness_(data_smoothness) - , ratio_variability_(ratio_variability) - { - } - - [[nodiscard]] double get_accuracy() const { return accuracy_; } - void set_accuracy(double accuracy) { accuracy_ = accuracy; } - [[nodiscard]] double get_compression_cost_per_element() const { return compression_cost_per_element_; } - void set_compression_cost_per_element(double cost) { compression_cost_per_element_ = cost; } - [[nodiscard]] double get_decompression_cost_per_element() const { return decompression_cost_per_element_; } - void set_decompression_cost_per_element(double cost) { decompression_cost_per_element_ = cost; } - [[nodiscard]] double get_compression_ratio() const { return compression_ratio_; } - void set_compression_ratio(double ratio) { compression_ratio_ = ratio; } - [[nodiscard]] const std::string& get_compressor_profile() const { return compressor_profile_; } - void set_compressor_profile(std::string_view profile) { compressor_profile_ = profile; } - [[nodiscard]] double get_data_smoothness() const { return data_smoothness_; } - void set_data_smoothness(double smoothness) { data_smoothness_ = smoothness; } - [[nodiscard]] double get_ratio_variability() const { return ratio_variability_; } - void set_ratio_variability(double 
variability) { ratio_variability_ = variability; } + ParameterizedCompression(const Variable& var, CompressionConfig cfg) : var_(&var), cfg_(std::move(cfg)) {} + + [[nodiscard]] double get_accuracy() const { return cfg_.accuracy; } + [[nodiscard]] double get_compression_cost_per_element() const { return cfg_.compression_cost_per_element; } + [[nodiscard]] double get_decompression_cost_per_element() const { return cfg_.decompression_cost_per_element; } + [[nodiscard]] double get_compression_ratio() const { return cfg_.compression_ratio; } + [[nodiscard]] const std::string& get_compressor_profile() const { return cfg_.compressor_profile; } + [[nodiscard]] double get_data_smoothness() const { return cfg_.data_smoothness; } + [[nodiscard]] double get_ratio_variability() const { return cfg_.ratio_variability; } + [[nodiscard]] const CompressionConfig& get_config() const { return cfg_; } /// @brief Get the effective compression ratio, optionally perturbed by per-transaction noise. [[nodiscard]] double get_effective_ratio(unsigned int transaction_id = 0) const; diff --git a/include/dtlmod/ReductionMethod.hpp b/include/dtlmod/ReductionMethod.hpp index feaf233..5ae33b7 100644 --- a/include/dtlmod/ReductionMethod.hpp +++ b/include/dtlmod/ReductionMethod.hpp @@ -25,7 +25,7 @@ class ReductionMethod { std::string name_; public: - ReductionMethod(const std::string& name) : name_(name) {} + explicit ReductionMethod(const std::string& name) : name_(name) {} virtual ~ReductionMethod() = default; virtual void parameterize_for_variable(const Variable& var, diff --git a/src/CompressionReductionMethod.cpp b/src/CompressionReductionMethod.cpp index e9efd93..a4d39fa 100644 --- a/src/CompressionReductionMethod.cpp +++ b/src/CompressionReductionMethod.cpp @@ -15,13 +15,13 @@ namespace dtlmod { double CompressionReductionMethod::ParameterizedCompression::get_effective_ratio(unsigned int transaction_id) const { - if (ratio_variability_ <= 0.0) - return compression_ratio_; + if 
(cfg_.ratio_variability <= 0.0) + return cfg_.compression_ratio; // Deterministic noise from hash of (variable_name, transaction_id) size_t seed = std::hash{}(var_->get_name()) ^ (std::hash{}(transaction_id) << 1); // Map to [1 - variability, 1 + variability] - double noise = 1.0 + ratio_variability_ * (2.0 * (seed % 10001) / 10000.0 - 1.0); - return std::max(1.0, compression_ratio_ * noise); + double noise = 1.0 + cfg_.ratio_variability * (2.0 * (seed % 10001) / 10000.0 - 1.0); + return std::max(1.0, cfg_.compression_ratio * noise); } size_t CompressionReductionMethod::get_reduced_variable_global_size(const Variable& var) const @@ -102,59 +102,41 @@ void CompressionReductionMethod::parameterize_for_variable( auto it = per_variable_parameterizations_.find(&var); bool is_new = (it == per_variable_parameterizations_.end()); - double accuracy = 1e-3; - double compression_cost_per_element = 1.0; - double decompression_cost_per_element = 1.0; - double compression_ratio = 0.0; - std::string compressor_profile = "fixed"; - double data_smoothness = 0.5; - double ratio_variability = 0.0; - - if (!is_new) { - const auto& existing = it->second; - accuracy = existing->get_accuracy(); - compression_cost_per_element = existing->get_compression_cost_per_element(); - decompression_cost_per_element = existing->get_decompression_cost_per_element(); - compression_ratio = existing->get_compression_ratio(); - compressor_profile = existing->get_compressor_profile(); - data_smoothness = existing->get_data_smoothness(); - ratio_variability = existing->get_ratio_variability(); - } + CompressionConfig cfg = is_new ? 
CompressionConfig{} : it->second->get_config(); bool ratio_explicitly_set = false; for (const auto& [key, value] : parameters) { if (key == "accuracy") - accuracy = std::stod(value); + cfg.accuracy = std::stod(value); else if (key == "compression_cost_per_element") - compression_cost_per_element = std::stod(value); + cfg.compression_cost_per_element = std::stod(value); else if (key == "decompression_cost_per_element") - decompression_cost_per_element = std::stod(value); + cfg.decompression_cost_per_element = std::stod(value); else if (key == "compression_ratio") { - compression_ratio = std::stod(value); - ratio_explicitly_set = true; + cfg.compression_ratio = std::stod(value); + ratio_explicitly_set = true; } else if (key == "compressor") { validate_compressor_profile(value); - compressor_profile = value; + cfg.compressor_profile = value; } else if (key == "data_smoothness") - data_smoothness = std::stod(value); + cfg.data_smoothness = std::stod(value); else if (key == "ratio_variability") - ratio_variability = std::stod(value); + cfg.ratio_variability = std::stod(value); else throw UnknownCompressionOptionException(XBT_THROW_POINT, key); } - compression_ratio = resolve_compression_ratio(compression_ratio, ratio_explicitly_set, is_new, compressor_profile, - accuracy, data_smoothness); + cfg.compression_ratio = resolve_compression_ratio(cfg.compression_ratio, ratio_explicitly_set, is_new, + cfg.compressor_profile, cfg.accuracy, cfg.data_smoothness); XBT_DEBUG("Compression parameterization for Variable %s: profile=%s, accuracy=%.2e, ratio=%.2f, " "compression_cost=%.2f, decompression_cost=%.2f, smoothness=%.2f, variability=%.2f", - var.get_cname(), compressor_profile.c_str(), accuracy, compression_ratio, compression_cost_per_element, - decompression_cost_per_element, data_smoothness, ratio_variability); + var.get_cname(), cfg.compressor_profile.c_str(), cfg.accuracy, cfg.compression_ratio, + cfg.compression_cost_per_element, cfg.decompression_cost_per_element, 
cfg.data_smoothness, + cfg.ratio_variability); // Always (re)create the parameterization — avoids field-by-field update complexity. - per_variable_parameterizations_[&var] = std::make_shared( - var, accuracy, compression_cost_per_element, decompression_cost_per_element, compression_ratio, - compressor_profile, data_smoothness, ratio_variability); + per_variable_parameterizations_[&var] = std::make_shared(var, std::move(cfg)); } } // namespace dtlmod From 77cf5f13c1e0e3e446eeb8d6bdd5fa35c752ed16 Mon Sep 17 00:00:00 2001 From: Fred Suter Date: Sat, 14 Feb 2026 13:25:21 -0500 Subject: [PATCH 48/92] untrack brainstorm --- Brainstorm_and_TODOs.md | 131 ---------------------------------------- 1 file changed, 131 deletions(-) delete mode 100644 Brainstorm_and_TODOs.md diff --git a/Brainstorm_and_TODOs.md b/Brainstorm_and_TODOs.md deleted file mode 100644 index 5c4051c..0000000 --- a/Brainstorm_and_TODOs.md +++ /dev/null @@ -1,131 +0,0 @@ -# Brainstorm to add data reduction to DTLMod -## Assumptions -- Only one **reduction method** can be applied to a Variable **at a given time**. -- It should be possible to **change** the reduction method or its parameterization from **one transaction to another**. -- If we want to apply **different reduction methods** to the same Variable, it must be over **different Streams**. - - in that case, we define a Variable for each stream (with the same name, shape, and distribution) and then apply a single reduction method at a time. -- A reduction method should be applied to a Variable either on the Publisher or the Subscriber side. -- As long as a reduction method is defined for a Variable, it must applied to each transaction that involves this Variable - -## Notes on ADIOS does manage Variable reduction -- Has an Operator class (broader definition than just reduction, can also be encryption for instance) -- Has IO::defineOperator(name, type, parameters) to configure the operation for that IO (a.k.a. 
Stream in the DTL) - - This function must be called when parsing the configuration file -- Has IO::addOperation(variable, operatorType, parameters) and Variable::addOperation(Operator, parameters) the former is to define the operation before the variable while the latter does the work - - multiple operations can be applied to a single variable -- Has Variable::removeOperation -- The doc says that the same operation can be applied to "a set of variables", but nothing in the code looks like that. -- In the ADIOS XML configuration file, uses a and a set of key/value parameters (which provides a uniform way to parse), the name refers (mostly) to a specific lossy compressor. In the case of the DTL, it will rather be the name of a technique (e.g., decimation, compression, refactoring) all gather under "reduction" rather than "operator" - -## Considered Reduction Methods - -### Decimation -- This method amounts to ignore some elements of the variables, i.e., *one every other X* in each dimension. -- It is parameterized by: - - The applied **stride** that can be different for each dimension, e.g, {X, Y, Z} for a 3D variable - - A **cost per element**. By default, as we have to simulate a traversal of the data, we can account for a couple flops per element (**e.g., 1 or 2**). - - An optional **interpolation technique** - - It simulates the fact that the elements kept capture information about the discarded elements. This would **increase the simulated cost per element** associated to the decimation. - - Depending on the **shape** of the variable (i.e., number of dimensions), interpolation methods can for instance be **linear**, **quadatric**, or **cubic** ,referring to how many neighbors are considered. - - **Assumptions for the first implementation:** - - We do not consider interpolation beyond three dimensions - - We do not consider the fact that elements needed by a rank to compute the interpolation are owned by another rank. 
-- Impact on the local and global sizes of the variable: - - For each of its dimensions, a Variable is defined by a *shape* (the total #elements) and two arrays of *local_start* (the offset for each actor owning a part of the data) and *local_count* (the number of elements owned by each actor) values. When we apply a decimation of stride *X* on a dimension, we have: - - **reduced_shape** = ceil(shape / (1.0 * X )) - - **reduced_local_start** = ceil(local_start / (1.0 * X )) and 0 if greater than (reduced_shape - 1) - - **reduced_local_count** = min(shape, ceil((local_start + local_count) / 1.0 * X)) - ceil(local_start / (1.0 * X)) - - The `get_reduced_global_size()` method will return the product of the element size by the `reduced_shape` of each dimension. - - The `get_reduced_local_size()` method will return the product of the `reduced_local_count` of each dimension. - - **Note:** these two functions may be for internal purposes only and not user-facing. - -- **Behavior depending on the engine type** - - With a **File** engine - - If the reduction method is applied on the **publisher side**, it's to speed up I/O and reduce the storage footprint (e.g., for a checkpoint operation). A `put()` of a Variable on which decimation is applied considers the **local_reduced_size** for the I/O operations triggered by the put. This is automatically reflected in the metadata stored for this Variable. - - If the reduction method is applied on the **subscriber side**, this means that the subscriber does not want to fetch the entire data from storage. The behavior is thus similar to that of calling `set_selection()`. - - With a **Staging** engine - - The reduction method can be applied on either side of the stream with the same effect. The internal behavior is close to that of a transaction with a selection. The exact data transfer pattern is determined when the subscribers specify what they need from the publishers. 
In that case, this means to consider the **reduced** versions of **shape**, **start**, and **count** to determine the block to exchange. A notable difference is that if the reduction method is applied on one side, the other side must compute the reduced information first. - - When the decimation does interpolation, the cost of its computation must be simulated on the publisher side, but only when determining the exchanges between individual pairs of publishers and subscribers. - - -### Compression -- This methods produces a smaller version of a variable on the publisher side, but keep the same metadata information (shape, start, and, count). -- It is parameterized by: - - An **accuracy** (usually expressed as 10 to the minus X). The lower the accuracy value (meaning more decimals must be kept), the lower the compression ratio, and the lower the compression cost. - - A **compression cost per element** applied on the **publisher side** - - A **decompression cost per element** applied on the **subscriber side** -- The overall reduction cost and the size of the reduced version of the variable (or compression ratio) are related to the **accuracy** -- The compression and decompression costs are is likely to be much higher than for decimation, but still are proportional to the number of elements in the variable (must consider each and every element), which motivated the use of a cost per element as a first approximation. This will also allow us to distinguish executions on CPU or GPU at some point (in the later version). 
- -- **Behavior depending on the engine type** - - With a **File** engine - - The initial definition of the variable is kept (as decompressing will come back to that) but the local/global sizes are impacted - - Add a `local_compressed_size` and a `global_compressed_size` (uniform across ranks or not is to be decided) - Use the `local_compressed_size` in the `put()` and when transforming the `put()` into I/O operations - - The variable must also be tagged as `stored_compressed`, likely with extra details about the compression technique, ratio, ... - - With a **Staging** engine - - Here reduction only aims at speeding up data transport as data is not stored. Blocks to exchange are computed based on the original description of the data, but the size of each block is reduced to be transferred - - **Note:** as a first approximation we assume a **uniform** compression of each block. - - In both cases, the compression happens at the beginning of the `put()` on the publisher side, while decompression happens at the end of the `get()` on the subscriber side. 
- - -## Proposed API and behavior (WIP) - - [x] new `ReductionMethod` class: An abstract class from which decimation and compression will inherit - - members: - - [x] `std::string name_` - - methods and behavior: - - [x] `get_name` and `get_cname` - - [x] `virtual void parameterize_for_variable(std::shared_ptr var, std::map parameters) = 0`: parse the parameters and creates the parameterized version of the `ReductionMethod` - - [x] `virtual void reduce_variable(std::shared_ptr var) = 0`: compute `reduced_shape` and `reduced_local_start_and_count` for a variable and store them in the `ParameterizedDecimation` - - [x] `virtual size_t get_reduced_variable_global_size(std::shared_ptr var) const = 0` - - [x] `virtual size_t get_reduced_variable_local_size(std::shared_ptr var) const = 0` - - - [ ] new `DecimationReductionMethod` class - - members: - - [x] `std::map, ParameterizedDecimation> per_variable_parameterizations_` - - methods and behavior: - - The parameters used by the decimation method can be different for each variable. They must be stored in a map whose key is the Variable (assuming that the a `ReductionMethod` can only be applied once to a variable). The values in that map are A `ParameterizedDecimation` objects that contain the `stride`, `interpolation_method`, and `cost_per_element` to use for this variable. 
- - The parameterization to use for a variable is set when calling `Variable::add_reduction_operation` - - - [ ] new `ParameterizedDecimation` class - - members: - - [x] `std::vector stride_` - - [x] `std::string interpolation_method_` (default to empty string) - - [x] `double cost_per_element_` - - [x] `std::vector reduced_shape_` - - [x] `std::unordered_map, std::vector>> reduced_local_start_and_count_` - - [x] `size_t element_size_` - - methods and behavior: - - - - [ ] new `CompressionReductionMethod` class - - members: - - [x] `std::map, ParameterizedCompression> per_variable_parameterizations_` - - methods: - - [x] `void parse_parameters(std::shared_ptr var, std::map parameters)` - - behavior: - - The parameters used by the compression method can be different for each variable. They must be stored in a map whose key is the Variable (assuming that the a `ReductionMethod` can only be applied once to a variable). The values in that map are A `ParameterizedCompression` objects that contain the `accuracy`, `compression_cost_per_element`, and `decompression_cost_per_element` to use for this variable. 
- - The parameterization to use for a variable is set when calling `Variable::add_reduction_operation` - - - [ ] New member(s) and function(s) in `Stream` class - - members: - - [x] `reduction_methods_`, a vector of `ReductionMethod` objects - - methods: - - [x]`std::shared_ptr Stream::define_reduction_method(const std::string& name)` to create a new `ReductionMethod` object and store it in `reduction_methods_` - - - [ ] New member(s) and function(s) in `Variable` class - - members: - - [x] `is_reduced_with_`: the shared pointer to the applied `ReductionMethod`, set by `set_reduction_operation` - - methods: - - [x] `void Variable::set_reduction_operation(std::shared_ptr, std::map parameters)` that triggers the parameter parsing of the `ReductionMethod` passed in argument - - [x] `bool is_reduced() const` - - [ ] decide if public or protected - - [x] `const std::shared_ptr& get_reduction_method() const` - - [ ] decide if public or protected - - -## TODOs -- [ ] add tests in `test/dtl_reduction.cpp` - - [ ] SimpleDecimationFileEngine: -- [ ] add python binding -- [ ] add documentation From 465b6b27fbe59127c60296ea7eeb560487508750 Mon Sep 17 00:00:00 2001 From: Fred Suter Date: Sat, 14 Feb 2026 21:09:49 -0500 Subject: [PATCH 49/92] tiny edits --- doc/source/Decimation.rst | 6 ++---- src/Engine.cpp | 2 +- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/doc/source/Decimation.rst b/doc/source/Decimation.rst index e4d695f..7015f78 100644 --- a/doc/source/Decimation.rst +++ b/doc/source/Decimation.rst @@ -35,10 +35,8 @@ Unlike compression, decimation can be applied on **both sides** of the data flow cost of the decimation kernel is incurred before the data is transported. Only the decimated version of the variable is put into the DTL, which directly reduces I/O or network costs. -- When applied by a **subscriber**, decimation reduces the volume of data that the subscriber has to process after - receiving it. 
The subscriber first retrieves the full variable and then applies decimation locally. This can be - useful when the subscriber only needs a coarse view of the data, but the full-resolution version must still be - transported because other subscribers or a checkpoint mechanism may need it. +- When applied by a **subscriber**, decimation reduces the volume of data that the subscriber has to process before + retrieving it from the DTL. Interpolation ------------- diff --git a/src/Engine.cpp b/src/Engine.cpp index 1c1e578..14cd918 100644 --- a/src/Engine.cpp +++ b/src/Engine.cpp @@ -55,7 +55,7 @@ void Engine::get(const std::shared_ptr& var) const { if (var->is_reduced() && var->is_reduced_by_subscriber()) { var->get_reduction_method()->reduce_variable(*var); - // Perform an Exec activity before putting the variable into the DTL to account for the time needed to reduce it. + // Perform an Exec activity before getting the variable for the DTL to account for the time needed to reduce it. sg4::this_actor::execute(var->get_reduction_method()->get_flop_amount_to_reduce_variable(*var)); } From 22fb4619aedcd6fb606a4574c2186189484a4757 Mon Sep 17 00:00:00 2001 From: Fred Suter Date: Mon, 16 Feb 2026 08:55:32 -0500 Subject: [PATCH 50/92] fix false negative in valgrind weekly report --- .github/workflows/weekly-checks.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/weekly-checks.yml b/.github/workflows/weekly-checks.yml index 9b3b6b3..720732e 100644 --- a/.github/workflows/weekly-checks.yml +++ b/.github/workflows/weekly-checks.yml @@ -258,7 +258,7 @@ jobs: cat valgrind-summary.txt # Check for critical errors - if grep -q "ERROR SUMMARY: [^0]" valgrind-output.txt; then + if grep -qP "ERROR SUMMARY: [1-9]" valgrind-output.txt; then echo "::error::Valgrind detected memory errors!" 
exit 1 fi From e72edf4a797537d809b0d764954f926cb00e1f2a Mon Sep 17 00:00:00 2001 From: Fred Suter Date: Mon, 16 Feb 2026 09:57:55 -0500 Subject: [PATCH 51/92] use fun rather than obj in valgrind supp file --- test/valgrind.supp | 30 +++++++++++++++++++++++------- 1 file changed, 23 insertions(+), 7 deletions(-) diff --git a/test/valgrind.supp b/test/valgrind.supp index 3e026d4..1d1c9a9 100644 --- a/test/valgrind.supp +++ b/test/valgrind.supp @@ -1,32 +1,48 @@ # Valgrind suppressions file for DTLMod # This file contains suppressions for known false positives and third-party library issues +# SimGrid I/O activity leaks in forked test processes (DO_TEST_WITH_FORK uses _exit()) { - simgrid_context_factory + simgrid_io_init_leak Memcheck:Leak ... - obj:*/libsimgrid.so* + fun:*simgrid*s4u*Io*init* } { - simgrid_boost_context + simgrid_disk_io_init_leak Memcheck:Leak ... - obj:*/libboost_context.so* + fun:*simgrid*s4u*Disk*io_init* } +# SimGrid context and engine allocations not freed in forked children { - boost_filesystem + simgrid_context_factory Memcheck:Leak ... - obj:*/libboost_filesystem.so* + fun:*simgrid* } { fsmod_library Memcheck:Leak ... - obj:*/libfsmod.so* + fun:*simgrid*fsmod* +} + +{ + boost_context + Memcheck:Leak + ... + obj:*/libboost_context.so* +} + +{ + boost_filesystem + Memcheck:Leak + ... 
+ obj:*/libboost_filesystem.so* } { From 424ab3be53fdad362b1dac9b0e90f8707c0b4639 Mon Sep 17 00:00:00 2001 From: Fred Suter Date: Mon, 16 Feb 2026 09:58:32 -0500 Subject: [PATCH 52/92] default initialization for access_mode_ --- include/dtlmod/Stream.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/dtlmod/Stream.hpp b/include/dtlmod/Stream.hpp index 2f9b298..12229ef 100644 --- a/include/dtlmod/Stream.hpp +++ b/include/dtlmod/Stream.hpp @@ -48,7 +48,7 @@ class Stream : public std::enable_shared_from_this { bool metadata_export_ = false; std::string metadata_file_; sg4::MutexPtr mutex_ = sg4::Mutex::create(); - Mode access_mode_; + Mode access_mode_ = Mode::Publish; std::unordered_map> variables_; std::unordered_map> reduction_methods_; From cf41c718506d7ae9727fce5e1d7807e1099ae643 Mon Sep 17 00:00:00 2001 From: Fred Suter Date: Thu, 19 Feb 2026 23:03:01 -0500 Subject: [PATCH 53/92] Memory efficiency: progressive eviction of metadata transaction entries --- ChangeLog | 19 ++++++++++ include/dtlmod/Engine.hpp | 4 +++ include/dtlmod/FileEngine.hpp | 1 + include/dtlmod/Metadata.hpp | 11 +++++- include/dtlmod/Stream.hpp | 5 ++- src/Engine.cpp | 1 + src/FileEngine.cpp | 16 +++++++++ src/Metadata.cpp | 67 +++++++++++++++++++++++++---------- src/Stream.cpp | 36 +++++++++++++++++-- 9 files changed, 137 insertions(+), 23 deletions(-) diff --git a/ChangeLog b/ChangeLog index 9b980c0..5d825a0 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,5 +1,24 @@ ---------------------------------------------------------------------------- +DTLMod (0.5) not released yet (target: May 2026) + +Improvements: + - Memory efficiency: progressive eviction of metadata transaction entries + - Once all subscribers have consumed a transaction, its entries are evicted + from the in-memory Metadata::transaction_infos_ map + - When metadata export is enabled and publishers and subscribers coexist + (file streaming), evicted entries are progressively flushed to + per-variable 
temporary files; the final metadata file is assembled at + pub_close() from those files and any remaining in-memory entries, + preserving the existing format and transaction count + - When the stream is opened by subscribers only after all publishers have + closed (sequential scenario), memory-only eviction is performed (the + metadata file has already been written by pub_close()) + - Memory footprint of the File engine now grows as O(N_pub) instead of + O(N_pub × N_transactions) for long-running concurrent streaming workloads + +---------------------------------------------------------------------------- + DTLMod (0.4) February 16, 2026 Major improvements: diff --git a/include/dtlmod/Engine.hpp b/include/dtlmod/Engine.hpp index 5df0f9a..faf04f4 100644 --- a/include/dtlmod/Engine.hpp +++ b/include/dtlmod/Engine.hpp @@ -55,6 +55,8 @@ class Engine { std::shared_ptr transport_ = nullptr; std::weak_ptr stream_; + bool pub_ever_present_ = false; + ActorRegistry publishers_; ActorRegistry subscribers_; @@ -86,6 +88,8 @@ class Engine { [[nodiscard]] ActorRegistry& get_subscribers() noexcept { return subscribers_; } [[nodiscard]] const ActorRegistry& get_subscribers() const noexcept { return subscribers_; } + [[nodiscard]] bool pub_ever_present() const noexcept { return pub_ever_present_; } + // Pure virtual methods for derived classes to implement virtual void create_transport(const Transport::Method& transport_method) = 0; virtual void begin_pub_transaction() = 0; diff --git a/include/dtlmod/FileEngine.hpp b/include/dtlmod/FileEngine.hpp index 67ad0b2..6056ca4 100644 --- a/include/dtlmod/FileEngine.hpp +++ b/include/dtlmod/FileEngine.hpp @@ -39,6 +39,7 @@ class FileEngine : public Engine { unsigned int current_sub_transaction_id_ = 0; bool sub_transaction_in_progress_ = false; + unsigned int subs_completed_current_tx_ = 0; void create_transport(const Transport::Method& transport_method) override; [[nodiscard]] const std::shared_ptr& get_file_system() const noexcept { 
return file_system_; } diff --git a/include/dtlmod/Metadata.hpp b/include/dtlmod/Metadata.hpp index 24e10bb..ac82342 100644 --- a/include/dtlmod/Metadata.hpp +++ b/include/dtlmod/Metadata.hpp @@ -6,6 +6,8 @@ #ifndef __DTLMOD_METADATA_HPP__ #define __DTLMOD_METADATA_HPP__ +#include + #include #include @@ -29,6 +31,8 @@ class Metadata { std::less<>> transaction_infos_; + unsigned int flushed_count_ = 0; // number of transactions already flushed to the prog file + protected: const std::map, std::vector>, std::pair, std::less<>>& @@ -45,7 +49,12 @@ class Metadata { { return transaction_infos_.empty() ? 0 : (transaction_infos_.rbegin())->first; } - void export_to_file(std::ofstream& ostream) const; + // Write entries for tx_id to out, increment flushed_count_, erase from transaction_infos_ + void write_transaction_to_stream(unsigned int tx_id, std::ofstream& out); + // Remove tx_id from transaction_infos_ without writing to file + void evict_transaction(unsigned int tx_id); + // Write all remaining transactions; prog_file_path contains already-flushed entries to prepend + void export_to_file(std::ofstream& ostream, const std::string& prog_file_path = "") const; }; /// \endcond diff --git a/include/dtlmod/Stream.hpp b/include/dtlmod/Stream.hpp index 12229ef..9a992b0 100644 --- a/include/dtlmod/Stream.hpp +++ b/include/dtlmod/Stream.hpp @@ -47,6 +47,8 @@ class Stream : public std::enable_shared_from_this { Transport::Method transport_method_ = Transport::Method::Undefined; bool metadata_export_ = false; std::string metadata_file_; + std::unordered_map var_prog_file_paths_; // variable name -> prog file path + bool metadata_exported_ = false; // true once export_metadata_to_file() has been called sg4::MutexPtr mutex_ = sg4::Mutex::create(); Mode access_mode_ = Mode::Publish; @@ -63,7 +65,8 @@ class Stream : public std::enable_shared_from_this { } void close() noexcept { engine_ = nullptr; } - void export_metadata_to_file() const; + void export_metadata_to_file(); + void 
flush_and_evict_transaction(unsigned int tx_id); // Helper methods for Stream::open void validate_open_parameters(std::string_view name, Mode mode) const; diff --git a/src/Engine.cpp b/src/Engine.cpp index 14cd918..77f9e96 100644 --- a/src/Engine.cpp +++ b/src/Engine.cpp @@ -93,6 +93,7 @@ void Engine::close() /// \cond EXCLUDE_FROM_DOCUMENTATION void Engine::add_publisher(sg4::ActorPtr actor) { + pub_ever_present_ = true; transport_->add_publisher(publishers_.count()); publishers_.add(actor); } diff --git a/src/FileEngine.cpp b/src/FileEngine.cpp index a7a0b01..f91840f 100644 --- a/src/FileEngine.cpp +++ b/src/FileEngine.cpp @@ -224,6 +224,22 @@ void FileEngine::end_sub_transaction() if (auto sub_barrier = get_subscribers().get_or_create_barrier()) XBT_DEBUG("Barrier created for %zu subscribers", get_subscribers().count()); + // Evict this transaction's metadata once all subscribers have completed their reads. + // Only applies in the concurrent streaming scenario (pub was registered on this same engine). 
+ if (pub_ever_present()) { + unsigned int tx_to_evict = 0; + { + std::unique_lock lock(*get_subscribers().get_mutex()); + if (++subs_completed_current_tx_ == get_subscribers().count()) { + subs_completed_current_tx_ = 0; + tx_to_evict = current_sub_transaction_id_; + } + } + if (tx_to_evict > 0) + if (auto s = get_stream()) + s->flush_and_evict_transaction(tx_to_evict); + } + // Mark this transaction as over sub_transaction_in_progress_ = false; } diff --git a/src/Metadata.cpp b/src/Metadata.cpp index e78ca8c..84f28de 100644 --- a/src/Metadata.cpp +++ b/src/Metadata.cpp @@ -18,12 +18,51 @@ void Metadata::add_transaction(unsigned int id, transaction_infos_[id][start_and_count] = std::make_pair(location, publisher); } -void Metadata::export_to_file(std::ofstream& ostream) const +static void write_block_entries(std::ofstream& ostream, + const std::map, std::vector>, + std::pair, std::less<>>& transaction) +{ + for (const auto& [block_info, location] : transaction) { + const auto& [block_start, block_count] = block_info; + const auto& [where, actor] = location; + + ostream << " " << where.c_str() << ": ["; + XBT_DEBUG(" Actor %s wrote:", actor->get_cname()); + unsigned long last = block_start.size() - 1; + for (unsigned long i = 0; i < last; i++) { + ostream << block_start[i] << ":" << block_start[i] + block_count[i] << ", "; + XBT_DEBUG(" Dimension %lu : [%zu..%zu]", i + 1, block_start[i], block_start[i] + block_count[i]); + } + ostream << block_start[last] << ":" << block_start[last] + block_count[last] << "]" << std::endl; + XBT_DEBUG(" Dimension %lu : [%zu..%zu]", last + 1, block_start[last], block_start[last] + block_count[last]); + XBT_DEBUG(" in: %s", where.c_str()); + } +} + +void Metadata::write_transaction_to_stream(unsigned int tx_id, std::ofstream& out) +{ + auto it = transaction_infos_.find(tx_id); + if (it == transaction_infos_.end()) + return; + XBT_DEBUG(" Transaction %u:", tx_id); + out << " Transaction " << tx_id << ":" << std::endl; + 
write_block_entries(out, it->second); + flushed_count_++; + transaction_infos_.erase(it); +} + +void Metadata::evict_transaction(unsigned int tx_id) +{ + transaction_infos_.erase(tx_id); +} + +void Metadata::export_to_file(std::ofstream& ostream, const std::string& prog_file_path) const { auto var = variable_.lock(); xbt_assert(var, "Metadata::export_to_file called after its Variable has been destroyed"); XBT_DEBUG("Variable %s:", var->get_cname()); - ostream << var->get_element_size() << "\t" << var->get_cname() << "\t" << transaction_infos_.size(); + unsigned int total = flushed_count_ + static_cast(transaction_infos_.size()); + ostream << var->get_element_size() << "\t" << var->get_cname() << "\t" << total; ostream << "*{"; auto shape = var->get_shape(); const auto last_index = shape.size() - 1; @@ -31,25 +70,17 @@ void Metadata::export_to_file(std::ofstream& ostream) const ostream << shape[i] << ","; ostream << shape[last_index] << "}" << std::endl; + // Copy already-flushed entries from the per-variable prog file (if any) + if (!prog_file_path.empty()) { + std::ifstream prog(prog_file_path, std::ios::binary); + ostream << prog.rdbuf(); + } + + // Write remaining in-memory entries for (const auto& [id, transaction] : transaction_infos_) { XBT_DEBUG(" Transaction %u:", id); ostream << " Transaction " << id << ":" << std::endl; - for (const auto& [block_info, location] : transaction) { - const auto& [block_start, block_count] = block_info; - const auto& [where, actor] = location; - - ostream << " " << where.c_str() << ": ["; - XBT_DEBUG(" Actor %s wrote:", actor->get_cname()); - unsigned long last = block_start.size() - 1; - for (unsigned long i = 0; i < last; i++) { - ostream << block_start[i] << ":" << block_start[i] + block_count[i] << ", "; - XBT_DEBUG(" Dimension %lu : [%zu..%zu]", i + 1, block_start[i], block_start[i] + block_count[i]); - } - ostream << block_start[last] << ":" << block_start[last] + block_count[last] << "]" << std::endl; - XBT_DEBUG(" 
Dimension %lu : [%zu..%zu]", last + 1, block_start[last], block_start[last] + block_count[last]); - - XBT_DEBUG(" in: %s", where.c_str()); - } + write_block_entries(ostream, transaction); } } /// \endcond diff --git a/src/Stream.cpp b/src/Stream.cpp index b436cd6..8f46791 100644 --- a/src/Stream.cpp +++ b/src/Stream.cpp @@ -146,13 +146,43 @@ Stream& Stream::unset_metadata_export() noexcept return *this; } -void Stream::export_metadata_to_file() const +void Stream::export_metadata_to_file() { + metadata_exported_ = true; if (metadata_export_) { std::ofstream export_stream(metadata_file_, std::ofstream::out); - for (const auto& [name, v] : variables_) - v->get_metadata()->export_to_file(export_stream); + for (const auto& [name, v] : variables_) { + auto it = var_prog_file_paths_.find(name); + std::string prog = (it != var_prog_file_paths_.end()) ? it->second : ""; + v->get_metadata()->export_to_file(export_stream, prog); + } export_stream.close(); + // Remove per-variable prog files now that the final file is written + for (const auto& [name, path] : var_prog_file_paths_) + std::remove(path.c_str()); + var_prog_file_paths_.clear(); + } +} + +void Stream::flush_and_evict_transaction(unsigned int tx_id) +{ + if (metadata_exported_) { + // Sequential scenario or post-export: the metadata file is already complete; just free memory + for (const auto& [name, v] : variables_) + v->get_metadata()->evict_transaction(tx_id); + return; + } + if (metadata_export_) { + for (const auto& [name, v] : variables_) { + if (!var_prog_file_paths_.count(name)) + var_prog_file_paths_[name] = metadata_file_ + "." 
+ name + ".prog"; + std::ofstream out(var_prog_file_paths_.at(name), std::ofstream::app); + v->get_metadata()->write_transaction_to_stream(tx_id, out); + // write_transaction_to_stream also increments flushed_count_ and evicts from transaction_infos_ + } + } else { + for (const auto& [name, v] : variables_) + v->get_metadata()->evict_transaction(tx_id); } } From 4a07017264a33f5fae7ca926cff2aa4f7c1891af Mon Sep 17 00:00:00 2001 From: Fred Suter Date: Fri, 20 Feb 2026 00:05:43 -0500 Subject: [PATCH 54/92] remove dead code --- include/dtlmod/Stream.hpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/include/dtlmod/Stream.hpp b/include/dtlmod/Stream.hpp index 9a992b0..d03b74e 100644 --- a/include/dtlmod/Stream.hpp +++ b/include/dtlmod/Stream.hpp @@ -57,8 +57,6 @@ class Stream : public std::enable_shared_from_this { protected: /// \cond EXCLUDE_FROM_DOCUMENTATION - [[nodiscard]] const Transport::Method& get_transport_method() const noexcept { return transport_method_; } - [[nodiscard]] static constexpr const char* mode_to_str(Mode mode) noexcept { return (mode == Mode::Publish) ? 
"Mode::Publish" : "Mode::Subscribe"; From f902c1a0e963efcfc0d3b3a851ae557c32f63867 Mon Sep 17 00:00:00 2001 From: Fred Suter Date: Mon, 2 Mar 2026 22:29:54 -0500 Subject: [PATCH 55/92] register subscriber variable in CompressionReductionMethod during inquire --- include/dtlmod/CompressionReductionMethod.hpp | 4 ++++ src/CompressionReductionMethod.cpp | 9 +++++++++ src/Stream.cpp | 4 ++++ test/dtl_reduction.cpp | 6 ++++++ 4 files changed, 23 insertions(+) diff --git a/include/dtlmod/CompressionReductionMethod.hpp b/include/dtlmod/CompressionReductionMethod.hpp index e705dda..c46caff 100644 --- a/include/dtlmod/CompressionReductionMethod.hpp +++ b/include/dtlmod/CompressionReductionMethod.hpp @@ -79,6 +79,10 @@ class CompressionReductionMethod : public ReductionMethod { [[nodiscard]] double get_flop_amount_to_reduce_variable(const Variable& var) const override; [[nodiscard]] double get_flop_amount_to_decompress_variable(const Variable& var) const override; + + /// @brief Copy a publisher variable's parameterization to a subscriber variable. + /// Called by Stream::inquire_variable so that Engine::get() can compute decompression costs. + void propagate_for_subscriber(const Variable& publisher_var, const Variable& subscriber_var); }; /// \endcond } // namespace dtlmod diff --git a/src/CompressionReductionMethod.cpp b/src/CompressionReductionMethod.cpp index a4d39fa..f87bf39 100644 --- a/src/CompressionReductionMethod.cpp +++ b/src/CompressionReductionMethod.cpp @@ -139,4 +139,13 @@ void CompressionReductionMethod::parameterize_for_variable( // Always (re)create the parameterization — avoids field-by-field update complexity. 
per_variable_parameterizations_[&var] = std::make_shared(var, std::move(cfg)); } + +void CompressionReductionMethod::propagate_for_subscriber(const Variable& publisher_var, const Variable& subscriber_var) +{ + auto it = per_variable_parameterizations_.find(&publisher_var); + if (it != per_variable_parameterizations_.end()) + per_variable_parameterizations_[&subscriber_var] = + std::make_shared(subscriber_var, it->second->get_config()); +} + } // namespace dtlmod diff --git a/src/Stream.cpp b/src/Stream.cpp index 8f46791..05503b8 100644 --- a/src/Stream.cpp +++ b/src/Stream.cpp @@ -410,6 +410,10 @@ std::shared_ptr Stream::inquire_variable(std::string_view name) const if (var->second->is_reduced()) { new_var->is_reduced_with_ = var->second->get_reduction_method(); new_var->reduction_origin_ = var->second->reduction_origin_; + // Register the subscriber's variable in the compression method's map so that + // Engine::get() can compute decompression costs via get_flop_amount_to_decompress_variable. 
+ if (auto compressor = std::dynamic_pointer_cast(new_var->is_reduced_with_)) + compressor->propagate_for_subscriber(*var->second, *new_var); } return new_var; diff --git a/test/dtl_reduction.cpp b/test/dtl_reduction.cpp index be2252e..e9c9137 100644 --- a/test/dtl_reduction.cpp +++ b/test/dtl_reduction.cpp @@ -571,8 +571,14 @@ TEST_F(DTLReductionTest, CompressionStagingEngine) auto dtl = dtlmod::DTL::connect(); auto stream = dtl->add_stream("my-output"); auto engine = stream->open("my-output", dtlmod::Stream::Mode::Subscribe); + XBT_INFO("Wait for the publisher to have set the compression reduction operation"); + sg4::this_actor::sleep_for(1); auto var = stream->inquire_variable("var"); + XBT_INFO("Verify that the subscriber variable carries the publisher compression state"); + ASSERT_TRUE(var->is_reduced()); + ASSERT_TRUE(var->is_reduced_by_publisher()); + XBT_INFO("Get the compressed variable (decompression cost should be applied on subscriber)"); engine->begin_transaction(); ASSERT_NO_THROW(engine->get(var)); From 3cce4e5f0ef22aed437506dfb13fc227de82e87c Mon Sep 17 00:00:00 2001 From: Fred Suter Date: Wed, 4 Mar 2026 22:30:01 -0500 Subject: [PATCH 56/92] use latest version number for the installed .so --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 213e705..6194ab5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -224,7 +224,7 @@ set (CONFIG_FILES add_library(dtlmod SHARED ${SOURCE_FILES}) set_target_properties(dtlmod PROPERTIES - SOVERSION 0.2 + SOVERSION ${DTLMOD_RELEASE_VERSION} LINKER_LANGUAGE CXX PUBLIC_HEADER "${HEADER_FILES}") From 308ed00a9ddd2df0f95aeb55867444b719bf3113 Mon Sep 17 00:00:00 2001 From: Fred Suter Date: Fri, 13 Mar 2026 13:29:03 -0400 Subject: [PATCH 57/92] Propagate reduction method to subscribers for all reduction methods + test --- include/dtlmod/CompressionReductionMethod.hpp | 2 +- include/dtlmod/DecimationReductionMethod.hpp | 4 ++++ 
include/dtlmod/ReductionMethod.hpp | 1 + src/DecimationReductionMethod.cpp | 20 +++++++++++++++++++ src/Stream.cpp | 7 +++---- test/dtl_reduction.cpp | 12 +++++++++++ 6 files changed, 41 insertions(+), 5 deletions(-) diff --git a/include/dtlmod/CompressionReductionMethod.hpp b/include/dtlmod/CompressionReductionMethod.hpp index c46caff..34f30e4 100644 --- a/include/dtlmod/CompressionReductionMethod.hpp +++ b/include/dtlmod/CompressionReductionMethod.hpp @@ -82,7 +82,7 @@ class CompressionReductionMethod : public ReductionMethod { /// @brief Copy a publisher variable's parameterization to a subscriber variable. /// Called by Stream::inquire_variable so that Engine::get() can compute decompression costs. - void propagate_for_subscriber(const Variable& publisher_var, const Variable& subscriber_var); + void propagate_for_subscriber(const Variable& publisher_var, const Variable& subscriber_var) override; }; /// \endcond } // namespace dtlmod diff --git a/include/dtlmod/DecimationReductionMethod.hpp b/include/dtlmod/DecimationReductionMethod.hpp index a924bc2..f6b3cdb 100644 --- a/include/dtlmod/DecimationReductionMethod.hpp +++ b/include/dtlmod/DecimationReductionMethod.hpp @@ -98,6 +98,10 @@ class DecimationReductionMethod : public ReductionMethod { public: using ReductionMethod::ReductionMethod; + + /// @brief Copy a publisher variable's parameterization to a subscriber variable. + /// Called by Stream::inquire_variable so that subscribers can query reduced sizes. 
+ void propagate_for_subscriber(const Variable& publisher_var, const Variable& subscriber_var) override; }; ///\endcond } // namespace dtlmod diff --git a/include/dtlmod/ReductionMethod.hpp b/include/dtlmod/ReductionMethod.hpp index 5ae33b7..60fcad3 100644 --- a/include/dtlmod/ReductionMethod.hpp +++ b/include/dtlmod/ReductionMethod.hpp @@ -38,6 +38,7 @@ class ReductionMethod { get_reduced_start_and_count_for(const Variable& var, simgrid::s4u::ActorPtr publisher) const = 0; virtual double get_flop_amount_to_reduce_variable(const Variable& var) const = 0; virtual double get_flop_amount_to_decompress_variable(const Variable& /*var*/) const { return 0.0; } + virtual void propagate_for_subscriber(const Variable& /*publisher_var*/, const Variable& /*subscriber_var*/) {} /// @brief Helper function to print out the name of the ReductionMethod. /// @return The corresponding string diff --git a/src/DecimationReductionMethod.cpp b/src/DecimationReductionMethod.cpp index f7e4165..d061510 100644 --- a/src/DecimationReductionMethod.cpp +++ b/src/DecimationReductionMethod.cpp @@ -123,6 +123,26 @@ void DecimationReductionMethod::parameterize_for_variable( } } +void DecimationReductionMethod::propagate_for_subscriber(const Variable& publisher_var, const Variable& subscriber_var) +{ + auto it = per_variable_parameterizations_.find(&publisher_var); + if (it == per_variable_parameterizations_.end()) + return; + + auto pub_param = it->second; + auto sub_param = std::make_shared(subscriber_var, pub_param->get_stride(), + pub_param->get_interpolation_method(), + pub_param->get_cost_per_element()); + sub_param->set_reduced_shape(pub_param->get_reduced_shape()); + + // The subscriber receives the full reduced variable, so its local region is the entire reduced shape. 
+ const auto& reduced_shape = pub_param->get_reduced_shape(); + std::vector reduced_start(reduced_shape.size(), 0); + sub_param->set_reduced_local_start_and_count(sg4::Actor::self(), reduced_start, reduced_shape); + + per_variable_parameterizations_[&subscriber_var] = std::move(sub_param); +} + void DecimationReductionMethod::reduce_variable(const Variable& var) { auto parameterization = per_variable_parameterizations_[&var]; diff --git a/src/Stream.cpp b/src/Stream.cpp index 05503b8..510bb9c 100644 --- a/src/Stream.cpp +++ b/src/Stream.cpp @@ -410,10 +410,9 @@ std::shared_ptr Stream::inquire_variable(std::string_view name) const if (var->second->is_reduced()) { new_var->is_reduced_with_ = var->second->get_reduction_method(); new_var->reduction_origin_ = var->second->reduction_origin_; - // Register the subscriber's variable in the compression method's map so that - // Engine::get() can compute decompression costs via get_flop_amount_to_decompress_variable. - if (auto compressor = std::dynamic_pointer_cast(new_var->is_reduced_with_)) - compressor->propagate_for_subscriber(*var->second, *new_var); + // Register the subscriber's variable in the reduction method's map so that + // Engine::get() can compute costs, and subscribers can query reduced sizes. 
+ new_var->is_reduced_with_->propagate_for_subscriber(*var->second, *new_var); } return new_var; diff --git a/test/dtl_reduction.cpp b/test/dtl_reduction.cpp index e9c9137..4ac8474 100644 --- a/test/dtl_reduction.cpp +++ b/test/dtl_reduction.cpp @@ -501,9 +501,21 @@ TEST_F(DTLReductionTest, DecimationStagingEngine) auto dtl = dtlmod::DTL::connect(); auto stream = dtl->add_stream("my-output"); auto engine = stream->open("my-output", dtlmod::Stream::Mode::Subscribe); + XBT_INFO("Wait for the publisher to have set the decimation reduction operation"); + sg4::this_actor::sleep_for(0.5); auto var = stream->inquire_variable("var"); + ASSERT_TRUE(var->is_reduced()); + ASSERT_TRUE(var->is_reduced_by_publisher()); + + XBT_INFO("Verify that the subscriber can access the reduction method set by the publisher"); + auto reduction = var->get_reduction_method(); + ASSERT_TRUE(reduction != nullptr); + XBT_INFO("Verify that the subscriber can get the reduced local size"); + auto reduced_size = reduction->get_reduced_variable_local_size(*var); + ASSERT_DOUBLE_EQ(reduced_size, 5000 * 5000 * 8.0); XBT_INFO("Get the decimated variable"); + engine->begin_transaction(); ASSERT_NO_THROW(engine->get(var)); engine->end_transaction(); From b577a6d3fae72cf5b39b8f5157cf63d505136e5b Mon Sep 17 00:00:00 2001 From: Fred Suter Date: Tue, 17 Mar 2026 16:49:23 -0400 Subject: [PATCH 58/92] fix bug in SZ compression --- src/CompressionReductionMethod.cpp | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/src/CompressionReductionMethod.cpp b/src/CompressionReductionMethod.cpp index f87bf39..60d8e10 100644 --- a/src/CompressionReductionMethod.cpp +++ b/src/CompressionReductionMethod.cpp @@ -54,11 +54,12 @@ double CompressionReductionMethod::derive_compression_ratio(double accuracy, std double data_smoothness) { if (profile == "sz") { - // SZ-like prediction-based compressor: empirical fit from published benchmarks on scientific data. 
- // Higher smoothness -> better prediction -> higher ratio. - double alpha = 3.0; - double beta = 0.8; - return std::max(1.0, alpha * std::pow(-std::log10(accuracy), beta) * (0.5 + data_smoothness)); + // accuracy = absolute/relative error bound; larger = looser = more compression + // Empirical fit: ratio ≈ A × accuracy^B × (0.5 + data_smoothness) + // Calibrated to give ratio ≈ 7 at 1e-3 and ratio ≈ 2 at 1e-6. + double alpha = 24.4; + double beta = 0.181; + return std::max(1.0, alpha * std::pow(accuracy, beta) * (0.5 + data_smoothness)); } if (profile == "zfp") { // ZFP-like transform-based compressor: rate = bits-per-value derived from accuracy. From b6e51ec4be3317451e82f7499264fa205e967168 Mon Sep 17 00:00:00 2001 From: Fred Suter Date: Wed, 22 Apr 2026 16:03:28 -0400 Subject: [PATCH 59/92] this method is pure --- include/dtlmod/ReductionMethod.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/dtlmod/ReductionMethod.hpp b/include/dtlmod/ReductionMethod.hpp index 60fcad3..36c0542 100644 --- a/include/dtlmod/ReductionMethod.hpp +++ b/include/dtlmod/ReductionMethod.hpp @@ -38,7 +38,7 @@ class ReductionMethod { get_reduced_start_and_count_for(const Variable& var, simgrid::s4u::ActorPtr publisher) const = 0; virtual double get_flop_amount_to_reduce_variable(const Variable& var) const = 0; virtual double get_flop_amount_to_decompress_variable(const Variable& /*var*/) const { return 0.0; } - virtual void propagate_for_subscriber(const Variable& /*publisher_var*/, const Variable& /*subscriber_var*/) {} + virtual void propagate_for_subscriber(const Variable& /*publisher_var*/, const Variable& /*subscriber_var*/) = 0; /// @brief Helper function to print out the name of the ReductionMethod. 
/// @return The corresponding string From 0499dca0354366fea065765df277b12c617b6eed Mon Sep 17 00:00:00 2001 From: Fred Suter Date: Wed, 22 Apr 2026 17:23:59 -0400 Subject: [PATCH 60/92] add Engine::cancel_transaction() feature --- CMakeLists.txt | 1 + ChangeLog | 8 + include/dtlmod/DTLException.hpp | 2 + include/dtlmod/Engine.hpp | 10 + include/dtlmod/FileEngine.hpp | 1 + include/dtlmod/StagingEngine.hpp | 1 + src/Engine.cpp | 6 + src/FileEngine.cpp | 51 ++++- src/StagingEngine.cpp | 53 ++++- src/bindings/python/dtlmod_python.cpp | 4 + test/dtl_cancel.cpp | 282 +++++++++++++++++++++++++ test/python/dtl_cancel.py | 286 ++++++++++++++++++++++++++ test/python/unit_tests_python.py | 3 +- 13 files changed, 696 insertions(+), 12 deletions(-) create mode 100644 test/dtl_cancel.cpp create mode 100644 test/python/dtl_cancel.py diff --git a/CMakeLists.txt b/CMakeLists.txt index 6194ab5..c5b8f2d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -262,6 +262,7 @@ find_library(GTEST_LIBRARY NAMES gtest) find_path(GTEST_INCLUDE_DIR NAMES gtest/gtest.h PATHS /opt/gtest/include) if(GTEST_LIBRARY) set(TEST_FILES + test/dtl_cancel.cpp test/dtl_config.cpp test/dtl_connection.cpp test/dtl_file_engine.cpp diff --git a/ChangeLog b/ChangeLog index 5d825a0..18b2cd1 100644 --- a/ChangeLog +++ b/ChangeLog @@ -17,6 +17,14 @@ Improvements: - Memory footprint of the File engine now grows as O(N_pub) instead of O(N_pub × N_transactions) for long-running concurrent streaming workloads +API Changes: + - New Engine method: + - Engine::cancel_transaction() cancels the currently ongoing transaction + performed by this engine. This method has to be called by an external + actor not involved in the transaction. It will make the transaction raise + a dtlmod::TransactionCancelledException that must be caught if you want + the publishers and subscribers to survive the cancellation of this + transaction. 
---------------------------------------------------------------------------- DTLMod (0.4) February 16, 2026 diff --git a/include/dtlmod/DTLException.hpp b/include/dtlmod/DTLException.hpp index 240a41a..8e5f465 100644 --- a/include/dtlmod/DTLException.hpp +++ b/include/dtlmod/DTLException.hpp @@ -70,6 +70,8 @@ DECLARE_DTLMOD_EXCEPTION(UnknownCompressionOptionException, "Unknown Compression DECLARE_DTLMOD_EXCEPTION(InconsistentCompressionRatioException, "Inconsistent Compression ratio"); DECLARE_DTLMOD_EXCEPTION(SubscriberSideCompressionException, "Compression can only be applied on the publisher side"); +DECLARE_DTLMOD_EXCEPTION(TransactionCancelledException, "Transaction cancelled"); + } // namespace dtlmod #endif // __DTLMOD_EXCEPTION_HPP__ diff --git a/include/dtlmod/Engine.hpp b/include/dtlmod/Engine.hpp index faf04f4..71419c0 100644 --- a/include/dtlmod/Engine.hpp +++ b/include/dtlmod/Engine.hpp @@ -16,6 +16,7 @@ #include #include +#include #include #include "dtlmod/ActorRegistry.hpp" @@ -56,8 +57,10 @@ class Engine { std::weak_ptr stream_; bool pub_ever_present_ = false; + std::atomic cancelled_{false}; ActorRegistry publishers_; + ActorRegistry subscribers_; sg4::ActivitySet pub_transaction_; @@ -90,6 +93,8 @@ class Engine { [[nodiscard]] bool pub_ever_present() const noexcept { return pub_ever_present_; } + [[nodiscard]] bool is_cancelled() const noexcept { return cancelled_; } + // Pure virtual methods for derived classes to implement virtual void create_transport(const Transport::Method& transport_method) = 0; virtual void begin_pub_transaction() = 0; @@ -98,6 +103,7 @@ class Engine { virtual void begin_sub_transaction() = 0; virtual void end_sub_transaction() = 0; virtual void sub_close() = 0; + virtual void cancel_activities() = 0; public: /// \cond EXCLUDE_FROM_DOCUMENTATION @@ -138,6 +144,10 @@ class Engine { /// @return The id of the ongoing transaction. 
[[nodiscard]] unsigned int get_current_transaction() const noexcept { return get_current_transaction_impl(); } + /// @brief Cancel all in-flight activities of the current transaction, unblocking publishers and subscribers. + /// @note Must be called from an external actor not participating in the transaction. + void cancel_transaction(); + /// @brief Close the Engine associated to a Stream. void close(); }; diff --git a/include/dtlmod/FileEngine.hpp b/include/dtlmod/FileEngine.hpp index 6056ca4..da5d29d 100644 --- a/include/dtlmod/FileEngine.hpp +++ b/include/dtlmod/FileEngine.hpp @@ -50,6 +50,7 @@ class FileEngine : public Engine { void begin_sub_transaction() override; void end_sub_transaction() override; void sub_close() override; + void cancel_activities() override; [[nodiscard]] unsigned int get_current_transaction_impl() const noexcept override { return current_pub_transaction_id_; diff --git a/include/dtlmod/StagingEngine.hpp b/include/dtlmod/StagingEngine.hpp index 741f19d..759d4de 100644 --- a/include/dtlmod/StagingEngine.hpp +++ b/include/dtlmod/StagingEngine.hpp @@ -39,6 +39,7 @@ class StagingEngine : public Engine { void begin_sub_transaction() override; void end_sub_transaction() override; void sub_close() override; + void cancel_activities() override; [[nodiscard]] unsigned int get_current_transaction_impl() const noexcept override { return current_pub_transaction_id_; diff --git a/src/Engine.cpp b/src/Engine.cpp index 77f9e96..0ef0ffd 100644 --- a/src/Engine.cpp +++ b/src/Engine.cpp @@ -86,6 +86,12 @@ void Engine::close() publishers_.contains(sg4::Actor::self()) ? 
pub_close() : sub_close(); } +void Engine::cancel_transaction() +{ + cancelled_ = true; + cancel_activities(); +} + //////////////////////////////////////////// ///////////////// INTERNALS //////////////// //////////////////////////////////////////// diff --git a/src/FileEngine.cpp b/src/FileEngine.cpp index f91840f..2aab332 100644 --- a/src/FileEngine.cpp +++ b/src/FileEngine.cpp @@ -9,6 +9,7 @@ #include #include +#include #include #include #include @@ -22,6 +23,19 @@ XBT_LOG_NEW_DEFAULT_SUBCATEGORY(dtlmod_file_engine, dtlmod_engine, "DTL logging namespace dtlmod { /// \cond EXCLUDE_FROM_DOCUMENTATION +void FileEngine::cancel_activities() +{ + // Cancelling write activities fires on_this_completion_cb, which calls notify_all() and drains the set naturally. + for (auto& [actor, aset] : file_pub_transaction_) + for (int i = 0; i < aset.size(); i++) + aset.at(i)->cancel(); + for (auto& [actor, aset] : file_sub_transaction_) + for (int i = 0; i < aset.size(); i++) + aset.at(i)->cancel(); + pub_transaction_completed_->notify_all(); + pub_activities_completed_->notify_all(); +} + // FileEngines require to know where to (virtually) write file. 
This information is given by fullpath which has the // following format: NetZone:FileSystem:PathToDirectory FileEngine::FileEngine(std::string_view fullpath, const std::shared_ptr& stream) @@ -86,6 +100,9 @@ std::string FileEngine::get_path_to_dataset() const void FileEngine::begin_pub_transaction() { + if (is_cancelled()) + throw TransactionCancelledException(XBT_THROW_POINT); + auto self = sg4::Actor::self(); if (!pub_transaction_in_progress_) { @@ -98,10 +115,12 @@ void FileEngine::begin_pub_transaction() // Wait for the completion of the Publish activities from the previous transaction XBT_DEBUG("Wait for the completion of %u publish activities from the previous transaction", file_pub_transaction_[self].size()); - while (file_pub_transaction_[self].size() > 0) { + while (!is_cancelled() && file_pub_transaction_[self].size() > 0) { std::unique_lock lock(*(get_publishers().get_mutex())); pub_activities_completed_->wait(lock); } + if (is_cancelled()) + throw TransactionCancelledException(XBT_THROW_POINT); XBT_DEBUG("All on-flight publish activities are completed. 
Proceed with the current transaction."); get_file_transport()->clear_to_write_in_transaction(self); } @@ -150,7 +169,7 @@ void FileEngine::pub_close() XBT_DEBUG("[%s] Wait for the completion of %u publish activities from the previous transaction", get_cname(), file_pub_transaction_[self].size()); - while (file_pub_transaction_[self].size() > 0) { + while (!is_cancelled() && file_pub_transaction_[self].size() > 0) { std::unique_lock lock(*(get_publishers().get_mutex())); pub_activities_completed_->wait(lock); } @@ -172,6 +191,9 @@ void FileEngine::pub_close() void FileEngine::begin_sub_transaction() { + if (is_cancelled()) + throw TransactionCancelledException(XBT_THROW_POINT); + // Only one subscriber has to do this if (!sub_transaction_in_progress_) { sub_transaction_in_progress_ = true; @@ -182,10 +204,12 @@ void FileEngine::begin_sub_transaction() // We have publishers on that stream, wait for them to complete a transaction first if (not get_publishers().is_empty()) { std::unique_lock lock(*get_subscribers().get_mutex()); - while (completed_pub_transaction_id_ < current_sub_transaction_id_) { + while (!is_cancelled() && completed_pub_transaction_id_ < current_sub_transaction_id_) { XBT_DEBUG("Wait for publishers to end the transaction I need"); pub_transaction_completed_->wait(lock); } + if (is_cancelled()) + throw TransactionCancelledException(XBT_THROW_POINT); XBT_DEBUG("Publishers stored metadata for that transaction, proceed"); } } @@ -197,11 +221,18 @@ void FileEngine::end_sub_transaction() // The files subscribers need to read may not have been fully written. 
Wait to be notified completion of the publish // activities - if (current_sub_transaction_id_ == current_pub_transaction_id_ && not get_publishers().is_empty()) { + if (!is_cancelled() && current_sub_transaction_id_ == current_pub_transaction_id_ && + not get_publishers().is_empty()) { XBT_DEBUG("Wait for the completion of publish activities from the current transaction"); pub_activities_completed_->wait(std::unique_lock(*get_subscribers().get_mutex())); XBT_DEBUG("All on-flight publish activities are completed. Proceed with the subscribe activities."); } + if (is_cancelled()) { + transport->close_sub_files(self); + transport->clear_to_read_in_transaction(self); + sub_transaction_in_progress_ = false; + throw TransactionCancelledException(XBT_THROW_POINT); + } // Subscriber get the list of files and size to read that has been build during the get() operations auto to_read = transport->get_to_read_in_transaction_by_actor(self); @@ -211,7 +242,17 @@ void FileEngine::end_sub_transaction() file_sub_transaction_[self].push(file->read_async(size)); XBT_DEBUG("Wait for the %d subscribe activities for the transaction", file_sub_transaction_[self].size()); - file_sub_transaction_[self].wait_all(); + try { + file_sub_transaction_[self].wait_all(); + } catch (const simgrid::CancelException&) { + if (!is_cancelled()) + throw; + file_sub_transaction_[self].clear(); + transport->close_sub_files(self); + transport->clear_to_read_in_transaction(self); + sub_transaction_in_progress_ = false; + throw TransactionCancelledException(XBT_THROW_POINT); + } file_sub_transaction_[self].clear(); // Close files opened in this transaction transport->close_sub_files(self); diff --git a/src/StagingEngine.cpp b/src/StagingEngine.cpp index 443c7fd..8304fb6 100644 --- a/src/StagingEngine.cpp +++ b/src/StagingEngine.cpp @@ -6,6 +6,7 @@ #include #include +#include #include #include #include @@ -21,6 +22,17 @@ XBT_LOG_NEW_DEFAULT_SUBCATEGORY(dtlmod_staging_engine, dtlmod_engine, "DTL loggi namespace 
dtlmod { /// \cond EXCLUDE_FROM_DOCUMENTATION +void StagingEngine::cancel_activities() +{ + for (int i = 0; i < get_pub_transaction().size(); i++) + get_pub_transaction().at(i)->cancel(); + for (int i = 0; i < get_sub_transaction().size(); i++) + get_sub_transaction().at(i)->cancel(); + first_pub_transaction_started_->notify_all(); + sub_transaction_started_->notify_all(); + pub_transaction_completed_->notify_all(); +} + void StagingEngine::create_transport(const Transport::Method& transport_method) { XBT_DEBUG("Create a new Staging Engine"); @@ -43,6 +55,9 @@ std::shared_ptr StagingEngine::get_staging_transport() const void StagingEngine::begin_pub_transaction() { + if (is_cancelled()) + throw TransactionCancelledException(XBT_THROW_POINT); + if (!pub_transaction_in_progress_) { pub_transaction_in_progress_ = true; current_pub_transaction_id_++; @@ -60,18 +75,28 @@ void StagingEngine::begin_pub_transaction() // Wait for the completion of the Publish activities from the previous transaction XBT_DEBUG("[T %d] (%d) Wait for the completion of %u publish activities from the previous transaction", current_pub_transaction_id_, current_sub_transaction_id_, get_pub_transaction().size()); - get_pub_transaction().wait_all(); + try { + get_pub_transaction().wait_all(); + } catch (const simgrid::CancelException&) { + if (!is_cancelled()) + throw; + get_pub_transaction().clear(); + throw TransactionCancelledException(XBT_THROW_POINT); + } XBT_DEBUG("All on-flight publish activities are completed. 
Proceed with the current transaction."); XBT_DEBUG("%u sub activities pending", get_sub_transaction().size()); get_pub_transaction().clear(); } // Then we wait for all subscribers to be at the same transaction - while (get_subscribers().is_empty() || current_pub_transaction_id_ > current_sub_transaction_id_) { + while (!is_cancelled() && + (get_subscribers().is_empty() || current_pub_transaction_id_ > current_sub_transaction_id_)) { XBT_DEBUG("Wait for subscribers"); sub_transaction_started_->wait(lock); } - // Publisher has been notified by subscribers, it can proceed with the transaction"); + if (is_cancelled()) + throw TransactionCancelledException(XBT_THROW_POINT); + // Publisher has been notified by subscribers, it can proceed with the transaction } void StagingEngine::end_pub_transaction() @@ -121,11 +146,16 @@ void StagingEngine::pub_close() void StagingEngine::begin_sub_transaction() { + if (is_cancelled()) + throw TransactionCancelledException(XBT_THROW_POINT); + if (current_sub_transaction_id_ == 0) { // This is the first transaction // Wait for at least one publisher to start a tran std::unique_lock lock(*get_subscribers().get_mutex()); - while (current_pub_transaction_id_ == 0) + while (!is_cancelled() && current_pub_transaction_id_ == 0) first_pub_transaction_started_->wait(lock); + if (is_cancelled()) + throw TransactionCancelledException(XBT_THROW_POINT); XBT_DEBUG("Publishers have started a transaction, create rendez-vous points"); // We now know the number of publishers, subscriber can create mailboxes/mqs with publishers get_staging_transport()->create_rendez_vous_points(); @@ -148,8 +178,10 @@ void StagingEngine::begin_sub_transaction() } std::unique_lock lock(*get_subscribers().get_mutex()); - while (completed_pub_transaction_id_ < current_sub_transaction_id_) + while (!is_cancelled() && completed_pub_transaction_id_ < current_sub_transaction_id_) pub_transaction_completed_->wait(lock); + if (is_cancelled()) + throw 
TransactionCancelledException(XBT_THROW_POINT); } void StagingEngine::end_sub_transaction() @@ -160,7 +192,16 @@ void StagingEngine::end_sub_transaction() if (get_subscribers().is_last_at_barrier()) { XBT_DEBUG("Wait for the %d subscribe activities for the transaction", get_sub_transaction().size()); - get_sub_transaction().wait_all(); + try { + get_sub_transaction().wait_all(); + } catch (const simgrid::CancelException&) { + if (!is_cancelled()) + throw; + get_sub_transaction().clear(); + sub_transaction_in_progress_ = false; + num_subscribers_starting_--; + throw TransactionCancelledException(XBT_THROW_POINT); + } XBT_DEBUG("All on-flight subscribe activities are completed. Proceed with the current transaction."); get_sub_transaction().clear(); } diff --git a/src/bindings/python/dtlmod_python.cpp b/src/bindings/python/dtlmod_python.cpp index 9f96a02..e0c60ab 100644 --- a/src/bindings/python/dtlmod_python.cpp +++ b/src/bindings/python/dtlmod_python.cpp @@ -88,6 +88,8 @@ PYBIND11_MODULE(dtlmod, m) py::register_exception(m, "InconsistentCompressionRatioException"); py::register_exception(m, "SubscriberSideCompressionException"); + py::register_exception(m, "TransactionCancelledException"); + /* Class DTL */ py::class_>(m, "DTL", "Data Transport Layer") .def_static("create", py::overload_cast(&DTL::create), py::call_guard(), @@ -233,6 +235,8 @@ PYBIND11_MODULE(dtlmod, m) "End a transaction on this Engine") .def_property_readonly("current_transaction", &Engine::get_current_transaction, "The id of the current transaction on this Engine (read-only)") + .def("cancel_transaction", &Engine::cancel_transaction, py::call_guard(), + "Cancel all in-flight activities of the current transaction (must be called from an external actor)") .def("close", &Engine::close, py::call_guard(), "Close this Engine"); py::enum_(engine, "Type", "The type of Engine") diff --git a/test/dtl_cancel.cpp b/test/dtl_cancel.cpp new file mode 100644 index 0000000..85560ad --- /dev/null +++ 
b/test/dtl_cancel.cpp @@ -0,0 +1,282 @@ +/* Copyright (c) 2026. The SWAT Team. All rights reserved. */ + +/* This program is free software; you can redistribute it and/or modify it + * under the terms of the license (GNU LGPL) which comes with this package. */ + +#include + +#include +#include +#include + +#include +#include +#include + +#include "./test_util.hpp" +#include "dtlmod/DTL.hpp" +#include "dtlmod/DTLException.hpp" + +XBT_LOG_NEW_DEFAULT_CATEGORY(dtlmod_test_cancel, "Logging category for this dtlmod test"); + +namespace sg4 = simgrid::s4u; +namespace sgfs = simgrid::fsmod; + +class DTLCancelTest : public ::testing::Test { +public: + DTLCancelTest() = default; + + sg4::NetZone* add_cluster(sg4::NetZone* root, const std::string& suffix, const int num_hosts) + { + auto* cluster = root->add_netzone_star("cluster" + suffix); + cluster->set_gateway(cluster->add_router("cluster" + suffix + "-router")); + auto* backbone = cluster->add_link("backbone" + suffix, "100Gbps")->set_latency("100us"); + for (int i = 0; i < num_hosts; i++) { + std::string name = "host-" + std::to_string(i) + suffix; + const auto* host = cluster->add_host(name, "1Gf"); + const auto* link = cluster->add_link(name + "_link", "10Gbps")->set_latency("10us"); + cluster->add_route(host, nullptr, {link, backbone}); + } + cluster->seal(); + return cluster; + } + + void setup_staging_platform() + { + auto* root = sg4::Engine::get_instance()->get_netzone_root(); + auto* internet = root->add_link("internet", "500MBps")->set_latency("1ms"); + auto* prod_cluster = add_cluster(root, ".prod", 4); + auto* cons_cluster = add_cluster(root, ".cons", 4); + root->add_route(prod_cluster, cons_cluster, {internet}); + root->seal(); + dtlmod::DTL::create(); + } + + void setup_file_platform() + { + sg4::NetZone* cluster = sg4::Engine::get_instance()->get_netzone_root()->add_netzone_star("cluster"); + auto pfs_server = cluster->add_host("pfs_server", "1Gf"); + std::vector pfs_disks; + for (int i = 0; i < 4; i++) + 
pfs_disks.push_back(pfs_server->add_disk("pfs_disk" + std::to_string(i), "2.5GBps", "1.2GBps")); + auto remote_storage = sgfs::JBODStorage::create("pfs_storage", pfs_disks); + remote_storage->set_raid_level(sgfs::JBODStorage::RAID::RAID5); + + std::vector> local_storages; + for (int i = 0; i < 4; i++) { + std::string hostname = "node-" + std::to_string(i); + auto* host = cluster->add_host(hostname, "1Gf"); + auto* disk = host->add_disk(hostname + "_disk", "5.5GBps", "2.1GBps"); + local_storages.push_back(sgfs::OneDiskStorage::create(hostname + "_local_storage", disk)); + std::string linkname = "link_" + std::to_string(i); + auto* link_up = cluster->add_link(linkname + "_UP", "1Gbps"); + auto* link_down = cluster->add_link(linkname + "_DOWN", "1Gbps"); + auto* loopback = + cluster->add_link(hostname + "_loopback", "10Gbps")->set_sharing_policy(sg4::Link::SharingPolicy::FATPIPE); + cluster->add_route(host, nullptr, {sg4::LinkInRoute(link_up)}, false); + cluster->add_route(nullptr, host, {sg4::LinkInRoute(link_down)}, false); + cluster->add_route(host, host, {loopback}); + } + cluster->seal(); + + auto my_fs = sgfs::FileSystem::create("my_fs"); + sgfs::FileSystem::register_file_system(cluster, my_fs); + my_fs->mount_partition("/pfs/", remote_storage, "500TB"); + for (int i = 0; i < 4; i++) + my_fs->mount_partition("/node-" + std::to_string(i) + "/scratch/", local_storages.at(i), "1TB"); + + dtlmod::DTL::create(); + } +}; + +// Publisher is stuck in begin_transaction() waiting for a subscriber that never shows up. +// An external canceller fires after 0.5s, unblocking the publisher with TransactionCancelledException. +// The subscriber registers but sleeps past the cancellation point, then gets TransactionCancelledException +// immediately on its own begin_transaction() because cancelled_ is already true. 
+TEST_F(DTLCancelTest, CancelStagingTransaction_WaitingForSubscriber_MQ) +{ + DO_TEST_WITH_FORK([this]() { + this->setup_staging_platform(); + auto* pub_host = sg4::Host::by_name("host-0.prod"); + auto* sub_host = sg4::Host::by_name("host-0.cons"); + auto* wdog_host = sg4::Host::by_name("host-1.prod"); + + pub_host->add_actor("PubTestActor", [wdog_host]() { + auto dtl = dtlmod::DTL::connect(); + auto stream = dtl->add_stream("my-output"); + stream->set_engine_type(dtlmod::Engine::Type::Staging); + stream->set_transport_method(dtlmod::Transport::Method::MQ); + auto var = stream->define_variable("var", {1000, 1000}, {0, 0}, {1000, 1000}, sizeof(double)); + auto engine = stream->open("my-output", dtlmod::Stream::Mode::Publish); + + wdog_host->add_actor("Canceller", [engine]() { + sg4::this_actor::sleep_for(0.5); + XBT_INFO("Cancelling the transaction"); + engine->cancel_transaction(); + }); + + XBT_INFO("Begin transaction (will block waiting for subscriber)"); + ASSERT_THROW(engine->begin_transaction(), dtlmod::TransactionCancelledException); + XBT_INFO("Publisher caught TransactionCancelledException as expected"); + dtlmod::DTL::disconnect(); + }); + + sub_host->add_actor("SubTestActor", []() { + auto dtl = dtlmod::DTL::connect(); + auto stream = dtl->add_stream("my-output"); + auto engine = stream->open("my-output", dtlmod::Stream::Mode::Subscribe); + auto var_sub = stream->inquire_variable("var"); + + sg4::this_actor::sleep_for(2.0); // sleep past the cancellation point + XBT_INFO("Begin transaction (cancelled_ already true)"); + ASSERT_THROW(engine->begin_transaction(), dtlmod::TransactionCancelledException); + XBT_INFO("Subscriber caught TransactionCancelledException as expected"); + dtlmod::DTL::disconnect(); + }); + + ASSERT_NO_THROW(sg4::Engine::get_instance()->run()); + }); +} + +// Same scenario with Mailbox transport. 
+TEST_F(DTLCancelTest, CancelStagingTransaction_WaitingForSubscriber_Mailbox) +{ + DO_TEST_WITH_FORK([this]() { + this->setup_staging_platform(); + auto* pub_host = sg4::Host::by_name("host-0.prod"); + auto* sub_host = sg4::Host::by_name("host-0.cons"); + auto* wdog_host = sg4::Host::by_name("host-1.prod"); + + pub_host->add_actor("PubTestActor", [wdog_host]() { + auto dtl = dtlmod::DTL::connect(); + auto stream = dtl->add_stream("my-output"); + stream->set_engine_type(dtlmod::Engine::Type::Staging); + stream->set_transport_method(dtlmod::Transport::Method::Mailbox); + auto var = stream->define_variable("var", {1000, 1000}, {0, 0}, {1000, 1000}, sizeof(double)); + auto engine = stream->open("my-output", dtlmod::Stream::Mode::Publish); + + wdog_host->add_actor("Canceller", [engine]() { + sg4::this_actor::sleep_for(0.5); + XBT_INFO("Cancelling the transaction"); + engine->cancel_transaction(); + }); + + XBT_INFO("Begin transaction (will block waiting for subscriber)"); + ASSERT_THROW(engine->begin_transaction(), dtlmod::TransactionCancelledException); + XBT_INFO("Publisher caught TransactionCancelledException as expected"); + dtlmod::DTL::disconnect(); + }); + + sub_host->add_actor("SubTestActor", []() { + auto dtl = dtlmod::DTL::connect(); + auto stream = dtl->add_stream("my-output"); + auto engine = stream->open("my-output", dtlmod::Stream::Mode::Subscribe); + auto var_sub = stream->inquire_variable("var"); + + sg4::this_actor::sleep_for(2.0); + XBT_INFO("Begin transaction (cancelled_ already true)"); + ASSERT_THROW(engine->begin_transaction(), dtlmod::TransactionCancelledException); + XBT_INFO("Subscriber caught TransactionCancelledException as expected"); + dtlmod::DTL::disconnect(); + }); + + ASSERT_NO_THROW(sg4::Engine::get_instance()->run()); + }); +} + +// Subscriber is stuck in begin_transaction() waiting for the publisher to start a transaction. +// Publisher opens the stream but never calls begin_transaction(). 
+// Canceller fires after 0.5s, unblocking the subscriber. +// Publisher then gets TransactionCancelledException immediately on its begin_transaction(). +TEST_F(DTLCancelTest, CancelStagingTransaction_WaitingForPublisher_MQ) +{ + DO_TEST_WITH_FORK([this]() { + this->setup_staging_platform(); + auto* pub_host = sg4::Host::by_name("host-0.prod"); + auto* sub_host = sg4::Host::by_name("host-0.cons"); + auto* wdog_host = sg4::Host::by_name("host-1.prod"); + + pub_host->add_actor("PubTestActor", [wdog_host]() { + auto dtl = dtlmod::DTL::connect(); + auto stream = dtl->add_stream("my-output"); + stream->set_engine_type(dtlmod::Engine::Type::Staging); + stream->set_transport_method(dtlmod::Transport::Method::MQ); + auto var = stream->define_variable("var", {1000, 1000}, {0, 0}, {1000, 1000}, sizeof(double)); + auto engine = stream->open("my-output", dtlmod::Stream::Mode::Publish); + + wdog_host->add_actor("Canceller", [engine]() { + sg4::this_actor::sleep_for(0.5); + XBT_INFO("Cancelling the transaction"); + engine->cancel_transaction(); + }); + + sg4::this_actor::sleep_for(2.0); // sleep past the cancellation point + XBT_INFO("Begin transaction (cancelled_ already true)"); + ASSERT_THROW(engine->begin_transaction(), dtlmod::TransactionCancelledException); + XBT_INFO("Publisher caught TransactionCancelledException as expected"); + dtlmod::DTL::disconnect(); + }); + + sub_host->add_actor("SubTestActor", []() { + auto dtl = dtlmod::DTL::connect(); + auto stream = dtl->add_stream("my-output"); + auto engine = stream->open("my-output", dtlmod::Stream::Mode::Subscribe); + auto var_sub = stream->inquire_variable("var"); + + XBT_INFO("Begin transaction (will block waiting for publisher to start a transaction)"); + ASSERT_THROW(engine->begin_transaction(), dtlmod::TransactionCancelledException); + XBT_INFO("Subscriber caught TransactionCancelledException as expected"); + dtlmod::DTL::disconnect(); + }); + + ASSERT_NO_THROW(sg4::Engine::get_instance()->run()); + }); +} + +// 
FileEngine: subscriber is stuck in begin_transaction() waiting for the publisher to complete a transaction. +// Publisher opens the stream and registers but never calls begin_transaction(). +// Canceller fires after 0.5s, unblocking the subscriber. +TEST_F(DTLCancelTest, CancelFileEngineTransaction_WaitingForPublisher) +{ + DO_TEST_WITH_FORK([this]() { + this->setup_file_platform(); + auto* pub_host = sg4::Host::by_name("node-0"); + auto* sub_host = sg4::Host::by_name("node-1"); + auto* wdog_host = sg4::Host::by_name("node-2"); + + pub_host->add_actor("PubTestActor", [wdog_host]() { + auto dtl = dtlmod::DTL::connect(); + auto stream = dtl->add_stream("my-output"); + stream->set_transport_method(dtlmod::Transport::Method::File); + stream->set_engine_type(dtlmod::Engine::Type::File); + auto var = stream->define_variable("var", {1000, 1000}, {0, 0}, {1000, 1000}, sizeof(double)); + auto engine = stream->open("cluster:my_fs:/node-0/scratch/my-output", dtlmod::Stream::Mode::Publish); + + wdog_host->add_actor("Canceller", [engine]() { + sg4::this_actor::sleep_for(0.5); + XBT_INFO("Cancelling the transaction"); + engine->cancel_transaction(); + }); + + sg4::this_actor::sleep_for(2.0); // sleep past the cancellation point + XBT_INFO("Begin transaction (cancelled_ already true)"); + ASSERT_THROW(engine->begin_transaction(), dtlmod::TransactionCancelledException); + XBT_INFO("Publisher caught TransactionCancelledException as expected"); + dtlmod::DTL::disconnect(); + }); + + sub_host->add_actor("SubTestActor", []() { + auto dtl = dtlmod::DTL::connect(); + auto stream = dtl->add_stream("my-output"); + auto engine = stream->open("cluster:my_fs:/node-0/scratch/my-output", dtlmod::Stream::Mode::Subscribe); + auto var_sub = stream->inquire_variable("var"); + + XBT_INFO("Begin transaction (will block waiting for publisher to complete a transaction)"); + ASSERT_THROW(engine->begin_transaction(), dtlmod::TransactionCancelledException); + XBT_INFO("Subscriber caught 
TransactionCancelledException as expected"); + dtlmod::DTL::disconnect(); + }); + + ASSERT_NO_THROW(sg4::Engine::get_instance()->run()); + }); +} diff --git a/test/python/dtl_cancel.py b/test/python/dtl_cancel.py new file mode 100644 index 0000000..bfc57c9 --- /dev/null +++ b/test/python/dtl_cancel.py @@ -0,0 +1,286 @@ +# Copyright (c) 2026. The SWAT Team. All rights reserved. +# +# This program is free software; you can redistribute it and/or modify it +# under the terms of the license (GNU LGPL) which comes with this package. + +import ctypes +import sys +import multiprocessing +from simgrid import Engine, Host, this_actor, LinkInRoute +from dtlmod import DTL, Engine as DTLEngine, Stream, Transport, TransactionCancelledException + + +def add_cluster(root, suffix, num_hosts): + cluster = root.add_netzone_star(f"cluster{suffix}") + cluster.set_gateway(cluster.add_router(f"cluster{suffix}-router")) + backbone = cluster.add_link(f"backbone{suffix}", "100Gbps").set_latency("100us") + for h in range(num_hosts): + host = cluster.add_host(f"host-{h}{suffix}", "1Gf") + link = cluster.add_link(f"host-{h}{suffix}_link", "10Gbps").set_latency("10us") + cluster.add_route(host, None, [backbone, link]) + cluster.seal() + return cluster + + +def setup_staging_platform(): + e = Engine(sys.argv) + e.set_log_control("no_loc") + e.set_log_control("root.thresh:critical") + root = e.netzone_root + internet = root.add_link("internet", "500Mbps").set_latency("1ms") + prod_cluster = add_cluster(root, ".prod", 4) + cons_cluster = add_cluster(root, ".cons", 4) + root.add_route(prod_cluster, cons_cluster, [internet]) + root.seal() + DTL.create() + return e + + +def setup_file_platform(): + e = Engine(sys.argv) + e.set_log_control("no_loc") + e.set_log_control("root.thresh:critical") + from fsmod import FileSystem, JBODStorage, OneDiskStorage + + cluster = e.netzone_root.add_netzone_star("cluster") + pfs_server = cluster.add_host("pfs_server", "1Gf") + pfs_disks = 
[pfs_server.add_disk(f"pfs_disk{i}", "2.5GBps", "1.2GBps") for i in range(4)] + remote_storage = JBODStorage.create("pfs_storage", pfs_disks) + remote_storage.set_raid_level(JBODStorage.RAID.RAID5) + + local_storages = [] + for i in range(4): + hostname = f"node-{i}" + host = cluster.add_host(hostname, "1Gf") + disk = host.add_disk(f"{hostname}_disk", "5.5GBps", "2.1GBps") + local_storages.append(OneDiskStorage.create(f"{hostname}_local_storage", disk)) + link_up = cluster.add_link(f"link_{i}_UP", "1Gbps") + link_down = cluster.add_link(f"link_{i}_DOWN", "1Gbps") + loopback = cluster.add_link(f"{hostname}_loopback", "10Gbps") + cluster.add_route(host, None, [LinkInRoute(link_up)], False) + cluster.add_route(None, host, [LinkInRoute(link_down)], False) + cluster.add_route(host, host, [loopback]) + cluster.seal() + + my_fs = FileSystem.create("my_fs") + FileSystem.register_file_system(cluster, my_fs) + my_fs.mount_partition("/pfs/", remote_storage, "500TB") + for i in range(4): + my_fs.mount_partition(f"/node-{i}/scratch/", local_storages[i], "1TB") + + DTL.create() + return e + + +# Publisher stuck in begin_transaction() waiting for a subscriber that never shows up. +# Canceller fires at t=0.5s. Publisher catches TransactionCancelledException. +# Subscriber sleeps past the cancel point, then gets it immediately on its begin_transaction(). 
+def run_test_cancel_staging_waiting_for_subscriber_mq(): + e = setup_staging_platform() + engine_ref = [None] + + def pub_actor(): + dtl = DTL.connect() + stream = dtl.add_stream("my-output") + stream.set_engine_type(DTLEngine.Type.Staging).set_transport_method(Transport.Method.MQ) + stream.define_variable("var", (1000, 1000), (0, 0), (1000, 1000), ctypes.sizeof(ctypes.c_double)) + engine = stream.open("my-output", Stream.Mode.Publish) + engine_ref[0] = engine + + Host.by_name("host-1.prod").add_actor("Canceller", canceller_actor) + + this_actor.info("Begin transaction (will block waiting for subscriber)") + try: + engine.begin_transaction() + assert False, "Expected TransactionCancelledException" + except TransactionCancelledException: + this_actor.info("Publisher caught TransactionCancelledException as expected") + DTL.disconnect() + + def sub_actor(): + dtl = DTL.connect() + stream = dtl.add_stream("my-output") + engine = stream.open("my-output", Stream.Mode.Subscribe) + stream.inquire_variable("var") + this_actor.sleep_for(2.0) + this_actor.info("Begin transaction (cancelled_ already true)") + try: + engine.begin_transaction() + assert False, "Expected TransactionCancelledException" + except TransactionCancelledException: + this_actor.info("Subscriber caught TransactionCancelledException as expected") + DTL.disconnect() + + def canceller_actor(): + this_actor.sleep_for(0.5) + this_actor.info("Cancelling the transaction") + engine_ref[0].cancel_transaction() + + Host.by_name("host-0.prod").add_actor("PubTestActor", pub_actor) + Host.by_name("host-0.cons").add_actor("SubTestActor", sub_actor) + e.run() + + +# Same scenario with Mailbox transport. 
+def run_test_cancel_staging_waiting_for_subscriber_mailbox(): + e = setup_staging_platform() + engine_ref = [None] + + def pub_actor(): + dtl = DTL.connect() + stream = dtl.add_stream("my-output") + stream.set_engine_type(DTLEngine.Type.Staging).set_transport_method(Transport.Method.Mailbox) + stream.define_variable("var", (1000, 1000), (0, 0), (1000, 1000), ctypes.sizeof(ctypes.c_double)) + engine = stream.open("my-output", Stream.Mode.Publish) + engine_ref[0] = engine + + Host.by_name("host-1.prod").add_actor("Canceller", canceller_actor) + + this_actor.info("Begin transaction (will block waiting for subscriber)") + try: + engine.begin_transaction() + assert False, "Expected TransactionCancelledException" + except TransactionCancelledException: + this_actor.info("Publisher caught TransactionCancelledException as expected") + DTL.disconnect() + + def sub_actor(): + dtl = DTL.connect() + stream = dtl.add_stream("my-output") + engine = stream.open("my-output", Stream.Mode.Subscribe) + stream.inquire_variable("var") + this_actor.sleep_for(2.0) + this_actor.info("Begin transaction (cancelled_ already true)") + try: + engine.begin_transaction() + assert False, "Expected TransactionCancelledException" + except TransactionCancelledException: + this_actor.info("Subscriber caught TransactionCancelledException as expected") + DTL.disconnect() + + def canceller_actor(): + this_actor.sleep_for(0.5) + this_actor.info("Cancelling the transaction") + engine_ref[0].cancel_transaction() + + Host.by_name("host-0.prod").add_actor("PubTestActor", pub_actor) + Host.by_name("host-0.cons").add_actor("SubTestActor", sub_actor) + e.run() + + +# Subscriber stuck in begin_transaction() waiting for the publisher to start a transaction. +# Publisher opens the stream but sleeps before calling begin_transaction(). +# Canceller fires at t=0.5s. Subscriber catches TransactionCancelledException. 
+def run_test_cancel_staging_waiting_for_publisher_mq(): + e = setup_staging_platform() + engine_ref = [None] + + def pub_actor(): + dtl = DTL.connect() + stream = dtl.add_stream("my-output") + stream.set_engine_type(DTLEngine.Type.Staging).set_transport_method(Transport.Method.MQ) + stream.define_variable("var", (1000, 1000), (0, 0), (1000, 1000), ctypes.sizeof(ctypes.c_double)) + engine = stream.open("my-output", Stream.Mode.Publish) + engine_ref[0] = engine + + Host.by_name("host-1.prod").add_actor("Canceller", canceller_actor) + + this_actor.sleep_for(2.0) + this_actor.info("Begin transaction (cancelled_ already true)") + try: + engine.begin_transaction() + assert False, "Expected TransactionCancelledException" + except TransactionCancelledException: + this_actor.info("Publisher caught TransactionCancelledException as expected") + DTL.disconnect() + + def sub_actor(): + dtl = DTL.connect() + stream = dtl.add_stream("my-output") + engine = stream.open("my-output", Stream.Mode.Subscribe) + stream.inquire_variable("var") + this_actor.info("Begin transaction (will block waiting for publisher to start a transaction)") + try: + engine.begin_transaction() + assert False, "Expected TransactionCancelledException" + except TransactionCancelledException: + this_actor.info("Subscriber caught TransactionCancelledException as expected") + DTL.disconnect() + + def canceller_actor(): + this_actor.sleep_for(0.5) + this_actor.info("Cancelling the transaction") + engine_ref[0].cancel_transaction() + + Host.by_name("host-0.prod").add_actor("PubTestActor", pub_actor) + Host.by_name("host-0.cons").add_actor("SubTestActor", sub_actor) + e.run() + + +# FileEngine: subscriber stuck in begin_transaction() waiting for publisher to complete a transaction. +# Publisher opens the stream but sleeps before calling begin_transaction(). +# Canceller fires at t=0.5s. Subscriber catches TransactionCancelledException. 
+def run_test_cancel_file_engine_waiting_for_publisher(): + e = setup_file_platform() + engine_ref = [None] + + def pub_actor(): + dtl = DTL.connect() + stream = dtl.add_stream("my-output") + stream.set_engine_type(DTLEngine.Type.File).set_transport_method(Transport.Method.File) + stream.define_variable("var", (1000, 1000), (0, 0), (1000, 1000), ctypes.sizeof(ctypes.c_double)) + engine = stream.open("cluster:my_fs:/node-0/scratch/my-output", Stream.Mode.Publish) + engine_ref[0] = engine + + Host.by_name("node-2").add_actor("Canceller", canceller_actor) + + this_actor.sleep_for(2.0) + this_actor.info("Begin transaction (cancelled_ already true)") + try: + engine.begin_transaction() + assert False, "Expected TransactionCancelledException" + except TransactionCancelledException: + this_actor.info("Publisher caught TransactionCancelledException as expected") + DTL.disconnect() + + def sub_actor(): + dtl = DTL.connect() + stream = dtl.add_stream("my-output") + engine = stream.open("cluster:my_fs:/node-0/scratch/my-output", Stream.Mode.Subscribe) + stream.inquire_variable("var") + this_actor.info("Begin transaction (will block waiting for publisher to complete a transaction)") + try: + engine.begin_transaction() + assert False, "Expected TransactionCancelledException" + except TransactionCancelledException: + this_actor.info("Subscriber caught TransactionCancelledException as expected") + DTL.disconnect() + + def canceller_actor(): + this_actor.sleep_for(0.5) + this_actor.info("Cancelling the transaction") + engine_ref[0].cancel_transaction() + + Host.by_name("node-0").add_actor("PubTestActor", pub_actor) + Host.by_name("node-1").add_actor("SubTestActor", sub_actor) + e.run() + + +if __name__ == '__main__': + tests = [ + run_test_cancel_staging_waiting_for_subscriber_mq, + run_test_cancel_staging_waiting_for_subscriber_mailbox, + run_test_cancel_staging_waiting_for_publisher_mq, + run_test_cancel_file_engine_waiting_for_publisher, + ] + + for test in tests: + 
print(f"\nRun {test.__name__} ...") + p = multiprocessing.Process(target=test) + p.start() + p.join() + + if p.exitcode != 0: + print(f"FAILED: {test.__name__} (exit code {p.exitcode})") + else: + print(f"PASSED: {test.__name__}") diff --git a/test/python/unit_tests_python.py b/test/python/unit_tests_python.py index 9e9f7be..3ac92b8 100644 --- a/test/python/unit_tests_python.py +++ b/test/python/unit_tests_python.py @@ -9,7 +9,8 @@ "dtl_staging_engine.py", "dtl_stream.py", "dtl_variable.py", - "dtl_reduction.py" + "dtl_reduction.py", + "dtl_cancel.py" ] def run_script(script): From 0c5deb08030f7cb9ace0738a0c569c7aa814c2d3 Mon Sep 17 00:00:00 2001 From: Fred Suter Date: Wed, 22 Apr 2026 17:31:26 -0400 Subject: [PATCH 61/92] [CodeFactor] no multiple spaces before operator --- test/python/dtl_cancel.py | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/test/python/dtl_cancel.py b/test/python/dtl_cancel.py index bfc57c9..0a21264 100644 --- a/test/python/dtl_cancel.py +++ b/test/python/dtl_cancel.py @@ -54,9 +54,9 @@ def setup_file_platform(): host = cluster.add_host(hostname, "1Gf") disk = host.add_disk(f"{hostname}_disk", "5.5GBps", "2.1GBps") local_storages.append(OneDiskStorage.create(f"{hostname}_local_storage", disk)) - link_up = cluster.add_link(f"link_{i}_UP", "1Gbps") + link_up = cluster.add_link(f"link_{i}_UP", "1Gbps") link_down = cluster.add_link(f"link_{i}_DOWN", "1Gbps") - loopback = cluster.add_link(f"{hostname}_loopback", "10Gbps") + loopback = cluster.add_link(f"{hostname}_loopback", "10Gbps") cluster.add_route(host, None, [LinkInRoute(link_up)], False) cluster.add_route(None, host, [LinkInRoute(link_down)], False) cluster.add_route(host, host, [loopback]) @@ -80,7 +80,7 @@ def run_test_cancel_staging_waiting_for_subscriber_mq(): engine_ref = [None] def pub_actor(): - dtl = DTL.connect() + dtl = DTL.connect() stream = dtl.add_stream("my-output") 
stream.set_engine_type(DTLEngine.Type.Staging).set_transport_method(Transport.Method.MQ) stream.define_variable("var", (1000, 1000), (0, 0), (1000, 1000), ctypes.sizeof(ctypes.c_double)) @@ -98,9 +98,9 @@ def pub_actor(): DTL.disconnect() def sub_actor(): - dtl = DTL.connect() - stream = dtl.add_stream("my-output") - engine = stream.open("my-output", Stream.Mode.Subscribe) + dtl = DTL.connect() + stream = dtl.add_stream("my-output") + engine = stream.open("my-output", Stream.Mode.Subscribe) stream.inquire_variable("var") this_actor.sleep_for(2.0) this_actor.info("Begin transaction (cancelled_ already true)") @@ -127,7 +127,7 @@ def run_test_cancel_staging_waiting_for_subscriber_mailbox(): engine_ref = [None] def pub_actor(): - dtl = DTL.connect() + dtl = DTL.connect() stream = dtl.add_stream("my-output") stream.set_engine_type(DTLEngine.Type.Staging).set_transport_method(Transport.Method.Mailbox) stream.define_variable("var", (1000, 1000), (0, 0), (1000, 1000), ctypes.sizeof(ctypes.c_double)) @@ -145,9 +145,9 @@ def pub_actor(): DTL.disconnect() def sub_actor(): - dtl = DTL.connect() - stream = dtl.add_stream("my-output") - engine = stream.open("my-output", Stream.Mode.Subscribe) + dtl = DTL.connect() + stream = dtl.add_stream("my-output") + engine = stream.open("my-output", Stream.Mode.Subscribe) stream.inquire_variable("var") this_actor.sleep_for(2.0) this_actor.info("Begin transaction (cancelled_ already true)") @@ -176,7 +176,7 @@ def run_test_cancel_staging_waiting_for_publisher_mq(): engine_ref = [None] def pub_actor(): - dtl = DTL.connect() + dtl = DTL.connect() stream = dtl.add_stream("my-output") stream.set_engine_type(DTLEngine.Type.Staging).set_transport_method(Transport.Method.MQ) stream.define_variable("var", (1000, 1000), (0, 0), (1000, 1000), ctypes.sizeof(ctypes.c_double)) @@ -195,9 +195,9 @@ def pub_actor(): DTL.disconnect() def sub_actor(): - dtl = DTL.connect() - stream = dtl.add_stream("my-output") - engine = stream.open("my-output", 
Stream.Mode.Subscribe) + dtl = DTL.connect() + stream = dtl.add_stream("my-output") + engine = stream.open("my-output", Stream.Mode.Subscribe) stream.inquire_variable("var") this_actor.info("Begin transaction (will block waiting for publisher to start a transaction)") try: @@ -225,7 +225,7 @@ def run_test_cancel_file_engine_waiting_for_publisher(): engine_ref = [None] def pub_actor(): - dtl = DTL.connect() + dtl = DTL.connect() stream = dtl.add_stream("my-output") stream.set_engine_type(DTLEngine.Type.File).set_transport_method(Transport.Method.File) stream.define_variable("var", (1000, 1000), (0, 0), (1000, 1000), ctypes.sizeof(ctypes.c_double)) @@ -244,9 +244,9 @@ def pub_actor(): DTL.disconnect() def sub_actor(): - dtl = DTL.connect() - stream = dtl.add_stream("my-output") - engine = stream.open("cluster:my_fs:/node-0/scratch/my-output", Stream.Mode.Subscribe) + dtl = DTL.connect() + stream = dtl.add_stream("my-output") + engine = stream.open("cluster:my_fs:/node-0/scratch/my-output", Stream.Mode.Subscribe) stream.inquire_variable("var") this_actor.info("Begin transaction (will block waiting for publisher to complete a transaction)") try: From 88653923a8df25b3819084603d41c6c8ad99274d Mon Sep 17 00:00:00 2001 From: Fred Suter Date: Thu, 23 Apr 2026 10:35:28 -0400 Subject: [PATCH 62/92] one more test of cancelling a transaction --- src/StagingEngine.cpp | 12 +++++++ test/dtl_cancel.cpp | 68 +++++++++++++++++++++++++++++++++++ test/python/dtl_cancel.py | 74 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 154 insertions(+) diff --git a/src/StagingEngine.cpp b/src/StagingEngine.cpp index 8304fb6..8391522 100644 --- a/src/StagingEngine.cpp +++ b/src/StagingEngine.cpp @@ -82,6 +82,11 @@ void StagingEngine::begin_pub_transaction() throw; get_pub_transaction().clear(); throw TransactionCancelledException(XBT_THROW_POINT); + } catch (const simgrid::NetworkFailureException&) { + if (!is_cancelled()) + throw; + get_pub_transaction().clear(); + throw 
TransactionCancelledException(XBT_THROW_POINT); } XBT_DEBUG("All on-flight publish activities are completed. Proceed with the current transaction."); XBT_DEBUG("%u sub activities pending", get_sub_transaction().size()); @@ -201,6 +206,13 @@ void StagingEngine::end_sub_transaction() sub_transaction_in_progress_ = false; num_subscribers_starting_--; throw TransactionCancelledException(XBT_THROW_POINT); + } catch (const simgrid::NetworkFailureException&) { + if (!is_cancelled()) + throw; + get_sub_transaction().clear(); + sub_transaction_in_progress_ = false; + num_subscribers_starting_--; + throw TransactionCancelledException(XBT_THROW_POINT); } XBT_DEBUG("All on-flight subscribe activities are completed. Proceed with the current transaction."); get_sub_transaction().clear(); diff --git a/test/dtl_cancel.cpp b/test/dtl_cancel.cpp index 85560ad..c844349 100644 --- a/test/dtl_cancel.cpp +++ b/test/dtl_cancel.cpp @@ -52,6 +52,17 @@ class DTLCancelTest : public ::testing::Test { dtlmod::DTL::create(); } + void setup_slow_staging_platform() + { + auto* root = sg4::Engine::get_instance()->get_netzone_root(); + auto* internet = root->add_link("internet", "1MBps")->set_latency("1ms"); + auto* prod_cluster = add_cluster(root, ".prod", 4); + auto* cons_cluster = add_cluster(root, ".cons", 4); + root->add_route(prod_cluster, cons_cluster, {internet}); + root->seal(); + dtlmod::DTL::create(); + } + void setup_file_platform() { sg4::NetZone* cluster = sg4::Engine::get_instance()->get_netzone_root()->add_netzone_star("cluster"); @@ -280,3 +291,60 @@ TEST_F(DTLCancelTest, CancelFileEngineTransaction_WaitingForPublisher) ASSERT_NO_THROW(sg4::Engine::get_instance()->run()); }); } + +// Publisher and subscriber are both engaged in a long Mailbox transfer (Mailbox simulates bandwidth; MQ does not). +// Publisher completes T1 end_transaction() (starting slow async Comms) then blocks in T2 begin_transaction() +// waiting for T1 sends to complete. 
Subscriber blocks in T1 end_transaction() waiting for receives. +// Canceller fires after 0.5s, unblocking both with TransactionCancelledException. +TEST_F(DTLCancelTest, CancelStagingTransaction_MidTransaction_Mailbox) +{ + DO_TEST_WITH_FORK([this]() { + this->setup_slow_staging_platform(); + auto* pub_host = sg4::Host::by_name("host-0.prod"); + auto* sub_host = sg4::Host::by_name("host-0.cons"); + auto* wdog_host = sg4::Host::by_name("host-1.prod"); + + pub_host->add_actor("PubTestActor", [wdog_host]() { + auto dtl = dtlmod::DTL::connect(); + auto stream = dtl->add_stream("my-output"); + stream->set_engine_type(dtlmod::Engine::Type::Staging); + stream->set_transport_method(dtlmod::Transport::Method::Mailbox); + auto var = stream->define_variable("var", {1000, 1000}, {0, 0}, {1000, 1000}, sizeof(double)); + auto engine = stream->open("my-output", dtlmod::Stream::Mode::Publish); + + wdog_host->add_actor("Canceller", [engine]() { + sg4::this_actor::sleep_for(0.5); + XBT_INFO("Cancelling the transaction"); + engine->cancel_transaction(); + }); + + // T1: completes normally, starting slow async Comms over the 1MBps internet link + engine->begin_transaction(); + engine->put(var); + engine->end_transaction(); + + // T2: blocks waiting for T1 Comms to drain -- cancelled mid-transfer + XBT_INFO("Begin T2 (will block waiting for T1 sends to complete over slow link)"); + ASSERT_THROW(engine->begin_transaction(), dtlmod::TransactionCancelledException); + XBT_INFO("Publisher caught TransactionCancelledException in T2 begin_transaction() as expected"); + dtlmod::DTL::disconnect(); + }); + + sub_host->add_actor("SubTestActor", []() { + auto dtl = dtlmod::DTL::connect(); + auto stream = dtl->add_stream("my-output"); + auto engine = stream->open("my-output", dtlmod::Stream::Mode::Subscribe); + auto var_sub = stream->inquire_variable("var"); + + // T1: blocks in end_transaction() waiting for slow receives -- cancelled mid-transfer + engine->begin_transaction(); + 
engine->get(var_sub); + XBT_INFO("End T1 (will block waiting for receives over slow link)"); + ASSERT_THROW(engine->end_transaction(), dtlmod::TransactionCancelledException); + XBT_INFO("Subscriber caught TransactionCancelledException in T1 end_transaction() as expected"); + dtlmod::DTL::disconnect(); + }); + + ASSERT_NO_THROW(sg4::Engine::get_instance()->run()); + }); +} diff --git a/test/python/dtl_cancel.py b/test/python/dtl_cancel.py index 0a21264..3419365 100644 --- a/test/python/dtl_cancel.py +++ b/test/python/dtl_cancel.py @@ -36,6 +36,20 @@ def setup_staging_platform(): return e +def setup_slow_staging_platform(): + e = Engine(sys.argv) + e.set_log_control("no_loc") + e.set_log_control("root.thresh:critical") + root = e.netzone_root + internet = root.add_link("internet", "10Mbps").set_latency("1ms") + prod_cluster = add_cluster(root, ".prod", 4) + cons_cluster = add_cluster(root, ".cons", 4) + root.add_route(prod_cluster, cons_cluster, [internet]) + root.seal() + DTL.create() + return e + + def setup_file_platform(): e = Engine(sys.argv) e.set_log_control("no_loc") @@ -266,12 +280,72 @@ def canceller_actor(): e.run() +# Publisher and subscriber are both engaged in a long Mailbox transfer. +# Publisher completes T1 end_transaction() (starting slow async comms) then blocks in T2 +# begin_transaction() waiting for T1 sends to complete. Subscriber blocks in T1 end_transaction() +# waiting for receives. Canceller fires at t=0.5s, unblocking both with TransactionCancelledException. 
+def run_test_cancel_staging_mid_transaction_mailbox(): + e = setup_slow_staging_platform() + engine_ref = [None] + + def pub_actor(): + dtl = DTL.connect() + stream = dtl.add_stream("my-output") + stream.set_engine_type(DTLEngine.Type.Staging).set_transport_method(Transport.Method.Mailbox) + var = stream.define_variable("var", (1000, 1000), (0, 0), (1000, 1000), ctypes.sizeof(ctypes.c_double)) + engine = stream.open("my-output", Stream.Mode.Publish) + engine_ref[0] = engine + + Host.by_name("host-1.prod").add_actor("Canceller", canceller_actor) + + # T1: completes normally (starts slow async sends over 10Mbps link) + engine.begin_transaction() + engine.put(var) + engine.end_transaction() + + # T2: blocks waiting for T1 sends to complete -- gets cancelled + this_actor.info("Begin T2 (will block waiting for T1 sends to complete over slow link)") + try: + engine.begin_transaction() + assert False, "Expected TransactionCancelledException" + except TransactionCancelledException: + this_actor.info("Publisher caught TransactionCancelledException in T2 begin_transaction() as expected") + DTL.disconnect() + + def sub_actor(): + dtl = DTL.connect() + stream = dtl.add_stream("my-output") + engine = stream.open("my-output", Stream.Mode.Subscribe) + var_sub = stream.inquire_variable("var") + + # T1: blocks in end_transaction() waiting for slow receives + engine.begin_transaction() + engine.get(var_sub) + this_actor.info("End T1 (will block waiting for receives over slow link)") + try: + engine.end_transaction() + assert False, "Expected TransactionCancelledException" + except TransactionCancelledException: + this_actor.info("Subscriber caught TransactionCancelledException in T1 end_transaction() as expected") + DTL.disconnect() + + def canceller_actor(): + this_actor.sleep_for(0.5) + this_actor.info("Cancelling the transaction") + engine_ref[0].cancel_transaction() + + Host.by_name("host-0.prod").add_actor("PubTestActor", pub_actor) + 
Host.by_name("host-0.cons").add_actor("SubTestActor", sub_actor) + e.run() + + if __name__ == '__main__': tests = [ run_test_cancel_staging_waiting_for_subscriber_mq, run_test_cancel_staging_waiting_for_subscriber_mailbox, run_test_cancel_staging_waiting_for_publisher_mq, run_test_cancel_file_engine_waiting_for_publisher, + run_test_cancel_staging_mid_transaction_mailbox, ] for test in tests: From c7ffb8a9b794eb0ec100233c5028540f71740bcb Mon Sep 17 00:00:00 2001 From: Fred Suter Date: Thu, 23 Apr 2026 10:47:54 -0400 Subject: [PATCH 63/92] bump action version from v4 to v6 --- .github/workflows/build.yml | 8 ++++---- .github/workflows/weekly-checks.yml | 26 +++++++++++++------------- 2 files changed, 17 insertions(+), 17 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 78568f6..ba02514 100755 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -15,7 +15,7 @@ jobs: build: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v6 with: fetch-depth: 0 @@ -37,7 +37,7 @@ jobs: doxygen - name: Cache SimGrid - uses: actions/cache@v4 + uses: actions/cache@v6 id: cache-simgrid with: path: /opt/simgrid @@ -53,7 +53,7 @@ jobs: sudo cmake --install build - name: Cache FSMod - uses: actions/cache@v4 + uses: actions/cache@v6 id: cache-fsmod with: path: /opt/fsmod @@ -69,7 +69,7 @@ jobs: sudo cmake --install build - name: Cache Google Test - uses: actions/cache@v4 + uses: actions/cache@v6 id: cache-gtest with: path: /opt/gtest diff --git a/.github/workflows/weekly-checks.yml b/.github/workflows/weekly-checks.yml index 720732e..aa01b1c 100644 --- a/.github/workflows/weekly-checks.yml +++ b/.github/workflows/weekly-checks.yml @@ -29,7 +29,7 @@ jobs: env_options: "ASAN_OPTIONS=detect_leaks=1 UBSAN_OPTIONS=print_stacktrace=1" steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v6 with: fetch-depth: 0 @@ -43,7 +43,7 @@ jobs: pybind11-dev - name: Cache SimGrid - uses: actions/cache@v4 
+ uses: actions/cache@v6 id: cache-simgrid with: path: /opt/simgrid @@ -59,7 +59,7 @@ jobs: sudo cmake --install build - name: Cache FSMod - uses: actions/cache@v4 + uses: actions/cache@v6 id: cache-fsmod with: path: /opt/fsmod @@ -75,7 +75,7 @@ jobs: sudo cmake --install build - name: Cache Google Test - uses: actions/cache@v4 + uses: actions/cache@v6 id: cache-gtest with: path: /opt/gtest @@ -126,7 +126,7 @@ jobs: cd ../.. - name: Upload sanitizer report - uses: actions/upload-artifact@v4 + uses: actions/upload-artifact@v6 if: always() with: name: ${{ matrix.sanitizer.name }}-report @@ -145,7 +145,7 @@ jobs: name: Valgrind Memory Check runs-on: ubuntu-latest steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v6 with: fetch-depth: 0 @@ -160,7 +160,7 @@ jobs: valgrind - name: Cache SimGrid - uses: actions/cache@v4 + uses: actions/cache@v6 id: cache-simgrid with: path: /opt/simgrid @@ -176,7 +176,7 @@ jobs: sudo cmake --install build - name: Cache FSMod - uses: actions/cache@v4 + uses: actions/cache@v6 id: cache-fsmod with: path: /opt/fsmod @@ -192,7 +192,7 @@ jobs: sudo cmake --install build - name: Cache Google Test - uses: actions/cache@v4 + uses: actions/cache@v6 id: cache-gtest with: path: /opt/gtest @@ -225,7 +225,7 @@ jobs: cmake --build . 
--target valgrind - name: Upload Valgrind report - uses: actions/upload-artifact@v4 + uses: actions/upload-artifact@v6 if: always() with: name: valgrind-report @@ -273,7 +273,7 @@ jobs: fi - name: Upload summary - uses: actions/upload-artifact@v4 + uses: actions/upload-artifact@v6 if: always() with: name: valgrind-summary @@ -287,7 +287,7 @@ jobs: if: always() steps: - name: Download all artifacts - uses: actions/download-artifact@v4 + uses: actions/download-artifact@v6 with: path: reports @@ -321,7 +321,7 @@ jobs: cat weekly-report.md - name: Upload weekly report - uses: actions/upload-artifact@v4 + uses: actions/upload-artifact@v6 with: name: weekly-report path: weekly-report.md From 5f09ed33def9543b4b699dad7436ea2916192a18 Mon Sep 17 00:00:00 2001 From: Fred Suter Date: Thu, 23 Apr 2026 10:57:30 -0400 Subject: [PATCH 64/92] bump action/cache version from v4 to v5 --- .github/workflows/build.yml | 6 +++--- .github/workflows/weekly-checks.yml | 12 ++++++------ 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index ba02514..7ae80cf 100755 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -37,7 +37,7 @@ jobs: doxygen - name: Cache SimGrid - uses: actions/cache@v6 + uses: actions/cache@v5 id: cache-simgrid with: path: /opt/simgrid @@ -53,7 +53,7 @@ jobs: sudo cmake --install build - name: Cache FSMod - uses: actions/cache@v6 + uses: actions/cache@v5 id: cache-fsmod with: path: /opt/fsmod @@ -69,7 +69,7 @@ jobs: sudo cmake --install build - name: Cache Google Test - uses: actions/cache@v6 + uses: actions/cache@v5 id: cache-gtest with: path: /opt/gtest diff --git a/.github/workflows/weekly-checks.yml b/.github/workflows/weekly-checks.yml index aa01b1c..0f9ab06 100644 --- a/.github/workflows/weekly-checks.yml +++ b/.github/workflows/weekly-checks.yml @@ -43,7 +43,7 @@ jobs: pybind11-dev - name: Cache SimGrid - uses: actions/cache@v6 + uses: actions/cache@v5 id: cache-simgrid 
with: path: /opt/simgrid @@ -59,7 +59,7 @@ jobs: sudo cmake --install build - name: Cache FSMod - uses: actions/cache@v6 + uses: actions/cache@v5 id: cache-fsmod with: path: /opt/fsmod @@ -75,7 +75,7 @@ jobs: sudo cmake --install build - name: Cache Google Test - uses: actions/cache@v6 + uses: actions/cache@v5 id: cache-gtest with: path: /opt/gtest @@ -160,7 +160,7 @@ jobs: valgrind - name: Cache SimGrid - uses: actions/cache@v6 + uses: actions/cache@v5 id: cache-simgrid with: path: /opt/simgrid @@ -176,7 +176,7 @@ jobs: sudo cmake --install build - name: Cache FSMod - uses: actions/cache@v6 + uses: actions/cache@v5 id: cache-fsmod with: path: /opt/fsmod @@ -192,7 +192,7 @@ jobs: sudo cmake --install build - name: Cache Google Test - uses: actions/cache@v6 + uses: actions/cache@v5 id: cache-gtest with: path: /opt/gtest From 6bad10984fa2405323f0470faee3d0889cfb99c7 Mon Sep 17 00:00:00 2001 From: Fred Suter Date: Tue, 28 Apr 2026 14:55:00 -0400 Subject: [PATCH 65/92] add optional parameters to DTL::add_stream --- ChangeLog | 4 ++++ include/dtlmod/DTL.hpp | 12 ++++++++++-- src/DTL.cpp | 9 +++++++-- src/bindings/python/dtlmod_python.cpp | 1 + 4 files changed, 22 insertions(+), 4 deletions(-) diff --git a/ChangeLog b/ChangeLog index 18b2cd1..929460b 100644 --- a/ChangeLog +++ b/ChangeLog @@ -25,6 +25,10 @@ API Changes: a DTLMod::TransactionCancelledException that must be caught if you want the publishers and subscribers to survive to the cancellation of this transaction. + - Behavior: + - DTL::add_stream can now take an Engine Type and a Transport Method as + parameters (both defaulting to 'Undefined' if not specified). + ---------------------------------------------------------------------------- DTLMod (0.4) February 16, 2026 diff --git a/include/dtlmod/DTL.hpp b/include/dtlmod/DTL.hpp index a03ce99..a3a9af6 100644 --- a/include/dtlmod/DTL.hpp +++ b/include/dtlmod/DTL.hpp @@ -47,8 +47,16 @@ class DTL { /// @brief Add a data stream to the Data Transport Layer. 
/// @param name The name of the Stream to add to the DTL. - /// @return A handler on the newly created Stream object. - [[nodiscard]] std::shared_ptr add_stream(std::string_view name); + /// @param type The Engine type for the stream. Defaults to Engine::Type::Undefined, + /// which allows the stream to be configured later via set_engine_type(). + /// @param method The Transport method for the stream. Defaults to Transport::Method::Undefined, + /// which allows the stream to be configured later via set_transport_method(). + /// @return A shared pointer to the newly created Stream object. + /// @note The returned Stream can be further configured by calling set_engine_type() + /// and set_transport_method() if the optional parameters are not specified. + /// @see Stream::set_engine_type(), Stream::set_transport_method() + [[nodiscard]] std::shared_ptr add_stream(std::string_view name, Engine::Type type = Engine::Type::Undefined, + Transport::Method method = Transport::Method::Undefined); /// @brief Retrieve all streams declared in the Data Transport Layer. /// @return a map of handlers on Stream objects with their names as keys. diff --git a/src/DTL.cpp b/src/DTL.cpp index be53ac8..6fbb9b6 100644 --- a/src/DTL.cpp +++ b/src/DTL.cpp @@ -141,14 +141,19 @@ void DTL::disconnect() sg4::MessageQueue::by_name("dtlmod::connection_manager_handle")->get_unique(); } -std::shared_ptr DTL::add_stream(std::string_view name) +std::shared_ptr DTL::add_stream(std::string_view name, Engine::Type type, Transport::Method method) { // This has to be done in critical section to avoid concurrent creation. First actor to get the lock creates the // Stream. Other actors will retrieve it from the map. 
std::unique_lock lock(*mutex_); std::string name_str(name); - if (streams_.find(name_str) == streams_.end()) + if (streams_.find(name_str) == streams_.end()) { streams_.try_emplace(name_str, std::make_shared(name_str, this)); + if (type != Engine::Type::Undefined) + streams_[name_str]->set_engine_type(type); + if (method != Transport::Method::Undefined) + streams_[name_str]->set_transport_method(method); + } return streams_[name_str]; } diff --git a/src/bindings/python/dtlmod_python.cpp b/src/bindings/python/dtlmod_python.cpp index e0c60ab..3a29adb 100644 --- a/src/bindings/python/dtlmod_python.cpp +++ b/src/bindings/python/dtlmod_python.cpp @@ -100,6 +100,7 @@ PYBIND11_MODULE(dtlmod, m) .def_property_readonly("has_active_connections", &DTL::has_active_connections, "Check whether some simulated actors are currently connected to the DTL (read-only)") .def("add_stream", &DTL::add_stream, py::call_guard(), py::arg("name"), + py::arg("type") = Engine::Type::Undefined, py::arg("method") = Transport::Method::Undefined, "Add a data stream to the DTL") .def_property_readonly("all_streams", &DTL::get_all_streams, "Retrieve all streams declared in the DTL (read-only)") From 24f1823ac490bb5249fe22074915dd50f4a281cd Mon Sep 17 00:00:00 2001 From: Fred Suter Date: Tue, 28 Apr 2026 15:09:00 -0400 Subject: [PATCH 66/92] reorder bindings file --- src/bindings/python/dtlmod_python.cpp | 68 +++++++++++++-------------- 1 file changed, 34 insertions(+), 34 deletions(-) diff --git a/src/bindings/python/dtlmod_python.cpp b/src/bindings/python/dtlmod_python.cpp index 3a29adb..a4d3a51 100644 --- a/src/bindings/python/dtlmod_python.cpp +++ b/src/bindings/python/dtlmod_python.cpp @@ -90,6 +90,40 @@ PYBIND11_MODULE(dtlmod, m) py::register_exception(m, "TransactionCancelledException"); + /* Class Engine */ + py::class_> engine( + m, "Engine", "An Engine defines how data is transferred between the applications and the DTL"); + engine.def_property_readonly("name", &Engine::get_name, "The name 
of the Engine (read-only)") + .def("begin_transaction", &Engine::begin_transaction, py::call_guard(), + "Begin a transaction on this Engine") + .def("put", py::overload_cast&>(&Engine::put, py::const_), py::arg("var"), + py::call_guard(), "Put a Variable in the DTL using this Engine") + .def("put", py::overload_cast&, size_t>(&Engine::put, py::const_), py::arg("var"), + py::arg("simulated_size_in_bytes"), py::call_guard(), + "Put a Variable in the DTL using this Engine") + .def("get", &Engine::get, py::arg("var"), py::call_guard(), + "Get a Variable from the DTL using this Engine") + .def("end_transaction", &Engine::end_transaction, py::call_guard(), + "End a transaction on this Engine") + .def_property_readonly("current_transaction", &Engine::get_current_transaction, + "The id of the current transaction on this Engine (read-only)") + .def("cancel_transaction", &Engine::cancel_transaction, py::call_guard(), + "Cancel all in-flight activities of the current transaction (must be called from an external actor)") + .def("close", &Engine::close, py::call_guard(), "Close this Engine"); + + py::enum_(engine, "Type", "The type of Engine") + .value("Undefined", Engine::Type::Undefined) + .value("Staging", Engine::Type::Staging) + .value("File", Engine::Type::File); + + /* Class Transport */ + py::class_ transport(m, "Transport", "The transport method used by an Engine to transfer data"); + py::enum_(transport, "Method", "The transport method used by the Engine") + .value("Undefined", Transport::Method::Undefined) + .value("MQ", Transport::Method::MQ) + .value("Mailbox", Transport::Method::Mailbox) + .value("File", Transport::Method::File); + /* Class DTL */ py::class_>(m, "DTL", "Data Transport Layer") .def_static("create", py::overload_cast(&DTL::create), py::call_guard(), @@ -218,38 +252,4 @@ PYBIND11_MODULE(dtlmod, m) "Get the flop cost to reduce a Variable") .def("get_flop_amount_to_decompress_variable", &ReductionMethod::get_flop_amount_to_decompress_variable, 
py::arg("var"), "Get the flop cost to decompress a Variable"); - - /* Class Engine */ - py::class_> engine( - m, "Engine", "An Engine defines how data is transferred between the applications and the DTL"); - engine.def_property_readonly("name", &Engine::get_name, "The name of the Engine (read-only)") - .def("begin_transaction", &Engine::begin_transaction, py::call_guard(), - "Begin a transaction on this Engine") - .def("put", py::overload_cast&>(&Engine::put, py::const_), py::arg("var"), - py::call_guard(), "Put a Variable in the DTL using this Engine") - .def("put", py::overload_cast&, size_t>(&Engine::put, py::const_), py::arg("var"), - py::arg("simulated_size_in_bytes"), py::call_guard(), - "Put a Variable in the DTL using this Engine") - .def("get", &Engine::get, py::arg("var"), py::call_guard(), - "Get a Variable from the DTL using this Engine") - .def("end_transaction", &Engine::end_transaction, py::call_guard(), - "End a transaction on this Engine") - .def_property_readonly("current_transaction", &Engine::get_current_transaction, - "The id of the current transaction on this Engine (read-only)") - .def("cancel_transaction", &Engine::cancel_transaction, py::call_guard(), - "Cancel all in-flight activities of the current transaction (must be called from an external actor)") - .def("close", &Engine::close, py::call_guard(), "Close this Engine"); - - py::enum_(engine, "Type", "The type of Engine") - .value("Undefined", Engine::Type::Undefined) - .value("Staging", Engine::Type::Staging) - .value("File", Engine::Type::File); - - /* Class Transport */ - py::class_ transport(m, "Transport", "The transport method used by an Engine to transfer data"); - py::enum_(transport, "Method", "The transport method used by the Engine") - .value("Undefined", Transport::Method::Undefined) - .value("MQ", Transport::Method::MQ) - .value("Mailbox", Transport::Method::Mailbox) - .value("File", Transport::Method::File); } \ No newline at end of file From 
592a5ee910902c6457fa3ab939918878ffd4f692 Mon Sep 17 00:00:00 2001 From: Fred Suter Date: Wed, 29 Apr 2026 11:34:58 -0400 Subject: [PATCH 67/92] new helper methods --- ChangeLog | 19 ++++++++++++++----- include/dtlmod/Stream.hpp | 6 ++++++ src/bindings/python/dtlmod_python.cpp | 7 +++++-- test/python/dtl_config.py | 12 ++++++------ test/python/dtl_stream.py | 6 +++--- 5 files changed, 34 insertions(+), 16 deletions(-) diff --git a/ChangeLog b/ChangeLog index 929460b..00c310f 100644 --- a/ChangeLog +++ b/ChangeLog @@ -19,13 +19,22 @@ Improvements: API Changes: - New Engine method: - - Engine::cancel_transaction() cancel the currently ongoing transaction - performed by this engine. This method has to be called by an external + - Engine::cancel_transaction() cancel the currently ongoing transaction + performed by this engine. This method has to be called by an external actor not involved in the transaction. It will make the transaction raise - a DTLMod::TransactionCancelledException that must be caught if you want + a DTLMod::TransactionCancelledException that must be caught if you want the publishers and subscribers to survive to the cancellation of this - transaction. - - Behavior: + transaction. + - New Stream helper methods: + - Stream::get_engine_type() and Stream::get_transport_method() respectively + return the enum value of the engine type and transport method for a + Stream. + - As a consequence, in the Python bindings, the Stream.engine_type and + Stream.transport_method readonly properties that convert the engine type + and transport method to a string have been renamed to + Stream.engine_type_str and Stream.transport_method_str while the existing + properties now return the enum values. + - Behavior modification: - DTL::add_stream can now take an Engine Type and a Transport Method as parameters (both defaulting to 'Undefined' if not specified). 
diff --git a/include/dtlmod/Stream.hpp b/include/dtlmod/Stream.hpp index d03b74e..a48c8a0 100644 --- a/include/dtlmod/Stream.hpp +++ b/include/dtlmod/Stream.hpp @@ -100,9 +100,15 @@ class Stream : public std::enable_shared_from_this { /// @brief Helper function to print out the Engine::Type of the Stream. /// @return An optional containing the C-string if the type is valid, std::nullopt otherwise [[nodiscard]] std::optional get_engine_type_str() const noexcept; + /// @brief Helper function to get the Engine::Type of the Stream. + /// @return The Engine type + [[nodiscard]] Engine::Type get_engine_type() const noexcept { return engine_type_; } /// @brief Helper function to print out the Transport::Method of the Stream. /// @return An optional containing the C-string if the method is valid, std::nullopt otherwise [[nodiscard]] std::optional get_transport_method_str() const noexcept; + /// @brief Helper function to get the Transport::Method of the Stream. + /// @return The Transport method + [[nodiscard]] Transport::Method get_transport_method() const noexcept { return transport_method_; } /// @brief Helper function to know the access Mode of the Stream. /// @return The corresponding Stream::Mode [[nodiscard]] Mode get_access_mode() const noexcept { return access_mode_; } diff --git a/src/bindings/python/dtlmod_python.cpp b/src/bindings/python/dtlmod_python.cpp index a4d3a51..9f1185e 100644 --- a/src/bindings/python/dtlmod_python.cpp +++ b/src/bindings/python/dtlmod_python.cpp @@ -148,19 +148,22 @@ PYBIND11_MODULE(dtlmod, m) m, "Stream", "A Stream defines the connection between the applications that produce or consume data and the DTL"); stream.def_property_readonly("name", &Stream::get_name, "The name of the Stream (read-only)") .def_property_readonly( - "engine_type", + "engine_type_str", [](const Stream& self) { auto result = self.get_engine_type_str(); return result ? 
py::cast(*result) : py::cast(Py_None); }, "Print out the engine type of this Stream (read-only, returns None if invalid)") .def_property_readonly( - "transport_method", + "transport_method_str", [](const Stream& self) { auto result = self.get_transport_method_str(); return result ? py::cast(*result) : py::cast(Py_None); }, "Print out the transport method of this Stream (read-only, returns None if invalid)") + .def_property_readonly("engine_type", &Stream::get_engine_type, "Get the engine type of this Stream (read-only)") + .def_property_readonly("transport_method", &Stream::get_transport_method, + "Get the transport method of this Stream (read-only)") .def_property_readonly("access_mode", &Stream::get_access_mode_str, "Print out the access mode of this Stream (read-only)") .def_property_readonly("metadata_export", &Stream::does_export_metadata, diff --git a/test/python/dtl_config.py b/test/python/dtl_config.py index 648eca9..79159e9 100644 --- a/test/python/dtl_config.py +++ b/test/python/dtl_config.py @@ -7,7 +7,7 @@ import multiprocessing from simgrid import Engine, this_actor from fsmod import FileSystem, OneDiskStorage -from dtlmod import DTL, Stream +from dtlmod import DTL, Stream, Transport, Engine as DTLEngine def setup_platform(): e = Engine(sys.argv) @@ -37,8 +37,8 @@ def test_config_file(): this_actor.info("Open the stream") engine = stream.open("root:fs:/scratch/file", Stream.Mode.Publish) this_actor.info(f"Stream 1 is opened ({stream.engine_type},{stream.transport_method})") - assert stream.engine_type == "Engine::Type::File" - assert stream.transport_method == "Transport::Method::File" + assert stream.engine_type == DTLEngine.Type.File + assert stream.transport_method == Transport.Method.File assert stream.access_mode == "Mode::Publish" this_actor.info("Check if this stream is set to export metadata (it is)") assert True == stream.metadata_export @@ -56,9 +56,9 @@ def test_config_file(): assert None == dtl.stream_by_name("Unknown Stream") 
this_actor.info("Open the stream") engine = stream.open("staging", Stream.Mode.Publish) - this_actor.info(f"Stream 1 is opened ({stream.engine_type},{stream.transport_method})") - assert stream.engine_type == "Engine::Type::Staging" - assert stream.transport_method == "Transport::Method::MQ" + this_actor.info(f"Stream 1 is opened ({stream.engine_type_str},{stream.transport_method_str})") + assert stream.engine_type == DTLEngine.Type.Staging + assert stream.transport_method == Transport.Method.MQ this_actor.info("Let the actor sleep for 1 second") this_actor.sleep_for(1) this_actor.info("Close the engine") diff --git a/test/python/dtl_stream.py b/test/python/dtl_stream.py index 6b98b73..6e722b5 100644 --- a/test/python/dtl_stream.py +++ b/test/python/dtl_stream.py @@ -147,9 +147,9 @@ def test_producer_actor(): stream.set_engine_type(DTLEngine.Type.File) this_actor.info("Open the stream in Stream::Mode::Publish mode") engine = stream.open("zone:fs:/pfs/file", Stream.Mode.Publish) - this_actor.info(f"Stream is opened ({stream.engine_type},{stream.transport_method})") - assert stream.engine_type == "Engine::Type::File" - assert stream.transport_method == "Transport::Method::File" + this_actor.info(f"Stream is opened ({stream.engine_type_str},{stream.transport_method_str})") + assert stream.engine_type == DTLEngine.Type.File + assert stream.transport_method == Transport.Method.File this_actor.info("Let the actor sleep for 1 second") this_actor.sleep_for(1) this_actor.info("Close the engine") From 8fa60e2ef67552ad1c45050de29deaf79a644e45 Mon Sep 17 00:00:00 2001 From: Fred Suter Date: Wed, 29 Apr 2026 11:46:18 -0400 Subject: [PATCH 68/92] test the 3-parameter version of add_stream --- test/dtl_reduction.cpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/test/dtl_reduction.cpp b/test/dtl_reduction.cpp index 4ac8474..cd3404e 100644 --- a/test/dtl_reduction.cpp +++ b/test/dtl_reduction.cpp @@ -101,9 +101,7 @@ TEST_F(DTLReductionTest, 
SimpleDecimationFileEngine) XBT_INFO("Connect to the DTL"); auto dtl = dtlmod::DTL::connect(); XBT_INFO("Create a stream"); - auto stream = dtl->add_stream("my-output"); - stream->set_transport_method(dtlmod::Transport::Method::File); - stream->set_engine_type(dtlmod::Engine::Type::File); + auto stream = dtl->add_stream("my-output", dtlmod::Engine::Type::File, dtlmod::Transport::Method::File); stream->set_metadata_export(); XBT_INFO("Create a 3D variable"); auto var = From 7e28102e1ae23b099e1dede60bd7def9cc92b631 Mon Sep 17 00:00:00 2001 From: Fred Suter Date: Wed, 29 Apr 2026 12:26:04 -0400 Subject: [PATCH 69/92] add reduction methods to the DTL config file --- ChangeLog | 3 +++ include/dtlmod/Stream.hpp | 5 +++++ src/DTL.cpp | 5 +++++ src/Stream.cpp | 8 ++++++++ src/bindings/python/dtlmod_python.cpp | 7 +++++++ test/DTL-config.json | 3 ++- test/dtl_config.cpp | 6 ++++++ test/dtl_reduction.cpp | 2 +- test/python/dtl_config.py | 5 +++++ 9 files changed, 42 insertions(+), 2 deletions(-) diff --git a/ChangeLog b/ChangeLog index 00c310f..80f29cd 100644 --- a/ChangeLog +++ b/ChangeLog @@ -16,6 +16,7 @@ Improvements: metadata file has already been written by pub_close()) - Memory footprint of the File engine now grows as O(N_pub) instead of O(N_pub × N_transactions) for long-running concurrent streaming workloads + - API Changes: - New Engine method: @@ -34,6 +35,8 @@ API Changes: and transport method to a string have been renamed to Stream.engine_type_str and Stream.transport_method_str while the existing properties now return the enum values. + - Stream::get_reduction_method(name) to retrieve a reduction method + associated to the stream. - Behavior modification: - DTL::add_stream can now take an Engine Type and a Transport Method as parameters (both defaulting to 'Undefined' if not specified). 
diff --git a/include/dtlmod/Stream.hpp b/include/dtlmod/Stream.hpp index a48c8a0..2e1b92e 100644 --- a/include/dtlmod/Stream.hpp +++ b/include/dtlmod/Stream.hpp @@ -115,7 +115,12 @@ class Stream : public std::enable_shared_from_this { /// @brief Helper function to print out the access Mode of the Stream. /// @return The corresponding C-string [[nodiscard]] const char* get_access_mode_str() const noexcept { return mode_to_str(access_mode_); } + /// @brief Helper function to know the reduction method of the Stream. + /// @param name the name of the reduction method + /// @return An optional containing the ReductionMethod if defined, std::nullopt otherwise + [[nodiscard]] std::optional> + get_reduction_method(std::string_view name) const noexcept; /// @brief Helper function to know if the Stream does export metadata or not /// @return a boolean indicating if the Stream does export metadata or not [[nodiscard]] bool does_export_metadata() const noexcept { return metadata_export_; } diff --git a/src/DTL.cpp b/src/DTL.cpp index 6fbb9b6..8072130 100644 --- a/src/DTL.cpp +++ b/src/DTL.cpp @@ -58,6 +58,11 @@ DTL::DTL(std::string_view filename) // And set its engine type and transport method streams_[name]->set_engine_type(type).set_transport_method(transport_method); + // Check if a reduction method must be defined for the stream + if (stream.contains("reduction_methods")) + for (const auto& method : stream["reduction_methods"]) + streams_[name]->define_reduction_method(method.get()); + // Check if metadata must be exported for this stream if (stream.contains("export_metadata")) { streams_[name]->set_metadata_export(); diff --git a/src/Stream.cpp b/src/Stream.cpp index 510bb9c..12b371b 100644 --- a/src/Stream.cpp +++ b/src/Stream.cpp @@ -104,6 +104,14 @@ std::optional Stream::get_transport_method_str() const noexcept return std::nullopt; // LCOV_EXCL_LINE } +std::optional> Stream::get_reduction_method(std::string_view name) const noexcept +{ + auto it = 
reduction_methods_.find(std::string(name)); + if (it == reduction_methods_.end()) + return std::nullopt; + return it->second; +} + Stream& Stream::set_transport_method(const Transport::Method& transport_method) { if (transport_method_ == transport_method) // No modification, just return diff --git a/src/bindings/python/dtlmod_python.cpp b/src/bindings/python/dtlmod_python.cpp index 9f1185e..d3dd35c 100644 --- a/src/bindings/python/dtlmod_python.cpp +++ b/src/bindings/python/dtlmod_python.cpp @@ -161,6 +161,13 @@ PYBIND11_MODULE(dtlmod, m) return result ? py::cast(*result) : py::cast(Py_None); }, "Print out the transport method of this Stream (read-only, returns None if invalid)") + .def( + "reduction_method", + [](const Stream& self, std::string_view name) -> py::object { + auto result = self.get_reduction_method(name); + return result ? py::cast(*result) : py::cast(Py_None); + }, + py::arg("name"), "Retrieve a reduction method by name, or None if not found") .def_property_readonly("engine_type", &Stream::get_engine_type, "Get the engine type of this Stream (read-only)") .def_property_readonly("transport_method", &Stream::get_transport_method, "Get the transport method of this Stream (read-only)") diff --git a/test/DTL-config.json b/test/DTL-config.json index ab1b84e..d196d06 100644 --- a/test/DTL-config.json +++ b/test/DTL-config.json @@ -20,7 +20,8 @@ "engine": { "type": "Staging", "transport_method": "Mailbox" - } + }, + "reduction_methods": ["compression"] } ] } \ No newline at end of file diff --git a/test/dtl_config.cpp b/test/dtl_config.cpp index 1f1b1bd..5f55a84 100644 --- a/test/dtl_config.cpp +++ b/test/dtl_config.cpp @@ -97,6 +97,12 @@ TEST_F(DTLConfigTest, ConfigFile) XBT_INFO("Close the engine"); ASSERT_NO_THROW(engine->close()); + XBT_INFO("Retrieve the Reduction Method defined for Stream3"); + ASSERT_NO_THROW(stream = dtl->get_stream_by_name("Stream3").value()); + ASSERT_FALSE(stream->get_reduction_method("decimation").has_value()); + 
ASSERT_TRUE(stream->get_reduction_method("compression").has_value()); + ASSERT_EQ(stream->get_reduction_method("compression").value()->get_name(), "compression"); + XBT_INFO("Check get_all_streams returns both configured streams"); const auto& all_streams = dtl->get_all_streams(); ASSERT_EQ(all_streams.size(), 3U); diff --git a/test/dtl_reduction.cpp b/test/dtl_reduction.cpp index cd3404e..1241c14 100644 --- a/test/dtl_reduction.cpp +++ b/test/dtl_reduction.cpp @@ -108,7 +108,7 @@ TEST_F(DTLReductionTest, SimpleDecimationFileEngine) stream->define_variable("var3D", {640, 640, 640}, {0, 0, 0}, {640, 640, 640}, sizeof(double)); XBT_INFO("Define a Decimation Reduction Method"); ASSERT_NO_THROW(decimator = stream->define_reduction_method("decimation")); - XBT_INFO("Open the stream in Pulish mode"); + XBT_INFO("Open the stream in Publish mode"); auto engine = stream->open("zone:my_fs:/host/scratch/my-working-dir/my-output", dtlmod::Stream::Mode::Publish); ASSERT_NO_THROW(sg4::this_actor::sleep_for(1)); XBT_INFO("Start a Transaction"); diff --git a/test/python/dtl_config.py b/test/python/dtl_config.py index 79159e9..75b6a3a 100644 --- a/test/python/dtl_config.py +++ b/test/python/dtl_config.py @@ -64,6 +64,11 @@ def test_config_file(): this_actor.info("Close the engine") engine.close() + this_actor.info("Retrieve the Reduction Method defined for Stream3") + stream = dtl.stream_by_name("Stream3") + assert None == stream.reduction_method("decimation") + assert stream.reduction_method("compression").name == "compression" + this_actor.info("Check all_streams returns both configured streams") all_streams = dtl.all_streams assert len(all_streams) == 3 From 9b579c0be83ef8ecd3a26a664474d16969262d1e Mon Sep 17 00:00:00 2001 From: Fred Suter Date: Mon, 4 May 2026 19:12:07 -0400 Subject: [PATCH 70/92] throw CancelledException earlier --- src/FileEngine.cpp | 7 ++----- src/StagingEngine.cpp | 9 ++------- 2 files changed, 4 insertions(+), 12 deletions(-) diff --git 
a/src/FileEngine.cpp b/src/FileEngine.cpp index 2aab332..1c57a6e 100644 --- a/src/FileEngine.cpp +++ b/src/FileEngine.cpp @@ -242,11 +242,8 @@ void FileEngine::end_sub_transaction() file_sub_transaction_[self].push(file->read_async(size)); XBT_DEBUG("Wait for the %d subscribe activities for the transaction", file_sub_transaction_[self].size()); - try { - file_sub_transaction_[self].wait_all(); - } catch (const simgrid::CancelException&) { - if (!is_cancelled()) - throw; + file_sub_transaction_[self].wait_all(); + if (is_cancelled()) { file_sub_transaction_[self].clear(); transport->close_sub_files(self); transport->clear_to_read_in_transaction(self); diff --git a/src/StagingEngine.cpp b/src/StagingEngine.cpp index 8391522..2e292e4 100644 --- a/src/StagingEngine.cpp +++ b/src/StagingEngine.cpp @@ -77,20 +77,15 @@ void StagingEngine::begin_pub_transaction() current_pub_transaction_id_, current_sub_transaction_id_, get_pub_transaction().size()); try { get_pub_transaction().wait_all(); - } catch (const simgrid::CancelException&) { - if (!is_cancelled()) - throw; - get_pub_transaction().clear(); - throw TransactionCancelledException(XBT_THROW_POINT); } catch (const simgrid::NetworkFailureException&) { if (!is_cancelled()) throw; - get_pub_transaction().clear(); - throw TransactionCancelledException(XBT_THROW_POINT); } XBT_DEBUG("All on-flight publish activities are completed. 
Proceed with the current transaction."); XBT_DEBUG("%u sub activities pending", get_sub_transaction().size()); get_pub_transaction().clear(); + if (is_cancelled()) + throw TransactionCancelledException(XBT_THROW_POINT); } // Then we wait for all subscribers to be at the same transaction From d5cd91e1b2cb160cd0fa6ab441e6ae7ca7328907 Mon Sep 17 00:00:00 2001 From: Fred Suter Date: Mon, 4 May 2026 21:59:21 -0400 Subject: [PATCH 71/92] revise how cancel_transaction works --- ChangeLog | 12 ++-- include/dtlmod/DTLException.hpp | 2 +- include/dtlmod/Engine.hpp | 19 ++++-- include/dtlmod/FileEngine.hpp | 4 ++ include/dtlmod/StagingEngine.hpp | 4 ++ src/Engine.cpp | 10 ++- src/FileEngine.cpp | 37 ++++++----- src/StagingEngine.cpp | 43 +++++++------ src/bindings/python/dtlmod_python.cpp | 5 +- test/dtl_cancel.cpp | 72 ++++++++++----------- test/python/dtl_cancel.py | 90 +++++++++++++-------------- 11 files changed, 163 insertions(+), 135 deletions(-) diff --git a/ChangeLog b/ChangeLog index 80f29cd..cabe239 100644 --- a/ChangeLog +++ b/ChangeLog @@ -20,12 +20,12 @@ Improvements: API Changes: - New Engine method: - - Engine::cancel_transaction() cancel the currently ongoing transaction - performed by this engine. This method has to be called by an external - actor not involved in the transaction. It will make the transaction raise - a DTLMod::TransactionCancelledException that must be caught if you want - the publishers and subscribers to survive to the cancellation of this - transaction. + - Engine::cancel_transaction(unsigned int transaction_id) cancels a + specific transaction performed by this engine. This method has to be + called by an external actor not involved in the transaction. It will make + the transaction raise a dtlmod::TransactioncanceledException that must be + caught if you want the publishers and subscribers to survive the + cancelation of this transaction.
- New Stream helper methods: - Stream::get_engine_type() and Stream::get_transport_method() respectively return the enum value of the engine type and transport method for a diff --git a/include/dtlmod/DTLException.hpp b/include/dtlmod/DTLException.hpp index 8e5f465..1135233 100644 --- a/include/dtlmod/DTLException.hpp +++ b/include/dtlmod/DTLException.hpp @@ -70,7 +70,7 @@ DECLARE_DTLMOD_EXCEPTION(UnknownCompressionOptionException, "Unknown Compression DECLARE_DTLMOD_EXCEPTION(InconsistentCompressionRatioException, "Inconsistent Compression ratio"); DECLARE_DTLMOD_EXCEPTION(SubscriberSideCompressionException, "Compression can only be applied on the publisher side"); -DECLARE_DTLMOD_EXCEPTION(TransactionCancelledException, "Transaction cancelled"); +DECLARE_DTLMOD_EXCEPTION(TransactioncanceledException, "Transaction canceled"); } // namespace dtlmod diff --git a/include/dtlmod/Engine.hpp b/include/dtlmod/Engine.hpp index 71419c0..42b5820 100644 --- a/include/dtlmod/Engine.hpp +++ b/include/dtlmod/Engine.hpp @@ -57,7 +57,7 @@ class Engine { std::weak_ptr stream_; bool pub_ever_present_ = false; - std::atomic cancelled_{false}; + std::atomic canceled_transaction_id_{0}; ActorRegistry publishers_; @@ -77,8 +77,9 @@ class Engine { [[nodiscard]] const sg4::ActivitySet& get_sub_transaction() const noexcept { return sub_transaction_; } [[nodiscard]] sg4::ActivitySet& get_sub_transaction() noexcept { return sub_transaction_; } - // Protected virtual method for derived classes to implement - [[nodiscard]] virtual unsigned int get_current_transaction_impl() const noexcept = 0; + // Protected virtual methods for derived classes to implement + [[nodiscard]] virtual unsigned int get_current_transaction_impl() const noexcept = 0; + [[nodiscard]] virtual unsigned int get_current_sub_transaction_impl() const noexcept = 0; // Protected methods for derived classes only void close_stream() const; @@ -93,7 +94,11 @@ class Engine { [[nodiscard]] bool pub_ever_present() const noexcept { 
return pub_ever_present_; } - [[nodiscard]] bool is_cancelled() const noexcept { return cancelled_; } + [[nodiscard]] bool is_canceled() const noexcept { return canceled_transaction_id_ != 0; } + [[nodiscard]] bool is_transaction_canceled(unsigned int tx_id) const noexcept + { + return canceled_transaction_id_ == tx_id; + } // Pure virtual methods for derived classes to implement virtual void create_transport(const Transport::Method& transport_method) = 0; @@ -144,9 +149,11 @@ class Engine { /// @return The id of the ongoing transaction. [[nodiscard]] unsigned int get_current_transaction() const noexcept { return get_current_transaction_impl(); } - /// @brief Cancel all in-flight activities of the current transaction, unblocking publishers and subscribers. + /// @brief Cancel all in-flight activities of a specific transaction, unblocking publishers and subscribers. + /// @param transaction_id The id of the transaction to cancel. If both sides have already moved past this + /// transaction, the call is a no-op to avoid accidentally cancelling a subsequent transaction. /// @note Must be called from an external actor not participating in the transaction. - void cancel_transaction(); + void cancel_transaction(unsigned int transaction_id); /// @brief Close the Engine associated to a Stream. 
void close(); diff --git a/include/dtlmod/FileEngine.hpp b/include/dtlmod/FileEngine.hpp index da5d29d..c12e3e4 100644 --- a/include/dtlmod/FileEngine.hpp +++ b/include/dtlmod/FileEngine.hpp @@ -55,6 +55,10 @@ class FileEngine : public Engine { { return current_pub_transaction_id_; } + [[nodiscard]] unsigned int get_current_sub_transaction_impl() const noexcept override + { + return current_sub_transaction_id_; + } protected: [[nodiscard]] std::shared_ptr get_file_transport() const; diff --git a/include/dtlmod/StagingEngine.hpp b/include/dtlmod/StagingEngine.hpp index 759d4de..2794134 100644 --- a/include/dtlmod/StagingEngine.hpp +++ b/include/dtlmod/StagingEngine.hpp @@ -44,6 +44,10 @@ class StagingEngine : public Engine { { return current_pub_transaction_id_; } + [[nodiscard]] unsigned int get_current_sub_transaction_impl() const noexcept override + { + return current_sub_transaction_id_; + } protected: [[nodiscard]] std::shared_ptr get_staging_transport() const; diff --git a/src/Engine.cpp b/src/Engine.cpp index 0ef0ffd..00d1019 100644 --- a/src/Engine.cpp +++ b/src/Engine.cpp @@ -86,9 +86,15 @@ void Engine::close() publishers_.contains(sg4::Actor::self()) ? pub_close() : sub_close(); } -void Engine::cancel_transaction() +void Engine::cancel_transaction(unsigned int transaction_id) { - cancelled_ = true; + // No-op if both sides have already moved past the target transaction: cancelling now would + // affect the wrong (subsequent) transaction. + if (get_current_transaction_impl() > transaction_id && get_current_sub_transaction_impl() > transaction_id) + return; + // transaction_id == 0 means no transaction has started yet; treat as T1 so all checks against + // (current + 1 >= 1) still fire correctly. + canceled_transaction_id_.store(transaction_id == 0 ? 
1 : transaction_id); cancel_activities(); } diff --git a/src/FileEngine.cpp b/src/FileEngine.cpp index 1c57a6e..f3554ce 100644 --- a/src/FileEngine.cpp +++ b/src/FileEngine.cpp @@ -100,8 +100,8 @@ std::string FileEngine::get_path_to_dataset() const void FileEngine::begin_pub_transaction() { - if (is_cancelled()) - throw TransactionCancelledException(XBT_THROW_POINT); + if (is_transaction_canceled(current_pub_transaction_id_ + 1)) + throw TransactioncanceledException(XBT_THROW_POINT); auto self = sg4::Actor::self(); @@ -115,12 +115,12 @@ void FileEngine::begin_pub_transaction() // Wait for the completion of the Publish activities from the previous transaction XBT_DEBUG("Wait for the completion of %u publish activities from the previous transaction", file_pub_transaction_[self].size()); - while (!is_cancelled() && file_pub_transaction_[self].size() > 0) { + while (!is_canceled() && file_pub_transaction_[self].size() > 0) { std::unique_lock lock(*(get_publishers().get_mutex())); pub_activities_completed_->wait(lock); } - if (is_cancelled()) - throw TransactionCancelledException(XBT_THROW_POINT); + if (is_transaction_canceled(current_pub_transaction_id_)) + throw TransactioncanceledException(XBT_THROW_POINT); XBT_DEBUG("All on-flight publish activities are completed. 
Proceed with the current transaction."); get_file_transport()->clear_to_write_in_transaction(self); } @@ -169,7 +169,7 @@ void FileEngine::pub_close() XBT_DEBUG("[%s] Wait for the completion of %u publish activities from the previous transaction", get_cname(), file_pub_transaction_[self].size()); - while (!is_cancelled() && file_pub_transaction_[self].size() > 0) { + while (!is_transaction_canceled(current_pub_transaction_id_) && file_pub_transaction_[self].size() > 0) { std::unique_lock lock(*(get_publishers().get_mutex())); pub_activities_completed_->wait(lock); } @@ -191,8 +191,8 @@ void FileEngine::pub_close() void FileEngine::begin_sub_transaction() { - if (is_cancelled()) - throw TransactionCancelledException(XBT_THROW_POINT); + if (is_transaction_canceled(current_sub_transaction_id_ + 1)) + throw TransactioncanceledException(XBT_THROW_POINT); // Only one subscriber has to do this if (!sub_transaction_in_progress_) { @@ -204,12 +204,15 @@ void FileEngine::begin_sub_transaction() // We have publishers on that stream, wait for them to complete a transaction first if (not get_publishers().is_empty()) { std::unique_lock lock(*get_subscribers().get_mutex()); - while (!is_cancelled() && completed_pub_transaction_id_ < current_sub_transaction_id_) { + while (!is_transaction_canceled(current_sub_transaction_id_) && + completed_pub_transaction_id_ < current_sub_transaction_id_) { XBT_DEBUG("Wait for publishers to end the transaction I need"); pub_transaction_completed_->wait(lock); } - if (is_cancelled()) - throw TransactionCancelledException(XBT_THROW_POINT); + if (is_transaction_canceled(current_sub_transaction_id_)) { + sub_transaction_in_progress_ = false; + throw TransactioncanceledException(XBT_THROW_POINT); + } XBT_DEBUG("Publishers stored metadata for that transaction, proceed"); } } @@ -221,17 +224,17 @@ void FileEngine::end_sub_transaction() // The files subscribers need to read may not have been fully written. 
Wait to be notified completion of the publish // activities - if (!is_cancelled() && current_sub_transaction_id_ == current_pub_transaction_id_ && - not get_publishers().is_empty()) { + if (!is_transaction_canceled(current_sub_transaction_id_) && + current_sub_transaction_id_ == current_pub_transaction_id_ && not get_publishers().is_empty()) { XBT_DEBUG("Wait for the completion of publish activities from the current transaction"); pub_activities_completed_->wait(std::unique_lock(*get_subscribers().get_mutex())); XBT_DEBUG("All on-flight publish activities are completed. Proceed with the subscribe activities."); } - if (is_cancelled()) { + if (is_transaction_canceled(current_sub_transaction_id_)) { transport->close_sub_files(self); transport->clear_to_read_in_transaction(self); sub_transaction_in_progress_ = false; - throw TransactionCancelledException(XBT_THROW_POINT); + throw TransactioncanceledException(XBT_THROW_POINT); } // Subscriber get the list of files and size to read that has been build during the get() operations @@ -243,12 +246,12 @@ void FileEngine::end_sub_transaction() XBT_DEBUG("Wait for the %d subscribe activities for the transaction", file_sub_transaction_[self].size()); file_sub_transaction_[self].wait_all(); - if (is_cancelled()) { + if (is_transaction_canceled(current_sub_transaction_id_)) { file_sub_transaction_[self].clear(); transport->close_sub_files(self); transport->clear_to_read_in_transaction(self); sub_transaction_in_progress_ = false; - throw TransactionCancelledException(XBT_THROW_POINT); + throw TransactioncanceledException(XBT_THROW_POINT); } file_sub_transaction_[self].clear(); // Close files opened in this transaction diff --git a/src/StagingEngine.cpp b/src/StagingEngine.cpp index 2e292e4..c8d2cdc 100644 --- a/src/StagingEngine.cpp +++ b/src/StagingEngine.cpp @@ -55,8 +55,8 @@ std::shared_ptr StagingEngine::get_staging_transport() const void StagingEngine::begin_pub_transaction() { - if (is_cancelled()) - throw 
TransactionCancelledException(XBT_THROW_POINT); + if (is_transaction_canceled(current_pub_transaction_id_ + 1)) + throw TransactioncanceledException(XBT_THROW_POINT); if (!pub_transaction_in_progress_) { pub_transaction_in_progress_ = true; @@ -78,24 +78,24 @@ void StagingEngine::begin_pub_transaction() try { get_pub_transaction().wait_all(); } catch (const simgrid::NetworkFailureException&) { - if (!is_cancelled()) + if (!is_canceled()) throw; } XBT_DEBUG("All on-flight publish activities are completed. Proceed with the current transaction."); XBT_DEBUG("%u sub activities pending", get_sub_transaction().size()); get_pub_transaction().clear(); - if (is_cancelled()) - throw TransactionCancelledException(XBT_THROW_POINT); + if (is_transaction_canceled(current_pub_transaction_id_)) + throw TransactioncanceledException(XBT_THROW_POINT); } // Then we wait for all subscribers to be at the same transaction - while (!is_cancelled() && + while (!is_transaction_canceled(current_pub_transaction_id_) && (get_subscribers().is_empty() || current_pub_transaction_id_ > current_sub_transaction_id_)) { XBT_DEBUG("Wait for subscribers"); sub_transaction_started_->wait(lock); } - if (is_cancelled()) - throw TransactionCancelledException(XBT_THROW_POINT); + if (is_transaction_canceled(current_pub_transaction_id_)) + throw TransactioncanceledException(XBT_THROW_POINT); // Publisher has been notified by subscribers, it can proceed with the transaction } @@ -146,16 +146,16 @@ void StagingEngine::pub_close() void StagingEngine::begin_sub_transaction() { - if (is_cancelled()) - throw TransactionCancelledException(XBT_THROW_POINT); + if (is_transaction_canceled(current_sub_transaction_id_ + 1)) + throw TransactioncanceledException(XBT_THROW_POINT); if (current_sub_transaction_id_ == 0) { // This is the first transaction // Wait for at least one publisher to start a tran std::unique_lock lock(*get_subscribers().get_mutex()); - while (!is_cancelled() && current_pub_transaction_id_ == 0) + 
while (!is_transaction_canceled(current_sub_transaction_id_ + 1) && current_pub_transaction_id_ == 0) first_pub_transaction_started_->wait(lock); - if (is_cancelled()) - throw TransactionCancelledException(XBT_THROW_POINT); + if (is_transaction_canceled(current_sub_transaction_id_ + 1)) + throw TransactioncanceledException(XBT_THROW_POINT); XBT_DEBUG("Publishers have started a transaction, create rendez-vous points"); // We now know the number of publishers, subscriber can create mailboxes/mqs with publishers get_staging_transport()->create_rendez_vous_points(); @@ -178,10 +178,13 @@ void StagingEngine::begin_sub_transaction() } std::unique_lock lock(*get_subscribers().get_mutex()); - while (!is_cancelled() && completed_pub_transaction_id_ < current_sub_transaction_id_) + while (!is_transaction_canceled(current_sub_transaction_id_) && + completed_pub_transaction_id_ < current_sub_transaction_id_) pub_transaction_completed_->wait(lock); - if (is_cancelled()) - throw TransactionCancelledException(XBT_THROW_POINT); + if (is_transaction_canceled(current_sub_transaction_id_)) { + sub_transaction_in_progress_ = false; + throw TransactioncanceledException(XBT_THROW_POINT); + } } void StagingEngine::end_sub_transaction() @@ -195,19 +198,19 @@ void StagingEngine::end_sub_transaction() try { get_sub_transaction().wait_all(); } catch (const simgrid::CancelException&) { - if (!is_cancelled()) + if (!is_canceled()) throw; get_sub_transaction().clear(); sub_transaction_in_progress_ = false; num_subscribers_starting_--; - throw TransactionCancelledException(XBT_THROW_POINT); + throw TransactioncanceledException(XBT_THROW_POINT); } catch (const simgrid::NetworkFailureException&) { - if (!is_cancelled()) + if (!is_canceled()) throw; get_sub_transaction().clear(); sub_transaction_in_progress_ = false; num_subscribers_starting_--; - throw TransactionCancelledException(XBT_THROW_POINT); + throw TransactioncanceledException(XBT_THROW_POINT); } XBT_DEBUG("All on-flight subscribe 
activities are completed. Proceed with the current transaction."); get_sub_transaction().clear(); diff --git a/src/bindings/python/dtlmod_python.cpp b/src/bindings/python/dtlmod_python.cpp index d3dd35c..e60e7f2 100644 --- a/src/bindings/python/dtlmod_python.cpp +++ b/src/bindings/python/dtlmod_python.cpp @@ -88,7 +88,7 @@ PYBIND11_MODULE(dtlmod, m) py::register_exception(m, "InconsistentCompressionRatioException"); py::register_exception(m, "SubscriberSideCompressionException"); - py::register_exception(m, "TransactionCancelledException"); + py::register_exception(m, "TransactioncanceledException"); /* Class Engine */ py::class_> engine( @@ -108,7 +108,8 @@ PYBIND11_MODULE(dtlmod, m) .def_property_readonly("current_transaction", &Engine::get_current_transaction, "The id of the current transaction on this Engine (read-only)") .def("cancel_transaction", &Engine::cancel_transaction, py::call_guard(), - "Cancel all in-flight activities of the current transaction (must be called from an external actor)") + py::arg("transaction_id"), + "Cancel all in-flight activities of a specific transaction (must be called from an external actor)") .def("close", &Engine::close, py::call_guard(), "Close this Engine"); py::enum_(engine, "Type", "The type of Engine") diff --git a/test/dtl_cancel.cpp b/test/dtl_cancel.cpp index c844349..15c7e42 100644 --- a/test/dtl_cancel.cpp +++ b/test/dtl_cancel.cpp @@ -101,9 +101,9 @@ class DTLCancelTest : public ::testing::Test { }; // Publisher is stuck in begin_transaction() waiting for a subscriber that never shows up. -// An external canceller fires after 0.5s, unblocking the publisher with TransactionCancelledException. -// The subscriber registers but sleeps past the cancellation point, then gets TransactionCancelledException -// immediately on its own begin_transaction() because cancelled_ is already true. +// An external canceller fires after 0.5s, unblocking the publisher with TransactioncanceledException. 
+// The subscriber registers but sleeps past the cancellation point, then gets TransactioncanceledException +// immediately on its own begin_transaction() because canceled_ is already true. TEST_F(DTLCancelTest, CancelStagingTransaction_WaitingForSubscriber_MQ) { DO_TEST_WITH_FORK([this]() { @@ -123,12 +123,12 @@ TEST_F(DTLCancelTest, CancelStagingTransaction_WaitingForSubscriber_MQ) wdog_host->add_actor("Canceller", [engine]() { sg4::this_actor::sleep_for(0.5); XBT_INFO("Cancelling the transaction"); - engine->cancel_transaction(); + engine->cancel_transaction(engine->get_current_transaction()); }); XBT_INFO("Begin transaction (will block waiting for subscriber)"); - ASSERT_THROW(engine->begin_transaction(), dtlmod::TransactionCancelledException); - XBT_INFO("Publisher caught TransactionCancelledException as expected"); + ASSERT_THROW(engine->begin_transaction(), dtlmod::TransactioncanceledException); + XBT_INFO("Publisher caught TransactioncanceledException as expected"); dtlmod::DTL::disconnect(); }); @@ -139,9 +139,9 @@ TEST_F(DTLCancelTest, CancelStagingTransaction_WaitingForSubscriber_MQ) auto var_sub = stream->inquire_variable("var"); sg4::this_actor::sleep_for(2.0); // sleep past the cancellation point - XBT_INFO("Begin transaction (cancelled_ already true)"); - ASSERT_THROW(engine->begin_transaction(), dtlmod::TransactionCancelledException); - XBT_INFO("Subscriber caught TransactionCancelledException as expected"); + XBT_INFO("Begin transaction (canceled_ already true)"); + ASSERT_THROW(engine->begin_transaction(), dtlmod::TransactioncanceledException); + XBT_INFO("Subscriber caught TransactioncanceledException as expected"); dtlmod::DTL::disconnect(); }); @@ -169,12 +169,12 @@ TEST_F(DTLCancelTest, CancelStagingTransaction_WaitingForSubscriber_Mailbox) wdog_host->add_actor("Canceller", [engine]() { sg4::this_actor::sleep_for(0.5); XBT_INFO("Cancelling the transaction"); - engine->cancel_transaction(); + 
engine->cancel_transaction(engine->get_current_transaction()); }); XBT_INFO("Begin transaction (will block waiting for subscriber)"); - ASSERT_THROW(engine->begin_transaction(), dtlmod::TransactionCancelledException); - XBT_INFO("Publisher caught TransactionCancelledException as expected"); + ASSERT_THROW(engine->begin_transaction(), dtlmod::TransactioncanceledException); + XBT_INFO("Publisher caught TransactioncanceledException as expected"); dtlmod::DTL::disconnect(); }); @@ -185,9 +185,9 @@ TEST_F(DTLCancelTest, CancelStagingTransaction_WaitingForSubscriber_Mailbox) auto var_sub = stream->inquire_variable("var"); sg4::this_actor::sleep_for(2.0); - XBT_INFO("Begin transaction (cancelled_ already true)"); - ASSERT_THROW(engine->begin_transaction(), dtlmod::TransactionCancelledException); - XBT_INFO("Subscriber caught TransactionCancelledException as expected"); + XBT_INFO("Begin transaction (canceled_ already true)"); + ASSERT_THROW(engine->begin_transaction(), dtlmod::TransactioncanceledException); + XBT_INFO("Subscriber caught TransactioncanceledException as expected"); dtlmod::DTL::disconnect(); }); @@ -198,7 +198,7 @@ TEST_F(DTLCancelTest, CancelStagingTransaction_WaitingForSubscriber_Mailbox) // Subscriber is stuck in begin_transaction() waiting for the publisher to start a transaction. // Publisher opens the stream but never calls begin_transaction(). // Canceller fires after 0.5s, unblocking the subscriber. -// Publisher then gets TransactionCancelledException immediately on its begin_transaction(). +// Publisher then gets TransactioncanceledException immediately on its begin_transaction(). 
TEST_F(DTLCancelTest, CancelStagingTransaction_WaitingForPublisher_MQ) { DO_TEST_WITH_FORK([this]() { @@ -218,13 +218,13 @@ TEST_F(DTLCancelTest, CancelStagingTransaction_WaitingForPublisher_MQ) wdog_host->add_actor("Canceller", [engine]() { sg4::this_actor::sleep_for(0.5); XBT_INFO("Cancelling the transaction"); - engine->cancel_transaction(); + engine->cancel_transaction(engine->get_current_transaction()); }); sg4::this_actor::sleep_for(2.0); // sleep past the cancellation point - XBT_INFO("Begin transaction (cancelled_ already true)"); - ASSERT_THROW(engine->begin_transaction(), dtlmod::TransactionCancelledException); - XBT_INFO("Publisher caught TransactionCancelledException as expected"); + XBT_INFO("Begin transaction (canceled_ already true)"); + ASSERT_THROW(engine->begin_transaction(), dtlmod::TransactioncanceledException); + XBT_INFO("Publisher caught TransactioncanceledException as expected"); dtlmod::DTL::disconnect(); }); @@ -235,8 +235,8 @@ TEST_F(DTLCancelTest, CancelStagingTransaction_WaitingForPublisher_MQ) auto var_sub = stream->inquire_variable("var"); XBT_INFO("Begin transaction (will block waiting for publisher to start a transaction)"); - ASSERT_THROW(engine->begin_transaction(), dtlmod::TransactionCancelledException); - XBT_INFO("Subscriber caught TransactionCancelledException as expected"); + ASSERT_THROW(engine->begin_transaction(), dtlmod::TransactioncanceledException); + XBT_INFO("Subscriber caught TransactioncanceledException as expected"); dtlmod::DTL::disconnect(); }); @@ -266,13 +266,13 @@ TEST_F(DTLCancelTest, CancelFileEngineTransaction_WaitingForPublisher) wdog_host->add_actor("Canceller", [engine]() { sg4::this_actor::sleep_for(0.5); XBT_INFO("Cancelling the transaction"); - engine->cancel_transaction(); + engine->cancel_transaction(engine->get_current_transaction()); }); sg4::this_actor::sleep_for(2.0); // sleep past the cancellation point - XBT_INFO("Begin transaction (cancelled_ already true)"); - 
ASSERT_THROW(engine->begin_transaction(), dtlmod::TransactionCancelledException); - XBT_INFO("Publisher caught TransactionCancelledException as expected"); + XBT_INFO("Begin transaction (canceled_ already true)"); + ASSERT_THROW(engine->begin_transaction(), dtlmod::TransactioncanceledException); + XBT_INFO("Publisher caught TransactioncanceledException as expected"); dtlmod::DTL::disconnect(); }); @@ -283,8 +283,8 @@ TEST_F(DTLCancelTest, CancelFileEngineTransaction_WaitingForPublisher) auto var_sub = stream->inquire_variable("var"); XBT_INFO("Begin transaction (will block waiting for publisher to complete a transaction)"); - ASSERT_THROW(engine->begin_transaction(), dtlmod::TransactionCancelledException); - XBT_INFO("Subscriber caught TransactionCancelledException as expected"); + ASSERT_THROW(engine->begin_transaction(), dtlmod::TransactioncanceledException); + XBT_INFO("Subscriber caught TransactioncanceledException as expected"); dtlmod::DTL::disconnect(); }); @@ -295,7 +295,7 @@ TEST_F(DTLCancelTest, CancelFileEngineTransaction_WaitingForPublisher) // Publisher and subscriber are both engaged in a long Mailbox transfer (Mailbox simulates bandwidth; MQ does not). // Publisher completes T1 end_transaction() (starting slow async Comms) then blocks in T2 begin_transaction() // waiting for T1 sends to complete. Subscriber blocks in T1 end_transaction() waiting for receives. -// Canceller fires after 0.5s, unblocking both with TransactionCancelledException. +// Canceller fires after 0.5s, unblocking both with TransactioncanceledException. 
TEST_F(DTLCancelTest, CancelStagingTransaction_MidTransaction_Mailbox) { DO_TEST_WITH_FORK([this]() { @@ -315,7 +315,7 @@ TEST_F(DTLCancelTest, CancelStagingTransaction_MidTransaction_Mailbox) wdog_host->add_actor("Canceller", [engine]() { sg4::this_actor::sleep_for(0.5); XBT_INFO("Cancelling the transaction"); - engine->cancel_transaction(); + engine->cancel_transaction(engine->get_current_transaction()); }); // T1: completes normally, starting slow async Comms over the 1MBps internet link @@ -323,10 +323,10 @@ TEST_F(DTLCancelTest, CancelStagingTransaction_MidTransaction_Mailbox) engine->put(var); engine->end_transaction(); - // T2: blocks waiting for T1 Comms to drain -- cancelled mid-transfer + // T2: blocks waiting for T1 Comms to drain -- canceled mid-transfer XBT_INFO("Begin T2 (will block waiting for T1 sends to complete over slow link)"); - ASSERT_THROW(engine->begin_transaction(), dtlmod::TransactionCancelledException); - XBT_INFO("Publisher caught TransactionCancelledException in T2 begin_transaction() as expected"); + ASSERT_THROW(engine->begin_transaction(), dtlmod::TransactioncanceledException); + XBT_INFO("Publisher caught TransactioncanceledException in T2 begin_transaction() as expected"); dtlmod::DTL::disconnect(); }); @@ -336,12 +336,12 @@ TEST_F(DTLCancelTest, CancelStagingTransaction_MidTransaction_Mailbox) auto engine = stream->open("my-output", dtlmod::Stream::Mode::Subscribe); auto var_sub = stream->inquire_variable("var"); - // T1: blocks in end_transaction() waiting for slow receives -- cancelled mid-transfer + // T1: blocks in end_transaction() waiting for slow receives -- canceled mid-transfer engine->begin_transaction(); engine->get(var_sub); XBT_INFO("End T1 (will block waiting for receives over slow link)"); - ASSERT_THROW(engine->end_transaction(), dtlmod::TransactionCancelledException); - XBT_INFO("Subscriber caught TransactionCancelledException in T1 end_transaction() as expected"); + ASSERT_THROW(engine->end_transaction(), 
dtlmod::TransactioncanceledException); + XBT_INFO("Subscriber caught TransactioncanceledException in T1 end_transaction() as expected"); dtlmod::DTL::disconnect(); }); diff --git a/test/python/dtl_cancel.py b/test/python/dtl_cancel.py index 3419365..f16840d 100644 --- a/test/python/dtl_cancel.py +++ b/test/python/dtl_cancel.py @@ -7,7 +7,7 @@ import sys import multiprocessing from simgrid import Engine, Host, this_actor, LinkInRoute -from dtlmod import DTL, Engine as DTLEngine, Stream, Transport, TransactionCancelledException +from dtlmod import DTL, Engine as DTLEngine, Stream, Transport, TransactioncanceledException def add_cluster(root, suffix, num_hosts): @@ -87,7 +87,7 @@ def setup_file_platform(): # Publisher stuck in begin_transaction() waiting for a subscriber that never shows up. -# Canceller fires at t=0.5s. Publisher catches TransactionCancelledException. +# Canceller fires at t=0.5s. Publisher catches TransactioncanceledException. # Subscriber sleeps past the cancel point, then gets it immediately on its begin_transaction(). 
def run_test_cancel_staging_waiting_for_subscriber_mq(): e = setup_staging_platform() @@ -106,9 +106,9 @@ def pub_actor(): this_actor.info("Begin transaction (will block waiting for subscriber)") try: engine.begin_transaction() - assert False, "Expected TransactionCancelledException" - except TransactionCancelledException: - this_actor.info("Publisher caught TransactionCancelledException as expected") + assert False, "Expected TransactioncanceledException" + except TransactioncanceledException: + this_actor.info("Publisher caught TransactioncanceledException as expected") DTL.disconnect() def sub_actor(): @@ -117,18 +117,18 @@ def sub_actor(): engine = stream.open("my-output", Stream.Mode.Subscribe) stream.inquire_variable("var") this_actor.sleep_for(2.0) - this_actor.info("Begin transaction (cancelled_ already true)") + this_actor.info("Begin transaction (canceled_ already true)") try: engine.begin_transaction() - assert False, "Expected TransactionCancelledException" - except TransactionCancelledException: - this_actor.info("Subscriber caught TransactionCancelledException as expected") + assert False, "Expected TransactioncanceledException" + except TransactioncanceledException: + this_actor.info("Subscriber caught TransactioncanceledException as expected") DTL.disconnect() def canceller_actor(): this_actor.sleep_for(0.5) this_actor.info("Cancelling the transaction") - engine_ref[0].cancel_transaction() + engine_ref[0].cancel_transaction(engine_ref[0].current_transaction) Host.by_name("host-0.prod").add_actor("PubTestActor", pub_actor) Host.by_name("host-0.cons").add_actor("SubTestActor", sub_actor) @@ -153,9 +153,9 @@ def pub_actor(): this_actor.info("Begin transaction (will block waiting for subscriber)") try: engine.begin_transaction() - assert False, "Expected TransactionCancelledException" - except TransactionCancelledException: - this_actor.info("Publisher caught TransactionCancelledException as expected") + assert False, "Expected 
TransactioncanceledException" + except TransactioncanceledException: + this_actor.info("Publisher caught TransactioncanceledException as expected") DTL.disconnect() def sub_actor(): @@ -164,18 +164,18 @@ def sub_actor(): engine = stream.open("my-output", Stream.Mode.Subscribe) stream.inquire_variable("var") this_actor.sleep_for(2.0) - this_actor.info("Begin transaction (cancelled_ already true)") + this_actor.info("Begin transaction (canceled_ already true)") try: engine.begin_transaction() - assert False, "Expected TransactionCancelledException" - except TransactionCancelledException: - this_actor.info("Subscriber caught TransactionCancelledException as expected") + assert False, "Expected TransactioncanceledException" + except TransactioncanceledException: + this_actor.info("Subscriber caught TransactioncanceledException as expected") DTL.disconnect() def canceller_actor(): this_actor.sleep_for(0.5) this_actor.info("Cancelling the transaction") - engine_ref[0].cancel_transaction() + engine_ref[0].cancel_transaction(engine_ref[0].current_transaction) Host.by_name("host-0.prod").add_actor("PubTestActor", pub_actor) Host.by_name("host-0.cons").add_actor("SubTestActor", sub_actor) @@ -184,7 +184,7 @@ def canceller_actor(): # Subscriber stuck in begin_transaction() waiting for the publisher to start a transaction. # Publisher opens the stream but sleeps before calling begin_transaction(). -# Canceller fires at t=0.5s. Subscriber catches TransactionCancelledException. +# Canceller fires at t=0.5s. Subscriber catches TransactioncanceledException. 
def run_test_cancel_staging_waiting_for_publisher_mq(): e = setup_staging_platform() engine_ref = [None] @@ -200,12 +200,12 @@ def pub_actor(): Host.by_name("host-1.prod").add_actor("Canceller", canceller_actor) this_actor.sleep_for(2.0) - this_actor.info("Begin transaction (cancelled_ already true)") + this_actor.info("Begin transaction (canceled_ already true)") try: engine.begin_transaction() - assert False, "Expected TransactionCancelledException" - except TransactionCancelledException: - this_actor.info("Publisher caught TransactionCancelledException as expected") + assert False, "Expected TransactioncanceledException" + except TransactioncanceledException: + this_actor.info("Publisher caught TransactioncanceledException as expected") DTL.disconnect() def sub_actor(): @@ -216,15 +216,15 @@ def sub_actor(): this_actor.info("Begin transaction (will block waiting for publisher to start a transaction)") try: engine.begin_transaction() - assert False, "Expected TransactionCancelledException" - except TransactionCancelledException: - this_actor.info("Subscriber caught TransactionCancelledException as expected") + assert False, "Expected TransactioncanceledException" + except TransactioncanceledException: + this_actor.info("Subscriber caught TransactioncanceledException as expected") DTL.disconnect() def canceller_actor(): this_actor.sleep_for(0.5) this_actor.info("Cancelling the transaction") - engine_ref[0].cancel_transaction() + engine_ref[0].cancel_transaction(engine_ref[0].current_transaction) Host.by_name("host-0.prod").add_actor("PubTestActor", pub_actor) Host.by_name("host-0.cons").add_actor("SubTestActor", sub_actor) @@ -233,7 +233,7 @@ def canceller_actor(): # FileEngine: subscriber stuck in begin_transaction() waiting for publisher to complete a transaction. # Publisher opens the stream but sleeps before calling begin_transaction(). -# Canceller fires at t=0.5s. Subscriber catches TransactionCancelledException. +# Canceller fires at t=0.5s. 
Subscriber catches TransactioncanceledException. def run_test_cancel_file_engine_waiting_for_publisher(): e = setup_file_platform() engine_ref = [None] @@ -249,12 +249,12 @@ def pub_actor(): Host.by_name("node-2").add_actor("Canceller", canceller_actor) this_actor.sleep_for(2.0) - this_actor.info("Begin transaction (cancelled_ already true)") + this_actor.info("Begin transaction (canceled_ already true)") try: engine.begin_transaction() - assert False, "Expected TransactionCancelledException" - except TransactionCancelledException: - this_actor.info("Publisher caught TransactionCancelledException as expected") + assert False, "Expected TransactioncanceledException" + except TransactioncanceledException: + this_actor.info("Publisher caught TransactioncanceledException as expected") DTL.disconnect() def sub_actor(): @@ -265,15 +265,15 @@ def sub_actor(): this_actor.info("Begin transaction (will block waiting for publisher to complete a transaction)") try: engine.begin_transaction() - assert False, "Expected TransactionCancelledException" - except TransactionCancelledException: - this_actor.info("Subscriber caught TransactionCancelledException as expected") + assert False, "Expected TransactioncanceledException" + except TransactioncanceledException: + this_actor.info("Subscriber caught TransactioncanceledException as expected") DTL.disconnect() def canceller_actor(): this_actor.sleep_for(0.5) this_actor.info("Cancelling the transaction") - engine_ref[0].cancel_transaction() + engine_ref[0].cancel_transaction(engine_ref[0].current_transaction) Host.by_name("node-0").add_actor("PubTestActor", pub_actor) Host.by_name("node-1").add_actor("SubTestActor", sub_actor) @@ -283,7 +283,7 @@ def canceller_actor(): # Publisher and subscriber are both engaged in a long Mailbox transfer. # Publisher completes T1 end_transaction() (starting slow async comms) then blocks in T2 # begin_transaction() waiting for T1 sends to complete. 
Subscriber blocks in T1 end_transaction() -# waiting for receives. Canceller fires at t=0.5s, unblocking both with TransactionCancelledException. +# waiting for receives. Canceller fires at t=0.5s, unblocking both with TransactioncanceledException. def run_test_cancel_staging_mid_transaction_mailbox(): e = setup_slow_staging_platform() engine_ref = [None] @@ -303,13 +303,13 @@ def pub_actor(): engine.put(var) engine.end_transaction() - # T2: blocks waiting for T1 sends to complete -- gets cancelled + # T2: blocks waiting for T1 sends to complete -- gets canceled this_actor.info("Begin T2 (will block waiting for T1 sends to complete over slow link)") try: engine.begin_transaction() - assert False, "Expected TransactionCancelledException" - except TransactionCancelledException: - this_actor.info("Publisher caught TransactionCancelledException in T2 begin_transaction() as expected") + assert False, "Expected TransactioncanceledException" + except TransactioncanceledException: + this_actor.info("Publisher caught TransactioncanceledException in T2 begin_transaction() as expected") DTL.disconnect() def sub_actor(): @@ -324,15 +324,15 @@ def sub_actor(): this_actor.info("End T1 (will block waiting for receives over slow link)") try: engine.end_transaction() - assert False, "Expected TransactionCancelledException" - except TransactionCancelledException: - this_actor.info("Subscriber caught TransactionCancelledException in T1 end_transaction() as expected") + assert False, "Expected TransactioncanceledException" + except TransactioncanceledException: + this_actor.info("Subscriber caught TransactioncanceledException in T1 end_transaction() as expected") DTL.disconnect() def canceller_actor(): this_actor.sleep_for(0.5) this_actor.info("Cancelling the transaction") - engine_ref[0].cancel_transaction() + engine_ref[0].cancel_transaction(engine_ref[0].current_transaction) Host.by_name("host-0.prod").add_actor("PubTestActor", pub_actor) 
Host.by_name("host-0.cons").add_actor("SubTestActor", sub_actor) From 3879071645ebac5d985a74531c85802b934ce3ae Mon Sep 17 00:00:00 2001 From: Fred Suter Date: Mon, 4 May 2026 22:16:38 -0400 Subject: [PATCH 72/92] CamelCase --- include/dtlmod/DTLException.hpp | 2 +- src/FileEngine.cpp | 12 ++--- src/StagingEngine.cpp | 16 +++--- src/bindings/python/dtlmod_python.cpp | 2 +- test/dtl_cancel.cpp | 48 +++++++++--------- test/python/dtl_cancel.py | 70 +++++++++++++-------------- 6 files changed, 75 insertions(+), 75 deletions(-) diff --git a/include/dtlmod/DTLException.hpp b/include/dtlmod/DTLException.hpp index 1135233..7979045 100644 --- a/include/dtlmod/DTLException.hpp +++ b/include/dtlmod/DTLException.hpp @@ -70,7 +70,7 @@ DECLARE_DTLMOD_EXCEPTION(UnknownCompressionOptionException, "Unknown Compression DECLARE_DTLMOD_EXCEPTION(InconsistentCompressionRatioException, "Inconsistent Compression ratio"); DECLARE_DTLMOD_EXCEPTION(SubscriberSideCompressionException, "Compression can only be applied on the publisher side"); -DECLARE_DTLMOD_EXCEPTION(TransactioncanceledException, "Transaction canceled"); +DECLARE_DTLMOD_EXCEPTION(TransactionCanceledException, "Transaction canceled"); } // namespace dtlmod diff --git a/src/FileEngine.cpp b/src/FileEngine.cpp index f3554ce..384b469 100644 --- a/src/FileEngine.cpp +++ b/src/FileEngine.cpp @@ -101,7 +101,7 @@ std::string FileEngine::get_path_to_dataset() const void FileEngine::begin_pub_transaction() { if (is_transaction_canceled(current_pub_transaction_id_ + 1)) - throw TransactioncanceledException(XBT_THROW_POINT); + throw TransactionCanceledException(XBT_THROW_POINT); auto self = sg4::Actor::self(); @@ -120,7 +120,7 @@ void FileEngine::begin_pub_transaction() pub_activities_completed_->wait(lock); } if (is_transaction_canceled(current_pub_transaction_id_)) - throw TransactioncanceledException(XBT_THROW_POINT); + throw TransactionCanceledException(XBT_THROW_POINT); XBT_DEBUG("All on-flight publish activities are completed. 
Proceed with the current transaction."); get_file_transport()->clear_to_write_in_transaction(self); } @@ -192,7 +192,7 @@ void FileEngine::pub_close() void FileEngine::begin_sub_transaction() { if (is_transaction_canceled(current_sub_transaction_id_ + 1)) - throw TransactioncanceledException(XBT_THROW_POINT); + throw TransactionCanceledException(XBT_THROW_POINT); // Only one subscriber has to do this if (!sub_transaction_in_progress_) { @@ -211,7 +211,7 @@ void FileEngine::begin_sub_transaction() } if (is_transaction_canceled(current_sub_transaction_id_)) { sub_transaction_in_progress_ = false; - throw TransactioncanceledException(XBT_THROW_POINT); + throw TransactionCanceledException(XBT_THROW_POINT); } XBT_DEBUG("Publishers stored metadata for that transaction, proceed"); } @@ -234,7 +234,7 @@ void FileEngine::end_sub_transaction() transport->close_sub_files(self); transport->clear_to_read_in_transaction(self); sub_transaction_in_progress_ = false; - throw TransactioncanceledException(XBT_THROW_POINT); + throw TransactionCanceledException(XBT_THROW_POINT); } // Subscriber get the list of files and size to read that has been build during the get() operations @@ -251,7 +251,7 @@ void FileEngine::end_sub_transaction() transport->close_sub_files(self); transport->clear_to_read_in_transaction(self); sub_transaction_in_progress_ = false; - throw TransactioncanceledException(XBT_THROW_POINT); + throw TransactionCanceledException(XBT_THROW_POINT); } file_sub_transaction_[self].clear(); // Close files opened in this transaction diff --git a/src/StagingEngine.cpp b/src/StagingEngine.cpp index c8d2cdc..26a1823 100644 --- a/src/StagingEngine.cpp +++ b/src/StagingEngine.cpp @@ -56,7 +56,7 @@ std::shared_ptr StagingEngine::get_staging_transport() const void StagingEngine::begin_pub_transaction() { if (is_transaction_canceled(current_pub_transaction_id_ + 1)) - throw TransactioncanceledException(XBT_THROW_POINT); + throw TransactionCanceledException(XBT_THROW_POINT); if 
(!pub_transaction_in_progress_) { pub_transaction_in_progress_ = true; @@ -85,7 +85,7 @@ void StagingEngine::begin_pub_transaction() XBT_DEBUG("%u sub activities pending", get_sub_transaction().size()); get_pub_transaction().clear(); if (is_transaction_canceled(current_pub_transaction_id_)) - throw TransactioncanceledException(XBT_THROW_POINT); + throw TransactionCanceledException(XBT_THROW_POINT); } // Then we wait for all subscribers to be at the same transaction @@ -95,7 +95,7 @@ void StagingEngine::begin_pub_transaction() sub_transaction_started_->wait(lock); } if (is_transaction_canceled(current_pub_transaction_id_)) - throw TransactioncanceledException(XBT_THROW_POINT); + throw TransactionCanceledException(XBT_THROW_POINT); // Publisher has been notified by subscribers, it can proceed with the transaction } @@ -147,7 +147,7 @@ void StagingEngine::pub_close() void StagingEngine::begin_sub_transaction() { if (is_transaction_canceled(current_sub_transaction_id_ + 1)) - throw TransactioncanceledException(XBT_THROW_POINT); + throw TransactionCanceledException(XBT_THROW_POINT); if (current_sub_transaction_id_ == 0) { // This is the first transaction // Wait for at least one publisher to start a tran @@ -155,7 +155,7 @@ void StagingEngine::begin_sub_transaction() while (!is_transaction_canceled(current_sub_transaction_id_ + 1) && current_pub_transaction_id_ == 0) first_pub_transaction_started_->wait(lock); if (is_transaction_canceled(current_sub_transaction_id_ + 1)) - throw TransactioncanceledException(XBT_THROW_POINT); + throw TransactionCanceledException(XBT_THROW_POINT); XBT_DEBUG("Publishers have started a transaction, create rendez-vous points"); // We now know the number of publishers, subscriber can create mailboxes/mqs with publishers get_staging_transport()->create_rendez_vous_points(); @@ -183,7 +183,7 @@ void StagingEngine::begin_sub_transaction() pub_transaction_completed_->wait(lock); if (is_transaction_canceled(current_sub_transaction_id_)) { 
sub_transaction_in_progress_ = false; - throw TransactioncanceledException(XBT_THROW_POINT); + throw TransactionCanceledException(XBT_THROW_POINT); } } @@ -203,14 +203,14 @@ void StagingEngine::end_sub_transaction() get_sub_transaction().clear(); sub_transaction_in_progress_ = false; num_subscribers_starting_--; - throw TransactioncanceledException(XBT_THROW_POINT); + throw TransactionCanceledException(XBT_THROW_POINT); } catch (const simgrid::NetworkFailureException&) { if (!is_canceled()) throw; get_sub_transaction().clear(); sub_transaction_in_progress_ = false; num_subscribers_starting_--; - throw TransactioncanceledException(XBT_THROW_POINT); + throw TransactionCanceledException(XBT_THROW_POINT); } XBT_DEBUG("All on-flight subscribe activities are completed. Proceed with the current transaction."); get_sub_transaction().clear(); diff --git a/src/bindings/python/dtlmod_python.cpp b/src/bindings/python/dtlmod_python.cpp index e60e7f2..9c0145f 100644 --- a/src/bindings/python/dtlmod_python.cpp +++ b/src/bindings/python/dtlmod_python.cpp @@ -88,7 +88,7 @@ PYBIND11_MODULE(dtlmod, m) py::register_exception(m, "InconsistentCompressionRatioException"); py::register_exception(m, "SubscriberSideCompressionException"); - py::register_exception(m, "TransactioncanceledException"); + py::register_exception(m, "TransactionCanceledException"); /* Class Engine */ py::class_> engine( diff --git a/test/dtl_cancel.cpp b/test/dtl_cancel.cpp index 15c7e42..b25ac62 100644 --- a/test/dtl_cancel.cpp +++ b/test/dtl_cancel.cpp @@ -101,8 +101,8 @@ class DTLCancelTest : public ::testing::Test { }; // Publisher is stuck in begin_transaction() waiting for a subscriber that never shows up. -// An external canceller fires after 0.5s, unblocking the publisher with TransactioncanceledException. 
-// The subscriber registers but sleeps past the cancellation point, then gets TransactioncanceledException +// An external canceller fires after 0.5s, unblocking the publisher with TransactionCanceledException. +// The subscriber registers but sleeps past the cancellation point, then gets TransactionCanceledException // immediately on its own begin_transaction() because canceled_ is already true. TEST_F(DTLCancelTest, CancelStagingTransaction_WaitingForSubscriber_MQ) { @@ -127,8 +127,8 @@ TEST_F(DTLCancelTest, CancelStagingTransaction_WaitingForSubscriber_MQ) }); XBT_INFO("Begin transaction (will block waiting for subscriber)"); - ASSERT_THROW(engine->begin_transaction(), dtlmod::TransactioncanceledException); - XBT_INFO("Publisher caught TransactioncanceledException as expected"); + ASSERT_THROW(engine->begin_transaction(), dtlmod::TransactionCanceledException); + XBT_INFO("Publisher caught TransactionCanceledException as expected"); dtlmod::DTL::disconnect(); }); @@ -140,8 +140,8 @@ TEST_F(DTLCancelTest, CancelStagingTransaction_WaitingForSubscriber_MQ) sg4::this_actor::sleep_for(2.0); // sleep past the cancellation point XBT_INFO("Begin transaction (canceled_ already true)"); - ASSERT_THROW(engine->begin_transaction(), dtlmod::TransactioncanceledException); - XBT_INFO("Subscriber caught TransactioncanceledException as expected"); + ASSERT_THROW(engine->begin_transaction(), dtlmod::TransactionCanceledException); + XBT_INFO("Subscriber caught TransactionCanceledException as expected"); dtlmod::DTL::disconnect(); }); @@ -173,8 +173,8 @@ TEST_F(DTLCancelTest, CancelStagingTransaction_WaitingForSubscriber_Mailbox) }); XBT_INFO("Begin transaction (will block waiting for subscriber)"); - ASSERT_THROW(engine->begin_transaction(), dtlmod::TransactioncanceledException); - XBT_INFO("Publisher caught TransactioncanceledException as expected"); + ASSERT_THROW(engine->begin_transaction(), dtlmod::TransactionCanceledException); + XBT_INFO("Publisher caught 
TransactionCanceledException as expected"); dtlmod::DTL::disconnect(); }); @@ -186,8 +186,8 @@ TEST_F(DTLCancelTest, CancelStagingTransaction_WaitingForSubscriber_Mailbox) sg4::this_actor::sleep_for(2.0); XBT_INFO("Begin transaction (canceled_ already true)"); - ASSERT_THROW(engine->begin_transaction(), dtlmod::TransactioncanceledException); - XBT_INFO("Subscriber caught TransactioncanceledException as expected"); + ASSERT_THROW(engine->begin_transaction(), dtlmod::TransactionCanceledException); + XBT_INFO("Subscriber caught TransactionCanceledException as expected"); dtlmod::DTL::disconnect(); }); @@ -198,7 +198,7 @@ TEST_F(DTLCancelTest, CancelStagingTransaction_WaitingForSubscriber_Mailbox) // Subscriber is stuck in begin_transaction() waiting for the publisher to start a transaction. // Publisher opens the stream but never calls begin_transaction(). // Canceller fires after 0.5s, unblocking the subscriber. -// Publisher then gets TransactioncanceledException immediately on its begin_transaction(). +// Publisher then gets TransactionCanceledException immediately on its begin_transaction(). 
TEST_F(DTLCancelTest, CancelStagingTransaction_WaitingForPublisher_MQ) { DO_TEST_WITH_FORK([this]() { @@ -223,8 +223,8 @@ TEST_F(DTLCancelTest, CancelStagingTransaction_WaitingForPublisher_MQ) sg4::this_actor::sleep_for(2.0); // sleep past the cancellation point XBT_INFO("Begin transaction (canceled_ already true)"); - ASSERT_THROW(engine->begin_transaction(), dtlmod::TransactioncanceledException); - XBT_INFO("Publisher caught TransactioncanceledException as expected"); + ASSERT_THROW(engine->begin_transaction(), dtlmod::TransactionCanceledException); + XBT_INFO("Publisher caught TransactionCanceledException as expected"); dtlmod::DTL::disconnect(); }); @@ -235,8 +235,8 @@ TEST_F(DTLCancelTest, CancelStagingTransaction_WaitingForPublisher_MQ) auto var_sub = stream->inquire_variable("var"); XBT_INFO("Begin transaction (will block waiting for publisher to start a transaction)"); - ASSERT_THROW(engine->begin_transaction(), dtlmod::TransactioncanceledException); - XBT_INFO("Subscriber caught TransactioncanceledException as expected"); + ASSERT_THROW(engine->begin_transaction(), dtlmod::TransactionCanceledException); + XBT_INFO("Subscriber caught TransactionCanceledException as expected"); dtlmod::DTL::disconnect(); }); @@ -271,8 +271,8 @@ TEST_F(DTLCancelTest, CancelFileEngineTransaction_WaitingForPublisher) sg4::this_actor::sleep_for(2.0); // sleep past the cancellation point XBT_INFO("Begin transaction (canceled_ already true)"); - ASSERT_THROW(engine->begin_transaction(), dtlmod::TransactioncanceledException); - XBT_INFO("Publisher caught TransactioncanceledException as expected"); + ASSERT_THROW(engine->begin_transaction(), dtlmod::TransactionCanceledException); + XBT_INFO("Publisher caught TransactionCanceledException as expected"); dtlmod::DTL::disconnect(); }); @@ -283,8 +283,8 @@ TEST_F(DTLCancelTest, CancelFileEngineTransaction_WaitingForPublisher) auto var_sub = stream->inquire_variable("var"); XBT_INFO("Begin transaction (will block waiting for publisher to 
complete a transaction)"); - ASSERT_THROW(engine->begin_transaction(), dtlmod::TransactioncanceledException); - XBT_INFO("Subscriber caught TransactioncanceledException as expected"); + ASSERT_THROW(engine->begin_transaction(), dtlmod::TransactionCanceledException); + XBT_INFO("Subscriber caught TransactionCanceledException as expected"); dtlmod::DTL::disconnect(); }); @@ -295,7 +295,7 @@ TEST_F(DTLCancelTest, CancelFileEngineTransaction_WaitingForPublisher) // Publisher and subscriber are both engaged in a long Mailbox transfer (Mailbox simulates bandwidth; MQ does not). // Publisher completes T1 end_transaction() (starting slow async Comms) then blocks in T2 begin_transaction() // waiting for T1 sends to complete. Subscriber blocks in T1 end_transaction() waiting for receives. -// Canceller fires after 0.5s, unblocking both with TransactioncanceledException. +// Canceller fires after 0.5s, unblocking both with TransactionCanceledException. TEST_F(DTLCancelTest, CancelStagingTransaction_MidTransaction_Mailbox) { DO_TEST_WITH_FORK([this]() { @@ -325,8 +325,8 @@ TEST_F(DTLCancelTest, CancelStagingTransaction_MidTransaction_Mailbox) // T2: blocks waiting for T1 Comms to drain -- canceled mid-transfer XBT_INFO("Begin T2 (will block waiting for T1 sends to complete over slow link)"); - ASSERT_THROW(engine->begin_transaction(), dtlmod::TransactioncanceledException); - XBT_INFO("Publisher caught TransactioncanceledException in T2 begin_transaction() as expected"); + ASSERT_THROW(engine->begin_transaction(), dtlmod::TransactionCanceledException); + XBT_INFO("Publisher caught TransactionCanceledException in T2 begin_transaction() as expected"); dtlmod::DTL::disconnect(); }); @@ -340,8 +340,8 @@ TEST_F(DTLCancelTest, CancelStagingTransaction_MidTransaction_Mailbox) engine->begin_transaction(); engine->get(var_sub); XBT_INFO("End T1 (will block waiting for receives over slow link)"); - ASSERT_THROW(engine->end_transaction(), dtlmod::TransactioncanceledException); - 
XBT_INFO("Subscriber caught TransactioncanceledException in T1 end_transaction() as expected"); + ASSERT_THROW(engine->end_transaction(), dtlmod::TransactionCanceledException); + XBT_INFO("Subscriber caught TransactionCanceledException in T1 end_transaction() as expected"); dtlmod::DTL::disconnect(); }); diff --git a/test/python/dtl_cancel.py b/test/python/dtl_cancel.py index f16840d..8573b4a 100644 --- a/test/python/dtl_cancel.py +++ b/test/python/dtl_cancel.py @@ -7,7 +7,7 @@ import sys import multiprocessing from simgrid import Engine, Host, this_actor, LinkInRoute -from dtlmod import DTL, Engine as DTLEngine, Stream, Transport, TransactioncanceledException +from dtlmod import DTL, Engine as DTLEngine, Stream, Transport, TransactionCanceledException def add_cluster(root, suffix, num_hosts): @@ -87,7 +87,7 @@ def setup_file_platform(): # Publisher stuck in begin_transaction() waiting for a subscriber that never shows up. -# Canceller fires at t=0.5s. Publisher catches TransactioncanceledException. +# Canceller fires at t=0.5s. Publisher catches TransactionCanceledException. # Subscriber sleeps past the cancel point, then gets it immediately on its begin_transaction(). 
def run_test_cancel_staging_waiting_for_subscriber_mq(): e = setup_staging_platform() @@ -106,9 +106,9 @@ def pub_actor(): this_actor.info("Begin transaction (will block waiting for subscriber)") try: engine.begin_transaction() - assert False, "Expected TransactioncanceledException" - except TransactioncanceledException: - this_actor.info("Publisher caught TransactioncanceledException as expected") + assert False, "Expected TransactionCanceledException" + except TransactionCanceledException: + this_actor.info("Publisher caught TransactionCanceledException as expected") DTL.disconnect() def sub_actor(): @@ -120,9 +120,9 @@ def sub_actor(): this_actor.info("Begin transaction (canceled_ already true)") try: engine.begin_transaction() - assert False, "Expected TransactioncanceledException" - except TransactioncanceledException: - this_actor.info("Subscriber caught TransactioncanceledException as expected") + assert False, "Expected TransactionCanceledException" + except TransactionCanceledException: + this_actor.info("Subscriber caught TransactionCanceledException as expected") DTL.disconnect() def canceller_actor(): @@ -153,9 +153,9 @@ def pub_actor(): this_actor.info("Begin transaction (will block waiting for subscriber)") try: engine.begin_transaction() - assert False, "Expected TransactioncanceledException" - except TransactioncanceledException: - this_actor.info("Publisher caught TransactioncanceledException as expected") + assert False, "Expected TransactionCanceledException" + except TransactionCanceledException: + this_actor.info("Publisher caught TransactionCanceledException as expected") DTL.disconnect() def sub_actor(): @@ -167,9 +167,9 @@ def sub_actor(): this_actor.info("Begin transaction (canceled_ already true)") try: engine.begin_transaction() - assert False, "Expected TransactioncanceledException" - except TransactioncanceledException: - this_actor.info("Subscriber caught TransactioncanceledException as expected") + assert False, "Expected 
TransactionCanceledException" + except TransactionCanceledException: + this_actor.info("Subscriber caught TransactionCanceledException as expected") DTL.disconnect() def canceller_actor(): @@ -184,7 +184,7 @@ def canceller_actor(): # Subscriber stuck in begin_transaction() waiting for the publisher to start a transaction. # Publisher opens the stream but sleeps before calling begin_transaction(). -# Canceller fires at t=0.5s. Subscriber catches TransactioncanceledException. +# Canceller fires at t=0.5s. Subscriber catches TransactionCanceledException. def run_test_cancel_staging_waiting_for_publisher_mq(): e = setup_staging_platform() engine_ref = [None] @@ -203,9 +203,9 @@ def pub_actor(): this_actor.info("Begin transaction (canceled_ already true)") try: engine.begin_transaction() - assert False, "Expected TransactioncanceledException" - except TransactioncanceledException: - this_actor.info("Publisher caught TransactioncanceledException as expected") + assert False, "Expected TransactionCanceledException" + except TransactionCanceledException: + this_actor.info("Publisher caught TransactionCanceledException as expected") DTL.disconnect() def sub_actor(): @@ -216,9 +216,9 @@ def sub_actor(): this_actor.info("Begin transaction (will block waiting for publisher to start a transaction)") try: engine.begin_transaction() - assert False, "Expected TransactioncanceledException" - except TransactioncanceledException: - this_actor.info("Subscriber caught TransactioncanceledException as expected") + assert False, "Expected TransactionCanceledException" + except TransactionCanceledException: + this_actor.info("Subscriber caught TransactionCanceledException as expected") DTL.disconnect() def canceller_actor(): @@ -233,7 +233,7 @@ def canceller_actor(): # FileEngine: subscriber stuck in begin_transaction() waiting for publisher to complete a transaction. # Publisher opens the stream but sleeps before calling begin_transaction(). -# Canceller fires at t=0.5s. 
Subscriber catches TransactioncanceledException. +# Canceller fires at t=0.5s. Subscriber catches TransactionCanceledException. def run_test_cancel_file_engine_waiting_for_publisher(): e = setup_file_platform() engine_ref = [None] @@ -252,9 +252,9 @@ def pub_actor(): this_actor.info("Begin transaction (canceled_ already true)") try: engine.begin_transaction() - assert False, "Expected TransactioncanceledException" - except TransactioncanceledException: - this_actor.info("Publisher caught TransactioncanceledException as expected") + assert False, "Expected TransactionCanceledException" + except TransactionCanceledException: + this_actor.info("Publisher caught TransactionCanceledException as expected") DTL.disconnect() def sub_actor(): @@ -265,9 +265,9 @@ def sub_actor(): this_actor.info("Begin transaction (will block waiting for publisher to complete a transaction)") try: engine.begin_transaction() - assert False, "Expected TransactioncanceledException" - except TransactioncanceledException: - this_actor.info("Subscriber caught TransactioncanceledException as expected") + assert False, "Expected TransactionCanceledException" + except TransactionCanceledException: + this_actor.info("Subscriber caught TransactionCanceledException as expected") DTL.disconnect() def canceller_actor(): @@ -283,7 +283,7 @@ def canceller_actor(): # Publisher and subscriber are both engaged in a long Mailbox transfer. # Publisher completes T1 end_transaction() (starting slow async comms) then blocks in T2 # begin_transaction() waiting for T1 sends to complete. Subscriber blocks in T1 end_transaction() -# waiting for receives. Canceller fires at t=0.5s, unblocking both with TransactioncanceledException. +# waiting for receives. Canceller fires at t=0.5s, unblocking both with TransactionCanceledException. 
def run_test_cancel_staging_mid_transaction_mailbox(): e = setup_slow_staging_platform() engine_ref = [None] @@ -307,9 +307,9 @@ def pub_actor(): this_actor.info("Begin T2 (will block waiting for T1 sends to complete over slow link)") try: engine.begin_transaction() - assert False, "Expected TransactioncanceledException" - except TransactioncanceledException: - this_actor.info("Publisher caught TransactioncanceledException in T2 begin_transaction() as expected") + assert False, "Expected TransactionCanceledException" + except TransactionCanceledException: + this_actor.info("Publisher caught TransactionCanceledException in T2 begin_transaction() as expected") DTL.disconnect() def sub_actor(): @@ -324,9 +324,9 @@ def sub_actor(): this_actor.info("End T1 (will block waiting for receives over slow link)") try: engine.end_transaction() - assert False, "Expected TransactioncanceledException" - except TransactioncanceledException: - this_actor.info("Subscriber caught TransactioncanceledException in T1 end_transaction() as expected") + assert False, "Expected TransactionCanceledException" + except TransactionCanceledException: + this_actor.info("Subscriber caught TransactionCanceledException in T1 end_transaction() as expected") DTL.disconnect() def canceller_actor(): From 3cd90e71f7780de90ee3c78660fc9fc9f73ebc9c Mon Sep 17 00:00:00 2001 From: Fred Suter Date: Mon, 4 May 2026 22:25:29 -0400 Subject: [PATCH 73/92] restore catching simgrid's CancelException --- src/FileEngine.cpp | 12 +++++++++++- src/StagingEngine.cpp | 5 +++++ 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/src/FileEngine.cpp b/src/FileEngine.cpp index 384b469..1aa0317 100644 --- a/src/FileEngine.cpp +++ b/src/FileEngine.cpp @@ -245,7 +245,17 @@ void FileEngine::end_sub_transaction() file_sub_transaction_[self].push(file->read_async(size)); XBT_DEBUG("Wait for the %d subscribe activities for the transaction", file_sub_transaction_[self].size()); - file_sub_transaction_[self].wait_all(); + try 
{ + file_sub_transaction_[self].wait_all(); + } catch (const simgrid::CancelException&) { + if (!is_canceled()) + throw; + file_sub_transaction_[self].clear(); + transport->close_sub_files(self); + transport->clear_to_read_in_transaction(self); + sub_transaction_in_progress_ = false; + throw TransactionCanceledException(XBT_THROW_POINT); + } if (is_transaction_canceled(current_sub_transaction_id_)) { file_sub_transaction_[self].clear(); transport->close_sub_files(self); diff --git a/src/StagingEngine.cpp b/src/StagingEngine.cpp index 26a1823..acdf9ec 100644 --- a/src/StagingEngine.cpp +++ b/src/StagingEngine.cpp @@ -77,6 +77,11 @@ void StagingEngine::begin_pub_transaction() current_pub_transaction_id_, current_sub_transaction_id_, get_pub_transaction().size()); try { get_pub_transaction().wait_all(); + } catch (const simgrid::CancelException&) { + if (!is_canceled()) + throw; + get_pub_transaction().clear(); + throw TransactionCanceledException(XBT_THROW_POINT); } catch (const simgrid::NetworkFailureException&) { if (!is_canceled()) throw; From d88ee9f81c5ffcb0929c1766e4a495e61829c28a Mon Sep 17 00:00:00 2001 From: Fred Suter Date: Tue, 5 May 2026 11:52:18 -0400 Subject: [PATCH 74/92] cache the var name not the variable --- include/dtlmod/CompressionReductionMethod.hpp | 7 +++++-- src/CompressionReductionMethod.cpp | 2 +- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/include/dtlmod/CompressionReductionMethod.hpp b/include/dtlmod/CompressionReductionMethod.hpp index 34f30e4..622ee7e 100644 --- a/include/dtlmod/CompressionReductionMethod.hpp +++ b/include/dtlmod/CompressionReductionMethod.hpp @@ -25,11 +25,14 @@ class CompressionReductionMethod : public ReductionMethod { }; class ParameterizedCompression { - const Variable* var_; // non-owning: the Variable outlives the parameterization (both owned by Stream) + std::string var_name_; CompressionConfig cfg_; public: - ParameterizedCompression(const Variable& var, CompressionConfig cfg) : var_(&var), 
cfg_(std::move(cfg)) {} + ParameterizedCompression(const Variable& var, CompressionConfig cfg) + : var_name_(var.get_name()), cfg_(std::move(cfg)) + { + } [[nodiscard]] double get_accuracy() const { return cfg_.accuracy; } [[nodiscard]] double get_compression_cost_per_element() const { return cfg_.compression_cost_per_element; } diff --git a/src/CompressionReductionMethod.cpp b/src/CompressionReductionMethod.cpp index 60d8e10..683f410 100644 --- a/src/CompressionReductionMethod.cpp +++ b/src/CompressionReductionMethod.cpp @@ -18,7 +18,7 @@ double CompressionReductionMethod::ParameterizedCompression::get_effective_ratio if (cfg_.ratio_variability <= 0.0) return cfg_.compression_ratio; // Deterministic noise from hash of (variable_name, transaction_id) - size_t seed = std::hash{}(var_->get_name()) ^ (std::hash{}(transaction_id) << 1); + size_t seed = std::hash{}(var_name_) ^ (std::hash{}(transaction_id) << 1); // Map to [1 - variability, 1 + variability] double noise = 1.0 + cfg_.ratio_variability * (2.0 * (seed % 10001) / 10000.0 - 1.0); return std::max(1.0, cfg_.compression_ratio * noise); From 4843d63b8315bae9903a424ca33b60a000ab263c Mon Sep 17 00:00:00 2001 From: Fred Suter Date: Wed, 20 May 2026 21:19:41 -0400 Subject: [PATCH 75/92] fix transaction cancellation with StagingEngine --- src/StagingEngine.cpp | 29 +++++++++++++++++++++++++++-- 1 file changed, 27 insertions(+), 2 deletions(-) diff --git a/src/StagingEngine.cpp b/src/StagingEngine.cpp index acdf9ec..77d8605 100644 --- a/src/StagingEngine.cpp +++ b/src/StagingEngine.cpp @@ -134,7 +134,19 @@ void StagingEngine::pub_close() pub_closing_ = true; XBT_DEBUG("[%s] Wait for the completion of %u publish activities from the previous transaction", get_cname(), get_pub_transaction().size()); - get_pub_transaction().wait_all(); + try { + get_pub_transaction().wait_all(); + } catch (const simgrid::CancelException&) { + if (!is_canceled()) + throw; + for (size_t i = 0; i < get_pub_transaction().size(); i++) + 
get_pub_transaction().at(i)->cancel(); + } catch (const simgrid::NetworkFailureException&) { + if (!is_canceled()) + throw; + for (size_t i = 0; i < get_pub_transaction().size(); i++) + get_pub_transaction().at(i)->cancel(); + } get_pub_transaction().clear(); XBT_DEBUG("[%s] last publish transaction is over", get_cname()); current_pub_transaction_id_++; @@ -188,6 +200,7 @@ void StagingEngine::begin_sub_transaction() pub_transaction_completed_->wait(lock); if (is_transaction_canceled(current_sub_transaction_id_)) { sub_transaction_in_progress_ = false; + num_subscribers_starting_--; throw TransactionCanceledException(XBT_THROW_POINT); } } @@ -239,7 +252,19 @@ void StagingEngine::sub_close() // I'm the first to close sub_closing_ = true; XBT_DEBUG("Wait for the %d subscribe activities for the transaction", get_sub_transaction().size()); - get_sub_transaction().wait_all(); + try { + get_sub_transaction().wait_all(); + } catch (const simgrid::CancelException&) { + if (!is_canceled()) + throw; + for (size_t i = 0; i < get_sub_transaction().size(); i++) + get_sub_transaction().at(i)->cancel(); + } catch (const simgrid::NetworkFailureException&) { + if (!is_canceled()) + throw; + for (size_t i = 0; i < get_sub_transaction().size(); i++) + get_sub_transaction().at(i)->cancel(); + } XBT_DEBUG("All on-flight subscribe activities are completed. 
Proceed with the current transaction."); get_sub_transaction().clear(); } From 80c88d71646f2652a4eebaddd899fb36a09b099d Mon Sep 17 00:00:00 2001 From: Fred Suter Date: Sun, 24 May 2026 19:43:47 -0400 Subject: [PATCH 76/92] update doc --- doc/source/app_API.rst | 30 ++++++++++++++++++++++++++++-- 1 file changed, 28 insertions(+), 2 deletions(-) diff --git a/doc/source/app_API.rst b/doc/source/app_API.rst index 45b3bbd..e3f0b67 100644 --- a/doc/source/app_API.rst +++ b/doc/source/app_API.rst @@ -27,7 +27,26 @@ DTL A |Concept_DTL|_ is created by calling :cpp:func:`DTL::create() ` at the beginning of the :cpp:func:`main()` function of your simulator. This function can take as an optional argument a JSON configuration file that describes the different |Concept_Streams|_ to be created during the simulation each with a **name**, -|Concept_Engine|_ type, and |Concept_Transport|_ method. +|Concept_Engine|_ type, |Concept_Transport|_ method, and optionally a list of reduction methods and a flag to +enable metadata export. A minimal stream entry looks like: + +.. code-block:: json + + { + "streams": [ + { + "name": "my-output", + "engine_type": "File", + "transport_method": "File", + "reduction_methods": ["compression"], + "export_metadata": true + } + ] + } + +The ``"reduction_methods"`` array accepts any combination of ``"decimation"`` and ``"compression"``. Each listed +method is pre-registered on the stream (equivalent to calling :cpp:func:`Stream::define_reduction_method +`) and can then be applied to individual variables. A common in situ processing scenario is that some analyses or visualization are only needed when certain conditions are met. In such cases, a new process is spawned, subscribes to some variables, and analyzes or visualizes data. @@ -274,7 +293,7 @@ Stream factory .. group-tab:: C++ - .. doxygenfunction:: dtlmod::DTL::add_stream(const std::string& name) + .. 
doxygenfunction:: dtlmod::DTL::add_stream(std::string_view name, Engine::Type type, Transport::Method method) .. doxygenfunction:: dtlmod::DTL::get_stream_by_name(const std::string& name) const .. doxygenfunction:: dtlmod::DTL::get_all_streams @@ -313,15 +332,20 @@ Properties .. group-tab:: C++ + .. doxygenfunction:: dtlmod::Stream::get_engine_type() const .. doxygenfunction:: dtlmod::Stream::get_engine_type_str() const + .. doxygenfunction:: dtlmod::Stream::get_transport_method() const .. doxygenfunction:: dtlmod::Stream::get_transport_method_str() const .. doxygenfunction:: dtlmod::Stream::get_access_mode_str() const .. doxygenfunction:: does_export_metadata() const + .. doxygenfunction:: dtlmod::Stream::get_reduction_method(std::string_view name) const .. group-tab:: Python .. autoproperty:: dtlmod.Stream.engine_type + .. autoproperty:: dtlmod.Stream.engine_type_str .. autoproperty:: dtlmod.Stream.transport_method + .. autoproperty:: dtlmod.Stream.transport_method_str .. autoproperty:: dtlmod.Stream.access_mode .. autoproperty:: dtlmod.Stream.metadata_export @@ -394,6 +418,7 @@ Transactions .. doxygenfunction:: dtlmod::Engine::put(std::shared_ptr var, size_t simulated_size_in_bytes) const .. doxygenfunction:: dtlmod::Engine::get(std::shared_ptr var) const .. doxygenfunction:: dtlmod::Engine::end_transaction() + .. doxygenfunction:: dtlmod::Engine::cancel_transaction(unsigned int transaction_id) .. group-tab:: Python @@ -401,6 +426,7 @@ Transactions .. automethod:: dtlmod.Engine.put .. automethod:: dtlmod.Engine.get .. automethod:: dtlmod.Engine.end_transaction + .. automethod:: dtlmod.Engine.cancel_transaction .. 
_API_dtlmod_Variable: From 62af390fd85e483d97b02fa3c9506bf284e93837 Mon Sep 17 00:00:00 2001 From: Fred Suter Date: Sun, 24 May 2026 19:44:14 -0400 Subject: [PATCH 77/92] update Changelog for v0.5 --- ChangeLog | 84 ++++++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 65 insertions(+), 19 deletions(-) diff --git a/ChangeLog b/ChangeLog index cabe239..157b52d 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,45 +1,91 @@ ---------------------------------------------------------------------------- -DTLMod (0.5) not released yet (target: May 2026) +DTLMod (0.5) May 25, 2026 Improvements: + - Transaction cancellation support + - New Engine::cancel_transaction(transaction_id) allows an external actor + to cancel a specific in-flight transaction, unblocking both its + publishers and subscribers + - Publishers and subscribers receive a TransactionCanceledException; + catching it allows them to survive the cancellation and continue to + subsequent transactions + - Works with both File and Staging engines across any number of publishers + and subscribers - Memory efficiency: progressive eviction of metadata transaction entries - Once all subscribers have consumed a transaction, its entries are evicted from the in-memory Metadata::transaction_infos_ map - When metadata export is enabled and publishers and subscribers coexist - (file streaming), evicted entries are progressively flushed to - per-variable temporary files; the final metadata file is assembled at - pub_close() from those files and any remaining in-memory entries, + (file streaming), evicted entries are progressively flushed to + per-variable temporary files; the final metadata file is assembled at + pub_close() from those files and any remaining in-memory entries, preserving the existing format and transaction count - When the stream is opened by subscribers only after all publishers have closed (sequential scenario), memory-only eviction is performed (the metadata file has already been written by 
pub_close()) - Memory footprint of the File engine now grows as O(N_pub) instead of O(N_pub × N_transactions) for long-running concurrent streaming workloads - - + - Improved test coverage + - Comprehensive C++ and Python test suite for transaction cancellation, + covering cancellation while waiting for a subscriber, while waiting for + a publisher, and mid-transaction for both File and Staging engines + - Tests for the 3-parameter form of DTL::add_stream + - Tests for propagation of reduction parameters to subscribers for both + compression and decimation methods + +Bug Fixes: + - SZ compression model: the ratio formula incorrectly used -log10(accuracy) + instead of accuracy^beta, inverting the accuracy-to-ratio relationship; + constants recalibrated to give ratio ≈ 7 at accuracy = 1e-3 and ratio ≈ 2 + at accuracy = 1e-6 + - CompressionReductionMethod did not register the subscriber-side variable + during inquire_variable, preventing Engine::get() from simulating + decompression costs + - DecimationReductionMethod did not propagate its parameterization to + subscribers, preventing them from querying reduced variable sizes after + inquire_variable + - Transaction cancellation in StagingEngine: CancelException and + NetworkFailureException thrown during pub_close and sub_close were not + caught when a cancellation was in progress, leaving in-flight activities + in an inconsistent state + - StagingEngine::begin_sub_transaction did not decrement + num_subscribers_starting_ when a transaction was canceled, causing an + imbalance in subsequent synchronization + - CompressionReductionMethod::ParameterizedCompression stored a raw pointer + to the associated Variable; replaced with the variable name string to + eliminate potential dangling references API Changes: - New Engine method: - Engine::cancel_transaction(unsigned int transaction_id) cancels a - specific transaction performed by this engine. 
This method has to be - called by an external actor not involved in the transaction. It will make - the transaction raise a DTLMod::TransactionCanceledException that must be - caught if you want the publishers and subscribers to survive to the - cancelation of this transaction. + specific transaction performed by this engine. This method must be + called from an external actor not involved in the transaction. It raises + a TransactionCanceledException that publishers and subscribers must catch + to survive the cancellation and continue to subsequent transactions. If + both sides have already moved past the given transaction ID, the call is + a no-op to avoid accidentally canceling a subsequent transaction. - New Stream helper methods: - Stream::get_engine_type() and Stream::get_transport_method() respectively - return the enum value of the engine type and transport method for a + return the enum value of the engine type and transport method for a Stream. - As a consequence, in the Python bindings, the Stream.engine_type and - Stream.transport_method readonly properties that convert the engine type - and transport method to a string have been renamed to - Stream.engine_type_str and Stream.transport_method_str while the existing + Stream.transport_method readonly properties that return the engine type + and transport method as a string have been renamed to + Stream.engine_type_str and Stream.transport_method_str; the existing properties now return the enum values. - - Stream::get_reduction_method(name) to retrieve a reduction method - associated to the stream. - - Behavior modification: - - DTL::add_stream can now take an Engine Type and a Transport Method as - parameters (both defaulting to 'Undefined' if not specified). + - Stream::get_reduction_method(name) retrieves a reduction method + associated to the stream by name, returning std::nullopt if not found. 
+ - New ReductionMethod virtual method: + - ReductionMethod::propagate_for_subscriber(publisher_var, subscriber_var) + copies the publisher-side parameterization to the subscriber variable; + the default implementation is a no-op; both DecimationReductionMethod + and CompressionReductionMethod provide overrides called by + Stream::inquire_variable. + - Behavior modifications: + - DTL::add_stream can now take an Engine::Type and a Transport::Method as + optional parameters (both defaulting to Undefined if not specified). + - Reduction methods can now be declared in the DTL configuration file via + a "reduction_methods" array in the stream definition. ---------------------------------------------------------------------------- From d3ec6b34fe6ee66fc11c53719dfcf345d37273f6 Mon Sep 17 00:00:00 2001 From: Fred Suter Date: Sun, 24 May 2026 19:44:53 -0400 Subject: [PATCH 78/92] augment test_cancel --- test/dtl_cancel.cpp | 112 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 112 insertions(+) diff --git a/test/dtl_cancel.cpp b/test/dtl_cancel.cpp index b25ac62..f1b2c82 100644 --- a/test/dtl_cancel.cpp +++ b/test/dtl_cancel.cpp @@ -292,6 +292,118 @@ TEST_F(DTLCancelTest, CancelFileEngineTransaction_WaitingForPublisher) }); } +// Both publisher and subscriber complete two transactions before the canceller fires. +// cancel_transaction(0) is a no-op because both sides have already advanced past transaction 0 +// (i.e., both current_pub_transaction_id_ and current_sub_transaction_id_ are > 0). +// This exercises the get_current_sub_transaction_impl() path in the no-op guard of +// Engine::cancel_transaction() for the StagingEngine. 
+TEST_F(DTLCancelTest, CancelNoOp_BothSidesPastTransaction_StagingMQ) +{ + DO_TEST_WITH_FORK([this]() { + this->setup_staging_platform(); + auto* pub_host = sg4::Host::by_name("host-0.prod"); + auto* sub_host = sg4::Host::by_name("host-0.cons"); + auto* wdog_host = sg4::Host::by_name("host-1.prod"); + + pub_host->add_actor("PubTestActor", [wdog_host]() { + auto dtl = dtlmod::DTL::connect(); + auto stream = dtl->add_stream("my-output"); + stream->set_engine_type(dtlmod::Engine::Type::Staging); + stream->set_transport_method(dtlmod::Transport::Method::MQ); + auto var = stream->define_variable("var", {100}, {0}, {100}, sizeof(double)); + auto engine = stream->open("my-output", dtlmod::Stream::Mode::Publish); + + wdog_host->add_actor("Canceller", [engine]() { + sg4::this_actor::sleep_for(0.5); + XBT_INFO("Both sides completed T1 and T2 at t=0; cancel_transaction(0) must be a no-op"); + ASSERT_NO_THROW(engine->cancel_transaction(0)); + XBT_INFO("cancel_transaction returned as a no-op as expected"); + }); + + // Both T1 and T2 complete instantly with MQ; both current IDs are 2 before the canceller fires + ASSERT_NO_THROW(engine->begin_transaction()); + ASSERT_NO_THROW(engine->end_transaction()); + ASSERT_NO_THROW(engine->begin_transaction()); + ASSERT_NO_THROW(engine->end_transaction()); + + sg4::this_actor::sleep_for(2.0); // keep engine alive until canceller fires at 0.5s + dtlmod::DTL::disconnect(); + }); + + sub_host->add_actor("SubTestActor", []() { + auto dtl = dtlmod::DTL::connect(); + auto stream = dtl->add_stream("my-output"); + auto engine = stream->open("my-output", dtlmod::Stream::Mode::Subscribe); + (void)stream->inquire_variable("var"); + + ASSERT_NO_THROW(engine->begin_transaction()); + ASSERT_NO_THROW(engine->end_transaction()); + ASSERT_NO_THROW(engine->begin_transaction()); + ASSERT_NO_THROW(engine->end_transaction()); + dtlmod::DTL::disconnect(); + }); + + ASSERT_NO_THROW(sg4::Engine::get_instance()->run()); + }); +} + +// Same scenario for the 
FileEngine: both sides complete two full transactions (including +// actual file I/O) so that both current_pub_transaction_id_ and current_sub_transaction_id_ +// are > 0 when the canceller fires. cancel_transaction(0) must be a no-op and must call +// get_current_sub_transaction_impl() through the no-op guard of Engine::cancel_transaction(). +TEST_F(DTLCancelTest, CancelNoOp_BothSidesPastTransaction_FileEngine) +{ + DO_TEST_WITH_FORK([this]() { + this->setup_file_platform(); + auto* pub_host = sg4::Host::by_name("node-0"); + auto* sub_host = sg4::Host::by_name("node-1"); + auto* wdog_host = sg4::Host::by_name("node-2"); + + pub_host->add_actor("PubTestActor", [wdog_host]() { + auto dtl = dtlmod::DTL::connect(); + auto stream = dtl->add_stream("my-output"); + stream->set_transport_method(dtlmod::Transport::Method::File); + stream->set_engine_type(dtlmod::Engine::Type::File); + auto var = stream->define_variable("var", {100}, {0}, {100}, sizeof(double)); + auto engine = stream->open("cluster:my_fs:/node-0/scratch/my-output", dtlmod::Stream::Mode::Publish); + + wdog_host->add_actor("Canceller", [engine]() { + sg4::this_actor::sleep_for(1.0); + XBT_INFO("Both sides completed T1 and T2; cancel_transaction(0) must be a no-op"); + ASSERT_NO_THROW(engine->cancel_transaction(0)); + XBT_INFO("cancel_transaction returned as a no-op as expected"); + }); + + ASSERT_NO_THROW(engine->begin_transaction()); + engine->put(var); + ASSERT_NO_THROW(engine->end_transaction()); + ASSERT_NO_THROW(engine->begin_transaction()); + engine->put(var); + ASSERT_NO_THROW(engine->end_transaction()); + + sg4::this_actor::sleep_for(2.0); // keep engine alive until canceller fires at 1.0s + dtlmod::DTL::disconnect(); + }); + + sub_host->add_actor("SubTestActor", []() { + auto dtl = dtlmod::DTL::connect(); + auto stream = dtl->add_stream("my-output"); + auto engine = stream->open("cluster:my_fs:/node-0/scratch/my-output", dtlmod::Stream::Mode::Subscribe); + auto var_sub = 
stream->inquire_variable("var"); + + ASSERT_NO_THROW(engine->begin_transaction()); + engine->get(var_sub); + ASSERT_NO_THROW(engine->end_transaction()); + ASSERT_NO_THROW(engine->begin_transaction()); + engine->get(var_sub); + ASSERT_NO_THROW(engine->end_transaction()); + dtlmod::DTL::disconnect(); + }); + + ASSERT_NO_THROW(sg4::Engine::get_instance()->run()); + }); +} + // Publisher and subscriber are both engaged in a long Mailbox transfer (Mailbox simulates bandwidth; MQ does not). // Publisher completes T1 end_transaction() (starting slow async Comms) then blocks in T2 begin_transaction() // waiting for T1 sends to complete. Subscriber blocks in T1 end_transaction() waiting for receives. From bb280b410f82baf792aac9341ba931d24eb5ad7e Mon Sep 17 00:00:00 2001 From: Fred Suter Date: Sun, 24 May 2026 19:45:32 -0400 Subject: [PATCH 79/92] exclude SimGrid's exception catching from coverage --- src/StagingEngine.cpp | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/StagingEngine.cpp b/src/StagingEngine.cpp index 77d8605..2a07a8d 100644 --- a/src/StagingEngine.cpp +++ b/src/StagingEngine.cpp @@ -82,10 +82,10 @@ void StagingEngine::begin_pub_transaction() throw; get_pub_transaction().clear(); throw TransactionCanceledException(XBT_THROW_POINT); - } catch (const simgrid::NetworkFailureException&) { + } catch (const simgrid::NetworkFailureException&) { // LCOV_EXCL_START if (!is_canceled()) throw; - } + } // LCOV_EXCL_STOP XBT_DEBUG("All on-flight publish activities are completed. 
Proceed with the current transaction."); XBT_DEBUG("%u sub activities pending", get_sub_transaction().size()); get_pub_transaction().clear(); @@ -141,12 +141,12 @@ void StagingEngine::pub_close() throw; for (size_t i = 0; i < get_pub_transaction().size(); i++) get_pub_transaction().at(i)->cancel(); - } catch (const simgrid::NetworkFailureException&) { + } catch (const simgrid::NetworkFailureException&) { // LCOV_EXCL_START if (!is_canceled()) throw; for (size_t i = 0; i < get_pub_transaction().size(); i++) get_pub_transaction().at(i)->cancel(); - } + } // LCOV_EXCL_STOP get_pub_transaction().clear(); XBT_DEBUG("[%s] last publish transaction is over", get_cname()); current_pub_transaction_id_++; @@ -222,14 +222,14 @@ void StagingEngine::end_sub_transaction() sub_transaction_in_progress_ = false; num_subscribers_starting_--; throw TransactionCanceledException(XBT_THROW_POINT); - } catch (const simgrid::NetworkFailureException&) { + } catch (const simgrid::NetworkFailureException&) { // LCOV_EXCL_START if (!is_canceled()) throw; get_sub_transaction().clear(); sub_transaction_in_progress_ = false; num_subscribers_starting_--; throw TransactionCanceledException(XBT_THROW_POINT); - } + } // LCOV_EXCL_STOP XBT_DEBUG("All on-flight subscribe activities are completed. Proceed with the current transaction."); get_sub_transaction().clear(); } @@ -259,12 +259,12 @@ void StagingEngine::sub_close() throw; for (size_t i = 0; i < get_sub_transaction().size(); i++) get_sub_transaction().at(i)->cancel(); - } catch (const simgrid::NetworkFailureException&) { + } catch (const simgrid::NetworkFailureException&) { // LCOV_EXCL_START if (!is_canceled()) throw; for (size_t i = 0; i < get_sub_transaction().size(); i++) get_sub_transaction().at(i)->cancel(); - } + } // LCOV_EXCL_STOP XBT_DEBUG("All on-flight subscribe activities are completed. 
Proceed with the current transaction."); get_sub_transaction().clear(); } From 7d4b61cfa261b414b5b9e80ffd6d2082e48f42b9 Mon Sep 17 00:00:00 2001 From: Fred Suter Date: Sun, 24 May 2026 19:47:55 -0400 Subject: [PATCH 80/92] bump version number --- CMakeLists.txt | 4 ++-- pyproject.toml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index c5b8f2d..9b2092b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -7,7 +7,7 @@ if(POLICY CMP0167) cmake_policy(SET CMP0167 NEW) endif() -project(dtlmod VERSION 0.4 DESCRIPTION "Data Transport Layer Module") +project(dtlmod VERSION 0.5 DESCRIPTION "Data Transport Layer Module") include(GNUInstallDirs) find_package(Boost 1.48) @@ -66,7 +66,7 @@ endif() # build the version number set(DTLMOD_VERSION_MAJOR "0") -set(DTLMOD_VERSION_MINOR "4") +set(DTLMOD_VERSION_MINOR "5") set(DTLMOD_VERSION_PATCH "0") set(DTLMOD_VERSION_EXTRA "") diff --git a/pyproject.toml b/pyproject.toml index 418e16e..fa93080 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "dtlmod" -version = "0.4" +version = "0.5" description = "A versatile simulated data transport layer SimGrid module" authors = [ { name = "The SWAT Team", email = "simgrid-community@inria.fr" } From 945098611b6689b4cf4f260598303ba1833a4cdd Mon Sep 17 00:00:00 2001 From: Fred Suter Date: Sun, 24 May 2026 19:51:21 -0400 Subject: [PATCH 81/92] remove duplicate dependencies --- pyproject.toml | 1 - 1 file changed, 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index fa93080..9af284d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -17,7 +17,6 @@ dependencies = [ "simgrid>=4.1", "fsmod>=0.4.0" ] -dependencies = ["pybind11>=2.4"] classifiers = [ "Development Status :: 4 - Beta", From cf830fed7b950c43fa781f03d14464a222fe2363 Mon Sep 17 00:00:00 2001 From: Fred Suter Date: Sun, 24 May 2026 20:11:39 -0400 Subject: [PATCH 82/92] fix sign-compare warnings --- 
src/StagingEngine.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/StagingEngine.cpp b/src/StagingEngine.cpp index 2a07a8d..b52c796 100644 --- a/src/StagingEngine.cpp +++ b/src/StagingEngine.cpp @@ -139,12 +139,12 @@ void StagingEngine::pub_close() } catch (const simgrid::CancelException&) { if (!is_canceled()) throw; - for (size_t i = 0; i < get_pub_transaction().size(); i++) + for (int i = 0; i < get_pub_transaction().size(); i++) get_pub_transaction().at(i)->cancel(); } catch (const simgrid::NetworkFailureException&) { // LCOV_EXCL_START if (!is_canceled()) throw; - for (size_t i = 0; i < get_pub_transaction().size(); i++) + for (int i = 0; i < get_pub_transaction().size(); i++) get_pub_transaction().at(i)->cancel(); } // LCOV_EXCL_STOP get_pub_transaction().clear(); @@ -257,12 +257,12 @@ void StagingEngine::sub_close() } catch (const simgrid::CancelException&) { if (!is_canceled()) throw; - for (size_t i = 0; i < get_sub_transaction().size(); i++) + for (int i = 0; i < get_sub_transaction().size(); i++) get_sub_transaction().at(i)->cancel(); } catch (const simgrid::NetworkFailureException&) { // LCOV_EXCL_START if (!is_canceled()) throw; - for (size_t i = 0; i < get_sub_transaction().size(); i++) + for (int i = 0; i < get_sub_transaction().size(); i++) get_sub_transaction().at(i)->cancel(); } // LCOV_EXCL_STOP XBT_DEBUG("All on-flight subscribe activities are completed. 
Proceed with the current transaction."); From 496897def927da5be1b8f5065dead73288870379 Mon Sep 17 00:00:00 2001 From: Fred Suter Date: Sun, 24 May 2026 20:12:23 -0400 Subject: [PATCH 83/92] test progressive flushing of Metadata --- test/dtl_file_engine.cpp | 105 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 104 insertions(+), 1 deletion(-) diff --git a/test/dtl_file_engine.cpp b/test/dtl_file_engine.cpp index 3cb4bd2..6f5066d 100644 --- a/test/dtl_file_engine.cpp +++ b/test/dtl_file_engine.cpp @@ -462,7 +462,7 @@ TEST_F(DTLFileEngineTest, MetadataExport) ASSERT_EQ(file_contents, expected_contents); std::remove(metadata_file_name.c_str()); - + XBT_INFO("Disconnect the actor"); dtlmod::DTL::disconnect(); }); @@ -471,3 +471,106 @@ TEST_F(DTLFileEngineTest, MetadataExport) ASSERT_NO_THROW(sg4::Engine::get_instance()->run()); }); } + +TEST_F(DTLFileEngineTest, MetadataExportProgressiveFlushing) +{ + DO_TEST_WITH_FORK([this]() { + this->setup_platform(); + std::string metadata_file_name; + + auto* pub_host = sg4::Host::by_name("node-0"); + auto* sub_host = sg4::Host::by_name("node-1"); + + pub_host->add_actor("node-0_pub", [this, &metadata_file_name]() { + auto dtl = dtlmod::DTL::connect(); + auto stream = dtl->add_stream("my-output"); + stream->set_transport_method(dtlmod::Transport::Method::File); + stream->set_engine_type(dtlmod::Engine::Type::File); + XBT_INFO("Set metadata export for that stream"); + stream->set_metadata_export(); + XBT_INFO("Create a 2D-array variable with 10kx10k double"); + auto var = stream->define_variable("var", {10000, 10000}, {0, 0}, {10000, 10000}, sizeof(double)); + auto engine = stream->open("cluster:my_fs:/pfs/my-working-dir/my-output", dtlmod::Stream::Mode::Publish); + XBT_INFO("Stream '%s' (Engine '%s') is ready for publishing", stream->get_cname(), engine->get_cname()); + + sg4::this_actor::sleep_for(0.5); // Let subscriber join the engine + + XBT_INFO("Start Transaction 1"); + ASSERT_NO_THROW(engine->begin_transaction()); + 
ASSERT_NO_THROW(engine->put(var)); + ASSERT_NO_THROW(engine->end_transaction()); + + // Sleep long enough for the subscriber to complete reading tx 1 before we close. + // write_transaction_to_stream is called during the subscriber's end_transaction() because + // metadata_exported_ is still false while we sleep here. + XBT_INFO("Sleep 100s to remain alive while subscriber reads Transaction 1"); + sg4::this_actor::sleep_for(100.0); + + XBT_INFO("Start Transaction 2"); + ASSERT_NO_THROW(engine->begin_transaction()); + ASSERT_NO_THROW(engine->put(var)); + ASSERT_NO_THROW(engine->end_transaction()); + + XBT_INFO("Close the engine — triggers export_metadata_to_file, which reads the .prog file"); + ASSERT_NO_THROW(engine->close()); + metadata_file_name = stream->get_metadata_file_name(); + XBT_INFO("Disconnect the actor"); + dtlmod::DTL::disconnect(); + }); + + sub_host->add_actor("node-1_sub", [this]() { + auto dtl = dtlmod::DTL::connect(); + sg4::this_actor::sleep_for(0.5); + auto stream = dtl->add_stream("my-output"); + auto engine = stream->open("cluster:my_fs:/pfs/my-working-dir/my-output", dtlmod::Stream::Mode::Subscribe); + auto var_sub = stream->inquire_variable("var"); + + // Read tx 1 while publisher is sleeping (metadata_exported_=false): + // end_transaction() triggers flush_and_evict_transaction() → write_transaction_to_stream() + XBT_INFO("Read Transaction 1 while publisher is still alive"); + ASSERT_NO_THROW(engine->begin_transaction()); + ASSERT_NO_THROW(engine->get(var_sub)); + ASSERT_NO_THROW(engine->end_transaction()); + + // Directly verify that write_transaction_to_stream wrote tx 1 to the .prog file + std::string prog_file = stream->get_metadata_file_name() + ".var.prog"; + XBT_INFO("Check that the .prog file '%s' exists", prog_file.c_str()); + ASSERT_TRUE(std::ifstream(prog_file).good()); + + // Read tx 2 (blocks until publisher wakes up and publishes it) + XBT_INFO("Read Transaction 2"); + ASSERT_NO_THROW(engine->begin_transaction()); + 
ASSERT_NO_THROW(engine->get(var_sub)); + ASSERT_NO_THROW(engine->end_transaction()); + + XBT_INFO("Close the engine"); + ASSERT_NO_THROW(engine->close()); + XBT_INFO("Disconnect the actor"); + dtlmod::DTL::disconnect(); + }); + + // Run the simulation + ASSERT_NO_THROW(sg4::Engine::get_instance()->run()); + + // The final metadata file should contain both transactions: + // - tx 1 was written progressively by write_transaction_to_stream to the .prog file + // - tx 2 was held in memory and written by export_to_file at publisher close + XBT_INFO("Check the contents of '%s'", metadata_file_name.c_str()); + std::ifstream file(metadata_file_name); + ASSERT_TRUE(file.is_open()); + std::string file_contents((std::istreambuf_iterator(file)), std::istreambuf_iterator()); + file.close(); + + const std::string expected_contents = "8\tvar\t2*{10000,10000}\n" + " Transaction 1:\n" + " /pfs/my-working-dir/my-output/data.0: [0:10000, 0:10000]\n" + " Transaction 2:\n" + " /pfs/my-working-dir/my-output/data.0: [0:10000, 0:10000]\n"; + + ASSERT_EQ(file_contents, expected_contents); + std::remove(metadata_file_name.c_str()); + + // The .prog file must have been removed by export_metadata_to_file after merging + ASSERT_FALSE(std::ifstream(metadata_file_name + ".var.prog").good()); + }); +} From d18771c9a7eceadf349543cc36b9732fcf359aa6 Mon Sep 17 00:00:00 2001 From: Fred Suter Date: Sun, 24 May 2026 20:49:28 -0400 Subject: [PATCH 84/92] wire up, test and document per-transaction compression ratio variability --- ChangeLog | 18 +++++++ doc/source/Compression.rst | 21 ++++++-- include/dtlmod/CompressionReductionMethod.hpp | 6 ++- include/dtlmod/DecimationReductionMethod.hpp | 6 ++- include/dtlmod/ReductionMethod.hpp | 4 +- src/CompressionReductionMethod.cpp | 10 ++-- src/Engine.cpp | 5 +- src/bindings/python/dtlmod_python.cpp | 4 +- test/dtl_reduction.cpp | 52 +++++++++++++++++++ 9 files changed, 109 insertions(+), 17 deletions(-) diff --git a/ChangeLog b/ChangeLog index 157b52d..5cf6398 
100644 --- a/ChangeLog +++ b/ChangeLog @@ -3,6 +3,24 @@ DTLMod (0.5) May 25, 2026 Improvements: + - Per-transaction compression ratio variability is now active + - CompressionReductionMethod::get_reduced_variable_global_size() and + get_reduced_variable_local_size() now accept an optional transaction_id + parameter and call get_effective_ratio(transaction_id) to compute the + actual ratio, which may be perturbed around the nominal value when + ratio_variability > 0 + - Engine::put() passes the current transaction id to both calls, so the + simulated transfer size varies realistically from transaction to transaction + when ratio_variability is set + - The same transaction_id always produces the same effective ratio + (deterministic hash of variable name and transaction id), making + simulations reproducible + - ReductionMethod base-class virtuals and all overrides (Compression, + Decimation) updated accordingly; Decimation ignores the parameter as its + reduced sizes are transaction-independent + - Python bindings updated: get_reduced_variable_global_size() and + get_reduced_variable_local_size() accept an optional transaction_id + keyword argument (default 0) - Transaction cancellation support - New Engine::cancel_transaction(transaction_id) allows an external actor to cancel a specific in-flight transaction, unblocking both its diff --git a/doc/source/Compression.rst b/doc/source/Compression.rst index f677e04..082e3e9 100644 --- a/doc/source/Compression.rst +++ b/doc/source/Compression.rst @@ -95,9 +95,24 @@ Per-transaction variability --------------------------- In practice, the compression ratio achieved on a given variable varies from one time step to the next as the data -evolves. DTLMod can model this variability through an optional **ratio variability** parameter that introduces a -bounded, deterministic perturbation around the nominal compression ratio at each transaction. 
This enables the -simulation of realistic scenarios in which the effectiveness of compression fluctuates over the course of a run. +evolves. DTLMod models this variability through an optional **ratio variability** parameter that introduces a +bounded, deterministic perturbation around the nominal compression ratio at each transaction. + +For a given transaction :math:`t`, the effective ratio is computed as: + +.. math:: + + r_{\text{eff}}(t) = \max\!\Big(1,\; r \cdot \big(1 + \delta \cdot (2 h(t) - 1)\big)\Big) + +where :math:`r` is the nominal compression ratio, :math:`\delta` is the ratio variability, and :math:`h(t)` is a +deterministic value in :math:`[0, 1]` derived from a hash of the variable name and the transaction id. This ensures +that simulations are fully reproducible: the same transaction always produces the same effective ratio. + +The effective ratio is used by :cpp:func:`Engine::put()` to determine the actual transfer size for each transaction. +The :cpp:func:`ReductionMethod::get_reduced_variable_global_size()` and +:cpp:func:`ReductionMethod::get_reduced_variable_local_size()` query methods also accept an optional +``transaction_id`` argument so that users can inspect the effective size for any transaction before running the +simulation. 
Re-parameterization ------------------- diff --git a/include/dtlmod/CompressionReductionMethod.hpp b/include/dtlmod/CompressionReductionMethod.hpp index 622ee7e..f2ab675 100644 --- a/include/dtlmod/CompressionReductionMethod.hpp +++ b/include/dtlmod/CompressionReductionMethod.hpp @@ -66,8 +66,10 @@ class CompressionReductionMethod : public ReductionMethod { void reduce_variable(const Variable& /* var*/) override { /* Variable metadata are not modfied when using compression */ } - [[nodiscard]] size_t get_reduced_variable_global_size(const Variable& var) const override; - [[nodiscard]] size_t get_reduced_variable_local_size(const Variable& var) const override; + [[nodiscard]] size_t get_reduced_variable_global_size(const Variable& var, + unsigned int transaction_id = 0) const override; + [[nodiscard]] size_t get_reduced_variable_local_size(const Variable& var, + unsigned int transaction_id = 0) const override; [[nodiscard]] const std::vector& get_reduced_variable_shape(const Variable& var) const override { diff --git a/include/dtlmod/DecimationReductionMethod.hpp b/include/dtlmod/DecimationReductionMethod.hpp index f6b3cdb..afe27eb 100644 --- a/include/dtlmod/DecimationReductionMethod.hpp +++ b/include/dtlmod/DecimationReductionMethod.hpp @@ -70,12 +70,14 @@ class DecimationReductionMethod : public ReductionMethod { void reduce_variable(const Variable& var) override; - [[nodiscard]] size_t get_reduced_variable_global_size(const Variable& var) const override + [[nodiscard]] size_t get_reduced_variable_global_size(const Variable& var, + unsigned int /*transaction_id*/ = 0) const override { return per_variable_parameterizations_.at(&var)->get_global_reduced_size(); } - [[nodiscard]] size_t get_reduced_variable_local_size(const Variable& var) const override + [[nodiscard]] size_t get_reduced_variable_local_size(const Variable& var, + unsigned int /*transaction_id*/ = 0) const override { return per_variable_parameterizations_.at(&var)->get_local_reduced_size(); } diff 
--git a/include/dtlmod/ReductionMethod.hpp b/include/dtlmod/ReductionMethod.hpp index 36c0542..43524d0 100644 --- a/include/dtlmod/ReductionMethod.hpp +++ b/include/dtlmod/ReductionMethod.hpp @@ -31,8 +31,8 @@ class ReductionMethod { virtual void parameterize_for_variable(const Variable& var, const std::map>& parameters) = 0; virtual void reduce_variable(const Variable& var) = 0; - virtual size_t get_reduced_variable_global_size(const Variable& var) const = 0; - virtual size_t get_reduced_variable_local_size(const Variable& var) const = 0; + virtual size_t get_reduced_variable_global_size(const Variable& var, unsigned int transaction_id = 0) const = 0; + virtual size_t get_reduced_variable_local_size(const Variable& var, unsigned int transaction_id = 0) const = 0; virtual const std::vector& get_reduced_variable_shape(const Variable& var) const = 0; virtual const std::pair, std::vector>& get_reduced_start_and_count_for(const Variable& var, simgrid::s4u::ActorPtr publisher) const = 0; diff --git a/src/CompressionReductionMethod.cpp b/src/CompressionReductionMethod.cpp index 683f410..354fc63 100644 --- a/src/CompressionReductionMethod.cpp +++ b/src/CompressionReductionMethod.cpp @@ -24,15 +24,17 @@ double CompressionReductionMethod::ParameterizedCompression::get_effective_ratio return std::max(1.0, cfg_.compression_ratio * noise); } -size_t CompressionReductionMethod::get_reduced_variable_global_size(const Variable& var) const +size_t CompressionReductionMethod::get_reduced_variable_global_size(const Variable& var, + unsigned int transaction_id) const { - auto ratio = per_variable_parameterizations_.at(&var)->get_compression_ratio(); + auto ratio = per_variable_parameterizations_.at(&var)->get_effective_ratio(transaction_id); return static_cast(std::ceil(static_cast(var.get_global_size()) / ratio)); } -size_t CompressionReductionMethod::get_reduced_variable_local_size(const Variable& var) const +size_t CompressionReductionMethod::get_reduced_variable_local_size(const 
Variable& var, + unsigned int transaction_id) const { - auto ratio = per_variable_parameterizations_.at(&var)->get_compression_ratio(); + auto ratio = per_variable_parameterizations_.at(&var)->get_effective_ratio(transaction_id); return static_cast(std::ceil(static_cast(var.get_local_size()) / ratio)); } diff --git a/src/Engine.cpp b/src/Engine.cpp index 00d1019..c2265f2 100644 --- a/src/Engine.cpp +++ b/src/Engine.cpp @@ -39,8 +39,9 @@ void Engine::put(const std::shared_ptr& var) const XBT_DEBUG("Variable %s has been reduced!", var->get_cname()); // Now put the reduced version of the variable into the DTL, i.e., using its reduced local size. XBT_DEBUG("Put this reduced version of %s (initial size = %zu, reduced size = %zu)", var->get_cname(), - var->get_local_size(), var->get_reduction_method()->get_reduced_variable_local_size(*var)); - transport_->put(var, var->get_reduction_method()->get_reduced_variable_local_size(*var)); + var->get_local_size(), + var->get_reduction_method()->get_reduced_variable_local_size(*var, get_current_transaction())); + transport_->put(var, var->get_reduction_method()->get_reduced_variable_local_size(*var, get_current_transaction())); } else transport_->put(var, var->get_local_size()); } diff --git a/src/bindings/python/dtlmod_python.cpp b/src/bindings/python/dtlmod_python.cpp index 9c0145f..e6ddf25 100644 --- a/src/bindings/python/dtlmod_python.cpp +++ b/src/bindings/python/dtlmod_python.cpp @@ -254,9 +254,9 @@ PYBIND11_MODULE(dtlmod, m) "A reduction method applied to Variables in a Stream") .def_property_readonly("name", &ReductionMethod::get_name, "The name of the ReductionMethod (read-only)") .def("get_reduced_variable_global_size", &ReductionMethod::get_reduced_variable_global_size, py::arg("var"), - "Get the reduced global size of a Variable") + py::arg("transaction_id") = 0, "Get the reduced global size of a Variable") .def("get_reduced_variable_local_size", &ReductionMethod::get_reduced_variable_local_size, py::arg("var"), - 
"Get the reduced local size of a Variable") + py::arg("transaction_id") = 0, "Get the reduced local size of a Variable") .def("get_reduced_variable_shape", &ReductionMethod::get_reduced_variable_shape, py::arg("var"), "Get the reduced shape of a Variable") .def("get_flop_amount_to_reduce_variable", &ReductionMethod::get_flop_amount_to_reduce_variable, py::arg("var"), diff --git a/test/dtl_reduction.cpp b/test/dtl_reduction.cpp index 1241c14..da04013 100644 --- a/test/dtl_reduction.cpp +++ b/test/dtl_reduction.cpp @@ -451,6 +451,58 @@ TEST_F(DTLReductionTest, CompressionWithDerivedRatio) }); } +TEST_F(DTLReductionTest, CompressionWithVariableRatio) +{ + DO_TEST_WITH_FORK([this]() { + this->setup_platform(); + host_->add_actor("Publisher", [this]() { + auto dtl = dtlmod::DTL::connect(); + auto stream = dtl->add_stream("my-output"); + stream->set_transport_method(dtlmod::Transport::Method::File); + stream->set_engine_type(dtlmod::Engine::Type::File); + auto var = stream->define_variable("var2D", {1000, 1000}, {0, 0}, {1000, 1000}, sizeof(double)); + auto compressor = stream->define_reduction_method("compression"); + auto engine = stream->open("zone:my_fs:/host/scratch/my-working-dir/my-output", dtlmod::Stream::Mode::Publish); + sg4::this_actor::sleep_for(1); + + XBT_INFO("With ratio_variability=0, effective size is the same for all transaction ids"); + ASSERT_NO_THROW(var->set_reduction_operation(compressor, {{"compression_ratio", "10"}})); + size_t base_size = static_cast(std::ceil(sizeof(double) * 1000.0 * 1000.0 / 10.0)); + ASSERT_EQ(compressor->get_reduced_variable_global_size(*var, 1), base_size); + ASSERT_EQ(compressor->get_reduced_variable_global_size(*var, 2), base_size); + ASSERT_EQ(compressor->get_reduced_variable_global_size(*var, 3), base_size); + + XBT_INFO("With ratio_variability=0.3, different transaction ids produce different effective sizes"); + ASSERT_NO_THROW(var->set_reduction_operation(compressor, {{"ratio_variability", "0.3"}})); + size_t 
size_tx1 = compressor->get_reduced_variable_local_size(*var, 1); + size_t size_tx2 = compressor->get_reduced_variable_local_size(*var, 2); + size_t size_tx3 = compressor->get_reduced_variable_local_size(*var, 3); + XBT_INFO("Reduced sizes: tx1=%zu tx2=%zu tx3=%zu (base=%zu)", size_tx1, size_tx2, size_tx3, base_size); + + // All effective sizes must be within the variability bounds (ratio in [10*0.7, 10*1.3]) + size_t min_size = static_cast(std::ceil(sizeof(double) * 1000.0 * 1000.0 / (10.0 * 1.3))); + size_t max_size = static_cast(std::ceil(sizeof(double) * 1000.0 * 1000.0 / (10.0 * 0.7))); + ASSERT_GE(size_tx1, min_size); + ASSERT_LE(size_tx1, max_size); + ASSERT_GE(size_tx2, min_size); + ASSERT_LE(size_tx2, max_size); + // At least two of the three transaction ids must differ (hash collision probability is negligible) + ASSERT_TRUE(size_tx1 != size_tx2 || size_tx1 != size_tx3 || size_tx2 != size_tx3); + // The hash is deterministic: the same transaction_id always gives the same size + ASSERT_EQ(compressor->get_reduced_variable_local_size(*var, 1), size_tx1); + + engine->begin_transaction(); + ASSERT_NO_THROW(engine->put(var)); + engine->end_transaction(); + engine->close(); + + dtlmod::DTL::disconnect(); + }); + + ASSERT_NO_THROW(sg4::Engine::get_instance()->run()); + }); +} + TEST_F(DTLReductionTest, DecimationStagingEngine) { DO_TEST_WITH_FORK([this]() { From 0a34f25de36ea7b8c62e90e3fd443affd404ee62 Mon Sep 17 00:00:00 2001 From: Fred Suter Date: Mon, 25 May 2026 00:54:31 -0400 Subject: [PATCH 85/92] update --- .github/workflows/build.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 7ae80cf..073308f 100755 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -41,14 +41,14 @@ jobs: id: cache-simgrid with: path: /opt/simgrid - key: ${{ runner.os }}-simgrid-python-v2 + key: ${{ runner.os }}-simgrid-python-v3 - name: Install SimGrid if: 
steps.cache-simgrid.outputs.cache-hit != 'true' run: | git clone --depth 1 https://framagit.org/simgrid/simgrid.git cd simgrid - cmake -B build -Denable_smpi=OFF -Denable_model-checking=OFF -Denable_python=ON -DCMAKE_INSTALL_PREFIX=/opt/simgrid + cmake -B build -Denable_smpi=OFF -Denable_model-checking=OFF -Denable_smemory=OFF -Denable_python=ON -DCMAKE_INSTALL_PREFIX=/opt/simgrid cmake --build build -j$(nproc) sudo cmake --install build From 2ac0ab0de0a4a2f922c0e9cff540a3cbee6337cf Mon Sep 17 00:00:00 2001 From: Fred Suter Date: Mon, 25 May 2026 00:57:23 -0400 Subject: [PATCH 86/92] more tests of cancelation --- test/dtl_cancel.cpp | 274 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 274 insertions(+) diff --git a/test/dtl_cancel.cpp b/test/dtl_cancel.cpp index f1b2c82..cc20e91 100644 --- a/test/dtl_cancel.cpp +++ b/test/dtl_cancel.cpp @@ -63,6 +63,42 @@ class DTLCancelTest : public ::testing::Test { dtlmod::DTL::create(); } + void setup_slow_file_platform() + { + sg4::NetZone* cluster = sg4::Engine::get_instance()->get_netzone_root()->add_netzone_star("cluster"); + auto pfs_server = cluster->add_host("pfs_server", "1Gf"); + std::vector pfs_disks; + for (int i = 0; i < 4; i++) + pfs_disks.push_back(pfs_server->add_disk("pfs_disk" + std::to_string(i), "1MBps", "1MBps")); + auto remote_storage = sgfs::JBODStorage::create("pfs_storage", pfs_disks); + remote_storage->set_raid_level(sgfs::JBODStorage::RAID::RAID5); + + std::vector> local_storages; + for (int i = 0; i < 4; i++) { + std::string hostname = "node-" + std::to_string(i); + auto* host = cluster->add_host(hostname, "1Gf"); + auto* disk = host->add_disk(hostname + "_disk", "1MBps", "1MBps"); + local_storages.push_back(sgfs::OneDiskStorage::create(hostname + "_local_storage", disk)); + std::string linkname = "link_" + std::to_string(i); + auto* link_up = cluster->add_link(linkname + "_UP", "1Gbps"); + auto* link_down = cluster->add_link(linkname + "_DOWN", "1Gbps"); + auto* loopback = + 
cluster->add_link(hostname + "_loopback", "10Gbps")->set_sharing_policy(sg4::Link::SharingPolicy::FATPIPE); + cluster->add_route(host, nullptr, {sg4::LinkInRoute(link_up)}, false); + cluster->add_route(nullptr, host, {sg4::LinkInRoute(link_down)}, false); + cluster->add_route(host, host, {loopback}); + } + cluster->seal(); + + auto my_fs = sgfs::FileSystem::create("my_fs"); + sgfs::FileSystem::register_file_system(cluster, my_fs); + my_fs->mount_partition("/pfs/", remote_storage, "500TB"); + for (int i = 0; i < 4; i++) + my_fs->mount_partition("/node-" + std::to_string(i) + "/scratch/", local_storages.at(i), "1TB"); + + dtlmod::DTL::create(); + } + void setup_file_platform() { sg4::NetZone* cluster = sg4::Engine::get_instance()->get_netzone_root()->add_netzone_star("cluster"); @@ -460,3 +496,241 @@ TEST_F(DTLCancelTest, CancelStagingTransaction_MidTransaction_Mailbox) ASSERT_NO_THROW(sg4::Engine::get_instance()->run()); }); } + +// Publisher begins T1 and then sleeps (simulating slow computation before end_transaction). +// Subscriber connects and immediately calls begin_transaction(), which blocks waiting for the publisher +// to signal pub_transaction_completed (StagingEngine lines 202-204). +// Canceller fires during that wait, unblocking the subscriber with TransactionCanceledException. 
+TEST_F(DTLCancelTest, CancelStagingTransaction_SubWaitingForPubToEndTx_MQ) +{ + DO_TEST_WITH_FORK([this]() { + this->setup_staging_platform(); + auto* pub_host = sg4::Host::by_name("host-0.prod"); + auto* sub_host = sg4::Host::by_name("host-0.cons"); + auto* wdog_host = sg4::Host::by_name("host-1.prod"); + + pub_host->add_actor("PubTestActor", [wdog_host]() { + auto dtl = dtlmod::DTL::connect(); + auto stream = dtl->add_stream("my-output"); + stream->set_engine_type(dtlmod::Engine::Type::Staging); + stream->set_transport_method(dtlmod::Transport::Method::MQ); + [[maybe_unused]] auto var = stream->define_variable("var", {100}, {0}, {100}, sizeof(double)); + auto engine = stream->open("my-output", dtlmod::Stream::Mode::Publish); + + wdog_host->add_actor("Canceller", [engine]() { + sg4::this_actor::sleep_for(0.5); + XBT_INFO("Cancelling the transaction"); + engine->cancel_transaction(engine->get_current_transaction()); + }); + + ASSERT_NO_THROW(engine->begin_transaction()); + sg4::this_actor::sleep_for(2.0); // hold T1 open long enough for sub to block on it + ASSERT_NO_THROW(engine->end_transaction()); + dtlmod::DTL::disconnect(); + }); + + sub_host->add_actor("SubTestActor", []() { + auto dtl = dtlmod::DTL::connect(); + auto stream = dtl->add_stream("my-output"); + auto engine = stream->open("my-output", dtlmod::Stream::Mode::Subscribe); + [[maybe_unused]] auto var_sub = stream->inquire_variable("var"); + + XBT_INFO("Subscriber calling begin_transaction() — will block waiting for pub to end T1"); + ASSERT_THROW(engine->begin_transaction(), dtlmod::TransactionCanceledException); + XBT_INFO("Subscriber caught TransactionCanceledException as expected"); + dtlmod::DTL::disconnect(); + }); + + ASSERT_NO_THROW(sg4::Engine::get_instance()->run()); + }); +} + +// Subscriber sleeps past an already-fired cancellation, then calls begin_transaction(). 
+// FileEngine line 195: the early-exit check fires immediately on begin_sub_transaction() +// because canceled_transaction_id_ is already set before the subscriber enters. +TEST_F(DTLCancelTest, CancelFileEngineTransaction_SubAlreadyCanceled) +{ + DO_TEST_WITH_FORK([this]() { + this->setup_file_platform(); + auto* pub_host = sg4::Host::by_name("node-0"); + auto* sub_host = sg4::Host::by_name("node-1"); + auto* wdog_host = sg4::Host::by_name("node-2"); + + pub_host->add_actor("PubTestActor", [wdog_host]() { + auto dtl = dtlmod::DTL::connect(); + auto stream = dtl->add_stream("my-output"); + stream->set_transport_method(dtlmod::Transport::Method::File); + stream->set_engine_type(dtlmod::Engine::Type::File); + [[maybe_unused]] auto var = stream->define_variable("var", {100}, {0}, {100}, sizeof(double)); + auto engine = stream->open("cluster:my_fs:/node-0/scratch/my-output", dtlmod::Stream::Mode::Publish); + + wdog_host->add_actor("Canceller", [engine]() { + sg4::this_actor::sleep_for(0.5); + XBT_INFO("Cancelling the transaction"); + engine->cancel_transaction(engine->get_current_transaction()); + }); + + sg4::this_actor::sleep_for(3.0); + dtlmod::DTL::disconnect(); + }); + + sub_host->add_actor("SubTestActor", []() { + auto dtl = dtlmod::DTL::connect(); + auto stream = dtl->add_stream("my-output"); + auto engine = stream->open("cluster:my_fs:/node-0/scratch/my-output", dtlmod::Stream::Mode::Subscribe); + [[maybe_unused]] auto var_sub = stream->inquire_variable("var"); + + sg4::this_actor::sleep_for(2.0); // sleep past the cancellation point + XBT_INFO("Subscriber calling begin_transaction() after cancellation already fired"); + ASSERT_THROW(engine->begin_transaction(), dtlmod::TransactionCanceledException); + XBT_INFO("Subscriber caught TransactionCanceledException as expected"); + dtlmod::DTL::disconnect(); + }); + + ASSERT_NO_THROW(sg4::Engine::get_instance()->run()); + }); +} + +// Publisher does T1 (put on slow disk), then immediately starts T2 
begin_transaction(), +// which blocks waiting for T1 write activities to complete (FileEngine line 123). +// Canceller fires at 1s while writes are still in flight, unblocking publisher with +// TransactionCanceledException on T2 begin_transaction(). +TEST_F(DTLCancelTest, CancelFileEngineTransaction_PubMidWrite) +{ + DO_TEST_WITH_FORK([this]() { + this->setup_slow_file_platform(); + auto* pub_host = sg4::Host::by_name("node-0"); + auto* wdog_host = sg4::Host::by_name("node-1"); + + pub_host->add_actor("PubTestActor", [wdog_host]() { + auto dtl = dtlmod::DTL::connect(); + auto stream = dtl->add_stream("my-output"); + stream->set_transport_method(dtlmod::Transport::Method::File); + stream->set_engine_type(dtlmod::Engine::Type::File); + auto var = stream->define_variable("var", {1000, 1000}, {0, 0}, {1000, 1000}, sizeof(double)); + auto engine = stream->open("cluster:my_fs:/node-0/scratch/my-output", dtlmod::Stream::Mode::Publish); + + wdog_host->add_actor("Canceller", [engine]() { + sg4::this_actor::sleep_for(1.0); + XBT_INFO("Cancelling the transaction"); + engine->cancel_transaction(engine->get_current_transaction()); + }); + + // T1: begin + put (starts slow async writes), then end_transaction (returns immediately) + ASSERT_NO_THROW(engine->begin_transaction()); + ASSERT_NO_THROW(engine->put(var)); + ASSERT_NO_THROW(engine->end_transaction()); + + // T2: blocks waiting for T1 writes to finish on slow disk — canceled while waiting + XBT_INFO("Begin T2 (will block waiting for T1 writes to complete on slow disk)"); + ASSERT_THROW(engine->begin_transaction(), dtlmod::TransactionCanceledException); + XBT_INFO("Publisher caught TransactionCanceledException in T2 begin_transaction() as expected"); + dtlmod::DTL::disconnect(); + }); + + ASSERT_NO_THROW(sg4::Engine::get_instance()->run()); + }); +} + +// Publisher does T1 on slow disk; subscriber's end_sub_transaction() blocks waiting for the +// publisher's writes to complete (FileEngine lines 234-237: 
pub_activities_completed CV wait). +// Canceller fires at 1s while writes are still in flight, unblocking subscriber with +// TransactionCanceledException. +TEST_F(DTLCancelTest, CancelFileEngineTransaction_SubWaitingForPubWrites) +{ + DO_TEST_WITH_FORK([this]() { + this->setup_slow_file_platform(); + auto* pub_host = sg4::Host::by_name("node-0"); + auto* sub_host = sg4::Host::by_name("node-1"); + auto* wdog_host = sg4::Host::by_name("node-2"); + + pub_host->add_actor("PubTestActor", [wdog_host]() { + auto dtl = dtlmod::DTL::connect(); + auto stream = dtl->add_stream("my-output"); + stream->set_transport_method(dtlmod::Transport::Method::File); + stream->set_engine_type(dtlmod::Engine::Type::File); + auto var = stream->define_variable("var", {1000, 1000}, {0, 0}, {1000, 1000}, sizeof(double)); + auto engine = stream->open("cluster:my_fs:/node-0/scratch/my-output", dtlmod::Stream::Mode::Publish); + + wdog_host->add_actor("Canceller", [engine]() { + sg4::this_actor::sleep_for(1.0); + XBT_INFO("Cancelling the transaction"); + engine->cancel_transaction(engine->get_current_transaction()); + }); + + ASSERT_NO_THROW(engine->begin_transaction()); + ASSERT_NO_THROW(engine->put(var)); + ASSERT_NO_THROW(engine->end_transaction()); + sg4::this_actor::sleep_for(5.0); + dtlmod::DTL::disconnect(); + }); + + sub_host->add_actor("SubTestActor", []() { + auto dtl = dtlmod::DTL::connect(); + auto stream = dtl->add_stream("my-output"); + auto engine = stream->open("cluster:my_fs:/node-0/scratch/my-output", dtlmod::Stream::Mode::Subscribe); + auto var_sub = stream->inquire_variable("var"); + + // T1: begin and get succeed; end_transaction blocks waiting for pub writes — canceled there + ASSERT_NO_THROW(engine->begin_transaction()); + ASSERT_NO_THROW(engine->get(var_sub)); + XBT_INFO("End T1 (will block waiting for pub writes to complete on slow disk)"); + ASSERT_THROW(engine->end_transaction(), dtlmod::TransactionCanceledException); + XBT_INFO("Subscriber caught 
TransactionCanceledException in T1 end_transaction() as expected"); + dtlmod::DTL::disconnect(); + }); + + ASSERT_NO_THROW(sg4::Engine::get_instance()->run()); + }); +} + +// Publisher writes 8MB to slow disk (~2.67s on 3MBps RAID5). Writes complete before the cancel at 4s. +// Subscriber starts reading after writes complete; canceller fires at 4s during the slow reads +// (FileEngine lines 31, 250-258). Subscriber's end_transaction() catches TransactionCanceledException. +TEST_F(DTLCancelTest, CancelFileEngineTransaction_SubMidRead) +{ + DO_TEST_WITH_FORK([this]() { + this->setup_slow_file_platform(); + auto* pub_host = sg4::Host::by_name("node-0"); + auto* sub_host = sg4::Host::by_name("node-1"); + auto* wdog_host = sg4::Host::by_name("node-2"); + + pub_host->add_actor("PubTestActor", [wdog_host]() { + auto dtl = dtlmod::DTL::connect(); + auto stream = dtl->add_stream("my-output"); + stream->set_transport_method(dtlmod::Transport::Method::File); + stream->set_engine_type(dtlmod::Engine::Type::File); + auto var = stream->define_variable("var", {1000, 1000}, {0, 0}, {1000, 1000}, sizeof(double)); + auto engine = stream->open("cluster:my_fs:/node-0/scratch/my-output", dtlmod::Stream::Mode::Publish); + + wdog_host->add_actor("Canceller", [engine]() { + sg4::this_actor::sleep_for(4.0); // after writes finish (~2.67s) but during reads + XBT_INFO("Cancelling the transaction"); + engine->cancel_transaction(engine->get_current_transaction()); + }); + + ASSERT_NO_THROW(engine->begin_transaction()); + ASSERT_NO_THROW(engine->put(var)); + ASSERT_NO_THROW(engine->end_transaction()); + sg4::this_actor::sleep_for(10.0); + dtlmod::DTL::disconnect(); + }); + + sub_host->add_actor("SubTestActor", []() { + auto dtl = dtlmod::DTL::connect(); + auto stream = dtl->add_stream("my-output"); + auto engine = stream->open("cluster:my_fs:/node-0/scratch/my-output", dtlmod::Stream::Mode::Subscribe); + auto var_sub = stream->inquire_variable("var"); + + // T1: begin/get/end — end_transaction 
blocks during slow reads, canceled at 4s + ASSERT_NO_THROW(engine->begin_transaction()); + ASSERT_NO_THROW(engine->get(var_sub)); + XBT_INFO("End T1 (will block waiting for reads to complete on slow disk)"); + ASSERT_THROW(engine->end_transaction(), dtlmod::TransactionCanceledException); + XBT_INFO("Subscriber caught TransactionCanceledException in T1 end_transaction() as expected"); + dtlmod::DTL::disconnect(); + }); + + ASSERT_NO_THROW(sg4::Engine::get_instance()->run()); + }); +} From bc46af4a4506ae34e8d5dc51599112c9d864f124 Mon Sep 17 00:00:00 2001 From: Fred Suter Date: Mon, 25 May 2026 01:33:28 -0400 Subject: [PATCH 87/92] force rebuild of FSMod --- .github/workflows/build.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 073308f..219934f 100755 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -57,7 +57,7 @@ jobs: id: cache-fsmod with: path: /opt/fsmod - key: ${{ runner.os }}-fsmod-python-v2 + key: ${{ runner.os }}-fsmod-python-v3 - name: Install FSMod if: steps.cache-fsmod.outputs.cache-hit != 'true' From 5b6f7cfd99bb6a2b5de03035d8507e34d2b12991 Mon Sep 17 00:00:00 2001 From: Fred Suter Date: Mon, 25 May 2026 11:39:44 -0400 Subject: [PATCH 88/92] test one more function --- test/dtl_reduction.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/test/dtl_reduction.cpp b/test/dtl_reduction.cpp index da04013..fccce47 100644 --- a/test/dtl_reduction.cpp +++ b/test/dtl_reduction.cpp @@ -121,6 +121,7 @@ TEST_F(DTLReductionTest, SimpleDecimationFileEngine) sg4::this_actor::sleep_until(6); XBT_INFO("Assign the decimation method to 'var3D'"); ASSERT_NO_THROW(var->set_reduction_operation(decimator, {{"stride", "1,2,4"}})); + ASSERT_EQ(decimator->get_reduced_variable_global_size(*var), 640 * 320 * 160 * sizeof(double)); XBT_INFO("Check that the variable is marked as 'reduced'"); ASSERT_TRUE(var->is_reduced()); XBT_INFO("Start a Transaction"); From 
39e2507b682d3d33e1f5f916ee76246a6637eeae Mon Sep 17 00:00:00 2001 From: Fred Suter Date: Mon, 25 May 2026 12:36:59 -0400 Subject: [PATCH 89/92] bump required version of FSMod --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 9b2092b..c9abc79 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -12,7 +12,7 @@ project(dtlmod VERSION 0.5 DESCRIPTION "Data Transport Layer Module") include(GNUInstallDirs) find_package(Boost 1.48) find_package(SimGrid 4.1 REQUIRED) -find_package(FSMod 0.4 REQUIRED) +find_package(FSMod 0.4.1 REQUIRED) find_package(nlohmann_json REQUIRED) # Note: Global include_directories removed in favor of target-specific includes below From a7c0ebd8b3de46563c445014e9b7dae51112ba27 Mon Sep 17 00:00:00 2001 From: Fred Suter Date: Mon, 1 Jun 2026 16:57:21 -0400 Subject: [PATCH 90/92] move to the factory agnostic bindings --- src/bindings/python/dtlmod_python.cpp | 31 ++++++++++++++------------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/src/bindings/python/dtlmod_python.cpp b/src/bindings/python/dtlmod_python.cpp index e6ddf25..3db76ad 100644 --- a/src/bindings/python/dtlmod_python.cpp +++ b/src/bindings/python/dtlmod_python.cpp @@ -26,6 +26,7 @@ #include #include +#include #include namespace py = pybind11; @@ -94,23 +95,23 @@ PYBIND11_MODULE(dtlmod, m) py::class_> engine( m, "Engine", "An Engine defines how data is transferred between the applications and the DTL"); engine.def_property_readonly("name", &Engine::get_name, "The name of the Engine (read-only)") - .def("begin_transaction", &Engine::begin_transaction, py::call_guard(), + .def("begin_transaction", &Engine::begin_transaction, py::call_guard(), "Begin a transaction on this Engine") .def("put", py::overload_cast&>(&Engine::put, py::const_), py::arg("var"), - py::call_guard(), "Put a Variable in the DTL using this Engine") + py::call_guard(), "Put a Variable in the DTL using this Engine") .def("put", 
py::overload_cast&, size_t>(&Engine::put, py::const_), py::arg("var"), - py::arg("simulated_size_in_bytes"), py::call_guard(), + py::arg("simulated_size_in_bytes"), py::call_guard(), "Put a Variable in the DTL using this Engine") - .def("get", &Engine::get, py::arg("var"), py::call_guard(), + .def("get", &Engine::get, py::arg("var"), py::call_guard(), "Get a Variable from the DTL using this Engine") - .def("end_transaction", &Engine::end_transaction, py::call_guard(), + .def("end_transaction", &Engine::end_transaction, py::call_guard(), "End a transaction on this Engine") .def_property_readonly("current_transaction", &Engine::get_current_transaction, "The id of the current transaction on this Engine (read-only)") - .def("cancel_transaction", &Engine::cancel_transaction, py::call_guard(), + .def("cancel_transaction", &Engine::cancel_transaction, py::call_guard(), py::arg("transaction_id"), "Cancel all in-flight activities of a specific transaction (must be called from an external actor)") - .def("close", &Engine::close, py::call_guard(), "Close this Engine"); + .def("close", &Engine::close, py::call_guard(), "Close this Engine"); py::enum_(engine, "Type", "The type of Engine") .value("Undefined", Engine::Type::Undefined) @@ -127,14 +128,14 @@ PYBIND11_MODULE(dtlmod, m) /* Class DTL */ py::class_>(m, "DTL", "Data Transport Layer") - .def_static("create", py::overload_cast(&DTL::create), py::call_guard(), - py::arg("filename") = "", "Create the DTL (no return)") - .def_static("connect", &DTL::connect, py::call_guard(), "Connect an Actor to the DTL") - .def_static("disconnect", &DTL::disconnect, py::call_guard(), + .def_static("create", py::overload_cast(&DTL::create), + py::call_guard(), py::arg("filename") = "", "Create the DTL (no return)") + .def_static("connect", &DTL::connect, py::call_guard(), "Connect an Actor to the DTL") + .def_static("disconnect", &DTL::disconnect, py::call_guard(), "Disconnect an Actor from the DTL") 
.def_property_readonly("has_active_connections", &DTL::has_active_connections, "Check whether some simulated actors are currently connected to the DTL (read-only)") - .def("add_stream", &DTL::add_stream, py::call_guard(), py::arg("name"), + .def("add_stream", &DTL::add_stream, py::call_guard(), py::arg("name"), py::arg("type") = Engine::Type::Undefined, py::arg("method") = Transport::Method::Undefined, "Add a data stream to the DTL") .def_property_readonly("all_streams", &DTL::get_all_streams, @@ -185,7 +186,7 @@ PYBIND11_MODULE(dtlmod, m) .def("unset_metadata_export", &Stream::unset_metadata_export, "Specify that metadata must not be exported for that stream") // Engine factory - .def("open", &Stream::open, py::arg("name"), py::call_guard(), py::arg("mode"), + .def("open", &Stream::open, py::arg("name"), py::call_guard(), py::arg("mode"), "Open a Stream and create an Engine") .def_property_readonly("num_publishers", &Stream::get_num_publishers, "The number of actors connected to this Stream in Mode::Publish (read-only)") @@ -197,14 +198,14 @@ PYBIND11_MODULE(dtlmod, m) [](Stream& self, std::string_view name, size_t element_size) { return self.define_variable(name, element_size); }, - py::call_guard(), py::arg("name"), py::arg("element_size"), + py::call_guard(), py::arg("name"), py::arg("element_size"), "Define a scalar variable for this Stream") .def( "define_variable", [](Stream& self, std::string_view name, const std::vector& shape, const std::vector& start, const std::vector& count, size_t element_size) { return self.define_variable(name, shape, start, count, element_size); }, - py::call_guard(), py::arg("name"), py::arg("shape"), py::arg("start"), + py::call_guard(), py::arg("name"), py::arg("shape"), py::arg("start"), py::arg("count"), py::arg("element_size"), "Define a variable for this Stream") .def_property_readonly("all_variables", &Stream::get_all_variables, "Retrieve the list of Variables by names") .def_property_readonly("metadata_file_name", 
&Stream::get_metadata_file_name, From be3abd786adbdf4353242b1622a0843df6aa9c07 Mon Sep 17 00:00:00 2001 From: Fred Suter Date: Mon, 1 Jun 2026 17:26:44 -0400 Subject: [PATCH 91/92] update weekly CI action --- .github/workflows/weekly-checks.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/weekly-checks.yml b/.github/workflows/weekly-checks.yml index 0f9ab06..a855bf7 100644 --- a/.github/workflows/weekly-checks.yml +++ b/.github/workflows/weekly-checks.yml @@ -63,7 +63,7 @@ jobs: id: cache-fsmod with: path: /opt/fsmod - key: ${{ runner.os }}-fsmod-python-v2 + key: ${{ runner.os }}-fsmod-python-v3 - name: Install FSMod if: steps.cache-fsmod.outputs.cache-hit != 'true' @@ -180,7 +180,7 @@ jobs: id: cache-fsmod with: path: /opt/fsmod - key: ${{ runner.os }}-fsmod-python-v2 + key: ${{ runner.os }}-fsmod-python-v3 - name: Install FSMod if: steps.cache-fsmod.outputs.cache-hit != 'true' From be35e5e1087d70dc7e03852d2ee779f786b068e9 Mon Sep 17 00:00:00 2001 From: Fred Suter Date: Mon, 1 Jun 2026 17:32:36 -0400 Subject: [PATCH 92/92] force cache refresh in github actions --- .github/workflows/weekly-checks.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/weekly-checks.yml b/.github/workflows/weekly-checks.yml index a855bf7..c3b1c96 100644 --- a/.github/workflows/weekly-checks.yml +++ b/.github/workflows/weekly-checks.yml @@ -47,7 +47,7 @@ jobs: id: cache-simgrid with: path: /opt/simgrid - key: ${{ runner.os }}-simgrid-python-v2 + key: ${{ runner.os }}-simgrid-python-v3 - name: Install SimGrid if: steps.cache-simgrid.outputs.cache-hit != 'true' @@ -164,7 +164,7 @@ jobs: id: cache-simgrid with: path: /opt/simgrid - key: ${{ runner.os }}-simgrid-python-v2 + key: ${{ runner.os }}-simgrid-python-v3 - name: Install SimGrid if: steps.cache-simgrid.outputs.cache-hit != 'true'