diff --git a/src/include/gridfire/solver/strategies/CVODE_solver_strategy.h b/src/include/gridfire/solver/strategies/CVODE_solver_strategy.h index f7d302b1..b652f35e 100644 --- a/src/include/gridfire/solver/strategies/CVODE_solver_strategy.h +++ b/src/include/gridfire/solver/strategies/CVODE_solver_strategy.h @@ -175,14 +175,24 @@ namespace gridfire::solver { const size_t num_steps; ///< Number of CVODE steps taken so far. const DynamicEngine& engine; ///< Reference to the engine. const std::vector& networkSpecies; ///< Species layout. + const size_t currentConvergenceFailures; ///< Total number of convergence failures + const size_t currentNonlinearIterations; ///< Total number of non linear iterations /** * @brief Construct a context snapshot. */ TimestepContext( - double t, const N_Vector& state, double dt, double last_step_time, - double t9, double rho, size_t num_steps, const DynamicEngine& engine, - const std::vector& networkSpecies + double t, + const N_Vector& state, + double dt, + double last_step_time, + double t9, + double rho, + size_t num_steps, + const DynamicEngine& engine, + const std::vector& networkSpecies, + size_t currentConvergenceFailure, + size_t currentNonlinearIterations ); /** @@ -281,7 +291,7 @@ namespace gridfire::solver { SUNLinearSolver m_LS = nullptr; ///< Dense linear solver. - TimestepCallback m_callback; ///< Optional per-step callback. + std::optional m_callback; ///< Optional per-step callback. int m_num_steps = 0; ///< CVODE step counter (used for diagnostics and triggers). bool m_stdout_logging_enabled = true; ///< If true, print per-step logs and use CV_ONE_STEP. diff --git a/src/include/gridfire/solver/strategies/triggers/engine_partitioning_trigger.h b/src/include/gridfire/solver/strategies/triggers/engine_partitioning_trigger.h index 64507542..3b242609 100644 --- a/src/include/gridfire/solver/strategies/triggers/engine_partitioning_trigger.h +++ b/src/include/gridfire/solver/strategies/triggers/engine_partitioning_trigger.h @@ -71,6 +71,8 @@ namespace gridfire::trigger::solver::CVODE { * (ctx.t - last_trigger_time) - interval for diagnostics. */ void update(const gridfire::solver::CVODESolverStrategy::TimestepContext &ctx) override; + + void step(const gridfire::solver::CVODESolverStrategy::TimestepContext &ctx) override; /** * @brief Reset counters and last trigger bookkeeping (time and delta) to zero. */ @@ -149,6 +151,7 @@ namespace gridfire::trigger::solver::CVODE { * @param ctx CVODE timestep context (unused except for symmetry with interface). */ void update(const gridfire::solver::CVODESolverStrategy::TimestepContext &ctx) override; + void step(const gridfire::solver::CVODESolverStrategy::TimestepContext &ctx) override; /** @brief Reset counters to zero. */ void reset() override; @@ -233,6 +236,7 @@ namespace gridfire::trigger::solver::CVODE { * @param ctx CVODE timestep context. */ void update(const gridfire::solver::CVODESolverStrategy::TimestepContext &ctx) override; + void step(const gridfire::solver::CVODESolverStrategy::TimestepContext &ctx) override; /** @brief Reset counters and clear the dt window. */ void reset() override; @@ -268,6 +272,50 @@ namespace gridfire::trigger::solver::CVODE { std::deque m_timestep_window; }; + class ConvergenceFailureTrigger final : public Trigger { + public: + explicit ConvergenceFailureTrigger(size_t totalFailures, float relativeFailureRate, size_t windowSize); + + bool check(const gridfire::solver::CVODESolverStrategy::TimestepContext &ctx) const override; + + void update(const gridfire::solver::CVODESolverStrategy::TimestepContext &ctx) override; + + void step(const gridfire::solver::CVODESolverStrategy::TimestepContext &ctx) override; + + void reset() override; + + [[nodiscard]] std::string name() const override; + + [[nodiscard]] std::string describe() const override; + + [[nodiscard]] TriggerResult why(const gridfire::solver::CVODESolverStrategy::TimestepContext &ctx) const override; + + [[nodiscard]] size_t numTriggers() const override; + + [[nodiscard]] size_t numMisses() const override; + + private: + /** @brief Logger used for trace/error diagnostics. */ + quill::Logger* m_logger = fourdst::logging::LogManager::getInstance().getLogger("log"); + /** @name Diagnostics counters */ + ///@{ + mutable size_t m_hits = 0; + mutable size_t m_misses = 0; + mutable size_t m_updates = 0; + mutable size_t m_resets = 0; + + size_t m_totalFailures; + float m_relativeFailureRate; + size_t m_windowSize; + + std::deque m_window; + + private: + float current_mean() const; + bool abs_failure(const gridfire::solver::CVODESolverStrategy::TimestepContext& ctx) const; + bool rel_failure(const gridfire::solver::CVODESolverStrategy::TimestepContext& ctx) const; + }; + /** * @brief Compose a trigger suitable for deciding engine re-partitioning during CVODE solves. * diff --git a/src/include/gridfire/trigger/trigger_abstract.h b/src/include/gridfire/trigger/trigger_abstract.h index 6d366777..16d1c117 100644 --- a/src/include/gridfire/trigger/trigger_abstract.h +++ b/src/include/gridfire/trigger/trigger_abstract.h @@ -42,6 +42,12 @@ namespace gridfire::trigger { * @param ctx Context snapshot used to update state. */ virtual void update(const TriggerContextStruct& ctx) = 0; + + /** + * @brief similar to update but intended to be run on every step not just those where the trigger triggered + * @param ctx Context snapshot used to update state. + */ + virtual void step(const TriggerContextStruct& ctx) = 0; /** * @brief Reset internal state and diagnostics counters. */ diff --git a/src/include/gridfire/trigger/trigger_logical.h b/src/include/gridfire/trigger/trigger_logical.h index 78fffbee..04b816b3 100644 --- a/src/include/gridfire/trigger/trigger_logical.h +++ b/src/include/gridfire/trigger/trigger_logical.h @@ -52,6 +52,8 @@ namespace gridfire::trigger { * @brief Update both sub-triggers and increment update counter. */ void update(const TriggerContextStruct& ctx) override; + + void step(const TriggerContextStruct& ctx) override; /** * @brief Reset both sub-triggers and local counters. */ @@ -84,6 +86,7 @@ namespace gridfire::trigger { mutable size_t m_misses = 0; mutable size_t m_updates = 0; mutable size_t m_resets = 0; + mutable size_t m_steps = 0; }; /** @@ -101,6 +104,7 @@ namespace gridfire::trigger { bool check(const TriggerContextStruct& ctx) const override; void update(const TriggerContextStruct& ctx) override; + void step(const TriggerContextStruct& ctx) override; void reset() override; std::string name() const override; TriggerResult why(const TriggerContextStruct& ctx) const override; @@ -116,6 +120,7 @@ namespace gridfire::trigger { mutable size_t m_misses = 0; mutable size_t m_updates = 0; mutable size_t m_resets = 0; + mutable size_t m_steps = 0; }; /** @@ -133,6 +138,7 @@ namespace gridfire::trigger { bool check(const TriggerContextStruct& ctx) const override; void update(const TriggerContextStruct& ctx) override; + void step(const TriggerContextStruct& ctx) override; void reset() override; std::string name() const override; @@ -148,7 +154,7 @@ namespace gridfire::trigger { mutable size_t m_misses = 0; mutable size_t m_updates = 0; mutable size_t m_resets = 0; - + mutable size_t m_steps = 0; }; /** @@ -168,6 +174,7 @@ namespace gridfire::trigger { bool check(const TriggerContextStruct& ctx) const override; void update(const TriggerContextStruct& ctx) override; + void step(const TriggerContextStruct& ctx) override; void reset() override; std::string name() const override; @@ -185,6 +192,7 @@ namespace gridfire::trigger { mutable size_t m_misses = 0; mutable size_t m_updates = 0; mutable size_t m_resets = 0; + mutable size_t m_steps = 0; }; /////////////////////////////// @@ -216,6 +224,13 @@ namespace gridfire::trigger { m_updates++; } + template + void AndTrigger::step(const TriggerContextStruct &ctx) { + m_A->step(ctx); + m_B->step(ctx); + m_steps++; + } + template void AndTrigger::reset() { m_A->reset(); @@ -301,6 +316,13 @@ namespace gridfire::trigger { m_updates++; } + template + void OrTrigger::step(const TriggerContextStruct &ctx) { + m_A->step(ctx); + m_B->step(ctx); + m_steps++; + } + template void OrTrigger::reset() { m_A->reset(); @@ -383,6 +405,12 @@ namespace gridfire::trigger { m_updates++; } + template + void NotTrigger::step(const TriggerContextStruct &ctx) { + m_A->step(ctx); + m_steps++; + } + template void NotTrigger::reset() { m_A->reset(); @@ -457,6 +485,12 @@ namespace gridfire::trigger { m_updates++; } + template + void EveryNthTrigger::step(const TriggerContextStruct &ctx) { + m_A->step(ctx); + m_steps++; + } + template void EveryNthTrigger::reset() { m_A->reset(); @@ -508,7 +542,4 @@ namespace gridfire::trigger { size_t EveryNthTrigger::numMisses() const { return m_misses; } - - - } diff --git a/src/lib/solver/strategies/CVODE_solver_strategy.cpp b/src/lib/solver/strategies/CVODE_solver_strategy.cpp index 42c4c57b..c3895751 100644 --- a/src/lib/solver/strategies/CVODE_solver_strategy.cpp +++ b/src/lib/solver/strategies/CVODE_solver_strategy.cpp @@ -90,7 +90,9 @@ namespace gridfire::solver { const double rho, const size_t num_steps, const DynamicEngine &engine, - const std::vector &networkSpecies + const std::vector &networkSpecies, + const size_t currentConvergenceFailure, + const size_t currentNonlinearIterations ) : t(t), state(state), @@ -100,7 +102,9 @@ namespace gridfire::solver { rho(rho), num_steps(num_steps), engine(engine), - networkSpecies(networkSpecies) + networkSpecies(networkSpecies), + currentConvergenceFailures(currentConvergenceFailure), + currentNonlinearIterations(currentNonlinearIterations) {} std::vector> CVODESolverStrategy::TimestepContext::describe() const { @@ -114,6 +118,8 @@ namespace gridfire::solver { description.emplace_back("num_steps", "Number of Steps Taken So Far"); description.emplace_back("engine", "Reference to the DynamicEngine"); description.emplace_back("networkSpecies", "Reference to the list of network species"); + description.emplace_back("currentConvergenceFailures", "Number of convergence failures for the current step"); + description.emplace_back("currentNonLinearIterations", "Number of nonlinear iterations for the current step"); return description; } @@ -179,7 +185,16 @@ namespace gridfire::solver { [[maybe_unused]] double last_callback_time = 0; m_num_steps = 0; double accumulated_energy = 0.0; - int total_update_stages_triggered = 0; + + size_t total_convergence_failures = 0; + size_t total_nonlinear_iterations = 0; + size_t total_update_stages_triggered = 0; + + size_t prev_nonlinear_iterations = 0; + size_t prev_convergence_failures = 0; + + size_t total_steps = 0; + LOG_TRACE_L1(m_logger, "Starting CVODE iteration"); while (current_time < netIn.tMax) { user_data.T9 = T9; @@ -209,28 +224,44 @@ namespace gridfire::solver { sunrealtype* y_data = N_VGetArrayPointer(m_Y); const double current_energy = y_data[numSpecies]; // Specific energy rate + size_t iter_diff = (total_nonlinear_iterations + nliters) - prev_nonlinear_iterations; + size_t convFail_diff = (total_convergence_failures + nlcfails) - prev_convergence_failures; if (m_stdout_logging_enabled) { std::cout << std::scientific << std::setprecision(3) - << "Step: " << std::setw(6) << n_steps - << " | Time: " << current_time << " [s]" - << " | Last Step Size: " << last_step_size - << " | Current Lightest Molar Abundance: " << y_data[0] << " [mol/g]" - << " | Accumulated Energy: " << current_energy << " [erg/g]" - << " | Total Non Linear Iterations: " << std::setw(2) << nliters - << " | Total Convergence Failures: " << std::setw(2) << nlcfails + << "Step: " << std::setw(6) << total_steps + n_steps + << " | Updates: " << std::setw(3) << total_update_stages_triggered + << " | Epoch Steps: " << std::setw(4) << n_steps + << " | t: " << current_time << " [s]" + << " | dt: " << last_step_size << " [s]" + // << " | Molar Abundance (min a): " << y_data[0] << " [mol/g]" + // << " | Accumulated Energy: " << current_energy << " [erg/g]" + << " | Iterations: " << std::setw(6) << total_nonlinear_iterations + nliters + << " (+" << std::setw(2) << iter_diff << ")" + << " | Total Convergence Failures: " << std::setw(2) << total_convergence_failures + nlcfails + << " (+" << std::setw(2) << convFail_diff << ")" << "\n"; } auto ctx = TimestepContext( current_time, - reinterpret_cast(y_data), + m_Y, last_step_size, last_callback_time, T9, netIn.density, n_steps, m_engine, - m_engine.getNetworkSpecies()); + m_engine.getNetworkSpecies(), + convFail_diff, + iter_diff); + + prev_nonlinear_iterations = nliters + total_nonlinear_iterations; + prev_convergence_failures = nlcfails + total_convergence_failures; + + if (m_callback.has_value()) { + m_callback.value()(ctx); + } + trigger->step(ctx); if (trigger->check(ctx)) { if (m_stdout_logging_enabled && displayTrigger) { @@ -238,6 +269,10 @@ namespace gridfire::solver { } trigger->update(ctx); accumulated_energy += current_energy; // Add the specific energy rate to the accumulated energy + total_nonlinear_iterations += nliters; + total_convergence_failures += nlcfails; + total_steps += n_steps; + LOG_INFO( m_logger, "Engine Update Triggered at time {} ({} update{} triggered). Current total specific energy {} [erg/g]", diff --git a/src/lib/solver/strategies/triggers/engine_partitioning_trigger.cpp b/src/lib/solver/strategies/triggers/engine_partitioning_trigger.cpp index 0cb30a3b..c5d7c469 100644 --- a/src/lib/solver/strategies/triggers/engine_partitioning_trigger.cpp +++ b/src/lib/solver/strategies/triggers/engine_partitioning_trigger.cpp @@ -46,6 +46,12 @@ namespace gridfire::trigger::solver::CVODE { } } + void SimulationTimeTrigger::step( + const gridfire::solver::CVODESolverStrategy::TimestepContext &ctx + ) { + // --- SimulationTimeTrigger::step does nothing and is intentionally left blank --- // + } + void SimulationTimeTrigger::reset() { m_misses = 0; m_hits = 0; @@ -113,6 +119,12 @@ namespace gridfire::trigger::solver::CVODE { m_updates++; } + void OffDiagonalTrigger::step( + const gridfire::solver::CVODESolverStrategy::TimestepContext &ctx + ) { + // --- OffDiagonalTrigger::step does nothing and is intentionally left blank --- // + } + void OffDiagonalTrigger::reset() { m_misses = 0; m_hits = 0; @@ -199,6 +211,12 @@ namespace gridfire::trigger::solver::CVODE { m_updates++; } + void TimestepCollapseTrigger::step( + const gridfire::solver::CVODESolverStrategy::TimestepContext &ctx + ) { + // --- TimestepCollapseTrigger::step does nothing and is intentionally left blank --- // + } + void TimestepCollapseTrigger::reset() { m_misses = 0; m_hits = 0; @@ -239,6 +257,119 @@ namespace gridfire::trigger::solver::CVODE { return m_misses; } + ConvergenceFailureTrigger::ConvergenceFailureTrigger( + const size_t totalFailures, + const float relativeFailureRate, + const size_t windowSize + ) : + m_totalFailures(totalFailures), + m_relativeFailureRate(relativeFailureRate), + m_windowSize(windowSize) {} + + bool ConvergenceFailureTrigger::check( + const gridfire::solver::CVODESolverStrategy::TimestepContext &ctx + ) const { + if (m_window.size() != m_windowSize) { + m_misses++; + return false; // Short circuit if not enough data has been seen yet. + } + if (abs_failure(ctx) || rel_failure(ctx)) { + m_hits++; + return true; + } + m_misses++; + return false; + } + + void ConvergenceFailureTrigger::update( + const gridfire::solver::CVODESolverStrategy::TimestepContext &ctx + ) { + // --- ConvergenceFailureTrigger::update does nothing and is intentionally left blank --- // + } + + void ConvergenceFailureTrigger::step( + const gridfire::solver::CVODESolverStrategy::TimestepContext &ctx + ) { + push_to_fixed_deque(m_window, ctx.currentConvergenceFailures, m_windowSize); + m_updates++; + } + + void ConvergenceFailureTrigger::reset() { + m_window.clear(); + m_hits = 0; + m_misses = 0; + m_updates = 0; + m_resets++; + } + + std::string ConvergenceFailureTrigger::name() const { + return "ConvergenceFailureTrigger"; + } + + std::string ConvergenceFailureTrigger::describe() const { + return "ConvergenceFailureTrigger(abs_failure_threshold=" + std::to_string(m_totalFailures) + ", rel_failure_threshold=" + std::to_string(m_relativeFailureRate) + ", windowSize=" + std::to_string(m_windowSize) + ")"; + } + + TriggerResult ConvergenceFailureTrigger::why(const gridfire::solver::CVODESolverStrategy::TimestepContext &ctx) const { + TriggerResult result; + result.name = name(); + + if (m_window.size() != m_windowSize) { + result.value = false; + result.description = "Not triggered because trigger has not seen sufficient data to build up window of size " + std::to_string(m_windowSize); + return result; + } + if (abs_failure(ctx)) { + result.value = true; + result.description = std::format("Triggered because number of convergence failures ({}) exceeded absolute tolerances", ctx.currentConvergenceFailures); + return result; + } + if (rel_failure(ctx)) { + result.value = true; + result.description = std::format("Triggered because number of convergence failures - the mean ({} - {}) exceeded tolerances relative to mean ({} * {})", ctx.currentConvergenceFailures, current_mean(), current_mean(), m_relativeFailureRate); + return result; + } + + result.value = false; + result.description = "Not triggered because total number of convergence failures and relative number of convergence triggers did not grow sufficiently"; + return result; + } + + size_t ConvergenceFailureTrigger::numTriggers() const { + return m_hits; + } + + size_t ConvergenceFailureTrigger::numMisses() const { + return m_misses; + } + + float ConvergenceFailureTrigger::current_mean() const { + float acc = 0; + for (const auto nlcfails: m_window) { + acc += nlcfails; + } + return acc / m_windowSize; + } + + bool ConvergenceFailureTrigger::abs_failure( + const gridfire::solver::CVODESolverStrategy::TimestepContext &ctx + ) const { + if (ctx.currentConvergenceFailures > m_totalFailures) { + return true; + } + return false; + } + + bool ConvergenceFailureTrigger::rel_failure( + const gridfire::solver::CVODESolverStrategy::TimestepContext &ctx + ) const { + const float mean = current_mean(); + if (ctx.currentConvergenceFailures - mean > m_relativeFailureRate * mean) { + return true; + } + return false; + } + std::unique_ptr> makeEnginePartitioningTrigger( const double simulationTimeInterval, const double offDiagonalThreshold, @@ -253,16 +384,19 @@ namespace gridfire::trigger::solver::CVODE { // 1. Trigger every 1000th time that the simulation time exceeds the simulationTimeInterval // 2. OR if any off-diagonal Jacobian entry exceeds the offDiagonalThreshold // 3. OR every 10th time that the timestep growth exceeds the timestepGrowthThreshold (relative or absolute) + // 4. OR if the number of convergence failures begins to grow // TODO: This logic likely needs to be revisited; however, for now it is easy enough to change and test and it works reasonably well auto simulationTimeTrigger = std::make_unique>(std::make_unique(simulationTimeInterval), 1000); auto offDiagTrigger = std::make_unique(offDiagonalThreshold); auto timestepGrowthTrigger = std::make_unique>(std::make_unique(timestepGrowthThreshold, timestepGrowthRelative, timestepGrowthWindowSize), 10); + auto convergenceFailureTrigger = std::make_unique(5, 1.0f, 10); // Combine the triggers using logical OR auto orTriggerA = std::make_unique>(std::move(simulationTimeTrigger), std::move(offDiagTrigger)); auto orTriggerB = std::make_unique>(std::move(orTriggerA), std::move(timestepGrowthTrigger)); + auto orTriggerC = std::make_unique>(std::move(orTriggerB), std::move(convergenceFailureTrigger)); - return orTriggerB; + return orTriggerC; } } \ No newline at end of file