feat(triggers): added ConvergenceFailureTrigger

added new trigger which throws on convergence failures. This also required adding a new "step" method for other triggers which gets called every step instead of every trigger throw. The ConvergenceFailureTrigger has sucsessgully let teh engine evolve to 10Gyr without any meaningful growth of convergence failures.
This commit is contained in:
2025-11-04 13:22:57 -05:00
parent 7f19143ff6
commit 72a3f5bf4c
6 changed files with 285 additions and 21 deletions

View File

@@ -175,14 +175,24 @@ namespace gridfire::solver {
const size_t num_steps; ///< Number of CVODE steps taken so far.
const DynamicEngine& engine; ///< Reference to the engine.
const std::vector<fourdst::atomic::Species>& networkSpecies; ///< Species layout.
const size_t currentConvergenceFailures; ///< Total number of convergence failures
const size_t currentNonlinearIterations; ///< Total number of non linear iterations
/**
* @brief Construct a context snapshot.
*/
TimestepContext(
double t, const N_Vector& state, double dt, double last_step_time,
double t9, double rho, size_t num_steps, const DynamicEngine& engine,
const std::vector<fourdst::atomic::Species>& networkSpecies
double t,
const N_Vector& state,
double dt,
double last_step_time,
double t9,
double rho,
size_t num_steps,
const DynamicEngine& engine,
const std::vector<fourdst::atomic::Species>& networkSpecies,
size_t currentConvergenceFailure,
size_t currentNonlinearIterations
);
/**
@@ -281,7 +291,7 @@ namespace gridfire::solver {
SUNLinearSolver m_LS = nullptr; ///< Dense linear solver.
TimestepCallback m_callback; ///< Optional per-step callback.
std::optional<TimestepCallback> m_callback; ///< Optional per-step callback.
int m_num_steps = 0; ///< CVODE step counter (used for diagnostics and triggers).
bool m_stdout_logging_enabled = true; ///< If true, print per-step logs and use CV_ONE_STEP.

View File

@@ -71,6 +71,8 @@ namespace gridfire::trigger::solver::CVODE {
* (ctx.t - last_trigger_time) - interval for diagnostics.
*/
void update(const gridfire::solver::CVODESolverStrategy::TimestepContext &ctx) override;
void step(const gridfire::solver::CVODESolverStrategy::TimestepContext &ctx) override;
/**
* @brief Reset counters and last trigger bookkeeping (time and delta) to zero.
*/
@@ -149,6 +151,7 @@ namespace gridfire::trigger::solver::CVODE {
* @param ctx CVODE timestep context (unused except for symmetry with interface).
*/
void update(const gridfire::solver::CVODESolverStrategy::TimestepContext &ctx) override;
void step(const gridfire::solver::CVODESolverStrategy::TimestepContext &ctx) override;
/** @brief Reset counters to zero. */
void reset() override;
@@ -233,6 +236,7 @@ namespace gridfire::trigger::solver::CVODE {
* @param ctx CVODE timestep context.
*/
void update(const gridfire::solver::CVODESolverStrategy::TimestepContext &ctx) override;
void step(const gridfire::solver::CVODESolverStrategy::TimestepContext &ctx) override;
/** @brief Reset counters and clear the dt window. */
void reset() override;
@@ -268,6 +272,50 @@ namespace gridfire::trigger::solver::CVODE {
std::deque<double> m_timestep_window;
};
class ConvergenceFailureTrigger final : public Trigger<gridfire::solver::CVODESolverStrategy::TimestepContext> {
public:
explicit ConvergenceFailureTrigger(size_t totalFailures, float relativeFailureRate, size_t windowSize);
bool check(const gridfire::solver::CVODESolverStrategy::TimestepContext &ctx) const override;
void update(const gridfire::solver::CVODESolverStrategy::TimestepContext &ctx) override;
void step(const gridfire::solver::CVODESolverStrategy::TimestepContext &ctx) override;
void reset() override;
[[nodiscard]] std::string name() const override;
[[nodiscard]] std::string describe() const override;
[[nodiscard]] TriggerResult why(const gridfire::solver::CVODESolverStrategy::TimestepContext &ctx) const override;
[[nodiscard]] size_t numTriggers() const override;
[[nodiscard]] size_t numMisses() const override;
private:
/** @brief Logger used for trace/error diagnostics. */
quill::Logger* m_logger = fourdst::logging::LogManager::getInstance().getLogger("log");
/** @name Diagnostics counters */
///@{
mutable size_t m_hits = 0;
mutable size_t m_misses = 0;
mutable size_t m_updates = 0;
mutable size_t m_resets = 0;
size_t m_totalFailures;
float m_relativeFailureRate;
size_t m_windowSize;
std::deque<size_t> m_window;
private:
float current_mean() const;
bool abs_failure(const gridfire::solver::CVODESolverStrategy::TimestepContext& ctx) const;
bool rel_failure(const gridfire::solver::CVODESolverStrategy::TimestepContext& ctx) const;
};
/**
* @brief Compose a trigger suitable for deciding engine re-partitioning during CVODE solves.
*

View File

@@ -42,6 +42,12 @@ namespace gridfire::trigger {
* @param ctx Context snapshot used to update state.
*/
virtual void update(const TriggerContextStruct& ctx) = 0;
/**
* @brief similar to update but intended to be run on every step not just those where the trigger triggered
* @param ctx Context snapshot used to update state.
*/
virtual void step(const TriggerContextStruct& ctx) = 0;
/**
* @brief Reset internal state and diagnostics counters.
*/

View File

@@ -52,6 +52,8 @@ namespace gridfire::trigger {
* @brief Update both sub-triggers and increment update counter.
*/
void update(const TriggerContextStruct& ctx) override;
void step(const TriggerContextStruct& ctx) override;
/**
* @brief Reset both sub-triggers and local counters.
*/
@@ -84,6 +86,7 @@ namespace gridfire::trigger {
mutable size_t m_misses = 0;
mutable size_t m_updates = 0;
mutable size_t m_resets = 0;
mutable size_t m_steps = 0;
};
/**
@@ -101,6 +104,7 @@ namespace gridfire::trigger {
bool check(const TriggerContextStruct& ctx) const override;
void update(const TriggerContextStruct& ctx) override;
void step(const TriggerContextStruct& ctx) override;
void reset() override;
std::string name() const override;
TriggerResult why(const TriggerContextStruct& ctx) const override;
@@ -116,6 +120,7 @@ namespace gridfire::trigger {
mutable size_t m_misses = 0;
mutable size_t m_updates = 0;
mutable size_t m_resets = 0;
mutable size_t m_steps = 0;
};
/**
@@ -133,6 +138,7 @@ namespace gridfire::trigger {
bool check(const TriggerContextStruct& ctx) const override;
void update(const TriggerContextStruct& ctx) override;
void step(const TriggerContextStruct& ctx) override;
void reset() override;
std::string name() const override;
@@ -148,7 +154,7 @@ namespace gridfire::trigger {
mutable size_t m_misses = 0;
mutable size_t m_updates = 0;
mutable size_t m_resets = 0;
mutable size_t m_steps = 0;
};
/**
@@ -168,6 +174,7 @@ namespace gridfire::trigger {
bool check(const TriggerContextStruct& ctx) const override;
void update(const TriggerContextStruct& ctx) override;
void step(const TriggerContextStruct& ctx) override;
void reset() override;
std::string name() const override;
@@ -185,6 +192,7 @@ namespace gridfire::trigger {
mutable size_t m_misses = 0;
mutable size_t m_updates = 0;
mutable size_t m_resets = 0;
mutable size_t m_steps = 0;
};
///////////////////////////////
@@ -216,6 +224,13 @@ namespace gridfire::trigger {
m_updates++;
}
template <typename TriggerContextStruct>
void AndTrigger<TriggerContextStruct>::step(const TriggerContextStruct &ctx) {
m_A->step(ctx);
m_B->step(ctx);
m_steps++;
}
template <typename TriggerContextStruct>
void AndTrigger<TriggerContextStruct>::reset() {
m_A->reset();
@@ -301,6 +316,13 @@ namespace gridfire::trigger {
m_updates++;
}
template <typename TriggerContextStruct>
void OrTrigger<TriggerContextStruct>::step(const TriggerContextStruct &ctx) {
m_A->step(ctx);
m_B->step(ctx);
m_steps++;
}
template <typename TriggerContextStruct>
void OrTrigger<TriggerContextStruct>::reset() {
m_A->reset();
@@ -383,6 +405,12 @@ namespace gridfire::trigger {
m_updates++;
}
template <typename TriggerContextStruct>
void NotTrigger<TriggerContextStruct>::step(const TriggerContextStruct &ctx) {
m_A->step(ctx);
m_steps++;
}
template <typename TriggerContextStruct>
void NotTrigger<TriggerContextStruct>::reset() {
m_A->reset();
@@ -457,6 +485,12 @@ namespace gridfire::trigger {
m_updates++;
}
template <typename TriggerContextStruct>
void EveryNthTrigger<TriggerContextStruct>::step(const TriggerContextStruct &ctx) {
m_A->step(ctx);
m_steps++;
}
template <typename TriggerContextStruct>
void EveryNthTrigger<TriggerContextStruct>::reset() {
m_A->reset();
@@ -508,7 +542,4 @@ namespace gridfire::trigger {
size_t EveryNthTrigger<TriggerContextStruct>::numMisses() const {
return m_misses;
}
}

View File

@@ -90,7 +90,9 @@ namespace gridfire::solver {
const double rho,
const size_t num_steps,
const DynamicEngine &engine,
const std::vector<fourdst::atomic::Species> &networkSpecies
const std::vector<fourdst::atomic::Species> &networkSpecies,
const size_t currentConvergenceFailure,
const size_t currentNonlinearIterations
) :
t(t),
state(state),
@@ -100,7 +102,9 @@ namespace gridfire::solver {
rho(rho),
num_steps(num_steps),
engine(engine),
networkSpecies(networkSpecies)
networkSpecies(networkSpecies),
currentConvergenceFailures(currentConvergenceFailure),
currentNonlinearIterations(currentNonlinearIterations)
{}
std::vector<std::tuple<std::string, std::string>> CVODESolverStrategy::TimestepContext::describe() const {
@@ -114,6 +118,8 @@ namespace gridfire::solver {
description.emplace_back("num_steps", "Number of Steps Taken So Far");
description.emplace_back("engine", "Reference to the DynamicEngine");
description.emplace_back("networkSpecies", "Reference to the list of network species");
description.emplace_back("currentConvergenceFailures", "Number of convergence failures for the current step");
description.emplace_back("currentNonLinearIterations", "Number of nonlinear iterations for the current step");
return description;
}
@@ -179,7 +185,16 @@ namespace gridfire::solver {
[[maybe_unused]] double last_callback_time = 0;
m_num_steps = 0;
double accumulated_energy = 0.0;
int total_update_stages_triggered = 0;
size_t total_convergence_failures = 0;
size_t total_nonlinear_iterations = 0;
size_t total_update_stages_triggered = 0;
size_t prev_nonlinear_iterations = 0;
size_t prev_convergence_failures = 0;
size_t total_steps = 0;
LOG_TRACE_L1(m_logger, "Starting CVODE iteration");
while (current_time < netIn.tMax) {
user_data.T9 = T9;
@@ -209,28 +224,44 @@ namespace gridfire::solver {
sunrealtype* y_data = N_VGetArrayPointer(m_Y);
const double current_energy = y_data[numSpecies]; // Specific energy rate
size_t iter_diff = (total_nonlinear_iterations + nliters) - prev_nonlinear_iterations;
size_t convFail_diff = (total_convergence_failures + nlcfails) - prev_convergence_failures;
if (m_stdout_logging_enabled) {
std::cout << std::scientific << std::setprecision(3)
<< "Step: " << std::setw(6) << n_steps
<< " | Time: " << current_time << " [s]"
<< " | Last Step Size: " << last_step_size
<< " | Current Lightest Molar Abundance: " << y_data[0] << " [mol/g]"
<< " | Accumulated Energy: " << current_energy << " [erg/g]"
<< " | Total Non Linear Iterations: " << std::setw(2) << nliters
<< " | Total Convergence Failures: " << std::setw(2) << nlcfails
<< "Step: " << std::setw(6) << total_steps + n_steps
<< " | Updates: " << std::setw(3) << total_update_stages_triggered
<< " | Epoch Steps: " << std::setw(4) << n_steps
<< " | t: " << current_time << " [s]"
<< " | dt: " << last_step_size << " [s]"
// << " | Molar Abundance (min a): " << y_data[0] << " [mol/g]"
// << " | Accumulated Energy: " << current_energy << " [erg/g]"
<< " | Iterations: " << std::setw(6) << total_nonlinear_iterations + nliters
<< " (+" << std::setw(2) << iter_diff << ")"
<< " | Total Convergence Failures: " << std::setw(2) << total_convergence_failures + nlcfails
<< " (+" << std::setw(2) << convFail_diff << ")"
<< "\n";
}
auto ctx = TimestepContext(
current_time,
reinterpret_cast<N_Vector>(y_data),
m_Y,
last_step_size,
last_callback_time,
T9,
netIn.density,
n_steps,
m_engine,
m_engine.getNetworkSpecies());
m_engine.getNetworkSpecies(),
convFail_diff,
iter_diff);
prev_nonlinear_iterations = nliters + total_nonlinear_iterations;
prev_convergence_failures = nlcfails + total_convergence_failures;
if (m_callback.has_value()) {
m_callback.value()(ctx);
}
trigger->step(ctx);
if (trigger->check(ctx)) {
if (m_stdout_logging_enabled && displayTrigger) {
@@ -238,6 +269,10 @@ namespace gridfire::solver {
}
trigger->update(ctx);
accumulated_energy += current_energy; // Add the specific energy rate to the accumulated energy
total_nonlinear_iterations += nliters;
total_convergence_failures += nlcfails;
total_steps += n_steps;
LOG_INFO(
m_logger,
"Engine Update Triggered at time {} ({} update{} triggered). Current total specific energy {} [erg/g]",

View File

@@ -46,6 +46,12 @@ namespace gridfire::trigger::solver::CVODE {
}
}
void SimulationTimeTrigger::step(
const gridfire::solver::CVODESolverStrategy::TimestepContext &ctx
) {
// --- SimulationTimeTrigger::step does nothing and is intentionally left blank --- //
}
void SimulationTimeTrigger::reset() {
m_misses = 0;
m_hits = 0;
@@ -113,6 +119,12 @@ namespace gridfire::trigger::solver::CVODE {
m_updates++;
}
void OffDiagonalTrigger::step(
const gridfire::solver::CVODESolverStrategy::TimestepContext &ctx
) {
// --- OffDiagonalTrigger::step does nothing and is intentionally left blank --- //
}
void OffDiagonalTrigger::reset() {
m_misses = 0;
m_hits = 0;
@@ -199,6 +211,12 @@ namespace gridfire::trigger::solver::CVODE {
m_updates++;
}
void TimestepCollapseTrigger::step(
const gridfire::solver::CVODESolverStrategy::TimestepContext &ctx
) {
// --- TimestepCollapseTrigger::step does nothing and is intentionally left blank --- //
}
void TimestepCollapseTrigger::reset() {
m_misses = 0;
m_hits = 0;
@@ -239,6 +257,119 @@ namespace gridfire::trigger::solver::CVODE {
return m_misses;
}
ConvergenceFailureTrigger::ConvergenceFailureTrigger(
const size_t totalFailures,
const float relativeFailureRate,
const size_t windowSize
) :
m_totalFailures(totalFailures),
m_relativeFailureRate(relativeFailureRate),
m_windowSize(windowSize) {}
bool ConvergenceFailureTrigger::check(
const gridfire::solver::CVODESolverStrategy::TimestepContext &ctx
) const {
if (m_window.size() != m_windowSize) {
m_misses++;
return false; // Short circuit if not enough data has been seen yet.
}
if (abs_failure(ctx) || rel_failure(ctx)) {
m_hits++;
return true;
}
m_misses++;
return false;
}
void ConvergenceFailureTrigger::update(
const gridfire::solver::CVODESolverStrategy::TimestepContext &ctx
) {
// --- ConvergenceFailureTrigger::update does nothing and is intentionally left blank --- //
}
void ConvergenceFailureTrigger::step(
const gridfire::solver::CVODESolverStrategy::TimestepContext &ctx
) {
push_to_fixed_deque(m_window, ctx.currentConvergenceFailures, m_windowSize);
m_updates++;
}
void ConvergenceFailureTrigger::reset() {
m_window.clear();
m_hits = 0;
m_misses = 0;
m_updates = 0;
m_resets++;
}
std::string ConvergenceFailureTrigger::name() const {
return "ConvergenceFailureTrigger";
}
std::string ConvergenceFailureTrigger::describe() const {
return "ConvergenceFailureTrigger(abs_failure_threshold=" + std::to_string(m_totalFailures) + ", rel_failure_threshold=" + std::to_string(m_relativeFailureRate) + ", windowSize=" + std::to_string(m_windowSize) + ")";
}
TriggerResult ConvergenceFailureTrigger::why(const gridfire::solver::CVODESolverStrategy::TimestepContext &ctx) const {
TriggerResult result;
result.name = name();
if (m_window.size() != m_windowSize) {
result.value = false;
result.description = "Not triggered because trigger has not seen sufficient data to build up window of size " + std::to_string(m_windowSize);
return result;
}
if (abs_failure(ctx)) {
result.value = true;
result.description = std::format("Triggered because number of convergence failures ({}) exceeded absolute tolerances", ctx.currentConvergenceFailures);
return result;
}
if (rel_failure(ctx)) {
result.value = true;
result.description = std::format("Triggered because number of convergence failures - the mean ({} - {}) exceeded tolerances relative to mean ({} * {})", ctx.currentConvergenceFailures, current_mean(), current_mean(), m_relativeFailureRate);
return result;
}
result.value = false;
result.description = "Not triggered because total number of convergence failures and relative number of convergence triggers did not grow sufficiently";
return result;
}
size_t ConvergenceFailureTrigger::numTriggers() const {
return m_hits;
}
size_t ConvergenceFailureTrigger::numMisses() const {
return m_misses;
}
float ConvergenceFailureTrigger::current_mean() const {
float acc = 0;
for (const auto nlcfails: m_window) {
acc += nlcfails;
}
return acc / m_windowSize;
}
bool ConvergenceFailureTrigger::abs_failure(
const gridfire::solver::CVODESolverStrategy::TimestepContext &ctx
) const {
if (ctx.currentConvergenceFailures > m_totalFailures) {
return true;
}
return false;
}
bool ConvergenceFailureTrigger::rel_failure(
const gridfire::solver::CVODESolverStrategy::TimestepContext &ctx
) const {
const float mean = current_mean();
if (ctx.currentConvergenceFailures - mean > m_relativeFailureRate * mean) {
return true;
}
return false;
}
std::unique_ptr<Trigger<gridfire::solver::CVODESolverStrategy::TimestepContext>> makeEnginePartitioningTrigger(
const double simulationTimeInterval,
const double offDiagonalThreshold,
@@ -253,16 +384,19 @@ namespace gridfire::trigger::solver::CVODE {
// 1. Trigger every 1000th time that the simulation time exceeds the simulationTimeInterval
// 2. OR if any off-diagonal Jacobian entry exceeds the offDiagonalThreshold
// 3. OR every 10th time that the timestep growth exceeds the timestepGrowthThreshold (relative or absolute)
// 4. OR if the number of convergence failures begins to grow
// TODO: This logic likely needs to be revisited; however, for now it is easy enough to change and test and it works reasonably well
auto simulationTimeTrigger = std::make_unique<EveryNthTrigger<ctx_t>>(std::make_unique<SimulationTimeTrigger>(simulationTimeInterval), 1000);
auto offDiagTrigger = std::make_unique<OffDiagonalTrigger>(offDiagonalThreshold);
auto timestepGrowthTrigger = std::make_unique<EveryNthTrigger<ctx_t>>(std::make_unique<TimestepCollapseTrigger>(timestepGrowthThreshold, timestepGrowthRelative, timestepGrowthWindowSize), 10);
auto convergenceFailureTrigger = std::make_unique<ConvergenceFailureTrigger>(5, 1.0f, 10);
// Combine the triggers using logical OR
auto orTriggerA = std::make_unique<OrTrigger<ctx_t>>(std::move(simulationTimeTrigger), std::move(offDiagTrigger));
auto orTriggerB = std::make_unique<OrTrigger<ctx_t>>(std::move(orTriggerA), std::move(timestepGrowthTrigger));
auto orTriggerC = std::make_unique<OrTrigger<ctx_t>>(std::move(orTriggerB), std::move(convergenceFailureTrigger));
return orTriggerB;
return orTriggerC;
}
}