perf(thread safety): All Engines are now thread safe

Previously engines were not thread safe; a separate engine would be
needed for every thread. This is no longer the case. This allows for
much more efficient parallel execution.
This commit is contained in:
2025-12-12 12:08:47 -05:00
parent c7574a2f3d
commit e114c0e240
46 changed files with 3685 additions and 1604 deletions

View File

@@ -1,9 +1,12 @@
// ReSharper disable CppUnusedIncludeDirective
#include <iostream>
#include <fstream>
#include <chrono>
#include <thread>
#include <format>
#include "gridfire/gridfire.h"
#include <cppad/utility/thread_alloc.hpp> // Required for parallel_setup
#include "fourdst/composition/composition.h"
#include "fourdst/logging/logging.h"
@@ -17,7 +20,15 @@
#include <clocale>
#include "gridfire/reaction/reaclib.h"
#include <omp.h>
// Returns the value of omp_get_thread_num() for the calling thread,
// converted to unsigned long.
unsigned long get_thread_id() {
    const int threadNumber = omp_get_thread_num();
    return static_cast<unsigned long>(threadNumber);
}
// Reports whether omp_in_parallel() returns a non-zero value for the caller.
bool in_parallel() {
    const int parallelFlag = omp_in_parallel();
    return parallelFlag != 0;
}
static std::terminate_handler g_previousHandler = nullptr;
static std::vector<std::pair<double, std::unordered_map<std::string, std::pair<double, double>>>> g_callbackHistory;
@@ -110,14 +121,14 @@ void log_results(const gridfire::NetOut& netOut, const gridfire::NetIn& netIn) {
std::vector<std::string> rowLabels = [&]() -> std::vector<std::string> {
std::vector<std::string> labels;
for (const auto& species : logSpecies) {
labels.push_back(std::string(species.name()));
labels.emplace_back(species.name());
}
labels.push_back("ε");
labels.push_back("dε/dT");
labels.push_back("dε/dρ");
labels.push_back("Eν");
labels.push_back("Fν");
labels.push_back("<μ>");
labels.emplace_back("ε");
labels.emplace_back("dε/dT");
labels.emplace_back("dε/dρ");
labels.emplace_back("Eν");
labels.emplace_back("Fν");
labels.emplace_back("<μ>");
return labels;
}();
@@ -145,13 +156,13 @@ void record_abundance_history_callback(const gridfire::solver::CVODESolverStrate
const auto& engine = ctx.engine;
// std::unordered_map<std::string, std::pair<double, double>> abundances;
std::vector<double> Y;
for (const auto& species : engine.getNetworkSpecies()) {
const size_t sid = engine.getSpeciesIndex(species);
for (const auto& species : engine.getNetworkSpecies(ctx.state_ctx)) {
const size_t sid = engine.getSpeciesIndex(ctx.state_ctx, species);
double y = N_VGetArrayPointer(ctx.state)[sid];
Y.push_back(y > 0.0 ? y : 0.0); // Regularize tiny negative abundances to zero
}
fourdst::composition::Composition comp(engine.getNetworkSpecies(), Y);
fourdst::composition::Composition comp(engine.getNetworkSpecies(ctx.state_ctx), Y);
std::unordered_map<std::string, std::pair<double, double>> abundances;
@@ -225,45 +236,116 @@ void callback_main(const gridfire::solver::CVODESolverStrategy::TimestepContext&
record_abundance_history_callback(ctx);
}
int main(int argc, char** argv) {
int main() {
using namespace gridfire;
CLI::App app{"GridFire Sandbox Application."};
constexpr size_t breaks = 100;
constexpr size_t breaks = 1;
double temp = 1.5e7;
double rho = 1.5e2;
double tMax = 3.1536e+17/breaks;
double tMax = 3.1536e+16/breaks;
app.add_option("-t,--temp", temp, "Temperature in K (Default 1.5e7K)");
app.add_option("-r,--rho", rho, "Density in g/cm^3 (Default 1.5e2g/cm^3)");
app.add_option("--tmax", tMax, "Maximum simulation time in s (Default 3.1536e17s)");
CLI11_PARSE(app, argc, argv);
NetIn netIn = init(temp, rho, tMax);
const NetIn netIn = init(temp, rho, tMax);
policy::MainSequencePolicy stellarPolicy(netIn.composition);
stellarPolicy.construct();
engine::DynamicEngine& engine = stellarPolicy.construct();
policy::ConstructionResults construct = stellarPolicy.construct();
std::println("Sandbox Engine Stack: {}", stellarPolicy);
std::println("Scratch Blob State: {}", *construct.scratch_blob);
solver::CVODESolverStrategy solver(engine);
solver.set_stdout_logging_enabled(false);
// solver.set_callback(solver::CVODESolverStrategy::TimestepCallback(callback_main));
fourdst::composition::Composition reinputComp = netIn.composition;
NetOut netOut;
const auto timer = std::chrono::high_resolution_clock::now();
for (int i = 0; i < breaks; ++i) {
NetIn in({.composition = reinputComp, .temperature = temp, .density = rho, .tMax = tMax, .dt0 = 1e-12});
netOut = solver.evaluate(in, false, false);
reinputComp = netOut.composition;
constexpr size_t runs = 1000;
auto startTime = std::chrono::high_resolution_clock::now();
// arrays to store timings
std::array<std::chrono::duration<double>, runs> setup_times;
std::array<std::chrono::duration<double>, runs> eval_times;
std::array<NetOut, runs> serial_results;
for (size_t i = 0; i < runs; ++i) {
auto start_setup_time = std::chrono::high_resolution_clock::now();
std::print("Run {}/{}\r", i + 1, runs);
solver::CVODESolverStrategy solver(construct.engine, *construct.scratch_blob);
// solver.set_callback(solver::CVODESolverStrategy::TimestepCallback(callback_main));
solver.set_stdout_logging_enabled(false);
auto end_setup_time = std::chrono::high_resolution_clock::now();
std::chrono::duration<double> setup_elapsed = end_setup_time - start_setup_time;
setup_times[i] = setup_elapsed;
auto start_eval_time = std::chrono::high_resolution_clock::now();
const NetOut netOut = solver.evaluate(netIn);
auto end_eval_time = std::chrono::high_resolution_clock::now();
serial_results[i] = netOut;
std::chrono::duration<double> eval_elapsed = end_eval_time - start_eval_time;
eval_times[i] = eval_elapsed;
// log_results(netOut, netIn);
}
auto endTime = std::chrono::high_resolution_clock::now();
std::chrono::duration<double> elapsed = endTime - startTime;
std::println("");
// Summarize serial timings
double total_setup_time = 0.0;
double total_eval_time = 0.0;
for (size_t i = 0; i < runs; ++i) {
total_setup_time += setup_times[i].count();
total_eval_time += eval_times[i].count();
}
std::println("Average Setup Time over {} runs: {:.6f} seconds", runs, total_setup_time / runs);
std::println("Average Evaluation Time over {} runs: {:.6f} seconds", runs, total_eval_time / runs);
std::println("Total Time for {} runs: {:.6f} seconds", runs, elapsed.count());
std::println("Final H-1 Abundances Serial: {}", serial_results[0].composition.getMolarAbundance(fourdst::atomic::H_1));
// Hand CppAD's thread_alloc the OpenMP thread count plus the two callbacks it
// uses to query the parallel state and the calling thread's number.
CppAD::thread_alloc::parallel_setup(
static_cast<size_t>(omp_get_max_threads()), // Max threads
[]() -> bool { return in_parallel(); }, // Function to check parallel state
[]() -> size_t { return get_thread_id(); } // Function to get thread ID
);
// OPTIONAL: Prevent CppAD from returning memory to the system
// during execution to reduce overhead (can speed up tight loops)
CppAD::thread_alloc::hold_memory(true);
std::array<NetOut, runs> parallelResults;
std::array<std::chrono::duration<double>, runs> setupTimes;
std::array<std::chrono::duration<double>, runs> evalTimes;
std::array<std::unique_ptr<gridfire::engine::scratch::StateBlob>, runs> workspaces;
for (size_t i = 0; i < runs; ++i) {
workspaces[i] = construct.scratch_blob->clone_structure();
}
const auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::high_resolution_clock::now() - timer).count();
std::cout << "Average execution time over run: " << duration/breaks << " ms" << std::endl;
std::cout << "Total execution time over " << breaks << " runs: " << duration << " ms" << std::endl;
log_results(netOut, netIn);
// log_callback_data(temp);
}
// Parallel runs
startTime = std::chrono::high_resolution_clock::now();
#pragma omp parallel for
for (size_t i = 0; i < runs; ++i) {
auto start_setup_time = std::chrono::high_resolution_clock::now();
solver::CVODESolverStrategy solver(construct.engine, *workspaces[i]);
solver.set_stdout_logging_enabled(false);
auto end_setup_time = std::chrono::high_resolution_clock::now();
std::chrono::duration<double> setup_elapsed = end_setup_time - start_setup_time;
setupTimes[i] = setup_elapsed;
auto start_eval_time = std::chrono::high_resolution_clock::now();
parallelResults[i] = solver.evaluate(netIn);
auto end_eval_time = std::chrono::high_resolution_clock::now();
std::chrono::duration<double> eval_elapsed = end_eval_time - start_eval_time;
evalTimes[i] = eval_elapsed;
}
endTime = std::chrono::high_resolution_clock::now();
elapsed = endTime - startTime;
std::println("");
// Summarize parallel timings
total_setup_time = 0.0;
total_eval_time = 0.0;
for (size_t i = 0; i < runs; ++i) {
total_setup_time += setupTimes[i].count();
total_eval_time += evalTimes[i].count();
}
std::println("Average Parallel Setup Time over {} runs: {:.6f} seconds", runs, total_setup_time / runs);
std::println("Average Parallel Evaluation Time over {} runs: {:.6f} seconds", runs, total_eval_time / runs);
std::println("Total Parallel Time for {} runs: {:.6f} seconds", runs, elapsed.count());
std::println("Final H-1 Abundances Parallel: {}", utils::iterable_to_delimited_string(parallelResults, ",", [](const auto& result) {
return result.composition.getMolarAbundance(fourdst::atomic::H_1);
}));
}