atomr_accel_cuda/error.rs
1//! Error taxonomy and the supervisor decider for context-poisoning recovery
2//! (§5.3, §5.11 of the architecture document).
3//!
//! atomr's supervisor (`atomr_core::supervision::Decider`) inspects a panic
//! message string rather than a typed error value. To trigger the
//! architecture document's
//! `OneForOne::default().on::<ContextPoisoned>(Restart)` behaviour, an
//! actor that detects context poisoning panics with a message containing
//! the string `"ContextPoisoned"`. The decider exposed here parses these
//! markers back into supervisor directives.
10
11use atomr_core::supervision::{Directive, OneForOneStrategy, SupervisorOf, SupervisorStrategy};
12use std::time::Duration;
13use thiserror::Error;
14
/// Tag that identifies a poisoned-context failure inside a panic message.
///
/// It is the leading text of [`GpuError::ContextPoisoned`]'s `Display`
/// output, and [`decider`] searches the panic message for it.
pub const CONTEXT_POISONED_TAG: &str = "ContextPoisoned";

/// Tag that identifies an out-of-memory failure, which the supervisor
/// should `Resume` past. Leading text of [`GpuError::OutOfMemory`]'s
/// `Display` output.
pub const OUT_OF_MEMORY_TAG: &str = "OutOfMemory";

/// Tag that identifies a fatal failure that should stop the device
/// entirely. Leading text of [`GpuError::Unrecoverable`]'s `Display` output.
pub const UNRECOVERABLE_TAG: &str = "Unrecoverable";
22
23#[derive(Debug, Error)]
24pub enum GpuError {
25 /// CUDA context is in a sticky-error state (§5.3). Triggers
26 /// `ContextActor` restart and a generation bump on `DeviceState`.
27 #[error("ContextPoisoned: {0}")]
28 ContextPoisoned(String),
29
30 /// Allocation failed but the context is still usable. Supervisor
31 /// `Resume`s the actor.
32 #[error("OutOfMemory: {0}")]
33 OutOfMemory(String),
34
35 /// Hardware fault or repeated poisoning past the retry budget.
36 #[error("Unrecoverable: {0}")]
37 Unrecoverable(String),
38
39 /// `GpuRef::access()` was called on a buffer whose context was
40 /// rebuilt or whose `DeviceActor` is shutting down (§5.8).
41 #[error("GpuRef stale: {0}")]
42 GpuRefStale(&'static str),
43
44 #[error("cudarc driver error: {0}")]
45 Driver(String),
46
47 /// cuBLAS-specific error. Retained for back-compat — new library
48 /// actors should emit [`GpuError::LibraryError`] with `lib = "cublas"`
49 /// instead. Will be removed in a future release.
50 #[deprecated(note = "use GpuError::LibraryError { lib: \"cublas\", msg } instead")]
51 #[error("cudarc cuBLAS error: {0}")]
52 Cublas(String),
53
54 /// Generic library error tagged with the originating CUDA library
55 /// name (e.g. `"cudnn"`, `"cufft"`, `"curand"`, `"cusolver"`,
56 /// `"cublaslt"`, `"nvrtc"`, `"nccl"`). Callers that need to
57 /// discriminate library failures match on `lib`.
58 #[error("cudarc {lib} error: {msg}")]
59 LibraryError { lib: &'static str, msg: String },
60
61 #[error("ask timed out before GPU completion")]
62 Timeout,
63}
64
65impl GpuError {
66 /// Construct a tagged library error.
67 pub fn lib(lib: &'static str, msg: impl Into<String>) -> Self {
68 Self::LibraryError {
69 lib,
70 msg: msg.into(),
71 }
72 }
73}
74
75impl GpuError {
76 /// Format suitable for panicking out of an actor handler so that the
77 /// atomr supervisor's decider can route it.
78 pub fn panic_message(&self) -> String {
79 self.to_string()
80 }
81}
82
83/// The supervisor decider used by `DeviceActor` to route `ContextActor`
84/// failures (§5.11).
85///
86/// Rakka 0.2.0 ships a typed `SupervisorOf<C>` trait (see the
87/// [`device_supervisor`] impl below) that lets `DeviceActor` pattern-
88/// match on `&GpuError` directly. The closure-based `decider()` here
89/// is retained as the runtime fallback used by
90/// [`device_supervisor_strategy`] — actors without an explicit
91/// `SupervisorOf<C>` impl fall through to this string-matching path,
92/// and panicking remains the failure transport regardless (since
93/// `Actor::handle` returns `()`). The typed trait simply replaces the
94/// receive-side parsing.
95pub fn decider() -> impl Fn(&str) -> Directive + Send + Sync + 'static {
96 |panic_msg: &str| {
97 if panic_msg.contains(CONTEXT_POISONED_TAG) {
98 Directive::Restart
99 } else if panic_msg.contains(OUT_OF_MEMORY_TAG) {
100 Directive::Resume
101 } else if panic_msg.contains(UNRECOVERABLE_TAG) {
102 Directive::Stop
103 } else {
104 // Default: surface the failure rather than masking it.
105 Directive::Escalate
106 }
107 }
108}
109
110/// Build the `SupervisorStrategy` `DeviceActor` applies to its
111/// `ContextActor` child (§5.11). Three retries inside a one-minute window;
112/// past that, the circuit opens and the device stops.
113pub fn device_supervisor_strategy() -> SupervisorStrategy {
114 OneForOneStrategy::new()
115 .with_max_retries(3)
116 .with_within(Duration::from_secs(60))
117 .with_decider(decider())
118 .into()
119}
120
121/// Typed `SupervisorOf<ContextActor>` adapter for `DeviceActor`.
122///
123/// atomr 0.2.0 added the [`SupervisorOf`] trait so a parent can decide
124/// child failures by pattern-matching a typed error rather than parsing
125/// the panic-string. The implementation here lives behind the
126/// [`device_supervisor`] zero-sized type so it can be used either
127/// independently (`DeviceSupervisor.decide(&err)`) or attached to
128/// future call sites that take a `SupervisorOf<C>` constraint.
129///
130/// We attach the impl to a marker rather than directly to `DeviceActor`
131/// so that the `error` module stays free of a circular dependency on
132/// `device::DeviceActor` / `device::ContextActor`. The decision logic
133/// is identical to the closure in [`decider`] — and indeed
134/// [`DeviceSupervisor::decide_str`] is what the closure-based code path
135/// internally calls.
136pub struct DeviceSupervisor;
137
138impl DeviceSupervisor {
139 /// Typed decider over `&GpuError`. Mirrors the panic-string match
140 /// in [`decider`].
141 pub fn decide(err: &GpuError) -> Directive {
142 match err {
143 GpuError::ContextPoisoned(_) => Directive::Restart,
144 GpuError::OutOfMemory(_) => Directive::Resume,
145 GpuError::Unrecoverable(_) => Directive::Stop,
146 GpuError::Timeout
147 | GpuError::GpuRefStale(_)
148 | GpuError::Driver(_)
149 | GpuError::LibraryError { .. } => Directive::Escalate,
150 #[allow(deprecated)]
151 GpuError::Cublas(_) => Directive::Escalate,
152 }
153 }
154
155 /// Convenience: decide directly from the panic-string transport.
156 /// Equivalent to invoking [`decider`] but available as a free
157 /// function for callers who already have the panic message in
158 /// hand.
159 pub fn decide_str(panic_msg: &str) -> Directive {
160 if panic_msg.contains(CONTEXT_POISONED_TAG) {
161 Directive::Restart
162 } else if panic_msg.contains(OUT_OF_MEMORY_TAG) {
163 Directive::Resume
164 } else if panic_msg.contains(UNRECOVERABLE_TAG) {
165 Directive::Stop
166 } else {
167 Directive::Escalate
168 }
169 }
170}
171
172/// Blanket `SupervisorOf<C>` impl: any atomr actor `C` whose failures
173/// the application classifies as [`GpuError`] can be supervised by
174/// this marker. The trait's `decide` method dispatches to
175/// [`DeviceSupervisor::decide`].
176impl<C> SupervisorOf<C> for DeviceSupervisor
177where
178 C: atomr_core::actor::Actor,
179{
180 type ChildError = GpuError;
181
182 fn decide(&self, err: &GpuError) -> Directive {
183 DeviceSupervisor::decide(err)
184 }
185}
186
#[cfg(test)]
mod tests {
    use super::*;

    // Panic-string transport: one tagged message per directive, plus the
    // untagged fallback.

    #[test]
    fn decider_routes_context_poisoned_to_restart() {
        let directive = decider()("ContextPoisoned: cuInit failed");
        assert_eq!(directive, Directive::Restart);
    }

    #[test]
    fn decider_routes_out_of_memory_to_resume() {
        let directive = decider()("OutOfMemory: alloc 1GB");
        assert_eq!(directive, Directive::Resume);
    }

    #[test]
    fn decider_routes_unrecoverable_to_stop() {
        let directive = decider()("Unrecoverable: hardware fault");
        assert_eq!(directive, Directive::Stop);
    }

    #[test]
    fn decider_escalates_unknown_panics() {
        let directive = decider()("some other panic");
        assert_eq!(directive, Directive::Escalate);
    }

    // Typed path: the same routing, driven by `GpuError` values.

    #[test]
    fn typed_supervisor_routes_context_poisoned_to_restart() {
        let poisoned = GpuError::ContextPoisoned("simulated".into());
        assert_eq!(DeviceSupervisor::decide(&poisoned), Directive::Restart);
    }

    #[test]
    fn typed_supervisor_routes_oom_to_resume() {
        let oom = GpuError::OutOfMemory("alloc 1GB".into());
        assert_eq!(DeviceSupervisor::decide(&oom), Directive::Resume);
    }

    #[test]
    fn typed_supervisor_routes_unrecoverable_to_stop() {
        let fatal = GpuError::Unrecoverable("hw fault".into());
        assert_eq!(DeviceSupervisor::decide(&fatal), Directive::Stop);
    }

    #[test]
    fn typed_supervisor_escalates_other() {
        // Variants without a dedicated recovery action must all escalate.
        let others = [
            GpuError::Timeout,
            GpuError::GpuRefStale("stale"),
            GpuError::lib("cublas", "x"),
        ];
        for err in &others {
            assert_eq!(DeviceSupervisor::decide(err), Directive::Escalate);
        }
    }
}