Skip to main content

atomr_accel_cuda/
error.rs

1//! Error taxonomy and the supervisor decider for context-poisoning recovery
2//! (§5.3, §5.11 of the architecture document).
3//!
//! atomr's supervisor (`atomr_core::supervision::Decider`) inspects a panic
//! message string rather than a typed error value. To trigger the doc's
//! `OneForOne::default().on::<ContextPoisoned>(Restart)` behaviour, an
//! actor that detects context poisoning panics with a message containing
//! the string `"ContextPoisoned"`. The decider exposed here parses these
//! markers back into supervisor directives.
10
11use atomr_core::supervision::{Directive, OneForOneStrategy, SupervisorOf, SupervisorStrategy};
12use std::time::Duration;
13use thiserror::Error;
14
/// Marker tag embedded in panic messages to signal a poisoned-context error.
/// [`decider`] looks for it as a substring of the panic message (it does not
/// have to be at the very start) and maps a match to `Directive::Restart`.
pub const CONTEXT_POISONED_TAG: &str = "ContextPoisoned";
/// Marker tag for OOM errors that the supervisor should `Resume` past.
/// Matched by [`decider`] as a substring of the panic message.
pub const OUT_OF_MEMORY_TAG: &str = "OutOfMemory";
/// Marker tag for fatal errors that should stop the device entirely.
/// Matched by [`decider`] as a substring of the panic message and mapped to
/// `Directive::Stop`.
pub const UNRECOVERABLE_TAG: &str = "Unrecoverable";
22
/// Error taxonomy for the CUDA accelerator actors.
///
/// `Display` is generated by `thiserror` from the `#[error(...)]`
/// attributes. The rendered text of [`GpuError::ContextPoisoned`],
/// [`GpuError::OutOfMemory`] and [`GpuError::Unrecoverable`] begins with
/// the value of the corresponding `*_TAG` constant, which is the marker the
/// string-based [`decider`] searches for in a panic message.
#[derive(Debug, Error)]
pub enum GpuError {
    /// CUDA context is in a sticky-error state (§5.3). Triggers
    /// `ContextActor` restart and a generation bump on `DeviceState`.
    #[error("ContextPoisoned: {0}")]
    ContextPoisoned(String),

    /// Allocation failed but the context is still usable. Supervisor
    /// `Resume`s the actor.
    #[error("OutOfMemory: {0}")]
    OutOfMemory(String),

    /// Hardware fault or repeated poisoning past the retry budget.
    #[error("Unrecoverable: {0}")]
    Unrecoverable(String),

    /// `GpuRef::access()` was called on a buffer whose context was
    /// rebuilt or whose `DeviceActor` is shutting down (§5.8).
    #[error("GpuRef stale: {0}")]
    GpuRefStale(&'static str),

    /// Failure reported through the cudarc driver layer, carried as text.
    /// NOTE(review): presumably the upstream driver error rendered to a
    /// string — confirm at the construction sites.
    #[error("cudarc driver error: {0}")]
    Driver(String),

    /// cuBLAS-specific error. Retained for back-compat — new library
    /// actors should emit [`GpuError::LibraryError`] with `lib = "cublas"`
    /// instead. Will be removed in a future release.
    #[deprecated(note = "use GpuError::LibraryError { lib: \"cublas\", msg } instead")]
    #[error("cudarc cuBLAS error: {0}")]
    Cublas(String),

    /// Generic library error tagged with the originating CUDA library
    /// name (e.g. `"cudnn"`, `"cufft"`, `"curand"`, `"cusolver"`,
    /// `"cublaslt"`, `"nvrtc"`, `"nccl"`). Callers that need to
    /// discriminate library failures match on `lib`.
    #[error("cudarc {lib} error: {msg}")]
    LibraryError { lib: &'static str, msg: String },

    /// An `ask` timed out before the GPU work completed. Carries no
    /// payload; the rendered message is fixed.
    #[error("ask timed out before GPU completion")]
    Timeout,
}
64
65impl GpuError {
66    /// Construct a tagged library error.
67    pub fn lib(lib: &'static str, msg: impl Into<String>) -> Self {
68        Self::LibraryError {
69            lib,
70            msg: msg.into(),
71        }
72    }
73}
74
75impl GpuError {
76    /// Format suitable for panicking out of an actor handler so that the
77    /// atomr supervisor's decider can route it.
78    pub fn panic_message(&self) -> String {
79        self.to_string()
80    }
81}
82
83/// The supervisor decider used by `DeviceActor` to route `ContextActor`
84/// failures (§5.11).
85///
86/// Rakka 0.2.0 ships a typed `SupervisorOf<C>` trait (see the
87/// [`device_supervisor`] impl below) that lets `DeviceActor` pattern-
88/// match on `&GpuError` directly. The closure-based `decider()` here
89/// is retained as the runtime fallback used by
90/// [`device_supervisor_strategy`] — actors without an explicit
91/// `SupervisorOf<C>` impl fall through to this string-matching path,
92/// and panicking remains the failure transport regardless (since
93/// `Actor::handle` returns `()`). The typed trait simply replaces the
94/// receive-side parsing.
95pub fn decider() -> impl Fn(&str) -> Directive + Send + Sync + 'static {
96    |panic_msg: &str| {
97        if panic_msg.contains(CONTEXT_POISONED_TAG) {
98            Directive::Restart
99        } else if panic_msg.contains(OUT_OF_MEMORY_TAG) {
100            Directive::Resume
101        } else if panic_msg.contains(UNRECOVERABLE_TAG) {
102            Directive::Stop
103        } else {
104            // Default: surface the failure rather than masking it.
105            Directive::Escalate
106        }
107    }
108}
109
110/// Build the `SupervisorStrategy` `DeviceActor` applies to its
111/// `ContextActor` child (§5.11). Three retries inside a one-minute window;
112/// past that, the circuit opens and the device stops.
113pub fn device_supervisor_strategy() -> SupervisorStrategy {
114    OneForOneStrategy::new()
115        .with_max_retries(3)
116        .with_within(Duration::from_secs(60))
117        .with_decider(decider())
118        .into()
119}
120
121/// Typed `SupervisorOf<ContextActor>` adapter for `DeviceActor`.
122///
123/// atomr 0.2.0 added the [`SupervisorOf`] trait so a parent can decide
124/// child failures by pattern-matching a typed error rather than parsing
125/// the panic-string. The implementation here lives behind the
126/// [`device_supervisor`] zero-sized type so it can be used either
127/// independently (`DeviceSupervisor.decide(&err)`) or attached to
128/// future call sites that take a `SupervisorOf<C>` constraint.
129///
130/// We attach the impl to a marker rather than directly to `DeviceActor`
131/// so that the `error` module stays free of a circular dependency on
132/// `device::DeviceActor` / `device::ContextActor`. The decision logic
133/// is identical to the closure in [`decider`] — and indeed
134/// [`DeviceSupervisor::decide_str`] is what the closure-based code path
135/// internally calls.
136pub struct DeviceSupervisor;
137
138impl DeviceSupervisor {
139    /// Typed decider over `&GpuError`. Mirrors the panic-string match
140    /// in [`decider`].
141    pub fn decide(err: &GpuError) -> Directive {
142        match err {
143            GpuError::ContextPoisoned(_) => Directive::Restart,
144            GpuError::OutOfMemory(_) => Directive::Resume,
145            GpuError::Unrecoverable(_) => Directive::Stop,
146            GpuError::Timeout
147            | GpuError::GpuRefStale(_)
148            | GpuError::Driver(_)
149            | GpuError::LibraryError { .. } => Directive::Escalate,
150            #[allow(deprecated)]
151            GpuError::Cublas(_) => Directive::Escalate,
152        }
153    }
154
155    /// Convenience: decide directly from the panic-string transport.
156    /// Equivalent to invoking [`decider`] but available as a free
157    /// function for callers who already have the panic message in
158    /// hand.
159    pub fn decide_str(panic_msg: &str) -> Directive {
160        if panic_msg.contains(CONTEXT_POISONED_TAG) {
161            Directive::Restart
162        } else if panic_msg.contains(OUT_OF_MEMORY_TAG) {
163            Directive::Resume
164        } else if panic_msg.contains(UNRECOVERABLE_TAG) {
165            Directive::Stop
166        } else {
167            Directive::Escalate
168        }
169    }
170}
171
172/// Blanket `SupervisorOf<C>` impl: any atomr actor `C` whose failures
173/// the application classifies as [`GpuError`] can be supervised by
174/// this marker. The trait's `decide` method dispatches to
175/// [`DeviceSupervisor::decide`].
176impl<C> SupervisorOf<C> for DeviceSupervisor
177where
178    C: atomr_core::actor::Actor,
179{
180    type ChildError = GpuError;
181
182    fn decide(&self, err: &GpuError) -> Directive {
183        DeviceSupervisor::decide(err)
184    }
185}
186
#[cfg(test)]
mod tests {
    use super::*;

    /// Runs `panic_msg` through a freshly built string decider.
    fn route(panic_msg: &str) -> Directive {
        decider()(panic_msg)
    }

    #[test]
    fn decider_routes_context_poisoned_to_restart() {
        assert_eq!(route("ContextPoisoned: cuInit failed"), Directive::Restart);
    }

    #[test]
    fn decider_routes_out_of_memory_to_resume() {
        assert_eq!(route("OutOfMemory: alloc 1GB"), Directive::Resume);
    }

    #[test]
    fn decider_routes_unrecoverable_to_stop() {
        assert_eq!(route("Unrecoverable: hardware fault"), Directive::Stop);
    }

    #[test]
    fn decider_escalates_unknown_panics() {
        assert_eq!(route("some other panic"), Directive::Escalate);
    }

    #[test]
    fn typed_supervisor_routes_context_poisoned_to_restart() {
        let poisoned = GpuError::ContextPoisoned("simulated".into());
        assert_eq!(DeviceSupervisor::decide(&poisoned), Directive::Restart);
    }

    #[test]
    fn typed_supervisor_routes_oom_to_resume() {
        let oom = GpuError::OutOfMemory("alloc 1GB".into());
        assert_eq!(DeviceSupervisor::decide(&oom), Directive::Resume);
    }

    #[test]
    fn typed_supervisor_routes_unrecoverable_to_stop() {
        let fatal = GpuError::Unrecoverable("hw fault".into());
        assert_eq!(DeviceSupervisor::decide(&fatal), Directive::Stop);
    }

    #[test]
    fn typed_supervisor_escalates_other() {
        // Variants without a dedicated recovery directive.
        let escalated = [
            GpuError::Timeout,
            GpuError::GpuRefStale("stale"),
            GpuError::lib("cublas", "x"),
        ];
        for err in &escalated {
            assert_eq!(DeviceSupervisor::decide(err), Directive::Escalate);
        }
    }
}
242}