atomr_accel_cuda/kernel/blas_lt/
workspace.rs

1//! cuBLASLt workspace pool — recycles per-heuristic device buffers.
2//!
3//! cuBLASLt's `cublasLtMatmul` takes an opaque `workspace` device
4//! pointer + a `workspaceSizeInBytes`. The size depends on the
5//! selected algorithm; the heuristic cache reports a per-algorithm
6//! `workspaceSize` and we want to avoid allocating a fresh slab on
7//! every call. This pool buckets free slabs by **rounded-up size
8//! class** (next power of two ≥ requested) and hands them out under
9//! a `WorkspaceLease` RAII guard that returns the slab on Drop.
10//!
11//! The pool is intentionally a **plain struct** (not a separate
12//! actor) because `BlasLtActor` already has single-threaded ownership
13//! of all matmul calls. Wrapping it in another actor would just add a
14//! mailbox hop on the hot path. If a future phase needs cross-actor
15//! sharing we'll lift this to its own actor exactly the way
16//! `PinnedBufferPool` (whose allocator pattern this module mirrors)
17//! is structured.
18
19use std::collections::HashMap;
20use std::sync::Arc;
21
22use cudarc::driver::{CudaSlice, CudaStream};
23use parking_lot::Mutex;
24
25use crate::error::GpuError;
26
27/// Default cap on number of pooled slabs per size class. Beyond this,
28/// excess returns are dropped instead of pooled. With the default 256
29/// distinct heuristic shapes and a typical 2-3 distinct workspace
30/// classes (4 MiB, 32 MiB, 256 MiB) we expect ≤ a few hundred MiB of
31/// pinned VRAM in steady state.
32pub const DEFAULT_POOL_CAPACITY_PER_CLASS: usize = 4;
33
34/// Round a workspace request up to the next power of two ≥ 1 KiB.
35/// Bucketing by power-of-two limits the long-tail of unique sizes
36/// the pool tracks.
37pub fn size_class(bytes: usize) -> usize {
38    bytes.max(1024).next_power_of_two()
39}
40
41/// Inner pool state — guarded by a single mutex.
42struct WorkspacePoolInner {
43    /// Free slabs grouped by size class.
44    free: HashMap<usize, Vec<Arc<CudaSlice<u8>>>>,
45    /// Maximum number of free slabs to retain per size class.
46    per_class_capacity: usize,
47    /// Sum of bytes currently held in `free`. Tracked for
48    /// observability + to feed a future high-watermark eviction.
49    bytes_pooled: usize,
50}
51
52/// Cloneable handle to the workspace pool.
53#[derive(Clone)]
54pub struct WorkspacePool {
55    inner: Arc<Mutex<WorkspacePoolInner>>,
56}
57
58impl WorkspacePool {
59    pub fn new() -> Self {
60        Self::with_capacity(DEFAULT_POOL_CAPACITY_PER_CLASS)
61    }
62
63    pub fn with_capacity(per_class: usize) -> Self {
64        Self {
65            inner: Arc::new(Mutex::new(WorkspacePoolInner {
66                free: HashMap::new(),
67                per_class_capacity: per_class.max(1),
68                bytes_pooled: 0,
69            })),
70        }
71    }
72
73    /// Acquire a workspace slab of at least `requested_bytes`. If the
74    /// pool has a free slab in the matching size class it's reused;
75    /// otherwise a fresh `CudaSlice<u8>` is allocated against `stream`.
76    ///
77    /// The returned [`WorkspaceLease`] auto-returns to the pool on
78    /// Drop. Callers should *not* manually clone the inner slice
79    /// across the boundary — the kernel envelope already keeps the
80    /// `Arc<CudaSlice<u8>>` alive for the duration of the kernel.
81    pub fn acquire(
82        &self,
83        stream: &Arc<CudaStream>,
84        requested_bytes: usize,
85    ) -> Result<WorkspaceLease, GpuError> {
86        let class = size_class(requested_bytes.max(1));
87
88        // Check the pool first.
89        let pooled = {
90            let mut g = self.inner.lock();
91            if let Some(bucket) = g.free.get_mut(&class) {
92                let popped = bucket.pop();
93                if let Some(ref s) = popped {
94                    g.bytes_pooled = g.bytes_pooled.saturating_sub(s.len());
95                }
96                popped
97            } else {
98                None
99            }
100        };
101
102        let slab = match pooled {
103            Some(s) => s,
104            None => {
105                let s = unsafe { stream.alloc::<u8>(class) }.map_err(|e| {
106                    GpuError::OutOfMemory(format!("cublaslt workspace alloc {class}B: {e}"))
107                })?;
108                Arc::new(s)
109            }
110        };
111
112        Ok(WorkspaceLease {
113            slab: Some(slab),
114            class,
115            pool: self.inner.clone(),
116        })
117    }
118
119    /// Number of slabs currently in the free list across all size
120    /// classes. Diagnostic only.
121    pub fn pooled_slabs(&self) -> usize {
122        let g = self.inner.lock();
123        g.free.values().map(|v| v.len()).sum()
124    }
125
126    /// Total bytes currently in the free list.
127    pub fn pooled_bytes(&self) -> usize {
128        self.inner.lock().bytes_pooled
129    }
130}
131
132impl Default for WorkspacePool {
133    fn default() -> Self {
134        Self::new()
135    }
136}
137
138/// RAII lease — returns the slab to the pool on Drop. Callers can
139/// take a shared reference to the inner slice for kernel launch via
140/// [`WorkspaceLease::slice`].
141pub struct WorkspaceLease {
142    slab: Option<Arc<CudaSlice<u8>>>,
143    class: usize,
144    pool: Arc<Mutex<WorkspacePoolInner>>,
145}
146
147impl WorkspaceLease {
148    /// Reference to the underlying device slice (`CudaSlice<u8>`).
149    pub fn slice(&self) -> &Arc<CudaSlice<u8>> {
150        self.slab
151            .as_ref()
152            .expect("WorkspaceLease::slice after Drop")
153    }
154
155    /// Size class (rounded-up bytes) of the leased slab.
156    pub fn size(&self) -> usize {
157        self.class
158    }
159}
160
161impl Drop for WorkspaceLease {
162    fn drop(&mut self) {
163        let Some(slab) = self.slab.take() else {
164            return;
165        };
166        let mut g = self.pool.lock();
167        let cap = g.per_class_capacity;
168        let bucket = g.free.entry(self.class).or_default();
169        if bucket.len() < cap {
170            let bytes = slab.len();
171            bucket.push(slab);
172            g.bytes_pooled = g.bytes_pooled.saturating_add(bytes);
173        }
174        // else: drop the slab; CudaSlice's Drop frees the device memory.
175    }
176}
177
178#[cfg(test)]
179mod tests {
180    //! These tests stay GPU-free by exercising the pool's bookkeeping
181    //! through a hand-rolled `WorkspaceLease` path that reuses an
182    //! `Arc<CudaSlice<u8>>` we never construct (we use a synthetic
183    //! return helper that mirrors the Drop contract).
184    use super::*;
185
186    /// Reach into pool internals from tests to seed a free-list slab
187    /// without touching the GPU. Mirrors the Drop path.
188    fn seed_free_slot(pool: &WorkspacePool, class: usize, slab_bytes: usize) {
189        let mut g = pool.inner.lock();
190        // Synthesize a fake CudaSlice<u8>… we can't, so seed the
191        // bookkeeping directly: track the bytes_pooled but skip the
192        // actual `Arc<CudaSlice<u8>>` since constructing one without
193        // a context isn't possible. The pool exposes `pooled_bytes`
194        // so the recycle test verifies size-class accounting only.
195        g.bytes_pooled = g.bytes_pooled.saturating_add(slab_bytes);
196        // Ensure the bucket key exists (so we know recycling happens
197        // *into* this class on a subsequent return).
198        g.free.entry(class).or_default();
199    }
200
201    #[test]
202    fn size_class_rounds_up() {
203        assert_eq!(size_class(1), 1024);
204        assert_eq!(size_class(1024), 1024);
205        assert_eq!(size_class(1025), 2048);
206        assert_eq!(size_class(4 * 1024 * 1024), 4 * 1024 * 1024);
207        assert_eq!(size_class(4 * 1024 * 1024 + 1), 8 * 1024 * 1024);
208        assert_eq!(size_class(0), 1024);
209    }
210
211    #[test]
212    fn workspace_pool_recycles() {
213        // We can't allocate real CudaSlices without a GPU, so this
214        // test exercises the pool's bookkeeping (size_class +
215        // pooled_bytes accounting) and checks that the bucket map
216        // tracks classes correctly across "returns".
217        let pool = WorkspacePool::with_capacity(2);
218        assert_eq!(pool.pooled_slabs(), 0);
219
220        seed_free_slot(&pool, size_class(4 * 1024 * 1024), 4 * 1024 * 1024);
221        assert_eq!(pool.pooled_bytes(), 4 * 1024 * 1024);
222
223        seed_free_slot(&pool, size_class(33_554_432), 33_554_432);
224        assert_eq!(pool.pooled_bytes(), 4 * 1024 * 1024 + 33_554_432);
225        // Two distinct size classes — both buckets exist.
226        assert!(pool
227            .inner
228            .lock()
229            .free
230            .contains_key(&size_class(4 * 1024 * 1024)));
231        assert!(pool.inner.lock().free.contains_key(&size_class(33_554_432)));
232    }
233
234    #[test]
235    fn pool_capacity_clamps_to_one() {
236        let pool = WorkspacePool::with_capacity(0);
237        assert_eq!(pool.inner.lock().per_class_capacity, 1);
238    }
239}
atomr_accel_cuda/kernel/blas_lt/workspace.rs

atomr_accel_cuda/kernel/blas_lt/
workspace.rs