atomr_accel_cuda/hopper/mod.rs
1//! Hopper / Blackwell primitives (Phase 5).
2//!
3//! Hopper (sm_90 / sm_90a) introduced four kernel-side primitives that
4//! materially change how high-throughput CUDA kernels are written:
5//!
6//! 1. **Tensor Memory Accelerator (TMA)** — bulk asynchronous tensor
7//! copies between global and shared memory described by an opaque
8//! `CUtensorMap` (built host-side via `cuTensorMapEncodeTiled`).
9//! See [`tma`].
10//! 2. **WGMMA** — warp-group matrix multiply accumulate, the
11//! successor to MMA. Issued from a warpgroup of 128 threads via
12//! `wgmma.mma_async.sync`. See [`wgmma`].
13//! 3. **`cp.async`** — already on Ampere, but Hopper adds the
14//! bulk-asynchronous TMA-driven `cp.async.bulk` that fences with
15//! barrier objects. See [`cp_async`].
16//! 4. **Thread-block clusters** — a new launch dimension above grid /
17//! block that exposes Distributed Shared Memory (DSM) and the
18//! `cluster.sync` barrier. See [`cluster`].
19//!
20//! Blackwell (sm_100 / sm_120) adds the second-generation TMA, larger
21//! cluster sizes, the new fp4 / fp6 / mxfp variants, and tensor memory
22//! (TMEM) that backs `tcgen05.mma`. The `blackwell` cargo feature gates
23//! the additional intrinsics; the host-side wrappers are shared with
24//! Hopper through this module.
25//!
26//! ## Layout
27//!
28//! * [`tma`] — `TensorMapDescriptor` builder + the safe wrapper around
29//! `cuTensorMapEncodeTiled`.
30//! * [`wgmma`] — public re-exports of the macro-defined `wgmma_*`
31//! intrinsics (definitions live in `include/atomr_hopper.cuh`).
32//! * [`cp_async`] — `cp.async` pipeline macro shims.
33//! * [`cluster`] — [`LaunchSpec`] and the safe wrapper around
34//! `cudaLaunchKernelExC` for cluster-dim launches; DSM helpers.
35
36pub mod cluster;
37pub mod cp_async;
38pub mod tma;
39pub mod wgmma;
40
41pub use cluster::{ClusterDim, LaunchSpec};
42pub use tma::{TensorMapDataType, TensorMapDescriptor, TensorMapInterleave, TensorMapSwizzle};
43
44/// Path to the vendored hopper header (`atomr_hopper.cuh`) shipped
45/// alongside the crate. NVRTC kernels can `--include-path` this and
46/// `#include "atomr_hopper.cuh"` to pick up the wgmma / cp.async /
47/// cluster macro shims.
48pub const ATOMR_HOPPER_HEADER_REL_PATH: &str = "include/atomr_hopper.cuh";
49
50/// Returns the absolute filesystem path to `atomr_hopper.cuh` if it
51/// exists alongside the crate sources. Returns `None` for installations
52/// that strip the `include/` directory (e.g. crates.io binary
53/// publication of just the compiled lib).
54pub fn atomr_hopper_header_path() -> Option<std::path::PathBuf> {
55 let p = std::path::Path::new(env!("CARGO_MANIFEST_DIR")).join(ATOMR_HOPPER_HEADER_REL_PATH);
56 if p.exists() {
57 Some(p)
58 } else {
59 None
60 }
61}
62
63#[cfg(test)]
64mod tests {
65 use super::*;
66
67 #[test]
68 fn header_path_resolves_in_workspace() {
69 // The header ships in-tree under `include/`; in the workspace
70 // build the file must exist.
71 let p = atomr_hopper_header_path();
72 assert!(
73 p.is_some(),
74 "atomr_hopper.cuh must ship alongside the crate"
75 );
76 }
77}