Skip to main content

atomr_accel_cuda/hopper/
mod.rs

1//! Hopper / Blackwell primitives (Phase 5).
2//!
3//! Hopper (sm_90 / sm_90a) introduced four kernel-side primitives that
4//! materially change how high-throughput CUDA kernels are written:
5//!
6//! 1. **Tensor Memory Accelerator (TMA)** — bulk asynchronous tensor
7//!    copies between global and shared memory described by an opaque
8//!    `CUtensorMap` (built host-side via `cuTensorMapEncodeTiled`).
9//!    See [`tma`].
10//! 2. **WGMMA** — warp-group matrix multiply accumulate, the
11//!    successor to MMA. Issued from a warpgroup of 128 threads via
12//!    `wgmma.mma_async.sync`. See [`wgmma`].
13//! 3. **`cp.async`** — already on Ampere, but Hopper adds the
14//!    bulk-asynchronous TMA-driven `cp.async.bulk` that fences with
15//!    barrier objects. See [`cp_async`].
16//! 4. **Thread-block clusters** — a new launch dimension above grid /
17//!    block that exposes Distributed Shared Memory (DSM) and the
18//!    `cluster.sync` barrier. See [`cluster`].
19//!
20//! Blackwell (sm_100 / sm_120) adds the second-generation TMA, larger
21//! cluster sizes, the new fp4 / fp6 / mxfp variants, and tensor memory
22//! (TMEM) that backs `tcgen05.mma`. The `blackwell` cargo feature gates
23//! the additional intrinsics; the host-side wrappers are shared with
24//! Hopper through this module.
25//!
26//! ## Layout
27//!
28//! * [`tma`] — `TensorMapDescriptor` builder + the safe wrapper around
29//!   `cuTensorMapEncodeTiled`.
30//! * [`wgmma`] — public re-exports of the macro-defined `wgmma_*`
31//!   intrinsics (definitions live in `include/atomr_hopper.cuh`).
32//! * [`cp_async`] — `cp.async` pipeline macro shims.
33//! * [`cluster`] — [`LaunchSpec`] and the safe wrapper around
34//!   `cudaLaunchKernelExC` for cluster-dim launches; DSM helpers.
35
36pub mod cluster;
37pub mod cp_async;
38pub mod tma;
39pub mod wgmma;
40
41pub use cluster::{ClusterDim, LaunchSpec};
42pub use tma::{TensorMapDataType, TensorMapDescriptor, TensorMapInterleave, TensorMapSwizzle};
43
44/// Path to the vendored hopper header (`atomr_hopper.cuh`) shipped
45/// alongside the crate. NVRTC kernels can `--include-path` this and
46/// `#include "atomr_hopper.cuh"` to pick up the wgmma / cp.async /
47/// cluster macro shims.
48pub const ATOMR_HOPPER_HEADER_REL_PATH: &str = "include/atomr_hopper.cuh";
49
50/// Returns the absolute filesystem path to `atomr_hopper.cuh` if it
51/// exists alongside the crate sources. Returns `None` for installations
52/// that strip the `include/` directory (e.g. crates.io binary
53/// publication of just the compiled lib).
54pub fn atomr_hopper_header_path() -> Option<std::path::PathBuf> {
55    let p = std::path::Path::new(env!("CARGO_MANIFEST_DIR")).join(ATOMR_HOPPER_HEADER_REL_PATH);
56    if p.exists() {
57        Some(p)
58    } else {
59        None
60    }
61}
62
63#[cfg(test)]
64mod tests {
65    use super::*;
66
67    #[test]
68    fn header_path_resolves_in_workspace() {
69        // The header ships in-tree under `include/`; in the workspace
70        // build the file must exist.
71        let p = atomr_hopper_header_path();
72        assert!(
73            p.is_some(),
74            "atomr_hopper.cuh must ship alongside the crate"
75        );
76    }
77}