furiosa_visa_std/runtime/
mod.rs

1//! Runtime for dispatching device functions to CPU or device.
2
3mod backend;
4mod convert;
5mod ffi;
6mod kernel;
7
8pub use backend::{Backend, Cpu, Npu};
9pub use ffi::NpuDesc;
10pub use kernel::{Buffer, Kernel};
11
/// The [`Backend`] this compilation targets: [`Npu`] under `cfg(furiosa_opt)`, [`Cpu`] otherwise.
///
/// Exactly one definition of this alias is compiled in, selected by the `furiosa_opt` cfg flag.
#[cfg(furiosa_opt)]
pub type CurrentBackend = Npu;

// Counterpart for builds without `cfg(furiosa_opt)`; marked `#[doc(hidden)]`, so rustdoc omits it.
#[cfg(not(furiosa_opt))]
#[doc(hidden)]
pub type CurrentBackend = Cpu;
19
/// Computes the on-disk `.bin` location of a compiled kernel as seen from a `#[device]` macro expansion.
///
/// `module_path!()` starts with the crate name (`my_crate::sub::mod`), whereas the plugin names its artifacts
/// with `tcx.def_path_str`, which omits it (`sub::mod`). To make both sides agree with
/// `rustc_plugin::build_one`, the leading crate segment is dropped and every `::` becomes `__`.
///
/// The result has the shape `{out_dir}/{pkg}/{stem}.bin`, where `stem` is the remaining module segments
/// followed by `fn_name`, all joined with `__`.
pub fn kernel_path(out_dir: &str, pkg: &str, module_path: &str, fn_name: &str) -> String {
    let mut segments = module_path.split("::");
    // `split` always yields at least one item; that first item is the crate name and is discarded.
    let _crate_name = segments.next();

    let mut stem = String::new();
    for segment in segments {
        stem.push_str(segment);
        stem.push_str("__");
    }
    // The function name is always the final component of the stem.
    stem.push_str(fn_name);

    format!("{out_dir}/{pkg}/{stem}.bin")
}
33
/// Trait for applying a function to arguments.
///
/// Allows `launch(f, (a, b, c))` to call `f(a, b, c)` instead of `f((a, b, c))`. Single reference args can be
/// passed directly without a tuple wrapper.
///
/// `Args` is the packed argument type: either a tuple whose elements become the individual call arguments,
/// or a single `&T` / `&mut T` that is forwarded as the only argument.
pub trait TupleApply<Args> {
    /// Return type of the function.
    type Output;
    /// Apply the function to the arguments.
    ///
    /// Consumes both `self` (the callable) and `args`, and returns whatever the call produces.
    fn apply(self, args: Args) -> Self::Output;
}
44
45impl<F, A, R> TupleApply<&mut A> for F
46where
47    F: FnOnce(&mut A) -> R,
48{
49    type Output = R;
50    fn apply(self, a: &mut A) -> R {
51        self(a)
52    }
53}
54
55impl<F, A, R> TupleApply<&A> for F
56where
57    F: FnOnce(&A) -> R,
58{
59    type Output = R;
60    fn apply(self, a: &A) -> R {
61        self(a)
62    }
63}
64
65macro_rules! impl_tuple_apply {
66    ($($T:ident),+) => {
67        #[expect(non_snake_case, reason = "type parameters A..Z used as destructuring variable names")]
68        impl<Func, $($T,)+ Ret> TupleApply<($($T,)+)> for Func
69        where
70            Func: FnOnce($($T,)+) -> Ret,
71        {
72            type Output = Ret;
73            fn apply(self, ($($T,)+): ($($T,)+)) -> Ret {
74                self($($T,)+)
75            }
76        }
77    };
78}
79
80impl_tuple_apply!(A, B);
81impl_tuple_apply!(A, B, C);
82impl_tuple_apply!(A, B, C, D);
83impl_tuple_apply!(A, B, C, D, E);
84impl_tuple_apply!(A, B, C, D, E, G);
85impl_tuple_apply!(A, B, C, D, E, G, H);
86impl_tuple_apply!(A, B, C, D, E, G, H, I);
87impl_tuple_apply!(A, B, C, D, E, G, H, I, J);
88impl_tuple_apply!(A, B, C, D, E, G, H, I, J, K);
89impl_tuple_apply!(A, B, C, D, E, G, H, I, J, K, L);
90impl_tuple_apply!(A, B, C, D, E, G, H, I, J, K, L, M);
91impl_tuple_apply!(A, B, C, D, E, G, H, I, J, K, L, M, N);
92impl_tuple_apply!(A, B, C, D, E, G, H, I, J, K, L, M, N, O);
93impl_tuple_apply!(A, B, C, D, E, G, H, I, J, K, L, M, N, O, P);
94impl_tuple_apply!(A, B, C, D, E, G, H, I, J, K, L, M, N, O, P, Q);
95impl_tuple_apply!(A, B, C, D, E, G, H, I, J, K, L, M, N, O, P, Q, R);
96impl_tuple_apply!(A, B, C, D, E, G, H, I, J, K, L, M, N, O, P, Q, R, S);
97impl_tuple_apply!(A, B, C, D, E, G, H, I, J, K, L, M, N, O, P, Q, R, S, T);
98impl_tuple_apply!(A, B, C, D, E, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U);
99impl_tuple_apply!(A, B, C, D, E, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U, V);
100impl_tuple_apply!(A, B, C, D, E, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U, V, W);
101
/// Marker trait identifying the types that are allowed to cross into device functions.
///
/// # Implements DeviceSend
///
/// - The unit type `()`
/// - Scalars: `bool`, `i8`-`i64`, `u8`-`u64`, `usize`, `isize`, `f32`, `f64`
/// - Device memory types: `HbmTensor`, `HbmTensorView`, `HbmTensorViewMut`
/// - Context types: `&mut Context`
/// - Tuples of DeviceSend types (for argument composition)
///
/// # Does NOT implement DeviceSend
///
/// - `HostTensor` - lives in host memory
/// - `Vec<T>`, `String`, etc. - general collections
/// - User-defined types - the trait is crate-private, so it cannot be implemented from outside this crate
pub(crate) trait DeviceSend {}

/// Emits an empty `DeviceSend` impl for each listed type.
macro_rules! impl_device_send_scalar {
    ($($ty:ty),+ $(,)?) => {
        $(impl DeviceSend for $ty {})+
    };
}

impl_device_send_scalar!(
    (),
    bool,
    i8,
    i16,
    i32,
    i64,
    isize,
    u8,
    u16,
    u32,
    u64,
    usize,
    f32,
    f64,
);
132
133macro_rules! impl_device_send_tuple {
134    ($($T:ident),+) => {
135        impl<$($T: DeviceSend),+> DeviceSend for ($($T,)+) {}
136    };
137}
138
139impl_device_send_tuple!(A);
140impl_device_send_tuple!(A, B);
141impl_device_send_tuple!(A, B, C);
142impl_device_send_tuple!(A, B, C, D);
143impl_device_send_tuple!(A, B, C, D, E);
144impl_device_send_tuple!(A, B, C, D, E, F);
145impl_device_send_tuple!(A, B, C, D, E, F, G);
146impl_device_send_tuple!(A, B, C, D, E, F, G, H);
147impl_device_send_tuple!(A, B, C, D, E, F, G, H, I);
148impl_device_send_tuple!(A, B, C, D, E, F, G, H, I, J);
149impl_device_send_tuple!(A, B, C, D, E, F, G, H, I, J, K);
150impl_device_send_tuple!(A, B, C, D, E, F, G, H, I, J, K, L);
151impl_device_send_tuple!(A, B, C, D, E, F, G, H, I, J, K, L, M);
152impl_device_send_tuple!(A, B, C, D, E, F, G, H, I, J, K, L, M, N);
153impl_device_send_tuple!(A, B, C, D, E, F, G, H, I, J, K, L, M, N, O);
154impl_device_send_tuple!(A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P);
155impl_device_send_tuple!(A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q);
156impl_device_send_tuple!(A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R);
157impl_device_send_tuple!(A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S);
158impl_device_send_tuple!(A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T);
159impl_device_send_tuple!(A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U);
160impl_device_send_tuple!(A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U, V);
161impl_device_send_tuple!(A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U, V, W);
162
163impl<T> DeviceSend for std::marker::PhantomData<T> {}
164
/// Device function trait, generated by `#[device]` macro.
///
/// `cargo <subcommand>`: `execute()` runs the original function body on CPU. `cargo furiosa-opt <subcommand>`:
/// `execute()` loads the compiled EDF and runs on NPU.
///
/// `Args` is the argument pack handed to `execute`; both it and `Output` must implement the crate-private
/// `DeviceSend` marker.
// `DeviceSend` is `pub(crate)` while this trait is `pub`, which is what the `private_bounds` expectation
// below acknowledges.
#[expect(
    private_bounds,
    reason = "DeviceSend is intentionally sealed to prevent foreign impls"
)]
pub trait DeviceFn<Args: DeviceSend> {
    /// Return type of the device function.
    type Output: DeviceSend;
    /// Execute the device function.
    ///
    /// Takes no receiver: it is called on the implementing type itself and returns a future that resolves to
    /// `Self::Output`.
    fn execute(args: Args) -> impl std::future::Future<Output = Self::Output>;
}
179
180/// Launches a device function. Takes `F` by value so callers can pass the snake_case const emitted by
181/// `#[device]` (`launch(my_fn, args)`) rather than turbofishing the generated PascalCase unit struct
182/// (`<MyFn as DeviceFn<_>>::execute(args)`). The value is discarded; only its type drives trait dispatch.
183#[expect(
184    private_bounds,
185    reason = "DeviceSend is intentionally sealed to prevent foreign impls"
186)]
187pub async fn launch<F, P>(_f: F, args: P) -> F::Output
188where
189    F: DeviceFn<P>,
190    P: DeviceSend,
191{
192    F::execute(args).await
193}