furiosa_visa_std/runtime/kernel.rs

use furiosa_mapping::M;

use super::ffi;
use crate::memory_tensor::HbmTensor;
use crate::prelude::{Address, HostTensor};
use crate::scalar::{Opt, Scalar};

/// Opaque handle to a device-runtime buffer.
///
/// Wraps a raw pointer returned by the FFI layer. Automatically freed on drop.
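///
/// A minimal usage sketch; `kernel` is assumed to be an already-loaded [`Kernel`]
/// and the size is arbitrary:
///
/// ```ignore
/// let buf = kernel.alloc(4096);
/// drop(buf); // the underlying FFI buffer is freed here
/// ```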
#[derive(Debug)]
pub struct Buffer(*mut ffi::Buffer);

unsafe impl Send for Buffer {}
unsafe impl Sync for Buffer {}

impl Drop for Buffer {
    fn drop(&mut self) {
        if !self.0.is_null() {
            unsafe { ffi::lib().furiosa_buffer_free(self.0) }
        }
    }
}

impl Clone for Buffer {
    fn clone(&self) -> Self {
        Buffer(unsafe { ffi::lib().furiosa_buffer_clone(self.0) })
    }
}

impl Buffer {
    pub(super) fn from_raw(ptr: *mut ffi::Buffer) -> Self {
        Buffer(ptr)
    }

    pub(super) fn as_ptr(&self) -> *const ffi::Buffer {
        self.0
    }

    /// Allocates a host-side staging buffer of `size` bytes.
    fn cpu(size: usize) -> Self {
        let ptr = unsafe { ffi::lib().furiosa_buffer_cpu(size) };
        assert!(!ptr.is_null(), "failed to allocate CPU buffer");
        Buffer(ptr)
    }

    /// Wraps an existing device memory region of `len` bytes at `addr`.
    fn npu(addr: u64, len: usize) -> Self {
        Buffer(unsafe { ffi::lib().furiosa_buffer_from_npu(ffi::rt(), addr, len) })
    }

    /// Host-visible pointer to the buffer's contents.
    fn data_ptr(&self) -> *mut u8 {
        unsafe { ffi::lib().furiosa_buffer_addr(self.as_ptr()) as *mut u8 }
    }
}

/// Device kernel loaded from a serialized binary.
pub struct Kernel {
    ptr: *mut ffi::Kernel,
}

unsafe impl Send for Kernel {}
unsafe impl Sync for Kernel {}

impl std::fmt::Debug for Kernel {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.debug_struct("Kernel").finish_non_exhaustive()
    }
}

impl Drop for Kernel {
    fn drop(&mut self) {
        unsafe { ffi::lib().furiosa_kernel_free(self.ptr) }
    }
}

impl Kernel {
    /// Loads a kernel from a serialized binary. On read failure, defers to [`crate::diag`] for
    /// structured panic rendering; the runtime itself does no formatting or log parsing.
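    ///
    /// A minimal usage sketch (the path below is hypothetical):
    ///
    /// ```ignore
    /// let kernel = Kernel::load("kernels/example.bin").await;
    /// ```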
    pub async fn load(path: &str) -> Self {
        crate::diag::install_hook();
        let Ok(data) = std::fs::read(path) else {
            match crate::diag::failure_payload(path) {
                Some(payload) => panic!("{payload}"),
                None => panic!("failed to load kernel `{path}`"),
            }
        };
        log::debug!("load: {} bytes", data.len());

        let ptr = unsafe { ffi::lib().furiosa_kernel_load(ffi::rt(), data.as_ptr(), data.len()) };
        assert!(!ptr.is_null(), "failed to load kernel");

        log::debug!("load: kernel loaded");
        Kernel { ptr }
    }

    /// Copies a host tensor to device memory via DMA.
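    ///
    /// A hypothetical sketch; the scalar type, the `ChipMap`/`ElemMap` mapping markers,
    /// and `addr` are placeholders chosen for illustration:
    ///
    /// ```ignore
    /// // `elems` is a Vec<Opt<f32>>; `ChipMap` and `ElemMap` stand in for the caller's
    /// // furiosa_mapping markers, and `addr` is a previously chosen device address.
    /// let host: HostTensor<f32, ElemMap> = HostTensor::from_buf(elems);
    /// let hbm: HbmTensor<f32, ChipMap, ElemMap> = Kernel::write(&host, addr).await;
    /// ```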
    pub async fn write<D: Scalar, Element: M, Chip: M, Element2: M>(
        host: &HostTensor<D, Element>,
        addr: Address,
    ) -> HbmTensor<D, Chip, Element2> {
        let stride = std::mem::size_of::<D>();
        let len = host.data().len() * stride;
        log::debug!("write: addr=0x{addr:x}, len={len}");

        // Stage every element into a host-side buffer; uninitialized slots are zero-filled.
        let src = Buffer::cpu(len);
        let ptr = src.data_ptr();
        for (i, opt) in host.data().iter().enumerate() {
            let offset = i * stride;
            unsafe {
                match opt {
                    Opt::Init(val) => {
                        std::ptr::copy_nonoverlapping(val as *const D as *const u8, ptr.add(offset), stride);
                    }
                    Opt::Uninit => {
                        std::ptr::write_bytes(ptr.add(offset), 0, stride);
                    }
                }
            }
        }

        // DMA the staged bytes into device memory at `addr`.
        let dst = Buffer::npu(addr, len);
        assert!(
            unsafe { ffi::lib().furiosa_write(ffi::rt(), src.as_ptr(), dst.as_ptr()) } == 0,
            "DMA write failed"
        );
        unsafe { HbmTensor::from_addr(addr) }
    }

    /// Copies a device tensor back to the host via DMA.
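    ///
    /// A hypothetical sketch, reading back a tensor staged earlier (types are placeholders):
    ///
    /// ```ignore
    /// // `hbm` is an HbmTensor obtained e.g. from a previous `Kernel::write`.
    /// let host: HostTensor<f32, ElemMap> = Kernel::read(&hbm).await;
    /// ```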
    pub async fn read<D: Scalar, Chip: M, Element: M, Element2: M>(
        hbm: &HbmTensor<D, Chip, Element>,
    ) -> HostTensor<D, Element2> {
        let stride = std::mem::size_of::<D>();
        let len = hbm.data().len() * stride;
        log::debug!("read: addr=0x{:x}, len={len}", hbm.address());

        // DMA the device region into a host-side buffer.
        let src = Buffer::npu(hbm.address(), len);
        let dst = Buffer::cpu(len);
        let count = hbm.data().len();
        assert!(
            unsafe { ffi::lib().furiosa_read(ffi::rt(), src.as_ptr(), dst.as_ptr()) } == 0,
            "DMA read failed"
        );
        // Reinterpret the raw bytes as `count` scalars.
        let ptr = dst.data_ptr() as *const D;
        let elems: Vec<Opt<D>> = (0..count)
            .map(|i| Opt::Init(unsafe { std::ptr::read(ptr.add(i)) }))
            .collect();
        HostTensor::from_buf(elems)
    }

    /// Executes the kernel with the given input and output buffers.
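    ///
    /// A minimal sketch; buffer sizes are illustrative only:
    ///
    /// ```ignore
    /// let input = kernel.alloc(1024);
    /// let output = kernel.alloc(1024);
    /// kernel.run(&[input], &[output]).await;
    /// ```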
    pub async fn run(&self, inputs: &[Buffer], outputs: &[Buffer]) {
        log::debug!("run: inputs={}, outputs={}", inputs.len(), outputs.len());
        let in_ptrs: Vec<*const ffi::Buffer> = inputs.iter().map(|b| b.as_ptr()).collect();
        let out_ptrs: Vec<*const ffi::Buffer> = outputs.iter().map(|b| b.as_ptr()).collect();
        assert!(
            unsafe {
                ffi::lib().furiosa_kernel_run(
                    self.ptr,
                    ffi::rt(),
                    in_ptrs.as_ptr(),
                    in_ptrs.len(),
                    out_ptrs.as_ptr(),
                    out_ptrs.len(),
                )
            } == 0,
            "kernel execution failed"
        );
    }

    /// Allocates a buffer of `size` bytes on the device.
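    ///
    /// A minimal sketch (the size is arbitrary; the buffer is freed when dropped):
    ///
    /// ```ignore
    /// let scratch = kernel.alloc(64 * 1024);
    /// ```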
    pub fn alloc(&self, size: usize) -> Buffer {
        let ptr = unsafe { ffi::lib().furiosa_buffer_npu(ffi::rt(), size) };
        assert!(!ptr.is_null(), "failed to allocate buffer");
        Buffer(ptr)
    }
}