furiosa_visa_std/runtime/
kernel.rs1use furiosa_mapping::M;
2
3use super::ffi;
4use crate::memory_tensor::HbmTensor;
5use crate::prelude::{Address, HostTensor};
6use crate::scalar::{Opt, Scalar};
7
8#[derive(Debug)]
12pub struct Buffer(*mut ffi::Buffer);
13
14unsafe impl Send for Buffer {}
15unsafe impl Sync for Buffer {}
16
17impl Drop for Buffer {
18 fn drop(&mut self) {
19 if !self.0.is_null() {
20 unsafe { ffi::lib().furiosa_buffer_free(self.0) }
21 }
22 }
23}
24
25impl Clone for Buffer {
26 fn clone(&self) -> Self {
27 Buffer(unsafe { ffi::lib().furiosa_buffer_clone(self.0) })
28 }
29}
30
31impl Buffer {
32 pub(super) fn from_raw(ptr: *mut ffi::Buffer) -> Self {
33 Buffer(ptr)
34 }
35
36 pub(super) fn as_ptr(&self) -> *const ffi::Buffer {
37 self.0
38 }
39
40 fn cpu(size: usize) -> Self {
41 let ptr = unsafe { ffi::lib().furiosa_buffer_cpu(size) };
42 assert!(!ptr.is_null(), "failed to allocate CPU buffer");
43 Buffer(ptr)
44 }
45
46 fn npu(addr: u64, len: usize) -> Self {
47 Buffer(unsafe { ffi::lib().furiosa_buffer_from_npu(ffi::rt(), addr, len) })
48 }
49
50 fn data_ptr(&self) -> *mut u8 {
51 unsafe { ffi::lib().furiosa_buffer_addr(self.as_ptr()) as *mut u8 }
52 }
53}
54
55pub struct Kernel {
57 ptr: *mut ffi::Kernel,
58}
59
60unsafe impl Send for Kernel {}
61unsafe impl Sync for Kernel {}
62
63impl std::fmt::Debug for Kernel {
64 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
65 f.debug_struct("Kernel").finish_non_exhaustive()
66 }
67}
68
69impl Drop for Kernel {
70 fn drop(&mut self) {
71 unsafe { ffi::lib().furiosa_kernel_free(self.ptr) }
72 }
73}
74
75impl Kernel {
76 pub async fn load(path: &str) -> Self {
79 crate::diag::install_hook();
80 let Ok(data) = std::fs::read(path) else {
81 match crate::diag::failure_payload(path) {
82 Some(payload) => panic!("{payload}"),
83 None => panic!("failed to load kernel `{path}`"),
84 }
85 };
86 log::debug!("load: {} bytes", data.len());
87
88 let ptr = unsafe { ffi::lib().furiosa_kernel_load(ffi::rt(), data.as_ptr(), data.len()) };
89 assert!(!ptr.is_null(), "failed to load kernel");
90
91 log::debug!("load: kernel loaded");
92 Kernel { ptr }
93 }
94
95 pub async fn write<D: Scalar, Element: M, Chip: M, Element2: M>(
97 host: &HostTensor<D, Element>,
98 addr: Address,
99 ) -> HbmTensor<D, Chip, Element2> {
100 let stride = std::mem::size_of::<D>();
101 let len = host.data().len() * stride;
102 log::debug!("write: addr=0x{addr:x}, len={len}");
103
104 let src = Buffer::cpu(len);
105 let ptr = src.data_ptr();
106 for (i, opt) in host.data().iter().enumerate() {
107 let offset = i * stride;
108 unsafe {
109 match opt {
110 Opt::Init(val) => {
111 std::ptr::copy_nonoverlapping(val as *const D as *const u8, ptr.add(offset), stride);
112 }
113 Opt::Uninit => {
114 std::ptr::write_bytes(ptr.add(offset), 0, stride);
115 }
116 }
117 }
118 }
119
120 let dst = Buffer::npu(addr, len);
121 assert!(
122 unsafe { ffi::lib().furiosa_write(ffi::rt(), src.as_ptr(), dst.as_ptr()) } == 0,
123 "DMA write failed"
124 );
125 unsafe { HbmTensor::from_addr(addr) }
126 }
127
128 pub async fn read<D: Scalar, Chip: M, Element: M, Element2: M>(
130 hbm: &HbmTensor<D, Chip, Element>,
131 ) -> HostTensor<D, Element2> {
132 let stride = std::mem::size_of::<D>();
133 let len = hbm.data().len() * stride;
134 log::debug!("read: addr=0x{:x}, len={len}", hbm.address());
135
136 let src = Buffer::npu(hbm.address(), len);
137 let dst = Buffer::cpu(len);
138 let count = hbm.data().len();
139 assert!(
140 unsafe { ffi::lib().furiosa_read(ffi::rt(), src.as_ptr(), dst.as_ptr()) } == 0,
141 "DMA read failed"
142 );
143 let ptr = dst.data_ptr() as *const D;
144 let elems: Vec<Opt<D>> = (0..count)
145 .map(|i| Opt::Init(unsafe { std::ptr::read(ptr.add(i)) }))
146 .collect();
147 HostTensor::from_buf(elems)
148 }
149
150 pub async fn run(&self, inputs: &[Buffer], outputs: &[Buffer]) {
152 log::debug!("run: inputs={}, outputs={}", inputs.len(), outputs.len());
153 let in_ptrs: Vec<*const ffi::Buffer> = inputs.iter().map(|b| b.as_ptr()).collect();
154 let out_ptrs: Vec<*const ffi::Buffer> = outputs.iter().map(|b| b.as_ptr()).collect();
155 assert!(
156 unsafe {
157 ffi::lib().furiosa_kernel_run(
158 self.ptr,
159 ffi::rt(),
160 in_ptrs.as_ptr(),
161 in_ptrs.len(),
162 out_ptrs.as_ptr(),
163 out_ptrs.len(),
164 )
165 } == 0,
166 "kernel execution failed"
167 );
168 }
169
170 pub fn alloc(&self, size: usize) -> Buffer {
172 let ptr = unsafe { ffi::lib().furiosa_buffer_npu(ffi::rt(), size) };
173 assert!(!ptr.is_null(), "failed to allocate buffer");
174 Buffer(ptr)
175 }
176}