//! `furiosa_visa_std/runtime/backend.rs`

use furiosa_mapping::*;

use crate::memory_tensor::{Address, HbmTensor, HostTensor};
use crate::raw_tensor::*;
use crate::scalar::{Opt, Scalar};
use crate::tensor::Tensor;

use super::Kernel;
9
10/// Backend for tensor operations.
11///
12/// `Cpu`: interprets operations via mapping expressions (default).
13/// `Npu`: dispatches to device via PCIe DMA.
14pub trait Backend {
15    /// Create a tensor from a flat buffer.
16    fn from_buf<D: Scalar, Mapping: M>(data: Vec<Opt<D>>) -> Tensor<D, Mapping>;
17    /// Serialize a tensor to a flat buffer.
18    fn to_buf<D: Scalar, Mapping: M>(tensor: &Tensor<D, Mapping>) -> Vec<Opt<D>>;
19    /// Transfer host tensor to HBM.
20    fn to_hbm<D: Scalar, Element: M, Chip: M, Element2: M>(
21        host: &HostTensor<D, Element>,
22        address: Address,
23    ) -> impl std::future::Future<Output = HbmTensor<D, Chip, Element2>>;
24    /// Transfer HBM tensor to host.
25    fn to_host<D: Scalar, Chip: M, Element: M, Element2: M>(
26        hbm: &HbmTensor<D, Chip, Element>,
27    ) -> impl std::future::Future<Output = HostTensor<D, Element2>>;
28}
29
/// CPU backend: mapping-expression-based interpretation.
///
/// Zero-sized marker type; all behavior lives in its [`Backend`] impl.
#[derive(Debug)]
pub struct Cpu;
33
34impl Backend for Cpu {
35    fn from_buf<D: Scalar, Mapping: M>(data: Vec<Opt<D>>) -> Tensor<D, Mapping> {
36        let mut inner = RawTensor::from_elem::<Mapping>(Opt::Uninit);
37        let mapping = Mapping::to_value().factorize();
38        for (index, math_index) in Index::new().gen_indexes(mapping).into_iter().enumerate() {
39            inner.write_index(math_index, data[index]);
40        }
41        Tensor::from_raw(inner)
42    }
43
44    fn to_buf<D: Scalar, Mapping: M>(tensor: &Tensor<D, Mapping>) -> Vec<Opt<D>> {
45        let mapping = Mapping::to_value().factorize();
46        Index::new()
47            .gen_indexes(mapping)
48            .into_iter()
49            .map(|index| tensor.raw().read_index(index))
50            .collect()
51    }
52
53    async fn to_hbm<D: Scalar, Element: M, Chip: M, Element2: M>(
54        host: &HostTensor<D, Element>,
55        address: Address,
56    ) -> HbmTensor<D, Chip, Element2> {
57        HbmTensor::new(host.inner_tensor().transpose(true), address)
58    }
59
60    async fn to_host<D: Scalar, Chip: M, Element: M, Element2: M>(
61        hbm: &HbmTensor<D, Chip, Element>,
62    ) -> HostTensor<D, Element2> {
63        hbm.inner_tensor().transpose(true).into()
64    }
65}
66
/// NPU backend: PCIe DMA to/from the device.
///
/// Zero-sized marker type; all behavior lives in its [`Backend`] impl.
#[derive(Debug)]
pub struct Npu;
70
71impl Backend for Npu {
72    fn from_buf<D: Scalar, Mapping: M>(data: Vec<Opt<D>>) -> Tensor<D, Mapping> {
73        Tensor::from_raw(RawTensor::<D>::from_vec::<Mapping>(data))
74    }
75
76    fn to_buf<D: Scalar, Mapping: M>(tensor: &Tensor<D, Mapping>) -> Vec<Opt<D>> {
77        tensor.raw().data.iter().cloned().collect()
78    }
79
80    async fn to_hbm<D: Scalar, Element: M, Chip: M, Element2: M>(
81        host: &HostTensor<D, Element>,
82        address: Address,
83    ) -> HbmTensor<D, Chip, Element2> {
84        Kernel::write(host, address).await
85    }
86
87    async fn to_host<D: Scalar, Chip: M, Element: M, Element2: M>(
88        hbm: &HbmTensor<D, Chip, Element>,
89    ) -> HostTensor<D, Element2> {
90        Kernel::read(hbm).await
91    }
92}