Source code for furiosa_llm.artifact.types

import json
import logging
import os
from typing import Any, Dict, List, Optional, Sequence, Tuple, Union

from pydantic import BaseModel, RootModel

from ..models import ModelMetadata
from ..models.config_types import GeneratorConfig, ModelRewritingConfig, ParallelConfig

logger = logging.getLogger(__name__)


class ArtifactVersion(BaseModel):
    furiosa_llm: str
    furiosa_compiler: str

class ArtifactMetadata(BaseModel):
    artifact_id: str
    name: str
    timestamp: int
    version: ArtifactVersion

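A minimal construction sketch (all field values below are placeholders, not values the library prescribes): ArtifactMetadata embeds an ArtifactVersion, so both records are validated together by pydantic.

import time

from furiosa_llm.artifact.types import ArtifactMetadata, ArtifactVersion

# Placeholder id, name, and version strings; real values come from the build pipeline.
metadata = ArtifactMetadata(
    artifact_id="example-artifact-0001",
    name="example-model",
    timestamp=int(time.time()),
    version=ArtifactVersion(furiosa_llm="0.0.0", furiosa_compiler="0.0.0"),
)
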
class Artifact(BaseModel):
    metadata: ArtifactMetadata

    devices: str
    generator_config: GeneratorConfig
    hf_config: Dict[str, Any]
    model_metadata: ModelMetadata
    model_rewriting_config: ModelRewritingConfig
    parallel_config: ParallelConfig

    # pydantic copies this default per instance, so the list is not shared
    # across Artifact objects the way a plain class attribute would be.
    pipelines: List[Dict[str, Any]] = []

    def append_pipeline(self, pipeline_dict: Dict[str, Any]):
        self.pipelines.append(pipeline_dict)

    def export(self, path: Union[str, os.PathLike]):
        with open(path, "w") as f:
            f.write(RootModel[Artifact](self).model_dump_json(indent=2))

    @classmethod
    def load(cls, path: Union[str, os.PathLike]) -> "Artifact":
        try:
            with open(path) as f:
                o = json.load(f)
            return Artifact(**o)
        except Exception as e:
            logger.error(e)
            raise ValueError("Artifact schema mismatch.") from e

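As a usage sketch (the file paths and pipeline payload below are hypothetical), an Artifact round-trips through its JSON form: load validates the file against the schema above and raises ValueError on mismatch, while export writes the whole model back out as indented JSON.

from furiosa_llm.artifact.types import Artifact

# Hypothetical path to a previously exported artifact.
artifact = Artifact.load("artifact.json")

# The pipeline dict's shape is up to the producer; this key is illustrative.
artifact.append_pipeline({"name": "prefill-pipeline"})

# Re-export; RootModel serializes every field, including the new pipeline.
artifact.export("artifact-updated.json")
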
class RuntimeConfig(BaseModel):
    """
    * npu_queue_limit: Maximum number of tasks that can be queued in the hardware
    * max_processing_samples: Maximum number of samples that can be processed by the scheduler
    * spare_blocks_ratio: Ratio of spare blocks reserved by the scheduler. A smaller value
      forces the scheduler to use DRAM more aggressively
    * is_offline: If True, use strategies optimized for the offline scenario
    * paged_attention_num_blocks: Maximum number of blocks that each k/v storage per layer can store
    * prefill_chunk_size: Prefill chunk size used for chunked prefill
    """

    npu_queue_limit: int
    max_processing_samples: int
    spare_blocks_ratio: float
    is_offline: bool
    paged_attention_num_blocks: Optional[int] = None
    prefill_buckets: Optional[Sequence[Tuple[int, int]]] = None
    decode_buckets: Optional[Sequence[Tuple[int, int]]] = None
    prefill_chunk_size: Optional[int] = None

    def export(self, path: Union[str, os.PathLike]):
        with open(path, "w") as f:
            f.write(RootModel[RuntimeConfig](self).model_dump_json(indent=2))

    @classmethod
    def load(cls, path: Union[str, os.PathLike]) -> "RuntimeConfig":
        with open(path) as f:
            o = json.load(f)
        return RuntimeConfig(**o)
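
A round-trip sketch for RuntimeConfig (the numeric values are illustrative assumptions, not recommended settings): the optional bucket and chunk fields may be omitted and default to None.

from furiosa_llm.artifact.types import RuntimeConfig

# Illustrative values only; tune for the target deployment.
config = RuntimeConfig(
    npu_queue_limit=8,
    max_processing_samples=65536,
    spare_blocks_ratio=0.2,
    is_offline=True,
    prefill_chunk_size=512,
)

config.export("runtime_config.json")
restored = RuntimeConfig.load("runtime_config.json")
assert restored == config  # pydantic models compare by field values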