Source code for furiosa_llm.artifact.types

import json
import logging
import os
from typing import Any, Dict, List, Optional, Tuple, Union

from pydantic import BaseModel, RootModel
from typing_extensions import Self

from furiosa_llm.models.metadata import LLMConfig, ModelMetadata

from ..models.config_types import (
    GeneratorConfig,
    ModelRewritingConfig,
    ParallelConfig,
    PipelineMetadata,
)

logger = logging.getLogger(__name__)


class ArtifactVersion(BaseModel):
    furiosa_llm: str
    furiosa_compiler: str


class ArtifactMetadata(BaseModel):
    artifact_id: str
    name: str
    timestamp: int
    version: ArtifactVersion
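

# A minimal sketch (not part of the library) of how the metadata models above
# compose. The artifact name, id, and version strings below are hypothetical
# placeholders, not values produced by a real build.
def _example_artifact_metadata() -> ArtifactMetadata:
    import time
    import uuid

    return ArtifactMetadata(
        artifact_id=str(uuid.uuid4()),
        name="example-artifact",  # hypothetical artifact name
        timestamp=int(time.time()),  # creation time as a Unix timestamp
        version=ArtifactVersion(
            furiosa_llm="x.y.z",  # hypothetical version strings
            furiosa_compiler="x.y.z",
        ),
    )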


class ModelMetadataForArtifact(ModelMetadata):
    """Child class of ModelMetadata used for loading artifacts.

    Unlike its parent, this class does not trigger any HuggingFace config or
    weight download to obtain the configs needed for artifact loading.
    """

    config_: Optional[Dict[str, Any]] = None
    model_qname_: Optional[str] = None

    def __init__(
        self,
        pretrained_id: str,
        task_type: Optional[str] = None,
        llm_config: LLMConfig = LLMConfig(),
        hf_configs: Dict = {},
        model_weight_path: Optional[os.PathLike] = None,
        trust_remote_code: Optional[bool] = None,
        config_: Optional[Dict[str, Any]] = None,
        model_qname_: Optional[str] = None,
    ):
        super(ModelMetadataForArtifact, self).__init__(
            pretrained_id,
            task_type,
            llm_config,
            hf_configs,
            model_weight_path,
            trust_remote_code,
        )
        self.config_ = config_
        self.model_qname_ = model_qname_

    @classmethod
    def from_metadata(
        cls,
        model_metadata: ModelMetadata,
        config: Optional[Dict[str, Any]] = None,
        model_qname: Optional[str] = None,
    ) -> Self:
        return cls(
            **model_metadata.model_dump(),
            config_=config,
            model_qname_=model_qname,
        )

    @property
    def config_dict(self) -> Dict[str, Any]:
        if self.config_ is None:
            return super().config_dict
        return self.config_

    @property
    def model_qname(self) -> str:
        if self.model_qname_ is None:
            return super().model_qname
        return self.model_qname_
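

# A minimal sketch (not part of the library) showing the intended use of
# ModelMetadataForArtifact.from_metadata when loading an artifact: wrap an
# existing ModelMetadata together with the HF config and qualified model name
# already stored in the artifact, so no HuggingFace download is needed. The
# config dict and qualified name below are hypothetical examples.
def _example_metadata_for_artifact(
    model_metadata: ModelMetadata,
) -> ModelMetadataForArtifact:
    stored_config = {"hidden_size": 4096, "num_hidden_layers": 32}  # hypothetical
    stored_qname = "transformers.LlamaForCausalLM"  # hypothetical qualified name
    return ModelMetadataForArtifact.from_metadata(
        model_metadata,
        config=stored_config,
        model_qname=stored_qname,
    )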


class Artifact(BaseModel):
    metadata: ArtifactMetadata

    devices: str
    generator_config: GeneratorConfig
    hf_config: Dict[str, Any]
    model_metadata: ModelMetadata
    model_rewriting_config: ModelRewritingConfig
    parallel_config: ParallelConfig

    pipelines: List[Dict[str, Any]] = []
    pipeline_metadata_list: Optional[List[PipelineMetadata]] = None

    # TODO: store this field somewhere else.
    max_prompt_len: Optional[int] = None

    def append_pipeline(self, pipeline_dict: Dict[str, Any]):
        self.pipelines.append(pipeline_dict)

    def export(self, path: Union[str, os.PathLike]):
        with open(path, "w") as f:
            f.write(RootModel[Artifact](self).model_dump_json(indent=2))

    @classmethod
    def load(cls, path: Union[str, os.PathLike]) -> "Artifact":
        try:
            with open(path) as f:
                o = json.load(f)
            return Artifact(**o)
        except Exception as e:
            logger.error(e)
            raise ValueError("Artifact schema mismatched.")
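

# A minimal sketch (not part of the library) of the Artifact export/load
# round trip, assuming an already-built Artifact instance. The output path and
# appended pipeline dict are hypothetical placeholders; load() raises
# ValueError when the JSON on disk does not match the Artifact schema.
def _example_artifact_round_trip(artifact: Artifact) -> Artifact:
    artifact.append_pipeline({"stage": "prefill"})  # hypothetical pipeline dict
    artifact.export("artifact.json")  # hypothetical path
    return Artifact.load("artifact.json")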


class RuntimeConfig(BaseModel):
    """
    * npu_queue_limit: Maximum number of tasks that can be queued in the hardware
    * max_processing_samples: Maximum number of samples that can be processed by the scheduler
    * spare_blocks_ratio: Ratio of spare blocks reserved by the scheduler. A smaller value forces the scheduler to use DRAM more aggressively
    * is_offline: If True, use strategies optimized for the offline scenario
    * paged_attention_num_blocks: The maximum number of blocks that each k/v storage per layer can store.
    * prefill_chunk_size: Prefill chunk size used for chunked prefill.
    """

    npu_queue_limit: int
    max_processing_samples: int
    spare_blocks_ratio: float
    is_offline: bool
    paged_attention_num_blocks: Optional[int] = None
    prefill_buckets: Optional[List[Tuple[int, int]]] = None
    decode_buckets: Optional[List[Tuple[int, int]]] = None
    prefill_chunk_size: Optional[int] = None

    def export(self, path: Union[str, os.PathLike]):
        with open(path, "w") as f:
            f.write(RootModel[RuntimeConfig](self).model_dump_json(indent=2))

    @classmethod
    def load(cls, path: Union[str, os.PathLike]) -> "RuntimeConfig":
        with open(path) as f:
            o = json.load(f)
        return RuntimeConfig(**o)
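

# A minimal sketch (not part of the library) of constructing a RuntimeConfig
# and round-tripping it through a JSON file. All numeric values and the path
# are hypothetical illustrations, not recommended defaults.
def _example_runtime_config() -> RuntimeConfig:
    config = RuntimeConfig(
        npu_queue_limit=2,  # hypothetical
        max_processing_samples=65536,  # hypothetical
        spare_blocks_ratio=0.2,  # hypothetical
        is_offline=False,
    )
    config.export("runtime_config.json")  # hypothetical path
    return RuntimeConfig.load("runtime_config.json")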