Source code for rktransformers.configuration

# Copyright 2025 Emmanuel Cortes. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from dataclasses import dataclass, field, fields
from typing import Any

from rktransformers.constants import (
    DEFAULT_BATCH_SIZE,
    DEFAULT_MAX_SEQ_LENGTH,
    DEFAULT_OPSET,
    OpsetType,
    OptimizationLevelType,
    PlatformType,
    QuantizedAlgorithmType,
    QuantizedDtypeType,
    QuantizedMethodType,
    SupportedTaskType,
)
from rktransformers.utils.env_utils import get_rktransformers_version


@dataclass
class QuantizationConfig:
    """
    Configuration for RKNN quantization.

    Quantization reduces model size and improves inference speed by converting weights and
    activations to lower precision (e.g., int8 instead of float32).

    Args:
        do_quantization: Enable quantization during build. Requires a calibration dataset.
        dataset_name: HuggingFace dataset name for quantization calibration (e.g., 'wikitext').
            Auto-detected: Not auto-detected, must be provided if do_quantization=True.
        dataset_subset: Subset name for the dataset (e.g., 'ax' for 'nyu-mll/glue').
        dataset_size: Number of samples to use from the dataset for calibration.
            Recommendation: 100-500 samples is usually sufficient.
        dataset_split: Dataset splits to use (e.g., ["train", "validation"]).
            Auto-detected: Uses ["train", "validation", "test"] if not specified.
        dataset_columns: List of dataset columns to use for calibration (e.g., ["question", "context"]).
            If not specified, falls back to auto-detection.
        quantized_dtype: Quantization data type. Options:
            - "w8a8" (8-bit weights and activations)
            - "w8a16" (8-bit weights, 16-bit activations)
            - "w16a16i" (16-bit weights, 16-bit activations, int8)
            - "w16a16i_dfp" (16-bit weights, 16-bit activations, float)
            - "w4a16" (4-bit weights, 16-bit activations)
            Recommendation: "w8a8" for best performance, "w16a16i" for better accuracy.
        quantized_algorithm: Quantization calibration algorithm. Options:
            - "normal" (normal quantization)
            - "mmse" (minimum mean square error)
            - "kl_divergence" (Kullback-Leibler divergence)
            - "gdq" (gradient descent quantization)
            Recommendation: "normal" is fastest, "kl_divergence" may provide better accuracy.
        quantized_method: Quantization granularity. Options:
            - "channel" (per-channel)
            - "layer" (per-layer)
            - "group{SIZE}" (group quantization), where SIZE is a multiple of 32 between 32 and 256.
            Recommendation: "channel" provides better accuracy.
        quantized_hybrid_level: Hybrid quantization level (0-3). Higher values keep more layers
            in float for better accuracy but a larger model size.
        quant_img_RGB2BGR: Convert RGB to BGR during quantization (for image models only).
        auto_hybrid_cos_thresh: Cosine distance threshold for automatic hybrid quantization.
            Default: 0.98. Used when auto_hybrid is enabled in build.
        auto_hybrid_euc_thresh: Euclidean distance threshold for automatic hybrid quantization.
            Default: None. Used when auto_hybrid is enabled in build.
    """  # noqa: E501

    do_quantization: bool = False
    dataset_name: str | None = None
    dataset_subset: str | None = None
    dataset_size: int = 128
    dataset_split: list[str] | None = None
    dataset_columns: list[str] | None = None
    quantized_dtype: QuantizedDtypeType = "w8a8"
    quantized_algorithm: QuantizedAlgorithmType = "normal"
    quantized_method: QuantizedMethodType = "channel"
    quantized_hybrid_level: int = 0
    quant_img_RGB2BGR: bool = False
    auto_hybrid_cos_thresh: float = 0.98
    auto_hybrid_euc_thresh: float | None = None

    # Validation
    def __post_init__(self):
        if self.do_quantization and not self.dataset_name:
            # It's possible to provide a custom dataset file path instead of a HF dataset name,
            # but for this CLI we primarily support HF datasets or pre-processed files.
            pass

    def to_dict(self) -> dict[str, Any]:
        """Convert to dictionary for serialization."""
        return {
            "do_quantization": self.do_quantization,
            "dataset_name": self.dataset_name,
            "dataset_subset": self.dataset_subset,
            "dataset_size": self.dataset_size,
            "dataset_split": self.dataset_split,
            "dataset_columns": self.dataset_columns,
            "quantized_dtype": self.quantized_dtype,
            "quantized_algorithm": self.quantized_algorithm,
            "quantized_method": self.quantized_method,
            "quantized_hybrid_level": self.quantized_hybrid_level,
            "quant_img_RGB2BGR": self.quant_img_RGB2BGR,
            "auto_hybrid_cos_thresh": self.auto_hybrid_cos_thresh,
            "auto_hybrid_euc_thresh": self.auto_hybrid_euc_thresh,
        }
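

# Illustrative usage sketch (not part of the library's public API): build a QuantizationConfig
# for int8 ("w8a8") calibration with a small HuggingFace dataset. The dataset name, subset, and
# column below are assumptions chosen for demonstration only.
def _example_quantization_config() -> QuantizationConfig:
    return QuantizationConfig(
        do_quantization=True,
        dataset_name="wikitext",  # hypothetical calibration dataset
        dataset_subset="wikitext-2-raw-v1",  # hypothetical subset name
        dataset_columns=["text"],  # hypothetical column; omit to use auto-detection
        dataset_size=128,  # 100-500 samples is usually sufficient
        quantized_dtype="w8a8",  # 8-bit weights and activations
        quantized_algorithm="normal",  # fastest calibration algorithm
        quantized_method="channel",  # per-channel generally preserves accuracy
    )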


@dataclass
class OptimizationConfig:
    """
    Configuration for RKNN graph optimization.

    These optimizations transform the model graph for better performance on the NPU.

    Args:
        optimization_level: Graph optimization level (0-3).
            - 0: No optimization
            - 1: Basic optimization
            - 2: Moderate optimization
            - 3: Aggressive optimization (recommended)
            Recommendation: Use 3 for best performance.
        enable_flash_attention: Enable Flash Attention optimization for transformer models.
            Significantly improves attention layer performance.
            Recommendation: Enable for transformer/BERT models.
        remove_weight: Remove weights from the model (for weight-sharing scenarios).
        compress_weight: Compress model weights to reduce model size.
        remove_reshape: Remove redundant reshape operations.
        sparse_infer: Enable sparse inference optimization.
        model_pruning: Enable model pruning to remove redundant connections.
    """

    optimization_level: OptimizationLevelType = 0
    enable_flash_attention: bool = False
    remove_weight: bool = False
    compress_weight: bool = False
    remove_reshape: bool = False
    sparse_infer: bool = False
    model_pruning: bool = False

    def to_dict(self) -> dict[str, Any]:
        """Convert to dictionary for serialization."""
        return {
            "optimization_level": self.optimization_level,
            "enable_flash_attention": self.enable_flash_attention,
            "remove_weight": self.remove_weight,
            "compress_weight": self.compress_weight,
            "remove_reshape": self.remove_reshape,
            "sparse_infer": self.sparse_infer,
            "model_pruning": self.model_pruning,
        }
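

# Illustrative usage sketch (not part of the library's public API): an OptimizationConfig tuned
# for transformer encoders, following the recommendations in the docstring above. This specific
# combination of flags is an assumption for demonstration, not a required setting.
def _example_optimization_config() -> OptimizationConfig:
    return OptimizationConfig(
        optimization_level=3,  # aggressive graph optimization
        enable_flash_attention=True,  # recommended for transformer/BERT models
        compress_weight=True,  # reduce on-disk model size
    )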


@dataclass
class RKNNConfig:
    """
    Configuration for converting ONNX models to RKNN format for Rockchip NPUs.

    Args:
        target_platform: Target Rockchip platform. Options:
            - "rk3588"
            - "rk3576"
            - "rk3568"
            - "rk3566"
            - "rk3562"
            Auto-detected: Not auto-detected, defaults to "rk3588".
        quantization: Quantization configuration (see :class:`QuantizationConfig`).
        optimization: Optimization configuration (see :class:`OptimizationConfig`).
        model_input_names: Names of model inputs (e.g., ["input_ids", "attention_mask", "token_type_ids"]).
            Auto-detected: Optimum automatically determines required inputs during ONNX export
            based on the model's architecture.
            - BERT models that use segment embeddings: token_type_ids is included automatically
            - RoBERTa and sentence transformers: token_type_ids is excluded automatically
            Note: This parameter is primarily used for RKNN conversion. The ONNX export process
            inspects the exported model to determine the actual inputs.
        type_vocab_size: Token type vocabulary size (informational, from the model's config.json).
            Auto-detected: Read from the model's config.json.
        batch_size: Batch size for input shapes during ONNX export and RKNN conversion
            (e.g., [batch_size, max_seq_length]). This controls the shape of inputs, not which
            inputs are included. Example: batch_size=1 creates inputs shaped [1, 128],
            batch_size=4 creates [4, 128].
        max_seq_length: Maximum sequence length for input shapes during ONNX export and RKNN conversion.
            Auto-detected: Read from the model's config.json (max_position_embeddings).
            Falls back to 512 if not found in the config.
            Note: a large sequence length can cause the RKNN export to crash with a segmentation fault.
        float_dtype: Floating point data type for non-quantized operations. Options:
            - "float16"
        mean_values: Mean values for input normalization (for image models).
        std_values: Standard deviation values for input normalization (for image models).
        custom_string: Custom configuration string passed to the RKNN toolkit.
        inputs_yuv_fmt: YUV format for inputs (for image models).
        single_core_mode: Run the model on a single NPU core instead of multi-core.
            Only applicable for rk3588. Reduces model size.
        dynamic_input: Dynamic input shapes configuration. Experimental feature with sparse
            support on Rockchip NPUs.
            Format: [[[batch_size, seq_length], [batch_size, seq_length], ...], ...].
            E.g.: [[[1, 128], [1, 128], ...], [[1, 256], [1, 256], ...], ...].
        op_target: Specify the target device for specific operations. Useful for offloading
            operations that are not supported by the NPU to the CPU.
            Format: {'op_id': 'cpu', 'op_id3': 'cpu'}. Default is None.
            Example::

                unsupported_used = [
                    (node.op_type, node.name)
                    for node in model.graph.node
                    if node.op_type in unsupported
                ]
                op_target = {n: "cpu" for _, n in unsupported_used}

        model_name_or_path: Path to the input ONNX model file or a Hugging Face model ID.
        output_path: Path for the output RKNN model file or directory. Optional. Defaults to the
            model's parent directory (for local files) or the current directory (for Hub models).
        push_to_hub: Upload the exported model to the HuggingFace Hub.
        hub_model_id: HuggingFace Hub repository ID (required if push_to_hub=True). Should include
            the username/namespace (e.g., "username/model-name"). If no namespace is provided
            (e.g., "model-name"), the username is auto-detected from the token via the whoami() API.
        hub_token: HuggingFace Hub authentication token.
        hub_private_repo: Create a private repository on the HuggingFace Hub.
        opset: ONNX opset version. Minimum: 14 (required for SDPA). Maximum: 19
            (the maximum supported by RKNN). Default: 19.
        task: Task type for export (default: "auto").
            - 'auto': Uses optimum to detect the task based on the model architecture.
            - Can also be set explicitly to export models supported by optimum but not by the
              rk-transformers runtime, in which case the user is responsible for writing
              inference code using the rknn-toolkit-lite2 library or subclassing
              `rktransformers.RKModel`.
            - ForSequenceClassification -> sequence-classification
            - ForMaskedLM -> fill-mask
            - ForQuestionAnswering -> question-answering
            - ForTokenClassification -> token-classification
            - ForMultipleChoice -> multiple-choice
            - ForFeatureExtraction -> feature-extraction
            - Fallback: feature-extraction for plain XYZModel architectures
        task_kwargs: Task-specific keyword arguments for ONNX export (dict[str, Any]).
            Example: For multiple-choice tasks, use {"num_choices": 4}. These kwargs are passed
            directly to optimum's main_export function.
    """  # noqa: E501

    target_platform: PlatformType = "rk3588"
    quantization: QuantizationConfig = field(default_factory=QuantizationConfig)
    optimization: OptimizationConfig = field(default_factory=OptimizationConfig)

    # Model input configuration
    model_input_names: list[str] | None = None
    type_vocab_size: int | None = None

    # Model dimensions
    batch_size: int = DEFAULT_BATCH_SIZE
    max_seq_length: int | None = DEFAULT_MAX_SEQ_LENGTH
    task_kwargs: dict[str, Any] | None = None

    # RKNN-specific parameters
    float_dtype: str = "float16"
    mean_values: list[list[float]] | None = None
    std_values: list[list[float]] | None = None
    custom_string: str | None = None
    inputs_yuv_fmt: list[str] | None = None
    single_core_mode: bool = False
    dynamic_input: list[list[list[int]]] | None = None
    op_target: dict[str, str] | None = None

    # Export settings
    model_name_or_path: str | None = None
    output_path: str | None = None
    push_to_hub: bool = False
    hub_model_id: str | None = None
    hub_token: str | None = None
    hub_private_repo: bool = False
    hub_create_pr: bool = False

    # Optimum export settings
    opset: OpsetType | None = DEFAULT_OPSET
    task: SupportedTaskType = "auto"

    def to_dict(self) -> dict[str, Any]:
        """
        Convert the config to a dictionary for RKNN.config().

        This includes only parameters relevant to RKNN.config(); otherwise RKNN will raise errors.
        """
        config_dict = {
            "target_platform": self.target_platform,
            "mean_values": self.mean_values,
            "std_values": self.std_values,
            # batch_size and max_seq_length are not direct RKNN.config args but are used in build/load
            "quantized_dtype": self.quantization.quantized_dtype,
            "quantized_algorithm": self.quantization.quantized_algorithm,
            "quantized_method": self.quantization.quantized_method,
            "quantized_hybrid_level": self.quantization.quantized_hybrid_level,
            "quant_img_RGB2BGR": self.quantization.quant_img_RGB2BGR,
            "float_dtype": self.float_dtype,
            "optimization_level": self.optimization.optimization_level,
            "custom_string": self.custom_string,
            "remove_weight": self.optimization.remove_weight,
            "compress_weight": self.optimization.compress_weight,
            "inputs_yuv_fmt": self.inputs_yuv_fmt,
            "single_core_mode": self.single_core_mode,
            "dynamic_input": self.dynamic_input,
            "model_pruning": self.optimization.model_pruning,
            "remove_reshape": self.optimization.remove_reshape,
            "sparse_infer": self.optimization.sparse_infer,
            "enable_flash_attention": self.optimization.enable_flash_attention,
            "op_target": self.op_target,
            "auto_hybrid_cos_thresh": self.quantization.auto_hybrid_cos_thresh,
            "auto_hybrid_euc_thresh": self.quantization.auto_hybrid_euc_thresh,
        }
        # Filter out None values to let RKNN use defaults
        return {k: v for k, v in config_dict.items() if v is not None}

    def to_export_dict(self) -> dict[str, Any]:
        """
        Convert the complete config to a dictionary for export/persistence in config.json ("rknn" key).

        This includes ALL configuration parameters for reproducibility.
        """
        export_dict = {
            # rktransformers configuration
            "rktransformers_version": get_rktransformers_version(),
            # Model configuration
            "model_input_names": self.model_input_names,
            "batch_size": self.batch_size,
            "max_seq_length": self.max_seq_length,
            "task_kwargs": self.task_kwargs,
            "float_dtype": self.float_dtype,
            # Platform configuration
            "target_platform": self.target_platform,
            "single_core_mode": self.single_core_mode,
            # Other RKNN parameters
            "mean_values": self.mean_values,
            "std_values": self.std_values,
            "custom_string": self.custom_string,
            "inputs_yuv_fmt": self.inputs_yuv_fmt,
            "dynamic_input": self.dynamic_input,
            # Optimum export settings
            "opset": self.opset,
            "task": self.task,
            # Nested configurations
            "quantization": self.quantization.to_dict(),
            "optimization": self.optimization.to_dict(),
        }
        return export_dict

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> "RKNNConfig":
        """Load configuration from a dictionary."""
        # Copy the top-level dict so replacing nested configs does not modify the caller's input
        data = data.copy()
        if "quantization" in data and isinstance(data["quantization"], dict):
            data["quantization"] = QuantizationConfig(**data["quantization"])
        if "optimization" in data and isinstance(data["optimization"], dict):
            data["optimization"] = OptimizationConfig(**data["optimization"])

        # Filter arguments to only those accepted by __init__
        valid_fields = {f.name for f in fields(cls)}
        init_args = {k: v for k, v in data.items() if k in valid_fields}
        return cls(**init_args)
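

# Illustrative usage sketch (not part of the library's public API): assemble an RKNNConfig,
# serialize it with to_export_dict() (the form persisted under the "rknn" key in config.json),
# and restore it with from_dict(). The platform, shapes, and optimization flags below are
# assumptions chosen for demonstration only.
def _example_rknn_config_roundtrip() -> RKNNConfig:
    config = RKNNConfig(
        target_platform="rk3588",
        batch_size=1,
        max_seq_length=128,
        optimization=OptimizationConfig(optimization_level=3, enable_flash_attention=True),
    )
    exported = config.to_export_dict()  # plain dict, ready to embed in config.json
    restored = RKNNConfig.from_dict(exported)  # nested dicts become dataclasses again
    assert restored.target_platform == config.target_platform
    assert restored.optimization.enable_flash_attention is True
    return restored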