Add model
- README.md +199 -0
- acip_model.py +179 -0
- config.json +99 -0
- generation_config.json +4 -0
- model-00001-of-00005.safetensors +3 -0
- model-00002-of-00005.safetensors +3 -0
- model-00003-of-00005.safetensors +3 -0
- model-00004-of-00005.safetensors +3 -0
- model-00005-of-00005.safetensors +3 -0
- model.safetensors.index.json +0 -0
- parametrized_layer.py +211 -0
- parametrized_model.py +747 -0
- projected_layer.py +308 -0
- utils.py +83 -0
README.md
ADDED
@@ -0,0 +1,199 @@
---
library_name: transformers
tags: []
---

# Model Card for Model ID

<!-- Provide a quick summary of what the model is/does. -->


## Model Details

### Model Description

<!-- Provide a longer summary of what this model is. -->

This is the model card of a 🤗 transformers model that has been pushed on the Hub. This model card has been automatically generated.

- **Developed by:** [More Information Needed]
- **Funded by [optional]:** [More Information Needed]
- **Shared by [optional]:** [More Information Needed]
- **Model type:** [More Information Needed]
- **Language(s) (NLP):** [More Information Needed]
- **License:** [More Information Needed]
- **Finetuned from model [optional]:** [More Information Needed]

### Model Sources [optional]

<!-- Provide the basic links for the model. -->

- **Repository:** [More Information Needed]
- **Paper [optional]:** [More Information Needed]
- **Demo [optional]:** [More Information Needed]

## Uses

<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->

### Direct Use

<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->

[More Information Needed]

### Downstream Use [optional]

<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->

[More Information Needed]

### Out-of-Scope Use

<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->

[More Information Needed]

## Bias, Risks, and Limitations

<!-- This section is meant to convey both technical and sociotechnical limitations. -->

[More Information Needed]

### Recommendations

<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->

Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.

## How to Get Started with the Model

Use the code below to get started with the model.

[More Information Needed]
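
A minimal quick-start sketch, assuming this repo's `config.json` (its `auto_map` routes `AutoModel` to `acip_model.ACIPModel`); the repo id is a placeholder and the compression ratio is an arbitrary example value:

```python
from transformers import AutoModel, AutoTokenizer

# trust_remote_code is required because ACIPModel is a custom model class
# shipped with this repo (see acip_model.py and auto_map in config.json).
model = AutoModel.from_pretrained("<repo_id>", trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained("jeffwan/llama-7b-hf")  # tokenizer of the base model

# Prune the parametrized layers by their ACIP scores, here to roughly 60%
# of the original size (see ACIPModel.prune_model_by_score).
model.prune_model_by_score(compression_ratio=0.6)

inputs = tokenizer("Hello, my name is", return_tensors="pt")
print(tokenizer.decode(model.generate(**inputs, max_new_tokens=20)[0]))
```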

## Training Details

### Training Data

<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->

[More Information Needed]

### Training Procedure

<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->

#### Preprocessing [optional]

[More Information Needed]

#### Training Hyperparameters

- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->

#### Speeds, Sizes, Times [optional]

<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->

[More Information Needed]

## Evaluation

<!-- This section describes the evaluation protocols and provides the results. -->

### Testing Data, Factors & Metrics

#### Testing Data

<!-- This should link to a Dataset Card if possible. -->

[More Information Needed]

#### Factors

<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->

[More Information Needed]

#### Metrics

<!-- These are the evaluation metrics being used, ideally with a description of why. -->

[More Information Needed]

### Results

[More Information Needed]

#### Summary


## Model Examination [optional]

<!-- Relevant interpretability work for the model goes here -->

[More Information Needed]

## Environmental Impact

<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->

Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).

- **Hardware Type:** [More Information Needed]
- **Hours used:** [More Information Needed]
- **Cloud Provider:** [More Information Needed]
- **Compute Region:** [More Information Needed]
- **Carbon Emitted:** [More Information Needed]

## Technical Specifications [optional]

### Model Architecture and Objective

[More Information Needed]

### Compute Infrastructure

[More Information Needed]

#### Hardware

[More Information Needed]

#### Software

[More Information Needed]

## Citation [optional]

<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->

**BibTeX:**

[More Information Needed]

**APA:**

[More Information Needed]

## Glossary [optional]

<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->

[More Information Needed]

## More Information [optional]

[More Information Needed]

## Model Card Authors [optional]

[More Information Needed]

## Model Card Contact

[More Information Needed]
acip_model.py
ADDED
@@ -0,0 +1,179 @@
from typing import Any

import torch
from transformers import PreTrainedModel

from .parametrized_model import ParametrizedModel, ParametrizedModelConfig


class ACIPModelConfig(ParametrizedModelConfig):
    """
    Configuration for `ACIPModel`. Same functionality as `ParametrizedModelConfig`.

    See Also:
        - `ParametrizedModelConfig`
        - `ACIPModel`
    """

    model_type = "acip_model"


class ACIPModel(ParametrizedModel):
    """
    This class extends `ParametrizedModel` by additional functionality required for ACIP.
    It manages a `score_map` that stores the scores of the parametrized modules' target parameters,
    which are updated during tuning by the ACIP method.
    Moreover, it provides `prune_model_by_score`, which prunes the target parameters of the model according to
    their scores to achieve any given compression ratio.

    Notes: The `score_map` is managed in float32 internally because a lower precision may lead to unexpected
        numerical inaccuracies in the resulting parameter ranking. Fortunately, its memory consumption is
        negligible compared to the model weights themselves.

    See Also:
        - `ParametrizedModel`
        - `ACIPModelConfig`
    """

    config_class = ACIPModelConfig

    def __init__(self, config: ACIPModelConfig, base_model: PreTrainedModel | None = None, **_: Any):
        super().__init__(config, base_model)
        self.config = config  # redundant but enables type hinting for ACIPModelConfig

        self._score_map: dict[str, torch.Tensor] | None = None
        # Register and initialize score map buffers.
        # Important: don't run _update_score_map here because load_state_dict might still override the buffers.
        self._init_score_map_buffers()

    def _init_score_map_buffers(self):
        """
        Register and initialize score map buffers in parametrized modules (initialized with ones).
        Each target parameter "p_name" is associated with a buffer "p_name_score" that stores its score vector.
        """
        for m_name, module in self.parametrized_modules.items():
            for p_name, param in module.parametrization.get_target_params().items():
                module.parametrization.register_buffer(p_name + "_score", torch.ones_like(param.data).float())

    def _update_score_map(self):
        """Render `score_map` from the parametrized modules' score buffers."""
        self._score_map = {}
        for m_name, module in self.parametrized_modules.items():
            for p_name in module.parametrization.get_target_params().keys():
                self._score_map[f"{m_name}.parametrization.{p_name}"] = module.parametrization.get_buffer(
                    p_name + "_score"
                )

    @property
    def score_map(self) -> dict[str, torch.Tensor]:
        """Returns the score map as a Tensor dictionary whose keys match those of `self.get_target_params`."""
        if self._score_map is None:
            self._update_score_map()
        return self._score_map

    @score_map.setter
    def score_map(self, score_map: dict[str, torch.Tensor]) -> None:
        """
        Updates `score_map` and the corresponding parametrized modules' score buffers.

        Args:
            score_map: Dictionary whose keys should match (a subset of) `self.get_target_params`.
        """
        if self._score_map is None:
            self._update_score_map()
        # score_map.keys() can be a subset of self.get_target_params().keys()
        for p_name, score in score_map.items():
            buffer = self.model.get_buffer(p_name + "_score")
            if buffer.shape != score.shape:
                raise ValueError(
                    f"Score map for '{p_name}' has incorrect shape: expected {buffer.shape}, got {score.shape}"
                )
            # Cast to float32 to avoid numerical instabilities
            buffer.copy_(score.detach().float())
            self._score_map[p_name] = buffer

    def _predict_compression_ratio_by_score(self, k: int, full: bool = False) -> tuple[float, dict[str, torch.Tensor]]:
        """
        Helper function that checks what would happen if the k smallest target parameters were pruned
        according to the global score map ranking. It returns the resulting compression ratio
        and the corresponding parameter masks.

        Args:
            k: Number of target parameters to prune.
            full: Whether to count the number of parameters of the entire model or only the parametrized modules.
                See also `ParametrizedModel.get_num_params`.

        Returns: Tuple of compression ratio and parameter masks. The masks indicate which parameters to keep.
        """
        # Find the threshold value for the k smallest entries according to the global score map ranking.
        score_map_cat = torch.cat([param.flatten() for param in self.score_map.values()])
        threshold = torch.kthvalue(score_map_cat, k).values.item()

        # Create a set of parameter masks marking which values to keep.
        param_masks = {}
        for p_name, score in self.score_map.items():
            param_masks[p_name] = (score > threshold).to(dtype=score.dtype)

        # Compute the hypothetical compression ratio if param_masks were used as masks for the target parameters.
        compression_ratio = self.get_compression_ratio(full=full, target_params=param_masks)
        return compression_ratio, param_masks

    def _get_param_masks(self, compression_ratio: float, full: bool = False) -> dict[str, torch.Tensor]:
        """
        Helper function that determines which parameters to keep to reach a target compression ratio.
        Instead of looping over `k -> _predict_compression_ratio_by_score(k)`, a binary search can be used because
        the compression ratio is monotonically decreasing in k.

        Args:
            compression_ratio: Target compression ratio.
            full: Whether to count the number of parameters of the entire model or only the parametrized modules.
                See also `ParametrizedModel.get_num_params`.

        Returns: Parameter masks indicating which parameters to keep to reach the target compression ratio.
        """
        if compression_ratio == 1.0:
            return {p_name: torch.ones_like(score) for p_name, score in self.score_map.items()}

        # Perform a binary search for the largest k such that the compression ratio stays above compression_ratio.
        # Here, k_lo and k_hi are the lower and upper bounds of the search interval.
        k_lo, k_hi = 1, sum(score.numel() for score in self.score_map.values())
        while k_lo < k_hi:
            k_mid = (k_lo + k_hi + 1) // 2  # round up so that k_mid > k_lo and the search always makes progress
            ratio, _ = self._predict_compression_ratio_by_score(k=k_mid, full=full)
            if ratio > compression_ratio:
                k_lo = k_mid
            else:
                k_hi = k_mid - 1
        k = k_lo
        # TODO: handle tie-breaks
        return self._predict_compression_ratio_by_score(k=k, full=full)[1]

    def prune_model_by_score(self, compression_ratio: float, full: bool = False) -> None:
        """
        This method prunes the target parameters of the model according to their scores to achieve
        a given compression ratio.

        This can be efficiently implemented by a simple binary search strategy:
        we find the largest number of parameters to prune according to the score map ranking
        such that the resulting compression ratio is still at least the target `compression_ratio`.

        Args:
            compression_ratio: The target compression ratio.
            full: Whether to count the number of parameters of the entire model or only the parametrized modules.
                See also `ParametrizedModel.get_num_params`.
        """
        param_masks = self._get_param_masks(compression_ratio=compression_ratio, full=full)

        # Reset the target parameters according to the parameter masks
        for p_name, param in self.get_target_params().items():
            param.data[param_masks[p_name] > 0.0] = 1.0  # dummy value, will be rescaled by reset_target_params
            param.data[param_masks[p_name] == 0.0] = 0.0
        for m_name, module in self.parametrized_modules.items():
            if any(p_name.startswith(m_name) for p_name in param_masks.keys()):
                module.parametrization.reset_target_params(mode="nonzero")


# Register ACIPModelConfig and ACIPModel for AutoModel.
# Required to push a custom model to the Huggingface Hub (see https://huggingface.co/docs/transformers/en/custom_models).
ACIPModelConfig.register_for_auto_class()
ACIPModel.register_for_auto_class("AutoModel")
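
A hedged usage sketch of the API defined above (not part of the repo; the repo id is a placeholder, and `compression_ratio` follows `prune_model_by_score`, where 1.0 means no compression):

```python
from transformers import AutoModel

model = AutoModel.from_pretrained("<repo_id>", trust_remote_code=True)

# Inspect the learned importance scores: one float32 vector per target parameter.
for p_name, score in list(model.score_map.items())[:3]:
    print(p_name, tuple(score.shape))

# Rank all target parameters globally by score and mask the least important
# ones until the requested compression ratio is (just) reached.
model.prune_model_by_score(compression_ratio=0.5)
print(model.get_compression_ratio())  # expected to be close to 0.5
```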
config.json
ADDED
@@ -0,0 +1,99 @@
{
  "_name_or_path": "/rwthfs/rz/cluster/home/cp343770/p-res-mwl-llmcompression/artifacts/runs/paper_v3/compress__llama1_7b/model",
  "adapter_config": {
    "peft_config": {
      "default": {
        "alpha_pattern": {},
        "auto_mapping": null,
        "base_model_name_or_path": "jeffwan/llama-7b-hf",
        "bias": "none",
        "eva_config": null,
        "exclude_modules": [
          "base",
          "parametrization",
          "ortho"
        ],
        "fan_in_fan_out": false,
        "inference_mode": false,
        "init_lora_weights": true,
        "layer_replication": null,
        "layers_pattern": null,
        "layers_to_transform": null,
        "loftq_config": {},
        "lora_alpha": 16,
        "lora_bias": false,
        "lora_dropout": 0.05,
        "megatron_config": null,
        "megatron_core": "megatron.core",
        "modules_to_save": null,
        "peft_type": "LORA",
        "r": 32,
        "rank_pattern": {},
        "revision": null,
        "target_modules": [
          "q_proj",
          "base",
          "o_proj",
          "gate_proj",
          "down_proj",
          "up_proj",
          "v_proj",
          "k_proj",
          "ortho"
        ],
        "task_type": "CAUSAL_LM",
        "use_dora": false,
        "use_rslora": false
      }
    }
  },
  "architectures": [
    "ACIPModel"
  ],
  "auto_map": {
    "AutoConfig": "acip_model.ACIPModelConfig",
    "AutoModel": "acip_model.ACIPModel"
  },
  "base_model_config": {
    "pretrained_config": null,
    "pretrained_model_cls": "transformers.models.auto.modeling_auto.AutoModelForCausalLM",
    "pretrained_model_kwargs": {
      "pretrained_model_name_or_path": "jeffwan/llama-7b-hf",
      "torch_dtype": "bfloat16"
    }
  },
  "model_mode": "train",
  "model_type": "acip_model",
  "parametrization_config": {
    "exclude_modules": null,
    "module_factory_cls": "svd",
    "module_factory_kwargs": {
      "mask_func": "ste",
      "mask_scaling_factor": 0.02
    },
    "target_modules": [
      "k_proj",
      "down_proj",
      "o_proj",
      "v_proj",
      "gate_proj",
      "q_proj",
      "up_proj"
    ]
  },
  "torch_dtype": "bfloat16",
  "transformers_version": "4.46.3",
  "weight_quantization_config": {
    "exclude_modules": null,
    "module_factory_cls": "bitsandbytes.nn.Linear4bit",
    "module_factory_kwargs": {
      "compute_dtype": "torch.bfloat16",
      "quant_type": "fp4"
    },
    "target_modules": [
      "ortho",
      "base",
      "base_layer"
    ]
  }
}
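
A hedged sketch of how this nested config round-trips through the custom `from_dict`/`to_dict` logic in parametrized_model.py (the repo id is a placeholder):

```python
from transformers import AutoConfig

# auto_map resolves AutoConfig to acip_model.ACIPModelConfig, hence trust_remote_code.
config = AutoConfig.from_pretrained("<repo_id>", trust_remote_code=True)

# The nested JSON objects above are deserialized into the dataclasses from parametrized_model.py:
print(config.parametrization_config.module_factory_cls)  # "svd"
print(config.parametrization_config.target_modules)      # ["k_proj", "down_proj", ...]
print(config.weight_quantization_config.module_factory_kwargs["compute_dtype"])  # torch.bfloat16
```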
generation_config.json
ADDED
@@ -0,0 +1,4 @@
{
  "_from_model_config": true,
  "transformers_version": "4.46.3"
}
model-00001-of-00005.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:0ef5fdccfa838748228160684b44f4c37a99a944cd330ae6af63d94188e9c8e3
size 4979348184
model-00002-of-00005.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:d6182e14726f23f7f25faa53bab676e79acf5d3c3197624a2d15f7d458755db6
size 4989605584
model-00003-of-00005.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:dda70e06b09128d1b3309f98651a4fe9fb111bc360716f3d5e10e327a606a847
size 5000010480
model-00004-of-00005.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:fbf327dd596abbea5916f36ae9b76eea6f45440e8c1fce3430fa8feaa153e203
size 4910324992
model-00005-of-00005.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:392b87946099ac53941f45818bb40be4af393d694f23bb7b7465674ba3503104
size 1281214432
model.safetensors.index.json
ADDED
The diff for this file is too large to render.
parametrized_layer.py
ADDED
@@ -0,0 +1,211 @@
from abc import ABC, abstractmethod
from typing import ClassVar, Literal, Protocol, runtime_checkable, Type

import torch
from torch import nn


class Parametrization(nn.Module, ABC):
    """
    Abstract base class for parametrizations.
    A parametrization can be injected into any torch module of type `base_class` by `parametrize_module`.
    A parametrized module will follow the `ParametrizedModule` interface.

    This will overload the weight, bias, and forward of the module so that they play together with
    the parametrization. The external behavior of the parametrized module remains unchanged; for instance,
    a parametrized `Linear` module will still work as expected.

    Attributes:
        base_class: The base class of the module that can be parametrized.
        initialized: A flag that indicates whether the parametrization has been initialized.
    """

    initialized: bool = False
    base_class: ClassVar[Type[nn.Module]]

    def initialize(self, base_module: "Parametrization.base_class") -> None:
        self._initialize(base_module)
        self.initialized = True

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Compute the forward pass of the parametrization.
        This is particularly important when a standard forward pass based on `weight` would be inefficient.
        """
        assert self.initialized
        x = self._forward(x)
        return x

    @property
    def weight(self) -> torch.Tensor:
        """Compute the weight tensor of the parametrization."""
        return self._weight()

    @property
    def bias(self) -> torch.Tensor | None:
        """Compute the bias tensor of the parametrization."""
        return self._bias()

    @abstractmethod
    def _forward(self, x: torch.Tensor) -> torch.Tensor:
        raise NotImplementedError

    @abstractmethod
    def _initialize(self, base_module: "Parametrization.base_class") -> None:
        """
        Initialize the parametrization based on a given base module.
        This method should build the internal representation of the module's weight and bias,
        registering all required buffers and parameters in `self`.
        """
        raise NotImplementedError

    @abstractmethod
    def _weight(self) -> torch.Tensor:
        raise NotImplementedError

    @abstractmethod
    def _bias(self) -> torch.Tensor | None:
        raise NotImplementedError

    @abstractmethod
    def get_target_params(self) -> dict[str, torch.nn.Parameter]:
        """
        Return the (tunable) target parameters of the parametrization.
        Here, "target parameters" means that they can be tuned and potentially compressed
        by `self.reset_target_params(mode="compress")`.
        Other torch parameters of the module could be tuned as well, but should not be returned here.
        The returned dictionary should be compatible with `self.named_parameters()`.

        See Also:
            - `ParametrizedModel.get_target_params`
            - `ParametrizedModel.compress`
        """
        raise NotImplementedError

    @abstractmethod
    def reset_target_params(self, mode: Literal["full", "nonzero", "compress"] = "full") -> None:
        """
        Reset the target parameters of the parametrization according to a given mode.

        Args:
            mode: The reset mode.
                "full" means reset to the original value at initialization.
                "nonzero" means reset all non-zero values to their original value at initialization.
                "compress" means that all zero values are removed and the parameters are compressed accordingly.
        """
        raise NotImplementedError

    @abstractmethod
    def get_num_params(self, compressed: bool = False, target_params: dict[str, torch.Tensor] | None = None) -> int:
        """
        Computes the (effective) number of parameters of the parametrization.

        Args:
            compressed: Whether to count the number of parameters as if the module were actually compressed.
                If `False`, the number of parameters is the same as in the original module.
            target_params: Count the number of parameters as if `target_params` were used instead of
                `self.get_target_params()`. This "what if" feature is important when pruning
                a full `ParametrizedModel` to a certain target ratio.
        """
        raise NotImplementedError


@runtime_checkable
class ParametrizedModule(Protocol):
    """
    Interface for a parametrized `nn.Module`.
    It ensures that `weight` and `bias` are forwarded to the `Parametrization` instance.

    Attributes:
        parametrization: The `Parametrization` instance of the module.
        _forward: The original forward function of the module.
        __old_class__: The original class of the module.

    Notes:
        `_forward` and `__old_class__` are used by `parametrize_module` and `unparametrize_module`
        to allow restoring the original behavior of the module.
    """

    parametrization: Parametrization
    _forward: callable
    __old_class__: type[nn.Module]

    @property
    def weight(self):
        return self.parametrization.weight

    @property
    def bias(self):
        return self.parametrization.bias


def parametrize_module(module: nn.Module, parametrization: Parametrization) -> nn.Module:
    """
    Parametrize a module using a `Parametrization` instance.

    Args:
        module: The module to be parametrized.
        parametrization: The `Parametrization` instance to be applied to the module.

    Returns: The parametrized module, implementing the `ParametrizedModule` interface.

    Notes:
        Adopted from https://stackoverflow.com/a/31075641
    """

    assert isinstance(module, parametrization.base_class)
    module.__old_class__ = module.__class__

    # Initialize the parametrization and add it to the module
    module.add_module("parametrization", parametrization)
    module.parametrization.initialize(module)

    # Save the original forward in case we want to remove the parametrization again
    module._forward = module.forward

    # Cast to a new parametrized object class type
    del module.weight
    del module.bias
    module.__class__ = type("Parametrized" + module.__class__.__name__, (module.__class__, ParametrizedModule), {})
    # Make sure that we utilize the forward function of the parametrization
    module.forward = module.parametrization.forward

    return module


def unparametrize_module(module: ParametrizedModule) -> nn.Module:
    """
    Revert the parametrization of a module.

    Args:
        module: A module that has been parametrized by `parametrize_module`.

    Returns: The original module.

    Notes:
        Adopted from https://stackoverflow.com/a/31075641
    """

    # Make sure to save weight and bias in intermediate variables
    weight = module.weight
    bias = module.bias

    assert isinstance(module, nn.Module)

    # This line will remove the properties module.weight and module.bias
    module.__class__ = type(module.__old_class__.__name__, (module.__old_class__,), {})
    delattr(module, "__old_class__")

    # Add weight and bias as native parameters to the module again
    module.register_parameter("weight", nn.Parameter(weight, weight.requires_grad))
    if bias is not None:
        module.register_parameter("bias", nn.Parameter(bias, bias.requires_grad))
    else:
        module.register_parameter("bias", None)

    # Recover the original forward pass and get rid of the parametrization
    del module.parametrization
    module.forward = module._forward
    delattr(module, "_forward")

    return module
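
To make the abstract interface above concrete, here is a minimal toy sketch (purely illustrative and not part of the repo; the repo's real implementation is `SVDLinearParametrization` in projected_layer.py, and the definitions above are assumed importable). It masks the output channels of an `nn.Linear`, with the mask as the only target parameter:

```python
import torch
from torch import nn
from torch.nn import functional as F


class MaskedLinearParametrization(Parametrization):
    """Toy parametrization: per-output-channel mask on a frozen weight."""

    base_class = nn.Linear

    def _initialize(self, base_module: nn.Linear) -> None:
        # Keep the original weight/bias as frozen buffers and expose a tunable mask.
        self.register_buffer("base_weight", base_module.weight.detach().clone())
        bias = base_module.bias
        self.register_buffer("base_bias", None if bias is None else bias.detach().clone())
        self.mask = nn.Parameter(torch.ones(base_module.out_features))

    def _forward(self, x: torch.Tensor) -> torch.Tensor:
        return F.linear(x, self.weight, self.bias)

    def _weight(self) -> torch.Tensor:
        return self.mask.unsqueeze(1) * self.base_weight  # broadcast over output rows

    def _bias(self) -> torch.Tensor | None:
        return None if self.base_bias is None else self.mask * self.base_bias

    def get_target_params(self) -> dict[str, torch.nn.Parameter]:
        return {"mask": self.mask}

    def reset_target_params(self, mode: str = "full") -> None:
        with torch.no_grad():
            if mode == "full":
                self.mask.fill_(1.0)
            elif mode == "nonzero":
                self.mask[self.mask != 0.0] = 1.0
            # "compress" would physically drop zeroed rows; omitted in this toy.

    def get_num_params(self, compressed: bool = False, target_params: dict[str, torch.Tensor] | None = None) -> int:
        mask = self.mask if target_params is None else target_params["mask"]
        rows = int((mask != 0).sum()) if compressed else self.base_weight.shape[0]
        return rows * self.base_weight.shape[1] + (rows if self.base_bias is not None else 0)


layer = parametrize_module(nn.Linear(8, 4), MaskedLinearParametrization())
y = layer(torch.randn(2, 8))  # still behaves like a regular Linear
with torch.no_grad():
    layer = unparametrize_module(layer)  # restores a plain nn.Linear
```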
parametrized_model.py
ADDED
@@ -0,0 +1,747 @@
import logging
import os
from dataclasses import asdict, dataclass, field
from typing import Any, Literal, Type

import torch
from peft import PeftConfig
from peft.tuners.tuners_utils import _maybe_include_all_linear_layers, check_target_module_exists
from torch import nn
from transformers import AutoConfig, PretrainedConfig, PreTrainedModel

from .parametrized_layer import Parametrization, parametrize_module, ParametrizedModule, unparametrize_module
from .projected_layer import SVDLinearParametrization
from .utils import get_class_from_str, get_str_from_class, init_empty_weights

logger = logging.getLogger(__name__)


@dataclass
class BaseModelConfig:
    """
    Configuration for the base model to be parametrized by `ParametrizedModel`.

    Attributes:
        pretrained_model_cls: The class of the base model. Child class of `PreTrainedModel`.
        pretrained_model_kwargs: Keyword arguments used when creating the base model in the constructor
            of `ParametrizedModel` via `from_pretrained`.
        pretrained_config: Optional config used when creating the base model in the constructor
            of `ParametrizedModel` via `from_pretrained`.

    See Also:
        `ParametrizedModelConfig`
    """

    pretrained_model_cls: Type[PreTrainedModel]
    pretrained_model_kwargs: dict[str, Any] = field(default_factory=dict)
    pretrained_config: PretrainedConfig | None = None

    def __post_init__(self):
        # If pretrained_model_cls is a string, convert it to a class (required for deserialization from JSON config)
        if isinstance(self.pretrained_model_cls, str):
            self.pretrained_model_cls = get_class_from_str(self.pretrained_model_cls)  # noqa

    def to_dict(self) -> dict[str, Any]:
        config_dict = asdict(self)  # type: ignore
        # Make sure that pretrained_model_cls and pretrained_config are JSON serializable
        config_dict["pretrained_model_cls"] = get_str_from_class(self.pretrained_model_cls)
        if self.pretrained_config is not None:
            config_dict["pretrained_config"] = self.pretrained_config.to_dict()
        return config_dict

    @classmethod
    def from_dict(cls, config_dict: dict[str, Any]) -> "BaseModelConfig":
        # Try to deserialize pretrained_config with AutoConfig, otherwise fall back to PretrainedConfig
        try:
            if config_dict["pretrained_config"] is not None:
                # Try AutoConfig to find the right model config class
                config_dict["pretrained_config"] = AutoConfig.for_model(**config_dict["pretrained_config"])
        except ValueError:
            logger.warning("Unrecognized model identifier in AutoConfig, using PretrainedConfig instead.")
            config_dict["pretrained_config"] = PretrainedConfig.from_dict(config_dict["pretrained_config"])
        return cls(**config_dict)


# Predefined parametrization classes for `ParametrizationConfig.module_factory_cls` (avoids absolute package imports)
PARAMETRIZATION_FACTORY_REGISTRY: dict[str, Type[Parametrization]] = {
    "svd": SVDLinearParametrization,
}


@dataclass
class ParametrizationConfig:
    """
    Configuration for the parametrization to be applied to the linear layers of the base model in `ParametrizedModel`.

    Attributes:
        module_factory_cls: The class name of the parametrization to be applied to linear layers.
            Can be a string representing a class name (with absolute module path) or a predefined key
            from `PARAMETRIZATION_FACTORY_REGISTRY`.
            Use `parse_module_factory_cls` to get the actual class when creating the parametrization.
        module_factory_kwargs: Keyword arguments used when creating the parametrization with `module_factory_cls`.
        target_modules: A (list of) string(s) specifying the names of the linear layers to be parametrized.
            Follows the same semantics as Huggingface's `PeftConfig`, see also `check_target_module_exists`.
            If a string, a regex match will be performed; if a list, a module will be parametrized if its name ends
            with any of the strings in `target_modules`.
        exclude_modules: A list of strings specifying the names of the linear layers to be excluded from
            parametrization. A module will be excluded if any of the strings in `exclude_modules` is in its name.

    See Also:
        `ParametrizedModelConfig`
    """

    module_factory_cls: str
    module_factory_kwargs: dict[str, Any] = field(default_factory=dict)
    target_modules: str | list[str] | None = None
    exclude_modules: list[str] | None = None

    def parse_module_factory_cls(self) -> Type[Parametrization]:
        """Returns the class of the parametrization to be applied to linear layers."""
        try:
            if self.module_factory_cls in PARAMETRIZATION_FACTORY_REGISTRY:
                module_factory_cls = PARAMETRIZATION_FACTORY_REGISTRY[self.module_factory_cls]
            else:
                module_factory_cls = get_class_from_str(self.module_factory_cls)
        except Exception:
            raise ValueError(f"Unrecognized parametrization class: {self.module_factory_cls}")
        return module_factory_cls

    def to_dict(self) -> dict[str, Any]:
        config_dict = asdict(self)  # type: ignore
        # _maybe_include_all_linear_layers creates sets, which do not work with JSON serialization, so cast to list
        for key, value in config_dict.items():
            if isinstance(value, set):
                config_dict[key] = list(value)
        return config_dict

    @classmethod
    def from_dict(cls, config_dict: dict[str, Any]) -> "ParametrizationConfig":
        return cls(**config_dict)

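
# --- Illustrative sketch (not part of this module): how ParametrizationConfig is
# --- typically populated; the values mirror this repo's config.json.
#
#     cfg = ParametrizationConfig(
#         module_factory_cls="svd",  # key in PARAMETRIZATION_FACTORY_REGISTRY
#         module_factory_kwargs={"mask_func": "ste", "mask_scaling_factor": 0.02},
#         target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
#     )
#     assert cfg.parse_module_factory_cls() is SVDLinearParametrization
#     assert cfg == ParametrizationConfig.from_dict(cfg.to_dict())  # JSON round-trip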
123 |
+
|
124 |
+
@dataclass
|
125 |
+
class AdapterConfig:
|
126 |
+
"""
|
127 |
+
Configuration for the Huggingface Peft adapters to be applied to the base model.
|
128 |
+
|
129 |
+
Attributes:
|
130 |
+
peft_config: One or more adapter `PeftConfig`s to be applied to the base model.
|
131 |
+
If a single `PeftConfig` is provided, it will wrapped by a dict with key "default".
|
132 |
+
The dictionary keys will be used as adapter names in `PretrainedModel.add_adapter`.
|
133 |
+
|
134 |
+
See Also:
|
135 |
+
`ParametrizedModelConfig`
|
136 |
+
"""
|
137 |
+
|
138 |
+
peft_config: PeftConfig | dict[str, PeftConfig]
|
139 |
+
|
140 |
+
def __post_init__(self):
|
141 |
+
if isinstance(self.peft_config, PeftConfig):
|
142 |
+
self.peft_config = {"default": self.peft_config}
|
143 |
+
|
144 |
+
def to_dict(self) -> dict[str, Any]:
|
145 |
+
config_dict = asdict(self) # type: ignore
|
146 |
+
# Make each PeftConfig JSON serializable
|
147 |
+
for adapter_name, peft_config in self.peft_config.items():
|
148 |
+
peft_config_dict = peft_config.to_dict()
|
149 |
+
# Peft casts lists to sets, which are not JSON serializable, so cast to list manually
|
150 |
+
for key, value in peft_config_dict.items():
|
151 |
+
if isinstance(value, set):
|
152 |
+
peft_config_dict[key] = list(value)
|
153 |
+
config_dict["peft_config"][adapter_name] = peft_config_dict
|
154 |
+
return config_dict
|
155 |
+
|
156 |
+
@classmethod
|
157 |
+
def from_dict(cls, config_dict: dict[str, Any]) -> "AdapterConfig":
|
158 |
+
# Deserialize each PeftConfig automatically with from_peft_type
|
159 |
+
for key, peft_config in config_dict["peft_config"].items():
|
160 |
+
config_dict["peft_config"][key] = PeftConfig.from_peft_type(**peft_config)
|
161 |
+
return cls(**config_dict)
|
162 |
+
|
163 |
+
|
164 |
+
try:
|
165 |
+
# Prevent import errors because for some systems like macOS, bitsandbytes cannot be installed directly
|
166 |
+
import bitsandbytes
|
167 |
+
|
168 |
+
# Predefined quantization classes for `WeightQuantizationConfig.module_factory_cls`
|
169 |
+
# (avoids absolute package imports)
|
170 |
+
QUANTIZATION_FACTORY_REGISTRY: dict[str, Type[nn.Linear]] = {
|
171 |
+
"bnb4bit": bitsandbytes.nn.Linear4bit,
|
172 |
+
}
|
173 |
+
except ImportError:
|
174 |
+
logger.warning("bitsandbytes is not installed, skipping quantization.")
|
175 |
+
QUANTIZATION_FACTORY_REGISTRY: dict[str, Type[nn.Linear]] = {}
|
176 |
+
|
177 |
+
|
178 |
+
@dataclass
|
179 |
+
class WeightQuantizationConfig:
|
180 |
+
"""
|
181 |
+
Configuration for an (optional) weight quantization to be applied to the base model.
|
182 |
+
So far, only fp4 quantization with bitsandbytes has been tested, but analogous bitsandbytes
|
183 |
+
quantizations should work as well. `module_factory_cls` might also use a different quantization library,
|
184 |
+
as long as it is compatible with the module replacement strategy in `ParametrizedModule.quantize`.
|
185 |
+
|
186 |
+
Attributes:
|
187 |
+
module_factory_cls: The class name of the quantization to be applied to linear layers.
|
188 |
+
Can be a string representing a class name (with absolute module path) or a predefined key
|
189 |
+
from `QUANTIZATION_FACTORY_REGISTRY`.
|
190 |
+
Use `parse_module_factory_cls` to get the actual class when creating the quantization.
|
191 |
+
module_factory_kwargs: Keyword arguments used when creating the quantization with `module_factory_cls`.
|
192 |
+
target_modules: A (list of) string(s) specifying the names of the linear layers to be quantized.
|
193 |
+
Follows the same semantics as Huggingface's `PeftConfig`, see also `check_target_module_exists`.
|
194 |
+
If a string, a regex match will be performed; if a list, a module will be quantized if its name ends
|
195 |
+
with any of the strings in `target_modules`.
|
196 |
+
exclude_modules: A list of strings specifying the names of the linear layers to be excluded from
|
197 |
+
quantization. A module will be excluded if any of the strings in `exclude_modules` is in its name.
|
198 |
+
|
199 |
+
See Also:
|
200 |
+
`ParametrizedModelConfig`
|
201 |
+
"""
|
202 |
+
|
203 |
+
module_factory_cls: str
|
204 |
+
module_factory_kwargs: dict[str, Any] = field(default_factory=dict)
|
205 |
+
target_modules: str | list[str] | None = None
|
206 |
+
exclude_modules: list[str] | None = None
|
207 |
+
|
208 |
+
def parse_module_factory_cls(self) -> Type[nn.Linear]:
|
209 |
+
"""Returns the class of the quantization to be applied to linear layers."""
|
210 |
+
try:
|
211 |
+
if self.module_factory_cls in QUANTIZATION_FACTORY_REGISTRY:
|
212 |
+
module_factory_cls = QUANTIZATION_FACTORY_REGISTRY[self.module_factory_cls]
|
213 |
+
else:
|
214 |
+
module_factory_cls = get_class_from_str(self.module_factory_cls)
|
215 |
+
except Exception:
|
216 |
+
raise ValueError(f"Unrecognized quantization class: {self.module_factory_cls}")
|
217 |
+
return module_factory_cls
|
218 |
+
|
219 |
+
def to_dict(self) -> dict[str, Any]:
|
220 |
+
config_dict = asdict(self) # type: ignore
|
221 |
+
# Make torch.dtype fields JSON serializable
|
222 |
+
for key, value in config_dict["module_factory_kwargs"].items():
|
223 |
+
if isinstance(value, torch.dtype):
|
224 |
+
config_dict["module_factory_kwargs"][key] = str(value)
|
225 |
+
# _maybe_include_all_linear_layers creates sets which does not work with JSON serialization, so cast to list
|
226 |
+
for key, value in config_dict.items():
|
227 |
+
if isinstance(value, set):
|
228 |
+
config_dict[key] = list(value)
|
229 |
+
return config_dict
|
230 |
+
|
231 |
+
@classmethod
|
232 |
+
def from_dict(cls, config_dict: dict[str, Any]) -> "WeightQuantizationConfig":
|
233 |
+
# Deserialize torch.dtype fields
|
234 |
+
for key, value in config_dict["module_factory_kwargs"].items():
|
235 |
+
if isinstance(value, str) and value.startswith("torch."):
|
236 |
+
dtype_name = value.split(".")[-1]
|
237 |
+
config_dict["module_factory_kwargs"][key] = getattr(torch, dtype_name)
|
238 |
+
return cls(**config_dict)
|
239 |
+
|
240 |
+
|
241 |
+
class ParametrizedModelConfig(PretrainedConfig):
|
242 |
+
"""
|
243 |
+
Configuration for `ParametrizedModel` implementing a `PretrainedConfig` to be fully compatible with
|
244 |
+
Huggingface's `PreTrainedModel` framework.
|
245 |
+
|
246 |
+
See Also:
|
247 |
+
- `BaseModelConfig`
|
248 |
+
- `ParametrizationConfig`
|
249 |
+
- `AdapterConfig`
|
250 |
+
- `WeightQuantizationConfig`
|
251 |
+
- `ParametrizedModel`
|
252 |
+
"""
|
253 |
+
|
254 |
+
model_type = "parametrized_model"
|
255 |
+
|
256 |
+
def __init__(
|
257 |
+
self,
|
258 |
+
base_model_config: BaseModelConfig | None = None,
|
259 |
+
parametrization_config: ParametrizationConfig | None = None,
|
260 |
+
adapter_config: AdapterConfig | None = None,
|
261 |
+
weight_quantization_config: WeightQuantizationConfig | None = None,
|
262 |
+
model_mode: Literal["train", "eval"] = "train",
|
263 |
+
**kwargs: Any,
|
264 |
+
):
|
265 |
+
"""
|
266 |
+
Initializes a `ParametrizedModelConfig`, serving as a container for `BaseModelConfig`, `ParametrizationConfig`,
|
267 |
+
`AdapterConfig`, and `WeightQuantizationConfig`.
|
268 |
+
|
269 |
+
Args:
|
270 |
+
base_model_config: `BaseModelConfig`
|
271 |
+
parametrization_config: `ParametrizationConfig`
|
272 |
+
adapter_config: `AdapterConfig`
|
273 |
+
weight_quantization_config: `WeightQuantizationConfig`
|
274 |
+
model_mode: Whether to initialize the model in train or eval mode.
|
275 |
+
**kwargs: Keyword arguments forwarded to `PretrainedConfig`.
|
276 |
+
"""
|
277 |
+
self.base_model_config = base_model_config
|
278 |
+
self.parametrization_config = parametrization_config
|
279 |
+
self.adapter_config = adapter_config
|
280 |
+
self.weight_quantization_config = weight_quantization_config
|
281 |
+
self.model_mode = model_mode
|
282 |
+
super().__init__(**kwargs)
|
283 |
+
|
284 |
+
def _convert_to_dict(self, config_dict: dict[str, Any]) -> dict[str, Any]:
|
285 |
+
if self.base_model_config is not None:
|
286 |
+
config_dict["base_model_config"] = self.base_model_config.to_dict()
|
287 |
+
if self.parametrization_config is not None:
|
288 |
+
config_dict["parametrization_config"] = self.parametrization_config.to_dict()
|
289 |
+
if self.adapter_config is not None:
|
290 |
+
config_dict["adapter_config"] = self.adapter_config.to_dict()
|
291 |
+
if self.weight_quantization_config is not None:
|
292 |
+
config_dict["weight_quantization_config"] = self.weight_quantization_config.to_dict()
|
293 |
+
return config_dict
|
294 |
+
|
295 |
+
def to_diff_dict(self):
|
296 |
+
# Override PretrainedConfig to_diff_dict to make subconfigs JSON serializable.
|
297 |
+
config_dict = super().to_diff_dict()
|
298 |
+
return self._convert_to_dict(config_dict)
|
299 |
+
|
300 |
+
def to_dict(self):
|
301 |
+
# Override PretrainedConfig to_diff to make subconfigs JSON serializable.
|
302 |
+
config_dict = super().to_dict()
|
303 |
+
return self._convert_to_dict(config_dict)
|
304 |
+
|
305 |
+
@classmethod
|
306 |
+
def from_dict(cls, config_dict: dict[str, Any], **kwargs: Any) -> PretrainedConfig:
|
307 |
+
# Deserialize BaseModelConfig
|
308 |
+
base_model_config_dict: dict[str, Any] | None = config_dict.pop("base_model_config", None)
|
309 |
+
if base_model_config_dict is not None:
|
310 |
+
base_model_config = BaseModelConfig.from_dict(base_model_config_dict)
|
311 |
+
else:
|
312 |
+
base_model_config = None
|
313 |
+
# Deserialize ParametrizationConfig
|
314 |
+
parametrization_config_dict: dict[str, Any] | None = config_dict.pop("parametrization_config", None)
|
315 |
+
if parametrization_config_dict is not None:
|
316 |
+
parametrization_config = ParametrizationConfig.from_dict(parametrization_config_dict)
|
317 |
+
else:
|
318 |
+
parametrization_config = None
|
319 |
+
# Deserialize AdapterConfig
|
320 |
+
adapter_config_dict: dict[str, Any] | None = config_dict.pop("adapter_config", None)
|
321 |
+
if adapter_config_dict is not None:
|
322 |
+
adapter_config = AdapterConfig.from_dict(adapter_config_dict)
|
323 |
+
else:
|
324 |
+
adapter_config = None
|
325 |
+
# Deserialize WeightQuantizationConfig
|
326 |
+
weight_quantization_config_dict: dict[str, Any] | None = config_dict.pop("weight_quantization_config", None)
|
327 |
+
if weight_quantization_config_dict is not None:
|
328 |
+
weight_quantization_config = WeightQuantizationConfig.from_dict(weight_quantization_config_dict)
|
329 |
+
else:
|
330 |
+
weight_quantization_config = None
|
331 |
+
|
332 |
+
config = super().from_dict(config_dict, **kwargs)
|
333 |
+
|
334 |
+
# Handle special case when return_unused_kwargs is True
|
335 |
+
if "return_unused_kwargs" in kwargs and kwargs["return_unused_kwargs"] is True:
|
336 |
+
config[0].base_model_config = base_model_config
|
337 |
+
config[0].parametrization_config = parametrization_config
|
338 |
+
config[0].adapter_config = adapter_config
|
339 |
+
config[0].weight_quantization_config = weight_quantization_config
|
340 |
+
else:
|
341 |
+
config.base_model_config = base_model_config
|
342 |
+
config.parametrization_config = parametrization_config
|
343 |
+
config.adapter_config = adapter_config
|
344 |
+
config.weight_quantization_config = weight_quantization_config
|
345 |
+
return config
|
346 |
+
|
347 |
+
|
348 |
+
class ParametrizedModel(PreTrainedModel):
|
349 |
+
"""
|
350 |
+
Base class for parametrized models implemented as a custom Huggingface `PreTrainedModel`.
|
351 |
+
It wraps any base model of type `PreTrainedModel` in `self.model`, whose linear layers can be
|
352 |
+
parametrized (`parametrize`), equipped with adapters (`inject_adapters`), and quantized (`quantize`).
    The corresponding modules are accessed via `parametrized_modules`, `adapter_modules`,
    and `quantized_modules`, respectively.
    The class also provides several convenience methods to manage the parametrization: `get_target_params`,
    `get_num_params`, `get_compression_ratio`, `reset_target_params`, `compress`.

    Standard functionality (`forward`, `generate`, `save_pretrained`, `from_pretrained`) is essentially forwarded
    to the wrapped model.

    See Also:
        `ParametrizedModelConfig`
    """

    config_class = ParametrizedModelConfig

    def __init__(self, config: ParametrizedModelConfig, base_model: PreTrainedModel | None = None, **_: Any):
        """
        Initialize the `ParametrizedModel` from a given configuration or an existing base model.

        Args:
            config: `ParametrizedModelConfig` to be used.
            base_model: If provided, this base model is used instead of creating it from `config.base_model_config`.
            **_: Ignored keyword arguments to prevent unexpected keyword errors.

        See Also: `BaseModelConfig`
        """
        super().__init__(config)
        self.config = config  # redundant but enables type hinting for ParametrizedModelConfig

        # Either use an existing base model or create a new one from config.base_model_config.
        if base_model is None:
            if self.config.base_model_config is None:
                raise ValueError("Either base_model or base_model_config must be provided.")
            self.model = self.config.base_model_config.pretrained_model_cls.from_pretrained(
                config=self.config.base_model_config.pretrained_config,
                **self.config.base_model_config.pretrained_model_kwargs,
            )
        else:
            self.model = base_model

        # Set base model to train or eval mode.
        self.train(self.config.model_mode == "train")
        logger.info(f"Base model {self.model.__class__} created.")

        # Perform parametrization.
        self._parametrized_modules: dict[str, ParametrizedModule] | None = None
        self.parametrize()

        # Inject adapters.
        self._adapter_modules: dict[str, nn.Module] | None = None
        self.inject_adapters()

        # Quantization needs to be performed manually via `quantize` because it is fully optional.
        self._quantized_modules: dict[str, nn.Linear] | None = None

        # Modified modules are initialized after parametrize and inject_adapters because they may alter the nested
        # module and parameter structure of the model.
        _ = self.parametrized_modules
        _ = self.adapter_modules
        _ = self.quantized_modules

        # Initially disable all tunable parameters to avoid unexpected behavior.
        # Tunable parameter selection should be handled by the optimizer factory in `BaseLitModule`.
        for param in self.parameters():
            param.requires_grad = False

    @property
    def base_model_name_or_path(self) -> str:
        """Convenience property that returns the name or path of the base model."""
        return self.model.name_or_path  # type: ignore

    def forward(self, *args, **kwargs) -> Any:
        return self.model(*args, **kwargs)

    def generate(self, *args, **kwargs) -> Any:
        return self.model.generate(*args, **kwargs)

    def save_pretrained(
        self,
        save_directory: str | os.PathLike,
        state_dict: dict | None = None,
        include_filter: list[str] | None = None,
        exclude_filter: list[str] | None = None,
        **kwargs: Any,
    ) -> None:
        """
        Override of the default `save_pretrained` method to allow filtering of the saved state dict.

        Args:
            save_directory: Directory to save the model to.
            state_dict: Manual override of the state dict to be saved.
                If None, `include_filter` and `exclude_filter` are applied to `self.state_dict()`.
            include_filter: List of state dict keys to include in the saved state dict.
                A key matches when it ends with any of the strings in the list.
                If None, all keys are included.
            exclude_filter: List of state dict keys to exclude from the saved state dict.
                A key matches when it ends with any of the strings in the list.
                If None, no keys are excluded.
            **kwargs: Keyword arguments to be passed to the default `save_pretrained` method.

        See Also:
            `PreTrainedModel.save_pretrained`
        """
        if state_dict is None:
            state_dict = self.state_dict()
            if include_filter is not None:
                state_dict = {k: v for k, v in state_dict.items() if any(k.endswith(f) for f in include_filter)}
            if exclude_filter is not None:
                state_dict = {k: v for k, v in state_dict.items() if not any(k.endswith(f) for f in exclude_filter)}

        super().save_pretrained(save_directory=save_directory, state_dict=state_dict, **kwargs)

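    # Usage sketch (illustrative): for a parametrized model `pm`,
    #   pm.save_pretrained("ckpt", include_filter=["parametrization.mask"])
    # persists only the state dict entries ending in "parametrization.mask", i.e., the mask
    # target parameters registered by `ProjectedLinearParametrization` (see projected_layer.py).
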
    @classmethod
    def from_pretrained(
        cls,
        pretrained_model_name_or_path: str | os.PathLike | None,
        *model_args: Any,
        with_init_empty_weights: bool = True,
        **kwargs: Any,
    ) -> PreTrainedModel:
        """
        Override of the default `from_pretrained` method to allow initialization with empty weights.

        Args:
            pretrained_model_name_or_path: Model name or path.
            *model_args: Arguments to be passed to the default `from_pretrained` method.
            with_init_empty_weights: Whether to initialize the model with empty weights or not.
            **kwargs: Keyword arguments to be passed to the default `from_pretrained` method.
        """
        with init_empty_weights(with_init_empty_weights):
            return super().from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)

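    # Usage sketch (illustrative): loading with empty weights first parametrizes the base model
    # with uninitialized tensors (skipping, e.g., the SVD in `SVDLinearParametrization`) and then
    # fills them from the checkpoint:
    #   pm = ParametrizedModel.from_pretrained("path/to/checkpoint", with_init_empty_weights=True)
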
    @property
    def parametrized_modules(self) -> dict[str, ParametrizedModule]:
        """
        Returns a dictionary of all parametrized modules in the model.
        The returned dictionary is compatible with `self.model.named_modules()`.
        """
        if self._parametrized_modules is None:
            self._parametrized_modules = {}
            if self.config.parametrization_config is None:
                return self._parametrized_modules
            for m_name, module in self.model.named_modules():
                if isinstance(module, ParametrizedModule):
                    self._parametrized_modules[m_name] = module
        return self._parametrized_modules

    @property
    def adapter_modules(self) -> dict[str, nn.Module]:
        """
        Returns a dictionary of all adapter modules in the model.
        The returned dictionary is compatible with `self.model.named_modules()`.
        """
        if self._adapter_modules is None:
            self._adapter_modules = {}
            if self.config.adapter_config is None:
                return self._adapter_modules
            try:
                # Use the adapter management of `PreTrainedModel` to retrieve the adapter modules.
                for adapter_name in self.model.active_adapters():
                    for m_name in self.model.get_adapter_state_dict(adapter_name).keys():
                        adapter_m_name = f"{m_name.rsplit('.', 1)[0]}.{adapter_name}"
                        self._adapter_modules[adapter_m_name] = self.model.get_submodule(adapter_m_name)
            except ValueError as e:
                logger.warning(e)
        return self._adapter_modules

    @property
    def quantized_modules(self) -> dict[str, nn.Linear]:
        """
        Returns a dictionary of all quantized modules in the model.
        The returned dictionary is compatible with `self.model.named_modules()`.
        """
        if self._quantized_modules is None:
            self._quantized_modules = {}
            if self.config.weight_quantization_config is None:
                return self._quantized_modules
            try:
                module_factory_cls = self.config.weight_quantization_config.parse_module_factory_cls()
            except Exception as e:
                logger.warning(f"Could not parse weight quantization config, quantization not available.\nError: {e}")
                return self._quantized_modules
            for m_name, module in self.model.named_modules():
                if isinstance(module, module_factory_cls):
                    self._quantized_modules[m_name] = module
        return self._quantized_modules

    def parametrize(self) -> None:
        """
        Parametrize the `target_modules` from `ParametrizationConfig` using `parametrized_layer.parametrize_module`.

        See Also: `ParametrizationConfig`
        """
        if self.config.parametrization_config is None:
            logger.debug("Model parametrization is disabled.")
            return

        # Use peft semantics, e.g., "all-linear" to include all linear layers.
        # TODO: Replace by own helper function to avoid unnecessary dependencies
        config: ParametrizationConfig = _maybe_include_all_linear_layers(  # type: ignore
            self.config.parametrization_config,  # type: ignore
            self.model,
        )
        module_factory_cls = config.parse_module_factory_cls()

        for m_name, module in self.model.named_modules():
            # Only modify the modules that are targeted.
            if config.exclude_modules is not None and any(key in m_name for key in config.exclude_modules):
                continue
            if not check_target_module_exists(config, m_name):
                continue

            parametrization = module_factory_cls(**config.module_factory_kwargs)
            parametrize_module(module=module, parametrization=parametrization)
            logger.debug(f"Parametrized {module.__class__} module {m_name} as {parametrization.__class__}")

        self._parametrized_modules = None  # reset parametrized modules
        logger.info("Parametrization completed.")

    def inject_adapters(self) -> None:
        """
        Inject adapters according to `AdapterConfig` using the adapter management of `PreTrainedModel`.

        See Also: `AdapterConfig`
        """
        if self.config.adapter_config is None:
            logger.debug("Adapter injection is disabled.")
            return

        for adapter_name, peft_config in self.config.adapter_config.peft_config.items():
            self.model.add_adapter(peft_config, adapter_name=adapter_name)
        self.model.set_adapter(list(self.config.adapter_config.peft_config.keys()))

        self._adapter_modules = None  # reset adapter modules
        logger.info("Adapters injected.")

    def quantize(self) -> None:
        """
        Quantize the `target_modules` from `WeightQuantizationConfig`.

        See Also: `WeightQuantizationConfig`
        """
        if self.config.weight_quantization_config is None:
            logger.debug("Weight quantization is disabled.")
            return

        # Use peft semantics, e.g., "all-linear" to include all linear layers.
        # TODO: Replace by own helper function to avoid unnecessary dependencies
        config: WeightQuantizationConfig = _maybe_include_all_linear_layers(  # type: ignore
            self.config.weight_quantization_config,  # type: ignore
            self.model,
        )
        module_factory_cls = config.parse_module_factory_cls()

        for m_name, module in self.model.named_modules():
            # Only modify the modules that are targeted.
            if config.exclude_modules is not None and any(key in m_name for key in config.exclude_modules):
                continue
            if not check_target_module_exists(config, m_name) or isinstance(module, ParametrizedModule):
                continue
            if not isinstance(module, nn.Linear):
                continue

            # Important: This module must NOT be created in a device context like with_init_device("cuda").
            quantized_module = module_factory_cls(
                module.in_features,
                module.out_features,
                bias=module.bias is not None,
                device=module.weight.device,
                **config.module_factory_kwargs,
            )
            # cf. https://huggingface.co/docs/bitsandbytes/reference/nn/linear4bit#bitsandbytes.nn.Linear4bit.example
            quantized_module.load_state_dict(module.state_dict())
            quantized_module = quantized_module.to(module.weight.device)
            quantized_module.weight.requires_grad = False
            logger.debug(f"Quantized {module.__class__} module {m_name} to {quantized_module.__class__}")

            # Replace the target module by the quantized module.
            parent_name, child_name = m_name.rsplit(".", 1)
            parent_module = self.model.get_submodule(parent_name)
            parent_module.add_module(child_name, quantized_module)

        self._quantized_modules = None  # reset quantized modules
        logger.info("Quantization completed.")

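    # Usage sketch (illustrative): quantization is opt-in and must be triggered explicitly,
    # e.g., with a `WeightQuantizationConfig` whose module factory is a bitsandbytes linear class:
    #   pm.quantize()
    # Parametrized modules are skipped, so quantization only touches the remaining linear layers.
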
    def get_target_params(self) -> dict[str, nn.Parameter]:
        """
        Lifts `Parametrization.get_target_params` to the model scope.
        The returned dictionary should be compatible with `self.model.named_parameters()`.

        See Also:
            `Parametrization.get_target_params`
        """
        target_params = {}
        for m_name, module in self.parametrized_modules.items():
            for p_name, param in module.parametrization.get_target_params().items():
                target_params[f"{m_name}.parametrization.{p_name}"] = param
        return target_params

    def get_num_params(
        self, compressed: bool = False, full: bool = False, target_params: dict[str, torch.Tensor] | None = None
    ) -> int:
        """
        Lifts `Parametrization.get_num_params` to the model scope.
        Computes the (effective) number of parameters of the entire model.

        Args:
            compressed: Whether to count the number of parameters as if the parametrized modules were actually
                compressed. If `False`, the number of parameters is the same as in the original module.
            full: If `True`, all parameters of the model are counted, if `False` only those of parametrized modules.
                Default is `False`, which follows the most common convention in the compression literature.
            target_params: Count the number of parameters as if `target_params` were used instead of
                the parametrized modules' target parameters. The dictionary keys should be compatible with those of
                `self.get_target_params`.

        See Also:
            `Parametrization.get_num_params`
        """
        num_params_full = 0
        if full:
            for name, param in self.model.named_parameters():
                if "parametrization" not in name:  # exclude parametrized modules here (counted below)
                    if hasattr(param, "quant_state"):  # HOTFIX: special case for bitsandbytes-quantized parameters
                        num_params_full += param.numel() * 2
                    else:
                        num_params_full += param.numel()

        num_params = 0
        for module_name, module in self.parametrized_modules.items():
            module_target_params = None
            if compressed and target_params is not None:
                # Make target_params' keys those of parametrized modules, i.e., trim f"{module_name}.parametrization."
                prefix = f"{module_name}.parametrization."
                # Filter and re-map keys for the current module.
                module_target_params = {
                    key[len(prefix) :]: value for key, value in target_params.items() if key.startswith(prefix)
                }
                if not module_target_params:
                    module_target_params = None

            num_params += module.parametrization.get_num_params(
                compressed=compressed, target_params=module_target_params
            )
        num_params = num_params + num_params_full
        if num_params == 0:
            # dummy to avoid division by zero (e.g., if there are no parametrized_modules and full=False)
            num_params = 1e-6
        return num_params

    def get_compression_ratio(self, full: bool = False, target_params: dict[str, torch.Tensor] | None = None) -> float:
        """
        Convenience function to compute the compression ratio of the present model.

        See Also:
            `get_num_params`
        """
        return self.get_num_params(compressed=True, full=full, target_params=target_params) / self.get_num_params(
            full=full
        )

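    # Note: the returned ratio is (compressed size) / (original size), so values below 1.0 indicate
    # effective compression; e.g., 0.4 means the counted parameters shrink to 40% of their original number.
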
    def reset_target_params(self, mode: Literal["full", "nonzero", "compress"] = "full") -> None:
        """
        Lifts `Parametrization.reset_target_params` to the model scope.

        Args:
            mode: The reset mode, see `Parametrization.reset_target_params`.

        See Also:
            `Parametrization.reset_target_params`
        """
        for m_name, module in self.parametrized_modules.items():
            module.parametrization.reset_target_params(mode=mode)

    def compress(self) -> None:
        """
        Compresses all parametrized modules using `Parametrization.reset_target_params(mode="compress")`.
        If no compression is possible, the module is unparametrized and removed from `parametrized_modules`.
        """
        removed_parametrized_modules = []
        for m_name, module in self.parametrized_modules.items():
            if module.parametrization.get_num_params(compressed=True) / module.parametrization.get_num_params() >= 1.0:
                unparametrize_module(module)
                removed_parametrized_modules.append(m_name)
                logger.debug(f"Unparametrizing {module.__class__} module {m_name}")
            else:
                module.parametrization.reset_target_params(mode="compress")
                logger.debug(f"Compressing {module.__class__} module {m_name}")
        for m_name in removed_parametrized_modules:
            self.parametrized_modules.pop(m_name)
        logger.info("Compression completed.")


# Register ParametrizedModelConfig and ParametrizedModel for AutoModel.
# Required to push custom models to the Huggingface Hub (see https://huggingface.co/docs/transformers/en/custom_models)
ParametrizedModelConfig.register_for_auto_class()
ParametrizedModel.register_for_auto_class("AutoModel")
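
For orientation, a minimal usage sketch of `ParametrizedModel` (illustrative only: the repo id is a placeholder, and `trust_remote_code=True` is required because the class is registered via `register_for_auto_class` above):

from transformers import AutoModel

model = AutoModel.from_pretrained("user/acip-model", trust_remote_code=True)
print(model.get_compression_ratio())  # 1.0 as long as no mask entries are pruned
model.compress()  # materialize low-rank layers in place where compression pays off
model.save_pretrained("./compressed-checkpoint")
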
projected_layer.py
ADDED
@@ -0,0 +1,308 @@
import math
from abc import ABC, abstractmethod
from logging import getLogger
from typing import Literal

import torch
from torch import nn
from torch.nn import functional as F

from .parametrized_layer import Parametrization
from .utils import use_init_empty_weights

logger = getLogger(__name__)


class CompressionCriterion(ABC):
    """
    Abstract class for the compression criterion of a (target) parameter of a parametrized module.
    """

    @abstractmethod
    def __call__(self, x: torch.Tensor) -> torch.Tensor:
        """
        Args:
            x: A tensor of any shape.

        Returns: A boolean mask of the same shape as `x` where `False` indicates that the entry can be removed.
        """
        raise NotImplementedError


class ThresholdCriterion(CompressionCriterion):
    """
    Compression criterion based on a threshold. All entries below `self.threshold` can be removed.
    """

    def __init__(self, threshold: float = 0.0):
        self.threshold = threshold

    def __call__(self, x: torch.Tensor) -> torch.Tensor:
        return x > self.threshold


class ProjectedLinearParametrization(Parametrization, ABC):
    """
    Implementation of a linear layer parametrization, factorizing the weight matrix as
    `weight = ortho.weight @ torch.diag(mask) @ base.weight`.
    Here, `ortho` is a linear layer with orthogonal columns, `mask` represents a (binary) diagonal matrix
    that can be pruned, and `base` is a linear layer (determined by the choice of `ortho`).
    Any child class needs to implement `_ortho_init`, which creates `ortho`. Based on this, `mask` and `base` are
    initialized such that the original weight matrix is obtained at initialization.

    `mask` corresponds to the only target parameter of this parametrization. Pruning it will result in
    a low-rank matrix representation of the parametrized linear module.
    """

    base_class = nn.Linear

    def __init__(
        self,
        mask_func: Literal["ste", "relu", "none"] = "ste",
        mask_scaling_factor: float | str = "norm",
        compression_criterion: CompressionCriterion = ThresholdCriterion(),
    ):
        """
        Args:
            mask_func: A function applied to the mask parameter in each forward pass implementing
                custom functionalities. Available options: ["ste", "relu", "none"].
                "ste" means using a straight-through estimator, i.e., in the forward pass, `mask` is binarized,
                which is ignored in the backward pass. Before binarization, `mask` is passed through a ReLU
                activation.
                "relu" means that `mask` is passed through a ReLU activation.
                "none" means that `mask` is not modified.
            mask_scaling_factor: Conceptually, `mask` is initialized with ones, but rescaling to a smaller value
                can vastly improve the training speed. `mask_scaling_factor` specifies this rescaling factor.
                The rescaling should be compensated by scaling `ortho` accordingly in `self._ortho_init`.
                If `mask_scaling_factor='norm'`, the scaling factor is chosen such that `mask` has unit L2 norm
                (note that this can lead to a different behavior in model tuning than for a fixed factor
                when some target parameters have a different number of elements).
            compression_criterion: `CompressionCriterion` to be used in `self.reset_target_params(mode="compress")`.
        """
        super().__init__()
        self.mask_func = {
            "ste": mask_func_ste,
            "relu": mask_func_relu,
            "none": mask_func_none,
        }[mask_func]
        self._mask_scaling_factor = mask_scaling_factor
        self.compression_criterion = compression_criterion

    def _forward(self, x: torch.Tensor) -> torch.Tensor:
        # This implementation avoids an explicit materialization of `weight`.
        x = self.base(x)
        x = self.mask_func(self.mask, self.mask_scaling_factor) * x
        x = self.ortho(x)
        return x

    def _weight(self) -> torch.Tensor:
        # Compute the original weight matrix; don't use this in the forward pass for efficiency reasons.
        mask = self.mask_func(self.mask, self.mask_scaling_factor)
        return self.ortho.weight @ torch.diag(mask) @ self.base.weight

    def _bias(self) -> torch.Tensor | None:
        return self.ortho.bias

    def _initialize(self, base_module: base_class) -> None:
        factory_kwargs = {"device": base_module.weight.device, "dtype": base_module.weight.dtype}
        in_dim, out_dim = base_module.in_features, base_module.out_features
        proj_dim = min(in_dim, out_dim)  # infer mask (bottleneck) dimension

        # Initialize ortho layer ...
        self.add_module(
            "ortho",
            nn.Linear(in_features=proj_dim, out_features=out_dim, bias=base_module.bias is not None, **factory_kwargs),
        )
        self._ortho_init(base_module.weight)
        if base_module.bias is not None:
            # It is important that ortho carries the bias (and not base) because ortho is used to compute the final
            # output of the forward pass.
            self.ortho.bias.data.copy_(base_module.bias.data)

        # ... and compute the base layer based on the choice of ortho (this only works if ortho has orthogonal columns).
        base = base_module.__class__(in_features=in_dim, out_features=proj_dim, bias=False, **factory_kwargs)
        base.weight.data.copy_(self.ortho.weight.data.T @ base_module.weight.data)
        self.add_module("base", base)

        # Creating (tunable) mask parameter ...
        self.register_parameter("mask", torch.nn.Parameter(torch.ones(proj_dim, **factory_kwargs)))
        # ... and rescale mask properly in a separate step
        # (because reset_target_params calls mask_scaling_factor, which in turn may require mask to already exist).
        self.reset_target_params()

    @abstractmethod
    def _ortho_init(self, weight: torch.Tensor) -> None:
        """
        Initialize ortho layer. Must be implemented by child class.

        Args:
            weight: Weight matrix of the original linear layer module.
        """
        raise NotImplementedError

    def get_target_params(self) -> dict[str, torch.nn.Parameter]:
        return {"mask": self.mask}

    @property
    def mask_scaling_factor(self) -> float:
        if self._mask_scaling_factor == "norm":
            # Choose scaling factor such that mask has unit L2 norm.
            # Note: mask already needs to exist at this point to infer its shape.
            self._mask_scaling_factor = 1 / math.sqrt(self.mask.numel())
            return self._mask_scaling_factor
        elif isinstance(self._mask_scaling_factor, float):
            return self._mask_scaling_factor
        else:
            raise ValueError(f"Invalid mask_scaling_factor: {self._mask_scaling_factor}")

    @property
    def in_features(self) -> int:
        return self.base.in_features

    @property
    def out_features(self) -> int:
        return self.ortho.out_features

    def reset_target_params(self, mode: Literal["full", "nonzero", "compress"] = "full") -> None:
        with torch.no_grad():
            if mode == "full":
                # Scale mask values properly by self.mask_scaling_factor.
                self.mask.data = torch.ones_like(self.mask.data) * self.mask_scaling_factor
            elif mode == "nonzero":
                # Scale mask values properly by self.mask_scaling_factor.
                self.mask.data[self.mask.data > 0] = 1.0 * self.mask_scaling_factor
                self.mask.data[self.mask.data < 0] = 0.0
            elif mode == "compress":
                if self.compression_criterion is None:
                    logger.warning("Compression criterion is not set. No op...")
                    return
                # Select entries of parameter mask that should be kept.
                dim_select = self.compression_criterion(self.mask)
                # Create and register compressed layers and mask.
                new_base = new_linear_from_mask(self.base, dim_select, column_select=False)
                new_ortho = new_linear_from_mask(self.ortho, dim_select, column_select=True)
                new_mask = self.mask[dim_select].clone().detach()
                del self.mask, self.base, self.ortho
                self.register_module("base", new_base)
                self.register_module("ortho", new_ortho)
                self.register_parameter("mask", nn.Parameter(new_mask))
            else:
                raise ValueError(f"Invalid mode: {mode}")

    def get_num_params(self, compressed: bool = False, target_params: dict[str, torch.Tensor] | None = None) -> int:
        if not compressed:
            # Compute number of parameters for the full linear layer.
            num_params = self.in_features * self.out_features
            if self.bias is not None:
                num_params += self.out_features
            return num_params
        else:
            # Compute number of mask values that could be discarded by self.reset_target_params(mode="compress") ...
            if target_params is not None:
                sparsity = mask_sparsity(target_params["mask"] != 0.0, threshold=0.0)
            else:
                sparsity = mask_sparsity(self.mask)
            # ... and compute the (hypothetical) number of parameters for a compressed module.
            num_params = self.in_features * sparsity + sparsity * self.out_features
            if self.bias is not None:
                num_params += self.out_features
            # If the number of parameters for the compressed module would be larger than the number of parameters
            # for the full module, return the latter because we can always unparametrize to the original module if
            # compression would not be effective.
            num_params = min(self.get_num_params(compressed=False), num_params)
            return num_params

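# Parameter-count sketch (illustrative): for a 4096x4096 layer (16.8M dense parameters) whose mask
# retains 1024 entries, the compressed factorization needs 4096*1024 + 1024*4096 = 8.4M parameters,
# i.e., a per-module compression ratio of 0.5; `get_num_params` caps this count at the dense size.
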
class SVDLinearParametrization(ProjectedLinearParametrization):
    """
    Implementation of a linear layer parametrization using SVD decomposition.
    If the SVD of weight is U * S * V^T, then `ortho.weight = U` and `base.weight = S * V^T`.
    As base is computed automatically by `_initialize`, `_ortho_init` only needs to compute U and
    scale it properly with `mask_scaling_factor`. The singular values S are buffered just in case they are needed
    in the tuning process.
    """

    def _ortho_init(self, weight: torch.Tensor) -> None:
        k = min(weight.shape[0], weight.shape[1])
        if use_init_empty_weights.get():
            # Check if the init_empty_weights context is active, which avoids a (costly) SVD computation and just
            # initializes U and S as empty tensors. They are loaded later from a pretrained model.
            logger.debug("Parametrizing with empty weights.")
            U = torch.empty(weight.shape[0], k)
            S = torch.empty(k, 1)
        else:
            # Detaching is important to avoid memory leaks. torch.linalg.svd only works with float32.
            U, S, _ = torch.linalg.svd(weight.detach().float(), full_matrices=False)
        # Rescaling U based on mask_scaling_factor.
        # This step is somewhat manual because calling mask_scaling_factor requires the mask to already exist.
        if self._mask_scaling_factor == "norm":
            U = math.pow(k, 1 / 4) * U
        else:
            U = math.sqrt(1 / self._mask_scaling_factor) * U
        factory_kwargs = {"device": weight.device, "dtype": weight.dtype}
        self.ortho.weight.data.copy_(U.detach().to(**factory_kwargs))
        self.register_buffer("S", S.detach().flatten().to(**factory_kwargs))

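
# Illustrative sketch: parametrizing a single linear layer with the classes above.
# `parametrize_module` is the helper from parametrized_layer.py (used the same way in parametrized_model.py).
def _example_svd_parametrization() -> None:
    from .parametrized_layer import parametrize_module

    layer = nn.Linear(16, 8)
    parametrization = SVDLinearParametrization(mask_func="ste", mask_scaling_factor="norm")
    parametrize_module(module=layer, parametrization=parametrization)
    # The layer now computes ortho(mask * base(x)); pruning the mask and compressing
    # materializes the low-rank factors:
    parametrization.reset_target_params(mode="compress")
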
def mask_func_ste(mask: torch.Tensor, mask_scaling_factor: float) -> torch.Tensor:
    # See ProjectedLinearParametrization.__init__ for more details.
    mask = F.relu(mask)
    return (mask > 0).to(mask.dtype).detach() * mask_scaling_factor + mask - mask.detach()

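
# Note on the STE trick above: the forward value equals the binarized mask times mask_scaling_factor,
# while the `+ mask - mask.detach()` term contributes zero to the value but routes the gradient
# straight through to `mask`.
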

def mask_func_relu(mask: torch.Tensor, mask_scaling_factor: float) -> torch.Tensor:
    # See ProjectedLinearParametrization.__init__ for more details.
    return F.relu(mask)


def mask_func_none(mask: torch.Tensor, mask_scaling_factor: float) -> torch.Tensor:
    # See ProjectedLinearParametrization.__init__ for more details.
    return mask


def mask_sparsity(mask: torch.Tensor, threshold: float = 0.0) -> int:
    """Simple util function to compute the number of non-zero elements of a mask, where an element is considered
    non-zero if its value is strictly greater than `threshold`."""
    return torch.count_nonzero(mask > threshold).item()


def new_linear_from_mask(module: nn.Linear, dim_select: torch.Tensor, column_select=True) -> nn.Linear:
    """
    Creates a new linear layer from an existing one based on a mask indicating which columns/rows to keep.

    Args:
        module: Module to be pruned.
        dim_select: Boolean tensor mask indicating which columns/rows to keep.
        column_select: Whether to prune columns (True) or rows (False) according to `dim_select`.

    Returns: Pruned module.
    """
    assert dim_select.dtype == torch.bool, "dim_select must be boolean"

    in_features, out_features = module.in_features, module.out_features
    sparsity = dim_select.sum().item()
    if column_select:
        in_features = sparsity
    else:
        out_features = sparsity
    new_module = module.__class__(
        in_features=in_features,
        out_features=out_features,
        bias=module.bias is not None,
        device=module.weight.device,
        dtype=module.weight.dtype,
    )
    weight = module.weight.data
    if column_select:
        weight = weight[:, dim_select]
    else:
        weight = weight[dim_select, :]
    new_module.weight.data.copy_(weight.detach())

    if new_module.bias is not None:
        if column_select:
            new_module.bias.data.copy_(module.bias.detach())
        else:
            # If rows are pruned, the bias needs to be pruned as well.
            new_module.bias.data.copy_(module.bias[dim_select].detach())

    return new_module
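
# Illustrative sketch: pruning the output rows of a linear layer with `new_linear_from_mask`.
def _example_new_linear_from_mask() -> nn.Linear:
    layer = nn.Linear(8, 4, bias=True)
    keep = torch.tensor([True, False, True, True])  # drop the second output row
    pruned = new_linear_from_mask(layer, keep, column_select=False)
    assert pruned.in_features == 8 and pruned.out_features == 3
    return pruned
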
utils.py
ADDED
@@ -0,0 +1,83 @@
import contextvars
import importlib
from contextlib import contextmanager
from typing import Any, Type


def get_class_from_str(class_str: str, package: str | None = None) -> Type[Any]:
    """
    Converts a string to the corresponding class object, supporting relative imports.
    For relative module paths (starting with '.'), a package must be provided.

    Args:
        class_str: String representation of the class, either absolute or relative.
        package: Package context, only required for relative imports.

    Returns: Class object corresponding to the provided string.
    """
    if not isinstance(class_str, str) and isinstance(class_str, type):
        return class_str

    module_path, _, class_name = class_str.rpartition(".")
    if not module_path and class_str.startswith("."):
        module_path = "."
    if module_path.startswith("."):
        if not package:
            raise ValueError("Relative module path provided without a package context.")
        module = importlib.import_module(module_path, package=package)
    else:
        module = importlib.import_module(module_path)
    return getattr(module, class_name)

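
# Illustrative examples (module names below are just examples):
#   get_class_from_str("torch.nn.Linear") returns the `torch.nn.Linear` class, while
#   get_class_from_str(".projected_layer.SVDLinearParametrization", package="acip_model")
# resolves the relative path against the given package.
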

def get_str_from_class(cls: Type[Any], package: str | None = None) -> str:
    """
    Converts a class object to its string representation.
    If a package is provided and the class's module is a submodule of the package,
    the returned string will use a relative import.
    Otherwise, an absolute import string is returned.

    Args:
        cls: Class object to convert.
        package: Package context, only required for relative imports.

    Returns: String representation of the class.
    """
    if isinstance(cls, str):
        return cls

    module_path = cls.__module__
    class_name = cls.__name__

    if package:
        # When the class is defined directly in the package's __init__.py
        if module_path == package:
            return f".{class_name}"
        # When the class is in a submodule of the package
        elif module_path.startswith(package + "."):
            # Get the relative part (including the dot)
            relative = module_path[len(package) :]
            if not relative.startswith("."):
                relative = "." + relative
            return f"{relative}.{class_name}"
    return f"{module_path}.{class_name}"

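
# Illustrative round trip (package name is just an example): for `cls = torch.nn.Linear`,
#   get_str_from_class(cls) == "torch.nn.modules.linear.Linear"
# and get_class_from_str(get_str_from_class(cls, package=pkg), package=pkg) recovers `cls`.
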

use_init_empty_weights = contextvars.ContextVar("init_empty_weights", default=False)


@contextmanager
def init_empty_weights(value: bool):
    """
    Context manager to indicate whether a (parametrized) model should be initialized with empty weights.
    If active, `use_init_empty_weights` will be set to `True`, otherwise to `False`.
    To check if the context is active, import and check `use_init_empty_weights.get()`.

    Args:
        value: Indicates whether the model should be initialized with empty weights or not.
    """
    token = use_init_empty_weights.set(value)
    try:
        yield
    finally:
        use_init_empty_weights.reset(token)
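

# Illustrative sketch: skipping expensive weight initialization when a checkpoint is loaded anyway.
def _example_init_empty_weights() -> None:
    with init_empty_weights(True):
        assert use_init_empty_weights.get() is True
        # Build a parametrized model here; e.g., SVDLinearParametrization._ortho_init
        # (see projected_layer.py) checks this flag to skip the SVD computation.
    assert use_init_empty_weights.get() is False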