
Commit 451d838

dbogunowicz, rahul-tuli, and bogunowicz@arrival.com authored
[Fix] Fully functional FSDP one-shot process (#2305)
* Update tests; diff updated on compressed tensors side
* Style
* Initial commit
* fix the FSDP name stripping
* cleanup after rebase
* refactoring

Co-authored-by: Rahul Tuli <rahul@neuralmagic.com>
Co-authored-by: bogunowicz@arrival.com <bogunowicz@arrival.com>
1 parent 56b7854 commit 451d838

File tree

4 files changed: +13 -4 lines changed


src/sparseml/modifiers/quantization/gptq/pytorch.py

Lines changed: 2 additions & 0 deletions
@@ -23,6 +23,7 @@
 from sparseml.modifiers.quantization.gptq.utils.gptq_wrapper import GPTQWrapper
 from sparseml.modifiers.utils.layer_compressor import LayerCompressor
 from sparseml.modifiers.utils.pytorch_helpers import run_calibration_forward
+from sparseml.utils.fsdp.context import fix_fsdp_module_name


 __all__ = ["GPTQModifierPyTorch"]
@@ -116,6 +117,7 @@ def initialize_compression(
         self.layer_compressors_ = []

         for idx, (name, layer) in enumerate(self.compressible_layers_.items()):
+            name = fix_fsdp_module_name(name)
             _LOGGER.info(f"Preparing {name} for compression")
             args = self._pruning_arguments()
             comp_cls = self._compression_class()

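For context on the change above: iterating an FSDP-wrapped model yields dotted module names that contain the wrapper segment, so the name is normalized before it is logged and used for compression setup. A minimal sketch of that pattern, where the dictionary and module names are illustrative stand-ins for self.compressible_layers_ (only fix_fsdp_module_name and the loop shape come from the diff):

from sparseml.utils.fsdp.context import fix_fsdp_module_name

# illustrative stand-in for self.compressible_layers_ on an FSDP-wrapped model;
# the wrapper segment "_fsdp_wrapped_module" appears inside the dotted names
compressible_layers = {
    "model._fsdp_wrapped_module.layers.0": None,
    "model._fsdp_wrapped_module.layers.1": None,
}

for idx, (name, layer) in enumerate(compressible_layers.items()):
    name = fix_fsdp_module_name(name)  # -> "model.layers.0", "model.layers.1"
    print(f"Preparing {name} for compression")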
src/sparseml/pytorch/utils/sparsification.py

Lines changed: 4 additions & 0 deletions
@@ -69,6 +69,10 @@ def __init__(
         self.state_dict = state_dict

         if self.state_dict is not None:
+            # when analyzing an FSDP model, the state_dict does not differentiate
+            # between trainable and non-trainable parameters
+            # (e.g. it can contain buffers) this means that the
+            # self.trainable_parameters may be overestimated
             self.trainable_params = [param for _, param in state_dict.items()]
         else:
             self.trainable_params = list(

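The new comment concerns counting parameters from a state_dict. A minimal, self-contained sketch (plain torch, not SparseML code) of why that count can be an overestimate: buffers such as BatchNorm running statistics appear in state_dict() but are not trainable parameters.

import torch

bn = torch.nn.BatchNorm1d(4)

# state_dict() includes buffers alongside parameters
print(sorted(bn.state_dict().keys()))
# ['bias', 'num_batches_tracked', 'running_mean', 'running_var', 'weight']

# named_parameters() lists only the trainable tensors
print([name for name, _ in bn.named_parameters()])
# ['weight', 'bias']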
src/sparseml/utils/fsdp/context.py

Lines changed: 7 additions & 3 deletions
@@ -30,7 +30,7 @@
     "fix_fsdp_module_name",
 ]

-FSDP_WRAPPER_NAME = "_fsdp_wrapped_module."
+FSDP_WRAPPER_NAME = "_fsdp_wrapped_module"


 def summon_full_params_context(model, offload_to_cpu: bool = False):
@@ -61,9 +61,13 @@ def main_process_first_context():

 def fix_fsdp_module_name(name: str) -> str:
     """
-    Remove FSDP wrapper prefixes from a module name
+    Remove FSDP wrapper prefixes from a module name.
+    Accounts for scenario where FSDP_WRAPPER_NAME is
+    at the end of the name, as well as in the middle.

     :param name: name to strip
     :return: stripped name
     """
-    return name.replace(FSDP_WRAPPER_NAME, "")
+    return name.replace(FSDP_WRAPPER_NAME + ".", "").replace(
+        "." + FSDP_WRAPPER_NAME, ""
+    )

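To illustrate the behavioral change: dropping the trailing dot from FSDP_WRAPPER_NAME and replacing both the "wrapper." and ".wrapper" forms lets the helper strip the segment whether it sits in the middle or at the end of a dotted name. A standalone sketch mirroring the new logic (the example names are illustrative):

FSDP_WRAPPER_NAME = "_fsdp_wrapped_module"

def fix_fsdp_module_name(name: str) -> str:
    # strip the wrapper segment both mid-name ("<wrapper>.child")
    # and at the end of the name ("parent.<wrapper>")
    return name.replace(FSDP_WRAPPER_NAME + ".", "").replace(
        "." + FSDP_WRAPPER_NAME, ""
    )

print(fix_fsdp_module_name("_fsdp_wrapped_module.model.layers.0"))  # model.layers.0
print(fix_fsdp_module_name("model.layers.0._fsdp_wrapped_module"))  # model.layers.0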
src/sparseml/utils/pytorch/module.py

Lines changed: 0 additions & 1 deletion
@@ -188,7 +188,6 @@ def get_layer(target: str, module: Module) -> Tuple[str, Module]:


 def set_layer(target: str, layer: Module, module: Module) -> Module:
-    target = fix_fsdp_module_name(target)
     with summon_full_params_context(module):
         # importing here to avoid circular import
         from sparseml.utils.fsdp.helpers import maybe_get_wrapped
