Handle single-device models without hf_device_map after Transformers optimization (#2401)

ZX-ModelCloud · IlyasMoutawwakil · web-flow · commit 114647f51404 · 2026-01-15T09:19:41.000+01:00
* When `hf_device_map` does not exist, infer the `device_map`

Signed-off-by: ZX-ModelCloud <zx@modelcloud.ai>

* cleanup

Signed-off-by: ZX-ModelCloud <zx@modelcloud.ai>

* cleanup

Signed-off-by: ZX-ModelCloud <zx@modelcloud.ai>

* Update optimum/gptq/quantizer.py

Co-authored-by: Ilyas Moutawwakil <57442720+IlyasMoutawwakil@users.noreply.github.com>

* cleanup

Signed-off-by: ZX-ModelCloud <zx@modelcloud.ai>

* cleanup

Signed-off-by: ZX-ModelCloud <zx@modelcloud.ai>

* Fix device_map value to use param.device

Signed-off-by: ZX-ModelCloud <zx@modelcloud.ai>

---------

Signed-off-by: ZX-ModelCloud <zx@modelcloud.ai>
Co-authored-by: Ilyas Moutawwakil <57442720+IlyasMoutawwakil@users.noreply.github.com>
diff --git a/optimum/gptq/quantizer.py b/optimum/gptq/quantizer.py
@@ -691,7 +691,13 @@ def pack_model(
         layers = get_layers(model)
         layers = {n: layers[n] for n in quantizers}
 
-        self.select_quant_linear(device_map=model.hf_device_map, pack=True)
+        if hasattr(model, "hf_device_map"):
+            device_map = model.hf_device_map
+        else:
+            # Transformers skips the accelerate hooks (so `hf_device_map` is not set) when device_map resolves to a single device
+            device_map = {"": next(model.parameters()).device}
+
+        self.select_quant_linear(device_map=device_map, pack=True)
 
         self._replace_by_quant_layers(model, quantizers)
         qlayers = get_layers(model, [self.quant_linear])