11 changes: 9 additions & 2 deletions lmdeploy/pytorch/config.py
@@ -347,12 +347,19 @@ def from_pretrained(
is_draft_model=is_draft_model,
spec_method=spec_method,
)

enforce_fp32_head = False
if hf_overrides is not None:
logger = get_logger('lmdeploy')
logger.warning(f'Overriding HF config with {hf_overrides}')
override_hf_config(model_config.hf_config, hf_overrides)

enforce_fp32_head = hf_overrides.get('enforce_fp32_head', False)

# deal with fp32_head
setattr(model_config.hf_config, 'enforce_fp32_head', enforce_fp32_head)
if hasattr(model_config.hf_config, 'text_config'):
setattr(model_config.hf_config.text_config, 'enforce_fp32_head', enforce_fp32_head)
if hasattr(model_config.hf_config, 'llm_config'):
setattr(model_config.hf_config.llm_config, 'enforce_fp32_head', enforce_fp32_head)
# for serialization of transformers modules
maybe_register_config_serialize_by_value(trust_remote_code)

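For context, a minimal usage sketch of how the new flag is expected to reach `ModelConfig.from_pretrained`: the override arrives via `hf_overrides`, is copied onto `hf_config` (and its `text_config`/`llm_config` children), and is later read by the embedding/lm_head builders in the model files below. The `pipeline`/`PytorchEngineConfig` call shape here is an assumption for illustration and is not part of this diff.

```python
# Sketch only: assumes PytorchEngineConfig exposes `hf_overrides` the same way
# ModelConfig.from_pretrained consumes it in the hunk above.
from lmdeploy import pipeline, PytorchEngineConfig

pipe = pipeline(
    'internlm/internlm2_5-7b-chat',  # any supported model id
    backend_config=PytorchEngineConfig(
        # Propagated into hf_config.enforce_fp32_head (and into text_config /
        # llm_config when present), which the model builders read below.
        hf_overrides={'enforce_fp32_head': True},
    ),
)
```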
36 changes: 14 additions & 22 deletions lmdeploy/pytorch/models/gpt_oss.py
@@ -9,12 +9,14 @@
from transformers.configuration_utils import PretrainedConfig

from lmdeploy.pytorch.model_inputs import StepContext, StepContextManager
from lmdeploy.pytorch.nn import ApplyRotaryEmb, Attention, RMSNorm, build_rotary_embedding_from_config
from lmdeploy.pytorch.nn.linear import build_o_proj, build_qkv_proj, build_rowwise_linear
from lmdeploy.pytorch.nn import (ApplyRotaryEmb, Attention, ParallelEmbedding, RMSNorm,
build_rotary_embedding_from_config)
from lmdeploy.pytorch.nn.linear import build_o_proj, build_qkv_proj
from lmdeploy.pytorch.nn.moe import build_fused_moe
from lmdeploy.pytorch.weight_loader.model_weight_loader import load_weight

from .utils.cudagraph import CudaGraphMixin
from .utils.model import DeployModelMixinV1


class GptOssAttention(nn.Module):
@@ -333,11 +335,14 @@ class GptOssModel(nn.Module):

def __init__(self, config: PretrainedConfig, dtype: torch.dtype = None, device: torch.device = None):
super().__init__()
self.embed_tokens = nn.Embedding(config.vocab_size,
config.hidden_size,
config.pad_token_id,
dtype=dtype,
device=device)
self.embed_tokens = ParallelEmbedding(
config.vocab_size,
config.hidden_size,
config.pad_token_id,
dtype=dtype,
device=device,
force_dtype=torch.float32 if getattr(config, 'enforce_fp32_head') else None,
Copilot AI Jan 21, 2026

The use of getattr(config, 'enforce_fp32_head') without a default value will raise an AttributeError if the attribute doesn't exist, e.g. if the config was created before the enforce_fp32_head attribute was set or the configuration flow is bypassed. Use getattr(config, 'enforce_fp32_head', False) instead to provide a safe default value.

Suggested change
force_dtype=torch.float32 if getattr(config, 'enforce_fp32_head') else None,
force_dtype=torch.float32 if getattr(config, 'enforce_fp32_head', False) else None,
)

# build all decode layers
self.layers = nn.ModuleList([
@@ -394,7 +399,7 @@ def get_input_embeddings(self):
return self.embed_tokens


class GptOssForCausalLM(nn.Module, CudaGraphMixin):
class GptOssForCausalLM(nn.Module, DeployModelMixinV1, CudaGraphMixin):
"""ModelForCausalLM."""

packed_modules_mapping = {
@@ -416,11 +421,7 @@ def __init__(self,
# build model
self.model = GptOssModel(config, dtype=dtype, device=device)
# build lm_head
self.lm_head = build_rowwise_linear(config.hidden_size,
config.vocab_size,
bias=False,
dtype=dtype,
device=device)
self.lm_head = self.build_lm_head(config.hidden_size, config.vocab_size, bias=False, dtype=dtype, device=device)

def forward(
self,
@@ -441,15 +442,6 @@ def forward(
)
return hidden_states

def get_logits(self, hidden_states: torch.Tensor):
"""Compute logits of the model output."""
return self.lm_head(hidden_states)

def update_weights(self):
"""Update weights."""
if self.config.tie_word_embeddings:
self.lm_head.weight = self.model.embed_tokens.weight

def get_input_embeddings(self):
"""Get input embeddings."""
return self.model.get_input_embeddings()
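The per-model get_logits/update_weights removals in this file (and in the files below) rely on DeployModelMixinV1 providing build_lm_head and get_lm_head. The mixin itself is not part of this diff; the sketch below only illustrates the shape such a mixin plausibly has and the fp32-head handling it would centralize. All method bodies here are assumptions, not the real implementation in models/utils/model.py.

```python
import torch
from torch import nn

from lmdeploy.pytorch.nn.linear import build_rowwise_linear


class DeployModelMixinV1Sketch:
    """Rough sketch; the real mixin lives in lmdeploy/pytorch/models/utils/model.py."""

    def build_lm_head(self, hidden_size: int, vocab_size: int, bias: bool = False,
                      dtype: torch.dtype = None, device: torch.device = None) -> nn.Module:
        # One place to decide the head dtype instead of per-model
        # build_rowwise_linear calls (e.g. fp32 when enforce_fp32_head is set).
        head_dtype = torch.float32 if getattr(self.config, 'enforce_fp32_head', False) else dtype
        self._lm_head = build_rowwise_linear(hidden_size, vocab_size, bias=bias,
                                             dtype=head_dtype, device=device)
        return self._lm_head

    def get_lm_head(self) -> nn.Module:
        # VLM wrappers (InternVLChatModel, InternVLForConditionalGeneration)
        # override this to delegate to their inner language model.
        return self._lm_head

    def get_logits(self, hidden_states: torch.Tensor) -> torch.Tensor:
        # Generic replacement for the get_logits methods deleted in this PR.
        return self.get_lm_head()(hidden_states)

    def update_weights(self):
        # Weight tying formerly repeated in every *ForCausalLM.
        if getattr(self.config, 'tie_word_embeddings', False):
            self.get_lm_head().weight = self.get_input_embeddings().weight
```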
28 changes: 13 additions & 15 deletions lmdeploy/pytorch/models/internlm2.py
@@ -7,12 +7,13 @@
from transformers.configuration_utils import PretrainedConfig

from lmdeploy.pytorch.model_inputs import StepContext, StepContextManager
from lmdeploy.pytorch.nn import ApplyRotaryEmb, Attention, RMSNorm, SiluAndMul, build_rotary_embedding_from_config
from lmdeploy.pytorch.nn.linear import (build_down_linear, build_gateup_linear, build_o_proj, build_qkv_proj,
build_rowwise_linear)
from lmdeploy.pytorch.nn import (ApplyRotaryEmb, Attention, ParallelEmbedding, RMSNorm, SiluAndMul,
build_rotary_embedding_from_config)
from lmdeploy.pytorch.nn.linear import build_down_linear, build_gateup_linear, build_o_proj, build_qkv_proj
from lmdeploy.pytorch.weight_loader.model_weight_loader import load_weight

from .utils.cudagraph import CudaGraphMixin
from .utils.model import DeployModelMixinV1


class InternLM2Attention(nn.Module):
@@ -208,12 +209,13 @@ def __init__(self, config: PretrainedConfig, dtype: torch.dtype = None, device:
self.padding_idx = config.pad_token_id
self.vocab_size = config.vocab_size

self.tok_embeddings = nn.Embedding(config.vocab_size,
config.hidden_size,
self.padding_idx,
dtype=dtype,
device=device)

self.tok_embeddings = ParallelEmbedding(
config.vocab_size,
config.hidden_size,
self.padding_idx,
dtype=dtype,
device=device,
force_dtype=torch.float32 if getattr(config, 'enforce_fp32_head') else None)
Copilot AI Jan 21, 2026

The use of getattr(config, 'enforce_fp32_head') without a default value will raise an AttributeError if the attribute doesn't exist, e.g. if the config was created before the enforce_fp32_head attribute was set or the configuration flow is bypassed. Use getattr(config, 'enforce_fp32_head', False) instead to provide a safe default value.

Suggested change
force_dtype=torch.float32 if getattr(config, 'enforce_fp32_head') else None)
force_dtype=torch.float32 if getattr(config, 'enforce_fp32_head', False) else None)
# build all decode layers
self.layers = nn.ModuleList([
InternLM2DecoderLayer(config, layer_idx, dtype=dtype, device=device)
@@ -269,7 +271,7 @@ def get_input_embeddings(self):
return self.tok_embeddings


class InternLM2ForCausalLM(nn.Module, CudaGraphMixin):
class InternLM2ForCausalLM(nn.Module, DeployModelMixinV1, CudaGraphMixin):
"""Rewrote model of InternLM2ForCausalLM."""

packed_modules_mapping = {
@@ -290,11 +292,7 @@ def __init__(self,
# build Model
self.model = InternLM2Model(config, dtype=dtype, device=device)
# build lm_head
self.output = build_rowwise_linear(config.hidden_size,
config.vocab_size,
bias=False,
dtype=dtype,
device=device)
self.output = self.build_lm_head(config.hidden_size, config.vocab_size, bias=False, dtype=dtype, device=device)

def forward(
self,
37 changes: 14 additions & 23 deletions lmdeploy/pytorch/models/internlm3.py
@@ -7,12 +7,13 @@
from transformers.configuration_utils import PretrainedConfig

from lmdeploy.pytorch.model_inputs import StepContext, StepContextManager
from lmdeploy.pytorch.nn import ApplyRotaryEmb, Attention, RMSNorm, SiluAndMul, build_rotary_embedding_from_config
from lmdeploy.pytorch.nn.linear import (build_down_linear, build_gateup_linear, build_o_proj, build_qkv_proj,
build_rowwise_linear)
from lmdeploy.pytorch.nn import (ApplyRotaryEmb, Attention, ParallelEmbedding, RMSNorm, SiluAndMul,
build_rotary_embedding_from_config)
from lmdeploy.pytorch.nn.linear import build_down_linear, build_gateup_linear, build_o_proj, build_qkv_proj
from lmdeploy.pytorch.weight_loader.model_weight_loader import load_weight

from .utils.cudagraph import CudaGraphMixin
from .utils.model import DeployModelMixinV1


class InternLM3Attention(nn.Module):
@@ -210,11 +211,14 @@ def __init__(self, config: PretrainedConfig, dtype: torch.dtype = None, device:
self.padding_idx = config.pad_token_id
self.vocab_size = config.vocab_size

self.embed_tokens = nn.Embedding(config.vocab_size,
config.hidden_size,
self.padding_idx,
dtype=dtype,
device=device)
self.embed_tokens = ParallelEmbedding(
config.vocab_size,
config.hidden_size,
self.padding_idx,
dtype=dtype,
device=device,
force_dtype=torch.float32 if getattr(config, 'enforce_fp32_head') else None,
Copilot AI Jan 21, 2026

The use of getattr(config, 'enforce_fp32_head') without a default value will raise an AttributeError if the attribute doesn't exist, e.g. if the config was created before the enforce_fp32_head attribute was set or the configuration flow is bypassed. Use getattr(config, 'enforce_fp32_head', False) instead to provide a safe default value.

Suggested change
force_dtype=torch.float32 if getattr(config, 'enforce_fp32_head') else None,
force_dtype=torch.float32 if getattr(config, 'enforce_fp32_head', False) else None,
)

# build all decode layers
self.layers = nn.ModuleList([
@@ -271,7 +275,7 @@ def get_input_embeddings(self):
return self.embed_tokens


class InternLM3ForCausalLM(nn.Module, CudaGraphMixin):
class InternLM3ForCausalLM(nn.Module, DeployModelMixinV1, CudaGraphMixin):
"""Rewrote model of InternLM3ForCausalLM."""

packed_modules_mapping = {
@@ -297,11 +301,7 @@ def __init__(self,
# build InternLM3Model
self.model = InternLM3Model(config, dtype=dtype, device=device)
# build lm_head
self.lm_head = build_rowwise_linear(config.hidden_size,
config.vocab_size,
bias=False,
dtype=dtype,
device=device)
self.lm_head = self.build_lm_head(config.hidden_size, config.vocab_size, bias=False, dtype=dtype, device=device)

def forward(
self,
@@ -322,15 +322,6 @@ def forward(
)
return hidden_states

def update_weights(self):
"""Update weights."""
if self.config.tie_word_embeddings:
self.lm_head.weight = self.model.embed_tokens.weight

def get_logits(self, hidden_states: torch.Tensor):
"""Compute logits of the model output."""
return self.lm_head(hidden_states)

def get_input_embeddings(self):
"""Get input embeddings."""
return self.model.get_input_embeddings()
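The ParallelEmbedding(..., force_dtype=...) call sites above are what actually realize enforce_fp32_head for tied embeddings. ParallelEmbedding is not shown in this diff; the standalone sketch below is only a guess at the force_dtype semantics (fp32 weight storage, output cast back to the compute dtype) to make the intent of those call sites concrete.

```python
import torch
from torch import nn


class ForceDtypeEmbeddingSketch(nn.Module):
    """Illustration of the assumed force_dtype behavior, not the real
    lmdeploy.pytorch.nn.ParallelEmbedding (which is also TP-sharded)."""

    def __init__(self, num_embeddings, embedding_dim, padding_idx=None,
                 dtype=None, device=None, force_dtype=None):
        super().__init__()
        # With enforce_fp32_head the weight stays in float32 even when the
        # rest of the model runs in bf16/fp16, so a tied lm_head keeps
        # full-precision logits.
        weight_dtype = force_dtype if force_dtype is not None else dtype
        self.weight = nn.Parameter(
            torch.empty(num_embeddings, embedding_dim, dtype=weight_dtype, device=device))
        self.padding_idx = padding_idx
        self.out_dtype = dtype

    def forward(self, input_ids: torch.Tensor) -> torch.Tensor:
        out = nn.functional.embedding(input_ids, self.weight, padding_idx=self.padding_idx)
        # Whether the lookup result is cast back to the compute dtype is an
        # assumption; the fp32 storage is the part the call sites rely on.
        return out if self.out_dtype is None else out.to(self.out_dtype)
```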
10 changes: 5 additions & 5 deletions lmdeploy/pytorch/models/internvl.py
@@ -19,7 +19,7 @@

from .patch import build_model_from_hf_config
from .utils.cudagraph import CudaGraphMixin
from .utils.model import DeployModelMixin, vlm_model
from .utils.model import DeployModelMixinV1, vlm_model


class Gating(nn.Module):
Expand Down Expand Up @@ -444,7 +444,7 @@ def forward(
return last_hidden_state


class InternVLChatModel(nn.Module, DeployModelMixin, CudaGraphMixin):
class InternVLChatModel(nn.Module, DeployModelMixinV1, CudaGraphMixin):

def __init__(self,
config: PretrainedConfig,
@@ -801,9 +801,9 @@ def forward(
position_ids=position_ids,
attn_metadata=attn_metadata)

def get_logits(self, hidden_states: torch.Tensor):
"""Compute logits of the model output."""
return self.language_model.get_logits(hidden_states)
def get_lm_head(self):
"""Get lm_head."""
return self.language_model.get_lm_head()

def get_input_embeddings(self):
"""Get input embeddings."""
10 changes: 5 additions & 5 deletions lmdeploy/pytorch/models/internvl3_hf.py
@@ -20,7 +20,7 @@

from .patch import build_model_from_hf_config
from .utils.cudagraph import CudaGraphMixin
from .utils.model import DeployModelMixin, vlm_model
from .utils.model import DeployModelMixinV1, vlm_model


@torch.compile(dynamic=True)
Expand Down Expand Up @@ -439,7 +439,7 @@ def forward(self, image_features):
return hidden_states


class InternVLForConditionalGeneration(nn.Module, DeployModelMixin, CudaGraphMixin):
class InternVLForConditionalGeneration(nn.Module, DeployModelMixinV1, CudaGraphMixin):

def __init__(self,
config: PretrainedConfig,
@@ -485,9 +485,9 @@ def _mark_dynamic_once(self, pixel_values, dims):
torch._dynamo.mark_dynamic(pixel_values, dims)
self.has_compiled_vit = True

def get_logits(self, hidden_states: torch.Tensor):
"""Compute logits of the model output."""
return self.language_model.get_logits(hidden_states)
def get_lm_head(self):
"""Get lm_head."""
return self.language_model.get_lm_head()

def get_input_embeddings(self):
"""Get input embeddings."""
21 changes: 4 additions & 17 deletions lmdeploy/pytorch/models/phi3.py
@@ -8,12 +8,12 @@

from lmdeploy.pytorch.model_inputs import StepContext, StepContextManager
from lmdeploy.pytorch.nn import ApplyRotaryEmb, Attention, RMSNorm, SiluAndMul
from lmdeploy.pytorch.nn.linear import (build_down_linear, build_gateup_linear, build_o_proj, build_qkv_proj,
build_rowwise_linear)
from lmdeploy.pytorch.nn.linear import build_down_linear, build_gateup_linear, build_o_proj, build_qkv_proj
from lmdeploy.pytorch.nn.rotary_embedding import build_rotary_embedding_from_config
from lmdeploy.pytorch.weight_loader.model_weight_loader import load_weight

from .utils.cudagraph import CudaGraphMixin
from .utils.model import DeployModelMixinV1


class Phi3Attention(nn.Module):
@@ -272,7 +272,7 @@ def get_input_embeddings(self):
return self.embed_tokens


class Phi3ForCausalLM(nn.Module, CudaGraphMixin):
class Phi3ForCausalLM(nn.Module, DeployModelMixinV1, CudaGraphMixin):
"""ModelForCausalLM."""

packed_modules_mapping = {
@@ -293,11 +293,7 @@ def __init__(self,
# build model
self.model = Phi3Model(config, dtype=dtype, device=device)
# build lm_head
self.lm_head = build_rowwise_linear(config.hidden_size,
config.vocab_size,
bias=False,
dtype=dtype,
device=device)
self.lm_head = self.build_lm_head(config.hidden_size, config.vocab_size, bias=False, dtype=dtype, device=device)

def forward(
self,
@@ -318,15 +314,6 @@ def forward(
)
return hidden_states

def get_logits(self, hidden_states: torch.Tensor):
"""Compute logits of the model output."""
return self.lm_head(hidden_states)

def update_weights(self):
"""Update weights."""
if self.config.tie_word_embeddings:
self.lm_head.weight = self.model.embed_tokens.weight

def get_input_embeddings(self):
"""Get input embeddings."""
return self.model.get_input_embeddings()
11 changes: 3 additions & 8 deletions lmdeploy/pytorch/models/phi3_v.py
@@ -9,11 +9,10 @@
from lmdeploy.pytorch.engine.input_process import BaseModelInputProcessor, PreprocessInputResult
from lmdeploy.pytorch.model_inputs import StepContext, StepContextManager
from lmdeploy.pytorch.multimodal.data_type import MultiModalTensor
from lmdeploy.pytorch.nn.linear import build_rowwise_linear
from lmdeploy.pytorch.weight_loader.model_weight_loader import load_weight

from .phi3 import Phi3ForCausalLM, Phi3Model
from .utils.model import DeployModelMixin, vlm_model
from .utils.model import vlm_model

CLIP_VIT_LARGE_PATCH14_336_CONFIG = CLIPVisionConfig(attention_dropout=0.0,
dropout=0.0,
@@ -264,7 +263,7 @@ def forward(
)


class Phi3VForCausalLM(Phi3ForCausalLM, DeployModelMixin):
class Phi3VForCausalLM(Phi3ForCausalLM):

def __init__(self,
config: PretrainedConfig,
@@ -277,11 +276,7 @@ def __init__(self,
# build model
self.model = Phi3VModel(config, dtype=dtype, device=device)
# build lm_head
self.lm_head = build_rowwise_linear(config.hidden_size,
config.vocab_size,
bias=False,
dtype=dtype,
device=device)
self.lm_head = self.build_lm_head(config.hidden_size, config.vocab_size, bias=False, dtype=dtype, device=device)

self.input_processor = Phi3VInputProcessor(config, dtype)
