Merged

Changes from 71 commits (79 commits total)
0428a86 - init (zRzRzRzRzRzRzR, Nov 6, 2025)
621de8d - update (zRzRzRzRzRzRzR, Nov 6, 2025)
bf860ed - add (zRzRzRzRzRzRzR, Nov 6, 2025)
480caa4 - Update video_processing_glm46v.py (zRzRzRzRzRzRzR, Nov 7, 2025)
9a64f65 - Merge branch 'huggingface:main' into glm-v (zRzRzRzRzRzRzR, Nov 7, 2025)
fd7a3f0 - update doc (zRzRzRzRzRzRzR, Nov 7, 2025)
c137191 - Merge branch 'glm-v' of github.com:zRzRzRzRzRzRzR/transformers into g… (zRzRzRzRzRzRzR, Nov 7, 2025)
0e1c22e - Update modular_glm46v.py (zRzRzRzRzRzRzR, Nov 7, 2025)
42d757f - 2 (zRzRzRzRzRzRzR, Nov 9, 2025)
28bb922 - Merge branch 'huggingface:main' into glm-v (zRzRzRzRzRzRzR, Nov 9, 2025)
db33336 - Merge branch 'huggingface:main' into glm-v (zRzRzRzRzRzRzR, Nov 10, 2025)
6f5aa1a - Update processing_glm46v.py (zRzRzRzRzRzRzR, Nov 11, 2025)
a43c606 - 21 (zRzRzRzRzRzRzR, Nov 11, 2025)
3e994f5 - Merge branch 'huggingface:main' into glm-v (zRzRzRzRzRzRzR, Nov 11, 2025)
a242652 - Update check_repo.py (zRzRzRzRzRzRzR, Nov 11, 2025)
5225f53 - Merge branch 'glm-v' of github.com:zRzRzRzRzRzRzR/transformers into g… (zRzRzRzRzRzRzR, Nov 11, 2025)
ce596eb - Update check_repo.py (zRzRzRzRzRzRzR, Nov 11, 2025)
559fcf8 - Update test_processor_glm46v.py (zRzRzRzRzRzRzR, Nov 11, 2025)
513c2cc - Update modeling_auto.py (zRzRzRzRzRzRzR, Nov 11, 2025)
d6e966e - update (zRzRzRzRzRzRzR, Nov 11, 2025)
275ebfe - Update glm46v.md (zRzRzRzRzRzRzR, Nov 11, 2025)
6991564 - Update configuration_auto.py (zRzRzRzRzRzRzR, Nov 11, 2025)
3dff216 - 2 (zRzRzRzRzRzRzR, Nov 11, 2025)
f9546bd - update with glm46v import (zRzRzRzRzRzRzR, Nov 11, 2025)
58bed84 - Merge branch 'huggingface:main' into glm-v (zRzRzRzRzRzRzR, Nov 11, 2025)
a5d95fc - Merge branch 'huggingface:main' into glm-v (zRzRzRzRzRzRzR, Nov 12, 2025)
6e5ae03 - uppercase (zRzRzRzRzRzRzR, Nov 12, 2025)
1e2535d - upload (zRzRzRzRzRzRzR, Nov 12, 2025)
8ae004f - upload (zRzRzRzRzRzRzR, Nov 12, 2025)
c1425c3 - upload with modular (zRzRzRzRzRzRzR, Nov 12, 2025)
9e184af - 1 (zRzRzRzRzRzRzR, Nov 12, 2025)
0bd1a50 - - (zRzRzRzRzRzRzR, Nov 12, 2025)
1f74fa7 - update (zRzRzRzRzRzRzR, Nov 12, 2025)
b9c8484 - 1 (zRzRzRzRzRzRzR, Nov 12, 2025)
7c92ad7 - 2 (zRzRzRzRzRzRzR, Nov 12, 2025)
5552ff2 - 1 (zRzRzRzRzRzRzR, Nov 12, 2025)
f7bfc34 - 2 (zRzRzRzRzRzRzR, Nov 12, 2025)
0376e22 - 2 (zRzRzRzRzRzRzR, Nov 12, 2025)
d03061d - 1 (zRzRzRzRzRzRzR, Nov 12, 2025)
c2033b4 - update config (zRzRzRzRzRzRzR, Nov 12, 2025)
79eef09 - 1 (zRzRzRzRzRzRzR, Nov 12, 2025)
514dec8 - update as automoel (zRzRzRzRzRzRzR, Nov 12, 2025)
3ceff09 - 1 (zRzRzRzRzRzRzR, Nov 12, 2025)
57b1b34 - Merge branch 'huggingface:main' into glm-v (zRzRzRzRzRzRzR, Nov 12, 2025)
63dafb1 - try remove (zRzRzRzRzRzRzR, Nov 12, 2025)
900c335 - delete (zRzRzRzRzRzRzR, Nov 12, 2025)
1c8905d - delete (zRzRzRzRzRzRzR, Nov 12, 2025)
9c2d854 - test (zRzRzRzRzRzRzR, Nov 12, 2025)
beedf50 - update (zRzRzRzRzRzRzR, Nov 13, 2025)
efe6495 - 1 (zRzRzRzRzRzRzR, Nov 13, 2025)
c811264 - Update modular_glm46v.py (zRzRzRzRzRzRzR, Nov 13, 2025)
dd4dc1f - Update test_modeling_glm46v.py (zRzRzRzRzRzRzR, Nov 13, 2025)
b79487d - update 1513 (zRzRzRzRzRzRzR, Nov 13, 2025)
e5b4a6d - 1 (zRzRzRzRzRzRzR, Nov 13, 2025)
325216e - use PreTrainedConfig (zRzRzRzRzRzRzR, Nov 13, 2025)
b08437d - Update modular_glm46v.py (zRzRzRzRzRzRzR, Nov 13, 2025)
87f1887 - Update configuration_glm46v.py (zRzRzRzRzRzRzR, Nov 13, 2025)
ed88670 - model_type = "glm46v" (zRzRzRzRzRzRzR, Nov 13, 2025)
25b36eb - remove glm46v_text (zRzRzRzRzRzRzR, Nov 13, 2025)
733d27b - Update image_processing_auto.py (zRzRzRzRzRzRzR, Nov 13, 2025)
ee29a2b - 1 (zRzRzRzRzRzRzR, Nov 13, 2025)
6dac0c4 - update readme (zRzRzRzRzRzRzR, Nov 13, 2025)
3910b8a - GLM-4.6V (zRzRzRzRzRzRzR, Nov 13, 2025)
7c160f0 - update (zRzRzRzRzRzRzR, Nov 13, 2025)
b0654d9 - update (zRzRzRzRzRzRzR, Nov 13, 2025)
b94cc13 - Update __init__.py (zRzRzRzRzRzRzR, Nov 13, 2025)
ca310ee - update (zRzRzRzRzRzRzR, Nov 13, 2025)
92fa57b - update doc (zRzRzRzRzRzRzR, Nov 13, 2025)
c9f260c - Update check_docstrings.py (zRzRzRzRzRzRzR, Nov 13, 2025)
5ca3144 - update doc (zRzRzRzRzRzRzR, Nov 13, 2025)
cdd9040 - Merge branch 'main' into glm-v (zucchini-nlp, Nov 13, 2025)
8b61f6b - Merge branch 'main' into glm-v (ArthurZucker, Nov 14, 2025)
6d325c3 - fix copies for tied weight keys! (ArthurZucker, Nov 14, 2025)
fef665e - more fixup (ArthurZucker, Nov 14, 2025)
ee546a1 - Merge branch 'main' into glm-v (ArthurZucker, Nov 14, 2025)
e23d221 - Merge branch 'main' into glm-v (ArthurZucker, Nov 15, 2025)
0170755 - fix copies? (ArthurZucker, Nov 15, 2025)
54a2196 - more fix copies (ArthurZucker, Nov 15, 2025)
0fd529a - Up (ArthurZucker, Nov 15, 2025)
2 changes: 2 additions & 0 deletions docs/source/en/_toctree.yml
Original file line number Diff line number Diff line change
@@ -1066,6 +1066,8 @@
title: Gemma3n
- local: model_doc/git
title: GIT
- local: model_doc/glm46v
title: Glm46V
- local: model_doc/glm4v
title: glm4v
- local: model_doc/glm4v_moe
34 changes: 34 additions & 0 deletions docs/source/en/model_doc/glm46v.md
@@ -0,0 +1,34 @@
# GLM-4.6V

> **Member** commented: let's add something in the docs, usage examples and model description for example. Or do you want to wait until model is released first?

> **Contributor Author** replied: Yes, a technical report will be added here once the model is ready.

## Glm46VConfig

[[autodoc]] Glm46VConfig

## Glm46VImageProcessor

[[autodoc]] Glm46VImageProcessor
- preprocess

## Glm46VVideoProcessor

[[autodoc]] Glm46VVideoProcessor
- preprocess

## Glm46VImageProcessorFast

[[autodoc]] Glm46VImageProcessorFast
- preprocess

## Glm46VProcessor

[[autodoc]] Glm46VProcessor

## Glm46VModel

[[autodoc]] Glm46VModel
- forward

## Glm46VForConditionalGeneration

[[autodoc]] Glm46VForConditionalGeneration
- forward
10 changes: 10 additions & 0 deletions docs/source/en/model_doc/glm4v.md
@@ -170,6 +170,11 @@ print(output_text)

[[autodoc]] Glm4vConfig


## Glm4vVisionConfig

[[autodoc]] Glm4vVisionConfig

## Glm4vTextConfig

[[autodoc]] Glm4vTextConfig
@@ -193,6 +198,11 @@ print(output_text)

[[autodoc]] Glm4vProcessor

## Glm4vVisionModel

[[autodoc]] Glm4vVisionModel
- forward

## Glm4vTextModel

[[autodoc]] Glm4vTextModel
14 changes: 12 additions & 2 deletions docs/source/en/model_doc/glm4v_moe.md
@@ -22,7 +22,7 @@ rendered properly in your Markdown viewer.
<img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white"> </div>
</div>

# Glm4vMoe
# Glm4vMoeMoe

## Overview

@@ -48,10 +48,20 @@ The model also introduces a **Thinking Mode** switch, allowing users to balance

[[autodoc]] Glm4vMoeConfig


## Glm4vMoeVisionConfig

[[autodoc]] Glm4vMoeVisionConfig

## Glm4vMoeTextConfig

[[autodoc]] Glm4vMoeTextConfig

## Glm4vMoeVisionModel

[[autodoc]] Glm4vMoeVisionModel
- forward

## Glm4vMoeTextModel

[[autodoc]] Glm4vMoeTextModel
@@ -65,4 +75,4 @@ The model also introduces a **Thinking Mode** switch, allowing users to balance
## Glm4vMoeForConditionalGeneration

[[autodoc]] Glm4vMoeForConditionalGeneration
- forward
- forward
3 changes: 3 additions & 0 deletions src/transformers/models/__init__.py
@@ -142,6 +142,9 @@
from .git import *
from .glm import *
from .glm4 import *
from .glm4v import *
from .glm4v_moe import *
from .glm46v import *
from .glpn import *
from .got_ocr2 import *
from .gpt2 import *
8 changes: 8 additions & 0 deletions src/transformers/models/auto/configuration_auto.py
@@ -172,11 +172,14 @@
("git", "GitConfig"),
("glm", "GlmConfig"),
("glm4", "Glm4Config"),
("glm46v", "Glm46VConfig"),
("glm4_moe", "Glm4MoeConfig"),
("glm4v", "Glm4vConfig"),
("glm4v_moe", "Glm4vMoeConfig"),
("glm4v_moe_text", "Glm4vMoeTextConfig"),
("glm4v_moe_vision", "Glm4vMoeVisionConfig"),
("glm4v_text", "Glm4vTextConfig"),
("glm4v_vision", "Glm4vVisionConfig"),
("glpn", "GLPNConfig"),
("got_ocr2", "GotOcr2Config"),
("gpt-sw3", "GPT2Config"),
@@ -620,11 +623,14 @@
("git", "GIT"),
("glm", "GLM"),
("glm4", "GLM4"),
("glm46v", "Glm46V"),
("glm4_moe", "Glm4MoE"),
("glm4v", "GLM4V"),
("glm4v_moe", "GLM4VMOE"),
("glm4v_moe_text", "GLM4VMOE"),
("glm4v_moe_vision", "Glm4vMoeVisionModel"),
("glm4v_text", "GLM4V"),
("glm4v_vision", "Glm4vVisionModel"),
("glpn", "GLPN"),
("got_ocr2", "GOT-OCR2"),
("gpt-sw3", "GPT-Sw3"),
@@ -983,6 +989,8 @@
("gemma3n_audio", "gemma3n"),
("gemma3n_text", "gemma3n"),
("gemma3n_vision", "gemma3n"),
("glm4v_vision", "glm4v"),
("glm4v_moe_vision", "glm4v_moe"),
("glm4v_text", "glm4v"),
("glm4v_moe_text", "glm4v_moe"),
("idefics3_vision", "idefics3"),
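The last hunk of `configuration_auto.py` above also maps sub-model types (such as `glm4v_vision`) back to their parent model type, so that sub-configs resolve docs and checkpoints via their parent. A minimal sketch of how such a lookup behaves; the dict and helper function here are illustrative stand-ins, not the real transformers internals:

```python
# Toy version of the sub-model -> parent-model table added in the hunk above.
# Entries not present are assumed to already be top-level model types.
SUBMODEL_TO_PARENT = {
    "glm4v_vision": "glm4v",
    "glm4v_text": "glm4v",
    "glm4v_moe_vision": "glm4v_moe",
    "glm4v_moe_text": "glm4v_moe",
}

def parent_model_type(model_type: str) -> str:
    """Resolve a sub-model type to its parent; top-level types map to themselves."""
    return SUBMODEL_TO_PARENT.get(model_type, model_type)

assert parent_model_type("glm4v_vision") == "glm4v"
assert parent_model_type("glm46v") == "glm46v"  # already top-level
```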
1 change: 1 addition & 0 deletions src/transformers/models/auto/image_processing_auto.py
@@ -109,6 +109,7 @@
("gemma3", ("Gemma3ImageProcessor", "Gemma3ImageProcessorFast")),
("gemma3n", ("SiglipImageProcessor", "SiglipImageProcessorFast")),
("git", ("CLIPImageProcessor", "CLIPImageProcessorFast")),
("glm46v", ("Glm46VImageProcessor", "Glm46VImageProcessorFast")),
("glm4v", ("Glm4vImageProcessor", "Glm4vImageProcessorFast")),
("glpn", ("GLPNImageProcessor", "GLPNImageProcessorFast")),
("got_ocr2", ("GotOcr2ImageProcessor", "GotOcr2ImageProcessorFast")),
4 changes: 4 additions & 0 deletions src/transformers/models/auto/modeling_auto.py
@@ -175,11 +175,14 @@ class _BaseModelWithGenerate(PreTrainedModel, GenerationMixin):
("git", "GitModel"),
("glm", "GlmModel"),
("glm4", "Glm4Model"),
("glm46v", "Glm46VModel"),
("glm4_moe", "Glm4MoeModel"),
("glm4v", "Glm4vModel"),
("glm4v_moe", "Glm4vMoeModel"),
("glm4v_moe_text", "Glm4vMoeTextModel"),
("glm4v_moe_vision", "Glm4vMoeVisionModel"),
("glm4v_text", "Glm4vTextModel"),
("glm4v_vision", "Glm4vVisionModel"),
("glpn", "GLPNModel"),
("got_ocr2", "GotOcr2Model"),
("gpt-sw3", "GPT2Model"),
@@ -1032,6 +1035,7 @@ class _BaseModelWithGenerate(PreTrainedModel, GenerationMixin):
("gemma3", "Gemma3ForConditionalGeneration"),
("gemma3n", "Gemma3nForConditionalGeneration"),
("git", "GitForCausalLM"),
("glm46v", "Glm46VForConditionalGeneration"),
("glm4v", "Glm4vForConditionalGeneration"),
("glm4v_moe", "Glm4vMoeForConditionalGeneration"),
("got_ocr2", "GotOcr2ForConditionalGeneration"),
1 change: 1 addition & 0 deletions src/transformers/models/auto/processing_auto.py
@@ -75,6 +75,7 @@
("gemma3", "Gemma3Processor"),
("gemma3n", "Gemma3nProcessor"),
("git", "GitProcessor"),
("glm46v", "Glm46VProcessor"),
("glm4v", "Glm4vProcessor"),
("glm4v_moe", "Glm4vProcessor"),
("got_ocr2", "GotOcr2Processor"),
1 change: 1 addition & 0 deletions src/transformers/models/auto/tokenization_auto.py
@@ -308,6 +308,7 @@
("git", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)),
("glm", (None, "PreTrainedTokenizerFast" if is_tokenizers_available() else None)),
("glm4", (None, "PreTrainedTokenizerFast" if is_tokenizers_available() else None)),
("glm46v", (None, "PreTrainedTokenizerFast" if is_tokenizers_available() else None)),
("glm4_moe", (None, "PreTrainedTokenizerFast" if is_tokenizers_available() else None)),
("glm4v", (None, "PreTrainedTokenizerFast" if is_tokenizers_available() else None)),
("glm4v_moe", (None, "PreTrainedTokenizerFast" if is_tokenizers_available() else None)),
1 change: 1 addition & 0 deletions src/transformers/models/auto/video_processing_auto.py
@@ -53,6 +53,7 @@
else:
VIDEO_PROCESSOR_MAPPING_NAMES = OrderedDict(
[
("glm46v", "Glm46VVideoProcessor"),
("glm4v", "Glm4vVideoProcessor"),
("instructblip", "InstructBlipVideoVideoProcessor"),
("instructblipvideo", "InstructBlipVideoVideoProcessor"),
31 changes: 31 additions & 0 deletions src/transformers/models/glm46v/__init__.py
@@ -0,0 +1,31 @@
# Copyright 2025 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import TYPE_CHECKING

from ...utils import _LazyModule
from ...utils.import_utils import define_import_structure


if TYPE_CHECKING:
from .configuration_glm46v import *
from .image_processing_glm46v import *
from .image_processing_glm46v_fast import *
from .modeling_glm46v import *
from .processing_glm46v import *
from .video_processing_glm46v import *
else:
import sys

_file = globals()["__file__"]
sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
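The `__init__.py` above routes all submodule imports through `_LazyModule`, so nothing is imported until an attribute is first accessed. A toy sketch of the same lazy-attribute idea (PEP 562 style), using only the standard library; `make_lazy_module` is an illustrative helper, not the real `_LazyModule`:

```python
import importlib
import sys
import types

# Toy version of the lazy-import pattern: submodules listed in the map are
# imported on first attribute access, not when the package itself is imported.
def make_lazy_module(name, submodule_map):
    """submodule_map: attribute name -> fully qualified module to import lazily."""
    mod = types.ModuleType(name)

    def __getattr__(attr):
        if attr in submodule_map:
            # The real import happens here, on first access (PEP 562 fallback).
            target = importlib.import_module(submodule_map[attr])
            setattr(mod, attr, target)  # cache so later accesses skip __getattr__
            return target
        raise AttributeError(f"module {name!r} has no attribute {attr!r}")

    mod.__getattr__ = __getattr__
    return mod

lazy = make_lazy_module("toy_pkg", {"json_backend": "json"})
assert "json_backend" not in lazy.__dict__   # not imported yet
backend = lazy.json_backend                  # triggers the import
assert backend.loads("[1, 2]") == [1, 2]
```

The real `_LazyModule` additionally replaces the entry in `sys.modules` and derives its attribute map from the package's import structure, but the access-time import is the same mechanism.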
106 changes: 106 additions & 0 deletions src/transformers/models/glm46v/configuration_glm46v.py
@@ -0,0 +1,106 @@
# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
# This file was automatically generated from src/transformers/models/glm46v/modular_glm46v.py.
# Do NOT edit this file manually as any edits will be overwritten by the generation of
# the file from the modular. If any change should be done, please apply the change to the
# modular_glm46v.py file directly. One of our CI enforces this.
# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
# coding=utf-8
# Copyright 2025 the HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


from ...configuration_utils import PreTrainedConfig
from ..auto import CONFIG_MAPPING, AutoConfig


class Glm46VConfig(PreTrainedConfig):
r"""
This is the configuration class to store the configuration of a [`Glm46VModel`]. It is used to instantiate a
GLM-4.6V model according to the specified arguments, defining the model architecture. Instantiating a
configuration with the defaults will yield a similar configuration to that of
GLM-4.1V-9B-Thinking [zai-org/GLM-4.1V-9B-Thinking](https://huggingface.co/zai-org/GLM-4.1V-9B-Thinking).

Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PreTrainedConfig`] for more information.

Args:
text_config (`Union[PreTrainedConfig, dict]`, *optional*, defaults to `Glm4vTextConfig`):
The config object or dictionary of the text backbone.
vision_config (`Union[PreTrainedConfig, dict]`, *optional*, defaults to `Glm4vVisionConfig`):
The config object or dictionary of the vision backbone.
image_token_id (`int`, *optional*, defaults to 151343):
The image token index to encode the image prompt.
video_token_id (`int`, *optional*, defaults to 151344):
The video token index to encode the video prompt.
image_start_token_id (`int`, *optional*, defaults to 151339):
The image start token index to encode the start of image.
image_end_token_id (`int`, *optional*, defaults to 151340):
The image end token index to encode the end of image.
video_start_token_id (`int`, *optional*, defaults to 151361):
The video start token index to encode the start of video.
video_end_token_id (`int`, *optional*, defaults to 151362):
The video end token index to encode the end of video.

```python
>>> from transformers import Glm46VForConditionalGeneration, Glm46VConfig

>>> # Initializing a GLM-4.6V style configuration
>>> configuration = Glm46VConfig()

>>> # Initializing a model from the GLM-4.6V style configuration
>>> model = Glm46VForConditionalGeneration(configuration)

>>> # Accessing the model configuration
>>> configuration = model.config
```"""

model_type = "glm46v"
sub_configs = {"text_config": AutoConfig, "vision_config": AutoConfig}
keys_to_ignore_at_inference = ["past_key_values"]

def __init__(
self,
text_config=None,
vision_config=None,
image_token_id=151343,
video_token_id=151344,
image_start_token_id=151339,
image_end_token_id=151340,
video_start_token_id=151361,
video_end_token_id=151362,
**kwargs,
):
if isinstance(vision_config, dict):
vision_config["model_type"] = vision_config.get("model_type", "glm4v_vision")
self.vision_config = CONFIG_MAPPING[vision_config["model_type"]](**vision_config)
elif vision_config is None:
self.vision_config = CONFIG_MAPPING["glm4v_vision"]()

if isinstance(text_config, dict):
text_config["model_type"] = text_config.get("model_type", "glm4v_text")
self.text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config)
elif text_config is None:
self.text_config = CONFIG_MAPPING["glm4v_text"]()

self.image_token_id = image_token_id
self.video_token_id = video_token_id
self.video_start_token_id = video_start_token_id
self.video_end_token_id = video_end_token_id
self.image_start_token_id = image_start_token_id
self.image_end_token_id = image_end_token_id

super().__init__(**kwargs)


__all__ = ["Glm46VConfig"]
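`Glm46VConfig.__init__` above accepts `vision_config` and `text_config` either as a plain dict or as `None`, normalizing both into config objects via the registry. A toy sketch of that normalization pattern; `ToyVisionConfig` and `TOY_CONFIG_MAPPING` are illustrative stand-ins for the real `CONFIG_MAPPING`:

```python
# Toy sketch of the dict / None normalization done in Glm46VConfig.__init__.
class ToyVisionConfig:
    def __init__(self, hidden_size=64, model_type="glm4v_vision"):
        self.hidden_size = hidden_size
        self.model_type = model_type

TOY_CONFIG_MAPPING = {"glm4v_vision": ToyVisionConfig}

def resolve_vision_config(vision_config):
    if isinstance(vision_config, dict):
        # A dict may name its own model_type; default to the GLM-4V vision type,
        # mirroring vision_config.get("model_type", "glm4v_vision") above.
        vision_config["model_type"] = vision_config.get("model_type", "glm4v_vision")
        return TOY_CONFIG_MAPPING[vision_config["model_type"]](**vision_config)
    if vision_config is None:
        # Nothing supplied: build the default config for the default type.
        return TOY_CONFIG_MAPPING["glm4v_vision"]()
    return vision_config  # already a config instance

assert resolve_vision_config(None).hidden_size == 64
assert resolve_vision_config({"hidden_size": 128}).hidden_size == 128
```

This lets checkpoints serialize sub-configs as nested dicts in `config.json` while users can still pass ready-made config objects programmatically.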