
FEAT: chatglm3 (xorbitsai#587)
UranusSeven authored Oct 30, 2023
1 parent 919ffaa commit 91ee9a0
Showing 9 changed files with 245 additions and 0 deletions.
1 change: 1 addition & 0 deletions README.md
@@ -30,6 +30,7 @@ potential of cutting-edge AI models.
- Support grammar-based sampling for ggml models: [#525](https://github.com/xorbitsai/inference/pull/525)
- Incorporate vLLM: [#445](https://github.com/xorbitsai/inference/pull/445)
### New Models
- Built-in support for [chatglm3](https://huggingface.co/THUDM/chatglm3-6b)
- Built-in support for [mistral-v0.1](https://huggingface.co/mistralai/Mistral-7B-v0.1) and [mistral-instruct-v0.1](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1): [#510](https://github.com/xorbitsai/inference/pull/510)
### Integrations
- [Dify](https://docs.dify.ai/advanced/model-configuration/xinference): an LLMOps platform that enables developers (and even non-developers) to quickly build useful applications based on large language models, ensuring they are visual, operable, and improvable.
1 change: 1 addition & 0 deletions README_zh_CN.md
@@ -27,6 +27,7 @@ Xorbits Inference (Xinference) is a powerful and full-featured distributed
- Speculative sampling: [#509](https://github.com/xorbitsai/inference/pull/509)
- Incorporate vLLM: [#445](https://github.com/xorbitsai/inference/pull/445)
### New Models
- Built-in support for [chatglm3](https://huggingface.co/THUDM/chatglm3-6b)
- Built-in support for [mistral-v0.1](https://huggingface.co/mistralai/Mistral-7B-v0.1) and [mistral-instruct-v0.1](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1): [#510](https://github.com/xorbitsai/inference/pull/510)
### Integrations
- [Dify](https://docs.dify.ai/advanced/model-configuration/xinference): an LLMOps platform covering the development, deployment, maintenance, and optimization of large language models.
32 changes: 32 additions & 0 deletions doc/source/models/builtin/chatglm3-32k.rst
@@ -0,0 +1,32 @@
.. _models_builtin_chatglm3_32k:


============
ChatGLM3-32K
============

- **Context Length:** 32768
- **Model Name:** chatglm3-32k
- **Languages:** en, zh
- **Abilities:** chat
- **Description:** ChatGLM3 is the third generation of ChatGLM, still open-source and trained on Chinese and English data.

Specifications
^^^^^^^^^^^^^^

Model Spec (pytorch, 6 Billion)
+++++++++++++++++++++++++++++++

- **Model Format:** pytorch
- **Model Size (in billions):** 6
- **Quantizations:** 4-bit, 8-bit, none
- **Model ID:** THUDM/chatglm3-6b-32k

Execute the following command to launch the model; remember to replace `${quantization}` with your
chosen quantization method from the options listed above::

xinference launch --model-name chatglm3-32k --size-in-billions 6 --model-format pytorch --quantization ${quantization}

.. note::

4-bit quantization is not supported on macOS.
32 changes: 32 additions & 0 deletions doc/source/models/builtin/chatglm3.rst
@@ -0,0 +1,32 @@
.. _models_builtin_chatglm3:


========
ChatGLM3
========

- **Context Length:** 8192
- **Model Name:** chatglm3
- **Languages:** en, zh
- **Abilities:** chat
- **Description:** ChatGLM3 is the third generation of ChatGLM, still open-source and trained on Chinese and English data.

Specifications
^^^^^^^^^^^^^^

Model Spec (pytorch, 6 Billion)
+++++++++++++++++++++++++++++++

- **Model Format:** pytorch
- **Model Size (in billions):** 6
- **Quantizations:** 4-bit, 8-bit, none
- **Model ID:** THUDM/chatglm3-6b

Execute the following command to launch the model; remember to replace `${quantization}` with your
chosen quantization method from the options listed above::

xinference launch --model-name chatglm3 --size-in-billions 6 --model-format pytorch --quantization ${quantization}

.. note::

4-bit quantization is not supported on macOS.
4 changes: 4 additions & 0 deletions doc/source/models/builtin/index.rst
@@ -27,6 +27,8 @@ Chat & Instruction-following Models
- :ref:`ChatGLM <models_builtin_chatglm>`
- :ref:`ChatGLM2 <models_builtin_chatglm2>`
- :ref:`ChatGLM2-32k <models_builtin_chatglm2_32k>`
- :ref:`ChatGLM3 <models_builtin_chatglm3>`
- :ref:`ChatGLM3-32k <models_builtin_chatglm3_32k>`
- :ref:`CodeLlama-Instruct <models_builtin_code_llama_instruct>`
- :ref:`Falcon Instruct <models_builtin_falcon_instruct>`
- :ref:`InternLM Chat <models_builtin_internlm_chat>`
@@ -69,6 +71,8 @@ Code Assistant Models
chatglm
chatglm2-32k
chatglm2
chatglm3-32k
chatglm3
code-llama
code-llama-instruct
code-llama-python
68 changes: 68 additions & 0 deletions xinference/model/llm/llm_family.json
@@ -481,6 +481,74 @@
"intra_message_sep": "\n\n"
}
},
{
"version": 1,
"context_length": 8192,
"model_name": "chatglm3",
"model_lang": [
"en",
"zh"
],
"model_ability": [
"chat"
],
"model_description": "ChatGLM3 is the third generation of ChatGLM, still open-source and trained on Chinese and English data.",
"model_specs": [
{
"model_format": "pytorch",
"model_size_in_billions": 6,
"quantizations": [
"4-bit",
"8-bit",
"none"
],
"model_id": "THUDM/chatglm3-6b",
"model_revision": "fc3235f807ef5527af598c05f04f2ffd17f48bab"
}
],
"prompt_style": {
"style_name": "CHATGLM3",
"system_prompt": "",
"roles": [
"user",
"assistant"
]
}
},
{
"version": 1,
"context_length": 32768,
"model_name": "chatglm3-32k",
"model_lang": [
"en",
"zh"
],
"model_ability": [
"chat"
],
"model_description": "ChatGLM3 is the third generation of ChatGLM, still open-source and trained on Chinese and English data.",
"model_specs": [
{
"model_format": "pytorch",
"model_size_in_billions": 6,
"quantizations": [
"4-bit",
"8-bit",
"none"
],
"model_id": "THUDM/chatglm3-6b-32k",
"model_revision": "339f17ff464d47b5077527c2b34e80a7719ede3e"
}
],
"prompt_style": {
"style_name": "CHATGLM3",
"system_prompt": "",
"roles": [
"user",
"assistant"
]
}
},
{
"version": 1,
"context_length": 4096,
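Each new registry entry above follows the same family schema: a model name, languages, abilities, a list of hardware/format specs, and a prompt style. As an illustration only — `match_spec` below is a hypothetical helper sketched for this commit's JSON shape, not part of the xinference codebase — resolving a `xinference launch` request against such an entry might look like:

```python
# Sketch: selecting a model spec from a family entry shaped like the
# chatglm3 JSON registered above. match_spec is a hypothetical helper,
# not an actual xinference API.
family = {
    "model_name": "chatglm3",
    "context_length": 8192,
    "model_specs": [
        {
            "model_format": "pytorch",
            "model_size_in_billions": 6,
            "quantizations": ["4-bit", "8-bit", "none"],
            "model_id": "THUDM/chatglm3-6b",
        }
    ],
}

def match_spec(family, model_format, size_in_billions, quantization):
    """Return the first spec matching the requested format and size
    that also offers the requested quantization, or None."""
    for spec in family["model_specs"]:
        if (
            spec["model_format"] == model_format
            and spec["model_size_in_billions"] == size_in_billions
            and quantization in spec["quantizations"]
        ):
            return spec
    return None

spec = match_spec(family, "pytorch", 6, "4-bit")
```

Here `spec` resolves to the `THUDM/chatglm3-6b` entry, while an unsupported quantization (say `"2-bit"`) yields `None`.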
70 changes: 70 additions & 0 deletions xinference/model/llm/llm_family_modelscope.json
@@ -273,6 +273,76 @@
"intra_message_sep": "\n\n"
}
},
{
"version": 1,
"context_length": 8192,
"model_name": "chatglm3",
"model_lang": [
"en",
"zh"
],
"model_ability": [
"chat"
],
"model_description": "ChatGLM3 is the third generation of ChatGLM, still open-source and trained on Chinese and English data.",
"model_specs": [
{
"model_format": "pytorch",
"model_size_in_billions": 6,
"quantizations": [
"4-bit",
"8-bit",
"none"
],
"model_hub": "modelscope",
"model_id": "ZhipuAI/chatglm3-6b",
"model_revision": "v1.0.0"
}
],
"prompt_style": {
"style_name": "CHATGLM3",
"system_prompt": "",
"roles": [
"user",
"assistant"
]
}
},
{
"version": 1,
"context_length": 32768,
"model_name": "chatglm3-32k",
"model_lang": [
"en",
"zh"
],
"model_ability": [
"chat"
],
"model_description": "ChatGLM3 is the third generation of ChatGLM, still open-source and trained on Chinese and English data.",
"model_specs": [
{
"model_format": "pytorch",
"model_size_in_billions": 6,
"quantizations": [
"4-bit",
"8-bit",
"none"
],
"model_hub": "modelscope",
"model_id": "ZhipuAI/chatglm3-6b-32k",
"model_revision": "v1.0.0"
}
],
"prompt_style": {
"style_name": "CHATGLM3",
"system_prompt": "",
"roles": [
"user",
"assistant"
]
}
},
{
"version": 1,
"context_length": 8192,
23 changes: 23 additions & 0 deletions xinference/model/llm/tests/test_utils.py
@@ -216,6 +216,29 @@ def test_prompt_style_chatglm_v2():
)


def test_prompt_style_chatglm_v3():
prompt_style = PromptStyleV1(
style_name="CHATGLM3",
system_prompt="",
roles=["user", "assistant"],
)
chat_history = [
ChatCompletionMessage(role=prompt_style.roles[0], content="Hi there."),
ChatCompletionMessage(
role=prompt_style.roles[1], content="Hello, how may I help you?"
),
]
expected = (
"<|user|> \n Hi there."
"<|assistant|> \n Hello, how may I help you?"
"<|user|> \n Write a poem."
"<|assistant|>"
)
assert expected == ChatModelMixin.get_prompt(
"Write a poem.", chat_history, prompt_style
)


def test_prompt_style_qwen():
prompt_style = PromptStyleV1(
style_name="QWEN",
14 changes: 14 additions & 0 deletions xinference/model/llm/utils.py
@@ -121,6 +121,20 @@ def get_prompt(
else:
ret += role + ":"
return ret
elif prompt_style.style_name == "CHATGLM3":
ret = (
f"<|system|> \n {prompt_style.system_prompt}"
if prompt_style.system_prompt
else ""
)
for i, message in enumerate(chat_history):
role = message["role"]
content = message["content"]
if content:
ret += f"<|{role}|> \n {content}"
else:
ret += f"<|{role}|>"
return ret
elif prompt_style.style_name == "QWEN":
ret = f"<|im_start|>system\n{prompt_style.system_prompt}<|im_end|>"
for message in chat_history:
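The new `CHATGLM3` branch wraps each turn in `<|role|>` tags; when a message has empty content, only the bare tag is emitted, which is how the trailing `<|assistant|>` cues the model to generate. A self-contained sketch of the same assembly (`build_chatglm3_prompt` is an illustrative stand-in for the mixin's `get_prompt`, which appends the new user message and an empty assistant message before formatting):

```python
# Standalone sketch of the CHATGLM3 prompt assembly added in utils.py.
def build_chatglm3_prompt(prompt, chat_history, system_prompt=""):
    """Concatenate '<|role|> \\n content' segments for each turn,
    ending with a bare '<|assistant|>' to cue generation."""
    history = chat_history + [
        {"role": "user", "content": prompt},
        {"role": "assistant", "content": ""},  # empty -> bare tag
    ]
    ret = f"<|system|> \n {system_prompt}" if system_prompt else ""
    for message in history:
        role, content = message["role"], message["content"]
        ret += f"<|{role}|> \n {content}" if content else f"<|{role}|>"
    return ret

history = [
    {"role": "user", "content": "Hi there."},
    {"role": "assistant", "content": "Hello, how may I help you?"},
]
result = build_chatglm3_prompt("Write a poem.", history)
# result reproduces the string checked by test_prompt_style_chatglm_v3.
```

This mirrors the expected string in the new unit test: two formatted history turns, the formatted new user turn, then a bare `<|assistant|>`.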
