From cd96f1e42907031a3728045dd98051dd0586548e Mon Sep 17 00:00:00 2001
From: Minamiyama <minamiyama@qq.com>
Date: Fri, 2 May 2025 11:45:39 +0800
Subject: [PATCH 1/7] feat: add Kimi-VL model cards

---
 xinference/model/llm/llm_family.json          | 66 +++++++++++++++++-
 .../model/llm/llm_family_modelscope.json      | 68 ++++++++++++++++++-
 2 files changed, 132 insertions(+), 2 deletions(-)

diff --git a/xinference/model/llm/llm_family.json b/xinference/model/llm/llm_family.json
index b7849d276a..1649aab0a3 100644
--- a/xinference/model/llm/llm_family.json
+++ b/xinference/model/llm/llm_family.json
@@ -9065,7 +9065,8 @@
     "model_specs": [
       {
         "model_format": "pytorch",
-        "model_size_in_billions": 3,
+        "model_size_in_billions": 16,
+        "activated_size_in_billions": 3,
         "quantizations": [
           "none"
         ],
@@ -9080,6 +9081,69 @@
       "<|im_end|>"
     ]
   },
+  {
+    "version": 1,
+    "context_length": 128000,
+    "model_name": "Kimi-VL-A3B-Instruct",
+    "model_lang": [
+      "en",
+      "zh"
+    ],
+    "model_ability": [
+      "chat"
+    ],
+    "model_description": "Kimi-VL, an efficient open-source Mixture-of-Experts (MoE) vision-language model (VLM) that offers advanced multimodal reasoning, long-context understanding, and strong agent capabilities",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 16,
+        "activated_size_in_billions": 3,
+        "quantizations": [
+          "none"
+        ],
+        "model_id": "moonshotai/Kimi-VL-A3B-Instruct"
+      }
+    ],
+    "chat_template": "{%- for message in messages -%}{%- if loop.first and messages[0]['role'] != 'system' -%}{{'<|im_system|>system<|im_middle|>You are a helpful assistant<|im_end|>'}}{%- endif -%}{%- if message['role'] == 'system' -%}{{'<|im_system|>'}}{%- endif -%}{%- if message['role'] == 'user' -%}{{'<|im_user|>'}}{%- endif -%}{%- if message['role'] == 'assistant' -%}{{'<|im_assistant|>'}}{%- endif -%}{{- message['role'] -}}{{'<|im_middle|>'}}{%- if message['content'] is string -%}{{- message['content'] + '<|im_end|>' -}}{%- else -%}{%- for content in message['content'] -%}{%- if content['type'] == 'image' or 'image' in content or 'image_url' in content -%}{{'<|media_start|>image<|media_content|><|media_pad|><|media_end|>'}}{%- else -%}{{content['text']}}{%- endif -%}{%- endfor -%}{{'<|im_end|>'}}{%- endif -%}{%- endfor -%}{%- if add_generation_prompt -%}{{'<|im_assistant|>assistant<|im_middle|>'}}{%- endif -%}",
+    "stop_token_ids": [
+      163586
+    ],
+    "stop": [
+      "<|im_end|>"
+    ]
+  },
+  {
+    "version": 1,
+    "context_length": 128000,
+    "model_name": "Kimi-VL-A3B-Thinking",
+    "model_lang": [
+      "en",
+      "zh"
+    ],
+    "model_ability": [
+      "chat",
+      "reasoning"
+    ],
+    "model_description": "Kimi-VL, an efficient open-source Mixture-of-Experts (MoE) vision-language model (VLM) that offers advanced multimodal reasoning, long-context understanding, and strong agent capabilities",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 16,
+        "activated_size_in_billions": 3,
+        "quantizations": [
+          "none"
+        ],
+        "model_id": "moonshotai/Kimi-VL-A3B-Thinking"
+      }
+    ],
+    "chat_template": "{%- for message in messages -%}{%- if loop.first and messages[0]['role'] != 'system' -%}{{'<|im_system|>system<|im_middle|>You are a helpful assistant<|im_end|>'}}{%- endif -%}{%- if message['role'] == 'system' -%}{{'<|im_system|>'}}{%- endif -%}{%- if message['role'] == 'user' -%}{{'<|im_user|>'}}{%- endif -%}{%- if message['role'] == 'assistant' -%}{{'<|im_assistant|>'}}{%- endif -%}{{- message['role'] -}}{{'<|im_middle|>'}}{%- if message['content'] is string -%}{{- message['content'] + '<|im_end|>' -}}{%- else -%}{%- for content in message['content'] -%}{%- if content['type'] == 'image' or 'image' in content or 'image_url' in content -%}{{'<|media_start|>image<|media_content|><|media_pad|><|media_end|>'}}{%- else -%}{{content['text']}}{%- endif -%}{%- endfor -%}{{'<|im_end|>'}}{%- endif -%}{%- endfor -%}{%- if add_generation_prompt -%}{{'<|im_assistant|>assistant<|im_middle|>'}}{%- endif -%}",
+    "stop_token_ids": [
+      163586
+    ],
+    "stop": [
+      "<|im_end|>"
+    ]
+  },
   {
     "version": 1,
     "context_length": 131072,
diff --git a/xinference/model/llm/llm_family_modelscope.json b/xinference/model/llm/llm_family_modelscope.json
index 621b200643..1f7a4596e3 100644
--- a/xinference/model/llm/llm_family_modelscope.json
+++ b/xinference/model/llm/llm_family_modelscope.json
@@ -7217,7 +7217,8 @@
     "model_specs": [
       {
         "model_format": "pytorch",
-        "model_size_in_billions": 3,
+        "model_size_in_billions": 16,
+        "activated_size_in_billions": 3,
         "quantizations": [
           "none"
         ],
@@ -7233,6 +7234,71 @@
       "<|im_end|>"
     ]
   },
+  {
+    "version": 1,
+    "context_length": 128000,
+    "model_name": "Kimi-VL-A3B-Instruct",
+    "model_lang": [
+      "en",
+      "zh"
+    ],
+    "model_ability": [
+      "chat"
+    ],
+    "model_description": "Kimi-VL, an efficient open-source Mixture-of-Experts (MoE) vision-language model (VLM) that offers advanced multimodal reasoning, long-context understanding, and strong agent capabilities",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 16,
+        "activated_size_in_billions": 3,
+        "quantizations": [
+          "none"
+        ],
+        "model_id": "moonshotai/Kimi-VL-A3B-Instruct",
+        "model_hub": "modelscope"
+      }
+    ],
+    "chat_template": "{%- for message in messages -%}{%- if loop.first and messages[0]['role'] != 'system' -%}{{'<|im_system|>system<|im_middle|>You are a helpful assistant<|im_end|>'}}{%- endif -%}{%- if message['role'] == 'system' -%}{{'<|im_system|>'}}{%- endif -%}{%- if message['role'] == 'user' -%}{{'<|im_user|>'}}{%- endif -%}{%- if message['role'] == 'assistant' -%}{{'<|im_assistant|>'}}{%- endif -%}{{- message['role'] -}}{{'<|im_middle|>'}}{%- if message['content'] is string -%}{{- message['content'] + '<|im_end|>' -}}{%- else -%}{%- for content in message['content'] -%}{%- if content['type'] == 'image' or 'image' in content or 'image_url' in content -%}{{'<|media_start|>image<|media_content|><|media_pad|><|media_end|>'}}{%- else -%}{{content['text']}}{%- endif -%}{%- endfor -%}{{'<|im_end|>'}}{%- endif -%}{%- endfor -%}{%- if add_generation_prompt -%}{{'<|im_assistant|>assistant<|im_middle|>'}}{%- endif -%}",
+    "stop_token_ids": [
+      163586
+    ],
+    "stop": [
+      "<|im_end|>"
+    ]
+  },
+  {
+    "version": 1,
+    "context_length": 128000,
+    "model_name": "Kimi-VL-A3B-Thinking",
+    "model_lang": [
+      "en",
+      "zh"
+    ],
+    "model_ability": [
+      "chat",
+      "reasoning"
+    ],
+    "model_description": "Kimi-VL, an efficient open-source Mixture-of-Experts (MoE) vision-language model (VLM) that offers advanced multimodal reasoning, long-context understanding, and strong agent capabilities",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 16,
+        "activated_size_in_billions": 3,
+        "quantizations": [
+          "none"
+        ],
+        "model_id": "moonshotai/Kimi-VL-A3B-Thinking",
+        "model_hub": "modelscope"
+      }
+    ],
+    "chat_template": "{%- for message in messages -%}{%- if loop.first and messages[0]['role'] != 'system' -%}{{'<|im_system|>system<|im_middle|>You are a helpful assistant<|im_end|>'}}{%- endif -%}{%- if message['role'] == 'system' -%}{{'<|im_system|>'}}{%- endif -%}{%- if message['role'] == 'user' -%}{{'<|im_user|>'}}{%- endif -%}{%- if message['role'] == 'assistant' -%}{{'<|im_assistant|>'}}{%- endif -%}{{- message['role'] -}}{{'<|im_middle|>'}}{%- if message['content'] is string -%}{{- message['content'] + '<|im_end|>' -}}{%- else -%}{%- for content in message['content'] -%}{%- if content['type'] == 'image' or 'image' in content or 'image_url' in content -%}{{'<|media_start|>image<|media_content|><|media_pad|><|media_end|>'}}{%- else -%}{{content['text']}}{%- endif -%}{%- endfor -%}{{'<|im_end|>'}}{%- endif -%}{%- endfor -%}{%- if add_generation_prompt -%}{{'<|im_assistant|>assistant<|im_middle|>'}}{%- endif -%}",
+    "stop_token_ids": [
+      163586
+    ],
+    "stop": [
+      "<|im_end|>"
+    ]
+  },
   {
     "version": 1,
     "context_length": 131072,

From f71f9d7b92dd79c3b340821188a4bf5c8339221d Mon Sep 17 00:00:00 2001
From: Minamiyama <minamiyama@qq.com>
Date: Fri, 2 May 2025 11:59:37 +0800
Subject: [PATCH 2/7] =?UTF-8?q?feat(vllm):=20=E6=B7=BB=E5=8A=A0=E5=AF=B9Ki?=
 =?UTF-8?q?mi-VL-A3B-Instruct=E5=92=8CKimi-VL-A3B-Thinking=E8=A7=86?=
 =?UTF-8?q?=E8=A7=89=E6=A8=A1=E5=9E=8B=E7=9A=84=E6=94=AF=E6=8C=81?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 xinference/model/llm/vllm/core.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/xinference/model/llm/vllm/core.py b/xinference/model/llm/vllm/core.py
index cd67f2670f..6b3807926e 100644
--- a/xinference/model/llm/vllm/core.py
+++ b/xinference/model/llm/vllm/core.py
@@ -242,6 +242,8 @@ class VLLMGenerateConfig(TypedDict, total=False):
 
 if VLLM_INSTALLED and vllm.__version__ >= "0.8.5":
     VLLM_SUPPORTED_CHAT_MODELS.append("qwen3")
+    VLLM_SUPPORTED_VISION_MODEL_LIST.append("Kimi-VL-A3B-Instruct")
+    VLLM_SUPPORTED_VISION_MODEL_LIST.append("Kimi-VL-A3B-Thinking")
 
 
 class VLLMModel(LLM):

From 2509f4aee99a0747c9c5efa554590469c5b8c9ec Mon Sep 17 00:00:00 2001
From: Minamiyama <minamiyama@qq.com>
Date: Fri, 2 May 2025 12:21:05 +0800
Subject: [PATCH 3/7] feat: [WIP]transformers impl

---
 xinference/model/llm/transformers/kimi_vl.py | 228 +++++++++++++++++++
 1 file changed, 228 insertions(+)
 create mode 100644 xinference/model/llm/transformers/kimi_vl.py

diff --git a/xinference/model/llm/transformers/kimi_vl.py b/xinference/model/llm/transformers/kimi_vl.py
new file mode 100644
index 0000000000..b1f18427d4
--- /dev/null
+++ b/xinference/model/llm/transformers/kimi_vl.py
@@ -0,0 +1,228 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import logging
+import uuid
+from typing import Dict, Iterator, List, Optional, Union
+
+import torch
+from PIL import Image
+
+from ....model.utils import select_device
+from ....types import (
+    ChatCompletion,
+    ChatCompletionChunk,
+    ChatCompletionMessage,
+    CompletionChunk,
+)
+from ..llm_family import LLMFamilyV1, LLMSpecV1
+from ..utils import generate_chat_completion, generate_completion_chunk
+from .core import PytorchChatModel, PytorchGenerateConfig
+from .utils import cache_clean
+
+logger = logging.getLogger(__name__)
+
+@register_transformer
+@register_non_default_model(
+    "Kimi-VL-A3B-Instruct", "Kimi-VL-A3B-Thinking"
+)
+class KimiVLChatModel(PytorchChatModel):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self._tokenizer = None
+        self._model = None
+        self._device = None
+        self._processor = None
+
+    @classmethod
+    def match_json(
+        cls, model_family: "LLMFamilyV1", model_spec: "LLMSpecV1", quantization: str
+    ) -> bool:
+        if model_spec.model_format not in ["pytorch", "gptq", "awq"]:
+            return False
+        llm_family = model_family.model_family or model_family.model_name
+        if "kimi-vl".lower() in llm_family.lower():
+            return True
+        return False
+
+    def load(self):
+        import importlib.util
+        from transformers import AutoModelForCausalLM, AutoProcessor
+
+        self._device = self._pytorch_model_config.get("device", "auto")
+        self._device = select_device(self._device)
+
+        # Build the keyword arguments used to load the model
+        model_kwargs = {
+            "pretrained_model_name_or_path": self.model_path,
+            "device_map": self._device,
+            "trust_remote_code": True,
+            "torch_dtype": "auto"
+        }
+
+        flash_attn_installed = importlib.util.find_spec("flash_attn") is not None
+        if flash_attn_installed:
+            model_kwargs.update({
+                "torch_dtype": torch.bfloat16,
+                "attn_implementation": "flash_attention_2"
+            })
+
+        self._model = AutoModelForCausalLM.from_pretrained(**model_kwargs)
+        self._processor = AutoProcessor.from_pretrained(self.model_path, trust_remote_code=True)
+
+    @cache_clean
+    def chat(
+        self,
+        messages: List[ChatCompletionMessage],  # type: ignore
+        generate_config: Optional[PytorchGenerateConfig] = None,
+    ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]:
+        messages = self._transform_messages(messages)
+
+        generate_config = generate_config if generate_config else {}
+
+        stream = generate_config.get("stream", False) if generate_config else False
+
+        if stream:
+            it = self._generate_stream(messages, generate_config)
+            return self._to_chat_completion_chunks(it)
+        else:
+            c = self._generate(messages, generate_config)
+            return c
+
+    def _generate(
+        self, messages: List, config: PytorchGenerateConfig = {}
+    ) -> ChatCompletion:
+        input_ids, attention_mask, pixel_values, gen_kwargs = self._generate_chat_data(
+            messages, config
+        )
+
+        # generate output
+        with torch.inference_mode():
+            gen_kwargs.update(
+                dict(
+                    pixel_values=pixel_values,
+                    attention_mask=attention_mask,
+                )
+            )
+
+            output_ids = self._model.generate(
+                input_ids,
+                **gen_kwargs,
+            )[0]
+            output = self._text_tokenizer.decode(output_ids, skip_special_tokens=True)
+        return generate_chat_completion(self.model_uid, output)
+
+    def _generate_stream(
+        self, messages: List, config: PytorchGenerateConfig = {}
+    ) -> Iterator[CompletionChunk]:
+        from threading import Thread
+
+        from transformers import TextIteratorStreamer
+
+        input_ids, attention_mask, pixel_values, gen_kwargs = self._generate_chat_data(
+            messages, config
+        )
+
+        _, inputs_embeds, _, attention_mask = self._model.merge_multimodal(
+            text_input_ids=input_ids,
+            text_attention_masks=attention_mask,
+            text_labels=None,
+            pixel_values=pixel_values,
+            left_padding=True,
+        )
+
+        streamer = TextIteratorStreamer(
+            self._text_tokenizer, timeout=60, skip_prompt=True, skip_special_tokens=True
+        )
+
+        gen_kwargs.update(
+            dict(
+                inputs_embeds=inputs_embeds,
+                attention_mask=attention_mask,
+                streamer=streamer,
+            )
+        )
+
+        inputs_embeds = inputs_embeds.detach()
+        torch.cuda.empty_cache()
+
+        thread = Thread(target=self._model.llm.generate, kwargs=gen_kwargs)
+        thread.start()
+
+        completion_id = str(uuid.uuid1())
+
+        for new_text in streamer:
+            yield generate_completion_chunk(
+                chunk_text=new_text,
+                finish_reason=None,
+                chunk_id=completion_id,
+                model_uid=self.model_uid,
+                prompt_tokens=-1,
+                completion_tokens=-1,
+                total_tokens=-1,
+                has_choice=True,
+                has_content=True,
+            )
+
+        yield generate_completion_chunk(
+            chunk_text=None,
+            finish_reason="stop",
+            chunk_id=completion_id,
+            model_uid=self.model_uid,
+            prompt_tokens=-1,
+            completion_tokens=-1,
+            total_tokens=-1,
+            has_choice=True,
+            has_content=False,
+        )
+
+    def _convert_video_tensors_to_pil(self, video_inputs: List) -> List[Image.Image]:
+        """Convert video tensors to a list of PIL images"""
+        from torchvision import transforms
+
+        to_pil = transforms.ToPILImage()
+        pil_images = []
+
+        for video_tensor_4d in video_inputs:
+            if isinstance(video_tensor_4d, torch.Tensor):
+                # Verify it's a 4D tensor
+                if video_tensor_4d.ndim == 4:
+                    # Iterate through the first dimension (frames) of 4D tensor
+                    for i in range(video_tensor_4d.size(0)):
+                        frame_tensor_3d = video_tensor_4d[
+                            i
+                        ]  # Get 3D frame tensor [C, H, W]
+                        # Ensure tensor is on CPU before conversion
+                        if frame_tensor_3d.is_cuda:
+                            frame_tensor_3d = frame_tensor_3d.cpu()
+                        try:
+                            pil_image = to_pil(frame_tensor_3d)
+                            pil_images.append(pil_image)
+                        except Exception as e:
+                            logger.error(
+                                f"Error converting frame {i} to PIL Image: {e}"
+                            )
+                            # Can choose to skip this frame or handle error differently
+                else:
+                    logger.warning(
+                        f"Expected 4D tensor in video_inputs, but got {video_tensor_4d.ndim}D. Skipping this tensor."
+                    )
+            elif isinstance(video_tensor_4d, Image.Image):
+                # If fetch_video returns Image list, add directly
+                pil_images.append(video_tensor_4d)
+            else:
+                logger.warning(
+                    f"Unexpected type in video_inputs: {type(video_tensor_4d)}. Skipping."
+                )
+
+        return pil_images

From 0c727d4f2df8f6f6851bb5b1124efdfe82fd4dc7 Mon Sep 17 00:00:00 2001
From: Minamiyama <minamiyama@qq.com>
Date: Sat, 3 May 2025 03:00:15 +0800
Subject: [PATCH 4/7] [WIP]model vision ability

---
 xinference/model/llm/llm_family.json            | 4 +++-
 xinference/model/llm/llm_family_modelscope.json | 4 +++-
 xinference/model/llm/transformers/kimi_vl.py    | 2 +-
 3 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/xinference/model/llm/llm_family.json b/xinference/model/llm/llm_family.json
index 1649aab0a3..4a1e254d18 100644
--- a/xinference/model/llm/llm_family.json
+++ b/xinference/model/llm/llm_family.json
@@ -9090,7 +9090,8 @@
       "zh"
     ],
     "model_ability": [
-      "chat"
+      "chat",
+      "vision"
     ],
     "model_description": "Kimi-VL, an efficient open-source Mixture-of-Experts (MoE) vision-language model (VLM) that offers advanced multimodal reasoning, long-context understanding, and strong agent capabilities",
     "model_specs": [
@@ -9122,6 +9123,7 @@
     ],
     "model_ability": [
       "chat",
+      "vision",
       "reasoning"
     ],
     "model_description": "Kimi-VL, an efficient open-source Mixture-of-Experts (MoE) vision-language model (VLM) that offers advanced multimodal reasoning, long-context understanding, and strong agent capabilities",
diff --git a/xinference/model/llm/llm_family_modelscope.json b/xinference/model/llm/llm_family_modelscope.json
index 1f7a4596e3..633fe21f66 100644
--- a/xinference/model/llm/llm_family_modelscope.json
+++ b/xinference/model/llm/llm_family_modelscope.json
@@ -7243,7 +7243,8 @@
       "zh"
     ],
     "model_ability": [
-      "chat"
+      "chat",
+      "vision"
     ],
     "model_description": "Kimi-VL, an efficient open-source Mixture-of-Experts (MoE) vision-language model (VLM) that offers advanced multimodal reasoning, long-context understanding, and strong agent capabilities",
     "model_specs": [
@@ -7276,6 +7277,7 @@
     ],
     "model_ability": [
       "chat",
+      "vision",
       "reasoning"
     ],
     "model_description": "Kimi-VL, an efficient open-source Mixture-of-Experts (MoE) vision-language model (VLM) that offers advanced multimodal reasoning, long-context understanding, and strong agent capabilities",
diff --git a/xinference/model/llm/transformers/kimi_vl.py b/xinference/model/llm/transformers/kimi_vl.py
index b1f18427d4..03bf2a31d2 100644
--- a/xinference/model/llm/transformers/kimi_vl.py
+++ b/xinference/model/llm/transformers/kimi_vl.py
@@ -51,7 +51,7 @@ def match_json(
         if model_spec.model_format not in ["pytorch", "gptq", "awq"]:
             return False
         llm_family = model_family.model_family or model_family.model_name
-        if "kimi-vl".lower() in llm_family.lower():
+        if "kimi-vl-".lower() in llm_family.lower():
             return True
         return False
 

From eefa8e94b791bc63b177f06b416440bd29ae81a6 Mon Sep 17 00:00:00 2001
From: Minamiyama <minamiyama@qq.com>
Date: Sat, 3 May 2025 09:59:58 +0800
Subject: [PATCH 5/7] [WIP]generate

---
 xinference/model/llm/transformers/kimi_vl.py | 107 +++++--------------
 1 file changed, 24 insertions(+), 83 deletions(-)

diff --git a/xinference/model/llm/transformers/kimi_vl.py b/xinference/model/llm/transformers/kimi_vl.py
index 03bf2a31d2..2993d57fff 100644
--- a/xinference/model/llm/transformers/kimi_vl.py
+++ b/xinference/model/llm/transformers/kimi_vl.py
@@ -17,6 +17,7 @@
 
 import torch
 from PIL import Image
+from torch.cuda import temperature
 
 from ....model.utils import select_device
 from ....types import (
@@ -25,9 +26,9 @@
     ChatCompletionMessage,
     CompletionChunk,
 )
-from ..llm_family import LLMFamilyV1, LLMSpecV1
+from ..llm_family import LLMFamilyV1, LLMSpecV1, register_transformer
 from ..utils import generate_chat_completion, generate_completion_chunk
-from .core import PytorchChatModel, PytorchGenerateConfig
+from .core import PytorchChatModel, PytorchGenerateConfig, register_non_default_model
 from .utils import cache_clean
 
 logger = logging.getLogger(__name__)
@@ -77,6 +78,9 @@ def load(self):
                 "attn_implementation": "flash_attention_2"
             })
 
+        kwargs = self.apply_bnb_quantization()
+        model_kwargs.update(kwargs)
+
         self._model = AutoModelForCausalLM.from_pretrained(**model_kwargs)
         self._processor = AutoProcessor.from_pretrained(self.model_path, trust_remote_code=True)
 
@@ -93,8 +97,11 @@ def chat(
         stream = generate_config.get("stream", False) if generate_config else False
 
         if stream:
-            it = self._generate_stream(messages, generate_config)
-            return self._to_chat_completion_chunks(it)
+            # TODO: re-enable once _generate_stream is implemented: wrap its
+            # iterator with self._to_chat_completion_chunks and return it.
+            raise NotImplementedError(
+                "Kimi-VL models do not support stream generation yet."
+            )
         else:
             c = self._generate(messages, generate_config)
             return c
@@ -102,89 +109,23 @@ def chat(
     def _generate(
         self, messages: List, config: PytorchGenerateConfig = {}
     ) -> ChatCompletion:
-        input_ids, attention_mask, pixel_values, gen_kwargs = self._generate_chat_data(
-            messages, config
-        )
-
-        # generate output
-        with torch.inference_mode():
-            gen_kwargs.update(
-                dict(
-                    pixel_values=pixel_values,
-                    attention_mask=attention_mask,
-                )
-            )
-
-            output_ids = self._model.generate(
-                input_ids,
-                **gen_kwargs,
-            )[0]
-            output = self._text_tokenizer.decode(output_ids, skip_special_tokens=True)
-        return generate_chat_completion(self.model_uid, output)
+        # Text-only for now: no image is extracted from `messages` yet, so the processor receives images=None.
+        text = self._processor.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt")
+        inputs = self._processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(self._model.device)
+        generated_ids = self._model.generate(**inputs, max_new_tokens=config.get("max_tokens", 2048), temperature=config.get("temperature", 0.7))
+        # Slice off the first len(in_ids) tokens of every output (presumably the echoed prompt) before decoding.
+        generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
+        response = self._processor.batch_decode(
+            generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
+        )[0]
+        # DEBUG level with lazy %-formatting: the full model output should not be dumped into INFO logs.
+        logger.debug("Kimi-VL response: %s", response)
+        return generate_chat_completion(self.model_uid, response)
 
     def _generate_stream(
         self, messages: List, config: PytorchGenerateConfig = {}
     ) -> Iterator[CompletionChunk]:
-        from threading import Thread
-
-        from transformers import TextIteratorStreamer
-
-        input_ids, attention_mask, pixel_values, gen_kwargs = self._generate_chat_data(
-            messages, config
-        )
-
-        _, inputs_embeds, _, attention_mask = self._model.merge_multimodal(
-            text_input_ids=input_ids,
-            text_attention_masks=attention_mask,
-            text_labels=None,
-            pixel_values=pixel_values,
-            left_padding=True,
-        )
-
-        streamer = TextIteratorStreamer(
-            self._text_tokenizer, timeout=60, skip_prompt=True, skip_special_tokens=True
-        )
-
-        gen_kwargs.update(
-            dict(
-                inputs_embeds=inputs_embeds,
-                attention_mask=attention_mask,
-                streamer=streamer,
-            )
-        )
-
-        inputs_embeds = inputs_embeds.detach()
-        torch.cuda.empty_cache()
-
-        thread = Thread(target=self._model.llm.generate, kwargs=gen_kwargs)
-        thread.start()
-
-        completion_id = str(uuid.uuid1())
-
-        for new_text in streamer:
-            yield generate_completion_chunk(
-                chunk_text=new_text,
-                finish_reason=None,
-                chunk_id=completion_id,
-                model_uid=self.model_uid,
-                prompt_tokens=-1,
-                completion_tokens=-1,
-                total_tokens=-1,
-                has_choice=True,
-                has_content=True,
-            )
-
-        yield generate_completion_chunk(
-            chunk_text=None,
-            finish_reason="stop",
-            chunk_id=completion_id,
-            model_uid=self.model_uid,
-            prompt_tokens=-1,
-            completion_tokens=-1,
-            total_tokens=-1,
-            has_choice=True,
-            has_content=False,
-        )
+        pass
 
     def _convert_video_tensors_to_pil(self, video_inputs: List) -> List[Image.Image]:
         """Convert video tensors to a list of PIL images"""

From 26836d29b540d08658cb18dddc4187e6c7156b52 Mon Sep 17 00:00:00 2001
From: Minamiyama <minamiyama@qq.com>
Date: Mon, 21 Jul 2025 14:34:06 +0800
Subject: [PATCH 6/7] feat(model): add Kimi-VL-A3B models to llm_family.json

Add support for Kimi-VL-A3B-Instruct and Kimi-VL-A3B-Thinking-2506 vision-language models with multimodal reasoning capabilities
---
 xinference/model/llm/llm_family.json | 102 +++++++++++++++++++++++++++
 1 file changed, 102 insertions(+)

diff --git a/xinference/model/llm/llm_family.json b/xinference/model/llm/llm_family.json
index bd0ba06e0a..f9dbe65425 100644
--- a/xinference/model/llm/llm_family.json
+++ b/xinference/model/llm/llm_family.json
@@ -18670,5 +18670,107 @@
         "#system_numpy#"
       ]
     }
+  },
+  {
+    "version": 2,
+    "context_length": 128000,
+    "model_name": "Kimi-VL-A3B-Instruct",
+    "model_lang": [
+      "en",
+      "zh"
+    ],
+    "model_ability": [
+      "chat",
+      "vision",
+      "reasoning"
+    ],
+    "model_description": "Kimi-VL, an efficient open-source Mixture-of-Experts (MoE) vision-language model (VLM) that offers advanced multimodal reasoning, long-context understanding, and strong agent capabilities",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 16,
+        "activated_size_in_billions": 3,
+        "model_src": {
+          "huggingface": {
+            "quantizations": [
+              "none"
+            ],
+            "model_id": "moonshotai/Kimi-VL-A3B-Instruct"
+          },
+          "modelscope": {
+            "quantizations": [
+              "none"
+            ],
+            "model_id": "moonshotai/Kimi-VL-A3B-Instruct",
+            "model_revision": "master"
+          }
+        }
+      }
+    ],
+    "chat_template": "{%- for message in messages -%}{%- if loop.first and messages[0]['role'] != 'system' -%}{{'<|im_system|>system<|im_middle|>You are a helpful assistant<|im_end|>'}}{%- endif -%}{%- if message['role'] == 'system' -%}{{'<|im_system|>'}}{%- endif -%}{%- if message['role'] == 'user' -%}{{'<|im_user|>'}}{%- endif -%}{%- if message['role'] == 'assistant' -%}{{'<|im_assistant|>'}}{%- endif -%}{{- message['role'] -}}{{'<|im_middle|>'}}{%- if message['content'] is string -%}{{- message['content'] + '<|im_end|>' -}}{%- else -%}{%- for content in message['content'] -%}{%- if content['type'] == 'image' or 'image' in content or 'image_url' in content -%}{{'<|media_start|>image<|media_content|><|media_pad|><|media_end|>'}}{%- else -%}{{content['text']}}{%- endif -%}{%- endfor -%}{{'<|im_end|>'}}{%- endif -%}{%- endfor -%}{%- if add_generation_prompt -%}{{'<|im_assistant|>assistant<|im_middle|>'}}{%- endif -%}",
+    "stop_token_ids": [
+      163586
+    ],
+    "stop": [
+      "<|im_end|>"
+    ],
+    "reasoning_start_tag": "<think>",
+    "reasoning_end_tag": "</think>",
+    "virtualenv": {
+      "packages": [
+        "transformers>=4.51.3"
+      ]
+    }
+  },
+  {
+    "version": 2,
+    "context_length": 128000,
+    "model_name": "Kimi-VL-A3B-Thinking-2506",
+    "model_lang": [
+      "en",
+      "zh"
+    ],
+    "model_ability": [
+      "chat",
+      "vision",
+      "reasoning"
+    ],
+    "model_description": "Kimi-VL, an efficient open-source Mixture-of-Experts (MoE) vision-language model (VLM) that offers advanced multimodal reasoning, long-context understanding, and strong agent capabilities",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 16,
+        "activated_size_in_billions": 3,
+        "model_src": {
+          "huggingface": {
+            "quantizations": [
+              "none"
+            ],
+            "model_id": "moonshotai/Kimi-VL-A3B-Thinking-2506"
+          },
+          "modelscope": {
+            "quantizations": [
+              "none"
+            ],
+            "model_id": "moonshotai/Kimi-VL-A3B-Thinking-2506",
+            "model_revision": "master"
+          }
+        }
+      }
+    ],
+    "chat_template": "{%- for message in messages -%}{%- if loop.first and messages[0]['role'] != 'system' -%}{{'<|im_system|>system<|im_middle|>You are a helpful assistant<|im_end|>'}}{%- endif -%}{%- if message['role'] == 'system' -%}{{'<|im_system|>'}}{%- endif -%}{%- if message['role'] == 'user' -%}{{'<|im_user|>'}}{%- endif -%}{%- if message['role'] == 'assistant' -%}{{'<|im_assistant|>'}}{%- endif -%}{{- message['role'] -}}{{'<|im_middle|>'}}{%- if message['content'] is string -%}{{- message['content'] + '<|im_end|>' -}}{%- else -%}{%- for content in message['content'] -%}{%- if content['type'] == 'image' or 'image' in content or 'image_url' in content -%}{{'<|media_start|>image<|media_content|><|media_pad|><|media_end|>'}}{%- else -%}{{content['text']}}{%- endif -%}{%- endfor -%}{{'<|im_end|>'}}{%- endif -%}{%- endfor -%}{%- if add_generation_prompt -%}{{'<|im_assistant|>assistant<|im_middle|>'}}{%- endif -%}",
+    "stop_token_ids": [
+      163586
+    ],
+    "stop": [
+      "<|im_end|>"
+    ],
+    "reasoning_start_tag": "◁think▷",
+    "reasoning_end_tag": "◁/think▷",
+    "virtualenv": {
+      "packages": [
+        "transformers>=4.51.3"
+      ]
+    }
   }
 ]

From 64f7e1655f25f4f8a415abf1e7e7697df1d47fd0 Mon Sep 17 00:00:00 2001
From: Minamiyama <minamiyama@qq.com>
Date: Mon, 21 Jul 2025 14:36:21 +0800
Subject: [PATCH 7/7] fix(vllm): update Kimi-VL-A3B-Thinking model name to
 include version

---
 xinference/model/llm/vllm/core.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/xinference/model/llm/vllm/core.py b/xinference/model/llm/vllm/core.py
index 90a221612e..1c89fb1ae8 100644
--- a/xinference/model/llm/vllm/core.py
+++ b/xinference/model/llm/vllm/core.py
@@ -259,7 +259,7 @@ class VLLMGenerateConfig(TypedDict, total=False):
 if VLLM_INSTALLED and vllm.__version__ >= "0.8.5":
     VLLM_SUPPORTED_CHAT_MODELS.append("qwen3")
     VLLM_SUPPORTED_VISION_MODEL_LIST.append("Kimi-VL-A3B-Instruct")
-    VLLM_SUPPORTED_VISION_MODEL_LIST.append("Kimi-VL-A3B-Thinking")
+    VLLM_SUPPORTED_VISION_MODEL_LIST.append("Kimi-VL-A3B-Thinking-2506")
 
 if VLLM_INSTALLED and vllm.__version__ >= "0.9.1":
     VLLM_SUPPORTED_CHAT_MODELS.append("minicpm4")