From cd96f1e42907031a3728045dd98051dd0586548e Mon Sep 17 00:00:00 2001 From: Minamiyama Date: Fri, 2 May 2025 11:45:39 +0800 Subject: [PATCH 1/7] feat: add Kimi-VL model cards --- xinference/model/llm/llm_family.json | 66 +++++++++++++++++- .../model/llm/llm_family_modelscope.json | 68 ++++++++++++++++++- 2 files changed, 132 insertions(+), 2 deletions(-) diff --git a/xinference/model/llm/llm_family.json b/xinference/model/llm/llm_family.json index b7849d276a..1649aab0a3 100644 --- a/xinference/model/llm/llm_family.json +++ b/xinference/model/llm/llm_family.json @@ -9065,7 +9065,8 @@ "model_specs": [ { "model_format": "pytorch", - "model_size_in_billions": 3, + "model_size_in_billions": 16, + "activated_size_in_billions": 3, "quantizations": [ "none" ], @@ -9080,6 +9081,69 @@ "<|im_end|>" ] }, + { + "version": 1, + "context_length": 128000, + "model_name": "Kimi-VL-A3B-Instruct", + "model_lang": [ + "en", + "zh" + ], + "model_ability": [ + "chat" + ], + "model_description": "Kimi-VL, an efficient open-source Mixture-of-Experts (MoE) vision-language model (VLM) that offers advanced multimodal reasoning, long-context understanding, and strong agent capabilities", + "model_specs": [ + { + "model_format": "pytorch", + "model_size_in_billions": 16, + "activated_size_in_billions": 3, + "quantizations": [ + "none" + ], + "model_id": "moonshotai/Kimi-VL-A3B-Instruct" + } + ], + "chat_template": "{%- for message in messages -%}{%- if loop.first and messages[0]['role'] != 'system' -%}{{'<|im_system|>system<|im_middle|>You are a helpful assistant<|im_end|>'}}{%- endif -%}{%- if message['role'] == 'system' -%}{{'<|im_system|>'}}{%- endif -%}{%- if message['role'] == 'user' -%}{{'<|im_user|>'}}{%- endif -%}{%- if message['role'] == 'assistant' -%}{{'<|im_assistant|>'}}{%- endif -%}{{- message['role'] -}}{{'<|im_middle|>'}}{%- if message['content'] is string -%}{{- message['content'] + '<|im_end|>' -}}{%- else -%}{%- for content in message['content'] -%}{%- if content['type'] == 'image' or 'image' in content or 'image_url' in content -%}{{'<|media_start|>image<|media_content|><|media_pad|><|media_end|>'}}{%- else -%}{{content['text']}}{%- endif -%}{%- endfor -%}{{'<|im_end|>'}}{%- endif -%}{%- endfor -%}{%- if add_generation_prompt -%}{{'<|im_assistant|>assistant<|im_middle|>'}}{%- endif -%}", + "stop_token_ids": [ + 163586 + ], + "stop": [ + "<|im_end|>" + ] + }, + { + "version": 1, + "context_length": 128000, + "model_name": "Kimi-VL-A3B-Thinking", + "model_lang": [ + "en", + "zh" + ], + "model_ability": [ + "chat", + "reasoning" + ], + "model_description": "Kimi-VL, an efficient open-source Mixture-of-Experts (MoE) vision-language model (VLM) that offers advanced multimodal reasoning, long-context understanding, and strong agent capabilities", + "model_specs": [ + { + "model_format": "pytorch", + "model_size_in_billions": 16, + "activated_size_in_billions": 3, + "quantizations": [ + "none" + ], + "model_id": "moonshotai/Kimi-VL-A3B-Thinking" + } + ], + "chat_template": "{%- for message in messages -%}{%- if loop.first and messages[0]['role'] != 'system' -%}{{'<|im_system|>system<|im_middle|>You are a helpful assistant<|im_end|>'}}{%- endif -%}{%- if message['role'] == 'system' -%}{{'<|im_system|>'}}{%- endif -%}{%- if message['role'] == 'user' -%}{{'<|im_user|>'}}{%- endif -%}{%- if message['role'] == 'assistant' -%}{{'<|im_assistant|>'}}{%- endif -%}{{- message['role'] -}}{{'<|im_middle|>'}}{%- if message['content'] is string -%}{{- message['content'] + '<|im_end|>' -}}{%- else -%}{%- for content in message['content'] -%}{%- if content['type'] == 'image' or 'image' in content or 'image_url' in content -%}{{'<|media_start|>image<|media_content|><|media_pad|><|media_end|>'}}{%- else -%}{{content['text']}}{%- endif -%}{%- endfor -%}{{'<|im_end|>'}}{%- endif -%}{%- endfor -%}{%- if add_generation_prompt -%}{{'<|im_assistant|>assistant<|im_middle|>'}}{%- endif -%}", + "stop_token_ids": [ + 163586 + ], + "stop": [ + "<|im_end|>" + ] + }, { "version": 1, "context_length": 131072, diff --git a/xinference/model/llm/llm_family_modelscope.json b/xinference/model/llm/llm_family_modelscope.json index 621b200643..1f7a4596e3 100644 --- a/xinference/model/llm/llm_family_modelscope.json +++ b/xinference/model/llm/llm_family_modelscope.json @@ -7217,7 +7217,8 @@ "model_specs": [ { "model_format": "pytorch", - "model_size_in_billions": 3, + "model_size_in_billions": 16, + "activated_size_in_billions": 3, "quantizations": [ "none" ], @@ -7233,6 +7234,71 @@ "<|im_end|>" ] }, + { + "version": 1, + "context_length": 128000, + "model_name": "Kimi-VL-A3B-Instruct", + "model_lang": [ + "en", + "zh" + ], + "model_ability": [ + "chat" + ], + "model_description": "Kimi-VL, an efficient open-source Mixture-of-Experts (MoE) vision-language model (VLM) that offers advanced multimodal reasoning, long-context understanding, and strong agent capabilities", + "model_specs": [ + { + "model_format": "pytorch", + "model_size_in_billions": 16, + "activated_size_in_billions": 3, + "quantizations": [ + "none" + ], + "model_id": "moonshotai/Kimi-VL-A3B-Instruct", + "model_hub": "modelscope" + } + ], + "chat_template": "{%- for message in messages -%}{%- if loop.first and messages[0]['role'] != 'system' -%}{{'<|im_system|>system<|im_middle|>You are a helpful assistant<|im_end|>'}}{%- endif -%}{%- if message['role'] == 'system' -%}{{'<|im_system|>'}}{%- endif -%}{%- if message['role'] == 'user' -%}{{'<|im_user|>'}}{%- endif -%}{%- if message['role'] == 'assistant' -%}{{'<|im_assistant|>'}}{%- endif -%}{{- message['role'] -}}{{'<|im_middle|>'}}{%- if message['content'] is string -%}{{- message['content'] + '<|im_end|>' -}}{%- else -%}{%- for content in message['content'] -%}{%- if content['type'] == 'image' or 'image' in content or 'image_url' in content -%}{{'<|media_start|>image<|media_content|><|media_pad|><|media_end|>'}}{%- else -%}{{content['text']}}{%- endif -%}{%- endfor -%}{{'<|im_end|>'}}{%- endif -%}{%- endfor -%}{%- if add_generation_prompt -%}{{'<|im_assistant|>assistant<|im_middle|>'}}{%- endif -%}", + "stop_token_ids": [ + 163586 + ], + "stop": [ + "<|im_end|>" + ] + }, + { + "version": 1, + "context_length": 128000, + "model_name": "Kimi-VL-A3B-Thinking", + "model_lang": [ + "en", + "zh" + ], + "model_ability": [ + "chat", + "reasoning" + ], + "model_description": "Kimi-VL, an efficient open-source Mixture-of-Experts (MoE) vision-language model (VLM) that offers advanced multimodal reasoning, long-context understanding, and strong agent capabilities", + "model_specs": [ + { + "model_format": "pytorch", + "model_size_in_billions": 16, + "activated_size_in_billions": 3, + "quantizations": [ + "none" + ], + "model_id": "moonshotai/Kimi-VL-A3B-Thinking", + "model_hub": "modelscope" + } + ], + "chat_template": "{%- for message in messages -%}{%- if loop.first and messages[0]['role'] != 'system' -%}{{'<|im_system|>system<|im_middle|>You are a helpful assistant<|im_end|>'}}{%- endif -%}{%- if message['role'] == 'system' -%}{{'<|im_system|>'}}{%- endif -%}{%- if message['role'] == 'user' -%}{{'<|im_user|>'}}{%- endif -%}{%- if message['role'] == 'assistant' -%}{{'<|im_assistant|>'}}{%- endif -%}{{- message['role'] -}}{{'<|im_middle|>'}}{%- if message['content'] is string -%}{{- message['content'] + '<|im_end|>' -}}{%- else -%}{%- for content in message['content'] -%}{%- if content['type'] == 'image' or 'image' in content or 'image_url' in content -%}{{'<|media_start|>image<|media_content|><|media_pad|><|media_end|>'}}{%- else -%}{{content['text']}}{%- endif -%}{%- endfor -%}{{'<|im_end|>'}}{%- endif -%}{%- endfor -%}{%- if add_generation_prompt -%}{{'<|im_assistant|>assistant<|im_middle|>'}}{%- endif -%}", + "stop_token_ids": [ + 163586 + ], + "stop": [ + "<|im_end|>" + ] + }, { "version": 1, "context_length": 131072, From f71f9d7b92dd79c3b340821188a4bf5c8339221d Mon Sep 17 00:00:00 2001 From: Minamiyama Date: Fri, 2 May 2025 11:59:37 +0800 Subject: [PATCH 2/7] =?UTF-8?q?feat(vllm):=20=E6=B7=BB=E5=8A=A0=E5=AF=B9Ki?= =?UTF-8?q?mi-VL-A3B-Instruct=E5=92=8CKimi-VL-A3B-Thinking=E8=A7=86?= =?UTF-8?q?=E8=A7=89=E6=A8=A1=E5=9E=8B=E7=9A=84=E6=94=AF=E6=8C=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- xinference/model/llm/vllm/core.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/xinference/model/llm/vllm/core.py b/xinference/model/llm/vllm/core.py index cd67f2670f..6b3807926e 100644 --- a/xinference/model/llm/vllm/core.py +++ b/xinference/model/llm/vllm/core.py @@ -242,6 +242,8 @@ class VLLMGenerateConfig(TypedDict, total=False): if VLLM_INSTALLED and vllm.__version__ >= "0.8.5": VLLM_SUPPORTED_CHAT_MODELS.append("qwen3") + VLLM_SUPPORTED_VISION_MODEL_LIST.append("Kimi-VL-A3B-Instruct") + VLLM_SUPPORTED_VISION_MODEL_LIST.append("Kimi-VL-A3B-Thinking") class VLLMModel(LLM): From 2509f4aee99a0747c9c5efa554590469c5b8c9ec Mon Sep 17 00:00:00 2001 From: Minamiyama Date: Fri, 2 May 2025 12:21:05 +0800 Subject: [PATCH 3/7] feat: [WIP]transformers impl --- xinference/model/llm/transformers/kimi_vl.py | 228 +++++++++++++++++++ 1 file changed, 228 insertions(+) create mode 100644 xinference/model/llm/transformers/kimi_vl.py diff --git a/xinference/model/llm/transformers/kimi_vl.py b/xinference/model/llm/transformers/kimi_vl.py new file mode 100644 index 0000000000..b1f18427d4 --- /dev/null +++ b/xinference/model/llm/transformers/kimi_vl.py @@ -0,0 +1,228 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import logging +import uuid +from typing import Dict, Iterator, List, Optional, Union + +import torch +from PIL import Image + +from ....model.utils import select_device +from ....types import ( + ChatCompletion, + ChatCompletionChunk, + ChatCompletionMessage, + CompletionChunk, +) +from ..llm_family import LLMFamilyV1, LLMSpecV1 +from ..utils import generate_chat_completion, generate_completion_chunk +from .core import PytorchChatModel, PytorchGenerateConfig +from .utils import cache_clean + +logger = logging.getLogger(__name__) + +@register_transformer +@register_non_default_model( + "Kimi-VL-A3B-Instruct", "Kimi-VL-A3B-Thinking" +) +class KimiVLChatModel(PytorchChatModel): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self._tokenizer = None + self._model = None + self._device = None + self._processor = None + + @classmethod + def match_json( + cls, model_family: "LLMFamilyV1", model_spec: "LLMSpecV1", quantization: str + ) -> bool: + if model_spec.model_format not in ["pytorch", "gptq", "awq"]: + return False + llm_family = model_family.model_family or model_family.model_name + if "kimi-vl".lower() in llm_family.lower(): + return True + return False + + def load(self): + import importlib.util + from transformers import AutoModelForCausalLM, AutoProcessor + + self._device = self._pytorch_model_config.get("device", "auto") + self._device = select_device(self._device) + + # 构建模型加载参数 + model_kwargs = { + "pretrained_model_name_or_path": self.model_path, + "device_map": self._device, + "trust_remote_code": True, + "torch_dtype": "auto" + } + + flash_attn_installed = importlib.util.find_spec("flash_attn") is not None + if flash_attn_installed: + model_kwargs.update({ + "torch_dtype": torch.bfloat16, + "attn_implementation": "flash_attention_2" + }) + + self._model = AutoModelForCausalLM.from_pretrained(**model_kwargs) + self._processor = AutoProcessor.from_pretrained(self.model_path, trust_remote_code=True) + + @cache_clean + def chat( + self, + messages: List[ChatCompletionMessage], # type: ignore + generate_config: Optional[PytorchGenerateConfig] = None, + ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]: + messages = self._transform_messages(messages) + + generate_config = generate_config if generate_config else {} + + stream = generate_config.get("stream", False) if generate_config else False + + if stream: + it = self._generate_stream(messages, generate_config) + return self._to_chat_completion_chunks(it) + else: + c = self._generate(messages, generate_config) + return c + + def _generate( + self, messages: List, config: PytorchGenerateConfig = {} + ) -> ChatCompletion: + input_ids, attention_mask, pixel_values, gen_kwargs = self._generate_chat_data( + messages, config + ) + + # generate output + with torch.inference_mode(): + gen_kwargs.update( + dict( + pixel_values=pixel_values, + attention_mask=attention_mask, + ) + ) + + output_ids = self._model.generate( + input_ids, + **gen_kwargs, + )[0] + output = self._text_tokenizer.decode(output_ids, skip_special_tokens=True) + return generate_chat_completion(self.model_uid, output) + + def _generate_stream( + self, messages: List, config: PytorchGenerateConfig = {} + ) -> Iterator[CompletionChunk]: + from threading import Thread + + from transformers import TextIteratorStreamer + + input_ids, attention_mask, pixel_values, gen_kwargs = self._generate_chat_data( + messages, config + ) + + _, inputs_embeds, _, attention_mask = self._model.merge_multimodal( + text_input_ids=input_ids, + text_attention_masks=attention_mask, + text_labels=None, + pixel_values=pixel_values, + left_padding=True, + ) + + streamer = TextIteratorStreamer( + self._text_tokenizer, timeout=60, skip_prompt=True, skip_special_tokens=True + ) + + gen_kwargs.update( + dict( + inputs_embeds=inputs_embeds, + attention_mask=attention_mask, + streamer=streamer, + ) + ) + + inputs_embeds = inputs_embeds.detach() + torch.cuda.empty_cache() + + thread = Thread(target=self._model.llm.generate, kwargs=gen_kwargs) + thread.start() + + completion_id = str(uuid.uuid1()) + + for new_text in streamer: + yield generate_completion_chunk( + chunk_text=new_text, + finish_reason=None, + chunk_id=completion_id, + model_uid=self.model_uid, + prompt_tokens=-1, + completion_tokens=-1, + total_tokens=-1, + has_choice=True, + has_content=True, + ) + + yield generate_completion_chunk( + chunk_text=None, + finish_reason="stop", + chunk_id=completion_id, + model_uid=self.model_uid, + prompt_tokens=-1, + completion_tokens=-1, + total_tokens=-1, + has_choice=True, + has_content=False, + ) + + def _convert_video_tensors_to_pil(self, video_inputs: List) -> List[Image.Image]: + """Convert video tensors to a list of PIL images""" + from torchvision import transforms + + to_pil = transforms.ToPILImage() + pil_images = [] + + for video_tensor_4d in video_inputs: + if isinstance(video_tensor_4d, torch.Tensor): + # Verify it's a 4D tensor + if video_tensor_4d.ndim == 4: + # Iterate through the first dimension (frames) of 4D tensor + for i in range(video_tensor_4d.size(0)): + frame_tensor_3d = video_tensor_4d[ + i + ] # Get 3D frame tensor [C, H, W] + # Ensure tensor is on CPU before conversion + if frame_tensor_3d.is_cuda: + frame_tensor_3d = frame_tensor_3d.cpu() + try: + pil_image = to_pil(frame_tensor_3d) + pil_images.append(pil_image) + except Exception as e: + logger.error( + f"Error converting frame {i} to PIL Image: {e}" + ) + # Can choose to skip this frame or handle error differently + else: + logger.warning( + f"Expected 4D tensor in video_inputs, but got {video_tensor_4d.ndim}D. Skipping this tensor." + ) + elif isinstance(video_tensor_4d, Image.Image): + # If fetch_video returns Image list, add directly + pil_images.append(video_tensor_4d) + else: + logger.warning( + f"Unexpected type in video_inputs: {type(video_tensor_4d)}. Skipping." + ) + + return pil_images From 0c727d4f2df8f6f6851bb5b1124efdfe82fd4dc7 Mon Sep 17 00:00:00 2001 From: Minamiyama Date: Sat, 3 May 2025 03:00:15 +0800 Subject: [PATCH 4/7] [WIP]model vision ability --- xinference/model/llm/llm_family.json | 4 +++- xinference/model/llm/llm_family_modelscope.json | 4 +++- xinference/model/llm/transformers/kimi_vl.py | 2 +- 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/xinference/model/llm/llm_family.json b/xinference/model/llm/llm_family.json index 1649aab0a3..4a1e254d18 100644 --- a/xinference/model/llm/llm_family.json +++ b/xinference/model/llm/llm_family.json @@ -9090,7 +9090,8 @@ "zh" ], "model_ability": [ - "chat" + "chat", + "vision" ], "model_description": "Kimi-VL, an efficient open-source Mixture-of-Experts (MoE) vision-language model (VLM) that offers advanced multimodal reasoning, long-context understanding, and strong agent capabilities", "model_specs": [ @@ -9122,6 +9123,7 @@ ], "model_ability": [ "chat", + "vision", "reasoning" ], "model_description": "Kimi-VL, an efficient open-source Mixture-of-Experts (MoE) vision-language model (VLM) that offers advanced multimodal reasoning, long-context understanding, and strong agent capabilities", diff --git a/xinference/model/llm/llm_family_modelscope.json b/xinference/model/llm/llm_family_modelscope.json index 1f7a4596e3..633fe21f66 100644 --- a/xinference/model/llm/llm_family_modelscope.json +++ b/xinference/model/llm/llm_family_modelscope.json @@ -7243,7 +7243,8 @@ "zh" ], "model_ability": [ - "chat" + "chat", + "vision" ], "model_description": "Kimi-VL, an efficient open-source Mixture-of-Experts (MoE) vision-language model (VLM) that offers advanced multimodal reasoning, long-context understanding, and strong agent capabilities", "model_specs": [ @@ -7276,6 +7277,7 @@ ], "model_ability": [ "chat", + "vision", "reasoning" ], "model_description": "Kimi-VL, an efficient open-source Mixture-of-Experts (MoE) vision-language model (VLM) that offers advanced multimodal reasoning, long-context understanding, and strong agent capabilities", diff --git a/xinference/model/llm/transformers/kimi_vl.py b/xinference/model/llm/transformers/kimi_vl.py index b1f18427d4..03bf2a31d2 100644 --- a/xinference/model/llm/transformers/kimi_vl.py +++ b/xinference/model/llm/transformers/kimi_vl.py @@ -51,7 +51,7 @@ def match_json( if model_spec.model_format not in ["pytorch", "gptq", "awq"]: return False llm_family = model_family.model_family or model_family.model_name - if "kimi-vl".lower() in llm_family.lower(): + if "kimi-vl-".lower() in llm_family.lower(): return True return False From eefa8e94b791bc63b177f06b416440bd29ae81a6 Mon Sep 17 00:00:00 2001 From: Minamiyama Date: Sat, 3 May 2025 09:59:58 +0800 Subject: [PATCH 5/7] [WIP]generate --- xinference/model/llm/transformers/kimi_vl.py | 107 +++++-------------- 1 file changed, 24 insertions(+), 83 deletions(-) diff --git a/xinference/model/llm/transformers/kimi_vl.py b/xinference/model/llm/transformers/kimi_vl.py index 03bf2a31d2..2993d57fff 100644 --- a/xinference/model/llm/transformers/kimi_vl.py +++ b/xinference/model/llm/transformers/kimi_vl.py @@ -17,6 +17,7 @@ import torch from PIL import Image +from torch.cuda import temperature from ....model.utils import select_device from ....types import ( @@ -25,9 +26,9 @@ ChatCompletionMessage, CompletionChunk, ) -from ..llm_family import LLMFamilyV1, LLMSpecV1 +from ..llm_family import LLMFamilyV1, LLMSpecV1, register_transformer from ..utils import generate_chat_completion, generate_completion_chunk -from .core import PytorchChatModel, PytorchGenerateConfig +from .core import PytorchChatModel, PytorchGenerateConfig, register_non_default_model from .utils import cache_clean logger = logging.getLogger(__name__) @@ -77,6 +78,9 @@ def load(self): "attn_implementation": "flash_attention_2" }) + kwargs = self.apply_bnb_quantization() + model_kwargs.update(kwargs) + self._model = AutoModelForCausalLM.from_pretrained(**model_kwargs) self._processor = AutoProcessor.from_pretrained(self.model_path, trust_remote_code=True) @@ -93,8 +97,11 @@ def chat( stream = generate_config.get("stream", False) if generate_config else False if stream: - it = self._generate_stream(messages, generate_config) - return self._to_chat_completion_chunks(it) + raise NotImplementedError( + "Kimi-VL-A3B-Instruct does not support stream generation yet." + ) + # it = self._generate_stream(messages, generate_config) + # return self._to_chat_completion_chunks(it) else: c = self._generate(messages, generate_config) return c @@ -102,89 +109,23 @@ def chat( def _generate( self, messages: List, config: PytorchGenerateConfig = {} ) -> ChatCompletion: - input_ids, attention_mask, pixel_values, gen_kwargs = self._generate_chat_data( - messages, config - ) - - # generate output - with torch.inference_mode(): - gen_kwargs.update( - dict( - pixel_values=pixel_values, - attention_mask=attention_mask, - ) - ) - - output_ids = self._model.generate( - input_ids, - **gen_kwargs, - )[0] - output = self._text_tokenizer.decode(output_ids, skip_special_tokens=True) - return generate_chat_completion(self.model_uid, output) + text = self._processor.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt") + image = None + inputs = self._processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(self._model.device) + generated_ids = self._model.generate(**inputs, max_new_tokens=config.get("max_tokens", 2048), temperature=config.get("temperature", 0.7)) + generated_ids_trimmed = [ + out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids) + ] + response = self._processor.batch_decode( + generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False + )[0] + logger.info(f"============输出: {response}") + return generate_chat_completion(self.model_uid, response) def _generate_stream( self, messages: List, config: PytorchGenerateConfig = {} ) -> Iterator[CompletionChunk]: - from threading import Thread - - from transformers import TextIteratorStreamer - - input_ids, attention_mask, pixel_values, gen_kwargs = self._generate_chat_data( - messages, config - ) - - _, inputs_embeds, _, attention_mask = self._model.merge_multimodal( - text_input_ids=input_ids, - text_attention_masks=attention_mask, - text_labels=None, - pixel_values=pixel_values, - left_padding=True, - ) - - streamer = TextIteratorStreamer( - self._text_tokenizer, timeout=60, skip_prompt=True, skip_special_tokens=True - ) - - gen_kwargs.update( - dict( - inputs_embeds=inputs_embeds, - attention_mask=attention_mask, - streamer=streamer, - ) - ) - - inputs_embeds = inputs_embeds.detach() - torch.cuda.empty_cache() - - thread = Thread(target=self._model.llm.generate, kwargs=gen_kwargs) - thread.start() - - completion_id = str(uuid.uuid1()) - - for new_text in streamer: - yield generate_completion_chunk( - chunk_text=new_text, - finish_reason=None, - chunk_id=completion_id, - model_uid=self.model_uid, - prompt_tokens=-1, - completion_tokens=-1, - total_tokens=-1, - has_choice=True, - has_content=True, - ) - - yield generate_completion_chunk( - chunk_text=None, - finish_reason="stop", - chunk_id=completion_id, - model_uid=self.model_uid, - prompt_tokens=-1, - completion_tokens=-1, - total_tokens=-1, - has_choice=True, - has_content=False, - ) + pass def _convert_video_tensors_to_pil(self, video_inputs: List) -> List[Image.Image]: """Convert video tensors to a list of PIL images""" From 26836d29b540d08658cb18dddc4187e6c7156b52 Mon Sep 17 00:00:00 2001 From: Minamiyama Date: Mon, 21 Jul 2025 14:34:06 +0800 Subject: [PATCH 6/7] feat(model): add Kimi-VL-A3B models to llm_family.json Add support for Kimi-VL-A3B-Instruct and Kimi-VL-A3B-Thinking-2506 vision-language models with multimodal reasoning capabilities --- xinference/model/llm/llm_family.json | 102 +++++++++++++++++++++++++++ 1 file changed, 102 insertions(+) diff --git a/xinference/model/llm/llm_family.json b/xinference/model/llm/llm_family.json index bd0ba06e0a..f9dbe65425 100644 --- a/xinference/model/llm/llm_family.json +++ b/xinference/model/llm/llm_family.json @@ -18670,5 +18670,107 @@ "#system_numpy#" ] } + }, + { + "version": 2, + "context_length": 128000, + "model_name": "Kimi-VL-A3B-Instruct", + "model_lang": [ + "en", + "zh" + ], + "model_ability": [ + "chat", + "vision", + "reasoning" + ], + "model_description": "Kimi-VL, an efficient open-source Mixture-of-Experts (MoE) vision-language model (VLM) that offers advanced multimodal reasoning, long-context understanding, and strong agent capabilities", + "model_specs": [ + { + "model_format": "pytorch", + "model_size_in_billions": 16, + "activated_size_in_billions": 3, + "model_src": { + "huggingface": { + "quantizations": [ + "none" + ], + "model_id": "moonshotai/Kimi-VL-A3B-Instruct" + }, + "modelscope": { + "quantizations": [ + "none" + ], + "model_id": "moonshotai/Kimi-VL-A3B-Instruct", + "model_revision": "master" + } + } + } + ], + "chat_template": "{%- for message in messages -%}{%- if loop.first and messages[0]['role'] != 'system' -%}{{'<|im_system|>system<|im_middle|>You are a helpful assistant<|im_end|>'}}{%- endif -%}{%- if message['role'] == 'system' -%}{{'<|im_system|>'}}{%- endif -%}{%- if message['role'] == 'user' -%}{{'<|im_user|>'}}{%- endif -%}{%- if message['role'] == 'assistant' -%}{{'<|im_assistant|>'}}{%- endif -%}{{- message['role'] -}}{{'<|im_middle|>'}}{%- if message['content'] is string -%}{{- message['content'] + '<|im_end|>' -}}{%- else -%}{%- for content in message['content'] -%}{%- if content['type'] == 'image' or 'image' in content or 'image_url' in content -%}{{'<|media_start|>image<|media_content|><|media_pad|><|media_end|>'}}{%- else -%}{{content['text']}}{%- endif -%}{%- endfor -%}{{'<|im_end|>'}}{%- endif -%}{%- endfor -%}{%- if add_generation_prompt -%}{{'<|im_assistant|>assistant<|im_middle|>'}}{%- endif -%}", + "stop_token_ids": [ + 163586 + ], + "stop": [ + "<|im_end|>" + ], + "reasoning_start_tag": "", + "reasoning_end_tag": "", + "virtualenv": { + "packages": [ + "transformers>=4.51.3" + ] + } + }, + { + "version": 2, + "context_length": 128000, + "model_name": "Kimi-VL-A3B-Thinking-2506", + "model_lang": [ + "en", + "zh" + ], + "model_ability": [ + "chat", + "vision", + "reasoning" + ], + "model_description": "Kimi-VL, an efficient open-source Mixture-of-Experts (MoE) vision-language model (VLM) that offers advanced multimodal reasoning, long-context understanding, and strong agent capabilities", + "model_specs": [ + { + "model_format": "pytorch", + "model_size_in_billions": 16, + "activated_size_in_billions": 3, + "model_src": { + "huggingface": { + "quantizations": [ + "none" + ], + "model_id": "moonshotai/Kimi-VL-A3B-Thinking-2506" + }, + "modelscope": { + "quantizations": [ + "none" + ], + "model_id": "moonshotai/Kimi-VL-A3B-Thinking-2506", + "model_revision": "master" + } + } + } + ], + "chat_template": "{%- for message in messages -%}{%- if loop.first and messages[0]['role'] != 'system' -%}{{'<|im_system|>system<|im_middle|>You are a helpful assistant<|im_end|>'}}{%- endif -%}{%- if message['role'] == 'system' -%}{{'<|im_system|>'}}{%- endif -%}{%- if message['role'] == 'user' -%}{{'<|im_user|>'}}{%- endif -%}{%- if message['role'] == 'assistant' -%}{{'<|im_assistant|>'}}{%- endif -%}{{- message['role'] -}}{{'<|im_middle|>'}}{%- if message['content'] is string -%}{{- message['content'] + '<|im_end|>' -}}{%- else -%}{%- for content in message['content'] -%}{%- if content['type'] == 'image' or 'image' in content or 'image_url' in content -%}{{'<|media_start|>image<|media_content|><|media_pad|><|media_end|>'}}{%- else -%}{{content['text']}}{%- endif -%}{%- endfor -%}{{'<|im_end|>'}}{%- endif -%}{%- endfor -%}{%- if add_generation_prompt -%}{{'<|im_assistant|>assistant<|im_middle|>'}}{%- endif -%}", + "stop_token_ids": [ + 163586 + ], + "stop": [ + "<|im_end|>" + ], + "reasoning_start_tag": "", + "reasoning_end_tag": "", + "virtualenv": { + "packages": [ + "transformers>=4.51.3" + ] + } } ] From 64f7e1655f25f4f8a415abf1e7e7697df1d47fd0 Mon Sep 17 00:00:00 2001 From: Minamiyama Date: Mon, 21 Jul 2025 14:36:21 +0800 Subject: [PATCH 7/7] fix(vllm): update Kimi-VL-A3B-Thinking model name to include version --- xinference/model/llm/vllm/core.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xinference/model/llm/vllm/core.py b/xinference/model/llm/vllm/core.py index 90a221612e..1c89fb1ae8 100644 --- a/xinference/model/llm/vllm/core.py +++ b/xinference/model/llm/vllm/core.py @@ -259,7 +259,7 @@ class VLLMGenerateConfig(TypedDict, total=False): if VLLM_INSTALLED and vllm.__version__ >= "0.8.5": VLLM_SUPPORTED_CHAT_MODELS.append("qwen3") VLLM_SUPPORTED_VISION_MODEL_LIST.append("Kimi-VL-A3B-Instruct") - VLLM_SUPPORTED_VISION_MODEL_LIST.append("Kimi-VL-A3B-Thinking") + VLLM_SUPPORTED_VISION_MODEL_LIST.append("Kimi-VL-A3B-Thinking-2506") if VLLM_INSTALLED and vllm.__version__ >= "0.9.1": VLLM_SUPPORTED_CHAT_MODELS.append("minicpm4")