From 7689c3282b712d4a273ad264c99331e3832a9c3f Mon Sep 17 00:00:00 2001 From: smy <18813151339@163.com> Date: Tue, 2 Jun 2026 16:41:41 +0800 Subject: [PATCH 1/2] =?UTF-8?q?=E8=A7=A3=E5=86=B3=E5=AF=B9Blackwell?= =?UTF-8?q?=E6=9E=B6=E6=9E=84=E6=98=BE=E5=8D=A1=E4=B8=8D=E6=94=AF=E6=8C=81?= =?UTF-8?q?=E9=97=AE=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.md | 118 +++++++++++++++++- create_symlinks.sh | 69 ++++++++++ download_weights-bak.sh | 51 ++++++++ download_weights.sh | 0 entrypoint.sh | 0 inference.sh | 0 .../detection/sfd/sfd_detector.py | 5 +- musetalk/utils/preprocessing.py | 11 ++ scripts/inference.py | 10 +- train.sh | 0 10 files changed, 256 insertions(+), 8 deletions(-) create mode 100755 create_symlinks.sh create mode 100755 download_weights-bak.sh mode change 100644 => 100755 download_weights.sh mode change 100644 => 100755 entrypoint.sh mode change 100644 => 100755 inference.sh mode change 100644 => 100755 train.sh diff --git a/README.md b/README.md index 49e76c44..4dec18c3 100644 --- a/README.md +++ b/README.md @@ -222,25 +222,28 @@ You can also download the weights manually from the following links: 1. Download our trained [weights](https://huggingface.co/TMElyralab/MuseTalk/tree/main) 2. 
Download the weights of other components: - [sd-vae-ft-mse](https://huggingface.co/stabilityai/sd-vae-ft-mse/tree/main) - - [whisper](https://huggingface.co/openai/whisper-tiny/tree/main) + - [whisper](https://huggingface.co/openai/whisper-tiny/tree/main) (make sure to include `preprocessor_config.json`) - [dwpose](https://huggingface.co/yzd-v/DWPose/tree/main) - [syncnet](https://huggingface.co/ByteDance/LatentSync/tree/main) - [face-parse-bisent](https://drive.google.com/file/d/154JgKpzCPW82qINcVieuPH3fZ2e0P812/view?pli=1) - [resnet18](https://download.pytorch.org/models/resnet18-5c106cde.pth) + - [s3fd](https://www.adrianbulat.com/downloads/python-fan/s3fd-619a316812.pth) (rename to `s3fd.pth` and place in `models/s3fd/`) Finally, these weights should be organized in `models` as follows: ``` ./models/ ├── musetalk -│ └── musetalk.json +│ ├── musetalk.json │ └── pytorch_model.bin ├── musetalkV15 -│ └── musetalk.json +│ ├── musetalk.json │ └── unet.pth ├── syncnet │ └── latentsync_syncnet.pt ├── dwpose │ └── dw-ll_ucoco_384.pth +├── s3fd +│ └── s3fd.pth ├── face-parse-bisent │ ├── 79999_iter.pth │ └── resnet18-5c106cde.pth @@ -251,8 +254,13 @@ Finally, these weights should be organized in `models` as follows: ├── config.json ├── pytorch_model.bin └── preprocessor_config.json - + ``` + +> **Note:** +> - `s3fd/s3fd.pth` is the SFD face detection model weight, download from [s3fd-619a316812.pth](https://www.adrianbulat.com/downloads/python-fan/s3fd-619a316812.pth) and rename to `s3fd.pth`. +> - `whisper/preprocessor_config.json` is required by `AutoFeatureExtractor`. If it is missing from the downloaded whisper weights, you can download it from [openai/whisper-tiny](https://huggingface.co/openai/whisper-tiny/blob/main/preprocessor_config.json). +> - If you encounter `ImportError: cannot import name 'split_torch_state_dict_into_shards' from 'huggingface_hub'`, upgrade `huggingface_hub` to fix the version incompatibility: `pip install --upgrade huggingface_hub`. 
## Quickstart ### Inference @@ -512,6 +520,108 @@ python -m scripts.inference --inference_config configs/inference/test.yaml --bbo As a complete solution to virtual human generation, you are suggested to first apply [MuseV](https://github.com/TMElyralab/MuseV) to generate a video (text-to-video, image-to-video or pose-to-video) by referring [this](https://github.com/TMElyralab/MuseV?tab=readme-ov-file#text2video). Frame interpolation is suggested to increase frame rate. Then, you can use `MuseTalk` to generate a lip-sync video by referring [this](https://github.com/TMElyralab/MuseTalk?tab=readme-ov-file#inference). +# Troubleshooting + +## New GPU Architecture (Blackwell sm_120) + +If you're using an **NVIDIA RTX PRO 6000 Blackwell** or another new GPU with the `sm_120` architecture, you'll encounter: + +``` +CUDA error: no kernel image is available for execution on the device +The current PyTorch install supports CUDA capabilities sm_37 sm_50 ... sm_90. +``` + +This is because the default PyTorch 2.0.1 build was not compiled for the Blackwell architecture. Follow these steps: + +### Step 1: Upgrade PyTorch + +Install **PyTorch 2.7+ built for CUDA 12.8**, the first stable release whose pre-built wheels include `sm_120` kernels (the CUDA 12.6 wheels do not): + +```bash +# For CUDA 12.8 (tested with PyTorch 2.9.0) +pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu128 + +# Confirm that the installed build lists sm_120 +python -c "import torch; print(torch.__version__, torch.cuda.get_arch_list())" +``` + +### Step 2: Reinstall mmcv for the New CUDA Version + +After upgrading PyTorch, the existing mmcv (compiled for older CUDA) will fail with: + +``` +ImportError: libcudart.so.11.0: cannot open shared object file: No such file or directory +``` + +**Important version constraint:** `mmdet 3.1.0` and `mmpose 1.1.0` require **mmcv 2.0.1 ~ 2.1.x**. Do **not** use mmcv 2.2.0+: mmdet and mmpose check the installed mmcv version at import time and raise `AssertionError` when it is outside the supported range. 
+ +If `mim install mmcv` finds a pre-built wheel for your CUDA/PyTorch combo: +```bash +pip uninstall mmcv -y +mim install "mmcv==2.1.0" +``` + +If there is **no pre-built wheel** (common for new CUDA/PyTorch combinations), install from source: +```bash +pip install setuptools +pip install mmcv==2.1.0 --no-binary mmcv --no-build-isolation +``` + +The `--no-build-isolation` flag is required so that pip uses the `setuptools` already in your environment instead of creating an isolated build environment (which may lack `pkg_resources`). + +### Step 3: Verify Installation + +```bash +python -c "import mmcv; import mmdet; import mmpose; print('mmcv:', mmcv.__version__); print('mmdet:', mmdet.__version__); print('mmpose:', mmpose.__version__)" +``` + +Expected output: +``` +mmcv: 2.1.0 +mmdet: 3.1.0 +mmpose: 1.1.0 +``` + +## PyTorch 2.6+ Checkpoint Loading Error + +If you see `_pickle.UnpicklingError: Weights only load failed` when loading the dwpose checkpoint: + +``` +WeightsUnpickler error: Unsupported global: GLOBAL numpy.core.multiarray._reconstruct +``` + +This is because PyTorch 2.6+ changed the default of `torch.load` to `weights_only=True`. The code has been patched to handle this automatically: `preprocessing.py` now defaults `torch.load` to `weights_only=False`, so only load checkpoints you trust. If you encounter this issue, make sure you have the latest version of `preprocessing.py`. + +## FFmpeg Missing PNG Encoder + +``` +[vost#0:0] Automatic encoder selection failed Default encoder for format image2 (codec png) is probably disabled. +Error opening output files: Encoder not found +``` + +This usually happens when the FFmpeg binary found first on `PATH` was built without the PNG encoder (e.g., a minimal static build configured without zlib, which the PNG encoder depends on). + +**Solution 1**: Use system FFmpeg: +```bash +sudo apt-get install ffmpeg # Ubuntu/Debian +``` + +**Solution 2**: Specify the directory that contains the `ffmpeg` binary when running inference: +```bash +python -m scripts.inference --ffmpeg_path /usr/bin ... 
+``` + +**Solution 3**: Add system FFmpeg to PATH with higher priority: +```bash +export PATH=/usr/bin:$PATH +``` + +**Verify** your FFmpeg has PNG encoder: +```bash +ffmpeg -encoders | grep png +# Should show: VF...D png PNG (Portable Network Graphics) image +``` + # Acknowledgement 1. We thank open-source components like [whisper](https://github.com/openai/whisper), [dwpose](https://github.com/IDEA-Research/DWPose), [face-alignment](https://github.com/1adrianb/face-alignment), [face-parsing](https://github.com/zllrunning/face-parsing.PyTorch), [S3FD](https://github.com/yxlijun/S3FD.pytorch) and [LatentSync](https://huggingface.co/ByteDance/LatentSync/tree/main). 1. MuseTalk has referred much to [diffusers](https://github.com/huggingface/diffusers) and [isaacOnline/whisper](https://github.com/isaacOnline/whisper/tree/extract-embeddings). diff --git a/create_symlinks.sh b/create_symlinks.sh new file mode 100755 index 00000000..211ec138 --- /dev/null +++ b/create_symlinks.sh @@ -0,0 +1,69 @@ +#!/bin/bash + +# Source directory holding one sub-folder per downloaded model (optional 1st argument overrides the default) +SOURCE_DIR="${1:-/mnt/f/Models_Download/MuseTalk}" + +# Target directory, i.e. the project's models directory (optional 2nd argument overrides the default) +TARGET_DIR="${2:-/home/smy/LM_projects/agentscope/MuseTalk/models}" + +echo "=========================================" +echo "创建 MuseTalk 模型软链接" +echo "=========================================" +echo "源目录: $SOURCE_DIR" +echo "目标目录: $TARGET_DIR" +echo "=========================================" + +# Abort when the source directory does not exist +if [ ! -d "$SOURCE_DIR" ]; then + echo "错误: 源目录不存在: $SOURCE_DIR" + exit 1 +fi + +# Create the target directory if it is missing +if [ ! -d "$TARGET_DIR" ]; then + echo "创建目标目录: $TARGET_DIR" + mkdir -p "$TARGET_DIR" +fi + +# Walk every sub-folder of the source directory; count tracks the links actually created +count=0 +for folder in "$SOURCE_DIR"/*/; do + # Guards against the unexpanded glob pattern when there are no sub-folders + if [ -d "$folder" ]; then + # Folder name without its parent path + folder_name=$(basename "$folder") + + # Skip hidden folders (names starting with a dot)
+ if [[ "$folder_name" == .* ]]; then + echo "跳过隐藏文件夹: $folder_name" + continue + fi + + # Never overwrite: skip when the target already exists, including dangling symlinks (-L) + if [ -e "$TARGET_DIR/$folder_name" ] || [ -L "$TARGET_DIR/$folder_name" ]; then + echo "警告: 目标已存在,跳过: $folder_name" + continue + fi + + # Resolve the source folder to an absolute path so the link works from any directory + source_absolute_path=$(cd "$folder" && pwd) + + # Create the symlink and report the outcome + ln -s "$source_absolute_path" "$TARGET_DIR/$folder_name" + if [ $? -eq 0 ]; then + echo "✓ 创建软链接: $folder_name -> $source_absolute_path" + count=$((count + 1)) + else + echo "✗ 创建软链接失败: $folder_name" + fi + fi +done + +echo "=========================================" +echo "完成!共创建了 $count 个软链接" +echo "=========================================" + +# List the target directory so the new links can be checked +echo "" +echo "创建的软链接列表:" +ls -la "$TARGET_DIR" diff --git a/download_weights-bak.sh b/download_weights-bak.sh new file mode 100755 index 00000000..affa5a9d --- /dev/null +++ b/download_weights-bak.sh @@ -0,0 +1,51 @@ +#!/bin/bash + +# Set the checkpoints directory +CheckpointsDir="models" + +# Create necessary directories +mkdir -p models/musetalk models/musetalkV15 models/syncnet models/dwpose models/face-parse-bisent models/sd-vae models/whisper + +# Install required packages +pip install -U "huggingface_hub[cli]" +pip install gdown + +# Set HuggingFace mirror endpoint +export HF_ENDPOINT=https://hf-mirror.com + +# Download MuseTalk V1.0 weights +huggingface-cli download TMElyralab/MuseTalk \ + --local-dir $CheckpointsDir \ + --include "musetalk/musetalk.json" "musetalk/pytorch_model.bin" + +# Download MuseTalk V1.5 weights (unet.pth) +huggingface-cli download TMElyralab/MuseTalk \ + --local-dir $CheckpointsDir \ + --include "musetalkV15/musetalk.json" "musetalkV15/unet.pth" + +# Download SD VAE weights +huggingface-cli download stabilityai/sd-vae-ft-mse \ + --local-dir $CheckpointsDir/sd-vae \ + --include "config.json" "diffusion_pytorch_model.bin" + +# Download Whisper weights +huggingface-cli download openai/whisper-tiny \ + --local-dir $CheckpointsDir/whisper \ + --include "config.json" 
"pytorch_model.bin" "preprocessor_config.json" + +# Download DWPose weights +huggingface-cli download yzd-v/DWPose \ + --local-dir $CheckpointsDir/dwpose \ + --include "dw-ll_ucoco_384.pth" + +# Download SyncNet weights +huggingface-cli download ByteDance/LatentSync \ + --local-dir $CheckpointsDir/syncnet \ + --include "latentsync_syncnet.pt" + +# Download Face Parse Bisent weights +gdown --id 154JgKpzCPW82qINcVieuPH3fZ2e0P812 -O $CheckpointsDir/face-parse-bisent/79999_iter.pth +curl -L https://download.pytorch.org/models/resnet18-5c106cde.pth \ + -o $CheckpointsDir/face-parse-bisent/resnet18-5c106cde.pth + +echo "✅ All weights have been downloaded successfully!" diff --git a/download_weights.sh b/download_weights.sh old mode 100644 new mode 100755 diff --git a/entrypoint.sh b/entrypoint.sh old mode 100644 new mode 100755 diff --git a/inference.sh b/inference.sh old mode 100644 new mode 100755 diff --git a/musetalk/utils/face_detection/detection/sfd/sfd_detector.py b/musetalk/utils/face_detection/detection/sfd/sfd_detector.py index 8fbce152..f7fd7c1f 100755 --- a/musetalk/utils/face_detection/detection/sfd/sfd_detector.py +++ b/musetalk/utils/face_detection/detection/sfd/sfd_detector.py @@ -8,13 +8,16 @@ from .bbox import * from .detect import * +_PROJECT_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..', '..', '..', '..')) +_SFD_MODEL_PATH = os.path.join(_PROJECT_ROOT, 'models', 's3fd', 's3fd.pth') + models_urls = { 's3fd': 'https://www.adrianbulat.com/downloads/python-fan/s3fd-619a316812.pth', } class SFDDetector(FaceDetector): - def __init__(self, device, path_to_detector=os.path.join(os.path.dirname(os.path.abspath(__file__)), 's3fd.pth'), verbose=False): + def __init__(self, device, path_to_detector=_SFD_MODEL_PATH, verbose=False): super(SFDDetector, self).__init__(device, verbose) # Initialise the face detector diff --git a/musetalk/utils/preprocessing.py b/musetalk/utils/preprocessing.py index 978480c6..9e321938 100755 --- 
a/musetalk/utils/preprocessing.py +++ b/musetalk/utils/preprocessing.py @@ -12,6 +12,17 @@ import torch from tqdm import tqdm +# PyTorch 2.6+ defaults torch.load to weights_only=True, which rejects the pickled numpy objects in the dwpose checkpoint +import mmengine.runner.checkpoint as _ckpt_module +_orig_torch_load = torch.load +def _patched_torch_load(*args, **kwargs): + # Only fill in the default; an explicit weights_only passed by the caller is kept as-is + kwargs.setdefault('weights_only', False) + return _orig_torch_load(*args, **kwargs) +# NOTE: rebinding torch.load affects every later torch.load call in this process, not just mmengine - load trusted checkpoints only +torch.load = _patched_torch_load +_ckpt_module.torch.load = _patched_torch_load + # initialize the mmpose model device = torch.device("cuda" if torch.cuda.is_available() else "cpu") config_file = './musetalk/utils/dwpose/rtmpose-l_8xb32-270e_coco-ubody-wholebody-384x288.py' diff --git a/scripts/inference.py b/scripts/inference.py index 428afb99..20f0c640 100644 --- a/scripts/inference.py +++ b/scripts/inference.py @@ -29,8 +29,12 @@ def fast_check_ffmpeg(): @torch.no_grad() def main(args): - # Configure ffmpeg path - if not fast_check_ffmpeg(): + # Configure ffmpeg path - an existing --ffmpeg_path directory takes priority over whatever is already on PATH + if args.ffmpeg_path and os.path.isdir(args.ffmpeg_path): + print(f"Using ffmpeg from: {args.ffmpeg_path}") + # os.pathsep is the platform's PATH separator (';' on Windows, ':' elsewhere) + os.environ["PATH"] = f"{args.ffmpeg_path}{os.pathsep}{os.environ['PATH']}" + elif not fast_check_ffmpeg(): print("Adding ffmpeg to PATH") # Choose path separator based on operating system path_separator = ';' if sys.platform == 'win32' else ':' @@ -250,7 +254,7 @@ def main(args): if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument("--ffmpeg_path", type=str, default="./ffmpeg-4.4-amd64-static/", help="Path to ffmpeg executable") + parser.add_argument("--ffmpeg_path", type=str, default="/usr/bin", help="Directory containing the ffmpeg executable (prepended to PATH when it exists)") parser.add_argument("--gpu_id", type=int, default=0, help="GPU ID to use") parser.add_argument("--vae_type", type=str, default="sd-vae", help="Type of VAE model") 
parser.add_argument("--unet_config", type=str, default="./models/musetalk/config.json", help="Path to UNet configuration file") diff --git a/train.sh b/train.sh old mode 100644 new mode 100755 From 32d3b6c5f1e5da77e99db69bf1e3ef2b55b56839 Mon Sep 17 00:00:00 2001 From: smy <18813151339@163.com> Date: Tue, 2 Jun 2026 16:50:41 +0800 Subject: [PATCH 2/2] =?UTF-8?q?=E5=9C=A8readme=E4=B8=AD=E8=A1=A5=E5=85=85?= =?UTF-8?q?=E4=BA=86=E6=8E=A8=E7=90=86=E6=97=B6=E7=BC=BA=E5=B0=91=E7=9A=84?= =?UTF-8?q?s3fd=E6=A8=A1=E5=9E=8B=E7=9A=84=E4=BF=A1=E6=81=AF=EF=BC=8C?= =?UTF-8?q?=E5=B9=B6=E4=BF=AE=E6=94=B9=E4=BA=86sfd=5Fdetector.py=E8=84=9A?= =?UTF-8?q?=E6=9C=AC=E4=B8=AD=E8=AF=A5=E6=A8=A1=E5=9E=8B=E9=BB=98=E8=AE=A4?= =?UTF-8?q?=E8=B7=AF=E5=BE=84?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .gitignore | 3 ++- download_weights-bak.sh | 51 ----------------------------------------- 2 files changed, 2 insertions(+), 52 deletions(-) delete mode 100755 download_weights-bak.sh diff --git a/.gitignore b/.gitignore index c83750f5..0f53551c 100644 --- a/.gitignore +++ b/.gitignore @@ -15,4 +15,5 @@ ffmprobe* ffplay* debug exp_out -.gradio \ No newline at end of file +.gradio +.claude/ \ No newline at end of file diff --git a/download_weights-bak.sh b/download_weights-bak.sh deleted file mode 100755 index affa5a9d..00000000 --- a/download_weights-bak.sh +++ /dev/null @@ -1,51 +0,0 @@ -#!/bin/bash - -# Set the checkpoints directory -CheckpointsDir="models" - -# Create necessary directories -mkdir -p models/musetalk models/musetalkV15 models/syncnet models/dwpose models/face-parse-bisent models/sd-vae models/whisper - -# Install required packages -pip install -U "huggingface_hub[cli]" -pip install gdown - -# Set HuggingFace mirror endpoint -export HF_ENDPOINT=https://hf-mirror.com - -# Download MuseTalk V1.0 weights -huggingface-cli download TMElyralab/MuseTalk \ - --local-dir $CheckpointsDir \ - --include "musetalk/musetalk.json" 
"musetalk/pytorch_model.bin" - -# Download MuseTalk V1.5 weights (unet.pth) -huggingface-cli download TMElyralab/MuseTalk \ - --local-dir $CheckpointsDir \ - --include "musetalkV15/musetalk.json" "musetalkV15/unet.pth" - -# Download SD VAE weights -huggingface-cli download stabilityai/sd-vae-ft-mse \ - --local-dir $CheckpointsDir/sd-vae \ - --include "config.json" "diffusion_pytorch_model.bin" - -# Download Whisper weights -huggingface-cli download openai/whisper-tiny \ - --local-dir $CheckpointsDir/whisper \ - --include "config.json" "pytorch_model.bin" "preprocessor_config.json" - -# Download DWPose weights -huggingface-cli download yzd-v/DWPose \ - --local-dir $CheckpointsDir/dwpose \ - --include "dw-ll_ucoco_384.pth" - -# Download SyncNet weights -huggingface-cli download ByteDance/LatentSync \ - --local-dir $CheckpointsDir/syncnet \ - --include "latentsync_syncnet.pt" - -# Download Face Parse Bisent weights -gdown --id 154JgKpzCPW82qINcVieuPH3fZ2e0P812 -O $CheckpointsDir/face-parse-bisent/79999_iter.pth -curl -L https://download.pytorch.org/models/resnet18-5c106cde.pth \ - -o $CheckpointsDir/face-parse-bisent/resnet18-5c106cde.pth - -echo "✅ All weights have been downloaded successfully!"