diff --git a/.gitignore b/.gitignore index c83750f5..c5e9bb56 100644 --- a/.gitignore +++ b/.gitignore @@ -15,4 +15,5 @@ ffmprobe* ffplay* debug exp_out -.gradio \ No newline at end of file +.gradio +venv/ \ No newline at end of file diff --git a/musetalk/utils/blending.py b/musetalk/utils/blending.py index fa3effcd..35f69426 100755 --- a/musetalk/utils/blending.py +++ b/musetalk/utils/blending.py @@ -78,7 +78,8 @@ def get_image(image, face, face_box, upper_boundary_ratio=0.5, expand=1.5, mode= # 对掩码进行高斯模糊,使边缘更平滑 - blur_kernel_size = int(0.05 * ori_shape[0] // 2 * 2) + 1 # 计算模糊核大小 + # Optimized blur kernel at 0.15 - balances smooth edges without affecting lip stability + blur_kernel_size = int(0.15 * ori_shape[0] // 2 * 2) + 1 # 计算模糊核大小 mask_array = cv2.GaussianBlur(np.array(modified_mask_image), (blur_kernel_size, blur_kernel_size), 0) # 高斯模糊 #mask_array = np.array(modified_mask_image) mask_image = Image.fromarray(mask_array) # 将模糊后的掩码转换回 PIL 图像 @@ -131,6 +132,7 @@ def get_image_prepare_material(image, face_box, upper_boundary_ratio=0.5, expand modified_mask_image = Image.new('L', ori_shape, 0) modified_mask_image.paste(mask_image.crop((0, top_boundary, width, height)), (0, top_boundary)) - blur_kernel_size = int(0.1 * ori_shape[0] // 2 * 2) + 1 + # Optimized blur kernel at 0.15 - balances smooth edges without affecting lip stability + blur_kernel_size = int(0.15 * ori_shape[0] // 2 * 2) + 1 mask_array = cv2.GaussianBlur(np.array(modified_mask_image), (blur_kernel_size, blur_kernel_size), 0) return mask_array, crop_box diff --git a/scripts/realtime_inference.py b/scripts/realtime_inference.py index 579b050f..b801ad2a 100644 --- a/scripts/realtime_inference.py +++ b/scripts/realtime_inference.py @@ -181,9 +181,9 @@ def prepare_material(self): latents = vae.get_latents_for_unet(resized_crop_frame) input_latent_list.append(latents) - self.frame_list_cycle = frame_list + frame_list[::-1] - self.coord_list_cycle = coord_list + coord_list[::-1] - self.input_latent_list_cycle = input_latent_list + input_latent_list[::-1] + self.frame_list_cycle = frame_list + self.coord_list_cycle = coord_list + self.input_latent_list_cycle = input_latent_list self.mask_coords_list_cycle = [] self.mask_list_cycle = [] @@ -211,6 +211,9 @@ def prepare_material(self): def process_frames(self, res_frame_queue, video_len, skip_save_images): print(video_len) + prev_combine_frame = None # Store previous frame for temporal smoothing + temporal_alpha = 0.3 # Smoothing factor: 0.3 means 30% previous + 70% current + while True: if self.idx >= video_len - 1: break @@ -224,13 +227,30 @@ def process_frames(self, res_frame_queue, video_len, skip_save_images): ori_frame = copy.deepcopy(self.frame_list_cycle[self.idx % (len(self.frame_list_cycle))]) x1, y1, x2, y2 = bbox try: - res_frame = cv2.resize(res_frame.astype(np.uint8), (x2 - x1, y2 - y1)) + # Use LANCZOS4 for higher quality upscaling of lip-sync region + res_frame = cv2.resize(res_frame.astype(np.uint8), (x2 - x1, y2 - y1), interpolation=cv2.INTER_LANCZOS4) except: continue mask = self.mask_list_cycle[self.idx % (len(self.mask_list_cycle))] mask_crop_box = self.mask_coords_list_cycle[self.idx % (len(self.mask_coords_list_cycle))] combine_frame = get_image_blending(ori_frame,res_frame,bbox,mask,mask_crop_box) + # Apply sharpening to the lip-sync region to match rest of video + # Extract the blended region and apply unsharp mask + lip_region = combine_frame[y1:y2, x1:x2] + gaussian = cv2.GaussianBlur(lip_region, (0, 0), 2.0) + sharpened_lip = cv2.addWeighted(lip_region, 1.5, gaussian, -0.5, 0) + combine_frame[y1:y2, x1:x2] = sharpened_lip + + # Apply temporal smoothing to reduce stuttering/jitter + if prev_combine_frame is not None: + combine_frame = cv2.addWeighted( + prev_combine_frame, temporal_alpha, + combine_frame, 1 - temporal_alpha, + 0 + ) + prev_combine_frame = combine_frame.copy() + if skip_save_images is False: cv2.imwrite(f"{self.avatar_path}/tmp/{str(self.idx).zfill(8)}.png", combine_frame) self.idx = self.idx + 1 @@ -292,13 +312,14 @@ def inference(self, audio_path, out_vid_name, fps, skip_save_images): time.time() - start_time)) if out_vid_name is not None and args.skip_save_images is False: - # optional - cmd_img2video = f"ffmpeg -y -v warning -r {fps} -f image2 -i {self.avatar_path}/tmp/%08d.png -vcodec libx264 -vf format=yuv420p -crf 18 {self.avatar_path}/temp.mp4" + # optional - using CRF 15 for higher quality lip-sync preservation + cmd_img2video = f"ffmpeg -y -v warning -r {fps} -f image2 -i {self.avatar_path}/tmp/%08d.png -vcodec libx264 -vf format=yuv420p -crf 15 {self.avatar_path}/temp.mp4" print(cmd_img2video) os.system(cmd_img2video) output_vid = os.path.join(self.video_out_path, out_vid_name + ".mp4") # on - cmd_combine_audio = f"ffmpeg -y -v warning -i {audio_path} -i {self.avatar_path}/temp.mp4 {output_vid}" + # Explicitly set output fps to match generated frames and improve quality + cmd_combine_audio = f"ffmpeg -y -v warning -i {audio_path} -i {self.avatar_path}/temp.mp4 -c:v libx264 -preset slow -crf 18 -r {fps} -c:a aac -b:a 192k {output_vid}" print(cmd_combine_audio) os.system(cmd_combine_audio)