mrbear1024 committed
Commit 8eb4303 · 0 Parent(s):

init project

This view is limited to 50 files because it contains too many changes.

Files changed (50)
  1. .gitattributes +12 -0
  2. .gitignore +201 -0
  3. LICENSE +21 -0
  4. README-zh.md +157 -0
  5. README.md +155 -0
  6. checkpoints/.gitkeep +0 -0
  7. data_gen/eg3d/convert_to_eg3d_convention.py +146 -0
  8. data_gen/runs/binarizer_nerf.py +335 -0
  9. data_gen/runs/binarizer_th1kh.py +100 -0
  10. data_gen/runs/nerf/process_guide.md +49 -0
  11. data_gen/runs/nerf/run.sh +51 -0
  12. data_gen/utils/mp_feature_extractors/face_landmarker.py +130 -0
  13. data_gen/utils/mp_feature_extractors/face_landmarker.task +3 -0
  14. data_gen/utils/mp_feature_extractors/mp_segmenter.py +303 -0
  15. data_gen/utils/mp_feature_extractors/selfie_multiclass_256x256.tflite +3 -0
  16. data_gen/utils/path_converter.py +24 -0
  17. data_gen/utils/process_audio/extract_hubert.py +92 -0
  18. data_gen/utils/process_audio/extract_mel_f0.py +148 -0
  19. data_gen/utils/process_audio/resample_audio_to_16k.py +49 -0
  20. data_gen/utils/process_image/extract_lm2d.py +197 -0
  21. data_gen/utils/process_image/extract_segment_imgs.py +114 -0
  22. data_gen/utils/process_image/fit_3dmm_landmark.py +369 -0
  23. data_gen/utils/process_video/euler2quaterion.py +35 -0
  24. data_gen/utils/process_video/extract_blink.py +50 -0
  25. data_gen/utils/process_video/extract_lm2d.py +164 -0
  26. data_gen/utils/process_video/extract_segment_imgs.py +494 -0
  27. data_gen/utils/process_video/fit_3dmm_landmark.py +565 -0
  28. data_gen/utils/process_video/inpaint_torso_imgs.py +193 -0
  29. data_gen/utils/process_video/resample_video_to_25fps_resize_to_512.py +87 -0
  30. data_gen/utils/process_video/split_video_to_imgs.py +53 -0
  31. data_util/face3d_helper.py +309 -0
  32. deep_3drecon/BFM/.gitkeep +0 -0
  33. deep_3drecon/BFM/basel_53201.txt +0 -0
  34. deep_3drecon/BFM/index_mp468_from_mesh35709_v1.npy +3 -0
  35. deep_3drecon/BFM/index_mp468_from_mesh35709_v2.npy +3 -0
  36. deep_3drecon/BFM/index_mp468_from_mesh35709_v3.1.npy +3 -0
  37. deep_3drecon/BFM/index_mp468_from_mesh35709_v3.npy +3 -0
  38. deep_3drecon/BFM/select_vertex_id.mat +0 -0
  39. deep_3drecon/BFM/similarity_Lm3D_all.mat +0 -0
  40. deep_3drecon/__init__.py +1 -0
  41. deep_3drecon/bfm_left_eye_faces.npy +3 -0
  42. deep_3drecon/bfm_right_eye_faces.npy +3 -0
  43. deep_3drecon/data_preparation.py +45 -0
  44. deep_3drecon/deep_3drecon_models/__init__.py +67 -0
  45. deep_3drecon/deep_3drecon_models/arcface_torch/README.md +218 -0
  46. deep_3drecon/deep_3drecon_models/arcface_torch/backbones/__init__.py +85 -0
  47. deep_3drecon/deep_3drecon_models/arcface_torch/backbones/iresnet.py +194 -0
  48. deep_3drecon/deep_3drecon_models/arcface_torch/backbones/iresnet2060.py +176 -0
  49. deep_3drecon/deep_3drecon_models/arcface_torch/backbones/mobilefacenet.py +147 -0
  50. deep_3drecon/deep_3drecon_models/arcface_torch/backbones/vit.py +280 -0
.gitattributes ADDED
@@ -0,0 +1,12 @@
+ data_gen/utils/mp_feature_extractors/selfie_multiclass_256x256.tflite filter=lfs diff=lfs merge=lfs -text
+ data_gen/utils/mp_feature_extractors/face_landmarker.task filter=lfs diff=lfs merge=lfs -text
+ utils/audio/pitch/bin/ReaperF0 filter=lfs diff=lfs merge=lfs -text
+ utils/audio/pitch/bin/ExtractF0ByStraight filter=lfs diff=lfs merge=lfs -text
+ utils/audio/pitch/bin/InterpF0 filter=lfs diff=lfs merge=lfs -text
+ deep_3drecon/bfm_left_eye_faces.npy filter=lfs diff=lfs merge=lfs -text
+ deep_3drecon/bfm_right_eye_faces.npy filter=lfs diff=lfs merge=lfs -text
+ deep_3drecon/ncc_code.npy filter=lfs diff=lfs merge=lfs -text
+ deep_3drecon/BFM/index_mp468_from_mesh35709_v1.npy filter=lfs diff=lfs merge=lfs -text
+ deep_3drecon/BFM/index_mp468_from_mesh35709_v2.npy filter=lfs diff=lfs merge=lfs -text
+ deep_3drecon/BFM/index_mp468_from_mesh35709_v3.1.npy filter=lfs diff=lfs merge=lfs -text
+ deep_3drecon/BFM/index_mp468_from_mesh35709_v3.npy filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,201 @@
+ # big files
+ data_util/face_tracking/3DMM/01_MorphableModel.mat
+ data_util/face_tracking/3DMM/3DMM_info.npy
+
+ !/deep_3drecon/BFM/.gitkeep
+ deep_3drecon/BFM/Exp_Pca.bin
+ deep_3drecon/BFM/01_MorphableModel.mat
+ deep_3drecon/BFM/BFM_model_front.mat
+ deep_3drecon/network/FaceReconModel.pb
+ deep_3drecon/checkpoints/*
+
+ .vscode
+ ### Project ignore
+ ./checkpoints/*
+ /checkpoints/*
+ !/checkpoints/.gitkeep
+ /data/*
+ !/data/.gitkeep
+ infer_out
+ rsync
+ .idea
+ .DS_Store
+ bak
+ tmp
+ *.tar.gz
+ mos
+ nbs
+ /configs_usr/*
+ !/configs_usr/.gitkeep
+ /egs_usr/*
+ !/egs_usr/.gitkeep
+ /rnnoise
+ #/usr/*
+ #!/usr/.gitkeep
+ scripts_usr
+
+ # Created by .ignore support plugin (hsz.mobi)
+ ### Python template
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ *.py[cod]
+ *$py.class
+
+ # C extensions
+ *.so
+
+ # Distribution / packaging
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ pip-wheel-metadata/
+ share/python-wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ MANIFEST
+
+ # PyInstaller
+ # Usually these files are written by a python script from a template
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
+ *.manifest
+ *.spec
+
+ # Installer logs
+ pip-log.txt
+ pip-delete-this-directory.txt
+
+ # Unit test / coverage reports
+ htmlcov/
+ .tox/
+ .nox/
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.cover
+ .hypothesis/
+ .pytest_cache/
+
+ # Translations
+ *.mo
+ *.pot
+
+ # Django stuff:
+ *.log
+ local_settings.py
+ db.sqlite3
+ db.sqlite3-journal
+
+ # Flask stuff:
+ instance/
+ .webassets-cache
+
+ # Scrapy stuff:
+ .scrapy
+
+ # Sphinx documentation
+ docs/_build/
+
+ # PyBuilder
+ target/
+
+ # Jupyter Notebook
+ .ipynb_checkpoints
+
+ # IPython
+ profile_default/
+ ipython_config.py
+
+ # pyenv
+ .python-version
+
+ # pipenv
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
+ # install all needed dependencies.
+ #Pipfile.lock
+
+ # celery beat schedule file
+ celerybeat-schedule
+
+ # SageMath parsed files
+ *.sage.py
+
+ # Environments
+ .env
+ .venv
+ env/
+ venv/
+ ENV/
+ env.bak/
+ venv.bak/
+
+ # Spyder project settings
+ .spyderproject
+ .spyproject
+
+ # Rope project settings
+ .ropeproject
+
+ # mkdocs documentation
+ /site
+
+ # mypy
+ .mypy_cache/
+ .dmypy.json
+ dmypy.json
+
+ # Pyre type checker
+ .pyre/
+ data_util/deepspeech_features/deepspeech-0.9.2-models.pbmm
+ deep_3drecon/mesh_renderer/bazel-bin
+ deep_3drecon/mesh_renderer/bazel-mesh_renderer
+ deep_3drecon/mesh_renderer/bazel-out
+ deep_3drecon/mesh_renderer/bazel-testlogs
+
+ .nfs*
+ infer_outs/*
+
+ *.pth
+ venv_113/*
+ *.pt
+ experiments/trials
+ flame_3drecon/*
+
+ temp/
+ /kill.sh
+ /datasets
+ data_util/imagenet_classes.txt
+ process_data_May.sh
+ /env_prepare_reproduce.md
+ /my_debug.py
+
+ utils/metrics/shape_predictor_68_face_landmarks.dat
+ *.mp4
+ _torchshow/
+ *.png
+ *.jpg
+
+ *.mrc
+
+ deep_3drecon/BFM/BFM_exp_idx.mat
+ deep_3drecon/BFM/BFM_front_idx.mat
+ deep_3drecon/BFM/facemodel_info.mat
+ deep_3drecon/BFM/index_mp468_from_mesh35709.npy
+ deep_3drecon/BFM/mediapipe_in_bfm53201.npy
+ deep_3drecon/BFM/std_exp.txt
+ !data/raw/examples/*
+ /checkpoints_mimictalk
LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2024 ZhenhuiYe
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
README-zh.md ADDED
@@ -0,0 +1,157 @@
+ # MimicTalk: Mimicking a personalized and expressive 3D talking face in few minutes | NeurIPS 2024
+ [![arXiv](https://img.shields.io/badge/arXiv-Paper-%3CCOLOR%3E.svg)](https://arxiv.org/abs/2401.08503)| [![GitHub Stars](https://img.shields.io/github/stars/yerfor/MimicTalk
+ )](https://github.com/yerfor/MimicTalk) | [English Readme](./README.md)
+
+ 这个仓库是MimicTalk的官方PyTorch实现, 用于实现特定说话人的高表现力的虚拟人视频合成。该仓库代码基于我们先前的工作[Real3D-Portrait](https://github.com/yerfor/Real3DPortrait) (ICLR 2024),即基于NeRF的one-shot说话人合成,这让MimicTalk的训练加速且效果增强。您可以访问我们的[项目页面](https://mimictalk.github.io/)以观看Demo视频, 阅读我们的[论文](https://arxiv.org/abs/2410.06734)以了解技术细节。
+
+ <p align="center">
+ <br>
+ <img src="assets/mimictalk.png" width="100%"/>
+ <br>
+ </p>
+
+ # 快速上手!
+ ## 安装环境
+ 请参照[环境配置文档](docs/prepare_env/install_guide-zh.md),配置Conda环境`mimictalk`
+ ## 下载预训练与第三方模型
+ ### 3DMM BFM模型
+ 下载3DMM BFM模型:[Google Drive](https://drive.google.com/drive/folders/1o4t5YIw7w4cMUN4bgU9nPf6IyWVG1bEk?usp=sharing) 或 [BaiduYun Disk](https://pan.baidu.com/s/1aqv1z_qZ23Vp2VP4uxxblQ?pwd=m9q5) 提取码: m9q5
+
+
+ 下载完成后,放置全部的文件到`deep_3drecon/BFM`里,文件结构如下:
+ ```
+ deep_3drecon/BFM/
+ ├── 01_MorphableModel.mat
+ ├── BFM_exp_idx.mat
+ ├── BFM_front_idx.mat
+ ├── BFM_model_front.mat
+ ├── Exp_Pca.bin
+ ├── facemodel_info.mat
+ ├── index_mp468_from_mesh35709.npy
+ ├── mediapipe_in_bfm53201.npy
+ └── std_exp.txt
+ ```
+
+ ### 预训练模型
+ 下载预训练的MimicTalk相关Checkpoints:[Google Drive](https://drive.google.com/drive/folders/1Kc6ueDO9HFDN3BhtJCEKNCZtyKHSktaA?usp=sharing) or [BaiduYun Disk](https://pan.baidu.com/s/1nQKyGV5JB6rJtda7qsThUg?pwd=mimi) 提取码: mimi
+
+ 下载完成后,放置全部的文件到`checkpoints`与`checkpoints_mimictalk`里并解压,文件结构如下:
+ ```
+ checkpoints/
+ ├── mimictalk_orig
+ │   └── os_secc2plane_torso
+ │       ├── config.yaml
+ │       └── model_ckpt_steps_100000.ckpt
+ ├── 240112_icl_audio2secc_vox2_cmlr
+ │   ├── config.yaml
+ │   └── model_ckpt_steps_1856000.ckpt
+ └── pretrained_ckpts
+     └── mit_b0.pth
+
+ checkpoints_mimictalk/
+ └── German_20s
+     ├── config.yaml
+     └── model_ckpt_steps_10000.ckpt
+ ```
+
+ ## MimicTalk训练与推理的最简命令
+ ```
+ python inference/train_mimictalk_on_a_video.py # train the model, this may take 10 minutes for 2,000 steps
+ python inference/mimictalk_infer.py # infer the model
+ ```
+
+
+ # 训练与推理细节
+ 我们目前提供了**命令行(CLI)**与**Gradio WebUI**推理方式。音频驱动推理的人像信息来自于`torso_ckpt`,因此需要至少再提供`driving audio`用于推理。另外,可以提供`style video`让模型能够预测与该视频风格一致的说话人动作。
+
+ 首先,切换至项目根目录并启用Conda环境:
+ ```bash
+ cd <Real3DPortraitRoot>
+ conda activate mimictalk
+ export PYTHONPATH=./
+ export HF_ENDPOINT=https://hf-mirror.com
+ ```
+
+ ## Gradio WebUI推理
+ 启动Gradio WebUI,按照提示上传素材,点击`Training`按钮进行训练;训练完成后点击`Generate`按钮即可推理:
+ ```bash
+ python inference/app_mimictalk.py
+ ```
+
+ ## 命令行特定说话人训练
+
+ 需要至少提供`source video`,训练指令:
+ ```bash
+ python inference/train_mimictalk_on_a_video.py \
+     --video_id <PATH_TO_SOURCE_VIDEO> \
+     --max_updates <UPDATES_NUMBER> \
+     --work_dir <PATH_TO_SAVING_CKPT>
+ ```
+
+ 一些可选参数注释:
+
+ - `--torso_ckpt` 预训练的Real3D-Portrait模型
+ - `--max_updates` 训练更新次数
+ - `--batch_size` 训练的batch size: `1` 需要约8GB显存; `2`需要约15GB显存
+ - `--lr_triplane` triplane的学习率:对于视频输入, 应为0.1; 对于图片输入,应为0.001
+ - `--work_dir` 未指定时,将默认存储在`checkpoints_mimictalk/`中
+
+ 指令示例:
+ ```bash
+ python inference/train_mimictalk_on_a_video.py \
+     --video_id data/raw/videos/German_20s.mp4 \
+     --max_updates 2000 \
+     --work_dir checkpoints_mimictalk/German_20s
+ ```
+
+ ## 命令行推理
+
+ 需要至少提供`driving audio`,可选提供`driving style`,推理指令:
+ ```bash
+ python inference/mimictalk_infer.py \
+     --drv_aud <PATH_TO_AUDIO> \
+     --drv_style <PATH_TO_STYLE_VIDEO, OPTIONAL> \
+     --drv_pose <PATH_TO_POSE_VIDEO, OPTIONAL> \
+     --bg_img <PATH_TO_BACKGROUND_IMAGE, OPTIONAL> \
+     --out_name <PATH_TO_OUTPUT_VIDEO, OPTIONAL>
+ ```
+
+ 一些可选参数注释:
+ - `--drv_pose` 指定时提供了运动pose信息,不指定则为静态运动
+ - `--bg_img` 指定时提供了背景信息,不指定则为source image提取的背景
+ - `--mouth_amp` 嘴部张幅参数,值越大张幅越大
+ - `--map_to_init_pose` 值为`True`时,首帧的pose将被映射到source pose,后续帧也作相同变换
+ - `--temperature` 代表audio2motion的采样温度,值越大结果越多样,但同时精确度越低
+ - `--out_name` 不指定时,结果将保存在`infer_out/tmp/`中
+ - `--out_mode` 值为`final`时,只输出说话人视频;值为`concat_debug`时,同时输出一些可视化的中间结果
+
+ 推理命令例子:
+ ```bash
+ python inference/mimictalk_infer.py \
+     --drv_aud data/raw/examples/Obama_5s.wav \
+     --drv_pose data/raw/examples/German_20s.mp4 \
+     --drv_style data/raw/examples/German_20s.mp4 \
+     --bg_img data/raw/examples/bg.png \
+     --out_name output.mp4 \
+     --out_mode final
+ ```
+
+ # 声明
+ 任何组织或个人未经本人同意,不得使用本文提及的任何技术生成他人说话的视频,包括但不限于政府领导人、政界人士、社会名流等。如不遵守本条款,则可能违反版权法。
+
+ # 引用我们
+ 如果这个仓库对你有帮助,请考虑引用我们的工作:
+ ```
+ @inproceedings{ye2024mimicktalk,
+     author    = {Ye, Zhenhui and Zhong, Tianyun and Ren, Yi and Yang, Jiaqi and Li, Weichuang and Huang, Jiangwei and Jiang, Ziyue and He, Jinzheng and Huang, Rongjie and Liu, Jinglin and Zhang, Chen and Yin, Xiang and Ma, Zejun and Zhao, Zhou},
+     title     = {MimicTalk: Mimicking a personalized and expressive 3D talking face in few minutes},
+     journal   = {NeurIPS},
+     year      = {2024},
+ }
+ @inproceedings{ye2024real3d,
+     title     = {Real3D-Portrait: One-shot Realistic 3D Talking Portrait Synthesis},
+     author    = {Ye, Zhenhui and Zhong, Tianyun and Ren, Yi and Yang, Jiaqi and Li, Weichuang and Huang, Jiawei and Jiang, Ziyue and He, Jinzheng and Huang, Rongjie and Liu, Jinglin and others},
+     journal   = {ICLR},
+     year      = {2024}
+ }
+ ```
README.md ADDED
@@ -0,0 +1,155 @@
+ # MimicTalk: Mimicking a personalized and expressive 3D talking face in few minutes | NeurIPS 2024
+ [![arXiv](https://img.shields.io/badge/arXiv-Paper-%3CCOLOR%3E.svg)](https://arxiv.org/abs/2401.08503)| [![GitHub Stars](https://img.shields.io/github/stars/yerfor/MimicTalk
+ )](https://github.com/yerfor/MimicTalk) | [中文文档](./README-zh.md)
+
+ This is the official PyTorch repo of MimicTalk, for training a personalized and expressive talking avatar in minutes. The code is built upon our previous work, [Real3D-Portrait](https://github.com/yerfor/Real3DPortrait) (ICLR 2024), a one-shot NeRF-based talking avatar system that enables the fast training and good quality of MimicTalk. You can visit our [Demo Page](https://mimictalk.github.io/) to watch demo videos and read our [Paper](https://arxiv.org/abs/2410.06734) for technical details.
+
+ <p align="center">
+ <br>
+ <img src="assets/mimictalk.png" width="100%"/>
+ <br>
+ </p>
+
+
+
+ # Quick Start!
+ ## Environment Installation
+ Please refer to the [Installation Guide](docs/prepare_env/install_guide.md) to prepare a Conda environment `mimictalk`.
+ ## Download Pre-trained & Third-Party Models
+ ### 3DMM BFM Model
+ Download the 3DMM BFM Model from [Google Drive](https://drive.google.com/drive/folders/1o4t5YIw7w4cMUN4bgU9nPf6IyWVG1bEk?usp=sharing) or [BaiduYun Disk](https://pan.baidu.com/s/1aqv1z_qZ23Vp2VP4uxxblQ?pwd=m9q5) with password m9q5.
+
+
+ Put all the files in `deep_3drecon/BFM`; the file structure should look like this:
+ ```
+ deep_3drecon/BFM/
+ ├── 01_MorphableModel.mat
+ ├── BFM_exp_idx.mat
+ ├── BFM_front_idx.mat
+ ├── BFM_model_front.mat
+ ├── Exp_Pca.bin
+ ├── facemodel_info.mat
+ ├── index_mp468_from_mesh35709.npy
+ └── std_exp.txt
+ ```
+
+ ### Pre-trained Real3D-Portrait & MimicTalk
+ Download the pre-trained MimicTalk checkpoints: [Google Drive](https://drive.google.com/drive/folders/1Kc6ueDO9HFDN3BhtJCEKNCZtyKHSktaA?usp=sharing) or [BaiduYun Disk](https://pan.baidu.com/s/1nQKyGV5JB6rJtda7qsThUg?pwd=mimi) with password `mimi`


+ Put the zip files in `checkpoints` & `checkpoints_mimictalk` and unzip them; the file structure should look like this:
+ ```
+ checkpoints/
+ ├── mimictalk_orig
+ │   └── os_secc2plane_torso
+ │       ├── config.yaml
+ │       └── model_ckpt_steps_100000.ckpt
+ ├── 240112_icl_audio2secc_vox2_cmlr
+ │   ├── config.yaml
+ │   └── model_ckpt_steps_1856000.ckpt
+ └── pretrained_ckpts
+     └── mit_b0.pth
+
+ checkpoints_mimictalk/
+ └── German_20s
+     ├── config.yaml
+     └── model_ckpt_steps_10000.ckpt
+ ```
+
+ ## Train & Infer MimicTalk in two lines
+ ```
+ python inference/train_mimictalk_on_a_video.py # train the model, this may take 10 minutes for 2,000 steps
+ python inference/mimictalk_infer.py # infer the model
+ ```
+
+ # Detailed options for train & infer
+ Currently, we provide **CLI** and **Gradio WebUI** for inference. We support audio-driven talking-head generation for a specific person (taken from `torso_ckpt`), so you need to prepare at least a `driving audio` for inference. Optionally, providing a `style video` enables the model to predict a talking style consistent with it.
+
+ First, switch to the project folder and activate the Conda environment:
+ ```bash
+ cd <mimictalkRoot>
+ conda activate mimictalk
+ export PYTHONPATH=./
+ export HF_ENDPOINT=https://hf-mirror.com
+ ```
+
+ ## Gradio WebUI
+ Run the Gradio WebUI demo, upload resources in the webpage, click the `Training` button to train a person-specific MimicTalk model, and then click the `Generate` button to run inference with arbitrary audio and style:
+ ```bash
+ python inference/app_mimictalk.py
+ ```
+
+ ## CLI Training on a person-specific video
+ Provide a `source video` of the specific person:
+ ```bash
+ python inference/train_mimictalk_on_a_video.py \
+     --video_id <PATH_TO_SOURCE_VIDEO> \
+     --max_updates <UPDATES_NUMBER> \
+     --work_dir <PATH_TO_SAVING_CKPT>
+ ```
+
+ Some optional training parameters:
+ - `--torso_ckpt` Pre-trained Real3D-Portrait checkpoint path
+ - `--max_updates` The number of training updates.
+ - `--batch_size` Batch size during training: `1` needs about 8GB VRAM; `2` needs about 15GB
+ - `--lr_triplane` Learning rate of the triplane: for a video, 0.1; for an image, 0.001
+ - `--work_dir` When not assigned, the results will be stored at `checkpoints_mimictalk/`.
+
+ Command-line example:
+ ```bash
+ python inference/train_mimictalk_on_a_video.py \
+     --video_id data/raw/videos/German_20s.mp4 \
+     --max_updates 2000 \
+     --work_dir checkpoints_mimictalk/German_20s
+ ```
+
+ ## CLI Inference
+
+ Provide a `driving audio` and, optionally, a `driving style`:
+ ```bash
+ python inference/mimictalk_infer.py \
+     --drv_aud <PATH_TO_AUDIO> \
+     --drv_style <PATH_TO_STYLE_VIDEO, OPTIONAL> \
+     --drv_pose <PATH_TO_POSE_VIDEO, OPTIONAL> \
+     --bg_img <PATH_TO_BACKGROUND_IMAGE, OPTIONAL> \
+     --out_name <PATH_TO_OUTPUT_VIDEO, OPTIONAL>
+ ```
+
+ Some optional inference parameters:
+ - `--drv_pose` provides motion pose information; defaults to a static pose
+ - `--bg_img` provides background information; defaults to the background extracted from the source
+ - `--map_to_init_pose` when set to `True`, the initial pose will be mapped to the source pose, and the other poses will be transformed in the same way
+ - `--temperature` the sampling temperature of audio2motion; higher gives more diverse results at the expense of lower accuracy
+ - `--out_name` When not assigned, the results will be stored at `infer_out/tmp/`.
+ - `--out_mode` When `final`, only outputs the final result; when `concat_debug`, also outputs visualizations of several intermediate results.
+
+ Command-line example:
+ ```bash
+ python inference/mimictalk_infer.py \
+     --drv_aud data/raw/examples/Obama_5s.wav \
+     --drv_pose data/raw/examples/German_20s.mp4 \
+     --drv_style data/raw/examples/German_20s.mp4 \
+     --bg_img data/raw/examples/bg.png \
+     --out_name output.mp4 \
+     --out_mode final
+ ```
+
+ # Disclaimer
+ Any organization or individual is prohibited from using any technology mentioned in this paper to generate someone's talking video without his/her consent, including but not limited to government leaders, political figures, and celebrities. If you do not comply with this item, you could be in violation of copyright laws.
+
+ # Citation
+ If you find this repo helpful to your work, please consider citing us:
+ ```
+ @inproceedings{ye2024mimicktalk,
+     author    = {Ye, Zhenhui and Zhong, Tianyun and Ren, Yi and Yang, Jiaqi and Li, Weichuang and Huang, Jiangwei and Jiang, Ziyue and He, Jinzheng and Huang, Rongjie and Liu, Jinglin and Zhang, Chen and Yin, Xiang and Ma, Zejun and Zhao, Zhou},
+     title     = {MimicTalk: Mimicking a personalized and expressive 3D talking face in few minutes},
+     journal   = {NeurIPS},
+     year      = {2024},
+ }
+ @inproceedings{ye2024real3d,
+     title     = {Real3D-Portrait: One-shot Realistic 3D Talking Portrait Synthesis},
+     author    = {Ye, Zhenhui and Zhong, Tianyun and Ren, Yi and Yang, Jiaqi and Li, Weichuang and Huang, Jiawei and Jiang, Ziyue and He, Jinzheng and Huang, Rongjie and Liu, Jinglin and others},
+     journal   = {ICLR},
+     year      = {2024}
+ }
+ ```
checkpoints/.gitkeep ADDED
File without changes
data_gen/eg3d/convert_to_eg3d_convention.py ADDED
@@ -0,0 +1,146 @@
+ import numpy as np
+ import torch
+ import copy
+ from utils.commons.tensor_utils import convert_to_tensor, convert_to_np
+ from deep_3drecon.deep_3drecon_models.bfm import ParametricFaceModel
+
+
+ def _fix_intrinsics(intrinsics):
+     """
+     intrinsics: [3,3], not batch-wise
+     """
+     # unnormalized           ->  normalized
+     # [[ f_x, s=0, x_0]          [[ f_x/size_x, s=0,        x_0/size_x=0.5]
+     #  [ 0,   f_y, y_0]    ->     [ 0,          f_y/size_y, y_0/size_y=0.5]
+     #  [ 0,   0,   1  ]]          [ 0,          0,          1             ]]
+     intrinsics = np.array(intrinsics).copy()
+     assert intrinsics.shape == (3, 3), intrinsics
+     intrinsics[0,0] = 2985.29/700
+     intrinsics[1,1] = 2985.29/700
+     intrinsics[0,2] = 1/2
+     intrinsics[1,2] = 1/2
+     assert intrinsics[0,1] == 0
+     assert intrinsics[2,2] == 1
+     assert intrinsics[1,0] == 0
+     assert intrinsics[2,0] == 0
+     assert intrinsics[2,1] == 0
+     return intrinsics
+
+ # Used in original submission
+ def _fix_pose_orig(pose):
+     """
+     pose: [4,4], not batch-wise
+     """
+     pose = np.array(pose).copy()
+     location = pose[:3, 3]
+     radius = np.linalg.norm(location)
+     pose[:3, 3] = pose[:3, 3]/radius * 2.7
+     return pose
+
+
+ def get_eg3d_convention_camera_pose_intrinsic(item):
+     """
+     item: a dict during binarize
+
+     """
+     if item['euler'].ndim == 1:
+         angle = convert_to_tensor(copy.copy(item['euler']))
+         trans = copy.deepcopy(item['trans'])
+
+         # handle the difference of euler axis between eg3d and ours
+         # see data_gen/process_ffhq_for_eg3d/transplant_eg3d_ckpt_into_our_convention.ipynb
+         # angle += torch.tensor([0, 3.1415926535, 3.1415926535], device=angle.device)
+         R = ParametricFaceModel.compute_rotation(angle.unsqueeze(0))[0].cpu().numpy()
+         trans[2] += -10
+         c = -np.dot(R, trans)
+         pose = np.eye(4)
+         pose[:3,:3] = R
+         c *= 0.27 # normalize camera radius
+         c[1] += 0.006 # additional offset used in submission
+         c[2] += 0.161 # additional offset used in submission
+         pose[0,3] = c[0]
+         pose[1,3] = c[1]
+         pose[2,3] = c[2]
+
+         focal = 2985.29 # = 1015*1024/224*(300/466.285),
+         # todo: if the camera intrinsics of the fit-3dmm stage are changed, update this accordingly
+         pp = 512  # 112
+         w = 1024  # 224
+         h = 1024  # 224
+
+         K = np.eye(3)
+         K[0][0] = focal
+         K[1][1] = focal
+         K[0][2] = w/2.0
+         K[1][2] = h/2.0
+         convention_K = _fix_intrinsics(K)
+
+         Rot = np.eye(3)
+         Rot[0, 0] = 1
+         Rot[1, 1] = -1
+         Rot[2, 2] = -1
+         pose[:3, :3] = np.dot(pose[:3, :3], Rot) # permute axes
+         convention_pose = _fix_pose_orig(pose)
+
+         item['c2w'] = pose
+         item['convention_c2w'] = convention_pose
+         item['intrinsics'] = convention_K
+         return item
+     else:
+         num_samples = len(item['euler'])
+         eulers_all = convert_to_tensor(copy.deepcopy(item['euler'])) # [B, 3]
+         trans_all = copy.deepcopy(item['trans']) # [B, 3]
+
+         # handle the difference of euler axis between eg3d and ours
+         # see data_gen/process_ffhq_for_eg3d/transplant_eg3d_ckpt_into_our_convention.ipynb
+         # eulers_all += torch.tensor([0, 3.1415926535, 3.1415926535], device=eulers_all.device).unsqueeze(0).repeat([eulers_all.shape[0],1])
+
+         intrinsics = []
+         poses = []
+         convention_poses = []
+         for i in range(num_samples):
+             angle = eulers_all[i]
+             trans = trans_all[i]
+             R = ParametricFaceModel.compute_rotation(angle.unsqueeze(0))[0].cpu().numpy()
+             trans[2] += -10
+             c = -np.dot(R, trans)
+             pose = np.eye(4)
+             pose[:3,:3] = R
+             c *= 0.27 # normalize camera radius
+             c[1] += 0.006 # additional offset used in submission
+             c[2] += 0.161 # additional offset used in submission
+             pose[0,3] = c[0]
+             pose[1,3] = c[1]
+             pose[2,3] = c[2]
+
+             focal = 2985.29 # = 1015*1024/224*(300/466.285),
+             # todo: if the camera intrinsics of the fit-3dmm stage are changed, update this accordingly
+             pp = 512  # 112
+             w = 1024  # 224
+             h = 1024  # 224
+
+             K = np.eye(3)
+             K[0][0] = focal
+             K[1][1] = focal
+             K[0][2] = w/2.0
+             K[1][2] = h/2.0
+             convention_K = _fix_intrinsics(K)
+             intrinsics.append(convention_K)
+
+             Rot = np.eye(3)
+             Rot[0, 0] = 1
+             Rot[1, 1] = -1
+             Rot[2, 2] = -1
+             pose[:3, :3] = np.dot(pose[:3, :3], Rot)
+             convention_pose = _fix_pose_orig(pose)
+             convention_poses.append(convention_pose)
+             poses.append(pose)
+
+         intrinsics = np.stack(intrinsics) # [B, 3, 3]
+         poses = np.stack(poses) # [B, 4, 4]
+         convention_poses = np.stack(convention_poses) # [B, 4, 4]
+         item['intrinsics'] = intrinsics
+         item['c2w'] = poses
+         item['convention_c2w'] = convention_poses
+         return item
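As a quick illustration of how this helper is meant to be called (this sketch is not part of the commit; the euler/trans values below are made-up placeholders, not real coefficients from `coeff_fit_mp.npy`):

```python
import numpy as np
from data_gen.eg3d.convert_to_eg3d_convention import get_eg3d_convention_camera_pose_intrinsic

item = {
    'euler': np.zeros([3], dtype=np.float32),            # [pitch, yaw, roll] of one frame (placeholder)
    'trans': np.array([0., 0., 0.1], dtype=np.float32),  # translation of one frame (placeholder)
}
item = get_eg3d_convention_camera_pose_intrinsic(item)
print(item['c2w'].shape, item['convention_c2w'].shape, item['intrinsics'].shape)
# expected: (4, 4) (4, 4) (3, 3); the batched branch returns [B, 4, 4] / [B, 3, 3] instead
```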
data_gen/runs/binarizer_nerf.py ADDED
@@ -0,0 +1,335 @@
+ import os
+ import numpy as np
+ import math
+ import json
+ import imageio
+ import torch
+ import tqdm
+ import cv2
+
+ from data_util.face3d_helper import Face3DHelper
+ from utils.commons.euler2rot import euler_trans_2_c2w, c2w_to_euler_trans
+ from data_gen.utils.process_video.euler2quaterion import euler2quaterion, quaterion2euler
+ from deep_3drecon.deep_3drecon_models.bfm import ParametricFaceModel
+
+
+ def euler2rot(euler_angle):
+     batch_size = euler_angle.shape[0]
+     theta = euler_angle[:, 0].reshape(-1, 1, 1)
+     phi = euler_angle[:, 1].reshape(-1, 1, 1)
+     psi = euler_angle[:, 2].reshape(-1, 1, 1)
+     one = torch.ones(batch_size, 1, 1).to(euler_angle.device)
+     zero = torch.zeros(batch_size, 1, 1).to(euler_angle.device)
+     rot_x = torch.cat((
+         torch.cat((one, zero, zero), 1),
+         torch.cat((zero, theta.cos(), theta.sin()), 1),
+         torch.cat((zero, -theta.sin(), theta.cos()), 1),
+     ), 2)
+     rot_y = torch.cat((
+         torch.cat((phi.cos(), zero, -phi.sin()), 1),
+         torch.cat((zero, one, zero), 1),
+         torch.cat((phi.sin(), zero, phi.cos()), 1),
+     ), 2)
+     rot_z = torch.cat((
+         torch.cat((psi.cos(), -psi.sin(), zero), 1),
+         torch.cat((psi.sin(), psi.cos(), zero), 1),
+         torch.cat((zero, zero, one), 1)
+     ), 2)
+     return torch.bmm(rot_x, torch.bmm(rot_y, rot_z))
+
+
+ def rot2euler(rot_mat):
+     batch_size = len(rot_mat)
+     # we assert that y is in [-0.5pi, 0.5pi]
+     cos_y = torch.sqrt(rot_mat[:, 1, 2] * rot_mat[:, 1, 2] + rot_mat[:, 2, 2] * rot_mat[:, 2, 2])
+     theta_x = torch.atan2(-rot_mat[:, 1, 2], rot_mat[:, 2, 2])
+     theta_y = torch.atan2(rot_mat[:, 2, 0], cos_y)
+     theta_z = torch.atan2(rot_mat[:, 0, 1], rot_mat[:, 0, 0])
+     euler_angles = torch.zeros([batch_size, 3])
+     euler_angles[:, 0] = theta_x
+     euler_angles[:, 1] = theta_y
+     euler_angles[:, 2] = theta_z
+     return euler_angles
+
+ index_lm68_from_lm468 = [127,234,93,132,58,136,150,176,152,400,379,365,288,361,323,454,356,70,63,105,66,107,336,296,334,293,300,168,197,5,4,75,97,2,326,305,
+                          33,160,158,133,153,144,362,385,387,263,373,380,61,40,37,0,267,270,291,321,314,17,84,91,78,81,13,311,308,402,14,178]
+
+ def plot_lm2d(lm2d):
+     WH = 512
+     img = np.ones([WH, WH, 3], dtype=np.uint8) * 255
+
+     for i in range(len(lm2d)):
+         x, y = lm2d[i]
+         color = (255,0,0)
+         img = cv2.circle(img, center=(int(x),int(y)), radius=3, color=color, thickness=-1)
+     font = cv2.FONT_HERSHEY_SIMPLEX
+     for i in range(len(lm2d)):
+         x, y = lm2d[i]
+         img = cv2.putText(img, f"{i}", org=(int(x),int(y)), fontFace=font, fontScale=0.3, color=(255,0,0))
+     return img
+
+ def get_face_rect(lms, h, w):
+     """
+     lms: [68, 2]
+     h, w: int
+     return: [4,]
+     """
+     assert len(lms) == 68
+     # min_x, max_x = np.min(lms, 0)[0], np.max(lms, 0)[0]
+     min_x, max_x = np.min(lms[:, 0]), np.max(lms[:, 0])
+     cx = int((min_x+max_x)/2.0)
+     cy = int(lms[27, 1])
+     h_w = int((max_x-cx)*1.5)
+     h_h = int((lms[8, 1]-cy)*1.15)
+     rect_x = cx - h_w
+     rect_y = cy - h_h
+     if rect_x < 0:
+         rect_x = 0
+     if rect_y < 0:
+         rect_y = 0
+     rect_w = min(w-1-rect_x, 2*h_w)
+     rect_h = min(h-1-rect_y, 2*h_h)
+     # rect = np.array((rect_x, rect_y, rect_w, rect_h), dtype=np.int32)
+     # rect = [rect_x, rect_y, rect_w, rect_h]
+     rect = [rect_x, rect_x + rect_w, rect_y, rect_y + rect_h] # min_j, max_j, min_i, max_i
+     return rect # this x is width, y is height
+
+ def get_lip_rect(lms, h, w):
+     """
+     lms: [68, 2]
+     h, w: int
+     return: [4,]
+     """
+     # this x is width, y is height
+     # for lms, lms[:, 0] is width, lms[:, 1] is height
+     assert len(lms) == 68
+     lips = slice(48, 60)
+     lms = lms[lips]
+     min_x, max_x = np.min(lms[:, 0]), np.max(lms[:, 0])
+     min_y, max_y = np.min(lms[:, 1]), np.max(lms[:, 1])
+     cx = int((min_x+max_x)/2.0)
+     cy = int((min_y+max_y)/2.0)
+     h_w = int((max_x-cx)*1.2)
+     h_h = int((max_y-cy)*1.2)
+
+     h_w = max(h_w, h_h)
+     h_h = h_w
+
+     rect_x = cx - h_w
+     rect_y = cy - h_h
+     rect_w = 2*h_w
+     rect_h = 2*h_h
+     if rect_x < 0:
+         rect_x = 0
+     if rect_y < 0:
+         rect_y = 0
+
+     if rect_x + rect_w > w:
+         rect_x = w - rect_w
+     if rect_y + rect_h > h:
+         rect_y = h - rect_h
+
+     rect = [rect_x, rect_x + rect_w, rect_y, rect_y + rect_h] # min_j, max_j, min_i, max_i
+     return rect # this x is width, y is height
+
+
+ # def get_lip_rect(lms, h, w):
+ #     """
+ #     lms: [68, 2]
+ #     h, w: int
+ #     return: [4,]
+ #     """
+ #     assert len(lms) == 68
+ #     lips = slice(48, 60)
+ #     # this x is width, y is height
+ #     xmin, xmax = int(lms[lips, 1].min()), int(lms[lips, 1].max())
+ #     ymin, ymax = int(lms[lips, 0].min()), int(lms[lips, 0].max())
+ #     # padding to H == W
+ #     cx = (xmin + xmax) // 2
+ #     cy = (ymin + ymax) // 2
+ #     l = max(xmax - xmin, ymax - ymin) // 2
+ #     xmin = max(0, cx - l)
+ #     xmax = min(h, cx + l)
+ #     ymin = max(0, cy - l)
+ #     ymax = min(w, cy + l)
+ #     lip_rect = [xmin, xmax, ymin, ymax]
+ #     return lip_rect
+
+ def get_win_conds(conds, idx, smo_win_size=8, pad_option='zero'):
+     """
+     conds: [b, t=16, h=29]
+     idx: long, time index of the selected frame
+     """
+     idx = max(0, idx)
+     idx = min(idx, conds.shape[0]-1)
+     smo_half_win_size = smo_win_size//2
+     left_i = idx - smo_half_win_size
+     right_i = idx + (smo_win_size - smo_half_win_size)
+     pad_left, pad_right = 0, 0
+     if left_i < 0:
+         pad_left = -left_i
+         left_i = 0
+     if right_i > conds.shape[0]:
+         pad_right = right_i - conds.shape[0]
+         right_i = conds.shape[0]
+     conds_win = conds[left_i:right_i]
+     if pad_left > 0:
+         if pad_option == 'zero':
+             conds_win = np.concatenate([np.zeros_like(conds_win)[:pad_left], conds_win], axis=0)
+         elif pad_option == 'edge':
+             edge_value = conds[0][np.newaxis, ...]
+             conds_win = np.concatenate([edge_value] * pad_left + [conds_win], axis=0)
+         else:
+             raise NotImplementedError
+     if pad_right > 0:
+         if pad_option == 'zero':
+             conds_win = np.concatenate([conds_win, np.zeros_like(conds_win)[:pad_right]], axis=0)
+         elif pad_option == 'edge':
+             edge_value = conds[-1][np.newaxis, ...]
+             conds_win = np.concatenate([conds_win] + [edge_value] * pad_right , axis=0)
+         else:
+             raise NotImplementedError
+     assert conds_win.shape[0] == smo_win_size
+     return conds_win
+
+
+ def load_processed_data(processed_dir):
+     # load necessary files
+     background_img_name = os.path.join(processed_dir, "bg.jpg")
+     assert os.path.exists(background_img_name)
+     head_img_dir = os.path.join(processed_dir, "head_imgs")
+     torso_img_dir = os.path.join(processed_dir, "inpaint_torso_imgs")
+     gt_img_dir = os.path.join(processed_dir, "gt_imgs")
+
+     hubert_npy_name = os.path.join(processed_dir, "aud_hubert.npy")
+     mel_f0_npy_name = os.path.join(processed_dir, "aud_mel_f0.npy")
+     coeff_npy_name = os.path.join(processed_dir, "coeff_fit_mp.npy")
+     lm2d_npy_name = os.path.join(processed_dir, "lms_2d.npy")
+
+     ret_dict = {}
+
+     ret_dict['bg_img'] = imageio.imread(background_img_name)
+     ret_dict['H'], ret_dict['W'] = ret_dict['bg_img'].shape[:2]
+     ret_dict['focal'], ret_dict['cx'], ret_dict['cy'] = face_model.focal, face_model.center, face_model.center
+
+     print("loading lm2d coeff ...")
+     lm2d_arr = np.load(lm2d_npy_name)
+     face_rect_lst = []
+     lip_rect_lst = []
+     for lm2d in lm2d_arr:
+         if len(lm2d) in [468, 478]:
+             lm2d = lm2d[index_lm68_from_lm468]
+         face_rect = get_face_rect(lm2d, ret_dict['H'], ret_dict['W'])
+         lip_rect = get_lip_rect(lm2d, ret_dict['H'], ret_dict['W'])
+         face_rect_lst.append(face_rect)
+         lip_rect_lst.append(lip_rect)
+     face_rects = np.stack(face_rect_lst, axis=0) # [T, 4]
+
+     print("loading fitted 3dmm coeff ...")
+     coeff_dict = np.load(coeff_npy_name, allow_pickle=True).tolist()
+     identity_arr = coeff_dict['id']
+     exp_arr = coeff_dict['exp']
+     ret_dict['id'] = identity_arr
+     ret_dict['exp'] = exp_arr
+     euler_arr = ret_dict['euler'] = coeff_dict['euler']
+     trans_arr = ret_dict['trans'] = coeff_dict['trans']
+     print("calculating lm3d ...")
+     idexp_lm3d_arr = face3d_helper.reconstruct_idexp_lm3d(torch.from_numpy(identity_arr), torch.from_numpy(exp_arr)).cpu().numpy().reshape([-1, 68*3])
+     len_motion = len(idexp_lm3d_arr)
+     video_idexp_lm3d_mean = idexp_lm3d_arr.mean(axis=0)
+     video_idexp_lm3d_std = idexp_lm3d_arr.std(axis=0)
+     ret_dict['idexp_lm3d'] = idexp_lm3d_arr
+     ret_dict['idexp_lm3d_mean'] = video_idexp_lm3d_mean
+     ret_dict['idexp_lm3d_std'] = video_idexp_lm3d_std
+
+     # now we convert the euler_trans from deep3d convention to adnerf convention
+     eulers = torch.FloatTensor(euler_arr)
+     trans = torch.FloatTensor(trans_arr)
+     rots = face_model.compute_rotation(eulers) # rotation matrix is a better intermediate for convention transplant than euler
+
+     # handle the camera pose to geneface's convention
+     trans[:, 2] = 10 - trans[:, 2] # undo the to_camera op in the fitting stage, i.e. trans[...,2] = 10 - trans[...,2]
+     rots = rots.permute(0, 2, 1)
+     trans[:, 2] = - trans[:,2] # because the intrinsic projection differs
+     # below is the NeRF camera preprocessing strategy, see `save_transforms` in data_util/process.py
+     trans = trans / 10.0
+     rots_inv = rots.permute(0, 2, 1)
+     trans_inv = - torch.bmm(rots_inv, trans.unsqueeze(2))
+
+     pose = torch.eye(4, dtype=torch.float32).unsqueeze(0).repeat([len_motion, 1, 1]) # [T, 4, 4]
+     pose[:, :3, :3] = rots_inv
+     pose[:, :3, 3] = trans_inv[:, :, 0]
+     c2w_transform_matrices = pose.numpy()
+
+     # process the audio features used for postnet training
+     print("loading hubert ...")
+     hubert_features = np.load(hubert_npy_name)
+     print("loading Mel and F0 ...")
+     mel_f0_features = np.load(mel_f0_npy_name, allow_pickle=True).tolist()
+
+     ret_dict['hubert'] = hubert_features
+     ret_dict['mel'] = mel_f0_features['mel']
+     ret_dict['f0'] = mel_f0_features['f0']
+
+     # obtaining train samples
+     frame_indices = list(range(len_motion))
+     num_train = len_motion // 11 * 10
+     train_indices = frame_indices[:num_train]
+     val_indices = frame_indices[num_train:]
+
+     for split in ['train', 'val']:
+         if split == 'train':
+             indices = train_indices
+             samples = []
+             ret_dict['train_samples'] = samples
+         elif split == 'val':
+             indices = val_indices
+             samples = []
+             ret_dict['val_samples'] = samples
+
+         for idx in indices:
+             sample = {}
+             sample['idx'] = idx
+             sample['head_img_fname'] = os.path.join(head_img_dir,f"{idx:08d}.png")
+             sample['torso_img_fname'] = os.path.join(torso_img_dir,f"{idx:08d}.png")
+             sample['gt_img_fname'] = os.path.join(gt_img_dir,f"{idx:08d}.jpg")
+             # assert os.path.exists(sample['head_img_fname']) and os.path.exists(sample['torso_img_fname']) and os.path.exists(sample['gt_img_fname'])
+             sample['face_rect'] = face_rects[idx]
+             sample['lip_rect'] = lip_rect_lst[idx]
+             sample['c2w'] = c2w_transform_matrices[idx]
+             samples.append(sample)
+     return ret_dict
+
+
+ class Binarizer:
+     def __init__(self):
+         self.data_dir = 'data/'
+
+     def parse(self, video_id):
+         processed_dir = os.path.join(self.data_dir, 'processed/videos', video_id)
+         binary_dir = os.path.join(self.data_dir, 'binary/videos', video_id)
+         out_fname = os.path.join(binary_dir, "trainval_dataset.npy")
+         os.makedirs(binary_dir, exist_ok=True)
+         ret = load_processed_data(processed_dir)
+         mel_name = os.path.join(processed_dir, 'aud_mel_f0.npy')
+         mel_f0_dict = np.load(mel_name, allow_pickle=True).tolist()
+         ret.update(mel_f0_dict)
+         np.save(out_fname, ret, allow_pickle=True)
+
+
+
+ if __name__ == '__main__':
+     from argparse import ArgumentParser
+     parser = ArgumentParser()
+     parser.add_argument('--video_id', type=str, default='May', help='')
+     args = parser.parse_args()
+     ### Process Single Long Audio for NeRF dataset
+     video_id = args.video_id
+     face_model = ParametricFaceModel(bfm_folder='deep_3drecon/BFM',
+                                      camera_distance=10, focal=1015)
+     face_model.to("cpu")
+     face3d_helper = Face3DHelper()
+
+     binarizer = Binarizer()
+     binarizer.parse(video_id)
+     print(f"Binarization for {video_id} Done!")
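For context on the windowing helper above, here is a small sanity-check sketch (not part of the commit) showing how `get_win_conds` pads a smoothing window that starts before the first frame:

```python
import numpy as np
from data_gen.runs.binarizer_nerf import get_win_conds

conds = np.arange(20, dtype=np.float32).reshape(10, 2)  # toy condition track, [T=10, h=2]
win = get_win_conds(conds, idx=0, smo_win_size=8, pad_option='edge')
print(win.shape)  # (8, 2)
print(win[:4])    # the first 4 rows repeat conds[0], since the window starts at t=-4
```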
data_gen/runs/binarizer_th1kh.py ADDED
@@ -0,0 +1,100 @@
+ import os
+ import numpy as np
+ import torch
+ from tqdm import trange
+ import pickle
+ from copy import deepcopy
+
+ from data_util.face3d_helper import Face3DHelper
+ from utils.commons.indexed_datasets import IndexedDataset, IndexedDatasetBuilder
+
+
+ def load_video_npy(fn):
+     assert fn.endswith("_coeff_fit_mp.npy")
+     ret_dict = np.load(fn, allow_pickle=True).item()
+     video_dict = {
+         'euler': ret_dict['euler'], # [T, 3]
+         'trans': ret_dict['trans'], # [T, 3]
+         'id': ret_dict['id'], # [T, 80]
+         'exp': ret_dict['exp'], # [T, 64]
+     }
+     return video_dict
+
+ def cal_lm3d_in_video_dict(video_dict, face3d_helper):
+     identity = video_dict['id']
+     exp = video_dict['exp']
+     idexp_lm3d = face3d_helper.reconstruct_idexp_lm3d(identity, exp).cpu().numpy()
+     video_dict['idexp_lm3d'] = idexp_lm3d
+
+
+ def load_audio_npy(fn):
+     assert fn.endswith(".npy")
+     ret_dict = np.load(fn, allow_pickle=True).item()
+     audio_dict = {
+         "mel": ret_dict['mel'], # [T, 80]
+         "f0": ret_dict['f0'], # [T,1]
+     }
+     return audio_dict
+
+
+ if __name__ == '__main__':
+     face3d_helper = Face3DHelper(use_gpu=False)
+
+     import glob, tqdm
+     prefixs = ['val', 'train']
+     binarized_ds_path = "data/binary/th1kh"
+     os.makedirs(binarized_ds_path, exist_ok=True)
+     for prefix in prefixs:
+         databuilder = IndexedDatasetBuilder(os.path.join(binarized_ds_path, prefix), gzip=False, default_idx_size=1024*1024*1024*2)
+         raw_base_dir = '/mnt/bn/ailabrenyi/entries/yezhenhui/datasets/raw/TH1KH_512/video'
+         mp4_names = glob.glob(os.path.join(raw_base_dir, '*.mp4'))
+         mp4_names = mp4_names[:1000]
+         cnt = 0
+         scnt = 0
+         pbar = tqdm.tqdm(enumerate(mp4_names), total=len(mp4_names))
+         for i, mp4_name in pbar:
+             cnt += 1
+             if prefix == 'train':
+                 if i % 100 == 0:
+                     continue
+             else:
+                 if i % 100 != 0:
+                     continue
+             hubert_npy_name = mp4_name.replace("/video/", "/hubert/").replace(".mp4", "_hubert.npy")
+             audio_npy_name = mp4_name.replace("/video/", "/mel_f0/").replace(".mp4", "_mel_f0.npy")
+             video_npy_name = mp4_name.replace("/video/", "/coeff_fit_mp/").replace(".mp4", "_coeff_fit_mp.npy")
+             if not os.path.exists(audio_npy_name):
+                 print(f"Skip {mp4_name}: audio npy not found.")
+                 continue
+             if not os.path.exists(video_npy_name):
+                 print(f"Skip {mp4_name}: video npy not found.")
+                 continue
+             if not os.path.exists(hubert_npy_name):
+                 print(f"Skip {mp4_name}: hubert npy not found.")
+                 continue
+             audio_dict = load_audio_npy(audio_npy_name)
+             hubert = np.load(hubert_npy_name)
+             video_dict = load_video_npy(video_npy_name)
+             com_img_dir = mp4_name.replace("/video/", "/com_imgs/").replace(".mp4", "")
+             num_com_imgs = len(glob.glob(os.path.join(com_img_dir, '*')))
+             num_frames = len(video_dict['exp'])
+             if num_com_imgs != num_frames:
+                 print(f"Skip {mp4_name}: length mismatch.")
+                 continue
+             mel = audio_dict['mel']
+             if mel.shape[0] < 32: # the video is shorter than 0.6s
+                 print(f"Skip {mp4_name}: too short.")
+                 continue
+
+             audio_dict.update(video_dict)
+             audio_dict['item_id'] = os.path.basename(mp4_name)[:-4]
+             audio_dict['hubert'] = hubert # [T_x, hid=1024]
+             audio_dict['img_dir'] = com_img_dir
+
+
+             databuilder.add_item(audio_dict)
+             scnt += 1
+             pbar.set_postfix({'success': scnt, 'success rate': scnt / cnt})
+         databuilder.finalize()
+         print(f"{prefix} set has {scnt} samples!")
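To spot-check the binarized TH1KH set afterwards, something along these lines should work; this is a sketch, not code from the commit, and it assumes `IndexedDataset` mirrors the builder interface used above (constructor takes the same path prefix, items come back as the dicts that were added):

```python
import os
from utils.commons.indexed_datasets import IndexedDataset  # read-side interface assumed

ds = IndexedDataset(os.path.join("data/binary/th1kh", "train"))
item = ds[0]  # one of the dicts assembled in the loop above
print(item['item_id'], item['mel'].shape, item['hubert'].shape, item['exp'].shape)
```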
data_gen/runs/nerf/process_guide.md ADDED
@@ -0,0 +1,49 @@
+ # Tip: for the first run, step through the commands below one by one to make sure the environment works; after that, you can simply run run.sh in the same directory to complete all the steps below in one go.
+
+ # Step 0. Crop the video to 512x512 resolution at 25 FPS, and make sure the target face is present in every frame
+ ```
+ ffmpeg -i data/raw/videos/${VIDEO_ID}.mp4 -vf fps=25,scale=w=512:h=512 -qmin 1 -q:v 1 data/raw/videos/${VIDEO_ID}_512.mp4
+ mv data/raw/videos/${VIDEO_ID}.mp4 data/raw/videos/${VIDEO_ID}_to_rm.mp4
+ mv data/raw/videos/${VIDEO_ID}_512.mp4 data/raw/videos/${VIDEO_ID}.mp4
+ ```
+ # Step 1. Extract audio features, such as mel, f0, hubert, esperanto
+ ```
+ export CUDA_VISIBLE_DEVICES=0
+ export VIDEO_ID=May
+ mkdir -p data/processed/videos/${VIDEO_ID}
+ ffmpeg -i data/raw/videos/${VIDEO_ID}.mp4 -f wav -ar 16000 data/processed/videos/${VIDEO_ID}/aud.wav
+ python data_gen/utils/process_audio/extract_hubert.py --video_id=${VIDEO_ID}
+ python data_gen/utils/process_audio/extract_mel_f0.py --video_id=${VIDEO_ID}
+ ```
+
+ # Step 2. Extract images
+ ```
+ export VIDEO_ID=May
+ export CUDA_VISIBLE_DEVICES=0
+ mkdir -p data/processed/videos/${VIDEO_ID}/gt_imgs
+ ffmpeg -i data/raw/videos/${VIDEO_ID}.mp4 -vf fps=25,scale=w=512:h=512 -qmin 1 -q:v 1 -start_number 0 data/processed/videos/${VIDEO_ID}/gt_imgs/%08d.jpg
+ python data_gen/utils/process_video/extract_segment_imgs.py --ds_name=nerf --vid_dir=data/raw/videos/${VIDEO_ID}.mp4 # extract image, segmap, and background
+ ```
+
+ # Step 3. Extract lm2d_mediapipe
+ ### Extract 2D landmarks for the later 3DMM fitting
+ ### num_workers is the number of CPU workers on this machine; total_process is the number of machines used; process_id is the index of this machine
+
+ ```
+ export VIDEO_ID=May
+ python data_gen/utils/process_video/extract_lm2d.py --ds_name=nerf --vid_dir=data/raw/videos/${VIDEO_ID}.mp4
+ ```
+
+ # Step 4. Fit 3DMM
+ ```
+ export VIDEO_ID=May
+ export CUDA_VISIBLE_DEVICES=0
+ python data_gen/utils/process_video/fit_3dmm_landmark.py --ds_name=nerf --vid_dir=data/raw/videos/${VIDEO_ID}.mp4 --reset --debug --id_mode=global
+ ```
+
+ # Step 5. Binarize
+ ```
+ export VIDEO_ID=May
+ python data_gen/runs/binarizer_nerf.py --video_id=${VIDEO_ID}
+ ```
+ The binarized dataset can then be found under `data/binary/videos/May`.
data_gen/runs/nerf/run.sh ADDED
@@ -0,0 +1,51 @@
+ # usage: CUDA_VISIBLE_DEVICES=0 bash data_gen/runs/nerf/run.sh <VIDEO_ID>
+ # please place the video at data/raw/videos/${VIDEO_ID}.mp4
+ VIDEO_ID=$1
+ echo Processing $VIDEO_ID
+
+ echo Resizing the video to 512x512
+ ffmpeg -i data/raw/videos/${VIDEO_ID}.mp4 -vf fps=25,scale=w=512:h=512 -qmin 1 -q:v 1 -y data/raw/videos/${VIDEO_ID}_512.mp4
+ mv data/raw/videos/${VIDEO_ID}.mp4 data/raw/videos/${VIDEO_ID}_to_rm.mp4
+ mv data/raw/videos/${VIDEO_ID}_512.mp4 data/raw/videos/${VIDEO_ID}.mp4
+ echo Done
+ echo The old video is moved to data/raw/videos/${VIDEO_ID}_to_rm.mp4
+
+ echo mkdir -p data/processed/videos/${VIDEO_ID}
+ mkdir -p data/processed/videos/${VIDEO_ID}
+ echo Done
+
+ # extract audio file from the training video
+ echo ffmpeg -i data/raw/videos/${VIDEO_ID}.mp4 -f wav -ar 16000 -v quiet -y data/processed/videos/${VIDEO_ID}/aud.wav
+ ffmpeg -i data/raw/videos/${VIDEO_ID}.mp4 -f wav -ar 16000 -v quiet -y data/processed/videos/${VIDEO_ID}/aud.wav
+ echo Done
+
+ # extract hubert_mel_f0 from audio
+ echo python data_gen/utils/process_audio/extract_hubert.py --video_id=${VIDEO_ID}
+ python data_gen/utils/process_audio/extract_hubert.py --video_id=${VIDEO_ID}
+ echo python data_gen/utils/process_audio/extract_mel_f0.py --video_id=${VIDEO_ID}
+ python data_gen/utils/process_audio/extract_mel_f0.py --video_id=${VIDEO_ID}
+ echo Done
+
+ # extract segment images
+ echo mkdir -p data/processed/videos/${VIDEO_ID}/gt_imgs
+ mkdir -p data/processed/videos/${VIDEO_ID}/gt_imgs
+ echo ffmpeg -i data/raw/videos/${VIDEO_ID}.mp4 -vf fps=25,scale=w=512:h=512 -qmin 1 -q:v 1 -start_number 0 -v quiet data/processed/videos/${VIDEO_ID}/gt_imgs/%08d.jpg
+ ffmpeg -i data/raw/videos/${VIDEO_ID}.mp4 -vf fps=25,scale=w=512:h=512 -qmin 1 -q:v 1 -start_number 0 -v quiet data/processed/videos/${VIDEO_ID}/gt_imgs/%08d.jpg
+ echo Done
+
+ echo python data_gen/utils/process_video/extract_segment_imgs.py --ds_name=nerf --vid_dir=data/raw/videos/${VIDEO_ID}.mp4 # extract image, segmap, and background
+ python data_gen/utils/process_video/extract_segment_imgs.py --ds_name=nerf --vid_dir=data/raw/videos/${VIDEO_ID}.mp4 # extract image, segmap, and background
+ echo Done
+
+ echo python data_gen/utils/process_video/extract_lm2d.py --ds_name=nerf --vid_dir=data/raw/videos/${VIDEO_ID}.mp4
+ python data_gen/utils/process_video/extract_lm2d.py --ds_name=nerf --vid_dir=data/raw/videos/${VIDEO_ID}.mp4
+ echo Done
+
+ pkill -f void*
+ echo python data_gen/utils/process_video/fit_3dmm_landmark.py --ds_name=nerf --vid_dir=data/raw/videos/${VIDEO_ID}.mp4 --reset --debug --id_mode=global
+ python data_gen/utils/process_video/fit_3dmm_landmark.py --ds_name=nerf --vid_dir=data/raw/videos/${VIDEO_ID}.mp4 --reset --debug --id_mode=global
+ echo Done
+
+ echo python data_gen/runs/binarizer_nerf.py --video_id=${VIDEO_ID}
+ python data_gen/runs/binarizer_nerf.py --video_id=${VIDEO_ID}
+ echo Done
data_gen/utils/mp_feature_extractors/face_landmarker.py ADDED
@@ -0,0 +1,130 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import mediapipe as mp
2
+ from mediapipe.tasks import python
3
+ from mediapipe.tasks.python import vision
4
+ import numpy as np
5
+ import cv2
6
+ import os
7
+ import copy
8
+
9
+ # simplified mediapipe ldm at https://github.com/k-m-irfan/simplified_mediapipe_face_landmarks
10
+ index_lm141_from_lm478 = [70,63,105,66,107,55,65,52,53,46] + [300,293,334,296,336,285,295,282,283,276] + [33,246,161,160,159,158,157,173,133,155,154,153,145,144,163,7] + [263,466,388,387,386,385,384,398,362,382,381,380,374,373,390,249] + [78,191,80,81,82,13,312,311,310,415,308,324,318,402,317,14,87,178,88,95] + [61,185,40,39,37,0,267,269,270,409,291,375,321,405,314,17,84,181,91,146] + [10,338,297,332,284,251,389,356,454,323,361,288,397,365,379,378,400,377,152,148,176,149,150,136,172,58,132,93,234,127,162,21,54,103,67,109] + [468,469,470,471,472] + [473,474,475,476,477] + [64,4,294]
11
+ # lm141 without iris
12
+ index_lm131_from_lm478 = [70,63,105,66,107,55,65,52,53,46] + [300,293,334,296,336,285,295,282,283,276] + [33,246,161,160,159,158,157,173,133,155,154,153,145,144,163,7] + [263,466,388,387,386,385,384,398,362,382,381,380,374,373,390,249] + [78,191,80,81,82,13,312,311,310,415,308,324,318,402,317,14,87,178,88,95] + [61,185,40,39,37,0,267,269,270,409,291,375,321,405,314,17,84,181,91,146] + [10,338,297,332,284,251,389,356,454,323,361,288,397,365,379,378,400,377,152,148,176,149,150,136,172,58,132,93,234,127,162,21,54,103,67,109] + [64,4,294]
13
+
14
+ # face alignment lm68
15
+ index_lm68_from_lm478 = [127,234,93,132,58,136,150,176,152,400,379,365,288,361,323,454,356,70,63,105,66,107,336,296,334,293,300,168,197,5,4,75,97,2,326,305,
16
+ 33,160,158,133,153,144,362,385,387,263,373,380,61,40,37,0,267,270,291,321,314,17,84,91,78,81,13,311,308,402,14,178]
17
+ # used for weights for key parts
18
+ unmatch_mask_from_lm478 = [ 93, 127, 132, 234, 323, 356, 361, 454]
19
+ index_eye_from_lm478 = [33,246,161,160,159,158,157,173,133,155,154,153,145,144,163,7] + [263,466,388,387,386,385,384,398,362,382,381,380,374,373,390,249]
20
+ index_innerlip_from_lm478 = [78,191,80,81,82,13,312,311,310,415,308,324,318,402,317,14,87,178,88,95]
21
+ index_outerlip_from_lm478 = [61,185,40,39,37,0,267,269,270,409,291,375,321,405,314,17,84,181,91,146]
22
+ index_withinmouth_from_lm478 = [76, 62] + [184, 183, 74, 72, 73, 41, 72, 38, 11, 12, 302, 268, 303, 271, 304, 272, 408, 407] + [292, 306] + [325, 307, 319, 320, 403, 404, 316, 315, 15, 16, 86, 85, 179, 180, 89, 90, 96, 77]
23
+ index_mouth_from_lm478 = index_innerlip_from_lm478 + index_outerlip_from_lm478 + index_withinmouth_from_lm478
24
+
25
+ index_yaw_from_lm68 = list(range(0, 17))
26
+ index_brow_from_lm68 = list(range(17, 27))
27
+ index_nose_from_lm68 = list(range(27, 36))
28
+ index_eye_from_lm68 = list(range(36, 48))
29
+ index_mouth_from_lm68 = list(range(48, 68))
30
+
31
+
32
+ def read_video_to_frames(video_name):
33
+ frames = []
34
+ cap = cv2.VideoCapture(video_name)
35
+ while cap.isOpened():
36
+ ret, frame_bgr = cap.read()
37
+ if frame_bgr is None:
38
+ break
39
+ frames.append(frame_bgr)
40
+ frames = np.stack(frames)
41
+ frames = np.flip(frames, -1) # BGR ==> RGB
42
+ return frames
43
+
44
+ class MediapipeLandmarker:
45
+ def __init__(self):
46
+ model_path = 'data_gen/utils/mp_feature_extractors/face_landmarker.task'
47
+ if not os.path.exists(model_path):
48
+ os.makedirs(os.path.dirname(model_path), exist_ok=True)
49
+ print("downloading face_landmarker model from mediapipe...")
50
+ model_url = 'https://storage.googleapis.com/mediapipe-models/face_landmarker/face_landmarker/float16/latest/face_landmarker.task'
51
+ os.system(f"wget {model_url}")
52
+ os.system(f"mv face_landmarker.task {model_path}")
53
+ print("download success")
54
+ base_options = python.BaseOptions(model_asset_path=model_path)
55
+ self.image_mode_options = vision.FaceLandmarkerOptions(base_options=base_options,
56
+ running_mode=vision.RunningMode.IMAGE, # IMAGE, VIDEO, LIVE_STREAM
57
+ num_faces=1)
58
+ self.video_mode_options = vision.FaceLandmarkerOptions(base_options=base_options,
59
+ running_mode=vision.RunningMode.VIDEO, # IMAGE, VIDEO, LIVE_STREAM
60
+ num_faces=1)
61
+
62
+ def extract_lm478_from_img_name(self, img_name):
63
+ img = cv2.imread(img_name)
64
+ img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
65
+ img_lm478 = self.extract_lm478_from_img(img)
66
+ return img_lm478
67
+
68
+ def extract_lm478_from_img(self, img):
69
+ img_landmarker = vision.FaceLandmarker.create_from_options(self.image_mode_options)
70
+ frame = mp.Image(image_format=mp.ImageFormat.SRGB, data=img.astype(np.uint8))
71
+ img_face_landmarker_result = img_landmarker.detect(image=frame)
72
+ img_ldm_i = img_face_landmarker_result.face_landmarks[0]
73
+ img_face_landmarks = np.array([[l.x, l.y, l.z] for l in img_ldm_i])
74
+ H, W, _ = img.shape
75
+ img_lm478 = np.array(img_face_landmarks)[:, :2] * np.array([W, H]).reshape([1,2]) # [478, 2]
76
+ return img_lm478
77
+
78
+ def extract_lm478_from_video_name(self, video_name, fps=25, anti_smooth_factor=2):
79
+ frames = read_video_to_frames(video_name)
80
+ img_lm478, vid_lm478 = self.extract_lm478_from_frames(frames, fps, anti_smooth_factor)
81
+ return img_lm478, vid_lm478
82
+
83
+ def extract_lm478_from_frames(self, frames, fps=25, anti_smooth_factor=20):
84
+ """
85
+ frames: RGB, uint8
86
+ anti_smooth_factor: float, scales the timestamp interval used in video mode; 1 means unchanged, larger values behave closer to image mode
87
+ """
88
+ img_mpldms = []
89
+ vid_mpldms = []
90
+ img_landmarker = vision.FaceLandmarker.create_from_options(self.image_mode_options)
91
+ vid_landmarker = vision.FaceLandmarker.create_from_options(self.video_mode_options)
92
+
93
+ for i in range(len(frames)):
94
+ frame = mp.Image(image_format=mp.ImageFormat.SRGB, data=frames[i].astype(np.uint8))
95
+ img_face_landmarker_result = img_landmarker.detect(image=frame)
96
+ vid_face_landmarker_result = vid_landmarker.detect_for_video(image=frame, timestamp_ms=int((1000/fps)*anti_smooth_factor*i))
97
+ try:
98
+ img_ldm_i = img_face_landmarker_result.face_landmarks[0]
99
+ vid_ldm_i = vid_face_landmarker_result.face_landmarks[0]
100
+ except Exception:
101
+ print(f"Warning: failed to detect landmarks at idx={i}, reusing the previous frame's results.")
102
+ img_face_landmarks = np.array([[l.x, l.y, l.z] for l in img_ldm_i])
103
+ vid_face_landmarks = np.array([[l.x, l.y, l.z] for l in vid_ldm_i])
104
+ img_mpldms.append(img_face_landmarks)
105
+ vid_mpldms.append(vid_face_landmarks)
106
+ img_lm478 = np.stack(img_mpldms)[..., :2]
107
+ vid_lm478 = np.stack(vid_mpldms)[..., :2]
108
+ bs, H, W, _ = frames.shape
109
+ img_lm478 = np.array(img_lm478)[..., :2] * np.array([W, H]).reshape([1,1,2]) # [T, 478, 2]
110
+ vid_lm478 = np.array(vid_lm478)[..., :2] * np.array([W, H]).reshape([1,1,2]) # [T, 478, 2]
111
+ return img_lm478, vid_lm478
112
+
113
+ def combine_vid_img_lm478_to_lm68(self, img_lm478, vid_lm478):
114
+ img_lm68 = img_lm478[:, index_lm68_from_lm478]
115
+ vid_lm68 = vid_lm478[:, index_lm68_from_lm478]
116
+ combined_lm68 = copy.deepcopy(img_lm68)
117
+ combined_lm68[:, index_yaw_from_lm68] = vid_lm68[:, index_yaw_from_lm68]
118
+ combined_lm68[:, index_brow_from_lm68] = vid_lm68[:, index_brow_from_lm68]
119
+ combined_lm68[:, index_nose_from_lm68] = vid_lm68[:, index_nose_from_lm68]
120
+ return combined_lm68
121
+
122
+ def combine_vid_img_lm478_to_lm478(self, img_lm478, vid_lm478):
123
+ combined_lm478 = copy.deepcopy(vid_lm478)
124
+ combined_lm478[:, index_mouth_from_lm478] = img_lm478[:, index_mouth_from_lm478]
125
+ combined_lm478[:, index_eye_from_lm478] = img_lm478[:, index_eye_from_lm478]
126
+ return combined_lm478
127
+
128
+ if __name__ == '__main__':
129
+ landmarker = MediapipeLandmarker()
130
+ ret = landmarker.extract_lm478_from_video_name("00000.mp4")
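Reviewer note, not part of the commit: a minimal usage sketch for the landmarker above. The clip name demo.mp4 is a placeholder; everything else is the API defined in this file.

    from data_gen.utils.mp_feature_extractors.face_landmarker import MediapipeLandmarker

    landmarker = MediapipeLandmarker()
    # image mode re-detects every frame (sharper but jittery); video mode is temporally smoothed
    img_lm478, vid_lm478 = landmarker.extract_lm478_from_video_name("demo.mp4", fps=25)
    # fuse them: smoothed jaw/brow/nose from video mode, per-frame eyes/mouth from image mode
    lm478 = landmarker.combine_vid_img_lm478_to_lm478(img_lm478, vid_lm478)
    lm68 = landmarker.combine_vid_img_lm478_to_lm68(img_lm478, vid_lm478)
    print(lm478.shape, lm68.shape)  # (T, 478, 2) and (T, 68, 2), in pixel coordinates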
data_gen/utils/mp_feature_extractors/face_landmarker.task ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:64184e229b263107bc2b804c6625db1341ff2bb731874b0bcc2fe6544e0bc9ff
3
+ size 3758596
data_gen/utils/mp_feature_extractors/mp_segmenter.py ADDED
@@ -0,0 +1,303 @@
1
+ import os
2
+ import copy
3
+ import numpy as np
4
+ import tqdm
5
+ import mediapipe as mp
6
+ import torch
7
+ from mediapipe.tasks import python
8
+ from mediapipe.tasks.python import vision
9
+ from utils.commons.multiprocess_utils import multiprocess_run_tqdm, multiprocess_run
10
+ from utils.commons.tensor_utils import convert_to_np
11
+ from sklearn.neighbors import NearestNeighbors
12
+
13
+ def scatter_np(condition_img, classSeg=5):
14
+ # def scatter(condition_img, classSeg=19, label_size=(512, 512)):
15
+ batch, c, height, width = condition_img.shape
16
+ # if height != label_size[0] or width != label_size[1]:
17
+ # condition_img= F.interpolate(condition_img, size=label_size, mode='nearest')
18
+ input_label = np.zeros([batch, classSeg, condition_img.shape[2], condition_img.shape[3]]).astype(np.int_)
19
+ # input_label = torch.zeros(batch, classSeg, *label_size, device=condition_img.device)
20
+ np.put_along_axis(input_label, condition_img, 1, 1)
21
+ return input_label
22
+
23
+ def scatter(condition_img, classSeg=19):
24
+ # def scatter(condition_img, classSeg=19, label_size=(512, 512)):
25
+ batch, c, height, width = condition_img.size()
26
+ # if height != label_size[0] or width != label_size[1]:
27
+ # condition_img= F.interpolate(condition_img, size=label_size, mode='nearest')
28
+ input_label = torch.zeros(batch, classSeg, condition_img.shape[2], condition_img.shape[3], device=condition_img.device)
29
+ # input_label = torch.zeros(batch, classSeg, *label_size, device=condition_img.device)
30
+ return input_label.scatter_(1, condition_img.long(), 1)
31
+
32
+ def encode_segmap_mask_to_image(segmap):
33
+ # rgb
34
+ _,h,w = segmap.shape
35
+ encoded_img = np.ones([h,w,3],dtype=np.uint8) * 255
36
+ colors = [(255,255,255),(255,255,0),(255,0,255),(0,255,255),(255,0,0),(0,255,0)]
37
+ for i, color in enumerate(colors):
38
+ mask = segmap[i].astype(int)
39
+ index = np.where(mask != 0)
40
+ encoded_img[index[0], index[1], :] = np.array(color)
41
+ return encoded_img.astype(np.uint8)
42
+
43
+ def decode_segmap_mask_from_image(encoded_img):
44
+ # rgb
45
+ colors = [(255,255,255),(255,255,0),(255,0,255),(0,255,255),(255,0,0),(0,255,0)]
46
+ bg = (encoded_img[..., 0] == 255) & (encoded_img[..., 1] == 255) & (encoded_img[..., 2] == 255)
47
+ hair = (encoded_img[..., 0] == 255) & (encoded_img[..., 1] == 255) & (encoded_img[..., 2] == 0)
48
+ body_skin = (encoded_img[..., 0] == 255) & (encoded_img[..., 1] == 0) & (encoded_img[..., 2] == 255)
49
+ face_skin = (encoded_img[..., 0] == 0) & (encoded_img[..., 1] == 255) & (encoded_img[..., 2] == 255)
50
+ clothes = (encoded_img[..., 0] == 255) & (encoded_img[..., 1] == 0) & (encoded_img[..., 2] == 0)
51
+ others = (encoded_img[..., 0] == 0) & (encoded_img[..., 1] == 255) & (encoded_img[..., 2] == 0)
52
+ segmap = np.stack([bg, hair, body_skin, face_skin, clothes, others], axis=0)
53
+ return segmap.astype(np.uint8)
54
+
55
+ def read_video_frame(video_name, frame_id):
56
+ # https://blog.csdn.net/bby1987/article/details/108923361
57
+ # frame_num = video_capture.get(cv2.CAP_PROP_FRAME_COUNT) # ==> total frame count
58
+ # fps = video_capture.get(cv2.CAP_PROP_FPS) # ==> frame rate
59
+ # width = video_capture.get(cv2.CAP_PROP_FRAME_WIDTH) # ==> video width
60
+ # height = video_capture.get(cv2.CAP_PROP_FRAME_HEIGHT) # ==> video height
61
+ # pos = video_capture.get(cv2.CAP_PROP_POS_FRAMES) # ==> current frame position
62
+ # video_capture.set(cv2.CAP_PROP_POS_FRAMES, 1000) # ==> seek to frame 1000
63
+ # pos = video_capture.get(cv2.CAP_PROP_POS_FRAMES) # ==> now pos = 1000.0
64
+ # video_capture.release()
65
+ vr = cv2.VideoCapture(video_name)
66
+ vr.set(cv2.CAP_PROP_POS_FRAMES, frame_id)
67
+ _, frame = vr.read()
68
+ return frame
69
+
70
+ def decode_segmap_mask_from_segmap_video_frame(video_frame):
71
+ # video_frame: 0~255 BGR, obtained by read_video_frame
72
+ def assign_values(array):
73
+ remainder = array % 40 # remainder of each value modulo 40
74
+ assigned_values = np.where(remainder <= 20, array - remainder, array + (40 - remainder))
75
+ return assigned_values
76
+ segmap = video_frame.mean(-1)
77
+ segmap = assign_values(segmap) // 40 # [H, W] with value 0~5
78
+ segmap_mask = scatter_np(segmap[None, None, ...], classSeg=6)[0] # [6, H, W]
79
+ return segmap_mask.astype(np.uint8) # return the one-hot [6, H, W] mask computed above
80
+
81
+ def extract_background(img_lst, segmap_lst=None):
82
+ """
83
+ img_lst: list of rgb ndarray
84
+ """
85
+ # only use 1/20 images
86
+ num_frames = len(img_lst)
87
+ img_lst = img_lst[::20] if num_frames > 20 else img_lst[0:1]
88
+
89
+ if segmap_lst is not None:
90
+ segmap_lst = segmap_lst[::20] if num_frames > 20 else segmap_lst[0:1]
91
+ assert len(img_lst) == len(segmap_lst)
92
+ # get H/W
93
+ h, w = img_lst[0].shape[:2]
94
+
95
+ # nearest neighbors
96
+ all_xys = np.mgrid[0:h, 0:w].reshape(2, -1).transpose()
97
+ distss = []
98
+ for idx, img in enumerate(img_lst):
99
+ if segmap_lst is not None:
100
+ segmap = segmap_lst[idx]
101
+ else:
102
+ segmap = seg_model._cal_seg_map(img)
103
+ bg = (segmap[0]).astype(bool)
104
+ fg_xys = np.stack(np.nonzero(~bg)).transpose(1, 0)
105
+ nbrs = NearestNeighbors(n_neighbors=1, algorithm='kd_tree').fit(fg_xys)
106
+ dists, _ = nbrs.kneighbors(all_xys)
107
+ distss.append(dists)
108
+
109
+ distss = np.stack(distss)
110
+ max_dist = np.max(distss, 0)
111
+ max_id = np.argmax(distss, 0)
112
+
113
+ bc_pixs = max_dist > 10 # 5
114
+ bc_pixs_id = np.nonzero(bc_pixs)
115
+ bc_ids = max_id[bc_pixs]
116
+
117
+ num_pixs = distss.shape[1]
118
+ imgs = np.stack(img_lst).reshape(-1, num_pixs, 3)
119
+
120
+ bg_img = np.zeros((h*w, 3), dtype=np.uint8)
121
+ bg_img[bc_pixs_id, :] = imgs[bc_ids, bc_pixs_id, :]
122
+ bg_img = bg_img.reshape(h, w, 3)
123
+
124
+ max_dist = max_dist.reshape(h, w)
125
+ bc_pixs = max_dist > 10 # 5
126
+ bg_xys = np.stack(np.nonzero(~bc_pixs)).transpose()
127
+ fg_xys = np.stack(np.nonzero(bc_pixs)).transpose()
128
+ nbrs = NearestNeighbors(n_neighbors=1, algorithm='kd_tree').fit(fg_xys)
129
+ distances, indices = nbrs.kneighbors(bg_xys)
130
+ bg_fg_xys = fg_xys[indices[:, 0]]
131
+ bg_img[bg_xys[:, 0], bg_xys[:, 1], :] = bg_img[bg_fg_xys[:, 0], bg_fg_xys[:, 1], :]
132
+ return bg_img
133
+
134
+
135
+ global_segmenter = None
136
+ def job_cal_seg_map_for_image(img, segmenter_options=None, segmenter=None):
137
+ """
138
+ Used by MediapipeSegmenter.multiprocess_cal_seg_map_for_a_video; intended for processing a single long video.
139
+ """
140
+ global global_segmenter
141
+ if segmenter is not None:
142
+ segmenter_actual = segmenter
143
+ else:
144
+ global_segmenter = vision.ImageSegmenter.create_from_options(segmenter_options) if global_segmenter is None else global_segmenter
145
+ segmenter_actual = global_segmenter
146
+ mp_image = mp.Image(image_format=mp.ImageFormat.SRGB, data=img)
147
+ out = segmenter_actual.segment(mp_image)
148
+ segmap = out.category_mask.numpy_view().copy() # [H, W]
149
+
150
+ segmap_mask = scatter_np(segmap[None, None, ...], classSeg=6)[0] # [6, H, W]
151
+ segmap_image = segmap[:, :, None].repeat(3, 2).astype(float)
152
+ segmap_image = (segmap_image * 40).astype(np.uint8)
153
+
154
+ return segmap_mask, segmap_image
155
+
156
+ class MediapipeSegmenter:
157
+ def __init__(self):
158
+ model_path = 'data_gen/utils/mp_feature_extractors/selfie_multiclass_256x256.tflite'
159
+ if not os.path.exists(model_path):
160
+ os.makedirs(os.path.dirname(model_path), exist_ok=True)
161
+ print("downloading segmenter model from mediapipe...")
162
+ os.system(f"wget https://storage.googleapis.com/mediapipe-models/image_segmenter/selfie_multiclass_256x256/float32/latest/selfie_multiclass_256x256.tflite")
163
+ os.system(f"mv selfie_multiclass_256x256.tflite {model_path}")
164
+ print("download success")
165
+ base_options = python.BaseOptions(model_asset_path=model_path)
166
+ self.options = vision.ImageSegmenterOptions(base_options=base_options,running_mode=vision.RunningMode.IMAGE, output_category_mask=True)
167
+ self.video_options = vision.ImageSegmenterOptions(base_options=base_options,running_mode=vision.RunningMode.VIDEO, output_category_mask=True)
168
+
169
+ def multiprocess_cal_seg_map_for_a_video(self, imgs, num_workers=4):
170
+ """
171
+ Process a single long video with multiple worker processes.
172
+ imgs: list of rgb array in 0~255
173
+ """
174
+ segmap_masks = []
175
+ segmap_images = []
176
+ img_lst = [(imgs[i], self.options) for i in range(len(imgs))] # arg order matches job_cal_seg_map_for_image(img, segmenter_options)
177
+ for (i, res) in multiprocess_run_tqdm(job_cal_seg_map_for_image, args=img_lst, num_workers=num_workers, desc='extracting from a video in multi-process'):
178
+ segmap_mask, segmap_image = res
179
+ segmap_masks.append(segmap_mask)
180
+ segmap_images.append(segmap_image)
181
+ return segmap_masks, segmap_images
182
+
183
+ def _cal_seg_map_for_video(self, imgs, segmenter=None, return_onehot_mask=True, return_segmap_image=True):
184
+ segmenter = vision.ImageSegmenter.create_from_options(self.video_options) if segmenter is None else segmenter
185
+ assert return_onehot_mask or return_segmap_image # you should at least return one
186
+ segmap_masks = []
187
+ segmap_images = []
188
+ for i in tqdm.trange(len(imgs), desc="extracting segmaps from a video..."):
189
+ img = imgs[i]
190
+ mp_image = mp.Image(image_format=mp.ImageFormat.SRGB, data=img)
191
+ out = segmenter.segment_for_video(mp_image, 40 * i)
192
+ segmap = out.category_mask.numpy_view().copy() # [H, W]
193
+
194
+ if return_onehot_mask:
195
+ segmap_mask = scatter_np(segmap[None, None, ...], classSeg=6)[0] # [6, H, W]
196
+ segmap_masks.append(segmap_mask)
197
+ if return_segmap_image:
198
+ segmap_image = segmap[:, :, None].repeat(3, 2).astype(float)
199
+ segmap_image = (segmap_image * 40).astype(np.uint8)
200
+ segmap_images.append(segmap_image)
201
+
202
+ if return_onehot_mask and return_segmap_image:
203
+ return segmap_masks, segmap_images
204
+ elif return_onehot_mask:
205
+ return segmap_masks
206
+ elif return_segmap_image:
207
+ return segmap_images
208
+
209
+ def _cal_seg_map(self, img, segmenter=None, return_onehot_mask=True):
210
+ """
211
+ segmenter: vision.ImageSegmenter.create_from_options(options)
212
+ img: numpy, [H, W, 3], 0~255
213
+ segmap: [C, H, W]
214
+ 0 - background
215
+ 1 - hair
216
+ 2 - body-skin
217
+ 3 - face-skin
218
+ 4 - clothes
219
+ 5 - others (accessories)
220
+ """
221
+ assert img.ndim == 3
222
+ segmenter = vision.ImageSegmenter.create_from_options(self.options) if segmenter is None else segmenter
223
+ image = mp.Image(image_format=mp.ImageFormat.SRGB, data=img)
224
+ out = segmenter.segment(image)
225
+ segmap = out.category_mask.numpy_view().copy() # [H, W]
226
+ if return_onehot_mask:
227
+ segmap = scatter_np(segmap[None, None, ...], classSeg=6)[0] # [6, H, W]
228
+ return segmap
229
+
230
+ def _seg_out_img_with_segmap(self, img, segmap, mode='head'):
231
+ """
232
+ img: [h,w,c], img is in 0~255, np
233
+ """
234
+ #
235
+ img = copy.deepcopy(img)
236
+ if mode == 'head':
237
+ selected_mask = segmap[[1,3,5] , :, :].sum(axis=0)[None,:] > 0.5 # glasses also belong to 'others'
238
+ img[~selected_mask.repeat(3,axis=0).transpose(1,2,0)] = 0 # (-1,-1,-1) denotes black in our [-1,1] convention
239
+ # selected_mask = segmap[[1,3] , :, :].sum(dim=0, keepdim=True) > 0.5
240
+ elif mode == 'person':
241
+ selected_mask = segmap[[1,2,3,4,5], :, :].sum(axis=0)[None,:] > 0.5
242
+ img[~selected_mask.repeat(3,axis=0).transpose(1,2,0)] = 0 # (-1,-1,-1) denotes black in our [-1,1] convention
243
+ elif mode == 'torso':
244
+ selected_mask = segmap[[2,4], :, :].sum(axis=0)[None,:] > 0.5
245
+ img[~selected_mask.repeat(3,axis=0).transpose(1,2,0)] = 0 # (-1,-1,-1) denotes black in our [-1,1] convention
246
+ elif mode == 'torso_with_bg':
247
+ selected_mask = segmap[[0, 2,4], :, :].sum(axis=0)[None,:] > 0.5
248
+ img[~selected_mask.repeat(3,axis=0).transpose(1,2,0)] = 0 # (-1,-1,-1) denotes black in our [-1,1] convention
249
+ elif mode == 'bg':
250
+ selected_mask = segmap[[0], :, :].sum(axis=0)[None,:] > 0.5 # only seg out 0, which means background
251
+ img[~selected_mask.repeat(3,axis=0).transpose(1,2,0)] = 0 # (-1,-1,-1) denotes black in our [-1,1] convention
252
+ elif mode == 'full':
253
+ pass
254
+ else:
255
+ raise NotImplementedError()
256
+ return img, selected_mask
257
+
258
+ def _seg_out_img(self, img, segmenter=None, mode='head'):
259
+ """
260
+ imgs [H, W, 3] 0-255
261
+ return : person_img [B, 3, H, W]
262
+ """
263
+ segmenter = vision.ImageSegmenter.create_from_options(self.options) if segmenter is None else segmenter
264
+ segmap = self._cal_seg_map(img, segmenter=segmenter, return_onehot_mask=True) # [B, 19, H, W]
265
+ return self._seg_out_img_with_segmap(img, segmap, mode=mode)
266
+
267
+ def seg_out_imgs(self, img, mode='head'):
268
+ """
269
+ api for pytorch img, -1~1
270
+ img: [B, 3, H, W], -1~1
271
+ """
272
+ device = img.device
273
+ img = convert_to_np(img.permute(0, 2, 3, 1)) # [B, H, W, 3]
274
+ img = ((img + 1) * 127.5).astype(np.uint8)
275
+ img_lst = [copy.deepcopy(img[i]) for i in range(len(img))]
276
+ out_lst = []
277
+ for im in img_lst:
278
+ out, _ = self._seg_out_img(im, mode=mode) # _seg_out_img returns (image, mask); keep only the image
279
+ out_lst.append(out)
280
+ seg_imgs = np.stack(out_lst) # [B, H, W, 3]
281
+ seg_imgs = (seg_imgs - 127.5) / 127.5
282
+ seg_imgs = torch.from_numpy(seg_imgs).permute(0, 3, 1, 2).to(device)
283
+ return seg_imgs
284
+
285
+ if __name__ == '__main__':
286
+ import imageio, cv2, tqdm
287
+ import torchshow as ts
288
+ img = imageio.imread("1.png")
289
+ img = cv2.resize(img, (512,512))
290
+
291
+ seg_model = MediapipeSegmenter()
292
+ img = torch.tensor(img).unsqueeze(0).repeat([1, 1, 1, 1]).permute(0, 3,1,2)
293
+ img = (img-127.5)/127.5
294
+ out = seg_model.seg_out_imgs(img, 'torso')
295
+ ts.save(out,"torso.png")
296
+ out = seg_model.seg_out_imgs(img, 'head')
297
+ ts.save(out,"head.png")
298
+ out = seg_model.seg_out_imgs(img, 'bg')
299
+ ts.save(out,"bg.png")
300
+ img = convert_to_np(img.permute(0, 2, 3, 1)) # [B, H, W, 3]
301
+ img = ((img + 1) * 127.5).astype(np.uint8)
302
+ bg = extract_background(img)
303
+ ts.save(bg,"bg2.png")
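For orientation, the segmenter's category mask uses six classes (0 background, 1 hair, 2 body-skin, 3 face-skin, 4 clothes, 5 others; see the _cal_seg_map docstring), and scatter_np turns it into a one-hot map. A tiny self-contained check of that conversion, with scatter_np re-declared so it runs without mediapipe installed:

    import numpy as np

    def scatter_np(condition_img, classSeg=6):  # mirrors the helper defined above
        batch, c, height, width = condition_img.shape
        input_label = np.zeros([batch, classSeg, height, width]).astype(np.int_)
        np.put_along_axis(input_label, condition_img, 1, 1)
        return input_label

    mask = np.array([[0, 3, 3], [4, 4, 1]])                    # toy [H, W] category ids
    onehot = scatter_np(mask[None, None, ...], classSeg=6)[0]  # [6, H, W]
    assert onehot.sum(axis=0).min() == 1                       # exactly one class per pixel
    print(onehot[3])                                           # face-skin channel: [[0 1 1], [0 0 0]]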
data_gen/utils/mp_feature_extractors/selfie_multiclass_256x256.tflite ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c6748b1253a99067ef71f7e26ca71096cd449baefa8f101900ea23016507e0e0
3
+ size 16371837
data_gen/utils/path_converter.py ADDED
@@ -0,0 +1,24 @@
1
+ import os
2
+
3
+
4
+ class PathConverter():
5
+ def __init__(self):
6
+ self.prefixs = {
7
+ "vid": "/video/",
8
+ "gt": "/gt_imgs/",
9
+ "head": "/head_imgs/",
10
+ "torso": "/torso_imgs/",
11
+ "person": "/person_imgs/",
12
+ "torso_with_bg": "/torso_with_bg_imgs/",
13
+ "single_bg": "/bg_img/",
14
+ "bg": "/bg_imgs/",
15
+ "segmaps": "/segmaps/",
16
+ "inpaint_torso": "/inpaint_torso_imgs/",
17
+ "com": "/com_imgs/",
18
+ "inpaint_torso_with_com_bg": "/inpaint_torso_with_com_bg_imgs/",
19
+ }
20
+
21
+ def to(self, path: str, old_pattern: str, new_pattern: str):
22
+ return path.replace(self.prefixs[old_pattern], self.prefixs[new_pattern], 1)
23
+
24
+ pc = PathConverter()
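A quick illustration of the prefix mapping (the concrete path below is hypothetical; to() simply swaps the first occurrence of one directory prefix for another):

    from data_gen.utils.path_converter import pc

    gt_path = "data/processed/videos/May/gt_imgs/00000.jpg"   # hypothetical processed-video layout
    print(pc.to(gt_path, "gt", "head"))                       # data/processed/videos/May/head_imgs/00000.jpg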
data_gen/utils/process_audio/extract_hubert.py ADDED
@@ -0,0 +1,92 @@
1
+ from transformers import Wav2Vec2Processor, HubertModel
2
+ import soundfile as sf
3
+ import numpy as np
4
+ import torch
5
+ import os
6
+ from utils.commons.hparams import set_hparams, hparams
7
+
8
+
9
+ wav2vec2_processor = None
10
+ hubert_model = None
11
+
12
+
13
+ def get_hubert_from_16k_wav(wav_16k_name):
14
+ speech_16k, _ = sf.read(wav_16k_name)
15
+ hubert = get_hubert_from_16k_speech(speech_16k)
16
+ return hubert
17
+
18
+ @torch.no_grad()
19
+ def get_hubert_from_16k_speech(speech, device="cuda:0"):
20
+ global hubert_model, wav2vec2_processor
21
+ local_path = '/home/tiger/.cache/huggingface/hub/models--facebook--hubert-large-ls960-ft/snapshots/ece5fabbf034c1073acae96d5401b25be96709d8'
22
+ if hubert_model is None:
23
+ print("Loading the HuBERT Model...")
24
+ print("Loading the Wav2Vec2 Processor...")
25
+ if os.path.exists(local_path):
26
+ hubert_model = HubertModel.from_pretrained(local_path)
27
+ wav2vec2_processor = Wav2Vec2Processor.from_pretrained(local_path)
28
+ else:
29
+ hubert_model = HubertModel.from_pretrained("facebook/hubert-large-ls960-ft")
30
+ wav2vec2_processor = Wav2Vec2Processor.from_pretrained("facebook/hubert-large-ls960-ft")
31
+ hubert_model = hubert_model.to(device)
32
+
33
+ if speech.ndim ==2:
34
+ speech = speech[:, 0] # [T, 2] ==> [T,]
35
+
36
+ input_values_all = wav2vec2_processor(speech, return_tensors="pt", sampling_rate=16000).input_values # [1, T]
37
+ input_values_all = input_values_all.to(device)
38
+ # For long audio sequences, due to memory limitations, we cannot process them in one run.
39
+ # HuBERT processes the wav with a CNN of strides [5,2,2,2,2,2,2], giving an overall stride of 320.
40
+ # Besides, the kernels are [10,3,3,3,3,2,2], so 400 samples form the fundamental unit for 1 time step.
41
+ # The CNN stack is thus equal to one big Conv1D with kernel k=400 and stride s=320.
42
+ # The number of output time steps is T = (t - (k - s)) // s, i.e. floor((t - k) / s) + 1.
43
+ # To prevent overlap, we set each clip length to (K + S*(N-1)), where N is the expected output length T of that clip.
44
+ # The start point of the next clip should roll back by (kernel - stride), so it is stride * N.
45
+ kernel = 400
46
+ stride = 320
47
+ clip_length = stride * 1000
48
+ num_iter = input_values_all.shape[1] // clip_length
49
+ expected_T = (input_values_all.shape[1] - (kernel-stride)) // stride
50
+ res_lst = []
51
+ for i in range(num_iter):
52
+ if i == 0:
53
+ start_idx = 0
54
+ end_idx = clip_length - stride + kernel
55
+ else:
56
+ start_idx = clip_length * i
57
+ end_idx = start_idx + (clip_length - stride + kernel)
58
+ input_values = input_values_all[:, start_idx: end_idx]
59
+ hidden_states = hubert_model.forward(input_values).last_hidden_state # [B=1, T=pts//320, hid=1024]
60
+ res_lst.append(hidden_states[0])
61
+ if num_iter > 0:
62
+ input_values = input_values_all[:, clip_length * num_iter:]
63
+ else:
64
+ input_values = input_values_all
65
+
66
+ if input_values.shape[1] >= kernel: # if the last batch is shorter than kernel_size, skip it
67
+ hidden_states = hubert_model(input_values).last_hidden_state # [B=1, T=pts//320, hid=1024]
68
+ res_lst.append(hidden_states[0])
69
+ ret = torch.cat(res_lst, dim=0).cpu() # [T, 1024]
70
+
71
+ assert abs(ret.shape[0] - expected_T) <= 1
72
+ if ret.shape[0] < expected_T: # the last short clip was skipped, so pad by repeating the final frame
73
+ ret = torch.cat([ret, ret[-1:].repeat([expected_T - ret.shape[0], 1])], dim=0) # ret is [T, 1024]
74
+ else:
75
+ ret = ret[:expected_T]
76
+
77
+ return ret
78
+
79
+
80
+ if __name__ == '__main__':
81
+ from argparse import ArgumentParser
82
+ parser = ArgumentParser()
83
+ parser.add_argument('--video_id', type=str, default='May', help='')
84
+ args = parser.parse_args()
85
+ ### Process Single Long Audio for NeRF dataset
86
+ person_id = args.video_id
87
+ wav_16k_name = f"data/processed/videos/{person_id}/aud.wav"
88
+ hubert_npy_name = f"data/processed/videos/{person_id}/aud_hubert.npy"
89
+ speech_16k, _ = sf.read(wav_16k_name)
90
+ hubert_hidden = get_hubert_from_16k_speech(speech_16k)
91
+ np.save(hubert_npy_name, hubert_hidden.detach().numpy())
92
+ print(f"Saved at {hubert_npy_name}")
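As a sanity check on the kernel/stride bookkeeping in get_hubert_from_16k_speech, here is the frame-count arithmetic for a hypothetical 5-second 16 kHz waveform:

    # HuBERT's CNN stack acts like one Conv1D with kernel 400 and stride 320 (see the comments above).
    kernel, stride, sr = 400, 320, 16000
    num_samples = 5 * sr                                    # hypothetical 5 s of 16 kHz audio
    expected_T = (num_samples - (kernel - stride)) // stride
    print(expected_T)                                       # 249 frames, i.e. ~50 frames per second

Since the rest of the pipeline runs video at 25 fps, this roughly 50 Hz feature rate pairs two HuBERT frames with each video frame.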
data_gen/utils/process_audio/extract_mel_f0.py ADDED
@@ -0,0 +1,148 @@
1
+ import numpy as np
2
+ import torch
3
+ import glob
4
+ import os
5
+ import tqdm
6
+ import librosa
7
+ import parselmouth
8
+ from utils.commons.pitch_utils import f0_to_coarse
9
+ from utils.commons.multiprocess_utils import multiprocess_run_tqdm
10
+ from utils.commons.os_utils import multiprocess_glob
11
+ from utils.audio.io import save_wav
12
+
13
+ from moviepy.editor import VideoFileClip
14
+ from utils.commons.hparams import hparams, set_hparams
15
+
16
+ def resample_wav(wav_name, out_name, sr=16000):
17
+ wav_raw, sr = librosa.core.load(wav_name, sr=sr)
18
+ save_wav(wav_raw, out_name, sr)
19
+
20
+ def split_wav(mp4_name, wav_name=None):
21
+ if wav_name is None:
22
+ wav_name = mp4_name.replace(".mp4", ".wav").replace("/video/", "/audio/")
23
+ if os.path.exists(wav_name):
24
+ return wav_name
25
+ os.makedirs(os.path.dirname(wav_name), exist_ok=True)
26
+
27
+ video = VideoFileClip(mp4_name,verbose=False)
28
+ dur = video.duration
29
+ audio = video.audio
30
+ assert audio is not None
31
+ audio.write_audiofile(wav_name,fps=16000,verbose=False,logger=None)
32
+ return wav_name
33
+
34
+ def librosa_pad_lr(x, fsize, fshift, pad_sides=1):
35
+ '''compute right padding (final frame) or both sides padding (first and final frames)
36
+ '''
37
+ assert pad_sides in (1, 2)
38
+ # return int(fsize // 2)
39
+ pad = (x.shape[0] // fshift + 1) * fshift - x.shape[0]
40
+ if pad_sides == 1:
41
+ return 0, pad
42
+ else:
43
+ return pad // 2, pad // 2 + pad % 2
44
+
45
+ def extract_mel_from_fname(wav_path,
46
+ fft_size=512,
47
+ hop_size=320,
48
+ win_length=512,
49
+ window="hann",
50
+ num_mels=80,
51
+ fmin=80,
52
+ fmax=7600,
53
+ eps=1e-6,
54
+ sample_rate=16000,
55
+ min_level_db=-100):
56
+ if isinstance(wav_path, str):
57
+ wav, _ = librosa.core.load(wav_path, sr=sample_rate)
58
+ else:
59
+ wav = wav_path
60
+
61
+ # get amplitude spectrogram
62
+ x_stft = librosa.stft(wav, n_fft=fft_size, hop_length=hop_size,
63
+ win_length=win_length, window=window, center=False)
64
+ spc = np.abs(x_stft) # (n_bins, T)
65
+
66
+ # get mel basis
67
+ fmin = 0 if fmin == -1 else fmin
68
+ fmax = sample_rate / 2 if fmax == -1 else fmax
69
+ mel_basis = librosa.filters.mel(sr=sample_rate, n_fft=fft_size, n_mels=num_mels, fmin=fmin, fmax=fmax)
70
+ mel = mel_basis @ spc
71
+
72
+ mel = np.log10(np.maximum(eps, mel)) # (n_mel_bins, T)
73
+ mel = mel.T
74
+
75
+ l_pad, r_pad = librosa_pad_lr(wav, fft_size, hop_size, 1)
76
+ wav = np.pad(wav, (l_pad, r_pad), mode='constant', constant_values=0.0)
77
+
78
+ return wav.T, mel
79
+
80
+ def extract_f0_from_wav_and_mel(wav, mel,
81
+ hop_size=320,
82
+ audio_sample_rate=16000,
83
+ ):
84
+ time_step = hop_size / audio_sample_rate * 1000
85
+ f0_min = 80
86
+ f0_max = 750
87
+ f0 = parselmouth.Sound(wav, audio_sample_rate).to_pitch_ac(
88
+ time_step=time_step / 1000, voicing_threshold=0.6,
89
+ pitch_floor=f0_min, pitch_ceiling=f0_max).selected_array['frequency']
90
+
91
+ delta_l = len(mel) - len(f0)
92
+ assert np.abs(delta_l) <= 8
93
+ if delta_l > 0:
94
+ f0 = np.concatenate([f0, [f0[-1]] * delta_l], 0)
95
+ f0 = f0[:len(mel)]
96
+ pitch_coarse = f0_to_coarse(f0)
97
+ return f0, pitch_coarse
98
+
99
+
100
+ def extract_mel_f0_from_fname(wav_name=None, out_name=None):
101
+ try:
102
+ out_name = wav_name.replace(".wav", "_mel_f0.npy").replace("/audio/", "/mel_f0/")
103
+ os.makedirs(os.path.dirname(out_name), exist_ok=True)
104
+
105
+ wav, mel = extract_mel_from_fname(wav_name)
106
+ f0, f0_coarse = extract_f0_from_wav_and_mel(wav, mel)
107
+ out_dict = {
108
+ "mel": mel, # [T, 80]
109
+ "f0": f0,
110
+ }
111
+ np.save(out_name, out_dict)
112
+ except Exception as e:
113
+ print(e)
114
+
115
+ def extract_mel_f0_from_video_name(mp4_name, wav_name=None, out_name=None):
116
+ if mp4_name.endswith(".mp4"):
117
+ wav_name = split_wav(mp4_name, wav_name)
118
+ if out_name is None:
119
+ out_name = mp4_name.replace(".mp4", "_mel_f0.npy").replace("/video/", "/mel_f0/")
120
+ elif mp4_name.endswith(".wav"):
121
+ wav_name = mp4_name
122
+ if out_name is None:
123
+ out_name = mp4_name.replace(".wav", "_mel_f0.npy").replace("/audio/", "/mel_f0/")
124
+
125
+ os.makedirs(os.path.dirname(out_name), exist_ok=True)
126
+
127
+ wav, mel = extract_mel_from_fname(wav_name)
128
+
129
+ f0, f0_coarse = extract_f0_from_wav_and_mel(wav, mel)
130
+ out_dict = {
131
+ "mel": mel, # [T, 80]
132
+ "f0": f0,
133
+ }
134
+ np.save(out_name, out_dict)
135
+
136
+
137
+ if __name__ == '__main__':
138
+ from argparse import ArgumentParser
139
+ parser = ArgumentParser()
140
+ parser.add_argument('--video_id', type=str, default='May', help='')
141
+ args = parser.parse_args()
142
+ ### Process Single Long Audio for NeRF dataset
143
+ person_id = args.video_id
144
+
145
+ wav_16k_name = f"data/processed/videos/{person_id}/aud.wav"
146
+ out_name = f"data/processed/videos/{person_id}/aud_mel_f0.npy"
147
+ extract_mel_f0_from_video_name(wav_16k_name, out_name=out_name) # keyword arg so it is not mistaken for wav_name
148
+ print(f"Saved at {out_name}")
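The mel and F0 features above share that framing: hop_size=320 at 16 kHz gives 50 frames per second, again two audio frames per 25 fps video frame. A small sketch feeding a synthetic waveform through the extractors (assumes the repo's dependencies such as librosa and parselmouth are installed; the clip is placeholder noise):

    import numpy as np
    from data_gen.utils.process_audio.extract_mel_f0 import extract_mel_from_fname, extract_f0_from_wav_and_mel

    wav = np.random.randn(2 * 16000).astype(np.float32) * 0.01   # 2 s of placeholder audio at 16 kHz
    wav_padded, mel = extract_mel_from_fname(wav)                # mel: [T, 80], hop 320 -> ~50 frames/s
    f0, f0_coarse = extract_f0_from_wav_and_mel(wav_padded, mel)
    print(mel.shape, f0.shape)                                   # roughly (99, 80) and (99,)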
data_gen/utils/process_audio/resample_audio_to_16k.py ADDED
@@ -0,0 +1,49 @@
1
+ import os, glob
2
+ from utils.commons.os_utils import multiprocess_glob
3
+ from utils.commons.multiprocess_utils import multiprocess_run_tqdm
4
+
5
+
6
+ def extract_wav16k_job(audio_name:str):
7
+ out_path = audio_name.replace("/audio_raw/","/audio/",1)
8
+ assert out_path != audio_name # prevent inplace
9
+ os.makedirs(os.path.dirname(out_path), exist_ok=True)
10
+ ffmpeg_path = "/usr/bin/ffmpeg"
11
+
12
+ cmd = f'{ffmpeg_path} -i {audio_name} -ar 16000 -v quiet -y {out_path}'
13
+ os.system(cmd)
14
+
15
+ if __name__ == '__main__':
16
+ import argparse, glob, tqdm, random
17
+ parser = argparse.ArgumentParser()
18
+ parser.add_argument("--aud_dir", default='/home/tiger/datasets/raw/CMLR/audio_raw/')
19
+ parser.add_argument("--ds_name", default='CMLR')
20
+ parser.add_argument("--num_workers", default=64, type=int)
21
+ parser.add_argument("--process_id", default=0, type=int)
22
+ parser.add_argument("--total_process", default=1, type=int)
23
+ args = parser.parse_args()
24
+ print(f"args {args}")
25
+
26
+ aud_dir = args.aud_dir
27
+ ds_name = args.ds_name
28
+ if ds_name in ['CMLR']:
29
+ aud_name_pattern = os.path.join(aud_dir, "*/*/*.wav")
30
+ aud_names = multiprocess_glob(aud_name_pattern)
31
+ else:
32
+ raise NotImplementedError()
33
+ aud_names = sorted(aud_names)
34
+ print(f"total audio number : {len(aud_names)}")
35
+ print(f"first {aud_names[0]} last {aud_names[-1]}")
36
+ # exit()
37
+ process_id = args.process_id
38
+ total_process = args.total_process
39
+ if total_process > 1:
40
+ assert process_id <= total_process -1
41
+ num_samples_per_process = len(aud_names) // total_process
42
+ if process_id == total_process:
43
+ aud_names = aud_names[process_id * num_samples_per_process : ]
44
+ else:
45
+ aud_names = aud_names[process_id * num_samples_per_process : (process_id+1) * num_samples_per_process]
46
+
47
+ for i, res in multiprocess_run_tqdm(extract_wav16k_job, aud_names, num_workers=args.num_workers, desc="resampling videos"):
48
+ pass
49
+
data_gen/utils/process_image/extract_lm2d.py ADDED
@@ -0,0 +1,197 @@
1
+ import os
2
+ os.environ["OMP_NUM_THREADS"] = "1"
3
+ import sys
4
+
5
+ import glob
6
+ import cv2
7
+ import tqdm
8
+ import numpy as np
9
+ from data_gen.utils.mp_feature_extractors.face_landmarker import MediapipeLandmarker
10
+ from utils.commons.multiprocess_utils import multiprocess_run_tqdm
11
+ import warnings
12
+ warnings.filterwarnings('ignore')
13
+
14
+ import random
15
+ random.seed(42)
16
+
17
+ import pickle
18
+ import json
19
+ import gzip
20
+ from typing import Any
21
+
22
+ def load_file(filename, is_gzip: bool = False, is_json: bool = False) -> Any:
23
+ if is_json:
24
+ if is_gzip:
25
+ with gzip.open(filename, "rt", encoding="utf-8") as f: # text mode is required when passing an encoding
26
+ loaded_object = json.load(f)
27
+ return loaded_object
28
+ else:
29
+ with open(filename, "r", encoding="utf-8") as f:
30
+ loaded_object = json.load(f)
31
+ return loaded_object
32
+ else:
33
+ if is_gzip:
34
+ with gzip.open(filename, "rb") as f:
35
+ loaded_object = pickle.load(f)
36
+ return loaded_object
37
+ else:
38
+ with open(filename, "rb") as f:
39
+ loaded_object = pickle.load(f)
40
+ return loaded_object
41
+
42
+ def save_file(filename, content, is_gzip: bool = False, is_json: bool = False) -> None:
43
+ if is_json:
44
+ if is_gzip:
45
+ with gzip.open(filename, "wt", encoding="utf-8") as f: # text mode is required when passing an encoding
46
+ json.dump(content, f)
47
+ else:
48
+ with open(filename, "w", encoding="utf-8") as f:
49
+ json.dump(content, f)
50
+ else:
51
+ if is_gzip:
52
+ with gzip.open(filename, "wb") as f:
53
+ pickle.dump(content, f)
54
+ else:
55
+ with open(filename, "wb") as f:
56
+ pickle.dump(content, f)
57
+
58
+ face_landmarker = None
59
+
60
+ def extract_lms_mediapipe_job(img):
61
+ if img is None:
62
+ return None
63
+ global face_landmarker
64
+ if face_landmarker is None:
65
+ face_landmarker = MediapipeLandmarker()
66
+ lm478 = face_landmarker.extract_lm478_from_img(img)
67
+ return lm478
68
+
69
+ def extract_landmark_job(img_name):
70
+ try:
71
+ # if img_name == 'datasets/PanoHeadGen/raw/images/multi_view/chunk_0/seed0000002.png':
72
+ # print(1)
73
+ # input()
74
+ out_name = img_name.replace("/images_512/", "/lms_2d/").replace(".png","_lms.npy")
75
+ if os.path.exists(out_name):
76
+ print("out exists, skip...")
77
+ return
78
+ try:
79
+ os.makedirs(os.path.dirname(out_name), exist_ok=True)
80
+ except:
81
+ pass
82
+ img = cv2.imread(img_name)[:,:,::-1]
83
+
84
+ if img is not None:
85
+ lm468 = extract_lms_mediapipe_job(img)
86
+ if lm468 is not None:
87
+ np.save(out_name, lm468)
88
+ # print("Hahaha, solve one item!!!")
89
+ except Exception as e:
90
+ print(e)
91
+ pass
92
+
93
+ def out_exist_job(img_name):
94
+ out_name = img_name.replace("/images_512/", "/lms_2d/").replace(".png","_lms.npy")
95
+ if os.path.exists(out_name):
96
+ return None
97
+ else:
98
+ return img_name
99
+
100
+ # def get_todo_img_names(img_names):
101
+ # todo_img_names = []
102
+ # for i, res in multiprocess_run_tqdm(out_exist_job, img_names, num_workers=64):
103
+ # if res is not None:
104
+ # todo_img_names.append(res)
105
+ # return todo_img_names
106
+
107
+
108
+ if __name__ == '__main__':
109
+ import argparse, glob, tqdm, random
110
+ parser = argparse.ArgumentParser()
111
+ parser.add_argument("--img_dir", default='/home/tiger/datasets/raw/FFHQ/images_512/')
112
+ parser.add_argument("--ds_name", default='FFHQ')
113
+ parser.add_argument("--num_workers", default=64, type=int)
114
+ parser.add_argument("--process_id", default=0, type=int)
115
+ parser.add_argument("--total_process", default=1, type=int)
116
+ parser.add_argument("--reset", action='store_true')
117
+ parser.add_argument("--img_names_file", default="img_names.pkl", type=str)
118
+ parser.add_argument("--load_img_names", action="store_true")
119
+
120
+ args = parser.parse_args()
121
+ print(f"args {args}")
122
+ img_dir = args.img_dir
123
+ img_names_file = os.path.join(img_dir, args.img_names_file)
124
+ if args.load_img_names:
125
+ img_names = load_file(img_names_file)
126
+ print(f"load image names from {img_names_file}")
127
+ else:
128
+ if args.ds_name == 'FFHQ_MV':
129
+ img_name_pattern1 = os.path.join(img_dir, "ref_imgs/*.png")
130
+ img_names1 = glob.glob(img_name_pattern1)
131
+ img_name_pattern2 = os.path.join(img_dir, "mv_imgs/*.png")
132
+ img_names2 = glob.glob(img_name_pattern2)
133
+ img_names = img_names1 + img_names2
134
+ img_names = sorted(img_names)
135
+ elif args.ds_name == 'FFHQ':
136
+ img_name_pattern = os.path.join(img_dir, "*.png")
137
+ img_names = glob.glob(img_name_pattern)
138
+ img_names = sorted(img_names)
139
+ elif args.ds_name == "PanoHeadGen":
140
+ # img_name_patterns = ["ref/*/*.png", "multi_view/*/*.png", "reverse/*/*.png"]
141
+ img_name_patterns = ["ref/*/*.png"]
142
+ img_names = []
143
+ for img_name_pattern in img_name_patterns:
144
+ img_name_pattern_full = os.path.join(img_dir, img_name_pattern)
145
+ img_names_part = glob.glob(img_name_pattern_full)
146
+ img_names.extend(img_names_part)
147
+ img_names = sorted(img_names)
148
+
149
+ # save image names
150
+ if not args.load_img_names:
151
+ save_file(img_names_file, img_names)
152
+ print(f"save image names in {img_names_file}")
153
+
154
+ print(f"total images number: {len(img_names)}")
155
+
156
+
157
+ process_id = args.process_id
158
+ total_process = args.total_process
159
+ if total_process > 1:
160
+ assert process_id <= total_process -1
161
+ num_samples_per_process = len(img_names) // total_process
162
+ if process_id == total_process:
163
+ img_names = img_names[process_id * num_samples_per_process : ]
164
+ else:
165
+ img_names = img_names[process_id * num_samples_per_process : (process_id+1) * num_samples_per_process]
166
+
167
+ # if not args.reset:
168
+ # img_names = get_todo_img_names(img_names)
169
+
170
+
171
+ print(f"todo_image {img_names[:10]}")
172
+ print(f"processing images number in this process: {len(img_names)}")
173
+ # print(f"todo images number: {len(img_names)}")
174
+ # input()
175
+ # exit()
176
+
177
+ if args.num_workers == 1:
178
+ index = 0
179
+ for img_name in tqdm.tqdm(img_names, desc=f"Root process {args.process_id}: extracting MP-based landmark2d"):
180
+ try:
181
+ extract_landmark_job(img_name)
182
+ except Exception as e:
183
+ print(e)
184
+ pass
185
+ if index % max(1, int(len(img_names) * 0.003)) == 0:
186
+ print(f"processed {index} / {len(img_names)}")
187
+ sys.stdout.flush()
188
+ index += 1
189
+ else:
190
+ for i, res in multiprocess_run_tqdm(
191
+ extract_landmark_job, img_names,
192
+ num_workers=args.num_workers,
193
+ desc=f"Root {args.process_id}: extracting MP-based landmark2d"):
194
+ # if index % max(1, int(len(img_names) * 0.003)) == 0:
195
+ print(f"processed {i+1} / {len(img_names)}")
196
+ sys.stdout.flush()
197
+ print(f"Root {args.process_id}: Finished extracting.")
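The --process_id / --total_process flags shard the image list across independent jobs; the same slicing idiom recurs in the other preprocessing scripts here. A standalone sketch of the split (the file names are placeholders):

    img_names = [f"{i:05d}.png" for i in range(10)]      # placeholder file names
    total_process, process_id = 3, 1
    num_per_proc = len(img_names) // total_process
    shard = img_names[process_id * num_per_proc : (process_id + 1) * num_per_proc]
    print(shard)                                         # ['00003.png', '00004.png', '00005.png']

Note that the remainder (here '00009.png') would only be picked up by the process_id == total_process branch, which the preceding assert makes unreachable, so lists that do not divide evenly drop a few trailing items.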
data_gen/utils/process_image/extract_segment_imgs.py ADDED
@@ -0,0 +1,114 @@
1
+ import os
2
+ os.environ["OMP_NUM_THREADS"] = "1"
3
+
4
+ import glob
5
+ import cv2
6
+ import tqdm
7
+ import numpy as np
8
+ import PIL
9
+ from utils.commons.tensor_utils import convert_to_np
10
+ import torch
11
+ import mediapipe as mp
12
+ from utils.commons.multiprocess_utils import multiprocess_run_tqdm
13
+ from data_gen.utils.mp_feature_extractors.mp_segmenter import MediapipeSegmenter
14
+ from data_gen.utils.process_video.extract_segment_imgs import inpaint_torso_job, extract_background, save_rgb_image_to_path
15
+ seg_model = MediapipeSegmenter()
16
+
17
+
18
+ def extract_segment_job(img_name):
19
+ try:
20
+ img = cv2.imread(img_name)
21
+ img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
22
+
23
+ segmap = seg_model._cal_seg_map(img)
24
+ bg_img = extract_background([img], [segmap])
25
+ out_img_name = img_name.replace("/images_512/",f"/bg_img/").replace(".mp4", ".jpg")
26
+ save_rgb_image_to_path(bg_img, out_img_name)
27
+
28
+ com_img = img.copy()
29
+ bg_part = segmap[0].astype(bool)[..., None].repeat(3,axis=-1)
30
+ com_img[bg_part] = bg_img[bg_part]
31
+ out_img_name = img_name.replace("/images_512/",f"/com_imgs/")
32
+ save_rgb_image_to_path(com_img, out_img_name)
33
+
34
+ for mode in ['head', 'torso', 'person', 'torso_with_bg', 'bg']:
35
+ out_img, _ = seg_model._seg_out_img_with_segmap(img, segmap, mode=mode)
36
+ out_img_name = img_name.replace("/images_512/",f"/{mode}_imgs/")
37
+ out_img = cv2.cvtColor(out_img, cv2.COLOR_RGB2BGR)
38
+ try: os.makedirs(os.path.dirname(out_img_name), exist_ok=True)
39
+ except: pass
40
+ cv2.imwrite(out_img_name, out_img)
41
+
42
+ inpaint_torso_img, inpaint_torso_with_bg_img, _, _ = inpaint_torso_job(img, segmap)
43
+ out_img_name = img_name.replace("/images_512/",f"/inpaint_torso_imgs/")
44
+ save_rgb_image_to_path(inpaint_torso_img, out_img_name)
45
+ inpaint_torso_with_bg_img[bg_part] = bg_img[bg_part]
46
+ out_img_name = img_name.replace("/images_512/",f"/inpaint_torso_with_com_bg_imgs/")
47
+ save_rgb_image_to_path(inpaint_torso_with_bg_img, out_img_name)
48
+ return 0
49
+ except Exception as e:
50
+ print(e)
51
+ return 1
52
+
53
+ def out_exist_job(img_name):
54
+ out_name1 = img_name.replace("/images_512/", "/head_imgs/")
55
+ out_name2 = img_name.replace("/images_512/", "/com_imgs/")
56
+ out_name3 = img_name.replace("/images_512/", "/inpaint_torso_with_com_bg_imgs/")
57
+
58
+ if os.path.exists(out_name1) and os.path.exists(out_name2) and os.path.exists(out_name3):
59
+ return None
60
+ else:
61
+ return img_name
62
+
63
+ def get_todo_img_names(img_names):
64
+ todo_img_names = []
65
+ for i, res in multiprocess_run_tqdm(out_exist_job, img_names, num_workers=64):
66
+ if res is not None:
67
+ todo_img_names.append(res)
68
+ return todo_img_names
69
+
70
+
71
+ if __name__ == '__main__':
72
+ import argparse, glob, tqdm, random
73
+ parser = argparse.ArgumentParser()
74
+ parser.add_argument("--img_dir", default='./images_512')
75
+ # parser.add_argument("--img_dir", default='/home/tiger/datasets/raw/FFHQ/images_512')
76
+ parser.add_argument("--ds_name", default='FFHQ')
77
+ parser.add_argument("--num_workers", default=1, type=int)
78
+ parser.add_argument("--seed", default=0, type=int)
79
+ parser.add_argument("--process_id", default=0, type=int)
80
+ parser.add_argument("--total_process", default=1, type=int)
81
+ parser.add_argument("--reset", action='store_true')
82
+
83
+ args = parser.parse_args()
84
+ img_dir = args.img_dir
85
+ if args.ds_name == 'FFHQ_MV':
86
+ img_name_pattern1 = os.path.join(img_dir, "ref_imgs/*.png")
87
+ img_names1 = glob.glob(img_name_pattern1)
88
+ img_name_pattern2 = os.path.join(img_dir, "mv_imgs/*.png")
89
+ img_names2 = glob.glob(img_name_pattern2)
90
+ img_names = img_names1 + img_names2
91
+ elif args.ds_name == 'FFHQ':
92
+ img_name_pattern = os.path.join(img_dir, "*.png")
93
+ img_names = glob.glob(img_name_pattern)
94
+
95
+ img_names = sorted(img_names)
96
+ random.seed(args.seed)
97
+ random.shuffle(img_names)
98
+
99
+ process_id = args.process_id
100
+ total_process = args.total_process
101
+ if total_process > 1:
102
+ assert process_id <= total_process -1
103
+ num_samples_per_process = len(img_names) // total_process
104
+ if process_id == total_process:
105
+ img_names = img_names[process_id * num_samples_per_process : ]
106
+ else:
107
+ img_names = img_names[process_id * num_samples_per_process : (process_id+1) * num_samples_per_process]
108
+
109
+ if not args.reset:
110
+ img_names = get_todo_img_names(img_names)
111
+ print(f"todo images number: {len(img_names)}")
112
+
113
+ for vid_name in multiprocess_run_tqdm(extract_segment_job ,img_names, desc=f"Root process {args.process_id}: extracting segment images", num_workers=args.num_workers):
114
+ pass
data_gen/utils/process_image/fit_3dmm_landmark.py ADDED
@@ -0,0 +1,369 @@
1
+ from numpy.core.numeric import require
2
+ from numpy.lib.function_base import quantile
3
+ import torch
4
+ import torch.nn.functional as F
5
+ import copy
6
+ import numpy as np
7
+
8
+ import os
9
+ import sys
10
+ import cv2
11
+ import argparse
12
+ import tqdm
13
+ from utils.commons.multiprocess_utils import multiprocess_run_tqdm
14
+ from data_gen.utils.mp_feature_extractors.face_landmarker import MediapipeLandmarker
15
+
16
+ from deep_3drecon.deep_3drecon_models.bfm import ParametricFaceModel
17
+ import pickle
18
+
19
+ face_model = ParametricFaceModel(bfm_folder='deep_3drecon/BFM',
20
+ camera_distance=10, focal=1015, keypoint_mode='mediapipe')
21
+ face_model.to("cuda")
22
+
23
+
24
+ index_lm68_from_lm468 = [127,234,93,132,58,136,150,176,152,400,379,365,288,361,323,454,356,70,63,105,66,107,336,296,334,293,300,168,197,5,4,75,97,2,326,305,
25
+ 33,160,158,133,153,144,362,385,387,263,373,380,61,40,37,0,267,270,291,321,314,17,84,91,78,81,13,311,308,402,14,178]
26
+
27
+ dir_path = os.path.dirname(os.path.realpath(__file__))
28
+
29
+ LAMBDA_REG_ID = 0.3
30
+ LAMBDA_REG_EXP = 0.05
31
+
32
+ def save_file(name, content):
33
+ with open(name, "wb") as f:
34
+ pickle.dump(content, f)
35
+
36
+ def load_file(name):
37
+ with open(name, "rb") as f:
38
+ content = pickle.load(f)
39
+ return content
40
+
41
+ def cal_lan_loss_mp(proj_lan, gt_lan):
42
+ # [B, 68, 2]
43
+ loss = (proj_lan - gt_lan).pow(2)
44
+ # loss = (proj_lan - gt_lan).abs()
45
+ unmatch_mask = [ 93, 127, 132, 234, 323, 356, 361, 454]
46
+ eye = [33,246,161,160,159,158,157,173,133,155,154,153,145,144,163,7] + [263,466,388,387,386,385,384,398,362,382,381,380,374,373,390,249]
47
+ inner_lip = [78,191,80,81,82,13,312,311,310,415,308,324,318,402,317,14,87,178,88,95]
48
+ outer_lip = [61,185,40,39,37,0,267,269,270,409,291,375,321,405,314,17,84,181,91,146]
49
+ weights = torch.ones_like(loss)
50
+ weights[:, eye] = 5
51
+ weights[:, inner_lip] = 2
52
+ weights[:, outer_lip] = 2
53
+ weights[:, unmatch_mask] = 0
54
+ loss = loss * weights
55
+ return torch.mean(loss)
56
+
57
+ def cal_lan_loss(proj_lan, gt_lan):
58
+ # [B, 68, 2]
59
+ loss = (proj_lan - gt_lan)** 2
60
+ # use the ldm weights from deep3drecon, see deep_3drecon/deep_3drecon_models/losses.py
61
+ weights = torch.zeros_like(loss)
62
+ weights = torch.ones_like(loss)
63
+ weights[:, 36:48, :] = 3 # eye 12 points
64
+ weights[:, -8:, :] = 3 # inner lip 8 points
65
+ weights[:, 28:31, :] = 3 # nose 3 points
66
+ loss = loss * weights
67
+ return torch.mean(loss)
68
+
69
+ def set_requires_grad(tensor_list):
70
+ for tensor in tensor_list:
71
+ tensor.requires_grad = True
72
+
73
+ def read_video_to_frames(img_name):
74
+ frames = []
75
+ cap = cv2.VideoCapture(img_name)
76
+ while cap.isOpened():
77
+ ret, frame_bgr = cap.read()
78
+ if frame_bgr is None:
79
+ break
80
+ frame_rgb = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2RGB)
81
+ frames.append(frame_rgb)
82
+ return np.stack(frames)
83
+
84
+ @torch.enable_grad()
85
+ def fit_3dmm_for_a_image(img_name, debug=False, keypoint_mode='mediapipe', device="cuda:0", save=True):
86
+ img = cv2.imread(img_name)
87
+ img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
88
+ img_h, img_w = img.shape[0], img.shape[1]
89
+ assert img_h == img_w
90
+ num_frames = 1
91
+
92
+ lm_name = img_name.replace("/images_512/", "/lms_2d/").replace(".png", "_lms.npy")
93
+ if lm_name.endswith('_lms.npy') and os.path.exists(lm_name):
94
+ lms = np.load(lm_name)
95
+ else:
96
+ # print("lms_2d file not found, try to extract it from image...")
97
+ try:
98
+ landmarker = MediapipeLandmarker()
99
+ lms = landmarker.extract_lm478_from_img_name(img_name)
100
+ # lms = landmarker.extract_lm478_from_img(img)
101
+ except Exception as e:
102
+ print(e)
103
+ return
104
+ if lms is None:
105
+ print("get None lms_2d, please check whether each frame has one head, exiting...")
106
+ return
107
+ lms = lms[:468].reshape([468,2])
108
+ lms = torch.FloatTensor(lms).to(device=device)
109
+ lms[..., 1] = img_h - lms[..., 1] # flip the height axis
110
+
111
+ if keypoint_mode == 'mediapipe':
112
+ cal_lan_loss_fn = cal_lan_loss_mp
113
+ out_name = img_name.replace("/images_512/", "/coeff_fit_mp/").replace(".png", "_coeff_fit_mp.npy")
114
+ else:
115
+ cal_lan_loss_fn = cal_lan_loss
116
+ out_name = img_name.replace("/images_512/", "/coeff_fit_lm68/").replace(".png", "_coeff_fit_lm68.npy")
117
+ try:
118
+ os.makedirs(os.path.dirname(out_name), exist_ok=True)
119
+ except:
120
+ pass
121
+
122
+ id_dim, exp_dim = 80, 64
123
+ sel_ids = np.arange(0, num_frames, 40)
124
+ sel_num = sel_ids.shape[0]
125
+ arg_focal = face_model.focal
126
+
127
+ h = w = face_model.center * 2
128
+ img_scale_factor = img_h / h
129
+ lms /= img_scale_factor
130
+ cxy = torch.tensor((w / 2.0, h / 2.0), dtype=torch.float).to(device=device)
131
+
132
+ id_para = lms.new_zeros((num_frames, id_dim), requires_grad=True) # lms.new_zeros((1, id_dim), requires_grad=True)
133
+ exp_para = lms.new_zeros((num_frames, exp_dim), requires_grad=True)
134
+ euler_angle = lms.new_zeros((num_frames, 3), requires_grad=True)
135
+ trans = lms.new_zeros((num_frames, 3), requires_grad=True)
136
+
137
+ focal_length = lms.new_zeros(1, requires_grad=True)
138
+ focal_length.data += arg_focal
139
+
140
+ set_requires_grad([id_para, exp_para, euler_angle, trans])
141
+
142
+ optimizer_idexp = torch.optim.Adam([id_para, exp_para], lr=.1)
143
+ optimizer_frame = torch.optim.Adam([euler_angle, trans], lr=.1)
144
+
145
+ # keep the other parameters at their initialization; first optimize euler and trans only
146
+ for _ in range(200):
147
+ proj_geo = face_model.compute_for_landmark_fit(
148
+ id_para, exp_para, euler_angle, trans)
149
+ loss_lan = cal_lan_loss_fn(proj_geo[:, :, :2], lms.detach())
150
+ loss = loss_lan
151
+ optimizer_frame.zero_grad()
152
+ loss.backward()
153
+ optimizer_frame.step()
154
+ # print(f"loss_lan: {loss_lan.item():.2f}, euler_abs_mean: {euler_angle.abs().mean().item():.4f}, euler_std: {euler_angle.std().item():.4f}, euler_min: {euler_angle.min().item():.4f}, euler_max: {euler_angle.max().item():.4f}")
155
+ # print(f"trans_z_mean: {trans[...,2].mean().item():.4f}, trans_z_std: {trans[...,2].std().item():.4f}, trans_min: {trans[...,2].min().item():.4f}, trans_max: {trans[...,2].max().item():.4f}")
156
+
157
+ for param_group in optimizer_frame.param_groups:
158
+ param_group['lr'] = 0.1
159
+
160
+ # "jointly roughly training id exp euler trans"
161
+ for _ in range(200):
162
+ proj_geo = face_model.compute_for_landmark_fit(
163
+ id_para, exp_para, euler_angle, trans)
164
+ loss_lan = cal_lan_loss_fn(
165
+ proj_geo[:, :, :2], lms.detach())
166
+ loss_regid = torch.mean(id_para*id_para) # regularization
167
+ loss_regexp = torch.mean(exp_para * exp_para)
168
+
169
+ loss = loss_lan + loss_regid * LAMBDA_REG_ID + loss_regexp * LAMBDA_REG_EXP
170
+ optimizer_idexp.zero_grad()
171
+ optimizer_frame.zero_grad()
172
+ loss.backward()
173
+ optimizer_idexp.step()
174
+ optimizer_frame.step()
175
+ # print(f"loss_lan: {loss_lan.item():.2f}, loss_reg_id: {loss_regid.item():.2f},loss_reg_exp: {loss_regexp.item():.2f},")
176
+ # print(f"euler_abs_mean: {euler_angle.abs().mean().item():.4f}, euler_std: {euler_angle.std().item():.4f}, euler_min: {euler_angle.min().item():.4f}, euler_max: {euler_angle.max().item():.4f}")
177
+ # print(f"trans_z_mean: {trans[...,2].mean().item():.4f}, trans_z_std: {trans[...,2].std().item():.4f}, trans_min: {trans[...,2].min().item():.4f}, trans_max: {trans[...,2].max().item():.4f}")
178
+
179
+ # start fine training, initialized from the roughly trained results
180
+ id_para_ = lms.new_zeros((num_frames, id_dim), requires_grad=True)
181
+ id_para_.data = id_para.data.clone()
182
+ id_para = id_para_
183
+ exp_para_ = lms.new_zeros((num_frames, exp_dim), requires_grad=True)
184
+ exp_para_.data = exp_para.data.clone()
185
+ exp_para = exp_para_
186
+ euler_angle_ = lms.new_zeros((num_frames, 3), requires_grad=True)
187
+ euler_angle_.data = euler_angle.data.clone()
188
+ euler_angle = euler_angle_
189
+ trans_ = lms.new_zeros((num_frames, 3), requires_grad=True)
190
+ trans_.data = trans.data.clone()
191
+ trans = trans_
192
+
193
+ batch_size = 1
194
+
195
+ # "fine fitting the 3DMM in batches"
196
+ for i in range(int((num_frames-1)/batch_size+1)):
197
+ if (i+1)*batch_size > num_frames:
198
+ start_n = num_frames-batch_size
199
+ sel_ids = np.arange(max(num_frames-batch_size,0), num_frames)
200
+ else:
201
+ start_n = i*batch_size
202
+ sel_ids = np.arange(i*batch_size, i*batch_size+batch_size)
203
+ sel_lms = lms[sel_ids]
204
+
205
+ sel_id_para = id_para.new_zeros(
206
+ (batch_size, id_dim), requires_grad=True)
207
+ sel_id_para.data = id_para[sel_ids].clone()
208
+ sel_exp_para = exp_para.new_zeros(
209
+ (batch_size, exp_dim), requires_grad=True)
210
+ sel_exp_para.data = exp_para[sel_ids].clone()
211
+ sel_euler_angle = euler_angle.new_zeros(
212
+ (batch_size, 3), requires_grad=True)
213
+ sel_euler_angle.data = euler_angle[sel_ids].clone()
214
+ sel_trans = trans.new_zeros((batch_size, 3), requires_grad=True)
215
+ sel_trans.data = trans[sel_ids].clone()
216
+
217
+ set_requires_grad([sel_id_para, sel_exp_para, sel_euler_angle, sel_trans])
218
+ optimizer_cur_batch = torch.optim.Adam(
219
+ [sel_id_para, sel_exp_para, sel_euler_angle, sel_trans], lr=0.005)
220
+
221
+ for j in range(50):
222
+ proj_geo = face_model.compute_for_landmark_fit(
223
+ sel_id_para, sel_exp_para, sel_euler_angle, sel_trans)
224
+ loss_lan = cal_lan_loss_fn(
225
+ proj_geo[:, :, :2], lms.unsqueeze(0).detach())
226
+
227
+ loss_regid = torch.mean(sel_id_para*sel_id_para) # regularization
228
+ loss_regexp = torch.mean(sel_exp_para*sel_exp_para)
229
+ loss = loss_lan + loss_regid * LAMBDA_REG_ID + loss_regexp * LAMBDA_REG_EXP
230
+ optimizer_cur_batch.zero_grad()
231
+ loss.backward()
232
+ optimizer_cur_batch.step()
233
+ print(f"batch {i} | loss_lan: {loss_lan.item():.2f}, loss_reg_id: {loss_regid.item():.2f},loss_reg_exp: {loss_regexp.item():.2f}")
234
+ id_para[sel_ids].data = sel_id_para.data.clone()
235
+ exp_para[sel_ids].data = sel_exp_para.data.clone()
236
+ euler_angle[sel_ids].data = sel_euler_angle.data.clone()
237
+ trans[sel_ids].data = sel_trans.data.clone()
238
+
239
+ coeff_dict = {'id': id_para.detach().cpu().numpy(), 'exp': exp_para.detach().cpu().numpy(),
240
+ 'euler': euler_angle.detach().cpu().numpy(), 'trans': trans.detach().cpu().numpy()}
241
+ if save:
242
+ np.save(out_name, coeff_dict, allow_pickle=True)
243
+
244
+ if debug:
245
+ import imageio
246
+ debug_name = img_name.replace("/images_512/", "/coeff_fit_mp_debug/").replace(".png", "_debug.png").replace(".jpg", "_debug.jpg")
247
+ try: os.makedirs(os.path.dirname(debug_name), exist_ok=True)
248
+ except: pass
249
+ proj_geo = face_model.compute_for_landmark_fit(id_para, exp_para, euler_angle, trans)
250
+ lm68s = proj_geo[:,:,:2].detach().cpu().numpy() # [T, 68,2]
251
+ lm68s = lm68s * img_scale_factor
252
+ lms = lms * img_scale_factor
253
+ lm68s[..., 1] = img_h - lm68s[..., 1] # flip the height axis
254
+ lms[..., 1] = img_h - lms[..., 1] # flip the height axis
255
+ lm68s = lm68s.astype(int)
256
+ lm68s = lm68s.reshape([-1,2])
257
+ lms = lms.cpu().numpy().astype(int).reshape([-1,2])
258
+ for lm in lm68s:
259
+ img = cv2.circle(img, lm, 1, (0, 0, 255), thickness=-1)
260
+ for gt_lm in lms:
261
+ img = cv2.circle(img, gt_lm, 2, (255, 0, 0), thickness=1)
262
+ imageio.imwrite(debug_name, img)
263
+ print(f"debug img saved at {debug_name}")
264
+ return coeff_dict
265
+
266
+ def out_exist_job(vid_name):
267
+ out_name = vid_name.replace("/images_512/", "/coeff_fit_mp/").replace(".png","_coeff_fit_mp.npy")
268
+ # if os.path.exists(out_name) or not os.path.exists(lms_name):
269
+ if os.path.exists(out_name):
270
+ return None
271
+ else:
272
+ return vid_name
273
+
274
+ def get_todo_img_names(img_names):
275
+ todo_img_names = []
276
+ for i, res in multiprocess_run_tqdm(out_exist_job, img_names, num_workers=16):
277
+ if res is not None:
278
+ todo_img_names.append(res)
279
+ return todo_img_names
280
+
281
+
282
+ if __name__ == '__main__':
283
+ import argparse, glob, tqdm
284
+ parser = argparse.ArgumentParser()
285
+ parser.add_argument("--img_dir", default='/home/tiger/datasets/raw/FFHQ/images_512')
286
+ parser.add_argument("--ds_name", default='FFHQ')
287
+ parser.add_argument("--seed", default=0, type=int)
288
+ parser.add_argument("--process_id", default=0, type=int)
289
+ parser.add_argument("--total_process", default=1, type=int)
290
+ parser.add_argument("--keypoint_mode", default='mediapipe', type=str)
291
+ parser.add_argument("--debug", action='store_true')
292
+ parser.add_argument("--reset", action='store_true')
293
+ parser.add_argument("--device", default="cuda:0", type=str)
294
+ parser.add_argument("--output_log", action='store_true')
295
+ parser.add_argument("--load_names", action="store_true")
296
+
297
+ args = parser.parse_args()
298
+ img_dir = args.img_dir
299
+ load_names = args.load_names
300
+
301
+ print(f"args {args}")
302
+
303
+ if args.ds_name == 'single_img':
304
+ img_names = [img_dir]
305
+ else:
306
+ img_names_path = os.path.join(img_dir, "img_dir.pkl")
307
+ if os.path.exists(img_names_path) and load_names:
308
+ print(f"loading vid names from {img_names_path}")
309
+ img_names = load_file(img_names_path)
310
+ else:
311
+ if args.ds_name == 'FFHQ_MV':
312
+ img_name_pattern1 = os.path.join(img_dir, "ref_imgs/*.png")
313
+ img_names1 = glob.glob(img_name_pattern1)
314
+ img_name_pattern2 = os.path.join(img_dir, "mv_imgs/*.png")
315
+ img_names2 = glob.glob(img_name_pattern2)
316
+ img_names = img_names1 + img_names2
317
+ img_names = sorted(img_names)
318
+ elif args.ds_name == 'FFHQ':
319
+ img_name_pattern = os.path.join(img_dir, "*.png")
320
+ img_names = glob.glob(img_name_pattern)
321
+ img_names = sorted(img_names)
322
+ elif args.ds_name == "PanoHeadGen":
323
+ img_name_patterns = ["ref/*/*.png"]
324
+ img_names = []
325
+ for img_name_pattern in img_name_patterns:
326
+ img_name_pattern_full = os.path.join(img_dir, img_name_pattern)
327
+ img_names_part = glob.glob(img_name_pattern_full)
328
+ img_names.extend(img_names_part)
329
+ img_names = sorted(img_names)
330
+ print(f"saving image names to {img_names_path}")
331
+ save_file(img_names_path, img_names)
332
+
333
+ # import random
334
+ # random.seed(args.seed)
335
+ # random.shuffle(img_names)
336
+
337
+ face_model = ParametricFaceModel(bfm_folder='deep_3drecon/BFM',
338
+ camera_distance=10, focal=1015, keypoint_mode=args.keypoint_mode)
339
+ face_model.to(torch.device(args.device))
340
+
341
+ process_id = args.process_id
342
+ total_process = args.total_process
343
+ if total_process > 1:
344
+ assert process_id <= total_process -1 and process_id >= 0
345
+ num_samples_per_process = len(img_names) // total_process
346
+ if process_id == total_process - 1:
347
+ img_names = img_names[process_id * num_samples_per_process : ]
348
+ else:
349
+ img_names = img_names[process_id * num_samples_per_process : (process_id+1) * num_samples_per_process]
350
+ print(f"image names number (before fileter): {len(img_names)}")
351
+
352
+
353
+ if not args.reset:
354
+ img_names = get_todo_img_names(img_names)
355
+
356
+ print(f"image names number (after fileter): {len(img_names)}")
357
+ for i in tqdm.trange(len(img_names), desc=f"process {process_id}: fitting 3dmm ..."):
358
+ img_name = img_names[i]
359
+ try:
360
+ fit_3dmm_for_a_image(img_name, args.debug, device=args.device)
361
+ except Exception as e:
362
+ print(img_name, e)
363
+ if args.output_log and i % max(int(len(img_names) * 0.003), 1) == 0:
364
+ print(f"process {process_id}: {i + 1} / {len(img_names)} done")
365
+ sys.stdout.flush()
366
+ sys.stderr.flush()
367
+
368
+ print(f"process {process_id}: fitting 3dmm all done")
369
+
data_gen/utils/process_video/euler2quaterion.py ADDED
@@ -0,0 +1,35 @@
1
+ import numpy as np
2
+ import torch
3
+ import math
4
+ import numba
5
+ from scipy.spatial.transform import Rotation as R
6
+
7
+ def euler2quaterion(euler, use_radian=True):
8
+ """
9
+ euler: np.array, [batch, 3]
10
+ return: the quaternion, np.array, [batch, 4]
11
+ """
12
+ r = R.from_euler('xyz',euler, degrees=not use_radian)
13
+ return r.as_quat()
14
+
15
+ def quaterion2euler(quat, use_radian=True):
16
+ """
17
+ quat: np.array, [batch, 4]
18
+ return: the euler, np.array, [batch, 3]
19
+ """
20
+ r = R.from_quat(quat)
21
+ return r.as_euler('xyz', degrees=not use_radian)
22
+
23
+ def rot2quaterion(rot):
24
+ r = R.from_matrix(rot)
25
+ return r.as_quat()
26
+
27
+ def quaterion2rot(quat):
28
+ r = R.from_quat(quat)
29
+ return r.as_matrix()
30
+
31
+ if __name__ == '__main__':
32
+ euler = np.array([89.999,89.999,89.999] * 100).reshape([100,3])
33
+ q = euler2quaterion(euler, use_radian=False)
34
+ e = quaterion2euler(q, use_radian=False)
35
+ print(" ")
data_gen/utils/process_video/extract_blink.py ADDED
@@ -0,0 +1,50 @@
1
+ import numpy as np
2
+ from data_util.face3d_helper import Face3DHelper
3
+ from utils.commons.tensor_utils import convert_to_tensor
4
+
5
+ def polygon_area(x, y):
6
+ """
7
+ x: [T, K=6]
8
+ y: [T, K=6]
9
+ return: [T,]
10
+ """
11
+ x_ = x - x.mean(axis=-1, keepdims=True)
12
+ y_ = y - y.mean(axis=-1, keepdims=True)
13
+ correction = x_[:,-1] * y_[:,0] - y_[:,-1]* x_[:,0]
14
+ main_area = (x_[:,:-1] * y_[:,1:]).sum(axis=-1) - (y_[:,:-1] * x_[:,1:]).sum(axis=-1)
15
+ return 0.5 * np.abs(main_area + correction)
16
+
17
+ def get_eye_area_percent(id, exp, face3d_helper):
18
+ id = convert_to_tensor(id)
19
+ exp = convert_to_tensor(exp)
20
+ cano_lm3d = face3d_helper.reconstruct_cano_lm3d(id, exp)
21
+ cano_lm2d = (cano_lm3d[..., :2] + 1) / 2
22
+ lms = cano_lm2d.cpu().numpy()
23
+ eyes_left = slice(36, 42)
24
+ eyes_right = slice(42, 48)
25
+ area_left = polygon_area(lms[:, eyes_left, 0], lms[:, eyes_left, 1])
26
+ area_right = polygon_area(lms[:, eyes_right, 0], lms[:, eyes_right, 1])
27
+ # area percentage of two eyes of the whole image...
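+ # cano_lm2d is normalized to [0, 1], so the whole-image area equals 1; the division by 1 below keeps that explicit.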
28
+ area_percent = (area_left + area_right) / 1 * 100 # recommend threshold is 0.25%
29
+ return area_percent # [T,]
30
+
31
+
32
+ if __name__ == '__main__':
33
+ import numpy as np
34
+ import imageio
35
+ import cv2
36
+ import torch
37
+ from data_gen.utils.process_video.extract_lm2d import extract_lms_mediapipe_job, read_video_to_frames, index_lm68_from_lm468
38
+ from data_gen.utils.process_video.fit_3dmm_landmark import fit_3dmm_for_a_video
39
+ from data_util.face3d_helper import Face3DHelper
40
+
41
+ face3d_helper = Face3DHelper()
42
+ video_name = 'data/raw/videos/May_10s.mp4'
43
+ frames = read_video_to_frames(video_name)
44
+ coeff = fit_3dmm_for_a_video(video_name, save=False)
45
+ area_percent = get_eye_area_percent(torch.tensor(coeff['id']), torch.tensor(coeff['exp']), face3d_helper)
46
+ writer = imageio.get_writer("1.mp4", fps=25)
47
+ for idx, frame in enumerate(frames):
48
+ frame = cv2.putText(frame, f"{area_percent[idx]:.2f}", org=(128,128), fontFace=cv2.FONT_HERSHEY_COMPLEX, fontScale=1, color=(255,0,0), thickness=1)
49
+ writer.append_data(frame)
50
+ writer.close()
data_gen/utils/process_video/extract_lm2d.py ADDED
@@ -0,0 +1,164 @@
1
+ import os
2
+ os.environ["OMP_NUM_THREADS"] = "1"
3
+ import sys
4
+ import glob
5
+ import cv2
6
+ import pickle
7
+ import tqdm
8
+ import numpy as np
9
+ import mediapipe as mp
10
+ from utils.commons.multiprocess_utils import multiprocess_run_tqdm
11
+ from utils.commons.os_utils import multiprocess_glob
12
+ from data_gen.utils.mp_feature_extractors.face_landmarker import MediapipeLandmarker
13
+ import warnings
14
+ import traceback
15
+
16
+ warnings.filterwarnings('ignore')
17
+
18
+ """
19
+ The Face_alignment-based lm68 has been deprecated because:
20
+ 1. its prediction accuracy around the eyes is very low;
21
+ 2. it cannot accurately predict the occluded jawline at large yaw angles, so the 3DMM GT labels themselves become wrong at large poses, which hurts performance.
22
+ We now use the mediapipe-based lm68 instead.
23
+ """
24
+ # def extract_landmarks(ori_imgs_dir):
25
+
26
+ # print(f'[INFO] ===== extract face landmarks from {ori_imgs_dir} =====')
27
+
28
+ # fa = face_alignment.FaceAlignment(face_alignment.LandmarksType._2D, flip_input=False)
29
+ # image_paths = glob.glob(os.path.join(ori_imgs_dir, '*.png'))
30
+ # for image_path in tqdm.tqdm(image_paths):
31
+ # out_name = image_path.replace("/images_512/", "/lms_2d/").replace(".png",".lms")
32
+ # if os.path.exists(out_name):
33
+ # continue
34
+ # input = cv2.imread(image_path, cv2.IMREAD_UNCHANGED) # [H, W, 3]
35
+ # input = cv2.cvtColor(input, cv2.COLOR_BGR2RGB)
36
+ # preds = fa.get_landmarks(input)
37
+ # if preds is None:
38
+ # print(f"Skip {image_path} for no face detected")
39
+ # continue
40
+ # if len(preds) > 0:
41
+ # lands = preds[0].reshape(-1, 2)[:,:2]
42
+ # os.makedirs(os.path.dirname(out_name), exist_ok=True)
43
+ # np.savetxt(out_name, lands, '%f')
44
+ # del fa
45
+ # print(f'[INFO] ===== extracted face landmarks =====')
46
+
47
+ def save_file(name, content):
48
+ with open(name, "wb") as f:
49
+ pickle.dump(content, f)
50
+
51
+ def load_file(name):
52
+ with open(name, "rb") as f:
53
+ content = pickle.load(f)
54
+ return content
55
+
56
+
57
+ face_landmarker = None
58
+
59
+ def extract_landmark_job(video_name, nerf=False):
60
+ try:
61
+ if nerf:
62
+ out_name = video_name.replace("/raw/", "/processed/").replace(".mp4","/lms_2d.npy")
63
+ else:
64
+ out_name = video_name.replace("/video/", "/lms_2d/").replace(".mp4","_lms.npy")
65
+ if os.path.exists(out_name):
66
+ # print("out exists, skip...")
67
+ return
68
+ try:
69
+ os.makedirs(os.path.dirname(out_name), exist_ok=True)
70
+ except:
71
+ pass
72
+ global face_landmarker
73
+ if face_landmarker is None:
74
+ face_landmarker = MediapipeLandmarker()
75
+ img_lm478, vid_lm478 = face_landmarker.extract_lm478_from_video_name(video_name)
76
+ lm478 = face_landmarker.combine_vid_img_lm478_to_lm478(img_lm478, vid_lm478)
77
+ np.save(out_name, lm478)
78
+ return True
79
+ # print("Hahaha, solve one item!!!")
80
+ except Exception as e:
81
+ traceback.print_exc()
82
+ return False
83
+
84
+ def out_exist_job(vid_name):
85
+ out_name = vid_name.replace("/video/", "/lms_2d/").replace(".mp4","_lms.npy")
86
+ if os.path.exists(out_name):
87
+ return None
88
+ else:
89
+ return vid_name
90
+
91
+ def get_todo_vid_names(vid_names):
92
+ if len(vid_names) == 1: # nerf
93
+ return vid_names
94
+ todo_vid_names = []
95
+ for i, res in multiprocess_run_tqdm(out_exist_job, vid_names, num_workers=128):
96
+ if res is not None:
97
+ todo_vid_names.append(res)
98
+ return todo_vid_names
99
+
100
+ if __name__ == '__main__':
101
+ import argparse, glob, tqdm, random
102
+ parser = argparse.ArgumentParser()
103
+ parser.add_argument("--vid_dir", default='nerf')
104
+ parser.add_argument("--ds_name", default='data/raw/videos/May.mp4')
105
+ parser.add_argument("--num_workers", default=2, type=int)
106
+ parser.add_argument("--process_id", default=0, type=int)
107
+ parser.add_argument("--total_process", default=1, type=int)
108
+ parser.add_argument("--reset", action="store_true")
109
+ parser.add_argument("--load_names", action="store_true")
110
+
111
+ args = parser.parse_args()
112
+ vid_dir = args.vid_dir
113
+ ds_name = args.ds_name
114
+ load_names = args.load_names
115
+
116
+ if ds_name.lower() == 'nerf': # process a single video
117
+ vid_names = [vid_dir]
118
+ out_names = [video_name.replace("/raw/", "/processed/").replace(".mp4","/lms_2d.npy") for video_name in vid_names]
119
+ else: # process the whole dataset
120
+ if ds_name in ['lrs3_trainval']:
121
+ vid_name_pattern = os.path.join(vid_dir, "*/*.mp4")
122
+ elif ds_name in ['TH1KH_512', 'CelebV-HQ']:
123
+ vid_name_pattern = os.path.join(vid_dir, "*.mp4")
124
+ elif ds_name in ['lrs2', 'lrs3', 'voxceleb2', 'CMLR']:
125
+ vid_name_pattern = os.path.join(vid_dir, "*/*/*.mp4")
126
+ elif ds_name in ["RAVDESS", 'VFHQ']:
127
+ vid_name_pattern = os.path.join(vid_dir, "*/*/*/*.mp4")
128
+ else:
129
+ raise NotImplementedError()
130
+
131
+ vid_names_path = os.path.join(vid_dir, "vid_names.pkl")
132
+ if os.path.exists(vid_names_path) and load_names:
133
+ print(f"loading vid names from {vid_names_path}")
134
+ vid_names = load_file(vid_names_path)
135
+ else:
136
+ vid_names = multiprocess_glob(vid_name_pattern)
137
+ vid_names = sorted(vid_names)
138
+ if not load_names:
139
+ print(f"saving vid names to {vid_names_path}")
140
+ save_file(vid_names_path, vid_names)
141
+ out_names = [video_name.replace("/video/", "/lms_2d/").replace(".mp4","_lms.npy") for video_name in vid_names]
142
+
143
+ process_id = args.process_id
144
+ total_process = args.total_process
145
+ if total_process > 1:
146
+ assert process_id <= total_process -1
147
+ num_samples_per_process = len(vid_names) // total_process
148
+ if process_id == total_process - 1:
149
+ vid_names = vid_names[process_id * num_samples_per_process : ]
150
+ else:
151
+ vid_names = vid_names[process_id * num_samples_per_process : (process_id+1) * num_samples_per_process]
152
+
153
+ if not args.reset:
154
+ vid_names = get_todo_vid_names(vid_names)
155
+ print(f"todo videos number: {len(vid_names)}")
156
+
157
+ fail_cnt = 0
158
+ job_args = [(vid_name, ds_name=='nerf') for vid_name in vid_names]
159
+ for (i, res) in multiprocess_run_tqdm(extract_landmark_job, job_args, num_workers=args.num_workers, desc=f"Root {args.process_id}: extracting MP-based landmark2d"):
160
+ if res is False:
161
+ fail_cnt += 1
162
+ print(f"finished {i + 1} / {len(vid_names)} = {(i + 1) / len(vid_names):.4f}, failed {fail_cnt} / {i + 1} = {fail_cnt / (i + 1):.4f}")
163
+ sys.stdout.flush()
164
+ pass
data_gen/utils/process_video/extract_segment_imgs.py ADDED
@@ -0,0 +1,494 @@
1
+ import os
2
+ os.environ["OMP_NUM_THREADS"] = "1"
3
+ import random
4
+ import glob
5
+ import cv2
6
+ import tqdm
7
+ import numpy as np
8
+ from typing import Union
9
+ from utils.commons.tensor_utils import convert_to_np
10
+ from utils.commons.os_utils import multiprocess_glob
11
+ import pickle
12
+ import traceback
13
+ import multiprocessing
14
+ from utils.commons.multiprocess_utils import multiprocess_run_tqdm
15
+ from scipy.ndimage import binary_erosion, binary_dilation
16
+ from sklearn.neighbors import NearestNeighbors
17
+ from mediapipe.tasks.python import vision
18
+ from data_gen.utils.mp_feature_extractors.mp_segmenter import MediapipeSegmenter, encode_segmap_mask_to_image, decode_segmap_mask_from_image, job_cal_seg_map_for_image
19
+
20
+ seg_model = None
21
+ segmenter = None
22
+ mat_model = None
23
+ lama_model = None
24
+ lama_config = None
25
+
26
+ from data_gen.utils.process_video.split_video_to_imgs import extract_img_job
27
+
28
+ BG_NAME_MAP = {
29
+ "knn": "",
30
+ }
31
+ FRAME_SELECT_INTERVAL = 5
32
+ SIM_METHOD = "mse"
33
+ SIM_THRESHOLD = 3
34
+
35
+ def save_file(name, content):
36
+ with open(name, "wb") as f:
37
+ pickle.dump(content, f)
38
+
39
+ def load_file(name):
40
+ with open(name, "rb") as f:
41
+ content = pickle.load(f)
42
+ return content
43
+
44
+ def save_rgb_alpha_image_to_path(img, alpha, img_path):
45
+ try: os.makedirs(os.path.dirname(img_path), exist_ok=True)
46
+ except: pass
47
+ cv2.imwrite(img_path, np.concatenate([cv2.cvtColor(img, cv2.COLOR_RGB2BGR), alpha], axis=-1))
48
+
49
+ def save_rgb_image_to_path(img, img_path):
50
+ try: os.makedirs(os.path.dirname(img_path), exist_ok=True)
51
+ except: pass
52
+ cv2.imwrite(img_path, cv2.cvtColor(img, cv2.COLOR_RGB2BGR))
53
+
54
+ def load_rgb_image_to_path(img_path):
55
+ return cv2.cvtColor(cv2.imread(img_path), cv2.COLOR_BGR2RGB)
56
+
57
+ def image_similarity(x: np.ndarray, y: np.ndarray, method="mse"):
58
+ if method == "mse":
59
+ return np.mean((x - y) ** 2)
60
+ else:
61
+ raise NotImplementedError
62
+
63
+ def extract_background(img_lst, segmap_mask_lst=None, method="knn", device='cpu', mix_bg=True):
64
+ """
65
+ img_lst: list of rgb ndarray
66
+ method: "knn"
67
+ """
68
+ global segmenter
69
+ global seg_model
70
+ global mat_model
71
+ global lama_model
72
+ global lama_config
73
+
74
+ assert len(img_lst) > 0
75
+ if segmap_mask_lst is not None:
76
+ assert len(segmap_mask_lst) == len(img_lst)
77
+ else:
78
+ del segmenter
79
+ del seg_model
80
+ seg_model = MediapipeSegmenter()
81
+ segmenter = vision.ImageSegmenter.create_from_options(seg_model.video_options)
82
+
83
+ def get_segmap_mask(img_lst, segmap_mask_lst, index):
84
+ if segmap_mask_lst is not None:
85
+ segmap = refresh_segment_mask(segmap_mask_lst[index])
86
+ else:
87
+ segmap = seg_model._cal_seg_map(refresh_image(img_lst[index]), segmenter=segmenter)
88
+ return segmap
89
+
90
+ if method == "knn":
91
+ num_frames = len(img_lst)
92
+ if num_frames < 100:
93
+ FRAME_SELECT_INTERVAL = 5
94
+ elif num_frames < 10000:
95
+ FRAME_SELECT_INTERVAL = 20
96
+ else:
97
+ FRAME_SELECT_INTERVAL = num_frames // 500
98
+
99
+ img_lst = img_lst[::FRAME_SELECT_INTERVAL] if num_frames > FRAME_SELECT_INTERVAL else img_lst[0:1]
100
+
101
+ if segmap_mask_lst is not None:
102
+ segmap_mask_lst = segmap_mask_lst[::FRAME_SELECT_INTERVAL] if num_frames > FRAME_SELECT_INTERVAL else segmap_mask_lst[0:1]
103
+ assert len(img_lst) == len(segmap_mask_lst)
104
+ # get H/W
105
+ h, w = refresh_image(img_lst[0]).shape[:2]
106
+
107
+ # nearest neighbors
108
+ all_xys = np.mgrid[0:h, 0:w].reshape(2, -1).transpose() # [512*512, 2] coordinate grid
109
+ distss = []
110
+ for idx, img in tqdm.tqdm(enumerate(img_lst), desc='combining backgrounds...', total=len(img_lst)):
111
+ segmap = get_segmap_mask(img_lst=img_lst, segmap_mask_lst=segmap_mask_lst, index=idx)
112
+ bg = (segmap[0]).astype(bool) # [h,w] bool mask
113
+ fg_xys = np.stack(np.nonzero(~bg)).transpose(1, 0) # [N_nonbg,2] coordinate of non-bg pixels
114
+ nbrs = NearestNeighbors(n_neighbors=1, algorithm='kd_tree').fit(fg_xys)
115
+ dists, _ = nbrs.kneighbors(all_xys) # [512*512, 1] distance to nearest non-bg pixel
116
+ distss.append(dists)
117
+
118
+ distss = np.stack(distss) # [B, 512*512, 1]
119
+ max_dist = np.max(distss, 0) # [512*512, 1]
120
+ max_id = np.argmax(distss, 0) # id of frame
121
+
122
+ bc_pixs = max_dist > 10 # pixels that appear as background in at least one frame; a pixel counts as background when its distance to the nearest non-bg pixel exceeds 10
123
+ bc_pixs_id = np.nonzero(bc_pixs)
124
+ bc_ids = max_id[bc_pixs]
125
+
126
+ # TODO: maybe we should reimplement here to avoid memory costs?
127
+ # though there is upper limits of images here
128
+ num_pixs = distss.shape[1]
129
+ bg_img = np.zeros((h*w, 3), dtype=np.uint8)
130
+ img_lst = [refresh_image(img) for img in img_lst]
131
+ imgs = np.stack(img_lst).reshape(-1, num_pixs, 3)
132
+ bg_img[bc_pixs_id, :] = imgs[bc_ids, bc_pixs_id, :] # for pixels that are surely background, sample their color directly from the corresponding frame
133
+ bg_img = bg_img.reshape(h, w, 3)
134
+
135
+ max_dist = max_dist.reshape(h, w)
136
+ bc_pixs = max_dist > 10 # 5
137
+ bg_xys = np.stack(np.nonzero(~bc_pixs)).transpose()
138
+ fg_xys = np.stack(np.nonzero(bc_pixs)).transpose()
139
+ nbrs = NearestNeighbors(n_neighbors=1, algorithm='kd_tree').fit(fg_xys)
140
+ distances, indices = nbrs.kneighbors(bg_xys) # for the remaining (non-bg) pixels, use KNN to find the nearest confident bg pixel
141
+ bg_fg_xys = fg_xys[indices[:, 0]]
142
+ bg_img[bg_xys[:, 0], bg_xys[:, 1], :] = bg_img[bg_fg_xys[:, 0], bg_fg_xys[:, 1], :]
143
+ else:
144
+ raise NotImplementedError # deprecated
145
+
146
+ return bg_img
147
+
148
+ def inpaint_torso_job(gt_img, segmap):
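+ # Vertical inpainting: for each torso/neck column whose top pixel borders the head region, the top
+ # color is repeated upwards for L pixels with gradual darkening (x0.98 per step); the neck inpaint
+ # area is additionally Gaussian-blurred to hide vertical seams.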
149
+ bg_part = (segmap[0]).astype(bool)
150
+ head_part = (segmap[1] + segmap[3] + segmap[5]).astype(bool)
151
+ neck_part = (segmap[2]).astype(bool)
152
+ torso_part = (segmap[4]).astype(bool)
153
+ img = gt_img.copy()
154
+ img[head_part] = 0
155
+
156
+ # torso part "vertical" in-painting...
157
+ L = 8 + 1
158
+ torso_coords = np.stack(np.nonzero(torso_part), axis=-1) # [M, 2]
159
+ # lexsort: sort 2D coords first by y then by x,
160
+ # ref: https://stackoverflow.com/questions/2706605/sorting-a-2d-numpy-array-by-multiple-axes
161
+ inds = np.lexsort((torso_coords[:, 0], torso_coords[:, 1]))
162
+ torso_coords = torso_coords[inds]
163
+ # choose the top pixel for each column
164
+ u, uid, ucnt = np.unique(torso_coords[:, 1], return_index=True, return_counts=True)
165
+ top_torso_coords = torso_coords[uid] # [m, 2]
166
+ # only keep top-is-head pixels
167
+ top_torso_coords_up = top_torso_coords.copy() - np.array([1, 0]) # [N, 2]
168
+ mask = head_part[tuple(top_torso_coords_up.T)]
169
+ if mask.any():
170
+ top_torso_coords = top_torso_coords[mask]
171
+ # get the color
172
+ top_torso_colors = gt_img[tuple(top_torso_coords.T)] # [m, 3]
173
+ # construct inpaint coords (vertically up, or minus in x)
174
+ inpaint_torso_coords = top_torso_coords[None].repeat(L, 0) # [L, m, 2]
175
+ inpaint_offsets = np.stack([-np.arange(L), np.zeros(L, dtype=np.int32)], axis=-1)[:, None] # [L, 1, 2]
176
+ inpaint_torso_coords += inpaint_offsets
177
+ inpaint_torso_coords = inpaint_torso_coords.reshape(-1, 2) # [Lm, 2]
178
+ inpaint_torso_colors = top_torso_colors[None].repeat(L, 0) # [L, m, 3]
179
+ darken_scaler = 0.98 ** np.arange(L).reshape(L, 1, 1) # [L, 1, 1]
180
+ inpaint_torso_colors = (inpaint_torso_colors * darken_scaler).reshape(-1, 3) # [Lm, 3]
181
+ # set color
182
+ img[tuple(inpaint_torso_coords.T)] = inpaint_torso_colors
183
+ inpaint_torso_mask = np.zeros_like(img[..., 0]).astype(bool)
184
+ inpaint_torso_mask[tuple(inpaint_torso_coords.T)] = True
185
+ else:
186
+ inpaint_torso_mask = None
187
+
188
+ # neck part "vertical" in-painting...
189
+ push_down = 4
190
+ L = 48 + push_down + 1
191
+ neck_part = binary_dilation(neck_part, structure=np.array([[0, 1, 0], [0, 1, 0], [0, 1, 0]], dtype=bool), iterations=3)
192
+ neck_coords = np.stack(np.nonzero(neck_part), axis=-1) # [M, 2]
193
+ # lexsort: sort 2D coords first by y then by x,
194
+ # ref: https://stackoverflow.com/questions/2706605/sorting-a-2d-numpy-array-by-multiple-axes
195
+ inds = np.lexsort((neck_coords[:, 0], neck_coords[:, 1]))
196
+ neck_coords = neck_coords[inds]
197
+ # choose the top pixel for each column
198
+ u, uid, ucnt = np.unique(neck_coords[:, 1], return_index=True, return_counts=True)
199
+ top_neck_coords = neck_coords[uid] # [m, 2]
200
+ # only keep top-is-head pixels
201
+ top_neck_coords_up = top_neck_coords.copy() - np.array([1, 0])
202
+ mask = head_part[tuple(top_neck_coords_up.T)]
203
+ top_neck_coords = top_neck_coords[mask]
204
+ # push these top down for 4 pixels to make the neck inpainting more natural...
205
+ offset_down = np.minimum(ucnt[mask] - 1, push_down)
206
+ top_neck_coords += np.stack([offset_down, np.zeros_like(offset_down)], axis=-1)
207
+ # get the color
208
+ top_neck_colors = gt_img[tuple(top_neck_coords.T)] # [m, 3]
209
+ # construct inpaint coords (vertically up, or minus in x)
210
+ inpaint_neck_coords = top_neck_coords[None].repeat(L, 0) # [L, m, 2]
211
+ inpaint_offsets = np.stack([-np.arange(L), np.zeros(L, dtype=np.int32)], axis=-1)[:, None] # [L, 1, 2]
212
+ inpaint_neck_coords += inpaint_offsets
213
+ inpaint_neck_coords = inpaint_neck_coords.reshape(-1, 2) # [Lm, 2]
214
+ inpaint_neck_colors = top_neck_colors[None].repeat(L, 0) # [L, m, 3]
215
+ darken_scaler = 0.98 ** np.arange(L).reshape(L, 1, 1) # [L, 1, 1]
216
+ inpaint_neck_colors = (inpaint_neck_colors * darken_scaler).reshape(-1, 3) # [Lm, 3]
217
+ # set color
218
+ img[tuple(inpaint_neck_coords.T)] = inpaint_neck_colors
219
+ # apply blurring to the inpaint area to avoid vertical-line artifacts...
220
+ inpaint_mask = np.zeros_like(img[..., 0]).astype(bool)
221
+ inpaint_mask[tuple(inpaint_neck_coords.T)] = True
222
+
223
+ blur_img = img.copy()
224
+ blur_img = cv2.GaussianBlur(blur_img, (5, 5), cv2.BORDER_DEFAULT)
225
+ img[inpaint_mask] = blur_img[inpaint_mask]
226
+
227
+ # set mask
228
+ torso_img_mask = (neck_part | torso_part | inpaint_mask)
229
+ torso_with_bg_img_mask = (bg_part | neck_part | torso_part | inpaint_mask)
230
+ if inpaint_torso_mask is not None:
231
+ torso_img_mask = torso_img_mask | inpaint_torso_mask
232
+ torso_with_bg_img_mask = torso_with_bg_img_mask | inpaint_torso_mask
233
+
234
+ torso_img = img.copy()
235
+ torso_img[~torso_img_mask] = 0
236
+ torso_with_bg_img = img.copy()
237
+ torso_with_bg_img[~torso_with_bg_img_mask] = 0
238
+
239
+ return torso_img, torso_img_mask, torso_with_bg_img, torso_with_bg_img_mask
240
+
241
+ def load_segment_mask_from_file(filename: str):
242
+ encoded_segmap = load_rgb_image_to_path(filename)
243
+ segmap_mask = decode_segmap_mask_from_image(encoded_segmap)
244
+ return segmap_mask
245
+
246
+ # load segment mask to memory if not loaded yet
247
+ def refresh_segment_mask(segmap_mask: Union[str, np.ndarray]):
248
+ if isinstance(segmap_mask, str):
249
+ segmap_mask = load_segment_mask_from_file(segmap_mask)
250
+ return segmap_mask
251
+
252
+ # load segment mask to memory if not loaded yet
253
+ def refresh_image(image: Union[str, np.ndarray]):
254
+ if isinstance(image, str):
255
+ image = load_rgb_image_to_path(image)
256
+ return image
257
+
258
+ def generate_segment_imgs_job(img_name, segmap, img):
259
+ out_img_name = segmap_name = img_name.replace("/gt_imgs/", "/segmaps/").replace(".jpg", ".png") # 存成jpg的话,pixel value会有误差
260
+ try: os.makedirs(os.path.dirname(out_img_name), exist_ok=True)
261
+ except: pass
262
+ encoded_segmap = encode_segmap_mask_to_image(segmap)
263
+ save_rgb_image_to_path(encoded_segmap, out_img_name)
264
+
265
+ for mode in ['head', 'torso', 'person', 'bg']:
266
+ out_img, mask = seg_model._seg_out_img_with_segmap(img, segmap, mode=mode)
267
+ img_alpha = 255 * np.ones((img.shape[0], img.shape[1], 1), dtype=np.uint8) # alpha
268
+ mask = mask[0][..., None]
269
+ img_alpha[~mask] = 0
270
+ out_img_name = img_name.replace("/gt_imgs/", f"/{mode}_imgs/").replace(".jpg", ".png")
271
+ save_rgb_alpha_image_to_path(out_img, img_alpha, out_img_name)
272
+
273
+ inpaint_torso_img, inpaint_torso_img_mask, inpaint_torso_with_bg_img, inpaint_torso_with_bg_img_mask = inpaint_torso_job(img, segmap)
274
+ img_alpha = 255 * np.ones((img.shape[0], img.shape[1], 1), dtype=np.uint8) # alpha
275
+ img_alpha[~inpaint_torso_img_mask[..., None]] = 0
276
+ out_img_name = img_name.replace("/gt_imgs/", f"/inpaint_torso_imgs/").replace(".jpg", ".png")
277
+ save_rgb_alpha_image_to_path(inpaint_torso_img, img_alpha, out_img_name)
278
+ return segmap_name
279
+
280
+ def segment_and_generate_for_image_job(img_name, img, segmenter_options=None, segmenter=None, store_in_memory=False):
281
+ img = refresh_image(img)
282
+ segmap_mask, segmap_image = job_cal_seg_map_for_image(img, segmenter_options=segmenter_options, segmenter=segmenter)
283
+ segmap_name = generate_segment_imgs_job(img_name=img_name, segmap=segmap_mask, img=img)
284
+ if store_in_memory:
285
+ return segmap_mask
286
+ else:
287
+ return segmap_name
288
+
289
+ def extract_segment_job(
290
+ video_name,
291
+ nerf=False,
292
+ background_method='knn',
293
+ device="cpu",
294
+ total_gpus=0,
295
+ mix_bg=True,
296
+ store_in_memory=False, # set to True to speed up a bit of preprocess, but leads to HUGE memory costs (100GB for 5-min video)
297
+ force_single_process=False, # turn this on if you find multi-process does not work on your environment
298
+ ):
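+ # Pipeline: (1) split the video into gt_imgs with ffmpeg if needed, (2) run Mediapipe segmentation per
+ # frame and save head/torso/person/bg/inpaint_torso images, (3) estimate a static background with the
+ # KNN method, (4) composite background-filled com_imgs.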
299
+ global segmenter
300
+ global seg_model
301
+ del segmenter
302
+ del seg_model
303
+ seg_model = MediapipeSegmenter()
304
+ segmenter = vision.ImageSegmenter.create_from_options(seg_model.options)
305
+ # nerf means that we extract only one video, so can enable multi-process acceleration
306
+ multiprocess_enable = nerf and not force_single_process
307
+ try:
308
+ if "cuda" in device:
309
+ # determine which cuda index from subprocess id
310
+ pname = multiprocessing.current_process().name
311
+ pid = int(pname.rsplit("-", 1)[-1]) - 1
312
+ cuda_id = pid % total_gpus
313
+ device = f"cuda:{cuda_id}"
314
+
315
+ if nerf: # single video
316
+ raw_img_dir = video_name.replace(".mp4", "/gt_imgs/").replace("/raw/","/processed/")
317
+ else: # whole dataset
318
+ raw_img_dir = video_name.replace(".mp4", "").replace("/video/", "/gt_imgs/")
319
+ if not os.path.exists(raw_img_dir):
320
+ extract_img_job(video_name, raw_img_dir) # use ffmpeg to split video into imgs
321
+
322
+ img_names = glob.glob(os.path.join(raw_img_dir, "*.jpg"))
323
+
324
+ img_lst = []
325
+
326
+ for img_name in img_names:
327
+ if store_in_memory:
328
+ img = load_rgb_image_to_path(img_name)
329
+ else:
330
+ img = img_name
331
+ img_lst.append(img)
332
+
333
+ print("| Extracting Segmaps && Saving...")
334
+ args = []
335
+ segmap_mask_lst = []
336
+ # preparing parameters for segment
337
+ for i in range(len(img_lst)):
338
+ img_name = img_names[i]
339
+ img = img_lst[i]
340
+ if multiprocess_enable: # create seg_model in subprocesses here
341
+ options = seg_model.options
342
+ segmenter_arg = None
343
+ else: # use seg_model of this process
344
+ options = None
345
+ segmenter_arg = segmenter
346
+ arg = (img_name, img, options, segmenter_arg, store_in_memory)
347
+ args.append(arg)
348
+
349
+ if multiprocess_enable:
350
+ for (_, res) in multiprocess_run_tqdm(segment_and_generate_for_image_job, args=args, num_workers=16, desc='generating segment images in multi-processes...'):
351
+ segmap_mask = res
352
+ segmap_mask_lst.append(segmap_mask)
353
+ else:
354
+ for index in tqdm.tqdm(range(len(img_lst)), desc="generating segment images in single-process..."):
355
+ segmap_mask = segment_and_generate_for_image_job(*args[index])
356
+ segmap_mask_lst.append(segmap_mask)
357
+ print("| Extracted Segmaps Done.")
358
+
359
+ print("| Extracting background...")
360
+ bg_prefix_name = f"bg{BG_NAME_MAP[background_method]}"
361
+ bg_img = extract_background(img_lst, segmap_mask_lst, method=background_method, device=device, mix_bg=mix_bg)
362
+ if nerf:
363
+ out_img_name = video_name.replace("/raw/", "/processed/").replace(".mp4", f"/{bg_prefix_name}.jpg")
364
+ else:
365
+ out_img_name = video_name.replace("/video/", f"/{bg_prefix_name}_img/").replace(".mp4", ".jpg")
366
+ save_rgb_image_to_path(bg_img, out_img_name)
367
+ print("| Extracted background done.")
368
+
369
+ print("| Extracting com_imgs...")
370
+ com_prefix_name = f"com{BG_NAME_MAP[background_method]}"
371
+ for i in tqdm.trange(len(img_names), desc='extracting com_imgs'):
372
+ img_name = img_names[i]
373
+ com_img = refresh_image(img_lst[i]).copy()
374
+ segmap = refresh_segment_mask(segmap_mask_lst[i])
375
+ bg_part = segmap[0].astype(bool)[..., None].repeat(3,axis=-1)
376
+ com_img[bg_part] = bg_img[bg_part]
377
+ out_img_name = img_name.replace("/gt_imgs/", f"/{com_prefix_name}_imgs/")
378
+ save_rgb_image_to_path(com_img, out_img_name)
379
+ print("| Extracted com_imgs done.")
380
+
381
+ return 0
382
+ except Exception as e:
383
+ print(str(type(e)), e)
384
+ traceback.print_exc()
385
+ return 1
386
+
387
+ def out_exist_job(vid_name, background_method='knn'):
388
+ com_prefix_name = f"com{BG_NAME_MAP[background_method]}"
389
+ img_dir = vid_name.replace("/video/", "/gt_imgs/").replace(".mp4", "")
390
+ out_dir1 = img_dir.replace("/gt_imgs/", "/head_imgs/")
391
+ out_dir2 = img_dir.replace("/gt_imgs/", f"/{com_prefix_name}_imgs/")
392
+
393
+ if os.path.exists(img_dir) and os.path.exists(out_dir1) and os.path.exists(out_dir2):
394
+ num_frames = len(os.listdir(img_dir))
395
+ if len(os.listdir(out_dir1)) == num_frames and len(os.listdir(out_dir2)) == num_frames:
396
+ return None
397
+ else:
398
+ return vid_name
399
+ else:
400
+ return vid_name
401
+
402
+ def get_todo_vid_names(vid_names, background_method='knn'):
403
+ if len(vid_names) == 1: # nerf
404
+ return vid_names
405
+ todo_vid_names = []
406
+ fn_args = [(vid_name, background_method) for vid_name in vid_names]
407
+ for i, res in multiprocess_run_tqdm(out_exist_job, fn_args, num_workers=16, desc="checking todo videos..."):
408
+ if res is not None:
409
+ todo_vid_names.append(res)
410
+ return todo_vid_names
411
+
412
+ if __name__ == '__main__':
413
+ import argparse, glob, tqdm, random
414
+ parser = argparse.ArgumentParser()
415
+ parser.add_argument("--vid_dir", default='/home/tiger/datasets/raw/TH1KH_512/video')
416
+ parser.add_argument("--ds_name", default='TH1KH_512')
417
+ parser.add_argument("--num_workers", default=48, type=int)
418
+ parser.add_argument("--seed", default=0, type=int)
419
+ parser.add_argument("--process_id", default=0, type=int)
420
+ parser.add_argument("--total_process", default=1, type=int)
421
+ parser.add_argument("--reset", action='store_true')
422
+ parser.add_argument("--load_names", action="store_true")
423
+ parser.add_argument("--background_method", choices=['knn', 'mat', 'ddnm', 'lama'], type=str, default='knn')
424
+ parser.add_argument("--total_gpus", default=0, type=int) # zero gpus means utilizing cpu
425
+ parser.add_argument("--no_mix_bg", action="store_true")
426
+ parser.add_argument("--store_in_memory", action="store_true") # set to True to speed up preprocess, but leads to high memory costs
427
+ parser.add_argument("--force_single_process", action="store_true") # turn this on if you find multi-process does not work on your environment
428
+
429
+ args = parser.parse_args()
430
+ vid_dir = args.vid_dir
431
+ ds_name = args.ds_name
432
+ load_names = args.load_names
433
+ background_method = args.background_method
434
+ total_gpus = args.total_gpus
435
+ mix_bg = not args.no_mix_bg
436
+ store_in_memory = args.store_in_memory
437
+ force_single_process = args.force_single_process
438
+
439
+ devices = os.environ.get('CUDA_VISIBLE_DEVICES', '').split(",")
440
+ for d in devices[:total_gpus]:
441
+ os.system(f'pkill -f "voidgpu{d}"')
442
+
443
+ if ds_name.lower() == 'nerf': # process a single video
444
+ vid_names = [vid_dir]
445
+ out_names = [video_name.replace("/raw/", "/processed/").replace(".mp4","_lms.npy") for video_name in vid_names]
446
+ else: # process the whole dataset
447
+ if ds_name in ['lrs3_trainval']:
448
+ vid_name_pattern = os.path.join(vid_dir, "*/*.mp4")
449
+ elif ds_name in ['TH1KH_512', 'CelebV-HQ']:
450
+ vid_name_pattern = os.path.join(vid_dir, "*.mp4")
451
+ elif ds_name in ['lrs2', 'lrs3', 'voxceleb2']:
452
+ vid_name_pattern = os.path.join(vid_dir, "*/*/*.mp4")
453
+ elif ds_name in ["RAVDESS", 'VFHQ']:
454
+ vid_name_pattern = os.path.join(vid_dir, "*/*/*/*.mp4")
455
+ else:
456
+ raise NotImplementedError()
457
+
458
+ vid_names_path = os.path.join(vid_dir, "vid_names.pkl")
459
+ if os.path.exists(vid_names_path) and load_names:
460
+ print(f"loading vid names from {vid_names_path}")
461
+ vid_names = load_file(vid_names_path)
462
+ else:
463
+ vid_names = multiprocess_glob(vid_name_pattern)
464
+ vid_names = sorted(vid_names)
465
+ print(f"saving vid names to {vid_names_path}")
466
+ save_file(vid_names_path, vid_names)
467
+
468
+ vid_names = sorted(vid_names)
469
+ random.seed(args.seed)
470
+ random.shuffle(vid_names)
471
+
472
+ process_id = args.process_id
473
+ total_process = args.total_process
474
+ if total_process > 1:
475
+ assert process_id <= total_process -1
476
+ num_samples_per_process = len(vid_names) // total_process
477
+ if process_id == total_process - 1:
478
+ vid_names = vid_names[process_id * num_samples_per_process : ]
479
+ else:
480
+ vid_names = vid_names[process_id * num_samples_per_process : (process_id+1) * num_samples_per_process]
481
+
482
+ if not args.reset:
483
+ vid_names = get_todo_vid_names(vid_names, background_method)
484
+ print(f"todo videos number: {len(vid_names)}")
485
+
486
+ device = "cuda" if total_gpus > 0 else "cpu"
487
+ extract_job = extract_segment_job
488
+ fn_args = [(vid_name, ds_name=='nerf', background_method, device, total_gpus, mix_bg, store_in_memory, force_single_process) for i, vid_name in enumerate(vid_names)]
489
+
490
+ if ds_name == 'nerf': # process a single video
491
+ extract_job(*fn_args[0])
492
+ else:
493
+ for vid_name in multiprocess_run_tqdm(extract_job, fn_args, desc=f"Root process {args.process_id}: segment images", num_workers=args.num_workers):
494
+ pass
data_gen/utils/process_video/fit_3dmm_landmark.py ADDED
@@ -0,0 +1,565 @@
1
+ # This is a script for efficient 3DMM coefficient extraction.
2
+ # It can reconstruct an accurate 3D face in real time.
3
+ # It is built upon BFM 2009 model and mediapipe landmark extractor.
4
+ # It is authored by ZhenhuiYe ([email protected]), free to contact him for any suggestion on improvement!
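+ # Example invocation for a single video (hypothetical, assuming the repo root is on PYTHONPATH):
+ #   python data_gen/utils/process_video/fit_3dmm_landmark.py --ds_name nerf --vid_dir data/raw/videos/May_10s.mp4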
5
+
6
+ from numpy.core.numeric import require
7
+ from numpy.lib.function_base import quantile
8
+ import torch
9
+ import torch.nn.functional as F
10
+ import copy
11
+ import numpy as np
12
+
13
+ import random
14
+ import pickle
15
+ import os
16
+ import sys
17
+ import cv2
18
+ import argparse
19
+ import tqdm
20
+ from utils.commons.multiprocess_utils import multiprocess_run_tqdm
21
+ from data_gen.utils.mp_feature_extractors.face_landmarker import MediapipeLandmarker, read_video_to_frames
22
+ from deep_3drecon.deep_3drecon_models.bfm import ParametricFaceModel
23
+ from deep_3drecon.secc_renderer import SECC_Renderer
24
+ from utils.commons.os_utils import multiprocess_glob
25
+
26
+
27
+ face_model = ParametricFaceModel(bfm_folder='deep_3drecon/BFM',
28
+ camera_distance=10, focal=1015, keypoint_mode='mediapipe')
29
+ face_model.to(torch.device("cuda:0"))
30
+
31
+ dir_path = os.path.dirname(os.path.realpath(__file__))
32
+
33
+
34
+ def draw_axes(img, pitch, yaw, roll, tx, ty, size=50):
35
+ # yaw = -yaw
36
+ pitch = - pitch
37
+ roll = - roll
38
+ rotation_matrix = cv2.Rodrigues(np.array([pitch, yaw, roll]))[0].astype(np.float64)
39
+ axes_points = np.array([
40
+ [1, 0, 0, 0],
41
+ [0, 1, 0, 0],
42
+ [0, 0, 1, 0]
43
+ ], dtype=np.float64)
44
+ axes_points = rotation_matrix @ axes_points
45
+ axes_points = (axes_points[:2, :] * size).astype(int)
46
+ axes_points[0, :] = axes_points[0, :] + tx
47
+ axes_points[1, :] = axes_points[1, :] + ty
48
+
49
+ new_img = img.copy()
50
+ cv2.line(new_img, tuple(axes_points[:, 3].ravel()), tuple(axes_points[:, 0].ravel()), (255, 0, 0), 3)
51
+ cv2.line(new_img, tuple(axes_points[:, 3].ravel()), tuple(axes_points[:, 1].ravel()), (0, 255, 0), 3)
52
+ cv2.line(new_img, tuple(axes_points[:, 3].ravel()), tuple(axes_points[:, 2].ravel()), (0, 0, 255), 3)
53
+ return new_img
54
+
55
+ def save_file(name, content):
56
+ with open(name, "wb") as f:
57
+ pickle.dump(content, f)
58
+
59
+ def load_file(name):
60
+ with open(name, "rb") as f:
61
+ content = pickle.load(f)
62
+ return content
63
+
64
+ def cal_lap_loss(in_tensor):
65
+ # [T, 68, 2]
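+ # Temporal Laplacian smoothness: each coefficient track is convolved along time with the kernel
+ # (-0.5, 1.0, -0.5) (edge frames replicated) and the mean squared response is penalized.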
66
+ t = in_tensor.shape[0]
67
+ in_tensor = in_tensor.reshape([t, -1]).permute(1,0).unsqueeze(1) # [c, 1, t]
68
+ in_tensor = torch.cat([in_tensor[:, :, 0:1], in_tensor, in_tensor[:, :, -1:]], dim=-1)
69
+ lap_kernel = torch.Tensor((-0.5, 1.0, -0.5)).reshape([1,1,3]).float().to(in_tensor.device) # [1, 1, kw]
70
+ loss_lap = 0
71
+
72
+ out_tensor = F.conv1d(in_tensor, lap_kernel)
73
+ loss_lap += torch.mean(out_tensor**2)
74
+ return loss_lap
75
+
76
+ def cal_vel_loss(ldm):
77
+ # [B, 68, 2]
78
+ vel = ldm[1:] - ldm[:-1]
79
+ return torch.mean(torch.abs(vel))
80
+
81
+ def cal_lan_loss(proj_lan, gt_lan):
82
+ # [B, 68, 2]
83
+ loss = (proj_lan - gt_lan)** 2
84
+ # use the ldm weights from deep3drecon, see deep_3drecon/deep_3drecon_models/losses.py
85
+ weights = torch.zeros_like(loss)
86
+ weights = torch.ones_like(loss)
87
+ weights[:, 36:48, :] = 3 # eye 12 points
88
+ weights[:, -8:, :] = 3 # inner lip 8 points
89
+ weights[:, 28:31, :] = 3 # nose 3 points
90
+ loss = loss * weights
91
+ return torch.mean(loss)
92
+
93
+ def cal_lan_loss_mp(proj_lan, gt_lan, mean:bool=True):
94
+ # [B, 468, 2] (mediapipe landmarks)
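+ # Weighted squared error over the mediapipe landmarks: eyes x3, upper eyelids x20, lips x5;
+ # a few contour points without a BFM counterpart (unmatch_mask) are zeroed out.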
95
+ loss = (proj_lan - gt_lan).pow(2)
96
+ # loss = (proj_lan - gt_lan).abs()
97
+ unmatch_mask = [ 93, 127, 132, 234, 323, 356, 361, 454]
98
+ upper_eye = [161,160,159,158,157] + [388,387,386,385,384]
99
+ eye = [33,246,161,160,159,158,157,173,133,155,154,153,145,144,163,7] + [263,466,388,387,386,385,384,398,362,382,381,380,374,373,390,249]
100
+ inner_lip = [78,191,80,81,82,13,312,311,310,415,308,324,318,402,317,14,87,178,88,95]
101
+ outer_lip = [61,185,40,39,37,0,267,269,270,409,291,375,321,405,314,17,84,181,91,146]
102
+ weights = torch.ones_like(loss)
103
+ weights[:, eye] = 3
104
+ weights[:, upper_eye] = 20
105
+ weights[:, inner_lip] = 5
106
+ weights[:, outer_lip] = 5
107
+ weights[:, unmatch_mask] = 0
108
+ loss = loss * weights
109
+ if mean:
110
+ loss = torch.mean(loss)
111
+ return loss
112
+
113
+ def cal_acceleration_loss(trans):
114
+ vel = trans[1:] - trans[:-1]
115
+ acc = vel[1:] - vel[:-1]
116
+ return torch.mean(torch.abs(acc))
117
+
118
+ def cal_acceleration_ldm_loss(ldm):
119
+ # [B, 68, 2]
120
+ vel = ldm[1:] - ldm[:-1]
121
+ acc = vel[1:] - vel[:-1]
122
+ lip_weight = 0.25 # we don't want to smooth the lips too much
123
+ acc[48:68] *= lip_weight
124
+ return torch.mean(torch.abs(acc))
125
+
126
+ def set_requires_grad(tensor_list):
127
+ for tensor in tensor_list:
128
+ tensor.requires_grad = True
129
+
130
+ @torch.enable_grad()
131
+ def fit_3dmm_for_a_video(
132
+ video_name,
133
+ nerf=False, # use the file name convention for GeneFace++
134
+ id_mode='global',
135
+ debug=False,
136
+ keypoint_mode='mediapipe',
137
+ large_yaw_threshold=9999999.9,
138
+ save=True
139
+ ) -> bool: # True: good, False: bad
140
+ assert video_name.endswith(".mp4"), "this function only support video as input"
141
+ if id_mode == 'global':
142
+ LAMBDA_REG_ID = 0.2
143
+ LAMBDA_REG_EXP = 0.6
144
+ LAMBDA_REG_LAP = 1.0
145
+ LAMBDA_REG_VEL_ID = 0.0 # laplacian is all you need for temporal consistency
146
+ LAMBDA_REG_VEL_EXP = 0.0 # laplacian is all you need for temporal consistency
147
+ else:
148
+ LAMBDA_REG_ID = 0.3
149
+ LAMBDA_REG_EXP = 0.05
150
+ LAMBDA_REG_LAP = 1.0
151
+ LAMBDA_REG_VEL_ID = 0.0 # laplacian is all you need for temporal consistency
152
+ LAMBDA_REG_VEL_EXP = 0.0 # laplacian is all you need for temporal consistency
153
+
154
+ frames = read_video_to_frames(video_name) # [T, H, W, 3]
155
+ img_h, img_w = frames.shape[1], frames.shape[2]
156
+ assert img_h == img_w
157
+ num_frames = len(frames)
158
+
159
+ if nerf: # single video
160
+ lm_name = video_name.replace("/raw/", "/processed/").replace(".mp4","/lms_2d.npy")
161
+ else:
162
+ lm_name = video_name.replace("/video/", "/lms_2d/").replace(".mp4", "_lms.npy")
163
+
164
+ if os.path.exists(lm_name):
165
+ lms = np.load(lm_name)
166
+ else:
167
+ print(f"lms_2d file not found, try to extract it from video... {lm_name}")
168
+ try:
169
+ landmarker = MediapipeLandmarker()
170
+ img_lm478, vid_lm478 = landmarker.extract_lm478_from_frames(frames, anti_smooth_factor=20)
171
+ lms = landmarker.combine_vid_img_lm478_to_lm478(img_lm478, vid_lm478)
172
+ except Exception as e:
173
+ print(e)
174
+ return False
175
+ if lms is None:
176
+ print(f"get None lms_2d, please check whether each frame has one head, exiting... {lm_name}")
177
+ return False
178
+ lms = lms[:, :468, :]
179
+ lms = torch.FloatTensor(lms).cuda()
180
+ lms[..., 1] = img_h - lms[..., 1] # flip the height axis
181
+
182
+ if keypoint_mode == 'mediapipe':
183
+ # default
184
+ cal_lan_loss_fn = cal_lan_loss_mp
185
+ if nerf: # single video
186
+ out_name = video_name.replace("/raw/", "/processed/").replace(".mp4", "/coeff_fit_mp.npy")
187
+ else:
188
+ out_name = video_name.replace("/video/", "/coeff_fit_mp/").replace(".mp4", "_coeff_fit_mp.npy")
189
+ else:
190
+ # lm68 is less accurate than mp
191
+ cal_lan_loss_fn = cal_lan_loss
192
+ if nerf: # single video
193
+ out_name = video_name.replace("/raw/", "/processed/").replace(".mp4", "_coeff_fit_lm68.npy")
194
+ else:
195
+ out_name = video_name.replace("/video/", "/coeff_fit_lm68/").replace(".mp4", "_coeff_fit_lm68.npy")
196
+ try:
197
+ os.makedirs(os.path.dirname(out_name), exist_ok=True)
198
+ except:
199
+ pass
200
+
201
+ id_dim, exp_dim = 80, 64
202
+ sel_ids = np.arange(0, num_frames, 40)
203
+
204
+ h = w = face_model.center * 2
205
+ img_scale_factor = img_h / h
206
+ lms /= img_scale_factor # rescale lms into [0,224]
207
+
208
+ if id_mode == 'global':
209
+ # default choice by GeneFace++ and later works
210
+ id_para = lms.new_zeros((1, id_dim), requires_grad=True)
211
+ elif id_mode == 'finegrained':
212
+ # legacy choice by GeneFace1 (ICLR 2023)
213
+ id_para = lms.new_zeros((num_frames, id_dim), requires_grad=True)
214
+ else: raise NotImplementedError(f"id mode {id_mode} not supported! we only support global or finegrained.")
215
+ exp_para = lms.new_zeros((num_frames, exp_dim), requires_grad=True)
216
+ euler_angle = lms.new_zeros((num_frames, 3), requires_grad=True)
217
+ trans = lms.new_zeros((num_frames, 3), requires_grad=True)
218
+
219
+ set_requires_grad([id_para, exp_para, euler_angle, trans])
220
+
221
+ optimizer_idexp = torch.optim.Adam([id_para, exp_para], lr=.1)
222
+ optimizer_frame = torch.optim.Adam([euler_angle, trans], lr=.1)
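+ # Fitting runs in three stages: (1) 200 steps optimizing only euler/trans (pose warm-up),
+ # (2) 200 steps jointly refining id/exp/pose over all frames, (3) fine fitting in 50-frame batches
+ # with a fresh Adam optimizer per batch.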
223
+
224
+ # initialize the other parameters; first optimize only euler and trans
225
+ for _ in range(200):
226
+ if id_mode == 'global':
227
+ proj_geo = face_model.compute_for_landmark_fit(
228
+ id_para.expand((num_frames, id_dim)), exp_para, euler_angle, trans)
229
+ else:
230
+ proj_geo = face_model.compute_for_landmark_fit(
231
+ id_para, exp_para, euler_angle, trans)
232
+ loss_lan = cal_lan_loss_fn(proj_geo[:, :, :2], lms.detach())
233
+ loss = loss_lan
234
+ optimizer_frame.zero_grad()
235
+ loss.backward()
236
+ optimizer_frame.step()
237
+
238
+ # print(f"loss_lan: {loss_lan.item():.2f}, euler_abs_mean: {euler_angle.abs().mean().item():.4f}, euler_std: {euler_angle.std().item():.4f}, euler_min: {euler_angle.min().item():.4f}, euler_max: {euler_angle.max().item():.4f}")
239
+ # print(f"trans_z_mean: {trans[...,2].mean().item():.4f}, trans_z_std: {trans[...,2].std().item():.4f}, trans_min: {trans[...,2].min().item():.4f}, trans_max: {trans[...,2].max().item():.4f}")
240
+
241
+ for param_group in optimizer_frame.param_groups:
242
+ param_group['lr'] = 0.1
243
+
244
+ # "jointly roughly training id exp euler trans"
245
+ for _ in range(200):
246
+ ret = {}
247
+ if id_mode == 'global':
248
+ proj_geo = face_model.compute_for_landmark_fit(
249
+ id_para.expand((num_frames, id_dim)), exp_para, euler_angle, trans, ret)
250
+ else:
251
+ proj_geo = face_model.compute_for_landmark_fit(
252
+ id_para, exp_para, euler_angle, trans, ret)
253
+ loss_lan = cal_lan_loss_fn(
254
+ proj_geo[:, :, :2], lms.detach())
255
+ # loss_lap = cal_lap_loss(proj_geo)
256
+ # the laplacian term has little effect on euler, but greatly improves trans
257
+ loss_lap = cal_lap_loss(id_para) + cal_lap_loss(exp_para) + cal_lap_loss(euler_angle) * 0.3 + cal_lap_loss(trans) * 0.3
258
+
259
+ loss_regid = torch.mean(id_para*id_para) # regularization
260
+ loss_regexp = torch.mean(exp_para * exp_para)
261
+
262
+ loss_vel_id = cal_vel_loss(id_para)
263
+ loss_vel_exp = cal_vel_loss(exp_para)
264
+ loss = loss_lan + loss_regid * LAMBDA_REG_ID + loss_regexp * LAMBDA_REG_EXP + loss_vel_id * LAMBDA_REG_VEL_ID + loss_vel_exp * LAMBDA_REG_VEL_EXP + loss_lap * LAMBDA_REG_LAP
265
+ optimizer_idexp.zero_grad()
266
+ optimizer_frame.zero_grad()
267
+ loss.backward()
268
+ optimizer_idexp.step()
269
+ optimizer_frame.step()
270
+
271
+ # print(f"loss_lan: {loss_lan.item():.2f}, loss_reg_id: {loss_regid.item():.2f},loss_reg_exp: {loss_regexp.item():.2f},")
272
+ # print(f"euler_abs_mean: {euler_angle.abs().mean().item():.4f}, euler_std: {euler_angle.std().item():.4f}, euler_min: {euler_angle.min().item():.4f}, euler_max: {euler_angle.max().item():.4f}")
273
+ # print(f"trans_z_mean: {trans[...,2].mean().item():.4f}, trans_z_std: {trans[...,2].std().item():.4f}, trans_min: {trans[...,2].min().item():.4f}, trans_max: {trans[...,2].max().item():.4f}")
274
+
275
+ # start fine training, initialize from the roughly trained results
276
+ if id_mode == 'global':
277
+ id_para_ = lms.new_zeros((1, id_dim), requires_grad=False)
278
+ else:
279
+ id_para_ = lms.new_zeros((num_frames, id_dim), requires_grad=True)
280
+ id_para_.data = id_para.data.clone()
281
+ id_para = id_para_
282
+ exp_para_ = lms.new_zeros((num_frames, exp_dim), requires_grad=True)
283
+ exp_para_.data = exp_para.data.clone()
284
+ exp_para = exp_para_
285
+ euler_angle_ = lms.new_zeros((num_frames, 3), requires_grad=True)
286
+ euler_angle_.data = euler_angle.data.clone()
287
+ euler_angle = euler_angle_
288
+ trans_ = lms.new_zeros((num_frames, 3), requires_grad=True)
289
+ trans_.data = trans.data.clone()
290
+ trans = trans_
291
+
292
+ batch_size = 50
293
+ # "fine fitting the 3DMM in batches"
294
+ for i in range(int((num_frames-1)/batch_size+1)):
295
+ if (i+1)*batch_size > num_frames:
296
+ start_n = num_frames-batch_size
297
+ sel_ids = np.arange(max(num_frames-batch_size,0), num_frames)
298
+ else:
299
+ start_n = i*batch_size
300
+ sel_ids = np.arange(i*batch_size, i*batch_size+batch_size)
301
+ sel_lms = lms[sel_ids]
302
+
303
+ if id_mode == 'global':
304
+ sel_id_para = id_para.expand((sel_ids.shape[0], id_dim))
305
+ else:
306
+ sel_id_para = id_para.new_zeros((batch_size, id_dim), requires_grad=True)
307
+ sel_id_para.data = id_para[sel_ids].clone()
308
+ sel_exp_para = exp_para.new_zeros(
309
+ (batch_size, exp_dim), requires_grad=True)
310
+ sel_exp_para.data = exp_para[sel_ids].clone()
311
+ sel_euler_angle = euler_angle.new_zeros(
312
+ (batch_size, 3), requires_grad=True)
313
+ sel_euler_angle.data = euler_angle[sel_ids].clone()
314
+ sel_trans = trans.new_zeros((batch_size, 3), requires_grad=True)
315
+ sel_trans.data = trans[sel_ids].clone()
316
+
317
+ if id_mode == 'global':
318
+ set_requires_grad([sel_exp_para, sel_euler_angle, sel_trans])
319
+ optimizer_cur_batch = torch.optim.Adam(
320
+ [sel_exp_para, sel_euler_angle, sel_trans], lr=0.005)
321
+ else:
322
+ set_requires_grad([sel_id_para, sel_exp_para, sel_euler_angle, sel_trans])
323
+ optimizer_cur_batch = torch.optim.Adam(
324
+ [sel_id_para, sel_exp_para, sel_euler_angle, sel_trans], lr=0.005)
325
+
326
+ for j in range(50):
327
+ ret = {}
328
+ proj_geo = face_model.compute_for_landmark_fit(
329
+ sel_id_para, sel_exp_para, sel_euler_angle, sel_trans, ret)
330
+ loss_lan = cal_lan_loss_fn(
331
+ proj_geo[:, :, :2], lms[sel_ids].detach())
332
+
333
+ # loss_lap = cal_lap_loss(proj_geo)
334
+ loss_lap = cal_lap_loss(sel_id_para) + cal_lap_loss(sel_exp_para) + cal_lap_loss(sel_euler_angle) * 0.3 + cal_lap_loss(sel_trans) * 0.3
335
+ loss_vel_id = cal_vel_loss(sel_id_para)
336
+ loss_vel_exp = cal_vel_loss(sel_exp_para)
337
+ log_dict = {
338
+ 'loss_vel_id': loss_vel_id,
339
+ 'loss_vel_exp': loss_vel_exp,
340
+ 'loss_vel_euler': cal_vel_loss(sel_euler_angle),
341
+ 'loss_vel_trans': cal_vel_loss(sel_trans),
342
+ }
343
+ loss_regid = torch.mean(sel_id_para*sel_id_para) # regularization
344
+ loss_regexp = torch.mean(sel_exp_para*sel_exp_para)
345
+ loss = loss_lan + loss_regid * LAMBDA_REG_ID + loss_regexp * LAMBDA_REG_EXP + loss_lap * LAMBDA_REG_LAP + loss_vel_id * LAMBDA_REG_VEL_ID + loss_vel_exp * LAMBDA_REG_VEL_EXP
346
+
347
+ optimizer_cur_batch.zero_grad()
348
+ loss.backward()
349
+ optimizer_cur_batch.step()
350
+
351
+ if debug:
352
+ print(f"batch {i} | loss_lan: {loss_lan.item():.2f}, loss_reg_id: {loss_regid.item():.2f},loss_reg_exp: {loss_regexp.item():.2f},loss_lap_ldm:{loss_lap.item():.4f}")
353
+ print("|--------" + ', '.join([f"{k}: {v:.4f}" for k,v in log_dict.items()]))
354
+ if id_mode != 'global':
355
+ id_para[sel_ids].data = sel_id_para.data.clone()
356
+ exp_para[sel_ids].data = sel_exp_para.data.clone()
357
+ euler_angle[sel_ids].data = sel_euler_angle.data.clone()
358
+ trans[sel_ids].data = sel_trans.data.clone()
359
+
360
+ coeff_dict = {'id': id_para.detach().cpu().numpy(), 'exp': exp_para.detach().cpu().numpy(),
361
+ 'euler': euler_angle.detach().cpu().numpy(), 'trans': trans.detach().cpu().numpy()}
362
+
363
+ # filter data by side-view pose
364
+ # bad_yaw = False
365
+ # yaws = [] # not so accurate
366
+ # for index in range(coeff_dict["trans"].shape[0]):
367
+ # yaw = coeff_dict["euler"][index][1]
368
+ # yaw = np.abs(yaw)
369
+ # yaws.append(yaw)
370
+ # if yaw > large_yaw_threshold:
371
+ # bad_yaw = True
372
+
373
+ if debug:
374
+ import imageio
375
+ from utils.visualization.vis_cam3d.camera_pose_visualizer import CameraPoseVisualizer
376
+ from data_util.face3d_helper import Face3DHelper
377
+ from data_gen.utils.process_video.extract_blink import get_eye_area_percent
378
+ face3d_helper = Face3DHelper('deep_3drecon/BFM', keypoint_mode='mediapipe')
379
+
380
+ t = coeff_dict['exp'].shape[0]
381
+ if len(coeff_dict['id']) == 1:
382
+ coeff_dict['id'] = np.repeat(coeff_dict['id'], t, axis=0)
383
+ idexp_lm3d = face3d_helper.reconstruct_idexp_lm3d_np(coeff_dict['id'], coeff_dict['exp']).reshape([t, -1])
384
+ cano_lm3d = idexp_lm3d / 10 + face3d_helper.key_mean_shape.squeeze().reshape([1, -1]).cpu().numpy()
385
+ cano_lm3d = cano_lm3d.reshape([t, -1, 3])
386
+ WH = 512
387
+ cano_lm3d = (cano_lm3d * WH/2 + WH/2).astype(int)
388
+
389
+ with torch.no_grad():
390
+ rot = ParametricFaceModel.compute_rotation(euler_angle)
391
+ extrinsic = torch.zeros([rot.shape[0], 4, 4]).to(rot.device)
392
+ extrinsic[:, :3,:3] = rot
393
+ extrinsic[:, :3, 3] = trans # / 10
394
+ extrinsic[:, 3, 3] = 1
395
+ extrinsic = extrinsic.cpu().numpy()
396
+
397
+ xy_camera_visualizer = CameraPoseVisualizer(xlim=[extrinsic[:,0,3].min().item()-0.5,extrinsic[:,0,3].max().item()+0.5],ylim=[extrinsic[:,1,3].min().item()-0.5,extrinsic[:,1,3].max().item()+0.5], zlim=[extrinsic[:,2,3].min().item()-0.5,extrinsic[:,2,3].max().item()+0.5], view_mode='xy')
398
+ xz_camera_visualizer = CameraPoseVisualizer(xlim=[extrinsic[:,0,3].min().item()-0.5,extrinsic[:,0,3].max().item()+0.5],ylim=[extrinsic[:,1,3].min().item()-0.5,extrinsic[:,1,3].max().item()+0.5], zlim=[extrinsic[:,2,3].min().item()-0.5,extrinsic[:,2,3].max().item()+0.5], view_mode='xz')
399
+
400
+ if nerf:
401
+ debug_name = video_name.replace("/raw/", "/processed/").replace(".mp4", "/debug_fit_3dmm.mp4")
402
+ else:
403
+ debug_name = video_name.replace("/video/", "/coeff_fit_debug/").replace(".mp4", "_debug.mp4")
404
+ try:
405
+ os.makedirs(os.path.dirname(debug_name), exist_ok=True)
406
+ except: pass
407
+ writer = imageio.get_writer(debug_name, fps=25)
408
+ if id_mode == 'global':
409
+ id_para = id_para.repeat([exp_para.shape[0], 1])
410
+ proj_geo = face_model.compute_for_landmark_fit(id_para, exp_para, euler_angle, trans)
411
+ lm68s = proj_geo[:,:,:2].detach().cpu().numpy() # [T, 68,2]
412
+ lm68s = lm68s * img_scale_factor
413
+ lms = lms * img_scale_factor
414
+ lm68s[..., 1] = img_h - lm68s[..., 1] # flip the height axis
415
+ lms[..., 1] = img_h - lms[..., 1] # flip the height axis
416
+ lm68s = lm68s.astype(int)
417
+ for i in tqdm.trange(min(250, len(frames)), desc=f'rendering debug video to {debug_name}..'):
418
+ xy_cam3d_img = xy_camera_visualizer.extrinsic2pyramid(extrinsic[i], focal_len_scaled=0.25)
419
+ xy_cam3d_img = cv2.resize(xy_cam3d_img, (512,512))
420
+ xz_cam3d_img = xz_camera_visualizer.extrinsic2pyramid(extrinsic[i], focal_len_scaled=0.25)
421
+ xz_cam3d_img = cv2.resize(xz_cam3d_img, (512,512))
422
+
423
+ img = copy.deepcopy(frames[i])
424
+ img2 = copy.deepcopy(frames[i])
425
+
426
+ img = draw_axes(img, euler_angle[i,0].item(), euler_angle[i,1].item(), euler_angle[i,2].item(), lm68s[i][4][0].item(), lm68s[i, 4][1].item(), size=50)
427
+
428
+ gt_lm_color = (255, 0, 0)
429
+
430
+ for lm in lm68s[i]:
431
+ img = cv2.circle(img, lm, 1, (0, 0, 255), thickness=-1) # blue
432
+ for gt_lm in lms[i]:
433
+ img2 = cv2.circle(img2, gt_lm.cpu().numpy().astype(int), 2, gt_lm_color, thickness=1)
434
+
435
+ cano_lm3d_img = np.ones([WH, WH, 3], dtype=np.uint8) * 255
436
+ for j in range(len(cano_lm3d[i])):
437
+ x, y, _ = cano_lm3d[i, j]
438
+ color = (255,0,0)
439
+ cano_lm3d_img = cv2.circle(cano_lm3d_img, center=(x,y), radius=3, color=color, thickness=-1)
440
+ cano_lm3d_img = cv2.flip(cano_lm3d_img, 0)
441
+
442
+ _, secc_img = secc_renderer(id_para[0:1], exp_para[i:i+1], euler_angle[i:i+1]*0, trans[i:i+1]*0)
443
+ secc_img = (secc_img +1)*127.5
444
+ secc_img = F.interpolate(secc_img, size=(img_h, img_w))
445
+ secc_img = secc_img.permute(0, 2,3,1).int().cpu().numpy()[0]
446
+ out_img1 = np.concatenate([img, img2, secc_img], axis=1).astype(np.uint8)
447
+ font = cv2.FONT_HERSHEY_SIMPLEX
448
+ out_img2 = np.concatenate([xy_cam3d_img, xz_cam3d_img, cano_lm3d_img], axis=1).astype(np.uint8)
449
+ out_img = np.concatenate([out_img1, out_img2], axis=0)
450
+ writer.append_data(out_img)
451
+ writer.close()
452
+
453
+ # if bad_yaw:
454
+ # print(f"Skip {video_name} due to TOO LARGE YAW")
455
+ # return False
456
+
457
+ if save:
458
+ np.save(out_name, coeff_dict, allow_pickle=True)
459
+ return coeff_dict
460
+
461
+ def out_exist_job(vid_name):
462
+ out_name = vid_name.replace("/video/", "/coeff_fit_mp/").replace(".mp4","_coeff_fit_mp.npy")
463
+ lms_name = vid_name.replace("/video/", "/lms_2d/").replace(".mp4","_lms.npy")
464
+ if os.path.exists(out_name) or not os.path.exists(lms_name):
465
+ return None
466
+ else:
467
+ return vid_name
468
+
469
+ def get_todo_vid_names(vid_names):
470
+ if len(vid_names) == 1: # single video, nerf
471
+ return vid_names
472
+ todo_vid_names = []
473
+ for i, res in multiprocess_run_tqdm(out_exist_job, vid_names, num_workers=16):
474
+ if res is not None:
475
+ todo_vid_names.append(res)
476
+ return todo_vid_names
477
+
478
+
479
+ if __name__ == '__main__':
480
+ import argparse, glob, tqdm
481
+ parser = argparse.ArgumentParser()
482
+ # parser.add_argument("--vid_dir", default='/home/tiger/datasets/raw/CelebV-HQ/video')
483
+ parser.add_argument("--vid_dir", default='data/raw/videos/May_10s.mp4')
484
+ parser.add_argument("--ds_name", default='nerf') # 'nerf' | 'CelebV-HQ' | 'TH1KH_512' | etc
485
+ parser.add_argument("--seed", default=0, type=int)
486
+ parser.add_argument("--process_id", default=0, type=int)
487
+ parser.add_argument("--total_process", default=1, type=int)
488
+ parser.add_argument("--id_mode", default='global', type=str) # global | finegrained
489
+ parser.add_argument("--keypoint_mode", default='mediapipe', type=str)
490
+ parser.add_argument("--large_yaw_threshold", default=9999999.9, type=float) # could be 0.7
491
+ parser.add_argument("--debug", action='store_true')
492
+ parser.add_argument("--reset", action='store_true')
493
+ parser.add_argument("--load_names", action="store_true")
494
+
495
+ args = parser.parse_args()
496
+ vid_dir = args.vid_dir
497
+ ds_name = args.ds_name
498
+ load_names = args.load_names
499
+
500
+ print(f"args {args}")
501
+
502
+ if ds_name.lower() == 'nerf': # process a single video
503
+ vid_names = [vid_dir]
504
+ out_names = [video_name.replace("/raw/", "/processed/").replace(".mp4","_coeff_fit_mp.npy") for video_name in vid_names]
505
+ else: # process the whole dataset
506
+ if ds_name in ['lrs3_trainval']:
507
+ vid_name_pattern = os.path.join(vid_dir, "*/*.mp4")
508
+ elif ds_name in ['TH1KH_512', 'CelebV-HQ']:
509
+ vid_name_pattern = os.path.join(vid_dir, "*.mp4")
510
+ elif ds_name in ['lrs2', 'lrs3', 'voxceleb2', 'CMLR']:
511
+ vid_name_pattern = os.path.join(vid_dir, "*/*/*.mp4")
512
+ elif ds_name in ["RAVDESS", 'VFHQ']:
513
+ vid_name_pattern = os.path.join(vid_dir, "*/*/*/*.mp4")
514
+ else:
515
+ raise NotImplementedError()
516
+
517
+ vid_names_path = os.path.join(vid_dir, "vid_names.pkl")
518
+ if os.path.exists(vid_names_path) and load_names:
519
+ print(f"loading vid names from {vid_names_path}")
520
+ vid_names = load_file(vid_names_path)
521
+ else:
522
+ vid_names = multiprocess_glob(vid_name_pattern)
523
+ vid_names = sorted(vid_names)
524
+ print(f"saving vid names to {vid_names_path}")
525
+ save_file(vid_names_path, vid_names)
526
+ out_names = [video_name.replace("/video/", "/coeff_fit_mp/").replace(".mp4","_coeff_fit_mp.npy") for video_name in vid_names]
527
+
528
+ print(vid_names[:10])
529
+ random.seed(args.seed)
530
+ random.shuffle(vid_names)
531
+
532
+ face_model = ParametricFaceModel(bfm_folder='deep_3drecon/BFM',
533
+ camera_distance=10, focal=1015, keypoint_mode=args.keypoint_mode)
534
+ face_model.to(torch.device("cuda:0"))
535
+ secc_renderer = SECC_Renderer(512)
536
+ secc_renderer.to("cuda:0")
537
+
538
+ process_id = args.process_id
539
+ total_process = args.total_process
540
+ if total_process > 1:
541
+ assert process_id <= total_process -1
542
+ num_samples_per_process = len(vid_names) // total_process
543
+ if process_id == total_process - 1: # the last process takes the remainder
544
+ vid_names = vid_names[process_id * num_samples_per_process : ]
545
+ else:
546
+ vid_names = vid_names[process_id * num_samples_per_process : (process_id+1) * num_samples_per_process]
547
+
548
+ if not args.reset:
549
+ vid_names = get_todo_vid_names(vid_names)
550
+
551
+ failed_img_names = []
552
+ for i in tqdm.trange(len(vid_names), desc=f"process {process_id}: fitting 3dmm ..."):
553
+ img_name = vid_names[i]
554
+ try:
555
+ is_person_specific_data = ds_name=='nerf'
556
+ success = fit_3dmm_for_a_video(img_name, is_person_specific_data, args.id_mode, args.debug, large_yaw_threshold=args.large_yaw_threshold)
557
+ if not success:
558
+ failed_img_names.append(img_name)
559
+ except Exception as e:
560
+ print(img_name, e)
561
+ failed_img_names.append(img_name)
562
+ print(f"finished {i + 1} / {len(vid_names)} = {(i + 1) / len(vid_names):.4f}, failed {len(failed_img_names)} / {i + 1} = {len(failed_img_names) / (i + 1):.4f}")
563
+ sys.stdout.flush()
564
+ print(f"all failed image names: {failed_img_names}")
565
+ print(f"All finished!")
data_gen/utils/process_video/inpaint_torso_imgs.py ADDED
@@ -0,0 +1,193 @@
1
+ import cv2
2
+ import os
3
+ import numpy as np
+ import glob
+ import tqdm
4
+ from utils.commons.multiprocess_utils import multiprocess_run_tqdm
5
+ from scipy.ndimage import binary_erosion, binary_dilation
6
+
7
+ from tasks.eg3ds.loss_utils.segment_loss.mp_segmenter import MediapipeSegmenter
8
+ seg_model = MediapipeSegmenter()
9
+
10
+ def inpaint_torso_job(video_name, idx=None, total=None):
11
+ raw_img_dir = video_name.replace(".mp4", "").replace("/video/","/gt_imgs/")
12
+ img_names = glob.glob(os.path.join(raw_img_dir, "*.jpg"))
13
+
14
+ for image_path in tqdm.tqdm(img_names):
15
+ # read ori image
16
+ ori_image = cv2.imread(image_path, cv2.IMREAD_UNCHANGED) # [H, W, 3]
17
+ segmap = seg_model._cal_seg_map(cv2.cvtColor(ori_image, cv2.COLOR_BGR2RGB))
18
+ head_part = (segmap[1] + segmap[3] + segmap[5]).astype(bool)
19
+ torso_part = (segmap[4]).astype(bool)
20
+ neck_part = (segmap[2]).astype(bool)
21
+ bg_part = segmap[0].astype(bool)
22
+ head_image = cv2.imread(image_path.replace("/gt_imgs/", "/head_imgs/"), cv2.IMREAD_UNCHANGED) # [H, W, 3]
23
+ torso_image = cv2.imread(image_path.replace("/gt_imgs/", "/torso_imgs/"), cv2.IMREAD_UNCHANGED) # [H, W, 3]
24
+ bg_image = cv2.imread(image_path.replace("/gt_imgs/", "/bg_imgs/"), cv2.IMREAD_UNCHANGED) # [H, W, 3]
25
+
26
+ # head_part = (head_image[...,0] != 0) & (head_image[...,1] != 0) & (head_image[...,2] != 0)
27
+ # torso_part = (torso_image[...,0] != 0) & (torso_image[...,1] != 0) & (torso_image[...,2] != 0)
28
+ # bg_part = (bg_image[...,0] != 0) & (bg_image[...,1] != 0) & (bg_image[...,2] != 0)
29
+
30
+ # get gt image
31
+ gt_image = ori_image.copy()
32
+ gt_image[bg_part] = bg_image[bg_part]
33
+ cv2.imwrite(image_path.replace('ori_imgs', 'gt_imgs'), gt_image)
34
+
35
+ # get torso image
36
+ torso_image = gt_image.copy() # rgb
37
+ torso_image[head_part] = 0
38
+ torso_alpha = 255 * np.ones((gt_image.shape[0], gt_image.shape[1], 1), dtype=np.uint8) # alpha
39
+
40
+ # torso part "vertical" in-painting...
41
+ L = 8 + 1
42
+ torso_coords = np.stack(np.nonzero(torso_part), axis=-1) # [M, 2]
43
+ # lexsort: sort 2D coords first by y then by x,
44
+ # ref: https://stackoverflow.com/questions/2706605/sorting-a-2d-numpy-array-by-multiple-axes
45
+ inds = np.lexsort((torso_coords[:, 0], torso_coords[:, 1]))
46
+ torso_coords = torso_coords[inds]
47
+ # choose the top pixel for each column
48
+ u, uid, ucnt = np.unique(torso_coords[:, 1], return_index=True, return_counts=True)
49
+ top_torso_coords = torso_coords[uid] # [m, 2]
50
+ # only keep top-is-head pixels
51
+ top_torso_coords_up = top_torso_coords.copy() - np.array([1, 0]) # [N, 2]
52
+ mask = head_part[tuple(top_torso_coords_up.T)]
53
+ if mask.any():
54
+ top_torso_coords = top_torso_coords[mask]
55
+ # get the color
56
+ top_torso_colors = gt_image[tuple(top_torso_coords.T)] # [m, 3]
57
+ # construct inpaint coords (vertically up, or minus in x)
58
+ inpaint_torso_coords = top_torso_coords[None].repeat(L, 0) # [L, m, 2]
59
+ inpaint_offsets = np.stack([-np.arange(L), np.zeros(L, dtype=np.int32)], axis=-1)[:, None] # [L, 1, 2]
60
+ inpaint_torso_coords += inpaint_offsets
61
+ inpaint_torso_coords = inpaint_torso_coords.reshape(-1, 2) # [Lm, 2]
62
+ inpaint_torso_colors = top_torso_colors[None].repeat(L, 0) # [L, m, 3]
63
+ darken_scaler = 0.98 ** np.arange(L).reshape(L, 1, 1) # [L, 1, 1]
64
+ inpaint_torso_colors = (inpaint_torso_colors * darken_scaler).reshape(-1, 3) # [Lm, 3]
65
+ # set color
66
+ torso_image[tuple(inpaint_torso_coords.T)] = inpaint_torso_colors
67
+
68
+ inpaint_torso_mask = np.zeros_like(torso_image[..., 0]).astype(bool)
69
+ inpaint_torso_mask[tuple(inpaint_torso_coords.T)] = True
70
+ else:
71
+ inpaint_torso_mask = None
72
+
73
+ # neck part "vertical" in-painting...
74
+ push_down = 4
75
+ L = 48 + push_down + 1
76
+
77
+ neck_part = binary_dilation(neck_part, structure=np.array([[0, 1, 0], [0, 1, 0], [0, 1, 0]], dtype=bool), iterations=3)
78
+
79
+ neck_coords = np.stack(np.nonzero(neck_part), axis=-1) # [M, 2]
80
+ # lexsort: sort 2D coords first by y then by x,
81
+ # ref: https://stackoverflow.com/questions/2706605/sorting-a-2d-numpy-array-by-multiple-axes
82
+ inds = np.lexsort((neck_coords[:, 0], neck_coords[:, 1]))
83
+ neck_coords = neck_coords[inds]
84
+ # choose the top pixel for each column
85
+ u, uid, ucnt = np.unique(neck_coords[:, 1], return_index=True, return_counts=True)
86
+ top_neck_coords = neck_coords[uid] # [m, 2]
87
+ # only keep top-is-head pixels
88
+ top_neck_coords_up = top_neck_coords.copy() - np.array([1, 0])
89
+ mask = head_part[tuple(top_neck_coords_up.T)]
90
+
91
+ top_neck_coords = top_neck_coords[mask]
92
+ # push these top down for 4 pixels to make the neck inpainting more natural...
93
+ offset_down = np.minimum(ucnt[mask] - 1, push_down)
94
+ top_neck_coords += np.stack([offset_down, np.zeros_like(offset_down)], axis=-1)
95
+ # get the color
96
+ top_neck_colors = gt_image[tuple(top_neck_coords.T)] # [m, 3]
97
+ # construct inpaint coords (vertically up, or minus in x)
98
+ inpaint_neck_coords = top_neck_coords[None].repeat(L, 0) # [L, m, 2]
99
+ inpaint_offsets = np.stack([-np.arange(L), np.zeros(L, dtype=np.int32)], axis=-1)[:, None] # [L, 1, 2]
100
+ inpaint_neck_coords += inpaint_offsets
101
+ inpaint_neck_coords = inpaint_neck_coords.reshape(-1, 2) # [Lm, 2]
102
+ inpaint_neck_colors = top_neck_colors[None].repeat(L, 0) # [L, m, 3]
103
+ darken_scaler = 0.98 ** np.arange(L).reshape(L, 1, 1) # [L, 1, 1]
104
+ inpaint_neck_colors = (inpaint_neck_colors * darken_scaler).reshape(-1, 3) # [Lm, 3]
105
+ # set color
106
+ torso_image[tuple(inpaint_neck_coords.T)] = inpaint_neck_colors
107
+
108
+ # apply blurring to the inpaint area to avoid vertical-line artifacts...
109
+ inpaint_mask = np.zeros_like(torso_image[..., 0]).astype(bool)
110
+ inpaint_mask[tuple(inpaint_neck_coords.T)] = True
111
+
112
+ blur_img = torso_image.copy()
113
+ blur_img = cv2.GaussianBlur(blur_img, (5, 5), cv2.BORDER_DEFAULT)
114
+
115
+ torso_image[inpaint_mask] = blur_img[inpaint_mask]
116
+
117
+ # set mask
118
+ mask = (neck_part | torso_part | inpaint_mask)
119
+ if inpaint_torso_mask is not None:
120
+ mask = mask | inpaint_torso_mask
121
+ torso_image[~mask] = 0
122
+ torso_alpha[~mask] = 0
123
+
124
+ cv2.imwrite("0.png", np.concatenate([torso_image, torso_alpha], axis=-1))
125
+
126
+ print(f'[INFO] ===== extracted torso and gt images =====')
127
+
128
+
129
+ def out_exist_job(vid_name):
130
+ out_dir1 = vid_name.replace("/video/", "/inpaint_torso_imgs/").replace(".mp4","")
131
+ out_dir2 = vid_name.replace("/video/", "/inpaint_torso_with_bg_imgs/").replace(".mp4","")
132
+ out_dir3 = vid_name.replace("/video/", "/torso_imgs/").replace(".mp4","")
133
+ out_dir4 = vid_name.replace("/video/", "/torso_with_bg_imgs/").replace(".mp4","")
134
+
135
+ if os.path.exists(out_dir1) and os.path.exists(out_dir2) and os.path.exists(out_dir3) and os.path.exists(out_dir4):
136
+ num_frames = len(os.listdir(out_dir1))
137
+ if len(os.listdir(out_dir1)) == num_frames and len(os.listdir(out_dir2)) == num_frames and len(os.listdir(out_dir3)) == num_frames and len(os.listdir(out_dir4)) == num_frames:
138
+ return None
139
+ else:
140
+ return vid_name
141
+ else:
142
+ return vid_name
143
+
144
+ def get_todo_vid_names(vid_names):
145
+ todo_vid_names = []
146
+ for i, res in multiprocess_run_tqdm(out_exist_job, vid_names, num_workers=16):
147
+ if res is not None:
148
+ todo_vid_names.append(res)
149
+ return todo_vid_names
150
+
151
+ if __name__ == '__main__':
152
+ import argparse, glob, tqdm, random
153
+ parser = argparse.ArgumentParser()
154
+ parser.add_argument("--vid_dir", default='/home/tiger/datasets/raw/CelebV-HQ/video')
155
+ parser.add_argument("--ds_name", default='CelebV-HQ')
156
+ parser.add_argument("--num_workers", default=48, type=int)
157
+ parser.add_argument("--seed", default=0, type=int)
158
+ parser.add_argument("--process_id", default=0, type=int)
159
+ parser.add_argument("--total_process", default=1, type=int)
160
+ parser.add_argument("--reset", action='store_true')
161
+
162
+ inpaint_torso_job('/home/tiger/datasets/raw/CelebV-HQ/video/dgdEr-mXQT4_8.mp4')
163
+ # args = parser.parse_args()
164
+ # vid_dir = args.vid_dir
165
+ # ds_name = args.ds_name
166
+ # if ds_name in ['lrs3_trainval']:
167
+ # mp4_name_pattern = os.path.join(vid_dir, "*/*.mp4")
168
+ # if ds_name in ['TH1KH_512', 'CelebV-HQ']:
169
+ # vid_names = glob.glob(os.path.join(vid_dir, "*.mp4"))
170
+ # elif ds_name in ['lrs2', 'lrs3', 'voxceleb2']:
171
+ # vid_name_pattern = os.path.join(vid_dir, "*/*/*.mp4")
172
+ # vid_names = glob.glob(vid_name_pattern)
173
+ # vid_names = sorted(vid_names)
174
+ # random.seed(args.seed)
175
+ # random.shuffle(vid_names)
176
+
177
+ # process_id = args.process_id
178
+ # total_process = args.total_process
179
+ # if total_process > 1:
180
+ # assert process_id <= total_process -1
181
+ # num_samples_per_process = len(vid_names) // total_process
182
+ # if process_id == total_process:
183
+ # vid_names = vid_names[process_id * num_samples_per_process : ]
184
+ # else:
185
+ # vid_names = vid_names[process_id * num_samples_per_process : (process_id+1) * num_samples_per_process]
186
+
187
+ # if not args.reset:
188
+ # vid_names = get_todo_vid_names(vid_names)
189
+ # print(f"todo videos number: {len(vid_names)}")
190
+
191
+ # fn_args = [(vid_name,i,len(vid_names)) for i, vid_name in enumerate(vid_names)]
192
+ # for vid_name in multiprocess_run_tqdm(inpaint_torso_job ,fn_args, desc=f"Root process {args.process_id}: extracting segment images", num_workers=args.num_workers):
193
+ # pass
data_gen/utils/process_video/resample_video_to_25fps_resize_to_512.py ADDED
@@ -0,0 +1,87 @@
1
+ import os, glob
2
+ import cv2
3
+ from utils.commons.os_utils import multiprocess_glob
4
+ from utils.commons.multiprocess_utils import multiprocess_run_tqdm
5
+
6
+ def get_video_infos(video_path):
7
+ vid_cap = cv2.VideoCapture(video_path)
8
+ height = int(vid_cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
9
+ width = int(vid_cap.get(cv2.CAP_PROP_FRAME_WIDTH))
10
+ fps = vid_cap.get(cv2.CAP_PROP_FPS)
11
+ total_frames = int(vid_cap.get(cv2.CAP_PROP_FRAME_COUNT))
12
+ return {'height': height, 'width': width, 'fps': fps, 'total_frames':total_frames}
13
+
14
+ def extract_img_job(video_name:str):
15
+ out_path = video_name.replace("/video_raw/","/video/",1)
16
+ os.makedirs(os.path.dirname(out_path), exist_ok=True)
17
+ ffmpeg_path = "/usr/bin/ffmpeg"
18
+ vid_info = get_video_infos(video_name)
19
+ assert vid_info['width'] == vid_info['height']
20
+ cmd = f'{ffmpeg_path} -i {video_name} -vf fps={25},scale=w=512:h=512 -q:v 1 -c:v libx264 -pix_fmt yuv420p -b:v 2000k -v quiet -y {out_path}'
21
+ os.system(cmd)
22
+
23
+ def extract_img_job_crop(video_name:str):
24
+ out_path = video_name.replace("/video_raw/","/video/",1)
25
+ os.makedirs(os.path.dirname(out_path), exist_ok=True)
26
+ ffmpeg_path = "/usr/bin/ffmpeg"
27
+ vid_info = get_video_infos(video_name)
28
+ wh = min(vid_info['width'], vid_info['height'])
29
+ cmd = f'{ffmpeg_path} -i {video_name} -vf fps={25},crop={wh}:{wh},scale=w=512:h=512 -q:v 1 -c:v libx264 -pix_fmt yuv420p -b:v 2000k -v quiet -y {out_path}'
30
+ os.system(cmd)
31
+
32
+ def extract_img_job_crop_ravdess(video_name:str):
33
+ out_path = video_name.replace("/video_raw/","/video/",1)
34
+ os.makedirs(os.path.dirname(out_path), exist_ok=True)
35
+ ffmpeg_path = "/usr/bin/ffmpeg"
36
+ cmd = f'{ffmpeg_path} -i {video_name} -vf fps={25},crop=720:720,scale=w=512:h=512 -q:v 1 -c:v libx264 -pix_fmt yuv420p -b:v 2000k -v quiet -y {out_path}'
37
+ os.system(cmd)
38
+
39
+ if __name__ == '__main__':
40
+ import argparse, glob, tqdm, random
41
+ parser = argparse.ArgumentParser()
42
+ parser.add_argument("--vid_dir", default='/home/tiger/datasets/raw/CelebV-HQ/video_raw/')
43
+ parser.add_argument("--ds_name", default='CelebV-HQ')
44
+ parser.add_argument("--num_workers", default=32, type=int)
45
+ parser.add_argument("--process_id", default=0, type=int)
46
+ parser.add_argument("--total_process", default=1, type=int)
47
+ args = parser.parse_args()
48
+ print(f"args {args}")
49
+
50
+ vid_dir = args.vid_dir
51
+ ds_name = args.ds_name
52
+ if ds_name in ['lrs3_trainval']:
53
+ vid_names = multiprocess_glob(os.path.join(vid_dir, "*/*.mp4"))
54
+ elif ds_name in ['TH1KH_512', 'CelebV-HQ']:
55
+ vid_names = multiprocess_glob(os.path.join(vid_dir, "*.mp4"))
56
+ elif ds_name in ['lrs2', 'lrs3', 'voxceleb2', 'CMLR']:
57
+ vid_name_pattern = os.path.join(vid_dir, "*/*/*.mp4")
58
+ vid_names = multiprocess_glob(vid_name_pattern)
59
+ elif ds_name in ["RAVDESS", 'VFHQ']:
60
+ vid_name_pattern = os.path.join(vid_dir, "*/*/*/*.mp4")
61
+ vid_names = multiprocess_glob(vid_name_pattern)
62
+ else:
63
+ raise NotImplementedError()
64
+ vid_names = sorted(vid_names)
65
+ print(f"total video number : {len(vid_names)}")
66
+ print(f"first {vid_names[0]} last {vid_names[-1]}")
67
+ # exit()
68
+ process_id = args.process_id
69
+ total_process = args.total_process
70
+ if total_process > 1:
71
+ assert process_id <= total_process -1
72
+ num_samples_per_process = len(vid_names) // total_process
73
+ if process_id == total_process - 1: # the last process takes the remainder
74
+ vid_names = vid_names[process_id * num_samples_per_process : ]
75
+ else:
76
+ vid_names = vid_names[process_id * num_samples_per_process : (process_id+1) * num_samples_per_process]
77
+
78
+ if ds_name == "RAVDESS":
79
+ for i, res in multiprocess_run_tqdm(extract_img_job_crop_ravdess, vid_names, num_workers=args.num_workers, desc="resampling videos"):
80
+ pass
81
+ elif ds_name == "CMLR":
82
+ for i, res in multiprocess_run_tqdm(extract_img_job_crop, vid_names, num_workers=args.num_workers, desc="resampling videos"):
83
+ pass
84
+ else:
85
+ for i, res in multiprocess_run_tqdm(extract_img_job, vid_names, num_workers=args.num_workers, desc="resampling videos"):
86
+ pass
87
+
data_gen/utils/process_video/split_video_to_imgs.py ADDED
@@ -0,0 +1,53 @@
1
+ import os, glob
2
+ from utils.commons.multiprocess_utils import multiprocess_run_tqdm
3
+
4
+ from data_gen.utils.path_converter import PathConverter, pc
5
+
6
+ # mp4_names = glob.glob("/home/tiger/datasets/raw/CelebV-HQ/video/*.mp4")
7
+
8
+ def extract_img_job(video_name, raw_img_dir=None):
9
+ if raw_img_dir is not None:
10
+ out_path = raw_img_dir
11
+ else:
12
+ out_path = pc.to(video_name.replace(".mp4", ""), "vid", "gt")
13
+ os.makedirs(out_path, exist_ok=True)
14
+ ffmpeg_path = "/usr/bin/ffmpeg"
15
+ cmd = f'{ffmpeg_path} -i {video_name} -vf fps={25},scale=w=512:h=512 -qmin 1 -q:v 1 -start_number 0 -v quiet {os.path.join(out_path, "%8d.jpg")}'
16
+ os.system(cmd)
17
+
18
+ if __name__ == '__main__':
19
+ import argparse, glob, tqdm, random
20
+ parser = argparse.ArgumentParser()
21
+ parser.add_argument("--vid_dir", default='/home/tiger/datasets/raw/CelebV-HQ/video')
22
+ parser.add_argument("--ds_name", default='CelebV-HQ')
23
+ parser.add_argument("--num_workers", default=64, type=int)
24
+ parser.add_argument("--process_id", default=0, type=int)
25
+ parser.add_argument("--total_process", default=1, type=int)
26
+ args = parser.parse_args()
27
+ vid_dir = args.vid_dir
28
+ ds_name = args.ds_name
29
+ if ds_name in ['lrs3_trainval']:
30
+ vid_names = glob.glob(os.path.join(vid_dir, "*/*.mp4"))
31
+ elif ds_name in ['TH1KH_512', 'CelebV-HQ']:
32
+ vid_names = glob.glob(os.path.join(vid_dir, "*.mp4"))
33
+ elif ds_name in ['lrs2', 'lrs3', 'voxceleb2']:
34
+ vid_name_pattern = os.path.join(vid_dir, "*/*/*.mp4")
35
+ vid_names = glob.glob(vid_name_pattern)
36
+ elif ds_name in ["RAVDESS", 'VFHQ']:
37
+ vid_name_pattern = os.path.join(vid_dir, "*/*/*/*.mp4")
38
+ vid_names = glob.glob(vid_name_pattern)
39
+ vid_names = sorted(vid_names)
40
+
41
+ process_id = args.process_id
42
+ total_process = args.total_process
43
+ if total_process > 1:
44
+ assert process_id <= total_process -1
45
+ num_samples_per_process = len(vid_names) // total_process
46
+ if process_id == total_process - 1: # the last process takes the remainder
47
+ vid_names = vid_names[process_id * num_samples_per_process : ]
48
+ else:
49
+ vid_names = vid_names[process_id * num_samples_per_process : (process_id+1) * num_samples_per_process]
50
+
51
+ for i, res in multiprocess_run_tqdm(extract_img_job, vid_names, num_workers=args.num_workers, desc="extracting images"):
52
+ pass
53
+
data_util/face3d_helper.py ADDED
@@ -0,0 +1,309 @@
1
+ import os
2
+ import numpy as np
3
+ import torch
4
+ import torch.nn as nn
5
+ from scipy.io import loadmat
6
+
7
+ from deep_3drecon.deep_3drecon_models.bfm import perspective_projection
8
+
9
+
10
+ class Face3DHelper(nn.Module):
11
+ def __init__(self, bfm_dir='deep_3drecon/BFM', keypoint_mode='lm68', use_gpu=True):
12
+ super().__init__()
13
+ self.keypoint_mode = keypoint_mode # lm68 | mediapipe
14
+ self.bfm_dir = bfm_dir
15
+ self.load_3dmm()
16
+ if use_gpu: self.to("cuda")
17
+
18
+ def load_3dmm(self):
19
+ model = loadmat(os.path.join(self.bfm_dir, "BFM_model_front.mat"))
20
+ self.register_buffer('mean_shape',torch.from_numpy(model['meanshape'].transpose()).float()) # mean face shape. [3*N, 1], N=35709, xyz=3, ==> 3*N=107127
21
+ mean_shape = self.mean_shape.reshape([-1, 3])
22
+ # re-center
23
+ mean_shape = mean_shape - torch.mean(mean_shape, dim=0, keepdims=True)
24
+ self.mean_shape = mean_shape.reshape([-1, 1])
25
+ self.register_buffer('id_base',torch.from_numpy(model['idBase']).float()) # identity basis. [3*N,80], we have 80 eigen faces for identity
26
+ self.register_buffer('exp_base',torch.from_numpy(model['exBase']).float()) # expression basis. [3*N,64], we have 64 eigen faces for expression
27
+
28
+ self.register_buffer('mean_texure',torch.from_numpy(model['meantex'].transpose()).float()) # mean face texture. [3*N,1] (0-255)
29
+ self.register_buffer('tex_base',torch.from_numpy(model['texBase']).float()) # texture basis. [3*N,80], rgb=3
30
+
31
+ self.register_buffer('point_buf',torch.from_numpy(model['point_buf']).float()) # triangle indices for each vertex that lies in. starts from 1. [N,8] (1-F)
32
+ self.register_buffer('face_buf',torch.from_numpy(model['tri']).float()) # vertex indices in each triangle. starts from 1. [F,3] (1-N)
33
+ if self.keypoint_mode == 'mediapipe':
34
+ self.register_buffer('key_points', torch.from_numpy(np.load("deep_3drecon/BFM/index_mp468_from_mesh35709.npy").astype(np.int64)))
35
+ unmatch_mask = self.key_points < 0
36
+ self.key_points[unmatch_mask] = 0
37
+ else:
38
+ self.register_buffer('key_points',torch.from_numpy(model['keypoints'].squeeze().astype(np.int_)).long()) # vertex indices of 68 facial landmarks. starts from 1. [68,1]
39
+
40
+
41
+ self.register_buffer('key_mean_shape',self.mean_shape.reshape([-1,3])[self.key_points,:])
42
+ self.register_buffer('key_id_base', self.id_base.reshape([-1,3,80])[self.key_points, :, :].reshape([-1,80]))
43
+ self.register_buffer('key_exp_base', self.exp_base.reshape([-1,3,64])[self.key_points, :, :].reshape([-1,64]))
44
+ self.key_id_base_np = self.key_id_base.cpu().numpy()
45
+ self.key_exp_base_np = self.key_exp_base.cpu().numpy()
46
+
47
+ self.register_buffer('persc_proj', torch.tensor(perspective_projection(focal=1015, center=112)))
48
+ def split_coeff(self, coeff):
49
+ """
50
+ coeff: Tensor[B, T, c=257] or [T, c=257]
51
+ """
52
+ ret_dict = {
53
+ 'identity': coeff[..., :80], # identity, [b, t, c=80]
54
+ 'expression': coeff[..., 80:144], # expression, [b, t, c=64]
55
+ 'texture': coeff[..., 144:224], # texture, [b, t, c=80]
56
+ 'euler': coeff[..., 224:227], # euler angles for pose, [b, t, c=3]
57
+ 'translation': coeff[..., 254:257], # translation, [b, t, c=3]
58
+ 'gamma': coeff[..., 227:254] # lighting, [b, t, c=27]
59
+ }
60
+ return ret_dict
61
+
62
+ def reconstruct_face_mesh(self, id_coeff, exp_coeff):
63
+ """
64
+ Generate a pose-independent 3D face mesh!
65
+ id_coeff: Tensor[T, c=80]
66
+ exp_coeff: Tensor[T, c=64]
67
+ """
68
+ id_coeff = id_coeff.to(self.key_id_base.device)
69
+ exp_coeff = exp_coeff.to(self.key_id_base.device)
70
+ mean_face = self.mean_shape.squeeze().reshape([1, -1]) # [3N, 1] ==> [1, 3N]
71
+ id_base, exp_base = self.id_base, self.exp_base # [3*N, C]
72
+ identity_diff_face = torch.matmul(id_coeff, id_base.transpose(0,1)) # [t,c],[c,3N] ==> [t,3N]
73
+ expression_diff_face = torch.matmul(exp_coeff, exp_base.transpose(0,1)) # [t,c],[c,3N] ==> [t,3N]
74
+
75
+ face = mean_face + identity_diff_face + expression_diff_face # [t,3N]
76
+ face = face.reshape([face.shape[0], -1, 3]) # [t,N,3]
77
+ # re-centering the face with mean_xyz, so the face will be in [-1, 1]
78
+ # mean_xyz = self.mean_shape.squeeze().reshape([-1,3]).mean(dim=0) # [1, 3]
79
+ # face_mesh = face - mean_xyz.unsqueeze(0) # [t,N,3]
80
+ return face
81
+
82
+ def reconstruct_cano_lm3d(self, id_coeff, exp_coeff):
83
+ """
84
+ Generate 3D landmark with keypoint base!
85
+ id_coeff: Tensor[T, c=80]
86
+ exp_coeff: Tensor[T, c=64]
87
+ """
88
+ id_coeff = id_coeff.to(self.key_id_base.device)
89
+ exp_coeff = exp_coeff.to(self.key_id_base.device)
90
+ mean_face = self.key_mean_shape.squeeze().reshape([1, -1]) # [3*68, 1] ==> [1, 3*68]
91
+ id_base, exp_base = self.key_id_base, self.key_exp_base # [3*68, C]
92
+ identity_diff_face = torch.matmul(id_coeff, id_base.transpose(0,1)) # [t,c],[c,3*68] ==> [t,3*68]
93
+ expression_diff_face = torch.matmul(exp_coeff, exp_base.transpose(0,1)) # [t,c],[c,3*68] ==> [t,3*68]
94
+
95
+ face = mean_face + identity_diff_face + expression_diff_face # [t,3N]
96
+ face = face.reshape([face.shape[0], -1, 3]) # [t,N,3]
97
+ # re-centering the face with mean_xyz, so the face will be in [-1, 1]
98
+ # mean_xyz = self.key_mean_shape.squeeze().reshape([-1,3]).mean(dim=0) # [1, 3]
99
+ # lm3d = face - mean_xyz.unsqueeze(0) # [t,N,3]
100
+ return face
101
+
102
+ def reconstruct_lm3d(self, id_coeff, exp_coeff, euler, trans, to_camera=True):
103
+ """
104
+ Generate 3D landmark with keypoint base!
105
+ id_coeff: Tensor[T, c=80]
106
+ exp_coeff: Tensor[T, c=64]
107
+ """
108
+ id_coeff = id_coeff.to(self.key_id_base.device)
109
+ exp_coeff = exp_coeff.to(self.key_id_base.device)
110
+ mean_face = self.key_mean_shape.squeeze().reshape([1, -1]) # [3*68, 1] ==> [1, 3*68]
111
+ id_base, exp_base = self.key_id_base, self.key_exp_base # [3*68, C]
112
+ identity_diff_face = torch.matmul(id_coeff, id_base.transpose(0,1)) # [t,c],[c,3*68] ==> [t,3*68]
113
+ expression_diff_face = torch.matmul(exp_coeff, exp_base.transpose(0,1)) # [t,c],[c,3*68] ==> [t,3*68]
114
+
115
+ face = mean_face + identity_diff_face + expression_diff_face # [t,3N]
116
+ face = face.reshape([face.shape[0], -1, 3]) # [t,N,3]
117
+ # re-centering the face with mean_xyz, so the face will be in [-1, 1]
118
+ rot = self.compute_rotation(euler)
119
+ # transform
120
+ lm3d = face @ rot + trans.unsqueeze(1) # [t, N, 3]
121
+ # to camera
122
+ if to_camera:
123
+ lm3d[...,-1] = 10 - lm3d[...,-1]
124
+ return lm3d
125
+
126
+ def reconstruct_lm2d_nerf(self, id_coeff, exp_coeff, euler, trans):
127
+ lm2d = self.reconstruct_lm2d(id_coeff, exp_coeff, euler, trans, to_camera=False)
128
+ lm2d[..., 0] = 1 - lm2d[..., 0]
129
+ lm2d[..., 1] = 1 - lm2d[..., 1]
130
+ return lm2d
131
+
132
+ def reconstruct_lm2d(self, id_coeff, exp_coeff, euler, trans, to_camera=True):
133
+ """
134
+ Generate 3D landmark with keypoint base!
135
+ id_coeff: Tensor[T, c=80]
136
+ exp_coeff: Tensor[T, c=64]
137
+ """
138
+ is_btc_flag = True if id_coeff.ndim == 3 else False
139
+ if is_btc_flag:
140
+ b,t,_ = id_coeff.shape
141
+ id_coeff = id_coeff.reshape([b*t,-1])
142
+ exp_coeff = exp_coeff.reshape([b*t,-1])
143
+ euler = euler.reshape([b*t,-1])
144
+ trans = trans.reshape([b*t,-1])
145
+ id_coeff = id_coeff.to(self.key_id_base.device)
146
+ exp_coeff = exp_coeff.to(self.key_id_base.device)
147
+ mean_face = self.key_mean_shape.squeeze().reshape([1, -1]) # [3*68, 1] ==> [1, 3*68]
148
+ id_base, exp_base = self.key_id_base, self.key_exp_base # [3*68, C]
149
+ identity_diff_face = torch.matmul(id_coeff, id_base.transpose(0,1)) # [t,c],[c,3*68] ==> [t,3*68]
150
+ expression_diff_face = torch.matmul(exp_coeff, exp_base.transpose(0,1)) # [t,c],[c,3*68] ==> [t,3*68]
151
+
152
+ face = mean_face + identity_diff_face + expression_diff_face # [t,3N]
153
+ face = face.reshape([face.shape[0], -1, 3]) # [t,N,3]
154
+ # re-centering the face with mean_xyz, so the face will be in [-1, 1]
155
+ rot = self.compute_rotation(euler)
156
+ # transform
157
+ lm3d = face @ rot + trans.unsqueeze(1) # [t, N, 3]
158
+ # to camera
159
+ if to_camera:
160
+ lm3d[...,-1] = 10 - lm3d[...,-1]
161
+ # to image_plane
162
+ lm3d = lm3d @ self.persc_proj
163
+ lm2d = lm3d[..., :2] / lm3d[..., 2:]
164
+ # flip
165
+ lm2d[..., 1] = 224 - lm2d[..., 1]
166
+ lm2d /= 224
167
+ if is_btc_flag:
168
+ return lm2d.reshape([b,t,-1,2])
169
+ return lm2d
170
+
171
+ def compute_rotation(self, euler):
172
+ """
173
+ Return:
174
+ rot -- torch.tensor, size (B, 3, 3) pts @ trans_mat
175
+
176
+ Parameters:
177
+ euler -- torch.tensor, size (B, 3), radian
178
+ """
179
+
180
+ batch_size = euler.shape[0]
181
+ euler = euler.to(self.key_id_base.device)
182
+ ones = torch.ones([batch_size, 1]).to(self.key_id_base.device)
183
+ zeros = torch.zeros([batch_size, 1]).to(self.key_id_base.device)
184
+ x, y, z = euler[:, :1], euler[:, 1:2], euler[:, 2:],
185
+
186
+ rot_x = torch.cat([
187
+ ones, zeros, zeros,
188
+ zeros, torch.cos(x), -torch.sin(x),
189
+ zeros, torch.sin(x), torch.cos(x)
190
+ ], dim=1).reshape([batch_size, 3, 3])
191
+
192
+ rot_y = torch.cat([
193
+ torch.cos(y), zeros, torch.sin(y),
194
+ zeros, ones, zeros,
195
+ -torch.sin(y), zeros, torch.cos(y)
196
+ ], dim=1).reshape([batch_size, 3, 3])
197
+
198
+ rot_z = torch.cat([
199
+ torch.cos(z), -torch.sin(z), zeros,
200
+ torch.sin(z), torch.cos(z), zeros,
201
+ zeros, zeros, ones
202
+ ], dim=1).reshape([batch_size, 3, 3])
203
+
204
+ rot = rot_z @ rot_y @ rot_x
205
+ return rot.permute(0, 2, 1)
206
+
207
+ def reconstruct_idexp_lm3d(self, id_coeff, exp_coeff):
208
+ """
209
+ Generate 3D landmark with keypoint base!
210
+ id_coeff: Tensor[T, c=80]
211
+ exp_coeff: Tensor[T, c=64]
212
+ """
213
+ id_coeff = id_coeff.to(self.key_id_base.device)
214
+ exp_coeff = exp_coeff.to(self.key_id_base.device)
215
+ id_base, exp_base = self.key_id_base, self.key_exp_base # [3*68, C]
216
+ identity_diff_face = torch.matmul(id_coeff, id_base.transpose(0,1)) # [t,c],[c,3*68] ==> [t,3*68]
217
+ expression_diff_face = torch.matmul(exp_coeff, exp_base.transpose(0,1)) # [t,c],[c,3*68] ==> [t,3*68]
218
+
219
+ face = identity_diff_face + expression_diff_face # [t,3N]
220
+ face = face.reshape([face.shape[0], -1, 3]) # [t,N,3]
221
+ lm3d = face * 10
222
+ return lm3d
223
+
224
+ def reconstruct_idexp_lm3d_np(self, id_coeff, exp_coeff):
225
+ """
226
+ Generate 3D landmark with keypoint base!
227
+ id_coeff: Tensor[T, c=80]
228
+ exp_coeff: Tensor[T, c=64]
229
+ """
230
+ id_base, exp_base = self.key_id_base_np, self.key_exp_base_np # [3*68, C]
231
+ identity_diff_face = np.dot(id_coeff, id_base.T) # [t,c],[c,3*68] ==> [t,3*68]
232
+ expression_diff_face = np.dot(exp_coeff, exp_base.T) # [t,c],[c,3*68] ==> [t,3*68]
233
+
234
+ face = identity_diff_face + expression_diff_face # [t,3N]
235
+ face = face.reshape([face.shape[0], -1, 3]) # [t,N,3]
236
+ lm3d = face * 10
237
+ return lm3d
238
+
239
+ def get_eye_mouth_lm_from_lm3d(self, lm3d):
240
+ eye_lm = lm3d[:, 17:48] # [T, 31, 3]
241
+ mouth_lm = lm3d[:, 48:68] # [T, 20, 3]
242
+ return eye_lm, mouth_lm
243
+
244
+ def get_eye_mouth_lm_from_lm3d_batch(self, lm3d):
245
+ eye_lm = lm3d[:, :, 17:48] # [T, 31, 3]
246
+ mouth_lm = lm3d[:, :, 48:68] # [T, 20, 3]
247
+ return eye_lm, mouth_lm
248
+
249
+ def close_mouth_for_idexp_lm3d(self, idexp_lm3d, freeze_as_first_frame=True):
250
+ idexp_lm3d = idexp_lm3d.reshape([-1, 68,3])
251
+ num_frames = idexp_lm3d.shape[0]
252
+ eps = 0.0
253
+ # [n_landmarks=68, xyz=3]; x: left-right, y: up-down, z: depth
254
+ idexp_lm3d[:,49:54, 1] = (idexp_lm3d[:,49:54, 1] + idexp_lm3d[:,range(59,54,-1), 1])/2 + eps * 2
255
+ idexp_lm3d[:,range(59,54,-1), 1] = (idexp_lm3d[:,49:54, 1] + idexp_lm3d[:,range(59,54,-1), 1])/2 - eps * 2
256
+
257
+ idexp_lm3d[:,61:64, 1] = (idexp_lm3d[:,61:64, 1] + idexp_lm3d[:,range(67,64,-1), 1])/2 + eps
258
+ idexp_lm3d[:,range(67,64,-1), 1] = (idexp_lm3d[:,61:64, 1] + idexp_lm3d[:,range(67,64,-1), 1])/2 - eps
259
+
260
+ idexp_lm3d[:,49:54, 1] += (0.03 - idexp_lm3d[:,49:54, 1].mean(dim=1) + idexp_lm3d[:,61:64, 1].mean(dim=1)).unsqueeze(1).repeat([1,5])
261
+ idexp_lm3d[:,range(59,54,-1), 1] += (-0.03 - idexp_lm3d[:,range(59,54,-1), 1].mean(dim=1) + idexp_lm3d[:,range(67,64,-1), 1].mean(dim=1)).unsqueeze(1).repeat([1,5])
262
+
263
+ if freeze_as_first_frame:
264
+ idexp_lm3d[:, 48:68,] = idexp_lm3d[0, 48:68].unsqueeze(0).clone().repeat([num_frames, 1,1])*0
265
+ return idexp_lm3d.cpu()
266
+
267
+ def close_eyes_for_idexp_lm3d(self, idexp_lm3d):
268
+ idexp_lm3d = idexp_lm3d.reshape([-1, 68,3])
269
+ eps = 0.003
270
+ idexp_lm3d[:,37:39, 1] = (idexp_lm3d[:,37:39, 1] + idexp_lm3d[:,range(41,39,-1), 1])/2 + eps
271
+ idexp_lm3d[:,range(41,39,-1), 1] = (idexp_lm3d[:,37:39, 1] + idexp_lm3d[:,range(41,39,-1), 1])/2 - eps
272
+
273
+ idexp_lm3d[:,43:45, 1] = (idexp_lm3d[:,43:45, 1] + idexp_lm3d[:,range(47,45,-1), 1])/2 + eps
274
+ idexp_lm3d[:,range(47,45,-1), 1] = (idexp_lm3d[:,43:45, 1] + idexp_lm3d[:,range(47,45,-1), 1])/2 - eps
275
+
276
+ return idexp_lm3d
277
+
278
+ if __name__ == '__main__':
279
+ import cv2
280
+
281
+ font = cv2.FONT_HERSHEY_SIMPLEX
282
+
283
+ face_mesh_helper = Face3DHelper('deep_3drecon/BFM')
284
+ coeff_npy = 'data/coeff_fit_mp/crop_nana_003_coeff_fit_mp.npy'
285
+ coeff_dict = np.load(coeff_npy, allow_pickle=True).tolist()
286
+ lm3d = face_mesh_helper.reconstruct_lm2d(torch.tensor(coeff_dict['id']).cuda(), torch.tensor(coeff_dict['exp']).cuda(), torch.tensor(coeff_dict['euler']).cuda(), torch.tensor(coeff_dict['trans']).cuda() )
287
+
288
+ WH = 512
289
+ lm3d = (lm3d * WH).cpu().int().numpy()
290
+ eye_idx = list(range(36,48))
291
+ mouth_idx = list(range(48,68))
292
+ import imageio
293
+ debug_name = 'debug_lm3d.mp4'
294
+ writer = imageio.get_writer(debug_name, fps=25)
295
+ for i_img in range(len(lm3d)):
296
+ lm2d = lm3d[i_img ,:, :2] # [68, 2]
297
+ img = np.ones([WH, WH, 3], dtype=np.uint8) * 255
298
+ for i in range(len(lm2d)):
299
+ x, y = lm2d[i]
300
+ if i in eye_idx:
301
+ color = (0,0,255)
302
+ elif i in mouth_idx:
303
+ color = (0,255,0)
304
+ else:
305
+ color = (255,0,0)
306
+ img = cv2.circle(img, center=(x,y), radius=3, color=color, thickness=-1)
307
+ img = cv2.putText(img, f"{i}", org=(x,y), fontFace=font, fontScale=0.3, color=(255,0,0))
308
+ writer.append_data(img)
309
+ writer.close()
deep_3drecon/BFM/.gitkeep ADDED
File without changes
deep_3drecon/BFM/basel_53201.txt ADDED
The diff for this file is too large to render. See raw diff
 
deep_3drecon/BFM/index_mp468_from_mesh35709_v1.npy ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0d238a90df0c55075c9cea43dab76348421379a75c204931e34dbd2c11fb4b65
3
+ size 3872
deep_3drecon/BFM/index_mp468_from_mesh35709_v2.npy ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fe95e2bb10ac1e54804006184d7de3c5ccd0eb98a5f1bd28e00b9f3569f6ce5a
3
+ size 3872
deep_3drecon/BFM/index_mp468_from_mesh35709_v3.1.npy ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:053b8cce8424b722db6ec5b068514eb007a23b4c5afd629449eb08746e643211
3
+ size 3872
deep_3drecon/BFM/index_mp468_from_mesh35709_v3.npy ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5b007b3619dd02892b38349ba3d4b10e32bc2eff201c265f25d6ed62f67dbd51
3
+ size 3872
deep_3drecon/BFM/select_vertex_id.mat ADDED
Binary file (62.3 kB). View file
 
deep_3drecon/BFM/similarity_Lm3D_all.mat ADDED
Binary file (994 Bytes). View file
 
deep_3drecon/__init__.py ADDED
@@ -0,0 +1 @@
1
+ from .reconstructor import *
deep_3drecon/bfm_left_eye_faces.npy ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9651756ea2c0fac069a1edf858ed1f125eddc358fa74c529a370c1e7b5730d28
3
+ size 4680
deep_3drecon/bfm_right_eye_faces.npy ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:28cb5bbacf578d30a3d5006ec28c617fe5a3ecaeeeb87d9433a884e0f0301a2e
3
+ size 4648
deep_3drecon/data_preparation.py ADDED
@@ -0,0 +1,45 @@
1
+ """This script is the data preparation script for Deep3DFaceRecon_pytorch
2
+ """
3
+
4
+ import os
5
+ import numpy as np
6
+ import argparse
7
+ from util.detect_lm68 import detect_68p,load_lm_graph
8
+ from util.skin_mask import get_skin_mask
9
+ from util.generate_list import check_list, write_list
10
+ import warnings
11
+ warnings.filterwarnings("ignore")
12
+
13
+ parser = argparse.ArgumentParser()
14
+ parser.add_argument('--data_root', type=str, default='datasets', help='root directory for training data')
15
+ parser.add_argument('--img_folder', nargs="+", required=True, help='folders of training images')
16
+ parser.add_argument('--mode', type=str, default='train', help='train or val')
17
+ opt = parser.parse_args()
18
+
19
+ os.environ['CUDA_VISIBLE_DEVICES'] = '0'
20
+
21
+ def data_prepare(folder_list,mode):
22
+
23
+ lm_sess,input_op,output_op = load_lm_graph('./checkpoints/lm_model/68lm_detector.pb') # load a tensorflow version 68-landmark detector
24
+
25
+ for img_folder in folder_list:
26
+ detect_68p(img_folder,lm_sess,input_op,output_op) # detect landmarks for images
27
+ get_skin_mask(img_folder) # generate skin attention mask for images
28
+
29
+ # create files that record path to all training data
30
+ msks_list = []
31
+ for img_folder in folder_list:
32
+ path = os.path.join(img_folder, 'mask')
33
+ msks_list += ['/'.join([img_folder, 'mask', i]) for i in sorted(os.listdir(path)) if 'jpg' in i or
34
+ 'png' in i or 'jpeg' in i or 'PNG' in i]
35
+
36
+ imgs_list = [i.replace('mask/', '') for i in msks_list]
37
+ lms_list = [i.replace('mask', 'landmarks') for i in msks_list]
38
+ lms_list = ['.'.join(i.split('.')[:-1]) + '.txt' for i in lms_list]
39
+
40
+ lms_list_final, imgs_list_final, msks_list_final = check_list(lms_list, imgs_list, msks_list) # check if the path is valid
41
+ write_list(lms_list_final, imgs_list_final, msks_list_final, mode=mode) # save files
42
+
43
+ if __name__ == '__main__':
44
+ print('Datasets:',opt.img_folder)
45
+ data_prepare([os.path.join(opt.data_root,folder) for folder in opt.img_folder],opt.mode)
deep_3drecon/deep_3drecon_models/__init__.py ADDED
@@ -0,0 +1,67 @@
1
+ """This package contains modules related to objective functions, optimizations, and network architectures.
2
+
3
+ To add a custom model class called 'dummy', you need to add a file called 'dummy_model.py' and define a subclass DummyModel inherited from BaseModel.
4
+ You need to implement the following five functions:
5
+ -- <__init__>: initialize the class; first call BaseModel.__init__(self, opt).
6
+ -- <set_input>: unpack data from dataset and apply preprocessing.
7
+ -- <forward>: produce intermediate results.
8
+ -- <optimize_parameters>: calculate loss, gradients, and update network weights.
9
+ -- <modify_commandline_options>: (optionally) add model-specific options and set default options.
10
+
11
+ In the function <__init__>, you need to define four lists:
12
+ -- self.loss_names (str list): specify the training losses that you want to plot and save.
13
+ -- self.model_names (str list): define networks used in our training.
14
+ -- self.visual_names (str list): specify the images that you want to display and save.
15
+ -- self.optimizers (optimizer list): define and initialize optimizers. You can define one optimizer for each network. If two networks are updated at the same time, you can use itertools.chain to group them. See cycle_gan_model.py for an example of its usage.
16
+
17
+ Now you can use the model class by specifying flag '--model dummy'.
18
+ See our template model class 'template_model.py' for more details.
19
+ """
20
+
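+ # Illustrative sketch only (not part of this package), kept as comments so it has no runtime effect:
+ # under the assumptions in the docstring above, a hypothetical 'dummy_model.py' might look roughly
+ # like this. 'DummyModel' and '--lambda_dummy' are made-up names used purely for illustration.
+ #
+ #   from .base_model import BaseModel
+ #
+ #   class DummyModel(BaseModel):
+ #       @staticmethod
+ #       def modify_commandline_options(parser, is_train=True):
+ #           parser.add_argument('--lambda_dummy', type=float, default=1.0)  # model-specific option
+ #           return parser
+ #
+ #       def __init__(self, opt):
+ #           BaseModel.__init__(self, opt)
+ #           self.loss_names, self.model_names, self.visual_names, self.optimizers = [], [], [], []
+ #
+ #       def set_input(self, input):
+ #           self.data = input  # unpack data from the dataloader
+ #
+ #       def forward(self):
+ #           pass  # produce intermediate results
+ #
+ #       def optimize_parameters(self):
+ #           pass  # compute losses, gradients, and update weights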
21
+ import importlib
22
+ from .base_model import BaseModel
23
+
24
+
25
+ def find_model_using_name(model_name):
26
+ """Import the module "models/[model_name]_model.py".
27
+
28
+ In the file, the class called DatasetNameModel() will
29
+ be instantiated. It has to be a subclass of BaseModel,
30
+ and it is case-insensitive.
31
+ """
32
+ model_filename = "deep_3drecon_models." + model_name + "_model"
33
+ modellib = importlib.import_module(model_filename)
34
+ model = None
35
+ target_model_name = model_name.replace('_', '') + 'model'
36
+ for name, cls in modellib.__dict__.items():
37
+ if name.lower() == target_model_name.lower() \
38
+ and issubclass(cls, BaseModel):
39
+ model = cls
40
+
41
+ if model is None:
42
+ print("In %s.py, there should be a subclass of BaseModel with class name that matches %s in lowercase." % (model_filename, target_model_name))
43
+ exit(0)
44
+
45
+ return model
46
+
47
+
48
+ def get_option_setter(model_name):
49
+ """Return the static method <modify_commandline_options> of the model class."""
50
+ model_class = find_model_using_name(model_name)
51
+ return model_class.modify_commandline_options
52
+
53
+
54
+ def create_model(opt):
55
+ """Create a model given the option.
56
+
57
+ This function instantiates the model class specified by 'opt.model'.
58
+ This is the main interface between this package and 'train.py'/'test.py'
59
+
60
+ Example:
61
+ >>> from models import create_model
62
+ >>> model = create_model(opt)
63
+ """
64
+ model = find_model_using_name(opt.model)
65
+ instance = model(opt)
66
+ print("model [%s] was created" % type(instance).__name__)
67
+ return instance
deep_3drecon/deep_3drecon_models/arcface_torch/README.md ADDED
@@ -0,0 +1,218 @@
1
+ # Distributed Arcface Training in Pytorch
2
+
3
+ The "arcface_torch" repository is the official implementation of the ArcFace algorithm. It supports distributed and sparse training with multiple distributed training examples, including several memory-saving techniques such as mixed precision training and gradient checkpointing. It also supports training for ViT models and datasets including WebFace42M and Glint360K, two of the largest open-source datasets. Additionally, the repository comes with a built-in tool for converting to ONNX format, making it easy to submit to MFR evaluation systems.
4
+
5
+ [![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/killing-two-birds-with-one-stone-efficient/face-verification-on-ijb-c)](https://paperswithcode.com/sota/face-verification-on-ijb-c?p=killing-two-birds-with-one-stone-efficient)
6
+ [![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/killing-two-birds-with-one-stone-efficient/face-verification-on-ijb-b)](https://paperswithcode.com/sota/face-verification-on-ijb-b?p=killing-two-birds-with-one-stone-efficient)
7
+ [![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/killing-two-birds-with-one-stone-efficient/face-verification-on-agedb-30)](https://paperswithcode.com/sota/face-verification-on-agedb-30?p=killing-two-birds-with-one-stone-efficient)
8
+ [![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/killing-two-birds-with-one-stone-efficient/face-verification-on-cfp-fp)](https://paperswithcode.com/sota/face-verification-on-cfp-fp?p=killing-two-birds-with-one-stone-efficient)
9
+
10
+ ## Requirements
11
+
12
+ To take advantage of the latest PyTorch features, we have upgraded to version 1.12.0.
13
+
14
+ - Install [PyTorch](https://pytorch.org/get-started/previous-versions/) (torch>=1.12.0).
15
+ - (Optional) Install [DALI](https://docs.nvidia.com/deeplearning/dali/user-guide/docs/); see our doc [install_dali.md](docs/install_dali.md).
16
+ - `pip install -r requirement.txt`.
17
+
18
+ ## How to Train
19
+
20
+ To train a model, execute the `train.py` script with the path to the configuration files. The sample commands provided below demonstrate the process of conducting distributed training.
21
+
22
+ ### 1. To run on one GPU:
23
+
24
+ ```shell
25
+ python train_v2.py configs/ms1mv3_r50_onegpu
26
+ ```
27
+
28
+ Note:
29
+ It is not recommended to use a single GPU for training, as this may result in longer training times and suboptimal performance. For best results, we suggest using multiple GPUs or a GPU cluster.
30
+
31
+
32
+ ### 2. To run on a machine with 8 GPUs:
33
+
34
+ ```shell
35
+ torchrun --nproc_per_node=8 train.py configs/ms1mv3_r50
36
+ ```
37
+
38
+ ### 3. To run on 2 machines with 8 GPUs each:
39
+
40
+ Node 0:
41
+
42
+ ```shell
43
+ torchrun --nproc_per_node=8 --nnodes=2 --node_rank=0 --master_addr="ip1" --master_port=12581 train.py configs/wf42m_pfc02_16gpus_r100
44
+ ```
45
+
46
+ Node 1:
47
+
48
+ ```shell
49
+ torchrun --nproc_per_node=8 --nnodes=2 --node_rank=1 --master_addr="ip1" --master_port=12581 train.py configs/wf42m_pfc02_16gpus_r100
50
+ ```
51
+
52
+ ### 4. Run ViT-B on a machine with 24k batchsize:
53
+
54
+ ```shell
55
+ torchrun --nproc_per_node=8 train_v2.py configs/wf42m_pfc03_40epoch_8gpu_vit_b
56
+ ```
57
+
58
+
59
+ ## Download Datasets or Prepare Datasets
60
+ - [MS1MV2](https://github.com/deepinsight/insightface/tree/master/recognition/_datasets_#ms1m-arcface-85k-ids58m-images-57) (87k IDs, 5.8M images)
61
+ - [MS1MV3](https://github.com/deepinsight/insightface/tree/master/recognition/_datasets_#ms1m-retinaface) (93k IDs, 5.2M images)
62
+ - [Glint360K](https://github.com/deepinsight/insightface/tree/master/recognition/partial_fc#4-download) (360k IDs, 17.1M images)
63
+ - [WebFace42M](docs/prepare_webface42m.md) (2M IDs, 42.5M images)
64
+ - [Your Dataset, Click Here!](docs/prepare_custom_dataset.md)
65
+
66
+ Note:
67
+ If you want to use DALI for data reading, please use the script 'scripts/shuffle_rec.py' to shuffle the InsightFace style rec before using it.
68
+ Example:
69
+
70
+ `python scripts/shuffle_rec.py ms1m-retinaface-t1`
71
+
72
+ You will get the "shuffled_ms1m-retinaface-t1" folder, where the samples in the "train.rec" file are shuffled.
73
+
74
+
75
+ ## Model Zoo
76
+
77
+ - The models are available for non-commercial research purposes only.
78
+ - All models can be found here:
79
+ - [Baidu Yun Pan](https://pan.baidu.com/s/1CL-l4zWqsI1oDuEEYVhj-g): e8pw
80
+ - [OneDrive](https://1drv.ms/u/s!AswpsDO2toNKq0lWY69vN58GR6mw?e=p9Ov5d)
81
+
82
+ ### Performance on IJB-C and [**ICCV2021-MFR**](https://github.com/deepinsight/insightface/blob/master/challenges/mfr/README.md)
83
+
84
+ The ICCV2021-MFR test set consists of non-celebrities, so we can ensure that it has very little overlap with publicly available face
85
+ recognition training sets such as MS1M and CASIA, which are mostly collected from online celebrities.
86
+ As a result, we can evaluate the fair performance of different algorithms.
87
+
88
+ For the **ICCV2021-MFR-ALL** set, TAR is measured with an all-to-all 1:1 protocol, at FAR less than 0.000001 (1e-6). The
89
+ globalised multi-racial test set contains 242,143 identities and 1,624,305 images.
90
+
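+ The snippet below is not the official MFR evaluation code; it is a minimal sketch of how a TAR at a fixed FAR (e.g. 1e-6) could be estimated from 1:1 verification scores, assuming hypothetical `scores` and `labels` arrays where `scores` holds the similarity of every evaluated pair and `labels` marks genuine pairs with 1 and impostor pairs with 0.
+
+ ```python
+ import numpy as np
+
+ def tar_at_far(scores, labels, far=1e-6):
+     """TAR at a given FAR for 1:1 verification; scores/labels are 1-D arrays over pairs."""
+     neg = np.sort(scores[labels == 0])[::-1]      # impostor scores, highest first
+     k = max(int(len(neg) * far), 1)               # number of false accepts the FAR budget allows
+     threshold = neg[k - 1]                        # accept pairs scoring at least this value
+     pos = scores[labels == 1]
+     return float((pos >= threshold).mean())       # fraction of genuine pairs accepted
+ ```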
91
+
92
+ #### 1. Training on Single-Host GPU
93
+
94
+ | Datasets | Backbone | **MFR-ALL** | IJB-C(1E-4) | IJB-C(1E-5) | log |
95
+ |:---------------|:--------------------|:------------|:------------|:------------|:------------------------------------------------------------------------------------------------------------------------------------|
96
+ | MS1MV2 | mobilefacenet-0.45G | 62.07 | 93.61 | 90.28 | [click me](https://raw.githubusercontent.com/anxiangsir/insightface_arcface_log/master/ms1mv2_mbf/training.log) |
97
+ | MS1MV2 | r50 | 75.13 | 95.97 | 94.07 | [click me](https://raw.githubusercontent.com/anxiangsir/insightface_arcface_log/master/ms1mv2_r50/training.log) |
98
+ | MS1MV2 | r100 | 78.12 | 96.37 | 94.27 | [click me](https://raw.githubusercontent.com/anxiangsir/insightface_arcface_log/master/ms1mv2_r100/training.log) |
99
+ | MS1MV3 | mobilefacenet-0.45G | 63.78 | 94.23 | 91.33 | [click me](https://raw.githubusercontent.com/anxiangsir/insightface_arcface_log/master/ms1mv3_mbf/training.log) |
100
+ | MS1MV3 | r50 | 79.14 | 96.37 | 94.47 | [click me](https://raw.githubusercontent.com/anxiangsir/insightface_arcface_log/master/ms1mv3_r50/training.log) |
101
+ | MS1MV3 | r100 | 81.97 | 96.85 | 95.02 | [click me](https://raw.githubusercontent.com/anxiangsir/insightface_arcface_log/master/ms1mv3_r100/training.log) |
102
+ | Glint360K | mobilefacenet-0.45G | 70.18 | 95.04 | 92.62 | [click me](https://raw.githubusercontent.com/anxiangsir/insightface_arcface_log/master/glint360k_mbf/training.log) |
103
+ | Glint360K | r50 | 86.34 | 97.16 | 95.81 | [click me](https://raw.githubusercontent.com/anxiangsir/insightface_arcface_log/master/glint360k_r50/training.log) |
104
+ | Glint360k | r100 | 89.52 | 97.55 | 96.38 | [click me](https://raw.githubusercontent.com/anxiangsir/insightface_arcface_log/master/glint360k_r100/training.log) |
105
+ | WF4M | r100 | 89.87 | 97.19 | 95.48 | [click me](https://raw.githubusercontent.com/anxiangsir/insightface_arcface_log/master/wf4m_r100/training.log) |
106
+ | WF12M-PFC-0.2 | r100 | 94.75 | 97.60 | 95.90 | [click me](https://raw.githubusercontent.com/anxiangsir/insightface_arcface_log/master/wf12m_pfc02_r100/training.log) |
107
+ | WF12M-PFC-0.3 | r100 | 94.71 | 97.64 | 96.01 | [click me](https://raw.githubusercontent.com/anxiangsir/insightface_arcface_log/master/wf12m_pfc03_r100/training.log) |
108
+ | WF12M | r100 | 94.69 | 97.59 | 95.97 | [click me](https://raw.githubusercontent.com/anxiangsir/insightface_arcface_log/master/wf12m_r100/training.log) |
109
+ | WF42M-PFC-0.2 | r100 | 96.27 | 97.70 | 96.31 | [click me](https://raw.githubusercontent.com/anxiangsir/insightface_arcface_log/master/wf42m_pfc02_r100/training.log) |
110
+ | WF42M-PFC-0.2 | ViT-T-1.5G | 92.04 | 97.27 | 95.68 | [click me](https://raw.githubusercontent.com/anxiangsir/insightface_arcface_log/master/wf42m_pfc02_40epoch_8gpu_vit_t/training.log) |
111
+ | WF42M-PFC-0.3 | ViT-B-11G | 97.16 | 97.91 | 97.05 | [click me](https://raw.githubusercontent.com/anxiangsir/insightface_arcface_log/master/pfc03_wf42m_vit_b_8gpu/training.log) |
112
+
113
+ #### 2. Training on Multi-Host GPU
114
+
115
+ | Datasets | Backbone(bs*gpus) | **MFR-ALL** | IJB-C(1E-4) | IJB-C(1E-5) | Throughput | log |
116
+ |:-----------------|:------------------|:------------|:------------|:------------|:-----------|:-------------------------------------------------------------------------------------------------------------------------------------------|
117
+ | WF42M-PFC-0.2 | r50(512*8) | 93.83 | 97.53 | 96.16 | ~5900 | [click me](https://raw.githubusercontent.com/anxiangsir/insightface_arcface_log/master/webface42m_r50_bs4k_pfc02/training.log) |
118
+ | WF42M-PFC-0.2 | r50(512*16) | 93.96 | 97.46 | 96.12 | ~11000 | [click me](https://raw.githubusercontent.com/anxiangsir/insightface_arcface_log/master/webface42m_r50_lr01_pfc02_bs8k_16gpus/training.log) |
119
+ | WF42M-PFC-0.2 | r50(128*32) | 94.04 | 97.48 | 95.94 | ~17000 | click me |
120
+ | WF42M-PFC-0.2 | r100(128*16) | 96.28 | 97.80 | 96.57 | ~5200 | click me |
121
+ | WF42M-PFC-0.2 | r100(256*16) | 96.69 | 97.85 | 96.63 | ~5200 | [click me](https://raw.githubusercontent.com/anxiangsir/insightface_arcface_log/master/webface42m_r100_bs4k_pfc02/training.log) |
122
+ | WF42M-PFC-0.0018 | r100(512*32) | 93.08 | 97.51 | 95.88 | ~10000 | click me |
123
+ | WF42M-PFC-0.2 | r100(128*32) | 96.57 | 97.83 | 96.50 | ~9800 | click me |
124
+
125
+ `r100(128*32)` means the backbone is r100, the batch size per GPU is 128, and the number of GPUs is 32.
126
+
127
+
128
+
129
+ #### 3. ViT For Face Recognition
130
+
131
+ | Datasets | Backbone(bs) | FLOPs | **MFR-ALL** | IJB-C(1E-4) | IJB-C(1E-5) | Throughput | log |
132
+ |:--------------|:--------------|:------|:------------|:------------|:------------|:-----------|:-----------------------------------------------------------------------------------------------------------------------------|
133
+ | WF42M-PFC-0.3 | r18(128*32) | 2.6 | 79.13 | 95.77 | 93.36 | - | click me |
134
+ | WF42M-PFC-0.3 | r50(128*32) | 6.3 | 94.03 | 97.48 | 95.94 | - | click me |
135
+ | WF42M-PFC-0.3 | r100(128*32) | 12.1 | 96.69 | 97.82 | 96.45 | - | click me |
136
+ | WF42M-PFC-0.3 | r200(128*32) | 23.5 | 97.70 | 97.97 | 96.93 | - | click me |
137
+ | WF42M-PFC-0.3 | VIT-T(384*64) | 1.5 | 92.24 | 97.31 | 95.97 | ~35000 | click me |
138
+ | WF42M-PFC-0.3 | VIT-S(384*64) | 5.7 | 95.87 | 97.73 | 96.57 | ~25000 | [click me](https://raw.githubusercontent.com/anxiangsir/insightface_arcface_log/master/pfc03_wf42m_vit_s_64gpu/training.log) |
139
+ | WF42M-PFC-0.3 | VIT-B(384*64) | 11.4 | 97.42 | 97.90 | 97.04 | ~13800 | [click me](https://raw.githubusercontent.com/anxiangsir/insightface_arcface_log/master/pfc03_wf42m_vit_b_64gpu/training.log) |
140
+ | WF42M-PFC-0.3 | VIT-L(384*64) | 25.3 | 97.85 | 98.00 | 97.23 | ~9406 | [click me](https://raw.githubusercontent.com/anxiangsir/insightface_arcface_log/master/pfc03_wf42m_vit_l_64gpu/training.log) |
141
+
142
+ `WF42M` means WebFace42M, and `PFC-0.3` means the negative class centers are sampled at a rate of 0.3.
143
+
144
+ #### 4. Noisy Datasets
145
+
146
+ | Datasets | Backbone | **MFR-ALL** | IJB-C(1E-4) | IJB-C(1E-5) | log |
147
+ |:-------------------------|:---------|:------------|:------------|:------------|:---------|
148
+ | WF12M-Flip(40%) | r50 | 43.87 | 88.35 | 80.78 | click me |
149
+ | WF12M-Flip(40%)-PFC-0.1* | r50 | 80.20 | 96.11 | 93.79 | click me |
150
+ | WF12M-Conflict | r50 | 79.93 | 95.30 | 91.56 | click me |
151
+ | WF12M-Conflict-PFC-0.3* | r50 | 91.68 | 97.28 | 95.75 | click me |
152
+
153
+ `WF12M` means WebFace12M; `PFC-0.1*` denotes PFC-0.1 with additional abnormal inter-class filtering.
154
+
155
+
156
+
157
+ ## Speed Benchmark
158
+ <div><img src="https://github.com/anxiangsir/insightface_arcface_log/blob/master/pfc_exp.png" width = "90%" /></div>
159
+
160
+
161
+ **Arcface-Torch** is an efficient tool for training large-scale face recognition models. When the number of classes in the training set exceeds one million, the Partial FC sampling strategy maintains the same accuracy while training several times faster and using less GPU memory. Partial FC is a sparse variant of the model-parallel architecture for large-scale face recognition: it uses a sparse softmax that dynamically samples a subset of class centers for each training batch, so only a sparse portion of the parameters is updated in each iteration, which significantly reduces GPU memory and compute requirements. With Partial FC it is possible to train on datasets with up to 29 million identities, the largest to date. Partial FC also supports multi-machine distributed training and mixed-precision training.
162
+
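+ The idea can be sketched in a few lines of PyTorch (a simplified illustration, not the repository's actual `partial_fc.py`, which additionally shards the class centers across GPUs and applies the ArcFace margin): keep the centers of the classes present in the current batch, add random negative centers until the configured sample rate is reached, and compute the softmax only over that subset.
+
+ ```
+ import torch
+ import torch.nn.functional as F
+
+ def sample_centers(weight, labels, sample_rate=0.1):
+     """Keep the positive class centers of this batch plus random negatives."""
+     num_classes = weight.size(0)
+     positives = labels.unique()
+     num_sample = max(int(num_classes * sample_rate), positives.numel())
+     score = torch.rand(num_classes, device=weight.device)
+     score[positives] = 2.0                          # force positives to be kept
+     index = torch.topk(score, k=num_sample)[1].sort()[0]
+     new_labels = torch.searchsorted(index, labels)  # remap labels to the subset
+     return weight[index], new_labels
+
+ def partial_fc_loss(embeddings, labels, weight, sample_rate=0.1, scale=64.0):
+     sub_weight, sub_labels = sample_centers(weight, labels, sample_rate)
+     logits = F.linear(F.normalize(embeddings), F.normalize(sub_weight))
+     return F.cross_entropy(logits * scale, sub_labels)
+ ```
+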
163
+
164
+
165
+ For more details, see
166
+ [speed_benchmark.md](docs/speed_benchmark.md) in docs.
167
+
168
+ > 1. Training Speed of Various Parallel Techniques (Samples per Second) on a Tesla V100 32GB x 8 System (Higher is Better)
169
+
170
+ `-` means training failed because of GPU memory limitations.
171
+
172
+ | Number of Identities in Dataset | Data Parallel | Model Parallel | Partial FC 0.1 |
173
+ |:--------------------------------|:--------------|:---------------|:---------------|
174
+ | 125000 | 4681 | 4824 | 5004 |
175
+ | 1400000 | **1672** | 3043 | 4738 |
176
+ | 5500000 | **-** | **1389** | 3975 |
177
+ | 8000000 | **-** | **-** | 3565 |
178
+ | 16000000 | **-** | **-** | 2679 |
179
+ | 29000000 | **-** | **-** | **1855** |
180
+
181
+ > 2. GPU Memory Utilization of Various Parallel Techniques (MB per GPU) on a Tesla V100 32GB x 8 System (Lower is Better)
182
+
183
+ | Number of Identities in Dataset | Data Parallel | Model Parallel | Partial FC 0.1 |
184
+ |:--------------------------------|:--------------|:---------------|:---------------|
185
+ | 125000 | 7358 | 5306 | 4868 |
186
+ | 1400000 | 32252 | 11178 | 6056 |
187
+ | 5500000 | **-** | 32188 | 9854 |
188
+ | 8000000 | **-** | **-** | 12310 |
189
+ | 16000000 | **-** | **-** | 19950 |
190
+ | 29000000 | **-** | **-** | 32324 |
191
+
192
+
193
+ ## Citations
194
+
195
+ ```
196
+ @inproceedings{deng2019arcface,
197
+ title={Arcface: Additive angular margin loss for deep face recognition},
198
+ author={Deng, Jiankang and Guo, Jia and Xue, Niannan and Zafeiriou, Stefanos},
199
+ booktitle={Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition},
200
+ pages={4690--4699},
201
+ year={2019}
202
+ }
203
+ @inproceedings{An_2022_CVPR,
204
+ author={An, Xiang and Deng, Jiankang and Guo, Jia and Feng, Ziyong and Zhu, XuHan and Yang, Jing and Liu, Tongliang},
205
+ title={Killing Two Birds With One Stone: Efficient and Robust Training of Face Recognition CNNs by Partial FC},
206
+ booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
207
+ month={June},
208
+ year={2022},
209
+ pages={4042--4051}
210
+ }
211
+ @inproceedings{zhu2021webface260m,
212
+ title={Webface260m: A benchmark unveiling the power of million-scale deep face recognition},
213
+ author={Zhu, Zheng and Huang, Guan and Deng, Jiankang and Ye, Yun and Huang, Junjie and Chen, Xinze and Zhu, Jiagang and Yang, Tian and Lu, Jiwen and Du, Dalong and Zhou, Jie},
214
+ booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
215
+ pages={10492--10502},
216
+ year={2021}
217
+ }
218
+ ```
deep_3drecon/deep_3drecon_models/arcface_torch/backbones/__init__.py ADDED
@@ -0,0 +1,85 @@
1
+ from .iresnet import iresnet18, iresnet34, iresnet50, iresnet100, iresnet200
2
+ from .mobilefacenet import get_mbf
3
+
4
+
5
+ def get_model(name, **kwargs):
6
+ # resnet
7
+ if name == "r18":
8
+ return iresnet18(False, **kwargs)
9
+ elif name == "r34":
10
+ return iresnet34(False, **kwargs)
11
+ elif name == "r50":
12
+ return iresnet50(False, **kwargs)
13
+ elif name == "r100":
14
+ return iresnet100(False, **kwargs)
15
+ elif name == "r200":
16
+ return iresnet200(False, **kwargs)
17
+ elif name == "r2060":
18
+ from .iresnet2060 import iresnet2060
19
+ return iresnet2060(False, **kwargs)
20
+
21
+ elif name == "mbf":
22
+ fp16 = kwargs.get("fp16", False)
23
+ num_features = kwargs.get("num_features", 512)
24
+ return get_mbf(fp16=fp16, num_features=num_features)
25
+
26
+ elif name == "mbf_large":
27
+ from .mobilefacenet import get_mbf_large
28
+ fp16 = kwargs.get("fp16", False)
29
+ num_features = kwargs.get("num_features", 512)
30
+ return get_mbf_large(fp16=fp16, num_features=num_features)
31
+
32
+ elif name == "vit_t":
33
+ num_features = kwargs.get("num_features", 512)
34
+ from .vit import VisionTransformer
35
+ return VisionTransformer(
36
+ img_size=112, patch_size=9, num_classes=num_features, embed_dim=256, depth=12,
37
+ num_heads=8, drop_path_rate=0.1, norm_layer="ln", mask_ratio=0.1)
38
+
39
+ elif name == "vit_t_dp005_mask0": # For WebFace42M
40
+ num_features = kwargs.get("num_features", 512)
41
+ from .vit import VisionTransformer
42
+ return VisionTransformer(
43
+ img_size=112, patch_size=9, num_classes=num_features, embed_dim=256, depth=12,
44
+ num_heads=8, drop_path_rate=0.05, norm_layer="ln", mask_ratio=0.0)
45
+
46
+ elif name == "vit_s":
47
+ num_features = kwargs.get("num_features", 512)
48
+ from .vit import VisionTransformer
49
+ return VisionTransformer(
50
+ img_size=112, patch_size=9, num_classes=num_features, embed_dim=512, depth=12,
51
+ num_heads=8, drop_path_rate=0.1, norm_layer="ln", mask_ratio=0.1)
52
+
53
+ elif name == "vit_s_dp005_mask_0": # For WebFace42M
54
+ num_features = kwargs.get("num_features", 512)
55
+ from .vit import VisionTransformer
56
+ return VisionTransformer(
57
+ img_size=112, patch_size=9, num_classes=num_features, embed_dim=512, depth=12,
58
+ num_heads=8, drop_path_rate=0.05, norm_layer="ln", mask_ratio=0.0)
59
+
60
+ elif name == "vit_b":
61
+ # this is a feature
62
+ num_features = kwargs.get("num_features", 512)
63
+ from .vit import VisionTransformer
64
+ return VisionTransformer(
65
+ img_size=112, patch_size=9, num_classes=num_features, embed_dim=512, depth=24,
66
+ num_heads=8, drop_path_rate=0.1, norm_layer="ln", mask_ratio=0.1, using_checkpoint=True)
67
+
68
+ elif name == "vit_b_dp005_mask_005": # For WebFace42M
69
+ # this is a feature
70
+ num_features = kwargs.get("num_features", 512)
71
+ from .vit import VisionTransformer
72
+ return VisionTransformer(
73
+ img_size=112, patch_size=9, num_classes=num_features, embed_dim=512, depth=24,
74
+ num_heads=8, drop_path_rate=0.05, norm_layer="ln", mask_ratio=0.05, using_checkpoint=True)
75
+
76
+ elif name == "vit_l_dp005_mask_005": # For WebFace42M
77
+ # this is a feature
78
+ num_features = kwargs.get("num_features", 512)
79
+ from .vit import VisionTransformer
80
+ return VisionTransformer(
81
+ img_size=112, patch_size=9, num_classes=num_features, embed_dim=768, depth=24,
82
+ num_heads=8, drop_path_rate=0.05, norm_layer="ln", mask_ratio=0.05, using_checkpoint=True)
83
+
84
+ else:
85
+ raise ValueError(f"Unsupported backbone name: {name}")
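+
+ # A minimal usage sketch: every branch above returns an embedding backbone that
+ # maps 112x112 aligned RGB face crops to `num_features`-dimensional features, e.g.
+ #   model = get_model("r50", fp16=False, num_features=512)
+ #   feats = model(torch.randn(4, 3, 112, 112))  # -> shape [4, 512]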
deep_3drecon/deep_3drecon_models/arcface_torch/backbones/iresnet.py ADDED
@@ -0,0 +1,194 @@
1
+ import torch
2
+ from torch import nn
3
+ from torch.utils.checkpoint import checkpoint
4
+
5
+ __all__ = ['iresnet18', 'iresnet34', 'iresnet50', 'iresnet100', 'iresnet200']
6
+ using_ckpt = False  # set True to wrap IBasicBlock.forward in gradient checkpointing (saves memory at extra compute cost)
7
+
8
+ def conv3x3(in_planes, out_planes, stride=1, groups=1, dilation=1):
9
+ """3x3 convolution with padding"""
10
+ return nn.Conv2d(in_planes,
11
+ out_planes,
12
+ kernel_size=3,
13
+ stride=stride,
14
+ padding=dilation,
15
+ groups=groups,
16
+ bias=False,
17
+ dilation=dilation)
18
+
19
+
20
+ def conv1x1(in_planes, out_planes, stride=1):
21
+ """1x1 convolution"""
22
+ return nn.Conv2d(in_planes,
23
+ out_planes,
24
+ kernel_size=1,
25
+ stride=stride,
26
+ bias=False)
27
+
28
+
29
+ class IBasicBlock(nn.Module):
30
+ expansion = 1
31
+ def __init__(self, inplanes, planes, stride=1, downsample=None,
32
+ groups=1, base_width=64, dilation=1):
33
+ super(IBasicBlock, self).__init__()
34
+ if groups != 1 or base_width != 64:
35
+ raise ValueError('BasicBlock only supports groups=1 and base_width=64')
36
+ if dilation > 1:
37
+ raise NotImplementedError("Dilation > 1 not supported in BasicBlock")
38
+ self.bn1 = nn.BatchNorm2d(inplanes, eps=1e-05,)
39
+ self.conv1 = conv3x3(inplanes, planes)
40
+ self.bn2 = nn.BatchNorm2d(planes, eps=1e-05,)
41
+ self.prelu = nn.PReLU(planes)
42
+ self.conv2 = conv3x3(planes, planes, stride)
43
+ self.bn3 = nn.BatchNorm2d(planes, eps=1e-05,)
44
+ self.downsample = downsample
45
+ self.stride = stride
46
+
47
+ def forward_impl(self, x):
48
+ identity = x
49
+ out = self.bn1(x)
50
+ out = self.conv1(out)
51
+ out = self.bn2(out)
52
+ out = self.prelu(out)
53
+ out = self.conv2(out)
54
+ out = self.bn3(out)
55
+ if self.downsample is not None:
56
+ identity = self.downsample(x)
57
+ out += identity
58
+ return out
59
+
60
+ def forward(self, x):
61
+ if self.training and using_ckpt:
62
+ return checkpoint(self.forward_impl, x)
63
+ else:
64
+ return self.forward_impl(x)
65
+
66
+
67
+ class IResNet(nn.Module):
68
+ fc_scale = 7 * 7
69
+ def __init__(self,
70
+ block, layers, dropout=0, num_features=512, zero_init_residual=False,
71
+ groups=1, width_per_group=64, replace_stride_with_dilation=None, fp16=False):
72
+ super(IResNet, self).__init__()
73
+ self.extra_gflops = 0.0
74
+ self.fp16 = fp16
75
+ self.inplanes = 64
76
+ self.dilation = 1
77
+ if replace_stride_with_dilation is None:
78
+ replace_stride_with_dilation = [False, False, False]
79
+ if len(replace_stride_with_dilation) != 3:
80
+ raise ValueError("replace_stride_with_dilation should be None "
81
+ "or a 3-element tuple, got {}".format(replace_stride_with_dilation))
82
+ self.groups = groups
83
+ self.base_width = width_per_group
84
+ self.conv1 = nn.Conv2d(3, self.inplanes, kernel_size=3, stride=1, padding=1, bias=False)
85
+ self.bn1 = nn.BatchNorm2d(self.inplanes, eps=1e-05)
86
+ self.prelu = nn.PReLU(self.inplanes)
87
+ self.layer1 = self._make_layer(block, 64, layers[0], stride=2)
88
+ self.layer2 = self._make_layer(block,
89
+ 128,
90
+ layers[1],
91
+ stride=2,
92
+ dilate=replace_stride_with_dilation[0])
93
+ self.layer3 = self._make_layer(block,
94
+ 256,
95
+ layers[2],
96
+ stride=2,
97
+ dilate=replace_stride_with_dilation[1])
98
+ self.layer4 = self._make_layer(block,
99
+ 512,
100
+ layers[3],
101
+ stride=2,
102
+ dilate=replace_stride_with_dilation[2])
103
+ self.bn2 = nn.BatchNorm2d(512 * block.expansion, eps=1e-05,)
104
+ self.dropout = nn.Dropout(p=dropout, inplace=True)
105
+ self.fc = nn.Linear(512 * block.expansion * self.fc_scale, num_features)
106
+ self.features = nn.BatchNorm1d(num_features, eps=1e-05)
107
+ nn.init.constant_(self.features.weight, 1.0)
108
+ self.features.weight.requires_grad = False
109
+
110
+ for m in self.modules():
111
+ if isinstance(m, nn.Conv2d):
112
+ nn.init.normal_(m.weight, 0, 0.1)
113
+ elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)):
114
+ nn.init.constant_(m.weight, 1)
115
+ nn.init.constant_(m.bias, 0)
116
+
117
+ if zero_init_residual:
118
+ for m in self.modules():
119
+ if isinstance(m, IBasicBlock):
120
+ nn.init.constant_(m.bn2.weight, 0)
121
+
122
+ def _make_layer(self, block, planes, blocks, stride=1, dilate=False):
123
+ downsample = None
124
+ previous_dilation = self.dilation
125
+ if dilate:
126
+ self.dilation *= stride
127
+ stride = 1
128
+ if stride != 1 or self.inplanes != planes * block.expansion:
129
+ downsample = nn.Sequential(
130
+ conv1x1(self.inplanes, planes * block.expansion, stride),
131
+ nn.BatchNorm2d(planes * block.expansion, eps=1e-05, ),
132
+ )
133
+ layers = []
134
+ layers.append(
135
+ block(self.inplanes, planes, stride, downsample, self.groups,
136
+ self.base_width, previous_dilation))
137
+ self.inplanes = planes * block.expansion
138
+ for _ in range(1, blocks):
139
+ layers.append(
140
+ block(self.inplanes,
141
+ planes,
142
+ groups=self.groups,
143
+ base_width=self.base_width,
144
+ dilation=self.dilation))
145
+
146
+ return nn.Sequential(*layers)
147
+
148
+ def forward(self, x):
149
+ with torch.cuda.amp.autocast(self.fp16):
150
+ x = self.conv1(x)
151
+ x = self.bn1(x)
152
+ x = self.prelu(x)
153
+ x = self.layer1(x)
154
+ x = self.layer2(x)
155
+ x = self.layer3(x)
156
+ x = self.layer4(x)
157
+ x = self.bn2(x)
158
+ x = torch.flatten(x, 1)
159
+ x = self.dropout(x)
160
+ x = self.fc(x.float() if self.fp16 else x)
161
+ x = self.features(x)
162
+ return x
163
+
164
+
165
+ def _iresnet(arch, block, layers, pretrained, progress, **kwargs):
166
+ model = IResNet(block, layers, **kwargs)
167
+ if pretrained:
168
+ raise ValueError()
169
+ return model
170
+
171
+
172
+ def iresnet18(pretrained=False, progress=True, **kwargs):
173
+ return _iresnet('iresnet18', IBasicBlock, [2, 2, 2, 2], pretrained,
174
+ progress, **kwargs)
175
+
176
+
177
+ def iresnet34(pretrained=False, progress=True, **kwargs):
178
+ return _iresnet('iresnet34', IBasicBlock, [3, 4, 6, 3], pretrained,
179
+ progress, **kwargs)
180
+
181
+
182
+ def iresnet50(pretrained=False, progress=True, **kwargs):
183
+ return _iresnet('iresnet50', IBasicBlock, [3, 4, 14, 3], pretrained,
184
+ progress, **kwargs)
185
+
186
+
187
+ def iresnet100(pretrained=False, progress=True, **kwargs):
188
+ return _iresnet('iresnet100', IBasicBlock, [3, 13, 30, 3], pretrained,
189
+ progress, **kwargs)
190
+
191
+
192
+ def iresnet200(pretrained=False, progress=True, **kwargs):
193
+ return _iresnet('iresnet200', IBasicBlock, [6, 26, 60, 6], pretrained,
194
+ progress, **kwargs)
deep_3drecon/deep_3drecon_models/arcface_torch/backbones/iresnet2060.py ADDED
@@ -0,0 +1,176 @@
1
+ import torch
2
+ from torch import nn
3
+
4
+ assert torch.__version__ >= "1.8.1"
5
+ from torch.utils.checkpoint import checkpoint_sequential
6
+
7
+ __all__ = ['iresnet2060']
8
+
9
+
10
+ def conv3x3(in_planes, out_planes, stride=1, groups=1, dilation=1):
11
+ """3x3 convolution with padding"""
12
+ return nn.Conv2d(in_planes,
13
+ out_planes,
14
+ kernel_size=3,
15
+ stride=stride,
16
+ padding=dilation,
17
+ groups=groups,
18
+ bias=False,
19
+ dilation=dilation)
20
+
21
+
22
+ def conv1x1(in_planes, out_planes, stride=1):
23
+ """1x1 convolution"""
24
+ return nn.Conv2d(in_planes,
25
+ out_planes,
26
+ kernel_size=1,
27
+ stride=stride,
28
+ bias=False)
29
+
30
+
31
+ class IBasicBlock(nn.Module):
32
+ expansion = 1
33
+
34
+ def __init__(self, inplanes, planes, stride=1, downsample=None,
35
+ groups=1, base_width=64, dilation=1):
36
+ super(IBasicBlock, self).__init__()
37
+ if groups != 1 or base_width != 64:
38
+ raise ValueError('BasicBlock only supports groups=1 and base_width=64')
39
+ if dilation > 1:
40
+ raise NotImplementedError("Dilation > 1 not supported in BasicBlock")
41
+ self.bn1 = nn.BatchNorm2d(inplanes, eps=1e-05, )
42
+ self.conv1 = conv3x3(inplanes, planes)
43
+ self.bn2 = nn.BatchNorm2d(planes, eps=1e-05, )
44
+ self.prelu = nn.PReLU(planes)
45
+ self.conv2 = conv3x3(planes, planes, stride)
46
+ self.bn3 = nn.BatchNorm2d(planes, eps=1e-05, )
47
+ self.downsample = downsample
48
+ self.stride = stride
49
+
50
+ def forward(self, x):
51
+ identity = x
52
+ out = self.bn1(x)
53
+ out = self.conv1(out)
54
+ out = self.bn2(out)
55
+ out = self.prelu(out)
56
+ out = self.conv2(out)
57
+ out = self.bn3(out)
58
+ if self.downsample is not None:
59
+ identity = self.downsample(x)
60
+ out += identity
61
+ return out
62
+
63
+
64
+ class IResNet(nn.Module):
65
+ fc_scale = 7 * 7
66
+
67
+ def __init__(self,
68
+ block, layers, dropout=0, num_features=512, zero_init_residual=False,
69
+ groups=1, width_per_group=64, replace_stride_with_dilation=None, fp16=False):
70
+ super(IResNet, self).__init__()
71
+ self.fp16 = fp16
72
+ self.inplanes = 64
73
+ self.dilation = 1
74
+ if replace_stride_with_dilation is None:
75
+ replace_stride_with_dilation = [False, False, False]
76
+ if len(replace_stride_with_dilation) != 3:
77
+ raise ValueError("replace_stride_with_dilation should be None "
78
+ "or a 3-element tuple, got {}".format(replace_stride_with_dilation))
79
+ self.groups = groups
80
+ self.base_width = width_per_group
81
+ self.conv1 = nn.Conv2d(3, self.inplanes, kernel_size=3, stride=1, padding=1, bias=False)
82
+ self.bn1 = nn.BatchNorm2d(self.inplanes, eps=1e-05)
83
+ self.prelu = nn.PReLU(self.inplanes)
84
+ self.layer1 = self._make_layer(block, 64, layers[0], stride=2)
85
+ self.layer2 = self._make_layer(block,
86
+ 128,
87
+ layers[1],
88
+ stride=2,
89
+ dilate=replace_stride_with_dilation[0])
90
+ self.layer3 = self._make_layer(block,
91
+ 256,
92
+ layers[2],
93
+ stride=2,
94
+ dilate=replace_stride_with_dilation[1])
95
+ self.layer4 = self._make_layer(block,
96
+ 512,
97
+ layers[3],
98
+ stride=2,
99
+ dilate=replace_stride_with_dilation[2])
100
+ self.bn2 = nn.BatchNorm2d(512 * block.expansion, eps=1e-05, )
101
+ self.dropout = nn.Dropout(p=dropout, inplace=True)
102
+ self.fc = nn.Linear(512 * block.expansion * self.fc_scale, num_features)
103
+ self.features = nn.BatchNorm1d(num_features, eps=1e-05)
104
+ nn.init.constant_(self.features.weight, 1.0)
105
+ self.features.weight.requires_grad = False
106
+
107
+ for m in self.modules():
108
+ if isinstance(m, nn.Conv2d):
109
+ nn.init.normal_(m.weight, 0, 0.1)
110
+ elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)):
111
+ nn.init.constant_(m.weight, 1)
112
+ nn.init.constant_(m.bias, 0)
113
+
114
+ if zero_init_residual:
115
+ for m in self.modules():
116
+ if isinstance(m, IBasicBlock):
117
+ nn.init.constant_(m.bn2.weight, 0)
118
+
119
+ def _make_layer(self, block, planes, blocks, stride=1, dilate=False):
120
+ downsample = None
121
+ previous_dilation = self.dilation
122
+ if dilate:
123
+ self.dilation *= stride
124
+ stride = 1
125
+ if stride != 1 or self.inplanes != planes * block.expansion:
126
+ downsample = nn.Sequential(
127
+ conv1x1(self.inplanes, planes * block.expansion, stride),
128
+ nn.BatchNorm2d(planes * block.expansion, eps=1e-05, ),
129
+ )
130
+ layers = []
131
+ layers.append(
132
+ block(self.inplanes, planes, stride, downsample, self.groups,
133
+ self.base_width, previous_dilation))
134
+ self.inplanes = planes * block.expansion
135
+ for _ in range(1, blocks):
136
+ layers.append(
137
+ block(self.inplanes,
138
+ planes,
139
+ groups=self.groups,
140
+ base_width=self.base_width,
141
+ dilation=self.dilation))
142
+
143
+ return nn.Sequential(*layers)
144
+
145
+ def checkpoint(self, func, num_seg, x):
146
+ if self.training:
147
+ return checkpoint_sequential(func, num_seg, x)
148
+ else:
149
+ return func(x)
150
+
151
+ def forward(self, x):
152
+ with torch.cuda.amp.autocast(self.fp16):
153
+ x = self.conv1(x)
154
+ x = self.bn1(x)
155
+ x = self.prelu(x)
156
+ x = self.layer1(x)
157
+ x = self.checkpoint(self.layer2, 20, x)
158
+ x = self.checkpoint(self.layer3, 100, x)
159
+ x = self.layer4(x)
160
+ x = self.bn2(x)
161
+ x = torch.flatten(x, 1)
162
+ x = self.dropout(x)
163
+ x = self.fc(x.float() if self.fp16 else x)
164
+ x = self.features(x)
165
+ return x
166
+
167
+
168
+ def _iresnet(arch, block, layers, pretrained, progress, **kwargs):
169
+ model = IResNet(block, layers, **kwargs)
170
+ if pretrained:
171
+ raise ValueError()
172
+ return model
173
+
174
+
175
+ def iresnet2060(pretrained=False, progress=True, **kwargs):
176
+ return _iresnet('iresnet2060', IBasicBlock, [3, 128, 1024 - 128, 3], pretrained, progress, **kwargs)
deep_3drecon/deep_3drecon_models/arcface_torch/backbones/mobilefacenet.py ADDED
@@ -0,0 +1,147 @@
1
+ '''
2
+ Adapted from https://github.com/cavalleria/cavaface.pytorch/blob/master/backbone/mobilefacenet.py
3
+ Original author cavalleria
4
+ '''
5
+
6
+ import torch.nn as nn
7
+ from torch.nn import Linear, Conv2d, BatchNorm1d, BatchNorm2d, PReLU, Sequential, Module
8
+ import torch
9
+
10
+
11
+ class Flatten(Module):
12
+ def forward(self, x):
13
+ return x.view(x.size(0), -1)
14
+
15
+
16
+ class ConvBlock(Module):
17
+ def __init__(self, in_c, out_c, kernel=(1, 1), stride=(1, 1), padding=(0, 0), groups=1):
18
+ super(ConvBlock, self).__init__()
19
+ self.layers = nn.Sequential(
20
+ Conv2d(in_c, out_c, kernel, groups=groups, stride=stride, padding=padding, bias=False),
21
+ BatchNorm2d(num_features=out_c),
22
+ PReLU(num_parameters=out_c)
23
+ )
24
+
25
+ def forward(self, x):
26
+ return self.layers(x)
27
+
28
+
29
+ class LinearBlock(Module):
30
+ def __init__(self, in_c, out_c, kernel=(1, 1), stride=(1, 1), padding=(0, 0), groups=1):
31
+ super(LinearBlock, self).__init__()
32
+ self.layers = nn.Sequential(
33
+ Conv2d(in_c, out_c, kernel, stride, padding, groups=groups, bias=False),
34
+ BatchNorm2d(num_features=out_c)
35
+ )
36
+
37
+ def forward(self, x):
38
+ return self.layers(x)
39
+
40
+
41
+ class DepthWise(Module):
42
+ def __init__(self, in_c, out_c, residual=False, kernel=(3, 3), stride=(2, 2), padding=(1, 1), groups=1):
43
+ super(DepthWise, self).__init__()
44
+ self.residual = residual
45
+ self.layers = nn.Sequential(
46
+ ConvBlock(in_c, out_c=groups, kernel=(1, 1), padding=(0, 0), stride=(1, 1)),
47
+ ConvBlock(groups, groups, groups=groups, kernel=kernel, padding=padding, stride=stride),
48
+ LinearBlock(groups, out_c, kernel=(1, 1), padding=(0, 0), stride=(1, 1))
49
+ )
50
+
51
+ def forward(self, x):
52
+ short_cut = None
53
+ if self.residual:
54
+ short_cut = x
55
+ x = self.layers(x)
56
+ if self.residual:
57
+ output = short_cut + x
58
+ else:
59
+ output = x
60
+ return output
61
+
62
+
63
+ class Residual(Module):
64
+ def __init__(self, c, num_block, groups, kernel=(3, 3), stride=(1, 1), padding=(1, 1)):
65
+ super(Residual, self).__init__()
66
+ modules = []
67
+ for _ in range(num_block):
68
+ modules.append(DepthWise(c, c, True, kernel, stride, padding, groups))
69
+ self.layers = Sequential(*modules)
70
+
71
+ def forward(self, x):
72
+ return self.layers(x)
73
+
74
+
75
+ class GDC(Module):
76
+ def __init__(self, embedding_size):
77
+ super(GDC, self).__init__()
78
+ self.layers = nn.Sequential(
79
+ LinearBlock(512, 512, groups=512, kernel=(7, 7), stride=(1, 1), padding=(0, 0)),
80
+ Flatten(),
81
+ Linear(512, embedding_size, bias=False),
82
+ BatchNorm1d(embedding_size))
83
+
84
+ def forward(self, x):
85
+ return self.layers(x)
86
+
87
+
88
+ class MobileFaceNet(Module):
89
+ def __init__(self, fp16=False, num_features=512, blocks=(1, 4, 6, 2), scale=2):
90
+ super(MobileFaceNet, self).__init__()
91
+ self.scale = scale
92
+ self.fp16 = fp16
93
+ self.layers = nn.ModuleList()
94
+ self.layers.append(
95
+ ConvBlock(3, 64 * self.scale, kernel=(3, 3), stride=(2, 2), padding=(1, 1))
96
+ )
97
+ if blocks[0] == 1:
98
+ self.layers.append(
99
+ ConvBlock(64 * self.scale, 64 * self.scale, kernel=(3, 3), stride=(1, 1), padding=(1, 1), groups=64)
100
+ )
101
+ else:
102
+ self.layers.append(
103
+ Residual(64 * self.scale, num_block=blocks[0], groups=128, kernel=(3, 3), stride=(1, 1), padding=(1, 1)),
104
+ )
105
+
106
+ self.layers.extend(
107
+ [
108
+ DepthWise(64 * self.scale, 64 * self.scale, kernel=(3, 3), stride=(2, 2), padding=(1, 1), groups=128),
109
+ Residual(64 * self.scale, num_block=blocks[1], groups=128, kernel=(3, 3), stride=(1, 1), padding=(1, 1)),
110
+ DepthWise(64 * self.scale, 128 * self.scale, kernel=(3, 3), stride=(2, 2), padding=(1, 1), groups=256),
111
+ Residual(128 * self.scale, num_block=blocks[2], groups=256, kernel=(3, 3), stride=(1, 1), padding=(1, 1)),
112
+ DepthWise(128 * self.scale, 128 * self.scale, kernel=(3, 3), stride=(2, 2), padding=(1, 1), groups=512),
113
+ Residual(128 * self.scale, num_block=blocks[3], groups=256, kernel=(3, 3), stride=(1, 1), padding=(1, 1)),
114
+ ])
115
+
116
+ self.conv_sep = ConvBlock(128 * self.scale, 512, kernel=(1, 1), stride=(1, 1), padding=(0, 0))
117
+ self.features = GDC(num_features)
118
+ self._initialize_weights()
119
+
120
+ def _initialize_weights(self):
121
+ for m in self.modules():
122
+ if isinstance(m, nn.Conv2d):
123
+ nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
124
+ if m.bias is not None:
125
+ m.bias.data.zero_()
126
+ elif isinstance(m, nn.BatchNorm2d):
127
+ m.weight.data.fill_(1)
128
+ m.bias.data.zero_()
129
+ elif isinstance(m, nn.Linear):
130
+ nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
131
+ if m.bias is not None:
132
+ m.bias.data.zero_()
133
+
134
+ def forward(self, x):
135
+ with torch.cuda.amp.autocast(self.fp16):
136
+ for func in self.layers:
137
+ x = func(x)
138
+ x = self.conv_sep(x.float() if self.fp16 else x)
139
+ x = self.features(x)
140
+ return x
141
+
142
+
143
+ def get_mbf(fp16, num_features, blocks=(1, 4, 6, 2), scale=2):
144
+ return MobileFaceNet(fp16, num_features, blocks, scale=scale)
145
+
146
+ def get_mbf_large(fp16, num_features, blocks=(2, 8, 12, 4), scale=4):
147
+ return MobileFaceNet(fp16, num_features, blocks, scale=scale)
deep_3drecon/deep_3drecon_models/arcface_torch/backbones/vit.py ADDED
@@ -0,0 +1,280 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ from timm.models.layers import DropPath, to_2tuple, trunc_normal_
4
+ from typing import Optional, Callable
5
+
6
+ class Mlp(nn.Module):
7
+ def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.ReLU6, drop=0.):
8
+ super().__init__()
9
+ out_features = out_features or in_features
10
+ hidden_features = hidden_features or in_features
11
+ self.fc1 = nn.Linear(in_features, hidden_features)
12
+ self.act = act_layer()
13
+ self.fc2 = nn.Linear(hidden_features, out_features)
14
+ self.drop = nn.Dropout(drop)
15
+
16
+ def forward(self, x):
17
+ x = self.fc1(x)
18
+ x = self.act(x)
19
+ x = self.drop(x)
20
+ x = self.fc2(x)
21
+ x = self.drop(x)
22
+ return x
23
+
24
+
25
+ class VITBatchNorm(nn.Module):
26
+ def __init__(self, num_features):
27
+ super().__init__()
28
+ self.num_features = num_features
29
+ self.bn = nn.BatchNorm1d(num_features=num_features)
30
+
31
+ def forward(self, x):
32
+ return self.bn(x)
33
+
34
+
35
+ class Attention(nn.Module):
36
+ def __init__(self,
37
+ dim: int,
38
+ num_heads: int = 8,
39
+ qkv_bias: bool = False,
40
+ qk_scale: Optional[None] = None,
41
+ attn_drop: float = 0.,
42
+ proj_drop: float = 0.):
43
+ super().__init__()
44
+ self.num_heads = num_heads
45
+ head_dim = dim // num_heads
46
+ # NOTE scale factor was wrong in my original version, can set manually to be compat with prev weights
47
+ self.scale = qk_scale or head_dim ** -0.5
48
+
49
+ self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
50
+ self.attn_drop = nn.Dropout(attn_drop)
51
+ self.proj = nn.Linear(dim, dim)
52
+ self.proj_drop = nn.Dropout(proj_drop)
53
+
54
+ def forward(self, x):
55
+
56
+ with torch.cuda.amp.autocast(True):
57
+ batch_size, num_token, embed_dim = x.shape
58
+ #qkv is [3,batch_size,num_heads,num_token, embed_dim//num_heads]
59
+ qkv = self.qkv(x).reshape(
60
+ batch_size, num_token, 3, self.num_heads, embed_dim // self.num_heads).permute(2, 0, 3, 1, 4)
61
+ with torch.cuda.amp.autocast(False):
62
+ q, k, v = qkv[0].float(), qkv[1].float(), qkv[2].float()
63
+ attn = (q @ k.transpose(-2, -1)) * self.scale
64
+ attn = attn.softmax(dim=-1)
65
+ attn = self.attn_drop(attn)
66
+ x = (attn @ v).transpose(1, 2).reshape(batch_size, num_token, embed_dim)
67
+ with torch.cuda.amp.autocast(True):
68
+ x = self.proj(x)
69
+ x = self.proj_drop(x)
70
+ return x
71
+
72
+
73
+ class Block(nn.Module):
74
+
75
+ def __init__(self,
76
+ dim: int,
77
+ num_heads: int,
78
+ num_patches: int,
79
+ mlp_ratio: float = 4.,
80
+ qkv_bias: bool = False,
81
+ qk_scale: Optional[None] = None,
82
+ drop: float = 0.,
83
+ attn_drop: float = 0.,
84
+ drop_path: float = 0.,
85
+ act_layer: Callable = nn.ReLU6,
86
+ norm_layer: str = "ln",
87
+ patch_n: int = 144):
88
+ super().__init__()
89
+
90
+ if norm_layer == "bn":
91
+ self.norm1 = VITBatchNorm(num_features=num_patches)
92
+ self.norm2 = VITBatchNorm(num_features=num_patches)
93
+ elif norm_layer == "ln":
94
+ self.norm1 = nn.LayerNorm(dim)
95
+ self.norm2 = nn.LayerNorm(dim)
96
+
97
+ self.attn = Attention(
98
+ dim, num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=drop)
99
+ # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here
100
+ self.drop_path = DropPath(
101
+ drop_path) if drop_path > 0. else nn.Identity()
102
+ mlp_hidden_dim = int(dim * mlp_ratio)
103
+ self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim,
104
+ act_layer=act_layer, drop=drop)
105
+ self.extra_gflops = (num_heads * patch_n * (dim//num_heads)*patch_n * 2) / (1000**3)
106
+
107
+ def forward(self, x):
108
+ x = x + self.drop_path(self.attn(self.norm1(x)))
109
+ with torch.cuda.amp.autocast(True):
110
+ x = x + self.drop_path(self.mlp(self.norm2(x)))
111
+ return x
112
+
113
+
114
+ class PatchEmbed(nn.Module):
115
+ def __init__(self, img_size=108, patch_size=9, in_channels=3, embed_dim=768):
116
+ super().__init__()
117
+ img_size = to_2tuple(img_size)
118
+ patch_size = to_2tuple(patch_size)
119
+ num_patches = (img_size[1] // patch_size[1]) * \
120
+ (img_size[0] // patch_size[0])
121
+ self.img_size = img_size
122
+ self.patch_size = patch_size
123
+ self.num_patches = num_patches
124
+ self.proj = nn.Conv2d(in_channels, embed_dim,
125
+ kernel_size=patch_size, stride=patch_size)
126
+
127
+ def forward(self, x):
128
+ batch_size, channels, height, width = x.shape
129
+ assert height == self.img_size[0] and width == self.img_size[1], \
130
+ f"Input image size ({height}*{width}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})."
131
+ x = self.proj(x).flatten(2).transpose(1, 2)
132
+ return x
133
+
134
+
135
+ class VisionTransformer(nn.Module):
136
+ """ Vision Transformer with support for patch or hybrid CNN input stage
137
+ """
138
+
139
+ def __init__(self,
140
+ img_size: int = 112,
141
+ patch_size: int = 16,
142
+ in_channels: int = 3,
143
+ num_classes: int = 1000,
144
+ embed_dim: int = 768,
145
+ depth: int = 12,
146
+ num_heads: int = 12,
147
+ mlp_ratio: float = 4.,
148
+ qkv_bias: bool = False,
149
+ qk_scale: Optional[None] = None,
150
+ drop_rate: float = 0.,
151
+ attn_drop_rate: float = 0.,
152
+ drop_path_rate: float = 0.,
153
+ hybrid_backbone: Optional[None] = None,
154
+ norm_layer: str = "ln",
155
+ mask_ratio = 0.1,
156
+ using_checkpoint = False,
157
+ ):
158
+ super().__init__()
159
+ self.num_classes = num_classes
160
+ # num_features for consistency with other models
161
+ self.num_features = self.embed_dim = embed_dim
162
+
163
+ if hybrid_backbone is not None:
164
+ raise ValueError
165
+ else:
166
+ self.patch_embed = PatchEmbed(img_size=img_size, patch_size=patch_size, in_channels=in_channels, embed_dim=embed_dim)
167
+ self.mask_ratio = mask_ratio
168
+ self.using_checkpoint = using_checkpoint
169
+ num_patches = self.patch_embed.num_patches
170
+ self.num_patches = num_patches
171
+
172
+ self.pos_embed = nn.Parameter(torch.zeros(1, num_patches, embed_dim))
173
+ self.pos_drop = nn.Dropout(p=drop_rate)
174
+
175
+ # stochastic depth decay rule
176
+ dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)]
177
+ patch_n = (img_size//patch_size)**2
178
+ self.blocks = nn.ModuleList(
179
+ [
180
+ Block(dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale,
181
+ drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i], norm_layer=norm_layer,
182
+ num_patches=num_patches, patch_n=patch_n)
183
+ for i in range(depth)]
184
+ )
185
+ self.extra_gflops = 0.0
186
+ for _block in self.blocks:
187
+ self.extra_gflops += _block.extra_gflops
188
+
189
+ if norm_layer == "ln":
190
+ self.norm = nn.LayerNorm(embed_dim)
191
+ elif norm_layer == "bn":
192
+ self.norm = VITBatchNorm(self.num_patches)
193
+
194
+ # features head
195
+ self.feature = nn.Sequential(
196
+ nn.Linear(in_features=embed_dim * num_patches, out_features=embed_dim, bias=False),
197
+ nn.BatchNorm1d(num_features=embed_dim, eps=2e-5),
198
+ nn.Linear(in_features=embed_dim, out_features=num_classes, bias=False),
199
+ nn.BatchNorm1d(num_features=num_classes, eps=2e-5)
200
+ )
201
+
202
+ self.mask_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
203
+ torch.nn.init.normal_(self.mask_token, std=.02)
204
+ trunc_normal_(self.pos_embed, std=.02)
205
+ # trunc_normal_(self.cls_token, std=.02)
206
+ self.apply(self._init_weights)
207
+
208
+ def _init_weights(self, m):
209
+ if isinstance(m, nn.Linear):
210
+ trunc_normal_(m.weight, std=.02)
211
+ if isinstance(m, nn.Linear) and m.bias is not None:
212
+ nn.init.constant_(m.bias, 0)
213
+ elif isinstance(m, nn.LayerNorm):
214
+ nn.init.constant_(m.bias, 0)
215
+ nn.init.constant_(m.weight, 1.0)
216
+
217
+ @torch.jit.ignore
218
+ def no_weight_decay(self):
219
+ return {'pos_embed', 'cls_token'}
220
+
221
+ def get_classifier(self):
222
+ return self.head
223
+
224
+ def random_masking(self, x, mask_ratio=0.1):
225
+ """
226
+ Perform per-sample random masking by per-sample shuffling.
227
+ Per-sample shuffling is done by argsort random noise.
228
+ x: [N, L, D], sequence
229
+ """
230
+ N, L, D = x.size() # batch, length, dim
231
+ len_keep = int(L * (1 - mask_ratio))
232
+
233
+ noise = torch.rand(N, L, device=x.device) # noise in [0, 1]
234
+
235
+ # sort noise for each sample
236
+ # ascend: small is keep, large is remove
237
+ ids_shuffle = torch.argsort(noise, dim=1)
238
+ ids_restore = torch.argsort(ids_shuffle, dim=1)
239
+
240
+ # keep the first subset
241
+ ids_keep = ids_shuffle[:, :len_keep]
242
+ x_masked = torch.gather(
243
+ x, dim=1, index=ids_keep.unsqueeze(-1).repeat(1, 1, D))
244
+
245
+ # generate the binary mask: 0 is keep, 1 is remove
246
+ mask = torch.ones([N, L], device=x.device)
247
+ mask[:, :len_keep] = 0
248
+ # unshuffle to get the binary mask
249
+ mask = torch.gather(mask, dim=1, index=ids_restore)
250
+
251
+ return x_masked, mask, ids_restore
252
+
253
+ def forward_features(self, x):
254
+ B = x.shape[0]
255
+ x = self.patch_embed(x)
256
+ x = x + self.pos_embed
257
+ x = self.pos_drop(x)
258
+
259
+ if self.training and self.mask_ratio > 0:
260
+ x, _, ids_restore = self.random_masking(x)
261
+
262
+ for func in self.blocks:
263
+ if self.using_checkpoint and self.training:
264
+ from torch.utils.checkpoint import checkpoint
265
+ x = checkpoint(func, x)
266
+ else:
267
+ x = func(x)
268
+ x = self.norm(x.float())
269
+
270
+ if self.training and self.mask_ratio > 0:
271
+ mask_tokens = self.mask_token.repeat(x.shape[0], ids_restore.shape[1] - x.shape[1], 1)
272
+ x_ = torch.cat([x[:, :, :], mask_tokens], dim=1) # no cls token
273
+ x_ = torch.gather(x_, dim=1, index=ids_restore.unsqueeze(-1).repeat(1, 1, x.shape[2])) # unshuffle
274
+ x = x_
275
+ return torch.reshape(x, (B, self.num_patches * self.embed_dim))
276
+
277
+ def forward(self, x):
278
+ x = self.forward_features(x)
279
+ x = self.feature(x)
280
+ return x