Akira00 committed
Commit
721e031
1 Parent(s): 9ee3aaa

Upload folder using huggingface_hub

This view is limited to 50 files because it contains too many changes.
Files changed (50)
  1. .gitattributes +26 -0
  2. .gitignore +179 -0
  3. .idea/.gitignore +8 -0
  4. .idea/TANGO.iml +12 -0
  5. .idea/inspectionProfiles/profiles_settings.xml +6 -0
  6. .idea/modules.xml +8 -0
  7. .idea/vcs.xml +8 -0
  8. .idea/workspace.xml +114 -0
  9. LICENSE +407 -0
  10. README.md +111 -12
  11. app.py +796 -0
  12. assets/app.py +149 -0
  13. assets/demo0.gif +3 -0
  14. assets/demo1.gif +3 -0
  15. assets/demo2.gif +3 -0
  16. assets/demo3.gif +3 -0
  17. assets/demo5.gif +3 -0
  18. assets/demo6.gif +3 -0
  19. assets/demo7.gif +3 -0
  20. assets/demo8.gif +3 -0
  21. assets/demo9.gif +3 -0
  22. assets/hg.png +3 -0
  23. assets/inference.py +125 -0
  24. assets/transforms.py +344 -0
  25. assets/video.png +3 -0
  26. audio_0_retri_0_watermarked.mp4 +3 -0
  27. configs/gradio.yaml +77 -0
  28. configs/gradio_speaker1.yaml +77 -0
  29. configs/gradio_speaker7.yaml +77 -0
  30. configs/gradio_speaker8.yaml +77 -0
  31. configs/gradio_speaker9.yaml +77 -0
  32. create_graph.py +507 -0
  33. datasets/beat2_v5.py +80 -0
  34. datasets/cached_audio/101099-00_18_09-00_18_19.mp4 +3 -0
  35. datasets/cached_audio/1wrQ6Msp7wM_00-00-39.69_00-00-45.68.mp4 +3 -0
  36. datasets/cached_audio/demo0.mp4 +0 -0
  37. datasets/cached_audio/demo1.mp4 +3 -0
  38. datasets/cached_audio/demo2.mp4 +0 -0
  39. datasets/cached_audio/demo3.mp4 +3 -0
  40. datasets/cached_audio/demo4.mp4 +3 -0
  41. datasets/cached_audio/demo5.mp4 +3 -0
  42. datasets/cached_audio/demo6.mp4 +0 -0
  43. datasets/cached_audio/demo7.mp4 +3 -0
  44. datasets/cached_audio/demo8.mp4 +0 -0
  45. datasets/cached_audio/demo9.mp4 +3 -0
  46. datasets/cached_audio/example_female_voice_9_seconds.wav +0 -0
  47. datasets/cached_audio/example_male_voice_9_seconds.wav +0 -0
  48. datasets/cached_audio/female_test_V1.mp4 +3 -0
  49. datasets/cached_audio/speaker12_10_BVHw8aCPATM_00-01-05.0_00-01-10.0.mp4 +3 -0
  50. datasets/cached_audio/speaker7_iuYlGRnC7J8_00-00-0.00_00-00-3.25.mp4 +3 -0
.gitattributes CHANGED
@@ -33,3 +33,29 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ assets/demo0.gif filter=lfs diff=lfs merge=lfs -text
+ assets/demo1.gif filter=lfs diff=lfs merge=lfs -text
+ assets/demo2.gif filter=lfs diff=lfs merge=lfs -text
+ assets/demo3.gif filter=lfs diff=lfs merge=lfs -text
+ assets/demo5.gif filter=lfs diff=lfs merge=lfs -text
+ assets/demo6.gif filter=lfs diff=lfs merge=lfs -text
+ assets/demo7.gif filter=lfs diff=lfs merge=lfs -text
+ assets/demo8.gif filter=lfs diff=lfs merge=lfs -text
+ assets/demo9.gif filter=lfs diff=lfs merge=lfs -text
+ assets/hg.png filter=lfs diff=lfs merge=lfs -text
+ assets/video.png filter=lfs diff=lfs merge=lfs -text
+ audio_0_retri_0_watermarked.mp4 filter=lfs diff=lfs merge=lfs -text
+ datasets/cached_audio/101099-00_18_09-00_18_19.mp4 filter=lfs diff=lfs merge=lfs -text
+ datasets/cached_audio/1wrQ6Msp7wM_00-00-39.69_00-00-45.68.mp4 filter=lfs diff=lfs merge=lfs -text
+ datasets/cached_audio/demo1.mp4 filter=lfs diff=lfs merge=lfs -text
+ datasets/cached_audio/demo3.mp4 filter=lfs diff=lfs merge=lfs -text
+ datasets/cached_audio/demo4.mp4 filter=lfs diff=lfs merge=lfs -text
+ datasets/cached_audio/demo5.mp4 filter=lfs diff=lfs merge=lfs -text
+ datasets/cached_audio/demo7.mp4 filter=lfs diff=lfs merge=lfs -text
+ datasets/cached_audio/demo9.mp4 filter=lfs diff=lfs merge=lfs -text
+ datasets/cached_audio/female_test_V1.mp4 filter=lfs diff=lfs merge=lfs -text
+ datasets/cached_audio/speaker12_10_BVHw8aCPATM_00-01-05.0_00-01-10.0.mp4 filter=lfs diff=lfs merge=lfs -text
+ datasets/cached_audio/speaker7_iuYlGRnC7J8_00-00-0.00_00-00-3.25.mp4 filter=lfs diff=lfs merge=lfs -text
+ datasets/cached_audio/speaker9_o7Ik1OB4TaE_00-00-38.15_00-00-42.33.mp4 filter=lfs diff=lfs merge=lfs -text
+ datasets/data_json/show-oliver-s40_w128.json filter=lfs diff=lfs merge=lfs -text
+ datasets/data_json/show-oliver-s40_w64.json filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,179 @@
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ *.py[cod]
+ *$py.class
+
+ # C extensions
+ *.so
+
+ # Distribution / packaging
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ share/python-wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ MANIFEST
+
+
+ # watermarked videos will be saved at root directory, but we don't want to track them
+ demo*watermarked.mp4
+ outputs/
+ SMPLer-X/common/utils/human_model_files/
+ SMPLer-X/pretrained_models/
+ Wav2Lip/checkpoints/
+ datasets/cached_ckpts/
+ datasets/cached_graph/
+ emage/smplx_models/
+ frame-interpolation-pytorch/*.pt
+
+ # submodules
+ Wav2Lip/
+ frame-interpolation-pytorch/
+ SMPLer-X/
+
+ # PyInstaller
+ # Usually these files are written by a python script from a template
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
+ *.manifest
+ *.spec
+
+ # Installer logs
+ pip-log.txt
+ pip-delete-this-directory.txt
+
+ # Unit test / coverage reports
+ htmlcov/
+ .tox/
+ .nox/
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.cover
+ *.py,cover
+ .hypothesis/
+ .pytest_cache/
+ cover/
+
+ # Translations
+ *.mo
+ *.pot
+
+ # Django stuff:
+ *.log
+ local_settings.py
+ db.sqlite3
+ db.sqlite3-journal
+
+ # Flask stuff:
+ instance/
+ .webassets-cache
+
+ # Scrapy stuff:
+ .scrapy
+
+ # Sphinx documentation
+ docs/_build/
+
+ # PyBuilder
+ .pybuilder/
+ target/
+
+ # Jupyter Notebook
+ .ipynb_checkpoints
+
+ # IPython
+ profile_default/
+ ipython_config.py
+
+ # pyenv
+ # For a library or package, you might want to ignore these files since the code is
+ # intended to run in multiple environments; otherwise, check them in:
+ # .python-version
+
+ # pipenv
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
+ # install all needed dependencies.
+ #Pipfile.lock
+
+ # poetry
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
+ # commonly ignored for libraries.
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+ #poetry.lock
+
+ # pdm
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+ #pdm.lock
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+ # in version control.
+ # https://pdm.fming.dev/latest/usage/project/#working-with-version-control
+ .pdm.toml
+ .pdm-python
+ .pdm-build/
+
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+ __pypackages__/
+
+ # Celery stuff
+ celerybeat-schedule
+ celerybeat.pid
+
+ # SageMath parsed files
+ *.sage.py
+
+ # Environments
+ .env
+ .venv
+ env/
+ venv/
+ ENV/
+ env.bak/
+ venv.bak/
+
+ # Spyder project settings
+ .spyderproject
+ .spyproject
+
+ # Rope project settings
+ .ropeproject
+
+ # mkdocs documentation
+ /site
+
+ # mypy
+ .mypy_cache/
+ .dmypy.json
+ dmypy.json
+
+ # Pyre type checker
+ .pyre/
+
+ # pytype static type analyzer
+ .pytype/
+
+ # Cython debug symbols
+ cython_debug/
+
+ # PyCharm
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
+ #.idea/
.idea/.gitignore ADDED
@@ -0,0 +1,8 @@
+ # Default ignored files
+ /shelf/
+ /workspace.xml
+ # Editor-based HTTP Client requests
+ /httpRequests/
+ # Datasource local storage ignored files
+ /dataSources/
+ /dataSources.local.xml
.idea/TANGO.iml ADDED
@@ -0,0 +1,12 @@
+ <?xml version="1.0" encoding="UTF-8"?>
+ <module type="PYTHON_MODULE" version="4">
+ <component name="NewModuleRootManager">
+ <content url="file://$MODULE_DIR$" />
+ <orderEntry type="inheritedJdk" />
+ <orderEntry type="sourceFolder" forTests="false" />
+ </component>
+ <component name="PyDocumentationSettings">
+ <option name="format" value="PLAIN" />
+ <option name="myDocStringFormat" value="Plain" />
+ </component>
+ </module>
.idea/inspectionProfiles/profiles_settings.xml ADDED
@@ -0,0 +1,6 @@
+ <component name="InspectionProjectProfileManager">
+ <settings>
+ <option name="USE_PROJECT_PROFILE" value="false" />
+ <version value="1.0" />
+ </settings>
+ </component>
.idea/modules.xml ADDED
@@ -0,0 +1,8 @@
+ <?xml version="1.0" encoding="UTF-8"?>
+ <project version="4">
+ <component name="ProjectModuleManager">
+ <modules>
+ <module fileurl="file://$PROJECT_DIR$/.idea/TANGO.iml" filepath="$PROJECT_DIR$/.idea/TANGO.iml" />
+ </modules>
+ </component>
+ </project>
.idea/vcs.xml ADDED
@@ -0,0 +1,8 @@
+ <?xml version="1.0" encoding="UTF-8"?>
+ <project version="4">
+ <component name="VcsDirectoryMappings">
+ <mapping directory="" vcs="Git" />
+ <mapping directory="$PROJECT_DIR$/Wav2Lip" vcs="Git" />
+ <mapping directory="$PROJECT_DIR$/frame-interpolation-pytorch" vcs="Git" />
+ </component>
+ </project>
.idea/workspace.xml ADDED
@@ -0,0 +1,114 @@
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <project version="4">
3
+ <component name="AutoImportSettings">
4
+ <option name="autoReloadType" value="SELECTIVE" />
5
+ </component>
6
+ <component name="ChangeListManager">
7
+ <list default="true" id="09d685d1-f444-43d5-acad-200263565083" name="更改" comment="">
8
+ <change beforePath="$PROJECT_DIR$/requirements.txt" beforeDir="false" afterPath="$PROJECT_DIR$/requirements.txt" afterDir="false" />
9
+ </list>
10
+ <option name="SHOW_DIALOG" value="false" />
11
+ <option name="HIGHLIGHT_CONFLICTS" value="true" />
12
+ <option name="HIGHLIGHT_NON_ACTIVE_CHANGELIST" value="false" />
13
+ <option name="LAST_RESOLUTION" value="IGNORE" />
14
+ </component>
15
+ <component name="Git.Settings">
16
+ <option name="RECENT_GIT_ROOT_PATH" value="$PROJECT_DIR$" />
17
+ </component>
18
+ <component name="ProjectColorInfo">{
19
+ &quot;associatedIndex&quot;: 5
20
+ }</component>
21
+ <component name="ProjectId" id="2pVN1AmCm9xKUXNffpEopNvIJF5" />
22
+ <component name="ProjectViewState">
23
+ <option name="hideEmptyMiddlePackages" value="true" />
24
+ <option name="showLibraryContents" value="true" />
25
+ </component>
26
+ <component name="PropertiesComponent">{
27
+ &quot;keyToString&quot;: {
28
+ &quot;RunOnceActivity.ShowReadmeOnStart&quot;: &quot;true&quot;,
29
+ &quot;git-widget-placeholder&quot;: &quot;main&quot;,
30
+ &quot;last_opened_file_path&quot;: &quot;C:/Users/45488/OneDrive/桌面/yzb/TANGO/emage/smplx_models/smplx&quot;,
31
+ &quot;node.js.detected.package.eslint&quot;: &quot;true&quot;,
32
+ &quot;node.js.detected.package.tslint&quot;: &quot;true&quot;,
33
+ &quot;node.js.selected.package.eslint&quot;: &quot;(autodetect)&quot;,
34
+ &quot;node.js.selected.package.tslint&quot;: &quot;(autodetect)&quot;,
35
+ &quot;nodejs_package_manager_path&quot;: &quot;npm&quot;,
36
+ &quot;vue.rearranger.settings.migration&quot;: &quot;true&quot;
37
+ }
38
+ }</component>
39
+ <component name="RecentsManager">
40
+ <key name="CopyFile.RECENT_KEYS">
41
+ <recent name="C:\Users\45488\OneDrive\桌面\yzb\TANGO\emage\smplx_models\smplx" />
42
+ <recent name="C:\Users\45488\OneDrive\桌面\yzb\TANGO\datasets\cached_graph\youtube_test" />
43
+ <recent name="C:\Users\45488\OneDrive\桌面\yzb\TANGO\datasets\cached_graph\show_oliver_test" />
44
+ </key>
45
+ </component>
46
+ <component name="RunManager">
47
+ <configuration name="create_graph" type="PythonConfigurationType" factoryName="Python" temporary="true" nameIsGenerated="true">
48
+ <module name="TANGO" />
49
+ <option name="ENV_FILES" value="" />
50
+ <option name="INTERPRETER_OPTIONS" value="" />
51
+ <option name="PARENT_ENVS" value="true" />
52
+ <envs>
53
+ <env name="PYTHONUNBUFFERED" value="1" />
54
+ </envs>
55
+ <option name="SDK_HOME" value="" />
56
+ <option name="WORKING_DIRECTORY" value="$PROJECT_DIR$" />
57
+ <option name="IS_MODULE_SDK" value="true" />
58
+ <option name="ADD_CONTENT_ROOTS" value="true" />
59
+ <option name="ADD_SOURCE_ROOTS" value="true" />
60
+ <EXTENSION ID="PythonCoverageRunConfigurationExtension" runner="coverage.py" />
61
+ <option name="SCRIPT_NAME" value="$PROJECT_DIR$/create_graph.py" />
62
+ <option name="PARAMETERS" value="" />
63
+ <option name="SHOW_COMMAND_LINE" value="false" />
64
+ <option name="EMULATE_TERMINAL" value="false" />
65
+ <option name="MODULE_MODE" value="false" />
66
+ <option name="REDIRECT_INPUT" value="false" />
67
+ <option name="INPUT_FILE" value="" />
68
+ <method v="2" />
69
+ </configuration>
70
+ <recent_temporary>
71
+ <list>
72
+ <item itemvalue="Python.create_graph" />
73
+ </list>
74
+ </recent_temporary>
75
+ </component>
76
+ <component name="SharedIndexes">
77
+ <attachedChunks>
78
+ <set>
79
+ <option value="bundled-js-predefined-d6986cc7102b-e768b9ed790e-JavaScript-PY-243.21565.199" />
80
+ <option value="bundled-python-sdk-cab1f2013843-4ae2d6a61b08-com.jetbrains.pycharm.pro.sharedIndexes.bundled-PY-243.21565.199" />
81
+ </set>
82
+ </attachedChunks>
83
+ </component>
84
+ <component name="SpellCheckerSettings" RuntimeDictionaries="0" Folders="0" CustomDictionaries="0" DefaultDictionary="应用程序级" UseSingleDictionary="true" transferred="true" />
85
+ <component name="TaskManager">
86
+ <task active="true" id="Default" summary="默认任务">
87
+ <changelist id="09d685d1-f444-43d5-acad-200263565083" name="更改" comment="" />
88
+ <created>1732847065796</created>
89
+ <option name="number" value="Default" />
90
+ <option name="presentableId" value="Default" />
91
+ <updated>1732847065796</updated>
92
+ <workItem from="1732847068636" duration="14699000" />
93
+ <workItem from="1733105812677" duration="3426000" />
94
+ <workItem from="1733110257766" duration="6291000" />
95
+ <workItem from="1733124449966" duration="296000" />
96
+ <workItem from="1733304778417" duration="892000" />
97
+ </task>
98
+ <servers />
99
+ </component>
100
+ <component name="TypeScriptGeneratedFilesManager">
101
+ <option name="version" value="3" />
102
+ </component>
103
+ <component name="XDebuggerManager">
104
+ <breakpoint-manager>
105
+ <breakpoints>
106
+ <line-breakpoint enabled="true" suspend="THREAD" type="python-line">
107
+ <url>file://$PROJECT_DIR$/create_graph.py</url>
108
+ <line>460</line>
109
+ <option name="timeStamp" value="1" />
110
+ </line-breakpoint>
111
+ </breakpoints>
112
+ </breakpoint-manager>
113
+ </component>
114
+ </project>
LICENSE ADDED
@@ -0,0 +1,407 @@
1
+ Attribution-NonCommercial 4.0 International
2
+
3
+ =======================================================================
4
+
5
+ Creative Commons Corporation ("Creative Commons") is not a law firm and
6
+ does not provide legal services or legal advice. Distribution of
7
+ Creative Commons public licenses does not create a lawyer-client or
8
+ other relationship. Creative Commons makes its licenses and related
9
+ information available on an "as-is" basis. Creative Commons gives no
10
+ warranties regarding its licenses, any material licensed under their
11
+ terms and conditions, or any related information. Creative Commons
12
+ disclaims all liability for damages resulting from their use to the
13
+ fullest extent possible.
14
+
15
+ Using Creative Commons Public Licenses
16
+
17
+ Creative Commons public licenses provide a standard set of terms and
18
+ conditions that creators and other rights holders may use to share
19
+ original works of authorship and other material subject to copyright
20
+ and certain other rights specified in the public license below. The
21
+ following considerations are for informational purposes only, are not
22
+ exhaustive, and do not form part of our licenses.
23
+
24
+ Considerations for licensors: Our public licenses are
25
+ intended for use by those authorized to give the public
26
+ permission to use material in ways otherwise restricted by
27
+ copyright and certain other rights. Our licenses are
28
+ irrevocable. Licensors should read and understand the terms
29
+ and conditions of the license they choose before applying it.
30
+ Licensors should also secure all rights necessary before
31
+ applying our licenses so that the public can reuse the
32
+ material as expected. Licensors should clearly mark any
33
+ material not subject to the license. This includes other CC-
34
+ licensed material, or material used under an exception or
35
+ limitation to copyright. More considerations for licensors:
36
+ wiki.creativecommons.org/Considerations_for_licensors
37
+
38
+ Considerations for the public: By using one of our public
39
+ licenses, a licensor grants the public permission to use the
40
+ licensed material under specified terms and conditions. If
41
+ the licensor's permission is not necessary for any reason--for
42
+ example, because of any applicable exception or limitation to
43
+ copyright--then that use is not regulated by the license. Our
44
+ licenses grant only permissions under copyright and certain
45
+ other rights that a licensor has authority to grant. Use of
46
+ the licensed material may still be restricted for other
47
+ reasons, including because others have copyright or other
48
+ rights in the material. A licensor may make special requests,
49
+ such as asking that all changes be marked or described.
50
+ Although not required by our licenses, you are encouraged to
51
+ respect those requests where reasonable. More considerations
52
+ for the public:
53
+ wiki.creativecommons.org/Considerations_for_licensees
54
+
55
+ =======================================================================
56
+
57
+ Creative Commons Attribution-NonCommercial 4.0 International Public
58
+ License
59
+
60
+ By exercising the Licensed Rights (defined below), You accept and agree
61
+ to be bound by the terms and conditions of this Creative Commons
62
+ Attribution-NonCommercial 4.0 International Public License ("Public
63
+ License"). To the extent this Public License may be interpreted as a
64
+ contract, You are granted the Licensed Rights in consideration of Your
65
+ acceptance of these terms and conditions, and the Licensor grants You
66
+ such rights in consideration of benefits the Licensor receives from
67
+ making the Licensed Material available under these terms and
68
+ conditions.
69
+
70
+
71
+ Section 1 -- Definitions.
72
+
73
+ a. Adapted Material means material subject to Copyright and Similar
74
+ Rights that is derived from or based upon the Licensed Material
75
+ and in which the Licensed Material is translated, altered,
76
+ arranged, transformed, or otherwise modified in a manner requiring
77
+ permission under the Copyright and Similar Rights held by the
78
+ Licensor. For purposes of this Public License, where the Licensed
79
+ Material is a musical work, performance, or sound recording,
80
+ Adapted Material is always produced where the Licensed Material is
81
+ synched in timed relation with a moving image.
82
+
83
+ b. Adapter's License means the license You apply to Your Copyright
84
+ and Similar Rights in Your contributions to Adapted Material in
85
+ accordance with the terms and conditions of this Public License.
86
+
87
+ c. Copyright and Similar Rights means copyright and/or similar rights
88
+ closely related to copyright including, without limitation,
89
+ performance, broadcast, sound recording, and Sui Generis Database
90
+ Rights, without regard to how the rights are labeled or
91
+ categorized. For purposes of this Public License, the rights
92
+ specified in Section 2(b)(1)-(2) are not Copyright and Similar
93
+ Rights.
94
+ d. Effective Technological Measures means those measures that, in the
95
+ absence of proper authority, may not be circumvented under laws
96
+ fulfilling obligations under Article 11 of the WIPO Copyright
97
+ Treaty adopted on December 20, 1996, and/or similar international
98
+ agreements.
99
+
100
+ e. Exceptions and Limitations means fair use, fair dealing, and/or
101
+ any other exception or limitation to Copyright and Similar Rights
102
+ that applies to Your use of the Licensed Material.
103
+
104
+ f. Licensed Material means the artistic or literary work, database,
105
+ or other material to which the Licensor applied this Public
106
+ License.
107
+
108
+ g. Licensed Rights means the rights granted to You subject to the
109
+ terms and conditions of this Public License, which are limited to
110
+ all Copyright and Similar Rights that apply to Your use of the
111
+ Licensed Material and that the Licensor has authority to license.
112
+
113
+ h. Licensor means the individual(s) or entity(ies) granting rights
114
+ under this Public License.
115
+
116
+ i. NonCommercial means not primarily intended for or directed towards
117
+ commercial advantage or monetary compensation. For purposes of
118
+ this Public License, the exchange of the Licensed Material for
119
+ other material subject to Copyright and Similar Rights by digital
120
+ file-sharing or similar means is NonCommercial provided there is
121
+ no payment of monetary compensation in connection with the
122
+ exchange.
123
+
124
+ j. Share means to provide material to the public by any means or
125
+ process that requires permission under the Licensed Rights, such
126
+ as reproduction, public display, public performance, distribution,
127
+ dissemination, communication, or importation, and to make material
128
+ available to the public including in ways that members of the
129
+ public may access the material from a place and at a time
130
+ individually chosen by them.
131
+
132
+ k. Sui Generis Database Rights means rights other than copyright
133
+ resulting from Directive 96/9/EC of the European Parliament and of
134
+ the Council of 11 March 1996 on the legal protection of databases,
135
+ as amended and/or succeeded, as well as other essentially
136
+ equivalent rights anywhere in the world.
137
+
138
+ l. You means the individual or entity exercising the Licensed Rights
139
+ under this Public License. Your has a corresponding meaning.
140
+
141
+
142
+ Section 2 -- Scope.
143
+
144
+ a. License grant.
145
+
146
+ 1. Subject to the terms and conditions of this Public License,
147
+ the Licensor hereby grants You a worldwide, royalty-free,
148
+ non-sublicensable, non-exclusive, irrevocable license to
149
+ exercise the Licensed Rights in the Licensed Material to:
150
+
151
+ a. reproduce and Share the Licensed Material, in whole or
152
+ in part, for NonCommercial purposes only; and
153
+
154
+ b. produce, reproduce, and Share Adapted Material for
155
+ NonCommercial purposes only.
156
+
157
+ 2. Exceptions and Limitations. For the avoidance of doubt, where
158
+ Exceptions and Limitations apply to Your use, this Public
159
+ License does not apply, and You do not need to comply with
160
+ its terms and conditions.
161
+
162
+ 3. Term. The term of this Public License is specified in Section
163
+ 6(a).
164
+
165
+ 4. Media and formats; technical modifications allowed. The
166
+ Licensor authorizes You to exercise the Licensed Rights in
167
+ all media and formats whether now known or hereafter created,
168
+ and to make technical modifications necessary to do so. The
169
+ Licensor waives and/or agrees not to assert any right or
170
+ authority to forbid You from making technical modifications
171
+ necessary to exercise the Licensed Rights, including
172
+ technical modifications necessary to circumvent Effective
173
+ Technological Measures. For purposes of this Public License,
174
+ simply making modifications authorized by this Section 2(a)
175
+ (4) never produces Adapted Material.
176
+
177
+ 5. Downstream recipients.
178
+
179
+ a. Offer from the Licensor -- Licensed Material. Every
180
+ recipient of the Licensed Material automatically
181
+ receives an offer from the Licensor to exercise the
182
+ Licensed Rights under the terms and conditions of this
183
+ Public License.
184
+
185
+ b. No downstream restrictions. You may not offer or impose
186
+ any additional or different terms or conditions on, or
187
+ apply any Effective Technological Measures to, the
188
+ Licensed Material if doing so restricts exercise of the
189
+ Licensed Rights by any recipient of the Licensed
190
+ Material.
191
+
192
+ 6. No endorsement. Nothing in this Public License constitutes or
193
+ may be construed as permission to assert or imply that You
194
+ are, or that Your use of the Licensed Material is, connected
195
+ with, or sponsored, endorsed, or granted official status by,
196
+ the Licensor or others designated to receive attribution as
197
+ provided in Section 3(a)(1)(A)(i).
198
+
199
+ b. Other rights.
200
+
201
+ 1. Moral rights, such as the right of integrity, are not
202
+ licensed under this Public License, nor are publicity,
203
+ privacy, and/or other similar personality rights; however, to
204
+ the extent possible, the Licensor waives and/or agrees not to
205
+ assert any such rights held by the Licensor to the limited
206
+ extent necessary to allow You to exercise the Licensed
207
+ Rights, but not otherwise.
208
+
209
+ 2. Patent and trademark rights are not licensed under this
210
+ Public License.
211
+
212
+ 3. To the extent possible, the Licensor waives any right to
213
+ collect royalties from You for the exercise of the Licensed
214
+ Rights, whether directly or through a collecting society
215
+ under any voluntary or waivable statutory or compulsory
216
+ licensing scheme. In all other cases the Licensor expressly
217
+ reserves any right to collect such royalties, including when
218
+ the Licensed Material is used other than for NonCommercial
219
+ purposes.
220
+
221
+
222
+ Section 3 -- License Conditions.
223
+
224
+ Your exercise of the Licensed Rights is expressly made subject to the
225
+ following conditions.
226
+
227
+ a. Attribution.
228
+
229
+ 1. If You Share the Licensed Material (including in modified
230
+ form), You must:
231
+
232
+ a. retain the following if it is supplied by the Licensor
233
+ with the Licensed Material:
234
+
235
+ i. identification of the creator(s) of the Licensed
236
+ Material and any others designated to receive
237
+ attribution, in any reasonable manner requested by
238
+ the Licensor (including by pseudonym if
239
+ designated);
240
+
241
+ ii. a copyright notice;
242
+
243
+ iii. a notice that refers to this Public License;
244
+
245
+ iv. a notice that refers to the disclaimer of
246
+ warranties;
247
+
248
+ v. a URI or hyperlink to the Licensed Material to the
249
+ extent reasonably practicable;
250
+
251
+ b. indicate if You modified the Licensed Material and
252
+ retain an indication of any previous modifications; and
253
+
254
+ c. indicate the Licensed Material is licensed under this
255
+ Public License, and include the text of, or the URI or
256
+ hyperlink to, this Public License.
257
+
258
+ 2. You may satisfy the conditions in Section 3(a)(1) in any
259
+ reasonable manner based on the medium, means, and context in
260
+ which You Share the Licensed Material. For example, it may be
261
+ reasonable to satisfy the conditions by providing a URI or
262
+ hyperlink to a resource that includes the required
263
+ information.
264
+
265
+ 3. If requested by the Licensor, You must remove any of the
266
+ information required by Section 3(a)(1)(A) to the extent
267
+ reasonably practicable.
268
+
269
+ 4. If You Share Adapted Material You produce, the Adapter's
270
+ License You apply must not prevent recipients of the Adapted
271
+ Material from complying with this Public License.
272
+
273
+
274
+ Section 4 -- Sui Generis Database Rights.
275
+
276
+ Where the Licensed Rights include Sui Generis Database Rights that
277
+ apply to Your use of the Licensed Material:
278
+
279
+ a. for the avoidance of doubt, Section 2(a)(1) grants You the right
280
+ to extract, reuse, reproduce, and Share all or a substantial
281
+ portion of the contents of the database for NonCommercial purposes
282
+ only;
283
+
284
+ b. if You include all or a substantial portion of the database
285
+ contents in a database in which You have Sui Generis Database
286
+ Rights, then the database in which You have Sui Generis Database
287
+ Rights (but not its individual contents) is Adapted Material; and
288
+
289
+ c. You must comply with the conditions in Section 3(a) if You Share
290
+ all or a substantial portion of the contents of the database.
291
+
292
+ For the avoidance of doubt, this Section 4 supplements and does not
293
+ replace Your obligations under this Public License where the Licensed
294
+ Rights include other Copyright and Similar Rights.
295
+
296
+
297
+ Section 5 -- Disclaimer of Warranties and Limitation of Liability.
298
+
299
+ a. UNLESS OTHERWISE SEPARATELY UNDERTAKEN BY THE LICENSOR, TO THE
300
+ EXTENT POSSIBLE, THE LICENSOR OFFERS THE LICENSED MATERIAL AS-IS
301
+ AND AS-AVAILABLE, AND MAKES NO REPRESENTATIONS OR WARRANTIES OF
302
+ ANY KIND CONCERNING THE LICENSED MATERIAL, WHETHER EXPRESS,
303
+ IMPLIED, STATUTORY, OR OTHER. THIS INCLUDES, WITHOUT LIMITATION,
304
+ WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR
305
+ PURPOSE, NON-INFRINGEMENT, ABSENCE OF LATENT OR OTHER DEFECTS,
306
+ ACCURACY, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT
307
+ KNOWN OR DISCOVERABLE. WHERE DISCLAIMERS OF WARRANTIES ARE NOT
308
+ ALLOWED IN FULL OR IN PART, THIS DISCLAIMER MAY NOT APPLY TO YOU.
309
+
310
+ b. TO THE EXTENT POSSIBLE, IN NO EVENT WILL THE LICENSOR BE LIABLE
311
+ TO YOU ON ANY LEGAL THEORY (INCLUDING, WITHOUT LIMITATION,
312
+ NEGLIGENCE) OR OTHERWISE FOR ANY DIRECT, SPECIAL, INDIRECT,
313
+ INCIDENTAL, CONSEQUENTIAL, PUNITIVE, EXEMPLARY, OR OTHER LOSSES,
314
+ COSTS, EXPENSES, OR DAMAGES ARISING OUT OF THIS PUBLIC LICENSE OR
315
+ USE OF THE LICENSED MATERIAL, EVEN IF THE LICENSOR HAS BEEN
316
+ ADVISED OF THE POSSIBILITY OF SUCH LOSSES, COSTS, EXPENSES, OR
317
+ DAMAGES. WHERE A LIMITATION OF LIABILITY IS NOT ALLOWED IN FULL OR
318
+ IN PART, THIS LIMITATION MAY NOT APPLY TO YOU.
319
+
320
+ c. The disclaimer of warranties and limitation of liability provided
321
+ above shall be interpreted in a manner that, to the extent
322
+ possible, most closely approximates an absolute disclaimer and
323
+ waiver of all liability.
324
+
325
+
326
+ Section 6 -- Term and Termination.
327
+
328
+ a. This Public License applies for the term of the Copyright and
329
+ Similar Rights licensed here. However, if You fail to comply with
330
+ this Public License, then Your rights under this Public License
331
+ terminate automatically.
332
+
333
+ b. Where Your right to use the Licensed Material has terminated under
334
+ Section 6(a), it reinstates:
335
+
336
+ 1. automatically as of the date the violation is cured, provided
337
+ it is cured within 30 days of Your discovery of the
338
+ violation; or
339
+
340
+ 2. upon express reinstatement by the Licensor.
341
+
342
+ For the avoidance of doubt, this Section 6(b) does not affect any
343
+ right the Licensor may have to seek remedies for Your violations
344
+ of this Public License.
345
+
346
+ c. For the avoidance of doubt, the Licensor may also offer the
347
+ Licensed Material under separate terms or conditions or stop
348
+ distributing the Licensed Material at any time; however, doing so
349
+ will not terminate this Public License.
350
+
351
+ d. Sections 1, 5, 6, 7, and 8 survive termination of this Public
352
+ License.
353
+
354
+
355
+ Section 7 -- Other Terms and Conditions.
356
+
357
+ a. The Licensor shall not be bound by any additional or different
358
+ terms or conditions communicated by You unless expressly agreed.
359
+
360
+ b. Any arrangements, understandings, or agreements regarding the
361
+ Licensed Material not stated herein are separate from and
362
+ independent of the terms and conditions of this Public License.
363
+
364
+
365
+ Section 8 -- Interpretation.
366
+
367
+ a. For the avoidance of doubt, this Public License does not, and
368
+ shall not be interpreted to, reduce, limit, restrict, or impose
369
+ conditions on any use of the Licensed Material that could lawfully
370
+ be made without permission under this Public License.
371
+
372
+ b. To the extent possible, if any provision of this Public License is
373
+ deemed unenforceable, it shall be automatically reformed to the
374
+ minimum extent necessary to make it enforceable. If the provision
375
+ cannot be reformed, it shall be severed from this Public License
376
+ without affecting the enforceability of the remaining terms and
377
+ conditions.
378
+
379
+ c. No term or condition of this Public License will be waived and no
380
+ failure to comply consented to unless expressly agreed to by the
381
+ Licensor.
382
+
383
+ d. Nothing in this Public License constitutes or may be interpreted
384
+ as a limitation upon, or waiver of, any privileges and immunities
385
+ that apply to the Licensor or You, including from the legal
386
+ processes of any jurisdiction or authority.
387
+
388
+ =======================================================================
389
+
390
+ Creative Commons is not a party to its public
391
+ licenses. Notwithstanding, Creative Commons may elect to apply one of
392
+ its public licenses to material it publishes and in those instances
393
+ will be considered the “Licensor.” The text of the Creative Commons
394
+ public licenses is dedicated to the public domain under the CC0 Public
395
+ Domain Dedication. Except for the limited purpose of indicating that
396
+ material is shared under a Creative Commons public license or as
397
+ otherwise permitted by the Creative Commons policies published at
398
+ creativecommons.org/policies, Creative Commons does not authorize the
399
+ use of the trademark "Creative Commons" or any other trademark or logo
400
+ of Creative Commons without its prior written consent including,
401
+ without limitation, in connection with any unauthorized modifications
402
+ to any of its public licenses or any other arrangements,
403
+ understandings, or agreements concerning use of licensed material. For
404
+ the avoidance of doubt, this paragraph does not form part of the
405
+ public licenses.
406
+
407
+ Creative Commons may be contacted at creativecommons.org.
README.md CHANGED
@@ -1,12 +1,111 @@
- ---
- title: My Tango
- emoji: 🦀
- colorFrom: blue
- colorTo: blue
- sdk: gradio
- sdk_version: 5.7.1
- app_file: app.py
- pinned: false
- ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ ---
+ title: my_tango
+ app_file: app.py
+ sdk: gradio
+ sdk_version: 4.44.1
+ ---
+ <div align="center">
+ <!-- <p align="center"> <img src="./assets/EMAGE_2024/1711449143651.jpg" width="100px"> </p> -->
+ <h2>TANGO: Co-Speech Gesture Video Reenactment with Hierarchical Audio-Motion Embedding and Diffusion Interpolation</h2>
+
+ <a href='https://pantomatrix.github.io/TANGO/'><img src='https://img.shields.io/badge/Project-TANGO-blue' alt='Project'></a>
+ <a href='https://www.youtube.com/watch?v=_DfsA11puBc'><img src='https://img.shields.io/badge/YouTube-TANGO-rgb(255, 0, 0)' alt='Youtube'></a>
+ <a href='https://huggingface.co/spaces/H-Liu1997/TANGO'><img src='https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Spaces-blue'></a>
+ <a href='https://arxiv.org/abs/2410.04221'><img src='https://img.shields.io/badge/Paper-ArXiv-yellow' alt='Project'></a>
+
+ </div>
+
+ # News
+
+ Welcome, contributors! Feel free to submit pull requests!
+
+ - **[2024/10]** Welcome to try our [TANGO](https://huggingface.co/spaces/H-Liu1997/TANGO) demo on Hugging Face Spaces!
+ - **[2024/10]** Code for creating the gesture graph is available.
+
+ <p align=center>
+ <img src="./assets/hg.png" width="60%" >
+ </p>
+
+ # Results Videos
+
+ <p align="center">
+ <img src="./assets/demo8.gif" width="32%" alt="demo0">
+ <img src="./assets/demo1.gif" width="32%" alt="demo1">
+ <img src="./assets/demo2.gif" width="32%" alt="demo2">
+ </p>
+ <p align="center">
+ <img src="./assets/demo3.gif" width="32%" alt="demo3">
+ <img src="./assets/demo5.gif" width="32%" alt="demo5">
+ <img src="./assets/demo0.gif" width="32%" alt="demo6">
+ </p>
+ <p align="center">
+ <img src="./assets/demo7.gif" width="32%" alt="demo7">
+ <img src="./assets/demo6.gif" width="32%" alt="demo8">
+ <img src="./assets/demo9.gif" width="32%" alt="demo9">
+ </p>
+
+ # Demo Video (on YouTube)
+
+ <p align=center>
+ <a href="https://youtu.be/xuhD_-tMH1w?si=Tr6jHAhOR1fxWIjb">
+ <img width="68%" src="./assets/video.png">
+ </a>
+ </p>
+
+ # 📝 Release Plans
+
+ - [ ] Training code for AuMoClip and ACInterp
+ - [ ] Inference code for ACInterp
+ - [ ] Processed YouTube business video data (very small, around 15 mins)
+ - [x] Scripts for creating the gesture graph
+ - [x] Inference code with AuMoClip and pretrained weights
+
+ # ⚒️ Installation
+
+ ## Clone the repository
+
+ ```shell
+ git clone https://github.com/CyberAgentAILab/TANGO.git
+ cd TANGO
+ git clone https://github.com/justinjohn0306/Wav2Lip.git
+ git clone https://github.com/dajes/frame-interpolation-pytorch.git
+ ```
+
+ ## Build Environment
+
+ We recommend Python `==3.9.20` and CUDA `==11.8`. Build the environment as follows:
+
+ ```shell
+ # [Optional] Create a virtual env
+ conda create -n tango python==3.9.20
+ conda activate tango
+ # Install with pip:
+ pip install -r ./pre-requirements.txt
+ pip install -r ./requirements.txt
+ ```
+
+ # 🚀 Training and Inference
+
+ ## Inference
+
+ Here is the command for running the inference script from the repository root `<your root>/TANGO/`; it takes around 3 minutes to generate two 8-second videos. You can visualize the results by watching the generated videos directly, or by loading the result .npz files into Blender with our Blender add-on from [EMAGE](https://github.com/PantoMatrix/PantoMatrix) (a quick .npz inspection sketch follows the command below).
+
+ _Necessary checkpoints and pre-computed graphs will be automatically downloaded during the first run. Please ensure that at least 35GB of disk space is available._
+
+ ```shell
+ python app.py
+ ```
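For a quick look at what an output `.npz` contains before opening it in Blender, you can list its arrays with NumPy. This is a minimal sketch, not part of the repository: the file name below is a placeholder for whichever `.npz` the run writes to its output folder, and the stored array names depend on the export.

```python
import numpy as np

# Placeholder path: point this at one of the .npz files produced by the run.
result = np.load("outputs/example_result.npz", allow_pickle=True)

# Print every stored array name and its shape to see what the file holds
# before loading it into Blender with the EMAGE add-on.
for key in result.files:
    value = result[key]
    print(key, getattr(value, "shape", type(value)))
```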
+
+ ### Create the graph for a custom character
+
+ ```shell
+ python create_graph.py
+ ```
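`create_graph.py` builds the gesture graph that `app.py` later loads with `igraph.Graph.Read_Pickle` from `datasets/cached_graph/`. The script itself is not shown in this truncated view, so the following is only an illustrative sketch of the data structure `app.py` expects: a directed igraph graph whose nodes carry `name`, `axis_angle`, `previous`, and `fps` attributes and whose edges carry `is_continue` (the `motion_low`/`motion_high` node features are computed at inference time). The attribute values and shapes here are assumptions for illustration, not the real output of `create_graph.py`.

```python
import numpy as np
import igraph

# Illustrative only: a tiny two-node gesture graph with the attributes app.py reads.
# Real graphs are produced by create_graph.py from per-frame pose sequences.
g = igraph.Graph(directed=True)
g.add_vertices(2)
g.vs["name"] = ["clip_a", "clip_a"]                         # source clip each frame comes from
g.vs["axis_angle"] = [np.zeros((55, 3)) for _ in range(2)]  # per-frame pose; shape is an assumption
g.vs["previous"] = [-1, 0]                                  # index of the preceding frame (-1 = start node)
g.vs["fps"] = [30, 30]
g.add_edges([(0, 1)])
g.es["is_continue"] = [True]  # True when the two frames are consecutive in the original video

g.write_pickle("datasets/cached_graph/example_graph.pkl")   # app.py reads graphs with Graph.Read_Pickle
```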
104
+
105
+ # Copyright Information
106
+
107
+ We thanks the open-source project [Wav2Lip](https://github.com/Rudrabha/Wav2Lip), [FiLM](https://github.com/caffeinism/FiLM-pytorch), [SMPLerX](https://github.com/caizhongang/SMPLer-X).
108
+
109
+ Check out our previous works for Co-Speech 3D motion Generation <a href="https://github.com/PantoMatrix/PantoMatrix">DisCo, BEAT, EMAGE</a>.
110
+
111
+ This project is only for research or education purposes, and not freely available for commercial use or redistribution. The srcipt is available only under the terms of the [Attribution-NonCommercial 4.0 International](https://creativecommons.org/licenses/by-nc/4.0/legalcode) (CC BY-NC 4.0) license.
app.py ADDED
@@ -0,0 +1,796 @@
1
+ import os
2
+ import gradio as gr
3
+ import gc
4
+ import soundfile as sf
5
+ import shutil
6
+ import argparse
7
+ from omegaconf import OmegaConf
8
+ import random
9
+ import numpy as np
10
+ import librosa
11
+ import emage.mertic # noqa: F401 # somehow this must be imported, even though it is not used directly
12
+ from decord import VideoReader
13
+ from PIL import Image
14
+ import cv2
15
+ import subprocess
16
+ import importlib
17
+ import torch
18
+ import torch.nn.functional as F
19
+ import smplx
20
+ import igraph
21
+
22
+ # import emage
23
+ from utils.video_io import save_videos_from_pil
24
+ from utils.genextend_inference_utils import adjust_statistics_to_match_reference
25
+ from create_graph import path_visualization, graph_pruning, get_motion_reps_tensor, path_visualization_v2
26
+ from utils.download_utils import download_files_from_repo
27
+
28
+ SCRIPT_PATH = os.path.dirname(os.path.realpath(__file__))
29
+
30
+ download_files_from_repo()
31
+
32
+ shutil.copyfile("./assets/app.py", "./SMPLer-X/app.py")
33
+ shutil.copyfile("./assets/transforms.py", "./SMPLer-X/common/utils/transforms.py")
34
+ shutil.copyfile("./assets/inference.py", "./SMPLer-X/main/inference.py")
35
+
36
+
37
+ def search_path_dp(graph, audio_low_np, audio_high_np, loop_penalty=0.1, top_k=1, search_mode="both", continue_penalty=0.1):
38
+ T = audio_low_np.shape[0] # Total time steps
39
+ # N = len(graph.vs) # Total number of nodes in the graph
40
+
41
+ # Initialize DP tables
42
+ min_cost = [
43
+ {} for _ in range(T)
44
+ ] # min_cost[t][node_index] = list of tuples: (cost, prev_node_index, prev_tuple_index, non_continue_count, visited_nodes)
45
+
46
+ # Initialize the first time step
47
+ start_nodes = [v for v in graph.vs if v["previous"] is None or v["previous"] == -1]
48
+ for node in start_nodes:
49
+ node_index = node.index
50
+ motion_low = node["motion_low"] # Shape: [C]
51
+ motion_high = node["motion_high"] # Shape: [C]
52
+
53
+ # Cost using cosine similarity
54
+ if search_mode == "both":
55
+ cost = 2 - (np.dot(audio_low_np[0], motion_low.T) + np.dot(audio_high_np[0], motion_high.T))
56
+ elif search_mode == "high_level":
57
+ cost = 1 - np.dot(audio_high_np[0], motion_high.T)
58
+ elif search_mode == "low_level":
59
+ cost = 1 - np.dot(audio_low_np[0], motion_low.T)
60
+
61
+ visited_nodes = {node_index: 1} # Initialize visit count as a dictionary
62
+
63
+ min_cost[0][node_index] = [(cost, None, None, 0, visited_nodes)] # Initialize with no predecessor and 0 non-continue count
64
+
65
+ # DP over time steps
66
+ for t in range(1, T):
67
+ for node in graph.vs:
68
+ node_index = node.index
69
+ candidates = []
70
+
71
+ # Incoming edges to the current node
72
+ incoming_edges = graph.es.select(_to=node_index)
73
+ for edge in incoming_edges:
74
+ prev_node_index = edge.source
75
+ edge_id = edge.index
76
+ is_continue_edge = graph.es[edge_id]["is_continue"]
77
+ # prev_node = graph.vs[prev_node_index]
78
+ if prev_node_index in min_cost[t - 1]:
79
+ for tuple_index, (prev_cost, _, _, prev_non_continue_count, prev_visited) in enumerate(min_cost[t - 1][prev_node_index]):
80
+ # Loop punishment
81
+ if node_index in prev_visited:
82
+ loop_time = prev_visited[node_index] # Get the count of previous visits
83
+ loop_cost = prev_cost + loop_penalty * np.exp(loop_time) # Apply exponential penalty
84
+ new_visited = prev_visited.copy()
85
+ new_visited[node_index] = loop_time + 1 # Increment visit count
86
+ else:
87
+ loop_cost = prev_cost
88
+ new_visited = prev_visited.copy()
89
+ new_visited[node_index] = 1 # Initialize visit count for the new node
90
+
91
+ motion_low = node["motion_low"] # Shape: [C]
92
+ motion_high = node["motion_high"] # Shape: [C]
93
+
94
+ if search_mode == "both":
95
+ cost_increment = 2 - (np.dot(audio_low_np[t], motion_low.T) + np.dot(audio_high_np[t], motion_high.T))
96
+ elif search_mode == "high_level":
97
+ cost_increment = 1 - np.dot(audio_high_np[t], motion_high.T)
98
+ elif search_mode == "low_level":
99
+ cost_increment = 1 - np.dot(audio_low_np[t], motion_low.T)
100
+
101
+ # Check if the edge is "is_continue"
102
+ if not is_continue_edge:
103
+ non_continue_count = prev_non_continue_count + 1 # Increment the count of non-continue edges
104
+ else:
105
+ non_continue_count = prev_non_continue_count
106
+
107
+ # Apply the penalty based on the square of the number of non-continuous edges
108
+ continue_penalty_cost = continue_penalty * non_continue_count
109
+
110
+ total_cost = loop_cost + cost_increment + continue_penalty_cost
111
+
112
+ candidates.append((total_cost, prev_node_index, tuple_index, non_continue_count, new_visited))
113
+
114
+ # Keep the top k candidates
115
+ if candidates:
116
+ # Sort candidates by total_cost
117
+ candidates.sort(key=lambda x: x[0])
118
+ # Keep top k
119
+ min_cost[t][node_index] = candidates[:top_k]
120
+ else:
121
+ # No candidates, do nothing
122
+ pass
123
+
124
+ # Collect all possible end paths at time T-1
125
+ end_candidates = []
126
+ for node_index, tuples in min_cost[T - 1].items():
127
+ for tuple_index, (cost, _, _, _, _) in enumerate(tuples):
128
+ end_candidates.append((cost, node_index, tuple_index))
129
+
130
+ if not end_candidates:
131
+ print("No valid path found.")
132
+ return [], []
133
+
134
+ # Sort end candidates by cost
135
+ end_candidates.sort(key=lambda x: x[0])
136
+
137
+ # Keep top k paths
138
+ top_k_paths_info = end_candidates[:top_k]
139
+
140
+ # Reconstruct the paths
141
+ optimal_paths = []
142
+ is_continue_lists = []
143
+ for final_cost, node_index, tuple_index in top_k_paths_info:
144
+ optimal_path_indices = []
145
+ current_node_index = node_index
146
+ current_tuple_index = tuple_index
147
+ for t in range(T - 1, -1, -1):
148
+ optimal_path_indices.append(current_node_index)
149
+ tuple_data = min_cost[t][current_node_index][current_tuple_index]
150
+ _, prev_node_index, prev_tuple_index, _, _ = tuple_data
151
+ current_node_index = prev_node_index
152
+ current_tuple_index = prev_tuple_index
153
+ if current_node_index is None:
154
+ break # Reached the start node
155
+ optimal_path_indices = optimal_path_indices[::-1] # Reverse to get correct order
156
+ optimal_path = [graph.vs[idx] for idx in optimal_path_indices]
157
+ optimal_paths.append(optimal_path)
158
+
159
+ # Extract continuity information
160
+ is_continue = []
161
+ for i in range(len(optimal_path) - 1):
162
+ edge_id = graph.get_eid(optimal_path[i].index, optimal_path[i + 1].index)
163
+ is_cont = graph.es[edge_id]["is_continue"]
164
+ is_continue.append(is_cont)
165
+ is_continue_lists.append(is_continue)
166
+
167
+ print("Top {} Paths:".format(len(optimal_paths)))
168
+ for i, path in enumerate(optimal_paths):
169
+ path_indices = [node.index for node in path]
170
+ print("Path {}: Cost: {}, Nodes: {}".format(i + 1, top_k_paths_info[i][0], path_indices))
171
+
172
+ return optimal_paths, is_continue_lists
173
+
174
+
175
+ def test_fn(model, device, iteration, candidate_json_path, test_path, cfg, audio_path, **kwargs):
176
+ create_graph = kwargs["create_graph"]
177
+ torch.set_grad_enabled(False)
178
+ pool_path = candidate_json_path.replace("data_json", "cached_graph").replace(".json", ".pkl")
179
+ graph = igraph.Graph.Read_Pickle(fname=pool_path)
180
+ # print(len(graph.vs))
181
+
182
+ save_dir = os.path.join(test_path, f"retrieved_motions_{iteration}")
183
+ os.makedirs(save_dir, exist_ok=True)
184
+
185
+ actual_model = model.module if isinstance(model, torch.nn.parallel.DistributedDataParallel) else model
186
+ actual_model.eval()
187
+
188
+ # with open(candidate_json_path, 'r') as f:
189
+ # candidate_data = json.load(f)
190
+ all_motions = {}
191
+ for i, node in enumerate(graph.vs):
192
+ if all_motions.get(node["name"]) is None:
193
+ all_motions[node["name"]] = [node["axis_angle"].reshape(-1)]
194
+ else:
195
+ all_motions[node["name"]].append(node["axis_angle"].reshape(-1))
196
+ for k, v in all_motions.items():
197
+ all_motions[k] = np.stack(v) # T, J*3
198
+ # print(k, all_motions[k].shape)
199
+
200
+ window_size = cfg.data.pose_length
201
+ motion_high_all = []
202
+ motion_low_all = []
203
+ for k, v in all_motions.items():
204
+ motion_tensor = torch.from_numpy(v).float().to(device).unsqueeze(0)
205
+ _, t, _ = motion_tensor.shape
206
+
207
+ if t >= window_size:
208
+ num_chunks = t // window_size
209
+ motion_high_list = []
210
+ motion_low_list = []
211
+
212
+ for i in range(num_chunks):
213
+ start_idx = i * window_size
214
+ end_idx = start_idx + window_size
215
+ motion_slice = motion_tensor[:, start_idx:end_idx, :]
216
+
217
+ motion_features = actual_model.get_motion_features(motion_slice)
218
+
219
+ motion_low = motion_features["motion_low"].cpu().numpy()
220
+ motion_high = motion_features["motion_cls"].unsqueeze(0).repeat(1, motion_low.shape[1], 1).cpu().numpy()
221
+
222
+ motion_high_list.append(motion_high[0])
223
+ motion_low_list.append(motion_low[0])
224
+
225
+ remain_length = t % window_size
226
+ if remain_length > 0:
227
+ start_idx = t - window_size
228
+ motion_slice = motion_tensor[:, start_idx:, :]
229
+
230
+ motion_features = actual_model.get_motion_features(motion_slice)
231
+ # motion_high = motion_features["motion_high_weight"].cpu().numpy()
232
+ motion_low = motion_features["motion_low"].cpu().numpy()
233
+ motion_high = motion_features["motion_cls"].unsqueeze(0).repeat(1, motion_low.shape[1], 1).cpu().numpy()
234
+
235
+ motion_high_list.append(motion_high[0][-remain_length:])
236
+ motion_low_list.append(motion_low[0][-remain_length:])
237
+
238
+ motion_high_all.append(np.concatenate(motion_high_list, axis=0))
239
+ motion_low_all.append(np.concatenate(motion_low_list, axis=0))
240
+
241
+ else: # t < window_size:
242
+ gap = window_size - t
243
+ motion_slice = torch.cat(
244
+ [motion_tensor, torch.zeros((motion_tensor.shape[0], gap, motion_tensor.shape[2])).to(motion_tensor.device)], 1
245
+ )
246
+ motion_features = actual_model.get_motion_features(motion_slice)
247
+ # motion_high = motion_features["motion_high_weight"].cpu().numpy()
248
+ motion_low = motion_features["motion_low"].cpu().numpy()
249
+ motion_high = motion_features["motion_cls"].unsqueeze(0).repeat(1, motion_low.shape[1], 1).cpu().numpy()
250
+
251
+ motion_high_all.append(motion_high[0][:t])
252
+ motion_low_all.append(motion_low[0][:t])
253
+
254
+ motion_high_all = np.concatenate(motion_high_all, axis=0)
255
+ motion_low_all = np.concatenate(motion_low_all, axis=0)
256
+ # print(motion_high_all.shape, motion_low_all.shape, len(graph.vs))
257
+ motion_low_all = motion_low_all / np.linalg.norm(motion_low_all, axis=1, keepdims=True)
258
+ motion_high_all = motion_high_all / np.linalg.norm(motion_high_all, axis=1, keepdims=True)
259
+ assert motion_high_all.shape[0] == len(graph.vs)
260
+ assert motion_low_all.shape[0] == len(graph.vs)
261
+
262
+ for i, node in enumerate(graph.vs):
263
+ node["motion_high"] = motion_high_all[i]
264
+ node["motion_low"] = motion_low_all[i]
265
+
266
+ graph = graph_pruning(graph)
267
+ # for gradio, use a subgraph
268
+ if len(graph.vs) > 1800:
269
+ gap = len(graph.vs) - 1800
270
+ start_d = random.randint(0, 1800)
271
+ graph.delete_vertices(range(start_d, start_d + gap))
272
+ ascc_2 = graph.clusters(mode="STRONG")
273
+ graph = ascc_2.giant()
274
+
275
+ # drop the id of gt
276
+ idx = 0
277
+ audio_waveform, sr = librosa.load(audio_path)
278
+ audio_waveform = librosa.resample(audio_waveform, orig_sr=sr, target_sr=cfg.data.audio_sr)
279
+ audio_tensor = torch.from_numpy(audio_waveform).float().to(device).unsqueeze(0)
280
+
281
+ target_length = audio_tensor.shape[1] // cfg.data.audio_sr * 30
282
+ window_size = int(cfg.data.audio_sr * (cfg.data.pose_length / 30))
283
+ _, t = audio_tensor.shape
284
+ audio_low_list = []
285
+ audio_high_list = []
286
+
287
+ if t >= window_size:
288
+ num_chunks = t // window_size
289
+ # print(num_chunks, t % window_size)
290
+ for i in range(num_chunks):
291
+ start_idx = i * window_size
292
+ end_idx = start_idx + window_size
293
+ # print(start_idx, end_idx, window_size)
294
+ audio_slice = audio_tensor[:, start_idx:end_idx]
295
+
296
+ model_out_candidates = actual_model.get_audio_features(audio_slice)
297
+ audio_low = model_out_candidates["audio_low"]
298
+ # audio_high = model_out_candidates["audio_high_weight"]
299
+ audio_high = model_out_candidates["audio_cls"].unsqueeze(0).repeat(1, audio_low.shape[1], 1)
300
+ # print(audio_low.shape, audio_high.shape)
301
+
302
+ audio_low = F.normalize(audio_low, dim=2)[0].cpu().numpy()
303
+ audio_high = F.normalize(audio_high, dim=2)[0].cpu().numpy()
304
+
305
+ audio_low_list.append(audio_low)
306
+ audio_high_list.append(audio_high)
307
+ # print(audio_low.shape, audio_high.shape)
308
+
309
+ remain_length = t % window_size
310
+ if remain_length > 1:
311
+ start_idx = t - window_size
312
+ audio_slice = audio_tensor[:, start_idx:]
313
+
314
+ model_out_candidates = actual_model.get_audio_features(audio_slice)
315
+ audio_low = model_out_candidates["audio_low"]
316
+ # audio_high = model_out_candidates["audio_high_weight"]
317
+ audio_high = model_out_candidates["audio_cls"].unsqueeze(0).repeat(1, audio_low.shape[1], 1)
318
+
319
+ gap = target_length - np.concatenate(audio_low_list, axis=0).shape[1]
320
+ audio_low = F.normalize(audio_low, dim=2)[0][-gap:].cpu().numpy()
321
+ audio_high = F.normalize(audio_high, dim=2)[0][-gap:].cpu().numpy()
322
+
323
+ # print(audio_low.shape, audio_high.shape)
324
+ audio_low_list.append(audio_low)
325
+ audio_high_list.append(audio_high)
326
+ else:
327
+ gap = window_size - t
328
+ audio_slice = audio_tensor
329
+ model_out_candidates = actual_model.get_audio_features(audio_slice)
330
+ audio_low = model_out_candidates["audio_low"]
331
+ # audio_high = model_out_candidates["audio_high_weight"]
332
+ audio_high = model_out_candidates["audio_cls"].unsqueeze(0).repeat(1, audio_low.shape[1], 1)
333
+ audio_low = F.normalize(audio_low, dim=2)[0].cpu().numpy()
334
+ audio_high = F.normalize(audio_high, dim=2)[0].cpu().numpy()
335
+ audio_low_list.append(audio_low)
336
+ audio_high_list.append(audio_high)
337
+
338
+ audio_low_all = np.concatenate(audio_low_list, axis=0)
339
+ audio_high_all = np.concatenate(audio_high_list, axis=0)
340
+ path_list, is_continue_list = search_path_dp(graph, audio_low_all, audio_high_all, top_k=1, search_mode="both")
341
+
342
+ res_motion = []
343
+ counter = 0
344
+ wav2lip_checkpoint_path = os.path.join(SCRIPT_PATH, "Wav2Lip/checkpoints/wav2lip_gan.pth") # Update this path to your Wav2Lip checkpoint
345
+ wav2lip_script_path = os.path.join(SCRIPT_PATH, "Wav2Lip/inference.py")
346
+ for path, is_continue in zip(path_list, is_continue_list):
347
+ if False:
348
+ # time is limited when we create the graph on Hugging Face, so skip blending here.
349
+ res_motion_current = path_visualization(
350
+ graph,
351
+ path,
352
+ is_continue,
353
+ os.path.join(save_dir, f"audio_{idx}_retri_{counter}.mp4"),
354
+ audio_path=audio_path,
355
+ return_motion=True,
356
+ verbose_continue=True,
357
+ )
358
+ video_temp_path = os.path.join(save_dir, f"audio_{idx}_retri_{counter}.mp4")
359
+ else:
360
+ res_motion_current = path_visualization_v2(
361
+ graph,
362
+ path,
363
+ is_continue,
364
+ os.path.join(save_dir, f"audio_{idx}_retri_{counter}.mp4"),
365
+ audio_path=None,
366
+ return_motion=True,
367
+ verbose_continue=True,
368
+ )
369
+ video_temp_path = os.path.join(save_dir, f"audio_{idx}_retri_{counter}.mp4")
370
+ video_reader = VideoReader(video_temp_path)
371
+ video_np = []
372
+ for i in range(len(video_reader)):
373
+ if i == 0:
374
+ continue
375
+ video_frame = video_reader[i].asnumpy()
376
+ video_np.append(Image.fromarray(video_frame))
377
+ adjusted_video_pil = adjust_statistics_to_match_reference([video_np])
378
+ save_videos_from_pil(
379
+ adjusted_video_pil[0], os.path.join(save_dir, f"audio_{idx}_retri_{counter}.mp4"), fps=graph.vs[0]["fps"], bitrate=2000000
380
+ )
381
+
382
+ audio_temp_path = audio_path
383
+ lipsync_output_path = os.path.join(save_dir, f"audio_{idx}_retri_{counter}.mp4")
384
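+ # lip-sync the retrieved video with Wav2Lip; assumes the Wav2Lip repo and the wav2lip_gan.pth checkpoint exist at the paths defined above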
+ cmd_wav2lip_1 = f"cd Wav2Lip; python {wav2lip_script_path} --checkpoint_path {wav2lip_checkpoint_path} --face {video_temp_path} --audio {audio_temp_path} --outfile {lipsync_output_path} --nosmooth --out_height 720"
385
+ subprocess.run(cmd_wav2lip_1, shell=True)
386
+
387
+ res_motion.append(res_motion_current)
388
+ np.savez(os.path.join(save_dir, f"audio_{idx}_retri_{counter}.npz"), motion=res_motion_current)
389
+
390
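+ # drop ~100 nodes around the start of the first retrieved path so the second search is forced onto different motion segments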
+ start_node = path[1].index
391
+ end_node = start_node + 100
392
+
393
+ if create_graph:
394
+ # time is limited when creating a graph, so skip the second video
395
+ result = [
396
+ os.path.join(save_dir, f"audio_{idx}_retri_0.mp4"),
397
+ os.path.join(save_dir, f"audio_{idx}_retri_0.mp4"),
398
+ os.path.join(save_dir, f"audio_{idx}_retri_0.npz"),
399
+ os.path.join(save_dir, f"audio_{idx}_retri_0.npz"),
400
+ ]
401
+ return result
402
+
403
+ print(f"delete gt-nodes {start_node}, {end_node}")
404
+ nodes_to_delete = list(range(start_node, end_node))
405
+ graph.delete_vertices(nodes_to_delete)
406
+ graph = graph_pruning(graph)
407
+ path_list, is_continue_list = search_path_dp(graph, audio_low_all, audio_high_all, top_k=1, search_mode="both")
408
+ res_motion = []
409
+ counter = 1
410
+ for path, is_continue in zip(path_list, is_continue_list):
411
+ res_motion_current = path_visualization_v2(
412
+ graph,
413
+ path,
414
+ is_continue,
415
+ os.path.join(save_dir, f"audio_{idx}_retri_{counter}.mp4"),
416
+ audio_path=None,
417
+ return_motion=True,
418
+ verbose_continue=True,
419
+ )
420
+ video_temp_path = os.path.join(save_dir, f"audio_{idx}_retri_{counter}.mp4")
421
+ video_reader = VideoReader(video_temp_path)
422
+ video_np = []
423
+ for i in range(len(video_reader)):
424
+ if i == 0:
425
+ continue
426
+ video_frame = video_reader[i].asnumpy()
427
+ video_np.append(Image.fromarray(video_frame))
428
+ adjusted_video_pil = adjust_statistics_to_match_reference([video_np])
429
+ save_videos_from_pil(
430
+ adjusted_video_pil[0], os.path.join(save_dir, f"audio_{idx}_retri_{counter}.mp4"), fps=graph.vs[0]["fps"], bitrate=2000000
431
+ )
432
+
433
+ audio_temp_path = audio_path
434
+ lipsync_output_path = os.path.join(save_dir, f"audio_{idx}_retri_{counter}.mp4")
435
+ cmd_wav2lip_2 = f"cd Wav2Lip; python {wav2lip_script_path} --checkpoint_path {wav2lip_checkpoint_path} --face {video_temp_path} --audio {audio_temp_path} --outfile {lipsync_output_path} --nosmooth --out_height 720"
436
+ subprocess.run(cmd_wav2lip_2, shell=True)
437
+ res_motion.append(res_motion_current)
438
+ np.savez(os.path.join(save_dir, f"audio_{idx}_retri_{counter}.npz"), motion=res_motion_current)
439
+
440
+ result = [
441
+ os.path.join(save_dir, f"audio_{idx}_retri_0.mp4"),
442
+ os.path.join(save_dir, f"audio_{idx}_retri_1.mp4"),
443
+ os.path.join(save_dir, f"audio_{idx}_retri_0.npz"),
444
+ os.path.join(save_dir, f"audio_{idx}_retri_1.npz"),
445
+ ]
446
+ return result
447
+
448
+
449
+ def init_class(module_name, class_name, config, **kwargs):
450
+ module = importlib.import_module(module_name)
451
+ model_class = getattr(module, class_name)
452
+ instance = model_class(config, **kwargs)
453
+ return instance
454
+
455
+
456
+ def seed_everything(seed):
457
+ random.seed(seed)
458
+ np.random.seed(seed)
459
+ torch.manual_seed(seed)
460
+ torch.cuda.manual_seed_all(seed)
461
+
462
+
463
+ def prepare_all(yaml_name):
464
+ parser = argparse.ArgumentParser()
465
+ parser.add_argument("--config", type=str, default=yaml_name)
466
+ parser.add_argument("--debug", action="store_true", help="Enable debugging mode")
467
+ parser.add_argument("overrides", nargs=argparse.REMAINDER)
468
+ args = parser.parse_args()
469
+ if args.config.endswith(".yaml"):
470
+ config = OmegaConf.load(args.config)
471
+ config.exp_name = os.path.basename(args.config)[:-5]
472
+ else:
473
+ raise ValueError("Unsupported config file format. Only .yaml files are allowed.")
474
+ save_dir = os.path.join(OUTPUT_DIR, config.exp_name)
475
+ os.makedirs(save_dir, exist_ok=True)
476
+ return config
477
+
478
+
479
+ def save_first_10_seconds(video_path, output_path="./save_video.mp4", max_length=512):
480
+ if os.path.exists(output_path):
481
+ os.remove(output_path)
482
+
483
+ cap = cv2.VideoCapture(video_path)
484
+
485
+ if not cap.isOpened():
486
+ return
487
+
488
+ fps = int(cap.get(cv2.CAP_PROP_FPS))
489
+ original_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
490
+ original_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
491
+
492
+ # Calculate the aspect ratio and resize dimensions
493
+ if original_width >= original_height:
494
+ new_width = max_length
495
+ new_height = int(original_height * (max_length / original_width))
496
+ else:
497
+ new_height = max_length
498
+ new_width = int(original_width * (max_length / original_height))
499
+
500
+ fourcc = cv2.VideoWriter_fourcc(*"mp4v")
501
+ out = cv2.VideoWriter(output_path.replace(".mp4", "_fps.mp4"), fourcc, fps, (new_width, new_height))
502
+
503
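+ # despite the function name, the demo keeps the first 20 seconds of the clip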
+ frames_to_save = fps * 20
504
+ frame_count = 0
505
+
506
+ while cap.isOpened() and frame_count < frames_to_save:
507
+ ret, frame = cap.read()
508
+ if not ret:
509
+ break
510
+ # Resize the frame while keeping the aspect ratio
511
+ resized_frame = cv2.resize(frame, (new_width, new_height))
512
+ # resized_frame = frame
513
+ out.write(resized_frame)
514
+ frame_count += 1
515
+
516
+ cap.release()
517
+ out.release()
518
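+ # re-time the trimmed clip to 30 fps with ffmpeg's motion-compensated interpolation (minterpolate)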
+ command = [
519
+ "ffmpeg",
520
+ "-i",
521
+ output_path.replace(".mp4", "_fps.mp4"),
522
+ "-vf",
523
+ "minterpolate=fps=30:mi_mode=mci:mc_mode=aobmc:vsbmc=1",
524
+ output_path,
525
+ ]
526
+ subprocess.run(command)
527
+ os.remove(output_path.replace(".mp4", "_fps.mp4"))
528
+
529
+
530
+ character_name_to_yaml = {
531
+ "speaker8_jjRWaMCWs44_00-00-30.16_00-00-33.32.mp4": "./datasets/data_json/youtube_test/speaker8.json",
532
+ "speaker7_iuYlGRnC7J8_00-00-0.00_00-00-3.25.mp4": "./datasets/data_json/youtube_test/speaker7.json",
533
+ "speaker9_o7Ik1OB4TaE_00-00-38.15_00-00-42.33.mp4": "./datasets/data_json/youtube_test/speaker9.json",
534
+ "1wrQ6Msp7wM_00-00-39.69_00-00-45.68.mp4": "./datasets/data_json/youtube_test/speaker1.json",
535
+ "101099-00_18_09-00_18_19.mp4": "./datasets/data_json/show_oliver_test/Stupid_Watergate_-_Last_Week_Tonight_with_John_Oliver_HBO-FVFdsl29s_Q.mkv.json",
536
+ }
537
+
538
+
539
+ TARGET_SR = 16000
540
+ OUTPUT_DIR = os.path.join(SCRIPT_PATH, "outputs/")
541
+
542
+
543
+ # @spaces.GPU(duration=200)
544
+ def tango(audio_path, character_name, seed, create_graph=False, video_folder_path=None):
545
+ shutil.rmtree(OUTPUT_DIR, ignore_errors=True)
546
+ os.makedirs(OUTPUT_DIR, exist_ok=True)
547
+ cfg_file = os.path.join(SCRIPT_PATH, "configs/gradio.yaml")
548
+ cfg = prepare_all(cfg_file)
549
+ cfg.seed = seed
550
+ seed_everything(cfg.seed)
551
+ experiment_ckpt_dir = os.path.join(OUTPUT_DIR, cfg.exp_name)
552
+ saved_audio_path = os.path.join(OUTPUT_DIR, "saved_audio.wav")
553
+ sample_rate, audio_waveform = audio_path
554
+ sf.write(saved_audio_path, audio_waveform, sample_rate)
555
+
556
+ audio_waveform, sample_rate = librosa.load(saved_audio_path)
557
+ # print(audio_waveform.shape)
558
+ resampled_audio = librosa.resample(audio_waveform, orig_sr=sample_rate, target_sr=TARGET_SR)
559
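+ # keep roughly the first 8.5 s of audio (two 128-frame pose windows at 30 fps) to bound runtime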
+ required_length = int(TARGET_SR * (128 / 30)) * 2
560
+ resampled_audio = resampled_audio[:required_length]
561
+ sf.write(saved_audio_path, resampled_audio, TARGET_SR)
562
+ audio_path = saved_audio_path
563
+
564
+ yaml_name = os.path.join(SCRIPT_PATH, "datasets/data_json/youtube_test/speaker1.json")
565
+ cfg.data.test_meta_paths = yaml_name
566
+ print(yaml_name)
567
+
568
+ video_folder_path = os.path.join(OUTPUT_DIR, "tmpvideo")
569
+ if os.path.basename(character_name) not in character_name_to_yaml.keys():
570
+ create_graph = True
571
+ # load the video and save its first 20 s to "./save_video.mp4"
572
+ os.makedirs(video_folder_path, exist_ok=True)
573
+ save_first_10_seconds(character_name, os.path.join(video_folder_path, "save_video.mp4"))
574
+
575
+ if create_graph:
576
+ data_save_path = os.path.join(OUTPUT_DIR, "tmpdata")
577
+ json_save_path = os.path.join(OUTPUT_DIR, "save_video.json")
578
+ graph_save_path = os.path.join(OUTPUT_DIR, "save_video.pkl")
579
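+ # run SMPLer-X to extract per-frame SMPL-X parameters from the reference video, then build the motion graph from the resulting json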
+ cmd_smplx = f"cd ./SMPLer-X/ && python app.py --video_folder_path {video_folder_path} --data_save_path {data_save_path} --json_save_path {json_save_path} && cd .."
580
+ subprocess.run(cmd_smplx, shell=True)
581
+ print("cmd_smplx: ", cmd_smplx)
582
+ cmd_graph = f"python ./create_graph.py --json_save_path {json_save_path} --graph_save_path {graph_save_path}"
583
+ subprocess.run(cmd_graph, shell=True)
584
+ print("cmd_graph: ", cmd_graph)
585
+ cfg.data.test_meta_paths = json_save_path
586
+ gc.collect()
587
+ torch.cuda.empty_cache()
588
+
589
+ smplx_model = smplx.create(
590
+ "./emage/smplx_models/",
591
+ model_type="smplx",
592
+ gender="NEUTRAL_2020",
593
+ use_face_contour=False,
594
+ num_betas=300,
595
+ num_expression_coeffs=100,
596
+ ext="npz",
597
+ use_pca=False,
598
+ )
599
+ model = init_class(cfg.model.name_pyfile, cfg.model.class_name, cfg)
600
+ for param in model.parameters():
601
+ param.requires_grad = False
602
+ model.smplx_model = smplx_model
603
+ model.get_motion_reps = get_motion_reps_tensor
604
+ assert torch.cuda.is_available(), "CUDA is not available"
605
+ device = torch.device("cuda:0")
606
+ smplx_model = smplx_model.to(device).eval()
607
+ model = model.to(device)
608
+ model.smplx_model = model.smplx_model.to(device)
609
+
610
+ checkpoint_path = os.path.join(SCRIPT_PATH, "datasets/cached_ckpts/ckpt.pth")
611
+ checkpoint = torch.load(checkpoint_path)
612
+ state_dict = checkpoint["model_state_dict"]
613
+ new_state_dict = {k.replace("module.", ""): v for k, v in state_dict.items()}
614
+ model.load_state_dict(new_state_dict, strict=False)
615
+
616
+ test_path = os.path.join(experiment_ckpt_dir, f"test_{0}")
617
+ os.makedirs(test_path, exist_ok=True)
618
+ result = test_fn(model, device, 0, cfg.data.test_meta_paths, test_path, cfg, audio_path, create_graph=create_graph)
619
+ gc.collect()
620
+ torch.cuda.empty_cache()
621
+ return result
622
+
623
+
624
+ examples_audio = [
625
+ ["./datasets/cached_audio/example_male_voice_9_seconds.wav"],
626
+ ["./datasets/cached_audio/example_female_voice_9_seconds.wav"],
627
+ ]
628
+
629
+ examples_video = [
630
+ ["./datasets/cached_audio/speaker8_jjRWaMCWs44_00-00-30.16_00-00-33.32.mp4"],
631
+ ["./datasets/cached_audio/speaker7_iuYlGRnC7J8_00-00-0.00_00-00-3.25.mp4"],
632
+ ["./datasets/cached_audio/speaker9_o7Ik1OB4TaE_00-00-38.15_00-00-42.33.mp4"],
633
+ ["./datasets/cached_audio/1wrQ6Msp7wM_00-00-39.69_00-00-45.68.mp4"],
634
+ ["./datasets/cached_audio/101099-00_18_09-00_18_19.mp4"],
635
+ ]
636
+
637
+ combined_examples = [
638
+ ["./datasets/cached_audio/example_female_voice_9_seconds.wav", "./datasets/cached_audio/female_test_V1.mp4", 2024],
639
+ ]
640
+
641
+
642
+ def make_demo():
643
+ with gr.Blocks(analytics_enabled=False) as Interface:
644
+ gr.Markdown(
645
+ """
646
+ <div style="display: flex; justify-content: center; align-items: center; text-align: center;">
647
+ <div>
648
+ <h1>TANGO</h1>
649
+ <span>Generating full-body talking videos from audio and reference video</span>
650
+ <h2 style='font-weight: 450; font-size: 1rem; margin: 0rem'>\
651
+ <a href='https://h-liu1997.github.io/'>Haiyang Liu</a>, \
652
+ <a href='https://yangxingchao.github.io/'>Xingchao Yang</a>, \
653
+ <a href=''>Tomoya Akiyama</a>, \
654
+ <a href='https://sky24h.github.io/'> Yuantian Huang</a>, \
655
+ <a href=''>Qiaoge Li</a>, \
656
+ <a href='https://www.tut.ac.jp/english/university/faculty/cs/164.html'>Shigeru Kuriyama</a>, \
657
+ <a href='https://taketomitakafumi.sakura.ne.jp/web/en/'>Takafumi Taketomi</a>\
658
+ </h2>
659
+ <br>
660
+ <div style="display: flex; justify-content: center; align-items: center; text-align: center;">
661
+ <a href="https://arxiv.org/abs/2410.04221"><img src="https://img.shields.io/badge/arXiv-2410.04221-blue"></a>
662
+ &nbsp;
663
+ <a href="https://pantomatrix.github.io/TANGO/"><img src="https://img.shields.io/badge/Project_Page-TANGO-orange" alt="Project Page"></a>
664
+ &nbsp;
665
+ <a href="https://github.com/CyberAgentAILab/TANGO"><img src="https://img.shields.io/badge/Github-Code-green"></a>
666
+ &nbsp;
667
+ <a href="https://github.com/CyberAgentAILab/TANGO"><img src="https://img.shields.io/badge/stars-CyberAgentAILab%2FTANGO-green" alt="GitHub stars"></a>
669
+ </div>
670
+ </div>
671
+ </div>
672
+ """
673
+ )
674
+
675
+ # Create a gallery with 5 videos
676
+ with gr.Row():
677
+ gr.Video(value="./datasets/cached_audio/demo1.mp4", label="Demo 0", watermark="./datasets/watermark.png")
678
+ gr.Video(value="./datasets/cached_audio/demo2.mp4", label="Demo 1", watermark="./datasets/watermark.png")
679
+ gr.Video(value="./datasets/cached_audio/demo3.mp4", label="Demo 2", watermark="./datasets/watermark.png")
680
+ gr.Video(value="./datasets/cached_audio/demo4.mp4", label="Demo 3", watermark="./datasets/watermark.png")
681
+ gr.Video(value="./datasets/cached_audio/demo5.mp4", label="Demo 4", watermark="./datasets/watermark.png")
682
+ with gr.Row():
683
+ gr.Video(value="./datasets/cached_audio/demo6.mp4", label="Demo 5", watermark="./datasets/watermark.png")
684
+ gr.Video(value="./datasets/cached_audio/demo0.mp4", label="Demo 6", watermark="./datasets/watermark.png")
685
+ gr.Video(value="./datasets/cached_audio/demo7.mp4", label="Demo 7", watermark="./datasets/watermark.png")
686
+ gr.Video(value="./datasets/cached_audio/demo8.mp4", label="Demo 8", watermark="./datasets/watermark.png")
687
+ gr.Video(value="./datasets/cached_audio/demo9.mp4", label="Demo 9", watermark="./datasets/watermark.png")
688
+
689
+ with gr.Row():
690
+ gr.Markdown(
691
+ """
692
+ <div style="display: flex; justify-content: center; align-items: center; text-align: center;">
693
+ This open-source demo runs locally in low-quality mode. Some results generated in high-quality mode are shown above.
694
+ <br>
695
+ News:
696
+ <br>
697
+ [10/15]: Added a watermark, fixed custom-character bugs by downgrading to Python 3.9, and fixed support for audio shorter than 4 s.
698
+ </div>
699
+ """
700
+ )
701
+
702
+ with gr.Row():
703
+ with gr.Column(scale=4):
704
+ video_output_1 = gr.Video(
705
+ label="Generated video - 1",
706
+ interactive=False,
707
+ autoplay=False,
708
+ loop=False,
709
+ show_share_button=True,
710
+ watermark="./datasets/watermark.png",
711
+ )
712
+ with gr.Column(scale=4):
713
+ video_output_2 = gr.Video(
714
+ label="Generated video - 2",
715
+ interactive=False,
716
+ autoplay=False,
717
+ loop=False,
718
+ show_share_button=True,
719
+ watermark="./datasets/watermark.png",
720
+ )
721
+ with gr.Column(scale=1):
722
+ file_output_1 = gr.File(label="Download 3D Motion and Visualize in Blender")
723
+ file_output_2 = gr.File(label="Download 3D Motion and Visualize in Blender")
724
+ gr.Markdown("""
725
+ <div style="display: flex; justify-content: center; align-items: center; text-align: left;">
726
+ Details of the low-quality mode:
727
+ <br>
728
+ 1. lower resolution: the video is resized so its long side is 512 px, keeping the aspect ratio.
729
+ <br>
730
+ 2. a subgraph is used instead of the full graph, causing noticeable "frame jumps".
731
+ <br>
732
+ 3. only the first 8 s of your input audio are used.
733
+ <br>
734
+ 4. only the first 20 s of your input video are used for a custom character. With a custom character, only one video is generated, without "smoothing", to save time.
735
+ <br>
736
+ 5. open-source tools such as SMPLer-X (small model), Wav2Lip, and FiLM are used for faster processing.
737
+ <br>
738
+ <br>
739
+ Feel free to open an issue on GitHub or contact the authors if this does not meet your needs.
740
+ </div>
741
+ """)
742
+
743
+ with gr.Row():
744
+ with gr.Column(scale=1):
745
+ audio_input = gr.Audio(label="Upload your audio")
746
+ seed_input = gr.Number(label="Seed", value=2024, interactive=True)
747
+ with gr.Column(scale=2):
748
+ gr.Examples(
749
+ examples=examples_audio,
750
+ inputs=[audio_input],
751
+ outputs=[video_output_1, video_output_2, file_output_1, file_output_2],
752
+ label="Select existing Audio examples",
753
+ cache_examples=False,
754
+ )
755
+ with gr.Column(scale=1):
756
+ video_input = gr.Video(label="Your Character", elem_classes="video")
757
+ with gr.Column(scale=2):
758
+ gr.Examples(
759
+ examples=examples_video,
760
+ inputs=[video_input], # Correctly refer to video input
761
+ outputs=[video_output_1, video_output_2, file_output_1, file_output_2],
762
+ label="Character Examples",
763
+ cache_examples=False,
764
+ )
765
+
766
+ # Fourth row: Generate video button
767
+ with gr.Row():
768
+ run_button = gr.Button("Generate Video")
769
+
770
+ # Define button click behavior
771
+ run_button.click(
772
+ fn=tango,
773
+ inputs=[audio_input, video_input, seed_input],
774
+ outputs=[video_output_1, video_output_2, file_output_1, file_output_2],
775
+ )
776
+
777
+ with gr.Row():
778
+ with gr.Column(scale=4):
779
+ gr.Examples(
780
+ examples=combined_examples,
781
+ inputs=[audio_input, video_input, seed_input], # Both audio and video as inputs
782
+ outputs=[video_output_1, video_output_2, file_output_1, file_output_2],
783
+ fn=tango, # Function that processes both audio and video inputs
784
+ label="Select Combined Audio and Video Examples (Cached)",
785
+ cache_examples=True,
786
+ )
787
+
788
+ return Interface
789
+
790
+
791
+ if __name__ == "__main__":
792
+ os.environ["MASTER_ADDR"] = "127.0.0.1"
793
+ os.environ["MASTER_PORT"] = "8675"
794
+
795
+ demo = make_demo()
796
+ demo.launch(share=True)
assets/app.py ADDED
@@ -0,0 +1,149 @@
1
+ import os
2
+ import shutil
3
+ import argparse
4
+ import re
5
+ import json
6
+ import numpy as np
7
+ import cv2
8
+ import torch
9
+ from tqdm import tqdm
10
+
11
+ try:
12
+ import mmpose # noqa: F401
13
+ except Exception as e:
14
+ print(e)
15
+ print("mmpose error, installing transformer_utils")
16
+ os.system("pip install ./main/transformer_utils")
17
+
18
+
19
+ def extract_frame_number(file_name):
20
+ match = re.search(r"(\d{5})", file_name)
21
+ if match:
22
+ return int(match.group(1))
23
+ return None
24
+
25
+
26
+ def merge_npz_files(npz_files, output_file):
27
+ npz_files = sorted(npz_files, key=lambda x: extract_frame_number(os.path.basename(x)))
28
+ merged_data = {}
29
+ for file in npz_files:
30
+ data = np.load(file)
31
+ for key in data.files:
32
+ if key not in merged_data:
33
+ merged_data[key] = []
34
+ merged_data[key].append(data[key])
35
+ for key in merged_data:
36
+ merged_data[key] = np.stack(merged_data[key], axis=0)
37
+ np.savez(output_file, **merged_data)
38
+
39
+
40
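+ # repack SMPLer-X per-frame outputs into a single SMPL-X npz (betas/poses/expressions/trans) in the layout expected downstream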
+ def npz_to_npz(pkl_path, npz_path):
41
+ # Load the pickle file
42
+ pkl_example = np.load(pkl_path, allow_pickle=True)
43
+ n = pkl_example["expression"].shape[0]  # number of frames in the clip
44
+ full_pose = np.concatenate(
45
+ [
46
+ pkl_example["global_orient"],
47
+ pkl_example["body_pose"],
48
+ pkl_example["jaw_pose"],
49
+ pkl_example["leye_pose"],
50
+ pkl_example["reye_pose"],
51
+ pkl_example["left_hand_pose"],
52
+ pkl_example["right_hand_pose"],
53
+ ],
54
+ axis=1,
55
+ )
56
+ # print(full_pose.shape)
57
+ np.savez(
58
+ npz_path,
59
+ betas=np.zeros(300),
60
+ poses=full_pose.reshape(n, -1),
61
+ expressions=np.zeros((n, 100)),
62
+ trans=pkl_example["transl"].reshape(n, -1),
63
+ model="smplx2020",
64
+ gender="neutral",
65
+ mocap_frame_rate=30,
66
+ )
67
+
68
+
69
+ def get_json(root_dir, output_dir):
70
+ clips = []
71
+ dirs = os.listdir(root_dir)
72
+ all_length = 0
73
+ for dir in dirs:
74
+ if not dir.endswith(".mp4"):
75
+ continue
76
+ video_id = dir[:-4]
77
+ root = root_dir
78
+ try:
79
+ length = np.load(os.path.join(root, video_id + ".npz"), allow_pickle=True)["poses"].shape[0]
80
+ all_length += length
81
+ except Exception as e:
82
+ print("cant open ", dir, e)
83
+ continue
84
+ clip = {
85
+ "video_id": video_id,
86
+ "video_path": root,
87
+ # "audio_path": root,
88
+ "motion_path": root,
89
+ "mode": "test",
90
+ "start_idx": 0,
91
+ "end_idx": length,
92
+ }
93
+ clips.append(clip)
94
+ if all_length < 1:
95
+ print(f"skip {root_dir} because no valid frames were found")
96
+ return 0
97
+ else:
98
+ with open(output_dir, "w") as f:
99
+ json.dump(clips, f, indent=4)
100
+ return all_length
101
+
102
+
103
+ def infer(video_input, in_threshold, num_people, render_mesh, inferer, OUT_FOLDER):
104
+ shutil.rmtree(f"{OUT_FOLDER}/smplx", ignore_errors=True)
105
+ os.makedirs(f"{OUT_FOLDER}/smplx", exist_ok=True)
106
+ multi_person = num_people
107
+ cap = cv2.VideoCapture(video_input)
108
+ video_name = os.path.basename(video_input)
109
+ success = 1
110
+ frame = 0
111
+ while success:
112
+ success, original_img = cap.read()
113
+ if not success:
114
+ break
115
+ frame += 1
116
+ _, _, _ = inferer.infer(original_img, in_threshold, frame, multi_person, not (render_mesh))
117
+ cap.release()
118
+ npz_files = [os.path.join(OUT_FOLDER, "smplx", x) for x in os.listdir(os.path.join(OUT_FOLDER, "smplx"))]
119
+
120
+ merge_npz_files(npz_files, os.path.join(OUT_FOLDER, video_name.replace(".mp4", ".npz")))
121
+ shutil.rmtree(f"{OUT_FOLDER}/smplx", ignore_errors=True)
122
+ npz_to_npz(os.path.join(OUT_FOLDER, video_name.replace(".mp4", ".npz")), os.path.join(OUT_FOLDER, video_name.replace(".mp4", ".npz")))
123
+ source = video_input
124
+ destination = os.path.join(OUT_FOLDER, video_name.replace(".mp4", ".npz")).replace(".npz", ".mp4")
125
+ shutil.copy(source, destination)
126
+
127
+
128
+ if __name__ == "__main__":
129
+ parser = argparse.ArgumentParser()
130
+ parser.add_argument("--video_folder_path", type=str, default="")
131
+ parser.add_argument("--data_save_path", type=str, default="")
132
+ parser.add_argument("--json_save_path", type=str, default="")
133
+ args = parser.parse_args()
134
+ video_folder = args.video_folder_path
135
+
136
+ DEFAULT_MODEL = "smpler_x_s32"
137
+ OUT_FOLDER = args.data_save_path
138
+ os.makedirs(OUT_FOLDER, exist_ok=True)
139
+ num_gpus = 1 if torch.cuda.is_available() else -1
140
+ index = torch.cuda.current_device()
141
+ from main.inference import Inferer
142
+
143
+ inferer = Inferer(DEFAULT_MODEL, num_gpus, OUT_FOLDER)
144
+
145
+ for video_input in tqdm(os.listdir(video_folder)):
146
+ if not video_input.endswith(".mp4"):
147
+ continue
148
+ infer(os.path.join(video_folder, video_input), 0.5, False, False, inferer, OUT_FOLDER)
149
+ get_json(OUT_FOLDER, args.json_save_path)
assets/demo0.gif ADDED

Git LFS Details

  • SHA256: 99d098b237aa762361f0e0601a8006e511ab723f4060e84ee12103ae90d6b332
  • Pointer size: 133 Bytes
  • Size of remote file: 10.8 MB
assets/demo1.gif ADDED

Git LFS Details

  • SHA256: 8cd63a26846a114f23c107b389a0e6e5f5e019a9a0dca86c156acae1e97f89e0
  • Pointer size: 133 Bytes
  • Size of remote file: 15.6 MB
assets/demo2.gif ADDED

Git LFS Details

  • SHA256: a534216bcbeeb64c8a2d45003ff40f8874a52e6a9cccf3ddc16bfce1c3f5325b
  • Pointer size: 133 Bytes
  • Size of remote file: 10.5 MB
assets/demo3.gif ADDED

Git LFS Details

  • SHA256: e8a3002546d013a88d949487b593f13d1aa34fee8715edc719c1e1d22e8f8ef0
  • Pointer size: 133 Bytes
  • Size of remote file: 20.8 MB
assets/demo5.gif ADDED

Git LFS Details

  • SHA256: 78e66e6f3ab4a17b82ec444c80851d37ce2c733990a935f50c94d33c99a4625c
  • Pointer size: 133 Bytes
  • Size of remote file: 17.5 MB
assets/demo6.gif ADDED

Git LFS Details

  • SHA256: 04ed0bbb49c4aac875c61ee3403609889f31ca04cea5a5509b6a0d07cda53b19
  • Pointer size: 132 Bytes
  • Size of remote file: 8.36 MB
assets/demo7.gif ADDED

Git LFS Details

  • SHA256: 68585fde38914834525eecd1fd48b5f9a2b01e29de86026ac859c502d71c34bb
  • Pointer size: 133 Bytes
  • Size of remote file: 17.1 MB
assets/demo8.gif ADDED

Git LFS Details

  • SHA256: 3d222731d3799de062e714f9097b8f760be1d9a0d94068379763389cf0bbb818
  • Pointer size: 132 Bytes
  • Size of remote file: 7.44 MB
assets/demo9.gif ADDED

Git LFS Details

  • SHA256: 254aff4335345befaadc571b61780c21f7e5cdf50608cd9db09a4b1a6604b811
  • Pointer size: 133 Bytes
  • Size of remote file: 12.5 MB
assets/hg.png ADDED

Git LFS Details

  • SHA256: ba83bf5215a95c94daed0d7ca0008be60403235c1467b7ae26950f7c8105892d
  • Pointer size: 132 Bytes
  • Size of remote file: 1.93 MB
assets/inference.py ADDED
@@ -0,0 +1,125 @@
1
+ # This script is modified from https://github.com/caizhongang/SMPLer-X/blob/main/main/inference.py
2
+ # Licensed under:
3
+ """
4
+ S-Lab License 1.0
5
+
6
+ Copyright 2022 S-Lab
7
+ Redistribution and use for non-commercial purpose in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
8
+ 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
9
+ 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
10
+ 3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
11
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
12
+ 4. In the event that redistribution and/or use for commercial purpose in source or binary forms, with or without modification is required, please contact the contributor(s) of the work.
13
+ """
14
+
15
+ import os
16
+ import sys
17
+ import os.path as osp
18
+ import numpy as np
19
+ import torchvision.transforms as transforms
20
+ import torch.backends.cudnn as cudnn
21
+ import torch
22
+
23
+ CUR_DIR = osp.dirname(os.path.abspath(__file__))
24
+ sys.path.insert(0, osp.join(CUR_DIR, "..", "main"))
25
+ sys.path.insert(0, osp.join(CUR_DIR, "..", "common"))
26
+ from config import cfg
27
+ from mmdet.apis import init_detector, inference_detector
28
+ from utils.inference_utils import process_mmdet_results
29
+
30
+
31
+ class Inferer:
32
+ def __init__(self, pretrained_model, num_gpus, output_folder):
33
+ self.output_folder = output_folder
34
+ self.device = torch.device("cuda") if (num_gpus > 0) else torch.device("cpu")
35
+ config_path = osp.join(CUR_DIR, "./config", f"config_{pretrained_model}.py")
36
+ ckpt_path = osp.join(CUR_DIR, "../pretrained_models", f"{pretrained_model}.pth.tar")
37
+ cfg.get_config_fromfile(config_path)
38
+ cfg.update_config(num_gpus, ckpt_path, output_folder, self.device)
39
+ self.cfg = cfg
40
+ cudnn.benchmark = True
41
+
42
+ # load model
43
+ from base import Demoer
44
+
45
+ demoer = Demoer()
46
+ demoer._make_model()
47
+ demoer.model.eval()
48
+ self.demoer = demoer
49
+ checkpoint_file = osp.join(CUR_DIR, "../pretrained_models/mmdet/faster_rcnn_r50_fpn_1x_coco_20200130-047c8118.pth")
50
+ config_file = osp.join(CUR_DIR, "../pretrained_models/mmdet/mmdet_faster_rcnn_r50_fpn_coco.py")
51
+ model = init_detector(config_file, checkpoint_file, device=self.device) # or device='cuda:0'
52
+ self.model = model
53
+
54
+ def infer(self, original_img, iou_thr, frame, multi_person=False, mesh_as_vertices=False):
55
+ from utils.preprocessing import process_bbox, generate_patch_image
56
+
57
+ mesh_paths = []
58
+ smplx_paths = []
59
+ # prepare input image
60
+ transform = transforms.ToTensor()
61
+ vis_img = original_img.copy()
62
+ original_img_height, original_img_width = original_img.shape[:2]
63
+
64
+ ## mmdet inference
65
+ mmdet_results = inference_detector(self.model, original_img)
66
+
67
+ pred_instance = mmdet_results.pred_instances.cpu().numpy()
68
+ bboxes = np.concatenate((pred_instance.bboxes, pred_instance.scores[:, None]), axis=1)
69
+ bboxes = bboxes[pred_instance.labels == 0]
70
+ bboxes = np.expand_dims(bboxes, axis=0)
71
+ mmdet_box = process_mmdet_results(bboxes, cat_id=0, multi_person=True)
72
+
73
+ # save original image if no bbox
74
+ if len(mmdet_box[0]) < 1:
75
+ return original_img, [], []
76
+
77
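+ # only the first detected bounding box is processed, so the multi_person flag has no effect on this path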
+ num_bbox = 1
78
+ mmdet_box = mmdet_box[0]
79
+
80
+ ## loop all detected bboxes
81
+ for bbox_id in range(num_bbox):
82
+ mmdet_box_xywh = np.zeros((4))
83
+ mmdet_box_xywh[0] = mmdet_box[bbox_id][0]
84
+ mmdet_box_xywh[1] = mmdet_box[bbox_id][1]
85
+ mmdet_box_xywh[2] = abs(mmdet_box[bbox_id][2] - mmdet_box[bbox_id][0])
86
+ mmdet_box_xywh[3] = abs(mmdet_box[bbox_id][3] - mmdet_box[bbox_id][1])
87
+
88
+ # skip small bboxes by bbox_thr in pixel
89
+ if mmdet_box_xywh[2] < 50 or mmdet_box_xywh[3] < 150:
90
+ continue
91
+
92
+ bbox = process_bbox(mmdet_box_xywh, original_img_width, original_img_height)
93
+ img, img2bb_trans, bb2img_trans = generate_patch_image(original_img, bbox, 1.0, 0.0, False, self.cfg.input_img_shape)
94
+ img = transform(img.astype(np.float32)) / 255
95
+ img = img.to(cfg.device)[None, :, :, :]
96
+ inputs = {"img": img}
97
+ targets = {}
98
+ meta_info = {}
99
+
100
+ # mesh recovery
101
+ with torch.no_grad():
102
+ out = self.demoer.model(inputs, targets, meta_info, "test")
103
+
104
+ ## save single person param
105
+ smplx_pred = {}
106
+ smplx_pred["global_orient"] = out["smplx_root_pose"].reshape(-1, 3).cpu().numpy()
107
+ smplx_pred["body_pose"] = out["smplx_body_pose"].reshape(-1, 3).cpu().numpy()
108
+ smplx_pred["left_hand_pose"] = out["smplx_lhand_pose"].reshape(-1, 3).cpu().numpy()
109
+ smplx_pred["right_hand_pose"] = out["smplx_rhand_pose"].reshape(-1, 3).cpu().numpy()
110
+ smplx_pred["jaw_pose"] = out["smplx_jaw_pose"].reshape(-1, 3).cpu().numpy()
111
+ smplx_pred["leye_pose"] = np.zeros((1, 3))
112
+ smplx_pred["reye_pose"] = np.zeros((1, 3))
113
+ smplx_pred["betas"] = out["smplx_shape"].reshape(-1, 10).cpu().numpy()
114
+ smplx_pred["expression"] = out["smplx_expr"].reshape(-1, 10).cpu().numpy()
115
+ smplx_pred["transl"] = out["cam_trans"].reshape(-1, 3).cpu().numpy()
116
+ save_path_smplx = os.path.join(self.output_folder, "smplx")
117
+ os.makedirs(save_path_smplx, exist_ok=True)
118
+
119
+ npz_path = os.path.join(save_path_smplx, f"{frame:05}_{bbox_id}.npz")
120
+ np.savez(npz_path, **smplx_pred)
121
+ smplx_paths.append(npz_path)
122
+
123
+ vis_img = None
124
+ mesh_paths = None
125
+ return vis_img, mesh_paths, smplx_paths
assets/transforms.py ADDED
@@ -0,0 +1,344 @@
1
+ # This script is modified from https://github.com/caizhongang/SMPLer-X/blob/main/common/utils/transforms.py
2
+ # Licensed under:
3
+ """
4
+ S-Lab License 1.0
5
+
6
+ Copyright 2022 S-Lab
7
+ Redistribution and use for non-commercial purpose in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
8
+ 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
9
+ 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
10
+ 3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
11
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
12
+ 4. In the event that redistribution and/or use for commercial purpose in source or binary forms, with or without modification is required, please contact the contributor(s) of the work.
13
+ """
14
+
15
+ """
16
+ Function rotation_matrix_to_angle_axis, rotation_matrix_to_quaternion, and quaternion_to_angle_axis are
17
+ modified from https://github.com/eglxiang/torchgeometry/blob/master/torchgeometry/core/conversions.py
18
+ The original code is licensed under the License: https://github.com/eglxiang/torchgeometry/blob/master/LICENSE
19
+ We modified the code to make it compatible with the torch>=1.9.0.
20
+ """
21
+
22
+ import torch
23
+ import numpy as np
24
+ from config import cfg
25
+ from torch.nn import functional as F
26
+
27
+
28
+ def cam2pixel(cam_coord, f, c):
29
+ x = cam_coord[:, 0] / cam_coord[:, 2] * f[0] + c[0]
30
+ y = cam_coord[:, 1] / cam_coord[:, 2] * f[1] + c[1]
31
+ z = cam_coord[:, 2]
32
+ return np.stack((x, y, z), 1)
33
+
34
+
35
+ def pixel2cam(pixel_coord, f, c):
36
+ x = (pixel_coord[:, 0] - c[0]) / f[0] * pixel_coord[:, 2]
37
+ y = (pixel_coord[:, 1] - c[1]) / f[1] * pixel_coord[:, 2]
38
+ z = pixel_coord[:, 2]
39
+ return np.stack((x, y, z), 1)
40
+
41
+
42
+ def world2cam(world_coord, R, t):
43
+ cam_coord = np.dot(R, world_coord.transpose(1, 0)).transpose(1, 0) + t.reshape(1, 3)
44
+ return cam_coord
45
+
46
+
47
+ def cam2world(cam_coord, R, t):
48
+ world_coord = np.dot(np.linalg.inv(R), (cam_coord - t.reshape(1, 3)).transpose(1, 0)).transpose(1, 0)
49
+ return world_coord
50
+
51
+
52
+ def rigid_transform_3D(A, B):
53
+ n, dim = A.shape
54
+ centroid_A = np.mean(A, axis=0)
55
+ centroid_B = np.mean(B, axis=0)
56
+ H = np.dot(np.transpose(A - centroid_A), B - centroid_B) / n
57
+ U, s, V = np.linalg.svd(H)
58
+ R = np.dot(np.transpose(V), np.transpose(U))
59
+ if np.linalg.det(R) < 0:
60
+ s[-1] = -s[-1]
61
+ V[2] = -V[2]
62
+ R = np.dot(np.transpose(V), np.transpose(U))
63
+
64
+ varP = np.var(A, axis=0).sum()
65
+ c = 1 / varP * np.sum(s)
66
+
67
+ t = -np.dot(c * R, np.transpose(centroid_A)) + np.transpose(centroid_B)
68
+ return c, R, t
69
+
70
+
71
+ def rigid_align(A, B):
72
+ c, R, t = rigid_transform_3D(A, B)
73
+ A2 = np.transpose(np.dot(c * R, np.transpose(A))) + t
74
+ return A2
75
+
76
+
77
+ def transform_joint_to_other_db(src_joint, src_name, dst_name):
78
+ src_joint_num = len(src_name)
79
+ dst_joint_num = len(dst_name)
80
+
81
+ new_joint = np.zeros(((dst_joint_num,) + src_joint.shape[1:]), dtype=np.float32)
82
+ for src_idx in range(len(src_name)):
83
+ name = src_name[src_idx]
84
+ if name in dst_name:
85
+ dst_idx = dst_name.index(name)
86
+ new_joint[dst_idx] = src_joint[src_idx]
87
+
88
+ return new_joint
89
+
90
+
91
+ def rotation_matrix_to_angle_axis(rotation_matrix):
92
+ """Convert 3x4 rotation matrix to Rodrigues vector
93
+
94
+ Args:
95
+ rotation_matrix (Tensor): rotation matrix.
96
+
97
+ Returns:
98
+ Tensor: Rodrigues vector transformation.
99
+
100
+ Shape:
101
+ - Input: :math:`(N, 3, 4)`
102
+ - Output: :math:`(N, 3)`
103
+
104
+ Example:
105
+ >>> input = torch.rand(2, 3, 4) # Nx4x4
106
+ >>> output = tgm.rotation_matrix_to_angle_axis(input) # Nx3
107
+ """
108
+ # todo add check that matrix is a valid rotation matrix
109
+ quaternion = rotation_matrix_to_quaternion(rotation_matrix)
110
+ return quaternion_to_angle_axis(quaternion)
111
+
112
+
113
+ def rotation_matrix_to_quaternion(rotation_matrix, eps=1e-6):
114
+ """Convert 3x4 rotation matrix to 4d quaternion vector
115
+
116
+ This algorithm is based on algorithm described in
117
+ https://github.com/KieranWynn/pyquaternion/blob/master/pyquaternion/quaternion.py#L201
118
+
119
+ Args:
120
+ rotation_matrix (Tensor): the rotation matrix to convert.
121
+
122
+ Return:
123
+ Tensor: the rotation in quaternion
124
+
125
+ Shape:
126
+ - Input: :math:`(N, 3, 4)`
127
+ - Output: :math:`(N, 4)`
128
+
129
+ Example:
130
+ >>> input = torch.rand(4, 3, 4) # Nx3x4
131
+ >>> output = tgm.rotation_matrix_to_quaternion(input) # Nx4
132
+ """
133
+ if not torch.is_tensor(rotation_matrix):
134
+ raise TypeError("Input type is not a torch.Tensor. Got {}".format(type(rotation_matrix)))
135
+
136
+ if len(rotation_matrix.shape) > 3:
137
+ raise ValueError("Input size must be a three dimensional tensor. Got {}".format(rotation_matrix.shape))
138
+ if not rotation_matrix.shape[-2:] == (3, 4):
139
+ raise ValueError("Input size must be a N x 3 x 4 tensor. Got {}".format(rotation_matrix.shape))
140
+
141
+ rmat_t = torch.transpose(rotation_matrix, 1, 2)
142
+
143
+ mask_d2 = rmat_t[:, 2, 2] < eps
144
+
145
+ mask_d0_d1 = rmat_t[:, 0, 0] > rmat_t[:, 1, 1]
146
+ mask_d0_nd1 = rmat_t[:, 0, 0] < -rmat_t[:, 1, 1]
147
+
148
+ t0 = 1 + rmat_t[:, 0, 0] - rmat_t[:, 1, 1] - rmat_t[:, 2, 2]
149
+ q0 = torch.stack([rmat_t[:, 1, 2] - rmat_t[:, 2, 1], t0, rmat_t[:, 0, 1] + rmat_t[:, 1, 0], rmat_t[:, 2, 0] + rmat_t[:, 0, 2]], -1)
150
+ t0_rep = t0.repeat(4, 1).t()
151
+
152
+ t1 = 1 - rmat_t[:, 0, 0] + rmat_t[:, 1, 1] - rmat_t[:, 2, 2]
153
+ q1 = torch.stack([rmat_t[:, 2, 0] - rmat_t[:, 0, 2], rmat_t[:, 0, 1] + rmat_t[:, 1, 0], t1, rmat_t[:, 1, 2] + rmat_t[:, 2, 1]], -1)
154
+ t1_rep = t1.repeat(4, 1).t()
155
+
156
+ t2 = 1 - rmat_t[:, 0, 0] - rmat_t[:, 1, 1] + rmat_t[:, 2, 2]
157
+ q2 = torch.stack([rmat_t[:, 0, 1] - rmat_t[:, 1, 0], rmat_t[:, 2, 0] + rmat_t[:, 0, 2], rmat_t[:, 1, 2] + rmat_t[:, 2, 1], t2], -1)
158
+ t2_rep = t2.repeat(4, 1).t()
159
+
160
+ t3 = 1 + rmat_t[:, 0, 0] + rmat_t[:, 1, 1] + rmat_t[:, 2, 2]
161
+ q3 = torch.stack([t3, rmat_t[:, 1, 2] - rmat_t[:, 2, 1], rmat_t[:, 2, 0] - rmat_t[:, 0, 2], rmat_t[:, 0, 1] - rmat_t[:, 1, 0]], -1)
162
+ t3_rep = t3.repeat(4, 1).t()
163
+
164
+ mask_c0 = mask_d2 * mask_d0_d1
165
+
166
+ """
167
+ Modified the code from the original source to make it compatible with the torch>=1.9.0
168
+ Original code:
169
+ mask_c1 = mask_d2 * (1 - mask_d0_d1)
170
+ mask_c2 = (1 - mask_d2) * mask_d0_nd1
171
+ mask_c3 = (1 - mask_d2) * (1 - mask_d0_nd1)
172
+ """
173
+ # From here
174
+ inv_mask_d0_d1 = ~mask_d0_d1
175
+ inv_mask_d0_nd1 = ~mask_d0_nd1
176
+ inv_mask_d2 = ~mask_d2
177
+ mask_c1 = mask_d2 * inv_mask_d0_d1
178
+ mask_c2 = inv_mask_d2 * mask_d0_nd1
179
+ mask_c3 = inv_mask_d2 * inv_mask_d0_nd1
180
+ # Until here
181
+
182
+ mask_c0 = mask_c0.view(-1, 1).type_as(q0)
183
+ mask_c1 = mask_c1.view(-1, 1).type_as(q1)
184
+ mask_c2 = mask_c2.view(-1, 1).type_as(q2)
185
+ mask_c3 = mask_c3.view(-1, 1).type_as(q3)
186
+
187
+ q = q0 * mask_c0 + q1 * mask_c1 + q2 * mask_c2 + q3 * mask_c3
188
+ q /= torch.sqrt(
189
+ t0_rep * mask_c0
190
+ + t1_rep * mask_c1 # noqa
191
+ + t2_rep * mask_c2
192
+ + t3_rep * mask_c3
193
+ ) # noqa
194
+ q *= 0.5
195
+ return q
196
+
197
+
198
+ def quaternion_to_angle_axis(quaternion: torch.Tensor) -> torch.Tensor:
199
+ """Convert quaternion vector to angle axis of rotation.
200
+
201
+ Adapted from ceres C++ library: ceres-solver/include/ceres/rotation.h
202
+
203
+ Args:
204
+ quaternion (torch.Tensor): tensor with quaternions.
205
+
206
+ Return:
207
+ torch.Tensor: tensor with angle axis of rotation.
208
+
209
+ Shape:
210
+ - Input: :math:`(*, 4)` where `*` means, any number of dimensions
211
+ - Output: :math:`(*, 3)`
212
+
213
+ Example:
214
+ >>> quaternion = torch.rand(2, 4) # Nx4
215
+ >>> angle_axis = tgm.quaternion_to_angle_axis(quaternion) # Nx3
216
+ """
217
+ if not torch.is_tensor(quaternion):
218
+ raise TypeError("Input type is not a torch.Tensor. Got {}".format(type(quaternion)))
219
+
220
+ if not quaternion.shape[-1] == 4:
221
+ raise ValueError("Input must be a tensor of shape Nx4 or 4. Got {}".format(quaternion.shape))
222
+ # unpack input and compute conversion
223
+ q1: torch.Tensor = quaternion[..., 1]
224
+ q2: torch.Tensor = quaternion[..., 2]
225
+ q3: torch.Tensor = quaternion[..., 3]
226
+ sin_squared_theta: torch.Tensor = q1 * q1 + q2 * q2 + q3 * q3
227
+
228
+ sin_theta: torch.Tensor = torch.sqrt(sin_squared_theta)
229
+ cos_theta: torch.Tensor = quaternion[..., 0]
230
+ two_theta: torch.Tensor = 2.0 * torch.where(cos_theta < 0.0, torch.atan2(-sin_theta, -cos_theta), torch.atan2(sin_theta, cos_theta))
231
+
232
+ k_pos: torch.Tensor = two_theta / sin_theta
233
+ k_neg: torch.Tensor = 2.0 * torch.ones_like(sin_theta)
234
+ k: torch.Tensor = torch.where(sin_squared_theta > 0.0, k_pos, k_neg)
235
+
236
+ angle_axis: torch.Tensor = torch.zeros_like(quaternion)[..., :3]
237
+ angle_axis[..., 0] += q1 * k
238
+ angle_axis[..., 1] += q2 * k
239
+ angle_axis[..., 2] += q3 * k
240
+ return angle_axis
241
+
242
+
243
+ def rot6d_to_axis_angle(x):
244
+ batch_size = x.shape[0]
245
+
246
+ x = x.view(-1, 3, 2)
247
+ a1 = x[:, :, 0]
248
+ a2 = x[:, :, 1]
249
+ b1 = F.normalize(a1)
250
+ b2 = F.normalize(a2 - torch.einsum("bi,bi->b", b1, a2).unsqueeze(-1) * b1)
251
+ b3 = torch.cross(b1, b2)
252
+ rot_mat = torch.stack((b1, b2, b3), dim=-1) # 3x3 rotation matrix
253
+
254
+ rot_mat = torch.cat([rot_mat, torch.zeros((batch_size, 3, 1)).to(cfg.device).float()], 2) # 3x4 rotation matrix
255
+ axis_angle = rotation_matrix_to_angle_axis(rot_mat).reshape(-1, 3) # axis-angle
256
+ axis_angle[torch.isnan(axis_angle)] = 0.0
257
+ return axis_angle
258
+
259
+
260
+ def sample_joint_features(img_feat, joint_xy):
261
+ height, width = img_feat.shape[2:]
262
+ x = joint_xy[:, :, 0] / (width - 1) * 2 - 1
263
+ y = joint_xy[:, :, 1] / (height - 1) * 2 - 1
264
+ grid = torch.stack((x, y), 2)[:, :, None, :]
265
+ img_feat = F.grid_sample(img_feat, grid, align_corners=True)[:, :, :, 0] # batch_size, channel_dim, joint_num
266
+ img_feat = img_feat.permute(0, 2, 1).contiguous() # batch_size, joint_num, channel_dim
267
+ return img_feat
268
+
269
+
270
+ def soft_argmax_2d(heatmap2d):
271
+ batch_size = heatmap2d.shape[0]
272
+ height, width = heatmap2d.shape[2:]
273
+ heatmap2d = heatmap2d.reshape((batch_size, -1, height * width))
274
+ heatmap2d = F.softmax(heatmap2d, 2)
275
+ heatmap2d = heatmap2d.reshape((batch_size, -1, height, width))
276
+
277
+ accu_x = heatmap2d.sum(dim=(2))
278
+ accu_y = heatmap2d.sum(dim=(3))
279
+
280
+ accu_x = accu_x * torch.arange(width).float().to(cfg.device)[None, None, :]
281
+ accu_y = accu_y * torch.arange(height).float().to(cfg.device)[None, None, :]
282
+
283
+ accu_x = accu_x.sum(dim=2, keepdim=True)
284
+ accu_y = accu_y.sum(dim=2, keepdim=True)
285
+
286
+ coord_out = torch.cat((accu_x, accu_y), dim=2)
287
+ return coord_out
288
+
289
+
290
+ def soft_argmax_3d(heatmap3d):
291
+ batch_size = heatmap3d.shape[0]
292
+ depth, height, width = heatmap3d.shape[2:]
293
+ heatmap3d = heatmap3d.reshape((batch_size, -1, depth * height * width))
294
+ heatmap3d = F.softmax(heatmap3d, 2)
295
+ heatmap3d = heatmap3d.reshape((batch_size, -1, depth, height, width))
296
+
297
+ accu_x = heatmap3d.sum(dim=(2, 3))
298
+ accu_y = heatmap3d.sum(dim=(2, 4))
299
+ accu_z = heatmap3d.sum(dim=(3, 4))
300
+
301
+ accu_x = accu_x * torch.arange(width).float().to(cfg.device)[None, None, :]
302
+ accu_y = accu_y * torch.arange(height).float().to(cfg.device)[None, None, :]
303
+ accu_z = accu_z * torch.arange(depth).float().to(cfg.device)[None, None, :]
304
+
305
+ accu_x = accu_x.sum(dim=2, keepdim=True)
306
+ accu_y = accu_y.sum(dim=2, keepdim=True)
307
+ accu_z = accu_z.sum(dim=2, keepdim=True)
308
+
309
+ coord_out = torch.cat((accu_x, accu_y, accu_z), dim=2)
310
+ return coord_out
311
+
312
+
313
+ def restore_bbox(bbox_center, bbox_size, aspect_ratio, extension_ratio):
314
+ bbox = bbox_center.view(-1, 1, 2) + torch.cat(
315
+ (-bbox_size.view(-1, 1, 2) / 2.0, bbox_size.view(-1, 1, 2) / 2.0), 1
316
+ ) # xyxy in (cfg.output_hm_shape[2], cfg.output_hm_shape[1]) space
317
+ bbox[:, :, 0] = bbox[:, :, 0] / cfg.output_hm_shape[2] * cfg.input_body_shape[1]
318
+ bbox[:, :, 1] = bbox[:, :, 1] / cfg.output_hm_shape[1] * cfg.input_body_shape[0]
319
+ bbox = bbox.view(-1, 4)
320
+
321
+ # xyxy -> xywh
322
+ bbox[:, 2] = bbox[:, 2] - bbox[:, 0]
323
+ bbox[:, 3] = bbox[:, 3] - bbox[:, 1]
324
+
325
+ # aspect ratio preserving bbox
326
+ w = bbox[:, 2]
327
+ h = bbox[:, 3]
328
+ c_x = bbox[:, 0] + w / 2.0
329
+ c_y = bbox[:, 1] + h / 2.0
330
+
331
+ mask1 = w > (aspect_ratio * h)
332
+ mask2 = w < (aspect_ratio * h)
333
+ h[mask1] = w[mask1] / aspect_ratio
334
+ w[mask2] = h[mask2] * aspect_ratio
335
+
336
+ bbox[:, 2] = w * extension_ratio
337
+ bbox[:, 3] = h * extension_ratio
338
+ bbox[:, 0] = c_x - bbox[:, 2] / 2.0
339
+ bbox[:, 1] = c_y - bbox[:, 3] / 2.0
340
+
341
+ # xywh -> xyxy
342
+ bbox[:, 2] = bbox[:, 2] + bbox[:, 0]
343
+ bbox[:, 3] = bbox[:, 3] + bbox[:, 1]
344
+ return bbox
assets/video.png ADDED

Git LFS Details

  • SHA256: 7eb58de872d8a85add21be077d2358b5c887b54bc44ecd5480b5139b7d17ea37
  • Pointer size: 132 Bytes
  • Size of remote file: 7.64 MB
audio_0_retri_0_watermarked.mp4 ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7af0a48abf6efb3195378081c8018b809e25ee6e24e5d731cfa989b3d671fe43
3
+ size 1022865
configs/gradio.yaml ADDED
@@ -0,0 +1,77 @@
1
+ wandb_project: 'TANGO'
2
+ exp_name: 'debug'
3
+
4
+ wandb_entity: ''
5
+ wandb_key: ""
6
+ wandb_log_dir: '/content/outputs/wandb'
7
+ output_dir: ./outputs/
8
+ log_period: 1
9
+ seed: 42
10
+
11
+ data:
12
+ name_pyfile: "datasets.beat2_v5"
13
+ class_name: "BEAT2Dataset"
14
+ train_bs: 2
15
+ meta_paths:
16
+ - "./datasets/data_json/show-oliver-s40_w128.json"
17
+ # test_meta_paths: "./datasets/data_json/show_oliver_test/Abortion_Laws_-_Last_Week_Tonight_with_John_Oliver_HBO-DRauXXz6t0Y.webm.json"
18
+ "test_meta_paths": "./datasets/data_json/show_oliver_test/Stupid_Watergate_-_Last_Week_Tonight_with_John_Oliver_HBO-FVFdsl29s_Q.mkv.json"
19
+ pose_norm: False
20
+ pose_fps: 30
21
+ rot6d: True
22
+ pose_dims: 825
23
+ pose_length: 128
24
+ stride: 20
25
+ test_length: 128
26
+ audio_sr: 16000
27
+ audio_fps: 16000
28
+
29
+ model:
30
+ name_pyfile: "models.jointembedding_high_env0"
31
+ class_name: "JointEmbedding"
32
+ motion_f: 256
33
+ audio_rep: wave16k
34
+ audio_sr: 16000
35
+ audio_fps: 16000
36
+ audio_norm: False
37
+ audio_f: 256
38
+ word_rep: textgrid
39
+ word_index_num: 11195
40
+ word_dims: 300
41
+ facial_rep: smplxflame_30
42
+ facial_dims: 100
43
+ facial_norm: False
44
+ facial_f: 0
45
+ f_pre_encoder: null
46
+ f_encoder: null
47
+ f_fix_pre: False
48
+ id_rep: onehot
49
+ speaker_f: 0
50
+ hidden_size: 512
51
+ n_layer: 1
52
+ motion_dim: 825
53
+
54
+ validation:
55
+ val_loss_steps: 1
56
+ validation_steps: 1000
57
+ # guidance_scale: 3.5
58
+ # denoising_steps: 20
59
+
60
+ solver:
61
+ gradient_accumulation_steps: 1
62
+ # mixed_precision: 'fp16'
63
+ # enable_xformers_memory_efficient_attention: True
64
+ gradient_checkpointing: False
65
+ max_train_steps: 5000000
66
+ max_grad_norm: 1.0
67
+ # lr
68
+ learning_rate: 2e-5
69
+ scale_lr: False
70
+ lr_warmup_steps: 50
71
+ lr_scheduler: 'constant'
72
+ # optimizer
73
+ use_8bit_adam: False
74
+ adam_beta1: 0.9
75
+ adam_beta2: 0.999
76
+ adam_weight_decay: 1.0e-2
77
+ adam_epsilon: 1.0e-8
configs/gradio_speaker1.yaml ADDED
@@ -0,0 +1,77 @@
1
+ wandb_project: 'TANGO'
2
+ exp_name: 'debug'
3
+
4
+ wandb_entity: ''
5
+ wandb_key: ""
6
+ wandb_log_dir: '/content/outputs/wandb'
7
+ output_dir: ./outputs/
8
+ log_period: 1
9
+ seed: 42
10
+
11
+ data:
12
+ name_pyfile: "datasets.beat2_v5"
13
+ class_name: "BEAT2Dataset"
14
+ train_bs: 2
15
+ meta_paths:
16
+ - "./datasets/data_json/show-oliver-s40_w128.json"
17
+ # test_meta_paths: "./datasets/data_json/show_oliver_test/Abortion_Laws_-_Last_Week_Tonight_with_John_Oliver_HBO-DRauXXz6t0Y.webm.json"
18
+ "test_meta_paths": "./datasets/data_json/youtube_test/speaker1.json"
19
+ pose_norm: False
20
+ pose_fps: 30
21
+ rot6d: True
22
+ pose_dims: 825
23
+ pose_length: 128
24
+ stride: 20
25
+ test_length: 128
26
+ audio_sr: 16000
27
+ audio_fps: 16000
28
+
29
+ model:
30
+ name_pyfile: "models.jointembedding_high_env0"
31
+ class_name: "JointEmbedding"
32
+ motion_f: 256
33
+ audio_rep: wave16k
34
+ audio_sr: 16000
35
+ audio_fps: 16000
36
+ audio_norm: False
37
+ audio_f: 256
38
+ word_rep: textgrid
39
+ word_index_num: 11195
40
+ word_dims: 300
41
+ facial_rep: smplxflame_30
42
+ facial_dims: 100
43
+ facial_norm: False
44
+ facial_f: 0
45
+ f_pre_encoder: null
46
+ f_encoder: null
47
+ f_fix_pre: False
48
+ id_rep: onehot
49
+ speaker_f: 0
50
+ hidden_size: 512
51
+ n_layer: 1
52
+ motion_dim: 825
53
+
54
+ validation:
55
+ val_loss_steps: 1
56
+ validation_steps: 1000
57
+ # guidance_scale: 3.5
58
+ # denoising_steps: 20
59
+
60
+ solver:
61
+ gradient_accumulation_steps: 1
62
+ # mixed_precision: 'fp16'
63
+ # enable_xformers_memory_efficient_attention: True
64
+ gradient_checkpointing: False
65
+ max_train_steps: 5000000
66
+ max_grad_norm: 1.0
67
+ # lr
68
+ learning_rate: 2e-5
69
+ scale_lr: False
70
+ lr_warmup_steps: 50
71
+ lr_scheduler: 'constant'
72
+ # optimizer
73
+ use_8bit_adam: False
74
+ adam_beta1: 0.9
75
+ adam_beta2: 0.999
76
+ adam_weight_decay: 1.0e-2
77
+ adam_epsilon: 1.0e-8
configs/gradio_speaker7.yaml ADDED
@@ -0,0 +1,77 @@
1
+ wandb_project: 'TANGO'
2
+ exp_name: 'debug'
3
+
4
+ wandb_entity: ''
5
+ wandb_key: ""
6
+ wandb_log_dir: '/content/outputs/wandb'
7
+ output_dir: ./outputs/
8
+ log_period: 1
9
+ seed: 42
10
+
11
+ data:
12
+ name_pyfile: "datasets.beat2_v5"
13
+ class_name: "BEAT2Dataset"
14
+ train_bs: 2
15
+ meta_paths:
16
+ - "./datasets/data_json/show-oliver-s40_w128.json"
17
+ # test_meta_paths: "./datasets/data_json/show_oliver_test/Abortion_Laws_-_Last_Week_Tonight_with_John_Oliver_HBO-DRauXXz6t0Y.webm.json"
18
+ "test_meta_paths": "./datasets/data_json/youtube_test/speaker7.json"
19
+ pose_norm: False
20
+ pose_fps: 30
21
+ rot6d: True
22
+ pose_dims: 825
23
+ pose_length: 128
24
+ stride: 20
25
+ test_length: 128
26
+ audio_sr: 16000
27
+ audio_fps: 16000
28
+
29
+ model:
30
+ name_pyfile: "models.jointembedding_high_env0"
31
+ class_name: "JointEmbedding"
32
+ motion_f: 256
33
+ audio_rep: wave16k
34
+ audio_sr: 16000
35
+ audio_fps: 16000
36
+ audio_norm: False
37
+ audio_f: 256
38
+ word_rep: textgrid
39
+ word_index_num: 11195
40
+ word_dims: 300
41
+ facial_rep: smplxflame_30
42
+ facial_dims: 100
43
+ facial_norm: False
44
+ facial_f: 0
45
+ f_pre_encoder: null
46
+ f_encoder: null
47
+ f_fix_pre: False
48
+ id_rep: onehot
49
+ speaker_f: 0
50
+ hidden_size: 512
51
+ n_layer: 1
52
+ motion_dim: 825
53
+
54
+ validation:
55
+ val_loss_steps: 1
56
+ validation_steps: 1000
57
+ # guidance_scale: 3.5
58
+ # denoising_steps: 20
59
+
60
+ solver:
61
+ gradient_accumulation_steps: 1
62
+ # mixed_precision: 'fp16'
63
+ # enable_xformers_memory_efficient_attention: True
64
+ gradient_checkpointing: False
65
+ max_train_steps: 5000000
66
+ max_grad_norm: 1.0
67
+ # lr
68
+ learning_rate: 2e-5
69
+ scale_lr: False
70
+ lr_warmup_steps: 50
71
+ lr_scheduler: 'constant'
72
+ # optimizer
73
+ use_8bit_adam: False
74
+ adam_beta1: 0.9
75
+ adam_beta2: 0.999
76
+ adam_weight_decay: 1.0e-2
77
+ adam_epsilon: 1.0e-8
configs/gradio_speaker8.yaml ADDED
@@ -0,0 +1,77 @@
+ wandb_project: 'TANGO'
+ exp_name: 'debug'
+
+ wandb_entity: ''
+ wandb_key: ""
+ wandb_log_dir: '/content/outputs/wandb'
+ output_dir: ./outputs/
+ log_period: 1
+ seed: 42
+
+ data:
+   name_pyfile: "datasets.beat2_v5"
+   class_name: "BEAT2Dataset"
+   train_bs: 2
+   meta_paths:
+     - "./datasets/data_json/show-oliver-s40_w128.json"
+   # test_meta_paths: "./datasets/data_json/show_oliver_test/Abortion_Laws_-_Last_Week_Tonight_with_John_Oliver_HBO-DRauXXz6t0Y.webm.json"
+   "test_meta_paths": "./datasets/data_json/youtube_test/speaker8.json"
+   pose_norm: False
+   pose_fps: 30
+   rot6d: True
+   pose_dims: 825
+   pose_length: 128
+   stride: 20
+   test_length: 128
+   audio_sr: 16000
+   audio_fps: 16000
+
+ model:
+   name_pyfile: "models.jointembedding_high_env0"
+   class_name: "JointEmbedding"
+   motion_f: 256
+   audio_rep: wave16k
+   audio_sr: 16000
+   audio_fps: 16000
+   audio_norm: False
+   audio_f: 256
+   word_rep: textgrid
+   word_index_num: 11195
+   word_dims: 300
+   facial_rep: smplxflame_30
+   facial_dims: 100
+   facial_norm: False
+   facial_f: 0
+   f_pre_encoder: null
+   f_encoder: null
+   f_fix_pre: False
+   id_rep: onehot
+   speaker_f: 0
+   hidden_size: 512
+   n_layer: 1
+   motion_dim: 825
+
+ validation:
+   val_loss_steps: 1
+   validation_steps: 1000
+   # guidance_scale: 3.5
+   # denoising_steps: 20
+
+ solver:
+   gradient_accumulation_steps: 1
+   # mixed_precision: 'fp16'
+   # enable_xformers_memory_efficient_attention: True
+   gradient_checkpointing: False
+   max_train_steps: 5000000
+   max_grad_norm: 1.0
+   # lr
+   learning_rate: 2e-5
+   scale_lr: False
+   lr_warmup_steps: 50
+   lr_scheduler: 'constant'
+   # optimizer
+   use_8bit_adam: False
+   adam_beta1: 0.9
+   adam_beta2: 0.999
+   adam_weight_decay: 1.0e-2
+   adam_epsilon: 1.0e-8
configs/gradio_speaker9.yaml ADDED
@@ -0,0 +1,77 @@
+ wandb_project: 'TANGO'
+ exp_name: 'debug'
+
+ wandb_entity: ''
+ wandb_key: ""
+ wandb_log_dir: '/content/outputs/wandb'
+ output_dir: ./outputs/
+ log_period: 1
+ seed: 42
+
+ data:
+   name_pyfile: "datasets.beat2_v5"
+   class_name: "BEAT2Dataset"
+   train_bs: 2
+   meta_paths:
+     - "./datasets/data_json/show-oliver-s40_w128.json"
+   # test_meta_paths: "./datasets/data_json/show_oliver_test/Abortion_Laws_-_Last_Week_Tonight_with_John_Oliver_HBO-DRauXXz6t0Y.webm.json"
+   "test_meta_paths": "./datasets/data_json/youtube_test/speaker9.json"
+   pose_norm: False
+   pose_fps: 30
+   rot6d: True
+   pose_dims: 825
+   pose_length: 128
+   stride: 20
+   test_length: 128
+   audio_sr: 16000
+   audio_fps: 16000
+
+ model:
+   name_pyfile: "models.jointembedding_high_env0"
+   class_name: "JointEmbedding"
+   motion_f: 256
+   audio_rep: wave16k
+   audio_sr: 16000
+   audio_fps: 16000
+   audio_norm: False
+   audio_f: 256
+   word_rep: textgrid
+   word_index_num: 11195
+   word_dims: 300
+   facial_rep: smplxflame_30
+   facial_dims: 100
+   facial_norm: False
+   facial_f: 0
+   f_pre_encoder: null
+   f_encoder: null
+   f_fix_pre: False
+   id_rep: onehot
+   speaker_f: 0
+   hidden_size: 512
+   n_layer: 1
+   motion_dim: 825
+
+ validation:
+   val_loss_steps: 1
+   validation_steps: 1000
+   # guidance_scale: 3.5
+   # denoising_steps: 20
+
+ solver:
+   gradient_accumulation_steps: 1
+   # mixed_precision: 'fp16'
+   # enable_xformers_memory_efficient_attention: True
+   gradient_checkpointing: False
+   max_train_steps: 5000000
+   max_grad_norm: 1.0
+   # lr
+   learning_rate: 2e-5
+   scale_lr: False
+   lr_warmup_steps: 50
+   lr_scheduler: 'constant'
+   # optimizer
+   use_8bit_adam: False
+   adam_beta1: 0.9
+   adam_beta2: 0.999
+   adam_weight_decay: 1.0e-2
+   adam_epsilon: 1.0e-8
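All four `configs/gradio_speaker*.yaml` files share this layout and differ only in `data.test_meta_paths`. The `name_pyfile` / `class_name` pairs point at the dataset and model classes added in this upload, which suggests they are resolved dynamically at runtime. A minimal sketch of that pattern, assuming OmegaConf for YAML loading (the actual loader in `app.py` may differ) and an assumed `JointEmbedding(cfg.model)` constructor signature:

```python
import importlib
from omegaconf import OmegaConf

# Sketch only: load one of the configs above and resolve the classes it names.
cfg = OmegaConf.load("configs/gradio_speaker9.yaml")

dataset_cls = getattr(importlib.import_module(cfg.data.name_pyfile), cfg.data.class_name)
model_cls = getattr(importlib.import_module(cfg.model.name_pyfile), cfg.model.class_name)

test_dataset = dataset_cls(cfg, split="test")   # BEAT2Dataset from datasets/beat2_v5.py
model = model_cls(cfg.model)                    # constructor signature is assumed, not confirmed here
```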
create_graph.py ADDED
@@ -0,0 +1,507 @@
+ """
+ input: json file with video, audio, motion paths
+ output: igraph object whose nodes contain video, audio, motion, position, velocity, axis_angle, previous, next, frame, fps
+
+ preprocess:
+ 1. assume you have the videos for one speaker in a folder, e.g.
+    -- video_a.mp4
+    -- video_b.mp4
+ 2. run process_video.py to extract frames and audio
+ """
+
+ import os
+ import json
+ import smplx
+ import torch
+ import igraph
+ import numpy as np
+ import subprocess
+ import utils.rotation_conversions as rc
+ from moviepy.editor import VideoClip, AudioFileClip
+ from tqdm import tqdm
+ import imageio
+ import tempfile
+ import argparse
+ import time
+
+ SCRIPT_PATH = os.path.dirname(os.path.realpath(__file__))
+
+
+ def get_motion_reps_tensor(motion_tensor, smplx_model, pose_fps=30, device="cuda"):
+     bs, n, _ = motion_tensor.shape
+     motion_tensor = motion_tensor.float().to(device)
+     motion_tensor_reshaped = motion_tensor.reshape(bs * n, 165)
+
+     output = smplx_model(
+         betas=torch.zeros(bs * n, 300, device=device),
+         transl=torch.zeros(bs * n, 3, device=device),
+         expression=torch.zeros(bs * n, 100, device=device),
+         jaw_pose=torch.zeros(bs * n, 3, device=device),
+         global_orient=torch.zeros(bs * n, 3, device=device),
+         body_pose=motion_tensor_reshaped[:, 3 : 21 * 3 + 3],
+         left_hand_pose=motion_tensor_reshaped[:, 25 * 3 : 40 * 3],
+         right_hand_pose=motion_tensor_reshaped[:, 40 * 3 : 55 * 3],
+         return_joints=True,
+         leye_pose=torch.zeros(bs * n, 3, device=device),
+         reye_pose=torch.zeros(bs * n, 3, device=device),
+     )
+
+     joints = output["joints"].reshape(bs, n, 127, 3)[:, :, :55, :]
+     dt = 1 / pose_fps
+     init_vel = (joints[:, 1:2] - joints[:, 0:1]) / dt
+     middle_vel = (joints[:, 2:] - joints[:, :-2]) / (2 * dt)
+     final_vel = (joints[:, -1:] - joints[:, -2:-1]) / dt
+     vel = torch.cat([init_vel, middle_vel, final_vel], dim=1)
+
+     position = joints
+     rot_matrices = rc.axis_angle_to_matrix(motion_tensor.reshape(bs, n, 55, 3))
+     rot6d = rc.matrix_to_rotation_6d(rot_matrices).reshape(bs, n, 55, 6)
+
+     init_vel_ang = (motion_tensor[:, 1:2] - motion_tensor[:, 0:1]) / dt
+     middle_vel_ang = (motion_tensor[:, 2:] - motion_tensor[:, :-2]) / (2 * dt)
+     final_vel_ang = (motion_tensor[:, -1:] - motion_tensor[:, -2:-1]) / dt
+     angular_velocity = torch.cat([init_vel_ang, middle_vel_ang, final_vel_ang], dim=1).reshape(bs, n, 55, 3)
+
+     rep15d = torch.cat([position, vel, rot6d, angular_velocity], dim=3).reshape(bs, n, 55 * 15)
+
+     return {
+         "position": position,
+         "velocity": vel,
+         "rotation": rot6d,
+         "axis_angle": motion_tensor,
+         "angular_velocity": angular_velocity,
+         "rep15d": rep15d,
+     }
+
+
+ def get_motion_reps(motion, smplx_model, pose_fps=30):
+     gt_motion_tensor = motion["poses"]
+     n = gt_motion_tensor.shape[0]
+     bs = 1
+     # uses the module-level `device` set in __main__
+     gt_motion_tensor = torch.from_numpy(gt_motion_tensor).float().to(device).unsqueeze(0)
+     gt_motion_tensor_reshaped = gt_motion_tensor.reshape(bs * n, 165)
+     output = smplx_model(
+         betas=torch.zeros(bs * n, 300).to(device),
+         transl=torch.zeros(bs * n, 3).to(device),
+         expression=torch.zeros(bs * n, 100).to(device),
+         jaw_pose=torch.zeros(bs * n, 3).to(device),
+         global_orient=torch.zeros(bs * n, 3).to(device),
+         body_pose=gt_motion_tensor_reshaped[:, 3 : 21 * 3 + 3],
+         left_hand_pose=gt_motion_tensor_reshaped[:, 25 * 3 : 40 * 3],
+         right_hand_pose=gt_motion_tensor_reshaped[:, 40 * 3 : 55 * 3],
+         return_joints=True,
+         leye_pose=torch.zeros(bs * n, 3).to(device),
+         reye_pose=torch.zeros(bs * n, 3).to(device),
+     )
+     joints = output["joints"].detach().cpu().numpy().reshape(n, 127, 3)[:, :55, :]
+     dt = 1 / pose_fps
+     init_vel = (joints[1:2] - joints[0:1]) / dt
+     middle_vel = (joints[2:] - joints[:-2]) / (2 * dt)
+     final_vel = (joints[-1:] - joints[-2:-1]) / dt
+     vel = np.concatenate([init_vel, middle_vel, final_vel], axis=0)
+     position = joints
+     rot_matrices = rc.axis_angle_to_matrix(gt_motion_tensor.reshape(1, n, 55, 3))[0]
+     rot6d = rc.matrix_to_rotation_6d(rot_matrices).reshape(n, 55, 6).cpu().numpy()
+
+     init_vel = (motion["poses"][1:2] - motion["poses"][0:1]) / dt
+     middle_vel = (motion["poses"][2:] - motion["poses"][:-2]) / (2 * dt)
+     final_vel = (motion["poses"][-1:] - motion["poses"][-2:-1]) / dt
+     angular_velocity = np.concatenate([init_vel, middle_vel, final_vel], axis=0).reshape(n, 55, 3)
+
+     rep15d = np.concatenate([position, vel, rot6d, angular_velocity], axis=2).reshape(n, 55 * 15)
+     return {
+         "position": position,
+         "velocity": vel,
+         "rotation": rot6d,
+         "axis_angle": motion["poses"],
+         "angular_velocity": angular_velocity,
+         "rep15d": rep15d,
+         "trans": motion["trans"],
+     }
+
+
+ def create_graph(json_path, smplx_model):
+     fps = 30
+     data_meta = json.load(open(json_path, "r"))
+     graph = igraph.Graph(directed=True)
+     global_i = 0
+     for data_item in data_meta:
+         video_path = os.path.join(data_item["video_path"], data_item["video_id"] + ".mp4")
+         # audio_path = os.path.join(data_item['audio_path'], data_item['video_id'] + ".wav")
+         motion_path = os.path.join(data_item["motion_path"], data_item["video_id"] + ".npz")
+         video_id = data_item.get("video_id", "")
+         motion = np.load(motion_path, allow_pickle=True)
+         motion_reps = get_motion_reps(motion, smplx_model)
+         position = motion_reps["position"]
+         velocity = motion_reps["velocity"]
+         trans = motion_reps["trans"]
+         axis_angle = motion_reps["axis_angle"]
+         # audio, sr = librosa.load(audio_path, sr=None)
+         # audio = librosa.resample(audio, orig_sr=sr, target_sr=16000)
+         reader = imageio.get_reader(video_path)
+         all_frames = []
+         for frame in reader:
+             all_frames.append(frame)
+         video_frames = np.array(all_frames)
+         min_frames = min(len(video_frames), position.shape[0])
+         position = position[:min_frames]
+         velocity = velocity[:min_frames]
+         video_frames = video_frames[:min_frames]
+         # print(min_frames)
+         for i in tqdm(range(min_frames)):
+             if i == 0:
+                 previous = -1
+                 next_node = global_i + 1
+             elif i == min_frames - 1:
+                 previous = global_i - 1
+                 next_node = -1
+             else:
+                 previous = global_i - 1
+                 next_node = global_i + 1
+             graph.add_vertex(
+                 idx=global_i,
+                 name=video_id,
+                 motion=motion_reps,
+                 position=position[i],
+                 velocity=velocity[i],
+                 axis_angle=axis_angle[i],
+                 trans=trans[i],
+                 # audio=audio[],
+                 video=video_frames[i],
+                 previous=previous,
+                 next=next_node,
+                 frame=i,
+                 fps=fps,
+             )
+             global_i += 1
+     return graph
+
+
+ def create_edges(graph):
+     adaptive_length = [-4, -3, -2, -1, 1, 2, 3, 4]
+     # print()
+     for i, node in enumerate(graph.vs):
+         current_position = node["position"]
+         current_velocity = node["velocity"]
+         current_trans = node["trans"]
+         # print(current_position.shape, current_velocity.shape)
+         avg_position = np.zeros(current_position.shape[0])
+         avg_velocity = np.zeros(current_position.shape[0])
+         avg_trans = 0
+         count = 0
+         for node_offset in adaptive_length:
+             idx = i + node_offset
+             if idx < 0 or idx >= len(graph.vs):
+                 continue
+             if node_offset < 0:
+                 if graph.vs[idx]["next"] == -1:
+                     continue
+             else:
+                 if graph.vs[idx]["previous"] == -1:
+                     continue
+             # add check
+             other_node = graph.vs[idx]
+             other_position = other_node["position"]
+             other_velocity = other_node["velocity"]
+             other_trans = other_node["trans"]
+             # print(other_position.shape, other_velocity.shape)
+             avg_position += np.linalg.norm(current_position - other_position, axis=1)
+             avg_velocity += np.linalg.norm(current_velocity - other_velocity, axis=1)
+             avg_trans += np.linalg.norm(current_trans - other_trans, axis=0)
+             count += 1
+
+         if count == 0:
+             continue
+         threshold_position = avg_position / count
+         threshold_velocity = avg_velocity / count
+         threshold_trans = avg_trans / count
+         # print(threshold_position, threshold_velocity, threshold_trans)
+         for j, other_node in enumerate(graph.vs):
+             if i == j:
+                 continue
+             if j == node["previous"] or j == node["next"]:
+                 graph.add_edge(i, j, is_continue=1)
+                 continue
+             other_position = other_node["position"]
+             other_velocity = other_node["velocity"]
+             other_trans = other_node["trans"]
+             position_similarity = np.linalg.norm(current_position - other_position, axis=1)
+             velocity_similarity = np.linalg.norm(current_velocity - other_velocity, axis=1)
+             trans_similarity = np.linalg.norm(current_trans - other_trans, axis=0)
+             if trans_similarity < threshold_trans:
+                 if np.sum(position_similarity < threshold_position) >= 45 and np.sum(velocity_similarity < threshold_velocity) >= 45:
+                     graph.add_edge(i, j, is_continue=0)
+
+     print(f"nodes: {len(graph.vs)}, edges: {len(graph.es)}")
+     in_degrees = graph.indegree()
+     out_degrees = graph.outdegree()
+     avg_in_degree = sum(in_degrees) / len(in_degrees)
+     avg_out_degree = sum(out_degrees) / len(out_degrees)
+     print(f"Average In-degree: {avg_in_degree}")
+     print(f"Average Out-degree: {avg_out_degree}")
+     print(f"max in degree: {max(in_degrees)}, max out degree: {max(out_degrees)}")
+     print(f"min in degree: {min(in_degrees)}, min out degree: {min(out_degrees)}")
+     # igraph.plot(graph, target="/content/test.png", bbox=(1000, 1000), vertex_size=10)
+     return graph
+
+
+ def random_walk(graph, walk_length, start_node=None):
+     if start_node is None:
+         start_node = np.random.choice(graph.vs)
+     walk = [start_node]
+     is_continue = [1]
+     for _ in range(walk_length):
+         current_node = walk[-1]
+         neighbor_indices = graph.neighbors(current_node.index, mode="OUT")
+         if not neighbor_indices:
+             break
+         next_idx = np.random.choice(neighbor_indices)
+         edge_id = graph.get_eid(current_node.index, next_idx)
+         is_cont = graph.es[edge_id]["is_continue"]
+         walk.append(graph.vs[next_idx])
+         is_continue.append(is_cont)
+     return walk, is_continue
+
+
+ def path_visualization(graph, path, is_continue, save_path, verbose_continue=False, audio_path=None, return_motion=False):
+     all_frames = [node["video"] for node in path]
+     average_dis_continue = 1 - sum(is_continue) / len(is_continue)
+     if verbose_continue:
+         print("average_dis_continue:", average_dis_continue)
+
+     fps = graph.vs[0]["fps"]
+     duration = len(all_frames) / fps
+
+     def make_frame(t):
+         idx = min(int(t * fps), len(all_frames) - 1)
+         return all_frames[idx]
+
+     video_only_path = f"/tmp/video_only_{time.time()}.mp4"  # Temporary file
+     video_clip = VideoClip(make_frame, duration=duration)
+     video_clip.write_videofile(video_only_path, codec="libx264", fps=fps, audio=False)
+
+     # Optionally, ensure audio and video durations match
+     if audio_path is not None:
+         audio_clip = AudioFileClip(audio_path)
+         video_duration = video_clip.duration
+         audio_duration = audio_clip.duration
+
+         if audio_duration > video_duration:
+             # Trim the audio
+             trimmed_audio_path = "trimmed_audio.aac"
+             audio_clip = audio_clip.subclip(0, video_duration)
+             audio_clip.write_audiofile(trimmed_audio_path)
+             audio_input = trimmed_audio_path
+         else:
+             audio_input = audio_path
+
+         # Use FFmpeg to combine video and audio
+         ffmpeg_command = [
+             "ffmpeg",
+             "-y",
+             "-i",
+             video_only_path,
+             "-i",
+             audio_input,
+             "-c:v",
+             "copy",
+             "-c:a",
+             "aac",
+             "-strict",
+             "experimental",
+             save_path,
+         ]
+         subprocess.check_call(ffmpeg_command)
+
+         # Clean up temporary files if necessary
+         os.remove(video_only_path)
+         if audio_input != audio_path:
+             os.remove(audio_input)
+
+     if return_motion:
+         all_motion = [node["axis_angle"] for node in path]
+         all_motion = np.stack(all_motion, 0)
+         return all_motion
+
+
+ def generate_transition_video(frame_start_path, frame_end_path, output_video_path):
+     # Define the path to your model and inference script
+     model_path = os.path.join(SCRIPT_PATH, "frame-interpolation-pytorch/film_net_fp32.pt")
+     inference_script = os.path.join(SCRIPT_PATH, "frame-interpolation-pytorch/inference.py")
+
+     # Build the command to run the inference script
+     command = [
+         "python",
+         inference_script,
+         model_path,
+         frame_start_path,
+         frame_end_path,
+         "--save_path",
+         output_video_path,
+         "--gpu",
+         "--frames",
+         "3",
+         "--fps",
+         "30",
+     ]
+
+     # Run the command
+     try:
+         subprocess.run(command, check=True)
+         print(f"Generated transition video saved at {output_video_path}")
+     except subprocess.CalledProcessError as e:
+         print(f"Error occurred while generating transition video: {e}")
+
+
+ def path_visualization_v2(graph, path, is_continue, save_path, verbose_continue=False, audio_path=None, return_motion=False):
+     """
+     This variant is used for the Hugging Face demo, where fast frame interpolation is preferred;
+     the paper uses a diffusion-based interpolation method instead.
+     """
+     all_frames = [node["video"] for node in path]
+     average_dis_continue = 1 - sum(is_continue) / len(is_continue)
+     if verbose_continue:
+         print("average_dis_continue:", average_dis_continue)
+     duration = len(all_frames) / graph.vs[0]["fps"]
+
+     # First loop: Confirm where blending is needed
+     discontinuity_indices = []
+     for i, cont in enumerate(is_continue):
+         if cont == 0:
+             discontinuity_indices.append(i)
+
+     # Identify blending positions without overlapping
+     blend_positions = []
+     processed_frames = set()
+     for i in discontinuity_indices:
+         # Define the frames for blending: i-2 to i+2
+         start_idx = i - 2
+         end_idx = i + 2
+         # Check index boundaries
+         if start_idx < 0 or end_idx >= len(all_frames):
+             continue  # Skip if indices are out of bounds
+         # Check for overlapping frames
+         overlap = any(idx in processed_frames for idx in range(i - 1, i + 2))
+         if overlap:
+             continue  # Skip if frames have been processed
+         # Mark frames as processed
+         processed_frames.update(range(i - 1, i + 2))
+         blend_positions.append(i)
+
+     # Second loop: Perform blending
+     temp_dir = tempfile.mkdtemp(prefix="blending_frames_")
+     for i in tqdm(blend_positions):
+         start_frame_idx = i - 2
+         end_frame_idx = i + 2
+         frame_start = all_frames[start_frame_idx]
+         frame_end = all_frames[end_frame_idx]
+         frame_start_path = os.path.join(temp_dir, f"frame_{start_frame_idx}.png")
+         frame_end_path = os.path.join(temp_dir, f"frame_{end_frame_idx}.png")
+         # Save the start and end frames as images
+         imageio.imwrite(frame_start_path, frame_start)
+         imageio.imwrite(frame_end_path, frame_end)
+
+         # Call FiLM API to generate video
+         generated_video_path = os.path.join(temp_dir, f"generated_{start_frame_idx}_{end_frame_idx}.mp4")
+         generate_transition_video(frame_start_path, frame_end_path, generated_video_path)
+
+         # Read the generated video frames
+         reader = imageio.get_reader(generated_video_path)
+         generated_frames = [frame for frame in reader]
+         reader.close()
+
+         # Replace the middle three frames (i-1, i, i+1) in all_frames
+         total_generated_frames = len(generated_frames)
+         if total_generated_frames < 5:
+             print(f"Generated video has insufficient frames ({total_generated_frames}). Skipping blending at position {i}.")
+             continue
+         middle_start = 1  # Start index for middle 3 frames
+         middle_frames = generated_frames[middle_start : middle_start + 3]
+         for idx, frame_idx in enumerate(range(i - 1, i + 2)):
+             all_frames[frame_idx] = middle_frames[idx]
+
+     # Create the video clip
+     def make_frame(t):
+         idx = min(int(t * graph.vs[0]["fps"]), len(all_frames) - 1)
+         return all_frames[idx]
+
+     video_clip = VideoClip(make_frame, duration=duration)
+     if audio_path is not None:
+         audio_clip = AudioFileClip(audio_path)
+         video_clip = video_clip.set_audio(audio_clip)
+     video_clip.write_videofile(save_path, codec="libx264", fps=graph.vs[0]["fps"], audio_codec="aac")
+
+     if return_motion:
+         all_motion = [node["axis_angle"] for node in path]
+         all_motion = np.stack(all_motion, 0)
+         return all_motion
+
+
+ def graph_pruning(graph):
+     ascc = graph.clusters(mode="STRONG")
+     lascc = ascc.giant()
+     print(f"before nodes: {len(graph.vs)}, edges: {len(graph.es)}")
+     print(f"after nodes: {len(lascc.vs)}, edges: {len(lascc.es)}")
+     in_degrees = lascc.indegree()
+     out_degrees = lascc.outdegree()
+     avg_in_degree = sum(in_degrees) / len(in_degrees)
+     avg_out_degree = sum(out_degrees) / len(out_degrees)
+     print(f"Average In-degree: {avg_in_degree}")
+     print(f"Average Out-degree: {avg_out_degree}")
+     print(f"max in degree: {max(in_degrees)}, max out degree: {max(out_degrees)}")
+     print(f"min in degree: {min(in_degrees)}, min out degree: {min(out_degrees)}")
+     return lascc
+
+
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser()
+     parser.add_argument("--json_save_path", type=str, default="")
+     parser.add_argument("--graph_save_path", type=str, default="")
+     args = parser.parse_args()
+     json_path = args.json_save_path
+     print("json_path", json_path)
+     graph_path = args.graph_save_path
+
+     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+     smplx_model = (
+         smplx.create(
+             os.path.join(SCRIPT_PATH, "emage/smplx_models/"),
+             model_type="smplx",
+             gender="NEUTRAL_2020",
+             use_face_contour=False,
+             num_betas=300,
+             num_expression_coeffs=100,
+             ext="npz",
+             use_pca=False,
+         )
+         .to(device)
+         .eval()
+     )
+
+     # single_test
+     # graph = create_graph('/content/drive/MyDrive/003_Codes/TANGO/datasets/data_json/show_oliver_test/Abortion_Laws_-_Last_Week_Tonight_with_John_Oliver_HBO-DRauXXz6t0Y.webm.json')
+     graph = create_graph(json_path, smplx_model)
+     graph = create_edges(graph)
+     # pool_path = "/content/drive/MyDrive/003_Codes/TANGO-JointEmbedding/datasets/oliver_test/show-oliver-test.pkl"
+     # graph = igraph.Graph.Read_Pickle(fname=pool_path)
+     # graph = igraph.Graph.Read_Pickle(fname="/content/drive/MyDrive/003_Codes/TANGO-JointEmbedding/datasets/oliver_test/test.pkl")
+
+     walk, is_continue = random_walk(graph, 100)
+     motion = path_visualization(graph, walk, is_continue, "./test.mp4", audio_path=None, verbose_continue=True, return_motion=True)
+     # print(motion.shape)
+     save_graph = graph.write_pickle(fname=graph_path)
+     graph = graph_pruning(graph)
+
+     # show-oliver
+     # json_path = "/content/drive/MyDrive/003_Codes/TANGO/datasets/data_json/show_oliver_test/"
+     # pre_node_path = "/content/drive/MyDrive/003_Codes/TANGO/datasets/cached_graph/show_oliver_test/"
+     # for json_file in tqdm(os.listdir(json_path)):
+     #     graph = create_graph(os.path.join(json_path, json_file))
+     #     graph = create_edges(graph)
+     #     if not len(graph.vs) >= 1500:
+     #         print(f"skip: {len(graph.vs)}", json_file)
+     #     graph.write_pickle(fname=os.path.join(pre_node_path, json_file.split(".")[0] + ".pkl"))
+     #     print(f"Graph saved at {json_file.split('.')[0]}.pkl")
datasets/beat2_v5.py ADDED
@@ -0,0 +1,80 @@
+ import json
+ import torch
+ from torch.utils import data
+ import numpy as np
+ import librosa
+ import textgrid as tg
+ import os
+ import math
+
+
+ class BEAT2Dataset(data.Dataset):
+     def __init__(self, cfg, split):
+         data_meta_paths = cfg.data.meta_paths
+         vid_meta = []
+         for data_meta_path in data_meta_paths:
+             vid_meta.extend(json.load(open(data_meta_path, "r")))
+         self.vid_meta = [item for item in vid_meta if item.get("mode") == split]
+         self.mean = 0  # np.load(cfg.data.mean_path) if cfg.data.mean_path is not None else 0
+         self.std = 1  # np.load(cfg.data.std_path) if cfg.data.std_path is not None else 1
+         self.joint_mask = None  # cfg.data.joint_mask if cfg.data.joint_mask is not None else None
+         self.data_list = self.vid_meta
+         self.fps = cfg.data.pose_fps
+         self.audio_sr = cfg.data.audio_sr
+         self.use_text = False  # cfg.data.use_text
+
+     def __len__(self):
+         return len(self.data_list)
+
+     @staticmethod
+     def normalize(motion, mean, std):
+         return (motion - mean) / (std + 1e-7)
+
+     @staticmethod
+     def inverse_normalize(motion, mean, std):
+         return motion * std + mean
+
+     @staticmethod
+     def select_joints(motion, joint_mask):
+         return motion[:, joint_mask]
+
+     @staticmethod
+     def unselect_joints(motion, joint_mask):
+         # for visualization
+         full_motion = np.zeros((motion.shape[0], joint_mask.shape[0]))
+         full_motion[:, joint_mask] = motion
+         return full_motion
+
+     def __getitem__(self, item):
+         data = self.data_list[item]
+         motion = np.load(os.path.join(data["video_path"], data["video_id"] + ".npy"))
+         sdx = data["start_idx"]
+         edx = data["end_idx"]
+
+         SMPLX_FPS = 30
+         motion = motion[sdx:edx]
+         audio = np.load(os.path.join(data["audio_path"], data["video_id"] + "_text.npz"), allow_pickle=True)
+         sdx_audio = math.floor(sdx * (1 / SMPLX_FPS * 50))
+         edx_audio = sdx_audio + int((edx - sdx) * 50 / SMPLX_FPS) + 1
+         cached_audio_low = audio["wav2vec2_low"][sdx_audio:edx_audio]
+         cached_audio_high = audio["wav2vec2_high"][sdx_audio:edx_audio]
+         bert_time_aligned = audio["bert_time_aligned"][sdx_audio:edx_audio]
+
+         motion_tensor = torch.from_numpy(motion).float()  # T x D
+         cached_audio_low = torch.from_numpy(cached_audio_low).float()
+         cached_audio_high = torch.from_numpy(cached_audio_high).float()
+         bert_time_aligned = torch.from_numpy(bert_time_aligned).float()
+
+         audio_wave, sr = librosa.load(os.path.join(data["audio_path"], data["video_id"] + ".wav"))
+         audio_wave = librosa.resample(audio_wave, orig_sr=sr, target_sr=self.audio_sr)
+         sdx_audio = sdx * int(1 / SMPLX_FPS * self.audio_sr)
+         edx_audio = edx * int(1 / SMPLX_FPS * self.audio_sr)
+         audio_wave = audio_wave[sdx_audio:edx_audio]
+         audio_tensor = torch.from_numpy(audio_wave).float()
+
+         return dict(
+             cached_rep15d=motion_tensor,
+             cached_audio_low=cached_audio_low,
+             cached_audio_high=cached_audio_high,
+             bert_time_aligned=bert_time_aligned,
+             audio_tensor=audio_tensor,
+         )
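`BEAT2Dataset` returns fixed-length tensors, so it drops straight into a standard `DataLoader`. A minimal sketch, assuming an OmegaConf-style config; the constructor itself only reads `cfg.data.meta_paths`, `cfg.data.pose_fps`, and `cfg.data.audio_sr`:

```python
from omegaconf import OmegaConf
from torch.utils.data import DataLoader
from datasets.beat2_v5 import BEAT2Dataset

# Sketch only: any config object exposing the fields above works; OmegaConf is an assumption.
cfg = OmegaConf.load("configs/gradio_speaker9.yaml")
train_set = BEAT2Dataset(cfg, split="train")
train_loader = DataLoader(train_set, batch_size=cfg.data.train_bs, shuffle=True)
batch = next(iter(train_loader))
print(batch["cached_rep15d"].shape, batch["audio_tensor"].shape)
```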
datasets/cached_audio/101099-00_18_09-00_18_19.mp4 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:addd2c332242bf4e234adee59d8220f85a3ba4e587e145ed8aece0c9f4b8c358
+ size 1393776
datasets/cached_audio/1wrQ6Msp7wM_00-00-39.69_00-00-45.68.mp4 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f0eb3ec8a6ded1a3e378b6b8745695beff96cdc4976570c1d070d688ab1dbeba
+ size 2569514
datasets/cached_audio/demo0.mp4 ADDED
Binary file (877 kB).
datasets/cached_audio/demo1.mp4 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e01aee2e94689d95514749887d5c5ab77455b7e1adf4c8bded9f72e9c69b2db0
+ size 1142106
datasets/cached_audio/demo2.mp4 ADDED
Binary file (741 kB).
datasets/cached_audio/demo3.mp4 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5914ea9b0001ad9f1ee7e9595f73a36d75ea17aae36294f32bee70ca3439a956
+ size 1378144
datasets/cached_audio/demo4.mp4 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2929191b3089a22503538a29d38d5444b2d245715e0767e7af29a2341cfe9a8f
+ size 1054816
datasets/cached_audio/demo5.mp4 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:87d40d0740a740934e74a376e3125cd0ee01332c563faf91613787f99ab9110a
+ size 1348398
datasets/cached_audio/demo6.mp4 ADDED
Binary file (983 kB).
datasets/cached_audio/demo7.mp4 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:06deda4ba3eed683ff774d32a5999a2c624f956a7f95d77d2d0c3bd943f069c8
+ size 1120862
datasets/cached_audio/demo8.mp4 ADDED
Binary file (719 kB).
datasets/cached_audio/demo9.mp4 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b172832933ffb13fffaa8dd649e5aa7130aad1c5e25d53e331f87ed1a4815b63
+ size 1284539
datasets/cached_audio/example_female_voice_9_seconds.wav ADDED
Binary file (606 kB).
datasets/cached_audio/example_male_voice_9_seconds.wav ADDED
Binary file (880 kB).
datasets/cached_audio/female_test_V1.mp4 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b89e47ea2febb05fef6a321822c9d14cd4fd752f7fcb2e27f28abec3104f5823
+ size 2727168
datasets/cached_audio/speaker12_10_BVHw8aCPATM_00-01-05.0_00-01-10.0.mp4 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:926444bb200639713b1d0d48ea1ff544685c1dc24b9f1d42e8133724563e18bd
+ size 1577443
datasets/cached_audio/speaker7_iuYlGRnC7J8_00-00-0.00_00-00-3.25.mp4 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ffb58134d03dd7dabe2bfc587ea615c540cf0c161b20c754f95b74de07379bb9
+ size 1679489