ElenaRyumina committed on
Commit
47aeb66
•
1 Parent(s): d6af211
Files changed (18)
  1. .flake8 +5 -0
  2. .gitignore +172 -0
  3. CODE_OF_CONDUCT.md +80 -0
  4. LICENSE +21 -0
  5. README.md +6 -5
  6. app.css +116 -0
  7. app.py +115 -0
  8. app/__init__.py +0 -0
  9. app/app_utils.py +296 -0
  10. app/authors.py +31 -0
  11. app/config.py +127 -0
  12. app/description.py +20 -0
  13. app/model.py +72 -0
  14. app/model_architectures.py +483 -0
  15. app/plot.py +177 -0
  16. app/utils.py +273 -0
  17. config.toml +16 -0
  18. requirements.txt +8 -0
.flake8 ADDED
@@ -0,0 +1,5 @@
+ ; https://www.flake8rules.com/
+
+ [flake8]
+ max-line-length = 120
+ ignore = E203, E402, E741, W503
.gitignore ADDED
@@ -0,0 +1,172 @@
1
+ # Compiled source #
2
+ ###################
3
+ *.com
4
+ *.class
5
+ *.dll
6
+ *.exe
7
+ *.o
8
+ *.so
9
+ *.pyc
10
+
11
+ # Packages #
12
+ ############
13
+ # it's better to unpack these files and commit the raw source
14
+ # git has its own built in compression methods
15
+ *.7z
16
+ *.dmg
17
+ *.gz
18
+ *.iso
19
+ *.rar
20
+ #*.tar
21
+ *.zip
22
+
23
+ # Logs and databases #
24
+ ######################
25
+ *.log
26
+ *.sqlite
27
+
28
+ # OS generated files #
29
+ ######################
30
+ .DS_Store
31
+ ehthumbs.db
32
+ Icon
33
+ Thumbs.db
34
+ .tmtags
35
+ .idea
36
+ .vscode
37
+ tags
38
+ vendor.tags
39
+ tmtagsHistory
40
+ *.sublime-project
41
+ *.sublime-workspace
42
+ .bundle
43
+
44
+ # Byte-compiled / optimized / DLL files
45
+ __pycache__/
46
+ *.py[cod]
47
+ *$py.class
48
+
49
+ # C extensions
50
+ *.so
51
+
52
+ # Distribution / packaging
53
+ .Python
54
+ build/
55
+ develop-eggs/
56
+ dist/
57
+ downloads/
58
+ eggs/
59
+ .eggs/
60
+ lib/
61
+ lib64/
62
+ parts/
63
+ sdist/
64
+ var/
65
+ wheels/
66
+ pip-wheel-metadata/
67
+ share/python-wheels/
68
+ *.egg-info/
69
+ .installed.cfg
70
+ *.egg
71
+ MANIFEST
72
+ node_modules/
73
+
74
+ # PyInstaller
75
+ # Usually these files are written by a python script from a template
76
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
77
+ *.manifest
78
+ *.spec
79
+
80
+ # Installer logs
81
+ pip-log.txt
82
+ pip-delete-this-directory.txt
83
+
84
+ # Unit test / coverage reports
85
+ htmlcov/
86
+ .tox/
87
+ .nox/
88
+ .coverage
89
+ .coverage.*
90
+ .cache
91
+ nosetests.xml
92
+ coverage.xml
93
+ *.cover
94
+ .hypothesis/
95
+ .pytest_cache/
96
+
97
+ # Translations
98
+ *.mo
99
+ *.pot
100
+
101
+ # Django stuff:
102
+ *.log
103
+ local_settings.py
104
+ db.sqlite3
105
+ db.sqlite3-journal
106
+
107
+ # Flask stuff:
108
+ instance/
109
+ .webassets-cache
110
+
111
+ # Scrapy stuff:
112
+ .scrapy
113
+
114
+ # Sphinx documentation
115
+ docs/_build/
116
+
117
+ # PyBuilder
118
+ target/
119
+
120
+ # Jupyter Notebook
121
+ .ipynb_checkpoints
122
+
123
+ # IPython
124
+ profile_default/
125
+ ipython_config.py
126
+
127
+ # pyenv
128
+ .python-version
129
+
130
+ # pipenv
131
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
132
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
133
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
134
+ # install all needed dependencies.
135
+ #Pipfile.lock
136
+
137
+ # celery beat schedule file
138
+ celerybeat-schedule
139
+
140
+ # SageMath parsed files
141
+ *.sage.py
142
+
143
+ # Environments
144
+ .env
145
+ .venv
146
+ env/
147
+ venv/
148
+ ENV/
149
+ env.bak/
150
+ venv.bak/
151
+
152
+ # Spyder project settings
153
+ .spyderproject
154
+ .spyproject
155
+
156
+ # Rope project settings
157
+ .ropeproject
158
+
159
+ # mkdocs documentation
160
+ /site
161
+
162
+ # mypy
163
+ .mypy_cache/
164
+ .dmypy.json
165
+ dmypy.json
166
+
167
+ # Pyre type checker
168
+ .pyre/
169
+
170
+ # Custom
171
+ *.pth
172
+ *.pt
CODE_OF_CONDUCT.md ADDED
@@ -0,0 +1,80 @@
1
+ # Code of Conduct
2
+
3
+ ## Our Pledge
4
+
5
+ In the interest of fostering an open and welcoming environment, we as
6
+ contributors and maintainers pledge to make participation in our project and
7
+ our community a harassment-free experience for everyone, regardless of age, body
8
+ size, disability, ethnicity, sex characteristics, gender identity and expression,
9
+ level of experience, education, socio-economic status, nationality, personal
10
+ appearance, race, religion, or sexual identity and orientation.
11
+
12
+ ## Our Standards
13
+
14
+ Examples of behavior that contributes to creating a positive environment
15
+ include:
16
+
17
+ * Using welcoming and inclusive language
18
+ * Being respectful of differing viewpoints and experiences
19
+ * Gracefully accepting constructive criticism
20
+ * Focusing on what is best for the community
21
+ * Showing empathy towards other community members
22
+
23
+ Examples of unacceptable behavior by participants include:
24
+
25
+ * The use of sexualized language or imagery and unwelcome sexual attention or
26
+ advances
27
+ * Trolling, insulting/derogatory comments, and personal or political attacks
28
+ * Public or private harassment
29
+ * Publishing others' private information, such as a physical or electronic
30
+ address, without explicit permission
31
+ * Other conduct which could reasonably be considered inappropriate in a
32
+ professional setting
33
+
34
+ ## Our Responsibilities
35
+
36
+ Project maintainers are responsible for clarifying the standards of acceptable
37
+ behavior and are expected to take appropriate and fair corrective action in
38
+ response to any instances of unacceptable behavior.
39
+
40
+ Project maintainers have the right and responsibility to remove, edit, or
41
+ reject comments, commits, code, wiki edits, issues, and other contributions
42
+ that are not aligned to this Code of Conduct, or to ban temporarily or
43
+ permanently any contributor for other behaviors that they deem inappropriate,
44
+ threatening, offensive, or harmful.
45
+
46
+ ## Scope
47
+
48
+ This Code of Conduct applies within all project spaces, and it also applies when
49
+ an individual is representing the project or its community in public spaces.
50
+ Examples of representing a project or community include using an official
51
+ project e-mail address, posting via an official social media account, or acting
52
+ as an appointed representative at an online or offline event. Representation of
53
+ a project may be further defined and clarified by project maintainers.
54
+
55
+ This Code of Conduct also applies outside the project spaces when there is a
56
+ reasonable belief that an individual's behavior may have a negative impact on
57
+ the project or its community.
58
+
59
+ ## Enforcement
60
+
61
+ Instances of abusive, harassing, or otherwise unacceptable behavior may be
62
+ reported by contacting the project team at <[email protected]>. All
63
+ complaints will be reviewed and investigated and will result in a response that
64
+ is deemed necessary and appropriate to the circumstances. The project team is
65
+ obligated to maintain confidentiality with regard to the reporter of an incident.
66
+ Further details of specific enforcement policies may be posted separately.
67
+
68
+ Project maintainers who do not follow or enforce the Code of Conduct in good
69
+ faith may face temporary or permanent repercussions as determined by other
70
+ members of the project's leadership.
71
+
72
+ ## Attribution
73
+
74
+ This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4,
75
+ available at <https://www.contributor-covenant.org/version/1/4/code-of-conduct.html>
76
+
77
+ [homepage]: https://www.contributor-covenant.org
78
+
79
+ For answers to common questions about this code of conduct, see
80
+ <https://www.contributor-covenant.org/faq>
LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2024 Elena Ryumina
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
README.md CHANGED
@@ -1,13 +1,14 @@
  ---
- title: AVCER
- emoji: 👀
- colorFrom: indigo
- colorTo: blue
+ title: Audio-visual compound expression recognition
+ emoji: 😀😲😐😥🥴😱😑
+ colorFrom: blue
+ colorTo: pink
  sdk: gradio
- sdk_version: 4.36.0
+ sdk_version: 4.24.0
  app_file: app.py
  pinned: false
  license: mit
+ short_description: A tool to detect audio-visual compound expressions
  ---

  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.css ADDED
@@ -0,0 +1,116 @@
1
+ div.app-flex-container {
2
+ display: flex;
3
+ align-items: left;
4
+ }
5
+
6
+ div.app-flex-container > a {
7
+ margin-left: 6px;
8
+ }
9
+
10
+ div.dl1 div.upload-container {
11
+ height: 350px;
12
+ max-height: 350px;
13
+ }
14
+
15
+ div.dl2 {
16
+ max-height: 200px;
17
+ }
18
+
19
+ div.dl2 img {
20
+ max-height: 200px;
21
+ }
22
+
23
+ div.dl5 {
24
+ max-height: 200px;
25
+ }
26
+
27
+ div.dl5 img {
28
+ max-height: 200px;
29
+ }
30
+
31
+ div.video1 div.video-container {
32
+ height: 500px;
33
+ }
34
+
35
+ div.video2 {
36
+ height: 200px;
37
+ }
38
+
39
+ div.video3 {
40
+ height: 200px;
41
+ }
42
+
43
+ div.video4 {
44
+ height: 200px;
45
+ }
46
+
47
+ div.stat {
48
+ height: 350px;
49
+ }
50
+
51
+ div.audio {
52
+ height: 120px;
53
+ }
54
+
55
+ div.pred {
56
+ height: 65px;
57
+ }
58
+
59
+ div.video {
60
+ height: 65px;
61
+ }
62
+
63
+ div.img {
64
+ height: 120px;
65
+ }
66
+
67
+ div.settings-wrapper {
68
+ display: none;
69
+ }
70
+
71
+ .submit {
72
+ display: inline-block;
73
+ padding: 10px 20px;
74
+ font-size: 16px;
75
+ font-weight: bold;
76
+ text-align: center;
77
+ text-decoration: none;
78
+ cursor: pointer;
79
+ border: var(--button-border-width) solid var(--button-primary-border-color);
80
+ background: var(--button-primary-background-fill);
81
+ color: var(--button-primary-text-color);
82
+ border-radius: 8px;
83
+ transition: all 0.3s ease;
84
+ }
85
+
86
+ .submit[disabled] {
87
+ cursor: not-allowed;
88
+ opacity: 0.6;
89
+ }
90
+
91
+ .submit:hover:not([disabled]) {
92
+ border-color: var(--button-primary-border-color-hover);
93
+ background: var(--button-primary-background-fill-hover);
94
+ color: var(--button-primary-text-color-hover);
95
+ }
96
+
97
+ .clear {
98
+ display: inline-block;
99
+ padding: 10px 20px;
100
+ font-size: 16px;
101
+ font-weight: bold;
102
+ text-align: center;
103
+ text-decoration: none;
104
+ cursor: pointer;
105
+ border-radius: 8px;
106
+ transition: all 0.3s ease;
107
+ }
108
+
109
+ .clear[disabled] {
110
+ cursor: not-allowed;
111
+ opacity: 0.6;
112
+ }
113
+
114
+ .submit:active:not([disabled]), .clear:active:not([disabled]) {
115
+ transform: scale(0.98);
116
+ }
app.py ADDED
@@ -0,0 +1,115 @@
1
+ """
2
+ File: app.py
3
+ Author: Elena Ryumina and Dmitry Ryumin
4
+ Description: Main application file for Facial_Expression_Recognition.
5
+ The file defines the Gradio interface, sets up the main blocks,
6
+ and includes event handlers for various components.
7
+ License: MIT License
8
+ """
9
+
10
+ import gradio as gr
11
+
12
+ # Importing necessary components for the Gradio app
13
+ from app.description import DESCRIPTION_DYNAMIC
14
+ from app.authors import AUTHORS
15
+ from app.app_utils import preprocess_video_and_predict
16
+
17
+
18
+ def clear_static_info():
19
+ return (
20
+ gr.Image(value=None, type="pil"),
21
+ gr.Image(value=None, scale=1, elem_classes="dl5"),
22
+ gr.Image(value=None, scale=1, elem_classes="dl2"),
23
+ gr.Label(value=None, num_top_classes=3, scale=1, elem_classes="dl3"),
24
+ )
25
+
26
+ def clear_dynamic_info():
27
+ return (
28
+ gr.Video(value=None),
29
+ gr.Plot(value=None),
30
+ gr.Plot(value=None),
31
+ gr.Plot(value=None),
32
+ gr.Textbox(value=None),
33
+ # gr.HTML(value=None),
34
+ gr.File(value=None),
35
+ gr.File(value=None),
36
+ )
37
+
38
+ with gr.Blocks(css="app.css") as demo:
39
+ with gr.Tab("AVCER App"):
40
+ gr.Markdown(value=DESCRIPTION_DYNAMIC)
41
+ with gr.Row():
42
+ with gr.Column(scale=2):
43
+ input_video = gr.Video(elem_classes="video1")
44
+ with gr.Row():
45
+ clear_btn_dynamic = gr.Button(
46
+ value="Clear", interactive=True, scale=1
47
+ )
48
+ submit_dynamic = gr.Button(
49
+ value="Submit", interactive=True, scale=1, elem_classes="submit"
50
+ )
51
+ text = gr.Textbox(label="Result", info='Positive state includes Happiness, Surprise, Happily Surprised, and Happily Disgusted emotions. Negative state includes the other emotions and compound expressions.')
52
+ # question_mark = gr.HTML(tooltip_html)
53
+ with gr.Column(scale=2, elem_classes="dl4"):
54
+ output_face = gr.Plot(label="Face images", elem_classes="img")
55
+ output_heatmaps = gr.Plot(label="Waveform", elem_classes="audio")
56
+ output_statistics = gr.Plot(label="Statistics of emotions", elem_classes="stat")
57
+ with gr.Row():
58
+ output_video = gr.File(label="Original video",
59
+ file_count="single",
60
+ file_types=[".mp4"],
61
+ show_label=True,
62
+ interactive=False,
63
+ visible=True,
64
+ elem_classes="video")
65
+ prediction_file = gr.File(label="Prediction file",
66
+ file_count="single",
67
+ file_types=[".csv"],
68
+ show_label=True,
69
+ interactive=False,
70
+ visible=True,
71
+ elem_classes="pred")
72
+ gr.Examples(
73
+ ["videos/video1.mp4",
74
+ "videos/video2.mp4",
75
+ "videos/video3.mp4",
76
+ "videos/video4.mp4",
77
+ ],
78
+ [input_video],
79
+ )
80
+
81
+ with gr.Tab("Authors"):
82
+ gr.Markdown(value=AUTHORS)
83
+
84
+ submit_dynamic.click(
85
+ fn=preprocess_video_and_predict,
86
+ inputs=input_video,
87
+ outputs=[
88
+ output_face,
89
+ output_heatmaps,
90
+ output_statistics,
91
+ text,
92
+ # question_mark,
93
+ output_video,
94
+ prediction_file,
95
+ ],
96
+ queue=True,
97
+ )
98
+ clear_btn_dynamic.click(
99
+ fn=clear_dynamic_info,
100
+ inputs=[],
101
+ outputs=[
102
+ input_video,
103
+ output_face,
104
+ output_heatmaps,
105
+ output_statistics,
106
+ text,
107
+ # question_mark,
108
+ output_video,
109
+ prediction_file,
110
+ ],
111
+ queue=True,
112
+ )
113
+
114
+ if __name__ == "__main__":
115
+ demo.queue(api_open=False).launch(share=False)
app/__init__.py ADDED
File without changes
app/app_utils.py ADDED
@@ -0,0 +1,296 @@
1
+ """
2
+ File: app_utils.py
3
+ Author: Elena Ryumina and Dmitry Ryumin
4
+ Description: This module contains utility functions for the audio-visual compound expression recognition application.
5
+ License: MIT License
6
+ """
7
+
8
+ import torch
9
+ import numpy as np
10
+ import mediapipe as mp
11
+ import pandas as pd
12
+ from PIL import Image
13
+ import cv2
14
+
15
+ # Importing necessary components for the Gradio app
16
+ from app.model import (
17
+ pth_model_static,
18
+ pth_model_dynamic,
19
+ activations,
20
+ audio_processor,
21
+ audio_model,
22
+ device
23
+ )
24
+
25
+ from app.utils import (
26
+ convert_mp4_to_mp3,
27
+ pad_wav,
28
+ pad_wav_zeros,
29
+ get_box,
30
+ pth_processing,
31
+ convert_webm_to_mp4,
32
+ get_evenly_spaced_frame_indices,
33
+ get_c_expr_db_pred
34
+ )
35
+
36
+ from app.config import DICT_EMO_VIDEO, AV_WEIGHTS, NAME_EMO_AUDIO, DICT_PRED, config_data
37
+ from app.plot import display_frame_info, plot_images
38
+ from collections import Counter
39
+
40
+ mp_face_mesh = mp.solutions.face_mesh
41
+
42
+ class EmotionRecognition:
43
+ def __init__(
44
+ self,
45
+ step=2,
46
+ window=4,
47
+ sr=16000,
48
+ save_path="",
49
+ padding="",
50
+ ):
51
+ self.save_path = save_path
52
+ self.step = step
53
+ self.window = window
54
+ self.sr = sr
55
+ self.padding = padding
56
+
57
+ def predict_emotion(self, path, frame_indices, fps):
58
+ prob, plt = self.load_audio_features(path, frame_indices, fps)
59
+ return prob, plt
60
+
61
+ def load_audio_features(self, path, frame_indices, fps):
62
+
63
+ window_a = self.window * self.sr
64
+ step_a = int(self.step * self.sr)
65
+
66
+ wav, audio_plt = convert_mp4_to_mp3(path, frame_indices, fps, self.sr)
67
+
68
+ probs = []
69
+ framess = []
70
+
71
+ for start_a in range(0, len(wav) + 1, step_a):
72
+ end_a = min(start_a + window_a, len(wav))
73
+ a_fss_chunk = wav[start_a:end_a]
74
+ if self.padding == "mean" or self.padding == "constant":
75
+ a_fss = pad_wav_zeros(a_fss_chunk, window_a, mode=self.padding)
76
+ elif self.padding == "repeat":
77
+ a_fss = pad_wav(a_fss_chunk, window_a)
78
+ a_fss = torch.unsqueeze(a_fss, 0)
79
+ a_fss = audio_processor(a_fss, sampling_rate=self.sr)
80
+ a_fss = a_fss["input_values"][0]
81
+ a_fss = torch.from_numpy(a_fss)
82
+ with torch.no_grad():
83
+ prob = audio_model(a_fss.to(device))
84
+ prob = prob.cpu().numpy()
85
+ frames = [
86
+ str(i).zfill(6) + ".jpg"
87
+ for i in range(
88
+ round(start_a / self.sr * fps), round(end_a / self.sr * fps + 1)
89
+ )
90
+ ]
91
+ probs.extend([prob] * len(frames))
92
+ framess.extend(frames)
93
+
94
+ if len(probs[0]) == 7:
95
+ emo_ABAW = NAME_EMO_AUDIO[:-1]
96
+ else:
97
+ emo_ABAW = NAME_EMO_AUDIO
98
+
99
+ df = pd.DataFrame(np.array(probs), columns=emo_ABAW)
100
+ df["frames"] = framess
101
+
102
+ return df, audio_plt
103
+
104
+ def preprocess_audio_and_predict(
105
+ path_video="",
106
+ save_path="src/pred_results/C-EXPR-DB",
107
+ frame_indices=[],
108
+ fps=25,
109
+ step=0.5,
110
+ padding="mean",
111
+ window=4,
112
+ sr=16000,
113
+ ):
114
+ audio_ER = EmotionRecognition(
115
+ step=step,
116
+ window=window,
117
+ sr=sr,
118
+ save_path=save_path,
119
+ padding=padding,
120
+ )
121
+ df_pred, audio_plt = audio_ER.predict_emotion(path_video, frame_indices, fps)
122
+
123
+ return df_pred, audio_plt
124
+
125
+ def preprocess_video_and_predict(video):
126
+
127
+ if video:
128
+
129
+ if video.split('.')[-1] == 'webm':
130
+ video = convert_webm_to_mp4(video)
131
+
132
+ cap = cv2.VideoCapture(video)
133
+ w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
134
+ h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
135
+ fps = np.round(cap.get(cv2.CAP_PROP_FPS))
136
+ total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
137
+
138
+ frame_indices = get_evenly_spaced_frame_indices(total_frames, 9)
139
+ df_probs_audio, audio_plt = preprocess_audio_and_predict(
140
+ path_video=video,
141
+ frame_indices=frame_indices,
142
+ fps=fps,
143
+ step=config_data.AUDIO_STEP,
144
+ padding="mean",
145
+ save_path="",
146
+ window=4,
147
+ sr=16000,
148
+ )
149
+
150
+ lstm_features = []
151
+ count_frame = 1
152
+ count_face = 0
153
+ probs_dynamic = []
154
+ probs_static = []
155
+ frames = []
156
+ last_output = None
157
+ cur_face = None
158
+ faces = []
159
+
160
+ zeros = np.zeros((1, 7))
161
+
162
+ with torch.no_grad():
163
+ with mp_face_mesh.FaceMesh(
164
+ max_num_faces=1,
165
+ refine_landmarks=False,
166
+ min_detection_confidence=0.5,
167
+ min_tracking_confidence=0.5) as face_mesh:
168
+
169
+ while cap.isOpened():
170
+ _, frame = cap.read()
171
+ if frame is None: break
172
+
173
+ frame_copy = frame.copy()
174
+ frame_copy.flags.writeable = False
175
+ frame_copy = cv2.cvtColor(frame_copy, cv2.COLOR_BGR2RGB)
176
+ results = face_mesh.process(frame_copy)
177
+ frame_copy.flags.writeable = True
178
+
179
+ if results.multi_face_landmarks:
180
+ for fl in results.multi_face_landmarks:
181
+ startX, startY, endX, endY = get_box(fl, w, h)
182
+ cur_face = frame_copy[startY:endY, startX: endX]
183
+
184
+ if count_face%config_data.FRAME_DOWNSAMPLING == 0:
185
+ cur_face_copy = pth_processing(Image.fromarray(cur_face))
186
+
187
+ prediction = torch.nn.functional.softmax(pth_model_static(cur_face_copy.to(device)), dim=1)
188
+
189
+ features = torch.nn.functional.relu(activations['features']).detach().cpu().numpy()
190
+
191
+ output_s = prediction.clone()
192
+ output_s = output_s.detach().cpu().numpy()
193
+
194
+ if len(lstm_features) == 0:
195
+ lstm_features = [features]*10
196
+ else:
197
+ lstm_features = lstm_features[1:] + [features]
198
+
199
+ lstm_f = torch.from_numpy(np.vstack(lstm_features))
200
+ lstm_f = torch.unsqueeze(lstm_f, 0)
201
+
202
+ output_d = pth_model_dynamic(lstm_f.to(device)).detach().cpu().numpy()
203
+
204
+ last_output = output_d
205
+
206
+ if count_face == 0:
207
+ count_face += 1
208
+
209
+ else:
210
+ if last_output is not None:
211
+ output_d = last_output
212
+
213
+ elif last_output is None:
214
+ output_d = zeros
215
+
216
+ probs_static.append(output_s[0])
217
+ probs_dynamic.append(output_d[0])
218
+ frames.append(count_frame)
219
+ else:
220
+ lstm_features = []
221
+ if last_output is not None:
222
+ probs_static.append(probs_static[-1])
223
+ probs_dynamic.append(probs_dynamic[-1])
224
+ frames.append(count_frame)
225
+
226
+ elif last_output is None:
227
+ probs_static.append(zeros[0])
228
+ probs_dynamic.append(zeros[0])
229
+ frames.append(count_frame)
230
+
231
+ if cur_face is not None:
232
+
233
+ if count_frame-1 in frame_indices:
234
+
235
+ cur_face = cv2.resize(cur_face, (224,224), interpolation = cv2.INTER_AREA)
236
+ cur_face = display_frame_info(cur_face, 'Frame: {}'.format(count_frame), box_scale=.3)
237
+ faces.append(cur_face)
238
+
239
+ count_frame += 1
240
+ if count_face != 0:
241
+ count_face += 1
242
+
243
+ img_plt = plot_images(faces)
244
+
245
+ df_dynamic = pd.DataFrame(
246
+ np.array(probs_dynamic), columns=list(DICT_EMO_VIDEO.values())
247
+ )
248
+ df_static = pd.DataFrame(
249
+ np.array(probs_static), columns=list(DICT_EMO_VIDEO.values())
250
+ )
251
+
252
+ df, pred_plt = get_c_expr_db_pred(
253
+ stat_df=df_static,
254
+ dyn_df=df_dynamic,
255
+ audio_df=df_probs_audio,
256
+ name_video='',
257
+ weights_1=AV_WEIGHTS,
258
+ frame_indices=frame_indices,
259
+ )
260
+
261
+ av_pred = df['Audio-visual fusion'].tolist()
262
+
263
+ states = ['negative', 'neutral', 'positive']
264
+
265
+ dict_av_pred = Counter(av_pred)
266
+ count_states = np.zeros(3)
267
+ for k, v in dict_av_pred.items():
268
+ if k in [0]:
269
+ count_states[1] += v
270
+ elif k in [4, 6, 8, 18]:
271
+ count_states[2] += v
272
+ else:
273
+ count_states[0] += v
274
+
275
+ state_percent = count_states/np.sum(count_states)
276
+
277
+ # if np.argmax(state_percent) in [0,2]:
278
+ # text1 = "The audio-visual model predicts that a person mostly experiences {} ({:.2f}%) emotions. ".format(states[np.argmax(state_percent)], np.max(state_percent)*100)
279
+ # else:
280
+ text1 = "The audio-visual model predicts that a person is mostly in {} ({:.2f}%) state. ".format(states[np.argmax(state_percent)], np.max(state_percent)*100)
281
+
282
+ top_three = dict_av_pred.most_common(3)
283
+
284
+ top_three_text = "Predictions of the three most probable emotions: "
285
+ for index, count in top_three:
286
+ percentage = (count / np.sum(count_states)) * 100
287
+ top_three_text += f"{DICT_PRED[index]} ({percentage:.2f}%), "
288
+
289
+ top_three_text = top_three_text.rstrip(", ") + "."
290
+
291
+ df.to_csv(video.split('.')[0] + '.csv', index=False)
292
+
293
+ return img_plt, audio_plt, pred_plt, text1+top_three_text, video, video.split('.')[0] + '.csv'
294
+
295
+ else:
296
+ return None, None, None, None, None, None
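The state aggregation at the end of `preprocess_video_and_predict` reduces per-frame audio-visual predictions to a negative/neutral/positive summary. A minimal self-contained sketch of that logic, using made-up per-frame class indices (the index-to-state mapping mirrors the code above: class 0 is neutral, classes 4, 6, 8 and 18 are positive, everything else is negative):

```python
from collections import Counter
import numpy as np

# Hypothetical per-frame audio-visual predictions (indices follow DICT_PRED in app/config.py).
av_pred = [0, 0, 4, 6, 8, 18, 1, 4, 4, 0]

count_states = np.zeros(3)  # [negative, neutral, positive]
for index, count in Counter(av_pred).items():
    if index == 0:                    # Neutral
        count_states[1] += count
    elif index in [4, 6, 8, 18]:      # Happiness, Surprise, Happily Surprised, Happily Disgusted
        count_states[2] += count
    else:                             # remaining basic and compound emotions
        count_states[0] += count

state_percent = count_states / count_states.sum()
states = ["negative", "neutral", "positive"]
print(states[int(np.argmax(state_percent))], state_percent)  # positive [0.1 0.3 0.6]
```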
app/authors.py ADDED
@@ -0,0 +1,31 @@
1
+ """
2
+ File: authors.py
3
+ Author: Elena Ryumina and Dmitry Ryumin
4
+ Description: About the authors.
5
+ License: MIT License
6
+ """
7
+
8
+
9
+ AUTHORS = """
10
+ Authors: [Elena Ryumina](https://github.com/ElenaRyumina), [Maxim Markitantov](https://github.com/markitantov), [Dmitry Ryumin](https://github.com/DmitryRyumin), [Heysem Kaya](https://www.uu.nl/staff/HKaya) and [Alexey Karpov](https://hci.nw.ru/en/employees/1)
11
+
12
+ Authorship contribution:
13
+
14
+ App developers: ``Elena Ryumina`` and ``Dmitry Ryumin``
15
+
16
+ Methodology developers: ``Elena Ryumina``, ``Maxim Markitantov``, ``Dmitry Ryumin``, ``Heysem Kaya`` and ``Alexey Karpov``
17
+
18
+ Model developers: ``Elena Ryumina`` and ``Maxim Markitantov``
19
+
20
+ Citation
21
+
22
+ If you use AVCER in your research, please cite:
23
+
24
+ <div class="highlight highlight-text-bibtex notranslate position-relative overflow-auto" dir="auto"><pre><span class="pl-k">@inproceedings</span>{<span class="pl-en">RYUMINA2024CWPRV</span>,
25
+ <span class="pl-s">title</span> = <span class="pl-s"><span class="pl-pds">{</span>Zero-Shot Audio-Visual Compound Expression Recognition Method based on Emotion Probability Fusion<span class="pl-pds">}</span></span>,
26
+ <span class="pl-s">author</span> = <span class="pl-s"><span class="pl-pds">{</span>Elena Ryumina and Maxim Markitantov and Dmitry Ryumin and Heysem Kaya and Alexey Karpov<span class="pl-pds">}</span></span>,
27
+ <span class="pl-s">booktitle</span> = <span class="pl-s"><span class="pl-pds">{</span>IEEE Computer Society Conference on Computer Vision and Pattern Recognition Workshops<span class="pl-pds">}</span></span>,
28
+ <span class="pl-s">year</span> = <span class="pl-s"><span class="pl-pds">{</span>2024<span class="pl-pds">}</span></span>,
29
+ <span class="pl-s">pages</span> = <span class="pl-s"><span class="pl-pds">{</span>1--9<span class="pl-pds">}</span></span>,
30
+ }</div>
31
+ """
app/config.py ADDED
@@ -0,0 +1,127 @@
1
+ """
2
+ File: config.py
3
+ Author: Elena Ryumina and Dmitry Ryumin
4
+ Description: Configuration file.
5
+ License: MIT License
6
+ """
7
+
8
+ import toml
9
+ from typing import Dict
10
+ from types import SimpleNamespace
11
+
12
+
13
+ def flatten_dict(prefix: str, d: Dict) -> Dict:
14
+ result = {}
15
+
16
+ for k, v in d.items():
17
+ if isinstance(v, dict):
18
+ result.update(flatten_dict(f"{prefix}{k}_", v))
19
+ else:
20
+ result[f"{prefix}{k}"] = v
21
+
22
+ return result
23
+
24
+
25
+ config = toml.load("config.toml")
26
+
27
+ config_data = flatten_dict("", config)
28
+
29
+ config_data = SimpleNamespace(**config_data)
30
+
31
+ DICT_EMO_VIDEO = {
32
+ 0: "Neutral",
33
+ 1: "Happiness",
34
+ 2: "Sadness",
35
+ 3: "Surprise",
36
+ 4: "Fear",
37
+ 5: "Disgust",
38
+ 6: "Anger",
39
+ }
40
+
41
+ NAME_EMO_AUDIO = [
42
+ "Neutral",
43
+ "Anger",
44
+ "Disgust",
45
+ "Fear",
46
+ "Happiness",
47
+ "Sadness",
48
+ "Surprise",
49
+ "Other",
50
+ ]
51
+
52
+ DICT_CE = {
53
+ "Fearfully Surprised": [3, 6],
54
+ "Happily Surprised": [4, 6],
55
+ "Sadly Surprised": [5, 6],
56
+ "Disgustedly Surprised": [2, 6],
57
+ "Angrily Surprised": [1, 6],
58
+ "Sadly Fearful": [3, 5],
59
+ "Sadly Angry": [1, 5],
60
+ "Sadly Disgusted": [2, 5],
61
+ "Fearfully Angry": [1, 3],
62
+ "Fearfully Disgusted": [2, 3],
63
+ "Angrily Disgusted": [1, 2],
64
+ "Happily Disgusted": [2, 4],
65
+ }
66
+
67
+ DICT_PRED = {
68
+ 0: 'Neutral',
69
+ 1: 'Anger',
70
+ 2: 'Disgust',
71
+ 3: 'Fear',
72
+ 4: 'Happiness',
73
+ 5: 'Sadness',
74
+ 6: 'Surprise',
75
+ 7: 'Fearfully Surprised',
76
+ 8: 'Happily Surprised',
77
+ 9: 'Sadly Surprised',
78
+ 10: 'Disgustedly Surprised',
79
+ 11: 'Angrily Surprised',
80
+ 12: 'Sadly Fearful',
81
+ 13: 'Sadly Angry',
82
+ 14: 'Sadly Disgusted',
83
+ 15: 'Fearfully Angry',
84
+ 16: 'Fearfully Disgusted',
85
+ 17: 'Angrily Disgusted',
86
+ 18: 'Happily Disgusted',
87
+ }
88
+
89
+ AV_WEIGHTS = [
90
+ [
91
+ 0.89900098,
92
+ 0.10362151,
93
+ 0.08577635,
94
+ 0.04428126,
95
+ 0.89679865,
96
+ 0.02656456,
97
+ 0.63040305,
98
+ ],
99
+ [
100
+ 0.01223291,
101
+ 0.21364307,
102
+ 0.66688002,
103
+ 0.93791526,
104
+ 0.0398964,
105
+ 0.48670648,
106
+ 0.22089692,
107
+ ],
108
+ [
109
+ 0.08876611,
110
+ 0.68273542,
111
+ 0.24734363,
112
+ 0.01780348,
113
+ 0.06330495,
114
+ 0.48672896,
115
+ 0.14870002,
116
+ ],
117
+ ]
118
+
119
+ COLORS = {
120
+ 0: 'blue',
121
+ 1: 'orange',
122
+ 2: 'green',
123
+ 3: 'red',
124
+ 4: 'purple',
125
+ 5: 'brown',
126
+ 6: 'pink'
127
+ }
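For reference, a small self-contained sketch of how `flatten_dict` plus `SimpleNamespace` turn the nested `config.toml` contents into the attribute-style `config_data` object used elsewhere in the app. The nested dictionary below is a hypothetical stand-in for `toml.load("config.toml")`; the real keys (e.g. `APP_VERSION`, `AUDIO_STEP`, `model_static_url`) live in `config.toml`:

```python
from types import SimpleNamespace
from typing import Dict


def flatten_dict(prefix: str, d: Dict) -> Dict:
    # Same flattening as in app/config.py above: nested keys are joined with "_".
    result = {}
    for k, v in d.items():
        if isinstance(v, dict):
            result.update(flatten_dict(f"{prefix}{k}_", v))
        else:
            result[f"{prefix}{k}"] = v
    return result


# Hypothetical stand-in for toml.load("config.toml").
config = {"APP_VERSION": "0.1.0", "model": {"static": {"url": "https://example.org/m.pth"}}}

config_data = SimpleNamespace(**flatten_dict("", config))
print(config_data.APP_VERSION)       # 0.1.0
print(config_data.model_static_url)  # https://example.org/m.pth
```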
app/description.py ADDED
@@ -0,0 +1,20 @@
1
+ """
2
+ File: description.py
3
+ Author: Elena Ryumina and Dmitry Ryumin
4
+ Description: Project description for the Gradio app.
5
+ License: MIT License
6
+ """
7
+
8
+ # Importing necessary components for the Gradio app
9
+ from app.config import config_data
10
+
11
+ DESCRIPTION_DYNAMIC = f"""\
12
+ # Zero-Shot Audio-Visual Compound Expression Recognition (AVCER)
13
+
14
+ AVCER predicts six basic emotions (Anger, Disgust, Fear, Happiness, Sadness, Surprise), neutral state (Neutral), and twelve compound emotions
15
+ (Fearfully Surprised, Happily Surprised, Sadly Surprised, Disgustedly Surprised, Angrily Surprised, Sadly Fearful, Sadly Angry, Sadly Disgusted,
16
+ Fearfully Angry, Fearfully Disgusted, Angrily Disgusted, Happily Disgusted).
17
+
18
+ <div class="app-flex-container">
19
+ <img src="https://img.shields.io/badge/version-v{config_data.APP_VERSION}-rc0" alt="Version">
20
+ """
app/model.py ADDED
@@ -0,0 +1,72 @@
1
+ """
2
+ File: model.py
3
+ Author: Elena Ryumina and Dmitry Ryumin
4
+ Description: This module provides functions for loading and processing a pre-trained deep learning model
5
+ for facial expression recognition.
6
+ License: MIT License
7
+ """
8
+
9
+ import torch
10
+ import requests
11
+
12
+ # Importing necessary components for the Gradio app
13
+ from app.config import config_data
14
+ from app.model_architectures import ResNet50, LSTMPyTorch, ExprModelV3
15
+ from transformers import AutoFeatureExtractor
16
+
17
+ device = "cuda" if torch.cuda.is_available() else "cpu"
18
+
19
+ def load_model(model_url, model_path):
20
+ try:
21
+ with requests.get(model_url, stream=True) as response:
22
+ with open(model_path, "wb") as file:
23
+ for chunk in response.iter_content(chunk_size=8192):
24
+ file.write(chunk)
25
+ return model_path
26
+ except Exception as e:
27
+ print(f"Error loading model: {e}")
28
+ return None
29
+
30
+ gradients = {}
31
+ def get_gradients(name):
32
+ def hook(model, input, output):
33
+ gradients[name] = output
34
+ return hook
35
+
36
+ activations = {}
37
+ def get_activations(name):
38
+ def hook(model, input, output):
39
+ activations[name] = output.detach()
40
+ return hook
41
+
42
+ test_static = torch.rand(1, 3, 224, 224)
43
+ test_dynamic = torch.rand(1, 10, 512)
44
+ test_audio = torch.rand(1, 64000)
45
+
46
+ path_static = load_model(config_data.model_static_url, config_data.model_static_path)
47
+ pth_model_static = ResNet50(7, channels=3)
48
+ pth_model_static.load_state_dict(torch.load(path_static))
49
+ pth_model_static.to(device)
50
+ pth_model_static.eval()
51
+ pth_model_static(test_static.to(device))
52
+
53
+ pth_model_static.layer4.register_full_backward_hook(get_gradients('layer4'))
54
+ pth_model_static.layer4.register_forward_hook(get_activations('layer4'))
55
+ pth_model_static.fc1.register_forward_hook(get_activations('features'))
56
+
57
+ path_dynamic = load_model(config_data.model_dynamic_url, config_data.model_dynamic_path)
58
+ pth_model_dynamic = LSTMPyTorch()
59
+ pth_model_dynamic.load_state_dict(torch.load(path_dynamic))
60
+ pth_model_dynamic.to(device)
61
+ pth_model_dynamic.eval()
62
+ pth_model_dynamic(test_dynamic.to(device))
63
+
64
+ path_audio_model_1 = "audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim"
65
+ path_audio_model_2 = load_model(config_data.model_audio_url, config_data.model_audio_path)
66
+ audio_processor = AutoFeatureExtractor.from_pretrained(path_audio_model_1)
67
+
68
+ audio_model = ExprModelV3.from_pretrained(path_audio_model_1)
69
+ audio_model.load_state_dict(torch.load(path_audio_model_2)["model_state_dict"])
70
+ audio_model.to(device)
71
+ audio_model.eval()
72
+ audio_model(test_audio.to(device))
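The `activations` dictionary above is populated via PyTorch forward hooks registered on `fc1` and `layer4`. A minimal sketch of the same mechanism on a toy module (the toy layer sizes are placeholders rather than the real ResNet-50, but the hook API is identical):

```python
import torch
import torch.nn as nn

activations = {}

def get_activations(name):
    # Store the layer output whenever the hooked module runs a forward pass.
    def hook(module, inputs, output):
        activations[name] = output.detach()
    return hook

# Toy stand-in for pth_model_static: the hook is attached to the penultimate layer,
# just as fc1 is hooked under the name "features" in app/model.py above.
model = nn.Sequential(nn.Flatten(), nn.Linear(3 * 224 * 224, 512), nn.ReLU(), nn.Linear(512, 7))
model[1].register_forward_hook(get_activations("features"))

with torch.no_grad():
    logits = model(torch.rand(1, 3, 224, 224))

print(logits.shape, activations["features"].shape)  # torch.Size([1, 7]) torch.Size([1, 512])
```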
app/model_architectures.py ADDED
@@ -0,0 +1,483 @@
1
+ """
2
+ File: model_architectures.py
3
+ Author: Elena Ryumina and Dmitry Ryumin
4
+ Description: This module provides model architectures.
5
+ License: MIT License
6
+ """
7
+
8
+ import torch
9
+ import torch.nn as nn
10
+ import torch.nn.functional as F
11
+ import math
12
+ import numpy as np
13
+
14
+ from transformers.models.wav2vec2.modeling_wav2vec2 import (
15
+ Wav2Vec2Model,
16
+ Wav2Vec2PreTrainedModel,
17
+ )
18
+ from typing import Optional
19
+
20
+
21
+ class Bottleneck(nn.Module):
22
+ expansion = 4
23
+ def __init__(self, in_channels, out_channels, i_downsample=None, stride=1):
24
+ super(Bottleneck, self).__init__()
25
+
26
+ self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=stride, padding=0, bias=False)
27
+ self.batch_norm1 = nn.BatchNorm2d(out_channels, eps=0.001, momentum=0.99)
28
+
29
+ self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, padding='same', bias=False)
30
+ self.batch_norm2 = nn.BatchNorm2d(out_channels, eps=0.001, momentum=0.99)
31
+
32
+ self.conv3 = nn.Conv2d(out_channels, out_channels*self.expansion, kernel_size=1, stride=1, padding=0, bias=False)
33
+ self.batch_norm3 = nn.BatchNorm2d(out_channels*self.expansion, eps=0.001, momentum=0.99)
34
+
35
+ self.i_downsample = i_downsample
36
+ self.stride = stride
37
+ self.relu = nn.ReLU()
38
+
39
+ def forward(self, x):
40
+ identity = x.clone()
41
+ x = self.relu(self.batch_norm1(self.conv1(x)))
42
+
43
+ x = self.relu(self.batch_norm2(self.conv2(x)))
44
+
45
+ x = self.conv3(x)
46
+ x = self.batch_norm3(x)
47
+
48
+ #downsample if needed
49
+ if self.i_downsample is not None:
50
+ identity = self.i_downsample(identity)
51
+ #add identity
52
+ x+=identity
53
+ x=self.relu(x)
54
+
55
+ return x
56
+
57
+
58
+ class Conv2dSame(torch.nn.Conv2d):
59
+
60
+ def calc_same_pad(self, i: int, k: int, s: int, d: int) -> int:
61
+ return max((math.ceil(i / s) - 1) * s + (k - 1) * d + 1 - i, 0)
62
+
63
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
64
+ ih, iw = x.size()[-2:]
65
+
66
+ pad_h = self.calc_same_pad(i=ih, k=self.kernel_size[0], s=self.stride[0], d=self.dilation[0])
67
+ pad_w = self.calc_same_pad(i=iw, k=self.kernel_size[1], s=self.stride[1], d=self.dilation[1])
68
+
69
+ if pad_h > 0 or pad_w > 0:
70
+ x = F.pad(
71
+ x, [pad_w // 2, pad_w - pad_w // 2, pad_h // 2, pad_h - pad_h // 2]
72
+ )
73
+ return F.conv2d(
74
+ x,
75
+ self.weight,
76
+ self.bias,
77
+ self.stride,
78
+ self.padding,
79
+ self.dilation,
80
+ self.groups,
81
+ )
82
+
83
+
84
+ class ResNet(nn.Module):
85
+ def __init__(self, ResBlock, layer_list, num_classes, num_channels=3):
86
+ super(ResNet, self).__init__()
87
+ self.in_channels = 64
88
+
89
+ self.conv_layer_s2_same = Conv2dSame(num_channels, 64, 7, stride=2, groups=1, bias=False)
90
+ self.batch_norm1 = nn.BatchNorm2d(64, eps=0.001, momentum=0.99)
91
+ self.relu = nn.ReLU()
92
+ self.max_pool = nn.MaxPool2d(kernel_size = 3, stride=2)
93
+
94
+ self.layer1 = self._make_layer(ResBlock, layer_list[0], planes=64, stride=1)
95
+ self.layer2 = self._make_layer(ResBlock, layer_list[1], planes=128, stride=2)
96
+ self.layer3 = self._make_layer(ResBlock, layer_list[2], planes=256, stride=2)
97
+ self.layer4 = self._make_layer(ResBlock, layer_list[3], planes=512, stride=2)
98
+
99
+ self.avgpool = nn.AdaptiveAvgPool2d((1,1))
100
+ self.fc1 = nn.Linear(512*ResBlock.expansion, 512)
101
+ self.relu1 = nn.ReLU()
102
+ self.fc2 = nn.Linear(512, num_classes)
103
+
104
+ def extract_features(self, x):
105
+ x = self.relu(self.batch_norm1(self.conv_layer_s2_same(x)))
106
+ x = self.max_pool(x)
107
+ # print(x.shape)
108
+ x = self.layer1(x)
109
+ x = self.layer2(x)
110
+ x = self.layer3(x)
111
+ x = self.layer4(x)
112
+
113
+ x = self.avgpool(x)
114
+ x = x.reshape(x.shape[0], -1)
115
+ x = self.fc1(x)
116
+ return x
117
+
118
+ def forward(self, x):
119
+ x = self.extract_features(x)
120
+ x = self.relu1(x)
121
+ x = self.fc2(x)
122
+ return x
123
+
124
+ def _make_layer(self, ResBlock, blocks, planes, stride=1):
125
+ ii_downsample = None
126
+ layers = []
127
+
128
+ if stride != 1 or self.in_channels != planes*ResBlock.expansion:
129
+ ii_downsample = nn.Sequential(
130
+ nn.Conv2d(self.in_channels, planes*ResBlock.expansion, kernel_size=1, stride=stride, bias=False, padding=0),
131
+ nn.BatchNorm2d(planes*ResBlock.expansion, eps=0.001, momentum=0.99)
132
+ )
133
+
134
+ layers.append(ResBlock(self.in_channels, planes, i_downsample=ii_downsample, stride=stride))
135
+ self.in_channels = planes*ResBlock.expansion
136
+
137
+ for i in range(blocks-1):
138
+ layers.append(ResBlock(self.in_channels, planes))
139
+
140
+ return nn.Sequential(*layers)
141
+
142
+
143
+ def ResNet50(num_classes, channels=3):
144
+ return ResNet(Bottleneck, [3,4,6,3], num_classes, channels)
145
+
146
+
147
+ class LSTMPyTorch(nn.Module):
148
+ def __init__(self):
149
+ super(LSTMPyTorch, self).__init__()
150
+
151
+ self.lstm1 = nn.LSTM(input_size=512, hidden_size=512, batch_first=True, bidirectional=False)
152
+ self.lstm2 = nn.LSTM(input_size=512, hidden_size=256, batch_first=True, bidirectional=False)
153
+ self.fc = nn.Linear(256, 7)
154
+ # self.softmax = nn.Softmax(dim=1)
155
+
156
+ def forward(self, x):
157
+ x, _ = self.lstm1(x)
158
+ x, _ = self.lstm2(x)
159
+ x = self.fc(x[:, -1, :])
160
+ # x = self.softmax(x)
161
+ return x
162
+
163
+
164
+ class ExprModelV3(Wav2Vec2PreTrainedModel):
165
+ def __init__(self, config) -> None:
166
+ super().__init__(config)
167
+ self.config = config
168
+ self.wav2vec2 = Wav2Vec2Model(config)
169
+
170
+ self.tl1 = TransformerLayer(
171
+ input_dim=1024, num_heads=32, dropout=0.1, positional_encoding=True
172
+ )
173
+ self.tl2 = TransformerLayer(
174
+ input_dim=1024, num_heads=16, dropout=0.1, positional_encoding=True
175
+ )
176
+
177
+ self.f_size = 1024
178
+
179
+ self.time_downsample = torch.nn.Sequential(
180
+ torch.nn.Conv1d(
181
+ self.f_size, self.f_size, kernel_size=5, stride=3, dilation=2
182
+ ),
183
+ torch.nn.BatchNorm1d(self.f_size),
184
+ torch.nn.MaxPool1d(5),
185
+ torch.nn.ReLU(),
186
+ torch.nn.Conv1d(self.f_size, self.f_size, kernel_size=3),
187
+ torch.nn.BatchNorm1d(self.f_size),
188
+ torch.nn.AdaptiveAvgPool1d(1),
189
+ torch.nn.ReLU(),
190
+ )
191
+
192
+ self.feature_downsample = nn.Linear(self.f_size, 8)
193
+
194
+ self.init_weights()
195
+ self.unfreeze_last_n_blocks(4)
196
+
197
+ def freeze_conv_only(self):
198
+ # freeze conv
199
+ for param in self.wav2vec2.feature_extractor.conv_layers.parameters():
200
+ param.requires_grad = False
201
+
202
+ def unfreeze_last_n_blocks(self, num_blocks: int) -> None:
203
+ # freeze all wav2vec
204
+ for param in self.wav2vec2.parameters():
205
+ param.requires_grad = False
206
+
207
+ # unfreeze last n transformer blocks
208
+ for i in range(0, num_blocks):
209
+ for param in self.wav2vec2.encoder.layers[-1 * (i + 1)].parameters():
210
+ param.requires_grad = True
211
+
212
+ def forward(self, x):
213
+ x = self.wav2vec2(x)[0]
214
+
215
+ x = self.tl1(query=x, key=x, value=x)
216
+ x = self.tl2(query=x, key=x, value=x)
217
+
218
+ x = x.permute(0, 2, 1)
219
+ x = self.time_downsample(x)
220
+
221
+ x = x.squeeze()
222
+ x = self.feature_downsample(x)
223
+ return x
224
+
225
+
226
+ class ScaledDotProductAttention_MultiHead(nn.Module):
227
+
228
+ def __init__(self):
229
+ super(ScaledDotProductAttention_MultiHead, self).__init__()
230
+ self.softmax = nn.Softmax(dim=-1)
231
+
232
+ def forward(self, query, key, value, mask=None):
233
+ if mask is not None:
234
+ raise ValueError("Mask is not supported yet")
235
+
236
+ # key, query, value shapes: [batch_size, num_heads, seq_len, dim]
237
+ emb_dim = key.shape[-1]
238
+
239
+ # Calculate attention weights
240
+ attention_weights = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(
241
+ emb_dim
242
+ )
243
+
244
+ # masking
245
+ if mask is not None:
246
+ raise ValueError("Mask is not supported yet")
247
+
248
+ # Softmax
249
+ attention_weights = self.softmax(attention_weights)
250
+
251
+ # modify value
252
+ value = torch.matmul(attention_weights, value)
253
+
254
+ return value, attention_weights
255
+
256
+
257
+ class PositionWiseFeedForward(nn.Module):
258
+
259
+ def __init__(self, input_dim, hidden_dim, dropout: float = 0.1):
260
+ super().__init__()
261
+ self.layer_1 = nn.Linear(input_dim, hidden_dim)
262
+ self.layer_2 = nn.Linear(hidden_dim, input_dim)
263
+ self.layer_norm = nn.LayerNorm(input_dim)
264
+ self.dropout = nn.Dropout(dropout)
265
+
266
+ def forward(self, x):
267
+ # feed-forward network
268
+ x = self.layer_1(x)
269
+ x = self.dropout(x)
270
+ x = F.relu(x)
271
+ x = self.layer_2(x)
272
+
273
+ return x
274
+
275
+
276
+ class Add_and_Norm(nn.Module):
277
+
278
+ def __init__(self, input_dim, dropout: Optional[float] = 0.1):
279
+ super().__init__()
280
+ self.layer_norm = nn.LayerNorm(input_dim)
281
+ if dropout is not None:
282
+ self.dropout = nn.Dropout(dropout)
283
+
284
+ def forward(self, x1, residual):
285
+ x = x1
286
+ # apply dropout if needed
287
+ if hasattr(self, "dropout"):
288
+ x = self.dropout(x)
289
+ # add and then norm
290
+ x = x + residual
291
+ x = self.layer_norm(x)
292
+
293
+ return x
294
+
295
+
296
+ class MultiHeadAttention(nn.Module):
297
+
298
+ def __init__(self, input_dim, num_heads, dropout: Optional[float] = 0.1):
299
+ super().__init__()
300
+ self.input_dim = input_dim
301
+ self.num_heads = num_heads
302
+ if input_dim % num_heads != 0:
303
+ raise ValueError("input_dim must be divisible by num_heads")
304
+ self.head_dim = input_dim // num_heads
305
+ self.dropout = dropout
306
+
307
+ # initialize weights
308
+ self.query_w = nn.Linear(input_dim, self.num_heads * self.head_dim, bias=False)
309
+ self.keys_w = nn.Linear(input_dim, self.num_heads * self.head_dim, bias=False)
310
+ self.values_w = nn.Linear(input_dim, self.num_heads * self.head_dim, bias=False)
311
+ self.ff_layer_after_concat = nn.Linear(
312
+ self.num_heads * self.head_dim, input_dim, bias=False
313
+ )
314
+
315
+ self.attention = ScaledDotProductAttention_MultiHead()
316
+
317
+ if self.dropout is not None:
318
+ self.dropout = nn.Dropout(dropout)
319
+
320
+ def forward(self, queries, keys, values, mask=None):
321
+ # query, keys, values shapes: [batch_size, seq_len, input_dim]
322
+ batch_size, len_query, len_keys, len_values = (
323
+ queries.size(0),
324
+ queries.size(1),
325
+ keys.size(1),
326
+ values.size(1),
327
+ )
328
+
329
+ # linear transformation before attention
330
+ queries = (
331
+ self.query_w(queries)
332
+ .view(batch_size, len_query, self.num_heads, self.head_dim)
333
+ .transpose(1, 2)
334
+ ) # [batch_size, num_heads, seq_len, dim]
335
+ keys = (
336
+ self.keys_w(keys)
337
+ .view(batch_size, len_keys, self.num_heads, self.head_dim)
338
+ .transpose(1, 2)
339
+ ) # [batch_size, num_heads, seq_len, dim]
340
+ values = (
341
+ self.values_w(values)
342
+ .view(batch_size, len_values, self.num_heads, self.head_dim)
343
+ .transpose(1, 2)
344
+ ) # [batch_size, num_heads, seq_len, dim]
345
+
346
+ # attention itself
347
+ values, attention_weights = self.attention(
348
+ queries, keys, values, mask=mask
349
+ ) # values shape:[batch_size, num_heads, seq_len, dim]
350
+
351
+ # concatenation
352
+ out = (
353
+ values.transpose(1, 2)
354
+ .contiguous()
355
+ .view(batch_size, len_values, self.num_heads * self.head_dim)
356
+ ) # [batch_size, seq_len, num_heads * dim = input_dim]
357
+ # go through last linear layer
358
+ out = self.ff_layer_after_concat(out)
359
+
360
+ return out
361
+
362
+
363
+ class EncoderLayer(nn.Module):
364
+
365
+ def __init__(
366
+ self,
367
+ input_dim,
368
+ num_heads,
369
+ dropout: Optional[float] = 0.1,
370
+ positional_encoding: bool = True,
371
+ ):
372
+ super(EncoderLayer, self).__init__()
373
+ self.positional_encoding = positional_encoding
374
+ self.input_dim = input_dim
375
+ self.num_heads = num_heads
376
+ self.head_dim = input_dim // num_heads
377
+ self.dropout = dropout
378
+
379
+ # initialize layers
380
+ self.self_attention = MultiHeadAttention(input_dim, num_heads, dropout=dropout)
381
+ self.feed_forward = PositionWiseFeedForward(
382
+ input_dim, input_dim, dropout=dropout
383
+ )
384
+ self.add_norm_after_attention = Add_and_Norm(input_dim, dropout=dropout)
385
+ self.add_norm_after_ff = Add_and_Norm(input_dim, dropout=dropout)
386
+
387
+ # calculate positional encoding
388
+ if self.positional_encoding:
389
+ self.positional_encoding = PositionalEncoding(input_dim)
390
+
391
+ def forward(self, x):
392
+ # x shape: [batch_size, seq_len, input_dim]
393
+ # positional encoding
394
+ if self.positional_encoding:
395
+ x = self.positional_encoding(x)
396
+
397
+ # multi-head attention
398
+ residual = x
399
+ x = self.self_attention(x, x, x)
400
+ x = self.add_norm_after_attention(x, residual)
401
+
402
+ # feed forward
403
+ residual = x
404
+ x = self.feed_forward(x)
405
+ x = self.add_norm_after_ff(x, residual)
406
+
407
+ return x
408
+
409
+
410
+ class PositionalEncoding(nn.Module):
411
+
412
+ def __init__(self, d_model: int, dropout: float = 0.1, max_len: int = 5000):
413
+ super().__init__()
414
+ self.dropout = nn.Dropout(p=dropout)
415
+
416
+ position = torch.arange(max_len).unsqueeze(1)
417
+ div_term = torch.exp(
418
+ torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model)
419
+ )
420
+ pe = torch.zeros(max_len, 1, d_model)
421
+ pe[:, 0, 0::2] = torch.sin(position * div_term)
422
+ pe[:, 0, 1::2] = torch.cos(position * div_term)
423
+ pe = pe.permute(
424
+ 1, 0, 2
425
+ ) # [seq_len, batch_size, embedding_dim] -> [batch_size, seq_len, embedding_dim]
426
+ self.register_buffer("pe", pe)
427
+
428
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
429
+ """
430
+ Args:
431
+ x: Tensor, shape [batch_size, seq_len, embedding_dim]
432
+ """
433
+ x = x + self.pe[:, : x.size(1)]
434
+ return self.dropout(x)
435
+
436
+
437
+ class TransformerLayer(nn.Module):
438
+
439
+ def __init__(
440
+ self,
441
+ input_dim,
442
+ num_heads,
443
+ dropout: Optional[float] = 0.1,
444
+ positional_encoding: bool = True,
445
+ ):
446
+ super(TransformerLayer, self).__init__()
447
+ self.positional_encoding = positional_encoding
448
+ self.input_dim = input_dim
449
+ self.num_heads = num_heads
450
+ self.head_dim = input_dim // num_heads
451
+ self.dropout = dropout
452
+
453
+ # initialize layers
454
+ self.self_attention = MultiHeadAttention(input_dim, num_heads, dropout=dropout)
455
+ self.feed_forward = PositionWiseFeedForward(
456
+ input_dim, input_dim, dropout=dropout
457
+ )
458
+ self.add_norm_after_attention = Add_and_Norm(input_dim, dropout=dropout)
459
+ self.add_norm_after_ff = Add_and_Norm(input_dim, dropout=dropout)
460
+
461
+ # calculate positional encoding
462
+ if self.positional_encoding:
463
+ self.positional_encoding = PositionalEncoding(input_dim)
464
+
465
+ def forward(self, key, value, query, mask=None):
466
+ # key, value, and query shapes: [batch_size, seq_len, input_dim]
467
+ # positional encoding
468
+ if self.positional_encoding:
469
+ key = self.positional_encoding(key)
470
+ value = self.positional_encoding(value)
471
+ query = self.positional_encoding(query)
472
+
473
+ # multi-head attention
474
+ residual = query
475
+ x = self.self_attention(queries=query, keys=key, values=value, mask=mask)
476
+ x = self.add_norm_after_attention(x, residual)
477
+
478
+ # feed forward
479
+ residual = x
480
+ x = self.feed_forward(x)
481
+ x = self.add_norm_after_ff(x, residual)
482
+
483
+ return x
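As a sanity check for the two visual architectures defined above, the sketch below feeds them the same dummy tensor shapes used in `app/model.py` (randomly initialized weights, so the outputs are meaningless; it only illustrates the expected input/output shapes, assuming the repository is on the Python path):

```python
import torch

from app.model_architectures import LSTMPyTorch, ResNet50

static_model = ResNet50(7, channels=3)  # per-frame model: 224x224 RGB face -> 7 emotion logits
dynamic_model = LSTMPyTorch()           # temporal model: 10 x 512 frame features -> 7 emotion logits

with torch.no_grad():
    frame_logits = static_model(torch.rand(1, 3, 224, 224))
    clip_logits = dynamic_model(torch.rand(1, 10, 512))

print(frame_logits.shape, clip_logits.shape)  # torch.Size([1, 7]) torch.Size([1, 7])
```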
app/plot.py ADDED
@@ -0,0 +1,177 @@
1
+ """
2
+ File: plot.py
3
+ Author: Elena Ryumina and Dmitry Ryumin
4
+ Description: Plotting statistical information.
5
+ License: MIT License
6
+ """
7
+ import matplotlib.pyplot as plt
8
+ import numpy as np
9
+ import cv2
10
+ import torch
11
+
12
+ # Importing necessary components for the Gradio app
13
+ from app.config import DICT_PRED
14
+
15
+ def show_cam_on_image(
16
+ img: np.ndarray,
17
+ mask: np.ndarray,
18
+ use_rgb: bool = False,
19
+ colormap: int = cv2.COLORMAP_JET,
20
+ image_weight: float = 0.5,
21
+ ) -> np.ndarray:
22
+ """This function overlays the cam mask on the image as an heatmap.
23
+ By default the heatmap is in BGR format.
24
+
25
+ :param img: The base image in RGB or BGR format.
26
+ :param mask: The cam mask.
27
+ :param use_rgb: Whether to use an RGB or BGR heatmap, this should be set to True if 'img' is in RGB format.
28
+ :param colormap: The OpenCV colormap to be used.
29
+ :param image_weight: The final result is image_weight * img + (1-image_weight) * mask.
30
+ :returns: The default image with the cam overlay.
31
+
32
+ Implemented by https://github.com/jacobgil/pytorch-grad-cam/blob/master/pytorch_grad_cam/utils/image.py
33
+ """
34
+ heatmap = cv2.applyColorMap(np.uint8(255 * mask), colormap)
35
+ if use_rgb:
36
+ heatmap = cv2.cvtColor(heatmap, cv2.COLOR_BGR2RGB)
37
+ heatmap = np.float32(heatmap) / 255
38
+
39
+ if np.max(img) > 1:
40
+ raise Exception("The input image should be np.float32 in the range [0, 1]")
41
+
42
+ if image_weight < 0 or image_weight > 1:
43
+ raise Exception(
44
+ f"image_weight should be in the range [0, 1].\
45
+ Got: {image_weight}"
46
+ )
47
+
48
+ cam = (1 - image_weight) * heatmap + image_weight * img
49
+ cam = cam / np.max(cam)
50
+ return np.uint8(255 * cam)
51
+
52
+
53
+ def get_heatmaps(
54
+ gradients, activations, name_layer, face_image, use_rgb=True, image_weight=0.6
55
+ ):
56
+ gradient = gradients[name_layer]
57
+ activation = activations[name_layer]
58
+ pooled_gradients = torch.mean(gradient[0], dim=[0, 2, 3])
59
+ for i in range(activation.size()[1]):
60
+ activation[:, i, :, :] *= pooled_gradients[i]
61
+ heatmap = torch.mean(activation, dim=1).squeeze().cpu()
62
+ heatmap = np.maximum(heatmap, 0)
63
+ heatmap /= torch.max(heatmap)
64
+ heatmap = torch.unsqueeze(heatmap, -1)
65
+ heatmap = cv2.resize(heatmap.detach().numpy(), (224, 224))
66
+ cur_face_hm = cv2.resize(face_image, (224, 224))
67
+ cur_face_hm = np.float32(cur_face_hm) / 255
68
+
69
+ heatmap = show_cam_on_image(
70
+ cur_face_hm, heatmap, use_rgb=use_rgb, image_weight=image_weight
71
+ )
72
+
73
+ return heatmap
74
+
75
+ def plot_compound_expression_prediction(
76
+ dict_preds: dict[str, list[float]],
77
+ save_path: str = None,
78
+ frame_indices: list[int] = None,
79
+ colors: list[str] = ["green", "orange", "red", "purple", "blue"],
80
+ figsize: tuple = (12, 6),
81
+ title: str = "Compound expression prediction",
82
+ ) -> plt.Figure:
83
+ fig, ax = plt.subplots(figsize=figsize)
84
+
85
+ for idx, (k, v) in enumerate(dict_preds.items()):
86
+ if idx == 2:
87
+ offset = (idx+1 - len(dict_preds) // 2) * 0.1
88
+ elif idx == 3:
89
+ offset = (idx-1 - len(dict_preds) // 2) * 0.1
90
+ else:
91
+ offset = (idx - len(dict_preds) // 2) * 0.1
92
+ shifted_v = [val + offset + 1 for val in v]
93
+ ax.plot(range(1, len(shifted_v) + 1), shifted_v, color=colors[idx], linestyle='dotted', label=k)
94
+
95
+ ax.legend()
96
+ ax.grid(True)
97
+ ax.set_xlabel("Number of frames")
98
+ ax.set_ylabel("Basic emotion / compound expression")
99
+ ax.set_title(title)
100
+
101
+ ax.set_xticks([i+1 for i in frame_indices])
102
+ ax.set_yticks(
103
+ range(0, 21)
104
+ )
105
+ ax.set_yticklabels([''] + list(DICT_PRED.values()) + [''])
106
+
107
+ fig.tight_layout()
108
+
109
+ if save_path:
110
+ fig.savefig(
111
+ save_path,
112
+ format=save_path.rsplit(".", 1)[1],
113
+ bbox_inches="tight",
114
+ pad_inches=0,
115
+ )
116
+
117
+ return fig
118
+
+ def display_frame_info(img, text, margin=1.0, box_scale=1.0):
+     img_copy = img.copy()
+     img_h, img_w, _ = img_copy.shape
+     line_width = int(min(img_h, img_w) * 0.001)
+     thickness = max(int(line_width / 3), 1)
+
+     font_face = cv2.FONT_HERSHEY_SIMPLEX
+     font_color = (0, 0, 0)
+     font_scale = thickness / 1.5
+
+     t_w, t_h = cv2.getTextSize(text, font_face, font_scale, None)[0]
+
+     margin_n = int(t_h * margin)
+     sub_img = img_copy[0 + margin_n: 0 + margin_n + t_h + int(2 * t_h * box_scale),
+                        img_w - t_w - margin_n - int(2 * t_h * box_scale): img_w - margin_n]
+
+     white_rect = np.ones(sub_img.shape, dtype=np.uint8) * 255
+
+     img_copy[0 + margin_n: 0 + margin_n + t_h + int(2 * t_h * box_scale),
+              img_w - t_w - margin_n - int(2 * t_h * box_scale):img_w - margin_n] = cv2.addWeighted(sub_img, 0.5, white_rect, .5, 1.0)
+
+     cv2.putText(img=img_copy,
+                 text=text,
+                 org=(img_w - t_w - margin_n - int(2 * t_h * box_scale) // 2,
+                      0 + margin_n + t_h + int(2 * t_h * box_scale) // 2),
+                 fontFace=font_face,
+                 fontScale=font_scale,
+                 color=font_color,
+                 thickness=thickness,
+                 lineType=cv2.LINE_AA,
+                 bottomLeftOrigin=False)
+
+     return img_copy
+
+ def plot_audio(time_axis, waveform, frame_indices, fps, figsize=(10, 4)) -> plt.Figure:
+     frame_times = np.array(frame_indices) / fps
+
+     fig, ax = plt.subplots(figsize=figsize)
+     ax.plot(time_axis, waveform[0])
+     ax.set_xlabel('Time (frames)')
+     ax.set_ylabel('Amplitude')
+     ax.grid(True)
+
+     ax.set_xticks(frame_times)
+     ax.set_xticklabels([f'{int(frame_time*fps)+1}' for frame_time in frame_times])
+
+     fig.tight_layout()
+
+     return fig
+
+ def plot_images(image_paths):
+     fig, axes = plt.subplots(1, len(image_paths), figsize=(12, 2))
+
+     for ax, img_path in zip(axes, image_paths):
+         ax.imshow(img_path)
+         ax.axis('off')
+
+     fig.tight_layout()
+     return fig
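
Note: `get_heatmaps` expects `gradients` and `activations` dictionaries keyed by layer name, but this part of the diff does not show how they are filled. Below is a minimal Grad-CAM wiring sketch, assuming a torchvision-style backbone with a layer named "layer4"; the hook helper and the usage lines are illustrative and not part of this commit.

    import torch

    activations, gradients = {}, {}

    def register_cam_hooks(model, name_layer="layer4"):
        # Hypothetical helper: cache feature maps and their gradients so that
        # get_heatmaps(gradients, activations, name_layer, face_image) can use them.
        layer = dict(model.named_modules())[name_layer]

        def forward_hook(module, inputs, output):
            activations[name_layer] = output.detach()

        def backward_hook(module, grad_input, grad_output):
            gradients[name_layer] = [g.detach() for g in grad_output]

        layer.register_forward_hook(forward_hook)
        layer.register_full_backward_hook(backward_hook)

    # Usage sketch (model and face inputs are assumed to exist):
    # register_cam_hooks(model, "layer4")
    # logits = model(pth_processing(face_pil_image))
    # logits[0, logits.argmax()].backward()
    # heatmap = get_heatmaps(gradients, activations, "layer4", face_bgr_crop)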
app/utils.py ADDED
@@ -0,0 +1,273 @@
+ """
+ File: utils.py
+ Author: Elena Ryumina and Dmitry Ryumin
+ Description: This module contains utility functions related to facial landmarks, image, and audio processing.
+ License: MIT License
+ """
+
+ import numpy as np
+ import pandas as pd
+ import math
+
+ import subprocess
+ import torchaudio
+ import torch
+ import os
+
+ from PIL import Image
+ from torchvision import transforms
+ from matplotlib.figure import Figure
+
+ # Importing necessary components for the Gradio app
+ from app.config import NAME_EMO_AUDIO, DICT_CE, config_data
+ from app.plot import plot_compound_expression_prediction, plot_audio
+
+
+ def norm_coordinates(normalized_x, normalized_y, image_width, image_height):
+     x_px = min(math.floor(normalized_x * image_width), image_width - 1)
+     y_px = min(math.floor(normalized_y * image_height), image_height - 1)
+     return x_px, y_px
+
+
+ def get_box(fl, w, h):
+     idx_to_coors = {}
+     for idx, landmark in enumerate(fl.landmark):
+         landmark_px = norm_coordinates(landmark.x, landmark.y, w, h)
+         if landmark_px:
+             idx_to_coors[idx] = landmark_px
+
+     x_min = np.min(np.asarray(list(idx_to_coors.values()))[:, 0])
+     y_min = np.min(np.asarray(list(idx_to_coors.values()))[:, 1])
+     endX = np.max(np.asarray(list(idx_to_coors.values()))[:, 0])
+     endY = np.max(np.asarray(list(idx_to_coors.values()))[:, 1])
+
+     (startX, startY) = (max(0, x_min), max(0, y_min))
+     (endX, endY) = (min(w - 1, endX), min(h - 1, endY))
+
+     return startX, startY, endX, endY
+
+
+ def pth_processing(fp):
+     class PreprocessInput(torch.nn.Module):
+         def __init__(self):
+             super(PreprocessInput, self).__init__()
+
+         def forward(self, x):
+             x = x.to(torch.float32)
+             x = torch.flip(x, dims=(0,))
+             x[0, :, :] -= 91.4953
+             x[1, :, :] -= 103.8827
+             x[2, :, :] -= 131.0912
+             return x
+
+     def get_img_torch(img, target_size=(224, 224)):
+         transform = transforms.Compose([transforms.PILToTensor(), PreprocessInput()])
+         img = img.resize(target_size, Image.Resampling.NEAREST)
+         img = transform(img)
+         img = torch.unsqueeze(img, 0)
+         return img
+
+     return get_img_torch(fp)
+
+ def convert_webm_to_mp4(input_file):
+     path_save = input_file.split('.')[0] + ".mp4"
+
+     if not os.path.exists(path_save):
+         ff_video = "ffmpeg -i {} -c:v copy -c:a aac -strict experimental {}".format(
+             input_file, path_save
+         )
+         subprocess.call(ff_video, shell=True)
+
+     return path_save
+
+ def convert_mp4_to_mp3(path, frame_indices, fps, sampling_rate=16000):
+     path_save = path.split('.')[0] + ".wav"
+     if not os.path.exists(path_save):
+         ff_audio = "ffmpeg -i {} -vn -acodec pcm_s16le -ar 44100 -ac 2 {}".format(
+             path, path_save
+         )
+         subprocess.call(ff_audio, shell=True)
+     wav, sr = torchaudio.load(path_save)
+
+     num_frames = wav.numpy().shape[1]
+     time_axis = [i / sr for i in range(num_frames)]
+
+     plt = plot_audio(time_axis, wav, frame_indices, fps, (12, 2))
+
+     if wav.size(0) > 1:
+         wav = wav.mean(dim=0, keepdim=True)
+
+     if sr != sampling_rate:
+         transform = torchaudio.transforms.Resample(orig_freq=sr, new_freq=sampling_rate)
+         wav = transform(wav)
+         sr = sampling_rate
+
+     assert sr == sampling_rate
+     return wav.squeeze(0), plt
+
+
+ def pad_wav(wav, max_length):
+     current_length = len(wav)
+     if current_length < max_length:
+         repetitions = (max_length + current_length - 1) // current_length
+         wav = torch.cat([wav] * repetitions, dim=0)[:max_length]
+     elif current_length > max_length:
+         wav = wav[:max_length]
+
+     return wav
+
+
+ def pad_wav_zeros(wav, max_length, mode="constant"):
+     if mode == "mean":
+         wav = torch.nn.functional.pad(
+             wav,
+             (0, max(0, max_length - wav.shape[0])),
+             mode="constant",
+             value=torch.mean(wav),
+         )
+     else:
+         wav = torch.nn.functional.pad(
+             wav, (0, max(0, max_length - wav.shape[0])), mode=mode
+         )
+
+     return wav
+
+ def softmax(matrix):
+     exp_matrix = np.exp(matrix - np.max(matrix, axis=1, keepdims=True))
+     return exp_matrix / np.sum(exp_matrix, axis=1, keepdims=True)
+
+
+ def get_compound_expression(pred, com_emo):
+     pred = np.asarray(pred)
+     prob = np.zeros((len(pred), len(com_emo)))
+     for idx, (_, v) in enumerate(com_emo.items()):
+         idx_1 = v[0]
+         idx_2 = v[1]
+         prob[:, idx] = pred[:, idx_1] + pred[:, idx_2]
+     return prob
+
+
+ def get_image_location(curr_video, frame):
+     frame = int(frame.split(".")[0]) + 1
+     frame = str(frame).zfill(5) + ".jpg"
+     return f"{curr_video}/{frame}"
+
+
+ def save_txt(column_names, file_names, labels, save_name):
+     data_lines = [",".join(column_names)]
+     for file_name, label in zip(file_names, labels):
+         data_lines.append(f"{file_name},{label}")
+
+     with open(save_name, "w") as file:
+         for line in data_lines:
+             file.write(line + "\n")
+
+ def get_mix_pred(emo_pred, ce_prob):
+     pred = []
+     for idx, curr_pred in enumerate(emo_pred):
+         if np.max(curr_pred) > config_data.CONFIDENCE_BE:
+             pred.append(np.argmax(curr_pred))
+         else:
+             pred.append(ce_prob[idx] + 6)
+     return pred
+
+ def get_c_expr_db_pred(
+     stat_df: pd.DataFrame,
+     dyn_df: pd.DataFrame,
+     audio_df: pd.DataFrame,
+     name_video: str,
+     weights_1: list[float],
+     frame_indices: list[int],
+ ) -> tuple[pd.DataFrame, Figure]:
+     """
+     Predict compound expressions using audio-visual emotional probabilities, optimized weights, and rules.
+
+     Args:
+         stat_df (pd.DataFrame): DataFrame containing static visual probabilities.
+         dyn_df (pd.DataFrame): DataFrame containing dynamic visual probabilities.
+         audio_df (pd.DataFrame): DataFrame containing audio probabilities.
+         name_video (str): Name of the video.
+         weights_1 (list[float]): List of weights for the Dirichlet-based fusion.
+         frame_indices (list[int]): Indices of the frames used as x-axis ticks in the prediction plot.
+
+     Returns:
+         tuple[pd.DataFrame, Figure]: DataFrame with per-model compound expression predictions
+             and the figure visualizing them.
+     """
+
+     stat_df["image_location"] = [
+         f"{name_video}/{str(f+1).zfill(5)}.jpg" for f in stat_df.index
+     ]
+     dyn_df["image_location"] = [
+         f"{name_video}/{str(f+1).zfill(5)}.jpg" for f in dyn_df.index
+     ]
+
+     image_location = dyn_df.image_location.tolist()
+
+     stat_df = stat_df[stat_df.image_location.isin(image_location)][NAME_EMO_AUDIO[:-1]].values
+     dyn_df = softmax(
+         dyn_df[dyn_df.image_location.isin(image_location)][NAME_EMO_AUDIO[:-1]].values
+     )
+
+     audio_df = audio_df.groupby(["frames"]).mean().reset_index()
+     audio_df = audio_df.rename(columns={"frames": "image_location"})
+     audio_df["image_location"] = [
+         get_image_location(name_video, i) for i in audio_df.image_location
+     ]
+     audio_df = softmax(
+         audio_df[audio_df.image_location.isin(image_location)][NAME_EMO_AUDIO[:-1]].values
+     )
+
+     if len(image_location) > len(audio_df):
+         last_pred_audio = audio_df[-1]
+         audio_df = np.vstack(
+             (audio_df, [last_pred_audio] * (len(image_location) - len(audio_df)))
+         )
+
+     predictions = [stat_df, dyn_df, audio_df]
+     num_predictions = len(predictions)
+
+     if weights_1:
+         final_predictions = predictions[0] * weights_1[0]
+         for i in range(1, num_predictions):
+             final_predictions += predictions[i] * weights_1[i]
+     else:
+         final_predictions = np.sum(predictions, axis=0) / num_predictions
+
+     av_prob = np.argmax(get_compound_expression(
+         final_predictions, DICT_CE,
+     ), axis=1)
+
+     vs_prob = get_compound_expression(predictions[0], DICT_CE)
+     vd_prob = get_compound_expression(predictions[1], DICT_CE)
+     a_prob = get_compound_expression(predictions[2], DICT_CE)
+
+     av_pred = get_mix_pred(final_predictions, av_prob)
+     vs_pred = get_mix_pred(predictions[0], np.argmax(vs_prob, axis=1))
+     vd_pred = get_mix_pred(predictions[1], np.argmax(vd_prob, axis=1))
+     a_pred = get_mix_pred(predictions[2], np.argmax(a_prob, axis=1))
+
+     dict_pred_final = {
+         'Audio-visual fusion': av_pred,
+         'Static visual model': vs_pred,
+         'Dynamic visual model': vd_pred,
+         'Audio model': a_pred,
+     }
+
+     plt = plot_compound_expression_prediction(
+         dict_preds=dict_pred_final,
+         save_path=None,
+         frame_indices=frame_indices,
+         title="Basic emotion and compound expression predictions",
+     )
+
+     df = pd.DataFrame(dict_pred_final)
+
+     return df, plt
+
+ def get_evenly_spaced_frame_indices(total_frames, num_frames=10):
+     if total_frames <= num_frames:
+         return list(range(total_frames))
+
+     step = total_frames / num_frames
+     return [int(np.round(i * step)) for i in range(num_frames)]
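
The pipeline above fuses the three probability streams with `weights_1` (or a plain average), converts basic-emotion pairs into compound-expression scores via `DICT_CE`, and, per `get_mix_pred`, keeps a basic emotion only when its probability exceeds `config_data.CONFIDENCE_BE` (0.7 in config.toml), otherwise falling back to the compound class index shifted by 6. A small self-contained sketch with made-up probabilities and a toy two-entry compound dictionary (the real NAME_EMO_AUDIO and DICT_CE live in app/config.py, which is not shown here):

    import numpy as np

    CONFIDENCE_BE = 0.7  # mirrors config.toml
    TOY_DICT_CE = {"Fearfully Surprised": [2, 5], "Sadly Angry": [4, 0]}  # toy index pairs

    def fuse(preds, weights):
        # Weighted sum of per-model probability matrices (frames x basic emotions).
        return sum(p * w for p, w in zip(preds, weights))

    def compound_scores(pred, com_emo):
        # Same rule as get_compound_expression: sum the two constituent emotions.
        pred = np.asarray(pred)
        prob = np.zeros((len(pred), len(com_emo)))
        for idx, (_, (i, j)) in enumerate(com_emo.items()):
            prob[:, idx] = pred[:, i] + pred[:, j]
        return prob

    def mix_rule(emo_pred, ce_idx):
        # Same rule as get_mix_pred: a confident basic emotion wins, otherwise compound + 6.
        return [int(np.argmax(p)) if np.max(p) > CONFIDENCE_BE else int(ce_idx[k]) + 6
                for k, p in enumerate(emo_pred)]

    stat = np.array([[0.1, 0.0, 0.6, 0.0, 0.1, 0.2, 0.0]])  # static visual model
    dyn = np.array([[0.0, 0.1, 0.5, 0.0, 0.1, 0.3, 0.0]])   # dynamic visual model
    aud = np.array([[0.2, 0.0, 0.4, 0.0, 0.1, 0.3, 0.0]])   # audio model

    fused = fuse([stat, dyn, aud], [0.5, 0.3, 0.2])
    ce = np.argmax(compound_scores(fused, TOY_DICT_CE), axis=1)
    print(mix_rule(fused, ce))  # [6]: max basic probability 0.53 < 0.7, so the first compound class wins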
config.toml ADDED
@@ -0,0 +1,16 @@
+ APP_VERSION = "0.0.0"
+ FRAME_DOWNSAMPLING = 5
+ AUDIO_STEP = 0.5
+ CONFIDENCE_BE = 0.7
+
+ [model_static]
+ url = "https://huggingface.co/ElenaRyumina/face_emotion_recognition/resolve/main/FER_static_ResNet50_AffectNet.pt"
+ path = "FER_static_ResNet50_AffectNet.pt"
+
+ [model_dynamic]
+ url = "https://huggingface.co/ElenaRyumina/face_emotion_recognition/resolve/main/FER_dinamic_LSTM_Aff-Wild2.pt"
+ path = "FER_dinamic_LSTM_Aff-Wild2.pt"
+
+ [model_audio]
+ url = "https://drive.usercontent.google.com/download?id=11m53Kys3mdPALxbHQYc6kyww9QlQIkHA&export=download&authuser=0&confirm=t&uuid=ff23fbb0-5e4f-40b1-85bc-1cbbcdf0aeb7&at=APZUnTV5OentCsQjMpGGmIjKHBVP%3A1717752164159"
+ path = "audio_ExprModelV3.pth"
requirements.txt ADDED
@@ -0,0 +1,8 @@
+ gradio==4.24.0
+ requests==2.31.0
+ torch==2.1.2
+ torchaudio==2.1.2
+ torchvision==0.16.2
+ mediapipe==0.10.9
+ pillow==10.2.0
+ toml==0.10.