Spaces: Running on Zero
DmitryRyumin committed
Commit · f16bb9f · 1 Parent(s): c6923ed
Summary
- .flake8 +5 -0
- .gitignore +177 -0
- CODE_OF_CONDUCT.md +80 -0
- LICENSE +21 -0
- README.md +8 -5
- app.css +40 -0
- app.py +48 -0
- app/__init__.py +0 -0
- app/components.py +18 -0
- app/config.py +53 -0
- app/data_init.py +56 -0
- app/description.py +17 -0
- app/event_handlers/__init__.py +0 -0
- app/event_handlers/clear.py +31 -0
- app/event_handlers/event_handlers.py +61 -0
- app/event_handlers/submit.py +172 -0
- app/event_handlers/video.py +26 -0
- app/gpu_init.py +10 -0
- app/load_models.py +909 -0
- app/plots.py +115 -0
- app/requirements_app.py +37 -0
- app/tabs.py +154 -0
- app/utils.py +287 -0
- config.toml +66 -0
- images/clear.ico +0 -0
- images/submit.ico +0 -0
- requirements.txt +13 -0
.flake8
ADDED
@@ -0,0 +1,5 @@
; https://www.flake8rules.com/

[flake8]
max-line-length = 120
ignore = E203, E402, E741, W503
.gitignore
ADDED
@@ -0,0 +1,177 @@
# Compiled source #
###################
*.com
*.class
*.dll
*.exe
*.o
*.so
*.pyc

# Packages #
############
# it's better to unpack these files and commit the raw source
# git has its own built in compression methods
*.7z
*.dmg
*.gz
*.iso
*.rar
#*.tar
*.zip

# Logs and databases #
######################
*.log
*.sqlite

# OS generated files #
######################
.DS_Store
ehthumbs.db
Icon
Thumbs.db
.tmtags
.idea
.vscode
tags
vendor.tags
tmtagsHistory
*.sublime-project
*.sublime-workspace
.bundle

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
node_modules/

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
.python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# Custom
.gradio/
data/
models/
fonts/
notebooks/
weights/
project_structure.txt
CODE_OF_CONDUCT.md
ADDED
@@ -0,0 +1,80 @@
# Code of Conduct

## Our Pledge

In the interest of fostering an open and welcoming environment, we as
contributors and maintainers pledge to make participation in our project and
our community a harassment-free experience for everyone, regardless of age, body
size, disability, ethnicity, sex characteristics, gender identity and expression,
level of experience, education, socio-economic status, nationality, personal
appearance, race, religion, or sexual identity and orientation.

## Our Standards

Examples of behavior that contributes to creating a positive environment
include:

* Using welcoming and inclusive language
* Being respectful of differing viewpoints and experiences
* Gracefully accepting constructive criticism
* Focusing on what is best for the community
* Showing empathy towards other community members

Examples of unacceptable behavior by participants include:

* The use of sexualized language or imagery and unwelcome sexual attention or
  advances
* Trolling, insulting/derogatory comments, and personal or political attacks
* Public or private harassment
* Publishing others' private information, such as a physical or electronic
  address, without explicit permission
* Other conduct which could reasonably be considered inappropriate in a
  professional setting

## Our Responsibilities

Project maintainers are responsible for clarifying the standards of acceptable
behavior and are expected to take appropriate and fair corrective action in
response to any instances of unacceptable behavior.

Project maintainers have the right and responsibility to remove, edit, or
reject comments, commits, code, wiki edits, issues, and other contributions
that are not aligned to this Code of Conduct, or to ban temporarily or
permanently any contributor for other behaviors that they deem inappropriate,
threatening, offensive, or harmful.

## Scope

This Code of Conduct applies within all project spaces, and it also applies when
an individual is representing the project or its community in public spaces.
Examples of representing a project or community include using an official
project e-mail address, posting via an official social media account, or acting
as an appointed representative at an online or offline event. Representation of
a project may be further defined and clarified by project maintainers.

This Code of Conduct also applies outside the project spaces when there is a
reasonable belief that an individual's behavior may have a negative impact on
the project or its community.

## Enforcement

Instances of abusive, harassing, or otherwise unacceptable behavior may be
reported by contacting the project team at <[email protected]>. All
complaints will be reviewed and investigated and will result in a response that
is deemed necessary and appropriate to the circumstances. The project team is
obligated to maintain confidentiality with regard to the reporter of an incident.
Further details of specific enforcement policies may be posted separately.

Project maintainers who do not follow or enforce the Code of Conduct in good
faith may face temporary or permanent repercussions as determined by other
members of the project's leadership.

## Attribution

This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4,
available at <https://www.contributor-covenant.org/version/1/4/code-of-conduct.html>

[homepage]: https://www.contributor-covenant.org

For answers to common questions about this code of conduct, see
<https://www.contributor-covenant.org/faq>
LICENSE
ADDED
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2024 HSE

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
README.md
CHANGED
@@ -1,14 +1,17 @@
 ---
 title: MASAI
-emoji:
-colorFrom:
-colorTo:
+emoji: π
+colorFrom: gray
+colorTo: red
 sdk: gradio
+python_version: 3.12
 sdk_version: 5.4.0
 app_file: app.py
-
+app_port: 7860
+header: default
+pinned: true
 license: mit
 short_description: Intelligent system for Multimodal Affective States Analysis
 ---
 
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+Check out the configuration reference at <https://huggingface.co/docs/hub/spaces-config-reference>
app.css
ADDED
@@ -0,0 +1,40 @@
.noti_err {
    color: var(--color-accent);
}
.noti_true {
    color: #006900;
}

div.app-flex-container {
    display: flex;
    align-items: left;
    gap: 6px;
}

button.submit {
    display: flex;
    border: var(--button-border-width) solid var(--button-primary-border-color);
    background: var(--button-primary-background-fill);
    color: var(--button-primary-text-color);
    border-radius: 8px;
    transition: all 0.3s ease;
}

button.submit[disabled],
button.clear[disabled] {
    cursor: not-allowed;
    opacity: 0.6;
}

button.submit:hover:not([disabled]) {
    border-color: var(--button-primary-border-color-hover);
    background: var(--button-primary-background-fill-hover);
    color: var(--button-primary-text-color-hover);
}

div.audio:hover label[data-testid="block-label"],
div.imgs:hover label[data-testid="block-label"],
div.emo-stats:hover label[data-testid="block-label"],
div.sent-stats:hover label[data-testid="block-label"] {
    display: none;
}
app.py
ADDED
@@ -0,0 +1,48 @@
"""
File: app.py
Author: Dmitry Ryumin, Maxim Markitantov, Elena Ryumina, Anastasia Dvoynikova, and Alexey Karpov
Description: Main application file.
The file defines the Gradio interface, sets up the main blocks and tabs,
and includes event handlers for various components.
License: MIT License
"""

import gradio as gr

# Importing necessary components for the Gradio app
from app.config import CONFIG_NAME, config_data, load_tab_creators
from app.event_handlers.event_handlers import setup_app_event_handlers
import app.tabs


gr.set_static_paths(paths=[config_data.Path_APP / config_data.StaticPaths_IMAGES])


def create_gradio_app() -> gr.Blocks:
    with gr.Blocks(
        theme=gr.themes.Default(), css_paths=config_data.AppSettings_CSS_PATH
    ) as gradio_app:
        tab_results = {}

        available_functions = {
            attr: getattr(app.tabs, attr)
            for attr in dir(app.tabs)
            if callable(getattr(app.tabs, attr)) and attr.endswith("_tab")
        }

        tab_creators = load_tab_creators(CONFIG_NAME, available_functions)

        for tab_name, create_tab_function in tab_creators.items():
            with gr.Tab(tab_name):
                app_instance = create_tab_function()
                tab_results[tab_name] = app_instance

        keys = list(tab_results.keys())

        setup_app_event_handlers(*(tab_results[keys[0]]))

    return gradio_app


if __name__ == "__main__":
    create_gradio_app().queue(api_open=False).launch(share=False)
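A minimal sketch of the tab-discovery step in app.py: any callable in app.tabs whose name ends in "_tab" is collected into available_functions. The real builders live in app/tabs.py (added in this commit but not shown in this hunk); demo_tab below is a hypothetical stand-in used only to illustrate the mechanism.

import types

# Stand-in for the app.tabs module added elsewhere in this commit.
tabs_stub = types.ModuleType("tabs_stub")

def demo_tab():
    # A real *_tab builder would create and return Gradio components.
    return ("video", "clear", "submit")

tabs_stub.demo_tab = demo_tab

available_functions = {
    attr: getattr(tabs_stub, attr)
    for attr in dir(tabs_stub)
    if callable(getattr(tabs_stub, attr)) and attr.endswith("_tab")
}

print(available_functions)  # {'demo_tab': <function demo_tab at ...>}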
app/__init__.py
ADDED
File without changes
app/components.py
ADDED
@@ -0,0 +1,18 @@
"""
File: components.py
Author: Dmitry Ryumin, Maxim Markitantov, Elena Ryumina, Anastasia Dvoynikova, and Alexey Karpov
Description: Utility functions for creating Gradio components.
License: MIT License
"""

import gradio as gr

# Importing necessary components for the Gradio app


def html_message(
    message: str = "", error: bool = True, visible: bool = True
) -> gr.HTML:
    css_class = "noti_err" if error else "noti_true"

    return gr.HTML(value=f"<h3 class='{css_class}'>{message}</h3>", visible=visible)
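A usage sketch for html_message (assumes gradio is installed; the noti_err and noti_true classes it emits are styled in app.css above, and the message strings are illustrative):

from app.components import html_message

ok_note = html_message(message="Results are ready", error=False)   # rendered with .noti_true
err_note = html_message(message="Video not uploaded", error=True)  # rendered with .noti_err
hidden = html_message(visible=False)                               # created but not displayed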
app/config.py
ADDED
@@ -0,0 +1,53 @@
"""
File: config.py
Author: Dmitry Ryumin, Maxim Markitantov, Elena Ryumina, Anastasia Dvoynikova, and Alexey Karpov
Description: Configuration module for handling settings.
License: MIT License
"""

import tomllib
from pathlib import Path
from collections.abc import Callable
from types import SimpleNamespace

CONFIG_NAME = "config.toml"


def flatten_dict(prefix: str, d: dict) -> dict:
    result = {}

    for k, v in d.items():
        result.update(
            flatten_dict(f"{prefix}{k}_", v)
            if isinstance(v, dict)
            else {f"{prefix}{k}": v}
        )

    return result


def load_tab_creators(
    file_path: str, available_functions: dict[str, Callable]
) -> dict[str, Callable]:
    with open(file_path, "rb") as f:
        config = tomllib.load(f)

    tab_creators_data = config.get("TabCreators", {})

    return {key: available_functions[value] for key, value in tab_creators_data.items()}


def load_config(file_path: str) -> SimpleNamespace:
    with open(file_path, "rb") as f:
        config = tomllib.load(f)

    config_data = flatten_dict("", config)

    config_namespace = SimpleNamespace(**config_data)

    setattr(config_namespace, "Path_APP", Path(__file__).parent.parent.resolve())

    return config_namespace


config_data = load_config(CONFIG_NAME)
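A short sketch of what flatten_dict does to a nested TOML structure: section names become attribute prefixes, which is where flat names such as config_data.General_SR used elsewhere in this commit come from. The sample keys and values below are illustrative; the real ones live in config.toml.

from types import SimpleNamespace

from app.config import flatten_dict

nested = {
    "General": {"SR": 16000, "WIN_MAX_LENGTH": 4},
    "StaticPaths": {"WEIGHTS": "weights/"},
}

flat = flatten_dict("", nested)
# {'General_SR': 16000, 'General_WIN_MAX_LENGTH': 4, 'StaticPaths_WEIGHTS': 'weights/'}

cfg = SimpleNamespace(**flat)
print(cfg.General_SR, cfg.StaticPaths_WEIGHTS)  # 16000 weights/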
app/data_init.py
ADDED
@@ -0,0 +1,56 @@
"""
File: data_init.py
Author: Dmitry Ryumin, Maxim Markitantov, Elena Ryumina, Anastasia Dvoynikova, and Alexey Karpov
Description: Initial data loading.
License: MIT License
"""

import torch

# Importing necessary components for the Gradio app
from app.config import config_data
from app.gpu_init import device
from app.load_models import (
    AudioFeatureExtractor,
    VideoModelLoader,
    TextFeatureExtractor,
)
from app.utils import ASRModel

vad_model, vad_utils = torch.hub.load(
    repo_or_dir=config_data.StaticPaths_VAD_MODEL,
    model="silero_vad",
    force_reload=False,
    onnx=False,
)

get_speech_timestamps, _, read_audio, _, _ = vad_utils

audio_model = AudioFeatureExtractor(
    checkpoint_url=config_data.StaticPaths_HF_MODELS
    + config_data.StaticPaths_EMO_SENT_AUDIO_WEIGHTS,
    folder_path=config_data.StaticPaths_WEIGHTS,
    device=device,
    with_features=False,
)

video_model = VideoModelLoader(
    face_checkpoint_url=config_data.StaticPaths_HF_MODELS
    + config_data.StaticPaths_YOLOV8N_FACE,
    emotion_checkpoint_url=config_data.StaticPaths_HF_MODELS
    + config_data.StaticPaths_EMO_AFFECTNET_WEIGHTS,
    emo_sent_checkpoint_url=config_data.StaticPaths_HF_MODELS
    + config_data.StaticPaths_EMO_SENT_VIDEO_WEIGHTS,
    folder_path=config_data.StaticPaths_WEIGHTS,
    device=device,
)

text_model = TextFeatureExtractor(
    checkpoint_url=config_data.StaticPaths_HF_MODELS
    + config_data.StaticPaths_EMO_SENT_TEXT_WEIGHTS,
    folder_path=config_data.StaticPaths_WEIGHTS,
    device=device,
    with_features=False,
)

asr = ASRModel(checkpoint_path=config_data.StaticPaths_OPENAI_WHISPER, device=device)
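A rough usage sketch for the Silero VAD helpers unpacked above, continuing from the definitions in data_init.py. "sample.wav" is a placeholder path, and the keyword names follow the published silero-vad utilities; the returned segments are dicts of sample offsets, which is the shape readetect_speech relies on.

wav = read_audio("sample.wav", sampling_rate=config_data.General_SR)
speech_segments = get_speech_timestamps(wav, vad_model, sampling_rate=config_data.General_SR)
print(speech_segments[:2])  # e.g. [{'start': 4640, 'end': 32800}, ...]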
app/description.py
ADDED
@@ -0,0 +1,17 @@
"""
File: description.py
Author: Dmitry Ryumin, Maxim Markitantov, Elena Ryumina, Anastasia Dvoynikova, and Alexey Karpov
Description: Project description for the Gradio app.
License: MIT License
"""

# Importing necessary components for the Gradio app
from app.config import config_data

DESCRIPTION = f"""\
# Intelligent system for Multimodal Affective States Analysis (MASAI)

<div class="app-flex-container">
<img src="https://img.shields.io/badge/version-v{config_data.AppSettings_APP_VERSION}-stable" alt="Version">
</div>
"""
app/event_handlers/__init__.py
ADDED
File without changes
app/event_handlers/clear.py
ADDED
@@ -0,0 +1,31 @@
"""
File: clear.py
Author: Dmitry Ryumin, Maxim Markitantov, Elena Ryumina, Anastasia Dvoynikova, and Alexey Karpov
Description: Event handler for Gradio app to clear.
License: MIT License
"""

import gradio as gr

# Importing necessary components for the Gradio app
from app.config import config_data
from app.components import html_message


def event_handler_clear() -> (
    tuple[gr.Video, gr.Button, gr.Button, gr.HTML, gr.Plot, gr.Plot, gr.Plot, gr.Plot]
):
    return (
        gr.Video(value=None),
        gr.Button(interactive=False),
        gr.Button(interactive=False),
        html_message(
            message=config_data.InformationMessages_NOTI_RESULTS[0],
            error=True,
            visible=True,
        ),
        gr.Plot(value=None, visible=False),
        gr.Plot(value=None, visible=False),
        gr.Plot(value=None, visible=False),
        gr.Plot(value=None, visible=False),
    )
app/event_handlers/event_handlers.py
ADDED
@@ -0,0 +1,61 @@
"""
File: event_handlers.py
Author: Dmitry Ryumin, Maxim Markitantov, Elena Ryumina, Anastasia Dvoynikova, and Alexey Karpov
Description: File containing functions for configuring event handlers for Gradio components.
License: MIT License
"""

import gradio as gr

# Importing necessary components for the Gradio app
from app.event_handlers.video import event_handler_video
from app.event_handlers.submit import event_handler_submit
from app.event_handlers.clear import event_handler_clear


def setup_app_event_handlers(
    video,
    clear,
    submit,
    noti_results,
    waveform,
    faces,
    emotion_stats,
    sent_stats,
):
    gr.on(
        triggers=[video.change, video.upload, video.stop_recording, video.clear],
        fn=event_handler_video,
        inputs=[video],
        outputs=[clear, submit, noti_results],
        queue=True,
    )

    submit.click(
        fn=event_handler_submit,
        inputs=[video],
        outputs=[
            noti_results,
            waveform,
            faces,
            emotion_stats,
            sent_stats,
        ],
        queue=True,
    )

    clear.click(
        fn=event_handler_clear,
        inputs=[],
        outputs=[
            video,
            clear,
            submit,
            noti_results,
            waveform,
            faces,
            emotion_stats,
            sent_stats,
        ],
        queue=True,
    )
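A small consistency check implied by the wiring above: each handler must return exactly as many values as the components listed in its outputs. A sketch, assuming the package imports cleanly with config.toml present:

from app.event_handlers.clear import event_handler_clear

# clear.click lists 8 outputs, and event_handler_clear returns 8 components.
assert len(event_handler_clear()) == 8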
app/event_handlers/submit.py
ADDED
@@ -0,0 +1,172 @@
"""
File: submit.py
Author: Dmitry Ryumin, Maxim Markitantov, Elena Ryumina, Anastasia Dvoynikova, and Alexey Karpov
Description: Event handler for Gradio app to submit.
License: MIT License
"""

import torch
import pandas as pd
import cv2
import gradio as gr

# Importing necessary components for the Gradio app
from app.config import config_data
from app.utils import (
    convert_video_to_audio,
    readetect_speech,
    slice_audio,
    find_intersections,
    calculate_mode,
    find_nearest_frames,
)
from app.plots import (
    get_evenly_spaced_frame_indices,
    plot_audio,
    display_frame_info,
    plot_images,
    plot_predictions,
)
from app.data_init import (
    read_audio,
    get_speech_timestamps,
    vad_model,
    video_model,
    asr,
    audio_model,
    text_model,
)
from app.load_models import VideoFeatureExtractor
from app.components import html_message


def event_handler_submit(
    video: str,
) -> tuple[gr.HTML, gr.Plot, gr.Plot, gr.Plot, gr.Plot]:
    audio_file_path = convert_video_to_audio(file_path=video, sr=config_data.General_SR)
    wav, vad_info = readetect_speech(
        file_path=audio_file_path,
        read_audio=read_audio,
        get_speech_timestamps=get_speech_timestamps,
        vad_model=vad_model,
        sr=config_data.General_SR,
    )

    audio_windows = slice_audio(
        start_time=config_data.General_START_TIME,
        end_time=int(len(wav)),
        win_max_length=int(config_data.General_WIN_MAX_LENGTH * config_data.General_SR),
        win_shift=int(config_data.General_WIN_SHIFT * config_data.General_SR),
        win_min_length=int(config_data.General_WIN_MIN_LENGTH * config_data.General_SR),
    )

    intersections = find_intersections(
        x=audio_windows,
        y=vad_info,
        min_length=config_data.General_WIN_MIN_LENGTH * config_data.General_SR,
    )

    vfe = VideoFeatureExtractor(video_model, file_path=video, with_features=False)
    vfe.preprocess_video()

    transcriptions, total_text = asr(wav, audio_windows)

    window_frames = []
    preds_emo = []
    preds_sen = []
    for w_idx, window in enumerate(audio_windows):
        a_w = intersections[w_idx]
        if not a_w["speech"]:
            a_pred = None
        else:
            wave = wav[a_w["start"] : a_w["end"]].clone()
            a_pred, _ = audio_model(wave)

        v_pred, _ = vfe(window, config_data.General_WIN_MAX_LENGTH)

        t_pred, _ = text_model(transcriptions[w_idx][0])

        if a_pred:
            pred_emo = (a_pred["emo"] + v_pred["emo"] + t_pred["emo"]) / 3
            pred_sen = (a_pred["sen"] + v_pred["sen"] + t_pred["sen"]) / 3
        else:
            pred_emo = (v_pred["emo"] + t_pred["emo"]) / 2
            pred_sen = (v_pred["sen"] + t_pred["sen"]) / 2

        frames = list(
            range(
                int(window["start"] * vfe.fps / config_data.General_SR) + 1,
                int(window["end"] * vfe.fps / config_data.General_SR) + 2,
            )
        )
        preds_emo.extend([torch.argmax(pred_emo).numpy()] * len(frames))
        preds_sen.extend([torch.argmax(pred_sen).numpy()] * len(frames))
        window_frames.extend(frames)

    if max(window_frames) < vfe.frame_number:
        missed_frames = list(range(max(window_frames) + 1, vfe.frame_number + 1))
        window_frames.extend(missed_frames)
        preds_emo.extend([preds_emo[-1]] * len(missed_frames))
        preds_sen.extend([preds_sen[-1]] * len(missed_frames))

    df_pred = pd.DataFrame(columns=["frames", "pred_emo", "pred_sent"])
    df_pred["frames"] = window_frames
    df_pred["pred_emo"] = preds_emo
    df_pred["pred_sent"] = preds_sen

    df_pred = df_pred.groupby("frames").agg(
        {
            "pred_emo": calculate_mode,
            "pred_sent": calculate_mode,
        }
    )

    frame_indices = get_evenly_spaced_frame_indices(vfe.frame_number, 9)
    num_frames = len(wav)
    time_axis = [i / config_data.General_SR for i in range(num_frames)]
    plt_audio = plot_audio(time_axis, wav.unsqueeze(0), frame_indices, vfe.fps, (12, 2))

    all_idx_faces = list(vfe.faces[1].keys())
    need_idx_faces = find_nearest_frames(frame_indices, all_idx_faces)
    faces = []
    for idx_frame, idx_faces in zip(frame_indices, need_idx_faces):
        cur_face = cv2.resize(
            vfe.faces[1][idx_faces], (224, 224), interpolation=cv2.INTER_AREA
        )
        faces.append(
            display_frame_info(
                cur_face, "Frame: {}".format(idx_frame + 1), box_scale=0.3
            )
        )
    plt_faces = plot_images(faces)

    plt_emo = plot_predictions(
        df_pred,
        "pred_emo",
        "Emotion",
        list(config_data.General_DICT_EMO),
        (12, 2.5),
        [i + 1 for i in frame_indices],
        2,
    )
    plt_sent = plot_predictions(
        df_pred,
        "pred_sent",
        "Sentiment",
        list(config_data.General_DICT_SENT),
        (12, 1.5),
        [i + 1 for i in frame_indices],
        2,
    )

    return (
        html_message(
            message=config_data.InformationMessages_NOTI_RESULTS[1],
            error=False,
            visible=False,
        ),
        gr.Plot(value=plt_audio, visible=True),
        gr.Plot(value=plt_faces, visible=True),
        gr.Plot(value=plt_emo, visible=True),
        gr.Plot(value=plt_sent, visible=True),
    )
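A worked example of the window-to-frame mapping used in the loop above, with assumed values of 25 fps video and 16 kHz audio (the real values come from vfe.fps and config.toml):

SR, FPS = 16000, 25
window = {"start": 0, "end": 4 * SR}  # a 4-second audio window, in samples

frames = list(
    range(
        int(window["start"] * FPS / SR) + 1,
        int(window["end"] * FPS / SR) + 2,
    )
)

print(frames[0], frames[-1], len(frames))  # 1 101 101 -> 1-indexed, end-inclusive frame span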
app/event_handlers/video.py
ADDED
@@ -0,0 +1,26 @@
"""
File: video.py
Author: Dmitry Ryumin, Maxim Markitantov, Elena Ryumina, Anastasia Dvoynikova, and Alexey Karpov
Description: Event handler for Gradio app to video.
License: MIT License
"""

import gradio as gr

# Importing necessary components for the Gradio app
from app.config import config_data
from app.components import html_message


def event_handler_video(video: str) -> gr.HTML:
    is_video_valid = bool(video)

    return (
        gr.Button(interactive=is_video_valid),
        gr.Button(interactive=is_video_valid),
        html_message(
            message=config_data.InformationMessages_NOTI_RESULTS[int(is_video_valid)],
            error=not is_video_valid,
            visible=True,
        ),
    )
app/gpu_init.py
ADDED
@@ -0,0 +1,10 @@
"""
File: gpu_init.py
Author: Dmitry Ryumin, Maxim Markitantov, Elena Ryumina, Anastasia Dvoynikova, and Alexey Karpov
Description: GPU initialization.
License: MIT License
"""

import torch

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
app/load_models.py
ADDED
@@ -0,0 +1,909 @@
"""
File: load_models.py
Author: Dmitry Ryumin, Maxim Markitantov, Elena Ryumina, Anastasia Dvoynikova, and Alexey Karpov
Description: Load pretrained models.
License: MIT License
"""

import math
import numpy as np
import cv2

import torch.nn.functional as F
import torch.nn as nn
import torch
from typing import Optional
from PIL import Image
from ultralytics import YOLO
from transformers.models.wav2vec2.modeling_wav2vec2 import (
    Wav2Vec2Model,
    Wav2Vec2PreTrainedModel,
)

from transformers import AutoConfig, Wav2Vec2Processor, AutoTokenizer, AutoModel

from app.utils import pth_processing, get_idx_frames_in_windows

# Importing necessary components for the Gradio app
from app.utils import load_model


class ScaledDotProductAttention_MultiHead(nn.Module):
    def __init__(self):
        super(ScaledDotProductAttention_MultiHead, self).__init__()
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, query, key, value, mask=None):
        if mask is not None:
            raise ValueError("Mask is not supported yet")

        # key, query, value shapes: [batch_size, num_heads, seq_len, dim]
        emb_dim = key.shape[-1]

        # Calculate attention weights
        attention_weights = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(
            emb_dim
        )

        # masking
        if mask is not None:
            raise ValueError("Mask is not supported yet")

        # Softmax
        attention_weights = self.softmax(attention_weights)

        # modify value
        value = torch.matmul(attention_weights, value)
        return value, attention_weights


class PositionWiseFeedForward(nn.Module):
    def __init__(self, input_dim, hidden_dim, dropout: float = 0.1):
        super().__init__()
        self.layer_1 = nn.Linear(input_dim, hidden_dim)
        self.layer_2 = nn.Linear(hidden_dim, input_dim)
        self.layer_norm = nn.LayerNorm(input_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        # feed-forward network
        x = self.layer_1(x)
        x = self.dropout(x)
        x = F.relu(x)
        x = self.layer_2(x)
        return x


class Add_and_Norm(nn.Module):
    def __init__(self, input_dim, dropout: Optional[float] = 0.1):
        super().__init__()
        self.layer_norm = nn.LayerNorm(input_dim)
        if dropout is not None:
            self.dropout = nn.Dropout(dropout)

    def forward(self, x1, residual):
        x = x1
        # apply dropout of needed
        if hasattr(self, "dropout"):
            x = self.dropout(x)
        # add and then norm
        x = x + residual
        x = self.layer_norm(x)
        return x


class MultiHeadAttention(nn.Module):
    def __init__(self, input_dim, num_heads, dropout: Optional[float] = 0.1):
        super().__init__()
        self.input_dim = input_dim
        self.num_heads = num_heads
        if input_dim % num_heads != 0:
            raise ValueError("input_dim must be divisible by num_heads")
        self.head_dim = input_dim // num_heads
        self.dropout = dropout

        # initialize weights
        self.query_w = nn.Linear(input_dim, self.num_heads * self.head_dim, bias=False)
        self.keys_w = nn.Linear(input_dim, self.num_heads * self.head_dim, bias=False)
        self.values_w = nn.Linear(input_dim, self.num_heads * self.head_dim, bias=False)
        self.ff_layer_after_concat = nn.Linear(
            self.num_heads * self.head_dim, input_dim, bias=False
        )

        self.attention = ScaledDotProductAttention_MultiHead()

        if self.dropout is not None:
            self.dropout = nn.Dropout(dropout)

    def forward(self, queries, keys, values, mask=None):
        # query, keys, values shapes: [batch_size, seq_len, input_dim]
        batch_size, len_query, len_keys, len_values = (
            queries.size(0),
            queries.size(1),
            keys.size(1),
            values.size(1),
        )

        # linear transformation before attention
        queries = (
            self.query_w(queries)
            .view(batch_size, len_query, self.num_heads, self.head_dim)
            .transpose(1, 2)
        )  # [batch_size, num_heads, seq_len, dim]
        keys = (
            self.keys_w(keys)
            .view(batch_size, len_keys, self.num_heads, self.head_dim)
            .transpose(1, 2)
        )  # [batch_size, num_heads, seq_len, dim]
        values = (
            self.values_w(values)
            .view(batch_size, len_values, self.num_heads, self.head_dim)
            .transpose(1, 2)
        )  # [batch_size, num_heads, seq_len, dim]

        # attention itself
        values, attention_weights = self.attention(
            queries, keys, values, mask=mask
        )  # values shape:[batch_size, num_heads, seq_len, dim]

        # concatenation
        out = (
            values.transpose(1, 2)
            .contiguous()
            .view(batch_size, len_values, self.num_heads * self.head_dim)
        )  # [batch_size, seq_len, num_heads * dim = input_dim]
        # go through last linear layer
        out = self.ff_layer_after_concat(out)
        return out


class PositionalEncoding(nn.Module):
    def __init__(self, d_model: int, dropout: float = 0.1, max_len: int = 5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model)
        )
        pe = torch.zeros(max_len, 1, d_model)
        pe[:, 0, 0::2] = torch.sin(position * div_term)
        pe[:, 0, 1::2] = torch.cos(position * div_term)
        pe = pe.permute(
            1, 0, 2
        )  # [seq_len, batch_size, embedding_dim] -> [batch_size, seq_len, embedding_dim]
        self.register_buffer("pe", pe)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Args:
            x: Tensor, shape [batch_size, seq_len, embedding_dim]
        """
        x = x + self.pe[:, : x.size(1)]
        return self.dropout(x)


class TransformerLayer(nn.Module):
    def __init__(
        self,
        input_dim,
        num_heads,
        dropout: Optional[float] = 0.1,
        positional_encoding: bool = True,
    ):
        super(TransformerLayer, self).__init__()
        self.positional_encoding = positional_encoding
        self.input_dim = input_dim
        self.num_heads = num_heads
        self.head_dim = input_dim // num_heads
        self.dropout = dropout

        # initialize layers
        self.self_attention = MultiHeadAttention(input_dim, num_heads, dropout=dropout)
        self.feed_forward = PositionWiseFeedForward(
            input_dim, input_dim, dropout=dropout
        )
        self.add_norm_after_attention = Add_and_Norm(input_dim, dropout=dropout)
        self.add_norm_after_ff = Add_and_Norm(input_dim, dropout=dropout)

        # calculate positional encoding
        if self.positional_encoding:
            self.positional_encoding = PositionalEncoding(input_dim)

    def forward(self, key, value, query, mask=None):
        # key, value, and query shapes: [batch_size, seq_len, input_dim]
        # positional encoding
        if self.positional_encoding:
            key = self.positional_encoding(key)
            value = self.positional_encoding(value)
            query = self.positional_encoding(query)

        # multi-head attention
        residual = query
        x = self.self_attention(queries=query, keys=key, values=value, mask=mask)
        x = self.add_norm_after_attention(x, residual)

        # feed forward
        residual = x
        x = self.feed_forward(x)
        x = self.add_norm_after_ff(x, residual)

        return x


class SelfTransformer(nn.Module):
    def __init__(self, input_size: int = int(1024), num_heads=1, dropout=0.1):
        super(SelfTransformer, self).__init__()
        self.att = torch.nn.MultiheadAttention(
            input_size, num_heads, dropout, bias=True, batch_first=True
        )
        self.norm1 = nn.LayerNorm(input_size)
        self.fcl = nn.Linear(input_size, input_size)
        self.norm2 = nn.LayerNorm(input_size)

    def forward(self, video):
        represent, _ = self.att(video, video, video)
        represent_norm = self.norm1(video + represent)
        represent_fcl = self.fcl(represent_norm)
        represent = self.norm1(represent_norm + represent_fcl)
        return represent


class SmallClassificationHead(nn.Module):
    """ClassificationHead"""

    def __init__(self, input_size=256, out_emo=6, out_sen=3):
        super(SmallClassificationHead, self).__init__()
        self.fc_emo = nn.Linear(input_size, out_emo)
        self.fc_sen = nn.Linear(input_size, out_sen)

    def forward(self, x):
        x_emo = self.fc_emo(x)
        x_sen = self.fc_sen(x)
        return {"emo": x_emo, "sen": x_sen}


class AudioModelWT(Wav2Vec2PreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.config = config
        self.wav2vec2 = Wav2Vec2Model(config)

        self.f_size = 1024

        self.tl1 = TransformerLayer(
            input_dim=self.f_size, num_heads=4, dropout=0.1, positional_encoding=True
        )
        self.tl2 = TransformerLayer(
            input_dim=self.f_size, num_heads=4, dropout=0.1, positional_encoding=True
        )

        self.fc1 = nn.Linear(1024, 1)
        self.dp = nn.Dropout(p=0.5)

        self.selu = nn.SELU()
        self.relu = nn.ReLU()
        self.cl_head = SmallClassificationHead(
            input_size=199, out_emo=config.out_emo, out_sen=config.out_sen
        )

        self.init_weights()

        # freeze conv
        self.freeze_feature_encoder()

    def freeze_feature_encoder(self):
        for param in self.wav2vec2.feature_extractor.conv_layers.parameters():
            param.requires_grad = False

    def forward(self, x, with_features=False):
        outputs = self.wav2vec2(x)

        x = self.tl1(outputs[0], outputs[0], outputs[0])
        x = self.selu(x)

        features = self.tl2(x, x, x)
        x = self.selu(features)

        x = self.fc1(x)
        x = self.relu(x)
        x = self.dp(x)

        x = x.view(x.size(0), -1)

        if with_features:
            return self.cl_head(x), features
        else:
            return self.cl_head(x)


class AudioFeatureExtractor:
    def __init__(
        self,
        checkpoint_url: str,
        folder_path: str,
        device: torch.device,
        sr: int = 16000,
        win_max_length: int = 4,
        with_features: bool = True,
    ) -> None:
        """
        Args:
            sr (int, optional): Sample rate of audio. Defaults to 16000.
            win_max_length (int, optional): Max length of window. Defaults to 4.
            with_features (bool, optional): Extract features or not
        """
        self.device = device
        self.sr = sr
        self.win_max_length = win_max_length
        self.with_features = with_features

        checkpoint_path = load_model(checkpoint_url, folder_path)

        model_name = "audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim"
        model_config = AutoConfig.from_pretrained(model_name)

        model_config.out_emo = 7
        model_config.out_sen = 3
        model_config.context_length = 199

        self.processor = Wav2Vec2Processor.from_pretrained(model_name)

        self.model = AudioModelWT.from_pretrained(
            pretrained_model_name_or_path=model_name, config=model_config
        )

        checkpoint = torch.load(checkpoint_path, map_location=self.device)
        self.model.load_state_dict(checkpoint["model_state_dict"])
        self.model.to(self.device)

    def preprocess_wave(self, x: torch.Tensor) -> torch.Tensor:
        """Extracts features for wav2vec
        Apply padding to max length of audio

        Args:
            x (torch.Tensor): Input data

        Returns:
            np.ndarray: Preprocessed data
        """
        a_data = self.processor(
            x,
            sampling_rate=self.sr,
            return_tensors="pt",
            padding="max_length",
            max_length=self.sr * self.win_max_length,
        )
        return a_data["input_values"][0]

    def __call__(
        self, waveform: torch.Tensor
    ) -> tuple[dict[torch.Tensor], torch.Tensor]:
        """Extracts acoustic features
        Apply padding to max length of audio

        Args:
            wave (torch.Tensor): wave

        Returns:
            torch.Tensor: Extracted features
        """
        waveform = self.preprocess_wave(waveform).unsqueeze(0).to(self.device)

        with torch.no_grad():
            if self.with_features:
                preds, features = self.model(waveform, with_features=self.with_features)
            else:
                preds = self.model(waveform, with_features=self.with_features)

        predicts = {
            "emo": F.softmax(preds["emo"], dim=-1).detach().cpu().squeeze(),
            "sen": F.softmax(preds["sen"], dim=-1).detach().cpu().squeeze(),
        }

        return (
            (predicts, features.detach().cpu().squeeze())
            if self.with_features
            else (predicts, None)
        )


class Tmodel(nn.Module):
    def __init__(
        self,
        input_size: int = int(1024),
        activation=nn.SELU(),
        feature_size1=256,
        feature_size2=64,
        num_heads=1,
        num_layers=2,
        n_emo=7,
        n_sent=3,
    ):
        super(Tmodel, self).__init__()
        self.feature_text_dynamic = nn.ModuleList(
            [
                SelfTransformer(input_size=input_size, num_heads=num_heads)
                for i in range(num_layers)
            ]
        )
        self.fcl = nn.Linear(input_size, feature_size1)
        self.activation = activation
        self.feature_emo = nn.Linear(feature_size1, feature_size2)
        self.feature_sent = nn.Linear(feature_size1, feature_size2)
        self.fc_emo = nn.Linear(feature_size2, n_emo)
        self.fc_sent = nn.Linear(feature_size2, n_sent)

    def get_features(self, t):
        for i, l in enumerate(self.feature_text_dynamic):
            self.features = l(t)

    def forward(self, t):
        self.get_features(t)
        represent = self.activation(torch.mean(t, axis=1))
        represent = self.activation(self.fcl(represent))
        represent_emo = self.activation(self.feature_emo(represent))
        represent_sent = self.activation(self.feature_sent(represent))
        prob_emo = self.fc_emo(represent_emo)
        prob_sent = self.fc_sent(represent_sent)
        return prob_emo, prob_sent


class TextFeatureExtractor:
    def __init__(
        self,
        checkpoint_url: str,
        folder_path: str,
        device: torch.device,
        with_features: bool = True,
    ) -> None:

        self.device = device
        self.with_features = with_features

        model_name_bert = "julian-schelb/roberta-ner-multilingual"
        self.tokenizer = AutoTokenizer.from_pretrained(
            model_name_bert, add_prefix_space=True
        )
        self.model_bert = AutoModel.from_pretrained(model_name_bert)

        checkpoint_path = load_model(checkpoint_url, folder_path)

        self.model = Tmodel()
        self.model.load_state_dict(
            torch.load(checkpoint_path, map_location=self.device)
        )
        self.model.to(self.device)

    def preprocess_text(self, text: torch.Tensor) -> torch.Tensor:
        if text != "" and str(text) != "nan":
            inputs = self.tokenizer(
                text.lower(),
                padding="max_length",
                truncation="longest_first",
                return_tensors="pt",
                max_length=6,
            ).to(self.device)
            with torch.no_grad():
                self.model_bert = self.model_bert.to(self.device)
                outputs = (
                    self.model_bert(
                        input_ids=inputs["input_ids"],
                        attention_mask=inputs["attention_mask"],
                    )
                    .last_hidden_state.cpu()
                    .detach()
                )
        else:
            outputs = torch.zeros((1, 6, 1024))
        return outputs

    def __call__(self, text: torch.Tensor) -> tuple[dict[torch.Tensor], torch.Tensor]:
        text_features = self.preprocess_text(text)

        with torch.no_grad():
            if self.with_features:
                pred_emo, pred_sent = self.model(text_features.float().to(self.device))
                temporal_features = self.model.features
            else:
                pred_emo, pred_sent = self.model(text_features.float().to(self.device))

        predicts = {
            "emo": F.softmax(pred_emo, dim=-1).detach().cpu().squeeze(),
            "sen": F.softmax(pred_sent, dim=-1).detach().cpu().squeeze(),
        }

        return (
            (predicts, temporal_features.detach().cpu().squeeze())
            if self.with_features
            else (predicts, None)
        )


class Bottleneck(nn.Module):
    expansion = 4

    def __init__(self, in_channels, out_channels, i_downsample=None, stride=1):
        super(Bottleneck, self).__init__()

        self.conv1 = nn.Conv2d(
            in_channels,
            out_channels,
            kernel_size=1,
            stride=stride,
            padding=0,
            bias=False,
        )
        self.batch_norm1 = nn.BatchNorm2d(out_channels, eps=0.001, momentum=0.99)

        self.conv2 = nn.Conv2d(
            out_channels, out_channels, kernel_size=3, padding="same", bias=False
        )
        self.batch_norm2 = nn.BatchNorm2d(out_channels, eps=0.001, momentum=0.99)

        self.conv3 = nn.Conv2d(
            out_channels,
            out_channels * self.expansion,
            kernel_size=1,
            stride=1,
            padding=0,
            bias=False,
        )
        self.batch_norm3 = nn.BatchNorm2d(
            out_channels * self.expansion, eps=0.001, momentum=0.99
        )

        self.i_downsample = i_downsample
        self.stride = stride
        self.relu = nn.ReLU()

    def forward(self, x):
        identity = x.clone()
        x = self.relu(self.batch_norm1(self.conv1(x)))

        x = self.relu(self.batch_norm2(self.conv2(x)))

        x = self.conv3(x)
        x = self.batch_norm3(x)

        # downsample if needed
        if self.i_downsample is not None:
            identity = self.i_downsample(identity)
        # add identity
        x += identity
        x = self.relu(x)

        return x


class Conv2dSame(torch.nn.Conv2d):
    def calc_same_pad(self, i: int, k: int, s: int, d: int) -> int:
        return max((math.ceil(i / s) - 1) * s + (k - 1) * d + 1 - i, 0)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        ih, iw = x.size()[-2:]

        pad_h = self.calc_same_pad(
            i=ih, k=self.kernel_size[0], s=self.stride[0], d=self.dilation[0]
        )
        pad_w = self.calc_same_pad(
            i=iw, k=self.kernel_size[1], s=self.stride[1], d=self.dilation[1]
        )

        if pad_h > 0 or pad_w > 0:
            x = F.pad(
                x, [pad_w // 2, pad_w - pad_w // 2, pad_h // 2, pad_h - pad_h // 2]
            )
        return F.conv2d(
            x,
            self.weight,
            self.bias,
            self.stride,
            self.padding,
            self.dilation,
            self.groups,
        )


class ResNet(nn.Module):
    def __init__(self, ResBlock, layer_list, num_classes, num_channels=3):
        super(ResNet, self).__init__()
        self.in_channels = 64

        self.conv_layer_s2_same = Conv2dSame(
            num_channels, 64, 7, stride=2, groups=1, bias=False
        )
        self.batch_norm1 = nn.BatchNorm2d(64, eps=0.001, momentum=0.99)
        self.relu = nn.ReLU()
        self.max_pool = nn.MaxPool2d(kernel_size=3, stride=2)

        self.layer1 = self._make_layer(ResBlock, layer_list[0], planes=64, stride=1)
        self.layer2 = self._make_layer(ResBlock, layer_list[1], planes=128, stride=2)
        self.layer3 = self._make_layer(ResBlock, layer_list[2], planes=256, stride=2)
        self.layer4 = self._make_layer(ResBlock, layer_list[3], planes=512, stride=2)

        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc1 = nn.Linear(512 * ResBlock.expansion, 512)
        self.relu1 = nn.ReLU()
        self.fc2 = nn.Linear(512, num_classes)

    def extract_features_four(self, x):
        x = self.relu(self.batch_norm1(self.conv_layer_s2_same(x)))
        x = self.max_pool(x)
        # print(x.shape)
        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)
        return x

    def extract_features(self, x):
        x = self.extract_features_four(x)
        x = self.avgpool(x)
        x = x.reshape(x.shape[0], -1)
        x = self.fc1(x)
        return x

    def forward(self, x):
        x = self.extract_features(x)
        x = self.relu1(x)
        x = self.fc2(x)
        return x

    def _make_layer(self, ResBlock, blocks, planes, stride=1):
        ii_downsample = None
        layers = []

        if stride != 1 or self.in_channels != planes * ResBlock.expansion:
            ii_downsample = nn.Sequential(
                nn.Conv2d(
                    self.in_channels,
                    planes * ResBlock.expansion,
                    kernel_size=1,
                    stride=stride,
                    bias=False,
                    padding=0,
                ),
                nn.BatchNorm2d(planes * ResBlock.expansion, eps=0.001, momentum=0.99),
            )
|
670 |
+
layers.append(
|
671 |
+
ResBlock(
|
672 |
+
self.in_channels, planes, i_downsample=ii_downsample, stride=stride
|
673 |
+
)
|
674 |
+
)
|
675 |
+
self.in_channels = planes * ResBlock.expansion
|
676 |
+
|
677 |
+
for i in range(blocks - 1):
|
678 |
+
layers.append(ResBlock(self.in_channels, planes))
|
679 |
+
|
680 |
+
return nn.Sequential(*layers)
|
681 |
+
|
682 |
+
|
683 |
+
def ResNet50(num_classes, channels=3):
|
684 |
+
return ResNet(Bottleneck, [3, 4, 6, 3], num_classes, channels)
|
685 |
+
|
686 |
+
|
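The ResNet-50 above is used mainly as a frame-level feature extractor: extract_features pools the last stage and projects it to 512 dimensions, which is the per-frame vector size that Vmodel and the zero-feature fallback expect. A quick shape check using the classes defined above (illustrative only, random weights):

import torch

backbone = ResNet50(num_classes=7, channels=3).eval()
dummy_faces = torch.randn(2, 3, 224, 224)  # batch of two normalized face crops
with torch.no_grad():
    feats = backbone.extract_features(dummy_faces)  # (2, 512) face embeddings
    logits = backbone(dummy_faces)                  # (2, 7) static emotion logits
print(feats.shape, logits.shape)
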

class Vmodel(nn.Module):
    def __init__(
        self,
        input_size=512,
        activation=nn.SELU(),
        feature_size=64,
        num_heads=1,
        num_layers=1,
        positional_encoding=False,
        n_emo=7,
        n_sent=3,
    ):
        super(Vmodel, self).__init__()

        self.feature_video_dynamic = nn.ModuleList(
            [
                TransformerLayer(
                    input_dim=input_size,
                    num_heads=num_heads,
                    positional_encoding=positional_encoding,
                )
                for i in range(num_layers)
            ]
        )

        self.fcl = nn.Linear(input_size, feature_size)
        self.activation = activation
        self.feature_emo = nn.Linear(feature_size, feature_size)
        self.feature_sent = nn.Linear(feature_size, feature_size)
        self.fc_emo = nn.Linear(feature_size, n_emo)
        self.fc_sent = nn.Linear(feature_size, n_sent)

    def forward(self, x, with_features=False):
        for i, l in enumerate(self.feature_video_dynamic):
            x = l(x, x, x)

        represent = self.activation(torch.mean(x, axis=1))
        represent = self.activation(self.fcl(represent))
        represent_emo = self.activation(self.feature_emo(represent))
        represent_sent = self.activation(self.feature_sent(represent))
        prob_emo = self.fc_emo(represent_emo)
        prob_sent = self.fc_sent(represent_sent)

        if with_features:
            return {"emo": prob_emo, "sen": prob_sent}, x
        else:
            return {"emo": prob_emo, "sen": prob_sent}

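Vmodel consumes a window of per-frame 512-d face embeddings (with the defaults above and the config values WIN_MAX_LENGTH = 4 and target_fps = 5, that is a (1, 20, 512) tensor) and returns separate emotion and sentiment logits. A shape-level sketch, assuming the surrounding module (including TransformerLayer, defined earlier in this file) is importable as app.load_models; the shapes in the comment are the expected ones, not measured output:

import torch
from app.load_models import Vmodel  # assumes the module above is on the path

video_head = Vmodel().eval()                 # random weights, illustration only
window = torch.randn(1, 20, 512)             # one 4 s window at 5 fps, 512-d per frame
with torch.no_grad():
    preds, features = video_head(window, with_features=True)
print(preds["emo"].shape, preds["sen"].shape)  # expected: (1, 7) and (1, 3)
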

class VideoModelLoader:
    def __init__(
        self,
        face_checkpoint_url: str,
        emotion_checkpoint_url: str,
        emo_sent_checkpoint_url: str,
        folder_path: str,
        device: torch.device,
    ) -> None:
        self.device = device

        # YOLO face recognition model initialization
        face_model_path = load_model(face_checkpoint_url, folder_path)
        emotion_video_model_path = load_model(emotion_checkpoint_url, folder_path)
        emo_sent_video_model_path = load_model(emo_sent_checkpoint_url, folder_path)

        self.face_model = YOLO(face_model_path)

        # EmoAffectNet model initialization (static model)
        self.emo_affectnet_model = ResNet50(num_classes=7, channels=3)
        self.emo_affectnet_model.load_state_dict(
            torch.load(emotion_video_model_path, map_location=self.device)
        )
        self.emo_affectnet_model.to(self.device).eval()

        # Visual emotion and sentiment recognition model (dynamic model)
        self.emo_sent_video_model = Vmodel()
        self.emo_sent_video_model.load_state_dict(
            torch.load(emo_sent_video_model_path, map_location=self.device)
        )
        self.emo_sent_video_model.to(self.device).eval()

    def extract_zeros_features(self):
        zeros = torch.unsqueeze(torch.zeros((3, 224, 224)), 0).to(self.device)
        zeros_features = self.emo_affectnet_model.extract_features(zeros)
        return zeros_features.cpu().detach().numpy()[0]


class VideoFeatureExtractor:
    def __init__(
        self,
        model_loader: VideoModelLoader,
        file_path: str,
        target_fps: int = 5,
        with_features: bool = True,
    ) -> None:
        self.model_loader = model_loader
        self.with_features = with_features

        # Video options
        self.cap = cv2.VideoCapture(file_path)
        self.w, self.h, self.fps, self.frame_number = (
            int(self.cap.get(x))
            for x in (
                cv2.CAP_PROP_FRAME_WIDTH,
                cv2.CAP_PROP_FRAME_HEIGHT,
                cv2.CAP_PROP_FPS,
                cv2.CAP_PROP_FRAME_COUNT,
            )
        )
        self.dur = self.frame_number / self.fps
        self.target_fps = target_fps
        self.frame_interval = int(self.fps / target_fps)

        # Extract zero features if no face found in frame
        self.zeros_features = self.model_loader.extract_zeros_features()

        # Dictionaries with facial features and faces
        self.facial_features = {}
        self.faces = {}

    def preprocess_frame(self, frame: np.ndarray, counter: int) -> None:
        curr_fr = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        results = self.model_loader.face_model.track(
            curr_fr,
            persist=True,
            imgsz=640,
            conf=0.01,
            iou=0.5,
            augment=False,
            device=self.model_loader.device,
            verbose=False,
        )

        need_features = np.zeros(512)
        count_face = 0

        if results[0].boxes.xyxy.cpu().tolist() != []:
            for i in results[0].boxes:
                idx_box = i.id.int().cpu().tolist()[0] if i.id else -1
                box = i.xyxy.int().cpu().tolist()[0]
                startX, startY = max(0, box[0]), max(0, box[1])
                endX, endY = min(self.w - 1, box[2]), min(self.h - 1, box[3])

                face_region = curr_fr[startY:endY, startX:endX]
                norm_face_region = pth_processing(Image.fromarray(face_region))
                with torch.no_grad():
                    curr_features = (
                        self.model_loader.emo_affectnet_model.extract_features(
                            norm_face_region.to(self.model_loader.device)
                        )
                    )
                need_features += curr_features.cpu().detach().numpy()[0]
                count_face += 1

                # face_region = cv2.resize(face_region, (224,224), interpolation = cv2.INTER_AREA)
                # face_region = display_frame_info(face_region, 'Frame: {}'.format(count_face), box_scale=.3)

                if idx_box in self.faces:
                    self.faces[idx_box].update({counter: face_region})
                else:
                    self.faces[idx_box] = {counter: face_region}

            need_features /= count_face
            self.facial_features[counter] = need_features
        else:
            if counter - 1 in self.facial_features:
                self.facial_features[counter] = self.facial_features[counter - 1]
            else:
                self.facial_features[counter] = self.zeros_features

    def preprocess_video(self) -> None:
        counter = 0

        while True:
            ret, frame = self.cap.read()
            if not ret:
                break
            if counter % self.frame_interval == 0:
                self.preprocess_frame(frame, counter)
            counter += 1

    def __call__(
        self, window: dict, win_max_length: int, sr: int = 16000
    ) -> tuple[dict[torch.Tensor], torch.Tensor]:

        curr_idx_frames = get_idx_frames_in_windows(
            list(self.facial_features.keys()), window, self.fps, sr
        )

        video_features = np.array(list(self.facial_features.values()))

        curr_features = video_features[curr_idx_frames, :]

        if len(curr_features) < self.target_fps * win_max_length:
            diff = self.target_fps * win_max_length - len(curr_features)
            curr_features = np.concatenate(
                [curr_features, [curr_features[-1]] * diff], axis=0
            )

        curr_features = (
            torch.FloatTensor(curr_features).unsqueeze(0).to(self.model_loader.device)
        )

        with torch.no_grad():
            if self.with_features:
                preds, features = self.model_loader.emo_sent_video_model(
                    curr_features, with_features=self.with_features
                )
            else:
                preds = self.model_loader.emo_sent_video_model(
                    curr_features, with_features=self.with_features
                )

        predicts = {
            "emo": F.softmax(preds["emo"], dim=-1).detach().cpu().squeeze(),
            "sen": F.softmax(preds["sen"], dim=-1).detach().cpu().squeeze(),
        }

        return (
            (predicts, features.detach().cpu().squeeze())
            if self.with_features
            else (predicts, None)
        )
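Taken together, the video branch is wired as: YOLO face tracking, per-face ResNet-50 embeddings averaged per frame, then windowed Vmodel predictions. A hedged usage sketch — not part of the commit; the checkpoint URLs are placeholders assembled in the style of the config.toml entries further below, the example video comes from the tabs module, and the window dict is expressed in audio samples at 16 kHz as get_idx_frames_in_windows expects:

import torch
from app.load_models import VideoModelLoader, VideoFeatureExtractor

hf = "https://huggingface.co/ElenaRyumina/MASAI_models/resolve/main/"
loader = VideoModelLoader(
    face_checkpoint_url=hf + "yolov8n-face.pt",
    emotion_checkpoint_url=hf + "emo_affectnet_weights.pt",
    emo_sent_checkpoint_url=hf + "emo_sent_video_weights.pth",
    folder_path="models",
    device=torch.device("cuda" if torch.cuda.is_available() else "cpu"),
)

extractor = VideoFeatureExtractor(loader, "videos/1.mp4", target_fps=5)
extractor.preprocess_video()  # face detection + per-frame features over the whole clip

# One 4-second window, given in audio samples (sr=16000), as the caller does elsewhere
predicts, features = extractor(window={"start": 0, "end": 4 * 16000}, win_max_length=4)
print(predicts["emo"], predicts["sen"])
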
app/plots.py
ADDED
@@ -0,0 +1,115 @@
"""
File: plots.py
Author: Dmitry Ryumin, Maxim Markitantov, Elena Ryumina, Anastasia Dvoynikova, and Alexey Karpov
Description: Plotting functions.
License: MIT License
"""

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import cv2


def plot_audio(time_axis, waveform, frame_indices, fps, figsize=(10, 4)) -> plt.Figure:
    frame_times = np.array(frame_indices) / fps

    fig, ax = plt.subplots(figsize=figsize)
    ax.plot(time_axis, waveform[0])
    ax.set_xlabel("Time (frames)")
    ax.set_ylabel("Amplitude")
    ax.grid(True)

    ax.set_xticks(frame_times)
    ax.set_xticklabels([f"{int(frame_time * fps) + 1}" for frame_time in frame_times])

    fig.tight_layout()

    return fig


def plot_images(image_paths):
    fig, axes = plt.subplots(1, len(image_paths), figsize=(12, 2))

    for ax, img_path in zip(axes, image_paths):
        ax.imshow(img_path)
        ax.axis("off")

    fig.tight_layout()
    return fig


def get_evenly_spaced_frame_indices(total_frames, num_frames=10):
    if total_frames <= num_frames:
        return list(range(total_frames))

    step = total_frames / num_frames
    return [int(np.round(i * step)) for i in range(num_frames)]


def plot_predictions(
    df: pd.DataFrame,
    column: str,
    title: str,
    y_labels: list[str],
    figsize: tuple[int, int],
    x_ticks: list[int],
    line_width: float = 2.0,
) -> plt.Figure:
    fig, ax = plt.subplots(figsize=figsize)

    ax.plot(df.index, df[column], linestyle="dotted", linewidth=line_width)
    ax.set_title(title)
    ax.set_xlabel("Frames")
    ax.set_ylabel(title)

    ax.set_xticks(x_ticks)
    ax.set_yticks(range(len(y_labels)))
    ax.set_yticklabels(y_labels)

    ax.grid(True)
    fig.tight_layout()
    return fig

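plot_predictions draws one dotted curve of per-frame class indices with the class names on the y-axis. A small illustrative call (the labels are the DICT_EMO values from config.toml below; the DataFrame here is synthetic):

import pandas as pd

emo_labels = ["Neutral", "Happy", "Sad", "Anger", "Surprise", "Disgust", "Fear"]
df_pred = pd.DataFrame({"emo": [0, 0, 1, 1, 3, 3, 0, 0, 2, 2]})  # class index per frame

fig = plot_predictions(
    df=df_pred,
    column="emo",
    title="Emotion",
    y_labels=emo_labels,
    figsize=(10, 4),
    x_ticks=[0, 3, 6, 9],
)
fig.savefig("emotion_predictions.png")
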

def display_frame_info(img, text, margin=1.0, box_scale=1.0, scale=1.5):
    img_copy = img.copy()
    img_h, img_w, _ = img_copy.shape
    line_width = int(min(img_h, img_w) * 0.001)
    thickness = max(int(line_width / 3), 1)

    font_face = cv2.FONT_HERSHEY_SIMPLEX
    font_color = (0, 0, 0)
    font_scale = thickness / scale

    t_w, t_h = cv2.getTextSize(text, font_face, font_scale, None)[0]

    margin_n = int(t_h * margin)
    sub_img = img_copy[
        0 + margin_n : 0 + margin_n + t_h + int(2 * t_h * box_scale),
        img_w - t_w - margin_n - int(2 * t_h * box_scale) : img_w - margin_n,
    ]

    white_rect = np.ones(sub_img.shape, dtype=np.uint8) * 255

    img_copy[
        0 + margin_n : 0 + margin_n + t_h + int(2 * t_h * box_scale),
        img_w - t_w - margin_n - int(2 * t_h * box_scale) : img_w - margin_n,
    ] = cv2.addWeighted(sub_img, 0.5, white_rect, 0.5, 1.0)

    cv2.putText(
        img=img_copy,
        text=text,
        org=(
            img_w - t_w - margin_n - int(2 * t_h * box_scale) // 2,
            0 + margin_n + t_h + int(2 * t_h * box_scale) // 2,
        ),
        fontFace=font_face,
        fontScale=font_scale,
        color=font_color,
        thickness=thickness,
        lineType=cv2.LINE_AA,
        bottomLeftOrigin=False,
    )

    return img_copy
app/requirements_app.py
ADDED
@@ -0,0 +1,37 @@
"""
File: requirements_app.py
Author: Dmitry Ryumin, Maxim Markitantov, Elena Ryumina, Anastasia Dvoynikova, and Alexey Karpov
Description: Project requirements for the Gradio app.
License: MIT License
"""

import polars as pl

# Importing necessary components for the Gradio app
from app.config import config_data


def read_requirements(file_path="requirements.txt"):
    with open(file_path, "r") as file:
        lines = file.readlines()

    data = []

    pypi = (
        lambda x: f"<a href='https://pypi.org/project/{x}' target='_blank'>"
        + f"<img src='https://img.shields.io/pypi/v/{x}' alt='PyPI' /></a>"
    )

    data = [
        {
            config_data.Requirements_LIBRARY: split_line[0],
            config_data.Requirements_RECOMMENDED_VERSION: split_line[1],
            config_data.Requirements_CURRENT_VERSION: pypi(split_line[0]),
        }
        for line in lines
        if (split_line := line.strip().split("==")) and len(split_line) == 2
    ]

    df = pl.DataFrame(data)

    return df
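For a pinned line such as gradio==5.4.0, read_requirements emits one row whose last column is an HTML badge linking to PyPI. A minimal check of the parsing idea, standalone and without the config-driven column names:

line = "gradio==5.4.0\n"
if (split_line := line.strip().split("==")) and len(split_line) == 2:
    library, recommended = split_line
    badge = (
        f"<a href='https://pypi.org/project/{library}' target='_blank'>"
        f"<img src='https://img.shields.io/pypi/v/{library}' alt='PyPI' /></a>"
    )
    print(library, recommended)  # gradio 5.4.0
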
app/tabs.py
ADDED
@@ -0,0 +1,154 @@
"""
File: tabs.py
Author: Dmitry Ryumin, Maxim Markitantov, Elena Ryumina, Anastasia Dvoynikova, and Alexey Karpov
Description: Gradio app tabs - Contains the definition of various tabs for the Gradio app interface.
License: MIT License
"""

import gradio as gr

# Importing necessary components for the Gradio app
from app.description import DESCRIPTION
from app.config import config_data
from app.components import html_message
from app.requirements_app import read_requirements


def app_tab():
    gr.Markdown(value=DESCRIPTION)

    with gr.Row(
        visible=True,
        render=True,
        variant="default",
        elem_classes="app-container",
    ):
        with gr.Column(
            visible=True,
            render=True,
            variant="default",
            elem_classes="video-container",
        ):
            video = gr.Video(
                label=config_data.Labels_VIDEO,
                show_label=True,
                interactive=True,
                visible=True,
                mirror_webcam=True,
                include_audio=True,
                elem_classes="video",
                autoplay=False,
            )

            with gr.Row(
                visible=True,
                render=True,
                variant="default",
                elem_classes="submit-container",
            ):
                clear = gr.Button(
                    value=config_data.OtherMessages_CLEAR,
                    interactive=False,
                    icon=config_data.Path_APP
                    / config_data.StaticPaths_IMAGES
                    / "clear.ico",
                    visible=True,
                    elem_classes="clear",
                )
                submit = gr.Button(
                    value=config_data.OtherMessages_SUBMIT,
                    interactive=False,
                    icon=config_data.Path_APP
                    / config_data.StaticPaths_IMAGES
                    / "submit.ico",
                    visible=True,
                    elem_classes="submit",
                )

            gr.Examples(
                [
                    "videos/1.mp4",
                    "videos/2.mp4",
                ],
                [video],
            )

        with gr.Column(
            visible=True,
            render=True,
            variant="default",
            elem_classes="results-container",
        ):
            noti_results = html_message(
                message=config_data.InformationMessages_NOTI_RESULTS[0],
                error=True,
                visible=True,
            )

            waveform = gr.Plot(
                value=None,
                label=config_data.Labels_WAVEFORM,
                show_label=True,
                visible=False,
                elem_classes="audio",
            )

            faces = gr.Plot(
                value=None,
                label=config_data.Labels_FACE_IMAGES,
                show_label=True,
                visible=False,
                elem_classes="imgs",
            )

            emotion_stats = gr.Plot(
                value=None,
                label=config_data.Labels_EMO_STATS,
                show_label=True,
                visible=False,
                elem_classes="emo-stats",
            )

            sent_stats = gr.Plot(
                value=None,
                label=config_data.Labels_SENT_STATS,
                show_label=True,
                visible=False,
                elem_classes="sent-stats",
            )

    return (
        video,
        clear,
        submit,
        noti_results,
        waveform,
        faces,
        emotion_stats,
        sent_stats,
    )


def settings_app_tab():
    pass


def about_app_tab():
    pass


def about_authors_tab():
    pass


def requirements_app_tab():
    reqs = read_requirements()

    return gr.Dataframe(
        headers=reqs.columns,
        value=reqs,
        datatype=["markdown"] * len(reqs.columns),
        visible=True,
        elem_classes="requirements-dataframe",
        type="polars",
    )
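app_tab only builds and returns the components; arranging the tab functions into a page and wiring the buttons to handlers happens in app.py and app/event_handlers, which are outside this excerpt. A hedged sketch of the kind of assembly the [TabCreators] table in config.toml below suggests (illustrative, not the actual app.py):

import gradio as gr
from app.tabs import app_tab, requirements_app_tab

with gr.Blocks() as demo:
    with gr.Tab("App"):
        components = app_tab()  # (video, clear, submit, noti_results, plots, ...)
    with gr.Tab("Requirements"):
        requirements_app_tab()

demo.launch()
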
app/utils.py
ADDED
@@ -0,0 +1,287 @@
"""
File: utils.py
Author: Dmitry Ryumin, Maxim Markitantov, Elena Ryumina, Anastasia Dvoynikova, and Alexey Karpov
Description: Utility functions.
License: MIT License
"""

import torch
import os
import subprocess
import bisect
import re
import requests
from torchvision import transforms
from PIL import Image
from transformers import WhisperProcessor, WhisperForConditionalGeneration
from pathlib import Path
from contextlib import suppress
from urllib.parse import urlparse

from typing import Callable


def load_model(
    model_url: str, folder_path: str, force_reload: bool = False
) -> str | None:

    file_name = Path(urlparse(model_url).path).name
    file_path = Path(folder_path) / file_name

    if file_path.exists() and not force_reload:
        return str(file_path)

    with suppress(Exception), requests.get(model_url, stream=True) as response:
        file_path.parent.mkdir(parents=True, exist_ok=True)

        with file_path.open("wb") as file:
            for chunk in response.iter_content(chunk_size=8192):
                file.write(chunk)

        return str(file_path)

    return None


def readetect_speech(
    file_path: str,
    read_audio: Callable,
    get_speech_timestamps: Callable,
    vad_model: torch.jit.ScriptModule,
    sr: int = 16000,
) -> list[dict]:
    wav = read_audio(file_path, sampling_rate=sr)
    # get speech timestamps from full audio file
    speech_timestamps = get_speech_timestamps(wav, vad_model, sampling_rate=sr)

    return wav, speech_timestamps


def calculate_mode(series):
    mode = series.mode()
    return mode[0] if not mode.empty else None


def pth_processing(fp):
    class PreprocessInput(torch.nn.Module):
        def __init__(self):
            super(PreprocessInput, self).__init__()

        def forward(self, x):
            x = x.to(torch.float32)
            x = torch.flip(x, dims=(0,))
            x[0, :, :] -= 91.4953
            x[1, :, :] -= 103.8827
            x[2, :, :] -= 131.0912
            return x

    def get_img_torch(img, target_size=(224, 224)):
        transform = transforms.Compose([transforms.PILToTensor(), PreprocessInput()])
        img = img.resize(target_size, Image.Resampling.NEAREST)
        img = transform(img)
        img = torch.unsqueeze(img, 0)
        return img

    return get_img_torch(fp)


def get_idx_frames_in_windows(
    frames: list[int], window: dict, fps: int, sr: int = 16000
) -> list[list]:

    frames_in_windows = [
        idx
        for idx, frame in enumerate(frames)
        if window["start"] * fps / sr <= frame < window["end"] * fps / sr
    ]
    return frames_in_windows


# Maxim code
def slice_audio(
    start_time: float,
    end_time: float,
    win_max_length: float,
    win_shift: float,
    win_min_length: float,
) -> list[dict]:
    """Slices audio into windows

    Args:
        start_time (float): Start time of audio
        end_time (float): End time of audio
        win_max_length (float): Window max length
        win_shift (float): Window shift
        win_min_length (float): Window min length

    Returns:
        list[dict]: List of dicts with timings, e.g. {'start': 0, 'end': 12}
    """

    if end_time < start_time:
        return []
    elif (end_time - start_time) > win_max_length:
        timings = []
        while start_time < end_time:
            end_time_chunk = start_time + win_max_length
            if end_time_chunk < end_time:
                timings.append({"start": start_time, "end": end_time_chunk})
            elif end_time_chunk == end_time:  # if the tail is exactly `win_max_length` seconds
                timings.append({"start": start_time, "end": end_time_chunk})
                break
            else:  # if the tail is shorter than `win_max_length` seconds
                if (
                    end_time - start_time < win_min_length
                ):  # if the tail is shorter than `win_min_length` seconds
                    break

                timings.append({"start": start_time, "end": end_time})
                break

            start_time += win_shift
        return timings
    else:
        return [{"start": start_time, "end": end_time}]


def convert_video_to_audio(file_path: str, sr: int = 16000) -> str:
    path_save = file_path.split(".")[0] + ".wav"
    if not os.path.exists(path_save):
        ffmpeg_command = f"ffmpeg -y -i {file_path} -async 1 -vn -acodec pcm_s16le -ar {sr} {path_save}"
        subprocess.call(ffmpeg_command, shell=True)

    return path_save


def find_nearest_frames(target_frames, all_frames):
    nearest_frames = []
    for frame in target_frames:
        pos = bisect.bisect_left(all_frames, frame)
        if pos == 0:
            nearest_frame = all_frames[0]
        elif pos == len(all_frames):
            nearest_frame = all_frames[-1]
        else:
            before = all_frames[pos - 1]
            after = all_frames[pos]
            nearest_frame = before if frame - before <= after - frame else after
        nearest_frames.append(nearest_frame)
    return nearest_frames


def find_intersections(
    x: list[dict], y: list[dict], min_length: float = 0
) -> list[dict]:
    """Find intersections of two lists of interval dicts, preserving the structure of `x` and adding intersection info

    Args:
        x (list[dict]): First list of intervals
        y (list[dict]): Second list of intervals
        min_length (float, optional): Minimum length of intersection. Defaults to 0.

    Returns:
        list[dict]: Windows with intersections, maintaining the structure of `x` and indicating intersection presence.
    """
    timings = []
    j = 0

    for interval_x in x:
        original_start = int(interval_x["start"])
        original_end = int(interval_x["end"])
        intersections_found = False

        while j < len(y) and y[j]["end"] < original_start:
            j += 1  # Skip any intervals in `y` that end before the current interval in `x` starts

        # Check for all overlapping intervals in `y`
        temp_j = (
            j  # Temporary pointer to check intersections within `y` for current `x`
        )
        while temp_j < len(y) and y[temp_j]["start"] <= original_end:
            # Calculate the intersection between `x[i]` and `y[j]`
            intersection_start = max(original_start, y[temp_j]["start"])
            intersection_end = min(original_end, y[temp_j]["end"])

            if (
                intersection_start < intersection_end
                and (intersection_end - intersection_start) >= min_length
            ):
                timings.append(
                    {
                        "original_start": original_start,
                        "original_end": original_end,
                        "start": intersection_start,
                        "end": intersection_end,
                        "speech": True,
                    }
                )
                intersections_found = True

            temp_j += 1  # Move to the next interval in `y` for further intersections

        # If no intersections were found, add the interval with `speech` set to False
        if not intersections_found:
            timings.append(
                {
                    "original_start": original_start,
                    "original_end": original_end,
                    "start": None,
                    "end": None,
                    "speech": False,
                }
            )

    return timings


# Anastasia code
class ASRModel:
    def __init__(self, checkpoint_path: str, device: torch.device):
        self.processor = WhisperProcessor.from_pretrained(checkpoint_path)
        self.model = WhisperForConditionalGeneration.from_pretrained(
            checkpoint_path
        ).to(device)
        self.device = device
        self.model.config.forced_decoder_ids = None

    def __call__(
        self, sample: torch.Tensor, audio_windows: dict, sr: int = 16000
    ) -> tuple:
        texts = []

        for t in range(len(audio_windows)):
            input_features = self.processor(
                sample[audio_windows[t]["start"] : audio_windows[t]["end"]],
                sampling_rate=sr,
                return_tensors="pt",
            ).input_features
            predicted_ids = self.model.generate(input_features.to(self.device))
            transcription = self.processor.batch_decode(
                predicted_ids, skip_special_tokens=False
            )
            texts.append(re.findall(r"> ([^<>]+)", transcription[0]))

        # for drawing
        input_features = self.processor(
            sample, sampling_rate=sr, return_tensors="pt"
        ).input_features
        predicted_ids = self.model.generate(input_features.to(self.device))
        transcription = self.processor.batch_decode(
            predicted_ids, skip_special_tokens=False
        )
        total_text = re.findall(r"> ([^<>]+)", transcription[0])

        return texts, total_text


def convert_webm_to_mp4(input_file):

    path_save = input_file.split(".")[0] + ".mp4"

    if not os.path.exists(path_save):
        ff_video = "ffmpeg -i {} -c:v copy -c:a aac -strict experimental {}".format(
            input_file, path_save
        )
        subprocess.call(ff_video, shell=True)

    return path_save
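The two interval helpers above are meant to work together: slice_audio produces fixed-length windows for the whole recording, and find_intersections marks which of those windows actually overlap the VAD speech segments. A small worked example with the config.toml defaults (4 s windows, 2 s shift; times are in seconds here, whereas the app itself passes sample counts):

windows = slice_audio(
    start_time=0, end_time=10, win_max_length=4, win_shift=2, win_min_length=2
)
# -> [{'start': 0, 'end': 4}, {'start': 2, 'end': 6}, {'start': 4, 'end': 8}, {'start': 6, 'end': 10}]

speech = [{"start": 1, "end": 5}]  # e.g. one VAD speech segment
marked = find_intersections(windows, speech, min_length=0)
for w in marked:
    print(w["original_start"], w["original_end"], w["speech"])
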
config.toml
ADDED
@@ -0,0 +1,66 @@
[AppSettings]
APP_VERSION = "0.0.1"
CSS_PATH = "app.css"

[General]
SR = 16000
START_TIME = 0
WIN_MAX_LENGTH = 4
WIN_SHIFT = 2
WIN_MIN_LENGTH = 2
DICT_EMO = [
    "Neutral",
    "Happy",
    "Sad",
    "Anger",
    "Surprise",
    "Disgust",
    "Fear",
]
DICT_SENT = [
    "Negative",
    "Neutral",
    "Positive",
]

[InformationMessages]
NOTI_RESULTS = [
    "Upload or record video",
    "Video uploaded, you can perform calculations",
]

[OtherMessages]
CLEAR = "Clear"
SUBMIT = "Calculate"

[Labels]
VIDEO = "Video"
FACE_IMAGES = "Face images"
WAVEFORM = "Waveform"
EMO_STATS = "Statistics of emotions"
SENT_STATS = "Statistics of sentiments"

[TabCreators]
"β App" = "app_tab"
"βοΈ Settings" = "settings_app_tab"
"π‘ About App" = "about_app_tab"
"π Authors" = "about_authors_tab"
"π Requirements" = "requirements_app_tab"

[StaticPaths]
MODELS = "models"
IMAGES = "images"
WEIGHTS = "weights"
VAD_MODEL = "snakers4/silero-vad"
HF_MODELS = "https://huggingface.co/ElenaRyumina/MASAI_models/resolve/main/"
EMO_AFFECTNET_WEIGHTS = "emo_affectnet_weights.pt"
EMO_SENT_AUDIO_WEIGHTS = "emo_sent_audio_weights.pth"
EMO_SENT_TEXT_WEIGHTS = "emo_sent_text_weights.pth"
EMO_SENT_VIDEO_WEIGHTS = "emo_sent_video_weights.pth"
YOLOV8N_FACE = "yolov8n-face.pt"
OPENAI_WHISPER = "openai/whisper-base"

[Requirements]
LIBRARY = "Library"
RECOMMENDED_VERSION = "Recommended version"
CURRENT_VERSION = "Current version"
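The flattened attribute names used throughout the app (config_data.Labels_VIDEO, config_data.Requirements_LIBRARY, and so on) suggest that app/config.py joins the TOML section and key with an underscore; that file is not part of this excerpt, so the loader below is only an assumed sketch of the idea:

import tomllib  # Python 3.11+; the third-party 'toml' package would work on older versions
from types import SimpleNamespace

with open("config.toml", "rb") as f:
    raw = tomllib.load(f)

# Hypothetical flattening: [Labels] VIDEO = "Video" -> config_data.Labels_VIDEO
config_data = SimpleNamespace(
    **{
        f"{section}_{key}": value
        for section, table in raw.items()
        for key, value in table.items()
    }
)
print(config_data.General_SR, config_data.Labels_VIDEO)
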
images/clear.ico
ADDED
images/submit.ico
ADDED
requirements.txt
ADDED
@@ -0,0 +1,13 @@
gradio==5.4.0
gradio_client==1.4.2
polars==1.12.0
torch==2.2.2
torchaudio==2.2.2
opencv-contrib-python==4.10.0.84
ultralytics==8.3.26
lapx==0.5.11
transformers==4.46.1
pillow==11.0.0
pandas==2.2.3
numpy==1.26.4
matplotlib==3.9.2