Spaces:

jerpint
/

vox-clone-guesser

Sleeping

File size: 4,734 Bytes

72c20ae

import os
import gradio as gr
import json
import requests
import random

# Display names of the two answer classes, in score-matrix index order:
# index 0 is the genuine recording, index 1 the cloned one.
labels = [
    "Real Audio 🗣️",
    "Cloned Audio 🤖",
]

# Passed as `duration=` to the gr.Info / gr.Warning pop-ups
# (presumably seconds -- confirm against the Gradio version in use).
DURATION = 2


def get_accuracy(score_matrix) -> str:
    """Return the accuracy of a confusion matrix as a percentage string.

    Accuracy is the sum of the diagonal (answers where guess and truth agree)
    divided by the sum of every cell. Works for any number of classes, not
    just two, so it stays consistent with the N-class markdown renderer.

    Args:
        score_matrix: Sequence of rows of numeric counts.

    Returns:
        The accuracy formatted like ``"75.00%"``, or an empty string when the
        matrix holds no answers yet (including an empty matrix), which avoids
        a division by zero.
    """
    total = sum(sum(row) for row in score_matrix)
    if total == 0:
        return ""

    # Guard on the row length so a ragged/non-square matrix cannot raise.
    correct = sum(row[i] for i, row in enumerate(score_matrix) if i < len(row))

    accuracy = correct / total * 100
    return f"{accuracy:.2f}%"


def audio_link(path: str, model: str):
    """Build the download URL of one audio file in the hosted dataset.

    The URL is the dataset's ``resolve/main`` root followed by the model
    folder and then the file path, with ``?download=true`` appended.
    """
    dataset_root = "https://huggingface.co/datasets/jerpint/vox-cloned-data/resolve/main"
    return f"{dataset_root}/{model}/{path}?download=true"


def confusion_matrix_to_markdown(matrix, labels=None):
    """Render a confusion matrix as a markdown table plus an accuracy line.

    Args:
        matrix: Sequence of rows; each row becomes one table row.
        labels: List of names used for both the column headers and the row
            headers. When omitted (or empty), ``Class 0``, ``Class 1``, ...
            are generated, one per matrix row.

    Returns:
        The markdown text: header row, separator row, one row per matrix row,
        a blank line, and a trailing ``Accuracy %:`` line.
    """
    size = len(matrix)
    if not labels:
        labels = [f"Class {idx}" for idx in range(size)]
    accuracy_text = get_accuracy(matrix)

    # The header starts with an empty cell that sits above the row labels.
    header_cells = " | ".join([""] + labels)
    divider_cells = " | ".join(["---"] * (size + 1))
    table_lines = [f"| {header_cells} |", f"| {divider_cells} |"]

    for idx, cells in enumerate(matrix):
        rendered = " | ".join(str(cell) for cell in cells)
        table_lines.append(f"| {labels[idx]} | {rendered} |")

    table = "".join(line + "\n" for line in table_lines)
    return table + f"\nAccuracy %: {accuracy_text}\n"


def load_and_cache_data():
    """Load the dataset file index, downloading it once and caching it locally.

    If ``files.json`` does not exist in the current working directory, the
    index is fetched from the hosted dataset and written there; later calls
    read the cached copy instead of hitting the network.

    Returns:
        The parsed JSON content of the index.

    Raises:
        Exception: If the server answers with a status other than 200.
        requests.exceptions.RequestException: On network failure or timeout.
        json.JSONDecodeError: If the downloaded or cached content is not
            valid JSON.
    """
    json_link = "https://huggingface.co/datasets/jerpint/vox-cloned-data/resolve/main/files.json?download=true"
    local_file = "files.json"

    if not os.path.exists(local_file):
        # Without a timeout, requests can block forever on a stalled connection.
        json_file = requests.get(json_link, timeout=30)
        if json_file.status_code != 200:
            raise Exception(f"Failed to load data from {json_link}")

        # Parse before caching so an invalid body never poisons the cache.
        parsed = json.loads(json_file.text)

        # Cache the file; explicit UTF-8 keeps the cache independent of the
        # platform's default locale encoding.
        with open(local_file, "w", encoding="utf-8") as f:
            f.write(json_file.text)

        return parsed

    with open(local_file, "r", encoding="utf-8") as f:
        return json.load(f)


def load_data():
    """Download and parse the dataset file index (no local caching).

    Returns:
        The parsed JSON content of the hosted ``files.json``.

    Raises:
        Exception: If the server answers with a status other than 200.
        requests.exceptions.RequestException: On network failure or timeout.
        json.JSONDecodeError: If the response body is not valid JSON.
    """
    json_link = "https://huggingface.co/datasets/jerpint/vox-cloned-data/resolve/main/files.json?download=true"
    # Without a timeout, requests can block forever on a stalled connection,
    # which here would hang the whole app at startup.
    json_file = requests.get(json_link, timeout=30)
    if json_file.status_code != 200:
        raise Exception(f"Failed to load data from {json_link}")
    print("Loaded data")
    return json.loads(json_file.text)


def select_random_model(path):
    """Select a random model from the list of models for a given path.

    Will select commonvoice 50% of the time, and a random other model 50% of
    the time, as long as the sample offers both kinds of source. If the sample
    has no "commonvoice" source, another model is always chosen; if it has no
    other model, "commonvoice" is returned instead of raising.

    Args:
        path: Key into the module-level ``data`` mapping.

    Returns:
        The name of the chosen model.

    Raises:
        KeyError: If ``path`` is not present in ``data``.
    """
    models = data[path]
    other_models = [m for m in models if m != "commonvoice"]
    has_real = "commonvoice" in models

    # Draw the coin first so the 50/50 split is unchanged for normal samples.
    pick_real = random.random() < 0.5

    # random.choice raises IndexError on an empty sequence, so never reach it
    # without candidates; and never hand back a source the sample lacks.
    if not other_models or (pick_real and has_real):
        return "commonvoice"
    return random.choice(other_models)


def get_random_audio():
    """Pick a random sample path and a source model for it.

    Returns:
        A ``(path, model)`` tuple.
    """
    chosen_path = random.choice(paths)
    return chosen_path, select_random_model(chosen_path)


def next_audio():
    """Draw a fresh random sample and build an audio player for it.

    Returns:
        A pair ``(audio component, (path, model))`` where the component
        points at the download link of the drawn sample.
    """
    path, model = get_random_audio()
    player = gr.Audio(audio_link(path, model))
    return player, (path, model)


# Fetch the sample index once, at startup
data = load_data()

# Drop every sample that has fewer than 2 sources
data = {path: sources for path, sources in data.items() if len(sources) >= 2}

# All sample paths that remain available
paths = list(data)


with gr.Blocks() as demo:
    # Start from one concrete sample so the player and the state agree from
    # the very first render. Passing a callable to gr.State would re-draw the
    # state on every page load while the player kept the build-time URL, so
    # the first guess would be graded against a sample the user never heard.
    initial_audio = get_random_audio()
    current_audio = gr.State(initial_audio)

    # score_matrix[guess][actual]; index 0 = real audio, index 1 = cloned audio.
    score_matrix = gr.State([[0, 0], [0, 0]])

    with gr.Column():
        with gr.Row():
            audio_cmp = gr.Audio(audio_link(initial_audio[0], initial_audio[1]))
    with gr.Column():
        with gr.Row():
            button1 = gr.Button("Real Audio 🗣️")
            button2 = gr.Button("Cloned Audio 🤖")

    score_md = gr.Markdown(confusion_matrix_to_markdown(score_matrix.value, labels))

    # Give each page load its own random sample, updating the player and the
    # state together so they can never drift apart.
    demo.load(next_audio, inputs=None, outputs=[audio_cmp, current_audio])

    def _score_guess(guess, sample, matrix):
        """Grade one guess, record it, and advance to the next sample.

        Args:
            guess: 0 if the user answered "real", 1 if "cloned".
            sample: The ``(path, model)`` pair that was being played.
            matrix: The session's score matrix; updated in place.

        Returns:
            The new audio component, the new ``(path, model)`` pair, the
            updated matrix, and its markdown rendering.
        """
        actual = 0 if sample[1] == "commonvoice" else 1
        new_player, new_sample = next_audio()

        # The pop-up always names what the sample really was.
        truth = "Real Audio" if actual == 0 else "Cloned Audio"
        if guess == actual:
            gr.Info(f"Correct! {truth}", duration=DURATION)
        else:
            gr.Warning(f"Incorrect! {truth}", duration=DURATION)

        matrix[guess][actual] += 1
        return new_player, new_sample, matrix, confusion_matrix_to_markdown(matrix, labels)

    @gr.on(
        triggers=[button1.click],
        inputs=[current_audio, score_matrix],
        outputs=[audio_cmp, current_audio, score_matrix, score_md],
    )
    def check_real_guess(x, score_matrix):
        """Handle a click on the "Real Audio" button."""
        return _score_guess(0, x, score_matrix)

    @gr.on(
        triggers=[button2.click],
        inputs=[current_audio, score_matrix],
        outputs=[audio_cmp, current_audio, score_matrix, score_md],
    )
    def check_cloned_guess(x, score_matrix):
        """Handle a click on the "Cloned Audio" button."""
        return _score_guess(1, x, score_matrix)


demo.launch()