import re

import gradio as gr
import numpy as np
from transformers import pipeline

# Speech-recognition pipeline shared by every call to transcribe(); built once
# at import time so it is not re-created for each audio chunk.
transcriber = pipeline(
    task="automatic-speech-recognition",
    model="openai/whisper-base.en",
)


def _parse_words(words_list):
    """Split a comma-separated string into its unique, non-empty, stripped words."""
    stripped = (word.strip() for word in words_list.split(","))
    # dict.fromkeys de-duplicates while keeping the order the user typed.
    return list(dict.fromkeys(word for word in stripped if word))


def _count_words(text, words):
    """Return {word: occurrences} counting whole-word, case-insensitive matches in text."""
    return {
        word: len(
            re.findall(
                # Lookarounds instead of \b so phrases and tokens that start or
                # end with punctuation are still delimited correctly.
                rf"(?<!\w){re.escape(word)}(?!\w)",
                text,
                flags=re.IGNORECASE,
            )
        )
        for word in words
    }


def transcribe(state, words_list, new_chunk):
    """Transcribe the audio received so far and count the requested words.

    Args:
        state: Dict carried between calls with the keys "stream" (accumulated
            audio samples), "full_transcription" and "counts_of_words", or
            None on the first call.
        words_list: Comma-separated words/phrases to count in the transcript.
        new_chunk: ``(sampling_rate, samples)`` tuple for the newest audio
            chunk, or None when no audio has been recorded yet.

    Returns:
        A ``(state, counts, transcription)`` tuple. When there is nothing new
        to process, or transcription fails, the previous values are returned
        unchanged.
    """
    print(f"state: {state}")

    if state is None:
        state = {}

    stream = state.get("stream")
    previous_transcription = state.get("full_transcription", "")
    previous_counts_of_words = state.get("counts_of_words", {})

    if new_chunk is None:
        gr.Info("You can start transcribing by clicking on the Record button")
        print("new chunk is None")
        return state, previous_counts_of_words, previous_transcription

    sr, y = new_chunk

    try:
        words_to_check_for = _parse_words(words_list)
    except AttributeError:
        # words_list is not a string (e.g. None): nothing to split.
        gr.Warning("Please enter a valid list of words to check for")
        words_to_check_for = []

    # Convert to mono if stereo
    if y.ndim > 1:
        y = y.mean(axis=1)

    y = y.astype(np.float32)

    if y.size == 0:
        # An empty chunk adds no audio; keep everything as it was.
        return state, previous_counts_of_words, previous_transcription

    # Normalise to [-1, 1]; skip silent chunks to avoid dividing by zero and
    # filling the stream with NaNs.
    peak = np.max(np.abs(y))
    if peak > 0:
        y /= peak

    stream = y if stream is None else np.concatenate([stream, y])

    try:
        new_transcription = transcriber({"sampling_rate": sr, "raw": stream})
    except Exception as e:
        # gr.Error is an exception type and has no effect unless raised;
        # gr.Warning shows the message while the previous outputs are kept.
        gr.Warning(f"Transcription failed. Error: {e}")
        print(f"Transcription failed. Error: {e}")
        return state, previous_counts_of_words, previous_transcription

    print(f"new transcription: {new_transcription}")

    # `stream` holds all audio received so far, so this result is the
    # transcription of everything passed to the model. Appending it to the
    # earlier transcript (or adding to the earlier counts) would repeat text
    # and count the same words again on every chunk.
    full_transcription_text = new_transcription["text"].strip()
    new_counts_of_words = _count_words(full_transcription_text, words_to_check_for)

    new_state = {
        "stream": stream,
        "full_transcription": full_transcription_text,
        "counts_of_words": new_counts_of_words,
    }

    print(f"new state: {new_state}")

    return new_state, new_counts_of_words, full_transcription_text


# Live interface: the first input/output pair is Gradio session state, which
# carries the accumulated audio, transcript and counts between streamed chunks.
demo = gr.Interface(
    fn=transcribe,
    inputs=[
        "state",
        gr.Textbox(label="List of filler words"),
        gr.Audio(sources=["microphone"], streaming=True),
    ],
    outputs=[
        "state",
        gr.JSON(label="Filler words count"),
        gr.Text(label="Transcription"),
    ],
    live=True,
)

demo.launch()