File size: 4,427 Bytes
8554b58
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e1fe3b9
 
 
 
 
 
 
 
 
 
 
73b5748
8554b58
 
 
 
73b5748
8554b58
5716a3c
8554b58
 
e1fe3b9
8554b58
1c35da3
 
45676fb
 
1c35da3
 
 
8554b58
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e1fe3b9
8554b58
 
 
 
 
 
 
 
 
 
 
 
 
 
e1fe3b9
 
 
 
8554b58
 
e1fe3b9
 
8554b58
 
e1fe3b9
 
8554b58
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e1fe3b9
8554b58
 
e1fe3b9
 
 
 
 
 
 
 
 
8554b58
 
 
e1fe3b9
8554b58
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
# This script transcribes a downloaded YouTube video using Deepgram.
# The audio should be cleaned with UVR5 first, so the input file is FLAC.
# It uploads the full-length interview or podcast to Deepgram, which returns
# speaker ids. The user must manually listen to the audio clips to find out
# which speaker is wanted, then discard the remaining speakers and any
# short audio clips.
#

import os
from dotenv import load_dotenv
from pydub import AudioSegment
import math
from os.path import join
import shutil

from deepgram import (
    DeepgramClient,
    PrerecordedOptions,
    FileSource,
)

def write_csv_file(csv_file, csv_data):
    """Write *csv_data* to *csv_file* as pipe-delimited text, one row per line.

    Parameters:
        csv_file: destination path.
        csv_data: iterable of rows; each row is an iterable of fields.
                  Every field is str()-converted and fields are joined with '|'.
    """
    # utf-8 is explicit so transcripts containing non-ASCII text don't crash
    # on platforms whose default encoding is narrower (e.g. cp1252 on Windows).
    with open(csv_file, 'w', encoding='utf-8') as file:
        for row in csv_data:
            # One '|'-separated line per row.
            file.write('|'.join(str(item) for item in row) + '\n')

    print(f"Data written to {csv_file}")

def process(audio_file, tag, progress):
    """Transcribe *audio_file* with Deepgram (nova-2, diarized), slice one WAV
    clip per sentence grouped by speaker under ./output/, and zip the result.

    Parameters:
        audio_file: path to the (UVR5-cleaned) audio file to transcribe.
        tag: source label (e.g. the YouTube source) prefixed onto every clip
             filename for categorization.
        progress: callable progress reporter taking a float in [0, 1]; also
                  exposes .tqdm — presumably a gradio Progress object, TODO confirm.

    Returns:
        The archive filename, "output.zip", created in the current directory.

    Raises:
        Whatever the Deepgram client raises if transcription fails.
    """
    load_dotenv("myenv-variable.env")

    AUDIO_FILE = audio_file  # audio file name
    TAGS = tag  # source label, used in output filenames

    API_KEY = os.getenv('API_DEEPGRAM')
    original_parent_folder = os.getcwd()
    output_folder = join(original_parent_folder, "output")

    # Start from a clean slate: remove leftovers from a previous run.
    if os.path.isdir(output_folder):
        shutil.rmtree(output_folder)
    if os.path.exists("output.zip"):
        os.remove("output.zip")
    os.mkdir(output_folder)

    deepgram = DeepgramClient(API_KEY)

    with open(AUDIO_FILE, "rb") as file:
        buffer_data = file.read()

    payload: FileSource = {
        "buffer": buffer_data,
    }

    # Configure Deepgram options for audio analysis.
    options = PrerecordedOptions(
        model="nova-2",
        smart_format=True,
        filler_words=True,
        diarize=True
    )

    progress(0.20)

    try:
        response = deepgram.listen.prerecorded.v("1").transcribe_file(payload, options)
    except Exception as e:
        # Nothing below can work without a transcript; surface the failure
        # instead of continuing with an unbound `response` (NameError).
        print(e)
        raise

    progress(0.30)

    audio = AudioSegment.from_file(AUDIO_FILE)
    data = response

    paragraphs = data['results']['channels'][0]['alternatives'][0]['paragraphs']['paragraphs']

    # Per-speaker metadata rows, keyed by the (possibly renamed) speaker id.
    csv_data_dict = dict()
    i = 1  # running clip index across all speakers

    progress(0.40)

    for paragraph in progress.tqdm(paragraphs, desc="Generating..."):
        sentences = paragraph['sentences']
        for text in sentences:

            # Sentence boundaries in ms, widened by a 5 ms buffer on each side.
            start_time_ms = math.floor(text['start']*1000)-5
            end_time_ms = math.ceil(text['end']*1000)+5
            duration_s = round(text['end']-text['start'],3)
            duration_ms = str(end_time_ms-start_time_ms).zfill(6)

            # Discard clips shorter than 2 seconds.
            if duration_s < 2:
                continue

            speaker_id = paragraph['speaker']

            # Rename speaker 10 BEFORE creating the folder / CSV bucket so the
            # folder name, CSV key and clip filename all agree. (Renaming after
            # bucket creation, as before, raised KeyError on the append below
            # and split the output between Speaker_10 and Speaker_Tayr names.)
            if speaker_id == 10:
                speaker_id = "Tayr"

            folder_path = join(output_folder, "Speaker_"+str(speaker_id))

            # First clip for this speaker: create the folder and the CSV header.
            if not os.path.isdir(folder_path):
                os.mkdir(folder_path)
                csv_data_dict[str(speaker_id)] = [["filename", "speaker", "text", "start_time", "end_time", "duration"]]

            # Slice the audio segment and export it into the speaker's folder.
            segment = audio[start_time_ms:end_time_ms]
            clip_name = f"{TAGS}_Speaker_{speaker_id}_i{str(i).zfill(3)}_d{duration_ms}.wav"
            segment.export(join(folder_path, clip_name), format="wav")

            # CSV rows reference a "wavs/" path — presumably the layout the
            # downstream consumer expects; TODO confirm.
            file_name = join("wavs", clip_name)
            csv_data_dict[str(speaker_id)].append([file_name, speaker_id, text['text'], start_time_ms, end_time_ms, duration_s])

            i += 1

    progress(0.80)

    # Write one pipe-delimited metadata file per speaker into that speaker's folder.
    for key, value in csv_data_dict.items():
        speaker_folder = join(output_folder, f"Speaker_{key}")
        csv_filename = join(speaker_folder, f"Speaker_{key}_{TAGS}_output.txt")
        write_csv_file(csv_filename, value)

    progress(0.90)

    shutil.make_archive("output", 'zip', output_folder)

    progress(1.00)
    return "output.zip"