# Source: Hugging Face Space "poemsforaphrodite/transcribe" (Space status: Sleeping)
# File size: 2,341 bytes
# Revision: 3589cd1

import os
import json
import torch
from tqdm import tqdm  # Progress bar
import whisper

def transcribe_audio(audio_path, model):
    """
    Run an already-loaded Whisper model on one audio file and return its text.

    Args:
        audio_path (str): Location of the audio file to transcribe.
        model (whisper.Whisper): Whisper model that has already been loaded.

    Returns:
        str: The transcribed text with leading and trailing whitespace removed.
    """
    # The transcription result is indexed by "text" to get the full transcript.
    raw_text = model.transcribe(audio_path)["text"]
    return raw_text.strip()

# File extensions (lower-case) that are treated as audio input.
_AUDIO_EXTENSIONS = (".wav", ".mp3", ".m4a", ".flac", ".aac")


def _find_audio_files(directory):
    """Return the sorted paths of every audio file under *directory*, recursively."""
    # Sorting makes the processing order (and the JSON key order) reproducible;
    # os.walk yields entries in an arbitrary, filesystem-dependent order.
    return sorted(
        os.path.join(root, file)
        for root, _dirs, files in os.walk(directory)
        for file in files
        if file.lower().endswith(_AUDIO_EXTENSIONS)
    )


def _transcription_keys(audio_files, directory):
    """Map each audio path to the unique key it is stored under in the output."""
    name_counts = {}
    for path in audio_files:
        name = os.path.basename(path)
        name_counts[name] = name_counts.get(name, 0) + 1

    keys = {}
    for path in audio_files:
        name = os.path.basename(path)
        # A bare file name is only safe as a key when no other file shares it;
        # otherwise fall back to the path relative to the scanned directory so
        # same-named files in different subdirectories do not overwrite each other.
        keys[path] = name if name_counts[name] == 1 else os.path.relpath(path, directory)
    return keys


def transcribe_all_audios(directory, output_json, model_size="large"):
    """
    Transcribe all audio files in a directory and save the transcriptions to a JSON file.

    The directory is searched recursively for files ending in .wav, .mp3, .m4a,
    .flac or .aac (case-insensitive). Files are processed in sorted path order.
    Each transcription is stored under the file's base name; when several files
    share a base name, each of them is stored under its path relative to
    ``directory`` instead, so no result is overwritten.

    If no audio file is found, the Whisper model is not loaded and an empty
    JSON object is written.

    Args:
        directory (str): Directory containing audio files.
        output_json (str): Path to the output JSON file.
        model_size (str): Size of the Whisper model to load. Options: tiny, base, small, medium, large.
    """
    transcriptions = {}

    # Discover the work first so the model is only loaded when it is needed.
    audio_files = _find_audio_files(directory)

    if audio_files:
        # Check if CUDA is available
        device = "cuda" if torch.cuda.is_available() else "cpu"
        print(f"Using device: {device}")

        # Load the Whisper model
        print(f"Loading Whisper model '{model_size}'...")
        model = whisper.load_model(model_size, device=device)
        print("Model loaded successfully.")

        keys = _transcription_keys(audio_files, directory)
        for file_path in tqdm(audio_files, desc="Transcribing Audio files"):
            # tqdm.write prints the message without breaking the progress bar.
            tqdm.write(f"Transcribing: {file_path}")
            transcriptions[keys[file_path]] = transcribe_audio(file_path, model)
    else:
        print(f"No audio files found in '{directory}'.")

    # Save the transcriptions to a JSON file
    with open(output_json, "w", encoding='utf-8') as f:
        json.dump(transcriptions, f, ensure_ascii=False, indent=4)

    print(f"Transcriptions saved to {output_json}")

if __name__ == "__main__":
    # Script entry point: transcribe everything under the "wav" directory with
    # the "large" model and write the results to transcriptions.json.
    transcribe_all_audios(
        directory="wav",
        output_json="transcriptions.json",
        model_size="large",
    )