import os
from dotenv import load_dotenv
import whisper
from pyannote.audio import Pipeline
import torch
from tqdm import tqdm
from time import time
from transformers import pipeline
from .transcription import Transcription
from .audio_processing import AudioProcessor
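
# load_dotenv() below reads a local `.env` file (if one exists) into the environment,
# so HF_TOKEN can be supplied either there or as a Space/Gradio secret. A hypothetical
# `.env` entry would look like: HF_TOKEN=hf_xxxxxxxx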
load_dotenv()


class Transcriptor:
    """
    A class for transcribing and diarizing audio files.

    This class uses the Whisper model for transcription and the PyAnnote speaker
    diarization pipeline for speaker identification.

    Attributes
    ----------
    model_size : str
        The size of the Whisper model to use for transcription. Available options are:
        - 'tiny': Fastest, lowest accuracy
        - 'base': Fast, good accuracy for many use cases
        - 'small': Balanced speed and accuracy
        - 'medium': High accuracy, slower than smaller models
        - 'large': High accuracy, slower and more resource-intensive
        - 'large-v1': Improved version of the large model
        - 'large-v2': Further improved version of the large model
        - 'large-v3': Latest and most accurate version of the large model
        - 'large-v3-turbo': Optimized version of the large-v3 model for faster processing
    model : whisper.model.Whisper
        The Whisper model for transcription.
    pipeline : pyannote.audio.pipelines.SpeakerDiarization
        The PyAnnote speaker diarization pipeline.

    Usage:
    >>> transcriptor = Transcriptor(model_size="large-v3")
    >>> transcription = transcriptor.transcribe_audio("/path/to/audio.wav")
    >>> transcription.get_name_speakers()
    >>> transcription.save("/path/to/transcripts")

    Note:
    Larger models, especially 'large-v3', provide higher accuracy but require more
    computational resources and may be slower to process audio.
    """

    def __init__(self, model_size: str = "base"):
        self.model_size = model_size
        self.HF_TOKEN = os.environ.get("HF_TOKEN")
        if not self.HF_TOKEN:
            raise ValueError("HF_TOKEN not found. Please set it as a Gradio secret.")
        self._setup()

    def _setup(self):
        """Initialize the Whisper model and diarization pipeline."""
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        print("Initializing Whisper model...")
        if self.model_size == "large-v3-turbo":
            self.model = pipeline(
                task="automatic-speech-recognition",
                model="ylacombe/whisper-large-v3-turbo",
                chunk_length_s=30,
                device=self.device,
            )
        else:
            self.model = whisper.load_model(self.model_size, device=self.device)
        print("Building diarization pipeline...")
        self.pipeline = Pipeline.from_pretrained(
            "pyannote/speaker-diarization-3.1",
            use_auth_token=self.HF_TOKEN
        ).to(torch.device(self.device))
        print("Setup completed successfully!")

    def transcribe_audio(self, audio_file_path: str, enhanced: bool = False) -> Transcription:
        """
        Transcribe an audio file.

        Parameters:
        -----------
        audio_file_path : str
            Path to the audio file to be transcribed.
        enhanced : bool, optional
            If True, applies audio enhancement techniques to improve transcription quality.
            This includes noise reduction, voice enhancement, and volume boosting.

        Returns:
        --------
        Transcription
            A Transcription object containing the transcribed text and speaker segments.
        """
        try:
            print("Processing audio file...")
            processed_audio = self.process_audio(audio_file_path, enhanced)
            audio_file_path = processed_audio.path
            audio = processed_audio.load_as_array()
            sr = processed_audio.sample_rate
            duration = processed_audio.duration
            print("Diarization in progress...")
            start_time = time()
            diarization = self.perform_diarization(audio_file_path)
            print(f"Diarization completed in {time() - start_time:.2f} seconds.")
            segments = list(diarization.itertracks(yield_label=True))
            transcriptions = self.transcribe_segments(audio, sr, duration, segments)
            return Transcription(audio_file_path, transcriptions, segments)
        except Exception as e:
            raise RuntimeError(f"Failed to process the audio file: {e}") from e

    def process_audio(self, audio_file_path: str, enhanced: bool = False) -> AudioProcessor:
        """
        Process the audio file to ensure it meets the requirements for transcription.

        Parameters:
        -----------
        audio_file_path : str
            Path to the audio file to be processed.
        enhanced : bool, optional
            If True, applies audio enhancement techniques to improve audio quality.
            This includes optimizing noise reduction, voice enhancement, and volume
            boosting parameters based on the audio characteristics.

        Returns:
        --------
        AudioProcessor
            An AudioProcessor object containing the processed audio file.
        """
        processed_audio = AudioProcessor(audio_file_path)
        if processed_audio.format != ".wav":
            processed_audio.convert_to_wav()
        if processed_audio.sample_rate != 16000:
            processed_audio.resample_wav()
        if enhanced:
            parameters = processed_audio.optimize_enhancement_parameters()
            processed_audio.enhance_audio(
                noise_reduce_strength=parameters[0],
                voice_enhance_strength=parameters[1],
                volume_boost=parameters[2],
            )
        processed_audio.display_changes()
        return processed_audio

    def perform_diarization(self, audio_file_path: str):
        """Perform speaker diarization on the audio file."""
        with torch.no_grad():
            return self.pipeline(audio_file_path)

    def transcribe_segments(self, audio, sr, duration, segments):
        """Transcribe audio segments based on diarization."""
        transcriptions = []
        for turn, _, speaker in tqdm(segments, desc="Transcribing segments", unit="segment", ncols=100, colour="green"):
            # Clamp the segment end to the audio duration and slice the waveform for
            # this speaker turn (start/end are in seconds, audio is sampled at sr).
            start = turn.start
            end = min(turn.end, duration)
            segment = audio[int(start * sr):int(end * sr)]
            if self.model_size == "large-v3-turbo":
                result = self.model(segment)
            else:
                result = self.model.transcribe(segment, fp16=self.device == "cuda")
            transcriptions.append((speaker, result['text'].strip()))
        return transcriptions
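

# Minimal usage sketch mirroring the class docstring; the audio path and output
# directory are hypothetical placeholders. Because this module uses relative
# imports, run it as part of its package (e.g. `python -m <package>.<module>`)
# rather than as a standalone script.
if __name__ == "__main__":
    transcriptor = Transcriptor(model_size="base")
    transcription = transcriptor.transcribe_audio("/path/to/audio.wav", enhanced=False)
    transcription.get_name_speakers()  # assign readable names to the detected speakers
    transcription.save("/path/to/transcripts")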