Tingusto committed
Commit 3cdeba6 · verified · 1 Parent(s): a8f9b8a

Uploaded initial demo

.gitattributes CHANGED
@@ -33,3 +33,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ audio-test/harvard.wav filter=lfs diff=lfs merge=lfs -text
+ audio-test/jackhammer.wav filter=lfs diff=lfs merge=lfs -text
+ audio-test/meeting-clip1.wav filter=lfs diff=lfs merge=lfs -text
+ audio-test/meeting-clip2.wav filter=lfs diff=lfs merge=lfs -text
.gradio/cached_examples/16/log.csv ADDED
@@ -0,0 +1,12 @@
+ Transcription, timestamp
+ "SPEAKER_00:
+ The stale smell of old beer lingers.It takes heat to bring out the odor.A cold dipRestores health and zest.A salt pickle tastes fine with ham.Tacos El Pastor are my favorite.A zestful food is the hot cross bun.", 2025-03-09 11:24:19.265900
+ "SPEAKER_00:
+ The stale smell of old beer lingers.",2025-03-09 11:25:11.512019
+ "SPEAKER_00:
+ of the research company we contracted to carry out the work.The Miss Reyes will arrive at 11.30.So I plan to break at about 11.15 to give her time toset up.It may also mean that we need to interrupt the first few agenda items,we'll come back to those.Um,And lastly, I'd like to leave a little bit of time under any other business.to discuss whatever might come out of the presentation.Okay?Item one.relocationand plans for flexible working.Now, as you know, Paul and his teamI've been working on plans to extendflexible working hours across the company.So Paul, perhaps I can begin by asking you to fill us in on your progress.Sure.Thanks.",2025-03-09 11:37:29.111311
+ "SPEAKER_00:
+ Thank you.Well,From my point of view, what Paul is proposing sounds fine.I am a bit concerned about working with a system of core hours and then flexible hoursBut I think we all need time to read through Paul's proposal in more detail.before discussing it any further?
+
+ SPEAKER_01:
+ Okay, that sounds reasonable.",2025-03-09 11:41:54.192049
audio-test/harvard.wav ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:948297f29790ae1fae0d081a28f96fd47fcec03c365ad5d3a20efb5fc1b90184
+ size 3238076
audio-test/jackhammer.wav ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a9484bb0ec40468683ebe6a064f6b4b579bfa800ac8b360a15ae3d225c5037e2
+ size 600204
audio-test/meeting-clip1.wav ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ab06d0dd823b6cf40e2b5f2ee79e25a8231620348fc7538b2cb9c8a2a590f16a
+ size 9534030
audio-test/meeting-clip2.wav ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e42a46651b2f3b464027327e1f3dd1336b3068f2fdd599235dbae1767bc2cb82
+ size 3528078
demo.py ADDED
@@ -0,0 +1,70 @@
+ import gradio as gr
+ from pyscript import Transcriptor
+ import os
+
+ transcriptor = Transcriptor(model_size="small")
+
+ demo_dir = "audio-test"
+ demo_files = {
+     "Short Sample": os.path.join(demo_dir, "harvard.wav"),
+     "Noise Sample": os.path.join(demo_dir, "jackhammer.wav"),
+     "Meeting Sample 1 person": os.path.join(demo_dir, "meeting-clip1.wav"),
+     "Meeting Sample 2 people": os.path.join(demo_dir, "meeting-clip2.wav"),
+ }
+
+ def process_audio(audio_path, enhancement):
+     if audio_path is None:
+         raise ValueError("Please provide an audio file.")
+
+     transcription = transcriptor.transcribe_audio(audio_path, enhanced=enhancement)
+     return str(transcription)
+
+ def create_download(text):
+     os.makedirs(".temp", exist_ok=True)
+     temp_file = ".temp/transcription.txt"
+     with open(temp_file, "w", encoding="utf-8") as f:
+         f.write(text)
+     return temp_file
+
+ interface = gr.Interface(
+     fn=process_audio,
+     inputs=[
+         gr.Audio(sources=["upload", "microphone"], type="filepath", label="Audio"),
+         gr.Radio(choices=[True, False], value=False, label="Audio Enhancement", info="Enable for noisy audio")
+     ],
+     outputs=gr.Textbox(
+         label="Complete Transcription",
+         interactive=True,
+         info="You can edit the transcription here"
+     ),
+     title="🎙️ Audio Transcription Tool",
+     description="""
+     ⚠️ **Performance Notice**: This application performs intensive computations that are optimized for GPU usage.
+     If running on CPU only, transcription may take significantly longer (5-10x slower). For the best experience,
+     using a system with a GPU is recommended.
+
+     Upload an audio file or record directly to get a transcription.
+     """,
+     examples=[
+         [demo_files["Short Sample"], False],
+         [demo_files["Noise Sample"], True],
+         [demo_files["Meeting Sample 1 person"], False],
+         [demo_files["Meeting Sample 2 people"], False],
+     ],
+     cache_examples=True,
+     cache_mode="eager",
+     allow_flagging="never"
+ )
+
+ with gr.Blocks() as demo:
+     interface.render()
+     with gr.Column():
+         download_button = gr.Button("📥 Download Edited Transcription")
+         file_output = gr.File(label="Download Transcription")
+
+     textbox = interface.output_components[0]
+
+     download_button.click(fn=create_download, inputs=[textbox], outputs=[file_output])
+
+ if __name__ == "__main__":
+     demo.launch(share=False)
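
The demo wraps process_audio in a gr.Interface and then re-renders that interface inside gr.Blocks so a download button can read the editable output textbox. A minimal sketch of the same transcription path without the UI, assuming the pyscript package from this commit is importable and HF_TOKEN is set in .env:

    from pyscript import Transcriptor

    transcriptor = Transcriptor(model_size="small")  # same size as demo.py uses
    transcription = transcriptor.transcribe_audio("audio-test/harvard.wav", enhanced=False)
    print(transcription)               # speaker-labelled text, as cached in log.csv above
    transcription.save("transcripts")  # writes transcripts/harvard_transcript.txt
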
pyscript/__init__.py ADDED
@@ -0,0 +1,3 @@
+ from .transcriptor import Transcriptor
+ from .audio_processing import AudioProcessor
+ __all__ = ["Transcriptor", "AudioProcessor"]
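
The package re-exports only the two classes demo.py relies on; the recording helpers in pyscript/audio_recording.py are not re-exported and have to be imported from their module directly. A quick check, assuming pyscript is on the import path:

    import pyscript
    from pyscript import Transcriptor, AudioProcessor

    print(pyscript.__all__)  # ['Transcriptor', 'AudioProcessor']
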
pyscript/audio_processing.py ADDED
@@ -0,0 +1,238 @@
+ import os
+ import librosa
+ import numpy as np
+ from tabulate import tabulate
+ import soundfile as sf
+ import scipy.ndimage
+ import itertools
+ from tqdm import tqdm
+ import torch
+ import torchaudio
+
+ class AudioProcessor:
+
+     def __init__(self, audio_file):
+         self.path = audio_file
+         self.name = os.path.splitext(os.path.basename(audio_file))[0]
+         self.format = os.path.splitext(os.path.basename(audio_file))[1]
+         self.duration = librosa.get_duration(path=audio_file)
+         self.sample_rate = librosa.get_samplerate(audio_file)
+         self.changes = []
+         self.optimized_params = None
+         self.load_details()
+
+     # File information methods
+     def load_details(self):
+         """Save the attributes of the audio file."""
+         data = [
+             ["File Name", self.name],
+             ["File Format", self.format],
+             ["Duration", f"{self.duration} seconds"],
+             ["Sample Rate", f"{self.sample_rate} Hz"]
+         ]
+         table = tabulate(data, headers=["Attribute", "Value"], tablefmt="outline")
+         self.changes.append(table)
+         return table
+
+     def display_details(self):
+         """Display the details of the audio file."""
+         print(self.changes[-1])
+
+     def display_changes(self):
+         """Display the changes made to the audio file side by side."""
+         self._clean_duplicates_changes()
+         if len(self.changes) == 1:
+             self.display_details()
+         else:
+             table1 = self.changes[0].split('\n')
+             table2 = self.changes[-1].split('\n')
+
+             combined_table = []
+             for line1, line2 in zip(table1, table2):
+                 combined_table.append([line1, '===>', line2])
+
+             print(tabulate(combined_table, tablefmt="plain"))
+
+     def _clean_duplicates_changes(self):
+         """Remove duplicate consecutive changes from the audio file."""
+         self.changes = [change for i, change in enumerate(self.changes)
+                         if i == 0 or change != self.changes[i-1]]
+
+     # Audio processing methods
+     def load_as_array(self, sample_rate: int = 16000) -> np.ndarray:
+         """
+         Load an audio file and convert it into a NumPy array.
+
+         Parameters
+         ----------
+         sample_rate : int, optional
+             The sample rate to which the audio will be resampled (default is 16000 Hz).
+
+         Returns
+         -------
+         np.ndarray
+             A NumPy array containing the audio data.
+         """
+         try:
+             audio, sr = librosa.load(self.path, sr=sample_rate)
+             self.sample_rate = sr
+             return audio
+         except Exception as e:
+             raise RuntimeError(f"Failed to load audio file: {e}")
+
+     def resample_wav(self) -> str:
+         output_path = os.path.join('resampled_files', f'{self.name}.wav')
+         try:
+             audio, sr = librosa.load(self.path)
+             resampled_audio = librosa.resample(y=audio, orig_sr=sr, target_sr=16000)
+             os.makedirs(os.path.dirname(output_path), exist_ok=True)
+             sf.write(output_path, resampled_audio, 16000)
+             self._update_file_info(output_path)
+             return output_path
+         except Exception as e:
+             raise RuntimeError(f"Failed to resample audio file: {e}")
+
+     def convert_to_wav(self):
+         """
+         Converts an audio file to WAV format.
+
+         Returns
+         -------
+         str
+             The path to the converted audio file.
+         """
+         output_path = os.path.join('converted_files', f'{self.name}.wav')
+         try:
+             os.makedirs(os.path.dirname(output_path), exist_ok=True)
+             audio, sr = librosa.load(self.path, sr=16000)
+             sf.write(output_path, audio, 16000)
+             self._update_file_info(output_path)
+             return output_path
+         except Exception as e:
+             raise RuntimeError(f"Failed to convert audio file to WAV: {e}")
+
+     def enhance_audio(self, noise_reduce_strength=0.5, voice_enhance_strength=1.5, volume_boost=1.2):
+         """
+         Enhance audio quality by reducing noise and clarifying voices.
+         """
+         try:
+             y, sr = librosa.load(self.path, sr=16000)
+             y_enhanced = self._enhance_audio_sample(y, noise_reduce_strength, voice_enhance_strength, volume_boost)
+
+             output_path = os.path.join('enhanced_files', f'{self.name}_enhanced.wav')
+             os.makedirs(os.path.dirname(output_path), exist_ok=True)
+             sf.write(output_path, y_enhanced, sr)
+
+             self._update_file_info(output_path)
+             return output_path
+         except Exception as e:
+             raise RuntimeError(f"Failed to enhance audio: {e}")
+
+     def _compute_spectral_contrast(self, y, sr, n_bands=6, fmin=200.0, quantile=0.02, hop_length=512):
+         """
+         Compute spectral contrast using librosa.
+         Higher contrast generally indicates clearer speech separation from background.
+         """
+         S = np.abs(librosa.stft(y, hop_length=hop_length))
+         contrast = librosa.feature.spectral_contrast(
+             S=S,
+             sr=sr,
+             n_bands=n_bands,
+             fmin=fmin,
+             quantile=quantile,
+             hop_length=hop_length
+         )
+         return np.mean(contrast)
+
+     def optimize_enhancement_parameters(self, step=0.25, max_iterations=50, sample_duration=30):
+         """
+         Find optimal parameters for audio enhancement using grid search on a sample.
+         """
+         device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+         y_orig, sr = librosa.load(self.path, duration=sample_duration)
+         y_orig_tensor = torch.tensor(y_orig, device=device)
+
+         param_ranges = [
+             np.arange(0.25, 1.5, step),  # noise_reduce_strength
+             np.arange(1.0, 3.0, step),   # voice_enhance_strength
+             np.arange(1.0, 2.0, step)    # volume_boost
+         ]
+
+         best_score = float('-inf')
+         best_params = None
+
+         total_iterations = min(max_iterations, len(list(itertools.product(*param_ranges))))
+
+         for params in tqdm(itertools.islice(itertools.product(*param_ranges), max_iterations),
+                            total=total_iterations,
+                            desc="Searching for optimal parameters"):
+             y_enhanced = self._enhance_audio_sample(y_orig, *params)
+             y_enhanced_tensor = torch.tensor(y_enhanced, device=device)
+
+             # Correlation between original and enhanced audio
+             min_length = min(len(y_orig_tensor), len(y_enhanced_tensor))
+             y_orig_trimmed = y_orig_tensor[:min_length]
+             y_enhanced_trimmed = y_enhanced_tensor[:min_length]
+             correlation = torch.corrcoef(torch.stack([y_orig_trimmed, y_enhanced_trimmed]))[0, 1].item()
+
+             # Spectral contrast improvement
+             contrast_orig = self._compute_spectral_contrast(y_orig, sr)
+             contrast_enhanced = self._compute_spectral_contrast(y_enhanced, sr)
+             contrast_improvement = contrast_enhanced - contrast_orig
+
+             score = (0.3 * correlation) + (0.7 * contrast_improvement)
+
+             if score > best_score:
+                 best_score = score
+                 best_params = params
+
+         self.optimized_params = best_params
+         return best_params
+
+     def _enhance_audio_sample(self, y, noise_reduce_strength=0.5, voice_enhance_strength=1.5, volume_boost=1.2):
+         """
+         Enhance an audio sample by reducing noise and enhancing voice clarity.
+
+         Parameters
+         ----------
+         y : np.ndarray
+             Input audio signal
+         noise_reduce_strength : float
+             Strength of noise reduction (default: 0.5)
+         voice_enhance_strength : float
+             Strength of voice enhancement (default: 1.5)
+         volume_boost : float
+             Volume boost factor (default: 1.2)
+
+         Returns
+         -------
+         np.ndarray
+             Enhanced audio signal
+         """
+         # STFT
+         S = librosa.stft(y, n_fft=2048)
+         S_mag, S_phase = np.abs(S), np.angle(S)
+         S_filtered = scipy.ndimage.median_filter(S_mag, size=(1, 31))
+
+         # Noise reduction mask
+         mask = np.clip((S_mag - S_filtered) / (S_mag + 1e-10), 0, 1) ** noise_reduce_strength
+         S_denoised = S_mag * mask * np.exp(1j * S_phase)
+
+         # Inverse STFT
+         y_denoised = librosa.istft(S_denoised)
+
+         # Harmonic-percussive separation and enhancement
+         y_harmonic, y_percussive = librosa.effects.hpss(y_denoised)
+         y_enhanced = (y_harmonic * voice_enhance_strength + y_percussive) * volume_boost
+
+         return librosa.util.normalize(y_enhanced, norm=np.inf, threshold=1.0)
+
+     # Helper method
+     def _update_file_info(self, new_path):
+         """Update file information after processing."""
+         self.path = new_path
+         self.sample_rate = librosa.get_samplerate(new_path)
+         self.format = os.path.splitext(new_path)[1]
+         self.duration = librosa.get_duration(path=new_path)
+         self.load_details()
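
AudioProcessor can also be driven on its own for the preprocessing that Transcriptor.process_audio performs (WAV conversion, 16 kHz resampling, optional enhancement). A short sketch against one of the bundled clips; the step and max_iterations values are arbitrary choices to keep the grid search quick:

    from pyscript import AudioProcessor

    processor = AudioProcessor("audio-test/jackhammer.wav")
    processor.display_details()  # name, format, duration, sample rate

    params = processor.optimize_enhancement_parameters(step=0.5, max_iterations=10)
    enhanced_path = processor.enhance_audio(*params)  # noise_reduce, voice_enhance, volume_boost
    processor.display_changes()  # before/after attribute tables side by side
    print(enhanced_path)         # enhanced_files/jackhammer_enhanced.wav
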
pyscript/audio_recording.py ADDED
@@ -0,0 +1,77 @@
+ import speech_recognition as sr
+ import os
+ import datetime
+ from termcolor import colored
+ from tabulate import tabulate
+
+ def micro_recording(save_folder_path: str = "audio_files", file_name: str = None, device_index: int = 0) -> str:
+     """Records audio from a microphone and saves it to a designated file."""
+     r = sr.Recognizer()
+     mic = sr.Microphone(device_index=device_index)
+
+     print_colored_separator("Starting microphone recording...", "green")
+
+     with mic as source:
+         print_colored("Recording...", "yellow")
+         audio = r.listen(source)
+         print_colored("Recording finished.", "green")
+
+     saved_path = save_audio_file(audio, save_folder_path, file_name)
+
+     print_colored_separator(f"Audio file saved to: {saved_path}", "green")
+     return saved_path
+
+ def check_input_device(test_duration: int = 1) -> dict:
+     """Checks the available microphone devices."""
+     devices = sr.Microphone.list_microphone_names()
+     available_devices, non_working_devices = [], []
+
+     for i, device in enumerate(devices):
+         try:
+             with sr.Microphone(device_index=i) as source:
+                 sr.Recognizer().listen(source, timeout=test_duration)
+                 available_devices.append(device)
+         except sr.WaitTimeoutError:
+             non_working_devices.append(device)
+         except Exception as e:
+             print(f"An error occurred while testing device {device}: {e}")
+
+     print_device_table("Available Devices", available_devices)
+     print_device_table("Non-Working Devices", non_working_devices)
+
+     return {'available_devices': available_devices, 'non_working_devices': non_working_devices}
+
+ def save_audio_file(audio, save_folder_path: str, file_name: str = None) -> str:
+     """Saves the audio file to the specified path."""
+     os.makedirs(save_folder_path, exist_ok=True)
+
+     if not file_name:
+         timestamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
+         file_name = f"recording_{timestamp}.wav"
+     else:
+         file_name = f"{file_name}.wav"
+
+     saved_path = os.path.join(save_folder_path, file_name)
+
+     with open(saved_path, "wb") as f:
+         f.write(audio.get_wav_data())
+
+     print_colored("Saving audio file...", "yellow")
+     return saved_path
+
+ def print_colored(message: str, color: str):
+     """Prints a colored message."""
+     print(colored(message, color))
+
+ def print_colored_separator(message: str, color: str):
+     """Prints a colored message with separators."""
+     print("--------------------------------")
+     print_colored(message, color)
+     print("--------------------------------")
+
+ def print_device_table(title: str, devices: list):
+     """Prints a table of devices."""
+     device_table = [[i+1, device] for i, device in enumerate(devices)]
+     print(f"\n{title}:")
+     print(tabulate(device_table, headers=["Index", "Device Name"]))
+
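
audio_recording.py is a standalone helper around SpeechRecognition for capturing clips like those in audio-test/; it is not imported by demo.py. A usage sketch, assuming a working microphone, the PyAudio pin from requirements.txt, and a made-up file name:

    from pyscript.audio_recording import check_input_device, micro_recording

    devices = check_input_device()  # briefly probes each input device
    path = micro_recording(save_folder_path="audio_files", file_name="my-test-clip")
    print(path)                     # audio_files/my-test-clip.wav
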
pyscript/transcription.py ADDED
@@ -0,0 +1,110 @@
+ import os
+ from itertools import cycle
+ from termcolor import colored
+
+ class Transcription:
+     """
+     A class for storing and saving transcriptions.
+
+     Attributes:
+     -----------
+     audio_file_path : str
+         The path to the audio file that was transcribed.
+     filename : str
+         The name of the audio file, without the extension.
+     transcriptions : list[str]
+         A list of tuples containing the speaker's label and their corresponding transcription, grouped by speaker.
+     speaker_names : dict
+         A dictionary mapping speaker labels to their assigned names.
+     segments : list
+         A list of segments from diarization.
+
+     """
+
+     def __init__(self, audio_file_path: str, transcriptions: list[str], segments: list[str]):
+         self.audio_file_path = audio_file_path
+         self.filename = os.path.splitext(os.path.basename(audio_file_path))[0]
+         self.transcriptions = self.group_by_speaker(transcriptions)
+         self.speaker_names = {}
+         self.segments = segments
+         self.colors = cycle(['red', 'green', 'blue', 'magenta', 'cyan', 'yellow'])
+
+     def __repr__(self) -> str:
+         result = []
+         for speaker, text in self.transcriptions:
+             speaker_name = self.speaker_names.get(speaker, speaker)
+             result.append(f"{speaker_name}:\n{text}")
+         return "\n\n".join(result)
+
+     def group_by_speaker(self, transcriptions: list[str]) -> list[str]:
+         """
+         Groups transcriptions by speaker.
+
+         Parameters
+         ----------
+         transcriptions : list[str]
+             A list of tuples containing the speaker's label and their corresponding transcription.
+
+         Returns
+         -------
+         list[str]
+             A list of tuples containing the speaker's label and their corresponding transcription, grouped by speaker.
+         """
+         speaker_transcriptions = []
+         previous_speaker = transcriptions[0][0]
+         speaker_text = ""
+         for speaker, text in transcriptions:
+             if speaker == previous_speaker:
+                 speaker_text += text
+             else:
+                 speaker_transcriptions.append((previous_speaker, speaker_text))
+                 speaker_text = text
+             previous_speaker = speaker
+         speaker_transcriptions.append((previous_speaker, speaker_text))
+         return speaker_transcriptions
+
+     def save(self, directory: str = "transcripts") -> None:
+         """
+         Saves the transcription to a text file.
+
+         Parameters
+         ----------
+         directory : str, optional
+             The directory to save the transcription to. Defaults to "transcripts".
+         """
+         if not self.transcriptions:
+             raise ValueError("No transcriptions available to save.")
+
+         os.makedirs(directory, exist_ok=True)
+         saving_path = os.path.join(directory, f"{self.filename}_transcript.txt")
+
+         with open(saving_path, 'w', encoding='utf-8') as f:
+             for speaker, text in self.transcriptions:
+                 if text:
+                     speaker_name = self.speaker_names.get(speaker, speaker)
+                     f.write(f"{speaker_name}: {text}\n")
+
+         print(f"Transcription saved to {saving_path}")
+
+     def get_name_speakers(self) -> None:
+         """
+         Interactively assigns names to speakers in the transcriptions and retrieves the name of the speaker.
+         Provides a preview of one sentence for each speaker to help recognize who is speaking.
+         """
+         for speaker, full_text in self.transcriptions:
+             if speaker in self.speaker_names:
+                 continue
+
+             preview = full_text.split('.')[0] + '.'
+             print(f"\nCurrent speaker: {speaker}")
+             print(f"Preview: {preview}")
+
+             new_name = input(f"Enter a name for {speaker} (or press Enter to skip): ").strip()
+             if new_name:
+                 self.speaker_names[speaker] = new_name
+                 print(f"Speaker {speaker} renamed to {new_name}")
+             else:
+                 print(f"Skipped renaming {speaker}")
+
+         print("\nSpeaker naming completed.")
+         print(f"Updated speaker names: {self.speaker_names}")
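
group_by_speaker merges consecutive turns from the same speaker before anything is printed or saved, which is why the cached log.csv shows one block per speaker change. A small illustration with made-up turns and an empty segments list:

    from pyscript.transcription import Transcription

    turns = [("SPEAKER_00", "Hello."), ("SPEAKER_00", " Thanks for joining."),
             ("SPEAKER_01", "Glad to be here.")]
    t = Transcription("audio-test/meeting-clip2.wav", turns, segments=[])
    print(t)
    # SPEAKER_00:
    # Hello. Thanks for joining.
    #
    # SPEAKER_01:
    # Glad to be here.
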
pyscript/transcriptor.py ADDED
@@ -0,0 +1,242 @@
+ import os
+ from dotenv import load_dotenv
+ import whisper
+ from pyannote.audio import Pipeline
+ import torch
+ from tqdm import tqdm
+ from time import time
+ from transformers import pipeline
+ from .transcription import Transcription
+ from .audio_processing import AudioProcessor
+ import io
+ from contextlib import redirect_stdout
+ import sys
+
+ load_dotenv()
+
+ class Transcriptor:
+     """
+     A class for transcribing and diarizing audio files.
+
+     This class uses the Whisper model for transcription and the PyAnnote speaker diarization pipeline for speaker identification.
+
+     Attributes
+     ----------
+     model_size : str
+         The size of the Whisper model to use for transcription. Available options are:
+         - 'tiny': Fastest, lowest accuracy
+         - 'base': Fast, good accuracy for many use cases
+         - 'small': Balanced speed and accuracy
+         - 'medium': High accuracy, slower than smaller models
+         - 'large-v3': Latest and most accurate version of the large model
+         - 'large-v3-turbo': Optimized version of the large-v3 model for faster processing
+     model : whisper.model.Whisper
+         The Whisper model for transcription.
+     pipeline : pyannote.audio.pipelines.SpeakerDiarization
+         The PyAnnote speaker diarization pipeline.
+
+     Usage:
+     >>> transcript = Transcriptor(model_size="large-v3")
+     >>> transcription = transcript.transcribe_audio("/path/to/audio.wav")
+     >>> transcription.get_name_speakers()
+     >>> transcription.save("/path/to/transcripts")
+
+     Note:
+     Larger models, especially 'large-v3', provide higher accuracy but require more
+     computational resources and may be slower to process audio.
+     """
+
+     def __init__(self, model_size: str = "base"):
+         self.model_size = model_size
+         self.HF_TOKEN = os.getenv("HF_TOKEN")
+         if not self.HF_TOKEN:
+             raise ValueError("HF_TOKEN not found. Please store token in .env")
+         self._setup()
+
+     def _setup(self):
+         """Initialize the Whisper model and diarization pipeline."""
+         self.device = "cuda" if torch.cuda.is_available() else "cpu"
+         print(f"Using device: {self.device}")
+         print("Initializing Whisper model...")
+         if self.model_size == "large-v3-turbo":
+             self.model = pipeline(
+                 task="automatic-speech-recognition",
+                 model="ylacombe/whisper-large-v3-turbo",
+                 chunk_length_s=30,
+                 device=self.device,
+             )
+         else:
+             self.model = whisper.load_model(self.model_size, device=self.device)
+         print("Building diarization pipeline...")
+         self.pipeline = Pipeline.from_pretrained(
+             "pyannote/speaker-diarization-3.1",
+             use_auth_token=self.HF_TOKEN
+         ).to(torch.device(self.device))
+         print("Setup completed successfully!")
+
+     def transcribe_audio(self, audio_file_path: str, enhanced: bool = False, buffer_logs: bool = False):
+         """
+         Transcribe an audio file.
+
+         Parameters:
+         -----------
+         audio_file_path : str
+             Path to the audio file to be transcribed.
+         enhanced : bool, optional
+             If True, applies audio enhancement techniques to improve transcription quality.
+         buffer_logs : bool, optional
+             If True, captures logs and returns them with the transcription. If False, prints to terminal.
+
+         Returns:
+         --------
+         Union[Transcription, Tuple[Transcription, str]]
+             Returns either just the Transcription object (if buffer_logs=False)
+             or a tuple of (Transcription, logs string) if buffer_logs=True
+         """
+         if buffer_logs:
+             logs_buffer = io.StringIO()
+             with redirect_stdout(logs_buffer):
+                 transcription = self._perform_transcription(audio_file_path, enhanced)
+             logs = logs_buffer.getvalue()
+             return transcription, logs
+         else:
+             transcription = self._perform_transcription(audio_file_path, enhanced)
+             return transcription
+
+     def _perform_transcription(self, audio_file_path: str, enhanced: bool = False):
+         """Internal method to handle the actual transcription process."""
+         try:
+             print(f"Received audio_file_path: {audio_file_path}")
+             print(f"Type of audio_file_path: {type(audio_file_path)}")
+
+             if audio_file_path is None:
+                 raise ValueError("No audio file was uploaded. Please upload an audio file.")
+
+             if not isinstance(audio_file_path, (str, bytes, os.PathLike)):
+                 raise ValueError(f"Invalid audio file path type: {type(audio_file_path)}")
+
+             if not os.path.exists(audio_file_path):
+                 raise FileNotFoundError(f"Audio file not found at path: {audio_file_path}")
+
+             print("Processing audio file...")
+             processed_audio = self.process_audio(audio_file_path, enhanced)
+             audio_file_path = processed_audio.path
+             audio, sr, duration = processed_audio.load_as_array(), processed_audio.sample_rate, processed_audio.duration
+
+             print("Diarization in progress...")
+             start_time = time()
+             diarization = self.perform_diarization(audio_file_path)
+             print(f"Diarization completed in {time() - start_time:.2f} seconds.")
+             segments = list(diarization.itertracks(yield_label=True))
+
+             transcriptions = self.transcribe_segments(audio, sr, duration, segments)
+             return Transcription(audio_file_path, transcriptions, segments)
+
+         except Exception as e:
+             print(f"Error occurred: {str(e)}")
+             raise RuntimeError(f"Failed to process the audio file: {str(e)}")
+
+     def process_audio(self, audio_file_path: str, enhanced: bool = False) -> AudioProcessor:
+         """
+         Process the audio file to ensure it meets the requirements for transcription.
+
+         Parameters:
+         -----------
+         audio_file_path : str
+             Path to the audio file to be processed.
+         enhanced : bool, optional
+             If True, applies audio enhancement techniques to improve audio quality.
+             This includes optimizing noise reduction, voice enhancement, and volume boosting
+             parameters based on the audio characteristics.
+
+         Returns:
+         --------
+         AudioProcessor
+             An AudioProcessor object containing the processed audio file.
+         """
+         processed_audio = AudioProcessor(audio_file_path)
+         if processed_audio.format != ".wav":
+             processed_audio.convert_to_wav()
+
+         if processed_audio.sample_rate != 16000:
+             processed_audio.resample_wav()
+
+         if enhanced:
+             parameters = processed_audio.optimize_enhancement_parameters()
+             processed_audio.enhance_audio(noise_reduce_strength=parameters[0],
+                                           voice_enhance_strength=parameters[1],
+                                           volume_boost=parameters[2])
+
+         processed_audio.display_changes()
+         return processed_audio
+
+     def perform_diarization(self, audio_file_path: str):
+         """Perform speaker diarization on the audio file."""
+         with torch.no_grad():
+             return self.pipeline(audio_file_path)
+
+     def transcribe_segments(self, audio, sr, duration, segments):
+         """Transcribe audio segments based on diarization."""
+         transcriptions = []
+
+         audio_segments = []
+         for turn, _, speaker in segments:
+             start = turn.start
+             end = min(turn.end, duration)
+             segment = audio[int(start * sr):int(end * sr)]
+             audio_segments.append((segment, speaker))
+
+         with tqdm(
+             total=len(audio_segments),
+             desc="Transcribing segments",
+             unit="segment",
+             ncols=100,
+             colour="green",
+             file=sys.stdout,
+             mininterval=0.1,
+             dynamic_ncols=True,
+             leave=True
+         ) as pbar:
+             if self.device == "cuda":
+                 try:
+                     total_memory = torch.cuda.get_device_properties(0).total_memory
+                     reserved_memory = torch.cuda.memory_reserved(0)
+                     allocated_memory = torch.cuda.memory_allocated(0)
+                     free_memory = total_memory - reserved_memory - allocated_memory
+
+                     memory_per_sample = 1024 * 1024 * 1024  # 1GB
+                     batch_size = max(1, min(4, int((free_memory * 0.7) // memory_per_sample)))
+                     print(f"Using batch size of {batch_size} for GPU processing")
+
+                     for i in range(0, len(audio_segments), batch_size):
+                         try:
+                             batch = audio_segments[i:i + batch_size]
+                             torch.cuda.empty_cache()
+                             results = self.model([segment for segment, _ in batch])
+                             for (_, speaker), result in zip(batch, results):
+                                 transcriptions.append((speaker, result['text'].strip()))
+                             pbar.update(len(batch))
+                         except RuntimeError as e:
+                             if "out of memory" in str(e):
+                                 torch.cuda.empty_cache()
+                                 for segment, speaker in batch:
+                                     results = self.model([segment])
+                                     transcriptions.append((speaker, results[0]['text'].strip()))
+                                     pbar.update(0.5)
+                             else:
+                                 raise e
+                 except Exception as e:
+                     print(f"GPU processing failed: {str(e)}. Falling back to CPU processing...")
+                     self.model = self.model.to('cpu')
+                     self.device = 'cpu'
+             else:
+                 for segment, speaker in audio_segments:
+                     if self.model_size == "large-v3-turbo":
+                         result = self.model(segment)
+                         transcriptions.append((speaker, result['text'].strip()))
+                     else:
+                         result = self.model.transcribe(segment, fp16=self.device == "cuda")
+                         transcriptions.append((speaker, result['text'].strip()))
+                     pbar.update(1)
+
+         return transcriptions
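
transcribe_audio with buffer_logs=True is the variant a hosted front end would typically use: the processing and diarization messages are captured into a string instead of being printed. A short sketch, again assuming HF_TOKEN is available in .env:

    from pyscript import Transcriptor

    transcriptor = Transcriptor(model_size="base")  # loads Whisper and the pyannote pipeline once
    transcription, logs = transcriptor.transcribe_audio(
        "audio-test/meeting-clip1.wav", enhanced=False, buffer_logs=True
    )
    print(logs)           # "Processing audio file...", diarization timing, etc.
    print(transcription)  # speaker-grouped text; get_name_speakers() can rename the labels
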
requirements.txt ADDED
@@ -0,0 +1,17 @@
+ openai-whisper @ git+https://github.com/openai/whisper.git@ba3f3cd54b0e5b8ce1ab3de13e32122d0d5f98ab
+ pyannote.audio==3.3.1
+ librosa==0.10.2.post1
+ tqdm==4.66.5
+ python-dotenv==1.0.1
+ termcolor==2.4.0
+ pydub==0.25.1
+ SpeechRecognition==3.10.4
+ PyAudio==0.2.14
+ tabulate==0.9.0
+ soundfile==0.12.1
+ numpy==1.26.4
+ transformers==4.46.0
+ gradio==5.3.0
+ torch==2.4.1
+ torchaudio==2.4.1
+ python-multipart==0.0.12
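
A quick sanity check that the pinned stack imports cleanly after pip install -r requirements.txt, and whether a GPU is visible (the demo runs 5-10x slower on CPU per the notice in demo.py):

    import gradio, torch, whisper, librosa, pyannote.audio

    print(gradio.__version__)         # expected 5.3.0 per the pin above
    print(torch.__version__)          # expected 2.4.1
    print(torch.cuda.is_available())  # True means Whisper and the diarization pipeline can run on GPU
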