Tingusto committed
Commit 3cdeba6 · verified · 1 Parent(s): a8f9b8a

Uploaded initial demo

.gitattributes CHANGED
@@ -33,3 +33,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ audio-test/harvard.wav filter=lfs diff=lfs merge=lfs -text
+ audio-test/jackhammer.wav filter=lfs diff=lfs merge=lfs -text
+ audio-test/meeting-clip1.wav filter=lfs diff=lfs merge=lfs -text
+ audio-test/meeting-clip2.wav filter=lfs diff=lfs merge=lfs -text
.gradio/cached_examples/16/log.csv ADDED
@@ -0,0 +1,12 @@
+ Transcription, timestamp
+ "SPEAKER_00:
+ The stale smell of old beer lingers.It takes heat to bring out the odor.A cold dipRestores health and zest.A salt pickle tastes fine with ham.Tacos El Pastor are my favorite.A zestful food is the hot cross bun.", 2025-03-09 11:24:19.265900
+ "SPEAKER_00:
+ The stale smell of old beer lingers.",2025-03-09 11:25:11.512019
+ "SPEAKER_00:
+ of the research company we contracted to carry out the work.The Miss Reyes will arrive at 11.30.So I plan to break at about 11.15 to give her time toset up.It may also mean that we need to interrupt the first few agenda items,we'll come back to those.Um,And lastly, I'd like to leave a little bit of time under any other business.to discuss whatever might come out of the presentation.Okay?Item one.relocationand plans for flexible working.Now, as you know, Paul and his teamI've been working on plans to extendflexible working hours across the company.So Paul, perhaps I can begin by asking you to fill us in on your progress.Sure.Thanks.",2025-03-09 11:37:29.111311
+ "SPEAKER_00:
+ Thank you.Well,From my point of view, what Paul is proposing sounds fine.I am a bit concerned about working with a system of core hours and then flexible hoursBut I think we all need time to read through Paul's proposal in more detail.before discussing it any further?
+
+ SPEAKER_01:
+ Okay, that sounds reasonable.",2025-03-09 11:41:54.192049
audio-test/harvard.wav ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:948297f29790ae1fae0d081a28f96fd47fcec03c365ad5d3a20efb5fc1b90184
+ size 3238076
audio-test/jackhammer.wav ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a9484bb0ec40468683ebe6a064f6b4b579bfa800ac8b360a15ae3d225c5037e2
+ size 600204
audio-test/meeting-clip1.wav ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ab06d0dd823b6cf40e2b5f2ee79e25a8231620348fc7538b2cb9c8a2a590f16a
+ size 9534030
audio-test/meeting-clip2.wav ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e42a46651b2f3b464027327e1f3dd1336b3068f2fdd599235dbae1767bc2cb82
+ size 3528078
demo.py ADDED
@@ -0,0 +1,70 @@
+ import gradio as gr
+ from pyscript import Transcriptor
+ import os
+
+ transcriptor = Transcriptor(model_size="small")
+
+ demo_dir = "audio-test"
+ demo_files = {
+     "Short Sample": os.path.join(demo_dir, "harvard.wav"),
+     "Noise Sample": os.path.join(demo_dir, "jackhammer.wav"),
+     "Meeting Sample 1 person": os.path.join(demo_dir, "meeting-clip1.wav"),
+     "Meeting Sample 2 people": os.path.join(demo_dir, "meeting-clip2.wav"),
+ }
+
+ def process_audio(audio_path, enhancement):
+     if audio_path is None:
+         raise ValueError("Please provide an audio file.")
+
+     transcription = transcriptor.transcribe_audio(audio_path, enhanced=enhancement)
+     return str(transcription)
+
+ def create_download(text):
+     os.makedirs(".temp", exist_ok=True)
+     temp_file = ".temp/transcription.txt"
+     with open(temp_file, "w", encoding="utf-8") as f:
+         f.write(text)
+     return temp_file
+
+ interface = gr.Interface(
+     fn=process_audio,
+     inputs=[
+         gr.Audio(sources=["upload", "microphone"], type="filepath", label="Audio"),
+         gr.Radio(choices=[True, False], value=False, label="Audio Enhancement", info="Enable for noisy audio")
+     ],
+     outputs=gr.Textbox(
+         label="Complete Transcription",
+         interactive=True,
+         info="You can edit the transcription here"
+     ),
+     title="🎙️ Audio Transcription Tool",
+     description="""
+     ⚠️ **Performance Notice**: This application performs intensive computations that are optimized for GPU usage.
+     If running on CPU only, transcription may take significantly longer (5-10x slower). For the best experience,
+     using a system with a GPU is recommended.
+
+     Upload an audio file or record directly to get a transcription.
+     """,
+     examples=[
+         [demo_files["Short Sample"], False],
+         [demo_files["Noise Sample"], True],
+         [demo_files["Meeting Sample 1 person"], False],
+         [demo_files["Meeting Sample 2 people"], False],
+     ],
+     cache_examples=True,
+     cache_mode="eager",
+     allow_flagging="never"
+ )
+
+ with gr.Blocks() as demo:
+     interface.render()
+     with gr.Column():
+         download_button = gr.Button("📥 Download Edited Transcription")
+         file_output = gr.File(label="Download Transcription")
+
+     textbox = interface.output_components[0]
+
+     download_button.click(fn=create_download, inputs=[textbox], outputs=[file_output])
+
+ if __name__ == "__main__":
+     demo.launch(share=False)
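
The demo wraps process_audio in a gr.Interface and then re-renders that interface inside gr.Blocks so a download button can read the editable output textbox. A minimal sketch of the same transcription path without the UI, assuming the pyscript package from this commit is importable and HF_TOKEN is set in .env:

    from pyscript import Transcriptor

    transcriptor = Transcriptor(model_size="small")  # same size as demo.py uses
    transcription = transcriptor.transcribe_audio("audio-test/harvard.wav", enhanced=False)
    print(transcription)               # speaker-labelled text, as cached in log.csv above
    transcription.save("transcripts")  # writes transcripts/harvard_transcript.txt
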
pyscript/__init__.py ADDED
@@ -0,0 +1,3 @@
+ from .transcriptor import Transcriptor
+ from .audio_processing import AudioProcessor
+ __all__ = ["Transcriptor", "AudioProcessor"]
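
The package re-exports only the two classes demo.py relies on; the recording helpers in pyscript/audio_recording.py are not re-exported and have to be imported from their module directly. A quick check, assuming pyscript is on the import path:

    import pyscript
    from pyscript import Transcriptor, AudioProcessor

    print(pyscript.__all__)  # ['Transcriptor', 'AudioProcessor']
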
pyscript/audio_processing.py ADDED
@@ -0,0 +1,238 @@
+ import os
+ import librosa
+ import numpy as np
+ from tabulate import tabulate
+ import soundfile as sf
+ import scipy.ndimage
+ import itertools
+ from tqdm import tqdm
+ import torch
+ import torchaudio
+
+ class AudioProcessor:
+
+     def __init__(self, audio_file):
+         self.path = audio_file
+         self.name = os.path.splitext(os.path.basename(audio_file))[0]
+         self.format = os.path.splitext(os.path.basename(audio_file))[1]
+         self.duration = librosa.get_duration(path=audio_file)
+         self.sample_rate = librosa.get_samplerate(audio_file)
+         self.changes = []
+         self.optimized_params = None
+         self.load_details()
+
+     # File information methods
+     def load_details(self):
+         """Save the attributes of the audio file."""
+         data = [
+             ["File Name", self.name],
+             ["File Format", self.format],
+             ["Duration", f"{self.duration} seconds"],
+             ["Sample Rate", f"{self.sample_rate} Hz"]
+         ]
+         table = tabulate(data, headers=["Attribute", "Value"], tablefmt="outline")
+         self.changes.append(table)
+         return table
+
+     def display_details(self):
+         """Display the details of the audio file."""
+         print(self.changes[-1])
+
+     def display_changes(self):
+         """Display the changes made to the audio file side by side."""
+         self._clean_duplicates_changes()
+         if len(self.changes) == 1:
+             self.display_details()
+         else:
+             table1 = self.changes[0].split('\n')
+             table2 = self.changes[-1].split('\n')
+
+             combined_table = []
+             for line1, line2 in zip(table1, table2):
+                 combined_table.append([line1, '===>', line2])
+
+             print(tabulate(combined_table, tablefmt="plain"))
+
+     def _clean_duplicates_changes(self):
+         """Remove duplicate consecutive changes from the audio file."""
+         self.changes = [change for i, change in enumerate(self.changes)
+                         if i == 0 or change != self.changes[i-1]]
+
+     # Audio processing methods
+     def load_as_array(self, sample_rate: int = 16000) -> np.ndarray:
+         """
+         Load an audio file and convert it into a NumPy array.
+
+         Parameters
+         ----------
+         sample_rate : int, optional
+             The sample rate to which the audio will be resampled (default is 16000 Hz).
+
+         Returns
+         -------
+         np.ndarray
+             A NumPy array containing the audio data.
+         """
+         try:
+             audio, sr = librosa.load(self.path, sr=sample_rate)
+             self.sample_rate = sr
+             return audio
+         except Exception as e:
+             raise RuntimeError(f"Failed to load audio file: {e}")
+
+     def resample_wav(self) -> str:
+         output_path = os.path.join('resampled_files', f'{self.name}.wav')
+         try:
+             audio, sr = librosa.load(self.path)
+             resampled_audio = librosa.resample(y=audio, orig_sr=sr, target_sr=16000)
+             os.makedirs(os.path.dirname(output_path), exist_ok=True)
+             sf.write(output_path, resampled_audio, 16000)
+             self._update_file_info(output_path)
+             return output_path
+         except Exception as e:
+             raise RuntimeError(f"Failed to resample audio file: {e}")
+
+     def convert_to_wav(self):
+         """
+         Converts an audio file to WAV format.
+
+         Returns
+         -------
+         str
+             The path to the converted audio file.
+         """
+         output_path = os.path.join('converted_files', f'{self.name}.wav')
+         try:
+             os.makedirs(os.path.dirname(output_path), exist_ok=True)
+             audio, sr = librosa.load(self.path, sr=16000)
+             sf.write(output_path, audio, 16000)
+             self._update_file_info(output_path)
+             return output_path
+         except Exception as e:
+             raise RuntimeError(f"Failed to convert audio file to WAV: {e}")
+
+     def enhance_audio(self, noise_reduce_strength=0.5, voice_enhance_strength=1.5, volume_boost=1.2):
+         """
+         Enhance audio quality by reducing noise and clarifying voices.
+         """
+         try:
+             y, sr = librosa.load(self.path, sr=16000)
+             y_enhanced = self._enhance_audio_sample(y, noise_reduce_strength, voice_enhance_strength, volume_boost)
+
+             output_path = os.path.join('enhanced_files', f'{self.name}_enhanced.wav')
+             os.makedirs(os.path.dirname(output_path), exist_ok=True)
+             sf.write(output_path, y_enhanced, sr)
+
+             self._update_file_info(output_path)
+             return output_path
+         except Exception as e:
+             raise RuntimeError(f"Failed to enhance audio: {e}")
+
+     def _compute_spectral_contrast(self, y, sr, n_bands=6, fmin=200.0, quantile=0.02, hop_length=512):
+         """
+         Compute spectral contrast using librosa.
+         Higher contrast generally indicates clearer speech separation from background.
+         """
+         S = np.abs(librosa.stft(y, hop_length=hop_length))
+         contrast = librosa.feature.spectral_contrast(
+             S=S,
+             sr=sr,
+             n_bands=n_bands,
+             fmin=fmin,
+             quantile=quantile,
+             hop_length=hop_length
+         )
+         return np.mean(contrast)
+
+     def optimize_enhancement_parameters(self, step=0.25, max_iterations=50, sample_duration=30):
+         """
+         Find optimal parameters for audio enhancement using grid search on a sample.
+         """
+         device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+         y_orig, sr = librosa.load(self.path, duration=sample_duration)
+         y_orig_tensor = torch.tensor(y_orig, device=device)
+
+         param_ranges = [
+             np.arange(0.25, 1.5, step),  # noise_reduce_strength
+             np.arange(1.0, 3.0, step),   # voice_enhance_strength
+             np.arange(1.0, 2.0, step)    # volume_boost
+         ]
+
+         best_score = float('-inf')
+         best_params = None
+
+         total_iterations = min(max_iterations, len(list(itertools.product(*param_ranges))))
+
+         for params in tqdm(itertools.islice(itertools.product(*param_ranges), max_iterations),
+                            total=total_iterations,
+                            desc="Searching for optimal parameters"):
+             y_enhanced = self._enhance_audio_sample(y_orig, *params)
+             y_enhanced_tensor = torch.tensor(y_enhanced, device=device)
+
+             # Correlation between original and enhanced audio
+             min_length = min(len(y_orig_tensor), len(y_enhanced_tensor))
+             y_orig_trimmed = y_orig_tensor[:min_length]
+             y_enhanced_trimmed = y_enhanced_tensor[:min_length]
+             correlation = torch.corrcoef(torch.stack([y_orig_trimmed, y_enhanced_trimmed]))[0, 1].item()
+
+             # Spectral contrast improvement
+             contrast_orig = self._compute_spectral_contrast(y_orig, sr)
+             contrast_enhanced = self._compute_spectral_contrast(y_enhanced, sr)
+             contrast_improvement = contrast_enhanced - contrast_orig
+
+             score = (0.3 * correlation) + (0.7 * contrast_improvement)
+
+             if score > best_score:
+                 best_score = score
+                 best_params = params
+
+         self.optimized_params = best_params
+         return best_params
+
+     def _enhance_audio_sample(self, y, noise_reduce_strength=0.5, voice_enhance_strength=1.5, volume_boost=1.2):
+         """
+         Enhance an audio sample by reducing noise and enhancing voice clarity.
+
+         Parameters
+         ----------
+         y : np.ndarray
+             Input audio signal
+         noise_reduce_strength : float
+             Strength of noise reduction (default: 0.5)
+         voice_enhance_strength : float
+             Strength of voice enhancement (default: 1.5)
+         volume_boost : float
+             Volume boost factor (default: 1.2)
+
+         Returns
+         -------
+         np.ndarray
+             Enhanced audio signal
+         """
+         # STFT
+         S = librosa.stft(y, n_fft=2048)
+         S_mag, S_phase = np.abs(S), np.angle(S)
+         S_filtered = scipy.ndimage.median_filter(S_mag, size=(1, 31))
+
+         # Noise reduction mask
+         mask = np.clip((S_mag - S_filtered) / (S_mag + 1e-10), 0, 1) ** noise_reduce_strength
+         S_denoised = S_mag * mask * np.exp(1j * S_phase)
+
+         # Inverse STFT
+         y_denoised = librosa.istft(S_denoised)
+
+         # Harmonic-percussive separation and enhancement
+         y_harmonic, y_percussive = librosa.effects.hpss(y_denoised)
+         y_enhanced = (y_harmonic * voice_enhance_strength + y_percussive) * volume_boost
+
+         return librosa.util.normalize(y_enhanced, norm=np.inf, threshold=1.0)
+
+     # Helper method
+     def _update_file_info(self, new_path):
+         """Update file information after processing."""
+         self.path = new_path
+         self.sample_rate = librosa.get_samplerate(new_path)
+         self.format = os.path.splitext(new_path)[1]
+         self.duration = librosa.get_duration(path=new_path)
+         self.load_details()
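
AudioProcessor can also be driven on its own for the preprocessing that Transcriptor.process_audio performs (WAV conversion, 16 kHz resampling, optional enhancement). A short sketch against one of the bundled clips; the step and max_iterations values are arbitrary choices to keep the grid search quick:

    from pyscript import AudioProcessor

    processor = AudioProcessor("audio-test/jackhammer.wav")
    processor.display_details()  # name, format, duration, sample rate

    params = processor.optimize_enhancement_parameters(step=0.5, max_iterations=10)
    enhanced_path = processor.enhance_audio(*params)  # noise_reduce, voice_enhance, volume_boost
    processor.display_changes()  # before/after attribute tables side by side
    print(enhanced_path)         # enhanced_files/jackhammer_enhanced.wav
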
pyscript/audio_recording.py ADDED
@@ -0,0 +1,77 @@
+ import speech_recognition as sr
+ import os
+ import datetime
+ from termcolor import colored
+ from tabulate import tabulate
+
+ def micro_recording(save_folder_path: str = "audio_files", file_name: str = None, device_index: int = 0) -> str:
+     """Records audio from a microphone and saves it to a designated file."""
+     r = sr.Recognizer()
+     mic = sr.Microphone(device_index=device_index)
+
+     print_colored_separator("Starting microphone recording...", "green")
+
+     with mic as source:
+         print_colored("Recording...", "yellow")
+         audio = r.listen(source)
+         print_colored("Recording finished.", "green")
+
+     saved_path = save_audio_file(audio, save_folder_path, file_name)
+
+     print_colored_separator(f"Audio file saved to: {saved_path}", "green")
+     return saved_path
+
+ def check_input_device(test_duration: int = 1) -> dict:
+     """Checks the available microphone devices."""
+     devices = sr.Microphone.list_microphone_names()
+     available_devices, non_working_devices = [], []
+
+     for i, device in enumerate(devices):
+         try:
+             with sr.Microphone(device_index=i) as source:
+                 sr.Recognizer().listen(source, timeout=test_duration)
+                 available_devices.append(device)
+         except sr.WaitTimeoutError:
+             non_working_devices.append(device)
+         except Exception as e:
+             print(f"An error occurred while testing device {device}: {e}")
+
+     print_device_table("Available Devices", available_devices)
+     print_device_table("Non-Working Devices", non_working_devices)
+
+     return {'available_devices': available_devices, 'non_working_devices': non_working_devices}
+
+ def save_audio_file(audio, save_folder_path: str, file_name: str = None) -> str:
+     """Saves the audio file to the specified path."""
+     os.makedirs(save_folder_path, exist_ok=True)
+
+     if not file_name:
+         timestamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
+         file_name = f"recording_{timestamp}.wav"
+     else:
+         file_name = f"{file_name}.wav"
+
+     saved_path = os.path.join(save_folder_path, file_name)
+
+     with open(saved_path, "wb") as f:
+         f.write(audio.get_wav_data())
+
+     print_colored("Saving audio file...", "yellow")
+     return saved_path
+
+ def print_colored(message: str, color: str):
+     """Prints a colored message."""
+     print(colored(message, color))
+
+ def print_colored_separator(message: str, color: str):
+     """Prints a colored message with separators."""
+     print("--------------------------------")
+     print_colored(message, color)
+     print("--------------------------------")
+
+ def print_device_table(title: str, devices: list):
+     """Prints a table of devices."""
+     device_table = [[i+1, device] for i, device in enumerate(devices)]
+     print(f"\n{title}:")
+     print(tabulate(device_table, headers=["Index", "Device Name"]))
+
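
audio_recording.py is a standalone helper around SpeechRecognition for capturing clips like those in audio-test/; it is not imported by demo.py. A usage sketch, assuming a working microphone, the PyAudio pin from requirements.txt, and a made-up file name:

    from pyscript.audio_recording import check_input_device, micro_recording

    devices = check_input_device()  # briefly probes each input device
    path = micro_recording(save_folder_path="audio_files", file_name="my-test-clip")
    print(path)                     # audio_files/my-test-clip.wav
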
pyscript/transcription.py ADDED
@@ -0,0 +1,110 @@
+ import os
+ from itertools import cycle
+ from termcolor import colored
+
+ class Transcription:
+     """
+     A class for storing and saving transcriptions.
+
+     Attributes:
+     -----------
+     audio_file_path : str
+         The path to the audio file that was transcribed.
+     filename : str
+         The name of the audio file, without the extension.
+     transcriptions : list[str]
+         A list of tuples containing the speaker's label and their corresponding transcription, grouped by speaker.
+     speaker_names : dict
+         A dictionary mapping speaker labels to their assigned names.
+     segments : list
+         A list of segments from diarization.
+
+     """
+
+     def __init__(self, audio_file_path: str, transcriptions: list[str], segments: list[str]):
+         self.audio_file_path = audio_file_path
+         self.filename = os.path.splitext(os.path.basename(audio_file_path))[0]
+         self.transcriptions = self.group_by_speaker(transcriptions)
+         self.speaker_names = {}
+         self.segments = segments
+         self.colors = cycle(['red', 'green', 'blue', 'magenta', 'cyan', 'yellow'])
+
+     def __repr__(self) -> str:
+         result = []
+         for speaker, text in self.transcriptions:
+             speaker_name = self.speaker_names.get(speaker, speaker)
+             result.append(f"{speaker_name}:\n{text}")
+         return "\n\n".join(result)
+
+     def group_by_speaker(self, transcriptions: list[str]) -> list[str]:
+         """
+         Groups transcriptions by speaker.
+
+         Parameters
+         ----------
+         transcriptions : list[str]
+             A list of tuples containing the speaker's label and their corresponding transcription.
+
+         Returns
+         -------
+         list[str]
+             A list of tuples containing the speaker's label and their corresponding transcription, grouped by speaker.
+         """
+         speaker_transcriptions = []
+         previous_speaker = transcriptions[0][0]
+         speaker_text = ""
+         for speaker, text in transcriptions:
+             if speaker == previous_speaker:
+                 speaker_text += text
+             else:
+                 speaker_transcriptions.append((previous_speaker, speaker_text))
+                 speaker_text = text
+             previous_speaker = speaker
+         speaker_transcriptions.append((previous_speaker, speaker_text))
+         return speaker_transcriptions
+
+     def save(self, directory: str = "transcripts") -> None:
+         """
+         Saves the transcription to a text file.
+
+         Parameters
+         ----------
+         directory : str, optional
+             The directory to save the transcription to. Defaults to "transcripts".
+         """
+         if not self.transcriptions:
+             raise ValueError("No transcriptions available to save.")
+
+         os.makedirs(directory, exist_ok=True)
+         saving_path = os.path.join(directory, f"{self.filename}_transcript.txt")
+
+         with open(saving_path, 'w', encoding='utf-8') as f:
+             for speaker, text in self.transcriptions:
+                 if text:
+                     speaker_name = self.speaker_names.get(speaker, speaker)
+                     f.write(f"{speaker_name}: {text}\n")
+
+         print(f"Transcription saved to {saving_path}")
+
+     def get_name_speakers(self) -> None:
+         """
+         Interactively assigns names to speakers in the transcriptions and retrieves the name of the speaker.
+         Provides a preview of one sentence for each speaker to help recognize who is speaking.
+         """
+         for speaker, full_text in self.transcriptions:
+             if speaker in self.speaker_names:
+                 continue
+
+             preview = full_text.split('.')[0] + '.'
+             print(f"\nCurrent speaker: {speaker}")
+             print(f"Preview: {preview}")
+
+             new_name = input(f"Enter a name for {speaker} (or press Enter to skip): ").strip()
+             if new_name:
+                 self.speaker_names[speaker] = new_name
+                 print(f"Speaker {speaker} renamed to {new_name}")
+             else:
+                 print(f"Skipped renaming {speaker}")
+
+         print("\nSpeaker naming completed.")
+         print(f"Updated speaker names: {self.speaker_names}")
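
group_by_speaker merges consecutive turns from the same speaker before anything is printed or saved, which is why the cached log.csv shows one block per speaker change. A small illustration with made-up turns and an empty segments list:

    from pyscript.transcription import Transcription

    turns = [("SPEAKER_00", "Hello."), ("SPEAKER_00", " Thanks for joining."),
             ("SPEAKER_01", "Glad to be here.")]
    t = Transcription("audio-test/meeting-clip2.wav", turns, segments=[])
    print(t)
    # SPEAKER_00:
    # Hello. Thanks for joining.
    #
    # SPEAKER_01:
    # Glad to be here.
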
pyscript/transcriptor.py ADDED
@@ -0,0 +1,242 @@
+ import os
+ from dotenv import load_dotenv
+ import whisper
+ from pyannote.audio import Pipeline
+ import torch
+ from tqdm import tqdm
+ from time import time
+ from transformers import pipeline
+ from .transcription import Transcription
+ from .audio_processing import AudioProcessor
+ import io
+ from contextlib import redirect_stdout
+ import sys
+
+ load_dotenv()
+
+ class Transcriptor:
+     """
+     A class for transcribing and diarizing audio files.
+
+     This class uses the Whisper model for transcription and the PyAnnote speaker diarization pipeline for speaker identification.
+
+     Attributes
+     ----------
+     model_size : str
+         The size of the Whisper model to use for transcription. Available options are:
+         - 'tiny': Fastest, lowest accuracy
+         - 'base': Fast, good accuracy for many use cases
+         - 'small': Balanced speed and accuracy
+         - 'medium': High accuracy, slower than smaller models
+         - 'large-v3': Latest and most accurate version of the large model
+         - 'large-v3-turbo': Optimized version of the large-v3 model for faster processing
+     model : whisper.model.Whisper
+         The Whisper model for transcription.
+     pipeline : pyannote.audio.pipelines.SpeakerDiarization
+         The PyAnnote speaker diarization pipeline.
+
+     Usage:
+     >>> transcript = Transcriptor(model_size="large-v3")
+     >>> transcription = transcript.transcribe_audio("/path/to/audio.wav")
+     >>> transcription.get_name_speakers()
+     >>> transcription.save("/path/to/transcripts")
+
+     Note:
+     Larger models, especially 'large-v3', provide higher accuracy but require more
+     computational resources and may be slower to process audio.
+     """
+
+     def __init__(self, model_size: str = "base"):
+         self.model_size = model_size
+         self.HF_TOKEN = os.getenv("HF_TOKEN")
+         if not self.HF_TOKEN:
+             raise ValueError("HF_TOKEN not found. Please store token in .env")
+         self._setup()
+
+     def _setup(self):
+         """Initialize the Whisper model and diarization pipeline."""
+         self.device = "cuda" if torch.cuda.is_available() else "cpu"
+         print(f"Using device: {self.device}")
+         print("Initializing Whisper model...")
+         if self.model_size == "large-v3-turbo":
+             self.model = pipeline(
+                 task="automatic-speech-recognition",
+                 model="ylacombe/whisper-large-v3-turbo",
+                 chunk_length_s=30,
+                 device=self.device,
+             )
+         else:
+             self.model = whisper.load_model(self.model_size, device=self.device)
+         print("Building diarization pipeline...")
+         self.pipeline = Pipeline.from_pretrained(
+             "pyannote/speaker-diarization-3.1",
+             use_auth_token=self.HF_TOKEN
+         ).to(torch.device(self.device))
+         print("Setup completed successfully!")
+
+     def transcribe_audio(self, audio_file_path: str, enhanced: bool = False, buffer_logs: bool = False):
+         """
+         Transcribe an audio file.
+
+         Parameters:
+         -----------
+         audio_file_path : str
+             Path to the audio file to be transcribed.
+         enhanced : bool, optional
+             If True, applies audio enhancement techniques to improve transcription quality.
+         buffer_logs : bool, optional
+             If True, captures logs and returns them with the transcription. If False, prints to terminal.
+
+         Returns:
+         --------
+         Union[Transcription, Tuple[Transcription, str]]
+             Returns either just the Transcription object (if buffer_logs=False)
+             or a tuple of (Transcription, logs string) if buffer_logs=True
+         """
+         if buffer_logs:
+             logs_buffer = io.StringIO()
+             with redirect_stdout(logs_buffer):
+                 transcription = self._perform_transcription(audio_file_path, enhanced)
+             logs = logs_buffer.getvalue()
+             return transcription, logs
+         else:
+             transcription = self._perform_transcription(audio_file_path, enhanced)
+             return transcription
+
+     def _perform_transcription(self, audio_file_path: str, enhanced: bool = False):
+         """Internal method to handle the actual transcription process."""
+         try:
+             print(f"Received audio_file_path: {audio_file_path}")
+             print(f"Type of audio_file_path: {type(audio_file_path)}")
+
+             if audio_file_path is None:
+                 raise ValueError("No audio file was uploaded. Please upload an audio file.")
+
+             if not isinstance(audio_file_path, (str, bytes, os.PathLike)):
+                 raise ValueError(f"Invalid audio file path type: {type(audio_file_path)}")
+
+             if not os.path.exists(audio_file_path):
+                 raise FileNotFoundError(f"Audio file not found at path: {audio_file_path}")
+
+             print("Processing audio file...")
+             processed_audio = self.process_audio(audio_file_path, enhanced)
+             audio_file_path = processed_audio.path
+             audio, sr, duration = processed_audio.load_as_array(), processed_audio.sample_rate, processed_audio.duration
+
+             print("Diarization in progress...")
+             start_time = time()
+             diarization = self.perform_diarization(audio_file_path)
+             print(f"Diarization completed in {time() - start_time:.2f} seconds.")
+             segments = list(diarization.itertracks(yield_label=True))
+
+             transcriptions = self.transcribe_segments(audio, sr, duration, segments)
+             return Transcription(audio_file_path, transcriptions, segments)
+
+         except Exception as e:
+             print(f"Error occurred: {str(e)}")
+             raise RuntimeError(f"Failed to process the audio file: {str(e)}")
+
+     def process_audio(self, audio_file_path: str, enhanced: bool = False) -> AudioProcessor:
+         """
+         Process the audio file to ensure it meets the requirements for transcription.
+
+         Parameters:
+         -----------
+         audio_file_path : str
+             Path to the audio file to be processed.
+         enhanced : bool, optional
+             If True, applies audio enhancement techniques to improve audio quality.
+             This includes optimizing noise reduction, voice enhancement, and volume boosting
+             parameters based on the audio characteristics.
+
+         Returns:
+         --------
+         AudioProcessor
+             An AudioProcessor object containing the processed audio file.
+         """
+         processed_audio = AudioProcessor(audio_file_path)
+         if processed_audio.format != ".wav":
+             processed_audio.convert_to_wav()
+
+         if processed_audio.sample_rate != 16000:
+             processed_audio.resample_wav()
+
+         if enhanced:
+             parameters = processed_audio.optimize_enhancement_parameters()
+             processed_audio.enhance_audio(noise_reduce_strength=parameters[0],
+                                           voice_enhance_strength=parameters[1],
+                                           volume_boost=parameters[2])
+
+         processed_audio.display_changes()
+         return processed_audio
+
+     def perform_diarization(self, audio_file_path: str):
+         """Perform speaker diarization on the audio file."""
+         with torch.no_grad():
+             return self.pipeline(audio_file_path)
+
+     def transcribe_segments(self, audio, sr, duration, segments):
+         """Transcribe audio segments based on diarization."""
+         transcriptions = []
+
+         audio_segments = []
+         for turn, _, speaker in segments:
+             start = turn.start
+             end = min(turn.end, duration)
+             segment = audio[int(start * sr):int(end * sr)]
+             audio_segments.append((segment, speaker))
+
+         with tqdm(
+             total=len(audio_segments),
+             desc="Transcribing segments",
+             unit="segment",
+             ncols=100,
+             colour="green",
+             file=sys.stdout,
+             mininterval=0.1,
+             dynamic_ncols=True,
+             leave=True
+         ) as pbar:
+             if self.device == "cuda":
+                 try:
+                     total_memory = torch.cuda.get_device_properties(0).total_memory
+                     reserved_memory = torch.cuda.memory_reserved(0)
+                     allocated_memory = torch.cuda.memory_allocated(0)
+                     free_memory = total_memory - reserved_memory - allocated_memory
+
+                     memory_per_sample = 1024 * 1024 * 1024  # 1GB
+                     batch_size = max(1, min(4, int((free_memory * 0.7) // memory_per_sample)))
+                     print(f"Using batch size of {batch_size} for GPU processing")
+
+                     for i in range(0, len(audio_segments), batch_size):
+                         try:
+                             batch = audio_segments[i:i + batch_size]
+                             torch.cuda.empty_cache()
+                             results = self.model([segment for segment, _ in batch])
+                             for (_, speaker), result in zip(batch, results):
+                                 transcriptions.append((speaker, result['text'].strip()))
+                             pbar.update(len(batch))
+                         except RuntimeError as e:
+                             if "out of memory" in str(e):
+                                 torch.cuda.empty_cache()
+                                 for segment, speaker in batch:
+                                     results = self.model([segment])
+                                     transcriptions.append((speaker, results[0]['text'].strip()))
+                                     pbar.update(0.5)
+                             else:
+                                 raise e
+                 except Exception as e:
+                     print(f"GPU processing failed: {str(e)}. Falling back to CPU processing...")
+                     self.model = self.model.to('cpu')
+                     self.device = 'cpu'
+             else:
+                 for segment, speaker in audio_segments:
+                     if self.model_size == "large-v3-turbo":
+                         result = self.model(segment)
+                         transcriptions.append((speaker, result['text'].strip()))
+                     else:
+                         result = self.model.transcribe(segment, fp16=self.device == "cuda")
+                         transcriptions.append((speaker, result['text'].strip()))
+                     pbar.update(1)
+
+         return transcriptions
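
transcribe_audio with buffer_logs=True is the variant a hosted front end would typically use: the processing and diarization messages are captured into a string instead of being printed. A short sketch, again assuming HF_TOKEN is available in .env:

    from pyscript import Transcriptor

    transcriptor = Transcriptor(model_size="base")  # loads Whisper and the pyannote pipeline once
    transcription, logs = transcriptor.transcribe_audio(
        "audio-test/meeting-clip1.wav", enhanced=False, buffer_logs=True
    )
    print(logs)           # "Processing audio file...", diarization timing, etc.
    print(transcription)  # speaker-grouped text; get_name_speakers() can rename the labels
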
requirements.txt ADDED
@@ -0,0 +1,17 @@
+ openai-whisper @ git+https://github.com/openai/whisper.git@ba3f3cd54b0e5b8ce1ab3de13e32122d0d5f98ab
+ pyannote.audio==3.3.1
+ librosa==0.10.2.post1
+ tqdm==4.66.5
+ python-dotenv==1.0.1
+ termcolor==2.4.0
+ pydub==0.25.1
+ SpeechRecognition==3.10.4
+ PyAudio==0.2.14
+ tabulate==0.9.0
+ soundfile==0.12.1
+ numpy==1.26.4
+ transformers==4.46.0
+ gradio==5.3.0
+ torch==2.4.1
+ torchaudio==2.4.1
+ python-multipart==0.0.12
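
A quick sanity check that the pinned stack imports cleanly after pip install -r requirements.txt, and whether a GPU is visible (the demo runs 5-10x slower on CPU per the notice in demo.py):

    import gradio, torch, whisper, librosa, pyannote.audio

    print(gradio.__version__)         # expected 5.3.0 per the pin above
    print(torch.__version__)          # expected 2.4.1
    print(torch.cuda.is_available())  # True means Whisper and the diarization pipeline can run on GPU
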