kadirnar committed on
Commit bd8e31e
1 Parent(s): 580270a
Files changed (1)
  1. app.py +64 -2
app.py CHANGED
@@ -1,9 +1,71 @@
 import gradio as gr
 
-from whisperplus.pipelines.whisper import SpeechToTextPipeline
 from whisperplus.utils.download_utils import download_and_convert_to_mp3
-from whisperplus.utils.text_utils import format_speech_to_dialogue
 
+import logging
+
+import torch
+from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
+
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+
+
+class SpeechToTextPipeline:
+    """Class for converting audio to text using a pre-trained speech recognition model."""
+
+    def __init__(self, model_id: str = "openai/whisper-large-v3"):
+        self.model = None
+        self.device = None
+
+        if self.model is None:
+            self.load_model(model_id)
+        else:
+            logging.info("Model already loaded.")
+
+    def load_model(self, model_id: str = "openai/whisper-large-v3"):
+        """
+        Loads the pre-trained speech recognition model and moves it to the specified device.
+
+        Args:
+            model_id (str): Identifier of the pre-trained model to be loaded.
+        """
+        logging.info("Loading model...")
+        model = AutoModelForSpeechSeq2Seq.from_pretrained(
+            model_id, torch_dtype=torch.float16, low_cpu_mem_usage=True, use_safetensors=True)
+        model.to(self.device)
+        logging.info("Model loaded successfully.")
+
+        self.model = model
+
+    def __call__(self, audio_path: str, model_id: str = "openai/whisper-large-v3", language: str = "turkish"):
+        """
+        Converts audio to text using the pre-trained speech recognition model.
+
+        Args:
+            audio_path (str): Path to the audio file to be transcribed.
+            model_id (str): Identifier of the pre-trained model to be used for transcription.
+
+        Returns:
+            str: Transcribed text from the audio.
+        """
+        processor = AutoProcessor.from_pretrained(model_id)
+        pipe = pipeline(
+            "automatic-speech-recognition",
+            model=self.model,
+            torch_dtype=torch.float16,
+            chunk_length_s=30,
+            max_new_tokens=128,
+            batch_size=24,
+            return_timestamps=True,
+            device="cuda",
+            tokenizer=processor.tokenizer,
+            feature_extractor=processor.feature_extractor,
+            model_kwargs={"use_flash_attention_2": True},
+            generate_kwargs={"language": language},
+        )
+        logging.info("Transcribing audio...")
+        result = pipe(audio_path)["text"]
+        return result
 
 def youtube_url_to_text(url, model_id, language_choice):
     """