Siddhant Arora committed
Commit 330bd18
1 Parent(s): 38787ca

Update space

LLM/__pycache__/chat.cpython-310.pyc ADDED
Binary file (1.04 kB)
 
LLM/__pycache__/chat.cpython-39.pyc ADDED
Binary file (1.03 kB)
 
LLM/mlx_language_model.py ADDED
@@ -0,0 +1,97 @@
+ import logging
+ from LLM.chat import Chat
+ from baseHandler import BaseHandler
+ from mlx_lm import load, stream_generate, generate
+ from rich.console import Console
+ import torch
+
+ logging.basicConfig(
+     format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
+ )
+ logger = logging.getLogger(__name__)
+
+ console = Console()
+
+
+ class MLXLanguageModelHandler(BaseHandler):
+     """
+     Handles the language model part.
+     """
+
+     def setup(
+         self,
+         model_name="microsoft/Phi-3-mini-4k-instruct",
+         device="mps",
+         torch_dtype="float16",
+         gen_kwargs={},
+         user_role="user",
+         chat_size=1,
+         init_chat_role=None,
+         init_chat_prompt="You are a helpful AI assistant.",
+     ):
+         self.model_name = model_name
+         self.model, self.tokenizer = load(self.model_name)
+         self.gen_kwargs = gen_kwargs
+
+         self.chat = Chat(chat_size)
+         if init_chat_role:
+             if not init_chat_prompt:
+                 raise ValueError(
+                     "An initial prompt needs to be specified when setting init_chat_role."
+                 )
+             self.chat.init_chat({"role": init_chat_role, "content": init_chat_prompt})
+         self.user_role = user_role
+
+         self.warmup()
+
+     def warmup(self):
+         logger.info(f"Warming up {self.__class__.__name__}")
+
+         dummy_input_text = "Write me a poem about Machine Learning."
+         dummy_chat = [{"role": self.user_role, "content": dummy_input_text}]
+
+         n_steps = 2
+
+         for _ in range(n_steps):
+             prompt = self.tokenizer.apply_chat_template(dummy_chat, tokenize=False)
+             generate(
+                 self.model,
+                 self.tokenizer,
+                 prompt=prompt,
+                 max_tokens=self.gen_kwargs["max_new_tokens"],
+                 verbose=False,
+             )
+
+     def process(self, prompt):
+         logger.debug("inferring language model...")
+
+         self.chat.append({"role": self.user_role, "content": prompt})
+
+         # Remove system messages if using a Gemma model
+         if "gemma" in self.model_name.lower():
+             chat_messages = [
+                 msg for msg in self.chat.to_list() if msg["role"] != "system"
+             ]
+         else:
+             chat_messages = self.chat.to_list()
+
+         prompt = self.tokenizer.apply_chat_template(
+             chat_messages, tokenize=False, add_generation_prompt=True
+         )
+         output = ""
+         curr_output = ""
+         for t in stream_generate(
+             self.model,
+             self.tokenizer,
+             prompt,
+             max_tokens=self.gen_kwargs["max_new_tokens"],
+         ):
+             output += t
+             curr_output += t
+             if curr_output.endswith((".", "?", "!", "<|end|>")):
+                 yield curr_output.replace("<|end|>", "")
+                 curr_output = ""
+         generated_text = output.replace("<|end|>", "")
+         torch.mps.empty_cache()
+
+         self.chat.append({"role": "assistant", "content": generated_text})
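Note: the core of process() above is the sentence-chunked streaming loop. A minimal standalone sketch of the same pattern, written directly against mlx_lm (the model name and max_tokens value are illustrative, and it assumes the mlx_lm version used by this Space, where stream_generate yields plain text chunks as in the code above):

# Sketch only: mirrors the chunking loop in MLXLanguageModelHandler.process().
from mlx_lm import load, stream_generate

model, tokenizer = load("microsoft/Phi-3-mini-4k-instruct")  # illustrative checkpoint
messages = [{"role": "user", "content": "Write me a poem about Machine Learning."}]
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

sentence = ""
for t in stream_generate(model, tokenizer, prompt, max_tokens=128):
    sentence += t
    if sentence.endswith((".", "?", "!", "<|end|>")):
        print(sentence.replace("<|end|>", ""))  # each finished sentence can go straight to the TTS stage
        sentence = ""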
VAD/__pycache__/vad_iterator.cpython-310.pyc ADDED
Binary file (2.98 kB)
 
VAD/__pycache__/vad_iterator.cpython-39.pyc ADDED
Binary file (2.96 kB)
 
app.py CHANGED
@@ -1,8 +1,43 @@
+ # import base64
+ # import pathlib
+ # import tempfile
  import gradio as gr
+
+ # recorder_js = pathlib.Path('recorder.js').read_text()
+ # main_js = pathlib.Path('main.js').read_text()
+ # record_button_js = pathlib.Path('record_button.js').read_text().replace('let recorder_js = null;', recorder_js).replace(
+ #     'let main_js = null;', main_js)
+
+
+ # def save_base64_video(base64_string):
+ #     base64_video = base64_string
+ #     video_data = base64.b64decode(base64_video)
+ #     with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as temp_file:
+ #         temp_filename = temp_file.name
+ #         temp_file.write(video_data)
+ #     print(f"Temporary MP4 file saved as: {temp_filename}")
+ #     return temp_filename
+ # import os
+
+ # os.system('python -m unidic download')
  from transformers import pipeline
  import numpy as np
  from VAD.vad_iterator import VADIterator
  import torch
+ import librosa
+ from mlx_lm import load, stream_generate, generate
+ from LLM.chat import Chat
+ from lightning_whisper_mlx import LightningWhisperMLX
+ from melo.api import TTS
+
+ LM_model, LM_tokenizer = load("mlx-community/SmolLM-360M-Instruct")
+ chat = Chat(2)
+ chat.init_chat({"role": "system", "content": "You are a helpful and friendly AI assistant. You are polite, respectful, and aim to provide concise responses of less than 20 words."})
+ user_role = "user"
+
+ tts_model = TTS(language="EN_NEWEST", device="auto")
+ speaker_id = tts_model.hps.data.spk2id["EN-Newest"]
+ blocksize = 512

  def int2float(sound):
      """
@@ -16,10 +51,13 @@ def int2float(sound):
      sound = sound.squeeze()  # depends on the use case
      return sound

- min_speech_ms=500,
- max_speech_ms=float("inf"),
+ text_str=""
+ audio_output = None
+ min_speech_ms=500
+ max_speech_ms=float("inf")
+ ASR_model = LightningWhisperMLX(model="distil-large-v3", batch_size=6, quant=None)
  transcriber = pipeline("automatic-speech-recognition", model="openai/whisper-base.en")
- vad_model, _ = torch.hub.load("snakers4/silero-vad", "silero_vad")
+ vad_model, _ = torch.hub.load("snakers4/silero-vad:v4.0", "silero_vad")
  vad_iterator = VADIterator(
      vad_model,
      threshold=0.3,
@@ -31,131 +69,72 @@ vad_iterator = VADIterator(

  def transcribe(stream, new_chunk):
      sr, y = new_chunk
-     print(sr)
-     print(y.shape)
+     global text_str
+     global chat
+     global user_role
+     global audio_output
+
      audio_int16 = np.frombuffer(y, dtype=np.int16)
      audio_float32 = int2float(audio_int16)
+     audio_float32=librosa.resample(audio_float32, orig_sr=sr, target_sr=16000)
+     sr=16000
+     print(sr)
+     print(audio_float32.shape)
      vad_output = vad_iterator(torch.from_numpy(audio_float32))
+
      if vad_output is not None and len(vad_output) != 0:
-         logger.debug("VAD: end of speech detected")
+         print("VAD: end of speech detected")
          array = torch.cat(vad_output).cpu().numpy()
          duration_ms = len(array) / sr * 1000
          if (not(duration_ms < min_speech_ms or duration_ms > max_speech_ms)):
-             y = array.astype(np.float32)
-             y /= np.max(np.abs(y))
-             return stream, transcriber({"sampling_rate": sr, "raw": y})["text"], y
-     return stream, None, None
+             prompt=ASR_model.transcribe(array)["text"].strip()
+             chat.append({"role": user_role, "content": prompt})
+             chat_messages = chat.to_list()
+             prompt = LM_tokenizer.apply_chat_template(
+                 chat_messages, tokenize=False, add_generation_prompt=True
+             )
+             output = generate(
+                 LM_model,
+                 LM_tokenizer,
+                 prompt,
+                 max_tokens=128,
+             )
+             # import pdb;pdb.set_trace()
+             generated_text = output.replace("<|end|>", "")
+             torch.mps.empty_cache()
+
+             chat.append({"role": "assistant", "content": generated_text})
+             text_str=generated_text
+             # import pdb;pdb.set_trace()
+             audio_chunk = tts_model.tts_to_file(text_str, speaker_id, quiet=True)
+             audio_chunk = (audio_chunk * 32768).astype(np.int16)
+             audio_output=(44100, audio_chunk)
+     # else:
+     #     audio_output=None
+     text_str1=text_str
+
+     return stream, text_str1, audio_output

  demo = gr.Interface(
      transcribe,
      ["state", gr.Audio(sources=["microphone"], streaming=True, waveform_options=gr.WaveformOptions(sample_rate=16000))],
-     ["state", "text", gr.Audio(label="Output", streaming=True, autoplay=True, waveform_options=gr.WaveformOptions(sample_rate=16000))],
+     ["state", "text", gr.Audio(label="Output", autoplay=True)],
      live=True,
  )
-
+ # with demo:
+ #     start_button = gr.Button("Record Screen 🔴")
+ #     video_component = gr.Video(interactive=True, show_share_button=True, include_audio=True)
+
+
+ #     def toggle_button_label(returned_string):
+ #         if returned_string.startswith("Record"):
+ #             return gr.Button(value="Stop Recording ⚪"), None
+ #         else:
+ #             try:
+ #                 temp_filename = save_base64_video(returned_string)
+ #             except Exception as e:
+ #                 return gr.Button(value="Record Screen 🔴"), gr.Warning(f'Failed to convert video to mp4:\n{e}')
+ #             return gr.Button(value="Record Screen 🔴"), gr.Video(value=temp_filename, interactive=True,
+ #                 show_share_button=True)
+ #     start_button.click(toggle_button_label, start_button, [start_button, video_component], js=record_button_js)
  demo.launch()
- # from transformers import pipeline
- # import torch
-
- # device = "cuda:0" if torch.cuda.is_available() else "cpu"
-
- # classifier = pipeline(
- #     "audio-classification", model="MIT/ast-finetuned-speech-commands-v2", device=device
- # )
-
- # from transformers.pipelines.audio_utils import ffmpeg_microphone_live
-
-
- # def launch_fn(
- #     wake_word="marvin",
- #     prob_threshold=0.5,
- #     chunk_length_s=2.0,
- #     stream_chunk_s=0.25,
- #     debug=False,
- # ):
- #     if wake_word not in classifier.model.config.label2id.keys():
- #         raise ValueError(
- #             f"Wake word {wake_word} not in set of valid class labels, pick a wake word in the set {classifier.model.config.label2id.keys()}."
- #         )
-
- #     sampling_rate = classifier.feature_extractor.sampling_rate
-
- #     mic = ffmpeg_microphone_live(
- #         sampling_rate=sampling_rate,
- #         chunk_length_s=chunk_length_s,
- #         stream_chunk_s=stream_chunk_s,
- #     )
-
- #     print("Listening for wake word...")
- #     for prediction in classifier(mic):
- #         prediction = prediction[0]
- #         if debug:
- #             print(prediction)
- #         if prediction["label"] == wake_word:
- #             if prediction["score"] > prob_threshold:
- #                 return True
-
- # transcriber = pipeline(
- #     "automatic-speech-recognition", model="openai/whisper-base.en", device=device
- # )
- # import sys
-
-
- # def transcribe(chunk_length_s=5.0, stream_chunk_s=1.0):
- #     sampling_rate = transcriber.feature_extractor.sampling_rate
-
- #     mic = ffmpeg_microphone_live(
- #         sampling_rate=sampling_rate,
- #         chunk_length_s=chunk_length_s,
- #         stream_chunk_s=stream_chunk_s,
- #     )
-
- #     print("Start speaking...")
- #     for item in transcriber(mic, generate_kwargs={"max_new_tokens": 128}):
- #         sys.stdout.write("\033[K")
- #         print(item["text"], end="\r")
- #         if not item["partial"][0]:
- #             break
-
- #     return item["text"]
-
- # from huggingface_hub import HfFolder
- # import requests
-
-
- # def query(text, model_id="tiiuae/falcon-7b-instruct"):
- #     api_url = f"https://api-inference.huggingface.co/models/{model_id}"
- #     headers = {"Authorization": f"Bearer {HfFolder().get_token()}"}
- #     payload = {"inputs": text}
-
- #     print(f"Querying...: {text}")
- #     response = requests.post(api_url, headers=headers, json=payload)
- #     return response.json()[0]["generated_text"][len(text) + 1 :]
-
- # from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
-
- # processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
-
- # model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts").to(device)
- # vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)
-
- # from datasets import load_dataset
-
- # embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
- # speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
-
- # def synthesise(text):
- #     inputs = processor(text=text, return_tensors="pt")
- #     speech = model.generate_speech(
- #         inputs["input_ids"].to(device), speaker_embeddings.to(device), vocoder=vocoder
- #     )
- #     return speech.cpu()
-
-
- # if __name__ == "__main__":
- #     launch_fn(debug=True)
- #     # transcription = transcribe()
- #     # response = query(transcription)
- #     # audio = synthesise(response)
-
- #     # Audio(audio, rate=16000, autoplay=True)
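Taken together, the rewritten transcribe() callback runs one full speech turn per detected utterance. A condensed sketch of that flow with the Gradio plumbing stripped out; it assumes the module-level objects created above (vad_iterator, ASR_model, chat, LM_model, LM_tokenizer, tts_model, speaker_id, user_role, int2float) are in scope, and the helper name speech_turn is hypothetical:

# Condensed sketch of one turn: resample -> VAD -> ASR (MLX Whisper) -> LLM -> TTS.
import numpy as np
import torch
import librosa

def speech_turn(sr, int16_chunk):
    audio = librosa.resample(int2float(int16_chunk), orig_sr=sr, target_sr=16000)
    vad_output = vad_iterator(torch.from_numpy(audio))
    if vad_output is None or len(vad_output) == 0:
        return None, None  # still listening; no end of speech detected yet
    array = torch.cat(vad_output).cpu().numpy()
    prompt = ASR_model.transcribe(array)["text"].strip()
    chat.append({"role": user_role, "content": prompt})
    lm_prompt = LM_tokenizer.apply_chat_template(
        chat.to_list(), tokenize=False, add_generation_prompt=True
    )
    reply = generate(LM_model, LM_tokenizer, lm_prompt, max_tokens=128).replace("<|end|>", "")
    chat.append({"role": "assistant", "content": reply})
    wav = tts_model.tts_to_file(reply, speaker_id, quiet=True)  # float waveform from MeloTTS
    return reply, (44100, (wav * 32768).astype(np.int16))  # int16 tuple for gr.Audio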
flagged/log.csv ADDED
@@ -0,0 +1,2 @@
+ stream,new_chunk,stream,output 1,flag,username,timestamp
+ ,flagged/new_chunk/65327197a5439319f87d/audio.wav,,,,,2024-09-07 15:26:52.280189
flagged/new_chunk/65327197a5439319f87d/audio.wav ADDED
Binary file (34.6 kB)
 
main.js ADDED
@@ -0,0 +1,74 @@
+ // main.js
+ if (!ScreenCastRecorder.isSupportedBrowser()) {
+     console.error("Screen Recording not supported in this browser");
+ }
+ let recorder;
+ let outputBlob;
+ const stopRecording = () => __awaiter(void 0, void 0, void 0, function* () {
+     let currentState = "RECORDING";
+     // Do nothing if the user tries to stop recording before it has started
+     if (currentState === "OFF" || recorder == null) {
+         return;
+     }
+     // if (currentState === "COUNTDOWN") {
+     //     this.setState({
+     //         currentState: "OFF",
+     //     })
+     // }
+     if (currentState === "RECORDING") {
+         if (recorder.getState() === "inactive") {
+             // this.setState({
+             //     currentState: "OFF",
+             // })
+             console.log("Inactive");
+         }
+         else {
+             outputBlob = yield recorder.stop();
+             console.log("Done recording");
+             // this.setState({
+             //     outputBlob,
+             //     currentState: "PREVIEW_FILE",
+             // })
+             window.currentState = "PREVIEW_FILE";
+             const videoSource = URL.createObjectURL(outputBlob);
+             window.videoSource = videoSource;
+             const fileName = "recording";
+             const link = document.createElement("a");
+             link.setAttribute("href", videoSource);
+             link.setAttribute("download", `${fileName}.webm`);
+             link.click();
+         }
+     }
+ });
+ const startRecording = () => __awaiter(void 0, void 0, void 0, function* () {
+     const recordAudio = true;
+     recorder = new ScreenCastRecorder({
+         recordAudio,
+         onErrorOrStop: () => stopRecording(),
+     });
+     try {
+         yield recorder.initialize();
+     }
+     catch (e) {
+         console.warn(`ScreenCastRecorder.initialize error: ${e}`);
+         // this.setState({ currentState: "UNSUPPORTED" })
+         window.currentState = "UNSUPPORTED";
+         return;
+     }
+     // this.setState({ currentState: "COUNTDOWN" })
+     const hasStarted = recorder.start();
+     if (hasStarted) {
+         // this.setState({
+         //     currentState: "RECORDING",
+         // })
+         console.log("Started recording");
+         window.currentState = "RECORDING";
+     }
+     else {
+         stopRecording().catch(err => console.warn(`withScreencast.stopRecording threw an error: ${err}`));
+     }
+ });
+
+ // Set global functions to window.
+ window.startRecording = startRecording;
+ window.stopRecording = stopRecording;
mlx_models/distil-large-v3/config.json ADDED
@@ -0,0 +1,13 @@
+ {
+     "n_mels": 128,
+     "n_audio_ctx": 1500,
+     "n_audio_state": 1280,
+     "n_audio_head": 20,
+     "n_audio_layer": 32,
+     "n_vocab": 51866,
+     "n_text_ctx": 448,
+     "n_text_state": 1280,
+     "n_text_head": 20,
+     "n_text_layer": 2,
+     "model_type": "whisper"
+ }
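This config pairs with the LFS-tracked weights.npz below; app.py reaches the checkpoint through lightning_whisper_mlx with the constructor arguments shown in the diff. A minimal sketch, assuming "distil-large-v3" resolves to this mlx_models/distil-large-v3/ directory and that transcribe() accepts 16 kHz mono float32 audio (as app.py feeds it):

import numpy as np
from lightning_whisper_mlx import LightningWhisperMLX

# Same arguments as in app.py above.
asr = LightningWhisperMLX(model="distil-large-v3", batch_size=6, quant=None)

silence = np.zeros(16000, dtype=np.float32)  # one second of 16 kHz silence as a placeholder input
print(asr.transcribe(silence)["text"])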
mlx_models/distil-large-v3/weights.npz ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8fd01bf050289525f91ff3d96e2880381367a34beb3520ad516181517b209ebc
+ size 1509130112
record_button.js ADDED
@@ -0,0 +1,40 @@
+ // Setup if needed and start recording.
+ async () => {
+     // Set up recording functions if not already initialized
+     if (!window.startRecording) {
+         let recorder_js = null;
+         let main_js = null;
+     }
+
+     // Function to fetch and convert video blob to base64 using async/await without explicit Promise
+     async function getVideoBlobAsBase64(objectURL) {
+         const response = await fetch(objectURL);
+         if (!response.ok) {
+             throw new Error('Failed to fetch video blob.');
+         }
+
+         const blob = await response.blob();
+
+         const reader = new FileReader();
+         reader.readAsDataURL(blob);
+
+         return new Promise((resolve, reject) => {
+             reader.onloadend = () => {
+                 if (reader.result) {
+                     resolve(reader.result.split(',')[1]); // Return the base64 string (without data URI prefix)
+                 } else {
+                     reject('Failed to convert blob to base64.');
+                 }
+             };
+         });
+     }
+
+     if (window.currentState === "RECORDING") {
+         await window.stopRecording();
+         const base64String = await getVideoBlobAsBase64(window.videoSource);
+         return base64String;
+     } else {
+         window.startRecording();
+         return "Record";
+     }
+ }
recorder.js ADDED
@@ -0,0 +1,112 @@
+ // recorder.js
+ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
+     function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
+     return new (P || (P = Promise))(function (resolve, reject) {
+         function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
+         function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
+         function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
+         step((generator = generator.apply(thisArg, _arguments || [])).next());
+     });
+ };
+ const BLOB_TYPE = "video/webm";
+ class ScreenCastRecorder {
+     /** True if the current browser likely supports screencasts. */
+     static isSupportedBrowser() {
+         return (navigator.mediaDevices != null &&
+             navigator.mediaDevices.getUserMedia != null &&
+             navigator.mediaDevices.getDisplayMedia != null &&
+             MediaRecorder.isTypeSupported(BLOB_TYPE));
+     }
+     constructor({ recordAudio, onErrorOrStop }) {
+         this.recordAudio = recordAudio;
+         this.onErrorOrStopCallback = onErrorOrStop;
+         this.inputStream = null;
+         this.recordedChunks = [];
+         this.mediaRecorder = null;
+     }
+     /**
+      * This asynchronous method initializes the screen recording object, asking
+      * the user for the permissions needed to start recording.
+      */
+     initialize() {
+         return __awaiter(this, void 0, void 0, function* () {
+             const desktopStream = yield navigator.mediaDevices.getDisplayMedia({
+                 video: true,
+             });
+             let tracks = desktopStream.getTracks();
+             if (this.recordAudio) {
+                 const voiceStream = yield navigator.mediaDevices.getUserMedia({
+                     video: false,
+                     audio: true,
+                 });
+                 tracks = tracks.concat(voiceStream.getAudioTracks());
+             }
+             this.recordedChunks = [];
+             this.inputStream = new MediaStream(tracks);
+             this.mediaRecorder = new MediaRecorder(this.inputStream, {
+                 mimeType: BLOB_TYPE,
+             });
+             this.mediaRecorder.ondataavailable = e => this.recordedChunks.push(e.data);
+         });
+     }
+     getState() {
+         if (this.mediaRecorder) {
+             return this.mediaRecorder.state;
+         }
+         return "inactive";
+     }
+     /**
+      * Starts the screen recording if the user has granted permissions
+      * and the mediaRecorder has been initialized.
+      *
+      * @returns {boolean}
+      */
+     start() {
+         if (!this.mediaRecorder) {
+             console.warn(`ScreenCastRecorder.start: mediaRecorder is null`);
+             return false;
+         }
+         const logRecorderError = (e) => {
+             console.warn(`mediaRecorder.start threw an error: ${e}`);
+         };
+         this.mediaRecorder.onerror = (e) => {
+             logRecorderError(e);
+             this.onErrorOrStopCallback();
+         };
+         this.mediaRecorder.onstop = () => this.onErrorOrStopCallback();
+         try {
+             this.mediaRecorder.start();
+         }
+         catch (e) {
+             logRecorderError(e);
+             return false;
+         }
+         return true;
+     }
+     /**
+      * Stops recording and then returns the generated Blob.
+      *
+      * @returns {(Promise|undefined)}
+      *     A Promise which will return the generated Blob
+      *     Undefined if the MediaRecorder could not initialize
+      */
+     stop() {
+         if (!this.mediaRecorder) {
+             return undefined;
+         }
+         let resolver;
+         const promise = new Promise(r => {
+             resolver = r;
+         });
+         this.mediaRecorder.onstop = () => resolver();
+         this.mediaRecorder.stop();
+         if (this.inputStream) {
+             this.inputStream.getTracks().forEach(s => s.stop());
+             this.inputStream = null;
+         }
+         return promise.then(() => this.buildOutputBlob());
+     }
+     buildOutputBlob() {
+         return new Blob(this.recordedChunks, { type: BLOB_TYPE });
+     }
+ }
requirements.txt CHANGED
@@ -1,7 +1,11 @@
- huggingface_hub==0.22.2
+ huggingface_hub==0.23.2
  transformers[sentencepiece]
- sentencepiece
+ sentencepiece==0.2.0
  datasets
- huggingface_hub
  torch==2.4.0
- torchaudio
+ torchaudio
+ librosa
+ lightning-whisper-mlx>=0.0.10
+ mlx-lm>=0.14.0
+ melotts @ git+https://github.com/andimarafioti/MeloTTS.git#egg=MeloTTS # made a copy of MeloTTS to have compatible versions of transformers
+ sounddevice==0.5.0