John Liao committed on
Commit
5cfb839
·
verified ·
1 Parent(s): e8e93a8

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +38 -2
app.py CHANGED
@@ -1,5 +1,7 @@
1
  import subprocess
2
  import sys
 
 
3
 
4
  try:
5
  import openai
@@ -25,6 +27,40 @@ def openai_api(prompt, key):
25
  )
26
  return completion.choices[0].message.content
27
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
28
  def setup_gradio_interface():
29
  with gr.Blocks() as demo:
30
  gr.Markdown("音頻轉文字,並擷取重點")
@@ -37,12 +73,12 @@ def setup_gradio_interface():
37
  with gr.Row():
38
  content = gr.Textbox(label="第五步:檢視轉譯逐字稿", value="轉譯逐字稿")
39
  submit2_button = gr.Button("第六步:開始重點摘錄")
40
- summary = gr.Textbox(label="第七步:輸出重點摘錄", value="重點摘錄")
41
  file_output2_txt = gr.File(label="第八步:下載重點摘錄(Optional)")
42
 
43
  def transcribe_and_download(file, key):
44
  if file is not None:
45
- txt_content = transcribe(file,key)
46
  txt_path = "transcribe.txt"
47
  with open(txt_path, "w") as txt_file:
48
  txt_file.write(txt_content)
 
1
  import subprocess
2
  import sys
3
+ import os
4
+ from pydub import AudioSegment
5
 
6
  try:
7
  import openai
 
27
  )
28
  return completion.choices[0].message.content
29
 
30
def transcribe_large_audio(filename, key, segment_length_ms=30 * 60 * 1000):
    """Transcribe an audio file, splitting it into chunks when it is large.

    Files larger than 25 MB are cut into consecutive segments of
    ``segment_length_ms`` milliseconds, each segment is re-encoded as a
    low-bitrate MP3 next to the source file, transcribed on its own, and the
    partial transcripts are concatenated in order. Smaller files are passed
    to ``transcribe`` unchanged.

    Args:
        filename: Path of the audio file to transcribe.
        key: OpenAI API key; it is assigned to ``openai.api_key`` and also
            forwarded to ``transcribe``.
        segment_length_ms: Length of each chunk in milliseconds
            (default: 30 minutes). Must be positive.

    Returns:
        The transcript text; for split files, the chunk transcripts joined
        without a separator.

    Raises:
        ValueError: If ``segment_length_ms`` is not positive.
    """
    # NOTE(review): presumably mirrors the upload size limit of the
    # transcription API used by `transcribe` -- confirm against that helper.
    max_upload_mb = 25

    def get_file_size_in_mb(file_path):
        """Return the size of *file_path* in mebibytes."""
        return os.path.getsize(file_path) / (1024 * 1024)

    def split_audio_file(file_path, chunk_length_ms):
        """Export *file_path* as consecutive MP3 chunks and return their paths."""
        # Read the path that was passed in, not the enclosing `filename`.
        # NOTE(review): the input is decoded as MP3 regardless of its real
        # container, exactly as before -- confirm callers only upload MP3.
        audio = AudioSegment.from_file(file_path, format="mp3")
        total_ms = len(audio)
        segment_filenames = []
        starts = range(0, total_ms, chunk_length_ms)
        for part_number, start in enumerate(starts, start=1):
            end = min(start + chunk_length_ms, total_ms)
            segment_filename = f"{file_path}_part{part_number}.mp3"
            # Register the path before exporting so that a failed export
            # still gets cleaned up by the caller.
            segment_filenames.append(segment_filename)
            audio[start:end].export(segment_filename, format="mp3", bitrate="36k")
        return segment_filenames

    if segment_length_ms <= 0:
        raise ValueError("segment_length_ms must be a positive number of milliseconds")

    openai.api_key = key

    if get_file_size_in_mb(filename) <= max_upload_mb:
        return transcribe(filename, key)

    # Honour the caller's chunk length instead of the helper's own default.
    audio_chunks = split_audio_file(filename, segment_length_ms)
    print(audio_chunks)

    transcripts = []
    try:
        for chunk_filename in audio_chunks:
            print(chunk_filename)
            transcripts.append(transcribe(chunk_filename, key))
    finally:
        # Remove the temporary chunk files even if a transcription fails.
        for chunk_filename in audio_chunks:
            if os.path.exists(chunk_filename):
                os.remove(chunk_filename)

    return "".join(transcripts)
63
+
64
  def setup_gradio_interface():
65
  with gr.Blocks() as demo:
66
  gr.Markdown("音頻轉文字,並擷取重點")
 
73
  with gr.Row():
74
  content = gr.Textbox(label="第五步:檢視轉譯逐字稿", value="轉譯逐字稿")
75
  submit2_button = gr.Button("第六步:開始重點摘錄")
76
+ summary = gr.Textbox(label="第七步:輸出重點摘錄(Markdown格式)", value="重點摘錄")
77
  file_output2_txt = gr.File(label="第八步:下載重點摘錄(Optional)")
78
 
79
  def transcribe_and_download(file, key):
80
  if file is not None:
81
+ txt_content = transcribe_large_audio(file,key)
82
  txt_path = "transcribe.txt"
83
  with open(txt_path, "w") as txt_file:
84
  txt_file.write(txt_content)