Cryptic committed
Commit 10cfa3b · Parent: 394213a
Files changed (2)
  1. app.py +58 -182
  2. requirements.txt +6 -6
app.py CHANGED
@@ -1,202 +1,78 @@
- import streamlit as st
- import tempfile
  import os
  import librosa
  import numpy as np
- from transformers import pipeline
- import torch
  import soundfile as sf
- import json  # For JSON response
-
- # Page configuration
- st.set_page_config(page_title="Audio Processing App", layout="wide")
- st.title("Audio Lecture Processing App")

- # Initialize session state
- if 'models_loaded' not in st.session_state:
-     st.session_state.models_loaded = False

- @st.cache_resource
- def load_models():
-     """Load ML models with proper error handling"""
-     try:
-         # Check for CUDA availability
-         device = 0 if torch.cuda.is_available() else -1
-
-         models = {
-             'transcriber': pipeline("automatic-speech-recognition",
-                                     model="openai/whisper-tiny.en",
-                                     device=device,
-                                     chunk_length_s=30),  # Process in 30-second chunks
-             'summarizer': pipeline("summarization",
-                                    model="sshleifer/distilbart-cnn-12-6",
-                                    device=device)
-         }
-         return models, None
-     except Exception as e:
-         return None, f"Error loading models: {str(e)}"

  def load_and_convert_audio(audio_path):
      """Load audio using librosa and convert to WAV format"""
-     try:
-         # Load audio with librosa (handles many formats)
-         audio_data, sample_rate = librosa.load(audio_path, sr=16000)  # Whisper expects 16kHz
-
-         # Convert to float32
-         audio_data = audio_data.astype(np.float32)
-
-         # Create a temporary WAV file
-         with tempfile.NamedTemporaryFile(delete=False, suffix='.wav') as temp_wav:
-             sf.write(temp_wav.name, audio_data, sample_rate, format='WAV')
-             return temp_wav.name
-     except Exception as e:
-         raise Exception(f"Error converting audio: {str(e)}")

- def process_audio(audio_path, models):
-     """Process audio file with progress tracking"""
      results = {}
-     temp_wav_path = None
-
-     try:
-         # Convert audio to compatible format
-         with st.spinner('Converting audio format...'):
-             temp_wav_path = load_and_convert_audio(audio_path)
-
-         # Transcription with progress bar
-         with st.spinner('Transcribing audio...'):
-             # Use return_timestamps=True for long audio files
-             transcription = models['transcriber'](
-                 temp_wav_path,
-                 return_timestamps=True  # Remove task specification for English-only model
-             )
-
-         # Extract full text from chunks
-         if isinstance(transcription, dict):
-             results['transcription'] = transcription['text']
-         else:
-             # Combine chunks maintaining order
-             results['transcription'] = ' '.join([chunk['text'] for chunk in transcription])
-
-         # Summarization with chunking for long text
-         with st.spinner('Generating summary...'):
-             text = results['transcription']
-
-             # Split long text into chunks of ~1000 words for summarization
-             words = text.split()
-             chunk_size = 1000
-             chunks = [' '.join(words[i:i + chunk_size])
-                       for i in range(0, len(words), chunk_size)]
-
-             # Summarize each chunk
-             summaries = []
-             progress_text = st.empty()
-             for i, chunk in enumerate(chunks):
-                 progress_text.text(f"Summarizing chunk {i+1} of {len(chunks)}")
-
-                 summary = models['summarizer'](
-                     chunk,
-                     max_length=200,
-                     min_length=50,
-                     truncation=True
-                 )
-                 summaries.append(summary[0]['summary_text'])
-
-             # Combine summaries
-             combined_summary = ' '.join(summaries)
-
-             # Final summarization if multiple chunks exist
-             if len(summaries) > 1:
-                 progress_text.text("Creating final summary...")
-                 combined_summary = models['summarizer'](
-                     combined_summary,
-                     max_length=200,
-                     min_length=50,
-                     truncation=True
-                 )[0]['summary_text']
-
-             progress_text.empty()
-             results['summary'] = combined_summary
-
-             # Clean up summary
-             if not results['summary'].endswith((".", "!", "?")):
-                 last_period_index = results['summary'].rfind(".")
-                 if last_period_index != -1:
-                     results['summary'] = results['summary'][:last_period_index + 1]
-
-     except Exception as e:
-         st.error(f"Error processing audio: {str(e)}")
-         return None
-
-     finally:
-         # Clean up temporary WAV file
-         if temp_wav_path and os.path.exists(temp_wav_path):
-             try:
-                 os.unlink(temp_wav_path)
-             except:
-                 pass

-     return results

- # Main app
- def main():
-     # Load models
-     if not st.session_state.models_loaded:
-         with st.spinner('Loading models... This may take a few minutes...'):
-             models, error = load_models()
-             if error:
-                 st.error(error)
-                 return
-             st.session_state.models_loaded = True
-             st.session_state.models = models
-
-     # Check if an audio file was uploaded via API
-     query_params = st.experimental_get_query_params()
-     if "file" in query_params:
-         audio_file_path = query_params["file"][0]  # This should be the path to the uploaded audio file
-
-         # Process the audio
-         results = process_audio(audio_file_path, st.session_state.models)
-
-         if results:
-             # Return the results as JSON
-             st.json(results)
-         return  # Exit the function early to avoid further processing in the UI
-
-     # Normal Streamlit UI flow for file upload
-     st.write("Upload an audio file of your lecture (supported formats: WAV, MP3, M4A, FLAC)")
-     st.write("Note: Processing long audio files may take several minutes.")
-
-     uploaded_file = st.file_uploader("Choose a file", type=["wav", "mp3", "m4a", "flac"])
-
-     if uploaded_file is not None:
-         # Create a temporary file for the uploaded content
-         with tempfile.NamedTemporaryFile(delete=False, suffix=f".{uploaded_file.name.split('.')[-1]}") as temp_audio_file:
-             temp_audio_file.write(uploaded_file.getbuffer())
-             temp_audio_path = temp_audio_file.name
-
-         try:
-             # Process the audio
-             results = process_audio(temp_audio_path, st.session_state.models)
-
-             if results:
-                 # Display results in organized sections
-                 st.subheader("📝 Transcription")
-                 with st.expander("Show full transcription"):
-                     st.write(results['transcription'])
-
-                 st.subheader("📌 Summary")
-                 st.write(results['summary'])
-
-         except Exception as e:
-             st.error(f"An unexpected error occurred: {str(e)}")
-
-         finally:
-             # Cleanup original uploaded file
-             if os.path.exists(temp_audio_path):
-                 try:
-                     os.unlink(temp_audio_path)
-                 except:
-                     pass

  if __name__ == "__main__":
-     main()

  import os
+ import tempfile
+ import json
  import librosa
  import numpy as np
  import soundfile as sf
+ import torch
+ from flask import Flask, request, jsonify
+ from transformers import pipeline
+
+ # Initialize Flask app
+ app = Flask(__name__)
+
+ # Load models globally to avoid reloading on every request
+ device = 0 if torch.cuda.is_available() else -1
+ models = {
+     'transcriber': pipeline("automatic-speech-recognition", model="openai/whisper-tiny.en", device=device, chunk_length_s=30),
+     'summarizer': pipeline("summarization", model="sshleifer/distilbart-cnn-12-6", device=device)
+ }

  def load_and_convert_audio(audio_path):
      """Load audio using librosa and convert to WAV format"""
+     audio_data, sample_rate = librosa.load(audio_path, sr=16000)  # Whisper expects 16kHz
+     audio_data = audio_data.astype(np.float32)
+
+     with tempfile.NamedTemporaryFile(delete=False, suffix='.wav') as temp_wav:
+         sf.write(temp_wav.name, audio_data, sample_rate, format='WAV')
+         return temp_wav.name

+ def process_audio(audio_path):
+     """Process audio file and return transcription and summary"""
      results = {}
+     temp_wav_path = load_and_convert_audio(audio_path)
+
+     # Transcription
+     transcription = models['transcriber'](temp_wav_path, return_timestamps=True)
+     if isinstance(transcription, dict):
+         results['transcription'] = transcription['text']
+     else:
+         results['transcription'] = ' '.join([chunk['text'] for chunk in transcription])
+
+     # Summarization
+     text = results['transcription']
+     words = text.split()
+     chunk_size = 1000
+     chunks = [' '.join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]
+
+     summaries = []
+     for chunk in chunks:
+         summary = models['summarizer'](chunk, max_length=200, min_length=50, truncation=True)
+         summaries.append(summary[0]['summary_text'])
+
+     combined_summary = ' '.join(summaries)
+     results['summary'] = combined_summary
+
+     # Clean up temporary WAV file
+     if os.path.exists(temp_wav_path):
+         os.unlink(temp_wav_path)
+
+     return results

+ @app.route('/process-audio', methods=['POST'])
+ def process_audio_endpoint():
+     """API endpoint to process audio file"""
+     if 'file' not in request.files:
+         return jsonify({'error': 'No file part'}), 400
+
+     audio_file = request.files['file']
+     temp_audio_path = os.path.join(tempfile.gettempdir(), audio_file.filename)
+     audio_file.save(temp_audio_path)
+
+     results = process_audio(temp_audio_path)
+     os.remove(temp_audio_path)  # Clean up the temporary audio file
+
+     return jsonify(results)

  if __name__ == "__main__":
+     app.run(host='0.0.0.0', port=5000)
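With the Streamlit UI removed, the app is now consumed over HTTP. A minimal client-side sketch of exercising the new endpoint (assuming the server runs locally on port 5000; 'lecture.mp3' is a stand-in filename and the third-party 'requests' library is not part of this commit):

import requests

# POST the audio as multipart form data; the field name 'file'
# matches the request.files['file'] lookup in process_audio_endpoint.
with open('lecture.mp3', 'rb') as f:
    response = requests.post('http://localhost:5000/process-audio', files={'file': f})

print(response.json())  # expected shape: {'transcription': '...', 'summary': '...'}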
requirements.txt CHANGED
@@ -1,6 +1,6 @@
- streamlit
- transformers
- torch
- soundfile
- numpy
- librosa
+ Flask==2.2.3
+ torch==1.12.1
+ transformers==4.20.1
+ librosa==0.9.2
+ soundfile==0.10.3.post1
+ numpy==1.21.6
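The dependencies are now pinned to exact versions, so the Flask service resolves the same package set on every deploy. A typical setup, assuming a standard pip environment:

pip install -r requirements.txt
python app.py  # serves the API on 0.0.0.0:5000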