Update app.py
app.py
CHANGED
@@ -4,7 +4,8 @@ import logging
from fastapi import FastAPI, File, UploadFile, HTTPException
from fastapi.responses import JSONResponse
import torch
-from transformers import pipeline
+# Import specific classes from transformers
+from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
import librosa
import soundfile # Often needed by librosa/transformers for specific formats
import numpy as np
@@ -14,39 +15,52 @@ logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# --- Configuration ---
-# IMPORTANT: whisper-large-v3 is very resource intensive.
-# Consider 'openai/whisper-base' or 'openai/whisper-small' for HF Free Spaces
MODEL_NAME = "openai/whisper-large-v3"
-# Set device (use GPU if available, otherwise CPU)
-# Note: Free HF Spaces typically don't provide GPUs. CPU will be slow.
DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"
-#
-
-TORCH_DTYPE = torch.float16 if torch.cuda.is_available() and DEVICE != "cpu" else torch.float32 # Ensure float32 on CPU
+# Ensure float32 on CPU, float16 on GPU if available
+TORCH_DTYPE = torch.float16 if torch.cuda.is_available() and DEVICE != "cpu" else torch.float32

logger.info(f"Using device: {DEVICE}")
logger.info(f"Using dtype: {TORCH_DTYPE}")
-logger.info(f"Loading model: {MODEL_NAME}...")
+logger.info(f"Loading model and processor for: {MODEL_NAME}...")

-# --- Load the Model and Processor ---
-# Use try-except for robust model loading
+# --- Load the Model and Processor Explicitly ---
try:
-
-
-
+    logger.info("Loading model...")
+    model = AutoModelForSpeechSeq2Seq.from_pretrained(
+        MODEL_NAME,
+        torch_dtype=TORCH_DTYPE,
+        low_cpu_mem_usage=True, # Crucial for large models on limited RAM
+        use_safetensors=True # Explicitly use safetensors
+        # If using GPU and have flash-attn installed, could add:
+        # attn_implementation="flash_attention_2"
+    )
+    # Move model to device *after* loading with low_cpu_mem_usage
+    model.to(DEVICE)
+    logger.info("Model loaded successfully.")
+
+    logger.info("Loading processor...")
+    processor = AutoProcessor.from_pretrained(MODEL_NAME)
+    logger.info("Processor loaded successfully.")
+
+    logger.info("Creating pipeline...")
+    # Create the pipeline using the pre-loaded model and processor components
+    pipe = pipeline(
        "automatic-speech-recognition",
-        model=
+        model=model,
+        tokenizer=processor.tokenizer,
+        feature_extractor=processor.feature_extractor,
        torch_dtype=TORCH_DTYPE,
        device=DEVICE,
-
-        #
-        # If still failing, you could try loading model/processor explicitly first
+        # Note: chunk_length_s and batch_size are inference-time args,
+        # better applied when calling pipe(), not during initialization.
    )
-    logger.info("
+    logger.info("Pipeline created successfully.")
+
except Exception as e:
-    logger.error(f"Error loading
-    # Exit if
-    raise RuntimeError(f"Failed to
+    logger.error(f"Error during model/processor loading or pipeline creation: {e}", exc_info=True)
+    # Exit if loading fails
+    raise RuntimeError(f"Failed to initialize model pipeline for {MODEL_NAME}") from e

# --- FastAPI App ---
app = FastAPI()
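The explicit-loading pattern introduced in this hunk can be sanity-checked outside the Space as a small standalone script. This is only a sketch: it assumes the same transformers/torch/librosa stack, swaps in the smaller openai/whisper-base checkpoint (which the removed comments suggested for free-tier hardware), and uses a placeholder file name sample.wav.

import librosa
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline

model_name = "openai/whisper-base"  # smaller checkpoint for a quick local check
device = "cuda:0" if torch.cuda.is_available() else "cpu"
dtype = torch.float16 if device != "cpu" else torch.float32

model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_name, torch_dtype=dtype, low_cpu_mem_usage=True, use_safetensors=True
)
model.to(device)
processor = AutoProcessor.from_pretrained(model_name)

asr = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    torch_dtype=dtype,
    device=device,
)

# "sample.wav" is a placeholder; load at 16 kHz mono, exactly as the endpoint does
audio, _ = librosa.load("sample.wav", sr=16000, mono=True)
print(asr(audio.copy(), chunk_length_s=30, batch_size=2)["text"])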
@@ -81,33 +95,26 @@ async def transcribe_audio(file: UploadFile = File(...)):
    # Process the audio file
    try:
        logger.info("Processing audio...")
-        # Use io.BytesIO to treat the byte content like a file
        audio_stream = io.BytesIO(contents)
-
-        # Load audio using librosa. Whisper expects 16kHz mono.
-        # Librosa automatically resamples and converts to mono by default.
-        # Specify sr=16000 to ensure the correct sample rate.
+        # Load audio using librosa, ensuring 16kHz mono
        audio_input, sample_rate = librosa.load(audio_stream, sr=16000, mono=True)
        logger.info(f"Audio loaded. Sample rate: {sample_rate}, Duration: {len(audio_input)/sample_rate:.2f}s")

-        # Ensure audio_input is a NumPy array (librosa usually returns this)
        if not isinstance(audio_input, np.ndarray):
            audio_input = np.array(audio_input)

        logger.info("Starting transcription...")
-        # Perform inference
-        #
-
-        result = pipe(audio_input.copy(), # Pass a copy to avoid potential issues
+        # Perform inference using the pipeline
+        # Apply chunking and batching here for inference
+        result = pipe(audio_input.copy(),
                      chunk_length_s=30, # Process in 30-second chunks
-                      batch_size=4, # Adjust
-                      #
-                      # generate_kwargs={"language": "en"} #
+                      batch_size=4, # Adjust based on memory (start low, e.g., 1 or 2 for free tier)
+                      return_timestamps=False # Set to True or "word" if needed
+                      # generate_kwargs={"language": "en"} # Optional: Force language
                      )

        transcription = result["text"]
        logger.info(f"Transcription successful for {filename}.")
-        # logger.debug(f"Transcription result: {transcription}") # Optional: log full text

        return JSONResponse(content={
            "filename": filename,
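The handler above accepts a multipart file upload, so it can be exercised with any HTTP client once the app is running. A minimal sketch with requests follows; the /transcribe path is an assumption (the @app.post decorator sits outside this diff), while the form field name matches the `file` parameter.

import requests

# Assumed URL: the route decorator is not shown in this diff, "/transcribe" is a guess.
url = "http://localhost:7860/transcribe"

with open("sample.wav", "rb") as fh:  # placeholder audio file
    response = requests.post(
        url,
        files={"file": ("sample.wav", fh, "audio/wav")},  # field name must match the `file` parameter
        timeout=600,  # whisper-large-v3 on CPU can take minutes per request
    )

print(response.status_code)
print(response.json())  # contains "filename" plus the transcription fields built above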
@@ -119,18 +126,23 @@ async def transcribe_audio(file: UploadFile = File(...)):
        raise HTTPException(status_code=400, detail=f"Error processing audio file: {e}. Ensure it's a valid audio format.")
    except Exception as e:
        logger.error(f"Transcription failed for {filename}: {e}", exc_info=True)
-
+        # Check for common errors like Out of Memory (OOM)
+        if "out of memory" in str(e).lower():
+            logger.error("Potential Out-of-Memory error. The model might be too large for the available resources.")
+            raise HTTPException(status_code=507, detail=f"Transcription failed: Insufficient Memory. Try a smaller model or shorter audio.")
+        else:
+            raise HTTPException(status_code=500, detail=f"Transcription failed: {e}")

# Optional: Add health check endpoint
@app.get("/health")
async def health_check():
    """ Health check endpoint """
    try:
-        if
+        # Check if the pipeline object exists and seems valid
+        if pipe and pipe.model and pipe.tokenizer and pipe.feature_extractor:
            return {"status": "ok", "model": MODEL_NAME, "device": DEVICE}
        else:
-
-            return {"status": "error", "detail": "Model pipeline not loaded"}
+            return {"status": "error", "detail": "Model pipeline component missing"}
    except Exception as e:
        logger.error(f"Health check failed: {e}", exc_info=True)
        return {"status": "error", "detail": str(e)}
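Because loading whisper-large-v3 can take a while, the /health route above is a convenient gate before sending audio, e.g. from a deploy script. A small polling sketch, again assuming the service listens on localhost:7860.

import time

import requests

def wait_until_ready(base_url="http://localhost:7860", attempts=30, delay=10):
    """Poll /health until the pipeline reports status "ok" or give up."""
    for _ in range(attempts):
        try:
            payload = requests.get(f"{base_url}/health", timeout=5).json()
            if payload.get("status") == "ok":
                return payload  # includes "model" and "device" as returned above
        except requests.RequestException:
            pass  # server may still be starting or loading the model
        time.sleep(delay)
    raise TimeoutError("ASR service did not become healthy in time")

print(wait_until_ready())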
@@ -139,4 +151,4 @@ async def health_check():
if __name__ == "__main__":
    import uvicorn
    logger.info("Starting Uvicorn server locally...")
-    uvicorn.run(app, host="0.0.0.0", port=7860)
+    uvicorn.run("app:app", host="0.0.0.0", port=7860, reload=True) # Added reload for local dev