opex792 committed
Commit a7193be · verified · 1 Parent(s): d049016

Update app.py

Files changed (1)
app.py +53 -41
app.py CHANGED
@@ -4,7 +4,8 @@ import logging
 from fastapi import FastAPI, File, UploadFile, HTTPException
 from fastapi.responses import JSONResponse
 import torch
-import transformers
+# Import specific classes from transformers
+from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
 import librosa
 import soundfile  # Often needed by librosa/transformers for specific formats
 import numpy as np
@@ -14,39 +15,52 @@ logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 
 # --- Configuration ---
-# IMPORTANT: whisper-large-v3 is very resource intensive.
-# Consider 'openai/whisper-base' or 'openai/whisper-small' for HF Free Spaces
 MODEL_NAME = "openai/whisper-large-v3"
-# Set device (use GPU if available, otherwise CPU)
-# Note: Free HF Spaces typically don't provide GPUs. CPU will be slow.
 DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"
-# Set torch dtype for potential speedup/memory saving (optional)
-# Use float16 on GPU if supported, otherwise float32
-TORCH_DTYPE = torch.float16 if torch.cuda.is_available() and DEVICE != "cpu" else torch.float32  # Ensure float32 on CPU
+# Ensure float32 on CPU, float16 on GPU if available
+TORCH_DTYPE = torch.float16 if torch.cuda.is_available() and DEVICE != "cpu" else torch.float32
 
 logger.info(f"Using device: {DEVICE}")
 logger.info(f"Using dtype: {TORCH_DTYPE}")
-logger.info(f"Loading model: {MODEL_NAME}...")
+logger.info(f"Loading model and processor for: {MODEL_NAME}...")
 
-# --- Load the Model and Processor ---
-# Use try-except for robust model loading
+# --- Load the Model and Processor Explicitly ---
 try:
-    # Load the pipeline
-    # Add model_kwargs to potentially help with loading, especially safetensors
-    pipe = transformers.pipeline(
+    logger.info("Loading model...")
+    model = AutoModelForSpeechSeq2Seq.from_pretrained(
+        MODEL_NAME,
+        torch_dtype=TORCH_DTYPE,
+        low_cpu_mem_usage=True,  # Crucial for large models on limited RAM
+        use_safetensors=True  # Explicitly use safetensors
+        # If using a GPU with flash-attn installed, could add:
+        # attn_implementation="flash_attention_2"
+    )
+    # Move model to device *after* loading with low_cpu_mem_usage
+    model.to(DEVICE)
+    logger.info("Model loaded successfully.")
+
+    logger.info("Loading processor...")
+    processor = AutoProcessor.from_pretrained(MODEL_NAME)
+    logger.info("Processor loaded successfully.")
+
+    logger.info("Creating pipeline...")
+    # Create the pipeline from the pre-loaded model and processor components
+    pipe = pipeline(
         "automatic-speech-recognition",
-        model=MODEL_NAME,
+        model=model,
+        tokenizer=processor.tokenizer,
+        feature_extractor=processor.feature_extractor,
         torch_dtype=TORCH_DTYPE,
         device=DEVICE,
-        model_kwargs={"low_cpu_mem_usage": True}  # Helps with large models on CPU
-        # Note: use_safetensors=True is often default now if safetensors lib is installed
-        # If still failing, you could try loading model/processor explicitly first
+        # Note: chunk_length_s and batch_size are inference-time arguments,
+        # better applied when calling pipe(), not during initialization.
     )
-    logger.info("Model loaded successfully.")
+    logger.info("Pipeline created successfully.")
+
 except Exception as e:
-    logger.error(f"Error loading model: {e}", exc_info=True)
-    # Exit if model loading fails, as the app is useless without it
-    raise RuntimeError(f"Failed to load model {MODEL_NAME}") from e
+    logger.error(f"Error during model/processor loading or pipeline creation: {e}", exc_info=True)
+    # Exit if loading fails; the app is useless without the model
+    raise RuntimeError(f"Failed to initialize model pipeline for {MODEL_NAME}") from e
 
 # --- FastAPI App ---
 app = FastAPI()
@@ -81,33 +95,26 @@ async def transcribe_audio(file: UploadFile = File(...)):
     # Process the audio file
     try:
         logger.info("Processing audio...")
-        # Use io.BytesIO to treat the byte content like a file
         audio_stream = io.BytesIO(contents)
-
-        # Load audio using librosa. Whisper expects 16kHz mono.
-        # Librosa automatically resamples and converts to mono by default.
-        # Specify sr=16000 to ensure the correct sample rate.
+        # Load audio using librosa, ensuring 16kHz mono
        audio_input, sample_rate = librosa.load(audio_stream, sr=16000, mono=True)
        logger.info(f"Audio loaded. Sample rate: {sample_rate}, Duration: {len(audio_input)/sample_rate:.2f}s")
 
-        # Ensure audio_input is a NumPy array (librosa usually returns this)
        if not isinstance(audio_input, np.ndarray):
            audio_input = np.array(audio_input)
 
        logger.info("Starting transcription...")
-        # Perform inference
-        # The pipeline can often handle numpy arrays directly
-        # Set chunk_length_s for potentially better handling of long files on limited memory
-        result = pipe(audio_input.copy(),  # Pass a copy to avoid potential issues
+        # Perform inference using the pipeline
+        # Apply chunking and batching here for inference
+        result = pipe(audio_input.copy(),
                      chunk_length_s=30,  # Process in 30-second chunks
-                      batch_size=4,  # Adjust batch size based on memory
-                      # Add generation config for more control if needed
-                      # generate_kwargs={"language": "en"}  # Example: Force language
+                      batch_size=4,  # Adjust based on memory (start low, e.g. 1 or 2, on the free tier)
+                      return_timestamps=False  # Set to True or "word" if needed
+                      # generate_kwargs={"language": "en"}  # Optional: force language
                      )
 
        transcription = result["text"]
        logger.info(f"Transcription successful for {filename}.")
-        # logger.debug(f"Transcription result: {transcription}")  # Optional: log full text
 
        return JSONResponse(content={
            "filename": filename,
@@ -119,18 +126,23 @@ async def transcribe_audio(file: UploadFile = File(...)):
        raise HTTPException(status_code=400, detail=f"Error processing audio file: {e}. Ensure it's a valid audio format.")
    except Exception as e:
        logger.error(f"Transcription failed for {filename}: {e}", exc_info=True)
-        raise HTTPException(status_code=500, detail=f"Transcription failed: {e}")
+        # Check for common errors like out-of-memory (OOM)
+        if "out of memory" in str(e).lower():
+            logger.error("Potential out-of-memory error. The model might be too large for the available resources.")
+            raise HTTPException(status_code=507, detail="Transcription failed: insufficient memory. Try a smaller model or shorter audio.")
+        else:
+            raise HTTPException(status_code=500, detail=f"Transcription failed: {e}")
 
 # Optional: Add health check endpoint
 @app.get("/health")
 async def health_check():
     """ Health check endpoint """
     try:
-        if pipe:
+        # Check that the pipeline object exists and its components are loaded
+        if pipe and pipe.model and pipe.feature_extractor:
            return {"status": "ok", "model": MODEL_NAME, "device": DEVICE}
        else:
-            # This state shouldn't be reachable if startup fails
-            return {"status": "error", "detail": "Model pipeline not loaded"}
+            return {"status": "error", "detail": "Model pipeline component missing"}
    except Exception as e:
        logger.error(f"Health check failed: {e}", exc_info=True)
        return {"status": "error", "detail": str(e)}
@@ -139,4 +151,4 @@ async def health_check():
 if __name__ == "__main__":
     import uvicorn
     logger.info("Starting Uvicorn server locally...")
-    uvicorn.run(app, host="0.0.0.0", port=7860)
+    uvicorn.run("app:app", host="0.0.0.0", port=7860, reload=True)  # reload=True for local dev