import os
import io
import logging
from fastapi import FastAPI, File, UploadFile, HTTPException
from fastapi.responses import JSONResponse
import torch
# Import specific classes from transformers
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
import librosa
import soundfile # Decoding backend used by librosa; its SoundFileError is caught during audio loading below
import numpy as np

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# --- Configuration ---
MODEL_NAME = "openai/whisper-large-v3"
DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"
# Ensure float32 on CPU, float16 on GPU if available
TORCH_DTYPE = torch.float16 if DEVICE != "cpu" else torch.float32
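
# (Optional) The model name could be made configurable instead of hard-coded.
# A minimal sketch, commented out -- the WHISPER_MODEL environment variable is
# an assumption for illustration, not something this app defines:
# MODEL_NAME = os.environ.get("WHISPER_MODEL", MODEL_NAME)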

logger.info(f"Using device: {DEVICE}")
logger.info(f"Using dtype: {TORCH_DTYPE}")
logger.info(f"Loading model and processor for: {MODEL_NAME}...")

# --- Load the Model and Processor Explicitly ---
try:
    logger.info("Loading model...")
    model = AutoModelForSpeechSeq2Seq.from_pretrained(
        MODEL_NAME,
        torch_dtype=TORCH_DTYPE,
        low_cpu_mem_usage=True, # Crucial for large models on limited RAM
        use_safetensors=True    # Explicitly use safetensors
        # If using GPU and have flash-attn installed, could add:
        # attn_implementation="flash_attention_2"
    )
    # Move model to device *after* loading with low_cpu_mem_usage
    model.to(DEVICE)
    logger.info("Model loaded successfully.")

    logger.info("Loading processor...")
    processor = AutoProcessor.from_pretrained(MODEL_NAME)
    logger.info("Processor loaded successfully.")

    logger.info("Creating pipeline...")
    # Create the pipeline using the pre-loaded model and processor components
    pipe = pipeline(
        "automatic-speech-recognition",
        model=model,
        tokenizer=processor.tokenizer,
        feature_extractor=processor.feature_extractor,
        torch_dtype=TORCH_DTYPE,
        device=DEVICE,
        # Note: chunk_length_s and batch_size are inference-time args,
        # better applied when calling pipe(), not during initialization.
    )
    logger.info("Pipeline created successfully.")

except Exception as e:
    logger.error(f"Error during model/processor loading or pipeline creation: {e}", exc_info=True)
    # Exit if loading fails
    raise RuntimeError(f"Failed to initialize model pipeline for {MODEL_NAME}") from e
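
# Optional startup sanity check -- a sketch, not part of the original flow.
# Uncommenting the two lines below runs a tiny warm-up inference on one second
# of silence so loading problems surface before the first real request:
# _warmup_audio = np.zeros(16000, dtype=np.float32)  # 1 s of silence at 16 kHz
# logger.info(f"Warm-up transcription: {pipe(_warmup_audio)['text']!r}")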

# --- FastAPI App ---
app = FastAPI()

@app.get("/")
async def read_root():
    """ Basic endpoint to check if the API is running """
    return {"message": f"Whisper API using {MODEL_NAME} is running."}

@app.post("/transcribe")
async def transcribe_audio(file: UploadFile = File(...)):
    """
    Endpoint to transcribe an uploaded audio file (e.g., MP3).
    """
    if not file:
        raise HTTPException(status_code=400, detail="No file uploaded.")

    filename = file.filename
    logger.info(f"Received file: {filename}")
    logger.info(f"Content type: {file.content_type}")

    # Read file content into memory
    try:
        contents = await file.read()
        logger.info(f"File read into memory ({len(contents)} bytes).")
    except Exception as e:
        logger.error(f"Error reading file: {e}", exc_info=True)
        raise HTTPException(status_code=500, detail=f"Error reading uploaded file: {e}")
    finally:
        await file.close() # Ensure file handle is closed

    # Process the audio file
    try:
        logger.info("Processing audio...")
        audio_stream = io.BytesIO(contents)
        # Load audio using librosa, ensuring 16kHz mono
        audio_input, sample_rate = librosa.load(audio_stream, sr=16000, mono=True)
        logger.info(f"Audio loaded. Sample rate: {sample_rate}, Duration: {len(audio_input)/sample_rate:.2f}s")

        if not isinstance(audio_input, np.ndarray):
            audio_input = np.array(audio_input)

        logger.info("Starting transcription...")
        # Perform inference using the pipeline
        # Apply chunking and batching here for inference
        result = pipe(audio_input.copy(),
                      chunk_length_s=30, # Process in 30-second chunks
                      batch_size=4,      # Adjust based on memory (start low, e.g., 1 or 2 for free tier)
                      return_timestamps=False # Set to True or "word" if needed
                      # generate_kwargs={"language": "en"} # Optional: Force language
                     )

        transcription = result["text"]
        logger.info(f"Transcription successful for {filename}.")

        return JSONResponse(content={
            "filename": filename,
            "transcription": transcription
        })

    except (librosa.util.exceptions.LibrosaError, soundfile.SoundFileError) as e:
        logger.error(f"Error processing audio file {filename} with librosa: {e}", exc_info=True)
        raise HTTPException(status_code=400, detail=f"Error processing audio file: {e}. Ensure it's a valid audio format.")
    except Exception as e:
        logger.error(f"Transcription failed for {filename}: {e}", exc_info=True)
        # Check for common errors like Out of Memory (OOM)
        if "out of memory" in str(e).lower():
             logger.error("Potential Out-of-Memory error. The model might be too large for the available resources.")
             raise HTTPException(status_code=507, detail=f"Transcription failed: Insufficient Memory. Try a smaller model or shorter audio.")
        else:
             raise HTTPException(status_code=500, detail=f"Transcription failed: {e}")
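
# Example client call for this endpoint -- a sketch for local testing. It
# assumes the `requests` package is installed and a local "sample.mp3" exists;
# neither is required by the app itself:
#
#   import requests
#   with open("sample.mp3", "rb") as f:
#       resp = requests.post(
#           "http://localhost:7860/transcribe",
#           files={"file": ("sample.mp3", f, "audio/mpeg")},
#       )
#   resp.raise_for_status()
#   print(resp.json()["transcription"])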

# Optional: Add health check endpoint
@app.get("/health")
async def health_check():
    """ Health check endpoint """
    try:
        # Check if the pipeline object exists and seems valid
        if pipe and pipe.model and pipe.tokenizer and pipe.feature_extractor:
            return {"status": "ok", "model": MODEL_NAME, "device": DEVICE}
        else:
            return {"status": "error", "detail": "Model pipeline component missing"}
    except Exception as e:
        logger.error(f"Health check failed: {e}", exc_info=True)
        return {"status": "error", "detail": str(e)}

# --- Run with Uvicorn (for local testing) ---
if __name__ == "__main__":
    import uvicorn
    logger.info("Starting Uvicorn server locally...")
    uvicorn.run("app:app", host="0.0.0.0", port=7860, reload=True) # Added reload for local dev
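
# For a quick manual test once the server is up (host and port match the
# defaults above; "sample.mp3" is illustrative):
#   curl -X POST "http://localhost:7860/transcribe" -F "file=@sample.mp3;type=audio/mpeg"
# The uvicorn CLI works as well, assuming this file is saved as app.py:
#   uvicorn app:app --host 0.0.0.0 --port 7860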