MalikIbrar committed on
Commit
c41e64b
1 Parent(s): 1d22b69

Add application file

Browse files
Files changed (2) hide show
  1. main.py +83 -0
  2. requirements.txt +13 -0
main.py ADDED
@@ -0,0 +1,83 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import FastAPI, File, UploadFile, HTTPException
2
+ from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
3
+ import torch
4
+ import uvicorn
5
+ import librosa
6
+ import soundfile as sf
7
+ from fastapi.middleware.cors import CORSMiddleware
8
+ import os
9
+ import tempfile
10
+
# Initialize FastAPI
app = FastAPI()

# NOTE(review): a wildcard origin together with allow_credentials=True appears
# to permit credentialed cross-origin requests from any site. The settings are
# kept unchanged so existing clients keep working; confirm whether explicit
# origins (or allow_credentials=False) are more appropriate for deployment.
app.add_middleware(
    CORSMiddleware,
    allow_origins=['*'],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Pick the device and precision once: half precision on GPU, full on CPU.
_cuda_available = torch.cuda.is_available()
device = "cuda:0" if _cuda_available else "cpu"
torch_dtype = torch.float16 if _cuda_available else torch.float32

model_id = "openai/whisper-large-v3"

# Use the copy stored at ./<model_id> when that path exists; otherwise pass
# the bare model id to from_pretrained. Both cases share one set of load
# arguments, so only the source string differs.
_local_model_path = f"./{model_id}"
_model_source = _local_model_path if os.path.exists(_local_model_path) else model_id

model = AutoModelForSpeechSeq2Seq.from_pretrained(
    _model_source,
    torch_dtype=torch_dtype,
    low_cpu_mem_usage=True,
    use_safetensors=True,
)
processor = AutoProcessor.from_pretrained(_model_source)

model.to(device)

# Speech-recognition pipeline used by the /transcribe endpoint.
pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    torch_dtype=torch_dtype,
    device=device,
)
45
+
# API endpoint to upload audio and get the transcribed text
@app.post("/transcribe")
async def transcribe_audio(file: UploadFile = File(...)):
    """Transcribe an uploaded audio file and return the recognized text.

    The upload is spooled to a temporary file so librosa can decode it, the
    audio is loaded at 16 kHz, and the resulting samples are handed to the
    module-level speech-recognition pipeline.

    Args:
        file: The uploaded audio file from the multipart request.

    Returns:
        A dict of the form ``{"text": <transcription>}``.

    Raises:
        HTTPException: 400 when the upload is empty; 500 (with the underlying
            error message in ``detail``) when decoding or transcription fails.
    """
    # Initialised before the try block so cleanup never touches an unbound name.
    temp_path = None
    try:
        payload = await file.read()
        if not payload:
            raise HTTPException(status_code=400, detail="Uploaded file is empty.")

        # delete=False: the file must outlive the `with` block so librosa can
        # reopen it by path; it is removed in `finally` below.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
            temp_path = temp_file.name
            temp_file.write(payload)

        # Decode and resample to 16 kHz.
        audio, _ = librosa.load(temp_path, sr=16000)

        # Pass the decoded samples straight to the pipeline instead of writing
        # them back to disk and having the pipeline decode the file again.
        result = pipe({"raw": audio, "sampling_rate": 16000})

        # Return the transcribed text
        return {"text": result["text"]}

    except HTTPException:
        # Deliberate HTTP errors (e.g. the 400 above) must not be rewrapped as 500.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error occurred: {str(e)}") from e
    finally:
        # Single cleanup path for both success and failure.
        if temp_path is not None and os.path.exists(temp_path):
            os.remove(temp_path)
76
+
@app.get("/")
async def root():
    """Return the static welcome payload served at the API root."""
    greeting = "Welcome to the speech-to-text API!"
    return {"message": greeting}
80
+
# Start the Uvicorn server when this module is executed as a script.
if __name__ == "__main__":
    uvicorn.run(
        app,
        host="0.0.0.0",
        port=8000,
    )
requirements.txt ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ accelerate==0.34.2
2
+ aiohttp==3.10.5
3
+ attrs==24.2.0
4
+ fastapi==0.114.1
5
+ librosa==0.10.2.post1
6
+ numpy==1.26.4
7
+ pandas==2.1.4
8
+ requests==2.32.3
9
+ soundfile==0.12.1
10
+ torch==2.4.1
11
+ transformers==4.44.2
12
+ uvicorn==0.30.6
13
+ python-multipart