Spaces:
Build error
Build error
Removed final.wav to comply with Hugging Face restrictions
Browse files- app.py +91 -4
- catract.png +0 -0
app.py
CHANGED
@@ -1,7 +1,94 @@
|
|
|
|
|
|
1 |
import gradio as gr
|
|
|
2 |
|
3 |
-
|
4 |
-
|
|
|
|
|
5 |
|
6 |
-
|
7 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Import necessary libraries
import os
import gradio as gr
from pydub import AudioSegment  # used to convert the generated MP3 reply to WAV

# Importing AI processing functions (project-local modules; not shown here)
from brain_of_the_doctor import encode_image, analyze_image_with_query
from voice_of_the_patient import transcribe_with_groq
from voice_of_the_doctor import text_to_speech_with_elevenlabs

# System prompt for the AI doctor.
# NOTE: downstream code concatenates the transcribed patient speech directly
# onto this string before sending it to the vision model.
system_prompt = """You have to act as a professional doctor, I know you are not but this is for learning purposes.
With what I see, I think you have .... Do you find anything wrong with it medically?
If you make a differential, suggest some remedies for them. Do not add any numbers or special characters in
your response. Your response should be in one long paragraph. Always answer as if you are answering a real person.
Do not respond as an AI model in markdown. Keep your answer concise (max 2 sentences). No preamble, start your answer right away please."""
17 |
+
|
18 |
+
|
19 |
+
# Function to process inputs
def process_inputs(audio_filepath, image_filepath):
    """Handle one patient interaction end to end.

    Steps: transcribe the recorded audio (Groq Whisper), analyze the optional
    image together with the transcript (vision LLM), then synthesize the
    doctor's reply to speech (ElevenLabs) and convert it MP3 -> WAV.

    Args:
        audio_filepath: Path to the recorded audio file, or None/"" if absent.
        image_filepath: Path to the uploaded image file, or None/"" if absent.

    Returns:
        (speech_to_text_output, doctor_response, audio_path) where audio_path
        is the WAV file path on success or None on any failure. Returning None
        (rather than an error string) matters: the third output is wired to a
        gr.Audio component, which would try to treat an error string as a file
        path and fail to render.
    """
    print(f"DEBUG: Received audio file path: {audio_filepath}")

    # Guard clause: without a readable audio file there is nothing to do.
    if not audio_filepath or not os.path.exists(audio_filepath):
        return "Error: No valid audio file provided.", "No response generated.", None

    try:
        # Convert speech to text using Groq API
        speech_to_text_output = transcribe_with_groq(
            GROQ_API_KEY=os.getenv("GROQ_API_KEY"),
            audio_filepath=audio_filepath,
            stt_model="whisper-large-v3",
        )
    except Exception as e:
        return f"Error transcribing audio: {e}", "No response generated.", None

    # Handle image analysis (optional input; a missing image is not an error).
    if image_filepath and os.path.exists(image_filepath):
        try:
            encoded_img = encode_image(image_filepath)
            doctor_response = analyze_image_with_query(
                # The transcript is appended directly to the system prompt.
                query=system_prompt + speech_to_text_output,
                encoded_image=encoded_img,
                model="llama-3.2-11b-vision-preview",
            )
        except Exception as e:
            doctor_response = f"Error analyzing image: {e}"
    else:
        doctor_response = "No image provided for analysis."

    # Convert doctor's response to speech using ElevenLabs
    output_wav = "final.wav"
    intermediate_mp3 = "final.mp3"
    try:
        text_to_speech_with_elevenlabs(
            input_text=doctor_response,
            output_filepath=intermediate_mp3,  # Generate MP3 first
        )

        # Convert MP3 to WAV for the gr.Audio output
        if os.path.exists(intermediate_mp3):
            audio = AudioSegment.from_mp3(intermediate_mp3)
            audio.export(output_wav, format="wav")
            # Clean up the intermediate MP3 so stray audio files don't
            # accumulate in the working directory.
            os.remove(intermediate_mp3)
        else:
            # BUGFIX: previously returned an error string in the gr.Audio
            # slot, which Gradio would misinterpret as a file path.
            print("DEBUG: TTS produced no MP3 file; returning no audio.")
            return speech_to_text_output, doctor_response, None
    except Exception as e:
        print(f"DEBUG: Error generating speech: {e}")
        return speech_to_text_output, doctor_response, None

    return speech_to_text_output, doctor_response, output_wav
75 |
+
|
76 |
+
|
77 |
+
# Create Gradio Interface — components are built up front and named so the
# wiring to process_inputs (audio + image in; two textboxes + audio out) is
# easy to follow.
mic_input = gr.Audio(sources=["microphone"], type="filepath")
photo_input = gr.Image(type="filepath")

transcript_box = gr.Textbox(label="Speech to Text")
diagnosis_box = gr.Textbox(label="Doctor's Response")
voice_output = gr.Audio(label="Doctor's Voice Response")

iface = gr.Interface(
    fn=process_inputs,
    inputs=[mic_input, photo_input],
    outputs=[transcript_box, diagnosis_box, voice_output],
    title="AI Doctor with Vision and Voice",
    description="Upload an image and speak into the microphone. The AI doctor will analyze the image, transcribe your speech, and respond in both text and voice.",
)

# Launch
iface.launch()
|
catract.png
DELETED
Binary file (384 kB)
|
|