salzzyy committed on
Commit
8bb232d
·
1 Parent(s): 066999f

Removed final.wav to comply with Hugging Face restrictions

Browse files
Files changed (2) hide show
  1. app.py +91 -4
  2. catract.png +0 -0
app.py CHANGED
@@ -1,7 +1,94 @@
 
 
1
  import gradio as gr
 
2
 
3
- def greet(name):
4
- return "Hello " + name + "!!"
 
 
5
 
6
- demo = gr.Interface(fn=greet, inputs="text", outputs="text")
7
- demo.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Import necessary libraries
2
+ import os
3
  import gradio as gr
4
+ from pydub import AudioSegment
5
 
6
+ # Importing AI processing functions
7
+ from brain_of_the_doctor import encode_image, analyze_image_with_query
8
+ from voice_of_the_patient import transcribe_with_groq
9
+ from voice_of_the_doctor import text_to_speech_with_elevenlabs
10
 
11
+ # System prompt for the AI doctor
12
# Instruction text that is placed in front of the patient's transcribed speech
# when the combined query is sent to the vision model. Kept as five separate
# lines joined by newlines.
system_prompt = "\n".join(
    (
        "You have to act as a professional doctor, I know you are not but this is for learning purposes.",
        "With what I see, I think you have .... Do you find anything wrong with it medically?",
        "If you make a differential, suggest some remedies for them. Do not add any numbers or special characters in",
        "your response. Your response should be in one long paragraph. Always answer as if you are answering a real person.",
        "Do not respond as an AI model in markdown. Keep your answer concise (max 2 sentences). No preamble, start your answer right away please.",
    )
)
17
+
18
+
19
+ # Function to process inputs
20
def process_inputs(audio_filepath, image_filepath):
    """Transcribe the recording, analyse the image, and synthesise a spoken reply.

    Args:
        audio_filepath: Path to the recorded audio file, or a falsy value when
            nothing was recorded.
        image_filepath: Path to the uploaded image, or a falsy value when no
            image was supplied.

    Returns:
        A 3-tuple ``(transcript, doctor_response, audio_path)`` feeding the
        three Gradio outputs in order. ``audio_path`` is the path of a WAV
        file, or ``None`` whenever no audio could be produced. Failures are
        reported as text in the first or second element and never in the
        audio slot, because that slot is bound to an audio component that
        would treat any string as a file path.
    """
    import tempfile  # local import: only this function needs it

    print(f"DEBUG: Received audio file path: {audio_filepath}")

    # Guard clause: nothing to transcribe without an existing recording.
    if not audio_filepath or not os.path.exists(audio_filepath):
        return "Error: No valid audio file provided.", "No response generated.", None

    # Speech -> text.
    try:
        speech_to_text_output = transcribe_with_groq(
            GROQ_API_KEY=os.getenv("GROQ_API_KEY"),
            audio_filepath=audio_filepath,
            stt_model="whisper-large-v3",
        )
    except Exception as e:
        return f"Error transcribing audio: {e}", "No response generated.", None

    # Image + transcript -> doctor's textual answer.
    if image_filepath and os.path.exists(image_filepath):
        try:
            encoded_img = encode_image(image_filepath)
            doctor_response = analyze_image_with_query(
                # Separate the instructions from the patient's words so the
                # two do not run together ("...please.I have...").
                query=system_prompt + " " + speech_to_text_output,
                encoded_image=encoded_img,
                model="llama-3.2-11b-vision-preview",
            )
        except Exception as e:
            doctor_response = f"Error analyzing image: {e}"
    else:
        doctor_response = "No image provided for analysis."

    # Text -> speech. Per-request temporary files are used so that concurrent
    # requests cannot overwrite each other's audio and a stale MP3 from an
    # earlier run can never be mistaken for fresh output.
    try:
        with tempfile.TemporaryDirectory() as scratch_dir:
            mp3_path = os.path.join(scratch_dir, "final.mp3")
            text_to_speech_with_elevenlabs(
                input_text=doctor_response,
                output_filepath=mp3_path,  # Generate MP3 first
            )

            if not os.path.exists(mp3_path):
                return (
                    speech_to_text_output,
                    f"{doctor_response}\n\nError: Failed to generate audio.",
                    None,
                )

            # The WAV must outlive this function, so it lives outside the
            # scratch directory that is removed on exit from the ``with``.
            wav_fd, output_wav = tempfile.mkstemp(suffix=".wav")
            os.close(wav_fd)

            # Convert MP3 to WAV; export() hands back an open file object,
            # which is closed here so the handle is not leaked.
            exported = AudioSegment.from_mp3(mp3_path).export(output_wav, format="wav")
            exported.close()
    except Exception as e:
        return (
            speech_to_text_output,
            f"{doctor_response}\n\nError generating speech: {e}",
            None,
        )

    return speech_to_text_output, doctor_response, output_wav
75
+
76
+
77
+ # Create Gradio Interface
78
# Input widgets: a microphone recording and an image, both configured with
# type="filepath".
_patient_inputs = [
    gr.Audio(sources=["microphone"], type="filepath"),
    gr.Image(type="filepath"),
]

# Output widgets, in the same order as the tuple returned by process_inputs.
_doctor_outputs = [
    gr.Textbox(label="Speech to Text"),
    gr.Textbox(label="Doctor's Response"),
    gr.Audio(label="Doctor's Voice Response"),
]

# Assemble the Gradio interface around process_inputs.
iface = gr.Interface(
    fn=process_inputs,
    inputs=_patient_inputs,
    outputs=_doctor_outputs,
    title="AI Doctor with Vision and Voice",
    description="Upload an image and speak into the microphone. The AI doctor will analyze the image, transcribe your speech, and respond in both text and voice.",
)

# Start the app.
iface.launch()
catract.png DELETED
Binary file (384 kB)