Spaces:

salzzyy
/

ai-doctor-assistant

Build error

App Files Files Community

ai-doctor-assistant / app.py

salzzyy

Removed final.wav to comply with Hugging Face restrictions

8bb232d 4 months ago

raw

history blame contribute delete

3.62 kB

	# Import necessary libraries
	import os
	import gradio as gr
	from pydub import AudioSegment

	# Importing AI processing functions
	from brain_of_the_doctor import encode_image, analyze_image_with_query
	from voice_of_the_patient import transcribe_with_groq
	from voice_of_the_doctor import text_to_speech_with_elevenlabs

	# System prompt for the AI doctor.
	# process_inputs() prepends this text to the patient's transcribed speech and
	# passes the combined string as the `query` argument of
	# analyze_image_with_query(). The wording is part of the model input, so any
	# edit here changes the generated answers.
	system_prompt = """You have to act as a professional doctor, I know you are not but this is for learning purposes.
	With what I see, I think you have .... Do you find anything wrong with it medically?
	If you make a differential, suggest some remedies for them. Do not add any numbers or special characters in
	your response. Your response should be in one long paragraph. Always answer as if you are answering a real person.
	Do not respond as an AI model in markdown. Keep your answer concise (max 2 sentences). No preamble, start your answer right away please."""


	# Function to process inputs
	def process_inputs(audio_filepath, image_filepath):
	    """Transcribe the patient's audio, analyze the image, and voice the reply.

	    Args:
	        audio_filepath: Path to the recorded audio file, or a falsy value
	            when nothing was recorded.
	        image_filepath: Path to the uploaded image, or a falsy value when no
	            image was supplied.

	    Returns:
	        A 3-tuple ``(speech_to_text, doctor_response, audio_path)`` matching
	        the three Gradio outputs. ``audio_path`` is the path of the generated
	        WAV file, or ``None`` when no audio could be produced; in that case
	        the reason is appended to ``doctor_response`` so it stays visible.
	    """
	    # Local import keeps this block self-contained (the file does not import
	    # tempfile at module level).
	    import tempfile

	    print(f"DEBUG: Received audio file path: {audio_filepath}")

	    # Nothing to transcribe without a readable recording.
	    if not audio_filepath or not os.path.exists(audio_filepath):
	        return "Error: No valid audio file provided.", "No response generated.", None

	    # Speech -> text via the Groq helper.
	    try:
	        speech_to_text_output = transcribe_with_groq(
	            GROQ_API_KEY=os.getenv("GROQ_API_KEY"),
	            audio_filepath=audio_filepath,
	            stt_model="whisper-large-v3",
	        )
	    except Exception as e:
	        return f"Error transcribing audio: {e}", "No response generated.", None

	    # Image + question -> doctor's text reply.
	    if image_filepath and os.path.exists(image_filepath):
	        try:
	            encoded_img = encode_image(image_filepath)
	            doctor_response = analyze_image_with_query(
	                # Separate the prompt from the transcript; without the space
	                # the last prompt word and the first spoken word are fused.
	                query=system_prompt + " " + speech_to_text_output,
	                encoded_image=encoded_img,
	                model="llama-3.2-11b-vision-preview",
	            )
	        except Exception as e:
	            doctor_response = f"Error analyzing image: {e}"
	    else:
	        doctor_response = "No image provided for analysis."

	    # Text reply -> speech. Per-request temp files are used instead of fixed
	    # names in the working directory so that concurrent requests cannot
	    # overwrite each other and a stale MP3 from an earlier request can never
	    # be mistaken for a fresh result.
	    output_wav = None
	    try:
	        with tempfile.TemporaryDirectory() as scratch_dir:
	            mp3_path = os.path.join(scratch_dir, "final.mp3")
	            text_to_speech_with_elevenlabs(
	                input_text=doctor_response,
	                output_filepath=mp3_path,  # Generate MP3 first
	            )

	            if not os.path.exists(mp3_path):
	                # The audio slot must be a valid file path or None; an error
	                # string there would be treated as a path by the Audio
	                # component, so the message goes into the text reply.
	                return (
	                    speech_to_text_output,
	                    f"{doctor_response}\n\nError: Failed to generate audio.",
	                    None,
	                )

	            # Convert MP3 to WAV; the WAV lives outside scratch_dir so it
	            # survives the cleanup of the intermediate MP3.
	            wav_fd, output_wav = tempfile.mkstemp(prefix="doctor_reply_", suffix=".wav")
	            os.close(wav_fd)
	            AudioSegment.from_mp3(mp3_path).export(output_wav, format="wav")
	    except Exception as e:
	        # Do not leave a half-written WAV behind.
	        if output_wav and os.path.exists(output_wav):
	            os.remove(output_wav)
	        return (
	            speech_to_text_output,
	            f"{doctor_response}\n\nError generating speech: {e}",
	            None,
	        )

	    return speech_to_text_output, doctor_response, output_wav


	# Build the Gradio UI: microphone audio and an image go in; the transcript,
	# the doctor's text reply and the spoken reply come out.
	iface = gr.Interface(
	    title="AI Doctor with Vision and Voice",
	    description=(
	        "Upload an image and speak into the microphone. "
	        "The AI doctor will analyze the image, transcribe your speech, "
	        "and respond in both text and voice."
	    ),
	    fn=process_inputs,
	    inputs=[
	        gr.Audio(type="filepath", sources=["microphone"]),
	        gr.Image(type="filepath"),
	    ],
	    outputs=[
	        gr.Textbox(label="Speech to Text"),
	        gr.Textbox(label="Doctor's Response"),
	        gr.Audio(label="Doctor's Voice Response"),
	    ],
	)

	# Start the web server.
	iface.launch()