Spaces:
Running
Running
// digital.human.video.js | |
import * as webllm from "https://esm.run/@mlc-ai/web-llm"; | |
// Ensure the script runs after the DOM is fully loaded | |
document.addEventListener("DOMContentLoaded", () => { | |
// Conversation history for the Digital Human Video Assistant, seeded with the system prompt.
const videoMessages = [
  {
    content: "You are Aged Guru, an intelligent assistant skilled in video analysis and related interdisciplinary studies. Provide insightful and comprehensive answers to complex video-related questions.",
    role: "system",
  },
];
// Ids of every model listed in the prebuilt WebLLM app configuration.
const videoAvailableModels = webllm.prebuiltAppConfig.model_list.map(
  ({ model_id }) => model_id,
);
// Default model; replaced by the dropdown value when the engine is (re)initialized.
let videoSelectedModel = "Llama-3.2-3B-Instruct-q4f16_1-MLC";
/**
 * Progress hook registered on the engine; mirrors model loading progress to
 * the console and to the on-page log panel.
 * @param {object} report - Progress report; its `progress` and `text` fields are read.
 */
function videoUpdateEngineInitProgressCallback(report) {
  const { progress, text } = report;
  console.log("Digital Human Video Initialize", progress);
  // Progress is written to the log panel rather than to a dedicated status element.
  logMessage(`Model Initialization Progress: ${text}`, "system");
}
// One engine instance is shared by every chat turn in this section.
const videoEngine = new webllm.MLCEngine();
videoEngine.setInitProgressCallback(videoUpdateEngineInitProgressCallback);
// True while a completion is streaming; used to refuse overlapping generations.
let videoIsGenerating = false;
/**
 * Loads the model currently chosen in #video-model-selection into the shared
 * engine and unlocks the chat controls once it is ready.
 *
 * Failures while loading are reported (console, alert, log panel) and not
 * rethrown; the loading spinner is hidden again in every case.
 * @returns {Promise<void>}
 */
async function videoInitializeWebLLMEngine() {
  const byId = (id) => document.getElementById(id);
  // Controls that only become usable after a model has been loaded.
  const controlsToEnable = [
    "video-start_button",
    "video-text-input",
    "video-submit-button",
    "video-speech-controls",
  ];

  logMessage("Model initialization started.", "system");
  byId("video-loading-spinner").classList.remove("hidden");
  videoSelectedModel = byId("video-model-selection").value;

  // Sampling settings handed to the engine together with the model id.
  const samplingConfig = { temperature: 0.7, top_p: 0.9 };

  try {
    await videoEngine.reload(videoSelectedModel, samplingConfig);
    byId("video-selected-model").textContent = videoSelectedModel;
    for (const controlId of controlsToEnable) {
      byId(controlId).disabled = false;
    }
    byId("video-configuration").classList.remove("hidden");
    logMessage("Model initialized successfully.", "system");
  } catch (error) {
    console.error("Error initializing the model:", error);
    alert("Failed to initialize the model. Please try again.");
    logMessage("Failed to initialize the model.", "error");
  } finally {
    byId("video-loading-spinner").classList.add("hidden");
  }
}
/**
 * Streams a chat completion for `messages` and reports progress through callbacks.
 *
 * Only one generation may run at a time: if one is already in progress the
 * call logs a warning and returns without invoking any callback.
 *
 * @param {Array<object>} messages - Conversation passed to the engine as `messages`.
 * @param {function(string): void} onUpdate - Called after every streamed chunk with the text accumulated so far.
 * @param {function(string): void} onFinish - Called once with the engine's final message.
 * @param {function(*): void} onError - Called with the error if anything in the stream (or in onFinish) throws.
 * @returns {Promise<void>}
 */
async function videoStreamingGenerating(messages, onUpdate, onFinish, onError) {
  if (videoIsGenerating) {
    console.warn("Video Generation already in progress.");
    return;
  }
  videoIsGenerating = true;
  try {
    let curMessage = "";
    const completion = await videoEngine.chat.completions.create({
      stream: true,
      messages
    });
    for await (const chunk of completion) {
      // A chunk may arrive without any choice or without a delta (for example
      // a usage-only chunk); treat it as "no new text" instead of throwing.
      const curDelta = chunk.choices?.[0]?.delta?.content;
      if (curDelta) {
        curMessage += curDelta;
      }
      onUpdate(curMessage);
    }
    const finalMessage = await videoEngine.getMessage();
    console.log(`Digital Human Video Generated final message: ${finalMessage}`); // Debugging
    onFinish(finalMessage);
    logMessage("Response generated successfully.", "system");
  } catch (err) {
    console.error(err);
    onError(err);
    logMessage("An error occurred during response generation.", "error");
  } finally {
    // Always release the flag so a failed generation does not block later ones.
    videoIsGenerating = false;
  }
}
// Remembers whether the latest user turn arrived by voice; assistant replies
// are only read aloud when it did.
let videoLastInputWasVoice = false;
/**
 * Renders a chat message in #video-chat-box and scrolls it into view.
 *
 * An assistant message whose text equals the last rendered message is treated
 * as a duplicate and not appended again. In both the duplicate and the
 * appended case an assistant message (other than the "typing..." placeholder)
 * is spoken when the last input was voice.
 * @param {{content: string, role: string}} message - Message to display.
 */
function videoAppendMessage(message) {
  const { content, role } = message;
  console.log(`Digital Human Video Appending message: ${content} (Role: ${role})`); // Debugging
  const videoChatBox = document.getElementById("video-chat-box");
  const fromAssistant = role === "assistant";

  // Shared by the duplicate path and the normal path below.
  const speakIfVoiceTurn = () => {
    if (fromAssistant && content !== "typing..." && videoLastInputWasVoice) {
      videoSpeak(content);
    }
  };

  if (fromAssistant) {
    const rendered = videoChatBox.querySelectorAll(".message");
    const previous = rendered[rendered.length - 1];
    if (previous && previous.textContent === content) {
      console.warn("Duplicate assistant message detected in Video section, skipping append.");
      speakIfVoiceTurn();
      return;
    }
  }

  const bubble = document.createElement("div");
  bubble.classList.add("message");
  bubble.textContent = content;

  const wrapper = document.createElement("div");
  wrapper.classList.add("message-container");
  wrapper.classList.add(role === "user" ? "user" : "assistant");
  wrapper.appendChild(bubble);

  videoChatBox.appendChild(wrapper);
  videoChatBox.scrollTop = videoChatBox.scrollHeight;

  speakIfVoiceTurn();
}
/**
 * Replaces the text of the most recently rendered chat message.
 * Does nothing when the chat box holds no message yet.
 * @param {string} content - New text for the last `.message` element.
 */
function videoUpdateLastMessage(content) {
  const messageDoms = document.getElementById("video-chat-box").querySelectorAll(".message");
  const lastMessageDom = messageDoms[messageDoms.length - 1];
  if (!lastMessageDom) {
    // Nothing rendered yet; writing to `undefined` would throw.
    return;
  }
  lastMessageDom.textContent = content;
}
/**
 * Handles a finished voice transcript: records it as a user turn, shows it in
 * the chat together with a "typing..." placeholder and starts generation.
 *
 * Blank transcripts are ignored. While another generation is running the
 * transcript is rejected up front, so no orphan message or placeholder is
 * added and the buttons are not left disabled.
 * @param {string} transcript - Recognized speech text.
 */
function videoOnSpeechRecognized(transcript) {
  const input = transcript.trim();
  if (input.length === 0) {
    return;
  }
  if (videoIsGenerating) {
    // videoStreamingGenerating would silently refuse this turn; bail out
    // before touching the history or the UI.
    logMessage("A response is already being generated. Please wait.", "system");
    return;
  }
  const message = {
    content: input,
    role: "user"
  };
  videoLastInputWasVoice = true; // Replies to this turn are read aloud
  console.log(`Digital Human Video Voice input received: ${input}`); // Debugging
  const startButton = document.getElementById("video-start_button");
  const submitButton = document.getElementById("video-submit-button");
  const restoreControls = () => {
    startButton.disabled = false;
    submitButton.disabled = false;
  };
  startButton.disabled = true;
  submitButton.disabled = true; // Disable submit button during processing
  videoMessages.push(message);
  videoAppendMessage(message);
  logMessage(`User (Voice): ${input}`, "user");
  // Append "typing..." placeholder
  const aiPlaceholder = {
    content: "typing...",
    role: "assistant"
  };
  videoAppendMessage(aiPlaceholder);
  logMessage("VideoBot is typing...", "system");
  const onFinishGenerating = (finalMessage) => {
    console.log(`Digital Human Video Finishing generation with message: ${finalMessage}`); // Debugging
    // Remove the placeholder if streaming never overwrote it
    const videoChatBox = document.getElementById("video-chat-box");
    const lastMessageContainer = videoChatBox.lastElementChild;
    const lastBubble = lastMessageContainer ? lastMessageContainer.querySelector(".message") : null;
    if (lastBubble && lastBubble.textContent === "typing...") {
      videoChatBox.removeChild(lastMessageContainer);
    }
    const aiMessage = {
      content: finalMessage,
      role: "assistant"
    };
    // Keep the reply in the history so later turns are sent with full context.
    videoMessages.push(aiMessage);
    videoAppendMessage(aiMessage);
    logMessage(`VideoBot: ${finalMessage}`, "assistant");
    restoreControls();
    videoEngine.runtimeStatsText().then((statsText) => {
      const statsBox = document.getElementById("video-chat-stats");
      statsBox.classList.remove("hidden");
      statsBox.textContent = statsText;
      logMessage(`Runtime Stats: ${statsText}`, "system");
    }).catch((statsError) => {
      // Stats are informational only; never let them surface as an unhandled rejection.
      console.error(statsError);
      logMessage("Runtime stats are unavailable.", "error");
    });
  };
  videoStreamingGenerating(
    videoMessages,
    videoUpdateLastMessage,
    onFinishGenerating,
    (err) => {
      console.error(err);
      alert("An error occurred while generating the response. Please try again.");
      logMessage("Error during response generation.", "error");
      restoreControls();
    }
  );
}
// Speech recognition state for the Video section
let videoRecognizing = false; // true between the recognizer's start and end events
let videoIgnore_onend; // set by the error handler so the end handler skips processing
let videoFinal_transcript = ''; // text accumulated from final recognition results
let videoRecognition; // recognizer instance; stays undefined when the API is unavailable
/**
 * Click handler for the microphone button: stops recognition when it is
 * running, otherwise resets the transcript and starts a new session.
 *
 * Does nothing (apart from logging) when no recognizer exists, and survives a
 * failing `start()` call, e.g. a second click before the session reported its start.
 * @param {Event} event - The click event (unused).
 */
function videoStartButton(event) {
  if (!videoRecognition) {
    logMessage("Web Speech API is not supported by this browser.", "error");
    return;
  }
  if (videoRecognizing) {
    videoRecognition.stop();
    return;
  }
  videoFinal_transcript = '';
  videoIgnore_onend = false;
  videoRecognition.lang = 'en-US';
  try {
    videoRecognition.start();
  } catch (startError) {
    // start() throws if a session was already requested; keep the UI unchanged.
    console.error("Failed to start speech recognition:", startError);
    logMessage("Voice input could not be started.", "error");
    return;
  }
  document.getElementById("video-start_button").classList.add("mic-animate");
  logMessage("Voice input started.", "system");
}
// Speech recognition setup: use the standard constructor when the browser
// exposes it and fall back to the WebKit-prefixed one otherwise.
const VideoSpeechRecognitionCtor = window.SpeechRecognition || window.webkitSpeechRecognition;
if (!VideoSpeechRecognitionCtor) {
  alert("Web Speech API is not supported by this browser.");
  logMessage("Web Speech API is not supported by this browser.", "error");
} else {
  videoRecognition = new VideoSpeechRecognitionCtor();
  videoRecognition.continuous = false; // Non-continuous recognition
  videoRecognition.interimResults = false; // Get only final results
  videoRecognition.onstart = function () {
    videoRecognizing = true;
    logMessage("Speech recognition started.", "system");
  };
  videoRecognition.onerror = function (event) {
    const micButton = document.getElementById("video-start_button");
    if (event.error === 'no-speech') {
      micButton.classList.remove("mic-animate");
      alert('No speech was detected in Video section.');
      logMessage("No speech detected.", "error");
      videoIgnore_onend = true;
    } else if (event.error === 'audio-capture') {
      micButton.classList.remove("mic-animate");
      alert('No microphone was found in Video section.');
      logMessage("No microphone found.", "error");
      videoIgnore_onend = true;
    } else if (event.error === 'not-allowed') {
      micButton.classList.remove("mic-animate");
      alert('Permission to use microphone was denied in Video section.');
      logMessage("Microphone permission denied.", "error");
      videoIgnore_onend = true;
    } else {
      // Other error codes were previously dropped silently; record them so
      // they show up in the log panel. The end handler still runs normally.
      logMessage(`Speech recognition error: ${event.error}`, "error");
    }
  };
  videoRecognition.onend = function () {
    videoRecognizing = false;
    document.getElementById("video-start_button").classList.remove("mic-animate");
    logMessage("Speech recognition ended.", "system");
    if (videoIgnore_onend) {
      // The error handler already reported the problem.
      return;
    }
    if (!videoFinal_transcript) {
      logMessage("No transcript captured.", "error");
      return;
    }
    // Process the final transcript
    videoOnSpeechRecognized(videoFinal_transcript);
  };
  videoRecognition.onresult = function (event) {
    // Only results marked final are appended to the transcript.
    for (let i = event.resultIndex; i < event.results.length; ++i) {
      if (event.results[i].isFinal) {
        videoFinal_transcript += event.results[i][0].transcript;
      }
    }
    videoFinal_transcript = videoFinal_transcript.trim();
    logMessage(`Recognized Speech: ${videoFinal_transcript}`, "user");
  };
}
// Microphone button toggles voice capture.
document
  .getElementById("video-start_button")
  .addEventListener("click", (event) => {
    videoStartButton(event);
  });
// Fill the model dropdown with one <option> per available model id.
for (const modelId of videoAvailableModels) {
  const modelOption = document.createElement("option");
  modelOption.value = modelId;
  modelOption.textContent = modelId;
  document.getElementById("video-model-selection").appendChild(modelOption);
}
// Preselect the default model.
document.getElementById("video-model-selection").value = videoSelectedModel;
// The Download Model button becomes usable once the dropdown is populated.
document.getElementById("video-download").disabled = false;
// Download Model button: loads the selected model, then unlocks voice controls.
// The button is disabled while a download is running so repeated clicks cannot
// start overlapping reloads, and it is re-enabled afterwards in every case.
document.getElementById("video-download").addEventListener("click", async (event) => {
  const downloadButton = event.currentTarget;
  downloadButton.disabled = true;
  // Logged before the work starts so the log panel reflects the real order of events.
  logMessage("Model download initiated.", "system");
  try {
    await videoInitializeWebLLMEngine();
    document.getElementById("video-start_button").disabled = false;
    // Enable speech controls after model initialization
    document.getElementById("video-speech-rate").disabled = false;
    document.getElementById("video-speech-pitch").disabled = false;
  } catch (downloadError) {
    // Covers rejections of the initialization promise, which were unhandled before.
    console.error("Model download failed:", downloadError);
    logMessage("Model download failed.", "error");
  } finally {
    downloadButton.disabled = false;
  }
});
// Clear Logs button: empties the log panel, then records that it was cleared.
document.getElementById("video-clear-logs").addEventListener("click", () => {
  const logPanel = document.getElementById("video-logs");
  logPanel.innerHTML = '';
  logMessage("Logs cleared.", "system");
});
// ===== TTS Integration =====
// Template utterance holding the currently selected voice, rate and pitch.
let videoSpeech = new SpeechSynthesisUtterance();
videoSpeech.lang = "en";
// Voices reported by the browser; refreshed whenever the list changes.
let videoVoices = [];
// Re-reads the voice list and rebuilds the voice dropdown.
const videoRefreshVoices = () => {
  videoVoices = window.speechSynthesis.getVoices();
  videoPopulateVoices();
};
// Use addEventListener instead of directly assigning to onvoiceschanged
window.speechSynthesis.addEventListener("voiceschanged", videoRefreshVoices);
// Some browsers have the list ready immediately and may never fire
// "voiceschanged"; populate right away in that case so the dropdown is not empty.
if (window.speechSynthesis.getVoices().length > 0) {
  videoRefreshVoices();
}
/**
 * Rebuilds the #video-tools dropdown from `videoVoices` (option value = index
 * in the list) and selects a voice: the index saved in localStorage under
 * "video-selectedVoice" when it still resolves to a voice, otherwise the first one.
 * With an empty voice list only the dropdown is cleared.
 */
function videoPopulateVoices() {
  const voiceSelect = document.getElementById("video-tools");
  voiceSelect.innerHTML = ''; // Drop options from a previous run
  for (const [index, voice] of videoVoices.entries()) {
    voiceSelect.appendChild(new Option(voice.name, index));
  }
  if (videoVoices.length === 0) {
    return;
  }
  const savedVoice = localStorage.getItem("video-selectedVoice");
  const savedIsUsable = savedVoice !== null && Boolean(videoVoices[savedVoice]);
  if (!savedIsUsable) {
    videoSpeech.voice = videoVoices[0];
    return;
  }
  videoSpeech.voice = videoVoices[savedVoice];
  voiceSelect.value = savedVoice;
}
// When the user picks another voice: apply it, remember the choice, log it.
document.getElementById("video-tools").addEventListener("change", () => {
  const { value: selectedVoiceIndex } = document.getElementById("video-tools");
  const chosenVoice = videoVoices[selectedVoiceIndex];
  videoSpeech.voice = chosenVoice;
  // The index is persisted so the same voice can be restored later.
  localStorage.setItem("video-selectedVoice", selectedVoiceIndex);
  logMessage(`Voice changed to: ${chosenVoice.name}`, "system");
});
/**
 * Splits text into sentence-sized chunks for speech synthesis.
 * A chunk ends at a run of `.`, `!` or `?`; trailing text without terminal
 * punctuation is kept as its own chunk and blank chunks are dropped.
 * @param {string} text - Text to split.
 * @returns {string[]} Trimmed, non-empty chunks in their original order.
 */
function videoSplitIntoSentences(text) {
  const source = String(text ?? "");
  const chunks = (source.match(/[^.!?]+[.!?]*/g) || [])
    .map((chunk) => chunk.trim())
    .filter((chunk) => chunk.length > 0);
  if (chunks.length > 0) {
    return chunks;
  }
  // Punctuation-only text yields no chunk above; speak it as a whole if non-blank.
  const whole = source.trim();
  return whole.length > 0 ? [whole] : [];
}

/**
 * Speaks `text` with the selected voice and the rate/pitch slider values,
 * one utterance per sentence so long replies are queued in small pieces.
 *
 * While speech is queued the loading spinner is shown and the Stop button is
 * enabled; both are reset when the last utterance ends, on an error, or when
 * speech is cancelled. Nothing happens for blank text.
 * @param {string} text - Text to read aloud.
 */
function videoSpeak(text) {
  if (!window.speechSynthesis) {
    console.warn("Speech Synthesis not supported in this browser for Video section.");
    logMessage("Speech Synthesis not supported in this browser.", "error");
    return;
  }
  const sentences = videoSplitIntoSentences(text);
  if (sentences.length === 0) {
    // Nothing to say: do not show a spinner that no utterance would ever hide.
    return;
  }
  const spinner = document.getElementById("video-loading-spinner");
  const stopButton = document.getElementById("video-stop_button");
  const resetControls = () => {
    spinner.classList.add("hidden");
    stopButton.disabled = true;
  };
  // Show spinner and enable Stop button
  spinner.classList.remove("hidden");
  stopButton.disabled = false;
  logMessage("TTS started.", "system");

  const selectedVoice = videoSpeech.voice;
  // Read the sliders once; fall back to the neutral value 1 because assigning
  // a non-finite number to an utterance's rate or pitch throws.
  const parsedRate = parseFloat(document.getElementById("video-speech-rate").value);
  const parsedPitch = parseFloat(document.getElementById("video-speech-pitch").value);
  const rate = Number.isFinite(parsedRate) ? parsedRate : 1;
  const pitch = Number.isFinite(parsedPitch) ? parsedPitch : 1;

  let utterancesCount = sentences.length;
  sentences.forEach((sentence) => {
    const utterance = new SpeechSynthesisUtterance(sentence);
    if (selectedVoice) {
      utterance.voice = selectedVoice;
    }
    utterance.rate = rate;
    utterance.pitch = pitch;
    utterance.onstart = () => {
      console.log("Speech started:", sentence);
      logMessage(`TTS started: ${sentence}`, "system");
    };
    utterance.onend = () => {
      console.log("Speech ended:", sentence);
      logMessage(`TTS ended: ${sentence}`, "system");
      utterancesCount--;
      if (utterancesCount === 0) {
        resetControls();
        logMessage("All TTS messages have been spoken.", "system");
      }
    };
    utterance.onerror = (e) => {
      utterancesCount = 0;
      resetControls();
      if (e.error === "canceled" || e.error === "interrupted") {
        // Raised when queued or running speech is cancelled (e.g. by the Stop
        // button); that is not a failure, so do not alert the user.
        return;
      }
      console.error("Speech Synthesis Error:", e);
      alert("An error occurred during speech synthesis. Please try again.");
      logMessage("Speech synthesis encountered an error.", "error");
    };
    window.speechSynthesis.speak(utterance);
  });
}
// ===== New: Stop Speech Functionality ===== | |
/**
 * Stops speech synthesis: cancels the utterance being spoken and any that are
 * still queued, then hides the spinner and disables the Stop button.
 * Does nothing when speech synthesis is unavailable or idle.
 */
function videoStopSpeech() {
  const synth = window.speechSynthesis;
  if (!synth) {
    return;
  }
  // `pending` covers utterances that are queued but have not started yet.
  if (synth.speaking || synth.pending) {
    synth.cancel();
    document.getElementById("video-loading-spinner").classList.add("hidden");
    document.getElementById("video-stop_button").disabled = true;
    logMessage("Speech synthesis stopped by user.", "system");
  }
}
// Stop button: cancel whatever is currently being read aloud.
document
  .getElementById("video-stop_button")
  .addEventListener("click", () => {
    videoStopSpeech();
  });
// ===== New: Text Input Handling ===== | |
// Function to Handle Text Submission | |
/**
 * Handles a typed message: records it as a user turn, shows it in the chat
 * together with a "typing..." placeholder and starts generation.
 *
 * Blank input is ignored. While another generation is running the submission
 * is refused and the typed text is left in the input field, so pressing Enter
 * during a response cannot add an orphan message or placeholder.
 */
function videoHandleTextSubmit() {
  const textInput = document.getElementById("video-text-input");
  const input = textInput.value.trim();
  if (input.length === 0) {
    return;
  }
  if (videoIsGenerating) {
    // videoStreamingGenerating would silently refuse this turn; keep the text
    // so the user can resend it once the current reply is finished.
    logMessage("A response is already being generated. Please wait.", "system");
    return;
  }
  textInput.value = ''; // Clear the input field
  const message = {
    content: input,
    role: "user"
  };
  console.log(`Digital Human Video Text input received: ${input}`); // Debugging
  logMessage(`User: ${input}`, "user");
  videoLastInputWasVoice = false; // Typed turns are not read aloud
  const submitButton = document.getElementById("video-submit-button");
  submitButton.disabled = true; // Disable to prevent multiple submissions
  videoMessages.push(message);
  videoAppendMessage(message);
  // Append "typing..." placeholder
  const aiPlaceholder = {
    content: "typing...",
    role: "assistant"
  };
  videoAppendMessage(aiPlaceholder);
  logMessage("VideoBot is typing...", "system");
  const onFinishGenerating = (finalMessage) => {
    console.log(`Digital Human Video Finishing generation with message: ${finalMessage}`); // Debugging
    // Remove the placeholder if streaming never overwrote it
    const videoChatBox = document.getElementById("video-chat-box");
    const lastMessageContainer = videoChatBox.lastElementChild;
    const lastBubble = lastMessageContainer ? lastMessageContainer.querySelector(".message") : null;
    if (lastBubble && lastBubble.textContent === "typing...") {
      videoChatBox.removeChild(lastMessageContainer);
    }
    const aiMessage = {
      content: finalMessage,
      role: "assistant"
    };
    // Keep the reply in the history so later turns are sent with full context.
    videoMessages.push(aiMessage);
    // videoAppendMessage already speaks the reply when the last input was
    // voice; speaking here as well would read it twice.
    videoAppendMessage(aiMessage);
    logMessage(`VideoBot: ${finalMessage}`, "assistant");
    submitButton.disabled = false; // Re-enable submit button after processing
    videoEngine.runtimeStatsText().then((statsText) => {
      const statsBox = document.getElementById("video-chat-stats");
      statsBox.classList.remove("hidden");
      statsBox.textContent = statsText;
      logMessage(`Runtime Stats: ${statsText}`, "system");
    }).catch((statsError) => {
      // Stats are informational only; never let them surface as an unhandled rejection.
      console.error(statsError);
      logMessage("Runtime stats are unavailable.", "error");
    });
  };
  videoStreamingGenerating(
    videoMessages,
    videoUpdateLastMessage,
    onFinishGenerating,
    (err) => {
      console.error(err);
      alert("An error occurred while generating the response. Please try again.");
      logMessage("Error during response generation.", "error");
      submitButton.disabled = false;
    }
  );
}
// Submit button sends the typed message.
document
  .getElementById("video-submit-button")
  .addEventListener("click", () => {
    videoHandleTextSubmit();
  });
// Pressing Enter inside the text field sends it as well.
document
  .getElementById("video-text-input")
  .addEventListener("keypress", ({ key }) => {
    if (key !== 'Enter') {
      return;
    }
    videoHandleTextSubmit();
  });
// ===== Persisting User Preferences =====
// Once the page has loaded, restore the voice, rate and pitch saved in localStorage.
window.addEventListener("load", () => {
  const storedVoiceIndex = localStorage.getItem("video-selectedVoice");
  // The saved index only counts when it still resolves to a known voice.
  const storedVoice = storedVoiceIndex !== null ? videoVoices[storedVoiceIndex] : undefined;
  if (storedVoice) {
    document.getElementById("video-tools").value = storedVoiceIndex;
    videoSpeech.voice = storedVoice;
    logMessage(`Loaded saved voice: ${storedVoice.name}`, "system");
  }

  const storedRate = localStorage.getItem("video-speechRate");
  if (storedRate !== null) {
    document.getElementById("video-speech-rate").value = storedRate;
    videoSpeech.rate = parseFloat(storedRate);
    logMessage(`Loaded saved speech rate: ${storedRate}`, "system");
  }

  const storedPitch = localStorage.getItem("video-speechPitch");
  if (storedPitch !== null) {
    document.getElementById("video-speech-pitch").value = storedPitch;
    videoSpeech.pitch = parseFloat(storedPitch);
    logMessage(`Loaded saved speech pitch: ${storedPitch}`, "system");
  }
});
// Rate slider: apply the new value, persist it and log the change.
document.getElementById("video-speech-rate").addEventListener("input", ({ target }) => {
  const { value: rate } = target;
  videoSpeech.rate = parseFloat(rate);
  localStorage.setItem("video-speechRate", rate);
  logMessage(`Speech rate changed to: ${rate}`, "system");
});
// Pitch slider: apply the new value, persist it and log the change.
document.getElementById("video-speech-pitch").addEventListener("input", ({ target }) => {
  const { value: pitch } = target;
  videoSpeech.pitch = parseFloat(pitch);
  localStorage.setItem("video-speechPitch", pitch);
  logMessage(`Speech pitch changed to: ${pitch}`, "system");
});
// ===== Logging Function ===== | |
/**
 * Appends a colour-coded entry to the #video-logs panel and scrolls the panel
 * to the newest entry.
 * @param {string} message - Text of the entry.
 * @param {string} type - One of 'user', 'assistant', 'system', 'error'; any
 *   other value is rendered in black. It is upper-cased into the entry's prefix.
 */
function logMessage(message, type) {
  // Text colour for each known entry type.
  const colourByType = new Map([
    ['user', "#00796B"],
    ['assistant', "#004D40"],
    ['system', "#555555"],
    ['error', "#E53935"],
  ]);
  const videoLogs = document.getElementById("video-logs");
  const entry = document.createElement("div");
  entry.classList.add("log-entry");
  entry.textContent = `[${type.toUpperCase()}] ${message}`;
  entry.style.color = colourByType.has(type) ? colourByType.get(type) : "#000000";
  videoLogs.appendChild(entry);
  videoLogs.scrollTop = videoLogs.scrollHeight;
}
// ===== TTS Integration Continued =====
// Optional global fallback for "all speech finished".
// NOTE(review): 'end' is an event of individual utterances; the SpeechSynthesis
// interface itself only documents 'voiceschanged', so this handler may never
// run. The per-utterance handlers in videoSpeak do the real bookkeeping.
// Guarded so browsers without speech synthesis do not throw here.
if (window.speechSynthesis) {
  window.speechSynthesis.addEventListener('end', () => {
    console.log("All video speech has been spoken.");
    logMessage("All TTS messages have been spoken.", "system");
    // Ensure Stop button is disabled after speech ends
    document.getElementById("video-stop_button").disabled = true;
  });
}
}); | |