import { serve } from "https://deno.land/std/http/server.ts"; import { EdgeSpeechTTS } from "https://esm.sh/@lobehub/tts@1";

// Optional bearer token; when unset, /v1/audio/speech accepts any caller.
const AUTH_TOKEN = Deno.env.get("AUTH_TOKEN");

// Edge read-aloud voice catalogue. The trustedclienttoken query value is the
// well-known public Edge client token, not a secret.
const VOICES_URL = "https://speech.platform.bing.com/consumer/speech/synthesize/readaloud/voices/list?trustedclienttoken=6A5AA1D4EAFF4E9FB37E23D68491D6F4";

/** One entry in the locale-grouped voice catalogue. */
interface VoiceInfo {
  model: string;
  name: string;
  friendlyName: string;
  locale: string;
}

/**
 * Download the Edge voice list and group it by locale.
 *
 * @returns map from locale (e.g. "zh-CN") to the voices available for it.
 * @throws Error when the catalogue endpoint does not answer with a 2xx status.
 */
async function fetchVoiceList(): Promise<Record<string, VoiceInfo[]>> {
  const response = await fetch(VOICES_URL);
  // Fail loudly instead of calling .json() on an error page.
  if (!response.ok) {
    throw new Error(`Voice list request failed with status ${response.status}`);
  }
  const voices = await response.json();
  return voices.reduce((acc: Record<string, VoiceInfo[]>, voice: any) => {
    // model and name both take ShortName deliberately: callers key on either.
    const { ShortName: model, ShortName: name, FriendlyName: friendlyName, Locale: locale } = voice;
    if (!acc[locale]) acc[locale] = [];
    acc[locale].push({ model, name, friendlyName, locale });
    return acc;
  }, {});
}

/**
 * Synthesize `text` via Edge TTS and wrap the MP3 bytes in a Response.
 *
 * Two calling conventions are supported:
 *  - OpenAI-compatible: `model` contains "tts"; `voice` is an OpenAI voice
 *    name (alloy/echo/fable/onyx/…) mapped to a zh-* neural voice, with a
 *    fixed rate/pitch.
 *  - Direct: `model` is an Edge voice ShortName; `voice` carries options
 *    encoded as "rate:0.1|pitch:0.2".
 *
 * @returns a 200 Response with Content-Type audio/mpeg.
 */
async function synthesizeSpeech(model: string, voice: string, text: string): Promise<Response> {
  let voiceName: string;
  let rate = 0;
  let pitch = 0;
  if (model.includes("tts")) {
    rate = 0.1;
    pitch = 0.2;
    switch (voice) {
      case "alloy": voiceName = "zh-CN-YunjianNeural"; break;
      case "echo": voiceName = "zh-CN-YunyangNeural"; break;
      case "fable": voiceName = "zh-CN-XiaoxiaoNeural"; break;
      case "onyx": voiceName = "zh-TW-HsiaoChenNeural"; break;
      default: voiceName = "zh-CN-YunxiNeural"; break;
    }
  } else {
    voiceName = model;
    // Parse "rate:0.1|pitch:0.2" into { rate: "0.1", pitch: "0.2" }.
    const params = Object.fromEntries(
      voice.split("|").map((p) => p.split(":") as [string, string]),
    );
    rate = Number(params["rate"] || 0);
    pitch = Number(params["pitch"] || 0);
  }
  const tts = new EdgeSpeechTTS();
  const payload = {
    input: text,
    options: { rate: rate, pitch: pitch, voice: voiceName },
  };
  const response = await tts.create(payload);
  const mp3Buffer = new Uint8Array(await response.arrayBuffer());
  console.log(`Successfully synthesized speech, returning audio/mpeg response`);
  return new Response(mp3Buffer, {
    headers: { "Content-Type": "audio/mpeg" },
  });
}

/**
 * True when an AUTH_TOKEN is configured and the request does not carry the
 * matching "Bearer <token>" Authorization header. When AUTH_TOKEN is unset,
 * every request is allowed.
 */
function unauthorized(req: Request): boolean {
  const authHeader = req.headers.get("Authorization");
  return !!AUTH_TOKEN && authHeader !== `Bearer ${AUTH_TOKEN}`;
}

/** Return a 400 Response when the Content-Type header differs from `expected`; undefined otherwise. */
function validateContentType(req: Request, expected: string) { const
contentType = req.headers.get("Content-Type");
  if (contentType !== expected) {
    console.log(`Invalid Content-Type ${contentType}, expected ${expected}`);
    return new Response("Bad Request", { status: 400 });
  }
}

/**
 * GET /tts — debug endpoint taking model/voice/text as query parameters.
 * Responds 400 when any of the three is missing or empty.
 *
 * NOTE(review): unlike /v1/audio/speech this path performs no bearer-token
 * check — confirm that is intentional.
 */
async function handleDebugRequest(req: Request): Promise<Response> {
  const url = new URL(req.url);
  const voice = url.searchParams.get("voice") || "";
  const model = url.searchParams.get("model") || "";
  const text = url.searchParams.get("text") || "";
  console.log(`Debug request with model=${model}, voice=${voice}, text=${text}`);
  if (!voice || !model || !text) {
    console.log("Missing required parameters");
    return new Response("Bad Request", { status: 400 });
  }
  return synthesizeSpeech(model, voice, text);
}

/**
 * POST /v1/audio/speech — OpenAI-compatible synthesis endpoint.
 * Requires the bearer token (when configured), the POST method, and an
 * application/json body of shape { model, input, voice }.
 */
async function handleSynthesisRequest(req: Request): Promise<Response> {
  if (unauthorized(req)) {
    console.log("Unauthorized request");
    return new Response("Unauthorized", { status: 401 });
  }
  if (req.method !== "POST") {
    console.log(`Invalid method ${req.method}, expected POST`);
    return new Response("Method Not Allowed", { status: 405 });
  }
  const invalidContentType = validateContentType(req, "application/json");
  if (invalidContentType) return invalidContentType;
  const { model, input, voice } = await req.json();
  console.log(`Synthesis request with model=${model}, input=${input}, voice=${voice}`);
  return synthesizeSpeech(model, voice, input);
}

/**
 * GET / — demo page.
 *
 * NOTE(review): groupedVoiceList is fetched but never interpolated into the
 * template, and the template text below looks like HTML whose markup was
 * stripped — confirm against the intended page source. The string is kept
 * byte-for-byte to preserve current behavior.
 */
async function handleDemoRequest(req: Request): Promise<Response> {
  const groupedVoiceList = await fetchVoiceList();
  const html = `语音合成演示

输入文本

-0.1
0.1

选择语音

`;
  return new Response(html, {
    headers: { "Content-Type": "text/html" },
  });
}

// Top-level router: / -> demo page, /tts -> debug synthesis,
// /v1/audio/speech -> OpenAI-compatible synthesis, anything else -> 404.
serve(async (req) => {
  try {
    const url = new URL(req.url);
    if (url.pathname === "/") {
      return handleDemoRequest(req);
    }
    if (url.pathname === "/tts") {
      return handleDebugRequest(req);
    }
    if (url.pathname !== "/v1/audio/speech") {
      console.log(`Unhandled path ${url.pathname}`);
      return new Response(`Not Found`, { status: 404 });
    }
    return handleSynthesisRequest(req);
  } catch (err) {
    // catch variables are `unknown` under strict settings: narrow before
    // reading .message instead of assuming an Error instance.
    const message = err instanceof Error ? err.message : String(err);
    console.error(`Error processing request: ${message}`);
    return new Response(`Internal Server Error\n${message}`, {
      status: 500,
    });
  }
});