jbilcke-hf committed
Commit 3c87951
1 Parent(s): 8b547c3

let's up the game

src/app/api/utils/imagePrompts.ts CHANGED
@@ -13,6 +13,7 @@ export function addWordsIfNotPartOfThePrompt(prompt: string = "", words: string[
 
 export function getPositivePrompt(prompt: string = "", triggerWord = "") {
   return addWordsIfNotPartOfThePrompt(prompt, [
+    "cinematic photo",
     triggerWord,
     "sublime",
     "pro quality",
src/app/api/v1/edit/dialogues/processShot.ts CHANGED
@@ -39,7 +39,7 @@ export async function processShot({
 
   let shotDialogueSegment: ClapSegment | undefined = shotDialogueSegments.at(0)
 
-  console.log(`[api/edit/dialogues] processShot: shot [${shotSegment.startTimeInMs}:${shotSegment.endTimeInMs}] has ${shotSegments.length} segments (${shotDialogueSegments.length} dialogues)`)
+  // console.log(`[api/edit/dialogues] processShot: shot [${shotSegment.startTimeInMs}:${shotSegment.endTimeInMs}] has ${shotSegments.length} segments (${shotDialogueSegments.length} dialogues)`)
 
   if (shotDialogueSegment && !shotDialogueSegment.assetUrl) {
     // console.log(`[api/edit/dialogues] generating audio..`)
@@ -58,6 +58,8 @@
       })
       shotDialogueSegment.assetSourceType = getClapAssetSourceType(shotDialogueSegment.assetUrl)
 
+      shotDialogueSegment.status = "completed"
+
       const { durationInMs, hasAudio } = await getMediaInfo(shotDialogueSegment.assetUrl)
 
       if (hasAudio && durationInMs > 1000) {
src/app/api/v1/edit/entities/generateEntityPrompts.ts CHANGED
@@ -93,7 +93,7 @@ Now please generate the output entities:`
 
     maybeEntities = parseRawStringToYAML<LatentEntity[]>(rawString, [])
     if (!Array.isArray(maybeEntities) || maybeEntities.length === 0) {
-      console.log(`generateEntityPrompts(): failed to generate shots for the second time, which indicates an issue with the Hugging Face API`)
+      console.log(`generateEntityPrompts(): failed to generate entities for the second time, which indicates an issue with the Hugging Face API`)
     }
   }
 
src/app/api/v1/edit/music/cluster.ts CHANGED
@@ -33,7 +33,7 @@ export async function getClusterMachine(maxWaitTimeInMs: number = 10000): Promise<ClusterMachine> {
   }
 
   if (!clusterMachine) {
-    throw new Error(`failed to find a cluster machine within ${maxWaitTimeInMs/10} seconds`)
+    throw new Error(`failed to find a cluster machine within ${maxWaitTimeInMs/1000} seconds`)
   }
 
   // change the global state
src/app/api/v1/edit/music/generateMusic.ts CHANGED
@@ -48,7 +48,7 @@ export async function generateMusic({
   }
 
 
-  const durationInSec = 12 // musicSegment.assetDurationInMs / 1000
+  const durationInSec = 14 // musicSegment.assetDurationInMs / 1000
 
   console.log(`generateMusic(): generating a music with:\n duration: ${durationInSec} sec\n prompt: ${prompt}`)
 
src/app/api/v1/edit/music/generateMusicPrompt.ts CHANGED
@@ -29,8 +29,7 @@ export async function generateMusicPrompts({
   // console.log("generateMusicPrompts(): latentStory:", latentStory)
 
   const userPrompt = `The input story is about: ${prompt}.
-
-The input story is:
+
 \`\`\`yaml
 ${YAML.stringify(
   // we need to help the LLM by marking the shots with a simple numeric ID
src/app/api/v1/edit/sounds/cluster.ts ADDED
@@ -0,0 +1,46 @@
+import { sleep } from "@/lib/utils/sleep"
+import { ClusterMachine } from "../../types"
+
+export const nbClusterMachines = 1
+// make sure the machines are running!!
+
+// https://huggingface.co/spaces/jbilcke-hf/ai-tube-model-magnet-1/settings
+// https://huggingface.co/spaces/jbilcke-hf/ai-tube-model-magnet-2/settings
+// https://huggingface.co/spaces/jbilcke-hf/ai-tube-model-magnet-3/settings
+// https://huggingface.co/spaces/jbilcke-hf/ai-tube-model-magnet-4/settings
+
+// we maintain a global cluster state
+
+export const clusterMachines: ClusterMachine[] = []
+for (let i = 0; i < nbClusterMachines; i++) {
+  clusterMachines.push({
+    id: i,
+    url: `https://jbilcke-hf-ai-tube-model-magnet-${i + 1}.hf.space`,
+    busy: false
+  })
+}
+
+export async function getClusterMachine(maxWaitTimeInMs: number = 10000): Promise<ClusterMachine> {
+  let clusterMachine: ClusterMachine | undefined = undefined
+  let timeSpentWaitingInMs = 0
+  const intervalInMs = 500
+
+  while (true) {
+    clusterMachine = clusterMachines.find(m => !m.busy)
+    if (clusterMachine) { break }
+    if (timeSpentWaitingInMs > maxWaitTimeInMs) { break }
+    await sleep(intervalInMs)
+    timeSpentWaitingInMs += intervalInMs // track how long we have been waiting, so the timeout above can actually trigger
+  }
+
+  if (!clusterMachine) {
+    throw new Error(`failed to find a cluster machine within ${maxWaitTimeInMs/1000} seconds`)
+  }
+
+  // change the global state
+  clusterMachine.busy = true
+
+  return clusterMachine
+}
+
+export const token = `${process.env.MICROSERVICE_API_SECRET_TOKEN || ""}`
src/app/api/v1/edit/sounds/generateSound.ts ADDED
@@ -0,0 +1,97 @@
+
+import {
+  ClapProject,
+  ClapSegment,
+  getClapAssetSourceType,
+  filterSegments,
+  ClapSegmentFilteringMode,
+  ClapSegmentCategory,
+  newSegment
+} from "@aitube/clap"
+import { ClapCompletionMode } from "@aitube/client"
+import { getMediaInfo } from "@/app/api/utils/getMediaInfo"
+import { generateSoundWithMagnet } from "./generateSourceWithMagnet"
+
+export async function generateSound({
+  soundSegment,
+  existingClap,
+  newerClap,
+  mode,
+  turbo,
+}: {
+  soundSegment?: ClapSegment
+  existingClap: ClapProject
+  newerClap: ClapProject
+  mode: ClapCompletionMode
+  turbo: boolean
+}): Promise<void> {
+  if (!soundSegment) {
+    console.log(`generateSound(): sound segment is empty, so skipping sound generation.`)
+    return
+  }
+
+  // for now we do something very basic
+
+  if (soundSegment.status === "completed") {
+    console.log(`generateSound(): sound segment is already generated, skipping doing it twice.`)
+    return
+  }
+
+  // for now we do something very basic
+  const prompt = soundSegment.prompt
+  if (!prompt) {
+    console.log(`generateSound(): sound prompt is empty, so skipping sound generation.`)
+    return
+  }
+
+
+  const durationInSec = 12 // soundSegment.assetDurationInMs / 1000
+
+  console.log(`generateSound(): generating a sound with:\n duration: ${durationInSec} sec\n prompt: ${prompt}`)
+
+  const assetUrl = await generateSoundWithMagnet({
+    prompt,
+    durationInSec,
+    hd: false,
+    debug: true,
+    neverThrow: true,
+  })
+
+
+  if (!assetUrl || assetUrl?.length < 30) {
+    console.log(`generateSound(): the generated assetUrl is empty, so sound generation failed.`)
+    return
+  }
+
+  let { durationInMs, hasAudio } = await getMediaInfo(assetUrl)
+
+  const newProperties: Partial<ClapSegment> = {
+    assetUrl,
+    assetDurationInMs: durationInMs,
+    outputGain: 1.0,
+    status: "completed"
+  }
+
+
+  if (!hasAudio) {
+    console.warn(`generateSound(): the generated sound waveform appears to be silent (might be a ffprobe malfunction)`)
+    // return
+    // we have a bug on AiTube, basically the ffmpeg probe isn't working,
+    // because it doesn't find ffmpeg
+    // I think the issue is how the Dockerfile is formed
+    // so until this is fixed, we need to fake a "correct" result
+    newProperties.assetDurationInMs = soundSegment.assetDurationInMs
+  }
+
+  if (mode !== ClapCompletionMode.FULL) {
+    console.log(`generateSound(): adding sound to a new clap file`)
+    newerClap.segments.push(newSegment({
+      ...soundSegment,
+      ...newProperties,
+    }))
+  } else {
+    console.log(`generateSound(): overwriting the sound inside the existing clap file`)
+    // this will update the existing clap (normally)
+    Object.assign(soundSegment, newProperties)
+  }
+}
src/app/api/v1/edit/sounds/generateSoundPrompt.ts ADDED
@@ -0,0 +1,84 @@
+
+import YAML from "yaml"
+
+import { predict } from "@/app/api/providers/huggingface/predictWithHuggingFace"
+import { LatentStory } from "@/app/api/v1/types"
+
+import { systemPrompt } from "./systemPrompt"
+
+export async function generateSoundPrompts({
+  prompt = "",
+  latentStory = [],
+  turbo = false,
+}: {
+  prompt?: string
+  latentStory?: LatentStory[]
+  turbo?: boolean
+} = {
+  prompt: "",
+  latentStory: [],
+  turbo: false
+}): Promise<string[]> {
+
+  if (!prompt.length) { throw new Error(`please provide a prompt`) }
+  console.log("generateSoundPrompts(): prompt:", prompt)
+
+
+  if (!latentStory.length) { throw new Error(`please provide a story`) }
+
+  // console.log("generateSoundPrompts(): latentStory:", latentStory)
+
+
+  const userPrompt = `The input story is about: ${prompt}.
+
+# Output`
+  /*
+  NOTE Julian: maybe later we can use this:
+
+  const userPrompt = `The input story is about: ${prompt}.
+
+  \`\`\`yaml
+  ${YAML.stringify(
+    // we need to help the LLM by marking the shots with a simple numeric ID
+    latentStory.map((shot, i) => ({
+      shot: i,
+      ...shot,
+    }))
+  )}
+  \`\`\`
+
+  # Output`
+  */
+
+  const prefix = "\""
+
+  // we don't need a lot here!
+  const nbMaxNewTokens = 120
+
+  // TODO use streaming for the Hugging Face prediction
+  //
+  // note that a Clap file is actually a YAML stream of documents
+  // so technically we could stream everything from end-to-end
+  // (but I haven't coded the helpers to do this yet)
+  let rawString = await predict({
+    systemPrompt,
+    userPrompt,
+    nbMaxNewTokens,
+    prefix,
+    turbo,
+  })
+
+  // console.log("generateSoundPrompts(): rawString: ", rawString)
+
+  let results: string[] = []
+
+  // we remove everything after the first ``` (or ``)
+  rawString = rawString.split(/```?/)[0].trim()
+  results.push(rawString)
+
+  if (!Array.isArray(results) || typeof results.at(0) !== "string" || !results) {
+    throw new Error(`failed to generate the output (rawString is: ${rawString})`)
+  }
+
+  return results
+}
src/app/api/v1/edit/sounds/generateSourceWithMagnet.ts ADDED
@@ -0,0 +1,69 @@
+import { addBase64Header } from "@/lib/data/addBase64Header"
+import { SoundGenerationParams } from "./types"
+import { getClusterMachine } from "./cluster"
+
+const microserviceApiKey = `${process.env.MICROSERVICE_API_SECRET_TOKEN || ""}`
+
+/**
+ * Note: this generates a base64 mp3 file
+ */
+export async function generateSoundWithMagnet({
+  prompt,
+  durationInSec,
+  hd,
+  debug = false,
+  neverThrow = false,
+}: SoundGenerationParams): Promise<string> {
+
+  if (!prompt?.length) {
+    throw new Error(`prompt is too short!`)
+  }
+
+  const machine = await getClusterMachine()
+
+  try {
+    const res = await fetch(machine.url + (machine.url.endsWith("/") ? "" : "/") + "api/predict", {
+      method: "POST",
+      headers: {
+        "Content-Type": "application/json",
+        // Authorization: `Bearer ${token}`,
+      },
+      body: JSON.stringify({
+        fn_index: 1, // <- important!
+        data: [
+          microserviceApiKey, // string in 'Secret Token' Textbox component
+          // TODO
+        ],
+      }),
+      cache: "no-store",
+      // we can also use this (see https://vercel.com/blog/vercel-cache-api-nextjs-cache)
+      // next: { revalidate: 1 }
+    })
+
+    if (res.status !== 200) {
+      throw new Error('Failed to fetch data')
+    }
+
+    const { data } = await res.json()
+
+    // console.log("data:", data)
+    // Recommendation: handle errors
+    if (res.status !== 200 || !Array.isArray(data)) {
+      // This will activate the closest `error.js` Error Boundary
+      throw new Error(`Failed to fetch data (status: ${res.status})`)
+    }
+    // console.log("data:", data.slice(0, 50))
+
+    if (!data[0]) {
+      throw new Error(`the returned sound was empty`)
+    }
+
+    // console.log("data:", data[0].slice(0, 60))
+    return addBase64Header(data[0] as string, "mp3")
+  } catch (err) {
+    throw err
+  } finally {
+    // important: we need to free up the machine!
+    machine.busy = false
+  }
+}
src/app/api/v1/edit/sounds/processShot.ts ADDED
@@ -0,0 +1,86 @@
+
+import {
+  ClapProject,
+  ClapSegment,
+  getClapAssetSourceType,
+  filterSegments,
+  ClapSegmentFilteringMode,
+  ClapSegmentCategory
+} from "@aitube/clap"
+import { ClapCompletionMode } from "@aitube/client"
+
+import { generateSoundWithMagnet } from "./generateSourceWithMagnet"
+import { getMediaInfo } from "@/app/api/utils/getMediaInfo"
+
+export async function processShot({
+  shotSegment,
+  existingClap,
+  newerClap,
+  mode,
+  turbo,
+}: {
+  shotSegment: ClapSegment
+  existingClap: ClapProject
+  newerClap: ClapProject
+  mode: ClapCompletionMode
+  turbo: boolean
+}): Promise<void> {
+
+  const shotSegments: ClapSegment[] = filterSegments(
+    ClapSegmentFilteringMode.BOTH,
+    shotSegment,
+    existingClap.segments
+  )
+
+  const shotSoundSegments: ClapSegment[] = shotSegments.filter(s =>
+    s.category === ClapSegmentCategory.SOUND
+  )
+
+  let shotSoundSegment: ClapSegment | undefined = shotSoundSegments.at(0)
+
+  console.log(`[api/edit/sounds] processShot: shot [${shotSegment.startTimeInMs}:${shotSegment.endTimeInMs}] has ${shotSegments.length} segments (${shotSoundSegments.length} sounds)`)
+
+  if (shotSoundSegment && !shotSoundSegment.assetUrl) {
+    // console.log(`[api/edit/sounds] generating background sound effect..`)
+
+    try {
+      // this generates an mp3
+      shotSoundSegment.assetUrl = await generateSoundWithMagnet({
+        prompt: shotSoundSegment.prompt,
+        durationInSec: shotSegment.assetDurationInMs / 1000, // the asset duration is in milliseconds, but this parameter expects seconds
+        hd: false,
+        debug: true,
+        neverThrow: false,
+      })
+      shotSoundSegment.assetSourceType = getClapAssetSourceType(shotSoundSegment.assetUrl)
+
+      shotSoundSegment.status = "completed"
+
+      const { durationInMs, hasAudio } = await getMediaInfo(shotSoundSegment.assetUrl)
+
+      if (hasAudio && durationInMs > 1000) {
+        shotSoundSegment.assetDurationInMs = durationInMs
+        shotSegment.assetDurationInMs = durationInMs
+
+        // we update the duration of all the segments for this shot
+        // (it is possible that this makes the two previous lines redundant)
+        existingClap.segments.forEach(s => {
+          s.assetDurationInMs = durationInMs
+        })
+      }
+
+    } catch (err) {
+      console.log(`[api/edit/sounds] processShot: failed to generate audio: ${err}`)
+      throw err
+    }
+
+    console.log(`[api/edit/sounds] processShot: generated sound audio: ${shotSoundSegment?.assetUrl?.slice?.(0, 50)}...`)
+
+    // if it's partial, we need to manually add it
+    if (mode !== ClapCompletionMode.FULL) {
+      newerClap.segments.push(shotSoundSegment)
+    }
+  } else {
+    console.log(`[api/edit/sounds] processShot: there is already a sound audio: ${shotSoundSegment?.assetUrl?.slice?.(0, 50)}...`)
+  }
+}
src/app/api/v1/edit/sounds/route.txt ADDED
@@ -0,0 +1,57 @@
+import { NextResponse, NextRequest } from "next/server"
+import queryString from "query-string"
+import { ClapProject, ClapSegment, ClapSegmentCategory, newClap, parseClap, serializeClap } from "@aitube/clap"
+import { ClapCompletionMode } from "@aitube/client"
+
+import { parseCompletionMode } from "@/app/api/parsers/parseCompletionMode"
+import { throwIfInvalidToken } from "@/app/api/v1/auth/throwIfInvalidToken"
+import { parseTurbo } from "@/app/api/parsers/parseTurbo"
+
+import { processShot } from "./processShot"
+// a helper to generate sound effects for a Clap
+export async function POST(req: NextRequest) {
+  await throwIfInvalidToken(req.headers.get("Authorization"))
+
+  const qs = queryString.parseUrl(req.url || "")
+  const query = (qs || {}).query
+
+  const mode = parseCompletionMode(query?.c)
+  const turbo = parseTurbo(query?.t)
+
+  const blob = await req.blob()
+
+  const existingClap: ClapProject = await parseClap(blob)
+
+  if (!existingClap?.segments) { throw new Error(`no segment found in the provided clap!`) }
+
+  // console.log(`[api/edit/sounds] detected ${existingClap.segments.length} segments`)
+
+  const shotsSegments: ClapSegment[] = existingClap.segments.filter(s => s.category === ClapSegmentCategory.CAMERA)
+  // console.log(`[api/edit/sounds] detected ${shotsSegments.length} shots`)
+
+  if (shotsSegments.length > 32) {
+    throw new Error(`Error: since this endpoint is synchronous, it is designed for short stories only (max 32 shots).`)
+  }
+
+  const newerClap = mode === ClapCompletionMode.FULL ? existingClap : newClap({
+    meta: existingClap.meta
+  })
+
+  // we process the shots in parallel (this will increase the queue size in the Gradio spaces)
+  await Promise.all(shotsSegments.map(shotSegment =>
+    processShot({
+      shotSegment,
+      existingClap,
+      newerClap,
+      mode,
+      turbo,
+    })
+  ))
+
+  // console.log(`[api/edit/sounds] returning the clap augmented with sounds`)
+
+  return new NextResponse(await serializeClap(newerClap), {
+    status: 200,
+    headers: new Headers({ "content-type": "application/x-gzip" }),
+  })
+}
src/app/api/v1/edit/sounds/systemPrompt.ts ADDED
@@ -0,0 +1,25 @@
+export const systemPrompt: string = `
+You are a backend API engine, designed to generate a background audio and sound effect prompt output from a story input.
+
+## Prompting guidelines
+
+We already know we are generating sound, no need to tell us again, so be concise!
+Don't speak too much or give your opinion, so don't say things like "The audio track should have wind and chime sounds, giving an eerie, ominous mood." Instead just say "wind, chimes".
+Avoid concepts that don't translate well to sound.
+
+To create a background soundtrack prompt, you need to combine locations with objects and their characteristics.
+
+## Example of input/output
+
+Given the following input story, provided as YAML:
+
+# Input
+
+"A king goes to see a witch to ask if or how he can win an upcoming and challenging battle"
+
+As you can see, the theme is modern, describing a city. So you should generate an audio soundtrack like this:
+
+## Output
+
+"Downtown New York, busy street, pedestrians, taxis."
+`
src/app/api/v1/edit/sounds/types.ts ADDED
@@ -0,0 +1,7 @@
+export type SoundGenerationParams = {
+  prompt: string
+  durationInSec: number
+  hd?: boolean
+  debug?: boolean
+  neverThrow?: boolean
+}
src/app/api/v1/edit/storyboards/processShot.ts CHANGED
@@ -87,12 +87,14 @@ export async function processShot({
       turbo,
     })
     shotStoryboardSegment.assetSourceType = getClapAssetSourceType(shotStoryboardSegment.assetUrl)
+    shotStoryboardSegment.status = "completed"
   } catch (err) {
     console.log(`[api/v1/edit/storyboards] processShot: failed to generate an image: ${err}`)
+    shotStoryboardSegment.status = "to_generate"
    throw err
  }
 
-  console.log(`[api/v1/edit/storyboards] processShot: generated storyboard image: ${shotStoryboardSegment?.assetUrl?.slice?.(0, 50)}...`)
+  // console.log(`[api/v1/edit/storyboards] processShot: generated storyboard image: ${shotStoryboardSegment?.assetUrl?.slice?.(0, 50)}...`)
 
   // if mode is full, newerClap already contains the reference to shotStoryboardSegment
   // but if it's partial, we need to manually add it
src/app/api/v1/edit/videos/processShot.ts CHANGED
@@ -15,7 +15,8 @@ import { getVideoPrompt } from "@aitube/engine"
 
 import { getPositivePrompt } from "@/app/api/utils/imagePrompts"
 
-import { render } from "@/app/api/v1/render"
+import { render } from "@/app/api/v1/render/animatediff-lcm-svd"
+// import { render } from "@/app/api/v1/render/animatediff-lightning"
 import { extractFirstFrame } from "@/app/api/utils/extractFirstFrame"
 
 export async function processShot({
@@ -49,7 +50,7 @@ export async function processShot({
 
   let shotStoryboardSegment: ClapSegment | undefined = shotStoryboardSegments.at(0)
 
-  console.log(`[api/edit/videos] processShot: shot [${shotSegment.startTimeInMs}:${shotSegment.endTimeInMs}] has ${shotSegments.length} segments (${shotVideoSegments.length} videos)`)
+  // console.log(`[api/edit/videos] processShot: shot [${shotSegment.startTimeInMs}:${shotSegment.endTimeInMs}] has ${shotSegments.length} segments (${shotVideoSegments.length} videos)`)
 
   // TASK 1: GENERATE MISSING VIDEO SEGMENT
   if (!shotVideoSegment) {
@@ -90,7 +91,7 @@ export async function processShot({
 
   // TASK 3: GENERATE MISSING VIDEO FILE
   if (!shotVideoSegment.assetUrl) {
-    console.log(`[api/edit/videos] processShot: generating video file..`)
+    // console.log(`[api/edit/videos] processShot: generating video file..`)
 
     const debug = false
 
@@ -102,30 +103,44 @@ export async function processShot({
     // height = Math.round(height / 2)
     // }
 
+    /*
     if (width > height) {
-      width = 512
-      height = 288
+      width = 768
+      height = 384
     } else if (width < height) {
-      width = 288
-      height = 512
+      width = 384
+      height = 768
     } else {
       width = 512
       height = 512
     }
+    */
+
+    if (!shotStoryboardSegment?.assetUrl) {
+      const error = `cannot generate a video without a storyboard! (at least not with AnimateDiff-LCM SVD)`
+      console.error(error)
+      throw new Error(error)
+    }
+
    try {
      shotVideoSegment.assetUrl = await render({
-        prompt: getPositivePrompt(shotVideoSegment.prompt),
+        // prompt: getPositivePrompt(shotVideoSegment.prompt),
+        imageInputBase64: shotStoryboardSegment.assetUrl,
        seed: shotSegment.seed,
        width,
        height,
+        // by default we do 1 second of 24 fps
+        // but it would look better if we had 2 seconds of 24 fps
        nbFrames: 80,
        nbFPS: 24,
        nbSteps: 4, // turbo ? 4 : 8,
        debug,
      })
      shotVideoSegment.assetSourceType = getClapAssetSourceType(shotVideoSegment.assetUrl)
+      shotStoryboardSegment.status = "completed"
    } catch (err) {
      console.log(`[api/edit/videos] processShot: failed to generate a video file: ${err}`)
+      shotStoryboardSegment.status = "to_generate"
      throw err
    }
 
@@ -182,7 +197,7 @@ export async function processShot({
 
     shotStoryboardSegment.status = "completed"
   } catch (err) {
-    console.warn(`[api/edit/videos] processShot: couldn't generate the missing storyboard (probably an error with the ffmpeg config). Message:`, err)
+    console.warn(`[api/edit/videos] processShot: couldn't generate the missing storyboard (probably an error with ffmpeg not being found)`)
     shotStoryboardSegment.status = "to_generate"
   }
 
src/app/api/v1/render/animatediff-lcm-svd/cluster.ts ADDED
@@ -0,0 +1,55 @@
+import { sleep } from "@/lib/utils/sleep"
+import { ClusterMachine } from "../../types"
+
+
+
+export const nbClusterMachines = 8
+// make sure the machines are running!!
+
+// https://huggingface.co/spaces/jbilcke-hf/ai-tube-model-als-1/settings
+// https://huggingface.co/spaces/jbilcke-hf/ai-tube-model-als-2/settings
+// https://huggingface.co/spaces/jbilcke-hf/ai-tube-model-als-3/settings
+// https://huggingface.co/spaces/jbilcke-hf/ai-tube-model-als-4/settings
+// https://huggingface.co/spaces/jbilcke-hf/ai-tube-model-als-5/settings
+// https://huggingface.co/spaces/jbilcke-hf/ai-tube-model-als-6/settings
+// https://huggingface.co/spaces/jbilcke-hf/ai-tube-model-als-7/settings
+// https://huggingface.co/spaces/jbilcke-hf/ai-tube-model-als-8/settings
+
+// we maintain a global cluster state
+
+export const clusterMachines: ClusterMachine[] = []
+for (let i = 0; i < nbClusterMachines; i++) {
+  clusterMachines.push({
+    id: i,
+    url: `https://jbilcke-hf-ai-tube-model-als-${i + 1}.hf.space`,
+
+    // careful when trying this one (check number of Gradio parameters, fps etc):
+    // url: `https://jbilcke-hf-ai-tube-model-als-experimental.hf.space`,
+    busy: false
+  })
+}
+
+export async function getClusterMachine(maxWaitTimeInMs: number = 10000): Promise<ClusterMachine> {
+  let clusterMachine: ClusterMachine | undefined = undefined
+  let timeSpentWaitingInMs = 0
+  const intervalInMs = 500
+
+  while (true) {
+    clusterMachine = clusterMachines.find(m => !m.busy)
+    if (clusterMachine) { break }
+    if (timeSpentWaitingInMs > maxWaitTimeInMs) { break }
+    await sleep(intervalInMs)
+    timeSpentWaitingInMs += intervalInMs // track how long we have been waiting, so the timeout above can actually trigger
+  }
+
+  if (!clusterMachine) {
+    throw new Error(`failed to find a cluster machine within ${maxWaitTimeInMs/1000} seconds`)
+  }
+
+  // change the global state
+  clusterMachine.busy = true
+
+  return clusterMachine
+}
+
+export const token = `${process.env.MICROSERVICE_API_SECRET_TOKEN || ""}`
src/app/api/v1/render/animatediff-lcm-svd/index.ts ADDED
@@ -0,0 +1,148 @@
+import { generateSeed, getValidNumber } from "@aitube/clap"
+import { getClusterMachine, token } from "./cluster"
+import { resizeImage } from "@/lib/utils/resizeImage"
+
+/**
+ * Render a video using AnimateDiff-LCM-SVD
+ *
+ * @param request
+ * @returns
+ */
+export async function render(request: {
+  imageInputBase64?: string
+  seed?: number
+  width?: number
+  height?: number
+  nbFrames?: number
+  nbFPS?: number
+  nbSteps?: number
+  debug?: boolean
+}): Promise<string> {
+
+  const imageInputBase64 = request.imageInputBase64 || ""
+  if (!imageInputBase64) {
+    throw new Error(`missing imageInputBase64`)
+  }
+
+  const debug = !!request.debug
+
+  // I think we have a problem with the seed?
+  // const seed = request?.seed || generateSeed()
+
+  // the motion LoRA - could be useful one day
+  const motion = ""
+
+  const nbSteps = getValidNumber(request.nbSteps, 1, 12, 4)
+  const width = getValidNumber(request.width, 256, 1024, 896)
+  const height = getValidNumber(request.height, 256, 1024, 512)
+
+  // important note: by default our AnimateDiff-LCM SVD
+  // is a 24 fps model, so either 24 fps for 1 sec of footage,
+  // or 8 fps for 3 seconds of footage
+  const nbFrames = getValidNumber(request.nbFrames, 10, 120, 24)
+  const nbFPS = getValidNumber(request.nbFPS, 10, 120, 8)
+
+  // by default AnimateDiff generates about 2 seconds of video at 10 fps
+  // the Gradio API now has some code to optionally fix that using FFmpeg,
+  // but this will add some delay overhead, so use with care!
+  const durationInSec = Math.round(nbFrames / nbFPS)
+  const framesPerSec = nbFPS
+
+  // vital step: image size must match the output video size
+  const resizedImageBase64 = await resizeImage({
+    input: imageInputBase64,
+    width,
+    height,
+    debug: true,
+    asBase64: true
+  })
+
+  // console.log(`resizedImage: ${resizedImageBase64.slice(0, 64)}`)
+
+  const machine = await getClusterMachine()
+
+  try {
+    if (debug) {
+      console.log(`calling AnimateDiff-LCM-SVD API with params (some are hidden):`, {
+        motion,
+        nbSteps,
+        width,
+        height,
+        nbFrames,
+        nbFPS,
+        durationInSec,
+        framesPerSec,
+      })
+    }
+
+    const res = await fetch(machine.url + (machine.url.endsWith("/") ? "" : "/") + "api/predict", {
+      method: "POST",
+      headers: {
+        "Content-Type": "application/json",
+        // Authorization: `Bearer ${token}`,
+      },
+      body: JSON.stringify({
+        fn_index: 0, // <- important! it is currently 4, not 1!
+        data: [
+          token,
+          resizedImageBase64,
+          0, // seed,
+          true,
+          33, // motion_bucket_id,
+
+          // attention: we are experimenting with ffmpeg to change the speed,
+          // on the server "als-2"
+          // but only this server supports "durationInSec" as an extra parameter
+
+          durationInSec,
+
+          // same here, if using als-2 you need to pick a small value
+          framesPerSec,
+
+          1.2, // max_guidance_scale,
+          1.0, // min_guidance_scale,
+          width,
+          height,
+          nbSteps,
+        ],
+      }),
+
+      // necessary since we are using the fetch() provided by NextJS
+      cache: "no-store",
+
+      // we can also use this (see https://vercel.com/blog/vercel-cache-api-nextjs-cache)
+      // next: { revalidate: 1 }
+    })
+
+    // console.log("res:", res)
+
+    const { data } = await res.json()
+
+    // console.log("data:", data)
+    // Recommendation: handle errors
+    if (res.status !== 200 || !Array.isArray(data)) {
+      // This will activate the closest `error.js` Error Boundary
+      throw new Error(`Failed to fetch data (status: ${res.status})`)
+    }
+    // console.log("data:", data.slice(0, 50))
+
+    const base64Content = (data?.[0] || "") as string
+
+    if (!base64Content) {
+      throw new Error(`invalid response (no content)`)
+    }
+
+    // this API already emits a data-uri with a content type
+    // addBase64HeaderToMp4(base64Content)
+    return base64Content
+  } catch (err) {
+    if (debug) {
+      console.error(`failed to call the AnimateDiff-LCM-SVD API:`)
+      console.error(err)
+    }
+    throw err
+  } finally {
+    // important: we need to free up the machine!
+    machine.busy = false
+  }
+}
src/app/api/v1/render/{cluster.ts → animatediff-lightning/cluster.ts} RENAMED
@@ -1,5 +1,5 @@
 import { sleep } from "@/lib/utils/sleep"
-import { ClusterMachine } from "../types"
+import { ClusterMachine } from "../../types"
 
 
 // video generation requires A100s so we need to be parsimonious here,
@@ -36,7 +36,7 @@ export async function getClusterMachine(maxWaitTimeInMs: number = 10000): Promise<ClusterMachine> {
   }
 
   if (!clusterMachine) {
-    throw new Error(`failed to find a cluster machine within ${maxWaitTimeInMs/10} seconds`)
+    throw new Error(`failed to find a cluster machine within ${maxWaitTimeInMs/1000} seconds`)
   }
 
   // change the global state
src/app/api/v1/render/{index.ts → animatediff-lightning/index.ts} RENAMED
File without changes
src/app/api/v1/render/route.ts CHANGED
@@ -1,11 +1,12 @@
 import { NextResponse, NextRequest } from "next/server"
 import queryString from "query-string"
-import { ClapMediaOrientation, getValidNumber } from "@aitube/clap"
+import { getValidNumber } from "@aitube/clap"
 
 import { throwIfInvalidToken } from "@/app/api/v1/auth/throwIfInvalidToken"
 import { getContentType } from "@/lib/data/getContentType"
 
-import { render } from "."
+// import { render } from "./animatediff-lcm-svd"
+import { render } from "./animatediff-lightning"
 
 export async function POST(req: NextRequest, res: NextResponse) {
   await throwIfInvalidToken(req.headers.get("Authorization"))
@@ -31,7 +32,7 @@ export async function POST(req: NextRequest, res: NextResponse) {
   const nbFrames = 80
   const nbFPS = 24
   const nbSteps = turbo ? 4 : 8
-  const debug = false
+  const debug = true
 
   const assetUrl = await render({
     prompt,
src/lib/utils/logImage.ts ADDED
@@ -0,0 +1,31 @@
+export async function logImage(uri: string): Promise<void> {
+  // Create an image element
+  const img = new Image();
+
+  // Load the image asynchronously
+  img.src = uri;
+  await new Promise<void>((resolve, reject) => {
+    img.onload = () => resolve();
+    img.onerror = (error) => reject(error);
+  });
+
+  // Get the image dimensions
+  const { width, height } = img;
+
+  // Log the image in the console
+  console.log(
+    "%c+",
+    `font-size: 1px; padding: ${Math.floor(height / 2)}px ${Math.floor(width / 2)}px; line-height: ${height}px; background: url('${uri}'); background-size: ${width}px ${height}px; background-repeat: no-repeat; color: transparent;`
+  );
+}
+
+(async function() {
+
+  if (typeof window !== "undefined") {
+    // Add the logImage function to the console object
+    (console as any).image = logImage;
+
+    // Example usage
+    // console.image('https://example.com/path/to/your/image.jpg');
+  }
+})()
src/lib/utils/resizeImage.ts ADDED
@@ -0,0 +1,55 @@
+import sharp from "sharp";
+
+export type ResizeImageParams = {
+  input: string
+  width?: number
+  height?: number
+  debug?: boolean
+  asBase64?: boolean // TODO: not implemented yet!
+};
+
+/**
+ * Resize an image to a given width and height.
+ * The input image can be a file path or a data URI (base64)
+ * The image ratio will be preserved if only one side is given.
+ * The image format (WebP, Jpeg, PNG) will be preserved.
+ * This function always returns a base64 string (data URI with the mime type)
+ *
+ * @param param0
+ * @returns
+ */
+export async function resizeImage({ input, width, height, debug, asBase64 }: ResizeImageParams): Promise<string> {
+  let inputBuffer: Buffer;
+
+  // Test if input is a data URI
+  const dataUriPattern = /^data:([a-zA-Z]+\/[a-zA-Z]+);base64,(.*)$/;
+  const matches = input.match(dataUriPattern);
+
+  if (matches) {
+    const [, mimeType, base64Data] = matches;
+    if (!/^image\/(png|jpeg|webp|heic)$/.test(mimeType)) {
+      throw new Error(`Unsupported image format. Expected PNG, JPEG, WebP, or HEIC.`);
+    }
+    inputBuffer = Buffer.from(base64Data, "base64");
+  } else {
+    // Assuming input is a file path
+    inputBuffer = await sharp(input).toBuffer();
+  }
+
+  const sharpInstance = sharp(inputBuffer)
+    .resize(width, height, {
+      fit: "inside",
+      withoutEnlargement: true
+    });
+
+  const outputBuffer = await sharpInstance.toBuffer();
+  const outputMimeType = await sharpInstance.metadata().then(meta => meta.format);
+
+  if (!outputMimeType) {
+    throw new Error("Failed to determine the image mime type after resizing.");
+  }
+
+  const prefix = `data:image/${outputMimeType};base64,`;
+  const outputBase64 = outputBuffer.toString("base64");
+  return `${prefix}${outputBase64}`;
+}