jbilcke-hf committed
Commit 3c87951
1 Parent(s): 8b547c3

let's up the game

src/app/api/utils/imagePrompts.ts CHANGED
@@ -13,6 +13,7 @@ export function addWordsIfNotPartOfThePrompt(prompt: string = "", words: string[
 
 export function getPositivePrompt(prompt: string = "", triggerWord = "") {
   return addWordsIfNotPartOfThePrompt(prompt, [
+    "cinematic photo",
     triggerWord,
     "sublime",
     "pro quality",
src/app/api/v1/edit/dialogues/processShot.ts CHANGED
@@ -39,7 +39,7 @@ export async function processShot({
 
   let shotDialogueSegment: ClapSegment | undefined = shotDialogueSegments.at(0)
 
-  console.log(`[api/edit/dialogues] processShot: shot [${shotSegment.startTimeInMs}:${shotSegment.endTimeInMs}] has ${shotSegments.length} segments (${shotDialogueSegments.length} dialogues)`)
+  // console.log(`[api/edit/dialogues] processShot: shot [${shotSegment.startTimeInMs}:${shotSegment.endTimeInMs}] has ${shotSegments.length} segments (${shotDialogueSegments.length} dialogues)`)
 
   if (shotDialogueSegment && !shotDialogueSegment.assetUrl) {
     // console.log(`[api/edit/dialogues] generating audio..`)
@@ -58,6 +58,8 @@
       })
       shotDialogueSegment.assetSourceType = getClapAssetSourceType(shotDialogueSegment.assetUrl)
 
+      shotDialogueSegment.status = "completed"
+
       const { durationInMs, hasAudio } = await getMediaInfo(shotDialogueSegment.assetUrl)
 
       if (hasAudio && durationInMs > 1000) {
src/app/api/v1/edit/entities/generateEntityPrompts.ts CHANGED
@@ -93,7 +93,7 @@ Now please generate the output entities:`
 
     maybeEntities = parseRawStringToYAML<LatentEntity[]>(rawString, [])
     if (!Array.isArray(maybeEntities) || maybeEntities.length === 0) {
-      console.log(`generateEntityPrompts(): failed to generate shots for the second time, which indicates an issue with the Hugging Face API`)
+      console.log(`generateEntityPrompts(): failed to generate entities for the second time, which indicates an issue with the Hugging Face API`)
     }
   }
 
src/app/api/v1/edit/music/cluster.ts CHANGED
@@ -33,7 +33,7 @@ export async function getClusterMachine(maxWaitTimeInMs: number = 10000): Promise<ClusterMachine> {
   }
 
   if (!clusterMachine) {
-    throw new Error(`failed to find a cluster machine within ${maxWaitTimeInMs/10} seconds`)
+    throw new Error(`failed to find a cluster machine within ${maxWaitTimeInMs/1000} seconds`)
   }
 
   // change the global state
src/app/api/v1/edit/music/generateMusic.ts CHANGED
@@ -48,7 +48,7 @@ export async function generateMusic({
   }
 
 
-  const durationInSec = 12 // musicSegment.assetDurationInMs / 1000
+  const durationInSec = 14 // musicSegment.assetDurationInMs / 1000
 
   console.log(`generateMusic(): generating a music with:\n duration: ${durationInSec} sec\n prompt: ${prompt}`)
 
src/app/api/v1/edit/music/generateMusicPrompt.ts CHANGED
@@ -29,8 +29,7 @@ export async function generateMusicPrompts({
   // console.log("generateMusicPrompts(): latentStory:", latentStory)
 
   const userPrompt = `The input story is about: ${prompt}.
-
-The input story is:
+
 \`\`\`yaml
 ${YAML.stringify(
   // we need to help the LLM by marking the shots with a simple numeric ID
src/app/api/v1/edit/sounds/cluster.ts ADDED
@@ -0,0 +1,46 @@
+import { sleep } from "@/lib/utils/sleep"
+import { ClusterMachine } from "../../types"
+
+export const nbClusterMachines = 1
+// make sure the machines are running!!
+
+// https://huggingface.co/spaces/jbilcke-hf/ai-tube-model-magnet-1/settings
+// https://huggingface.co/spaces/jbilcke-hf/ai-tube-model-magnet-2/settings
+// https://huggingface.co/spaces/jbilcke-hf/ai-tube-model-magnet-3/settings
+// https://huggingface.co/spaces/jbilcke-hf/ai-tube-model-magnet-4/settings
+
+// we maintain a global cluster state
+
+export const clusterMachines: ClusterMachine[] = []
+for (let i = 0; i < nbClusterMachines; i++) {
+  clusterMachines.push({
+    id: i,
+    url: `https://jbilcke-hf-ai-tube-model-magnet-${i + 1}.hf.space`,
+    busy: false
+  })
+}
+
+export async function getClusterMachine(maxWaitTimeInMs: number = 10000): Promise<ClusterMachine> {
+  let clusterMachine: ClusterMachine | undefined = undefined
+  let timeSpentWaitingInMs = 0
+  const intervalInMs = 500
+
+  while (true) {
+    clusterMachine = clusterMachines.find(m => !m.busy)
+    if (clusterMachine) { break }
+    if (timeSpentWaitingInMs > maxWaitTimeInMs) { break }
+    await sleep(intervalInMs)
+    timeSpentWaitingInMs += intervalInMs // track how long we have been waiting, so the timeout above can actually trigger
+  }
+
+  if (!clusterMachine) {
+    throw new Error(`failed to find a cluster machine within ${maxWaitTimeInMs/1000} seconds`)
+  }
+
+  // change the global state
+  clusterMachine.busy = true
+
+  return clusterMachine
+}
+
+export const token = `${process.env.MICROSERVICE_API_SECRET_TOKEN || ""}`
src/app/api/v1/edit/sounds/generateSound.ts ADDED
@@ -0,0 +1,97 @@
+
+import {
+  ClapProject,
+  ClapSegment,
+  getClapAssetSourceType,
+  filterSegments,
+  ClapSegmentFilteringMode,
+  ClapSegmentCategory,
+  newSegment
+} from "@aitube/clap"
+import { ClapCompletionMode } from "@aitube/client"
+import { getMediaInfo } from "@/app/api/utils/getMediaInfo"
+import { generateSoundWithMagnet } from "./generateSourceWithMagnet"
+
+export async function generateSound({
+  soundSegment,
+  existingClap,
+  newerClap,
+  mode,
+  turbo,
+}: {
+  soundSegment?: ClapSegment
+  existingClap: ClapProject
+  newerClap: ClapProject
+  mode: ClapCompletionMode
+  turbo: boolean
+}): Promise<void> {
+  if (!soundSegment) {
+    console.log(`generateSound(): sound segment is empty, so skipping sound generation.`)
+    return
+  }
+
+  // for now we do something very basic
+
+  if (soundSegment.status === "completed") {
+    console.log(`generateSound(): sound segment is already generated, skipping doing it twice.`)
+    return
+  }
+
+  // for now we do something very basic
+  const prompt = soundSegment.prompt
+  if (!prompt) {
+    console.log(`generateSound(): sound prompt is empty, so skipping sound generation.`)
+    return
+  }
+
+
+  const durationInSec = 12 // soundSegment.assetDurationInMs / 1000
+
+  console.log(`generateSound(): generating a sound with:\n duration: ${durationInSec} sec\n prompt: ${prompt}`)
+
+  const assetUrl = await generateSoundWithMagnet({
+    prompt,
+    durationInSec,
+    hd: false,
+    debug: true,
+    neverThrow: true,
+  })
+
+
+  if (!assetUrl || assetUrl?.length < 30) {
+    console.log(`generateSound(): the generated assetUrl is empty, so sound generation failed.`)
+    return
+  }
+
+  let { durationInMs, hasAudio } = await getMediaInfo(assetUrl)
+
+  const newProperties: Partial<ClapSegment> = {
+    assetUrl,
+    assetDurationInMs: durationInMs,
+    outputGain: 1.0,
+    status: "completed"
+  }
+
+
+  if (!hasAudio) {
+    console.warn(`generateSound(): the generated sound waveform appears to be silent (might be a ffprobe malfunction)`)
+    // return
+    // we have a bug on AiTube, basically the ffmpeg probe isn't working,
+    // because it doesn't find ffmpeg
+    // I think the issue is how the Dockerfile is formed
+    // so until this is fixed, we need to fake a "correct" result
+    newProperties.assetDurationInMs = soundSegment.assetDurationInMs
+  }
+
+  if (mode !== ClapCompletionMode.FULL) {
+    console.log(`generateSound(): adding sound to a new clap file`)
+    newerClap.segments.push(newSegment({
+      ...soundSegment,
+      ...newProperties,
+    }))
+  } else {
+    console.log(`generateSound(): overwriting the sound inside the existing clap file`)
+    // this will update the existing clap (normally)
+    Object.assign(soundSegment, newProperties)
+  }
+}
src/app/api/v1/edit/sounds/generateSoundPrompt.ts ADDED
@@ -0,0 +1,84 @@
+
+import YAML from "yaml"
+
+import { predict } from "@/app/api/providers/huggingface/predictWithHuggingFace"
+import { LatentStory } from "@/app/api/v1/types"
+
+import { systemPrompt } from "./systemPrompt"
+
+export async function generateSoundPrompts({
+  prompt = "",
+  latentStory = [],
+  turbo = false,
+}: {
+  prompt?: string
+  latentStory?: LatentStory[]
+  turbo?: boolean
+} = {
+  prompt: "",
+  latentStory: [],
+  turbo: false
+}): Promise<string[]> {
+
+  if (!prompt.length) { throw new Error(`please provide a prompt`) }
+  console.log("generateSoundPrompts(): prompt:", prompt)
+
+
+  if (!latentStory.length) { throw new Error(`please provide a story`) }
+
+  // console.log("generateSoundPrompts(): latentStory:", latentStory)
+
+
+  const userPrompt = `The input story is about: ${prompt}.
+
+# Output`
+  /*
+  NOTE Julian: maybe later we can use this:
+
+  const userPrompt = `The input story is about: ${prompt}.
+
+  \`\`\`yaml
+  ${YAML.stringify(
+    // we need to help the LLM by marking the shots with a simple numeric ID
+    latentStory.map((shot, i) => ({
+      shot: i,
+      ...shot,
+    }))
+  )}
+  \`\`\`
+
+  # Output`
+  */
+
+  const prefix = "\""
+
+  // we don't need a lot here!
+  const nbMaxNewTokens = 120
+
+  // TODO use streaming for the Hugging Face prediction
+  //
+  // note that a Clap file is actually a YAML stream of documents
+  // so technically we could stream everything from end-to-end
+  // (but I haven't coded the helpers to do this yet)
+  let rawString = await predict({
+    systemPrompt,
+    userPrompt,
+    nbMaxNewTokens,
+    prefix,
+    turbo,
+  })
+
+  // console.log("generateSoundPrompts(): rawString: ", rawString)
+
+  let results: string[] = []
+
+  // we remove everything after the first ``` (or ``)
+  rawString = rawString.split(/```?/)[0].trim()
+  results.push(rawString)
+
+  if (!Array.isArray(results) || typeof results.at(0) !== "string" || !results) {
+    throw new Error(`failed to generate the output (rawString is: ${rawString})`)
+  }
+
+  return results
+}
src/app/api/v1/edit/sounds/generateSourceWithMagnet.ts ADDED
@@ -0,0 +1,69 @@
+import { addBase64Header } from "@/lib/data/addBase64Header"
+import { SoundGenerationParams } from "./types"
+import { getClusterMachine } from "./cluster"
+
+const microserviceApiKey = `${process.env.MICROSERVICE_API_SECRET_TOKEN || ""}`
+
+/**
+ * Note: this generates a base64 mp3 file
+ */
+export async function generateSoundWithMagnet({
+  prompt,
+  durationInSec,
+  hd,
+  debug = false,
+  neverThrow = false,
+}: SoundGenerationParams): Promise<string> {
+
+  if (!prompt?.length) {
+    throw new Error(`prompt is too short!`)
+  }
+
+  const machine = await getClusterMachine()
+
+  try {
+    const res = await fetch(machine.url + (machine.url.endsWith("/") ? "" : "/") + "api/predict", {
+      method: "POST",
+      headers: {
+        "Content-Type": "application/json",
+        // Authorization: `Bearer ${token}`,
+      },
+      body: JSON.stringify({
+        fn_index: 1, // <- important!
+        data: [
+          microserviceApiKey, // string in 'Secret Token' Textbox component
+          // TODO
+        ],
+      }),
+      cache: "no-store",
+      // we can also use this (see https://vercel.com/blog/vercel-cache-api-nextjs-cache)
+      // next: { revalidate: 1 }
+    })
+
+    if (res.status !== 200) {
+      throw new Error('Failed to fetch data')
+    }
+
+    const { data } = await res.json()
+
+    // console.log("data:", data)
+    // Recommendation: handle errors
+    if (res.status !== 200 || !Array.isArray(data)) {
+      // This will activate the closest `error.js` Error Boundary
+      throw new Error(`Failed to fetch data (status: ${res.status})`)
+    }
+    // console.log("data:", data.slice(0, 50))
+
+    if (!data[0]) {
+      throw new Error(`the returned sound was empty`)
+    }
+
+    // console.log("data:", data[0].slice(0, 60))
+    return addBase64Header(data[0] as string, "mp3")
+  } catch (err) {
+    throw err
+  } finally {
+    // important: we need to free up the machine!
+    machine.busy = false
+  }
+}
src/app/api/v1/edit/sounds/processShot.ts ADDED
@@ -0,0 +1,86 @@
+
+import {
+  ClapProject,
+  ClapSegment,
+  getClapAssetSourceType,
+  filterSegments,
+  ClapSegmentFilteringMode,
+  ClapSegmentCategory
+} from "@aitube/clap"
+import { ClapCompletionMode } from "@aitube/client"
+
+import { generateSoundWithMagnet } from "./generateSourceWithMagnet"
+import { getMediaInfo } from "@/app/api/utils/getMediaInfo"
+
+export async function processShot({
+  shotSegment,
+  existingClap,
+  newerClap,
+  mode,
+  turbo,
+}: {
+  shotSegment: ClapSegment
+  existingClap: ClapProject
+  newerClap: ClapProject
+  mode: ClapCompletionMode
+  turbo: boolean
+}): Promise<void> {
+
+  const shotSegments: ClapSegment[] = filterSegments(
+    ClapSegmentFilteringMode.BOTH,
+    shotSegment,
+    existingClap.segments
+  )
+
+  const shotSoundSegments: ClapSegment[] = shotSegments.filter(s =>
+    s.category === ClapSegmentCategory.SOUND
+  )
+
+  let shotSoundSegment: ClapSegment | undefined = shotSoundSegments.at(0)
+
+  console.log(`[api/edit/sounds] processShot: shot [${shotSegment.startTimeInMs}:${shotSegment.endTimeInMs}] has ${shotSegments.length} segments (${shotSoundSegments.length} sounds)`)
+
+  if (shotSoundSegment && !shotSoundSegment.assetUrl) {
+    // console.log(`[api/edit/sounds] generating background sound effect..`)
+
+    try {
+      // this generates an mp3
+      shotSoundSegment.assetUrl = await generateSoundWithMagnet({
+        prompt: shotSoundSegment.prompt,
+        durationInSec: shotSegment.assetDurationInMs / 1000, // the asset duration is in milliseconds, but this parameter expects seconds
+        hd: false,
+        debug: true,
+        neverThrow: false,
+      })
+      shotSoundSegment.assetSourceType = getClapAssetSourceType(shotSoundSegment.assetUrl)
+
+      shotSoundSegment.status = "completed"
+
+      const { durationInMs, hasAudio } = await getMediaInfo(shotSoundSegment.assetUrl)
+
+      if (hasAudio && durationInMs > 1000) {
+        shotSoundSegment.assetDurationInMs = durationInMs
+        shotSegment.assetDurationInMs = durationInMs
+
+        // we update the duration of all the segments for this shot
+        // (it is possible that this makes the two previous lines redundant)
+        existingClap.segments.forEach(s => {
+          s.assetDurationInMs = durationInMs
+        })
+      }
+
+    } catch (err) {
+      console.log(`[api/edit/sounds] processShot: failed to generate audio: ${err}`)
+      throw err
+    }
+
+    console.log(`[api/edit/sounds] processShot: generated sound audio: ${shotSoundSegment?.assetUrl?.slice?.(0, 50)}...`)
+
+    // if it's partial, we need to manually add it
+    if (mode !== ClapCompletionMode.FULL) {
+      newerClap.segments.push(shotSoundSegment)
+    }
+  } else {
+    console.log(`[api/edit/sounds] processShot: there is already a sound audio: ${shotSoundSegment?.assetUrl?.slice?.(0, 50)}...`)
+  }
+}
src/app/api/v1/edit/sounds/route.txt ADDED
@@ -0,0 +1,57 @@
+import { NextResponse, NextRequest } from "next/server"
+import queryString from "query-string"
+import { ClapProject, ClapSegment, ClapSegmentCategory, newClap, parseClap, serializeClap } from "@aitube/clap"
+import { ClapCompletionMode } from "@aitube/client"
+
+import { parseCompletionMode } from "@/app/api/parsers/parseCompletionMode"
+import { throwIfInvalidToken } from "@/app/api/v1/auth/throwIfInvalidToken"
+import { parseTurbo } from "@/app/api/parsers/parseTurbo"
+
+import { processShot } from "./processShot"
+// a helper to generate sound effects for a Clap
+export async function POST(req: NextRequest) {
+  await throwIfInvalidToken(req.headers.get("Authorization"))
+
+  const qs = queryString.parseUrl(req.url || "")
+  const query = (qs || {}).query
+
+  const mode = parseCompletionMode(query?.c)
+  const turbo = parseTurbo(query?.t)
+
+  const blob = await req.blob()
+
+  const existingClap: ClapProject = await parseClap(blob)
+
+  if (!existingClap?.segments) { throw new Error(`no segment found in the provided clap!`) }
+
+  // console.log(`[api/edit/sounds] detected ${existingClap.segments.length} segments`)
+
+  const shotsSegments: ClapSegment[] = existingClap.segments.filter(s => s.category === ClapSegmentCategory.CAMERA)
+  // console.log(`[api/edit/sounds] detected ${shotsSegments.length} shots`)
+
+  if (shotsSegments.length > 32) {
+    throw new Error(`Error: since this endpoint is synchronous, it is designed for short stories only (max 32 shots).`)
+  }
+
+  const newerClap = mode === ClapCompletionMode.FULL ? existingClap : newClap({
+    meta: existingClap.meta
+  })
+
+  // we process the shots in parallel (this will increase the queue size in the Gradio spaces)
+  await Promise.all(shotsSegments.map(shotSegment =>
+    processShot({
+      shotSegment,
+      existingClap,
+      newerClap,
+      mode,
+      turbo,
+    })
+  ))
+
+  // console.log(`[api/edit/sounds] returning the clap augmented with sounds`)
+
+  return new NextResponse(await serializeClap(newerClap), {
+    status: 200,
+    headers: new Headers({ "content-type": "application/x-gzip" }),
+  })
+}
src/app/api/v1/edit/sounds/systemPrompt.ts ADDED
@@ -0,0 +1,25 @@
+export const systemPrompt: string = `
+You are a backend API engine, designed to generate a background audio and sound effect prompt output from a story input.
+
+## Prompting guidelines
+
+We already know we are generating sound, no need to tell us again, so be concise!
+Don't speak too much or give your opinion, so don't say things like "The audio track should have wind and chime sounds, giving an eerie, ominous mood." Instead just say "wind, chimes".
+Avoid concepts that don't translate well to sound.
+
+To create a background soundtrack prompt, you need to combine locations with objects and their characteristics.
+
+## Example of input/output
+
+Given the following input story, provided as YAML:
+
+# Input
+
+"A king goes to see a witch to ask if or how he can win an upcoming and challenging battle"
+
+As you can see, the theme is modern, describing a city. So you should generate an audio soundtrack like this:
+
+## Output
+
+"Downtown New York, busy street, pedestrians, taxis."
+`
src/app/api/v1/edit/sounds/types.ts ADDED
@@ -0,0 +1,7 @@
+export type SoundGenerationParams = {
+  prompt: string
+  durationInSec: number
+  hd?: boolean
+  debug?: boolean
+  neverThrow?: boolean
+}
src/app/api/v1/edit/storyboards/processShot.ts CHANGED
@@ -87,12 +87,14 @@ export async function processShot({
       turbo,
     })
     shotStoryboardSegment.assetSourceType = getClapAssetSourceType(shotStoryboardSegment.assetUrl)
+    shotStoryboardSegment.status = "completed"
   } catch (err) {
     console.log(`[api/v1/edit/storyboards] processShot: failed to generate an image: ${err}`)
+    shotStoryboardSegment.status = "to_generate"
    throw err
  }
 
-  console.log(`[api/v1/edit/storyboards] processShot: generated storyboard image: ${shotStoryboardSegment?.assetUrl?.slice?.(0, 50)}...`)
+  // console.log(`[api/v1/edit/storyboards] processShot: generated storyboard image: ${shotStoryboardSegment?.assetUrl?.slice?.(0, 50)}...`)
 
   // if mode is full, newerClap already contains the reference to shotStoryboardSegment
   // but if it's partial, we need to manually add it
src/app/api/v1/edit/videos/processShot.ts CHANGED
@@ -15,7 +15,8 @@ import { getVideoPrompt } from "@aitube/engine"
 
 import { getPositivePrompt } from "@/app/api/utils/imagePrompts"
 
-import { render } from "@/app/api/v1/render"
+import { render } from "@/app/api/v1/render/animatediff-lcm-svd"
+// import { render } from "@/app/api/v1/render/animatediff-lightning"
 import { extractFirstFrame } from "@/app/api/utils/extractFirstFrame"
 
 export async function processShot({
@@ -49,7 +50,7 @@ export async function processShot({
 
   let shotStoryboardSegment: ClapSegment | undefined = shotStoryboardSegments.at(0)
 
-  console.log(`[api/edit/videos] processShot: shot [${shotSegment.startTimeInMs}:${shotSegment.endTimeInMs}] has ${shotSegments.length} segments (${shotVideoSegments.length} videos)`)
+  // console.log(`[api/edit/videos] processShot: shot [${shotSegment.startTimeInMs}:${shotSegment.endTimeInMs}] has ${shotSegments.length} segments (${shotVideoSegments.length} videos)`)
 
   // TASK 1: GENERATE MISSING VIDEO SEGMENT
   if (!shotVideoSegment) {
@@ -90,7 +91,7 @@ export async function processShot({
 
   // TASK 3: GENERATE MISSING VIDEO FILE
   if (!shotVideoSegment.assetUrl) {
-    console.log(`[api/edit/videos] processShot: generating video file..`)
+    // console.log(`[api/edit/videos] processShot: generating video file..`)
 
     const debug = false
 
@@ -102,30 +103,44 @@ export async function processShot({
     // height = Math.round(height / 2)
     // }
 
+    /*
     if (width > height) {
-      width = 512
-      height = 288
+      width = 768
+      height = 384
     } else if (width < height) {
-      width = 288
-      height = 512
+      width = 384
+      height = 768
     } else {
       width = 512
       height = 512
     }
+    */
+
+    if (!shotStoryboardSegment?.assetUrl) {
+      const error = `cannot generate a video without a storyboard! (at least not with AnimateDiff-LCM SVD)`
+      console.error(error)
+      throw new Error(error)
+    }
+
    try {
      shotVideoSegment.assetUrl = await render({
-        prompt: getPositivePrompt(shotVideoSegment.prompt),
+        // prompt: getPositivePrompt(shotVideoSegment.prompt),
+        imageInputBase64: shotStoryboardSegment.assetUrl,
        seed: shotSegment.seed,
        width,
        height,
+        // by default we do 1 second of 24 fps
+        // but it would look better if we had 2 seconds of 24 fps
        nbFrames: 80,
        nbFPS: 24,
        nbSteps: 4, // turbo ? 4 : 8,
        debug,
      })
      shotVideoSegment.assetSourceType = getClapAssetSourceType(shotVideoSegment.assetUrl)
+      shotStoryboardSegment.status = "completed"
    } catch (err) {
      console.log(`[api/edit/videos] processShot: failed to generate a video file: ${err}`)
+      shotStoryboardSegment.status = "to_generate"
      throw err
    }
 
@@ -182,7 +197,7 @@ export async function processShot({
 
     shotStoryboardSegment.status = "completed"
   } catch (err) {
-    console.warn(`[api/edit/videos] processShot: couldn't generate the missing storyboard (probably an error with the ffmpeg config). Message:`, err)
+    console.warn(`[api/edit/videos] processShot: couldn't generate the missing storyboard (probably an error with ffmpeg not being found)`)
     shotStoryboardSegment.status = "to_generate"
   }
 
src/app/api/v1/render/animatediff-lcm-svd/cluster.ts ADDED
@@ -0,0 +1,55 @@
+import { sleep } from "@/lib/utils/sleep"
+import { ClusterMachine } from "../../types"
+
+
+
+export const nbClusterMachines = 8
+// make sure the machines are running!!
+
+// https://huggingface.co/spaces/jbilcke-hf/ai-tube-model-als-1/settings
+// https://huggingface.co/spaces/jbilcke-hf/ai-tube-model-als-2/settings
+// https://huggingface.co/spaces/jbilcke-hf/ai-tube-model-als-3/settings
+// https://huggingface.co/spaces/jbilcke-hf/ai-tube-model-als-4/settings
+// https://huggingface.co/spaces/jbilcke-hf/ai-tube-model-als-5/settings
+// https://huggingface.co/spaces/jbilcke-hf/ai-tube-model-als-6/settings
+// https://huggingface.co/spaces/jbilcke-hf/ai-tube-model-als-7/settings
+// https://huggingface.co/spaces/jbilcke-hf/ai-tube-model-als-8/settings
+
+// we maintain a global cluster state
+
+export const clusterMachines: ClusterMachine[] = []
+for (let i = 0; i < nbClusterMachines; i++) {
+  clusterMachines.push({
+    id: i,
+    url: `https://jbilcke-hf-ai-tube-model-als-${i + 1}.hf.space`,
+
+    // careful when trying this one (check number of Gradio parameters, fps etc):
+    // url: `https://jbilcke-hf-ai-tube-model-als-experimental.hf.space`,
+    busy: false
+  })
+}
+
+export async function getClusterMachine(maxWaitTimeInMs: number = 10000): Promise<ClusterMachine> {
+  let clusterMachine: ClusterMachine | undefined = undefined
+  let timeSpentWaitingInMs = 0
+  const intervalInMs = 500
+
+  while (true) {
+    clusterMachine = clusterMachines.find(m => !m.busy)
+    if (clusterMachine) { break }
+    if (timeSpentWaitingInMs > maxWaitTimeInMs) { break }
+    await sleep(intervalInMs)
+    timeSpentWaitingInMs += intervalInMs // track how long we have been waiting, so the timeout above can actually trigger
+  }
+
+  if (!clusterMachine) {
+    throw new Error(`failed to find a cluster machine within ${maxWaitTimeInMs/1000} seconds`)
+  }
+
+  // change the global state
+  clusterMachine.busy = true
+
+  return clusterMachine
+}
+
+export const token = `${process.env.MICROSERVICE_API_SECRET_TOKEN || ""}`
src/app/api/v1/render/animatediff-lcm-svd/index.ts ADDED
@@ -0,0 +1,148 @@
+import { generateSeed, getValidNumber } from "@aitube/clap"
+import { getClusterMachine, token } from "./cluster"
+import { resizeImage } from "@/lib/utils/resizeImage"
+
+/**
+ * Render a video using AnimateDiff-LCM-SVD
+ *
+ * @param request
+ * @returns
+ */
+export async function render(request: {
+  imageInputBase64?: string
+  seed?: number
+  width?: number
+  height?: number
+  nbFrames?: number
+  nbFPS?: number
+  nbSteps?: number
+  debug?: boolean
+}): Promise<string> {
+
+  const imageInputBase64 = request.imageInputBase64 || ""
+  if (!imageInputBase64) {
+    throw new Error(`missing imageInputBase64`)
+  }
+
+  const debug = !!request.debug
+
+  // I think we have a problem with the seed?
+  // const seed = request?.seed || generateSeed()
+
+  // the motion LoRA - could be useful one day
+  const motion = ""
+
+  const nbSteps = getValidNumber(request.nbSteps, 1, 12, 4)
+  const width = getValidNumber(request.width, 256, 1024, 896)
+  const height = getValidNumber(request.height, 256, 1024, 512)
+
+  // important note: by default our AnimateDiff-LCM SVD
+  // is a 24 fps model, so either 24 fps for 1 sec of footage,
+  // or 8 fps for 3 seconds of footage
+  const nbFrames = getValidNumber(request.nbFrames, 10, 120, 24)
+  const nbFPS = getValidNumber(request.nbFPS, 10, 120, 8)
+
+  // by default AnimateDiff generates about 2 seconds of video at 10 fps
+  // the Gradio API now has some code to optionally fix that using FFmpeg,
+  // but this will add some delay overhead, so use with care!
+  const durationInSec = Math.round(nbFrames / nbFPS)
+  const framesPerSec = nbFPS
+
+  // vital step: image size must match the output video size
+  const resizedImageBase64 = await resizeImage({
+    input: imageInputBase64,
+    width,
+    height,
+    debug: true,
+    asBase64: true
+  })
+
+  // console.log(`resizedImage: ${resizedImageBase64.slice(0, 64)}`)
+
+  const machine = await getClusterMachine()
+
+  try {
+    if (debug) {
+      console.log(`calling AnimateDiff-LCM-SVD API with params (some are hidden):`, {
+        motion,
+        nbSteps,
+        width,
+        height,
+        nbFrames,
+        nbFPS,
+        durationInSec,
+        framesPerSec,
+      })
+    }
+
+    const res = await fetch(machine.url + (machine.url.endsWith("/") ? "" : "/") + "api/predict", {
+      method: "POST",
+      headers: {
+        "Content-Type": "application/json",
+        // Authorization: `Bearer ${token}`,
+      },
+      body: JSON.stringify({
+        fn_index: 0, // <- important! it is currently 4, not 1!
+        data: [
+          token,
+          resizedImageBase64,
+          0, // seed,
+          true,
+          33, // motion_bucket_id,
+
+          // attention: we are experimenting with ffmpeg to change the speed,
+          // on the server "als-2"
+          // but only this server supports "durationInSec" as an extra parameter
+
+          durationInSec,
+
+          // same here, if using als-2 you need to pick a small value
+          framesPerSec,
+
+          1.2, // max_guidance_scale,
+          1.0, // min_guidance_scale,
+          width,
+          height,
+          nbSteps,
+        ],
+      }),
+
+      // necessary since we are using the fetch() provided by NextJS
+      cache: "no-store",
+
+      // we can also use this (see https://vercel.com/blog/vercel-cache-api-nextjs-cache)
+      // next: { revalidate: 1 }
+    })
+
+    // console.log("res:", res)
+
+    const { data } = await res.json()
+
+    // console.log("data:", data)
+    // Recommendation: handle errors
+    if (res.status !== 200 || !Array.isArray(data)) {
+      // This will activate the closest `error.js` Error Boundary
+      throw new Error(`Failed to fetch data (status: ${res.status})`)
+    }
+    // console.log("data:", data.slice(0, 50))
+
+    const base64Content = (data?.[0] || "") as string
+
+    if (!base64Content) {
+      throw new Error(`invalid response (no content)`)
+    }
+
+    // this API already emits a data-uri with a content type
+    // addBase64HeaderToMp4(base64Content)
+    return base64Content
+  } catch (err) {
+    if (debug) {
+      console.error(`failed to call the AnimateDiff-LCM-SVD API:`)
+      console.error(err)
+    }
+    throw err
+  } finally {
+    // important: we need to free up the machine!
+    machine.busy = false
+  }
+}
src/app/api/v1/render/{cluster.ts → animatediff-lightning/cluster.ts} RENAMED
@@ -1,5 +1,5 @@
 import { sleep } from "@/lib/utils/sleep"
-import { ClusterMachine } from "../types"
+import { ClusterMachine } from "../../types"
 
 
 // video generation requires A100s so we need to be parsimonious here,
@@ -36,7 +36,7 @@ export async function getClusterMachine(maxWaitTimeInMs: number = 10000): Promise<ClusterMachine> {
   }
 
   if (!clusterMachine) {
-    throw new Error(`failed to find a cluster machine within ${maxWaitTimeInMs/10} seconds`)
+    throw new Error(`failed to find a cluster machine within ${maxWaitTimeInMs/1000} seconds`)
   }
 
   // change the global state
src/app/api/v1/render/{index.ts → animatediff-lightning/index.ts} RENAMED
File without changes
src/app/api/v1/render/route.ts CHANGED
@@ -1,11 +1,12 @@
 import { NextResponse, NextRequest } from "next/server"
 import queryString from "query-string"
-import { ClapMediaOrientation, getValidNumber } from "@aitube/clap"
+import { getValidNumber } from "@aitube/clap"
 
 import { throwIfInvalidToken } from "@/app/api/v1/auth/throwIfInvalidToken"
 import { getContentType } from "@/lib/data/getContentType"
 
-import { render } from "."
+// import { render } from "./animatediff-lcm-svd"
+import { render } from "./animatediff-lightning"
 
 export async function POST(req: NextRequest, res: NextResponse) {
   await throwIfInvalidToken(req.headers.get("Authorization"))
@@ -31,7 +32,7 @@ export async function POST(req: NextRequest, res: NextResponse) {
   const nbFrames = 80
   const nbFPS = 24
   const nbSteps = turbo ? 4 : 8
-  const debug = false
+  const debug = true
 
   const assetUrl = await render({
     prompt,
src/lib/utils/logImage.ts ADDED
@@ -0,0 +1,31 @@
+export async function logImage(uri: string): Promise<void> {
+  // Create an image element
+  const img = new Image();
+
+  // Load the image asynchronously
+  img.src = uri;
+  await new Promise<void>((resolve, reject) => {
+    img.onload = () => resolve();
+    img.onerror = (error) => reject(error);
+  });
+
+  // Get the image dimensions
+  const { width, height } = img;
+
+  // Log the image in the console
+  console.log(
+    "%c+",
+    `font-size: 1px; padding: ${Math.floor(height / 2)}px ${Math.floor(width / 2)}px; line-height: ${height}px; background: url('${uri}'); background-size: ${width}px ${height}px; background-repeat: no-repeat; color: transparent;`
+  );
+}
+
+(async function() {
+
+  if (typeof window !== "undefined") {
+    // Add the logImage function to the console object
+    (console as any).image = logImage;
+
+    // Example usage
+    // console.image('https://example.com/path/to/your/image.jpg');
+  }
+})()
src/lib/utils/resizeImage.ts ADDED
@@ -0,0 +1,55 @@
+import sharp from "sharp";
+
+export type ResizeImageParams = {
+  input: string
+  width?: number
+  height?: number
+  debug?: boolean
+  asBase64?: boolean // TODO: not implemented yet!
+};
+
+/**
+ * Resize an image to a given width and height.
+ * The input image can be a file path or a data URI (base64)
+ * The image ratio will be preserved if only one side is given.
+ * The image format (WebP, Jpeg, PNG) will be preserved.
+ * This function always returns a base64 string (data URI with the mime type)
+ *
+ * @param param0
+ * @returns
+ */
+export async function resizeImage({ input, width, height, debug, asBase64 }: ResizeImageParams): Promise<string> {
+  let inputBuffer: Buffer;
+
+  // Test if input is a data URI
+  const dataUriPattern = /^data:([a-zA-Z]+\/[a-zA-Z]+);base64,(.*)$/;
+  const matches = input.match(dataUriPattern);
+
+  if (matches) {
+    const [, mimeType, base64Data] = matches;
+    if (!/^image\/(png|jpeg|webp|heic)$/.test(mimeType)) {
+      throw new Error(`Unsupported image format. Expected PNG, JPEG, WebP, or HEIC.`);
+    }
+    inputBuffer = Buffer.from(base64Data, "base64");
+  } else {
+    // Assuming input is a file path
+    inputBuffer = await sharp(input).toBuffer();
+  }
+
+  const sharpInstance = sharp(inputBuffer)
+    .resize(width, height, {
+      fit: "inside",
+      withoutEnlargement: true
+    });
+
+  const outputBuffer = await sharpInstance.toBuffer();
+  const outputMimeType = await sharpInstance.metadata().then(meta => meta.format);
+
+  if (!outputMimeType) {
+    throw new Error("Failed to determine the image mime type after resizing.");
+  }
+
+  const prefix = `data:image/${outputMimeType};base64,`;
+  const outputBase64 = outputBuffer.toString("base64");
+  return `${prefix}${outputBase64}`;
+}