Commit 3c87951
Parent(s): 8b547c3

let's up the game

Files changed:

- src/app/api/utils/imagePrompts.ts +1 -0
- src/app/api/v1/edit/dialogues/processShot.ts +3 -1
- src/app/api/v1/edit/entities/generateEntityPrompts.ts +1 -1
- src/app/api/v1/edit/music/cluster.ts +1 -1
- src/app/api/v1/edit/music/generateMusic.ts +1 -1
- src/app/api/v1/edit/music/generateMusicPrompt.ts +1 -2
- src/app/api/v1/edit/sounds/cluster.ts +45 -0
- src/app/api/v1/edit/sounds/generateSound.ts +97 -0
- src/app/api/v1/edit/sounds/generateSoundPrompt.ts +84 -0
- src/app/api/v1/edit/sounds/generateSourceWithMagnet.ts +69 -0
- src/app/api/v1/edit/sounds/processShot.ts +86 -0
- src/app/api/v1/edit/sounds/route.txt +57 -0
- src/app/api/v1/edit/sounds/systemPrompt.ts +25 -0
- src/app/api/v1/edit/sounds/types.ts +7 -0
- src/app/api/v1/edit/storyboards/processShot.ts +3 -1
- src/app/api/v1/edit/videos/processShot.ts +24 -9
- src/app/api/v1/render/animatediff-lcm-svd/cluster.ts +54 -0
- src/app/api/v1/render/animatediff-lcm-svd/index.ts +148 -0
- src/app/api/v1/render/{cluster.ts → animatediff-lightning/cluster.ts} +2 -2
- src/app/api/v1/render/{index.ts → animatediff-lightning/index.ts} +0 -0
- src/app/api/v1/render/route.ts +4 -3
- src/lib/utils/logImage.ts +31 -0
- src/lib/utils/resizeImage.ts +55 -0

src/app/api/utils/imagePrompts.ts
CHANGED

@@ -13,6 +13,7 @@ export function addWordsIfNotPartOfThePrompt(prompt: string = "", words: string[
 
 export function getPositivePrompt(prompt: string = "", triggerWord = "") {
   return addWordsIfNotPartOfThePrompt(prompt, [
+    "cinematic photo",
     triggerWord,
     "sublime",
     "pro quality",

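Note: with this change every positive prompt gets "cinematic photo" inserted ahead of the existing quality boosters. Assuming addWordsIfNotPartOfThePrompt appends each listed word only when the prompt doesn't already contain it (comma-separated), a call would now behave roughly like this (illustrative values only):

    getPositivePrompt("a cat in the rain")
    // → "a cat in the rain, cinematic photo, sublime, pro quality, ..."
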
src/app/api/v1/edit/dialogues/processShot.ts
CHANGED

@@ -39,7 +39,7 @@ export async function processShot({
 
   let shotDialogueSegment: ClapSegment | undefined = shotDialogueSegments.at(0)
 
-  console.log(`[api/edit/dialogues] processShot: shot [${shotSegment.startTimeInMs}:${shotSegment.endTimeInMs}] has ${shotSegments.length} segments (${shotDialogueSegments.length} dialogues)`)
+  // console.log(`[api/edit/dialogues] processShot: shot [${shotSegment.startTimeInMs}:${shotSegment.endTimeInMs}] has ${shotSegments.length} segments (${shotDialogueSegments.length} dialogues)`)
 
   if (shotDialogueSegment && !shotDialogueSegment.assetUrl) {
     // console.log(`[api/edit/dialogues] generating audio..`)
@@ -58,6 +58,8 @@ export async function processShot({
       })
       shotDialogueSegment.assetSourceType = getClapAssetSourceType(shotDialogueSegment.assetUrl)
 
+      shotDialogueSegment.status = "completed"
+
       const { durationInMs, hasAudio } = await getMediaInfo(shotDialogueSegment.assetUrl)
 
       if (hasAudio && durationInMs > 1000) {

src/app/api/v1/edit/entities/generateEntityPrompts.ts
CHANGED

@@ -93,7 +93,7 @@ Now please generate the output entities:`
 
     maybeEntities = parseRawStringToYAML<LatentEntity[]>(rawString, [])
     if (!Array.isArray(maybeEntities) || maybeEntities.length === 0) {
-      console.log(`generateEntityPrompts(): failed to generate
+      console.log(`generateEntityPrompts(): failed to generate entities for the second time, which indicates an issue with the Hugging Face API`)
     }
   }
 

src/app/api/v1/edit/music/cluster.ts
CHANGED

@@ -33,7 +33,7 @@ export async function getClusterMachine(maxWaitTimeInMs: number = 10000): Promise<ClusterMachine> {
   }
 
   if (!clusterMachine) {
-    throw new Error(`failed to find a cluster machine within ${maxWaitTimeInMs/
+    throw new Error(`failed to find a cluster machine within ${maxWaitTimeInMs/1000} seconds`)
   }
 
   // change the global state

src/app/api/v1/edit/music/generateMusic.ts
CHANGED

@@ -48,7 +48,7 @@ export async function generateMusic({
   }
 
 
-  const durationInSec =
+  const durationInSec = 14 // musicSegment.assetDurationInMs / 1000
 
   console.log(`generateMusic(): generating a music with:\n duration: ${durationInSec} sec\n prompt: ${prompt}`)
 

src/app/api/v1/edit/music/generateMusicPrompt.ts
CHANGED

@@ -29,8 +29,7 @@ export async function generateMusicPrompts({
   // console.log("generateMusicPrompts(): latentStory:", latentStory)
 
   const userPrompt = `The input story is about: ${prompt}.
-
-The input story is:
+
 \`\`\`yaml
 ${YAML.stringify(
   // we need to help the LLM by marking the shots with a simple numeric ID

src/app/api/v1/edit/sounds/cluster.ts
ADDED

import { sleep } from "@/lib/utils/sleep"
import { ClusterMachine } from "../../types"

export const nbClusterMachines = 1
// make sure the machines are running!!

// https://huggingface.co/spaces/jbilcke-hf/ai-tube-model-magnet-1/settings
// https://huggingface.co/spaces/jbilcke-hf/ai-tube-model-magnet-2/settings
// https://huggingface.co/spaces/jbilcke-hf/ai-tube-model-magnet-3/settings
// https://huggingface.co/spaces/jbilcke-hf/ai-tube-model-magnet-4/settings

// we maintain a global cluster state

export const clusterMachines: ClusterMachine[] = []
for (let i = 0; i < nbClusterMachines; i++) {
  clusterMachines.push({
    id: i,
    url: `https://jbilcke-hf-ai-tube-model-magnet-${i + 1}.hf.space`,
    busy: false
  })
}

export async function getClusterMachine(maxWaitTimeInMs: number = 10000): Promise<ClusterMachine> {
  let clusterMachine: ClusterMachine | undefined = undefined
  let timeSpentWaitingInMs = 0
  const intervalInMs = 500

  while (true) {
    clusterMachine = clusterMachines.find(m => !m.busy)
    if (clusterMachine) { break }
    if (timeSpentWaitingInMs > maxWaitTimeInMs) { break }
    await sleep(intervalInMs)
  }

  if (!clusterMachine) {
    throw new Error(`failed to find a cluster machine within ${maxWaitTimeInMs/1000} seconds`)
  }

  // change the global state
  clusterMachine.busy = true

  return clusterMachine
}

export const token = `${process.env.MICROSERVICE_API_SECRET_TOKEN || ""}`

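Note: getClusterMachine never increments timeSpentWaitingInMs, so when every machine stays busy the while loop spins forever instead of giving up after maxWaitTimeInMs (the same pattern appears in the other cluster.ts files of this commit). A minimal fix, assuming the timeout is intended:

    while (true) {
      clusterMachine = clusterMachines.find(m => !m.busy)
      if (clusterMachine) { break }
      if (timeSpentWaitingInMs > maxWaitTimeInMs) { break }
      await sleep(intervalInMs)
      timeSpentWaitingInMs += intervalInMs // <- missing from the committed code
    }
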
src/app/api/v1/edit/sounds/generateSound.ts
ADDED

import {
  ClapProject,
  ClapSegment,
  getClapAssetSourceType,
  filterSegments,
  ClapSegmentFilteringMode,
  ClapSegmentCategory,
  newSegment
} from "@aitube/clap"
import { ClapCompletionMode } from "@aitube/client"
import { getMediaInfo } from "@/app/api/utils/getMediaInfo"
import { generateSoundWithMagnet } from "./generateSourceWithMagnet"

export async function generateSound({
  soundSegment,
  existingClap,
  newerClap,
  mode,
  turbo,
}: {
  soundSegment?: ClapSegment
  existingClap: ClapProject
  newerClap: ClapProject
  mode: ClapCompletionMode
  turbo: boolean
}): Promise<void> {
  if (!soundSegment) {
    console.log(`generateSound(): sound segment is empty, so skipping sound generation.`)
    return
  }

  // for now we do something very basic

  if (soundSegment.status === "completed") {
    console.log(`generateSound(): sound segment is already generated, skipping doing it twice.`)
    return
  }

  // for now we do something very basic
  const prompt = soundSegment.prompt
  if (!prompt) {
    console.log(`generateSound(): sound prompt is empty, so skipping sound generation.`)
    return
  }

  const durationInSec = 12 // soundSegment.assetDurationInMs / 1000

  console.log(`generateSound(): generating a sound with:\n duration: ${durationInSec} sec\n prompt: ${prompt}`)

  const assetUrl = await generateSoundWithMagnet({
    prompt,
    durationInSec,
    hd: false,
    debug: true,
    neverThrow: true,
  })

  if (!assetUrl || assetUrl?.length < 30) {
    console.log(`generateSound(): the generated assetUrl is empty, so sound generation failed.`)
    return
  }

  let { durationInMs, hasAudio } = await getMediaInfo(assetUrl)

  const newProperties: Partial<ClapSegment> = {
    assetUrl,
    assetDurationInMs: durationInMs,
    outputGain: 1.0,
    status: "completed"
  }

  if (!hasAudio) {
    console.warn(`generateSound(): the generated sound waveform appears to be silent (might be a ffprobe malfunction)`)
    // return
    // we have a bug on AiTube, basically the ffmpeg probe isn't working,
    // because it doesn't find ffmpeg
    // I think the issue is how the Dockerfile is formed
    // so until this is fixed, we need to fake a "correct" result
    newProperties.assetDurationInMs = soundSegment.assetDurationInMs
  }

  if (mode !== ClapCompletionMode.FULL) {
    console.log(`generateSound(): adding sound to a new clap file`)
    newerClap.segments.push(newSegment({
      ...soundSegment,
      ...newProperties,
    }))
  } else {
    console.log(`generateSound(): overwriting the sound inside the existing clap file`)
    // this will update the existing clap (normally)
    Object.assign(soundSegment, newProperties)
  }
}

src/app/api/v1/edit/sounds/generateSoundPrompt.ts
ADDED

import YAML from "yaml"

import { predict } from "@/app/api/providers/huggingface/predictWithHuggingFace"
import { LatentStory } from "@/app/api/v1/types"

import { systemPrompt } from "./systemPrompt"

export async function generateSoundPrompts({
  prompt = "",
  latentStory = [],
  turbo = false,
}: {
  prompt?: string
  latentStory?: LatentStory[]
  turbo?: boolean
} = {
  prompt: "",
  latentStory: [],
  turbo: false
}): Promise<string[]> {

  if (!prompt.length) { throw new Error(`please provide a prompt`) }
  console.log("generateSoundPrompts(): prompt:", prompt)

  if (!latentStory.length) { throw new Error(`please provide a story`) }

  // console.log("generateSoundPrompts(): latentStory:", latentStory)

  const userPrompt = `The input story is about: ${prompt}.

# Output`
  /*
  NOTE Julian: maybe later we can use this:

  const userPrompt = `The input story is about: ${prompt}.

  \`\`\`yaml
  ${YAML.stringify(
    // we need to help the LLM by marking the shots with a simple numeric ID
    latentStory.map((shot, i) => ({
      shot: i,
      ...shot,
    }))
  )}
  \`\`\`

  # Output`
  */

  const prefix = "\""

  // we don't need a lot here!
  const nbMaxNewTokens = 120

  // TODO use streaming for the Hugging Face prediction
  //
  // note that a Clap file is actually a YAML stream of documents
  // so technically we could stream everything from end-to-end
  // (but I haven't coded the helpers to do this yet)
  let rawString = await predict({
    systemPrompt,
    userPrompt,
    nbMaxNewTokens,
    prefix,
    turbo,
  })

  // console.log("generateEntityPrompts(): rawString: ", rawString)

  let results: string[] = []

  // we remove everything after the last ``` (or ``)
  rawString = rawString.split(/```?/)[0].trim()
  results.push(rawString)

  if (!Array.isArray(results) || typeof results.at(0) !== "string" || !results) {
    throw new Error(`failed to generate the output (rawString is: ${rawString})`)
  }

  return results
}

src/app/api/v1/edit/sounds/generateSourceWithMagnet.ts
ADDED

import { addBase64Header } from "@/lib/data/addBase64Header"
import { SoundGenerationParams } from "./types"
import { getClusterMachine } from "./cluster"

const microserviceApiKey = `${process.env.MICROSERVICE_API_SECRET_TOKEN || ""}`

/**
 * Note: this generates a base64 mp3 file
 */
export async function generateSoundWithMagnet({
  prompt,
  durationInSec,
  hd,
  debug = false,
  neverThrow = false,
}: SoundGenerationParams): Promise<string> {

  if (!prompt?.length) {
    throw new Error(`prompt is too short!`)
  }

  const machine = await getClusterMachine()

  try {
    const res = await fetch(machine.url + (machine.url.endsWith("/") ? "" : "/") + "api/predict", {
      method: "POST",
      headers: {
        "Content-Type": "application/json",
        // Authorization: `Bearer ${token}`,
      },
      body: JSON.stringify({
        fn_index: 1, // <- important!
        data: [
          microserviceApiKey, // string in 'Secret Token' Textbox component
          // TODO
        ],
      }),
      cache: "no-store",
      // we can also use this (see https://vercel.com/blog/vercel-cache-api-nextjs-cache)
      // next: { revalidate: 1 }
    })

    if (res.status !== 200) {
      throw new Error('Failed to fetch data')
    }

    const { data } = await res.json()

    // console.log("data:", data)
    // Recommendation: handle errors
    if (res.status !== 200 || !Array.isArray(data)) {
      // This will activate the closest `error.js` Error Boundary
      throw new Error(`Failed to fetch data (status: ${res.status})`)
    }
    // console.log("data:", data.slice(0, 50))

    if (!data[0]) {
      throw new Error(`the returned sound was empty`)
    }

    // console.log("data:", data[0].slice(0, 60))
    return addBase64Header(data[0] as string, "mp3")
  } catch (err) {
    throw err
  } finally {
    // important: we need to free up the machine!
    machine.busy = false
  }
}

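Note: the Gradio payload currently forwards only the secret token (the // TODO placeholder is in the commit), so prompt, durationInSec and hd are validated but never reach the model, and neverThrow is accepted without being used. For illustration only, a hypothetical call once the payload is completed:

    const mp3DataUri = await generateSoundWithMagnet({
      prompt: "wind, chimes",
      durationInSec: 12,
      hd: false,
      debug: true,
      neverThrow: true,
    })
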
src/app/api/v1/edit/sounds/processShot.ts
ADDED

import {
  ClapProject,
  ClapSegment,
  getClapAssetSourceType,
  filterSegments,
  ClapSegmentFilteringMode,
  ClapSegmentCategory
} from "@aitube/clap"
import { ClapCompletionMode } from "@aitube/client"

import { generateSoundWithMagnet } from "./generateSourceWithMagnet"
import { getMediaInfo } from "@/app/api/utils/getMediaInfo"

export async function processShot({
  shotSegment,
  existingClap,
  newerClap,
  mode,
  turbo,
}: {
  shotSegment: ClapSegment
  existingClap: ClapProject
  newerClap: ClapProject
  mode: ClapCompletionMode
  turbo: boolean
}): Promise<void> {

  const shotSegments: ClapSegment[] = filterSegments(
    ClapSegmentFilteringMode.BOTH,
    shotSegment,
    existingClap.segments
  )

  const shotSoundSegments: ClapSegment[] = shotSegments.filter(s =>
    s.category === ClapSegmentCategory.SOUND
  )

  let shotSoundSegment: ClapSegment | undefined = shotSoundSegments.at(0)

  console.log(`[api/edit/sounds] processShot: shot [${shotSegment.startTimeInMs}:${shotSegment.endTimeInMs}] has ${shotSegments.length} segments (${shotSoundSegments.length} sounds)`)

  if (shotSoundSegment && !shotSoundSegment.assetUrl) {
    // console.log(`[api/edit/sounds] generating background sound effect..`)

    try {
      // this generates a mp3
      shotSoundSegment.assetUrl = await generateSoundWithMagnet({
        prompt: shotSoundSegment.prompt,
        durationInSec: shotSegment.assetDurationInMs,
        hd: false,
        debug: true,
        neverThrow: false,
      })
      shotSoundSegment.assetSourceType = getClapAssetSourceType(shotSoundSegment.assetUrl)

      shotSoundSegment.status = "completed"

      const { durationInMs, hasAudio } = await getMediaInfo(shotSoundSegment.assetUrl)

      if (hasAudio && durationInMs > 1000) {
        shotSoundSegment.assetDurationInMs = durationInMs
        shotSegment.assetDurationInMs = durationInMs

        // we update the duration of all the segments for this shot
        // (it is possible that this makes the two previous lines redundant)
        existingClap.segments.forEach(s => {
          s.assetDurationInMs = durationInMs
        })
      }

    } catch (err) {
      console.log(`[api/edit/sounds] processShot: failed to generate audio: ${err}`)
      throw err
    }

    console.log(`[api/edit/sounds] processShot: generated sound audio: ${shotSoundSegment?.assetUrl?.slice?.(0, 50)}...`)

    // if it's partial, we need to manually add it
    if (mode !== ClapCompletionMode.FULL) {
      newerClap.segments.push(shotSoundSegment)
    }
  } else {
    console.log(`[api/edit/sounds] processShot: there is already a sound audio: ${shotSoundSegment?.assetUrl?.slice?.(0, 50)}...`)
  }
}

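Note: durationInSec is passed shotSegment.assetDurationInMs, a value in milliseconds, although SoundGenerationParams declares it in seconds (a 3000 ms shot would request a 3000-second sound once the Gradio payload forwards it). Assuming seconds are intended, the conversion would be:

    durationInSec: shotSegment.assetDurationInMs / 1000,

Also worth flagging: the loop commented as updating "all the segments for this shot" iterates over existingClap.segments, i.e. every segment in the whole clap, not just shotSegments.
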
src/app/api/v1/edit/sounds/route.txt
ADDED

import { NextResponse, NextRequest } from "next/server"
import queryString from "query-string"
import { ClapProject, ClapSegment, ClapSegmentCategory, newClap, parseClap, serializeClap } from "@aitube/clap"
import { ClapCompletionMode } from "@aitube/client"

import { parseCompletionMode } from "@/app/api/parsers/parseCompletionMode"
import { throwIfInvalidToken } from "@/app/api/v1/auth/throwIfInvalidToken"
import { parseTurbo } from "@/app/api/parsers/parseTurbo"

import { processShot } from "./processShot"
// a helper to generate speech for a Clap
export async function POST(req: NextRequest) {
  await throwIfInvalidToken(req.headers.get("Authorization"))

  const qs = queryString.parseUrl(req.url || "")
  const query = (qs || {}).query

  const mode = parseCompletionMode(query?.c)
  const turbo = parseTurbo(query?.t)

  const blob = await req.blob()

  const existingClap: ClapProject = await parseClap(blob)

  if (!existingClap?.segments) { throw new Error(`no segment found in the provided clap!`) }

  // console.log(`[api/edit/dialogues] detected ${existingClap.segments.length} segments`)

  const shotsSegments: ClapSegment[] = existingClap.segments.filter(s => s.category === ClapSegmentCategory.CAMERA)
  // console.log(`[api/edit/dialogues] detected ${shotsSegments.length} shots`)

  if (shotsSegments.length > 32) {
    throw new Error(`Error, this endpoint being synchronous, it is designed for short stories only (max 32 shots).`)
  }

  const newerClap = mode === ClapCompletionMode.FULL ? existingClap : newClap({
    meta: existingClap.meta
  })

  // we process the shots in parallel (this will increase the queue size in the Gradio spaces)
  await Promise.all(shotsSegments.map(shotSegment =>
    processShot({
      shotSegment,
      existingClap,
      newerClap,
      mode,
      turbo,
    })
  ))

  // console.log(`[api/edit/dialogues] returning the clap augmented with dialogues`)

  return new NextResponse(await serializeClap(newerClap), {
    status: 200,
    headers: new Headers({ "content-type": "application/x-gzip" }),
  })
}

src/app/api/v1/edit/sounds/systemPrompt.ts
ADDED

export const systemPrompt: string = `
You are a backend API engine, designed to generate a background audio and sound effect prompt output from a story input.

## Prompting guidelines

We already know we are generating sound, no need to tell us again, so be concise!
Don't speak too much or give your opinion so don't say things like "The audio track should have a wind and chimes sounds, giving an eerie, ominous mood.." instead just say "wind, chimes".
Avoid concepts that don't translate well to sound.

To create a background soundtrack prompt, you need to combine locations with objects and their characteristics.

## Example of input/output

Given the following input story, provided as YAML:

# Input

"A king goes to see a witch to ask if or how he can win an upcoming and challenging battle"

As you can see, the theme is modern, describing a city. So you should generate an audio soundtrack like this:

## Output

"Downtown New York, busy street, pedestrians, taxis."
`

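Note: the example input (a king consulting a witch before a battle) doesn't match the commentary and sample output, which describe a modern New York street; this looks like a leftover from an earlier draft of the prompt.
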
src/app/api/v1/edit/sounds/types.ts
ADDED

export type SoundGenerationParams = {
  prompt: string
  durationInSec: number
  hd?: boolean
  debug?: boolean
  neverThrow?: boolean
}

src/app/api/v1/edit/storyboards/processShot.ts
CHANGED

@@ -87,12 +87,14 @@ export async function processShot({
         turbo,
       })
       shotStoryboardSegment.assetSourceType = getClapAssetSourceType(shotStoryboardSegment.assetUrl)
+      shotStoryboardSegment.status = "completed"
     } catch (err) {
       console.log(`[api/v1/edit/storyboards] processShot: failed to generate an image: ${err}`)
+      shotStoryboardSegment.status = "to_generate"
       throw err
     }
 
-    console.log(`[api/v1/edit/storyboards] processShot: generated storyboard image: ${shotStoryboardSegment?.assetUrl?.slice?.(0, 50)}...`)
+    // console.log(`[api/v1/edit/storyboards] processShot: generated storyboard image: ${shotStoryboardSegment?.assetUrl?.slice?.(0, 50)}...`)
 
     // if mode is full, newerClap already contains the reference to shotStoryboardSegment
     // but if it's partial, we need to manually add it

src/app/api/v1/edit/videos/processShot.ts
CHANGED

@@ -15,7 +15,8 @@ import { getVideoPrompt } from "@aitube/engine"
 
 import { getPositivePrompt } from "@/app/api/utils/imagePrompts"
 
-import { render } from "@/app/api/v1/render"
+import { render } from "@/app/api/v1/render/animatediff-lcm-svd"
+// import { render } from "@/app/api/v1/render/animatediff-lightning"
 import { extractFirstFrame } from "@/app/api/utils/extractFirstFrame"
 
 export async function processShot({
@@ -49,7 +50,7 @@ export async function processShot({
 
   let shotStoryboardSegment: ClapSegment | undefined = shotStoryboardSegments.at(0)
 
-  console.log(`[api/edit/videos] processShot: shot [${shotSegment.startTimeInMs}:${shotSegment.endTimeInMs}] has ${shotSegments.length} segments (${shotVideoSegments.length} videos)`)
+  // console.log(`[api/edit/videos] processShot: shot [${shotSegment.startTimeInMs}:${shotSegment.endTimeInMs}] has ${shotSegments.length} segments (${shotVideoSegments.length} videos)`)
 
   // TASK 1: GENERATE MISSING VIDEO SEGMENT
   if (!shotVideoSegment) {
@@ -90,7 +91,7 @@ export async function processShot({
 
   // TASK 3: GENERATE MISSING VIDEO FILE
   if (!shotVideoSegment.assetUrl) {
-    console.log(`[api/edit/videos] processShot: generating video file..`)
+    // console.log(`[api/edit/videos] processShot: generating video file..`)
 
     const debug = false
 
@@ -102,30 +103,44 @@ export async function processShot({
     // height = Math.round(height / 2)
     // }
 
+    /*
     if (width > height) {
-      width =
-      height =
+      width = 768
+      height = 384
     } else if (width < height) {
-      width =
-      height =
+      width = 384
+      height = 768
     } else {
       width = 512
       height = 512
     }
+    */
+
+    if (!shotStoryboardSegment?.assetUrl) {
+      const error = `cannot generate a video without a storyboard! (at least not with AnimateDiff-LCM SVD)`
+      console.error(error)
+      throw new Error(error)
+    }
+
     try {
       shotVideoSegment.assetUrl = await render({
-        prompt: getPositivePrompt(shotVideoSegment.prompt),
+        // prompt: getPositivePrompt(shotVideoSegment.prompt),
+        imageInputBase64: shotStoryboardSegment.assetUrl,
         seed: shotSegment.seed,
         width,
        height,
+        // by default we do 1 second of 24 fps
+        // but it would look better if we had 2 seconds of 24 fps
        nbFrames: 80,
        nbFPS: 24,
        nbSteps: 4, // turbo ? 4 : 8,
        debug,
      })
      shotVideoSegment.assetSourceType = getClapAssetSourceType(shotVideoSegment.assetUrl)
+      shotStoryboardSegment.status = "completed"
    } catch (err) {
      console.log(`[api/edit/videos] processShot: failed to generate a video file: ${err}`)
+      shotStoryboardSegment.status = "to_generate"
      throw err
    }
 
@@ -182,7 +197,7 @@ export async function processShot({
 
     shotStoryboardSegment.status = "completed"
   } catch (err) {
-    console.warn(`[api/edit/videos] processShot: couldn't generate the missing storyboard (probably an error with the ffmpeg
+    console.warn(`[api/edit/videos] processShot: couldn't generate the missing storyboard (probably an error with the ffmpeg not being found)`)
     shotStoryboardSegment.status = "to_generate"
   }
 

src/app/api/v1/render/animatediff-lcm-svd/cluster.ts
ADDED

import { sleep } from "@/lib/utils/sleep"
import { ClusterMachine } from "../../types"

export const nbClusterMachines = 8
// make sure the machines are running!!

// https://huggingface.co/spaces/jbilcke-hf/ai-tube-model-als-1/settings
// https://huggingface.co/spaces/jbilcke-hf/ai-tube-model-als-2/settings
// https://huggingface.co/spaces/jbilcke-hf/ai-tube-model-als-3/settings
// https://huggingface.co/spaces/jbilcke-hf/ai-tube-model-als-4/settings
// https://huggingface.co/spaces/jbilcke-hf/ai-tube-model-als-5/settings
// https://huggingface.co/spaces/jbilcke-hf/ai-tube-model-als-6/settings
// https://huggingface.co/spaces/jbilcke-hf/ai-tube-model-als-7/settings
// https://huggingface.co/spaces/jbilcke-hf/ai-tube-model-als-8/settings

// we maintain a global cluster state

export const clusterMachines: ClusterMachine[] = []
for (let i = 0; i < nbClusterMachines; i++) {
  clusterMachines.push({
    id: i,
    url: `https://jbilcke-hf-ai-tube-model-als-${i + 1}.hf.space`,

    // careful when trying this one (check number of Gradio parameters, fps etc):
    // url: `https://jbilcke-hf-ai-tube-model-als-experimental.hf.space`,
    busy: false
  })
}

export async function getClusterMachine(maxWaitTimeInMs: number = 10000): Promise<ClusterMachine> {
  let clusterMachine: ClusterMachine | undefined = undefined
  let timeSpentWaitingInMs = 0
  const intervalInMs = 500

  while (true) {
    clusterMachine = clusterMachines.find(m => !m.busy)
    if (clusterMachine) { break }
    if (timeSpentWaitingInMs > maxWaitTimeInMs) { break }
    await sleep(intervalInMs)
  }

  if (!clusterMachine) {
    throw new Error(`failed to find a cluster machine within ${maxWaitTimeInMs/1000} seconds`)
  }

  // change the global state
  clusterMachine.busy = true

  return clusterMachine
}

export const token = `${process.env.MICROSERVICE_API_SECRET_TOKEN || ""}`

src/app/api/v1/render/animatediff-lcm-svd/index.ts
ADDED

import { generateSeed, getValidNumber } from "@aitube/clap"
import { getClusterMachine, token } from "./cluster"
import { resizeImage } from "@/lib/utils/resizeImage"

/**
 * Render a video using AnimateDiff-LCM-SVD
 *
 * @param request
 * @returns
 */
export async function render(request: {
  imageInputBase64?: string
  seed?: number
  width?: number
  height?: number
  nbFrames?: number
  nbFPS?: number
  nbSteps?: number
  debug?: boolean
}): Promise<string> {

  const imageInputBase64 = request.imageInputBase64 || ""
  if (!imageInputBase64) {
    throw new Error(`missing imageInputBase64`)
  }

  const debug = !!request.debug

  // I think we have a problem with the seed?
  // const seed = request?.seed || generateSeed()

  // the motion LoRA - could be useful one day
  const motion = ""

  const nbSteps = getValidNumber(request.nbSteps, 1, 12, 4)
  const width = getValidNumber(request.width, 256, 1024, 896)
  const height = getValidNumber(request.height, 256, 1024, 512)

  // important note: by default our AnimateDiff-LCM SVD
  // is a 24 fps model, so either 24 fps for 1 sec of footage,
  // or 8 fps for 3 seconds of footage
  const nbFrames = getValidNumber(request.nbFrames, 10, 120, 24)
  const nbFPS = getValidNumber(request.nbFPS, 10, 120, 8)

  // by default AnimateDiff generates about 2 seconds of video at 10 fps
  // the Gradio API now has some code to optional fix that using FFmpeg,
  // but this will add some delay overhead, so use with care!
  const durationInSec = Math.round(nbFrames / nbFPS)
  const framesPerSec = nbFPS

  // vital step: image size must match the output video size
  const resizedImageBase64 = await resizeImage({
    input: imageInputBase64,
    width,
    height,
    debug: true,
    asBase64: true
  })

  // console.log(`resizedImage: ${resizedImageBase64.slice(0, 64)}`)

  const machine = await getClusterMachine()

  try {
    if (debug) {
      console.log(`calling AnimateDiff-LCM-SVD API with params (some are hidden):`, {
        motion,
        nbSteps,
        width,
        height,
        nbFrames,
        nbFPS,
        durationInSec,
        framesPerSec,
      })
    }

    const res = await fetch(machine.url + (machine.url.endsWith("/") ? "" : "/") + "api/predict", {
      method: "POST",
      headers: {
        "Content-Type": "application/json",
        // Authorization: `Bearer ${token}`,
      },
      body: JSON.stringify({
        fn_index: 0, // <- important! it is currently 4, not 1!
        data: [
          token,
          resizedImageBase64,
          0, // seed,
          true,
          33, // motion_bucket_id,

          // attention: we are experimenting with ffmpeg to change the speed,
          // on the server "als-2"
          // but only this server supports "durationInSec" as an extra parameter

          durationInSec,

          // same here, if using als-2 you need to pick a small value
          framesPerSec,

          1.2, // max_guidance_scale,
          1.0, // min_guidance_scale,
          width,
          height,
          nbSteps,
        ],
      }),

      // necessary since we are using the fetch() provided by NextJS
      cache: "no-store",

      // we can also use this (see https://vercel.com/blog/vercel-cache-api-nextjs-cache)
      // next: { revalidate: 1 }
    })

    // console.log("res:", res)

    const { data } = await res.json()

    // console.log("data:", data)
    // Recommendation: handle errors
    if (res.status !== 200 || !Array.isArray(data)) {
      // This will activate the closest `error.js` Error Boundary
      throw new Error(`Failed to fetch data (status: ${res.status})`)
    }
    // console.log("data:", data.slice(0, 50))

    const base64Content = (data?.[0] || "") as string

    if (!base64Content) {
      throw new Error(`invalid response (no content)`)
    }

    // this API already emits a data-uri with a content type
    // addBase64HeaderToMp4(base64Content)
    return base64Content
  } catch (err) {
    if (debug) {
      console.error(`failed to call the AnimateDiff-LCM-SVD API:`)
      console.error(err)
    }
    throw err
  } finally {
    // important: we need to free up the machine!
    machine.busy = false
  }
}

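Note: with the values the committed caller in videos/processShot.ts passes (nbFrames: 80, nbFPS: 24), durationInSec comes out as Math.round(80 / 24) = 3 seconds, and the request seed is currently ignored since the payload hardcodes 0. A minimal call, with illustrative values only (storyboardDataUri is a hypothetical PNG/JPEG data URI):

    const videoDataUri = await render({
      imageInputBase64: storyboardDataUri,
      width: 896,
      height: 512,
      nbFrames: 24, // 1 second at the model's native 24 fps
      nbFPS: 24,
      nbSteps: 4,
      debug: true,
    })
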
src/app/api/v1/render/{cluster.ts → animatediff-lightning/cluster.ts}
RENAMED

@@ -1,5 +1,5 @@
 import { sleep } from "@/lib/utils/sleep"
-import { ClusterMachine } from "
+import { ClusterMachine } from "../../types"
 
 
 // video generation requires A100s so we need to be parsimonious here,
@@ -36,7 +36,7 @@ export async function getClusterMachine(maxWaitTimeInMs: number = 10000): Promise<ClusterMachine> {
   }
 
   if (!clusterMachine) {
-    throw new Error(`failed to find a cluster machine within ${maxWaitTimeInMs/
+    throw new Error(`failed to find a cluster machine within ${maxWaitTimeInMs/1000} seconds`)
   }
 
   // change the global state

src/app/api/v1/render/{index.ts → animatediff-lightning/index.ts}
RENAMED
File without changes
src/app/api/v1/render/route.ts
CHANGED

@@ -1,11 +1,12 @@
 import { NextResponse, NextRequest } from "next/server"
 import queryString from "query-string"
-import {
+import { getValidNumber } from "@aitube/clap"
 
 import { throwIfInvalidToken } from "@/app/api/v1/auth/throwIfInvalidToken"
 import { getContentType } from "@/lib/data/getContentType"
 
-import { render } from "
+// import { render } from "./animatediff-lcm-svd"
+import { render } from "./animatediff-lightning"
 
 export async function POST(req: NextRequest, res: NextResponse) {
   await throwIfInvalidToken(req.headers.get("Authorization"))
@@ -31,7 +32,7 @@ export async function POST(req: NextRequest, res: NextResponse) {
   const nbFrames = 80
   const nbFPS = 24
   const nbSteps = turbo ? 4 : 8
-  const debug =
+  const debug = true
 
   const assetUrl = await render({
     prompt,

src/lib/utils/logImage.ts
ADDED

export async function logImage(uri: string): Promise<void> {
  // Create an image element
  const img = new Image();

  // Load the image asynchronously
  img.src = uri;
  await new Promise<void>((resolve, reject) => {
    img.onload = () => resolve();
    img.onerror = (error) => reject(error);
  });

  // Get the image dimensions
  const { width, height } = img;

  // Log the image in the console
  console.log(
    "%c+",
    `font-size: 1px; padding: ${Math.floor(height / 2)}px ${Math.floor(width / 2)}px; line-height: ${height}px; background: url('${uri}'); background-size: ${width}px ${height}px; background-repeat: no-repeat; color: transparent;`
  );
}

(async function() {

  if (typeof window !== "undefined") {
    // Add the logImage function to the console object
    (console as any).image = logImage;

    // Example usage
    // console.image('https://example.com/path/to/your/image.jpg');
  }
})()

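Note: Image and the %c CSS trick only exist in browsers; the typeof window guard protects the console.image registration, but a direct logImage() call from server-side code (e.g. a Next.js API route) would throw on new Image(). Assuming the helper may be imported on the server, an internal guard would make it safe:

    if (typeof Image === "undefined") {
      // no DOM available: just log the URI instead of rendering the image
      console.log(`logImage(): ${uri.slice(0, 64)}...`)
      return
    }
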
src/lib/utils/resizeImage.ts
ADDED

import sharp from "sharp";

export type ResizeImageParams = {
  input: string
  width?: number
  height?: number
  debug?: boolean
  asBase64?: boolean // TODO: not implemented yet!
};

/**
 * Resize an image to a given width and height.
 * The input image can be a file path or a data URI (base64)
 * The image ratio will be preserved if only one side is given.
 * The image format (WebP, Jpeg, PNG) will be preserved.
 * This function always returns a base64 string (data URI with the mime type)
 *
 * @param param0
 * @returns
 */
export async function resizeImage({ input, width, height, debug, asBase64 }: ResizeImageParams): Promise<string> {
  let inputBuffer: Buffer;

  // Test if input is a data URI
  const dataUriPattern = /^data:([a-zA-Z]+\/[a-zA-Z]+);base64,(.*)$/;
  const matches = input.match(dataUriPattern);

  if (matches) {
    const [, mimeType, base64Data] = matches;
    if (!/^image\/(png|jpeg|webp|heic)$/.test(mimeType)) {
      throw new Error(`Unsupported image format. Expected PNG, JPEG, or WebP.`);
    }
    inputBuffer = Buffer.from(base64Data, "base64");
  } else {
    // Assuming input is a file path
    inputBuffer = await sharp(input).toBuffer();
  }

  const sharpInstance = sharp(inputBuffer)
    .resize(width, height, {
      fit: "inside",
      withoutEnlargement: true
    });

  const outputBuffer = await sharpInstance.toBuffer();
  const outputMimeType = await sharpInstance.metadata().then(meta => meta.format);

  if (!outputMimeType) {
    throw new Error("Failed to determine the image mime type after resizing.");
  }

  const prefix = `data:image/${outputMimeType};base64,`;
  const outputBase64 = outputBuffer.toString("base64");
  return `${prefix}${outputBase64}`;
}

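Note: because of fit: "inside" combined with withoutEnlargement, an input smaller than the requested box, or with a different aspect ratio, comes back at a size other than width x height, while the caller in animatediff-lcm-svd/index.ts states the image size must match the output video size. If an exact match is required, something like fit: "cover" without withoutEnlargement would guarantee it (a sketch of the assumption, not what the commit does):

    const exactFit = sharp(inputBuffer).resize(width, height, { fit: "cover" })

Also, the asBase64 flag is declared but unused: as the doc comment says, the function always returns a data URI regardless.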