import modal
from pathlib import Path
# Creating our Modal App
app = modal.App("mochi-finetune")
# Creating volumes for the raw videos, prepared videos, base model weights, finetuned weights, and generated outputs
videos_volume = modal.Volume.from_name("mochi-tune-videos", create_if_missing=True)
videos_prepared_volume = modal.Volume.from_name("mochi-tune-videos-prepared", create_if_missing=True)
weights_volume = modal.Volume.from_name("mochi-tune-weights", create_if_missing=True)
finetunes_volume = modal.Volume.from_name("mochi-tune-finetunes", create_if_missing=True)
outputs_volume = modal.Volume.from_name("mochi-tune-outputs", create_if_missing=True)
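# (Optional) You can inspect any of these volumes from your terminal with the Modal CLI, e.g.:
#   modal volume ls mochi-tune-videos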
USERNAME = "genmoai"
REPOSITORY = "mochi"
CLONE_CMD = f"git clone https://github.com/{USERNAME}/{REPOSITORY}.git"
# Building our container image
base_img = (
modal.Image.debian_slim()
.apt_install("git", "ffmpeg", "bc", "zlib1g-dev", "libjpeg-dev", "wget")
.run_commands(CLONE_CMD)
.workdir(REPOSITORY)
.pip_install("gdown", "setuptools", "wheel")
.run_commands('pip install -e . --no-build-isolation')
)
MINUTES = 60
HOURS = 60 * MINUTES
# Remote function for downloading a labeled video dataset from Google Drive
# Run it with:
# modal run main::download_videos
@app.function(image=base_img,
volumes={
"/videos": videos_volume,
}
)
def download_videos():
    '''Downloads the labeled video dataset from Google Drive into our volume'''
import gdown
import zipfile
name = "dissolve"
url = "https://drive.google.com/uc?id=1ldoBppcsv5Ueoikh0zCmNviojRCrGXQN"
output = f"{name}.zip"
gdown.download(url, output, quiet=False)
with zipfile.ZipFile(output, "r") as zip_ref:
zip_ref.extractall("/videos")
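# A small debugging helper (hypothetical, not part of the original pipeline): it
# lists the extracted files so you can confirm the unzip above succeeded.
# Run it with:
# modal run main::list_videos
@app.function(image=base_img, volumes={"/videos": videos_volume})
def list_videos():
    '''Prints the contents of the videos volume'''
    for path in Path("/videos").rglob("*"):
        print(path)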
# Remote function for downloading the model weights from Hugging Face
# Run it with:
# modal run main::download_weights
@app.function(image=base_img,
volumes={
"/weights": weights_volume,
},
timeout=1*HOURS,
)
def download_weights():
# HF-transfer and snapshot download tend to hang on the large model, so we download it manually with wget
import subprocess
print("🍑 Downloading weights from Hugging Face. This may take 30 minutes.")
# ~30 min
subprocess.run(["wget", "https://huggingface.co/genmo/mochi-1-preview/resolve/main/dit.safetensors", "-O", "/weights/dit.safetensors"])
# ~1 min
subprocess.run(["wget", "https://huggingface.co/genmo/mochi-1-preview/resolve/main/decoder.safetensors", "-O", "/weights/decoder.safetensors"])
# ~20 sec
subprocess.run(["wget", "https://huggingface.co/genmo/mochi-1-preview/resolve/main/encoder.safetensors", "-O", "/weights/encoder.safetensors"])
# Remote function for preprocessing the video dataset
# Run it with:
# modal run main::preprocess
@app.function(
image=base_img,
volumes={
"/videos": videos_volume,
"/videos_prepared": videos_prepared_volume,
"/weights": weights_volume,
},
timeout=30*MINUTES,
gpu="H100"
)
def preprocess():
import subprocess
print("🍑 Preprocessing videos. This may take 2-3 minutes.")
video_dir = "videos_dissolve"
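    # Flags follow the genmoai preprocess script: -v is the raw video directory,
    # -o the prepared output directory, -w the weights directory; -n is
    # presumably the number of frames per clip (an assumption, not verified here).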
subprocess.run([
"bash", "demos/fine_tuner/preprocess.bash",
"-v", f"/videos/{video_dir}/",
"-o", "/videos_prepared/",
"-w", "/weights/",
"-n", "37"
])
# Remote function for finetuning the model using the prepared dataset
# Configure the run in lora.yaml
# Run it with:
# modal run main::finetune
@app.function(
image=base_img,
volumes={
"/videos": videos_volume,
"/videos_prepared": videos_prepared_volume,
"/weights": weights_volume,
"/finetunes": finetunes_volume,
},
mounts=[modal.Mount.from_local_file("lora.yaml", remote_path=f"{REPOSITORY}/lora.yaml")],
timeout=4*HOURS,
gpu="H100"
)
def finetune():
import subprocess
print("🍑 Finetuning Mochi. This may take 3 hours.")
print("🍑 See your mochi-tune-finetunes volume for intermediate checkpoints and samples.")
subprocess.run([
"bash", "demos/fine_tuner/run.bash",
"-c", "lora.yaml", # from our locally mounted yaml file
"-n", "1",
])
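# Checkpoints can then be pulled down locally with the Modal CLI, e.g.
# (the exact remote path depends on your lora.yaml configuration):
#   modal volume get mochi-tune-finetunes my_mochi_lora/model_2000.lora.safetensors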
# Remote class (a Modal @app.cls) for running inference and generating videos
# Run it with the local_entrypoint below
@app.cls(
image = base_img,
volumes={
"/weights": weights_volume,
"/finetunes": finetunes_volume,
"/outputs": outputs_volume,
},
timeout=30*MINUTES,
gpu="H100"
)
class MochiLora:
    def __init__(self, model_dir: str = "/weights", lora_path: str | None = None, cpu_offload: bool = False):
self.model_dir = model_dir
self.lora_path = lora_path
self.cpu_offload = cpu_offload
    @modal.enter()
    def start(self):
        """Initialize the model - this runs once when the container starts"""
        import torch
        from genmo.mochi_preview.pipelines import (
            DecoderModelFactory,
            DitModelFactory,
            MochiMultiGPUPipeline,
            MochiSingleGPUPipeline,
            T5ModelFactory,
        )
        print("🍑 Loading Mochi model.")
self.num_gpus = torch.cuda.device_count()
# Configure pipeline based on GPU count
klass = MochiSingleGPUPipeline if self.num_gpus == 1 else MochiMultiGPUPipeline
kwargs = dict(
text_encoder_factory=T5ModelFactory(),
dit_factory=DitModelFactory(
model_path=f"{self.model_dir}/dit.safetensors",
lora_path=self.lora_path,
model_dtype="bf16",
),
decoder_factory=DecoderModelFactory(
model_path=f"{self.model_dir}/decoder.safetensors",
),
)
if self.num_gpus > 1:
            assert not self.lora_path, "LoRA not supported in multi-GPU mode"
            assert not self.cpu_offload, "CPU offload not supported in multi-GPU mode"
kwargs["world_size"] = self.num_gpus
else:
kwargs["cpu_offload"] = self.cpu_offload
kwargs["decode_type"] = "tiled_spatial"
kwargs["fast_init"] = not self.lora_path
kwargs["strict_load"] = not self.lora_path
kwargs["decode_args"] = dict(overlap=8)
self.pipeline = klass(**kwargs)
print(f"🍑 Model loaded successfully with {self.num_gpus} GPUs")
@modal.method()
def generate(self,
prompt: str,
negative_prompt: str = "",
width: int = 848,
height: int = 480,
num_frames: int = 163,
seed: int = 1710977262,
cfg_scale: float = 6.0,
num_inference_steps: int = 64) -> str:
"""Generate video based on the prompt and parameters"""
print("🍑 Generating video.")
import json
import os
import time
import numpy as np
from genmo.lib.progress import progress_bar
from genmo.lib.utils import save_video
from genmo.mochi_preview.pipelines import linear_quadratic_schedule
# Create sigma schedule
sigma_schedule = linear_quadratic_schedule(num_inference_steps, 0.025)
cfg_schedule = [cfg_scale] * num_inference_steps
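        # A constant guidance weight at every step; 0.025 above is presumably
        # the schedule's threshold_noise argument (an assumption based on the
        # genmo source, not verified here).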
args = {
"height": height,
"width": width,
"num_frames": num_frames,
"sigma_schedule": sigma_schedule,
"cfg_schedule": cfg_schedule,
"num_inference_steps": num_inference_steps,
"batch_cfg": False,
"prompt": prompt,
"negative_prompt": negative_prompt,
"seed": seed,
}
with progress_bar(type="tqdm"):
final_frames = self.pipeline(**args)
final_frames = final_frames[0]
assert isinstance(final_frames, np.ndarray)
assert final_frames.dtype == np.float32
        # Save to the mounted outputs volume
        output_dir = "/outputs"
os.makedirs(output_dir, exist_ok=True)
output_path = os.path.join(output_dir, f"output_{int(time.time())}.mp4")
save_video(final_frames, output_path)
        # Save generation parameters alongside the video
        json_path = os.path.splitext(output_path)[0] + ".json"
        with open(json_path, "w") as f:
            json.dump(args, f, indent=4)
print(f"🍑 Video saved to {output_path}")
outputs_volume.commit()
return output_path.split("/")[-1]
# Local entrypoint for using the MochiLora class
# Select the lora_path you want to use from the finetunes volume
# Then run it with:
# modal run main
@app.local_entrypoint()
def main(
prompt="A pristine snowglobe featuring a winter scene sits peacefully. The glass begins to crumble into fine powder, as the entire sphere deteriorates into sparkling dust that drifts outward. The fake snow mingles with the crystalline particles, creating a glittering cloud captured in high-speed photography.",
negative_prompt="blurry, low quality",
width=848,
height=480,
num_frames=49, # (num_frames - 1) must be divisible by 6
seed=1710977262,
cfg_scale=6.0,
num_inference_steps=64,
lora_path="/finetunes/my_mochi_lora/model_2000.lora.safetensors",
cpu_offload=True,
):
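    # Fail fast on invalid frame counts (per the constraint noted above)
    assert (num_frames - 1) % 6 == 0, "num_frames - 1 must be divisible by 6"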
lora = MochiLora(
lora_path=lora_path, # your lora path
cpu_offload=cpu_offload,
)
output_path = lora.generate.remote(
prompt=prompt,
negative_prompt=negative_prompt,
width=width,
height=height,
num_frames=num_frames,
seed=seed,
cfg_scale=cfg_scale,
num_inference_steps=num_inference_steps,
)
local_dir = Path("/tmp/mochi")
local_dir.mkdir(exist_ok=True, parents=True)
local_path = local_dir / output_path
local_path.write_bytes(b"".join(outputs_volume.read_file(output_path)))
print(f"🍑 video saved locally at {local_path}")