TemporalNet2 initial changes

Browse files

Files changed (7) hide show

README.md +36 -0
cldm_v15.yaml +79 -0
config.json +42 -0
crop_all_images.py +33 -0
temporalnetversion2.ckpt +3 -0
temporalnetversion2.yaml +79 -0
temporalvideo.py +213 -0

README.md CHANGED Viewed

@@ -1,3 +1,39 @@
 ---
 license: openrail
 ---

 ---
 license: openrail
+tags:
+- controlnet
+- stable-diffusion
+- diffusers
+base_model: runwayml/stable-diffusion-v1-5
 ---
+Introducing the Beta Version of TemporalNet
+TemporalNet is a ControlNet model designed to enhance the temporal consistency of generated outputs, as demonstrated in this example: https://twitter.com/CiaraRowles1/status/1637486561917906944. While it does not eliminate all flickering, it significantly reduces it, particularly at higher denoise levels. For optimal results, it is recommended to use TemporalNet in combination with other methods.
+Instructions for Use:
+1) Add the model "diff_control_sd15_temporalnet_fp16.safetensors" to your models folder in the ControlNet extension in Automatic1111's Web UI.
+2) Create a folder that contains:
+- A subfolder named "Input_Images" with the input frames
+- A PNG file called "init.png" that is pre-stylized in your desired style
+- The "temporalvideo.py" script
+3) Customize the "temporalvideo.py" script according to your preferences, such as the image resolution, prompt, and ControlNet settings.
+4) Launch Automatic1111's Web UI with the --api setting enabled.
+5) Execute the Python script.
+*Please note that the "init.png" image will not significantly influence the style of the output video. Its primary purpose is to prevent a drastic change in aesthetics during the first few frames.*
+Also, I highly recommend using this in conjunction with the HED model; the settings are already in the script.
+ToDo:
+Write an extension for the Web UI.
+Write a feature that automatically generates an "init.png" image if none is provided.
+ ̶C̶h̶a̶n̶g̶e̶ ̶t̶h̶e̶ ̶e̶x̶t̶e̶n̶s̶i̶o̶n̶ ̶t̶o̶ ̶.̶s̶a̶f̶e̶t̶e̶n̶s̶o̶r̶s̶ ̶a̶n̶d̶ ̶i̶n̶v̶e̶s̶t̶i̶g̶a̶t̶e̶ ̶c̶o̶m̶p̶r̶e̶s̶s̶i̶o̶n̶.̶

cldm_v15.yaml ADDED Viewed

	@@ -0,0 +1,79 @@

+model:
+  target: cldm.cldm.ControlLDM
+  params:
+    linear_start: 0.00085
+    linear_end: 0.0120
+    num_timesteps_cond: 1
+    log_every_t: 200
+    timesteps: 1000
+    first_stage_key: "jpg"
+    cond_stage_key: "txt"
+    control_key: "hint"
+    image_size: 64
+    channels: 4
+    cond_stage_trainable: false
+    conditioning_key: crossattn
+    monitor: val/loss_simple_ema
+    scale_factor: 0.18215
+    use_ema: False
+    only_mid_control: False
+    control_stage_config:
+      target: cldm.cldm.ControlNet
+      params:
+        image_size: 32 # unused
+        in_channels: 4
+        hint_channels: 3
+        model_channels: 320
+        attention_resolutions: [ 4, 2, 1 ]
+        num_res_blocks: 2
+        channel_mult: [ 1, 2, 4, 4 ]
+        num_heads: 8
+        use_spatial_transformer: True
+        transformer_depth: 1
+        context_dim: 768
+        use_checkpoint: True
+        legacy: False
+    unet_config:
+      target: cldm.cldm.ControlledUnetModel
+      params:
+        image_size: 32 # unused
+        in_channels: 4
+        out_channels: 4
+        model_channels: 320
+        attention_resolutions: [ 4, 2, 1 ]
+        num_res_blocks: 2
+        channel_mult: [ 1, 2, 4, 4 ]
+        num_heads: 8
+        use_spatial_transformer: True
+        transformer_depth: 1
+        context_dim: 768
+        use_checkpoint: True
+        legacy: False
+    first_stage_config:
+      target: ldm.models.autoencoder.AutoencoderKL
+      params:
+        embed_dim: 4
+        monitor: val/rec_loss
+        ddconfig:
+          double_z: true
+          z_channels: 4
+          resolution: 256
+          in_channels: 3
+          out_ch: 3
+          ch: 128
+          ch_mult:
+          - 1
+          - 2
+          - 4
+          - 4
+          num_res_blocks: 2
+          attn_resolutions: []
+          dropout: 0.0
+        lossconfig:
+          target: torch.nn.Identity
+    cond_stage_config:
+      target: ldm.modules.encoders.modules.FrozenCLIPEmbedder

config.json ADDED Viewed

	@@ -0,0 +1,42 @@

+{
+  "_class_name": "ControlNetModel",
+  "_diffusers_version": "0.15.0.dev0",
+  "_name_or_path": "./",
+  "act_fn": "silu",
+  "attention_head_dim": 8,
+  "block_out_channels": [
+    320,
+    640,
+    1280,
+    1280
+  ],
+  "class_embed_type": null,
+  "conditioning_embedding_out_channels": [
+    16,
+    32,
+    96,
+    256
+  ],
+  "controlnet_conditioning_channel_order": "rgb",
+  "cross_attention_dim": 768,
+  "down_block_types": [
+    "CrossAttnDownBlock2D",
+    "CrossAttnDownBlock2D",
+    "CrossAttnDownBlock2D",
+    "DownBlock2D"
+  ],
+  "downsample_padding": 1,
+  "flip_sin_to_cos": true,
+  "freq_shift": 0,
+  "in_channels": 4,
+  "layers_per_block": 2,
+  "mid_block_scale_factor": 1,
+  "norm_eps": 1e-05,
+  "norm_num_groups": 32,
+  "num_class_embeds": null,
+  "only_cross_attention": false,
+  "projection_class_embeddings_input_dim": null,
+  "resnet_time_scale_shift": "default",
+  "upcast_attention": false,
+  "use_linear_projection": false
+}

crop_all_images.py ADDED Viewed

	@@ -0,0 +1,33 @@

+import cv2
+import os
+y_folder = "./Input_Images"
+def crop_and_save_images(folder_path):
+    # Get a list of all files in the folder
+    files = os.listdir(folder_path)
+    for file in files:
+        # Construct the full file path
+        file_path = os.path.join(folder_path, file)
+        # Load the image
+        img = cv2.imread(file_path)
+        # Get the dimensions of the image
+        h, w = img.shape[:2]
+        # Determine the size of the crop
+        crop_size = min(h, w)
+        # Calculate the start coordinates of the crop
+        start_y = (h - crop_size) // 2
+        start_x = (w - crop_size) // 2
+        # Perform the crop
+        img_cropped = img[start_y : start_y + crop_size, start_x : start_x + crop_size]
+        # Save the cropped image, overwriting the original image
+        cv2.imwrite(file_path, img_cropped)
+# Example usage:
+crop_and_save_images(y_folder)

temporalnetversion2.ckpt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:51eea78ec529e554291e5c6f66f5a37c27ae8565594b22da297e28eb0f47ab27
+size 12688113093

temporalnetversion2.yaml ADDED Viewed

	@@ -0,0 +1,79 @@

+model:
+  target: cldm.cldm.ControlLDM
+  params:
+    linear_start: 0.00085
+    linear_end: 0.0120
+    num_timesteps_cond: 1
+    log_every_t: 200
+    timesteps: 1000
+    first_stage_key: "jpg"
+    cond_stage_key: "txt"
+    control_key: "hint"
+    image_size: 64
+    channels: 4
+    cond_stage_trainable: false
+    conditioning_key: crossattn
+    monitor: val/loss_simple_ema
+    scale_factor: 0.18215
+    use_ema: False
+    only_mid_control: False
+    # ControlNet branch. Apart from hint_channels, these parameters are the
+    # same as the control_stage_config in cldm_v15.yaml.
+    control_stage_config:
+      target: cldm.cldm.ControlNet
+      params:
+        image_size: 32 # unused
+        in_channels: 4
+        # 6 here vs. 3 in cldm_v15.yaml. Presumably two stacked RGB hints:
+        # temporalvideo.py sends the previous frame stacked with the
+        # optical-flow image to this model - TODO confirm.
+        hint_channels: 6
+        model_channels: 320
+        attention_resolutions: [ 4, 2, 1 ]
+        num_res_blocks: 2
+        channel_mult: [ 1, 2, 4, 4 ]
+        num_heads: 8
+        use_spatial_transformer: True
+        transformer_depth: 1
+        context_dim: 768
+        use_checkpoint: True
+        legacy: False
+    unet_config:
+      target: cldm.cldm.ControlledUnetModel
+      params:
+        image_size: 32 # unused
+        in_channels: 4
+        out_channels: 4
+        model_channels: 320
+        attention_resolutions: [ 4, 2, 1 ]
+        num_res_blocks: 2
+        channel_mult: [ 1, 2, 4, 4 ]
+        num_heads: 8
+        use_spatial_transformer: True
+        transformer_depth: 1
+        context_dim: 768
+        use_checkpoint: True
+        legacy: False
+    first_stage_config:
+      target: ldm.models.autoencoder.AutoencoderKL
+      params:
+        embed_dim: 4
+        monitor: val/rec_loss
+        ddconfig:
+          double_z: true
+          z_channels: 4
+          resolution: 256
+          in_channels: 3
+          out_ch: 3
+          ch: 128
+          ch_mult:
+          - 1
+          - 2
+          - 4
+          - 4
+          num_res_blocks: 2
+          attn_resolutions: []
+          dropout: 0.0
+        lossconfig:
+          target: torch.nn.Identity
+    cond_stage_config:
+      target: ldm.modules.encoders.modules.FrozenCLIPEmbedder

temporalvideo.py ADDED Viewed

	@@ -0,0 +1,213 @@

+import os
+import glob
+import requests
+import json
+import cv2
+import numpy as np
+import sys
+import torch
+from PIL import Image
+from pprint import pprint
+import base64
+from io import BytesIO
+import torchvision.transforms.functional as F
+from torchvision.io import read_video, read_image, ImageReadMode
+from torchvision.models.optical_flow import Raft_Large_Weights
+from torchvision.models.optical_flow import raft_large
+from torchvision.io import read_video, read_image, ImageReadMode
+from torchvision.utils import flow_to_image
+import cv2
+from torchvision.io import write_jpeg
+import pickle
+# Use the GPU when CUDA is available, otherwise fall back to the CPU.
+device = "cuda" if torch.cuda.is_available() else "cpu"
+# RAFT (large) optical-flow network with torchvision's default weights, moved
+# to the selected device. infer() calls this module-level model.
+model = raft_large(weights=Raft_Large_Weights.DEFAULT, progress=False).to(device)
+# Put the network in evaluation mode.
+model = model.eval()
+# Replace with the actual path to your image file and folder
+# x_path is used as the "previous image" for the first generated frame.
+x_path = "./init.png"
+# Folder holding the input frames.
+y_folder = "./Input_Images"
+# Flow images and generated frames are written here.
+output_folder = "output"
+# Create the output folder up front; no error if it already exists.
+os.makedirs(output_folder, exist_ok=True)
+def get_image_paths(folder):
+    image_extensions = ("*.jpg", "*.jpeg", "*.png", "*.bmp")
+    files = []
+    for ext in image_extensions:
+        files.extend(glob.glob(os.path.join(folder, ext)))
+    return sorted(files)
+# Sorted input frame paths; the driver loop at the bottom walks them in order.
+y_paths = get_image_paths(y_folder)
+def send_request(last_image_path, optical_flow_path,current_image_path):
+    url = "http://localhost:7860/sdapi/v1/img2img"
+    with open(last_image_path, "rb") as b:
+       last_image_encoded = base64.b64encode(b.read()).decode("utf-8")
+    # Load and process the last image
+    last_image = cv2.imread(last_image_path)
+    last_image = cv2.cvtColor(last_image, cv2.COLOR_BGR2RGB)
+    last_image = cv2.resize(last_image, (512, 512))
+    # Load and process the optical flow image
+    flow_image = cv2.imread(optical_flow_path)
+    flow_image = cv2.cvtColor(flow_image, cv2.COLOR_BGR2RGB)
+    # Load and process the current image
+    with open(current_image_path, "rb") as b:
+       current_image = base64.b64encode(b.read()).decode("utf-8")
+    # Concatenating the three images to make a 6-channel image
+    six_channel_image = np.dstack((last_image, flow_image))
+    # Serializing the 6-channel image
+    serialized_image = pickle.dumps(six_channel_image)
+    # Encoding the serialized image
+    encoded_image = base64.b64encode(serialized_image).decode('utf-8')
+    data = {
+        "init_images": [current_image],
+        "inpainting_fill": 0,
+        "inpaint_full_res": True,
+        "inpaint_full_res_padding": 1,
+        "inpainting_mask_invert": 1,
+        "resize_mode": 0,
+        "denoising_strength": 0.4,
+        "prompt": "1girl, woman",
+        "negative_prompt": "",
+        "alwayson_scripts": {
+            "ControlNet":{
+                "args": [
+                    {
+                        "input_image": current_image,
+                        "module": "hed",
+                        "model": "control_hed-fp16 [13fee50b]",
+                        "weight": 0.7,
+                        "guidance": 1,
+                   },
+                    {
+                        "input_image": encoded_image,
+                        "model": "temporalnetversion2 [b146ac48]",
+                        "module": "none",
+                        "weight": 0.6,
+                        "guidance": 1,
+                    },
+                    {
+                        "input_image": current_image,
+                        "model": "control_v11p_sd15_openpose [cab727d4]",
+                        "module": "openpose_full",
+                        "weight": 0.7,
+                        "guidance":1,
+                    }
+                ]
+            }
+        },
+        "seed": 4123457655,
+        "subseed": -1,
+        "subseed_strength": -1,
+        "sampler_index": "Euler a",
+        "batch_size": 1,
+        "n_iter": 1,
+        "steps": 20,
+        "cfg_scale": 6,
+        "width": 512,
+        "height": 512,
+        "restore_faces": True,
+        "include_init_images": True,
+        "override_settings": {},
+        "override_settings_restore_afterwards": True
+    }
+    response = requests.post(url, json=data)
+    if response.status_code == 200:
+        return response.content
+    else:
+        try:
+            error_data = response.json()
+            print("Error:")
+            print(str(error_data))
+        except json.JSONDecodeError:
+            print(f"Error: Unable to parse JSON error data.")
+        return None
+def infer(frameA, frameB):
+    input_frame_1 = read_image(str(frameA), ImageReadMode.RGB)
+    input_frame_2 = read_image(str(frameB), ImageReadMode.RGB)
+    #img1_batch = torch.stack([frames[0]])
+    #img2_batch = torch.stack([frames[1]])
+    img1_batch = torch.stack([input_frame_1])
+    img2_batch = torch.stack([input_frame_2])
+    weights = Raft_Large_Weights.DEFAULT
+    transforms = weights.transforms()
+    def preprocess(img1_batch, img2_batch):
+        img1_batch = F.resize(img1_batch, size=[512, 512])
+        img2_batch = F.resize(img2_batch, size=[512, 512])
+        return transforms(img1_batch, img2_batch)
+    img1_batch, img2_batch = preprocess(img1_batch, img2_batch)
+    list_of_flows = model(img1_batch.to(device), img2_batch.to(device))
+    predicted_flows = list_of_flows[-1]
+    #flow_imgs = flow_to_image(predicted_flows)
+    #print(flow_imgs)
+    predicted_flow = list_of_flows[-1][0]
+    opitcal_flow_path = os.path.join(output_folder, f"flow_{i}.png")
+    flow_img = flow_to_image(predicted_flow).to("cpu")
+    write_jpeg(flow_img,opitcal_flow_path)
+    return opitcal_flow_path
+output_images = []
+output_paths = []
+# Initialize with the first image path
+result = x_path
+output_image_path = os.path.join(output_folder, f"output_image_0.png")
+#with open(output_image_path, "wb") as f:
+   # f.write(result)
+last_image_path = x_path
+for i in range(1, len(y_paths)):
+    # Use the last image path and optical flow map to generate the next input
+    optical_flow = infer(y_paths[i - 1], y_paths[i])
+    # Modify your send_request to use the last_image_path
+    result = send_request(last_image_path, optical_flow, y_paths[i])
+    data = json.loads(result)
+    encoded_image = data["images"][0]
+    output_image_path = os.path.join(output_folder, f"output_image_{i}.png")
+    last_image_path = output_image_path
+    with open(output_image_path, "wb") as f:
+       f.write(base64.b64decode(encoded_image))
+    print(f"Written data for frame {i}:")