Usage of InstantID-XS:
1. Download the models.
# InstantID-XS
huggingface-cli download --resume-download RED-AIGC/InstantID-XS --local-dir ./checkpoints/RED-AIGC/InstantID-XS
# VAE: madebyollin/sdxl-vae-fp16-fix
huggingface-cli download --resume-download madebyollin/sdxl-vae-fp16-fix --local-dir ./checkpoints/madebyollin/sdxl-vae-fp16-fix
# base model: RealVisXL V4.0
huggingface-cli download --resume-download frankjoshua/realvisxlV40_v40Bakedvae --local-dir ./checkpoints/frankjoshua/realvisxlV40_v40Bakedvae
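If you prefer scripting the downloads, huggingface_hub's snapshot_download can fetch the same repos (a minimal sketch; the target directories mirror the checkpoint paths used in the code below):
from huggingface_hub import snapshot_download

# Mirror the ./checkpoints/<org>/<repo> layout referenced by the pipeline code.
for repo in ["RED-AIGC/InstantID-XS",
             "madebyollin/sdxl-vae-fp16-fix",
             "frankjoshua/realvisxlV40_v40Bakedvae"]:
    snapshot_download(repo_id=repo, local_dir=f"./checkpoints/{repo}")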
2. Get the pipeline.
Note: in ControlNetXS, the control branch receives the same encoder_hidden_states input as the UNet by default, namely the prompt embeddings. We decouple the two inputs: encoder_hidden_states for the UNet remains the prompt embeddings, while encoder_hidden_states for the control branch becomes the face embeddings.
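To make the decoupling concrete, here is a toy sketch with stand-in cross-attention branches (illustrative names only, not the real UNetControlNetXSModel internals):
import torch

# Toy stand-ins for the base and control branches (not the real modules).
class Branch(torch.nn.Module):
    def __init__(self, dim):
        super().__init__()
        self.attn = torch.nn.MultiheadAttention(dim, 4, batch_first=True)
    def forward(self, x, encoder_hidden_states):
        out, _ = self.attn(x, encoder_hidden_states, encoder_hidden_states)
        return out

base, ctrl = Branch(64), Branch(64)
latents = torch.randn(1, 16, 64)
prompt_embeds = torch.randn(1, 77, 64)  # text conditioning
face_embeds = torch.randn(1, 1, 64)     # identity conditioning

# Stock ControlNetXS would feed prompt_embeds to both branches; InstantID-XS decouples them:
base_h = base(latents, encoder_hidden_states=prompt_embeds)
ctrl_h = ctrl(latents, encoder_hidden_states=face_embeds)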
from diffusers import AutoencoderKL, UNet2DConditionModel, UniPCMultistepScheduler
from controlnet_xs import ControlNetXSAdapter, UNetControlNetXSModel
# The pipeline class ships with this repo; adjust the module name to match your
# local file (assumed here to be pipeline_stable_diffusion_xl_instantid_xs.py).
from pipeline_stable_diffusion_xl_instantid_xs import StableDiffusionXLInstantIDXSPipeline
import os
import torch

device = 'cuda'                # assumed inference device
weight_dtype = torch.float16   # assumed inference dtype

base_model = './checkpoints/frankjoshua/realvisxlV40_v40Bakedvae'
vae_path = './checkpoints/madebyollin/sdxl-vae-fp16-fix'
ckpt = './checkpoints/RED-AIGC/InstantID-XS'
image_proj_path = os.path.join(ckpt, "image_proj.bin")
cnxs_path = os.path.join(ckpt, "controlnetxs.bin")
cross_attn_path = os.path.join(ckpt, "cross_attn.bin")
# Get ControlNetXS (size_ratio=0.125 makes the control branch 1/8 the width of the base UNet):
unet = UNet2DConditionModel.from_pretrained(base_model, subfolder="unet").to(device, dtype=weight_dtype)
controlnet = ControlNetXSAdapter.from_unet(unet, size_ratio=0.125, learn_time_embedding=True)

# The checkpoint stores the control branch under the fused model's 'ctrl_'-prefixed
# keys; remap them to the adapter's naming before loading.
state_dict = torch.load(cnxs_path, map_location="cpu", weights_only=True)
ctrl_state_dict = {}
for key, value in state_dict.items():
    if 'ctrl_' in key and 'ctrl_to_base' not in key:
        key = key.replace('ctrl_', '')
    if 'up_blocks' in key:
        key = key.replace('up_blocks', 'up_connections')
    ctrl_state_dict[key] = value
controlnet.load_state_dict(ctrl_state_dict, strict=True)
controlnet.to(device, dtype=weight_dtype)

# Fuse the base UNet and the control branch into a single model:
ControlNetXS = UNetControlNetXSModel.from_unet(unet, controlnet).to(device, dtype=weight_dtype)
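# Optional sanity check (illustrative, not required): with size_ratio=0.125 the
# control branch should hold only a small fraction of the base UNet's parameters.
n_base = sum(p.numel() for p in unet.parameters())
n_ctrl = sum(p.numel() for p in controlnet.parameters())
print(f'ctrl/base parameter ratio: {n_ctrl / n_base:.3f}')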
# Get pipeline (the fused model replaces the stock UNet):
vae = AutoencoderKL.from_pretrained(vae_path)
pipe = StableDiffusionXLInstantIDXSPipeline.from_pretrained(
    base_model,
    vae=vae,
    unet=ControlNetXS,
    controlnet=None,
    torch_dtype=weight_dtype,
)
pipe.cuda(device=device, dtype=weight_dtype, use_xformers=True)
pipe.load_ip_adapter(image_proj_path, cross_attn_path)  # image projector + decoupled cross-attention weights
pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)
pipe.unet.config.ctrl_learn_time_embedding = True
pipe = pipe.to(device)
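If VRAM is tight, diffusers' standard CPU offload should also work here, assuming StableDiffusionXLInstantIDXSPipeline inherits from DiffusionPipeline (an untested assumption; use it instead of pipe.cuda(...) / pipe.to(device)):
# Assumption: the custom pipeline inherits DiffusionPipeline, so the standard
# offload helper applies; modules are then moved to the GPU on demand.
pipe.enable_model_cpu_offload()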
3. Infer.
import cv2
import numpy as np
import os
import torch
from PIL import Image
from insightface.app import FaceAnalysis

# antelopev2 provides face detection plus the identity embedding used as the condition.
app = FaceAnalysis(name='antelopev2', root='./', providers=['CUDAExecutionProvider', 'CPUExecutionProvider'])
app.prepare(ctx_id=0, det_size=(640, 640))
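# resize_img and draw_kps_pil are used below but not defined in this snippet.
# Minimal sketches follow, modeled on InstantID's reference utilities; the exact
# implementations in this repo may differ.
import math

def resize_img(image, max_side=1280, min_side=1024, base=64):
    # Scale so the short side is ~min_side (capped by max_side), then snap
    # both sides to multiples of `base`, as SDXL expects.
    w, h = image.size
    ratio = min_side / min(w, h)
    if max(w, h) * ratio > max_side:
        ratio = max_side / max(w, h)
    w, h = round(w * ratio), round(h * ratio)
    return image.resize((w - w % base, h - h % base), Image.BILINEAR)

def draw_kps_pil(image, kps, color_list=[(255, 0, 0), (0, 255, 0), (0, 0, 255), (255, 255, 0), (255, 0, 255)]):
    # Render the 5 facial keypoints as the ControlNet conditioning image:
    # a limb from each outer point to the nose, plus a disc per keypoint.
    stickwidth = 4
    limb_seq = np.array([[0, 2], [1, 2], [3, 2], [4, 2]])
    kps = np.array(kps)
    w, h = image.size
    out = np.zeros([h, w, 3])
    for i in range(len(limb_seq)):
        index = limb_seq[i]
        color = color_list[index[0]]
        x = kps[index][:, 0]
        y = kps[index][:, 1]
        length = ((x[0] - x[1]) ** 2 + (y[0] - y[1]) ** 2) ** 0.5
        angle = math.degrees(math.atan2(y[0] - y[1], x[0] - x[1]))
        polygon = cv2.ellipse2Poly((int(np.mean(x)), int(np.mean(y))), (int(length / 2), stickwidth), int(angle), 0, 360, 1)
        out = cv2.fillConvexPoly(out.copy(), polygon, color)
    out = (out * 0.6).astype(np.uint8)
    for idx, kp in enumerate(kps):
        out = cv2.circle(out.copy(), (int(kp[0]), int(kp[1])), 10, color_list[idx], -1)
    return Image.fromarray(out)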
img_path = './image.jpg'
image = cv2.imread(img_path)
image = Image.fromarray(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
image = resize_img(image)

# Keep the largest detected face (by bounding-box area).
face_infos = app.get(cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR))
face_info = sorted(face_infos, key=lambda x: (x['bbox'][2] - x['bbox'][0]) * (x['bbox'][3] - x['bbox'][1]))[-1]
face_emb = torch.from_numpy(face_info.normed_embedding)  # identity embedding
face_kps = draw_kps_pil(image, face_info['kps'])         # 5-point conditioning image
prompt = 'a woman, (looking at the viewer), portrait, daily wear, 8K texture, realistic, symmetrical hyperdetailed texture, masterpiece, enhanced details, (eye highlight:2), perfect composition, natural lighting, best quality, authentic, natural posture'
n_prompt = '(worst quality:2), (low quality:2), (normal quality:2), lowres, bad anatomy, bad hands, normal quality, long neck, hunchback, narrow shoulder, wall, (blurry), vague, indistinct, (shiny face:2), (buffing:2), (face highlight:2), pale skin'
seed = 0
image = pipe(
    prompt=prompt,
    negative_prompt=n_prompt,
    image=face_kps,
    face_emb=face_emb,
    num_images_per_prompt=1,
    num_inference_steps=20,
    generator=torch.Generator(device=device).manual_seed(seed),
    ip_adapter_scale=0.8,
    guidance_scale=4.0,
    controlnet_conditioning_scale=0.8,
).images[0]
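The pipeline returns PIL images; save the result however you like (the output path below is arbitrary):
image.save('./result.jpg')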