
AIGC Notes--Image Inpainting with Stable Diffusion


1--Complete Code

SD_Inpainting

2--Simple Code

import PIL
import torch
import numpy as np
from PIL import Image
from tqdm import tqdm
import torchvision
from diffusers import AutoencoderKL, UNet2DConditionModel, DDIMScheduler
from transformers import CLIPTextModel, CLIPTokenizer

# Preprocess the mask
def preprocess_mask(mask):
    mask = mask.convert("L")  # convert to grayscale: L = R * 299/1000 + G * 587/1000 + B * 114/1000
    w, h = mask.size  # 512, 512
    w, h = map(lambda x: x - x % 32, (w, h))  # round down to an integer multiple of 32
    mask = mask.resize((w // 8, h // 8), resample=PIL.Image.NEAREST)  # 64, 64 (latent resolution)
    mask = np.array(mask).astype(np.float32) / 255.0  # normalize to [0, 1]; shape 64, 64
    mask = np.tile(mask, (4, 1, 1))  # replicate over the 4 latent channels: 4, 64, 64
    mask = mask[None]  # add the batch dim: 1, 4, 64, 64 (the original transpose(0, 1, 2, 3) was a no-op)
    mask = 1 - mask  # repaint white, keep black: the white (masked) region becomes 0
    mask = torch.from_numpy(mask)
    return mask

# Preprocess the image
def preprocess(image):
    w, h = image.size
    w, h = map(lambda x: x - x % 32, (w, h))  # round down to an integer multiple of 32
    image = image.resize((w, h), resample=PIL.Image.LANCZOS)
    image = np.array(image).astype(np.float32) / 255.0
    image = image[None].transpose(0, 3, 1, 2)  # HWC -> NCHW
    image = torch.from_numpy(image)
    return 2.0 * image - 1.0  # rescale from [0, 1] to [-1, 1]

if __name__ == "__main__":
    model_id = "runwayml/stable-diffusion-v1-5"  # online download
    # model_id = "/mnt/dolphinfs/hdd_pool/docker/user/hadoop-waimai-aigc/liujinfu/All_test/test0714/huggingface.co/runwayml/stable-diffusion-v1-5"  # local path

    # Load the input image and the input mask
    input_image = Image.open("./images/overture-creations-5sI6fQgYIuo.png").resize((512, 512))
    input_mask = Image.open("./images/overture-creations-5sI6fQgYIuo_mask.png").resize((512, 512))

    # 1. Load the autoencoder
    vae = AutoencoderKL.from_pretrained(model_id, subfolder="vae")

    # 2. Load the tokenizer and the text encoder
    tokenizer = CLIPTokenizer.from_pretrained(model_id, subfolder="tokenizer")
    text_encoder = CLIPTextModel.from_pretrained(model_id, subfolder="text_encoder")

    # 3. Load the UNet diffusion model
    unet = UNet2DConditionModel.from_pretrained(model_id, subfolder="unet")

    # 4. Define the noise scheduler
    noise_scheduler = DDIMScheduler(
        num_train_timesteps=1000,
        beta_start=0.00085,
        beta_end=0.012,
        beta_schedule="scaled_linear",
        clip_sample=False,  # don't clip the sample; x0 in Stable Diffusion is not in the range [-1, 1]
        set_alpha_to_one=False,
    )

    # Move the models to the GPU
    device = "cuda"
    vae.to(device, dtype=torch.float16)
    text_encoder.to(device, dtype=torch.float16)
    unet = unet.to(device, dtype=torch.float16)

    # Set the prompt and the hyperparameters
    prompt = "a mecha robot sitting on a bench"
    negative_prompt = ""
    strength = 0.75
    guidance_scale = 7.5
    batch_size = 1
    num_inference_steps = 50
    generator = torch.Generator(device).manual_seed(0)

    with torch.no_grad():
        # Get the prompt text embeddings
        text_input = tokenizer(prompt, padding="max_length",
                               max_length=tokenizer.model_max_length,
                               truncation=True,
                               return_tensors="pt")
        text_embeddings = text_encoder(text_input.input_ids.to(device))[0]

        # Get the unconditional text embeddings
        max_length = text_input.input_ids.shape[-1]
        uncond_input = tokenizer(
            [negative_prompt] * batch_size, padding="max_length", max_length=max_length, return_tensors="pt"
        )
        uncond_embeddings = text_encoder(uncond_input.input_ids.to(device))[0]

        # Concatenate into a single batch
        text_embeddings = torch.cat([uncond_embeddings, text_embeddings])

        # Set the number of sampling steps
        noise_scheduler.set_timesteps(num_inference_steps, device=device)

        # Compute the timesteps from strength
        init_timestep = min(int(num_inference_steps * strength), num_inference_steps)
        t_start = max(num_inference_steps - init_timestep, 0)
        timesteps = noise_scheduler.timesteps[t_start:]

        # Preprocess init_image and encode it into latents
        init_input = preprocess(input_image)
        init_latents = vae.encode(init_input.to(device, dtype=torch.float16)).latent_dist.sample(generator)
        init_latents = 0.18215 * init_latents
        init_latents = torch.cat([init_latents] * batch_size, dim=0)
        init_latents_orig = init_latents

        # Preprocess the mask
        mask_image = preprocess_mask(input_mask)
        mask_image = mask_image.to(device=device, dtype=init_latents.dtype)
        mask = torch.cat([mask_image] * batch_size)

        # Add noise to init_latents
        noise = torch.randn(init_latents.shape, generator=generator, device=device, dtype=init_latents.dtype)
        init_latents = noise_scheduler.add_noise(init_latents, noise, timesteps[:1])
        latents = init_latents  # used as the initial latents

        # Run the denoising steps
        for t in tqdm(timesteps):
            # Duplicate the latents so the unconditional prediction is computed in the same forward pass
            latent_model_input = torch.cat([latents] * 2)
            latent_model_input = noise_scheduler.scale_model_input(latent_model_input, t)  # for DDIM, does nothing

            # Predict the noise
            noise_pred = unet(latent_model_input, t, encoder_hidden_states=text_embeddings).sample

            # Classifier-free guidance
            noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
            noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)

            # x_t -> x_t-1
            latents = noise_scheduler.step(noise_pred, t, latents).prev_sample

            # Replace the unmasked region with the noisy latents of the original image.
            init_latents_proper = noise_scheduler.add_noise(init_latents_orig, noise, torch.tensor([t]))
            # The masked region is 0 in the mask tensor, so
            # init_latents_proper * mask keeps the original latents (unmasked region),
            # while latents * (1 - mask) fills the masked region with the generated latents.
            latents = (init_latents_proper * mask) + (latents * (1 - mask))

        # Remember to rescale the latents before decoding
        latents = 1 / 0.18215 * latents
        image = vae.decode(latents).sample

        # Convert to Pillow and save
        img = (image / 2 + 0.5).clamp(0, 1).detach().cpu()
        img = torchvision.transforms.ToPILImage()(img.squeeze())
        img.save("./outputs/output.png")
        print("All Done!")
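The core of this hand-rolled loop is the per-step blend latents = (init_latents_proper * mask) + (latents * (1 - mask)). A minimal sketch on random stand-in tensors (the shapes match the script, the values do not) confirms what it guarantees: wherever the mask is 1 the output equals the re-noised original latents, so only the mask == 0 region is ever repainted.

import torch

# Stand-in tensors, not real latents; mask: 1 = keep original, 0 = repaint.
mask = (torch.rand(1, 4, 64, 64) > 0.5).float()
init_latents_proper = torch.randn(1, 4, 64, 64)  # original latents, re-noised to step t
latents = torch.randn(1, 4, 64, 64)              # current denoised sample
blended = init_latents_proper * mask + latents * (1 - mask)
# The kept region is untouched by the denoiser:
assert torch.equal(blended[mask.bool()], init_latents_proper[mask.bool()])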

Result:
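One detail of the script above that is easy to misread: strength selects the starting timestep, so it fixes both how many of the scheduler's steps are actually executed and how strongly the original latents are noised. A stand-alone check with the script's hyperparameters:

# Stand-alone check of the strength-to-timesteps arithmetic used in the script.
num_inference_steps = 50
strength = 0.75

init_timestep = min(int(num_inference_steps * strength), num_inference_steps)
t_start = max(num_inference_steps - init_timestep, 0)
print(init_timestep, t_start)  # 37 13 -> only the last 37 of the 50 steps are run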

3--Calling via the Diffusers Pipeline

import torch
from PIL import Image
from diffusers import StableDiffusionInpaintPipelineLegacy

if __name__ == "__main__":
    # Load the inpainting pipeline
    model_id = "runwayml/stable-diffusion-v1-5"
    # model_id = "/mnt/dolphinfs/hdd_pool/docker/user/hadoop-waimai-aigc/liujinfu/All_test/test0714/huggingface.co/runwayml/stable-diffusion-v1-5"  # local path
    pipe = StableDiffusionInpaintPipelineLegacy.from_pretrained(model_id, torch_dtype=torch.float16).to("cuda")

    # Load the input image and the input mask
    input_image = Image.open("./images/overture-creations-5sI6fQgYIuo.png").resize((512, 512))
    input_mask = Image.open("./images/overture-creations-5sI6fQgYIuo_mask.png").resize((512, 512))

    # Run inference
    prompt = ["a mecha robot sitting on a bench", "a cat sitting on a bench"]
    generator = torch.Generator("cuda").manual_seed(0)
    with torch.autocast("cuda"):
        images = pipe(
            prompt=prompt,
            image=input_image,
            mask_image=input_mask,
            num_inference_steps=50,
            strength=0.75,
            guidance_scale=7.5,
            num_images_per_prompt=1,
            generator=generator,
        ).images

    # Save the results
    for idx, image in enumerate(images):
        image.save("./outputs/output_{:d}.png".format(idx))
    print("All Done!")
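Note: StableDiffusionInpaintPipelineLegacy reproduces the masked-blending procedure hand-coded in Section 2 on top of the plain text-to-image checkpoint. Recent diffusers releases have deprecated it (and, depending on your installed version, removed it). If it is missing from your install, a minimal sketch of the closest replacement, assuming the dedicated runwayml/stable-diffusion-inpainting checkpoint, whose UNet was fine-tuned specifically for inpainting:

import torch
from diffusers import StableDiffusionInpaintPipeline

# A minimal sketch, assuming a diffusers version without the legacy pipeline.
pipe = StableDiffusionInpaintPipeline.from_pretrained(
    "runwayml/stable-diffusion-inpainting",  # inpainting-specific checkpoint
    torch_dtype=torch.float16,
).to("cuda")
# The call mirrors the legacy pipeline (prompt, image, mask_image, ...), but here
# white mask pixels are repainted by a UNet trained for the inpainting task.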

Result:
