第四章 Diffusers实战

python 复制代码
from huggingface_hub import notebook_login

# 登录huggingface
notebook_login()

# 记得设置代理,端口是梯子的端口号
# set HTTPS_PROXY=http://127.0.0.1:自己的
        
# set HTTP_PROXY=http://127.0.0.1:自己的
less 复制代码
VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv...
python 复制代码
import numpy as np
import torch
import torch.nn.functional as F
import torchvision
from matplotlib import pyplot as plt
from PIL import Image

def show_images(x):
    """给定一批图像,创建一个网格并将其转换为PIL"""
    x = x * 0.5 + 0.5
    grid = torchvision.utils.make_grid(x)
    grid_im = grid.detach().cpu().permute(1, 2, 0).clip(0, 1) * 255
    grid_im = Image.fromarray(np.array(grid_im).astype(np.uint8))
    return grid_im

def make_grid(images, size=64):
    """给定一个PIL图像列表,将它们叠加成一行以便查看"""
    output_im = Image.new("RGB", (size * len(images), size))
    for i, im in enumerate(images):
        output_im.paste(im.resize((size, size)), (i * size, 0))
    return output_im

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
python 复制代码
# 加载管线
from diffusers import StableDiffusionPipeline

model_id = "sd-dreambooth-library/mr-potato-head"

pipe = StableDiffusionPipeline.from_pretrained(model_id, torch_dtype=torch.float16).to(device)

num_inference_steps代表采样步骤的数量

guidance_scale决定模型输出与提示词之间的匹配程度

Diffusers核心API:

  • 管线:从高层次设计的多种类函数,便于部署的方式实现,能够快速利用预训练的主流扩散模型来生成样本。
  • 模型:在训练新的扩散模型时需要用到的网络结构。
  • 调度器:在推理过程中使用多种不同的技巧来从噪声中生成图像,同时可以生成训练过程中所需的"带噪"图像。
python 复制代码
from diffusers import DDPMPipeline

# 加载预设好的管线
model = "johnowhitaker/ddpm-butterflies-32px"
butterfly_pipeline = DDPMPipeline.from_pretrained(model).to(device)

# 生成8张图片
images = butterfly_pipeline(batch_size=8).images

make_grid(images)

实战 生成蝴蝶图像

下载蝴蝶图像集

python 复制代码
import torchvision
from datasets import load_dataset
from torchvision import transforms

dataset = load_dataset("huggan/smithsonian_butterflies_subset", split="train")

image_size = 32
batch_size = 64

# 定义数据增强过程
preprocess = transforms.Compose(
[
    transforms.Resize((image_size, image_size)),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize([0.5], [0.5])
])

def transform(examples):
    images = [preprocess(image.convert("RGB")) for image in examples["image"]]
    return {"images": images}

dataset.set_transform(transform)
train_dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True)
python 复制代码
xb = next(iter(train_dataloader))["images"].to(device)[:8]
print("X shape:", xb.shape)
show_images(xb).resize((8 * 64, 64), resample=Image.NEAREST)
css 复制代码
X shape: torch.Size([8, 3, 32, 32])

调度器

python 复制代码
from diffusers import DDPMScheduler

noise_scheduler = DDPMScheduler(num_train_timesteps=1000, beta_start=0.001, beta_end=0.004)
python 复制代码
# 添加不同程度的噪声
timesteps = torch.linspace(0, 999, 8).long().to(device)
noise = torch.rand_like(xb)
noisy_xb = noise_scheduler.add_noise(xb, noise, timesteps)
print("Nosiy X Shape", noisy_xb.shape)
show_images(noisy_xb).resize((8 * 64, 64), resample=Image.NEAREST)
css 复制代码
Nosiy X Shape torch.Size([8, 3, 32, 32])

定义扩散模型

python 复制代码
from diffusers import UNet2DModel

model = UNet2DModel(
    sample_size=image_size,   # 目标图像的分辨率
    in_channels=3,            
    out_channels=3,           
    layers_per_block=2,       # 每一个UNet块中的ResNet层数
    block_out_channels=(64, 128, 128, 256), 
    down_block_types=( 
        "DownBlock2D",        
        "DownBlock2D",
        "AttnDownBlock2D",    # 带有空域维度的self-att的ResNet下采样模块
        "AttnDownBlock2D",
    ), 
    up_block_types=(
        "AttnUpBlock2D", 
        "AttnUpBlock2D",      # 带有空域维度的self-att的ResNet上采样模块
        "UpBlock2D",  
        "UpBlock2D"
      ),
)

model.to(device);

with torch.no_grad():
    model_prediction = model(noisy_xb, timesteps).sample

model_prediction.shape
css 复制代码
torch.Size([8, 3, 32, 32])

训练扩散模型

python 复制代码
# 设定噪声调度器
noise_scheduler = DDPMScheduler(num_train_timesteps=1000, beta_schedule="squaredcos_cap_v2")

# 训练循环
optimizer = torch.optim.AdamW(model.parameters(), lr=4e-4)

losses = []

for epoch in range(30):
    for step, batch in enumerate(train_dataloader):
        clean_images = batch["images"].to(device)
        # 为图片添加采样噪声
        noise = torch.randn(clean_images.shape).to(clean_images.device)
        bs = clean_images.shape[0]
        
        # 为每张图片随机采样一个时间步
        timesteps = torch.randint(0, noise_scheduler.config.num_train_timesteps, (bs, ), 
                                  device=clean_images.device).long()
        
        # 根据每个时间步的噪声幅度,向清晰的图片中添加噪声
        noisy_images = noise_scheduler.add_noise(clean_images, noise, timesteps)
        
        # 获得模型的预测结果
        noise_pred = model(noisy_images, timesteps, return_dict=False)[0]
        
        # 计算损失
        loss = F.mse_loss(noise_pred, noise)
        loss.backward(loss)
        losses.append(loss.item())
        
        # 迭代模型参数
        optimizer.step()
        optimizer.zero_grad()
        
    if (epoch + 1) % 5 == 0:
        loss_last_epoch = sum(losses[-len(train_dataloader):]) / len(train_dataloader)
        print(f"Epoch:{epoch + 1}, loss: {loss_last_epoch}")
makefile 复制代码
Epoch:5, loss: 0.14714546548202634
Epoch:10, loss: 0.11393213225528598
Epoch:15, loss: 0.09713644068688154
Epoch:20, loss: 0.08644208777695894
Epoch:25, loss: 0.07677834457717836
Epoch:30, loss: 0.0714822830632329
python 复制代码
fig, axs = plt.subplots(1, 2, figsize=(12, 4))
axs[0].plot(losses)
axs[1].plot(np.log(losses))
plt.show()

图像的生成

python 复制代码
# 方法1:建立一个管线生成图像
from diffusers import DDPMPipeline

image_pipe = DDPMPipeline(unet=model, scheduler=noise_scheduler)
pipeline_output = image_pipe()
print(pipeline_output.images[0])
pipeline_output.images[0].resize((64, 64), resample=Image.NEAREST)
python 复制代码
# 方法2:采样循环
sample = torch.randn(8, 3, 32, 32).to(device)
show_images(sample).resize((8 * 64, 64), resample=Image.NEAREST)
python 复制代码
# print(noise_scheduler.timesteps) # 0-999
for i, t in enumerate(noise_scheduler.timesteps):
    with torch.no_grad():
        residual = model(sample, t).sample
        
    sample = noise_scheduler.step(residual, t, sample).prev_sample

show_images(sample).resize((8 * 64, 64), resample=Image.NEAREST)
python 复制代码
相关推荐
桃花键神几秒前
AI可信论坛亮点:合合信息分享视觉内容安全技术前沿
人工智能
野蛮的大西瓜21 分钟前
开源呼叫中心中,如何将ASR与IVR菜单结合,实现动态的IVR交互
人工智能·机器人·自动化·音视频·信息与通信
CountingStars6191 小时前
目标检测常用评估指标(metrics)
人工智能·目标检测·目标跟踪
tangjunjun-owen1 小时前
第四节:GLM-4v-9b模型的tokenizer源码解读
人工智能·glm-4v-9b·多模态大模型教程
冰蓝蓝1 小时前
深度学习中的注意力机制:解锁智能模型的新视角
人工智能·深度学习
橙子小哥的代码世界1 小时前
【计算机视觉基础CV-图像分类】01- 从历史源头到深度时代:一文读懂计算机视觉的进化脉络、核心任务与产业蓝图
人工智能·计算机视觉
新加坡内哥谈技术2 小时前
苏黎世联邦理工学院与加州大学伯克利分校推出MaxInfoRL:平衡内在与外在探索的全新强化学习框架
大数据·人工智能·语言模型
fanstuck2 小时前
Prompt提示工程上手指南(七)Prompt编写实战-基于智能客服问答系统下的Prompt编写
人工智能·数据挖掘·openai
lovelin+v175030409662 小时前
安全性升级:API接口在零信任架构下的安全防护策略
大数据·数据库·人工智能·爬虫·数据分析
唐小旭3 小时前
python3.6搭建pytorch环境
人工智能·pytorch·python