伏羲0.06(文生图)

主要改进点:

数据准备:

数据清洗:增加了 clean_data 函数,用于去除空值和重复值。

数据增强:增加了 augment_data 函数,用于在训练时进行数据增强。

模型选择:

生成对抗网络 (GAN):增加了 Discriminator 类,用于判别生成的图像是否真实。

损失函数:增加了 GAN 损失(二元交叉熵)和 L1 损失;判别器只使用 GAN 损失,生成器同时使用 GAN 损失和 L1 损失。

模型架构设计:

文本编码器:使用预训练的 Transformer 模型(如 BERT)来编码文本描述。

图像生成器:增加了更多的卷积转置层,并使用了批量归一化和激活函数。

多模态融合:当前实现中,文本编码器输出的文本特征直接作为图像生成器的输入,以使生成的图像与文本描述相关;代码中没有单独的融合模块。

训练过程:

损失函数:判别器使用 GAN 损失(真实图像与生成图像两部分损失取平均);生成器使用 GAN 损失与 L1 损失的加权和(L1 损失的权重为 100)。

优化算法:使用 Adam 优化器。

训练策略:使用批量归一化来稳定训练过程;梯度裁剪在下面的示例代码中尚未实现,可按需添加(例如 torch.nn.utils.clip_grad_norm_)。

正则化:防止过拟合,可以使用 L1/L2 正则化、Dropout 等技术。

Python 代码:
import tkinter as tk
from tkinter import filedialog, messagebox
from PIL import Image, ImageTk
import torch
import torch.optim as optim
import torch.nn as nn
import torchvision.transforms as transforms
import yaml
import os
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel
import random
import numpy as np

# 配置文件加载
def load_config(config_path):
    """Read a YAML configuration file and return its parsed content.

    Args:
        config_path: Path of the YAML file; it is opened as UTF-8 text.

    Returns:
        Whatever ``yaml.safe_load`` produces for the file content.
    """
    with open(config_path, mode='r', encoding='utf-8') as stream:
        return yaml.safe_load(stream)

# 数据加载
def load_text_data(file_path):
    """Return every line of a UTF-8 text file with surrounding whitespace removed.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A list with one stripped string per line, in file order. Blank
        lines are kept and appear as empty strings.
    """
    stripped_lines = []
    with open(file_path, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            stripped_lines.append(raw_line.strip())
    return stripped_lines

# 数据清洗
def clean_data(data):
    """Drop rows with missing values, then drop duplicated rows.

    Args:
        data: Object exposing ``dropna()`` whose result exposes
            ``drop_duplicates()`` (a pandas DataFrame in this file).

    Returns:
        The result of ``data.dropna().drop_duplicates()``; the input object
        is not modified by this function.
    """
    # Further cleaning steps can be added here.
    without_missing = data.dropna()
    deduplicated = without_missing.drop_duplicates()
    return deduplicated

# 数据增强
def augment_data(image, mode):
    """Convert an image into a normalised tensor, with random augmentation in training.

    Args:
        image: Image accepted by the torchvision transforms used below
            (the dataset in this file passes an RGB PIL image).
        mode: ``'train'`` selects the random augmentation pipeline; any
            other value selects the plain resize pipeline.

    Returns:
        The output of the selected transform pipeline applied to ``image``.
    """
    # Both pipelines finish with the same tensor conversion and
    # per-channel normalisation (mean 0.5, std 0.5).
    shared_tail = [
        transforms.ToTensor(),
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
    ]

    if mode != 'train':
        leading_steps = [transforms.Resize((64, 64))]
    else:
        leading_steps = [
            transforms.RandomHorizontalFlip(),
            transforms.RandomRotation(10),
            transforms.RandomResizedCrop(64, scale=(0.8, 1.0)),
            transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1),
        ]

    pipeline = transforms.Compose(leading_steps + shared_tail)
    return pipeline(image)

# 文本编码器
class TextEncoder(nn.Module):
    """Encode text into one feature vector per input string.

    The tokenizer and the model are both loaded with ``from_pretrained``
    using the same ``model_name``.
    """

    def __init__(self, model_name):
        """Load the pretrained tokenizer and model identified by ``model_name``."""
        super().__init__()
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name)

    @staticmethod
    def _masked_mean(hidden_states, attention_mask):
        """Average ``hidden_states`` over dim 1, counting only unmasked positions.

        Args:
            hidden_states: Tensor indexed as (batch, position, feature).
            attention_mask: Tensor indexed as (batch, position); non-zero
                marks positions that take part in the average.

        Returns:
            Tensor indexed as (batch, feature).
        """
        weights = attention_mask.unsqueeze(-1).to(hidden_states.dtype)
        summed = (hidden_states * weights).sum(dim=1)
        # Guard against a fully masked row to avoid dividing by zero.
        counts = weights.sum(dim=1).clamp(min=1.0)
        return summed / counts

    def forward(self, text):
        """Tokenize ``text`` and return the mean-pooled last hidden state.

        Args:
            text: A string or a list of strings, passed to the tokenizer.

        Returns:
            One pooled feature vector per input string.
        """
        inputs = self.tokenizer(text, return_tensors='pt', padding=True, truncation=True)
        # Keep the token tensors on the same device as the encoder weights.
        device = next(self.model.parameters()).device
        inputs = {name: tensor.to(device) for name, tensor in inputs.items()}
        outputs = self.model(**inputs)
        # With padding=True shorter texts are padded; a plain mean over dim 1
        # would mix padding positions into the feature, so weight by the mask.
        return self._masked_mean(outputs.last_hidden_state, inputs['attention_mask'])

# 图像生成器
class ImageGenerator(nn.Module):
    """Transposed-convolution decoder turning a feature vector into a 3-channel image.

    The feature vector is viewed as a 1x1 map with ``in_channels`` channels
    and passed through five transposed convolutions; the last one is
    followed by ``Tanh``.
    """

    def __init__(self, in_channels):
        """Build the decoder stack.

        Args:
            in_channels: Length of the feature vector fed to ``forward``.
        """
        super().__init__()

        # Opening stage: kernel 4, stride 1, no padding.
        stages = [
            nn.ConvTranspose2d(in_channels, 512, kernel_size=4, stride=1, padding=0),
            nn.BatchNorm2d(512),
            nn.ReLU(True),
        ]

        # Three identical stride-2 stages that narrow the channel count.
        for wide, narrow in ((512, 256), (256, 128), (128, 64)):
            stages.append(nn.ConvTranspose2d(wide, narrow, kernel_size=4, stride=2, padding=1))
            stages.append(nn.BatchNorm2d(narrow))
            stages.append(nn.ReLU(True))

        # Output stage: 3 channels squashed by Tanh.
        stages.append(nn.ConvTranspose2d(64, 3, kernel_size=4, stride=2, padding=1))
        stages.append(nn.Tanh())

        self.decoder = nn.Sequential(*stages)

    def forward(self, x):
        """Decode feature vectors ``x`` (batch, features) into images."""
        as_feature_maps = x.view(-1, x.size(1), 1, 1)
        return self.decoder(as_feature_maps)

# 判别器
class Discriminator(nn.Module):
    """Convolutional classifier scoring 3-channel images with a sigmoid output.

    Five convolutions are stacked; the first four use stride 2 with
    ``LeakyReLU(0.2)``, and the last one reduces to a single channel that
    is passed through ``Sigmoid``.
    """

    def __init__(self):
        """Build the convolution stack."""
        super().__init__()

        # First stage has no batch normalisation.
        layers = [
            nn.Conv2d(3, 64, kernel_size=4, stride=2, padding=1),
            nn.LeakyReLU(0.2, inplace=True),
        ]

        # Three stride-2 stages that widen the channel count.
        for narrow, wide in ((64, 128), (128, 256), (256, 512)):
            layers.extend([
                nn.Conv2d(narrow, wide, kernel_size=4, stride=2, padding=1),
                nn.BatchNorm2d(wide),
                nn.LeakyReLU(0.2, inplace=True),
            ])

        # Final stage: one output channel, squashed by Sigmoid.
        layers.extend([
            nn.Conv2d(512, 1, kernel_size=4, stride=1, padding=0),
            nn.Sigmoid(),
        ])

        self.main = nn.Sequential(*layers)

    def forward(self, x):
        """Return the sigmoid scores produced by the stack for images ``x``."""
        scores = self.main(x)
        return scores

# 模型定义
class TextToImageModel(nn.Module):
    """Text-to-image model: a text encoder followed by an image generator."""

    def __init__(self, text_encoder_model_name):
        """Create the text encoder and a generator matching its feature size.

        Args:
            text_encoder_model_name: Name or path forwarded to ``TextEncoder``.
        """
        super().__init__()
        self.text_encoder = TextEncoder(text_encoder_model_name)
        # Take the feature size from the loaded encoder so encoders with a
        # hidden size other than 768 also work; 768 (the value previously
        # hard-coded here for BERT) stays as the fallback.
        encoder_config = getattr(self.text_encoder.model, 'config', None)
        hidden_size = getattr(encoder_config, 'hidden_size', 768)
        self.image_generator = ImageGenerator(hidden_size)

    def forward(self, text):
        """Encode ``text`` and decode the resulting features into images."""
        text_features = self.text_encoder(text)
        return self.image_generator(text_features)

# 模型加载
def load_model(model_path, text_encoder_model_name):
    """Build a ``TextToImageModel`` and restore saved weights when available.

    Args:
        model_path: Path of a state-dict checkpoint. When the file does not
            exist the model keeps its freshly initialised generator weights.
        text_encoder_model_name: Name or path of the pretrained text encoder.

    Returns:
        The model, switched to evaluation mode.
    """
    model = TextToImageModel(text_encoder_model_name)
    if os.path.exists(model_path):
        # map_location='cpu' lets a checkpoint saved on another device load
        # on a machine without that device.
        # NOTE: torch.load unpickles the file; only load trusted checkpoints.
        state_dict = torch.load(model_path, map_location='cpu')
        model.load_state_dict(state_dict)
    model.eval()
    return model

# 图像保存
def save_image(image, path):
    """Save ``image`` to ``path``, creating the parent directory if needed.

    Args:
        image: Object with a ``save(path)`` method (a PIL image in this file).
        path: Destination file path. It may be a bare file name, in which
            case no directory is created.
    """
    directory = os.path.dirname(path)
    # dirname is '' for a bare file name, and os.makedirs('') raises.
    # exist_ok avoids failing when the directory appears between a check
    # and the creation.
    if directory:
        os.makedirs(directory, exist_ok=True)
    image.save(path)

# 数据集类
class TextToImageDataset(Dataset):
    """Dataset of (text, image) pairs described by a CSV file.

    The CSV is read with pandas and passed through ``clean_data``; rows are
    accessed through their ``'text'`` and ``'image_path'`` columns.
    """

    def __init__(self, csv_file, transform=None, mode='train'):
        """Load and clean the CSV table.

        Args:
            csv_file: Path of the CSV file.
            transform: Optional callable invoked as ``transform(image, mode)``.
            mode: Value forwarded to ``transform`` as its second argument.
        """
        self.mode = mode
        self.transform = transform
        self.data = clean_data(pd.read_csv(csv_file))

    def __len__(self):
        """Return the number of rows left after cleaning."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(text, image)`` for the row at position ``idx``."""
        row = self.data.iloc[idx]
        caption = row['text']
        picture = Image.open(row['image_path']).convert('RGB')
        if not self.transform:
            return caption, picture
        return caption, self.transform(picture, self.mode)

# 模型训练
def train_model(config):
    """Train the text-to-image generator against a discriminator and save it.

    Reads ``config['training']`` (``dataset_path``, ``batch_size``,
    ``learning_rate``, ``epochs``) and ``config['model']``
    (``text_encoder_model_name``, ``path``). The generator's state dict is
    written to ``config['model']['path']`` once all epochs have run.

    Args:
        config: Nested mapping with the keys listed above.

    Raises:
        ValueError: If the dataset contains no samples after cleaning.
    """
    training_cfg = config['training']
    model_cfg = config['model']

    dataset = TextToImageDataset(training_cfg['dataset_path'], transform=augment_data, mode='train')
    if len(dataset) == 0:
        # Fail early instead of dividing by zero when averaging the losses.
        raise ValueError("training dataset is empty after cleaning")
    dataloader = DataLoader(dataset, batch_size=training_cfg['batch_size'], shuffle=True)
    num_batches = len(dataloader)

    model = TextToImageModel(model_cfg['text_encoder_model_name'])
    discriminator = Discriminator()

    optimizer_g = optim.Adam(model.parameters(), lr=training_cfg['learning_rate'])
    optimizer_d = optim.Adam(discriminator.parameters(), lr=training_cfg['learning_rate'])

    criterion_gan = nn.BCELoss()
    criterion_l1 = nn.L1Loss()

    for epoch in range(training_cfg['epochs']):
        model.train()
        discriminator.train()
        running_loss_g = 0.0
        running_loss_d = 0.0

        for text, images in dataloader:
            batch_size = images.size(0)
            real_labels = torch.ones(batch_size, 1)
            fake_labels = torch.zeros(batch_size, 1)

            # One generator forward pass serves both steps: detached for
            # the discriminator, attached for the generator.
            generated_images = model(list(text))

            # Train Discriminator
            optimizer_d.zero_grad()
            # The discriminator returns (batch, 1, 1, 1); BCELoss needs the
            # prediction and the (batch, 1) labels to have the same shape.
            real_outputs = discriminator(images).view(-1, 1)
            d_loss_real = criterion_gan(real_outputs, real_labels)

            fake_outputs = discriminator(generated_images.detach()).view(-1, 1)
            d_loss_fake = criterion_gan(fake_outputs, fake_labels)

            d_loss = (d_loss_real + d_loss_fake) / 2
            d_loss.backward()
            optimizer_d.step()

            # Train Generator
            optimizer_g.zero_grad()
            g_outputs = discriminator(generated_images).view(-1, 1)
            g_loss_gan = criterion_gan(g_outputs, real_labels)
            g_loss_l1 = criterion_l1(generated_images, images)
            g_loss = g_loss_gan + 100 * g_loss_l1  # Weighted sum of GAN loss and L1 loss
            g_loss.backward()
            optimizer_g.step()

            running_loss_g += g_loss.item()
            running_loss_d += d_loss.item()

        print(f"Epoch {epoch + 1}, Generator Loss: {running_loss_g / num_batches}, Discriminator Loss: {running_loss_d / num_batches}")

    # Save the trained generator; torch.save does not create missing
    # parent directories, so create them first.
    model_path = model_cfg['path']
    model_dir = os.path.dirname(model_path)
    if model_dir:
        os.makedirs(model_dir, exist_ok=True)
    torch.save(model.state_dict(), model_path)

# 图像生成
def generate_images(model, text_data, output_dir):
    """Generate one image per text prompt and save it as a PNG file.

    Each image is saved as ``<output_dir>/<prompt>.png``. Characters that
    cannot appear in a file name are replaced by ``_``, and blank prompts
    are skipped.

    Args:
        model: Object exposing ``text_encoder`` and ``image_generator``
            callables (a ``TextToImageModel`` in this file).
        text_data: Iterable of prompt strings.
        output_dir: Directory receiving the PNG files.
    """
    # Path separators and characters rejected by common file systems.
    forbidden = set('\\/:*?"<>|')

    for text in text_data:
        if not text.strip():
            # A blank prompt would otherwise be saved as a file named ".png".
            continue

        # Inference only: do not build an autograd graph.
        with torch.no_grad():
            input_tensor = model.text_encoder([text])
            image = model.image_generator(input_tensor)

        image = image.squeeze(0).detach().cpu().numpy()
        # Rescale from the generator's Tanh range [-1, 1] to [0, 255].
        image = (image * 127.5 + 127.5).astype('uint8')
        # Reorder channel-first data to the channel-last layout PIL expects.
        image = Image.fromarray(image.transpose(1, 2, 0))

        # Save the image under a file-system-safe name.
        file_stem = ''.join(
            '_' if (ch in forbidden or ord(ch) < 32) else ch for ch in text
        )
        save_image(image, f"{output_dir}/{file_stem}.png")

# 图形用户界面
class TextToImageGUI:
    """Tkinter window with a prompt box, a train button and a generate button.

    The configuration is read from ``config.yaml`` and the model is loaded
    from the path given in that configuration when the window is created.
    """

    def __init__(self, root):
        """Build the widgets inside ``root`` and load configuration and model."""
        self.root = root
        self.root.title("文本生成图像")
        self.config = load_config('config.yaml')
        self.model = load_model(self.config['model']['path'], self.config['model']['text_encoder_model_name'])

        self.text_input = tk.Text(root, height=10, width=50)
        self.text_input.pack(pady=10)

        self.train_button = tk.Button(root, text="训练模型", command=self.train_model)
        self.train_button.pack(pady=10)

        self.generate_button = tk.Button(root, text="生成图像", command=self.generate_image)
        self.generate_button.pack(pady=10)

        self.image_label = tk.Label(root)
        self.image_label.pack(pady=10)

    @staticmethod
    def _safe_filename(text):
        """Return ``text`` with characters unusable in file names replaced by ``_``.

        Path separators, the characters ``: * ? " < > |`` and control
        characters (including newlines from multi-line prompts) are replaced.
        """
        forbidden = '\\/:*?"<>|'
        return ''.join(
            '_' if (ch in forbidden or ord(ch) < 32) else ch for ch in text
        )

    def train_model(self):
        """Run training, reload the saved model and report the outcome."""
        try:
            train_model(self.config)
            self.model = load_model(self.config['model']['path'], self.config['model']['text_encoder_model_name'])
        except Exception as exc:
            # Tell the user, then re-raise so Tk still reports the traceback.
            messagebox.showerror("错误", f"模型训练失败:{exc}")
            raise
        messagebox.showinfo("成功", "模型训练完成")

    def generate_image(self):
        """Generate an image for the entered prompt, display it and save it."""
        text = self.text_input.get("1.0", tk.END).strip()
        if not text:
            messagebox.showwarning("警告", "请输入文本")
            return

        # Inference only: do not build an autograd graph.
        with torch.no_grad():
            input_tensor = self.model.text_encoder([text])
            image = self.model.image_generator(input_tensor)
        image = image.squeeze(0).detach().cpu().numpy()
        # Rescale from the generator's Tanh range [-1, 1] to [0, 255].
        image = (image * 127.5 + 127.5).astype('uint8')
        image = Image.fromarray(image.transpose(1, 2, 0))

        # Display the image; keep a reference on the label so the Tk image
        # object is not garbage-collected while it is shown.
        img_tk = ImageTk.PhotoImage(image)
        self.image_label.config(image=img_tk)
        self.image_label.image = img_tk

        # Save the image; the prompt may span several lines or contain
        # characters that are not valid in a file name.
        file_stem = self._safe_filename(text)
        save_image(image, f"{self.config['data']['output_dir']}/{file_stem}.png")
        messagebox.showinfo("成功", "图像已生成并保存")

def main():
    """Generate images for the configured prompts, then open the GUI.

    Reads ``config.yaml``, loads the model, generates one image per line of
    the configured input file and finally runs the Tk main loop.
    """
    settings = load_config('config.yaml')
    model_settings = settings['model']
    data_settings = settings['data']

    # Load the model.
    generator = load_model(model_settings['path'], model_settings['text_encoder_model_name'])

    # Load the prompts and generate one image for each of them.
    prompts = load_text_data(data_settings['input_file'])
    generate_images(generator, prompts, data_settings['output_dir'])

    # Start the graphical user interface.
    window = tk.Tk()
    app = TextToImageGUI(window)
    window.mainloop()


if __name__ == "__main__":
    main()

希望这些改进能帮助你更好地实现文本生成图像的功能。如果有任何问题或需要进一步的帮助,请随时告诉我!

相关推荐
没事勤琢磨13 小时前
如何让 OpenClaw 控制使用浏览器:让 AI 像真人一样操控你的浏览器
人工智能
用户51914958484513 小时前
CrushFTP 认证绕过漏洞利用工具 (CVE-2024-4040)
人工智能·aigc
牛马摆渡人52813 小时前
OpenClaw实战--Day1: 本地化
人工智能
前端小豆13 小时前
玩转 OpenClaw:打造你的私有 AI 助手网关
人工智能
BugShare14 小时前
写一个你自己的Agent Skills
人工智能·程序员
机器之心14 小时前
英伟达护城河被AI攻破,字节清华CUDA Agent,让人人能搓CUDA内核
人工智能·openai
后端小肥肠15 小时前
公众号躺更神器!OpenClaw+Claude Skill 实现自动读对标 + 写文 + 配图 + 存入草稿箱
人工智能·aigc·agent
爱可生开源社区15 小时前
SCALE | 重构 AI 时代数据库能力的全新评估标准
人工智能
Jahzo16 小时前
openclaw本地化部署体验与踩坑记录--飞书机器人配置
人工智能·开源
Narrastory16 小时前
明日香 - Pytorch 快速入门保姆级教程(一)
人工智能·pytorch·深度学习