伏羲0.06(文生图)

主要改进点:

数据准备:

数据清洗:增加了 clean_data 函数,用于去除空值和重复值。

数据增强:增加了 augment_data 函数,用于在训练时进行数据增强。

模型选择:

生成对抗网络 (GAN):增加了 Discriminator 类,用于判别生成的图像是否真实。

损失函数:增加了 GAN(二元交叉熵)损失和 L1 损失;判别器只使用 GAN 损失,生成器使用 GAN 损失与 L1 损失的加权和。

模型架构设计:

文本编码器:使用预训练的 Transformer 模型(如 BERT)来编码文本描述。

图像生成器:增加了更多的转置卷积层,并使用了批量归一化和 ReLU/Tanh 激活函数。

多模态融合:当前实现直接把文本编码器输出的特征向量作为图像生成器的输入,使生成的图像以文本描述为条件;尚未加入独立的图像特征融合模块。

训练过程:

损失函数:判别器使用 GAN 损失(对真实图像与生成图像的损失取平均);生成器使用 GAN 损失加上权重为 100 的 L1 损失。

优化算法:使用 Adam 优化器。

训练策略:使用批量归一化来稳定训练过程;梯度裁剪在下面的代码中尚未实现,可按需加入。

正则化:防止过拟合,可以使用 L1/L2 正则化、Dropout 等技术。

完整代码(Python):
import tkinter as tk
from tkinter import filedialog, messagebox
from PIL import Image, ImageTk
import torch
import torch.optim as optim
import torch.nn as nn
import torchvision.transforms as transforms
import yaml
import os
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel
import random
import numpy as np

# Configuration loading
def load_config(config_path):
    """Parse the YAML file at ``config_path`` and return its contents.

    The file is opened as UTF-8 text and parsed with ``yaml.safe_load``; the
    return value is whatever that parser produces for the document.
    """
    with open(config_path, 'r', encoding='utf-8') as stream:
        return yaml.safe_load(stream)

# Text data loading
def load_text_data(file_path):
    """Read one text prompt per line from a UTF-8 text file.

    Surrounding whitespace is stripped from every line. Lines that are empty
    after stripping are skipped, so blank lines in the input file do not turn
    into empty prompts (and empty output file names) further down the
    pipeline.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A list with the non-empty, stripped lines in file order.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        # Iterate the file directly instead of materialising readlines().
        stripped_lines = (line.strip() for line in file)
        return [line for line in stripped_lines if line]

# Data cleaning
def clean_data(data):
    """Return ``data`` without rows that have missing values or are duplicated.

    Rows containing any missing value are dropped first, then duplicated rows
    are removed. ``data`` must expose ``dropna`` and ``drop_duplicates`` (in
    this file it is the DataFrame returned by ``pd.read_csv``).
    """
    # Further cleaning steps can be added here.
    without_missing = data.dropna()
    deduplicated = without_missing.drop_duplicates()
    return deduplicated

# Data augmentation
def augment_data(image, mode):
    """Apply the preprocessing pipeline for ``mode`` to ``image``.

    With ``mode == 'train'`` the image goes through a random horizontal flip,
    a random rotation (10), a random resized crop to 64 and colour jitter.
    For any other mode it is only resized to 64x64. Both pipelines finish
    with ``ToTensor`` followed by ``Normalize`` using mean 0.5 and std 0.5
    per channel.
    """
    # Tail shared by both pipelines.
    tensor_steps = [
        transforms.ToTensor(),
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
    ]

    if mode != 'train':
        leading_steps = [transforms.Resize((64, 64))]
    else:
        leading_steps = [
            transforms.RandomHorizontalFlip(),
            transforms.RandomRotation(10),
            transforms.RandomResizedCrop(64, scale=(0.8, 1.0)),
            transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1),
        ]

    pipeline = transforms.Compose(leading_steps + tensor_steps)
    return pipeline(image)

# Text encoder
class TextEncoder(nn.Module):
    """Encode text into one fixed-size feature vector per input string.

    Wraps a pretrained tokenizer/model pair loaded by name through
    ``AutoTokenizer`` and ``AutoModel``.
    """

    def __init__(self, model_name):
        """Load the tokenizer and model identified by ``model_name``."""
        super(TextEncoder, self).__init__()
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name)

    def forward(self, text):
        """Return the mean-pooled last hidden state for ``text``.

        Args:
            text: A string or a list of strings accepted by the tokenizer.

        Returns:
            A tensor with one row per input string, obtained by averaging
            the last hidden state over the real (non-padding) tokens.
        """
        inputs = self.tokenizer(text, return_tensors='pt', padding=True, truncation=True)
        # The tokenizer always produces CPU tensors; move them next to the
        # model weights so the encoder also works after ``.to(device)``.
        inputs = inputs.to(next(self.model.parameters()).device)
        hidden = self.model(**inputs).last_hidden_state

        attention_mask = inputs.get('attention_mask')
        if attention_mask is None:
            # No mask available: fall back to the plain mean over tokens.
            return hidden.mean(dim=1)

        # With padding=True shorter texts in a batch are padded; a plain
        # mean would let the padding positions dilute the text features.
        mask = attention_mask.unsqueeze(-1).to(hidden.dtype)
        token_counts = mask.sum(dim=1).clamp(min=1)
        return (hidden * mask).sum(dim=1) / token_counts

# Image generator
class ImageGenerator(nn.Module):
    """Decode a feature vector into a 3-channel image with transposed convolutions.

    The input is reshaped to a 1x1 spatial map. The first layer expands it to
    4x4 and each of the four following stride-2 layers doubles the spatial
    size (64x64 at the output). A final ``Tanh`` keeps values in [-1, 1].
    """

    def __init__(self, in_channels):
        """Build the decoder; ``in_channels`` is the input feature size."""
        super().__init__()

        # Stem: kernel 4 / stride 1 / no padding turns 1x1 into 4x4.
        layers = [
            nn.ConvTranspose2d(in_channels, 512, kernel_size=4, stride=1, padding=0),
            nn.BatchNorm2d(512),
            nn.ReLU(True),
        ]
        # Upsampling stages, each followed by batch norm and ReLU.
        for channels_in, channels_out in ((512, 256), (256, 128), (128, 64)):
            layers.extend([
                nn.ConvTranspose2d(channels_in, channels_out, kernel_size=4, stride=2, padding=1),
                nn.BatchNorm2d(channels_out),
                nn.ReLU(True),
            ])
        # Output stage: RGB image squashed by Tanh.
        layers.extend([
            nn.ConvTranspose2d(64, 3, kernel_size=4, stride=2, padding=1),
            nn.Tanh(),
        ])
        self.decoder = nn.Sequential(*layers)

    def forward(self, x):
        """Reshape ``x`` to (N, C, 1, 1) and run it through the decoder."""
        feature_map = x.view(-1, x.size(1), 1, 1)
        return self.decoder(feature_map)

# Discriminator
class Discriminator(nn.Module):
    """Convolutional classifier that scores 3-channel images.

    Four stride-2 convolutions with LeakyReLU (batch norm on all but the
    first) are followed by a 4x4 convolution to one channel and a
    ``Sigmoid``, so every output value lies in [0, 1].
    """

    def __init__(self):
        """Assemble the convolutional stack in ``self.main``."""
        super().__init__()

        # First stage has no batch normalisation.
        layers = [
            nn.Conv2d(3, 64, kernel_size=4, stride=2, padding=1),
            nn.LeakyReLU(0.2, inplace=True),
        ]
        # Remaining downsampling stages: conv -> batch norm -> LeakyReLU.
        for channels_in, channels_out in ((64, 128), (128, 256), (256, 512)):
            layers.extend([
                nn.Conv2d(channels_in, channels_out, kernel_size=4, stride=2, padding=1),
                nn.BatchNorm2d(channels_out),
                nn.LeakyReLU(0.2, inplace=True),
            ])
        # Head: collapse to a single channel and squash with Sigmoid.
        layers.extend([
            nn.Conv2d(512, 1, kernel_size=4, stride=1, padding=0),
            nn.Sigmoid(),
        ])
        self.main = nn.Sequential(*layers)

    def forward(self, x):
        """Return the output of the convolutional stack for ``x``."""
        scores = self.main(x)
        return scores

# Model definition
class TextToImageModel(nn.Module):
    """Text-to-image model: a text encoder feeding an image generator."""

    def __init__(self, text_encoder_model_name):
        """Create the encoder and a generator sized to the encoder output.

        Args:
            text_encoder_model_name: Name of the pretrained model handed to
                ``TextEncoder``.
        """
        super(TextToImageModel, self).__init__()
        self.text_encoder = TextEncoder(text_encoder_model_name)
        # Read the feature size from the loaded encoder instead of assuming
        # 768 (the BERT-base hidden size), so encoders with another hidden
        # size work too. 768 stays as the fallback when the config has no
        # ``hidden_size`` attribute.
        hidden_size = getattr(self.text_encoder.model.config, 'hidden_size', 768)
        self.image_generator = ImageGenerator(hidden_size)

    def forward(self, text):
        """Encode ``text`` and return the images generated from the features."""
        text_features = self.text_encoder(text)
        return self.image_generator(text_features)

# Model loading
def load_model(model_path, text_encoder_model_name):
    """Build a ``TextToImageModel`` and load saved weights when available.

    Args:
        model_path: Path of a state-dict file written by ``torch.save``. If
            the file does not exist the model keeps its freshly initialised
            generator weights.
        text_encoder_model_name: Name of the pretrained text encoder.

    Returns:
        The model, switched to evaluation mode.
    """
    model = TextToImageModel(text_encoder_model_name)
    if os.path.exists(model_path):
        # The model is built on the CPU; map_location makes a checkpoint
        # that was saved from a GPU loadable on a machine without one.
        state_dict = torch.load(model_path, map_location='cpu')
        model.load_state_dict(state_dict)
    model.eval()
    return model

# Image saving
def save_image(image, path):
    """Save ``image`` to ``path``, creating the parent directory if needed.

    Args:
        image: Object with a ``save(path)`` method (a PIL image in this file).
        path: Destination file path. A bare file name (no directory part)
            is saved relative to the current working directory.
    """
    directory = os.path.dirname(path)
    # dirname is '' for a bare file name and os.makedirs('') raises, so only
    # create a directory when there is one. exist_ok avoids the race between
    # checking for the directory and creating it.
    if directory:
        os.makedirs(directory, exist_ok=True)
    image.save(path)

# Dataset class
class TextToImageDataset(Dataset):
    """Dataset of (text, image) pairs described by a CSV file.

    The CSV is read with pandas and passed through ``clean_data``. Each row
    must provide a ``text`` column and an ``image_path`` column.
    """

    def __init__(self, csv_file, transform=None, mode='train'):
        """Load the CSV and remember the transform and mode.

        Args:
            csv_file: Path of the CSV file.
            transform: Optional callable invoked as ``transform(image, mode)``.
            mode: Value forwarded to ``transform`` as its second argument.
        """
        self.data = clean_data(pd.read_csv(csv_file))
        self.transform = transform
        self.mode = mode

    def __len__(self):
        """Return the number of rows kept after cleaning."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(text, image)`` for the row at position ``idx``."""
        row = self.data.iloc[idx]
        text = row['text']
        image = Image.open(row['image_path']).convert('RGB')
        if not self.transform:
            # No transform configured: hand back the RGB image as opened.
            return text, image
        return text, self.transform(image, self.mode)

# Model training
def train_model(config):
    """Train the text-to-image generator adversarially and save its weights.

    Reads ``config['training']`` (``dataset_path``, ``batch_size``,
    ``learning_rate``, ``epochs``) and ``config['model']``
    (``text_encoder_model_name``, ``path``). The discriminator is trained
    with binary cross-entropy on real and generated images; the generator is
    trained with the same GAN loss plus an L1 loss weighted by 100. Per-epoch
    average losses are printed and the generator state dict is written to
    ``config['model']['path']``.

    Raises:
        ValueError: If the dataset yields no batches.
    """
    training_cfg = config['training']

    dataset = TextToImageDataset(training_cfg['dataset_path'], transform=augment_data, mode='train')
    dataloader = DataLoader(dataset, batch_size=training_cfg['batch_size'], shuffle=True)
    num_batches = len(dataloader)
    if num_batches == 0:
        # Otherwise the per-epoch average below divides by zero.
        raise ValueError("Training dataset is empty; nothing to train on.")

    model = TextToImageModel(config['model']['text_encoder_model_name'])
    discriminator = Discriminator()

    optimizer_g = optim.Adam(model.parameters(), lr=training_cfg['learning_rate'])
    optimizer_d = optim.Adam(discriminator.parameters(), lr=training_cfg['learning_rate'])

    criterion_gan = nn.BCELoss()
    criterion_l1 = nn.L1Loss()

    for epoch in range(training_cfg['epochs']):
        model.train()
        discriminator.train()
        running_loss_g = 0.0
        running_loss_d = 0.0

        for text, images in dataloader:
            batch_size = images.size(0)
            real_labels = torch.ones(batch_size, 1, device=images.device)
            fake_labels = torch.zeros(batch_size, 1, device=images.device)

            # One generator pass per step: the detached result trains the
            # discriminator, the attached one trains the generator. The
            # generator weights do not change in between, so a second
            # forward pass (including the text encoder) would be redundant.
            generated_images = model(list(text))

            # Train Discriminator
            # The discriminator output is reshaped to (batch, 1) so that it
            # matches the label shape; BCELoss rejects mismatched shapes.
            optimizer_d.zero_grad()
            real_outputs = discriminator(images).reshape(-1, 1)
            d_loss_real = criterion_gan(real_outputs, real_labels)

            fake_outputs = discriminator(generated_images.detach()).reshape(-1, 1)
            d_loss_fake = criterion_gan(fake_outputs, fake_labels)

            d_loss = (d_loss_real + d_loss_fake) / 2
            d_loss.backward()
            optimizer_d.step()

            # Train Generator
            optimizer_g.zero_grad()
            g_outputs = discriminator(generated_images).reshape(-1, 1)
            g_loss_gan = criterion_gan(g_outputs, real_labels)
            g_loss_l1 = criterion_l1(generated_images, images)
            g_loss = g_loss_gan + 100 * g_loss_l1  # Weighted sum of GAN loss and L1 loss
            g_loss.backward()
            optimizer_g.step()

            running_loss_g += g_loss.item()
            running_loss_d += d_loss.item()

        print(f"Epoch {epoch + 1}, Generator Loss: {running_loss_g / num_batches}, Discriminator Loss: {running_loss_d / num_batches}")

    # Save the trained model, creating the target directory first so a long
    # training run is not lost to a missing folder.
    model_path = config['model']['path']
    model_dir = os.path.dirname(model_path)
    if model_dir:
        os.makedirs(model_dir, exist_ok=True)
    torch.save(model.state_dict(), model_path)

# Image generation
def _sanitize_filename(text, max_bytes=200):
    """Turn free text into a string that is safe to use as a file name.

    Path separators, characters that are invalid in Windows file names and
    control characters (including newlines) are replaced by ``_``. The
    result is limited to ``max_bytes`` UTF-8 bytes and falls back to
    ``'image'`` when nothing is left.
    """
    forbidden = '\\/:*?"<>|'
    cleaned = ''.join(
        '_' if (ch in forbidden or ord(ch) < 32) else ch for ch in text
    )
    # Cut on bytes, not characters: file-name limits are byte based and
    # CJK text uses several bytes per character. A partially cut character
    # at the end is dropped by errors='ignore'.
    cleaned = cleaned.encode('utf-8')[:max_bytes].decode('utf-8', errors='ignore')
    return cleaned or 'image'


def generate_images(model, text_data, output_dir):
    """Generate one PNG per prompt in ``text_data`` and save it in ``output_dir``.

    Args:
        model: Object exposing ``text_encoder`` and ``image_generator``
            callables (a ``TextToImageModel`` in this file).
        text_data: Iterable of prompt strings.
        output_dir: Directory receiving ``<prompt>.png`` files; the prompt is
            sanitised before it is used as a file name.
    """
    # Inference only: do not record an autograd graph.
    with torch.no_grad():
        for text in text_data:
            input_tensor = model.text_encoder([text])
            image = model.image_generator(input_tensor)
            image = image.squeeze(0).detach().cpu().numpy()
            # Map the generator's [-1, 1] range to [0, 255]; clip defensively
            # so out-of-range values cannot wrap around in the uint8 cast.
            image = (image * 127.5 + 127.5).clip(0, 255).astype('uint8')
            # Channels-first to channels-last for PIL.
            image = Image.fromarray(image.transpose(1, 2, 0))

            # Save the image. The raw prompt may contain path separators or
            # other characters that are not valid in a file name.
            save_image(image, os.path.join(output_dir, f"{_sanitize_filename(text)}.png"))

# Graphical user interface
class TextToImageGUI:
    """Tkinter front end for training the model and generating images.

    The window contains a text box for the prompt, a button that trains the
    model, a button that generates an image from the prompt, and a label
    that displays the generated image.
    """

    def __init__(self, root):
        """Load ``config.yaml`` and the model, then build the widgets."""
        self.root = root
        self.root.title("文本生成图像")
        self.config = load_config('config.yaml')
        self.model = load_model(self.config['model']['path'], self.config['model']['text_encoder_model_name'])

        self.text_input = tk.Text(root, height=10, width=50)
        self.text_input.pack(pady=10)

        self.train_button = tk.Button(root, text="训练模型", command=self.train_model)
        self.train_button.pack(pady=10)

        self.generate_button = tk.Button(root, text="生成图像", command=self.generate_image)
        self.generate_button.pack(pady=10)

        self.image_label = tk.Label(root)
        self.image_label.pack(pady=10)

    @staticmethod
    def _safe_filename(text, max_bytes=200):
        """Turn free text into a string that is safe to use as a file name.

        Path separators, characters that are invalid in Windows file names
        and control characters (including newlines) are replaced by ``_``.
        The result is limited to ``max_bytes`` UTF-8 bytes and falls back to
        ``'image'`` when nothing is left.
        """
        forbidden = '\\/:*?"<>|'
        cleaned = ''.join(
            '_' if (ch in forbidden or ord(ch) < 32) else ch for ch in text
        )
        # Cut on bytes: file-name limits are byte based and CJK text uses
        # several bytes per character.
        cleaned = cleaned.encode('utf-8')[:max_bytes].decode('utf-8', errors='ignore')
        return cleaned or 'image'

    def train_model(self):
        """Train with the loaded config, reload the model and notify the user.

        NOTE: training runs inside the Tk callback, so the window does not
        respond until it finishes.
        """
        try:
            train_model(self.config)
        except Exception as exc:
            # Tk only prints callback errors to stderr; show the failure in
            # the GUI as well, then re-raise so the traceback is still logged.
            messagebox.showerror("错误", f"模型训练失败: {exc}")
            raise
        self.model = load_model(self.config['model']['path'], self.config['model']['text_encoder_model_name'])
        messagebox.showinfo("成功", "模型训练完成")

    def generate_image(self):
        """Generate an image from the text box, display it and save it."""
        text = self.text_input.get("1.0", tk.END).strip()
        if not text:
            messagebox.showwarning("警告", "请输入文本")
            return

        # Inference only: do not record an autograd graph.
        with torch.no_grad():
            input_tensor = self.model.text_encoder([text])
            image = self.model.image_generator(input_tensor)
        image = image.squeeze(0).detach().cpu().numpy()
        # Map the generator's [-1, 1] range to [0, 255]; clip defensively so
        # out-of-range values cannot wrap around in the uint8 cast.
        image = (image * 127.5 + 127.5).clip(0, 255).astype('uint8')
        image = Image.fromarray(image.transpose(1, 2, 0))

        # Display the image. The reference stored on the label keeps the
        # PhotoImage alive after this method returns.
        img_tk = ImageTk.PhotoImage(image)
        self.image_label.config(image=img_tk)
        self.image_label.image = img_tk

        # Save the image. The text box is multi-line, so the prompt can
        # contain newlines or path separators that are invalid in file names.
        filename = f"{self._safe_filename(text)}.png"
        save_image(image, os.path.join(self.config['data']['output_dir'], filename))
        messagebox.showinfo("成功", "图像已生成并保存")

def main():
    """Generate images for the configured prompts, then open the GUI."""
    config = load_config('config.yaml')
    model_cfg = config['model']
    data_cfg = config['data']

    # Load the model.
    model = load_model(model_cfg['path'], model_cfg['text_encoder_model_name'])

    # Load the text prompts.
    text_data = load_text_data(data_cfg['input_file'])

    # Generate one image per prompt.
    generate_images(model, text_data, data_cfg['output_dir'])

    # Start the graphical user interface; ``app`` is kept referenced for
    # as long as the main loop runs.
    root = tk.Tk()
    app = TextToImageGUI(root)
    root.mainloop()


if __name__ == "__main__":
    main()

希望这些改进能帮助你更好地实现文本生成图像的功能。如果有任何问题或需要进一步的帮助,请随时告诉我!

相关推荐
AI周红伟6 分钟前
AI学习第一课:OpenClaw企业实战应用工作坊
大数据·人工智能
AI科技星19 分钟前
全域数学·第二部 几何本原部 《无穷维射影几何原本》合订典藏版【乖乖数学】
人工智能·线性代数·数学建模·矩阵·量子计算
ProgramHelpOa27 分钟前
Optiver 2026 OA 全面复盘|26NG / Intern 最新高频题型整理
人工智能·算法·机器学习
MobotStone28 分钟前
一个人的 AI 能力级别:从会问,到会用,再到会造系统
人工智能
暗夜猎手-大魔王41 分钟前
ClaudeCode提示词工程学习
人工智能
ShareCreators42 分钟前
新能源车险拐点将至,险企迎来千亿增长机遇
人工智能·汽车·blueberry
一只理智恩44 分钟前
一个会“顶嘴”、会陪聊、拥有数字人的情绪型 AI
人工智能
weixin_511840471 小时前
2026年5月4日 AI对存储产业链上下游影响的深度研究
人工智能·市场分析
anew___1 小时前
深度学习基础全攻略
人工智能
2zcode1 小时前
基于深度学习的肺部听诊音疾病智能诊断方法研究
人工智能·深度学习