GPT-2 文本生成任务全流程

数据集下载

数据预处理

python 复制代码
import json
import os

import pandas as pd

# Collect the "text" field of every record in the JSON-lines corpus.
all_data = []

with open("part-00018.jsonl", encoding="utf-8") as f:
    # Iterate the file object directly so lines are streamed instead of
    # materialising the whole file as a list first.
    for line in f:
        # A blank line (typically a trailing one) is not valid JSON; skip it
        # rather than letting json.loads raise.
        if not line.strip():
            continue
        data = json.loads(line)
        all_data.append(data["text"])

# Number of texts written to each CSV shard.
batch_size = 10000

# DataFrame.to_csv does not create missing parent directories, so make sure
# the output folder exists before the first shard is written.
os.makedirs("./data", exist_ok=True)

for i in range(0, len(all_data), batch_size):
    # Slicing past the end of the list is safe, so the last shard may simply
    # hold fewer than batch_size rows.
    df = pd.DataFrame({"content": all_data[i:i + batch_size]})
    # Each shard is named after the index of its first record.
    df.to_csv(f"./data/{i}.csv", index=False)

GPT-2 模型的配置

这部分代码的功能是初始化一个 GPT-2 模型的配置对象 GPT2Config,该对象将用于后续创建 GPT-2 模型实例。

方式一:在线配置

python 复制代码
# Load the GPT-2 configuration identified by "openai-community/gpt2" and
# override the fields that depend on the tokenizer and context window used
# in this project.
config = GPT2Config.from_pretrained(
    "openai-community/gpt2",
    vocab_size=len(tokenizer),
    n_ctx=context_length,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
)

方式二:复制官网配置文件到本地

创建本地文件夹 config(下方代码从 config/gpt2.config 读取配置)

复制官网配置文件到本地 https://huggingface.co/openai-community/gpt2/blob/main/config.json

json 复制代码
{
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_layer": 12,
  "n_positions": 1024,
  "resid_pdrop": 0.1,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "vocab_size": 50257
}
python 复制代码
# Load the GPT-2 configuration from the local path "config/gpt2.config" and
# override the fields that depend on the tokenizer and context window used
# in this project.
config = GPT2Config.from_pretrained(
    "config/gpt2.config",
    vocab_size=len(tokenizer),
    n_ctx=context_length,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
)

模型映射、模型训练

python 复制代码
from glob import glob
import os
from torch.utils.data import Dataset
from datasets import load_dataset
import random
from transformers import BertTokenizerFast
from transformers import GPT2Config
from transformers import GPT2LMHeadModel
from transformers import DataCollatorForLanguageModeling
from transformers import Trainer,TrainingArguments

def tokenize(element):
    """Tokenize a batch of texts and keep only full-length chunks.

    Relies on the module-level names ``tokenizer`` and ``context_length``.

    Args:
        element: Mapping whose "content" entry holds the texts to encode.

    Returns:
        A dict with a single "input_ids" list containing every chunk whose
        reported length equals ``context_length``; other chunks are dropped.
    """
    encoded = tokenizer(
        element["content"],
        truncation=True,
        max_length=context_length,
        return_overflowing_tokens=True,
        return_length=True,
    )

    # Pair each chunk with its reported length and discard the ones that do
    # not fill the whole context window.
    full_chunks = [
        ids
        for size, ids in zip(encoded["length"], encoded["input_ids"])
        if size == context_length
    ]

    return {"input_ids": full_chunks}

if __name__ == "__main__":
    # Training entry point: split the CSV shards under data/ into train and
    # validation sets, tokenize them, build a GPT-2 model from the local
    # config and train it with the Hugging Face Trainer.
    random.seed(1002)
    test_rate = 0.2
    # Module-level name: tokenize() reads it as a global.
    context_length = 128

    # glob makes no ordering guarantee (it depends on the file system), so
    # sort the paths first; otherwise the fixed seed above would not produce
    # the same split on every machine.
    all_files = sorted(glob(pathname=os.path.join("data", "*")))
    if len(all_files) < 2:
        raise SystemExit(
            "Need at least two files under data/ to build a train/validation split."
        )

    # Always hold out at least one file so the validation split is not empty
    # when there are fewer than 1 / test_rate shards.
    test_count = max(1, int(len(all_files) * test_rate))
    test_file_list = random.sample(all_files, test_count)
    held_out = set(test_file_list)
    train_file_list = [path for path in all_files if path not in held_out]

    # The split key "vaild" is spelled this way on purpose: the same key is
    # used below when the evaluation dataset is looked up.
    raw_datasets = load_dataset(
        "csv",
        data_files={"train": train_file_list, "vaild": test_file_list},
        cache_dir="cache_data",
    )

    # Module-level name: tokenize() reads it as a global.
    tokenizer = BertTokenizerFast.from_pretrained("D:/bert-base-chinese")
    # Register begin/end markers; len(tokenizer) below then includes them.
    tokenizer.add_special_tokens({"bos_token": "[begin]", "eos_token": "[end]"})

    # Replace the raw text column with fixed-length chunks of token ids.
    tokenize_datasets = raw_datasets.map(
        tokenize,
        batched=True,
        remove_columns=raw_datasets["train"].column_names,
    )

    config = GPT2Config.from_pretrained(
        "config/gpt2.config",
        vocab_size=len(tokenizer),
        n_ctx=context_length,
        bos_token_id=tokenizer.bos_token_id,
        eos_token_id=tokenizer.eos_token_id,
    )

    # Built from the config alone, so no pretrained weights are loaded here.
    model = GPT2LMHeadModel(config)
    model_size = sum(t.numel() for t in model.parameters())
    print(f"model_size: {model_size/1000/1000} M")

    # mlm=False selects the causal (next-token) language-modeling objective.
    data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

    # NOTE(review): eval_steps only takes effect when the evaluation strategy
    # is "steps"; it is not set here, so periodic evaluation presumably never
    # runs. The argument name depends on the installed transformers version
    # (evaluation_strategy vs. eval_strategy) - confirm and set it.
    args = TrainingArguments(
        learning_rate=1e-5,
        num_train_epochs=100,
        per_device_train_batch_size=10,
        per_device_eval_batch_size=10,
        eval_steps=2000,
        logging_steps=2000,
        gradient_accumulation_steps=5,
        weight_decay=0.1,
        warmup_steps=1000,
        lr_scheduler_type="cosine",
        save_steps=100,
        output_dir="model_output",
        fp16=True,
    )

    trainer = Trainer(
        model=model,
        args=args,
        tokenizer=tokenizer,
        data_collator=data_collator,
        train_dataset=tokenize_datasets["train"],
        eval_dataset=tokenize_datasets["vaild"],
    )

    trainer.train()

文本生成交互界面

python 复制代码
from transformers import GPT2LMHeadModel,BertTokenizerFast
import os

import torch

model_path = os.path.join("model_output", "checkpoint-100")

# Load the tokenizer stored in the checkpoint instead of the stock
# "bert-base-chinese" one: training added "[begin]"/"[end]" special tokens and
# sized the model vocabulary from that extended tokenizer, so the stock
# tokenizer would not match the model's vocabulary.
# NOTE(review): assumes the checkpoint was written by a Trainer that was given
# the tokenizer, so the tokenizer files sit next to the model weights - confirm.
tokenizer = BertTokenizerFast.from_pretrained(model_path)

# Fall back to the CPU when no CUDA device is available instead of crashing.
device = "cuda" if torch.cuda.is_available() else "cpu"

model = GPT2LMHeadModel.from_pretrained(model_path, pad_token_id=tokenizer.pad_token_id)
model = model.to(device)

# Simple interactive loop: read a prompt, generate a continuation, print it.
while True:
    try:
        input_text = input("请输入:")
    except (EOFError, KeyboardInterrupt):
        # Ctrl-D / Ctrl-C ends the session cleanly instead of with a traceback.
        print()
        break

    input_ids = tokenizer.encode(input_text, return_tensors="pt")
    input_ids = input_ids.to(device)

    # Beam search with 5 beams; max_length counts the prompt tokens as well.
    output = model.generate(input_ids, max_length=400, num_beams=5, repetition_penalty=1, early_stopping=True)

    output_text = tokenizer.decode(output[0], skip_special_tokens=True)
    print(f"输出:{output_text}")
相关推荐
京东云开发者9 小时前
DeepSeek冲击(含本地化部署实践)
人工智能
@国境以南,太阳以西9 小时前
基于Grad-CAM(Gradient-weighted Class Activation Mapping)的可解释性分析
人工智能·深度学习
AI人工智能+9 小时前
表格识别技术:通过计算机视觉和OCR,实现非结构化表格向结构化数据的转换,推动数字化转型。
人工智能·计算机视觉·ocr
小宁爱Python10 小时前
基于 Django+Vue3 的 AI 海报生成平台开发(海报模块专项)
人工智能·python·django
破烂儿10 小时前
基于机器学习的缓存准入策略研究
人工智能·机器学习·缓存
算法打盹中10 小时前
SimLingo:纯视觉框架下的自动驾驶视觉 - 语言 - 动作融合模型
人工智能·机器学习·计算机视觉·语言模型·自动驾驶
大嘴带你水论文11 小时前
震惊!仅用10张照片就能随意编辑3D人脸?韩国KAIST最新黑科技FFaceNeRF解析!
论文阅读·人工智能·python·科技·计算机视觉·3d·transformer
IT_陈寒11 小时前
🔥3分钟掌握JavaScript性能优化:从V8引擎原理到5个实战提速技巧
前端·人工智能·后端
格林威11 小时前
棱镜的技术加持:线扫相机如何同时拍RGB和SWIR?
人工智能·深度学习·数码相机·yolo·计算机视觉
JoinApper11 小时前
小白学OpenCV系列3-图像算数运算
人工智能·opencv·计算机视觉