After adding a dozen or so normal chat records on top of the 300+ abstract QQ空间 (Qzone) entries, it finally talks like a person.
🎯 能力展示:
⚠️ 春节到了想说什么?
, ˇ_ˇ 春节到了 瞬间长大~
⚠️ 朋友分享感情经历怎么回应?
, =_= 每个感情经历都是一个里程碑~
⚠️ 晚安
晚安~
🤔 开始对话吗? (y/n): y
==================================================
💬 与虚拟的你对话
提示:
- 直接问训练过的问题:得到准确回答
- 简单日常聊天:得到有风格的回复
- 复杂/专业问题:可能得到有趣但不准确的回答
- 输入 quit 退出
你: 你是
🎭: , =_= 原来我是根棍~
你: 你女朋友去哪了
🎭: 她去上班了~
你: 你数学学的好不
🎭: , 哼 还能说文艺事呢~
你: 傻瓜
🎭: , 瞬间长大,脑袋里装着'傻子'~
你: 傻子
🎭: , 傻逼
你: 你骂人
🎭: 擦擦擦 你是个坏女孩~
Added a dozen or so normal chat conversations to the dataset, for example:
{"conversations": [{"role": "user", "content": "破单位"}, {"role": "assistant", "content": "不是 我要是没辞职的话 是要去上班的"}]},
{"conversations": [{"role": "user", "content": "挺好 跟家歇着"}, {"role": "assistant", "content": "住那 吵不 是不是没水了 喊你打水 不打水就打你"}]},
{"conversations": [{"role": "user", "content": "就我们俩人 吵个屁"}, {"role": "assistant", "content": "半夜鸡叫 周扒皮"}]}
# ========== Cell 1:安装依赖库 ==========
import sys
import subprocess
def install_packages():
print("正在安装必要的库...")
packages = [
"transformers",
"datasets",
"peft",
"accelerate",
"sentencepiece",
"bitsandbytes",
"scikit-learn",
"einops"
]
for package in packages:
try:
subprocess.check_call([
sys.executable, "-m", "pip", "install",
package, "-q", "--no-deps"
])
print(f"✅ 安装: {package}")
except subprocess.CalledProcessError:
print(f"⚠️ 安装失败(可能已安装或被依赖冲突跳过): {package}")
print("\n安装完成!")
install_packages()
# ========== Cell 2:环境准备和导入 ==========
import os
import json
import torch
import random
import numpy as np
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')
# 设置随机种子以确保可重复性
def set_seed(seed=42):
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
if torch.cuda.is_available():
torch.cuda.manual_seed_all(seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
set_seed(42)
# ========== Cell 3:设备检查 ==========
print("🔍 环境检查和GPU设置...")
print(f"PyTorch版本: {torch.__version__}")
print(f"CUDA是否可用: {torch.cuda.is_available()}")
if torch.cuda.is_available():
print(f"可用GPU数量: {torch.cuda.device_count()}")
for i in range(torch.cuda.device_count()):
print(f" GPU {i}: {torch.cuda.get_device_name(i)}")
# 强制使用GPU 0
torch.cuda.set_device(0)
print(f"当前已设置为使用GPU: {torch.cuda.current_device()}")
else:
print("使用CPU训练")
print("✅ 环境准备完成")
# ========== Cell 4:数据加载 ==========
print("\n📂 加载数据...")
DATA_PATH = "/kaggle/input/jsonoooho/qqjson.json"
def load_and_analyze_data(data_path):
if not os.path.exists(data_path):
print(f"❌ 数据文件不存在: {data_path}")
for root, dirs, files in os.walk("/kaggle/input"):
for file in files:
if file.endswith('.json'):
print(f"找到: {os.path.join(root, file)}")
data_path = os.path.join(root, file)
break
print(f"📄 加载数据: {data_path}")
with open(data_path, 'r', encoding='utf-8') as f:
raw_data = json.load(f)
print(f"✅ 成功加载 {len(raw_data)} 条数据")
if len(raw_data) == 0:
raise ValueError("数据文件为空")
print("\n📊 数据样例 (前2条):")
for i in range(min(2, len(raw_data))):
print(f" [{i+1}] {raw_data[i]}")
return raw_data
raw_data = load_and_analyze_data(DATA_PATH)
# ========== Cell 5:模型和Tokenizer配置 ==========
print("\n🤖 配置模型...")
MODEL_NAME = "Qwen/Qwen2.5-0.5B-Instruct"
print(f"选择模型: {MODEL_NAME}")
from transformers import AutoTokenizer, AutoModelForCausalLM
print("加载tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(
MODEL_NAME,
trust_remote_code=True,
padding_side="right"
)
if tokenizer.pad_token is None:
tokenizer.pad_token = tokenizer.eos_token
print(f"✅ Tokenizer已加载")
# ========== Cell 6:数据预处理 ==========
print("\n🔄 处理训练数据...")
def prepare_training_data_from_conversations(raw_data, tokenizer):
formatted_examples = []
print(f"处理 {len(raw_data)} 条数据...")
processed_count = 0
for i, item in enumerate(tqdm(raw_data, desc="格式化数据")):
try:
if 'conversations' not in item:
continue
conversations = item['conversations']
if len(conversations) < 2:
continue
if conversations[0]['role'] != 'user':
if len(conversations) >= 2:
if conversations[0]['role'] == 'assistant' and conversations[1]['role'] == 'user':
conversations = [conversations[1], conversations[0]]
formatted_text = tokenizer.apply_chat_template(
conversations,
tokenize=False,
add_generation_prompt=False
)
formatted_examples.append({"text": formatted_text})
processed_count += 1
except Exception as e:
continue
print(f"📊 数据预处理完成: {processed_count} 条")
return formatted_examples
formatted_data = prepare_training_data_from_conversations(raw_data, tokenizer)
from datasets import Dataset
dataset = Dataset.from_list(formatted_data)
print(f"✅ 数据集创建完成: {len(dataset)} 条样本")
def tokenize_function(examples, max_length=256):
tokenized = tokenizer(
examples["text"],
truncation=True,
padding="max_length",
max_length=max_length,
return_tensors=None
)
tokenized["labels"] = tokenized["input_ids"].copy()
return tokenized
print("应用Tokenization...")
tokenized_dataset = dataset.map(
lambda x: tokenize_function(x, max_length=256),
batched=True,
batch_size=32,
remove_columns=["text"]
)
split_dataset = tokenized_dataset.train_test_split(
test_size=0.1,
seed=42,
shuffle=True
)
train_dataset = split_dataset["train"]
eval_dataset = split_dataset["test"]
print(f"📋 数据集分割:")
print(f" 🏋️ 训练集: {len(train_dataset)} 条")
print(f" 🧪 验证集: {len(eval_dataset)} 条")
# ========== Cell 7:配置强化LoRA和模型加载 ==========
print("\n🤖 配置强化版LoRA...")
from peft import LoraConfig, get_peft_model
# ★★★ 强化LoRA配置 ★★★
print("配置强化LoRA参数...")
lora_config = LoraConfig(
task_type="CAUSAL_LM",
r=32, # ★ 增加秩,提升表达能力
lora_alpha=64, # ★ 对应缩放系数
lora_dropout=0.1,
target_modules=[
# 基础注意力层
"q_proj", "k_proj", "v_proj", "o_proj",
# ★★★ 新增MLP层!(真正改进对话逻辑)
"gate_proj", "up_proj", "down_proj",
# ★★★ 新增lm_head!(直接改善输出质量)
"lm_head"
],
bias="lora_only",
inference_mode=False,
)
print(f"✨ 强化LoRA配置:")
print(f" - 秩(r): {lora_config.r} (原来是8)")
print(f" - 缩放参数: {lora_config.lora_alpha}")
print(f" - 目标模块: {len(lora_config.target_modules)}个")
# 🚨 关键修改:强制使用GPU 0,避免跨设备错误
print("\n加载基础模型到GPU 0...")
model = AutoModelForCausalLM.from_pretrained(
MODEL_NAME,
torch_dtype=torch.float16,
device_map="cuda:0" if torch.cuda.is_available() else "auto", # 强制GPU 0
trust_remote_code=True,
use_cache=False,
)
# 🚨 额外确保模型在当前GPU上
if torch.cuda.is_available():
model = model.to("cuda")
print(f"✅ 模型强制移动到: {model.device}")
else:
print("✅ 使用CPU")
print(f"基础模型设备: {next(model.parameters()).device}")
# 应用强化LoRA适配器
print("\n应用强化LoRA适配器...")
model = get_peft_model(model, lora_config)
# ★★★ 打印更强的可训练参数
print("\n🔍 模型参数统计:")
model.print_trainable_parameters()
# 验证配置
print("\n🔍 模型配置验证:")
print(f" 是Peft模型: {hasattr(model, 'peft_config')}")
model.train()
print("✅ 模型已设置为训练模式")
# ========== Cell 8:训练参数配置 ==========
print("\n⚙️ 配置强化训练参数...")
from transformers import TrainingArguments
import math
num_train_samples = len(train_dataset)
print(f"训练样本数: {num_train_samples}")
# 批次大小配置
per_device_batch_size = 2
gradient_accumulation_steps = 4
effective_batch_size = per_device_batch_size * gradient_accumulation_steps
steps_per_epoch = math.ceil(num_train_samples / effective_batch_size)
total_epochs = 8 # ★ 增加到8轮配合强化LoRA
max_steps = steps_per_epoch * total_epochs
print(f"📊 训练计算:")
print(f" - 每设备批次: {per_device_batch_size}")
print(f" - 梯度累积步数: {gradient_accumulation_steps}")
print(f" - 有效批次大小: {effective_batch_size}")
print(f" - 每epoch步数: {steps_per_epoch}")
print(f" - 总训练轮数: {total_epochs}")
print(f" - 总训练步数: ~{max_steps}")
# ★★★ 配合强化LoRA的训练参数 ★★★
training_args = TrainingArguments(
output_dir="./qwen_qq_enhanced", # ★ 改名字区分
# 训练循环
num_train_epochs=total_epochs,
per_device_train_batch_size=per_device_batch_size,
per_device_eval_batch_size=per_device_batch_size,
gradient_accumulation_steps=gradient_accumulation_steps,
# ★ 优化器设置(配合强化LoRA)
learning_rate=2e-4, # ★ 提高学习率
weight_decay=0.01,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-8,
max_grad_norm=1.0,
# 学习率调度
lr_scheduler_type="cosine",
warmup_ratio=0.1,
# 评估和保存
eval_strategy="steps",
eval_steps=min(50, steps_per_epoch // 2),
save_strategy="steps",
save_steps=min(100, steps_per_epoch),
save_total_limit=3,
# 日志
logging_strategy="steps",
logging_steps=10,
logging_first_step=True,
report_to="none", # Kaggle中禁用外部报告
# 🚨 解决FP16梯度错误:自动处理
fp16=False,
bf16=True,
fp16_full_eval=False,
bf16_full_eval=True,
# 其他关键参数
dataloader_drop_last=True,
remove_unused_columns=True,
load_best_model_at_end=True,
metric_for_best_model="eval_loss",
greater_is_better=False,
)
print(f"\n✅ 训练参数配置完成:")
print(f" 输出目录: {training_args.output_dir}")
print(f" 学习率: {training_args.learning_rate}")
print(f" 训练轮数: {training_args.num_train_epochs}")
print(f" FP16训练: {training_args.fp16}")
estimated_time_minutes = max_steps * 1.5 / 60
print(f" 预计训练时间: ~{estimated_time_minutes:.1f} 分钟")
# ========== Cell 9:创建训练器 ==========
print("\n🎪 创建训练器...")
from transformers import Trainer, DataCollatorForLanguageModeling
import time
data_collator = DataCollatorForLanguageModeling(
tokenizer=tokenizer,
mlm=False,
pad_to_multiple_of=8,
)
trainer = Trainer(
model=model,
args=training_args,
train_dataset=train_dataset,
eval_dataset=eval_dataset,
data_collator=data_collator,
)
print(f"✅ 训练器创建完成!")
print(f" 模型参数量: {sum(p.numel() for p in model.parameters()):,}")
print(f" 可训练参数: {sum(p.numel() for p in model.parameters() if p.requires_grad):,}")
# ========== Cell 10:开始训练(带错误处理) ==========
print("\n" + "="*60)
print("🚀 开始模型训练!")
print("="*60)
try:
# 开始训练
trainer.train()
print("🎉 训练完成!")
except Exception as e:
print(f"❌ 训练出错: {e}")
if "Attempting to unscale FP16 gradients" in str(e):
print("\n🔧 检测到FP16梯度错误,尝试无FP16训练...")
# 备选方案:禁用FP16
training_args.fp16 = False
trainer = Trainer(
model=model,
args=training_args,
train_dataset=train_dataset,
eval_dataset=eval_dataset,
data_collator=data_collator,
)
try:
trainer.train()
print("🎉 无FP16训练完成!")
except Exception as e2:
print(f"❌ 备选训练也失败: {e2}")
import traceback
traceback.print_exc()
raise
else:
import traceback
traceback.print_exc()
raise
# ========== Cell 11:保存模型(修正路径) ==========
print("\n💾 保存模型...")
try:
# 🚨 关键:保存路径不要用"./"开头
save_path = "my_qq_virtual_self_enhanced" # ✅ 正确路径
# 保存模型
trainer.save_model(save_path)
tokenizer.save_pretrained(save_path)
print(f"✅ 模型已保存到: {save_path}")
print(f"📁 保存目录内容:")
import os
if os.path.exists(save_path):
for file in os.listdir(save_path):
size = os.path.getsize(os.path.join(save_path, file))
print(f" - {file} ({size:,} bytes)")
print("\n🎯 模型保存完成!")
except Exception as e:
print(f"❌ 保存失败: {e}")
import traceback
traceback.print_exc()
# ========== Cell 12:测试训练结果(修正加载方法) ==========
print("\n" + "="*60)
print("🧪 测试训练结果")
print("="*60)
from peft import PeftModel
def test_trained_model():
try:
save_path = "my_qq_virtual_self_enhanced"
print(f"加载保存的LoRA适配器: {save_path}")
# ✅ 正确方法:先加载基础模型,再加载LoRA
print("1. 加载基础Qwen模型...")
base_model = AutoModelForCausalLM.from_pretrained(
MODEL_NAME,
torch_dtype=torch.float16,
device_map="auto",
trust_remote_code=True
)
print("2. 加载LoRA适配器...")
trained_model = PeftModel.from_pretrained(
base_model,
save_path,
adapter_name="qq_virtual"
)
print("3. 加载Tokenizer...")
trained_tokenizer = AutoTokenizer.from_pretrained(
save_path,
trust_remote_code=True
)
if trained_tokenizer.pad_token is None:
trained_tokenizer.pad_token = trained_tokenizer.eos_token
print("✅ 模型加载成功!")
# 设置为评估模式
trained_model.eval()
# 测试几个问题
test_questions = [
"春节到了想说什么?",
"你好",
"今天心情怎么样?",
]
print("\n📝 测试对话:")
for question in test_questions:
print(f"\n你: {question}")
messages = [{"role": "user", "content": question}]
text = trained_tokenizer.apply_chat_template(
messages,
tokenize=False,
add_generation_prompt=True
)
inputs = trained_tokenizer(text, return_tensors="pt").to(trained_model.device)
with torch.no_grad():
outputs = trained_model.generate(
**inputs,
max_new_tokens=80,
temperature=0.7,
do_sample=True,
pad_token_id=trained_tokenizer.eos_token_id
)
response = trained_tokenizer.decode(
outputs[0][inputs.input_ids.shape[1]:],
skip_special_tokens=True
)
print(f"🎭: {response}")
return trained_model, trained_tokenizer
except Exception as e:
print(f"❌ 测试失败: {e}")
import traceback
traceback.print_exc()
return None, None
# 运行测试
model, tokenizer = test_trained_model()
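Optional follow-up, not in the original notebook: if you would rather ship one standalone checkpoint that loads without peft, the LoRA adapter returned above can be merged into the base weights with PeftModel.merge_and_unload(); the output directory name below is just a placeholder.
# merge the adapter into the base model and save a standalone copy (sketch)
if model is not None and tokenizer is not None:
    merged_model = model.merge_and_unload()   # folds the low-rank updates into the original weights
    merged_model.save_pretrained("my_qq_virtual_self_merged")
    tokenizer.save_pretrained("my_qq_virtual_self_merged")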
# ========== Cell 13:本地部署助手包装器 ==========
# ========== 完整打包:你的虚拟助手 ==========
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig
class YourVirtualSelf:
"""你的虚拟自我完整版"""
def __init__(self, model_path="./my_qq_virtual_self_enhanced"):
print(f"🎭 加载虚拟的你...")
try:
self.model = AutoModelForCausalLM.from_pretrained(
model_path,
torch_dtype=torch.float16,
device_map="auto",
trust_remote_code=True
)
self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
if self.tokenizer.pad_token is None:
self.tokenizer.pad_token = self.tokenizer.eos_token
print("✅ 加载成功!")
print(" 特点:= _ =,简洁回复,有点小情绪")
# 功能演示
self.demo()
except Exception as e:
print(f"❌ 加载失败: {e}")
self.model = None
def demo(self):
"""展示能力"""
print("\n🎯 能力展示:")
demos = [
("春节到了想说什么?", "春节 快乐。。。"),
("朋友分享感情经历怎么回应?", "别逗比了=_="),
("晚安", "午梦佳人~"),
]
for q, expected in demos:
response = self.chat(q, temperature=0.1)
match = "✅" if expected in response else "⚠️"
print(f" {match} {q}")
print(f" {response}")
def chat(self, message, temperature=0.6):
"""基本聊天"""
if self.model is None:
return "模型未加载"
messages = [{"role": "user", "content": message}]
text = self.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = self.tokenizer(text, return_tensors="pt").to(self.model.device)
generation_config = GenerationConfig(
max_new_tokens=70,
temperature=temperature,
top_p=0.95,
do_sample=True,
repetition_penalty=1.01,
pad_token_id=self.tokenizer.eos_token_id,
)
with torch.no_grad():
outputs = self.model.generate(**inputs, generation_config=generation_config)
response = self.tokenizer.decode(outputs[0][inputs.input_ids.shape[1]:], skip_special_tokens=True)
return response.strip()
def interactive(self):
"""交互式聊天"""
if self.model is None:
print("无法启动聊天")
return
print("\n" + "="*50)
print("💬 与虚拟的你对话")
print("提示:")
print(" - 直接问训练过的问题:得到准确回答")
print(" - 简单日常聊天:得到有风格的回复")
print(" - 复杂/专业问题:可能得到有趣但不准确的回答")
print(" - 输入 quit 退出")
print("="*50)
history = []
while True:
try:
user_input = input("\n你: ").strip()
if user_input.lower() in ['quit', 'exit', '退出', 'bye']:
print("🎭 下次见~ =_=")
break
if not user_input:
continue
# 简单历史管理
if len(history) > 4:
history = history[-2:] # 只保留最近一轮
response = self.chat(user_input)
print(f"🎭: {response}")
# 记录对话
history.extend([user_input, response])
except KeyboardInterrupt:
print("\n\n对话结束")
break
except Exception as e:
print(f"错误: {e}")
# 一键启动你的虚拟助手
print("🚀 准备启动你的虚拟自我助手...")
my_virtual_self = YourVirtualSelf()
if my_virtual_self.model is not None:
if input("\n🤔 开始对话吗? (y/n): ").lower() == 'y':
my_virtual_self.interactive()
else:
print("✅ 模型已就绪!随时可以对话。")
# 快速测试几个问题
print("\n📝 快速测试:")
test_q = input("问一个问题测试: ").strip()
if test_q:
response = my_virtual_self.chat(test_q)
print(f"🎭 回答: {response}")
else:
print("❌ 无法加载模型")
print("\n" + "="*50)
print("🎉 完成!你的个性化AI助手已准备就绪")
print(" 特点:有记忆、有个性、不同于标准AI")
print(" 使用方法:直接对话即可")
print("="*50)