BERT的中文问答系统55

为了使日历中的月份和星期显示为中文，需要对 tkcalendar 做一些配置。tkcalendar 的 Calendar 控件提供了 locale 参数，用于设置月份和星期的显示语言；将其设为 zh_CN 即可显示中文。

以下是完善后的代码，包括日历中月份和星期显示为中文的部分：

完善后的代码

Python 代码如下：

import os
import json
import jsonlines
import torch
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from transformers import BertModel, BertTokenizer
import tkinter as tk
from tkinter import filedialog, messagebox, ttk, simpledialog
import logging
from difflib import SequenceMatcher
from datetime import datetime
import requests
from bs4 import BeautifulSoup
import tkcalendar

# Project root: the directory that contains this file.
PROJECT_ROOT = os.path.dirname(os.path.abspath(__file__))

# Logging directory: <PROJECT_ROOT>/logs, created at import time if it is missing.
LOGS_DIR = os.path.join(PROJECT_ROOT, 'logs')
os.makedirs(LOGS_DIR, exist_ok=True)

def setup_logging():
    """Configure root logging to write to a timestamped file and to the console.

    The log file is created inside ``LOGS_DIR`` and named after the current
    local time. It is opened as UTF-8 explicitly because the messages logged
    by this program contain Chinese text, which the platform's default
    encoding is not guaranteed to represent.
    """
    # Keep the non-ASCII suffix out of the strftime format string: the
    # timestamp is formatted with ASCII only and the suffix is appended after.
    timestamp = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
    log_file = os.path.join(LOGS_DIR, timestamp + '_羲和.txt')
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(levelname)s - %(message)s',
        handlers=[
            logging.FileHandler(log_file, encoding='utf-8'),
            logging.StreamHandler()
        ]
    )

setup_logging()

# Dataset class
class XihuaDataset(Dataset):
    """Dataset of question / human-answer / ChatGPT-answer records.

    Records are read from a ``.jsonl`` file (one JSON value per line) or a
    ``.json`` file (one JSON document). Each record is read through the keys
    ``question``, ``human_answers`` and ``chatgpt_answers``; a missing or empty
    field is treated as the empty string.
    """

    def __init__(self, file_path, tokenizer, max_length=128):
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.data = self.load_data(file_path)

    def load_data(self, file_path):
        """Load records from *file_path*.

        Invalid lines of a ``.jsonl`` file are logged and skipped, blank lines
        are ignored. An unparsable ``.json`` file is logged and yields an empty
        list, as does any other file extension.
        """
        data = []
        if file_path.endswith('.jsonl'):
            # utf-8-sig reads plain UTF-8 and also tolerates a leading BOM.
            with open(file_path, 'r', encoding='utf-8-sig') as f:
                for line_no, line in enumerate(f, start=1):
                    line = line.strip()
                    if not line:
                        continue
                    try:
                        data.append(json.loads(line))
                    except json.JSONDecodeError as e:
                        logging.warning(f"跳过无效行 {line_no}: {e}")
        elif file_path.endswith('.json'):
            with open(file_path, 'r', encoding='utf-8-sig') as f:
                try:
                    data = json.load(f)
                except json.JSONDecodeError as e:
                    logging.warning(f"跳过无效文件 {file_path}: {e}")
        return data

    def __len__(self):
        return len(self.data)

    @staticmethod
    def _first_answer(item, key):
        """Return the first answer stored under *key*, or '' if there is none."""
        answers = item.get(key)
        if not answers:
            return ''
        if isinstance(answers, str):
            return answers
        return answers[0]

    def _encode(self, text):
        """Tokenize *text* into fixed-length (padded / truncated) tensors."""
        return self.tokenizer(
            text,
            return_tensors='pt',
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
        )

    def __getitem__(self, idx):
        """Return the tokenized record at *idx*.

        If the record cannot be processed, the following records are tried in
        order (wrapping around). The search is bounded by the dataset size.

        Raises:
            IndexError: if *idx* is out of range.
            RuntimeError: if no record in the dataset can be processed.
        """
        # Index first so an out-of-range idx raises the usual IndexError.
        item = self.data[idx]
        size = len(self.data)
        current = idx
        for _ in range(size):
            try:
                human_answer = self._first_answer(item, 'human_answers')
                chatgpt_answer = self._first_answer(item, 'chatgpt_answers')
                inputs = self._encode(item.get('question', ''))
                human_inputs = self._encode(human_answer)
                chatgpt_inputs = self._encode(chatgpt_answer)
            except Exception as e:
                logging.warning(f"跳过无效项 {current}: {e}")
                current = (current + 1) % size
                item = self.data[current]
                continue

            # squeeze(0) drops only the batch dimension added by return_tensors='pt'.
            return {
                'input_ids': inputs['input_ids'].squeeze(0),
                'attention_mask': inputs['attention_mask'].squeeze(0),
                'human_input_ids': human_inputs['input_ids'].squeeze(0),
                'human_attention_mask': human_inputs['attention_mask'].squeeze(0),
                'chatgpt_input_ids': chatgpt_inputs['input_ids'].squeeze(0),
                'chatgpt_attention_mask': chatgpt_inputs['attention_mask'].squeeze(0),
                'human_answer': human_answer,
                'chatgpt_answer': chatgpt_answer
            }
        raise RuntimeError("no record in the dataset could be tokenized")

# Data loader factory
def get_data_loader(file_path, tokenizer, batch_size=8, max_length=128):
    """Build a shuffling DataLoader over the XihuaDataset read from *file_path*."""
    return DataLoader(
        XihuaDataset(file_path, tokenizer, max_length),
        batch_size=batch_size,
        shuffle=True,
    )

# Model definition
class XihuaModel(torch.nn.Module):
    """BERT encoder followed by a linear head that emits one logit per input."""

    def __init__(self, pretrained_model_name):
        super().__init__()
        encoder = BertModel.from_pretrained(pretrained_model_name)
        self.bert = encoder
        self.classifier = torch.nn.Linear(encoder.config.hidden_size, 1)

    def forward(self, input_ids, attention_mask):
        """Return the classifier logits computed from BERT's pooled output."""
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        return self.classifier(encoded.pooler_output)

# Training function
def train(model, data_loader, optimizer, criterion, device, progress_var=None):
    """Run one training epoch and return the mean loss.

    For every batch the human answers are scored against label 1 and the
    ChatGPT answers against label 0; the two losses are summed and
    back-propagated. A batch that raises is logged and skipped.

    Args:
        model: module called as ``model(input_ids, attention_mask)``.
        data_loader: sized iterable of batches (dicts of tensors).
        optimizer: optimizer stepping ``model``'s parameters.
        criterion: loss called as ``criterion(logits, labels)``.
        device: device the batch tensors are moved to.
        progress_var: optional object with ``set(percent)``; receives the
            percentage of batches visited so far.

    Returns:
        Mean loss over the batches that were processed successfully, or 0.0
        when no batch was processed (empty loader or every batch failed).
    """
    model.train()
    total_loss = 0.0
    processed = 0
    num_batches = len(data_loader)
    for batch_idx, batch in enumerate(data_loader):
        try:
            human_input_ids = batch['human_input_ids'].to(device)
            human_attention_mask = batch['human_attention_mask'].to(device)
            chatgpt_input_ids = batch['chatgpt_input_ids'].to(device)
            chatgpt_attention_mask = batch['chatgpt_attention_mask'].to(device)

            optimizer.zero_grad()
            human_logits = model(human_input_ids, human_attention_mask)
            chatgpt_logits = model(chatgpt_input_ids, chatgpt_attention_mask)

            # Labels share the logits' shape, dtype and device.
            human_labels = torch.ones_like(human_logits)
            chatgpt_labels = torch.zeros_like(chatgpt_logits)

            loss = criterion(human_logits, human_labels) + criterion(chatgpt_logits, chatgpt_labels)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            processed += 1
        except Exception as e:
            logging.warning(f"跳过无效批次: {e}")

        # Explicit None check: do not depend on the truthiness of the variable
        # object. Progress advances for skipped batches as well.
        if progress_var is not None:
            progress_var.set((batch_idx + 1) / num_batches * 100)

    # Average over processed batches only, so a skipped batch does not
    # artificially lower the reported loss; guard against division by zero.
    return total_loss / processed if processed else 0.0

# Model evaluation function
def evaluate_model(model, data_loader, device):
    """Return the classification accuracy of *model* over *data_loader*.

    A human answer counts as correct when sigmoid(logit) > 0.5, a ChatGPT
    answer when it is not. The model is put into eval mode and gradients are
    disabled for the whole pass.

    Returns:
        Fraction of correct predictions in [0, 1]; 0.0 when the loader yields
        no samples.
    """
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for batch in data_loader:
            human_logits = model(
                batch['human_input_ids'].to(device),
                batch['human_attention_mask'].to(device),
            )
            chatgpt_logits = model(
                batch['chatgpt_input_ids'].to(device),
                batch['chatgpt_attention_mask'].to(device),
            )

            human_positive = torch.sigmoid(human_logits) > 0.5
            chatgpt_positive = torch.sigmoid(chatgpt_logits) > 0.5

            # Human answers are expected positive, ChatGPT answers negative.
            correct += human_positive.sum().item() + (~chatgpt_positive).sum().item()
            total += human_logits.size(0) + chatgpt_logits.size(0)

    # An empty loader would otherwise divide by zero.
    if total == 0:
        return 0.0
    return correct / total

# Web search function
def search_baidu(query):
    """Return the text of the first result abstract of a Baidu search for *query*.

    The query is passed through ``params`` so that requests URL-encodes it
    (Chinese text, spaces, '&', '#', ...). The request is bounded by a timeout;
    network or HTTP errors are logged and reported as "nothing found".

    Returns:
        The stripped abstract text, or "没有找到相关信息" when there is no
        abstract or the request failed.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
    }
    try:
        response = requests.get(
            "https://www.baidu.com/s",
            params={'wd': query},
            headers=headers,
            timeout=10,
        )
        response.raise_for_status()
    except requests.RequestException as e:
        logging.warning(f"百度搜索失败: {e}")
        return "没有找到相关信息"

    soup = BeautifulSoup(response.text, 'html.parser')
    first_abstract = soup.find('div', class_='c-abstract')
    if first_abstract is not None:
        return first_abstract.get_text().strip()
    return "没有找到相关信息"

# Baidu Baike search function
def search_baidu_baike(query):
    """Return the Baidu Baike page description for the entry named *query*.

    The entry name is percent-encoded before it is placed in the URL path. The
    request is bounded by a timeout; network or HTTP errors are logged and
    reported as "nothing found".

    Returns:
        The ``content`` of the page's ``<meta name="description">`` tag, or
        "没有找到相关信息" when it is absent or empty or the request failed.
    """
    from urllib.parse import quote

    url = f"https://baike.baidu.com/item/{quote(query, safe='')}"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
    }
    try:
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
    except requests.RequestException as e:
        logging.warning(f"百度百科查询失败: {e}")
        return "没有找到相关信息"

    soup = BeautifulSoup(response.text, 'html.parser')
    meta_description = soup.find('meta', attrs={'name': 'description'})
    # .get avoids a KeyError when the tag exists but has no content attribute.
    content = meta_description.get('content') if meta_description is not None else None
    if content:
        return content
    return "没有找到相关信息"

# GUI界面
class XihuaChatbotGUI:
    def __init__(self, root):
        """Build the chatbot window on *root*: load models and data, then create the widgets."""
        self.root = root
        self.root.title("羲和聊天机器人")

        self.language = tk.StringVar(value='zh')
        self.tokenizer = None
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.models = {}
        self.current_model_type = None

        self.load_models()
        # Select the general chat model up front so that self.model exists even
        # if the user evaluates before asking a first question.
        self.select_model('聊天')
        self.load_data()

        # Conversation history
        self.history = []

        self.create_widgets()

    def create_widgets(self):
        """Create and lay out all widgets: top bar, chat area, control buttons and calendar."""
        # Widget styles
        style = ttk.Style()
        style.theme_use('clam')
        style.configure('TButton', font=('Arial', 12), padding=10)
        style.configure('TLabel', font=('Arial', 12), padding=10)
        style.configure('TEntry', font=('Arial', 12), padding=10)
        style.configure('TText', font=('Arial', 12), padding=10)

        # Top frame: clock, language selector, question entry, answer button
        top_frame = ttk.Frame(self.root)
        top_frame.pack(pady=10)

        self.date_label = ttk.Label(top_frame, text="", font=("Arial", 12))
        self.date_label.grid(row=0, column=0, padx=10)
        self.update_date_label()

        language_frame = ttk.Frame(top_frame)
        language_frame.grid(row=0, column=1, padx=10)

        language_label = ttk.Label(language_frame, text="选择语言:", font=("Arial", 12))
        language_label.grid(row=0, column=0, padx=10)

        language_menu = ttk.Combobox(language_frame, textvariable=self.language, values=['zh', 'en'], state='readonly')
        language_menu.grid(row=0, column=1, padx=10)
        language_menu.bind('<<ComboboxSelected>>', self.change_language)

        self.question_label = ttk.Label(top_frame, text="问题:", font=("Arial", 12))
        self.question_label.grid(row=0, column=2, padx=10)

        self.question_entry = ttk.Entry(top_frame, width=50, font=("Arial", 12))
        self.question_entry.grid(row=0, column=3, padx=10)

        self.answer_button = ttk.Button(top_frame, text="获取回答", command=self.get_answer, style='TButton')
        self.answer_button.grid(row=0, column=4, padx=10)

        # Middle frame: chat transcript
        middle_frame = ttk.Frame(self.root)
        middle_frame.pack(pady=10)

        self.chat_text = tk.Text(middle_frame, height=20, width=100, font=("Arial", 12), wrap='word')
        self.chat_text.grid(row=0, column=0, padx=10, pady=10)
        self.chat_text.tag_configure("user", justify='right', foreground='blue')
        self.chat_text.tag_configure("xihua", justify='left', foreground='green')

        # Bottom frame: action buttons, progress bar and log output
        bottom_frame = ttk.Frame(self.root)
        bottom_frame.pack(pady=10)

        self.clear_button = ttk.Button(bottom_frame, text="清空聊天记录", command=self.clear_chat, style='TButton')
        self.clear_button.grid(row=0, column=0, padx=10)

        self.correct_button = ttk.Button(bottom_frame, text="准确", command=self.mark_correct, style='TButton')
        self.correct_button.grid(row=0, column=1, padx=10)

        self.incorrect_button = ttk.Button(bottom_frame, text="不准确", command=self.mark_incorrect, style='TButton')
        self.incorrect_button.grid(row=0, column=2, padx=10)

        self.train_button = ttk.Button(bottom_frame, text="训练模型", command=self.train_model, style='TButton')
        self.train_button.grid(row=0, column=3, padx=10)

        self.retrain_button = ttk.Button(bottom_frame, text="重新训练模型", command=lambda: self.train_model(retrain=True), style='TButton')
        self.retrain_button.grid(row=0, column=4, padx=10)

        self.progress_var = tk.DoubleVar()
        self.progress_bar = ttk.Progressbar(bottom_frame, variable=self.progress_var, maximum=100, length=200, mode='determinate')
        self.progress_bar.grid(row=1, column=0, columnspan=5, pady=10)

        self.log_text = tk.Text(bottom_frame, height=10, width=70, font=("Arial", 12))
        self.log_text.grid(row=2, column=0, columnspan=5, pady=10)

        self.evaluate_button = ttk.Button(bottom_frame, text="评估模型", command=self.evaluate_model, style='TButton')
        self.evaluate_button.grid(row=3, column=0, padx=10, pady=10)

        self.history_button = ttk.Button(bottom_frame, text="查看历史记录", command=self.view_history, style='TButton')
        self.history_button.grid(row=3, column=1, padx=10, pady=10)

        self.save_history_button = ttk.Button(bottom_frame, text="保存历史记录", command=self.save_history, style='TButton')
        self.save_history_button.grid(row=3, column=2, padx=10, pady=10)

        self.help_button = ttk.Button(bottom_frame, text="使用说明", command=self.show_help, style='TButton')
        self.help_button.grid(row=3, column=3, padx=10, pady=10)

        # Calendar frame
        calendar_frame = ttk.Frame(self.root)
        calendar_frame.pack(pady=10)

        # Take one snapshot of "now" so year, month and day always belong to
        # the same date, even if the clock rolls over while this line runs.
        today = datetime.now()
        self.calendar = tkcalendar.Calendar(calendar_frame, selectmode='day', locale='zh_CN', year=today.year, month=today.month, day=today.day)
        self.calendar.pack(pady=10)

    def update_date_label(self):
        current_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        self.date_label.config(text=f"当前时间: {current_time}")
        self.root.after(1000, self.update_date_label)

    def clear_chat(self):
        self.chat_text.delete(1.0, tk.END)

    def get_answer(self):
        question = self.question_entry.get()
        if not question:
            messagebox.showwarning("输入错误", "请输入问题")
            return

        # 自动选择模型
        model_type = self.detect_model_type(question)
        self.select_model(model_type)

        if self.tokenizer is None:
            self.tokenizer = BertTokenizer.from_pretrained(self.get_pretrained_model_name())

        inputs = self.tokenizer(question, return_tensors='pt', padding='max_length', truncation=True, max_length=128)
        with torch.no_grad():
            input_ids = inputs['input_ids'].to(self.device)
            attention_mask = inputs['attention_mask'].to(self.device)
            logits = self.model(input_ids, attention_mask)
        
        if logits.item() > 0:
            answer_type = "羲和回答"
        else:
            answer_type = "零回答"

        specific_answer = self.get_specific_answer(question, answer_type)

        self.chat_text.insert(tk.END, f"用户: {question}\n", "user")
        self.chat_text.insert(tk.END, f"羲和: {specific_answer}\n", "xihua")

        # 添加到历史记录
        self.history.append({
            'question': question,
            'answer_type': answer_type,
            'specific_answer': specific_answer,
            'accuracy': None,  # 初始状态为未评价
            'baidu_baike': None  # 初始状态为无百度百科结果
        })

    def get_specific_answer(self, question, answer_type):
        # 使用模糊匹配查找最相似的问题
        best_match = None
        best_ratio = 0.0
        for item in self.data:
            ratio = SequenceMatcher(None, question, item['question']).ratio()
            if ratio > best_ratio:
                best_ratio = ratio
                best_match = item

        if best_match:
            if answer_type == "羲和回答":
                return best_match['human_answers'][0]
            else:
                return best_match['chatgpt_answers'][0]
        return "这个我也不清楚，你问问零吧"

    def load_data(self):
        """Load the answer records from data/train_data.jsonl under the project root into self.data.

        A missing file is logged and leaves ``self.data`` empty instead of
        aborting application start-up.
        """
        data_path = os.path.join(PROJECT_ROOT, 'data', 'train_data.jsonl')
        if not os.path.exists(data_path):
            logging.warning(f"没有找到数据文件 {data_path}")
            self.data = []
            return
        self.data = self.load_data_from_file(data_path)

    def load_data_from_file(self, file_path):
        data = []
        if file_path.endswith('.jsonl'):
            with jsonlines.open(file_path) as reader:
                for i, item in enumerate(reader):
                    try:
                        data.append(item)
                    except jsonlines.jsonlines.InvalidLineError as e:
                        logging.warning(f"跳过无效行 {i + 1}: {e}")
        elif file_path.endswith('.json'):
            with open(file_path, 'r') as f:
                try:
                    data = json.load(f)
                except json.JSONDecodeError as e:
                    logging.warning(f"跳过无效文件 {file_path}: {e}")
        return data

    def load_models(self):
        """Fill self.models with one XihuaModel per topic for the current language.

        A topic whose weight file exists under <PROJECT_ROOT>/models gets those
        weights loaded; every other topic gets a model built from the
        pretrained checkpoint only. The tokenizer is created if it is unset.
        """
        models_dir = os.path.join(PROJECT_ROOT, 'models')
        topics = (
            '历史', '聊天', '娱乐', '电脑', '军事', '汽车', '植物', '科技',
            '名人', '生活', '法律', '企业', '标准',
        )
        for topic in topics:
            weights_file = os.path.join(models_dir, f'xihua_model_{topic}_{self.language.get()}.pth')
            if not os.path.exists(weights_file):
                logging.info(f"没有找到 {topic} 模型，将使用预训练模型")
                self.models[topic] = XihuaModel(pretrained_model_name=self.get_pretrained_model_name()).to(self.device)
                continue

            net = XihuaModel(pretrained_model_name=self.get_pretrained_model_name()).to(self.device)
            net.load_state_dict(torch.load(weights_file, map_location=self.device))
            self.models[topic] = net
            logging.info(f"加载 {topic} 模型")

        if self.tokenizer is None:
            self.tokenizer = BertTokenizer.from_pretrained(self.get_pretrained_model_name())

    def get_pretrained_model_name(self):
        if self.language.get() == 'zh':
            return 'F:/models/bert-base-chinese'
        elif self.language.get() == 'en':
            return 'bert-base-uncased'
        return 'bert-base-uncased'

    def select_model(self, model_type):
        if model_type in self.models:
            self.model = self.models[model_type]
            self.current_model_type = model_type
            logging.info(f"选择 {model_type} 模型")
        else:
            logging.warning(f"没有找到 {model_type} 模型，使用默认模型")
            self.model = XihuaModel(pretrained_model_name=self.get_pretrained_model_name()).to(self.device)
            self.current_model_type = None

    def detect_model_type(self, question):
        """Map *question* to a topic name by keyword lookup.

        Topics are checked in the order listed; the first topic with a keyword
        contained in *question* wins. Without any hit the topic is '聊天'.
        """
        keyword_table = (
            ('历史', ("皇帝", "朝代")),
            ('娱乐', ("娱乐",)),
            ('电脑', ("电脑",)),
            ('军事', ("军事",)),
            ('汽车', ("汽车",)),
            ('植物', ("植物",)),
            ('科技', ("科技",)),
            ('名人', ("名人",)),
            ('生活', ("生活", "出行", "菜品", "菜谱", "居家")),
            ('法律', ("法律",)),
            ('企业', ("企业",)),
            ('标准', ("标准",)),
        )
        for topic, keywords in keyword_table:
            if any(word in question for word in keywords):
                return topic
        return '聊天'

    def change_language(self, event):
        self.language = event.widget.get()
        self.load_models()
        self.load_data()

    def train_model(self, retrain=False):
        """Train the topic model that matches a user-selected data file.

        The topic is detected from the chosen file path. With ``retrain=True``
        the previously saved weights for that topic are loaded first. Training
        runs for at most 30 epochs, saves the weights whenever the epoch loss
        improves, and stops early after 5 epochs without improvement. Any
        failure is logged and shown in an error dialog.
        """
        file_path = filedialog.askopenfilename(filetypes=[("JSONL files", "*.jsonl"), ("JSON files", "*.json")])
        if not file_path:
            messagebox.showwarning("文件选择错误", "请选择一个有效的数据文件")
            return

        model_type = self.detect_model_type(file_path)
        self.select_model(model_type)

        try:
            dataset = XihuaDataset(file_path, self.tokenizer)
            # Refuse to "train" (and save a checkpoint) on a file without records.
            if len(dataset) == 0:
                raise ValueError("数据文件中没有有效数据")
            data_loader = DataLoader(dataset, batch_size=8, shuffle=True)

            models_dir = os.path.join(PROJECT_ROOT, 'models')
            model_path = os.path.join(models_dir, f'xihua_model_{model_type}_{self.language.get()}.pth')

            # Start from the previously trained weights when retraining.
            if retrain:
                self.model.load_state_dict(torch.load(model_path, map_location=self.device))
                self.model.to(self.device)
                self.model.train()

            # torch.save does not create missing directories.
            os.makedirs(models_dir, exist_ok=True)

            optimizer = torch.optim.Adam(self.model.parameters(), lr=1e-5)
            criterion = torch.nn.BCEWithLogitsLoss()
            num_epochs = 30
            best_loss = float('inf')
            patience = 5
            no_improvement_count = 0

            for epoch in range(num_epochs):
                train_loss = train(self.model, data_loader, optimizer, criterion, self.device, self.progress_var)
                logging.info(f'第 {epoch+1} 轮次, 损失: {train_loss:.10f}')
                self.log_text.insert(tk.END, f'第 {epoch+1} 轮次, 损失: {train_loss:.10f}\n')
                self.log_text.see(tk.END)
                # Training runs inside the Tk callback; flush pending redraws so
                # the log and progress bar are refreshed between epochs.
                self.root.update_idletasks()

                if train_loss < best_loss:
                    best_loss = train_loss
                    no_improvement_count = 0
                    torch.save(self.model.state_dict(), model_path)
                    logging.info("模型保存")
                else:
                    no_improvement_count += 1
                    if no_improvement_count >= patience:
                        logging.info("早停机制触发，停止训练")
                        break

            # Leave the model in inference mode for subsequent questions.
            self.model.eval()

            logging.info("模型训练完成并保存")
            self.log_text.insert(tk.END, "模型训练完成并保存\n")
            self.log_text.see(tk.END)
            messagebox.showinfo("训练完成", "模型训练完成并保存")
        except Exception as e:
            logging.error(f"模型训练失败: {e}")
            self.log_text.insert(tk.END, f"模型训练失败: {e}\n")
            self.log_text.see(tk.END)
            messagebox.showerror("训练失败", f"模型训练失败: {e}")

    def evaluate_model(self):
        """Evaluate the active model on data/test_data.jsonl under the project root and report the accuracy.

        If no model has been selected yet, the general chat model is selected
        first. Failures (for example a missing test file) are logged and shown
        in an error dialog, mirroring train_model.
        """
        # self.model is only assigned by select_model(); make sure it exists.
        if getattr(self, 'model', None) is None:
            self.select_model('聊天')

        test_path = os.path.join(PROJECT_ROOT, 'data', 'test_data.jsonl')
        try:
            test_data_loader = get_data_loader(test_path, self.tokenizer, batch_size=8, max_length=128)
            # Inside the method body this name resolves to the module-level function.
            accuracy = evaluate_model(self.model, test_data_loader, self.device)
        except Exception as e:
            logging.error(f"模型评估失败: {e}")
            self.log_text.insert(tk.END, f"模型评估失败: {e}\n")
            self.log_text.see(tk.END)
            messagebox.showerror("评估失败", f"模型评估失败: {e}")
            return

        logging.info(f"模型评估准确率: {accuracy:.4f}")
        self.log_text.insert(tk.END, f"模型评估准确率: {accuracy:.4f}\n")
        self.log_text.see(tk.END)
        messagebox.showinfo("评估结果", f"模型评估准确率: {accuracy:.4f}")

    def mark_correct(self):
        if self.history:
            self.history[-1]['accuracy'] = True
            messagebox.showinfo("评价成功", "您认为这次回答是准确的")

    def mark_incorrect(self):
        if self.history:
            self.history[-1]['accuracy'] = False
            question = self.history[-1]['question']
            self.show_reference_options(question)

    def show_reference_options(self, question):
        """Open a window that lets the user pick a reference source for *question*."""
        window = tk.Toplevel(self.root)
        window.title("参考答案")

        prompt = ttk.Label(window, text="请选择参考答案来源:", font=("Arial", 12))
        prompt.pack(pady=10)

        def fetch_from_baike():
            # Look the question up on Baidu Baike when the button is pressed.
            self.get_reference_answer(question, 'baidu_baike')

        baike_button = ttk.Button(window, text="百度百科", command=fetch_from_baike, style='TButton')
        baike_button.pack(pady=5)

    def get_reference_answer(self, question, source):
        """Fetch a reference answer for *question* from *source* and confirm with a dialog.

        For source 'baidu_baike' the result is appended to the chat transcript
        and stored on the latest history entry. The confirmation dialog is
        shown for every source value.
        """
        wants_baike = source == 'baidu_baike'
        if wants_baike:
            result = self.search_baidu_baike(question)
            self.chat_text.insert(tk.END, f"百度百科结果: {result}\n", "xihua")
            latest = self.history[-1]
            latest['baidu_baike'] = result

        messagebox.showinfo("参考答案", f"已获取{source}的结果")

    def search_baidu_baike(self, query):
        """Delegate to the module-level search_baidu_baike function and return its result."""
        summary = search_baidu_baike(query)
        return summary

    def view_history(self):
        """Open a window listing every history entry with its rating and optional Baike result."""
        window = tk.Toplevel(self.root)
        window.title("历史记录")

        viewer = tk.Text(window, height=20, width=80, font=("Arial", 12))
        viewer.pack(padx=10, pady=10)

        for record in self.history:
            lines = [
                f"问题: {record['question']}\n",
                f"回答类型: {record['answer_type']}\n",
                f"具体回答: {record['specific_answer']}\n",
            ]

            # None means the answer has not been rated yet.
            rating = record['accuracy']
            if rating is None:
                lines.append("评价: 未评价\n")
            elif rating:
                lines.append("评价: 准确\n")
            else:
                lines.append("评价: 不准确\n")

            if record['baidu_baike']:
                lines.append(f"百度百科结果: {record['baidu_baike']}\n")
            lines.append("-" * 50 + "\n")

            for line in lines:
                viewer.insert(tk.END, line)

    def save_history(self):
        """Write the history to <PROJECT_ROOT>/records as a text file and as a JSON file.

        Both files share one timestamp in their names. The JSON records use the
        question / human_answers / chatgpt_answers layout; the answer is stored
        under the list matching its answer type and the other list is empty.
        """
        RECORDS_DIR = os.path.join(PROJECT_ROOT, 'records')
        os.makedirs(RECORDS_DIR, exist_ok=True)

        # One timestamp for both files so the pair always carries the same name.
        timestamp = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
        file_path = os.path.join(RECORDS_DIR, f'{timestamp}.txt')
        json_file_path = os.path.join(RECORDS_DIR, f'{timestamp}.json')

        with open(file_path, 'w', encoding='utf-8') as f:
            for entry in self.history:
                f.write(f"用户: {entry['question']}\n")
                f.write(f"羲和: {entry['specific_answer']}\n")
                if entry['baidu_baike']:
                    f.write(f"百度百科结果: {entry['baidu_baike']}\n")
                f.write("-" * 50 + "\n")

        # JSON export
        json_records = [
            {
                "question": entry['question'],
                "human_answers": [entry['specific_answer']] if entry['answer_type'] == "羲和回答" else [],
                "chatgpt_answers": [entry['specific_answer']] if entry['answer_type'] == "零回答" else [],
                "baidu_baike": entry['baidu_baike']
            }
            for entry in self.history
        ]

        with open(json_file_path, 'w', encoding='utf-8') as f:
            json.dump(json_records, f, ensure_ascii=False, indent=4)

        messagebox.showinfo("保存成功", f"历史记录已保存到 {file_path} 和 {json_file_path}")

    def show_help(self):
        """Open a window that displays the usage instructions."""
        instructions = """
        使用说明:
        1. 在"问题"输入框中输入您的问题。
        2. 点击"获取回答"按钮，羲和将为您提供答案。
        3. 如果您认为回答准确，请点击"准确"按钮；如果不准确，请点击"不准确"按钮。
        4. 点击"查看历史记录"按钮可以查看之前的聊天记录。
        5. 点击"保存历史记录"按钮可以将聊天记录保存到文件。
        6. 点击"训练模型"或"重新训练模型"按钮可以对模型进行训练或重新训练。
        7. 点击"评估模型"按钮可以评估模型的准确率。
        8. 点击"使用说明"按钮可以查看此帮助信息。
        """
        window = tk.Toplevel(self.root)
        window.title("使用说明")
        ttk.Label(window, text=instructions, font=("Arial", 12), justify='left').pack(padx=10, pady=10)

# Script entry point
if __name__ == "__main__":
    # Create the Tk root window, build the chatbot GUI on it and enter the event loop.
    root = tk.Tk()
    app = XihuaChatbotGUI(root)
    root.mainloop()

关键修改点

日历设置：

在 create_widgets 方法中，创建 tkcalendar.Calendar 时添加了 locale='zh_CN' 参数，以确保月份和星期显示为中文。

其他部分：

确保所有必要的初始化步骤都已执行，特别是 tokenizer 和 models 的初始化。

示例数据集内容

假设你的数据集文件名为 train_data.jsonl（程序启动时默认从项目根目录下的 data/train_data.jsonl 读取），内容如下：

JSONL 示例：

{"question": "谁是明朝的第一个皇帝？", "human_answers": ["朱元璋是明朝的第一个皇帝。"], "chatgpt_answers": ["朱棣是明朝的第一个皇帝。"]}
{"question": "如何制作巧克力蛋糕？", "human_answers": ["首先准备巧克力、糖、面粉等材料，然后混合搅拌，最后放入烤箱烘烤。"], "chatgpt_answers": ["首先准备鸡蛋、牛奶、面粉等材料，然后混合搅拌，最后放入烤箱烘烤。"]}
{"question": "什么是人工智能？", "human_answers": ["人工智能是指由计算机系统所表现出的智能行为。"], "chatgpt_answers": ["人工智能是指机器模拟人类智能的能力。"]}
{"question": "世界上最高的山峰是什么？", "human_answers": ["珠穆朗玛峰是世界上最高的山峰。"], "chatgpt_answers": ["喜马拉雅山脉的珠穆朗玛峰是世界上最高的山峰。"]}

说明

数据集格式：每个条目都是一个 JSON 对象，包含 question、human_answers 和 chatgpt_answers 三个字段。

训练模型：使用 train_model 方法选择数据文件并训练模型。

评估模型：使用 evaluate_model 方法评估模型的准确率（测试数据默认从项目根目录下的 data/test_data.jsonl 读取）。

日历显示：日历中的月份和星期现在显示为中文。

希望这些改进能帮助你顺利运行和测试你的项目。如果有任何其他问题，请随时告诉我！