利用DS制作了一个简易的数据分析程序
支持加载excel或CSV文档的数据进行常见的数据分析及可视化展示
支持常见数据清洗:移除重复值、处理缺失值、类型转换、规整小数、修改列名等
支持常见的统计分析、查看基本的数据信息
支持自定义X、Y轴数据的图表展示，包括柱状图、折线图、直方图、饼图、散点图、箱线图、热力图、雷达图，并可将图表保存到指定位置
支持经过清洗后的数据保存到excel文件


python
# -*- coding: utf-8 -*-
"""
Created on 2025-11-7 14:30:28
@author: oldhen
"""
import tkinter as tk
from tkinter import ttk, filedialog, messagebox
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.backends.backend_tkagg import FigureCanvasTkAgg
import seaborn as sns
import numpy as np
import requests
import threading
import time
from datetime import datetime
#import json
import matplotlib.font_manager as fm # 添加字体管理器
class DataAnalysisApp:
# 设置图形样式
sns.set_style("whitegrid")
def __init__(self, root):
    """Configure the main window, reset application state and build the UI.

    Args:
        root: The Tk root window that hosts the application.
    """
    self.root = root
    root.title("智能数据分析助手")
    root.geometry("1200x800")

    # No data is loaded yet; both the working frame and its backup are empty.
    self.df = None
    self.original_df = None

    # API polling configuration (interval is expressed in seconds).
    self.api_url = None
    self.api_auto_update = False
    self.api_update_interval = 60

    # Per-column sort state used by the column sorting feature.
    self.sort_states = {}

    # Fonts first, then ttk styles, and finally the widgets themselves.
    for step in (self.setup_chinese_font, self.setup_styles, self.create_widgets):
        step()
def setup_chinese_font(self):
    """Configure matplotlib so that Chinese labels render correctly.

    A default font list is installed first; it is then narrowed to the first
    Chinese font that matplotlib's font manager actually reports.

    Returns:
        bool: True when the configuration ran without raising, False otherwise.
    """
    try:
        # Step 1: optimistic defaults.
        plt.rcParams['font.sans-serif'] = ['SimHei', 'Microsoft YaHei', 'DejaVu Sans']
        plt.rcParams['axes.unicode_minus'] = False

        # Step 2: prefer a font that is really installed.
        available_fonts = {f.name for f in fm.fontManager.ttflist}
        for font in ('SimHei', 'Microsoft YaHei', 'KaiTi', 'SimSun'):
            if font in available_fonts:
                print(f"找到中文字体: {font}")
                plt.rcParams['font.sans-serif'] = [font, 'DejaVu Sans']
                break
        else:
            print("未找到中文字体,使用默认字体")

        # Step 3: refresh the font cache where the private helper still exists.
        # Calling a missing ``fm._rebuild`` would raise AttributeError and make
        # this method report failure even though the fonts were set above.
        rebuild = getattr(fm, "_rebuild", None)
        if callable(rebuild):
            rebuild()
    except Exception as e:
        print(f"字体设置失败: {e}")
        # Callers may fall back to English labels when this returns False.
        return False
    return True
def setup_styles(self):
    """Define the colour palette and register the ttk styles used by the UI."""
    # Theme colours.
    self.bg_color = "#f0f0f0"
    self.frame_bg = "#ffffff"
    self.accent_color = "#4a6fa5"
    self.highlight_color = "#6b8cbc"

    body_font = ("Arial", 10)
    heading_font = ("Arial", 12, "bold")

    style = ttk.Style()
    style.configure("TFrame", background=self.bg_color)
    style.configure("TButton", background=self.accent_color, foreground="black", font=body_font)
    # The remaining styles only differ in their font.
    for style_name, font in (
        ("TLabel", body_font),
        ("Title.TLabel", heading_font),
        ("TRadiobutton", body_font),
    ):
        style.configure(style_name, background=self.bg_color, font=font)
def center_window(self, window, parent=None):
    """Move ``window`` so that it is centred over ``parent``.

    Args:
        window: The window to reposition.
        parent: The window to centre over; defaults to the application root.
    """
    # Make sure the geometry information is up to date before reading it.
    window.update_idletasks()
    anchor = self.root if parent is None else parent

    free_width = anchor.winfo_width() - window.winfo_width()
    free_height = anchor.winfo_height() - window.winfo_height()
    left = anchor.winfo_x() + free_width // 2
    top = anchor.winfo_y() + free_height // 2

    # Only the position is set; the size of the window is left untouched.
    window.geometry(f"+{left}+{top}")
def create_widgets(self):
    """Assemble the three stacked regions of the main window."""
    container = ttk.Frame(self.root, padding="10")
    container.pack(fill=tk.BOTH, expand=True)

    # Region 1: data import/export controls.
    self.create_data_control_frame(container)

    # Region 2: cleaning (left) and simple analysis (right) side by side.
    middle_row = ttk.Frame(container)
    middle_row.pack(fill=tk.X, pady=(0, 10))
    self.create_data_cleaning_frame(middle_row)
    self.create_simple_analysis_frame(middle_row)

    # Region 3: chart options together with the data/chart preview.
    self.create_visualization_frame(container)
def create_data_control_frame(self, parent):
    """Build the "数据控制" toolbar with the import, API and export buttons.

    Args:
        parent: Container widget the toolbar is packed into.
    """
    toolbar = ttk.LabelFrame(parent, text="数据控制", padding="10")
    toolbar.pack(fill=tk.X, pady=(0, 10))

    # Button caption -> handler, in left-to-right display order.
    actions = {
        "导入CSV文件": self.load_csv,
        "导入Excel文件": self.load_excel,
        "从API获取数据": self.get_api_data,
        "设置API自动更新": self.set_api_auto_update,
        "停止自动更新": self.stop_auto_update,
        "保存到Excel文件": self.save_to_excel,
    }
    for caption, handler in actions.items():
        ttk.Button(toolbar, text=caption, command=handler).pack(side=tk.LEFT, padx=5)
def create_data_cleaning_frame(self, parent):
    """Build the "数据清洗" button group, docked on the left of ``parent``.

    Args:
        parent: Container widget shared with the simple-analysis group.
    """
    group = ttk.LabelFrame(parent, text="数据清洗", padding="10")
    group.pack(side=tk.LEFT, fill=tk.BOTH, expand=False, padx=(0, 5))

    # Button caption -> handler, in left-to-right display order.
    actions = {
        "移除重复值": self.remove_duplicates,
        "处理缺失值": self.handle_missing_values,
        "数据类型转换": self.convert_data_types,
        "规整小数位": self.round_decimal_places,
        "修改列名": self.rename_column,
        "重置数据": self.reset_data,
    }
    for caption, handler in actions.items():
        ttk.Button(group, text=caption, command=handler).pack(side=tk.LEFT, padx=2)
def create_simple_analysis_frame(self, parent):
    """Build the "简单分析" button group, docked on the right of ``parent``.

    Args:
        parent: Container widget shared with the data-cleaning group.
    """
    group = ttk.LabelFrame(parent, text="简单分析", padding="10")
    group.pack(side=tk.RIGHT, fill=tk.BOTH, expand=True, padx=(5, 0))

    # Button caption -> handler, in left-to-right display order.
    actions = {
        "显示数据信息": self.show_data_info,
        "查看统计信息": self.show_statistics,
        "相关分析": self.correlation_analysis,
    }
    for caption, handler in actions.items():
        ttk.Button(group, text=caption, command=handler).pack(side=tk.LEFT, padx=2)
def create_visualization_frame(self, parent):
    """Create the bottom region: chart options on the left, previews on the right.

    Args:
        parent: Container widget the region is packed into.
    """
    region = ttk.Frame(parent)
    region.pack(fill=tk.BOTH, expand=True)

    # The options panel must be built first so it is packed before the preview.
    for build in (self.create_viz_options_frame, self.create_preview_frame):
        build(region)
def create_viz_options_frame(self, parent):
    """Build the "可视化选项" panel: chart type, axis/group columns and toggles.

    Args:
        parent: Container widget the panel is packed into (left side).
    """
    panel = ttk.LabelFrame(parent, text="可视化选项", padding="10")
    panel.pack(side=tk.LEFT, fill=tk.BOTH, expand=True, padx=(0, 10))

    def add_heading(caption, pady):
        """Pack a left-aligned section heading into the panel."""
        ttk.Label(panel, text=caption, style="Title.TLabel").pack(anchor=tk.W, pady=pady)

    def add_readonly_combo(variable):
        """Pack a read-only combobox bound to ``variable`` and return it."""
        combo = ttk.Combobox(panel, textvariable=variable, state="readonly")
        combo.pack(fill=tk.X, pady=(0, 10))
        return combo

    # Chart type: one radio button per supported chart.
    add_heading("图表类型:", (0, 5))
    self.chart_type = tk.StringVar(value="柱状图")
    for chart_name in ("柱状图", "折线图", "散点图", "饼图", "箱线图", "热力图", "直方图", "雷达图"):
        ttk.Radiobutton(panel, text=chart_name, variable=self.chart_type,
                        value=chart_name).pack(anchor=tk.W)

    # X axis column.
    add_heading("X轴数据:", (10, 5))
    self.x_axis_var = tk.StringVar()
    self.x_axis_combo = add_readonly_combo(self.x_axis_var)

    # Y axis column.
    add_heading("Y轴数据:", (0, 5))
    self.y_axis_var = tk.StringVar()
    self.y_axis_combo = add_readonly_combo(self.y_axis_var)

    # Optional grouping column.
    add_heading("分组数据(可选):", (0, 5))
    self.group_var = tk.StringVar(value="无")
    self.group_combo = add_readonly_combo(self.group_var)

    # Display toggles.
    toggles = ttk.LabelFrame(panel, text="显示选项", padding="5")
    toggles.pack(fill=tk.X, pady=10)

    self.limit_categories = tk.BooleanVar(value=True)
    ttk.Checkbutton(
        toggles,
        text="限制X轴类别数量 (最多10个)",
        variable=self.limit_categories,
    ).pack(anchor=tk.W, pady=2)

    self.show_all_data = tk.BooleanVar(value=False)
    ttk.Checkbutton(
        toggles,
        text="显示所有数据 (可能影响性能)",
        variable=self.show_all_data,
        command=self.update_data_preview,  # refresh the preview when toggled
    ).pack(anchor=tk.W, pady=2)

    # Generate / save buttons.
    actions = ttk.Frame(panel)
    actions.pack(fill=tk.X, pady=10)
    ttk.Button(actions, text="生成图表", command=self.generate_chart).pack(side=tk.LEFT, padx=(0, 10))
    ttk.Button(actions, text="保存图表", command=self.save_chart).pack(side=tk.LEFT)
def create_preview_frame(self, parent):
    """Build the right-hand column: data table on top, chart underneath.

    Args:
        parent: Container widget the column is packed into (right side).
    """
    column = ttk.Frame(parent)
    column.pack(side=tk.RIGHT, fill=tk.BOTH, expand=True)

    # Upper half: tabular data preview.
    table_box = ttk.LabelFrame(column, text="数据预览", padding="10")
    table_box.pack(fill=tk.BOTH, expand=True, pady=(0, 10))
    self.create_data_treeview(table_box)

    # Lower half: chart display.
    chart_box = ttk.LabelFrame(column, text="图表展示", padding="10")
    chart_box.pack(fill=tk.BOTH, expand=True)
    self.create_chart_area(chart_box)
def create_data_treeview(self, parent):
    """Create the data preview ``Treeview`` with a vertical scrollbar.

    Args:
        parent: Container widget that receives the table and its scrollbar.
    """
    scrollbar = ttk.Scrollbar(parent)
    scrollbar.pack(side=tk.RIGHT, fill=tk.Y)

    self.data_tree = ttk.Treeview(parent, yscrollcommand=scrollbar.set)
    self.data_tree.pack(fill=tk.BOTH, expand=True)

    # Wire the scrollbar back to the tree so dragging it scrolls the rows.
    scrollbar.config(command=self.data_tree.yview)
def create_chart_area(self, parent):
    """Create the matplotlib canvas and show a placeholder message on it.

    Args:
        parent: Container widget that hosts the chart frame.
    """
    self.chart_frame = ttk.Frame(parent)
    self.chart_frame.pack(fill=tk.BOTH, expand=True)

    # Start with an empty figure that only carries a hint for the user.
    self.fig, self.ax = plt.subplots(figsize=(8, 6))
    placeholder_style = {
        "horizontalalignment": 'center',
        "verticalalignment": 'center',
        "transform": self.ax.transAxes,
        "fontsize": 14,
    }
    self.ax.text(0.5, 0.5, "请导入数据并选择图表类型", **placeholder_style)
    self.ax.set_xticks([])
    self.ax.set_yticks([])

    # Embed the figure into the Tk frame and render it once.
    self.canvas = FigureCanvasTkAgg(self.fig, self.chart_frame)
    self.canvas.get_tk_widget().pack(fill=tk.BOTH, expand=True)
    self.canvas.draw()
# 新增功能:相关分析
def correlation_analysis(self):
    """Open the correlation window when at least two numeric columns exist."""
    if self.df is None:
        messagebox.showwarning("警告", "请先导入数据")
        return

    # A correlation needs two or more numeric columns to be meaningful.
    numeric_count = len(self.df.select_dtypes(include=[np.number]).columns)
    if numeric_count < 2:
        messagebox.showwarning("警告", "相关分析需要至少两个数值列")
        return

    self.create_correlation_window()
def create_correlation_window(self):
    """Show a modal window listing the correlation matrix of numeric columns."""
    window = tk.Toplevel(self.root)
    window.title("相关分析")
    window.geometry("600x500")
    window.resizable(True, True)

    # Modal, attached to the main window and centred over it.
    window.transient(self.root)
    window.grab_set()
    self.center_window(window)

    body = ttk.Frame(window, padding="10")
    body.pack(fill=tk.BOTH, expand=True)

    ttk.Label(
        body,
        text="以下显示各数值列之间的相关系数矩阵:",
        wraplength=550,
    ).pack(pady=(0, 10))

    # Table area with vertical and horizontal scrollbars.
    table_area = ttk.Frame(body)
    table_area.pack(fill=tk.BOTH, expand=True, pady=10)

    y_bar = ttk.Scrollbar(table_area)
    y_bar.pack(side=tk.RIGHT, fill=tk.Y)
    x_bar = ttk.Scrollbar(table_area, orient=tk.HORIZONTAL)
    x_bar.pack(side=tk.BOTTOM, fill=tk.X)

    table = ttk.Treeview(table_area, yscrollcommand=y_bar.set, xscrollcommand=x_bar.set)
    table.pack(fill=tk.BOTH, expand=True)
    y_bar.config(command=table.yview)
    x_bar.config(command=table.xview)

    # Correlations are computed over the numeric columns only.
    correlation_matrix = self.df.select_dtypes(include=[np.number]).corr()

    # First column holds the row name, followed by one column per variable.
    headers = ['列名', *correlation_matrix.columns]
    table["columns"] = headers
    table["show"] = "headings"
    for header in headers:
        table.heading(header, text=header)
        table.column(header, width=80, anchor=tk.CENTER)

    for row_name, coefficients in correlation_matrix.iterrows():
        cells = [row_name, *(f"{val:.4f}" for val in coefficients)]
        table.insert("", "end", values=cells)

    # Buttons are packed right-to-left: close first, then save.
    buttons = ttk.Frame(body)
    buttons.pack(fill=tk.X, pady=10)
    ttk.Button(buttons, text="关闭", command=window.destroy).pack(side=tk.RIGHT, padx=5)
    ttk.Button(
        buttons,
        text="保存相关性矩阵",
        command=lambda: self.save_correlation_matrix(correlation_matrix),
    ).pack(side=tk.RIGHT, padx=5)
def save_correlation_matrix(self, correlation_matrix):
    """Ask for a destination and write the correlation matrix to it.

    Args:
        correlation_matrix: DataFrame holding the correlation coefficients.
    """
    file_path = filedialog.asksaveasfilename(
        defaultextension=".csv",
        filetypes=[("CSV files", "*.csv"), ("Excel files", "*.xlsx")]
    )
    if not file_path:
        # The user cancelled the dialog.
        return
    try:
        # Compare the extension case-insensitively so "X.CSV" is still
        # written as CSV instead of being handed to the Excel writer.
        if file_path.lower().endswith('.csv'):
            correlation_matrix.to_csv(file_path)
        else:
            correlation_matrix.to_excel(file_path)
        messagebox.showinfo("成功", f"相关性矩阵已保存到: {file_path}")
    except Exception as e:
        messagebox.showerror("错误", f"保存文件时出错: {str(e)}")
# 数据控制功能
def load_csv(self):
    """Let the user pick a CSV file and load it as the working data set."""
    chosen = filedialog.askopenfilename(filetypes=[("CSV files", "*.csv")])
    if not chosen:
        # Dialog cancelled.
        return
    try:
        self.df = pd.read_csv(chosen)
        # Keep an untouched copy next to the working frame.
        self.original_df = self.df.copy()
        self.update_data_preview()
        self.update_column_combos()
        messagebox.showinfo("成功", "CSV文件导入成功!")
    except Exception as e:
        messagebox.showerror("错误", f"导入CSV文件时出错: {str(e)}")
def load_excel(self):
    """Let the user pick an Excel workbook and load it as the working data set."""
    chosen = filedialog.askopenfilename(filetypes=[("Excel files", "*.xlsx *.xls")])
    if not chosen:
        # Dialog cancelled.
        return
    try:
        self.df = pd.read_excel(chosen)
        # Keep an untouched copy next to the working frame.
        self.original_df = self.df.copy()
        self.update_data_preview()
        self.update_column_combos()
        messagebox.showinfo("成功", "Excel文件导入成功!")
    except Exception as e:
        messagebox.showerror("错误", f"导入Excel文件时出错: {str(e)}")
def get_api_data(self):
    """Prompt for an API URL and load its JSON response as the data set.

    The request runs on the UI thread, so it is bounded by a timeout to
    keep an unresponsive server from freezing the window indefinitely.
    """
    request_timeout = 10  # seconds

    # A custom dialog is used instead of simpledialog.
    dialog = tk.Toplevel(self.root)
    dialog.title("API数据")
    dialog.geometry("400x150")
    dialog.resizable(False, False)
    dialog.transient(self.root)
    dialog.grab_set()
    self.center_window(dialog)

    ttk.Label(dialog, text="请输入API URL:").pack(pady=10)
    url_var = tk.StringVar()
    url_entry = ttk.Entry(dialog, textvariable=url_var, width=50)
    url_entry.pack(pady=5, padx=20, fill=tk.X)

    def on_ok():
        """Close the dialog and fetch the data from the entered URL."""
        # Surrounding whitespace (e.g. from a paste) is not part of the URL.
        url = url_var.get().strip()
        dialog.destroy()
        if not url:
            return
        self.api_url = url
        try:
            response = requests.get(url, timeout=request_timeout)
            if response.status_code == 200:
                # The API is assumed to return JSON that DataFrame accepts.
                data = response.json()
                self.df = pd.DataFrame(data)
                self.original_df = self.df.copy()
                self.update_data_preview()
                self.update_column_combos()
                messagebox.showinfo("成功", "API数据获取成功!")
            else:
                messagebox.showerror("错误", f"API请求失败,状态码: {response.status_code}")
        except Exception as e:
            messagebox.showerror("错误", f"获取API数据时出错: {str(e)}")

    def on_cancel():
        """Dismiss the dialog without fetching anything."""
        dialog.destroy()

    button_frame = ttk.Frame(dialog)
    button_frame.pack(pady=10)
    ttk.Button(button_frame, text="确定", command=on_ok).pack(side=tk.LEFT, padx=5)
    ttk.Button(button_frame, text="取消", command=on_cancel).pack(side=tk.LEFT, padx=5)

    # Pressing Enter behaves like clicking the confirm button.
    dialog.bind('<Return>', lambda e: on_ok())
    url_entry.focus_set()
def set_api_auto_update(self):
    """Ask for a polling interval and start the background auto-update.

    Only one polling thread is kept alive: when a previous one is still
    running, the new interval is stored and that thread is reused instead
    of starting a second one that would fetch the API concurrently.
    """
    if not self.api_url:
        messagebox.showwarning("警告", "请先设置API URL")
        return

    dialog = tk.Toplevel(self.root)
    dialog.title("自动更新设置")
    dialog.geometry("400x150")
    dialog.resizable(False, False)
    dialog.transient(self.root)
    dialog.grab_set()
    self.center_window(dialog)

    ttk.Label(dialog, text="请输入更新间隔(秒):").pack(pady=10)
    interval_var = tk.StringVar(value="60")
    interval_entry = ttk.Entry(dialog, textvariable=interval_var, width=20)
    interval_entry.pack(pady=5)

    def on_ok():
        """Validate the interval and (re)enable automatic updates."""
        try:
            interval = int(interval_var.get())
        except ValueError:
            # The dialog stays open so the value can be corrected.
            messagebox.showerror("错误", "请输入有效的数字")
            return
        dialog.destroy()
        if interval <= 0:
            messagebox.showerror("错误", "更新间隔必须大于0")
            return

        self.api_update_interval = interval
        self.api_auto_update = True
        # The attribute does not exist until the first thread is started.
        worker = getattr(self, "auto_update_thread", None)
        if worker is None or not worker.is_alive():
            self.auto_update_thread = threading.Thread(target=self.auto_update_data, daemon=True)
            self.auto_update_thread.start()
        messagebox.showinfo("成功", f"已设置自动更新,间隔: {interval}秒")

    def on_cancel():
        """Dismiss the dialog without changing the settings."""
        dialog.destroy()

    button_frame = ttk.Frame(dialog)
    button_frame.pack(pady=10)
    ttk.Button(button_frame, text="确定", command=on_ok).pack(side=tk.LEFT, padx=5)
    ttk.Button(button_frame, text="取消", command=on_cancel).pack(side=tk.LEFT, padx=5)

    # Pressing Enter behaves like clicking the confirm button.
    dialog.bind('<Return>', lambda e: on_ok())
    interval_entry.focus_set()
    interval_entry.select_range(0, tk.END)
def stop_auto_update(self):
    """Clear the auto-update flag and confirm the change to the user."""
    # The polling loop reads this flag to decide whether to keep running.
    self.api_auto_update = False
    messagebox.showinfo(title="成功", message="已停止自动更新")
def auto_update_data(self):
    """Poll the API while auto-update is enabled (runs in a worker thread).

    Only the network request happens on this thread. Every change to the
    data frame and to the widgets is handed to the Tk main loop through
    ``root.after``, because Tk widgets must not be touched from other
    threads.
    """
    request_timeout = 10  # seconds; keeps a hung server from blocking forever

    def apply_update(frame):
        """Install the freshly fetched frame and refresh the UI (main thread)."""
        self.df = frame
        self.update_data_preview()
        self.update_column_combos()
        messagebox.showinfo("自动更新", f"数据已自动更新于 {datetime.now().strftime('%H:%M:%S')}")

    while self.api_auto_update:
        time.sleep(self.api_update_interval)
        if not self.api_auto_update:
            # Auto-update was switched off while this thread was sleeping.
            break
        try:
            response = requests.get(self.api_url, timeout=request_timeout)
            if response.status_code == 200:
                frame = pd.DataFrame(response.json())
                # Bind the frame as a default so the callback keeps this value.
                self.root.after(0, lambda frame=frame: apply_update(frame))
        except Exception as e:
            # Build the text now: ``e`` is unbound once the except block ends,
            # so a lambda reading it later would fail with NameError.
            message = f"自动更新数据时出错: {str(e)}"
            self.root.after(0, lambda message=message: messagebox.showerror("自动更新错误", message))
def save_to_excel(self):
    """Write the current data set to an Excel file chosen by the user.

    A small progress window is shown while the file is written; it is
    closed in a ``finally`` block so a failed write cannot leave it open.
    """
    if self.df is None:
        messagebox.showwarning("警告", "没有数据可保存")
        return
    try:
        file_path = filedialog.asksaveasfilename(
            defaultextension=".xlsx",
            filetypes=[
                ("Excel files", "*.xlsx"),
                ("Excel 97-2003 files", "*.xls"),
                ("All files", "*.*")
            ],
            title="保存数据到Excel"
        )
        if not file_path:
            # The user cancelled the dialog.
            return

        progress_window = tk.Toplevel(self.root)
        try:
            progress_window.title("保存中...")
            progress_window.geometry("300x100")
            progress_window.resizable(False, False)
            progress_window.transient(self.root)
            self.center_window(progress_window)
            ttk.Label(progress_window, text="正在保存数据,请稍候...").pack(pady=20)
            # Force a redraw so the message is visible during the blocking write.
            progress_window.update()

            self.df.to_excel(file_path, index=False)
        finally:
            progress_window.destroy()

        messagebox.showinfo("成功", f"数据已成功保存到:\n{file_path}")
    except Exception as e:
        messagebox.showerror("错误", f"保存Excel文件时出错: {str(e)}")
# 数据清洗功能
def show_data_info(self):
    """Show the shape, per-column dtypes and missing-value counts in a dialog."""
    if self.df is None:
        messagebox.showwarning("警告", "请先导入数据")
        return

    # Assemble the report piece by piece and join it once at the end.
    pieces = [f"数据形状: {self.df.shape}\n\n", "列信息:\n"]
    pieces.extend(f"- {col}: {self.df[col].dtype}\n" for col in self.df.columns)
    pieces.append(f"\n缺失值统计:\n{self.df.isnull().sum()}")
    messagebox.showinfo("数据信息", "".join(pieces))
# 新增功能:查看均值等统计信息
def show_statistics(self):
    """Open a modal window showing descriptive statistics for a chosen column.

    The labels are refreshed whenever the selected column changes. The
    change notification uses ``trace_add("write", ...)``, the current
    replacement for the deprecated ``trace("w", ...)``.
    """
    if self.df is None:
        messagebox.showwarning("警告", "请先导入数据")
        return

    stats_window = tk.Toplevel(self.root)
    stats_window.title("统计信息")
    stats_window.geometry("500x450")
    stats_window.resizable(False, False)
    stats_window.transient(self.root)
    stats_window.grab_set()
    self.center_window(stats_window)

    main_frame = ttk.Frame(stats_window, padding="15")
    main_frame.pack(fill=tk.BOTH, expand=True)

    # Column selector.
    column_frame = ttk.Frame(main_frame)
    column_frame.pack(fill=tk.X, pady=10)
    ttk.Label(column_frame, text="选择列:").pack(side=tk.LEFT, padx=(0, 10))
    self.selected_column_stats = tk.StringVar()
    column_combo = ttk.Combobox(column_frame, textvariable=self.selected_column_stats,
                                state="readonly", width=30)
    column_combo.pack(side=tk.LEFT, fill=tk.X, expand=True)

    columns = list(self.df.columns)
    column_combo['values'] = columns
    if columns:
        column_combo.set(columns[0])

    # One caption/value row per statistic; the value labels are kept in
    # self.stats_labels so update_statistics_display can fill them in.
    stats_frame = ttk.LabelFrame(main_frame, text="统计信息", padding="10")
    stats_frame.pack(fill=tk.BOTH, expand=True, pady=10)
    self.stats_labels = {}
    stats_config = [
        ("均值", "mean"),
        ("最小值", "min"),
        ("最大值", "max"),
        ("中位数", "median"),
        ("标准差", "std"),
        ("方差", "var"),
        ("第一四分位数 (Q1)", "q1"),
        ("第三四分位数 (Q3)", "q3"),
    ]
    for stat_name, stat_key in stats_config:
        row_frame = ttk.Frame(stats_frame)
        row_frame.pack(fill=tk.X, pady=5)
        ttk.Label(row_frame, text=f"{stat_name}:").pack(side=tk.LEFT, padx=(0, 10))
        result_label = ttk.Label(row_frame, text="")
        result_label.pack(side=tk.LEFT)
        self.stats_labels[stat_key] = result_label

    close_btn = ttk.Button(main_frame, text="关闭", command=stats_window.destroy)
    close_btn.pack(pady=10)

    def on_column_selected(*args):
        """Refresh the statistics for the currently selected column."""
        column_name = self.selected_column_stats.get()
        if column_name:
            self.update_statistics_display(column_name)

    self.selected_column_stats.trace_add("write", on_column_selected)
    # Populate the labels for the initially selected column.
    on_column_selected()
def is_numeric_column(self, column_name):
    """Tell whether ``column_name`` exists in the data and has a numeric dtype.

    Args:
        column_name: Name of the column to inspect.

    Returns:
        bool: False when no data is loaded or the column is unknown,
        otherwise the result of pandas' numeric-dtype check.
    """
    frame = self.df
    if frame is not None and column_name in frame.columns:
        return pd.api.types.is_numeric_dtype(frame[column_name])
    return False
def update_statistics_display(self, column_name):
    """Fill the statistics labels with the values for ``column_name``.

    Args:
        column_name: Column whose statistics should be displayed. For a
            non-numeric column every label shows "非数值列"; if a
            computation fails every label shows "计算错误".
    """
    labels = self.stats_labels

    if not self.is_numeric_column(column_name):
        for widget in labels.values():
            widget.config(text="非数值列")
        return

    try:
        series = self.df[column_name]
        # Compute every statistic first, then push them to the labels.
        computed = {
            "mean": series.mean(),
            "min": series.min(),
            "max": series.max(),
            "median": series.median(),
            "std": series.std(),
            "var": series.var(),
            "q1": series.quantile(0.25),
            "q3": series.quantile(0.75),
        }
        for key, number in computed.items():
            labels[key].config(text=f"{number:.4f}")
    except Exception:
        # Any failure is reported uniformly on all labels.
        for widget in labels.values():
            widget.config(text="计算错误")
def remove_duplicates(self):
    """Drop duplicated rows from the working data and report how many went away."""
    if self.df is None:
        messagebox.showwarning("警告", "请先导入数据")
        return

    rows_before = len(self.df)
    self.df = self.df.drop_duplicates()
    removed = rows_before - len(self.df)

    self.update_data_preview()
    messagebox.showinfo("成功", f"已移除 {removed} 个重复值")
def handle_missing_values(self):
    """Ask how missing values should be handled and apply the chosen strategy.

    Strategies: drop rows containing missing values, fill numeric columns
    with their mean, or fill object columns with their mode.
    """
    if self.df is None:
        messagebox.showwarning("警告", "请先导入数据")
        return

    dialog = tk.Toplevel(self.root)
    dialog.title("处理缺失值")
    dialog.geometry("500x200")
    dialog.resizable(False, False)
    dialog.transient(self.root)
    dialog.grab_set()
    self.center_window(dialog)

    ttk.Label(dialog, text="选择处理方式:", style="Title.TLabel").pack(pady=10)
    option_var = tk.StringVar()
    options = [
        ("删除包含缺失值的行", "1"),
        ("用均值填充(数值列)", "2"),
        ("用众数填充(分类列)", "3")
    ]
    for text, value in options:
        rb = ttk.Radiobutton(dialog, text=text, variable=option_var, value=value)
        rb.pack(anchor=tk.W, padx=20)

    def on_ok():
        """Close the dialog and apply the selected strategy."""
        option = option_var.get()
        dialog.destroy()
        if not option:
            messagebox.showwarning("警告", "请选择处理方式")
            return

        if option == "1":
            before_count = self.df.isnull().sum().sum()
            self.df = self.df.dropna()
            after_count = self.df.isnull().sum().sum()
            messagebox.showinfo("成功", f"已删除包含缺失值的行,处理了 {before_count - after_count} 个缺失值")
        elif option == "2":
            # Assign the filled column back: ``df[col].fillna(..., inplace=True)``
            # acts on a temporary object and does not update the frame when
            # pandas Copy-on-Write is active.
            for col in self.df.select_dtypes(include=[np.number]).columns:
                self.df[col] = self.df[col].fillna(self.df[col].mean())
            messagebox.showinfo("成功", "已用均值填充数值列的缺失值")
        elif option == "3":
            for col in self.df.select_dtypes(include=['object']).columns:
                modes = self.df[col].mode()
                # A column without any value has no mode; use a placeholder.
                fill_value = modes[0] if not modes.empty else "未知"
                self.df[col] = self.df[col].fillna(fill_value)
            messagebox.showinfo("成功", "已用众数填充分类列的缺失值")
        self.update_data_preview()

    def on_cancel():
        """Dismiss the dialog without modifying the data."""
        dialog.destroy()

    button_frame = ttk.Frame(dialog)
    button_frame.pack(pady=10)
    ttk.Button(button_frame, text="确定", command=on_ok).pack(side=tk.LEFT, padx=5)
    ttk.Button(button_frame, text="取消", command=on_cancel).pack(side=tk.LEFT, padx=5)

    # Pre-select the first strategy.
    option_var.set("1")
def convert_data_types(self):
    """Open the data-type conversion window, provided data has been loaded."""
    if self.df is not None:
        self.create_data_type_conversion_window()
        return
    messagebox.showwarning("警告", "请先导入数据")
def create_data_type_conversion_window(self):
    """Build the modal window used to convert a column to another data type.

    The window stores its state on ``self`` (``selected_column``,
    ``current_type_var``, ``target_type``, ``ignore_errors`` and
    ``downcast_int``) so the preview/apply handlers can read it. The
    column-change notification uses ``trace_add("write", ...)``, the
    current replacement for the deprecated ``trace("w", ...)``.
    """
    conversion_window = tk.Toplevel(self.root)
    conversion_window.title("数据类型转换")
    conversion_window.geometry("500x350")
    conversion_window.resizable(True, True)
    # Modal, attached to the main window and centred over it.
    conversion_window.transient(self.root)
    conversion_window.grab_set()
    self.center_window(conversion_window)

    main_frame = ttk.Frame(conversion_window, padding="15")
    main_frame.pack(fill=tk.BOTH, expand=True)

    # Column selector.
    column_frame = ttk.Frame(main_frame)
    column_frame.pack(fill=tk.X, pady=10)
    ttk.Label(column_frame, text="选择列:").pack(side=tk.LEFT, padx=(0, 10))
    self.selected_column = tk.StringVar()
    column_combo = ttk.Combobox(column_frame, textvariable=self.selected_column,
                                state="readonly", width=30)
    column_combo.pack(side=tk.LEFT, fill=tk.X, expand=True)
    if self.df is not None:
        columns = list(self.df.columns)
        column_combo['values'] = columns
        if columns:
            column_combo.set(columns[0])

    # Read-only display of the selected column's current dtype.
    current_type_frame = ttk.Frame(main_frame)
    current_type_frame.pack(fill=tk.X, pady=10)
    ttk.Label(current_type_frame, text="当前数据类型:").pack(side=tk.LEFT, padx=(0, 10))
    self.current_type_var = tk.StringVar(value="未选择")
    current_type_label = ttk.Label(current_type_frame, textvariable=self.current_type_var)
    current_type_label.pack(side=tk.LEFT)

    def update_current_type(*args):
        """Show the dtype of the column that is currently selected."""
        col = self.selected_column.get()
        if col and self.df is not None:
            self.current_type_var.set(str(self.df[col].dtype))

    self.selected_column.trace_add("write", update_current_type)

    # Target type selector.
    target_type_frame = ttk.Frame(main_frame)
    target_type_frame.pack(fill=tk.X, pady=10)
    ttk.Label(target_type_frame, text="目标数据类型:").pack(side=tk.LEFT, padx=(0, 10))
    self.target_type = tk.StringVar()
    type_combo = ttk.Combobox(target_type_frame, textvariable=self.target_type,
                              state="readonly", width=30)
    type_combo.pack(side=tk.LEFT, fill=tk.X, expand=True)
    # These captions are matched verbatim by convert_column_type.
    data_types = [
        "字符串 (str)",
        "整数 (int)",
        "浮点数 (float)",
        "布尔值 (bool)",
        "日期时间 (datetime)",
        "分类数据 (category)",
        "时间差 (timedelta)",
        "复数 (complex)",
        "对象 (object)"
    ]
    type_combo['values'] = data_types
    type_combo.set(data_types[0])

    # Conversion options.
    options_frame = ttk.LabelFrame(main_frame, text="转换选项", padding="10")
    options_frame.pack(fill=tk.X, pady=10)
    self.ignore_errors = tk.BooleanVar(value=True)
    ignore_cb = ttk.Checkbutton(
        options_frame,
        text="忽略转换错误 (将无法转换的值设为NaN)",
        variable=self.ignore_errors
    )
    ignore_cb.pack(anchor=tk.W)
    self.downcast_int = tk.BooleanVar(value=False)
    downcast_cb = ttk.Checkbutton(
        options_frame,
        text="向下转换整数 (使用最小可能的整数类型)",
        variable=self.downcast_int
    )
    downcast_cb.pack(anchor=tk.W, pady=(5, 0))

    # Preview / apply / close buttons.
    button_frame = ttk.Frame(main_frame)
    button_frame.pack(fill=tk.X, pady=20)
    preview_btn = ttk.Button(
        button_frame,
        text="预览转换",
        command=lambda: self.preview_conversion(self.selected_column.get(), self.target_type.get())
    )
    preview_btn.pack(side=tk.LEFT, padx=(0, 10))
    apply_btn = ttk.Button(
        button_frame,
        text="应用转换",
        command=lambda: self.apply_conversion(self.selected_column.get(), self.target_type.get())
    )
    apply_btn.pack(side=tk.LEFT, padx=(0, 10))
    close_btn = ttk.Button(
        button_frame,
        text="关闭",
        command=conversion_window.destroy
    )
    close_btn.pack(side=tk.LEFT)

    # Show the dtype of the initially selected column.
    update_current_type()
def preview_conversion(self, column, target_type):
    """Show the first ten values of a column before and after a trial conversion.

    Args:
        column: Name of the column to convert; empty when nothing is selected.
        target_type: Caption of the target type as shown in the type selector.
    """
    if not column:
        messagebox.showwarning("警告", "请选择要转换的列")
        return

    try:
        # Snapshot of the data as it is now.
        before_sample = self.df[column].head(10).copy()
        before_text = f"原始数据类型: {self.df[column].dtype}\n\n前10行数据:\n{before_sample}"

        # Trial conversion; the data frame itself is left untouched.
        converted = self.convert_column_type(column, target_type, preview=True)
        if converted is None:
            messagebox.showerror("错误", "无法执行转换预览")
            return

        after_sample = converted.head(10)
        after_text = f"转换后数据类型: {converted.dtype}\n\n前10行数据:\n{after_sample}"
        messagebox.showinfo("转换预览", f"转换预览:\n\n{before_text}\n\n{after_text}")
    except Exception as e:
        messagebox.showerror("错误", f"预览转换时出错: {str(e)}")
def apply_conversion(self, column, target_type):
    """Convert a column in place and refresh the views that depend on it.

    Args:
        column: Name of the column to convert; empty when nothing is selected.
        target_type: Caption of the target type as shown in the type selector.
    """
    if not column:
        messagebox.showwarning("警告", "请选择要转换的列")
        return

    try:
        converted = self.convert_column_type(column, target_type, preview=False)
        if converted is None:
            messagebox.showerror("错误", "数据类型转换失败")
            return
        # The dtype change can affect both the table and the column selectors.
        self.update_data_preview()
        self.update_column_combos()
        messagebox.showinfo("成功", f"已成功将列 '{column}' 转换为 {target_type}")
    except Exception as e:
        messagebox.showerror("错误", f"应用转换时出错: {str(e)}")
def convert_column_type(self, column, target_type, preview=False):
    """Convert one column to the requested type.

    Args:
        column: Name of the column to convert.
        target_type: Caption of the target type, e.g. "整数 (int)".
        preview: When True the converted series is only returned; when
            False it is also written back into ``self.df``.

    Returns:
        The converted ``pandas.Series``, or None when no data is loaded,
        the column does not exist or ``target_type`` is not supported.

    Raises:
        Exception: Whatever pandas/Python raises when the conversion fails.
            No dialog is shown here for that case; callers present the
            error, which avoids reporting one failure twice.
    """
    if self.df is None or column not in self.df.columns:
        return None

    source = self.df[column]
    lenient = self.ignore_errors.get()
    # In lenient mode unparseable values become NaN/NaT instead of raising.
    on_error = 'coerce' if lenient else 'raise'

    if target_type == "字符串 (str)":
        result = source.astype(str)
    elif target_type == "整数 (int)":
        numeric = pd.to_numeric(source, errors=on_error)
        # The nullable Int64 dtype can hold the NaN produced by coercion.
        result = numeric.astype('Int64') if lenient else numeric.astype(int)
        if self.downcast_int.get():
            # Honour the "smallest possible integer type" option.
            result = pd.to_numeric(result, downcast='integer')
    elif target_type == "浮点数 (float)":
        result = pd.to_numeric(source, errors=on_error).astype(float)
    elif target_type == "布尔值 (bool)":
        bool_map = {
            'true': True, 'false': False,
            '是': True, '否': False,
            'yes': True, 'no': False,
            '1': True, '0': False
        }
        if lenient:
            # Normalise case and surrounding whitespace before the lookup so
            # that "False" or " NO " are recognised; values that are not in
            # the map fall back to Python truthiness of the original value.
            normalized = source.astype(str).str.strip().str.lower()
            mapped = normalized.map(bool_map)
            result = mapped.where(mapped.notna(), source).astype(bool)
        else:
            result = source.astype(bool)
    elif target_type == "日期时间 (datetime)":
        result = pd.to_datetime(source, errors=on_error)
    elif target_type == "分类数据 (category)":
        result = source.astype('category')
    elif target_type == "时间差 (timedelta)":
        result = pd.to_timedelta(source, errors=on_error)
    elif target_type == "复数 (complex)":
        if lenient:
            result = source.apply(
                lambda x: complex(x) if self.is_complex_convertible(x) else np.nan
            )
        else:
            result = source.apply(complex)
    elif target_type == "对象 (object)":
        result = source.astype(object)
    else:
        messagebox.showerror("错误", f"不支持的数据类型: {target_type}")
        return None

    # Only a real (non-preview) conversion touches the data frame.
    if not preview:
        self.df[column] = result
    return result
def is_complex_convertible(self, value):
    """Report whether ``complex(value)`` succeeds.

    Args:
        value: Any object to probe.

    Returns:
        bool: True when the built-in ``complex`` accepts the value, False
        when it raises ValueError or TypeError.
    """
    try:
        complex(value)
    except (ValueError, TypeError):
        return False
    else:
        return True
# 新增方法:规整小数位功能
def round_decimal_places(self):
    """Open the rounding window when the data contains numeric columns."""
    if self.df is None:
        messagebox.showwarning("警告", "请先导入数据")
        return

    # Rounding only makes sense for numeric columns.
    has_numeric = len(self.df.select_dtypes(include=[np.number]).columns) != 0
    if not has_numeric:
        messagebox.showwarning("警告", "数据中没有数值列可供规整")
        return

    self.create_round_decimal_window()
def create_round_decimal_window(self):
    """Build the modal window for rounding a numeric column.

    The window stores ``selected_column_round``, ``decimal_places_var``
    and ``error_label`` on ``self``; ``apply_round_decimal`` reads the
    first two and writes validation messages to the label.
    """
    round_window = tk.Toplevel(self.root)
    round_window.title("规整小数位")
    # Slightly taller than the widgets strictly need so the inline error
    # label fits below the input row.
    round_window.geometry("400x260")
    round_window.resizable(False, False)
    # Modal, attached to the main window and centred over it.
    round_window.transient(self.root)
    round_window.grab_set()
    self.center_window(round_window)

    main_frame = ttk.Frame(round_window, padding="20")
    main_frame.pack(fill=tk.BOTH, expand=True)

    info_label = ttk.Label(main_frame, text="请选择要规整的列和要保留的小数位数 (0-6):", wraplength=350)
    info_label.pack(pady=(0, 10))

    # Column selector restricted to numeric columns.
    column_frame = ttk.Frame(main_frame)
    column_frame.pack(fill=tk.X, pady=10)
    ttk.Label(column_frame, text="选择列:").pack(side=tk.LEFT, padx=(0, 10))
    numeric_columns = self.df.select_dtypes(include=[np.number]).columns.tolist()
    self.selected_column_round = tk.StringVar()
    column_combo = ttk.Combobox(column_frame, textvariable=self.selected_column_round,
                                state="readonly", width=20)
    column_combo.pack(side=tk.LEFT, fill=tk.X, expand=True)
    column_combo['values'] = numeric_columns
    if numeric_columns:
        column_combo.set(numeric_columns[0])

    # Number-of-decimals entry.
    input_frame = ttk.Frame(main_frame)
    input_frame.pack(fill=tk.X, pady=10)
    ttk.Label(input_frame, text="小数位数:").pack(side=tk.LEFT, padx=(0, 10))

    def validate_input(new_value):
        """Accept an empty entry or an integer between 0 and 6."""
        if new_value == "":
            return True
        try:
            value = int(new_value)
            return 0 <= value <= 6
        except ValueError:
            return False

    # '%P' passes the prospective entry content to the validator.
    vcmd = (round_window.register(validate_input), '%P')
    self.decimal_places_var = tk.StringVar(value="2")
    decimal_entry = ttk.Entry(
        input_frame,
        textvariable=self.decimal_places_var,
        validate="key",
        validatecommand=vcmd,
        width=10
    )
    decimal_entry.pack(side=tk.LEFT)

    # Inline error label that apply_round_decimal writes to; without it
    # every validation message would fail with AttributeError.
    self.error_label = ttk.Label(main_frame, text="", foreground="red")
    self.error_label.pack(pady=5)

    # Confirm / cancel buttons.
    button_frame = ttk.Frame(main_frame)
    button_frame.pack(fill=tk.X, pady=20)
    ok_btn = ttk.Button(
        button_frame,
        text="确定",
        command=lambda: self.apply_round_decimal(round_window)
    )
    ok_btn.pack(side=tk.LEFT, padx=(0, 10))
    cancel_btn = ttk.Button(
        button_frame,
        text="取消",
        command=round_window.destroy
    )
    cancel_btn.pack(side=tk.LEFT)

    # Pressing Enter behaves like clicking the confirm button.
    round_window.bind('<Return>', lambda e: self.apply_round_decimal(round_window))
    decimal_entry.focus_set()
    decimal_entry.select_range(0, tk.END)
def apply_round_decimal(self, window):
    """Round the selected column to the requested number of decimals.

    Args:
        window: The rounding dialog; it is destroyed after a successful run
            and left open when the input is rejected.
    """
    def report(message):
        """Show a validation message inline, or in a dialog if no label exists."""
        label = getattr(self, "error_label", None)
        if label is None:
            messagebox.showwarning("警告", message)
        else:
            label.config(text=message)

    # Parse separately so that only a malformed number is reported as such.
    try:
        decimal_places = int(self.decimal_places_var.get())
    except ValueError:
        report("请输入有效的数字")
        return

    if not (0 <= decimal_places <= 6):
        report("请输入0-6之间的数字")
        return

    selected_column = self.selected_column_round.get()
    if not selected_column:
        report("请选择要规整的列")
        return

    try:
        original_dtype = self.df[selected_column].dtype
        self.df[selected_column] = self.df[selected_column].round(decimal_places)
        # An integer column rounded to 0 decimals is cast back to its dtype.
        if decimal_places == 0 and original_dtype in [np.int64, np.int32, np.int16, np.int8]:
            self.df[selected_column] = self.df[selected_column].astype(original_dtype)
        self.update_data_preview()
        window.destroy()
    except Exception as e:
        messagebox.showerror("错误", f"规整小数位时出错: {str(e)}")
def rename_column(self):
    """Open the rename-column dialog, or warn when no data is loaded."""
    if self.df is not None:
        self.create_rename_column_window()
        return
    messagebox.showwarning("警告", "请先导入数据")
def create_rename_column_window(self):
    """Build and show the modal dialog used to rename a column.

    The chosen column and the new name are stored in
    ``self.selected_rename_column`` and ``self.new_column_name``; validation
    messages go to ``self.rename_error_label``. Both the confirm button and
    the Return key call ``apply_column_rename`` with the dialog window.
    """
    dialog = tk.Toplevel(self.root)
    dialog.title("修改列名")
    dialog.geometry("400x280")
    dialog.resizable(False, False)
    # Tie the dialog to the main window, make it modal, then centre it.
    dialog.transient(self.root)
    dialog.grab_set()
    self.center_window(dialog)

    def submit(_event=None):
        # Shared by the confirm button (no argument) and the <Return> binding.
        self.apply_column_rename(dialog)

    body = ttk.Frame(dialog, padding="20")
    body.pack(fill=tk.BOTH, expand=True)

    # Selector for the column to rename.
    picker_row = ttk.Frame(body)
    picker_row.pack(fill=tk.X, pady=10)
    ttk.Label(picker_row, text="选择要修改的列:").pack(anchor=tk.W)
    self.selected_rename_column = tk.StringVar()
    picker = ttk.Combobox(
        picker_row,
        textvariable=self.selected_rename_column,
        state="readonly",
        width=30,
    )
    picker.pack(fill=tk.X, pady=5)
    if self.df is not None:
        names = list(self.df.columns)
        picker['values'] = names
        if names:
            picker.set(names[0])

    # Entry for the new column name.
    entry_row = ttk.Frame(body)
    entry_row.pack(fill=tk.X, pady=10)
    ttk.Label(entry_row, text="输入新列名:").pack(anchor=tk.W)
    self.new_column_name = tk.StringVar()
    name_entry = ttk.Entry(entry_row, textvariable=self.new_column_name, width=30)
    name_entry.pack(fill=tk.X, pady=5)

    # Red label that displays validation errors.
    self.rename_error_label = ttk.Label(body, text="", foreground="red")
    self.rename_error_label.pack(pady=5)

    # Confirm / cancel buttons.
    actions = ttk.Frame(body)
    actions.pack(fill=tk.X, pady=10)
    ttk.Button(actions, text="确定", command=submit).pack(side=tk.LEFT, padx=(0, 10))
    ttk.Button(actions, text="取消", command=dialog.destroy).pack(side=tk.LEFT)

    dialog.bind('<Return>', submit)
    name_entry.focus_set()
def apply_column_rename(self, window):
    """Validate the rename dialog's input and rename the chosen column.

    Validation failures are written to ``self.rename_error_label`` and leave
    the dialog open. On success the column is renamed in ``self.df`` and,
    when it exists there, in ``self.original_df``; the preview and the column
    selectors are refreshed, a confirmation is shown and ``window`` is
    destroyed. Unexpected errors are reported in a message box.

    Args:
        window: The dialog window to close after a successful rename.
    """
    try:
        old_name = self.selected_rename_column.get()
        new_name = self.new_column_name.get().strip()

        # Work out the first validation problem, if any.
        problem = None
        if not old_name:
            problem = "请选择要修改的列"
        elif not new_name:
            problem = "请输入新列名"
        elif new_name in self.df.columns and new_name != old_name:
            # Renaming a column to its own name is allowed; clashing with
            # another existing column is not.
            problem = f"列名 '{new_name}' 已存在"
        if problem is not None:
            self.rename_error_label.config(text=problem)
            return

        mapping = {old_name: new_name}
        self.df = self.df.rename(columns=mapping)
        # Keep the reset copy in step so a later reset keeps the new name.
        backup = self.original_df
        if backup is not None and old_name in backup.columns:
            self.original_df = backup.rename(columns=mapping)

        self.update_data_preview()
        self.update_column_combos()
        messagebox.showinfo("成功", f"已成功将列 '{old_name}' 重命名为 '{new_name}'")
        window.destroy()
    except Exception as e:
        messagebox.showerror("错误", f"修改列名时出错: {str(e)}")
def reset_data(self):
    """Replace the working DataFrame with a copy of ``self.original_df``.

    Refreshes the preview and the column selectors and confirms the reset;
    warns instead when no original data is stored.
    """
    if self.original_df is None:
        messagebox.showwarning("警告", "没有原始数据可重置")
        return
    self.df = self.original_df.copy()
    self.update_data_preview()
    self.update_column_combos()
    messagebox.showinfo("成功", "数据已重置")
# 可视化功能
def update_column_combos(self):
    """Refresh the chart selectors after the set of columns has changed.

    Loads the current column names into the X-axis, Y-axis and grouping
    selectors (the grouping list is prefixed with the "无" entry), resets a
    grouping selection that no longer names an available choice, and points
    the X/Y selectors at the first and second column (or the first column
    twice when there is only one). Does nothing when no data is loaded.
    """
    if self.df is None:
        return
    columns = list(self.df.columns)
    group_choices = ["无"] + columns
    self.x_axis_combo['values'] = columns
    self.y_axis_combo['values'] = columns
    self.group_combo['values'] = group_choices
    # After a rename or a reset the previously selected grouping column may no
    # longer exist; leaving it selected would make the next chart fail, so
    # fall back to the "no grouping" entry. Compare as text because the
    # selector reports its value as a string.
    if self.group_combo.get() not in {str(choice) for choice in group_choices}:
        self.group_combo.set("无")
    if columns:
        self.x_axis_combo.set(columns[0])
        self.y_axis_combo.set(columns[1] if len(columns) > 1 else columns[0])
def generate_chart(self):
    """Draw the chart configured in the visualisation controls.

    The chart type, X/Y columns and optional grouping column are read from
    ``self.chart_type``, ``self.x_axis_var``, ``self.y_axis_var`` and
    ``self.group_var``. Radar charts are handed to
    ``show_radar_chart_options``; every other type is drawn on a fresh
    Cartesian axes, after which the canvas is redrawn. Problems are reported
    through message boxes rather than raised.
    """
    if self.df is None:
        messagebox.showwarning("警告", "请先导入数据")
        return
    chart_type = self.chart_type.get()
    x_col = self.x_axis_var.get()
    y_col = self.y_axis_var.get()
    # "无" is the selector entry for "no grouping"; an empty selection is
    # treated the same way.
    group_col = self.group_var.get()
    if not group_col or group_col == "无":
        group_col = None
    if not x_col or not y_col:
        messagebox.showwarning("警告", "请选择X轴和Y轴数据")
        return
    try:
        if chart_type == "雷达图":
            if not pd.api.types.is_numeric_dtype(self.df[y_col]):
                messagebox.showwarning("警告", "雷达图需要数值数据,请选择数值列作为Y轴")
                return
            # The radar chart has its own dialog and drawing routine. Hand
            # over before touching the figure so that cancelling the dialog
            # leaves the current chart intact.
            self.show_radar_chart_options()
            return

        plot_df = self._limit_chart_categories(chart_type, x_col)

        # Validate heatmap input before the old chart is discarded.
        numeric_df = None
        if chart_type == "热力图":
            numeric_df = plot_df.select_dtypes(include=[np.number])
            if len(numeric_df.columns) < 2:
                messagebox.showwarning("警告", "热力图需要至少两个数值列")
                return

        # Rebuild the axes from an empty figure. Clearing only self.ax would
        # leave behind the colorbar axes a previous heatmap added to the
        # figure, and a polar axes left over from a radar chart.
        self.fig.clear()
        self.ax = self.fig.add_subplot(111)

        # Axis labels default to the selected columns; chart types that do
        # not plot X against Y override them below.
        x_label, y_label = x_col, y_col

        if chart_type == "柱状图":
            sns.barplot(data=plot_df, x=x_col, y=y_col, hue=group_col, ax=self.ax)
            self.ax.tick_params(axis='x', rotation=45)
        elif chart_type == "折线图":
            # Sort by X so the line is drawn in X order.
            sns.lineplot(data=plot_df.sort_values(by=x_col), x=x_col, y=y_col,
                         hue=group_col, ax=self.ax)
        elif chart_type == "散点图":
            sns.scatterplot(data=plot_df, x=x_col, y=y_col, hue=group_col, ax=self.ax)
        elif chart_type == "饼图":
            pie_data = self._build_pie_data(plot_df, y_col, group_col)
            self.ax.pie(pie_data, labels=pie_data.index, autopct='%1.1f%%')
            x_label = y_label = ''
        elif chart_type == "箱线图":
            # The box plot is split by the grouping column, not by X.
            sns.boxplot(data=plot_df, x=group_col, y=y_col, ax=self.ax)
            x_label = group_col or ''
        elif chart_type == "热力图":
            # Correlation matrix of the numeric columns; the tick labels
            # already name the columns, so no axis labels are added.
            sns.heatmap(numeric_df.corr(), annot=True, cmap='coolwarm', ax=self.ax)
            x_label = y_label = ''
        elif chart_type == "直方图":
            # Missing values are dropped explicitly so the result does not
            # depend on how the installed matplotlib treats NaN.
            if group_col:
                for category in plot_df[group_col].dropna().unique():
                    subset = plot_df.loc[plot_df[group_col] == category, y_col].dropna()
                    self.ax.hist(subset, alpha=0.5, label=str(category))
                self.ax.legend()
            else:
                self.ax.hist(plot_df[y_col].dropna())
            x_label, y_label = y_col, '频率'

        font = self.get_chinese_font()
        self.ax.set_title(f"{chart_type}: {y_col} vs {x_col}", fontproperties=font)
        self.ax.set_xlabel(x_label, fontproperties=font)
        self.ax.set_ylabel(y_label, fontproperties=font)
        self.canvas.draw()
    except Exception as e:
        messagebox.showerror("错误", f"生成图表时出错: {str(e)}")

def _limit_chart_categories(self, chart_type, x_col):
    """Return the rows to plot, keeping only the ten most frequent X values
    when the limit option is on, the chart type supports it and the X column
    has object dtype."""
    plot_df = self.df.copy()
    limited_types = ("柱状图", "折线图", "饼图", "箱线图")
    if not (self.limit_categories.get()
            and chart_type in limited_types
            and self.df[x_col].dtype == 'object'):
        return plot_df
    top_categories = self.df[x_col].value_counts().head(10).index
    limited_df = self.df[self.df[x_col].isin(top_categories)]
    if limited_df.empty:
        # Nothing survived the filter; fall back to the full data set.
        messagebox.showwarning("警告", "无法限制类别数量,使用全部数据")
        return plot_df
    unique_count = self.df[x_col].nunique()
    if unique_count > 10:
        messagebox.showinfo("提示",
                            f"X轴数据有 {unique_count} 个类别,已限制显示前10个最常见的类别。\n"
                            f"如需显示全部数据,请取消勾选'限制X轴类别数量'选项。")
    return limited_df

def _build_pie_data(self, plot_df, y_col, group_col):
    """Return the pie slices: Y sums per group, or Y value counts when no
    group is given; with the limit option on, everything beyond the nine
    largest slices is merged into '其他'."""
    if group_col:
        pie_data = plot_df.groupby(group_col)[y_col].sum()
    else:
        pie_data = plot_df[y_col].value_counts()
    category_count = len(pie_data)
    if category_count > 10 and self.limit_categories.get():
        # Rank by size first: grouped sums come back ordered by group key, so
        # the first nine rows are not necessarily the largest slices. A
        # stable sort keeps the existing order among equal values.
        ranked = pie_data.sort_values(ascending=False, kind="stable")
        other_sum = ranked.iloc[9:].sum()
        pie_data = pd.concat([ranked.iloc[:9], pd.Series([other_sum], index=['其他'])])
        messagebox.showinfo("提示",
                            f"饼图数据有 {category_count} 个类别,已显示前9个最常见的类别,其余合并为'其他'。")
    return pie_data
def show_radar_chart_options(self):
    """Open the modal dialog that collects the radar-chart settings.

    The dialog offers a category column, a value column, an optional grouping
    column and a checkbox that caps the number of categories. The choices are
    stored in ``self.radar_category_var``, ``self.radar_value_var``,
    ``self.radar_group_var`` and ``self.radar_limit_categories``. The
    generate button and the Return key both call
    ``generate_radar_from_options`` with the dialog window.
    """
    if self.df is None:
        messagebox.showwarning("警告", "请先导入数据")
        return

    dialog = tk.Toplevel(self.root)
    dialog.title("雷达图选项")
    dialog.geometry("500x450")
    dialog.resizable(False, False)
    dialog.transient(self.root)
    dialog.grab_set()
    self.center_window(dialog)

    def confirm(_event=None):
        # Shared by the generate button (no argument) and the <Return> binding.
        self.generate_radar_from_options(dialog)

    def add_selector(parent, caption, variable):
        # One captioned, read-only combobox row bound to ``variable``.
        row = ttk.Frame(parent)
        row.pack(fill=tk.X, pady=10)
        ttk.Label(row, text=caption).pack(anchor=tk.W)
        combo = ttk.Combobox(row, textvariable=variable, state="readonly", width=40)
        combo.pack(fill=tk.X, pady=5)
        return combo

    body = ttk.Frame(dialog, padding="15")
    body.pack(fill=tk.BOTH, expand=True)

    ttk.Label(body, text="雷达图设置", style="Title.TLabel").pack(pady=(0, 15))

    # Column selectors; the grouping selector starts on the "无" entry.
    self.radar_category_var = tk.StringVar()
    category_combo = add_selector(body, "分类列 (X轴):", self.radar_category_var)
    self.radar_value_var = tk.StringVar()
    value_combo = add_selector(body, "数值列 (Y轴):", self.radar_value_var)
    self.radar_group_var = tk.StringVar(value="无")
    group_combo = add_selector(body, "分组列 (可选):", self.radar_group_var)

    # Checkbox limiting the number of categories, ticked by default.
    limit_row = ttk.Frame(body)
    limit_row.pack(fill=tk.X, pady=10)
    self.radar_limit_categories = tk.BooleanVar(value=True)
    ttk.Checkbutton(
        limit_row,
        text="限制分类数量 (最多10个)",
        variable=self.radar_limit_categories,
    ).pack(anchor=tk.W)

    # Short usage note.
    ttk.Label(
        body,
        text="注意: 雷达图使用分类列作为角度轴(不建议超过10个),数值列作为半径轴。\n"
             "如果分组,每个分组将显示为一个多边形。",
        justify=tk.LEFT,
    ).pack(fill=tk.X, pady=10)

    # Generate / cancel buttons.
    actions = ttk.Frame(body)
    actions.pack(fill=tk.X, pady=20)
    ttk.Button(actions, text="生成雷达图", command=confirm).pack(side=tk.LEFT, padx=(0, 10))
    ttk.Button(actions, text="取消", command=dialog.destroy).pack(side=tk.LEFT)

    # Fill the selectors and pre-select the first column as category and the
    # second (or the first, when there is only one) as value.
    names = list(self.df.columns)
    category_combo['values'] = names
    value_combo['values'] = names
    group_combo['values'] = ["无"] + names
    if names:
        category_combo.set(names[0])
        value_combo.set(names[1] if len(names) > 1 else names[0])

    dialog.bind('<Return>', confirm)
def generate_radar_from_options(self, window):
    """Validate the radar dialog's choices and draw the radar chart.

    Warns and keeps the dialog open when a column is missing or the value
    column is not numeric. With the limit option ticked and more than ten
    distinct categories, only rows of the ten most frequent categories are
    kept. The dialog is then destroyed and ``generate_radar_chart`` is called
    with the prepared data.

    Args:
        window: The options dialog to close before drawing.
    """
    category_col = self.radar_category_var.get()
    value_col = self.radar_value_var.get()
    chosen_group = self.radar_group_var.get()
    # "无" is the selector entry meaning "no grouping".
    group_col = None if chosen_group == "无" else chosen_group

    if not (category_col and value_col):
        messagebox.showwarning("警告", "请选择分类列和数值列")
        return
    if not pd.api.types.is_numeric_dtype(self.df[value_col]):
        messagebox.showwarning("警告", "数值列必须为数值类型")
        return

    plot_df = self.df.copy()
    too_many = len(plot_df[category_col].unique()) > 10
    if self.radar_limit_categories.get() and too_many:
        keep = plot_df[category_col].value_counts().head(10).index
        plot_df = plot_df[plot_df[category_col].isin(keep)]
        messagebox.showinfo("提示",
                            f"已限制显示前10个最常见的类别。\n"
                            f"如需显示全部数据,请取消勾选'限制分类数量'选项。")

    window.destroy()
    self.generate_radar_chart(plot_df, category_col, value_col, group_col)
def generate_radar_chart(self, data, category_col, value_col, group_col=None):
    """Draw a radar chart of mean ``value_col`` per ``category_col``.

    The figure is cleared and rebuilt with a polar axes. Without a grouping
    column a single polygon is drawn; with one, each group gets its own
    polygon and a legend entry, and a category with no rows in a group counts
    as 0. No title is set. On failure an error box is shown and the figure is
    replaced by a Cartesian axes carrying an error text.

    Args:
        data: DataFrame holding the rows to plot.
        category_col: Column whose distinct values become the spokes.
        value_col: Column that is averaged per category.
        group_col: Optional column that splits the data into polygons.
    """
    def closed_angles(count):
        # Evenly spaced spoke angles, with the first repeated at the end so
        # the outline closes on itself.
        spokes = np.linspace(0, 2 * np.pi, count, endpoint=False).tolist()
        return spokes + spokes[:1]

    try:
        self.fig.clear()
        self.ax = self.fig.add_subplot(111, polar=True)

        if group_col:
            group_names = data[group_col].unique()
            categories = data[category_col].unique()
            angles = closed_angles(len(categories))
            palette = plt.cm.tab10(np.linspace(0, 1, len(group_names)))
            for colour, name in zip(palette, group_names):
                rows = data[data[group_col] == name]
                values = []
                for category in categories:
                    cell = rows[rows[category_col] == category][value_col]
                    values.append(cell.mean() if len(cell) > 0 else 0)
                values = values + values[:1]  # close the polygon
                self.ax.plot(angles, values, 'o-', linewidth=2, label=str(name), color=colour)
                self.ax.fill(angles, values, alpha=0.25, color=colour)
            self.ax.set_xticks(angles[:-1])
            self.ax.set_xticklabels(categories, fontproperties=self.get_chinese_font())
            # bbox_to_anchor positions the legend relative to the axes: the
            # first number is the horizontal offset, the second the vertical
            # position.
            self.ax.legend(loc='upper right', bbox_to_anchor=(1.7, 1.0))
        else:
            means = data.groupby(category_col)[value_col].mean().reset_index()
            categories = means[category_col].tolist()
            values = means[value_col].tolist()
            angles = closed_angles(len(categories))
            values = values + values[:1]  # close the polygon
            self.ax.plot(angles, values, 'o-', linewidth=2, color='b')
            self.ax.fill(angles, values, alpha=0.25, color='b')
            self.ax.set_xticks(angles[:-1])
            self.ax.set_xticklabels(categories, fontproperties=self.get_chinese_font())

        self.ax.grid(True)
        self.canvas.draw()
    except Exception as e:
        messagebox.showerror("错误", f"生成雷达图时出错: {str(e)}")
        # Fall back to a plain Cartesian axes that only shows an error text.
        self.fig.clear()
        self.ax = self.fig.add_subplot(111)
        self.ax.text(0.5, 0.5, "生成雷达图时出错",
                     horizontalalignment='center', verticalalignment='center',
                     transform=self.ax.transAxes, fontsize=14)
        self.ax.set_xticks([])
        self.ax.set_yticks([])
        self.canvas.draw()
def get_chinese_font(self):
    """Return the FontProperties to use for chart text.

    Looks through matplotlib's font list for the first installed font among
    SimHei, Microsoft YaHei, KaiTi and SimSun and returns properties pointing
    at its font file; falls back to default ``FontProperties`` when none is
    installed or the lookup fails. The result is cached on the instance
    because this method is called several times for every chart and scanning
    the font list each time is wasted work.
    """
    cached = getattr(self, "_chinese_font_props", None)
    if cached is not None:
        return cached

    font_props = None
    try:
        installed = {font.name for font in fm.fontManager.ttflist}
        for family in ('SimHei', 'Microsoft YaHei', 'KaiTi', 'SimSun'):
            if family in installed:
                font_path = fm.findfont(fm.FontProperties(family=family))
                font_props = fm.FontProperties(fname=font_path)
                break
    except Exception as exc:
        # Font discovery is best effort: report the problem and continue with
        # the default font instead of silently hiding the failure.
        print(f"字体设置失败: {exc}")

    if font_props is None:
        font_props = fm.FontProperties()
    self._chinese_font_props = font_props
    return font_props
def save_chart(self):
    """Ask for a destination file and save the current figure to it.

    Warns when there is no figure. The save dialog offers PNG, JPEG and PDF
    with ``.png`` as the default extension; cancelling it does nothing. The
    figure is written at 300 dpi with a tight bounding box, and the outcome
    is reported in a message box.
    """
    if self.fig is None:
        messagebox.showwarning("警告", "没有图表可保存")
        return

    supported = [("PNG files", "*.png"), ("JPEG files", "*.jpg"), ("PDF files", "*.pdf")]
    target = filedialog.asksaveasfilename(defaultextension=".png", filetypes=supported)
    if not target:
        # The dialog was cancelled.
        return

    try:
        self.fig.savefig(target, dpi=300, bbox_inches='tight')
        messagebox.showinfo("成功", f"图表已保存到: {target}")
    except Exception as e:
        messagebox.showerror("错误", f"保存图表时出错: {str(e)}")
# 辅助功能
def update_data_preview(self):
    """Reload the preview table (``self.data_tree``) from ``self.df``.

    Existing rows are removed first. When data is loaded, the table columns
    and clickable headings (which call ``sort_treeview``) are rebuilt, and
    either every row or only the first 100 are inserted depending on
    ``self.show_all_data``. Missing values are shown as empty cells.
    """
    existing = self.data_tree.get_children()
    if existing:
        # One bulk call instead of one delete call per row.
        self.data_tree.delete(*existing)
    if self.df is None:
        return

    columns = list(self.df.columns)
    self.data_tree["columns"] = columns
    self.data_tree["show"] = "headings"
    for col in columns:
        # Bind the column through a default argument so each heading sorts
        # its own column rather than the last one of the loop.
        self.data_tree.heading(col, text=col, command=lambda c=col: self.sort_treeview(c))
        self.data_tree.column(col, width=100)

    visible = self.df if self.show_all_data.get() else self.df.head(100)
    # itertuples keeps every column's own dtype. iterrows builds one Series
    # per row, which upcasts mixed numeric rows (integers were displayed as
    # 1.0) and is much slower on large frames.
    for record in visible.itertuples(index=False, name=None):
        cells = ["" if pd.isna(value) else value for value in record]
        self.data_tree.insert("", "end", values=cells)

    # Reset the caption of the widget that contains the table.
    parent = self.data_tree.master
    if hasattr(parent, 'configure'):
        parent.configure(text="数据预览")
def sort_treeview(self, column):
    """Sort the data by ``column`` and mark the direction in its heading.

    The first call for a column sorts ascending; each later call flips that
    column's direction, which is tracked in ``self.sort_states``. The sorted
    frame replaces ``self.df``, the preview is refreshed and the heading of
    the sorted column gets a ▲ (ascending) or ▼ (descending) marker. A sort
    failure is reported in a message box and leaves data and state unchanged.

    Args:
        column: Label of the DataFrame column to sort by.
    """
    if self.df is None:
        return

    # No recorded state counts as "desc", so the first click sorts ascending.
    ascending = self.sort_states.get(column, "desc") == "desc"
    try:
        sorted_df = self.df.sort_values(by=column, ascending=ascending)
    except (KeyError, TypeError, ValueError) as e:
        messagebox.showerror("错误", f"排序时出错: {str(e)}")
        return

    # Record the new direction only once the sort has succeeded.
    self.sort_states[column] = "asc" if ascending else "desc"
    self.df = sorted_df
    self.update_data_preview()

    marker = " ▲" if ascending else " ▼"
    # Iterate the DataFrame's own labels instead of the identifiers reported
    # by the Treeview: the widget returns them as strings, so a non-string
    # label (for example an integer header) would neither match ``column``
    # nor be a valid key for the next sort.
    for col in self.df.columns:
        heading_text = f"{col}{marker}" if col == column else col
        self.data_tree.heading(col, text=heading_text,
                               command=lambda c=col: self.sort_treeview(c))
# Script entry point: create the Tk root window, attach the application to it
# and run the Tk event loop. Runs only when the file is executed directly.
if __name__ == "__main__":
    root = tk.Tk()
    app = DataAnalysisApp(root)
    root.mainloop()