文章目录
- 前言
- [一、 Excel 自动化:为什么选择 Python?](#一、 Excel 自动化:为什么选择 Python?)
-
- [1.1 Python Excel 操作库概览](#1.1 Python Excel 操作库概览)
- [1.2 安装必要库](#1.2 安装必要库)
- [二、 写入 Excel 文件](#二、 写入 Excel 文件)
-
- [2.1 使用 xlwt 写入 .xls 文件](#2.1 使用 xlwt 写入 .xls 文件)
- [2.2 使用 XlsxWriter 创建高级 Excel 文件](#2.2 使用 XlsxWriter 创建高级 Excel 文件)
- [2.3 使用 openpyxl 写入 .xlsx 文件](#2.3 使用 openpyxl 写入 .xlsx 文件)
- [三、读取 Excel 文件](#三、读取 Excel 文件)
-
- [3.1 使用 xlrd 读取 Excel](#3.1 使用 xlrd 读取 Excel)
- [3.2 使用 pandas 读取 Excel(推荐)](#3.2 使用 pandas 读取 Excel(推荐))
- [四、修改和更新 Excel 文件](#四、修改和更新 Excel 文件)
-
- [4.1 使用 xlutils 修改现有 Excel 文件](#4.1 使用 xlutils 修改现有 Excel 文件)
- [4.2 使用 openpyxl 修改现有 Excel 文件](#4.2 使用 openpyxl 修改现有 Excel 文件)
前言
本文主要介绍读写Excel文件实用的库及每个库的特点。
一、 Excel 自动化:为什么选择 Python?
在数据处理和分析工作中,Excel 无疑是使用最广泛的工具之一。然而,手动处理大量数据或执行重复性任务不仅效率低下,还容易出错。Python 凭借其强大的数据处理能力和丰富的第三方库,成为 Excel 自动化的理想选择。
1.1 Python Excel 操作库概览
| 库名 | 主要功能 | 支持格式 | 特点 |
|---|---|---|---|
| xlrd | 读取数据 | .xls(2.0 及以后版本不再支持 .xlsx) | 经典读取库,稳定可靠 |
| xlwt | 写入数据 | .xls | 经典写入库,不支持 .xlsx |
| xlutils | 实用工具 | .xls | 提供复制、修改等功能 |
| XlsxWriter | 写入数据 | .xlsx | 功能强大,支持图表生成 |
| openpyxl | 读写操作 | .xlsx | 功能全面,支持读写和修改 |
| pandas | 数据处理 | .xls, .xlsx | 高级数据处理,内置Excel读写功能 |
1.2 安装必要库
```bash
# 基本库安装
pip install xlrd xlwt xlutils XlsxWriter openpyxl pandas
# 或者使用国内镜像加速
pip install -i https://pypi.tuna.tsinghua.edu.cn/simple xlrd xlwt xlutils XlsxWriter openpyxl pandas
```
二、 写入 Excel 文件
2.1 使用 xlwt 写入 .xls 文件
python
python
import xlwt
from datetime import datetime
def write_with_xlwt():
    """Create a sample employee workbook with xlwt and save it as ``.xls``.

    The sheet receives a bold, centred header row, five hard-coded employee
    records and a final row with AVERAGE formulas for the age and salary
    columns.

    Returns:
        str: Name of the timestamped file written to the current directory.
    """

    def build_style(horizontal, vertical=None, font=None):
        """Return an XFStyle with the given alignment and optional font."""
        alignment = xlwt.Alignment()
        alignment.horz = horizontal
        if vertical is not None:
            alignment.vert = vertical
        style = xlwt.XFStyle()
        style.alignment = alignment
        if font is not None:
            style.font = font
        return style

    # Workbook and sheet; cell_overwrite_ok lets a cell be written twice.
    workbook = xlwt.Workbook(encoding='utf-8')
    worksheet = workbook.add_sheet('员工信息', cell_overwrite_ok=True)

    # Header style: bold 12 pt font, centred both ways.
    title_font = xlwt.Font()
    title_font.bold = True
    title_font.height = 12 * 20
    title_style = build_style(
        xlwt.Alignment.HORZ_CENTER,
        vertical=xlwt.Alignment.VERT_CENTER,
        font=title_font,
    )
    # Text cells are left-aligned, numeric cells right-aligned.
    data_style = build_style(xlwt.Alignment.HORZ_LEFT)
    num_style = build_style(xlwt.Alignment.HORZ_RIGHT)

    # Header row; every column gets the same width (unit is 1/256 of a char).
    headers = ['序号', '姓名', '年龄', '部门', '入职日期', '薪资']
    for col, header in enumerate(headers):
        worksheet.write(0, col, header, title_style)
        worksheet.col(col).width = 256 * 12

    employees = [
        ['001', '张三', 28, '技术部', '2020-03-15', 15000],
        ['002', '李四', 32, '销售部', '2019-08-22', 18000],
        ['003', '王五', 25, '市场部', '2021-01-10', 12000],
        ['004', '赵六', 35, '技术部', '2018-11-05', 22000],
        ['005', '钱七', 29, '人事部', '2020-06-30', 16000],
    ]

    # Age (index 2) and salary (index 5) use the numeric style.
    numeric_columns = (2, 5)
    for row_idx, employee in enumerate(employees, start=1):
        for col_idx, value in enumerate(employee):
            style = num_style if col_idx in numeric_columns else data_style
            worksheet.write(row_idx, col_idx, value, style)

    # Summary row directly below the data. Its 0-based index equals the
    # 1-based Excel number of the last data row, so it closes the range.
    summary_row = len(employees) + 1
    worksheet.write(summary_row, 0, '平均值', title_style)
    for col_idx, letter in ((2, 'C'), (5, 'F')):
        average = xlwt.Formula(f'AVERAGE({letter}2:{letter}{summary_row})')
        worksheet.write(summary_row, col_idx, average, num_style)

    filename = f'员工信息_{datetime.now().strftime("%Y%m%d_%H%M%S")}.xls'
    workbook.save(filename)
    print(f'文件已保存: {filename}')
    return filename
# Script entry point: build the sample .xls workbook.
if __name__ == "__main__":
    write_with_xlwt()
2.2 使用 XlsxWriter 创建高级 Excel 文件
python
python
import xlsxwriter
from datetime import datetime
def create_excel_with_charts():
    """Create a quarterly sales workbook with a column chart and a pie chart.

    Writes ``销售数据_图表.xlsx`` to the current directory. The sheet holds one
    row per quarter with three product columns, a SUM formula column and a
    quarter-over-quarter growth formula (from the second quarter on), plus a
    yearly total row that feeds the pie chart.

    All cell ranges (charts, totals, autofilter) are derived from the length
    of the data table, so adding a quarter needs no other change.

    Returns:
        str: Name of the file that was written.
    """
    filename = '销售数据_图表.xlsx'
    sheet_name = '销售报表'

    headers = ['季度', '产品A', '产品B', '产品C', '合计', '增长率']
    data = [
        ['Q1', 1250000, 980000, 750000],
        ['Q2', 1380000, 1050000, 820000],
        ['Q3', 1520000, 1120000, 910000],
        ['Q4', 1680000, 1280000, 1050000],
    ]
    series_colors = ['#4472C4', '#ED7D31', '#A5A5A5']

    first_row = 2                    # Excel (1-based) row of the first record
    last_row = len(data) + 1         # Excel (1-based) row of the last record
    total_row_idx = last_row + 1     # 0-based index; leaves one blank row
    total_excel_row = total_row_idx + 1

    # The context manager closes (and thereby writes) the workbook even if
    # an exception is raised while building it.
    with xlsxwriter.Workbook(filename) as workbook:
        worksheet = workbook.add_worksheet(sheet_name)

        header_format = workbook.add_format({
            'bold': True,
            'font_size': 12,
            'align': 'center',
            'valign': 'vcenter',
            'bg_color': '#366092',
            'font_color': 'white',
            'border': 1,
        })
        data_format = workbook.add_format({
            'align': 'left',
            'valign': 'vcenter',
            'border': 1,
        })
        currency_format = workbook.add_format({
            'num_format': '¥#,##0',
            'align': 'right',
            'border': 1,
        })
        percent_format = workbook.add_format({
            'num_format': '0.00%',
            'align': 'right',
            'border': 1,
        })

        # Header row and column widths.
        for col, header in enumerate(headers):
            worksheet.write(0, col, header, header_format)
            worksheet.set_column(col, col, 15)

        # Data rows: text label, three amounts, total and growth formulas.
        for row_idx, (quarter, *sales) in enumerate(data, start=1):
            excel_row = row_idx + 1
            # The quarter label is text, so it uses the plain data format
            # rather than the currency format.
            worksheet.write(row_idx, 0, quarter, data_format)
            for col_idx, amount in enumerate(sales, start=1):
                worksheet.write(row_idx, col_idx, amount, currency_format)
            worksheet.write_formula(
                row_idx, 4, f'=SUM(B{excel_row}:D{excel_row})', currency_format
            )
            if row_idx > 1:
                # Growth versus the previous row's total; the first quarter
                # has no predecessor and is left empty.
                prev_total_cell = f'E{row_idx}'
                curr_total_cell = f'E{excel_row}'
                formula = f'=({curr_total_cell}-{prev_total_cell})/{prev_total_cell}'
                worksheet.write_formula(row_idx, 5, formula, percent_format)

        # Column chart: one series per product column (B, C, D).
        chart1 = workbook.add_chart({'type': 'column'})
        for col_idx, fill_color in enumerate(series_colors, start=1):
            letter = chr(ord('A') + col_idx)
            chart1.add_series({
                'name': f'={sheet_name}!${letter}$1',
                'categories': f'={sheet_name}!$A${first_row}:$A${last_row}',
                'values': f'={sheet_name}!${letter}${first_row}:${letter}${last_row}',
                'fill': {'color': fill_color},
            })
        chart1.set_title({'name': '季度产品销售对比'})
        chart1.set_x_axis({'name': '季度'})
        chart1.set_y_axis({'name': '销售额(元)'})
        chart1.set_legend({'position': 'bottom'})
        worksheet.insert_chart('H2', chart1, {'x_offset': 25, 'y_offset': 10})

        # Yearly totals per product, used as the pie chart's values.
        worksheet.write(total_row_idx, 0, '年度总计', header_format)
        for col_idx in range(1, len(series_colors) + 1):
            letter = chr(ord('A') + col_idx)
            worksheet.write_formula(
                total_row_idx,
                col_idx,
                f'=SUM({letter}{first_row}:{letter}{last_row})',
                currency_format,
            )

        chart2 = workbook.add_chart({'type': 'pie'})
        chart2.add_series({
            'name': '年度产品占比',
            'categories': f'={sheet_name}!$B$1:$D$1',
            'values': f'={sheet_name}!$B${total_excel_row}:$D${total_excel_row}',
            'data_labels': {'percentage': True, 'leader_lines': True},
        })
        chart2.set_title({'name': '年度产品销售占比'})
        worksheet.insert_chart('H18', chart2, {'x_offset': 25, 'y_offset': 10})

        # Filter buttons on the header, covering the data rows only.
        worksheet.autofilter(f'A1:F{last_row}')

    print(f'带图表的Excel文件已创建: {filename}')
    return filename
# Script entry point: build the chart workbook.
if __name__ == "__main__":
    create_excel_with_charts()
2.3 使用 openpyxl 写入 .xlsx 文件
python
python
import openpyxl
from openpyxl.styles import Font, Alignment, PatternFill, Border, Side
from openpyxl.utils import get_column_letter
from datetime import datetime
def write_with_openpyxl():
    """Create a styled project-progress workbook with openpyxl.

    Writes a header row, five hard-coded project records, a data-bar
    conditional format on the progress column, an average row and an
    autofilter. The autofilter covers only the header and data rows, so the
    summary row is not moved when the table is sorted or filtered.

    Returns:
        str: Name of the date-stamped ``.xlsx`` file that was written.
    """
    # Kept local so the block does not depend on an extra file-level import.
    from openpyxl.formatting.rule import DataBarRule

    workbook = openpyxl.Workbook()
    worksheet = workbook.active
    worksheet.title = "项目进度"

    # Header styling.
    header_font = Font(name='微软雅黑', size=12, bold=True, color='FFFFFF')
    header_fill = PatternFill(start_color='366092', end_color='366092', fill_type='solid')
    header_alignment = Alignment(horizontal='center', vertical='center')
    thin_side = Side(style='thin')
    thin_border = Border(left=thin_side, right=thin_side, top=thin_side, bottom=thin_side)

    # Data styling.
    data_font = Font(name='宋体', size=11)
    data_alignment = Alignment(horizontal='left', vertical='center')

    # Background fills for the progress column, chosen by threshold below.
    progress_fill = PatternFill(start_color='C6EFCE', end_color='C6EFCE', fill_type='solid')
    warning_fill = PatternFill(start_color='FFEB9C', end_color='FFEB9C', fill_type='solid')
    alert_fill = PatternFill(start_color='FFC7CE', end_color='FFC7CE', fill_type='solid')

    # Font colour overrides for specific status values.
    status_colors = {'已完成': '00B050', '延迟': 'FF0000'}

    headers = ['项目名称', '负责人', '开始日期', '结束日期', '完成进度', '状态']
    projects = [
        ['客户管理系统', '张三', '2023-01-15', '2023-04-30', 0.85, '进行中'],
        ['数据中台建设', '李四', '2022-11-01', '2023-03-31', 1.00, '已完成'],
        ['移动APP开发', '王五', '2023-02-10', '2023-05-20', 0.65, '进行中'],
        ['网站重构', '赵六', '2023-03-01', '2023-06-15', 0.30, '延迟'],
        ['数据分析平台', '钱七', '2023-01-20', '2023-05-10', 0.90, '进行中'],
    ]
    last_data_row = len(projects) + 1  # row 1 is the header
    progress_col = headers.index('完成进度') + 1
    status_col = headers.index('状态') + 1

    # Header row.
    for col, header in enumerate(headers, start=1):
        cell = worksheet.cell(row=1, column=col, value=header)
        cell.font = header_font
        cell.fill = header_fill
        cell.alignment = header_alignment
        cell.border = thin_border
        worksheet.column_dimensions[get_column_letter(col)].width = 15

    # Data rows.
    for row_idx, project in enumerate(projects, start=2):
        for col_idx, value in enumerate(project, start=1):
            cell = worksheet.cell(row=row_idx, column=col_idx, value=value)
            cell.font = data_font
            cell.alignment = data_alignment
            cell.border = thin_border

            if col_idx == progress_col:
                cell.number_format = '0.00%'
                if value >= 0.9:
                    cell.fill = progress_fill
                elif value >= 0.7:
                    cell.fill = warning_fill
                else:
                    cell.fill = alert_fill
            elif col_idx == status_col and value in status_colors:
                cell.font = Font(name='宋体', size=11, bold=True, color=status_colors[value])

    # Data bars on the progress column, scaled from 0 to 1.
    progress_col_letter = get_column_letter(progress_col)
    data_bar_rule = DataBarRule(
        start_type='num',
        start_value=0,
        end_type='num',
        end_value=1,
        color='638EC6',
    )
    worksheet.conditional_formatting.add(
        f'{progress_col_letter}2:{progress_col_letter}{last_data_row}',
        data_bar_rule,
    )

    # Summary row with the average progress.
    summary_row = last_data_row + 1
    worksheet.cell(row=summary_row, column=1, value='项目统计').font = Font(bold=True)
    average_cell = worksheet.cell(
        row=summary_row,
        column=progress_col,
        value=f'=AVERAGE({progress_col_letter}2:{progress_col_letter}{last_data_row})',
    )
    average_cell.number_format = '0.00%'

    # Filter range: header plus data rows only. worksheet.dimensions would
    # also include the summary row written above.
    worksheet.auto_filter.ref = f'A1:{get_column_letter(len(headers))}{last_data_row}'

    filename = f'项目进度_{datetime.now().strftime("%Y%m%d")}.xlsx'
    workbook.save(filename)
    print(f'文件已保存: {filename}')
    return filename
# Script entry point: build the project-progress workbook.
if __name__ == "__main__":
    write_with_openpyxl()
三、读取 Excel 文件
3.1 使用 xlrd 读取 Excel
python
python
import xlrd
from pprint import pprint
def read_excel_with_xlrd(filepath):
    """Print a walkthrough of an Excel workbook read through xlrd.

    For every sheet this prints the dimensions, the first rows, the first
    column, the A1:C3 region with cell types, all rows (for sheets with fewer
    than 10 rows) and details of cell (2, 2). Finally it lists the merged
    regions of the first sheet.

    The workbook is opened with ``formatting_info=True`` because xlrd only
    fills ``Sheet.merged_cells`` in that mode. If the reader does not
    implement that flag for the given file, the workbook is re-opened
    without it. With formatting information loaded, formatted-but-empty
    cells are reported as blank cells; they are labelled the same as empty
    cells here.

    Args:
        filepath: Path of the workbook to read.

    Returns:
        None. All errors are caught and reported on stdout.
    """
    # Built once; maps xlrd cell type codes to the labels that are printed.
    type_names = {
        xlrd.XL_CELL_EMPTY: '空',
        xlrd.XL_CELL_BLANK: '空',
        xlrd.XL_CELL_TEXT: '文本',
        xlrd.XL_CELL_NUMBER: '数字',
        xlrd.XL_CELL_DATE: '日期',
        xlrd.XL_CELL_BOOLEAN: '布尔',
        xlrd.XL_CELL_ERROR: '错误',
    }

    try:
        # 1. Open the workbook, preferring formatting info for merged cells.
        try:
            workbook = xlrd.open_workbook(filepath, formatting_info=True)
        except NotImplementedError:
            workbook = xlrd.open_workbook(filepath)

        print("=" * 60)
        print(f"文件: {filepath}")
        print("=" * 60)

        # 2. Basic information.
        print(f"工作表数量: {workbook.nsheets}")
        print(f"工作表名称: {workbook.sheet_names()}")

        # 3. Walk every sheet.
        for sheet_name in workbook.sheet_names():
            worksheet = workbook.sheet_by_name(sheet_name)
            print("\n" + "-" * 60)
            print(f"工作表: {sheet_name}")
            print(f"行数: {worksheet.nrows}, 列数: {worksheet.ncols}")

            # 4. Several ways of reading the data.
            print("\n=== 读取方式示例 ===")

            # Way 1: row by row.
            print("1. 按行读取(前3行):")
            for i in range(min(3, worksheet.nrows)):
                row_data = worksheet.row_values(i)
                print(f" 第{i+1}行: {row_data}")

            # Way 2: by column.
            if worksheet.ncols > 0:
                print("\n2. 按列读取(第1列):")
                col_data = worksheet.col_values(0)
                print(f" 第1列(前5个值): {col_data[:5]}")

            # Way 3: a rectangular region, cell by cell.
            print("\n3. 读取区域 A1:C3:")
            for i in range(min(3, worksheet.nrows)):
                for j in range(min(3, worksheet.ncols)):
                    cell_value = worksheet.cell_value(i, j)
                    cell_type = worksheet.cell_type(i, j)
                    print(f" [{i+1},{j+1}]: {cell_value} ({type_names.get(cell_type, '未知')})")

            # Way 4: everything, but only for small sheets.
            if worksheet.nrows < 10:
                print("\n4. 所有数据:")
                for i in range(worksheet.nrows):
                    row = worksheet.row_values(i)
                    print(f" 第{i+1}行: {row}")

            # 5. Details of a single cell, including date conversion.
            print("\n5. 单元格详细信息:")
            if worksheet.nrows > 1 and worksheet.ncols > 1:
                cell = worksheet.cell(1, 1)
                print(" 单元格(2,2):")
                print(f" 值: {cell.value}")
                print(f" 类型: {cell.ctype}")
                if cell.ctype == xlrd.XL_CELL_DATE:
                    # Excel stores dates as floats; datemode selects the epoch.
                    date_tuple = xlrd.xldate_as_tuple(cell.value, workbook.datemode)
                    print(f" 日期: {date_tuple[0]}-{date_tuple[1]:02d}-{date_tuple[2]:02d}")

        # 6. Merged regions of the first sheet. The upper bounds reported by
        # xlrd are exclusive, hence only the lower bounds get +1.
        print("\n" + "=" * 60)
        print("合并单元格信息:")
        worksheet = workbook.sheet_by_index(0)
        merged_cells = worksheet.merged_cells
        if merged_cells:
            for rlo, rhi, clo, chi in merged_cells:
                print(f" 合并区域: 行{rlo+1}-{rhi}, 列{clo+1}-{chi}")
        else:
            print(" 无合并单元格")
    except FileNotFoundError:
        print(f"错误: 文件 {filepath} 不存在")
    except xlrd.XLRDError as e:
        print(f"错误: 读取Excel文件失败 - {e}")
    except Exception as e:
        print(f"错误: 发生未知错误 - {e}")
def analyze_excel_data(filepath):
    """Print cell-type counts and numeric statistics for the first sheet.

    Args:
        filepath: Path of the workbook to analyse. Errors from opening the
            file are not caught here and propagate to the caller.
    """
    worksheet = xlrd.open_workbook(filepath).sheet_by_index(0)

    print("\n" + "=" * 60)
    print("数据统计分析")
    print("=" * 60)

    # xlrd cell type code -> label used in the report.
    label_for_type = {
        xlrd.XL_CELL_TEXT: '文本',
        xlrd.XL_CELL_NUMBER: '数字',
        xlrd.XL_CELL_DATE: '日期',
        xlrd.XL_CELL_BOOLEAN: '布尔',
        xlrd.XL_CELL_EMPTY: '空',
        xlrd.XL_CELL_ERROR: '错误',
    }
    # Insertion order fixes the order in which the counts are printed.
    type_count = dict.fromkeys(['文本', '数字', '日期', '布尔', '空', '错误'], 0)
    numbers = []

    # One row-major pass collects both the type tally and the numeric values.
    for i in range(worksheet.nrows):
        for j in range(worksheet.ncols):
            cell_type = worksheet.cell_type(i, j)
            label = label_for_type.get(cell_type)
            if label is not None:
                type_count[label] += 1
            if cell_type == xlrd.XL_CELL_NUMBER:
                numbers.append(worksheet.cell_value(i, j))

    print("单元格类型统计:")
    for type_name, count in type_count.items():
        if count <= 0:
            continue
        percentage = (count / (worksheet.nrows * worksheet.ncols)) * 100
        print(f" {type_name}: {count}个 ({percentage:.1f}%)")

    if not numbers:
        return
    print(f"\n数值数据统计 (共{len(numbers)}个):")
    print(f" 最小值: {min(numbers):.2f}")
    print(f" 最大值: {max(numbers):.2f}")
    print(f" 平均值: {sum(numbers)/len(numbers):.2f}")
    print(f" 总和: {sum(numbers):.2f}")
# Script entry point: read the sample workbook, then analyse it.
if __name__ == "__main__":
    sample_path = 'test.xls'
    read_excel_with_xlrd(sample_path)
    analyze_excel_data(sample_path)
3.2 使用 pandas 读取 Excel(推荐)
python
python
import pandas as pd
import numpy as np
def read_excel_with_pandas(filepath):
    """Print an overview of every sheet of a workbook using pandas.

    The workbook is opened once as a ``pd.ExcelFile`` and every read reuses
    that handle, so the file is not re-opened for each sheet.

    Args:
        filepath: Path of the workbook to read.

    Returns:
        The open ``pd.ExcelFile`` on success (the caller owns it and may
        close it), or ``None`` when reading failed. On failure the handle is
        closed before returning.
    """
    excel_file = None
    try:
        # 1. Open the workbook and list its sheets.
        print("读取所有工作表...")
        excel_file = pd.ExcelFile(filepath)
        print(f"工作表列表: {excel_file.sheet_names}")

        # 2. Summarise each sheet.
        for sheet_name in excel_file.sheet_names:
            print(f"\n{'='*60}")
            print(f"工作表: {sheet_name}")

            df = excel_file.parse(sheet_name)

            print(f"形状: {df.shape[0]} 行 × {df.shape[1]} 列")
            print(f"列名: {list(df.columns)}")

            print("\n前5行数据:")
            print(df.head())

            print("\n数据类型:")
            print(df.dtypes)

            print("\n数值列统计信息:")
            print(df.describe())

            # Only columns that actually contain missing values are listed.
            missing_values = df.isnull().sum()
            if missing_values.sum() > 0:
                print("\n缺失值统计:")
                print(missing_values[missing_values > 0])

            # Low-cardinality columns also show their distinct values.
            print("\n唯一值数量:")
            for col in df.columns:
                unique_count = df[col].nunique()
                if unique_count < 10:
                    print(f" {col}: {unique_count} 个唯一值 - {df[col].unique()}")
                else:
                    print(f" {col}: {unique_count} 个唯一值")

        # 3. Read the first sheet again with more specific options.
        print("\n" + "="*60)
        print("使用高级参数读取:")
        # NOTE(review): skiprows=1 drops the first physical row, so the
        # header is taken from the second row, and usecols limits the columns
        # that are loaded. The columns named in dtype/parse_dates must still
        # exist after both; confirm against the real file layout.
        df_advanced = pd.read_excel(
            excel_file,
            sheet_name=0,
            skiprows=1,
            usecols="A:D,F",
            dtype={'年龄': np.int32, '薪资': np.float64},
            na_values=['N/A', 'NULL', '缺失'],
            thousands=',',
            parse_dates=['入职日期'],
        )
        print("使用高级参数读取的结果:")
        print(df_advanced.head())
        return excel_file
    except FileNotFoundError:
        print(f"错误: 文件 {filepath} 不存在")
    except Exception as e:
        # Do not leak the open workbook handle when a later step fails.
        if excel_file is not None:
            excel_file.close()
        print(f"错误: 读取文件失败 - {e}")
def excel_data_analysis(filepath):
    """Print a multi-section analysis report for the first sheet of a workbook.

    Sections: overview, dtype counts, missing values, numeric statistics,
    text value frequencies, correlations and IQR-based outliers. Any error is
    caught and reported on stdout.

    Args:
        filepath: Path of the workbook to analyse.
    """
    try:
        df = pd.read_excel(filepath, sheet_name=0)
        row_total = len(df)

        print("="*60)
        print("Excel 数据分析报告")
        print("="*60)

        # 1. Overview.
        print("\n1. 数据概览:")
        print(f"数据集大小: {df.shape[0]} 行 × {df.shape[1]} 列")
        print(f"内存使用: {df.memory_usage(deep=True).sum() / 1024:.2f} KB")

        # 2. How many columns there are per dtype.
        print("\n2. 数据类型分析:")
        for dtype, count in df.dtypes.value_counts().items():
            print(f" {dtype}: {count} 列")

        # 3. Missing values per column (only columns that have any).
        print("\n3. 缺失值分析:")
        null_counts = df.isnull().sum()
        missing_df = pd.DataFrame({
            '列名': df.columns,
            '缺失值数量': null_counts.values,
            '缺失值比例': (null_counts / row_total * 100).round(2)
        })
        missing_df = missing_df[missing_df['缺失值数量'] > 0]
        if len(missing_df) > 0:
            print(missing_df.to_string(index=False))
        else:
            print(" 无缺失值")

        # 4. Descriptive statistics of numeric columns.
        numeric_cols = df.select_dtypes(include=[np.number]).columns
        has_numeric = len(numeric_cols) > 0
        if has_numeric:
            print("\n4. 数值数据统计:")
            print(df[numeric_cols].describe().round(2))

        # 5. Most frequent values of object (text) columns.
        text_cols = df.select_dtypes(include=['object']).columns
        if len(text_cols) > 0:
            print("\n5. 文本数据统计:")
            for col in text_cols:
                unique_count = df[col].nunique()
                print(f" {col}:")
                print(f" 唯一值数量: {unique_count}")
                print(" 最常出现的5个值:")
                for value, count in df[col].value_counts().head(5).items():
                    percentage = (count / row_total) * 100
                    print(f" '{value}': {count}次 ({percentage:.1f}%)")

        # 6. Pairwise correlations; pairs above 0.7 in magnitude are listed.
        if len(numeric_cols) >= 2:
            print("\n6. 相关性分析:")
            correlation = df[numeric_cols].corr()
            print("相关性矩阵:")
            print(correlation.round(2))
            # Upper triangle only, so each pair is reported once.
            strong_corr = [
                (numeric_cols[i], numeric_cols[j], correlation.iloc[i, j])
                for i in range(len(numeric_cols))
                for j in range(i + 1, len(numeric_cols))
                if abs(correlation.iloc[i, j]) > 0.7
            ]
            if strong_corr:
                print("\n强相关性 (>0.7):")
                for col1, col2, corr in strong_corr:
                    print(f" {col1} 和 {col2}: {corr:.2f}")

        # 7. Outliers: values outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR].
        if has_numeric:
            print("\n7. 异常值检测 (使用IQR方法):")
            for col in numeric_cols:
                series = df[col]
                Q1, Q3 = series.quantile(0.25), series.quantile(0.75)
                IQR = Q3 - Q1
                lower_bound = Q1 - 1.5 * IQR
                upper_bound = Q3 + 1.5 * IQR
                outliers = df[(series < lower_bound) | (series > upper_bound)]
                if len(outliers) > 0:
                    print(f" {col}: {len(outliers)} 个异常值 (范围: [{lower_bound:.2f}, {upper_bound:.2f}])")
                    print(f" 异常值示例: {outliers[col].head(3).tolist()}")

        print("\n" + "="*60)
        print("分析完成")
    except Exception as e:
        print(f"分析过程中出错: {e}")
# Script entry point: read the sample workbook, then analyse it.
if __name__ == "__main__":
    sample_path = 'test.xlsx'
    read_excel_with_pandas(sample_path)
    excel_data_analysis(sample_path)
四、修改和更新 Excel 文件
4.1 使用 xlutils 修改现有 Excel 文件
python
python
import xlrd
import xlwt
from xlutils.copy import copy
import os
def modify_excel_with_xlutils(original_file, output_file=None):
    """Append statistics and sample records to an ``.xls`` file via xlutils.

    The workbook is read with formatting information, copied into a writable
    xlwt workbook, extended below the existing rows of the first sheet and
    saved under a new name.

    Args:
        original_file: Path of the existing workbook.
        output_file: Target path. Defaults to the original name with
            ``_修改后`` inserted before the extension.

    Returns:
        The path that was written, or ``None`` when the input file is missing
        or any step failed (the error is printed).
    """
    from datetime import datetime

    if not os.path.exists(original_file):
        print(f"错误: 文件 {original_file} 不存在")
        return

    try:
        # 1. Read the original; formatting_info=True lets the copy keep styles.
        print(f"正在读取文件: {original_file}")
        original_wb = xlrd.open_workbook(original_file, formatting_info=True)

        # 2. Writable copy of the whole workbook.
        modified_wb = copy(original_wb)

        # 3. First sheet, in its read-only and writable forms.
        original_ws = original_wb.sheet_by_index(0)
        modified_ws = modified_wb.get_sheet(0)
        print(f"原始数据: {original_ws.nrows} 行 × {original_ws.ncols} 列")

        # 4. Style for the appended separator and statistics cells.
        new_font = xlwt.Font()
        new_font.name = '宋体'
        new_font.height = 220  # 11 pt
        new_font.bold = True
        new_font.colour_index = 2  # red

        new_alignment = xlwt.Alignment()
        new_alignment.horz = xlwt.Alignment.HORZ_CENTER
        new_alignment.vert = xlwt.Alignment.VERT_CENTER

        new_borders = xlwt.Borders()
        new_borders.left = xlwt.Borders.THIN
        new_borders.right = xlwt.Borders.THIN
        new_borders.top = xlwt.Borders.THIN
        new_borders.bottom = xlwt.Borders.THIN

        new_style = xlwt.XFStyle()
        new_style.font = new_font
        new_style.alignment = new_alignment
        new_style.borders = new_borders

        # 5. Separator row, then the statistics row.
        new_row_start = original_ws.nrows
        modified_ws.write(new_row_start, 0, "=" * 50, new_style)
        stat_row = new_row_start + 1
        modified_ws.write(stat_row, 0, "统计信息", new_style)

        # Average of the third column (assumed to hold ages), skipping the
        # header row. Only cells xlrd classifies as numbers are used, so
        # text, date and boolean cells are ignored without needing a
        # catch-all exception handler.
        if original_ws.ncols >= 3:
            ages = [
                original_ws.cell_value(row, 2)
                for row in range(1, original_ws.nrows)
                if original_ws.cell_type(row, 2) == xlrd.XL_CELL_NUMBER
            ]
            if ages:
                avg_age = sum(ages) / len(ages)
                modified_ws.write(stat_row, 1, f"平均年龄: {avg_age:.1f}", new_style)
                print(f"计算平均年龄: {avg_age:.1f} (基于 {len(ages)} 个有效数据)")

        # 6. Append the sample records (default cell style).
        new_data_row = stat_row + 1
        new_employees = [
            ['006', '孙八', 27, '财务部', '2022-09-18', 17000],
            ['007', '周九', 31, '技术部', '2021-07-25', 19000],
        ]
        for row, employee in enumerate(new_employees, start=new_data_row):
            for col, value in enumerate(employee):
                modified_ws.write(row, col, value)
        print(f"添加了 {len(new_employees)} 条新记录")

        # 7. Timestamp row.
        summary_row = new_data_row + len(new_employees)
        modified_ws.write(summary_row, 0, "数据更新时间:")
        timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        modified_ws.write(summary_row, 1, timestamp)

        # 8. Save next to the original unless a target was given.
        if output_file is None:
            filename, ext = os.path.splitext(original_file)
            output_file = f"{filename}_修改后{ext}"
        modified_wb.save(output_file)
        print(f"文件已保存: {output_file}")
        print(f"总行数: {summary_row + 1}")
        return output_file
    except Exception as e:
        print(f"修改文件时出错: {e}")
def batch_process_excel_files(folder_path, pattern="*.xls"):
    """Run ``modify_excel_with_xlutils`` on every matching file in a folder.

    Each input ``name.ext`` is written to ``name_processed.ext`` in the same
    folder. Files whose name already ends in ``_processed`` are skipped, so
    running the batch again does not reprocess its own earlier output. Files
    are handled in sorted order.

    Args:
        folder_path: Folder to scan.
        pattern: Glob pattern of the files to process.

    Returns:
        List of output paths that were written, or ``None`` when the folder
        does not exist or contains no file to process.
    """
    import glob

    if not os.path.exists(folder_path):
        print(f"错误: 文件夹 {folder_path} 不存在")
        return

    processed_suffix = "_processed"
    # Exclude outputs of earlier runs: they match the same pattern and would
    # otherwise be processed again into *_processed_processed files.
    excel_files = sorted(
        path
        for path in glob.glob(os.path.join(folder_path, pattern))
        if not os.path.splitext(path)[0].endswith(processed_suffix)
    )
    if not excel_files:
        print(f"在 {folder_path} 中没有找到 {pattern} 文件")
        return

    print(f"找到 {len(excel_files)} 个Excel文件:")
    processed_files = []
    for file_path in excel_files:
        print(f"\n处理文件: {os.path.basename(file_path)}")
        filename, ext = os.path.splitext(file_path)
        output_file = f"{filename}{processed_suffix}{ext}"
        try:
            result = modify_excel_with_xlutils(file_path, output_file)
        except Exception as e:
            print(f" 处理失败: {e}")
            continue
        # The helper returns None when it could not write the file.
        if result:
            processed_files.append(result)

    print(f"\n处理完成! 共处理 {len(processed_files)} 个文件")
    return processed_files
# Script entry point.
if __name__ == "__main__":
    # Modify a single workbook.
    modify_excel_with_xlutils("test.xls")
    # Batch processing (enable once the folder exists):
    # batch_process_excel_files('./excel_files/')
4.2 使用 openpyxl 修改现有 Excel 文件
python
python
import openpyxl
from openpyxl.styles import Font, PatternFill, Alignment, Border, Side
from openpyxl.utils import get_column_letter
from datetime import datetime
import os
def modify_excel_with_openpyxl(input_file, output_file=None):
    """Append statistics, sample records and a timestamp to an ``.xlsx`` file.

    Numeric columns are detected by sampling the data rows from row 2 up to
    row 100 (inclusive of the last row); a column qualifies when the sample
    holds more than 10 numeric values. For each such column a statistics row
    is written below the existing data.

    Args:
        input_file: Path of the existing workbook.
        output_file: Target path. Defaults to the input name with
            ``_modified_<timestamp>`` inserted before the extension.

    Returns:
        The path that was written, or ``None`` when the input file is missing
        or any step failed (the error and traceback are printed).
    """
    if not os.path.exists(input_file):
        print(f"错误: 文件 {input_file} 不存在")
        return

    try:
        # 1. Load the workbook and take the active sheet.
        print(f"正在加载文件: {input_file}")
        workbook = openpyxl.load_workbook(input_file)
        worksheet = workbook.active
        original_rows = worksheet.max_row
        original_cols = worksheet.max_column
        print(f"原始数据: {original_rows} 行 × {original_cols} 列")

        def numeric_values(col_letter, last_row):
            """Return the int/float values of a column from row 2 to last_row."""
            cells = (
                worksheet[f"{col_letter}{row}"].value
                for row in range(2, last_row + 1)
            )
            return [value for value in cells if isinstance(value, (int, float))]

        # 2. Detect numeric columns from a bounded sample of rows.
        print("\n分析现有数据...")
        sample_last_row = min(original_rows, 100)
        numeric_columns = []
        for col in range(1, original_cols + 1):
            col_letter = get_column_letter(col)
            column_values = numeric_values(col_letter, sample_last_row)
            if len(column_values) > 10:
                numeric_columns.append(col)
                print(f" 列{col_letter}: 包含数值数据 ({len(column_values)}个)")

        # 3. Statistics block: separator, title, then one row per column.
        print("\n添加统计数据...")
        stat_start_row = original_rows + 2
        worksheet.cell(row=stat_start_row, column=1, value="=" * 50)
        stat_title_row = stat_start_row + 1
        worksheet.cell(row=stat_title_row, column=1, value="数据统计").font = Font(bold=True)

        for col_idx, col in enumerate(numeric_columns, start=1):
            col_letter = get_column_letter(col)
            # Fall back to a generated title when the header cell is empty.
            col_title = worksheet[f"{col_letter}1"].value or f"列{col_letter}"

            values = numeric_values(col_letter, original_rows)
            if not values:
                continue
            count = len(values)
            total = sum(values)
            avg = total / count
            minimum = min(values)
            maximum = max(values)

            stat_data_row = stat_title_row + col_idx
            worksheet.cell(row=stat_data_row, column=1, value=col_title).font = Font(bold=True)
            worksheet.cell(row=stat_data_row, column=2, value=f"数据量: {count}")
            worksheet.cell(row=stat_data_row, column=3, value=f"平均值: {avg:.2f}")
            worksheet.cell(row=stat_data_row, column=4, value=f"最小值: {minimum}")
            worksheet.cell(row=stat_data_row, column=5, value=f"最大值: {maximum}")
            worksheet.cell(row=stat_data_row, column=6, value=f"总和: {total:.2f}")
            print(f" {col_title}: 平均值={avg:.2f}, 范围=[{minimum}, {maximum}]")

        # 4. Sample records, one blank row below the statistics block.
        print("\n添加新数据...")
        new_data_start_row = stat_start_row + len(numeric_columns) + 3
        worksheet.cell(row=new_data_start_row, column=1, value="新增数据").font = Font(bold=True, color='FF0000')
        new_data = [
            ['新员工1', '技术部', 28, 15000, '2023-03-15'],
            ['新员工2', '市场部', 32, 18000, '2023-04-01'],
            ['新员工3', '销售部', 25, 12000, '2023-05-20'],
        ]
        for row, data_row in enumerate(new_data, start=new_data_start_row + 1):
            for column, value in enumerate(data_row, start=1):
                worksheet.cell(row=row, column=column, value=value)
        print(f"添加了 {len(new_data)} 条新记录")

        # 5. Timestamp, one blank row below the sample records.
        timestamp_row = new_data_start_row + len(new_data) + 2
        worksheet.cell(row=timestamp_row, column=1, value="最后更新时间:")
        worksheet.cell(row=timestamp_row, column=2, value=datetime.now().strftime("%Y-%m-%d %H:%M:%S"))

        # 6. Fit column widths to the longest non-empty value, capped at 50.
        for column in worksheet.columns:
            column_letter = get_column_letter(column[0].column)
            max_length = max(
                (len(str(cell.value)) for cell in column if cell.value),
                default=0,
            )
            worksheet.column_dimensions[column_letter].width = min(max_length + 2, 50)

        # 7. Save next to the input unless a target was given.
        if output_file is None:
            filename, ext = os.path.splitext(input_file)
            output_file = f"{filename}_modified_{datetime.now().strftime('%Y%m%d_%H%M%S')}{ext}"
        workbook.save(output_file)
        print(f"\n文件已保存: {output_file}")
        print(f"最终数据: {worksheet.max_row} 行 × {worksheet.max_column} 列")
        return output_file
    except Exception as e:
        print(f"修改文件时出错: {e}")
        import traceback
        traceback.print_exc()
def create_excel_report(data_file):
    """Generate a three-sheet analysis report workbook for a data file.

    Sheets: an overview (file name, time, row/column counts), a copy of the
    data, and the ``describe()`` statistics when numeric columns exist.
    Missing values (NaN/NaT) are written as empty cells, and data rows are
    placed by position rather than by DataFrame index label.

    Args:
        data_file: Path of the Excel file to analyse.

    Returns:
        Name of the timestamped report file, or ``None`` when any step failed
        (the error is printed).
    """
    try:
        import pandas as pd

        def cell_value(value):
            """Map pandas missing values to None so the cell stays empty."""
            return None if pd.isna(value) else value

        df = pd.read_excel(data_file)
        workbook = openpyxl.Workbook()

        # 1. Overview sheet.
        ws_overview = workbook.active
        ws_overview.title = "数据概览"
        ws_overview['A1'] = "数据文件分析报告"
        ws_overview['A1'].font = Font(size=14, bold=True)
        ws_overview['A3'] = f"数据文件: {os.path.basename(data_file)}"
        ws_overview['A4'] = f"分析时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}"
        ws_overview['A5'] = f"总行数: {len(df)}"
        ws_overview['A6'] = f"总列数: {len(df.columns)}"

        # 2. Data sheet: bold header row, then the records from row 2 on.
        ws_details = workbook.create_sheet("数据详情")
        for col_idx, column in enumerate(df.columns, start=1):
            ws_details.cell(row=1, column=col_idx, value=column).font = Font(bold=True)
        for row_idx, record in enumerate(df.itertuples(index=False), start=2):
            for col_idx, value in enumerate(record, start=1):
                ws_details.cell(row=row_idx, column=col_idx, value=cell_value(value))

        # 3. Statistics sheet (filled only when numeric columns exist).
        ws_stats = workbook.create_sheet("统计信息")
        if len(df.select_dtypes(include=['number']).columns) > 0:
            numeric_stats = df.describe()
            ws_stats['A1'] = "数值数据统计"
            ws_stats['A1'].font = Font(bold=True)

            # Column names in row 2, one statistic per row from row 3 on.
            for j, col_name in enumerate(numeric_stats.columns, start=2):
                ws_stats.cell(row=2, column=j, value=col_name).font = Font(bold=True)
            for i, (stat_name, values) in enumerate(numeric_stats.iterrows(), start=3):
                ws_stats.cell(row=i, column=1, value=stat_name)
                for j, value in enumerate(values, start=2):
                    ws_stats.cell(row=i, column=j, value=cell_value(value))

        report_file = f"数据分析报告_{datetime.now().strftime('%Y%m%d_%H%M%S')}.xlsx"
        workbook.save(report_file)
        print(f"分析报告已生成: {report_file}")
        return report_file
    except Exception as e:
        print(f"生成报告时出错: {e}")
# Script entry point: modify the sample workbook, then report on the result.
if __name__ == "__main__":
    modified_file = modify_excel_with_openpyxl("test.xlsx")
    # The report is only generated when the modification step wrote a file.
    if modified_file:
        create_excel_report(modified_file)