批量处理文件
1.1 批量读取Excel文件
Python操作:
python
import pandas as pd
import os
from pathlib import Path
# Read every Excel workbook in the folder.
folder_path = 'data'
# glob() yields paths in arbitrary order; sort them so the merged row order
# is the same on every run.
all_files = sorted(Path(folder_path).glob('*.xlsx'))
# Load each workbook and tag its rows with the file they came from.
df_list = []
for file in all_files:
    df = pd.read_excel(file)
    df['来源文件'] = file.name
    df_list.append(df)
# Merge into a single DataFrame. pd.concat raises ValueError on an empty
# list, so the merge is skipped when the folder holds no .xlsx files.
if df_list:
    combined_df = pd.concat(df_list, ignore_index=True)
    combined_df.to_excel('合并结果.xlsx', index=False)
print(f'共处理 {len(df_list)} 个文件')
1.2 批量转换格式
Excel转CSV:
python
import pandas as pd
from pathlib import Path
# Source and destination directories for the Excel -> CSV conversion.
folder_path = 'excel_files'
output_folder = 'csv_files'
source_dir = Path(folder_path)
target_dir = Path(output_folder)
target_dir.mkdir(exist_ok=True)
for excel_file in source_dir.glob('*.xlsx'):
    df = pd.read_excel(excel_file)
    # Same base name, .csv extension, placed in the output directory.
    csv_file = target_dir / f'{excel_file.stem}.csv'
    # utf-8-sig prepends a UTF-8 byte-order mark to the file.
    df.to_csv(csv_file, index=False, encoding='utf-8-sig')
    print(f'已转换: {excel_file.name} -> {csv_file.name}')
CSV转Excel:
python
import pandas as pd
from pathlib import Path
# Source and destination directories for the CSV -> Excel conversion.
folder_path = 'csv_files'
output_folder = 'excel_files'
source_dir = Path(folder_path)
target_dir = Path(output_folder)
target_dir.mkdir(exist_ok=True)
for csv_file in source_dir.glob('*.csv'):
    df = pd.read_csv(csv_file)
    # Same base name, .xlsx extension, placed in the output directory.
    excel_file = target_dir / f'{csv_file.stem}.xlsx'
    df.to_excel(excel_file, index=False)
    print(f'已转换: {csv_file.name} -> {excel_file.name}')
1.3 批量修改文件
批量添加列:
python
import pandas as pd
from pathlib import Path
from datetime import datetime
folder_path = 'data'
for excel_file in Path(folder_path).glob('*.xlsx'):
    df = pd.read_excel(excel_file)
    # Stamp every row with the moment this file was processed.
    df['处理时间'] = datetime.now()
    # Derive the total price only when both input columns are present.
    has_price_inputs = {'数量', '单价'}.issubset(df.columns)
    if has_price_inputs:
        df['总价'] = df['数量'] * df['单价']
    # Write the result back over the original workbook.
    df.to_excel(excel_file, index=False)
    print(f'已处理: {excel_file.name}')
1.4 批量拆分文件
按列值拆分:
python
import pandas as pd
from pathlib import Path
df = pd.read_excel('总数据.xlsx')
# Split by department, one workbook per department.
output_folder = Path('按部门拆分')
output_folder.mkdir(exist_ok=True)
# groupby partitions the rows in a single pass and leaves out rows whose
# department is missing. Filtering on each unique() value would also visit
# NaN, which never compares equal to anything and so yields an empty
# 'nan.xlsx'. sort=False keeps departments in order of first appearance.
for dept, dept_df in df.groupby('部门', sort=False):
    output_file = output_folder / f'{dept}.xlsx'
    dept_df.to_excel(output_file, index=False)
    print(f'已生成: {output_file.name}, 共 {len(dept_df)} 条记录')
按行数拆分:
python
import pandas as pd
from pathlib import Path
df = pd.read_excel('大文件.xlsx')
# Number of rows written to each output file.
chunk_size = 1000
output_folder = Path('拆分文件')
output_folder.mkdir(exist_ok=True)
# Walk the frame in chunk_size steps; part numbers start at 1.
chunk_starts = range(0, len(df), chunk_size)
for part_no, start in enumerate(chunk_starts, start=1):
    chunk_df = df.iloc[start:start + chunk_size]
    output_file = output_folder / f'part_{part_no}.xlsx'
    chunk_df.to_excel(output_file, index=False)
    print(f'已生成: {output_file.name}')
自动化报表生成
2.1 生成格式化报表
Python操作:
python
import pandas as pd
from openpyxl import load_workbook
from openpyxl.styles import Font, PatternFill, Alignment, Border, Side
import unicodedata

# Load the raw data.
df = pd.read_excel('原始数据.xlsx')
# Aggregate per department: summed sales, row count of orders, and number
# of distinct customers.
summary = df.groupby('部门').agg({
    '销售额': 'sum',
    '订单数': 'count',
    '客户数': 'nunique'
}).reset_index()
# Write the summary to Excel.
output_file = '销售报表.xlsx'
summary.to_excel(output_file, index=False, sheet_name='汇总')
# Re-open the workbook with openpyxl to apply formatting.
wb = load_workbook(output_file)
ws = wb['汇总']
# Header row: white bold text on a blue fill, centred.
header_fill = PatternFill(start_color='4472C4', fill_type='solid')
header_font = Font(color='FFFFFF', bold=True)
for cell in ws[1]:
    cell.fill = header_fill
    cell.font = header_font
    cell.alignment = Alignment(horizontal='center')
# Thin border around every cell in the used range.
thin_border = Border(
    left=Side(style='thin'),
    right=Side(style='thin'),
    top=Side(style='thin'),
    bottom=Side(style='thin')
)
for row in ws.iter_rows(min_row=1, max_row=ws.max_row, min_col=1, max_col=ws.max_column):
    for cell in row:
        cell.border = thin_border
# Size each column to its widest cell plus two units of padding.
for column in ws.columns:
    max_length = 0
    column_letter = column[0].column_letter
    for cell in column:
        # Compare against None so that values such as 0 are still measured.
        if cell.value is None:
            continue
        text = str(cell.value)
        # East Asian wide/fullwidth characters render roughly twice as wide
        # as Latin ones, so count each of them as two units.
        wide_chars = sum(
            unicodedata.east_asian_width(ch) in ('W', 'F') for ch in text
        )
        max_length = max(max_length, len(text) + wide_chars)
    ws.column_dimensions[column_letter].width = max_length + 2
wb.save(output_file)
print(f'报表已生成: {output_file}')
2.2 生成多工作表报表
Python操作:
python
import pandas as pd
from datetime import datetime
df = pd.read_excel('销售数据.xlsx')
# Output workbook name carries today's date.
output_file = f'销售报表_{datetime.now().strftime("%Y%m%d")}.xlsx'
# Excel rejects these characters in worksheet names; map each one to '_'.
invalid_sheet_chars = str.maketrans({ch: '_' for ch in '[]:*?/\\'})
with pd.ExcelWriter(output_file, engine='openpyxl') as writer:
    # Overview sheet: total sales per department.
    summary = df.groupby('部门')['销售额'].sum().reset_index()
    summary.to_excel(writer, sheet_name='总览', index=False)
    # One sheet per department. groupby leaves out rows with a missing
    # department; sort=False keeps the order of first appearance.
    for dept, dept_df in df.groupby('部门', sort=False):
        # Sheet names must be text, free of the characters above, and at
        # most 31 characters long.
        sheet_name = str(dept).translate(invalid_sheet_chars)[:31]
        dept_df.to_excel(writer, sheet_name=sheet_name, index=False)
    # Monthly trend: total sales per calendar month.
    df['日期'] = pd.to_datetime(df['日期'])
    trend = df.groupby(df['日期'].dt.to_period('M'))['销售额'].sum().reset_index()
    # Write the month as text such as '2024-01'; a pandas Period is not a
    # native Excel cell type.
    trend['日期'] = trend['日期'].astype(str)
    trend.to_excel(writer, sheet_name='月度趋势', index=False)
print(f'多工作表报表已生成: {output_file}')
2.3 生成带图表的报表
Python操作:
python
import pandas as pd
from openpyxl import load_workbook
from openpyxl.chart import BarChart, Reference
# Prepare the data: total sales per product.
df = pd.read_excel('销售数据.xlsx')
summary = df.groupby('产品')['销售额'].sum().reset_index()
# Write the data sheet.
output_file = '销售报表_带图表.xlsx'
summary.to_excel(output_file, index=False, sheet_name='数据')
# Re-open the workbook and add a bar chart next to the data.
wb = load_workbook(output_file)
ws = wb['数据']
chart = BarChart()
chart.title = "产品销售额"
chart.x_axis.title = "产品"
# Axis title restored from mis-encoded bytes in the original text.
chart.y_axis.title = "销售额"
# Values come from column 2 and include the header row, which
# titles_from_data=True uses as the series name.
data = Reference(ws, min_col=2, min_row=1, max_row=ws.max_row)
# Category labels come from column 1, starting below the header.
cats = Reference(ws, min_col=1, min_row=2, max_row=ws.max_row)
chart.add_data(data, titles_from_data=True)
chart.set_categories(cats)
# Anchor the chart's top-left corner at cell D2.
ws.add_chart(chart, "D2")
wb.save(output_file)
print(f'带图表的报表已生成: {output_file}')
定时任务
3.1 使用schedule库
Python操作:
python
import schedule
import time
import pandas as pd
from datetime import datetime
def generate_daily_report():
    """Summarise quantities per category and save a date-stamped report.

    Reads '实时数据.xlsx', sums the '数量' column for each '类别' value and
    writes the result to '日报_<YYYYMMDD>.xlsx'.
    """
    source = pd.read_excel('实时数据.xlsx')
    # Total quantity per category, with the category back as a column.
    totals = source.groupby('类别')['数量'].sum()
    summary = totals.reset_index()
    # Date-stamped output file name.
    stamp = datetime.now().strftime("%Y%m%d")
    output_file = f'日报_{stamp}.xlsx'
    summary.to_excel(output_file, index=False)
    print(f'{datetime.now()}: 日报已生成 - {output_file}')
# Register the report on three schedules, in this order:
# every day at 09:00, every hour, and every Monday at 09:00.
report_triggers = (
    schedule.every().day.at("09:00"),
    schedule.every().hour,
    schedule.every().monday.at("09:00"),
)
for trigger in report_triggers:
    trigger.do(generate_daily_report)
print('定时任务已启动...')
# Check for due jobs once a minute, forever.
poll_seconds = 60
while True:
    schedule.run_pending()
    time.sleep(poll_seconds)
3.2 使用Windows任务计划程序
创建批处理文件 (run_report.bat):
batch
@echo off
rem Run the report generator from its working directory.
rem There is no "pause" here: under Task Scheduler nobody is present to press
rem a key, so the task would never finish. The script's exit code is passed
rem on so a failed run is reported as a failure.
cd /d D:\Reports || exit /b 1
python generate_report.py
exit /b %ERRORLEVEL%
Python脚本 (generate_report.py):
python
import pandas as pd
from datetime import datetime
import logging
# Configure logging: timestamped messages at INFO and above go to
# report_log.txt.
logging.basicConfig(
    filename='report_log.txt',
    level=logging.INFO,
    format='%(asctime)s - %(message)s'
)
try:
    # Build the report: total sales per department.
    df = pd.read_excel('数据源.xlsx')
    summary = df.groupby('部门')['销售额'].sum().reset_index()
    # The timestamp in the name keeps separate runs from overwriting each other.
    output_file = f'报表_{datetime.now().strftime("%Y%m%d_%H%M%S")}.xlsx'
    summary.to_excel(output_file, index=False)
    logging.info(f'报表生成成功: {output_file}')
    print(f'报表生成成功: {output_file}')
except Exception as e:
    # Top-level boundary of an unattended script: record the failure and
    # continue to a normal exit. logging.exception logs at ERROR level and
    # appends the traceback, which a bare str(e) would lose.
    logging.exception(f'报表生成失败: {str(e)}')
    print(f'报表生成失败: {str(e)}')
设置Windows任务计划:
- 打开"任务计划程序"
- 创建基本任务
- 设置触发器(每天、每周等)
- 操作:启动程序 → 选择 run_report.bat
- 完成设置
邮件自动发送
4.1 发送Excel附件
Python操作:
python
import smtplib
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText
from email.mime.base import MIMEBase
from email import encoders
import pandas as pd
from datetime import datetime
def send_report_email(excel_file, recipients):
    """Email an Excel report as an attachment.

    Args:
        excel_file: Path of the workbook to attach.
        recipients: Iterable of recipient email addresses.

    Errors raised while connecting or sending are caught and printed rather
    than propagated. A failure to open ``excel_file`` is not caught.
    """
    from pathlib import Path

    # Mail settings (placeholder values).
    sender_email = "your_email@example.com"
    sender_password = "your_password"
    smtp_server = "smtp.example.com"
    smtp_port = 587
    # Build the message envelope.
    msg = MIMEMultipart()
    msg['From'] = sender_email
    msg['To'] = ', '.join(recipients)
    msg['Subject'] = f'销售报表 - {datetime.now().strftime("%Y-%m-%d")}'
    # Plain-text body.
    body = (
        "\n"
        "您好,\n"
        "附件是今日的销售报表,请查收。\n"
        "此邮件由系统自动发送,请勿回复。\n"
    )
    msg.attach(MIMEText(body, 'plain', 'utf-8'))
    # Attach the workbook as a base64-encoded binary part.
    with open(excel_file, 'rb') as f:
        part = MIMEBase('application', 'octet-stream')
        part.set_payload(f.read())
    encoders.encode_base64(part)
    # Pass the file name as a header parameter so the email package quotes
    # and encodes it (non-ASCII names, spaces); use only the base name so
    # local directory paths are not sent to recipients.
    part.add_header(
        'Content-Disposition',
        'attachment',
        filename=Path(excel_file).name,
    )
    msg.attach(part)
    # Send. The context manager closes the connection even when
    # starttls/login/send raises.
    try:
        with smtplib.SMTP(smtp_server, smtp_port) as server:
            server.starttls()
            server.login(sender_email, sender_password)
            server.send_message(msg)
        print(f'邮件发送成功: {excel_file}')
    except Exception as e:
        print(f'邮件发送失败: {str(e)}')
# Usage example: send the sales report workbook to two recipients.
excel_file = '销售报表.xlsx'
recipients = ['user1@example.com', 'user2@example.com']
send_report_email(excel_file, recipients)
4.2 发送HTML格式邮件
Python操作:
python
import smtplib
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText
import pandas as pd
def send_html_report(df, recipients):
    """Email a DataFrame rendered as a styled HTML table.

    Args:
        df: DataFrame to render into the message body.
        recipients: Iterable of recipient email addresses.

    Raises:
        Any exception raised while connecting, authenticating or sending;
        nothing is caught here.
    """
    # Mail settings (placeholder values).
    sender_email = "your_email@example.com"
    sender_password = "your_password"
    msg = MIMEMultipart('alternative')
    msg['Subject'] = '销售数据报表'
    msg['From'] = sender_email
    msg['To'] = ', '.join(recipients)
    # Render the DataFrame as an HTML table.
    html_table = df.to_html(index=False, border=1)
    # Doubled braces are literal CSS braces inside the f-string.
    html_body = f"""
<html>
<head>
<style>
table {{ border-collapse: collapse; width: 100%; }}
th {{ background-color: #4472C4; color: white; padding: 8px; }}
td {{ padding: 8px; border: 1px solid #ddd; }}
tr:nth-child(even) {{ background-color: #f2f2f2; }}
</style>
</head>
<body>
<h2>销售数据报表</h2>
{html_table}
</body>
</html>
"""
    msg.attach(MIMEText(html_body, 'html', 'utf-8'))
    # Send. The context manager closes the connection even when
    # starttls/login/send raises, which a trailing quit() call would not.
    with smtplib.SMTP('smtp.example.com', 587) as server:
        server.starttls()
        server.login(sender_email, sender_password)
        server.send_message(msg)
    print('HTML邮件发送成功')
# Usage example: load the sales data and email it as an HTML table.
df = pd.read_excel('销售数据.xlsx')
recipients = ['user@example.com']
send_html_report(df, recipients)
数据库集成
5.1 从数据库读取到Excel
Python操作:
python
import pandas as pd
import sqlite3
# SQLite example: export query results to Excel.
conn = sqlite3.connect('database.db')
try:
    query = "SELECT * FROM sales WHERE date >= '2024-01-01'"
    df = pd.read_sql_query(query, conn)
finally:
    # Close the connection even when the query fails.
    conn.close()
df.to_excel('数据库导出.xlsx', index=False)
print(f'已导出 {len(df)} 条记录')
MySQL示例:
python
import pandas as pd
import pymysql
# Connect to MySQL (placeholder credentials).
conn = pymysql.connect(
    host='localhost',
    user='username',
    password='password',
    database='mydb'
)
try:
    query = "SELECT * FROM sales"
    df = pd.read_sql(query, conn)
finally:
    # Close the connection even when the query fails.
    conn.close()
df.to_excel('MySQL导出.xlsx', index=False)
5.2 从Excel导入到数据库
Python操作:
python
import pandas as pd
import sqlite3
# Read the Excel workbook.
df = pd.read_excel('数据.xlsx')
# Import into SQLite. if_exists='replace' drops any existing 'sales' table
# before writing the new rows.
conn = sqlite3.connect('database.db')
try:
    df.to_sql('sales', conn, if_exists='replace', index=False)
finally:
    # Close the connection even when the import fails.
    conn.close()
print(f'已导入 {len(df)} 条记录到数据库')
Web数据抓取
6.1 抓取网页表格
Python操作:
python
import pandas as pd
import requests
from bs4 import BeautifulSoup
from io import StringIO

url = 'https://example.com/data'
# A timeout keeps the script from waiting forever on an unresponsive server.
response = requests.get(url, timeout=30)
# Stop on an HTTP error status instead of parsing an error page.
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')
# Locate the first table on the page.
table = soup.find('table')
if table is None:
    raise ValueError(f'页面中未找到表格: {url}')
# Convert to a DataFrame. The markup is wrapped in StringIO because passing
# literal HTML as a plain string to read_html is deprecated in recent pandas.
df = pd.read_html(StringIO(str(table)))[0]
# Save to Excel.
df.to_excel('网页数据.xlsx', index=False)
print('网页数据已保存')
6.2 API数据获取
Python操作:
python
import pandas as pd
import requests
# Call the API (placeholder URL and token).
url = 'https://api.example.com/data'
headers = {'Authorization': 'Bearer YOUR_TOKEN'}
# A timeout keeps the script from waiting forever on an unresponsive server.
response = requests.get(url, headers=headers, timeout=30)
# Stop on an HTTP error status instead of trying to decode an error body.
response.raise_for_status()
# Parse the JSON payload; the records are read from its 'results' key.
data = response.json()
df = pd.DataFrame(data['results'])
# Save to Excel.
df.to_excel('API数据.xlsx', index=False)
print(f'已获取 {len(df)} 条记录')
完整自动化示例
综合案例:每日销售报表自动化
Python操作:
python
import pandas as pd
import schedule
import time
from datetime import datetime
from openpyxl import load_workbook
from openpyxl.styles import Font, PatternFill
import smtplib
from email.mime.multipart import MIMEMultipart
from email.mime.base import MIMEBase
from email import encoders
def generate_and_send_report():
    """Build today's per-product sales report, format it and hand it to send_email.

    Any exception raised along the way is caught and printed with a
    timestamp; nothing is propagated to the caller.
    """
    try:
        # 1. Load the sales data.
        sales = pd.read_excel('销售数据.xlsx')
        # 2. Keep today's rows and total sales and quantity per product.
        today = datetime.now().date()
        sales['日期'] = pd.to_datetime(sales['日期']).dt.date
        is_today = sales['日期'] == today
        summary = (
            sales[is_today]
            .groupby('产品')
            .agg({'销售额': 'sum', '数量': 'sum'})
            .reset_index()
        )
        # 3. Write the date-stamped workbook.
        stamp = datetime.now().strftime("%Y%m%d")
        output_file = f'日报_{stamp}.xlsx'
        summary.to_excel(output_file, index=False)
        # 4. Style the header row: white bold text on a blue fill.
        wb = load_workbook(output_file)
        for header_cell in wb.active[1]:
            header_cell.fill = PatternFill(start_color='4472C4', fill_type='solid')
            header_cell.font = Font(color='FFFFFF', bold=True)
        wb.save(output_file)
        # 5. Send the report.
        send_email(output_file)
        print(f'{datetime.now()}: 报表生成并发送成功')
    except Exception as e:
        print(f'{datetime.now()}: 错误 - {str(e)}')
def send_email(file_path):
    """Send the file at *file_path* by email.

    Placeholder: the sending logic is not implemented, so the call does
    nothing and returns None.
    """
    # Email delivery logic goes here.
    return None
# Schedule the job to run every day at 09:00.
daily_trigger = schedule.every().day.at("09:00")
daily_trigger.do(generate_and_send_report)
print('自动化任务已启动...')
# Check for due jobs once a minute, forever.
poll_seconds = 60
while True:
    schedule.run_pending()
    time.sleep(poll_seconds)