Python 文件操作
目录
- 文件基础
- 文本文件操作
- 二进制文件操作
- 文件和目录管理
- CSV 文件操作
- JSON 文件操作
- Excel 文件操作
- 路径操作 pathlib
- 综合实战
文件基础
打开和关闭文件
python
# 传统方式(不推荐)
file = open("test.txt", "r")
content = file.read()
file.close() # 必须手动关闭
# 推荐方式:使用 with 语句
with open("test.txt", "r") as file:
content = file.read()
# 自动关闭文件,即使发生异常
文件打开模式
| 模式 | 说明 | 文件不存在 |
|---|---|---|
| r | 只读(默认) | 报错 |
| w | 写入(覆盖) | 创建 |
| a | 追加 | 创建 |
| x | 独占创建 | 创建,已存在则报错 |
| r+ | 读写 | 报错 |
| w+ | 读写(覆盖) | 创建 |
| a+ | 读写(追加) | 创建 |
| rb | 二进制只读 | 报错 |
| wb | 二进制写入 | 创建 |
指定编码
python
# 处理中文文件时指定编码
with open("chinese.txt", "r", encoding="utf-8") as f:
content = f.read()
# Windows 系统可能需要指定 encoding='gbk'
with open("windows_file.txt", "r", encoding="gbk") as f:
content = f.read()
文本文件操作
读取文件
python
# 方法1: read() - 读取全部内容
with open("test.txt", "r", encoding="utf-8") as f:
content = f.read()
print(content)
# 方法2: readline() - 逐行读取
with open("test.txt", "r", encoding="utf-8") as f:
line1 = f.readline() # 读取第一行
line2 = f.readline() # 读取第二行
print(line1.strip()) # strip() 去除换行符
print(line2.strip())
# 方法3: readlines() - 读取所有行为列表
with open("test.txt", "r", encoding="utf-8") as f:
lines = f.readlines()
for line in lines:
print(line.strip())
# 方法4: 直接遍历(推荐,内存效率高)
with open("test.txt", "r", encoding="utf-8") as f:
for line in f:
print(line.strip())
写入文件
python
# 写入文件(覆盖模式)
with open("output.txt", "w", encoding="utf-8") as f:
f.write("第一行\n")
f.write("第二行\n")
f.write("第三行\n")
# 使用 print 写入
with open("output.txt", "w", encoding="utf-8") as f:
print("Hello", file=f)
print("World", file=f)
# 写入多行
lines = ["苹果\n", "香蕉\n", "橙子\n"]
with open("fruits.txt", "w", encoding="utf-8") as f:
f.writelines(lines)
追加内容
python
# 追加模式
with open("log.txt", "a", encoding="utf-8") as f:
f.write("2024-01-15 10:30:00 - 用户登录\n")
f.write("2024-01-15 10:35:00 - 查看商品\n")
文件指针操作
python
with open("test.txt", "r", encoding="utf-8") as f:
# 读取前10个字符
content = f.read(10)
print(f"当前位置: {f.tell()}") # 显示当前指针位置
# 移动指针到开头
f.seek(0)
# 再次读取
content = f.read(10)
print(content)
实战示例:文件复制
python
def copy_file(src, dst):
    """Copy a UTF-8 text file line by line.

    Args:
        src: path of the source text file (read as UTF-8).
        dst: path of the destination file (overwritten if it exists).
    """
    # newline="" disables universal-newline translation on both ends, so the
    # copy preserves the original line endings (\n, \r\n, \r) exactly.
    with open(src, "r", encoding="utf-8", newline="") as f_src:
        with open(dst, "w", encoding="utf-8", newline="") as f_dst:
            for line in f_src:
                f_dst.write(line)
    print(f"文件已从 {src} 复制到 {dst}")
copy_file("source.txt", "destination.txt")
二进制文件操作
读取二进制文件
python
# 读取图片
with open("image.jpg", "rb") as f:
data = f.read()
print(f"文件大小: {len(data)} 字节")
# 读取部分数据
with open("image.jpg", "rb") as f:
header = f.read(100) # 读取前100字节
写入二进制文件
python
# 写入二进制数据
data = bytes([0x89, 0x50, 0x4E, 0x47]) # PNG 文件头
with open("output.bin", "wb") as f:
f.write(data)
复制二进制文件
python
def copy_binary_file(src, dst):
    """Copy a binary file in fixed-size chunks to keep memory usage low."""
    chunk_size = 4096
    with open(src, "rb") as reader, open(dst, "wb") as writer:
        chunk = reader.read(chunk_size)
        while chunk:
            writer.write(chunk)
            chunk = reader.read(chunk_size)
    print(f"二进制文件已复制")
copy_binary_file("photo.jpg", "photo_copy.jpg")
文件和目录管理
os 模块
python
import os
# 获取当前工作目录
print(os.getcwd())
# 改变工作目录
os.chdir("/path/to/directory")
# 列出目录内容
files = os.listdir(".")
print(files)
# 创建目录
os.mkdir("new_folder")
os.makedirs("parent/child/grandchild", exist_ok=True) # 递归创建
# 删除目录
os.rmdir("empty_folder")
os.removedirs("parent/child/grandchild") # 递归删除空目录
# 重命名
os.rename("old_name.txt", "new_name.txt")
# 删除文件
os.remove("file.txt")
# 判断路径类型
print(os.path.isfile("test.txt")) # True/False
print(os.path.isdir("folder")) # True/False
print(os.path.exists("path")) # True/False
# 获取文件信息
stat_info = os.stat("test.txt")
print(f"文件大小: {stat_info.st_size} 字节")
print(f"创建时间: {stat_info.st_ctime}")
print(f"修改时间: {stat_info.st_mtime}")
shutil 模块
python
import shutil
# 复制文件
shutil.copy("source.txt", "destination.txt")
shutil.copy2("source.txt", "destination.txt") # 保留元数据
# 复制目录
shutil.copytree("source_dir", "dest_dir")
# 移动文件/目录
shutil.move("old_path", "new_path")
# 删除目录及其内容
shutil.rmtree("directory")
# 获取磁盘使用情况
usage = shutil.disk_usage("/")
print(f"总空间: {usage.total / (1024**3):.2f} GB")
print(f"已用: {usage.used / (1024**3):.2f} GB")
print(f"可用: {usage.free / (1024**3):.2f} GB")
glob 模块 - 文件匹配
python
import glob
# 查找特定类型的文件
txt_files = glob.glob("*.txt")
print(txt_files)
# 递归查找
all_py_files = glob.glob("**/*.py", recursive=True)
print(all_py_files)
# 模式匹配
images = glob.glob("images/*.jpg")
logs = glob.glob("logs/2024-*.log")
CSV 文件操作
读取 CSV 文件
python
import csv
# 基本读取
with open("data.csv", "r", encoding="utf-8") as f:
reader = csv.reader(f)
for row in reader:
print(row)
# 使用 DictReader(推荐)
with open("data.csv", "r", encoding="utf-8") as f:
reader = csv.DictReader(f)
for row in reader:
print(row)
# {'name': '张三', 'age': '25', 'city': '北京'}
写入 CSV 文件
python
import csv
# 基本写入
data = [
["姓名", "年龄", "城市"],
["张三", "25", "北京"],
["李四", "30", "上海"],
["王五", "28", "广州"]
]
with open("output.csv", "w", encoding="utf-8-sig", newline="") as f:
writer = csv.writer(f)
writer.writerows(data)
# 使用 DictWriter(推荐)
data = [
{"name": "张三", "age": 25, "city": "北京"},
{"name": "李四", "age": 30, "city": "上海"},
{"name": "王五", "age": 28, "city": "广州"}
]
fieldnames = ["name", "age", "city"]
with open("output.csv", "w", encoding="utf-8-sig", newline="") as f:
writer = csv.DictWriter(f, fieldnames=fieldnames)
writer.writeheader()
writer.writerows(data)
CSV 实战:学生成绩管理
python
import csv
import os
class StudentManager:
    """Manage student records persisted in a CSV file."""

    def __init__(self, filename="students.csv"):
        self.filename = filename
        self.students = []
        self.load_students()

    def load_students(self):
        """Load rows from the CSV file, coercing age/score to numbers."""
        if not os.path.exists(self.filename):
            return
        with open(self.filename, "r", encoding="utf-8-sig") as f:
            for record in csv.DictReader(f):
                record["age"] = int(record["age"])
                record["score"] = float(record["score"])
                self.students.append(record)

    def save_students(self):
        """Write every in-memory student row back to the CSV file."""
        columns = ["name", "age", "major", "score"]
        with open(self.filename, "w", encoding="utf-8-sig", newline="") as f:
            writer = csv.DictWriter(f, fieldnames=columns)
            writer.writeheader()
            writer.writerows(self.students)

    def add_student(self, name, age, major, score):
        """Append one student record and persist immediately."""
        self.students.append(
            {"name": name, "age": age, "major": major, "score": score}
        )
        self.save_students()
        print(f"学生 {name} 添加成功")

    def show_all(self):
        """Print a table of all students followed by summary statistics."""
        if not self.students:
            print("暂无学生数据")
            return
        print(f"\n{'姓名':<10}{'年龄':<6}{'专业':<15}{'成绩':<8}")
        print("-" * 45)
        for student in self.students:
            print(f"{student['name']:<10}{student['age']:<6}"
                  f"{student['major']:<15}{student['score']:<8.1f}")
        # Summary statistics over all scores.
        scores = [s["score"] for s in self.students]
        print("-" * 45)
        print(f"平均分: {sum(scores)/len(scores):.1f}")
        print(f"最高分: {max(scores):.1f}")
        print(f"最低分: {min(scores):.1f}")
# 使用示例
manager = StudentManager()
manager.add_student("张三", 20, "计算机科学", 90.5)
manager.add_student("李四", 22, "数学", 85.0)
manager.add_student("王五", 21, "物理", 92.3)
manager.show_all()
JSON 文件操作
基本操作
python
import json
# Python 对象转 JSON 字符串
data = {
"name": "张三",
"age": 25,
"city": "北京",
"hobbies": ["读书", "游泳", "编程"]
}
json_str = json.dumps(data, ensure_ascii=False, indent=2)
print(json_str)
# JSON 字符串转 Python 对象
parsed = json.loads(json_str)
print(parsed["name"])
读写 JSON 文件
python
import json
# 写入 JSON 文件
data = {
"users": [
{"name": "张三", "age": 25},
{"name": "李四", "age": 30}
],
"total": 2
}
with open("data.json", "w", encoding="utf-8") as f:
json.dump(data, f, ensure_ascii=False, indent=2)
# 读取 JSON 文件
with open("data.json", "r", encoding="utf-8") as f:
data = json.load(f)
print(data)
JSON 实战:配置文件管理
python
import json
import os
class ConfigManager:
    """JSON-backed key/value configuration store."""

    def __init__(self, filename="config.json"):
        self.filename = filename
        self.config = self.load_config()

    def load_config(self):
        """Return the parsed config file, or an empty dict if it is absent."""
        if not os.path.exists(self.filename):
            return {}
        with open(self.filename, "r", encoding="utf-8") as f:
            return json.load(f)

    def save_config(self):
        """Persist the in-memory config as pretty-printed UTF-8 JSON."""
        with open(self.filename, "w", encoding="utf-8") as f:
            json.dump(self.config, f, ensure_ascii=False, indent=2)

    def get(self, key, default=None):
        """Return the value stored under *key*, or *default* when missing."""
        return self.config.get(key, default)

    def set(self, key, value):
        """Store key=value and save immediately."""
        self.config[key] = value
        self.save_config()

    def delete(self, key):
        """Remove *key* if present and save."""
        if key in self.config:
            del self.config[key]
            self.save_config()
# 使用示例
config = ConfigManager()
# 设置配置
config.set("database", {
"host": "localhost",
"port": 3306,
"user": "root",
"password": "123456"
})
config.set("app_name", "我的应用")
config.set("debug", True)
# 获取配置
db_config = config.get("database")
print(f"数据库主机: {db_config['host']}")
print(f"应用名称: {config.get('app_name')}")
Excel 文件操作
使用 openpyxl(需要安装)
bash
pip install openpyxl
创建 Excel 文件
python
from openpyxl import Workbook
# 创建工作簿
wb = Workbook()
ws = wb.active
ws.title = "学生成绩"
# 添加表头
headers = ["姓名", "语文", "数学", "英语", "总分"]
ws.append(headers)
# 添加数据
data = [
["张三", 90, 85, 92],
["李四", 88, 92, 87],
["王五", 95, 90, 88]
]
for row in data:
# 计算总分
total = sum(row[1:])
ws.append(row + [total])
# 保存文件
wb.save("scores.xlsx")
print("Excel 文件已创建")
读取 Excel 文件
python
from openpyxl import load_workbook
# 加载工作簿
wb = load_workbook("scores.xlsx")
ws = wb.active
# 读取所有数据
print(f"工作表: {ws.title}")
print(f"行数: {ws.max_row}")
print(f"列数: {ws.max_column}")
# 遍历数据
for row in ws.iter_rows(min_row=1, values_only=True):
print(row)
# 访问特定单元格
cell_value = ws["A2"].value
print(f"A2 的值: {cell_value}")
格式化 Excel
python
from openpyxl import Workbook
from openpyxl.styles import Font, Alignment, PatternFill
wb = Workbook()
ws = wb.active
# 设置字体
bold_font = Font(bold=True, color="FF0000", size=12)
normal_font = Font(size=11)
# 设置对齐
center_align = Alignment(horizontal="center", vertical="center")
# 设置填充
yellow_fill = PatternFill(start_color="FFFF00", end_color="FFFF00", fill_type="solid")
# 添加数据并格式化
ws["A1"] = "姓名"
ws["B1"] = "成绩"
ws["A1"].font = bold_font
ws["B1"].font = bold_font
ws["A1"].alignment = center_align
ws["B1"].alignment = center_align
ws["A2"] = "张三"
ws["B2"] = 95
ws["B2"].fill = yellow_fill
# 调整列宽
ws.column_dimensions["A"].width = 15
ws.column_dimensions["B"].width = 10
wb.save("formatted.xlsx")
路径操作 pathlib
Python 3.4+ 推荐的现代化路径操作方式。
基本操作
python
from pathlib import Path
# 创建路径对象
current_dir = Path(".")
absolute_path = Path("/home/user/documents")
relative_path = Path("folder/file.txt")
# 组合路径
config_path = Path.home() / ".config" / "myapp" / "config.json"
print(config_path)
# 获取路径信息
file_path = Path("documents/report.pdf")
print(file_path.name) # report.pdf
print(file_path.stem) # report
print(file_path.suffix) # .pdf
print(file_path.parent) # documents
print(file_path.absolute()) # 绝对路径
文件和目录操作
python
from pathlib import Path
# 创建目录
Path("new_folder").mkdir(exist_ok=True)
Path("parent/child").mkdir(parents=True, exist_ok=True)
# 检查路径
path = Path("test.txt")
print(path.exists()) # True/False
print(path.is_file()) # True/False
print(path.is_dir()) # True/False
# 读取文件
content = path.read_text(encoding="utf-8")
content_bytes = path.read_bytes()
# 写入文件
path.write_text("Hello, World!", encoding="utf-8")
path.write_bytes(b"binary data")
# 删除
path.unlink() # 删除文件
Path("empty_dir").rmdir() # 删除空目录
遍历目录
python
from pathlib import Path
# 列出目录内容
current = Path(".")
for item in current.iterdir():
print(item.name)
# 递归查找文件
for py_file in Path(".").rglob("*.py"):
print(py_file)
# 查找特定模式
for txt_file in Path("documents").glob("*.txt"):
print(txt_file)
路径操作示例
python
from pathlib import Path
def organize_files(directory):
    """Move every file in *directory* into a subdirectory named after its
    extension (e.g. report.pdf -> pdf/report.pdf).

    Files without an extension are left in place. If a file with the same
    name already exists in the target subdirectory, the file is skipped —
    Path.rename would silently overwrite on POSIX and raise on Windows.
    """
    dir_path = Path(directory)
    if not dir_path.exists():
        print("目录不存在")
        return
    for file_path in dir_path.iterdir():
        if not file_path.is_file():
            continue
        suffix = file_path.suffix.lower()
        if not suffix:
            continue  # no extension: leave the file where it is
        subdir = dir_path / suffix[1:]  # drop the leading dot
        subdir.mkdir(exist_ok=True)
        new_path = subdir / file_path.name
        if new_path.exists():
            continue  # avoid clobbering an existing file of the same name
        file_path.rename(new_path)
        print(f"移动: {file_path.name} -> {subdir.name}/")
# 使用
organize_files("./downloads")
综合实战
实战1: 日志系统
python
import os
from datetime import datetime
from pathlib import Path
class Logger:
    """Minimal file logger that writes one log file per calendar day."""

    def __init__(self, log_dir="logs"):
        self.log_dir = Path(log_dir)
        self.log_dir.mkdir(exist_ok=True)
        # One file per day, named YYYY-MM-DD.log inside log_dir.
        today = datetime.now().strftime("%Y-%m-%d")
        self.log_file = self.log_dir / f"{today}.log"

    def _get_timestamp(self):
        """Return the current time formatted for log entries."""
        return datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    def _write_log(self, level, message):
        """Append one entry to today's log file and echo it to stdout."""
        entry = f"[{self._get_timestamp()}] [{level}] {message}\n"
        with open(self.log_file, "a", encoding="utf-8") as f:
            f.write(entry)
        print(entry.strip())

    def info(self, message):
        """Log *message* at INFO level."""
        self._write_log("INFO", message)

    def warning(self, message):
        """Log *message* at WARNING level."""
        self._write_log("WARNING", message)

    def error(self, message):
        """Log *message* at ERROR level."""
        self._write_log("ERROR", message)

    def debug(self, message):
        """Log *message* at DEBUG level."""
        self._write_log("DEBUG", message)

    def get_logs(self, date=None):
        """Return the lines of the log for *date* ("YYYY-MM-DD") or today."""
        target = self.log_dir / f"{date}.log" if date else self.log_file
        if not target.exists():
            return []
        with open(target, "r", encoding="utf-8") as f:
            return f.readlines()

    def clear_old_logs(self, days=30):
        """Delete log files older than *days* days, judged by filename date."""
        from datetime import timedelta
        cutoff = datetime.now() - timedelta(days=days)
        for log_file in self.log_dir.glob("*.log"):
            try:
                file_date = datetime.strptime(log_file.stem, "%Y-%m-%d")
            except ValueError:
                continue  # filename is not a date — leave it alone
            if file_date < cutoff:
                log_file.unlink()
                print(f"删除旧日志: {log_file.name}")
# 使用示例
logger = Logger()
logger.info("应用程序启动")
logger.debug("加载配置文件")
logger.warning("磁盘空间不足")
logger.error("数据库连接失败")
# 查看日志
logs = logger.get_logs()
print("\n=== 今日日志 ===")
for log in logs:
print(log.strip())
# 清理旧日志
logger.clear_old_logs(days=7)
实战2: 批量文件处理器
python
import os
import shutil
from pathlib import Path
from datetime import datetime
class BatchFileProcessor:
    """Batch copy/move/rename/backup files from a source to a target directory.

    Running totals are kept in ``self.stats`` (processed / skipped / errors)
    and accumulate across multiple ``process_files()`` calls.
    """

    def __init__(self, source_dir, target_dir):
        self.source_dir = Path(source_dir)
        self.target_dir = Path(target_dir)
        self.stats = {
            "processed": 0,
            "skipped": 0,
            "errors": 0
        }

    def process_files(self, pattern="*.*", action="copy"):
        """Apply *action* to every file in source_dir matching *pattern*.

        Args:
            pattern: glob pattern relative to source_dir.
            action: one of "copy", "move", "rename", "backup".

        Non-file matches (e.g. directories) and unknown actions are counted
        as skipped. (Previously skipped was never incremented and an unknown
        action was wrongly counted as processed.)
        """
        if not self.source_dir.exists():
            print(f"源目录不存在: {self.source_dir}")
            return
        # Ensure the target directory exists before doing any work.
        self.target_dir.mkdir(parents=True, exist_ok=True)
        files = list(self.source_dir.glob(pattern))
        if not files:
            print("未找到匹配的文件")
            return
        print(f"找到 {len(files)} 个文件")
        for file_path in files:
            if not file_path.is_file():
                self.stats["skipped"] += 1  # directory or other non-file match
                continue
            try:
                if action == "copy":
                    self._copy_file(file_path)
                elif action == "move":
                    self._move_file(file_path)
                elif action == "rename":
                    self._rename_file(file_path)
                elif action == "backup":
                    self._backup_file(file_path)
                else:
                    # Unknown action: nothing was done, so count as skipped.
                    self.stats["skipped"] += 1
                    continue
                self.stats["processed"] += 1
            except Exception as e:
                print(f"处理失败 {file_path.name}: {e}")
                self.stats["errors"] += 1

    def _copy_file(self, file_path):
        """Copy one file into the target directory, preserving metadata."""
        target_path = self.target_dir / file_path.name
        shutil.copy2(file_path, target_path)
        print(f"复制: {file_path.name}")

    def _move_file(self, file_path):
        """Move one file into the target directory."""
        target_path = self.target_dir / file_path.name
        shutil.move(str(file_path), str(target_path))
        print(f"移动: {file_path.name}")

    def _rename_file(self, file_path):
        """Copy one file into the target directory with a timestamp suffix."""
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        new_name = f"{file_path.stem}_{timestamp}{file_path.suffix}"
        target_path = self.target_dir / new_name
        shutil.copy2(file_path, target_path)
        print(f"重命名: {file_path.name} -> {new_name}")

    def _backup_file(self, file_path):
        """Copy one file into a dated backups/ subdirectory of the target."""
        backup_dir = self.target_dir / "backups" / datetime.now().strftime("%Y-%m-%d")
        backup_dir.mkdir(parents=True, exist_ok=True)
        shutil.copy2(file_path, backup_dir / file_path.name)
        print(f"备份: {file_path.name}")

    def filter_by_size(self, min_size=0, max_size=None):
        """Return files under source_dir whose byte size is in [min_size, max_size]."""
        filtered = []
        for file_path in self.source_dir.rglob("*"):
            if file_path.is_file():
                size = file_path.stat().st_size
                if size >= min_size and (max_size is None or size <= max_size):
                    filtered.append(file_path)
        return filtered

    def filter_by_date(self, days_ago=7):
        """Return files under source_dir modified within the last *days_ago* days."""
        from datetime import timedelta
        cutoff_date = datetime.now() - timedelta(days=days_ago)
        filtered = []
        for file_path in self.source_dir.rglob("*"):
            if file_path.is_file():
                mtime = datetime.fromtimestamp(file_path.stat().st_mtime)
                if mtime >= cutoff_date:
                    filtered.append(file_path)
        return filtered

    def show_stats(self):
        """Print the processed / skipped / error counters."""
        print("\n=== 处理统计 ===")
        print(f"成功处理: {self.stats['processed']}")
        print(f"跳过: {self.stats['skipped']}")
        print(f"错误: {self.stats['errors']}")
# 使用示例
processor = BatchFileProcessor("./source", "./processed")
# 复制所有 txt 文件
processor.process_files("*.txt", action="copy")
# 移动所有 jpg 文件
processor.process_files("*.jpg", action="move")
# 备份所有文件
processor.process_files("*.*", action="backup")
# 显示统计
processor.show_stats()
# 按大小过滤
large_files = processor.filter_by_size(min_size=1024*1024) # 大于1MB
print(f"\n大文件数量: {len(large_files)}")
# 按日期过滤
recent_files = processor.filter_by_date(days_ago=7)
print(f"最近7天修改的文件: {len(recent_files)}")
实战3: 数据转换工具
python
import csv
import json
from pathlib import Path
class DataConverter:
    """Data format conversion tool: CSV/JSON conversion, merging, splitting."""

    @staticmethod
    def _coerce(value):
        """Best-effort conversion of a CSV cell string to int or float."""
        try:
            return float(value) if "." in value else int(value)
        except (ValueError, TypeError):
            return value

    @staticmethod
    def csv_to_json(csv_file, json_file):
        """Convert a CSV file into a JSON array of row objects."""
        with open(csv_file, "r", encoding="utf-8-sig") as f:
            data = [
                {key: DataConverter._coerce(value) for key, value in row.items()}
                for row in csv.DictReader(f)
            ]
        with open(json_file, "w", encoding="utf-8") as f:
            json.dump(data, f, ensure_ascii=False, indent=2)
        print(f"转换完成: {csv_file} -> {json_file}")
        print(f"共 {len(data)} 条记录")

    @staticmethod
    def json_to_csv(json_file, csv_file):
        """Convert a JSON array of objects into a CSV file."""
        with open(json_file, "r", encoding="utf-8") as f:
            data = json.load(f)
        if not data:
            print("JSON 文件为空")
            return
        # Column order follows the keys of the first record.
        fieldnames = list(data[0].keys())
        with open(csv_file, "w", encoding="utf-8-sig", newline="") as f:
            writer = csv.DictWriter(f, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(data)
        print(f"转换完成: {json_file} -> {csv_file}")
        print(f"共 {len(data)} 条记录")

    @staticmethod
    def merge_csv_files(input_files, output_file):
        """Concatenate several CSV files; the header comes from the first file."""
        merged = []
        header = None
        for path in input_files:
            with open(path, "r", encoding="utf-8-sig") as f:
                reader = csv.DictReader(f)
                if header is None:
                    header = reader.fieldnames
                merged.extend(reader)
        with open(output_file, "w", encoding="utf-8-sig", newline="") as f:
            writer = csv.DictWriter(f, fieldnames=header)
            writer.writeheader()
            writer.writerows(merged)
        print(f"合并完成: {len(input_files)} 个文件 -> {output_file}")
        print(f"共 {len(merged)} 条记录")

    @staticmethod
    def split_csv(input_file, output_dir, rows_per_file=1000):
        """Split a large CSV into part_<n>.csv files of at most rows_per_file rows."""
        output_path = Path(output_dir)
        output_path.mkdir(parents=True, exist_ok=True)
        with open(input_file, "r", encoding="utf-8-sig") as f:
            reader = csv.DictReader(f)
            fieldnames = reader.fieldnames
            file_count = 0
            batch = []

            def flush():
                # Write the current batch out as part_<file_count>.csv.
                output_file = output_path / f"part_{file_count}.csv"
                with open(output_file, "w", encoding="utf-8-sig", newline="") as out_f:
                    writer = csv.DictWriter(out_f, fieldnames=fieldnames)
                    writer.writeheader()
                    writer.writerows(batch)
                print(f"创建文件: {output_file} ({len(batch)} 条)")

            for row in reader:
                batch.append(row)
                if len(batch) >= rows_per_file:
                    file_count += 1
                    flush()
                    batch = []
            # Flush any remaining rows that did not fill a whole part.
            if batch:
                file_count += 1
                flush()
        print(f"拆分完成: 共 {file_count} 个文件")
# 使用示例
converter = DataConverter()
# CSV 转 JSON
converter.csv_to_json("students.csv", "students.json")
# JSON 转 CSV
converter.json_to_csv("students.json", "students_new.csv")
# 合并多个 CSV
csv_files = ["data1.csv", "data2.csv", "data3.csv"]
converter.merge_csv_files(csv_files, "merged.csv")
# 拆分大文件
converter.split_csv("large_file.csv", "split_output", rows_per_file=5000)
实战4: 文件搜索工具
python
import os
from pathlib import Path
from datetime import datetime
class FileSearcher:
    """File search tool: find files by name, extension, size, date, or content."""

    def __init__(self, search_dir="."):
        self.search_dir = Path(search_dir)
        # Results of the most recent search (list of Path objects).
        self.results = []

    def search_by_name(self, pattern, case_sensitive=False):
        """Return files whose name contains *pattern*.

        BUG FIX: the old case-insensitive branch used rglob("*pattern*"),
        which matches case-sensitively on most filesystems, so the flag had
        no effect there. Compare lowercased names explicitly instead.
        """
        self.results = []
        needle = pattern if case_sensitive else pattern.lower()
        for file_path in self.search_dir.rglob("*"):
            if not file_path.is_file():
                continue
            name = file_path.name if case_sensitive else file_path.name.lower()
            if needle in name:
                self.results.append(file_path)
        return self.results

    def search_by_extension(self, extensions):
        """Return files matching any of *extensions* ("py" or ".py" accepted)."""
        self.results = []
        if isinstance(extensions, str):
            extensions = [extensions]
        for ext in extensions:
            if not ext.startswith("."):
                ext = f".{ext}"
            for file_path in self.search_dir.rglob(f"*{ext}"):
                if file_path.is_file():
                    self.results.append(file_path)
        return self.results

    def search_by_size(self, min_size=0, max_size=None):
        """Return files whose byte size lies within [min_size, max_size]."""
        self.results = []
        for file_path in self.search_dir.rglob("*"):
            if file_path.is_file():
                size = file_path.stat().st_size
                if size >= min_size and (max_size is None or size <= max_size):
                    self.results.append(file_path)
        return self.results

    def search_by_date(self, days_ago=None, before_date=None, after_date=None):
        """Return files filtered by modification time.

        Exactly one criterion applies, in priority order: days_ago,
        before_date, after_date. Dates may be "YYYY-MM-DD" strings.
        """
        from datetime import timedelta
        # Parse string dates once, outside the per-file loop.
        if isinstance(before_date, str):
            before_date = datetime.strptime(before_date, "%Y-%m-%d")
        if isinstance(after_date, str):
            after_date = datetime.strptime(after_date, "%Y-%m-%d")
        self.results = []
        for file_path in self.search_dir.rglob("*"):
            if not file_path.is_file():
                continue
            mtime = datetime.fromtimestamp(file_path.stat().st_mtime)
            if days_ago:
                if mtime >= datetime.now() - timedelta(days=days_ago):
                    self.results.append(file_path)
            elif before_date:
                if mtime < before_date:
                    self.results.append(file_path)
            elif after_date:
                if mtime > after_date:
                    self.results.append(file_path)
        return self.results

    def search_by_content(self, keyword, file_pattern="*.txt"):
        """Return files matching *file_pattern* whose text contains *keyword*.

        Files that are not valid UTF-8 or not readable are silently skipped.
        """
        self.results = []
        for file_path in self.search_dir.glob(file_pattern):
            if not file_path.is_file():
                continue
            try:
                with open(file_path, "r", encoding="utf-8") as f:
                    if keyword in f.read():
                        self.results.append(file_path)
            except (UnicodeDecodeError, PermissionError):
                continue
        return self.results

    def show_results(self, max_results=20):
        """Print up to *max_results* results as a name/size/mtime/path table."""
        if not self.results:
            print("未找到匹配的文件")
            return
        print(f"\n找到 {len(self.results)} 个文件:\n")
        print(f"{'文件名':<30}{'大小':<12}{'修改时间':<20}{'路径'}")
        print("-" * 100)
        for file_path in self.results[:max_results]:
            stat = file_path.stat()
            size = self._format_size(stat.st_size)
            mtime = datetime.fromtimestamp(stat.st_mtime).strftime("%Y-%m-%d %H:%M")
            print(f"{file_path.name:<30}{size:<12}{mtime:<20}{file_path.parent}")
        if len(self.results) > max_results:
            print(f"\n... 还有 {len(self.results) - max_results} 个文件")

    def _format_size(self, size_bytes):
        """Format a byte count as a human-readable string (B/KB/MB/GB/TB)."""
        for unit in ['B', 'KB', 'MB', 'GB']:
            if size_bytes < 1024:
                return f"{size_bytes:.1f} {unit}"
            size_bytes /= 1024
        return f"{size_bytes:.1f} TB"

    def export_results(self, output_file="search_results.txt"):
        """Write the current results (name, path, size, mtime) to a text file."""
        with open(output_file, "w", encoding="utf-8") as f:
            f.write(f"搜索结果 - {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
            f.write(f"搜索目录: {self.search_dir}\n")
            f.write(f"找到文件: {len(self.results)}\n")
            f.write("=" * 80 + "\n\n")
            for file_path in self.results:
                stat = file_path.stat()
                size = self._format_size(stat.st_size)
                mtime = datetime.fromtimestamp(stat.st_mtime).strftime("%Y-%m-%d %H:%M")
                f.write(f"文件: {file_path.name}\n")
                f.write(f"路径: {file_path}\n")
                f.write(f"大小: {size}\n")
                f.write(f"修改时间: {mtime}\n")
                f.write("-" * 80 + "\n")
        print(f"结果已导出到: {output_file}")
# 使用示例
searcher = FileSearcher(".")
# 按名称搜索
print("=== 搜索 Python 文件 ===")
searcher.search_by_name(".py")
searcher.show_results()
# 按扩展名搜索
print("\n=== 搜索图片和文档 ===")
searcher.search_by_extension(["jpg", "png", "pdf", "docx"])
searcher.show_results()
# 按大小搜索
print("\n=== 搜索大文件(>10MB) ===")
searcher.search_by_size(min_size=10*1024*1024)
searcher.show_results()
# 按日期搜索
print("\n=== 搜索最近7天的文件 ===")
searcher.search_by_date(days_ago=7)
searcher.show_results()
# 按内容搜索
print("\n=== 搜索包含 'TODO' 的文件 ===")
searcher.search_by_content("TODO", "*.py")
searcher.show_results()
# 导出结果
searcher.export_results("results.txt")
常见错误与注意事项
1. 忘记关闭文件
python
# ❌ 危险:可能忘记关闭
f = open("test.txt", "r")
content = f.read()
# 如果中间出错,文件不会关闭
# ✅ 推荐:使用 with 语句
with open("test.txt", "r") as f:
content = f.read()
# 自动关闭
2. 编码问题
python
# ❌ 可能导致中文乱码
with open("chinese.txt", "r") as f:
content = f.read()
# ✅ 指定编码
with open("chinese.txt", "r", encoding="utf-8") as f:
content = f.read()
3. 文件不存在
python
from pathlib import Path
# ❌ 可能抛出异常
# with open("not_exist.txt", "r") as f:
# content = f.read()
# ✅ 先检查
file_path = Path("not_exist.txt")
if file_path.exists():
content = file_path.read_text(encoding="utf-8")
else:
print("文件不存在")
# 或使用 try-except
try:
with open("not_exist.txt", "r", encoding="utf-8") as f:
content = f.read()
except FileNotFoundError:
print("文件不存在")
4. 写入时覆盖重要文件
python
# ❌ 危险:可能意外覆盖
with open("important.txt", "w") as f:
f.write("new content")
# ✅ 先检查或使用追加模式
from pathlib import Path
file_path = Path("important.txt")
if file_path.exists():
choice = input("文件已存在,是否覆盖?(y/n): ")
if choice.lower() != "y":
print("取消操作")
else:
file_path.write_text("new content", encoding="utf-8")
else:
file_path.write_text("new content", encoding="utf-8")
5. 大文件内存溢出
python
# ❌ 危险:一次性加载大文件
with open("huge_file.txt", "r") as f:
content = f.read() # 可能占用大量内存
# ✅ 逐行读取
with open("huge_file.txt", "r") as f:
for line in f:
process(line)
# ✅ 分块读取
with open("huge_file.bin", "rb") as f:
while True:
chunk = f.read(8192)
if not chunk:
break
process(chunk)
小结
| 操作类型 | 推荐模块/方法 | 适用场景 |
|---|---|---|
| 文本文件 | open() + with | 普通文本读写 |
| 二进制文件 | open(mode="rb/wb") | 图片、音频等 |
| 目录管理 | os, shutil, pathlib | 文件/目录操作 |
| CSV 文件 | csv 模块 | 表格数据 |
| JSON 文件 | json 模块 | 配置、API 数据 |
| Excel 文件 | openpyxl | 电子表格 |
| 路径操作 | pathlib | 现代化路径处理 |
核心要点:
- 始终使用 with 语句管理文件
- 处理中文时指定 encoding="utf-8"
- 大文件使用逐行或分块读取
- pathlib 是现代化的路径操作方式
- 操作前检查文件是否存在
- 注意文件打开模式的选择
- 合理使用异常处理
掌握文件操作是 Python 编程的重要技能!