Python 文件操作

Python 文件操作

目录

  • 文件基础
  • 文本文件操作
  • 二进制文件操作
  • 文件和目录管理
  • [CSV 文件操作](#csv-文件操作)
  • [JSON 文件操作](#json-文件操作)
  • [Excel 文件操作](#excel-文件操作)
  • [路径操作 pathlib](#路径操作-pathlib)
  • 综合实战

文件基础

打开和关闭文件

python 复制代码
# Traditional approach (not recommended)
file = open("test.txt", "r")
content = file.read()
file.close()  # must be closed by hand; skipped entirely if read() raises

# Recommended: the with statement
with open("test.txt", "r") as file:
    content = file.read()
# The file is closed automatically, even if an exception occurs

文件打开模式

模式 说明 文件不存在
r 只读(默认) 报错
w 写入(覆盖) 创建
a 追加 创建
x 独占创建 创建,已存在则报错
r+ 读写 报错
w+ 读写(覆盖) 创建
a+ 读写(追加) 创建
rb 二进制只读 报错
wb 二进制写入 创建

指定编码

python 复制代码
# Specify the encoding explicitly when working with Chinese-language files
with open("chinese.txt", "r", encoding="utf-8") as f:
    content = f.read()

# Files produced on Windows may need encoding='gbk' instead
with open("windows_file.txt", "r", encoding="gbk") as f:
    content = f.read()

文本文件操作

读取文件

python 复制代码
# Method 1: read() - the whole file as one string
with open("test.txt", "r", encoding="utf-8") as f:
    content = f.read()
    print(content)

# Method 2: readline() - one line per call
with open("test.txt", "r", encoding="utf-8") as f:
    line1 = f.readline()  # first line
    line2 = f.readline()  # second line
    print(line1.strip())  # strip() removes the trailing newline
    print(line2.strip())

# Method 3: readlines() - all lines as a list
with open("test.txt", "r", encoding="utf-8") as f:
    lines = f.readlines()
    for line in lines:
        print(line.strip())

# Method 4: iterate the file object directly (recommended; memory-efficient)
with open("test.txt", "r", encoding="utf-8") as f:
    for line in f:
        print(line.strip())

写入文件

python 复制代码
# Write to a file ("w" truncates any existing content first)
with open("output.txt", "w", encoding="utf-8") as f:
    f.write("第一行\n")
    f.write("第二行\n")
    f.write("第三行\n")

# Write via print(file=...) — print adds the newline for you
with open("output.txt", "w", encoding="utf-8") as f:
    print("Hello", file=f)
    print("World", file=f)

# Write several lines at once (writelines adds no newlines itself)
lines = ["苹果\n", "香蕉\n", "橙子\n"]
with open("fruits.txt", "w", encoding="utf-8") as f:
    f.writelines(lines)

追加内容

python 复制代码
# Append mode: writes go to the end, existing content is kept
with open("log.txt", "a", encoding="utf-8") as f:
    f.write("2024-01-15 10:30:00 - 用户登录\n")
    f.write("2024-01-15 10:35:00 - 查看商品\n")

文件指针操作

python 复制代码
with open("test.txt", "r", encoding="utf-8") as f:
    # Read the first 10 characters
    content = f.read(10)
    print(f"当前位置: {f.tell()}")  # tell() reports the current pointer position

    # Rewind the pointer to the beginning of the file
    f.seek(0)

    # Read again from the start
    content = f.read(10)
    print(content)

实战示例:文件复制

python 复制代码
def copy_file(src, dst, encoding="utf-8"):
    """Copy a text file line by line.

    Args:
        src: path of the file to read.
        dst: path of the file to create (overwritten if it exists).
        encoding: text encoding used for both files; defaults to "utf-8",
            matching the previously hard-coded behavior.
    """
    # One with-statement manages both files; both close even on error
    with open(src, "r", encoding=encoding) as f_src, \
         open(dst, "w", encoding=encoding) as f_dst:
        for line in f_src:
            f_dst.write(line)
    print(f"文件已从 {src} 复制到 {dst}")

# Example call; source.txt must already exist
copy_file("source.txt", "destination.txt")

二进制文件操作

读取二进制文件

python 复制代码
# Read an image file as raw bytes
with open("image.jpg", "rb") as f:
    data = f.read()
    print(f"文件大小: {len(data)} 字节")

# Read only part of the data
with open("image.jpg", "rb") as f:
    header = f.read(100)  # first 100 bytes

写入二进制文件

python 复制代码
# The 4-byte PNG file signature, written as an escaped bytes literal
data = b"\x89PNG"  # == bytes([0x89, 0x50, 0x4E, 0x47])
with open("output.bin", "wb") as out_file:
    out_file.write(data)

复制二进制文件

python 复制代码
def copy_binary_file(src, dst, chunk_size=4096):
    """Copy a binary file in fixed-size chunks.

    Args:
        src: source file path.
        dst: destination file path.
        chunk_size: bytes read per iteration (default 4096, the previous
            hard-coded value); keeps memory constant for huge files.
    """
    with open(src, "rb") as f_src, open(dst, "wb") as f_dst:
        # iter() with a sentinel stops when read() returns b"" (EOF)
        for chunk in iter(lambda: f_src.read(chunk_size), b""):
            f_dst.write(chunk)
    print("二进制文件已复制")

# Example call; photo.jpg must already exist
copy_binary_file("photo.jpg", "photo_copy.jpg")

文件和目录管理

os 模块

python 复制代码
import os

# Get the current working directory
print(os.getcwd())

# Change the working directory
os.chdir("/path/to/directory")

# List the contents of a directory
files = os.listdir(".")
print(files)

# Create directories
os.mkdir("new_folder")
os.makedirs("parent/child/grandchild", exist_ok=True)  # creates intermediate dirs too

# Remove directories
os.rmdir("empty_folder")
os.removedirs("parent/child/grandchild")  # removes empty dirs up the chain

# Rename a file or directory
os.rename("old_name.txt", "new_name.txt")

# Delete a file
os.remove("file.txt")

# Test what kind of path this is
print(os.path.isfile("test.txt"))     # True/False
print(os.path.isdir("folder"))        # True/False
print(os.path.exists("path"))         # True/False

# File metadata
# NOTE: st_ctime is creation time on Windows but metadata-change time on Unix
stat_info = os.stat("test.txt")
print(f"文件大小: {stat_info.st_size} 字节")
print(f"创建时间: {stat_info.st_ctime}")
print(f"修改时间: {stat_info.st_mtime}")

shutil 模块

python 复制代码
import shutil

# Copy a file
shutil.copy("source.txt", "destination.txt")
shutil.copy2("source.txt", "destination.txt")  # also preserves metadata (timestamps etc.)

# Copy a whole directory tree
shutil.copytree("source_dir", "dest_dir")

# Move a file or directory
shutil.move("old_path", "new_path")

# Delete a directory and everything inside it
shutil.rmtree("directory")

# Disk usage statistics
usage = shutil.disk_usage("/")
print(f"总空间: {usage.total / (1024**3):.2f} GB")
print(f"已用: {usage.used / (1024**3):.2f} GB")
print(f"可用: {usage.free / (1024**3):.2f} GB")

glob 模块 - 文件匹配

python 复制代码
import glob

# Find files of a given type in the current directory
txt_files = glob.glob("*.txt")
print(txt_files)

# Recursive search ("**" only recurses with recursive=True)
all_py_files = glob.glob("**/*.py", recursive=True)
print(all_py_files)

# Pattern matching
images = glob.glob("images/*.jpg")
logs = glob.glob("logs/2024-*.log")

CSV 文件操作

读取 CSV 文件

python 复制代码
import csv

# Basic reading: each row is a list of strings
with open("data.csv", "r", encoding="utf-8") as f:
    reader = csv.reader(f)
    for row in reader:
        print(row)

# DictReader (recommended): each row is a dict keyed by the header row
with open("data.csv", "r", encoding="utf-8") as f:
    reader = csv.DictReader(f)
    for row in reader:
        print(row)
        # {'name': '张三', 'age': '25', 'city': '北京'}

写入 CSV 文件

python 复制代码
import csv

# Row-oriented writing: plain csv.writer takes lists
data = [
    ["姓名", "年龄", "城市"],
    ["张三", "25", "北京"],
    ["李四", "30", "上海"],
    ["王五", "28", "广州"],
]

# utf-8-sig adds a BOM so Excel auto-detects the encoding;
# newline="" lets the csv module control line endings itself
with open("output.csv", "w", encoding="utf-8-sig", newline="") as fh:
    csv.writer(fh).writerows(data)

# Dict-oriented writing (recommended): DictWriter maps column names to rows
data = [
    {"name": "张三", "age": 25, "city": "北京"},
    {"name": "李四", "age": 30, "city": "上海"},
    {"name": "王五", "age": 28, "city": "广州"},
]

fieldnames = ["name", "age", "city"]
with open("output.csv", "w", encoding="utf-8-sig", newline="") as fh:
    dict_writer = csv.DictWriter(fh, fieldnames=fieldnames)
    dict_writer.writeheader()
    dict_writer.writerows(data)

CSV 实战:学生成绩管理

python 复制代码
import csv
import os

class StudentManager:
    """CSV-backed student roster with simple score statistics."""

    def __init__(self, filename="students.csv"):
        self.filename = filename
        self.students = []
        self.load_students()

    def load_students(self):
        """Populate self.students from the CSV file, if one exists."""
        if not os.path.exists(self.filename):
            return

        with open(self.filename, "r", encoding="utf-8-sig") as fh:
            for record in csv.DictReader(fh):
                # CSV stores text only; restore the numeric fields
                record["age"] = int(record["age"])
                record["score"] = float(record["score"])
                self.students.append(record)

    def save_students(self):
        """Write the full roster back to the CSV file."""
        columns = ["name", "age", "major", "score"]
        with open(self.filename, "w", encoding="utf-8-sig", newline="") as fh:
            writer = csv.DictWriter(fh, fieldnames=columns)
            writer.writeheader()
            writer.writerows(self.students)

    def add_student(self, name, age, major, score):
        """Append one student record and persist immediately."""
        self.students.append(
            {"name": name, "age": age, "major": major, "score": score}
        )
        self.save_students()
        print(f"学生 {name} 添加成功")

    def show_all(self):
        """Print a formatted table of all students plus summary statistics."""
        if not self.students:
            print("暂无学生数据")
            return

        divider = "-" * 45
        print(f"\n{'姓名':<10}{'年龄':<6}{'专业':<15}{'成绩':<8}")
        print(divider)
        for record in self.students:
            print(f"{record['name']:<10}{record['age']:<6}"
                  f"{record['major']:<15}{record['score']:<8.1f}")

        scores = [record["score"] for record in self.students]
        print(divider)
        print(f"平均分: {sum(scores)/len(scores):.1f}")
        print(f"最高分: {max(scores):.1f}")
        print(f"最低分: {min(scores):.1f}")

# Usage example: creates/updates students.csv in the working directory
manager = StudentManager()
manager.add_student("张三", 20, "计算机科学", 90.5)
manager.add_student("李四", 22, "数学", 85.0)
manager.add_student("王五", 21, "物理", 92.3)
manager.show_all()

JSON 文件操作

基本操作

python 复制代码
import json

# Python object -> JSON string
data = {
    "name": "张三",
    "age": 25,
    "city": "北京",
    "hobbies": ["读书", "游泳", "编程"]
}

# ensure_ascii=False keeps non-ASCII characters readable; indent pretty-prints
json_str = json.dumps(data, ensure_ascii=False, indent=2)
print(json_str)

# JSON string -> Python object
parsed = json.loads(json_str)
print(parsed["name"])

读写 JSON 文件

python 复制代码
import json

# Write a JSON file
data = {
    "users": [
        {"name": "张三", "age": 25},
        {"name": "李四", "age": 30}
    ],
    "total": 2
}

with open("data.json", "w", encoding="utf-8") as f:
    json.dump(data, f, ensure_ascii=False, indent=2)

# Read a JSON file
with open("data.json", "r", encoding="utf-8") as f:
    data = json.load(f)
    print(data)

JSON 实战:配置文件管理

python 复制代码
import json
import os

class ConfigManager:
    """Small JSON-file-backed key/value configuration store."""

    def __init__(self, filename="config.json"):
        self.filename = filename
        self.config = self.load_config()

    def load_config(self):
        """Return the parsed config dict, or an empty dict when no file exists."""
        if not os.path.exists(self.filename):
            return {}
        with open(self.filename, "r", encoding="utf-8") as fh:
            return json.load(fh)

    def save_config(self):
        """Write the current config dict to disk as pretty-printed JSON."""
        with open(self.filename, "w", encoding="utf-8") as fh:
            json.dump(self.config, fh, ensure_ascii=False, indent=2)

    def get(self, key, default=None):
        """Return the value stored under *key*, or *default* when absent."""
        return self.config.get(key, default)

    def set(self, key, value):
        """Store *key* -> *value* and persist immediately."""
        self.config[key] = value
        self.save_config()

    def delete(self, key):
        """Remove *key* if present and persist the change."""
        if key in self.config:
            del self.config[key]
            self.save_config()

# Usage example: persists settings in config.json
config = ConfigManager()

# Set values (each set() writes the file immediately)
config.set("database", {
    "host": "localhost",
    "port": 3306,
    "user": "root",
    "password": "123456"
})

config.set("app_name", "我的应用")
config.set("debug", True)

# Read values back
db_config = config.get("database")
print(f"数据库主机: {db_config['host']}")
print(f"应用名称: {config.get('app_name')}")

Excel 文件操作

使用 openpyxl(需要安装)

bash 复制代码
pip install openpyxl

创建 Excel 文件

python 复制代码
from openpyxl import Workbook

# Create a workbook; .active is the default sheet
wb = Workbook()
ws = wb.active
ws.title = "学生成绩"

# Header row
headers = ["姓名", "语文", "数学", "英语", "总分"]
ws.append(headers)

# Data rows
data = [
    ["张三", 90, 85, 92],
    ["李四", 88, 92, 87],
    ["王五", 95, 90, 88]
]

for row in data:
    # Total of the three subject scores (skip the name in column 0)
    total = sum(row[1:])
    ws.append(row + [total])

# Save to disk
wb.save("scores.xlsx")
print("Excel 文件已创建")

读取 Excel 文件

python 复制代码
from openpyxl import load_workbook

# Open an existing workbook
wb = load_workbook("scores.xlsx")
ws = wb.active

# Sheet dimensions
print(f"工作表: {ws.title}")
print(f"行数: {ws.max_row}")
print(f"列数: {ws.max_column}")

# Iterate all rows; values_only yields cell values instead of Cell objects
for row in ws.iter_rows(min_row=1, values_only=True):
    print(row)

# Access a single cell by coordinate
cell_value = ws["A2"].value
print(f"A2 的值: {cell_value}")

格式化 Excel

python 复制代码
from openpyxl import Workbook
from openpyxl.styles import Font, Alignment, PatternFill

wb = Workbook()
ws = wb.active

# Fonts
bold_font = Font(bold=True, color="FF0000", size=12)
normal_font = Font(size=11)

# Alignment
center_align = Alignment(horizontal="center", vertical="center")

# Background fill
yellow_fill = PatternFill(start_color="FFFF00", end_color="FFFF00", fill_type="solid")

# Write cells and apply the styles
ws["A1"] = "姓名"
ws["B1"] = "成绩"
ws["A1"].font = bold_font
ws["B1"].font = bold_font
ws["A1"].alignment = center_align
ws["B1"].alignment = center_align

ws["A2"] = "张三"
ws["B2"] = 95
ws["B2"].fill = yellow_fill

# Column widths
ws.column_dimensions["A"].width = 15
ws.column_dimensions["B"].width = 10

wb.save("formatted.xlsx")

路径操作 pathlib

Python 3.4+ 推荐的现代化路径操作方式。

基本操作

python 复制代码
from pathlib import Path

# Build path objects
current_dir = Path(".")
absolute_path = Path("/home/user/documents")
relative_path = Path("folder/file.txt")

# The / operator joins path segments
config_path = Path.home() / ".config" / "myapp" / "config.json"
print(config_path)

# Path components
file_path = Path("documents/report.pdf")
print(file_path.name)       # report.pdf
print(file_path.stem)       # report
print(file_path.suffix)     # .pdf
print(file_path.parent)     # documents
print(file_path.absolute()) # absolute path

文件和目录操作

python 复制代码
from pathlib import Path

# Create directories
Path("new_folder").mkdir(exist_ok=True)
Path("parent/child").mkdir(parents=True, exist_ok=True)

# Inspect a path
path = Path("test.txt")
print(path.exists())      # True/False
print(path.is_file())     # True/False
print(path.is_dir())      # True/False

# Read a file
content = path.read_text(encoding="utf-8")
content_bytes = path.read_bytes()

# Write a file
path.write_text("Hello, World!", encoding="utf-8")
path.write_bytes(b"binary data")

# Delete
path.unlink()             # remove the file
Path("empty_dir").rmdir() # remove an empty directory

遍历目录

python 复制代码
from pathlib import Path

# 列出目录内容
current = Path(".")
for item in current.iterdir():
    print(item.name)

# 递归查找文件
for py_file in Path(".").rglob("*.py"):
    print(py_file)

# 查找特定模式
for txt_file in Path("documents").glob("*.txt"):
    print(txt_file)

路径操作示例

python 复制代码
from pathlib import Path

def organize_files(directory):
    """Sort the files in *directory* into subfolders named after their extensions.

    Extensionless files are left in place. A file named "a.txt" ends up at
    "<directory>/txt/a.txt".
    """
    dir_path = Path(directory)

    if not dir_path.exists():
        print("目录不存在")
        return

    # Snapshot the entries first: we create subdirectories and move files
    # below, and mutating a directory while iterating it is OS-dependent.
    for file_path in list(dir_path.iterdir()):
        if file_path.is_file():
            suffix = file_path.suffix.lower()
            if suffix:
                # Create the per-extension subdirectory (drop the leading dot)
                subdir = dir_path / suffix[1:]
                subdir.mkdir(exist_ok=True)

                # Move the file into it
                new_path = subdir / file_path.name
                file_path.rename(new_path)
                print(f"移动: {file_path.name} -> {subdir.name}/")

# Example run; expects a ./downloads directory to exist
organize_files("./downloads")

综合实战

实战1: 日志系统

python 复制代码
import os
from datetime import datetime
from pathlib import Path

class Logger:
    """Minimal file logger that writes one dated log file per day."""

    def __init__(self, log_dir="logs"):
        self.log_dir = Path(log_dir)
        self.log_dir.mkdir(exist_ok=True)

        # One log file per calendar day, named YYYY-MM-DD.log
        today = datetime.now().strftime("%Y-%m-%d")
        self.log_file = self.log_dir / f"{today}.log"

    def _get_timestamp(self):
        """Return the current time formatted for log entries."""
        return datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    def _write_log(self, level, message):
        """Append one entry to the log file and echo it to the console."""
        entry = f"[{self._get_timestamp()}] [{level}] {message}\n"

        with open(self.log_file, "a", encoding="utf-8") as fh:
            fh.write(entry)

        # Mirror the entry on stdout (without the trailing newline)
        print(entry.strip())

    def info(self, message):
        """Log at INFO level."""
        self._write_log("INFO", message)

    def warning(self, message):
        """Log at WARNING level."""
        self._write_log("WARNING", message)

    def error(self, message):
        """Log at ERROR level."""
        self._write_log("ERROR", message)

    def debug(self, message):
        """Log at DEBUG level."""
        self._write_log("DEBUG", message)

    def get_logs(self, date=None):
        """Return the lines of the given day's log (default: today's), or []."""
        target = self.log_dir / f"{date}.log" if date else self.log_file

        if not target.exists():
            return []

        with open(target, "r", encoding="utf-8") as fh:
            return fh.readlines()

    def clear_old_logs(self, days=30):
        """Delete log files whose date-named stem is older than *days* days."""
        from datetime import timedelta

        cutoff_date = datetime.now() - timedelta(days=days)

        for log_file in self.log_dir.glob("*.log"):
            # The filename stem encodes the date; skip files that don't parse
            try:
                file_date = datetime.strptime(log_file.stem, "%Y-%m-%d")
            except ValueError:
                continue
            if file_date < cutoff_date:
                log_file.unlink()
                print(f"删除旧日志: {log_file.name}")

# Usage example: writes to ./logs/YYYY-MM-DD.log
logger = Logger()

logger.info("应用程序启动")
logger.debug("加载配置文件")
logger.warning("磁盘空间不足")
logger.error("数据库连接失败")

# Read back today's entries
logs = logger.get_logs()
print("\n=== 今日日志 ===")
for log in logs:
    print(log.strip())

# Remove logs older than a week
logger.clear_old_logs(days=7)

实战2: 批量文件处理器

python 复制代码
import os
import shutil
from pathlib import Path
from datetime import datetime

class BatchFileProcessor:
    """Batch copy/move/rename/backup files from a source to a target directory.

    Running totals are kept in ``self.stats``:
    processed (handled OK), skipped (matched but not a regular file),
    errors (handler raised).
    """

    def __init__(self, source_dir, target_dir):
        self.source_dir = Path(source_dir)
        self.target_dir = Path(target_dir)
        self.stats = {
            "processed": 0,
            "skipped": 0,
            "errors": 0
        }

    def process_files(self, pattern="*.*", action="copy"):
        """Apply *action* to every file in source_dir matching *pattern*.

        Args:
            pattern: glob pattern, non-recursive. NOTE: the default "*.*"
                skips extensionless files.
            action: one of "copy", "move", "rename", "backup".
        """
        if not self.source_dir.exists():
            print(f"源目录不存在: {self.source_dir}")
            return

        # Dispatch table; fix: an unknown action used to fall through the
        # if/elif chain silently while still being counted as processed.
        handlers = {
            "copy": self._copy_file,
            "move": self._move_file,
            "rename": self._rename_file,
            "backup": self._backup_file,
        }
        handler = handlers.get(action)
        if handler is None:
            print(f"未知操作: {action}")
            return

        # Create the target directory
        self.target_dir.mkdir(parents=True, exist_ok=True)

        # Find the matching entries
        files = list(self.source_dir.glob(pattern))

        if not files:
            print("未找到匹配的文件")
            return

        print(f"找到 {len(files)} 个文件")

        for file_path in files:
            if not file_path.is_file():
                # Fix: non-regular entries (directories) were never counted
                self.stats["skipped"] += 1
                continue
            try:
                handler(file_path)
                self.stats["processed"] += 1
            except Exception as e:
                print(f"处理失败 {file_path.name}: {e}")
                self.stats["errors"] += 1

    def _copy_file(self, file_path):
        """Copy one file into target_dir, preserving metadata (copy2)."""
        target_path = self.target_dir / file_path.name
        shutil.copy2(file_path, target_path)
        print(f"复制: {file_path.name}")

    def _move_file(self, file_path):
        """Move one file into target_dir."""
        target_path = self.target_dir / file_path.name
        shutil.move(str(file_path), str(target_path))
        print(f"移动: {file_path.name}")

    def _rename_file(self, file_path):
        """Copy one file into target_dir under a timestamped name.

        NOTE(review): despite the name, the source file is copied, not
        renamed/removed — behavior kept for backward compatibility.
        """
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        new_name = f"{file_path.stem}_{timestamp}{file_path.suffix}"
        target_path = self.target_dir / new_name
        shutil.copy2(file_path, target_path)
        print(f"重命名: {file_path.name} -> {new_name}")

    def _backup_file(self, file_path):
        """Copy one file into a dated backups/YYYY-MM-DD subdirectory."""
        backup_dir = self.target_dir / "backups" / datetime.now().strftime("%Y-%m-%d")
        backup_dir.mkdir(parents=True, exist_ok=True)

        backup_path = backup_dir / file_path.name
        shutil.copy2(file_path, backup_path)
        print(f"备份: {file_path.name}")

    def filter_by_size(self, min_size=0, max_size=None):
        """Return source files (recursive) whose byte size is within [min_size, max_size]."""
        filtered = []
        for file_path in self.source_dir.rglob("*"):
            if file_path.is_file():
                size = file_path.stat().st_size
                if size >= min_size and (max_size is None or size <= max_size):
                    filtered.append(file_path)
        return filtered

    def filter_by_date(self, days_ago=7):
        """Return source files (recursive) modified within the last *days_ago* days."""
        from datetime import timedelta

        cutoff_date = datetime.now() - timedelta(days=days_ago)
        filtered = []

        for file_path in self.source_dir.rglob("*"):
            if file_path.is_file():
                mtime = datetime.fromtimestamp(file_path.stat().st_mtime)
                if mtime >= cutoff_date:
                    filtered.append(file_path)

        return filtered

    def show_stats(self):
        """Print the processed/skipped/errors counters."""
        print("\n=== 处理统计 ===")
        print(f"成功处理: {self.stats['processed']}")
        print(f"跳过: {self.stats['skipped']}")
        print(f"错误: {self.stats['errors']}")

# Usage example: expects a ./source directory containing files
processor = BatchFileProcessor("./source", "./processed")

# Copy all txt files
processor.process_files("*.txt", action="copy")

# Move all jpg files
processor.process_files("*.jpg", action="move")

# Back up every file
processor.process_files("*.*", action="backup")

# Show the counters
processor.show_stats()

# Filter by size
large_files = processor.filter_by_size(min_size=1024*1024)  # larger than 1 MB
print(f"\n大文件数量: {len(large_files)}")

# Filter by modification date
recent_files = processor.filter_by_date(days_ago=7)
print(f"最近7天修改的文件: {len(recent_files)}")

实战3: 数据转换工具

python 复制代码
import csv
import json
from pathlib import Path

class DataConverter:
    """Conversion helpers between CSV and JSON, plus CSV merge/split utilities."""

    @staticmethod
    def _write_csv(path, fieldnames, rows):
        """Write *rows* (dicts) to *path* as CSV with a header row.

        Shared by json_to_csv, merge_csv_files, and split_csv, which
        previously each repeated this block.
        """
        with open(path, "w", encoding="utf-8-sig", newline="") as f:
            writer = csv.DictWriter(f, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(rows)

    @staticmethod
    def csv_to_json(csv_file, json_file):
        """Convert a CSV file to a JSON array of objects.

        Values that parse as numbers become int/float. NOTE: this also
        turns zero-padded strings like "007" into 7 — keep values textual
        upstream if that matters (e.g. phone numbers).
        """
        data = []

        with open(csv_file, "r", encoding="utf-8-sig") as f:
            reader = csv.DictReader(f)
            for row in reader:
                converted_row = {}
                for key, value in row.items():
                    try:
                        if "." in value:
                            converted_row[key] = float(value)
                        else:
                            converted_row[key] = int(value)
                    except (ValueError, TypeError):
                        # Not numeric (or value is None): keep as-is
                        converted_row[key] = value

                data.append(converted_row)

        with open(json_file, "w", encoding="utf-8") as f:
            json.dump(data, f, ensure_ascii=False, indent=2)

        print(f"转换完成: {csv_file} -> {json_file}")
        print(f"共 {len(data)} 条记录")

    @staticmethod
    def json_to_csv(json_file, csv_file):
        """Convert a JSON array of flat objects to CSV.

        The keys of the first object become the header row.
        """
        with open(json_file, "r", encoding="utf-8") as f:
            data = json.load(f)

        if not data:
            print("JSON 文件为空")
            return

        fieldnames = list(data[0].keys())
        DataConverter._write_csv(csv_file, fieldnames, data)

        print(f"转换完成: {json_file} -> {csv_file}")
        print(f"共 {len(data)} 条记录")

    @staticmethod
    def merge_csv_files(input_files, output_file):
        """Concatenate several CSV files (header taken from the first file)."""
        all_data = []
        fieldnames = None

        for file_path in input_files:
            with open(file_path, "r", encoding="utf-8-sig") as f:
                reader = csv.DictReader(f)

                if fieldnames is None:
                    fieldnames = reader.fieldnames

                all_data.extend(reader)

        DataConverter._write_csv(output_file, fieldnames, all_data)

        print(f"合并完成: {len(input_files)} 个文件 -> {output_file}")
        print(f"共 {len(all_data)} 条记录")

    @staticmethod
    def split_csv(input_file, output_dir, rows_per_file=1000):
        """Split a large CSV into part_N.csv files of at most *rows_per_file* rows each."""
        output_path = Path(output_dir)
        output_path.mkdir(parents=True, exist_ok=True)

        with open(input_file, "r", encoding="utf-8-sig") as f:
            reader = csv.DictReader(f)
            fieldnames = reader.fieldnames

            file_count = 0
            current_rows = []

            for row in reader:
                current_rows.append(row)

                if len(current_rows) >= rows_per_file:
                    file_count += 1
                    output_file = output_path / f"part_{file_count}.csv"
                    DataConverter._write_csv(output_file, fieldnames, current_rows)
                    print(f"创建文件: {output_file} ({len(current_rows)} 条)")
                    current_rows = []

            # Flush the final partial chunk
            if current_rows:
                file_count += 1
                output_file = output_path / f"part_{file_count}.csv"
                DataConverter._write_csv(output_file, fieldnames, current_rows)
                print(f"创建文件: {output_file} ({len(current_rows)} 条)")

        print(f"拆分完成: 共 {file_count} 个文件")

# Usage example (the input files must exist)
converter = DataConverter()

# CSV -> JSON
converter.csv_to_json("students.csv", "students.json")

# JSON -> CSV
converter.json_to_csv("students.json", "students_new.csv")

# Merge several CSVs
csv_files = ["data1.csv", "data2.csv", "data3.csv"]
converter.merge_csv_files(csv_files, "merged.csv")

# Split a large file
converter.split_csv("large_file.csv", "split_output", rows_per_file=5000)

实战4: 文件搜索工具

python 复制代码
import os
from pathlib import Path
from datetime import datetime

class FileSearcher:
    """Recursive file search by name, extension, size, date, or content.

    Each search_* method overwrites ``self.results`` and also returns it.
    """

    def __init__(self, search_dir="."):
        self.search_dir = Path(search_dir)
        self.results = []  # matches from the most recent search

    def search_by_name(self, pattern, case_sensitive=False):
        """Return files whose name contains *pattern*.

        Bug fix: the old case-insensitive branch used rglob("*pattern*"),
        whose matching is case-sensitive on POSIX filesystems, so
        case_sensitive=False did not actually ignore case. Both branches
        now walk the tree once and compare names directly; the insensitive
        branch compares lowercased strings.
        """
        self.results = []
        needle = pattern if case_sensitive else pattern.lower()

        for file_path in self.search_dir.rglob("*"):
            if not file_path.is_file():
                continue
            name = file_path.name if case_sensitive else file_path.name.lower()
            if needle in name:
                self.results.append(file_path)

        return self.results

    def search_by_extension(self, extensions):
        """Return files matching any of *extensions* ("py" or ".py"; str or list)."""
        self.results = []

        if isinstance(extensions, str):
            extensions = [extensions]

        for ext in extensions:
            # Normalize "py" -> ".py"
            if not ext.startswith("."):
                ext = f".{ext}"

            for file_path in self.search_dir.rglob(f"*{ext}"):
                if file_path.is_file():
                    self.results.append(file_path)

        return self.results

    def search_by_size(self, min_size=0, max_size=None):
        """Return files whose byte size lies in [min_size, max_size]."""
        self.results = []

        for file_path in self.search_dir.rglob("*"):
            if file_path.is_file():
                size = file_path.stat().st_size

                if size >= min_size:
                    if max_size is None or size <= max_size:
                        self.results.append(file_path)

        return self.results

    def search_by_date(self, days_ago=None, before_date=None, after_date=None):
        """Return files filtered by modification time.

        Only ONE criterion is applied, in priority order: days_ago
        (modified within the last N days), then before_date, then
        after_date (both accept "YYYY-MM-DD" strings or datetimes).
        """
        from datetime import timedelta

        self.results = []

        for file_path in self.search_dir.rglob("*"):
            if not file_path.is_file():
                continue

            mtime = datetime.fromtimestamp(file_path.stat().st_mtime)

            if days_ago:
                cutoff = datetime.now() - timedelta(days=days_ago)
                if mtime >= cutoff:
                    self.results.append(file_path)
            elif before_date:
                if isinstance(before_date, str):
                    before_date = datetime.strptime(before_date, "%Y-%m-%d")
                if mtime < before_date:
                    self.results.append(file_path)
            elif after_date:
                if isinstance(after_date, str):
                    after_date = datetime.strptime(after_date, "%Y-%m-%d")
                if mtime > after_date:
                    self.results.append(file_path)

        return self.results

    def search_by_content(self, keyword, file_pattern="*.txt"):
        """Return files (matching *file_pattern*, non-recursive) whose text contains *keyword*.

        Files that are not valid UTF-8 or not readable are silently skipped.
        """
        self.results = []

        for file_path in self.search_dir.glob(file_pattern):
            if not file_path.is_file():
                continue

            try:
                with open(file_path, "r", encoding="utf-8") as f:
                    if keyword in f.read():
                        self.results.append(file_path)
            except (UnicodeDecodeError, PermissionError):
                continue

        return self.results

    def show_results(self, max_results=20):
        """Print a table of up to *max_results* matches from the last search."""
        if not self.results:
            print("未找到匹配的文件")
            return

        print(f"\n找到 {len(self.results)} 个文件:\n")
        print(f"{'文件名':<30}{'大小':<12}{'修改时间':<20}{'路径'}")
        print("-" * 100)

        for file_path in self.results[:max_results]:
            stat = file_path.stat()
            size = self._format_size(stat.st_size)
            mtime = datetime.fromtimestamp(stat.st_mtime).strftime("%Y-%m-%d %H:%M")

            print(f"{file_path.name:<30}{size:<12}{mtime:<20}{file_path.parent}")

        if len(self.results) > max_results:
            print(f"\n... 还有 {len(self.results) - max_results} 个文件")

    def _format_size(self, size_bytes):
        """Return a human-readable size string such as '1.5 MB'."""
        for unit in ['B', 'KB', 'MB', 'GB']:
            if size_bytes < 1024:
                return f"{size_bytes:.1f} {unit}"
            size_bytes /= 1024
        return f"{size_bytes:.1f} TB"

    def export_results(self, output_file="search_results.txt"):
        """Write the current result list, with sizes and mtimes, to a text report."""
        with open(output_file, "w", encoding="utf-8") as f:
            f.write(f"搜索结果 - {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
            f.write(f"搜索目录: {self.search_dir}\n")
            f.write(f"找到文件: {len(self.results)}\n")
            f.write("=" * 80 + "\n\n")

            for file_path in self.results:
                stat = file_path.stat()
                size = self._format_size(stat.st_size)
                mtime = datetime.fromtimestamp(stat.st_mtime).strftime("%Y-%m-%d %H:%M")

                f.write(f"文件: {file_path.name}\n")
                f.write(f"路径: {file_path}\n")
                f.write(f"大小: {size}\n")
                f.write(f"修改时间: {mtime}\n")
                f.write("-" * 80 + "\n")

        print(f"结果已导出到: {output_file}")

# Usage example: searches under the current directory
searcher = FileSearcher(".")

# By name
print("=== 搜索 Python 文件 ===")
searcher.search_by_name(".py")
searcher.show_results()

# By extension
print("\n=== 搜索图片和文档 ===")
searcher.search_by_extension(["jpg", "png", "pdf", "docx"])
searcher.show_results()

# By size
print("\n=== 搜索大文件(>10MB) ===")
searcher.search_by_size(min_size=10*1024*1024)
searcher.show_results()

# By modification date
print("\n=== 搜索最近7天的文件 ===")
searcher.search_by_date(days_ago=7)
searcher.show_results()

# By content
print("\n=== 搜索包含 'TODO' 的文件 ===")
searcher.search_by_content("TODO", "*.py")
searcher.show_results()

# Export the last result set
searcher.export_results("results.txt")

常见错误与注意事项

1. 忘记关闭文件

python 复制代码
# ❌ Risky: easy to forget the close()
f = open("test.txt", "r")
content = f.read()
# If an exception occurs in between, the file is never closed

# ✅ Recommended: with statement
with open("test.txt", "r") as f:
    content = f.read()
# Closed automatically

2. 编码问题

python 复制代码
# ❌ May garble Chinese text (uses the platform default encoding)
with open("chinese.txt", "r") as f:
    content = f.read()

# ✅ Specify the encoding explicitly
with open("chinese.txt", "r", encoding="utf-8") as f:
    content = f.read()

3. 文件不存在

python 复制代码
from pathlib import Path

# ❌ May raise FileNotFoundError
# with open("not_exist.txt", "r") as f:
#     content = f.read()

# ✅ Check first
file_path = Path("not_exist.txt")
if file_path.exists():
    content = file_path.read_text(encoding="utf-8")
else:
    print("文件不存在")

# Or handle the exception (EAFP style)
try:
    with open("not_exist.txt", "r", encoding="utf-8") as f:
        content = f.read()
except FileNotFoundError:
    print("文件不存在")

4. 写入时覆盖重要文件

python 复制代码
# ❌ Dangerous: silently overwrites an existing file
with open("important.txt", "w") as f:
    f.write("new content")

# ✅ Check first, or use append mode
from pathlib import Path

file_path = Path("important.txt")
if file_path.exists():
    choice = input("文件已存在,是否覆盖?(y/n): ")
    if choice.lower() != "y":
        print("取消操作")
    else:
        file_path.write_text("new content", encoding="utf-8")
else:
    file_path.write_text("new content", encoding="utf-8")

5. 大文件内存溢出

python 复制代码
# ❌ Dangerous: loads the whole file into memory at once
with open("huge_file.txt", "r") as f:
    content = f.read()  # may consume a huge amount of memory

# ✅ Read line by line
with open("huge_file.txt", "r") as f:
    for line in f:
        process(line)

# ✅ Read in fixed-size chunks
with open("huge_file.bin", "rb") as f:
    while True:
        chunk = f.read(8192)
        if not chunk:
            break
        process(chunk)

小结

操作类型 推荐模块/方法 适用场景
文本文件 open() + with 普通文本读写
二进制文件 open(mode="rb/wb") 图片、音频等
目录管理 os, shutil, pathlib 文件/目录操作
CSV 文件 csv 模块 表格数据
JSON 文件 json 模块 配置、API 数据
Excel 文件 openpyxl 电子表格
路径操作 pathlib 现代化路径处理

核心要点

  • 始终使用 with 语句管理文件
  • 处理中文时指定 encoding="utf-8"
  • 大文件使用逐行或分块读取
  • pathlib 是现代化的路径操作方式
  • 操作前检查文件是否存在
  • 注意文件打开模式的选择
  • 合理使用异常处理

掌握文件操作是 Python 编程的重要技能!

相关推荐
龙文浩_4 小时前
AI梯度下降与PyTorch张量操作技术指南
人工智能·pytorch·python·深度学习·神经网络·机器学习·自然语言处理
呱牛do it4 小时前
企业级绩效考核系统设计与实现:基于FastAPI + Vue3的全栈解决方案
python·fastapi
7年前端辞职转AI4 小时前
Python 容器数据类型
python·编程语言
云霄IT4 小时前
安卓开发之java转dex再转smali
android·java·python
果汁华4 小时前
Typer:基于类型提示的现代Python CLI框架
开发语言·网络·python
Claw开发者4 小时前
第1课:用20行Python造出你的第一个AI Agent
python
7年前端辞职转AI4 小时前
Python 流程控制语句
python·编程语言
7年前端辞职转AI4 小时前
Python 运算符
python·编程语言
zhangzeyuaaa4 小时前
Python 异常机制深度剖析
开发语言·python