
Python内置库:AI开发者的瑞士军刀
一、为什么内置库如此重要?
Python有"自带电池"(Batteries Included)的美誉,标准库提供了丰富的功能模块。掌握内置库,意味着无需安装第三方包就能解决80%的常见问题。
python
# 没有内置库:需要自己实现
def calculate_average(numbers):
    """Return the arithmetic mean of *numbers* (a non-empty sequence)."""
    total = sum(numbers)
    count = len(numbers)
    return total / count
# 使用内置库:一行搞定
import statistics
average = statistics.mean([1, 2, 3, 4, 5])
二、最常用的10个内置库速查表
| 库名 | 一句话说明 | AI开发场景 | 重要程度 |
|---|---|---|---|
| os | 操作系统接口 | 文件路径、环境变量 | ⭐⭐⭐⭐⭐ |
| sys | Python运行时 | 命令行参数、退出程序 | ⭐⭐⭐⭐⭐ |
| math | 数学函数 | 基础数学运算 | ⭐⭐⭐⭐ |
| random | 随机数生成 | 数据采样、初始化 | ⭐⭐⭐⭐ |
| datetime | 日期时间 | 日志记录、时间戳 | ⭐⭐⭐⭐ |
| json | JSON处理 | 配置文件、API交互 | ⭐⭐⭐⭐⭐ |
| csv | CSV文件 | 数据集读写 | ⭐⭐⭐⭐⭐ |
| re | 正则表达式 | 文本清洗、模式匹配 | ⭐⭐⭐⭐ |
| collections | 高级容器 | 数据结构优化 | ⭐⭐⭐⭐ |
| itertools | 迭代器工具 | 高效循环 | ⭐⭐⭐ |
三、详细讲解:每个库的核心用法
3.1 os - 操作系统接口
文件路径操作(AI开发中最常用)
python
import os
# === 路径操作 ===
# 拼接路径(跨平台)
data_dir = os.path.join('data', 'train', 'images')
# Windows: data\train\images
# Linux/Mac: data/train/images
# 获取当前工作目录
cwd = os.getcwd()
print(f"当前目录: {cwd}")
# 改变工作目录
os.chdir('/path/to/project')
# 获取文件所在目录
file_path = '/home/user/data.csv'
dir_name = os.path.dirname(file_path) # '/home/user'
base_name = os.path.basename(file_path) # 'data.csv'
# 拆分文件名和扩展名
name, ext = os.path.splitext('image.jpg') # ('image', '.jpg')
# === 文件和目录操作 ===
# 检查文件/目录是否存在
if os.path.exists('config.json'):
print("配置文件存在")
# 检查是否为文件/目录
os.path.isfile('data.csv') # True如果是文件
os.path.isdir('data') # True如果是目录
# 创建目录
os.makedirs('data/train/images', exist_ok=True) # exist_ok=True不报错
# 列出目录内容
files = os.listdir('data/')
for f in files:
print(f)
# 获取文件大小(字节)
size = os.path.getsize('large_file.csv')
print(f"文件大小: {size / 1024 / 1024:.2f} MB")
# === 环境变量 ===
# 获取环境变量
api_key = os.environ.get('OPENAI_API_KEY', 'default_key')
# 设置环境变量
os.environ['CUDA_VISIBLE_DEVICES'] = '0,1'
# === 实战:批量重命名图片 ===
def batch_rename(folder_path, prefix):
    """Rename every .jpg file in *folder_path* to ``<prefix>_<NNNN>.jpg``.

    Bug fixes versus the original:
    - The progress print had lost the source filename entirely.
    - Numbering was derived from an enumerate over *all* directory
      entries, so skipping a non-jpg file left gaps in the sequence;
      now only .jpg files are numbered, contiguously.
    - Files are sorted first so the numbering is deterministic
      (os.listdir order is arbitrary).

    NOTE(review): if a source file is already named like a target
    (e.g. ``img_0000.jpg``), os.rename may overwrite it on POSIX —
    confirm the folder never mixes renamed and raw files.
    """
    jpg_files = sorted(
        f for f in os.listdir(folder_path) if f.endswith('.jpg')
    )
    for i, filename in enumerate(jpg_files):
        old_path = os.path.join(folder_path, filename)
        new_name = f"{prefix}_{i:04d}.jpg"
        new_path = os.path.join(folder_path, new_name)
        os.rename(old_path, new_path)
        print(f"重命名: {filename} -> {new_name}")
3.2 sys - Python运行时
python
import sys
# === 命令行参数 ===
# python train.py --epochs 100 --batch_size 32
print(f"脚本名: {sys.argv[0]}")
print(f"参数列表: {sys.argv[1:]}")
# 解析简单参数
def parse_args():
    """Parse ``--key value`` pairs from sys.argv into a plain dict.

    Leading dashes are stripped from each flag; values stay strings.
    A trailing flag with no value is silently ignored.
    """
    flags = sys.argv[1:]
    parsed = {}
    # Walk the arguments two at a time: flag, then its value.
    for flag, value in zip(flags[::2], flags[1::2]):
        parsed[flag.lstrip('-')] = value
    return parsed
# === Python路径 ===
# 添加自定义模块路径
sys.path.append('/path/to/my/modules')
# 查看Python搜索路径
for path in sys.path:
print(path)
# === 退出程序 ===
if not os.path.exists('data.csv'):
print("错误:数据文件不存在")
sys.exit(1) # 退出码1表示错误
# === 标准输入输出 ===
# 从命令行读取输入
name = sys.stdin.readline().strip()
# 重定向输出
sys.stdout.write("Hello, AI World!\n")
sys.stderr.write("这是错误信息\n")
# === 版本信息 ===
print(f"Python版本: {sys.version}")
print(f"版本号: {sys.version_info}")
# === 实战:进度条 ===
def progress_bar(iterable, total=None, prefix='Progress'):
    """Yield each item of *iterable* while drawing a console progress bar.

    *total* defaults to ``len(iterable)`` when falsy; *prefix* labels
    the bar.  The bar is 50 characters wide and redrawn in place.
    """
    if not total:
        total = len(iterable)
    for index, element in enumerate(iterable, start=1):
        percent = index / total * 100
        filled = int(percent // 2)
        bar = '█' * filled + '░' * (50 - filled)
        sys.stdout.write(f'\r{prefix}: |{bar}| {percent:.1f}%')
        sys.stdout.flush()
        yield element
    sys.stdout.write('\n')
# 使用进度条
for item in progress_bar(range(100), prefix="训练"):
pass # 处理每个item
3.3 math - 数学函数
python
import math
# === 基础数学 ===
print(f"π = {math.pi}") # 3.141592653589793
print(f"e = {math.e}") # 2.718281828459045
print(f"无穷大: {math.inf}") # inf
print(f"非数字: {math.nan}") # nan
# === 常用函数 ===
# 向上取整/向下取整
print(math.ceil(3.14)) # 4
print(math.floor(3.14)) # 3
# 绝对值
print(math.fabs(-5.5)) # 5.5
# 幂和开方
print(math.pow(2, 10)) # 1024.0
print(math.sqrt(16)) # 4.0
# 指数和对数
print(math.exp(1)) # e^1 = 2.718...
print(math.log(100, 10)) # log10(100) = 2.0
print(math.log(math.e)) # ln(e) = 1.0
print(math.log10(100)) # 2.0
# 三角函数(弧度制)
angle_rad = math.radians(45) # 角度转弧度
print(math.sin(angle_rad)) # 0.707...
print(math.cos(angle_rad))
print(math.tan(angle_rad))
# === 组合数学 ===
# 阶乘
print(math.factorial(5)) # 120
# 组合数 C(5,2) = 10
print(math.comb(5, 2))
# 排列数 P(5,2) = 20
print(math.perm(5, 2))
# 最大公约数
print(math.gcd(12, 18)) # 6
# === 实战:计算准确率、精确率、召回率 ===
def calculate_metrics(tp, fp, fn, tn=0):
    """Compute classification metrics from confusion-matrix counts.

    Args:
        tp: true positives.
        fp: false positives.
        fn: false negatives.
        tn: true negatives (new parameter, defaults to 0 so existing
            three-argument calls keep working).

    Returns:
        dict with 'accuracy', 'precision', 'recall' and 'f1_score';
        every metric degrades to 0 when its denominator is zero.

    Bug fix: the original computed accuracy as (tp + fp) / (tp + fp + fn),
    which counts false positives as *correct* predictions.  Accuracy is
    (tp + tn) / total.
    """
    total = tp + fp + fn + tn
    accuracy = (tp + tn) / total if total > 0 else 0
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0
    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1_score': f1
    }
# 使用示例
metrics = calculate_metrics(tp=80, fp=10, fn=10)
for k, v in metrics.items():
print(f"{k}: {v:.4f}")
3.4 random - 随机数生成
python
import random
# === 基础随机数 ===
# 0-1之间的随机浮点数
print(random.random()) # 0.374...
# 指定范围的随机整数 [1, 10]
print(random.randint(1, 10))
# 指定范围的随机浮点数 [0.0, 1.0)
print(random.uniform(1.5, 5.5))
# === 序列随机操作(AI数据增强常用)===
data = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
# 随机选择一个元素
print(random.choice(data)) # 如 7
# 随机选择多个元素(可重复)
print(random.choices(data, k=3)) # [4, 2, 4]
# 随机选择多个元素(不重复)
print(random.sample(data, k=3)) # [5, 2, 9]
# 打乱列表(原地修改)
random.shuffle(data)
print(data) # [6, 1, 9, 3, ...]
# === 设置随机种子(保证可重复性)===
random.seed(42) # AI训练必备
print(random.random()) # 每次运行都一样
print(random.random())
# === 实战:数据划分(训练/验证/测试)===
def split_dataset(data, train_ratio=0.7, val_ratio=0.15, test_ratio=0.15):
    """Randomly split *data* into (train, val, test) lists.

    The three ratios must sum to 1 (checked to within 1e-6).  The test
    split receives whatever remains after the integer-truncated train
    and validation cuts.

    Bug fix: the original shuffled the caller's list in place; we now
    shuffle a copy, so the input is left untouched.  The random call
    sequence is unchanged, so seeded results are identical.
    """
    assert abs(train_ratio + val_ratio + test_ratio - 1.0) < 1e-6
    shuffled = list(data)        # copy first — do not mutate the caller's list
    random.shuffle(shuffled)
    total = len(shuffled)
    train_end = int(total * train_ratio)
    val_end = train_end + int(total * val_ratio)
    train = shuffled[:train_end]
    val = shuffled[train_end:val_end]
    test = shuffled[val_end:]
    return train, val, test
# 使用示例
dataset = list(range(1000))
train, val, test = split_dataset(dataset)
print(f"训练集: {len(train)}, 验证集: {len(val)}, 测试集: {len(test)}")
# === 实战:数据增强(图像简单示例)===
def image_augmentation(image):
    """Apply a random augmentation chain to *image* (illustrative demo).

    Relies on external helpers ``flip_horizontal``, ``rotate`` and
    ``adjust_brightness`` that are assumed to be defined elsewhere.
    """
    result = image.copy()
    # Horizontal flip with probability 0.5.
    if random.random() > 0.5:
        result = flip_horizontal(result)
    # Small random rotation (±10°) with probability 0.5.
    if random.random() > 0.5:
        angle = random.uniform(-10, 10)
        result = rotate(result, angle)
    # Always jitter brightness by ±20%.
    factor = random.uniform(0.8, 1.2)
    result = adjust_brightness(result, factor)
    return result
3.5 datetime - 日期时间
python
from datetime import datetime, date, time, timedelta
import time as time_module # 避免与变量名冲突
# === 获取当前时间 ===
now = datetime.now()
print(f"当前时间: {now}")
print(f"日期: {now.date()}")
print(f"时间: {now.time()}")
print(f"年: {now.year}, 月: {now.month}, 日: {now.day}")
print(f"时: {now.hour}, 分: {now.minute}, 秒: {now.second}")
# === 时间戳 ===
timestamp = now.timestamp() # Unix时间戳
print(f"时间戳: {timestamp}")
print(f"从时间戳恢复: {datetime.fromtimestamp(timestamp)}")
# === 时间格式化 ===
# 格式化输出
formatted = now.strftime("%Y-%m-%d %H:%M:%S")
print(formatted) # 2026-04-13 14:30:25
formatted = now.strftime("%Y年%m月%d日 %H:%M:%S")
print(formatted) # 2026年04月13日 14:30:25
# 字符串转datetime
date_str = "2026-04-13 14:30:00"
dt = datetime.strptime(date_str, "%Y-%m-%d %H:%M:%S")
print(dt)
# 常用格式代码
"""
%Y: 4位年份 (2026)
%y: 2位年份 (26)
%m: 月份 (01-12)
%d: 日期 (01-31)
%H: 24小时制 (00-23)
%I: 12小时制 (01-12)
%M: 分钟 (00-59)
%S: 秒 (00-59)
%A: 星期几完整名称 (Monday)
%a: 星期几缩写 (Mon)
"""
# === 时间运算 ===
# 时间差
tomorrow = now + timedelta(days=1)
yesterday = now - timedelta(days=1)
next_hour = now + timedelta(hours=1)
# 计算两个时间的差值
start = datetime(2026, 4, 13, 9, 0, 0)
end = datetime(2026, 4, 13, 17, 30, 0)
duration = end - start
print(f"工作时长: {duration}") # 8:30:00
print(f"总秒数: {duration.total_seconds()}") # 30600
# === 实战:训练日志带时间戳 ===
class TrainingLogger:
    """Append timestamped messages to a training log file."""

    def __init__(self, log_file):
        # Path of the log file; (re)opened in append mode on every write.
        self.log_file = log_file

    def log(self, message):
        """Append '[YYYY-mm-dd HH:MM:SS] message' as one line to the log."""
        timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        with open(self.log_file, 'a', encoding='utf-8') as f:
            f.write(f"[{timestamp}] {message}\n")

    def get_elapsed_time(self, start_time):
        """Return the time elapsed since *start_time* as 'HH:MM:SS'.

        Bug fix: the original read ``elapsed.seconds``, which silently
        wraps every 24 hours because timedelta stores days separately;
        ``total_seconds()`` makes runs longer than a day report the
        full hour count (e.g. '26:03:04').
        """
        total = int((datetime.now() - start_time).total_seconds())
        hours, remainder = divmod(total, 3600)
        minutes, seconds = divmod(remainder, 60)
        return f"{hours:02d}:{minutes:02d}:{seconds:02d}"
# 使用示例
logger = TrainingLogger('training.log')
start_time = datetime.now()
logger.log("训练开始")
# 模拟训练
import time
time.sleep(2) # 等待2秒
logger.log(f"训练完成,耗时: {logger.get_elapsed_time(start_time)}")
3.6 json - JSON数据处理
python
import json
# === JSON数据结构 ===
ai_config = {
"model": {
"name": "BERT-base",
"hidden_size": 768,
"num_layers": 12,
"num_heads": 12
},
"training": {
"batch_size": 32,
"learning_rate": 2e-5,
"epochs": 3,
"optimizer": "AdamW"
},
"data": {
"train_path": "./data/train.json",
"val_path": "./data/val.json"
}
}
# === 写入JSON文件 ===
with open('config.json', 'w', encoding='utf-8') as f:
json.dump(ai_config, f, indent=4, ensure_ascii=False)
# indent=4: 格式化输出
# ensure_ascii=False: 支持中文
# === 读取JSON文件 ===
with open('config.json', 'r', encoding='utf-8') as f:
loaded_config = json.load(f)
print(f"模型名称: {loaded_config['model']['name']}")
print(f"Batch size: {loaded_config['training']['batch_size']}")
# === 字符串与对象转换 ===
# 对象 → JSON字符串
json_string = json.dumps(ai_config, indent=2)
print(json_string)
# JSON字符串 → 对象
config_obj = json.loads(json_string)
# === 处理复杂类型(自定义编码)===
from datetime import datetime
def custom_encoder(obj):
    """``json.dumps`` default hook: serialize datetime as an ISO-8601 string.

    Any other unserializable type raises TypeError, as json expects.
    """
    if not isinstance(obj, datetime):
        raise TypeError(f"无法序列化类型 {type(obj)}")
    return obj.isoformat()
data = {
"name": "训练任务",
"start_time": datetime.now(),
"duration": 3600
}
# 使用自定义编码器
json_str = json.dumps(data, default=custom_encoder, indent=2)
print(json_str)
# 自定义解码器
def custom_decoder(dict_obj):
    """``json.loads`` object_hook: revive a 'start_time' ISO string to datetime.

    Dicts without a 'start_time' key pass through untouched.
    """
    if 'start_time' not in dict_obj:
        return dict_obj
    dict_obj['start_time'] = datetime.fromisoformat(dict_obj['start_time'])
    return dict_obj
loaded = json.loads(json_str, object_hook=custom_decoder)
print(f"恢复的时间对象: {loaded['start_time']}")
# === 实战:加载COCO格式标注 ===
def load_coco_annotations(json_path):
    """Load a COCO-format detection file.

    Returns (images_by_id, annotations_by_image): the first maps image
    id to its image record, the second maps image id to a list of
    {'bbox', 'category', 'area'} dicts (category ids resolved to names).
    """
    with open(json_path, 'r', encoding='utf-8') as fh:
        coco = json.load(fh)
    images = {}
    for img in coco['images']:
        images[img['id']] = img
    categories = {}
    for cat in coco['categories']:
        categories[cat['id']] = cat['name']
    # Group annotations by the image they belong to.
    annotations_by_image = {}
    for ann in coco['annotations']:
        entry = {
            'bbox': ann['bbox'],
            'category': categories[ann['category_id']],
            'area': ann['area'],
        }
        annotations_by_image.setdefault(ann['image_id'], []).append(entry)
    return images, annotations_by_image
3.7 csv - CSV文件处理
python
import csv
# === 读取CSV(标准库方式)===
# 方式1:读取为列表
with open('data.csv', 'r', encoding='utf-8') as f:
reader = csv.reader(f)
headers = next(reader) # 读取第一行作为表头
for row in reader:
print(row)
# 方式2:读取为字典(推荐)
with open('data.csv', 'r', encoding='utf-8') as f:
reader = csv.DictReader(f)
for row in reader:
print(row['name'], row['age'])
# === 写入CSV ===
# 方式1:写入列表
data = [
['Name', 'Age', 'Score'],
['Alice', 25, 95.5],
['Bob', 30, 88.0],
['Charlie', 28, 92.3]
]
with open('output.csv', 'w', encoding='utf-8', newline='') as f:
writer = csv.writer(f)
writer.writerows(data)
# 方式2:写入字典
data_dict = [
{'name': 'Alice', 'age': 25, 'score': 95.5},
{'name': 'Bob', 'age': 30, 'score': 88.0},
{'name': 'Charlie', 'age': 28, 'score': 92.3}
]
with open('output_dict.csv', 'w', encoding='utf-8', newline='') as f:
fieldnames = ['name', 'age', 'score']
writer = csv.DictWriter(f, fieldnames=fieldnames)
writer.writeheader()
writer.writerows(data_dict)
# === CSV高级技巧 ===
# 处理不同分隔符
with open('data.tsv', 'r') as f:
reader = csv.reader(f, delimiter='\t') # Tab分隔
for row in reader:
print(row)
# 处理带引号的字段
with open('data.csv', 'r') as f:
reader = csv.reader(f, quotechar='"', quoting=csv.QUOTE_ALL)
for row in reader:
print(row)
# 实战:数据清洗工具
class DataCleaner:
    """Static CSV-cleaning helpers: de-duplication and row filtering."""

    @staticmethod
    def remove_duplicates(input_file, output_file, key_column):
        """Copy *input_file* to *output_file*, keeping the first row per key.

        Rows are considered duplicates when they share the value in
        *key_column*; header order is preserved.
        """
        seen_keys = set()
        unique_rows = []
        with open(input_file, 'r', encoding='utf-8') as src:
            reader = csv.DictReader(src)
            fieldnames = reader.fieldnames
            for record in reader:
                value = record[key_column]
                if value in seen_keys:
                    continue
                seen_keys.add(value)
                unique_rows.append(record)
        with open(output_file, 'w', encoding='utf-8', newline='') as dst:
            writer = csv.DictWriter(dst, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(unique_rows)
        print(f"去重完成: {len(unique_rows)} 条记录")

    @staticmethod
    def filter_by_condition(input_file, output_file, condition_func):
        """Copy only the rows of *input_file* for which condition_func(row) is truthy."""
        with open(input_file, 'r', encoding='utf-8') as src:
            reader = csv.DictReader(src)
            fieldnames = reader.fieldnames
            kept = [record for record in reader if condition_func(record)]
        with open(output_file, 'w', encoding='utf-8', newline='') as dst:
            writer = csv.DictWriter(dst, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(kept)
        print(f"过滤完成: {len(kept)} 条记录")
# 使用示例
cleaner = DataCleaner()
cleaner.remove_duplicates('raw_data.csv', 'clean_data.csv', 'user_id')
cleaner.filter_by_condition('clean_data.csv', 'high_score.csv',
lambda x: float(x['score']) > 90)
3.8 re - 正则表达式
python
import re
# === 基本匹配 ===
text = "Contact: email@example.com, phone: 123-456-7890"
# 查找第一个匹配
email_pattern = r'\b[\w.-]+@[\w.-]+\.\w+\b'
match = re.search(email_pattern, text)
if match:
print(f"找到邮箱: {match.group()}") # email@example.com
# 查找所有匹配
phone_pattern = r'\d{3}-\d{3}-\d{4}'
phones = re.findall(phone_pattern, text)
print(f"找到电话: {phones}") # ['123-456-7890']
# === 常用正则模式 ===
patterns = {
'邮箱': r'\b[\w.-]+@[\w.-]+\.\w+\b',
'手机号': r'1[3-9]\d{9}',
'URL': r'https?://[\w.-]+(?:/[\w./-]*)?',
'IP地址': r'\b(?:\d{1,3}\.){3}\d{1,3}\b',
'日期(YYYY-MM-DD)': r'\d{4}-\d{2}-\d{2}',
'中文': r'[\u4e00-\u9fff]+',
'数字': r'\d+(?:\.\d+)?'
}
# === 文本清洗实战 ===
def clean_text(text):
    """Normalize raw text: strip HTML tags, punctuation and extra spaces.

    Keeps word characters, whitespace and CJK (U+4E00-U+9FFF), then
    collapses whitespace runs to single spaces and trims the ends.
    """
    no_html = re.sub(r'<[^>]+>', '', text)
    kept = re.sub(r'[^\w\s\u4e00-\u9fff]', '', no_html)
    collapsed = re.sub(r'\s+', ' ', kept)
    return collapsed.strip()
# 使用示例
dirty_text = "<div>Hello, 世界! @AI </div>"
clean = clean_text(dirty_text)
print(clean) # "Hello 世界 AI"
# === 数据提取实战 ===
def extract_info(log_text):
    """Pull timestamp, level and metrics out of one training-log line.

    For each known field, the value is the first capture group when the
    pattern has one, otherwise the whole match; fields that do not
    appear in *log_text* are simply absent from the result.
    """
    field_patterns = (
        ('timestamp', r'\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}'),
        ('level', r'(INFO|WARNING|ERROR|DEBUG)'),
        ('loss', r'loss: ([\d.]+)'),
        ('accuracy', r'acc: ([\d.]+)'),
        ('epoch', r'Epoch (\d+)/\d+'),
    )
    extracted = {}
    for name, pattern in field_patterns:
        hit = re.search(pattern, log_text)
        if hit is None:
            continue
        extracted[name] = hit.group(1) if hit.groups() else hit.group()
    return extracted
log_line = "2026-04-13 14:30:25 INFO - Epoch 5/10, loss: 0.2345, acc: 0.8923"
info = extract_info(log_line)
print(info)
# {'timestamp': '2026-04-13 14:30:25', 'level': 'INFO', 'loss': '0.2345', 'accuracy': '0.8923', 'epoch': '5'}
# === 替换与分割 ===
# 替换
text = "Hello World! Hello AI!"
new_text = re.sub(r'Hello', 'Hi', text)
print(new_text) # "Hi World! Hi AI!"
# 分割
csv_line = "apple,banana,orange;grape|melon"
tokens = re.split(r'[,;|]', csv_line)
print(tokens) # ['apple', 'banana', 'orange', 'grape', 'melon']
3.9 collections - 高级容器
python
from collections import defaultdict, Counter, deque, OrderedDict, namedtuple
# === 1. defaultdict:带默认值的字典 ===
# 普通字典访问不存在的键会报错
# d = {}; d['key'] # KeyError
# defaultdict:自动创建默认值
from collections import defaultdict
# 统计词频
word_count = defaultdict(int)
text = "apple banana apple orange banana apple"
for word in text.split():
word_count[word] += 1
print(dict(word_count)) # {'apple': 3, 'banana': 2, 'orange': 1}
# 按类别分组
grouped_data = defaultdict(list)
data = [('cat', 'Tom'), ('dog', 'Buddy'), ('cat', 'Jerry'), ('dog', 'Max')]
for category, name in data:
grouped_data[category].append(name)
print(dict(grouped_data)) # {'cat': ['Tom', 'Jerry'], 'dog': ['Buddy', 'Max']}
# === 2. Counter:计数器 ===
from collections import Counter
# 统计元素出现次数
colors = ['red', 'blue', 'red', 'green', 'blue', 'red']
counter = Counter(colors)
print(counter) # Counter({'red': 3, 'blue': 2, 'green': 1})
# 最常见的n个元素
print(counter.most_common(2)) # [('red', 3), ('blue', 2)]
# 加减操作
c1 = Counter(A=3, B=2)
c2 = Counter(A=1, C=4)
print(c1 + c2) # Counter({'A': 4, 'C': 4, 'B': 2})
print(c1 - c2) # Counter({'A': 2, 'B': 2})
# 实战:文本分析
def analyze_text(text):
    """Print simple word statistics for *text* and return its Counter.

    Words are lowercased and split on whitespace.
    """
    tokens = text.lower().split()
    frequencies = Counter(tokens)
    print(f"总词数: {len(tokens)}")
    print(f"不同词数: {len(frequencies)}")
    print(f"最常见的5个词: {frequencies.most_common(5)}")
    return frequencies
analyze_text("the cat and the dog and the bird")
# === 3. deque:双端队列(高效插入删除)===
from collections import deque
# 创建队列
dq = deque([1, 2, 3])
# 两端添加
dq.append(4) # 右侧添加: [1,2,3,4]
dq.appendleft(0) # 左侧添加: [0,1,2,3,4]
# 两端弹出
right = dq.pop() # 弹出右侧: 4
left = dq.popleft() # 弹出左侧: 0
# 限制队列长度(自动淘汰旧数据)
dq = deque(maxlen=3)
for i in range(5):
dq.append(i)
print(dq) # 只保留最近3个元素
# deque([0], maxlen=3)
# deque([0, 1], maxlen=3)
# deque([0, 1, 2], maxlen=3)
# deque([1, 2, 3], maxlen=3)
# deque([2, 3, 4], maxlen=3)
# 实战:滑动窗口
def sliding_window(data, window_size):
    """Yield successive lists of *window_size* consecutive items of *data*.

    Nothing is yielded until the window is full, so inputs shorter than
    *window_size* produce no windows.
    """
    buffer = deque(maxlen=window_size)
    for element in data:
        buffer.append(element)
        if len(buffer) < window_size:
            continue
        yield list(buffer)
data = [1, 2, 3, 4, 5, 6, 7]
for window in sliding_window(data, 3):
print(window)
# [1, 2, 3]
# [2, 3, 4]
# [3, 4, 5]
# ...
# === 4. namedtuple:轻量级数据类 ===
from collections import namedtuple
# 定义数据类
Point = namedtuple('Point', ['x', 'y'])
Student = namedtuple('Student', ['name', 'age', 'score'])
# 创建实例
p = Point(10, 20)
s = Student('Alice', 25, 95.5)
# 访问属性
print(p.x, p.y) # 10 20
print(s.name, s.age) # Alice 25
# 转为字典
print(s._asdict()) # {'name': 'Alice', 'age': 25, 'score': 95.5}
# 替换值(返回新对象)
s_new = s._replace(score=98.0)
print(s_new)
# 实战:数据批次
Batch = namedtuple('Batch', ['inputs', 'labels', 'batch_size'])
batch = Batch(
inputs=torch.randn(32, 3, 224, 224),
labels=torch.randint(0, 10, (32,)),
batch_size=32
)
# === 5. OrderedDict:有序字典 ===
from collections import OrderedDict
# 保持插入顺序
od = OrderedDict()
od['a'] = 1
od['b'] = 2
od['c'] = 3
for key, value in od.items():
print(key, value) # 保持 a,b,c 顺序
# 移动到最后
od.move_to_end('a')
for key, value in od.items():
print(key, value) # b, c, a
# 移到最前
od.move_to_end('c', last=False)
3.10 itertools - 高效迭代器
python
import itertools
# === 1. count:无限计数器 ===
for i in itertools.count(start=0, step=2):
if i > 10:
break
print(i) # 0, 2, 4, 6, 8, 10
# === 2. cycle:循环迭代 ===
colors = itertools.cycle(['red', 'green', 'blue'])
for i, color in enumerate(colors):
if i >= 6:
break
print(color) # red, green, blue, red, green, blue
# === 3. repeat:重复元素 ===
for x in itertools.repeat('hello', 3):
print(x) # hello, hello, hello
# === 4. chain:连接多个迭代器 ===
list1 = [1, 2, 3]
list2 = [4, 5, 6]
list3 = [7, 8, 9]
combined = itertools.chain(list1, list2, list3)
print(list(combined)) # [1, 2, 3, 4, 5, 6, 7, 8, 9]
# === 5. product:笛卡尔积 ===
colors = ['red', 'blue']
sizes = ['S', 'M', 'L']
combinations = itertools.product(colors, sizes)
print(list(combinations))
# [('red', 'S'), ('red', 'M'), ('red', 'L'), ('blue', 'S'), ('blue', 'M'), ('blue', 'L')]
# 超参数网格搜索
params = {
'lr': [0.001, 0.01, 0.1],
'batch_size': [16, 32, 64],
'optimizer': ['SGD', 'Adam']
}
keys = params.keys()
for values in itertools.product(*params.values()):
config = dict(zip(keys, values))
print(config)
# {'lr': 0.001, 'batch_size': 16, 'optimizer': 'SGD'}
# ...
# === 6. permutations:排列 ===
items = [1, 2, 3]
perms = itertools.permutations(items, 2) # 选2个排列
print(list(perms))
# [(1, 2), (1, 3), (2, 1), (2, 3), (3, 1), (3, 2)]
# === 7. combinations:组合 ===
combs = itertools.combinations(items, 2) # 选2个组合
print(list(combs)) # [(1, 2), (1, 3), (2, 3)]
# === 8. groupby:分组 ===
data = [('cat', 5), ('cat', 3), ('dog', 2), ('dog', 4), ('bird', 1)]
data.sort(key=lambda x: x[0]) # 必须先排序
for key, group in itertools.groupby(data, key=lambda x: x[0]):
values = [item[1] for item in group]
print(f"{key}: {values}")
# cat: [5, 3]
# dog: [2, 4]
# bird: [1]
# === 实战:批量处理 ===
def batch_iter(data, batch_size):
    """Yield lists of up to *batch_size* items until *data* is exhausted.

    The final batch may be shorter than *batch_size*.
    """
    iterator = iter(data)
    chunk = list(itertools.islice(iterator, batch_size))
    while chunk:
        yield chunk
        chunk = list(itertools.islice(iterator, batch_size))
data = list(range(20))
for batch in batch_iter(data, 6):
print(batch)
# [0, 1, 2, 3, 4, 5]
# [6, 7, 8, 9, 10, 11]
# [12, 13, 14, 15, 16, 17]
# [18, 19]
# 实战:数据增强组合
augmentations = ['rotate', 'flip', 'crop', 'scale']
for r in range(1, 3): # 1-2种增强组合
for combo in itertools.combinations(augmentations, r):
print(f"增强组合: {combo}")
四、内置库速查表(完整版)
4.1 常用函数快速参考
| 函数 | 模块 | 作用 | 示例 |
|---|---|---|---|
| open() | built-in | 打开文件 | f = open('file.txt', 'r') |
| len() | built-in | 获取长度 | len([1,2,3]) → 3 |
| range() | built-in | 生成序列 | list(range(5)) → [0,1,2,3,4] |
| enumerate() | built-in | 带索引遍历 | for i, v in enumerate(list) |
| zip() | built-in | 并行遍历 | for a,b in zip(list1, list2) |
| map() | built-in | 批量映射 | list(map(str, [1,2,3])) |
| filter() | built-in | 条件筛选 | list(filter(lambda x: x>0, [-1,2,-3])) |
| sorted() | built-in | 排序 | sorted([3,1,2]) → [1,2,3] |
| sum() | built-in | 求和 | sum([1,2,3]) → 6 |
| max()/min() | built-in | 最大/最小值 | max([1,2,3]) → 3 |
| abs() | built-in | 绝对值 | abs(-5) → 5 |
| round() | built-in | 四舍五入 | round(3.14159, 2) → 3.14 |
| isinstance() | built-in | 类型检查 | isinstance(5, int) → True |
| type() | built-in | 获取类型 | type(3.14) → float |
4.2 各库核心类/方法速查
python
# === os 核心 ===
os.path.join() # 拼接路径
os.path.exists() # 检查存在
os.listdir() # 列出目录
os.makedirs() # 创建目录
os.getcwd() # 当前目录
os.environ.get() # 获取环境变量
# === sys 核心 ===
sys.argv # 命令行参数
sys.exit() # 退出程序
sys.path # 模块搜索路径
sys.stdout.write() # 标准输出
sys.version # Python版本
# === math 核心 ===
math.sqrt() # 平方根
math.pow() # 幂运算
math.log() # 对数
math.sin/cos/tan() # 三角函数
math.pi / math.e # 常数
math.factorial() # 阶乘
# === random 核心 ===
random.random() # [0,1)随机数
random.randint() # 随机整数
random.choice() # 随机选择
random.shuffle() # 打乱列表
random.sample() # 无放回采样
random.seed() # 设置种子
# === datetime 核心 ===
datetime.now() # 当前时间
datetime.strftime() # 格式化输出
datetime.strptime() # 字符串解析
timedelta() # 时间差
# === json 核心 ===
json.load() # 文件→对象
json.dump() # 对象→文件
json.loads() # 字符串→对象
json.dumps() # 对象→字符串
# === csv 核心 ===
csv.reader() # 读为列表
csv.DictReader() # 读为字典
csv.writer() # 写为列表
csv.DictWriter() # 写为字典
# === re 核心 ===
re.search() # 查找第一个
re.findall() # 查找所有
re.sub() # 替换
re.split() # 分割
re.compile() # 编译模式
# === collections 核心 ===
defaultdict() # 默认值字典
Counter() # 计数器
deque() # 双端队列
namedtuple() # 命名元组
OrderedDict() # 有序字典
# === itertools 核心 ===
itertools.chain() # 连接迭代器
itertools.product() # 笛卡尔积
itertools.permutations() # 排列
itertools.combinations() # 组合
itertools.groupby() # 分组
五、实战项目:数据预处理流水线
综合运用多个内置库的完整示例:
python
import os
import sys
import json
import csv
import re
import random
from datetime import datetime
from collections import defaultdict, Counter
import itertools
class DataPreprocessingPipeline:
    """End-to-end data preprocessing pipeline.

    Walks *input_dir* for .csv/.json/.txt files, cleans their text
    fields, gathers statistics, splits the records into train/val/test
    sets and writes the results (plus a log and a statistics report)
    into *output_dir*.
    """

    def __init__(self, input_dir, output_dir):
        self.input_dir = input_dir      # root folder scanned for raw files
        self.output_dir = output_dir    # all artifacts are written here
        self.stats = defaultdict(int)   # per-file-type processing counters
        self.start_time = datetime.now()  # for the final elapsed-time report
        # Create the output directory up front (idempotent).
        os.makedirs(output_dir, exist_ok=True)

    def log(self, message):
        """Print *message* with a timestamp and append it to pipeline.log."""
        timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        log_msg = f"[{timestamp}] {message}"
        print(log_msg)
        # Mirror every console line into the log file.
        with open(os.path.join(self.output_dir, 'pipeline.log'), 'a') as f:
            f.write(log_msg + '\n')

    def discover_files(self):
        """Recursively collect every .csv/.json/.txt path under input_dir."""
        files = []
        for root, dirs, filenames in os.walk(self.input_dir):
            for filename in filenames:
                if filename.endswith(('.csv', '.json', '.txt')):
                    filepath = os.path.join(root, filename)
                    files.append(filepath)
        self.log(f"发现 {len(files)} 个文件")
        return files

    def clean_text(self, text):
        """Clean one text value; non-strings become the empty string."""
        if not isinstance(text, str):
            return ""
        # Strip HTML tags.
        text = re.sub(r'<[^>]+>', '', text)
        # Drop special characters; keep CJK, letters, digits and whitespace.
        text = re.sub(r'[^\w\s\u4e00-\u9fff]', '', text)
        # Collapse runs of whitespace into single spaces.
        text = re.sub(r'\s+', ' ', text)
        # Trim leading/trailing spaces.
        text = text.strip()
        return text

    def process_csv(self, filepath):
        """Clean every string field of one CSV file.

        Returns (rows, headers); rows whose fields all cleaned to empty
        are dropped.  Updates the csv counters in self.stats.
        """
        data = []
        with open(filepath, 'r', encoding='utf-8') as f:
            reader = csv.DictReader(f)
            headers = reader.fieldnames
            for row in reader:
                # Clean each text field in place.
                for key, value in row.items():
                    if isinstance(value, str):
                        row[key] = self.clean_text(value)
                # Keep the row only if at least one field is non-empty.
                if any(row.values()):
                    data.append(row)
        self.stats['csv_processed'] += 1
        self.stats['csv_rows'] += len(data)
        return data, headers

    def process_json(self, filepath):
        """Clean string values of one JSON file (list of dicts, or one dict)."""
        with open(filepath, 'r', encoding='utf-8') as f:
            data = json.load(f)
        # A list: clean each dict element's string values.
        if isinstance(data, list):
            for item in data:
                if isinstance(item, dict):
                    for key, value in item.items():
                        if isinstance(value, str):
                            item[key] = self.clean_text(value)
        # A single dict: clean its top-level string values.
        elif isinstance(data, dict):
            for key, value in data.items():
                if isinstance(value, str):
                    data[key] = self.clean_text(value)
        self.stats['json_processed'] += 1
        return data

    def split_dataset(self, data, train_ratio=0.7, val_ratio=0.15):
        """Seeded shuffle-and-split; the remainder becomes the test set.

        NOTE(review): shuffles *data* in place — harmless here because
        run() does not reuse the original order, but callers should know.
        """
        random.seed(42)
        random.shuffle(data)
        total = len(data)
        train_end = int(total * train_ratio)
        val_end = train_end + int(total * val_ratio)
        return {
            'train': data[:train_end],
            'val': data[train_end:val_end],
            'test': data[val_end:]
        }

    def generate_statistics(self, data):
        """Summarize sample count, text lengths and label distribution.

        Only records that carry 'text' / 'label' keys contribute to the
        length list / label counter respectively.
        """
        stats = {
            'total_samples': len(data),
            'text_lengths': [],
            'label_distribution': Counter()
        }
        for item in data:
            # Track text length when a 'text' field is present.
            if 'text' in item:
                stats['text_lengths'].append(len(item['text']))
            # Track label frequency when a 'label' field is present.
            if 'label' in item:
                stats['label_distribution'][item['label']] += 1
        if stats['text_lengths']:
            stats['avg_length'] = sum(stats['text_lengths']) / len(stats['text_lengths'])
            stats['max_length'] = max(stats['text_lengths'])
            stats['min_length'] = min(stats['text_lengths'])
        return stats

    def save_results(self, data, filename):
        """Write *data* into output_dir as CSV or JSON, chosen by extension.

        CSV headers come from the first record's keys; empty data writes
        no CSV file at all (only the log line is emitted).
        """
        output_path = os.path.join(self.output_dir, filename)
        if filename.endswith('.csv'):
            if data:
                headers = data[0].keys()
                with open(output_path, 'w', encoding='utf-8', newline='') as f:
                    writer = csv.DictWriter(f, fieldnames=headers)
                    writer.writeheader()
                    writer.writerows(data)
        elif filename.endswith('.json'):
            with open(output_path, 'w', encoding='utf-8') as f:
                json.dump(data, f, indent=2, ensure_ascii=False)
        self.log(f"保存文件: {output_path}")

    def run(self):
        """Execute the full pipeline; returns the train/val/test split dict."""
        self.log("=" * 50)
        self.log("数据预处理流水线启动")
        self.log("=" * 50)
        # 1. Discover input files.
        files = self.discover_files()
        # 2. Process each file according to its extension.
        all_data = []
        for filepath in files:
            self.log(f"处理: {filepath}")
            if filepath.endswith('.csv'):
                data, headers = self.process_csv(filepath)
                all_data.extend(data)
            elif filepath.endswith('.json'):
                data = self.process_json(filepath)
                if isinstance(data, list):
                    all_data.extend(data)
                else:
                    all_data.append(data)
            elif filepath.endswith('.txt'):
                with open(filepath, 'r', encoding='utf-8') as f:
                    text = f.read()
                cleaned = self.clean_text(text)
                all_data.append({'text': cleaned, 'source': filepath})
        self.log(f"总共加载 {len(all_data)} 条记录")
        # 3. Compute and persist statistics (default=str covers Counter etc.).
        stats = self.generate_statistics(all_data)
        self.log(f"统计信息: {stats}")
        with open(os.path.join(self.output_dir, 'statistics.json'), 'w') as f:
            json.dump(dict(stats), f, indent=2, default=str)
        # 4. Split into train/val/test.
        splits = self.split_dataset(all_data)
        self.log(f"数据集划分: 训练={len(splits['train'])}, "
                 f"验证={len(splits['val'])}, 测试={len(splits['test'])}")
        # 5. Save the three splits.
        self.save_results(splits['train'], 'train.csv')
        self.save_results(splits['val'], 'val.csv')
        self.save_results(splits['test'], 'test.csv')
        # 6. Final report.
        elapsed = datetime.now() - self.start_time
        self.log("=" * 50)
        self.log(f"流水线完成!耗时: {elapsed.total_seconds():.2f}秒")
        self.log(f"统计: {dict(self.stats)}")
        self.log("=" * 50)
        return splits
# 使用示例
if __name__ == "__main__":
pipeline = DataPreprocessingPipeline(
input_dir="./raw_data",
output_dir="./processed_data"
)
result = pipeline.run()
print(f"处理完成,训练集大小: {len(result['train'])}")
六、学习路径与练习建议
6.1 掌握顺序(由易到难)
第1周:os, sys, math, random
第2周:datetime, json, csv
第3周:re, collections
第4周:itertools + 综合项目
6.2 练习任务
基础练习(1-2天):
python
# 练习1:使用os模块遍历目录,统计每种文件类型的数量
# 练习2:使用random实现一个简单的数据采样器
# 练习3:使用datetime记录程序运行时间
进阶练习(3-4天):
python
# 练习4:使用csv和json实现数据格式转换器
# 练习5:使用re编写一个日志解析器
# 练习6:使用collections.Counter分析文本词频
综合项目(1周):
python
# 项目:构建一个日志分析系统
# - 使用os遍历日志文件
# - 使用re提取关键信息
# - 使用datetime分析时间分布
# - 使用Counter统计错误类型
# - 使用json保存分析结果
# - 使用csv生成报表
6.3 记忆技巧
口诀记忆法:
文件路径找os,参数退出找sys
数学计算找math,随机采样找random
时间日期datetime,配置文件找json
表格数据用csv,文本匹配用正则
高级容器collections,高效迭代itertools
场景对应表:
| 遇到的需求 | 使用的库 |
|---|---|
| 读取配置文件 | json / csv |
| 遍历文件夹 | os |
| 生成随机数 | random |
| 计算平均值 | statistics / math |
| 记录时间 | datetime |
| 解析日志 | re |
| 统计频率 | collections.Counter |
| 排列组合 | itertools |
| 处理命令行 | sys.argv |
| 设置随机种子 | random.seed() |
七、常见坑点与解决方案
| 问题 | 原因 | 解决方案 |
|---|---|---|
| FileNotFoundError | 路径错误 | 使用os.path.exists()检查 |
| 中文乱码 | 编码问题 | 指定encoding='utf-8' |
| random结果不同 | 未设种子 | 调用random.seed(42) |
| JSON解析失败 | 格式错误 | 先用json.loads()测试 |
| CSV字段包含逗号 | 未用引号 | 使用quoting=csv.QUOTE_ALL |
| 正则匹配太慢 | 贪婪匹配 | 使用*?非贪婪模式 |
| 大文件内存爆炸 | 一次性读取 | 使用迭代器逐行处理 |
| 时区问题 | datetime naive | 使用pytz或zoneinfo |
八、总结
核心要点:
- os和sys:文件系统和运行时,AI项目的基础设施
- math和random:数学计算和随机性,模型训练的基石
- datetime:时间管理,实验追踪必备
- json和csv:数据交换格式,AI数据集的标配
- re:文本处理利器,数据清洗必备
- collections:数据结构优化,提升代码效率
- itertools:迭代器工具,优雅处理循环
学习建议:
- 不需要一次性记住所有函数
- 掌握每个库的核心3-5个函数即可
- 实践中遇到需求再查文档
- 多写小项目巩固理解
进阶方向:
- 学习 functools(函数式编程)
- 学习 argparse(命令行参数解析)
- 学习 logging(专业日志系统)
- 学习 multiprocessing(多进程处理)
记住:内置库是Python的宝藏,掌握它们,你就能用最少的代码做最多的事情。