Linux Text-Processing Trio (grep, sed, awk): Core Usage and Practice
1. Overview of the Text-Processing Trio
1.1 Tool Positioning and Use Cases
```mermaid
graph TB
A[Text-processing need] --> B{Type of task}
B -->|Pattern search| C[grep]
B -->|Stream editing| D[sed]
B -->|Report generation| E[awk]
C --> F[Fast filtering]
D --> G[Bulk substitution]
E --> H[Data extraction]
F --> I[Output]
G --> I
H --> I
style A fill:#1e3a5f,color:#ffffff
style B fill:#4a1e5f,color:#ffffff
style C fill:#1e5f3a,color:#ffffff
style D fill:#1e5f3a,color:#ffffff
style E fill:#1e5f3a,color:#ffffff
style F fill:#5f3a1e,color:#ffffff
style G fill:#5f3a1e,color:#ffffff
style H fill:#5f3a1e,color:#ffffff
style I fill:#1e3a5f,color:#ffffff
```
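Before the details, a minimal sketch of this division of labor, using the users.txt file created in the next subsection (CSV columns: id, name, age, title, city, salary):
```bash
grep '北京' users.txt                               # grep: filter rows matching a pattern (Beijing)
sed 's/北京/Beijing/' users.txt                     # sed: rewrite text in the stream
awk -F, '{sum += $6} END {print sum/NR}' users.txt  # awk: compute the average salary
```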
1.2 Creating the Test Data Files
Create the base test-data script: create_test_data.sh
```bash
#!/bin/bash
# Script that creates the test data
set -e
echo "=== Creating text-processing test data ==="
# Create the user data file
cat > users.txt << 'EOF'
1,张三,25,工程师,北京,50000
2,李四,30,经理,上海,80000
3,王五,28,设计师,广州,60000
4,赵六,35,总监,深圳,120000
5,钱七,22,实习生,杭州,30000
6,孙八,40,顾问,成都,90000
7,周九,26,开发,西安,55000
8,吴十,33,产品,武汉,70000
9,郑十一,29,测试,南京,52000
10,王十二,31,运维,长沙,58000
EOF
# Create the access-log file
cat > access.log << 'EOF'
192.168.1.100 - - [10/Oct/2023:10:30:01 +0800] "GET /index.html HTTP/1.1" 200 1234 "https://example.com" "Mozilla/5.0"
192.168.1.101 - - [10/Oct/2023:10:30:02 +0800] "POST /api/login HTTP/1.1" 401 567 "https://example.com" "Mozilla/5.0"
192.168.1.102 - - [10/Oct/2023:10:30:03 +0800] "GET /products.html HTTP/1.1" 200 7890 "https://example.com" "Chrome/91.0"
192.168.1.100 - - [10/Oct/2023:10:30:04 +0800] "GET /images/logo.png HTTP/1.1" 304 0 "https://example.com" "Mozilla/5.0"
192.168.1.103 - - [10/Oct/2023:10:30:05 +0800] "PUT /api/users/1 HTTP/1.1" 403 234 "https://example.com" "Firefox/89.0"
192.168.1.104 - - [10/Oct/2023:10:30:06 +0800] "GET /contact.html HTTP/1.1" 200 3456 "https://example.com" "Safari/14.0"
192.168.1.101 - - [10/Oct/2023:10:30:07 +0800] "DELETE /api/products/5 HTTP/1.1" 204 0 "https://example.com" "Mozilla/5.0"
192.168.1.105 - - [10/Oct/2023:10:30:08 +0800] "GET /about.html HTTP/1.1" 500 123 "https://example.com" "Chrome/92.0"
EOF
# Create the configuration file
cat > config.txt << 'EOF'
# Database settings
database.host=localhost
database.port=3306
database.name=myapp
database.user=admin
database.password=secret123
# Application settings
app.name=MyApplication
app.version=1.0.0
app.port=8080
app.debug=true
# Logging settings
log.level=INFO
log.file=/var/log/app.log
log.max_size=100MB
# Feature flags
feature.auth=true
feature.cache=false
feature.export=true
EOF
# Create the multilingual text file
cat > multilang.txt << 'EOF'
Hello world! 你好世界! Bonjour le monde!
This is a test. 这是一个测试。 C'est un test.
Programming is fun. 编程很有趣。 La programmation est amusante.
Linux is powerful. Linux很强大。 Linux est puissant.
Open source software. 开源软件。 Logiciel open source.
EOF
# Create the CSV sales-data file
cat > sales.csv << 'EOF'
Date,Product,Category,Region,Sales,Quantity
2023-10-01,Laptop,Electronics,North,50000,10
2023-10-01,Phone,Electronics,South,30000,15
2023-10-02,Desk,Furniture,East,15000,5
2023-10-02,Chair,Furniture,West,8000,8
2023-10-03,Monitor,Electronics,North,20000,4
2023-10-03,Keyboard,Electronics,South,5000,10
2023-10-04,Table,Furniture,East,12000,3
2023-10-04,Books,Education,West,3000,30
EOF
echo "测试数据文件创建完成:"
echo " users.txt - 用户数据"
echo " access.log - 访问日志"
echo " config.txt - 配置文件"
echo " multilang.txt - 多语言文本"
echo " sales.csv - 销售数据"
2. grep - The Text-Search Specialist
2.1 grep Basics
Create the grep basics tutorial: grep_basics.sh
```bash
#!/bin/bash
# grep basics tutorial
echo "=== grep basics ==="
# 1. Basic search
echo -e "\n1. Basic search:"
echo "Lines containing '工程师' (engineer):"
grep '工程师' users.txt
# 2. Case-insensitive search
echo -e "\n2. Case-insensitive search:"
echo "Lines containing 'get' (ignoring case):"
grep -i 'get' access.log
# 3. Show line numbers
echo -e "\n3. Show line numbers:"
echo "Search for '北京' (Beijing) with line numbers:"
grep -n '北京' users.txt
# 4. Inverted match
echo -e "\n4. Inverted match (lines NOT containing the pattern):"
echo "Lines not containing 'GET':"
grep -v 'GET' access.log
# 5. Count matching lines
echo -e "\n5. Count matches:"
echo "Number of requests with status code 200:"
grep -c '" 200 ' access.log   # anchor on the status field; a bare '200' would also match byte counts
# 6. List matching file names
echo -e "\n6. List matching file names:"
echo "Search across multiple files:"
grep -l 'admin' *.txt
# 7. Recursive search
echo -e "\n7. Recursive directory search:"
echo "Recursively search the current directory for 'localhost':"
grep -r 'localhost' . 2>/dev/null || echo "search finished"
# 8. Whole-word match
echo -e "\n8. Whole-word match:"
echo "Search for the whole word 'test':"
grep -w 'test' multilang.txt
# 9. Show context
echo -e "\n9. Show context:"
echo "Search for '500' with 2 lines of context:"
grep -C 2 '500' access.log
# 10. Print only the match
echo -e "\n10. Print only the matching part:"
echo "Print only the matching IP addresses:"
grep -o '[0-9]\+\.[0-9]\+\.[0-9]\+\.[0-9]\+' access.log
# 11. Extended regular expressions
echo -e "\n11. Extended regular expressions:"
echo "Search for several status codes with ERE:"
grep -E '(200|404|500)' access.log
# 12. Fixed-string search
echo -e "\n12. Fixed-string search:"
echo "Search for the literal string '1.0.0':"
grep -F '1.0.0' config.txt
```
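A recurring detail worth pinning down: plain grep uses basic regular expressions (BRE), where +, ?, | and () only act as operators when backslash-escaped (and even that is a GNU extension for + and ?), while grep -E (ERE) treats them as operators directly. A minimal illustration:
```bash
echo 'aaa' | grep 'a\+'          # BRE: + must be escaped to mean "one or more" (GNU grep)
echo 'aaa' | grep -E 'a+'        # ERE: + works unescaped
echo 'cat' | grep -E 'cat|dog'   # ERE alternation; in BRE this would be 'cat\|dog' (GNU extension)
```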
2.2 Advanced grep and Regular Expressions
Create the advanced grep tutorial: grep_advanced.sh
```bash
#!/bin/bash
# Advanced grep and regular expressions
echo "=== Advanced grep usage ==="
# Create a richer test file
cat > regex_test.txt << 'EOF'
email1: john.doe@example.com
email2: jane_smith123@company.co.uk
phone1: +1-555-123-4567
phone2: (555) 987-6543
date1: 2023-10-15
date2: 10/15/2023
ip1: 192.168.1.1
ip2: 10.0.0.255
url1: https://www.example.com/path
url2: http://localhost:8080/api/v1/users
html: <div class="container">Content</div>
json: {"name": "John", "age": 30, "active": true}
credit_card: 4111-1111-1111-1111
ssn: 123-45-6789
EOF
# 1. Basic regular expressions
echo -e "\n1. Basic regular expressions:"
echo "Match email addresses:"
grep -E '[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}' regex_test.txt
echo -e "\nMatch IP addresses:"
grep -E '([0-9]{1,3}\.){3}[0-9]{1,3}' regex_test.txt
echo -e "\nMatch dates (YYYY-MM-DD):"
grep -E '[0-9]{4}-[0-9]{2}-[0-9]{2}' regex_test.txt
# 2. Character classes
echo -e "\n2. Character classes:"
echo "Match digits:"
grep '[0-9]' regex_test.txt | head -3
echo -e "\nMatch lowercase letters:"
grep '[a-z]' regex_test.txt | head -3
echo -e "\nMatch alphanumeric characters:"
grep '[[:alnum:]]' regex_test.txt | head -3
# 3. Quantifiers
echo -e "\n3. Quantifiers:"
echo "Match three consecutive digits:"
grep -E '[0-9]{3}' regex_test.txt
echo -e "\nMatch one or more digits:"
grep -E '[0-9]+' regex_test.txt | head -3
echo -e "\nMatch zero or more letters (matches every line, since zero occurrences qualify):"
grep -E '[a-z]*' regex_test.txt | head -3
# 4. Groups and backreferences
echo -e "\n4. Groups and backreferences (backreferences in ERE are a GNU extension):"
echo "Match a repeated word:"
echo "hello hello world test test example" | grep -E '(\b\w+\b) \1'
# 5. Anchors
echo -e "\n5. Anchors:"
echo "Match lines starting with 'email':"
grep '^email' regex_test.txt
echo -e "\nMatch lines ending with '.com':"
grep '\.com$' regex_test.txt
echo -e "\nMatch the whole word 'json':"
grep '\bjson\b' regex_test.txt
# 6. Alternation
echo -e "\n6. Alternation:"
echo "Match 'http' or 'https':"
grep -E 'https?' regex_test.txt
echo -e "\nMatch several date formats:"
grep -E '([0-9]{4}-[0-9]{2}-[0-9]{2}|[0-9]{2}/[0-9]{2}/[0-9]{4})' regex_test.txt
# 7. Lookaround assertions
echo -e "\n7. Lookaround assertions (require PCRE, i.e. grep -P):"
echo "Match digits followed by '@':"
grep -P '[0-9]+(?=@)' regex_test.txt 2>/dev/null || echo "PCRE not supported; use another approach"
# 8. Complex pattern matching
echo -e "\n8. Complex pattern matching:"
echo "Extract the domain from URLs:"
grep -oE 'https?://[^/]+' regex_test.txt
echo -e "\nExtract a JSON field value:"
grep -oE '"name": "[^"]+"' regex_test.txt
# 9. grep combination tricks
echo -e "\n9. grep combination tricks:"
echo "Search for several patterns:"
grep -e 'email' -e 'phone' regex_test.txt
echo -e "\nExclude several patterns:"
grep -v -e 'email' -e 'phone' regex_test.txt
echo -e "\nCombine greps in a pipeline:"
grep 'example' regex_test.txt | grep -v 'email'
# 10. Performance tips
echo -e "\n10. Performance tips:"
echo "Speed up with fixed-string search:"
time grep -F 'example.com' regex_test.txt
echo -e "\nUse a simple character class:"
time grep '[0-9]' regex_test.txt
# Clean up
rm -f regex_test.txt
echo -e "\n=== Advanced grep demo complete ==="
```
2.3 grep in Practice
Create the grep practice script: grep_practical.sh
```bash
#!/bin/bash
# grep practice cases
echo "=== grep in practice ==="
# Case 1: Log analysis
echo -e "\nCase 1: log analysis"
echo "1. Find failed requests:"
grep -E '(404|500|403)' access.log
echo -e "\n2. Count occurrences of each status code:"
grep -oE 'HTTP/1.1" [0-9]{3}' access.log | awk '{print $2}' | sort | uniq -c | sort -rn
echo -e "\n3. Find requests from a specific IP:"
grep '192.168.1.100' access.log
echo -e "\n4. Find log entries from a specific time window:"
grep '10/Oct/2023:10:30:0[2-5]' access.log
# Case 2: Configuration files
echo -e "\nCase 2: configuration files"
echo "1. Extract all settings (excluding comments):"
grep -v '^#' config.txt | grep '='
echo -e "\n2. Find database-related settings:"
grep -i 'database' config.txt
echo -e "\n3. Extract a setting's value:"
grep 'app.port' config.txt | grep -oE '[0-9]+'
echo -e "\n4. Find enabled features:"
grep 'feature.' config.txt | grep 'true'
# Case 3: Data extraction
echo -e "\nCase 3: data extraction"
echo "1. Extract all user names (second CSV column):"
grep -oE '^[0-9]+,[^,]+' users.txt | grep -oE '[^,]+$' | head -5
echo -e "\n2. Find well-paid employees (salary > 70000):"
grep -E ',[0-9]{5,6}$' users.txt | awk -F, '$6 > 70000'
echo -e "\n3. Count employees per city (second-to-last CSV column):"
grep -oE '[^,]+,[0-9]+$' users.txt | grep -oE '^[^,]+' | sort | uniq -c
# Case 4: Code analysis
echo -e "\nCase 4: code analysis (mock)"
# Create a mock code file
cat > sample_code.py << 'EOF'
#!/usr/bin/env python3
"""
Sample code file
"""
import sys
import os
from typing import List

def calculate_sum(numbers: List[int]) -> int:
    """Sum a list of numbers"""
    total = 0
    for num in numbers:
        total += num
    return total

def read_file(filename: str) -> str:
    """Read a file's contents"""
    try:
        with open(filename, 'r') as f:
            return f.read()
    except FileNotFoundError:
        print(f"Error: file {filename} does not exist")
        return ""

class DataProcessor:
    """Data-processing class"""
    def __init__(self, data: List[str]):
        self.data = data

    def process(self) -> List[str]:
        """Process the data"""
        result = []
        for item in self.data:
            # TODO: implement the real processing logic
            processed = item.strip().upper()
            result.append(processed)
        return result

if __name__ == "__main__":
    # FIXME: add command-line argument parsing
    numbers = [1, 2, 3, 4, 5]
    print(f"Sum: {calculate_sum(numbers)}")
EOF
echo "1. 查找函数定义:"
grep -E '^def ' sample_code.py
echo -e "\n2. 查找类定义:"
grep -E '^class ' sample_code.py
echo -e "\n3. 查找TODO和FIXME注释:"
grep -E '(TODO|FIXME)' sample_code.py
echo -e "\n4. 查找导入语句:"
grep -E '^import|^from' sample_code.py
# 案例5: 系统管理
echo -e "\n案例5: 系统管理"
echo "1. 查找进程:"
ps aux | grep 'bash' | head -5
echo -e "\n2. 检查服务状态:"
systemctl list-units | grep 'running' | head -5
echo -e "\n3. 查找大文件:"
find /tmp -type f -size +1M 2>/dev/null | head -5
# 案例6: 网络分析
echo -e "\n案例6: 网络分析"
echo "1. 分析网络连接:"
netstat -tulpn 2>/dev/null | grep 'LISTEN' | head -5
# 清理
rm -f sample_code.py
echo -e "\n=== grep 实战案例演示完成 ==="
3. sed - The Stream-Editing Master
3.1 sed Basics
Create the sed basics tutorial: sed_basics.sh
```bash
#!/bin/bash
# sed basics tutorial
echo "=== sed basics ==="
# Create a test file
cat > sed_test.txt << 'EOF'
Hello World
This is a test file.
Welcome to Linux sed tutorial.
Python programming is fun.
Java is also popular.
We are learning text processing.
EOF
# 1. Basic substitution
echo -e "\n1. Basic substitution:"
echo "Replace the first 'is' on each line with 'IS':"
sed 's/is/IS/' sed_test.txt
# 2. Global substitution
echo -e "\n2. Global substitution:"
echo "Replace every 'is' with 'IS':"
sed 's/is/IS/g' sed_test.txt
# 3. Substitution on one line
echo -e "\n3. Substitution on a specific line:"
echo "Replace 'is' -> 'IS' on line 2 only:"
sed '2s/is/IS/g' sed_test.txt
# 4. Substitution in a line range
echo -e "\n4. Substitution in a line range:"
echo "Replace 'is' -> 'IS' on lines 2-4:"
sed '2,4s/is/IS/g' sed_test.txt
# 5. Deleting lines
echo -e "\n5. Deleting lines:"
echo "Delete lines containing 'test':"
sed '/test/d' sed_test.txt
# 6. Printing specific lines
echo -e "\n6. Printing specific lines:"
echo "Print line 3:"
sed -n '3p' sed_test.txt
# 7. Multiple commands
echo -e "\n7. Multiple commands:"
echo "Substitute and delete in one invocation:"
sed -e 's/is/IS/g' -e '/Java/d' sed_test.txt
# 8. In-place editing
echo -e "\n8. In-place editing (with a backup):"
cp sed_test.txt sed_test_backup.txt
sed -i.bak 's/Linux/UNIX/g' sed_test_backup.txt
echo "Original file (line 3):"
sed -n '3p' sed_test.txt
echo "After editing (line 3):"
sed -n '3p' sed_test_backup.txt
# 9. Inserting lines
echo -e "\n9. Inserting lines:"
echo "Insert a new line before line 2:"
sed '2i\--- inserted line ---' sed_test.txt
# 10. Appending lines
echo -e "\n10. Appending lines:"
echo "Append a new line after line 2:"
sed '2a\--- appended line ---' sed_test.txt
# 11. Changing lines
echo -e "\n11. Changing lines:"
echo "Replace line 3 wholesale:"
sed '3c\--- changed line content ---' sed_test.txt
# 12. Writing to a file
echo -e "\n12. Writing matches to a file:"
echo "Write lines containing 'Python' to a new file:"
sed -n '/Python/w python_lines.txt' sed_test.txt
cat python_lines.txt
# 13. Reading from a file
echo -e "\n13. Reading from a file:"
echo "Insert another file's contents after line 3:"
sed '3r sed_test_backup.txt' sed_test.txt | head -10
# 14. Transliterating characters
echo -e "\n14. Transliterating characters:"
echo "Convert to uppercase with y///:"
echo "hello world" | sed 'y/abcdefghijklmnopqrstuvwxyz/ABCDEFGHIJKLMNOPQRSTUVWXYZ/'
# Clean up
rm -f sed_test.txt sed_test_backup.txt sed_test_backup.txt.bak python_lines.txt
echo -e "\n=== sed basics demo complete ==="
```
3.2 Advanced sed
Create the advanced sed tutorial: sed_advanced.sh
```bash
#!/bin/bash
# Advanced sed tutorial
echo "=== Advanced sed usage ==="
# Create a more complex test file
cat > advanced_sed_test.txt << 'EOF'
Name: John Doe
Age: 30
Email: john@example.com
Phone: 123-456-7890
Address: 123 Main St, City, State 12345
Name: Jane Smith
Age: 25
Email: jane.smith@company.com
Phone: (555) 987-6543
Address: 456 Oak Ave, Another City, State 67890
Name: Bob Johnson
Age: 35
Email: bob_j@test.org
Phone: 111.222.3333
Address: 789 Pine Rd, Different City, State 54321
EOF
# 1. Pattern space and hold space
echo -e "\n1. Pattern space and hold space:"
echo "Copy line 1 into the hold space, swap it back in on line 2:"
sed -n '1h; 2x; p' advanced_sed_test.txt
# 2. Branching
echo -e "\n2. Branching:"
echo "Print every line except those containing 'Email':"
sed -n '/Email/!p' advanced_sed_test.txt
# 3. Multi-line processing
echo -e "\n3. Multi-line processing:"
echo "Join all lines into one:"
sed ':a; N; $!ba; s/\n/ /g' advanced_sed_test.txt
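# How the join idiom works:
#   :a        define a label named "a"
#   N         append the next input line to the pattern space (separated by \n)
#   $!ba      if this is not the last line, branch back to "a" and keep slurping
#   s/\n/ /g  with the whole file in the pattern space, turn newlines into spaces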
# 4. Advanced substitution
echo -e "\n4. Advanced substitution:"
echo "Groups and backreferences:"
echo "123-456-7890" | sed -E 's/([0-9]{3})-([0-9]{3})-([0-9]{4})/(\1) \2-\3/'
echo -e "\nConditional substitution (only on matching lines):"
sed '/Phone/ s/[0-9]/X/g' advanced_sed_test.txt
# 5. Conditions and loops
echo -e "\n5. Conditions and loops:"
echo "Number the non-empty lines:"
sed '/./=' advanced_sed_test.txt | sed '/./N; s/\n/ /'
# 6. File-processing tricks
echo -e "\n6. File-processing tricks:"
echo "Delete blank lines:"
sed '/^$/d' advanced_sed_test.txt
echo -e "\nStrip trailing whitespace:"
sed 's/[[:space:]]*$//' advanced_sed_test.txt
# 7. Complex pattern matching
echo -e "\n7. Complex pattern matching:"
echo "Extract email addresses:"
sed -nE 's/.*Email: ([^ ]+).*/\1/p' advanced_sed_test.txt
echo -e "\nExtract phone numbers:"
sed -nE 's/.*Phone: ([0-9().-]+).*/\1/p' advanced_sed_test.txt
# 8. Ranges
echo -e "\n8. Ranges:"
echo "Process a specific record (lines 1-6):"
sed -n '1,6p' advanced_sed_test.txt
echo -e "\nFrom a pattern to the end of the file:"
sed -n '/Jane Smith/,$p' advanced_sed_test.txt
# 9. Labels and branches
echo -e "\n9. Labels and branches:"
echo "Loop with a label (substitute until no 'a' remains):"
echo "aaa bbb ccc" | sed -e ':loop' -e 's/a/X/' -e 't loop'
# 10. Hold-space operations
echo -e "\n10. Hold-space operations:"
echo "Copy the pattern space to the hold space and restore it:"
sed -n '1h; 1p; 2g; 2p' advanced_sed_test.txt
# 11. Scripted flow control
echo -e "\n11. Scripted flow control:"
# Create a more elaborate sed script (comments kept on their own lines)
cat > complex_script.sed << 'EOF'
# Pair each Name line with the following Email line
/Name:/ {
    # stash the name in the hold space, suppress output
    h
    d
}
/Email:/ {
    # append the held name, join the two lines, print
    G
    s/\n/ - /
    p
}
EOF
echo "Run the sed script file:"
sed -n -f complex_script.sed advanced_sed_test.txt
# 12. Practical applications
echo -e "\n12. Practical applications:"
echo "Normalize phone-number formatting:"
sed -E 's/Phone: ([0-9]{3})[-.]([0-9]{3})[-.]([0-9]{4})/Phone: (\1) \2-\3/g' advanced_sed_test.txt
echo -e "\nExtract name and email pairs:"
sed -nE '/Name:/{s/Name: //;h}; /Email:/{s/Email: //;G;s/\n/ : /p}' advanced_sed_test.txt
# Clean up
rm -f advanced_sed_test.txt complex_script.sed
echo -e "\n=== Advanced sed demo complete ==="
```
3.3 sed in Practice
Create the sed practice script: sed_practical.sh
```bash
#!/bin/bash
# sed practice cases
echo "=== sed in practice ==="
# Case 1: Log processing
echo -e "\nCase 1: log processing"
echo "1. 提取特定时间段的日志:"
sed -n '/10\/Oct\/2023:10:30:0[2-5]/p' access.log
echo -e "\n2. 删除调试信息:"
sed '/DEBUG/d' access.log
echo -e "\n3. 替换敏感信息:"
sed 's/192.168.1.[0-9]*/XXX.XXX.XXX.XXX/g' access.log
# 案例2: 配置文件修改
echo -e "\n案例2: 配置文件修改"
echo "1. 修改配置项:"
sed 's/app.port=8080/app.port=9090/' config.txt
echo -e "\n2. 注释掉特定配置:"
sed '/feature.cache/s/^/# /' config.txt
echo -e "\n3. 取消注释:"
sed '/# database.host/s/^# //' config.txt
# 案例3: 数据清洗
echo -e "\n案例3: 数据清洗"
echo "1. 标准化CSV格式:"
sed 's/, /,/g' sales.csv | sed 's/^[[:space:]]*//' | sed 's/[[:space:]]*$//'
echo -e "\n2. 删除空行:"
sed '/^$/d' sales.csv
echo -e "\n3. 转换日期格式:"
sed 's|\([0-9]\{4\}\)-\([0-9]\{2\}\)-\([0-9]\{2\}\)|\2/\3/\1|g' sales.csv
# 案例4: 代码重构
echo -e "\n案例4: 代码重构"
# 创建模拟代码文件
cat > refactor_code.py << 'EOF'
def old_function_name():
    print("This is old function")

def another_old_function():
    print("Another old function")

# Call the old functions
old_function_name()
another_old_function()
EOF
echo "1. 重命名函数:"
sed 's/old_function_name/new_function_name/g' refactor_code.py
echo -e "\n2. 添加日志:"
sed '/def /a\ print("Function called")' refactor_code.py
# 案例5: 文本格式化
echo -e "\n案例5: 文本格式化"
echo "1. 添加行号:"
sed = users.txt | sed 'N;s/\n/ /'
echo -e "\n2. 每N行添加分隔符:"
sed '3~3a\---' users.txt
echo -e "\n3. 文本对齐:"
sed 's/^/ /' users.txt | head -3
# 案例6: 批量文件处理
echo -e "\n案例6: 批量文件处理"
# 创建多个测试文件
for i in {1..3}; do
echo "File $i content" > "test_file_$i.txt"
echo "version=1.0" >> "test_file_$i.txt"
done
echo "批量修改文件内容:"
for file in test_file_*.txt; do
echo "处理文件: $file"
sed -i 's/version=1.0/version=2.0/' "$file"
cat "$file"
done
# 案例7: 数据提取和转换
echo -e "\n案例7: 数据提取和转换"
echo "1. 提取薪资大于60000的员工:"
sed -n '/,[0-9]\{5,\}$/p' users.txt | awk -F, '$6 > 60000'
echo -e "\n2. 生成SQL插入语句:"
sed '1d; s/\([^,]*\),\([^,]*\),\([^,]*\),\([^,]*\),\([^,]*\),\([^,]*\)/INSERT INTO users VALUES(\1, \"\2\", \3, \"\4\", \"\5\", \6);/' users.txt
# 案例8: 复杂文本转换
echo -e "\n案例8: 复杂文本转换"
echo "转换多语言文本格式:"
sed -E 's/([^!])!([^!])/\1\n\2/g' multilang.txt
# 清理
rm -f refactor_code.py test_file_*.txt
echo -e "\n=== sed 实战案例演示完成 ==="
4. awk - A Programming Language for Text Processing
4.1 awk Basics
Create the awk basics tutorial: awk_basics.sh
```bash
#!/bin/bash
# awk basics tutorial
echo "=== awk basics ==="
# 1. Basic printing
echo -e "\n1. Basic printing:"
echo "Print the whole file:"
awk '{print}' users.txt
echo -e "\nPrint the first field:"
awk '{print $1}' users.txt
echo -e "\nPrint several fields:"
awk '{print $1, $3}' users.txt
# 2. Field separators
echo -e "\n2. Field separators:"
echo "Use a comma separator:"
awk -F, '{print $2, $6}' users.txt
echo -e "\nUse several separators at once:"
echo "apple,banana;cherry" | awk -F'[,;]' '{print $1, $2, $3}'
# 3. Pattern matching
echo -e "\n3. Pattern matching:"
echo "Lines containing '北京' (Beijing):"
awk '/北京/' users.txt
echo -e "\nTest a specific field:"
awk -F, '$3 > 28' users.txt
# 4. Built-in variables
echo -e "\n4. Built-in variables:"
echo "Record number:"
awk '{print NR, $0}' users.txt | head -3
echo -e "\nNumber of fields:"
awk -F, '{print NF, $0}' users.txt | head -3
echo -e "\nFile name:"
awk 'END{print FILENAME}' users.txt
# 5. BEGIN and END blocks
echo -e "\n5. BEGIN and END blocks:"
echo "Add a header row:"
awk -F, 'BEGIN {print "ID\tName\tAge\tTitle"} {print $1"\t"$2"\t"$3"\t"$4} END {print "=== end ==="}' users.txt
# 6. Variables and arithmetic
echo -e "\n6. Variables and arithmetic:"
echo "Average age:"
awk -F, '{sum += $3; count++} END {print "Average age:", sum/count}' users.txt
echo -e "\nSalary statistics:"
awk -F, '{sum += $6; if($6 > max) max = $6} END {print "Total salary:", sum, "Highest salary:", max}' users.txt
# 7. Conditionals
echo -e "\n7. Conditionals:"
echo "Salary bands:"
awk -F, '{
    if ($6 > 80000)
        print $2, "high"
    else if ($6 > 50000)
        print $2, "middle"
    else
        print $2, "low"
}' users.txt
# 8. Loops
echo -e "\n8. Loops:"
echo "Iterate over the fields:"
awk -F, '{
    printf "Row %d: ", NR
    for(i=1; i<=NF; i++)
        printf "[%s] ", $i
    print ""
}' users.txt | head -3
# 9. Arrays
echo -e "\n9. Arrays:"
echo "Headcount by city:"
awk -F, '{
    city[$5]++
} END {
    for(c in city)
        print c, city[c]
}' users.txt
# 10. String functions
echo -e "\n10. String functions:"
echo "String operations (toupper only affects ASCII letters):"
awk -F, '{
    print "original:", $2, "upper:", toupper($2), "length:", length($2)
}' users.txt | head -3
# 11. Math functions
echo -e "\n11. Math functions:"
echo "Arithmetic:"
awk -F, '{
    print $2, "salary:", $6, "square root:", sqrt($6)
}' users.txt | head -3
# 12. Output formatting
echo -e "\n12. Output formatting:"
echo "Formatted output with printf:"
awk -F, 'BEGIN {printf "%-10s %-8s %-10s\n", "Name", "Age", "Salary"}
{printf "%-10s %-8d %-10.2f\n", $2, $3, $6}' users.txt
echo -e "\n=== awk basics demo complete ==="
```
4.2 Advanced awk
Create the advanced awk tutorial: awk_advanced.sh
```bash
#!/bin/bash
# Advanced awk tutorial (several features below require gawk, the GNU implementation)
echo "=== Advanced awk usage ==="
# Create complex test data
cat > advanced_awk_test.txt << 'EOF'
2023-10-01 08:30:25 INFO [UserService] User login successful: user_id=123
2023-10-01 08:45:12 ERROR [PaymentService] Payment failed: amount=500.00, reason=insufficient_funds
2023-10-01 09:15:33 WARN [AuthService] Multiple failed login attempts: ip=192.168.1.100
2023-10-01 10:20:45 INFO [OrderService] New order created: order_id=456, amount=299.99
2023-10-01 11:05:17 ERROR [DatabaseService] Connection timeout: retry_count=3
2023-10-01 14:30:22 INFO [InventoryService] Stock updated: product_id=789, quantity=50
EOF
# 1. Complex field splitting
echo -e "\n1. Complex field splitting:"
echo "Bracket characters as separators:"
awk -F'[][]' '{print "Service:", $2, "Message:", $3}' advanced_awk_test.txt
echo -e "\nRegular-expression separators:"
awk -F'[=,]' '{for(i=1;i<=NF;i++) if($i~/[a-z]_id/) print $i}' advanced_awk_test.txt
# 2. Associative arrays
echo -e "\n2. Associative arrays:"
echo "Count log levels (the level is already field 3):"
awk '{
    levels[$3]++
} END {
    for(l in levels)
        printf "%-6s: %d\n", l, levels[l]
}' advanced_awk_test.txt
# 3. Multidimensional arrays (true arrays-of-arrays are a gawk extension)
echo -e "\n3. Multidimensional arrays:"
echo "Count by service and level:"
awk '{
    split($0, parts, "[][]")
    service = parts[2]
    level = $3
    stats[service][level]++
} END {
    for(service in stats) {
        print "Service:", service
        for(level in stats[service]) {
            print "  ", level, ":", stats[service][level]
        }
    }
}' advanced_awk_test.txt
# 4. User-defined functions
echo -e "\n4. User-defined functions:"
echo "Extract numbers with a helper function:"
awk '
function extract_number(str) {
    match(str, /[0-9]+(\.[0-9]+)?/)
    return substr(str, RSTART, RLENGTH)
}
{
    for(i=1; i<=NF; i++) {
        if($i ~ /amount=/) {
            amount = extract_number($i)
            print "Amount:", amount
        }
    }
}' advanced_awk_test.txt
# 5. Range tests
echo -e "\n5. Range tests:"
echo "Entries in a time window (lexicographic string comparison):"
awk '$2 >= "09:00:00" && $2 <= "11:00:00"' advanced_awk_test.txt
# 6. Output redirection
echo -e "\n6. Output redirection:"
echo "Write each level to its own file:"
awk '
$3 ~ /INFO/  {print > "info.log"}
$3 ~ /ERROR/ {print > "error.log"}
$3 ~ /WARN/  {print > "warn.log"}
' advanced_awk_test.txt
echo "INFO entries:"
cat info.log
echo "ERROR entries:"
cat error.log
# 7. Piping output
echo -e "\n7. Piping output:"
echo "Sorted output:"
awk '{print $3, $5}' advanced_awk_test.txt | sort
# 8. Integrating system commands
echo -e "\n8. Integrating system commands:"
echo "Run a command from inside awk:"
awk '{
    "date +%Y-%m-%d" | getline current_date
    close("date +%Y-%m-%d")
    if($1 == current_date)
        print "Entry from today:", $0
}' advanced_awk_test.txt
# 9. Complex data processing
echo -e "\n9. Complex data processing:"
echo "Extract and aggregate numeric values (3-argument match() is a gawk extension):"
awk '{
    total = 0
    count = 0
    for(i=1; i<=NF; i++) {
        if($i ~ /amount=([0-9.]+)/) {
            match($i, /amount=([0-9.]+)/, arr)
            total += arr[1]
            count++
        }
    }
    if(count > 0) {
        print "Total amount:", total, "Average amount:", total/count
    }
}' advanced_awk_test.txt
# 10. Report generation
echo -e "\n10. Report generation:"
echo "Tabulated report:"
awk '
BEGIN {
    printf "%-20s %-10s %-10s\n", "Service", "INFO", "ERROR"
    printf "%-20s %-10s %-10s\n", "---", "---", "---"
}
{
    split($0, parts, "[][]")
    service = parts[2]
    if($3 ~ /INFO/) info[service]++
    if($3 ~ /ERROR/) error[service]++
}
END {
    for(service in info) {
        printf "%-20s %-10d %-10d\n", service, info[service], error[service]
    }
}' advanced_awk_test.txt
# 11. Data validation
echo -e "\n11. Data validation:"
echo "Validate IP address format:"
awk '
function is_valid_ip(ip) {
    return ip ~ /^[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+$/
}
{
    for(i=1; i<=NF; i++) {
        if($i ~ /ip=/) {
            split($i, ip_parts, "=")
            ip = ip_parts[2]
            if(is_valid_ip(ip)) {
                print "Valid IP:", ip
            } else {
                print "Invalid IP:", ip
            }
        }
    }
}' advanced_awk_test.txt
# Clean up
rm -f info.log error.log warn.log advanced_awk_test.txt
echo -e "\n=== Advanced awk demo complete ==="
```
4.3 awk in Practice
Create the awk practice script: awk_practical.sh
```bash
#!/bin/bash
# awk practice cases
echo "=== awk in practice ==="
# Case 1: Sales data analysis
echo -e "\nCase 1: sales data analysis"
echo "1. Totals by product:"
awk -F, '
NR > 1 {
    products[$2] += $5
    quantity[$2] += $6
}
END {
    printf "%-15s %-12s %-10s %-12s\n", "Product", "Sales", "Quantity", "AvgPrice"
    printf "%-15s %-12s %-10s %-12s\n", "---", "---", "---", "---"
    for(p in products) {
        avg = products[p] / quantity[p]
        printf "%-15s %-12.2f %-10d %-12.2f\n", p, products[p], quantity[p], avg
    }
}' sales.csv
echo -e "\n2. Totals by region:"
awk -F, '
NR > 1 {
    regions[$4] += $5
}
END {
    print "=== Sales by region ==="
    for(r in regions) {
        printf "%-10s: %.2f\n", r, regions[r]
    }
}' sales.csv
# Case 2: System monitoring report
echo -e "\nCase 2: system monitoring report"
# Create mock system statistics
cat > system_stats.txt << 'EOF'
CPU 25% MEM 45% DISK 78% NET 120KB/s
CPU 30% MEM 48% DISK 79% NET 150KB/s
CPU 28% MEM 46% DISK 78% NET 130KB/s
CPU 35% MEM 50% DISK 80% NET 200KB/s
CPU 40% MEM 52% DISK 81% NET 180KB/s
EOF
echo "系统资源分析:"
awk '{
cpu_sum += $2
mem_sum += $4
disk_sum += $6
net_sum += $8
count++
}
END {
print "=== 系统资源统计 ==="
printf "CPU平均使用率: %.1f%%\n", cpu_sum/count
printf "内存平均使用率: %.1f%%\n", mem_sum/count
printf "磁盘平均使用率: %.1f%%\n", disk_sum/count
printf "网络平均速度: %.1fKB/s\n", net_sum/count
}' system_stats.txt
# 案例3: 日志分析
echo -e "\n案例3: 日志分析"
echo "1. HTTP状态码统计:"
awk '{
match($0, /HTTP\/1\.1" ([0-9]{3})/, arr)
if(arr[1] != "") {
status_codes[arr[1]]++
}
}
END {
print "=== HTTP状态码统计 ==="
for(code in status_codes) {
printf "状态码 %s: %d次\n", code, status_codes[code]
}
}' access.log
echo -e "\n2. IP访问频率:"
awk '{
ip = $1
ips[ip]++
}
END {
print "=== IP访问频率 ==="
for(ip in ips) {
printf "%-15s: %d次\n", ip, ips[ip]
}
}' access.log | sort -k2 -nr
# 案例4: 配置解析
echo -e "\n案例4: 配置解析"
echo "解析配置文件:"
awk -F= '
/^[^#]/ && NF == 2 {
config[$1] = $2
}
END {
print "=== 配置信息 ==="
for(key in config) {
printf "%-20s: %s\n", key, config[key]
}
}' config.txt
# 案例5: 数据转换
echo -e "\n案例5: 数据转换"
echo "生成JSON格式:"
awk -F, '
BEGIN {
print "["
}
NR > 1 {
if(NR > 2) print ","
printf " {\n"
printf " \"id\": %s,\n", $1
printf " \"name\": \"%s\",\n", $2
printf " \"age\": %s,\n", $3
printf " \"position\": \"%s\",\n", $4
printf " \"city\": \"%s\",\n", $5
printf " \"salary\": %s\n", $6
printf " }"
}
END {
print "\n]"
}' users.txt
# 案例6: 文本处理
echo -e "\n案例6: 文本处理"
echo "多语言文本分析:"
awk '{
# 统计中文字符
chinese_chars = gsub(/[\\u4e00-\\u9fff]/, "&")
# 统计英文字符
english_words = gsub(/[a-zA-Z]+/, "&")
# 统计法文字符
french_chars = gsub(/[éèêëàâæçîïôœùûüÿ]/, "&")
printf "行 %d: 中文%d个 英文%d个 法文%d个\n", NR, chinese_chars, english_words, french_chars
}' multilang.txt
# 案例7: 复杂计算
echo -e "\n案例7: 复杂计算"
echo "薪资分析报告:"
awk -F, '
BEGIN {
print "=== 薪资分析报告 ==="
printf "%-10s %-8s %-10s %-12s\n", "姓名", "年龄", "薪资", "等级"
printf "%-10s %-8s %-10s %-12s\n", "---", "---", "---", "---"
}
NR > 0 {
salary = $6
age = $3
# 薪资等级
if(salary >= 100000) grade = "A"
else if(salary >= 70000) grade = "B"
else if(salary >= 50000) grade = "C"
else grade = "D"
# 统计
total_salary += salary
count++
if(salary > max_salary) max_salary = salary
if(salary < min_salary || min_salary == 0) min_salary = salary
# 年龄组统计
if(age < 25) age_group["<25"]++
else if(age < 30) age_group["25-29"]++
else if(age < 35) age_group["30-34"]++
else age_group[">=35"]++
printf "%-10s %-8d %-10d %-12s\n", $2, age, salary, grade
}
END {
print "\n=== 统计摘要 ==="
printf "员工总数: %d\n", count
printf "平均薪资: %.2f\n", total_salary/count
printf "最高薪资: %d\n", max_salary
printf "最低薪资: %d\n", min_salary
print "\n=== 年龄分布 ==="
for(group in age_group) {
printf "%s岁: %d人\n", group, age_group[group]
}
}' users.txt
# 清理
rm -f system_stats.txt
echo -e "\n=== awk 实战案例演示完成 ==="
5. Combining the Three Tools
5.1 Combined Workflows
Create the combined workflow script: text_processing_workflow.sh
```bash
#!/bin/bash
# Combined workflows for the text-processing trio
echo "=== Combining grep, sed and awk ==="
# Workflow 1: Log-analysis pipeline
echo -e "\nWorkflow 1: log-analysis pipeline"
echo "Find failed requests and rank them (3-argument match() requires gawk):"
grep -E '(404|500|403)' access.log | \
awk '{
    match($0, /([0-9]+\.[0-9]+\.[0-9]+\.[0-9]+).*"([A-Z]+) ([^"]+).* ([0-9]{3})/, arr)
    print arr[1], arr[2], arr[3], arr[4]
}' | \
sort | \
uniq -c | \
sort -rn | \
head -10
# Workflow 2: Data cleaning and aggregation
echo -e "\nWorkflow 2: data cleaning and aggregation"
echo "Clean the user data, then aggregate:"
sed 's/, /,/g' users.txt | \
awk -F, '
{
    gsub(/^[[:space:]]+|[[:space:]]+$/, "", $2)  # trim the name field
    gsub(/^[[:space:]]+|[[:space:]]+$/, "", $4)  # trim the title field
    print $1","$2","$3","$4","$5","$6
}' | \
awk -F, '{
    # Salary totals per city
    city_salary[$5] += $6
    city_count[$5]++
}
END {
    print "=== Average salary per city ==="
    for(city in city_salary) {
        avg = city_salary[city] / city_count[city]
        printf "%-8s: %.2f\n", city, avg
    }
}'
# Workflow 3: Configuration documentation
echo -e "\nWorkflow 3: configuration documentation"
echo "Generate a configuration document (arrays-of-arrays require gawk):"
grep -v '^#' config.txt | \
awk -F= '{
    if(NF == 2) {
        category = substr($1, 1, index($1, ".")-1)
        key = substr($1, index($1, ".")+1)
        config[category][key] = $2
    }
}
END {
    print "# Configuration document"
    print "Generated: '$(date)'"
    print ""
    for(category in config) {
        print "## " category
        print ""
        for(key in config[category]) {
            printf "%-20s: %s\n", key, config[category][key]
        }
        print ""
    }
}'
# Workflow 4: Complex text transformation
echo -e "\nWorkflow 4: complex text transformation"
echo "Multilingual analysis report (literal UTF-8 range; requires gawk in a UTF-8 locale):"
sed 's/! /!\n/g' multilang.txt | \
awk '{
    # Tokenize and classify words by script
    split($0, parts, " ")
    chinese_count = 0
    english_count = 0
    french_count = 0
    for(i in parts) {
        if(parts[i] ~ /[一-龥]/) chinese_count++
        else if(parts[i] ~ /^[A-Za-z]/) english_count++
        else if(parts[i] ~ /[éèêëàâæçîïôœùûüÿ]/) french_count++
    }
    print "Line", NR, "- Chinese:", chinese_count, "English:", english_count, "French:", french_count
}'
# Workflow 5: Monitoring pipeline
echo -e "\nWorkflow 5: system-monitoring pipeline"
echo "Mock monitoring analysis:"
# Generate monitoring data
for i in {1..10}; do
    echo "CPU $((20 + RANDOM % 30))% MEM $((40 + RANDOM % 20))% DISK $((70 + RANDOM % 15))%"
done > monitor.log
awk '{
    gsub(/%/, "", $2)
    gsub(/%/, "", $4)
    gsub(/%/, "", $6)
    cpu_sum += $2
    mem_sum += $4
    disk_sum += $6
    count++
    # Threshold alerts
    if($2 > 80) print "WARNING: high CPU usage - "$2"%"
    if($4 > 85) print "WARNING: high memory usage - "$4"%"
    if($6 > 90) print "WARNING: high disk usage - "$6"%"
}
END {
    print "=== Average usage ==="
    printf "CPU: %.1f%%\n", cpu_sum/count
    printf "Memory: %.1f%%\n", mem_sum/count
    printf "Disk: %.1f%%\n", disk_sum/count
}' monitor.log
# Workflow 6: Sales report
echo -e "\nWorkflow 6: sales data report"
awk -F, '
BEGIN {
    print "=== Sales analysis report ==="
    print "Generated: '$(date)'"
    print ""
}
NR == 1 {
    # Skip the header row
    next
}
{
    # Aggregate
    date_sales[$1] += $5
    product_sales[$2] += $5
    category_sales[$3] += $5
    region_sales[$4] += $5
    total_sales += $5
    total_quantity += $6
    record_count++
}
END {
    print "## By date"
    for(date in date_sales) {
        printf "%-12s: %8.2f\n", date, date_sales[date]
    }
    print ""
    print "## By product"
    for(product in product_sales) {
        printf "%-15s: %8.2f\n", product, product_sales[product]
    }
    print ""
    print "## By category"
    for(category in category_sales) {
        printf "%-15s: %8.2f\n", category, category_sales[category]
    }
    print ""
    print "## By region"
    for(region in region_sales) {
        printf "%-10s: %8.2f\n", region, region_sales[region]
    }
    print ""
    print "## Summary"
    printf "Total sales: %.2f\n", total_sales
    printf "Total quantity: %d\n", total_quantity
    printf "Average unit price: %.2f\n", total_sales/total_quantity
    printf "Records: %d\n", record_count
}' sales.csv
# Clean up
rm -f monitor.log
echo -e "\n=== Combined workflows demo complete ==="
```
5.2 Performance Optimization and Best Practices
Create the optimization guide: performance_tips.sh
```bash
#!/bin/bash
# Performance optimization and best practices
echo "=== Text-processing performance and best practices ==="
# Create a large test file (redirect once, outside the loop)
echo "Creating a large test file..."
for i in {1..10000}; do
    echo "Line $i: This is test data with some pattern $((i % 100)) and more content here."
done > large_test.txt
echo -e "\n1. grep performance:"
echo "Fixed-string search:"
time grep -F 'pattern 50' large_test.txt > /dev/null
echo -e "\nSimple regex:"
time grep 'pattern 50' large_test.txt > /dev/null
echo -e "\nExtended regex:"
time grep -E 'pattern (50|51)' large_test.txt > /dev/null
echo -e "\n2. sed performance:"
echo "Simple substitution:"
time sed 's/pattern/PATTERN/g' large_test.txt > /dev/null
echo -e "\nSubstitution with groups:"
time sed -E 's/pattern ([0-9]+)/PATTERN \1/g' large_test.txt > /dev/null
echo -e "\n3. awk performance:"
echo "Default field splitting:"
time awk '{print $3}' large_test.txt > /dev/null
echo -e "\nRegex field splitting:"
time awk -F'[: ]' '{print $3}' large_test.txt > /dev/null
echo -e "\n4. Pipeline tuning:"
echo "A long pipeline:"
time cat large_test.txt | grep 'pattern' | sed 's/pattern/PATTERN/' > /dev/null
echo -e "\nThe same work merged into one awk command:"
time awk '/pattern/ {gsub(/pattern/, "PATTERN"); print}' large_test.txt > /dev/null
echo -e "\n5. Memory-use comparison:"
echo "A shell read loop (slow; shown as the anti-pattern):"
time while IFS= read -r line; do
    echo "$line" | grep -q 'pattern 50' && echo "$line"
done < large_test.txt > /dev/null
echo -e "\nAn efficient tool combination:"
time grep 'pattern 50' large_test.txt | head -100 > /dev/null
# Best practices
echo -e "\n6. Best practices:"
echo "Error handling:"
grep 'nonexistent' large_test.txt || echo "no matches found"
echo -e "\nProgress output:"
awk 'NR % 1000 == 0 {print "progress:", NR}' large_test.txt
echo -e "\nResource cleanup:"
trap 'rm -f temp_file.txt' EXIT
echo -e "\n7. Debugging techniques:"
echo "Step-by-step pipeline inspection:"
grep 'pattern 50' large_test.txt | \
sed 's/pattern/PATTERN/' | \
awk '{print "processed:", $0}' | \
head -3
echo -e "\nTracing to stderr:"
awk '{
    if(NR % 1000 == 0) {
        print "lines processed:", NR > "/dev/stderr"
    }
    print $0
}' large_test.txt > /dev/null
# Clean up
rm -f large_test.txt
echo -e "\n=== Performance guide complete ==="
```
6. Capstone Projects
6.1 A Complete Log-Analysis System
Create the log-analysis system: log_analysis_system.sh
```bash
#!/bin/bash
# A complete log-analysis system
echo "=== Log-analysis system ==="
# Create richer log data (the last field is the response time in seconds)
cat > extended_access.log << 'EOF'
192.168.1.100 - john [10/Oct/2023:10:30:01 +0800] "GET /index.html HTTP/1.1" 200 1234 "https://example.com" "Mozilla/5.0" 0.123
192.168.1.101 - jane [10/Oct/2023:10:30:02 +0800] "POST /api/login HTTP/1.1" 401 567 "https://example.com" "Mozilla/5.0" 0.456
192.168.1.102 - bob [10/Oct/2023:10:30:03 +0800] "GET /products.html HTTP/1.1" 200 7890 "https://example.com" "Chrome/91.0" 0.234
192.168.1.100 - john [10/Oct/2023:10:30:04 +0800] "GET /images/logo.png HTTP/1.1" 304 0 "https://example.com" "Mozilla/5.0" 0.078
192.168.1.103 - alice [10/Oct/2023:10:30:05 +0800] "PUT /api/users/1 HTTP/1.1" 403 234 "https://example.com" "Firefox/89.0" 0.345
192.168.1.104 - charlie [10/Oct/2023:10:30:06 +0800] "GET /contact.html HTTP/1.1" 200 3456 "https://example.com" "Safari/14.0" 0.189
192.168.1.101 - jane [10/Oct/2023:10:30:07 +0800] "DELETE /api/products/5 HTTP/1.1" 204 0 "https://example.com" "Mozilla/5.0" 0.267
192.168.1.105 - david [10/Oct/2023:10:30:08 +0800] "GET /about.html HTTP/1.1" 500 123 "https://example.com" "Chrome/92.0" 0.412
192.168.1.106 - eve [10/Oct/2023:10:30:09 +0800] "GET /admin/dashboard HTTP/1.1" 200 4567 "https://example.com" "Mozilla/5.0" 0.156
192.168.1.107 - frank [10/Oct/2023:10:30:10 +0800] "POST /api/orders HTTP/1.1" 201 789 "https://example.com" "Chrome/93.0" 0.298
EOF
# Analysis functions
analyze_traffic() {
    echo "=== Traffic analysis ==="
    echo -e "\n1. Total requests:"
    wc -l < extended_access.log
    echo -e "\n2. Request-method distribution:"
    awk -F'"' '{print $2}' extended_access.log | awk '{methods[$1]++} END {for(m in methods) print m, methods[m]}' | sort -k2 -nr
    echo -e "\n3. HTTP status-code distribution:"
    awk '{print $9}' extended_access.log | sort | uniq -c | sort -rn
    echo -e "\n4. User-agent distribution:"
    awk -F'"' '{print $6}' extended_access.log | sort | uniq -c | sort -rn | head -5
}
analyze_performance() {
    echo -e "\n=== Performance analysis ==="
    echo -e "\n1. Average response time:"
    awk '{response_time = $(NF); sum += response_time; count++} END {printf "%.3f s\n", sum/count}' extended_access.log
    echo -e "\n2. Slowest requests:"
    awk '{print $(NF), $0}' extended_access.log | sort -rn | head -3 | awk '{$1=""; print}'
    echo -e "\n3. Average response time per endpoint:"
    awk -F'"' '{
        split($2, parts, " ")
        endpoint = parts[2]
        response_time = $(NF)
        total_time[endpoint] += response_time
        count[endpoint]++
    } END {
        for(ep in total_time) {
            avg = total_time[ep] / count[ep]
            printf "%-20s: %.3f s (%d requests)\n", ep, avg, count[ep]
        }
    }' extended_access.log | sort -k2 -nr
}
analyze_security() {
    echo -e "\n=== Security analysis ==="
    echo -e "\n1. Failed login attempts:"
    grep 'POST /api/login.*401' extended_access.log
    echo -e "\n2. Permission denials:"
    awk '/ 403 /' extended_access.log
    echo -e "\n3. Server errors:"
    awk '/ 500 /' extended_access.log
    echo -e "\n4. Admin-area access:"
    grep '/admin' extended_access.log
}
analyze_users() {
    echo -e "\n=== User-behavior analysis ==="
    echo -e "\n1. Active users:"
    awk '{print $3}' extended_access.log | grep -v '^-$' | sort | uniq -c | sort -rn
    echo -e "\n2. User session analysis (arrays-of-arrays require gawk):"
    awk '{
        ip = $1
        user = $3
        if(user != "-") {
            user_requests[user]++
            user_ips[user][ip]++
        }
    } END {
        for(user in user_requests) {
            ip_count = 0
            for(ip in user_ips[user]) ip_count++
            printf "User: %-10s requests: %-3d IPs: %d\n", user, user_requests[user], ip_count
        }
    }' extended_access.log | sort -k4 -nr
}
generate_report() {
    echo -e "\n=== Report generation ==="
    # Unquoted delimiter, so $(date) expands inside the heredoc
    cat > log_report.txt << EOF
# Website access-log analysis report
Generated: $(date)
## Executive summary
EOF
    # Summary figures
    {
        echo "Total requests: $(wc -l < extended_access.log)"
        echo "Successful (2xx): $(grep -c ' 2[0-9][0-9] ' extended_access.log)"
        echo "Client errors (4xx): $(grep -c ' 4[0-9][0-9] ' extended_access.log)"
        echo "Server errors (5xx): $(grep -c ' 5[0-9][0-9] ' extended_access.log)"
        echo "Average response time: $(awk '{sum += $(NF)} END {printf "%.3f s", sum/NR}' extended_access.log)"
    } >> log_report.txt
    # Detailed analysis
    echo -e "\n## Detailed analysis" >> log_report.txt
    echo -e "\n### Traffic" >> log_report.txt
    awk -F'"' '{print $2}' extended_access.log | awk '{methods[$1]++} END {for(m in methods) printf "%s: %d\n", m, methods[m]}' >> log_report.txt
    echo -e "\n### Performance" >> log_report.txt
    awk '{
        status = $9
        response_time = $(NF)
        status_time[status] += response_time
        status_count[status]++
    } END {
        for(s in status_count) {
            avg = status_time[s] / status_count[s]
            printf "Status %s: average %.3f s (%d requests)\n", s, avg, status_count[s]
        }
    }' extended_access.log >> log_report.txt
    echo "Report written to log_report.txt"
}
# Run the analyses
analyze_traffic
analyze_performance
analyze_security
analyze_users
generate_report
# Clean up
rm -f extended_access.log
echo -e "\n=== Log-analysis system demo complete ==="
```
6.2 Data Transformation and an ETL Pipeline
Create the ETL pipeline: etl_pipeline.sh
```bash
#!/bin/bash
# Data transformation and an ETL pipeline
echo "=== Data transformation and ETL pipeline ==="
# Create the raw data
cat > raw_data.csv << 'EOF'
id|name|age|salary|department|join_date
1|John Doe|30|50000|Engineering|2020-01-15
2|Jane Smith|25|45000|Marketing|2021-03-20
3|Bob Johnson|35|60000|Engineering|2019-11-10
4|Alice Brown|28|52000|Sales|2022-02-28
5|Charlie Wilson|42|75000|Management|2018-05-15
6|Diana Lee|26|48000|Engineering|2021-07-01
7|Edward Zhang|33|55000|Sales|2020-09-10
8|Fiona Chen|29|51000|Marketing|2022-01-05
EOF
# ETL pipeline stages
extract_data() {
    echo "=== Extract stage ==="
    # Validate the source file
    if [[ ! -f raw_data.csv ]]; then
        echo "Error: data file not found"
        exit 1
    fi
    # Basic integrity check
    local line_count=$(wc -l < raw_data.csv)
    if [[ $line_count -lt 2 ]]; then
        echo "Error: data file is empty or has only a header row"
        exit 1
    fi
    echo "Data file validated: $line_count lines"
    # Extract
    cp raw_data.csv extracted_data.csv
    echo "Extract stage complete"
}
transform_data() {
    echo -e "\n=== Transform stage ==="
    # Clean and transform. Comments must not follow a line-continuation
    # backslash, so they live above the pipeline: the first sed expression
    # converts the header separators, the second camel-cases the header
    # names (\U is a GNU sed extension).
    sed -e '1s/|/,/g' -e '1s/\([a-z]\)_\([a-z]\)/\1\U\2/g' extracted_data.csv | \
    awk -F'|' '
    BEGIN {
        OFS = ","
        print "ID,Name,Age,Salary,Department,JoinDate,Experience,SalaryGrade,Bonus"
    }
    NR > 1 {
        # Field cleanup
        gsub(/^[[:space:]]+|[[:space:]]+$/, "", $2)  # trim name
        gsub(/^[[:space:]]+|[[:space:]]+$/, "", $5)  # trim department
        # Years of experience (strftime is a gawk extension)
        split($6, date_parts, "-")
        join_year = date_parts[1]
        current_year = strftime("%Y")
        experience = current_year - join_year
        # Salary grade
        salary_grade = "C"
        if ($4 >= 70000) salary_grade = "A"
        else if ($4 >= 55000) salary_grade = "B"
        # Bonus: 10-20% of salary, depending on tenure
        bonus_rate = 0.1
        if (experience >= 3) bonus_rate = 0.15
        if (experience >= 5) bonus_rate = 0.2
        bonus = $4 * bonus_rate
        print $1, $2, $3, $4, $5, $6, experience, salary_grade, int(bonus)
    }' > transformed_data.csv
    echo "Transform stage complete"
    # Data-quality checks
    echo -e "\nData-quality checks:"
    echo "Records: $(($(wc -l < transformed_data.csv) - 1))"
    echo "Null check:"
    awk -F, '
    NR > 1 {
        for(i=1; i<=NF; i++) {
            if($i == "" || $i == "NULL") {
                print "row " NR ", column " i " is empty"
            }
        }
    }' transformed_data.csv
    # Statistics
    echo -e "\nStatistics:"
    awk -F, '
    NR > 1 {
        total_salary += $4
        total_bonus += $9
        count++
        dept[$5]++
    }
    END {
        printf "Average salary: %.2f\n", total_salary/count
        printf "Average bonus: %.2f\n", total_bonus/count
        print "Department distribution:"
        for(d in dept) {
            printf "  %-15s: %d\n", d, dept[d]
        }
    }' transformed_data.csv
}
load_data() {
    echo -e "\n=== Load stage ==="
    # Target table definition (mock)
    cat > database_schema.sql << 'EOF'
-- Employee table schema
CREATE TABLE employees (
    id INT PRIMARY KEY,
    name VARCHAR(100),
    age INT,
    salary DECIMAL(10,2),
    department VARCHAR(50),
    join_date DATE,
    experience INT,
    salary_grade CHAR(1),
    bonus DECIMAL(10,2)
);
EOF
    # Generate SQL INSERT statements
    awk -F, '
    NR > 1 {
        printf "INSERT INTO employees VALUES(%d, '\''%s'\'', %d, %.2f, '\''%s'\'', '\''%s'\'', %d, '\''%s'\'', %.2f);\n",
               $1, $2, $3, $4, $5, $6, $7, $8, $9
    }' transformed_data.csv > load_data.sql
    echo "SQL load script generated:"
    head -3 load_data.sql
    echo "..."
    # Generate the analysis report
    echo -e "\nGenerating the analysis report..."
    awk -F, '
    BEGIN {
        print "# Employee data analysis report"
        print "Generated: '$(date)'"
        print ""
    }
    NR == 1 { next }  # skip the header row
    {
        total_salary += $4
        total_bonus += $9
        count++
        # Per-department aggregates
        dept_salary[$5] += $4
        dept_count[$5]++
        dept_bonus[$5] += $9
        # Age buckets
        if($3 < 25) age_group["<25"]++
        else if($3 < 30) age_group["25-29"]++
        else if($3 < 35) age_group["30-34"]++
        else if($3 < 40) age_group["35-39"]++
        else age_group[">=40"]++
        # Salary-grade counts
        grade_count[$8]++
    }
    END {
        print "## Executive summary"
        printf "Employees: %d\n", count
        printf "Total salary: %.2f\n", total_salary
        printf "Total bonus: %.2f\n", total_bonus
        printf "Average salary: %.2f\n", total_salary/count
        printf "Average bonus: %.2f\n", total_bonus/count
        print ""
        print "## Department analysis"
        for(dept in dept_salary) {
            avg_salary = dept_salary[dept] / dept_count[dept]
            avg_bonus = dept_bonus[dept] / dept_count[dept]
            printf "### %s\n", dept
            printf "Headcount: %d\n", dept_count[dept]
            printf "Average salary: %.2f\n", avg_salary
            printf "Average bonus: %.2f\n", avg_bonus
            print ""
        }
        print "## Age distribution"
        for(group in age_group) {
            printf "%s: %d (%.1f%%)\n", group, age_group[group], (age_group[group]/count)*100
        }
        print ""
        print "## Salary-grade distribution"
        for(grade in grade_count) {
            printf "Grade %s: %d\n", grade, grade_count[grade]
        }
    }' transformed_data.csv > analysis_report.md
    echo "Analysis report written to analysis_report.md"
}
# Run the ETL pipeline
extract_data
transform_data
load_data
# Clean up intermediate files
rm -f raw_data.csv extracted_data.csv transformed_data.csv database_schema.sql load_data.sql
echo -e "\n=== ETL pipeline demo complete ==="
```
7. Summary and Further Learning
7.1 Tool Comparison and Selection Guide
```mermaid
graph TB
A[Text-processing task] --> B{Task type}
B -->|Quick search| C[grep]
B -->|Simple substitution| D[sed]
B -->|Complex processing| E[awk]
C --> F[Pattern matching<br/>file filtering]
D --> G[Stream editing<br/>bulk substitution]
E --> H[Data extraction<br/>report generation]
F --> I[Typical uses:<br/>log search, file filtering]
G --> J[Typical uses:<br/>config edits, data cleaning]
H --> K[Typical uses:<br/>data analysis, reporting]
style A fill:#1e3a5f,color:#ffffff
style B fill:#4a1e5f,color:#ffffff
style C fill:#1e5f3a,color:#ffffff
style D fill:#1e5f3a,color:#ffffff
style E fill:#1e5f3a,color:#ffffff
style F fill:#5f3a1e,color:#ffffff
style G fill:#5f3a1e,color:#ffffff
style H fill:#5f3a1e,color:#ffffff
style I fill:#1e3a5f,color:#ffffff
style J fill:#1e3a5f,color:#ffffff
style K fill:#1e3a5f,color:#ffffff
```
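As a rule of thumb: if you only need to find lines, reach for grep; if you need to change them in a stream, sed; if you need fields, arithmetic, or state, awk. The same task done three ways makes the trade-off concrete (a sketch against the access.log sample from section 1.2):
```bash
# Which IPs produced server errors (status 500)?
grep ' 500 ' access.log | cut -d' ' -f1    # grep: find the lines, post-process elsewhere
sed -n '/ 500 /s/ .*//p' access.log        # sed: find and rewrite in one pass
awk '$9 == 500 {print $1}' access.log      # awk: field-aware numeric test, no regex needed
```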
7.2 A Learning Checklist
Create the learning checklist: learning_checklist.sh
```bash
#!/bin/bash
# Learning checklist
echo "=== Text-processing trio learning checklist ==="
cat > learning_checklist.md << 'EOF'
# Text-Processing Trio Learning Checklist
## grep
- [ ] Basic search and options (-i, -n, -v, -c)
- [ ] Regular-expression basics (., *, +, ?, [])
- [ ] Character classes and POSIX classes
- [ ] Anchors and word boundaries
- [ ] Groups and backreferences
- [ ] Extended regular expressions
- [ ] Recursive search and file filtering
- [ ] Performance tuning
## sed
- [ ] The substitute command (s///)
- [ ] Addresses and ranges
- [ ] Delete, insert, append commands
- [ ] Pattern space and hold space
- [ ] Flow control (branches, labels)
- [ ] Multiple commands and script files
- [ ] Advanced substitution tricks
- [ ] In-place editing and backups
## awk
- [ ] Basic printing and field handling
- [ ] Pattern matching and conditionals
- [ ] BEGIN and END blocks
- [ ] Built-in variables (NR, NF, FS, OFS)
- [ ] Arrays and associative arrays
- [ ] String and math functions
- [ ] Control structures (if, for, while)
- [ ] User-defined functions
- [ ] Input/output redirection
## Combined usage
- [ ] Connecting tools with pipes
- [ ] Multi-stage data-processing flows
- [ ] Performance tuning and debugging
- [ ] Error handling and edge cases
- [ ] Real project experience
## Suggested practice projects
1. A log-analysis system
2. A data-cleaning pipeline
3. A configuration-management tool
4. A report generator
5. Monitoring and alerting scripts
## Where to go next
- Perl text processing
- Python data processing (pandas)
- jq (JSON processing)
- xmlstarlet (XML processing)
- Database query optimization
EOF
echo "Checklist written to learning_checklist.md"
echo ""
echo "Suggested next steps:"
echo "1. Work through every item on the checklist"
echo "2. Apply these tools in your day-to-day work"
echo "3. Read the official documentation and man pages"
echo "4. Contribute to open-source projects"
echo "5. Learn related tools such as jq and xmllint"
```
7.3 A Library of Utility Scripts
Create the utility script library: utility_scripts.sh
```bash
#!/bin/bash
# Utility script library
echo "=== Creating the text-processing utility scripts ==="
# 1. Log-analysis script
cat > analyze_logs.sh << 'EOF'
#!/bin/bash
# Log-analysis script
LOG_FILE=${1:-access.log}
echo "Analyzing log file: $LOG_FILE"
# Basic statistics
echo "=== Basic statistics ==="
echo "Total lines: $(wc -l < "$LOG_FILE")"
echo "Unique IPs: $(awk '{print $1}' "$LOG_FILE" | sort -u | wc -l)"
echo "Request methods:"
awk -F'"' '{print $2}' "$LOG_FILE" | awk '{print $1}' | sort | uniq -c | sort -rn
# Status codes
echo -e "\n=== Status codes ==="
awk '{print $9}' "$LOG_FILE" | sort | uniq -c | sort -rn
# Top pages
echo -e "\n=== Top pages ==="
awk -F'"' '{print $2}' "$LOG_FILE" | awk '{print $2}' | sort | uniq -c | sort -rn | head -10
# Errors
echo -e "\n=== Errors ==="
grep -E ' (4[0-9]{2}|5[0-9]{2}) ' "$LOG_FILE" | head -10
EOF
chmod +x analyze_logs.sh
# 2. Data-cleaning script
cat > clean_data.sh << 'EOF'
#!/bin/bash
# Data-cleaning script
INPUT_FILE=$1
OUTPUT_FILE=${2:-cleaned_data.csv}
if [[ -z "$INPUT_FILE" ]]; then
    echo "Usage: $0 <input file> [output file]"
    exit 1
fi
echo "Cleaning data: $INPUT_FILE -> $OUTPUT_FILE"
# Run the cleaning pipeline
sed '
# delete blank lines
/^$/d
# strip leading and trailing whitespace
s/^[[:space:]]*//
s/[[:space:]]*$//
# normalize separators to a comma
s/[,;|][[:space:]]*/,/g
' "$INPUT_FILE" | \
awk -F, '
BEGIN {OFS=","}
NR == 1 {
    # pass the header row through
    print $0
    next
}
{
    # validate and clean each field
    for(i=1; i<=NF; i++) {
        gsub(/^[[:space:]]+|[[:space:]]+$/, "", $i)
        if($i == "") $i = "NULL"
    }
    print $0
}' > "$OUTPUT_FILE"
echo "Cleaning complete: $OUTPUT_FILE"
echo "Input lines: $(wc -l < "$INPUT_FILE")"
echo "Output lines: $(wc -l < "$OUTPUT_FILE")"
EOF
chmod +x clean_data.sh
# 3. Configuration-management script
cat > manage_config.sh << 'EOF'
#!/bin/bash
# Configuration-management script
CONFIG_FILE=${1:-config.txt}
ACTION=$2
KEY=$3
VALUE=$4
usage() {
    echo "Usage: $0 <config file> <action> [key] [value]"
    echo "Actions:"
    echo "  list               - list all settings"
    echo "  get <key>          - read a setting"
    echo "  set <key> <value>  - write a setting"
    echo "  delete <key>       - remove a setting"
}
list_config() {
    echo "=== Settings ==="
    grep -v '^#' "$CONFIG_FILE" | grep '=' | while IFS= read -r line; do
        key=$(echo "$line" | cut -d'=' -f1)
        value=$(echo "$line" | cut -d'=' -f2-)
        printf "%-25s: %s\n" "$key" "$value"
    done
}
get_config() {
    local key=$1
    grep "^$key=" "$CONFIG_FILE" | cut -d'=' -f2-
}
set_config() {
    local key=$1
    local value=$2
    if grep -q "^$key=" "$CONFIG_FILE"; then
        # Update an existing setting
        # (note: values containing '/' or '&' would need escaping for sed)
        sed -i "s/^$key=.*/$key=$value/" "$CONFIG_FILE"
        echo "Updated: $key=$value"
    else
        # Append a new setting
        echo "$key=$value" >> "$CONFIG_FILE"
        echo "Added: $key=$value"
    fi
}
delete_config() {
    local key=$1
    sed -i "/^$key=/d" "$CONFIG_FILE"
    echo "Deleted: $key"
}
case "$ACTION" in
    "list")
        list_config
        ;;
    "get")
        if [[ -z "$KEY" ]]; then
            echo "Error: a key is required"
            usage
            exit 1
        fi
        get_config "$KEY"
        ;;
    "set")
        if [[ -z "$KEY" || -z "$VALUE" ]]; then
            echo "Error: a key and a value are required"
            usage
            exit 1
        fi
        set_config "$KEY" "$VALUE"
        ;;
    "delete")
        if [[ -z "$KEY" ]]; then
            echo "Error: a key is required"
            usage
            exit 1
        fi
        delete_config "$KEY"
        ;;
    *)
        usage
        exit 1
        ;;
esac
EOF
chmod +x manage_config.sh
# 4. System-monitoring script
cat > system_monitor.sh << 'EOF'
#!/bin/bash
# System-monitoring script
LOG_FILE="/var/log/system_monitor.log"
ALERT_THRESHOLD=80
# Logging helper
log_message() {
    echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1" >> "$LOG_FILE"
}
# CPU usage (parsing of top output varies across distributions)
check_cpu() {
    local cpu_usage=$(top -bn1 | grep "Cpu(s)" | awk '{print $2}' | cut -d'%' -f1)
    echo "CPU usage: ${cpu_usage}%"
    if (( $(echo "$cpu_usage > $ALERT_THRESHOLD" | bc -l) )); then
        log_message "WARNING: high CPU usage - ${cpu_usage}%"
        return 1
    fi
    return 0
}
# Memory usage
check_memory() {
    local mem_info=$(free | grep Mem)
    local total_mem=$(echo "$mem_info" | awk '{print $2}')
    local used_mem=$(echo "$mem_info" | awk '{print $3}')
    local mem_usage=$(echo "scale=2; $used_mem * 100 / $total_mem" | bc)
    echo "Memory usage: ${mem_usage}%"
    if (( $(echo "$mem_usage > $ALERT_THRESHOLD" | bc -l) )); then
        log_message "WARNING: high memory usage - ${mem_usage}%"
        return 1
    fi
    return 0
}
# Disk usage
check_disk() {
    local disk_usage=$(df / | awk 'NR==2{print $5}' | cut -d'%' -f1)
    echo "Disk usage: ${disk_usage}%"
    if [ "$disk_usage" -gt "$ALERT_THRESHOLD" ]; then
        log_message "WARNING: high disk usage - ${disk_usage}%"
        return 1
    fi
    return 0
}
# Load average
check_load() {
    local load_avg=$(uptime | awk -F'load average:' '{print $2}' | awk -F, '{print $1}' | tr -d ' ')
    local cpu_cores=$(nproc)
    echo "Load average: $load_avg (CPU cores: $cpu_cores)"
    if (( $(echo "$load_avg > $cpu_cores" | bc -l) )); then
        log_message "WARNING: high load average - $load_avg"
        return 1
    fi
    return 0
}
# Report
generate_report() {
    echo "=== System monitoring report ==="
    echo "Generated: $(date)"
    echo
    check_cpu
    check_memory
    check_disk
    check_load
    echo
    echo "Recent alerts:"
    tail -5 "$LOG_FILE" 2>/dev/null || echo "no alerts recorded"
}
# Entry point
main() {
    case "${1:-report}" in
        "report")
            generate_report
            ;;
        "cpu")
            check_cpu
            ;;
        "memory")
            check_memory
            ;;
        "disk")
            check_disk
            ;;
        "load")
            check_load
            ;;
        "log")
            tail -20 "$LOG_FILE" 2>/dev/null || echo "log file not found"
            ;;
        *)
            echo "Usage: $0 {report|cpu|memory|disk|load|log}"
            exit 1
            ;;
    esac
}
main "$@"
EOF
chmod +x system_monitor.sh
echo "实用脚本库创建完成:"
echo " analyze_logs.sh - 日志分析脚本"
echo " clean_data.sh - 数据清洗脚本"
echo " manage_config.sh - 配置管理脚本"
echo " system_monitor.sh - 系统监控脚本"
echo -e "\n=== 文本处理三剑客教程完成 ==="
echo "现在您已经掌握了 grep, sed, awk 的核心用法和实战技巧!"
echo "建议在实际工作中不断练习和应用这些工具。"
Through this tutorial you have worked through the full toolbox of the Linux text-processing trio, from basic usage to advanced techniques and from single tools to combined pipelines. You should now be able to:
- use grep for fast text search and filtering
- use sed for stream editing and batch changes
- use awk for complex data processing and report generation
- combine the three tools to solve larger problems
- apply these skills in real projects
Keep practicing and exploring; these tools will become an indispensable part of your daily work.