目录
- [1. def analyze_statistics(self) -> Dict:](#1-def-analyze_statisticsself---dict)
- [2. stats['keywords_frequency'][keyword] += 1](#2-statskeywords_frequencykeyword--1)
- [3. def display_statistics(stats: Dict):](#3-def-display_statisticsstats-dict)
1.代码
import re
import os
import datetime
import argparse
from collections import Counter, defaultdict
from typing import List, Dict, Tuple, Optional
import sys
class LogAnalyzer:
    """Parse, filter and summarize application log files."""

    # Severity names recognized by log-level handling.
    LOG_LEVELS = ['TRACE', 'DEBUG', 'INFO', 'WARN', 'WARNING', 'ERROR', 'FATAL', 'CRITICAL']

    def __init__(self):
        # Parsed log entries, one dict per record.
        self.logs = []
        # Statistics produced by the most recent analyze_statistics() call.
        self.stats = {}
        # Matches ANSI terminal escape sequences so colored output can be stripped.
        self.ansi_escape = re.compile(r'\x1B(?:[@-Z\\-_]|\[[0-?]*[ -/]*[@-~])')
        # One log record per line, shaped like:
        #   "<timestamp> [<thread>] [<request id>]? <LEVEL> <class>? <message>"
        self.clean_log_pattern = re.compile(
            r'^(?P<timestamp>\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})\s+'  # timestamp
            r'\[(?P<thread>[^\]]+)\]\s+'                               # thread name
            r'(?:\[(?P<request_id>[^\]]*)\]\s+)?'                      # optional request id
            r'(?P<level>\w+)\s+'                                       # log level
            r'(?P<class_name>\S+)?\s*'                                 # optional class name
            r'(?P<message>.*)$'                                        # log message
        )

    def remove_ansi_escape_sequences(self, text):
        """Return *text* with every ANSI escape sequence removed."""
        return self.ansi_escape.sub('', text)
def load_logs(self, file_path: str) -> bool:
"""
加载日志文件
"""
try:
if not os.path.exists(file_path):
print(f"错误: 文件 {file_path} 不存在")
return False
with open(file_path, 'r', encoding='utf-8') as f:
lines = f.readlines()
self.logs = []
current_log=None
for line_num, line in enumerate(lines, 1):
line = line.strip()
if line: # 跳过空行
log_entry = self._parse_log_line(line, line_num)
if current_log:
self.logs.append(current_log)
current_log=log_entry
elif current_log:
# 如果是续行,添加到当前日志的消息中
current_log['message'] += '\n' + self.remove_ansi_escape_sequences(line).strip()
current_log['raw'] += line
# 添加最后一个日志条目
if current_log:
self.logs.append(current_log)
print(f"成功加载 {len(self.logs)} 条日志记录")
return True
except Exception as e:
print(f"加载日志文件时出错: {e}")
return False
def _parse_log_line(self, line: str, line_num: int) -> Dict:
"""
解析单行日志
"""
# 先去除 ANSI 转义序列
clean_line = self.remove_ansi_escape_sequences(line).strip()
# 跳过空行
if not clean_line:
return None
# 使用正则表达式匹配
match = self.clean_log_pattern.match(clean_line)
if match:
return {
'timestamp': match.group('timestamp'),
'thread': match.group('thread'),
'request_id': match.group('request_id'),
'level': match.group('level'),
'class_name': match.group('class_name'),
'message': match.group('message').strip(),
'raw': line, # 保存原始行用于搜索
'line_number': line_num
}
return None
#
# # 基础日志格式: [时间] 级别 [线程] 消息
# log_entry = {
# 'raw': line,
# 'line_number': line_num,
# 'timestamp': None,
# 'level': None,
# 'thread': None,
# 'message': line
# }
#
# # 尝试匹配带时间戳的日志格式
# timestamp_patterns = [
# r'^(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}\.\d{3})', # 2023-10-01 12:00:00.000
# r'^(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})', # 2023-10-01 12:00:00
# r'^(\d{2}/\d{2}/\d{4} \d{2}:\d{2}:\d{2})', # 10/01/2023 12:00:00
# ]
#
# for pattern in timestamp_patterns:
# match = re.match(pattern, line)
# if match:
# log_entry['timestamp'] = match.group(1)
# remaining = line[len(match.group(0)):].strip()
# break
# else:
# remaining = line
#
# # 尝试提取日志级别
# level_pattern = r'^\s*\[?(\w+)\]?\s+'
# match = re.match(level_pattern, remaining)
# if match and match.group(1).upper() in self.LOG_LEVELS:
# log_entry['level'] = match.group(1).upper()
# remaining = remaining[len(match.group(0)):].strip()
#
# # 尝试提取线程信息
# thread_pattern = r'^\[([^\]]+)\]\s*'
# match = re.match(thread_pattern, remaining)
# if match:
# log_entry['thread'] = match.group(1)
# remaining = remaining[len(match.group(0)):].strip()
#
# log_entry['message'] = remaining
# return log_entry
def search_logs(self, keyword: str = None, level: str = None,
thread: str = None, case_sensitive: bool = False,logs: List[Dict] = None) -> List[Dict]:
"""
搜索日志
"""
if logs is None:
logs = self.logs.copy() # 如果没有传入logs,使用所有日志
results = logs.copy() # 基于传入的logs进行过滤
# 按级别过滤
if level:
level_upper = level.upper()
results = [log for log in results if log['level'] and log.get('level').upper() == level_upper]
# 按线程过滤
if thread:
results = [log for log in results if log['thread'] and thread in log['thread']]
# 按关键词过滤
if keyword:
if case_sensitive:
results = [log for log in results if keyword in log['raw']]
else:
keyword_lower = keyword.lower()
results = [log for log in results if keyword_lower in log['raw'].lower()]
return results
def time_range_search(self, start_time: str = None, end_time: str = None,logs: List[Dict] = None) -> List[Dict]:
"""
按时间范围搜索
"""
if not start_time and not end_time:
return self.logs
if logs is None:
logs = self.logs.copy() # 如果没有传入logs,使用所有日志
logs = logs.copy() # 基于传入的logs进行过滤
results = []
for log in logs:
if not log['timestamp']:
continue
log_time = log['timestamp']
match = True
if start_time and log_time < start_time:
match = False
if end_time and log_time > end_time:
match = False
if match:
results.append(log)
return results
def analyze_statistics(self,logs: List[Dict] = None) -> Dict:
"""
生成统计信息
"""
if logs is None:
logs = self.logs.copy() # 如果没有传入logs,使用所有日志
logs = logs.copy() # 基于传入的logs进行过滤
if not self.logs:
return {}
stats = {
'total_logs': len(logs),
'level_distribution': Counter(),
'thread_distribution': Counter(),
'time_range': {},
'keywords_frequency': defaultdict(int)
}
# 统计级别分布
levels = [log['level'] for log in logs if log['level']]
stats['level_distribution'] = Counter(levels)
# 统计线程分布
threads = [log['thread'] for log in logs if log['thread']]
stats['thread_distribution'] = Counter(threads)
# 时间范围
timestamps = [log['timestamp'] for log in logs if log['timestamp']]
if timestamps:
stats['time_range']['start'] = min(timestamps)
stats['time_range']['end'] = max(timestamps)
# 常见错误关键词频率
error_keywords = ['error', 'exception', 'failed', 'timeout', 'null', 'undefined']
for log in logs:
message_lower = log['message'].lower()
for keyword in error_keywords:
if keyword in message_lower:
stats['keywords_frequency'][keyword] += 1
self.stats = stats
return stats
def export_results(self, results: List[Dict], output_file: str):
"""
导出结果到文件
"""
try:
with open(output_file, 'w', encoding='utf-8') as f:
for log in results:
f.write(f"{log['raw']}\n")
print(f"结果已导出到: {output_file}")
except Exception as e:
print(f"导出文件时出错: {e}")
def display_results(results: List[Dict], max_display: int = 50):
    """Pretty-print up to *max_display* matched log entries to stdout."""
    if not results:
        print("未找到匹配的日志记录")
        return
    print(f"\n找到 {len(results)} 条匹配记录:")
    print("-" * 100)
    for idx, entry in enumerate(results[:max_display], 1):
        # Fall back to placeholders for fields the parser left empty.
        level = f"[{entry['level']}]" if entry['level'] else "[UNKNOWN]"
        stamp = entry['timestamp'] if entry['timestamp'] else "No timestamp"
        thread = f"({entry['thread']})" if entry['thread'] else ""
        print(f"{idx:3d}. {stamp} {level:8} {thread:15} {entry['message']}")
    hidden = len(results) - max_display
    if hidden > 0:
        print(f"... 还有 {hidden} 条记录未显示")
def display_statistics(stats: Dict):
    """Print a formatted report for a dict produced by analyze_statistics().

    Expects the keys 'total_logs', 'level_distribution' (Counter),
    'thread_distribution' (Counter), 'time_range' and 'keywords_frequency'
    (plain mapping). Prints a placeholder when *stats* is empty.
    """
    if not stats:
        print("暂无统计信息")
        return
    print("\n" + "=" * 50)
    print("日志统计分析报告")
    print("=" * 50)
    total = stats['total_logs']
    print(f"总日志数: {total}")
    time_range = stats.get('time_range')
    if time_range:
        print(f"时间范围: {time_range.get('start', 'N/A')} 到 {time_range.get('end', 'N/A')}")
    print("\n日志级别分布:")
    for level, count in stats['level_distribution'].most_common():
        percentage = (count / total) * 100
        print(f" {level:8}: {count:4d} ({percentage:5.1f}%)")
    print("\n线程分布 (Top 10):")
    for thread, count in stats['thread_distribution'].most_common(10):
        print(f" {thread:20}: {count:4d}")
    print("\n错误关键词频率:")
    # keywords_frequency is a defaultdict(int) (see analyze_statistics), not
    # a Counter, so it has no most_common(); sort by frequency ourselves.
    # (The dead commented-out code and the redundant hasattr branch that
    # guarded against a Counter were removed — sorted() covers both cases.)
    keyword_items = sorted(stats['keywords_frequency'].items(),
                           key=lambda item: item[1], reverse=True)
    for keyword, count in keyword_items:
        print(f" {keyword:15}: {count:4d}")
def main():
    """Command-line entry point: parse arguments, filter logs, show results."""
    parser = argparse.ArgumentParser(description='日志查询分析工具')
    parser.add_argument('file', help='日志文件路径')
    parser.add_argument('-k', '--keyword', help='搜索关键词')
    parser.add_argument('-l', '--level', help='日志级别过滤')
    parser.add_argument('-t', '--thread', help='线程名过滤')
    parser.add_argument('--start-time', help='开始时间 (格式: YYYY-MM-DD HH:MM:SS)')
    parser.add_argument('--end-time', help='结束时间 (格式: YYYY-MM-DD HH:MM:SS)')
    parser.add_argument('-c', '--case-sensitive', action='store_true', help='大小写敏感')
    parser.add_argument('-s', '--statistics', action='store_true', help='显示统计信息')
    parser.add_argument('-o', '--output', help='导出结果到文件')
    parser.add_argument('--max-display', type=int, default=50, help='最大显示行数 (默认: 50)')
    args = parser.parse_args()

    analyzer = LogAnalyzer()
    if not analyzer.load_logs(args.file):
        sys.exit(1)

    # Statistics are computed over the whole file, before any filtering.
    stats = analyzer.analyze_statistics()

    # Start from every entry and narrow the set down one filter at a time.
    results = analyzer.logs
    if args.start_time or args.end_time:
        results = analyzer.time_range_search(args.start_time, args.end_time, logs=results)
    if args.keyword or args.level or args.thread:
        results = analyzer.search_logs(
            keyword=args.keyword,
            level=args.level,
            thread=args.thread,
            case_sensitive=args.case_sensitive,
            logs=results,
        )

    display_results(results, args.max_display)
    if args.statistics:
        display_statistics(stats)
    if args.output:
        analyzer.export_results(results, args.output)
def interactive_mode():
    """Interactive REPL supporting chained searches.

    Each filter is applied on top of the previous result set until the user
    reloads the file (which resets all filters) or exits.
    """
    print("=" * 60)
    print("日志查询分析小工具 - 交互模式")
    print("=" * 60)
    file_path = input("请输入日志文件路径: ").strip()
    if not file_path:
        print("未提供文件路径,退出程序")
        return
    analyzer = LogAnalyzer()
    if not analyzer.load_logs(file_path):
        return
    # Working result set; every filter below narrows it further.
    current_results = analyzer.logs.copy()
    current_filters = []  # human-readable descriptions of applied filters
    while True:
        print("\n" + "=" * 50)
        print(f"当前结果集: {len(current_results)} 条记录")
        if current_filters:
            print("当前过滤条件:", " -> ".join(current_filters))
        print("\n请选择操作:")
        print("1. 关键词搜索")
        print("2. 按级别过滤")
        print("3. 按线程过滤")
        print("4. 时间范围搜索")
        print("5. 显示统计信息")
        print("6. 导出结果")
        print("7. 重新加载文件")
        print("8. 退出")
        # Bug fix: the menu offers 8 choices but the prompt said "(1-9)".
        choice = input("请输入选择 (1-8): ").strip()
        if choice == '1':
            keyword = input("请输入搜索关键词: ").strip()
            if not keyword:
                print("关键词不能为空")
                continue
            case_sensitive = input("是否大小写敏感? (y/N): ").strip().lower() == 'y'
            # Apply the keyword filter on top of the current result set.
            results = analyzer.search_logs(
                keyword=keyword,
                case_sensitive=case_sensitive,
                logs=current_results
            )
            current_results = results
            current_filters.append(f"关键词: {keyword}")
            display_results(results)
        elif choice == '2':
            level = input("请输入日志级别 (DEBUG/INFO/WARN/ERROR等): ").strip().upper()
            if not level:
                print("级别不能为空")
                continue
            results = analyzer.search_logs(level=level, logs=current_results)
            current_results = results
            current_filters.append(f"级别: {level}")
            display_results(results)
        elif choice == '3':
            thread = input("请输入线程名: ").strip()
            if not thread:
                print("线程名不能为空")
                continue
            results = analyzer.search_logs(thread=thread, logs=current_results)
            current_results = results
            current_filters.append(f"线程: {thread}")
            display_results(results)
        elif choice == '4':
            start_time = input("开始时间 (格式: 2023-10-01 12:00:00): ").strip()
            end_time = input("结束时间 (格式: 2023-10-01 13:00:00): ").strip()
            results = analyzer.time_range_search(
                start_time or None,
                end_time or None,
                logs=current_results
            )
            current_results = results
            # Describe the applied time bounds for the filter summary.
            time_filter = "时间范围"
            if start_time:
                time_filter += f" 从 {start_time}"
            if end_time:
                time_filter += f" 到 {end_time}"
            current_filters.append(time_filter)
            display_results(results)
        elif choice == '5':
            # Statistics over the current (possibly filtered) result set.
            stats = analyzer.analyze_statistics(logs=current_results)
            display_statistics(stats)
        elif choice == '6':
            if not current_results:
                print("没有可导出的数据")
                continue
            output_file = input("请输入导出文件名: ").strip()
            if output_file:
                # Bug fix: export_results() already prints a success message,
                # so the duplicate print that followed here was removed.
                analyzer.export_results(current_results, output_file)
        elif choice == '7':
            if analyzer.load_logs(file_path):
                # Reloading resets the result set and all filters.
                current_results = analyzer.logs.copy()
                current_filters = []
                print("文件重新加载成功,过滤条件已重置")
        elif choice == '8':
            print("感谢使用,再见!")
            break
        else:
            print("无效选择,请重新输入")
if __name__ == "__main__":
if len(sys.argv) > 1:
# 命令行模式
main()
else:
# 交互式模式
interactive_mode()
2.代码详解
1. def analyze_statistics(self) -> Dict:
stats = {
'total_logs': len(self.logs),
'level_distribution': Counter(),
'thread_distribution': Counter(),
'time_range': {},
'keywords_frequency': defaultdict(int)
}
# 统计级别分布
levels = [log['level'] for log in self.logs if log['level']]
stats['level_distribution'] = Counter(levels)
levels = [log['level'] for log in self.logs if log['level']]
列表推导式提取级别
分解说明:
- `for log in self.logs`:遍历 `self.logs` 列表中的每个日志条目
- `log['level']`:从每个日志字典中获取 `level` 字段的值
- `if log['level']`:过滤条件,只保留非空、非 None、非 False 的级别值
- 整个表达式:创建一个包含所有非空日志级别的列表
stats['level_distribution'] = Counter(levels)
使用Counter统计频率
分解说明:
- `Counter(levels)`:创建 Counter 对象,自动统计列表中每个元素的出现次数
- `stats['level_distribution'] = ...`:将统计结果赋值给 stats 字典的对应字段
Counter的工作原理:
- 接收一个可迭代对象(如列表)
- 返回一个字典子类,键是列表中的元素,值是出现次数
- 自动处理不存在的键,返回0而不是抛出KeyError
例子:
levels=['INFO', 'ERROR', 'INFO', 'WARNING']
Counter(['INFO', 'ERROR', 'INFO', 'WARNING'])
返回Counter({'INFO': 2, 'ERROR': 1, 'WARNING': 1})
赋值给stats['level_distribution']=Counter({'INFO': 2, 'ERROR': 1, 'WARNING': 1})
counter对不存在键的处理
from collections import Counter
# 创建一个Counter对象
level_counter = Counter(['INFO', 'ERROR', 'INFO', 'WARNING'])
print("原始Counter:", level_counter)
# 访问存在的键
print("INFO count:", level_counter['INFO']) # 输出: 2
# 访问不存在的键
print("DEBUG count:", level_counter['DEBUG']) # 输出: 0 (不会报错!)
2.stats['keywords_frequency'][keyword] += 1
# 常见错误关键词频率
error_keywords = ['error', 'exception', 'failed', 'timeout', 'null', 'undefined']
for log in self.logs:
message_lower = log['message'].lower()
for keyword in error_keywords:
if keyword in message_lower:
stats['keywords_frequency'][keyword] += 1
stats['keywords_frequency'][keyword] += 1
分解执行步骤:
1. 访问外层字典:`stats['keywords_frequency']`
   - 获取 stats 字典中 `'keywords_frequency'` 键对应的值
   - 这个值是一个 `defaultdict(int)` 对象
2. 访问内层字典:`[keyword]`
   - 在 `defaultdict(int)` 中访问特定的 keyword 键
   - 如果 keyword 不存在,`defaultdict(int)` 会自动创建它并初始化为 0
3. 执行加法操作:`+= 1`
   - 获取当前 keyword 的值
   - 对该值加 1
   - 将结果存回 `stats['keywords_frequency'][keyword]`
详细的内存变化
# 当我们执行这行代码时:
keyword = 'error'
stats['keywords_frequency'][keyword] += 1
# 实际上发生了:
# 1. 访问 stats['keywords_frequency']['error']
# 2. 由于 'error' 键不存在,defaultdict 自动:
# - 创建 'error' 键
# - 将其值设置为 int() 即 0
# 3. 然后对这个值执行 +1 操作
# 4. 结果:stats['keywords_frequency']['error'] = 1
stats['keywords_frequency'] = defaultdict(<class 'int'>, {})
访问 stats['keywords_frequency']['error'] → 返回 0(默认值)
0 + 1 = 1
存储 stats['keywords_frequency']['error'] = 1
结果: {'error': 1}
stats['keywords_frequency'] = defaultdict(<class 'int'>, {'error': 1})
访问 stats['keywords_frequency']['error'] → 返回 1
1 + 1 = 2
存储 stats['keywords_frequency']['error'] = 2
结果: {'error': 2}
stats['keywords_frequency'] = defaultdict(<class 'int'>, {'error': 2})
访问 stats['keywords_frequency']['exception'] → 返回 0(默认值)
0 + 1 = 1
存储 stats['keywords_frequency']['exception'] = 1
结果: {'error': 2, 'exception': 1}
3.def display_statistics(stats: Dict):
print("\n日志级别分布:")
for level, count in stats['level_distribution'].most_common():
percentage = (count / stats['total_logs']) * 100
print(f" {level:8}: {count:4d} ({percentage:5.1f}%)")
most_common() 是 Python collections.Counter 类的一个方法,用于返回计数器中最常见的元素及其计数的列表。
方法定义:
Counter.most_common(n=None)
- 参数:
  - `n`(可选): 要返回的最常见元素的数量。如果省略或为 None,则返回所有元素
- 返回值: 由 `(element, count)` 元组组成的列表,按计数从高到低排序

from collections import Counter
创建一个 Counter 对象
word_counts = Counter(['apple', 'banana', 'apple', 'orange', 'banana', 'apple'])
获取所有元素的频率排序
print(word_counts.most_common())
输出: [('apple', 3), ('banana', 2), ('orange', 1)]
获取前2个最常见的元素
print(word_counts.most_common(2))
输出: [('apple', 3), ('banana', 2)]