python测开小工具--日志查询分析工具

目录

1.代码

2.代码详解

1. def analyze_statistics(self) -> Dict:

2. stats['keywords_frequency'][keyword] += 1

3. def display_statistics(stats: Dict):


1.代码

复制代码
import re
import os
import datetime
import argparse
from collections import Counter, defaultdict
from typing import List, Dict, Tuple, Optional
import sys


class LogAnalyzer:
    """Log analysis tool.

    Loads a log file, parses each record into a structured dict, and offers
    keyword / level / thread / time-range filtering plus summary statistics.
    Multi-line records (e.g. stack traces) are folded into the record that
    precedes them.
    """

    # Common log level names, kept as a reference for parsing / filtering.
    LOG_LEVELS = ['TRACE', 'DEBUG', 'INFO', 'WARN', 'WARNING', 'ERROR', 'FATAL', 'CRITICAL']

    def __init__(self):
        self.logs: List[Dict] = []   # parsed log entries, in file order
        self.stats: Dict = {}        # result of the last analyze_statistics() call
        # ANSI terminal escape sequences (colors etc.) are stripped before parsing.
        self.ansi_escape = re.compile(r'\x1B(?:[@-Z\\-_]|\[[0-?]*[ -/]*[@-~])')
        # Expected record shape:
        #   "YYYY-mm-dd HH:MM:SS [thread] [requestId]? LEVEL class.Name message"
        self.clean_log_pattern = re.compile(
            r'^(?P<timestamp>\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})\s+'  # timestamp
            r'\[(?P<thread>[^\]]+)\]\s+'          # thread name
            r'(?:\[(?P<request_id>[^\]]*)\]\s+)?'  # optional requestId
            r'(?P<level>\w+)\s+'                   # log level
            r'(?P<class_name>\S+)?\s*'             # optional class name
            r'(?P<message>.*)$'                    # log message
        )

    def remove_ansi_escape_sequences(self, text: str) -> str:
        """Return *text* with all ANSI escape sequences removed."""
        return self.ansi_escape.sub('', text)

    def load_logs(self, file_path: str) -> bool:
        """Load and parse a log file into ``self.logs``.

        Fix vs. previous version: lines that do not match the record pattern
        (stack-trace frames, wrapped messages) are appended to the previous
        record's message instead of being silently dropped.

        Returns True on success, False on a missing file or read error.
        """
        try:
            if not os.path.exists(file_path):
                print(f"错误: 文件 {file_path} 不存在")
                return False

            with open(file_path, 'r', encoding='utf-8') as f:
                lines = f.readlines()

            self.logs = []
            current_log: Optional[Dict] = None
            for line_num, line in enumerate(lines, 1):
                stripped = line.strip()
                if not stripped:
                    continue  # skip blank lines

                log_entry = self._parse_log_line(stripped, line_num)
                if log_entry is not None:
                    # A new record starts: flush the previous one first.
                    if current_log:
                        self.logs.append(current_log)
                    current_log = log_entry
                elif current_log:
                    # Continuation line (e.g. a stack-trace frame): fold it
                    # into the current record so it stays searchable.
                    clean = self.remove_ansi_escape_sequences(stripped).strip()
                    current_log['message'] += '\n' + clean
                    current_log['raw'] += '\n' + stripped

            # Flush the trailing record.
            if current_log:
                self.logs.append(current_log)

            print(f"成功加载 {len(self.logs)} 条日志记录")
            return True

        except Exception as e:
            print(f"加载日志文件时出错: {e}")
            return False

    def _parse_log_line(self, line: str, line_num: int) -> Optional[Dict]:
        """Parse one log line into a record dict.

        Returns None when the line does not look like the start of a record
        (the caller then treats it as a continuation line).
        """
        # Strip ANSI escape sequences first so colored logs still match.
        clean_line = self.remove_ansi_escape_sequences(line).strip()
        if not clean_line:
            return None

        match = self.clean_log_pattern.match(clean_line)
        if match:
            return {
                'timestamp': match.group('timestamp'),
                'thread': match.group('thread'),
                'request_id': match.group('request_id'),
                'level': match.group('level'),
                'class_name': match.group('class_name'),
                'message': match.group('message').strip(),
                'raw': line,  # original line, used by keyword search
                'line_number': line_num
            }
        return None

    def search_logs(self, keyword: str = None, level: str = None,
                    thread: str = None, case_sensitive: bool = False,
                    logs: List[Dict] = None) -> List[Dict]:
        """Filter log records by level, thread substring and/or keyword.

        When *logs* is None the full loaded set is used, so filters can be
        chained by passing a previous result back in.
        """
        if logs is None:
            logs = self.logs

        results = list(logs)

        # Filter by level (case-insensitive exact match).
        if level:
            level_upper = level.upper()
            results = [log for log in results
                       if log['level'] and log['level'].upper() == level_upper]

        # Filter by thread name substring.
        if thread:
            results = [log for log in results
                       if log['thread'] and thread in log['thread']]

        # Filter by keyword over the raw line (includes folded continuations).
        if keyword:
            if case_sensitive:
                results = [log for log in results if keyword in log['raw']]
            else:
                keyword_lower = keyword.lower()
                results = [log for log in results
                           if keyword_lower in log['raw'].lower()]

        return results

    def time_range_search(self, start_time: str = None, end_time: str = None,
                          logs: List[Dict] = None) -> List[Dict]:
        """Filter records whose timestamp falls inside [start_time, end_time].

        Timestamps are compared lexicographically, which is correct for the
        "YYYY-mm-dd HH:MM:SS" format. Records without a timestamp are skipped.

        Fix vs. previous version: with no bounds the *passed-in* set is
        returned (it used to return ``self.logs``, discarding prior filters).
        """
        if logs is None:
            logs = self.logs

        if not start_time and not end_time:
            return list(logs)

        results = []
        for log in logs:
            ts = log['timestamp']
            if not ts:
                continue
            if start_time and ts < start_time:
                continue
            if end_time and ts > end_time:
                continue
            results.append(log)

        return results

    def analyze_statistics(self, logs: List[Dict] = None) -> Dict:
        """Build a statistics summary for *logs* (default: all loaded logs).

        Fix vs. previous version: the empty-check now tests the *logs*
        parameter, so an empty filtered set returns {} instead of a stats
        dict with total_logs == 0 (which caused a ZeroDivisionError when
        displayed).
        """
        if logs is None:
            logs = self.logs

        if not logs:
            return {}

        stats = {
            'total_logs': len(logs),
            'level_distribution': Counter(log['level'] for log in logs if log['level']),
            'thread_distribution': Counter(log['thread'] for log in logs if log['thread']),
            'time_range': {},
            'keywords_frequency': defaultdict(int)
        }

        # Time range: lexicographic min/max works for this timestamp format.
        timestamps = [log['timestamp'] for log in logs if log['timestamp']]
        if timestamps:
            stats['time_range']['start'] = min(timestamps)
            stats['time_range']['end'] = max(timestamps)

        # Frequency of common error-related keywords in messages.
        error_keywords = ['error', 'exception', 'failed', 'timeout', 'null', 'undefined']
        for log in logs:
            message_lower = log['message'].lower()
            for keyword in error_keywords:
                if keyword in message_lower:
                    stats['keywords_frequency'][keyword] += 1

        self.stats = stats
        return stats

    def export_results(self, results: List[Dict], output_file: str):
        """Write the raw line of each record in *results* to *output_file*."""
        try:
            with open(output_file, 'w', encoding='utf-8') as f:
                for log in results:
                    f.write(f"{log['raw']}\n")
            print(f"结果已导出到: {output_file}")
        except Exception as e:
            print(f"导出文件时出错: {e}")


def display_results(results: List[Dict], max_display: int = 50):
    """Print up to *max_display* matched log records to stdout."""
    if not results:
        print("未找到匹配的日志记录")
        return

    print(f"\n找到 {len(results)} 条匹配记录:")
    print("-" * 100)

    for idx, entry in enumerate(results[:max_display], start=1):
        level_part = f"[{entry['level']}]" if entry['level'] else "[UNKNOWN]"
        ts_part = entry['timestamp'] if entry['timestamp'] else "No timestamp"
        thread_part = f"({entry['thread']})" if entry['thread'] else ""
        print(f"{idx:3d}. {ts_part} {level_part:8} {thread_part:15} {entry['message']}")

    hidden = len(results) - max_display
    if hidden > 0:
        print(f"... 还有 {hidden} 条记录未显示")


def display_statistics(stats: Dict):
    """Print a formatted statistics report built by LogAnalyzer.analyze_statistics().

    Works whether 'keywords_frequency' is a Counter, defaultdict or plain
    dict, by sorting its items by count instead of relying on most_common().
    """
    if not stats:
        print("暂无统计信息")
        return

    print("\n" + "=" * 50)
    print("日志统计分析报告")
    print("=" * 50)

    print(f"总日志数: {stats['total_logs']}")

    if stats.get('time_range'):
        print(f"时间范围: {stats['time_range'].get('start', 'N/A')} 到 {stats['time_range'].get('end', 'N/A')}")

    print("\n日志级别分布:")
    for level, count in stats['level_distribution'].most_common():
        percentage = (count / stats['total_logs']) * 100
        print(f"  {level:8}: {count:4d} ({percentage:5.1f}%)")

    print("\n线程分布 (Top 10):")
    for thread, count in stats['thread_distribution'].most_common(10):
        print(f"  {thread:20}: {count:4d}")

    print("\n错误关键词频率:")
    # Sort by count descending; works for any mapping type.
    keyword_items = sorted(stats['keywords_frequency'].items(),
                           key=lambda kv: kv[1], reverse=True)
    for keyword, count in keyword_items:
        print(f"  {keyword:15}: {count:4d}")


def main():
    """Command-line entry point: parse args, load, filter, display, export."""
    parser = argparse.ArgumentParser(description='日志查询分析工具')
    parser.add_argument('file', help='日志文件路径')
    parser.add_argument('-k', '--keyword', help='搜索关键词')
    parser.add_argument('-l', '--level', help='日志级别过滤')
    parser.add_argument('-t', '--thread', help='线程名过滤')
    parser.add_argument('--start-time', help='开始时间 (格式: YYYY-MM-DD HH:MM:SS)')
    parser.add_argument('--end-time', help='结束时间 (格式: YYYY-MM-DD HH:MM:SS)')
    parser.add_argument('-c', '--case-sensitive', action='store_true', help='大小写敏感')
    parser.add_argument('-s', '--statistics', action='store_true', help='显示统计信息')
    parser.add_argument('-o', '--output', help='导出结果到文件')
    parser.add_argument('--max-display', type=int, default=50, help='最大显示行数 (默认: 50)')
    args = parser.parse_args()

    analyzer = LogAnalyzer()
    if not analyzer.load_logs(args.file):
        sys.exit(1)

    # Statistics are computed over the full file, before any filtering.
    stats = analyzer.analyze_statistics()

    # Start from the full set and narrow it down, time range first,
    # then keyword/level/thread.
    results = analyzer.logs
    if args.start_time or args.end_time:
        results = analyzer.time_range_search(args.start_time, args.end_time, logs=results)
    if args.keyword or args.level or args.thread:
        results = analyzer.search_logs(
            keyword=args.keyword,
            level=args.level,
            thread=args.thread,
            case_sensitive=args.case_sensitive,
            logs=results,
        )

    display_results(results, args.max_display)

    if args.statistics:
        display_statistics(stats)

    if args.output:
        analyzer.export_results(results, args.output)

def interactive_mode():
    """Interactive menu with chained filtering.

    Each filter is applied on top of the current result set so the user can
    narrow the logs step by step; the applied filter trail is displayed.

    Fixes vs. previous version: the prompt now says (1-8) to match the
    8 menu options, and the duplicate "exported" message after choice 6 is
    removed (export_results already prints one).
    """
    print("=" * 60)
    print("日志查询分析小工具 - 交互模式")
    print("=" * 60)

    file_path = input("请输入日志文件路径: ").strip()
    if not file_path:
        print("未提供文件路径,退出程序")
        return

    analyzer = LogAnalyzer()
    if not analyzer.load_logs(file_path):
        return

    # Current working set and the human-readable trail of applied filters.
    current_results = analyzer.logs.copy()
    current_filters = []

    while True:
        print("\n" + "=" * 50)
        print(f"当前结果集: {len(current_results)} 条记录")
        if current_filters:
            print("当前过滤条件:", " -> ".join(current_filters))
        print("\n请选择操作:")
        print("1. 关键词搜索")
        print("2. 按级别过滤")
        print("3. 按线程过滤")
        print("4. 时间范围搜索")
        print("5. 显示统计信息")
        print("6. 导出结果")
        print("7. 重新加载文件")
        print("8. 退出")

        choice = input("请输入选择 (1-8): ").strip()

        if choice == '1':
            keyword = input("请输入搜索关键词: ").strip()
            if not keyword:
                print("关键词不能为空")
                continue
            case_sensitive = input("是否大小写敏感? (y/N): ").strip().lower() == 'y'

            # Apply the keyword filter on top of the current result set.
            results = analyzer.search_logs(
                keyword=keyword,
                case_sensitive=case_sensitive,
                logs=current_results
            )
            current_results = results
            current_filters.append(f"关键词: {keyword}")
            display_results(results)

        elif choice == '2':
            level = input("请输入日志级别 (DEBUG/INFO/WARN/ERROR等): ").strip().upper()
            if not level:
                print("级别不能为空")
                continue

            # Apply the level filter on top of the current result set.
            results = analyzer.search_logs(level=level, logs=current_results)
            current_results = results
            current_filters.append(f"级别: {level}")
            display_results(results)

        elif choice == '3':
            thread = input("请输入线程名: ").strip()
            if not thread:
                print("线程名不能为空")
                continue

            # Apply the thread filter on top of the current result set.
            results = analyzer.search_logs(thread=thread, logs=current_results)
            current_results = results
            current_filters.append(f"线程: {thread}")
            display_results(results)

        elif choice == '4':
            start_time = input("开始时间 (格式: 2023-10-01 12:00:00): ").strip()
            end_time = input("结束时间 (格式: 2023-10-01 13:00:00): ").strip()

            # Apply the time-range filter on top of the current result set.
            results = analyzer.time_range_search(
                start_time or None,
                end_time or None,
                logs=current_results
            )
            current_results = results

            # Record the time-range condition in the filter trail.
            time_filter = "时间范围"
            if start_time:
                time_filter += f" 从 {start_time}"
            if end_time:
                time_filter += f" 到 {end_time}"
            current_filters.append(time_filter)

            display_results(results)

        elif choice == '5':
            # Statistics over the current (filtered) result set.
            stats = analyzer.analyze_statistics(logs=current_results)
            display_statistics(stats)

        elif choice == '6':
            if not current_results:
                print("没有可导出的数据")
                continue
            output_file = input("请输入导出文件名: ").strip()
            if output_file:
                # export_results prints its own confirmation message.
                analyzer.export_results(current_results, output_file)

        elif choice == '7':
            if analyzer.load_logs(file_path):
                # Reset the result set and filter trail after a reload.
                current_results = analyzer.logs.copy()
                current_filters = []
                print("文件重新加载成功,过滤条件已重置")

        elif choice == '8':
            print("感谢使用,再见!")
            break

        else:
            print("无效选择,请重新输入")

if __name__ == "__main__":
    if len(sys.argv) > 1:
        # 命令行模式
        main()
    else:
        # 交互式模式
        interactive_mode()

2.代码详解

1. def analyze_statistics(self) -> Dict:

复制代码
stats = {
            'total_logs': len(self.logs),
            'level_distribution': Counter(),
            'thread_distribution': Counter(),
            'time_range': {},
            'keywords_frequency': defaultdict(int)
        }

        # 统计级别分布
        levels = [log['level'] for log in self.logs if log['level']]
        stats['level_distribution'] = Counter(levels)

levels = [log['level'] for log in self.logs if log['level']]

列表推导式提取级别

分解说明:

  • for log in self.logs:遍历self.logs列表中的每个日志条目
  • log['level']:从每个日志字典中获取level字段的值
  • if log['level']:过滤条件,只保留非空、非None、非False的级别值
  • 整个表达式:创建一个包含所有非空日志级别的列表

stats['level_distribution'] = Counter(levels)

使用Counter统计频率

分解说明:

  • Counter(levels):创建Counter对象,自动统计列表中每个元素的出现次数
  • stats['level_distribution'] = ...:将统计结果赋值给stats字典的对应字段

Counter的工作原理:

  • 接收一个可迭代对象(如列表)
  • 返回一个字典子类,键是列表中的元素,值是出现次数
  • 自动处理不存在的键,返回0而不是抛出KeyError

例子:

levels=['INFO', 'ERROR', 'INFO', 'WARNING']

Counter(['INFO', 'ERROR', 'INFO', 'WARNING'])

返回Counter({'INFO': 2, 'ERROR': 1, 'WARNING': 1})

赋值给stats['level_distribution']=Counter({'INFO': 2, 'ERROR': 1, 'WARNING': 1})

counter对不存在键的处理

复制代码
from collections import Counter

# 创建一个Counter对象
level_counter = Counter(['INFO', 'ERROR', 'INFO', 'WARNING'])
print("原始Counter:", level_counter)

# 访问存在的键
print("INFO count:", level_counter['INFO'])  # 输出: 2

# 访问不存在的键
print("DEBUG count:", level_counter['DEBUG'])  # 输出: 0 (不会报错!)

2.stats['keywords_frequency'][keyword] += 1

复制代码
# 常见错误关键词频率
error_keywords = ['error', 'exception', 'failed', 'timeout', 'null', 'undefined']
for log in self.logs:
    message_lower = log['message'].lower()
    for keyword in error_keywords:
        if keyword in message_lower:
            stats['keywords_frequency'][keyword] += 1

stats['keywords_frequency'][keyword] += 1

分解执行步骤:

  1. 访问外层字典stats['keywords_frequency']
    • 获取stats字典中'keywords_frequency'键对应的值
    • 这个值是一个defaultdict(int)对象
  2. 访问内层字典[keyword]
    • defaultdict(int)中访问特定的keyword
    • 如果keyword不存在,defaultdict(int)会自动创建它并初始化为0
  3. 执行加法操作+= 1
    • 获取当前keyword的值
    • 对该值加1
    • 将结果存回stats['keywords_frequency'][keyword]

详细的内存变化

复制代码
# 当我们执行这行代码时:
keyword = 'error'
stats['keywords_frequency'][keyword] += 1

# 实际上发生了:
# 1. 访问 stats['keywords_frequency']['error'] 
# 2. 由于 'error' 键不存在,defaultdict 自动:
#    - 创建 'error' 键
#    - 将其值设置为 int() 即 0
# 3. 然后对这个值执行 +1 操作
# 4. 结果:stats['keywords_frequency']['error'] = 1

stats['keywords_frequency'] = defaultdict(<class 'int'>, {})
访问 stats['keywords_frequency']['error'] → 返回 0(默认值)
0 + 1 = 1
存储 stats['keywords_frequency']['error'] = 1
结果: {'error': 1}

stats['keywords_frequency'] = defaultdict(<class 'int'>, {'error': 1})
访问 stats['keywords_frequency']['error'] → 返回 1
1 + 1 = 2
存储 stats['keywords_frequency']['error'] = 2
结果: {'error': 2}

stats['keywords_frequency'] = defaultdict(<class 'int'>, {'error': 2})
访问 stats['keywords_frequency']['exception'] → 返回 0(默认值)
0 + 1 = 1
存储 stats['keywords_frequency']['exception'] = 1
结果: {'error': 2, 'exception': 1}

3.def display_statistics(stats: Dict):

复制代码
print("\n日志级别分布:")
    for level, count in stats['level_distribution'].most_common():
        percentage = (count / stats['total_logs']) * 100
        print(f"  {level:8}: {count:4d} ({percentage:5.1f}%)")

most_common() 是 Python collections.Counter 类的一个方法,用于返回计数器中最常见的元素及其计数的列表。

方法定义:

Counter.most_common(n=None)

  • 参数:
    • n (可选): 要返回的最常见元素的数量。如果省略或为 None,则返回所有元素
  • 返回值 : 由 (element, count) 元组组成的列表,按计数从高到低排序

    from collections import Counter

    创建一个 Counter 对象

    word_counts = Counter(['apple', 'banana', 'apple', 'orange', 'banana', 'apple'])

    获取所有元素的频率排序

    print(word_counts.most_common())

    输出: [('apple', 3), ('banana', 2), ('orange', 1)]

    获取前2个最常见的元素

    print(word_counts.most_common(2))

    输出: [('apple', 3), ('banana', 2)]

相关推荐
hu_nil2 小时前
LLMOps-第十三周
python·vllm
空影星2 小时前
轻量日记神器RedNotebook,高效记录每一天
python·数据挖掘·数据分析·音视频
搬砖ing换来金砖2 小时前
Python入门-Task02
开发语言·python
databook2 小时前
告别盲人摸象,数据分析的抽样方法总结
后端·python·数据分析
全栈陈序员3 小时前
【Python】基础语法入门(九)—— 代码规范、调试技巧与性能初探
开发语言·python·代码规范
nvd113 小时前
解决 Gemini API 连接卡住问题的方案
python
李剑一3 小时前
Python学习笔记2
python
晨非辰3 小时前
C++ 波澜壮阔 40 年:从基础I/O到函数重载与引用的完整构建
运维·c++·人工智能·后端·python·深度学习·c++40周年
有梦想的西瓜3 小时前
如何优化电力系统潮流分布:最优潮流(OPF)问题
python·电力·opf