以下是一个重构后的高可用、可配置、低耦合的专利CSV处理函数,包含清晰的注释和结构:
python
import csv
import pandas as pd
from datetime import datetime
import os
from typing import List, Dict, Any, Optional, Tuple
import logging
# Configure module-level logging (INFO and above, timestamped format)
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
class PatentProcessor:
    """Patent CSV data processor.

    Loads Google-Patents-style CSV exports (first physical row is file
    metadata, the second row is the header), optionally filters rows,
    converts the data to Excel, and produces per-author statistics.
    """

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        """Initialize the processor.

        Args:
            config: configuration dict with processing options; falls back
                to :meth:`get_default_config` when omitted or falsy.
        """
        self.config = config or self.get_default_config()

    @staticmethod
    def get_default_config() -> Dict[str, Any]:
        """Return the default processing configuration."""
        return {
            'required_columns': ['id', 'inventor/author', 'title', 'priority date'],
            'author_column': 'inventor/author',
            'split_separator': ',',
            # e.g.: {'column': 'assignee', 'value': '百度', 'case_sensitive': False}
            'filter_condition': None,
            'output_columns': ['author', 'value', 'id', 'title_list',
                               'priority_date_list', 'start_year', 'end_year'],
            'encoding': 'utf-8'
        }

    def load_csv_data(self, csv_filepath: str) -> Optional[pd.DataFrame]:
        """Load CSV data.

        The first physical row is treated as export metadata; the second row
        supplies the column names and data starts on the third row.

        Args:
            csv_filepath: path to the CSV file

        Returns:
            A DataFrame, or None when the file is missing, unreadable, or has
            fewer than two rows.
        """
        try:
            with open(csv_filepath, encoding=self.config['encoding']) as f:
                data = list(csv.reader(f))
            if len(data) < 2:
                logger.warning(f"File {csv_filepath} has insufficient data rows")
                return None
            # Second row is the header; data begins on the third row.
            df = pd.DataFrame(data[2:], columns=data[1])
            logger.info(f"Successfully loaded CSV file: {csv_filepath}")
            return df
        except FileNotFoundError:
            logger.error(f"Error: The file {csv_filepath} does not exist.")
        except Exception as e:
            logger.error(f"An unexpected error occurred while loading {csv_filepath}: {e}")
        return None

    def apply_filters(self, df: pd.DataFrame) -> pd.DataFrame:
        """Apply the configured row filter, if any.

        Args:
            df: input DataFrame

        Returns:
            The filtered DataFrame (unchanged when no filter is configured or
            the filter column is absent).
        """
        filter_condition = self.config.get('filter_condition')
        if filter_condition:
            column = filter_condition['column']
            value = filter_condition['value']
            case_sensitive = filter_condition.get('case_sensitive', False)
            if column in df.columns:
                if case_sensitive:
                    df = df[df[column].str.contains(value, na=False)]
                else:
                    df = df[df[column].str.contains(value, case=False, na=False)]
                logger.info(f"Applied filter: {filter_condition}")
        return df

    def convert_to_excel(self, df: pd.DataFrame, csv_filepath: str) -> str:
        """Write the DataFrame to an Excel file next to the source CSV.

        Args:
            df: input DataFrame
            csv_filepath: original CSV path (used to derive the output path)

        Returns:
            The generated Excel path, or "" on failure.
        """
        try:
            # splitext is safer than str.replace, which would also rewrite a
            # '.csv' substring occurring earlier in the path.
            excel_filepath = os.path.splitext(csv_filepath)[0] + '.xlsx'
            df.to_excel(excel_filepath, index=False)
            logger.info(f"CSV file has been converted to Excel: {excel_filepath}")
            return excel_filepath
        except Exception as e:
            logger.error(f"Error converting to Excel: {e}")
            return ""

    def process_authors(self, df: pd.DataFrame) -> pd.DataFrame:
        """Build per-author statistics.

        Splits the author column, explodes to one row per individual author,
        and for each author collects patent ids, titles, priority dates and
        the first/last priority year.

        Args:
            df: patent data

        Returns:
            DataFrame with columns author/value/id/title_list and — when a
            'priority date' column is present — priority_date_list,
            start_year, end_year. Empty when the author column is absent.
        """
        required_cols = self.config['required_columns']
        missing_cols = [col for col in required_cols if col not in df.columns]
        if missing_cols:
            logger.warning(f"Missing columns: {missing_cols}. Available columns: {list(df.columns)}")
            required_cols = [col for col in required_cols if col in df.columns]
        df = df[required_cols].copy()

        author_col = self.config['author_column']
        if author_col not in df.columns:
            # Fix: previously this path produced no result frame at all;
            # return an explicit empty frame instead.
            logger.warning(f"Author column '{author_col}' not found; returning empty stats")
            return pd.DataFrame(columns=['author', 'value'])

        # One row per individual author.
        df[author_col] = df[author_col].str.split(self.config['split_separator'])
        df = df.explode(author_col)
        df[author_col] = df[author_col].str.strip()

        author_counts = df[author_col].value_counts()
        new_df = pd.DataFrame({
            'author': author_counts.index,
            'value': author_counts.values
        })

        # Group once instead of re-filtering the frame per author
        # (O(n) instead of accidental O(n*m)).
        grouped = df.groupby(author_col)
        new_df['id'] = new_df['author'].map(grouped['id'].apply(list))
        new_df['title_list'] = new_df['author'].map(grouped['title'].apply(list))

        if 'priority date' in df.columns:
            new_df['priority_date_list'] = new_df['author'].map(
                grouped['priority date'].apply(list))
            # Derive the first and last priority year per author.
            date_ranges = [self.calculate_date_range(dates)
                           for dates in new_df['priority_date_list']]
            new_df['start_year'] = [start for start, _ in date_ranges]
            new_df['end_year'] = [end for _, end in date_ranges]
        return new_df

    @staticmethod
    def calculate_date_range(date_list: List[str]) -> Tuple[Optional[int], Optional[int]]:
        """Return (first year, last year) parsed from a list of date strings.

        Accepts 'YYYY-MM-DD' and 'YYYY/MM/DD'; blank, NA, or unparseable
        entries are skipped.

        Args:
            date_list: date strings

        Returns:
            (start year, end year), or (None, None) when nothing parses.
        """
        valid_dates = []
        for date_str in date_list:
            # str() guards against non-string entries surviving in the list.
            if pd.isna(date_str) or not str(date_str).strip():
                continue
            text = str(date_str).strip()
            for fmt in ('%Y-%m-%d', '%Y/%m/%d'):
                try:
                    valid_dates.append(datetime.strptime(text, fmt))
                    break
                except ValueError:
                    continue
        if not valid_dates:
            return None, None
        return min(valid_dates).year, max(valid_dates).year

    def save_author_stats(self, author_df: pd.DataFrame, csv_filepath: str) -> str:
        """Save the author statistics to '<csv stem>_rank.xlsx'.

        Args:
            author_df: author statistics DataFrame
            csv_filepath: original CSV path (used to derive the output path)

        Returns:
            The generated stats path, or "" on failure.
        """
        try:
            rank_excel_filepath = os.path.splitext(csv_filepath)[0] + '_rank.xlsx'
            # Only persist the columns requested by the configuration.
            output_cols = [col for col in self.config['output_columns'] if col in author_df.columns]
            author_df[output_cols].to_excel(rank_excel_filepath, index=False)
            logger.info(f"Author statistics saved: {rank_excel_filepath}")
            return rank_excel_filepath
        except Exception as e:
            logger.error(f"Error saving author statistics: {e}")
            return ""

    def process_patent_file(self, csv_filepath: str) -> Dict[str, str]:
        """Run the full pipeline for a single patent CSV file.

        Args:
            csv_filepath: path to the CSV file

        Returns:
            Dict with 'original_file' and, on success, 'excel_file' and
            'stats_file' paths.
        """
        results = {'original_file': csv_filepath}

        # 1. Load data.
        df = self.load_csv_data(csv_filepath)
        if df is None or df.empty:
            return results

        # 2. Apply the configured filter.
        df = self.apply_filters(df)

        # 3. Convert to Excel.
        excel_path = self.convert_to_excel(df, csv_filepath)
        results['excel_file'] = excel_path

        # 4. Compute author statistics.
        author_df = self.process_authors(df)

        # 5. Persist author statistics.
        stats_path = self.save_author_stats(author_df, csv_filepath)
        results['stats_file'] = stats_path

        return results
# Usage example
def process_patents_folder(folder_path: str, file_list: List[str], config: Optional[Dict[str, Any]] = None):
    """Process multiple patent CSV files in a folder.

    Args:
        folder_path: directory containing the CSV files
        file_list: names of the files to process
        config: optional processing configuration passed to PatentProcessor
    """
    processor = PatentProcessor(config)
    for filename in file_list:
        file_path = os.path.join(folder_path, filename)
        # Fix: these log messages previously contained a literal placeholder
        # instead of the file actually being processed.
        logger.info("Processing file: %s", file_path)
        try:
            results = processor.process_patent_file(file_path)
            logger.info("Completed processing: %s", file_path)
            logger.info("Results: %s", results)
        except Exception as e:
            logger.error("Error processing %s: %s", file_path, e)
# Example configuration
CUSTOM_CONFIG = {
    'required_columns': ['id', 'inventor/author', 'title', 'priority date'],
    'author_column': 'inventor/author',
    'split_separator': ',',
    'filter_condition': None,  # e.g.: {'column': 'assignee', 'value': 'Google', 'case_sensitive': False}
    'output_columns': ['author', 'value', 'id', 'title_list', 'priority_date_list', 'start_year', 'end_year'],
    'encoding': 'utf-8'
}
if __name__ == "__main__":
    # Example usage.
    # NOTE(review): the original path was missing the separator after the
    # drive letter (r'E:files\patents'); corrected to a well-formed path.
    folder = r'E:\files\patents'
    gp_files = [
        "gp-search-google.csv",
    ]
    # Process the files with the default configuration.
    process_patents_folder(folder, gp_files)
    # Process the files with a custom configuration:
    # process_patents_folder(folder, gp_files, CUSTOM_CONFIG)
这个重构版本具有以下优点:
- 模块化设计:将功能拆分为多个独立的方法,每个方法只负责一个明确的任务
- 可配置性:通过配置字典可以灵活调整处理参数
- 错误处理:完善的异常处理和日志记录
- 类型提示:添加了类型提示,提高代码可读性和可维护性
- 灵活性:支持自定义过滤条件、输出列等
- 可扩展性:易于添加新的处理功能或修改现有逻辑
- 清晰的文档:每个方法都有详细的文档字符串说明参数和返回值
使用方法:
python
# 简单使用
processor = PatentProcessor()
results = processor.process_patent_file('path/to/patents.csv')
# 自定义配置使用
config = {
'filter_condition': {'column': 'assignee', 'value': 'Google', 'case_sensitive': False},
'required_columns': ['id', 'inventor/author', 'title', 'priority date']
}
processor = PatentProcessor(config)
results = processor.process_patent_file('path/to/patents.csv')