以下是一个重构后的高可用、可配置、低耦合的专利CSV处理函数,包含清晰的注释和结构:
python
import csv
import pandas as pd
from datetime import datetime
import os
from typing import List, Dict, Any, Optional, Tuple
import logging
# Configure module-level logging (INFO and above, timestamped format)
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
class PatentProcessor:
    """Patent CSV data processor.

    Loads Google-Patents-style CSV exports (first physical row is file
    metadata, the second row is the header), optionally filters rows,
    converts the data to Excel, and produces per-author statistics.
    """

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        """Initialize the processor.

        Args:
            config: configuration dict with processing options; falls back
                to :meth:`get_default_config` when omitted or falsy.
        """
        self.config = config or self.get_default_config()

    @staticmethod
    def get_default_config() -> Dict[str, Any]:
        """Return the default processing configuration."""
        return {
            'required_columns': ['id', 'inventor/author', 'title', 'priority date'],
            'author_column': 'inventor/author',
            'split_separator': ',',
            # e.g.: {'column': 'assignee', 'value': '百度', 'case_sensitive': False}
            'filter_condition': None,
            'output_columns': ['author', 'value', 'id', 'title_list',
                               'priority_date_list', 'start_year', 'end_year'],
            'encoding': 'utf-8'
        }

    def load_csv_data(self, csv_filepath: str) -> Optional[pd.DataFrame]:
        """Load CSV data.

        The first physical row is treated as export metadata; the second row
        supplies the column names and data starts on the third row.

        Args:
            csv_filepath: path to the CSV file

        Returns:
            A DataFrame, or None when the file is missing, unreadable, or has
            fewer than two rows.
        """
        try:
            with open(csv_filepath, encoding=self.config['encoding']) as f:
                data = list(csv.reader(f))
            if len(data) < 2:
                logger.warning(f"File {csv_filepath} has insufficient data rows")
                return None
            # Second row is the header; data begins on the third row.
            df = pd.DataFrame(data[2:], columns=data[1])
            logger.info(f"Successfully loaded CSV file: {csv_filepath}")
            return df
        except FileNotFoundError:
            logger.error(f"Error: The file {csv_filepath} does not exist.")
        except Exception as e:
            logger.error(f"An unexpected error occurred while loading {csv_filepath}: {e}")
        return None

    def apply_filters(self, df: pd.DataFrame) -> pd.DataFrame:
        """Apply the configured row filter, if any.

        Args:
            df: input DataFrame

        Returns:
            The filtered DataFrame (unchanged when no filter is configured or
            the filter column is absent).
        """
        filter_condition = self.config.get('filter_condition')
        if filter_condition:
            column = filter_condition['column']
            value = filter_condition['value']
            case_sensitive = filter_condition.get('case_sensitive', False)
            if column in df.columns:
                if case_sensitive:
                    df = df[df[column].str.contains(value, na=False)]
                else:
                    df = df[df[column].str.contains(value, case=False, na=False)]
                logger.info(f"Applied filter: {filter_condition}")
        return df

    def convert_to_excel(self, df: pd.DataFrame, csv_filepath: str) -> str:
        """Write the DataFrame to an Excel file next to the source CSV.

        Args:
            df: input DataFrame
            csv_filepath: original CSV path (used to derive the output path)

        Returns:
            The generated Excel path, or "" on failure.
        """
        try:
            # splitext is safer than str.replace, which would also rewrite a
            # '.csv' substring occurring earlier in the path.
            excel_filepath = os.path.splitext(csv_filepath)[0] + '.xlsx'
            df.to_excel(excel_filepath, index=False)
            logger.info(f"CSV file has been converted to Excel: {excel_filepath}")
            return excel_filepath
        except Exception as e:
            logger.error(f"Error converting to Excel: {e}")
            return ""

    def process_authors(self, df: pd.DataFrame) -> pd.DataFrame:
        """Build per-author statistics.

        Splits the author column, explodes to one row per individual author,
        and for each author collects patent ids, titles, priority dates and
        the first/last priority year.

        Args:
            df: patent data

        Returns:
            DataFrame with columns author/value/id/title_list and — when a
            'priority date' column is present — priority_date_list,
            start_year, end_year. Empty when the author column is absent.
        """
        required_cols = self.config['required_columns']
        missing_cols = [col for col in required_cols if col not in df.columns]
        if missing_cols:
            logger.warning(f"Missing columns: {missing_cols}. Available columns: {list(df.columns)}")
            required_cols = [col for col in required_cols if col in df.columns]
        df = df[required_cols].copy()

        author_col = self.config['author_column']
        if author_col not in df.columns:
            # Fix: previously this path produced no result frame at all;
            # return an explicit empty frame instead.
            logger.warning(f"Author column '{author_col}' not found; returning empty stats")
            return pd.DataFrame(columns=['author', 'value'])

        # One row per individual author.
        df[author_col] = df[author_col].str.split(self.config['split_separator'])
        df = df.explode(author_col)
        df[author_col] = df[author_col].str.strip()

        author_counts = df[author_col].value_counts()
        new_df = pd.DataFrame({
            'author': author_counts.index,
            'value': author_counts.values
        })

        # Group once instead of re-filtering the frame per author
        # (O(n) instead of accidental O(n*m)).
        grouped = df.groupby(author_col)
        new_df['id'] = new_df['author'].map(grouped['id'].apply(list))
        new_df['title_list'] = new_df['author'].map(grouped['title'].apply(list))

        if 'priority date' in df.columns:
            new_df['priority_date_list'] = new_df['author'].map(
                grouped['priority date'].apply(list))
            # Derive the first and last priority year per author.
            date_ranges = [self.calculate_date_range(dates)
                           for dates in new_df['priority_date_list']]
            new_df['start_year'] = [start for start, _ in date_ranges]
            new_df['end_year'] = [end for _, end in date_ranges]
        return new_df

    @staticmethod
    def calculate_date_range(date_list: List[str]) -> Tuple[Optional[int], Optional[int]]:
        """Return (first year, last year) parsed from a list of date strings.

        Accepts 'YYYY-MM-DD' and 'YYYY/MM/DD'; blank, NA, or unparseable
        entries are skipped.

        Args:
            date_list: date strings

        Returns:
            (start year, end year), or (None, None) when nothing parses.
        """
        valid_dates = []
        for date_str in date_list:
            # str() guards against non-string entries surviving in the list.
            if pd.isna(date_str) or not str(date_str).strip():
                continue
            text = str(date_str).strip()
            for fmt in ('%Y-%m-%d', '%Y/%m/%d'):
                try:
                    valid_dates.append(datetime.strptime(text, fmt))
                    break
                except ValueError:
                    continue
        if not valid_dates:
            return None, None
        return min(valid_dates).year, max(valid_dates).year

    def save_author_stats(self, author_df: pd.DataFrame, csv_filepath: str) -> str:
        """Save the author statistics to '<csv stem>_rank.xlsx'.

        Args:
            author_df: author statistics DataFrame
            csv_filepath: original CSV path (used to derive the output path)

        Returns:
            The generated stats path, or "" on failure.
        """
        try:
            rank_excel_filepath = os.path.splitext(csv_filepath)[0] + '_rank.xlsx'
            # Only persist the columns requested by the configuration.
            output_cols = [col for col in self.config['output_columns'] if col in author_df.columns]
            author_df[output_cols].to_excel(rank_excel_filepath, index=False)
            logger.info(f"Author statistics saved: {rank_excel_filepath}")
            return rank_excel_filepath
        except Exception as e:
            logger.error(f"Error saving author statistics: {e}")
            return ""

    def process_patent_file(self, csv_filepath: str) -> Dict[str, str]:
        """Run the full pipeline for a single patent CSV file.

        Args:
            csv_filepath: path to the CSV file

        Returns:
            Dict with 'original_file' and, on success, 'excel_file' and
            'stats_file' paths.
        """
        results = {'original_file': csv_filepath}

        # 1. Load data.
        df = self.load_csv_data(csv_filepath)
        if df is None or df.empty:
            return results

        # 2. Apply the configured filter.
        df = self.apply_filters(df)

        # 3. Convert to Excel.
        excel_path = self.convert_to_excel(df, csv_filepath)
        results['excel_file'] = excel_path

        # 4. Compute author statistics.
        author_df = self.process_authors(df)

        # 5. Persist author statistics.
        stats_path = self.save_author_stats(author_df, csv_filepath)
        results['stats_file'] = stats_path

        return results
# Usage example
def process_patents_folder(folder_path: str, file_list: List[str], config: Optional[Dict[str, Any]] = None):
    """Process multiple patent CSV files in a folder.

    Args:
        folder_path: directory containing the CSV files
        file_list: names of the files to process
        config: optional processing configuration passed to PatentProcessor
    """
    processor = PatentProcessor(config)
    for filename in file_list:
        file_path = os.path.join(folder_path, filename)
        # Fix: these log messages previously contained a literal placeholder
        # instead of the file actually being processed.
        logger.info("Processing file: %s", file_path)
        try:
            results = processor.process_patent_file(file_path)
            logger.info("Completed processing: %s", file_path)
            logger.info("Results: %s", results)
        except Exception as e:
            logger.error("Error processing %s: %s", file_path, e)
# Example configuration
CUSTOM_CONFIG = {
    'required_columns': ['id', 'inventor/author', 'title', 'priority date'],
    'author_column': 'inventor/author',
    'split_separator': ',',
    'filter_condition': None,  # e.g.: {'column': 'assignee', 'value': 'Google', 'case_sensitive': False}
    'output_columns': ['author', 'value', 'id', 'title_list', 'priority_date_list', 'start_year', 'end_year'],
    'encoding': 'utf-8'
}
if __name__ == "__main__":
    # Example usage.
    # NOTE(review): the original path was missing the separator after the
    # drive letter (r'E:files\patents'); corrected to a well-formed path.
    folder = r'E:\files\patents'
    gp_files = [
        "gp-search-google.csv",
    ]
    # Process the files with the default configuration.
    process_patents_folder(folder, gp_files)
    # Process the files with a custom configuration:
    # process_patents_folder(folder, gp_files, CUSTOM_CONFIG)
这个重构版本具有以下优点:
- 模块化设计:将功能拆分为多个独立的方法,每个方法只负责一个明确的任务
- 可配置性:通过配置字典可以灵活调整处理参数
- 错误处理:完善的异常处理和日志记录
- 类型提示:添加了类型提示,提高代码可读性和可维护性
- 灵活性:支持自定义过滤条件、输出列等
- 可扩展性:易于添加新的处理功能或修改现有逻辑
- 清晰的文档:每个方法都有详细的文档字符串说明参数和返回值
使用方法:
python
# 简单使用
processor = PatentProcessor()
results = processor.process_patent_file('path/to/patents.csv')
# 自定义配置使用
config = {
'filter_condition': {'column': 'assignee', 'value': 'Google', 'case_sensitive': False},
'required_columns': ['id', 'inventor/author', 'title', 'priority date']
}
processor = PatentProcessor(config)
results = processor.process_patent_file('path/to/patents.csv')