1,需求
文本列表如下:
SQL | 平均耗时 | 执行次数 |
---|---|---|
SELECT DISTINCT object from t1 | 5000 | 23 |
SELECT DISTINCT object as cnt from t1 | 5132 | 12 |
SELECT COUNT(*) FROM t3 | 5678 | 56 |
SELECT COUNT(*) as cnt FROM t3 | 5001 | 3 |
经 Python 脚本处理后(相似的 SQL 合并为一条:平均耗时取各条的算术平均值,执行次数求和),结果如下:
SQL | 平均耗时 | 执行次数 |
---|---|---|
SELECT DISTINCT object from t1 | 5066 | 35 |
SELECT COUNT(*) FROM t3 | 5339.5 | 59 |
2,Python 根据文本相似度(fuzzywuzzy 的 fuzz.ratio,相似度大于 60 视为同一条 SQL)去重,并归类统计
python
import pandas as pd
from fuzzywuzzy import fuzz
from collections import defaultdict
def read_excel(file_path):
    """Load the spreadsheet at *file_path* and return it as a DataFrame.

    The path is handed to ``pd.read_excel`` unchanged, with that
    function's default options.
    """
    return pd.read_excel(file_path)
def _find_best_match(text, candidates, scorer):
    """Return ``(key, score)`` for the candidate most similar to *text*.

    Candidates are scanned in iteration order and only a strictly higher
    score replaces the current best, so the earliest candidate wins ties.
    Returns ``(None, 0)`` when no candidate scores above zero.
    """
    best_key, best_score = None, 0
    for key in candidates:
        score = scorer(text, key)
        if score > best_score:
            best_key, best_score = key, score
    return best_key, best_score


def process_data(df, threshold=60, scorer=None):
    """Group rows with similar text and aggregate their numeric columns.

    Rows are visited in order. Each row's text (first column) is compared
    with the text of every group created so far; if the best similarity
    score is strictly greater than *threshold*, the row is merged into
    that group, otherwise it starts a new group keyed by its own text.
    The group keeps the text of the first row that created it.

    Args:
        df: DataFrame whose first three columns are, by position, the
            text, the value to average, and the value to sum. Column
            names are not used.
        threshold: Similarity score a row must exceed to be merged into
            an existing group. Defaults to 60.
        scorer: Callable ``scorer(text, key)`` returning a numeric
            similarity score. Defaults to ``fuzz.ratio``.

    Returns:
        A DataFrame with one row per group and the columns ``index``
        (group text), ``sum_col3``, ``avg_col2``, ``count_col2`` and
        ``sum_col2``, in that order. ``avg_col2`` is the plain mean of the
        second-column values merged into the group. An input without rows
        yields an empty frame that still has all five columns.
    """
    if scorer is None:
        # Looked up at call time so the default remains the module-level
        # fuzzywuzzy scorer.
        scorer = fuzz.ratio

    stat_columns = ['sum_col3', 'avg_col2', 'count_col2', 'sum_col2']
    groups = {}

    # Read the columns by position with .iloc: integer keys on a Series
    # that has string labels (row[0]) are deprecated in recent pandas.
    rows = zip(df.iloc[:, 0], df.iloc[:, 1], df.iloc[:, 2])
    for text_col, num_col2, num_col3 in rows:
        best_match, max_ratio = _find_best_match(text_col, groups, scorer)
        if best_match is not None and max_ratio > threshold:
            stats = groups[best_match]
            stats['sum_col2'] += num_col2
            stats['count_col2'] += 1
            stats['sum_col3'] += num_col3
        else:
            # Not similar enough to any existing group: start a new one.
            groups[text_col] = {
                'sum_col3': num_col3,
                'avg_col2': 0,
                'count_col2': 1,
                'sum_col2': num_col2,
            }

    # Every group holds at least one row, so the division is always safe.
    for stats in groups.values():
        stats['avg_col2'] = stats['sum_col2'] / stats['count_col2']

    # Passing the column list explicitly keeps the schema stable even when
    # there are no groups at all.
    records = [{'index': key, **stats} for key, stats in groups.items()]
    return pd.DataFrame(records, columns=['index'] + stat_columns)
# Load the spreadsheet, merge similar SQL rows, and save the result as CSV.
file_path = '/home/test/SQL.xlsx'  # replace with the path to your Excel file
df = read_excel(file_path)
result_df = process_data(df)
# result_df columns: index, sum_col3, avg_col2, count_col2, sum_col2
df2 = result_df[['index', 'avg_col2', 'sum_col3']]
# index=False keeps the DataFrame's row numbers out of the file, so the CSV
# holds exactly three columns: SQL text, average time, execution count.
df2.to_csv('example.csv', index=False)