业务实现：在原有的 Excel 文件中新增一个数据列 status，根据每一行 name 的匹配情况写入不同的值，分别为：空白行、已爬取、已改名为{新名称}、爬取异常、改名未知异常。
python
# -*- coding: utf-8 -*-
import io
import os
import re
import sys
import numpy as np
import pandas as pd
import pandas.io.formats.excel
# Module-level configuration applied before any data is read or written.
pandas.io.formats.excel.ExcelFormatter.header_style = None # do not bold the header row
# Python 2 idiom: reload ``sys`` and set the interpreter's default string
# encoding to UTF-8.
reload(sys)
sys.setdefaultencoding('utf-8')
def change_data(data):
    """Normalise a name so that spelling variants compare equal.

    Three clean-up steps are applied in order: every ASCII space is removed,
    every ``(...)`` group (ASCII parentheses together with what they enclose)
    is removed, and every ``*`` is removed.

    Args:
        data: The text to normalise.

    Returns:
        The cleaned text, or ``None`` when ``data`` cannot be processed as
        text (for example ``None`` or a number).
    """
    try:
        data = data.replace(' ', '')  # drop spaces
        data = re.sub(r'\([^)]*\)', '', data)  # drop parentheses and their content
        return re.sub(r'\*', '', data)  # drop asterisks
    except (AttributeError, TypeError, UnicodeError):
        # Keep the best-effort contract (None for unusable input), but only
        # for the failures that non-text input can cause, so that unrelated
        # errors are no longer silently hidden.
        return None
# Normalised names of every entry in the ``country`` folder beside this script.
country_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'country')
dir_name_set = {
    change_data(entry)
    for entry in os.listdir(unicode(country_dir, 'utf-8'))
}
# Rename dictionary: normalised old name -> normalised new name.
# Each usable line of ``spacial_country.txt`` holds two space-separated names;
# when a name contains '/', only the part before the first '/' is kept.
spacial_name_dict = {}
rename_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'spacial_country.txt')
with io.open(rename_path, 'r', encoding='utf-8') as f:
    for line in f:
        # Drop empty tokens so that repeated spaces between the two names do
        # not produce a blank new name.
        line_split = [token for token in line.strip().split(' ') if token]
        if len(line_split) < 2:
            # Blank or incomplete line (e.g. an empty last line): there is no
            # pair to record, and indexing it would raise IndexError.
            continue
        name1 = change_data(line_split[0]).split('/')[0]
        name2 = change_data(line_split[1]).split('/')[0]
        spacial_name_dict[name1] = name2
# Load the ``species_data`` sheet of ``country.xlsx`` (located beside this
# script) and fill a ``status`` column on a copy of it, one row at a time.
filename = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'country.xlsx')
sheet_name = 'species_data'
df = pd.read_excel(filename, sheet_name=sheet_name)
dfc = df.copy()  # annotate a copy so ``df`` stays exactly as loaded

PROGRESS_INTERVAL = 10000  # a checkpoint is reached every this many rows
# NOTE(review): the annotated sheet is written only when the checkpoint
# counter equals this value. Checkpoints happen every PROGRESS_INTERVAL rows
# and on the last row, so a file is produced only if the sheet reaches a 15th
# checkpoint -- confirm this matches the current data size, or write the
# result once after the loop instead.
SAVE_AT_CHECKPOINT = 15
idx = 1  # 1-based checkpoint counter

# NOTE(review): the nesting below was reconstructed from a paste that had lost
# its indentation -- confirm it matches the intended control flow.
for index, row in dfc.iterrows():  # yields each row's index label and its data
    name = row['name']
    try:
        if isinstance(name, float) and np.isnan(name):
            status = '空白行'
        else:
            name = change_data(name)
            if name in dir_name_set:
                status = '已爬取'
            elif name in spacial_name_dict:
                # Membership is tested on the dict itself: ``.keys()`` builds
                # a list on Python 2, which made every row a linear scan.
                new_name = spacial_name_dict[name]
                if new_name in dir_name_set:
                    status = '已改名为{}'.format(new_name)
                else:
                    status = '改名未知异常'
            else:
                status = '爬取异常'
        dfc.loc[index, 'status'] = status

        is_last_row = index == len(dfc) - 1
        if (index != 0 and index % PROGRESS_INTERVAL == 0) or is_last_row:
            print('{} {}'.format(index, idx))
            if idx == SAVE_AT_CHECKPOINT:
                output_path = os.path.join(
                    os.path.dirname(os.path.abspath(__file__)),
                    'country_{}.xlsx'.format(idx),
                )
                print('output_path {}'.format(output_path))
                # ``to_excel`` only reads the frame, so no extra copy is made.
                dfc.to_excel(output_path, sheet_name=sheet_name, index=False, header=True)
            idx += 1
    except Exception as e:
        # Best effort: report the offending name and carry on with the next
        # row; that row's ``status`` cell is left unset.
        print(u'{} {}'.format(name, e))