关于论文
通过规模、动量、交易量以及波动性这四个主题因子,来分析不同特征下加密货币的市场表现。文章将加密货币每周的这四类特征划分为五个分位数,并根据第五分位数和第一分位数之间的差异,形成多空策略。关于因子可以查阅论文 doi.org/10.1111/jof...
关于数据
因为coinmarketcap.api要付费,所以我换成了CoinGecko,里边的api免费www.coingecko.com/zh/api/docu...
获取数据
py
# Connect to the CoinGecko API; the free tier does not require a key.
cg = CoinGeckoAPI()
# Returns one record per cryptocurrency; each record is a dict with the
# three fields id, symbol and name (see the sample output below).
cg.get_coins_list()
#coin_list = cg.get_coins_list()
# input:coin_list[1]
# output:{'id': '0-5x-long-algorand-token','symbol': 'algohalf','name': '0.5X Long Algorand'}
# Fetch one cryptocurrency's data over a period. The result is a dict whose
# three keys hold prices, market_caps and total_volumes.
# Under each key the data is a list of [timestamp, value] entries.
cg.get_coin_market_chart_range_by_id(
crypto,# cryptocurrency id
vs_currency='usd',# quote currency
from_timestamp=begin_date,# start of the range
to_timestamp=end_date)# end of the range
# input:info['prices'][0]
# output:[[1623196800000, 0.0014762190674748388],[1623283200000, 0.0017628434668555861]...]
# Note: from_timestamp and to_timestamp take epoch timestamps rather than
# the usual formatted date strings, so a conversion is needed first.
思路是先找到所有加密货币的id,再写一个遍历函数,每获取到一种加密货币的数据后,就存储到统一的DataFrame中
py
def timeStamp(timeNum):
    """Format a millisecond epoch timestamp as a local-time string.

    Args:
        timeNum: Epoch time in milliseconds.

    Returns:
        The instant rendered as ``YYYY-MM-DD HH:MM:SS`` in the local
        timezone of the machine running the script.
    """
    seconds = float(timeNum / 1000)
    local_struct = time.localtime(seconds)
    return time.strftime("%Y-%m-%d %H:%M:%S", local_struct)
# Collect the id of every cryptocurrency listed by CoinGecko into id_list.
cg = CoinGeckoAPI()
coin_list = cg.get_coins_list()
id_list = [entry['id'] for entry in coin_list]
# Express the sampling window as integer epoch seconds (local time); these
# values are later passed as from_timestamp / to_timestamp.
_DATE_FORMAT = "%Y-%m-%d %H%M%S"
begin_date = int(time.mktime(time.strptime("2020-01-01 000000", _DATE_FORMAT)))
end_date = int(time.mktime(time.strptime("2023-12-31 000000", _DATE_FORMAT)))
def _series_frame(rows, column):
    """Turn ``[timestamp_ms, value]`` rows into a ``date`` / ``column`` frame."""
    return pd.DataFrame({
        'date': [timeStamp(row[0]) for row in rows],
        column: [row[1] for row in rows],
    })


# Get coin market data by id and gather everything into df_total.
# Attention: the free API limits the request frequency, so we wait 1 minute
# after every 50 requests.
# Per-coin frames are collected and concatenated once at the end; growing
# df_total with pd.concat inside the loop re-copies all earlier rows on
# every iteration.
frames = []
for count, crypto in enumerate(tqdm(id_list), start=1):
    info = cg.get_coin_market_chart_range_by_id(
        crypto,
        vs_currency='usd',
        from_timestamp=begin_date,
        to_timestamp=end_date)
    # Inner joins on 'date': only timestamps present in all three series
    # (price, market cap, volume) are kept.
    df = pd.merge(_series_frame(info['prices'], 'price'),
                  _series_frame(info['market_caps'], 'mcap'),
                  on='date')
    df = pd.merge(df, _series_frame(info['total_volumes'], 'vol'), on='date')
    df['crypto'] = crypto
    frames.append(df)
    if count % 50 == 0:
        time.sleep(60)
# pd.concat raises on an empty list, so fall back to an empty frame.
df_total = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
df_total.to_csv(path+os.sep+'df_total01.csv',encoding = 'utf_8_sig')
因为不能频繁的request,所以要设置计数器每执行50次,就停顿1分钟时间
数据清洗和处理
先删除有缺失的条目,减少后期无效数据的处理量
py
# Order the rows by date, parse the date column, then delete rows without
# complete info: a 0 anywhere in a row is treated as missing, like NaN.
all_data = (
    all_data
    .sort_values(by='date')
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
    .replace(0, np.nan)
    .dropna()
)
论文中的数据是以周为单位的,而我获取到的数据是以日为单位,这就意味着,我要将所有日数据改为周数据,并需要有字段区分不同的周。
这里我用了一个算法效率不高但是可行的办法,即设定一个开始日期,然后开始遍历所有的日期,若所遍历的日期属于开始日期的未来7天,则该日期记为第一周,接着开始日期加七天,重复上述流程,所遍历日期在新的时间段的,记为第二周,从而依次记录到最后一周,具体代码实现路径如下,预计用时15分钟
另外要将价格信息,转化为收益率,我的思路是先挑选出所有加密货币,挑选出之后再做滞后和计算,最后将所有结果再合并到一起,这样就能解决上述问题,但由于首行数据没有上一期的数据,所以无法得出收益率,因此最后还需要将该部分缺失数据的行数删除。具体代码实现路径如下:
py
# Compute each cryptocurrency's daily return from its own price history.
crypto_list = list(all_data['crypto'].unique())
per_crypto_frames = []
print('Computing daily return...')
for crypto in tqdm(crypto_list):
    # .copy() gives an independent frame, so the column assignments below
    # never write to a view of all_data (SettingWithCopy hazard).
    single_data = all_data[all_data['crypto'] == crypto].copy()
    single_data['last_price'] = single_data['price'].shift(1)
    single_data['daily_ret'] = (
        (single_data['price'] - single_data['last_price'])
        / single_data['last_price']
    )
    per_crypto_frames.append(single_data)
# Concatenate once instead of inside the loop. The list is reversed so the
# row order matches the previous behaviour, where each new coin was
# prepended to the running result.
if per_crypto_frames:
    new_all_data = pd.concat(per_crypto_frames[::-1], ignore_index=True)
else:
    new_all_data = pd.DataFrame()
# Drop only rows with missing values, i.e. each coin's first observation,
# which has no previous price. Zeros are no longer turned into NaN first:
# a daily return of exactly 0 is valid data and must not be discarded.
new_all_data = new_all_data.dropna()
在有了周、价格、市值、交易量和收益率这几个字段后,便可以对数据进行处理计算,得到我们所需要的因子和收益率。这部分的思路是,拎出单种加密货币,再从单种加密货币的dataframe中遍历拎出不同周的数据,依次进行计算,并将结果合并在一起。计算完周收益率以及四类主题因子后,再计算加密货币市场总市值以及市场风险溢价,用于后续的单因子模型中。这里用到的无风险利率为美国十年期国债利率,按照date与df_week合并在了一起。
计算因子
py
# Compute the factors we need in later research: one row per (crypto, week).
print('Computing Factors...')
# Sort the distinct week labels. Set iteration order is not guaranteed, but
# [1:] must drop the smallest label deterministically and the rows must be
# emitted in chronological order because the lag step uses positional shifts.
weeks = sorted(set(week_list))[1:]
# Rows are gathered in a list and turned into a frame once:
# DataFrame.append was removed in pandas 2.0 and copied the frame per call.
weekly_rows = []
for crypto in tqdm(crypto_list):
    crypto_data = new_all_data[new_all_data['crypto'] == crypto]
    # NOTE(review): AGE is the coin's total number of daily rows, so it is
    # the same for every week of that coin -- confirm this is intended.
    AGE = len(crypto_data)
    for week in weeks:
        data = crypto_data[crypto_data['week'] == week]
        if data.empty:
            continue
        # Weekly return plus the size, volume and volatility factors.
        weekly_rows.append({
            'crypto': crypto,
            'date': data['date'].iloc[-1],
            'week': week,
            'PRC': np.log(data['price'].iloc[-1]+1),
            'MAXDPRC': max(data['price']),
            'MCAP': np.log(data['mcap'].iloc[-1]+1),
            'mcap': data['mcap'].iloc[-1],
            'AGE': AGE,
            'VOL': np.log(np.average(data['vol']+1)),
            'PRCVOL': np.log(np.average(data['vol']*data['price'])+1),
            'STDPRCVOL': np.log(np.std(data['vol']*data['price'])+1),
            # Compound the daily returns of the week.
            'week_ret': (data['daily_ret'] + 1).prod() - 1,
        })
# The result now carries a 0..n-1 index instead of an all-zero one.
df_week = pd.DataFrame(weekly_rows)
# Momentum factors: lagged market cap and lagged weekly returns per crypto.
# NOTE(review): shift() is positional, so a lag spans more than one calendar
# week wherever a coin has no row for some week -- confirm this is acceptable.
lagged_frames = []
print('Computing laging data....')
for crypto in tqdm(crypto_list):
    # .copy() so the new columns are written to an independent frame rather
    # than to a filtered view of df_week.
    crypto_data = df_week[df_week['crypto'] == crypto].copy()
    crypto_data['mcap_lag1'] = crypto_data['mcap'].shift(1)
    for lag in (1, 2, 3, 4):
        crypto_data['week_ret_lag' + str(lag)] = crypto_data['week_ret'].shift(lag)
    # Compounded return over the previous four weeks.
    crypto_data['week_ret_lag1-4'] = (
        (crypto_data['week_ret_lag1'] + 1)
        * (crypto_data['week_ret_lag2'] + 1)
        * (crypto_data['week_ret_lag3'] + 1)
        * (crypto_data['week_ret_lag4'] + 1)
        - 1
    )
    lagged_frames.append(crypto_data)
# Concatenate once, with a real boolean for ignore_index. Reversed to keep
# the previous row order, where each coin was prepended to the result.
if lagged_frames:
    df_week_with_lag = pd.concat(lagged_frames[::-1], ignore_index=True)
else:
    df_week_with_lag = pd.DataFrame()
df_week = df_week_with_lag
# Rows without a full set of lags are discarded.
df_week = df_week.dropna()
# Total market cap of all cryptocurrencies per week, and its week-over-week
# relative change Rm.
week_vs_mcap = (
    df_week.groupby('week')['mcap']
    .sum()
    .to_frame()
    .rename(columns={'mcap': 'all_crypto_mcap'})
)
week_vs_mcap['all_crypto_mcap_lag1'] = week_vs_mcap['all_crypto_mcap'].shift(1)
mcap_change = week_vs_mcap['all_crypto_mcap'] - week_vs_mcap['all_crypto_mcap_lag1']
week_vs_mcap['Rm'] = mcap_change / week_vs_mcap['all_crypto_mcap_lag1']
df_week = pd.merge(week_vs_mcap, df_week, on='week')
# Load and merge the risk-free rate file, convert the annual rate to a weekly
# rate, and compute the market excess return CMKT = Rm - weekly Rf.
# Reduce 'date' to a midnight datetime. The earlier approach stripped the
# literal ' 08:00:00', which only matches strings generated on a UTC+8
# machine and fails when the column already holds Timestamp values.
df_week['date'] = pd.to_datetime(df_week['date']).dt.normalize()
df_rf = pd.read_excel(path+os.sep+'US_T-bills.xlsx')
df_week = pd.merge(df_week,df_rf, on='date')
# Rf is an annual percentage: /100 -> fraction, /52 -> per week.
df_week['Rf'] = df_week['Rf']/100/52
df_rf = (
    df_week.groupby('week')['Rf']
    .mean()
    .to_frame()
    .rename(columns={'Rf': 'week_Rf'})
)
df_week = pd.merge(df_week,df_rf, on='week')
df_week['CMKT'] = df_week['Rm'] - df_week['week_Rf']
df_week = df_week.dropna()
print(df_week)
# Written next to the other data files via `path` instead of a hard-coded
# per-user Windows directory.
df_week.to_csv(path+os.sep+'df_week.csv',encoding='utf_8_sig')
我们现在已经将数据采集完毕,接下来就是搭建回测系统,基本思路是:
- 筛选标的
- 根据因子判断多空
- 执行多空策略
- 计算收益,验证策略
回测框架
py
# Back-test driver: week by week, pick the tradable universe, decide the
# long/short legs and book the positions; then evaluate the whole run.
portfolio = pd.DataFrame()
long = pd.Series(dtype=float)
short = pd.Series(dtype=float)
first_week = min(df_week['week']) + 1
final_week = max(df_week['week'])
for week in tqdm(range(first_week, final_week + 1)):
    # Select the stocks / cryptocurrencies eligible for this week.
    data = crypto_select(df_week, week)
    # Decide what to long or short according to the chosen factor.
    long, short = make_strategy_by_size(data, 1)
    # Execute the strategy and save the position in "portfolio"
    # (execute_strategy is defined elsewhere).
    execute_strategy(long, short, week)
portfolio = portfolio.dropna()
# Returns of the long and short legs.
PRC_result = cpt_return(portfolio)
print(PRC_result)
# Long-short strategy returns.
PRC_result = cpt_strategy_ret(PRC_result)
PRC_result.to_csv(path+os.sep+'PRC_result.csv',encoding = 'utf_8_sig')
# Assess the excess return with a regression.
runregression(PRC_result)
筛选标的
筛选市值超过100万美元的加密货币,将满足要求的加密货币筛选出来即可
py
def crypto_select(df, week):
    """Select the coins tradable in ``week`` using the previous week's data.

    We use T-1 information and trade at T: only rows of week ``week - 1``
    whose market cap exceeds 1,000,000 are kept.

    Args:
        df: Weekly frame with at least ``week`` and ``mcap`` columns.
        week: The week in which the trade takes place.

    Returns:
        A new DataFrame (a copy) holding the qualifying rows of the
        previous week.
    """
    last_week = week - 1
    # Both conditions must hold together. Previously the market-cap filter
    # was applied to the full frame again, discarding the week filter and
    # returning rows from every week.
    eligible = (df['week'] == last_week) & (df['mcap'] > 1000000)
    return df[eligible].copy()
根据因子判断多空
将池子里的加密货币划分为5个分位数,分别测试持有1至5分位数中加密货币的回报率,同时也测试了多头5分位数,空头1分位数的加密货币的回报率,即5-1,从而得出多空策略。拆解上述需求,即需要我们实现两点,将加密货币按某一指标划分多个bins,并根据bins归类为long or short中。
这里仅以规模因子为例,论文中策略验证显著的是PRC、MAXDPRC、MCAP这三个因子。用pandas库的qcut函数划分为五个分位数,再根据分位数分类为long和short部分。这里不用纠结到底应该long指标高的还是long指标低的,只要根据结果收益率的正负来判断即可。具体代码实现路径如下:
py
def make_strategy_by_size(data, n):
    """Split coins into quintiles of one size factor and pick both legs.

    The long leg is the highest quintile (bin 4) and the short leg the
    lowest quintile (bin 0) of the selected factor.

    Args:
        data: DataFrame with a ``crypto`` column and the factor column
            selected by ``n``. It is not modified.
        n: Factor selector: 1 = PRC, 2 = MAXDPRC, 3 = AGE, 4 = MCAP.

    Returns:
        A ``(long, short)`` tuple of Series holding the ``crypto`` values
        of each leg.

    Raises:
        ValueError: If ``n`` is not one of 1-4 (this used to surface as an
            UnboundLocalError at the return statement).
    """
    factor_by_choice = {1: 'PRC', 2: 'MAXDPRC', 3: 'AGE', 4: 'MCAP'}
    if n not in factor_by_choice:
        raise ValueError('n must be 1, 2, 3 or 4, got ' + repr(n))
    # Quintile labels are kept in a local Series instead of being written
    # into the caller's frame as an extra *_bins column.
    bins = pd.qcut(data[factor_by_choice[n]], 5, labels=False)
    long = data.loc[bins == 4, 'crypto']
    short = data.loc[bins == 0, 'crypto']
    return long, short
计算收益
这一部分是分别计算多头与空头的收益率,ew指的是equal weighted,而vw指的是value weighted,后者将收益率按照市值进行了加权
py
def _leg_returns(positions, leg):
    """Weekly equal- and value-weighted returns of one leg ('long'/'short')."""
    ew_name = 'portfolio_ew_' + leg
    vw_name = 'portfolio_vw_' + leg

    def summarise(group):
        weights = group['mcap_lag1']
        return pd.Series({
            ew_name: group['week_ret'].mean(),
            vw_name: (group['week_ret'] * weights).sum() / weights.sum(),
        })

    selected = positions[positions['position'] == leg]
    grouped = selected.groupby(['week', 'position'])
    return grouped.apply(summarise).reset_index()


def cpt_return(x):
    """Compute weekly returns of the long and the short leg.

    Args:
        x: Positions frame with ``week``, ``position`` ('long' or 'short'),
            ``week_ret`` and ``mcap_lag1`` columns.

    Returns:
        One row per week (outer join of both legs) with the equal-weighted
        (``portfolio_ew_*``) and lagged-market-cap-weighted
        (``portfolio_vw_*``) return of each leg.
    """
    long_portfolios = _leg_returns(x, 'long')
    short_portfolios = _leg_returns(x, 'short')
    return pd.merge(long_portfolios, short_portfolios, on='week', how='outer')
计算策略收益
py
def cpt_strategy_ret(df):
    """Derive long-minus-short strategy returns, plot them and print averages.

    Args:
        df: Weekly frame with ``week`` and the ``portfolio_{ew,vw}_{long,short}``
            columns. Missing values are treated as 0.

    Returns:
        The frame sorted by week with ``strategy_vw``, ``strategy_ew``,
        ``cum_ew`` and ``cum_vw`` columns added.
    """
    plt.rcParams["figure.figsize"] = (10,7)
    df = df.replace(np.nan, 0)
    # Weekly long-minus-short return for each weighting scheme.
    for weighting in ('vw', 'ew'):
        long_leg = df['portfolio_' + weighting + '_long']
        short_leg = df['portfolio_' + weighting + '_short']
        df['strategy_' + weighting] = long_leg - short_leg
    df = df.sort_values('week')
    # Compounded strategy return up to each week.
    for weighting in ('ew', 'vw'):
        df['cum_' + weighting] = (df['strategy_' + weighting] + 1).cumprod() - 1
    draw_curve_cum_ret(df)
    draw_curve_ret(df)
    # The short leg is reported with its sign flipped.
    print('avg_long_ew: ', np.average(df['portfolio_ew_long']))
    print('avg_short_ew: ', np.average(-df['portfolio_ew_short']))
    print('avg_long_vw: ', np.average(df['portfolio_vw_long']))
    print('avg_short_vw: ', np.average(-df['portfolio_vw_short']))
    return df
def draw_curve_cum_ret(X):
    """Plot the value-weighted strategy's cumulative return against week.

    Args:
        X: DataFrame with ``week`` and ``cum_vw`` columns.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    series = X['cum_vw']
    plt.figure(dpi=150)
    plt.plot(X['week'], series, color='navy', lw=2, label='cum_vw')
    # Expand the y-range outwards to whole numbers. Rounding to the nearest
    # integer could cut off the curve (max 1.4 -> limit 1) or collapse the
    # range to (0, 0) when every value lies within +/-0.5.
    lower = np.floor(series.min())
    upper = np.ceil(series.max())
    # Leave matplotlib's autoscaling in place for empty or flat input.
    if np.isfinite(lower) and np.isfinite(upper) and lower < upper:
        plt.ylim((lower, upper))
    plt.xlabel('Week')
    plt.ylabel('Cumulative Return')
    # Show the label passed to plot().
    plt.legend()
    plt.show()
    return None
def draw_curve_ret(X):
    """Plot the value-weighted strategy's weekly return against week.

    Args:
        X: DataFrame with ``week`` and ``strategy_vw`` columns.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    series = X['strategy_vw']
    plt.figure(dpi=150)
    plt.plot(X['week'], series, color='navy', lw=2, label='strategy_vw')
    # Expand the y-range outwards to whole numbers. Rounding to the nearest
    # integer collapses the range to (0, 0) whenever all weekly returns lie
    # within +/-0.5, and can cut off larger values.
    lower = np.floor(series.min())
    upper = np.ceil(series.max())
    # Leave matplotlib's autoscaling in place for empty or flat input.
    if np.isfinite(lower) and np.isfinite(upper) and lower < upper:
        plt.ylim((lower, upper))
    plt.xlabel('Week')
    plt.ylabel('Return')
    # Show the label passed to plot().
    plt.legend()
    plt.show()
    return None
虽然累计收益率是负数,但是这是在制定量化交易策略,如果调转多空头,那么200周之后收益率可以超过100%。从平均收益来看,多头的周收益率为1.5%,空头的周收益率为-3.9%。综上来看,应当采取long低PRC的加密货币、short高PRC的加密货币这样的策略,即实际应采取的策略为1-5而非5-1。另外,以上收益率为周回报率,若换算为年回报率,那数字将不可小觑。
不过究竟是这一策略所带来如此丰厚的收益,还是因为其他市场共性因素导致的,则还需要在后续的单因子模型中加以确认
线性回归模型
py
def runregression(result):
    """Regress the strategy's excess return on the market factor CMKT.

    Args:
        result: Weekly strategy frame with ``week`` and ``strategy_vw``
            columns.

    Prints the OLS summary of ``strategy_vw - week_Rf`` on an intercept and
    ``CMKT``; the weekly ``week_Rf`` and ``CMKT`` values are read from the
    module-level ``df_week``.
    """
    market = df_week[['week', 'week_Rf', 'CMKT']].drop_duplicates()
    result = pd.merge(result, market, on='week')
    result['strategy_vw_rf'] = result['strategy_vw'] - result['week_Rf']
    model = smf.ols('strategy_vw_rf ~ 1 + CMKT', result)
    print(model.fit().summary())
取得的R2并不是很高,但截距项的t检验还是比较显著的,在10%的显著性水平(即90%的置信水平)下,可以认为该策略能够带来超额收益。