import newfeature
import numpy as np
import pandas as pd
import tushare as ts
import baostock as bs
import talib
import time
import bcolz
# Current local date as 'YYYY-MM-DD', captured once when the module is imported.
# strftime() formats the current local time when no time tuple is supplied.
today = time.strftime('%Y-%m-%d')
def get_quantiles(data, x):
    """Return the bin edges that split ``data`` into ``x`` quantile buckets.

    The result holds ``x + 1`` edges: a very large negative sentinel
    (-1e100), the ``x - 1`` interior quantiles of ``data`` at 1/x, 2/x, ...,
    (x-1)/x (each rounded to 3 decimals), and a very large positive
    sentinel (1e100), so every finite value falls inside some bin.
    """
    interior_edges = []
    for k in range(1, x):
        edge = np.quantile(data, k / x)
        interior_edges.append(np.round(edge, 3))
    return np.r_[-1e100, interior_edges, 1e100]
def precode(c):
    """Prefix a bare stock code with its exchange tag.

    Codes whose first character is '6' get the 'sh.' prefix; everything
    else (including an empty string) gets 'sz.'.
    """
    exchange = 'sh.' if c.startswith('6') else 'sz.'
    return exchange + c
def cut_to1(alpha, minx=-1, maxx=1):
    """Return a copy of ``alpha`` with values capped to [minx, maxx].

    The input is left untouched. Values above ``maxx`` are replaced by
    ``maxx`` first, then values below ``minx`` are replaced by ``minx``
    (used to tame outliers in the derived features).
    """
    clipped = alpha.copy()
    above_cap = clipped > maxx
    clipped[above_cap] = maxx
    # Evaluated after the upper cap has been applied, same as a sequential clip.
    below_floor = clipped < minx
    clipped[below_floor] = minx
    return clipped
def getbaostock(symbol, start='2015-01-01', end='2020-10-25', ktype='D', index=0):
    """Fetch historical K-line data from baostock and return it as a DataFrame.

    Parameters:
        symbol: baostock security code passed straight to the query.
        start, end: date range strings ('YYYY-MM-DD') for the query.
        ktype: bar frequency passed as ``frequency`` (default 'D').
        index: selects the field list.  ``index == 0`` requests price/volume
            plus turnover and valuation columns (peTTM, pbMRQ, psTTM,
            pcfNcfTTM, isST) and appends derived features; any other value
            requests only OHLC, preclose, volume, amount and pctChg.
            NOTE(review): an earlier comment claimed 0 = market index and
            1 = individual stock, which contradicts the code - the valuation
            fields only make sense for individual stocks.  Confirm with callers.

    Returns:
        A DataFrame indexed by 'date' (values cast to float32 before any
        features are derived), or an empty list when 50 or fewer rows were
        fetched - which includes the case where the query reported an error,
        because no rows are read then.
    """
    if index == 0:
        infields = "date,close,open,preclose,low,high,turn,volume,amount,peTTM,pbMRQ,psTTM,pcfNcfTTM,isST"
    else:
        infields = "date,open,high,low,close,preclose,volume,amount,pctChg"

    # adjustflag="2": presumably forward-adjusted prices - confirm in baostock docs.
    rs = bs.query_history_k_data_plus(symbol, infields, start_date=start, end_date=end,
                                      frequency=ktype, adjustflag="2")

    # Short-circuit `and` (not bitwise `&`) so the cursor is never advanced
    # once the result set reports an error.
    data_list = []
    while rs.error_code == '0' and rs.next():
        data_list.append(rs.get_row_data())
    result = pd.DataFrame(data_list, columns=rs.fields)

    # Too little history to be useful: keep the legacy "empty list" signal.
    if len(result) <= 50:
        return []

    result = result.sort_values('date').set_index('date').fillna(0)
    # Values are still strings here; drop rows with zero or missing volume
    # (suspended trading days).
    result = result[result['volume'] != '0']
    result = result[result['volume'] != '']
    # NOTE(review): astype raises if any remaining cell is an empty string -
    # verify whether baostock can return '' in the other columns.
    result = result.astype('float32')

    if index == 0:
        # Keep only rows with a positive price-to-sales ratio (psTTM); take an
        # explicit copy so the column assignments below do not write to a
        # filtered view (avoids SettingWithCopyWarning).
        result = result[result['psTTM'] > 0].copy()
        # Derived features.  0.637 is approximately 2/pi, so 0.637 * arctan(.)
        # squashes each raw ratio into roughly (-1, 1).
        # 'flowmkt' must be computed before 'turn' is overwritten below,
        # because it uses the raw turnover value.
        result['flowmkt'] = 0.637*np.arctan(np.log(0.00000001*result['close']*result['volume']/result['turn']))
        result['everyprofit'] = 0.637*np.arctan(10/result.peTTM)
        result['turn'] = 0.637*np.arctan((result.turn/10)**0.2)
        result['everypb'] = 0.637*np.arctan(2/result.pbMRQ)
        result['eversale'] = 0.637*np.arctan(2/np.sqrt(result.psTTM))
        result['evermoney'] = 0.637*np.arctan(20/result.pcfNcfTTM)
        # NaNs produced by the transforms above (e.g. log of a non-positive
        # number) are replaced with 0.
        result.fillna(0, inplace=True)
    return result
def saveData(path,symbol,data,szzs,days,binscount,r=1,isfeature=1):
"""
path:文件存储路径
symbol:股票代码
data:个股数据
szzs:上证指数数据
days:默认为5,代表交易日
"""
#如果是用talib、newfeatures计算的数据,需要解除前120条数据
data.dropna(axis=0,inplace=True)
# data['amount']=0.25*(data.open+data.close+data.high+data.low)*data['volume']
data_price=data.values
# 特征工程,组合生成新特征(有些特征往上证指数szzs数据加,又通过szzs新特征算出个股新特征,有些往个股数据加)
data['szpctchg1']=18*szzs['close']/szzs['close'].shift(1)-18 # 今日收盘/昨日收盘 - 1 即18*(当日涨幅 - 1)
data['szpctchg2']=19*szzs['close'].rolling(2).mean()/szzs['close'].shift(2)-19 # 19*(前3日收盘价均值(含当日)/前2日收盘价 - 1)
data['szpctchg4']=10*szzs['close'].rolling(2).mean()/szzs['close'].shift(4)-10 # 10*(前3日收盘价均值(含当日)/前4日收盘价 - 1)
data['szpctchg8']=10*szzs['close'].rolling(4).mean()/szzs['close'].shift(8)-10 # 10*(前4日收盘价均值(含当日)/前8日收盘价 - 1)
data['szpctchg15']=8*szzs['close'].rolling(5).mean()/szzs['close'].shift(15)-8
data['szpctchg30']=6*szzs['close'].rolling(10).mean()/szzs['close'].shift(30)-6
szzs['sma5']=szzs['close'].rolling(5).mean() # 大盘5日均线
szzs['sma10']=szzs['close'].rolling(10).mean()
szzs['sma20']=szzs['close'].rolling(20).mean()
szzs['sma40']=szzs['close'].rolling(40).mean()
szzs['sma80']=szzs['close'].rolling(80).mean()
szzs['max20']=szzs['high'].rolling(20).max().shift() # 大盘20日最高价
szzs['max60']=szzs['high'].rolling(60).max().shift()
szzs['max120']=szzs['high'].rolling(120).max().shift()
szzs['min20']=szzs['low'].rolling(20).min().shift() # 大盘20日最低价
szzs['min60']=szzs['low'].rolling(60).min().shift()
szzs['min120']=szzs['low'].rolling(120).min().shift()
data['zs20_min']=cut_to1(2*szzs['close']/szzs['min20']-2)### 大盘2倍的20日内较最低价的涨幅(涨幅超50截取,或跌幅超50%被截取)
data['zs60_min']=cut_to1(2*szzs['close']/szzs['min60']-2)###
data['zs120_min']=cut_to1(2*szzs['close']/szzs['min120']-2)###
data['zs20_max']=cut_to1(4*szzs['close']/szzs['max20']-4)### 大盘4倍的20日内较最高价的跌幅(涨幅超过25%截取,跌幅超过25%截取)
data['zs60_max']=cut_to1(4*szzs['close']/szzs['max60']-4)###
data['zs120_max']=cut_to1(4*szzs['close']/szzs['max120']-4)###
data['zs5_d']=cut_to1(20*szzs['sma5'].diff()/szzs['sma5'].shift()) # 大盘5日均线差分/昨日5日均线值,涨跌幅限制-5% ~ 5%截取
data['zs10_d']=cut_to1(40*szzs['sma10'].diff()/szzs['sma10'].shift())
data['zs20_d']=cut_to1(60*szzs['sma20'].diff()/szzs['sma20'].shift())
data['zs40_d']=cut_to1(80*szzs['sma40'].diff()/szzs['sma40'].shift())
data['zs80_d']=cut_to1(100*szzs['sma80'].diff()/szzs['sma80'].shift())
data['zsr5']=cut_to1(10*szzs['close']/szzs['sma5']-10) # 大盘收盘价偏离5日均线的幅度,涨跌幅限制-10% ~ 10%截取
data['zsr10']=cut_to1(8*szzs['close']/szzs['sma10']-8)
data['zsr20']=cut_to1(6*szzs['close']/szzs['sma20']-6)
data['zsr40']=cut_to1(6*szzs['close']/szzs['sma40']-6)
data['zsr80']=cut_to1(4*szzs['close']/szzs['sma80']-4)
szzs['r5_20']=(4*szzs['sma5']/szzs['sma20']-4) # 大盘5日均线偏离20日均线幅度,涨跌幅限制-25% ~ 25%截取
szzs['r10_40']=(4*szzs['sma10']/szzs['sma40']-4)
szzs['r20_80']=(4*szzs['sma20']/szzs['sma80']-4)
szzs['r5_40']=(4*szzs['sma5']/szzs['sma40']-4)
data['zsr5_20_d']=cut_to1(10*szzs['r5_20'].diff()) # 大盘5日均线较20日均线的偏移幅度的变化值,反应5日和20日均偏离的剧烈程度
data['zsr10_40_d']=cut_to1(20*szzs['r10_40'].diff())
data['zsr20_80_d']=cut_to1(20*szzs['r20_80'].diff())
data['zsr5_20']=cut_to1(szzs['r5_20'])
data['zsr10_40']=cut_to1(1*szzs['r10_40'])
data['zsr20_80']=cut_to1(1*szzs['r20_80'])
data['zsr5_40']=cut_to1(1*szzs['r5_40'])
szzs['v5']=(10*szzs['volume']/(0.1+szzs['volume'].rolling(5).mean())) # 大盘交易量较5日均交易量均值偏移幅度,0.1是考虑到rolling前几个值为null
szzs['v10']=(10*szzs['volume']/(0.1+szzs['volume'].rolling(10).mean())) # 衡量交易量较平均值异常增减的情况
szzs['v20']=(10*szzs['volume']/(0.1+szzs['volume'].rolling(20).mean()))
szzs['v40']=(10*szzs['volume']/(0.1+szzs['volume'].rolling(40).mean()))
szzs['v80']=(10*szzs['volume']/(0.1+szzs['volume'].rolling(80).mean()))
data['zsv3_9']=(10*szzs['volume'].rolling(3).mean()/(0.1+szzs['volume'].rolling(9).mean())) # 大盘3日交易量较9日均交易量偏移幅度
data['zsv5_20']=(10*szzs['volume'].rolling(5).mean()/(0.1+szzs['volume'].rolling(20).mean()))
data['zsv9_50']=(10*szzs['volume'].rolling(9).mean()/(0.1+szzs['volume'].rolling(50).mean()))
data['zsv5_d']=0.6366*np.arctan(0.2*szzs['v5'].diff()) # 交易量较交易量均线的偏置幅度变化,衡量交易量的变动情况
data['zsv10_d']=0.6366*np.arctan(0.1*szzs['v10'].diff()) # arctan()用于归一化,把任何取值范围数据压缩到0-1
data['zsv20_d']=0.6366*np.arctan(0.2*szzs['v2