#!/usr/bin/python
# -*- coding:UTF-8 -*-
import pandas as pd
import numpy as np
import time
from sklearn import cross_validation
import xgboost as xgb
# Path to the shop-info CSV (no header row, so column names are supplied below).
DATA_DIR = "F:\\workspace\\Python Test\\ijcai\\shop_info.csv"
shop_info_column_names=["shop_id","city_name","location_id","per_pay","score","comment_cnt","shop_level","cate_1","cate_2","cate_3"]
# user_pay_colimn_names=["user_id","shop_id","time_stamp"]  # NOTE: the groupby over user_pay is awkward in plain Python; it was done with Spark's sparkContext.sql() and saved to a file instead.
shop_info = pd.read_csv(DATA_DIR,names=shop_info_column_names)
# The flow file below was produced with sparkContext.sql(); that Spark code is not shown here.
flow_path = "F:\\workspace\\Python Test\\ijcai\\ml_flow_raw_data_file.txt\\part-00000"
merge_data_path = "F:\\workspace\\Python Test\\ijcai\\shop_info_flow.csv" # merged shop_info + flow features are cached here
feature_save_path = "F:\\workspace\\Python Test\\ijcai\\train.csv" # final generated feature matrix is cached here
def get_all_city(shop_info):
    """Return the distinct city names found in shop_info (122 cities in total)."""
    return list(set(shop_info["city_name"]))
def get_all_cate(shop_info):
    """Return the distinct category names across cate_1/cate_2/cate_3 (67 categories in total)."""
    all_cates = set(shop_info["cate_1"]) | set(shop_info["cate_2"]) | set(shop_info["cate_3"])
    return list(all_cates)
def chinese2list(all_chinese,word_name):
    """One-hot encode word_name against the vocabulary all_chinese.

    Returns a list the same length as all_chinese: 1 where the element
    equals word_name, 0 everywhere else.
    """
    return [1 if word == word_name else 0 for word in all_chinese]
def number2list(max_num,num):
    """One-hot encode num over the range 1..max_num.

    Example: number2list(7, 1) -> [1, 0, 0, 0, 0, 0, 0] (e.g. Monday of a week).
    num may be anything int() accepts (int, numeric string, ...).
    """
    target = int(num)
    return [1 if position == target else 0 for position in range(1, max_num + 1)]
def clean_shop_info(shop_info):
    """Fill missing values in shop_info in place.

    Text columns (city and the three category levels) default to "其他"
    ("other"); every remaining NaN is replaced with 0.
    """
    text_defaults = {column: "其他" for column in ("city_name", "cate_1", "cate_2", "cate_3")}
    shop_info.fillna(text_defaults, inplace=True)
    shop_info.fillna(0, inplace=True)
def get_flow(flow_path):
    """Parse the Spark-exported flow file into a DataFrame.

    Each input line looks like "(f0,f1,...,f10)"; field 0 is the shop id,
    field 9 a %Y-%m-%d date string and field 10 the flow value.

    Returns a DataFrame with columns shop_id (int), date, year, month,
    day, week_of_day (Monday=1 .. Sunday=7) and flow (float).
    """
    id_list = []
    time_stamp_list = []
    year_list = []
    month_list = []
    day_list = []
    day_of_week_list = []
    flow_list = []
    # "with" guarantees the handle is closed (the original leaked it);
    # read-only mode suffices — "r+" was never written to.
    with open(flow_path, "r") as f:
        for raw_line in f:
            # Strip the surrounding "(...)" then split the tuple fields.
            fields = raw_line.split("(")[1].split(")")[0].split(",")
            id_list.append(fields[0].strip())
            date = fields[9].strip()
            time_stamp_list.append(date)
            time_format = time.strptime(date, '%Y-%m-%d')
            year_list.append(time_format.tm_year)
            month_list.append(time_format.tm_mon)
            day_list.append(time_format.tm_mday)
            # tm_wday is 0-based (Monday=0); shift to 1..7.
            day_of_week_list.append(time_format.tm_wday + 1)
            flow_list.append(fields[10].strip())
    return_df = pd.DataFrame({"shop_id":id_list,"date":time_stamp_list,"year":year_list,"month":month_list,"day":day_list,"week_of_day":day_of_week_list,"flow":flow_list})
    return_df["shop_id"] = return_df["shop_id"].astype(int)
    return_df["flow"] = return_df["flow"].astype(float)
    return return_df
def merge_shop_info_flow(flow_path,merge_save_path):
    """Join shop_info with the flow records and cache the result on disk.

    The merge is expensive to recompute, so it is written once to
    merge_save_path and re-read later.  The CSV keeps the DataFrame
    header row; delete the first line by hand if a headerless file is
    needed.
    """
    clean_shop_info(shop_info)
    merged = shop_info.merge(get_flow(flow_path), on="shop_id", how="inner")
    merged.to_csv(merge_save_path, index=False)
def build_features(merge_data_path,feature_path):
    """Build the one-hot feature matrix from the merged shop/flow CSV.

    Feature generation takes a long time, so the result is written once
    to feature_path (via pandas, so it includes a header row) and read
    back later.

    Column layout of the merged data, by index: 0 shop_id, 1 city_name,
    2 location_id, 3 per_pay, 4 score, 5 comment_cnt, 6 shop_level,
    7-9 cate_1..cate_3, 10 date (skipped), 11 day, 12 flow, 13 month,
    14 week_of_day, 15 year.
    """
    all_city_list = get_all_city(shop_info)
    all_cate_list = get_all_cate(shop_info)
    # Read the cached merged features.
    merge_data = pd.read_csv(merge_data_path)
    # np.array instead of pd.np.array: the pd.np alias was removed in pandas 2.0.
    data = np.array(merge_data)
    # Per-shop flow statistics (max, min, average).
    max_dict = {}
    min_dict = {}
    ave_dict = {}
    sum_dict = {}
    count_dict = {}
    # +/-inf seeds make the extremes correct for any flow range; the old
    # seeds (0 for max, 10000 for min) were wrong for flows outside them.
    for shop in {line[0] for line in data}:
        max_dict[shop] = float("-inf")
        min_dict[shop] = float("inf")
        sum_dict[shop] = 0
        count_dict[shop] = 0
    for line in data:
        shop = line[0]
        flow = line[12]
        sum_dict[shop] += flow
        count_dict[shop] += 1
        max_dict[shop] = max(max_dict[shop], flow)
        min_dict[shop] = min(min_dict[shop], flow)
    for shop, total in sum_dict.items():
        ave_dict[shop] = total / count_dict[shop]
    # Assemble one feature row per record, one-hot encoding the
    # categorical columns.
    transform_data = []
    for line in data:
        shop_id = line[0]
        row = [shop_id]
        row += chinese2list(all_city_list, line[1])            # city one-hot
        row.append(line[2])                                    # location_id
        row.append(line[3])                                    # per_pay
        row.append(line[4])                                    # score
        row.append(line[5])                                    # comment_cnt
        row.append(line[6])                                    # shop_level
        row += chinese2list(all_cate_list, line[7])            # cate_1 one-hot
        row += chinese2list(all_cate_list, line[8])            # cate_2 one-hot
        row += chinese2list(all_cate_list, line[9])            # cate_3 one-hot
        # line[10] (the raw date string) is skipped on purpose.
        row += number2list(31, line[11])                       # day of month (max 31)
        row.append(line[12])                                   # flow (the target)
        row += number2list(12, line[13])                       # month (max 12)
        row += number2list(7, line[14])                        # day of week (max 7)
        row.append(line[15])                                   # year kept numeric: one-hot would be huge
        row.append(max_dict[shop_id])
        row.append(min_dict[shop_id])
        row.append(ave_dict[shop_id])
        transform_data.append(row)
    pd.DataFrame(transform_data).to_csv(feature_path,index=False)
def get_features_target(data):
    """Split a feature DataFrame into (features, target).

    The input has 384 columns; column index 360 holds the flow (the
    regression target) and every other column is a feature.  All values
    are cast to int.

    Returns a pair of DataFrames: (features, target).
    """
    TARGET_INDEX = 360   # column holding the flow value
    NUM_COLUMNS = 384    # total columns in the feature file
    features_list = []
    target_list = []
    # np.array instead of pd.np.array: the pd.np alias was removed in pandas 2.0.
    for line in np.array(data):
        target_list.append(int(line[TARGET_INDEX]))
        features_list.append([int(line[i]) for i in range(NUM_COLUMNS) if i != TARGET_INDEX])
    return pd.DataFrame(features_list),pd.DataFrame(target_list)
# RMSPE evaluation metric (rmspe_xg) used when training the xgboost model
def rmspe_xg(yhat, y):
#y DMatrix对象
y = y.get_label()
#y.get_label 二维数组
y =
评论0