#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Oct 24 10:42:24 2017
@author: lab-tan.yun
"""
import pandas as pd
# import lightgbm as lgb
import xgboost as xgb
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.preprocessing import LabelEncoder
from enum import Enum
# Load the raw CSV inputs.
shop_info_raw = pd.read_csv('data/ccf_first_round_shop_info.csv')
user_info_raw = pd.read_csv('data/ccf_first_round_user_shop_behavior.csv')
evaluation_dataset = pd.read_csv('data/evaluation_public.csv')

# One row per distinct mall_id, re-indexed from 0.  Building a fresh frame
# avoids in-place edits on a column subset of shop_info_raw and the
# throw-away 'index' column that reset_index() would otherwise add.
malls = shop_info_raw[['mall_id']].drop_duplicates().reset_index(drop=True)

# Encode shop ids as integer labels.  LabelEncoder works on 1-D input, so
# the shop_id columns are passed as Series rather than one-column frames.
mainLabelEncoder = LabelEncoder()
shop_label = shop_info_raw[['shop_id']]
mainLabelEncoder.fit(shop_info_raw['shop_id'])
user_in_shop = mainLabelEncoder.transform(user_info_raw['shop_id'])

# NOTE: user_info_labeled aliases user_info_raw here, so the shop_label
# column is added to user_info_raw as well; drop() below then returns a
# new frame.
user_info_labeled = user_info_raw
user_info_labeled['shop_label'] = user_in_shop
# time_stamp is kept: it is bucketed by day of month later on.
user_info_labeled = user_info_labeled.drop(['shop_id', 'user_id'], axis=1)
def divide_data_into_four_part(s):
    """Map a timestamp string to one of four buckets by day of month.

    Args:
        s: Timestamp text whose first space-separated token is a
            dash-separated date with the day as its third field,
            e.g. ``"2017-08-06 21:20"``.

    Returns:
        int: The day of month modulo 4, i.e. a value in ``{0, 1, 2, 3}``.

    Raises:
        IndexError: If the date has fewer than three dash-separated fields.
        ValueError: If the day field is not an integer.
    """
    date_part = s.split(' ')[0]
    day_of_month = int(date_part.split('-')[2])
    # The remainder already is the bucket id, so no branching is needed.
    return day_of_month % 4
# Replace every timestamp with the bucket computed by
# divide_data_into_four_part.
user_info_labeled['time_stamp'] = user_info_labeled['time_stamp'].apply(
    divide_data_into_four_part)
# divide the data into two parts
# user_info_eval = user_info_labeled[user_info_labeled['time_stamp'] == 0]
# user_info_train = user_info_labeled[user_info_labeled['time_stamp'] > 0]
# I think that using only the best one may cause overfitting
def get_best_wifi(s):
    """Return the numeric id of the strongest wifi entry in ``s``.

    Args:
        s: ``;``-separated entries, each of the form
            ``<prefix>_<int>|<strength>|...`` with an integer strength.

    Returns:
        int: The integer after the underscore in the bssid of the entry
        with the highest strength.  On ties the earliest entry wins.

    Raises:
        IndexError: If an entry lacks the ``|`` or ``_`` separators
            (this includes an empty ``s``).
        ValueError: If a strength or bssid suffix is not an integer.
    """
    entries = [entry.split('|') for entry in s.split(';')]
    # max() compares against the real values rather than a fixed sentinel,
    # so arbitrarily weak signals are handled too; like the strict ">"
    # scan it replaces, it keeps the first of several equal maxima.
    strongest = max(entries, key=lambda fields: int(fields[1]))
    return int(strongest[0].split('_')[1])
# create a wifi_info class to manage the wifi info
class Wifi_info(object):
    """A single wifi observation: numeric bssid, strength, connection flag."""

    # Class-level defaults; __init__ overrides all three on every instance.
    bssid = ''
    strength = -999
    connected = 0

    def __init__(self, b, s, c):
        """Build an observation from the fields of one raw entry.

        Args:
            b: bssid text of the form ``<prefix>_<int>``; only the integer
                after the first underscore is stored.
            s: Signal strength, stored unchanged.
            c: Connection flag; exactly ``'true'`` is stored as 1, any
                other value as 0.
        """
        bssid_parts = b.split('_')
        self.bssid = int(bssid_parts[1])
        self.strength = s
        self.connected = 1 if c == 'true' else 0

    def get_strength(self):
        """Return the stored strength (usable as a sort key)."""
        return self.strength

    def get_bssid(self):
        """Return the numeric bssid."""
        return self.bssid
# enum naming the positions of the fields in the wifi info tuple
class wifi_info_num(Enum):
    """Positions of the fields in the tuple returned by get_wifi_info_optimized."""
    # bssid ids are listed from strongest to weakest signal; the third to
    # seventh ids come after the three strength statistics.
    best_id = 0
    second_id = 1
    # mean of the strengths
    ave = 2
    # last minus first of the strengths sorted in descending order
    max_cha = 3
    # mean squared deviation of the strengths from ave
    square_error = 4
    third_id = 5
    fourth_id = 6
    fifth_id = 7
    sixth_id = 8
    seventh_id = 9
    # numeric bssid of the entry flagged 'true' (0 when there is none)
    connect_wifi = 10
# Unfinished: if the wifi signals are too close, or even have the same strength,
# trouble will come.
# I'm not sure whether this will cause overfitting.
# Stop doing this; try the simplest one.
def get_wifi_info_optimized(s):
    """Summarise one raw wifi-scan string as a fixed-length feature tuple.

    Args:
        s: ``;``-separated entries of the form
            ``<prefix>_<int>|<strength>|<connected>`` where ``strength`` is
            an integer and ``connected`` is the text ``true`` for a
            connected access point.

    Returns:
        tuple: ``(best_id, second_id, ave, max_cha, square_error, third_id,
        fourth_id, fifth_id, sixth_id, seventh_id, connect_wifi)`` -- the
        layout named by ``wifi_info_num``:

        * ``best_id`` .. ``seventh_id``: numeric bssids ordered from the
          strongest to the weakest signal (equal strengths keep their input
          order); slots beyond the number of entries hold ``-1``.
        * ``ave``: mean strength.
        * ``max_cha``: weakest minus strongest strength, hence never
          positive.
        * ``square_error``: mean squared deviation of the strengths from
          ``ave``.
        * ``connect_wifi``: numeric bssid of the last entry flagged
          ``true``, or ``0`` when no entry is.

    Raises:
        IndexError: If an entry lacks one of its three ``|`` fields or the
            ``_`` in its bssid (this includes an empty ``s``).
        ValueError: If a strength or bssid suffix is not an integer.
    """
    ranked_slots = 7  # best_id .. seventh_id

    observations = []  # (numeric bssid, strength) pairs
    connect_wifi = 0
    for entry in s.split(';'):
        fields = entry.split('|')
        strength = int(fields[1])
        connected_flag = fields[2]
        bssid = int(fields[0].split('_')[1])
        observations.append((bssid, strength))
        if connected_flag == 'true':
            connect_wifi = bssid

    # Strongest first.  The sort is stable even with reverse=True, so
    # entries with equal strength stay in their input order.
    observations.sort(key=lambda obs: obs[1], reverse=True)
    strengths = [strength for _, strength in observations]
    count = len(strengths)

    ave = sum(strengths) / count
    max_cha = strengths[-1] - strengths[0]
    # Accumulate left to right in a plain loop: sum() over floats may round
    # differently (compensated summation since Python 3.12), and the values
    # should not depend on the interpreter version.
    square_error = 0
    for strength in strengths:
        deviation = strength - ave
        square_error = square_error + deviation * deviation
    square_error = square_error / count

    # Pad with -1 so that all seven id slots are always filled.
    ranked_ids = [bssid for bssid, _ in observations[:ranked_slots]]
    ranked_ids.extend([-1] * (ranked_slots - len(ranked_ids)))
    (best_id, second_id, third_id, fourth_id,
     fifth_id, sixth_id, seventh_id) = ranked_ids

    return (best_id, second_id, ave, max_cha, square_error, third_id,
            fourth_id, fifth_id, sixth_id, seventh_id, connect_wifi)
# Keep the unparsed scan string in its own column before overwriting it.
user_info_labeled['wifi_infos_raw'] = user_info_labeled['wifi_infos']
# Replace each raw string with the tuple from get_wifi_info_optimized.
user_info_labeled['wifi_infos'] = user_info_labeled['wifi_infos'].apply(
    get_wifi_info_optimized)
def get_best_one(s):
    """Return the ``best_id`` field of a wifi feature tuple.

    Args:
        s: Sequence laid out as named by ``wifi_info_num``, e.g. the tuple
            returned by ``get_wifi_info_optimized``.

    Returns:
        The element at position ``wifi_info_num.best_id`` (index 0).
    """
    # Index through the enum, like the other field accessors, rather than
    # through a bare 0.
    return s[wifi_info_num.best_id.value]
def get_second_one(s):
    """Return the ``second_id`` field of a wifi feature tuple.

    Args:
        s: Sequence laid out as named by ``wifi_info_num``, e.g. the tuple
            returned by ``get_wifi_info_optimized``.

    Returns:
        The element at position ``wifi_info_num.second_id`` (index 1).
    """
    # Index through the enum, like the other field accessors, rather than
    # through a bare 1.
    return s[wifi_info_num.second_id.value]
def get_wifi_strength_ave(s):
    """Return the ``ave`` (mean strength) field of a wifi feature tuple.

    Args:
        s: Sequence laid out as named by ``wifi_info_num``, e.g. the tuple
            returned by ``get_wifi_info_optimized``.

    Returns:
        The element at position ``wifi_info_num.ave`` (index 2).
    """
    # Index through the enum, like the other field accessors, rather than
    # through a bare 2.
    return s[wifi_info_num.ave.value]
def get_wifi_strength_max_cha(s):
    """Return the ``max_cha`` field of a wifi feature tuple.

    Args:
        s: Sequence laid out as named by ``wifi_info_num``, e.g. the tuple
            returned by ``get_wifi_info_optimized``.

    Returns:
        The element at position ``wifi_info_num.max_cha`` (index 3).
    """
    # Index through the enum, like the other field accessors, rather than
    # through a bare 3.
    return s[wifi_info_num.max_cha.value]
def get_wifi_strength_square_error(s):
    """Return the ``square_error`` field of a wifi feature tuple.

    Args:
        s: Sequence laid out as named by ``wifi_info_num``, e.g. the tuple
            returned by ``get_wifi_info_optimized``.

    Returns:
        The element at position ``wifi_info_num.square_error`` (index 4).
    """
    # Index through the enum, like the other field accessors, rather than
    # through a bare 4.
    return s[wifi_info_num.square_error.value]
def get_third_id(s):
    """Return the ``third_id`` field of the wifi feature tuple ``s``."""
    position = wifi_info_num.third_id.value
    return s[position]
def get_fourth_id(s):
    """Return the ``fourth_id`` field of the wifi feature tuple ``s``."""
    position = wifi_info_num.fourth_id.value
    return s[position]
def get_fifth_id(s):
    """Return the ``fifth_id`` field of the wifi feature tuple ``s``."""
    position = wifi_info_num.fifth_id.value
    return s[position]
def get_sixth_id(s):
    """Return the ``sixth_id`` field of the wifi feature tuple ``s``."""
    position = wifi_info_num.sixth_id.value
    return s[position]
def get_seventh_id(s):
    """Return the ``seventh_id`` field of the wifi feature tuple ``s``."""
    position = wifi_info_num.seventh_id.value
    return s[position]
def get_connect_wifi(s):
    """Return the ``connect_wifi`` field of the wifi feature tuple ``s``."""
    position = wifi_info_num.connect_wifi.value
    return s[position]
# Spread the parsed wifi tuple over one column per feature, created in the
# order listed.  A wifi_strength_square_error column is deliberately not
# created (get_wifi_strength_square_error is currently unused here), and the
# wifi_infos tuple column itself is kept.
for _column, _getter in (
        ('best_wifi', get_best_one),
        ('second_wifi', get_second_one),
        ('wifi_strength_ave', get_wifi_strength_ave),
        ('wifi_strength_max_cha', get_wifi_strength_max_cha),
        # new add in 10.25
        ('third_id', get_third_id),
        ('fourth_id', get_fourth_id),
        ('fifth_id', get_fifth_id),
        ('sixth_id', get_sixth_id),
        ('seventh_id', get_seventh_id),
        ('connect_wifi', get_connect_wifi),
):
    user_info_labeled[_column] = user_info_labeled.wifi_infos.apply(_getter)

# shop_label -> mall_id lookup table.  Copy first, so the column assignment
# below writes to an independent frame instead of to a selection taken from
# shop_info_raw (which triggers pandas' SettingWithCopyWarning).
shop_mall = shop_info_raw[['shop_id', 'mall_id']].copy()
# fit_transform re-fits the encoder on shop_info_raw's shop_id column.
shop_mall['shop_id'] = mainLabelEncoder.fit_transform(shop_info_raw['shop_id'])
shop_mall.rename(columns={'shop_id': 'shop_label'}, inplace=True)

# Attach the mall of every record's shop; a left join keeps all records.
user_info_labeled = pd.merge(user_info_labeled, shop_mall, on=['shop_label'], how='left')
# here we are going to train our models
# we need a different encoder for each mall
# the common parameters for all models
# exclude num_class, because this parameter is unique to each mall
params = {'booster': 'gbtree',
'objective': 'multi:softmax',
'eval_metric': 'merror',
'gamma': 0.1,
'min_child_weight'