# coding=utf-8
import sys
reload(sys)
sys.setdefaultencoding('utf8')
import tensorflow as tf
import pandas as pd
import numpy as np
import json
import argparse
import os
import datetime
import math
import xgboost as xgb
import lightgbm as lgb
import time
import random
from sklearn.metrics import roc_auc_score
from pandas import Series
import gc
import datetime
import bisect
from scipy.sparse import coo_matrix
from sklearn import metrics
import shutil
import pickle
import math
def sigmoid(inX):
    """Numerically stable logistic sigmoid: 1 / (1 + e^-x).

    The naive form ``1 / (1 + exp(-x))`` raises OverflowError for
    x < ~-709 because ``math.exp`` overflows; branch on the sign so the
    exponent passed to ``exp`` is always non-positive.
    """
    if inX >= 0:
        return 1.0 / (1 + math.exp(-inX))
    # For negative x, exp(x) underflows gracefully to 0.0 instead of overflowing.
    e = math.exp(inX)
    return e / (1 + e)
def get_threshold(tree_json):
    """Recursively collect all split thresholds from a LightGBM tree dict.

    ``tree_json`` is one ``tree_structure`` node from
    ``Booster.dump_model()["tree_info"]``.  Traversal order is right
    subtree, left subtree, then this node's own threshold; leaf nodes
    (no ``threshold`` key) contribute nothing.

    Returns a flat (unsorted) list of threshold floats.
    """
    thresholds = []
    # Membership test on the dict directly — no need for .keys().
    if "right_child" in tree_json:
        thresholds.extend(get_threshold(tree_json["right_child"]))
    if "left_child" in tree_json:
        thresholds.extend(get_threshold(tree_json["left_child"]))
    if "threshold" in tree_json:
        thresholds.append(tree_json["threshold"])
    return thresholds
def get_bucketized_cols_by_tree(train_data, train_label, numeric_columns):
    """Derive bucket boundaries for each numeric column from a one-tree LightGBM fit.

    For every column in ``numeric_columns`` present in ``train_data``, fits a
    single shallow tree (<=12 leaves, depth 4) on that column alone against
    ``train_label``, extracts the tree's split thresholds, and wraps the column
    in a tf.feature_column.bucketized_column with those thresholds as
    boundaries.  The per-column boundaries are also dumped to a JSON file for
    online serving.

    Returns the list of bucketized feature columns.
    """
    ligbmodel = lgb.LGBMClassifier(num_leaves = 12, max_depth=4, n_estimators= 1)
    columns = []  # NOTE(review): never used — candidate for removal
    bucketized_cols = []
    column_bins = {}  # column name -> sorted split thresholds (persisted below)
    for idx, col in enumerate(numeric_columns):
        if col not in train_data.columns:
            # Column missing from the training frame: report and skip.
            print(col)
            continue
        # Single-feature fit; NaNs are zero-filled just for the split search.
        ligbmodel.fit(pd.DataFrame(train_data[col].fillna(0)), train_label)
        split = sorted(get_threshold(ligbmodel.booster_.dump_model()["tree_info"][0]["tree_structure"]))
        numeric_feature_column = tf.feature_column.numeric_column(col)
        bucketized_cols.extend([tf.feature_column.bucketized_column(source_column = numeric_feature_column, boundaries = split)])
        column_bins[col] = split
    # Persist learned boundaries for the online model.
    # NOTE(review): hard-coded absolute path — confirm against deployment layout.
    # NOTE(review): source indentation was lost; assuming this dump runs once
    # after the loop, not once per column — verify against the original file.
    with open('/data/kai.zhang/dnn/onlineModel/feature_quantile.txt', 'w') as fw:
        json.dump(column_bins, fw)
    return bucketized_cols
def getNumCateFeatures(raw):
    """Partition the frame's columns into categorical and numeric features.

    A column is treated as categorical when it has at most 15 distinct
    values; everything else is numeric.  The target/weight columns
    ('label', 'real_order', 'sample_weight') are excluded from both lists.

    Returns a tuple ``(category_fea, numeric_fea)`` of column-name lists.
    """
    excluded = set(['label', 'real_order', 'sample_weight'])
    uniques = raw.nunique()
    low_cardinality = set(uniques[uniques <= 15].index)
    category_fea = list(low_cardinality - excluded)
    numeric_fea = list(set(raw.columns) - set(category_fea) - excluded)
    return category_fea, numeric_fea
def getModelFeatures(category_fea, numeric_fea):
    """Assemble the TF feature columns for the model.

    Categorical features are passed through as plain numeric columns;
    numeric features are bucketized with per-column tree-derived
    boundaries (see get_bucketized_cols_by_tree, which reads the
    module-level training data).
    """
    feature_cols = [tf.feature_column.numeric_column(key=name) for name in category_fea]
    feature_cols.extend(get_bucketized_cols_by_tree(data_train, data_train_click, numeric_fea))
    return feature_cols
def reorderDataColumns(category_feas, numeric_feas):
    """Build the ordered model-input column list with type prefixes.

    Categorical features come first, each prefixed 'fea_cate_', followed
    by numeric features prefixed 'fea_real_'.  Input order within each
    group is preserved.
    """
    # Comprehensions instead of manual append loops.
    return (['fea_cate_' + name for name in category_feas]
            + ['fea_real_' + name for name in numeric_feas])
def rename_col_for_input_order(columns=None):
    """Map each 'fea_*' column name to an alias carrying its ordinal position.

    A zero-padded 3-digit counter is injected before the final underscore
    segment, e.g. 'fea_cate_foo' -> 'fea_cate000_foo', so lexicographic
    column order matches input order.  Names not starting with 'fea_' map
    to themselves and do not advance the counter.

    columns: iterable of column names; defaults to the module-level
        DATA_COLUMNS for backward compatibility with the original
        zero-argument call.

    Returns a dict {original_name: aliased_name}.
    """
    if columns is None:
        columns = DATA_COLUMNS
    col_mapping = {}
    cnt = 0
    for col in columns:
        if col.startswith('fea_'):
            parts = col.split('_')
            col_mapping[col] = '_'.join(parts[:-1]) + '{0:03}_'.format(cnt) + parts[-1]
            cnt += 1
        else:
            col_mapping[col] = col
    return col_mapping
# --- Script setup: recreate the model dir, set pandas display, load raw data ---
my_model_dir = '/data/kai.zhang/dnn/model'
# Start from a clean model directory on every run.
shutil.rmtree("/data/kai.zhang/dnn/model", ignore_errors=True)
os.mkdir("/data/kai.zhang/dnn/model")
pd.set_option('display.max_columns', None)  # show all columns when printing frames
os.chdir("/data/kai.zhang/dnn")
#/data/kai.zhang/dnn/model/feature_quantile.txt
#raw = pd.read_csv('/data/kai.zhang/dnn/dnn_sampe.csv',sep=',')
# Tab-separated dump; '\N'/'NULL'/'null' parsed as NaN, malformed rows skipped.
# NOTE(review): error_bad_lines was removed in pandas 2.0 (use on_bad_lines='skip')
# — fine here only if this runs on the old pandas this Py2 script targets.
raw = pd.read_csv('/data/kai.zhang/dnn/dnn0514.csv',sep='\t', na_values=['\N','NULL','null'], error_bad_lines=False)
# --- Assign schema, drop identifier columns, impute missing values, cast types ---
raw.columns = ["queryid", "label", "real_order", "sample_weight", "isfoodlevel1", "ishuishop", "distance", "ocr", "seg", "lastview", "yestdayctr", "realclick", "loccvr", "todayctr", "discount", "isdealshop", "logceilpictotal", "ctr", "densitythirty", "pricepref", "catprefwithgeo", "isnewuser", "ispermanentcity", "guessstar", "spl", "istopshop", "topshop", "istakeawayshop", "allcategoryctr", "distancelarge3km", "distanceless3km", "distanceless2km", "distanceless1km", "distanceless500m", "distanceless200m", "fclick", "fclickpv", "fclickpriceless50", "fclickpriceless100", "fclickpriceless200", "fclickpriceless300", "fclickpricelarge300", "crossclickprice", "crossdistance", "crossorderprice", "repeatclickratio", "shopclickusers", "shopclickcount", "click5ratio", "click1ratio", "poirepeatratio", "ctravgratio", "staravgratio", "popscoreavgratio", "scoreavgratio", "branchcntavgratio", "timeseekshophoursctr", "isfavorclickcate2", "cate2clickrecency", "cate2clickfrequency", "cate2orderrecency", "cate2orderfrequency", "cate2ordermonetary", "dividedistancectr", "productdistancectr", "similarshops"]
# Identifier columns carry no ranking signal; drop whichever are present.
deletes = ["shop_id", "locatecityid", "queryid"]
deletes = list(set(raw.columns).intersection([x.lower() for x in deletes]))
raw.drop(deletes, axis=1, inplace=True)
# Cross features use -1 as their explicit "missing" marker; everything else
# falls back to 0.  (The original repeated the -1 fillna on two identical
# consecutive lines — the duplicate was a no-op and has been removed.)
cross_cols = ['crossclickprice', 'crossdistance', 'crossorderprice']
raw[cross_cols] = raw[cross_cols].fillna(value=-1)
raw.fillna(value=0, inplace=True)
selected = ["guessstar", "ispermanentcity", "isdealshop", "isnewuser", "isfavorclickcate2", "ishuishop", "istopshop", "istakeawayshop", "isfoodlevel1"]
raw[selected].dtypes  # NOTE(review): no-op expression left over from notebook exploration
col_nunique = raw[selected].nunique()  # NOTE(review): not used later in this file
# Cast every feature column to float64; keep the integer targets untouched.
for col in raw.columns:
    if col in ['label', 'real_order']:
        continue
    raw[col] = raw[col].astype('float64')
# --- Derive feature groups, fix input column order, and sample the test set ---
category_feas, numeric_feas = getNumCateFeatures(raw)
DATA_COLUMNS = reorderDataColumns(category_feas, numeric_feas)
COLUMN_ALIAS_MAPPING = rename_col_for_input_order()
# Columns to exclude from model input; only those actually present get removed.
remove_col = ["shopclicknum","viewedlastshoppos","sample_weight"]
remove_col = [x.lower() for x in remove_col]
removed= list(set(raw.columns).intersection(remove_col))
# Seeded 10% holdout so the test split is reproducible across runs.
random.seed(100)
test_index = random.sample(raw.index, int(len(raw.index)*0.1))
# --- Build test / train / dev splits (features vs. click and order targets) ---
#test data
data_test = raw.loc[test_index].drop(removed, axis=1, inplace=False).drop(["label","real_order"], axis=1, inplace=False)
data_test_click = raw.loc[test_index].label        # click target
data_test_order = raw.loc[test_index].real_order   # order target
#all data
data = raw.drop(test_index)
# Keep rows with a click, or click-less rows where the user scrolled past
# position 10 — drops weak negatives the user likely never saw.
filter_ori= (data["shopclicknum"] > 0) | ((data["shopclicknum"] <= 0) & (data["viewedlastshoppos"]>=10))
data = data[filter_ori]
# Seeded 90/10 split of the remaining rows into train/dev.
random.seed(100)
train_index = random.sample(data.index, int(len(data.index)*0.9))
del raw  # release the full frame before materializing the splits
import gc
gc.collect()
#train data
data_train_click = data.loc[train_index].label
data_train_order = data.loc[train_index].real_order
data_train = data.loc[train_index].drop(["label","real_order"], axis=1, inplace=False).drop(removed, axis=1, inplace=False)
#eval data
# Rows not sampled into train_index form the dev set.
data_dev_click = data.drop(train_index).label
data_dev_order = data.drop(train_index).real_order
data_dev = data.drop(train_index).drop(["label","real_order"], axis=1, inplace=False).drop(removed, axis=1, inplace=False)
#data_train.drop("order_weight",axis=1, inplace=True)
# --- Rename every split's columns to their ordered aliases; build TF columns ---
model_feature_columns = []
for key in category_feas:
    #raw = raw.rename(columns={key: 'fea_real_' + key})
    # Apply the positional alias to all three frames so they stay in sync.
    data_train.rename(columns={key: COLUMN_ALIAS_MAPPING['fea_cate_' + key]} ,inplace=True)
    data_dev.rename(columns={key: COLUMN_ALIAS_MAPPING['fea_cate_' + key]} , inplace=True)
    data_test.rename(columns={key: COLUMN_ALIAS_MAPPING['fea_cate_' + key]} ,inplace=True)
    # Low-cardinality features feed the model directly as numeric columns.
    model_feature_columns.append(tf.feature_column.numeric_column(key=COLUMN_ALIAS_MAPPING['fea_cate_' + key]))
numeric_fea1 = []  # aliased numeric feature names, for later bucketization
for key in numeric_feas:
    #raw = raw.rename(columns={key: 'fea_real_' + key})
    data_train.rename(columns={key: COLUMN_ALIAS_MAPPING['fea_real_' + key]}, inplace=True)
    data_dev.rename(columns={key: COLUMN_ALIAS_MAPPING['fea_real_' + key]}, inplace=True)
    data_test.rename(columns={key: COLUMN_ALIAS_MAPPING['fea_real_' + key]}, inplace=True)
    numeric_fea1.append(COLUMN_ALIAS_MAPPING['fea_real_' + key])
# NOTE(review): the source is truncated immediately after this point ("buc").
buc
没有合适的资源?快使用搜索试试~ 我知道了~
资源推荐
资源详情
资源评论
收起资源包目录
基于深度学习的ranking.zip (1个子文件)
tf-learning2rank-master
cutbins_DNN_V3
lgb_dnn.py 26KB
共 1 条
- 1
资源评论
博士僧小星
- 粉丝: 1711
- 资源: 5876
上传资源 快速赚钱
- 我的内容管理 展开
- 我的资源 快来上传第一个资源
- 我的收益 登录查看自己的收益
- 我的积分 登录查看自己的积分
- 我的C币 登录后查看C币余额
- 我的收藏
- 我的下载
- 下载帮助
安全验证
文档复制为VIP权益,开通VIP直接复制
信息提交成功