import csv
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import svm
from sklearn.decomposition import PCA
from sklearn.feature_selection import VarianceThreshold
from sklearn.metrics import log_loss
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
import numpy as np
# Matplotlib text setup: use the SimHei font so Chinese labels render
# correctly, and disable the Unicode minus so negative signs display properly.
plt.rcParams.update({
    'font.sans-serif': ['SimHei'],
    'axes.unicode_minus': False,
})
def get_best_model(model, X_train, y_train, params, cv=10):
    """Grid-search ``params`` for ``model`` and return the best estimator.

    Args:
        model: Estimator handed to ``GridSearchCV``.
        X_train: Training features used to fit the search.
        y_train: Training targets used to fit the search.
        params: Parameter grid forwarded to ``GridSearchCV``.
        cv: Cross-validation setting forwarded to ``GridSearchCV``.

    Returns:
        The ``best_estimator_`` of the fitted search.
    """
    search = GridSearchCV(model, params, cv=cv)
    search.fit(X_train, y_train)
    # The reported score is evaluated on the same data the search was
    # fitted on (X_train / y_train), not on a held-out set.
    train_score = search.score(X_train, y_train)
    print('模型评价准确率为:', train_score)
    return search.best_estimator_
def get_part_data(df_data, percent=1):
    """Return the leading ``percent`` fraction of the rows of every group.

    Rows are grouped by the ``group`` column. For each group, in the order
    ``groupby`` yields them, the first ``floor(group_size * percent)`` rows
    are kept, and the kept slices are concatenated group after group. Row
    index labels of ``df_data`` are preserved.

    Args:
        df_data: DataFrame that contains a ``group`` column.
        percent: Fraction of each group to keep; the default 1 keeps all rows.

    Returns:
        A DataFrame holding the selected rows. When ``df_data`` yields no
        groups, an empty frame with the same columns is returned.
    """
    parts = []
    for _, group in df_data.groupby('group'):
        # np.floor replaces np.math.floor: the np.math alias was removed in
        # NumPy 2.0. int() is required because iloc needs an integer bound.
        n_part_size = int(np.floor(group.shape[0] * percent))
        parts.append(group.iloc[:n_part_size, :])
    if not parts:
        # pd.concat raises on an empty list; keep the column layout instead.
        return df_data.iloc[:0]
    # One concat replaces the repeated DataFrame.append calls, which were
    # removed in pandas 2.0 and copied the accumulated frame on every step.
    return pd.concat(parts)
def storFile(data, fileName):
    """Write each element of ``data`` to ``fileName`` as a one-column CSV.

    Every element obtained by iterating ``data`` becomes one row holding a
    single cell. Any iterable is accepted (list, tuple, 1-D array, ...),
    not only objects that expose ``.shape``.

    Args:
        data: Iterable of values to store, one value per row.
        fileName: Path of the CSV file to create or overwrite.
    """
    # newline='' lets the csv module control line endings itself.
    with open(fileName, 'w', newline='') as f:
        csv.writer(f).writerows([item] for item in data)
if __name__ == '__main__':
    # ================== Load the input files ==================
    gender_age_test_filepath = './china-mobile-user-gemographics/gender_age_test.csv'
    gender_age_train_filepath = './china-mobile-user-gemographics/gender_age_train.csv'
    app_events_filepath = './china-mobile-user-gemographics/app_events.csv'
    app_labels_filepath = './china-mobile-user-gemographics/app_labels.csv'
    events_filepath = './china-mobile-user-gemographics/events.csv'
    label_categories_filepath = './china-mobile-user-gemographics/label_categories.csv'
    phone_brand_device_model_filepath = './china-mobile-user-gemographics/phone_brand_device_model.csv'
    test_filepath = './china-mobile-user-gemographics/test.csv'

    # No index_col is passed here, so every CSV column is read as a
    # regular column.
    gender_age_test = pd.read_csv(gender_age_test_filepath)
    gender_age_train = pd.read_csv(gender_age_train_filepath)
    print(gender_age_train.shape[0])
    app_events = pd.read_csv(app_events_filepath, encoding='utf-8', usecols=['event_id', 'app_id'])
    app_labels = pd.read_csv(app_labels_filepath, encoding='utf-8')
    events = pd.read_csv(events_filepath, encoding='utf-8', usecols=['device_id', 'event_id'], index_col='event_id')
    label_categories_ = pd.read_csv(label_categories_filepath, encoding='utf-8', index_col=None)
    phone_brand_device_model = pd.read_csv(phone_brand_device_model_filepath, index_col=None)

    # Keep only the first 10% of the rows of every group in the training set.
    gender_age_train = get_part_data(gender_age_train, percent=0.1)
    print(gender_age_train.shape[0])

    # ================== Inspect the data ==================
    print('数据集中每列的信息:')
    print(gender_age_train.info())
    print('该训练数据有{}行,有{}列'.format(gender_age_train.shape[0], gender_age_train.shape[1]))
    print('前3行数据集为:')
    print(gender_age_train.head(3))
    print('数据集各类信息为:')
    print(gender_age_train.groupby('group').size())

    print('去除重复行前,数据集的维度是:{}'.format(phone_brand_device_model.shape))
    # Keep one row per device_id and index the table by device_id so the
    # column assignments below align on device_id.
    phone_brand_device_model = phone_brand_device_model.drop_duplicates('device_id').set_index('device_id')
    print('去除重复行后,数据集的维度是:{}'.format(phone_brand_device_model.shape))

    # ================== Phone brand features ==================
    # Encode the brand names as integer codes and store them as a new column.
    brand_label_encoder = LabelEncoder()
    brand_label_encoder.fit(phone_brand_device_model['phone_brand'].values)
    phone_brand_device_model['brand_label_code'] = brand_label_encoder.transform(
        phone_brand_device_model['phone_brand'].values)

    gender_age_train = gender_age_train.set_index('device_id')
    gender_age_test = gender_age_test.set_index('device_id')
    # Series assignment aligns on the device_id index.
    gender_age_train['brand_label_code'] = phone_brand_device_model['brand_label_code']
    gender_age_test['brand_label_code'] = phone_brand_device_model['brand_label_code']
    # NOTE(review): index=False omits the device_id index from this dump;
    # confirm that is intended.
    gender_age_test.to_csv(test_filepath, index=False)

    brand_onehot_encoder = OneHotEncoder()
    brand_onehot_encoder.fit(phone_brand_device_model['brand_label_code'].values.reshape(-1, 1))
    train_brand_feat = brand_onehot_encoder.transform(
        gender_age_train['brand_label_code'].values.reshape(-1, 1))
    test_brand_feat = brand_onehot_encoder.transform(
        gender_age_test['brand_label_code'].values.reshape(-1, 1))
    print(train_brand_feat)
    print('手机品牌信息维度为:', train_brand_feat.shape[1])

    # ================== Phone model features ==================
    # Brand and model are joined with a space into one combined key.
    phone_brand_device_model['brand_model'] = (
        phone_brand_device_model['phone_brand'].str.cat(phone_brand_device_model['device_model'], sep=' '))
    model_label_encoder = LabelEncoder()
    model_label_encoder.fit(phone_brand_device_model['brand_model'].values)
    phone_brand_device_model['brand_model_label_code'] = (
        model_label_encoder.transform(phone_brand_device_model['brand_model'].values))
    gender_age_train['brand_model_label_code'] = phone_brand_device_model['brand_model_label_code']
    gender_age_test['brand_model_label_code'] = phone_brand_device_model['brand_model_label_code']

    # NOTE(review): from here on model_label_encoder refers to a
    # OneHotEncoder, replacing the LabelEncoder above. The name is kept
    # because later code may rely on it.
    model_label_encoder = OneHotEncoder()
    model_label_encoder.fit(phone_brand_device_model['brand_model_label_code'].values.reshape(-1, 1))
    train_model_feat = model_label_encoder.transform(
        gender_age_train['brand_model_label_code'].values.reshape(-1, 1))
    test_model_feat = model_label_encoder.transform(
        gender_age_test['brand_model_label_code'].values.reshape(-1, 1))
    print('手机型号信息维度为:', train_model_feat.shape[1])

    # ================== App usage features ==================
    # Left-join app events with events on event_id, giving rows of
    # event_id + app_id + device_id.
    device_app = app_events.merge(events, how='left', left_on='event_id', right_index=True)
    # Group the app ids by device once and reuse the grouping for both counts.
    apps_by_device = device_app['app_id'].groupby(device_app['device_id'])
    n_run_s = apps_by_device.size()  # number of app-event rows per device
    print(n_run_s)
    n_app_s = apps_by_device.nunique()  # number of distinct app ids per device
    print(n_app_s)

    # Devices without any app event get 0. The filled column is assigned
    # back explicitly: df[col].fillna(..., inplace=True) is a chained
    # in-place operation that pandas warns about and that does not update
    # the frame under Copy-on-Write.
    gender_age_train['n_run'] = n_run_s
    gender_age_train['n_run'] = gender_age_train['n_run'].fillna(0)
    gender_age_train['n_app'] = n_app_s
    gender_age_train['n_app'] = gender_age_train['n_app'].fillna(0)
    gender_age_test['n_run'] = n_run_s
    gender_age_test['n_run'] = gender_age_test['n_run'].fillna(0)
    gender_age_test['n_app'] = n_app_s
    gender_age_test['n_app'] = gender_age_test['n_app'].fillna(0)

    # Reshape each count into a single-column 2-D array.
    train_run_feat = gender_age_train['n_run'].values.reshape(-1, 1)
    train_app_feat = gender_age_train['n_app'].values.reshape(-1, 1)
    test_run_feat = gender_age_test['n_run'].values.reshape(-1, 1)
    test_app_feat = gender_age_test['n_app'].values.reshape(-1, 1)
print