#引入依赖
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.offline as py
import plotly.graph_objects as go
from sklearn import preprocessing
from sklearn.cluster import KMeans
# Load the customer data and take a first look at its contents.
df = pd.read_csv(r'data.csv')
print(df.head(5))
# DataFrame.info() writes its summary to stdout itself and returns None,
# so wrapping it in print() only appended a stray "None" line.
df.info()

# Scatter the raw customers: annual income against spending score.
trace = go.Scatter(x=df['Annual Income (k$)'], y=df['Spending Score (1-100)'], mode='markers')
layout = go.Layout(xaxis=dict(title='Annual Income (k$)'), yaxis=dict(title='Spending Score (1-100)'))
# Bind the figure to `fig` rather than `plt`, so the matplotlib.pyplot alias
# imported at the top of the file is not overwritten by a plotly Figure.
fig = go.Figure(data=[trace], layout=layout)
fig.show()
# Within-cluster sum of squared errors (SSE, KMeans.inertia_) for k = 1..9,
# computed on the two features used for segmentation.
data = df[['Annual Income (k$)', 'Spending Score (1-100)']]
k_range = range(1, 10)
# n_init and random_state are pinned so the curve is reproducible from run to
# run; left unset, the initialisation is random and the default number of
# restarts depends on the installed scikit-learn version.
sse = [
    KMeans(n_clusters=k, n_init=10, random_state=42).fit(data).inertia_
    for k in k_range
]
print(sse)

# Elbow method: plot SSE against k and pick the k where the curve flattens.
trace = go.Scatter(x=list(k_range), y=sse)
layout = go.Layout(xaxis=dict(title='k'), yaxis=dict(title='sse'))
# Named `elbow_fig` (not `plt`) so the matplotlib.pyplot alias is left alone.
elbow_fig = go.Figure(data=[trace], layout=layout)
elbow_fig.show()
# Split the customers into five segments.
# random_state / n_init are pinned so repeated runs give the same partition.
km = KMeans(n_clusters=5, n_init=10, random_state=42)
raw_labels = km.fit_predict(data)
centers = km.cluster_centers_  # columns follow `data`: income, then score

# K-means numbers its clusters arbitrarily, so raw label 0 is not guaranteed
# to be the "medium" group. Derive a stable segment id from where each
# centroid sits instead:
#   0 = centroid closest to the overall mean of the data (medium/medium)
#   1 = low income / low score     2 = low income / high score
#   3 = high income / low score    4 = high income / high score
# where low/high is judged relative to the medium centroid. These ids match
# the numbering used in the conclusion notes at the end of the file.
# NOTE: if two centroids fall in the same quadrant they receive the same id.
mid = np.linalg.norm(centers - data.to_numpy().mean(axis=0), axis=1).argmin()
high_income = centers[:, 0] >= centers[mid, 0]
high_score = centers[:, 1] >= centers[mid, 1]
segment_of = 1 + 2 * high_income + high_score  # bool arithmetic yields 1..4
segment_of[mid] = 0

# Translate every customer's raw K-means label into its segment id.
df['cluster'] = segment_of[raw_labels]
print(df.head(5))

# (segment id, marker colour, legend name) for each customer segment.
segments = [
    (0, 'purple', '中等收入-中等消费'),
    (1, 'orange', '低收入-低消费'),
    (2, 'yellow', '低收入-高消费'),
    (3, 'green', '高收入-低消费'),
    (4, 'blue', '高收入-高消费'),
]

# One scatter trace per segment, drawn in the order listed above.
fig = go.Figure()
for segment_id, color, label in segments:
    members = df[df['cluster'] == segment_id]
    fig.add_trace(go.Scatter(
        x=members['Annual Income (k$)'],
        y=members['Spending Score (1-100)'],
        mode='markers',
        marker=dict(color=color),
        name=label,
    ))
fig.update_layout(xaxis=dict(title='Annual Income (k$)'), yaxis=dict(title='Spending Score (1-100)'))
fig.show()

# Cluster centroids, indexed by the same segment ids as df['cluster'].
df_center = pd.DataFrame(centers, columns=['x', 'y'], index=segment_of).sort_index()
print("簇内质心", df_center)
# Overlay the centroids on the segment plot and display it again.
fig.add_trace(go.Scatter(x=df_center['x'], y=df_center['y'], mode='markers', marker=dict(color='red', line=dict(width=1)), name='不同簇的质心'))
fig.show()
# 四、结论
# 根据年收入与消费积分的关系,得出5种客户类型:
#
# 客户群0 中等收入-中等消费 一般价值客户
# 客户群1 低收入-低消费 低价值客户
# 客户群2 低收入-高消费 重要保持客户
# 客户群3 高收入-低消费 重要发展客户
# 客户群4 高收入-高消费 重要价值客户