# %% [markdown]
# ID: customer ID
# Gender: customer's gender
# Ever_Married: customer's marital status
# Age: customer's age
# Graduated: is the customer a graduate?
# Profession: customer's profession
# Work_Experience: work experience in years
# Spending_Score: customer's spending score
# Family_Size: number of family members (including the customer)
# Var_1: anonymized category of the customer
# Segmentation: (target) the customer's segment
# %%
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans, DBSCAN

df_train = pd.read_csv('Train.csv')
df_train.head()
# %%
df_train.shape
# %%
df_train.info()
# %%
df_train.describe()
# %%
df_train.describe(include='O')
# %%
df_train.isnull().sum()  # count missing values per column
# %%
df_train.dropna(inplace=True)  # drop rows with missing values
df_train.shape
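# %%
# Dropping rows discards information. As an alternative (shown on a fresh copy
# so the pipeline above is unchanged), missing values could be imputed instead.
# A minimal sketch; the lists of columns with NaNs are assumptions to verify
# against the df_train.info() output above.
from sklearn.impute import SimpleImputer
df_imputed = pd.read_csv('Train.csv')
num_cols = ['Work_Experience', 'Family_Size']                    # assumed numeric columns with NaNs
cat_cols = ['Ever_Married', 'Graduated', 'Profession', 'Var_1']  # assumed categorical columns with NaNs
df_imputed[num_cols] = SimpleImputer(strategy='median').fit_transform(df_imputed[num_cols])
df_imputed[cat_cols] = SimpleImputer(strategy='most_frequent').fit_transform(df_imputed[cat_cols])
df_imputed.isnull().sum()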
# %%
df_train.duplicated().any()  # check whether the dataset contains duplicate rows
# %%
# Gender distribution across segments
sns.countplot(x='Segmentation', hue='Gender', data=df_train)
plt.title("Segmentation based on Gender")
plt.show()
# %%
# Age distribution across segments
sns.boxplot(x='Segmentation', y='Age', data=df_train)
plt.title("Age Distribution based on Segmentation")
plt.show()
# %%
# Work experience vs. spending score across segments
sns.boxplot(x='Work_Experience', y='Spending_Score', hue='Segmentation', data=df_train)
plt.title("Work Experience vs Spending Score based on Segmentation")
plt.show()
# %%
# Profession distribution across segments
plt.figure(figsize=(12,5))
sns.countplot(x='Profession', hue='Segmentation', data=df_train)
plt.title("Profession Distribution based on Segmentation")
plt.xticks(rotation=45)
plt.show()
# %%
# Family size distribution (share of each family size)
plt.figure(figsize=(8,8))
df_train['Family_Size'].value_counts().plot.pie(autopct='%1.1f%%')
plt.title("Family Size Distribution")
plt.show()
# %%
# Age distribution of customers
plt.figure(figsize=(10,5))
sns.histplot(df_train['Age'], kde=True)
plt.title("Distribution of 'Age'")
plt.xlabel("Age")
plt.ylabel("Frequency")
plt.show()
# %%
# Work experience distribution
plt.figure(figsize=(10,5))
sns.histplot(df_train['Work_Experience'], kde=True)
plt.title("Distribution of 'Work_Experience'")
plt.xlabel("Work_Experience")
plt.ylabel("Frequency")
plt.show()
# %%
# Family size distribution
plt.figure(figsize=(10,5))
sns.histplot(df_train['Family_Size'], kde=True)
plt.title("Distribution of 'Family_Size'")
plt.xlabel("Family_Size")
plt.ylabel("Frequency")
plt.show()
# %%
# Drop the target variable and the ID column
df_train_kmeans = df_train.drop(['Segmentation', 'ID'], axis=1)
df_train_kmeans
# %%
# Label-encode the categorical columns
from sklearn.preprocessing import LabelEncoder
categorical_cols = ['Gender', 'Ever_Married', 'Graduated', 'Profession',
                    'Spending_Score', 'Var_1']
for col in categorical_cols:
    # LabelEncoder assigns integer codes in alphabetical order of the categories
    df_train_kmeans[col] = LabelEncoder().fit_transform(df_train_kmeans[col])
df_train_kmeans.head()
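# %%
# Caveat: LabelEncoder codes categories alphabetically (Average=0, High=1,
# Low=2), which discards the natural order of Spending_Score. A sketch of an
# explicit ordinal mapping, assuming the categories are exactly 'Low',
# 'Average' and 'High'; the result is displayed but not used below.
spending_order = {'Low': 0, 'Average': 1, 'High': 2}
df_train['Spending_Score'].map(spending_order).value_counts()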
# %%
# Correlation matrix
corr = df_train_kmeans.corr()
# Plot as a heatmap
plt.figure(figsize=(10,10))
sns.heatmap(corr, annot=True)
plt.show()
# %%
# Standardize the features
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
df_train_kmeans_scaled = scaler.fit_transform(df_train_kmeans)
df_train_kmeans_scaled
# %% [markdown]
# KMeans
# %%
# Elbow method to find the best number of clusters
sum_of_squared_distances = []
K = range(2, 15)
for k in K:
    km = KMeans(n_clusters=k, random_state=42, n_init=10)
    km.fit(df_train_kmeans_scaled)
    sum_of_squared_distances.append(km.inertia_)
    print("For k =", k, ", the inertia is", km.inertia_)
# %%
# Visualize the elbow curve
plt.plot(K, sum_of_squared_distances, 'bx-')
plt.xlabel('k')
plt.ylabel('Sum of squared distances')
plt.title('Elbow Method For Optimal k')
plt.show()
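# %%
# The elbow is often ambiguous; the mean silhouette score is a complementary
# criterion for choosing k (higher is better). A minimal sketch over a
# narrower candidate range to keep the runtime short.
from sklearn.metrics import silhouette_score
for k in range(2, 8):
    km = KMeans(n_clusters=k, random_state=42, n_init=10).fit(df_train_kmeans_scaled)
    print("k = {}: silhouette = {:.3f}".format(k, silhouette_score(df_train_kmeans_scaled, km.labels_)))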
# %%
# Best number of clusters according to the elbow method
best_k = 4
# Fit the KMeans model
kmeans = KMeans(n_clusters=best_k, random_state=42, n_init=10)
kmeans.fit(df_train_kmeans_scaled)
# %%
# Predict the cluster labels
labels = kmeans.predict(df_train_kmeans_scaled)
# Count the number of customers in each cluster
from collections import Counter
counts = Counter(labels)
# The target categories 'A', 'B', 'C' and 'D' from the 'Segmentation' column
target_categories = df_train['Segmentation']
# Find the most frequent target category within each cluster
cluster_categories = {}
for label, count in counts.items():
    cluster_data = target_categories[labels == label]
    most_frequent_category = cluster_data.value_counts().idxmax()
    cluster_categories[label] = most_frequent_category
# Map the predicted cluster labels to target categories
mapped_labels = [cluster_categories[label] for label in labels]
# Add the mapped labels to the dataframe as a 'n_Clusters' column
df_train['n_Clusters'] = mapped_labels
df_train
# %%
# Number of correct predictions
correct_predictions = (df_train['Segmentation'] == df_train['n_Clusters']).sum()
# Total number of predictions
total_predictions = df_train.shape[0]
# Accuracy as a percentage
accuracy = (correct_predictions / total_predictions) * 100
print("-- Accuracy for KMeans: {:.2f}%".format(accuracy))
# %% [markdown]
# DBSCAN
# %%
# Fit the DBSCAN model
dbscan = DBSCAN(eps=0.6, min_samples=5)
dbscan.fit(df_train_kmeans_scaled)
# Cluster labels (-1 marks noise points)
labels = dbscan.labels_
# Count the number of customers in each cluster
counts = Counter(labels)
# The target categories 'A', 'B', 'C' and 'D' from the 'Segmentation' column
target_categories = df_train['Segmentation']
# Find the most frequent target category within each cluster (noise excluded)
cluster_categories = {}
for label, count in counts.items():
    if label == -1:  # skip noise so it stays labelled 'Noise' below
        continue
    cluster_data = target_categories[labels == label]
    most_frequent_category = cluster_data.value_counts().idxmax()
    cluster_categories[label] = most_frequent_category
# Map the cluster labels to target categories; noise points become 'Noise'
mapped_labels = [cluster_categories.get(label, 'Noise') for label in labels]
# Add the mapped labels to the dataframe as a 'n_Clusters' column
df_train['n_Clusters'] = mapped_labels
df_train
# %%
# Number of correct predictions
correct_predictions = (df_train['Segmentation'] == df_train['n_Clusters']).sum()
# Total number of predictions
total_predictions = df_train.shape[0]
# Accuracy as a percentage
accuracy = (correct_predictions / total_predictions) * 100
print("-- Accuracy for DBSCAN: {:.2f}%".format(accuracy))