# coding: utf-8
# In[196]:
import pandas as pd
import re
import numpy as np
# Load the table and derive the mean of each "low-high" salary range.
a = pd.read_csv('F:\\xiaofang\\1.csv')

salary_midpoints = []
for raw_salary in a['salary']:
    # Drop every 'K'/'k' character, then split what is left on '-'.
    cleaned = re.sub('k', '', re.sub('K', '', str(raw_salary)))
    salary_midpoints.append(np.mean([int(bound) for bound in cleaned.split('-')]))
a.loc[:, 'salary_average'] = salary_midpoints
# Derive a numeric "workyear_average" column from the free-text workYear
# requirement:
#   "1-3年"               -> mean of both bounds (2.0)
#   "10年以上" / "1年以下"  -> the single bound (10.0 / 1.0)
#   "应届毕业生" / "不限" / any other text without digits -> 0.0
def _workyear_average(text):
    """Return the mean of the integers found in *text*, or 0.0 if none.

    Digits are extracted as whole runs, so a multi-digit bound such as
    "10年以上" is read as 10 rather than digit by digit (which would
    average 1 and 0 into 0.5). Non-string cells (e.g. NaN) are stringified
    first and therefore fall back to 0.0 instead of raising.
    """
    bounds = [int(digits) for digits in re.findall(r'\d+', str(text))]
    return np.mean(bounds) if bounds else 0.0


# Assign the whole column at once so the result does not depend on the
# DataFrame having a 0..n-1 integer index.
a.loc[:, 'workyear_average'] = [_workyear_average(i) for i in a['workYear']]
import seaborn as sns
import matplotlib.pyplot as plt
# 1. Salary distribution
# NOTE(review): sns.distplot is deprecated in newer seaborn releases; kept
# as-is so the rendered figure stays the same.
fig = plt.figure()
salary_values = a['salary_average'].tolist()
sns.distplot(salary_values)
fig.savefig("F:\\tupian\\xinchou.png")
print("薪酬分布")
salary_summary = a['salary_average'].describe()
print(salary_summary)
import operator
# First-level category counts, plus a bar chart of the nine most frequent.
b = []
for cell in a['firstType']:
    # A cell may hold several '|'-separated labels; count each one.
    b.extend(cell.split('|'))
print(b)
dic = {label: b.count(label) for label in set(b)}
print("企业招聘类型数目:")
print(dic)

fig = plt.figure()
# Use a font with CJK glyphs for the Chinese labels, and keep the minus
# sign renderable with that font.
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
dic2 = sorted(dic.items(), key=operator.itemgetter(1), reverse=True)
dic2 = pd.DataFrame(dic2, columns=['key', 'values'])
print("企业类型描述:")
print(dic2.describe())
# x/y are passed by keyword: seaborn >= 0.12 accepts only `data`
# positionally, while the keyword form works on older releases too.
sns.barplot(x=dic2['key'][:9], y=dic2['values'][:9])
plt.xlabel('一级职业分类')
plt.ylabel('数量')
fig.savefig("F:\\tupian\\gongsileixing.png")
# Second-level categories: occurrence count and mean salary per label.
b2 = [label for cell in a['secondType'] for label in cell.split('|')]
print(b2)
t2 = set(b2)
dic2 = {}
dic33 = {}
for label in t2:
    dic2[label] = b2.count(label)
    # Boolean row mask: True where the row's '|'-separated labels include
    # the current label.
    po = [label in cell.split('|') for cell in a['secondType']]
    dic33[label] = a[po]['salary_average'].mean()
print("职业类型:")
print(dic2)
print("职业平均薪酬:")
print(dic33)
# Bar chart of the nine most frequent second-level categories.
fig = plt.figure()
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
# Rebind dic2 from {label: count} to a DataFrame sorted by count, descending.
dic2 = sorted(dic2.items(), key=operator.itemgetter(1), reverse=True)
dic2 = pd.DataFrame(dic2, columns=['key', 'values'])
# x/y are passed by keyword: seaborn >= 0.12 accepts only `data`
# positionally, while the keyword form works on older releases too.
sns.barplot(x=dic2['key'][:9], y=dic2['values'][:9])
plt.xlabel('二级职业分类', fontsize=15)
plt.ylabel('数量', fontsize=15)
fig.savefig("F:\\tupian\\zhiyebar.png")
# Bar chart of the nine second-level categories with the highest mean salary.
fig = plt.figure()
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
# Rebind dic33 from {label: mean salary} to a DataFrame sorted descending.
dic33 = sorted(dic33.items(), key=operator.itemgetter(1), reverse=True)
dic33 = pd.DataFrame(dic33, columns=['key', 'values'])
print("职业薪酬描述")
print(dic33.describe())
# x/y are passed by keyword: seaborn >= 0.12 accepts only `data`
# positionally, while the keyword form works on older releases too.
sns.barplot(x=dic33['key'][:9], y=dic33['values'][:9])
plt.xlabel('二级职业分类', fontsize=15)
plt.ylabel('工资', fontsize=15)
fig.savefig("F:\\tupian\\zhiyexinchou.png")
# Industry fields: occurrence count per field, plus a bar chart of the nine
# most frequent ones.
b3 = []
for cell in a['industryField']:
    # str() keeps non-string cells (e.g. NaN) from breaking the split.
    b3.extend(re.split('[,、 ]', str(cell)))
print("工业领域:")
print(b3)
t3 = set(b3)
dic3 = {field: b3.count(field) for field in t3}
# Starts empty here; the per-field mean-salary loop later in the script
# writes into it.
dic333 = {}
print("工业领域统计")
print(dic3)

fig = plt.figure()
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
dic2 = sorted(dic3.items(), key=operator.itemgetter(1), reverse=True)
dic2 = pd.DataFrame(dic2, columns=['key', 'values'])
print("工业领域描述")
print(dic2.describe())
# x/y are passed by keyword: seaborn >= 0.12 accepts only `data`
# positionally, while the keyword form works on older releases too.
sns.barplot(x=dic2['key'][:9], y=dic2['values'][:9])
plt.xlabel('领域')
plt.ylabel('数量')
fig.savefig("F:\\tupian\\gongyelingyu.png")
# Mean salary per industry field, plus a bar chart of the nine best-paid.
dic333 = {}
for field in t3:
    # Boolean row mask: True where the row's split industryField tokens
    # include the current field.
    po = [field in re.split('[,、 ]', str(cell)) for cell in a['industryField']]
    dic333[field] = a[po]['salary_average'].mean()
print("工业平均薪酬:")
print(dic333)

fig = plt.figure()
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
# Rebind dic333 from {field: mean salary} to a DataFrame sorted descending.
dic333 = sorted(dic333.items(), key=operator.itemgetter(1), reverse=True)
dic333 = pd.DataFrame(dic333, columns=['key', 'values'])
print("工业薪酬描述")
print(dic333.describe())
# x/y are passed by keyword: seaborn >= 0.12 accepts only `data`
# positionally, while the keyword form works on older releases too.
sns.barplot(x=dic333['key'][:9], y=dic333['values'][:9])
plt.xlabel('工业领域分类', fontsize=15)
plt.ylabel('工资', fontsize=15)
fig.savefig("F:\\tupian\\gongyexinchou.png")
# Work-experience requirement: row count and mean salary per category.
print("工作经验需求:")
print(a['workYear'].describe())
work_years = list(a['workYear'])  # built once instead of once per category
h = {}
h2 = {}
for i in set(work_years):
    h[i] = work_years.count(i)
    h2[i] = a[a['workYear'] == i]['salary_average'].mean()
print("工作经验统计")
print(h)

# Pie chart of the row count per category.
fig = plt.figure()
plt.pie(list(h.values()), labels=list(h.keys()))
fig.savefig("F:\\tupian\\gongzuopie.png")

print("工作经验薪酬")
print(h2)
# Each chart gets exactly one figure; no unbound plt.figure() calls, which
# would only add blank windows when the figures are shown.
fig = plt.figure()
# x/y are passed by keyword: seaborn >= 0.12 accepts only `data`
# positionally, while the keyword form works on older releases too.
sns.barplot(
    x=list(h2.keys()),
    y=list(h2.values()),
    order=['应届毕业生', '1年以下', '1-3年', '3-5年', '5-10年', '10年以上'],
)
fig.savefig("F:\\tupian\\jingyanxinchou.png")
# Salary against work experience: swarm, violin and box plots.
# x/y are passed by keyword in every seaborn call below: seaborn >= 0.12
# accepts only `data` positionally, while the keyword form works on older
# releases too.
print("工作经验要求描述: ")
fig = plt.figure()
print(a['workyear_average'].describe())
sns.swarmplot(x=a['workYear'], y=a['salary_average'])
plt.ylabel('工资', fontsize=15)
plt.xlabel('工作经验', fontsize=15)
fig.savefig("F:\\tupian\\gongzixin.png")

fig = plt.figure()
sns.violinplot(x=a['workyear_average'], y=a['salary_average'])
fig.savefig("F:\\tupian\\violinjingyan.png")

fig = plt.figure()
sns.boxplot(
    x=a['workYear'],
    y=a['salary_average'],
    order=['应届毕业生', '1年以下', '1-3年', '3-5年', '5-10年', '10年以上'],
)
fig.savefig("F:\\tupian\\boxjingyan.png")
# Education requirement: row count and mean salary per level.
b3 = list(a['education'])
print(b3)
t3 = set(b3)
dic3 = {level: b3.count(level) for level in t3}
print("教育经历:")
print(dic3)
h3 = {level: a[a['education'] == level]['salary_average'].mean() for level in t3}
print("教育经历薪酬所有")
print(h3)

fig = plt.figure()
# x/y are passed by keyword: seaborn >= 0.12 accepts only `data`
# positionally, while the keyword form works on older releases too.
sns.barplot(
    x=list(h3.keys()),
    y=list(h3.values()),
    order=['大专', '不限', '本科', '硕士'],
)
fig.savefig("F:\\tupian\\jiaoyuxinchoubar.png")
# Education requirement: summary of the counts, then pie, violin and box plots.
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
dic2 = sorted(dic3.items(), key=operator.itemgetter(1), reverse=True)
dic2 = pd.DataFrame(dic2, columns=['key', 'values'])
print("教育经历描述:")
print(dic2.describe())

# Pie chart of the row count per education level (displayed, not saved).
fig = plt.figure()
plt.pie(list(dic3.values()), labels=list(dic3.keys()))
plt.xlabel('学历')
plt.ylabel('数量')

# Bind the new figure to `fig` so that savefig writes the violin plot.
# With an unbound plt.figure() here, `fig` would still refer to the pie
# chart and the pie would be written to the violin file instead.
fig = plt.figure()
# x/y are passed by keyword: seaborn >= 0.12 accepts only `data`
# positionally, while the keyword form works on older releases too.
sns.violinplot(x=a['education'], y=a['salary_average'])
fig.savefig("F:\\tupian\\jiaoyuliviolin.png")

fig = plt.figure()
sns.boxplot(
    x=a['education'],
    y=a['salary_average'],
    order=['大专', '不限', '本科', '硕士'],
)
fig.savefig("F:\\tupian\\jiaobox.png")
# Write the table, including the derived columns, to a spreadsheet, then
# display every figure that is still open.
excel_path = "F:\\xiaofang\\4.xls"
a.to_excel(excel_path)
plt.show()
# In[197]:
# Benefit keywords: occurrence count per keyword, plus a bar chart of the
# nine most frequent ones.
b = []
for cell in a['positionAdvantage']:
    # A cell may hold several keywords separated by ',' or '、'.
    b.extend(re.split('[,,、]', cell))
print(b)
dic = {keyword: b.count(keyword) for keyword in set(b)}
print("福利")
print(dic)

fig = plt.figure(figsize=(8, 8))
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
dic2 = sorted(dic.items(), key=operator.itemgetter(1), reverse=True)
dic2 = pd.DataFrame(dic2, columns=['key', 'values'])
print("福利描述:")
print(dic2.describe())
# x/y are passed by keyword: seaborn >= 0.12 accepts only `data`
# positionally, while the keyword form works on older releases too.
sns.barplot(x=dic2['key'][:9], y=dic2['values'][:9])
plt.xlabel('福利关键词')
plt.ylabel('数量')
fig.savefig("F:\\tupian\\fuli.png")
# In[ ]: