# from future import division
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from numpy.random import randn
from pandas import Series, DataFrame
from datetime import datetime
import xlrd, openpyxl
xlsx_file = pd.ExcelFile('data/test1.xlsx')
All = xlsx_file.parse('All')
d1 = All.drop(All.columns[:11], axis=1, inplace = False)
All = d1.drop(d1.columns[-1], axis=1, inplace = False)
# print(All.head())
print(len(All))
All[All.duplicated()==True].index[:20]
All.drop_duplicates(inplace=True)
print(len(All))
All[u'转发数'][All[u'转发数']==u'转发'] = '0'
All[u'评论数'][All[u'评论数']==u'评论'] = '0'
All[u'点赞数'][All[u'点赞数']==u'赞'] = '0'
print(All.describe())
print(All.dtypes)
All[u'转发数']=All[u'转发数'].astype('int64')
All[u'评论数'] = All[u'评论数'].astype('int64')
All[u'点赞数'] = All[u'点赞数'].astype('int64')
All.to_excel('All.xlsx',index=False)
All_pivot= All.pivot_table(values=[u'转发数',u'评论数',u'点赞数',u'微博内容'],index=[u'用户名'], aggfunc={u'转发数':np.sum,u'评论数':np.sum,u'点赞数':np.sum,u'微博内容':np.size})
All_pivot.rename(columns={u'微博内容':u'当月总微博数'},inplace=True)
All_pivot.to_excel('All_pivot.xlsx')
# 22222222
sf = xlsx_file.parse('sf')
sfweibo = xlsx_file.parse('sfweibo')
sf[u'省份前两字'] = np.nan
for i in range(len(sf[u'省份名'])):
sf[u'省份前两字'][i] = sf[u'省份名'][i][:2]
sfweibo[u'省份前两字'] = np.nan
for i in range(len(sfweibo[u'省份名'])):
sfweibo[u'省份前两字'][i] = sfweibo[u'省份名'][i][:2]
print(sf.head())
sf.to_excel('sf.xlsx',index=False)
sfweibo.to_excel('sfweibo.xlsx',index=False)
sf_sfweibo = sf.merge(sfweibo,on=u'省份前两字')
sf_sfweibo1 = sf_sfweibo.iloc[:,[4,1,2]]
sf_sfweibo1.to_excel('sf_sfweibo.xlsx',index=False)
sf_sfweibo = sf_sfweibo1
sf_sfweibo_All_pivot =pd.merge(sf_sfweibo,All_pivot,left_on=u'微博用户名',right_on=u'用户名',right_index=True)
sf_sfweibo_All_pivot.to_excel('sf_sfweibo_All_pivot.xlsx',index=False)
# 3333333333
base = xlsx_file.parse('base_info')
sf_sfweibo_All_pivot_base = base.merge(sf_sfweibo_All_pivot,left_on=u'昵称',right_on=u'微博用户名')
ssapb = sf_sfweibo_All_pivot_base
ssapb.rename(columns={u'当月总微博数_x':u'当月总微博数'},inplace=True)
ssapb = ssapb.drop([u'昵称',u'当月总微博数_y'],axis=1)
print(ssapb.iloc[0])
ssapb[u'当月原创数'] = ssapb[u'当月总微博数']-ssapb[u'当月转发数']
linkfix = "?is_ori=1&is_forward=1&is_text=1&is_pic=1&is_video=1&is_music=1&is_article=1&key_word=&start_time=2017-05-01&end_time=2017-05-31&is_search=1&is_searchadv=1#_0"
ssapb[u'当月博文网址'] = ssapb[u'主页链接']+linkfix
allfix = "?profile_ftype=1&is_all=1#_0"
ssapb[u'全部博文网址'] = ssapb[u'主页链接']+allfix
ssapb[u'篇均点赞'] = ssapb[u'点赞数']/ssapb[u'当月总微博数']
ssapb[u'篇均转发'] = ssapb[u'转发数']/ssapb[u'当月总微博数']
ssapb[u'篇均评论'] = ssapb[u'评论数']/ssapb[u'当月总微博数']
print(ssapb.iloc[0])
ssapb.to_excel('ssapb.xlsx',index=False)
# 3.2222
gb = All.groupby(u'用户名')
gb1 = gb.size()
gbindex = gb1.index
print(gbindex,gb1)
sortAllf = All.sort_values(by=[u'用户名',u'转发数'],ascending=[True,False])
sortAllc = All.sort_values(by=[u'用户名',u'评论数'],ascending=[True,False])
sortAlll = All.sort_values(by=[u'用户名',u'点赞数'],ascending=[True,False])
mm = (sortAllf,sortAllc,sortAlll)
All_h =pd.DataFrame(np.arange(136).reshape(34,4),columns=['fh','ch','lh','max_hdd'],index=gbindex)
fh=[]
ch=[]
lh=[]
max_hdd = []
for j in range(len(mm)):
for i in gbindex:
tempdf =mm[j][mm[j][u'用户名']==i]
tempdf['hdd'] = tempdf[u'转发数']+tempdf[u'评论数']+tempdf[u'点赞数']
max_hdd.append(tempdf['hdd'].max())
tempdf['numf'] = range(len(tempdf))
if j==0:
a =len(tempdf[tempdf[u'转发数']>=tempdf['numf']+1])
fh.append(a)
elif j == 1:
b =len(tempdf[tempdf[u'评论数']>=tempdf['numf']+1])
ch.append(b)
else:
c = len(tempdf[tempdf[u'点赞数']>=tempdf['numf']+1])
lh.append(c)
All_h['fh']=fh
All_h['ch']=ch
All_h['lh']=lh
All_h['max_hdd']=max_hdd[:34]
All_h.insert(3,'HS',All_h.iloc[:,:3].mean(1))
All_h.rename(columns={'fh':u'转发h指数','ch':u'评论h指数','lh':u'点赞h指数','HS':u'综合h指数','max_hdd':u'单篇最大互动度'},inplace=True)
ssapb_All_h= pd.merge(ssapb, All_h, left_on=u'微博用户名',right_on=u'用户名',right_index=True)
ssapb_All_h[u'原创率'] = ssapb_All_h[u'当月原创数']/ssapb_All_h[u'当月总微博数']
ssapb_All_h.to_excel('ssapb_All_h.xlsx',index=False)
# 3.333
f1 = ssapb_All_h.loc[:,[u'综合h指数',u'转发h指数',u'评论h指数',u'点赞h指数']]
corr1 = f1.corr()
corr1.to_excel('corr1.xlsx')
f2 = ssapb_All_h.loc[:,[u'综合h指数',u'转发数',u'评论数',u'点赞数',u'篇均转发',u'篇均评论',u'篇均点赞']]
corr2 = f2.corr()
corr2.to_excel('corr2.xlsx')
f3 = ssapb_All_h.loc[:,[u'综合h指数',u'原创率',u'粉丝数',u'微博总数',u'单篇最大互动度']]
corr3 = f3.corr()
corr3.to_excel('corr3.xlsx')
aa =ssapb_All_h.iloc[:,[8,9,10,5,15,16,0,1,2,3,4,6,14,7,11,12,13,24,20,21,22,23,17,18,19,25]]
aa.to_excel('finally.xlsx')
# 3.44444
f = lambda x:'%.4f' % x
aa.ix[:,21:] = aa.ix[:,21:].applymap(f)
aa.ix[:,21:] = aa.ix[:,21:].astype('float64')
f1 = lambda x :'%.2f%%' % (x*100)
aa[[u'原创率']]= aa[[u'原创率']].applymap(f1)
aa.to_excel('finally1.xlsx',index=False)
aa.sort_values(by=u'综合h指数', ascending=False, inplace=True)
aa['rank'] =np.arange(34)+1
aa['rank'] = aa['rank'].astype('string_')
aa[u'综合h指数'] = aa[u'综合h指数'].astype('string_')
aa[u'综合h指数/排名'] = pd.to_numeric(aa[u'综合h指数'])+pd.to_numeric(aa['rank'])
# 4444444
aa[u'综合h指数'] = aa[u'综合h指数'].astype('float64')
aa.to_excel('finally2.xlsx',index=False)