# -*- coding: utf-8 -*-
import pandas as pd
import numpy as np
# Column names for movies.dat; the file is read without a header row.
mnames = ['movieid', 'title', 'genres']
# Fields are separated by '::'; a multi-character separator is handled by
# the python parsing engine.
movie = pd.read_csv(
    'movies.dat',
    sep='::',
    names=mnames,
    header=None,
    engine='python',
)
print('----------------------------------------------------------------------------')
print("examples of movies:\n")
print(movie.head())
print('\nthe length of movies data:%d' % movie.shape[0])
print('----------------------------------------------------------------------------\n')
# Column names for users.dat; the file is read without a header row.
unames = ['userid', 'gender', 'age', 'occupation', 'zip']
# Same '::'-separated layout as the other .dat files.
user = pd.read_csv(
    'users.dat',
    sep='::',
    names=unames,
    header=None,
    engine='python',
)
print('----------------------------------------------------------------------------')
print("examples of users:\n")
print(user.head())
print('\nthe length of users data:%d ' % user.shape[0])
print('----------------------------------------------------------------------------\n')
# Column names for ratings.dat; the file is read without a header row.
rnames = ['userid', 'movieid', 'ratings', 'timestamp']
# Same '::'-separated layout as the other .dat files.
rating = pd.read_csv(
    'ratings.dat',
    sep='::',
    names=rnames,
    header=None,
    engine='python',
)
print('----------------------------------------------------------------------------')
print("examples of ratings:\n")
print(rating.head())
print('\nthe length of ratings data:%d ' % rating.shape[0])
print('----------------------------------------------------------------------------\n')
# Combine the three tables into a single frame: ratings are joined to users
# on 'userid' and the result is joined to movies on 'movieid' (the only
# column each pair of tables has in common).
ratings_with_users = pd.merge(rating, user, on='userid')
data = pd.merge(ratings_with_users, movie, on='movieid')
del ratings_with_users  # intermediate only; keep the module namespace unchanged
print('----------------------------------------------------------------------------')
print("examples of datas:\n")
print(data.head())
print('----------------------------------------------------------------------------\n')
# Average rating of every movie split by gender: one row per title, one
# column per gender value; title/gender combinations with no ratings are
# filled with 0.
# The aggregation is given by name ('mean'): recent pandas versions emit a
# FutureWarning when the NumPy callable np.mean is passed instead.
meanratings = pd.pivot_table(
    data,
    values='ratings',
    index=['title'],
    columns=['gender'],
    aggfunc='mean',
    fill_value=0,
)
print('----------------------------------------------------------------------------')
print('The average score of each film (by Gender):')
print(meanratings[:10])
print('----------------------------------------------------------------------------\n')
# Keep only the movies that have more than 100 ratings, then rank those
# movies by their per-gender average score.
movie_rating = data.groupby('title').size()
movie_ix = movie_rating[movie_rating > 100].index
mrating = meanratings.loc[movie_ix]
# Movies rated highest by women (column 'F').
print('----------------------------------------------------------------------------')
print('Women favorite movies TOP10: ')
print(mrating.sort_values(by='F', ascending=False).head(10))
print('----------------------------------------------------------------------------\n')
# Movies rated highest by men (column 'M').
print('----------------------------------------------------------------------------')
print('Men favorite movies TOP10: ')
print(mrating.sort_values(by='M', ascending=False).head(10))
print('----------------------------------------------------------------------------\n')
# Movies on which women and men disagree the most: the absolute gap between
# the two per-gender averages is stored in a new 'diff' column (not printed).
mrating['diff'] = (mrating['F'] - mrating['M']).abs()
# Most divisive movies regardless of gender, measured by the standard
# deviation of all ratings given to a title.
rating_std = data.groupby(by=['title'])['ratings'].std()
# Restrict to the titles that passed the more-than-100-ratings filter.
rating_std_n = rating_std.loc[movie_ix]
# Ten titles with the largest spread; a NaN deviation is treated as 0.
# The result is bound to a name because a bare expression at script level
# is evaluated and then silently thrown away.
rating_std_top10 = rating_std_n.fillna(0).sort_values(ascending=False)[:10]
# Most-rated ("hottest") movies: number of ratings per title.
ratings_by_title = data.groupby('title').size()
print('----------------------------------------------------------------------------')
print('The hottest movie (the most rated movie TOP10):')
print(ratings_by_title.sort_values(ascending=False).head(10))
print('----------------------------------------------------------------------------\n')
# Highest-scoring movies: mean rating per title.
# This is deliberately a Series indexed by title rather than a one-column
# DataFrame: DataFrame.sort_values() requires a `by` argument, and indexing
# a DataFrame with title labels would look them up as column names.
mean_ratings = data.groupby('title')['ratings'].mean()
# Rank the titles by their mean rating.
print('---------------------------------------------------------------------------------')
print('High score movie TOP20')
print(mean_ratings.sort_values(ascending=False).head(20))
print('----------------------------------------------------------------------------------\n')
# Movies that are both popular and well rated:
# first keep the titles with more than 1000 ratings ...
hot_movies = ratings_by_title[ratings_by_title > 1000]
# ... then rank those titles by their mean rating.
hot_movies_rating = mean_ratings.loc[hot_movies.index]
top_10_good_movies = hot_movies_rating.sort_values(ascending=False)
print('--------------------------------------------------------------------------------------------')
print('Good movie TOP10(high score and large score):')
print(top_10_good_movies.head(10))
print('--------------------------------------------------------------------------------------------\n')
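# ---------------------------------------------------------------------------
# The text below is residue from the download page this script was copied
# from; it is not part of the program and is kept only as comments.
#
# Xidian University data-mining course project: movie rating data analysis
# 2 stars · points required: 43 · 104 views
# Uploaded 2018-12-11 22:45:06
# Comments: 19
# Favorite · 5.73MB ZIP · Report
# Uploader: xaohuicai
# - Followers: 5
# - Resources: 7
# Latest resources
# - C basics - LeetCode solutions in C, problem 39 (Combination Sum).zip
# - C basics - LeetCode solutions in C, problem 38 (Count and Say).zip
# - C basics - LeetCode solutions in C, problem 37 (Sudoku Solver).zip
# - C basics - LeetCode solutions in C, problem 36 (Valid Sudoku).zip
# - C basics - LeetCode solutions in C, problem 35 (Search Insert Position).zip
# - index.wxml
# - C basics - LeetCode solutions in C, problem 33 (Search in Rotated Sorted Array).zip
# - Source code of a handwritten digit recognition system implemented in Python.zip
# - Extracting text from web pages that forbid copying
# - C basics - LeetCode solutions in C, problem 32 (Longest Valid Parentheses).zip
# If you have any questions or suggestions about uploading or downloading
# resources, course study, etc., your feedback is welcome and will be handled promptly.
# Click here to give feedback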