from os import path
from wordcloud import WordCloud, ImageColorGenerator
import jieba.analyse
import matplotlib.pyplot as plt
from scipy.misc import imread
baseUrl = "http://gz.zu.fang.com"
import time
from pymongo import MongoClient
class Analycis:
def __init__(self, mongoUri='mongodb://localhost:27017/'):
    """Open the MongoDB connection used by every query method.

    Args:
        mongoUri: Connection string passed to ``MongoClient``. Defaults to
            the local server this class was originally hard-wired to, so
            existing ``Analycis()`` calls behave exactly as before.
    """
    self.client = MongoClient(mongoUri)
    # Every collection queried by this class lives in the "gzzf2" database.
    self.gzzf2 = self.client.gzzf2
# Maps each district's Chinese name to its pinyin key (the original note says
# the mapping is used as the crawler's index). Within this class the values
# are used as MongoDB collection names: getPinyin/getAvgPrice and
# getAnalycisNum look up self.gzzf2[<pinyin>]. "不限" maps to "rent", the
# collection the non-per-district queries read from.
pinyinDir = {
"不限": "rent",
"天河": "tianhe",
"番禺": "panyu",
"海珠": "haizhu",
"白云": "baiyun",
"越秀": "yuexiu",
"花都": "huadu",
"增城": "zengcheng",
"荔湾": "liwan",
"黄埔": "huangpu",
"南沙": "nansha",
"从化": "conghua",
}
def getAreaList(self):
    """Return the district names the per-district statistics iterate over.

    The catch-all "不限" entry is intentionally not part of the result.
    A new list object is built on every call, so callers may mutate it.
    """
    districts = (
        "天河",
        "番禺",
        "海珠",
        "白云",
        "越秀",
        "花都",
        "增城",
        "荔湾",
        "黄埔",
        "南沙",
        "从化",
    )
    return list(districts)
# 获取区的拼音
def getPinyin(self, region):
    """Return the pinyin key registered for ``region`` in ``pinyinDir``.

    Args:
        region: District name as used for the keys of ``self.pinyinDir``.

    Returns:
        The pinyin string mapped to ``region``.

    Raises:
        KeyError: If ``region`` has no entry. The message
            "no such region pinyin" is still printed first, but the lookup
            failure is re-raised instead of falling through to a ``return``
            of a variable that was never assigned.
    """
    try:
        return self.pinyinDir[region]
    except KeyError:
        print("no such region pinyin")
        raise
# 求一个区的 元/平方米 的平均数
def getAvgPrice(self, region):
    """Return summed price divided by summed area for one district.

    The district's collection is ``self.gzzf2[<pinyin of region>]``; the
    ``price`` and ``area`` fields are summed per ``region`` group and the
    first group returned is used.

    Args:
        region: District name understood by ``getPinyin``.

    Returns:
        ``total_price / total_area`` of the first group.

    Raises:
        IndexError: If the aggregation yields no rows (empty collection).
        ZeroDivisionError: If the summed area is zero.
    """
    areaPinYin = self.getPinyin(region=region)
    collection = self.gzzf2[areaPinYin]
    print(region)
    # Both sums are computed in ONE pipeline so they are guaranteed to come
    # from the same '$region' group; two independent aggregations give no
    # guarantee that their first rows describe the same group.
    cursor = collection.aggregate([
        {'$group': {
            '_id': '$region',
            'total_price': {'$sum': '$price'},
            'total_area': {'$sum': '$area'},
        }}
    ])
    totals = next(iter(cursor), None)
    if totals is None:
        # Same exception type the old ``list(cursor)[0]`` produced, but with
        # a message that says what went wrong.
        raise IndexError("no listings found for region %r" % (region,))
    print(totals["total_price"])
    return totals["total_price"] / totals["total_area"]
# 获取各个区 每个月一平方米需要多少钱
def getTotalAvgPrice(self):
    """Return one chart entry per district with its average unit price.

    Returns:
        A list of dicts, one per name from ``getAreaList()`` and in the same
        order, each shaped ``{"value": <price>, "name": "<district> <price>"}``
        where ``<price>`` is ``getAvgPrice(district)`` rounded to 3 decimals.
    """
    areas = self.getAreaList()
    print(areas)
    totalAvgPriceDirList = []
    for region in areas:
        # Round once and reuse the value for both the number and the label.
        avgPrice = round(self.getAvgPrice(region), 3)
        totalAvgPriceDirList.append({"value": avgPrice, "name": region + " " + str(avgPrice)})
    return totalAvgPriceDirList
# 获取各个区 每一天一平方米需要多少钱
def getTotalAvgPricePerDay(self):
    """Return the district names alongside their per-day average unit price.

    Each district's ``getAvgPrice`` result is divided by 30 and rounded to
    3 decimals.

    Returns:
        ``(names, prices)`` where ``names`` is ``getAreaList()`` and
        ``prices`` is the matching list of per-day values.
    """
    dailyPrices = [
        round(self.getAvgPrice(district) / 30, 3)
        for district in self.getAreaList()
    ]
    return (self.getAreaList(), dailyPrices)
# 获取各区统计数据量
def getAnalycisNum(self):
    """Return how many documents each district's collection holds.

    Returns:
        ``(names, counts)`` where ``names`` is ``getAreaList()`` and
        ``counts[i]`` is the number of documents in the collection named
        ``pinyinDir[names[i]]``. A collection with no documents counts as 0.
    """
    areas = self.getAreaList()
    analycisList = []
    for region in areas:
        collection = self.gzzf2[self.pinyinDir[region]]
        print(region)
        cursor = collection.aggregate([{'$group': {'_id': '', 'total_num': {'$sum': 1}}}])
        # '$group' emits no row at all when nothing was fed into it, so an
        # empty collection must be reported as 0 rather than indexed.
        firstRow = next(iter(cursor), None)
        analycisList.append(firstRow["total_num"] if firstRow is not None else 0)
    print(analycisList)
    return (areas, analycisList)
# 获取各个区的房源比重
def getAreaWeight(self):
    """Return the number of listings per district in the "rent" collection.

    Documents are grouped by their ``region`` field; groups whose key is not
    one of the names from ``getAreaList()`` are ignored.

    Returns:
        ``(names, weights)``: parallel lists, in the order the aggregation
        returned the groups.
    """
    # The district list does not change while iterating, so fetch it once
    # instead of rebuilding it for every aggregation row.
    knownAreas = self.getAreaList()
    result = self.gzzf2.rent.aggregate([{'$group': {'_id': '$region', 'weight': {'$sum': 1}}}])
    areaName = []
    areaWeight = []
    for item in result:
        if item["_id"] not in knownAreas:
            continue
        areaWeight.append(item["weight"])
        areaName.append(item["_id"])
        print(item["_id"])
        print(item["weight"])
    return (areaName, areaWeight)
# 获取 title 数据,用于构建词云
def getTitle(self, limit=1000):
    """Concatenate listing titles from the "rent" collection into one string.

    The result is meant as raw text for building a word cloud (per the
    original note on this method).

    Args:
        limit: Maximum number of documents to read. Defaults to 1000, the
            value that used to be hard-coded.

    Returns:
        All retrieved titles joined together with no separator. Documents
        that have no ``title`` (or a null one) are skipped.
    """
    collection = self.gzzf2["rent"]
    queryArgs = {}
    projectionFields = {'_id': False, 'title': True}  # return only the title field
    searchRes = collection.find(queryArgs, projection=projectionFields).limit(limit)
    titles = []
    for result in searchRes:
        title = result.get("title")
        if title is None:
            # With this projection a document lacking the field comes back
            # without a "title" key; skip it instead of crashing.
            continue
        print(title)
        titles.append(title)
    # Join once at the end rather than growing a string inside the loop.
    return ''.join(titles)
# 获取户型数据(3 室 2 厅)
def getRooms(self):
    """Return the listing count for every distinct ``rooms`` value.

    Documents of the "rent" collection are grouped by their ``rooms`` field.

    Returns:
        ``(room_values, counts)``: parallel lists, in the order the
        aggregation returned the groups.
    """
    pipeline = [{'$group': {'_id': '$rooms', 'weight': {'$sum': 1}}}]
    groups = list(self.gzzf2.rent.aggregate(pipeline))
    layouts = [group["_id"] for group in groups]
    counts = [group["weight"] for group in groups]
    return (layouts, counts)
# 获取租房面积
def getAcreage(self):
    """Count "rent" listings per floor-area bucket.

    Returns:
        ``(attr, value)``: ``attr`` is the list of bucket labels and
        ``value`` the matching list of document counts. A bucket with no
        matching documents is reported as 0.
    """
    # (label, exclusive lower bound, inclusive upper bound) on the ``area``
    # field. The last bucket is labelled "400+" but is capped at 10000, and
    # areas <= 0 fall into no bucket -- both exactly as before.
    buckets = [
        ("0-30平方米", 0, 30),
        ("30-60平方米", 30, 60),
        ("60-90平方米", 60, 90),
        ("90-120平方米", 90, 120),
        ("120-200平方米", 120, 200),
        ("200-300平方米", 200, 300),
        ("300-400平方米", 300, 400),
        ("400+平方米", 400, 10000),
    ]
    attr = []
    value = []
    for label, low, high in buckets:
        cursor = self.gzzf2.rent.aggregate([
            {'$match': {'area': {'$gt': low, '$lte': high}}},
            {'$group': {'_id': '', 'count': {'$sum': 1}}}
        ])
        # '$group' returns no row when '$match' let nothing through, so an
        # empty bucket has to default to 0 instead of being indexed.
        firstRow = next(iter(cursor), None)
        attr.append(label)
        value.append(firstRow["count"] if firstRow is not None else 0)
    return (attr, value)
# 展示饼图
def showPie1(self, title, attr, value):
    """Render a pie chart to the fixed listing-distribution HTML file.

    Args:
        title: Chart title passed to ``Pie``.
        attr: Slice labels.
        value: Slice values, parallel to ``attr``.
    """
    from pyecharts import Pie
    outputFile = r"c:\Users\Administrator\graduation design\spiderResult1\广州房源分布统计.html"
    chart = Pie(title)
    chart.add("房源分布", attr, value, is_label_show=True)
    chart.render(outputFile)
def showPie2(self, title, attr, value):
    """Render a pie chart to the fixed rental-area-statistics HTML file.

    Args:
        title: Chart title passed to ``Pie``.
        attr: Slice labels.
        value: Slice values, parallel to ``attr``.
    """
    from pyecharts import Pie
    outputFile = r"c:\Users\Administrator\graduation design\spiderResult1\租房面积统计.html"
    chart = Pie(title)
    chart.add("房源分布", attr, value, is_label_show=True)
    chart.render(outputFile)
# 展示矩形树图
def showTreeMap(self, title, data):
from pyecharts import TreeMap
data = data
treemap = TreeMap(title, width=1200, height=600)
treemap.add("广州", data, is_label_show=True, label_pos='inside', l
不走小道
- 粉丝: 3371
- 资源: 5054
最新资源
- Java+Swing+Mysql的物资信息管理系统源码+文档说明(高分项目)
- (175345440)校园社区跑腿小程序源码.rar
- (175860602)基于51单片机的LCD1602矩阵键盘密码锁(proteus仿真设计)
- (176103642)「数学建模MATLAB必备程序源代码」方程求根源代码
- MATLAB代码:基于列约束生成法CCG的两阶段鲁棒问题求解 关键词:两阶段鲁棒 列约束生成法 CCG算法 鲁棒优化 参考文档:Solving two-stage robust optimizati
- (176167648)基于php+mysql的简易学生信息管理系统.zip
- 20232319 陈正勇.zip
- (176423806)Matlab与数学建模.doc
- (176962054)微同商城开源微信小程序商城(前后端开源:uniapp+Java) 快速搭建一个属于自己的微信小程序商城
- (177391846)毕业设计基于SpringBoot的在线拍卖系统源码含文档
- 酒店预订数据集.zip
- 基于粒子群算法的配电网无功优化 基于IEEE33节点配电网,以无功补偿器的接入位置和容量作为优化变量,以牛拉法进行潮流计算,以配电网网损最小为优化目标,通过优化求解,得到最佳接入位置和容量,优化结果如
- (177400018)pl2303USB转串口线驱动程序.zip
- (177488642)兼容在线/离线身份证读卡插件Windows PC 端SDK
- 中移M5311模块MQTT协议连接阿里云物联网平台(干货)
- (177506410)PHP学生管理系统 .zip
资源上传下载、课程学习等过程中有任何疑问或建议,欢迎提出宝贵意见哦~我们会及时处理!
点击此处反馈