# 爬取链家网房价数据
# 作者: Charles
# 公众号: Charles的皮卡丘
import json
from openpyxl import Workbook
import requests
from bs4 import BeautifulSoup
import os
import time
import random
# Load the mapping that main() indexes by city name to obtain a base URL.
# The context manager closes the file handle as soon as the JSON is parsed
# instead of leaving it open for the lifetime of the process.
with open('./parse/city_urls.json', 'r') as f:
    data = json.load(f)
# Save the scraped data to an Excel workbook
def save_to_excel(infos, excel_name='loupan'):
    """Write the scraped rows to ``./results/<excel_name>.xlsx``.

    Args:
        infos: Iterable of indexable records; items 0-5 of each record are
            written as one row below the header row.
        excel_name: Workbook file name without the ``.xlsx`` extension.

    Records that cannot be written (for example, fewer than six items) are
    skipped with a warning instead of aborting the whole export.
    """
    print('[INFO]:Start to save data...')
    wb = Workbook()
    ws = wb.active
    ws.append(['楼盘', '地点', '状态', '元/平(均价)', '总价(万/套起)', '建面(m^2)'])
    for info in infos:
        try:
            ws.append([info[0], info[1], info[2], info[3], info[4], info[5]])
        except Exception:
            # Best-effort export: drop the bad record and keep going, but let
            # KeyboardInterrupt / SystemExit propagate (a bare ``except``
            # would have swallowed those too).
            print('[WARNING]:One lost...')
            continue
    # exist_ok avoids the check-then-create race of exists() + mkdir().
    os.makedirs('./results', exist_ok=True)
    wb.save('./results/' + excel_name + '.xlsx')
    print('[INFO]:Data saved to excel successfully...')
# Scrape listing information
def GetInfo(url, page_num=10):
    """Scrape listing rows from pages 1..page_num of a listing URL.

    Args:
        url: Base listing URL. ``pg<N>/`` is appended to it for page N
            (main() passes a URL that ends with ``/loupan/``).
        page_num: Number of pages to fetch, starting at page 1.

    Returns:
        A list of rows, one per listing:
        ``[name, location, '<type>-<status>', unit price, total price, area]``.

    Raises:
        AssertionError: If a page does not contain the same number of name,
            area, location and price blocks.
    """
    print('[INFO]:Start to get infos...')
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36'
    }
    infos = []
    for page in range(1, page_num + 1):
        # Build every page URL from the unchanged base URL. Re-assigning
        # ``url`` here would accumulate suffixes (".../pg1/pg2/pg3/").
        page_url = url + 'pg{}/'.format(page)
        res = requests.get(page_url, headers=headers)
        soup = BeautifulSoup(res.text, 'lxml')
        resblocks = soup.find_all('div', attrs={'class': 'resblock-name'})
        resblocks_area = soup.find_all('div', attrs={'class': 'resblock-area'})
        resblocks_location = soup.find_all('div', attrs={'class': 'resblock-location'})
        resblocks_price = soup.find_all('div', attrs={'class': 'resblock-price'})
        # The four lists are matched up by position, so they must have equal
        # length. Raised explicitly so the check still runs under ``python -O``.
        if not (len(resblocks) == len(resblocks_area)
                == len(resblocks_location) == len(resblocks_price)):
            raise AssertionError(
                'Mismatched listing block counts on page {}'.format(page))
        for resblock, resblock_area, resblock_location, resblock_price in zip(
                resblocks, resblocks_area, resblocks_location, resblocks_price):
            # When the headline price is labelled per unit rather than per
            # square metre, the per-square-metre column is filled with the
            # placeholder '暂无'.
            if resblock_price.find('span', attrs={'class': 'desc'}).string.strip() == '万/套(均价)':
                main_price = '暂无'
            else:
                main_price = resblock_price.find('span', attrs={'class': 'number'}).string.strip()
            # str.strip with a character set: removes any of these characters
            # from both ends of the text.
            second_price = resblock_price.find('div', attrs={'class': 'second'}).string.strip('总价万/套起')
            location = resblock_location.find('a').string.strip()
            # Second space-separated token with its last two characters
            # removed (presumably a unit suffix - TODO confirm against the page).
            area = resblock_area.find('span').string.strip().split(' ')[1][:-2]
            resblock_name = resblock.find('a').string.strip()
            resblock_type = resblock.find('span', attrs={'class': 'resblock-type'}).string.strip()
            resblock_status = resblock.find('span', attrs={'class': 'sale-status'}).string.strip()
            infos.append([resblock_name, location, resblock_type+'-'+resblock_status, main_price, second_price, area])
        # Random pause of up to two seconds between page requests.
        time.sleep(random.random() * 2)
    return infos
# Main entry point
def main():
    """Prompt for a city and a page count, scrape the listings, and save them.

    The city name is looked up in the module-level ``data`` mapping to get
    the base URL. An unknown city or a non-integer page count prints an error
    and returns without scraping.
    """
    city_name = input('Input the city name you want to know:')
    page_num = input('Input the page num you want to get:')
    try:
        city_url = data[city_name] + '/loupan/'
    except (KeyError, TypeError):
        # KeyError: city not in the mapping; TypeError: mapped value cannot
        # be concatenated with a string.
        print('[Error]: City name parse error...')
        return
    try:
        page_num = int(page_num)
    except ValueError:
        print('[Error]: Page number should be number...')
        return
    infos = GetInfo(city_url, page_num)
    save_to_excel(infos, excel_name=city_name)
if __name__ == '__main__':
    # Keep prompting for another city until the user stops the program.
    while True:
        try:
            main()
        except (KeyboardInterrupt, EOFError):
            # Ctrl+C or end of input is the only way out of this loop;
            # leave it without dumping a traceback.
            break
Python爬取房价信息
3星 · 超过75%的资源 需积分: 36 99 浏览量
2018-12-27
08:18:15
上传
评论 17
收藏 483KB ZIP 举报
lamjj111
- 粉丝: 0
- 资源: 4
最新资源
- Python大作业:音乐播放软件(爬虫+可视化+数据分析+数据库)
- 课程设计-python爬虫-爬取日报,爬取日报文章后存储到本地,附带源代码+课程设计报告
- 软件和信息技术服务行业投资与前景预测.pptx
- 课程设计-基于SpringBoot + Mybatis+python爬虫NBA球员数据爬取可视化+源代码+文档+sql+效果图
- 软件品质管理系列二项目策划规范.doc
- 基于TensorFlow+PyQt+GUI的酒店评论情感分析,支持分析本地数据文件和网络爬取数据分析+源代码+文档说明+安装教程
- 软件定义无线电中的模拟电路测试技术.pptx
- 软件开发协议(作为技术开发合同附件).doc
- 软件开发和咨询行业技术趋势分析.pptx
- 软件测试题详解及答案.doc
资源上传下载、课程学习等过程中有任何疑问或建议,欢迎提出宝贵意见哦~我们会及时处理!
点击此处反馈
评论1