import csv
import random
import re
import time

import requests
from lxml import etree
# Pool of free HTTP proxies; each request draws one at random. (A dict that
# repeats the key 'http' for every entry would silently keep only the last
# proxy, so a plain list is used instead, with duplicate entries removed.)
proxy_list = [
    'http://113.128.28.170:9999',
    'http://47.107.93.105:8000',
    'http://123.163.122.189:9999',
    'http://182.61.109.24:8888',
    'http://123.54.44.220:9999',
    'http://116.208.102.146:61234',
    'http://122.4.49.199:9999',
    'http://218.27.84.219:80',
    'http://112.12.91.78:8888',
    'http://175.20.199.80:8080',
    'http://183.146.29.47:8888',
    'http://183.146.29.211:8888',
    'http://121.40.162.239:808',
    'http://123.139.56.238:9999',
    'http://112.12.91.34:8888',
    'http://121.10.139.191:3128',
    'http://112.12.91.26:8888',
    'http://117.127.16.206:8080',
    'http://183.146.29.215:8888',
    'http://27.203.209.148:8060',
    'http://183.146.29.29:8888',
    'http://121.227.80.115:8118',
    'http://115.208.18.139:61234',
    'http://183.6.183.35:3128',
    'http://117.28.96.23:9999',
    'http://121.40.64.214:80',
    'http://121.10.139.204:3128',
    'http://106.14.184.255:80',
    'http://112.12.91.210:8888',
    'http://220.133.218.213:60241',
    'http://122.4.41.161:9999',
    'http://1.194.118.51:9999',
    'http://101.132.193.192:8118',
    'http://112.12.91.43:8888',
    'http://221.6.201.18:9999',
    'http://118.78.196.7:8118',
    'http://180.160.54.117:8118',
    'http://112.85.129.171:9999',
    'http://39.137.69.6:80',
    'http://116.62.205.9:3128',
    'http://175.16.9.218:80',
    'http://121.236.73.101:61234',
    'http://171.11.32.17:9999',
    'http://42.238.81.226:9999',
    'http://39.137.69.7:8080',
    'http://183.146.29.220:8888',
    'http://183.146.29.216:8888',
    'http://125.111.137.36:8088',
    'http://183.146.29.34:8888',
    'http://42.229.189.66:8060',
    'http://183.146.29.240:8888',
    'http://112.12.91.74:8888',
    'http://1.198.72.196:9999',
    'http://112.85.129.183:9999',
    'http://123.54.224.95:9999',
    'http://182.116.229.53:9999',
    'http://1.198.73.16:9999',
    'http://61.128.208.94:3128',
    'http://112.87.69.179:9999',
    'http://113.128.25.245:61234',
    'http://1.198.72.207:9999',
    'http://123.54.47.81:9999',
    'http://171.11.29.187:9999',
    'http://171.12.113.153:9999',
    'http://112.12.91.241:8888',
    'http://183.146.29.208:8888',
    'http://112.12.91.209:8888',
    'http://118.31.64.170:3128',
    'http://113.110.47.37:61234',
    'http://39.135.24.11:80',
    'http://183.146.29.36:8888',
    'http://183.146.29.59:8888',
    'http://123.54.251.67:9999',
    'http://106.52.181.184:80',
    'http://110.86.139.23:9999',
    'http://218.60.8.99:3129',
]
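
# Hedged helper, not in the original file: free proxy lists go stale quickly,
# so a quick liveness probe before crawling can avoid many failed requests.
# The probe URL (httpbin.org) is an assumption; any stable endpoint works.
def check_proxy(proxy, timeout=5):
    # Return True if the proxy can complete a simple GET within the timeout.
    try:
        r = requests.get('http://httpbin.org/ip',
                         proxies={'http': proxy}, timeout=timeout)
        return r.status_code == 200
    except requests.RequestException:
        return False
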
user_agent = [
"Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)",
"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
"Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
"Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
"Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
"Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
"Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
"Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
"Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
"Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52"
]
headers={"User-Agent": random.choice(user_agent)}
class lianjia(object):
    headers_list = ['名称', '地址', '社区名称', '楼层', '标签', '关注人数', '发布时间', '平方价格/万', '总价/元']

    def __init__(self, url):
        self.url = url
        self.list = []

    def get_adders_many_html(self, url):
        # Fetch the HTML of a city-level listing page.
        r = requests.get(url, headers=headers)
        return r.text
    def get_adders_html_url(self, html):
        # Extract the URL and name of every sub-district from the city page.
        xml = etree.HTML(html)
        urls = xml.xpath('//div[@data-role="ershoufang"]/div/a/@href')[6:-1]
        title = xml.xpath('//div[@data-role="ershoufang"]/div/a/text()')[6:-1]
        # Example result (Hefei): [('/ershoufang/konggangjingjishifanqu/', '空港经济示范区'),
        # ('/ershoufang/shushan/', '蜀山'), ('/ershoufang/luyang/', '庐阳'), ...,
        # ('/ershoufang/feixi/', '肥西'), ('/ershoufang/changfeng/', '长丰')]
        print(list(zip(urls, title)))
        return zip(urls, title)
    def get_url_html(self, url):
        # Fetch one listing page through a randomly chosen proxy.
        try:
            r = requests.get(url, headers=headers,
                             proxies={'http': random.choice(proxy_list)})
            if r.status_code == 200:
                return r.text
            else:
                print(r.status_code)
        except Exception as e:
            print('Request error:', e)
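
    def get_url_html_retry(self, url, attempts=3):
        # Hedged helper, not in the original file: free proxies fail often,
        # so retry a page a few times, drawing a fresh proxy on each attempt.
        for _ in range(attempts):
            html = self.get_url_html(url)
            if html:
                return html
            time.sleep(random.uniform(1, 2))  # brief pause between attempts
        return None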
    def save_csv_headers(self, title):
        # Write the column-header row first; 'gbk' matches Excel's default
        # on Chinese Windows ('utf-8-sig' is the portable alternative).
        with open('链家_{}.csv'.format(title), 'a+', encoding='gbk', newline='') as f:
            write = csv.DictWriter(f, self.headers_list)
            write.writeheader()
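
# The uploaded LianJiaSpidercsv.py (12 KB) continues past this point; the
# driver below is only a hedged sketch of how the visible methods chain
# together. The Hefei entry URL and the crawl delay are assumptions, not the
# author's code.
if __name__ == '__main__':
    spider = lianjia('https://hf.lianjia.com/ershoufang/')  # assumed entry URL
    index_html = spider.get_adders_many_html(spider.url)
    for sub_url, district in spider.get_adders_html_url(index_html):
        spider.save_csv_headers(district)  # header row for this district's CSV
        page_html = spider.get_url_html_retry('https://hf.lianjia.com' + sub_url)
        # Parsing page_html into rows lives in the truncated remainder of the file.
        time.sleep(random.uniform(1, 3))  # polite delay between districts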