import csv
import random
import re
import time

import requests
from lxml import etree
# Pool of free HTTP proxies; each request draws one at random. (A dict that
# repeats the key 'http' for every entry would silently keep only the last
# proxy, so a plain list is used instead, with duplicate entries removed.)
proxy_list = [
    'http://113.128.28.170:9999',
    'http://47.107.93.105:8000',
    'http://123.163.122.189:9999',
    'http://182.61.109.24:8888',
    'http://123.54.44.220:9999',
    'http://116.208.102.146:61234',
    'http://122.4.49.199:9999',
    'http://218.27.84.219:80',
    'http://112.12.91.78:8888',
    'http://175.20.199.80:8080',
    'http://183.146.29.47:8888',
    'http://183.146.29.211:8888',
    'http://121.40.162.239:808',
    'http://123.139.56.238:9999',
    'http://112.12.91.34:8888',
    'http://121.10.139.191:3128',
    'http://112.12.91.26:8888',
    'http://117.127.16.206:8080',
    'http://183.146.29.215:8888',
    'http://27.203.209.148:8060',
    'http://183.146.29.29:8888',
    'http://121.227.80.115:8118',
    'http://115.208.18.139:61234',
    'http://183.6.183.35:3128',
    'http://117.28.96.23:9999',
    'http://121.40.64.214:80',
    'http://121.10.139.204:3128',
    'http://106.14.184.255:80',
    'http://112.12.91.210:8888',
    'http://220.133.218.213:60241',
    'http://122.4.41.161:9999',
    'http://1.194.118.51:9999',
    'http://101.132.193.192:8118',
    'http://112.12.91.43:8888',
    'http://221.6.201.18:9999',
    'http://118.78.196.7:8118',
    'http://180.160.54.117:8118',
    'http://112.85.129.171:9999',
    'http://39.137.69.6:80',
    'http://116.62.205.9:3128',
    'http://175.16.9.218:80',
    'http://121.236.73.101:61234',
    'http://171.11.32.17:9999',
    'http://42.238.81.226:9999',
    'http://39.137.69.7:8080',
    'http://183.146.29.220:8888',
    'http://183.146.29.216:8888',
    'http://125.111.137.36:8088',
    'http://183.146.29.34:8888',
    'http://42.229.189.66:8060',
    'http://183.146.29.240:8888',
    'http://112.12.91.74:8888',
    'http://1.198.72.196:9999',
    'http://112.85.129.183:9999',
    'http://123.54.224.95:9999',
    'http://182.116.229.53:9999',
    'http://1.198.73.16:9999',
    'http://61.128.208.94:3128',
    'http://112.87.69.179:9999',
    'http://113.128.25.245:61234',
    'http://1.198.72.207:9999',
    'http://123.54.47.81:9999',
    'http://171.11.29.187:9999',
    'http://171.12.113.153:9999',
    'http://112.12.91.241:8888',
    'http://183.146.29.208:8888',
    'http://112.12.91.209:8888',
    'http://118.31.64.170:3128',
    'http://113.110.47.37:61234',
    'http://39.135.24.11:80',
    'http://183.146.29.36:8888',
    'http://183.146.29.59:8888',
    'http://123.54.251.67:9999',
    'http://106.52.181.184:80',
    'http://110.86.139.23:9999',
    'http://218.60.8.99:3129',
]
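
# Hedged helper, not in the original file: free proxy lists go stale quickly,
# so a quick liveness probe before crawling can avoid many failed requests.
# The probe URL (httpbin.org) is an assumption; any stable endpoint works.
def check_proxy(proxy, timeout=5):
    # Return True if the proxy can complete a simple GET within the timeout.
    try:
        r = requests.get('http://httpbin.org/ip',
                         proxies={'http': proxy}, timeout=timeout)
        return r.status_code == 200
    except requests.RequestException:
        return False
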
user_agent = [
"Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)",
"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
"Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
"Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
"Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
"Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
"Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
"Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
"Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
"Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52"
]
headers={"User-Agent": random.choice(user_agent)}
class lianjia(object):
    headers_list = ['名称', '地址', '社区名称', '楼层', '标签', '关注人数', '发布时间', '平方价格/万', '总价/元']

    def __init__(self, url):
        self.url = url
        self.list = []

    def get_adders_many_html(self, url):
        # Fetch the HTML of a city-level listing page.
        r = requests.get(url, headers=headers)
        return r.text
    def get_adders_html_url(self, html):
        # Extract the URL and name of every sub-district from the city page.
        xml = etree.HTML(html)
        urls = xml.xpath('//div[@data-role="ershoufang"]/div/a/@href')[6:-1]
        title = xml.xpath('//div[@data-role="ershoufang"]/div/a/text()')[6:-1]
        # Example result (Hefei): [('/ershoufang/konggangjingjishifanqu/', '空港经济示范区'),
        # ('/ershoufang/shushan/', '蜀山'), ('/ershoufang/luyang/', '庐阳'), ...,
        # ('/ershoufang/feixi/', '肥西'), ('/ershoufang/changfeng/', '长丰')]
        print(list(zip(urls, title)))
        return zip(urls, title)
    def get_url_html(self, url):
        # Fetch one listing page through a randomly chosen proxy.
        try:
            r = requests.get(url, headers=headers,
                             proxies={'http': random.choice(proxy_list)})
            if r.status_code == 200:
                return r.text
            else:
                print(r.status_code)
        except Exception as e:
            print('Request error:', e)
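
    def get_url_html_retry(self, url, attempts=3):
        # Hedged helper, not in the original file: free proxies fail often,
        # so retry a page a few times, drawing a fresh proxy on each attempt.
        for _ in range(attempts):
            html = self.get_url_html(url)
            if html:
                return html
            time.sleep(random.uniform(1, 2))  # brief pause between attempts
        return None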
    def save_csv_headers(self, title):
        # Write the column-header row first; 'gbk' matches Excel's default
        # on Chinese Windows ('utf-8-sig' is the portable alternative).
        with open('链家_{}.csv'.format(title), 'a+', encoding='gbk', newline='') as f:
            write = csv.DictWriter(f, self.headers_list)
            write.writeheader()
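
# The uploaded LianJiaSpidercsv.py (12 KB) continues past this point; the
# driver below is only a hedged sketch of how the visible methods chain
# together. The Hefei entry URL and the crawl delay are assumptions, not the
# author's code.
if __name__ == '__main__':
    spider = lianjia('https://hf.lianjia.com/ershoufang/')  # assumed entry URL
    index_html = spider.get_adders_many_html(spider.url)
    for sub_url, district in spider.get_adders_html_url(index_html):
        spider.save_csv_headers(district)  # header row for this district's CSV
        page_html = spider.get_url_html_retry('https://hf.lianjia.com' + sub_url)
        # Parsing page_html into rows lives in the truncated remainder of the file.
        time.sleep(random.uniform(1, 3))  # polite delay between districts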