# -*- coding: utf-8 -*-
"""
Created on Mon Jul 20 17:35:39 2020
@author: Administrator
"""
import requests
import os
import time
from lxml import etree
import threading
import random
# Browser-like User-Agent so the site serves normal HTML instead of blocking us.
headers = {
'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36'
}
# Root directory where all downloaded images and progress files are stored.
root ="webpic/"
# Folder name for each of the 5 product categories; index 0 is a placeholder
# so indices 1..5 line up with the user-entered category number.
catalog=['','catalog1','catalog2','catalog3','catalog4','catalog5']
# Per-category text files recording which listing pages were already crawled
# (one page number per line; index 0 is again a placeholder).
pass_page=['','pass_page1.txt','pass_page2.txt','pass_page3.txt','pass_page4.txt','pass_page5.txt']
"""初始化"""
def initialization():
    """Create the storage directory tree and per-category progress files.

    Ensures the root folder exists, then for each category 1..5 creates an
    empty progress-record text file (``pass_page[i]``) and an image folder
    (``catalog[i]``) if they are missing.  Reads the module-level ``root``,
    ``pass_page`` and ``catalog`` constants.
    """
    if not os.path.exists(root):
        os.mkdir(root)
    for i in range(1, 6):
        # Create an empty progress file if missing; use a context manager so
        # the handle is closed (the original `open(...)` leaked it).
        if not os.path.exists(root + pass_page[i]):
            with open(root + pass_page[i], 'a+'):
                pass
        # Create the category's image folder if missing.
        if not os.path.exists(root + catalog[i]):
            os.mkdir(root + catalog[i])
"""当前商品链接"""
def commodity(url,c_whichcatalog):
try:
#请求当前商品链接
r = requests.get(url,headers = headers)
r.encoding='utf-8'
s = r.text
selector = etree.HTML(s)
#解析图片链接
img = selector.xpath('//*[@id="app"]/div/div[1]/main/div/div/div/div[2]/div[1]/div[1]/div[1]/div[1]/div[1]/div[2]/div/div[2]/img/@src')
#解析标题
title = selector.xpath('//*[@id="app"]/div/div[1]/main/div/div/div/div[2]/div[1]/div[2]/div/div/div/div[1]/h1/text()')
c = requests.get(img[0],headers = headers)
#保存图片,后缀名为jpg
#判断图片是否存在,不存在则下载
if not os.path.exists(root+catalog[int(c_whichcatalog)]+"/"+title[0]+".jpg"):
with open(root+catalog[int(c_whichcatalog)]+"/"+title[0]+".jpg","wb") as f:
f.write(c.content)
f.close()
except:
#重试再次请求图片链接
try:
#请求当前商品链接
r = requests.get(url,headers = headers)
r.encoding='utf-8'
s = r.text
selector = etree.HTML(s)
#解析图片链接
img = selector.xpath('//*[@id="app"]/div/div[1]/main/div/div/div/div[2]/div[1]/div[1]/div[1]/div[1]/div[1]/div[2]/div/div[2]/img/@src')
#解析标题
title = selector.xpath('//*[@id="app"]/div/div[1]/main/div/div/div/div[2]/div[1]/div[2]/div/div/div/div[1]/h1/text()')
c = requests.get(img[0],headers = headers)
#保存图片,后缀名为jpg
#判断图片是否存在,不存在则下载
if not os.path.exists(root+catalog[int(c_whichcatalog)]+"/"+title[0]+".jpg"):
with open(root+catalog[int(c_whichcatalog)]+"/"+title[0]+".jpg","wb") as f:
f.write(c.content)
f.close()
except:
pass
"""当前大类链接"""
def categories(url,whichcatalog,page):
try:
#请求当前大类链接
r = requests.get(url,headers = headers)
r.encoding='utf-8'
s = r.text
selector = etree.HTML(s)
#遍历当前页的所有商品
for item in selector.xpath('//*[@id="SearchResultsGrid"]/a'):
href = item.xpath('.//@href')
print(href[0])
#多线程下载图片
threading.Thread(target=commodity,args=(href[0],whichcatalog,)).start()
#随机延时1~2秒,防止网站禁止ip
#time.sleep(int(random.randint(1,2)))
#如果网速稳定可以注释掉time.sleep(2)
#time.sleep(2)
#commodity(href[0],whichcatalog)
#记录当前爬取页数
with open(root+pass_page[int(whichcatalog)],"a+",encoding='utf8') as f:
f.writelines(str(page))
f.writelines("\n")
except:
pass
def start(whichcata):
    """Crawl one category forever, skipping pages already recorded.

    Parameters
    ----------
    whichcata : int
        Category number 1..5.

    Before each page the category's progress file is re-read (it is
    appended to by ``categories``) and only pages whose number is not yet
    recorded are crawled.  BUG FIX: the original used a substring test
    (``str(page) not in f.read()``), so once page 11 was recorded, page 1
    was wrongly considered done; whole-line membership is used instead.
    """
    # URL template per category; the page number is filled in each round.
    url_templates = {
        1: "https://www.redbubble.com/shop/*?iaCode=u-tees&page={}&searchType=find&tShirtColor=tShirtColor-black",
        2: "https://www.redbubble.com/shop/*?iaCode=u-tees&page={}&searchType=find&tShirtColor=tShirtColor-white",
        3: "https://www.redbubble.com/shop/stickers?page={}&searchType=browse",
        4: "https://www.redbubble.com/shop/cloth+face+masks?page={}&searchType=browse",
        5: "https://www.redbubble.com/shop/phone-cases?page={}&searchType=browse",
    }
    current_page = 1
    while True:
        # Re-read every iteration: categories() appends to this file.
        with open(root + pass_page[int(whichcata)], "r") as f:
            done_pages = f.read().splitlines()
        # Exact line match — fixes the substring-containment bug above.
        if str(current_page) not in done_pages:
            template = url_templates.get(int(whichcata))
            if template is not None:
                categories(template.format(current_page), int(whichcata), current_page)
        current_page = current_page + 1
if __name__ == '__main__':
    """初始化"""
    # Create the storage folders and progress files before crawling.
    initialization()
    # Ask the user which of the 5 categories to crawl (a digit 1-5).
    whichcata=input("请输入数字(1~5),指定爬取哪一个大类:")
    # Start the (endless) crawl loop for that category.
    print("开始爬取第"+str(whichcata)+"大类.....")
    start(int(whichcata))