# -*- coding: utf-8 -*-
"""
Created on Mon Jul 20 17:35:39 2020
@author: Administrator
"""
import requests
import os
import time
from lxml import etree
import threading
import random
# Browser-like User-Agent so the site serves normal HTML instead of blocking us.
headers = {
'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36'
}
# Root directory where all downloaded images and progress files are stored.
root ="webpic/"
# Folder name for each of the 5 product categories; index 0 is a placeholder
# so indices 1..5 line up with the user-entered category number.
catalog=['','catalog1','catalog2','catalog3','catalog4','catalog5']
# Per-category text files recording which listing pages were already crawled
# (one page number per line; index 0 is again a placeholder).
pass_page=['','pass_page1.txt','pass_page2.txt','pass_page3.txt','pass_page4.txt','pass_page5.txt']
"""初始化"""
def initialization():
    """Create the storage directory tree and per-category progress files.

    Ensures the root folder exists, then for each category 1..5 creates an
    empty progress-record text file (``pass_page[i]``) and an image folder
    (``catalog[i]``) if they are missing.  Reads the module-level ``root``,
    ``pass_page`` and ``catalog`` constants.
    """
    if not os.path.exists(root):
        os.mkdir(root)
    for i in range(1, 6):
        # Create an empty progress file if missing; use a context manager so
        # the handle is closed (the original `open(...)` leaked it).
        if not os.path.exists(root + pass_page[i]):
            with open(root + pass_page[i], 'a+'):
                pass
        # Create the category's image folder if missing.
        if not os.path.exists(root + catalog[i]):
            os.mkdir(root + catalog[i])
"""当前商品链接"""
def commodity(url,c_whichcatalog):
try:
#请求当前商品链接
r = requests.get(url,headers = headers)
r.encoding='utf-8'
s = r.text
selector = etree.HTML(s)
#解析图片链接
img = selector.xpath('//*[@id="app"]/div/div[1]/main/div/div/div/div[2]/div[1]/div[1]/div[1]/div[1]/div[1]/div[2]/div/div[2]/img/@src')
#解析标题
title = selector.xpath('//*[@id="app"]/div/div[1]/main/div/div/div/div[2]/div[1]/div[2]/div/div/div/div[1]/h1/text()')
c = requests.get(img[0],headers = headers)
#保存图片,后缀名为jpg
#判断图片是否存在,不存在则下载
if not os.path.exists(root+catalog[int(c_whichcatalog)]+"/"+title[0]+".jpg"):
with open(root+catalog[int(c_whichcatalog)]+"/"+title[0]+".jpg","wb") as f:
f.write(c.content)
f.close()
except:
#重试再次请求图片链接
try:
#请求当前商品链接
r = requests.get(url,headers = headers)
r.encoding='utf-8'
s = r.text
selector = etree.HTML(s)
#解析图片链接
img = selector.xpath('//*[@id="app"]/div/div[1]/main/div/div/div/div[2]/div[1]/div[1]/div[1]/div[1]/div[1]/div[2]/div/div[2]/img/@src')
#解析标题
title = selector.xpath('//*[@id="app"]/div/div[1]/main/div/div/div/div[2]/div[1]/div[2]/div/div/div/div[1]/h1/text()')
c = requests.get(img[0],headers = headers)
#保存图片,后缀名为jpg
#判断图片是否存在,不存在则下载
if not os.path.exists(root+catalog[int(c_whichcatalog)]+"/"+title[0]+".jpg"):
with open(root+catalog[int(c_whichcatalog)]+"/"+title[0]+".jpg","wb") as f:
f.write(c.content)
f.close()
except:
pass
"""当前大类链接"""
def categories(url,whichcatalog,page):
try:
#请求当前大类链接
r = requests.get(url,headers = headers)
r.encoding='utf-8'
s = r.text
selector = etree.HTML(s)
#遍历当前页的所有商品
for item in selector.xpath('//*[@id="SearchResultsGrid"]/a'):
href = item.xpath('.//@href')
print(href[0])
#多线程下载图片
threading.Thread(target=commodity,args=(href[0],whichcatalog,)).start()
#随机延时1~2秒,防止网站禁止ip
#time.sleep(int(random.randint(1,2)))
#如果网速稳定可以注释掉time.sleep(2)
#time.sleep(2)
#commodity(href[0],whichcatalog)
#记录当前爬取页数
with open(root+pass_page[int(whichcatalog)],"a+",encoding='utf8') as f:
f.writelines(str(page))
f.writelines("\n")
except:
pass
def start(whichcata):
    """Crawl one category forever, skipping pages already recorded.

    Parameters
    ----------
    whichcata : int
        Category number 1..5.

    Before each page the category's progress file is re-read (it is
    appended to by ``categories``) and only pages whose number is not yet
    recorded are crawled.  BUG FIX: the original used a substring test
    (``str(page) not in f.read()``), so once page 11 was recorded, page 1
    was wrongly considered done; whole-line membership is used instead.
    """
    # URL template per category; the page number is filled in each round.
    url_templates = {
        1: "https://www.redbubble.com/shop/*?iaCode=u-tees&page={}&searchType=find&tShirtColor=tShirtColor-black",
        2: "https://www.redbubble.com/shop/*?iaCode=u-tees&page={}&searchType=find&tShirtColor=tShirtColor-white",
        3: "https://www.redbubble.com/shop/stickers?page={}&searchType=browse",
        4: "https://www.redbubble.com/shop/cloth+face+masks?page={}&searchType=browse",
        5: "https://www.redbubble.com/shop/phone-cases?page={}&searchType=browse",
    }
    current_page = 1
    while True:
        # Re-read every iteration: categories() appends to this file.
        with open(root + pass_page[int(whichcata)], "r") as f:
            done_pages = f.read().splitlines()
        # Exact line match — fixes the substring-containment bug above.
        if str(current_page) not in done_pages:
            template = url_templates.get(int(whichcata))
            if template is not None:
                categories(template.format(current_page), int(whichcata), current_page)
        current_page = current_page + 1
if __name__ == '__main__':
    """初始化"""
    # Create the storage folders and progress files before crawling.
    initialization()
    # Ask the user which of the 5 categories to crawl (a digit 1-5).
    whichcata=input("请输入数字(1~5),指定爬取哪一个大类:")
    # Start the (endless) crawl loop for that category.
    print("开始爬取第"+str(whichcata)+"大类.....")
    start(int(whichcata))