import scrapy
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from scrapy.selector import Selector
from jdscrapy.items import JdscrapyItem
from time import sleep
class JingdongSpider(scrapy.Spider):
    """Scrape product cards from three feed modules on the JD.com home page.

    The feed modules are rendered client-side, so a Selenium-driven Chrome
    instance loads the page, clicks each module's tab, and scrolls to trigger
    lazy loading; the rendered HTML is then parsed with Scrapy selectors.

    Yields:
        JdscrapyItem with ``image``, ``text``, ``price`` and ``link`` fields.
    """

    name = 'jingdong'
    custom_settings = {'REACTOR_THREADPOOL_MAXSIZE': 20}
    allowed_domains = ['www.jd.com']
    start_urls = ['https://www.jd.com']

    # (tab label to click, DOM id of the <ul> holding that tab's feed).
    # The label/id pairings are taken from the site markup — confirm they
    # still match if JD.com changes its front page.
    FEED_SECTIONS = [
        ('时尚达人', 'feedContent4'),  # "fashion" module
        ('智能先锋', 'feedContent1'),  # "smart tech" module
        ('进口好物', 'feedContent5'),  # "imported goods" module
    ]

    def __init__(self, *args, **kwargs):
        # Let scrapy.Spider perform its own setup (name, kwargs handling).
        super().__init__(*args, **kwargs)
        # Configure Chrome; headless mode is available but disabled here.
        chrome_options = Options()
        chrome_options.add_argument("--disable-gpu")
        chrome_options.add_argument("--disable-software-rasterizer")
        # chrome_options.add_argument("--headless")  # enable headless mode
        self.driver = webdriver.Chrome(options=chrome_options)

    def parse(self, response):
        """Visit each configured feed section and yield every parsed item."""
        for button_text, feed_id in self.FEED_SECTIONS:
            yield from self._scrape_section(button_text, feed_id)

    def _scrape_section(self, button_text, feed_id):
        """Open the home page, click the tab labelled *button_text*, then
        scroll repeatedly and yield items parsed from container *feed_id*."""
        self.driver.get(self.start_urls[0])
        wait = WebDriverWait(self.driver, 5)
        # Scroll partway down so the feed-tab bar is rendered before clicking.
        self.driver.execute_script("window.scrollTo(0, 0.3*document.body.scrollHeight);")
        sleep(2)
        # Find and click the requested module tab once it is clickable.
        button = wait.until(EC.element_to_be_clickable(
            (By.XPATH, "//span[contains(text(), '" + button_text + "')]")))
        button.click()
        sleep(5)
        # Scroll 10 times to trigger lazy loading, re-parsing after each
        # scroll (as the original did — downstream deduplication is assumed
        # to handle repeated items; verify the pipeline drops duplicates).
        for _ in range(10):
            self.driver.execute_script("window.scrollTo(0, 0.8*document.body.scrollHeight);")
            sleep(6)
            yield from self._parse_feed(self.driver.page_source, feed_id)

    def _parse_feed(self, body, feed_id):
        """Yield one JdscrapyItem per commodity <li> found in *feed_id*."""
        selector = Selector(text=body)
        for commodity in selector.xpath('//*[@id="' + feed_id + '"]/li'):
            item = JdscrapyItem()
            # Image/link src attributes are protocol-relative; prefix https.
            image = commodity.xpath('.//a/div[1]/img/@src').extract_first()
            item['image'] = 'https:' + image if image else ''
            item['text'] = commodity.xpath('.//a/div[2]/p/text()').extract_first()
            # Price is split across two spans (integer and fraction parts).
            price_1 = commodity.xpath('.//a/div[2]/div/div/span/text()').extract()
            price_2 = commodity.xpath('.//a/div[2]/div/div/span/span/text()').extract()
            item['price'] = ''.join(price_1) + ''.join(price_2)
            link = commodity.xpath('.//a/@href').extract_first()
            item['link'] = 'https:' + link if link else ''
            yield item

    def closed(self, reason):
        """Called by Scrapy on spider shutdown; always release the browser.

        The original only quit the driver at the end of parse(), which leaked
        the Chrome process whenever an exception (e.g. a wait timeout)
        interrupted the crawl.
        """
        self.driver.quit()