import multiprocessing
from concurrent.futures import ThreadPoolExecutor
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import re
from lxml import etree
from mongodb_save import save_data
import time
from multiprocessing import Pool
from multiprocessing import freeze_support
from scheduler import Scheduler
from storages.redis import RedisClient
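# Scrapes paginated headline lists from people.com.cn channel pages using headless
# Chrome behind a random proxy drawn from RedisClient, then stores each article's
# link, title, summary and thumbnail via save_data(). Spider1 and Spider2 differ
# only in the list markup they parse.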
class Spider1():
def __init__(self, t, link):
self.t = t
self.link = link
def spider1(self, num):
options = webdriver.ChromeOptions()
options.add_argument('--headless')
options.add_argument('user-agent='
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36')
conn = RedisClient()
result = conn.random()
options.add_argument('--proxy-server=http://' + str(result))
        drivers = webdriver.Chrome(options=options)
        drivers.set_page_load_timeout(300)  # set to match how long the page actually needs to load
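        # Overwrite navigator.webdriver before any page script runs, so simple
        # headless-Chrome detection on the site does not flag the session.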
drivers.execute_cdp_cmd('Page.addScriptToEvaluateOnNewDocument', {
'source': 'Object.defineProperty(navigator, "webdriver", {get:()=>undefined})'
})
try:
print('proxy:', result)
drivers.get(self.link + 'index{i}.html#fy01'.format(i=num))
wait = WebDriverWait(drivers, 10)
inputs = wait.until(
EC.presence_of_all_elements_located(
(By.XPATH, '//div[contains(@class, "headingNews")]/div[contains(@class, "hdNews")]/div[@class="on"]')))
texts = wait.until(
EC.presence_of_all_elements_located(
(By.XPATH,
'//div[contains(@class, "headingNews")]/div[contains(@class, "hdNews")]/div[@class="on"]/em/a')))
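            # Pair each headline block with its summary anchor; the block's
            # outerHTML is then parsed for link, title and optional thumbnail.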
            for item, text in zip(inputs, texts):
                html = item.get_attribute('outerHTML')
                href = re.search(r'<h5>\s*<a href="(.*?)".*?>.*?</a>\s*</h5>', html).group(1)
                # print('http://' + self.link.split('//')[1].split('/')[0] + href)
                title = re.search(r'<h5>\s*<a.*?>(.*?)</a>\s*</h5>', html).group(1)
content = text.get_attribute("innerText")
html = etree.HTML(html)
if html.xpath('//div/a'):
img = html.xpath('//div/a/img/@src')
#print('http://' + self.link.split('//')[1].split('/')[0] + img[0])
else:
img = 'Zero'
#print(img)
save_data(self.link, href, title, content, img)
            print(self.t, 'page ' + str(num))
        except Exception as e:
            print(self.t, 'page ' + str(num), e)
            with open('error.txt', 'a', encoding='utf-8') as f:
                f.write(self.t + ' page ' + str(num) + ': ' + str(e) + '\n')
        finally:
            drivers.quit()  # always release the browser, even after a failure
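# Spider2 covers channels whose headline lists use <p>/<strong> markup instead of
# the <div class="on">/<h5> markup handled by Spider1; the flow is otherwise the same.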
class Spider2():
def __init__(self, t, link):
self.t = t
self.link = link
def spider2(self, num):
options = webdriver.ChromeOptions()
options.add_argument('--headless')
options.add_argument('user-agent='
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36')
conn = RedisClient()
result = conn.random()
options.add_argument('--proxy-server=http://' + str(result))
        drivers = webdriver.Chrome(options=options)
        drivers.set_page_load_timeout(300)  # set to match how long the page actually needs to load
drivers.execute_cdp_cmd('Page.addScriptToEvaluateOnNewDocument', {
'source': 'Object.defineProperty(navigator, "webdriver", {get:()=>undefined})'
})
try:
print('proxy:', result)
drivers.get(self.link + 'index{i}.html#fy01'.format(i=num))
wait = WebDriverWait(drivers, 10)
inputs = wait.until(
EC.presence_of_all_elements_located(
(By.XPATH, '//div[@class="headingNews"]/div[contains(@class, "hdNews")]/p')))
texts = wait.until(
EC.presence_of_all_elements_located(
(
By.XPATH, '//div[@class="headingNews"]/div[contains(@class, "hdNews")]/p/em/a')))
            for item, text in zip(inputs, texts):
                html = item.get_attribute('outerHTML')
                href = re.search(r'<strong>\s*<a href="(.*?)".*?>.*?</a>\s*</strong>', html).group(1)
                # print('http://' + self.link.split('//')[1].split('/')[0] + href)
                title = re.search(r'<strong>\s*<a.*?>(.*?)</a>\s*</strong>', html).group(1)
content = text.get_attribute("innerText")
html = etree.HTML(html)
if html.xpath('//p/a'):
img = html.xpath('//p/a/img/@src')
#print('http://' + self.link.split('//')[1].split('/')[0] + img[0])
else:
img = 'Zero'
#print(img)
save_data(self.link, href, title, content, img)
            print(self.t, 'page ' + str(num))
        except Exception as e:
            print(self.t, 'page ' + str(num), e)
            with open('error.txt', 'a', encoding='utf-8') as f:
                f.write(self.t + ' page ' + str(num) + ': ' + str(e) + '\n')
        finally:
            drivers.quit()  # always release the browser, even after a failure
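# Collect the channel links from the people.com.cn front page, then crawl each
# supported channel's list pages with a small thread pool.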
def main():
option = webdriver.ChromeOptions()
option.add_argument('--headless')
option.add_argument('user-agent='
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36')
    driver = webdriver.Chrome(options=option)
    driver.set_page_load_timeout(300)  # set to match how long the page actually needs to load
driver.execute_cdp_cmd('Page.addScriptToEvaluateOnNewDocument', {
'source': 'Object.defineProperty(navigator, "webdriver", {get:()=>undefined})'
})
driver.get('http://www.people.com.cn/')
wait = WebDriverWait(driver, 10)
inputs = wait.until(EC.presence_of_all_elements_located((By.XPATH, '//ul[@class="cf"]//div[@class="xinwen"]//a')))
element = dict()
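    # channel name (link text) -> channel landing URL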
    for item in inputs:
        t = item.get_attribute("innerText")
        link = item.get_attribute('href')
element[t] = link
start = time.time()
    # pool = Pool()  # create a process pool
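    # The channel subdomain (finance, society, ent, ...) selects the parser class
    # and how many list pages that channel paginates to.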
    for t, link in element.items():
if link.split('//')[1].split('.')[0] == 'finance':
spider1 = Spider1(t, link)
#for i in range(1, 14):
# spider1.spider1(i)
            with ThreadPoolExecutor(max_workers=5) as executor:
                all_task = [executor.submit(spider1.spider1, page) for page in range(1, 14)]
print(all_task)
# pool.map(spider1.spider1, range(1, 14))
elif link.split('//')[1].split('.')[0] == 'society':
spider1 = Spider1(t, link)
#for i in range(1, 11):
# spider1.spider1(i)
            with ThreadPoolExecutor(max_workers=5) as executor:
                all_task = [executor.submit(spider1.spider1, page) for page in range(1, 11)]
print(all_task)
# pool.map(spider1.spider1, range(1, 11))
elif link.split('//')[1].split('.')[0] == 'ent':
spider1 = Spider1(t, link)
#for i in range(1, 9):
# spider1.spider1(i)
            with ThreadPoolExecutor(max_workers=5) as executor:
                all_task = [executor.submit(spider1.spider1, page) for page in range(1, 9)]
print(all_task)
# pool.map(spider1.spider1, range(1, 9))
elif link.split('//')[1].split('.')[0] == 'worl