# -*-coding:utf-8-*-
import time
import random
from selenium import webdriver
# from selenium.webdriver.chrome.options import Options
from selenium.webdriver.firefox.options import Options
from browsermobproxy import Server
import os
import re
import datetime
from configparser import ConfigParser
from bs4 import BeautifulSoup
from douyinBaseInfo import DouyinBaseInfo
import douyinVideoInfo
from apiWork import ApiWork
from tools.TNLog import TNLog
from tools.dbHelper import DbHelper
class XiaohongshuSpider(object):
    """Spider for Xiaohongshu (RED) talent home pages.

    Drives a Selenium Firefox browser through a browsermob-proxy instance,
    parses the rendered profile page with BeautifulSoup, and reports the
    extracted profile fields (or a failure notice) back to the business API
    via ``ApiWork.report_red_data``.
    """

    def __init__(self):
        # self.__dbHelper = DbHelper()
        self.logger = TNLog()
        self.apiWork = ApiWork()
        self.douyinBaseInfo = DouyinBaseInfo()
        # Candidate device profiles for mobile emulation.  Currently not
        # wired into _start_server (Firefox path) — kept for future rotation.
        self.__mobileEmulations = [
            "iPhone X", "iPad Mini", "iPhone 7", "Galaxy S5", "Nexus 10", "Pixel 2"
        ]
        # Consecutive proxy/browser restart attempts (reset after 5).
        self.__retryCount = 0
        # # Development environment paths:
        # self.__browsermobproxy = 'D:/Program Files/Python38/Lib/site-packages/browsermobproxy/browsermob-proxy-2.1.4/bin/browsermob-proxy.bat'
        # self.__chrome_driver = "C:/Users/Jie/AppData/Local/Google/ChromeApplication/chromedriver.exe"
        self.__browsermobproxy = 'F:/Program Files/Python/Lib/site-packages/browsermobproxy/browsermob-proxy-2.1.4/bin/browsermob-proxy.bat'
        # NOTE(review): unused since the switch to Firefox — kept for reference.
        self.__chrome_driver = "C:/Program Files (x86)/Google/Chrome/Application/chromedriver.exe"

    def _start_server(self, emuIndex=0):
        """Start browsermob-proxy and launch a Firefox browser routed
        through a freshly created proxy.

        emuIndex -- accepted for interface compatibility; it currently
                    selects nothing (mobile emulation is not wired up).
        """
        self.server = Server(self.__browsermobproxy)
        self.server.start()
        self.proxy = self.server.create_proxy()
        # These are Firefox options (the Chrome import was swapped for the
        # Firefox one at the top of the file); named accordingly.
        firefox_options = Options()
        firefox_options.add_argument(
            '--proxy-server={0}'.format(self.proxy.proxy))
        self.browser = webdriver.Firefox(options=firefox_options)

    def _stop_server(self):
        """Shut down the proxy server and quit the browser."""
        self.server.stop()
        self.browser.quit()

    def do_work(self):
        """Main polling loop.

        Between ``init_start`` and ``init_sleep`` o'clock: fetch the talent
        list from the API, spider each home page, then sleep for the
        configured interval (re-read each cycle so it can be tuned at
        runtime).  Outside the active window: sleep 30 minutes.
        """
        cp = ConfigParser()
        cp.read('config/appsetting.conf')
        init_interval = int(cp.get('spider', 'init_interval'))  # polling interval [minutes]
        init_sleep = 23  # last active hour of the day
        init_start = 8   # first active hour of the day
        self._start_server()
        try:
            while True:
                now_hour = datetime.datetime.now().hour
                if init_start <= now_hour <= init_sleep:
                    init_talents = self.apiWork.get_red_talents(1)
                    for t in init_talents['data']['lists']:
                        self._do_spider_work(t['home_page'], t['id'])
                    # Re-read the config so interval changes take effect
                    # without a restart.
                    cp.read('config/appsetting.conf')
                    init_interval = int(cp.get('spider', 'init_interval'))
                    time.sleep(init_interval * 60)
                else:
                    time.sleep(60 * 30)
        finally:
            # The original _stop_server() call sat after the infinite loop
            # and was unreachable; try/finally guarantees cleanup when the
            # loop raises or is interrupted.
            self._stop_server()

    def get_talents_from_api(self):
        """Fetch the talent list once and spider each home page.

        Fix: iterate the ``['data']['lists']`` payload like do_work does —
        the previous code iterated the raw response dict (its keys).
        """
        init_talents = self.apiWork.get_red_talents(1)
        for t in init_talents['data']['lists']:
            self._do_spider_work(t['home_page'], t['id'])

    def _strnum_to_num(self, num):
        """Convert a Chinese-formatted count string to a plain integer string.

        Handles the '万' (x10,000) suffix and decimal values, e.g.
        '1.5万' -> '15000', '2万' -> '20000', '123' -> '123', '1.5' -> '1'.
        """
        num = num.strip()
        if '万' in num:
            return str(int(float(num.strip('万')) * 10000))
        # float() first so plain decimal strings like '1.5' don't raise.
        return str(int(float(num)))

    def _do_spider_work(self, url, id):
        """Load one talent home page, parse the profile and report it.

        url -- the talent's home page URL
        id  -- the talent's id, echoed in the feedback payload
        """
        try:
            self.browser.get(url)
        except Exception as e:
            self.logger.error(str(e) + "___主页不存在,请修改主页链接___" +
                              str(url) + "___" + str(id))
            if 'Failed to establish a new connection' in str(e):
                # Proxy/browser stack is down: restart after a cooldown,
                # rotating the retry counter (resets after 5 attempts).
                self._stop_server()
                time.sleep(300)
                self.__retryCount += 1
                if self.__retryCount > 5:
                    self.__retryCount = 0
                self._start_server(self.__retryCount)
            else:
                self._feed_back({'id': id,
                                 'is_success': False,
                                 'err_msg': '主页不存在,请修改主页链接'})
            return
        # Randomized delay so requests don't fire at a fixed cadence.
        time.sleep(float(random.randint(5, 12)))
        page_source = self.browser.page_source
        try:
            # Explicit parser avoids bs4's "no parser specified" warning and
            # platform-dependent parser selection.
            soup = BeautifulSoup(page_source, 'html.parser')
            author_container = soup.find(class_='author-container')
            nick_name = author_container.find(
                'span', class_='name-detail').text
            avatar = author_container.find(
                'img', class_='lazyload lazyload-image loaded').attrs['src']
            description = author_container.find(
                'div', class_='user-brief').text
            location = author_container.find(
                'span', class_='location-text').text
            info_numbers = author_container.find(
                'div', class_='card-info').find_all('span', class_='info-number')
            fans_count = self._strnum_to_num(info_numbers[1].text)
            total_like = self._strnum_to_num(info_numbers[2].text)
            ret = {'id': id,
                   'is_success': True,
                   'err_msg': '',
                   'avatar': avatar,
                   'nick_name': nick_name,
                   'description': description,
                   'location': location,
                   'fans_count': fans_count,
                   'total_like': total_like
                   }
            self._feed_back(ret)
            print(ret)
        except Exception as e:
            self.logger.error('小红书认证__' + str(id) + '__' + str(e))

    def _feed_back(self, data):
        """Report one spider result back to the API.

        Success: code 1 plus the parsed base profile.
        Failure: code -1 plus the error message.

        Fix: build ``base_data`` only on success — the failure payload has
        no 'fans_count'/'nick_name'/... keys, so the original unconditional
        construction raised KeyError before the is_success check ran.
        """
        if data['is_success']:
            base_data = {
                "fans_count": data['fans_count'],
                "praise_count": data['total_like'],
                "nick_name": data['nick_name'],
                "des": data['description'],
                "avatar": data['avatar'],
                "location": data['location']
            }
            resData = {"code": 1, "err_msg": "", "id": data['id'],
                       "base_info": base_data}
        else:
            resData = {"code": -1,
                       "err_msg": data['err_msg'], "id": data['id']}
        self.apiWork.report_red_data(resData)
if __name__ == "__main__":
    # Entry point: build the spider and run its polling loop forever.
    # Fix: removed trailing web-page paste residue ("- 1" ... "前往页") that
    # followed this guard — the bare name 前往页 raised NameError on import.
    x = XiaohongshuSpider()
    x.do_work()