import os
import re
import sys
from datetime import datetime, timedelta
from urllib.parse import unquote
import scrapy
import dbscrapy.utils.util as util
from scrapy.exceptions import CloseSpider
from scrapy.utils.project import get_project_settings
from dbscrapy.items import DbscrapyItem
class SearchSpider(scrapy.Spider):
# 提取了 settings.py 的信息
name = 'search'
allowed_domains = ['weibo.com']
settings = get_project_settings()
keyword_list = settings.get('KEYWORD_LIST')
if not isinstance(keyword_list, list):
if not os.path.isabs(keyword_list):
keyword_list = os.getcwd() + os.sep + keyword_list
if not os.path.isfile(keyword_list):
sys.exit('不存在%s文件' % keyword_list)
keyword_list = util.get_keyword_list(keyword_list)
for i, keyword in enumerate(keyword_list):
if len(keyword) > 2 and keyword[0] == '#' and keyword[-1] == '#':
keyword_list[i] = '%23' + keyword[1:-1] + '%23'
weibo_type = util.convert_weibo_type(settings.get('WEIBO_TYPE'))
contain_type = util.convert_contain_type(settings.get('CONTAIN_TYPE'))
regions = util.get_regions(settings.get('REGION'))
base_url = 'https://s.weibo.com'
start_date = settings.get('START_DATE',
datetime.now().strftime('%Y-%m-%d'))
end_date = settings.get('END_DATE', datetime.now().strftime('%Y-%m-%d'))
if util.str_to_time(start_date) > util.str_to_time(end_date):
sys.exit('settings.py配置错误,START_DATE值应早于或等于END_DATE值,请重新配置settings.py')
further_threshold = settings.get('FURTHER_THRESHOLD', 46)
mongo_error = False
pymongo_error = False
mysql_error = False
pymysql_error = False
def start_requests(self):
start_date = datetime.strptime(self.start_date, '%Y-%m-%d')
end_date = datetime.strptime(self.end_date,
'%Y-%m-%d') + timedelta(days=1)
start_str = start_date.strftime('%Y-%m-%d') + '-0'
end_str = end_date.strftime('%Y-%m-%d') + '-0'
for keyword in self.keyword_list:
if not self.settings.get('REGION') or '全部' in self.settings.get(
'REGION'):
base_url = 'https://s.weibo.com/weibo?q=%s' % keyword
# https://s.weibo.com/weibo?q=人工智能
url = base_url + self.weibo_type
# 0 https://s.weibo.com/weibo?q=人工智能&typeall=1
url += self.contain_type
# 0 https://s.weibo.com/weibo?q=人工智能&typeall=1×cope=custom:{2015-03-01}:{2020-03-01}
url += '×cope=custom:{}:{}'.format(start_str, end_str)
yield scrapy.Request(url=url,
callback=self.parse,
meta={
'base_url': base_url,
'keyword': keyword
})
else:
for region in self.regions.values():
base_url = (
'https://s.weibo.com/weibo?q={}®ion=custom:{}:1000'
).format(keyword, region['code'])
url = base_url + self.weibo_type
url += self.contain_type
url += '×cope=custom:{}:{}'.format(start_str, end_str)
# 获取一个省的搜索结果
# meta: 可以将数据传入到parse_detail函数中
yield scrapy.Request(url=url,
callback=self.parse,
meta={
'base_url': base_url,
'keyword': keyword,
'province': region
})
def check_environment(self):
"""判断配置要求的软件是否已安装"""
if self.pymongo_error:
print('系统中可能没有安装pymongo库,请先运行 pip install pymongo ,再运行程序')
raise CloseSpider()
if self.mongo_error:
print('系统中可能没有安装或启动MongoDB数据库,请先根据系统环境安装或启动MongoDB,再运行程序')
raise CloseSpider()
if self.pymysql_error:
print('系统中可能没有安装pymysql库,请先运行 pip install pymysql ,再运行程序')
raise CloseSpider()
if self.mysql_error:
print('系统中可能没有安装或正确配置MySQL数据库,请先根据系统环境安装或配置MySQL,再运行程序')
raise CloseSpider()
def parse(self, response):
base_url = response.meta.get('base_url')
keyword = response.meta.get('keyword')
province = response.meta.get('province')
is_empty = response.xpath(
'//div[@class="card card-no-result s-pt20b40"]')
page_count = len(response.xpath('//ul[@class="s-scroll"]/li'))
if is_empty:
print('当前页面搜索结果为空')
elif page_count < self.further_threshold:
# 解析当前页面
for weibo in self.parse_weibo(response):
self.check_environment()
yield weibo
next_url = response.xpath(
'//a[@class="next"]/@href').extract_first()
if next_url:
next_url = self.base_url + next_url
yield scrapy.Request(url=next_url,
callback=self.parse_page,
meta={'keyword': keyword})
else:
start_date = datetime.strptime(self.start_date, '%Y-%m-%d')
end_date = datetime.strptime(self.end_date, '%Y-%m-%d')
while start_date <= end_date:
start_str = start_date.strftime('%Y-%m-%d') + '-0'
start_date = start_date + timedelta(days=1)
end_str = start_date.strftime('%Y-%m-%d') + '-0'
url = base_url + self.weibo_type
url += self.contain_type
url += '×cope=custom:{}:{}&page=1'.format(
start_str, end_str)
# 获取一天的搜索结果
yield scrapy.Request(url=url,
callback=self.parse_by_day,
meta={
'base_url': base_url,
'keyword': keyword,
'province': province,
'date': start_str[:-2]
})
def parse_by_day(self, response):
"""以天为单位筛选"""
base_url = response.meta.get('base_url')
keyword = response.meta.get('keyword')
province = response.meta.get('province')
is_empty = response.xpath(
'//div[@class="card card-no-result s-pt20b40"]')
date = response.meta.get('date')
page_count = len(response.xpath('//ul[@class="s-scroll"]/li'))
if is_empty:
print('当前页面搜索结果为空')
elif page_count < self.further_threshold:
# 解析当前页面
for weibo in self.parse_weibo(response):
self.check_environment()
yield weibo
next_url = response.xpath(
'//a[@class="next"]/@href').extract_first()
if next_url:
next_url = self.base_url + next_url
yield scrapy.Request(url=next_url,
callback=self.parse_page,
meta={'keyword': keyword})
else:
start_date_str = date +