#!/usr/bin/env python
# encoding: utf-8
import re
import time

from lxml import etree
from scrapy.crawler import CrawlerProcess
from scrapy.http import Request
from scrapy.selector import Selector
from scrapy.utils.project import get_project_settings
from scrapy_redis.spiders import RedisSpider

from sina.items import TweetsItem, InformationItem
from utils import time_fix, extract_weibo_content

class WeiboSpider(RedisSpider):
    name = "weibo_spider"
    base_url = "https://weibo.cn"
    redis_key = "weibo_spider:start_urls"
    custom_settings = {
        'CONCURRENT_REQUESTS': 16,
        'DOWNLOAD_DELAY': 0.1,
    }
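
    # NOTE: as a scrapy-redis RedisSpider, this spider sits idle until start
    # URLs are pushed onto the Redis list named by `redis_key`. A minimal way
    # to seed it from the command line (the uid 1234567890 and the exact URL
    # shape are placeholders, not taken from this file) would be:
    #
    #   redis-cli lpush weibo_spider:start_urls "https://weibo.cn/1234567890/profile?page=1"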

    def parse(self, response):
        if response.url.endswith('page=1'):
            # On page 1, read the total page count once and schedule every
            # remaining page, instead of following "next page" links one by one.
            all_page = re.search(r'/> 1/(\d+)页</div>', response.text)
            if all_page:
                all_page = int(all_page.group(1))
                for page_num in range(2, all_page + 1):
                    page_url = response.url.replace('page=1', 'page={}'.format(page_num))
                    # dont_filter=True: these URLs are generated in bulk here,
                    # so skip the scheduler's duplicate filter.
                    yield Request(page_url, self.parse, dont_filter=True, meta=response.meta)
        # Parse the tweets on the current page.
        tree_node = etree.HTML(response.body)
        tweet_nodes = tree_node.xpath('//div[@class="c" and @id]')
        for tweet_node in tweet_nodes:
            try:
                tweet_item = TweetsItem()
                tweet_item['crawl_time'] = int(time.time())
                tweet_repost_url = tweet_node.xpath('.//a[contains(text(),"转发[")]/@href')[0]
                user_tweet_id = re.search(r'/repost/(.*?)\?uid=(\d+)', tweet_repost_url)
                tweet_item['weibo_url'] = 'https://weibo.com/{}/{}'.format(user_tweet_id.group(2),
                                                                           user_tweet_id.group(1))
                tweet_item['user_id'] = user_tweet_id.group(2)
                tweet_item['_id'] = '{}_{}'.format(user_tweet_id.group(2), user_tweet_id.group(1))
                create_time_info_node = tweet_node.xpath('.//span[@class="ct"]')[-1]
                create_time_info = create_time_info_node.xpath('string(.)')
                if "来自" in create_time_info:
                    tweet_item['created_at'] = time_fix(create_time_info.split('来自')[0].strip())
                    tweet_item['tool'] = create_time_info.split('来自')[1].strip()
                else:
                    tweet_item['created_at'] = time_fix(create_time_info.strip())
                like_num = tweet_node.xpath('.//a[contains(text(),"赞[")]/text()')[-1]
                tweet_item['like_num'] = int(re.search(r'\d+', like_num).group())
                repost_num = tweet_node.xpath('.//a[contains(text(),"转发[")]/text()')[-1]
                tweet_item['repost_num'] = int(re.search(r'\d+', repost_num).group())
                comment_num = tweet_node.xpath(
                    './/a[contains(text(),"评论[") and not(contains(text(),"原文"))]/text()')[-1]
                tweet_item['comment_num'] = int(re.search(r'\d+', comment_num).group())
                images = tweet_node.xpath('.//img[@alt="图片"]/@src')
                if images:
                    tweet_item['image_url'] = images[0]
                videos = tweet_node.xpath('.//a[contains(@href,"https://m.weibo.cn/s/video/show?object_id=")]/@href')
                if videos:
                    tweet_item['video_url'] = videos[0]
                map_node = tweet_node.xpath('.//a[contains(text(),"显示地图")]')
                if map_node:
                    map_node = map_node[0]
                    map_node_url = map_node.xpath('./@href')[0]
                    map_info = re.search(r'xy=(.*?)&', map_node_url).group(1)
                    tweet_item['location_map_info'] = map_info
                repost_node = tweet_node.xpath('.//a[contains(text(),"原文评论[")]/@href')
                if repost_node:
                    tweet_item['origin_weibo'] = repost_node[0]
                # Check whether the tweet is truncated with a "全文" (full text) link.
                all_content_link = tweet_node.xpath('.//a[text()="全文" and contains(@href,"ckAll=1")]')
                if all_content_link:
                    all_content_url = self.base_url + all_content_link[0].xpath('./@href')[0]
                    yield Request(all_content_url, callback=self.parse_all_content, meta={'item': tweet_item},
                                  priority=1)
                else:
                    tweet_html = etree.tostring(tweet_node, encoding='unicode')
                    tweet_item['content'] = extract_weibo_content(tweet_html)
                    yield tweet_item
                yield Request(url="https://weibo.cn/{}/info".format(tweet_item['user_id']),
                              callback=self.parse_information, priority=2)
            except Exception as e:
                self.logger.error(e)

    def parse_all_content(self, response):
        # The tweet was truncated: fetch the dedicated page that carries the full text.
        tree_node = etree.HTML(response.body)
        tweet_item = response.meta['item']
        content_node = tree_node.xpath('//*[@id="M_"]/div[1]')[0]
        tweet_html = etree.tostring(content_node, encoding='unicode')
        tweet_item['content'] = extract_weibo_content(tweet_html)
        yield tweet_item

    def parse_information(self, response):
        """ Scrape a user's profile information. """
        information_item = InformationItem()
        information_item['crawl_time'] = int(time.time())
        selector = Selector(response)
        information_item['_id'] = re.findall(r'(\d+)/info', response.url)[0]
        # Join all text nodes so each "label: value" pair can be matched with a regex.
        text1 = ";".join(selector.xpath('body/div[@class="c"]//text()').extract())
        nick_name = re.findall('昵称;?[::]?(.*?);', text1)
        gender = re.findall('性别;?[::]?(.*?);', text1)
        place = re.findall('地区;?[::]?(.*?);', text1)
        brief_introduction = re.findall('简介;?[::]?(.*?);', text1)
        birthday = re.findall('生日;?[::]?(.*?);', text1)
        sex_orientation = re.findall('性取向;?[::]?(.*?);', text1)
        sentiment = re.findall('感情状况;?[::]?(.*?);', text1)
        vip_level = re.findall('会员等级;?[::]?(.*?);', text1)
        authentication = re.findall('认证;?[::]?(.*?);', text1)
        labels = re.findall('标签;?[::]?(.*?)更多>>', text1)
        if nick_name and nick_name[0]:
            information_item["nick_name"] = nick_name[0].replace(u"\xa0", "")
        if gender and gender[0]:
            information_item["gender"] = gender[0].replace(u"\xa0", "")
        if place and place[0]:
            place = place[0].replace(u"\xa0", "").split(" ")
            information_item["province"] = place[0]
            if len(place) > 1:
                information_item["city"] = place[1]
        if brief_introduction and brief_introduction[0]:
            information_item["brief_introduction"] = brief_introduction[0].replace(u"\xa0", "")
        if birthday and birthday[0]:
            information_item['birthday'] = birthday[0]
        # Guard on gender as well: comparing against gender[0] would raise an
        # IndexError when the gender field is missing from the profile.
        if sex_orientation and sex_orientation[0] and gender and gender[0]:
            if sex_orientation[0].replace(u"\xa0", "") == gender[0]:
                information_item["sex_orientation"] = "同性恋"  # homosexual
            else:
                information_item["sex_orientation"] = "异性恋"  # heterosexual
        if sentiment and sentiment[0]:
            information_item["sentiment"] = sentiment[0].replace(u"\xa0", "")
        if vip_level and vip_level[0]:
            information_item["vip_level"] = vip_level[0].replace(u"\xa0", "")
        if authentication and authentication[0]:
            information_item["authentication"] = authentication[0].replace(u"\xa0", "")
        if labels and labels[0]:
            information_item["labels"] = labels[0].replace(u"\xa0", ",").replace(';', '').strip(',')
        yield information_item
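

# The otherwise unused imports of CrawlerProcess and get_project_settings above
# suggest this module was meant to be runnable directly. Below is a minimal
# runner along those lines: a sketch that assumes the file lives inside a Scrapy
# project whose settings define SPIDER_MODULES, so the 'weibo_spider' name resolves.
if __name__ == '__main__':
    process = CrawlerProcess(get_project_settings())
    process.crawl('weibo_spider')
    process.start()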