#!/usr/bin/env python
# encoding: utf-8
import re
import time

from lxml import etree
from scrapy.crawler import CrawlerProcess
from scrapy.http import Request
from scrapy.selector import Selector
from scrapy.utils.project import get_project_settings
from scrapy_redis.spiders import RedisSpider

from sina.items import TweetsItem, InformationItem
from utils import time_fix, extract_weibo_content

class WeiboSpider(RedisSpider):
    name = "weibo_spider"
    base_url = "https://weibo.cn"
    redis_key = "weibo_spider:start_urls"
    custom_settings = {
        'CONCURRENT_REQUESTS': 16,
        'DOWNLOAD_DELAY': 0.1,
    }
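
    # NOTE: as a scrapy-redis RedisSpider, this spider sits idle until start
    # URLs are pushed onto the Redis list named by `redis_key`. A minimal way
    # to seed it from the command line (the uid 1234567890 and the exact URL
    # shape are placeholders, not taken from this file) would be:
    #
    #   redis-cli lpush weibo_spider:start_urls "https://weibo.cn/1234567890/profile?page=1"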

    def parse(self, response):
        if response.url.endswith('page=1'):
            # On page 1, read the total page count once and schedule every
            # remaining page, instead of following "next page" links one by one.
            all_page = re.search(r'/> 1/(\d+)页</div>', response.text)
            if all_page:
                all_page = int(all_page.group(1))
                for page_num in range(2, all_page + 1):
                    page_url = response.url.replace('page=1', 'page={}'.format(page_num))
                    # dont_filter=True: these URLs are generated in bulk here,
                    # so skip the scheduler's duplicate filter.
                    yield Request(page_url, self.parse, dont_filter=True, meta=response.meta)
        # Parse the tweets on the current page.
        tree_node = etree.HTML(response.body)
        tweet_nodes = tree_node.xpath('//div[@class="c" and @id]')
        for tweet_node in tweet_nodes:
            try:
                tweet_item = TweetsItem()
                tweet_item['crawl_time'] = int(time.time())
                tweet_repost_url = tweet_node.xpath('.//a[contains(text(),"转发[")]/@href')[0]
                user_tweet_id = re.search(r'/repost/(.*?)\?uid=(\d+)', tweet_repost_url)
                tweet_item['weibo_url'] = 'https://weibo.com/{}/{}'.format(user_tweet_id.group(2),
                                                                           user_tweet_id.group(1))
                tweet_item['user_id'] = user_tweet_id.group(2)
                tweet_item['_id'] = '{}_{}'.format(user_tweet_id.group(2), user_tweet_id.group(1))
                create_time_info_node = tweet_node.xpath('.//span[@class="ct"]')[-1]
                create_time_info = create_time_info_node.xpath('string(.)')
                if "来自" in create_time_info:
                    tweet_item['created_at'] = time_fix(create_time_info.split('来自')[0].strip())
                    tweet_item['tool'] = create_time_info.split('来自')[1].strip()
                else:
                    tweet_item['created_at'] = time_fix(create_time_info.strip())
                like_num = tweet_node.xpath('.//a[contains(text(),"赞[")]/text()')[-1]
                tweet_item['like_num'] = int(re.search(r'\d+', like_num).group())
                repost_num = tweet_node.xpath('.//a[contains(text(),"转发[")]/text()')[-1]
                tweet_item['repost_num'] = int(re.search(r'\d+', repost_num).group())
                comment_num = tweet_node.xpath(
                    './/a[contains(text(),"评论[") and not(contains(text(),"原文"))]/text()')[-1]
                tweet_item['comment_num'] = int(re.search(r'\d+', comment_num).group())
                images = tweet_node.xpath('.//img[@alt="图片"]/@src')
                if images:
                    tweet_item['image_url'] = images[0]
                videos = tweet_node.xpath('.//a[contains(@href,"https://m.weibo.cn/s/video/show?object_id=")]/@href')
                if videos:
                    tweet_item['video_url'] = videos[0]
                map_node = tweet_node.xpath('.//a[contains(text(),"显示地图")]')
                if map_node:
                    map_node = map_node[0]
                    map_node_url = map_node.xpath('./@href')[0]
                    map_info = re.search(r'xy=(.*?)&', map_node_url).group(1)
                    tweet_item['location_map_info'] = map_info
                repost_node = tweet_node.xpath('.//a[contains(text(),"原文评论[")]/@href')
                if repost_node:
                    tweet_item['origin_weibo'] = repost_node[0]
                # Check whether the tweet is truncated with a "全文" (full text) link.
                all_content_link = tweet_node.xpath('.//a[text()="全文" and contains(@href,"ckAll=1")]')
                if all_content_link:
                    all_content_url = self.base_url + all_content_link[0].xpath('./@href')[0]
                    yield Request(all_content_url, callback=self.parse_all_content, meta={'item': tweet_item},
                                  priority=1)
                else:
                    tweet_html = etree.tostring(tweet_node, encoding='unicode')
                    tweet_item['content'] = extract_weibo_content(tweet_html)
                    yield tweet_item
                yield Request(url="https://weibo.cn/{}/info".format(tweet_item['user_id']),
                              callback=self.parse_information, priority=2)
            except Exception as e:
                self.logger.error(e)

    def parse_all_content(self, response):
        # The tweet was truncated: fetch the dedicated page that carries the full text.
        tree_node = etree.HTML(response.body)
        tweet_item = response.meta['item']
        content_node = tree_node.xpath('//*[@id="M_"]/div[1]')[0]
        tweet_html = etree.tostring(content_node, encoding='unicode')
        tweet_item['content'] = extract_weibo_content(tweet_html)
        yield tweet_item

    def parse_information(self, response):
        """ Scrape a user's profile information. """
        information_item = InformationItem()
        information_item['crawl_time'] = int(time.time())
        selector = Selector(response)
        information_item['_id'] = re.findall(r'(\d+)/info', response.url)[0]
        # Join all text nodes so each "label: value" pair can be matched with a regex.
        text1 = ";".join(selector.xpath('body/div[@class="c"]//text()').extract())
        nick_name = re.findall('昵称;?[::]?(.*?);', text1)
        gender = re.findall('性别;?[::]?(.*?);', text1)
        place = re.findall('地区;?[::]?(.*?);', text1)
        brief_introduction = re.findall('简介;?[::]?(.*?);', text1)
        birthday = re.findall('生日;?[::]?(.*?);', text1)
        sex_orientation = re.findall('性取向;?[::]?(.*?);', text1)
        sentiment = re.findall('感情状况;?[::]?(.*?);', text1)
        vip_level = re.findall('会员等级;?[::]?(.*?);', text1)
        authentication = re.findall('认证;?[::]?(.*?);', text1)
        labels = re.findall('标签;?[::]?(.*?)更多>>', text1)
        if nick_name and nick_name[0]:
            information_item["nick_name"] = nick_name[0].replace(u"\xa0", "")
        if gender and gender[0]:
            information_item["gender"] = gender[0].replace(u"\xa0", "")
        if place and place[0]:
            place = place[0].replace(u"\xa0", "").split(" ")
            information_item["province"] = place[0]
            if len(place) > 1:
                information_item["city"] = place[1]
        if brief_introduction and brief_introduction[0]:
            information_item["brief_introduction"] = brief_introduction[0].replace(u"\xa0", "")
        if birthday and birthday[0]:
            information_item['birthday'] = birthday[0]
        # Guard on gender as well: comparing against gender[0] would raise an
        # IndexError when the gender field is missing from the profile.
        if sex_orientation and sex_orientation[0] and gender and gender[0]:
            if sex_orientation[0].replace(u"\xa0", "") == gender[0]:
                information_item["sex_orientation"] = "同性恋"  # homosexual
            else:
                information_item["sex_orientation"] = "异性恋"  # heterosexual
        if sentiment and sentiment[0]:
            information_item["sentiment"] = sentiment[0].replace(u"\xa0", "")
        if vip_level and vip_level[0]:
            information_item["vip_level"] = vip_level[0].replace(u"\xa0", "")
        if authentication and authentication[0]:
            information_item["authentication"] = authentication[0].replace(u"\xa0", "")
        if labels and labels[0]:
            information_item["labels"] = labels[0].replace(u"\xa0", ",").replace(';', '').strip(',')
        yield information_item
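

# The otherwise unused imports of CrawlerProcess and get_project_settings above
# suggest this module was meant to be runnable directly. Below is a minimal
# runner along those lines: a sketch that assumes the file lives inside a Scrapy
# project whose settings define SPIDER_MODULES, so the 'weibo_spider' name resolves.
if __name__ == '__main__':
    process = CrawlerProcess(get_project_settings())
    process.crawl('weibo_spider')
    process.start()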