# -*- coding: utf-8 -*-
import json
import re

import scrapy
from scrapy import Request

from xpc.items import *
def strip(x):
    """Return *x* with surrounding whitespace removed, or '' for falsy input.

    Mirrors ``str.strip`` but tolerates ``None``/empty values coming back
    from ``.get()`` XPath lookups (PEP 8 E731: use ``def``, not a lambda
    assigned to a name).
    """
    return x.strip() if x else ''
def convert_int(s):
    """Parse a comma-grouped count string (e.g. ``"1,234"``) into an int.

    Falsy input (``None`` or ``''``) yields ``0`` so missing counters on
    the page degrade gracefully.
    """
    if not s:
        return 0
    return int(s.replace(',', ''))


# Short alias used by the spider callbacks below.
ci = convert_int
class ZuopinSpider(scrapy.Spider):
    """Crawl xinpianchang.com works ("zuopin").

    Starting from the like-sorted channel listing, this spider follows each
    post and yields four item types: ``PostItem`` (the work itself),
    ``CopyrightItem`` (post<->composer credit link), ``CommentItem`` and
    ``ComposerItem`` (user profiles). Video metadata and comments are loaded
    from separate JSON endpoints, hence the extra callbacks.
    """
    name = 'zuopin'
    allowed_domains = ['xinpianchang.com', 'openapi-vtom.vmovier.com']
    start_urls = ['http://www.xinpianchang.com/channel/index/sort-like?from=tabArticle']

    def parse(self, response):
        """Parse the channel listing; yield one detail-page request per post."""
        url = 'http://www.xinpianchang.com/a%s?from=ArticleList'
        post_list = response.xpath('//ul[@class="video-list"]/li')
        # Generate a request for every <li>, i.e. every video detail page.
        for post in post_list:
            pid = post.xpath('./@data-articleid').extract_first()
            request = Request(url % pid, callback=self.parse_post)
            # Forward the work id to the detail-page callback via meta.
            request.meta['pid'] = pid
            # Thumbnail; `_src` is presumably the listing page's lazy-load
            # attribute rather than `src` — confirm against the live page.
            request.meta['thumbnail'] = post.xpath('./a/img/@_src').get()
            # Debug helpers left by the author:
            # from scrapy.shell import inspect_response
            # inspect_response(response, self)
            # break
            yield request

    def parse_post(self, response):
        """Parse one work's detail page.

        Yields the follow-up requests for the video JSON, the comment JSON
        and each composer profile, plus one ``CopyrightItem`` per composer.
        The ``PostItem`` itself is completed and yielded in ``parse_video``.
        """
        # Work id forwarded from parse().
        pid = response.meta['pid']
        post = PostItem()
        post['pid'] = pid
        # Thumbnail forwarded from the listing page.
        post['thumbnail'] = response.meta['thumbnail']
        # Title
        post['title'] = response.xpath('//div[@class="title-wrap"]/h3/text()').extract_first()
        # All category labels
        cates = response.xpath('//span[contains(@class, "cate")]/a/text()').extract()
        # Normalize categories into a single "a-b-c" string.
        post['category'] = '-'.join([cate.strip() for cate in cates])
        # Publication date
        post['created_at'] = response.xpath('//span[contains(@class, "update-time")]/i/text()').get()
        # Play count
        post['play_counts'] = response.xpath('//i[contains(@class, "play-counts")]/@data-curplaycounts').get()
        # Like count
        post['like_counts'] = response.xpath('//span[contains(@class, "like-counts")]/@data-counts').get()
        # Description
        post['description'] = strip(response.xpath('//p[contains(@class, "desc")]/text()').get())
        # The video URL and related data are loaded dynamically; the page
        # embeds a "vid" token identifying the video's JSON document.
        # NOTE(review): tuple-unpacking raises ValueError unless the pattern
        # matches exactly once in the page source.
        vid, = re.findall(r'vid: \"(\w+)\",', response.text)
        video_url = 'https://openapi-vtom.vmovier.com/v3/video/%s?expand=resource,resource_origin?'
        # Request the video JSON, carrying the partially-filled post item
        # so parse_video can finish and yield it.
        yield Request(video_url % vid, callback=self.parse_video, meta={'post': post})
        # Comment JSON endpoint. Empirically, ajax=0 returns JSON and
        # ajax=1 returns HTML; page is the page number — we start at page 1
        # (further pages are followed inside parse_comment).
        comment_url = 'http://www.xinpianchang.com/article/filmplay/ts-getCommentApi?id=%s&ajax=0&page=1'
        # Request the first page of comments.
        request = Request(comment_url % pid, callback=self.parse_comment)
        yield request
        composer_url = 'http://www.xinpianchang.com/u%s?from=articleList'
        # Credited composers (creators) listed on the post.
        composer_list = response.xpath('//div[@class="user-team"]//ul[@class="creator-list"]/li')
        for composer in composer_list:
            # Extract each composer's id and request their profile page.
            cid = composer.xpath('./a/@data-userid').get()
            request = Request(composer_url % cid, callback=self.parse_composer)
            request.meta['cid'] = cid
            yield request
            # Link item recording this composer's role(s) on this work.
            cr = CopyrightItem()
            cr['pcid'] = '%s_%s' % (cid, pid)
            cr['cid'] = cid
            cr['pid'] = pid
            cr['roles'] = composer.xpath('.//span[contains(@class, "roles")]/text()').get()
            yield cr

    def parse_video(self, response):
        """Parse the video JSON and yield the now-complete ``PostItem``."""
        post = response.meta['post']
        resp = json.loads(response.text)
        # Direct video URL
        post['video'] = resp['data']['resource']['default']['url']
        # Default cover image
        post['preview'] = resp['data']['video']['cover']
        yield post

    def parse_comment(self, response):
        """Parse one page of the comment JSON.

        Yields a ``CommentItem`` per comment, follows pagination via
        ``next_page_url``, and requests each commenter's profile page.
        """
        comments = json.loads(response.text)
        # Template for a commenter's profile-page URL.
        composer_url = 'http://www.xinpianchang.com/u%s?from=articleList'
        next_url = comments['data']['next_page_url']
        # Follow pagination when the API advertises a next page.
        if next_url:
            yield Request(url=next_url, callback=self.parse_comment)
        for c in comments['data']['list']:
            comment = CommentItem()
            # Comment id
            comment['commentid'] = c['commentid']
            # Id of the work being commented on
            comment['pid'] = c['articleid']
            # Comment body
            comment['content'] = c['content']
            # Timestamp (integer per the API field name — verify units)
            comment['created_at'] = c['addtime_int']
            # Commenter id
            comment['cid'] = c['userInfo']['userid']
            # Commenter display name
            comment['uname'] = c['userInfo']['username']
            # Commenter avatar URL
            comment['avatar'] = c['userInfo']['face']
            # Upvote count
            comment['like_counts'] = c['count_approve']
            # Parent comment id, set only when this is a reply.
            if c['reply']:
                comment['reply'] = c['reply']['commentid']
            yield comment
            # Also fetch the commenter's profile page (duplicates are
            # expected to be collapsed by Scrapy's dupe filter).
            request = Request(composer_url % comment['cid'], callback=self.parse_composer)
            request.meta['cid'] = comment['cid']
            yield request

    def parse_composer(self, response):
        """Parse a user profile page into a ``ComposerItem``."""
        banner = response.xpath('//div[@class="banner-wrap"]/@style').get()
        composer = ComposerItem()
        composer['cid'] = response.meta['cid']
        # Banner image, pulled out of the inline background-image style.
        # NOTE(review): unpacking raises ValueError if the style attribute
        # lacks a background-image url.
        composer['banner'], = re.findall(r'background-image:url\((.+?)\)', banner)
        # Avatar
        composer['avatar'] = response.xpath('//span[@class="avator-wrap-s"]/img/@src').get()
        # Display name
        composer['name'] = response.xpath('//p[contains(@class, "creator-name")]/text()').get()
        # Bio / signature line
        composer['intro'] = response.xpath('//p[contains(@class, "creator-desc")]/text()').get()
        # Popularity (comma-grouped text, hence ci())
        composer['like_counts'] = ci(response.xpath('//span[contains(@class, "like-counts")]/text()').get())
        # Follower count
        composer['fans_counts'] = response.xpath('//span[contains(@class, "fans-counts")]/@data-counts').get()
        # Following count
        composer['follow_counts'] = ci(response.xpath('//span[@class="follow-wrap"]/span[2]/text()').get())
        # Location (optional on the page)
        composer['location'] = response.xpath(
            '//span[contains(@class, "icon-location")]/following-sibling::span[1]/text()').get() or ''
        # Declared career/role (optional on the page)
        composer['career'] = response.xpath(
            '//span[contains(@class, "icon-career")]/following-sibling::span[1]/text()').get() or ''
        yield composer