# -*- coding:utf-8 -*-
# @Time : 2020/7/10 17:52
# @Author : AX
# @File : __init__.py
# @Software: PyCharm
import requests
import re
from bs4 import BeautifulSoup
import json
# 爬取腾讯视频 + 调用解析
def home_page_data(html_path="temp.html"):
    """Parse a saved Tencent Video home page into banners plus one content row.

    The page is read from a local HTML snapshot rather than fetched live
    (the original kept the live request to https://v.qq.com/ commented out).

    Args:
        html_path: Path of the UTF-8 HTML snapshot to parse. Defaults to
            ``"temp.html"``, the path the original hard-coded.

    Returns:
        A dict with a ``"banners"`` list (one dict per ``a.slider_figure``
        carousel link) and, when the first ``.mod_row_box`` element has an id
        starting with ``new_vs_hot_``, one extra key named after the rest of
        that id, holding a list of ``{"href", "title", "imgSrc"}`` dicts.

    Raises:
        OSError: If the snapshot file cannot be opened or read.
    """
    # Context manager so the file handle is released even if parsing fails;
    # the original opened the file and never closed it.
    with open(html_path, "r", encoding="utf-8") as file:
        html = BeautifulSoup(file.read(), 'html.parser')

    banners = html.select("a.slider_figure")
    print(len(banners))  # debug output kept from the original
    data = {"banners": [_parse_banner(banner) for banner in banners]}

    # Only the first row box is parsed, as in the original; guard against a
    # page without any row box instead of raising IndexError.
    row_id_prefix = "new_vs_hot_"
    rows = html.select(".mod_row_box")
    if rows:
        first_row = rows[0]
        row_id = first_row.get("id")
        if row_id and row_id.startswith(row_id_prefix):
            data[row_id[len(row_id_prefix):]] = _parse_row_items(first_row)

    print(data)  # debug output kept from the original
    # The original built this dict "to be returned" but never returned it.
    return data


def _parse_banner(banner):
    """Build the result dict for one ``a.slider_figure`` carousel link."""

    def attr_of(selector, attr):
        # First descendant matching `selector`, or None when it is absent,
        # so an incomplete banner no longer raises IndexError.
        node = banner.select_one(selector)
        return node.get(attr) if node is not None else None

    current = {
        "playerUrl": banner.get("href"),
        "title": attr_of(".slider_figure_title", "title"),
        "desc": attr_of(".slider_figure_desc2", "title"),
        "updateStatus": attr_of(".slider_figure_desc", "data-update") or '',
    }

    # Images are lazy-loaded: prefer the deferred `lz_next` URL and fall back
    # to `src`. As in the original, "imgSrc" is omitted when neither is set.
    img = banner.img
    src = (img.get("lz_next") or img.get("src")) if img is not None else None
    if src:
        current["imgSrc"] = "https:" + src
    return current


def _parse_row_items(row):
    """Collect the linked ``.list_item`` entries inside one row box."""
    items = []
    for list_item in row.select(".list_item"):
        anchor = list_item.a
        if anchor is None:
            continue
        # NOTE(review): the original read only the "playerUrl" attribute even
        # though its own comment said "get href"; "href" is used as a fallback
        # here so links are still found -- confirm which attribute is intended.
        href = anchor.get("playerUrl") or anchor.get("href")
        if not href:
            continue
        img = list_item.img
        src = img.get("src") if img is not None else None
        items.append({
            "href": href,
            "title": img.get("alt") if img is not None else None,
            "imgSrc": "https:" + src if src else "",
        })
    return items
def searchTXplayer(keyword, timeout=10):
    """Search Tencent Video for *keyword* and return the series-style results.

    Requests ``https://v.qq.com/x/search/`` and parses every
    ``.result_item_v`` card of the returned HTML.

    Args:
        keyword: Search term; converted with ``str()`` and URL-encoded.
        timeout: Seconds to wait for the HTTP response.

    Returns:
        A list of dicts with the keys ``id``, ``imgSrc``, ``title``, ``desc``,
        ``current_status`` and ``current_count``. Cards without a title link
        are skipped because no id can be derived for them.

    Raises:
        requests.RequestException: If the HTTP request fails or times out.
    """
    headers = {
        'user-agent': r'Mozilla/5.0 (Windows NT 10.0; Win64; x64) App'
                      r'leWebKit/537.36 (KHTML, like Gecko) Chrome/83'
                      r'.0.4103.116 Safari/537.36'
    }
    # `params` URL-encodes the keyword (the original concatenated it raw), and
    # the headers are now actually sent (the original built them and then
    # never passed them to requests.get).
    response = requests.get(
        'https://v.qq.com/x/search/',
        params={"q": str(keyword), "stag": "0", "smartbox_ab": ""},
        headers=headers,
        timeout=timeout,
    )
    html = BeautifulSoup(str(response.text), 'html.parser')

    result_items = html.select(".result_item_v")
    print(len(result_items))  # debug output kept from the original

    # Only the prefix *length* matters: the id is cut out positionally.
    detail_prefix_len = len("https://v.qq.com/detail/m/")
    result = []
    for item in result_items:
        title_box = item.select_one(".result_title")
        title_link = title_box.a if title_box is not None else None
        if title_link is None:
            continue

        temp = {
            "id": "",
            "imgSrc": "",
            "title": "",
            "desc": "",
            "current_status": "",
            "current_count": 1
        }

        # Cover image.
        img = item.select_one(".figure_pic")
        img_src = img.get("src") if img is not None else None
        if img_src:
            temp["imgSrc"] = "https:" + img_src

        # Episode status text; the fallback string is used when it is missing.
        info = item.select_one(".figure_info")
        status = (info.string if info is not None else None) or "无集数信息"
        temp["current_status"] = status

        # Episode count: first run of digits in the status text, else 0.
        digits = re.search(r"\d+", status)
        temp["current_count"] = int(digits.group()) if digits else 0

        # Id: the href with the detail-page prefix and the trailing five
        # characters removed (presumably ".html" -- confirm against live data).
        href = title_link.get("href") or ""
        temp["id"] = href[detail_prefix_len:-5]

        # Description: the first child of the description node when that
        # child is text; a Tag child or an empty node now yields "".
        desc_node = item.select_one(".desc_text")
        first_child = None
        if desc_node is not None:
            first_child = next(iter(desc_node.children), None)
        if isinstance(first_child, str):
            temp["desc"] = first_child.strip()

        # Title: every text fragment inside the link, stripped and joined.
        # The original crashed when a child tag's `.string` was None.
        temp["title"] = "".join(title_link.stripped_strings)

        result.append(temp)
    return result
def get_page_message(id, count, timeout=10):
    """Fetch the episode list for a cover id from the get_playsource endpoint.

    Args:
        id: Cover/series id; converted with ``str()``.
        count: Last episode number; the request asks for the range
            ``1-<count>``. May be an int or a string (the original raised
            TypeError for an int because it concatenated it onto a str).
        timeout: Seconds to wait for the HTTP response.

    Returns:
        The decoded JSON payload of the JSONP response.

    Raises:
        requests.RequestException: If the HTTP request fails or times out.
        ValueError: If the response is not a JSONP-wrapped JSON document
            (``json.JSONDecodeError`` is a subclass of ``ValueError``).
    """
    response = requests.get(
        "https://s.video.qq.com/get_playsource",
        params={
            "id": str(id),
            "plat": "2",
            "type": "4",
            "data_type": "2",
            "video_type": "3",
            "range": "1-" + str(count),
            "plname": "qq",
            "otype": "json",
            "num_mod_cnt": "20",
            "callback": "_jsonp_2_9081",
            "_t": "1595256007313",
        },
        timeout=timeout,
    )

    # The body is JSONP: `<callback>(<json>)`. Cut at the outermost
    # parentheses instead of the fixed [14:-1] slice, which depended on the
    # exact callback length and on nothing following the closing parenthesis.
    text = response.text
    start = text.find("(")
    end = text.rfind(")")
    if start == -1 or end < start:
        raise ValueError("get_playsource response is not a JSONP payload")
    ans = json.loads(text[start + 1:end])

    # Debug output kept from the original.
    print(ans)
    print(type(ans))
    return ans
# 拿到最后一集
# test_find = html.select(".result_episode_list")[0]
# count = test_find.select(".item")[-2]
# id = count.select("a")[0].get("href")[len("https://v.qq.com/x/cover/"):]
# id = id[:str(id).find("/")]
# print(id)
# count = count.select("a")[0].string
# print(count)
# 调用接口读取数据
# ans = get_page_message(id, count)
# return ans
没有合适的资源?快使用搜索试试~ 我知道了~
基于flask搭建的python爬虫项目,爬取tx视频并返回对应api.zip
共23个文件
xml:8个
py:8个
pyc:4个
需积分: 5 0 下载量 125 浏览量
2023-09-28
12:34:19
上传
评论
收藏 29KB ZIP 举报
温馨提示
基于flask搭建的python爬虫项目,爬取tx视频并返回对应api
资源推荐
资源详情
资源评论
收起资源包目录
基于flask搭建的python爬虫项目,爬取tx视频并返回对应api.zip (23个子文件)
txplayer-flask-master
baiduAPI
__init__.py 2KB
__pycache__
__init__.cpython-38.pyc 584B
test.py 186B
app.py 1KB
.idea
codeStyles
codeStyleConfig.xml 153B
webServers.xml 638B
vcs.xml 185B
misc.xml 294B
inspectionProfiles
profiles_settings.xml 174B
modules.xml 285B
deployment.xml 482B
movies-player.iml 645B
.gitignore 184B
sshConfigs.xml 308B
txplayer
__init__.py 6KB
__pycache__
__init__.cpython-38.pyc 3KB
test.py 307B
HomePage
__init__.py 3KB
__pycache__
__init__.cpython-38.pyc 2KB
test.py 235B
temp.html 68KB
iqiyiPlayer
__init__.py 250B
__pycache__
app.cpython-38.pyc 2KB
共 23 条
- 1
资源评论
天天501
- 粉丝: 606
- 资源: 4665
上传资源 快速赚钱
- 我的内容管理 展开
- 我的资源 快来上传第一个资源
- 我的收益 登录查看自己的收益
- 我的积分 登录查看自己的积分
- 我的C币 登录后查看C币余额
- 我的收藏
- 我的下载
- 下载帮助
安全验证
文档复制为VIP权益,开通VIP直接复制
信息提交成功