"""
网易云音乐爬虫
原理:
1、从https://music.163.com/#/discover/playlist歌单列表页面中,手动获取分类中的风格,作为音乐类型
2、从https://music.163.com/discover/playlist/?cat=%E6%B5%81%E8%A1%8C中(将音乐类型作为链接参数)
爬取每种类型下的歌单
3、从https://music.163.com/#/playlist?id=4991155974中爬取歌单详情(id是歌单id)
4、解析歌单详情,并从中获取歌单下的音乐id、音乐名称
5、从https://music.163.com/#/song?id=108298中爬取音乐详情(id是音乐id)
6、解析音乐详情页面中的图片、mp3、歌词、歌手(其中图片、mp3、歌词又分别有对应的api链接,参数都是音乐id)
7、保存
"""
import random
import re
import emoji
import requests
from bs4 import BeautifulSoup
from cloudmusicspider.dbhelper import DBHelper
from cloudmusicspider.settings import User_Agents
from cloudmusicspider.util import Util
class MusicSpider(object):
def __init__(self, musicCount, imagePath):
    """Store the crawl settings, pick request headers and open the DB helper.

    Args:
        musicCount: Number of tracks to crawl for each music type. The
            original note says already-crawled tracks are excluded from
            this count; that filtering is not done in this method.
        imagePath: Handed unchanged to ``DBHelper``.
    """
    # Site root that every crawled URL is built on.
    self.base_url = 'https://music.163.com/'
    # Listing URL = site root + query template; cat/limit/offset are filled
    # in later, once per requested page.
    listingTemplate = 'discover/playlist/?order=hot&cat={cat}&limit={limit}&offset={offset}'
    self.full_url = self.base_url + listingTemplate
    # A single User-Agent is drawn at random from the configured pool.
    self.headers = {'User-Agent': random.choice(User_Agents)}
    self.musicCount = musicCount
    # Playlists requested per listing page (35 by default, adjustable).
    self.pageSize = 35
    # Database helper used by the crawl methods.
    self.dbHelper = DBHelper(imagePath)
    # Music types (styles) to crawl, in crawl order.
    self.musicTypeList = [
        "流行",
        "摇滚",
        "民谣",
        "电子",
        "舞曲",
        "说唱",
        "轻音乐",
        "爵士",
        "乡村",
        "R&B/Soul",
        "古典",
        "民族",
        "英伦",
        "金属",
        "朋克",
        "蓝调",
        "雷鬼",
        "世界音乐",
        "拉丁",
        "New Age",
        "古风",
        "后摇",
        "Bossa Nova",
    ]
# 爬虫运行方法
def catchData(self):
    """Run the whole crawl, framed by a start banner and an end banner."""
    startBanner = "***爬取网易云音乐数据开始***"
    endBanner = "***爬取网易云音乐数据结束***"
    print(startBanner)
    # All of the crawling work is driven from the music-type loop.
    self.getMusicTypes()
    print(endBanner)
# 爬取音乐类型
def getMusicTypes(self):
    """Crawl the playlists of every configured music type, one at a time.

    For each name in ``self.musicTypeList`` the type record is fetched with
    ``self.dbHelper.findType`` and passed to ``self.getPlaylists``. An
    exception raised while handling one type is printed and the loop moves
    on to the next type, so a single failing type does not abort the types
    that follow it.
    """
    for typeName in self.musicTypeList:
        print("爬取音乐类型:【%s" % typeName + "】开始")
        try:
            # Per the original note, findType returns the stored record and
            # creates it first when missing (that logic lives in DBHelper).
            musicType = self.dbHelper.findType(typeName)
            # Crawl the playlist listing for this type.
            self.getPlaylists(musicType)
        except Exception as ex:
            # Best-effort crawl: report the failure and keep going. The
            # "end" message is skipped for a type that failed.
            print(ex)
            continue
        print("爬取音乐类型:【%s" % typeName + "】结束")
# 爬取歌单列表
def getPlaylists(self, type):
    """Crawl the playlist listing pages of one music type, page by page.

    Each page requests ``self.pageSize`` playlists. Every playlist found is
    passed to ``self.getPlaylist``, whose return value is used as the number
    of tracks still to crawl for this type. Paging stops when that number is
    ``<= 0`` or when a page contains no playlists. Any exception is printed
    and ends the crawl of this type.

    Args:
        type: Music type record; only ``type.get("typename")`` is read here.
    """
    # Imported locally so this method does not rely on the file's import
    # section being updated.
    from urllib.parse import quote

    try:
        typeName = type.get("typename")
        # Percent-encode the category before it goes into the query string:
        # a raw "&" or "/" (as in "R&B/Soul") would otherwise cut the
        # parameter short and a different category would be requested.
        cat = quote(typeName, safe="")
        # Tracks still to crawl for this type.
        remaining = self.musicCount
        currentPage = 0
        while True:
            # Offset of the first playlist on the page being requested.
            start = currentPage * self.pageSize
            currentPage = currentPage + 1
            fullUrlTemp = self.full_url.format(cat=cat, limit=self.pageSize, offset=start)
            print("爬取音乐类型:【%s" % typeName + "】第【" + str(currentPage) + "】页开始")
            print("链接地址:" + fullUrlTemp)
            # Pause before each request; the original note cites the site's
            # anti-crawler measures as the reason.
            Util().getRandomSleep()
            resp = requests.get(fullUrlTemp, headers=self.headers)
            soup = BeautifulSoup(resp.text, 'lxml')
            # Container element that holds the playlist <li> entries.
            playlistsUl = soup.find(id="m-pl-container")
            liList = playlistsUl.find_all("li") if playlistsUl is not None else []
            if not liList:
                # Without this stop the loop would keep requesting ever
                # larger offsets forever once the listing is exhausted.
                print("爬取音乐类型:【%s】第【%s】页没有歌单,停止翻页" % (typeName, currentPage))
                break
            for playlistTemp in liList:
                # The anchor inside <p class="dec"> carries title and link.
                playlistTag = playlistTemp.find('p', class_="dec").a
                playlistName = playlistTag["title"]
                # NOTE(review): href presumably starts with "/", which gives
                # a double slash after base_url; kept as in the original.
                playlistUrl = self.base_url + playlistTag["href"]
                print("爬取音乐类型:【%s" % typeName + "】下的歌单【" + playlistName + "】开始")
                print("歌单地址:" + playlistUrl)
                Util().getRandomSleep()
                playlistDict = {"playlistName": playlistName, "playlistUrl": playlistUrl}
                # getPlaylist reports how many tracks are still wanted.
                remaining = self.getPlaylist(playlistDict, type, remaining)
                print("爬取音乐类型:【%s" % typeName + "】下的歌单【" + playlistName + "】结束")
                if remaining <= 0:
                    break
            print("爬取音乐类型:【%s" % typeName + "】第【" + str(currentPage) + "】页结束")
            # Quota reached inside the page loop: stop paging as well.
            if remaining <= 0:
                break
    except Exception as ex:
        print(ex)
# 爬取歌单详情
def getPlaylist(self, playlistDict, type, musicCountTemp):
try:
# 爬取数据
resp = requests.get(playlistDict["playlistUrl"], headers=self.headers)
# 解析html数据
soup = BeautifulSoup(resp.text, "lxml")
contentDiv = soup.find(id="song-list-pre-cache")
# 获取音乐列表(隐藏域)
ulTag = contentDiv.find_all("ul", class_="f-hide")[0]
aTags = ulTag.find_all("a")
# 遍历音乐a标签
for aTag in aTags:
# 音乐详情地址
musicUrl = self.base_url + aTag["href"]
# 音乐名称
musicName = aTag.get_text()
# 网络id
wid = musicUrl[musicUrl.find("id=") + 3:]
print("爬取歌单:【%s" % playlistDict["playlistName"] + "】下的音乐【" + musicName + "】开始")
# 从数据库中查询音乐名称判断是否存在
result = self.dbHelper.findItemEx(musicName)
if result is None or len(result) == 0: # 数据库中不存在该音乐
print("音乐地址:" + musicUrl)
# 创建一个音乐字典
music = dict()