【免费】bilibili视频评论爬取资源-CSDN文库

共2个文件

py：2个

爬虫

需积分: 0 96 浏览量 2024-05-06 21:22:29 上传评论收藏 3KB ZIP 举报

资源推荐

资源详情

资源评论

收起资源包目录

bili.zip （2个子文件）

get_bili_cookie.py 364B

bili_spider.py 8KB

# Scrape the top-level comments of a bilibili video page with Selenium and
# store them (user name, time, like count, text) in an Excel workbook.
import json
import time

from openpyxl import Workbook
from selenium import webdriver
from selenium.common.exceptions import StaleElementReferenceException
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

# Page that is crawled when login_in() is called without an explicit URL.
VIDEO_URL = "https://www.bilibili.com/video/BV1yZ421n768/?spm_id_from=333.1007.tianma.10-4-38.click&vd_source=836844ca3b9f90bce0fa984e10ec3c0a"
# JSON file holding the list of cookie dicts used to log in.
COOKIE_FILE = "b站cookie.txt"
# Workbook the scraped rows are written to.
OUTPUT_FILE = 'bilibili评论数据.xlsx'
# Container element that holds every loaded top-level comment.
REPLY_LIST_SELECTOR = '#comment > div > div > div > div.reply-warp > div.reply-list'


def login_in(browser, url=VIDEO_URL, cookie_path=COOKIE_FILE):
    """Open the video page and log in by replaying saved cookies.

    Args:
        browser: Selenium WebDriver instance.
        url: Page to open; defaults to the video this script was written for.
        cookie_path: Path of a JSON file containing a list of cookie dicts.
    """
    # Navigate first, then add the cookies to the loaded page.
    browser.get(url)
    # NOTE(review): encoding is left at the platform default, presumably the
    # same one the cookie-export script used when writing the file; confirm.
    with open(cookie_path, 'r') as cookie_file:
        cookies = json.load(cookie_file)
    for cookie in cookies:
        browser.add_cookie(cookie)
    browser.refresh()
    # Give the page time to finish loading so later element lookups succeed.
    time.sleep(10)


def slip_page(browser):
    """Scroll to the current bottom of the page.

    Args:
        browser: Selenium WebDriver instance.

    Returns:
        True if the page height grew after scrolling, False otherwise.
    """
    height_before = browser.execute_script('return document.body.scrollHeight;')
    time.sleep(2)
    browser.execute_script(f'window.scrollTo(0,{height_before})')
    time.sleep(2)
    height_after = browser.execute_script('return document.body.scrollHeight;')
    return height_after > height_before


def get_content(reply, attr):
    """Print the text of each sub-reply element.

    Args:
        reply: Iterable of sub-reply elements.
        attr: Class name of the child element that holds the reply text.
    """
    for sub_item in reply:
        text = sub_item.find_element(By.CLASS_NAME, attr).text
        # Skip replies whose text element is empty.
        if text:
            print("子评论：" + text)


def _field_text(item, css_selector):
    """Return the text of the first descendant of item matching css_selector."""
    return item.find_element(By.CSS_SELECTOR, css_selector).text


def get_comments(browser, worksheet=None, max_idle_rounds=5):
    """Log in, then scroll through the comment list and record every comment.

    Only top-level comments are collected.
    TODO: sub-replies (the "view more" button and the previous/next
    pagination inside a comment) are not crawled; get_content() is the
    helper intended for printing them.

    Args:
        browser: Selenium WebDriver instance.
        worksheet: Sheet that receives one row per comment; defaults to the
            module-level `sheet`.
        max_idle_rounds: Stop after this many consecutive scrolls that load
            no new comment, so the loop still terminates when the
            end-of-list marker never appears.

    Returns:
        Number of comment elements present when the crawl stopped.
    """
    if worksheet is None:
        worksheet = sheet

    # 1. Log in and open the video page.
    login_in(browser)

    comment_count = 0
    idle_rounds = 0
    # 2. Comments are loaded dynamically: process what is present, scroll,
    #    and repeat until the end marker shows up.
    while True:
        comment_area = browser.find_element(By.CSS_SELECTOR, REPLY_LIST_SELECTOR)
        reply_items = comment_area.find_elements(By.CLASS_NAME, 'reply-item')
        print(len(reply_items))

        # Only handle the items that were not present on the previous pass.
        for item in reply_items[comment_count:]:
            root_reply = _field_text(item, 'span[class="reply-content"]')
            reply_time = _field_text(item, 'span[class="reply-time"]')
            reply_like = _field_text(item, 'span[class="reply-like"]')
            user_name = _field_text(item, 'div[class="user-name"]')
            print(user_name, reply_time, reply_like, root_reply)
            worksheet.append([user_name, reply_time, reply_like, root_reply])

        if len(reply_items) > comment_count:
            idle_rounds = 0
        else:
            idle_rounds += 1
        comment_count = len(reply_items)

        browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(3)

        # The page renders a 'reply-end' element once everything is loaded.
        if browser.find_elements(By.CLASS_NAME, 'reply-end'):
            break
        if idle_rounds >= max_idle_rounds:
            print(f"no new comments after {idle_rounds} scrolls, stopping")
            break

    return comment_count


browser = webdriver.Chrome()

wb = Workbook()
sheet = wb.active
sheet.title = 'comment'
datas = ['user', 'reply_time', 'reply_like', 'comment']
sheet.append(datas)

try:
    get_comments(browser)
finally:
    # Keep whatever was scraped even if the crawl failed part-way, and
    # always shut the browser/driver down.
    try:
        wb.save(OUTPUT_FILE)
    finally:
        browser.quit()

评论收藏

内容反馈