# -*- coding: utf-8 -*-
# @Time : 2021/12/8 10:20
# @Author : MinChess
# @File : weibo.py
# @Software: PyCharm
import requests
from lxml import etree
from urllib.parse import quote
import csv
import re
import time
import random
from html.parser import HTMLParser
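# Scrape Weibo topic-search results (s.weibo.com) for a keyword: for each post, grab the
# text, posting time and like count, look up the poster via getpeople(), fetch the post's
# comments via getComment() when it has any, and append one row per post to '<topic>.csv'.
# Cookies / UA for the desktop site (s.weibo.com / weibo.com requests).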
headers_com = {
'Cookie': 'SINAGLOBAL=8530308723572.626.1594801309423; UOR=,,www.baidu.com; ALF=1649247528; SUB=_2A25PIYh4DeRhGeVJ7lQS8C_Oyz-IHXVs7SgwrDV8PUJbkNAKLU7kkW1NT8v1zHG7ndkzD92IXpMb9pyYayxZv5p-; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9Wh0wcVpufLRVzXYfa3pjY.Q5JpX5oz75NHD95Q0S0-ce05peo50Ws4Dqcjci--fi-i2i-zpi--fi-82iK.7i--Xi-zRiKy2i--Ri-88i-zRi--fiKyWi-2Ri--Xi-zRi-zc; ULV=1646658060503:5:1:1:243708356859.10474.1646658060493:1627299580145; WBPSESS=--AcJbA8Oa9zAFKPUTSD5sLC7tlhjwiMerLK05g7m8lHcPohg3YiE1uguj3KKHQSomGyiIUaCIxDcQ8SorKYE4ke6D9-V88QEw1v2kXwKPZzIIKl9DmiSwzsb_sbtUU6Qjd2cgrlnOdjlrSBRc2ayQ==; XSRF-TOKEN=6W6_mdFm7HVugNLWYT4De_m1',
# 'Cookie': 'SINAGLOBAL=6692839056672.04.1575211969653; XSRF-TOKEN=dvRQl6BFSaBrVQ9aoz2c-lau; _s_tentry=weibo.com; Apache=3761453327801.136.1646320550576; ULV=1646320550596:1:1:1:3761453327801.136.1646320550576:; UOR=,,www.baidu.com; login_sid_t=c055241708da7f3454da76e72babbf12; cross_origin_proto=SSL; ALF=1678169320; SCF=ApwEkyMNH24hMtFSKVLzEW_0pBDzpWweLRBt1V_aCUAGg_823TfLvw0CHZ5_j8J4itdbpBNo1i3scT2gChOhJEI.; WBPSESS=bwbIVYtIkNrJPKlhfJ1TLTI4ZrCYeqURLSWwfQ5QY1_xCzsPzZlG2luc9qbEWVTtt5VJEkReleyuvjb8g-Jcu6vCjNqx2-d46YFyR7feEIwjYWLqWU7e4c64BX02U-ujI5rXPTsoipPuP6cTVWkBFQ==; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WWy-ExKsYH4ouv2pp40B4Md5NHD95QESoBcSoBXe0.pWs4Dqcj8TK84Ug2t; SSOLoginState=1646659617; SUB=_2A25PInhxDeRhGeRI71QX9CrPwj2IHXVs7Rg5rDV8PUJbkNB-LW3fkW1NUrg5CZibBb5RX_J1BdEiJc81kTpMpuEb',
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36'
}
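# Cookies / UA for the mobile site weibo.cn (used for the user-search request in getpeople).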
headers_cn = {
'Cookie': 'SUB=_2A25PIYh3DeRhGeVJ7lQS8C_Oyz-IHXVs7Sg_rDV6PUJbkdAKLU2nkW1NT8v1zA4Q0lakelzRb0x1ELOzOBSmUcOp; SCF=AtH4DCXADZs6FfnqoYSmwpUovXDFEGtPUyLpA8VSmAi8gymQXfWethijtJEZtyxd0rxS2BSamrES0Z3AQaim2FQ.; _T_WM=16923311902; WEIBOCN_FROM=1110006030; MLOGIN=1; M_WEIBOCN_PARAMS=uicode%3D20000174',
# 'Cookie': '_T_WM=48060870584; BAIDU_SSP_lcr=https://security.weibo.com/; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WWy-ExKsYH4ouv2pp40B4Md5NHD95QESoBcSoBXe0.pWs4Dqcj8TK84Ug2t; SCF=ApwEkyMNH24hMtFSKVLzEW_0pBDzpWweLRBt1V_aCUAGx_5t8MsvZWG0xU0wbpvlyHnR-BdbVzll0Tm4JqNmyzs.; SUB=_2A25PInhxDeRhGeRI71QX9CrPwj2IHXVs7Rg5rDV6PUJbktCOLRTYkW1NUrg5CV_Fr49NHRIZpmpCqqeqJSJ0Dpol; SSOLoginState=1646659617',
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36'
}
baseUrl = 'https://s.weibo.com/weibo?q={}&Refer=index'
topic = '郑州暴雨'
csvfile = open(topic + '.csv', 'a', newline='', encoding='utf-8-sig')
writer = csv.writer(csvfile)
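# Each CSV row written in getTopic() has the columns:
# ['微博', name, hosturl, host_sex, host_location, hostcount, hostfollow, hostfans, contents, timeA, likeA]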
def getTopic(url):
    # Crawl the topic-search result pages one by one and write one CSV row per post.
    page = 0
    pageCount = 1
    while True:
        weibo_content = []
        weibo_liketimes = []
        weibo_date = []
        page = page + 1
        tempUrl = url + '&page=' + str(page)
        print('-' * 36, tempUrl, '-' * 36)
        response = requests.get(tempUrl, headers=headers_com)
        html = etree.HTML(response.text, parser=etree.HTMLParser(encoding='utf-8'))
        # Subtract the trailing card-wrap elements that are not post cards.
        count = len(html.xpath('//div[@class="card-wrap"]')) - 2
        for i in range(1, count + 1):
            # Post text: prefer the expanded full-text node, otherwise fall back to the short one.
            try:
                contents = html.xpath('//div[@class="card-wrap"][' + str(i)
                                      + ']/div[@class="card"]/div[1]/div[2]/p[@node-type="feed_list_content_full"]')
                contents = contents[0].xpath('string(.)').strip()  # all text under this node
            except:
                contents = html.xpath('//div[@class="card-wrap"][' + str(i)
                                      + ']/div[@class="card"]/div[1]/div[2]/p[@node-type="feed_list_content"]')
                # If this also fails, the current post is malformed; skip it.
                try:
                    contents = contents[0].xpath('string(.)').strip()
                except:
                    continue
            # Strip the "collapse full text" artifacts and the trailing fragment after ' 2'.
            contents = contents.replace('收起全文d', '')
            contents = contents.replace('收起d', '')
            contents = contents.split(' 2')[0]
            # Screen name of the poster
            name = html.xpath('//div[@class="card-wrap"][' + str(i)
                              + ']/div[@class="card"]/div[1]/div[2]/div[1]/div[2]/a')[0].text
            # Post URL
            weibo_url = html.xpath('//div[@class="card-wrap"][' + str(i)
                                   + ']/div[@class="card"]/div[1]/div[2]/p[@class="from"]/a/@href')[0]
            url_str = r'.*?com/\d+/(.*)\?refer_flag=\d+_'
            res = re.findall(url_str, weibo_url)
            weibo_url = res[0]
            host_url = 'https://weibo.cn/comment/' + weibo_url
            # Posting time
            timeA = html.xpath('//div[@class="card-wrap"][' + str(i)
                               + ']/div[@class="card"]/div[1]/div[2]/p[@class="from"]/a')[0].text.strip()
            # Like count and comment count
            likeA = html.xpath('//div[@class="card-wrap"][' + str(i)
                               + ']/div[@class="card"]/div[2]/ul[1]/li[3]/a/button/span[2]')[0].text
            hostComment = html.xpath('//div[@class="card-wrap"][' + str(i)
                                     + ']/div[@class="card"]/div[2]/ul[1]/li[2]/a')[0].text
            # A bare label means the counter is zero.
            if likeA == '赞':
                likeA = 0
            if hostComment == '评论 ':
                hostComment = 0
            if hostComment != 0:
                print('正在爬取第', page, '页,第', i, '条微博的评论。')
                getComment(host_url)
            try:
                hosturl, host_sex, host_location, hostcount, hostfollow, hostfans = getpeople(name)
                row = ['微博', name, hosturl, host_sex, host_location, hostcount, hostfollow, hostfans,
                       contents, timeA, likeA]
                writer.writerow(row)
            except:
                continue
        print('=' * 66)
        # Paging: on the first page the pager has a single link, afterwards the next-page
        # link is the second <a>; stop after 50 pages or when there is no next page.
        try:
            if pageCount == 1:
                pageA = html.xpath('//*[@id="pl_feedlist_index"]/div[5]/div/a')[0].text
                print(pageA)
                pageCount = pageCount + 1
            elif pageCount == 50:
                print('没有下一页了')
                break
            else:
                pageA = html.xpath('//*[@id="pl_feedlist_index"]/div[5]/div/a[2]')[0].text
                pageCount = pageCount + 1
                print(pageA)
        except:
            print('没有下一页了')
            break
def getpeople(name):
    # Look up a user's weibo.cn profile URL by screen name, retrying on failure.
    findPoeple = 0
    url2 = 'https://s.weibo.com/user?q='
    while True:
        try:
            # Trigger the user search on the mobile site, then parse the desktop search page.
            response = requests.post('https://weibo.cn/search/?pos=search', headers=headers_cn,
                                     data={'suser': '找人', 'keyword': name})
            tempUrl2 = url2 + quote(str(name)) + '&Refer=weibo_user'
            print('搜索页面', tempUrl2)
            response2 = requests.get(tempUrl2, headers=headers_com)
            html = etree.HTML(response2.content, parser=etree.HTMLParser(encoding='utf-8'))
            # Profile link of the first matching user
            hosturl_01 = html.xpath('/html/body/div[1]/div[2]/div/div[2]/div[1]/div[3]/div[1]/div[2]/div/a/@href')[0]
            url_str = r'.*?com/(.*)'
            res = re.findall(url_str, hosturl_01)
            hosturl = 'https://weibo.cn/' + res[0]
            print('找人主页:', hosturl)
            break
        except:
            # After 10 failed attempts, assume the IP is rate-limited and back off for a while.
            if findPoeple == 10:
                stop = random.randint(60, 300)
                print('IP被封等待一段时间在爬', stop, '秒')
                time.sleep(stop)
                if response.status_code == 200:
                    return
            print('找人')
            time.sleep(random.randint(0, 10))
            findPoeple = findPoeple + 1