# Uses the requests library and the BeautifulSoup class from the bs4 package
import requests
from bs4 import BeautifulSoup
import json
import os
import time
import random
import jieba
import numpy as np
import PIL
from PIL import Image
import matplotlib.pyplot as plt
from wordcloud import WordCloud
# HTTP headers sent with every request: a desktop-browser User-Agent and a Referer.
kv = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/75.0.3770.100 Safari/537.36'
    ),
    'Referer': 'https://item.jd.com/1263013576.html',
}

# Text file the scraped comments are appended to (and later read back from).
commentfile = r'C:\Users\chenf\Desktop\Python\GUI\Projects\FilmComennt\Comments.txt'

# Font file handed to WordCloud as ``font_path``.
WCFont = r'C:\WINDOWS\Fonts\simsun.ttc'

# Image loaded as the word-cloud mask (shape).
WCIMG = r'C:\Users\chenf\Desktop\Python\GUI\Projects\FilmComennt\CASC.jpg'

# Destination file for the rendered word-cloud image.
WCSave = r'C:\Users\chenf\Desktop\Python\GUI\Projects\FilmComennt\WC.jpg'
def JDcomment(start=0):
    """Fetch one page of short comments and append them to ``commentfile``.

    Requests the Douban comments page for subject 26849758, collects every
    ``<span class="short">`` element in the response, and appends the string
    of each one (one per line, UTF-8) to ``commentfile`` while echoing it to
    stdout. Spans whose ``.string`` is ``None`` are skipped.

    Args:
        start: Value substituted into the ``start`` query parameter of the
            comments URL. Defaults to 0.

    Returns:
        None. When the request fails (connection error, timeout, or an HTTP
        error status) ``'Failed!'`` is printed and nothing is written.
    """
    url = ('https://movie.douban.com/subject/26849758/comments?'
           'start=%s&limit=20&sort=new_score&status=P' % start)

    try:
        # requests has no default timeout; without one a stalled connection
        # would block the whole scraping run indefinitely.
        r = requests.get(url, headers=kv, timeout=10)
        r.raise_for_status()
    except requests.RequestException:
        print('Failed!')
        # No usable response: stop here instead of parsing a missing or
        # error-page body.
        return

    soup = BeautifulSoup(r.text, 'lxml')
    candidates = (span.string for span in soup.find_all('span', class_='short'))
    messages = [msg for msg in candidates if msg is not None]
    if not messages:
        # Nothing to store; do not create or touch the output file.
        return

    # Open the output file once per page rather than once per comment.
    with open(commentfile, 'a+', encoding='utf-8') as file:
        for msg in messages:
            file.write(msg + '\n')
            print(msg)
def batch_JDcomment(pages=100):
    """Scrape several consecutive comment pages into a fresh ``commentfile``.

    Deletes any existing ``commentfile`` and then calls ``JDcomment`` once per
    page, sleeping a random 0-5 seconds after each request.

    Args:
        pages: Number of pages to request. Defaults to 100.

    Returns:
        None.
    """
    # JDcomment builds its URL with ``limit=20``, so the ``start`` offset has
    # to advance by 20 per page; stepping by 1 would re-fetch 19 of every 20
    # comments on each request.
    page_size = 20

    # Start from an empty file: JDcomment only ever appends.
    try:
        os.remove(commentfile)
    except FileNotFoundError:
        pass

    for page in range(pages):
        JDcomment(page * page_size)
        # Random pause between requests to reduce the chance of an IP ban.
        time.sleep(random.random() * 5)
def cut_word():
    """Segment the saved comments with jieba and return them space-joined.

    Reads the whole of ``commentfile`` as UTF-8, tokenizes it with
    ``jieba.cut(..., cut_all=True)``, joins the tokens with single spaces,
    prints the joined text and returns it.

    Returns:
        str: The space-separated tokens.
    """
    with open(commentfile, encoding='utf-8') as source:
        raw_text = source.read()

    # WordCloud splits its input on whitespace, hence the space-joined tokens.
    segmented = " ".join(jieba.cut(raw_text, cut_all=True))
    print(segmented)
    return segmented
def create_word_cloud():
    """Render a word cloud from the segmented comments, save it and show it.

    Loads ``WCIMG`` as the mask array, builds a ``WordCloud`` using the font
    at ``WCFont``, feeds it the text returned by ``cut_word()``, writes the
    result to ``WCSave`` and finally displays it in a matplotlib window.

    Returns:
        None.
    """
    mask_pixels = np.array(Image.open(WCIMG))

    # Words excluded from the cloud.
    ignored_words = ('电视', '电视剧', '古装', '古装剧', '第一部',
                     '一部', '十二', '时辰', '二时', '十二时')

    # Word-cloud configuration.
    cloud_options = dict(
        background_color='white',
        max_words=500,
        mask=mask_pixels,
        scale=4,
        max_font_size=50,
        random_state=42,
        font_path=WCFont,
        stopwords=ignored_words,
    )
    cloud = WordCloud(**cloud_options)

    # Build the cloud from the segmented comment text.
    segmented_text = cut_word()
    cloud.generate(segmented_text)

    # Persist the rendered image.
    cloud.to_file(WCSave)

    # Display the rendered cloud without axes.
    plt.imshow(cloud, interpolation='bilinear')
    plt.axis('off')
    plt.show()
if __name__ == '__main__':
    # Other pipeline stages, currently disabled; enable as needed:
    #   JDcomment()        - fetch a single page of comments
    #   batch_JDcomment()  - clear the comment file and fetch pages in a loop
    #   cut_word()         - segment the saved comments and print them
    create_word_cloud()