from flask import request,Blueprint,jsonify
import requests
from bs4 import BeautifulSoup
import pymysql as pymysql
from analyze.analyze import createIndex
from hotword.hotword import hotwordCreate
from pojo.Content import Content
import analyze
import re
# Blueprint holding all spider/query routes defined in this module.
xlSpider = Blueprint('xlSpider', __name__)
# Weibo hot-search summary page scraped by the /createSpider endpoint.
url='https://s.weibo.com/top/summary?Refer=top_hot&topnav=1&wvr=6'
def getHTMLText(url, timeout=30):
    """Fetch *url* and return its body as text.

    Raises requests.HTTPError for non-2xx responses and
    requests.Timeout after *timeout* seconds.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    # Decode with the content-sniffed charset rather than the
    # header-declared one.
    response.encoding = response.apparent_encoding
    return response.text
@xlSpider.route("/createSpider")
def createSpider():
html = getHTMLText(url)
soup = BeautifulSoup(html, 'html.parser')
list=[]
list2=[]
for item in soup.find_all('tr',class_=""): # 查找div的class为item的dom
# print(item) #测试,查看电影item信息
# 保存一部电影信息
dict = {
"url": "",
"content": "",
"score": ""
}
item = str(item) # 转换字符串
# innerHtml=BeautifulSoup(item, 'html.parser')
# ahtml=innerHtml.find_all('a')
# ahtml[0].string
# ahtml[0]['href']
# 正则表达式查找影片链接
obj =re.findall(re.compile(r'<a href="(.*?)" target="_blank">(.*)</a>'), item)
content=Content()
if obj.__len__()>0:
content.url = obj[0][0]
dict.update({
"url":obj[0][0]
})
if obj[0].__len__()>0:
content.content=obj[0][1]
dict.update({
"content": obj[0][1]
})
# content = re.findall(re.compile(r'<a target="_blank">(.*)</a>'), item)[0]
score = re.findall(re.compile(r'<span>(.*)</span>'), item)
if score.__len__()>0:
content.score=score[0]
dict.update({
"score": score[0]
})
list2.append(dict)
list.append(content)
saveToDB(list)
# createIndex(list)
# hotwordCreate(list)
return "爬取创建成功"
@xlSpider.route("/clearData")
def clearData():
conn = getConn()
conn.autocommit(1)
cursor = conn.cursor()
try:
sql = "truncate xlwb_data"
cursor.execute(sql)
except:
import traceback
traceback.print_exc()
# 发生错误时回滚
conn.rollback()
finally:
# 关闭游标连接
cursor.close()
# 关闭数据库连接
conn.close()
return "清空数据成功"
@xlSpider.route("/createIndexDoc")
def createIndexDoc():
conn = getConn()
conn.autocommit(1)
cursor = conn.cursor()
try:
sql = "SELECT distinct * FROM xlwb_data"
cursor.execute(sql)
data = cursor.fetchall()
list = []
for d in data:
content = Content()
content.content=d[1]
content.url=d[2]
content.score=d[3]
list.append(content)
createIndex(list)
hotwordCreate(list)
except:
import traceback
traceback.print_exc()
# 发生错误时回滚
conn.rollback()
finally:
# 关闭游标连接
cursor.close()
# 关闭数据库连接
conn.close()
return "成功"
@xlSpider.route("/getDataFromDB")
def getDataFromDB():
keyword=request.args.get("keyword")
page=int(request.args.get("page"))
limit=int(request.args.get("limit"))
start=(page-1)*limit
conn=getConn()
conn.autocommit(1)
cursor = conn.cursor()
try:
countSQL="select count(id) count from xlwb_data where 1=1 and content like '%{}%'".format(keyword)
cursor.execute(countSQL);
count =cursor.fetchone()
sql="SELECT distinct * FROM xlwb_data WHERE 1=1 and content like '%{}%' limit {},{} ".format(keyword,start,limit)
cursor.execute(sql)
data = cursor.fetchall()
list=[]
for d in data:
dict = {
"id": d[0],
"content": d[1],
"url": d[2],
"score":d[3]
}
list.append(dict)
resp = {
"code": 0,
"msg": "",
"data": list,
"count": count
}
return jsonify(resp)
except:
import traceback
traceback.print_exc()
# 发生错误时回滚
conn.rollback()
finally:
# 关闭游标连接
cursor.close()
# 关闭数据库连接
conn.close()
return
def getConn():
    """Open and return a new connection to the xlwb_spider MySQL database."""
    settings = dict(
        host='127.0.0.1',
        port=3306,
        user='root',
        passwd='123456',
        db='xlwb_spider',
        charset='utf8',
    )
    return pymysql.connect(**settings)
def saveToDB(list):
    """Insert crawled Content items into xlwb_data.

    Skips items with an empty `content`. Failures are logged and rolled
    back; the connection is always closed.

    Note: the parameter keeps its original name `list` (it shadows the
    builtin) to preserve the public signature for keyword callers.
    """
    conn = getConn()
    conn.autocommit(1)
    cursor = conn.cursor()
    try:
        # Parameterized insert replaces the manual escape_string()
        # string-building (escape_string was removed in PyMySQL >= 1.0
        # and the old form was still injection-prone).
        sql = "INSERT INTO xlwb_data(content,url,score) VALUES (%s,%s,%s)"
        for item in list:
            if item.content == "":
                continue  # skip rows where no topic text was parsed
            cursor.execute(sql, (item.content, item.getAbsoloteURL(), item.score))
    except Exception:  # was a bare except
        import traceback
        traceback.print_exc()
        conn.rollback()
    finally:
        cursor.close()
        conn.close()
没有合适的资源?快使用搜索试试~ 我知道了~
资源推荐
资源详情
资源评论
收起资源包目录
毕业设计源码之微博舆情分析系统的设计与实现(python).zip (49个子文件)
项目部署说明.zip 525KB
myProject
项目部署说明.zip 525KB
xlwb_spider
spider
spiderObtain.py 5KB
app.py 601B
analyze
analyze.py 2KB
api
htmlAPI.py 379B
resource
xlwb_spider_xlwb_data.sql 12KB
hotword
hotword.py 2KB
.idea
dataSources.xml 917B
encodings.xml 258B
pojo
Content.py 209B
static
js
table.js 0B
form.js 1KB
lay-config.js 0B
jquery-confirm.min.js 0B
jquery-3.4.1.min.js 0B
api
init.json 839B
upload.json 135B
tableSelect.json 1KB
menus_bak.json 6KB
clear.json 55B
accepttable.json 6KB
table.json 2KB
menus.json 6KB
resource 0B
css
bootstrap-theme.min.css.map 25KB
bootstrap.min.css 118KB
bootstrap-theme.css.map 47KB
bootstrap-theme.css 26KB
public.css 1KB
bootstrap.css.map 380KB
layuimini.css 20KB
themes
default.css 4KB
bootstrap.css 143KB
bootstrap-theme.min.css 23KB
jquery-confirm.min.css 22KB
bootstrap.min.css.map 529KB
images
logo.png 74KB
home.png 146KB
pic.png 1.28MB
bg.jpg 26KB
favicon.ico 4KB
captcha.jpg 2KB
donate_qrcode.png 50KB
fonts
glyphicons-halflings-regular.svg 106KB
glyphicons-halflings-regular.ttf 44KB
glyphicons-halflings-regular.woff 23KB
glyphicons-halflings-regular.eot 20KB
glyphicons-halflings-regular.woff2 18KB
共 49 条
- 1
资源评论
码农落落
- 粉丝: 530
- 资源: 2304
上传资源 快速赚钱
- 我的内容管理 展开
- 我的资源 快来上传第一个资源
- 我的收益 登录查看自己的收益
- 我的积分 登录查看自己的积分
- 我的C币 登录后查看C币余额
- 我的收藏
- 我的下载
- 下载帮助
安全验证
文档复制为VIP权益,开通VIP直接复制
信息提交成功