# 爬取豆瓣书录
import requests
import bs4
from bs4 import BeautifulSoup
import sqlite3
import jieba
from jieba.analyse import extract_tags
import imageio
from wordcloud import WordCloud
# 豆瓣书店-新书 网址:https://market.douban.com/book/?utm_campaign=book_nav_freyr&utm_source=douban&utm_medium=pc_web&page=1&page_num=18&
print("连接数据库......")
conn = sqlite3.connect("book.db")
# 使用 execute() 方法执行 SQL,如果表存在则删除
conn.execute("DROP TABLE IF EXISTS book")
# 在该数据库下创建表
conn.execute('''CREATE TABLE book(
id INTEGER PRIMARY KEY NOT NULL,
image_url str null,
wordcloudpic str null,
bookname str null,
subtitle str null,
author str null,
translator str null,
press str null,
pub_time str null,
num_of_page str null,
now_price str null,
price str null,
bind str null,
isbn str null,
book_url str null,
rating str null,
eva_num str null,
book_intro str null
)''')
print("Table created successfully!")
print("数据库连接完成!")
# 设置请求头
headers = {}
headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) ' \
'Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0'
headers['Accept'] = '*/*'
headers['Host'] = 'market.douban.com'
headers['Connection'] = 'keep-alive'
headers['Accept-Encoding'] = 'gzip, deflate, sdch'
headers['Accept-Language'] = 'zh-CN,zh;q=0.8'
def main():
    """Scrape three pages of the Douban book shop and insert every book's
    info dict into the ``book`` table of the module-level ``conn`` database.

    Relies on the globals ``conn`` (open sqlite3 connection), ``bookList``
    and ``SqlExec`` defined elsewhere in this file.
    """
    url = "https://market.douban.com/book/?utm_campaign=book_nav_freyr&utm_source=douban&utm_medium=pc_web&page={}" \
          "&page_num=18&"
    # Columns we are willing to persist; any other key in a book dict is
    # ignored.  Hoisted out of the loops because it never changes.
    mvh = ["id", "wordcloudpic", "image_url", "bookname", "subtitle", "author", "translator", "press", "pub_time",
           "num_of_page", "now_price", "price", "bind", "isbn", "book_url", "rating", "eva_num", "book_intro"]
    book_totals = []
    # The shop listing spans three pages; format() substitutes the page number.
    for page_no in range(1, 4):
        book_result = bookList(url.format(page_no))
        book_totals.append(book_result.copy())
    for book_total in book_totals:
        for each_book in book_total:
            # Build the INSERT statement from whichever known fields this
            # particular book actually has.
            filed_sql = []
            value_sql = []
            for item in mvh:
                if item in each_book:
                    filed_sql.append(item)
                    # Escape embedded single quotes by doubling them (the SQL
                    # standard escape) so titles/intros containing apostrophes
                    # neither break the statement nor allow SQL injection.
                    value_sql.append(str(each_book[item]).replace("'", "''"))
            sql = "insert into book(" + ",".join(filed_sql) + ")"
            sql += " values ('" + "','".join(value_sql) + "')"
            # SqlExec returns a falsy value on failure; echo the statement so
            # a bad row can be diagnosed.
            if SqlExec(conn, sql):
                print("数据插入完毕!")
            else:
                print(sql)
# 数据库执行函数
def SqlExec(conn, sql, params=()):
    """Execute *sql* on *conn* and commit.

    Args:
        conn: an open sqlite3 connection.
        sql: the SQL statement to run.
        params: optional sequence of bind values for ``?`` placeholders
            (new, backward-compatible parameter enabling parameterized
            queries instead of string-built SQL).

    Returns:
        The cursor on success, ``None`` on failure.  The original version
        returned the (truthy) cursor even after ``execute`` raised — and
        crashed with NameError if ``conn.cursor()`` itself failed — so
        callers could never detect an error.
    """
    try:
        cur = conn.cursor()
        cur.execute(sql, params)
        conn.commit()
        return cur
    except Exception as e:
        # Best-effort logging, matching the script's print-based style.
        print(e)
        return None
# 该函数式用来返回一个列表存放含有书籍信息的字典
def bookList(url):
    """Fetch one shop-listing page and return a list of dicts, one per book,
    each produced by ``booktextscore`` from that book's detail page.

    Uses the module-level ``headers`` for the listing request.
    """
    page = requests.get(url, headers=headers)
    # html.parser is the stdlib parser — no extra dependency needed.
    soup = BeautifulSoup(page.text, "html.parser")
    # Collect every book's detail-page URL from the listing.
    detail_urls = [entry.find('a').get('href')
                   for entry in soup.find_all('li', attrs={'class': 'book-item'})]
    collected = []
    for link in detail_urls:
        collected.append(booktextscore(link).copy())
    return collected
# 该函数式用来爬取书籍的名字,评分,评价人数及书的简单介绍
def booktextscore(url):
booktexturl = requests.get(url)
soup = BeautifulSoup(booktexturl.text, 'html.parser')
result = {} # 创建一个字典存储相关书籍信息
# 图片链接
image_url = soup.find('div', attrs={'class': 'book-face-img'}).find('img').get('src')
result["image_url"] = image_url
# print("图片链接:", image_url)
# 现价
now_price = soup.select('.book-price span')[0].text
result["now_price"] = now_price
# print("现价:", now_price)
infor_key = []
infor_value = []
value_new = []
information ={}
# 键
information2 = soup.find("div", attrs={'class': 'text-right'})
if isinstance(information2, bs4.element.Tag):
informa2 = information2.find_all("dt")
for infor2 in informa2:
infor_key.append(infor2.text)
# print(infor_key)
# 值
information1 = soup.find("div", attrs={'class': 'text-right'})
if isinstance(information2, bs4.element.Tag):
informa1 = information2.find_all("em")
for infor1 in informa1:
infor_value.append(infor1.text)
# print(infor_value)
# 书名
bookname = soup.select('.book-breintro h3')[0].text
# bookname = infor_value[0].split(":", 1)[1]
bookname2 = '《' + bookname + '》'
result['bookname'] = bookname2
# print("书籍名称:", bookname2)
# 获取图片
photo_path = "photo\\"
bookimage = soup.find('div', attrs={'class': 'book-face-img'}).find('img').get('src')
pic_name = str(bookname) + '.jpg'
book_img = requests.get(bookimage)
last_path = photo_path + pic_name
with open(last_path, "wb") as fp:
fp.write(book_img.content)
# 去掉第一个元素
value_new = infor_value[1: len(infor_value)]
# print(value_new)
# 将两个数组变成字典
information = dict(zip(infor_key, value_new))
# print(information)
# print(information.keys())
if '副标题:' in information.keys():
result['subtitle'] = information['副标题:']
else:
result['subtitle'] = " "
# print("副标题:", information['副标题:'])
if '作者:' in information.keys():
result['author'] = information['作者:']
else:
result['author'] = " "
# print("作者:", information['作者:'])
if '译者:' in information.keys():
result['translator'] = information['译者:']
else:
result['translator'] = " "
# print("译者:", information['译者:'])
if '出版社:' in information.keys():
result['press'] = information['出版社:']
else:
result['press'] = " "
# print("出版社:", information['出版社:'])
if '出版时间:' in information.keys():
result['pub_time'] = information['出版时间:']
else:
result['pub_time'] = " "
# print("出版时间:", information['出版时间:'])
if '页数:' in information.keys():
result['num_of_page'] = information['页数:']
else:
result['num_of_page'] = " "
# print("页数:", informati