#!/usr/bin/env python
# -*- coding: utf-8 -*-
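"""Scrape book listings from the six tag groups on https://book.douban.com through
a local Tor proxy, collect title, author, score, number of raters and publisher for
each book, sort every category by score and write the results to an .xls workbook."""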
import os
import random
import re
import time

import requests
import xlwt
from bs4 import BeautifulSoup
url0 = 'https://book.douban.com'  # base URL, prepended later to relative links
library, library_data, popular, popular_data, culture, culture_data = ([], [], [], [], [], [])
life, life_data, economy, economy_data, science, science_data = ([], [], [], [], [], [])
sheet_list = [library, popular, culture, life, economy, science]  # tag-URL lists, one per category
sheet_list_data = [library_data, popular_data, culture_data, life_data, economy_data, science_data]  # scraped rows, one list per category
num_list = 0  # index into sheet_list / sheet_list_data
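# A small pool of User-Agent strings; each request picks one at random so the
# traffic looks less like it comes from a single client.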
user_agent = ['Mozilla/5.0 (Windows NT 6.2; WOW64; rv:21.0) Gecko/20100101 Firefox/21.0',
              'Mozilla/5.0 (Android; Mobile; rv:14.0) Gecko/14.0 Firefox/14.0',
              'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.94 Safari/537.36',
              'Mozilla/5.0 (Linux; Android 4.0.4; Galaxy Nexus Build/IMM76B) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.133 Mobile Safari/535.19',
              'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:53.0) Gecko/20100101 Firefox/53.0']
first_sum = time.time()    # time of the last Tor circuit renewal
second_time = time.time()  # used to report how long each page took
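# All traffic goes through a local Tor SOCKS5 proxy on port 9050; the control
# port 9051 is used below to request a fresh circuit (new exit IP).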
proxies = {'http':'socks5://127.0.0.1:9050', 'https':'socks5://127.0.0.1:9050'}
def getSoup(url):  # fetch url through the proxy and return its BeautifulSoup object
    global first_sum
    res = ''
    while res == '':
        try:
            # Every 5 minutes, ask the Tor control port for a fresh circuit (new exit IP)
            if time.time() - first_sum >= 5 * 60:
                os.system("""(echo authenticate '"mypassword"'; echo signal newnym; echo quit) | nc localhost 9051""")
                time.sleep(2)
                first_sum = time.time()
            headers = {'user-agent': random.choice(user_agent), 'Connection': 'close'}
            res = requests.get(url, headers=headers, proxies=proxies)
            res.encoding = 'utf-8'
            soup = BeautifulSoup(res.text, 'html.parser')
        except requests.exceptions.ProxyError as e:
            print type(e)
            # The proxy failed: request a new Tor circuit, wait, then retry
            os.system("""(echo authenticate '"mypassword"'; echo signal newnym; echo quit) | nc localhost 9051""")
            time.sleep(2)
        except Exception:
            raise
    return soup
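# Note: an equivalent way to request a new Tor identity is the optional `stem`
# package (a sketch, not used by this script, which shells out to nc instead):
#   from stem import Signal
#   from stem.control import Controller
#   with Controller.from_port(port=9051) as controller:
#       controller.authenticate(password='mypassword')
#       controller.signal(Signal.NEWNYM)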
def getFinal(list_name):  # crawl every page of every tag URL and store the scraped rows
    global second_time
    for urls in list_name:
        count = 0
        url = url0 + urls  # full URL of this tag's first page
        while url:  # walk through the pages of this tag
            print 'Elapsed:', time.time() - second_time
            second_time = time.time()
            soups = getSoup(url)
            getData(soups)
            #time.sleep(random.uniform(1, 3))  # optionally rest 1-3 s after each page
            if count == 10:  # take a longer break every 10 pages
                time.sleep(random.uniform(3, 4))
                count = 0
            url_next = soups.select('.next')  # "next page" element of the current page
            if url_next == []:
                break
            else:
                url_next_n = url_next[0].select('a')
                if url_next_n == []:
                    break
                else:
                    url = url_next_n[0]['href']
                    url = url0 + url
            count += 1
def getData(soup):  # extract the needed fields from one result page
    global num_errno
    r = r'[0-9]'
    r_press = r'出版社'
    infos = soup.select('.info')
    for info in infos:
        try:
            num_people = ''
            bookname = info.select('a')[0].text.strip().split()[0]
            pub = info.select('.pub')[0].text.strip().split('/')
            author = pub[0]
            press = '未显示'
            for i in pub:
                if re.findall(r_press, i) != []:  # the segment containing "出版社" is the publisher
                    press = i
                    break
            if info.select('.star')[0].select('span')[0].text == '':
                # A rating exists: the second span holds the score, the third the rater count
                score = info.select('.star')[0].select('span')[1].text
                nums = re.findall(r, info.select('.star')[0].select('span')[2].text.strip())
                for num in nums:
                    num_people = num_people + str(num)
            else:
                score = '评价人数不足'
                num_people = score
            print 'Scraped one record'
            sheet_list_data[num_list].append((bookname, author, score, num_people, press))
        except Exception:
            print 'Error!'  # this entry did not parse
            num_errno += 1
            if raw_input('Press Enter (or y) to continue') in ['', 'y']:
                pass
def writeExcel():  # write the scraped data into an .xls workbook
    wb = xlwt.Workbook(encoding='utf-8')
    style = xlwt.XFStyle()  # the font below is set so Chinese text is written correctly
    font = xlwt.Font()
    font.name = 'SimSun'  # use the SimSun typeface
    style.font = font
    sheet_name = ['library', 'popular', 'culture', 'life', 'economy', 'science']
    sheet = ['sheet1', 'sheet2', 'sheet3', 'sheet4', 'sheet5', 'sheet6']
    sheet_row_name = ['书名', '作者', '评分', '评分人数', '出版社']  # title, author, score, raters, publisher
    for i in range(6):  # add the six worksheets
        sheet[i] = wb.add_sheet(sheet_name[i])
    for i in sheet:  # write the header row of every worksheet
        for i_col in range(5):
            i.write(0, i_col, sheet_row_name[i_col], style)
    for i in range(6):  # write each category's rows into the matching worksheet
        row = 1
        for record in sheet_list_data[i]:
            for i_col in range(5):
                sheet[i].write(row, i_col, record[i_col], style)
            row += 1
    wb.save('myfile.xls')
def f(x):  # sort key: rank by score, treating "not enough raters" as 0
    if x[2] == '评价人数不足':
        return 0
    else:
        return float(x[2])  # the score is stored as a string, so convert before comparing
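# Main flow: collect the tag URLs, crawl each category, sort by score, write to Excel.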
num_errno = 0  # counts entries that failed to parse
requests.adapters.DEFAULT_RETRIES = 3
url = 'https://book.douban.com/tag/?view=type&icn=index-sorttags-all'
soup = getSoup(url)
for labels in soup.select('.tagCol'):  # six tag groups; each link is one tag's relative URL
    for label in labels.select('a'):
        sheet_list[num_list].append(label['href'])
    num_list += 1
num_list = 0
for i in sheet_list:  # crawl each category and fill the matching data list
    getFinal(i)
    print 'Error count:', num_errno
    time.sleep(5)
    sheet_list_data[num_list] = sorted(sheet_list_data[num_list], key=f, reverse=True)  # highest score first
    num_list += 1
writeExcel()  # write everything to the Excel workbook