# -*- coding: utf-8 -*-
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
from selenium.webdriver.common.proxy import Proxy, ProxyType
import time
from contextlib import closing
import requests
import xlwt
import xlrd
from xlutils.copy import copy
import urllib
import urllib2
import os
from common import *
from gevent import monkey;monkey.patch_all()
import gevent
import re
import shutil
import random
import sys
import urllib
import hashlib
import logging
def access(url, date):
    """Open *url* in headless Chrome (via the corporate proxy), scroll to the
    bottom repeatedly so lazily-loaded ads render, and return the ad
    containers whose date label equals *date*.

    url  : str -- page to crawl
    date : str -- date text to filter by (as rendered in the "_7jwu" span)

    Returns a (possibly empty) list of bs4 Tag objects.
    """
    prox = Proxy()
    prox.proxy_type = ProxyType.MANUAL
    # Remember to configure the proxy before running.
    prox.http_proxy = "http://proxy.****.com:10086"
    prox.ssl_proxy = "http://proxy.****.com:10086"
    capabilities = webdriver.DesiredCapabilities.CHROME
    prox.add_to_capabilities(capabilities)
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')
    driver = webdriver.Chrome(chrome_options=options, desired_capabilities=capabilities)
    try:
        driver.get(url)
        # Scroll far past the fold 40 times; each scroll triggers another
        # batch of lazily loaded ads, 4s gives the page time to fetch them.
        js = "var q=document.documentElement.scrollTop=1000000"
        for i in range(0, 40):
            driver.execute_script(js)
            time.sleep(4)
        html = driver.page_source
    finally:
        # FIX: the original never quit the driver, leaking one headless
        # Chrome process per call.
        driver.quit()
    content = BeautifulSoup(html, 'html.parser')
    div = content.find(name="div", attrs={"class": "_7jjx"})
    if div is None:
        # FIX: guard against a missing container (layout change / blocked
        # page); the original crashed with AttributeError on find_all.
        div_list = []
    else:
        div_list = div.find_all(name="div", attrs={"class": "_7owt"})
    # NOTE(review): all_date is a module-level global (presumably from
    # ``common``) -- confirm it is set before this runs.
    print("%s 总广告数是: %d" % (all_date, len(div_list)))
    filter_list = []
    for item in div_list:
        advertise_date = item.find(name="div", attrs={"class": "_7jwu"}).span.text
        if advertise_date == date:
            filter_list.append(item)
    print("%s 广告数是 %d" % (date, len(filter_list)))
    return filter_list
def download_file(div,path,id):
    """Stream the <video> of one ad container to <path><id>-.mp4.

    div  : bs4.Tag -- one ad container (from access())
    path : str     -- destination prefix; the file name is appended directly,
                     so it presumably ends with "/" -- TODO confirm
    id   : str     -- sequence id used in the file name (shadows builtin id)

    No-op when the container holds no video. Downloads in 10 KB chunks.
    """
    video=div.find(name="video")
    path=path+id+"-.mp4"
    if video != None:
        try:
            # A poster jpg for this ad was saved earlier; remove it now that
            # the ad is known to be a video. mount_point/keyword/all_date are
            # module-level globals (presumably from ``common`` -- confirm).
            print "start rm {}-.img".format(id)
            os.remove("{}/{}/{}/{}-.jpg".format(mount_point,keyword,all_date,id))
        except OSError as e:
            # Best effort: the jpg may simply not exist.
            print e
        video_src=video.get("src")
        # closing() guarantees the streamed HTTP connection is released.
        with closing(requests.get(video_src, stream=True)) as r:
            chunk_size = 1024*10
            # Total size in bytes; only consumed by the commented-out
            # ProgressData reporter below.
            content_size = int(r.headers['content-length'])
            print '下载开始'
            with open(path, "wb") as f:
                # p = ProgressData(size = content_size, unit='Kb', block=chunk_size)
                for chunk in r.iter_content(chunk_size=chunk_size):
                    f.write(chunk)
                    # p.output()
            print "下载完成"
def convert(str):
    """Strip every non-digit character and parse the remainder as an int.

    e.g. convert("2,019") -> 2019. Raises ValueError when the input contains
    no digit at all (int("") is invalid), same as the original.

    NOTE: the parameter shadows the builtin ``str``; the name is kept so
    keyword callers are not broken.
    """
    # FIX: raw string avoids the invalid "\d" escape (an error in modern
    # Python); \d+ matches only the digit runs instead of producing the
    # empty matches that "\d*" yields between every character.
    return int("".join(re.findall(r"\d+", str)))
class ProgressData(object):
    """Crude console progress reporter for chunked downloads.

    block     : bytes per chunk (stored internally in KB)
    size      : total bytes (stored internally in KB)
    unit      : label appended to the printed numbers (e.g. 'Kb')
    file_name : optional prefix for every progress line
    """

    def __init__(self, block, size, unit, file_name=''):
        self.file_name = file_name
        self.block = block / 1000.0   # chunk size in KB
        self.size = size / 1000.0     # total size in KB
        self.unit = unit
        self.count = 0                # chunks reported so far
        self.start = time.time()

    def output(self):
        """Report one finished chunk: either the completion banner or one
        progress/speed line plus a shrinking slash bar."""
        self.end = time.time()
        self.count += 1
        elapsed = self.end - self.start
        # KB/s for the chunk just finished; guard against a zero interval.
        speed = self.block / elapsed if elapsed > 0 else 0
        self.start = time.time()
        loaded = self.count * self.block
        progress = round(loaded / self.size, 4)
        if loaded >= self.size:
            print(u'%s下载完成\r\n' % self.file_name)
        else:
            # FIX: in the original the else branch was "pass" and these two
            # prints ran unconditionally, so the progress line was emitted
            # even after the completion banner. They belong here.
            print(u'{0}下载进度{1:.2f}{2}/{3:.2f}{4} 下载速度{5:.2%} {6:.2f}{7}/s'.format(
                self.file_name, loaded, self.unit,
                self.size, self.unit, progress, speed, self.unit))
            print('%50s' % ('/' * int((1 - progress) * 50)))
def imageDownload(div,id):
    """Save the preview image of one ad container as /facebook/all/<id>.jpg.

    For a video ad the ``poster`` attribute already holds the thumbnail URL;
    for a static ad the <img> tag is looked up (two known class variants).

    div : bs4.Tag -- one ad container
    id  : str     -- output file name stem (shadows the builtin ``id``)
    """
    video=div.find(name="video")
    if video != None:
        # Video ads: poster is the thumbnail URL itself (a plain string).
        imgurl=video.get("poster")
    else:
        imgurl=div.find(name="img",attrs={"class":"_7jys img"})
        if imgurl==None:
            # Fallback: some ads use a second class combination.
            imgurl=div.find(name="img",attrs={"class":"_7jys _7jyt img"})
        # NOTE(review): reconstructed indentation puts this .get("src") in
        # the <img> branch only -- a poster URL would have no .get();
        # confirm against the original file's indentation.
        imgurl=imgurl.get("src")
    # Python 2 urllib API; destination is hard-coded to /facebook/all,
    # not derived from any keyword -- TODO confirm intended.
    urllib.urlretrieve(imgurl, "/%s/%s/%s.jpg"%("facebook","all",id))
def textInfo(div_list,keyword):
    """Build one [date, keyword, link, theme] record per ad container.

    div_list : list of bs4.Tag -- ad containers for one keyword
    keyword  : str -- utf-8 byte string; decoded before being stored

    The running id is seeded from the first entry of the module-level
    ``path`` directory; files under /facebook/patch3 are matched against
    that id to build the "link" column. /facebook/patch3 is emptied and
    recreated when extraction finishes.
    Returns the list of records (for exportExecl / occurence_*).
    """
    # Seed the sequence id from the first directory entry -- assumes the
    # entries under ``path`` (module-level global) are numeric names and the
    # directory is non-empty; TODO confirm.
    text_id=os.listdir(path)[0]
    text_id=int(text_id)
    text_list=[]
    for div in div_list:
        dlist=[]
        date=div.find(name="div",attrs={"class":"_7jwu"}).span.text
        theme=div.find(name="div",attrs={"class":"_7jyr"})
        theme=theme.find(name="div",attrs={"class":"_4ik4 _4ik5"}).text
        for root,dirs,files in os.walk("/facebook/patch3"):
            # print "/facebook/patch3:{}".format(files)
            if files==[]:
                # No downloads yet: link is just "<id>-<keyword>".
                link=str(text_id)+"-"+keyword
            for file in files:
                if file.split(".")[1] == str(text_id):
                    # A file whose extension part equals the current id:
                    # use its stem for the link instead.
                    link="{}-{}".format(file.split(".")[0],keyword)
                    break
            else:
                # for/else: no matching file found, fall back to default.
                # NOTE(review): reconstructed as a for/else; aligning the
                # else with the inner if yields the same final link value.
                link=str(text_id)+"-"+keyword
        dlist.append(date)
        dlist.append(keyword.decode("utf-8"))
        dlist.append(link.decode("utf-8"))
        dlist.append(theme)
        text_list.append(dlist)
        text_id=text_id+1
    print "文本信息提取完成 删除/facebook/patch3"
    shutil.rmtree('/facebook/patch3')
    os.mkdir('/facebook/patch3')
    return text_list
def occurence_textcontent(row, div_list, fileName, text_list):
    """Aggregate per-theme ad counts into sheet 1 of *fileName*.

    row       : int -- first free row for brand-new themes
    div_list  : unused here; kept for interface compatibility
    fileName  : str -- .xls workbook path (xlrd + xlutils, formatting kept)
    text_list : records from textInfo(); index 3 is the theme text

    Themes already present in column 1 get their count (column 2) bumped in
    place; new themes are appended starting at *row*.
    """
    # Tally occurrences of each theme.
    counts = {}
    for text in text_list:
        theme = text[3]
        counts[theme] = counts.get(theme, 0) + 1
    readbook = xlrd.open_workbook(fileName, formatting_info=True)
    rdata_sheet = readbook.sheets()[1]
    content_list = rdata_sheet.col_values(1)
    # FIX: the original built one copy() of the workbook and immediately
    # overwrote it with a second copy(); a single copy suffices.
    workbook = copy(readbook)
    wdata_sheet = workbook.get_sheet(1)
    for key, value in counts.items():
        if key in content_list:
            # Existing theme: add the new count to the stored one.
            id = content_list.index(key)
            num = rdata_sheet.cell(id, 2).value
            wdata_sheet.write(id, 2, value + num)
        else:
            # New theme row. NOTE(review): ``keyword`` is a module-level
            # global, not a parameter -- confirm it is set by the caller.
            wdata_sheet.write(row, 0, keyword.decode("utf-8"))
            wdata_sheet.write(row, 1, key)
            wdata_sheet.write(row, 2, value)
            row = row + 1
    workbook.save(fileName)
def occurence_liblary(row, div_list, fileName, text_list):
    """Aggregate per-link ad counts into sheet 2 of *fileName*.

    row       : int -- first free row for brand-new links
    div_list  : unused here; kept for interface compatibility
    fileName  : str -- .xls workbook path (xlrd + xlutils, formatting kept)
    text_list : records from textInfo(); index 2 is the library link

    Links already present in column 1 get their count (column 2) bumped in
    place; new links are appended starting at *row*.
    """
    # Tally occurrences of each link.
    counts = {}
    for text in text_list:
        link = text[2]
        counts[link] = counts.get(link, 0) + 1
    readbook = xlrd.open_workbook(fileName, formatting_info=True)
    rdata_sheet = readbook.sheets()[2]
    content_list = rdata_sheet.col_values(1)
    # FIX: the original built one copy() of the workbook and immediately
    # overwrote it with a second copy(); a single copy suffices.
    workbook = copy(readbook)
    wdata_sheet = workbook.get_sheet(2)
    for key, value in counts.items():
        if key in content_list:
            # Existing link: add the new count to the stored one.
            id = content_list.index(key)
            num = rdata_sheet.cell(id, 2).value
            wdata_sheet.write(id, 2, value + num)
        else:
            # New link row. NOTE(review): ``keyword`` is a module-level
            # global, not a parameter -- confirm it is set by the caller.
            wdata_sheet.write(row, 0, keyword.decode("utf-8"))
            wdata_sheet.write(row, 1, key)
            wdata_sheet.write(row, 2, value)
            row = row + 1
    workbook.save(fileName)
def exportExecl(row, allInfoList, fileName):
    """Append the 4-column records of *allInfoList* to sheet 0 of *fileName*,
    one record per row starting at *row* (xlrd read + xlutils copy, so the
    existing formatting is preserved)."""
    print("excel 第{}行插入数据".format(row))
    source = xlrd.open_workbook(fileName, formatting_info=True)
    target = copy(source)
    sheet = target.get_sheet(0)
    # Each record occupies one row; offset tracks the distance from *row*.
    for offset, record in enumerate(allInfoList):
        for column in range(4):
            sheet.write(row + offset, column, record[column])
    target.save(fileName)
def createEx