#!/usr/bin/python
#encoding=utf-8
import socket
import time
import urllib2
import pycurl
import logging
import threading
# Configure the root logger: records at DEBUG level and above are written to
# rsgoo.log, which is truncated on every start because of filemode='w'.
logging.basicConfig(
    filename='rsgoo.log',
    filemode='w',
    level=logging.DEBUG,
    datefmt='%a, %d %b %Y %H:%M:%S',
    format='%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s',
)
def logd(text):
    """Print *text* tagged with the current thread name and a local timestamp.

    The record is written to standard output as the tuple
    ``(thread name, "YYYY-MM-DD HH:MM:SS", text)``; nothing is returned.
    """
    stamp = time.strftime("%Y-%m-%d %H:%M:%S")
    record = (threading.current_thread().name, stamp, text)
    print(record)
def logi(text):
    """Print *text* together with the calling thread's name and the local time.

    Output goes to standard output as a three-item tuple:
    thread name, timestamp (``%Y-%m-%d %H:%M:%S``), and *text*.
    """
    caller = threading.current_thread()
    now = time.localtime()
    print((caller.name, time.strftime("%Y-%m-%d %H:%M:%S", now), text))
def logw(text):
    """Print a ``(thread name, timestamp, text)`` tuple to standard output.

    The timestamp is the local time formatted as ``%Y-%m-%d %H:%M:%S``.
    """
    fields = [threading.current_thread().name]
    fields.append(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
    fields.append(text)
    print(tuple(fields))
class CALLBACK:
    """In-memory sink that accumulates written data with line breaks removed.

    ``pull`` registers ``write`` as the pycurl write callback and reads the
    collected text back through ``getvalue``.
    """

    def __init__(self):
        # Everything written so far, already stripped of CR and LF.
        self.data = ""

    def write(self, data):
        """Append *data* to the buffer after dropping every CR and LF."""
        for line_break in ("\r", "\n"):
            data = data.replace(line_break, "")
        self.data = self.data + data

    def getvalue(self):
        """Return the text accumulated so far."""
        return self.data

    def close(self):
        """Do nothing; the sink holds no resource that needs releasing."""
def pull(url, method='get', timeout=10):
    """Fetch *url* with pycurl and return the response body.

    The body is collected through ``CALLBACK``, so every CR and LF is removed
    from the returned text. Redirects are followed (at most 5) and an
    MSIE 6 User-Agent header is sent. The call blocks until the transfer ends.

    Args:
        url: Address to request.
        method: Accepted for interface compatibility; currently not used.
        timeout: Accepted for interface compatibility; currently not used
            (the connection timeout below is a fixed value).

    Returns:
        The accumulated response body.

    Raises:
        Any error raised by pycurl (for example from ``perform``) propagates
        to the caller; the curl handle is closed first.
    """
    logd(("pull", url))
    sink = CALLBACK()
    c = pycurl.Curl()
    try:
        # Target address.
        c.setopt(pycurl.URL, url)
        # Write callback that receives the body chunks.
        c.setopt(pycurl.WRITEFUNCTION, sink.write)
        c.setopt(pycurl.FOLLOWLOCATION, 1)
        # Cap the number of redirects to guard against redirect traps.
        c.setopt(pycurl.MAXREDIRS, 5)
        # Connection timeout.
        # NOTE(review): libcurl documents CONNECTTIMEOUT in seconds, so 1000
        # is very long, and the `timeout` argument is not applied here -
        # confirm the intent before changing it.
        c.setopt(pycurl.CONNECTTIMEOUT, 1000)
        # Impersonate a browser.
        c.setopt(pycurl.USERAGENT, "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322)")
        # Run the transfer; blocks until it has finished.
        c.perform()
        return sink.getvalue()
    finally:
        # Release the handle even when setopt/perform raised.
        sink.close()
        c.close()
def verifyPorxy(url, domain, port, timeout=2, valid_flag="", invalid_flag="", debug=False):
    """Fetch *url* through an HTTP proxy and report how long the fetch took.

    Args:
        url: Page to request through the proxy.
        domain: Proxy host.
        port: Proxy port.
        timeout: Timeout in seconds passed to the request.
        valid_flag: Marker text for a good response. As in the original
            code, it does not influence the result (see the note below).
        invalid_flag: Marker text for a bad response; when it is non-empty
            and occurs anywhere in the body, the proxy is rejected.
        debug: Not used.

    Returns:
        Elapsed wall-clock time in seconds for a successful fetch, or -1 when
        the request fails or *invalid_flag* is found in the response.
    """
    proxy_server = r'http://%s:%s' % (domain, port)
    # Step 1: enable cookie handling.
    cookies = urllib2.HTTPCookieProcessor()
    # Step 2: route http requests through the proxy.
    proxy_hander = urllib2.ProxyHandler({"http": proxy_server})
    # Step 3: assemble the opener.
    try:
        opener = urllib2.build_opener(cookies, proxy_hander)
    except urllib2.URLError:
        return -1
    opener.addheaders = [('User-Agent','Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1')]
    # Wall-clock time: time.clock() is processor time on Unix and would not
    # include the time spent waiting on the network.
    started = time.time()
    # Still installed globally so code relying on that side effect keeps
    # working; the request below does not depend on it.
    urllib2.install_opener(opener)
    try:
        # Use this call's own opener rather than the process-wide one, which
        # another thread may have replaced in the meantime.
        req = opener.open(url, timeout=timeout)
        try:
            result = req.read()
        finally:
            req.close()
        elapsed = time.time() - started
        # Membership test also catches a marker at offset 0, which the
        # former `find(...) > 0` missed.
        if invalid_flag and invalid_flag in result:
            return -1
        # NOTE(review): the original returned the delay whether or not
        # valid_flag was present in the body; that is preserved here.
        # Confirm whether a missing valid_flag should yield -1 instead.
        return elapsed
    except Exception as e:
        print("Excetion in verify %s:%s failed.{%s}" % (domain, port, e))
        return -1
def verifyPorxyBySocket(domain, port, timeout=2, debug=False):
    """Check whether a TCP connection to ``domain:port`` can be opened.

    Args:
        domain: Host to connect to.
        port: TCP port to connect to.
        timeout: Socket timeout in seconds.
        debug: Not used.

    Returns:
        ``True`` when the connection succeeds, ``-1`` when it fails.
        Note that ``-1`` is truthy, so callers must compare the result
        explicitly (``is True`` / ``== -1``) rather than test it with ``if``.
    """
    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    try:
        sock.settimeout(timeout)
        try:
            sock.connect((domain, port))
            return True
        except Exception as e:
            print("Excetion in verify %s:%s failed.{%s}" % (domain, port, e))
            return -1
    finally:
        # Close on every path; previously a failed connect leaked the socket.
        sock.close()
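# NOTE: the lines below appear to be residue from the web page this file was
# downloaded from, not code; they are kept as comments so the module parses.
# Scraping job-posting statistics from the 51JOB site with Python and displaying them with matplotlib
# Points required: 34, 95 views
# 2018-11-15
# 13:06:15
# Uploaded
# Comments: 3
# Favorite | 10KB RAR | Report
# 白錵錵 (uploader name)
# - Followers: 33
# - Resources: 13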