#coding: utf-8
import os.path
import re
from urllib import request
from urllib.parse import urljoin

from htmlparser import UrlParser
# Pre-compiled regular expressions. Raw strings (r'...') avoid the
# invalid-escape-sequence warnings that '\w' triggers on modern Python.
re_word = re.compile(r'(\w+)')              # one run of word characters
re_dataname = re.compile(r'(\w+)\.(\w+)')   # 'name.ext' file-name pattern
class DataBean:
    """Holds one downloaded response: raw bytes, decoded text and metadata."""
    # Class-level defaults double as documentation of the bean's fields.
    url = ''
    raw_data = None      # response body as bytes
    data = None          # decoded text (text/* responses only)
    encoding = None      # charset used to decode raw_data
    content_type = None  # e.g. 'text/html', 'image/jpeg', 'text/css'
    headers = None       # full response header list

    def reset(self):
        """Clear every field back to its default so the bean can be reused."""
        self.url = ''
        self.raw_data = None
        self.data = None
        self.encoding = None
        self.content_type = None
        self.headers = None

    def set(self, resp):
        """Populate the bean from an http.client.HTTPResponse-like object.

        Text responses are decoded using the declared charset (falling back
        to UTF-8); binary responses keep only the raw bytes.
        """
        self.reset()
        self.headers = resp.getheaders()
        # getheader() returns None when the server omits Content-Type;
        # the original crashed on .strip() in that case.
        ctype = (resp.getheader('Content-Type') or '').strip()
        self.raw_data = resp.read()
        if 'text/' in ctype:
            parts = ctype.split(';')
            self.content_type = parts[0].strip()
            # Scan every parameter (not just the first) for a charset;
            # default to UTF-8 when none is declared.
            self.encoding = 'utf-8'
            for param in parts[1:]:
                if 'charset' in param:
                    self.encoding = param.replace('charset=', '').strip()
                    break
            self.data = self.raw_data.decode(self.encoding)
        else:
            self.content_type = ctype
class UrlHandler:
    """Maps site URLs to local file paths/URLs and rewrites page links."""

    # Patterns used when converting URL file names to local file names
    # (kept on the class so it is self-contained).
    _RE_WORD = re.compile(r'(\w+)')
    _RE_DATANAME = re.compile(r'(\w+)\.(\w+)')

    def __init__(self, downpath, siteurl, listurl):
        """downpath: local root directory; siteurl: absolute site root URL
        (expected to end with '/'); listurl: shared download queue."""
        self.downpath = downpath
        self.siteurl = siteurl
        self.listurl = listurl
        self.mkdirs(siteurl)

    def set(self, databean):
        """Start processing `databean`: remember its URL's parent directory
        and move the decoded text aside so rewritten HTML can accumulate."""
        self.databean = databean
        slash = databean.url.rfind('/')
        self.parenturl = databean.url[:slash + 1]
        self.raw_data = self.databean.data
        self.databean.data = ''
        self.prepos = (0, 0)  # (start, end) of the previously rewritten link

    def pfx2localpfx(self, postfix):
        """Convert a URL file name (e.g. 'page.php?id=3') to a local one
        ('pageid3.html'). Non-page resources pass through unchanged."""
        m = self._RE_DATANAME.search(postfix)
        if not m:
            return postfix
        dataname, datatype = m.group(1), m.group(2)
        localpostfix = dataname
        if datatype in ('php', 'htm', 'html'):
            # Fold any query-string words into the name so distinct queries
            # map to distinct local files, and normalise the suffix to .html.
            for word in self._RE_WORD.findall(postfix, m.end()):
                localpostfix += word
            datatype = 'html'
        return localpostfix + '.' + datatype

    def url2filepath(self, url):
        """Convert an absolute URL to an absolute local file path under
        `downpath`. Safe to call from outside this class."""
        if url.endswith('/'):
            url += 'index.html'
        if '://' in url:
            # Drop the scheme ('http://', 'https://'); keep host + path.
            url = url[url.find('/') + 2:]
        dires = url.split('/')
        dires[-1] = self.pfx2localpfx(dires[-1])
        return os.path.join(self.downpath, *dires)

    def url2localurl(self, url):
        """Convert a link found in the current page into the relative local
        URL that is written into the saved HTML."""
        if '://' in url:
            if 'google.' in url:
                return '#'  # external Google links are not mirrored
            elif url.startswith(self.siteurl):
                url = url.replace(self.siteurl, '')
                # Climb back up from the current page's directory depth.
                cnt = self.parenturl.count('/') - 3
                for _ in range(cnt):
                    url = '../' + url
        if url.startswith('/'):
            url = '.' + url
        if url.endswith('/'):
            return url
        pos = url.rfind('/')
        return url[:pos + 1] + self.pfx2localpfx(url[pos + 1:])

    def mkdirs(self, url):
        """Create the local directory that will hold the file for `url`."""
        path = self.url2filepath(url)       # path of the file itself
        path = os.path.dirname(path)        # its parent directory
        # exist_ok avoids the check-then-create race of the original.
        os.makedirs(path, exist_ok=True)

    def absurl(self, url):
        """Resolve a (possibly relative) link against the current page's URL.

        The original hand-rolled version used str.lstrip('../'), which strips
        a *character set* (any leading '.' or '/'), not a prefix, so '../'
        links were mangled and never actually went up a directory level.
        urllib.parse.urljoin implements RFC 3986 resolution correctly.
        """
        if '://' in url:
            return url
        return urljoin(self.parenturl, url)

    # `pos` is a (start, end) tuple locating the link text inside the raw
    # page, or None/falsy meaning the page has been completely handled.
    def handle(self, url, pos):
        if not pos:
            # Flush the tail of the page after the last rewritten link.
            self.databean.data += self.raw_data[self.prepos[1]:]
            return
        if ('://' in url and not url.startswith(self.siteurl)) \
                or url == 'javascript:;':
            # Off-site or non-navigable link: leave it untouched.
            return
        localurl = self.url2localurl(url)
        url = self.absurl(url)
        # Splice in the text since the previous link, then the local URL.
        self.databean.data += self.raw_data[self.prepos[1]:pos[0]] + localurl
        self.prepos = pos
        if url not in self.listurl:
            self.mkdirs(url)
            self.listurl.append(url)  # queue the URL for download
class DownLoader:
    """Crawls a site breadth-first and mirrors every on-site URL locally."""

    def __init__(self, downpath, siteurl):
        self.siteurl = siteurl
        # URLs queued for download; grows while pages are parsed.
        self.listurl = [siteurl]
        # Extracts href/src URLs from downloaded HTML pages.
        self.urlparser = UrlParser()
        # Converts URLs to local paths (and creates the directories).
        self.urlhandler = UrlHandler(downpath, siteurl, self.listurl)
        self.databean = DataBean()

    def getdata(self, i):
        """Download self.listurl[i] into self.databean.

        Returns True on success, False on any error, so the caller can skip
        parsing/saving the previous page's stale data (the original used a
        bare `except:` and then silently re-processed the old bean).
        """
        url = self.listurl[i]
        try:
            resp = request.urlopen(url)
            self.databean.set(resp)
            self.databean.url = url
            return True
        except Exception as exc:
            print('*' * 10, 'error while fetching', url, ':', exc)
            return False

    def parsedata(self):
        """Parse the downloaded HTML: rewrite its links and queue new URLs."""
        data = self.databean.data
        self.urlhandler.set(self.databean)
        self.urlparser.reset()
        self.urlparser.feed(data, self.urlhandler)

    def storge(self):
        """Save the current bean to disk: text mode for HTML pages, binary
        mode for everything else. (Name kept as 'storge' [sic] so existing
        callers keep working.)"""
        filepath = self.urlhandler.url2filepath(self.databean.url)
        if 'html' in self.databean.content_type:
            with open(filepath, 'w', encoding=self.databean.encoding) as fd:
                fd.write(self.databean.data)
        else:
            with open(filepath, 'wb') as fd:
                fd.write(self.databean.raw_data)
        print(filepath)

    def gohead(self):
        """Drain the download queue; it grows while being processed."""
        i = 0
        while i < len(self.listurl):
            if self.getdata(i):
                if self.databean.content_type == 'text/html':
                    self.parsedata()
                self.storge()
            i += 1
if __name__ == '__main__':
    # Script entry point: mirror the target site into the local directory.
    save_root = 'e:\\'
    target_site = 'http://www.woookliu.com/'
    DownLoader(save_root, target_site).gohead()
# (removed stray "评论10" ["10 comments"] text — a blog-page scrape artifact
# that would raise NameError if executed as code)