# -*- coding: utf-8 -*-
import re
import urlparse
import os
import lxml.html
class Scraper:
    """Scrape HTML pages: harvest in-scope links and save article text to disk.

    Parameters
    ----------
    seed_url : str
        Base URL used to resolve relative links found in pages.
    link_regx : str
        Regex (one capture group) that extracts candidate link hrefs from raw HTML.
    filter_link_regx : str
        Regex a link must match (from its start) to be kept.
    """

    def __init__(self, seed_url, link_regx, filter_link_regx):
        # Compile both patterns once — they run on every scraped page.
        self.link_regx = re.compile(link_regx, re.IGNORECASE)
        self.filter_link_regx = re.compile(filter_link_regx)
        self.links = []
        self.seed_url = seed_url
        # Output directory for saved articles.
        # NOTE(review): assumed to already exist — save() does not create it.
        self.basepath = os.getcwd() + "/data/"

    def handler_data(self, html):
        """Process one fetched page: collect its links, then extract and save its article."""
        self.get_links(html)
        self.get_data(html)

    def get_links(self, html):
        """Rebuild self.links with the absolute URLs in *html* that pass the filter."""
        # Guarded local import keeps this working on both Python 3 and the
        # Python 2 interpreter the file was originally written for.
        try:
            from urllib.parse import urljoin  # Python 3
        except ImportError:
            from urlparse import urljoin  # Python 2
        self.links = []
        for link in self.link_regx.findall(html):
            if self.filter_link_regx.match(link):
                # Resolve relative hrefs against the seed URL.
                self.links.append(urljoin(self.seed_url, link))

    def get_data(self, html):
        """Extract the article title and body from *html* and save the body under the title."""
        tree = lxml.html.fromstring(html)
        title = tree.cssselect('div.article_title > h1 > span.link_title > a')[0].text_content()
        # Strip characters that are illegal in filenames, plus newlines.
        # Raw string fixes the original's invalid '\/' escape; backslash is
        # included since it is illegal in Windows filenames too.
        # title.strip() replaces str.strip(str(title)), which crashed with
        # UnicodeEncodeError on non-ASCII titles under Python 2.
        title = re.sub(r'[\\/:*?"<>|\r\n]', "", title.strip())
        content = tree.cssselect('div#article_content')
        if content:
            # Pass the text through unchanged (no lossy str() coercion).
            self.save(title, content[0].text_content())

    def save(self, title, content):
        """Write *content* to <basepath><title>.txt, UTF-8 encoded."""
        import io  # local import: io.open behaves identically on Python 2 and 3
        # Context manager guarantees the handle is closed even if write()
        # raises — the original open/write/close leaked it on error.
        with io.open(self.basepath + title + ".txt", "w", encoding="utf-8") as filehandler:
            filehandler.write(content)