import re
from Parser import *
class NewsParser(Parser):
    """Parser that pulls notice identifiers and article text out of a page.

    The ``doParse`` method must be implemented. Its ``page`` argument is a
    dict with two keys, ``url`` and ``html``: ``page['url']`` gives the page
    URL and ``page['html']`` gives the page's HTML.
    """

    # Raw strings keep regex escapes such as ``\d`` and ``\.`` intact without
    # relying on Python passing unknown string escapes through unchanged.
    _URL_PATTERN = r'notice(?P<code>\d{6})_(?P<id>\d{1,7})\.html'
    # NOTE(review): ``content`` stops at the first ``</div>`` after the
    # opening artibody tag, so a nested <div> would truncate it.
    _HTML_PATTERN = (
        '<h1>(?P<title>.*?)</h1>.*?'
        '<div class="artibody" id="artibody">(?P<content>.*?)</div>'
    )

    def doParse(self, page):
        """Extract the fields of one page into a dict.

        Args:
            page: dict with the keys ``'url'`` and ``'html'``.

        Returns:
            A dict that always contains ``'url'``. It additionally contains
            ``'code'`` and ``'id'`` when the URL matches the notice pattern,
            and ``'title'`` and ``'content'`` when the HTML matches the
            article pattern.

        Side effects:
            Stores the URL and HTML on ``self.URL`` / ``self.Html`` and
            prints the URL and title when a title was found.
        """
        self.URL = page['url']
        self.Html = page['html']

        result = {'url': page['url']}
        result.update(self.suckItem(page['url'], self._URL_PATTERN))
        result.update(self.suckItem(page['html'], self._HTML_PATTERN))

        # ``in`` replaces dict.has_key(), which no longer exists in Python 3.
        if 'title' in result:
            # A single parenthesised expression prints the same text under
            # both the Python 2 print statement and the Python 3 function.
            print('%s %s' % (result['url'], result['title']))
        return result

    def suckItem(self, str, sPattern):
        """Search ``str`` for ``sPattern`` and return its named groups.

        The search is case-insensitive and ``.`` also matches newlines.

        Args:
            str: text to search (name kept for backward compatibility even
                though it shadows the builtin).
            sPattern: regular expression source containing named groups.

        Returns:
            ``groupdict()`` of the first match, or an empty dict when the
            pattern does not match.
        """
        pattern = re.compile(sPattern, re.IGNORECASE | re.DOTALL)
        m = pattern.search(str)
        return m.groupdict() if m else {}
- 1
- 2
- 3
- 4
- 5
- 6
前往页