import urllib2
import sys
import re
import time
import ConfigParser
import sqlite3
import hashlib
cx=sqlite3.connect("data.db")
cu=cx.cursor()
fileSystemEncoding=sys.getfilesystemencoding()
configFileName="config.ini"
configSection="config"
md5Serializer = hashlib.md5()
config = ConfigParser.ConfigParser()
config.read(configFileName)
url=config.get(configSection,'url')
pageEncoding=config.get(configSection,'encoding')
hrefRegex=config.get(configSection,'href')
titleRegex=config.get(configSection,'title')
descriptionRegex=config.get(configSection,'description')
contentRegex=config.get(configSection,'content')
authorRegex=config.get(configSection,'author')
pubdateRegex=config.get(configSection,'pubdate')
interval=config.getint(configSection,'interval')
useragent=config.get(configSection,'useragent')
print "Please Waiting..."
while True:
timeNow =time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(time.time()))
opener = urllib2.build_opener()
opener.addheaders = [('User-agent', useragent)]
htmlAll = opener.open(url).read()
hrefResult=re.findall(hrefRegex, htmlAll)
titleResult=re.findall(titleRegex, htmlAll)
descriptionResult=re.findall(descriptionRegex, htmlAll)
contentResult=re.findall(contentRegex, htmlAll)
authorResult=re.findall(authorRegex, htmlAll)
pubdateResult=re.findall(pubdateRegex, htmlAll)
count=len(titleResult)
for i in range(count):
href=hrefResult[i].decode(pageEncoding,'ignore')
md5=hashlib.md5(url+href).hexdigest()
cu.execute("SELECT Id FROM Articles where MD5 ='"+md5+"' limit 1")
if(len(cu.fetchall())>0):
continue
title=titleResult[i].decode(pageEncoding,'ignore')
description=''
if(len(descriptionResult)>i):
description=descriptionResult[i].decode(pageEncoding,'ignore')
content=''
if(len(contentResult)>i):
content=contentResult[i].decode(pageEncoding,'ignore')
author=''
if(len(authorResult)>i):
author=authorResult[i].decode(pageEncoding,'ignore')
pubdate=''
if(len(pubdateResult)>i):
pubdate=pubdateResult[i].decode(pageEncoding,'ignore')
sql="insert into Articles(SourceUrl,Href,Title,MD5,Description,Content,Author,PublishDate,RecordTime)Values(?,?,?,?,?,?,?,?,?)"
cu.execute(sql,(url,href,title,md5,description,content,author,pubdate,timeNow))
print "=========="+title.encode(fileSystemEncoding,'ignore')
print description.encode(fileSystemEncoding,'ignore')
cx.commit()
print ">>>>>>"+timeNow
time.sleep(interval)