# -*- coding: utf-8 -*-
# Legacy Scrapy (pre-1.0) import paths; newer releases expose these as
# scrapy.spiders.CrawlSpider/Rule and scrapy.linkextractors.LinkExtractor.
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from searchEngine.items import SearchengineItem
class DmozSpider(CrawlSpider):
name = "dmoz"
allowed_domains = [
"news.cn",
"news.xinhuanet.com"
]
start_urls = [
"http://www.news.cn/",
"http://www.news.cn/mil/index.htm",
"http://www.news.cn/politics/"
"http://www.news.cn/world/index.htm",
"http://www.news.cn/tech/index.htm"
]
    rules = (
        # Follow every link on the allowed domains and hand each fetched page
        # to item_parse. The callback must not be named 'parse', which
        # CrawlSpider reserves for its own link-following logic.
        Rule(SgmlLinkExtractor(allow=('/',)), callback='item_parse'),
    )
    def item_parse(self, response):
        # Extract the page URL, title and meta tags into a SearchengineItem.
        item = SearchengineItem()
        item['url'] = response.url
        item['title'] = response.selector.xpath('//title/text()').extract()
        item['keywords'] = response.selector.xpath('//meta[@name="keywords"]/@content').extract()
        item['description'] = response.selector.xpath('//meta[@name="description"]/@content').extract()
        # Debug output: extract() returns unicode strings, so encode them
        # before printing to the console (Python 2).
        for t in item['title']:
            print t.encode('utf-8')
        for t in item['keywords']:
            print t.encode('utf-8')
        for t in item['description']:
            print t.encode('utf-8')
        return item
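# The spider assumes a SearchengineItem declaring the four fields assigned
# above. A minimal sketch of searchEngine/items.py (hypothetical; the actual
# project definition may differ):
#
#     from scrapy.item import Item, Field
#
#     class SearchengineItem(Item):
#         url = Field()
#         title = Field()
#         keywords = Field()
#         description = Field()
#
# Run the spider from the project root with:  scrapy crawl dmoz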
