# -*- coding: utf-8 -*-
# Scrapy settings for cnki project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
# http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'cnki'
SPIDER_MODULES = ['cnki.spiders']
NEWSPIDER_MODULE = 'cnki.spiders'
DEFAULT_REQUEST_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:55.0) Gecko/20100101 Firefox/55.0',
    'Referer': 'http://kns.cnki.net/kns/brief/result.aspx?dbprefix=SCOD',
    # Captured session cookie; it is tied to one login session and must be refreshed once it expires.
    'Cookie': 'e01110a4-d552-48f6-32b5-999450e9ff06;Ecp_ClientcnkiUserKey=Id=2170909081801313483;kc_cnki_net_uid=76e702f3-b239-278d-1fd7-05d3143f2c9c;Ecp_IpLoginFail=170927112.81.2.110;ASP.NET_SessionId=xrzxi2drywctu0njaupkcdql;SID_kns=123121;'
}
ITEM_PIPELINES = {
    'cnki.pipelines.CnkiPipeline': 300,  # pipeline priority; with only one pipeline, any value in 0-1000 works
}
MONGODB_SERVER = "localhost"
MONGODB_PORT = 27017 #PORT DBNAME COLLECTION SERVER 必需
MONGODB_DBNAME = 'cnki' #数据库的名字
#MONGODB_DOCNAME = 'book'
MONGODB_COLLECTION = 'cnkizl' #数据库table 的名字
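# The pipeline registered in ITEM_PIPELINES above consumes these four MongoDB
# settings. A minimal sketch of such a pipeline, assuming pymongo is installed
# (the shipped cnki/pipelines.py may differ in detail):
#
#   import pymongo
#   from scrapy.utils.project import get_project_settings
#
#   class CnkiPipeline(object):
#       def __init__(self):
#           settings = get_project_settings()
#           client = pymongo.MongoClient(settings['MONGODB_SERVER'],
#                                        settings['MONGODB_PORT'])
#           db = client[settings['MONGODB_DBNAME']]
#           self.collection = db[settings['MONGODB_COLLECTION']]
#
#       def process_item(self, item, spider):
#           # Insert each scraped item as one MongoDB document.
#           self.collection.insert_one(dict(item))
#           return item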
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'cnki (+http://www.yourdomain.com)'
# Obey robots.txt rules
ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'cnki.middlewares.CnkiSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'cnki.middlewares.MyCustomDownloaderMiddleware': 543,
#}
# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
# 'cnki.pipelines.CnkiPipeline': 300,
#}
# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
This project uses the Scrapy framework to crawl patent information from CNKI (China National Knowledge Infrastructure). Scrapy is a powerful, efficient Python crawling framework well suited to professional academic sites like CNKI, and it makes a high-quality patent crawler quick to develop.

First, decide which fields to scrape. For CNKI patents these can include the patent title, patent number, patent type, applicant, inventor, application date, grant date, and citation count.

Then design the crawl flow, which generally breaks into four steps (see the sketch after this list):

1. Determine the entry URL. Start from CNKI's patent search entry page and construct the initial request URL.
2. Parse the search result pages. Use Scrapy's Selector to extract the detail-page URL of each patent entry.
3. Fetch each patent detail page and parse out the required fields with Selector.
4. Store the data, e.g. to a CSV file or a database.

In Scrapy these steps map cleanly onto the Item, Spider, and Pipeline components: the Item defines the data structure to scrape, the Spider handles page fetching and extraction, and the Pipeline handles storage.
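The project's actual items.py and cnkispider.py are not reproduced on this page, so the following is only a minimal sketch of the Item/Spider split described above. The field names, start URL, and CSS/XPath selectors are illustrative assumptions, not the project's real code:

import scrapy

class CnkiItem(scrapy.Item):
    # A few of the fields listed above; names are assumptions.
    title = scrapy.Field()       # patent title
    patent_no = scrapy.Field()   # patent number
    applicant = scrapy.Field()
    inventor = scrapy.Field()
    apply_date = scrapy.Field()  # application date

class CnkiSpider(scrapy.Spider):
    name = 'cnkispider'
    # Step 1: entry URL; the real spider presumably builds the search URL dynamically.
    start_urls = ['http://kns.cnki.net/kns/brief/result.aspx?dbprefix=SCOD']

    def parse(self, response):
        # Step 2: pull each patent entry's detail-page link from the result list.
        # 'a.fz14' is a placeholder selector; inspect the live result page markup.
        for href in response.css('a.fz14::attr(href)').getall():
            yield response.follow(href, callback=self.parse_detail)

    def parse_detail(self, response):
        # Step 3: parse the required fields from the detail page (placeholder XPaths).
        item = CnkiItem()
        item['title'] = response.xpath('//h1/text()').get()
        item['applicant'] = response.xpath('//td[@id="applicant"]/text()').get()
        # Step 4: yielding the item hands it to the pipeline configured in settings.py.
        yield item

With the settings shown earlier in place, such a crawl would be started with scrapy crawl cnkispider, assuming the spider's name attribute is 'cnkispider' to match cnkispider.py in the package listing below.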
Package directory of cnki-master.zip (24 files):

cnki-master/
    scrapy.cfg 252B
    .idea/
        cnki.iml 519B
        workspace.xml 28KB
        misc.xml 212B
        inspectionProfiles/
            profiles_settings.xml 228B
        modules.xml 260B
    cnki/
        __init__.py 0B
        pipelines.py 1KB
        settings.pyc 1024B
        spiders/
            cnkispider.pyc 2KB
            __init__.py 161B
            cnkispider.py 1KB
            __init__.pyc 167B
            __pycache__/
                cnkispider.cpython-36.pyc 1KB
                __init__.cpython-36.pyc 128B
        items.py 667B
        __init__.pyc 159B
        pipelines.pyc 1KB
        settings.py 4KB
        __pycache__/
            settings.cpython-36.pyc 663B
            __init__.cpython-36.pyc 120B
            items.cpython-36.pyc 551B
        middlewares.py 2KB
        items.pyc 767B