# -*- coding: utf-8 -*-
# Scrapy settings for cnblogs project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
# http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
# -- Core project identity ---------------------------------------------------
# Scrapy uses BOT_NAME in log output and in the default User-Agent string.
BOT_NAME = 'cnblogs'

# Where Scrapy looks for spider classes, and where `scrapy genspider`
# creates new ones.
SPIDER_MODULES = ['cnblogs.spiders']
NEWSPIDER_MODULE = 'cnblogs.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'cnblogs (+http://www.yourdomain.com)'

# Maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Delay between requests to the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Cookies are turned off (Scrapy enables them by default) so each request
# arrives without session state.
COOKIES_ENABLED = False

# Telnet console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#    'Accept-Language': 'en',
#}
# -- Spider middlewares --------------------------------------------------------
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'cnblogs.middlewares.MyCustomSpiderMiddleware': 543,
#}

# -- Downloader middlewares ------------------------------------------------
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    # Randomizes the User-Agent header (presumably drawing from the
    # USER_AGENTS pool below — verify in cnblogs/middlewares.py).
    # Priority 1 places it ahead of the built-in middlewares.
    'cnblogs.middlewares.RandomUserAgent': 1,
    # 'cnblogs.middlewares.MyCustomDownloaderMiddleware': 543,
    # Deprecated import path, kept for reference only:
    #'scrapy.contrib.downloadermiddleware.httpproxy.HttpProxyMiddleware': 110,
    # Uncomment the next two together to route traffic through PROXIES:
    #'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 110,
    #'cnblogs.middlewares.ProxyMiddleware': 100,
    # Uncomment for the Crawlera proxy service:
    #'scrapy_crawlera.CrawleraMiddleware': 600,
}
# -- Extensions ----------------------------------------------------------------
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.telnet.TelnetConsole': None,
#}

# -- Item pipelines --------------------------------------------------------
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
# Only the JSON export pipeline is active; the database pipelines can be
# re-enabled by uncommenting them (lower number = earlier in the chain).
ITEM_PIPELINES = {
    'cnblogs.pipelines.JsonWithEncodingPipeline': 300,
    #'cnblogs.pipelines.MySQLStorePipeline': 301,
    #'cnblogs.pipelines.MongoStorePipeline': 302,
}

# -- AutoThrottle (disabled by default) --------------------------------------
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
# NOTE: AutoThrottle will honour the standard settings for concurrency and delay
#AUTOTHROTTLE_ENABLED = True
#AUTOTHROTTLE_START_DELAY = 5    # initial download delay
#AUTOTHROTTLE_MAX_DELAY = 60    # ceiling applied under high latencies
#AUTOTHROTTLE_DEBUG = False    # show throttling stats per response

# -- HTTP caching (disabled by default) --------------------------------------
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

# Keep console output readable: emit only INFO and above.
LOG_LEVEL = 'INFO'
# -- MySQL connection settings (used by MySQLStorePipeline) -------------------
# SECURITY NOTE(review): these credentials were hard-coded in version
# control. They can now be overridden via environment variables of the same
# name; the literals remain only as backward-compatible defaults and should
# be rotated and removed from the repository.
import os

MYSQL_HOST = os.environ.get('MYSQL_HOST', '101.200.36.72')
MYSQL_DBNAME = os.environ.get('MYSQL_DBNAME', 'cnblogsdb')
MYSQL_USER = os.environ.get('MYSQL_USER', 'sxydb')
MYSQL_PASSWD = os.environ.get('MYSQL_PASSWD', 'sxydb')
# end of MySQL database configure setting
# Pool of real-browser User-Agent strings (IE, Firefox, Chrome, Safari,
# Opera and a few niche browsers). Presumably consumed by the
# 'cnblogs.middlewares.RandomUserAgent' downloader middleware registered
# above, which would pick one per request — verify in cnblogs/middlewares.py.
USER_AGENTS = [
"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
"Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
"Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
"Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
"Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
"Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
"Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
"Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
"Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
]
# Open HTTP proxies for the (currently disabled) ProxyMiddleware. None of
# these endpoints requires authentication, hence the empty 'user_pass'.
# NOTE(review): free proxies go stale quickly — refresh this list before
# re-enabling proxy support.
_PROXY_ENDPOINTS = (
    '111.11.228.75:80',
    '120.198.243.22:80',
    '111.8.60.9:8123',
    '101.71.27.120:80',
    '122.96.59.104:80',
    '122.224.249.122:8088',
)
PROXIES = [{'ip_port': endpoint, 'user_pass': ''} for endpoint in _PROXY_ENDPOINTS]
# -- Crawlera proxy service (disabled) ----------------------------------------
#CRAWLERA_ENABLED = True
#CRAWLERA_USER = '<API key>'
#CRAWLERA_PASS = '<your Crawlera account password>'
# -- MongoDB settings for MongoStorePipeline (disabled) -------------------------
#MONGODB_HOST = 'localhost' # Change in prod
#MONGODB_PORT = 27017 # Change in prod
#MONGODB_DATABASE = "cnblog" # Change in prod
#MONGODB_COLLECTION = "blogs_info"
#MONGODB_USERNAME = "" # Change in prod
#MONGODB_PASSWORD = "" # Change in prod
没有合适的资源?快使用搜索试试~ 我知道了~
scrapy爬取豆瓣,携程代码,动态加载页面等
共58个文件
py:28个
pyc:16个
ds_store:6个
需积分: 43 33 下载量 190 浏览量
2018-08-23
15:18:52
上传
评论 2
收藏 52KB ZIP 举报
温馨提示
python的所有爬虫案例都有实现,动态加载页面,模拟火狐浏览器,各种高难度姿势爬取内容
资源推荐
资源详情
资源评论
收起资源包目录
crawler_codes.zip (58个子文件)
douban
.DS_Store 6KB
douban
settings.py 3KB
.DS_Store 6KB
pipelines.py 671B
__init__.py 0B
items.py 1018B
spiders
.DS_Store 6KB
doumailspider.py 3KB
bookspider.py 1006B
__init__.py 161B
douban_mail_page1.csv 518B
douban_book_top250.csv 15KB
scrapy.cfg 256B
cnblog_spider.py 2KB
qqnews_spider.py 1KB
quotes_spider.py 687B
stackoverflow_spider.py 958B
TravelInfo
.DS_Store 6KB
scrapy.cfg 264B
ghostdriver.log 1KB
TravelInfo
settings.py 3KB
__init__.pyc 168B
pipelines.py 2KB
middlewares.py 3KB
useragent.pyc 2KB
items.pyc 727B
__init__.py 0B
ghostdriver.log 1KB
pipelines.pyc 2KB
items.py 589B
useragent.py 1KB
spiders
qua_spider.pyc 2KB
__init__.pyc 176B
qua_spider.py 2KB
ctrip_spider.py 2KB
__init__.py 161B
ctrip_spider.pyc 2KB
middlewares.pyc 3KB
settings.pyc 867B
MyCnblogSpider
.DS_Store 6KB
db
database.sql 498B
cnblogs
settings.py 6KB
.DS_Store 6KB
__init__.pyc 167B
pipelines.py 4KB
middlewares.py 1KB
items.pyc 600B
__init__.py 0B
pipelines.pyc 6KB
items.py 597B
spiders
__init__.pyc 175B
__init__.py 161B
cnblogs_spider.pyc 2KB
cnblogs_spider.py 2KB
middlewares.pyc 2KB
settings.pyc 3KB
scrapy.cfg 258B
json2txt.py 456B
共 58 条
- 1
资源评论
时间的快慢
- 粉丝: 49
- 资源: 8
上传资源 快速赚钱
- 我的内容管理 展开
- 我的资源 快来上传第一个资源
- 我的收益 登录查看自己的收益
- 我的积分 登录查看自己的积分
- 我的C币 登录后查看C币余额
- 我的收藏
- 我的下载
- 下载帮助
安全验证
文档复制为VIP权益,开通VIP直接复制
信息提交成功