# Scrapy settings for myblog project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://docs.scrapy.org/en/latest/topics/settings.html
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
# Project identity. The spider package path is written once and reused so the
# two spider-related settings cannot drift apart.
BOT_NAME = "myblog"
NEWSPIDER_MODULE = "myblog.spiders"
SPIDER_MODULES = [NEWSPIDER_MODULE]
# User-Agent string used to identify the crawler.
# This must be the bare header *value*: a leading "User-Agent':'" fragment
# (a pasted header name) would be sent verbatim as part of the value.
# Kept identical to the "User-Agent" entry in DEFAULT_REQUEST_HEADERS.
USER_AGENT = (
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/27.0.1453.94 Safari/537.36"
)
# robots.txt handling: set to False here, so robots.txt rules are NOT obeyed.
ROBOTSTXT_OBEY = False
# Maximum concurrent requests performed by Scrapy (default: 16); raised to 32.
CONCURRENT_REQUESTS = 32
# Delay for requests to the same website (default: 0).
# NOTE(review): the Scrapy docs express this delay in seconds - confirm that a
# delay of 3 is intended together with CONCURRENT_REQUESTS = 32.
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Default headers attached to outgoing requests (overrides Scrapy's defaults).
# Long values are wrapped with implicit string concatenation; the resulting
# header values are unchanged.
DEFAULT_REQUEST_HEADERS = {
    "Accept": (
        "text/html,application/xhtml+xml,"
        "application/xml;q=0.9,*/*;q=0.8"
    ),
    "Accept-Language": "en",
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/27.0.1453.94 Safari/537.36"
    ),
}
# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# "myblog.middlewares.MyblogSpiderMiddleware": 543,
#}
# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# "myblog.middlewares.MyblogDownloaderMiddleware": 543,
#}
# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# "scrapy.extensions.telnet.TelnetConsole": None,
#}
# Item pipelines enabled for this project, mapped to their integer order value.
# Only the project's own MyblogPipeline is registered.
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {"myblog.pipelines.MyblogPipeline": 300}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = "httpcache"
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage"
# Set settings whose default value is deprecated to a future-proof value
# Request fingerprinter implementation version, pinned explicitly.
REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"
# Twisted reactor class to install: the asyncio-based selector reactor.
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
# Character encoding used for exported feeds.
FEED_EXPORT_ENCODING = "utf-8"
# MySQL database connection settings.
# Each value may be overridden by an environment variable of the same name so
# that real credentials do not have to be committed to source control. The
# fallbacks are the original placeholder values - change them for your setup.
import os

MYSQL_HOST = os.environ.get("MYSQL_HOST", "localhost")
MYSQL_DBNAME = os.environ.get("MYSQL_DBNAME", "test")  # database name, please change
MYSQL_USER = os.environ.get("MYSQL_USER", "root")  # database account, please change
MYSQL_PASSWD = os.environ.get("MYSQL_PASSWD", "123456")  # database password, please change
# Database port, used in db; environment values are strings, so convert to
# keep the setting an int as before.
MYSQL_PORT = int(os.environ.get("MYSQL_PORT", "3306"))
# 没有合适的资源?快使用搜索试试~ 我知道了~
# 温馨提示
# 写在前面 本期内容:基于scrapy+mysql爬取博客信息并保存到数据库中 实验需求 - anaconda丨pycharm - python3.11.4 - scrapy - mysql 实验描述 本次实验实现了:使用Scrapy框架爬取博客专栏的目录信息并保存到MySQL数据库中,实验主要涉及到Python的爬虫技术以及MySQL的基本操作,需要有一定的基础。 实验框架 - Scrapy 实验需求 - Scrapy:Scrapy是一个基于Python的开源网络爬虫框架,用于快速、高效地获取网页数据。它具有强大的抓取能力,支持多线程和分布式爬虫,能够并行爬取多个网页。Scrapy提供了方便的API和丰富的功能,可以自定义爬虫规则和处理流程,支持数据的持久化存储和导出。它还提供了可视化的调试工具和强大的反爬虫策略,可以帮助开发者更轻松地构建和管理网络爬虫。Scrapy是一个成熟、稳定和广泛应用的爬虫框架,被广泛用于数据抓取、搜索引擎和大数据分析等领域。 - MySQL:MySQL是一个开源的关系型数据库管理系统,由Oracle Corporation开发和 ……
# 资源推荐
# 资源详情
# 资源评论
# 收起资源包目录
# 24-基于scrapy+mysql爬取博客信息并保存到数据库中.rar (14个子文件)
# 24-基于scrapy+mysql爬取博客信息并保存到数据库中
# myblog
# scrapy.cfg 266B
# myblog
# __init__.py 0B
# pipelines.py 2KB
# spiders
# __init__.py 161B
# sp_blogs.py 1KB
# __pycache__
# __init__.cpython-311.pyc 238B
# sp_blogs.cpython-311.pyc 3KB
# items.py 238B
# settings.py 4KB
# __pycache__
# __init__.cpython-311.pyc 230B
# settings.cpython-311.pyc 1KB
# pipelines.cpython-311.pyc 2KB
# middlewares.py 4KB
# db.py 2KB
# 共 14 条
# - 1
# 资源评论
# - 肖运华2024-08-18资源内容详实,描述详尽,解决了我的问题,受益匪浅,学到了。
# Want595
# - 粉丝: 9w+
# - 资源: 67
# 上传资源 快速赚钱
# - 我的内容管理 展开
# - 我的资源 快来上传第一个资源
# - 我的收益 登录查看自己的收益
# - 我的积分 登录查看自己的积分
# - 我的C币 登录后查看C币余额
# - 我的收藏
# - 我的下载
# - 下载帮助
# 最新资源
# - bootstrap企业网站前端模板下载
# - 矩阵作业-包含Eigen安装相关内容
# - CSS3几何透明层文本悬停变色特效代码.zip
# - CSS3实现的九宫格图片鼠标悬停去除遮罩层特效源码.zip
# - MQTT协议的原理、特点、工作流程及应用场景
# - Ruby语言教程从介绍入门到精通详教程跟代码.zip
# - PM2.5-Prediction-Based-on-Random-Forest-Algorithm-master.zip
# - Delphi开发详解:从入门到高级全面教程
# - 物理机安装群晖DS3617教程(用U盘做引导)
# - 本项目旨在开发一个基于情感词典加权组合方式的文本情感分析系统,通过以下几个目标来实现: 构建情感词典:收集并整理包含情感极性(正面或负面)的词汇 加权组合:通过加权机制,根据词汇在文本中的重要性、
# 资源上传下载、课程学习等过程中有任何疑问或建议,欢迎提出宝贵意见哦~我们会及时处理!
# 点击此处反馈
# 安全验证
# 文档复制为VIP权益,开通VIP直接复制
# 信息提交成功