# Scrapy settings for novel project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://docs.scrapy.org/en/latest/topics/settings.html
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = "novel"
SPIDER_MODULES = ["novel.spiders"]
NEWSPIDER_MODULE = "novel.spiders"
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = "novel (+http://www.yourdomain.com)"
# Obey robots.txt rules
ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Cookies are enabled by default; this project keeps them on (set to False to disable)
COOKIES_ENABLED = True
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en",
}
# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# "novel.middlewares.NovelSpiderMiddleware": 543,
#}
# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    "novel.middlewares.NovelDownloaderMiddleware": 400,
}
# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# "scrapy.extensions.telnet.TelnetConsole": None,
#}
# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    "novel.pipelines.NovelPipeline": 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = "httpcache"
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage"
# Set settings whose default value is deprecated to a future-proof value
REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
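# Pool of User-Agent strings for rotation. Note: Scrapy does not read this
# setting natively; it is presumably consumed by a custom downloader
# middleware such as NovelDownloaderMiddleware above.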
USER_AGENT_LIST = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
    "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
]
FEED_EXPORT_ENCODING = "utf-8"
scrapy练习 获取喜欢的书籍 (Scrapy practice: scraping your favorite books)
Scrapy is a powerful Python crawling framework: it gives developers a complete scaffold for building web crawlers that fetch page data efficiently and hand it off for further processing. In the "scrapy练习 获取喜欢的书籍" project, the goal is to learn how to use Scrapy to collect information about books from the web.
First, a quick look at Scrapy's architecture. Scrapy is built from several core components: Spiders, Items, Item Pipelines, Request/Response objects, and Selectors. Spiders are the heart of a crawler; they define how data is extracted from one or more websites. Items define the structure of the data we want to scrape, while Item Pipelines process the scraped data, e.g. cleaning, validating, and storing it. Request and Response objects handle the network traffic, and Selectors (XPath or CSS) pull data out of HTML or XML documents.
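To make this concrete, here is a minimal Item for book data; the field names are illustrative assumptions rather than the project's actual definitions:
```python
import scrapy

class BookItem(scrapy.Item):
    # Fields this exercise might collect for each book
    title = scrapy.Field()
    author = scrapy.Field()
    url = scrapy.Field()
```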
This exercise follows the Zhihu article at https://zhuanlan.zhihu.com/p/687522335, which walks through building a Scrapy crawler that collects details for specific books. You first create a new Scrapy project by running `scrapy startproject book_scraper`. Then you create a Spider, give it a name, and point it at the site to crawl. Inside the Spider you write a parsing callback (such as `parse()`) that uses XPath or CSS selectors to locate the book information.
For example, to scrape a book's title you would find the HTML element that contains it and use an XPath expression such as:
```python
response.xpath('//h1[@class="book-title"]/text()').get()
```
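Putting the pieces together, a minimal spider could look like the sketch below; the spider name, start URL, and selector are placeholder assumptions:
```python
import scrapy

class BooksSpider(scrapy.Spider):
    name = "books"
    start_urls = ["https://example.com/books"]  # placeholder URL

    def parse(self, response):
        # Extract fields with the XPath pattern shown above
        yield {
            "title": response.xpath('//h1[@class="book-title"]/text()').get(),
            "url": response.url,
        }
```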
Scrapy also provides a middleware mechanism that lets you run custom logic before a request is sent or after a response is received, for example to handle cookies, manage proxy IPs, or set the User-Agent.
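As a sketch of how such a middleware works, the class below picks a random entry from the USER_AGENT_LIST setting defined earlier for every outgoing request. (Whether the project's NovelDownloaderMiddleware does exactly this is an assumption; it is one common way to use such a list.)
```python
import random

class RandomUserAgentMiddleware:
    def __init__(self, user_agents):
        self.user_agents = user_agents

    @classmethod
    def from_crawler(cls, crawler):
        # Pull the User-Agent pool from settings.py
        return cls(crawler.settings.getlist("USER_AGENT_LIST"))

    def process_request(self, request, spider):
        # Rotate the User-Agent header for each outgoing request
        request.headers["User-Agent"] = random.choice(self.user_agents)
```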
For storage, Scrapy's Item Pipelines can write scraped data to databases (such as SQLite or MySQL), files (such as JSON or CSV), or other backends. For example, you could create a CSV export pipeline:
```python
import csv

class CSVExportPipeline:
    def open_spider(self, spider):
        # Open the output file once when the spider starts;
        # newline='' avoids blank rows on Windows
        self.file = open('books.csv', 'w', encoding='utf-8', newline='')
        self.writer = csv.writer(self.file)
        self.writer.writerow(['title', 'author', 'url'])  # header row

    def process_item(self, item, spider):
        self.writer.writerow([item['title'], item['author'], item['url']])
        return item

    def close_spider(self, spider):
        self.file.close()
```
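For this pipeline to actually run, it must be registered in ITEM_PIPELINES, just as the settings file above registers NovelPipeline; a hypothetical entry would be:
```python
ITEM_PIPELINES = {
    "novel.pipelines.CSVExportPipeline": 300,  # hypothetical module path
}
```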
In practice you also need to plan for anti-scraping measures: the site's robots.txt policy, CAPTCHAs, dynamically loaded content, and so on. For content rendered by JavaScript, you may need to pair Scrapy with a headless-browser layer such as the third-party scrapy-selenium or scrapy-splash middleware to simulate browser behavior.
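As an illustration, with the third-party scrapy-splash package installed and configured (a running Splash instance, the SPLASH_URL setting, and the middleware entries from its README), a spider can request JavaScript-rendered pages roughly like this; the URL and selector are placeholders:
```python
import scrapy
from scrapy_splash import SplashRequest  # third-party: pip install scrapy-splash

class RenderedBooksSpider(scrapy.Spider):
    name = "rendered_books"

    def start_requests(self):
        # Have Splash render the page (running its JavaScript) before parse()
        yield SplashRequest(
            "https://example.com/books",  # placeholder URL
            callback=self.parse,
            args={"wait": 2},  # seconds to wait for dynamic content to load
        )

    def parse(self, response):
        yield {"title": response.xpath('//h1[@class="book-title"]/text()').get()}
```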
The "scrapy练习 获取喜欢的书籍" project is a good starting point: working through it gives you a deeper understanding of how Scrapy operates and how to use it to scrape book information from the web. Remember to stay within the law, respect each site's robots.txt, and handle anti-scraping measures sensibly so your crawler remains sustainable and legitimate.
