AUTHORS.txt
CHANGES.txt
LICENSE.txt
MANIFEST.in
README.rst
setup.cfg
setup.py
docs/Makefile
docs/__init__.py
docs/conf.py
docs/configuration.rst
docs/cookbook.rst
docs/development.rst
docs/index.rst
docs/installing.rst
docs/logic.rst
docs/make.bat
docs/news.rst
docs/pipext.py
docs/quickstart.rst
docs/usage.rst
docs/user_guide.rst
docs/reference/index.rst
docs/reference/pip.rst
docs/reference/pip_download.rst
docs/reference/pip_freeze.rst
docs/reference/pip_hash.rst
docs/reference/pip_install.rst
docs/reference/pip_list.rst
docs/reference/pip_search.rst
docs/reference/pip_show.rst
docs/reference/pip_uninstall.rst
docs/reference/pip_wheel.rst
pip/__init__.py
pip/__main__.py
pip/basecommand.py
pip/baseparser.py
pip/cmdoptions.py
pip/download.py
pip/exceptions.py
pip/index.py
pip/locations.py
pip/pep425tags.py
pip/status_codes.py
pip/wheel.py
pip.egg-info/PKG-INFO
pip.egg-info/SOURCES.txt
pip.egg-info/dependency_links.txt
pip.egg-info/entry_points.txt
pip.egg-info/not-zip-safe
pip.egg-info/requires.txt
pip.egg-info/top_level.txt
pip/_vendor/README.rst
pip/_vendor/__init__.py
pip/_vendor/appdirs.py
pip/_vendor/distro.py
pip/_vendor/ipaddress.py
pip/_vendor/ordereddict.py
pip/_vendor/pyparsing.py
pip/_vendor/re-vendor.py
pip/_vendor/retrying.py
pip/_vendor/six.py
pip/_vendor/vendor.txt
pip/_vendor/cachecontrol/__init__.py
pip/_vendor/cachecontrol/_cmd.py
pip/_vendor/cachecontrol/adapter.py
pip/_vendor/cachecontrol/cache.py
pip/_vendor/cachecontrol/compat.py
pip/_vendor/cachecontrol/controller.py
pip/_vendor/cachecontrol/filewrapper.py
pip/_vendor/cachecontrol/heuristics.py
pip/_vendor/cachecontrol/serialize.py
pip/_vendor/cachecontrol/wrapper.py
pip/_vendor/cachecontrol/caches/__init__.py
pip/_vendor/cachecontrol/caches/file_cache.py
pip/_vendor/cachecontrol/caches/redis_cache.py
pip/_vendor/colorama/__init__.py
pip/_vendor/colorama/ansi.py
pip/_vendor/colorama/ansitowin32.py
pip/_vendor/colorama/initialise.py
pip/_vendor/colorama/win32.py
pip/_vendor/colorama/winterm.py
pip/_vendor/distlib/__init__.py
pip/_vendor/distlib/compat.py
pip/_vendor/distlib/database.py
pip/_vendor/distlib/index.py
pip/_vendor/distlib/locators.py
pip/_vendor/distlib/manifest.py
pip/_vendor/distlib/markers.py
pip/_vendor/distlib/metadata.py
pip/_vendor/distlib/resources.py
pip/_vendor/distlib/scripts.py
pip/_vendor/distlib/t32.exe
pip/_vendor/distlib/t64.exe
pip/_vendor/distlib/util.py
pip/_vendor/distlib/version.py
pip/_vendor/distlib/w32.exe
pip/_vendor/distlib/w64.exe
pip/_vendor/distlib/wheel.py
pip/_vendor/distlib/_backport/__init__.py
pip/_vendor/distlib/_backport/misc.py
pip/_vendor/distlib/_backport/shutil.py
pip/_vendor/distlib/_backport/sysconfig.cfg
pip/_vendor/distlib/_backport/sysconfig.py
pip/_vendor/distlib/_backport/tarfile.py
pip/_vendor/html5lib/__init__.py
pip/_vendor/html5lib/_ihatexml.py
pip/_vendor/html5lib/_inputstream.py
pip/_vendor/html5lib/_tokenizer.py
pip/_vendor/html5lib/_utils.py
pip/_vendor/html5lib/constants.py
pip/_vendor/html5lib/html5parser.py
pip/_vendor/html5lib/serializer.py
pip/_vendor/html5lib/_trie/__init__.py
pip/_vendor/html5lib/_trie/_base.py
pip/_vendor/html5lib/_trie/datrie.py
pip/_vendor/html5lib/_trie/py.py
pip/_vendor/html5lib/filters/__init__.py
pip/_vendor/html5lib/filters/alphabeticalattributes.py
pip/_vendor/html5lib/filters/base.py
pip/_vendor/html5lib/filters/inject_meta_charset.py
pip/_vendor/html5lib/filters/lint.py
pip/_vendor/html5lib/filters/optionaltags.py
pip/_vendor/html5lib/filters/sanitizer.py
pip/_vendor/html5lib/filters/whitespace.py
pip/_vendor/html5lib/treeadapters/__init__.py
pip/_vendor/html5lib/treeadapters/genshi.py
pip/_vendor/html5lib/treeadapters/sax.py
pip/_vendor/html5lib/treebuilders/__init__.py
pip/_vendor/html5lib/treebuilders/base.py
pip/_vendor/html5lib/treebuilders/dom.py
pip/_vendor/html5lib/treebuilders/etree.py
pip/_vendor/html5lib/treebuilders/etree_lxml.py
pip/_vendor/html5lib/treewalkers/__init__.py
pip/_vendor/html5lib/treewalkers/base.py
pip/_vendor/html5lib/treewalkers/dom.py
pip/_vendor/html5lib/treewalkers/etree.py
pip/_vendor/html5lib/treewalkers/etree_lxml.py
pip/_vendor/html5lib/treewalkers/genshi.py
pip/_vendor/lockfile/__init__.py
pip/_vendor/lockfile/linklockfile.py
pip/_vendor/lockfile/mkdirlockfile.py
pip/_vendor/lockfile/pidlockfile.py
pip/_vendor/lockfile/sqlitelockfile.py
pip/_vendor/lockfile/symlinklockfile.py
pip/_vendor/packaging/__about__.py
pip/_vendor/packaging/__init__.py
pip/_vendor/packaging/_compat.py
pip/_vendor/packaging/_structures.py
pip/_vendor/packaging/markers.py
pip/_vendor/packaging/requirements.py
pip/_vendor/packaging/specifiers.py
pip/_vendor/packaging/utils.py
pip/_vendor/packaging/version.py
pip/_vendor/pkg_resources/__init__.py
pip/_vendor/progress/__init__.py
pip/_vendor/progress/bar.py
pip/_vendor/progress/counter.py
pip/_vendor/progress/helpers.py
pip/_vendor/progress/spinner.py
pip/_vendor/requests/__init__.py
pip/_vendor/requests/adapters.py
pip/_vendor/requests/api.py
pip/_vendor/requests/auth.py
pip/_vendor/requests/cacert.pem
pip/_vendor/requests/certs.py
pip/_vendor/requests/compat.py
pip/_vendor/requests/cookies.py
pip/_vendor/requests/exceptions.py
pip/_vendor/requests/hooks.py
pip/_vendor/requests/models.py
pip/_vendor/requests/sessions.py
pip/_vendor/requests/status_codes.py
pip/_vendor/requests/structures.py
pip/_vendor/requests/utils.py
pip/_vendor/requests/packages/__init__.py
pip/_vendor/requests/packages/chardet/__init__.py
pip/_vendor/requests/packages/chardet/big5freq.py
pip/_vendor/requests/packages/chardet/big5prober.py
pip/_vendor/requests/packages/chardet/chardetect.py
pip/_vendor/requests/packages/chardet/chardistribution.py
pip/_vendor/requests/packages/chardet/charsetgroupprober.py
pip/_vendor/requests/packages/chardet/charsetprober.py
pip/_vendor/requests/packages/chardet/codingstatemachine.py
pip/_vendor/requests/packages/chardet/compat.py
pip/_vendor/requests/packages/chardet/constants.py
pip/_vendor/requests/packages/chardet/cp949prober.py
pip/_vendor/requests/packages/chardet/escprober.py
pip/_vendor/requests/packages/chardet/escsm.py
pip/_vendor/requests/packages/chardet/eucjpprober.py
pip/_vendor/requests/packages/chardet/euckrfreq.py
pip/_vendor/requests/packages/chardet/euckrprober.py
pip/_vendor/requests/packages/chardet/euctwfreq.py
pip/_vendor/requests/packages/chardet/euctwprober.py
pip/_vendor/requests/packages/chardet/gb2312freq.py
pip/_vendor/requests/packages/chardet/gb2312prober.py
pip/_vendor/requests/packages/chardet/hebrewprober.py
pip/_vendor/requests/packages/chardet/jisfreq.py
pip/_vendor/requests/packages/chardet/jpcntx.py
pip/_vendor/requests/packages/chardet/langbulgarianmodel.py
pip/_vendor/requests/packages/chardet/langcyrillicmodel.py
pip/_vendor/requests/packages/chardet/langgreekmodel.py
pip/_vendor/requests/packages/chardet/langhebrewmodel.py
pip/_vendor/requests/packages/chardet/langhungarianmodel.py
pip/_vendor/requests/packages/chardet/langthaimodel.py
pip/_vendor/requests/packages/chardet/latin1prober.py
pip/_vendor/requests/packages/chardet/mbcharsetprober.py
pip/_vendor/requests/packages/chardet/mbcsgroupprober.py
pip/_vendor/requests/packages/chardet/mbcssm.py
pip/_vendor/requests/packages/chardet/sbcharsetprober.py
pip/_vendor/requests/packages/chardet/sbcsgroupprober.py
pip/_vendor/requests/packages/chardet/sjisprober.py
pip/_vendor/requests/packages/chardet/universaldetector.py
pip/_vendor/requests/packages/chardet/utf8prober.py
pip/_vendor/requests/packages/urllib3/__init__.py
pip/_vendor/requests/packages/urllib3/_collections.py
pip/_vendor/requests/packages/urllib3/connection.py
pip/_vendor/requests/packages/urllib3/connectionpool.py
pip/_vendor/requests/packages/urllib3/exceptions.py
pip/_vendor/requests/packages/urllib3/fields.py
pip/_vendor/requests/packages/urllib3/filepost.py
pip/_vendor/requests/packages/urllib3/poolmanager.py
pip/_vendor/requests/packages/urllib3/request.py
pip/_vendor/requests/packages/urllib3/response.py
pip/_vendor/requests/packages/urllib3/contrib/__in
没有合适的资源?快使用搜索试试~ 我知道了~
温馨提示
爬虫(Web Crawler)是一种自动化程序,用于从互联网上收集信息。其主要功能是访问网页、提取数据并存储,以便后续分析或展示。爬虫通常由搜索引擎、数据挖掘工具、监测系统等应用于网络数据抓取的场景。 爬虫的工作流程包括以下几个关键步骤: URL收集: 爬虫从一个或多个初始URL开始,递归或迭代地发现新的URL,构建一个URL队列。这些URL可以通过链接分析、站点地图、搜索引擎等方式获取。 请求网页: 爬虫使用HTTP或其他协议向目标URL发起请求,获取网页的HTML内容。这通常通过HTTP请求库实现,如Python中的Requests库。 解析内容: 爬虫对获取的HTML进行解析,提取有用的信息。常用的解析工具有正则表达式、XPath、Beautiful Soup等。这些工具帮助爬虫定位和提取目标数据,如文本、图片、链接等。 数据存储: 爬虫将提取的数据存储到数据库、文件或其他存储介质中,以备后续分析或展示。常用的存储形式包括关系型数据库、NoSQL数据库、JSON文件等。 遵守规则: 为避免对网站造成过大负担或触发反爬虫机制,爬虫需要遵守网站的robots.txt协议,限制访问频率和深度,并模拟人类访问行为,如设置User-Agent。 反爬虫应对: 由于爬虫的存在,一些网站采取了反爬虫措施,如验证码、IP封锁等。爬虫工程师需要设计相应的策略来应对这些挑战。 爬虫在各个领域都有广泛的应用,包括搜索引擎索引、数据挖掘、价格监测、新闻聚合等。然而,使用爬虫需要遵守法律和伦理规范,尊重网站的使用政策,并确保对被访问网站的服务器负责。
资源推荐
资源详情
资源评论
收起资源包目录
爬虫的Demo--按照不同模块上传.zip (412个子文件)
python3.6 25KB
easy_install-3.6 446B
pip3.6 402B
activate 2KB
sysconfig.cfg 3KB
scrapy.cfg 253B
scrapy.cfg 251B
pyvenv.cfg 75B
activate.csh 1KB
classroom1.csv 66B
classroom.csv 66B
.DS_Store 6KB
.DS_Store 6KB
.DS_Store 6KB
.DS_Store 6KB
.DS_Store 6KB
easy_install 438B
setuptools-28.8.0-py3.6.egg 236KB
t64.exe 96KB
w64.exe 92KB
t32.exe 87KB
w32.exe 84KB
activate.fish 2KB
movie.html 17KB
tencent.html 15KB
re_demo.iml 567B
Demo02.iml 499B
untitled2.iml 451B
03CrawlerAdvanced.iml 438B
EmojiDemo.iml 438B
ajax_spider_demo.iml 438B
01NetworkRequests.iml 398B
wxapp.iml 398B
scrapy_demo.iml 398B
qsbk.iml 398B
untitled3.iml 398B
wxjc.json 592KB
duanzi.json 120KB
person.json 105B
README.md 336B
.name 16B
not-zip-safe 1B
cacert.pem 337KB
pip 396B
pip3 398B
PKG-INFO 3KB
qq.png 610KB
easy-install.pth 52B
setuptools.pth 30B
pyparsing.py 219KB
html5parser.py 114KB
__init__.py 101KB
tarfile.py 90KB
constants.py 81KB
big5freq.py 81KB
ipaddress.py 78KB
_tokenizer.py 75KB
util.py 52KB
locators.py 50KB
database.py 49KB
jisfreq.py 46KB
req_install.py 45KB
euckrfreq.py 45KB
compat.py 40KB
index.py 39KB
wheel.py 38KB
metadata.py 38KB
distro.py 37KB
gb2312freq.py 35KB
euctwfreq.py 34KB
req_set.py 34KB
connectionpool.py 33KB
_inputstream.py 32KB
download.py 31KB
wheel.py 31KB
models.py 30KB
six.py 29KB
six.py 29KB
specifiers.py 27KB
__init__.py 27KB
sysconfig.py 26KB
shutil.py 25KB
sessions.py 25KB
sanitizer.py 25KB
utils.py 24KB
version.py 23KB
dictconfig.py 23KB
appdirs.py 22KB
index.py 21KB
adapters.py 19KB
mbcssm.py 19KB
jpcntx.py 19KB
response.py 18KB
cookies.py 18KB
langcyrillicmodel.py 17KB
install.py 17KB
_ihatexml.py 16KB
cmdoptions.py 16KB
scripts.py 15KB
manifest.py 14KB
共 412 条
- 1
- 2
- 3
- 4
- 5
资源评论
JJJ69
- 粉丝: 6365
- 资源: 5917
下载权益
C知道特权
VIP文章
课程特权
开通VIP
上传资源 快速赚钱
- 我的内容管理 展开
- 我的资源 快来上传第一个资源
- 我的收益 登录查看自己的收益
- 我的积分 登录查看自己的积分
- 我的C币 登录后查看C币余额
- 我的收藏
- 我的下载
- 下载帮助
最新资源
资源上传下载、课程学习等过程中有任何疑问或建议,欢迎提出宝贵意见哦~我们会及时处理!
点击此处反馈
安全验证
文档复制为VIP权益,开通VIP直接复制
信息提交成功