"""
Multi-threaded crawler for m.sohu.com: a Redis list serves as the task
queue, a Redis set tracks visited URLs, and fetched pages are stored in
MongoDB keyed by the SHA-1 digest of their URL.
"""
import pickle
import zlib
from enum import Enum, unique
from hashlib import sha1
from random import random
from threading import Thread, current_thread, local
from time import sleep
from urllib.parse import urlparse

import pymongo
import redis
import requests
from bs4 import BeautifulSoup
from bson import Binary


@unique
class SpiderStatus(Enum):
    IDLE = 0
    WORKING = 1


def decode_page(page_bytes, charsets=('utf-8',)):
    """Try each charset in turn and return the first successful decode."""
    page_html = None
    for charset in charsets:
        try:
            page_html = page_bytes.decode(charset)
            break
        except UnicodeDecodeError:
            pass
    return page_html
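
# Example (hypothetical bytes, for illustration only):
#
#     >>> decode_page('搜狐'.encode('gbk'), charsets=('utf-8', 'gbk'))
#     '搜狐'
#
# The utf-8 attempt raises UnicodeDecodeError, so the gbk attempt wins;
# if every charset fails, the function returns None.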


class Retry(object):
    """Decorator that retries a callable when the given errors occur."""

    def __init__(self, *, retry_times=3,
                 wait_secs=5, errors=(Exception,)):
        self.retry_times = retry_times
        self.wait_secs = wait_secs
        self.errors = errors

    def __call__(self, fn):
        def wrapper(*args, **kwargs):
            for _ in range(self.retry_times):
                try:
                    return fn(*args, **kwargs)
                except self.errors as e:
                    print(e)
                    # Back off for a randomized interval before retrying.
                    sleep((random() + 1) * self.wait_secs)
            return None
        return wrapper
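
# Usage sketch (an assumed example, not part of the crawler): any flaky
# callable can be wrapped the same way Spider.fetch() is below.
#
#     @Retry(retry_times=2, wait_secs=1, errors=(requests.RequestException,))
#     def ping(url):
#         return requests.get(url, timeout=5).status_code
#
# Once every attempt has failed, the wrapper returns None instead of raising.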


class Spider(object):

    def __init__(self):
        self.status = SpiderStatus.IDLE

    @Retry()
    def fetch(self, current_url, *, charsets=('utf-8',),
              user_agent=None, proxies=None):
        """Download a page and decode it; return None on non-200 responses."""
        thread_name = current_thread().name
        print(f'[{thread_name}]: {current_url}')
        headers = {'user-agent': user_agent} if user_agent else {}
        # timeout keeps a dead connection from hanging a worker thread forever
        resp = requests.get(current_url, headers=headers,
                            proxies=proxies, timeout=10)
        return decode_page(resp.content, charsets) \
            if resp.status_code == 200 else None

    def parse(self, html_page, *, domain='m.sohu.com'):
        """Extract same-domain links and queue the unvisited ones in Redis."""
        soup = BeautifulSoup(html_page, 'lxml')
        for a_tag in soup.body.select('a[href]'):
            parser = urlparse(a_tag.attrs['href'])
            scheme = parser.scheme or 'http'
            netloc = parser.netloc or domain
            if scheme != 'javascript' and netloc == domain:
                path = parser.path
                query = '?' + parser.query if parser.query else ''
                full_url = f'{scheme}://{netloc}{path}{query}'
                redis_client = thread_local.redis_client
                if not redis_client.sismember('visited_urls', full_url):
                    redis_client.rpush('m_sohu_task', full_url)

    def extract(self, html_page):
        pass

    def store(self, data_dict):
        # redis_client = thread_local.redis_client
        # mongo_db = thread_local.mongo_db
        pass
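
# A minimal sketch of what store() might look like, assuming the
# thread-local MongoDB handle set up by SpiderThread.run() and a
# hypothetical `data_dict` that already carries an '_id' key:
#
#     def store(self, data_dict):
#         mongo_db = thread_local.mongo_db
#         if not mongo_db.webpages.find_one({'_id': data_dict['_id']}):
#             mongo_db.webpages.insert_one(data_dict)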


class SpiderThread(Thread):

    def __init__(self, name, spider):
        super().__init__(name=name, daemon=True)
        self.spider = spider

    def run(self):
        redis_client = redis.Redis(host='1.2.3.4', port=6379,
                                   password='1qaz2wsx')
        mongo_client = pymongo.MongoClient(host='1.2.3.4', port=27017)
        # Each thread keeps its own connections in thread-local storage
        # so Spider.parse() can reach them without passing handles around.
        thread_local.redis_client = redis_client
        thread_local.mongo_db = mongo_client.msohu
        while True:
            current_url = redis_client.lpop('m_sohu_task')
            while not current_url:
                # Queue is empty; poll until another thread pushes a URL.
                current_url = redis_client.lpop('m_sohu_task')
            self.spider.status = SpiderStatus.WORKING
            current_url = current_url.decode('utf-8')
            if not redis_client.sismember('visited_urls', current_url):
                redis_client.sadd('visited_urls', current_url)
                html_page = self.spider.fetch(current_url)
                if html_page not in [None, '']:
                    # Use the URL's SHA-1 digest as a stable document id.
                    hasher = hasher_proto.copy()
                    hasher.update(current_url.encode('utf-8'))
                    doc_id = hasher.hexdigest()
                    sohu_data_coll = mongo_client.msohu.webpages
                    if not sohu_data_coll.find_one({'_id': doc_id}):
                        sohu_data_coll.insert_one({
                            '_id': doc_id,
                            'url': current_url,
                            'page': Binary(zlib.compress(pickle.dumps(html_page)))
                        })
                    self.spider.parse(html_page)
            self.spider.status = SpiderStatus.IDLE
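
# The thread-local pattern used in run() above, in isolation (a minimal
# sketch; `make_connection` is a hypothetical factory):
#
#     tl = local()
#
#     def worker():
#         tl.conn = make_connection()  # visible only to this thread
#         ...                          # use tl.conn freely, no locking
#
# Every Thread sees its own `tl.conn`, so connections are never shared
# between threads.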


def is_any_alive(spider_threads):
    return any(spider_thread.spider.status == SpiderStatus.WORKING
               for spider_thread in spider_threads)


# Shared module-level state: per-thread connection storage and a SHA-1
# prototype that worker threads copy instead of re-creating.
thread_local = local()
hasher_proto = sha1()
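
# Copying the prototype is a small optimization over constructing a new
# sha1() per URL; the copy starts from the same (empty) initial state.
# It is equivalent to:
#
#     doc_id = sha1(current_url.encode('utf-8')).hexdigest()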


def main():
    redis_client = redis.Redis(host='1.2.3.4', port=6379,
                               password='1qaz2wsx')
    if not redis_client.exists('m_sohu_task'):
        # Seed the task queue with the start URL on the first run.
        redis_client.rpush('m_sohu_task', 'http://m.sohu.com/')
    spider_threads = [SpiderThread('thread-%d' % i, Spider())
                      for i in range(10)]
    for spider_thread in spider_threads:
        spider_thread.start()
    # Keep the main thread alive (without spinning the CPU) until the
    # queue is drained and every spider has gone back to idle.
    while redis_client.exists('m_sohu_task') or is_any_alive(spider_threads):
        sleep(5)
    print('Over!')


if __name__ == '__main__':
    main()