"""
Multi-threaded crawler for m.sohu.com: a Redis list serves as the task
queue, a Redis set tracks visited URLs, and fetched pages are stored in
MongoDB keyed by the SHA-1 digest of their URL.
"""
import pickle
import zlib
from enum import Enum, unique
from hashlib import sha1
from random import random
from threading import Thread, current_thread, local
from time import sleep
from urllib.parse import urlparse

import pymongo
import redis
import requests
from bs4 import BeautifulSoup
from bson import Binary


@unique
class SpiderStatus(Enum):
    IDLE = 0
    WORKING = 1


def decode_page(page_bytes, charsets=('utf-8',)):
    """Try each charset in turn and return the first successful decode."""
    page_html = None
    for charset in charsets:
        try:
            page_html = page_bytes.decode(charset)
            break
        except UnicodeDecodeError:
            pass
    return page_html
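
# Example (hypothetical bytes, for illustration only):
#
#     >>> decode_page('搜狐'.encode('gbk'), charsets=('utf-8', 'gbk'))
#     '搜狐'
#
# The utf-8 attempt raises UnicodeDecodeError, so the gbk attempt wins;
# if every charset fails, the function returns None.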


class Retry(object):
    """Decorator that retries a callable when the given errors occur."""

    def __init__(self, *, retry_times=3,
                 wait_secs=5, errors=(Exception,)):
        self.retry_times = retry_times
        self.wait_secs = wait_secs
        self.errors = errors

    def __call__(self, fn):
        def wrapper(*args, **kwargs):
            for _ in range(self.retry_times):
                try:
                    return fn(*args, **kwargs)
                except self.errors as e:
                    print(e)
                    # Back off for a randomized interval before retrying.
                    sleep((random() + 1) * self.wait_secs)
            return None
        return wrapper
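
# Usage sketch (an assumed example, not part of the crawler): any flaky
# callable can be wrapped the same way Spider.fetch() is below.
#
#     @Retry(retry_times=2, wait_secs=1, errors=(requests.RequestException,))
#     def ping(url):
#         return requests.get(url, timeout=5).status_code
#
# Once every attempt has failed, the wrapper returns None instead of raising.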


class Spider(object):

    def __init__(self):
        self.status = SpiderStatus.IDLE

    @Retry()
    def fetch(self, current_url, *, charsets=('utf-8',),
              user_agent=None, proxies=None):
        """Download a page and decode it; return None on non-200 responses."""
        thread_name = current_thread().name
        print(f'[{thread_name}]: {current_url}')
        headers = {'user-agent': user_agent} if user_agent else {}
        # timeout keeps a dead connection from hanging a worker thread forever
        resp = requests.get(current_url, headers=headers,
                            proxies=proxies, timeout=10)
        return decode_page(resp.content, charsets) \
            if resp.status_code == 200 else None

    def parse(self, html_page, *, domain='m.sohu.com'):
        """Extract same-domain links and queue the unvisited ones in Redis."""
        soup = BeautifulSoup(html_page, 'lxml')
        for a_tag in soup.body.select('a[href]'):
            parser = urlparse(a_tag.attrs['href'])
            scheme = parser.scheme or 'http'
            netloc = parser.netloc or domain
            if scheme != 'javascript' and netloc == domain:
                path = parser.path
                query = '?' + parser.query if parser.query else ''
                full_url = f'{scheme}://{netloc}{path}{query}'
                redis_client = thread_local.redis_client
                if not redis_client.sismember('visited_urls', full_url):
                    redis_client.rpush('m_sohu_task', full_url)

    def extract(self, html_page):
        pass

    def store(self, data_dict):
        # redis_client = thread_local.redis_client
        # mongo_db = thread_local.mongo_db
        pass
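
# A minimal sketch of what store() might look like, assuming the
# thread-local MongoDB handle set up by SpiderThread.run() and a
# hypothetical `data_dict` that already carries an '_id' key:
#
#     def store(self, data_dict):
#         mongo_db = thread_local.mongo_db
#         if not mongo_db.webpages.find_one({'_id': data_dict['_id']}):
#             mongo_db.webpages.insert_one(data_dict)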


class SpiderThread(Thread):

    def __init__(self, name, spider):
        super().__init__(name=name, daemon=True)
        self.spider = spider

    def run(self):
        redis_client = redis.Redis(host='1.2.3.4', port=6379,
                                   password='1qaz2wsx')
        mongo_client = pymongo.MongoClient(host='1.2.3.4', port=27017)
        # Each thread keeps its own connections in thread-local storage
        # so Spider.parse() can reach them without passing handles around.
        thread_local.redis_client = redis_client
        thread_local.mongo_db = mongo_client.msohu
        while True:
            current_url = redis_client.lpop('m_sohu_task')
            while not current_url:
                # Queue is empty; poll until another thread pushes a URL.
                current_url = redis_client.lpop('m_sohu_task')
            self.spider.status = SpiderStatus.WORKING
            current_url = current_url.decode('utf-8')
            if not redis_client.sismember('visited_urls', current_url):
                redis_client.sadd('visited_urls', current_url)
                html_page = self.spider.fetch(current_url)
                if html_page not in [None, '']:
                    # Use the URL's SHA-1 digest as a stable document id.
                    hasher = hasher_proto.copy()
                    hasher.update(current_url.encode('utf-8'))
                    doc_id = hasher.hexdigest()
                    sohu_data_coll = mongo_client.msohu.webpages
                    if not sohu_data_coll.find_one({'_id': doc_id}):
                        sohu_data_coll.insert_one({
                            '_id': doc_id,
                            'url': current_url,
                            'page': Binary(zlib.compress(pickle.dumps(html_page)))
                        })
                    self.spider.parse(html_page)
            self.spider.status = SpiderStatus.IDLE
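
# The thread-local pattern used in run() above, in isolation (a minimal
# sketch; `make_connection` is a hypothetical factory):
#
#     tl = local()
#
#     def worker():
#         tl.conn = make_connection()  # visible only to this thread
#         ...                          # use tl.conn freely, no locking
#
# Every Thread sees its own `tl.conn`, so connections are never shared
# between threads.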


def is_any_alive(spider_threads):
    return any(spider_thread.spider.status == SpiderStatus.WORKING
               for spider_thread in spider_threads)


# Shared module-level state: per-thread connection storage and a SHA-1
# prototype that worker threads copy instead of re-creating.
thread_local = local()
hasher_proto = sha1()
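
# Copying the prototype is a small optimization over constructing a new
# sha1() per URL; the copy starts from the same (empty) initial state.
# It is equivalent to:
#
#     doc_id = sha1(current_url.encode('utf-8')).hexdigest()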


def main():
    redis_client = redis.Redis(host='1.2.3.4', port=6379,
                               password='1qaz2wsx')
    if not redis_client.exists('m_sohu_task'):
        # Seed the task queue with the start URL on the first run.
        redis_client.rpush('m_sohu_task', 'http://m.sohu.com/')
    spider_threads = [SpiderThread('thread-%d' % i, Spider())
                      for i in range(10)]
    for spider_thread in spider_threads:
        spider_thread.start()
    # Keep the main thread alive (without spinning the CPU) until the
    # queue is drained and every spider has gone back to idle.
    while redis_client.exists('m_sohu_task') or is_any_alive(spider_threads):
        sleep(5)
    print('Over!')


if __name__ == '__main__':
    main()