#!/usr/bin/python3
# -*- coding: UTF-8 -*-
import os
import re
import time

import requests
import urllib3
from pyquery import PyQuery as pq

from common import save_to_file, read_file, cur_file_dir

# Suppress the InsecureRequestWarning triggered by the verify=False requests below.
urllib3.disable_warnings()
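
# All traffic is sent through this proxy with a desktop-Chrome User-Agent;
# adjust both to match your environment, or drop the proxies argument entirely.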
proxies = {
    'http': '192.168.1.50:1080',
    'https': '192.168.1.50:1080'
}
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36"
}
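
# The city, channel, and tag lists below hold the literal (Chinese) query
# values huodongxing.com expects, so they are kept untranslated.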
citys = ["北京","上海","广州","深圳","杭州","成都","南京","苏州","武汉","天津","重庆","西安","厦门","宁波","郑州","青岛","东莞","佛山","长沙","石家庄","昆明"]
channels = [
"行业",
"生活",
"亲子",
"学习"
]
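# Event tags as they appear on the site; defined for reference but not
# referenced by the crawl below.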
tags = [
"IT互联网",
"创业",
"科技",
"金融",
"游戏",
"文娱",
"电商",
"教育",
"营销",
"设计",
"地产",
"医疗",
"服务业",
"演出",
"文艺",
"手工",
"公益",
"户外出游",
"运动健康",
"聚会交友",
"休闲娱乐",
"投资理财",
"时尚",
"心理",
"体育赛事",
"音乐",
"儿童才艺",
"益智潮玩",
"儿童剧/展览",
"亲子旅游",
"早教/升学",
"课程",
"读书",
"职场",
"社团",
"讲座"
]
################################################
def url_content(url):
    """Fetch a URL and return the decoded body, or None on failure."""
    resp = None
    try:
        resp = requests.get(url, headers=headers, verify=False, proxies=proxies).content.decode("utf-8")
    except Exception as e:
        print(e)
    return resp
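
# get_city_url() calls test_proxy_ip() when a page comes back empty, but no
# such function is defined in this file. A minimal sketch is provided here
# (an assumed helper; the original may live elsewhere): probe the site
# through the configured proxy and report whether it is still reachable.
def test_proxy_ip():
    try:
        r = requests.get("https://www.huodongxing.com", headers=headers,
                         verify=False, proxies=proxies, timeout=10)
        print("proxy check: HTTP %d" % r.status_code)
    except Exception as e:
        print("proxy check failed: %s" % e)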
# Walk every channel/city combination and crawl its event-list pages.
def get_huodong_urls():
    for channel in channels:
        # progress.txt records the last (channel, city, page) that hit an
        # empty response; it is read here for reference, but automatic
        # resuming is not implemented.
        progress = read_file(os.path.join(cur_file_dir(), 'dist/huodongxing', channel), "progress.txt")
        for city in citys:
            get_city_url(channel, city)
def get_city_url(channel, city):
    url = "https://www.huodongxing.com/events?orderby=o&channel={}&city={}&page={}&isChannel=true"
    page_url = url.format(channel, city, 1)
    print("fetching: %s" % page_url)
    resp = url_content(page_url)
    if resp is None:
        print("first page failed; skipping channel:%s city:%s" % (channel, city))
        return
    print("response length: %d" % len(resp))
    # The list page embeds "count: N," and "limit: M," in inline JS; use them
    # to compute the total number of result pages (ceiling of count/limit).
    countSearch = re.search(r"count: (\d+),", resp)
    limitSearch = re.search(r"limit: (\d+),", resp)
    if countSearch and limitSearch:
        print(countSearch.group())
        print(limitSearch.group())
        count = int(countSearch.group(1))
        limit = int(limitSearch.group(1))
        page = count // limit + (1 if count % limit else 0)
    else:
        page = 1
    print("total pages: %d" % page)
    for i in range(1, page + 1):
        page_url = url.format(channel, city, i)
        print("fetching: %s" % page_url)
        print(">>>>>>>>page: %d ..." % i)
        # Skip pages already saved by a previous run.
        if os.path.exists(os.path.join(cur_file_dir(), 'dist/huodongxing', channel, city + '_' + str(i) + '.html')):
            continue
        success = False
        while not success:
            resp = url_content(page_url)
            if resp is None:
                print("request failed; retrying in 10s")
                time.sleep(10.0)
                continue
            print("response length: %d" % len(resp))
            q = pq(resp)
            items = q.find("div.search-tab-content-list-check > div.search-tab-content-item-mesh")
            if len(items) <= 0:
                print("empty result list; the crawler may have been blocked!")
                test_proxy_ip()
                # Record how far we got so a blocked run can be resumed by hand.
                progress = channel + " " + city + " " + str(i)
                save_to_file(os.path.join(cur_file_dir(), 'dist/huodongxing', channel), "progress.txt", progress)
                print("sleep 10s")
                time.sleep(10.0)
                continue
            print("save to file!")
            save_to_file(os.path.join(cur_file_dir(), 'dist/huodongxing', channel), city + '_' + str(i) + '.html', resp)
            success = True
        print("sleep 4.5s")
        time.sleep(4.5)
    print('%d pages in total for channel:%s, city:%s' % (page, channel, city))
if __name__ == '__main__':
get_huodong_urls()