#!/usr/bin/python3
# -*- coding: UTF-8 -*-
import os
import re
import time

import requests
import urllib3
from pyquery import PyQuery as pq

from common import save_to_file, read_file, cur_file_dir

# Suppress the InsecureRequestWarning triggered by the verify=False requests below.
urllib3.disable_warnings()
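
# All traffic is sent through this proxy with a desktop-Chrome User-Agent;
# adjust both to match your environment, or drop the proxies argument entirely.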
proxies = {
    'http': '192.168.1.50:1080',
    'https': '192.168.1.50:1080'
}
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36"
}
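
# The city, channel, and tag lists below hold the literal (Chinese) query
# values huodongxing.com expects, so they are kept untranslated.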
citys = ["北京","上海","广州","深圳","杭州","成都","南京","苏州","武汉","天津","重庆","西安","厦门","宁波","郑州","青岛","东莞","佛山","长沙","石家庄","昆明"]
channels = [
"行业",
"生活",
"亲子",
"学习"
]
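# Event tags as they appear on the site; defined for reference but not
# referenced by the crawl below.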
tags = [
"IT互联网",
"创业",
"科技",
"金融",
"游戏",
"文娱",
"电商",
"教育",
"营销",
"设计",
"地产",
"医疗",
"服务业",
"演出",
"文艺",
"手工",
"公益",
"户外出游",
"运动健康",
"聚会交友",
"休闲娱乐",
"投资理财",
"时尚",
"心理",
"体育赛事",
"音乐",
"儿童才艺",
"益智潮玩",
"儿童剧/展览",
"亲子旅游",
"早教/升学",
"课程",
"读书",
"职场",
"社团",
"讲座"
]
################################################
def url_content(url):
    """Fetch a URL and return the decoded body, or None on failure."""
    resp = None
    try:
        resp = requests.get(url, headers=headers, verify=False, proxies=proxies).content.decode("utf-8")
    except Exception as e:
        print(e)
    return resp
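
# get_city_url() calls test_proxy_ip() when a page comes back empty, but no
# such function is defined in this file. A minimal sketch is provided here
# (an assumed helper; the original may live elsewhere): probe the site
# through the configured proxy and report whether it is still reachable.
def test_proxy_ip():
    try:
        r = requests.get("https://www.huodongxing.com", headers=headers,
                         verify=False, proxies=proxies, timeout=10)
        print("proxy check: HTTP %d" % r.status_code)
    except Exception as e:
        print("proxy check failed: %s" % e)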
# Walk every channel/city combination and crawl its event-list pages.
def get_huodong_urls():
    for channel in channels:
        # progress.txt records the last (channel, city, page) that hit an
        # empty response; it is read here for reference, but automatic
        # resuming is not implemented.
        progress = read_file(os.path.join(cur_file_dir(), 'dist/huodongxing', channel), "progress.txt")
        for city in citys:
            get_city_url(channel, city)
def get_city_url(channel, city):
    url = "https://www.huodongxing.com/events?orderby=o&channel={}&city={}&page={}&isChannel=true"
    page_url = url.format(channel, city, 1)
    print("fetching: %s" % page_url)
    resp = url_content(page_url)
    if resp is None:
        print("first page failed; skipping channel:%s city:%s" % (channel, city))
        return
    print("response length: %d" % len(resp))
    # The list page embeds "count: N," and "limit: M," in inline JS; use them
    # to compute the total number of result pages (ceiling of count/limit).
    countSearch = re.search(r"count: (\d+),", resp)
    limitSearch = re.search(r"limit: (\d+),", resp)
    if countSearch and limitSearch:
        print(countSearch.group())
        print(limitSearch.group())
        count = int(countSearch.group(1))
        limit = int(limitSearch.group(1))
        page = count // limit + (1 if count % limit else 0)
    else:
        page = 1
    print("total pages: %d" % page)
    for i in range(1, page + 1):
        page_url = url.format(channel, city, i)
        print("fetching: %s" % page_url)
        print(">>>>>>>>page: %d ..." % i)
        # Skip pages already saved by a previous run.
        if os.path.exists(os.path.join(cur_file_dir(), 'dist/huodongxing', channel, city + '_' + str(i) + '.html')):
            continue
        success = False
        while not success:
            resp = url_content(page_url)
            if resp is None:
                print("request failed; retrying in 10s")
                time.sleep(10.0)
                continue
            print("response length: %d" % len(resp))
            q = pq(resp)
            items = q.find("div.search-tab-content-list-check > div.search-tab-content-item-mesh")
            if len(items) <= 0:
                print("empty result list; the crawler may have been blocked!")
                test_proxy_ip()
                # Record how far we got so a blocked run can be resumed by hand.
                progress = channel + " " + city + " " + str(i)
                save_to_file(os.path.join(cur_file_dir(), 'dist/huodongxing', channel), "progress.txt", progress)
                print("sleep 10s")
                time.sleep(10.0)
                continue
            print("save to file!")
            save_to_file(os.path.join(cur_file_dir(), 'dist/huodongxing', channel), city + '_' + str(i) + '.html', resp)
            success = True
        print("sleep 4.5s")
        time.sleep(4.5)
    print('%d pages in total for channel:%s, city:%s' % (page, channel, city))
if __name__ == '__main__':
get_huodong_urls()