# -*- coding: utf-8 -*-
import csv
import datetime
import json
import math
import time
import redis
from sshtunnel import SSHTunnelForwarder
from lib.crawler import StatsGovCn
from lib.util import DBUtilStatsGovCn
def fetch_stats_gov_cn(url, db_path, show_log=True, sleep_time=0):
"""
采集统计局信息
:param url: 统计局信息根网址
:type url: str
:param db_path: SQLite数据库路径
:type db_path: str
:param show_log: 是否显示日志
:type show_log: bool
:param sleep_time: 爬虫每次爬取后的休眠时间,单位为秒。
:type sleep_time: int
:return:
"""
# 程序开始时间
begin_time = time.time()
stats_gov_cn_crawler = StatsGovCn()
stats_gov_cn_crawler.sleep_time = sleep_time
if stats_gov_cn_crawler.check(url.replace('$ROUTE$', 'index.html'))[0] != 'province':
raise Exception('不是省级信息页面')
# 数据库操作对象
db_util = DBUtilStatsGovCn(db_path + 'db_stats.gov.cn.sqlite')
# 上级 URL 地址
url_base = {}
url_base_temp = ''
# 抓取并保存省级信息
if show_log:
print(f'[Log][{datetime.datetime.now()}] 开始抓取并保存省级信息')
provinces = stats_gov_cn_crawler.province(url.replace('$ROUTE$', 'index.html'))
db_util.truncate_province()
for province in provinces:
db_util.insert_province(province['statistical_code'], province['code'], province['name'])
url_base[province['statistical_code']] = url.replace('$ROUTE$', '')
if show_log:
print(f'[Log][{datetime.datetime.now()}] 完成抓取并保存省级信息')
print(f'[REPORT] 省级信息 {len(provinces)} 个')
# 抓取并保存地级信息
if show_log:
print(f'[Log][{datetime.datetime.now()}] 开始抓取并保存地级信息')
cities = []
db_util.truncate_city()
for province in provinces:
if show_log:
province_name_temp = province['name']
print(f'[Log][{datetime.datetime.now()}] [{provinces.index(province) + 1}/{len(provinces)}] '
f'开始抓取并保存【{province_name_temp}】')
if province['href'] != '':
url_base_temp = url_base[province['statistical_code']] + province['href']
cities_temp = stats_gov_cn_crawler.city(url_base_temp)
for city in cities_temp:
cities.append(city)
db_util.insert_city(city['statistical_code'], city['code'], city['name'], province['statistical_code'])
url_base[city['statistical_code']] = url_base_temp[0:url_base_temp.rfind('/')+1]
if show_log:
province_name_temp = province['name']
print(f'[Log][{datetime.datetime.now()}] [{provinces.index(province) + 1}/{len(provinces)}] '
f'完成抓取并保存【{province_name_temp}】')
if show_log:
print(f'[Log][{datetime.datetime.now()}] 完成抓取并保存地级信息')
print(f'[REPORT] 地级信息 {len(cities)} 个')
# 抓取并保存县级信息
if show_log:
print(f'[Log][{datetime.datetime.now()}] 开始抓取并保存县级信息')
counties = []
db_util.truncate_county()
for city in cities:
city_db_temp = db_util.select_city(city['statistical_code'])
if show_log:
province_name_temp = db_util.select_province(city_db_temp['province_statistical_code'])['name']
city_name_temp = city['name']
print(f'[Log][{datetime.datetime.now()}] [{cities.index(city) + 1}/{len(cities)}] '
f'开始抓取并保存【{province_name_temp}】【{city_name_temp}】')
if city['href'] != '':
try:
url_base_temp = url_base[city['statistical_code']] + city['href']
counties_temp = stats_gov_cn_crawler.county(url_base_temp)
except Exception as e:
if e.args[0] == '不是县级信息页面':
counties_temp = [{
'href': city['href'][city['href'].find('/')+1:],
'statistical_code': city['statistical_code'],
'code': city['statistical_code'][0:6],
'name': city['name']
}]
else:
raise e
for county in counties_temp:
counties.append(county)
db_util.insert_county(
county['statistical_code'],
county['code'],
county['name'],
city_db_temp['province_statistical_code'],
city['statistical_code']
)
url_base[county['statistical_code']] = url_base_temp[0:url_base_temp.rfind('/')+1]
if show_log:
province_name_temp = db_util.select_province(city_db_temp['province_statistical_code'])['name']
city_name_temp = city['name']
print(f'[Log][{datetime.datetime.now()}] [{cities.index(city) + 1}/{len(cities)}] '
f'完成抓取并保存【{province_name_temp}】【{city_name_temp}】')
if show_log:
print(f'[Log][{datetime.datetime.now()}] 完成抓取并保存县级信息')
print(f'[REPORT] 县级信息 {len(counties)} 个')
# 抓取并保存乡级信息
if show_log:
print(f'[Log][{datetime.datetime.now()}] 开始抓取并保存乡级信息')
towns = []
db_util.truncate_town()
for county in counties:
county_db_temp = db_util.select_county(county['statistical_code'])
if show_log:
province_name_temp = db_util.select_province(county_db_temp['province_statistical_code'])['name']
city_name_temp = db_util.select_city(county_db_temp['city_statistical_code'])['name']
county_name_temp = county['name']
print(f'[Log][{datetime.datetime.now()}] [{counties.index(county) + 1}/{len(counties)}] '
f'开始抓取并保存【{province_name_temp}】【{city_name_temp}】【{county_name_temp}】')
if county['href'] != '':
try:
url_base_temp = url_base[county['statistical_code']] + county['href']
towns_temp = stats_gov_cn_crawler.town(url_base_temp)
except Exception as e:
if e.args[0] == '不是乡级信息页面':
towns_temp = [{
'href': county['href'][county['href'].find('/')+1:],
'statistical_code': county['statistical_code'],
'code': county['statistical_code'][0:9],
'name': county['name']
}]
else:
raise e
for town in towns_temp:
towns.append(town)
db_util.insert_town(
town['statistical_code'],
town['code'],
town['name'],
county_db_temp['province_statistical_code'],
county_db_temp['city_statistical_code'],
county['statistical_code']
)
url_base[town['statistical_code']] = url_base_temp[0:url_base_temp.rfind('/')+1]
if show_log:
province_name_temp = db_util.select_province(county_db_temp['province_statistical_code'])['name']
city_name_temp = db_util.select_city(county_db_temp['city_statistical_code'])['name']
county_name_temp = county['name']
print(f'[Log][{datetime.datetime.now()}] [{counties.index(county) + 1}/{len(counties)}] '
f'完成抓取并保存【{province_name_temp}】【{city_name_temp}】【{county_name_temp}】')
if show_log:
print(f'[Log][{datetime.datetime.now()}] 完成抓取并保存乡级信息')
print(f'[REPORT] 乡级信息 {len(towns)} 个')
# 抓取并保存村级信息
if show_log:
print(f'[Log][{
没有合适的资源?快使用搜索试试~ 我知道了~
温馨提示
【资源说明】 1、该资源包括项目的全部源码，下载后可以直接使用！ 2、本项目适合作为计算机、数学、电子信息等专业的课程设计、期末大作业和毕设项目，可作为参考资料学习借鉴。 3、本资源作为“参考资料”，如果需要实现其他功能，需要能看懂代码，并且热爱钻研，自行调试。 基于Spark通过Web访问系统设计源码.zip
资源推荐
资源详情
资源评论
收起资源包目录
基于Spark通过Web访问系统设计源码.zip (75个子文件)
code_20105
datatables.html 8KB
profile.html 8KB
card.html 13KB
bootstrap-wysihtml5
bootstrap3-wysihtml5.min.css 2KB
maps.html 7KB
js
calender.js 3KB
popper.js 19KB
jquery.googlemap.js 3KB
dashboard3.js 2KB
tooltip.js 5KB
forms.js 2KB
widgets.js 2KB
dashboard.js 5KB
morris.js 2KB
dashboard2.js 3KB
chartjs.js 2KB
apexcharts.js 961KB
jquery.ui-sliders.js 1KB
echarts.js 7KB
formeditor.js 1KB
gmaps.js 65KB
chart.min.js 154KB
flot.js 3KB
barcharts.js 4KB
jvectormap.js 3KB
moment.min.js 50KB
jquery.min.js 85KB
dataclick.js 238B
scripts.js 4KB
othercharts.js 6KB
index2.html 12KB
css01
bootstrap.min.css 118KB
app.css 2KB
bootstrap
js
bootstrap.min.js 50KB
css
bootstrap.min.css 138KB
chartist
chart.chartist.js 5KB
chartist.css 14KB
chartist-plugin-tooltip.css 565B
chartist-plugin-tooltip.js 5KB
chartist.js 174KB
count-down
jquery.lwtCountdown-1.0.js 5KB
img
brand
logo.png 41KB
logo-tst.png 3KB
loader.svg 1KB
typing.svg 1KB
news
img15.jpg 1.45MB
avatar
avatar-1.jpeg.jpg 22KB
avatar-1.jpeg 22KB
favicon.ico 462B
spinner.svg 2KB
china.js 59KB
css
style.css 105KB
icons.css 770B
login.html 3KB
table.html 26KB
vector-map.html 52KB
echarts.min.js 468KB
bootstrap-datepicker
bootstrap-datepicker.css 17KB
bootstrap-datepicker.js 56KB
index.html 82KB
marker.html 2KB
爬虫代码
lib
util.py 15KB
crawler.py 14KB
worker.py 24KB
main.py 6KB
config.py 1KB
about.html 8KB
bootstrap-timepicker
bootstrap-timepicker.js 34KB
bootstrap-timepicker.min.css 3KB
index3.html 26KB
bootstrap-colorpicker
bootstrap-colorpicker.min.js 20KB
bootstrap-colorpicker.min.css 13KB
Chart.js
dist
Chart.bundle.js 429KB
bootstrap-daterangepicker
daterangepicker.css 8KB
daterangepicker.js 69KB
共 75 条
- 1
资源评论
土豆片片
- 粉丝: 1558
- 资源: 5641
上传资源 快速赚钱
- 我的内容管理 展开
- 我的资源 快来上传第一个资源
- 我的收益 登录查看自己的收益
- 我的积分 登录查看自己的积分
- 我的C币 登录后查看C币余额
- 我的收藏
- 我的下载
- 下载帮助
安全验证
文档复制为VIP权益,开通VIP直接复制
信息提交成功