package com.reptile.area.jsoup;
import java.nio.charset.Charset;
import java.util.LinkedList;
import java.util.List;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import cn.hutool.core.util.CharsetUtil;
import cn.hutool.http.HttpUtil;
import cn.hutool.json.JSONUtil;
import cn.hutool.log.StaticLog;
/**
 * Crawler for the administrative-division code pages (province, city, county,
 * town, village) published under {@link #COMMON_URL}.
 *
 * <p>Each {@code parseXxx} method downloads one page, extracts the rows for its
 * level, builds a {@code Node} per row and recurses into the next level, so the
 * result is a tree attached to the node passed in by the caller.
 *
 * <p>Child page links are resolved relative to the directory of the page that
 * contains them, which yields the URL pattern:
 * <pre>
 * provinces: commonUrl + index.html
 * cities:    commonUrl + 15.html
 * counties:  commonUrl + 15/1509.html
 * towns:     commonUrl + 15/09/150902.html
 * villages:  commonUrl + 15/09/02/150902003.html
 * </pre>
 *
 * @author zhang.xiaoming
 */
public class CityStats {

    /** Root directory of the 2018 division-code pages. */
    private static final String COMMON_URL = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/";

    /** Charset used when downloading every page. */
    private static final Charset CHARSET = CharsetUtil.CHARSET_GBK;

    /** Number of leading characters of the displayed code that identify a city. */
    private static final int CITY_CODE_LENGTH = 4;

    /** Number of leading characters of the displayed code that identify a county. */
    private static final int COUNTY_CODE_LENGTH = 6;

    /**
     * Number of leading characters of the displayed code that identify a town
     * (town pages are named with a 9-digit code, e.g. {@code 150902003.html}).
     */
    private static final int TOWN_CODE_LENGTH = 9;

    private CityStats() {
    }

    /**
     * Downloads the province index page, builds one node per province link and
     * recursively crawls every level below it. The resulting tree is logged as
     * pretty-printed JSON.
     *
     * @param url URL of the province index page
     */
    public static void parseProvince(String url) {
        String htmlStr = HttpUtil.get(url, CHARSET);
        Document document = Jsoup.parse(htmlStr);
        // Rows carrying the province links have class 'provincetr'.
        Elements elements = document.getElementsByClass("provincetr");
        List<Node> provinces = new LinkedList<Node>();
        for (Element element : elements) {
            // Every element with an href attribute inside the row is one province.
            Elements links = element.getElementsByAttribute("href");
            for (Element link : links) {
                String provinceName = link.text();
                String href = link.attr("href");
                // The href is "<2-digit province code>.html".
                String provinceCode = href.substring(0, 2);
                StaticLog.info("provinceName: {} , provinceCode: {} .", provinceName, provinceCode);
                Node provinceNode = Node.builder().code(provinceCode).name(provinceName).dataFromUrl(url).build();
                StaticLog.info("省级数据: {} ", provinceNode);
                parseCity(resolve(url, href), provinceNode);
                provinces.add(provinceNode);
            }
        }
        StaticLog.info(JSONUtil.toJsonPrettyStr(provinces));
    }

    /**
     * Downloads a city-level page and attaches the parsed cities (with their
     * sub-trees) to the given province node.
     *
     * @param url          URL of the page listing the cities of one province
     * @param provinceNode node that receives the list of city nodes
     */
    public static void parseCity(String url, Node provinceNode) {
        String htmlStr = HttpUtil.get(url, CHARSET);
        Document document = Jsoup.parse(htmlStr);
        Elements trs = document.getElementsByClass("citytr");
        List<Node> cities = new LinkedList<Node>();
        for (Element tr : trs) {
            Elements links = tr.getElementsByTag("a");
            // A usable row has exactly two links: the code and the name.
            if (links == null || links.size() != 2) {
                continue;
            }
            String href = links.get(0).attr("href");
            String cityCode = links.get(0).text().substring(0, CITY_CODE_LENGTH);
            String cityName = links.get(1).text();
            Node cityNode = Node.builder().name(cityName).code(cityCode).dataFromUrl(url).build();
            StaticLog.info(" 市级数据: {} ", cityNode);
            parseCounty(resolve(url, href), cityNode);
            cities.add(cityNode);
        }
        provinceNode.setNodes(cities);
    }

    /**
     * Downloads a county-level page and attaches the parsed counties (with
     * their sub-trees) to the given city node. Rows without links are skipped.
     *
     * @param url      URL of the page listing the counties of one city
     * @param cityNode node that receives the list of county nodes
     */
    public static void parseCounty(String url, Node cityNode) {
        String htmlStr = HttpUtil.get(url, CHARSET);
        Document document = Jsoup.parse(htmlStr);
        Elements trs = document.getElementsByClass("countytr");
        List<Node> counties = new LinkedList<Node>();
        for (Element tr : trs) {
            Elements links = tr.getElementsByTag("a");
            if (links == null || links.size() != 2) {
                continue;
            }
            String href = links.get(0).attr("href");
            String countyCode = links.get(0).text().substring(0, COUNTY_CODE_LENGTH);
            String countyName = links.get(1).text();
            Node countyNode = Node.builder().name(countyName).code(countyCode).dataFromUrl(url).build();
            StaticLog.info(" 县级数据: {} ", countyNode);
            parseTowntr(resolve(url, href), countyNode);
            // Add the county itself; adding cityNode here would make the city its own child.
            counties.add(countyNode);
        }
        cityNode.setNodes(counties);
    }

    /**
     * Downloads a town-level page and attaches the parsed towns (with their
     * villages) to the given county node. Rows without links are skipped.
     *
     * @param url        URL of the page listing the towns of one county
     * @param countyNode node that receives the list of town nodes
     */
    public static void parseTowntr(String url, Node countyNode) {
        String htmlStr = HttpUtil.get(url, CHARSET);
        Document document = Jsoup.parse(htmlStr);
        Elements trs = document.getElementsByClass("towntr");
        List<Node> towns = new LinkedList<Node>();
        for (Element tr : trs) {
            Elements links = tr.getElementsByTag("a");
            if (links == null || links.size() != 2) {
                continue;
            }
            String href = links.get(0).attr("href");
            String towntrCode = links.get(0).text().substring(0, TOWN_CODE_LENGTH);
            String towntrName = links.get(1).text();
            Node towntrNode = Node.builder().name(towntrName).code(towntrCode).dataFromUrl(url).build();
            StaticLog.info(" 乡镇级数据: {} ", towntrNode);
            // Villages belong to the town; attaching them to the county would be
            // overwritten by the setNodes call below.
            parseVillagetr(resolve(url, href), towntrNode);
            towns.add(towntrNode);
        }
        countyNode.setNodes(towns);
    }

    /**
     * Downloads a village-level page and attaches the parsed villages to the
     * given parent node. Only rows with exactly three cells are used: the first
     * cell is taken as the code and the third as the name.
     *
     * @param url        URL of the page listing the villages of one town
     * @param countyNode parent node that receives the list of village nodes
     *                   (the town node when called from {@link #parseTowntr})
     */
    public static void parseVillagetr(String url, Node countyNode) {
        String htmlStr = HttpUtil.get(url, CHARSET);
        Document document = Jsoup.parse(htmlStr);
        Elements trs = document.getElementsByClass("villagetr");
        List<Node> villages = new LinkedList<Node>();
        for (Element tr : trs) {
            Elements tds = tr.getElementsByTag("td");
            if (tds == null || tds.size() != 3) {
                continue;
            }
            String villagetrCode = tds.get(0).text();
            String villagetrName = tds.get(2).text();
            Node villagetrNode = Node.builder().code(villagetrCode).name(villagetrName).dataFromUrl(url).build();
            StaticLog.info(" 村级数据: {} ", villagetrNode);
            villages.add(villagetrNode);
        }
        countyNode.setNodes(villages);
    }

    /**
     * Resolves a link found on a page against the directory of that page.
     *
     * @param pageUrl URL of the page the link was found on
     * @param href    relative link taken from the page
     * @return everything of {@code pageUrl} up to and including its last
     *         {@code '/'}, followed by {@code href}
     */
    private static String resolve(String pageUrl, String href) {
        int lastSlash = pageUrl.lastIndexOf('/');
        return pageUrl.substring(0, lastSlash + 1) + href;
    }

    /**
     * Runs a full crawl from the province index and then crawls three sample
     * sub-trees (one city-level, one county-level and one town-level page).
     *
     * @param args unused
     */
    public static void main(String[] args) {
        /*
         * Province data:            http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/index.html
         * Inner Mongolia cities:    http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/15.html
         * Inner Mongolia counties:  http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/15/1509.html
         * Inner Mongolia towns:     http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/15/09/150902.html
         * Inner Mongolia villages:  http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/15/09/02/150902003.html
         *
         * The URLs follow a pattern: 15 is the division code of Inner Mongolia and
         * 1509 is the division code of Ulanqab. The shared prefix
         * http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/ is called commonUrl:
         *
         *   provinces: commonUrl + index.html
         *   cities:    commonUrl + <province code>.html
         *   counties:  commonUrl + <province code> + / + <city code>.html
         */
        String provinceUrl = COMMON_URL + "index.html";
        CityStats.parseProvince(provinceUrl);
        String cityUrl = COMMON_URL + "15.html";
        CityStats.parseCity(cityUrl, new Node());
        String countyUrl = COMMON_URL + "15/1509.html";
        CityStats.parseCounty(countyUrl, new Node());
        String towntrUrl = COMMON_URL + "15/09/150981.html";
        CityStats.parseTowntr(towntrUrl, new Node());
    }
}
没有合适的资源?快使用搜索试试~ 我知道了~
Java爬虫-2018国家统计局区划和城乡划分代码
共44个文件
class:14个
java:12个
sql:4个
需积分: 16 13 下载量 102 浏览量
2019-03-13
17:05:44
上传
评论
收藏 30.42MB ZIP 举报
温馨提示
java爬虫,国家统计局区划编码http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/;
资源推荐
资源详情
资源评论
收起资源包目录
reptile-area.zip (44个子文件)
reptile-area
pom.xml 2KB
target
test-classes
com
reptile
area
FileTest.class 5KB
classes
db.setting 429B
META-INF
maven
com.reptile.area
reptile-area
pom.xml 2KB
pom.properties 236B
MANIFEST.MF 116B
com
reptile
area
CityParserTest.class 2KB
jsoup
Node.class 3KB
Node$NodeBuilder.class 2KB
CityStats.class 7KB
FutureDemo$CallableTask.class 2KB
CityParserThreadTest.class 437B
decorator
SqlCityParserDecorator.class 5KB
CityParser.class 6KB
CityParserDecorator.class 785B
ICityParser.class 284B
JsonCityParserDecorator.class 1KB
LocationCityParserDecorator.class 1KB
FutureDemo.class 3KB
sql
area.sql 139.68MB
table.sql 951B
table.sql.bak 0B
generated-sources
annotations
.factorypath 875B
.settings
org.eclipse.m2e.core.prefs 90B
org.eclipse.core.resources.prefs 191B
org.eclipse.jdt.apt.core.prefs 138B
org.eclipse.jdt.core.prefs 349B
src
test
resources
java
com
reptile
area
FileTest.java 4KB
main
resources
db.setting 429B
sql
area.sql 139.68MB
table.sql 951B
java
com
reptile
area
FutureDemo.java 3KB
jsoup
Node.java 281B
CityStats.java 6KB
CityParserTest.java 1KB
CityParserThreadTest.java 116B
decorator
SqlCityParserDecorator.java 3KB
CityParser.java 5KB
ICityParser.java 285B
JsonCityParserDecorator.java 736B
LocationCityParserDecorator.java 475B
CityParserDecorator.java 380B
.project 564B
.classpath 2KB
.apt_generated_tests
共 44 条
- 1
资源评论
点滴1993
- 粉丝: 58
- 资源: 8
上传资源 快速赚钱
- 我的内容管理 展开
- 我的资源 快来上传第一个资源
- 我的收益 登录查看自己的收益
- 我的积分 登录查看自己的积分
- 我的C币 登录后查看C币余额
- 我的收藏
- 我的下载
- 下载帮助
安全验证
文档复制为VIP权益,开通VIP直接复制
信息提交成功