/**
*
* APDPlat - Application Product Development Platform
* Copyright (c) 2013, 杨尚川, yang-shangchuan@qq.com
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
*/
package org.apdplat.search.person;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
public class PersonCollector{
private static final Logger LOG = LoggerFactory.getLogger(PersonCollector.class);
private static final int PAGES = 298;
/**
 * Crawls the person channel of renwu.hexun.com and returns one {@link Person}
 * per profile found across all {@value #PAGES} search-result pages.
 * <p>
 * Failures are logged and skipped so a single bad page or profile does not
 * abort the crawl; the returned list may therefore be partial but is never null.
 *
 * @return the collected persons (possibly empty, never {@code null})
 */
public List<Person> collect() {
    List<Person> persons = new ArrayList<>();
    // Base search URL; the page number is appended per iteration.
    String baseUrl = "http://renwu.hexun.com/search.aspx?z=All&Filter=All&page=";
    //共298页 (298 pages in total)
    for (int i = 1; i <= PAGES; i++) {
        // BUG FIX: the original mutated a single url variable with "url += i",
        // so page 2 requested "...page=12", page 3 "...page=123", and so on —
        // only the first page was ever fetched correctly. Build a fresh URL
        // for every page instead.
        String pageUrl = baseUrl + i;
        try {
            Document listPage = Jsoup.connect(pageUrl).get();
            String cssQuery = "html body div.wrap div.mainBox div.main div.contBox div.cont div.slistBox ul li a";
            LOG.debug("cssQuery: {}", cssQuery);
            for (Element element : listPage.select(cssQuery)) {
                try {
                    String personName = clean(element.text());
                    LOG.debug("人物姓名:{}", personName);
                    String href = element.attr("href");
                    LOG.debug("人物链接:{}", href);
                    // Fetch the detail page into its own variable instead of
                    // clobbering the list-page document as the original did.
                    Document detailPage = Jsoup.connect(href).get();
                    Person person = new Person();
                    person.setName(personName);
                    person.setBasicInfos(parseBasicInfos(detailPage));
                    String moreCSSQuery = "html body div.wrap div.mainBox div.main div.contBox";
                    LOG.debug("moreCSSQuery: {}", moreCSSQuery);
                    Elements moreElements = detailPage.select(moreCSSQuery);
                    person.setEducations(parseParagraphs(moreElements, 0)); //教育经历
                    person.setJobs(parseParagraphs(moreElements, 1));       //工作经历
                    person.setImportants(parseParagraphs(moreElements, 4)); //重要事件
                    persons.add(person);
                } catch (Exception e) {
                    // ROBUSTNESS: the original caught only IOException here, so an
                    // IndexOutOfBoundsException from moreElements.get(4) on an
                    // unexpected page layout killed the whole crawl. Catch broadly
                    // so one malformed profile page is skipped, not fatal.
                    LOG.error("采集出错", e);
                }
            }
        } catch (IOException ex) {
            // ROBUSTNESS: per-page try (the original wrapped the entire loop),
            // so one unreachable page does not abort the remaining pages.
            LOG.error("采集出错", ex);
        }
    }
    return persons;
}

/**
 * Normalizes scraped text: maps the rendered non-breaking-space entity to a
 * plain space and the katakana middle dot to the interpunct "·".
 * Shared by every extraction site (the original repeated this expression 5x).
 */
private static String clean(String text) {
    return text.replace(Jsoup.parse(" ").text(), " ").replace(Jsoup.parse("・").text(), "·");
}

/**
 * Parses the "key:value" basic-info list (基本信息) of a profile detail page.
 * Entries that do not split into exactly two parts on the full-width colon
 * are skipped, matching the original behavior.
 */
private static Map<String, String> parseBasicInfos(Document detailPage) {
    String basicInfoCSSQuery = "html body div.wrap div.mainBox div.main div.setBase div.right ul li";
    LOG.debug("basicInfoCSSQuery: {}", basicInfoCSSQuery);
    Map<String, String> basicInfos = new HashMap<>();
    for (Element basicElement : detailPage.select(basicInfoCSSQuery)) {
        String info = clean(basicElement.text());
        if (info != null) {
            String[] attrs = info.split(":");
            if (attrs.length == 2) {
                basicInfos.put(attrs[0], attrs[1]);
            }
        }
    }
    return basicInfos;
}

/**
 * Extracts the non-blank paragraph texts of the index-th "contBox" section of
 * a detail page (0 = 教育经历, 1 = 工作经历, 4 = 重要事件).
 *
 * @throws IndexOutOfBoundsException if the page has fewer sections than expected
 */
private static List<String> parseParagraphs(Elements moreElements, int index) {
    List<String> texts = new ArrayList<>();
    for (Element paragraph : moreElements.get(index).select("div.cont p")) {
        String text = clean(paragraph.text());
        if (text != null && !"".equals(text.trim())) {
            texts.add(text);
        }
    }
    return texts;
}
public static void main(String[] args) {
PersonCollector personCollector = new PersonCollector();
List<Person> persons = personCollector.collect();
if (persons != null) {
int i = 1;
for (Person person : persons) {
LOG.info("采集结果 " + (i++) + " "+person.getName()+ " :");
if(person.getBasicInfos() != null && person.getBasicInfos().size() > 0){
LOG.info("基本信息************************************************************");
for(Entry<String, String> basicInfo : person.getBasicInfos().entrySet()){
LOG.info(basicInfo.getKey() +":" + basicInfo.getValue());
}
}
if(person.getEducations() != null && person.getEducations().size() > 0){
LOG.info("");
LOG.info("教育经历************************************************************");
for(String education : person.getEducations()){
LOG.info(education);
}
}
if(person.getJobs() != null && person.getJobs().size() > 0){
LOG.info("");
LOG.info("工作经历************************************************************");
for(String job : person.getJobs()){
LOG.info(job);
}
}
if(person.getImportants() != null && person.getImportants().size() > 0){
LOG.info("");
LOG.info("重要事件************************************************************");
for(String important : person.getImportants()){
LOG.info(important.replace("\\?", " "));
没有合适的资源?快使用搜索试试~ 我知道了~
温馨提示
通过一个统一的用户界面帮助用户在多个搜索引擎中选择和利用合适的搜索引擎来实现检索操作,是对分布于网络的多种检索工具的全局控制机制。 自己没搜索引擎,又想要大规模的数据源,怎么办?可以对百度搜索和谷歌搜索善加利用,以小搏大,站在巨人的肩膀上。有很多的应用场景可以很巧妙地借助百度搜索和谷歌搜索来实现,比如网站的新闻采集,比如技术、品牌的新闻跟踪,比如知识库的收集,比如人机问答系统等,我之前做的一个准确率达百分之九十几的人机问答系统的数据源,其中一部分就是充分利用了百度搜索和谷歌搜索。我们可以很容易地扩展到其他的搜索引擎,使用JSoup+CSSPath技术,轻松获取页面的自定义的内容。
资源推荐
资源详情
资源评论
收起资源包目录
search-master.zip (29个子文件)
search-master
mvnw.cmd 5KB
.travis.yml 80B
pom.xml 4KB
src
test
java
com
apdplat
demo
test
JsoupParseDemo.java 624B
main
resources
logback.xml 1KB
java
org
apdplat
search
AbstractBaiduSearcher.java 3KB
person
PersonCollector.java 8KB
Person.java 2KB
SearchResult.java 2KB
Tools.java 3KB
TextExtract.java 5KB
BaiduSearcher.java 2KB
paper
YZWBPaperCollector.java 5KB
AbstractPaperCollector.java 4KB
CTDSBPaperCollector.java 4KB
PaperCollector.java 1KB
RMRBPaperCollector.java 4KB
JRZBPaperCollector.java 3KB
XHRBPaperCollector.java 4KB
XXSBPaperCollector.java 6KB
JJWBPaperCollector.java 3KB
YCWBPaperCollector.java 3KB
Searcher.java 940B
JSoupBaiduSearcher.java 7KB
GoogleSearcher.java 894B
Webpage.java 1KB
util
baidu
JsoupBaiduInfoUtil.java 3KB
GoogleAjaxSearcher.java 6KB
mvnw 7KB
共 29 条
- 1
资源评论
博士僧小星
- 粉丝: 1745
- 资源: 5850
上传资源 快速赚钱
- 我的内容管理 展开
- 我的资源 快来上传第一个资源
- 我的收益 登录查看自己的收益
- 我的积分 登录查看自己的积分
- 我的C币 登录后查看C币余额
- 我的收藏
- 我的下载
- 下载帮助
安全验证
文档复制为VIP权益,开通VIP直接复制
信息提交成功