package com.gtja.util;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import com.gargoylesoftware.htmlunit.BrowserVersion;
import com.gargoylesoftware.htmlunit.ProxyConfig;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.html.DomNode;
import com.gargoylesoftware.htmlunit.html.HtmlDivision;
import com.gargoylesoftware.htmlunit.html.HtmlPage;
import com.gargoylesoftware.htmlunit.html.HtmlTable;
import com.gargoylesoftware.htmlunit.html.HtmlTableCell;
import com.gargoylesoftware.htmlunit.html.HtmlTableRow;
import com.gargoylesoftware.htmlunit.javascript.host.dom.Document;
import com.gargoylesoftware.htmlunit.javascript.host.dom.Node;
import com.gtja.bean.QsztBean;
import com.gtja.bean.SmjjBean;
import com.gtja.bean.SmjjGlrBean;
public class PageMessage {
/**
* 私募基金管理人分类公示
* @param smjglr
* @return
*/
public static SmjjGlrBean SmjjGlrMessage(SmjjGlrBean smjglr){
// TODO Auto-generated method stub
WebClient wc = new WebClient(BrowserVersion.FIREFOX_38);
wc.getOptions().setUseInsecureSSL(true);
wc.getOptions().setJavaScriptEnabled(false); // 启用JS解释器,默认为true
wc.getOptions().setCssEnabled(false); // 禁用css支持
wc.getOptions().setThrowExceptionOnScriptError(false); // js运行错误时,是否抛出异常
wc.getOptions().setTimeout(10000); // 设置连接超时时间 ,这里是10S。如果为0,则无限期等待
wc.getOptions().setDoNotTrackEnabled(false);
//服务器代理
ProxyConfig pro =wc.getOptions().getProxyConfig();
pro.setProxyHost("10.88.100.100");
pro.setProxyPort(8080);
HtmlPage page = null;
try {
page = wc.getPage("http://gs.amac.org.cn/amac-infodisc/res/pof/manager/"+smjglr.getUrl()+"");
String url = page.getUrl().toString();
smjglr.setUrl(url);
//分析最终的页面信息
HtmlTable table=(HtmlTable) page.getDocumentElement().getElementsByTagName("table").get(0);
//基金管理人全称
HtmlDivision allName = (HtmlDivision) page.getElementById("complaint1");
smjglr.setFullName(Template.trimVal(allName.asText()).trim());
for(HtmlTableRow row:table.getRows()){ // 行
for(HtmlTableCell cell:row.getCells()){ // 列
if("机构诚信信息:".equals(cell.asText())){
String cxm = cell.getNextElementSibling().asText();
smjglr.setCxm(Template.trimVal(cxm));
}else if("基金管理人全称(英文):".equals(cell.asText())){
String enName = cell.getNextElementSibling().asText();
smjglr.setEnName(Template.trimVal(enName));
}else if("登记编号:".equals(cell.asText())){
String regiNumber = cell.getNextElementSibling().asText();
smjglr.setRegiNumber(Template.trimVal(regiNumber));
}else if("组织机构代码:".equals(cell.asText())){
String meCode = cell.getNextElementSibling().asText();
smjglr.setMeCode(Template.trimVal(meCode));
}else if("登记时间:".equals(cell.asText())){
String regiDate = cell.getNextElementSibling().asText();
smjglr.setRegiDate(Template.trimVal(regiDate));
}else if("成立时间:".equals(cell.asText())){
String creDate = cell.getNextElementSibling().asText();
smjglr.setCreDate(Template.trimVal(creDate));
}else if("注册地址:".equals(cell.asText())){
String regiAddress = cell.getNextElementSibling().asText();
smjglr.setRegiAddress(Template.trimVal(regiAddress));
}else if("办公地址:".equals(cell.asText())){
String offAddress = cell.getNextElementSibling().asText();
smjglr.setOffAddress(Template.trimVal(offAddress));
}else if("注册资本(万元)(人民币):".equals(cell.asText())){
String regiCapital = cell.getNextElementSibling().asText();
smjglr.setRegiCapital(Template.trimVal(regiCapital));
}else if("实缴资本(万元)(人民币):".equals(cell.asText())){
String paiCapital = cell.getNextElementSibling().asText();
smjglr.setPaiCapital(Template.trimVal(paiCapital));
}else if("企业性质:".equals(cell.asText())){
String natPrise = cell.getNextElementSibling().asText();
smjglr.setNatPrise(Template.trimVal(natPrise));
}else if("注册资本实缴比例:".equals(cell.asText())){
String proRegiCapital = cell.getNextElementSibling().asText();
smjglr.setProRegiCapital(Template.trimVal(proRegiCapital));
}else if("机构类型:".equals(cell.asText())){
String meanType = cell.getNextElementSibling().asText();
smjglr.setMeanType(Template.trimVal(meanType));
}else if("业务类型:".equals(cell.asText())){
String busiType = cell.getNextElementSibling().asText();
smjglr.setBusiType(Template.trimVal(busiType));
}else if("员工人数:".equals(cell.asText())){
String employNumber = cell.getNextElementSibling().asText();
smjglr.setEmployNumber(Template.trimVal(employNumber));
}else if("机构网址:".equals(cell.asText())){
String insWebsite = cell.getNextElementSibling().asText();
smjglr.setInsWebsite(Template.trimVal(insWebsite));
}else if("是否为会员:".equals(cell.asText())){
String isMember = cell.getNextElementSibling().asText();
smjglr.setIsMember(Template.trimVal(isMember));
}else if("当前会员类型:".equals(cell.asText())){
String memberType = cell.getNextElementSibling().asText();
smjglr.setMemberType(Template.trimVal(memberType));
}else if("入会时间:".equals(cell.asText())){
String admissTime = cell.getNextElementSibling().asText();
smjglr.setAdmissTime(Template.trimVal(admissTime));
}else if("法律意见书状态:".equals(cell.asText())){
String legionStatus = cell.getNextElementSibling().asText();
smjglr.setLegionStatus(Template.trimVal(legionStatus));
}else if("律师事务所名称:".equals(cell.asText())){
String swsName = cell.getNextElementSibling().asText();
smjglr.setSwsName(Template.trimVal(swsName));
}else if("律师姓名:".equals(cell.asText())){
String lsName = cell.getNextElementSibling().asText();
smjglr.setLsName(Template.trimVal(lsName));
}else if("法定代表人/执行事务合伙人(委派代表)姓名:".equals(cell.asText())){
String legalRepre = cell.getNextElementSibling().asText();
smjglr.setLegalRepre(Template.trimVal(legalRepre));
}else if("是否有从业资格:".equals(cell.asText())){
String isQualifition = cell.getNextElementSibling().asText();
smjglr.setIsQualifition(Template.trimVal(isQualifition));
}else if("资格取得方式:".equals(cell.asText())){
String eligMethod = cell.getNextElementSibling().asText();
smjglr.setEligMethod(Template.trimVal(eligMethod));
}else if("机构信息最后更新时间:".equals(cell.asText())){
String laupTime = cell.getNextElementSibling().asText();
smjglr.setLaupTime(Template.trimVal(laupTime));
}else if("特别提示信息:".equals(cell.asText())){
String specInformation = cell.getNextElementSibling().asT
没有合适的资源?快使用搜索试试~ 我知道了~
温馨提示
高效的 Java 爬虫,内附源代码和 SQL 数据表,通过 main 方法启动,基于 JDK 1.8。包含 HtmlUnit 的各种获取标签的方法,避免了 jsoup 无法抓取由 JS 代码生成的数据内容的弊端,也避免了 client 无法一次性获取大量信息的弊端。如果有高手能够在静态页面形成之前获取对方的数据内容,欢迎借阅指导。
资源推荐
资源详情
资源评论
收起资源包目录
高级爬虫进阶:HtmlUnit+多线程+消息队列快速抓取大量信息数据 (138个子文件)
111sql 8KB
PageMessage.class 14KB
AddDataDaoImpl.class 13KB
SmjjGlrBean.class 12KB
SmjjGlrBean.class 12KB
PageMessage.class 11KB
AddDataDaoImpl.class 11KB
Template.class 8KB
Template.class 8KB
Test.class 6KB
AutoSmjjAction.class 6KB
AutoGlrAction.class 6KB
AutoQhzgAction.class 6KB
AutoSmjjAction.class 6KB
AutoGlrAction.class 6KB
AutoQsztAction.class 5KB
AutoSmjjWebIntercepter.class 5KB
AutoSmjjWebIntercepter.class 5KB
AutoGlrWebIntercepter.class 5KB
SmjjBean.class 4KB
AutoQszgjhAction.class 4KB
SmjjBean.class 4KB
AutoGlrWebIntercepter.class 4KB
AutoQsztWebIntercepter.class 4KB
QsztBean.class 4KB
AutoQhzgWebIntercepter.class 4KB
AutoQszgjhWebIntercepter.class 3KB
QsztBean.class 3KB
QszgjhBean.class 3KB
AutoGlrWebIntercepter$QueThread.class 3KB
AutoSmjjWebIntercepter$QueThread.class 3KB
AutoGlrWebIntercepter$QueThread.class 3KB
AutoSmjjWebIntercepter$QueThread.class 2KB
AutoQsztWebIntercepter$QueThread.class 2KB
QhzgBean.class 2KB
DBConnection.class 2KB
DBConnection.class 1KB
AddDataDao.class 551B
AddDataDao.class 512B
.classpath 842B
.classpath 842B
org.eclipse.wst.common.component 482B
org.eclipse.wst.common.component 476B
org.eclipse.wst.jsdt.ui.superType.container 49B
org.eclipse.wst.jsdt.ui.superType.container 49B
xalan-2.7.2.jar 3.01MB
xalan-2.7.2.jar 3.01MB
ojdbc14_g.jar 1.84MB
ojdbc14_g.jar 1.84MB
htmlunit-2.20.jar 1.76MB
htmlunit-2.20.jar 1.76MB
xercesImpl-2.11.0.jar 1.3MB
xercesImpl-2.11.0.jar 1.3MB
htmlunit-core-js-2.17.jar 1.04MB
htmlunit-core-js-2.17.jar 1.04MB
httpclient-4.5.2.jar 719KB
httpclient-4.5.2.jar 719KB
commons-lang3-3.4.jar 424KB
commons-lang3-3.4.jar 424KB
fastjson-1.2.20.jar 404KB
fastjson-1.2.20.jar 404KB
cssparser-0.9.18.jar 364KB
cssparser-0.9.18.jar 364KB
jetty-util-9.2.15.v20160210.jar 360KB
jetty-util-9.2.15.v20160210.jar 360KB
httpcore-4.4.4.jar 319KB
httpcore-4.4.4.jar 319KB
commons-codec-1.10.jar 278KB
commons-codec-1.10.jar 278KB
serializer-2.7.2.jar 270KB
serializer-2.7.2.jar 270KB
xml-apis-1.4.01.jar 215KB
xml-apis-1.4.01.jar 215KB
commons-io-2.4.jar 181KB
commons-io-2.4.jar 181KB
websocket-common-9.2.15.v20160210.jar 176KB
websocket-common-9.2.15.v20160210.jar 176KB
nekohtml-1.9.22.jar 122KB
nekohtml-1.9.22.jar 122KB
jetty-io-9.2.15.v20160210.jar 106KB
jetty-io-9.2.15.v20160210.jar 106KB
commons-logging-1.2.jar 60KB
commons-logging-1.2.jar 60KB
websocket-api-9.2.15.v20160210.jar 43KB
websocket-api-9.2.15.v20160210.jar 43KB
httpmime-4.5.2.jar 40KB
httpmime-4.5.2.jar 40KB
websocket-client-9.2.15.v20160210.jar 35KB
websocket-client-9.2.15.v20160210.jar 35KB
sac-1.3.jar 15KB
sac-1.3.jar 15KB
PageMessage.java 19KB
AddDataDaoImpl.java 15KB
PageMessage.java 14KB
AddDataDaoImpl.java 13KB
SmjjGlrBean.java 11KB
SmjjGlrBean.java 11KB
Template.java 10KB
Template.java 10KB
AutoGlrAction.java 8KB
共 138 条
- 1
- 2
资源评论
- weixin_40244410 2018-04-27 嗯嗯,很不错的 Java 爬虫程序。表结构都是现成的,环境配置好就能启动,跑得很快,很牛。学习了,感谢分享!要是可以把二次请求转成 JSON,效率又会高很多,不过貌似难度会更大。希望博主留个微信,交流一下经验。
- luozhangwen 2019-05-15 资源不错,可以运行
qq_38685186
- 粉丝: 0
- 资源: 4
上传资源 快速赚钱
- 我的内容管理 展开
- 我的资源 快来上传第一个资源
- 我的收益 登录查看自己的收益
- 我的积分 登录查看自己的积分
- 我的C币 登录后查看C币余额
- 我的收藏
- 我的下载
- 下载帮助
安全验证
文档复制为VIP权益,开通VIP直接复制
信息提交成功