package org.qing.searchengine.webcollector;
import java.io.DataOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.net.URI;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.regex.Pattern;
import org.apache.http.Consts;
import org.apache.http.Header;
import org.apache.http.HttpEntity;
import org.apache.http.NameValuePair;
import org.apache.http.client.HttpClient;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.util.EntityUtils;
/**
 * Downloads a web page over HTTP and stores its text in a local file.
 *
 * @author Qing
 */
public class FileDownLoader {

    /** Directory (relative to the working directory) that downloaded pages are written to. */
    private static final String DOWNLOAD_DIR = "temp/";

    /** Matches a leading URL scheme such as {@code http://} or {@code https://}. */
    private static final Pattern SCHEME_PREFIX = Pattern.compile("^[A-Za-z][A-Za-z0-9+.-]*://");

    /** Characters that must not appear in a file name; each one is replaced by {@code _}. */
    private static final Pattern ILLEGAL_FILE_CHARS = Pattern.compile("[\\\\?/:*|<>\"]");

    /**
     * Builds the local file name for a page from its URL and content type.
     *
     * <p>The URL scheme is dropped and every character that is illegal in a file name is
     * replaced by {@code _}. For non-HTML content the media subtype is appended as the file
     * extension (for example {@code image/png} yields {@code .png}); parameters such as
     * {@code ; charset=utf-8} are ignored.
     *
     * @param url the page URL; must not be {@code null}
     * @param contentType the value of the {@code Content-Type} header; may be {@code null} or
     *     blank, in which case no extension is appended
     * @return the sanitized file name (without any directory part)
     * @throws NullPointerException if {@code url} is {@code null}
     */
    public String getFileNameByUrl(String url, String contentType) {
        // Strip whatever scheme is present instead of blindly cutting 7 characters,
        // which only worked for "http://".
        String withoutScheme = SCHEME_PREFIX.matcher(url).replaceFirst("");
        String baseName = ILLEGAL_FILE_CHARS.matcher(withoutScheme).replaceAll("_");

        String mediaType = mediaTypeOf(contentType);
        // Media types are case-insensitive, so compare in lower case.
        if (mediaType.isEmpty() || mediaType.toLowerCase(Locale.ROOT).contains("html")) {
            return baseName;
        }
        String subtype = mediaType.substring(mediaType.lastIndexOf('/') + 1);
        if (subtype.isEmpty()) {
            return baseName;
        }
        // The header comes from the remote server, so sanitize it like the URL.
        return baseName + "." + ILLEGAL_FILE_CHARS.matcher(subtype).replaceAll("_");
    }

    /**
     * Writes the page text to a local file, encoded as UTF-8.
     *
     * <p>Missing parent directories are created. Failures are reported on standard error and
     * otherwise ignored, so this method never throws.
     *
     * @param data the page text to store
     * @param filePath the path of the file to create or overwrite
     */
    public void saveToLocal(String data, String filePath) {
        if (data == null || filePath == null) {
            System.err.println("Nothing saved: data and filePath must both be non-null");
            return;
        }
        try {
            writeUtf8(data, filePath);
        } catch (IOException e) {
            System.err.println("Failed to save " + filePath);
            e.printStackTrace();
        }
    }

    /**
     * Downloads the page at the given URL into the {@code temp/} directory.
     *
     * @param url the URL to fetch
     * @return the path of the saved file, or {@code null} if the download or the save failed
     */
    public String downloadFile(String url) {
        String filePath = null;
        // try-with-resources releases the connection and the client even on failure.
        try (CloseableHttpClient httpclient = HttpClients.createDefault();
                CloseableHttpResponse response = httpclient.execute(new HttpGet(url))) {
            HttpEntity entity = response.getEntity();
            if (entity == null) {
                System.err.println("No response body for " + url);
                return null;
            }
            // The Content-Type header is optional; pass null when it is absent.
            Header contentType = response.getFirstHeader("Content-Type");
            String fileName =
                    getFileNameByUrl(url, contentType == null ? null : contentType.getValue());
            String body = EntityUtils.toString(entity);
            String target = DOWNLOAD_DIR + fileName;
            writeUtf8(body, target);
            // Only report a path once the file has actually been written.
            filePath = target;
        } catch (Exception e) {
            System.err.println("Failed to download " + url);
            e.printStackTrace();
        }
        return filePath;
    }

    /** Returns the media type part of a Content-Type value (parameters removed), or "" if absent. */
    private static String mediaTypeOf(String contentType) {
        if (contentType == null) {
            return "";
        }
        int paramStart = contentType.indexOf(';');
        String mediaType = paramStart >= 0 ? contentType.substring(0, paramStart) : contentType;
        return mediaType.trim();
    }

    /** Writes text to the given path as plain UTF-8, creating parent directories as needed. */
    private static void writeUtf8(String data, String filePath) throws IOException {
        File target = new File(filePath);
        File parent = target.getParentFile();
        if (parent != null && !parent.mkdirs() && !parent.isDirectory()) {
            throw new IOException("Cannot create directory " + parent);
        }
        // A plain Writer is used rather than DataOutputStream.writeUTF, which prepends a
        // 2-byte length and rejects strings longer than 65535 encoded bytes.
        try (Writer out =
                new OutputStreamWriter(new FileOutputStream(target), StandardCharsets.UTF_8)) {
            out.write(data);
        }
    }

    public static void main(String[] args) {
        FileDownLoader downloader = new FileDownLoader();
        downloader.downloadFile("http://club.xdnice.com/thread-1400344-1-1.html");
    }
}
没有合适的资源?快使用搜索试试~ 我知道了~
利用HttpClient和HtmlParser实现的简单爬虫(Java)
共42个文件
jar:16个
class:8个
java:5个
4星 · 超过85%的资源 需积分: 9 59 下载量 33 浏览量
2016-04-20
11:31:39
上传
评论 1
收藏 3.1MB RAR 举报
温馨提示
利用HttpClient和HtmlParser实现的简单爬虫(Java)
资源推荐
资源详情
资源评论
收起资源包目录
SearchEngine.rar (42个子文件)
SearchEngine
.project 388B
src
org
qing
searchengine
webcollector
Clawler.java 2KB
FileDownLoader.java 3KB
LinkFilter.java 205B
LinkDB.java 1KB
HtmlParserTool.java 2KB
lib
htmllexer.jar 68KB
sax2.jar 35KB
httpmime-4.5.2.jar 40KB
htmlparser.jar 281KB
junit.jar 118KB
httpclient-win-4.5.2.jar 17KB
httpclient-cache-4.5.2.jar 155KB
fluent-hc-4.5.2.jar 31KB
httpclient-4.5.2.jar 719KB
filterbuilder.jar 66KB
jna-4.1.0.jar 893KB
thumbelina.jar 32KB
jna-platform-4.1.0.jar 1.4MB
commons-logging-1.2.jar 60KB
httpcore-4.4.4.jar 319KB
commons-codec-1.9.jar 258KB
.classpath 1KB
bin
org
qing
searchengine
webcollector
Clawler.class 2KB
HtmlParserTool$2.class 795B
HtmlParserTool.class 3KB
Clawler$1.class 900B
FileDownLoader.class 4KB
LinkFilter.class 177B
LinkDB.class 2KB
HtmlParserTool$1.class 933B
temp
club.xdnice.com_thread-1399675-1-1.html 22KB
club.xdnice.com_home.php_mod=magic&mid=namepost&idtype=pid&id=11258287_1400344 5KB
club.xdnice.com_forum.php_gid=640 10KB
club.xdnice.com_thread-1400514-1-1.html 8KB
club.xdnice.com_home.php_mod=spacecp&ac=usergroup&gid=52 5KB
club.xdnice.com_plugin.php_id=dsu_paulsign_sign 3KB
club.xdnice.com_# 10KB
club.xdnice.com_forum.php_mod=viewthread&tid=1400344&extra=page%3D1&ordertype=2#comiis_allreplies 4KB
club.xdnice.com_forum.php_mod=misc&action=recommend&do=add&tid=1400344&hash=47b5ca08 4KB
club.xdnice.com_thread-1397868-1-1.html 30KB
club.xdnice.com_thread-1400344-1-1.html 23KB
共 42 条
- 1
资源评论
- 晓呆同学2017-10-09学习了,谢谢分享
Q1n6
- 粉丝: 104
- 资源: 2
上传资源 快速赚钱
- 我的内容管理 展开
- 我的资源 快来上传第一个资源
- 我的收益 登录查看自己的收益
- 我的积分 登录查看自己的积分
- 我的C币 登录后查看C币余额
- 我的收藏
- 我的下载
- 下载帮助
安全验证
文档复制为VIP权益,开通VIP直接复制
信息提交成功