/*本版本可以实现输入url即抓取,速度比较理想
*
*/
import java.io.BufferedReader;
import java.io.DataOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Scanner;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.http.HttpEntity;
import org.apache.http.HttpHost;
import org.apache.http.HttpResponse;
import org.apache.http.auth.AuthScope;
import org.apache.http.auth.UsernamePasswordCredentials;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.conn.params.ConnRouteParams;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.util.EntityUtils;
import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.PrototypicalNodeFactory;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.tags.Div;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.tags.PTag;
import org.htmlparser.util.NodeList;
/**
 * Fetches a news page by URL and extracts the article body text.
 *
 * <p>The extraction works by collecting the page's {@code <p>} tags. For sites whose article
 * text is not inside {@code <p>} tags, a different tag class can be registered with the parser
 * to recognise the relevant markup instead.
 *
 * <p>Original notice: all rights reserved; copying is not pursued, but these comments must be
 * kept when modifying this program. For learning and exchange purposes only.
 *
 * @author JacobWolf
 * Create Data: 2014-5-8
 * Email: 642268474@qq.com
 */
public class Grab3
{
    /** Charset name used for the HTML parser and for every byte/String conversion here. */
    private static final String ENCODING = "utf-8";

    /** Proxy host used by {@link #getHttpClient()}. */
    private static final String DEFAULT_PROXY_HOST = "14.29.117.37";

    /** Proxy port used by {@link #getHttpClient()}. */
    private static final int DEFAULT_PROXY_PORT = 80;

    /**
     * Matches a {@code <p ...>...</p>} (or {@code br}-delimited) fragment. No DOTALL flag is set,
     * so {@code .} does not cross line terminators and only single-line fragments match.
     */
    private static final Pattern PARAGRAPH_PATTERN =
            Pattern.compile("<(p|P|(.*?)br(.*?))(.*?)>(.*?)<(/p|/P|(.*?)br(.*?))>");

    /** Matches named character entities such as {@code &nbsp;}. */
    private static final Pattern ENTITY_PATTERN = Pattern.compile("&\\w*\\s*;");

    /** Matches any markup tag. */
    private static final Pattern TAG_PATTERN = Pattern.compile("<[^>]*>");

    /** URL supplied at construction time; no method of this class reads it. */
    private final String URL;

    /**
     * Fetches the page at {@code url} and prints its extracted body text to standard output.
     *
     * <p>Failures are reported on standard output / standard error and never propagate.
     *
     * @param url address of the news page to fetch
     */
    public static synchronized void getContentFormUrl(String url)
    {
        fetchContent(url);
    }

    /**
     * Fetches the page at {@code url} and returns its extracted body text.
     *
     * <p>The text is also printed to standard output by {@link #clearUnvalidWorld(String)}.
     *
     * @param url address of the news page to fetch
     * @return the extracted text, or {@code null} when the request or the extraction failed
     */
    public static synchronized String fetchContent(String url)
    {
        /* Create the HTTP client; it is always shut down in the finally block. */
        HttpClient client = getHttpClient();
        try
        {
            // Built inside the try so that a malformed URL is reported like any other failure.
            HttpGet getHttp = new HttpGet(url);
            HttpResponse response = client.execute(getHttp);
            HttpEntity entity = response.getEntity();
            if (entity == null)
            {
                // Nothing to parse; previously this ended in a NullPointerException.
                System.out.println("响应失败");
                return null;
            }
            /* Convert to text; ENCODING is only the fallback when the response names no charset. */
            String content = EntityUtils.toString(entity, ENCODING);
            System.out.println("成功响应");
            byte[] result = getPTag(content);
            // Decode with the same charset getPTag encoded with.
            return clearUnvalidWorld(new String(result, ENCODING));
        } catch (IOException e)
        {
            // Also covers ClientProtocolException, which is an IOException.
            System.out.println("响应失败");
            e.printStackTrace();
            return null;
        } catch (Exception e)
        {
            System.out.println("响应异常");
            e.printStackTrace();
            return null;
        } finally
        {
            client.getConnectionManager().shutdown();
        }
    }

    /**
     * Extracts the body of a page: the HTML of every {@code PTag} node, concatenated in
     * document order.
     *
     * @param data page HTML; double quotes are replaced by single quotes before parsing
     * @return the concatenated paragraph HTML encoded as UTF-8 bytes; empty for {@code null} input
     * @throws Exception if the parser rejects the input
     */
    public static synchronized byte[] getPTag(String data) throws Exception
    {
        if (data == null)
        {
            return new byte[0];
        }
        Parser parser = new Parser();
        parser.setEncoding(ENCODING);
        parser.setInputHTML(data.replace('"', '\''));
        // Register the custom tag so the parser produces PTag nodes.
        PrototypicalNodeFactory factory = new PrototypicalNodeFactory();
        factory.registerTag(new PTag());
        parser.setNodeFactory(factory);
        NodeFilter filter = new NodeFilter() {
            @Override
            public boolean accept(Node node) {
                return node instanceof PTag;
            }
        };
        NodeList paragraphs = parser.extractAllNodesThatMatch(filter);
        StringBuilder html = new StringBuilder();
        for (int i = 0; i < paragraphs.size(); i++)
        {
            html.append(paragraphs.elementAt(i).toHtml());
        }
        // Explicit charset: the caller decodes these bytes as UTF-8, so the platform default
        // must not be used here.
        return html.toString().getBytes(ENCODING);
    }

    /**
     * Creates an HTTP client routed through the default proxy.
     *
     * @return a new client; the caller is responsible for shutting it down
     */
    public static HttpClient getHttpClient() {
        return getHttpClient(DEFAULT_PROXY_HOST, DEFAULT_PROXY_PORT);
    }

    /**
     * Creates an HTTP client routed through the given proxy, registering empty credentials
     * for it.
     *
     * @param proxyHost proxy host name or IP address
     * @param proxyPort proxy port
     * @return a new client; the caller is responsible for shutting it down
     */
    public static HttpClient getHttpClient(String proxyHost, int proxyPort) {
        DefaultHttpClient httpClient = new DefaultHttpClient();
        String userName = ""; // empty user name and password are sufficient
        String password = "";
        httpClient.getCredentialsProvider().setCredentials(
                new AuthScope(proxyHost, proxyPort),
                new UsernamePasswordCredentials(userName, password));
        HttpHost proxy = new HttpHost(proxyHost, proxyPort);
        httpClient.getParams().setParameter(ConnRouteParams.DEFAULT_PROXY, proxy);
        return httpClient;
    }

    /**
     * Filters out markup: keeps only the paragraph fragments of {@code string}, removes named
     * character entities, and strips the tags.
     *
     * <p>Side effect: prints the text with tags removed (no separator) to standard output.
     *
     * @param string paragraph HTML; {@code null} is treated as empty
     * @return the kept fragments with every tag replaced by a single space
     */
    public static String clearUnvalidWorld(String string)
    {
        StringBuilder kept = new StringBuilder();
        if (string != null)
        {
            Matcher matcher = PARAGRAPH_PATTERN.matcher(string);
            while (matcher.find())
            {
                kept.append(matcher.group(0));
            }
        }
        String res = ENTITY_PATTERN.matcher(kept).replaceAll("");
        System.out.println(TAG_PATTERN.matcher(res).replaceAll(""));
        return TAG_PATTERN.matcher(res).replaceAll(" ");
    }

    /**
     * Reads one URL from standard input and prints the extracted body text of that page.
     *
     * @param args unused
     */
    public static void main(String[] args){
        Scanner scanner = new Scanner(System.in);
        try
        {
            System.out.print("请输入页面url:");
            if (!scanner.hasNextLine())
            {
                // Standard input was closed without a line; nothing to fetch.
                return;
            }
            // Trim so that stray whitespace around a pasted URL does not invalidate it.
            String url = scanner.nextLine().trim();
            System.out.println("抓取中,请稍后......");
            getContentFormUrl(url);
        } finally
        {
            scanner.close();
        }
    }

    /**
     * Creates an instance remembering {@code url}.
     *
     * @param url page URL to store
     */
    public Grab3(String url){
        URL=url;
    }
}
//测试url以及特征码
//http://in.reuters.com/article/2014/05/09/sensex-nifty-india-markets-idINKBN0DP05420140509
//id="articleText"
//http://hereisthecity.com/en-gb/2014/05/09/barclays-bank-boss-has-a-coherent-plan-at-last/
//class="post-content-container"
//http://www.investing.com/analysis/zinc-may-witness-selling-recommended-at-higher-levels-212260 该网站有防爬虫功能,暂无法抓取
//class="arial_14 clear WYSIWYG" itemprop="articleBody" id="contentSection"
//http://www.insidefutures.com/article/1210705/EURUSD%20Bearish%20Outside%20Pattern%20Top%20Back%20from%201.4000.html
//id="articleBody"
//http://www.bdlive.co.za/markets/2014/05/09/asian-shares-pare-losses-after-china-releases-tame-inflation-report
//class="articlebody"
//http://www.thehindubusinessline.com/markets/commodities/crude-oil-futures-up-at-rs-6032-per-barrel/article5992603.ece
//class="article-text "
//http://afkinsider.com/55386/afki-commodities-report-still-deal-south-africas-platinum-mine-strike/
//class="article-frame sub"
//http://www.logisticsmgmt.com/article/aar_reports_annual_carload_and_intermodal_gains_for_april_2014
//id="articleLeft"
//http://www.ft.com/intl/cms/s/0/5da624ca-d6ba-11e3-b251-00144feabdc0.html#axzz31Cb2vfrO 这个网站有浏览次数限制
//id="storyContent"
//http://economictimes.indiatimes.com/news/international/business/great-moderation-2-0-could-the-stabilising-world-economy-result-in-an-economic-expansion-that-lasts-a-very-long-time/articleshow/34845212.cms
//class="artText"
//http://www.aysor.am/en/news/2014/05/08/serj-sargsyan-putin-handipum/
//id="conti"
//http://www.weeklytimesnow.com.au/commodities/livestock-sales/shepparton-may-9-5400-top/story-fnker6hx-1226912082607
//class="story-body lead-media-large"
//http://in.reuters.com/article/2014/05/09/usa-obama-energy-idINL2N0NV1CY20140509
//id="articleText"
// (web-page residue, not source code) 评论9
// (web-page residue, not source code) 最新资源