package com.sohu;
import com.sohu.bean.NewsBean;
import com.sohu.db.ConnectionManager;
import java.util.ArrayList;
import java.util.List;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.beans.StringBean;
import org.htmlparser.filters.AndFilter;
import org.htmlparser.filters.HasAttributeFilter;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.tags.Div;
import org.htmlparser.tags.HeadingTag;
import org.htmlparser.tags.Span;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
import java.sql.PreparedStatement;
import java.sql.SQLException;
/**
* 用于对搜狐网站上的新闻进行抓取
* @author guanminglin <guanminglin@gmail.com>
*/
public class SohuNews {
// HTML parser used to analyse fetched web pages.
private Parser parser = null;
// Temporary buffer for crawled news items.
private List newsList = new ArrayList();
// Bean holding the news item currently being processed.
private NewsBean bean = new NewsBean();
// Database connection manager (created per saveToDB call).
private ConnectionManager manager = null;
// Prepared statement reused by saveToDB(); closed in its finally block.
private PreparedStatement pstmt = null;
/** Creates a crawler; all state is initialised by the field declarations. */
public SohuNews() {
}
/**
 * Flattens a news bean into a list of its four text fields, in the
 * order: title, author, content, date.
 *
 * @param newsBean bean carrying the news data; must not be null
 * @return a mutable list containing title, author, content and date
 */
public List getNewsList(final NewsBean newsBean) {
    // Typed list internally; the raw return type is kept so existing
    // callers compile unchanged.
    List<String> list = new ArrayList<String>();
    list.add(newsBean.getNewsTitle());
    list.add(newsBean.getNewsAuthor());
    list.add(newsBean.getNewsContent());
    list.add(newsBean.getNewsDate());
    return list;
}
/**
 * Populates the internal news bean with the supplied values.
 * The five setters are independent, so the order of assignment
 * does not matter.
 *
 * @param newsTitle   news headline
 * @param newsauthor  responsible editor / author
 * @param newsContent body text of the article
 * @param newsDate    publication date
 * @param url         link the article was fetched from
 */
public void setNews(String newsTitle, String newsauthor, String newsContent, String newsDate, String url) {
    bean.setNewsURL(url);
    bean.setNewsDate(newsDate);
    bean.setNewsContent(newsContent);
    bean.setNewsAuthor(newsauthor);
    bean.setNewsTitle(newsTitle);
}
/**
 * Persists the current news bean asynchronously so the caller is not
 * blocked by the database round trip.
 */
protected void newsToDataBase() {
    Runnable insertTask = new Runnable() {
        public void run() {
            boolean result = saveToDB(bean);
            // NOTE(review): saveToDB returns pstmt.execute(), which is
            // false for a successful INSERT, so a true result here is
            // treated as failure — confirm against saveToDB's contract.
            if (result) {
                System.out.println("插入数据失败");
            }
        }
    };
    new Thread(insertTask).start();
}
/**
 * Inserts one news item into the {@code news} table.
 *
 * <p>Return semantics are historical and kept for compatibility:
 * {@code PreparedStatement.execute()} returns {@code false} for an
 * INSERT, so a {@code false} result indicates the row was written,
 * while {@code true} is returned when the title is too long, when an
 * SQLException occurred, or (oddly) never by a successful insert.
 *
 * @param bean news item to persist; title, author, content, URL and
 *             date are bound to the statement parameters
 * @return the raw {@code execute()} result, or {@code true} when the
 *         insert was skipped or failed (see above)
 */
public boolean saveToDB(NewsBean bean) {
    boolean flag = true;
    String sql = "insert into news(newstitle,newsauthor,newscontent,newsurl,newsdate) values(?,?,?,?,?)";
    manager = new ConnectionManager();
    String title = bean.getNewsTitle();
    // Skip items whose title exceeds the column limit.
    if (title.length() > 60) {
        return flag;
    }
    try {
        pstmt = manager.getConnection().prepareStatement(sql);
        pstmt.setString(1, bean.getNewsTitle());
        pstmt.setString(2, bean.getNewsAuthor());
        pstmt.setString(3, bean.getNewsContent());
        pstmt.setString(4, bean.getNewsURL());
        pstmt.setString(5, bean.getNewsDate());
        flag = pstmt.execute();
    } catch (SQLException ex) {
        Logger.getLogger(SohuNews.class.getName()).log(Level.SEVERE, null, ex);
    } finally {
        try {
            // pstmt is null if getConnection()/prepareStatement() threw
            // on the first call; guard against the NPE that would
            // otherwise mask the original failure.
            if (pstmt != null) {
                pstmt.close();
            }
            manager.close();
        } catch (SQLException ex) {
            Logger.getLogger(SohuNews.class.getName()).log(Level.SEVERE, null, ex);
        }
    }
    return flag;
}
/**
 * Extracts the news headline from the page.
 * When the filter matches several heading tags, the text of the last
 * match wins.
 *
 * @param titleFilter filter selecting the heading tag(s)
 * @param parser      parser positioned on the target page
 * @return the headline text, or "" when nothing matched or parsing failed
 */
private String getTitle(NodeFilter titleFilter, Parser parser) {
    String titleName = "";
    try {
        NodeList headings = (NodeList) parser.parse(titleFilter);
        int count = headings.size();
        for (int index = 0; index < count; index++) {
            HeadingTag heading = (HeadingTag) headings.elementAt(index);
            titleName = heading.getStringText();
        }
    } catch (ParserException ex) {
        Logger.getLogger(SohuNews.class.getName()).log(Level.SEVERE, null, ex);
    }
    return titleName;
}
/**
 * Extracts the responsible editor (author) of the article.
 * If the filter matches several div tags, the last match wins.
 *
 * @param newsauthorFilter filter selecting the author div
 * @param parser           parser positioned on the target page
 * @return the author text, or "" when nothing matched or parsing failed
 */
private String getNewsAuthor(NodeFilter newsauthorFilter, Parser parser) {
    String newsAuthor = "";
    try {
        NodeList matches = (NodeList) parser.parse(newsauthorFilter);
        int total = matches.size();
        for (int pos = 0; pos < total; pos++) {
            Div authorDiv = (Div) matches.elementAt(pos);
            newsAuthor = authorDiv.getStringText();
        }
    } catch (ParserException ex) {
        Logger.getLogger(SohuNews.class.getName()).log(Level.SEVERE, null, ex);
    }
    return newsAuthor;
}
/**
 * Extracts the publication date of the article.
 * If the filter matches several span tags, the last match wins.
 *
 * @param dateFilter filter selecting the date span
 * @param parser     parser positioned on the target page
 * @return the date text, or {@code null} when nothing matched or parsing failed
 */
private String getNewsDate(NodeFilter dateFilter, Parser parser) {
    String newsDate = null;
    try {
        NodeList spans = (NodeList) parser.parse(dateFilter);
        int idx = 0;
        while (idx < spans.size()) {
            Span dateSpan = (Span) spans.elementAt(idx);
            newsDate = dateSpan.getStringText();
            idx++;
        }
    } catch (ParserException ex) {
        Logger.getLogger(SohuNews.class.getName()).log(Level.SEVERE, null, ex);
    }
    return newsDate;
}
/**
 * Extracts the article body text.
 *
 * <p>All matching div tags are concatenated, then re-parsed with a
 * {@link StringBean} to strip the remaining markup, and two known
 * boilerplate fragments are removed.
 *
 * @param newsContentFilter filter selecting the content div(s)
 * @param parser            parser positioned on the target page
 * @return cleaned content text; "" when no content div matched;
 *         {@code null} only when parsing threw before any text was built
 */
private String getNewsContent(NodeFilter newsContentFilter, Parser parser) {
    String content = null;
    StringBuilder builder = new StringBuilder();
    try {
        NodeList newsContentList = (NodeList) parser.parse(newsContentFilter);
        for (int i = 0; i < newsContentList.size(); i++) {
            Div newsContentTag = (Div) newsContentList.elementAt(i);
            builder.append(newsContentTag.getStringText());
        }
        content = builder.toString();
        // StringBuilder.toString() never returns null, so the original
        // null check was dead code; test for emptiness instead.
        if (content.length() > 0) {
            parser.reset();
            parser = Parser.createParser(content, "gb2312");
            StringBean sb = new StringBean();
            sb.setCollapse(true);
            parser.visitAllNodesWith(sb);
            content = sb.getStrings();
            // Strip leftover inline-script fragments and the
            // "post a comment" boilerplate.
            content = content.replaceAll("\\\".*[a-z].*\\}", "");
            content = content.replace("[我来说两句]", "");
        } else {
            System.out.println("没有得到新闻内容!");
        }
    } catch (ParserException ex) {
        Logger.getLogger(SohuNews.class.getName()).log(Level.SEVERE, null, ex);
    }
    return content;
}
/**
* 根据提供的URL,获取此URL对应网页所有的纯文本信息,此方法得到的信息不是很纯,
*常常会得到我们不想要的数据。不过如果你只是想得到某个URL 里的所有纯文本信息,该方法还是很好用的。
* @param url 提供的URL链接
* @return URL对应网页的纯文本信息
* @throws ParserException
* @deprecated 该方法被 getNewsContent
没有合适的资源?快使用搜索试试~ 我知道了~
资源推荐
资源详情
资源评论
收起资源包目录
zhizhupc.rar (55个子文件)
codefans.net
zhizhu
test
com
sohu
SohuNewsTest.java 1KB
dist
Sohu.war 1.05MB
src
conf
MANIFEST.MF 25B
java
com
sohu
bean
NewsBean.java 2KB
crawler
LinkDB.java 1KB
LinkParser.java 4KB
NewsToDB.java 270B
Crawler.java 2KB
LinkFilter.java 231B
Queue.java 620B
db
ConnectionManager.java 2KB
servlet
GetNewsServlet.java 3KB
SohuNews.java 10KB
lib
htmllexer.jar 68KB
commons-logging-1.0.4.jar 37KB
commons-codec-1.3.jar 46KB
commons-httpclient-3.1.jar 298KB
htmlparser.jar 281KB
web
WEB-INF
web.xml 790B
index.jsp 750B
META-INF
context.xml 85B
detail.jsp 920B
news.sql 440B
build
web
WEB-INF
web.xml 790B
lib
htmllexer.jar 68KB
mysql-connector-java-5.1.6-bin.jar 687KB
htmlparser.jar 281KB
classes
com
sohu
bean
NewsBean.class 1KB
crawler
LinkParser.class 3KB
LinkParser$1.class 819B
Crawler.class 2KB
Queue.class 1KB
LinkDB.class 2KB
LinkFilter.class 203B
LinkParser$2.class 796B
Crawler$1.class 779B
NewsToDB.class 453B
SohuNews.class 8KB
db
ConnectionManager.class 2KB
servlet
GetNewsServlet.class 2KB
GetNewsServlet$1.class 969B
SohuNews$1.class 885B
.netbeans_automatic_build 0B
index.jsp 750B
META-INF
context.xml 85B
MANIFEST.MF 25B
detail.jsp 1KB
build.xml 3KB
nbproject
build-impl.xml 46KB
private
private.xml 211B
private.properties 2KB
ant-deploy.xml 2KB
project.properties 2KB
project.xml 1KB
genfiles.properties 473B
共 55 条
- 1
资源评论
- jinyaoqbb2014-12-20很好的资源
jjh371898307
- 粉丝: 2
- 资源: 4
上传资源 快速赚钱
- 我的内容管理 展开
- 我的资源 快来上传第一个资源
- 我的收益 登录查看自己的收益
- 我的积分 登录查看自己的积分
- 我的C币 登录后查看C币余额
- 我的收藏
- 我的下载
- 下载帮助
安全验证
文档复制为VIP权益,开通VIP直接复制
信息提交成功