package testparser;
import java.net.URL;
import junit.framework.TestCase;
import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.Tag;
import org.htmlparser.beans.LinkBean;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.filters.OrFilter;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.nodes.TagNode;
import org.htmlparser.tags.BodyTag;
import org.htmlparser.tags.FrameTag;
import org.htmlparser.tags.ImageTag;
import org.htmlparser.tags.InputTag;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.tags.OptionTag;
import org.htmlparser.tags.SelectTag;
import org.htmlparser.tags.TableColumn;
import org.htmlparser.tags.TableRow;
import org.htmlparser.tags.TableTag;
import org.htmlparser.tags.TitleTag;
import org.htmlparser.util.NodeIterator;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
import org.htmlparser.visitors.HtmlPage;
import org.htmlparser.visitors.NodeVisitor;
import org.htmlparser.visitors.ObjectFindingVisitor;
public class ParserTest extends TestCase {
/* HTML Parser project: http://htmlparser.sourceforge.net/
* Tests TagNameFilter.
*/
/**
 * Creates a {@link Parser} on "http://www.g.cn", extracts every node matched by a
 * {@link TagNameFilter} for "IMG", and prints three values per match: the image URL,
 * the extracted image location, and the raw {@code src} attribute.
 *
 * <p>A {@link ParserException} is caught and its stack trace printed rather than rethrown.
 */
public void testNodeFilter() {
    try {
        Parser pageParser = new Parser("http://www.g.cn");
        // Filter that selects nodes by tag name.
        NodeFilter imgFilter = new TagNameFilter("IMG");
        // Apply the filter to the page and collect the matching nodes.
        NodeList images = pageParser.extractAllNodesThatMatch(imgFilter);
        for (Node match : images.toNodeArray()) {
            ImageTag image = (ImageTag) match;
            System.out.println("Image------URL:->" + image.getImageURL());
            System.out.println("Imagelocatcion:->" + image.extractImageLocn());
            System.out.println("Image------Src:->" + image.getAttribute("src"));
        }
    } catch (ParserException ex) {
        ex.printStackTrace();
    }
}
/*
* Tests NodeClassFilter usage.
*/
/**
 * Creates a {@link Parser} on "http://www.hao123.com", extracts every node matched by a
 * {@link NodeClassFilter} for {@link LinkTag}, and prints each link followed by its
 * link text, separated by {@code "<-:->"}.
 *
 * <p>A {@link ParserException} is caught and its stack trace printed rather than rethrown.
 */
public void testLinkTag() {
    try {
        Parser pageParser = new Parser("http://www.hao123.com");
        // Match nodes by their class instead of by tag name.
        NodeList anchors = pageParser.extractAllNodesThatMatch(new NodeClassFilter(LinkTag.class));
        int index = 0;
        while (index < anchors.size()) {
            LinkTag anchor = (LinkTag) anchors.elementAt(index);
            String target = anchor.getLink();
            String caption = anchor.getLinkText();
            System.out.println(target + "<-:->" + caption);
            index++;
        }
    } catch (ParserException ex) {
        ex.printStackTrace();
    }
}
/*
* Tests OrFilter.
*/
/**
 * Parses a small inline HTML document and extracts every node that is either an
 * {@link InputTag} or a {@link SelectTag}, using an {@link OrFilter} over two
 * {@link NodeClassFilter}s.
 *
 * <p>For each input tag the tag name and {@code value} attribute are printed; for each
 * select tag the tag name and the text of each of its options are printed.
 *
 * <p>A {@link ParserException} is caught and its stack trace printed rather than rethrown.
 */
public void testOrFilter() {
    // The attribute values use plain ASCII quotes; typographic quotes would not be
    // treated as attribute delimiters and would end up inside the attribute values.
    String inputHTML = "<head><title>OrFilter Test</title></head>"
            + "<body>"
            + "<input type='text' value='text1' name='text1'/>"
            + "<input type='text' value='text2' name='text2'/>"
            + "<select>"
            + "<option id='1'>1</option>"
            + "<option id='2'>2</option>"
            + "<option id='3'></option>"
            + "</select>"
            + "<a href='http://www.baidu.com'>yeeach.com</a>"
            + "</body>";
    try {
        Parser parser = new Parser();
        parser.setInputHTML(inputHTML);

        NodeFilter inputFilter = new NodeClassFilter(InputTag.class);
        NodeFilter selectFilter = new NodeClassFilter(SelectTag.class);
        // Alternative construction: new OrFilter() followed by setPredicates(filters).
        OrFilter orFilter = new OrFilter(new NodeFilter[] {selectFilter, inputFilter});

        NodeList nodeList = parser.extractAllNodesThatMatch(orFilter);
        for (int i = 0; i < nodeList.size(); i++) {
            Node node = nodeList.elementAt(i);
            if (node instanceof InputTag) {
                InputTag inputTag = (InputTag) node;
                System.out.println("inputTag.getTagName()=" + inputTag.getTagName());
                System.out.println("inputTag.getAttribute(value)=" + inputTag.getAttribute("value"));
            } else if (node instanceof SelectTag) {
                SelectTag selectTag = (SelectTag) node;
                System.out.println("selectTag.getTagName()= " + selectTag.getTagName());
                // Ask the select for its options instead of casting every child node:
                // a child that is not an OptionTag (e.g. text between options) would
                // otherwise cause a ClassCastException.
                OptionTag[] options = selectTag.getOptionTags();
                for (OptionTag optionTag : options) {
                    System.out.println("optionTag.getOptionText()=" + optionTag.getOptionText());
                }
            }
        }
    } catch (ParserException e) {
        e.printStackTrace();
    }
}
/*
* Tests parsing of <table><tr><td></td></tr></table>.
*/
/**
 * Parses an inline HTML fixture containing two 3x3 tables, extracts every
 * {@link TableTag} with a {@link NodeClassFilter}, and prints each row on its own line
 * with the plain text of every cell followed by a space.
 *
 * <p>The fixture is fed to the parser through {@code setInputHTML}, so the test needs no
 * network access and prints the same cells on every run.
 *
 * <p>A {@link ParserException} is caught and its stack trace printed rather than rethrown.
 */
public void testTable() {
    // Well-formed fixture: ASCII attribute quotes, every <td> and <tr> explicitly closed.
    String html = "<body><table id='table1'>"
            + "<tr><td>1-11</td><td>1-12</td><td>1-13</td></tr>"
            + "<tr><td>1-21</td><td>1-22</td><td>1-23</td></tr>"
            + "<tr><td>1-31</td><td>1-32</td><td>1-33</td></tr></table>"
            + "<table id='table2'>"
            + "<tr><td>2-11</td><td>2-12</td><td>2-13</td></tr>"
            + "<tr><td>2-21</td><td>2-22</td><td>2-23</td></tr>"
            + "<tr><td>2-31</td><td>2-32</td><td>2-33</td></tr></table>"
            + "</body>";
    try {
        Parser parser = new Parser();
        parser.setInputHTML(html);

        // A TagNameFilter("TABLE") is the tag-name-based alternative to this class filter.
        NodeFilter tableFilter = new NodeClassFilter(TableTag.class);
        NodeList tableList = parser.extractAllNodesThatMatch(tableFilter);
        for (int i = 0; i < tableList.size(); i++) {
            TableTag table = (TableTag) tableList.elementAt(i);
            // Rows of this table.
            for (TableRow row : table.getRows()) {
                // Cells of this row, printed on a single line.
                for (TableColumn cell : row.getColumns()) {
                    System.out.print(cell.toPlainTextString() + " ");
                }
                System.out.println();
            }
        }
    } catch (ParserException e) {
        e.printStackTrace();
    }
}
/*
* Tests ObjectFindingVisitor.
*/
/**
 * Creates a {@link Parser} on "http://www.g.cn", walks it with an
 * {@link ObjectFindingVisitor} configured for {@link ImageTag}, and prints three values
 * per collected tag: the image URL, the extracted image location, and the raw
 * {@code src} attribute.
 *
 * <p>A {@link ParserException} is caught and its stack trace printed rather than rethrown.
 */
public void testImagesVisitor() {
    try {
        Parser pageParser = new Parser("http://www.g.cn");
        ObjectFindingVisitor imageFinder = new ObjectFindingVisitor(ImageTag.class);
        // Run the visitor over the page.
        pageParser.visitAllNodesWith(imageFinder);
        // Walk the nodes the visitor collected.
        for (Node found : imageFinder.getTags()) {
            ImageTag image = (ImageTag) found;
            System.out.println("Image------URL:->" + image.getImageURL());
            System.out.println("Imagelocatcion:->" + image.extractImageLocn());
            System.out.println("Image------Src:->" + image.getAttribute("src"));
        }
    } catch (ParserException ex) {
        ex.printStackTrace();
    }
}
/*
* Tests NodeVisitor usage: visits all nodes.
*/
/**
 * Creates a {@link Parser} on "http://www.baidu.com" and walks it with an anonymous
 * {@link NodeVisitor} whose {@code visitTag} prints the name of each tag it is given.
 *
 * <p>A {@link ParserException} is caught and its stack trace printed rather than rethrown.
 */
public void testVisitorAll() {
    try {
        Parser pageParser = new Parser("http://www.baidu.com");
        NodeVisitor tagNamePrinter = new NodeVisitor() {
            // Overrides the tag callback to print the tag name.
            @Override
            public void visitTag(Tag tag) {
                String tagName = tag.getTagName();
                System.out.println(tagName);
            }
        };
        pageParser.visitAllNodesWith(tagNamePrinter);
    } catch (ParserException ex) {
        ex.printStackTrace();
    }
}
/*
* Tests NodeVisitor usage: visits specific tag types.
*/
public void testTagVisitor() {
Parser parser = null;
NodeVisitor visitor = null;
try {
parser = new Parser("http://www.baidu.com");
visitor = new NodeVisitor() {
@Override
public void visitTag(Tag tag) {
if (tag instanceof BodyTag) {
BodyTag body = (BodyTag)tag;
// ... 处理 body
} else if (tag instanceof FrameTag) {
FrameTag frame = (FrameTag) tag;
// ... 处理frame
} else if (tag instanceof TitleTag) {
TitleTag title = (TitleTag) tag;
// ... 处理Title
} else {
// ... 其他
}
System.ou
没有合适的资源?快使用搜索试试~ 我知道了~
资源推荐
资源详情
资源评论
收起资源包目录
HTMLparser 测试代码.rar (21个子文件)
ParserTest.java 10KB
TestTableLink.java 3KB
HTMLparser 测试代码.doc 65KB
getURL
.project 1KB
.mymetadata 291B
WebRoot
WEB-INF
web.xml 683B
isifg-servlet.xml 786B
lib
classes
com
isofthome
url
Test.class 1KB
GetLinkController.class 2KB
GetLinks.class 3KB
URLLinkBean.class 726B
file
ParseFile.class 2KB
index.jsp 845B
META-INF
MANIFEST.MF 39B
jsp
linkList.jsp 522B
.myeclipse
src
com
isofthome
url
GetLinks.java 2KB
Test.java 533B
GetLinkController.java 1KB
URLLinkBean.java 332B
file
ParseFile.java 937B
.classpath 1021B
共 21 条
- 1
资源评论
javpsei
- 粉丝: 1
- 资源: 3
上传资源 快速赚钱
- 我的内容管理 展开
- 我的资源 快来上传第一个资源
- 我的收益 登录查看自己的收益
- 我的积分 登录查看自己的积分
- 我的C币 登录后查看C币余额
- 我的收藏
- 我的下载
- 下载帮助
安全验证
文档复制为VIP权益,开通VIP直接复制
信息提交成功