package com.space;
import java.net.URL;
import org.htmlcleaner.HtmlCleaner;
import org.htmlcleaner.TagNode;
public class HtmlCleanerDemo
{
    /**
     * Demo entry point: downloads a 58.com listing page, extracts the content
     * under the element with id "sub_1", and prints each child {@code <div>}'s
     * text after stripping two unwanted regions — an embedded script block
     * ("boot.require" ... "});") and the trailing contact-info boilerplate
     * ("联系我时" ... "谢谢!").
     *
     * <p>All failures (network, parse, XPath) are reported to stderr; the
     * method never propagates an exception.
     *
     * @param args unused command-line arguments
     */
    public static void main(String[] args)
    {
        try
        {
            HtmlCleaner cleaner = new HtmlCleaner();
            URL url = new URL(
                    "http://haikou.58.com/baomu/23058539091978x.shtml?adtype=1&entinfo=23058539091978_0&adact=3&psid=118324118190869277073175706");
            TagNode node = cleaner.clean(url, "utf-8");
            // Select every <div> that is a direct child of the element with id="sub_1".
            Object[] tagNodes = node.evaluateXPath("//*[@id='sub_1']/div");
            for (Object tagNode : tagNodes)
            {
                if (tagNode instanceof TagNode)
                {
                    TagNode t = (TagNode) tagNode;
                    StringBuffer s = t.getText();
                    // First strip the inline script region, then the
                    // contact-info boilerplate, and print what remains.
                    System.out.println(formatContent(
                            formatContent(s, "boot.require", "});"), "联系我时",
                            "谢谢!"));
                } else
                {
                    // evaluateXPath may also yield non-element results
                    // (e.g. text nodes); print them as-is.
                    System.out.println(tagNode.toString());
                }
            }
        } catch (Exception exception)
        {
            exception.printStackTrace();
        }
    }

    /**
     * Deletes, in place, the first region of {@code s} that starts at the
     * first occurrence of {@code dStart} and ends at the first occurrence of
     * {@code dEnd} located after it (both markers inclusive). If either
     * marker is missing, {@code s} is left unchanged.
     *
     * <p>Fixes two defects of the previous version: a region beginning at
     * index 0 was never deleted (the old {@code start > end} guard with
     * {@code end = 0} excluded it), and an occurrence of {@code dEnd}
     * <em>before</em> {@code dStart} masked a valid closing marker after it
     * (the old code searched for {@code dEnd} from the beginning of the
     * buffer).
     *
     * @param s      the buffer to format; modified in place
     * @param dStart opening marker of the region to delete
     * @param dEnd   closing marker of the region to delete
     * @return the same buffer {@code s}, for call chaining
     */
    public static StringBuffer formatContent(StringBuffer s, String dStart,
            String dEnd)
    {
        int start = s.indexOf(dStart);
        if (start >= 0)
        {
            // Look for the closing marker strictly after the opening one so
            // an earlier, unrelated occurrence of dEnd cannot mask it.
            int end = s.indexOf(dEnd, start + 1);
            if (end >= 0)
            {
                s.delete(start, end + dEnd.length());
            }
        }
        return s;
    }
}
// NOTE(review): stray web-page residue from the original paste ("评论2" / "最新资源",
// i.e. "2 comments" / "latest resources") — not source code; commented out so the file compiles.