用htmlparser截取html摘要实现源码资源-CSDN文库

共28个文件

class：19个

java：7个

project：1个

htmlparser

摘要截取

5星 · 超过95%的资源 · 需积分: 10 · 44 浏览量 · 2008-08-27 17:12:08 上传 · 评论 · 收藏 · 23KB · RAR · 举报

资源详情

资源评论

资源推荐

收起资源包目录

testhtml.rar （28个子文件）

testhtml

.project 384B

bin

TestCmd.class 619B

TestContent.class 5KB

TestMore.class 2KB

org

htmlparser

tags

WbrTag.java 415B

PtypeTag.java 415B

BrTag.java 413B

LinkTypeTag.java 717B

PTag.java 558B

FontTag.java 417B

TestNew.java 4KB

.classpath 967B

import java.io.BufferedReader; import java.io.File; import java.io.FileReader; import java.io.IOException; import java.io.UnsupportedEncodingException; import org.htmlparser.Node; import org.htmlparser.NodeFilter; import org.htmlparser.Parser; import org.htmlparser.PrototypicalNodeFactory; import org.htmlparser.tags.BrTag; import org.htmlparser.tags.CompositeTag; import org.htmlparser.tags.FontTag; import org.htmlparser.tags.LinkTypeTag; import org.htmlparser.tags.PtypeTag; import org.htmlparser.tags.WbrTag; import org.htmlparser.util.NodeList; import org.htmlparser.util.ParserException; public class TestNew { public static String readFileByLines(String filename) { File file = new File(filename); BufferedReader reader = null; try { reader = new BufferedReader(new FileReader(file)); String tempString = ""; StringBuffer sb = new StringBuffer(); while ((tempString = reader.readLine()) != null) { tempString = new String(tempString.getBytes(),"utf-8"); sb.append(tempString+"\n"); } return sb.toString(); } catch (Exception e) { // TODO Auto-generated catch block return ""; } } /*public static String extractText(String file) throws Exception { StringBuffer text = new StringBuffer(); Parser parser = new Parser(file); parser.setEncoding("utf-8"); NodeList nodes = parser.extractAllNodesThatMatch(new NodeFilter() { public boolean accept(Node node) { return true; } }); for (int i=0;i<nodes.size();i++){ Node nodet = nodes.elementAt(i); text.append(new String(nodet.toPlainTextString().getBytes())); } return text.toString(); }*/ public static String readWithTag(String filename,int length) throws IOException { String content = readFileByLines(filename); int pos = 0,len = 0,count = 0; String s = ""; StringBuffer sb = new StringBuffer(); while(true) { if(count >= length) break; s = content.substring(pos, pos+1); if(s.equals("<")) { len = content.indexOf(">", pos)-pos; for(int i=0;i<len;i++) { s = content.substring(pos+i, pos+i+1); sb.append(s); } pos += len; } else { if(count < 
length) { if(s.equals(">")) { sb.append(s); pos++; } else{ sb.append(s); count++; pos++; } } } } return sb.toString(); } public static String cutContent(String file,int len) { try{ /*String temp = readFileByLines(file); String content = extractText(file); content = content.substring(0, len); String tmp1 = content.substring(content.length()-2, content.length()); String html = temp.substring(0, temp.indexOf(tmp1));*/ String html = readWithTag(file,len); Parser parser = Parser.createParser(new String(html.getBytes(),"8859_1"),""); //注册自定义的新结点解析器,这是必要的... PrototypicalNodeFactory factory = new PrototypicalNodeFactory (); factory.registerTag(new LinkTypeTag ()); factory.registerTag(new BrTag ()); factory.registerTag(new FontTag ()); factory.registerTag(new PtypeTag ()); factory.registerTag(new WbrTag ()); parser.setNodeFactory(factory); NodeList nodelist = parser.extractAllNodesThatMatch(new NodeFilter(){ public boolean accept(Node node) { if(node instanceof CompositeTag) return true; return false; } }); String str = ""; String tmp = ""; for (int i = 0; i < nodelist.size(); i++) { CompositeTag testTag = (CompositeTag)nodelist.elementAt(i); if (testTag.getParent() == null) { tmp = new String(testTag.toHtml().getBytes("8859_1")); str += tmp + "\n"; } } return str; } catch(Exception e) { return ""; } } /** * @param args * @throws Exception */ public static void main(String[] args) throws Exception { // TODO Auto-generated method stub String str = cutContent("c:/双卫,医学社区---文章阅读.htm",200); System.out.println(str); } }