import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.PrototypicalNodeFactory;
import org.htmlparser.tags.BrTag;
import org.htmlparser.tags.CompositeTag;
import org.htmlparser.tags.FontTag;
import org.htmlparser.tags.LinkTypeTag;
import org.htmlparser.tags.PtypeTag;
import org.htmlparser.tags.WbrTag;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
/**
 * Cuts an HTML file down to a prefix that contains a limited number of text
 * characters (markup is copied but not counted) and then runs that prefix
 * through the HTML parser, returning the HTML of its top-level composite tags.
 */
public class TestNew {
    /** Charset the input file is decoded with. */
    private static final String FILE_CHARSET = "UTF-8";

    /**
     * Single-byte charset used to carry the UTF-8 bytes of the document
     * through the parser as one char per byte.
     */
    private static final String BYTE_CHARSET = "ISO-8859-1";

    /** File processed by {@link #main(String[])} when no argument is given. */
    private static final String DEFAULT_FILE = "c:/双卫,医学社区---文章阅读.htm";

    /** Text-character budget used by {@link #main(String[])} when no argument is given. */
    private static final int DEFAULT_LENGTH = 200;

    /**
     * Reads a whole text file, decoding it as UTF-8.
     *
     * @param filename path of the file to read
     * @return the file content with every line terminated by {@code "\n"},
     *         or the empty string if {@code filename} is {@code null} or the
     *         file cannot be opened or read
     */
    public static String readFileByLines(String filename) {
        if (filename == null) {
            return "";
        }
        try {
            BufferedReader reader = new BufferedReader(
                    new InputStreamReader(new FileInputStream(filename), FILE_CHARSET));
            try {
                StringBuilder text = new StringBuilder();
                String line;
                while ((line = reader.readLine()) != null) {
                    text.append(line).append('\n');
                }
                return text.toString();
            } finally {
                // Always release the file handle, even when reading fails.
                reader.close();
            }
        } catch (IOException e) {
            // Best-effort contract of this method: an unreadable file yields "".
            return "";
        }
    }

    /**
     * Reads {@code filename} and returns its leading part, stopping as soon as
     * {@code length} text characters have been copied. Tags
     * ({@code <...>}) are copied verbatim and do not count towards the limit.
     *
     * @param filename path of the HTML file to read
     * @param length   maximum number of counted (non-markup) characters
     * @return the truncated content; the whole content if it holds fewer than
     *         {@code length} counted characters; the empty string if
     *         {@code length <= 0} or the file cannot be read
     * @throws IOException declared for callers; file errors are reported by
     *                     {@link #readFileByLines(String)} as an empty string
     */
    public static String readWithTag(String filename, int length) throws IOException {
        return truncateKeepingTags(readFileByLines(filename), length);
    }

    /**
     * Copies characters from the start of {@code content} until {@code length}
     * counted characters were copied or the content is exhausted.
     */
    private static String truncateKeepingTags(String content, int length) {
        StringBuilder out = new StringBuilder();
        int pos = 0;
        int counted = 0;
        while (counted < length && pos < content.length()) {
            char c = content.charAt(pos);
            if (c == '<') {
                int close = content.indexOf('>', pos);
                if (close >= 0) {
                    // Complete tag: copy it in one go, closing '>' included.
                    out.append(content, pos, close + 1);
                    pos = close + 1;
                    continue;
                }
                // No closing '>' anywhere after this '<': treat it as plain text.
            }
            out.append(c);
            pos++;
            // A '>' outside of a tag is copied but, as before, not counted.
            if (c != '>') {
                counted++;
            }
        }
        return out.toString();
    }

    /**
     * Truncates the HTML file to {@code len} text characters (see
     * {@link #readWithTag(String, int)}), parses the result and returns the
     * HTML of every composite tag that has no parent, one per line.
     *
     * @param file path of the HTML file
     * @param len  maximum number of counted text characters to keep
     * @return the serialized top-level composite tags, each followed by
     *         {@code "\n"}; the empty string if anything fails
     */
    public static String cutContent(String file, int len) {
        try {
            String html = readWithTag(file, len);
            // The document is handed to the parser as one char per UTF-8 byte and
            // converted back afterwards. Explicit charsets make the round trip
            // lossless on every platform (the platform default was used before).
            // NOTE(review): the byte-wise wrapping itself is kept from the original;
            // presumably it works around the parser's charset handling - confirm.
            Parser parser = Parser.createParser(
                    new String(html.getBytes(FILE_CHARSET), BYTE_CHARSET), "");
            // Register the custom node parsers; this is necessary...
            PrototypicalNodeFactory factory = new PrototypicalNodeFactory();
            factory.registerTag(new LinkTypeTag());
            factory.registerTag(new BrTag());
            factory.registerTag(new FontTag());
            factory.registerTag(new PtypeTag());
            factory.registerTag(new WbrTag());
            parser.setNodeFactory(factory);
            NodeList compositeTags = parser.extractAllNodesThatMatch(new NodeFilter() {
                public boolean accept(Node node) {
                    return node instanceof CompositeTag;
                }
            });
            StringBuilder result = new StringBuilder();
            for (int i = 0; i < compositeTags.size(); i++) {
                CompositeTag tag = (CompositeTag) compositeTags.elementAt(i);
                // Nested tags are already part of their ancestor's HTML.
                if (tag.getParent() == null) {
                    result.append(new String(tag.toHtml().getBytes(BYTE_CHARSET), FILE_CHARSET));
                    result.append('\n');
                }
            }
            return result.toString();
        } catch (Exception e) {
            // Deliberate best-effort: any read or parse failure yields "".
            return "";
        }
    }

    /**
     * Prints the truncated HTML of a file to standard output.
     *
     * @param args optional: {@code args[0]} is the file to process and
     *             {@code args[1]} the text-character limit; the built-in
     *             defaults are used for whatever is missing
     * @throws Exception if {@code args[1]} is not a valid integer
     */
    public static void main(String[] args) throws Exception {
        String file = (args != null && args.length > 0) ? args[0] : DEFAULT_FILE;
        int len = (args != null && args.length > 1) ? Integer.parseInt(args[1]) : DEFAULT_LENGTH;
        System.out.println(cutContent(file, len));
    }
}
// Comment 2