import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileWriter;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.StringReader;
import java.net.URL;
import java.util.HashSet;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.PrototypicalNodeFactory;
import org.htmlparser.Tag;
import org.htmlparser.beans.StringBean;
import org.htmlparser.filters.HasAttributeFilter;
import org.htmlparser.filters.HasChildFilter;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.filters.OrFilter;
import org.htmlparser.filters.StringFilter;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.nodes.TagNode;
import org.htmlparser.nodes.TextNode;
import org.htmlparser.tags.Div;
import org.htmlparser.tags.FrameTag;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.tags.ScriptTag;
import org.htmlparser.tags.StyleTag;
import org.htmlparser.util.NodeList;
import org.apache.commons.lang3.StringEscapeUtils;
import org.apache.commons.lang3.StringUtils;
public class finalWork {
// Per-instance TagNameSet (a type declared outside this part of the file).
// NOTE(review): the static methods visible here read TagNameSet's members through the class name, not through this field — confirm whether the field is still needed.
TagNameSet tagNameSet = new TagNameSet();
// Line separator reported by the "line.separator" system property; despite the name it is
// whatever the running platform uses, not necessarily "\r\n".
private final static String CRLF = System.getProperty("line.separator");
//private final static String StrBegin = "<div class=";
/** Matches a run of one or more whitespace characters; compiled once and reused by {@code replaceBlank}. */
private static final Pattern BLANK_PATTERN = Pattern.compile("\\s+");

/**
 * Removes every whitespace character (anything the regex class {@code \s} matches,
 * which includes spaces, tabs, carriage returns and line feeds) from the given text.
 *
 * @param str the text to clean; may be {@code null}
 * @return {@code str} with all whitespace removed, or an empty string when {@code str} is {@code null}
 */
public static String replaceBlank(String str) {
    if (str == null) {
        return "";
    }
    // "\\s+" removes exactly the same characters as the former "\\s*|\t|\r|\n"
    // (tab, CR and LF are already part of \s), without matching empty strings.
    return BLANK_PATTERN.matcher(str).replaceAll("");
}
/**
 * Collects the values of one attribute from the nodes of a list that pass a
 * {@code TagNameFilter} built from the given tag name.
 *
 * @param tagStr   tag name handed to {@code TagNameFilter}
 * @param atrTgt   name of the attribute to read from each matching tag
 * @param nodeList list to search
 * @return a set with the result of {@code getAttribute(atrTgt)} for every matching tag
 *         (the value is added as returned — presumably {@code null} when the attribute
 *         is missing; confirm against the htmlparser documentation). Never {@code null}:
 *         if anything throws, the stack trace is printed and the values gathered so far
 *         are returned.
 */
public static Set<String> ParseTagStr(String tagStr, String atrTgt, NodeList nodeList){
    Set<String> values = new HashSet<String>();
    try{
        NodeList matches = nodeList.extractAllNodesThatMatch(new TagNameFilter(tagStr));
        int position = 0;
        while(position < matches.size()){
            TagNode tag = (TagNode)matches.elementAt(position);
            values.add(tag.getAttribute(atrTgt));
            position++;
        }
    }catch(Exception e){
        // Best effort: report the problem and fall through to return what was collected.
        e.printStackTrace();
    }
    return values;
}
/**
 * Selects the nodes of a list that pass a {@code TagNameFilter} built from the given tag name.
 *
 * @param tagStr   tag name handed to {@code TagNameFilter}
 * @param nodeList list to search
 * @return the list produced by {@code extractAllNodesThatMatch}, or {@code null} when the
 *         extraction throws (the stack trace is printed in that case)
 */
public static NodeList ParseTagNode(String tagStr, NodeList nodeList){
    try{
        return nodeList.extractAllNodesThatMatch(new TagNameFilter(tagStr));
    }catch(Exception e){
        // Failure is reported on stderr and signalled to the caller with null.
        e.printStackTrace();
        return null;
    }
}
/**
 * Prints the relevant content of one node to standard output.
 * <p>
 * If the node's own text contains {@code "WB_text W_f14"}, its plain-text rendering is
 * printed followed by a separator line. Otherwise the node's children are searched for an
 * element whose {@code class} attribute is {@code "W_face_radius"}; for the first match its
 * {@code href} and {@code title} attributes are printed.
 *
 * @param childNode node to inspect; {@code null} is tolerated
 * @return {@code 2} when the {@code "WB_text W_f14"} branch printed the node's text,
 *         {@code 0} in every other case (including a {@code null} node, a node without
 *         children, and any exception, whose stack trace is printed)
 */
public static int ParseTagStr(Node childNode){
    if(childNode == null){
        return 0;
    }
    try{
        if(childNode.getText().contains("WB_text W_f14")){
            System.out.println(childNode.toPlainTextString());
            System.out.println("=======================================================");
            return 2;
        }
        NodeList children = childNode.getChildren();
        if(children == null){
            // A node without a child list has nothing to search; previously this case
            // was only survived by catching the resulting NullPointerException.
            return 0;
        }
        NodeList faces = children.extractAllNodesThatMatch(new HasAttributeFilter("class","W_face_radius"));
        if(faces.size() != 0){
            // Only the first matching element is reported.
            TagNode face = (TagNode)faces.elementAt(0);
            System.out.println(face.getAttribute("href"));
            System.out.println(face.getAttribute("title"));
        }
    }catch(Exception e){
        // Keep the method's never-throws behaviour: report and fall through to return 0.
        e.printStackTrace();
    }
    return 0;
}
/**
 * Walks the {@code div} elements of a node list depth-first and hands every div whose own
 * child list contains no further {@code div} to {@code ParseTagStr(Node)}.
 *
 * @param nodeList list to scan; may be {@code null}
 * @return {@code 0} when {@code nodeList} is non-null and no node in it passes the
 *         {@code "div"} tag-name filter; {@code 1} when {@code nodeList} is {@code null}
 *         or at least one div was found (and processed)
 */
public static int ParseDiv(NodeList nodeList){
    if(nodeList == null){
        return 1;
    }
    NodeList divs = nodeList.extractAllNodesThatMatch(new TagNameFilter("div"));
    int total = divs.size();
    if(total == 0){
        // No div at this level: tell the caller to process the enclosing tag itself.
        return 0;
    }
    for(int i = 0; i < total; i++){
        NodeList children = divs.elementAt(i).getChildren();
        // A result of 0 means this div has children but none of them is a div,
        // so it is the innermost one and gets printed.
        if(ParseDiv(children) == 0){
            ParseTagStr(divs.elementAt(i));
        }
    }
    return 1;
}
public static void ShowTxtAndLink(String strHTML){
try{
String linkTxt = null;
String linkAdd = null;
Set<String> setStr = null;
NodeList srcList = null;
NodeList tagList = null;
NodeList finalList = null;
FileWriter file = new FileWriter("D:/最终获取数据.html");
BufferedWriter out = new BufferedWriter(file);
//Parser parser = new Parser(strHTML);
Parser parser = new Parser();
parser.setInputHTML(strHTML);
//NodeFilter linkFilter = new TagNameFilter("a");
NodeFilter linkFilter = new HasAttributeFilter("class","WB_cardwrap WB_feed_type S_bg2 WB_feed_vipcover");
//NodeFilter linkFilter = new NodeClassFilter(LinkTag.class);
srcList = parser.extractAllNodesThatMatch(linkFilter);
for(int index = 0; index < srcList.size(); index++){
tagList = ParseTagNode(TagNameSet.TagSet[2], srcList.elementAt(index).getChildren());
ParseDiv(tagList.elementAt(0).getChildren());
/*
NodeFilter txtFilter = new HasAttributeFilter( "class", "WB_text W_f14" );
NodeFilter nameFilter = new HasAttributeFilter("class","W_face_radius");
NodeFilter nodeFilter = new OrFilter(txtFilter,nameFilter);
finalList = tagList.elementAt(0).getChildren().extractAllNodesThatMatch(nodeFilter);
for(int i = 0; i < finalList.size(); i++){
TagNode tagNode = (TagNode)finalList.elementAt(0);
if(tagNode.getText().contains("W_face_radius")){
System.out.println(tagNode.getAttribute("href"));
System.out.println(tagNode.getAttribute("title"));
}
else{
if(tagNode.getText().contains("WB_text W_f14")){
System.out.println(tagNode.toPlainTextString());
System.out.println("=======================================================");
}
}
}
*/
/*
out.write(tagList.elementAt(0).toHtml());
out.newLine();
out.write("==================================================================");
out.newLine();
out.write(tagList.elementAt(1).getText());
out.newLine();
*/
/*setStr = ParseTagStr(TagNameSet.TagSet[3], TagNameSet.AttributeSet[0], tagList);
if(!(setStr.isEmpty())){
out.write(setStr.toString());
out.newLine();
out.write("==================================================================");
out.newLine();
System.out.println(setStr.toString());
}*/
}
//for(int index = 0; index < nodeList.size(); index++){
//ParseNode(nodeList.elementAt(index ).getChildren());
/*
LinkTag linkTag = (LinkTag)nodeList.elementAt(index);
linkTxt = linkTag.getAttribute("title");
linkTxt += linkTag.toPlainTextString();
linkAdd = linkTag.getAttribute("href");
*/
/*
Node NodeTag = nodeList.elementAt(index);
linkTxt = nodeList.elementAt(index).toPlainTextString();
linkTxt = replaceBlank(linkTxt);
linkAdd = nod