import java.awt.*;
import java.awt.event.*;
import javax.swing.*;
import java.util.*;
import java.io.*;
import com.heaton.bot.*;
import org.w3c.dom.*;
import org.cyberneko.html.parsers.*;
import org.xml.sax.*;
import org.apache.html.dom.*;
import javax.xml.parsers.*;
import javax.xml.transform.*;
import javax.xml.transform.dom.*;
import javax.xml.transform.stream.*;
/**
* Web crawler that downloads and parses pages from the Internet using a depth-first algorithm. Tested and runs correctly!
*/
public class mySpider extends JFrame implements ISpiderReportable {
// Crawler instance. Go_actionPerformed treats a non-null value as "a crawl is
// in progress" and halts it instead of starting another one.
Spider _Spider = null;
// In-memory XML document (root element "spider") created by the constructor;
// addElementNode appends one HTMLPAPER element per stored page to it.
// Stays null if the constructor failed to create it.
Document doc = null;
// Page counter. Not referenced in the visible part of this file -- presumably
// backs the _pages label; TODO confirm against the rest of the class.
int _pagesCount;
/**
 * Builds the crawler window.
 *
 * The content pane uses absolute positioning (null layout), so every widget
 * gets explicit bounds. The GO button is wired to a {@code SymAction}, the
 * frame gets a {@code SymWindow} window listener, and an empty DOM document
 * with the root element "spider" is created to collect page data. If the
 * document cannot be created the stack trace is printed and {@code doc}
 * stays null.
 */
public mySpider(){
    final Container pane = getContentPane();

    // Frame basics.
    setTitle("网络爬虫之Spider");
    pane.setLayout(null);
    setSize(405,268);
    setVisible(false);

    // Heading label.
    D.setHorizontalTextPosition(SwingConstants.LEFT);
    D.setVerticalTextPosition(SwingConstants.TOP);
    D.setVerticalAlignment(SwingConstants.TOP);
    D.setText("下载的页面数目: ");
    D.setBounds(12,12,384,24);
    pane.add(D);

    // Start URL row.
    JLabel2.setText("URL:");
    JLabel2.setBounds(12,36,36,24);
    pane.add(JLabel2);
    _url.setBounds(48,36,348,24);
    pane.add(_url);

    // Output location for the XML document.
    JLabel3.setText("选择存储XML文档的目录:");
    JLabel3.setBounds(12,72,384,24);
    pane.add(JLabel3);
    _save.setBounds(12,96,384,24);
    pane.add(_save);

    // GO button and its listener.
    _go.setText("GO!");
    _go.setActionCommand("jbutton");
    _go.setBounds(96,228,216,24);
    _go.addActionListener(new SymAction());
    pane.add(_go);

    // Status line.
    _current.setBounds(12,204,384,12);
    pane.add(_current);

    // Page counter row.
    JLabel4.setText("Number of pages:");
    JLabel4.setBounds(12,180,120,12);
    pane.add(JLabel4);
    _pages.setText("0");
    _pages.setBounds(120,180,108,12);
    pane.add(_pages);

    // Log file row.
    JLabel6.setText("选择Log,当前需要保存的日志:");
    JLabel6.setBounds(12,120,384,24);
    pane.add(JLabel6);
    _logPath.setText("./Spider.log");
    _logPath.setBounds(12,144,384,24);
    pane.add(_logPath);

    this.addWindowListener(new SymWindow());

    // Empty result document: <spider/>.
    try{
        DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
        DocumentBuilder builder = factory.newDocumentBuilder();
        DOMImplementation implementation = builder.getDOMImplementation();
        doc = implementation.createDocument(null,"spider",null);
    }catch(ParserConfigurationException e){
        e.printStackTrace();
    }catch(DOMException e){
        e.printStackTrace();
    }
}
/**
 * Shows or hides the frame. When the frame is being shown it is first
 * moved to screen position (50,50).
 *
 * @param b true to show the frame, false to hide it
 */
@Override
public void setVisible(boolean b){
    if (b) {
        setLocation(50, 50);
    }
    super.setVisible(b);
}
/**
 * Application entry point: creates the crawler window and shows it.
 *
 * Swing components are not thread-safe, so the window is constructed and
 * made visible on the event dispatch thread rather than on the main thread.
 *
 * @param args command-line arguments (unused)
 */
public static void main(String[] args) {
    SwingUtilities.invokeLater(new Runnable(){
        public void run(){
            new mySpider().setVisible(true);
        }
    });
}
/**
 * Creates the native peer and, the first time only, enlarges the frame by
 * its insets and by the menu bar height (when a menu bar is installed).
 */
@Override
public void addNotify(){
    // The size must be read before super.addNotify() runs.
    final Dimension requested = getSize();
    super.addNotify();
    if (!frameSizeAdjusted) {
        frameSizeAdjusted = true;
        final Insets border = getInsets();
        final JMenuBar bar = getRootPane().getJMenuBar();
        final int barHeight = (bar == null) ? 0 : bar.getPreferredSize().height;
        final int width = border.left + border.right + requested.width;
        final int height = border.top + border.bottom + requested.height + barHeight;
        setSize(width, height);
    }
}
// Set by addNotify() after its one-time size adjustment so it is not repeated.
boolean frameSizeAdjusted = false;
// Heading label at the top of the window (caption set in the constructor).
JLabel D = new JLabel();
// Caption "URL:" for the _url field.
JLabel JLabel2 = new JLabel();
// URL typed by the user; when empty, Go_actionPerformed shows "<<distributed mode>>".
JTextField _url = new JTextField();
// Caption for the _save field.
JLabel JLabel3 = new JLabel();
// Output location text; processFile stores nothing while this field is empty.
JTextField _save = new JTextField();
// GO button; its caption is changed to "Cancel" / "Canceling..." by Go_actionPerformed.
JButton _go = new JButton();
// Status line (e.g. "Loading...").
JLabel _current = new JLabel();
// Caption "Number of pages:" for the _pages label.
JLabel JLabel4 = new JLabel();
// Page count display, initialised to "0" in the constructor.
JLabel _pages = new JLabel();
// Caption for the _logPath field.
JLabel JLabel6 = new JLabel();
// Log file path, defaulting to "./Spider.log" in the constructor.
JTextField _logPath = new JTextField();
/**
 * Action listener for the window's buttons: forwards events that originate
 * from the GO button to {@code Go_actionPerformed}; all other sources are
 * ignored.
 */
class SymAction implements ActionListener{
    public void actionPerformed(ActionEvent event){
        if (_go == event.getSource()) {
            Go_actionPerformed(event);
        }
    }
}
/**
 * Extracts the title and body text of a downloaded HTML page and appends
 * them, together with the page URL, to the in-memory XML document.
 *
 * Nothing is stored when the save field is empty, when the URL contains no
 * '/', when the text after the last '.' of the URL is not "html", "htm" or
 * "shtml", or when either the title or the body text is empty.
 *
 * Any exception other than a parse error is logged through
 * {@code Log.logException} and the page is dropped. A parse error is logged
 * too, but whatever was parsed up to that point is still used.
 *
 * @param file the downloaded page
 */
protected void processFile(HTTP file){
    // Encoding the pages are assumed to be in; also given to the parser below.
    final String pageEncoding = "GBK";
    try{
        if(_save.getText().length()==0)
            return;
        String url = file.getURL();
        if(url.lastIndexOf('/')==-1)
            return;
        String extendName = url.substring(url.lastIndexOf('.')+1);
        boolean isHtml = extendName.equals("html")
                || extendName.equals("htm")
                || extendName.equals("shtml");
        if(!isHtml)
            return;

        // Turn the body back into bytes and decode them as GBK.
        // NOTE(review): this presumes the bot exposes the raw response bytes
        // one-per-char (ISO-8859-1 style) -- confirm against HTTP.getBody().
        String fileBody = new String(file.getBody().getBytes("iso-8859-1"),pageEncoding);

        DOMFragmentParser parser = new DOMFragmentParser();
        DocumentFragment node =
                new HTMLDocumentImpl().createDocumentFragment();
        try {
            parser.setProperty("http://cyberneko.org/html/properties/default-encoding",pageEncoding);
            // Encode with the same charset the parser was told to expect.
            // A bare getBytes() uses the platform default charset, which
            // corrupts the text on any machine whose default is not GBK.
            byte[] encoded = fileBody.getBytes(pageEncoding);
            parser.parse(new InputSource(new ByteArrayInputStream(encoded)), node);
        }catch (IOException e) {
            Log.logException("Can't parse page " + url + ": ",e);
        }catch (SAXException e) {
            Log.logException("Can't parse page " + url + ": ",e);
        }

        StringBuffer sb = new StringBuffer();
        getText(sb, node, "title");
        String title = sb.toString();
        sb.setLength(0);
        getText(sb, node,"body");
        // The text is stored as-is: the former replaceAll("<","<") and
        // replaceAll(">",">") calls replaced each character with itself,
        // and setTextContent() takes plain text, so no manual escaping
        // is required.
        String text = sb.toString();

        if(title.length()!=0 && text.length()!=0)
            addElementNode(doc,title,text,url);
    }catch(Exception e){
        Log.logException("Can't save output file: ",e);
    }
}
/**
 * Creates a TITLE element owned by the given document.
 *
 * @param docs  document used as the element factory
 * @param title text content of the new element
 * @return the new, not yet attached, TITLE element
 */
private Element createTitleElement(Document docs,String title){
    final Element element = docs.createElement("TITLE");
    element.setTextContent(title);
    return element;
}
/**
 * Creates a BODY element owned by the given document.
 *
 * @param docs document used as the element factory
 * @param body text content of the new element
 * @return the new, not yet attached, BODY element
 */
private Element createBodyElement(Document docs,String body){
    final Element element = docs.createElement("BODY");
    element.setTextContent(body);
    return element;
}
/**
 * Creates a URL element owned by the given document.
 *
 * @param docs document used as the element factory
 * @param URL  text content of the new element
 * @return the new, not yet attached, URL element
 */
private Element createURLElement(Document docs,String URL){
    final Element element = docs.createElement("URL");
    element.setTextContent(URL);
    return element;
}
/**
 * Appends one page record to the document's root element. The record is an
 * HTMLPAPER element with TITLE, BODY and URL children, in that order.
 *
 * @param docs  document that receives the record
 * @param title page title
 * @param body  page body text
 * @param URL   page address
 */
public void addElementNode(Document docs,String title,String body,String URL){
    final Element paper = docs.createElement("HTMLPAPER");
    final Element titlePart = createTitleElement(docs,title);
    final Element bodyPart = createBodyElement(docs,body);
    final Element urlPart = createURLElement(docs,URL);
    paper.appendChild(titlePart);
    paper.appendChild(bodyPart);
    paper.appendChild(urlPart);
    final Element root = docs.getDocumentElement();
    root.appendChild(paper);
}
/**
 * Appends the value of every text node in the subtree rooted at
 * {@code node} to {@code sb}, in document order.
 *
 * @param sb   buffer that receives the text
 * @param node root of the subtree to read
 */
private void getText(StringBuffer sb, Node node) {
    if (Node.TEXT_NODE == node.getNodeType()) {
        sb.append(node.getNodeValue());
    }
    // Visit the children in order by following the sibling links.
    for (Node child = node.getFirstChild(); child != null; child = child.getNextSibling()) {
        getText(sb, child);
    }
}
/**
 * Searches the subtree rooted at {@code node}, depth first, for the first
 * element whose name equals {@code element} (ignoring case) and appends all
 * of that element's text to {@code sb}.
 *
 * The search stops at the first match. Previously the method never reported
 * a match, so the early exit in the child loop could not trigger and the
 * text of nested or repeated matching elements was appended more than once.
 *
 * @param sb      buffer that receives the text
 * @param node    root of the subtree to search
 * @param element element name to look for, compared case-insensitively
 * @return true if a matching element was found, false otherwise
 */
private boolean getText(StringBuffer sb, Node node,
        String element) {
    if (node.getNodeType() == Node.ELEMENT_NODE
            && element.equalsIgnoreCase(node.getNodeName())) {
        getText(sb, node);
        return true;
    }
    NodeList children = node.getChildNodes();
    if (children != null) {
        int len = children.getLength();
        for (int i = 0; i < len; i++) {
            if (getText(sb, children.item(i), element)) {
                return true;
            }
        }
    }
    return false;
}
void Go_actionPerformed(ActionEvent event){
IWorkloadStorable wl = new SpiderInternalWorkload();
if(_Spider!=null){
Runnable doLater = new Runnable(){
public void run(){
_go.setText("Canceling...");
}
};
SwingUtilities.invokeLater(doLater);
_Spider.halt();
return;
}
try{
if(_url.getText().length()>0){
HTTPSocket http = new HTTPSocket();
http.send(_url.getText(),null);
}else{
_current.setText("<<distributed mode>>");
}
}catch(Exception e){
JOptionPane.showMessageDialog(this,e,"Error",JOptionPane.OK_CANCEL_OPTION,null);
return;
}
Runnable doLater = new Runnable(){
public void run(){
_go.setText("Cancel");
_current.setText("Loading...");
}
};
SwingUtilities.in