package com.samson.crawler;
/** Arachnid - Abstract Web spider class.
* To use, derive a class from Arachnid and implement the
* handleLink(), handleBadLink(), handleNonHTMLlink(),
* handleExternalLink(), and handleBadIO() methods;
* then instantiate it and call traverse().
* (A minimal example subclass appears at the end of this file.)
*
* Copyright 2002, Robert L. Platt, All rights reserved
* @author Robert L. Platt
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
import java.io.*;
import java.net.*;
import java.util.*;
public abstract class Arachnid {
private String base;
private URL baseUrl;
private HashSet<URL> visited;
private int delay;
private static final String HTML = "text/html";
/** Constructor */
public Arachnid(String base) throws MalformedURLException {
this.base = base;
baseUrl = new URL(base);
visited = new HashSet<URL>();
delay = 2;
}
/** Traverse Web site */
public void traverse() {
// Mark the base URL visited so in-site links back to it are not re-crawled
visited.add(baseUrl);
traverse(baseUrl,null);
}
private void traverse(URL url, URL parent)
{
PageInfo p = null;
try {
p = getWebPage(url,parent);
}catch(IOException e) {
e.printStackTrace();
handleBadIO(url,parent);
sleep(delay);
return;
}
if (p == null) {
handleBadLink(url,parent,null);
sleep(delay);
return;
}
if (!p.isValid()) {
// Content type may be null if the server sent no Content-Type header
String contentType = p.getContentType();
if (contentType == null || !contentType.equalsIgnoreCase(HTML))
handleNonHTMLlink(url,parent,p);
else handleBadLink(url,parent,p);
sleep(delay);
return;
}
else handleLink(p);
// Navigate through links on page
URL[] links = p.getLinks();
if (links == null) {
sleep(delay);
return;
}
int n = links.length;
for (int i=0; i<n; ++i) {
if (isOKtoVisit(links[i])) {
visited.add(links[i]);
traverse(links[i],url);
}
else if (isExternalSite(links[i])) handleExternalLink(links[i],url);
}
sleep(delay);
return;
}
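// Note: traverse() above recurses depth-first and runs single-threaded;
// the delay field throttles it by pausing after each page is processed.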
/** (Abstract) Handle bad URL */
protected abstract void handleBadLink(URL url,URL parent,PageInfo p);
/** (Abstract) Handle a link; a Web page in the site */
protected abstract void handleLink(PageInfo p);
/** (Abstract) Handle a non-HTML link */
protected abstract void handleNonHTMLlink(URL url, URL parent, PageInfo p);
/** (Abstract) Handle an external (outside of Web site) link */
protected abstract void handleExternalLink(URL url, URL parent);
/** (Abstract) Handle an I/O Exception (server problem) */
protected abstract void handleBadIO(URL url, URL parent);
/** Return true if it's OK to visit the link,
false if it's not */
private boolean isOKtoVisit(URL link) {
// Return false if it's not HTTP protocol
if (!link.getProtocol().equals("http")) return(false);
// Return false if it's an external site
else if (isExternalSite(link)) return(false);
else if (visited.contains(link)) return(false);
else return(true);
}
private boolean isExternalSite(URL link) {
// Return true if the link's host differs from the base URL's, or
// if the link's path does not lie within the base URL's directory
// (authorities are Strings, so they must be compared with equals())
if (!baseUrl.getAuthority().equals(link.getAuthority()) ||
(!UrlPathDir(link).startsWith(UrlPathDir(baseUrl)))) return(true);
else return(false);
}
private String UrlPathDir(URL u) {
String p = u.getPath();
if (p == null || p.equals("")) return("/");
int i = p.lastIndexOf("/");
if (i == -1) return("/");
else p = p.substring(0,i+1);
return(p);
}
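// Worked example for the two methods above (URLs illustrative only):
// with base http://www.example.com/docs/index.html, UrlPathDir() yields
// "/docs/", so http://www.example.com/docs/sub/page.html is internal,
// while http://www.example.com/other/page.html and links on any other
// host are treated as external.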
// Populate a PageInfo object from a URL
private PageInfo getWebPage(URL url, URL parentUrl) throws IOException{
HttpURLConnection connection = (HttpURLConnection)url.openConnection();
int responseCode = connection.getResponseCode();
String contentType = connection.getContentType();
// Note: contentLength == -1 if NOT KNOWN (i.e. not returned from server)
int contentLength = connection.getContentLength();
PageInfo p = new PageInfo(url,parentUrl,contentType,contentLength,responseCode);
InputStreamReader rdr = new InputStreamReader(connection.getInputStream());
p.extract(rdr);
rdr.close();
connection.disconnect();
return p ;
}
/** Get contents of a URL */
public byte[] getContent(URL url) {
byte[] buf = null;
try {
HttpURLConnection connection = (HttpURLConnection)url.openConnection();
int responseCode = connection.getResponseCode();
int contentLength = connection.getContentLength();
if (responseCode != HttpURLConnection.HTTP_OK || contentLength <= 0) return(null);
InputStream in = connection.getInputStream();
BufferedInputStream bufIn = new BufferedInputStream(in);
buf = new byte[contentLength];
// Handle blocked (short) reads: keep reading until contentLength bytes
// have arrived, giving up after 10 attempts and pausing one second per
// attempt once the first five are used up
int bytesToRead = contentLength;
int attemptsLeft = 10;
while(bytesToRead != 0 && attemptsLeft != 0) {
int bytesRead = bufIn.read(buf,(contentLength-bytesToRead),bytesToRead);
if (bytesRead == -1) break; // premature EOF - server sent fewer bytes than promised
bytesToRead = bytesToRead - bytesRead;
attemptsLeft--;
if (attemptsLeft <= 5) sleep(1);
}
in.close();
connection.disconnect();
if (bytesToRead != 0) return(null); // incomplete read - discard the buffer
}catch(Exception e) {
return(null);
}
return(buf);
}
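// Example use of getContent() above (URL illustrative only):
//   byte[] data = getContent(new URL("http://www.example.com/images/logo.gif"));
//   if (data != null) { /* save or process the raw bytes */ }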
/** Return base URL (starting point for Web traversal) */
public URL getBaseUrl() { return(baseUrl); }
// Sleep N seconds
private void sleep(int n) {
if (n <= 0) return;
try { Thread.sleep(n*1000); }
catch(InterruptedException e) { // Ignore
}
}
/**
* Returns delay (N second pause after processing EACH web page)
* @return int
*/
public int getDelay() {
return delay;
}
/**
* Sets delay (N second pause after processing EACH web page)
* @param delay The delay to set
*/
public void setDelay(int delay) {
this.delay = delay;
}
}
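/* A minimal usage sketch (referenced in the header comment above): a
 * subclass that just logs each event. The class name, start URL, and
 * handler bodies are illustrative only and are not part of the library. */
class LoggingSpiderExample extends Arachnid {
    public LoggingSpiderExample(String base) throws MalformedURLException {
        super(base);
    }
    protected void handleLink(PageInfo p) { System.out.println("Page: " + p); }
    protected void handleBadLink(URL url, URL parent, PageInfo p) { System.out.println("Bad link: " + url); }
    protected void handleNonHTMLlink(URL url, URL parent, PageInfo p) { System.out.println("Non-HTML: " + url); }
    protected void handleExternalLink(URL url, URL parent) { System.out.println("External: " + url); }
    protected void handleBadIO(URL url, URL parent) { System.out.println("I/O error: " + url); }
    public static void main(String[] args) throws MalformedURLException {
        LoggingSpiderExample spider = new LoggingSpiderExample("http://www.example.com/");
        spider.setDelay(2); // pause two seconds after each page (the default)
        spider.traverse();
    }
}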