非常实用的 Java 多线程网络爬虫（非常适合正在学习网络爬虫的初学者）资源-CSDN文库

共4个文件

java：4个

java

需积分: 50 74 浏览量 2017-12-22 11:27:58 上传评论 1 收藏 4KB RAR 举报

资源推荐

资源详情

资源评论

收起资源包目录

nips论文下载脚本.rar （4个子文件）

nips论文下载脚本

MultiHandling

MultiThread.java 3KB

Downloadandsave

DownloadURLFile.java 740B

FetchLinks

GetWeb.java 2KB

StartDownloading.java 428B

package MultiHandling;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import Downloadandsave.DownloadURLFile;

/**
 * Worker that takes page URLs, one line at a time, from a reader over
 * {@code report.txt} that is shared by all instances, collects the links on
 * each page whose path contains {@link #myDomain}, and passes every collected
 * link to {@link DownloadURLFile#downloadFromUrl}.
 *
 * <p>The input reader is static and guarded by a lock, so several workers can
 * consume the same file without handing out a line twice. The collected links
 * and the current target directory are plain instance fields without
 * synchronization, so each thread should be given its own {@code MultiThread}
 * instance.
 *
 * <p>NOTE(review): the shared reader is never closed; it stays open until the
 * JVM exits.
 */
public class MultiThread implements Runnable {

    /** Path segment that precedes the paper name in every input line. */
    private static final String PAPER_MARKER = "/paper/";

    /** Prefix put in front of every link extracted from a page. */
    private static final String LINK_PREFIX = "https://papers.nips.cc";

    /** Pause between preparing a page and downloading its links, in milliseconds. */
    private static final long PAUSE_MILLIS = 3000L;

    /** Reader over {@code report.txt} shared by all workers; {@code null} if the file is missing. */
    private static final BufferedReader br = openReport();

    /** Links collected from the page currently being processed. */
    private List<String> list;

    /** Sub-directory (the part of the input line after {@code /paper/}) for the current page. */
    private String paperdir = "";

    /** Path fragment a link must contain to be collected; inserted verbatim into the link regex. */
    String myDomain = "/paper/";

    /**
     * Opens the shared input file.
     *
     * @return a reader over {@code report.txt}, or {@code null} when the file does not exist
     */
    private static BufferedReader openReport() {
        try {
            // NOTE(review): a 10-char buffer is unusually small; kept as in the original — confirm intent.
            return new BufferedReader(new FileReader("report.txt"), 10);
        } catch (FileNotFoundException e) {
            e.printStackTrace();
            return null;
        }
    }

    /**
     * Repeatedly claims the next usable line from the shared reader, collects
     * the links of that page, waits {@value #PAUSE_MILLIS} ms and downloads
     * them. Returns when the input is exhausted, when reading the input fails,
     * or when the thread is interrupted.
     */
    public void run() {
        if (br == null) {
            // The input file could not be opened; locking on null would throw.
            System.err.println("report.txt could not be opened; nothing to download");
            return;
        }

        while (true) {
            this.list = new ArrayList<String>();
            boolean claimed = false;

            synchronized (br) {
                try {
                    String line;
                    while ((line = br.readLine()) != null) {
                        int markerAt = line.indexOf(PAPER_MARKER);
                        if (markerAt < 0) {
                            // Blank or malformed line: there is no paper name to derive a directory from.
                            if (line.trim().length() > 0) {
                                System.err.println("Skipping line without " + PAPER_MARKER + ": " + line);
                            }
                            continue;
                        }
                        paperdir = line.substring(markerAt + PAPER_MARKER.length());
                        System.out.println("===============准备开始下载 " + paperdir + " 的全文及相关资源 ================");
                        prepareDownFiles(line);
                        claimed = true;
                        break;
                    }
                } catch (IOException e) {
                    // A failing reader is treated like end of input so the worker cannot spin forever.
                    e.printStackTrace();
                }
            }

            if (!claimed) {
                return;
            }

            try {
                Thread.sleep(PAUSE_MILLIS);
                downFiles(this.list);
            } catch (InterruptedException e) {
                // Keep the interrupt visible to whoever owns this thread and stop working.
                Thread.currentThread().interrupt();
                return;
            }
        }
    }

    /**
     * Hands every link in {@code list} to a fresh {@link DownloadURLFile},
     * using {@code "Test" + File.separator + paperdir} as the second argument
     * (presumably the directory the file is saved to — confirm against
     * {@code DownloadURLFile}).
     *
     * @param list links to download
     * @throws InterruptedException declared for callers; kept from the original signature
     */
    public void downFiles(List<String> list) throws InterruptedException {
        String targetDir = "Test" + File.separator + paperdir;
        for (String str : list) {
            System.out.println("===============正在下载 " + str + " ================");
            DownloadURLFile a = new DownloadURLFile();
            a.downloadFromUrl(str, targetDir);
        }
    }

    /**
     * Reads the page at {@code line} and appends, for every {@code href}
     * value containing {@link #myDomain}, {@code "https://papers.nips.cc"}
     * followed by that value to the instance's link list. Malformed URLs and
     * I/O failures are reported on standard error and leave the links
     * gathered so far in place.
     *
     * @param line URL of the page to scan
     */
    public void prepareDownFiles(String line) {
        if (list == null) {
            // Allows the method to be called directly, before run() has created the list.
            list = new ArrayList<String>();
        }

        // Compiled once per page instead of once per page line.
        String regUrl = "(?<=(href=)[\"]?[\']?)[^\\s\"\'\\?]*(" + myDomain + ")[^\\s\"\'>]*";
        Pattern linkPattern = Pattern.compile(regUrl, Pattern.CASE_INSENSITIVE);

        InputStream is = null;
        try {
            is = new URL(line).openStream();
            // Decode explicitly as UTF-8 so the result does not depend on the platform default charset.
            BufferedReader pageReader = new BufferedReader(new InputStreamReader(is, "UTF-8"));
            String rLine;
            while ((rLine = pageReader.readLine()) != null) {
                Matcher m = linkPattern.matcher(rLine);
                while (m.find()) {
                    list.add(LINK_PREFIX + m.group(0));
                    // Continue behind this match. With the default opaque region bounds the
                    // look-behind cannot see text before the region start, which matches
                    // re-running the pattern on the remaining substring.
                    m.region(m.end(), rLine.length());
                }
            }
        } catch (MalformedURLException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            if (is != null) {
                try {
                    is.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
    }
}

评论收藏

内容反馈