package MultiHandling;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import Downloadandsave.DownloadURLFile;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
/**
 * Worker that takes page URLs, one per line, from a shared {@code report.txt}
 * reader, collects the links found on each page and hands them to
 * {@link DownloadURLFile}.
 *
 * <p>The reader is static, so every instance (and therefore every worker
 * thread) consumes lines from the same file; each line is handed to exactly
 * one worker.
 *
 * <p>NOTE(review): {@code list} and {@code paperdir} are per-instance state
 * that {@link #run()} overwrites for every line. This looks like it expects
 * one instance per thread - confirm against the code that starts the threads.
 */
public class MultiThread implements Runnable {
/** Marker that precedes the paper directory name inside an input line. */
private static final String PAPER_MARKER = "/paper/";
/** Pause between fetching a page and downloading its links, in milliseconds. */
private static final long PAUSE_BEFORE_DOWNLOAD_MS = 3000;
/**
 * Guards every access to {@link #br}. A dedicated lock object is used because
 * {@code br} itself may be null (file missing, or already closed) and
 * synchronizing on null throws NullPointerException.
 */
private static final Object READ_LOCK = new Object();
/** Shared input reader; null when report.txt could not be opened or has been closed. */
private static BufferedReader br = null;
/** Links collected for the line currently being processed. */
private List<String> list;
/** Directory name derived from the current line (the text after "/paper/"). */
private String paperdir = "";
/** Regex fragment a link must contain to be collected. */
String myDomain = "/paper/";
static {
try {
br = new BufferedReader(new FileReader("report.txt"), 10);
} catch (FileNotFoundException e) {
// br stays null; nextLine() then reports end of input immediately.
e.printStackTrace();
}
}
/**
 * Processes input lines until the shared reader is exhausted, fails, or the
 * thread is interrupted. For every usable line the page is fetched, the
 * worker pauses, and the collected links are downloaded.
 */
public void run() {
String line;
while ((line = nextLine()) != null) {
int markerAt = line.indexOf(PAPER_MARKER);
if (markerAt < 0) {
// Without the marker no target directory can be derived; blank lines
// are skipped silently, anything else is reported.
if (line.trim().length() > 0) {
System.err.println("Skipping line without " + PAPER_MARKER + ": " + line);
}
continue;
}
paperdir = line.substring(markerAt + PAPER_MARKER.length());
System.out.println("===============准备开始下载 " + paperdir + " 的全文及相关资源 ================");
this.list = new ArrayList<String>();
// The page fetch happens outside READ_LOCK so that slow network I/O in
// one worker does not block the others from taking their next line.
prepareDownFiles(line);
try {
Thread.sleep(PAUSE_BEFORE_DOWNLOAD_MS);
downFiles(this.list);
} catch (InterruptedException e) {
// Restore the flag for whoever owns this thread and stop working.
Thread.currentThread().interrupt();
return;
}
}
}
/**
 * Downloads every URL in {@code list} into {@code Test/<paperdir>}.
 *
 * @param list URLs to download; a null list is treated as empty
 * @throws InterruptedException declared for callers; propagated if the download is interrupted
 */
public void downFiles(List<String> list) throws InterruptedException {
if (list == null) {
return;
}
String targetDir = "Test" + File.separator + paperdir;
for (String str : list) {
System.out.println("===============正在下载 " + str + " ================");
DownloadURLFile a = new DownloadURLFile();
a.downloadFromUrl(str, targetDir);
}
}
/**
 * Reads the page at {@code line} and appends to the instance's link list
 * every {@code href} target containing {@link #myDomain}, prefixed with
 * {@code https://papers.nips.cc}. Malformed URLs and I/O failures are
 * printed and leave the links gathered so far in place.
 *
 * @param line URL of the page to scan
 */
public void prepareDownFiles(String line) {
if (list == null) {
list = new ArrayList<String>();
}
BufferedReader bReader = null;
try {
URL url = new URL(line);
// Compiled once per page rather than once per line of the page.
String regUrl = "(?<=(href=)[\"]?[\']?)[^\\s\"\'\\?]*(" + myDomain + ")[^\\s\"\'>]*";
Pattern linkPattern = Pattern.compile(regUrl, Pattern.CASE_INSENSITIVE);
bReader = new BufferedReader(new InputStreamReader(url.openStream()));
String rLine;
while ((rLine = bReader.readLine()) != null) {
collectLinks(linkPattern, rLine, list);
}
} catch (MalformedURLException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
} finally {
// Closing the reader also closes the underlying URL stream, on
// success and on failure alike.
if (bReader != null) {
try {
bReader.close();
} catch (IOException e) {
e.printStackTrace();
}
}
}
}
/** Appends every link matched by {@code linkPattern} in {@code htmlLine} to {@code sink}. */
private static void collectLinks(Pattern linkPattern, String htmlLine, List<String> sink) {
String rest = htmlLine;
Matcher m = linkPattern.matcher(rest);
while (m.find()) {
sink.add("https://papers.nips.cc" + m.group(0));
// Matching restarts on the remainder so the look-behind cannot reach
// back into text that belongs to an already-collected link.
rest = rest.substring(m.end());
m = linkPattern.matcher(rest);
}
}
/**
 * Returns the next line of the shared input, or null when the input is
 * missing, exhausted, or failed. The reader is closed on end of input or on
 * error, so later calls keep returning null instead of failing again.
 */
private static String nextLine() {
synchronized (READ_LOCK) {
if (br == null) {
return null;
}
try {
String next = br.readLine();
if (next == null) {
closeReader();
}
return next;
} catch (IOException e) {
e.printStackTrace();
closeReader();
return null;
}
}
}
/** Closes and clears the shared reader; must be called while holding READ_LOCK. */
private static void closeReader() {
try {
br.close();
} catch (IOException e) {
e.printStackTrace();
}
br = null;
}
}
非常实用的 Java 多线程网络爬虫 (非常适合正在学习网络爬虫的初学者)
需积分: 50 40 浏览量
2017-12-22
11:27:58
上传
评论 1
收藏 4KB RAR 举报
wuxinxing1981
- 粉丝: 3
- 资源: 30
最新资源
- 正点原子开拓者FPGA多人表决器代码项目
- EOP-Last5Years.txt
- windows 32位、64位系统常见缺少的库
- 毕业设计基于springboot+vue实现的求职招聘类型网站源码+数据库(高分项目).zip
- 535springboot + vue 体质测试数据分析及可视化设计.zip(可运行源码+数据库文件+文档)
- python毕业设计-基于Django+OpenCV的二维码生成与识别系统源码.zip
- 基于springboot+vue实现的求职招聘类型网站源代码+数据库(优质毕设项目).zip
- iOS APP提审checklist
- 第十四届中北大学ACM程序设计竞赛.zip
- UIGF_200852355_202404242026.json
资源上传下载、课程学习等过程中有任何疑问或建议,欢迎提出宝贵意见哦~我们会及时处理!
点击此处反馈