package cn.edu.hfut.dmic.webcollector.example;
import java.io.FileWriter;
import java.io.IOException;
import java.util.Random;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import cn.edu.hfut.dmic.webcollector.model.CrawlDatums;
import cn.edu.hfut.dmic.webcollector.model.Page;
import cn.edu.hfut.dmic.webcollector.plugin.berkeley.BreadthCrawler;
/**
 * Demo crawler for btann.com.
 *
 * <p>Starting from the home page, it takes the text of the first link inside
 * {@code div.info-box} as a search keyword, then walks the paginated search
 * results for that keyword. Every result item (title, collection text,
 * attribute spans and download links) is echoed to standard output and
 * accumulated in memory so that {@link #main(String[])} can dump everything
 * into a text file once the crawl has finished.
 *
 * <p>{@link #main(String[])} configures 200 crawler threads, so
 * {@link #visit(Page, CrawlDatums)} may be invoked concurrently. Each page is
 * therefore rendered into a local buffer first and appended to the shared
 * buffer in one locked step, which keeps the text of one page contiguous.
 */
public class DemoBTannCrawler extends BreadthCrawler {

    private static final String HOME_URL = "http://www.btann.com/";
    private static final String SEARCH_URL_PREFIX = "http://www.btann.com/search/";
    private static final String USER_AGENT =
            "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.4; en-US; rv:1.9.2.2) Gecko/20100316 Firefox/3.6.2";

    /** Accumulated text of all visited search pages; guarded by its own monitor. */
    private final StringBuilder sb = new StringBuilder();

    /**
     * Creates the crawler; both arguments are passed straight to the superclass constructor.
     *
     * @param crawlPath forwarded to {@code BreadthCrawler}
     * @param autoParse forwarded to {@code BreadthCrawler}
     */
    public DemoBTannCrawler(String crawlPath, boolean autoParse) {
        super(crawlPath, autoParse);
    }

    /**
     * Returns a snapshot of everything collected so far.
     *
     * @return the accumulated result text, never {@code null}
     */
    public String getSb() {
        synchronized (sb) {
            return sb.toString();
        }
    }

    /**
     * Handles one fetched page: search result pages are scraped and paginated,
     * the home page is used to pick the search keyword, anything else is ignored.
     *
     * @param page the fetched page
     * @param next receives the follow-up URLs to crawl
     */
    @Override
    public void visit(Page page, CrawlDatums next) {
        System.out.println("###################################################"+page.getUrl()+"###################################################");
        if (page.getUrl().contains("search")) {
            visitSearchPage(page, next);
        } else if (HOME_URL.equals(page.getUrl())) {
            visitHomePage(page, next);
        }
    }

    /** Scrapes every result item of a search page and enqueues the following page, if any. */
    private void visitSearchPage(Page page, CrawlDatums next) {
        String keyword = page.meta("keyword");
        System.out.println("搜索的关键词" + keyword);

        Elements pager = page.select("div[class=bottom-pager]");
        int countPage = parseLastPage(pager);
        int currentPage = parseCurrentPage(pager);

        // Render the whole page locally so concurrent visits cannot interleave their text.
        StringBuilder pageText = new StringBuilder();
        for (Element item : page.select("div[class=search-item]")) {
            appendItem(item, pageText);
        }

        if (countPage < 0 || currentPage < 0) {
            // Without usable pager data the next page number is unknown; keep the items, stop paging.
            System.err.println("无法解析分页信息,跳过翻页:" + page.getUrl());
        } else {
            // Only follow pagination while there is a later page to fetch.
            if (currentPage < countPage) {
                enqueueSearch(next, keyword, currentPage + 1);
            }
            String summary = "*******************************共"+countPage+"页,当前第"+currentPage+"页*****************************************";
            System.out.println(summary);
            pageText.append(summary).append("\n\n");
        }

        synchronized (sb) {
            sb.append(pageText);
        }
    }

    /** Reads the search keyword from the home page and enqueues the first search result page. */
    private void visitHomePage(Page page, CrawlDatums next) {
        Element firstLink = page.select("div[class=info-box]").select("a").first();
        if (firstLink == null) {
            // No link to derive a keyword from, so there is nothing to search for.
            return;
        }
        String keyWord = firstLink.text().replace("新", "");
        System.out.println(keyWord + "-------------------------");
        enqueueSearch(next, keyWord, 1);
    }

    /** Adds the search URL for the given keyword and page number and attaches the request metadata. */
    private static void enqueueSearch(CrawlDatums next, String keyword, int pageNumber) {
        next.add(SEARCH_URL_PREFIX + keyword + "-first-asc-" + pageNumber);
        next.meta("User-Agent", USER_AGENT);
        next.meta("keyword", keyword);
    }

    /** Prints one search result item and appends the same lines to {@code out}. */
    private static void appendItem(Element item, StringBuilder out) {
        String title = item.select("div[class=item-title]").select("a").text();
        String collection = item.select("div[class=item-list]").select("p").text();
        System.out.println("标题:" + title);
        System.out.println("集合:" + collection);
        out.append(title).append('\n');
        out.append(collection).append('\n');

        for (Element bar : item.select("div[class=item-bar]")) {
            // Attribute spans: the first <b> holds the value, the remaining span text is the label.
            for (Element span : bar.select("span")) {
                Element bold = span.select("b").first();
                String value = bold == null ? "" : bold.text();
                String label = value.isEmpty() ? span.text() : span.text().replace(value, "");
                String line = label + ":" + value;
                System.out.println(line);
                out.append(line).append('\n');
            }
            // Magnet links and Thunder (Xunlei) links.
            for (Element link : bar.select("a")) {
                String line = link.text() + ":" + link.attr("href");
                System.out.println(line);
                out.append(line).append('\n');
            }
        }
        System.out.println("==============================================================================================");
        out.append("\n\n");
    }

    /**
     * Extracts the page number that follows the last {@code '-'} in the href of
     * the pager's last link.
     *
     * @return the parsed number, or {@code -1} if it is missing or not numeric
     */
    private static int parseLastPage(Elements pager) {
        Element lastLink = pager.select("a").last();
        if (lastLink == null) {
            return -1;
        }
        String[] parts = lastLink.attr("href").split("-");
        if (parts.length == 0) {
            // String.split drops trailing empty strings, so an href made only of '-' yields no parts.
            return -1;
        }
        return parseIntOrNegative(parts[parts.length - 1]);
    }

    /**
     * Extracts the current page number from the text of the pager's first {@code span}.
     *
     * @return the parsed number, or {@code -1} if it is missing or not numeric
     */
    private static int parseCurrentPage(Elements pager) {
        Element current = pager.select("span").first();
        return current == null ? -1 : parseIntOrNegative(current.text());
    }

    /** Parses a decimal integer, returning {@code -1} instead of throwing on malformed input. */
    private static int parseIntOrNegative(String text) {
        try {
            return Integer.parseInt(text.trim());
        } catch (NumberFormatException e) {
            return -1;
        }
    }

    /** Writes {@code content} to {@code fileName}, closing the writer even when writing fails. */
    private static void writeResult(String fileName, String content) throws IOException {
        try (FileWriter writer = new FileWriter(fileName)) {
            writer.write(content);
        }
    }

    /**
     * Crawls the recommendation shown on the home page and writes the collected
     * text to {@code <random 0-999>.txt} in the working directory.
     *
     * @param args unused
     * @throws Exception if the crawler fails to start or run
     */
    public static void main(String[] args) throws Exception {
        DemoBTannCrawler bt = new DemoBTannCrawler("crawler", true);
        bt.setResumable(true);
        bt.addSeed(HOME_URL);
        bt.setThreads(200);
        bt.start(200);

        int id = new Random().nextInt(1000);
        try {
            System.out.println("正在为您写入文件。。。");
            writeResult(id + ".txt", bt.getSb());
            // Report success only after the file has really been written and closed.
            System.out.println("恭喜您,文件写入完成!,文件名是"+id+".txt");
        } catch (IOException e) {
            e.printStackTrace();
        }
        System.out.println("+++++++++++++++++++++++++++++++++++++++++++++++++++++end+++++++++++++++++++++++++++++++++++++++++++++++++++++");
    }
}