package com.rgy.utils;
import java.util.ArrayList;
import java.util.Stack;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import com.rgy.entity.PageInfo;
/**
 * Static helpers for a small jsoup-based crawler: fetching a page summary
 * ({@link #getPageInfo(String)}), filtering links and keywords, and walking
 * the link graph depth-first ({@link #searchUrl(String)}).
 *
 * <p>The crawl state lives in the public static fields {@link #history_list}
 * and {@link #parent_stack}; nothing in this class synchronizes access to them.
 */
public class PageUtils {

    /** Connection/read timeout, in milliseconds, used when fetching a page. */
    private static final int FETCH_TIMEOUT_MILLIS = 30000;

    /**
     * Matches URLs that end in a file extension which must not be crawled.
     * The dot is escaped so that only a real ".ext" suffix is rejected
     * (an unescaped dot would also reject e.g. ".../annexe").
     */
    private static final Pattern SKIPPED_EXTENSION =
            Pattern.compile(".*\\.(exe|apk|zip|rar|pdf|doc)");

    /** Matches any one of the keywords of interest anywhere in a string. */
    private static final Pattern KEYWORD_PATTERN =
            Pattern.compile("青春|搞笑|微电影|短片|迷你剧|喜剧");

    /** URLs that have already been visited (or scheduled) by {@link #searchUrl(String)}. */
    public static ArrayList<String> history_list = new ArrayList<String>();

    /** Path of URLs from the crawl root down to the page currently being expanded. */
    public static Stack<String> parent_stack = new Stack<String>();

    /** One level of the depth-first traversal: a page's links plus the next index to try. */
    private static final class Frame {
        private final ArrayList<String> links;
        private int cursor;

        Frame(ArrayList<String> links) {
            this.links = links;
        }
    }

    /**
     * Downloads {@code url} and collects its title, keywords meta content and
     * the distinct crawlable links found in its {@code <a>} tags.
     *
     * <p>A single trailing "/" is removed from the URL and from every link.
     * Any failure while fetching or parsing is reported on standard error and
     * swallowed; the returned object then carries only the URL.
     *
     * @param url address of the page to fetch; must not be {@code null}
     * @return the page summary; its link list is left unset (never assigned)
     *         when the page could not be fetched or has no crawlable link
     * @throws NullPointerException if {@code url} is {@code null}
     */
    public static PageInfo getPageInfo(String url) {
        PageInfo info = new PageInfo();
        url = stripTrailingSlash(url);
        info.setUrl(url);
        try {
            Document doc = Jsoup.connect(url).timeout(FETCH_TIMEOUT_MILLIS).get();
            info.setTitle(doc.title());
            info.setKeywords(doc.select("meta[name=keywords]").attr("content"));

            Elements anchors = doc.getElementsByTag("a");
            ArrayList<String> hrefList = new ArrayList<String>();
            for (Element anchor : anchors) {
                String href = stripTrailingSlash(anchor.attr("href"));
                // Keep only crawlable links that have not been collected yet.
                if (linkIsAvailable(href) && !hrefList.contains(href)) {
                    hrefList.add(href);
                }
            }
            // Callers treat an unset list as "no usable links", so only assign
            // the list when at least one link was collected.
            if (!hrefList.isEmpty()) {
                info.setHref_list(hrefList);
            }
        } catch (Exception ex) {
            // Best effort: a page that cannot be loaded must not abort the crawl.
            System.err.println("Failed to load page: " + url);
            ex.printStackTrace();
        }
        return info;
    }

    /**
     * Tells whether a link should be crawled: it must start with
     * {@code "http://"} and must not end in one of the extensions
     * .exe, .apk, .zip, .rar, .pdf or .doc.
     *
     * @param url the link to check; {@code null} is treated as not available
     * @return {@code true} if the link may be crawled
     */
    public static boolean linkIsAvailable(String url) {
        if (url == null || !url.startsWith("http://")) {
            return false;
        }
        return !SKIPPED_EXTENSION.matcher(url).matches();
    }

    /**
     * Tells whether the given keywords text contains at least one of the
     * keywords of interest.
     *
     * @param keywords the keywords text of a page; may be {@code null}
     * @return {@code true} if any keyword of interest occurs in the text,
     *         {@code false} otherwise or when {@code keywords} is {@code null}
     */
    public static boolean keywordsIsAvailable(String keywords) {
        if (keywords == null) {
            return false;
        }
        // find() also works when the text contains line breaks.
        return KEYWORD_PATTERN.matcher(keywords).find();
    }

    /**
     * Crawls depth-first starting at {@code url}, visiting every reachable
     * crawlable link once (links already in {@link #history_list} are skipped)
     * and printing "count-->url" for each visited page. Pages are visited in
     * pre-order, following each page's links in the order they were collected.
     * A completion message is printed when the traversal is exhausted.
     *
     * <p>The traversal is iterative, so its depth is not limited by the thread
     * stack. {@link #parent_stack} holds the path to the page being expanded
     * and is unwound as pages are finished. Callers may pre-seed
     * {@link #history_list} and {@link #parent_stack} with the start URL; if
     * they do not, it is done here. Keyword matching via
     * {@link #keywordsIsAvailable(String)} is not applied during the crawl.
     *
     * @param url the start URL; must not be {@code null}
     * @throws NullPointerException if {@code url} is {@code null}
     */
    public static void searchUrl(String url) {
        if (url == null) {
            throw new NullPointerException("url");
        }
        if (!history_list.contains(url)) {
            history_list.add(url);
        }
        if (parent_stack.isEmpty()) {
            parent_stack.push(url);
        }

        Stack<Frame> frames = new Stack<Frame>();
        frames.push(openFrame(url));
        while (!frames.isEmpty()) {
            String next = nextUnvisited(frames.peek());
            if (next == null) {
                // Page exhausted (or unusable): go back to its parent.
                frames.pop();
                if (!parent_stack.isEmpty()) {
                    parent_stack.pop();
                }
                continue;
            }
            history_list.add(next);
            parent_stack.push(next);
            frames.push(openFrame(next));
        }
        System.out.println("Yir,爬虫1号已完成任务!!!");
    }

    /**
     * Prints every crawlable link found on the page at {@code url}, one per
     * line. Prints nothing when the page could not be fetched or has no
     * crawlable link.
     *
     * @param url address of the page to inspect; must not be {@code null}
     */
    public static void hrefShow(String url) {
        ArrayList<String> hrefList = getPageInfo(url).getHref_list();
        if (hrefList == null) {
            return;
        }
        for (String href : hrefList) {
            System.out.println(href);
        }
    }

    /** Removes one trailing "/" from {@code text}, if present. */
    private static String stripTrailingSlash(String text) {
        if (text.endsWith("/")) {
            return text.substring(0, text.length() - 1);
        }
        return text;
    }

    /** Fetches {@code url}, prints the progress line and wraps its links in a traversal frame. */
    private static Frame openFrame(String url) {
        PageInfo info = getPageInfo(url);
        System.out.println(history_list.size() + "-->" + url);
        return new Frame(info.getHref_list());
    }

    /** Advances the frame to its next link not yet in the history; returns {@code null} when none is left. */
    private static String nextUnvisited(Frame frame) {
        while (frame.links != null && frame.cursor < frame.links.size()) {
            String candidate = frame.links.get(frame.cursor++);
            if (!history_list.contains(candidate)) {
                return candidate;
            }
        }
        return null;
    }
}