import java.io.BufferedReader;
import java.io.Closeable;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.zip.GZIPInputStream;
/**
 * Small collection of helpers for downloading a web page and pulling pieces
 * out of its markup with regular expressions (title, links, scripts, styles),
 * plus a helper that saves a remote image to disk.
 *
 * <p>The extraction methods are regex based; they are not a real HTML parser.
 */
public class GetHTML {

    /** Directory that {@link #getImg(String, String)} writes downloaded files into. */
    private static final String IMAGE_DIR = "d:/img/";

    /** Size of the copy buffer used when saving an image. */
    private static final int COPY_BUFFER_SIZE = 8192;

    /** Matches a title element; group 1 is the text between the opening and closing tag. */
    private static final Pattern TITLE_PATTERN = Pattern.compile(
            "<title\\b[^>]*>(.*?)</title>", Pattern.CASE_INSENSITIVE | Pattern.DOTALL);

    /** Matches a complete anchor element that carries an href attribute. */
    private static final Pattern LINK_PATTERN = Pattern.compile(
            "<a[^>]*href=(\"([^\"]*)\"|'([^']*)'|([^\\s>]*))[^>]*>(.*?)</a>",
            Pattern.CASE_INSENSITIVE | Pattern.DOTALL);

    /** Matches a complete script element. */
    private static final Pattern SCRIPT_PATTERN = Pattern.compile(
            "<script.*?</script>", Pattern.CASE_INSENSITIVE | Pattern.DOTALL);

    /** Matches a complete style element. */
    private static final Pattern STYLE_PATTERN = Pattern.compile(
            "<style.*?</style>", Pattern.CASE_INSENSITIVE | Pattern.DOTALL);

    /** Matches anything that looks like a tag: a '<', the shortest run of characters, a '>'. */
    private static final Pattern TAG_PATTERN = Pattern.compile("<.*?>");

    /**
     * Downloads the whole content of a web page.
     *
     * <p>The response is decoded as UTF-8 and read line by line; the lines are
     * concatenated without their line terminators, so the result is a single
     * line of text.
     *
     * @param htmlurl address of the page to read
     * @return the page content with all line terminators removed
     * @throws MalformedURLException if {@code htmlurl} is not a valid URL
     * @throws IOException if the page cannot be opened or read
     */
    public String getOneHtml(String htmlurl) throws IOException {
        StringBuilder content = new StringBuilder();
        BufferedReader reader = null;
        try {
            URL url = new URL(htmlurl);
            reader = new BufferedReader(new InputStreamReader(url.openStream(), "utf-8"));
            String line;
            while ((line = reader.readLine()) != null) {
                content.append(line);
            }
        } catch (MalformedURLException me) {
            System.out.println("你输入的URL格式有问题!请仔细输入");
            throw me;
        } catch (IOException e) {
            e.printStackTrace();
            throw e;
        } finally {
            // Release the connection even when reading fails half way through.
            closeQuietly(reader);
        }
        return content.toString();
    }

    /**
     * Extracts the page title.
     *
     * <p>The title tag is matched case-insensitively, may carry attributes and
     * may span several lines. If the input contains more than one title element
     * their texts are concatenated. Any tags nested inside the title are removed
     * with {@link #outTag(String)}.
     *
     * @param s markup to search; must not be {@code null}
     * @return the title text, or an empty string when no title element is found
     */
    public String getTitle(String s) {
        StringBuilder title = new StringBuilder();
        Matcher matcher = TITLE_PATTERN.matcher(s);
        while (matcher.find()) {
            title.append(matcher.group(1));
        }
        return outTag(title.toString());
    }

    /**
     * Collects every shortest stretch of text that starts with {@code begin}
     * and ends with {@code end}, delimiters included, and concatenates them.
     *
     * <p>Both delimiters are inserted into a regular expression as they are, so
     * they are interpreted as regex fragments; quote them with
     * {@link Pattern#quote(String)} if they should be taken literally. The
     * match does not cross line terminators.
     *
     * @param s text to search; must not be {@code null}
     * @param begin regex fragment marking the start of a match
     * @param end regex fragment marking the end of a match
     * @return all matches joined together, or an empty string when there is none
     */
    public String getString(String s, String begin, String end) {
        Pattern pattern = Pattern.compile(begin + ".*?" + end, Pattern.CANON_EQ);
        StringBuilder joined = new StringBuilder();
        for (String match : findAll(pattern, s)) {
            joined.append(match);
        }
        return joined.toString();
    }

    /**
     * Finds all anchor elements that have an href attribute.
     *
     * @param s markup to search; must not be {@code null}
     * @return the complete text of every matching anchor element (opening tag,
     *         content and closing tag), in document order; empty when there is none
     */
    public List<String> getLink(String s) {
        return findAll(LINK_PATTERN, s);
    }

    /**
     * Finds all script elements.
     *
     * @param s markup to search; must not be {@code null}
     * @return the complete text of every script element, in document order;
     *         empty when there is none
     */
    public List<String> getScript(String s) {
        return findAll(SCRIPT_PATTERN, s);
    }

    /**
     * Finds all style elements.
     *
     * @param s markup to search; must not be {@code null}
     * @return the complete text of every style element, in document order;
     *         empty when there is none
     */
    public List<String> getCSS(String s) {
        return findAll(STYLE_PATTERN, s);
    }

    /**
     * Removes everything that looks like a tag, i.e. each shortest run of
     * characters enclosed in {@code <} and {@code >} on a single line.
     *
     * @param s text to clean; must not be {@code null}
     * @return {@code s} without the tags
     */
    public String outTag(String s) {
        return TAG_PATTERN.matcher(s).replaceAll("");
    }

    /**
     * Removes every shortest stretch of text that starts with {@code begain}
     * and ends with {@code end}, delimiters included.
     *
     * <p>Both delimiters are interpreted as regex fragments.
     *
     * @param s text to clean; must not be {@code null}
     * @param begain regex fragment marking the start of a stretch to remove
     * @param end regex fragment marking the end of a stretch to remove
     * @return {@code s} without the matched stretches
     */
    public String outString(String s, String begain, String end) {
        return s.replaceAll(begain + ".*?" + end, "");
    }

    /**
     * Replaces every match of {@code s1} in {@code s} with {@code s2}.
     *
     * <p>This delegates to {@link String#replaceAll(String, String)}:
     * {@code s1} is a regular expression and {@code s2} is a replacement
     * string in which {@code $} and {@code \} have a special meaning.
     *
     * @param s text to work on; must not be {@code null}
     * @param s1 regular expression to look for
     * @param s2 replacement string
     * @return the text after replacement
     */
    public String reString(String s, String s1, String s2) {
        return s.replaceAll(s1, s2);
    }

    /**
     * Downloads a resource over HTTP and stores it as {@code d:/img/<name>}.
     *
     * <p>The request advertises gzip support, so a response that arrives
     * gzip-encoded is decompressed before it is written. Failures are reported
     * on standard error and signalled through the return value; no exception is
     * thrown for I/O problems.
     *
     * @param imgUrl address of the resource; must be an HTTP(S) URL
     * @param name file name to store the resource under inside the image directory
     * @return {@code name} when the file was written completely, {@code null}
     *         when the download or the write failed
     */
    public String getImg(String imgUrl, String name) {
        InputStream in = null;
        FileOutputStream out = null;
        try {
            HttpURLConnection conn = (HttpURLConnection) new URL(imgUrl).openConnection();
            conn.setRequestProperty("Accept-Encoding", "gzip");
            conn.setRequestProperty("referer", "http://www.da-pei.com/");
            conn.setRequestProperty("cookie", "http://www.da-pei.com/");
            in = conn.getInputStream();
            // We asked for gzip, so the payload must be decoded when the server
            // honours it; otherwise the saved file would be compressed bytes.
            if ("gzip".equalsIgnoreCase(conn.getContentEncoding())) {
                in = new GZIPInputStream(in);
            }
            out = new FileOutputStream(IMAGE_DIR + name);
            byte[] buffer = new byte[COPY_BUFFER_SIZE];
            int read;
            while ((read = in.read(buffer)) != -1) {
                out.write(buffer, 0, read);
            }
            // Close explicitly on the success path so a failing close is reported.
            out.close();
            return name;
        } catch (IOException e) {
            e.printStackTrace();
            return null;
        } finally {
            closeQuietly(out);
            closeQuietly(in);
        }
    }

    /** Returns every match of {@code pattern} in {@code s}, in order of appearance. */
    private static List<String> findAll(Pattern pattern, String s) {
        List<String> matches = new ArrayList<String>();
        Matcher matcher = pattern.matcher(s);
        while (matcher.find()) {
            matches.add(matcher.group());
        }
        return matches;
    }

    /** Closes {@code resource} if it is not null, ignoring any failure to close. */
    private static void closeQuietly(Closeable resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (IOException ignored) {
            // Nothing useful can be done about a failed close during cleanup.
        }
    }
}