import java.io.*;
import java.net.HttpURLConnection;
import java.net.InetAddress;
import java.net.Socket;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Downloads a web page over a raw HTTP/1.0 socket connection, stores it on disk,
 * and extracts the page title and Chinese text from the stored copy.
 *
 * <p>All files are written below two fixed directories on drive E (see
 * {@link #HTML_DIR} and {@link #CONTEXT_DIR}); both directories must already exist.
 * I/O failures in {@link #SaveHTML} and {@link #getContext} are printed to standard
 * error and otherwise ignored, so callers receive no exception from them.
 */
public class DownLoadFile
{
    /** Directory that receives the raw downloaded pages. */
    private static final String HTML_DIR = "E:\\savehtml\\";

    /** Directory that receives the extracted title, text and URL files. */
    private static final String CONTEXT_DIR = "E:\\saveContext\\";

    /** Port used when the server string carries no explicit port. */
    private static final int DEFAULT_HTTP_PORT = 80;

    /** Matches a complete title element that sits on a single line. */
    private static final Pattern TITLE_PATTERN =
            Pattern.compile("<title>.*?</title>", Pattern.CANON_EQ);

    /** Matches a run of one or more CJK ideographs. */
    private static final Pattern CHINESE_PATTERN =
            Pattern.compile("([\u4e00-\u9fa5]+)", Pattern.DOTALL);

    /**
     * Fetches a page with a hand-written HTTP/1.0 GET request and saves the response
     * to {@code <HTML_DIR><filename>.html}.
     *
     * <p>The response is copied line by line exactly as the original implementation
     * did: line terminators are dropped and the HTTP status line and headers are
     * written to the file together with the body.
     * NOTE(review): this looks unintended, but {@link #getContext} reads this file
     * line by line, so the format is kept unchanged here.
     *
     * @param strServer host name or IP address, optionally followed by {@code :port};
     *                  it is also sent verbatim in the Host header
     * @param strPage   request path, for example {@code /} for the root page
     * @param filename  base name (without extension) of the file to create
     * @param charset   name of the charset used to decode the response
     */
    public void SaveHTML(String strServer,String strPage,String filename,String charset)
    {
        try
        {
            String host = strServer;
            int port = DEFAULT_HTTP_PORT;

            // Accept "host:port" for names as well as IPv4 literals. Strings with
            // more than one colon (for example IPv6 literals) are passed through whole.
            int colon = strServer.indexOf(':');
            if (colon >= 0 && colon == strServer.lastIndexOf(':'))
            {
                host = strServer.substring(0, colon);
                port = Integer.parseInt(strServer.substring(colon + 1));
            }
            InetAddress addr = InetAddress.getByName(host);

            // Resources are closed in reverse order: file, request writer, response
            // reader, socket. This also releases the socket on every failure path.
            try (Socket socket = new Socket(addr, port);
                 BufferedReader response = new BufferedReader(
                         new InputStreamReader(socket.getInputStream(), charset));
                 BufferedWriter request = new BufferedWriter(
                         new OutputStreamWriter(socket.getOutputStream(), "UTF-8"));
                 FileWriter saved = new FileWriter(HTML_DIR + filename + ".html"))
            {
                // Minimal HTTP/1.0 request; the blank line ends the header section.
                request.write("GET " + strPage + " HTTP/1.0\r\n");
                request.write("HOST:" + strServer + "\r\n");
                request.write("Accept:*/*\r\n");
                request.write("\r\n");
                request.flush();

                String line;
                while ((line = response.readLine()) != null)
                {
                    saved.write(line);
                }
            }
        }
        catch (Exception e)
        {
            // Best effort: report the failure and let the caller continue.
            e.printStackTrace();
        }
    }

    /**
     * Extracts data from a page previously stored by {@link #SaveHTML} and writes
     * three files below {@code CONTEXT_DIR}:
     * <ul>
     *   <li>{@code <output>Title.txt} - every title element found, each followed by a space;</li>
     *   <li>{@code <output>Context.txt} - each run of Chinese characters after it has been
     *       passed through {@code ContextTools.getChinese};</li>
     *   <li>{@code <output>Url.txt} - the given URL, written once.</li>
     * </ul>
     * Every extracted title and text fragment is also printed to standard output.
     *
     * <p>Although the signature declares {@code Exception}, every failure is caught,
     * printed to standard error and swallowed.
     *
     * @param url    URL of the page, recorded in the URL file
     * @param input  base name (without extension) of the stored page to read
     * @param output prefix of the three result file names
     * @throws Exception declared for compatibility with existing callers
     */
    public void getContext(String url,String input,String output) throws Exception
    {
        String pagePath = HTML_DIR + input + ".html";
        try (BufferedReader titlePass = new BufferedReader(new FileReader(pagePath));
             FileWriter titleOut = new FileWriter(CONTEXT_DIR + output + "Title.txt");
             FileWriter urlOut = new FileWriter(CONTEXT_DIR + output + "Url.txt"))
        {
            String line;

            // First pass: collect the titles.
            while ((line = titlePass.readLine()) != null)
            {
                Matcher titles = TITLE_PATTERN.matcher(line);
                while (titles.find())
                {
                    titleOut.write(titles.group() + " ");
                    System.out.println(titles.group());
                }
            }

            // Second pass over the same file: collect the Chinese text.
            try (BufferedReader textPass = new BufferedReader(new FileReader(pagePath));
                 FileWriter textOut = new FileWriter(CONTEXT_DIR + output + "Context.txt"))
            {
                while ((line = textPass.readLine()) != null)
                {
                    Matcher runs = CHINESE_PATTERN.matcher(line);
                    while (runs.find())
                    {
                        // A fresh helper per match, as before; whether ContextTools
                        // keeps state between calls is not visible from this file.
                        ContextTools tools = new ContextTools();
                        String chinese = tools.getChinese(runs.group());
                        textOut.write(chinese);
                        System.out.println(chinese);
                    }
                }
            }

            // Record the URL exactly once (it used to be repeated for every input line).
            urlOut.write(url);
        }
        catch (Exception e)
        {
            // Best effort: report the failure and let the caller continue.
            e.printStackTrace();
        }
    }

    /**
     * Turns a URL into a string that is usable as a file name.
     *
     * <p>A leading {@code http://} or {@code https://} (matched case-insensitively) is
     * removed when present; a URL without such a prefix is used as is. Afterwards each
     * question mark, slash, colon, asterisk, vertical bar, angle bracket and double
     * quote is replaced by an underscore.
     *
     * @param url the URL to convert; must not be {@code null}
     * @return the sanitized name
     */
    public String getFileNameByUrl(String url)
    {
        String remainder = url;
        if (url.regionMatches(true, 0, "http://", 0, 7))
        {
            remainder = url.substring(7);
        }
        else if (url.regionMatches(true, 0, "https://", 0, 8))
        {
            remainder = url.substring(8);
        }
        return remainder.replaceAll("[\\?/:*|<>\"]", "_");
    }
}