package haitou.proxy;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLDecoder;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.List;
import org.dom4j.Document;
import org.dom4j.DocumentException;
import org.dom4j.DocumentHelper;
import org.dom4j.Element;
import haitou.db.DBAccess;
/**
 * Downloads the page at {@link #HAI_TOU_COM}, extracts the first HTML table it
 * contains, parses that table as XML and stores one record per data row through
 * {@code DBAccess.add}.
 */
public class SearchInfoToDB {

    // Database connection settings.
    // NOTE(review): credentials are hard-coded in source; consider moving them to
    // external configuration.
    private static final String dbURL = "jdbc:mysql://localhost:3306/test";
    private static final String dbUser = "troy";
    private static final String dbPassword = "3031801";
    private static final String dbDriverClassPath = "org.gjt.mm.mysql.Driver";

    /** Pause between wake-ups of the main loop, in milliseconds (one hour). */
    private static final int SLEEP_TIME = 1000 * 60 * 60;

    /** Address of the page that is downloaded and parsed. */
    private static final String HAI_TOU_COM = "http://xjh.haitou.cc";

    /** Name of the table handed to {@code DBAccess}. */
    private static final String tableName = "workInfo";

    /**
     * Entry point: prepares the database, fetches and stores the listing once,
     * then keeps the process alive by sleeping in a loop.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        // Set up the database; null means the driver or the connection failed.
        DBAccess db = prepareDB();
        if (db == null) {
            return;
        }

        fetchAndStore(db);

        // NOTE(review): the original comments say the database should be refreshed
        // every hour, but the loop has only ever slept. Whether re-fetching is safe
        // (e.g. duplicate rows) depends on DBAccess.add, which is not visible here,
        // so the fetch-once behaviour is preserved.
        while (true) {
            try {
                Thread.sleep(SLEEP_TIME);
            } catch (InterruptedException e) {
                // Restore the interrupt flag and stop instead of swallowing the
                // request; retrying sleep() would otherwise ignore it forever.
                Thread.currentThread().interrupt();
                return;
            }
        }
    }

    /**
     * Performs one HTTP GET of {@link #HAI_TOU_COM}, parses the response with
     * {@link #parseHTML(BufferedReader)} and adds every parsed record to {@code db}.
     * I/O failures are printed and otherwise ignored.
     */
    private static void fetchAndStore(DBAccess db) {
        URL haitouURL;
        try {
            haitouURL = new URL(HAI_TOU_COM);
        } catch (MalformedURLException e) {
            // Malformed URL: nothing can be fetched, so stop here rather than
            // dereferencing a null URL below.
            e.printStackTrace();
            return;
        }

        HttpURLConnection connection = null;
        try {
            connection = (HttpURLConnection) haitouURL.openConnection();
            // Must not be omitted: without a user-agent the server answers 403.
            connection.setRequestProperty("user-agent",
                    "Mozilla/5.0 (Windows NT 6.1; rv:2.0b11) Gecko/20100101 Firefox/4.0b11");
            connection.connect();
            if (connection.getResponseCode() != HttpURLConnection.HTTP_OK) {
                return;
            }

            // NOTE(review): the original comment mentions gb2312, but no charset is
            // passed, so the platform default is used. Confirm the page encoding and
            // pass it explicitly.
            try (BufferedReader htmlReader = new BufferedReader(
                    new InputStreamReader(connection.getInputStream()))) {
                ArrayList<CompanyInfo> workInfo = parseHTML(htmlReader);
                if (workInfo != null) {
                    for (CompanyInfo ci : workInfo) {
                        String[] insertValue = { ci.getCompanyName(),
                                ci.getHoldAddress(), ci.getHoldTime(),
                                ci.getPublishTime() };
                        db.add(insertValue);
                    }
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            if (connection != null) {
                connection.disconnect();
            }
        }
    }

    /**
     * Loads the JDBC driver and opens the database connection.
     *
     * @return a {@code DBAccess} bound to the open connection and {@code tableName},
     *         or {@code null} when the driver class cannot be loaded or the
     *         connection cannot be established
     */
    public static DBAccess prepareDB() {
        try {
            // Load the driver through Class.forName.
            Class.forName(dbDriverClassPath);
        } catch (ClassNotFoundException e) {
            e.printStackTrace();
            return null;
        }

        Connection conn;
        try {
            conn = DriverManager.getConnection(dbURL, dbUser, dbPassword);
        } catch (SQLException e) {
            // Connection error: report null so callers' null check catches it
            // instead of receiving a DBAccess wrapping a null connection.
            e.printStackTrace();
            return null;
        }
        return new DBAccess(conn, tableName);
    }

    /**
     * Reads HTML from {@code htmlReader}, collects the text from the first line
     * containing {@code <table} through the first later line containing
     * {@code </table>}, parses it as XML and converts every child element of the
     * root except the first (treated as the header row) into a {@code CompanyInfo}.
     *
     * @param htmlReader source of the HTML text; not closed by this method
     * @return the parsed records, or {@code null} when the collected text is not
     *         well-formed XML
     * @throws IOException if reading fails or the UTF-8 decoder is unavailable
     */
    public static ArrayList<CompanyInfo> parseHTML(BufferedReader htmlReader)
            throws IOException {
        StringBuilder tableXml = new StringBuilder();
        String line;

        // Skip ahead to the line that opens the table.
        while ((line = htmlReader.readLine()) != null) {
            if (line.contains("<table")) {
                System.out.println(line);
                tableXml.append(line);
                break;
            }
        }

        // Collect the table body up to and including the closing tag.
        while ((line = htmlReader.readLine()) != null) {
            // Special-case clean-ups so the fragment parses as XML.
            line = line.replace(" ", "");
            if (line.contains("122681.html")) {
                line = line.replace("&&", "&");
            }
            if (line.contains("122673.html")) {
                line = line.replace("24066;\"", "24066;");
                line = line.replace("23703;\"", "23703;");
            }
            tableXml.append(line);
            if (line.contains("</table>")) {
                break;
            }
        }

        Document document;
        try {
            document = DocumentHelper.parseText(tableXml.toString());
        } catch (DocumentException e) {
            return null;
        }

        @SuppressWarnings("unchecked")
        List<Element> rows = document.getRootElement().elements();

        ArrayList<CompanyInfo> workInfo = new ArrayList<>();
        // Index 0 is skipped: the first child is treated as the header row.
        for (int r = 1; r < rows.size(); r++) {
            workInfo.add(parseRow(rows.get(r)));
        }
        return workInfo;
    }

    /**
     * Builds one {@code CompanyInfo} from a row element. Cells are counted from 1:
     * cell 2 supplies the company name (text of its first child element), cell 3
     * the hold time, cell 4 the hold address and cell 5 the publish time. Other
     * cells are ignored.
     */
    private static CompanyInfo parseRow(Element row) throws IOException {
        CompanyInfo ci = new CompanyInfo();
        @SuppressWarnings("unchecked")
        List<Element> cells = row.elements();

        int column = 1;
        for (Element cell : cells) {
            switch (column) {
            case 2:
                List<?> children = cell.elements();
                // Fall back to the cell's own text when it has no child element
                // rather than failing with IndexOutOfBoundsException.
                String name = children.isEmpty()
                        ? cell.getText()
                        : ((Element) children.get(0)).getText();
                ci.setCompanyName(decodeCell(name));
                break;
            case 3:
                ci.setHoldTime(decodeCell(cell.getText()));
                break;
            case 4:
                ci.setHoldAddress(decodeCell(cell.getText()));
                break;
            case 5:
                ci.setPublishTime(decodeCell(cell.getText()));
                break;
            default:
                break;
            }
            column++;
        }
        return ci;
    }

    /**
     * URL-decodes {@code text} as UTF-8. Text that is not a valid encoded string
     * (for example a stray {@code %}) is returned unchanged instead of aborting
     * the whole parse.
     */
    private static String decodeCell(String text) throws IOException {
        try {
            return URLDecoder.decode(text, "UTF-8");
        } catch (IllegalArgumentException e) {
            return text;
        }
    }
}