package jsoup;
import java.io.IOException;
import java.net.URL;
import java.util.Date;
import java.util.Iterator;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.HttpStatus;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.util.EntityUtils;
import org.hibernate.Query;
import org.hibernate.Session;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import com.ibaiqi.dao.WebSiteDAO;
import com.ibaiqi.dao.WebSiteDAOImpl;
import com.ibaiqi.factory.WebSiteDaoFactory;
import com.ibaiqi.factory.WebSiteServiceFactory;
import com.ibaiqi.factory.WebSiteUniqueServiceFactory;
import com.ibaiqi.po.WebSite;
import com.ibaiqi.po.WebSiteUnique;
import com.ibaiqi.service.WebSiteService;
import com.ibaiqi.service.WebSiteServiceImpl;
import com.ibaiqi.service.WebSiteUniqueService;
import com.ibaiqi.service.WebSiteUniqueServiceImpl;
import com.ibaiqi.util.HibernateUtil;
import com.opensymphony.xwork2.ActionSupport;
import com.sun.org.apache.xml.internal.utils.SuballocatedByteVector;
public class WebSiteDomainUniqueGetDetails extends ActionSupport {
// Starts at 1 and is not referenced in the visible part of this class.
// NOTE(review): presumably a page index used by the crawl loop — confirm against its users.
static Integer pageID = 1;
/**
* Serialization version identifier for this class.
*/
private static final long serialVersionUID = 1L;
/** Validates the address taken from a {@code mailto:} link; compiled once and reused across calls. */
private static final Pattern MAILTO_ADDRESS_PATTERN = Pattern
        .compile("^([a-zA-Z0-9_\\-\\.]+)@((\\[[0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}\\.)|(([a-zA-Z0-9\\-]+\\.)+))([a-zA-Z]{2,4}|[0-9]{1,3})(\\]?)$");

/**
 * Downloads a site's home page, extracts its title, e-mail address, mobile numbers and
 * telephone numbers, and stores them on the {@code WebSiteUnique} row with the given ID.
 *
 * <p>Steps:
 * <ol>
 *   <li>Tag the row with {@code catchWebSiteURLTag='GetDetails'}.</li>
 *   <li>Fetch the HTML through {@code getHtmlByUrl(webUrl)}.</li>
 *   <li>If anything in steps 1-2 throws, tag the row {@code 'GetDetailsTimeOut'}, call
 *       {@code runCatchSite()} and return without parsing.</li>
 *   <li>If no HTML came back (null or empty), return without parsing.</li>
 *   <li>Otherwise parse with jsoup, collect the details and call
 *       {@code updateWebSiteUniquee} on the service.</li>
 * </ol>
 *
 * <p>When several {@code mailto} links are present, the last one wins. An address that
 * fails {@link #MAILTO_ADDRESS_PATTERN} is stored as the literal "邮件格式错误"; note that
 * the "无" placeholder used when no link exists also fails the pattern, so it is replaced too.
 *
 * @param ID     identifier of the {@code WebSiteUnique} row to update
 * @param webUrl address of the page to download and analyse
 */
public void webSiteUniqueGetDetails_httpClient(Integer ID, String webUrl) {
    System.out.println("被分析网址:" + webUrl);

    String html;
    try {
        // Mark the row as being processed before the (possibly slow) download starts.
        runCatchTagUpdate("update WebSiteUnique set catchWebSiteURLTag='GetDetails' Where ID ="
                + ID);
        html = getHtmlByUrl(webUrl);
    } catch (Exception e) {
        // The existing log text calls this a connection timeout, but any exception from
        // the tag update or the download lands here, so record the real cause as well.
        System.out
                .println("打开网页连接超时01--------------------------------------------------------------------------");
        System.err.println("webSiteUniqueGetDetails_httpClient failed for " + webUrl + ": " + e);
        // Flag the row so it is recognisable as a failed fetch.
        runCatchTagUpdate("update WebSiteUnique set catchWebSiteURLTag='GetDetailsTimeOut' Where id ="
                + ID);
        // NOTE(review): runCatchSite() is invoked twice, as before; it is defined outside
        // this block, so confirm whether the second call is intentional.
        runCatchSite();
        System.out
                .println("打开网页连接超时02--------------------------------------------------");
        runCatchSite();
        // Nothing was downloaded, so there is nothing to parse; continuing would
        // dereference a null document.
        return;
    }

    if (html == null || "".equals(html)) {
        // No content to analyse; parsing would have nothing to work on.
        System.out.println("No HTML content retrieved, skipping: " + webUrl);
        return;
    }

    Document doca = Jsoup.parse(html);
    System.out
            .println("----------------------------------------------------");
    System.out.println("网站标题:" + doca.title());

    // Take the address part of each mailto link; later links overwrite earlier ones.
    Elements links = doca.select("a[href^=mailto]");
    String emails = "无";
    for (Element lnk : links) {
        String href = lnk.attr("href");
        // "mailto:" is 7 characters; a shorter href (e.g. a bare "mailto") has no address.
        emails = clipToColumn(href.length() > 7 ? href.substring(7) : "");
    }

    // Phone and telephone numbers are picked out of the page's visible text.
    String text = doca.text();
    System.out.println(text);
    String phones = WebSiteGetDetails.pickUpPhone(text);
    System.out.println("提取手机号码是:" + phones);
    String tels = WebSiteGetDetails.pickUpTel(text);
    System.out.println("提取电话号码是:" + tels);

    // Persist the collected e-mail, phone and telephone details.
    WebSiteUniqueService ps = WebSiteUniqueServiceFactory
            .getServiceInstance();
    WebSiteUnique webSiteUnique = new WebSiteUnique();
    webSiteUnique.setID(ID);
    webSiteUnique.setDomain(webUrl);
    webSiteUnique.setSiteName(clipToColumn(doca.title()));
    webSiteUnique.setIntroduction("教育培训");
    webSiteUnique.setCategoryNum(120000);
    webSiteUnique.setVisitorTime(new Date());

    // Check the e-mail format once and reuse the result for both the log and the branch.
    Matcher matcher = MAILTO_ADDRESS_PATTERN.matcher(emails);
    boolean emailLooksValid = matcher.matches();
    System.out.println(emailLooksValid);
    if (!emailLooksValid) {
        emails = "邮件格式错误";
    }
    webSiteUnique.setEmail(emails);

    phones = clipToColumn(phones);
    webSiteUnique.setPhone(phones);
    System.out.println("phones_______________" + phones);

    webSiteUnique.setTel(clipToColumn(tels));
    ps.updateWebSiteUniquee(webSiteUnique);
}

/**
 * Runs one HQL update statement in its own Hibernate session and always hands the session
 * back to {@code HibernateUtil.closeSession}, even when the statement fails.
 *
 * <p>NOTE(review): no explicit commit is issued here; presumably
 * {@code HibernateUtil.closeSession} takes care of it — confirm.
 *
 * @param hql complete HQL update statement to execute
 */
private static void runCatchTagUpdate(String hql) {
    Session session = HibernateUtil.getSession();
    try {
        session.beginTransaction();
        Query q = session.createQuery(hql);
        q.executeUpdate();
    } finally {
        HibernateUtil.closeSession(session);
    }
}

/**
 * Shortens a value that is longer than 100 characters to its first 99 characters and
 * returns shorter values unchanged.
 *
 * <p>NOTE(review): the cut is at 99 rather than 100 characters — confirm against the
 * width of the target columns.
 *
 * @param value text to shorten; must not be null
 * @return {@code value}, or its first 99 characters when it exceeds 100 characters
 */
private static String clipToColumn(String value) {
    return value.length() > 100 ? value.substring(0, 99) : value;
}
/**
* Implemented entirely with jsoup, without the HttpClient plugin.
*/
public void webSiteUniqueGetDetails(Integer ID, String webUrl) {
// String webUrl="http://www.sqttly.com/";
// String webUrl="http://www.sqrcw.com";
Document doc = null;
System.out.println("被分析网址:" + webUrl);
try {
// 如果网站链接超过25秒,则抛出如下 catch错误
Session session = HibernateUtil.getSession();
session.beginTransaction();
// 使用HQL查询
String hql = "update WebSiteUnique set catchWebSiteURLTag='GetDetails' Where ID ="
+ ID;
// 通过Query方法查询
Query q = session.createQuery(hql);
q.executeUpdate();
//
// 关闭session
HibernateUtil.closeSession(session);
//Document doc = Jsoup.parse(new URL(link).openStream(), "ISO-8859-1", link);
//doc = Jsoup.connect(webUrl).timeout(20000).get();
/* public static Document parse(InputStream in,
String charsetName,
String baseUri)
throws IOException*/
/* doc=Jsoup.parse(in, "utf-8", webUrl);
Jsoup.p*/
doc = Jsoup.parse(new URL(webUrl).openStream(), "ISo-8859-1", webUrl);
} catch (IOException e) {
// try内打开链接,超过25秒,则执行如下语句
System.out
.println("打开网页连接超时01--------------------------------------------------------------------------");
// 为分析过做标识begin
Session session = HibernateUtil.getSession();
session.beginTr
- 1
- 2
前往页