import java.util.*;
import java.io.*;
import java.net.*;
import java.util.regex.*;
import org.dom4j.*;
import org.dom4j.io.*;
public class Crawler implements Runnable {
// List of per-commodity links (product page and review page).
// Filled by retrieveLinks and iterated by retrieveCommodity.
ArrayList<CommodityLink> AllCommodityReviewLink = new ArrayList<CommodityLink>();
// dom4j document that accumulates the crawl results.
Document document = DocumentHelper.createDocument();
// Root element of the document; retrieveCommodity appends one
// "CommodityListURL" child to it for every collected link.
Element AllCommodity = document.addElement("AllCommodity");
// NOTE(review): not read or written by any method visible in this part of the file.
String pageURL = "";
/**
 * Downloads the content of a page and returns it as one string.
 *
 * <p>The page is read line by line and the lines are concatenated without
 * any separator, so the returned text contains no line terminators. The
 * callers in this class match the result with patterns compiled without
 * {@code Pattern.DOTALL}.
 *
 * <p>NOTE(review): the bytes are decoded with the platform default charset
 * because no charset is passed to {@link InputStreamReader}; confirm that
 * this matches the encoding of the crawled pages.
 *
 * @param pageUrl the page to download; {@code null} yields an empty string
 * @return the text read so far; empty if nothing could be read. An
 *         {@link IOException} is printed to standard error and whatever was
 *         read before the failure is returned.
 */
public String DownloadPageContent(URL pageUrl) {
    StringBuilder pageBuffer = new StringBuilder();
    if (pageUrl == null) {
        // Nothing to fetch; avoid a NullPointerException on openStream().
        return pageBuffer.toString();
    }
    BufferedReader br = null;
    try {
        br = new BufferedReader(new InputStreamReader(pageUrl.openStream()));
        String line;
        while ((line = br.readLine()) != null) {
            pageBuffer.append(line);
        }
    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        // Always release the underlying connection, even after a read error.
        if (br != null) {
            try {
                br.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
    return pageBuffer.toString();
}
/** Placeholder: the body is empty, so calling this method does nothing. */
public void Search() {
}
/**
 * Collects product and review links from a listing page and from every
 * following page reachable through its "next page" (下一页) link.
 *
 * <p>For each {@code <li><dl>...</dl></li>} fragment of a page a
 * {@code CommodityLink} is created, filled with the first product URL and
 * the first review URL found in that fragment (each left untouched when not
 * found), and appended to {@code AllCommodityReviewLink}.
 *
 * <p>Pages are walked in a loop rather than by recursion so the call stack
 * does not grow with the number of pages, and a page whose URL was already
 * processed ends the walk so circular pagination cannot loop forever.
 *
 * <p>NOTE(review): the '.' characters inside the URL patterns are not
 * escaped and therefore match any character; they are kept as they were.
 *
 * @param pageUrl the first listing page; {@code null} makes this a no-op
 */
public void retrieveLinks(URL pageUrl) {
    // Compile every pattern once instead of once per item or page.
    Pattern itemPattern = Pattern.compile("<li><dl>.*?</dl></li>");
    Pattern productPattern = Pattern
            .compile("http://www.360buy.com/product/\\d*.html");
    Pattern reviewPattern = Pattern
            .compile("http://club.360buy.com/review/\\S*.html");
    Pattern paginationPattern = Pattern
            .compile("<div class=\"Pagination\">.*?</div>");
    Pattern nextLinkPattern = Pattern
            .compile("<a href=\"\\d*-\\d*-\\d*-\\d*-\\d*-\\d*-\\d*-\\d*-\\d*-\\d*-\\d*-\\d*-\\d*.html\">下一页");
    Pattern pageIdPattern = Pattern
            .compile("\\d*-\\d*-\\d*-\\d*-\\d*-\\d*-\\d*-\\d*-\\d*-\\d*-\\d*-\\d*-\\d*");

    // Track visited pages by their string form: URL.equals/hashCode may
    // perform host name resolution, which is unwanted here.
    Set<String> visited = new HashSet<String>();
    URL current = pageUrl;
    while (current != null && visited.add(current.toString())) {
        String pageContent = DownloadPageContent(current);

        Matcher itemMatcher = itemPattern.matcher(pageContent);
        while (itemMatcher.find()) {
            String item = itemMatcher.group();
            CommodityLink link = new CommodityLink();
            Matcher productMatcher = productPattern.matcher(item);
            if (productMatcher.find()) {
                link.commodityLink = productMatcher.group();
            }
            Matcher reviewMatcher = reviewPattern.matcher(item);
            if (reviewMatcher.find()) {
                link.reviewLink = reviewMatcher.group();
            }
            AllCommodityReviewLink.add(link);
        }

        // Look for the "next page" anchor inside the pagination block only.
        current = null;
        Matcher paginationMatcher = paginationPattern.matcher(pageContent);
        String pagination = paginationMatcher.find() ? paginationMatcher.group() : "";
        Matcher nextLinkMatcher = nextLinkPattern.matcher(pagination);
        if (nextLinkMatcher.find()) {
            Matcher pageIdMatcher = pageIdPattern.matcher(nextLinkMatcher.group());
            String pageId = pageIdMatcher.find() ? pageIdMatcher.group() : "";
            try {
                current = new URL("http://www.360buy.com/products/" + pageId + ".html");
            } catch (MalformedURLException e) {
                // Same outcome as before: report and stop following pages.
                e.printStackTrace();
            }
        }
    }
}
/**
 * Builds the XML description of every commodity in
 * {@code AllCommodityReviewLink}.
 *
 * <p>For each link a {@code CommodityListURL} element (with a {@code URL}
 * attribute) containing a {@code Commodity} element is appended to
 * {@code AllCommodity}. The product page is downloaded and scanned for the
 * name, manufacturer and listing time, which are stored in the
 * {@code CommodityName}, {@code Manufacture} and {@code Time} children.
 * The review page is then downloaded once; if it contains a score line a
 * {@code TotalScore} child is added, and if it contains a review-count line
 * a {@code Number} child and a {@code Reviews} child are added and the
 * latter is filled by {@code retrieveReviews}.
 *
 * <p>Values that cannot be found are written as empty text (or 0 for the
 * numeric ones) instead of aborting the whole run. A malformed URL is
 * reported on standard error/output and treated as an empty page.
 */
public void retrieveCommodity() {
    // Compile every pattern once instead of once per commodity.
    Pattern namePattern = Pattern.compile("商品名称(.*?)</li>");
    Pattern manufacturePattern = Pattern
            .compile("生产厂家.*?<a.*?\">(.*?)</a></li>");
    Pattern timePattern = Pattern.compile("<li>上架时间(.*?)</li>");
    Pattern scoreLinePattern = Pattern.compile("<p>评价得分.*</p>");
    Pattern countLinePattern = Pattern.compile("<p>评 论 数.*</p>");
    Pattern scorePattern = Pattern.compile("(\\d)分");
    // Require at least one digit so a bare "条" can never reach parseInt.
    Pattern countPattern = Pattern.compile("(\\d+)条");

    for (CommodityLink cl : AllCommodityReviewLink) {
        String commodityLink = cl.commodityLink;
        Element listUrlElement = AllCommodity.addElement("CommodityListURL");
        listUrlElement.addAttribute("URL", commodityLink);
        Element commodityElement = listUrlElement.addElement("Commodity");
        System.out.println(commodityLink);

        // A bad URL must not fall back to the previous commodity's page
        // (or to null); treat it as an empty page instead.
        String pageContent = "";
        try {
            pageContent = DownloadPageContent(new URL(commodityLink));
        } catch (MalformedURLException e) {
            e.printStackTrace();
            System.out.println("Exception");
        }

        // The first character after the label is dropped; presumably it is
        // a separator such as a colon -- confirm against the page markup.
        String name = dropFirstChar(firstGroupOrEmpty(namePattern, pageContent));
        System.out.println(name);
        // The text belongs to the CommodityName element, not its parent.
        commodityElement.addElement("CommodityName").addText(name);

        String manufacture = firstGroupOrEmpty(manufacturePattern, pageContent);
        commodityElement.addElement("Manufacture").addText(manufacture);

        String time = dropFirstChar(firstGroupOrEmpty(timePattern, pageContent));
        commodityElement.addElement("Time").addText(time);

        String reviewLink = cl.reviewLink;
        String reviewContent = "";
        try {
            reviewContent = DownloadPageContent(new URL(reviewLink));
        } catch (MalformedURLException e) {
            e.printStackTrace();
        }

        Matcher scoreLineMatcher = scoreLinePattern.matcher(reviewContent);
        if (scoreLineMatcher.find()) {
            int score = 0;
            Matcher scoreMatcher = scorePattern.matcher(scoreLineMatcher.group());
            if (scoreMatcher.find()) {
                score = Integer.parseInt(scoreMatcher.group(1));
            }
            commodityElement.addElement("TotalScore").addText(Integer.toString(score));
        }

        Matcher countLineMatcher = countLinePattern.matcher(reviewContent);
        if (countLineMatcher.find()) {
            int number = 0;
            Matcher countMatcher = countPattern.matcher(countLineMatcher.group());
            if (countMatcher.find()) {
                number = Integer.parseInt(countMatcher.group(1));
            }
            commodityElement.addElement("Number").addText(Integer.toString(number));
            Element reviewsElement = commodityElement.addElement("Reviews");
            // Reuse the page fetched above rather than downloading it again.
            retrieveReviews(reviewsElement, reviewContent);
        }
    }
}

/** Returns capture group 1 of the first match of pattern in text, or "" when nothing matches. */
private static String firstGroupOrEmpty(Pattern pattern, String text) {
    Matcher matcher = pattern.matcher(text);
    return matcher.find() ? matcher.group(1) : "";
}

/** Returns value without its first character; an empty value is returned unchanged. */
private static String dropFirstChar(String value) {
    return value.length() == 0 ? value : value.substring(1);
}
public void retrieveReviews(Element Reviews, String pageContent) {
Pattern preview = Pattern.compile("<ul class=\"PR_list\".*?</ul>");
Matcher mreview = preview.matcher(pageContent);
while (mreview.find()) {
String ulclassPRList = mreview.group();
// System.out.println(ulclassPRList);
// Review review = new Review();
Element Review = Reviews.addElement("Review");
Pattern pusername = Pattern
.compile("<li\\sname=\"(\\w*?)\"\\sclass=\"PR_list_l\">");
Matcher musername = pusername.matcher(ulclassPRList);
String username = "";
if (musername.find())
username = musername.group(1);
System.out.println(username);
Element UserName = Review.addElement("UserName");
UserName.addText(username);
Pattern pplace = Pattern
.compile("<div style=\"color: Green;\">(\\S*)</div>");
Matcher mplace = pplace.matcher(ulclassPRList);
String place = "";
if (mplace.find())
place = mplace.group();
place = place.replaceAll("<div style=\"color: Green;\">", "");
place = place.replaceAll("</div>", "");
place = place.substring(1, place.length() - 1);
// review.place = place;
Element HomeTown = Review.addElement("HomeTown");
HomeTown.addText(place);
// System.out.println(place);
Pattern puserlevel = Pattern
.compile("<div>([\u4E00-\u9FA5]*?)</div>");
Matcher muserlevel = puserlevel.matcher(ulclassPRList);
String userlevel = "";
if (muserlevel.find())
userlevel = muserlevel.group(1);
// System.out.println(userlevel);
Element UserLevel = Review.addElement("UserLevel");
UserLevel.addText(userlevel);
Pattern pcommenttime = Pattern
.compile("<span\\sclass=\"float_Right\">(\\d*-\\d*-\\d*\\s\\d*:\\d*)</span>");
Matcher
- 1
- 2
- 3
- 4
- 5
- 6
前往页