package cn.casia.ailab.ldy.cmt;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
public class UrlCommentExtractor {
/** Charset name handed to every InputStreamReader created in this class and to Mql_WebDownloader.webpageDownload. */
private String encoding="gb18030";
/**
 * Extracts the comment-page URL of every product model found in a brand's saved page dump.
 *
 * <p>The input is scanned line by line:
 * <ul>
 *   <li>a line containing {@code "========"} (the opening marker is exactly eight '=') sets the
 *       current model name to the text between that marker and the last {@code "====="} on the
 *       line; if no closing marker follows the name, the rest of the line is used instead;</li>
 *   <li>a line containing {@code "http://comments"} produces one output record: the URL (from
 *       the first {@code "http"} up to the next double quote, or to the end of the line when
 *       there is no quote), a tab, the current model name and {@code "\r\n"}.</li>
 * </ul>
 *
 * @param rFile absolute path of the brand's text file holding the page contents of all models,
 *              with sections introduced by {@code ========brand+model=====}
 * @param wFile absolute path of the text file that receives one
 *              {@code url \t brand+model} record per comment link
 * @throws IOException declared in the signature; I/O failures raised inside this method are
 *                     printed to {@code System.err} and not rethrown
 * @throws InterruptedException declared in the signature; not raised directly by this method
 *                     (NOTE(review): may originate from Mql_Write2File - confirm)
 */
public void urlExtract(String rFile,String wFile) throws IOException, InterruptedException{
    BufferedReader pageContent = null;
    try {
        pageContent = new BufferedReader(new InputStreamReader(
                new FileInputStream(rFile), encoding));
        Mql_Write2File wUrl = new Mql_Write2File(wFile);
        String pcStyle = ""; // model name of the section currently being read
        String contentLine;
        while ((contentLine = pageContent.readLine()) != null) {
            int markerStart = contentLine.indexOf("========");
            if (markerStart >= 0) {
                int nameStart = markerStart + 8;
                int nameEnd = contentLine.lastIndexOf("=====");
                // Without a closing marker after the name, lastIndexOf lands inside the opening
                // marker; fall back to the rest of the line instead of failing in substring.
                if (nameEnd < nameStart) {
                    nameEnd = contentLine.length();
                }
                pcStyle = contentLine.substring(nameStart, nameEnd);
                System.out.println(pcStyle + "url is ok");
            }
            if (contentLine.contains("http://comments")) {
                int urlStart = contentLine.indexOf("http");
                int urlEnd = contentLine.indexOf("\"", urlStart);
                // A link that is not terminated by a quote runs to the end of the line.
                if (urlEnd < 0) {
                    urlEnd = contentLine.length();
                }
                String urlComment = contentLine.substring(urlStart, urlEnd);
                wUrl.writeWebDownFile(urlComment + "\t");
                wUrl.writeWebDownFile(pcStyle + "\r\n");
            }
        }
    } catch (IOException ex) {
        System.err.println(ex);
    } finally {
        // Release the input file on every exit path, not only after a clean run.
        if (pageContent != null) {
            try {
                pageContent.close();
            } catch (IOException ex) {
                System.err.println(ex);
            }
        }
    }
}//urlExtract
/**
 * Downloads every comment page of every model listed in a URL file.
 *
 * <p>A sub-folder named after {@code rFile}'s base name (text between the last backslash and
 * {@code ".txt"}) is created under {@code wFolder}. For each {@code url \t model} line the URL
 * is rewritten to the paged comment list, page 0 is downloaded and stored in
 * {@code <model>.txt} (with '/' in the model name replaced by '_'), the page count is read
 * back from the stored page (line following the one containing {@code "turn-page"}), and the
 * remaining pages are appended to the same file.
 *
 * <p>Lines without a tab, URLs that lack the expected markers and pages without a usable pager
 * are skipped with a message instead of aborting the whole crawl.
 *
 * @param rFile absolute path of the file holding one {@code url \t brand+model} record per line
 * @param wFolder target folder for the downloaded pages; it is concatenated directly with the
 *                sub-folder name, so it must end with a path separator
 * @throws IOException declared in the signature; I/O failures raised inside this method are
 *                     printed to {@code System.err} and not rethrown
 * @throws InterruptedException declared in the signature; not raised directly by this method
 *                     (NOTE(review): presumably raised by Mql_WebDownloader - confirm)
 */
public void commentCrawl(String rFile,String wFolder) throws IOException, InterruptedException{
    BufferedReader brUrl = null;
    try {
        brUrl = new BufferedReader(new InputStreamReader(new FileInputStream(rFile), encoding));
        // Create the per-brand output folder.
        String fileName = rFile.substring(rFile.lastIndexOf("\\") + 1, rFile.indexOf(".txt"));
        wFolder = wFolder + fileName + "\\";
        File fd = new File(wFolder);
        if (!fd.exists()) {
            fd.mkdir();
        }
        Mql_WebDownloader webdown = new Mql_WebDownloader();
        String urlComLine;
        while ((urlComLine = brUrl.readLine()) != null) {
            String[] urlLine = urlComLine.split("\t");
            if (urlLine.length < 2) {
                if (urlComLine.trim().length() > 0) {
                    System.err.println("skipping line without url/model pair: " + urlComLine);
                }
                continue;
            }
            // String.replace returns a new string; the result has to be kept, otherwise a '/'
            // in the model name ends up in the output file path.
            String modelName = urlLine[1].replace("/", "_");
            String firstPageUrl = toCommentListUrl(urlLine[0]);
            if (firstPageUrl == null) {
                System.err.println("skipping url without index/quote=0&/&title markers: " + urlLine[0]);
                continue;
            }
            String targetFile = wFolder + modelName + ".txt";
            String comContent = webdown.webpageDownload(firstPageUrl, encoding);
            Mql_Write2File wr2f = new Mql_Write2File(targetFile);
            wr2f.writeWebDownFile("========" + modelName + "=====\r\n");
            wr2f.writeWebDownFile(comContent + "\r\n");
            // The page count is taken from the first page as it was just stored on disk.
            // NOTE(review): relies on Mql_Write2File having flushed the data - confirm.
            String pagerLine = readLineAfterTurnPage(targetFile);
            if (pagerLine == null) {
                System.out.println(modelName + "查找turn-page无结果");
                continue;
            }
            int pageNum = parsePageCount(pagerLine);
            if (pageNum < 0) {
                System.err.println("skipping " + modelName + ": cannot read page count from: " + pagerLine);
                continue;
            }
            for (int pageI = 1; pageI < pageNum; pageI++) {
                comContent = webdown.webpageDownload(withPageNumber(firstPageUrl, pageI), encoding);
                wr2f.writeWebDownFile("========" + "第" + (pageI + 1) + "页" + "=====\r\n");
                wr2f.writeWebDownFile(comContent + "\r\n");
            }
        }
        System.out.println("抓取 " + fileName + " 的评论网页 OK");
    } catch (IOException ex) {
        System.err.println(ex);
    } finally {
        // Release the URL list on every exit path.
        if (brUrl != null) {
            try {
                brUrl.close();
            } catch (IOException ex) {
                System.err.println(ex);
            }
        }
    }
}//commentCrawl

/**
 * Rewrites a comment "index" URL into the paged comment-list URL: the first {@code "index"}
 * becomes {@code "more"}, {@code "quote=0&"} is removed and {@code "&page=0"} is inserted in
 * front of {@code "&title"}. Every position is looked up in the buffer being edited, so no
 * manual offset compensation is needed.
 *
 * @return the rewritten URL, or {@code null} when one of the three markers is missing
 */
private static String toCommentListUrl(String indexUrl) {
    StringBuilder url = new StringBuilder(indexUrl);
    int at = url.indexOf("index");
    if (at < 0) {
        return null;
    }
    url.replace(at, at + 5, "more");
    at = url.indexOf("quote=0&");
    if (at < 0) {
        return null;
    }
    url.delete(at, at + 8);
    at = url.indexOf("&title");
    if (at < 0) {
        return null;
    }
    url.insert(at, "&page=0");
    return url.toString();
}

/**
 * Returns {@code url} with the value of its first {@code page=} parameter replaced by
 * {@code page}. The whole value (up to the next '&' or the end of the URL) is replaced, so
 * multi-digit page numbers do not leave stale digits behind.
 */
private static String withPageNumber(String url, int page) {
    int valueStart = url.indexOf("page=") + 5;
    int valueEnd = url.indexOf("&", valueStart);
    if (valueEnd < 0) {
        valueEnd = url.length();
    }
    return url.substring(0, valueStart) + page + url.substring(valueEnd);
}

/**
 * Returns the line that follows the first line containing {@code "turn-page"} in
 * {@code pageFile}, or {@code null} when there is no such line. The reader is always closed.
 */
private String readLineAfterTurnPage(String pageFile) throws IOException {
    BufferedReader brMore = new BufferedReader(new InputStreamReader(
            new FileInputStream(pageFile), encoding));
    try {
        String line;
        while ((line = brMore.readLine()) != null) {
            if (line.contains("turn-page")) {
                return brMore.readLine();
            }
        }
        return null;
    } finally {
        brMore.close();
    }
}

/**
 * Extracts the page count from a pager line: spaces are removed, the line is split on ';' and
 * the third field, minus its first and last character, is parsed as an integer.
 *
 * @return the page count, or -1 when the line does not have that shape
 */
private static int parsePageCount(String pagerLine) {
    String[] lineInf = pagerLine.replace(" ", "").split(";");
    if (lineInf.length < 3 || lineInf[2].length() < 2) {
        return -1;
    }
    try {
        return Integer.parseInt(lineInf[2].substring(1, lineInf[2].length() - 1));
    } catch (NumberFormatException ex) {
        return -1;
    }
}
/**
 * Extracts author, date and text of every comment from the downloaded comment pages.
 *
 * <p>Each file listed in {@code rFolder} is read line by line and a file with the same name is
 * written to {@code wFolder}:
 * <ul>
 *   <li>a line containing {@code "comment-bar-a"} introduces the author (on the next line, or
 *       on the line after that when the next line ends with '&gt;') followed by a line holding
 *       the date; they are written as {@code (n) author \t date};</li>
 *   <li>a line containing {@code "comment-bar-b"} starts the comment text, which runs until
 *       the line containing {@code "</div>"}; it is written followed by a line of asterisks.</li>
 * </ul>
 * The output ends with the number of comments found. If a page ends in the middle of a
 * comment, extraction of that file stops there and the footer is still written.
 *
 * @param rFolder brand-named folder holding the per-model comment pages; concatenated directly
 *                with the file names, so it must end with a path separator
 * @param wFolder brand-named folder receiving the per-model extracted comments; must end with
 *                a path separator for the same reason
 * @throws IOException if {@code rFolder} cannot be listed or a file cannot be read or written
 */
public void commentExtract(String rFolder, String wFolder) throws IOException{
    // File.list() returns null (it does not throw) for a missing or non-directory path.
    String[] fileName = new File(rFolder).list();
    if (fileName == null) {
        throw new IOException("cannot list folder: " + rFolder);
    }
    for (int fileI = 0; fileI < fileName.length; fileI++) {
        BufferedReader brComment = new BufferedReader(new InputStreamReader(
                new FileInputStream(rFolder + fileName[fileI]), encoding));
        try {
            int commentI = 0; // number of comments found, written at the end of the file
            Mql_Write2File wr2file = new Mql_Write2File(wFolder + fileName[fileI]);
            wr2file.writeWebDownFile("========\r\n");
            String brComLine;
            while ((brComLine = brComment.readLine()) != null) {
                if (brComLine.contains("comment-bar-a")) { // author and date
                    String authorLine = brComment.readLine();
                    if (authorLine == null) {
                        break;
                    }
                    authorLine = authorLine.replace(" ", ""); // drop spaces from the markup
                    String author;
                    if ((authorLine.lastIndexOf(">") + 1) == authorLine.length()) {
                        // Line ends with a tag: the name of a registered user is on the next line.
                        System.out.println("ok");
                        authorLine = brComment.readLine();
                        if (authorLine == null) {
                            break;
                        }
                        author = authorLine.substring(authorLine.indexOf(">") + 1,
                                authorLine.lastIndexOf("<"));
                    } else {
                        author = authorLine.substring(authorLine.lastIndexOf(">") + 1);
                    }
                    brComLine = brComment.readLine();
                    if (brComLine == null) {
                        break;
                    }
                    String date = brComLine.substring(brComLine.indexOf(">") + 1,
                            brComLine.lastIndexOf("<"));
                    commentI++;
                    wr2file.writeWebDownFile("(" + commentI + ") " + author + "\t");
                    wr2file.writeWebDownFile(date + "\r\n");
                }
                // Deliberately not an else: after the branch above this tests the date line.
                if (brComLine.contains("comment-bar-b")) { // comment text
                    StringBuilder text = new StringBuilder();
                    if (!brComLine.contains("</div>")) {
                        text.append(brComLine.substring(brComLine.indexOf(">") + 1));
                        brComLine = brComment.readLine();
                        while (brComLine != null && !brComLine.contains("</div>")) {
                            text.append(brComLine);
                            brComLine = brComment.readLine();
                        }
                        if (brComLine != null) {
                            text.append(brComLine.substring(0, brComLine.indexOf("<")));
                        }
                    } else {
                        text.append(brComLine.substring(brComLine.indexOf(">") + 1,
                                brComLine.indexOf("</div>")));
                    }
                    wr2file.writeWebDownFile(text + "\r\n");
                    wr2file.writeWebDownFile("***************************************************\r\n");
                    if (brComLine == null) {
                        break; // page ended before the closing </div>
                    }
                }
            }
            wr2file.writeWebDownFile("========共" + commentI + "条评论\r\n");
            System.out.println("抽取 " + fileName[fileI] + " 的评论内容is OK");
        } finally {
            // One reader per input file: close it before moving on to the next file.
            brComment.close();
        }
    }
}//commentExtract
/* public static void main(String[] args) throws IOException, InterruptedExcepti