package test;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.wltea.analyzer.lucene.IKAnalyzer;
import java.io.*;
import java.util.*;
import java.util.regex.Pattern;
import static java.lang.Math.log;
import static java.lang.Math.pow;
/**
* Created by yizhou on 14-4-9.
*/
public class TezhenXuanze_副本 {
private static Map<String,Integer> WordList = new HashMap<String, Integer>();// vocabulary: word -> word index (filled while segmenting the training corpus)
private static Map<String,Integer> ClassList = new HashMap<String,Integer>();// class name (sub-directory name) -> class index
//    private static Map<String,Map<Integer,Double>> WordFeatureValue = new HashMap<String, Map<Integer, Double>>();// (old version) stored per-class mutual information of every word
private static Map<String,Double> WordFeatureValue = new HashMap<String, Double>();// feature score of every word — presumably mutual information; confirm against the scoring method
private static List<String> KeyWord = new ArrayList<String>();// NOTE(review): looks like the selected feature words — verify against the code that fills it
private static int wordnum=0;// next index to assign in WordList (== current vocabulary size)
private static int classnum=0;// number of class sub-directories found under the corpus root
private static Pattern pattern = Pattern.compile("[0-9]{6}");// 6 consecutive digits — used in shaixuan(); presumably a stock code
private static Pattern pattern1 = Pattern.compile("[0-9]{1,10}");// any number; tokens matching this are excluded from WordList
// basefilepath is the corpus root directory containing one sub-directory per class (e.g. bullish, bearish);
// filepath[] holds those class sub-directories, each of which contains all documents of that class.
// Input documents are raw, un-segmented text.
/**
 * Converts every raw (un-segmented) document under {@code basefilepath} into its
 * tokenised "word matrix" form. {@code basefilepath} holds one sub-directory per
 * class; for each document a mirror file is written under {@code writepath}
 * containing the IK-segmented words of every surviving line, tab-separated,
 * followed by a global running line number.
 *
 * Side effects: fills ClassList (class name -> index) and WordList
 * (word -> index), and updates wordnum / classnum.
 *
 * @param basefilepath root directory of the raw training corpus
 * @param writepath    root directory the segmented output is written to
 * @throws IOException if any file cannot be read or written
 */
public static void Product_WordMatrix(File basefilepath, File writepath) throws IOException {
    IKAnalyzer analyzer = new IKAnalyzer(true); // true = smart (coarse-grained) segmentation
    File[] filepath = basefilepath.listFiles();
    int count = 0; // running line id shared across all classes and files
    classnum = filepath.length;
    for (int i = 0; i < classnum; i++) {
        ClassList.put(filepath[i].getName(), i);
        File[] files = filepath[i].listFiles(); // every document of this class
        // mirror the class directory under writepath (File.separator instead of a
        // hard-coded "\\" so this also works on non-Windows systems)
        File path = new File(writepath.getPath() + File.separator + filepath[i].getName());
        if (!path.exists()) path.mkdirs();
        int dayux00 = 0; // number of lines dropped by shaixuan() for this class
        for (File filename : files) {
            File writefile = new File(path.getPath() + File.separator + filename.getName());
            // try-with-resources: both streams are closed even if tokenisation throws
            try (BufferedWriter writer = new BufferedWriter(new FileWriter(writefile));
                 BufferedReader br = new BufferedReader(
                         new InputStreamReader(new FileInputStream(filename)))) {
                String line = br.readLine();
                while (line != null) {
                    count++;
                    line = shaixuan(line, filename.getName());
                    if (line == null) { dayux00++; line = br.readLine(); continue; }
                    TokenStream tokenStream = analyzer.tokenStream("content", new StringReader(line));
                    tokenStream.addAttribute(CharTermAttribute.class);
                    tokenStream.reset(); // required by the Lucene 4+ TokenStream contract; no-op on older versions
                    while (tokenStream.incrementToken()) {
                        String word = tokenStream.getAttribute(CharTermAttribute.class).toString();
                        writer.write(word + "\t");
                        // Skip pure numbers; give each NEW word a stable index.
                        // BUG FIX: the original re-put the word and incremented
                        // wordnum on every occurrence, shifting indices and
                        // leaving wordnum larger than the vocabulary.
                        if (!pattern1.matcher(word).find() && !WordList.containsKey(word)) {
                            WordList.put(word, wordnum);
                            wordnum++;
                        }
                    }
                    tokenStream.close();
                    writer.write(Integer.toString(count));
                    writer.write("\r\n");
                    writer.flush();
                    line = br.readLine();
                }
            }
        }
        System.out.println("类别:" + filepath[i].getName() + "过滤数目:" + dayux00 + ":" + count);
    }
}
/**
 * Line filter for raw report text. Strips symbol noise, extracts the fragment
 * around rating / recommendation keywords, and drops lines that are clearly
 * not content (report listings without a rating, announcement dates, lines
 * containing a 6-digit code that does not match this file's own code).
 *
 * @param line     one raw text line
 * @param filename name of the source file; its first characters are expected
 *                 to be the stock code the file belongs to — TODO confirm
 * @return the cleaned (possibly truncated) line, or {@code null} if the line
 *         should be discarded
 */
public static String shaixuan(String line, String filename) {
    // remove punctuation / symbol noise (full- and half-width variants)
    line = line.replaceAll("[+~$`^=|<>~`$^+=|<>¥×]", "");
    if (line.indexOf("评级") != -1) { // "rating"
        line = line.replaceAll("[“”\"的]", "");
        if (line.indexOf("评级") > 5)
            // keep the 6 characters immediately before the keyword
            line = line.substring(line.indexOf("评级") - 6, line.indexOf("评级"));
        else
            // BUG FIX: the original computed this substring but never assigned it
            line = line.substring(0, line.indexOf("评级"));
    } else if (line.indexOf("投资建议") != -1) { // "investment advice"
        if (line.indexOf("风险提示") != -1) // "risk warning"
            line = line.substring(line.indexOf("投资建议"), line.indexOf("风险提示"));
        else
            line = line.substring(line.indexOf("投资建议"));
    } else if (line.indexOf("研报") != -1 && line.indexOf("评级") == -1) {
        line = null; // report-listing line ("research report") with no rating
    } else if (line.indexOf("公告日期") != -1 // "announcement date"
            // 6-digit code present but it is not this file's own code.
            // Math.min guards against filenames shorter than 5 characters,
            // which made the original throw StringIndexOutOfBoundsException.
            || (pattern.matcher(line).find()
                && !(line.indexOf(filename.substring(0, Math.min(5, filename.length()))) != -1))) {
        line = null;
    }
    return line;
}
/**
 * Test-corpus counterpart of Product_WordMatrix: segments every document under
 * {@code basefilepath} with IKAnalyzer and writes the tokenised form to a
 * mirrored tree under {@code writepath}, but does NOT register words into
 * WordList / ClassList — the vocabulary is fixed by the training pass.
 *
 * @param basefilepath root directory of the raw test corpus (one sub-dir per class)
 * @param writepath    root directory the segmented output is written to
 * @throws IOException if any file cannot be read or written
 */
public static void Product_WordMatrix_Test(File basefilepath, File writepath) throws IOException {
    IKAnalyzer analyzer = new IKAnalyzer(true); // smart (coarse-grained) segmentation
    File[] filepath = basefilepath.listFiles();
    int count = 0; // running line id across all files
    for (int i = 0; i < filepath.length; i++) {
        File[] files = filepath[i].listFiles(); // every document of this class
        // File.separator instead of hard-coded "\\" so this works off Windows too
        File path = new File(writepath.getPath() + File.separator + filepath[i].getName());
        if (!path.exists()) path.mkdirs();
        int dayux00 = 0; // lines dropped by shaixuan() for this class
        for (File filename : files) {
            File writefile = new File(path.getPath() + File.separator + filename.getName());
            // try-with-resources: the original leaked both streams on exception
            try (BufferedWriter writer = new BufferedWriter(new FileWriter(writefile));
                 BufferedReader br = new BufferedReader(
                         new InputStreamReader(new FileInputStream(filename)))) {
                String line = br.readLine();
                while (line != null) {
                    count++;
                    line = shaixuan(line, filename.getName());
                    if (line == null) { dayux00++; line = br.readLine(); continue; }
                    TokenStream tokenStream = analyzer.tokenStream("content", new StringReader(line));
                    tokenStream.addAttribute(CharTermAttribute.class);
                    tokenStream.reset(); // required by the Lucene 4+ TokenStream contract; no-op on older versions
                    while (tokenStream.incrementToken()) {
                        writer.write(tokenStream.getAttribute(CharTermAttribute.class).toString() + "\t");
                    }
                    tokenStream.close();
                    writer.write(Integer.toString(count));
                    writer.write("\r\n");
                    writer.flush();
                    line = br.readLine();
                }
            }
        }
        System.out.println("测试类别:" + filepath[i].getName() + "\t过滤数目:" + dayux00);
    }
}
public static void Product_ZiMatrix(File basefilepath,File writepath) throws IOException {
File [] filepath = basefilepath.listFiles();
classnum = filepath.length;
for(int i=0;i<classnum;i++){
ClassList.put(filepath[i].getName(),i);
File []files = filepath[i].listFiles();//存储子目录下所有文件
File path = new File(writepath.getPath()+"\\"+filepath[i].getName(